From c41b20e721ea4f6f20f66a66e7f0c3c97a2ca9c2 Mon Sep 17 00:00:00 2001 From: Adam Buchbinder Date: Fri, 11 Dec 2009 16:35:39 -0500 Subject: Fix misspellings of "truly" in comments. Some comments misspell "truly"; this fixes them. No code changes. Signed-off-by: Adam Buchbinder Signed-off-by: Jiri Kosina --- kernel/trace/ring_buffer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 8c1b2d290718..9ab578f1bb65 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2541,7 +2541,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable); * @buffer: The ring buffer to enable writes * * Note, multiple disables will need the same number of enables - * to truely enable the writing (much like preempt_disable). + * to truly enable the writing (much like preempt_disable). */ void ring_buffer_record_enable(struct ring_buffer *buffer) { @@ -2577,7 +2577,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu); * @cpu: The CPU to enable. * * Note, multiple disables will need the same number of enables - * to truely enable the writing (much like preempt_disable). + * to truly enable the writing (much like preempt_disable). */ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) { -- cgit v1.2.3 From 2a61aa401638529cd4231f6106980d307fba98fa Mon Sep 17 00:00:00 2001 From: Adam Buchbinder Date: Fri, 11 Dec 2009 16:35:40 -0500 Subject: Fix misspellings of "invocation" in comments. Some comments misspell "invocation"; this fixes them. No code changes. Signed-off-by: Adam Buchbinder Signed-off-by: Jiri Kosina --- fs/buffer.c | 2 +- fs/mpage.c | 2 +- include/linux/mmzone.h | 2 +- kernel/sched_cpupri.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/fs/buffer.c b/fs/buffer.c index 6fa530256bfd..1d920bab5e70 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2893,7 +2893,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block, /* * The page straddles i_size. It must be zeroed out on each and every - * writepage invokation because it may be mmapped. "A file is mapped + * writepage invocation because it may be mmapped. "A file is mapped * in multiples of the page size. For a file that is not a multiple of * the page size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." diff --git a/fs/mpage.c b/fs/mpage.c index 42381bd6543b..598d54e200eb 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -561,7 +561,7 @@ page_is_mapped: if (page->index >= end_index) { /* * The page straddles i_size. It must be zeroed out on each - * and every writepage invokation because it may be mmapped. + * and every writepage invocation because it may be mmapped. * "A file is mapped in multiples of the page size. For a file * that is not a multiple of the page size, the remaining memory * is zeroed when mapped, and writes to that region are not diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 30fe668c2542..e60a340fe890 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -349,7 +349,7 @@ struct zone { * prev_priority holds the scanning priority for this zone. It is * defined as the scanning priority at which we achieved our reclaim * target at the previous try_to_free_pages() or balance_pgdat() - * invokation. + * invocation. * * We use prev_priority as a measure of how much stress page reclaim is * under - it drives the swappiness decision: whether to unmap mapped diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index 597b33099dfa..3db4b1a0e921 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -58,7 +58,7 @@ static int convert_prio(int prio) * @lowest_mask: A mask to fill in with selected CPUs (or NULL) * * Note: This function returns the recommended CPUs as calculated during the - * current invokation. By the time the call returns, the CPUs may have in + * current invocation. By the time the call returns, the CPUs may have in * fact changed priorities any number of times. While not ideal, it is not * an issue of correctness since the normal rebalancer logic will correct * any discrepancies created by racing against the uncertainty of the current -- cgit v1.2.3 From c9404c9c392d557a4687c4cbda022b03cb787ce9 Mon Sep 17 00:00:00 2001 From: Adam Buchbinder Date: Fri, 18 Dec 2009 15:40:42 -0500 Subject: Fix misspelling of "should" and "shouldn't" in comments. Some comments misspell "should" or "shouldn't"; this fixes them. No code changes. Signed-off-by: Adam Buchbinder Signed-off-by: Jiri Kosina --- arch/x86/kernel/ptrace.c | 2 +- drivers/ata/libata-core.c | 2 +- drivers/gpu/drm/radeon/atombios.h | 2 +- drivers/ieee1394/pcilynx.c | 2 +- drivers/input/tablet/aiptek.c | 2 +- drivers/mmc/card/sdio_uart.c | 2 +- drivers/scsi/lpfc/lpfc_scsi.c | 4 ++-- drivers/video/sstfb.c | 2 +- fs/locks.c | 2 +- kernel/audit.c | 2 +- mm/slub.c | 2 +- 11 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 017d937639fe..118428085ea2 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -604,7 +604,7 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, struct perf_event_attr attr; /* - * We shoud have at least an inactive breakpoint at this + * We should have at least an inactive breakpoint at this * slot. It means the user is writing dr7 without having * written the address register first */ diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 6728328f3bea..698ef474767e 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -2232,7 +2232,7 @@ retry: * Some drives were very specific about that exact sequence. * * Note that ATA4 says lba is mandatory so the second check - * shoud never trigger. + * should never trigger. */ if (ata_id_major_version(id) < 4 || !ata_id_has_lba(id)) { err_mask = ata_dev_init_params(dev, id[3], id[6]); diff --git a/drivers/gpu/drm/radeon/atombios.h b/drivers/gpu/drm/radeon/atombios.h index 91ad0d1c1b17..2a88029f6a1e 100644 --- a/drivers/gpu/drm/radeon/atombios.h +++ b/drivers/gpu/drm/radeon/atombios.h @@ -2275,7 +2275,7 @@ typedef struct _ATOM_LCD_RTS_RECORD { UCHAR ucRTSValue; } ATOM_LCD_RTS_RECORD; -/* !! If the record below exits, it shoud always be the first record for easy use in command table!!! */ +/* !! If the record below exits, it should always be the first record for easy use in command table!!! */ typedef struct _ATOM_LCD_MODE_CONTROL_CAP { UCHAR ucRecordType; USHORT usLCDCap; diff --git a/drivers/ieee1394/pcilynx.c b/drivers/ieee1394/pcilynx.c index 9555fd253865..bf47fee79808 100644 --- a/drivers/ieee1394/pcilynx.c +++ b/drivers/ieee1394/pcilynx.c @@ -1452,7 +1452,7 @@ static int __devinit add_card(struct pci_dev *dev, PRINT(KERN_ERR, lynx->id, "unable to read bus info block from i2c"); } else { PRINT(KERN_INFO, lynx->id, "got bus info block from serial eeprom"); - /* FIXME: probably we shoud rewrite the max_rec, max_ROM(1394a), + /* FIXME: probably we should rewrite the max_rec, max_ROM(1394a), * generation(1394a) and link_spd(1394a) field and recalculate * the CRC */ diff --git a/drivers/input/tablet/aiptek.c b/drivers/input/tablet/aiptek.c index 7d005a3616d7..4be039d7dcad 100644 --- a/drivers/input/tablet/aiptek.c +++ b/drivers/input/tablet/aiptek.c @@ -362,7 +362,7 @@ static const int macroKeyEvents[] = { }; /*********************************************************************** - * Map values to strings and back. Every map shoudl have the following + * Map values to strings and back. Every map should have the following * as its last element: { NULL, AIPTEK_INVALID_VALUE }. */ #define AIPTEK_INVALID_VALUE -1 diff --git a/drivers/mmc/card/sdio_uart.c b/drivers/mmc/card/sdio_uart.c index f53755533e7e..a4e37758be41 100644 --- a/drivers/mmc/card/sdio_uart.c +++ b/drivers/mmc/card/sdio_uart.c @@ -581,7 +581,7 @@ static int uart_carrier_raised(struct tty_port *tport) struct sdio_uart_port *port = container_of(tport, struct sdio_uart_port, port); unsigned int ret = sdio_uart_claim_func(port); - if (ret) /* Missing hardware shoudn't block for carrier */ + if (ret) /* Missing hardware shouldn't block for carrier */ return 1; ret = sdio_uart_get_mctrl(port); sdio_uart_release_func(port); diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c index a246410ce9df..28c6bfd3e82e 100644 --- a/drivers/scsi/lpfc/lpfc_scsi.c +++ b/drivers/scsi/lpfc/lpfc_scsi.c @@ -1574,7 +1574,7 @@ lpfc_bg_scsi_prep_dma_buf(struct lpfc_hba *phba, case LPFC_PG_TYPE_NO_DIF: num_bde = lpfc_bg_setup_bpl(phba, scsi_cmnd, bpl, datasegcnt); - /* we shoud have 2 or more entries in buffer list */ + /* we should have 2 or more entries in buffer list */ if (num_bde < 2) goto err; break; @@ -1611,7 +1611,7 @@ lpfc_bg_scsi_prep_dma_buf(struct lpfc_hba *phba, num_bde = lpfc_bg_setup_bpl_prot(phba, scsi_cmnd, bpl, datasegcnt, protsegcnt); - /* we shoud have 3 or more entries in buffer list */ + /* we should have 3 or more entries in buffer list */ if (num_bde < 3) goto err; break; diff --git a/drivers/video/sstfb.c b/drivers/video/sstfb.c index 609d0a521ca2..79840f11fecb 100644 --- a/drivers/video/sstfb.c +++ b/drivers/video/sstfb.c @@ -1102,7 +1102,7 @@ static void sst_set_vidmod_ics(struct fb_info *info, const int bpp) * detect dac type * prerequisite : write to FbiInitx enabled, video and fbi and pci fifo reset, * dram refresh disabled, FbiInit remaped. - * TODO: mmh.. maybe i shoud put the "prerequisite" in the func ... + * TODO: mmh.. maybe i should put the "prerequisite" in the func ... */ diff --git a/fs/locks.c b/fs/locks.c index a8794f233bc9..cde572db112f 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1454,7 +1454,7 @@ EXPORT_SYMBOL(generic_setlease); * leases held by processes on this node. * * There is also no break_lease method; filesystems that - * handle their own leases shoud break leases themselves from the + * handle their own leases should break leases themselves from the * filesystem's open, create, and (on truncate) setattr methods. * * Warning: the only current setlease methods exist only to disable diff --git a/kernel/audit.c b/kernel/audit.c index 5feed232be9d..78f7f86aa238 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -398,7 +398,7 @@ static void kauditd_send_skb(struct sk_buff *skb) skb_get(skb); err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0); if (err < 0) { - BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */ + BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); audit_log_lost("auditd dissapeared\n"); audit_pid = 0; diff --git a/mm/slub.c b/mm/slub.c index 8d71aaf888d7..00e0961b11fe 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3086,7 +3086,7 @@ static void slab_mem_offline_callback(void *arg) /* * if n->nr_slabs > 0, slabs still exist on the node * that is going down. We were unable to free them, - * and offline_pages() function shoudn't call this + * and offline_pages() function shouldn't call this * callback. So, we must fail. */ BUG_ON(slabs_node(s, offline_node)); -- cgit v1.2.3 From af66585270ef99aa6097faf3bd7344855077e75d Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Sun, 17 Jan 2010 19:14:26 -0200 Subject: fix comment typo boo -> boot in ksysfs.c Signed-off-by: Thadeu Lima de Souza Cascardo Signed-off-by: Jiri Kosina --- kernel/ksysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 3feaf5a74514..ac08efca54c3 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -33,7 +33,7 @@ static ssize_t uevent_seqnum_show(struct kobject *kobj, } KERNEL_ATTR_RO(uevent_seqnum); -/* uevent helper program, used during early boo */ +/* uevent helper program, used during early boot */ static ssize_t uevent_helper_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { -- cgit v1.2.3 From 350f82586b7554240bee18c41cc5c842f63265ae Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Mon, 1 Feb 2010 18:26:59 -0500 Subject: Remove redundant trailing semicolons from macros Signed-off-by: Edward Z. Yang Acked-by: Rusty Russell Signed-off-by: Jiri Kosina --- kernel/params.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index cf1b69183127..2278ce244cf8 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -402,8 +402,8 @@ int param_get_string(char *buffer, struct kernel_param *kp) } /* sysfs output in /sys/modules/XYZ/parameters/ */ -#define to_module_attr(n) container_of(n, struct module_attribute, attr); -#define to_module_kobject(n) container_of(n, struct module_kobject, kobj); +#define to_module_attr(n) container_of(n, struct module_attribute, attr) +#define to_module_kobject(n) container_of(n, struct module_kobject, kobj) extern struct kernel_param __start___param[], __stop___param[]; @@ -421,7 +421,7 @@ struct module_param_attrs }; #ifdef CONFIG_SYSFS -#define to_param_attr(n) container_of(n, struct param_attribute, mattr); +#define to_param_attr(n) container_of(n, struct param_attribute, mattr) static ssize_t param_attr_show(struct module_attribute *mattr, struct module *mod, char *buf) -- cgit v1.2.3 From 9ce8e498ee58bb8a866a6c3c08fcb385ed66e9d2 Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Tue, 2 Feb 2010 08:54:51 +0200 Subject: devres: typo fix s/dev/devm/ Signed-off-by: Baruch Siach Signed-off-by: Jiri Kosina --- kernel/irq/devres.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index d06df9c41cba..30d56bafc9c2 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -81,7 +81,7 @@ EXPORT_SYMBOL(devm_request_threaded_irq); * Except for the extra @dev argument, this function takes the * same arguments and performs the same function as free_irq(). * This function instead of free_irq() should be used to manually - * free IRQs allocated with dev_request_irq(). + * free IRQs allocated with devm_request_irq(). */ void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id) { -- cgit v1.2.3 From 1537a3638cbf741d3826c1002026cce487a6bee0 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Fri, 29 Jan 2010 15:57:49 +0800 Subject: tree-wide: fix 'lenght' typo in comments and code Some misspelled occurences of 'octet' and some comments were also fixed as I was on it. Signed-off-by: Daniel Mack Cc: Jiri Kosina Cc: Joe Perches Cc: Junio C Hamano Signed-off-by: Jiri Kosina --- arch/arm/mach-at91/include/mach/at91_mci.h | 2 +- arch/s390/crypto/sha_common.c | 2 +- drivers/crypto/hifn_795x.c | 2 +- drivers/isdn/mISDN/l1oip_core.c | 2 +- drivers/usb/wusbcore/wusbhc.h | 2 +- drivers/uwb/uwbd.c | 2 +- fs/cifs/asn1.c | 2 +- fs/ext4/mballoc.c | 2 +- fs/udf/inode.c | 2 +- kernel/trace/trace.h | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/arch/arm/mach-at91/include/mach/at91_mci.h b/arch/arm/mach-at91/include/mach/at91_mci.h index 550d503a1bca..57f8ee154943 100644 --- a/arch/arm/mach-at91/include/mach/at91_mci.h +++ b/arch/arm/mach-at91/include/mach/at91_mci.h @@ -77,7 +77,7 @@ #define AT91_MCI_BLKR 0x18 /* Block Register */ #define AT91_MCI_BLKR_BCNT(n) ((0xffff & (n)) << 0) /* Block count */ -#define AT91_MCI_BLKR_BLKLEN(n) ((0xffff & (n)) << 16) /* Block lenght */ +#define AT91_MCI_BLKR_BLKLEN(n) ((0xffff & (n)) << 16) /* Block length */ #define AT91_MCI_RSPR(n) (0x20 + ((n) * 4)) /* Response Registers 0-3 */ #define AT91_MCR_RDR 0x30 /* Receive Data Register */ diff --git a/arch/s390/crypto/sha_common.c b/arch/s390/crypto/sha_common.c index 7903ec47e6b9..f42dbabc0d30 100644 --- a/arch/s390/crypto/sha_common.c +++ b/arch/s390/crypto/sha_common.c @@ -79,7 +79,7 @@ int s390_sha_final(struct shash_desc *desc, u8 *out) memset(ctx->buf + index, 0x00, end - index - 8); /* - * Append message length. Well, SHA-512 wants a 128 bit lenght value, + * Append message length. Well, SHA-512 wants a 128 bit length value, * nevertheless we use u64, should be enough for now... */ bits = ctx->count * 8; diff --git a/drivers/crypto/hifn_795x.c b/drivers/crypto/hifn_795x.c index 09ad9154d86c..73e8b1713b54 100644 --- a/drivers/crypto/hifn_795x.c +++ b/drivers/crypto/hifn_795x.c @@ -321,7 +321,7 @@ static atomic_t hifn_dev_number; #define HIFN_PUBOPLEN_MOD_M 0x0000007f /* modulus length mask */ #define HIFN_PUBOPLEN_MOD_S 0 /* modulus length shift */ #define HIFN_PUBOPLEN_EXP_M 0x0003ff80 /* exponent length mask */ -#define HIFN_PUBOPLEN_EXP_S 7 /* exponent lenght shift */ +#define HIFN_PUBOPLEN_EXP_S 7 /* exponent length shift */ #define HIFN_PUBOPLEN_RED_M 0x003c0000 /* reducend length mask */ #define HIFN_PUBOPLEN_RED_S 18 /* reducend length shift */ diff --git a/drivers/isdn/mISDN/l1oip_core.c b/drivers/isdn/mISDN/l1oip_core.c index 0843fcf8b381..325b1ad7d4b8 100644 --- a/drivers/isdn/mISDN/l1oip_core.c +++ b/drivers/isdn/mISDN/l1oip_core.c @@ -477,7 +477,7 @@ l1oip_socket_parse(struct l1oip *hc, struct sockaddr_in *sin, u8 *buf, int len) printk(KERN_DEBUG "%s: received frame, parsing... (%d)\n", __func__, len); - /* check lenght */ + /* check length */ if (len < 1+1+2) { printk(KERN_WARNING "%s: packet error - length %d below " "4 bytes\n", __func__, len); diff --git a/drivers/usb/wusbcore/wusbhc.h b/drivers/usb/wusbcore/wusbhc.h index fd2fd4e277e1..759cda55f7c3 100644 --- a/drivers/usb/wusbcore/wusbhc.h +++ b/drivers/usb/wusbcore/wusbhc.h @@ -198,7 +198,7 @@ struct wusb_port { * ports) this HC will take. Read-only. * * @port Array of port status for each fake root port. Guaranteed to - * always be the same lenght during device existence + * always be the same length during device existence * [this allows for some unlocked but referenced reading]. * * @mmcies_max Max number of Information Elements this HC can send diff --git a/drivers/uwb/uwbd.c b/drivers/uwb/uwbd.c index 5a777d8624da..6210fe1fd1bb 100644 --- a/drivers/uwb/uwbd.c +++ b/drivers/uwb/uwbd.c @@ -43,7 +43,7 @@ * * EVENTS * - * Events have a type, a subtype, a lenght, some other stuff and the + * Events have a type, a subtype, a length, some other stuff and the * data blob, which depends on the event. The header is 'struct * uwb_event'; for payloads, see 'struct uwbd_evt_*'. * diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c index 20692fbfdb24..a20bea598933 100644 --- a/fs/cifs/asn1.c +++ b/fs/cifs/asn1.c @@ -136,7 +136,7 @@ asn1_enum_decode(struct asn1_ctx *ctx, __le32 *val) return 0; } - ch = *(ctx->pointer)++; /* ch has 0xa, ptr points to lenght octet */ + ch = *(ctx->pointer)++; /* ch has 0xa, ptr points to length octet */ if ((ch) == ASN1_ENUM) /* if ch value is ENUM, 0xa */ *val = *(++(ctx->pointer)); /* value has enum value */ else diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index d34afad3e137..b794dd8141a0 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -69,7 +69,7 @@ * * pa_lstart -> the logical start block for this prealloc space * pa_pstart -> the physical start block for this prealloc space - * pa_len -> lenght for this prealloc space + * pa_len -> length for this prealloc space * pa_free -> free space available in this prealloc space * * The inode preallocation space is used looking at the _logical_ start diff --git a/fs/udf/inode.c b/fs/udf/inode.c index f90231eb2916..772a4fa557f2 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -102,7 +102,7 @@ void udf_clear_inode(struct inode *inode) if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB && inode->i_size != iinfo->i_lenExtents) { printk(KERN_WARNING "UDF-fs (%s): Inode %lu (mode %o) has " - "inode size %llu different from extent lenght %llu. " + "inode size %llu different from extent length %llu. " "Filesystem need not be standards compliant.\n", inode->i_sb->s_id, inode->i_ino, inode->i_mode, (unsigned long long)inode->i_size, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 4df6a77eb196..e4b32c8aa85f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -549,7 +549,7 @@ static inline int ftrace_trace_task(struct task_struct *task) * struct trace_parser - servers for reading the user input separated by spaces * @cont: set if the input is not complete - no final space char was found * @buffer: holds the parsed user input - * @idx: user input lenght + * @idx: user input length * @size: buffer size */ struct trace_parser { -- cgit v1.2.3 From b80109e256bc17ed66c9d559175f087b03ca2a8e Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Tue, 9 Feb 2010 15:07:40 +1100 Subject: Remove reference to kthread_create_on_cpu kthread_create_on_cpu doesn't exist so update a comment in kthread.c to reflect this. Signed-off-by: Anton Blanchard Acked-by: Rusty Russell Signed-off-by: Jiri Kosina --- kernel/kthread.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index fbb6222fe7e0..82ed0ea15194 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -101,7 +101,7 @@ static void create_kthread(struct kthread_create_info *create) * * Description: This helper function creates and names a kernel * thread. The thread will be stopped: use wake_up_process() to start - * it. See also kthread_run(), kthread_create_on_cpu(). + * it. See also kthread_run(). * * When woken, the thread will run @threadfn() with @data as its * argument. @threadfn() can either call do_exit() directly if it is a -- cgit v1.2.3 From 5c42dc7070c94622ca914b5a2e227f3744e857e7 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Thu, 11 Feb 2010 15:04:36 +0100 Subject: devres/irq: Fix devm_irq_match comment Fix the reference (in comment). Signed-off-by: Jean Delvare Signed-off-by: Jiri Kosina --- kernel/irq/devres.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 30d56bafc9c2..1ef4ffcdfa55 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -42,7 +42,7 @@ static int devm_irq_match(struct device *dev, void *res, void *data) * automatically freed on driver detach. * * If an IRQ allocated with this function needs to be freed - * separately, dev_free_irq() must be used. + * separately, devm_free_irq() must be used. */ int devm_request_threaded_irq(struct device *dev, unsigned int irq, irq_handler_t handler, irq_handler_t thread_fn, -- cgit v1.2.3 From dfff0615d28bdb3e8d213e5537dd069265912667 Mon Sep 17 00:00:00 2001 From: Uwe Kleine-König Date: Fri, 12 Feb 2010 21:58:11 +0100 Subject: tree-wide: fix typos "ass?o[sc]iac?te" -> "associate" in comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Uwe Kleine-König Signed-off-by: Jiri Kosina --- drivers/usb/musb/davinci.c | 2 +- drivers/zorro/zorro.ids | 2 +- include/net/irda/irttp.h | 2 +- kernel/irq/chip.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/drivers/usb/musb/davinci.c b/drivers/usb/musb/davinci.c index 66913811af5e..a883f9dd3f8a 100644 --- a/drivers/usb/musb/davinci.c +++ b/drivers/usb/musb/davinci.c @@ -274,7 +274,7 @@ static irqreturn_t davinci_interrupt(int irq, void *__hci) /* NOTE: DaVinci shadows the Mentor IRQs. Don't manage them through * the Mentor registers (except for setup), use the TI ones and EOI. * - * Docs describe irq "vector" registers asociated with the CPPI and + * Docs describe irq "vector" registers associated with the CPPI and * USB EOI registers. These hold a bitmask corresponding to the * current IRQ, not an irq handler address. Would using those bits * resolve some of the races observed in this dispatch code?? diff --git a/drivers/zorro/zorro.ids b/drivers/zorro/zorro.ids index 0c0f99e2dd62..de24e3decedd 100644 --- a/drivers/zorro/zorro.ids +++ b/drivers/zorro/zorro.ids @@ -108,7 +108,7 @@ 0c00 500XP/SupraDrive WordSync [SCSI Host Adapter] 0d00 SupraDrive WordSync II [SCSI Host Adapter] 1000 2400zi+ [Modem] -0422 Computer Systems Assosiates +0422 Computer Systems Associates 1100 Magnum 40 [Accelerator and SCSI Host Adapter] 1500 12 Gauge [SCSI Host Adapter] 0439 Marc Michael Groth diff --git a/include/net/irda/irttp.h b/include/net/irda/irttp.h index 0788c23d2828..11aee7a2972a 100644 --- a/include/net/irda/irttp.h +++ b/include/net/irda/irttp.h @@ -97,7 +97,7 @@ #define TTP_MAX_SDU_SIZE 0x01 /* - * This structure contains all data assosiated with one instance of a TTP + * This structure contains all data associated with one instance of a TTP * connection. */ struct tsap_cb { diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index ecc3fa28f666..ec8a96382461 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -520,7 +520,7 @@ out: * signal. The occurence is latched into the irq controller hardware * and must be acked in order to be reenabled. After the ack another * interrupt can happen on the same source even before the first one - * is handled by the assosiacted event handler. If this happens it + * is handled by the associated event handler. If this happens it * might be necessary to disable (mask) the interrupt depending on the * controller hardware. This requires to reenable the interrupt inside * of the loop which handles the interrupts which have arrived while -- cgit v1.2.3 From 44ee63587dce85593c22497140db16f4e5027860 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 17 Feb 2010 10:50:50 +0900 Subject: percpu: Add __percpu sparse annotations to hw_breakpoint Add __percpu sparse annotations to hw_breakpoint. These annotations are to make sparse consider percpu variables to be in a different address space and warn if accessed without going through percpu accessors. This patch doesn't affect normal builds. In kernel/hw_breakpoint.c, per_cpu(nr_task_bp_pinned, cpu)'s will trigger spurious noderef related warnings from sparse. Changing it to &per_cpu(nr_task_bp_pinned[0], cpu) will work around the problem but deemed to ugly by the maintainer. Leave it alone until better solution can be found. Signed-off-by: Tejun Heo Cc: Stephen Rothwell Cc: K.Prasad LKML-Reference: <4B7B4B7A.9050902@kernel.org> Signed-off-by: Frederic Weisbecker --- include/linux/hw_breakpoint.h | 8 ++++---- kernel/hw_breakpoint.c | 10 +++++----- samples/hw_breakpoint/data_breakpoint.c | 6 +++--- 3 files changed, 12 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index 5977b724f7c6..c70d27af03f9 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -66,14 +66,14 @@ register_wide_hw_breakpoint_cpu(struct perf_event_attr *attr, perf_overflow_handler_t triggered, int cpu); -extern struct perf_event ** +extern struct perf_event * __percpu * register_wide_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered); extern int register_perf_hw_breakpoint(struct perf_event *bp); extern int __register_perf_hw_breakpoint(struct perf_event *bp); extern void unregister_hw_breakpoint(struct perf_event *bp); -extern void unregister_wide_hw_breakpoint(struct perf_event **cpu_events); +extern void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events); extern int dbg_reserve_bp_slot(struct perf_event *bp); extern int dbg_release_bp_slot(struct perf_event *bp); @@ -100,7 +100,7 @@ static inline struct perf_event * register_wide_hw_breakpoint_cpu(struct perf_event_attr *attr, perf_overflow_handler_t triggered, int cpu) { return NULL; } -static inline struct perf_event ** +static inline struct perf_event * __percpu * register_wide_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered) { return NULL; } static inline int @@ -109,7 +109,7 @@ static inline int __register_perf_hw_breakpoint(struct perf_event *bp) { return -ENOSYS; } static inline void unregister_hw_breakpoint(struct perf_event *bp) { } static inline void -unregister_wide_hw_breakpoint(struct perf_event **cpu_events) { } +unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) { } static inline int reserve_bp_slot(struct perf_event *bp) {return -ENOSYS; } static inline void release_bp_slot(struct perf_event *bp) { } diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 967e66143e11..6542eacb3fa5 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -413,17 +413,17 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint); * * @return a set of per_cpu pointers to perf events */ -struct perf_event ** +struct perf_event * __percpu * register_wide_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered) { - struct perf_event **cpu_events, **pevent, *bp; + struct perf_event * __percpu *cpu_events, **pevent, *bp; long err; int cpu; cpu_events = alloc_percpu(typeof(*cpu_events)); if (!cpu_events) - return ERR_PTR(-ENOMEM); + return (void __percpu __force *)ERR_PTR(-ENOMEM); get_online_cpus(); for_each_online_cpu(cpu) { @@ -451,7 +451,7 @@ fail: put_online_cpus(); free_percpu(cpu_events); - return ERR_PTR(err); + return (void __percpu __force *)ERR_PTR(err); } EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); @@ -459,7 +459,7 @@ EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel * @cpu_events: the per cpu set of events to unregister */ -void unregister_wide_hw_breakpoint(struct perf_event **cpu_events) +void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) { int cpu; struct perf_event **pevent; diff --git a/samples/hw_breakpoint/data_breakpoint.c b/samples/hw_breakpoint/data_breakpoint.c index c69cbe9b2426..bd0f337afcab 100644 --- a/samples/hw_breakpoint/data_breakpoint.c +++ b/samples/hw_breakpoint/data_breakpoint.c @@ -34,7 +34,7 @@ #include #include -struct perf_event **sample_hbp; +struct perf_event * __percpu *sample_hbp; static char ksym_name[KSYM_NAME_LEN] = "pid_max"; module_param_string(ksym, ksym_name, KSYM_NAME_LEN, S_IRUGO); @@ -61,8 +61,8 @@ static int __init hw_break_module_init(void) attr.bp_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R; sample_hbp = register_wide_hw_breakpoint(&attr, sample_hbp_handler); - if (IS_ERR(sample_hbp)) { - ret = PTR_ERR(sample_hbp); + if (IS_ERR((void __force *)sample_hbp)) { + ret = PTR_ERR((void __force *)sample_hbp); goto fail; } -- cgit v1.2.3 From 622ea685f1fafdf84d612440535c84341f0860b8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 27 Feb 2010 14:53:07 -0800 Subject: rcu: Fix holdoff for accelerated GPs for last non-dynticked CPU Make the holdoff only happen when the full number of attempts have been made. Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <1267311188-16603-1-git-send-email-paulmck@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/rcutree_plugin.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 464ad2cdee00..79b53bda8943 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1010,6 +1010,10 @@ int rcu_needs_cpu(int cpu) int c = 0; int thatcpu; + /* Check for being in the holdoff period. */ + if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) + return rcu_needs_cpu_quick_check(cpu); + /* Don't bother unless we are the last non-dyntick-idle CPU. */ for_each_cpu_not(thatcpu, nohz_cpu_mask) if (thatcpu != cpu) { @@ -1041,10 +1045,8 @@ int rcu_needs_cpu(int cpu) } /* If RCU callbacks are still pending, RCU still needs this CPU. */ - if (c) { + if (c) raise_softirq(RCU_SOFTIRQ); - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; - } return c; } -- cgit v1.2.3 From ae1f30384baef4056438d81b305a6a5199b0d16c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 28 Feb 2010 19:42:38 +0100 Subject: tracing: Include irqflags headers from trace clock trace_clock.c includes spinlock.h, which ends up including asm/system.h, which in turn includes linux/irqflags.h in x86. So the definition of raw_local_irq_save is luckily covered there, but this is not the case in parisc: tip/kernel/trace/trace_clock.c:86: error: implicit declaration of function 'raw_local_irq_save' tip/kernel/trace/trace_clock.c:112: error: implicit declaration of function 'raw_local_irq_restore' We need to include linux/irqflags.h directly from trace_clock.c to avoid such build error. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Robert Richter Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/trace/trace_clock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 84a3a7ba072a..6fbfb8f417b9 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -13,6 +13,7 @@ * Tracer plugins will chose a default from these clocks. */ #include +#include #include #include #include -- cgit v1.2.3 From 1e259e0a9982078896f3404240096cbea01daca4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 28 Feb 2010 20:51:15 +0100 Subject: hw-breakpoints: Remove stub unthrottle callback We support event unthrottling in breakpoint events. It means that if we have more than sysctl_perf_event_sample_rate/HZ, perf will throttle, ignoring subsequent events until the next tick. So if ptrace exceeds this max rate, it will omit events, which breaks the ptrace determinism that is supposed to report every triggered breakpoints. This is likely to happen if we set sysctl_perf_event_sample_rate to 1. This patch removes support for unthrottling in breakpoint events to break throttling and restore ptrace determinism. Signed-off-by: Frederic Weisbecker Cc: 2.6.33.x Cc: Peter Zijlstra Cc: K.Prasad Cc: Paul Mackerras --- arch/x86/kernel/hw_breakpoint.c | 5 ----- kernel/hw_breakpoint.c | 1 - 2 files changed, 6 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index bb6006e3e295..1e8ceadc0d6a 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -531,8 +531,3 @@ void hw_breakpoint_pmu_read(struct perf_event *bp) { /* TODO */ } - -void hw_breakpoint_pmu_unthrottle(struct perf_event *bp) -{ - /* TODO */ -} diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 967e66143e11..4d99512ee149 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -489,5 +489,4 @@ struct pmu perf_ops_bp = { .enable = arch_install_hw_breakpoint, .disable = arch_uninstall_hw_breakpoint, .read = hw_breakpoint_pmu_read, - .unthrottle = hw_breakpoint_pmu_unthrottle }; -- cgit v1.2.3 From 90a6501f94aedd7fb40f5556334843194fb598be Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 28 Feb 2010 08:32:18 -0800 Subject: sched, rcu: Fix rcu_dereference() for RCU-lockdep Make rcu_dereference() of runqueue data structures be rcu_dereference_sched(). Located-by: Ingo Molnar Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <20100228163218.GD6846@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3e1fd96c6cf9..5a5ea2cd924f 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3476,7 +3476,7 @@ static void run_rebalance_domains(struct softirq_action *h) static inline int on_null_domain(int cpu) { - return !rcu_dereference(cpu_rq(cpu)->sd); + return !rcu_dereference_sched(cpu_rq(cpu)->sd); } /* -- cgit v1.2.3 From ad6759fbf35d104dbf573cd6f4c6784ad6823f7e Mon Sep 17 00:00:00 2001 From: john stultz Date: Mon, 1 Mar 2010 12:34:43 -0800 Subject: timekeeping: Prevent oops when GENERIC_TIME=n Aaro Koskinen reported an issue in kernel.org bugzilla #15366, where on non-GENERIC_TIME systems, accessing /sys/devices/system/clocksource/clocksource0/current_clocksource results in an oops. It seems the timekeeper/clocksource rework missed initializing the curr_clocksource value in the !GENERIC_TIME case. Thanks to Aaro for reporting and diagnosing the issue as well as testing the fix! Reported-by: Aaro Koskinen Signed-off-by: John Stultz Cc: Martin Schwidefsky Cc: stable@kernel.org LKML-Reference: <1267475683.4216.61.camel@localhost.localdomain> Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 1f663d23e85e..1f5dde637457 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -592,6 +592,10 @@ static inline void clocksource_select(void) { } */ static int __init clocksource_done_booting(void) { + mutex_lock(&clocksource_mutex); + curr_clocksource = clocksource_default_clock(); + mutex_unlock(&clocksource_mutex); + finished_booting = 1; /* -- cgit v1.2.3 From 320ebf09cbb6d01954c9a060266aa8e0d27f4638 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Mar 2010 12:35:37 +0100 Subject: perf, x86: Restrict the ANY flag The ANY flag can show SMT data of another task (like 'top'), so we want to disable it when system-wide profiling is disabled. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 3 +++ include/linux/perf_event.h | 15 +++++++++++++++ kernel/perf_event.c | 15 --------------- 3 files changed, 18 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 6531b4bdb22d..aab2e1ce9dee 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -503,6 +503,9 @@ static int __hw_perf_event_init(struct perf_event *event) */ if (attr->type == PERF_TYPE_RAW) { hwc->config |= x86_pmu.raw_event(attr->config); + if ((hwc->config & ARCH_PERFMON_EVENTSEL_ANY) && + perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) + return -EACCES; return 0; } diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 04f06b4be297..90e0521b1690 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -857,6 +857,21 @@ extern int sysctl_perf_event_paranoid; extern int sysctl_perf_event_mlock; extern int sysctl_perf_event_sample_rate; +static inline bool perf_paranoid_tracepoint_raw(void) +{ + return sysctl_perf_event_paranoid > -1; +} + +static inline bool perf_paranoid_cpu(void) +{ + return sysctl_perf_event_paranoid > 0; +} + +static inline bool perf_paranoid_kernel(void) +{ + return sysctl_perf_event_paranoid > 1; +} + extern void perf_event_init(void); extern void perf_tp_event(int event_id, u64 addr, u64 count, void *record, int entry_size); extern void perf_bp_event(struct perf_event *event, void *data); diff --git a/kernel/perf_event.c b/kernel/perf_event.c index a661e7991865..482d5e1d3764 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -56,21 +56,6 @@ static atomic_t nr_task_events __read_mostly; */ int sysctl_perf_event_paranoid __read_mostly = 1; -static inline bool perf_paranoid_tracepoint_raw(void) -{ - return sysctl_perf_event_paranoid > -1; -} - -static inline bool perf_paranoid_cpu(void) -{ - return sysctl_perf_event_paranoid > 0; -} - -static inline bool perf_paranoid_kernel(void) -{ - return sysctl_perf_event_paranoid > 1; -} - int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ /* -- cgit v1.2.3 From ac91d85456372a90af5b85eb6620fd2efb1e431b Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 2 Mar 2010 17:54:50 +0800 Subject: tracing: Fix warning in s_next of trace file ops This warning in s_next() can be triggered by lseek(): [] ? s_next+0x77/0x80 [] warn_slowpath_common+0x81/0xa0 [] ? s_next+0x77/0x80 [] warn_slowpath_null+0x1a/0x20 [] s_next+0x77/0x80 [] traverse+0x117/0x200 [] seq_lseek+0xa3/0x120 [] ? seq_lseek+0x0/0x120 [] vfs_llseek+0x41/0x50 [] sys_llseek+0x66/0xa0 [] sysenter_do_call+0x12/0x26 The iterator "leftover" variable is zeroed in the opening of the trace file. But lseek can call s_start() which will call s_next() without reseting the "leftover" variable back to zero, which might trigger the WARN_ON_ONCE(iter->leftover) that is in s_next(). Cc: stable@kernel.org Signed-off-by: Lai Jiangshan LKML-Reference: <4B8CE06A.9090207@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 032c57ca6502..5edf410bc540 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1703,6 +1703,7 @@ static void *s_start(struct seq_file *m, loff_t *pos) ftrace_enable_cpu(); + iter->leftover = 0; for (p = iter; p && l < *pos; p = s_next(m, p, &l)) ; -- cgit v1.2.3 From db1466b3e1bd1727375cdbfcbea4bcce2f860f61 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Mar 2010 07:46:56 -0800 Subject: rcu: Use wrapper function instead of exporting tasklist_lock Lockdep-RCU commit d11c563d exported tasklist_lock, which is not a good thing. This patch instead exports a function that uses lockdep to check whether tasklist_lock is held. Suggested-by: Christoph Hellwig Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com Cc: Christoph Hellwig LKML-Reference: <1267631219-8713-1-git-send-email-paulmck@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/cred.h | 2 +- include/linux/sched.h | 4 ++++ kernel/exit.c | 2 +- kernel/fork.c | 9 ++++++++- kernel/pid.c | 4 +++- 5 files changed, 17 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/cred.h b/include/linux/cred.h index 4db09f89b637..52507c3e1387 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -280,7 +280,7 @@ static inline void put_cred(const struct cred *_cred) * task or by holding tasklist_lock to prevent it from being unlinked. */ #define __task_cred(task) \ - ((const struct cred *)(rcu_dereference_check((task)->real_cred, rcu_read_lock_held() || lockdep_is_held(&tasklist_lock)))) + ((const struct cred *)(rcu_dereference_check((task)->real_cred, rcu_read_lock_held() || lockdep_tasklist_lock_is_held()))) /** * get_task_cred - Get another task's objective credentials diff --git a/include/linux/sched.h b/include/linux/sched.h index 0eef87b58ea5..a47af2064dcc 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -258,6 +258,10 @@ extern spinlock_t mmlist_lock; struct task_struct; +#ifdef CONFIG_PROVE_RCU +extern int lockdep_tasklist_lock_is_held(void); +#endif /* #ifdef CONFIG_PROVE_RCU */ + extern void sched_init(void); extern void sched_init_smp(void); extern asmlinkage void schedule_tail(struct task_struct *prev); diff --git a/kernel/exit.c b/kernel/exit.c index 45ed043b8bf5..fed3a4db6f04 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -87,7 +87,7 @@ static void __exit_signal(struct task_struct *tsk) sighand = rcu_dereference_check(tsk->sighand, rcu_read_lock_held() || - lockdep_is_held(&tasklist_lock)); + lockdep_tasklist_lock_is_held()); spin_lock(&sighand->siglock); posix_cpu_timers_exit(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index 17bbf093356d..8691c540a470 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -86,7 +86,14 @@ int max_threads; /* tunable limit on nr_threads */ DEFINE_PER_CPU(unsigned long, process_counts) = 0; __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ -EXPORT_SYMBOL_GPL(tasklist_lock); + +#ifdef CONFIG_PROVE_RCU +int lockdep_tasklist_lock_is_held(void) +{ + return lockdep_is_held(&tasklist_lock); +} +EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held); +#endif /* #ifdef CONFIG_PROVE_RCU */ int nr_processes(void) { diff --git a/kernel/pid.c b/kernel/pid.c index b08e697cd83f..b6064405f367 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -367,7 +367,9 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type) struct task_struct *result = NULL; if (pid) { struct hlist_node *first; - first = rcu_dereference_check(pid->tasks[type].first, rcu_read_lock_held() || lockdep_is_held(&tasklist_lock)); + first = rcu_dereference_check(pid->tasks[type].first, + rcu_read_lock_held() || + lockdep_tasklist_lock_is_held()); if (first) result = hlist_entry(first, struct task_struct, pids[(type)].node); } -- cgit v1.2.3 From cc5b83a9f884fe8722a275069a5a6fde39988455 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Mar 2010 07:46:59 -0800 Subject: rcu: Add control variables to lockdep_rcu_dereference() diagnostics Add the values of rcu_scheduler_active() and debug_locks() to the lockdep_rcu_dereference() output to help diagnose RCU lockdep splats that occur shortly after the scheduler starts. Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <1267631219-8713-4-git-send-email-paulmck@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 0c30d0455de1..681bc2e1e187 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3822,6 +3822,7 @@ void lockdep_rcu_dereference(const char *file, const int line) printk("%s:%d invoked rcu_dereference_check() without protection!\n", file, line); printk("\nother info that might help us debug this:\n\n"); + printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); lockdep_print_held_locks(curr); printk("\nstack backtrace:\n"); dump_stack(); -- cgit v1.2.3 From 8d53dd546f36073e0d29b0cfc24c665db301e3e7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Mar 2010 17:50:18 -0800 Subject: rcu, ftrace: Fix RCU lockdep splat in ftrace_perf_buf_prepare() Change the pair of rcu_dereference() calls in ftrace_perf_buf_prepare() to rcu_dereference_sched(). Signed-off-by: Paul E. McKenney Acked-by: Frederic Weisbecker Cc: Steven Rostedt Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com Cc: Frederic Weisbecker LKML-Reference: <1267667418-32233-3-git-send-email-paulmck@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- include/trace/ftrace.h | 4 ++-- kernel/trace/trace_event_profile.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 0804cd594803..601ad7744247 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -699,9 +699,9 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ * __cpu = smp_processor_id(); * * if (in_nmi()) - * trace_buf = rcu_dereference(perf_trace_buf_nmi); + * trace_buf = rcu_dereference_sched(perf_trace_buf_nmi); * else - * trace_buf = rcu_dereference(perf_trace_buf); + * trace_buf = rcu_dereference_sched(perf_trace_buf); * * if (!trace_buf) * goto end; diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index f0d693005075..c1cc3ab633de 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -138,9 +138,9 @@ __kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type, cpu = smp_processor_id(); if (in_nmi()) - trace_buf = rcu_dereference(perf_trace_buf_nmi); + trace_buf = rcu_dereference_sched(perf_trace_buf_nmi); else - trace_buf = rcu_dereference(perf_trace_buf); + trace_buf = rcu_dereference_sched(perf_trace_buf); if (!trace_buf) goto err; -- cgit v1.2.3 From 801c29fd1fdeb84f60241beb445ff5db154450ae Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 5 Mar 2010 20:02:19 -0500 Subject: function-graph: Fix unused reference to ftrace_set_func() The declaration of ftrace_set_func() is at the start of the ftrace.c file and wrapped with a #ifdef CONFIG_FUNCTION_GRAPH condition. If function graph tracing is enabled but CONFIG_DYNAMIC_FTRACE is not, a warning about that function being declared static and unused is given. This really should have been placed within the CONFIG_FUNCTION_GRAPH condition that uses ftrace_set_func(). Moving the declaration down fixes the warning and makes the code cleaner. Reported-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d996353473fd..d0407c9f368c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -85,10 +85,6 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); -#endif - static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) { struct ftrace_ops *op = ftrace_list; @@ -2300,6 +2296,8 @@ __setup("ftrace_filter=", set_ftrace_filter); #ifdef CONFIG_FUNCTION_GRAPH_TRACER static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; +static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); + static int __init set_graph_function(char *str) { strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); -- cgit v1.2.3 From a094fe04c751698a18c3a0d376a3bdb117f1e0d8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 5 Mar 2010 20:08:58 -0500 Subject: function-graph: Use comment notation for func names of dangling '}' When a '}' does not have a matching function start, the name is printed within parenthesis. But this makes it confusing between ending '}' and function starts. This patch makes the function name appear in C comment notation. Old view: 3) 1.281 us | } (might_fault) 3) 3.620 us | } (filldir) 3) 5.251 us | } (call_filldir) 3) | call_filldir() { 3) | filldir() { New view: 3) 1.281 us | } /* might_fault */ 3) 3.620 us | } /* filldir */ 3) 5.251 us | } /* call_filldir */ 3) | call_filldir() { 3) | filldir() { Requested-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index e998a824e9db..7b1f24618d97 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -920,7 +920,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, if (!ret) return TRACE_TYPE_PARTIAL_LINE; } else { - ret = trace_seq_printf(s, "} (%ps)\n", (void *)trace->func); + ret = trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } -- cgit v1.2.3 From 1acaa1b2d9b5904c9cce06122990a2d71046ce16 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 5 Mar 2010 18:23:50 -0300 Subject: tracing: Update the comm field in the right variable in update_max_tr The latency output showed: # | task: -3 (uid:0 nice:0 policy:1 rt_prio:99) The comm is missing in the "task:" and it looks like a minus 3 is the output. The correct display should be: # | task: migration/0-3 (uid:0 nice:0 policy:1 rt_prio:99) The problem is that the comm is being stored in the wrong data structure. The max_tr.data[cpu] is what stores the comm, not the tr->data[cpu]. Before this patch the max_tr.data[cpu]->comm was zeroed and the /debug/trace ended up showing just the '-' sign followed by the pid. Also remove a needless initialization of max_data. Signed-off-by: Arnaldo Carvalho de Melo LKML-Reference: <1267824230-23861-1-git-send-email-acme@infradead.org> Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 032c57ca6502..6efd5cb3c252 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -592,7 +592,7 @@ static void __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { struct trace_array_cpu *data = tr->data[cpu]; - struct trace_array_cpu *max_data = tr->data[cpu]; + struct trace_array_cpu *max_data; max_tr.cpu = cpu; max_tr.time_start = data->preempt_timestamp; @@ -602,7 +602,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) max_data->critical_start = data->critical_start; max_data->critical_end = data->critical_end; - memcpy(data->comm, tsk->comm, TASK_COMM_LEN); + memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN); max_data->pid = tsk->pid; max_data->uid = task_uid(tsk); max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; -- cgit v1.2.3 From 0e95017355dcf43031da6d0e360a748717e56df1 Mon Sep 17 00:00:00 2001 From: Tim Bird Date: Thu, 25 Feb 2010 15:36:43 -0800 Subject: function-graph: Add tracing_thresh support to function_graph tracer Add support for tracing_thresh to the function_graph tracer. This version of this feature isolates the checks into new entry and return functions, to avoid adding more conditional code into the main function_graph paths. When the tracing_thresh is set and the function graph tracer is enabled, only the functions that took longer than the time in microseconds that was set in tracing_thresh are recorded. To do this efficiently, only the function exits are recorded: [tracing]# echo 100 > tracing_thresh [tracing]# echo function_graph > current_tracer [tracing]# cat trace # tracer: function_graph # # CPU DURATION FUNCTION CALLS # | | | | | | | 1) ! 119.214 us | } /* smp_apic_timer_interrupt */ 1) <========== | 0) ! 101.527 us | } /* __rcu_process_callbacks */ 0) ! 126.461 us | } /* rcu_process_callbacks */ 0) ! 145.111 us | } /* __do_softirq */ 0) ! 149.667 us | } /* do_softirq */ 0) ! 168.817 us | } /* irq_exit */ 0) ! 248.254 us | } /* smp_apic_timer_interrupt */ Also, add support for specifying tracing_thresh on the kernel command line. When used like so: "tracing_thresh=200 ftrace=function_graph" this can be used to analyse system startup. It is important to disable tracing soon after boot, in order to avoid losing the trace data. Acked-by: Frederic Weisbecker Signed-off-by: Tim Bird LKML-Reference: <4B87098B.4040308@am.sony.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 20 ++++++++++++++++++-- kernel/trace/trace.h | 3 ++- kernel/trace/trace_functions_graph.c | 25 +++++++++++++++++++++++-- 3 files changed, 43 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6efd5cb3c252..ababedb4e87f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -374,6 +374,21 @@ static int __init set_buf_size(char *str) } __setup("trace_buf_size=", set_buf_size); +static int __init set_tracing_thresh(char *str) +{ + unsigned long threshhold; + int ret; + + if (!str) + return 0; + ret = strict_strtoul(str, 0, &threshhold); + if (ret < 0) + return 0; + tracing_thresh = threshhold * 1000; + return 1; +} +__setup("tracing_thresh=", set_tracing_thresh); + unsigned long nsecs_to_usecs(unsigned long nsecs) { return nsecs / 1000; @@ -579,9 +594,10 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) static arch_spinlock_t ftrace_max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; +unsigned long __read_mostly tracing_thresh; + #ifdef CONFIG_TRACER_MAX_TRACE unsigned long __read_mostly tracing_max_latency; -unsigned long __read_mostly tracing_thresh; /* * Copy the new maximum trace into the separate maximum-trace @@ -4248,10 +4264,10 @@ static __init int tracer_init_debugfs(void) #ifdef CONFIG_TRACER_MAX_TRACE trace_create_file("tracing_max_latency", 0644, d_tracer, &tracing_max_latency, &tracing_max_lat_fops); +#endif trace_create_file("tracing_thresh", 0644, d_tracer, &tracing_thresh, &tracing_max_lat_fops); -#endif trace_create_file("README", 0444, d_tracer, NULL, &tracing_readme_fops); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index fd05bcaf91b0..1bc8cd1431d7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -396,9 +396,10 @@ extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr); extern unsigned long nsecs_to_usecs(unsigned long nsecs); +extern unsigned long tracing_thresh; + #ifdef CONFIG_TRACER_MAX_TRACE extern unsigned long tracing_max_latency; -extern unsigned long tracing_thresh; void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); void update_max_tr_single(struct trace_array *tr, diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 7b1f24618d97..e9df04b60267 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -237,6 +237,14 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) return ret; } +int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) +{ + if (tracing_thresh) + return 1; + else + return trace_graph_entry(trace); +} + static void __trace_graph_return(struct trace_array *tr, struct ftrace_graph_ret *trace, unsigned long flags, @@ -290,13 +298,26 @@ void set_graph_array(struct trace_array *tr) smp_mb(); } +void trace_graph_thresh_return(struct ftrace_graph_ret *trace) +{ + if (tracing_thresh && + (trace->rettime - trace->calltime < tracing_thresh)) + return; + else + trace_graph_return(trace); +} + static int graph_trace_init(struct trace_array *tr) { int ret; set_graph_array(tr); - ret = register_ftrace_graph(&trace_graph_return, - &trace_graph_entry); + if (tracing_thresh) + ret = register_ftrace_graph(&trace_graph_thresh_return, + &trace_graph_thresh_entry); + else + ret = register_ftrace_graph(&trace_graph_return, + &trace_graph_entry); if (ret) return ret; tracing_start_cmdline_record(); -- cgit v1.2.3 From dc1d628a67a8f042e711ea5accc0beedc3ef0092 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 3 Mar 2010 15:55:04 +0100 Subject: perf: Provide generic perf_sample_data initialization This makes it easier to extend perf_sample_data and fixes a bug on arm and sparc, which failed to set ->raw to NULL, which can cause crashes when combined with PERF_SAMPLE_RAW. It also optimizes PowerPC and tracepoint, because the struct initialization is forced to zero out the whole structure. Signed-off-by: Peter Zijlstra Acked-by: Jean Pihet Reviewed-by: Frederic Weisbecker Acked-by: David S. Miller Cc: Jamie Iles Cc: Paul Mackerras Cc: Stephane Eranian Cc: stable@kernel.org LKML-Reference: <20100304140100.315416040@chello.nl> Signed-off-by: Ingo Molnar --- arch/arm/kernel/perf_event.c | 4 ++-- arch/powerpc/kernel/perf_event.c | 8 ++++---- arch/sparc/kernel/perf_event.c | 2 +- arch/x86/kernel/cpu/perf_event.c | 3 +-- arch/x86/kernel/cpu/perf_event_intel.c | 6 ++---- include/linux/perf_event.h | 7 +++++++ kernel/perf_event.c | 21 ++++++++------------- 7 files changed, 25 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/arch/arm/kernel/perf_event.c b/arch/arm/kernel/perf_event.c index c54ceb3d1f97..3875d99cc40f 100644 --- a/arch/arm/kernel/perf_event.c +++ b/arch/arm/kernel/perf_event.c @@ -965,7 +965,7 @@ armv6pmu_handle_irq(int irq_num, */ armv6_pmcr_write(pmcr); - data.addr = 0; + perf_sample_data_init(&data, 0); cpuc = &__get_cpu_var(cpu_hw_events); for (idx = 0; idx <= armpmu->num_events; ++idx) { @@ -1945,7 +1945,7 @@ static irqreturn_t armv7pmu_handle_irq(int irq_num, void *dev) */ regs = get_irq_regs(); - data.addr = 0; + perf_sample_data_init(&data, 0); cpuc = &__get_cpu_var(cpu_hw_events); for (idx = 0; idx <= armpmu->num_events; ++idx) { diff --git a/arch/powerpc/kernel/perf_event.c b/arch/powerpc/kernel/perf_event.c index b6cf8f1f4d35..5120bd44f69a 100644 --- a/arch/powerpc/kernel/perf_event.c +++ b/arch/powerpc/kernel/perf_event.c @@ -1164,10 +1164,10 @@ static void record_and_restart(struct perf_event *event, unsigned long val, * Finally record data if requested. */ if (record) { - struct perf_sample_data data = { - .addr = ~0ULL, - .period = event->hw.last_period, - }; + struct perf_sample_data data; + + perf_sample_data_init(&data, ~0ULL); + data.period = event->hw.last_period; if (event->attr.sample_type & PERF_SAMPLE_ADDR) perf_get_data_addr(regs, &data.addr); diff --git a/arch/sparc/kernel/perf_event.c b/arch/sparc/kernel/perf_event.c index 9f2b2bac8b2b..6504208f375f 100644 --- a/arch/sparc/kernel/perf_event.c +++ b/arch/sparc/kernel/perf_event.c @@ -1189,7 +1189,7 @@ static int __kprobes perf_event_nmi_handler(struct notifier_block *self, regs = args->regs; - data.addr = 0; + perf_sample_data_init(&data, 0); cpuc = &__get_cpu_var(cpu_hw_events); diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 97cddbf32936..42aafd11e170 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1097,8 +1097,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) int idx, handled = 0; u64 val; - data.addr = 0; - data.raw = NULL; + perf_sample_data_init(&data, 0); cpuc = &__get_cpu_var(cpu_hw_events); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 73102df8bfc1..44b60c852107 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -590,10 +590,9 @@ static void intel_pmu_drain_bts_buffer(void) ds->bts_index = ds->bts_buffer_base; + perf_sample_data_init(&data, 0); data.period = event->hw.last_period; - data.addr = 0; - data.raw = NULL; regs.ip = 0; /* @@ -742,8 +741,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) int bit, loops; u64 ack, status; - data.addr = 0; - data.raw = NULL; + perf_sample_data_init(&data, 0); cpuc = &__get_cpu_var(cpu_hw_events); diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 90e0521b1690..6f8cd7da1a01 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -801,6 +801,13 @@ struct perf_sample_data { struct perf_raw_record *raw; }; +static inline +void perf_sample_data_init(struct perf_sample_data *data, u64 addr) +{ + data->addr = addr; + data->raw = NULL; +} + extern void perf_output_sample(struct perf_output_handle *handle, struct perf_event_header *header, struct perf_sample_data *data, diff --git a/kernel/perf_event.c b/kernel/perf_event.c index e68745053013..4393b9e73740 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4108,8 +4108,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi, if (rctx < 0) return; - data.addr = addr; - data.raw = NULL; + perf_sample_data_init(&data, addr); do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); @@ -4154,11 +4153,10 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) struct perf_event *event; u64 period; - event = container_of(hrtimer, struct perf_event, hw.hrtimer); + event = container_of(hrtimer, struct perf_event, hw.hrtimer); event->pmu->read(event); - data.addr = 0; - data.raw = NULL; + perf_sample_data_init(&data, 0); data.period = event->hw.last_period; regs = get_irq_regs(); /* @@ -4322,17 +4320,15 @@ static const struct pmu perf_ops_task_clock = { void perf_tp_event(int event_id, u64 addr, u64 count, void *record, int entry_size) { + struct pt_regs *regs = get_irq_regs(); + struct perf_sample_data data; struct perf_raw_record raw = { .size = entry_size, .data = record, }; - struct perf_sample_data data = { - .addr = addr, - .raw = &raw, - }; - - struct pt_regs *regs = get_irq_regs(); + perf_sample_data_init(&data, addr); + data.raw = &raw; if (!regs) regs = task_pt_regs(current); @@ -4448,8 +4444,7 @@ void perf_bp_event(struct perf_event *bp, void *data) struct perf_sample_data sample; struct pt_regs *regs = data; - sample.raw = NULL; - sample.addr = bp->attr.bp_addr; + perf_sample_data_init(&sample, bp->attr.bp_addr); if (!perf_exclude_event(bp, regs)) perf_swevent_add(bp, 1, 1, &sample, regs); -- cgit v1.2.3 From 3f379b03fbfddd20536389a85c6456f8233d1f8d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 5 Mar 2010 15:03:25 -0800 Subject: ftrace: Replace read_barrier_depends() with rcu_dereference_raw() Replace the calls to read_barrier_depends() in ftrace_list_func() with rcu_dereference_raw() to improve readability. The reason that we use rcu_dereference_raw() here is that removed entries are never freed, instead they are simply leaked. This is one of a very few cases where use of rcu_dereference_raw() is the long-term right answer. And I don't yet know of any others. ;-) Signed-off-by: Paul E. McKenney Acked-by: Steven Rostedt Cc: Frederic Weisbecker Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <1267830207-9474-1-git-send-email-paulmck@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 83783579378f..8c5adc0e5db3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -88,18 +89,22 @@ ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); #endif +/* + * Traverse the ftrace_list, invoking all entries. The reason that we + * can use rcu_dereference_raw() is that elements removed from this list + * are simply leaked, so there is no need to interact with a grace-period + * mechanism. The rcu_dereference_raw() calls are needed to handle + * concurrent insertions into the ftrace_list. + * + * Silly Alpha and silly pointer-speculation compiler optimizations! + */ static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) { - struct ftrace_ops *op = ftrace_list; - - /* in case someone actually ports this to alpha! */ - read_barrier_depends(); + struct ftrace_ops *op = rcu_dereference_raw(ftrace_list); /*see above*/ while (op != &ftrace_list_end) { - /* silly alpha */ - read_barrier_depends(); op->func(ip, parent_ip); - op = op->next; + op = rcu_dereference_raw(op->next); /*see above*/ }; } @@ -154,8 +159,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) * the ops->next pointer is valid before another CPU sees * the ops pointer included into the ftrace_list. */ - smp_wmb(); - ftrace_list = ops; + rcu_assign_pointer(ftrace_list, ops); if (ftrace_enabled) { ftrace_func_t func; -- cgit v1.2.3 From 007b09243b099811124f69d492adeebe9e439f96 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 5 Mar 2010 15:03:26 -0800 Subject: rcu: Increase RCU CPU stall timeouts if PROVE_RCU CONFIG_PROVE_RCU imposes additional overhead on the kernel, so increase the RCU CPU stall timeouts in an attempt to allow for this effect. Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <1267830207-9474-2-git-send-email-paulmck@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/rcutree.h | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 1439eb504c22..4a525a30e08e 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -246,12 +246,21 @@ struct rcu_data { #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ #ifdef CONFIG_RCU_CPU_STALL_DETECTOR -#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rsp->jiffies_stall */ -#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rsp->jiffies_stall */ -#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ - /* to take at least one */ - /* scheduling clock irq */ - /* before ratting on them. */ + +#ifdef CONFIG_PROVE_RCU +#define RCU_STALL_DELAY_DELTA (5 * HZ) +#else +#define RCU_STALL_DELAY_DELTA 0 +#endif + +#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ + RCU_STALL_DELAY_DELTA) + /* for rsp->jiffies_stall */ +#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA) + /* for rsp->jiffies_stall */ +#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ + /* to take at least one */ + /* scheduling clock irq */ + /* before ratting on them. */ #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ -- cgit v1.2.3 From ab3b3aa5dd01b3aaa6b15caee113b21b1b6520c4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 6 Mar 2010 14:17:52 +0300 Subject: sched: Cleanup: remove unused variable in try_to_wake_up() We haven't used the "orig_rq" variable since 055a00865d "Fix/add missing update_rq_clock() calls" Signed-off-by: Dan Carpenter Cc: Peter Zijlstra Cc: Andreas Herrmann Cc: Gautham R Shenoy Cc: efault@gmx.de LKML-Reference: <20100306111752.GL4958@bicker> Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6a212c97f523..2c1db81f80eb 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2359,7 +2359,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, { int cpu, orig_cpu, this_cpu, success = 0; unsigned long flags; - struct rq *rq, *orig_rq; + struct rq *rq; if (!sched_feat(SYNC_WAKEUPS)) wake_flags &= ~WF_SYNC; @@ -2367,7 +2367,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, this_cpu = get_cpu(); smp_wmb(); - rq = orig_rq = task_rq_lock(p, &flags); + rq = task_rq_lock(p, &flags); update_rq_clock(rq); if (!(p->state & state)) goto out; -- cgit v1.2.3 From 3d07467b7aa91623b31d7b5888a123a2c8c8e9cc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Mar 2010 17:07:24 +0100 Subject: sched: Fix pick_next_highest_task_rt() for cgroups Since pick_next_highest_task_rt() already iterates all the cgroups and is really only interested in tasks, skip over the !task entries. Reported-by: Dhaval Giani Signed-off-by: Peter Zijlstra Tested-by: Dhaval Giani LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_rt.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index bf3e38fdbe6d..c4fb42a66cab 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1146,7 +1146,12 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) if (next && next->prio < idx) continue; list_for_each_entry(rt_se, array->queue + idx, run_list) { - struct task_struct *p = rt_task_of(rt_se); + struct task_struct *p; + + if (!rt_entity_is_task(rt_se)) + continue; + + p = rt_task_of(rt_se); if (pick_rt_task(rq, p, cpu)) { next = p; break; -- cgit v1.2.3 From baed7fc9b580bd3fb8252ff1d9b36eaf1f86b670 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 10 Mar 2010 15:21:18 -0800 Subject: Add generic sys_ipc wrapper Add a generic implementation of the ipc demultiplexer syscall. Except for s390 and sparc64 all implementations of the sys_ipc are nearly identical. There are slight differences in the types of the parameters, where mips and powerpc as the only 64-bit architectures with sys_ipc use unsigned long for the "third" argument as it gets casted to a pointer later, while it traditionally is an "int" like most other paramters. frv goes even further and uses unsigned long for all parameters execept for "ptr" which is a pointer type everywhere. The change from int to unsigned long for "third" and back to "int" for the others on frv should be fine due to the in-register calling conventions for syscalls (we already had a similar issue with the generic sys_ptrace), but I'd prefer to have the arch maintainers looks over this in details. Except for that h8300, m68k and m68knommu lack an impplementation of the semtimedop sub call which this patch adds, and various architectures have gets used - at least on i386 it seems superflous as the compat code on x86-64 and ia64 doesn't even bother to implement it. [akpm@linux-foundation.org: add sys_ipc to sys_ni.c] Signed-off-by: Christoph Hellwig Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mundt Cc: Jeff Dike Cc: Hirokazu Takata Cc: Thomas Gleixner Cc: Ingo Molnar Reviewed-by: H. Peter Anvin Cc: Al Viro Cc: Arnd Bergmann Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: "Luck, Tony" Cc: James Morris Cc: Andreas Schwab Acked-by: Jesper Nilsson Acked-by: Russell King Acked-by: David Howells Acked-by: Kyle McMartin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/unistd.h | 1 + arch/arm/kernel/sys_arm.c | 82 -------------------------- arch/arm/kernel/sys_oabi-compat.c | 3 - arch/cris/include/asm/unistd.h | 1 + arch/cris/kernel/sys_cris.c | 78 ------------------------- arch/frv/include/asm/unistd.h | 1 + arch/frv/kernel/sys_frv.c | 89 ---------------------------- arch/h8300/include/asm/unistd.h | 1 + arch/h8300/kernel/sys_h8300.c | 88 ---------------------------- arch/m32r/include/asm/unistd.h | 1 + arch/m32r/kernel/sys_m32r.c | 81 -------------------------- arch/m68k/include/asm/unistd.h | 1 + arch/m68k/kernel/sys_m68k.c | 81 -------------------------- arch/m68knommu/kernel/sys_m68k.c | 86 --------------------------- arch/mips/include/asm/unistd.h | 1 + arch/mips/kernel/syscall.c | 88 ---------------------------- arch/mn10300/include/asm/unistd.h | 1 + arch/mn10300/kernel/sys_mn10300.c | 88 ---------------------------- arch/powerpc/include/asm/syscalls.h | 2 - arch/powerpc/include/asm/unistd.h | 1 + arch/powerpc/kernel/syscalls.c | 94 ------------------------------ arch/s390/kernel/entry.h | 2 +- arch/s390/kernel/sys_s390.c | 2 +- arch/s390/kernel/syscalls.S | 2 +- arch/sh/include/asm/syscalls.h | 2 - arch/sh/include/asm/unistd_32.h | 1 + arch/sh/include/asm/unistd_64.h | 1 + arch/sh/kernel/sys_sh.c | 104 --------------------------------- arch/sparc/include/asm/unistd.h | 4 +- arch/sparc/kernel/sys_sparc_32.c | 113 ------------------------------------ arch/sparc/kernel/sys_sparc_64.c | 2 +- arch/sparc/kernel/systbls.h | 2 +- arch/sparc/kernel/systbls_64.S | 2 +- arch/um/sys-i386/syscalls.c | 86 --------------------------- arch/x86/include/asm/syscalls.h | 1 - arch/x86/include/asm/unistd_32.h | 1 + arch/x86/kernel/sys_i386_32.c | 85 --------------------------- include/linux/syscalls.h | 2 + ipc/Makefile | 2 +- ipc/syscall.c | 99 +++++++++++++++++++++++++++++++ kernel/sys_ni.c | 1 + 41 files changed, 124 insertions(+), 1259 deletions(-) create mode 100644 ipc/syscall.c (limited to 'kernel') diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h index e6eeb2d29953..dd2bf53000fe 100644 --- a/arch/arm/include/asm/unistd.h +++ b/arch/arm/include/asm/unistd.h @@ -448,6 +448,7 @@ #if !defined(CONFIG_AEABI) || defined(CONFIG_OABI_COMPAT) #define __ARCH_WANT_SYS_TIME +#define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_OLDUMOUNT #define __ARCH_WANT_SYS_ALARM #define __ARCH_WANT_SYS_UTIME diff --git a/arch/arm/kernel/sys_arm.c b/arch/arm/kernel/sys_arm.c index a2e0e6f2ea7f..4350f75e578c 100644 --- a/arch/arm/kernel/sys_arm.c +++ b/arch/arm/kernel/sys_arm.c @@ -28,88 +28,6 @@ #include #include -#if !defined(CONFIG_AEABI) || defined(CONFIG_OABI_COMPAT) -/* - * sys_ipc() is the de-multiplexer for the SysV IPC calls.. - * - * This is really horribly ugly. - */ -asmlinkage int sys_ipc(uint call, int first, int second, int third, - void __user *ptr, long fifth) -{ - int version, ret; - - version = call >> 16; /* hack for backward compatibility */ - call &= 0xffff; - - switch (call) { - case SEMOP: - return sys_semtimedop (first, (struct sembuf __user *)ptr, second, NULL); - case SEMTIMEDOP: - return sys_semtimedop(first, (struct sembuf __user *)ptr, second, - (const struct timespec __user *)fifth); - - case SEMGET: - return sys_semget (first, second, third); - case SEMCTL: { - union semun fourth; - if (!ptr) - return -EINVAL; - if (get_user(fourth.__pad, (void __user * __user *) ptr)) - return -EFAULT; - return sys_semctl (first, second, third, fourth); - } - - case MSGSND: - return sys_msgsnd(first, (struct msgbuf __user *) ptr, - second, third); - case MSGRCV: - switch (version) { - case 0: { - struct ipc_kludge tmp; - if (!ptr) - return -EINVAL; - if (copy_from_user(&tmp,(struct ipc_kludge __user *)ptr, - sizeof (tmp))) - return -EFAULT; - return sys_msgrcv (first, tmp.msgp, second, - tmp.msgtyp, third); - } - default: - return sys_msgrcv (first, - (struct msgbuf __user *) ptr, - second, fifth, third); - } - case MSGGET: - return sys_msgget ((key_t) first, second); - case MSGCTL: - return sys_msgctl(first, second, (struct msqid_ds __user *)ptr); - - case SHMAT: - switch (version) { - default: { - ulong raddr; - ret = do_shmat(first, (char __user *)ptr, second, &raddr); - if (ret) - return ret; - return put_user(raddr, (ulong __user *)third); - } - case 1: /* Of course, we don't support iBCS2! */ - return -EINVAL; - } - case SHMDT: - return sys_shmdt ((char __user *)ptr); - case SHMGET: - return sys_shmget (first, second, third); - case SHMCTL: - return sys_shmctl (first, second, - (struct shmid_ds __user *) ptr); - default: - return -ENOSYS; - } -} -#endif - /* Fork a new task - this creates a new program thread. * This is called indirectly via a small wrapper */ diff --git a/arch/arm/kernel/sys_oabi-compat.c b/arch/arm/kernel/sys_oabi-compat.c index d59a0cd537f0..33ff678e32f2 100644 --- a/arch/arm/kernel/sys_oabi-compat.c +++ b/arch/arm/kernel/sys_oabi-compat.c @@ -346,9 +346,6 @@ asmlinkage long sys_oabi_semop(int semid, struct oabi_sembuf __user *tsops, return sys_oabi_semtimedop(semid, tsops, nsops, NULL); } -extern asmlinkage int sys_ipc(uint call, int first, int second, int third, - void __user *ptr, long fifth); - asmlinkage int sys_oabi_ipc(uint call, int first, int second, int third, void __user *ptr, long fifth) { diff --git a/arch/cris/include/asm/unistd.h b/arch/cris/include/asm/unistd.h index 8cffd22623fd..f6fad83b3a8c 100644 --- a/arch/cris/include/asm/unistd.h +++ b/arch/cris/include/asm/unistd.h @@ -352,6 +352,7 @@ #define __ARCH_WANT_STAT64 #define __ARCH_WANT_SYS_ALARM #define __ARCH_WANT_SYS_GETHOSTNAME +#define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE #define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_SIGNAL diff --git a/arch/cris/kernel/sys_cris.c b/arch/cris/kernel/sys_cris.c index 22f9d6cd947f..7aa036ec78ff 100644 --- a/arch/cris/kernel/sys_cris.c +++ b/arch/cris/kernel/sys_cris.c @@ -33,81 +33,3 @@ sys_mmap2(unsigned long addr, unsigned long len, unsigned long prot, /* bug(?): 8Kb pages here */ return sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); } - -/* - * sys_ipc() is the de-multiplexer for the SysV IPC calls.. - * - * This is really horribly ugly. (same as arch/i386) - */ - -asmlinkage int sys_ipc (uint call, int first, int second, - int third, void __user *ptr, long fifth) -{ - int version, ret; - - version = call >> 16; /* hack for backward compatibility */ - call &= 0xffff; - - switch (call) { - case SEMOP: - return sys_semtimedop (first, (struct sembuf __user *)ptr, second, NULL); - case SEMTIMEDOP: - return sys_semtimedop(first, (struct sembuf __user *)ptr, second, - (const struct timespec __user *)fifth); - - case SEMGET: - return sys_semget (first, second, third); - case SEMCTL: { - union semun fourth; - if (!ptr) - return -EINVAL; - if (get_user(fourth.__pad, (void * __user *) ptr)) - return -EFAULT; - return sys_semctl (first, second, third, fourth); - } - - case MSGSND: - return sys_msgsnd (first, (struct msgbuf __user *) ptr, - second, third); - case MSGRCV: - switch (version) { - case 0: { - struct ipc_kludge tmp; - if (!ptr) - return -EINVAL; - - if (copy_from_user(&tmp, - (struct ipc_kludge __user *) ptr, - sizeof (tmp))) - return -EFAULT; - return sys_msgrcv (first, tmp.msgp, second, - tmp.msgtyp, third); - } - default: - return sys_msgrcv (first, - (struct msgbuf __user *) ptr, - second, fifth, third); - } - case MSGGET: - return sys_msgget ((key_t) first, second); - case MSGCTL: - return sys_msgctl (first, second, (struct msqid_ds __user *) ptr); - - case SHMAT: { - ulong raddr; - ret = do_shmat (first, (char __user *) ptr, second, &raddr); - if (ret) - return ret; - return put_user (raddr, (ulong __user *) third); - } - case SHMDT: - return sys_shmdt ((char __user *)ptr); - case SHMGET: - return sys_shmget (first, second, third); - case SHMCTL: - return sys_shmctl (first, second, - (struct shmid_ds __user *) ptr); - default: - return -ENOSYS; - } -} diff --git a/arch/frv/include/asm/unistd.h b/arch/frv/include/asm/unistd.h index be6ef0f5cd42..b28da499e22a 100644 --- a/arch/frv/include/asm/unistd.h +++ b/arch/frv/include/asm/unistd.h @@ -354,6 +354,7 @@ #define __ARCH_WANT_STAT64 #define __ARCH_WANT_SYS_ALARM /* #define __ARCH_WANT_SYS_GETHOSTNAME */ +#define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE /* #define __ARCH_WANT_SYS_SGETMASK */ /* #define __ARCH_WANT_SYS_SIGNAL */ diff --git a/arch/frv/kernel/sys_frv.c b/arch/frv/kernel/sys_frv.c index 1d3d4c9e2521..9c4980825bbb 100644 --- a/arch/frv/kernel/sys_frv.c +++ b/arch/frv/kernel/sys_frv.c @@ -42,92 +42,3 @@ asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, return sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff >> (PAGE_SHIFT - 12)); } - -/* - * sys_ipc() is the de-multiplexer for the SysV IPC calls.. - * - * This is really horribly ugly. - */ -asmlinkage long sys_ipc(unsigned long call, - unsigned long first, - unsigned long second, - unsigned long third, - void __user *ptr, - unsigned long fifth) -{ - int version, ret; - - version = call >> 16; /* hack for backward compatibility */ - call &= 0xffff; - - switch (call) { - case SEMOP: - return sys_semtimedop(first, (struct sembuf __user *)ptr, second, NULL); - case SEMTIMEDOP: - return sys_semtimedop(first, (struct sembuf __user *)ptr, second, - (const struct timespec __user *)fifth); - - case SEMGET: - return sys_semget (first, second, third); - case SEMCTL: { - union semun fourth; - if (!ptr) - return -EINVAL; - if (get_user(fourth.__pad, (void * __user *) ptr)) - return -EFAULT; - return sys_semctl (first, second, third, fourth); - } - - case MSGSND: - return sys_msgsnd (first, (struct msgbuf __user *) ptr, - second, third); - case MSGRCV: - switch (version) { - case 0: { - struct ipc_kludge tmp; - if (!ptr) - return -EINVAL; - - if (copy_from_user(&tmp, - (struct ipc_kludge __user *) ptr, - sizeof (tmp))) - return -EFAULT; - return sys_msgrcv (first, tmp.msgp, second, - tmp.msgtyp, third); - } - default: - return sys_msgrcv (first, - (struct msgbuf __user *) ptr, - second, fifth, third); - } - case MSGGET: - return sys_msgget ((key_t) first, second); - case MSGCTL: - return sys_msgctl (first, second, (struct msqid_ds __user *) ptr); - - case SHMAT: - switch (version) { - default: { - ulong raddr; - ret = do_shmat (first, (char __user *) ptr, second, &raddr); - if (ret) - return ret; - return put_user (raddr, (ulong __user *) third); - } - case 1: /* iBCS2 emulator entry point */ - if (!segment_eq(get_fs(), get_ds())) - return -EINVAL; - /* The "(ulong *) third" is valid _only_ because of the kernel segment thing */ - return do_shmat (first, (char __user *) ptr, second, (ulong *) third); - } - case SHMDT: - return sys_shmdt ((char __user *)ptr); - case SHMGET: - return sys_shmget (first, second, third); - case SHMCTL: - return sys_shmctl (first, second, - (struct shmid_ds __user *) ptr); - default: - return -ENOSYS; - } -} diff --git a/arch/h8300/include/asm/unistd.h b/arch/h8300/include/asm/unistd.h index 54dab4726954..50f2c5a36591 100644 --- a/arch/h8300/include/asm/unistd.h +++ b/arch/h8300/include/asm/unistd.h @@ -336,6 +336,7 @@ #define __ARCH_WANT_STAT64 #define __ARCH_WANT_SYS_ALARM #define __ARCH_WANT_SYS_GETHOSTNAME +#define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE #define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_SIGNAL diff --git a/arch/h8300/kernel/sys_h8300.c b/arch/h8300/kernel/sys_h8300.c index 1f13fd6e5309..f9b3f44da69f 100644 --- a/arch/h8300/kernel/sys_h8300.c +++ b/arch/h8300/kernel/sys_h8300.c @@ -26,94 +26,6 @@ #include #include -/* - * sys_ipc() is the de-multiplexer for the SysV IPC calls.. - * - * This is really horribly ugly. - */ -asmlinkage int sys_ipc (uint call, int first, int second, - int third, void *ptr, long fifth) -{ - int version, ret; - - version = call >> 16; /* hack for backward compatibility */ - call &= 0xffff; - - if (call <= SEMCTL) - switch (call) { - case SEMOP: - return sys_semop (first, (struct sembuf *)ptr, second); - case SEMGET: - return sys_semget (first, second, third); - case SEMCTL: { - union semun fourth; - if (!ptr) - return -EINVAL; - if (get_user(fourth.__pad, (void **) ptr)) - return -EFAULT; - return sys_semctl (first, second, third, fourth); - } - default: - return -EINVAL; - } - if (call <= MSGCTL) - switch (call) { - case MSGSND: - return sys_msgsnd (first, (struct msgbuf *) ptr, - second, third); - case MSGRCV: - switch (version) { - case 0: { - struct ipc_kludge tmp; - if (!ptr) - return -EINVAL; - if (copy_from_user (&tmp, - (struct ipc_kludge *)ptr, - sizeof (tmp))) - return -EFAULT; - return sys_msgrcv (first, tmp.msgp, second, - tmp.msgtyp, third); - } - default: - return sys_msgrcv (first, - (struct msgbuf *) ptr, - second, fifth, third); - } - case MSGGET: - return sys_msgget ((key_t) first, second); - case MSGCTL: - return sys_msgctl (first, second, - (struct msqid_ds *) ptr); - default: - return -EINVAL; - } - if (call <= SHMCTL) - switch (call) { - case SHMAT: - switch (version) { - default: { - ulong raddr; - ret = do_shmat (first, (char *) ptr, - second, &raddr); - if (ret) - return ret; - return put_user (raddr, (ulong *) third); - } - } - case SHMDT: - return sys_shmdt ((char *)ptr); - case SHMGET: - return sys_shmget (first, second, third); - case SHMCTL: - return sys_shmctl (first, second, - (struct shmid_ds *) ptr); - default: - return -EINVAL; - } - - return -EINVAL; -} - /* sys_cacheflush -- no support. */ asmlinkage int sys_cacheflush (unsigned long addr, int scope, int cache, unsigned long len) diff --git a/arch/m32r/include/asm/unistd.h b/arch/m32r/include/asm/unistd.h index cf701c933249..76125777483c 100644 --- a/arch/m32r/include/asm/unistd.h +++ b/arch/m32r/include/asm/unistd.h @@ -339,6 +339,7 @@ #define __ARCH_WANT_STAT64 #define __ARCH_WANT_SYS_ALARM #define __ARCH_WANT_SYS_GETHOSTNAME +#define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_UTIME diff --git a/arch/m32r/kernel/sys_m32r.c b/arch/m32r/kernel/sys_m32r.c index d3c865c5a6ba..cf2e7279ce9b 100644 --- a/arch/m32r/kernel/sys_m32r.c +++ b/arch/m32r/kernel/sys_m32r.c @@ -76,87 +76,6 @@ asmlinkage int sys_tas(int __user *addr) return oldval; } -/* - * sys_ipc() is the de-multiplexer for the SysV IPC calls.. - * - * This is really horribly ugly. - */ -asmlinkage int sys_ipc(uint call, int first, int second, - int third, void __user *ptr, long fifth) -{ - int version, ret; - - version = call >> 16; /* hack for backward compatibility */ - call &= 0xffff; - - switch (call) { - case SEMOP: - return sys_semtimedop(first, (struct sembuf __user *)ptr, - second, NULL); - case SEMTIMEDOP: - return sys_semtimedop(first, (struct sembuf __user *)ptr, - second, (const struct timespec __user *)fifth); - case SEMGET: - return sys_semget (first, second, third); - case SEMCTL: { - union semun fourth; - if (!ptr) - return -EINVAL; - if (get_user(fourth.__pad, (void __user * __user *) ptr)) - return -EFAULT; - return sys_semctl (first, second, third, fourth); - } - - case MSGSND: - return sys_msgsnd (first, (struct msgbuf __user *) ptr, - second, third); - case MSGRCV: - switch (version) { - case 0: { - struct ipc_kludge tmp; - if (!ptr) - return -EINVAL; - - if (copy_from_user(&tmp, - (struct ipc_kludge __user *) ptr, - sizeof (tmp))) - return -EFAULT; - return sys_msgrcv (first, tmp.msgp, second, - tmp.msgtyp, third); - } - default: - return sys_msgrcv (first, - (struct msgbuf __user *) ptr, - second, fifth, third); - } - case MSGGET: - return sys_msgget ((key_t) first, second); - case MSGCTL: - return sys_msgctl (first, second, - (struct msqid_ds __user *) ptr); - case SHMAT: { - ulong raddr; - - if (!access_ok(VERIFY_WRITE, (ulong __user *) third, - sizeof(ulong))) - return -EFAULT; - ret = do_shmat (first, (char __user *) ptr, second, &raddr); - if (ret) - return ret; - return put_user (raddr, (ulong __user *) third); - } - case SHMDT: - return sys_shmdt ((char __user *)ptr); - case SHMGET: - return sys_shmget (first, second, third); - case SHMCTL: - return sys_shmctl (first, second, - (struct shmid_ds __user *) ptr); - default: - return -ENOSYS; - } -} - asmlinkage int sys_uname(struct old_utsname __user * name) { int err; diff --git a/arch/m68k/include/asm/unistd.h b/arch/m68k/include/asm/unistd.h index d801154310ea..60b15d0aa072 100644 --- a/arch/m68k/include/asm/unistd.h +++ b/arch/m68k/include/asm/unistd.h @@ -351,6 +351,7 @@ #define __ARCH_WANT_STAT64 #define __ARCH_WANT_SYS_ALARM #define __ARCH_WANT_SYS_GETHOSTNAME +#define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE #define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_SIGNAL diff --git a/arch/m68k/kernel/sys_m68k.c b/arch/m68k/kernel/sys_m68k.c index 7b309e7b6cef..77896692eb0a 100644 --- a/arch/m68k/kernel/sys_m68k.c +++ b/arch/m68k/kernel/sys_m68k.c @@ -46,87 +46,6 @@ asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, return sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); } -/* - * sys_ipc() is the de-multiplexer for the SysV IPC calls.. - * - * This is really horribly ugly. - */ -asmlinkage int sys_ipc (uint call, int first, int second, - int third, void __user *ptr, long fifth) -{ - int version, ret; - - version = call >> 16; /* hack for backward compatibility */ - call &= 0xffff; - - if (call <= SEMCTL) - switch (call) { - case SEMOP: - return sys_semop (first, ptr, second); - case SEMGET: - return sys_semget (first, second, third); - case SEMCTL: { - union semun fourth; - if (!ptr) - return -EINVAL; - if (get_user(fourth.__pad, (void __user *__user *) ptr)) - return -EFAULT; - return sys_semctl (first, second, third, fourth); - } - default: - return -ENOSYS; - } - if (call <= MSGCTL) - switch (call) { - case MSGSND: - return sys_msgsnd (first, ptr, second, third); - case MSGRCV: - switch (version) { - case 0: { - struct ipc_kludge tmp; - if (!ptr) - return -EINVAL; - if (copy_from_user (&tmp, ptr, sizeof (tmp))) - return -EFAULT; - return sys_msgrcv (first, tmp.msgp, second, - tmp.msgtyp, third); - } - default: - return sys_msgrcv (first, ptr, - second, fifth, third); - } - case MSGGET: - return sys_msgget ((key_t) first, second); - case MSGCTL: - return sys_msgctl (first, second, ptr); - default: - return -ENOSYS; - } - if (call <= SHMCTL) - switch (call) { - case SHMAT: - switch (version) { - default: { - ulong raddr; - ret = do_shmat (first, ptr, second, &raddr); - if (ret) - return ret; - return put_user (raddr, (ulong __user *) third); - } - } - case SHMDT: - return sys_shmdt (ptr); - case SHMGET: - return sys_shmget (first, second, third); - case SHMCTL: - return sys_shmctl (first, second, ptr); - default: - return -ENOSYS; - } - - return -EINVAL; -} - /* Convert virtual (user) address VADDR to physical address PADDR */ #define virt_to_phys_040(vaddr) \ ({ \ diff --git a/arch/m68knommu/kernel/sys_m68k.c b/arch/m68knommu/kernel/sys_m68k.c index 3e371cc9fd91..d65e9c4c930c 100644 --- a/arch/m68knommu/kernel/sys_m68k.c +++ b/arch/m68knommu/kernel/sys_m68k.c @@ -27,92 +27,6 @@ #include #include -/* - * sys_ipc() is the de-multiplexer for the SysV IPC calls.. - * - * This is really horribly ugly. - */ -asmlinkage int sys_ipc (uint call, int first, int second, - int third, void *ptr, long fifth) -{ - int version, ret; - - version = call >> 16; /* hack for backward compatibility */ - call &= 0xffff; - - if (call <= SEMCTL) - switch (call) { - case SEMOP: - return sys_semop (first, (struct sembuf *)ptr, second); - case SEMGET: - return sys_semget (first, second, third); - case SEMCTL: { - union semun fourth; - if (!ptr) - return -EINVAL; - if (get_user(fourth.__pad, (void **) ptr)) - return -EFAULT; - return sys_semctl (first, second, third, fourth); - } - default: - return -EINVAL; - } - if (call <= MSGCTL) - switch (call) { - case MSGSND: - return sys_msgsnd (first, (struct msgbuf *) ptr, - second, third); - case MSGRCV: - switch (version) { - case 0: { - struct ipc_kludge tmp; - if (!ptr) - return -EINVAL; - if (copy_from_user (&tmp, - (struct ipc_kludge *)ptr, - sizeof (tmp))) - return -EFAULT; - return sys_msgrcv (first, tmp.msgp, second, - tmp.msgtyp, third); - } - default: - return sys_msgrcv (first, - (struct msgbuf *) ptr, - second, fifth, third); - } - case MSGGET: - return sys_msgget ((key_t) first, second); - case MSGCTL: - return sys_msgctl (first, second, - (struct msqid_ds *) ptr); - default: - return -EINVAL; - } - if (call <= SHMCTL) - switch (call) { - case SHMAT: - switch (version) { - default: { - ulong raddr; - ret = do_shmat (first, ptr, second, &raddr); - if (ret) - return ret; - return put_user (raddr, (ulong __user *) third); - } - } - case SHMDT: - return sys_shmdt (ptr); - case SHMGET: - return sys_shmget (first, second, third); - case SHMCTL: - return sys_shmctl (first, second, ptr); - default: - return -ENOSYS; - } - - return -EINVAL; -} - /* sys_cacheflush -- flush (part of) the processor cache. */ asmlinkage int sys_cacheflush (unsigned long addr, int scope, int cache, unsigned long len) diff --git a/arch/mips/include/asm/unistd.h b/arch/mips/include/asm/unistd.h index 65c679ecbe6b..97fe472095f2 100644 --- a/arch/mips/include/asm/unistd.h +++ b/arch/mips/include/asm/unistd.h @@ -1004,6 +1004,7 @@ #define __ARCH_WANT_OLD_READDIR #define __ARCH_WANT_SYS_ALARM #define __ARCH_WANT_SYS_GETHOSTNAME +#define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE #define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_UTIME diff --git a/arch/mips/kernel/syscall.c b/arch/mips/kernel/syscall.c index 3f7f466190b4..257bf0141775 100644 --- a/arch/mips/kernel/syscall.c +++ b/arch/mips/kernel/syscall.c @@ -406,94 +406,6 @@ _sys_sysmips(nabi_no_regargs struct pt_regs regs) return -EINVAL; } -/* - * sys_ipc() is the de-multiplexer for the SysV IPC calls.. - * - * This is really horribly ugly. - */ -SYSCALL_DEFINE6(ipc, unsigned int, call, int, first, int, second, - unsigned long, third, void __user *, ptr, long, fifth) -{ - int version, ret; - - version = call >> 16; /* hack for backward compatibility */ - call &= 0xffff; - - switch (call) { - case SEMOP: - return sys_semtimedop(first, (struct sembuf __user *)ptr, - second, NULL); - case SEMTIMEDOP: - return sys_semtimedop(first, (struct sembuf __user *)ptr, - second, - (const struct timespec __user *)fifth); - case SEMGET: - return sys_semget(first, second, third); - case SEMCTL: { - union semun fourth; - if (!ptr) - return -EINVAL; - if (get_user(fourth.__pad, (void __user *__user *) ptr)) - return -EFAULT; - return sys_semctl(first, second, third, fourth); - } - - case MSGSND: - return sys_msgsnd(first, (struct msgbuf __user *) ptr, - second, third); - case MSGRCV: - switch (version) { - case 0: { - struct ipc_kludge tmp; - if (!ptr) - return -EINVAL; - - if (copy_from_user(&tmp, - (struct ipc_kludge __user *) ptr, - sizeof(tmp))) - return -EFAULT; - return sys_msgrcv(first, tmp.msgp, second, - tmp.msgtyp, third); - } - default: - return sys_msgrcv(first, - (struct msgbuf __user *) ptr, - second, fifth, third); - } - case MSGGET: - return sys_msgget((key_t) first, second); - case MSGCTL: - return sys_msgctl(first, second, - (struct msqid_ds __user *) ptr); - - case SHMAT: - switch (version) { - default: { - unsigned long raddr; - ret = do_shmat(first, (char __user *) ptr, second, - &raddr); - if (ret) - return ret; - return put_user(raddr, (unsigned long __user *) third); - } - case 1: /* iBCS2 emulator entry point */ - if (!segment_eq(get_fs(), get_ds())) - return -EINVAL; - return do_shmat(first, (char __user *) ptr, second, - (unsigned long *) third); - } - case SHMDT: - return sys_shmdt((char __user *)ptr); - case SHMGET: - return sys_shmget(first, second, third); - case SHMCTL: - return sys_shmctl(first, second, - (struct shmid_ds __user *) ptr); - default: - return -ENOSYS; - } -} - /* * No implemented yet ... */ diff --git a/arch/mn10300/include/asm/unistd.h b/arch/mn10300/include/asm/unistd.h index d13a56e99bad..9d056f515929 100644 --- a/arch/mn10300/include/asm/unistd.h +++ b/arch/mn10300/include/asm/unistd.h @@ -363,6 +363,7 @@ #define __ARCH_WANT_STAT64 #define __ARCH_WANT_SYS_ALARM #define __ARCH_WANT_SYS_GETHOSTNAME +#define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE #define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_SIGNAL diff --git a/arch/mn10300/kernel/sys_mn10300.c b/arch/mn10300/kernel/sys_mn10300.c index bef69d6daf15..815f1355fad4 100644 --- a/arch/mn10300/kernel/sys_mn10300.c +++ b/arch/mn10300/kernel/sys_mn10300.c @@ -31,91 +31,3 @@ asmlinkage long old_mmap(unsigned long addr, unsigned long len, return -EINVAL; return sys_mmap_pgoff(addr, len, prot, flags, fd, offset >> PAGE_SHIFT); } - -/* - * sys_ipc() is the de-multiplexer for the SysV IPC calls.. - * - * This is really horribly ugly. - */ -asmlinkage long sys_ipc(uint call, int first, int second, - int third, void __user *ptr, long fifth) -{ - int version, ret; - - version = call >> 16; /* hack for backward compatibility */ - call &= 0xffff; - - switch (call) { - case SEMOP: - return sys_semtimedop(first, (struct sembuf __user *)ptr, - second, NULL); - case SEMTIMEDOP: - return sys_semtimedop(first, (struct sembuf __user *)ptr, - second, - (const struct timespec __user *)fifth); - case SEMGET: - return sys_semget(first, second, third); - case SEMCTL: { - union semun fourth; - if (!ptr) - return -EINVAL; - if (get_user(fourth.__pad, (void __user * __user *) ptr)) - return -EFAULT; - return sys_semctl(first, second, third, fourth); - } - - case MSGSND: - return sys_msgsnd(first, (struct msgbuf __user *) ptr, - second, third); - case MSGRCV: - switch (version) { - case 0: { - struct ipc_kludge tmp; - if (!ptr) - return -EINVAL; - - if (copy_from_user(&tmp, - (struct ipc_kludge __user *) ptr, - sizeof(tmp))) - return -EFAULT; - return sys_msgrcv(first, tmp.msgp, second, - tmp.msgtyp, third); - } - default: - return sys_msgrcv(first, - (struct msgbuf __user *) ptr, - second, fifth, third); - } - case MSGGET: - return sys_msgget((key_t) first, second); - case MSGCTL: - return sys_msgctl(first, second, - (struct msqid_ds __user *) ptr); - - case SHMAT: - switch (version) { - default: { - ulong raddr; - ret = do_shmat(first, (char __user *) ptr, second, - &raddr); - if (ret) - return ret; - return put_user(raddr, (ulong *) third); - } - case 1: /* iBCS2 emulator entry point */ - if (!segment_eq(get_fs(), get_ds())) - return -EINVAL; - return do_shmat(first, (char __user *) ptr, second, - (ulong *) third); - } - case SHMDT: - return sys_shmdt((char __user *)ptr); - case SHMGET: - return sys_shmget(first, second, third); - case SHMCTL: - return sys_shmctl(first, second, - (struct shmid_ds __user *) ptr); - default: - return -EINVAL; - } -} diff --git a/arch/powerpc/include/asm/syscalls.h b/arch/powerpc/include/asm/syscalls.h index eb8eb400c664..23bb74e7f946 100644 --- a/arch/powerpc/include/asm/syscalls.h +++ b/arch/powerpc/include/asm/syscalls.h @@ -35,8 +35,6 @@ asmlinkage long sys_pipe2(int __user *fildes, int flags); asmlinkage long sys_rt_sigaction(int sig, const struct sigaction __user *act, struct sigaction __user *oact, size_t sigsetsize); -asmlinkage int sys_ipc(uint call, int first, unsigned long second, - long third, void __user *ptr, long fifth); asmlinkage long ppc64_personality(unsigned long personality); asmlinkage int ppc_rtas(struct rtas_args __user *uargs); asmlinkage time_t sys64_time(time_t __user * tloc); diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index f6ca76176766..c13821fe8741 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -364,6 +364,7 @@ #define __ARCH_WANT_STAT64 #define __ARCH_WANT_SYS_ALARM #define __ARCH_WANT_SYS_GETHOSTNAME +#define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE #define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_SIGNAL diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c index 3370e62e43d4..5251221e7a5a 100644 --- a/arch/powerpc/kernel/syscalls.c +++ b/arch/powerpc/kernel/syscalls.c @@ -42,100 +42,6 @@ #include #include -/* - * sys_ipc() is the de-multiplexer for the SysV IPC calls.. - * - * This is really horribly ugly. - */ -int sys_ipc(uint call, int first, unsigned long second, long third, - void __user *ptr, long fifth) -{ - int version, ret; - - version = call >> 16; /* hack for backward compatibility */ - call &= 0xffff; - - ret = -ENOSYS; - switch (call) { - case SEMOP: - ret = sys_semtimedop(first, (struct sembuf __user *)ptr, - (unsigned)second, NULL); - break; - case SEMTIMEDOP: - ret = sys_semtimedop(first, (struct sembuf __user *)ptr, - (unsigned)second, - (const struct timespec __user *) fifth); - break; - case SEMGET: - ret = sys_semget (first, (int)second, third); - break; - case SEMCTL: { - union semun fourth; - - ret = -EINVAL; - if (!ptr) - break; - if ((ret = get_user(fourth.__pad, (void __user * __user *)ptr))) - break; - ret = sys_semctl(first, (int)second, third, fourth); - break; - } - case MSGSND: - ret = sys_msgsnd(first, (struct msgbuf __user *)ptr, - (size_t)second, third); - break; - case MSGRCV: - switch (version) { - case 0: { - struct ipc_kludge tmp; - - ret = -EINVAL; - if (!ptr) - break; - if ((ret = copy_from_user(&tmp, - (struct ipc_kludge __user *) ptr, - sizeof (tmp)) ? -EFAULT : 0)) - break; - ret = sys_msgrcv(first, tmp.msgp, (size_t) second, - tmp.msgtyp, third); - break; - } - default: - ret = sys_msgrcv (first, (struct msgbuf __user *) ptr, - (size_t)second, fifth, third); - break; - } - break; - case MSGGET: - ret = sys_msgget((key_t)first, (int)second); - break; - case MSGCTL: - ret = sys_msgctl(first, (int)second, - (struct msqid_ds __user *)ptr); - break; - case SHMAT: { - ulong raddr; - ret = do_shmat(first, (char __user *)ptr, (int)second, &raddr); - if (ret) - break; - ret = put_user(raddr, (ulong __user *) third); - break; - } - case SHMDT: - ret = sys_shmdt((char __user *)ptr); - break; - case SHMGET: - ret = sys_shmget(first, (size_t)second, third); - break; - case SHMCTL: - ret = sys_shmctl(first, (int)second, - (struct shmid_ds __user *)ptr); - break; - } - - return ret; -} - static inline unsigned long do_mmap2(unsigned long addr, size_t len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long off, int shift) diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h index 5de54d2af0b2..15fd68b196c0 100644 --- a/arch/s390/kernel/entry.h +++ b/arch/s390/kernel/entry.h @@ -30,7 +30,7 @@ struct fadvise64_64_args; struct old_sigaction; long sys_mmap2(struct s390_mmap_arg_struct __user *arg); -long sys_ipc(uint call, int first, unsigned long second, +long sys_s390_ipc(uint call, int first, unsigned long second, unsigned long third, void __user *ptr); long sys_s390_newuname(struct new_utsname __user *name); long sys_s390_personality(unsigned long personality); diff --git a/arch/s390/kernel/sys_s390.c b/arch/s390/kernel/sys_s390.c index b2563509b5a9..b8b78092ab7c 100644 --- a/arch/s390/kernel/sys_s390.c +++ b/arch/s390/kernel/sys_s390.c @@ -64,7 +64,7 @@ out: * * This is really horribly ugly. */ -SYSCALL_DEFINE5(ipc, uint, call, int, first, unsigned long, second, +SYSCALL_DEFINE5(s390_ipc, uint, call, int, first, unsigned long, second, unsigned long, third, void __user *, ptr) { struct ipc_kludge tmp; diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S index 2a24766567af..990ac8b321c8 100644 --- a/arch/s390/kernel/syscalls.S +++ b/arch/s390/kernel/syscalls.S @@ -125,7 +125,7 @@ NI_SYSCALL /* vm86old for i386 */ SYSCALL(sys_wait4,sys_wait4,compat_sys_wait4_wrapper) SYSCALL(sys_swapoff,sys_swapoff,sys32_swapoff_wrapper) /* 115 */ SYSCALL(sys_sysinfo,sys_sysinfo,compat_sys_sysinfo_wrapper) -SYSCALL(sys_ipc,sys_ipc,sys32_ipc_wrapper) +SYSCALL(sys_s390_ipc,sys_s390_ipc,sys32_ipc_wrapper) SYSCALL(sys_fsync,sys_fsync,sys32_fsync_wrapper) SYSCALL(sys_sigreturn,sys_sigreturn,sys32_sigreturn) SYSCALL(sys_clone,sys_clone,sys_clone_wrapper) /* 120 */ diff --git a/arch/sh/include/asm/syscalls.h b/arch/sh/include/asm/syscalls.h index c1e2b8deb837..c1ce2862f7be 100644 --- a/arch/sh/include/asm/syscalls.h +++ b/arch/sh/include/asm/syscalls.h @@ -11,8 +11,6 @@ asmlinkage int old_mmap(unsigned long addr, unsigned long len, asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff); -asmlinkage int sys_ipc(uint call, int first, int second, - int third, void __user *ptr, long fifth); asmlinkage int sys_uname(struct old_utsname __user *name); #ifdef CONFIG_SUPERH32 diff --git a/arch/sh/include/asm/unistd_32.h b/arch/sh/include/asm/unistd_32.h index 365744b05269..a48f65e2e429 100644 --- a/arch/sh/include/asm/unistd_32.h +++ b/arch/sh/include/asm/unistd_32.h @@ -358,6 +358,7 @@ #define __ARCH_WANT_STAT64 #define __ARCH_WANT_SYS_ALARM #define __ARCH_WANT_SYS_GETHOSTNAME +#define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE #define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_SIGNAL diff --git a/arch/sh/include/asm/unistd_64.h b/arch/sh/include/asm/unistd_64.h index 25de158aac3a..7709b2b8f752 100644 --- a/arch/sh/include/asm/unistd_64.h +++ b/arch/sh/include/asm/unistd_64.h @@ -398,6 +398,7 @@ #define __ARCH_WANT_STAT64 #define __ARCH_WANT_SYS_ALARM #define __ARCH_WANT_SYS_GETHOSTNAME +#define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE #define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_SIGNAL diff --git a/arch/sh/kernel/sys_sh.c b/arch/sh/kernel/sys_sh.c index 71399cde03b5..c18cfaa67fdd 100644 --- a/arch/sh/kernel/sys_sh.c +++ b/arch/sh/kernel/sys_sh.c @@ -53,110 +53,6 @@ asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, return sys_mmap_pgoff(addr, len, prot, flags, fd, pgoff); } -/* - * sys_ipc() is the de-multiplexer for the SysV IPC calls.. - * - * This is really horribly ugly. - */ -asmlinkage int sys_ipc(uint call, int first, int second, - int third, void __user *ptr, long fifth) -{ - int version, ret; - - version = call >> 16; /* hack for backward compatibility */ - call &= 0xffff; - - if (call <= SEMTIMEDOP) - switch (call) { - case SEMOP: - return sys_semtimedop(first, - (struct sembuf __user *)ptr, - second, NULL); - case SEMTIMEDOP: - return sys_semtimedop(first, - (struct sembuf __user *)ptr, second, - (const struct timespec __user *)fifth); - case SEMGET: - return sys_semget (first, second, third); - case SEMCTL: { - union semun fourth; - if (!ptr) - return -EINVAL; - if (get_user(fourth.__pad, (void __user * __user *) ptr)) - return -EFAULT; - return sys_semctl (first, second, third, fourth); - } - default: - return -EINVAL; - } - - if (call <= MSGCTL) - switch (call) { - case MSGSND: - return sys_msgsnd (first, (struct msgbuf __user *) ptr, - second, third); - case MSGRCV: - switch (version) { - case 0: - { - struct ipc_kludge tmp; - - if (!ptr) - return -EINVAL; - - if (copy_from_user(&tmp, - (struct ipc_kludge __user *) ptr, - sizeof (tmp))) - return -EFAULT; - - return sys_msgrcv (first, tmp.msgp, second, - tmp.msgtyp, third); - } - default: - return sys_msgrcv (first, - (struct msgbuf __user *) ptr, - second, fifth, third); - } - case MSGGET: - return sys_msgget ((key_t) first, second); - case MSGCTL: - return sys_msgctl (first, second, - (struct msqid_ds __user *) ptr); - default: - return -EINVAL; - } - if (call <= SHMCTL) - switch (call) { - case SHMAT: - switch (version) { - default: { - ulong raddr; - ret = do_shmat (first, (char __user *) ptr, - second, &raddr); - if (ret) - return ret; - return put_user (raddr, (ulong __user *) third); - } - case 1: /* iBCS2 emulator entry point */ - if (!segment_eq(get_fs(), get_ds())) - return -EINVAL; - return do_shmat (first, (char __user *) ptr, - second, (ulong *) third); - } - case SHMDT: - return sys_shmdt ((char __user *)ptr); - case SHMGET: - return sys_shmget (first, second, third); - case SHMCTL: - return sys_shmctl (first, second, - (struct shmid_ds __user *) ptr); - default: - return -EINVAL; - } - - return -EINVAL; -} - /* sys_cacheflush -- flush (part of) the processor cache. */ asmlinkage int sys_cacheflush(unsigned long addr, unsigned long len, int op) { diff --git a/arch/sparc/include/asm/unistd.h b/arch/sparc/include/asm/unistd.h index cb4b9bfd0d87..d0b3b01ac9d4 100644 --- a/arch/sparc/include/asm/unistd.h +++ b/arch/sparc/include/asm/unistd.h @@ -432,7 +432,9 @@ #define __ARCH_WANT_SYS_SIGPENDING #define __ARCH_WANT_SYS_SIGPROCMASK #define __ARCH_WANT_SYS_RT_SIGSUSPEND -#ifndef __32bit_syscall_numbers__ +#ifdef __32bit_syscall_numbers__ +#define __ARCH_WANT_SYS_IPC +#else #define __ARCH_WANT_COMPAT_SYS_TIME #define __ARCH_WANT_COMPAT_SYS_RT_SIGSUSPEND #endif diff --git a/arch/sparc/kernel/sys_sparc_32.c b/arch/sparc/kernel/sys_sparc_32.c index 3a82e65d8db2..ee995b7dae7e 100644 --- a/arch/sparc/kernel/sys_sparc_32.c +++ b/arch/sparc/kernel/sys_sparc_32.c @@ -98,119 +98,6 @@ out: return error; } -/* - * sys_ipc() is the de-multiplexer for the SysV IPC calls.. - * - * This is really horribly ugly. - */ - -asmlinkage int sys_ipc (uint call, int first, int second, int third, void __user *ptr, long fifth) -{ - int version, err; - - version = call >> 16; /* hack for backward compatibility */ - call &= 0xffff; - - if (call <= SEMCTL) - switch (call) { - case SEMOP: - err = sys_semtimedop (first, (struct sembuf __user *)ptr, second, NULL); - goto out; - case SEMTIMEDOP: - err = sys_semtimedop (first, (struct sembuf __user *)ptr, second, (const struct timespec __user *) fifth); - goto out; - case SEMGET: - err = sys_semget (first, second, third); - goto out; - case SEMCTL: { - union semun fourth; - err = -EINVAL; - if (!ptr) - goto out; - err = -EFAULT; - if (get_user(fourth.__pad, - (void __user * __user *)ptr)) - goto out; - err = sys_semctl (first, second, third, fourth); - goto out; - } - default: - err = -ENOSYS; - goto out; - } - if (call <= MSGCTL) - switch (call) { - case MSGSND: - err = sys_msgsnd (first, (struct msgbuf __user *) ptr, - second, third); - goto out; - case MSGRCV: - switch (version) { - case 0: { - struct ipc_kludge tmp; - err = -EINVAL; - if (!ptr) - goto out; - err = -EFAULT; - if (copy_from_user(&tmp, (struct ipc_kludge __user *) ptr, sizeof (tmp))) - goto out; - err = sys_msgrcv (first, tmp.msgp, second, tmp.msgtyp, third); - goto out; - } - case 1: default: - err = sys_msgrcv (first, - (struct msgbuf __user *) ptr, - second, fifth, third); - goto out; - } - case MSGGET: - err = sys_msgget ((key_t) first, second); - goto out; - case MSGCTL: - err = sys_msgctl (first, second, (struct msqid_ds __user *) ptr); - goto out; - default: - err = -ENOSYS; - goto out; - } - if (call <= SHMCTL) - switch (call) { - case SHMAT: - switch (version) { - case 0: default: { - ulong raddr; - err = do_shmat (first, (char __user *) ptr, second, &raddr); - if (err) - goto out; - err = -EFAULT; - if (put_user (raddr, (ulong __user *) third)) - goto out; - err = 0; - goto out; - } - case 1: /* iBCS2 emulator entry point */ - err = -EINVAL; - goto out; - } - case SHMDT: - err = sys_shmdt ((char __user *)ptr); - goto out; - case SHMGET: - err = sys_shmget (first, second, third); - goto out; - case SHMCTL: - err = sys_shmctl (first, second, (struct shmid_ds __user *) ptr); - goto out; - default: - err = -ENOSYS; - goto out; - } - else - err = -ENOSYS; -out: - return err; -} - int sparc_mmap_check(unsigned long addr, unsigned long len) { if (ARCH_SUN4C && diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index cb1bef6f14b7..45410e939628 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -426,7 +426,7 @@ out: * This is really horribly ugly. */ -SYSCALL_DEFINE6(ipc, unsigned int, call, int, first, unsigned long, second, +SYSCALL_DEFINE6(sparc_ipc, unsigned int, call, int, first, unsigned long, second, unsigned long, third, void __user *, ptr, long, fifth) { long err; diff --git a/arch/sparc/kernel/systbls.h b/arch/sparc/kernel/systbls.h index 68312fe8da74..2c331c37e748 100644 --- a/arch/sparc/kernel/systbls.h +++ b/arch/sparc/kernel/systbls.h @@ -10,7 +10,7 @@ struct new_utsname; extern asmlinkage unsigned long sys_getpagesize(void); extern asmlinkage long sparc_pipe(struct pt_regs *regs); -extern asmlinkage long sys_ipc(unsigned int call, int first, +extern asmlinkage long sys_sparc_ipc(unsigned int call, int first, unsigned long second, unsigned long third, void __user *ptr, long fifth); diff --git a/arch/sparc/kernel/systbls_64.S b/arch/sparc/kernel/systbls_64.S index 17614251fb6d..30ca2b1d3a17 100644 --- a/arch/sparc/kernel/systbls_64.S +++ b/arch/sparc/kernel/systbls_64.S @@ -136,7 +136,7 @@ sys_call_table: /*200*/ .word sys_ssetmask, sys_nis_syscall, sys_newlstat, sys_uselib, sys_nis_syscall .word sys_readahead, sys_socketcall, sys_syslog, sys_lookup_dcookie, sys_fadvise64 /*210*/ .word sys_fadvise64_64, sys_tgkill, sys_waitpid, sys_swapoff, sys_sysinfo - .word sys_ipc, sys_nis_syscall, sys_clone, sys_ioprio_get, sys_adjtimex + .word sys_sparc_ipc, sys_nis_syscall, sys_clone, sys_ioprio_get, sys_adjtimex /*220*/ .word sys_nis_syscall, sys_ni_syscall, sys_delete_module, sys_ni_syscall, sys_getpgid .word sys_bdflush, sys_sysfs, sys_nis_syscall, sys_setfsuid, sys_setfsgid /*230*/ .word sys_select, sys_nis_syscall, sys_splice, sys_stime, sys_statfs64 diff --git a/arch/um/sys-i386/syscalls.c b/arch/um/sys-i386/syscalls.c index d0aa8f125ee6..70ca357393b8 100644 --- a/arch/um/sys-i386/syscalls.c +++ b/arch/um/sys-i386/syscalls.c @@ -34,92 +34,6 @@ long sys_clone(unsigned long clone_flags, unsigned long newsp, return ret; } -/* - * sys_ipc() is the de-multiplexer for the SysV IPC calls.. - * - * This is really horribly ugly. - */ -long sys_ipc (uint call, int first, int second, - int third, void __user *ptr, long fifth) -{ - int version, ret; - - version = call >> 16; /* hack for backward compatibility */ - call &= 0xffff; - - switch (call) { - case SEMOP: - return sys_semtimedop(first, (struct sembuf __user *) ptr, - second, NULL); - case SEMTIMEDOP: - return sys_semtimedop(first, (struct sembuf __user *) ptr, - second, - (const struct timespec __user *) fifth); - case SEMGET: - return sys_semget (first, second, third); - case SEMCTL: { - union semun fourth; - if (!ptr) - return -EINVAL; - if (get_user(fourth.__pad, (void __user * __user *) ptr)) - return -EFAULT; - return sys_semctl (first, second, third, fourth); - } - - case MSGSND: - return sys_msgsnd (first, (struct msgbuf *) ptr, - second, third); - case MSGRCV: - switch (version) { - case 0: { - struct ipc_kludge tmp; - if (!ptr) - return -EINVAL; - - if (copy_from_user(&tmp, - (struct ipc_kludge *) ptr, - sizeof (tmp))) - return -EFAULT; - return sys_msgrcv (first, tmp.msgp, second, - tmp.msgtyp, third); - } - default: - panic("msgrcv with version != 0"); - return sys_msgrcv (first, - (struct msgbuf *) ptr, - second, fifth, third); - } - case MSGGET: - return sys_msgget ((key_t) first, second); - case MSGCTL: - return sys_msgctl (first, second, (struct msqid_ds *) ptr); - - case SHMAT: - switch (version) { - default: { - ulong raddr; - ret = do_shmat (first, (char *) ptr, second, &raddr); - if (ret) - return ret; - return put_user (raddr, (ulong *) third); - } - case 1: /* iBCS2 emulator entry point */ - if (!segment_eq(get_fs(), get_ds())) - return -EINVAL; - return do_shmat (first, (char *) ptr, second, (ulong *) third); - } - case SHMDT: - return sys_shmdt ((char *)ptr); - case SHMGET: - return sys_shmget (first, second, third); - case SHMCTL: - return sys_shmctl (first, second, - (struct shmid_ds *) ptr); - default: - return -ENOSYS; - } -} - long sys_sigaction(int sig, const struct old_sigaction __user *act, struct old_sigaction __user *oact) { diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 86ab6a0623fd..50f6a569f0d1 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -54,7 +54,6 @@ unsigned long sys_sigreturn(struct pt_regs *); struct oldold_utsname; struct old_utsname; -asmlinkage int sys_ipc(uint, int, int, int, void __user *, long); asmlinkage int sys_uname(struct old_utsname __user *); asmlinkage int sys_olduname(struct oldold_utsname __user *); diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index daa65d9aae95..45e64a17b86e 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -354,6 +354,7 @@ #define __ARCH_WANT_STAT64 #define __ARCH_WANT_SYS_ALARM #define __ARCH_WANT_SYS_GETHOSTNAME +#define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE #define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_SIGNAL diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c index 7955e90c8341..8b5c348fdcf2 100644 --- a/arch/x86/kernel/sys_i386_32.c +++ b/arch/x86/kernel/sys_i386_32.c @@ -24,91 +24,6 @@ #include -/* - * sys_ipc() is the de-multiplexer for the SysV IPC calls.. - * - * This is really horribly ugly. - */ -asmlinkage int sys_ipc(uint call, int first, int second, - int third, void __user *ptr, long fifth) -{ - int version, ret; - - version = call >> 16; /* hack for backward compatibility */ - call &= 0xffff; - - switch (call) { - case SEMOP: - return sys_semtimedop(first, (struct sembuf __user *)ptr, second, NULL); - case SEMTIMEDOP: - return sys_semtimedop(first, (struct sembuf __user *)ptr, second, - (const struct timespec __user *)fifth); - - case SEMGET: - return sys_semget(first, second, third); - case SEMCTL: { - union semun fourth; - if (!ptr) - return -EINVAL; - if (get_user(fourth.__pad, (void __user * __user *) ptr)) - return -EFAULT; - return sys_semctl(first, second, third, fourth); - } - - case MSGSND: - return sys_msgsnd(first, (struct msgbuf __user *) ptr, - second, third); - case MSGRCV: - switch (version) { - case 0: { - struct ipc_kludge tmp; - if (!ptr) - return -EINVAL; - - if (copy_from_user(&tmp, - (struct ipc_kludge __user *) ptr, - sizeof(tmp))) - return -EFAULT; - return sys_msgrcv(first, tmp.msgp, second, - tmp.msgtyp, third); - } - default: - return sys_msgrcv(first, - (struct msgbuf __user *) ptr, - second, fifth, third); - } - case MSGGET: - return sys_msgget((key_t) first, second); - case MSGCTL: - return sys_msgctl(first, second, (struct msqid_ds __user *) ptr); - - case SHMAT: - switch (version) { - default: { - ulong raddr; - ret = do_shmat(first, (char __user *) ptr, second, &raddr); - if (ret) - return ret; - return put_user(raddr, (ulong __user *) third); - } - case 1: /* iBCS2 emulator entry point */ - if (!segment_eq(get_fs(), get_ds())) - return -EINVAL; - /* The "(ulong *) third" is valid _only_ because of the kernel segment thing */ - return do_shmat(first, (char __user *) ptr, second, (ulong *) third); - } - case SHMDT: - return sys_shmdt((char __user *)ptr); - case SHMGET: - return sys_shmget(first, second, third); - case SHMCTL: - return sys_shmctl(first, second, - (struct shmid_ds __user *) ptr); - default: - return -ENOSYS; - } -} - /* * Old cruft */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index b60907e3b0d5..fbb61ae70e06 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -684,6 +684,8 @@ asmlinkage long sys_shmat(int shmid, char __user *shmaddr, int shmflg); asmlinkage long sys_shmget(key_t key, size_t size, int flag); asmlinkage long sys_shmdt(char __user *shmaddr); asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf); +asmlinkage long sys_ipc(unsigned int call, int first, int second, + unsigned long third, void __user *ptr, long fifth); asmlinkage long sys_mq_open(const char __user *name, int oflag, mode_t mode, struct mq_attr __user *attr); asmlinkage long sys_mq_unlink(const char __user *name); diff --git a/ipc/Makefile b/ipc/Makefile index 4e1955ea815d..9075e172e52c 100644 --- a/ipc/Makefile +++ b/ipc/Makefile @@ -3,7 +3,7 @@ # obj-$(CONFIG_SYSVIPC_COMPAT) += compat.o -obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o ipcns_notifier.o +obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o ipcns_notifier.o syscall.o obj-$(CONFIG_SYSVIPC_SYSCTL) += ipc_sysctl.o obj_mq-$(CONFIG_COMPAT) += compat_mq.o obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o $(obj_mq-y) diff --git a/ipc/syscall.c b/ipc/syscall.c new file mode 100644 index 000000000000..355a3da9ec73 --- /dev/null +++ b/ipc/syscall.c @@ -0,0 +1,99 @@ +/* + * sys_ipc() is the old de-multiplexer for the SysV IPC calls. + * + * This is really horribly ugly, and new architectures should just wire up + * the individual syscalls instead. + */ +#include + +#ifdef __ARCH_WANT_SYS_IPC +#include +#include +#include +#include +#include + +SYSCALL_DEFINE6(ipc, unsigned int, call, int, first, int, second, + unsigned long, third, void __user *, ptr, long, fifth) +{ + int version, ret; + + version = call >> 16; /* hack for backward compatibility */ + call &= 0xffff; + + switch (call) { + case SEMOP: + return sys_semtimedop(first, (struct sembuf __user *)ptr, + second, NULL); + case SEMTIMEDOP: + return sys_semtimedop(first, (struct sembuf __user *)ptr, + second, + (const struct timespec __user *)fifth); + + case SEMGET: + return sys_semget(first, second, third); + case SEMCTL: { + union semun fourth; + if (!ptr) + return -EINVAL; + if (get_user(fourth.__pad, (void __user * __user *) ptr)) + return -EFAULT; + return sys_semctl(first, second, third, fourth); + } + + case MSGSND: + return sys_msgsnd(first, (struct msgbuf __user *) ptr, + second, third); + case MSGRCV: + switch (version) { + case 0: { + struct ipc_kludge tmp; + if (!ptr) + return -EINVAL; + + if (copy_from_user(&tmp, + (struct ipc_kludge __user *) ptr, + sizeof(tmp))) + return -EFAULT; + return sys_msgrcv(first, tmp.msgp, second, + tmp.msgtyp, third); + } + default: + return sys_msgrcv(first, + (struct msgbuf __user *) ptr, + second, fifth, third); + } + case MSGGET: + return sys_msgget((key_t) first, second); + case MSGCTL: + return sys_msgctl(first, second, (struct msqid_ds __user *)ptr); + + case SHMAT: + switch (version) { + default: { + unsigned long raddr; + ret = do_shmat(first, (char __user *)ptr, + second, &raddr); + if (ret) + return ret; + return put_user(raddr, (unsigned long __user *) third); + } + case 1: + /* + * This was the entry point for kernel-originating calls + * from iBCS2 in 2.2 days. + */ + return -EINVAL; + } + case SHMDT: + return sys_shmdt((char __user *)ptr); + case SHMGET: + return sys_shmget(first, second, third); + case SHMCTL: + return sys_shmctl(first, second, + (struct shmid_ds __user *) ptr); + default: + return -ENOSYS; + } +} +#endif diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 695384f12a7d..70f2ea758ffe 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -126,6 +126,7 @@ cond_syscall(sys_setreuid16); cond_syscall(sys_setuid16); cond_syscall(sys_vm86old); cond_syscall(sys_vm86); +cond_syscall(sys_ipc); cond_syscall(compat_sys_ipc); cond_syscall(compat_sys_sysctl); cond_syscall(sys_flock); -- cgit v1.2.3 From e28cbf22933d0c0ccaf3c4c27a1a263b41f73859 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 10 Mar 2010 15:21:19 -0800 Subject: improve sys_newuname() for compat architectures On an architecture that supports 32-bit compat we need to override the reported machine in uname with the 32-bit value. Instead of doing this separately in every architecture introduce a COMPAT_UTS_MACHINE define in and apply it directly in sys_newuname(). Signed-off-by: Christoph Hellwig Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mundt Cc: Jeff Dike Cc: Hirokazu Takata Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Al Viro Cc: Arnd Bergmann Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: "Luck, Tony" Cc: James Morris Cc: Andreas Schwab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/include/asm/compat.h | 3 ++- arch/mips/include/asm/compat.h | 3 ++- arch/mips/kernel/linux32.c | 16 ---------------- arch/mips/kernel/scall64-n32.S | 2 +- arch/mips/kernel/scall64-o32.S | 2 +- arch/parisc/include/asm/compat.h | 3 ++- arch/parisc/kernel/sys_parisc.c | 15 --------------- arch/parisc/kernel/syscall_table.S | 2 +- arch/powerpc/include/asm/compat.h | 3 ++- arch/powerpc/include/asm/syscalls.h | 2 -- arch/powerpc/include/asm/systbl.h | 2 +- arch/powerpc/kernel/syscalls.c | 13 ------------- arch/s390/include/asm/compat.h | 3 ++- arch/s390/kernel/compat_wrapper.S | 2 +- arch/s390/kernel/entry.h | 2 -- arch/s390/kernel/sys_s390.c | 11 ----------- arch/s390/kernel/syscalls.S | 2 +- arch/sparc/include/asm/compat.h | 3 ++- arch/sparc/kernel/sys_sparc_64.c | 11 ----------- arch/sparc/kernel/systbls.h | 3 --- arch/sparc/kernel/systbls_64.S | 4 ++-- arch/um/sys-x86_64/syscall_table.c | 5 ----- arch/um/sys-x86_64/syscalls.c | 14 -------------- arch/x86/include/asm/compat.h | 3 ++- arch/x86/include/asm/syscalls.h | 3 --- arch/x86/include/asm/unistd_64.h | 2 +- arch/x86/kernel/sys_x86_64.c | 12 ------------ kernel/sys.c | 13 +++++++++++++ 28 files changed, 36 insertions(+), 123 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/include/asm/compat.h b/arch/ia64/include/asm/compat.h index dfcf75b8426d..f90edc85b509 100644 --- a/arch/ia64/include/asm/compat.h +++ b/arch/ia64/include/asm/compat.h @@ -5,7 +5,8 @@ */ #include -#define COMPAT_USER_HZ 100 +#define COMPAT_USER_HZ 100 +#define COMPAT_UTS_MACHINE "i686\0\0\0" typedef u32 compat_size_t; typedef s32 compat_ssize_t; diff --git a/arch/mips/include/asm/compat.h b/arch/mips/include/asm/compat.h index f58aed354bfd..613f6912dfc1 100644 --- a/arch/mips/include/asm/compat.h +++ b/arch/mips/include/asm/compat.h @@ -8,7 +8,8 @@ #include #include -#define COMPAT_USER_HZ 100 +#define COMPAT_USER_HZ 100 +#define COMPAT_UTS_MACHINE "mips\0\0\0" typedef u32 compat_size_t; typedef s32 compat_ssize_t; diff --git a/arch/mips/kernel/linux32.c b/arch/mips/kernel/linux32.c index bde79ef602e6..a39d0597a375 100644 --- a/arch/mips/kernel/linux32.c +++ b/arch/mips/kernel/linux32.c @@ -249,22 +249,6 @@ SYSCALL_DEFINE5(n32_msgrcv, int, msqid, u32, msgp, size_t, msgsz, } #endif -SYSCALL_DEFINE1(32_newuname, struct new_utsname __user *, name) -{ - int ret = 0; - - down_read(&uts_sem); - if (copy_to_user(name, utsname(), sizeof *name)) - ret = -EFAULT; - up_read(&uts_sem); - - if (current->personality == PER_LINUX32 && !ret) - if (copy_to_user(name->machine, "mips\0\0\0", 8)) - ret = -EFAULT; - - return ret; -} - SYSCALL_DEFINE1(32_personality, unsigned long, personality) { int ret; diff --git a/arch/mips/kernel/scall64-n32.S b/arch/mips/kernel/scall64-n32.S index 66b5a48676dd..44337ba03717 100644 --- a/arch/mips/kernel/scall64-n32.S +++ b/arch/mips/kernel/scall64-n32.S @@ -181,7 +181,7 @@ EXPORT(sysn32_call_table) PTR sys_exit PTR compat_sys_wait4 PTR sys_kill /* 6060 */ - PTR sys_32_newuname + PTR sys_newuname PTR sys_semget PTR sys_semop PTR sys_n32_semctl diff --git a/arch/mips/kernel/scall64-o32.S b/arch/mips/kernel/scall64-o32.S index 515f9eab2b28..813689ef2384 100644 --- a/arch/mips/kernel/scall64-o32.S +++ b/arch/mips/kernel/scall64-o32.S @@ -325,7 +325,7 @@ sys_call_table: PTR sys32_sigreturn PTR sys32_clone /* 4120 */ PTR sys_setdomainname - PTR sys_32_newuname + PTR sys_newuname PTR sys_ni_syscall /* sys_modify_ldt */ PTR compat_sys_adjtimex PTR sys_mprotect /* 4125 */ diff --git a/arch/parisc/include/asm/compat.h b/arch/parisc/include/asm/compat.h index 7f32611a7a5e..02b77baa5da6 100644 --- a/arch/parisc/include/asm/compat.h +++ b/arch/parisc/include/asm/compat.h @@ -7,7 +7,8 @@ #include #include -#define COMPAT_USER_HZ 100 +#define COMPAT_USER_HZ 100 +#define COMPAT_UTS_MACHINE "parisc\0\0" typedef u32 compat_size_t; typedef s32 compat_ssize_t; diff --git a/arch/parisc/kernel/sys_parisc.c b/arch/parisc/kernel/sys_parisc.c index 9147391afb03..c9b932260f47 100644 --- a/arch/parisc/kernel/sys_parisc.c +++ b/arch/parisc/kernel/sys_parisc.c @@ -234,18 +234,3 @@ long parisc_personality(unsigned long personality) return err; } - -long parisc_newuname(struct new_utsname __user *name) -{ - int err = sys_newuname(name); - -#ifdef CONFIG_COMPAT - if (!err && personality(current->personality) == PER_LINUX32) { - if (__put_user(0, name->machine + 6) || - __put_user(0, name->machine + 7)) - err = -EFAULT; - } -#endif - - return err; -} diff --git a/arch/parisc/kernel/syscall_table.S b/arch/parisc/kernel/syscall_table.S index de5f6dab48b7..3d52c978738f 100644 --- a/arch/parisc/kernel/syscall_table.S +++ b/arch/parisc/kernel/syscall_table.S @@ -127,7 +127,7 @@ ENTRY_SAME(socketpair) ENTRY_SAME(setpgid) ENTRY_SAME(send) - ENTRY_OURS(newuname) + ENTRY_SAME(newuname) ENTRY_SAME(umask) /* 60 */ ENTRY_SAME(chroot) ENTRY_COMP(ustat) diff --git a/arch/powerpc/include/asm/compat.h b/arch/powerpc/include/asm/compat.h index 4774c2f92232..396d21a80058 100644 --- a/arch/powerpc/include/asm/compat.h +++ b/arch/powerpc/include/asm/compat.h @@ -7,7 +7,8 @@ #include #include -#define COMPAT_USER_HZ 100 +#define COMPAT_USER_HZ 100 +#define COMPAT_UTS_MACHINE "ppc\0\0" typedef u32 compat_size_t; typedef s32 compat_ssize_t; diff --git a/arch/powerpc/include/asm/syscalls.h b/arch/powerpc/include/asm/syscalls.h index 23bb74e7f946..4084e567d28e 100644 --- a/arch/powerpc/include/asm/syscalls.h +++ b/arch/powerpc/include/asm/syscalls.h @@ -7,7 +7,6 @@ #include #include -struct new_utsname; struct pt_regs; struct rtas_args; struct sigaction; @@ -38,7 +37,6 @@ asmlinkage long sys_rt_sigaction(int sig, asmlinkage long ppc64_personality(unsigned long personality); asmlinkage int ppc_rtas(struct rtas_args __user *uargs); asmlinkage time_t sys64_time(time_t __user * tloc); -asmlinkage long ppc_newuname(struct new_utsname __user * name); asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize); diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h index 07d2d19ab5e9..a5ee345b6a5c 100644 --- a/arch/powerpc/include/asm/systbl.h +++ b/arch/powerpc/include/asm/systbl.h @@ -125,7 +125,7 @@ SYSCALL_SPU(fsync) SYS32ONLY(sigreturn) PPC_SYS(clone) COMPAT_SYS_SPU(setdomainname) -PPC_SYS_SPU(newuname) +SYSCALL_SPU(newuname) SYSCALL(ni_syscall) COMPAT_SYS_SPU(adjtimex) SYSCALL_SPU(mprotect) diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c index 5251221e7a5a..69d3c5d50a54 100644 --- a/arch/powerpc/kernel/syscalls.c +++ b/arch/powerpc/kernel/syscalls.c @@ -132,19 +132,6 @@ static inline int override_machine(char __user *mach) return 0; } -long ppc_newuname(struct new_utsname __user * name) -{ - int err = 0; - - down_read(&uts_sem); - if (copy_to_user(name, utsname(), sizeof(*name))) - err = -EFAULT; - up_read(&uts_sem); - if (!err) - err = override_machine(name->machine); - return err; -} - int sys_uname(struct old_utsname __user *name) { int err = 0; diff --git a/arch/s390/include/asm/compat.h b/arch/s390/include/asm/compat.h index 01a08020bc0e..104f2007f097 100644 --- a/arch/s390/include/asm/compat.h +++ b/arch/s390/include/asm/compat.h @@ -35,7 +35,8 @@ extern long psw32_user_bits; -#define COMPAT_USER_HZ 100 +#define COMPAT_USER_HZ 100 +#define COMPAT_UTS_MACHINE "s390\0\0\0\0" typedef u32 compat_size_t; typedef s32 compat_ssize_t; diff --git a/arch/s390/kernel/compat_wrapper.S b/arch/s390/kernel/compat_wrapper.S index 30de2d0e52bb..672ce52341b4 100644 --- a/arch/s390/kernel/compat_wrapper.S +++ b/arch/s390/kernel/compat_wrapper.S @@ -547,7 +547,7 @@ sys32_setdomainname_wrapper: .globl sys32_newuname_wrapper sys32_newuname_wrapper: llgtr %r2,%r2 # struct new_utsname * - jg sys_s390_newuname # branch to system call + jg sys_newuname # branch to system call .globl compat_sys_adjtimex_wrapper compat_sys_adjtimex_wrapper: diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h index 15fd68b196c0..eb15c12ec158 100644 --- a/arch/s390/kernel/entry.h +++ b/arch/s390/kernel/entry.h @@ -24,7 +24,6 @@ int __cpuinit start_secondary(void *cpuvoid); void __init startup_init(void); void die(const char * str, struct pt_regs * regs, long err); -struct new_utsname; struct s390_mmap_arg_struct; struct fadvise64_64_args; struct old_sigaction; @@ -32,7 +31,6 @@ struct old_sigaction; long sys_mmap2(struct s390_mmap_arg_struct __user *arg); long sys_s390_ipc(uint call, int first, unsigned long second, unsigned long third, void __user *ptr); -long sys_s390_newuname(struct new_utsname __user *name); long sys_s390_personality(unsigned long personality); long sys_s390_fadvise64(int fd, u32 offset_high, u32 offset_low, size_t len, int advice); diff --git a/arch/s390/kernel/sys_s390.c b/arch/s390/kernel/sys_s390.c index b8b78092ab7c..7b6b0f81a283 100644 --- a/arch/s390/kernel/sys_s390.c +++ b/arch/s390/kernel/sys_s390.c @@ -131,17 +131,6 @@ SYSCALL_DEFINE5(s390_ipc, uint, call, int, first, unsigned long, second, } #ifdef CONFIG_64BIT -SYSCALL_DEFINE1(s390_newuname, struct new_utsname __user *, name) -{ - int ret = sys_newuname(name); - - if (personality(current->personality) == PER_LINUX32 && !ret) { - ret = copy_to_user(name->machine, "s390\0\0\0\0", 8); - if (ret) ret = -EFAULT; - } - return ret; -} - SYSCALL_DEFINE1(s390_personality, unsigned long, personality) { int ret; diff --git a/arch/s390/kernel/syscalls.S b/arch/s390/kernel/syscalls.S index 990ac8b321c8..201ce6bed34e 100644 --- a/arch/s390/kernel/syscalls.S +++ b/arch/s390/kernel/syscalls.S @@ -130,7 +130,7 @@ SYSCALL(sys_fsync,sys_fsync,sys32_fsync_wrapper) SYSCALL(sys_sigreturn,sys_sigreturn,sys32_sigreturn) SYSCALL(sys_clone,sys_clone,sys_clone_wrapper) /* 120 */ SYSCALL(sys_setdomainname,sys_setdomainname,sys32_setdomainname_wrapper) -SYSCALL(sys_newuname,sys_s390_newuname,sys32_newuname_wrapper) +SYSCALL(sys_newuname,sys_newuname,sys32_newuname_wrapper) NI_SYSCALL /* modify_ldt for i386 */ SYSCALL(sys_adjtimex,sys_adjtimex,compat_sys_adjtimex_wrapper) SYSCALL(sys_mprotect,sys_mprotect,sys32_mprotect_wrapper) /* 125 */ diff --git a/arch/sparc/include/asm/compat.h b/arch/sparc/include/asm/compat.h index 0e706257918f..5016f76ea98a 100644 --- a/arch/sparc/include/asm/compat.h +++ b/arch/sparc/include/asm/compat.h @@ -5,7 +5,8 @@ */ #include -#define COMPAT_USER_HZ 100 +#define COMPAT_USER_HZ 100 +#define COMPAT_UTS_MACHINE "sparc\0\0" typedef u32 compat_size_t; typedef s32 compat_ssize_t; diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index 45410e939628..3d435c42e6db 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -510,17 +510,6 @@ out: return err; } -SYSCALL_DEFINE1(sparc64_newuname, struct new_utsname __user *, name) -{ - int ret = sys_newuname(name); - - if (current->personality == PER_LINUX32 && !ret) { - ret = (copy_to_user(name->machine, "sparc\0\0", 8) - ? -EFAULT : 0); - } - return ret; -} - SYSCALL_DEFINE1(sparc64_personality, unsigned long, personality) { int ret; diff --git a/arch/sparc/kernel/systbls.h b/arch/sparc/kernel/systbls.h index 2c331c37e748..118759cd7342 100644 --- a/arch/sparc/kernel/systbls.h +++ b/arch/sparc/kernel/systbls.h @@ -6,15 +6,12 @@ #include #include -struct new_utsname; - extern asmlinkage unsigned long sys_getpagesize(void); extern asmlinkage long sparc_pipe(struct pt_regs *regs); extern asmlinkage long sys_sparc_ipc(unsigned int call, int first, unsigned long second, unsigned long third, void __user *ptr, long fifth); -extern asmlinkage long sparc64_newuname(struct new_utsname __user *name); extern asmlinkage long sparc64_personality(unsigned long personality); extern asmlinkage long sys64_munmap(unsigned long addr, size_t len); extern asmlinkage unsigned long sys64_mremap(unsigned long addr, diff --git a/arch/sparc/kernel/systbls_64.S b/arch/sparc/kernel/systbls_64.S index 30ca2b1d3a17..9db058dd039e 100644 --- a/arch/sparc/kernel/systbls_64.S +++ b/arch/sparc/kernel/systbls_64.S @@ -55,7 +55,7 @@ sys_call_table32: /*170*/ .word sys32_lsetxattr, sys32_fsetxattr, sys_getxattr, sys_lgetxattr, compat_sys_getdents .word sys_setsid, sys_fchdir, sys32_fgetxattr, sys_listxattr, sys_llistxattr /*180*/ .word sys32_flistxattr, sys_removexattr, sys_lremovexattr, compat_sys_sigpending, sys_ni_syscall - .word sys32_setpgid, sys32_fremovexattr, sys32_tkill, sys32_exit_group, sys_sparc64_newuname + .word sys32_setpgid, sys32_fremovexattr, sys32_tkill, sys32_exit_group, sys_newuname /*190*/ .word sys32_init_module, sys_sparc64_personality, sys_remap_file_pages, sys32_epoll_create, sys32_epoll_ctl .word sys32_epoll_wait, sys32_ioprio_set, sys_getppid, sys32_sigaction, sys_sgetmask /*200*/ .word sys32_ssetmask, sys_sigsuspend, compat_sys_newlstat, sys_uselib, compat_sys_old_readdir @@ -130,7 +130,7 @@ sys_call_table: /*170*/ .word sys_lsetxattr, sys_fsetxattr, sys_getxattr, sys_lgetxattr, sys_getdents .word sys_setsid, sys_fchdir, sys_fgetxattr, sys_listxattr, sys_llistxattr /*180*/ .word sys_flistxattr, sys_removexattr, sys_lremovexattr, sys_nis_syscall, sys_ni_syscall - .word sys_setpgid, sys_fremovexattr, sys_tkill, sys_exit_group, sys_sparc64_newuname + .word sys_setpgid, sys_fremovexattr, sys_tkill, sys_exit_group, sys_newuname /*190*/ .word sys_init_module, sys_sparc64_personality, sys_remap_file_pages, sys_epoll_create, sys_epoll_ctl .word sys_epoll_wait, sys_ioprio_set, sys_getppid, sys_nis_syscall, sys_sgetmask /*200*/ .word sys_ssetmask, sys_nis_syscall, sys_newlstat, sys_uselib, sys_nis_syscall diff --git a/arch/um/sys-x86_64/syscall_table.c b/arch/um/sys-x86_64/syscall_table.c index dd21d69715e6..47d469e7e7ce 100644 --- a/arch/um/sys-x86_64/syscall_table.c +++ b/arch/um/sys-x86_64/syscall_table.c @@ -26,11 +26,6 @@ /* On UML we call it this way ("old" means it's not mmap2) */ #define sys_mmap old_mmap -/* - * On x86-64 sys_uname is actually sys_newuname plus a compatibility trick. - * See arch/x86_64/kernel/sys_x86_64.c - */ -#define sys_uname sys_uname64 #define stub_clone sys_clone #define stub_fork sys_fork diff --git a/arch/um/sys-x86_64/syscalls.c b/arch/um/sys-x86_64/syscalls.c index f1199fd34d38..f3d82bb6e15a 100644 --- a/arch/um/sys-x86_64/syscalls.c +++ b/arch/um/sys-x86_64/syscalls.c @@ -12,20 +12,6 @@ #include "asm/uaccess.h" #include "os.h" -asmlinkage long sys_uname64(struct new_utsname __user * name) -{ - int err; - - down_read(&uts_sem); - err = copy_to_user(name, utsname(), sizeof (*name)); - up_read(&uts_sem); - - if (personality(current->personality) == PER_LINUX32) - err |= copy_to_user(&name->machine, "i686", 5); - - return err ? -EFAULT : 0; -} - long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr) { unsigned long *ptr = addr, tmp; diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 9a9c7bdc923d..306160e58b48 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -8,7 +8,8 @@ #include #include -#define COMPAT_USER_HZ 100 +#define COMPAT_USER_HZ 100 +#define COMPAT_UTS_MACHINE "i686\0\0" typedef u32 compat_size_t; typedef s32 compat_ssize_t; diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 50f6a569f0d1..47cd606c3537 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -68,11 +68,8 @@ int sys_vm86(unsigned long, unsigned long, struct pt_regs *); long sys_arch_prctl(int, unsigned long); /* kernel/sys_x86_64.c */ -struct new_utsname; - asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); -asmlinkage long sys_uname(struct new_utsname __user *); #endif /* CONFIG_X86_32 */ #endif /* _ASM_X86_SYSCALLS_H */ diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 4843f7ba754a..83e2d6dc5038 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -146,7 +146,7 @@ __SYSCALL(__NR_wait4, sys_wait4) #define __NR_kill 62 __SYSCALL(__NR_kill, sys_kill) #define __NR_uname 63 -__SYSCALL(__NR_uname, sys_uname) +__SYSCALL(__NR_uname, sys_newuname) #define __NR_semget 64 __SYSCALL(__NR_semget, sys_semget) diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 8aa2057efd12..ff14a5044ce6 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -209,15 +209,3 @@ bottomup: return addr; } - - -SYSCALL_DEFINE1(uname, struct new_utsname __user *, name) -{ - int err; - down_read(&uts_sem); - err = copy_to_user(name, utsname(), sizeof(*name)); - up_read(&uts_sem); - if (personality(current->personality) == PER_LINUX32) - err |= copy_to_user(&name->machine, "i686", 5); - return err ? -EFAULT : 0; -} diff --git a/kernel/sys.c b/kernel/sys.c index 9814e43fb23b..e483eb5530e4 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -1114,6 +1115,15 @@ out: DECLARE_RWSEM(uts_sem); +#ifdef COMPAT_UTS_MACHINE +#define override_architecture(name) \ + (current->personality == PER_LINUX32 && \ + copy_to_user(name->machine, COMPAT_UTS_MACHINE, \ + sizeof(COMPAT_UTS_MACHINE))) +#else +#define override_architecture(name) 0 +#endif + SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) { int errno = 0; @@ -1122,6 +1132,9 @@ SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) if (copy_to_user(name, utsname(), sizeof *name)) errno = -EFAULT; up_read(&uts_sem); + + if (!errno && override_architecture(name)) + errno = -EFAULT; return errno; } -- cgit v1.2.3 From 5cacdb4add1b1e50fe75edc50ebbb7bddd9cf5e7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 10 Mar 2010 15:21:21 -0800 Subject: Add generic sys_olduname() Add generic implementations of the old and really old uname system calls. Note that sh only implements sys_olduname but not sys_oldolduname, but I'm not going to bother with another ifdef for that special case. m32r implemented an old uname but never wired it up, so kill it, too. Signed-off-by: Christoph Hellwig Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mundt Cc: Jeff Dike Cc: Hirokazu Takata Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Al Viro Cc: Arnd Bergmann Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: "Luck, Tony" Cc: James Morris Cc: Andreas Schwab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/m32r/kernel/sys_m32r.c | 11 -------- arch/mips/include/asm/unistd.h | 1 + arch/mips/kernel/syscall.c | 42 ----------------------------- arch/powerpc/include/asm/unistd.h | 1 + arch/powerpc/kernel/syscalls.c | 57 --------------------------------------- arch/sh/include/asm/syscalls.h | 3 --- arch/sh/include/asm/unistd_32.h | 1 + arch/sh/include/asm/unistd_64.h | 1 + arch/sh/kernel/sys_sh.c | 11 -------- arch/um/kernel/syscall.c | 45 ------------------------------- arch/x86/ia32/ia32entry.S | 4 +-- arch/x86/ia32/sys_ia32.c | 52 ----------------------------------- arch/x86/include/asm/sys_ia32.h | 5 ---- arch/x86/include/asm/syscalls.h | 7 ----- arch/x86/include/asm/unistd_32.h | 1 + arch/x86/include/asm/unistd_64.h | 1 + arch/x86/kernel/sys_i386_32.c | 49 --------------------------------- include/linux/syscalls.h | 4 +++ kernel/sys.c | 54 +++++++++++++++++++++++++++++++++++++ 19 files changed, 66 insertions(+), 284 deletions(-) (limited to 'kernel') diff --git a/arch/m32r/kernel/sys_m32r.c b/arch/m32r/kernel/sys_m32r.c index cf2e7279ce9b..0a00f467edfa 100644 --- a/arch/m32r/kernel/sys_m32r.c +++ b/arch/m32r/kernel/sys_m32r.c @@ -76,17 +76,6 @@ asmlinkage int sys_tas(int __user *addr) return oldval; } -asmlinkage int sys_uname(struct old_utsname __user * name) -{ - int err; - if (!name) - return -EFAULT; - down_read(&uts_sem); - err = copy_to_user(name, utsname(), sizeof (*name)); - up_read(&uts_sem); - return err?-EFAULT:0; -} - asmlinkage int sys_cacheflush(void *addr, int bytes, int cache) { /* This should flush more selectively ... */ diff --git a/arch/mips/include/asm/unistd.h b/arch/mips/include/asm/unistd.h index 97fe472095f2..1b5a6648eb86 100644 --- a/arch/mips/include/asm/unistd.h +++ b/arch/mips/include/asm/unistd.h @@ -1014,6 +1014,7 @@ #define __ARCH_WANT_SYS_LLSEEK #define __ARCH_WANT_SYS_NICE #define __ARCH_WANT_SYS_OLD_GETRLIMIT +#define __ARCH_WANT_SYS_OLD_UNAME #define __ARCH_WANT_SYS_OLDUMOUNT #define __ARCH_WANT_SYS_SIGPENDING #define __ARCH_WANT_SYS_SIGPROCMASK diff --git a/arch/mips/kernel/syscall.c b/arch/mips/kernel/syscall.c index 257bf0141775..e96b1c30c7aa 100644 --- a/arch/mips/kernel/syscall.c +++ b/arch/mips/kernel/syscall.c @@ -215,48 +215,6 @@ out: return error; } -/* - * Compacrapability ... - */ -SYSCALL_DEFINE1(uname, struct old_utsname __user *, name) -{ - if (name && !copy_to_user(name, utsname(), sizeof (*name))) - return 0; - return -EFAULT; -} - -/* - * Compacrapability ... - */ -SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name) -{ - int error; - - if (!name) - return -EFAULT; - if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname))) - return -EFAULT; - - error = __copy_to_user(&name->sysname, &utsname()->sysname, - __OLD_UTS_LEN); - error -= __put_user(0, name->sysname + __OLD_UTS_LEN); - error -= __copy_to_user(&name->nodename, &utsname()->nodename, - __OLD_UTS_LEN); - error -= __put_user(0, name->nodename + __OLD_UTS_LEN); - error -= __copy_to_user(&name->release, &utsname()->release, - __OLD_UTS_LEN); - error -= __put_user(0, name->release + __OLD_UTS_LEN); - error -= __copy_to_user(&name->version, &utsname()->version, - __OLD_UTS_LEN); - error -= __put_user(0, name->version + __OLD_UTS_LEN); - error -= __copy_to_user(&name->machine, &utsname()->machine, - __OLD_UTS_LEN); - error = __put_user(0, name->machine + __OLD_UTS_LEN); - error = error ? -EFAULT : 0; - - return error; -} - SYSCALL_DEFINE1(set_thread_area, unsigned long, addr) { struct thread_info *ti = task_thread_info(current); diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index c13821fe8741..f0a10266e7f7 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -377,6 +377,7 @@ #define __ARCH_WANT_SYS_LLSEEK #define __ARCH_WANT_SYS_NICE #define __ARCH_WANT_SYS_OLD_GETRLIMIT +#define __ARCH_WANT_SYS_OLD_UNAME #define __ARCH_WANT_SYS_OLDUMOUNT #define __ARCH_WANT_SYS_SIGPENDING #define __ARCH_WANT_SYS_SIGPROCMASK diff --git a/arch/powerpc/kernel/syscalls.c b/arch/powerpc/kernel/syscalls.c index 69d3c5d50a54..f2496f2faecc 100644 --- a/arch/powerpc/kernel/syscalls.c +++ b/arch/powerpc/kernel/syscalls.c @@ -116,63 +116,6 @@ long ppc64_personality(unsigned long personality) } #endif -#ifdef CONFIG_PPC64 -#define OVERRIDE_MACHINE (personality(current->personality) == PER_LINUX32) -#else -#define OVERRIDE_MACHINE 0 -#endif - -static inline int override_machine(char __user *mach) -{ - if (OVERRIDE_MACHINE) { - /* change ppc64 to ppc */ - if (__put_user(0, mach+3) || __put_user(0, mach+4)) - return -EFAULT; - } - return 0; -} - -int sys_uname(struct old_utsname __user *name) -{ - int err = 0; - - down_read(&uts_sem); - if (copy_to_user(name, utsname(), sizeof(*name))) - err = -EFAULT; - up_read(&uts_sem); - if (!err) - err = override_machine(name->machine); - return err; -} - -int sys_olduname(struct oldold_utsname __user *name) -{ - int error; - - if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname))) - return -EFAULT; - - down_read(&uts_sem); - error = __copy_to_user(&name->sysname, &utsname()->sysname, - __OLD_UTS_LEN); - error |= __put_user(0, name->sysname + __OLD_UTS_LEN); - error |= __copy_to_user(&name->nodename, &utsname()->nodename, - __OLD_UTS_LEN); - error |= __put_user(0, name->nodename + __OLD_UTS_LEN); - error |= __copy_to_user(&name->release, &utsname()->release, - __OLD_UTS_LEN); - error |= __put_user(0, name->release + __OLD_UTS_LEN); - error |= __copy_to_user(&name->version, &utsname()->version, - __OLD_UTS_LEN); - error |= __put_user(0, name->version + __OLD_UTS_LEN); - error |= __copy_to_user(&name->machine, &utsname()->machine, - __OLD_UTS_LEN); - error |= override_machine(name->machine); - up_read(&uts_sem); - - return error? -EFAULT: 0; -} - long ppc_fadvise64_64(int fd, int advice, u32 offset_high, u32 offset_low, u32 len_high, u32 len_low) { diff --git a/arch/sh/include/asm/syscalls.h b/arch/sh/include/asm/syscalls.h index c1ce2862f7be..507725af2e54 100644 --- a/arch/sh/include/asm/syscalls.h +++ b/arch/sh/include/asm/syscalls.h @@ -3,15 +3,12 @@ #ifdef __KERNEL__ -struct old_utsname; - asmlinkage int old_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, int fd, unsigned long off); asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff); -asmlinkage int sys_uname(struct old_utsname __user *name); #ifdef CONFIG_SUPERH32 # include "syscalls_32.h" diff --git a/arch/sh/include/asm/unistd_32.h b/arch/sh/include/asm/unistd_32.h index a48f65e2e429..0e7f0fc8f086 100644 --- a/arch/sh/include/asm/unistd_32.h +++ b/arch/sh/include/asm/unistd_32.h @@ -371,6 +371,7 @@ #define __ARCH_WANT_SYS_LLSEEK #define __ARCH_WANT_SYS_NICE #define __ARCH_WANT_SYS_OLD_GETRLIMIT +#define __ARCH_WANT_SYS_OLD_UNAME #define __ARCH_WANT_SYS_OLDUMOUNT #define __ARCH_WANT_SYS_SIGPENDING #define __ARCH_WANT_SYS_SIGPROCMASK diff --git a/arch/sh/include/asm/unistd_64.h b/arch/sh/include/asm/unistd_64.h index 7709b2b8f752..0580c33a1e04 100644 --- a/arch/sh/include/asm/unistd_64.h +++ b/arch/sh/include/asm/unistd_64.h @@ -411,6 +411,7 @@ #define __ARCH_WANT_SYS_LLSEEK #define __ARCH_WANT_SYS_NICE #define __ARCH_WANT_SYS_OLD_GETRLIMIT +#define __ARCH_WANT_SYS_OLD_UNAME #define __ARCH_WANT_SYS_OLDUMOUNT #define __ARCH_WANT_SYS_SIGPENDING #define __ARCH_WANT_SYS_SIGPROCMASK diff --git a/arch/sh/kernel/sys_sh.c b/arch/sh/kernel/sys_sh.c index c18cfaa67fdd..81f58371613d 100644 --- a/arch/sh/kernel/sys_sh.c +++ b/arch/sh/kernel/sys_sh.c @@ -93,14 +93,3 @@ asmlinkage int sys_cacheflush(unsigned long addr, unsigned long len, int op) up_read(¤t->mm->mmap_sem); return 0; } - -asmlinkage int sys_uname(struct old_utsname __user *name) -{ - int err; - if (!name) - return -EFAULT; - down_read(&uts_sem); - err = copy_to_user(name, utsname(), sizeof(*name)); - up_read(&uts_sem); - return err?-EFAULT:0; -} diff --git a/arch/um/kernel/syscall.c b/arch/um/kernel/syscall.c index cccab850c27e..4393173923f5 100644 --- a/arch/um/kernel/syscall.c +++ b/arch/um/kernel/syscall.c @@ -51,51 +51,6 @@ long old_mmap(unsigned long addr, unsigned long len, return err; } -long sys_uname(struct old_utsname __user * name) -{ - long err; - if (!name) - return -EFAULT; - down_read(&uts_sem); - err = copy_to_user(name, utsname(), sizeof (*name)); - up_read(&uts_sem); - return err?-EFAULT:0; -} - -long sys_olduname(struct oldold_utsname __user * name) -{ - long error; - - if (!name) - return -EFAULT; - if (!access_ok(VERIFY_WRITE,name,sizeof(struct oldold_utsname))) - return -EFAULT; - - down_read(&uts_sem); - - error = __copy_to_user(&name->sysname, &utsname()->sysname, - __OLD_UTS_LEN); - error |= __put_user(0, name->sysname + __OLD_UTS_LEN); - error |= __copy_to_user(&name->nodename, &utsname()->nodename, - __OLD_UTS_LEN); - error |= __put_user(0, name->nodename + __OLD_UTS_LEN); - error |= __copy_to_user(&name->release, &utsname()->release, - __OLD_UTS_LEN); - error |= __put_user(0, name->release + __OLD_UTS_LEN); - error |= __copy_to_user(&name->version, &utsname()->version, - __OLD_UTS_LEN); - error |= __put_user(0, name->version + __OLD_UTS_LEN); - error |= __copy_to_user(&name->machine, &utsname()->machine, - __OLD_UTS_LEN); - error |= __put_user(0, name->machine + __OLD_UTS_LEN); - - up_read(&uts_sem); - - error = error ? -EFAULT : 0; - - return error; -} - int kernel_execve(const char *filename, char *const argv[], char *const envp[]) { mm_segment_t fs; diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 34f821802c23..59b4556a5b92 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -563,7 +563,7 @@ ia32_sys_call_table: .quad quiet_ni_syscall /* old mpx syscall holder */ .quad sys_setpgid .quad quiet_ni_syscall /* old ulimit syscall holder */ - .quad sys32_olduname + .quad sys_olduname .quad sys_umask /* 60 */ .quad sys_chroot .quad compat_sys_ustat @@ -613,7 +613,7 @@ ia32_sys_call_table: .quad compat_sys_newstat .quad compat_sys_newlstat .quad compat_sys_newfstat - .quad sys32_uname + .quad sys_uname .quad stub32_iopl /* 110 */ .quad sys_vhangup .quad quiet_ni_syscall /* old "idle" system call */ diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 56c99f46e289..74c35431b7d8 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -448,58 +448,6 @@ asmlinkage long sys32_sendfile(int out_fd, int in_fd, return ret; } -asmlinkage long sys32_olduname(struct oldold_utsname __user *name) -{ - char *arch = "x86_64"; - int err; - - if (!name) - return -EFAULT; - if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname))) - return -EFAULT; - - down_read(&uts_sem); - - err = __copy_to_user(&name->sysname, &utsname()->sysname, - __OLD_UTS_LEN); - err |= __put_user(0, name->sysname+__OLD_UTS_LEN); - err |= __copy_to_user(&name->nodename, &utsname()->nodename, - __OLD_UTS_LEN); - err |= __put_user(0, name->nodename+__OLD_UTS_LEN); - err |= __copy_to_user(&name->release, &utsname()->release, - __OLD_UTS_LEN); - err |= __put_user(0, name->release+__OLD_UTS_LEN); - err |= __copy_to_user(&name->version, &utsname()->version, - __OLD_UTS_LEN); - err |= __put_user(0, name->version+__OLD_UTS_LEN); - - if (personality(current->personality) == PER_LINUX32) - arch = "i686"; - - err |= __copy_to_user(&name->machine, arch, strlen(arch) + 1); - - up_read(&uts_sem); - - err = err ? -EFAULT : 0; - - return err; -} - -long sys32_uname(struct old_utsname __user *name) -{ - int err; - - if (!name) - return -EFAULT; - down_read(&uts_sem); - err = copy_to_user(name, utsname(), sizeof(*name)); - up_read(&uts_sem); - if (personality(current->personality) == PER_LINUX32) - err |= copy_to_user(&name->machine, "i686", 5); - - return err ? -EFAULT : 0; -} - asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv, compat_uptr_t __user *envp, struct pt_regs *regs) { diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h index 7d348d803669..3ad421784ae7 100644 --- a/arch/x86/include/asm/sys_ia32.h +++ b/arch/x86/include/asm/sys_ia32.h @@ -54,11 +54,6 @@ asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32); asmlinkage long sys32_personality(unsigned long); asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32); -struct oldold_utsname; -struct old_utsname; -asmlinkage long sys32_olduname(struct oldold_utsname __user *); -long sys32_uname(struct old_utsname __user *); - asmlinkage long sys32_execve(char __user *, compat_uptr_t __user *, compat_uptr_t __user *, struct pt_regs *); asmlinkage long sys32_clone(unsigned int, unsigned int, struct pt_regs *); diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 47cd606c3537..5c044b43e9a7 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -50,13 +50,6 @@ asmlinkage int sys_sigaction(int, const struct old_sigaction __user *, struct old_sigaction __user *); unsigned long sys_sigreturn(struct pt_regs *); -/* kernel/sys_i386_32.c */ -struct oldold_utsname; -struct old_utsname; - -asmlinkage int sys_uname(struct old_utsname __user *); -asmlinkage int sys_olduname(struct oldold_utsname __user *); - /* kernel/vm86_32.c */ int sys_vm86old(struct vm86_struct __user *, struct pt_regs *); int sys_vm86(unsigned long, unsigned long, struct pt_regs *); diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index 45e64a17b86e..beb9b5f8f8a4 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -367,6 +367,7 @@ #define __ARCH_WANT_SYS_LLSEEK #define __ARCH_WANT_SYS_NICE #define __ARCH_WANT_SYS_OLD_GETRLIMIT +#define __ARCH_WANT_SYS_OLD_UNAME #define __ARCH_WANT_SYS_OLD_MMAP #define __ARCH_WANT_SYS_OLD_SELECT #define __ARCH_WANT_SYS_OLDUMOUNT diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 83e2d6dc5038..ff4307b0e81e 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -680,6 +680,7 @@ __SYSCALL(__NR_recvmmsg, sys_recvmmsg) #define __ARCH_WANT_SYS_LLSEEK #define __ARCH_WANT_SYS_NICE #define __ARCH_WANT_SYS_OLD_GETRLIMIT +#define __ARCH_WANT_SYS_OLD_UNAME #define __ARCH_WANT_SYS_OLDUMOUNT #define __ARCH_WANT_SYS_SIGPENDING #define __ARCH_WANT_SYS_SIGPROCMASK diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c index 8b5c348fdcf2..196552bb412c 100644 --- a/arch/x86/kernel/sys_i386_32.c +++ b/arch/x86/kernel/sys_i386_32.c @@ -24,55 +24,6 @@ #include -/* - * Old cruft - */ -asmlinkage int sys_uname(struct old_utsname __user *name) -{ - int err; - if (!name) - return -EFAULT; - down_read(&uts_sem); - err = copy_to_user(name, utsname(), sizeof(*name)); - up_read(&uts_sem); - return err? -EFAULT:0; -} - -asmlinkage int sys_olduname(struct oldold_utsname __user *name) -{ - int error; - - if (!name) - return -EFAULT; - if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname))) - return -EFAULT; - - down_read(&uts_sem); - - error = __copy_to_user(&name->sysname, &utsname()->sysname, - __OLD_UTS_LEN); - error |= __put_user(0, name->sysname + __OLD_UTS_LEN); - error |= __copy_to_user(&name->nodename, &utsname()->nodename, - __OLD_UTS_LEN); - error |= __put_user(0, name->nodename + __OLD_UTS_LEN); - error |= __copy_to_user(&name->release, &utsname()->release, - __OLD_UTS_LEN); - error |= __put_user(0, name->release + __OLD_UTS_LEN); - error |= __copy_to_user(&name->version, &utsname()->version, - __OLD_UTS_LEN); - error |= __put_user(0, name->version + __OLD_UTS_LEN); - error |= __copy_to_user(&name->machine, &utsname()->machine, - __OLD_UTS_LEN); - error |= __put_user(0, name->machine + __OLD_UTS_LEN); - - up_read(&uts_sem); - - error = error ? -EFAULT : 0; - - return error; -} - - /* * Do a system call from kernel instead of calling sys_execve so we * end up with proper pt_regs. diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index fbb61ae70e06..44f2ad0e8825 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -31,6 +31,8 @@ struct msqid_ds; struct new_utsname; struct nfsctl_arg; struct __old_kernel_stat; +struct oldold_utsname; +struct old_utsname; struct pollfd; struct rlimit; struct rusage; @@ -655,6 +657,8 @@ asmlinkage long sys_gethostname(char __user *name, int len); asmlinkage long sys_sethostname(char __user *name, int len); asmlinkage long sys_setdomainname(char __user *name, int len); asmlinkage long sys_newuname(struct new_utsname __user *name); +asmlinkage long sys_uname(struct old_utsname __user *); +asmlinkage long sys_olduname(struct oldold_utsname __user *); asmlinkage long sys_getrlimit(unsigned int resource, struct rlimit __user *rlim); diff --git a/kernel/sys.c b/kernel/sys.c index e483eb5530e4..8298878f4f71 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1138,6 +1138,60 @@ SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) return errno; } +#ifdef __ARCH_WANT_SYS_OLD_UNAME +/* + * Old cruft + */ +SYSCALL_DEFINE1(uname, struct old_utsname __user *, name) +{ + int error = 0; + + if (!name) + return -EFAULT; + + down_read(&uts_sem); + if (copy_to_user(name, utsname(), sizeof(*name))) + error = -EFAULT; + up_read(&uts_sem); + + if (!error && override_architecture(name)) + error = -EFAULT; + return error; +} + +SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name) +{ + int error; + + if (!name) + return -EFAULT; + if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname))) + return -EFAULT; + + down_read(&uts_sem); + error = __copy_to_user(&name->sysname, &utsname()->sysname, + __OLD_UTS_LEN); + error |= __put_user(0, name->sysname + __OLD_UTS_LEN); + error |= __copy_to_user(&name->nodename, &utsname()->nodename, + __OLD_UTS_LEN); + error |= __put_user(0, name->nodename + __OLD_UTS_LEN); + error |= __copy_to_user(&name->release, &utsname()->release, + __OLD_UTS_LEN); + error |= __put_user(0, name->release + __OLD_UTS_LEN); + error |= __copy_to_user(&name->version, &utsname()->version, + __OLD_UTS_LEN); + error |= __put_user(0, name->version + __OLD_UTS_LEN); + error |= __copy_to_user(&name->machine, &utsname()->machine, + __OLD_UTS_LEN); + error |= __put_user(0, name->machine + __OLD_UTS_LEN); + up_read(&uts_sem); + + if (!error && override_architecture(name)) + error = -EFAULT; + return error ? -EFAULT : 0; +} +#endif + SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) { int errno; -- cgit v1.2.3 From 2468c7234b366eeb799ee0648cb58f9cba394a54 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Wed, 10 Mar 2010 15:22:03 -0800 Subject: cgroup: introduce cancel_attach() Add cancel_attach() operation to struct cgroup_subsys. cancel_attach() can be used when can_attach() operation prepares something for the subsys, but we should rollback what can_attach() operation has prepared if attach task fails after we've succeeded in can_attach(). Signed-off-by: Daisuke Nishimura Acked-by: Li Zefan Reviewed-by: Paul Menage Cc: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/cgroups.txt | 13 ++++++++++++- include/linux/cgroup.h | 2 ++ kernel/cgroup.c | 40 ++++++++++++++++++++++++++++++++------- 3 files changed, 47 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index 0b33bfe7dde9..d45082653e3d 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -536,10 +536,21 @@ returns an error, this will abort the attach operation. If a NULL task is passed, then a successful result indicates that *any* unspecified task can be moved into the cgroup. Note that this isn't called on a fork. If this method returns 0 (success) then this should -remain valid while the caller holds cgroup_mutex. If threadgroup is +remain valid while the caller holds cgroup_mutex and it is ensured that either +attach() or cancel_attach() will be called in future. If threadgroup is true, then a successful result indicates that all threads in the given thread's threadgroup can be moved together. +void cancel_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct task_struct *task, bool threadgroup) +(cgroup_mutex held by caller) + +Called when a task attach operation has failed after can_attach() has succeeded. +A subsystem whose can_attach() has some side-effects should provide this +function, so that the subsytem can implement a rollback. If not, not necessary. +This will be called only about subsystems whose can_attach() operation have +succeeded. + void attach(struct cgroup_subsys *ss, struct cgroup *cgrp, struct cgroup *old_cgrp, struct task_struct *task, bool threadgroup) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index c9bbcb2a75ae..d08cfe7e12e5 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -428,6 +428,8 @@ struct cgroup_subsys { void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp); int (*can_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, struct task_struct *tsk, bool threadgroup); + void (*cancel_attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, + struct task_struct *tsk, bool threadgroup); void (*attach)(struct cgroup_subsys *ss, struct cgroup *cgrp, struct cgroup *old_cgrp, struct task_struct *tsk, bool threadgroup); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4fd90e129772..be45d2f6008a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1554,7 +1554,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { int retval = 0; - struct cgroup_subsys *ss; + struct cgroup_subsys *ss, *failed_ss = NULL; struct cgroup *oldcgrp; struct css_set *cg; struct css_set *newcg; @@ -1568,8 +1568,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) for_each_subsys(root, ss) { if (ss->can_attach) { retval = ss->can_attach(ss, cgrp, tsk, false); - if (retval) - return retval; + if (retval) { + /* + * Remember on which subsystem the can_attach() + * failed, so that we only call cancel_attach() + * against the subsystems whose can_attach() + * succeeded. (See below) + */ + failed_ss = ss; + goto out; + } } } @@ -1583,14 +1591,17 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) */ newcg = find_css_set(cg, cgrp); put_css_set(cg); - if (!newcg) - return -ENOMEM; + if (!newcg) { + retval = -ENOMEM; + goto out; + } task_lock(tsk); if (tsk->flags & PF_EXITING) { task_unlock(tsk); put_css_set(newcg); - return -ESRCH; + retval = -ESRCH; + goto out; } rcu_assign_pointer(tsk->cgroups, newcg); task_unlock(tsk); @@ -1616,7 +1627,22 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) * is no longer empty. */ cgroup_wakeup_rmdir_waiter(cgrp); - return 0; +out: + if (retval) { + for_each_subsys(root, ss) { + if (ss == failed_ss) + /* + * This subsystem was the one that failed the + * can_attach() check earlier, so we don't need + * to call cancel_attach() against it or any + * remaining subsystems. + */ + break; + if (ss->cancel_attach) + ss->cancel_attach(ss, cgrp, tsk, false); + } + } + return retval; } /* -- cgit v1.2.3 From d7b9fff711d5e8db8c844161c684017e556c38a0 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Wed, 10 Mar 2010 15:22:05 -0800 Subject: cgroup: introduce coalesce css_get() and css_put() Current css_get() and css_put() increment/decrement css->refcnt one by one. This patch add a new function __css_get(), which takes "count" as a arg and increment the css->refcnt by "count". And this patch also add a new arg("count") to __css_put() and change the function to decrement the css->refcnt by "count". These coalesce version of __css_get()/__css_put() will be used to improve performance of memcg's moving charge feature later, where instead of calling css_get()/css_put() repeatedly, these new functions will be used. No change is needed for current users of css_get()/css_put(). Signed-off-by: Daisuke Nishimura Acked-by: Paul Menage Cc: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Cc: Li Zefan Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 12 +++++++++--- kernel/cgroup.c | 5 +++-- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index d08cfe7e12e5..14160b5b693f 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -76,6 +76,12 @@ enum { CSS_REMOVED, /* This CSS is dead */ }; +/* Caller must verify that the css is not for root cgroup */ +static inline void __css_get(struct cgroup_subsys_state *css, int count) +{ + atomic_add(count, &css->refcnt); +} + /* * Call css_get() to hold a reference on the css; it can be used * for a reference obtained via: @@ -87,7 +93,7 @@ static inline void css_get(struct cgroup_subsys_state *css) { /* We don't need to reference count the root state */ if (!test_bit(CSS_ROOT, &css->flags)) - atomic_inc(&css->refcnt); + __css_get(css, 1); } static inline bool css_is_removed(struct cgroup_subsys_state *css) @@ -118,11 +124,11 @@ static inline bool css_tryget(struct cgroup_subsys_state *css) * css_get() or css_tryget() */ -extern void __css_put(struct cgroup_subsys_state *css); +extern void __css_put(struct cgroup_subsys_state *css, int count); static inline void css_put(struct cgroup_subsys_state *css) { if (!test_bit(CSS_ROOT, &css->flags)) - __css_put(css); + __css_put(css, 1); } /* bits in struct cgroup flags field */ diff --git a/kernel/cgroup.c b/kernel/cgroup.c index be45d2f6008a..cace83ddbcdc 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3746,12 +3746,13 @@ static void check_for_release(struct cgroup *cgrp) } } -void __css_put(struct cgroup_subsys_state *css) +/* Caller must verify that the css is not for root cgroup */ +void __css_put(struct cgroup_subsys_state *css, int count) { struct cgroup *cgrp = css->cgroup; int val; rcu_read_lock(); - val = atomic_dec_return(&css->refcnt); + val = atomic_sub_return(count, &css->refcnt); if (val == 1) { if (notify_on_release(cgrp)) { set_bit(CGRP_RELEASABLE, &cgrp->flags); -- cgit v1.2.3 From aae8aab40367036931608fdaf9e2dc568b516f19 Mon Sep 17 00:00:00 2001 From: Ben Blum Date: Wed, 10 Mar 2010 15:22:07 -0800 Subject: cgroups: revamp subsys array This patch series provides the ability for cgroup subsystems to be compiled as modules both within and outside the kernel tree. This is mainly useful for classifiers and subsystems that hook into components that are already modules. cls_cgroup and blkio-cgroup serve as the example use cases for this feature. It provides an interface cgroup_load_subsys() and cgroup_unload_subsys() which modular subsystems can use to register and depart during runtime. The net_cls classifier subsystem serves as the example for a subsystem which can be converted into a module using these changes. Patch #1 sets up the subsys[] array so its contents can be dynamic as modules appear and (eventually) disappear. Iterations over the array are modified to handle when subsystems are absent, and the dynamic section of the array is protected by cgroup_mutex. Patch #2 implements an interface for modules to load subsystems, called cgroup_load_subsys, similar to cgroup_init_subsys, and adds a module pointer in struct cgroup_subsys. Patch #3 adds a mechanism for unloading modular subsystems, which includes a more advanced rework of the rudimentary reference counting introduced in patch 2. Patch #4 modifies the net_cls subsystem, which already had some module declarations, to be configurable as a module, which also serves as a simple proof-of-concept. Part of implementing patches 2 and 4 involved updating css pointers in each css_set when the module appears or leaves. In doing this, it was discovered that css_sets always remain linked to the dummy cgroup, regardless of whether or not any subsystems are actually bound to it (i.e., not mounted on an actual hierarchy). The subsystem loading and unloading code therefore should keep in mind the special cases where the added subsystem is the only one in the dummy cgroup (and therefore all css_sets need to be linked back into it) and where the removed subsys was the only one in the dummy cgroup (and therefore all css_sets should be unlinked from it) - however, as all css_sets always stay attached to the dummy cgroup anyway, these cases are ignored. Any fix that addresses this issue should also make sure these cases are addressed in the subsystem loading and unloading code. This patch: Make subsys[] able to be dynamically populated to support modular subsystems This patch reworks the way the subsys[] array is used so that subsystems can register themselves after boot time, and enables the internals of cgroups to be able to handle when subsystems are not present or may appear/disappear. Signed-off-by: Ben Blum Acked-by: Li Zefan Cc: Paul Menage Cc: "David S. Miller" Cc: KAMEZAWA Hiroyuki Cc: Lai Jiangshan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 10 ++++-- kernel/cgroup.c | 96 +++++++++++++++++++++++++++++++++++++++++--------- 2 files changed, 88 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 14160b5b693f..28319a9fe569 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -40,13 +40,19 @@ extern int cgroupstats_build(struct cgroupstats *stats, extern const struct file_operations proc_cgroup_operations; -/* Define the enumeration of all cgroup subsystems */ +/* Define the enumeration of all builtin cgroup subsystems */ #define SUBSYS(_x) _x ## _subsys_id, enum cgroup_subsys_id { #include - CGROUP_SUBSYS_COUNT + CGROUP_BUILTIN_SUBSYS_COUNT }; #undef SUBSYS +/* + * This define indicates the maximum number of subsystems that can be loaded + * at once. We limit to this many since cgroupfs_root has subsys_bits to keep + * track of all of them. + */ +#define CGROUP_SUBSYS_COUNT (BITS_PER_BYTE*sizeof(unsigned long)) /* Per-subsystem/per-cgroup state maintained by the system. */ struct cgroup_subsys_state { diff --git a/kernel/cgroup.c b/kernel/cgroup.c index cace83ddbcdc..c92fb9549358 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -57,10 +57,14 @@ static DEFINE_MUTEX(cgroup_mutex); -/* Generate an array of cgroup subsystem pointers */ +/* + * Generate an array of cgroup subsystem pointers. At boot time, this is + * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are + * registered after that. The mutable section of this array is protected by + * cgroup_mutex. + */ #define SUBSYS(_x) &_x ## _subsys, - -static struct cgroup_subsys *subsys[] = { +static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { #include }; @@ -448,8 +452,11 @@ static struct css_set *find_existing_css_set( struct hlist_node *node; struct css_set *cg; - /* Built the set of subsystem state objects that we want to - * see in the new css_set */ + /* + * Build the set of subsystem state objects that we want to see in the + * new css_set. while subsystems can change globally, the entries here + * won't change, so no need for locking. + */ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { if (root->subsys_bits & (1UL << i)) { /* Subsystem is in this hierarchy. So we want @@ -884,7 +891,9 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) css_put(css); } - +/* + * Call with cgroup_mutex held. + */ static int rebind_subsystems(struct cgroupfs_root *root, unsigned long final_bits) { @@ -892,6 +901,8 @@ static int rebind_subsystems(struct cgroupfs_root *root, struct cgroup *cgrp = &root->top_cgroup; int i; + BUG_ON(!mutex_is_locked(&cgroup_mutex)); + removed_bits = root->actual_subsys_bits & ~final_bits; added_bits = final_bits & ~root->actual_subsys_bits; /* Check that any added subsystems are currently free */ @@ -900,6 +911,12 @@ static int rebind_subsystems(struct cgroupfs_root *root, struct cgroup_subsys *ss = subsys[i]; if (!(bit & added_bits)) continue; + /* + * Nobody should tell us to do a subsys that doesn't exist: + * parse_cgroupfs_options should catch that case and refcounts + * ensure that subsystems won't disappear once selected. + */ + BUG_ON(ss == NULL); if (ss->root != &rootnode) { /* Subsystem isn't free */ return -EBUSY; @@ -919,6 +936,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, unsigned long bit = 1UL << i; if (bit & added_bits) { /* We're binding this subsystem to this hierarchy */ + BUG_ON(ss == NULL); BUG_ON(cgrp->subsys[i]); BUG_ON(!dummytop->subsys[i]); BUG_ON(dummytop->subsys[i]->cgroup != dummytop); @@ -932,6 +950,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, mutex_unlock(&ss->hierarchy_mutex); } else if (bit & removed_bits) { /* We're removing this subsystem */ + BUG_ON(ss == NULL); BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); BUG_ON(cgrp->subsys[i]->cgroup != cgrp); mutex_lock(&ss->hierarchy_mutex); @@ -944,6 +963,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, mutex_unlock(&ss->hierarchy_mutex); } else if (bit & final_bits) { /* Subsystem state should already exist */ + BUG_ON(ss == NULL); BUG_ON(!cgrp->subsys[i]); } else { /* Subsystem state shouldn't exist */ @@ -986,14 +1006,18 @@ struct cgroup_sb_opts { }; -/* Convert a hierarchy specifier into a bitmask of subsystems and - * flags. */ +/* + * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call + * with cgroup_mutex held to protect the subsys[] array. + */ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) { char *token, *o = data ?: "all"; unsigned long mask = (unsigned long)-1; + BUG_ON(!mutex_is_locked(&cgroup_mutex)); + #ifdef CONFIG_CPUSETS mask = ~(1UL << cpuset_subsys_id); #endif @@ -1009,6 +1033,8 @@ static int parse_cgroupfs_options(char *data, opts->subsys_bits = 0; for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; + if (ss == NULL) + continue; if (!ss->disabled) opts->subsys_bits |= 1ul << i; } @@ -1053,6 +1079,8 @@ static int parse_cgroupfs_options(char *data, int i; for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { ss = subsys[i]; + if (ss == NULL) + continue; if (!strcmp(token, ss->name)) { if (!ss->disabled) set_bit(i, &opts->subsys_bits); @@ -1306,7 +1334,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type, struct cgroupfs_root *new_root; /* First find the desired set of subsystems */ + mutex_lock(&cgroup_mutex); ret = parse_cgroupfs_options(data, &opts); + mutex_unlock(&cgroup_mutex); if (ret) goto out_err; @@ -2918,8 +2948,14 @@ static void cgroup_lock_hierarchy(struct cgroupfs_root *root) /* We need to take each hierarchy_mutex in a consistent order */ int i; + /* + * No worry about a race with rebind_subsystems that might mess up the + * locking order, since both parties are under cgroup_mutex. + */ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; + if (ss == NULL) + continue; if (ss->root == root) mutex_lock(&ss->hierarchy_mutex); } @@ -2931,6 +2967,8 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root) for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; + if (ss == NULL) + continue; if (ss->root == root) mutex_unlock(&ss->hierarchy_mutex); } @@ -3054,11 +3092,16 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) * synchronization other than RCU, and the subsystem linked * list isn't RCU-safe */ int i; + /* + * We won't need to lock the subsys array, because the subsystems + * we're concerned about aren't going anywhere since our cgroup root + * has a reference on them. + */ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; struct cgroup_subsys_state *css; - /* Skip subsystems not in this hierarchy */ - if (ss->root != cgrp->root) + /* Skip subsystems not present or not in this hierarchy */ + if (ss == NULL || ss->root != cgrp->root) continue; css = cgrp->subsys[ss->subsys_id]; /* When called from check_for_release() it's possible @@ -3279,7 +3322,8 @@ int __init cgroup_init_early(void) for (i = 0; i < CSS_SET_TABLE_SIZE; i++) INIT_HLIST_HEAD(&css_set_table[i]); - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + /* at bootup time, we don't worry about modular subsystems */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; BUG_ON(!ss->name); @@ -3314,7 +3358,8 @@ int __init cgroup_init(void) if (err) return err; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + /* at bootup time, we don't worry about modular subsystems */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (!ss->early_init) cgroup_init_subsys(ss); @@ -3423,9 +3468,16 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) int i; seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); + /* + * ideally we don't want subsystems moving around while we do this. + * cgroup_mutex is also necessary to guarantee an atomic snapshot of + * subsys/hierarchy state. + */ mutex_lock(&cgroup_mutex); for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; + if (ss == NULL) + continue; seq_printf(m, "%s\t%d\t%d\t%d\n", ss->name, ss->root->hierarchy_id, ss->root->number_of_cgroups, !ss->disabled); @@ -3483,7 +3535,12 @@ void cgroup_fork_callbacks(struct task_struct *child) { if (need_forkexit_callback) { int i; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + /* + * forkexit callbacks are only supported for builtin + * subsystems, and the builtin section of the subsys array is + * immutable, so we don't need to lock the subsys array here. + */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (ss->fork) ss->fork(ss, child); @@ -3552,7 +3609,11 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) struct css_set *cg; if (run_callbacks && need_forkexit_callback) { - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + /* + * modular subsystems can't use callbacks, so no need to lock + * the subsys array + */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (ss->exit) ss->exit(ss, tsk); @@ -3844,8 +3905,11 @@ static int __init cgroup_disable(char *str) while ((token = strsep(&str, ",")) != NULL) { if (!*token) continue; - - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + /* + * cgroup_disable, being at boot time, can't know about module + * subsystems, so we don't worry about them. + */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (!strcmp(token, ss->name)) { -- cgit v1.2.3 From e6a1105ba08b265023dd71a4174fb4a29ebc7083 Mon Sep 17 00:00:00 2001 From: Ben Blum Date: Wed, 10 Mar 2010 15:22:09 -0800 Subject: cgroups: subsystem module loading interface Add interface between cgroups subsystem management and module loading This patch implements rudimentary module-loading support for cgroups - namely, a cgroup_load_subsys (similar to cgroup_init_subsys) for use as a module initcall, and a struct module pointer in struct cgroup_subsys. Several functions that might be wanted by modules have had EXPORT_SYMBOL added to them, but it's unclear exactly which functions want it and which won't. Signed-off-by: Ben Blum Acked-by: Li Zefan Cc: Paul Menage Cc: "David S. Miller" Cc: KAMEZAWA Hiroyuki Cc: Lai Jiangshan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/cgroups.txt | 4 + include/linux/cgroup.h | 4 + kernel/cgroup.c | 150 ++++++++++++++++++++++++++++++++++++-- 3 files changed, 153 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index d45082653e3d..ae8a037a761e 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -488,6 +488,10 @@ Each subsystem should: - add an entry in linux/cgroup_subsys.h - define a cgroup_subsys object called _subsys +If a subsystem can be compiled as a module, it should also have in its +module initcall a call to cgroup_load_subsys(&its_subsys_struct). It +should also set its_subsys.module = THIS_MODULE in its .c file. + Each subsystem may export the following methods. The only mandatory methods are create/destroy. Any others that are null are presumed to be successful no-ops. diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 28319a9fe569..402ce477c47e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -37,6 +37,7 @@ extern void cgroup_post_fork(struct task_struct *p); extern void cgroup_exit(struct task_struct *p, int run_callbacks); extern int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry); +extern int cgroup_load_subsys(struct cgroup_subsys *ss); extern const struct file_operations proc_cgroup_operations; @@ -486,6 +487,9 @@ struct cgroup_subsys { /* used when use_id == true */ struct idr idr; spinlock_t id_lock; + + /* should be defined only by modular subsystems */ + struct module *module; }; #define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c92fb9549358..2cae38e64c59 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -254,7 +255,8 @@ struct cg_cgroup_link { static struct css_set init_css_set; static struct cg_cgroup_link init_css_set_link; -static int cgroup_subsys_init_idr(struct cgroup_subsys *ss); +static int cgroup_init_idr(struct cgroup_subsys *ss, + struct cgroup_subsys_state *css); /* css_set_lock protects the list of css_set objects, and the * chain of tasks off each css_set. Nests outside task->alloc_lock @@ -2125,6 +2127,7 @@ int cgroup_add_file(struct cgroup *cgrp, error = PTR_ERR(dentry); return error; } +EXPORT_SYMBOL_GPL(cgroup_add_file); int cgroup_add_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, @@ -2139,6 +2142,7 @@ int cgroup_add_files(struct cgroup *cgrp, } return 0; } +EXPORT_SYMBOL_GPL(cgroup_add_files); /** * cgroup_task_count - count the number of tasks in a cgroup. @@ -3292,7 +3296,144 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) mutex_init(&ss->hierarchy_mutex); lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); ss->active = 1; + + /* this function shouldn't be used with modular subsystems, since they + * need to register a subsys_id, among other things */ + BUG_ON(ss->module); +} + +/** + * cgroup_load_subsys: load and register a modular subsystem at runtime + * @ss: the subsystem to load + * + * This function should be called in a modular subsystem's initcall. If the + * subsytem is built as a module, it will be assigned a new subsys_id and set + * up for use. If the subsystem is built-in anyway, work is delegated to the + * simpler cgroup_init_subsys. + */ +int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) +{ + int i; + struct cgroup_subsys_state *css; + + /* check name and function validity */ + if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || + ss->create == NULL || ss->destroy == NULL) + return -EINVAL; + + /* + * we don't support callbacks in modular subsystems. this check is + * before the ss->module check for consistency; a subsystem that could + * be a module should still have no callbacks even if the user isn't + * compiling it as one. + */ + if (ss->fork || ss->exit) + return -EINVAL; + + /* + * an optionally modular subsystem is built-in: we want to do nothing, + * since cgroup_init_subsys will have already taken care of it. + */ + if (ss->module == NULL) { + /* a few sanity checks */ + BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT); + BUG_ON(subsys[ss->subsys_id] != ss); + return 0; + } + + /* + * need to register a subsys id before anything else - for example, + * init_cgroup_css needs it. + */ + mutex_lock(&cgroup_mutex); + /* find the first empty slot in the array */ + for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { + if (subsys[i] == NULL) + break; + } + if (i == CGROUP_SUBSYS_COUNT) { + /* maximum number of subsystems already registered! */ + mutex_unlock(&cgroup_mutex); + return -EBUSY; + } + /* assign ourselves the subsys_id */ + ss->subsys_id = i; + subsys[i] = ss; + + /* + * no ss->create seems to need anything important in the ss struct, so + * this can happen first (i.e. before the rootnode attachment). + */ + css = ss->create(ss, dummytop); + if (IS_ERR(css)) { + /* failure case - need to deassign the subsys[] slot. */ + subsys[i] = NULL; + mutex_unlock(&cgroup_mutex); + return PTR_ERR(css); + } + + list_add(&ss->sibling, &rootnode.subsys_list); + ss->root = &rootnode; + + /* our new subsystem will be attached to the dummy hierarchy. */ + init_cgroup_css(css, ss, dummytop); + /* init_idr must be after init_cgroup_css because it sets css->id. */ + if (ss->use_id) { + int ret = cgroup_init_idr(ss, css); + if (ret) { + dummytop->subsys[ss->subsys_id] = NULL; + ss->destroy(ss, dummytop); + subsys[i] = NULL; + mutex_unlock(&cgroup_mutex); + return ret; + } + } + + /* + * Now we need to entangle the css into the existing css_sets. unlike + * in cgroup_init_subsys, there are now multiple css_sets, so each one + * will need a new pointer to it; done by iterating the css_set_table. + * furthermore, modifying the existing css_sets will corrupt the hash + * table state, so each changed css_set will need its hash recomputed. + * this is all done under the css_set_lock. + */ + write_lock(&css_set_lock); + for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { + struct css_set *cg; + struct hlist_node *node, *tmp; + struct hlist_head *bucket = &css_set_table[i], *new_bucket; + + hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) { + /* skip entries that we already rehashed */ + if (cg->subsys[ss->subsys_id]) + continue; + /* remove existing entry */ + hlist_del(&cg->hlist); + /* set new value */ + cg->subsys[ss->subsys_id] = css; + /* recompute hash and restore entry */ + new_bucket = css_set_hash(cg->subsys); + hlist_add_head(&cg->hlist, new_bucket); + } + } + write_unlock(&css_set_lock); + + mutex_init(&ss->hierarchy_mutex); + lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); + ss->active = 1; + + /* + * pin the subsystem's module so it doesn't go away. this shouldn't + * fail, since the module's initcall calls us. + * TODO: with module unloading, move this elsewhere + */ + BUG_ON(!try_module_get(ss->module)); + + /* success! */ + mutex_unlock(&cgroup_mutex); + return 0; } +EXPORT_SYMBOL_GPL(cgroup_load_subsys); /** * cgroup_init_early - cgroup initialization at system boot @@ -3364,7 +3505,7 @@ int __init cgroup_init(void) if (!ss->early_init) cgroup_init_subsys(ss); if (ss->use_id) - cgroup_subsys_init_idr(ss); + cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]); } /* Add init_css_set to the hash table */ @@ -4033,15 +4174,14 @@ err_out: } -static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss) +static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, + struct cgroup_subsys_state *rootcss) { struct css_id *newid; - struct cgroup_subsys_state *rootcss; spin_lock_init(&ss->id_lock); idr_init(&ss->idr); - rootcss = init_css_set.subsys[ss->subsys_id]; newid = get_new_cssid(ss, 0); if (IS_ERR(newid)) return PTR_ERR(newid); -- cgit v1.2.3 From cf5d5941fda647fe3d2f2d00cf9e0245236a5f08 Mon Sep 17 00:00:00 2001 From: Ben Blum Date: Wed, 10 Mar 2010 15:22:09 -0800 Subject: cgroups: subsystem module unloading Provides support for unloading modular subsystems. This patch adds a new function cgroup_unload_subsys which is to be used for removing a loaded subsystem during module deletion. Reference counting of the subsystems' modules is moved from once (at load time) to once per attached hierarchy (in parse_cgroupfs_options and rebind_subsystems) (i.e., 0 or 1). Signed-off-by: Ben Blum Acked-by: Li Zefan Cc: Paul Menage Cc: "David S. Miller" Cc: KAMEZAWA Hiroyuki Cc: Lai Jiangshan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/cgroups.txt | 5 +- include/linux/cgroup.h | 4 +- kernel/cgroup.c | 167 ++++++++++++++++++++++++++++++++------ 3 files changed, 148 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index ae8a037a761e..764007b63921 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -489,8 +489,9 @@ Each subsystem should: - define a cgroup_subsys object called _subsys If a subsystem can be compiled as a module, it should also have in its -module initcall a call to cgroup_load_subsys(&its_subsys_struct). It -should also set its_subsys.module = THIS_MODULE in its .c file. +module initcall a call to cgroup_load_subsys(), and in its exitcall a +call to cgroup_unload_subsys(). It should also set its_subsys.module = +THIS_MODULE in its .c file. Each subsystem may export the following methods. The only mandatory methods are create/destroy. Any others that are null are presumed to diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 402ce477c47e..2a59d3101e5d 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -38,6 +38,7 @@ extern void cgroup_exit(struct task_struct *p, int run_callbacks); extern int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry); extern int cgroup_load_subsys(struct cgroup_subsys *ss); +extern void cgroup_unload_subsys(struct cgroup_subsys *ss); extern const struct file_operations proc_cgroup_operations; @@ -271,7 +272,8 @@ struct css_set { /* * Set of subsystem states, one for each subsystem. This array * is immutable after creation apart from the init_css_set - * during subsystem registration (at boot time). + * during subsystem registration (at boot time) and modular subsystem + * loading/unloading. */ struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2cae38e64c59..aa889c96cc74 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -894,7 +894,9 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) } /* - * Call with cgroup_mutex held. + * Call with cgroup_mutex held. Drops reference counts on modules, including + * any duplicate ones that parse_cgroupfs_options took. If this function + * returns an error, no reference counts are touched. */ static int rebind_subsystems(struct cgroupfs_root *root, unsigned long final_bits) @@ -950,6 +952,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, if (ss->bind) ss->bind(ss, cgrp); mutex_unlock(&ss->hierarchy_mutex); + /* refcount was already taken, and we're keeping it */ } else if (bit & removed_bits) { /* We're removing this subsystem */ BUG_ON(ss == NULL); @@ -963,10 +966,20 @@ static int rebind_subsystems(struct cgroupfs_root *root, subsys[i]->root = &rootnode; list_move(&ss->sibling, &rootnode.subsys_list); mutex_unlock(&ss->hierarchy_mutex); + /* subsystem is now free - drop reference on module */ + module_put(ss->module); } else if (bit & final_bits) { /* Subsystem state should already exist */ BUG_ON(ss == NULL); BUG_ON(!cgrp->subsys[i]); + /* + * a refcount was taken, but we already had one, so + * drop the extra reference. + */ + module_put(ss->module); +#ifdef CONFIG_MODULE_UNLOAD + BUG_ON(ss->module && !module_refcount(ss->module)); +#endif } else { /* Subsystem state shouldn't exist */ BUG_ON(cgrp->subsys[i]); @@ -1010,13 +1023,16 @@ struct cgroup_sb_opts { /* * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call - * with cgroup_mutex held to protect the subsys[] array. + * with cgroup_mutex held to protect the subsys[] array. This function takes + * refcounts on subsystems to be used, unless it returns error, in which case + * no refcounts are taken. */ -static int parse_cgroupfs_options(char *data, - struct cgroup_sb_opts *opts) +static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) { char *token, *o = data ?: "all"; unsigned long mask = (unsigned long)-1; + int i; + bool module_pin_failed = false; BUG_ON(!mutex_is_locked(&cgroup_mutex)); @@ -1031,7 +1047,6 @@ static int parse_cgroupfs_options(char *data, return -EINVAL; if (!strcmp(token, "all")) { /* Add all non-disabled subsystems */ - int i; opts->subsys_bits = 0; for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; @@ -1054,7 +1069,6 @@ static int parse_cgroupfs_options(char *data, if (!opts->release_agent) return -ENOMEM; } else if (!strncmp(token, "name=", 5)) { - int i; const char *name = token + 5; /* Can't specify an empty name */ if (!strlen(name)) @@ -1078,7 +1092,6 @@ static int parse_cgroupfs_options(char *data, return -ENOMEM; } else { struct cgroup_subsys *ss; - int i; for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { ss = subsys[i]; if (ss == NULL) @@ -1117,9 +1130,54 @@ static int parse_cgroupfs_options(char *data, if (!opts->subsys_bits && !opts->name) return -EINVAL; + /* + * Grab references on all the modules we'll need, so the subsystems + * don't dance around before rebind_subsystems attaches them. This may + * take duplicate reference counts on a subsystem that's already used, + * but rebind_subsystems handles this case. + */ + for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { + unsigned long bit = 1UL << i; + + if (!(bit & opts->subsys_bits)) + continue; + if (!try_module_get(subsys[i]->module)) { + module_pin_failed = true; + break; + } + } + if (module_pin_failed) { + /* + * oops, one of the modules was going away. this means that we + * raced with a module_delete call, and to the user this is + * essentially a "subsystem doesn't exist" case. + */ + for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) { + /* drop refcounts only on the ones we took */ + unsigned long bit = 1UL << i; + + if (!(bit & opts->subsys_bits)) + continue; + module_put(subsys[i]->module); + } + return -ENOENT; + } + return 0; } +static void drop_parsed_module_refcounts(unsigned long subsys_bits) +{ + int i; + for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { + unsigned long bit = 1UL << i; + + if (!(bit & subsys_bits)) + continue; + module_put(subsys[i]->module); + } +} + static int cgroup_remount(struct super_block *sb, int *flags, char *data) { int ret = 0; @@ -1136,21 +1194,19 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) if (ret) goto out_unlock; - /* Don't allow flags to change at remount */ - if (opts.flags != root->flags) { - ret = -EINVAL; - goto out_unlock; - } - - /* Don't allow name to change at remount */ - if (opts.name && strcmp(opts.name, root->name)) { + /* Don't allow flags or name to change at remount */ + if (opts.flags != root->flags || + (opts.name && strcmp(opts.name, root->name))) { ret = -EINVAL; + drop_parsed_module_refcounts(opts.subsys_bits); goto out_unlock; } ret = rebind_subsystems(root, opts.subsys_bits); - if (ret) + if (ret) { + drop_parsed_module_refcounts(opts.subsys_bits); goto out_unlock; + } /* (re)populate subsystem files */ cgroup_populate_dir(cgrp); @@ -1349,7 +1405,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, new_root = cgroup_root_from_opts(&opts); if (IS_ERR(new_root)) { ret = PTR_ERR(new_root); - goto out_err; + goto drop_modules; } opts.new_root = new_root; @@ -1358,7 +1414,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, if (IS_ERR(sb)) { ret = PTR_ERR(sb); cgroup_drop_root(opts.new_root); - goto out_err; + goto drop_modules; } root = sb->s_fs_info; @@ -1414,6 +1470,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type, free_cg_links(&tmp_cg_links); goto drop_new_super; } + /* + * There must be no failure case after here, since rebinding + * takes care of subsystems' refcounts, which are explicitly + * dropped in the failure exit path. + */ /* EBUSY should be the only error here */ BUG_ON(ret); @@ -1452,6 +1513,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type, * any) is not needed */ cgroup_drop_root(opts.new_root); + /* no subsys rebinding, so refcounts don't change */ + drop_parsed_module_refcounts(opts.subsys_bits); } simple_set_mnt(mnt, sb); @@ -1461,6 +1524,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type, drop_new_super: deactivate_locked_super(sb); + drop_modules: + drop_parsed_module_refcounts(opts.subsys_bits); out_err: kfree(opts.release_agent); kfree(opts.name); @@ -3422,19 +3487,71 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); ss->active = 1; - /* - * pin the subsystem's module so it doesn't go away. this shouldn't - * fail, since the module's initcall calls us. - * TODO: with module unloading, move this elsewhere - */ - BUG_ON(!try_module_get(ss->module)); - /* success! */ mutex_unlock(&cgroup_mutex); return 0; } EXPORT_SYMBOL_GPL(cgroup_load_subsys); +/** + * cgroup_unload_subsys: unload a modular subsystem + * @ss: the subsystem to unload + * + * This function should be called in a modular subsystem's exitcall. When this + * function is invoked, the refcount on the subsystem's module will be 0, so + * the subsystem will not be attached to any hierarchy. + */ +void cgroup_unload_subsys(struct cgroup_subsys *ss) +{ + struct cg_cgroup_link *link; + struct hlist_head *hhead; + + BUG_ON(ss->module == NULL); + + /* + * we shouldn't be called if the subsystem is in use, and the use of + * try_module_get in parse_cgroupfs_options should ensure that it + * doesn't start being used while we're killing it off. + */ + BUG_ON(ss->root != &rootnode); + + mutex_lock(&cgroup_mutex); + /* deassign the subsys_id */ + BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT); + subsys[ss->subsys_id] = NULL; + + /* remove subsystem from rootnode's list of subsystems */ + list_del(&ss->sibling); + + /* + * disentangle the css from all css_sets attached to the dummytop. as + * in loading, we need to pay our respects to the hashtable gods. + */ + write_lock(&css_set_lock); + list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { + struct css_set *cg = link->cg; + + hlist_del(&cg->hlist); + BUG_ON(!cg->subsys[ss->subsys_id]); + cg->subsys[ss->subsys_id] = NULL; + hhead = css_set_hash(cg->subsys); + hlist_add_head(&cg->hlist, hhead); + } + write_unlock(&css_set_lock); + + /* + * remove subsystem's css from the dummytop and free it - need to free + * before marking as null because ss->destroy needs the cgrp->subsys + * pointer to find their state. note that this also takes care of + * freeing the css_id. + */ + ss->destroy(ss, dummytop); + dummytop->subsys[ss->subsys_id] = NULL; + + mutex_unlock(&cgroup_mutex); +} +EXPORT_SYMBOL_GPL(cgroup_unload_subsys); + /** * cgroup_init_early - cgroup initialization at system boot * -- cgit v1.2.3 From 67523c48aa74d5637848edeccf285af1c60bf14a Mon Sep 17 00:00:00 2001 From: Ben Blum Date: Wed, 10 Mar 2010 15:22:11 -0800 Subject: cgroups: blkio subsystem as module Modify the Block I/O cgroup subsystem to be able to be built as a module. As the CFQ disk scheduler optionally depends on blk-cgroup, config options in block/Kconfig, block/Kconfig.iosched, and block/blk-cgroup.h are enhanced to support the new module dependency. Signed-off-by: Ben Blum Cc: Li Zefan Cc: Paul Menage Cc: "David S. Miller" Cc: KAMEZAWA Hiroyuki Cc: Lai Jiangshan Cc: Vivek Goyal Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/Kconfig | 2 +- block/Kconfig.iosched | 2 +- block/blk-cgroup.c | 53 ++++++++++++++++++++++++++++++++++++----------- block/blk-cgroup.h | 10 +++++++-- include/linux/iocontext.h | 2 +- kernel/cgroup.c | 9 ++++++++ 6 files changed, 61 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/block/Kconfig b/block/Kconfig index e20fbde0875c..62a5921321cd 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -78,7 +78,7 @@ config BLK_DEV_INTEGRITY Protection. If in doubt, say N. config BLK_CGROUP - bool + tristate depends on CGROUPS default n ---help--- diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index b71abfb0d726..fc71cf071fb2 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -23,6 +23,7 @@ config IOSCHED_DEADLINE config IOSCHED_CFQ tristate "CFQ I/O scheduler" + select BLK_CGROUP if CFQ_GROUP_IOSCHED default y ---help--- The CFQ I/O scheduler tries to distribute bandwidth equally @@ -35,7 +36,6 @@ config IOSCHED_CFQ config CFQ_GROUP_IOSCHED bool "CFQ Group Scheduling support" depends on IOSCHED_CFQ && CGROUPS - select BLK_CGROUP default n ---help--- Enable group IO scheduling in CFQ. diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index c85d74cae200..4b686ad08eaa 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -23,6 +23,31 @@ static LIST_HEAD(blkio_list); struct blkio_cgroup blkio_root_cgroup = { .weight = 2*BLKIO_WEIGHT_DEFAULT }; EXPORT_SYMBOL_GPL(blkio_root_cgroup); +static struct cgroup_subsys_state *blkiocg_create(struct cgroup_subsys *, + struct cgroup *); +static int blkiocg_can_attach(struct cgroup_subsys *, struct cgroup *, + struct task_struct *, bool); +static void blkiocg_attach(struct cgroup_subsys *, struct cgroup *, + struct cgroup *, struct task_struct *, bool); +static void blkiocg_destroy(struct cgroup_subsys *, struct cgroup *); +static int blkiocg_populate(struct cgroup_subsys *, struct cgroup *); + +struct cgroup_subsys blkio_subsys = { + .name = "blkio", + .create = blkiocg_create, + .can_attach = blkiocg_can_attach, + .attach = blkiocg_attach, + .destroy = blkiocg_destroy, + .populate = blkiocg_populate, +#ifdef CONFIG_BLK_CGROUP + /* note: blkio_subsys_id is otherwise defined in blk-cgroup.h */ + .subsys_id = blkio_subsys_id, +#endif + .use_id = 1, + .module = THIS_MODULE, +}; +EXPORT_SYMBOL_GPL(blkio_subsys); + struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup) { return container_of(cgroup_subsys_state(cgroup, blkio_subsys_id), @@ -253,7 +278,8 @@ remove_entry: done: free_css_id(&blkio_subsys, &blkcg->css); rcu_read_unlock(); - kfree(blkcg); + if (blkcg != &blkio_root_cgroup) + kfree(blkcg); } static struct cgroup_subsys_state * @@ -319,17 +345,6 @@ static void blkiocg_attach(struct cgroup_subsys *subsys, struct cgroup *cgroup, task_unlock(tsk); } -struct cgroup_subsys blkio_subsys = { - .name = "blkio", - .create = blkiocg_create, - .can_attach = blkiocg_can_attach, - .attach = blkiocg_attach, - .destroy = blkiocg_destroy, - .populate = blkiocg_populate, - .subsys_id = blkio_subsys_id, - .use_id = 1, -}; - void blkio_policy_register(struct blkio_policy_type *blkiop) { spin_lock(&blkio_list_lock); @@ -345,3 +360,17 @@ void blkio_policy_unregister(struct blkio_policy_type *blkiop) spin_unlock(&blkio_list_lock); } EXPORT_SYMBOL_GPL(blkio_policy_unregister); + +static int __init init_cgroup_blkio(void) +{ + return cgroup_load_subsys(&blkio_subsys); +} + +static void __exit exit_cgroup_blkio(void) +{ + cgroup_unload_subsys(&blkio_subsys); +} + +module_init(init_cgroup_blkio); +module_exit(exit_cgroup_blkio); +MODULE_LICENSE("GPL"); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index 84bf745fa775..8ccc20464dae 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -15,7 +15,13 @@ #include -#ifdef CONFIG_BLK_CGROUP +#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) + +#ifndef CONFIG_BLK_CGROUP +/* When blk-cgroup is a module, its subsys_id isn't a compile-time constant */ +extern struct cgroup_subsys blkio_subsys; +#define blkio_subsys_id blkio_subsys.subsys_id +#endif struct blkio_cgroup { struct cgroup_subsys_state css; @@ -91,7 +97,7 @@ static inline void blkiocg_update_blkio_group_dequeue_stats( struct blkio_group *blkg, unsigned long dequeue) {} #endif -#ifdef CONFIG_BLK_CGROUP +#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) extern struct blkio_cgroup blkio_root_cgroup; extern struct blkio_cgroup *cgroup_to_blkio_cgroup(struct cgroup *cgroup); extern void blkiocg_add_blkio_group(struct blkio_cgroup *blkcg, diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 1195a806fe0c..a0bb301afac0 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -42,7 +42,7 @@ struct io_context { unsigned short ioprio; unsigned short ioprio_changed; -#ifdef CONFIG_BLK_CGROUP +#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE) unsigned short cgroup_changed; #endif diff --git a/kernel/cgroup.c b/kernel/cgroup.c index aa889c96cc74..521591dbab2f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -705,6 +705,7 @@ void cgroup_lock(void) { mutex_lock(&cgroup_mutex); } +EXPORT_SYMBOL_GPL(cgroup_lock); /** * cgroup_unlock - release lock on cgroup changes @@ -715,6 +716,7 @@ void cgroup_unlock(void) { mutex_unlock(&cgroup_mutex); } +EXPORT_SYMBOL_GPL(cgroup_unlock); /* * A couple of forward declarations required, due to cyclic reference loop: @@ -1639,6 +1641,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) memmove(buf, start, buf + buflen - start); return 0; } +EXPORT_SYMBOL_GPL(cgroup_path); /** * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' @@ -1805,6 +1808,7 @@ bool cgroup_lock_live_group(struct cgroup *cgrp) } return true; } +EXPORT_SYMBOL_GPL(cgroup_lock_live_group); static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, const char *buffer) @@ -4082,6 +4086,7 @@ void __css_put(struct cgroup_subsys_state *css, int count) rcu_read_unlock(); WARN_ON_ONCE(val < 1); } +EXPORT_SYMBOL_GPL(__css_put); /* * Notify userspace when a cgroup is released, by running the @@ -4197,6 +4202,7 @@ unsigned short css_id(struct cgroup_subsys_state *css) return cssid->id; return 0; } +EXPORT_SYMBOL_GPL(css_id); unsigned short css_depth(struct cgroup_subsys_state *css) { @@ -4206,6 +4212,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css) return cssid->depth; return 0; } +EXPORT_SYMBOL_GPL(css_depth); bool css_is_ancestor(struct cgroup_subsys_state *child, const struct cgroup_subsys_state *root) @@ -4242,6 +4249,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) spin_unlock(&ss->id_lock); call_rcu(&id->rcu_head, __free_css_id_cb); } +EXPORT_SYMBOL_GPL(free_css_id); /* * This is called by init or create(). Then, calls to this function are @@ -4358,6 +4366,7 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) return rcu_dereference(cssid->css); } +EXPORT_SYMBOL_GPL(css_lookup); /** * css_get_next - lookup next cgroup under specified hierarchy. -- cgit v1.2.3 From b70cc5fdb445a6929a01e9c406593265b136c99d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 10 Mar 2010 15:22:12 -0800 Subject: cgroups: clean up cgroup_pidlist_find() a bit Don't call get_pid_ns() before we locate/alloc the ns. Signed-off-by: Li Zefan Cc: Serge Hallyn Acked-by: Paul Menage Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 521591dbab2f..1bf4d6db54ab 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2597,7 +2597,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, { struct cgroup_pidlist *l; /* don't need task_nsproxy() if we're looking at ourself */ - struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns); + struct pid_namespace *ns = current->nsproxy->pid_ns; + /* * We can't drop the pidlist_mutex before taking the l->mutex in case * the last ref-holder is trying to remove l from the list at the same @@ -2607,8 +2608,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, mutex_lock(&cgrp->pidlist_mutex); list_for_each_entry(l, &cgrp->pidlists, links) { if (l->key.type == type && l->key.ns == ns) { - /* found a matching list - drop the extra refcount */ - put_pid_ns(ns); /* make sure l doesn't vanish out from under us */ down_write(&l->mutex); mutex_unlock(&cgrp->pidlist_mutex); @@ -2619,13 +2618,12 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); if (!l) { mutex_unlock(&cgrp->pidlist_mutex); - put_pid_ns(ns); return l; } init_rwsem(&l->mutex); down_write(&l->mutex); l->key.type = type; - l->key.ns = ns; + l->key.ns = get_pid_ns(ns); l->use_count = 0; /* don't increment here */ l->list = NULL; l->owner = cgrp; -- cgit v1.2.3 From 0dea116876eefc9c7ca9c5d74fe665481e499fa3 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 10 Mar 2010 15:22:20 -0800 Subject: cgroup: implement eventfd-based generic API for notifications This patchset introduces eventfd-based API for notifications in cgroups and implements memory notifications on top of it. It uses statistics in memory controler to track memory usage. Output of time(1) on building kernel on tmpfs: Root cgroup before changes: make -j2 506.37 user 60.93s system 193% cpu 4:52.77 total Non-root cgroup before changes: make -j2 507.14 user 62.66s system 193% cpu 4:54.74 total Root cgroup after changes (0 thresholds): make -j2 507.13 user 62.20s system 193% cpu 4:53.55 total Non-root cgroup after changes (0 thresholds): make -j2 507.70 user 64.20s system 193% cpu 4:55.70 total Root cgroup after changes (1 thresholds, never crossed): make -j2 506.97 user 62.20s system 193% cpu 4:53.90 total Non-root cgroup after changes (1 thresholds, never crossed): make -j2 507.55 user 64.08s system 193% cpu 4:55.63 total This patch: Introduce the write-only file "cgroup.event_control" in every cgroup. To register new notification handler you need: - create an eventfd; - open a control file to be monitored. Callbacks register_event() and unregister_event() must be defined for the control file; - write " " to cgroup.event_control. Interpretation of args is defined by control file implementation; eventfd will be woken up by control file implementation or when the cgroup is removed. To unregister notification handler just close eventfd. If you need notification functionality for a control file you have to implement callbacks register_event() and unregister_event() in the struct cftype. [kamezawa.hiroyu@jp.fujitsu.com: Kconfig fix] Signed-off-by: Kirill A. Shutemov Reviewed-by: KAMEZAWA Hiroyuki Paul Menage Cc: Li Zefan Cc: Balbir Singh Cc: Pavel Emelyanov Cc: Dan Malek Cc: Vladislav Buzov Cc: Daisuke Nishimura Cc: Alexander Shishkin Cc: Davide Libenzi Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/cgroups.txt | 20 ++++ include/linux/cgroup.h | 24 ++++ init/Kconfig | 1 + kernel/cgroup.c | 228 +++++++++++++++++++++++++++++++++++++- 4 files changed, 272 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt index c0358c30c64f..fd588ff0e296 100644 --- a/Documentation/cgroups/cgroups.txt +++ b/Documentation/cgroups/cgroups.txt @@ -23,6 +23,7 @@ CONTENTS: 2.1 Basic Usage 2.2 Attaching processes 2.3 Mounting hierarchies by name + 2.4 Notification API 3. Kernel API 3.1 Overview 3.2 Synchronization @@ -435,6 +436,25 @@ you give a subsystem a name. The name of the subsystem appears as part of the hierarchy description in /proc/mounts and /proc//cgroups. +2.4 Notification API +-------------------- + +There is mechanism which allows to get notifications about changing +status of a cgroup. + +To register new notification handler you need: + - create a file descriptor for event notification using eventfd(2); + - open a control file to be monitored (e.g. memory.usage_in_bytes); + - write " " to cgroup.event_control. + Interpretation of args is defined by control file implementation; + +eventfd will be woken up by control file implementation or when the +cgroup is removed. + +To unregister notification handler just close eventfd. + +NOTE: Support of notifications should be implemented for the control +file. See documentation for the subsystem. 3. Kernel API ============= diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 2a59d3101e5d..b4f2201321cd 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -235,6 +235,10 @@ struct cgroup { /* For RCU-protected deletion */ struct rcu_head rcu_head; + + /* List of events which userspace want to recieve */ + struct list_head event_list; + spinlock_t event_list_lock; }; /* @@ -378,6 +382,26 @@ struct cftype { int (*trigger)(struct cgroup *cgrp, unsigned int event); int (*release)(struct inode *inode, struct file *file); + + /* + * register_event() callback will be used to add new userspace + * waiter for changes related to the cftype. Implement it if + * you want to provide this functionality. Use eventfd_signal() + * on eventfd to send notification to userspace. + */ + int (*register_event)(struct cgroup *cgrp, struct cftype *cft, + struct eventfd_ctx *eventfd, const char *args); + /* + * unregister_event() callback will be called when userspace + * closes the eventfd or on cgroup removing. + * This callback must be implemented, if you want provide + * notification functionality. + * + * Be careful. It can be called after destroy(), so you have + * to keep all nesessary data, until all events are removed. + */ + int (*unregister_event)(struct cgroup *cgrp, struct cftype *cft, + struct eventfd_ctx *eventfd); }; struct cgroup_scanner { diff --git a/init/Kconfig b/init/Kconfig index 089a230e5652..eb77e8ccde1c 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -463,6 +463,7 @@ config HAVE_UNSTABLE_SCHED_CLOCK menuconfig CGROUPS boolean "Control Group support" + depends on EVENTFD help This option adds support for grouping sets of processes together, for use with process control subsystems such as Cpusets, CFS, memory diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1bf4d6db54ab..ea94984a3895 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4,6 +4,10 @@ * Based originally on the cpuset system, extracted by Paul Menage * Copyright (C) 2006 Google, Inc * + * Notifications support + * Copyright (C) 2009 Nokia Corporation + * Author: Kirill A. Shutemov + * * Copyright notices from the original cpuset code: * -------------------------------------------------- * Copyright (C) 2003 BULL SA. @@ -53,6 +57,8 @@ #include #include #include /* TODO: replace with more sophisticated array */ +#include +#include #include @@ -152,6 +158,35 @@ struct css_id { unsigned short stack[0]; /* Array of Length (depth+1) */ }; +/* + * cgroup_event represents events which userspace want to recieve. + */ +struct cgroup_event { + /* + * Cgroup which the event belongs to. + */ + struct cgroup *cgrp; + /* + * Control file which the event associated. + */ + struct cftype *cft; + /* + * eventfd to signal userspace about the event. + */ + struct eventfd_ctx *eventfd; + /* + * Each of these stored in a list by the cgroup. + */ + struct list_head list; + /* + * All fields below needed to unregister event when + * userspace closes eventfd. + */ + poll_table pt; + wait_queue_head_t *wqh; + wait_queue_t wait; + struct work_struct remove; +}; /* The list of hierarchy roots */ @@ -760,14 +795,28 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) static int cgroup_call_pre_destroy(struct cgroup *cgrp) { struct cgroup_subsys *ss; + struct cgroup_event *event, *tmp; int ret = 0; for_each_subsys(cgrp->root, ss) if (ss->pre_destroy) { ret = ss->pre_destroy(ss, cgrp); if (ret) - break; + goto out; } + + /* + * Unregister events and notify userspace. + */ + spin_lock(&cgrp->event_list_lock); + list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { + list_del(&event->list); + eventfd_signal(event->eventfd, 1); + schedule_work(&event->remove); + } + spin_unlock(&cgrp->event_list_lock); + +out: return ret; } @@ -1239,6 +1288,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); + INIT_LIST_HEAD(&cgrp->event_list); + spin_lock_init(&cgrp->event_list_lock); } static void init_cgroup_root(struct cgroupfs_root *root) @@ -2077,6 +2128,16 @@ static const struct inode_operations cgroup_dir_inode_operations = { .rename = cgroup_rename, }; +/* + * Check if a file is a control file + */ +static inline struct cftype *__file_cft(struct file *file) +{ + if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations) + return ERR_PTR(-EINVAL); + return __d_cft(file->f_dentry); +} + static int cgroup_create_file(struct dentry *dentry, mode_t mode, struct super_block *sb) { @@ -2930,6 +2991,166 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp, return 0; } +/* + * Unregister event and free resources. + * + * Gets called from workqueue. + */ +static void cgroup_event_remove(struct work_struct *work) +{ + struct cgroup_event *event = container_of(work, struct cgroup_event, + remove); + struct cgroup *cgrp = event->cgrp; + + /* TODO: check return code */ + event->cft->unregister_event(cgrp, event->cft, event->eventfd); + + eventfd_ctx_put(event->eventfd); + remove_wait_queue(event->wqh, &event->wait); + kfree(event); +} + +/* + * Gets called on POLLHUP on eventfd when user closes it. + * + * Called with wqh->lock held and interrupts disabled. + */ +static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, + int sync, void *key) +{ + struct cgroup_event *event = container_of(wait, + struct cgroup_event, wait); + struct cgroup *cgrp = event->cgrp; + unsigned long flags = (unsigned long)key; + + if (flags & POLLHUP) { + spin_lock(&cgrp->event_list_lock); + list_del(&event->list); + spin_unlock(&cgrp->event_list_lock); + /* + * We are in atomic context, but cgroup_event_remove() may + * sleep, so we have to call it in workqueue. + */ + schedule_work(&event->remove); + } + + return 0; +} + +static void cgroup_event_ptable_queue_proc(struct file *file, + wait_queue_head_t *wqh, poll_table *pt) +{ + struct cgroup_event *event = container_of(pt, + struct cgroup_event, pt); + + event->wqh = wqh; + add_wait_queue(wqh, &event->wait); +} + +/* + * Parse input and register new cgroup event handler. + * + * Input must be in format ' '. + * Interpretation of args is defined by control file implementation. + */ +static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, + const char *buffer) +{ + struct cgroup_event *event = NULL; + unsigned int efd, cfd; + struct file *efile = NULL; + struct file *cfile = NULL; + char *endp; + int ret; + + efd = simple_strtoul(buffer, &endp, 10); + if (*endp != ' ') + return -EINVAL; + buffer = endp + 1; + + cfd = simple_strtoul(buffer, &endp, 10); + if ((*endp != ' ') && (*endp != '\0')) + return -EINVAL; + buffer = endp + 1; + + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return -ENOMEM; + event->cgrp = cgrp; + INIT_LIST_HEAD(&event->list); + init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); + init_waitqueue_func_entry(&event->wait, cgroup_event_wake); + INIT_WORK(&event->remove, cgroup_event_remove); + + efile = eventfd_fget(efd); + if (IS_ERR(efile)) { + ret = PTR_ERR(efile); + goto fail; + } + + event->eventfd = eventfd_ctx_fileget(efile); + if (IS_ERR(event->eventfd)) { + ret = PTR_ERR(event->eventfd); + goto fail; + } + + cfile = fget(cfd); + if (!cfile) { + ret = -EBADF; + goto fail; + } + + /* the process need read permission on control file */ + ret = file_permission(cfile, MAY_READ); + if (ret < 0) + goto fail; + + event->cft = __file_cft(cfile); + if (IS_ERR(event->cft)) { + ret = PTR_ERR(event->cft); + goto fail; + } + + if (!event->cft->register_event || !event->cft->unregister_event) { + ret = -EINVAL; + goto fail; + } + + ret = event->cft->register_event(cgrp, event->cft, + event->eventfd, buffer); + if (ret) + goto fail; + + if (efile->f_op->poll(efile, &event->pt) & POLLHUP) { + event->cft->unregister_event(cgrp, event->cft, event->eventfd); + ret = 0; + goto fail; + } + + spin_lock(&cgrp->event_list_lock); + list_add(&event->list, &cgrp->event_list); + spin_unlock(&cgrp->event_list_lock); + + fput(cfile); + fput(efile); + + return 0; + +fail: + if (cfile) + fput(cfile); + + if (event && event->eventfd && !IS_ERR(event->eventfd)) + eventfd_ctx_put(event->eventfd); + + if (!IS_ERR_OR_NULL(efile)) + fput(efile); + + kfree(event); + + return ret; +} + /* * for the common functions, 'private' gives the type of file */ @@ -2955,6 +3176,11 @@ static struct cftype files[] = { .read_u64 = cgroup_read_notify_on_release, .write_u64 = cgroup_write_notify_on_release, }, + { + .name = CGROUP_FILE_GENERIC_PREFIX "event_control", + .write_string = cgroup_write_event_control, + .mode = S_IWUGO, + }, }; static struct cftype cft_release_agent = { -- cgit v1.2.3 From 4ab78683c17d739c2a2077141dcf81a02b7fb57e Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 10 Mar 2010 15:22:34 -0800 Subject: cgroups: fix race between userspace and kernelspace Notify userspace about cgroup removing only after rmdir of cgroup directory to avoid race between userspace and kernelspace. eventfd are used to notify about two types of event: - control file-specific, like crossing memory threshold; - cgroup removing. To understand what really happen, userspace can check if the cgroup still exists. To avoid race beetween userspace and kernelspace we have to notify userspace about cgroup removing only after rmdir of cgroup directory. Signed-off-by: Kirill A. Shutemov Reviewed-by: KAMEZAWA Hiroyuki Cc: Paul Menage Acked-by: Li Zefan Cc: Balbir Singh Cc: Pavel Emelyanov Cc: Dan Malek Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ea94984a3895..87441fc75663 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -795,28 +795,15 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) static int cgroup_call_pre_destroy(struct cgroup *cgrp) { struct cgroup_subsys *ss; - struct cgroup_event *event, *tmp; int ret = 0; for_each_subsys(cgrp->root, ss) if (ss->pre_destroy) { ret = ss->pre_destroy(ss, cgrp); if (ret) - goto out; + break; } - /* - * Unregister events and notify userspace. - */ - spin_lock(&cgrp->event_list_lock); - list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { - list_del(&event->list); - eventfd_signal(event->eventfd, 1); - schedule_work(&event->remove); - } - spin_unlock(&cgrp->event_list_lock); - -out: return ret; } @@ -3006,7 +2993,6 @@ static void cgroup_event_remove(struct work_struct *work) event->cft->unregister_event(cgrp, event->cft, event->eventfd); eventfd_ctx_put(event->eventfd); - remove_wait_queue(event->wqh, &event->wait); kfree(event); } @@ -3024,6 +3010,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, unsigned long flags = (unsigned long)key; if (flags & POLLHUP) { + remove_wait_queue_locked(event->wqh, &event->wait); spin_lock(&cgrp->event_list_lock); list_del(&event->list); spin_unlock(&cgrp->event_list_lock); @@ -3472,6 +3459,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) struct dentry *d; struct cgroup *parent; DEFINE_WAIT(wait); + struct cgroup_event *event, *tmp; int ret; /* the vfs holds both inode->i_mutex already */ @@ -3555,6 +3543,20 @@ again: set_bit(CGRP_RELEASABLE, &parent->flags); check_for_release(parent); + /* + * Unregister events and notify userspace. + * Notify userspace about cgroup removing only after rmdir of cgroup + * directory to avoid race between userspace and kernelspace + */ + spin_lock(&cgrp->event_list_lock); + list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { + list_del(&event->list); + remove_wait_queue(event->wqh, &event->wait); + eventfd_signal(event->eventfd, 1); + schedule_work(&event->remove); + } + spin_unlock(&cgrp->event_list_lock); + mutex_unlock(&cgroup_mutex); return 0; } -- cgit v1.2.3 From a0a4db548edcce067c1201ef25cf2bc29f32dca4 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 10 Mar 2010 15:22:34 -0800 Subject: cgroups: remove events before destroying subsystem state objects Events should be removed after rmdir of cgroup directory, but before destroying subsystem state objects. Let's take reference to cgroup directory dentry to do that. Signed-off-by: Kirill A. Shutemov Acked-by: KAMEZAWA Hiroyuki Cc: Paul Menage Acked-by: Li Zefan Cc: Balbir Singh Cc: Pavel Emelyanov Cc: Dan Malek Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 3 --- kernel/cgroup.c | 8 ++++++++ mm/memcontrol.c | 9 --------- 3 files changed, 8 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index b4f2201321cd..b8ad1ea99586 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -396,9 +396,6 @@ struct cftype { * closes the eventfd or on cgroup removing. * This callback must be implemented, if you want provide * notification functionality. - * - * Be careful. It can be called after destroy(), so you have - * to keep all nesessary data, until all events are removed. */ int (*unregister_event)(struct cgroup *cgrp, struct cftype *cft, struct eventfd_ctx *eventfd); diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 87441fc75663..ef909a329750 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2994,6 +2994,7 @@ static void cgroup_event_remove(struct work_struct *work) eventfd_ctx_put(event->eventfd); kfree(event); + dput(cgrp->dentry); } /* @@ -3114,6 +3115,13 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, goto fail; } + /* + * Events should be removed after rmdir of cgroup directory, but before + * destroying subsystem state objects. Let's take reference to cgroup + * directory dentry to do that. + */ + dget(cgrp->dentry); + spin_lock(&cgrp->event_list_lock); list_add(&event->list, &cgrp->event_list); spin_unlock(&cgrp->event_list_lock); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f9ae4b4c36eb..f7b910fc14fb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3361,12 +3361,6 @@ static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft, } } - /* - * We need to increment refcnt to be sure that all thresholds - * will be unregistered before calling __mem_cgroup_free() - */ - mem_cgroup_get(memcg); - if (type == _MEM) rcu_assign_pointer(memcg->thresholds, thresholds_new); else @@ -3460,9 +3454,6 @@ assign: /* To be sure that nobody uses thresholds before freeing it */ synchronize_rcu(); - for (i = 0; i < thresholds->size - size; i++) - mem_cgroup_put(memcg); - kfree(thresholds); unlock: mutex_unlock(&memcg->thresholds_lock); -- cgit v1.2.3 From a56704ef6b0c5796c9ff38cc78aa232dfb9644d7 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 10 Mar 2010 15:23:01 -0800 Subject: copy_signal() cleanup: use zalloc and remove initializations Use kmem_cache_zalloc() on signal creation and remove unneeded initialization lines in copy_signal(). Signed-off-by: Veaceslav Falico Acked-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 27 +-------------------------- 1 file changed, 1 insertion(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index b0ec34abc0bb..ce2666f84d85 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -863,7 +863,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) if (clone_flags & CLONE_THREAD) return 0; - sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); + sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL); tsk->signal = sig; if (!sig) return -ENOMEM; @@ -871,46 +871,21 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) atomic_set(&sig->count, 1); atomic_set(&sig->live, 1); init_waitqueue_head(&sig->wait_chldexit); - sig->flags = 0; if (clone_flags & CLONE_NEWPID) sig->flags |= SIGNAL_UNKILLABLE; - sig->group_exit_code = 0; - sig->group_exit_task = NULL; - sig->group_stop_count = 0; sig->curr_target = tsk; init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - sig->it_real_incr.tv64 = 0; sig->real_timer.function = it_real_fn; - sig->leader = 0; /* session leadership doesn't inherit */ - sig->tty_old_pgrp = NULL; - sig->tty = NULL; - - sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; - sig->gtime = cputime_zero; - sig->cgtime = cputime_zero; -#ifndef CONFIG_VIRT_CPU_ACCOUNTING - sig->prev_utime = sig->prev_stime = cputime_zero; -#endif - sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; - sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; - sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; - sig->maxrss = sig->cmaxrss = 0; - task_io_accounting_init(&sig->ioac); - sig->sum_sched_runtime = 0; - taskstats_tgid_init(sig); - task_lock(current->group_leader); memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); task_unlock(current->group_leader); posix_cpu_timers_init_group(sig); - acct_init_pacct(&sig->pacct); - tty_audit_fork(sig); sig->oom_adj = current->signal->oom_adj; -- cgit v1.2.3 From 4dd66e69d472f0ba5355a2529364d0db9a18a02b Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 10 Mar 2010 15:23:02 -0800 Subject: copy_signal() cleanup: kill taskstats_tgid_init() and acct_init_pacct() Kill unused functions taskstats_tgid_init() and acct_init_pacct() because we don't use them anywhere after using kmem_cache_zalloc() in copy_signal(). Signed-off-by: Veaceslav Falico Cc: Roland McGrath Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/acct.h | 2 -- include/linux/taskstats_kern.h | 7 ------- kernel/acct.c | 10 ---------- 3 files changed, 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/acct.h b/include/linux/acct.h index 882dc7248766..93f46096ad4c 100644 --- a/include/linux/acct.h +++ b/include/linux/acct.h @@ -123,14 +123,12 @@ struct pacct_struct; struct pid_namespace; extern void acct_auto_close_mnt(struct vfsmount *m); extern void acct_auto_close(struct super_block *sb); -extern void acct_init_pacct(struct pacct_struct *pacct); extern void acct_collect(long exitcode, int group_dead); extern void acct_process(void); extern void acct_exit_ns(struct pid_namespace *); #else #define acct_auto_close_mnt(x) do { } while (0) #define acct_auto_close(x) do { } while (0) -#define acct_init_pacct(x) do { } while (0) #define acct_collect(x,y) do { } while (0) #define acct_process() do { } while (0) #define acct_exit_ns(ns) do { } while (0) diff --git a/include/linux/taskstats_kern.h b/include/linux/taskstats_kern.h index 3398f4553269..b6523c1427ce 100644 --- a/include/linux/taskstats_kern.h +++ b/include/linux/taskstats_kern.h @@ -14,11 +14,6 @@ extern struct kmem_cache *taskstats_cache; extern struct mutex taskstats_exit_mutex; -static inline void taskstats_tgid_init(struct signal_struct *sig) -{ - sig->stats = NULL; -} - static inline void taskstats_tgid_free(struct signal_struct *sig) { if (sig->stats) @@ -30,8 +25,6 @@ extern void taskstats_init_early(void); #else static inline void taskstats_exit(struct task_struct *tsk, int group_dead) {} -static inline void taskstats_tgid_init(struct signal_struct *sig) -{} static inline void taskstats_tgid_free(struct signal_struct *sig) {} static inline void taskstats_init_early(void) diff --git a/kernel/acct.c b/kernel/acct.c index a6605ca921b6..24f8c81fc48d 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -587,16 +587,6 @@ out: revert_creds(orig_cred); } -/** - * acct_init_pacct - initialize a new pacct_struct - * @pacct: per-process accounting info struct to initialize - */ -void acct_init_pacct(struct pacct_struct *pacct) -{ - memset(pacct, 0, sizeof(struct pacct_struct)); - pacct->ac_utime = pacct->ac_stime = cputime_zero; -} - /** * acct_collect - collect accounting information into pacct_struct * @exitcode: task exit code -- cgit v1.2.3 From 93c59907c6f247d09239135caecf294a106a2ae0 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 10 Mar 2010 15:23:03 -0800 Subject: copy_signal() cleanup: clean thread_group_cputime_init() Remove unneeded initializations in thread_group_cputime_init() and in posix_cpu_timers_init_group(). They are useless after kmem_cache_zalloc() was used in copy_signal(). Signed-off-by: Veaceslav Falico Acked-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 2 -- kernel/fork.c | 11 ----------- 2 files changed, 13 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 46c6f8d5dc06..ca635c128482 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2391,9 +2391,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times); static inline void thread_group_cputime_init(struct signal_struct *sig) { - sig->cputimer.cputime = INIT_CPUTIME; spin_lock_init(&sig->cputimer.lock); - sig->cputimer.running = 0; } static inline void thread_group_cputime_free(struct signal_struct *sig) diff --git a/kernel/fork.c b/kernel/fork.c index ce2666f84d85..1beb6c303c41 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -833,17 +833,6 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) /* Thread group counters. */ thread_group_cputime_init(sig); - /* Expiration times and increments. */ - sig->it[CPUCLOCK_PROF].expires = cputime_zero; - sig->it[CPUCLOCK_PROF].incr = cputime_zero; - sig->it[CPUCLOCK_VIRT].expires = cputime_zero; - sig->it[CPUCLOCK_VIRT].incr = cputime_zero; - - /* Cached expiration times. */ - sig->cputime_expires.prof_exp = cputime_zero; - sig->cputime_expires.virt_exp = cputime_zero; - sig->cputime_expires.sched_exp = 0; - cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); if (cpu_limit != RLIM_INFINITY) { sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); -- cgit v1.2.3 From 13aa9a6b0f2371d2ce0de57c2ede62ab7a787157 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Mar 2010 15:23:09 -0800 Subject: pid_ns: zap_pid_ns_processes: use SEND_SIG_NOINFO instead of force_sig() zap_pid_ns_processes() uses force_sig(SIGKILL) to ensure SIGKILL will be delivered to sub-namespace inits as well. This is correct, but we are going to change force_sig_info() semantics. See http://bugzilla.kernel.org/show_bug.cgi?id=15395#c31 We can use send_sig_info(SEND_SIG_NOINFO) instead, since 614c517d7c00af1b26ded20646b329397d6f51a1 ("signals: SEND_SIG_NOINFO should be considered as SI_FROMUSER()") SEND_SIG_NOINFO means "from user" and therefore send_signal() will get the correct from_ancestor_ns = T flag. Signed-off-by: Oleg Nesterov Acked-by: Serge Hallyn Acked-by: Linus Torvalds Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid_namespace.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 86b3796b0436..79aac93acf99 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -161,13 +161,12 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) rcu_read_lock(); /* - * Use force_sig() since it clears SIGNAL_UNKILLABLE ensuring - * any nested-container's init processes don't ignore the - * signal + * Any nested-container's init processes won't ignore the + * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser(). */ task = pid_task(find_vpid(nr), PIDTYPE_PID); if (task) - force_sig(SIGKILL, task); + send_sig_info(SIGKILL, SEND_SIG_NOINFO, task); rcu_read_unlock(); -- cgit v1.2.3 From 8467005da3ef6104b89a4cc5e9c9d9445b75565f Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 10 Mar 2010 15:23:10 -0800 Subject: nsproxy: remove INIT_NSPROXY() Remove INIT_NSPROXY(), use C99 initializer. Remove INIT_IPC_NS(), INIT_NET_NS() while I'm at it. Note: headers trim will be done later, now it's quite pointless because results will be invalidated by merge window. Signed-off-by: Alexey Dobriyan Acked-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/init_task.h | 8 -------- include/linux/ipc_namespace.h | 5 ----- include/net/net_namespace.h | 5 ----- kernel/nsproxy.c | 13 ++++++++++++- 4 files changed, 12 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index abec69b63d7e..b1ed1cd8e2a8 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -32,14 +32,6 @@ extern struct fs_struct init_fs; } extern struct nsproxy init_nsproxy; -#define INIT_NSPROXY(nsproxy) { \ - .pid_ns = &init_pid_ns, \ - .count = ATOMIC_INIT(1), \ - .uts_ns = &init_uts_ns, \ - .mnt_ns = NULL, \ - INIT_NET_NS(net_ns) \ - INIT_IPC_NS(ipc_ns) \ -} #define INIT_SIGHAND(sighand) { \ .count = ATOMIC_INIT(1), \ diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index 07baa38bce37..51952989ad42 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -62,11 +62,6 @@ extern struct ipc_namespace init_ipc_ns; extern atomic_t nr_ipc_ns; extern spinlock_t mq_lock; -#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC) -#define INIT_IPC_NS(ns) .ns = &init_ipc_ns, -#else -#define INIT_IPC_NS(ns) -#endif #ifdef CONFIG_SYSVIPC extern int register_ipcns_notifier(struct ipc_namespace *); diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 82b7be4db89a..bd10a7908993 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -100,14 +100,9 @@ struct net { extern struct net init_net; #ifdef CONFIG_NET -#define INIT_NET_NS(net_ns) .net_ns = &init_net, - extern struct net *copy_net_ns(unsigned long flags, struct net *net_ns); #else /* CONFIG_NET */ - -#define INIT_NET_NS(net_ns) - static inline struct net *copy_net_ns(unsigned long flags, struct net *net_ns) { /* There is nothing to copy so this is a noop */ diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 09b4ff9711b2..2ab67233ee8f 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -24,7 +24,18 @@ static struct kmem_cache *nsproxy_cachep; -struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); +struct nsproxy init_nsproxy = { + .count = ATOMIC_INIT(1), + .uts_ns = &init_uts_ns, +#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC) + .ipc_ns = &init_ipc_ns, +#endif + .mnt_ns = NULL, + .pid_ns = &init_pid_ns, +#ifdef CONFIG_NET + .net_ns = &init_net, +#endif +}; static inline struct nsproxy *create_nsproxy(void) { -- cgit v1.2.3 From eb5572fed55f4c2b7dbc42582bc82dcb47632380 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 10 Mar 2010 15:23:59 -0800 Subject: sysctl extern cleanup: C_A_D Extern declarations in sysctl.c should be moved to their own header file, and then include them in relavant .c files. Move C_A_D extern variable declaration to linux/reboot.h Signed-off-by: Dave Young Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/reboot.h | 1 + kernel/sysctl.c | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/reboot.h b/include/linux/reboot.h index 988e55fe649b..3005d5a7fce5 100644 --- a/include/linux/reboot.h +++ b/include/linux/reboot.h @@ -64,6 +64,7 @@ extern void kernel_restart(char *cmd); extern void kernel_halt(void); extern void kernel_power_off(void); +extern int C_A_D; /* for sysctl */ void ctrl_alt_del(void); #define POWEROFF_CMD_PATH_LEN 256 diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0ef19c614f6d..72c3b1e80d7b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -65,7 +65,6 @@ #if defined(CONFIG_SYSCTL) /* External variables not in a header file. */ -extern int C_A_D; extern int print_fatal_signals; extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; -- cgit v1.2.3 From d33ed52d57e794eba55cea3f5eab3c8f80b6cb5a Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 10 Mar 2010 15:23:59 -0800 Subject: sysctl extern cleanup: signal Extern declarations in sysctl.c should be moved to their own header file, and then include them in relavant .c files. Move print_fatal_signals extern declaration to linux/signal.h Signed-off-by: Dave Young Cc: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/signal.h | 2 ++ kernel/sysctl.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/signal.h b/include/linux/signal.h index ab9272cc270c..fcd2b14b1932 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -7,6 +7,8 @@ #ifdef __KERNEL__ #include +/* for sysctl */ +extern int print_fatal_signals; /* * Real Time signals may be queued. */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 72c3b1e80d7b..a8fd10a9a501 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -65,7 +66,6 @@ #if defined(CONFIG_SYSCTL) /* External variables not in a header file. */ -extern int print_fatal_signals; extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; extern int sysctl_panic_on_oom; -- cgit v1.2.3 From e5ab67726f33b50f40db0ccf271ceb3c658554d5 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 10 Mar 2010 15:24:05 -0800 Subject: sysctl extern cleanup: rcu Extern declarations in sysctl.c should be moved to their own header file, and then include them in relavant .c files. Move rcutorture_runnable extern declaration to linux/rcupdate.h Signed-off-by: Dave Young Acked-by: Josh Triplett Reviewed-by: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rcupdate.h | 4 ++++ kernel/sysctl.c | 3 --- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index c84373626336..a005cac5e302 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -41,6 +41,10 @@ #include #include +#ifdef CONFIG_RCU_TORTURE_TEST +extern int rcutorture_runnable; /* for sysctl */ +#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ + /** * struct rcu_head - callback structure for use with RCU * @next: next update requests in a list diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a8fd10a9a501..f18aaa7b0d65 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -87,9 +87,6 @@ extern int sysctl_nr_open_min, sysctl_nr_open_max; #ifndef CONFIG_MMU extern int sysctl_nr_trim_pages; #endif -#ifdef CONFIG_RCU_TORTURE_TEST -extern int rcutorture_runnable; -#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ #ifdef CONFIG_BLOCK extern int blk_iopoll_enabled; #endif -- cgit v1.2.3 From 5ed109103d73b0bafc92e860cead56725231384d Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 10 Mar 2010 15:24:06 -0800 Subject: sysctl extern cleanup: module Extern declarations in sysctl.c should be moved to their own header file, and then include them in relavant .c files. Move modprobe_path extern declaration to linux/kmod.h Move modules_disabled extern declaration to linux/module.h Signed-off-by: Dave Young Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kmod.h | 1 + include/linux/module.h | 1 + kernel/sysctl.c | 4 ---- 3 files changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/kmod.h b/include/linux/kmod.h index 384ca8bbf1ac..facb27fe7de0 100644 --- a/include/linux/kmod.h +++ b/include/linux/kmod.h @@ -27,6 +27,7 @@ #define KMOD_PATH_LEN 256 #ifdef CONFIG_MODULES +extern char modprobe_path[]; /* for sysctl */ /* modprobe exit status on success, -ve on error. Return value * usually useless though. */ extern int __request_module(bool wait, const char *name, ...) \ diff --git a/include/linux/module.h b/include/linux/module.h index dd618eb026aa..5e869ffd34aa 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -175,6 +175,7 @@ struct notifier_block; #ifdef CONFIG_MODULES +extern int modules_disabled; /* for sysctl */ /* Get/put a kernel symbol (calls must be symmetric) */ void *__symbol_get(const char *symbol); void *__symbol_get_gpl(const char *symbol); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f18aaa7b0d65..44e9492368fd 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -116,10 +116,6 @@ static int min_percpu_pagelist_fract = 8; static int ngroups_max = NGROUPS_MAX; -#ifdef CONFIG_MODULES -extern char modprobe_path[]; -extern int modules_disabled; -#endif #ifdef CONFIG_CHR_DEV_SG extern int sg_big_buff; #endif -- cgit v1.2.3 From 15485a4682d1d3bfee2aa78b4b1a5d36f5746b64 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 10 Mar 2010 15:24:07 -0800 Subject: sysctl extern cleanup: sg Extern declarations in sysctl.c should be moved to their own header file, and then include them in relavant .c files. Move sg_big_buff extern declaration to scsi/sg.h Signed-off-by: Dave Young Acked-by: Doug Gilbert Cc: James Bottomley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/scsi/sg.h | 3 +++ kernel/sysctl.c | 7 +++---- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/scsi/sg.h b/include/scsi/sg.h index 934ae389671d..a9f3c6fc3f57 100644 --- a/include/scsi/sg.h +++ b/include/scsi/sg.h @@ -70,6 +70,9 @@ Major new features in SG 3.x driver (cf SG 2.x drivers) (for the lk 2.2 series). */ +#ifdef __KERNEL__ +extern int sg_big_buff; /* for sysctl */ +#endif /* New interface introduced in the 3.x SG drivers follows */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 44e9492368fd..5290c437f151 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -61,6 +61,9 @@ #include #include #endif +#ifdef CONFIG_CHR_DEV_SG +#include +#endif #if defined(CONFIG_SYSCTL) @@ -116,10 +119,6 @@ static int min_percpu_pagelist_fract = 8; static int ngroups_max = NGROUPS_MAX; -#ifdef CONFIG_CHR_DEV_SG -extern int sg_big_buff; -#endif - #ifdef CONFIG_SPARC #include #endif -- cgit v1.2.3 From c55b7c3e82d0ad58f35a0785faaaf2f70b9b6cd3 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 10 Mar 2010 15:24:08 -0800 Subject: sysctl extern cleanup: acct Extern declarations in sysctl.c should be moved to their own header file, and then include them in relavant .c files. Move acct_parm extern declaration to linux/acct.h Signed-off-by: Dave Young Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/acct.h | 1 + kernel/sysctl.c | 7 +++---- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/acct.h b/include/linux/acct.h index 93f46096ad4c..3e4737fa6cce 100644 --- a/include/linux/acct.h +++ b/include/linux/acct.h @@ -121,6 +121,7 @@ struct vfsmount; struct super_block; struct pacct_struct; struct pid_namespace; +extern int acct_parm[]; /* for sysctl */ extern void acct_auto_close_mnt(struct vfsmount *m); extern void acct_auto_close(struct super_block *sb); extern void acct_collect(long exitcode, int group_dead); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5290c437f151..7635bb15f5af 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -61,6 +61,9 @@ #include #include #endif +#ifdef CONFIG_BSD_PROCESS_ACCT +#include +#endif #ifdef CONFIG_CHR_DEV_SG #include #endif @@ -140,10 +143,6 @@ extern int sysctl_userprocess_debug; extern int spin_retry; #endif -#ifdef CONFIG_BSD_PROCESS_ACCT -extern int acct_parm[]; -#endif - #ifdef CONFIG_IA64 extern int no_unaligned_warning; extern int unaligned_dump_stack; -- cgit v1.2.3 From 4f0e056fdebc15d3f4724ebc7bbf323158add1d7 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 10 Mar 2010 15:24:09 -0800 Subject: sysctl extern cleanup: rtmutex Extern declarations in sysctl.c should be moved to their own header file, and then include them in relavant .c files. Move max_lock_depth extern declaration to linux/rtmutex.h Signed-off-by: Dave Young Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rtmutex.h | 2 ++ kernel/sysctl.c | 7 +++---- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index 281d8fd775e8..8d522ffeda33 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -16,6 +16,8 @@ #include #include +extern int max_lock_depth; /* for sysctl */ + /** * The rt_mutex structure * diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7635bb15f5af..622029ba5103 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -64,6 +64,9 @@ #ifdef CONFIG_BSD_PROCESS_ACCT #include #endif +#ifdef CONFIG_RT_MUTEXES +#include +#endif #ifdef CONFIG_CHR_DEV_SG #include #endif @@ -150,10 +153,6 @@ extern int unaligned_dump_stack; extern struct ratelimit_state printk_ratelimit_state; -#ifdef CONFIG_RT_MUTEXES -extern int max_lock_depth; -#endif - #ifdef CONFIG_PROC_SYSCTL static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -- cgit v1.2.3 From 2edf5e49800846a2b2b6461d99cdae18067c440f Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 10 Mar 2010 15:24:10 -0800 Subject: sysctl extern cleanup: lockdep Extern declarations in sysctl.c should be moved to their own header file, and then include them in relavant .c files. Move lockdep extern declarations to linux/lockdep.h Signed-off-by: Dave Young Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/lockdep.h | 4 ++++ kernel/sysctl.c | 6 +++--- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 10206a87da19..a03977a96d7e 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -12,6 +12,10 @@ struct task_struct; struct lockdep_map; +/* for sysctl */ +extern int prove_locking; +extern int lock_stat; + #ifdef CONFIG_LOCKDEP #include diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 622029ba5103..8686b0f5fc12 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -67,6 +67,9 @@ #ifdef CONFIG_RT_MUTEXES #include #endif +#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT) +#include +#endif #ifdef CONFIG_CHR_DEV_SG #include #endif @@ -191,9 +194,6 @@ extern struct ctl_table epoll_table[]; int sysctl_legacy_va_layout; #endif -extern int prove_locking; -extern int lock_stat; - /* The default sysctl tables: */ static struct ctl_table root_table[] = { -- cgit v1.2.3 From 52fbe9cde7fdb5c6fac196d7ebd2d92d05ef3cd4 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 8 Mar 2010 14:50:43 +0800 Subject: ring-buffer: Move disabled check into preempt disable section The ring buffer resizing and resetting relies on a schedule RCU action. The buffers are disabled, a synchronize_sched() is called and then the resize or reset takes place. But this only works if the disabling of the buffers are within the preempt disabled section, otherwise a window exists that the buffers can be written to while a reset or resize takes place. Cc: stable@kernel.org Reported-by: Li Zefan Signed-off-by: Lai Jiangshan LKML-Reference: <4B949E43.2010906@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 8c1b2d290718..54191d6ed195 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2232,12 +2232,12 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) if (ring_buffer_flags != RB_BUFFERS_ON) return NULL; - if (atomic_read(&buffer->record_disabled)) - return NULL; - /* If we are tracing schedule, we don't want to recurse */ resched = ftrace_preempt_disable(); + if (atomic_read(&buffer->record_disabled)) + goto out_nocheck; + if (trace_recursive_lock()) goto out_nocheck; @@ -2469,11 +2469,11 @@ int ring_buffer_write(struct ring_buffer *buffer, if (ring_buffer_flags != RB_BUFFERS_ON) return -EBUSY; - if (atomic_read(&buffer->record_disabled)) - return -EBUSY; - resched = ftrace_preempt_disable(); + if (atomic_read(&buffer->record_disabled)) + goto out; + cpu = raw_smp_processor_id(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) -- cgit v1.2.3 From ea14eb714041d40fcc5180b5a586034503650149 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 12 Mar 2010 19:41:23 -0500 Subject: function-graph: Init curr_ret_stack with ret_stack If the graph tracer is active, and a task is forked but the allocating of the processes graph stack fails, it can cause crash later on. This is due to the temporary stack being NULL, but the curr_ret_stack variable is copied from the parent. If it is not -1, then in ftrace_graph_probe_sched_switch() the following: for (index = next->curr_ret_stack; index >= 0; index--) next->ret_stack[index].calltime += timestamp; Will cause a kernel OOPS. Found with Li Zefan's ftrace_stress_test. Cc: stable@kernel.org Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d4d1238b096b..bb53edbb5c8c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3349,6 +3349,7 @@ void ftrace_graph_init_task(struct task_struct *t) { /* Make sure we do not use the parent ret_stack */ t->ret_stack = NULL; + t->curr_ret_stack = -1; if (ftrace_graph_active) { struct ftrace_ret_stack *ret_stack; @@ -3358,7 +3359,6 @@ void ftrace_graph_init_task(struct task_struct *t) GFP_KERNEL); if (!ret_stack) return; - t->curr_ret_stack = -1; atomic_set(&t->tracing_graph_pause, 0); atomic_set(&t->trace_overrun, 0); t->ftrace_timestamp = 0; -- cgit v1.2.3 From 283740c619d211e34572cc93c8cdba92ccbdb9cc Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 12 Mar 2010 19:48:41 -0500 Subject: tracing: Use same local variable when resetting the ring buffer In the ftrace code that resets the ring buffer it references the buffer with a local variable, but then uses the tr->buffer as the parameter to reset. If the wakeup tracer is running, which can switch the tr->buffer with the max saved buffer, this can break the requirement of disabling the buffer before the reset. buffer = tr->buffer; ring_buffer_record_disable(buffer); synchronize_sched(); __tracing_reset(tr->buffer, cpu); If the tr->buffer is swapped, then the reset is not happening to the buffer that was disabled. This will cause the ring buffer to fail. Found with Li Zefan's ftrace_stress_test. Cc: stable@kernel.org Reported-by: Lai Jiangshan Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6af8d7bc953b..60de37bd0f75 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -840,10 +840,10 @@ out: mutex_unlock(&trace_types_lock); } -static void __tracing_reset(struct trace_array *tr, int cpu) +static void __tracing_reset(struct ring_buffer *buffer, int cpu) { ftrace_disable_cpu(); - ring_buffer_reset_cpu(tr->buffer, cpu); + ring_buffer_reset_cpu(buffer, cpu); ftrace_enable_cpu(); } @@ -855,7 +855,7 @@ void tracing_reset(struct trace_array *tr, int cpu) /* Make sure all commits have finished */ synchronize_sched(); - __tracing_reset(tr, cpu); + __tracing_reset(buffer, cpu); ring_buffer_record_enable(buffer); } @@ -873,7 +873,7 @@ void tracing_reset_online_cpus(struct trace_array *tr) tr->time_start = ftrace_now(tr->cpu); for_each_online_cpu(cpu) - __tracing_reset(tr, cpu); + __tracing_reset(buffer, cpu); ring_buffer_record_enable(buffer); } -- cgit v1.2.3 From a2f8071428ed9a0f06865f417c962421c9a6b488 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 12 Mar 2010 19:56:00 -0500 Subject: tracing: Disable buffer switching when starting or stopping trace When the trace iterator is read, tracing_start() and tracing_stop() is called to stop tracing while the iterator is processing the trace output. These functions disable both the standard buffer and the max latency buffer. But if the wakeup tracer is running, it can switch these buffers between the two disables: buffer = global_trace.buffer; if (buffer) ring_buffer_record_disable(buffer); <<<--------- swap happens here buffer = max_tr.buffer; if (buffer) ring_buffer_record_disable(buffer); What happens is that we disabled the same buffer twice. On tracing_start() we can enable the same buffer twice. All ring_buffer_record_disable() must be matched with a ring_buffer_record_enable() or the buffer can be disable permanently, or enable prematurely, and cause a bug where a reset happens while a trace is commiting. This patch protects these two by taking the ftrace_max_lock to prevent a switch from occurring. Found with Li Zefan's ftrace_stress_test. Cc: stable@kernel.org Reported-by: Lai Jiangshan Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 60de37bd0f75..484337d33959 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -950,6 +950,8 @@ void tracing_start(void) goto out; } + /* Prevent the buffers from switching */ + arch_spin_lock(&ftrace_max_lock); buffer = global_trace.buffer; if (buffer) @@ -959,6 +961,8 @@ void tracing_start(void) if (buffer) ring_buffer_record_enable(buffer); + arch_spin_unlock(&ftrace_max_lock); + ftrace_start(); out: spin_unlock_irqrestore(&tracing_start_lock, flags); @@ -980,6 +984,9 @@ void tracing_stop(void) if (trace_stop_count++) goto out; + /* Prevent the buffers from switching */ + arch_spin_lock(&ftrace_max_lock); + buffer = global_trace.buffer; if (buffer) ring_buffer_record_disable(buffer); @@ -988,6 +995,8 @@ void tracing_stop(void) if (buffer) ring_buffer_record_disable(buffer); + arch_spin_unlock(&ftrace_max_lock); + out: spin_unlock_irqrestore(&tracing_start_lock, flags); } -- cgit v1.2.3 From b6345879ccbd9b92864fbd7eb8ac48acdb4d6b15 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 12 Mar 2010 20:03:30 -0500 Subject: tracing: Do not record user stack trace from NMI context A bug was found with Li Zefan's ftrace_stress_test that caused applications to segfault during the test. Placing a tracing_off() in the segfault code, and examining several traces, I found that the following was always the case. The lock tracer was enabled (lockdep being required) and userstack was enabled. Testing this out, I just enabled the two, but that was not good enough. I needed to run something else that could trigger it. Running a load like hackbench did not work, but executing a new program would. The following would trigger the segfault within seconds: # echo 1 > /debug/tracing/options/userstacktrace # echo 1 > /debug/tracing/events/lock/enable # while :; do ls > /dev/null ; done Enabling the function graph tracer and looking at what was happening I finally noticed that all cashes happened just after an NMI. 1) | copy_user_handle_tail() { 1) | bad_area_nosemaphore() { 1) | __bad_area_nosemaphore() { 1) | no_context() { 1) | fixup_exception() { 1) 0.319 us | search_exception_tables(); 1) 0.873 us | } [...] 1) 0.314 us | __rcu_read_unlock(); 1) 0.325 us | native_apic_mem_write(); 1) 0.943 us | } 1) 0.304 us | rcu_nmi_exit(); [...] 1) 0.479 us | find_vma(); 1) | bad_area() { 1) | __bad_area() { After capturing several traces of failures, all of them happened after an NMI. Curious about this, I added a trace_printk() to the NMI handler to read the regs->ip to see where the NMI happened. In which I found out it was here: ffffffff8135b660 : ffffffff8135b660: 48 83 ec 78 sub $0x78,%rsp ffffffff8135b664: e8 97 01 00 00 callq ffffffff8135b800 What was happening is that the NMI would happen at the place that a page fault occurred. It would call rcu_read_lock() which was traced by the lock events, and the user_stack_trace would run. This would trigger a page fault inside the NMI. I do not see where the CR2 register is saved or restored in NMI handling. This means that it would corrupt the page fault handling that the NMI interrupted. The reason the while loop of ls helped trigger the bug, was that each execution of ls would cause lots of pages to be faulted in, and increase the chances of the race happening. The simple solution is to not allow user stack traces in NMI context. After this patch, I ran the above "ls" test for a couple of hours without any issues. Without this patch, the bug would trigger in less than a minute. Cc: stable@kernel.org Reported-by: Li Zefan Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 484337d33959..e52683f7c3b2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1284,6 +1284,13 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) return; + /* + * NMIs can not handle page faults, even with fix ups. + * The save user stack can (and often does) fault. + */ + if (unlikely(in_nmi())) + return; + event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, sizeof(*entry), flags, pc); if (!event) -- cgit v1.2.3