From 3dccf5d07f68e850b84daede79f3c8bc121f1546 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:39:25 -0700 Subject: [PATCH] x86-64 update From: Andi Kleen Current x86-64 patchkit for 2.6.5. - Add drivers/firmware/Kconfig - Clarify description of CONFIG_IOMMU_DEBUG - Use correct gcc option to optimize for Intel CPUs - Add EDD support (Matt Domsch) - Add workaround for broken IOMMU on VIA hardware. Uses swiotlb there now. - Handle more than 8 local APICs (Suresh B Siddha) - Delete obsolete mtrr Makefile - Add x86_cache_alignment and set it up properly for P4 (128 bytes instead of 64bytes). Also report in /proc/cpuinfo - Minor cleanup in in_gate_area - Make asm-generic/dma-mapping.h compile with !CONFIG_PCI Just stub out all functions in this case. This is mainly to work around sysfs. - More !CONFIG_PCI compile fixes - Make u64 sector_t unconditional --- include/asm-generic/dma-mapping.h | 117 ++++++++++++++++++++++++++++++++++++++ include/asm-x86_64/apicdef.h | 2 + include/asm-x86_64/bootsetup.h | 3 + include/asm-x86_64/pci.h | 3 +- include/asm-x86_64/processor.h | 3 +- include/asm-x86_64/proto.h | 2 + include/asm-x86_64/smp.h | 24 +++++++- include/asm-x86_64/types.h | 4 -- 8 files changed, 150 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/asm-generic/dma-mapping.h b/include/asm-generic/dma-mapping.h index 54b0f7b71e95..04a28e6dd366 100644 --- a/include/asm-generic/dma-mapping.h +++ b/include/asm-generic/dma-mapping.h @@ -7,6 +7,10 @@ #ifndef _ASM_GENERIC_DMA_MAPPING_H #define _ASM_GENERIC_DMA_MAPPING_H +#include + +#ifdef CONFIG_PCI + /* we implement the API below in terms of the existing PCI one, * so include it */ #include @@ -146,6 +150,119 @@ dma_mapping_error(dma_addr_t dma_addr) return pci_dma_mapping_error(dma_addr); } + +#else + +static inline int +dma_supported(struct device *dev, u64 mask) +{ + return 0; +} + +static inline int +dma_set_mask(struct device *dev, u64 dma_mask) +{ + BUG(); + return 0; +} + 
+static inline void * +dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, + int flag) +{ + BUG(); + return NULL; +} + +static inline void +dma_free_coherent(struct device *dev, size_t size, void *cpu_addr, + dma_addr_t dma_handle) +{ + BUG(); +} + +static inline dma_addr_t +dma_map_single(struct device *dev, void *cpu_addr, size_t size, + enum dma_data_direction direction) +{ + BUG(); + return 0; +} + +static inline void +dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, + enum dma_data_direction direction) +{ + BUG(); +} + +static inline dma_addr_t +dma_map_page(struct device *dev, struct page *page, + unsigned long offset, size_t size, + enum dma_data_direction direction) +{ + BUG(); + return 0; +} + +static inline void +dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, + enum dma_data_direction direction) +{ + BUG(); +} + +static inline int +dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, + enum dma_data_direction direction) +{ + BUG(); + return 0; +} + +static inline void +dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries, + enum dma_data_direction direction) +{ + BUG(); +} + +static inline void +dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, + enum dma_data_direction direction) +{ + BUG(); +} + +static inline void +dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size, + enum dma_data_direction direction) +{ + BUG(); +} + +static inline void +dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, + enum dma_data_direction direction) +{ + BUG(); +} + +static inline void +dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, + enum dma_data_direction direction) +{ + BUG(); +} + +static inline int +dma_error(dma_addr_t dma_addr) +{ + return 0; +} + +#endif + /* Now for the API extensions over the pci_ one */ #define dma_alloc_noncoherent(d, s, h, f) 
dma_alloc_coherent(d, s, h, f) diff --git a/include/asm-x86_64/apicdef.h b/include/asm-x86_64/apicdef.h index 3a32c1452a89..8ba1d6ef13b2 100644 --- a/include/asm-x86_64/apicdef.h +++ b/include/asm-x86_64/apicdef.h @@ -370,4 +370,6 @@ struct local_apic { #undef u32 +#define BAD_APICID 0xFFu + #endif diff --git a/include/asm-x86_64/bootsetup.h b/include/asm-x86_64/bootsetup.h index b4644415575a..ee1557748b0e 100644 --- a/include/asm-x86_64/bootsetup.h +++ b/include/asm-x86_64/bootsetup.h @@ -26,6 +26,9 @@ extern char x86_boot_params[2048]; #define INITRD_START (*(unsigned int *) (PARAM+0x218)) #define INITRD_SIZE (*(unsigned int *) (PARAM+0x21c)) #define EDID_INFO (*(struct edid_info *) (PARAM+0x440)) +#define DISK80_SIGNATURE (*(unsigned int*) (PARAM+DISK80_SIG_BUFFER)) +#define EDD_NR (*(unsigned char *) (PARAM+EDDNR)) +#define EDD_BUF ((struct edd_info *) (PARAM+EDDBUF)) #define COMMAND_LINE saved_command_line #define COMMAND_LINE_SIZE 256 diff --git a/include/asm-x86_64/pci.h b/include/asm-x86_64/pci.h index 3e40884b0b19..ac9e9581d0a3 100644 --- a/include/asm-x86_64/pci.h +++ b/include/asm-x86_64/pci.h @@ -357,8 +357,9 @@ static inline void pcibios_add_platform_entries(struct pci_dev *dev) #endif /* __KERNEL__ */ /* generic pci stuff */ +#ifdef CONFIG_PCI #include - #include +#endif #endif /* __x8664_PCI_H */ diff --git a/include/asm-x86_64/processor.h b/include/asm-x86_64/processor.h index 14bab87d299b..a0ecd64a6a89 100644 --- a/include/asm-x86_64/processor.h +++ b/include/asm-x86_64/processor.h @@ -59,6 +59,7 @@ struct cpuinfo_x86 { char x86_model_id[64]; int x86_cache_size; /* in KB */ int x86_clflush_size; + int x86_cache_alignment; int x86_tlbsize; /* number of 4K pages in DTLB/ITLB combined(in pages)*/ __u8 x86_virt_bits, x86_phys_bits; __u32 x86_power; @@ -453,6 +454,6 @@ static inline void __mwait(unsigned long eax, unsigned long ecx) ti->task; \ }) -#define cache_line_size() (boot_cpu_data.x86_clflush_size) +#define cache_line_size() 
(boot_cpu_data.x86_cache_alignment) #endif /* __ASM_X86_64_PROCESSOR_H */ diff --git a/include/asm-x86_64/proto.h b/include/asm-x86_64/proto.h index 693f1a022314..5b0c38182172 100644 --- a/include/asm-x86_64/proto.h +++ b/include/asm-x86_64/proto.h @@ -101,6 +101,8 @@ extern int acpi_disabled; extern int fallback_aper_order; extern int fallback_aper_force; extern int iommu_aperture; +extern int iommu_aperture_disabled; +extern int iommu_aperture_allowed; extern void smp_local_timer_interrupt(struct pt_regs * regs); diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h index 97a19c35864f..e82b6a9884fb 100644 --- a/include/asm-x86_64/smp.h +++ b/include/asm-x86_64/smp.h @@ -80,10 +80,30 @@ extern __inline int hard_smp_processor_id(void) * the real APIC ID <-> CPU # mapping. * AK: why is this volatile? */ -extern volatile char x86_apicid_to_cpu[NR_CPUS]; extern volatile char x86_cpu_to_apicid[NR_CPUS]; -#define safe_smp_processor_id() (disable_apic ? 0 : x86_apicid_to_cpu[hard_smp_processor_id()]) +static inline char x86_apicid_to_cpu(char apicid) +{ + int i; + + for (i = 0; i < NR_CPUS; ++i) + if (x86_cpu_to_apicid[i] == apicid) + return i; + + return -1; +} + +#define safe_smp_processor_id() (disable_apic ? 
0 : x86_apicid_to_cpu(hard_smp_processor_id())) + +extern u8 bios_cpu_apicid[]; + +static inline int cpu_present_to_apicid(int mps_cpu) +{ + if (mps_cpu < NR_CPUS) + return (int)bios_cpu_apicid[mps_cpu]; + else + return BAD_APICID; +} #define cpu_online(cpu) cpu_isset(cpu, cpu_online_map) #endif /* !ASSEMBLY */ diff --git a/include/asm-x86_64/types.h b/include/asm-x86_64/types.h index b7c4d2fb9509..c86c2e6793e2 100644 --- a/include/asm-x86_64/types.h +++ b/include/asm-x86_64/types.h @@ -33,8 +33,6 @@ typedef unsigned long long __u64; #ifndef __ASSEMBLY__ -#include - typedef signed char s8; typedef unsigned char u8; @@ -50,10 +48,8 @@ typedef unsigned long long u64; typedef u64 dma64_addr_t; typedef u64 dma_addr_t; -#ifdef CONFIG_LBD typedef u64 sector_t; #define HAVE_SECTOR_T -#endif #endif /* __ASSEMBLY__ */ -- cgit v1.2.3 From 243c64b2cfea7e49e074c80db65fa7b90d765c6f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:39:51 -0700 Subject: [PATCH] feed devfs through Lindent Nobody seems to have any outstanding work against devfs, so... 
--- fs/devfs/base.c | 2857 ++++++++++++++++++++------------------- fs/devfs/util.c | 2 +- include/linux/devfs_fs.h | 32 +- include/linux/devfs_fs_kernel.h | 26 +- 4 files changed, 1467 insertions(+), 1450 deletions(-) (limited to 'include') diff --git a/fs/devfs/base.c b/fs/devfs/base.c index 952031e7dd99..c7ea29114c4f 100644 --- a/fs/devfs/base.c +++ b/fs/devfs/base.c @@ -710,8 +710,8 @@ #define DEBUG_UNREGISTER 0x0000004 #define DEBUG_FREE 0x0000008 #define DEBUG_SET_FLAGS 0x0000010 -#define DEBUG_S_READ 0x0000100 /* Break */ -#define DEBUG_I_LOOKUP 0x0001000 /* Break */ +#define DEBUG_S_READ 0x0000100 /* Break */ +#define DEBUG_I_LOOKUP 0x0001000 /* Break */ #define DEBUG_I_CREATE 0x0002000 #define DEBUG_I_GET 0x0004000 #define DEBUG_I_CHANGE 0x0008000 @@ -719,8 +719,8 @@ #define DEBUG_I_RLINK 0x0020000 #define DEBUG_I_FLINK 0x0040000 #define DEBUG_I_MKNOD 0x0080000 -#define DEBUG_F_READDIR 0x0100000 /* Break */ -#define DEBUG_D_DELETE 0x1000000 /* Break */ +#define DEBUG_F_READDIR 0x0100000 /* Break */ +#define DEBUG_D_DELETE 0x1000000 /* Break */ #define DEBUG_D_RELEASE 0x2000000 #define DEBUG_D_IPUT 0x4000000 #define DEBUG_ALL 0xfffffff @@ -753,88 +753,80 @@ typedef struct devfs_entry *devfs_handle_t; -struct directory_type -{ - rwlock_t lock; /* Lock for searching(R)/updating(W) */ - struct devfs_entry *first; - struct devfs_entry *last; - unsigned char no_more_additions:1; +struct directory_type { + rwlock_t lock; /* Lock for searching(R)/updating(W) */ + struct devfs_entry *first; + struct devfs_entry *last; + unsigned char no_more_additions:1; }; -struct symlink_type -{ - unsigned int length; /* Not including the NULL-termimator */ - char *linkname; /* This is NULL-terminated */ +struct symlink_type { + unsigned int length; /* Not including the NULL-termimator */ + char *linkname; /* This is NULL-terminated */ }; -struct devfs_inode /* This structure is for "persistent" inode storage */ -{ - struct dentry *dentry; - struct timespec atime; - struct 
timespec mtime; - struct timespec ctime; - unsigned int ino; /* Inode number as seen in the VFS */ - uid_t uid; - gid_t gid; +struct devfs_inode { /* This structure is for "persistent" inode storage */ + struct dentry *dentry; + struct timespec atime; + struct timespec mtime; + struct timespec ctime; + unsigned int ino; /* Inode number as seen in the VFS */ + uid_t uid; + gid_t gid; }; -struct devfs_entry -{ +struct devfs_entry { #ifdef CONFIG_DEVFS_DEBUG - unsigned int magic_number; + unsigned int magic_number; #endif - void *info; - atomic_t refcount; /* When this drops to zero, it's unused */ - union - { - struct directory_type dir; - dev_t dev; - struct symlink_type symlink; - const char *name; /* Only used for (mode == 0) */ - } - u; - struct devfs_entry *prev; /* Previous entry in the parent directory */ - struct devfs_entry *next; /* Next entry in the parent directory */ - struct devfs_entry *parent; /* The parent directory */ - struct devfs_inode inode; - umode_t mode; - unsigned short namelen; /* I think 64k+ filenames are a way off... */ - unsigned char vfs:1;/* Whether the VFS may delete the entry */ - char name[1]; /* This is just a dummy: the allocated array - is bigger. This is NULL-terminated */ + void *info; + atomic_t refcount; /* When this drops to zero, it's unused */ + union { + struct directory_type dir; + dev_t dev; + struct symlink_type symlink; + const char *name; /* Only used for (mode == 0) */ + } u; + struct devfs_entry *prev; /* Previous entry in the parent directory */ + struct devfs_entry *next; /* Next entry in the parent directory */ + struct devfs_entry *parent; /* The parent directory */ + struct devfs_inode inode; + umode_t mode; + unsigned short namelen; /* I think 64k+ filenames are a way off... */ + unsigned char vfs:1; /* Whether the VFS may delete the entry */ + char name[1]; /* This is just a dummy: the allocated array + is bigger. 
This is NULL-terminated */ }; /* The root of the device tree */ static struct devfs_entry *root_entry; -struct devfsd_buf_entry -{ - struct devfs_entry *de; /* The name is generated with this */ - unsigned short type; /* The type of event */ - umode_t mode; - uid_t uid; - gid_t gid; - struct devfsd_buf_entry *next; +struct devfsd_buf_entry { + struct devfs_entry *de; /* The name is generated with this */ + unsigned short type; /* The type of event */ + umode_t mode; + uid_t uid; + gid_t gid; + struct devfsd_buf_entry *next; }; -struct fs_info /* This structure is for the mounted devfs */ -{ - struct super_block *sb; - spinlock_t devfsd_buffer_lock; /* Lock when inserting/deleting events */ - struct devfsd_buf_entry *devfsd_first_event; - struct devfsd_buf_entry *devfsd_last_event; - volatile int devfsd_sleeping; - volatile struct task_struct *devfsd_task; - volatile pid_t devfsd_pgrp; - volatile struct file *devfsd_file; - struct devfsd_notify_struct *devfsd_info; - volatile unsigned long devfsd_event_mask; - atomic_t devfsd_overrun_count; - wait_queue_head_t devfsd_wait_queue; /* Wake devfsd on input */ - wait_queue_head_t revalidate_wait_queue; /* Wake when devfsd sleeps */ +struct fs_info { /* This structure is for the mounted devfs */ + struct super_block *sb; + spinlock_t devfsd_buffer_lock; /* Lock when inserting/deleting events */ + struct devfsd_buf_entry *devfsd_first_event; + struct devfsd_buf_entry *devfsd_last_event; + volatile int devfsd_sleeping; + volatile struct task_struct *devfsd_task; + volatile pid_t devfsd_pgrp; + volatile struct file *devfsd_file; + struct devfsd_notify_struct *devfsd_info; + volatile unsigned long devfsd_event_mask; + atomic_t devfsd_overrun_count; + wait_queue_head_t devfsd_wait_queue; /* Wake devfsd on input */ + wait_queue_head_t revalidate_wait_queue; /* Wake when devfsd sleeps */ }; -static struct fs_info fs_info = {.devfsd_buffer_lock = SPIN_LOCK_UNLOCKED}; +static struct fs_info fs_info = {.devfsd_buffer_lock = 
SPIN_LOCK_UNLOCKED }; static kmem_cache_t *devfsd_buf_cache; #ifdef CONFIG_DEVFS_DEBUG static unsigned int devfs_debug_init __initdata = DEBUG_NONE; @@ -844,7 +836,7 @@ static unsigned int stat_num_entries; static unsigned int stat_num_bytes; #endif static unsigned char poison_array[8] = - {0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a}; + { 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a, 0x5a }; #ifdef CONFIG_DEVFS_MOUNT static unsigned int boot_options = OPTION_MOUNT; @@ -853,75 +845,77 @@ static unsigned int boot_options = OPTION_NONE; #endif /* Forward function declarations */ -static devfs_handle_t _devfs_walk_path (struct devfs_entry *dir, - const char *name, int namelen, - int traverse_symlink); -static ssize_t devfsd_read (struct file *file, char *buf, size_t len, - loff_t *ppos); -static int devfsd_ioctl (struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg); -static int devfsd_close (struct inode *inode, struct file *file); +static devfs_handle_t _devfs_walk_path(struct devfs_entry *dir, + const char *name, int namelen, + int traverse_symlink); +static ssize_t devfsd_read(struct file *file, char *buf, size_t len, + loff_t * ppos); +static int devfsd_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg); +static int devfsd_close(struct inode *inode, struct file *file); #ifdef CONFIG_DEVFS_DEBUG -static ssize_t stat_read (struct file *file, char *buf, size_t len, - loff_t *ppos); -static struct file_operations stat_fops = -{ - .read = stat_read, +static ssize_t stat_read(struct file *file, char *buf, size_t len, + loff_t * ppos); +static struct file_operations stat_fops = { + .read = stat_read, }; #endif - /* Devfs daemon file operations */ -static struct file_operations devfsd_fops = -{ - .read = devfsd_read, - .ioctl = devfsd_ioctl, - .release = devfsd_close, +static struct file_operations devfsd_fops = { + .read = devfsd_read, + .ioctl = devfsd_ioctl, + .release = devfsd_close, }; - /* Support functions 
follow */ - /** * devfs_get - Get a reference to a devfs entry. * @de: The devfs entry. */ -static struct devfs_entry *devfs_get (struct devfs_entry *de) +static struct devfs_entry *devfs_get(struct devfs_entry *de) { - VERIFY_ENTRY (de); - if (de) atomic_inc (&de->refcount); - return de; -} /* End Function devfs_get */ + VERIFY_ENTRY(de); + if (de) + atomic_inc(&de->refcount); + return de; +} /* End Function devfs_get */ /** * devfs_put - Put (release) a reference to a devfs entry. * @de: The handle to the devfs entry. */ -static void devfs_put (devfs_handle_t de) -{ - if (!de) return; - VERIFY_ENTRY (de); - if (de->info == POISON_PTR) OOPS ("(%p): poisoned pointer\n", de); - if ( !atomic_dec_and_test (&de->refcount) ) return; - if (de == root_entry) OOPS ("(%p): root entry being freed\n", de); - DPRINTK (DEBUG_FREE, "(%s): de: %p, parent: %p \"%s\"\n", - de->name, de, de->parent, - de->parent ? de->parent->name : "no parent"); - if ( S_ISLNK (de->mode) ) kfree (de->u.symlink.linkname); - WRITE_ENTRY_MAGIC (de, 0); +static void devfs_put(devfs_handle_t de) +{ + if (!de) + return; + VERIFY_ENTRY(de); + if (de->info == POISON_PTR) + OOPS("(%p): poisoned pointer\n", de); + if (!atomic_dec_and_test(&de->refcount)) + return; + if (de == root_entry) + OOPS("(%p): root entry being freed\n", de); + DPRINTK(DEBUG_FREE, "(%s): de: %p, parent: %p \"%s\"\n", + de->name, de, de->parent, + de->parent ? 
de->parent->name : "no parent"); + if (S_ISLNK(de->mode)) + kfree(de->u.symlink.linkname); + WRITE_ENTRY_MAGIC(de, 0); #ifdef CONFIG_DEVFS_DEBUG - spin_lock (&stat_lock); - --stat_num_entries; - stat_num_bytes -= sizeof *de + de->namelen; - if ( S_ISLNK (de->mode) ) stat_num_bytes -= de->u.symlink.length + 1; - spin_unlock (&stat_lock); + spin_lock(&stat_lock); + --stat_num_entries; + stat_num_bytes -= sizeof *de + de->namelen; + if (S_ISLNK(de->mode)) + stat_num_bytes -= de->u.symlink.length + 1; + spin_unlock(&stat_lock); #endif - de->info = POISON_PTR; - kfree (de); -} /* End Function devfs_put */ + de->info = POISON_PTR; + kfree(de); +} /* End Function devfs_put */ /** * _devfs_search_dir - Search for a devfs entry in a directory. @@ -934,26 +928,25 @@ static void devfs_put (devfs_handle_t de) * An implicit devfs_get() is performed on the returned entry. */ -static struct devfs_entry *_devfs_search_dir (struct devfs_entry *dir, - const char *name, - unsigned int namelen) +static struct devfs_entry *_devfs_search_dir(struct devfs_entry *dir, + const char *name, + unsigned int namelen) { - struct devfs_entry *curr; - - if ( !S_ISDIR (dir->mode) ) - { - PRINTK ("(%s): not a directory\n", dir->name); - return NULL; - } - for (curr = dir->u.dir.first; curr != NULL; curr = curr->next) - { - if (curr->namelen != namelen) continue; - if (memcmp (curr->name, name, namelen) == 0) break; - /* Not found: try the next one */ - } - return devfs_get (curr); -} /* End Function _devfs_search_dir */ + struct devfs_entry *curr; + if (!S_ISDIR(dir->mode)) { + PRINTK("(%s): not a directory\n", dir->name); + return NULL; + } + for (curr = dir->u.dir.first; curr != NULL; curr = curr->next) { + if (curr->namelen != namelen) + continue; + if (memcmp(curr->name, name, namelen) == 0) + break; + /* Not found: try the next one */ + } + return devfs_get(curr); +} /* End Function _devfs_search_dir */ /** * _devfs_alloc_entry - Allocate a devfs entry. 
@@ -965,36 +958,38 @@ static struct devfs_entry *_devfs_search_dir (struct devfs_entry *dir, * %NULL. */ -static struct devfs_entry *_devfs_alloc_entry (const char *name, - unsigned int namelen, - umode_t mode) +static struct devfs_entry *_devfs_alloc_entry(const char *name, + unsigned int namelen, + umode_t mode) { - struct devfs_entry *new; - static unsigned long inode_counter = FIRST_INODE; - static spinlock_t counter_lock = SPIN_LOCK_UNLOCKED; - - if ( name && (namelen < 1) ) namelen = strlen (name); - if ( ( new = kmalloc (sizeof *new + namelen, GFP_KERNEL) ) == NULL ) - return NULL; - memset (new, 0, sizeof *new + namelen); /* Will set '\0' on name */ - new->mode = mode; - if ( S_ISDIR (mode) ) rwlock_init (&new->u.dir.lock); - atomic_set (&new->refcount, 1); - spin_lock (&counter_lock); - new->inode.ino = inode_counter++; - spin_unlock (&counter_lock); - if (name) memcpy (new->name, name, namelen); - new->namelen = namelen; - WRITE_ENTRY_MAGIC (new, MAGIC_VALUE); + struct devfs_entry *new; + static unsigned long inode_counter = FIRST_INODE; + static spinlock_t counter_lock = SPIN_LOCK_UNLOCKED; + + if (name && (namelen < 1)) + namelen = strlen(name); + if ((new = kmalloc(sizeof *new + namelen, GFP_KERNEL)) == NULL) + return NULL; + memset(new, 0, sizeof *new + namelen); /* Will set '\0' on name */ + new->mode = mode; + if (S_ISDIR(mode)) + rwlock_init(&new->u.dir.lock); + atomic_set(&new->refcount, 1); + spin_lock(&counter_lock); + new->inode.ino = inode_counter++; + spin_unlock(&counter_lock); + if (name) + memcpy(new->name, name, namelen); + new->namelen = namelen; + WRITE_ENTRY_MAGIC(new, MAGIC_VALUE); #ifdef CONFIG_DEVFS_DEBUG - spin_lock (&stat_lock); - ++stat_num_entries; - stat_num_bytes += sizeof *new + namelen; - spin_unlock (&stat_lock); + spin_lock(&stat_lock); + ++stat_num_entries; + stat_num_bytes += sizeof *new + namelen; + spin_unlock(&stat_lock); #endif - return new; -} /* End Function _devfs_alloc_entry */ - + return new; +} /* End Function 
_devfs_alloc_entry */ /** * _devfs_append_entry - Append a devfs entry to a directory's child list. @@ -1009,43 +1004,48 @@ static struct devfs_entry *_devfs_alloc_entry (const char *name, * On failure, an implicit devfs_put() is performed on %de. */ -static int _devfs_append_entry (devfs_handle_t dir, devfs_handle_t de, - devfs_handle_t *old_de) +static int _devfs_append_entry(devfs_handle_t dir, devfs_handle_t de, + devfs_handle_t * old_de) { - int retval; - - if (old_de) *old_de = NULL; - if ( !S_ISDIR (dir->mode) ) - { - PRINTK ("(%s): dir: \"%s\" is not a directory\n", de->name, dir->name); - devfs_put (de); - return -ENOTDIR; - } - write_lock (&dir->u.dir.lock); - if (dir->u.dir.no_more_additions) retval = -ENOENT; - else - { - struct devfs_entry *old; - - old = _devfs_search_dir (dir, de->name, de->namelen); - if (old_de) *old_de = old; - else devfs_put (old); - if (old == NULL) - { - de->parent = dir; - de->prev = dir->u.dir.last; - /* Append to the directory's list of children */ - if (dir->u.dir.first == NULL) dir->u.dir.first = de; - else dir->u.dir.last->next = de; - dir->u.dir.last = de; - retval = 0; + int retval; + + if (old_de) + *old_de = NULL; + if (!S_ISDIR(dir->mode)) { + PRINTK("(%s): dir: \"%s\" is not a directory\n", de->name, + dir->name); + devfs_put(de); + return -ENOTDIR; } - else retval = -EEXIST; - } - write_unlock (&dir->u.dir.lock); - if (retval) devfs_put (de); - return retval; -} /* End Function _devfs_append_entry */ + write_lock(&dir->u.dir.lock); + if (dir->u.dir.no_more_additions) + retval = -ENOENT; + else { + struct devfs_entry *old; + + old = _devfs_search_dir(dir, de->name, de->namelen); + if (old_de) + *old_de = old; + else + devfs_put(old); + if (old == NULL) { + de->parent = dir; + de->prev = dir->u.dir.last; + /* Append to the directory's list of children */ + if (dir->u.dir.first == NULL) + dir->u.dir.first = de; + else + dir->u.dir.last->next = de; + dir->u.dir.last = de; + retval = 0; + } else + retval = -EEXIST; + } 
+ write_unlock(&dir->u.dir.lock); + if (retval) + devfs_put(de); + return retval; +} /* End Function _devfs_append_entry */ /** * _devfs_get_root_entry - Get the root devfs entry. @@ -1067,7 +1067,7 @@ static struct devfs_entry *_devfs_get_root_entry(void) return root_entry; new = _devfs_alloc_entry(NULL, 0, MODE_DIR); - if (new == NULL ) + if (new == NULL) return NULL; spin_lock(&root_lock); @@ -1080,7 +1080,7 @@ static struct devfs_entry *_devfs_get_root_entry(void) spin_unlock(&root_lock); return root_entry; -} /* End Function _devfs_get_root_entry */ +} /* End Function _devfs_get_root_entry */ /** * _devfs_descend - Descend down a tree using the next component name. @@ -1096,142 +1096,134 @@ static struct devfs_entry *_devfs_get_root_entry(void) * An implicit devfs_get() is performed on the returned entry. */ -static struct devfs_entry *_devfs_descend (struct devfs_entry *dir, - const char *name, int namelen, - int *next_pos) -{ - const char *stop, *ptr; - struct devfs_entry *entry; - - if ( (namelen >= 3) && (strncmp (name, "../", 3) == 0) ) - { /* Special-case going to parent directory */ - *next_pos = 3; - return devfs_get (dir->parent); - } - stop = name + namelen; - /* Search for a possible '/' */ - for (ptr = name; (ptr < stop) && (*ptr != '/'); ++ptr); - *next_pos = ptr - name; - read_lock (&dir->u.dir.lock); - entry = _devfs_search_dir (dir, name, *next_pos); - read_unlock (&dir->u.dir.lock); - return entry; -} /* End Function _devfs_descend */ - - -static devfs_handle_t _devfs_make_parent_for_leaf (struct devfs_entry *dir, - const char *name, - int namelen, int *leaf_pos) +static struct devfs_entry *_devfs_descend(struct devfs_entry *dir, + const char *name, int namelen, + int *next_pos) { - int next_pos = 0; - - if (dir == NULL) dir = _devfs_get_root_entry (); - if (dir == NULL) return NULL; - devfs_get (dir); - /* Search for possible trailing component and ignore it */ - for (--namelen; (namelen > 0) && (name[namelen] != '/'); --namelen); - *leaf_pos 
= (name[namelen] == '/') ? (namelen + 1) : 0; - for (; namelen > 0; name += next_pos, namelen -= next_pos) - { - struct devfs_entry *de, *old = NULL; - - if ( ( de = _devfs_descend (dir, name, namelen, &next_pos) ) == NULL ) - { - de = _devfs_alloc_entry (name, next_pos, MODE_DIR); - devfs_get (de); - if ( !de || _devfs_append_entry (dir, de, &old) ) - { - devfs_put (de); - if ( !old || !S_ISDIR (old->mode) ) - { - devfs_put (old); - devfs_put (dir); - return NULL; - } - de = old; /* Use the existing directory */ - } + const char *stop, *ptr; + struct devfs_entry *entry; + + if ((namelen >= 3) && (strncmp(name, "../", 3) == 0)) { /* Special-case going to parent directory */ + *next_pos = 3; + return devfs_get(dir->parent); } - if (de == dir->parent) - { - devfs_put (dir); - devfs_put (de); - return NULL; + stop = name + namelen; + /* Search for a possible '/' */ + for (ptr = name; (ptr < stop) && (*ptr != '/'); ++ptr) ; + *next_pos = ptr - name; + read_lock(&dir->u.dir.lock); + entry = _devfs_search_dir(dir, name, *next_pos); + read_unlock(&dir->u.dir.lock); + return entry; +} /* End Function _devfs_descend */ + +static devfs_handle_t _devfs_make_parent_for_leaf(struct devfs_entry *dir, + const char *name, + int namelen, int *leaf_pos) +{ + int next_pos = 0; + + if (dir == NULL) + dir = _devfs_get_root_entry(); + if (dir == NULL) + return NULL; + devfs_get(dir); + /* Search for possible trailing component and ignore it */ + for (--namelen; (namelen > 0) && (name[namelen] != '/'); --namelen) ; + *leaf_pos = (name[namelen] == '/') ? 
(namelen + 1) : 0; + for (; namelen > 0; name += next_pos, namelen -= next_pos) { + struct devfs_entry *de, *old = NULL; + + if ((de = + _devfs_descend(dir, name, namelen, &next_pos)) == NULL) { + de = _devfs_alloc_entry(name, next_pos, MODE_DIR); + devfs_get(de); + if (!de || _devfs_append_entry(dir, de, &old)) { + devfs_put(de); + if (!old || !S_ISDIR(old->mode)) { + devfs_put(old); + devfs_put(dir); + return NULL; + } + de = old; /* Use the existing directory */ + } + } + if (de == dir->parent) { + devfs_put(dir); + devfs_put(de); + return NULL; + } + devfs_put(dir); + dir = de; + if (name[next_pos] == '/') + ++next_pos; } - devfs_put (dir); - dir = de; - if (name[next_pos] == '/') ++next_pos; - } - return dir; -} /* End Function _devfs_make_parent_for_leaf */ - + return dir; +} /* End Function _devfs_make_parent_for_leaf */ -static devfs_handle_t _devfs_prepare_leaf (devfs_handle_t *dir, - const char *name, umode_t mode) +static devfs_handle_t _devfs_prepare_leaf(devfs_handle_t * dir, + const char *name, umode_t mode) { - int namelen, leaf_pos; - struct devfs_entry *de; - - namelen = strlen (name); - if ( ( *dir = _devfs_make_parent_for_leaf (*dir, name, namelen, - &leaf_pos) ) == NULL ) - { - PRINTK ("(%s): could not create parent path\n", name); - return NULL; - } - if ( ( de = _devfs_alloc_entry (name + leaf_pos, namelen - leaf_pos,mode) ) - == NULL ) - { - PRINTK ("(%s): could not allocate entry\n", name); - devfs_put (*dir); - return NULL; - } - return de; -} /* End Function _devfs_prepare_leaf */ - - -static devfs_handle_t _devfs_walk_path (struct devfs_entry *dir, - const char *name, int namelen, - int traverse_symlink) -{ - int next_pos = 0; - - if (dir == NULL) dir = _devfs_get_root_entry (); - if (dir == NULL) return NULL; - devfs_get (dir); - for (; namelen > 0; name += next_pos, namelen -= next_pos) - { - struct devfs_entry *de, *link; - - if (!S_ISDIR (dir->mode)) - { - devfs_put (dir); - return NULL; - } + int namelen, leaf_pos; + struct 
devfs_entry *de; - if ( ( de = _devfs_descend (dir, name, namelen, &next_pos) ) == NULL ) - { - devfs_put (dir); - return NULL; + namelen = strlen(name); + if ((*dir = _devfs_make_parent_for_leaf(*dir, name, namelen, + &leaf_pos)) == NULL) { + PRINTK("(%s): could not create parent path\n", name); + return NULL; } - if (S_ISLNK (de->mode) && traverse_symlink) - { /* Need to follow the link: this is a stack chomper */ - /* FIXME what if it puts outside of mounted tree? */ - link = _devfs_walk_path (dir, de->u.symlink.linkname, - de->u.symlink.length, TRUE); - devfs_put (de); - if (!link) - { - devfs_put (dir); + if ((de = _devfs_alloc_entry(name + leaf_pos, namelen - leaf_pos, mode)) + == NULL) { + PRINTK("(%s): could not allocate entry\n", name); + devfs_put(*dir); return NULL; - } - de = link; } - devfs_put (dir); - dir = de; - if (name[next_pos] == '/') ++next_pos; - } - return dir; -} /* End Function _devfs_walk_path */ + return de; +} /* End Function _devfs_prepare_leaf */ + +static devfs_handle_t _devfs_walk_path(struct devfs_entry *dir, + const char *name, int namelen, + int traverse_symlink) +{ + int next_pos = 0; + + if (dir == NULL) + dir = _devfs_get_root_entry(); + if (dir == NULL) + return NULL; + devfs_get(dir); + for (; namelen > 0; name += next_pos, namelen -= next_pos) { + struct devfs_entry *de, *link; + + if (!S_ISDIR(dir->mode)) { + devfs_put(dir); + return NULL; + } + + if ((de = + _devfs_descend(dir, name, namelen, &next_pos)) == NULL) { + devfs_put(dir); + return NULL; + } + if (S_ISLNK(de->mode) && traverse_symlink) { /* Need to follow the link: this is a stack chomper */ + /* FIXME what if it puts outside of mounted tree? 
*/ + link = _devfs_walk_path(dir, de->u.symlink.linkname, + de->u.symlink.length, TRUE); + devfs_put(de); + if (!link) { + devfs_put(dir); + return NULL; + } + de = link; + } + devfs_put(dir); + dir = de; + if (name[next_pos] == '/') + ++next_pos; + } + return dir; +} /* End Function _devfs_walk_path */ /** * _devfs_find_entry - Find a devfs entry. @@ -1244,40 +1236,37 @@ static devfs_handle_t _devfs_walk_path (struct devfs_entry *dir, * devfs_get() is performed. */ -static struct devfs_entry *_devfs_find_entry (devfs_handle_t dir, - const char *name, - int traverse_symlink) +static struct devfs_entry *_devfs_find_entry(devfs_handle_t dir, + const char *name, + int traverse_symlink) { - unsigned int namelen = strlen (name); - - if (name[0] == '/') - { - /* Skip leading pathname component */ - if (namelen < 2) - { - PRINTK ("(%s): too short\n", name); - return NULL; - } - for (++name, --namelen; (*name != '/') && (namelen > 0); - ++name, --namelen); - if (namelen < 2) - { - PRINTK ("(%s): too short\n", name); - return NULL; + unsigned int namelen = strlen(name); + + if (name[0] == '/') { + /* Skip leading pathname component */ + if (namelen < 2) { + PRINTK("(%s): too short\n", name); + return NULL; + } + for (++name, --namelen; (*name != '/') && (namelen > 0); + ++name, --namelen) ; + if (namelen < 2) { + PRINTK("(%s): too short\n", name); + return NULL; + } + ++name; + --namelen; } - ++name; - --namelen; - } - return _devfs_walk_path (dir, name, namelen, traverse_symlink); -} /* End Function _devfs_find_entry */ + return _devfs_walk_path(dir, name, namelen, traverse_symlink); +} /* End Function _devfs_find_entry */ -static struct devfs_entry *get_devfs_entry_from_vfs_inode (struct inode *inode) +static struct devfs_entry *get_devfs_entry_from_vfs_inode(struct inode *inode) { - if (inode == NULL) return NULL; - VERIFY_ENTRY ( (struct devfs_entry *) inode->u.generic_ip ); - return inode->u.generic_ip; -} /* End Function get_devfs_entry_from_vfs_inode */ - + if (inode 
== NULL) + return NULL; + VERIFY_ENTRY((struct devfs_entry *)inode->u.generic_ip); + return inode->u.generic_ip; +} /* End Function get_devfs_entry_from_vfs_inode */ /** * free_dentry - Free the dentry for a device entry and invalidate inode. @@ -1287,20 +1276,21 @@ static struct devfs_entry *get_devfs_entry_from_vfs_inode (struct inode *inode) * parent directory. */ -static void free_dentry (struct devfs_entry *de) +static void free_dentry(struct devfs_entry *de) { - struct dentry *dentry = de->inode.dentry; - - if (!dentry) return; - spin_lock (&dcache_lock); - dget_locked (dentry); - spin_unlock (&dcache_lock); - /* Forcefully remove the inode */ - if (dentry->d_inode != NULL) dentry->d_inode->i_nlink = 0; - d_drop (dentry); - dput (dentry); -} /* End Function free_dentry */ + struct dentry *dentry = de->inode.dentry; + if (!dentry) + return; + spin_lock(&dcache_lock); + dget_locked(dentry); + spin_unlock(&dcache_lock); + /* Forcefully remove the inode */ + if (dentry->d_inode != NULL) + dentry->d_inode->i_nlink = 0; + d_drop(dentry); + dput(dentry); +} /* End Function free_dentry */ /** * is_devfsd_or_child - Test if the current process is devfsd or one of its children. @@ -1309,25 +1299,24 @@ static void free_dentry (struct devfs_entry *de) * Returns %TRUE if devfsd or child, else %FALSE. 
*/ -static int is_devfsd_or_child (struct fs_info *fs_info) +static int is_devfsd_or_child(struct fs_info *fs_info) { - struct task_struct *p = current; + struct task_struct *p = current; - if (p == fs_info->devfsd_task) return (TRUE); - if (process_group(p) == fs_info->devfsd_pgrp) return (TRUE); - read_lock(&tasklist_lock); - for ( ; p != &init_task; p = p->real_parent) - { if (p == fs_info->devfsd_task) - { - read_unlock (&tasklist_lock); - return (TRUE); + return (TRUE); + if (process_group(p) == fs_info->devfsd_pgrp) + return (TRUE); + read_lock(&tasklist_lock); + for (; p != &init_task; p = p->real_parent) { + if (p == fs_info->devfsd_task) { + read_unlock(&tasklist_lock); + return (TRUE); + } } - } - read_unlock (&tasklist_lock); - return (FALSE); -} /* End Function is_devfsd_or_child */ - + read_unlock(&tasklist_lock); + return (FALSE); +} /* End Function is_devfsd_or_child */ /** * devfsd_queue_empty - Test if devfsd has work pending in its event queue. @@ -1336,11 +1325,10 @@ static int is_devfsd_or_child (struct fs_info *fs_info) * Returns %TRUE if the queue is empty, else %FALSE. */ -static inline int devfsd_queue_empty (struct fs_info *fs_info) +static inline int devfsd_queue_empty(struct fs_info *fs_info) { - return (fs_info->devfsd_last_event) ? FALSE : TRUE; -} /* End Function devfsd_queue_empty */ - + return (fs_info->devfsd_last_event) ? FALSE : TRUE; +} /* End Function devfsd_queue_empty */ /** * wait_for_devfsd_finished - Wait for devfsd to finish processing its event queue. @@ -1349,22 +1337,25 @@ static inline int devfsd_queue_empty (struct fs_info *fs_info) * Returns %TRUE if no more waiting will be required, else %FALSE. 
*/ -static int wait_for_devfsd_finished (struct fs_info *fs_info) +static int wait_for_devfsd_finished(struct fs_info *fs_info) { - DECLARE_WAITQUEUE (wait, current); - - if (fs_info->devfsd_task == NULL) return (TRUE); - if (devfsd_queue_empty (fs_info) && fs_info->devfsd_sleeping) return TRUE; - if ( is_devfsd_or_child (fs_info) ) return (FALSE); - set_current_state (TASK_UNINTERRUPTIBLE); - add_wait_queue (&fs_info->revalidate_wait_queue, &wait); - if (!devfsd_queue_empty (fs_info) || !fs_info->devfsd_sleeping) - if (fs_info->devfsd_task) schedule (); - remove_wait_queue (&fs_info->revalidate_wait_queue, &wait); - __set_current_state (TASK_RUNNING); - return (TRUE); -} /* End Function wait_for_devfsd_finished */ + DECLARE_WAITQUEUE(wait, current); + if (fs_info->devfsd_task == NULL) + return (TRUE); + if (devfsd_queue_empty(fs_info) && fs_info->devfsd_sleeping) + return TRUE; + if (is_devfsd_or_child(fs_info)) + return (FALSE); + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&fs_info->revalidate_wait_queue, &wait); + if (!devfsd_queue_empty(fs_info) || !fs_info->devfsd_sleeping) + if (fs_info->devfsd_task) + schedule(); + remove_wait_queue(&fs_info->revalidate_wait_queue, &wait); + __set_current_state(TASK_RUNNING); + return (TRUE); +} /* End Function wait_for_devfsd_finished */ /** * devfsd_notify_de - Notify the devfsd daemon of a change. @@ -1379,35 +1370,37 @@ static int wait_for_devfsd_finished (struct fs_info *fs_info) * Returns %TRUE if an event was queued and devfsd woken up, else %FALSE. 
*/ -static int devfsd_notify_de (struct devfs_entry *de, - unsigned short type, umode_t mode, - uid_t uid, gid_t gid, struct fs_info *fs_info) +static int devfsd_notify_de(struct devfs_entry *de, + unsigned short type, umode_t mode, + uid_t uid, gid_t gid, struct fs_info *fs_info) { - struct devfsd_buf_entry *entry; - struct devfs_entry *curr; - - if ( !( fs_info->devfsd_event_mask & (1 << type) ) ) return (FALSE); - if ( ( entry = kmem_cache_alloc (devfsd_buf_cache, SLAB_KERNEL) ) == NULL ) - { - atomic_inc (&fs_info->devfsd_overrun_count); - return (FALSE); - } - for (curr = de; curr != NULL; curr = curr->parent) devfs_get (curr); - entry->de = de; - entry->type = type; - entry->mode = mode; - entry->uid = uid; - entry->gid = gid; - entry->next = NULL; - spin_lock (&fs_info->devfsd_buffer_lock); - if (!fs_info->devfsd_first_event) fs_info->devfsd_first_event = entry; - if (fs_info->devfsd_last_event) fs_info->devfsd_last_event->next = entry; - fs_info->devfsd_last_event = entry; - spin_unlock (&fs_info->devfsd_buffer_lock); - wake_up_interruptible (&fs_info->devfsd_wait_queue); - return (TRUE); -} /* End Function devfsd_notify_de */ + struct devfsd_buf_entry *entry; + struct devfs_entry *curr; + if (!(fs_info->devfsd_event_mask & (1 << type))) + return (FALSE); + if ((entry = kmem_cache_alloc(devfsd_buf_cache, SLAB_KERNEL)) == NULL) { + atomic_inc(&fs_info->devfsd_overrun_count); + return (FALSE); + } + for (curr = de; curr != NULL; curr = curr->parent) + devfs_get(curr); + entry->de = de; + entry->type = type; + entry->mode = mode; + entry->uid = uid; + entry->gid = gid; + entry->next = NULL; + spin_lock(&fs_info->devfsd_buffer_lock); + if (!fs_info->devfsd_first_event) + fs_info->devfsd_first_event = entry; + if (fs_info->devfsd_last_event) + fs_info->devfsd_last_event->next = entry; + fs_info->devfsd_last_event = entry; + spin_unlock(&fs_info->devfsd_buffer_lock); + wake_up_interruptible(&fs_info->devfsd_wait_queue); + return (TRUE); +} /* End Function 
devfsd_notify_de */ /** * devfsd_notify - Notify the devfsd daemon of a change. @@ -1417,11 +1410,11 @@ static int devfsd_notify_de (struct devfs_entry *de, * the event. */ -static void devfsd_notify (struct devfs_entry *de,unsigned short type) +static void devfsd_notify(struct devfs_entry *de, unsigned short type) { devfsd_notify_de(de, type, de->mode, current->euid, current->egid, &fs_info); -} +} static int devfs_mk_dev(dev_t dev, umode_t mode, const char *fmt, va_list args) { @@ -1432,15 +1425,15 @@ static int devfs_mk_dev(dev_t dev, umode_t mode, const char *fmt, va_list args) n = vsnprintf(buf, sizeof(buf), fmt, args); if (n >= sizeof(buf) || !buf[0]) { printk(KERN_WARNING "%s: invalid format string %s\n", - __FUNCTION__, fmt); + __FUNCTION__, fmt); return -EINVAL; } - + de = _devfs_prepare_leaf(&dir, buf, mode); if (!de) { printk(KERN_WARNING "%s: could not prepare leaf for %s\n", - __FUNCTION__, buf); - return -ENOMEM; /* could be more accurate... */ + __FUNCTION__, buf); + return -ENOMEM; /* could be more accurate... */ } de->u.dev = dev; @@ -1448,12 +1441,12 @@ static int devfs_mk_dev(dev_t dev, umode_t mode, const char *fmt, va_list args) error = _devfs_append_entry(dir, de, NULL); if (error) { printk(KERN_WARNING "%s: could not append to parent for %s\n", - __FUNCTION__, buf); + __FUNCTION__, buf); goto out; } devfsd_notify(de, DEVFSD_NOTIFY_REGISTERED); - out: + out: devfs_put(dir); return error; } @@ -1464,7 +1457,7 @@ int devfs_mk_bdev(dev_t dev, umode_t mode, const char *fmt, ...) if (!S_ISBLK(mode)) { printk(KERN_WARNING "%s: invalide mode (%u) for %s\n", - __FUNCTION__, mode, fmt); + __FUNCTION__, mode, fmt); return -EINVAL; } @@ -1474,14 +1467,13 @@ int devfs_mk_bdev(dev_t dev, umode_t mode, const char *fmt, ...) EXPORT_SYMBOL(devfs_mk_bdev); - int devfs_mk_cdev(dev_t dev, umode_t mode, const char *fmt, ...) 
{ va_list args; if (!S_ISCHR(mode)) { printk(KERN_WARNING "%s: invalide mode (%u) for %s\n", - __FUNCTION__, mode, fmt); + __FUNCTION__, mode, fmt); return -EINVAL; } @@ -1491,7 +1483,6 @@ int devfs_mk_cdev(dev_t dev, umode_t mode, const char *fmt, ...) EXPORT_SYMBOL(devfs_mk_cdev); - /** * _devfs_unhook - Unhook a device entry from its parents list * @de: The entry to unhook. @@ -1501,21 +1492,25 @@ EXPORT_SYMBOL(devfs_mk_cdev); * The caller must have a write lock on the parent directory. */ -static int _devfs_unhook (struct devfs_entry *de) -{ - struct devfs_entry *parent; - - if ( !de || (de->prev == de) ) return FALSE; - parent = de->parent; - if (de->prev == NULL) parent->u.dir.first = de->next; - else de->prev->next = de->next; - if (de->next == NULL) parent->u.dir.last = de->prev; - else de->next->prev = de->prev; - de->prev = de; /* Indicate we're unhooked */ - de->next = NULL; /* Force early termination for */ - return TRUE; -} /* End Function _devfs_unhook */ - +static int _devfs_unhook(struct devfs_entry *de) +{ + struct devfs_entry *parent; + + if (!de || (de->prev == de)) + return FALSE; + parent = de->parent; + if (de->prev == NULL) + parent->u.dir.first = de->next; + else + de->prev->next = de->next; + if (de->next == NULL) + parent->u.dir.last = de->prev; + else + de->next->prev = de->prev; + de->prev = de; /* Indicate we're unhooked */ + de->next = NULL; /* Force early termination for */ + return TRUE; +} /* End Function _devfs_unhook */ /** * _devfs_unregister - Unregister a device entry from its parent. @@ -1526,83 +1521,83 @@ static int _devfs_unhook (struct devfs_entry *de) * unlocked by this function. 
*/ -static void _devfs_unregister (struct devfs_entry *dir, struct devfs_entry *de) +static void _devfs_unregister(struct devfs_entry *dir, struct devfs_entry *de) { - int unhooked = _devfs_unhook (de); - - write_unlock (&dir->u.dir.lock); - if (!unhooked) return; - devfs_get (dir); - devfsd_notify (de, DEVFSD_NOTIFY_UNREGISTERED); - free_dentry (de); - devfs_put (dir); - if ( !S_ISDIR (de->mode) ) return; - while (TRUE) /* Recursively unregister: this is a stack chomper */ - { - struct devfs_entry *child; - - write_lock (&de->u.dir.lock); - de->u.dir.no_more_additions = TRUE; - child = de->u.dir.first; - VERIFY_ENTRY (child); - _devfs_unregister (de, child); - if (!child) break; - DPRINTK (DEBUG_UNREGISTER, "(%s): child: %p refcount: %d\n", - child->name, child, atomic_read (&child->refcount) ); - devfs_put (child); - } -} /* End Function _devfs_unregister */ - -static int devfs_do_symlink (devfs_handle_t dir, const char *name, - const char *link, devfs_handle_t *handle) + int unhooked = _devfs_unhook(de); + + write_unlock(&dir->u.dir.lock); + if (!unhooked) + return; + devfs_get(dir); + devfsd_notify(de, DEVFSD_NOTIFY_UNREGISTERED); + free_dentry(de); + devfs_put(dir); + if (!S_ISDIR(de->mode)) + return; + while (TRUE) { /* Recursively unregister: this is a stack chomper */ + struct devfs_entry *child; + + write_lock(&de->u.dir.lock); + de->u.dir.no_more_additions = TRUE; + child = de->u.dir.first; + VERIFY_ENTRY(child); + _devfs_unregister(de, child); + if (!child) + break; + DPRINTK(DEBUG_UNREGISTER, "(%s): child: %p refcount: %d\n", + child->name, child, atomic_read(&child->refcount)); + devfs_put(child); + } +} /* End Function _devfs_unregister */ + +static int devfs_do_symlink(devfs_handle_t dir, const char *name, + const char *link, devfs_handle_t * handle) { - int err; - unsigned int linklength; - char *newlink; - struct devfs_entry *de; - - if (handle != NULL) *handle = NULL; - if (name == NULL) - { - PRINTK ("(): NULL name pointer\n"); - return -EINVAL; 
- } - if (link == NULL) - { - PRINTK ("(%s): NULL link pointer\n", name); - return -EINVAL; - } - linklength = strlen (link); - if ( ( newlink = kmalloc (linklength + 1, GFP_KERNEL) ) == NULL ) - return -ENOMEM; - memcpy (newlink, link, linklength); - newlink[linklength] = '\0'; - if ( ( de = _devfs_prepare_leaf (&dir, name, S_IFLNK | S_IRUGO | S_IXUGO) ) - == NULL ) - { - PRINTK ("(%s): could not prepare leaf\n", name); - kfree (newlink); - return -ENOTDIR; - } - de->info = NULL; - de->u.symlink.linkname = newlink; - de->u.symlink.length = linklength; - if ( ( err = _devfs_append_entry (dir, de, NULL) ) != 0 ) - { - PRINTK ("(%s): could not append to parent, err: %d\n", name, err); - devfs_put (dir); - return err; - } - devfs_put (dir); + int err; + unsigned int linklength; + char *newlink; + struct devfs_entry *de; + + if (handle != NULL) + *handle = NULL; + if (name == NULL) { + PRINTK("(): NULL name pointer\n"); + return -EINVAL; + } + if (link == NULL) { + PRINTK("(%s): NULL link pointer\n", name); + return -EINVAL; + } + linklength = strlen(link); + if ((newlink = kmalloc(linklength + 1, GFP_KERNEL)) == NULL) + return -ENOMEM; + memcpy(newlink, link, linklength); + newlink[linklength] = '\0'; + if ((de = _devfs_prepare_leaf(&dir, name, S_IFLNK | S_IRUGO | S_IXUGO)) + == NULL) { + PRINTK("(%s): could not prepare leaf\n", name); + kfree(newlink); + return -ENOTDIR; + } + de->info = NULL; + de->u.symlink.linkname = newlink; + de->u.symlink.length = linklength; + if ((err = _devfs_append_entry(dir, de, NULL)) != 0) { + PRINTK("(%s): could not append to parent, err: %d\n", name, + err); + devfs_put(dir); + return err; + } + devfs_put(dir); #ifdef CONFIG_DEVFS_DEBUG - spin_lock (&stat_lock); - stat_num_bytes += linklength + 1; - spin_unlock (&stat_lock); + spin_lock(&stat_lock); + stat_num_bytes += linklength + 1; + spin_unlock(&stat_lock); #endif - if (handle != NULL) *handle = de; - return 0; -} /* End Function devfs_do_symlink */ - + if (handle != NULL) + 
*handle = de; + return 0; +} /* End Function devfs_do_symlink */ /** * devfs_mk_symlink Create a symbolic link in the devfs namespace. @@ -1626,7 +1621,6 @@ int devfs_mk_symlink(const char *from, const char *to) return err; } - /** * devfs_mk_dir - Create a directory in the devfs namespace. * new name is relative to the root of the devfs. @@ -1668,19 +1662,18 @@ int devfs_mk_dir(const char *fmt, ...) goto out_put; } else if (error) { PRINTK("(%s): could not append to dir: %p \"%s\"\n", - buf, dir, dir->name); + buf, dir, dir->name); devfs_put(old); goto out_put; } - + devfsd_notify(de, DEVFSD_NOTIFY_REGISTERED); - out_put: + out_put: devfs_put(dir); return error; } - void devfs_remove(const char *fmt, ...) { char buf[64]; @@ -1706,7 +1699,6 @@ void devfs_remove(const char *fmt, ...) } } - /** * devfs_generate_path - Generate a pathname for an entry, relative to the devfs root. * @de: The devfs entry. @@ -1718,90 +1710,93 @@ void devfs_remove(const char *fmt, ...) * else a negative error code. */ -static int devfs_generate_path (devfs_handle_t de, char *path, int buflen) +static int devfs_generate_path(devfs_handle_t de, char *path, int buflen) { - int pos; + int pos; #define NAMEOF(de) ( (de)->mode ? 
(de)->name : (de)->u.name ) - if (de == NULL) return -EINVAL; - VERIFY_ENTRY (de); - if (de->namelen >= buflen) return -ENAMETOOLONG; /* Must be first */ - path[buflen - 1] = '\0'; - if (de->parent == NULL) return buflen - 1; /* Don't prepend root */ - pos = buflen - de->namelen - 1; - memcpy (path + pos, NAMEOF (de), de->namelen); - for (de = de->parent; de->parent != NULL; de = de->parent) - { - if (pos - de->namelen - 1 < 0) return -ENAMETOOLONG; - path[--pos] = '/'; - pos -= de->namelen; - memcpy (path + pos, NAMEOF (de), de->namelen); - } - return pos; -} /* End Function devfs_generate_path */ + if (de == NULL) + return -EINVAL; + VERIFY_ENTRY(de); + if (de->namelen >= buflen) + return -ENAMETOOLONG; /* Must be first */ + path[buflen - 1] = '\0'; + if (de->parent == NULL) + return buflen - 1; /* Don't prepend root */ + pos = buflen - de->namelen - 1; + memcpy(path + pos, NAMEOF(de), de->namelen); + for (de = de->parent; de->parent != NULL; de = de->parent) { + if (pos - de->namelen - 1 < 0) + return -ENAMETOOLONG; + path[--pos] = '/'; + pos -= de->namelen; + memcpy(path + pos, NAMEOF(de), de->namelen); + } + return pos; +} /* End Function devfs_generate_path */ /** * devfs_setup - Process kernel boot options. * @str: The boot options after the "devfs=". 
*/ -static int __init devfs_setup (char *str) +static int __init devfs_setup(char *str) { - static struct - { - char *name; - unsigned int mask; - unsigned int *opt; - } devfs_options_tab[] __initdata = - { + static struct { + char *name; + unsigned int mask; + unsigned int *opt; + } devfs_options_tab[] __initdata = { #ifdef CONFIG_DEVFS_DEBUG - {"dall", DEBUG_ALL, &devfs_debug_init}, - {"dmod", DEBUG_MODULE_LOAD, &devfs_debug_init}, - {"dreg", DEBUG_REGISTER, &devfs_debug_init}, - {"dunreg", DEBUG_UNREGISTER, &devfs_debug_init}, - {"dfree", DEBUG_FREE, &devfs_debug_init}, - {"diget", DEBUG_I_GET, &devfs_debug_init}, - {"dchange", DEBUG_SET_FLAGS, &devfs_debug_init}, - {"dsread", DEBUG_S_READ, &devfs_debug_init}, - {"dichange", DEBUG_I_CHANGE, &devfs_debug_init}, - {"dimknod", DEBUG_I_MKNOD, &devfs_debug_init}, - {"dilookup", DEBUG_I_LOOKUP, &devfs_debug_init}, - {"diunlink", DEBUG_I_UNLINK, &devfs_debug_init}, -#endif /* CONFIG_DEVFS_DEBUG */ - {"mount", OPTION_MOUNT, &boot_options}, - {NULL, 0, NULL} - }; - - while ( (*str != '\0') && !isspace (*str) ) - { - int i, found = 0, invert = 0; - - if (strncmp (str, "no", 2) == 0) - { - invert = 1; - str += 2; - } - for (i = 0; devfs_options_tab[i].name != NULL; i++) - { - int len = strlen (devfs_options_tab[i].name); - - if (strncmp (str, devfs_options_tab[i].name, len) == 0) - { - if (invert) - *devfs_options_tab[i].opt &= ~devfs_options_tab[i].mask; - else - *devfs_options_tab[i].opt |= devfs_options_tab[i].mask; - str += len; - found = 1; - break; - } + { + "dall", DEBUG_ALL, &devfs_debug_init}, { + "dmod", DEBUG_MODULE_LOAD, &devfs_debug_init}, { + "dreg", DEBUG_REGISTER, &devfs_debug_init}, { + "dunreg", DEBUG_UNREGISTER, &devfs_debug_init}, { + "dfree", DEBUG_FREE, &devfs_debug_init}, { + "diget", DEBUG_I_GET, &devfs_debug_init}, { + "dchange", DEBUG_SET_FLAGS, &devfs_debug_init}, { + "dsread", DEBUG_S_READ, &devfs_debug_init}, { + "dichange", DEBUG_I_CHANGE, &devfs_debug_init}, { + "dimknod", DEBUG_I_MKNOD, 
&devfs_debug_init}, { + "dilookup", DEBUG_I_LOOKUP, &devfs_debug_init}, { + "diunlink", DEBUG_I_UNLINK, &devfs_debug_init}, +#endif /* CONFIG_DEVFS_DEBUG */ + { + "mount", OPTION_MOUNT, &boot_options}, { + NULL, 0, NULL} + }; + + while ((*str != '\0') && !isspace(*str)) { + int i, found = 0, invert = 0; + + if (strncmp(str, "no", 2) == 0) { + invert = 1; + str += 2; + } + for (i = 0; devfs_options_tab[i].name != NULL; i++) { + int len = strlen(devfs_options_tab[i].name); + + if (strncmp(str, devfs_options_tab[i].name, len) == 0) { + if (invert) + *devfs_options_tab[i].opt &= + ~devfs_options_tab[i].mask; + else + *devfs_options_tab[i].opt |= + devfs_options_tab[i].mask; + str += len; + found = 1; + break; + } + } + if (!found) + return 0; /* No match */ + if (*str != ',') + return 0; /* No more options */ + ++str; } - if (!found) return 0; /* No match */ - if (*str != ',') return 0; /* No more options */ - ++str; - } - return 1; -} /* End Function devfs_setup */ + return 1; +} /* End Function devfs_setup */ __setup("devfs=", devfs_setup); @@ -1809,7 +1804,6 @@ EXPORT_SYMBOL(devfs_mk_symlink); EXPORT_SYMBOL(devfs_mk_dir); EXPORT_SYMBOL(devfs_remove); - /** * try_modload - Notify devfsd of an inode lookup by a non-devfsd process. * @parent: The parent devfs entry. @@ -1822,26 +1816,26 @@ EXPORT_SYMBOL(devfs_remove); * Returns 0 on success (event was queued), else a negative error code. 
*/ -static int try_modload (struct devfs_entry *parent, struct fs_info *fs_info, - const char *name, unsigned namelen, - struct devfs_entry *buf) -{ - if ( !( fs_info->devfsd_event_mask & (1 << DEVFSD_NOTIFY_LOOKUP) ) ) - return -ENOENT; - if ( is_devfsd_or_child (fs_info) ) return -ENOENT; - memset (buf, 0, sizeof *buf); - atomic_set (&buf->refcount, 1); - buf->parent = parent; - buf->namelen = namelen; - buf->u.name = name; - WRITE_ENTRY_MAGIC (buf, MAGIC_VALUE); - if ( !devfsd_notify_de (buf, DEVFSD_NOTIFY_LOOKUP, 0, - current->euid, current->egid, fs_info) ) - return -ENOENT; - /* Possible success: event has been queued */ - return 0; -} /* End Function try_modload */ - +static int try_modload(struct devfs_entry *parent, struct fs_info *fs_info, + const char *name, unsigned namelen, + struct devfs_entry *buf) +{ + if (!(fs_info->devfsd_event_mask & (1 << DEVFSD_NOTIFY_LOOKUP))) + return -ENOENT; + if (is_devfsd_or_child(fs_info)) + return -ENOENT; + memset(buf, 0, sizeof *buf); + atomic_set(&buf->refcount, 1); + buf->parent = parent; + buf->namelen = namelen; + buf->u.name = name; + WRITE_ENTRY_MAGIC(buf, MAGIC_VALUE); + if (!devfsd_notify_de(buf, DEVFSD_NOTIFY_LOOKUP, 0, + current->euid, current->egid, fs_info)) + return -ENOENT; + /* Possible success: event has been queued */ + return 0; +} /* End Function try_modload */ /* Superblock operations follow */ @@ -1851,44 +1845,45 @@ static struct file_operations devfs_fops; static struct file_operations devfs_dir_fops; static struct inode_operations devfs_symlink_iops; -static int devfs_notify_change (struct dentry *dentry, struct iattr *iattr) +static int devfs_notify_change(struct dentry *dentry, struct iattr *iattr) { - int retval; - struct devfs_entry *de; - struct inode *inode = dentry->d_inode; - struct fs_info *fs_info = inode->i_sb->s_fs_info; - - de = get_devfs_entry_from_vfs_inode (inode); - if (de == NULL) return -ENODEV; - retval = inode_change_ok (inode, iattr); - if (retval != 0) return retval; - 
retval = inode_setattr (inode, iattr); - if (retval != 0) return retval; - DPRINTK (DEBUG_I_CHANGE, "(%d): VFS inode: %p devfs_entry: %p\n", - (int) inode->i_ino, inode, de); - DPRINTK (DEBUG_I_CHANGE, "(): mode: 0%o uid: %d gid: %d\n", - (int) inode->i_mode, (int) inode->i_uid, (int) inode->i_gid); - /* Inode is not on hash chains, thus must save permissions here rather - than in a write_inode() method */ - de->mode = inode->i_mode; - de->inode.uid = inode->i_uid; - de->inode.gid = inode->i_gid; - de->inode.atime = inode->i_atime; - de->inode.mtime = inode->i_mtime; - de->inode.ctime = inode->i_ctime; - if ( ( iattr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID) ) && - !is_devfsd_or_child (fs_info) ) - devfsd_notify_de (de, DEVFSD_NOTIFY_CHANGE, inode->i_mode, - inode->i_uid, inode->i_gid, fs_info); - return 0; -} /* End Function devfs_notify_change */ - -static struct super_operations devfs_sops = -{ - .drop_inode = generic_delete_inode, - .statfs = simple_statfs, -}; + int retval; + struct devfs_entry *de; + struct inode *inode = dentry->d_inode; + struct fs_info *fs_info = inode->i_sb->s_fs_info; + de = get_devfs_entry_from_vfs_inode(inode); + if (de == NULL) + return -ENODEV; + retval = inode_change_ok(inode, iattr); + if (retval != 0) + return retval; + retval = inode_setattr(inode, iattr); + if (retval != 0) + return retval; + DPRINTK(DEBUG_I_CHANGE, "(%d): VFS inode: %p devfs_entry: %p\n", + (int)inode->i_ino, inode, de); + DPRINTK(DEBUG_I_CHANGE, "(): mode: 0%o uid: %d gid: %d\n", + (int)inode->i_mode, (int)inode->i_uid, (int)inode->i_gid); + /* Inode is not on hash chains, thus must save permissions here rather + than in a write_inode() method */ + de->mode = inode->i_mode; + de->inode.uid = inode->i_uid; + de->inode.gid = inode->i_gid; + de->inode.atime = inode->i_atime; + de->inode.mtime = inode->i_mtime; + de->inode.ctime = inode->i_ctime; + if ((iattr->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) && + !is_devfsd_or_child(fs_info)) + 
devfsd_notify_de(de, DEVFSD_NOTIFY_CHANGE, inode->i_mode, + inode->i_uid, inode->i_gid, fs_info); + return 0; +} /* End Function devfs_notify_change */ + +static struct super_operations devfs_sops = { + .drop_inode = generic_delete_inode, + .statfs = simple_statfs, +}; /** * _devfs_get_vfs_inode - Get a VFS inode. @@ -1900,39 +1895,38 @@ static struct super_operations devfs_sops = * performed if the inode is created. */ -static struct inode *_devfs_get_vfs_inode (struct super_block *sb, - struct devfs_entry *de, - struct dentry *dentry) +static struct inode *_devfs_get_vfs_inode(struct super_block *sb, + struct devfs_entry *de, + struct dentry *dentry) { - struct inode *inode; - - if (de->prev == de) return NULL; /* Quick check to see if unhooked */ - if ( ( inode = new_inode (sb) ) == NULL ) - { - PRINTK ("(%s): new_inode() failed, de: %p\n", de->name, de); - return NULL; - } - if (de->parent) - { - read_lock (&de->parent->u.dir.lock); - if (de->prev != de) de->inode.dentry = dentry; /* Not unhooked */ - read_unlock (&de->parent->u.dir.lock); - } - else de->inode.dentry = dentry; /* Root: no locking needed */ - if (de->inode.dentry != dentry) - { /* Must have been unhooked */ - iput (inode); - return NULL; - } - /* FIXME where is devfs_put? 
*/ - inode->u.generic_ip = devfs_get (de); - inode->i_ino = de->inode.ino; - DPRINTK (DEBUG_I_GET, "(%d): VFS inode: %p devfs_entry: %p\n", - (int) inode->i_ino, inode, de); - inode->i_blocks = 0; - inode->i_blksize = FAKE_BLOCK_SIZE; - inode->i_op = &devfs_iops; - inode->i_mode = de->mode; + struct inode *inode; + + if (de->prev == de) + return NULL; /* Quick check to see if unhooked */ + if ((inode = new_inode(sb)) == NULL) { + PRINTK("(%s): new_inode() failed, de: %p\n", de->name, de); + return NULL; + } + if (de->parent) { + read_lock(&de->parent->u.dir.lock); + if (de->prev != de) + de->inode.dentry = dentry; /* Not unhooked */ + read_unlock(&de->parent->u.dir.lock); + } else + de->inode.dentry = dentry; /* Root: no locking needed */ + if (de->inode.dentry != dentry) { /* Must have been unhooked */ + iput(inode); + return NULL; + } + /* FIXME where is devfs_put? */ + inode->u.generic_ip = devfs_get(de); + inode->i_ino = de->inode.ino; + DPRINTK(DEBUG_I_GET, "(%d): VFS inode: %p devfs_entry: %p\n", + (int)inode->i_ino, inode, de); + inode->i_blocks = 0; + inode->i_blksize = FAKE_BLOCK_SIZE; + inode->i_op = &devfs_iops; + inode->i_mode = de->mode; if (S_ISDIR(de->mode)) { inode->i_op = &devfs_dir_iops; inode->i_fop = &devfs_dir_fops; @@ -1945,100 +1939,107 @@ static struct inode *_devfs_get_vfs_inode (struct super_block *sb, init_special_inode(inode, de->mode, 0); } else { PRINTK("(%s): unknown mode %o de: %p\n", - de->name, de->mode, de); + de->name, de->mode, de); iput(inode); devfs_put(de); return NULL; } - inode->i_uid = de->inode.uid; - inode->i_gid = de->inode.gid; - inode->i_atime = de->inode.atime; - inode->i_mtime = de->inode.mtime; - inode->i_ctime = de->inode.ctime; - DPRINTK (DEBUG_I_GET, "(): mode: 0%o uid: %d gid: %d\n", - (int) inode->i_mode, (int) inode->i_uid, (int) inode->i_gid); - return inode; -} /* End Function _devfs_get_vfs_inode */ - + inode->i_uid = de->inode.uid; + inode->i_gid = de->inode.gid; + inode->i_atime = de->inode.atime; + 
inode->i_mtime = de->inode.mtime; + inode->i_ctime = de->inode.ctime; + DPRINTK(DEBUG_I_GET, "(): mode: 0%o uid: %d gid: %d\n", + (int)inode->i_mode, (int)inode->i_uid, (int)inode->i_gid); + return inode; +} /* End Function _devfs_get_vfs_inode */ /* File operations for device entries follow */ -static int devfs_readdir (struct file *file, void *dirent, filldir_t filldir) +static int devfs_readdir(struct file *file, void *dirent, filldir_t filldir) { - int err, count; - int stored = 0; - struct fs_info *fs_info; - struct devfs_entry *parent, *de, *next = NULL; - struct inode *inode = file->f_dentry->d_inode; - - fs_info = inode->i_sb->s_fs_info; - parent = get_devfs_entry_from_vfs_inode (file->f_dentry->d_inode); - if ( (long) file->f_pos < 0 ) return -EINVAL; - DPRINTK (DEBUG_F_READDIR, "(%s): fs_info: %p pos: %ld\n", - parent->name, fs_info, (long) file->f_pos); - switch ( (long) file->f_pos ) - { - case 0: - err = (*filldir) (dirent, "..", 2, file->f_pos, - parent_ino (file->f_dentry), DT_DIR); - if (err == -EINVAL) break; - if (err < 0) return err; - file->f_pos++; - ++stored; - /* Fall through */ - case 1: - err = (*filldir) (dirent, ".", 1, file->f_pos, inode->i_ino, DT_DIR); - if (err == -EINVAL) break; - if (err < 0) return err; - file->f_pos++; - ++stored; - /* Fall through */ - default: - /* Skip entries */ - count = file->f_pos - 2; - read_lock (&parent->u.dir.lock); - for (de = parent->u.dir.first; de && (count > 0); de = de->next) - --count; - devfs_get (de); - read_unlock (&parent->u.dir.lock); - /* Now add all remaining entries */ - while (de) - { - err = (*filldir) (dirent, de->name, de->namelen, - file->f_pos, de->inode.ino, de->mode >> 12); - if (err < 0) devfs_put (de); - else - { - file->f_pos++; - ++stored; - } - if (err == -EINVAL) break; - if (err < 0) return err; - read_lock (&parent->u.dir.lock); - next = devfs_get (de->next); - read_unlock (&parent->u.dir.lock); - devfs_put (de); - de = next; + int err, count; + int stored = 0; + struct 
fs_info *fs_info; + struct devfs_entry *parent, *de, *next = NULL; + struct inode *inode = file->f_dentry->d_inode; + + fs_info = inode->i_sb->s_fs_info; + parent = get_devfs_entry_from_vfs_inode(file->f_dentry->d_inode); + if ((long)file->f_pos < 0) + return -EINVAL; + DPRINTK(DEBUG_F_READDIR, "(%s): fs_info: %p pos: %ld\n", + parent->name, fs_info, (long)file->f_pos); + switch ((long)file->f_pos) { + case 0: + err = (*filldir) (dirent, "..", 2, file->f_pos, + parent_ino(file->f_dentry), DT_DIR); + if (err == -EINVAL) + break; + if (err < 0) + return err; + file->f_pos++; + ++stored; + /* Fall through */ + case 1: + err = + (*filldir) (dirent, ".", 1, file->f_pos, inode->i_ino, + DT_DIR); + if (err == -EINVAL) + break; + if (err < 0) + return err; + file->f_pos++; + ++stored; + /* Fall through */ + default: + /* Skip entries */ + count = file->f_pos - 2; + read_lock(&parent->u.dir.lock); + for (de = parent->u.dir.first; de && (count > 0); de = de->next) + --count; + devfs_get(de); + read_unlock(&parent->u.dir.lock); + /* Now add all remaining entries */ + while (de) { + err = (*filldir) (dirent, de->name, de->namelen, + file->f_pos, de->inode.ino, + de->mode >> 12); + if (err < 0) + devfs_put(de); + else { + file->f_pos++; + ++stored; + } + if (err == -EINVAL) + break; + if (err < 0) + return err; + read_lock(&parent->u.dir.lock); + next = devfs_get(de->next); + read_unlock(&parent->u.dir.lock); + devfs_put(de); + de = next; + } + break; } - break; - } - return stored; -} /* End Function devfs_readdir */ + return stored; +} /* End Function devfs_readdir */ /* Open devfs specific special files */ -static int devfs_open (struct inode *inode, struct file *file) +static int devfs_open(struct inode *inode, struct file *file) { int err; int minor = MINOR(inode->i_rdev); struct file_operations *old_fops, *new_fops; switch (minor) { - case 0: /* /dev/.devfsd */ + case 0: /* /dev/.devfsd */ new_fops = fops_get(&devfsd_fops); break; #ifdef CONFIG_DEVFS_DEBUG - case 1: /* 
/dev/.stat */ + case 1: /* /dev/.stat */ new_fops = fops_get(&stat_fops); break; #endif @@ -2057,32 +2058,28 @@ static int devfs_open (struct inode *inode, struct file *file) } else fops_put(old_fops); return err; -} /* End Function devfs_open */ +} /* End Function devfs_open */ -static struct file_operations devfs_fops = -{ - .open = devfs_open, +static struct file_operations devfs_fops = { + .open = devfs_open, }; -static struct file_operations devfs_dir_fops = -{ - .read = generic_read_dir, - .readdir = devfs_readdir, +static struct file_operations devfs_dir_fops = { + .read = generic_read_dir, + .readdir = devfs_readdir, }; - /* Dentry operations for device entries follow */ - /** * devfs_d_release - Callback for when a dentry is freed. * @dentry: The dentry. */ -static void devfs_d_release (struct dentry *dentry) +static void devfs_d_release(struct dentry *dentry) { - DPRINTK (DEBUG_D_RELEASE, "(%p): inode: %p\n", dentry, dentry->d_inode); -} /* End Function devfs_d_release */ + DPRINTK(DEBUG_D_RELEASE, "(%p): inode: %p\n", dentry, dentry->d_inode); +} /* End Function devfs_d_release */ /** * devfs_d_iput - Callback for when a dentry loses its inode. @@ -2090,38 +2087,37 @@ static void devfs_d_release (struct dentry *dentry) * @inode: The inode. 
*/ -static void devfs_d_iput (struct dentry *dentry, struct inode *inode) +static void devfs_d_iput(struct dentry *dentry, struct inode *inode) { - struct devfs_entry *de; - - de = get_devfs_entry_from_vfs_inode (inode); - DPRINTK (DEBUG_D_IPUT,"(%s): dentry: %p inode: %p de: %p de->dentry: %p\n", - de->name, dentry, inode, de, de->inode.dentry); - if ( de->inode.dentry && (de->inode.dentry != dentry) ) - OOPS ("(%s): de: %p dentry: %p de->dentry: %p\n", - de->name, de, dentry, de->inode.dentry); - de->inode.dentry = NULL; - iput (inode); - devfs_put (de); -} /* End Function devfs_d_iput */ - -static int devfs_d_delete (struct dentry *dentry); - -static struct dentry_operations devfs_dops = -{ - .d_delete = devfs_d_delete, - .d_release = devfs_d_release, - .d_iput = devfs_d_iput, + struct devfs_entry *de; + + de = get_devfs_entry_from_vfs_inode(inode); + DPRINTK(DEBUG_D_IPUT, + "(%s): dentry: %p inode: %p de: %p de->dentry: %p\n", de->name, + dentry, inode, de, de->inode.dentry); + if (de->inode.dentry && (de->inode.dentry != dentry)) + OOPS("(%s): de: %p dentry: %p de->dentry: %p\n", + de->name, de, dentry, de->inode.dentry); + de->inode.dentry = NULL; + iput(inode); + devfs_put(de); +} /* End Function devfs_d_iput */ + +static int devfs_d_delete(struct dentry *dentry); + +static struct dentry_operations devfs_dops = { + .d_delete = devfs_d_delete, + .d_release = devfs_d_release, + .d_iput = devfs_d_iput, }; -static int devfs_d_revalidate_wait (struct dentry *dentry, struct nameidata *); +static int devfs_d_revalidate_wait(struct dentry *dentry, struct nameidata *); -static struct dentry_operations devfs_wait_dops = -{ - .d_delete = devfs_d_delete, - .d_release = devfs_d_release, - .d_iput = devfs_d_iput, - .d_revalidate = devfs_d_revalidate_wait, +static struct dentry_operations devfs_wait_dops = { + .d_delete = devfs_d_delete, + .d_release = devfs_d_release, + .d_iput = devfs_d_iput, + .d_revalidate = devfs_d_revalidate_wait, }; /** @@ -2129,653 +2125,673 @@ 
static struct dentry_operations devfs_wait_dops = * @dentry: The dentry. */ -static int devfs_d_delete (struct dentry *dentry) +static int devfs_d_delete(struct dentry *dentry) { - struct inode *inode = dentry->d_inode; + struct inode *inode = dentry->d_inode; - if (dentry->d_op == &devfs_wait_dops) dentry->d_op = &devfs_dops; - /* Unhash dentry if negative (has no inode) */ - if (inode == NULL) - { - DPRINTK (DEBUG_D_DELETE, "(%p): dropping negative dentry\n", dentry); - return 1; - } - return 0; -} /* End Function devfs_d_delete */ + if (dentry->d_op == &devfs_wait_dops) + dentry->d_op = &devfs_dops; + /* Unhash dentry if negative (has no inode) */ + if (inode == NULL) { + DPRINTK(DEBUG_D_DELETE, "(%p): dropping negative dentry\n", + dentry); + return 1; + } + return 0; +} /* End Function devfs_d_delete */ -struct devfs_lookup_struct -{ - devfs_handle_t de; - wait_queue_head_t wait_queue; +struct devfs_lookup_struct { + devfs_handle_t de; + wait_queue_head_t wait_queue; }; /* XXX: this doesn't handle the case where we got a negative dentry but a devfs entry has been registered in the meanwhile */ -static int devfs_d_revalidate_wait (struct dentry *dentry, struct nameidata *nd) +static int devfs_d_revalidate_wait(struct dentry *dentry, struct nameidata *nd) { - struct inode *dir = dentry->d_parent->d_inode; - struct fs_info *fs_info = dir->i_sb->s_fs_info; - devfs_handle_t parent = get_devfs_entry_from_vfs_inode (dir); - struct devfs_lookup_struct *lookup_info = dentry->d_fsdata; - DECLARE_WAITQUEUE (wait, current); - int need_lock; - - /* - * FIXME HACK - * - * make sure that - * d_instantiate always runs under lock - * we release i_sem lock before going to sleep - * - * unfortunately sometimes d_revalidate is called with - * and sometimes without i_sem lock held. The following checks - * attempt to deduce when we need to add (and drop resp.) lock - * here. 
This relies on current (2.6.2) calling coventions: - * - * lookup_hash is always run under i_sem and is passing NULL - * as nd - * - * open(...,O_CREATE,...) calls _lookup_hash under i_sem - * and sets flags to LOOKUP_OPEN|LOOKUP_CREATE - * - * all other invocations of ->d_revalidate seem to happen - * outside of i_sem - */ - need_lock = nd && - (!(nd->flags & LOOKUP_CREATE) || (nd->flags & LOOKUP_PARENT)); - - if (need_lock) - down(&dir->i_sem); - - if ( is_devfsd_or_child (fs_info) ) - { - devfs_handle_t de = lookup_info->de; - struct inode *inode; + struct inode *dir = dentry->d_parent->d_inode; + struct fs_info *fs_info = dir->i_sb->s_fs_info; + devfs_handle_t parent = get_devfs_entry_from_vfs_inode(dir); + struct devfs_lookup_struct *lookup_info = dentry->d_fsdata; + DECLARE_WAITQUEUE(wait, current); + int need_lock; - DPRINTK (DEBUG_I_LOOKUP, - "(%s): dentry: %p inode: %p de: %p by: \"%s\"\n", - dentry->d_name.name, dentry, dentry->d_inode, de, - current->comm); - if (dentry->d_inode) - goto out; - if (de == NULL) - { - read_lock (&parent->u.dir.lock); - de = _devfs_search_dir (parent, dentry->d_name.name, - dentry->d_name.len); - read_unlock (&parent->u.dir.lock); - if (de == NULL) - goto out; - lookup_info->de = de; - } - /* Create an inode, now that the driver information is available */ - inode = _devfs_get_vfs_inode (dir->i_sb, de, dentry); - if (!inode) - goto out; - DPRINTK (DEBUG_I_LOOKUP, - "(%s): new VFS inode(%u): %p de: %p by: \"%s\"\n", - de->name, de->inode.ino, inode, de, current->comm); - d_instantiate (dentry, inode); - goto out; - } - if (lookup_info == NULL) - goto out; /* Early termination */ - read_lock (&parent->u.dir.lock); - if (dentry->d_fsdata) - { - set_current_state (TASK_UNINTERRUPTIBLE); - add_wait_queue (&lookup_info->wait_queue, &wait); - read_unlock (&parent->u.dir.lock); - /* at this point it is always (hopefully) locked */ - up(&dir->i_sem); - schedule (); - down(&dir->i_sem); /* - * This does not need nor should remove wait 
from wait_queue. - * Wait queue head is never reused - nothing is ever added to it - * after all waiters have been waked up and head itself disappears - * very soon after it. Moreover it is local variable on stack that - * is likely to have already disappeared so any reference to it - * at this point is buggy. + * FIXME HACK + * + * make sure that + * d_instantiate always runs under lock + * we release i_sem lock before going to sleep + * + * unfortunately sometimes d_revalidate is called with + * and sometimes without i_sem lock held. The following checks + * attempt to deduce when we need to add (and drop resp.) lock + * here. This relies on current (2.6.2) calling coventions: + * + * lookup_hash is always run under i_sem and is passing NULL + * as nd + * + * open(...,O_CREATE,...) calls _lookup_hash under i_sem + * and sets flags to LOOKUP_OPEN|LOOKUP_CREATE + * + * all other invocations of ->d_revalidate seem to happen + * outside of i_sem */ + need_lock = nd && + (!(nd->flags & LOOKUP_CREATE) || (nd->flags & LOOKUP_PARENT)); + + if (need_lock) + down(&dir->i_sem); + + if (is_devfsd_or_child(fs_info)) { + devfs_handle_t de = lookup_info->de; + struct inode *inode; + + DPRINTK(DEBUG_I_LOOKUP, + "(%s): dentry: %p inode: %p de: %p by: \"%s\"\n", + dentry->d_name.name, dentry, dentry->d_inode, de, + current->comm); + if (dentry->d_inode) + goto out; + if (de == NULL) { + read_lock(&parent->u.dir.lock); + de = _devfs_search_dir(parent, dentry->d_name.name, + dentry->d_name.len); + read_unlock(&parent->u.dir.lock); + if (de == NULL) + goto out; + lookup_info->de = de; + } + /* Create an inode, now that the driver information is available */ + inode = _devfs_get_vfs_inode(dir->i_sb, de, dentry); + if (!inode) + goto out; + DPRINTK(DEBUG_I_LOOKUP, + "(%s): new VFS inode(%u): %p de: %p by: \"%s\"\n", + de->name, de->inode.ino, inode, de, current->comm); + d_instantiate(dentry, inode); + goto out; + } + if (lookup_info == NULL) + goto out; /* Early termination */ + 
read_lock(&parent->u.dir.lock); + if (dentry->d_fsdata) { + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&lookup_info->wait_queue, &wait); + read_unlock(&parent->u.dir.lock); + /* at this point it is always (hopefully) locked */ + up(&dir->i_sem); + schedule(); + down(&dir->i_sem); + /* + * This does not need nor should remove wait from wait_queue. + * Wait queue head is never reused - nothing is ever added to it + * after all waiters have been waked up and head itself disappears + * very soon after it. Moreover it is local variable on stack that + * is likely to have already disappeared so any reference to it + * at this point is buggy. + */ - } - else read_unlock (&parent->u.dir.lock); - -out: - if (need_lock) - up(&dir->i_sem); - return 1; -} /* End Function devfs_d_revalidate_wait */ + } else + read_unlock(&parent->u.dir.lock); + out: + if (need_lock) + up(&dir->i_sem); + return 1; +} /* End Function devfs_d_revalidate_wait */ /* Inode operations for device entries follow */ -static struct dentry *devfs_lookup (struct inode *dir, struct dentry *dentry, struct nameidata *nd) +static struct dentry *devfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) { - struct devfs_entry tmp; /* Must stay in scope until devfsd idle again */ - struct devfs_lookup_struct lookup_info; - struct fs_info *fs_info = dir->i_sb->s_fs_info; - struct devfs_entry *parent, *de; - struct inode *inode; - struct dentry *retval = NULL; - - /* Set up the dentry operations before anything else, to ensure cleaning - up on any error */ - dentry->d_op = &devfs_dops; - /* First try to get the devfs entry for this directory */ - parent = get_devfs_entry_from_vfs_inode (dir); - DPRINTK (DEBUG_I_LOOKUP, "(%s): dentry: %p parent: %p by: \"%s\"\n", - dentry->d_name.name, dentry, parent, current->comm); - if (parent == NULL) return ERR_PTR (-ENOENT); - read_lock (&parent->u.dir.lock); - de = _devfs_search_dir (parent, dentry->d_name.name, dentry->d_name.len); - 
read_unlock (&parent->u.dir.lock); - lookup_info.de = de; - init_waitqueue_head (&lookup_info.wait_queue); - dentry->d_fsdata = &lookup_info; - if (de == NULL) - { /* Try with devfsd. For any kind of failure, leave a negative dentry - so someone else can deal with it (in the case where the sysadmin - does a mknod()). It's important to do this before hashing the - dentry, so that the devfsd queue is filled before revalidates - can start */ - if (try_modload (parent, fs_info, - dentry->d_name.name, dentry->d_name.len, &tmp) < 0) - { /* Lookup event was not queued to devfsd */ - d_add (dentry, NULL); - return NULL; + struct devfs_entry tmp; /* Must stay in scope until devfsd idle again */ + struct devfs_lookup_struct lookup_info; + struct fs_info *fs_info = dir->i_sb->s_fs_info; + struct devfs_entry *parent, *de; + struct inode *inode; + struct dentry *retval = NULL; + + /* Set up the dentry operations before anything else, to ensure cleaning + up on any error */ + dentry->d_op = &devfs_dops; + /* First try to get the devfs entry for this directory */ + parent = get_devfs_entry_from_vfs_inode(dir); + DPRINTK(DEBUG_I_LOOKUP, "(%s): dentry: %p parent: %p by: \"%s\"\n", + dentry->d_name.name, dentry, parent, current->comm); + if (parent == NULL) + return ERR_PTR(-ENOENT); + read_lock(&parent->u.dir.lock); + de = _devfs_search_dir(parent, dentry->d_name.name, dentry->d_name.len); + read_unlock(&parent->u.dir.lock); + lookup_info.de = de; + init_waitqueue_head(&lookup_info.wait_queue); + dentry->d_fsdata = &lookup_info; + if (de == NULL) { /* Try with devfsd. For any kind of failure, leave a negative dentry + so someone else can deal with it (in the case where the sysadmin + does a mknod()). 
It's important to do this before hashing the + dentry, so that the devfsd queue is filled before revalidates + can start */ + if (try_modload(parent, fs_info, dentry->d_name.name, dentry->d_name.len, &tmp) < 0) { /* Lookup event was not queued to devfsd */ + d_add(dentry, NULL); + return NULL; + } } - } - dentry->d_op = &devfs_wait_dops; - d_add (dentry, NULL); /* Open the floodgates */ - /* Unlock directory semaphore, which will release any waiters. They - will get the hashed dentry, and may be forced to wait for - revalidation */ - up (&dir->i_sem); - wait_for_devfsd_finished (fs_info); /* If I'm not devfsd, must wait */ - down (&dir->i_sem); /* Grab it again because them's the rules */ - de = lookup_info.de; - /* If someone else has been so kind as to make the inode, we go home - early */ - if (dentry->d_inode) goto out; - if (de == NULL) - { - read_lock (&parent->u.dir.lock); - de = _devfs_search_dir (parent, dentry->d_name.name, - dentry->d_name.len); - read_unlock (&parent->u.dir.lock); - if (de == NULL) goto out; - /* OK, there's an entry now, but no VFS inode yet */ - } - /* Create an inode, now that the driver information is available */ - inode = _devfs_get_vfs_inode (dir->i_sb, de, dentry); - if (!inode) - { - retval = ERR_PTR (-ENOMEM); - goto out; - } - DPRINTK (DEBUG_I_LOOKUP, "(%s): new VFS inode(%u): %p de: %p by: \"%s\"\n", - de->name, de->inode.ino, inode, de, current->comm); - d_instantiate (dentry, inode); -out: - write_lock (&parent->u.dir.lock); - dentry->d_op = &devfs_dops; - dentry->d_fsdata = NULL; - wake_up (&lookup_info.wait_queue); - write_unlock (&parent->u.dir.lock); - devfs_put (de); - return retval; -} /* End Function devfs_lookup */ - -static int devfs_unlink (struct inode *dir, struct dentry *dentry) -{ - int unhooked; - struct devfs_entry *de; - struct inode *inode = dentry->d_inode; - struct fs_info *fs_info = dir->i_sb->s_fs_info; - - de = get_devfs_entry_from_vfs_inode (inode); - DPRINTK (DEBUG_I_UNLINK, "(%s): de: %p\n", 
dentry->d_name.name, de); - if (de == NULL) return -ENOENT; - if (!de->vfs) return -EPERM; - write_lock (&de->parent->u.dir.lock); - unhooked = _devfs_unhook (de); - write_unlock (&de->parent->u.dir.lock); - if (!unhooked) return -ENOENT; - if ( !is_devfsd_or_child (fs_info) ) - devfsd_notify_de (de, DEVFSD_NOTIFY_DELETE, inode->i_mode, - inode->i_uid, inode->i_gid, fs_info); - free_dentry (de); - devfs_put (de); - return 0; -} /* End Function devfs_unlink */ - -static int devfs_symlink (struct inode *dir, struct dentry *dentry, - const char *symname) -{ - int err; - struct fs_info *fs_info = dir->i_sb->s_fs_info; - struct devfs_entry *parent, *de; - struct inode *inode; - - /* First try to get the devfs entry for this directory */ - parent = get_devfs_entry_from_vfs_inode (dir); - if (parent == NULL) return -ENOENT; - err = devfs_do_symlink (parent, dentry->d_name.name, symname, &de); - DPRINTK (DEBUG_DISABLED, "(%s): errcode from : %d\n", - dentry->d_name.name, err); - if (err < 0) return err; - de->vfs = TRUE; - de->inode.uid = current->euid; - de->inode.gid = current->egid; - de->inode.atime = CURRENT_TIME; - de->inode.mtime = CURRENT_TIME; - de->inode.ctime = CURRENT_TIME; - if ( ( inode = _devfs_get_vfs_inode (dir->i_sb, de, dentry) ) == NULL ) - return -ENOMEM; - DPRINTK (DEBUG_DISABLED, "(%s): new VFS inode(%u): %p dentry: %p\n", - dentry->d_name.name, de->inode.ino, inode, dentry); - d_instantiate (dentry, inode); - if ( !is_devfsd_or_child (fs_info) ) - devfsd_notify_de (de, DEVFSD_NOTIFY_CREATE, inode->i_mode, - inode->i_uid, inode->i_gid, fs_info); - return 0; -} /* End Function devfs_symlink */ - -static int devfs_mkdir (struct inode *dir, struct dentry *dentry, int mode) -{ - int err; - struct fs_info *fs_info = dir->i_sb->s_fs_info; - struct devfs_entry *parent, *de; - struct inode *inode; - - mode = (mode & ~S_IFMT) | S_IFDIR; /* VFS doesn't pass S_IFMT part */ - parent = get_devfs_entry_from_vfs_inode (dir); - if (parent == NULL) return -ENOENT; - 
de = _devfs_alloc_entry (dentry->d_name.name, dentry->d_name.len, mode); - if (!de) return -ENOMEM; - de->vfs = TRUE; - if ( ( err = _devfs_append_entry (parent, de, NULL) ) != 0 ) - return err; - de->inode.uid = current->euid; - de->inode.gid = current->egid; - de->inode.atime = CURRENT_TIME; - de->inode.mtime = CURRENT_TIME; - de->inode.ctime = CURRENT_TIME; - if ( ( inode = _devfs_get_vfs_inode (dir->i_sb, de, dentry) ) == NULL ) - return -ENOMEM; - DPRINTK (DEBUG_DISABLED, "(%s): new VFS inode(%u): %p dentry: %p\n", - dentry->d_name.name, de->inode.ino, inode, dentry); - d_instantiate (dentry, inode); - if ( !is_devfsd_or_child (fs_info) ) - devfsd_notify_de (de, DEVFSD_NOTIFY_CREATE, inode->i_mode, - inode->i_uid, inode->i_gid, fs_info); - return 0; -} /* End Function devfs_mkdir */ - -static int devfs_rmdir (struct inode *dir, struct dentry *dentry) + dentry->d_op = &devfs_wait_dops; + d_add(dentry, NULL); /* Open the floodgates */ + /* Unlock directory semaphore, which will release any waiters. 
They + will get the hashed dentry, and may be forced to wait for + revalidation */ + up(&dir->i_sem); + wait_for_devfsd_finished(fs_info); /* If I'm not devfsd, must wait */ + down(&dir->i_sem); /* Grab it again because them's the rules */ + de = lookup_info.de; + /* If someone else has been so kind as to make the inode, we go home + early */ + if (dentry->d_inode) + goto out; + if (de == NULL) { + read_lock(&parent->u.dir.lock); + de = _devfs_search_dir(parent, dentry->d_name.name, + dentry->d_name.len); + read_unlock(&parent->u.dir.lock); + if (de == NULL) + goto out; + /* OK, there's an entry now, but no VFS inode yet */ + } + /* Create an inode, now that the driver information is available */ + inode = _devfs_get_vfs_inode(dir->i_sb, de, dentry); + if (!inode) { + retval = ERR_PTR(-ENOMEM); + goto out; + } + DPRINTK(DEBUG_I_LOOKUP, + "(%s): new VFS inode(%u): %p de: %p by: \"%s\"\n", de->name, + de->inode.ino, inode, de, current->comm); + d_instantiate(dentry, inode); + out: + write_lock(&parent->u.dir.lock); + dentry->d_op = &devfs_dops; + dentry->d_fsdata = NULL; + wake_up(&lookup_info.wait_queue); + write_unlock(&parent->u.dir.lock); + devfs_put(de); + return retval; +} /* End Function devfs_lookup */ + +static int devfs_unlink(struct inode *dir, struct dentry *dentry) +{ + int unhooked; + struct devfs_entry *de; + struct inode *inode = dentry->d_inode; + struct fs_info *fs_info = dir->i_sb->s_fs_info; + + de = get_devfs_entry_from_vfs_inode(inode); + DPRINTK(DEBUG_I_UNLINK, "(%s): de: %p\n", dentry->d_name.name, de); + if (de == NULL) + return -ENOENT; + if (!de->vfs) + return -EPERM; + write_lock(&de->parent->u.dir.lock); + unhooked = _devfs_unhook(de); + write_unlock(&de->parent->u.dir.lock); + if (!unhooked) + return -ENOENT; + if (!is_devfsd_or_child(fs_info)) + devfsd_notify_de(de, DEVFSD_NOTIFY_DELETE, inode->i_mode, + inode->i_uid, inode->i_gid, fs_info); + free_dentry(de); + devfs_put(de); + return 0; +} /* End Function devfs_unlink */ + +static int 
devfs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) { - int err = 0; - struct devfs_entry *de; - struct fs_info *fs_info = dir->i_sb->s_fs_info; - struct inode *inode = dentry->d_inode; - - if (dir->i_sb->s_fs_info != inode->i_sb->s_fs_info) return -EINVAL; - de = get_devfs_entry_from_vfs_inode (inode); - if (de == NULL) return -ENOENT; - if ( !S_ISDIR (de->mode) ) return -ENOTDIR; - if (!de->vfs) return -EPERM; - /* First ensure the directory is empty and will stay that way */ - write_lock (&de->u.dir.lock); - if (de->u.dir.first) err = -ENOTEMPTY; - else de->u.dir.no_more_additions = TRUE; - write_unlock (&de->u.dir.lock); - if (err) return err; - /* Now unhook the directory from its parent */ - write_lock (&de->parent->u.dir.lock); - if ( !_devfs_unhook (de) ) err = -ENOENT; - write_unlock (&de->parent->u.dir.lock); - if (err) return err; - if ( !is_devfsd_or_child (fs_info) ) - devfsd_notify_de (de, DEVFSD_NOTIFY_DELETE, inode->i_mode, - inode->i_uid, inode->i_gid, fs_info); - free_dentry (de); - devfs_put (de); - return 0; -} /* End Function devfs_rmdir */ - -static int devfs_mknod (struct inode *dir, struct dentry *dentry, int mode, - dev_t rdev) + int err; + struct fs_info *fs_info = dir->i_sb->s_fs_info; + struct devfs_entry *parent, *de; + struct inode *inode; + + /* First try to get the devfs entry for this directory */ + parent = get_devfs_entry_from_vfs_inode(dir); + if (parent == NULL) + return -ENOENT; + err = devfs_do_symlink(parent, dentry->d_name.name, symname, &de); + DPRINTK(DEBUG_DISABLED, "(%s): errcode from : %d\n", + dentry->d_name.name, err); + if (err < 0) + return err; + de->vfs = TRUE; + de->inode.uid = current->euid; + de->inode.gid = current->egid; + de->inode.atime = CURRENT_TIME; + de->inode.mtime = CURRENT_TIME; + de->inode.ctime = CURRENT_TIME; + if ((inode = _devfs_get_vfs_inode(dir->i_sb, de, dentry)) == NULL) + return -ENOMEM; + DPRINTK(DEBUG_DISABLED, "(%s): new VFS inode(%u): %p dentry: %p\n", + 
dentry->d_name.name, de->inode.ino, inode, dentry); + d_instantiate(dentry, inode); + if (!is_devfsd_or_child(fs_info)) + devfsd_notify_de(de, DEVFSD_NOTIFY_CREATE, inode->i_mode, + inode->i_uid, inode->i_gid, fs_info); + return 0; +} /* End Function devfs_symlink */ + +static int devfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) { - int err; - struct fs_info *fs_info = dir->i_sb->s_fs_info; - struct devfs_entry *parent, *de; - struct inode *inode; - - DPRINTK (DEBUG_I_MKNOD, "(%s): mode: 0%o dev: %u:%u\n", - dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev)); - parent = get_devfs_entry_from_vfs_inode (dir); - if (parent == NULL) return -ENOENT; - de = _devfs_alloc_entry (dentry->d_name.name, dentry->d_name.len, mode); - if (!de) return -ENOMEM; - de->vfs = TRUE; - if (S_ISCHR(mode) || S_ISBLK(mode)) - de->u.dev = rdev; - if ( ( err = _devfs_append_entry (parent, de, NULL) ) != 0 ) - return err; - de->inode.uid = current->euid; - de->inode.gid = current->egid; - de->inode.atime = CURRENT_TIME; - de->inode.mtime = CURRENT_TIME; - de->inode.ctime = CURRENT_TIME; - if ( ( inode = _devfs_get_vfs_inode (dir->i_sb, de, dentry) ) == NULL ) - return -ENOMEM; - DPRINTK (DEBUG_I_MKNOD, ": new VFS inode(%u): %p dentry: %p\n", - de->inode.ino, inode, dentry); - d_instantiate (dentry, inode); - if ( !is_devfsd_or_child (fs_info) ) - devfsd_notify_de (de, DEVFSD_NOTIFY_CREATE, inode->i_mode, - inode->i_uid, inode->i_gid, fs_info); - return 0; -} /* End Function devfs_mknod */ - -static int devfs_readlink (struct dentry *dentry, char *buffer, int buflen) + int err; + struct fs_info *fs_info = dir->i_sb->s_fs_info; + struct devfs_entry *parent, *de; + struct inode *inode; + + mode = (mode & ~S_IFMT) | S_IFDIR; /* VFS doesn't pass S_IFMT part */ + parent = get_devfs_entry_from_vfs_inode(dir); + if (parent == NULL) + return -ENOENT; + de = _devfs_alloc_entry(dentry->d_name.name, dentry->d_name.len, mode); + if (!de) + return -ENOMEM; + de->vfs = TRUE; + if ((err = 
_devfs_append_entry(parent, de, NULL)) != 0) + return err; + de->inode.uid = current->euid; + de->inode.gid = current->egid; + de->inode.atime = CURRENT_TIME; + de->inode.mtime = CURRENT_TIME; + de->inode.ctime = CURRENT_TIME; + if ((inode = _devfs_get_vfs_inode(dir->i_sb, de, dentry)) == NULL) + return -ENOMEM; + DPRINTK(DEBUG_DISABLED, "(%s): new VFS inode(%u): %p dentry: %p\n", + dentry->d_name.name, de->inode.ino, inode, dentry); + d_instantiate(dentry, inode); + if (!is_devfsd_or_child(fs_info)) + devfsd_notify_de(de, DEVFSD_NOTIFY_CREATE, inode->i_mode, + inode->i_uid, inode->i_gid, fs_info); + return 0; +} /* End Function devfs_mkdir */ + +static int devfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + int err = 0; + struct devfs_entry *de; + struct fs_info *fs_info = dir->i_sb->s_fs_info; + struct inode *inode = dentry->d_inode; + + if (dir->i_sb->s_fs_info != inode->i_sb->s_fs_info) + return -EINVAL; + de = get_devfs_entry_from_vfs_inode(inode); + if (de == NULL) + return -ENOENT; + if (!S_ISDIR(de->mode)) + return -ENOTDIR; + if (!de->vfs) + return -EPERM; + /* First ensure the directory is empty and will stay that way */ + write_lock(&de->u.dir.lock); + if (de->u.dir.first) + err = -ENOTEMPTY; + else + de->u.dir.no_more_additions = TRUE; + write_unlock(&de->u.dir.lock); + if (err) + return err; + /* Now unhook the directory from its parent */ + write_lock(&de->parent->u.dir.lock); + if (!_devfs_unhook(de)) + err = -ENOENT; + write_unlock(&de->parent->u.dir.lock); + if (err) + return err; + if (!is_devfsd_or_child(fs_info)) + devfsd_notify_de(de, DEVFSD_NOTIFY_DELETE, inode->i_mode, + inode->i_uid, inode->i_gid, fs_info); + free_dentry(de); + devfs_put(de); + return 0; +} /* End Function devfs_rmdir */ + +static int devfs_mknod(struct inode *dir, struct dentry *dentry, int mode, + dev_t rdev) { - int err; - struct devfs_entry *de; + int err; + struct fs_info *fs_info = dir->i_sb->s_fs_info; + struct devfs_entry *parent, *de; + struct inode *inode; - 
de = get_devfs_entry_from_vfs_inode (dentry->d_inode); - if (!de) return -ENODEV; - err = vfs_readlink (dentry, buffer, buflen, de->u.symlink.linkname); - return err; -} /* End Function devfs_readlink */ + DPRINTK(DEBUG_I_MKNOD, "(%s): mode: 0%o dev: %u:%u\n", + dentry->d_name.name, mode, MAJOR(rdev), MINOR(rdev)); + parent = get_devfs_entry_from_vfs_inode(dir); + if (parent == NULL) + return -ENOENT; + de = _devfs_alloc_entry(dentry->d_name.name, dentry->d_name.len, mode); + if (!de) + return -ENOMEM; + de->vfs = TRUE; + if (S_ISCHR(mode) || S_ISBLK(mode)) + de->u.dev = rdev; + if ((err = _devfs_append_entry(parent, de, NULL)) != 0) + return err; + de->inode.uid = current->euid; + de->inode.gid = current->egid; + de->inode.atime = CURRENT_TIME; + de->inode.mtime = CURRENT_TIME; + de->inode.ctime = CURRENT_TIME; + if ((inode = _devfs_get_vfs_inode(dir->i_sb, de, dentry)) == NULL) + return -ENOMEM; + DPRINTK(DEBUG_I_MKNOD, ": new VFS inode(%u): %p dentry: %p\n", + de->inode.ino, inode, dentry); + d_instantiate(dentry, inode); + if (!is_devfsd_or_child(fs_info)) + devfsd_notify_de(de, DEVFSD_NOTIFY_CREATE, inode->i_mode, + inode->i_uid, inode->i_gid, fs_info); + return 0; +} /* End Function devfs_mknod */ -static int devfs_follow_link (struct dentry *dentry, struct nameidata *nd) +static int devfs_readlink(struct dentry *dentry, char *buffer, int buflen) { - int err; - struct devfs_entry *de; + int err; + struct devfs_entry *de; - de = get_devfs_entry_from_vfs_inode (dentry->d_inode); - if (!de) return -ENODEV; - err = vfs_follow_link (nd, de->u.symlink.linkname); - return err; -} /* End Function devfs_follow_link */ + de = get_devfs_entry_from_vfs_inode(dentry->d_inode); + if (!de) + return -ENODEV; + err = vfs_readlink(dentry, buffer, buflen, de->u.symlink.linkname); + return err; +} /* End Function devfs_readlink */ -static struct inode_operations devfs_iops = +static int devfs_follow_link(struct dentry *dentry, struct nameidata *nd) { - .setattr = 
devfs_notify_change, + int err; + struct devfs_entry *de; + + de = get_devfs_entry_from_vfs_inode(dentry->d_inode); + if (!de) + return -ENODEV; + err = vfs_follow_link(nd, de->u.symlink.linkname); + return err; +} /* End Function devfs_follow_link */ + +static struct inode_operations devfs_iops = { + .setattr = devfs_notify_change, }; -static struct inode_operations devfs_dir_iops = -{ - .lookup = devfs_lookup, - .unlink = devfs_unlink, - .symlink = devfs_symlink, - .mkdir = devfs_mkdir, - .rmdir = devfs_rmdir, - .mknod = devfs_mknod, - .setattr = devfs_notify_change, +static struct inode_operations devfs_dir_iops = { + .lookup = devfs_lookup, + .unlink = devfs_unlink, + .symlink = devfs_symlink, + .mkdir = devfs_mkdir, + .rmdir = devfs_rmdir, + .mknod = devfs_mknod, + .setattr = devfs_notify_change, }; -static struct inode_operations devfs_symlink_iops = -{ - .readlink = devfs_readlink, - .follow_link = devfs_follow_link, - .setattr = devfs_notify_change, +static struct inode_operations devfs_symlink_iops = { + .readlink = devfs_readlink, + .follow_link = devfs_follow_link, + .setattr = devfs_notify_change, }; -static int devfs_fill_super (struct super_block *sb, void *data, int silent) +static int devfs_fill_super(struct super_block *sb, void *data, int silent) { - struct inode *root_inode = NULL; - - if (_devfs_get_root_entry () == NULL) goto out_no_root; - atomic_set (&fs_info.devfsd_overrun_count, 0); - init_waitqueue_head (&fs_info.devfsd_wait_queue); - init_waitqueue_head (&fs_info.revalidate_wait_queue); - fs_info.sb = sb; - sb->s_fs_info = &fs_info; - sb->s_blocksize = 1024; - sb->s_blocksize_bits = 10; - sb->s_magic = DEVFS_SUPER_MAGIC; - sb->s_op = &devfs_sops; - if ( ( root_inode = _devfs_get_vfs_inode (sb, root_entry, NULL) ) == NULL ) - goto out_no_root; - sb->s_root = d_alloc_root (root_inode); - if (!sb->s_root) goto out_no_root; - DPRINTK (DEBUG_S_READ, "(): made devfs ptr: %p\n", sb->s_fs_info); - return 0; - -out_no_root: - PRINTK ("(): get root 
inode failed\n"); - if (root_inode) iput (root_inode); - return -EINVAL; -} /* End Function devfs_fill_super */ - -static struct super_block * -devfs_get_sb (struct file_system_type *fs_type, int flags, - const char *dev_name, void *data) + struct inode *root_inode = NULL; + + if (_devfs_get_root_entry() == NULL) + goto out_no_root; + atomic_set(&fs_info.devfsd_overrun_count, 0); + init_waitqueue_head(&fs_info.devfsd_wait_queue); + init_waitqueue_head(&fs_info.revalidate_wait_queue); + fs_info.sb = sb; + sb->s_fs_info = &fs_info; + sb->s_blocksize = 1024; + sb->s_blocksize_bits = 10; + sb->s_magic = DEVFS_SUPER_MAGIC; + sb->s_op = &devfs_sops; + if ((root_inode = _devfs_get_vfs_inode(sb, root_entry, NULL)) == NULL) + goto out_no_root; + sb->s_root = d_alloc_root(root_inode); + if (!sb->s_root) + goto out_no_root; + DPRINTK(DEBUG_S_READ, "(): made devfs ptr: %p\n", sb->s_fs_info); + return 0; + + out_no_root: + PRINTK("(): get root inode failed\n"); + if (root_inode) + iput(root_inode); + return -EINVAL; +} /* End Function devfs_fill_super */ + +static struct super_block *devfs_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data) { - return get_sb_single (fs_type, flags, data, devfs_fill_super); + return get_sb_single(fs_type, flags, data, devfs_fill_super); } -static struct file_system_type devfs_fs_type = -{ - .name = DEVFS_NAME, - .get_sb = devfs_get_sb, - .kill_sb = kill_anon_super, +static struct file_system_type devfs_fs_type = { + .name = DEVFS_NAME, + .get_sb = devfs_get_sb, + .kill_sb = kill_anon_super, }; /* File operations for devfsd follow */ -static ssize_t devfsd_read (struct file *file, char *buf, size_t len, - loff_t *ppos) +static ssize_t devfsd_read(struct file *file, char *buf, size_t len, + loff_t * ppos) { - int done = FALSE; - int ival; - loff_t pos, devname_offset, tlen, rpos; - devfs_handle_t de; - struct devfsd_buf_entry *entry; - struct fs_info *fs_info = file->f_dentry->d_inode->i_sb->s_fs_info; - 
struct devfsd_notify_struct *info = fs_info->devfsd_info; - DECLARE_WAITQUEUE (wait, current); - - /* Can't seek (pread) on this device */ - if (ppos != &file->f_pos) return -ESPIPE; - /* Verify the task has grabbed the queue */ - if (fs_info->devfsd_task != current) return -EPERM; - info->major = 0; - info->minor = 0; - /* Block for a new entry */ - set_current_state (TASK_INTERRUPTIBLE); - add_wait_queue (&fs_info->devfsd_wait_queue, &wait); - while ( devfsd_queue_empty (fs_info) ) - { - fs_info->devfsd_sleeping = TRUE; - wake_up (&fs_info->revalidate_wait_queue); - schedule (); - fs_info->devfsd_sleeping = FALSE; - if ( signal_pending (current) ) - { - remove_wait_queue (&fs_info->devfsd_wait_queue, &wait); - __set_current_state (TASK_RUNNING); - return -EINTR; + int done = FALSE; + int ival; + loff_t pos, devname_offset, tlen, rpos; + devfs_handle_t de; + struct devfsd_buf_entry *entry; + struct fs_info *fs_info = file->f_dentry->d_inode->i_sb->s_fs_info; + struct devfsd_notify_struct *info = fs_info->devfsd_info; + DECLARE_WAITQUEUE(wait, current); + + /* Can't seek (pread) on this device */ + if (ppos != &file->f_pos) + return -ESPIPE; + /* Verify the task has grabbed the queue */ + if (fs_info->devfsd_task != current) + return -EPERM; + info->major = 0; + info->minor = 0; + /* Block for a new entry */ + set_current_state(TASK_INTERRUPTIBLE); + add_wait_queue(&fs_info->devfsd_wait_queue, &wait); + while (devfsd_queue_empty(fs_info)) { + fs_info->devfsd_sleeping = TRUE; + wake_up(&fs_info->revalidate_wait_queue); + schedule(); + fs_info->devfsd_sleeping = FALSE; + if (signal_pending(current)) { + remove_wait_queue(&fs_info->devfsd_wait_queue, &wait); + __set_current_state(TASK_RUNNING); + return -EINTR; + } + set_current_state(TASK_INTERRUPTIBLE); } - set_current_state (TASK_INTERRUPTIBLE); - } - remove_wait_queue (&fs_info->devfsd_wait_queue, &wait); - __set_current_state (TASK_RUNNING); - /* Now play with the data */ - ival = atomic_read 
(&fs_info->devfsd_overrun_count); - info->overrun_count = ival; - entry = fs_info->devfsd_first_event; - info->type = entry->type; - info->mode = entry->mode; - info->uid = entry->uid; - info->gid = entry->gid; - de = entry->de; - if (S_ISCHR(de->mode) || S_ISBLK(de->mode)) { - info->major = MAJOR(de->u.dev); - info->minor = MINOR(de->u.dev); - } - pos = devfs_generate_path (de, info->devname, DEVFS_PATHLEN); - if (pos < 0) return pos; - info->namelen = DEVFS_PATHLEN - pos - 1; - if (info->mode == 0) info->mode = de->mode; - devname_offset = info->devname - (char *) info; - rpos = *ppos; - if (rpos < devname_offset) - { - /* Copy parts of the header */ - tlen = devname_offset - rpos; - if (tlen > len) tlen = len; - if ( copy_to_user (buf, (char *) info + rpos, tlen) ) - { - return -EFAULT; + remove_wait_queue(&fs_info->devfsd_wait_queue, &wait); + __set_current_state(TASK_RUNNING); + /* Now play with the data */ + ival = atomic_read(&fs_info->devfsd_overrun_count); + info->overrun_count = ival; + entry = fs_info->devfsd_first_event; + info->type = entry->type; + info->mode = entry->mode; + info->uid = entry->uid; + info->gid = entry->gid; + de = entry->de; + if (S_ISCHR(de->mode) || S_ISBLK(de->mode)) { + info->major = MAJOR(de->u.dev); + info->minor = MINOR(de->u.dev); } - rpos += tlen; - buf += tlen; - len -= tlen; - } - if ( (rpos >= devname_offset) && (len > 0) ) - { - /* Copy the name */ - tlen = info->namelen + 1; - if (tlen > len) tlen = len; - else done = TRUE; - if ( copy_to_user (buf, info->devname + pos + rpos - devname_offset, - tlen) ) - { - return -EFAULT; + pos = devfs_generate_path(de, info->devname, DEVFS_PATHLEN); + if (pos < 0) + return pos; + info->namelen = DEVFS_PATHLEN - pos - 1; + if (info->mode == 0) + info->mode = de->mode; + devname_offset = info->devname - (char *)info; + rpos = *ppos; + if (rpos < devname_offset) { + /* Copy parts of the header */ + tlen = devname_offset - rpos; + if (tlen > len) + tlen = len; + if (copy_to_user(buf, 
(char *)info + rpos, tlen)) { + return -EFAULT; + } + rpos += tlen; + buf += tlen; + len -= tlen; } - rpos += tlen; - } - tlen = rpos - *ppos; - if (done) - { - devfs_handle_t parent; - - spin_lock (&fs_info->devfsd_buffer_lock); - fs_info->devfsd_first_event = entry->next; - if (entry->next == NULL) fs_info->devfsd_last_event = NULL; - spin_unlock (&fs_info->devfsd_buffer_lock); - for (; de != NULL; de = parent) - { - parent = de->parent; - devfs_put (de); + if ((rpos >= devname_offset) && (len > 0)) { + /* Copy the name */ + tlen = info->namelen + 1; + if (tlen > len) + tlen = len; + else + done = TRUE; + if (copy_to_user + (buf, info->devname + pos + rpos - devname_offset, tlen)) { + return -EFAULT; + } + rpos += tlen; } - kmem_cache_free (devfsd_buf_cache, entry); - if (ival > 0) atomic_sub (ival, &fs_info->devfsd_overrun_count); - *ppos = 0; - } - else *ppos = rpos; - return tlen; -} /* End Function devfsd_read */ - -static int devfsd_ioctl (struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg) + tlen = rpos - *ppos; + if (done) { + devfs_handle_t parent; + + spin_lock(&fs_info->devfsd_buffer_lock); + fs_info->devfsd_first_event = entry->next; + if (entry->next == NULL) + fs_info->devfsd_last_event = NULL; + spin_unlock(&fs_info->devfsd_buffer_lock); + for (; de != NULL; de = parent) { + parent = de->parent; + devfs_put(de); + } + kmem_cache_free(devfsd_buf_cache, entry); + if (ival > 0) + atomic_sub(ival, &fs_info->devfsd_overrun_count); + *ppos = 0; + } else + *ppos = rpos; + return tlen; +} /* End Function devfsd_read */ + +static int devfsd_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) { - int ival; - struct fs_info *fs_info = inode->i_sb->s_fs_info; - - switch (cmd) - { - case DEVFSDIOC_GET_PROTO_REV: - ival = DEVFSD_PROTOCOL_REVISION_KERNEL; - if ( copy_to_user ( (void *)arg, &ival, sizeof ival ) ) return -EFAULT; - break; - case DEVFSDIOC_SET_EVENT_MASK: - /* Ensure only one reader has 
access to the queue. This scheme will - work even if the global kernel lock were to be removed, because it - doesn't matter who gets in first, as long as only one gets it */ - if (fs_info->devfsd_task == NULL) - { - static spinlock_t lock = SPIN_LOCK_UNLOCKED; - - if ( !spin_trylock (&lock) ) return -EBUSY; - if (fs_info->devfsd_task != NULL) - { /* We lost the race... */ - spin_unlock (&lock); - return -EBUSY; - } - fs_info->devfsd_task = current; - spin_unlock (&lock); - fs_info->devfsd_pgrp = (process_group(current) == current->pid) ? - process_group(current) : 0; - fs_info->devfsd_file = file; - fs_info->devfsd_info = kmalloc (sizeof *fs_info->devfsd_info, - GFP_KERNEL); - if (!fs_info->devfsd_info) - { - devfsd_close (inode, file); - return -ENOMEM; - } - } - else if (fs_info->devfsd_task != current) return -EBUSY; - fs_info->devfsd_event_mask = arg; /* Let the masses come forth */ - break; - case DEVFSDIOC_RELEASE_EVENT_QUEUE: - if (fs_info->devfsd_file != file) return -EPERM; - return devfsd_close (inode, file); - /*break;*/ + int ival; + struct fs_info *fs_info = inode->i_sb->s_fs_info; + + switch (cmd) { + case DEVFSDIOC_GET_PROTO_REV: + ival = DEVFSD_PROTOCOL_REVISION_KERNEL; + if (copy_to_user((void *)arg, &ival, sizeof ival)) + return -EFAULT; + break; + case DEVFSDIOC_SET_EVENT_MASK: + /* Ensure only one reader has access to the queue. This scheme will + work even if the global kernel lock were to be removed, because it + doesn't matter who gets in first, as long as only one gets it */ + if (fs_info->devfsd_task == NULL) { + static spinlock_t lock = SPIN_LOCK_UNLOCKED; + + if (!spin_trylock(&lock)) + return -EBUSY; + if (fs_info->devfsd_task != NULL) { /* We lost the race... */ + spin_unlock(&lock); + return -EBUSY; + } + fs_info->devfsd_task = current; + spin_unlock(&lock); + fs_info->devfsd_pgrp = + (process_group(current) == + current->pid) ? 
process_group(current) : 0; + fs_info->devfsd_file = file; + fs_info->devfsd_info = + kmalloc(sizeof *fs_info->devfsd_info, GFP_KERNEL); + if (!fs_info->devfsd_info) { + devfsd_close(inode, file); + return -ENOMEM; + } + } else if (fs_info->devfsd_task != current) + return -EBUSY; + fs_info->devfsd_event_mask = arg; /* Let the masses come forth */ + break; + case DEVFSDIOC_RELEASE_EVENT_QUEUE: + if (fs_info->devfsd_file != file) + return -EPERM; + return devfsd_close(inode, file); + /*break; */ #ifdef CONFIG_DEVFS_DEBUG - case DEVFSDIOC_SET_DEBUG_MASK: - if ( copy_from_user (&ival, (void *) arg, sizeof ival) )return -EFAULT; - devfs_debug = ival; - break; + case DEVFSDIOC_SET_DEBUG_MASK: + if (copy_from_user(&ival, (void *)arg, sizeof ival)) + return -EFAULT; + devfs_debug = ival; + break; #endif - default: - return -ENOIOCTLCMD; - } - return 0; -} /* End Function devfsd_ioctl */ - -static int devfsd_close (struct inode *inode, struct file *file) -{ - struct devfsd_buf_entry *entry, *next; - struct fs_info *fs_info = inode->i_sb->s_fs_info; - - if (fs_info->devfsd_file != file) return 0; - fs_info->devfsd_event_mask = 0; - fs_info->devfsd_file = NULL; - spin_lock (&fs_info->devfsd_buffer_lock); - entry = fs_info->devfsd_first_event; - fs_info->devfsd_first_event = NULL; - fs_info->devfsd_last_event = NULL; - if (fs_info->devfsd_info) - { - kfree (fs_info->devfsd_info); - fs_info->devfsd_info = NULL; - } - spin_unlock (&fs_info->devfsd_buffer_lock); - fs_info->devfsd_pgrp = 0; - fs_info->devfsd_task = NULL; - wake_up (&fs_info->revalidate_wait_queue); - for (; entry; entry = next) - { - next = entry->next; - kmem_cache_free (devfsd_buf_cache, entry); - } - return 0; -} /* End Function devfsd_close */ + default: + return -ENOIOCTLCMD; + } + return 0; +} /* End Function devfsd_ioctl */ + +static int devfsd_close(struct inode *inode, struct file *file) +{ + struct devfsd_buf_entry *entry, *next; + struct fs_info *fs_info = inode->i_sb->s_fs_info; + + if 
(fs_info->devfsd_file != file) + return 0; + fs_info->devfsd_event_mask = 0; + fs_info->devfsd_file = NULL; + spin_lock(&fs_info->devfsd_buffer_lock); + entry = fs_info->devfsd_first_event; + fs_info->devfsd_first_event = NULL; + fs_info->devfsd_last_event = NULL; + if (fs_info->devfsd_info) { + kfree(fs_info->devfsd_info); + fs_info->devfsd_info = NULL; + } + spin_unlock(&fs_info->devfsd_buffer_lock); + fs_info->devfsd_pgrp = 0; + fs_info->devfsd_task = NULL; + wake_up(&fs_info->revalidate_wait_queue); + for (; entry; entry = next) { + next = entry->next; + kmem_cache_free(devfsd_buf_cache, entry); + } + return 0; +} /* End Function devfsd_close */ #ifdef CONFIG_DEVFS_DEBUG -static ssize_t stat_read (struct file *file, char *buf, size_t len, - loff_t *ppos) -{ - ssize_t num; - char txt[80]; - - num = sprintf (txt, "Number of entries: %u number of bytes: %u\n", - stat_num_entries, stat_num_bytes) + 1; - /* Can't seek (pread) on this device */ - if (ppos != &file->f_pos) return -ESPIPE; - if (*ppos >= num) return 0; - if (*ppos + len > num) len = num - *ppos; - if ( copy_to_user (buf, txt + *ppos, len) ) return -EFAULT; - *ppos += len; - return len; -} /* End Function stat_read */ +static ssize_t stat_read(struct file *file, char *buf, size_t len, + loff_t * ppos) +{ + ssize_t num; + char txt[80]; + + num = sprintf(txt, "Number of entries: %u number of bytes: %u\n", + stat_num_entries, stat_num_bytes) + 1; + /* Can't seek (pread) on this device */ + if (ppos != &file->f_pos) + return -ESPIPE; + if (*ppos >= num) + return 0; + if (*ppos + len > num) + len = num - *ppos; + if (copy_to_user(buf, txt + *ppos, len)) + return -EFAULT; + *ppos += len; + return len; +} /* End Function stat_read */ #endif static int __init init_devfs_fs(void) @@ -2793,8 +2809,8 @@ static int __init init_devfs_fs(void) printk(KERN_INFO "%s: %s Richard Gooch (rgooch@atnf.csiro.au)\n", DEVFS_NAME, DEVFS_VERSION); devfsd_buf_cache = kmem_cache_create("devfsd_event", - sizeof (struct 
devfsd_buf_entry), - 0, 0, NULL, NULL); + sizeof(struct devfsd_buf_entry), + 0, 0, NULL, NULL); if (!devfsd_buf_cache) OOPS("(): unable to allocate event slab\n"); #ifdef CONFIG_DEVFS_DEBUG @@ -2809,32 +2825,35 @@ static int __init init_devfs_fs(void) return major; /* And create the entry for ".devfsd" */ - devfsd = _devfs_alloc_entry(".devfsd", 0, S_IFCHR|S_IRUSR|S_IWUSR); - if (devfsd == NULL ) + devfsd = _devfs_alloc_entry(".devfsd", 0, S_IFCHR | S_IRUSR | S_IWUSR); + if (devfsd == NULL) return -ENOMEM; devfsd->u.dev = MKDEV(major, 0); _devfs_append_entry(root_entry, devfsd, NULL); #ifdef CONFIG_DEVFS_DEBUG - stat = _devfs_alloc_entry(".stat", 0, S_IFCHR|S_IRUGO); - if (stat == NULL ) + stat = _devfs_alloc_entry(".stat", 0, S_IFCHR | S_IRUGO); + if (stat == NULL) return -ENOMEM; stat->u.dev = MKDEV(major, 1); - _devfs_append_entry (root_entry, stat, NULL); + _devfs_append_entry(root_entry, stat, NULL); #endif err = register_filesystem(&devfs_fs_type); return err; -} /* End Function init_devfs_fs */ +} /* End Function init_devfs_fs */ -void __init mount_devfs_fs (void) +void __init mount_devfs_fs(void) { - int err; + int err; - if ( !(boot_options & OPTION_MOUNT) ) return; - err = do_mount ("none", "/dev", "devfs", 0, NULL); - if (err == 0) printk (KERN_INFO "Mounted devfs on /dev\n"); - else PRINTK ("(): unable to mount devfs, err: %d\n", err); -} /* End Function mount_devfs_fs */ + if (!(boot_options & OPTION_MOUNT)) + return; + err = do_mount("none", "/dev", "devfs", 0, NULL); + if (err == 0) + printk(KERN_INFO "Mounted devfs on /dev\n"); + else + PRINTK("(): unable to mount devfs, err: %d\n", err); +} /* End Function mount_devfs_fs */ module_init(init_devfs_fs) diff --git a/fs/devfs/util.c b/fs/devfs/util.c index a6ecc014b471..06a2d827e3e5 100644 --- a/fs/devfs/util.c +++ b/fs/devfs/util.c @@ -73,7 +73,6 @@ #include #include - int devfs_register_tape(const char *name) { char tname[32], dest[64]; @@ -86,6 +85,7 @@ int devfs_register_tape(const char *name) 
return n; } + EXPORT_SYMBOL(devfs_register_tape); void devfs_unregister_tape(int num) diff --git a/include/linux/devfs_fs.h b/include/linux/devfs_fs.h index 48da59012021..de236f431877 100644 --- a/include/linux/devfs_fs.h +++ b/include/linux/devfs_fs.h @@ -22,22 +22,20 @@ #define DEVFSD_NOTIFY_CREATE 6 #define DEVFSD_NOTIFY_DELETE 7 -#define DEVFS_PATHLEN 1024 /* Never change this otherwise the - binary interface will change */ - -struct devfsd_notify_struct -{ /* Use native C types to ensure same types in kernel and user space */ - unsigned int type; /* DEVFSD_NOTIFY_* value */ - unsigned int mode; /* Mode of the inode or device entry */ - unsigned int major; /* Major number of device entry */ - unsigned int minor; /* Minor number of device entry */ - unsigned int uid; /* Uid of process, inode or device entry */ - unsigned int gid; /* Gid of process, inode or device entry */ - unsigned int overrun_count; /* Number of lost events */ - unsigned int namelen; /* Number of characters not including '\0' */ - /* The device name MUST come last */ - char devname[DEVFS_PATHLEN]; /* This will be '\0' terminated */ +#define DEVFS_PATHLEN 1024 /* Never change this otherwise the + binary interface will change */ + +struct devfsd_notify_struct { /* Use native C types to ensure same types in kernel and user space */ + unsigned int type; /* DEVFSD_NOTIFY_* value */ + unsigned int mode; /* Mode of the inode or device entry */ + unsigned int major; /* Major number of device entry */ + unsigned int minor; /* Minor number of device entry */ + unsigned int uid; /* Uid of process, inode or device entry */ + unsigned int gid; /* Gid of process, inode or device entry */ + unsigned int overrun_count; /* Number of lost events */ + unsigned int namelen; /* Number of characters not including '\0' */ + /* The device name MUST come last */ + char devname[DEVFS_PATHLEN]; /* This will be '\0' terminated */ }; - -#endif /* _LINUX_DEVFS_FS_H */ +#endif /* _LINUX_DEVFS_FS_H */ diff --git 
a/include/linux/devfs_fs_kernel.h b/include/linux/devfs_fs_kernel.h index 16c78f54f427..89810e73d256 100644 --- a/include/linux/devfs_fs_kernel.h +++ b/include/linux/devfs_fs_kernel.h @@ -12,18 +12,18 @@ #ifdef CONFIG_DEVFS_FS extern int devfs_mk_bdev(dev_t dev, umode_t mode, const char *fmt, ...) - __attribute__((format (printf, 3, 4))); + __attribute__ ((format(printf, 3, 4))); extern int devfs_mk_cdev(dev_t dev, umode_t mode, const char *fmt, ...) - __attribute__((format (printf, 3, 4))); + __attribute__ ((format(printf, 3, 4))); extern int devfs_mk_symlink(const char *name, const char *link); extern int devfs_mk_dir(const char *fmt, ...) - __attribute__((format (printf, 1, 2))); + __attribute__ ((format(printf, 1, 2))); extern void devfs_remove(const char *fmt, ...) - __attribute__((format (printf, 1, 2))); + __attribute__ ((format(printf, 1, 2))); extern int devfs_register_tape(const char *name); extern void devfs_unregister_tape(int num); extern void mount_devfs_fs(void); -#else /* CONFIG_DEVFS_FS */ +#else /* CONFIG_DEVFS_FS */ static inline int devfs_mk_bdev(dev_t dev, umode_t mode, const char *fmt, ...) { return 0; @@ -32,9 +32,9 @@ static inline int devfs_mk_cdev(dev_t dev, umode_t mode, const char *fmt, ...) { return 0; } -static inline int devfs_mk_symlink (const char *name, const char *link) +static inline int devfs_mk_symlink(const char *name, const char *link) { - return 0; + return 0; } static inline int devfs_mk_dir(const char *fmt, ...) { @@ -43,16 +43,16 @@ static inline int devfs_mk_dir(const char *fmt, ...) static inline void devfs_remove(const char *fmt, ...) 
{ } -static inline int devfs_register_tape (const char *name) +static inline int devfs_register_tape(const char *name) { - return -1; + return -1; } static inline void devfs_unregister_tape(int num) { } -static inline void mount_devfs_fs (void) +static inline void mount_devfs_fs(void) { - return; + return; } -#endif /* CONFIG_DEVFS_FS */ -#endif /* _LINUX_DEVFS_FS_KERNEL_H */ +#endif /* CONFIG_DEVFS_FS */ +#endif /* _LINUX_DEVFS_FS_KERNEL_H */ -- cgit v1.2.3 From 0eb217f9b539fccf5aafaba8c9a06e170825f68b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:40:05 -0700 Subject: [PATCH] generalise system_running From: Olof Johansson It's currently a boolean, but that means that system_running goes to zero again when shutting down. So we then use code (in the page allocator) which is only designed to be used during bootup - it is marked __init. So we need to be able to distinguish early boot state from late shutdown state. Rename system_running to system_state and give it the three appropriate states. --- arch/ppc/platforms/pmac_nvram.c | 8 ++++---- include/linux/kernel.h | 8 +++++++- init/main.c | 8 ++------ kernel/kmod.c | 2 +- kernel/printk.c | 3 ++- kernel/sched.c | 3 ++- kernel/sys.c | 8 ++++---- mm/page_alloc.c | 2 +- 8 files changed, 23 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/arch/ppc/platforms/pmac_nvram.c b/arch/ppc/platforms/pmac_nvram.c index f381f3f745f9..3b3f984fb929 100644 --- a/arch/ppc/platforms/pmac_nvram.c +++ b/arch/ppc/platforms/pmac_nvram.c @@ -154,11 +154,11 @@ static unsigned char __pmac pmu_nvram_read_byte(int addr) struct adb_request req; DECLARE_COMPLETION(req_complete); - req.arg = system_running ? &req_complete : NULL; + req.arg = system_state == SYSTEM_RUNNING ? 
&req_complete : NULL; if (pmu_request(&req, pmu_nvram_complete, 3, PMU_READ_NVRAM, (addr >> 8) & 0xff, addr & 0xff)) return 0xff; - if (system_running) + if (system_state == SYSTEM_RUNNING) wait_for_completion(&req_complete); while (!req.complete) pmu_poll(); @@ -170,11 +170,11 @@ static void __pmac pmu_nvram_write_byte(int addr, unsigned char val) struct adb_request req; DECLARE_COMPLETION(req_complete); - req.arg = system_running ? &req_complete : NULL; + req.arg = system_state == SYSTEM_RUNNING ? &req_complete : NULL; if (pmu_request(&req, pmu_nvram_complete, 4, PMU_WRITE_NVRAM, (addr >> 8) & 0xff, addr & 0xff, val)) return; - if (system_running) + if (system_state == SYSTEM_RUNNING) wait_for_completion(&req_complete); while (!req.complete) pmu_poll(); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index e11e79199357..c1171e77c76b 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -109,9 +109,15 @@ static inline void console_verbose(void) extern void bust_spinlocks(int yes); extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */ extern int panic_on_oops; -extern int system_running; +extern int system_state; /* See values below */ extern int tainted; extern const char *print_tainted(void); + +/* Values used for system_state */ +#define SYSTEM_BOOTING 0 +#define SYSTEM_RUNNING 1 +#define SYSTEM_SHUTDOWN 2 + #define TAINT_PROPRIETARY_MODULE (1<<0) #define TAINT_FORCED_MODULE (1<<1) #define TAINT_UNSAFE_SMP (1<<2) diff --git a/init/main.c b/init/main.c index 9d1ed1de14c5..348ce7db30f3 100644 --- a/init/main.c +++ b/init/main.c @@ -94,11 +94,7 @@ extern void driver_init(void); extern void tc_init(void); #endif -/* - * Are we up and running (ie do we have all the infrastructure - * set up) - */ -int system_running; +int system_state; /* SYSTEM_BOOTING/RUNNING/SHUTDOWN */ /* * Boot command-line arguments @@ -613,7 +609,7 @@ static int init(void * unused) */ free_initmem(); unlock_kernel(); - 
system_running = 1; + system_state = SYSTEM_RUNNING; if (sys_open("/dev/console", O_RDWR, 0) < 0) printk("Warning: unable to open an initial console.\n"); diff --git a/kernel/kmod.c b/kernel/kmod.c index 5261de82029b..0002fcd4c554 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -249,7 +249,7 @@ int call_usermodehelper(char *path, char **argv, char **envp, int wait) }; DECLARE_WORK(work, __call_usermodehelper, &sub_info); - if (!system_running) + if (system_state != SYSTEM_RUNNING) return -EBUSY; if (path[0] == '\0') diff --git a/kernel/printk.c b/kernel/printk.c index a7be1f922f34..5f2b3c9bbd6e 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -522,7 +522,8 @@ asmlinkage int printk(const char *fmt, ...) log_level_unknown = 1; } - if (!cpu_online(smp_processor_id()) && !system_running) { + if (!cpu_online(smp_processor_id()) && + system_state != SYSTEM_RUNNING) { /* * Some console drivers may assume that per-cpu resources have * been allocated. So don't allow them to be called by this diff --git a/kernel/sched.c b/kernel/sched.c index d5f21712ffbb..9e19d4c0d4a9 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2982,7 +2982,8 @@ void __might_sleep(char *file, int line) #if defined(in_atomic) static unsigned long prev_jiffy; /* ratelimiting */ - if ((in_atomic() || irqs_disabled()) && system_running) { + if ((in_atomic() || irqs_disabled()) && + system_state == SYSTEM_RUNNING) { if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) return; prev_jiffy = jiffies; diff --git a/kernel/sys.c b/kernel/sys.c index 33a14e13079e..bc498b12edcc 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -436,7 +436,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user switch (cmd) { case LINUX_REBOOT_CMD_RESTART: notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); - system_running = 0; + system_state = SYSTEM_SHUTDOWN; device_shutdown(); printk(KERN_EMERG "Restarting system.\n"); machine_restart(NULL); @@ -452,7 +452,7 @@ asmlinkage 
long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user case LINUX_REBOOT_CMD_HALT: notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); - system_running = 0; + system_state = SYSTEM_SHUTDOWN; device_shutdown(); printk(KERN_EMERG "System halted.\n"); machine_halt(); @@ -462,7 +462,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user case LINUX_REBOOT_CMD_POWER_OFF: notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); - system_running = 0; + system_state = SYSTEM_SHUTDOWN; device_shutdown(); printk(KERN_EMERG "Power down.\n"); machine_power_off(); @@ -478,7 +478,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user buffer[sizeof(buffer) - 1] = '\0'; notifier_call_chain(&reboot_notifier_list, SYS_RESTART, buffer); - system_running = 0; + system_state = SYSTEM_SHUTDOWN; device_shutdown(); printk(KERN_EMERG "Restarting system with command '%s'.\n", buffer); machine_restart(buffer); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5d035d836c15..9764a4e78e45 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -734,7 +734,7 @@ fastcall unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int orde struct page * page; #ifdef CONFIG_NUMA - if (unlikely(!system_running)) + if (unlikely(system_state == SYSTEM_BOOTING)) return get_boot_pages(gfp_mask, order); #endif page = alloc_pages(gfp_mask, order); -- cgit v1.2.3 From efffe9c8536bf9ee28f2f381bd285824bedcdbcd Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:40:55 -0700 Subject: [PATCH] Fix VT open/close race The race is that con_close() can sleep, and drops the BKL while tty->count==1. But another thread can come into init_dev() and will take a new ref against the tty and start using it. But con_close() doesn't notice that new ref and proceeds to null out tty->driver_data while someone else is using the resurrected tty. So the patch serialises con_close() against init_dev() with tty_sem. 
Here's a test app which reproduced the oops instantly on 2-way. It really needs to be run against all tty-capable devices. /* * Run this against a tty which nobody currently has open, such as /dev/tty9 */ #include #include #include #include #include #include void doit(char *filename) { int fd,x; fd = open(filename, O_RDWR); if (fd < 0) { perror("open"); exit(1); } ioctl(fd, KDKBDREP, &x); close(fd); } main(int argc, char *argv[]) { char *filename = argv[1]; for ( ; ; ) doit(filename); } --- drivers/char/tty_io.c | 2 +- drivers/char/vt.c | 14 ++++++++++++++ include/linux/tty.h | 3 +++ 3 files changed, 18 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index 6bb5ae7e41a5..0ba52078f637 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -123,7 +123,7 @@ LIST_HEAD(tty_drivers); /* linked list of tty drivers */ struct tty_ldisc ldiscs[NR_LDISCS]; /* line disc dispatch table */ /* Semaphore to protect creating and releasing a tty */ -static DECLARE_MUTEX(tty_sem); +DECLARE_MUTEX(tty_sem); #ifdef CONFIG_UNIX98_PTYS extern struct tty_driver *ptm_driver; /* Unix98 pty masters; for /dev/ptmx */ diff --git a/drivers/char/vt.c b/drivers/char/vt.c index a5ddfc5ac9c1..2febed52e19f 100644 --- a/drivers/char/vt.c +++ b/drivers/char/vt.c @@ -2480,8 +2480,16 @@ static int con_open(struct tty_struct *tty, struct file *filp) return ret; } +/* + * We take tty_sem in here to prevent another thread from coming in via init_dev + * and taking a ref against the tty while we're in the process of forgetting + * about it and cleaning things up. + * + * This is because vcs_remove_devfs() can sleep and will drop the BKL. 
+ */ static void con_close(struct tty_struct *tty, struct file *filp) { + down(&tty_sem); acquire_console_sem(); if (tty && tty->count == 1) { struct vt_struct *vt; @@ -2492,9 +2500,15 @@ static void con_close(struct tty_struct *tty, struct file *filp) tty->driver_data = 0; release_console_sem(); vcs_remove_devfs(tty); + up(&tty_sem); + /* + * tty_sem is released, but we still hold BKL, so there is + * still exclusion against init_dev() + */ return; } release_console_sem(); + up(&tty_sem); } static void vc_init(unsigned int currcons, unsigned int rows, diff --git a/include/linux/tty.h b/include/linux/tty.h index fbcc401e8b28..6e61f3b27157 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -363,6 +363,9 @@ extern void tty_flip_buffer_push(struct tty_struct *tty); extern int tty_get_baud_rate(struct tty_struct *tty); extern int tty_termios_baud_rate(struct termios *termios); +struct semaphore; +extern struct semaphore tty_sem; + /* n_tty.c */ extern struct tty_ldisc tty_ldisc_N_TTY; -- cgit v1.2.3 From ee28db843649533f5650186251ae4a8bd49a3da9 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:41:07 -0700 Subject: [PATCH] i4l: kernelcapi receive workqueue and locking rework From: Armin Schindler With this patch the ISDN kernel CAPI code uses a per application workqueue with proper locking to prevent message re-ordering due to the fact a workqueue may run on another CPU at the same time. Also some locks for internal data are added. Removed global recv_queue work, use per application workqueue. Added proper locking mechanisms for application, controller and application workqueue function. Increased max. number of possible applications and controllers. 
--- drivers/isdn/capi/kcapi.c | 96 ++++++++++++++++++++++++++++++++-------------- include/linux/kernelcapi.h | 11 ++++-- 2 files changed, 75 insertions(+), 32 deletions(-) (limited to 'include') diff --git a/drivers/isdn/capi/kcapi.c b/drivers/isdn/capi/kcapi.c index 064dc3003716..8524997b10b6 100644 --- a/drivers/isdn/capi/kcapi.c +++ b/drivers/isdn/capi/kcapi.c @@ -1,4 +1,4 @@ -/* $Id: kcapi.c,v 1.1.2.7 2004/03/16 08:01:47 armin Exp $ +/* $Id: kcapi.c,v 1.1.2.8 2004/03/26 19:57:20 armin Exp $ * * Kernel CAPI 2.0 Module * @@ -31,7 +31,7 @@ #include #endif -static char *revision = "$Revision: 1.1.2.7 $"; +static char *revision = "$Revision: 1.1.2.8 $"; /* ------------------------------------------------------------- */ @@ -63,13 +63,13 @@ static char capi_manufakturer[64] = "AVM Berlin"; LIST_HEAD(capi_drivers); rwlock_t capi_drivers_list_lock = RW_LOCK_UNLOCKED; +static rwlock_t application_lock = RW_LOCK_UNLOCKED; +static DECLARE_MUTEX(controller_sem); + struct capi20_appl *capi_applications[CAPI_MAXAPPL]; struct capi_ctr *capi_cards[CAPI_MAXCONTR]; static int ncards; -static struct sk_buff_head recv_queue; - -static struct work_struct tq_recv_notify; /* -------- controller ref counting -------------------------------------- */ @@ -174,7 +174,7 @@ static void notify_up(u32 contr) for (applid = 1; applid <= CAPI_MAXAPPL; applid++) { ap = get_capi_appl_by_nr(applid); - if (ap && ap->callback) + if (ap && ap->callback && !ap->release_in_progress) ap->callback(KCI_CONTRUP, contr, &card->profile); } } @@ -192,7 +192,7 @@ static void notify_down(u32 contr) for (applid = 1; applid <= CAPI_MAXAPPL; applid++) { ap = get_capi_appl_by_nr(applid); - if (ap && ap->callback) + if (ap && ap->callback && !ap->release_in_progress) ap->callback(KCI_CONTRDOWN, contr, 0); } } @@ -237,38 +237,39 @@ static int notify_push(unsigned int cmd, u32 controller, u16 applid, u32 ncci) /* -------- Receiver ------------------------------------------ */ -static void recv_handler(void *dummy) 
+static void recv_handler(void *_ap) { struct sk_buff *skb; - struct capi20_appl *ap; + struct capi20_appl *ap = (struct capi20_appl *) _ap; - while ((skb = skb_dequeue(&recv_queue)) != 0) { - ap = get_capi_appl_by_nr(CAPIMSG_APPID(skb->data)); - if (!ap) { - printk(KERN_ERR "kcapi: recv_handler: applid %d ? (%s)\n", - CAPIMSG_APPID(skb->data), capi_message2str(skb->data)); - kfree_skb(skb); - continue; - } + if ((!ap) || (ap->release_in_progress)) + return; + down(&ap->recv_sem); + while ((skb = skb_dequeue(&ap->recv_queue))) { if (CAPIMSG_CMD(skb->data) == CAPI_DATA_B3_IND) ap->nrecvdatapkt++; else ap->nrecvctlpkt++; + ap->recv_message(ap, skb); } + up(&ap->recv_sem); } void capi_ctr_handle_message(struct capi_ctr * card, u16 appl, struct sk_buff *skb) { + struct capi20_appl *ap; int showctl = 0; u8 cmd, subcmd; + unsigned long flags; if (card->cardstate != CARD_RUNNING) { printk(KERN_INFO "kcapi: controller %d not active, got: %s", card->cnr, capi_message2str(skb->data)); goto error; } + cmd = CAPIMSG_COMMAND(skb->data); subcmd = CAPIMSG_SUBCOMMAND(skb->data); if (cmd == CAPI_DATA_B3 && subcmd == CAPI_IND) { @@ -293,8 +294,19 @@ void capi_ctr_handle_message(struct capi_ctr * card, u16 appl, struct sk_buff *s } } - skb_queue_tail(&recv_queue, skb); - schedule_work(&tq_recv_notify); + + read_lock_irqsave(&application_lock, flags); + ap = get_capi_appl_by_nr(CAPIMSG_APPID(skb->data)); + if ((!ap) || (ap->release_in_progress)) { + read_unlock_irqrestore(&application_lock, flags); + printk(KERN_ERR "kcapi: handle_message: applid %d state released (%s)\n", + CAPIMSG_APPID(skb->data), capi_message2str(skb->data)); + goto error; + } + skb_queue_tail(&ap->recv_queue, skb); + schedule_work(&ap->recv_work); + read_unlock_irqrestore(&application_lock, flags); + return; error: @@ -310,11 +322,13 @@ void capi_ctr_ready(struct capi_ctr * card) card->cardstate = CARD_RUNNING; + down(&controller_sem); for (appl = 1; appl <= CAPI_MAXAPPL; appl++) { ap = get_capi_appl_by_nr(appl); 
- if (!ap) continue; + if (!ap || ap->release_in_progress) continue; register_appl(card, appl, &ap->rparam); } + up(&controller_sem); printk(KERN_NOTICE "kcapi: card %d \"%s\" ready.\n", card->cnr, card->name); @@ -342,7 +356,7 @@ void capi_ctr_reseted(struct capi_ctr * card) for (appl = 1; appl <= CAPI_MAXAPPL; appl++) { struct capi20_appl *ap = get_capi_appl_by_nr(appl); - if (!ap) + if (!ap || ap->release_in_progress) continue; capi_ctr_put(card); @@ -382,16 +396,21 @@ attach_capi_ctr(struct capi_ctr *card) { int i; + down(&controller_sem); + for (i = 0; i < CAPI_MAXCONTR; i++) { if (capi_cards[i] == NULL) break; } if (i == CAPI_MAXCONTR) { + up(&controller_sem); printk(KERN_ERR "kcapi: out of controller slots\n"); return -EBUSY; } capi_cards[i] = card; + up(&controller_sem); + card->nrecvctlpkt = 0; card->nrecvdatapkt = 0; card->nsentctlpkt = 0; @@ -480,18 +499,23 @@ u16 capi20_register(struct capi20_appl *ap) { int i; u16 applid; + unsigned long flags; DBG(""); if (ap->rparam.datablklen < 128) return CAPI_LOGBLKSIZETOSMALL; + write_lock_irqsave(&application_lock, flags); + for (applid = 1; applid <= CAPI_MAXAPPL; applid++) { if (capi_applications[applid - 1] == NULL) break; } - if (applid > CAPI_MAXAPPL) + if (applid > CAPI_MAXAPPL) { + write_unlock_irqrestore(&application_lock, flags); return CAPI_TOOMANYAPPLS; + } ap->applid = applid; capi_applications[applid - 1] = ap; @@ -501,12 +525,21 @@ u16 capi20_register(struct capi20_appl *ap) ap->nsentctlpkt = 0; ap->nsentdatapkt = 0; ap->callback = 0; + init_MUTEX(&ap->recv_sem); + skb_queue_head_init(&ap->recv_queue); + INIT_WORK(&ap->recv_work, recv_handler, (void *)ap); + ap->release_in_progress = 0; + + write_unlock_irqrestore(&application_lock, flags); + down(&controller_sem); for (i = 0; i < CAPI_MAXCONTR; i++) { if (!capi_cards[i] || capi_cards[i]->cardstate != CARD_RUNNING) continue; register_appl(capi_cards[i], applid, &ap->rparam); } + up(&controller_sem); + if (showcapimsgs & 1) { printk(KERN_DEBUG 
"kcapi: appl %d up\n", applid); } @@ -519,15 +552,26 @@ EXPORT_SYMBOL(capi20_register); u16 capi20_release(struct capi20_appl *ap) { int i; + unsigned long flags; DBG("applid %#x", ap->applid); + write_lock_irqsave(&application_lock, flags); + ap->release_in_progress = 1; + capi_applications[ap->applid - 1] = NULL; + write_unlock_irqrestore(&application_lock, flags); + + down(&controller_sem); for (i = 0; i < CAPI_MAXCONTR; i++) { if (!capi_cards[i] || capi_cards[i]->cardstate != CARD_RUNNING) continue; release_appl(capi_cards[i], ap->applid); } - capi_applications[ap->applid - 1] = NULL; + up(&controller_sem); + + flush_scheduled_work(); + skb_queue_purge(&ap->recv_queue); + if (showcapimsgs & 1) { printk(KERN_DEBUG "kcapi: appl %d down\n", ap->applid); } @@ -547,7 +591,7 @@ u16 capi20_put_message(struct capi20_appl *ap, struct sk_buff *skb) if (ncards == 0) return CAPI_REGNOTINSTALLED; - if (ap->applid == 0) + if ((ap->applid == 0) || ap->release_in_progress) return CAPI_ILLAPPNR; if (skb->len < 12 || !capi_cmd_valid(CAPIMSG_COMMAND(skb->data)) @@ -925,10 +969,6 @@ static int __init kcapi_init(void) char *p; char rev[32]; - skb_queue_head_init(&recv_queue); - - INIT_WORK(&tq_recv_notify, recv_handler, NULL); - kcapi_proc_init(); if ((p = strchr(revision, ':')) != 0 && p[1]) { diff --git a/include/linux/kernelcapi.h b/include/linux/kernelcapi.h index b982d5b77ae9..1d4b1b15d0b8 100644 --- a/include/linux/kernelcapi.h +++ b/include/linux/kernelcapi.h @@ -10,10 +10,8 @@ #ifndef __KERNELCAPI_H__ #define __KERNELCAPI_H__ -#include - -#define CAPI_MAXAPPL 128 /* maximum number of applications */ -#define CAPI_MAXCONTR 16 /* maximum number of controller */ +#define CAPI_MAXAPPL 240 /* maximum number of applications */ +#define CAPI_MAXCONTR 32 /* maximum number of controller */ #define CAPI_MAXDATAWINDOW 8 @@ -47,6 +45,7 @@ typedef struct kcapi_carddef { #ifdef __KERNEL__ +#include #include #define KCI_CONTRUP 0 /* arg: struct capi_profile */ @@ -63,6 +62,10 @@ struct 
capi20_appl { unsigned long nrecvdatapkt; unsigned long nsentctlpkt; unsigned long nsentdatapkt; + struct semaphore recv_sem; + struct sk_buff_head recv_queue; + struct work_struct recv_work; + int release_in_progress; /* ugly hack to allow for notification of added/removed * controllers. The Right Way (tm) is known. XXX -- cgit v1.2.3 From b283f09cf8f51c29bf90e42e22099f76d0f33378 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:41:20 -0700 Subject: [PATCH] Fix get_wchan() FIXME wrt. order of functions From: William Lee Irwin III This addresses the issue with get_wchan() that the various functions acting as scheduling-related primitives are not, in fact, contiguous in the text segment. It creates an ELF section for scheduling primitives to be placed in, and places currently-detected (i.e. skipped during stack decoding) scheduling primitives and others like io_schedule() and down(), which are currently missed by get_wchan() code, into this section also. The net effects are more reliability of get_wchan()'s results and the new ability, made use of by this code, to arbitrarily place scheduling primitives in the source code without disturbing get_wchan()'s accuracy. Suggestions by Arnd Bergmann and Matthew Wilcox regarding reducing the invasiveness of the patch were incorporated during prior rounds of review. I've at least tried to sweep all arches in this patch. 
--- arch/alpha/kernel/process.c | 2 -- arch/alpha/kernel/semaphore.c | 9 ++++---- arch/alpha/kernel/vmlinux.lds.S | 1 + arch/arm/kernel/process.c | 2 -- arch/arm/kernel/semaphore.c | 8 ++++--- arch/arm/kernel/vmlinux.lds.S | 1 + arch/arm26/kernel/process.c | 2 -- arch/arm26/kernel/semaphore.c | 8 ++++--- arch/arm26/kernel/vmlinux-arm26-xip.lds.in | 1 + arch/arm26/kernel/vmlinux-arm26.lds.in | 1 + arch/cris/arch-v10/kernel/process.c | 3 +-- arch/cris/arch-v10/vmlinux.lds.S | 1 + arch/cris/kernel/semaphore.c | 5 ++-- arch/h8300/kernel/process.c | 3 --- arch/h8300/kernel/semaphore.c | 5 ++-- arch/h8300/kernel/vmlinux.lds.S | 1 + arch/i386/kernel/process.c | 2 -- arch/i386/kernel/semaphore.c | 17 +++++++------- arch/i386/kernel/vmlinux.lds.S | 1 + arch/ia64/kernel/process.c | 2 -- arch/ia64/kernel/semaphore.c | 7 +++--- arch/ia64/kernel/vmlinux.lds.S | 1 + arch/m68k/kernel/process.c | 5 ---- arch/m68k/kernel/semaphore.c | 5 ++-- arch/m68k/kernel/vmlinux-std.lds | 1 + arch/m68k/kernel/vmlinux-sun3.lds | 1 + arch/m68knommu/kernel/process.c | 5 ---- arch/m68knommu/kernel/semaphore.c | 5 ++-- arch/m68knommu/kernel/vmlinux.lds.S | 1 + arch/mips/kernel/process.c | 2 -- arch/mips/kernel/semaphore.c | 5 ++-- arch/mips/kernel/vmlinux.lds.S | 1 + arch/parisc/kernel/semaphore.c | 5 ++-- arch/parisc/kernel/vmlinux.lds.S | 1 + arch/ppc/kernel/process.c | 2 -- arch/ppc/kernel/semaphore.c | 5 ++-- arch/ppc/kernel/vmlinux.lds.S | 1 + arch/ppc64/kernel/process.c | 2 -- arch/ppc64/kernel/semaphore.c | 5 ++-- arch/ppc64/kernel/vmlinux.lds.S | 1 + arch/s390/kernel/process.c | 2 -- arch/s390/kernel/semaphore.c | 5 ++-- arch/s390/kernel/vmlinux.lds.S | 1 + arch/sh/kernel/process.c | 4 +--- arch/sh/kernel/semaphore.c | 5 ++-- arch/sh/kernel/vmlinux.lds.S | 1 + arch/sparc/kernel/process.c | 4 +--- arch/sparc/kernel/semaphore.c | 5 ++-- arch/sparc/kernel/vmlinux.lds.S | 1 + arch/sparc/lib/rwsem.S | 3 ++- arch/sparc64/kernel/process.c | 4 +--- arch/sparc64/kernel/semaphore.c | 9 ++++---- 
arch/sparc64/kernel/vmlinux.lds.S | 1 + arch/sparc64/lib/rwsem.c | 5 ++-- arch/v850/kernel/process.c | 3 --- arch/v850/kernel/semaphore.c | 5 ++-- arch/v850/kernel/vmlinux.lds.S | 1 + arch/x86_64/kernel/process.c | 2 -- arch/x86_64/kernel/semaphore.c | 5 ++-- arch/x86_64/kernel/vmlinux.lds.S | 1 + arch/x86_64/lib/thunk.S | 3 ++- include/asm-generic/vmlinux.lds.h | 5 ++++ include/linux/init.h | 2 ++ include/linux/sched.h | 2 ++ kernel/sched.c | 37 ++++++++++++++++-------------- kernel/timer.c | 4 ++-- lib/rwsem.c | 5 ++-- 67 files changed, 137 insertions(+), 124 deletions(-) (limited to 'include') diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index e427bae12ffe..297e4b48bfe2 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -513,8 +513,6 @@ thread_saved_pc(task_t *t) /* * These bracket the sleeping functions.. */ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) diff --git a/arch/alpha/kernel/semaphore.c b/arch/alpha/kernel/semaphore.c index b52a0df303fe..4d60a0ccd6f7 100644 --- a/arch/alpha/kernel/semaphore.c +++ b/arch/alpha/kernel/semaphore.c @@ -7,6 +7,7 @@ #include #include +#include /* * This is basically the PPC semaphore scheme ported to use @@ -60,7 +61,7 @@ static inline int __sem_update_count(struct semaphore *sem, int incr) * Either form may be used in conjunction with "up()". 
*/ -void +void __sched __down_failed(struct semaphore *sem) { struct task_struct *tsk = current; @@ -101,7 +102,7 @@ __down_failed(struct semaphore *sem) #endif } -int +int __sched __down_failed_interruptible(struct semaphore *sem) { struct task_struct *tsk = current; @@ -159,7 +160,7 @@ __up_wakeup(struct semaphore *sem) wake_up(&sem->wait); } -void +void __sched down(struct semaphore *sem) { #if WAITQUEUE_DEBUG @@ -173,7 +174,7 @@ down(struct semaphore *sem) __down(sem); } -int +int __sched down_interruptible(struct semaphore *sem) { #if WAITQUEUE_DEBUG diff --git a/arch/alpha/kernel/vmlinux.lds.S b/arch/alpha/kernel/vmlinux.lds.S index 7afd00d5d46b..d159b8f0d022 100644 --- a/arch/alpha/kernel/vmlinux.lds.S +++ b/arch/alpha/kernel/vmlinux.lds.S @@ -17,6 +17,7 @@ SECTIONS _text = .; /* Text and read-only data */ .text : { *(.text) + SCHED_TEXT *(.fixup) *(.gnu.warning) } :kernel diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index 863c4076daad..8423921e821a 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -414,8 +414,6 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) /* * These bracket the sleeping functions.. 
*/ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) diff --git a/arch/arm/kernel/semaphore.c b/arch/arm/kernel/semaphore.c index a50902e8bec7..da39eb3dca31 100644 --- a/arch/arm/kernel/semaphore.c +++ b/arch/arm/kernel/semaphore.c @@ -13,6 +13,7 @@ */ #include #include +#include #include @@ -54,7 +55,7 @@ void __up(struct semaphore *sem) static spinlock_t semaphore_lock = SPIN_LOCK_UNLOCKED; -void __down(struct semaphore * sem) +void __sched __down(struct semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -87,7 +88,7 @@ void __down(struct semaphore * sem) wake_up(&sem->wait); } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -176,7 +177,8 @@ int __down_trylock(struct semaphore * sem) * registers (r0 to r3 and lr), but not ip, as we use it as a return * value in some cases.. */ -asm(" .align 5 \n\ +asm(" .section .sched.text \n\ + .align 5 \n\ .globl __down_failed \n\ __down_failed: \n\ stmfd sp!, {r0 - r3, lr} \n\ diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S index 56af3401b34d..a5db0ddca6a4 100644 --- a/arch/arm/kernel/vmlinux.lds.S +++ b/arch/arm/kernel/vmlinux.lds.S @@ -73,6 +73,7 @@ SECTIONS .text : { /* Real text segment */ _text = .; /* Text and read-only data */ *(.text) + SCHED_TEXT *(.fixup) *(.gnu.warning) *(.rodata) diff --git a/arch/arm26/kernel/process.c b/arch/arm26/kernel/process.c index 09a2f52ad8a8..ce23571617a1 100644 --- a/arch/arm26/kernel/process.c +++ b/arch/arm26/kernel/process.c @@ -400,8 +400,6 @@ pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) /* * These bracket the sleeping functions.. 
*/ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) diff --git a/arch/arm26/kernel/semaphore.c b/arch/arm26/kernel/semaphore.c index e7964ce1d0d9..60591a738592 100644 --- a/arch/arm26/kernel/semaphore.c +++ b/arch/arm26/kernel/semaphore.c @@ -15,6 +15,7 @@ #include #include #include +#include #include @@ -56,7 +57,7 @@ void __up(struct semaphore *sem) static spinlock_t semaphore_lock = SPIN_LOCK_UNLOCKED; -void __down(struct semaphore * sem) +void __sched __down(struct semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -89,7 +90,7 @@ void __down(struct semaphore * sem) wake_up(&sem->wait); } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -178,7 +179,8 @@ int __down_trylock(struct semaphore * sem) * registers (r0 to r3 and lr), but not ip, as we use it as a return * value in some cases.. 
*/ -asm(" .align 5 \n\ +asm(" .section .sched.text \n\ + .align 5 \n\ .globl __down_failed \n\ __down_failed: \n\ stmfd sp!, {r0 - r3, lr} \n\ diff --git a/arch/arm26/kernel/vmlinux-arm26-xip.lds.in b/arch/arm26/kernel/vmlinux-arm26-xip.lds.in index 602a77c022d7..61eedf0bc42f 100644 --- a/arch/arm26/kernel/vmlinux-arm26-xip.lds.in +++ b/arch/arm26/kernel/vmlinux-arm26-xip.lds.in @@ -66,6 +66,7 @@ SECTIONS .text : { /* Real text segment */ _text = .; /* Text and read-only data */ *(.text) + SCHED_TEXT *(.fixup) *(.gnu.warning) *(.rodata) diff --git a/arch/arm26/kernel/vmlinux-arm26.lds.in b/arch/arm26/kernel/vmlinux-arm26.lds.in index 8782fe36f0a8..2393f3805a49 100644 --- a/arch/arm26/kernel/vmlinux-arm26.lds.in +++ b/arch/arm26/kernel/vmlinux-arm26.lds.in @@ -67,6 +67,7 @@ SECTIONS .text : { /* Real text segment */ _text = .; /* Text and read-only data */ *(.text) + SCHED_TEXT *(.fixup) *(.gnu.warning) *(.rodata) diff --git a/arch/cris/arch-v10/kernel/process.c b/arch/cris/arch-v10/kernel/process.c index 62e3a4fbf33a..c785b54e6cbd 100644 --- a/arch/cris/arch-v10/kernel/process.c +++ b/arch/cris/arch-v10/kernel/process.c @@ -16,6 +16,7 @@ #include #include #include +#include #ifdef CONFIG_ETRAX_GPIO void etrax_gpio_wake_up_check(void); /* drivers/gpio.c */ @@ -216,8 +217,6 @@ asmlinkage int sys_execve(const char *fname, char **argv, char **envp, * These bracket the sleeping functions.. 
*/ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) diff --git a/arch/cris/arch-v10/vmlinux.lds.S b/arch/cris/arch-v10/vmlinux.lds.S index b2c27e147f29..6b73a2c0dad8 100644 --- a/arch/cris/arch-v10/vmlinux.lds.S +++ b/arch/cris/arch-v10/vmlinux.lds.S @@ -25,6 +25,7 @@ SECTIONS __stext = .; .text : { *(.text) + SCHED_TEXT *(.fixup) *(.text.__*) } diff --git a/arch/cris/kernel/semaphore.c b/arch/cris/kernel/semaphore.c index d62b355e1706..b884263d3cd4 100644 --- a/arch/cris/kernel/semaphore.c +++ b/arch/cris/kernel/semaphore.c @@ -4,6 +4,7 @@ */ #include +#include #include /* @@ -94,7 +95,7 @@ void __up(struct semaphore *sem) tsk->state = TASK_RUNNING; \ remove_wait_queue(&sem->wait, &wait); -void __down(struct semaphore * sem) +void __sched __down(struct semaphore * sem) { DOWN_VAR DOWN_HEAD(TASK_UNINTERRUPTIBLE) @@ -104,7 +105,7 @@ void __down(struct semaphore * sem) DOWN_TAIL(TASK_UNINTERRUPTIBLE) } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { int ret = 0; DOWN_VAR diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c index bd6ccd542399..8640ea20dba0 100644 --- a/arch/h8300/kernel/process.c +++ b/arch/h8300/kernel/process.c @@ -264,8 +264,6 @@ out: /* * These bracket the sleeping functions.. */ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) @@ -289,7 +287,6 @@ unsigned long get_wchan(struct task_struct *p) fp >= 8184+stack_page) return 0; pc = ((unsigned long *)fp)[1]; - /* FIXME: This depends on the order of these functions. 
*/ if (pc < first_sched || pc >= last_sched) return pc; fp = *(unsigned long *) fp; diff --git a/arch/h8300/kernel/semaphore.c b/arch/h8300/kernel/semaphore.c index 690efce1e437..1ebb79baaa8c 100644 --- a/arch/h8300/kernel/semaphore.c +++ b/arch/h8300/kernel/semaphore.c @@ -5,6 +5,7 @@ #include #include +#include #include #ifndef CONFIG_RMW_INSNS @@ -95,7 +96,7 @@ void __up(struct semaphore *sem) current->state = TASK_RUNNING; \ remove_wait_queue(&sem->wait, &wait); -void __down(struct semaphore * sem) +void __sched __down(struct semaphore * sem) { DECLARE_WAITQUEUE(wait, current); @@ -106,7 +107,7 @@ void __down(struct semaphore * sem) DOWN_TAIL(TASK_UNINTERRUPTIBLE) } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { DECLARE_WAITQUEUE(wait, current); int ret = 0; diff --git a/arch/h8300/kernel/vmlinux.lds.S b/arch/h8300/kernel/vmlinux.lds.S index 60787f07eb2b..3a643954a8fe 100644 --- a/arch/h8300/kernel/vmlinux.lds.S +++ b/arch/h8300/kernel/vmlinux.lds.S @@ -82,6 +82,7 @@ SECTIONS #endif __stext = . ; *(.text) + SCHED_TEXT . = ALIGN(0x4) ; *(.exit.text) *(.text.*) diff --git a/arch/i386/kernel/process.c b/arch/i386/kernel/process.c index 3495f1aedf67..7fed9d3823ed 100644 --- a/arch/i386/kernel/process.c +++ b/arch/i386/kernel/process.c @@ -632,8 +632,6 @@ out: /* * These bracket the sleeping functions.. 
*/ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) #define top_esp (THREAD_SIZE - sizeof(unsigned long)) diff --git a/arch/i386/kernel/semaphore.c b/arch/i386/kernel/semaphore.c index 5acd544f0cbd..073912cfcf44 100644 --- a/arch/i386/kernel/semaphore.c +++ b/arch/i386/kernel/semaphore.c @@ -15,6 +15,7 @@ #include #include #include +#include #include /* @@ -53,7 +54,7 @@ asmlinkage void __up(struct semaphore *sem) wake_up(&sem->wait); } -asmlinkage void __down(struct semaphore * sem) +asmlinkage void __sched __down(struct semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -90,7 +91,7 @@ asmlinkage void __down(struct semaphore * sem) tsk->state = TASK_RUNNING; } -asmlinkage int __down_interruptible(struct semaphore * sem) +asmlinkage int __sched __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -187,7 +188,7 @@ asmlinkage int __down_trylock(struct semaphore * sem) * value.. 
*/ asm( -".text\n" +".section .sched.text\n" ".align 4\n" ".globl __down_failed\n" "__down_failed:\n\t" @@ -210,7 +211,7 @@ asm( ); asm( -".text\n" +".section .sched.text\n" ".align 4\n" ".globl __down_failed_interruptible\n" "__down_failed_interruptible:\n\t" @@ -231,7 +232,7 @@ asm( ); asm( -".text\n" +".section .sched.text\n" ".align 4\n" ".globl __down_failed_trylock\n" "__down_failed_trylock:\n\t" @@ -252,7 +253,7 @@ asm( ); asm( -".text\n" +".section .sched.text\n" ".align 4\n" ".globl __up_wakeup\n" "__up_wakeup:\n\t" @@ -271,7 +272,7 @@ asm( */ #if defined(CONFIG_SMP) asm( -".text\n" +".section .sched.text\n" ".align 4\n" ".globl __write_lock_failed\n" "__write_lock_failed:\n\t" @@ -285,7 +286,7 @@ asm( ); asm( -".text\n" +".section .sched.text\n" ".align 4\n" ".globl __read_lock_failed\n" "__read_lock_failed:\n\t" diff --git a/arch/i386/kernel/vmlinux.lds.S b/arch/i386/kernel/vmlinux.lds.S index 3623d7e2934a..0253c586547b 100644 --- a/arch/i386/kernel/vmlinux.lds.S +++ b/arch/i386/kernel/vmlinux.lds.S @@ -16,6 +16,7 @@ SECTIONS _text = .; /* Text and read-only data */ .text : { *(.text) + SCHED_TEXT *(.fixup) *(.gnu.warning) } = 0x9090 diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index a1d09d5c91c4..0d245cbcd1f6 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -660,8 +660,6 @@ get_wchan (struct task_struct *p) /* * These bracket the sleeping functions.. */ - extern void scheduling_functions_start_here(void); - extern void scheduling_functions_end_here(void); # define first_sched ((unsigned long) scheduling_functions_start_here) # define last_sched ((unsigned long) scheduling_functions_end_here) diff --git a/arch/ia64/kernel/semaphore.c b/arch/ia64/kernel/semaphore.c index f3926a3c4d73..2724ef3fbae2 100644 --- a/arch/ia64/kernel/semaphore.c +++ b/arch/ia64/kernel/semaphore.c @@ -24,6 +24,7 @@ * where we want to avoid any extra jumps and calls. 
*/ #include +#include #include #include @@ -44,8 +45,7 @@ __up (struct semaphore *sem) wake_up(&sem->wait); } -void -__down (struct semaphore *sem) +void __sched __down (struct semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -82,8 +82,7 @@ __down (struct semaphore *sem) tsk->state = TASK_RUNNING; } -int -__down_interruptible (struct semaphore * sem) +int __sched __down_interruptible (struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S index e5589e49d9da..5c45718a9c82 100644 --- a/arch/ia64/kernel/vmlinux.lds.S +++ b/arch/ia64/kernel/vmlinux.lds.S @@ -41,6 +41,7 @@ SECTIONS { *(.text.ivt) *(.text) + SCHED_TEXT *(.gnu.linkonce.t*) } .text2 : AT(ADDR(.text2) - LOAD_OFFSET) diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index 8d72a5c5b0c7..fc2c753c332b 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -65,8 +65,6 @@ asmlinkage void ret_from_fork(void); */ unsigned long thread_saved_pc(struct task_struct *tsk) { - extern void scheduling_functions_start_here(void); - extern void scheduling_functions_end_here(void); struct switch_stack *sw = (struct switch_stack *)tsk->thread.ksp; /* Check whether the thread is blocked in resume() */ if (sw->retpc > (unsigned long)scheduling_functions_start_here && @@ -387,8 +385,6 @@ out: /* * These bracket the sleeping functions.. */ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) @@ -407,7 +403,6 @@ unsigned long get_wchan(struct task_struct *p) fp >= 8184+stack_page) return 0; pc = ((unsigned long *)fp)[1]; - /* FIXME: This depends on the order of these functions. 
*/ if (pc < first_sched || pc >= last_sched) return pc; fp = *(unsigned long *) fp; diff --git a/arch/m68k/kernel/semaphore.c b/arch/m68k/kernel/semaphore.c index 690efce1e437..1ebb79baaa8c 100644 --- a/arch/m68k/kernel/semaphore.c +++ b/arch/m68k/kernel/semaphore.c @@ -5,6 +5,7 @@ #include #include +#include #include #ifndef CONFIG_RMW_INSNS @@ -95,7 +96,7 @@ void __up(struct semaphore *sem) current->state = TASK_RUNNING; \ remove_wait_queue(&sem->wait, &wait); -void __down(struct semaphore * sem) +void __sched __down(struct semaphore * sem) { DECLARE_WAITQUEUE(wait, current); @@ -106,7 +107,7 @@ void __down(struct semaphore * sem) DOWN_TAIL(TASK_UNINTERRUPTIBLE) } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { DECLARE_WAITQUEUE(wait, current); int ret = 0; diff --git a/arch/m68k/kernel/vmlinux-std.lds b/arch/m68k/kernel/vmlinux-std.lds index bd41fc992169..6dc62684c7b9 100644 --- a/arch/m68k/kernel/vmlinux-std.lds +++ b/arch/m68k/kernel/vmlinux-std.lds @@ -12,6 +12,7 @@ SECTIONS _text = .; /* Text and read-only data */ .text : { *(.text) + SCHED_TEXT *(.fixup) *(.gnu.warning) } = 0x4e75 diff --git a/arch/m68k/kernel/vmlinux-sun3.lds b/arch/m68k/kernel/vmlinux-sun3.lds index 2e81cde14987..f293e567192c 100644 --- a/arch/m68k/kernel/vmlinux-sun3.lds +++ b/arch/m68k/kernel/vmlinux-sun3.lds @@ -13,6 +13,7 @@ SECTIONS .text : { *(.head) *(.text) + SCHED_TEXT *(.fixup) *(.gnu.warning) } = 0x4e75 diff --git a/arch/m68knommu/kernel/process.c b/arch/m68knommu/kernel/process.c index c8b87371641a..896d596a1bd8 100644 --- a/arch/m68knommu/kernel/process.c +++ b/arch/m68knommu/kernel/process.c @@ -406,8 +406,6 @@ out: /* * These bracket the sleeping functions.. 
*/ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) @@ -426,7 +424,6 @@ unsigned long get_wchan(struct task_struct *p) fp >= 8184+stack_page) return 0; pc = ((unsigned long *)fp)[1]; - /* FIXME: This depends on the order of these functions. */ if (pc < first_sched || pc >= last_sched) return pc; fp = *(unsigned long *) fp; @@ -439,8 +436,6 @@ unsigned long get_wchan(struct task_struct *p) */ unsigned long thread_saved_pc(struct task_struct *tsk) { - extern void scheduling_functions_start_here(void); - extern void scheduling_functions_end_here(void); struct switch_stack *sw = (struct switch_stack *)tsk->thread.ksp; /* Check whether the thread is blocked in resume() */ diff --git a/arch/m68knommu/kernel/semaphore.c b/arch/m68knommu/kernel/semaphore.c index 33d704fcf883..c083f4772add 100644 --- a/arch/m68knommu/kernel/semaphore.c +++ b/arch/m68knommu/kernel/semaphore.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #ifndef CONFIG_RMW_INSNS @@ -96,7 +97,7 @@ void __up(struct semaphore *sem) current->state = TASK_RUNNING; \ remove_wait_queue(&sem->wait, &wait); -void __down(struct semaphore * sem) +void __sched __down(struct semaphore * sem) { DECLARE_WAITQUEUE(wait, current); @@ -107,7 +108,7 @@ void __down(struct semaphore * sem) DOWN_TAIL(TASK_UNINTERRUPTIBLE) } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { DECLARE_WAITQUEUE(wait, current); int ret = 0; diff --git a/arch/m68knommu/kernel/vmlinux.lds.S b/arch/m68knommu/kernel/vmlinux.lds.S index 1ab8a31ef964..a362870b6e4e 100644 --- a/arch/m68knommu/kernel/vmlinux.lds.S +++ b/arch/m68knommu/kernel/vmlinux.lds.S @@ -191,6 +191,7 @@ SECTIONS { .text : { _stext = . ; *(.text) + SCHED_TEXT *(.text.lock) . 
= ALIGN(16); /* Exception table */ diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index f8ba26770bf4..f4ab9c66b27f 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -283,8 +283,6 @@ unsigned long thread_saved_pc(struct task_struct *tsk) /* * These bracket the sleeping functions.. */ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) diff --git a/arch/mips/kernel/semaphore.c b/arch/mips/kernel/semaphore.c index 11b937f20604..51c3e772c029 100644 --- a/arch/mips/kernel/semaphore.c +++ b/arch/mips/kernel/semaphore.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #ifdef CONFIG_CPU_HAS_LLDSCD @@ -104,7 +105,7 @@ static inline int waking_non_zero(struct semaphore *sem) * Either form may be used in conjunction with "up()". */ -void __down_failed(struct semaphore * sem) +void __sched __down_failed(struct semaphore * sem) { struct task_struct *tsk = current; wait_queue_t wait; @@ -227,7 +228,7 @@ static inline int waking_non_zero_interruptible(struct semaphore *sem, #endif /* !CONFIG_CPU_HAS_LLDSCD */ -int __down_failed_interruptible(struct semaphore * sem) +int __sched __down_failed_interruptible(struct semaphore * sem) { struct task_struct *tsk = current; wait_queue_t wait; diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S index b72639f8db65..098cfaa23c0e 100644 --- a/arch/mips/kernel/vmlinux.lds.S +++ b/arch/mips/kernel/vmlinux.lds.S @@ -28,6 +28,7 @@ SECTIONS _text = .; /* Text and read-only data */ .text : { *(.text) + SCHED_TEXT *(.fixup) *(.gnu.warning) } =0 diff --git a/arch/parisc/kernel/semaphore.c b/arch/parisc/kernel/semaphore.c index ffb4851451fc..ee806bcc3726 100644 --- a/arch/parisc/kernel/semaphore.c +++ b/arch/parisc/kernel/semaphore.c @@ -5,6 +5,7 @@ #include #include #include 
+#include /* * Semaphores are complex as we wish to avoid using two variables. @@ -58,7 +59,7 @@ void __up(struct semaphore *sem) sem->count += (sem->count < 0) ? 1 : - 1; -void __down(struct semaphore * sem) +void __sched __down(struct semaphore * sem) { DOWN_HEAD @@ -74,7 +75,7 @@ void __down(struct semaphore * sem) UPDATE_COUNT } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { DOWN_HEAD diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S index 14d0882a19d2..e5d5aeef96e5 100644 --- a/arch/parisc/kernel/vmlinux.lds.S +++ b/arch/parisc/kernel/vmlinux.lds.S @@ -50,6 +50,7 @@ SECTIONS _text = .; /* Text and read-only data */ .text ALIGN(16) : { *(.text*) + SCHED_TEXT *(.PARISC.unwind) *(.fixup) *(.lock.text) /* out-of-line lock text */ diff --git a/arch/ppc/kernel/process.c b/arch/ppc/kernel/process.c index ada32baeda19..3363a030e00f 100644 --- a/arch/ppc/kernel/process.c +++ b/arch/ppc/kernel/process.c @@ -661,8 +661,6 @@ void __init ll_puts(const char *s) /* * These bracket the sleeping functions.. */ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) diff --git a/arch/ppc/kernel/semaphore.c b/arch/ppc/kernel/semaphore.c index 7bf51fba5c14..2fe429b27c14 100644 --- a/arch/ppc/kernel/semaphore.c +++ b/arch/ppc/kernel/semaphore.c @@ -15,6 +15,7 @@ */ #include +#include #include #include #include @@ -69,7 +70,7 @@ void __up(struct semaphore *sem) * Thus it is only when we decrement count from some value > 0 * that we have actually got the semaphore. 
*/ -void __down(struct semaphore *sem) +void __sched __down(struct semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -99,7 +100,7 @@ void __down(struct semaphore *sem) wake_up(&sem->wait); } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; diff --git a/arch/ppc/kernel/vmlinux.lds.S b/arch/ppc/kernel/vmlinux.lds.S index 81b95d449a22..b710d55c5b08 100644 --- a/arch/ppc/kernel/vmlinux.lds.S +++ b/arch/ppc/kernel/vmlinux.lds.S @@ -31,6 +31,7 @@ SECTIONS .text : { *(.text) + SCHED_TEXT *(.fixup) *(.got1) __got2_start = .; diff --git a/arch/ppc64/kernel/process.c b/arch/ppc64/kernel/process.c index cec7225a6ac1..f74b14d7e58e 100644 --- a/arch/ppc64/kernel/process.c +++ b/arch/ppc64/kernel/process.c @@ -475,8 +475,6 @@ static inline int validate_sp(unsigned long sp, struct task_struct *p) /* * These bracket the sleeping functions.. */ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched (*(unsigned long *)scheduling_functions_start_here) #define last_sched (*(unsigned long *)scheduling_functions_end_here) diff --git a/arch/ppc64/kernel/semaphore.c b/arch/ppc64/kernel/semaphore.c index c977029e2465..d723632d59f3 100644 --- a/arch/ppc64/kernel/semaphore.c +++ b/arch/ppc64/kernel/semaphore.c @@ -17,6 +17,7 @@ */ #include +#include #include #include #include @@ -70,7 +71,7 @@ void __up(struct semaphore *sem) * Thus it is only when we decrement count from some value > 0 * that we have actually got the semaphore. 
*/ -void __down(struct semaphore *sem) +void __sched __down(struct semaphore *sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -99,7 +100,7 @@ void __down(struct semaphore *sem) wake_up(&sem->wait); } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; diff --git a/arch/ppc64/kernel/vmlinux.lds.S b/arch/ppc64/kernel/vmlinux.lds.S index a8531b1f9ef2..1d9b61143aaa 100644 --- a/arch/ppc64/kernel/vmlinux.lds.S +++ b/arch/ppc64/kernel/vmlinux.lds.S @@ -13,6 +13,7 @@ SECTIONS /* Read-only sections, merged into text segment: */ .text : { *(.text .text.*) + SCHED_TEXT *(.fixup) . = ALIGN(4096); _etext = .; diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 3676307d1d8a..050585ab5d2a 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -384,8 +384,6 @@ void dump_thread(struct pt_regs * regs, struct user * dump) /* * These bracket the sleeping functions.. */ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) diff --git a/arch/s390/kernel/semaphore.c b/arch/s390/kernel/semaphore.c index 8203f5e0228d..8dfb690c159f 100644 --- a/arch/s390/kernel/semaphore.c +++ b/arch/s390/kernel/semaphore.c @@ -11,6 +11,7 @@ */ #include #include +#include #include @@ -60,7 +61,7 @@ void __up(struct semaphore *sem) * count > 0: decrement count, wake up queue and exit. * count <= 0: set count to -1, go to sleep. */ -void __down(struct semaphore * sem) +void __sched __down(struct semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -82,7 +83,7 @@ void __down(struct semaphore * sem) * count > 0: wake up queue and exit. * count <= 0: set count to 0, wake up queue and exit. 
*/ -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; diff --git a/arch/s390/kernel/vmlinux.lds.S b/arch/s390/kernel/vmlinux.lds.S index c9ca7a8e93b3..b4534b2867c3 100644 --- a/arch/s390/kernel/vmlinux.lds.S +++ b/arch/s390/kernel/vmlinux.lds.S @@ -23,6 +23,7 @@ SECTIONS _text = .; /* Text and read-only data */ .text : { *(.text) + SCHED_TEXT *(.fixup) *(.gnu.warning) } = 0x0700 diff --git a/arch/sh/kernel/process.c b/arch/sh/kernel/process.c index 773006661b50..7d45ea0acd09 100644 --- a/arch/sh/kernel/process.c +++ b/arch/sh/kernel/process.c @@ -464,8 +464,6 @@ out: /* * These bracket the sleeping functions.. */ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) @@ -481,7 +479,7 @@ unsigned long get_wchan(struct task_struct *p) * The same comment as on the Alpha applies here, too ... 
*/ pc = thread_saved_pc(p); - if (pc >= (unsigned long) interruptible_sleep_on && pc < (unsigned long) add_timer) { + if (pc >= first_sched && pc < last_sched) { schedule_frame = ((unsigned long *)(long)p->thread.sp)[1]; return (unsigned long)((unsigned long *)schedule_frame)[1]; } diff --git a/arch/sh/kernel/semaphore.c b/arch/sh/kernel/semaphore.c index 0943ad666a67..a3c24dcbf01d 100644 --- a/arch/sh/kernel/semaphore.c +++ b/arch/sh/kernel/semaphore.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -103,7 +104,7 @@ void __up(struct semaphore *sem) tsk->state = TASK_RUNNING; \ remove_wait_queue(&sem->wait, &wait); -void __down(struct semaphore * sem) +void __sched __down(struct semaphore * sem) { DOWN_VAR DOWN_HEAD(TASK_UNINTERRUPTIBLE) @@ -113,7 +114,7 @@ void __down(struct semaphore * sem) DOWN_TAIL(TASK_UNINTERRUPTIBLE) } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { int ret = 0; DOWN_VAR diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S index 2cc86534c130..da0f5d728b3e 100644 --- a/arch/sh/kernel/vmlinux.lds.S +++ b/arch/sh/kernel/vmlinux.lds.S @@ -22,6 +22,7 @@ SECTIONS } = 0 .text : { *(.text) + SCHED_TEXT *(.fixup) *(.gnu.warning) } = 0x0009 diff --git a/arch/sparc/kernel/process.c b/arch/sparc/kernel/process.c index beae70a970e4..70261b211997 100644 --- a/arch/sparc/kernel/process.c +++ b/arch/sparc/kernel/process.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -694,9 +695,6 @@ pid_t kernel_thread(int (*fn)(void *), void * arg, unsigned long flags) return retval; } -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); - unsigned long get_wchan(struct task_struct *task) { unsigned long pc, fp, bias = 0; diff --git a/arch/sparc/kernel/semaphore.c b/arch/sparc/kernel/semaphore.c index 5a8f3d176a8f..77e63b92ca30 100644 --- a/arch/sparc/kernel/semaphore.c +++ 
b/arch/sparc/kernel/semaphore.c @@ -4,6 +4,7 @@ #include #include +#include #include @@ -45,7 +46,7 @@ void __up(struct semaphore *sem) static spinlock_t semaphore_lock = SPIN_LOCK_UNLOCKED; -void __down(struct semaphore * sem) +void __sched __down(struct semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -78,7 +79,7 @@ void __down(struct semaphore * sem) wake_up(&sem->wait); } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; diff --git a/arch/sparc/kernel/vmlinux.lds.S b/arch/sparc/kernel/vmlinux.lds.S index 0862360d865d..8d4bbfaf304c 100644 --- a/arch/sparc/kernel/vmlinux.lds.S +++ b/arch/sparc/kernel/vmlinux.lds.S @@ -12,6 +12,7 @@ SECTIONS .text 0xf0004000 : { *(.text) + SCHED_TEXT *(.gnu.warning) } =0 _etext = .; diff --git a/arch/sparc/lib/rwsem.S b/arch/sparc/lib/rwsem.S index 98b757cb67c6..e7578dc600b8 100644 --- a/arch/sparc/lib/rwsem.S +++ b/arch/sparc/lib/rwsem.S @@ -8,7 +8,7 @@ #include #include - .text + .section .sched.text .align 4 .globl ___down_read @@ -113,6 +113,7 @@ ___down_write: ba 2b restore %l5, %g0, %g5 + .text .globl ___up_read ___up_read: rd %psr, %g3 diff --git a/arch/sparc64/kernel/process.c b/arch/sparc64/kernel/process.c index 1be2b97e4672..0caf962e8155 100644 --- a/arch/sparc64/kernel/process.c +++ b/arch/sparc64/kernel/process.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -823,9 +824,6 @@ out: return error; } -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); - unsigned long get_wchan(struct task_struct *task) { unsigned long pc, fp, bias = 0; diff --git a/arch/sparc64/kernel/semaphore.c b/arch/sparc64/kernel/semaphore.c index a9e66d666ceb..9ddfcb9a1900 100644 --- a/arch/sparc64/kernel/semaphore.c +++ b/arch/sparc64/kernel/semaphore.c @@ -8,6 +8,7 @@ #include #include +#include /* * Atomically update 
sem->count. @@ -90,7 +91,7 @@ void up(struct semaphore *sem) : "g5", "g7", "memory", "cc"); } -static void __down(struct semaphore * sem) +static void __sched __down(struct semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -108,7 +109,7 @@ static void __down(struct semaphore * sem) wake_up(&sem->wait); } -void down(struct semaphore *sem) +void __sched down(struct semaphore *sem) { might_sleep(); /* This atomically does: @@ -192,7 +193,7 @@ int down_trylock(struct semaphore *sem) return ret; } -static int __down_interruptible(struct semaphore * sem) +static int __sched __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; @@ -216,7 +217,7 @@ static int __down_interruptible(struct semaphore * sem) return retval; } -int down_interruptible(struct semaphore *sem) +int __sched down_interruptible(struct semaphore *sem) { int ret = 0; diff --git a/arch/sparc64/kernel/vmlinux.lds.S b/arch/sparc64/kernel/vmlinux.lds.S index ad95e88a3cbc..8faeee09fab2 100644 --- a/arch/sparc64/kernel/vmlinux.lds.S +++ b/arch/sparc64/kernel/vmlinux.lds.S @@ -15,6 +15,7 @@ SECTIONS .text 0x0000000000404000 : { *(.text) + SCHED_TEXT *(.gnu.warning) } =0 _etext = .; diff --git a/arch/sparc64/lib/rwsem.c b/arch/sparc64/lib/rwsem.c index 8e1dfdda91fa..e19968dbc2d1 100644 --- a/arch/sparc64/lib/rwsem.c +++ b/arch/sparc64/lib/rwsem.c @@ -6,6 +6,7 @@ #include #include +#include #include extern struct rw_semaphore *FASTCALL(rwsem_down_read_failed(struct rw_semaphore *sem)); @@ -13,7 +14,7 @@ extern struct rw_semaphore *FASTCALL(rwsem_down_write_failed(struct rw_semaphore extern struct rw_semaphore *FASTCALL(rwsem_wake(struct rw_semaphore *)); extern struct rw_semaphore *FASTCALL(rwsem_downgrade_wake(struct rw_semaphore *)); -void __down_read(struct rw_semaphore *sem) +void __sched __down_read(struct rw_semaphore *sem) { __asm__ __volatile__( "! 
beginning __down_read\n" @@ -72,7 +73,7 @@ int __down_read_trylock(struct rw_semaphore *sem) } EXPORT_SYMBOL(__down_read_trylock); -void __down_write(struct rw_semaphore *sem) +void __sched __down_write(struct rw_semaphore *sem) { __asm__ __volatile__( "! beginning __down_write\n\t" diff --git a/arch/v850/kernel/process.c b/arch/v850/kernel/process.c index 5c29ae51a303..977d75772d81 100644 --- a/arch/v850/kernel/process.c +++ b/arch/v850/kernel/process.c @@ -203,8 +203,6 @@ int sys_execve (char *name, char **argv, char **envp, struct pt_regs *regs) /* * These bracket the sleeping functions.. */ -extern void scheduling_functions_start_here (void); -extern void scheduling_functions_end_here (void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) @@ -228,7 +226,6 @@ unsigned long get_wchan (struct task_struct *p) fp >= 8184+stack_page) return 0; pc = ((unsigned long *)fp)[1]; - /* FIXME: This depends on the order of these functions. 
*/ if (pc < first_sched || pc >= last_sched) return pc; fp = *(unsigned long *) fp; diff --git a/arch/v850/kernel/semaphore.c b/arch/v850/kernel/semaphore.c index b78d714384db..2d20886863d8 100644 --- a/arch/v850/kernel/semaphore.c +++ b/arch/v850/kernel/semaphore.c @@ -15,6 +15,7 @@ #include #include +#include #include @@ -56,7 +57,7 @@ void __up(struct semaphore *sem) static spinlock_t semaphore_lock = SPIN_LOCK_UNLOCKED; -void __down(struct semaphore * sem) +void __sched __down(struct semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -89,7 +90,7 @@ void __down(struct semaphore * sem) wake_up(&sem->wait); } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; diff --git a/arch/v850/kernel/vmlinux.lds.S b/arch/v850/kernel/vmlinux.lds.S index 028c224fa66a..07ab0f292d1c 100644 --- a/arch/v850/kernel/vmlinux.lds.S +++ b/arch/v850/kernel/vmlinux.lds.S @@ -64,6 +64,7 @@ #define TEXT_CONTENTS \ __stext = . ; \ *(.text) \ + SCHED_TEXT *(.exit.text) /* 2.5 convention */ \ *(.text.exit) /* 2.4 convention */ \ *(.text.lock) \ diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index 7b2414765ca3..d1d9471581a8 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -576,8 +576,6 @@ asmlinkage long sys_vfork(struct pt_regs regs) /* * These bracket the sleeping functions.. 
*/ -extern void scheduling_functions_start_here(void); -extern void scheduling_functions_end_here(void); #define first_sched ((unsigned long) scheduling_functions_start_here) #define last_sched ((unsigned long) scheduling_functions_end_here) diff --git a/arch/x86_64/kernel/semaphore.c b/arch/x86_64/kernel/semaphore.c index 5e517814dd07..2bcd4a7ec38d 100644 --- a/arch/x86_64/kernel/semaphore.c +++ b/arch/x86_64/kernel/semaphore.c @@ -14,6 +14,7 @@ */ #include #include +#include #include #include @@ -54,7 +55,7 @@ void __up(struct semaphore *sem) wake_up(&sem->wait); } -void __down(struct semaphore * sem) +void __sched __down(struct semaphore * sem) { struct task_struct *tsk = current; DECLARE_WAITQUEUE(wait, tsk); @@ -91,7 +92,7 @@ void __down(struct semaphore * sem) tsk->state = TASK_RUNNING; } -int __down_interruptible(struct semaphore * sem) +int __sched __down_interruptible(struct semaphore * sem) { int retval = 0; struct task_struct *tsk = current; diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index 7b9e1beb360e..c612e4d213a1 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -15,6 +15,7 @@ SECTIONS _text = .; /* Text and read-only data */ .text : { *(.text) + SCHED_TEXT *(.fixup) *(.gnu.warning) } = 0x9090 diff --git a/arch/x86_64/lib/thunk.S b/arch/x86_64/lib/thunk.S index 876cb937f9f1..acc1e2ca7ed7 100644 --- a/arch/x86_64/lib/thunk.S +++ b/arch/x86_64/lib/thunk.S @@ -35,6 +35,7 @@ .endm + .section .sched.text #ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed @@ -65,7 +66,7 @@ restore_norax: #ifdef CONFIG_SMP /* Support for read/write spinlocks. 
*/ - + .text /* rax: pointer to rwlock_t */ ENTRY(__write_lock_failed) lock diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 59c2b950e8b8..a4b6c768cf49 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -51,3 +51,8 @@ *(.security_initcall.init) \ __security_initcall_end = .; \ } + +#define SCHED_TEXT \ + __scheduling_functions_start_here = .; \ + *(.sched.text) \ + __scheduling_functions_end_here = .; diff --git a/include/linux/init.h b/include/linux/init.h index 45069e275b3d..c6842477243c 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -46,6 +46,8 @@ #define __exitdata __attribute__ ((__section__(".exit.data"))) #define __exit_call __attribute_used__ __attribute__ ((__section__ (".exitcall.exit"))) +#define __sched __attribute__((__section__(".sched.text"))) + #ifdef MODULE #define __exit __attribute__ ((__section__(".exit.text"))) #else diff --git a/include/linux/sched.h b/include/linux/sched.h index f5fa0c07a7f8..054b3c0d5962 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -170,6 +170,8 @@ extern void update_one_process(struct task_struct *p, unsigned long user, unsigned long system, int cpu); extern void scheduler_tick(int user_tick, int system); extern unsigned long cache_decay_ticks; +extern const unsigned long scheduling_functions_start_here; +extern const unsigned long scheduling_functions_end_here; #define MAX_SCHEDULE_TIMEOUT LONG_MAX diff --git a/kernel/sched.c b/kernel/sched.c index 9e19d4c0d4a9..b42029abe679 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -225,6 +225,13 @@ static DEFINE_PER_CPU(struct runqueue, runqueues); #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) +extern unsigned long __scheduling_functions_start_here; +extern unsigned long __scheduling_functions_end_here; +const unsigned long scheduling_functions_start_here = + (unsigned long)&__scheduling_functions_start_here; +const unsigned 
long scheduling_functions_end_here = + (unsigned long)&__scheduling_functions_end_here; + /* * Default context-switch locking: */ @@ -1587,12 +1594,10 @@ out: rebalance_tick(rq, 0); } -void scheduling_functions_start_here(void) { } - /* * schedule() is the main scheduler function. */ -asmlinkage void schedule(void) +asmlinkage void __sched schedule(void) { long *switch_count; task_t *prev, *next; @@ -1731,7 +1736,7 @@ EXPORT_SYMBOL(schedule); * off of preempt_enable. Kernel preemptions off return from interrupt * occur there and call schedule directly. */ -asmlinkage void preempt_schedule(void) +asmlinkage void __sched preempt_schedule(void) { struct thread_info *ti = current_thread_info(); @@ -1869,7 +1874,7 @@ void fastcall complete_all(struct completion *x) spin_unlock_irqrestore(&x->wait.lock, flags); } -void fastcall wait_for_completion(struct completion *x) +void fastcall __sched wait_for_completion(struct completion *x) { might_sleep(); spin_lock_irq(&x->wait.lock); @@ -1907,7 +1912,7 @@ EXPORT_SYMBOL(wait_for_completion); __remove_wait_queue(q, &wait); \ spin_unlock_irqrestore(&q->lock, flags); -void fastcall interruptible_sleep_on(wait_queue_head_t *q) +void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) { SLEEP_ON_VAR @@ -1920,7 +1925,7 @@ void fastcall interruptible_sleep_on(wait_queue_head_t *q) EXPORT_SYMBOL(interruptible_sleep_on); -long fastcall interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) +long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) { SLEEP_ON_VAR @@ -1935,7 +1940,7 @@ long fastcall interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) EXPORT_SYMBOL(interruptible_sleep_on_timeout); -void fastcall sleep_on(wait_queue_head_t *q) +void fastcall __sched sleep_on(wait_queue_head_t *q) { SLEEP_ON_VAR @@ -1948,7 +1953,7 @@ void fastcall sleep_on(wait_queue_head_t *q) EXPORT_SYMBOL(sleep_on); -long fastcall sleep_on_timeout(wait_queue_head_t *q, long timeout) 
+long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) { SLEEP_ON_VAR @@ -1963,8 +1968,6 @@ long fastcall sleep_on_timeout(wait_queue_head_t *q, long timeout) EXPORT_SYMBOL(sleep_on_timeout); -void scheduling_functions_end_here(void) { } - void set_user_nice(task_t *p, long nice) { unsigned long flags; @@ -2424,7 +2427,7 @@ asmlinkage long sys_sched_yield(void) return 0; } -void __cond_resched(void) +void __sched __cond_resched(void) { set_current_state(TASK_RUNNING); schedule(); @@ -2438,7 +2441,7 @@ EXPORT_SYMBOL(__cond_resched); * this is a shortcut for kernel-space yielding - it marks the * thread runnable and calls sys_sched_yield(). */ -void yield(void) +void __sched yield(void) { set_current_state(TASK_RUNNING); sys_sched_yield(); @@ -2453,7 +2456,7 @@ EXPORT_SYMBOL(yield); * But don't do that if it is a deliberate, throttling IO wait (this task * has set its backing_dev_info: the queue against which it should throttle) */ -void io_schedule(void) +void __sched io_schedule(void) { struct runqueue *rq = this_rq(); @@ -2464,7 +2467,7 @@ void io_schedule(void) EXPORT_SYMBOL(io_schedule); -long io_schedule_timeout(long timeout) +long __sched io_schedule_timeout(long timeout) { struct runqueue *rq = this_rq(); long ret; @@ -3010,7 +3013,7 @@ EXPORT_SYMBOL(__might_sleep); * * Called inside preempt_disable(). 
*/ -void __preempt_spin_lock(spinlock_t *lock) +void __sched __preempt_spin_lock(spinlock_t *lock) { if (preempt_count() > 1) { _raw_spin_lock(lock); @@ -3026,7 +3029,7 @@ void __preempt_spin_lock(spinlock_t *lock) EXPORT_SYMBOL(__preempt_spin_lock); -void __preempt_write_lock(rwlock_t *lock) +void __sched __preempt_write_lock(rwlock_t *lock) { if (preempt_count() > 1) { _raw_write_lock(lock); diff --git a/kernel/timer.c b/kernel/timer.c index f53e0749b0d2..cbcb5522866d 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -996,7 +996,7 @@ static void process_timeout(unsigned long __data) * * In all cases the return value is guaranteed to be non-negative. */ -fastcall signed long schedule_timeout(signed long timeout) +fastcall signed long __sched schedule_timeout(signed long timeout) { struct timer_list timer; unsigned long expire; @@ -1056,7 +1056,7 @@ asmlinkage long sys_gettid(void) return current->pid; } -static long nanosleep_restart(struct restart_block *restart) +static long __sched nanosleep_restart(struct restart_block *restart) { unsigned long expire = restart->arg0, now = jiffies; struct timespec __user *rmtp = (struct timespec __user *) restart->arg1; diff --git a/lib/rwsem.c b/lib/rwsem.c index 95469d7fb796..85dcae7e9337 100644 --- a/lib/rwsem.c +++ b/lib/rwsem.c @@ -5,6 +5,7 @@ */ #include #include +#include #include struct rwsem_waiter { @@ -162,7 +163,7 @@ static inline struct rw_semaphore *rwsem_down_failed_common(struct rw_semaphore /* * wait for the read lock to be granted */ -struct rw_semaphore fastcall *rwsem_down_read_failed(struct rw_semaphore *sem) +struct rw_semaphore fastcall __sched *rwsem_down_read_failed(struct rw_semaphore *sem) { struct rwsem_waiter waiter; @@ -178,7 +179,7 @@ struct rw_semaphore fastcall *rwsem_down_read_failed(struct rw_semaphore *sem) /* * wait for the write lock to be granted */ -struct rw_semaphore fastcall *rwsem_down_write_failed(struct rw_semaphore *sem) +struct rw_semaphore fastcall __sched 
*rwsem_down_write_failed(struct rw_semaphore *sem) { struct rwsem_waiter waiter; -- cgit v1.2.3 From 906648b4864649cd72317718ae25ce5b33b0b8c7 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:41:32 -0700 Subject: [PATCH] get_wchan() sparc64 fix From: William Lee Irwin III Now the scheduler text is in its own ELF section this branch is asking for an illegal displacement. --- include/asm-sparc64/system.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/asm-sparc64/system.h b/include/asm-sparc64/system.h index c41ddfc89a2b..c03b2d9d59e7 100644 --- a/include/asm-sparc64/system.h +++ b/include/asm-sparc64/system.h @@ -205,8 +205,10 @@ do { if (test_thread_flag(TIF_PERFCTR)) { \ "ldx [%%g6 + %7], %%g4\n\t" \ "wrpr %%g0, 0x96, %%pstate\n\t" \ "andcc %%o7, %6, %%g0\n\t" \ - "bne,pn %%icc, ret_from_syscall\n\t" \ + "beq,pn %%icc, 1f\n\t" \ " mov %%g5, %0\n\t" \ + "b,a ret_from_syscall\n\t" \ + "1:\n\t" \ : "=&r" (last) \ : "0" (next->thread_info), \ "i" (TI_WSTATE), "i" (TI_KSP), "i" (TI_FLAGS), "i" (TI_CWP), \ -- cgit v1.2.3 From b4e0dd09f6ee56aa1c25ca9dfb4e897f241a5b57 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:42:11 -0700 Subject: [PATCH] ppc64: Fix bug in hugepage support From: David Gibson The PPC64 version of is_aligned_hugepage_range() is buggy. It is supposed to test not only that the given range is hugepage aligned, but that it lies within the address space allowed for hugepages. We were checking only that the given range intersected the hugepage range, not that it lay entirely within it. This patch fixes the problem and changes the name of some macros to make it less likely to make that misunderstanding again. 
--- arch/ppc64/mm/hugetlbpage.c | 7 ++++--- include/asm-ppc64/page.h | 12 ++++++++---- 2 files changed, 12 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/arch/ppc64/mm/hugetlbpage.c b/arch/ppc64/mm/hugetlbpage.c index b763fb9d06e3..125b45ed4cb2 100644 --- a/arch/ppc64/mm/hugetlbpage.c +++ b/arch/ppc64/mm/hugetlbpage.c @@ -230,7 +230,8 @@ int is_aligned_hugepage_range(unsigned long addr, unsigned long len) return -EINVAL; if (addr & ~HPAGE_MASK) return -EINVAL; - if (! is_hugepage_only_range(addr, len)) + if (! (within_hugepage_low_range(addr, len) + || within_hugepage_high_range(addr, len)) ) return -EINVAL; return 0; } @@ -300,9 +301,9 @@ static int open_32bit_htlbpage_range(struct mm_struct *mm) int prepare_hugepage_range(unsigned long addr, unsigned long len) { - if (is_hugepage_high_range(addr, len)) + if (within_hugepage_high_range(addr, len)) return 0; - else if (is_hugepage_low_range(addr, len)) + else if (within_hugepage_low_range(addr, len)) return open_32bit_htlbpage_range(current->mm); return -EINVAL; diff --git a/include/asm-ppc64/page.h b/include/asm-ppc64/page.h index fd707bb57da5..1c53c228ff22 100644 --- a/include/asm-ppc64/page.h +++ b/include/asm-ppc64/page.h @@ -40,15 +40,19 @@ #define ARCH_HAS_HUGEPAGE_ONLY_RANGE #define ARCH_HAS_PREPARE_HUGEPAGE_RANGE -#define is_hugepage_low_range(addr, len) \ +#define touches_hugepage_low_range(addr, len) \ (((addr) > (TASK_HPAGE_BASE_32-(len))) && ((addr) < TASK_HPAGE_END_32)) -#define is_hugepage_high_range(addr, len) \ +#define touches_hugepage_high_range(addr, len) \ (((addr) > (TASK_HPAGE_BASE-(len))) && ((addr) < TASK_HPAGE_END)) +#define within_hugepage_low_range(addr, len) (((addr) >= TASK_HPAGE_BASE_32) \ + && ((addr)+(len) <= TASK_HPAGE_END_32) && ((addr)+(len) >= (addr))) +#define within_hugepage_high_range(addr, len) (((addr) >= TASK_HPAGE_BASE) \ + && ((addr)+(len) <= TASK_HPAGE_END) && ((addr)+(len) >= (addr))) #define is_hugepage_only_range(addr, len) \ - 
(is_hugepage_high_range((addr), (len)) || \ + (touches_hugepage_high_range((addr), (len)) || \ (current->mm->context.low_hpages \ - && is_hugepage_low_range((addr), (len)))) + && touches_hugepage_low_range((addr), (len)))) #define hugetlb_free_pgtables free_pgtables #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA -- cgit v1.2.3 From 81c31b894ebb3d1409ffa3d54e477a91b5a5ebf3 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:43:37 -0700 Subject: [PATCH] ppc64: allow hugepages anywhere in low 4GB From: David Gibson On PPC64, to deal with the restrictions imposed by the PPC MMU's segment design, hugepages are only allowed to be mapped in two fixed address ranges, one 2-3G (for use by 32-bit processes) and one 1-1.5T (for use in 64-bit processes). This is quite limiting, particularly for 32-bit processes which want to use a lot of large page memory. This patch relaxes this restriction, and allows any of the low 16 segments (i.e. those below 4G) to be individually switched over to allow hugepage mappings (provided the segment does not already have any normal page mappings). The 1-1.5T fixed range for 64-bit processes remains. --- arch/ppc64/mm/hugetlbpage.c | 203 ++++++++++++++++++++++++++++---------------- include/asm-ppc64/mmu.h | 4 +- include/asm-ppc64/page.h | 32 +++---- 3 files changed, 151 insertions(+), 88 deletions(-) (limited to 'include') diff --git a/arch/ppc64/mm/hugetlbpage.c b/arch/ppc64/mm/hugetlbpage.c index ceaefc4d9700..f5b5781d21a4 100644 --- a/arch/ppc64/mm/hugetlbpage.c +++ b/arch/ppc64/mm/hugetlbpage.c @@ -241,31 +241,25 @@ static void do_slbia(void *unused) asm volatile ("isync; slbia; isync":::"memory"); } -/* Activate the low hpage region for 32bit processes.
mmap_sem must - * be held*/ -static int open_32bit_htlbpage_range(struct mm_struct *mm) +static int prepare_low_seg_for_htlb(struct mm_struct *mm, unsigned long seg) { + unsigned long start = seg << SID_SHIFT; + unsigned long end = (seg+1) << SID_SHIFT; struct vm_area_struct *vma; unsigned long addr; struct mmu_gather *tlb; - if (mm->context.low_hpages) - return 0; /* The window is already open */ - - /* Check no VMAs are in the region */ - vma = find_vma(mm, TASK_HPAGE_BASE_32); + BUG_ON(seg >= 16); - if (vma && (vma->vm_start < TASK_HPAGE_END_32)) { - printk(KERN_DEBUG "Low HTLB region busy: PID=%d vma @ %lx-%lx\n", - current->pid, vma->vm_start, vma->vm_end); + /* Check no VMAs are in the region */ + vma = find_vma(mm, start); + if (vma && (vma->vm_start < end)) return -EBUSY; - } /* Clean up any leftover PTE pages in the region */ spin_lock(&mm->page_table_lock); tlb = tlb_gather_mmu(mm, 0); - for (addr = TASK_HPAGE_BASE_32; addr < TASK_HPAGE_END_32; - addr += PMD_SIZE) { + for (addr = start; addr < end; addr += PMD_SIZE) { pgd_t *pgd = pgd_offset(mm, addr); pmd_t *pmd; struct page *page; @@ -293,15 +287,29 @@ static int open_32bit_htlbpage_range(struct mm_struct *mm) pgtable_remove_rmap(page); pte_free_tlb(tlb, page); } - tlb_finish_mmu(tlb, TASK_HPAGE_BASE_32, TASK_HPAGE_END_32); + tlb_finish_mmu(tlb, start, end); spin_unlock(&mm->page_table_lock); - mm->context.low_hpages = 1; + return 0; +} + +static int open_low_hpage_segs(struct mm_struct *mm, u16 newsegs) +{ + unsigned long i; + + newsegs &= ~(mm->context.htlb_segs); + if (! newsegs) + return 0; /* The segments we want are already open */ + for (i = 0; i < 16; i++) + if ((1 << i) & newsegs) + if (prepare_low_seg_for_htlb(mm, i) != 0) + return -EBUSY; + + mm->context.htlb_segs |= newsegs; /* the context change must make it to memory before the slbia, * so that further SLB misses do the right thing. 
*/ mb(); - on_each_cpu(do_slbia, NULL, 0, 1); return 0; @@ -311,8 +319,18 @@ int prepare_hugepage_range(unsigned long addr, unsigned long len) { if (within_hugepage_high_range(addr, len)) return 0; - else if (within_hugepage_low_range(addr, len)) - return open_32bit_htlbpage_range(current->mm); + else if ((addr < 0x100000000) && ((addr+len) < 0x100000000)) { + int err; + /* Yes, we need both tests, in case addr+len overflows + * 64-bit arithmetic */ + err = open_low_hpage_segs(current->mm, + LOW_ESID_MASK(addr, len)); + if (err) + printk(KERN_DEBUG "prepare_hugepage_range(%lx, %lx)" + " failed (segs: 0x%04hx)\n", addr, len, + LOW_ESID_MASK(addr, len)); + return err; + } return -EINVAL; } @@ -559,7 +577,7 @@ out: /* Because we have an exclusive hugepage region which lies within the * normal user address space, we have to take special measures to make - * non-huge mmap()s evade the hugepage reserved region. */ + * non-huge mmap()s evade the hugepage reserved regions. */ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) @@ -574,36 +592,29 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && - (!vma || addr + len <= vma->vm_start) && - !is_hugepage_only_range(addr,len)) + if (((TASK_SIZE - len) >= addr) + && (!vma || (addr+len) <= vma->vm_start) + && !is_hugepage_only_range(addr,len)) return addr; } start_addr = addr = mm->free_area_cache; full_search: - for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { - /* At this point: (!vma || addr < vma->vm_end). */ - if (TASK_SIZE - len < addr) { - /* - * Start a new search - just in case we missed - * some holes. 
- */ - if (start_addr != TASK_UNMAPPED_BASE) { - start_addr = addr = TASK_UNMAPPED_BASE; - goto full_search; - } - return -ENOMEM; + vma = find_vma(mm, addr); + while (TASK_SIZE - len >= addr) { + BUG_ON(vma && (addr >= vma->vm_end)); + + if (touches_hugepage_low_range(addr, len)) { + addr = ALIGN(addr+1, 1<vm_start) { - if (is_hugepage_only_range(addr, len)) { - if (addr < TASK_HPAGE_END_32) - addr = TASK_HPAGE_END_32; - else - addr = TASK_HPAGE_END; - - continue; - } /* * Remember the place where we stopped the search: */ @@ -611,16 +622,70 @@ full_search: return addr; } addr = vma->vm_end; + vma = vma->vm_next; + } + + /* Make sure we didn't miss any holes */ + if (start_addr != TASK_UNMAPPED_BASE) { + start_addr = addr = TASK_UNMAPPED_BASE; + goto full_search; } + return -ENOMEM; +} + +static unsigned long htlb_get_low_area(unsigned long len, u16 segmask) +{ + unsigned long addr = 0; + struct vm_area_struct *vma; + + vma = find_vma(current->mm, addr); + while (addr + len <= 0x100000000UL) { + BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */ + + if (! __within_hugepage_low_range(addr, len, segmask)) { + addr = ALIGN(addr+1, 1<mm, addr); + continue; + } + + if (!vma || (addr + len) <= vma->vm_start) + return addr; + addr = ALIGN(vma->vm_end, HPAGE_SIZE); + /* Depending on segmask this might not be a confirmed + * hugepage region, so the ALIGN could have skipped + * some VMAs */ + vma = find_vma(current->mm, addr); + } + + return -ENOMEM; +} + +static unsigned long htlb_get_high_area(unsigned long len) +{ + unsigned long addr = TASK_HPAGE_BASE; + struct vm_area_struct *vma; + + vma = find_vma(current->mm, addr); + for (vma = find_vma(current->mm, addr); + addr + len <= TASK_HPAGE_END; + vma = vma->vm_next) { + BUG_ON(vma && (addr >= vma->vm_end)); /* invariant */ + BUG_ON(! 
within_hugepage_high_range(addr, len)); + + if (!vma || (addr + len) <= vma->vm_start) + return addr; + addr = ALIGN(vma->vm_end, HPAGE_SIZE); + /* Because we're in a hugepage region, this alignment + * should not skip us over any VMAs */ + } + + return -ENOMEM; } unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - struct vm_area_struct *vma; - unsigned long base, end; - if (len & ~HPAGE_MASK) return -EINVAL; @@ -628,34 +693,30 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, return -EINVAL; if (test_thread_flag(TIF_32BIT)) { - int err; - - err = open_32bit_htlbpage_range(current->mm); - if (err) - return err; /* Should this just be EINVAL? */ - - base = TASK_HPAGE_BASE_32; - end = TASK_HPAGE_END_32; - } else { - base = TASK_HPAGE_BASE; - end = TASK_HPAGE_END; - } - - if (!in_hugepage_area(current->mm->context, addr) - || (addr & (HPAGE_SIZE - 1))) - addr = base; + int lastshift = 0; + u16 segmask, cursegs = current->mm->context.htlb_segs; - for (vma = find_vma(current->mm, addr); ; vma = vma->vm_next) { - /* At this point: (!vma || addr < vma->vm_end). */ - if (addr + len > end) - return -ENOMEM; - if (!vma || (addr + len) <= vma->vm_start) + /* First see if we can do the mapping in the existing + * low hpage segments */ + addr = htlb_get_low_area(len, cursegs); + if (addr != -ENOMEM) return addr; - addr = ALIGN(vma->vm_end, HPAGE_SIZE); - /* Because we're in an exclusively hugepage region, - * this alignment shouldn't have skipped over any - * other vmas */ + for (segmask = LOW_ESID_MASK(0x100000000UL-len, len); + ! 
lastshift; segmask >>=1) { + if (segmask & 1) + lastshift = 1; + + addr = htlb_get_low_area(len, cursegs | segmask); + if ((addr != -ENOMEM) + && open_low_hpage_segs(current->mm, segmask) == 0) + return addr; + } + printk(KERN_DEBUG "hugetlb_get_unmapped_area() unable to open" + " enough segments\n"); + return -ENOMEM; + } else { + return htlb_get_high_area(len); } } diff --git a/include/asm-ppc64/mmu.h b/include/asm-ppc64/mmu.h index a68e47f717e7..b42d9a4db08f 100644 --- a/include/asm-ppc64/mmu.h +++ b/include/asm-ppc64/mmu.h @@ -23,12 +23,12 @@ typedef unsigned long mm_context_id_t; typedef struct { mm_context_id_t id; #ifdef CONFIG_HUGETLB_PAGE - int low_hpages; + u16 htlb_segs; /* bitmask */ #endif } mm_context_t; #ifdef CONFIG_HUGETLB_PAGE -#define KERNEL_LOW_HPAGES .low_hpages = 0, +#define KERNEL_LOW_HPAGES .htlb_segs = 0, #else #define KERNEL_LOW_HPAGES #endif diff --git a/include/asm-ppc64/page.h b/include/asm-ppc64/page.h index 1c53c228ff22..984602ae4fcc 100644 --- a/include/asm-ppc64/page.h +++ b/include/asm-ppc64/page.h @@ -22,6 +22,10 @@ #define PAGE_MASK (~(PAGE_SIZE-1)) #define PAGE_OFFSET_MASK (PAGE_SIZE-1) +#define SID_SHIFT 28 +#define SID_MASK 0xfffffffff +#define GET_ESID(x) (((x) >> SID_SHIFT) & SID_MASK) + #ifdef CONFIG_HUGETLB_PAGE #define HPAGE_SHIFT 24 @@ -33,34 +37,36 @@ #define TASK_HPAGE_BASE (0x0000010000000000UL) #define TASK_HPAGE_END (0x0000018000000000UL) -/* For 32-bit processes the hugepage range is 2-3G */ -#define TASK_HPAGE_BASE_32 (0x80000000UL) -#define TASK_HPAGE_END_32 (0xc0000000UL) +#define LOW_ESID_MASK(addr, len) (((1U << (GET_ESID(addr+len-1)+1)) \ + - (1U << GET_ESID(addr))) & 0xffff) #define ARCH_HAS_HUGEPAGE_ONLY_RANGE #define ARCH_HAS_PREPARE_HUGEPAGE_RANGE #define touches_hugepage_low_range(addr, len) \ - (((addr) > (TASK_HPAGE_BASE_32-(len))) && ((addr) < TASK_HPAGE_END_32)) + (LOW_ESID_MASK((addr), (len)) & current->mm->context.htlb_segs) #define touches_hugepage_high_range(addr, len) \ (((addr) > 
(TASK_HPAGE_BASE-(len))) && ((addr) < TASK_HPAGE_END)) -#define within_hugepage_low_range(addr, len) (((addr) >= TASK_HPAGE_BASE_32) \ - && ((addr)+(len) <= TASK_HPAGE_END_32) && ((addr)+(len) >= (addr))) + +#define __within_hugepage_low_range(addr, len, segmask) \ + ((LOW_ESID_MASK((addr), (len)) | (segmask)) == (segmask)) +#define within_hugepage_low_range(addr, len) \ + __within_hugepage_low_range((addr), (len), \ + current->mm->context.htlb_segs) #define within_hugepage_high_range(addr, len) (((addr) >= TASK_HPAGE_BASE) \ && ((addr)+(len) <= TASK_HPAGE_END) && ((addr)+(len) >= (addr))) #define is_hugepage_only_range(addr, len) \ (touches_hugepage_high_range((addr), (len)) || \ - (current->mm->context.low_hpages \ - && touches_hugepage_low_range((addr), (len)))) + touches_hugepage_low_range((addr), (len))) #define hugetlb_free_pgtables free_pgtables #define HAVE_ARCH_HUGETLB_UNMAPPED_AREA #define in_hugepage_area(context, addr) \ ((cur_cpu_spec->cpu_features & CPU_FTR_16M_PAGE) && \ - ((((addr) >= TASK_HPAGE_BASE) && ((addr) < TASK_HPAGE_END)) || \ - ((context).low_hpages && \ - (((addr) >= TASK_HPAGE_BASE_32) && ((addr) < TASK_HPAGE_END_32))))) + ( (((addr) >= TASK_HPAGE_BASE) && ((addr) < TASK_HPAGE_END)) || \ + ( ((addr) < 0x100000000L) && \ + ((1 << GET_ESID(addr)) & (context).htlb_segs) ) ) ) #else /* !CONFIG_HUGETLB_PAGE */ @@ -68,10 +74,6 @@ #endif /* !CONFIG_HUGETLB_PAGE */ -#define SID_SHIFT 28 -#define SID_MASK 0xfffffffff -#define GET_ESID(x) (((x) >> SID_SHIFT) & SID_MASK) - /* align addr on a size boundary - adjust address up/down if needed */ #define _ALIGN_UP(addr,size) (((addr)+((size)-1))&(~((size)-1))) #define _ALIGN_DOWN(addr,size) ((addr)&(~((size)-1))) -- cgit v1.2.3 From d9110d3abbe16e21ea3d16ed8f37f22354ca9d4e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:44:15 -0700 Subject: [PATCH] ppc64: Allow PCI devices to use address that happens to fall in the ISA range From: Jake Moilanen Allow PCI devices to use address 
that happens to fall in the ISA range, but still protect against ISA device accesses when there is not an ISA bus. --- arch/ppc64/kernel/eeh.c | 6 ++++++ arch/ppc64/kernel/pSeries_pci.c | 28 ++++++++++++++++++++++++++-- include/asm-ppc64/eeh.h | 33 +++++++++++++++------------------ 3 files changed, 47 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/arch/ppc64/kernel/eeh.c b/arch/ppc64/kernel/eeh.c index 008ad1ef1783..303eac178519 100644 --- a/arch/ppc64/kernel/eeh.c +++ b/arch/ppc64/kernel/eeh.c @@ -395,6 +395,12 @@ unsigned long eeh_check_failure(void *token, unsigned long val) return val; } + /* Make sure we aren't ISA */ + if (!strcmp(dn->type, "isa")) { + pci_dev_put(dev); + return val; + } + if (!dn->eeh_config_addr) { pci_dev_put(dev); return val; diff --git a/arch/ppc64/kernel/pSeries_pci.c b/arch/ppc64/kernel/pSeries_pci.c index bae29d0f670d..4014ccd9fb60 100644 --- a/arch/ppc64/kernel/pSeries_pci.c +++ b/arch/ppc64/kernel/pSeries_pci.c @@ -44,6 +44,12 @@ #include "open_pic.h" #include "pci.h" +/* legal IO pages under MAX_ISA_PORT. This is to ensure we don't touch + devices we don't have access to. 
*/ +unsigned long io_page_mask; + +EXPORT_SYMBOL(io_page_mask); + /* RTAS tokens */ static int read_pci_config; static int write_pci_config; @@ -280,6 +286,8 @@ static void __init pci_process_bridge_OF_ranges(struct pci_controller *hose, pci_process_ISA_OF_ranges(isa_dn, hose->io_base_phys, hose->io_base_virt); + /* Allow all IO */ + io_page_mask = -1; } } @@ -523,8 +531,24 @@ void __devinit pcibios_fixup_device_resources(struct pci_dev *dev, for (i = 0; i < PCI_NUM_RESOURCES; i++) { if (dev->resource[i].flags & IORESOURCE_IO) { unsigned long offset = (unsigned long)hose->io_base_virt - pci_io_base; - dev->resource[i].start += offset; - dev->resource[i].end += offset; + unsigned long start, end, mask; + + start = dev->resource[i].start += offset; + end = dev->resource[i].end += offset; + + /* Need to allow IO access to pages that are in the + ISA range */ + if (start < MAX_ISA_PORT) { + if (end > MAX_ISA_PORT) + end = MAX_ISA_PORT; + + start >>= PAGE_SHIFT; + end >>= PAGE_SHIFT; + + /* get the range of pages for the map */ + mask = ((1 << (end+1))-1) ^ ((1 << start)-1); + io_page_mask |= mask; + } } else if (dev->resource[i].flags & IORESOURCE_MEM) { dev->resource[i].start += hose->pci_mem_offset; diff --git a/include/asm-ppc64/eeh.h b/include/asm-ppc64/eeh.h index d426126ddab1..4ccf43666ee5 100644 --- a/include/asm-ppc64/eeh.h +++ b/include/asm-ppc64/eeh.h @@ -199,74 +199,71 @@ static inline void eeh_memcpy_toio(void *dest, void *src, unsigned long n) { memcpy(vdest, src, n); } -/* The I/O macros must handle ISA ports as well as PCI I/O bars. - * ISA does not implement EEH and ISA may not exist in the system. - * For PCI we check for EEH failures. 
- */ -#define _IO_IS_ISA(port) ((port) < 0x10000) -#define _IO_HAS_ISA_BUS (isa_io_base != 0) +#define MAX_ISA_PORT 0x10000 +extern unsigned long io_page_mask; +#define _IO_IS_VALID(port) ((port) >= MAX_ISA_PORT || (1 << (port>>PAGE_SHIFT)) & io_page_mask) static inline u8 eeh_inb(unsigned long port) { u8 val; - if (_IO_IS_ISA(port) && !_IO_HAS_ISA_BUS) + if (!_IO_IS_VALID(port)) return ~0; val = in_8((u8 *)(port+pci_io_base)); - if (!_IO_IS_ISA(port) && EEH_POSSIBLE_IO_ERROR(val, u8)) + if (EEH_POSSIBLE_IO_ERROR(val, u8)) return eeh_check_failure((void*)(port), val); return val; } static inline void eeh_outb(u8 val, unsigned long port) { - if (!_IO_IS_ISA(port) || _IO_HAS_ISA_BUS) + if (_IO_IS_VALID(port)) return out_8((u8 *)(port+pci_io_base), val); } static inline u16 eeh_inw(unsigned long port) { u16 val; - if (_IO_IS_ISA(port) && !_IO_HAS_ISA_BUS) + if (!_IO_IS_VALID(port)) return ~0; val = in_le16((u16 *)(port+pci_io_base)); - if (!_IO_IS_ISA(port) && EEH_POSSIBLE_IO_ERROR(val, u16)) + if (EEH_POSSIBLE_IO_ERROR(val, u16)) return eeh_check_failure((void*)(port), val); return val; } static inline void eeh_outw(u16 val, unsigned long port) { - if (!_IO_IS_ISA(port) || _IO_HAS_ISA_BUS) + if (_IO_IS_VALID(port)) return out_le16((u16 *)(port+pci_io_base), val); } static inline u32 eeh_inl(unsigned long port) { u32 val; - if (_IO_IS_ISA(port) && !_IO_HAS_ISA_BUS) + if (!_IO_IS_VALID(port)) return ~0; val = in_le32((u32 *)(port+pci_io_base)); - if (!_IO_IS_ISA(port) && EEH_POSSIBLE_IO_ERROR(val, u32)) + if (EEH_POSSIBLE_IO_ERROR(val, u32)) return eeh_check_failure((void*)(port), val); return val; } static inline void eeh_outl(u32 val, unsigned long port) { - if (!_IO_IS_ISA(port) || _IO_HAS_ISA_BUS) + if (_IO_IS_VALID(port)) return out_le32((u32 *)(port+pci_io_base), val); } /* in-string eeh macros */ static inline void eeh_insb(unsigned long port, void * buf, int ns) { _insb((u8 *)(port+pci_io_base), buf, ns); - if (!_IO_IS_ISA(port) && 
EEH_POSSIBLE_IO_ERROR((*(((u8*)buf)+ns-1)), u8)) + if (EEH_POSSIBLE_IO_ERROR((*(((u8*)buf)+ns-1)), u8)) eeh_check_failure((void*)(port), *(u8*)buf); } static inline void eeh_insw_ns(unsigned long port, void * buf, int ns) { _insw_ns((u16 *)(port+pci_io_base), buf, ns); - if (!_IO_IS_ISA(port) && EEH_POSSIBLE_IO_ERROR((*(((u16*)buf)+ns-1)), u16)) + if (EEH_POSSIBLE_IO_ERROR((*(((u16*)buf)+ns-1)), u16)) eeh_check_failure((void*)(port), *(u16*)buf); } static inline void eeh_insl_ns(unsigned long port, void * buf, int nl) { _insl_ns((u32 *)(port+pci_io_base), buf, nl); - if (!_IO_IS_ISA(port) && EEH_POSSIBLE_IO_ERROR((*(((u32*)buf)+nl-1)), u32)) + if (EEH_POSSIBLE_IO_ERROR((*(((u32*)buf)+nl-1)), u32)) eeh_check_failure((void*)(port), *(u32*)buf); } -- cgit v1.2.3 From e80bc2ce5fd11792993c43d02a6425cc0a2138b8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:44:40 -0700 Subject: [PATCH] ppc64: Correct comments for the offsets of fields in paca From: Will Schmidt Correct comments for the offsets of fields in paca --- include/asm-ppc64/paca.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/asm-ppc64/paca.h b/include/asm-ppc64/paca.h index 24a2e99faae1..d368afedd735 100644 --- a/include/asm-ppc64/paca.h +++ b/include/asm-ppc64/paca.h @@ -64,13 +64,13 @@ struct paca_struct { u16 xHwProcNum; /* Physical processor number 0x1A */ u32 default_decr; /* Default decrementer value 0x1c */ u64 xKsave; /* Saved Kernel stack addr or zero 0x20 */ - struct ItLpQueue *lpQueuePtr; /* LpQueue handled by this processor 0x30 */ - u64 xTOC; /* Kernel TOC address 0x38 */ - STAB xStab_data; /* Segment table information 0x40,0x48,0x50 */ - u8 *exception_sp; /* 0x58 */ - u8 xProcEnabled; /* 0x59 */ - u8 prof_enabled; /* 1=iSeries profiling enabled 0x60 */ - u8 resv1[38]; /* 0x61-0x7F */ + struct ItLpQueue *lpQueuePtr; /* LpQueue handled by this processor 0x28 */ + u64 xTOC; /* Kernel TOC address 0x30 */ + STAB 
xStab_data; /* Segment table information 0x38,0x40,0x48 */ + u8 *exception_sp; /* 0x50 */ + u8 xProcEnabled; /* 0x58 */ + u8 prof_enabled; /* 1=iSeries profiling enabled 0x59 */ + u8 resv1[38]; /* 0x5a-0x7f*/ /*===================================================================================== * CACHE_LINE_2 0x0080 - 0x00FF -- cgit v1.2.3 From 87fb698cd58394a5e1729a9ee56b4ada4fb0a51b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:44:51 -0700 Subject: [PATCH] ppc64: Make rtasd dump KERN_DEBUG From: Jake Moilanen Change the loglevel of an error log printed so it does not go to the console. Since error logs can be up to 2k in size, it can spam the console. --- arch/ppc64/kernel/rtasd.c | 8 ++++---- include/asm-ppc64/rtas.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/arch/ppc64/kernel/rtasd.c b/arch/ppc64/kernel/rtasd.c index 99125ccfe648..b31ffad73195 100644 --- a/arch/ppc64/kernel/rtasd.c +++ b/arch/ppc64/kernel/rtasd.c @@ -78,7 +78,7 @@ static void printk_log_rtas(char *buf, int len) char buffer[64]; char * str = "RTAS event"; - printk(RTAS_ERR "%d -------- %s begin --------\n", error_log_cnt, str); + printk(RTAS_DEBUG "%d -------- %s begin --------\n", error_log_cnt, str); /* * Print perline bytes on each line, each line will start @@ -99,12 +99,12 @@ static void printk_log_rtas(char *buf, int len) n += sprintf(buffer+n, "%02x", (unsigned char)buf[i]); if (j == (perline-1)) - printk(KERN_ERR "%s\n", buffer); + printk(KERN_DEBUG "%s\n", buffer); } if ((i % perline) != 0) - printk(KERN_ERR "%s\n", buffer); + printk(KERN_DEBUG "%s\n", buffer); - printk(RTAS_ERR "%d -------- %s end ----------\n", error_log_cnt, str); + printk(RTAS_DEBUG "%d -------- %s end ----------\n", error_log_cnt, str); } static int log_rtas_len(char * buf) diff --git a/include/asm-ppc64/rtas.h b/include/asm-ppc64/rtas.h index 5ce76143dce1..62838ce91e59 100644 --- a/include/asm-ppc64/rtas.h +++ b/include/asm-ppc64/rtas.h @@ 
-198,7 +198,7 @@ extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal); /* All the types and not flags */ #define ERR_TYPE_MASK (ERR_TYPE_RTAS_LOG | ERR_TYPE_KERNEL_PANIC) -#define RTAS_ERR KERN_ERR "RTAS: " +#define RTAS_DEBUG KERN_DEBUG "RTAS: " #define RTAS_ERROR_LOG_MAX 2048 -- cgit v1.2.3 From 15cddddb837495b9b6441ae201d3be66e897614e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:45:43 -0700 Subject: [PATCH] ppc64: Add support for hotplug cpus From: Joel Schopp Add support for hotplug cpus --- arch/ppc64/Kconfig | 8 ++ arch/ppc64/kernel/idle.c | 14 +++ arch/ppc64/kernel/irq.c | 28 +++-- arch/ppc64/kernel/rtas.c | 19 +++ arch/ppc64/kernel/setup.c | 11 +- arch/ppc64/kernel/smp.c | 302 ++++++++++++++++++++++++++++++++++++++++++---- arch/ppc64/kernel/xics.c | 98 +++++++++++++-- include/asm-ppc64/rtas.h | 2 + include/asm-ppc64/smp.h | 3 + 9 files changed, 444 insertions(+), 41 deletions(-) (limited to 'include') diff --git a/arch/ppc64/Kconfig b/arch/ppc64/Kconfig index 729a949ebe35..9b2f319d0cca 100644 --- a/arch/ppc64/Kconfig +++ b/arch/ppc64/Kconfig @@ -248,6 +248,14 @@ source "fs/Kconfig.binfmt" source "drivers/pci/Kconfig" +config HOTPLUG_CPU + bool "Support for hot-pluggable CPUs" + depends on SMP && HOTPLUG && EXPERIMENTAL + ---help--- + Say Y here to be able to turn CPUs off and on. + + Say N if you are unsure. 
+ source "drivers/pcmcia/Kconfig" source "drivers/pci/hotplug/Kconfig" diff --git a/arch/ppc64/kernel/idle.c b/arch/ppc64/kernel/idle.c index 3ec662afac29..b30aea273974 100644 --- a/arch/ppc64/kernel/idle.c +++ b/arch/ppc64/kernel/idle.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -150,12 +151,18 @@ int default_idle(void) } schedule(); + if (cpu_is_offline(smp_processor_id()) && + system_state == SYSTEM_RUNNING) + cpu_die(); } return 0; } #ifdef CONFIG_PPC_PSERIES + +DECLARE_PER_CPU(smt_snooze_delay); + int dedicated_idle(void) { long oldval; @@ -236,6 +243,9 @@ int dedicated_idle(void) HMT_medium(); lpaca->xLpPaca.xIdle = 0; schedule(); + if (cpu_is_offline(smp_processor_id()) && + system_state == SYSTEM_RUNNING) + cpu_die(); } return 0; } @@ -245,6 +255,10 @@ int shared_idle(void) struct paca_struct *lpaca = get_paca(); while (1) { + if (cpu_is_offline(smp_processor_id()) && + system_state == SYSTEM_RUNNING) + cpu_die(); + /* Indicate to the HV that we are idle. Now would be * a good time to find other work to dispatch. */ lpaca->xLpPaca.xIdle = 1; diff --git a/arch/ppc64/kernel/irq.c b/arch/ppc64/kernel/irq.c index 29a66ad7dc5d..70d7c0ed892e 100644 --- a/arch/ppc64/kernel/irq.c +++ b/arch/ppc64/kernel/irq.c @@ -683,6 +683,7 @@ static struct proc_dir_entry * root_irq_dir; static struct proc_dir_entry * irq_dir [NR_IRQS]; static struct proc_dir_entry * smp_affinity_entry [NR_IRQS]; +/* Protected by irq descriptor spinlock */ #ifdef CONFIG_IRQ_ALL_CPUS cpumask_t irq_affinity [NR_IRQS] = { [0 ... 
NR_IRQS-1] = CPU_MASK_ALL }; #else /* CONFIG_IRQ_ALL_CPUS */ @@ -702,16 +703,17 @@ static int irq_affinity_read_proc (char *page, char **start, off_t off, static int irq_affinity_write_proc (struct file *file, const char *buffer, unsigned long count, void *data) { - int irq = (long)data, full_count = count, err; + int irq = (long)data; + int ret; cpumask_t new_value, tmp; cpumask_t allcpus = CPU_MASK_ALL; if (!irq_desc[irq].handler->set_affinity) return -EIO; - err = cpumask_parse(buffer, count, new_value); - if (err) - return err; + ret = cpumask_parse(buffer, count, new_value); + if (ret != 0) + return ret; /* * We check for CPU_MASK_ALL in xics to send irqs to all cpus. @@ -721,19 +723,30 @@ static int irq_affinity_write_proc (struct file *file, const char *buffer, */ cpus_and(new_value, new_value, allcpus); + /* + * Grab lock here so cpu_online_map can't change, and also + * protect irq_affinity[]. + */ + spin_lock(&irq_desc[irq].lock); + /* * Do not allow disabling IRQs completely - it's a too easy * way to make the system unusable accidentally :-) At least * one online CPU still has to be targeted. */ cpus_and(tmp, new_value, cpu_online_map); - if (cpus_empty(tmp)) - return -EINVAL; + if (cpus_empty(tmp)) { + ret = -EINVAL; + goto out; + } irq_affinity[irq] = new_value; irq_desc[irq].handler->set_affinity(irq, new_value); + ret = count; - return full_count; +out: + spin_unlock(&irq_desc[irq].lock); + return ret; } static int prof_cpu_mask_read_proc (char *page, char **start, off_t off, @@ -946,5 +959,4 @@ unsigned int real_irq_to_virt_slowpath(unsigned int real_irq) } - #endif diff --git a/arch/ppc64/kernel/rtas.c b/arch/ppc64/kernel/rtas.c index 143b2f7ce5e7..afa36ac32add 100644 --- a/arch/ppc64/kernel/rtas.c +++ b/arch/ppc64/kernel/rtas.c @@ -494,6 +494,25 @@ asmlinkage int ppc_rtas(struct rtas_args __user *uargs) return 0; } +#ifdef CONFIG_HOTPLUG_CPU +/* This version can't take the spinlock. 
*/ + +void rtas_stop_self(void) +{ + struct rtas_args *rtas_args = &(get_paca()->xRtas); + + rtas_args->token = rtas_token("stop-self"); + BUG_ON(rtas_args->token == RTAS_UNKNOWN_SERVICE); + rtas_args->nargs = 0; + rtas_args->nret = 1; + rtas_args->rets = &(rtas_args->args[0]); + + printk("%u %u Ready to die...\n", + smp_processor_id(), hard_smp_processor_id()); + enter_rtas((void *)__pa(rtas_args)); + panic("Alas, I survived.\n"); +} +#endif /* CONFIG_HOTPLUG_CPU */ EXPORT_SYMBOL(rtas_firmware_flash_list); EXPORT_SYMBOL(rtas_token); diff --git a/arch/ppc64/kernel/setup.c b/arch/ppc64/kernel/setup.c index efd3a598466e..0c230d04e9d6 100644 --- a/arch/ppc64/kernel/setup.c +++ b/arch/ppc64/kernel/setup.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -338,8 +339,13 @@ static int show_cpuinfo(struct seq_file *m, void *v) return 0; } - if (!cpu_online(cpu_id)) + /* We only show online cpus: disable preempt (overzealous, I + * knew) to prevent cpu going down. */ + preempt_disable(); + if (!cpu_online(cpu_id)) { + preempt_enable(); return 0; + } #ifdef CONFIG_SMP pvr = per_cpu(pvr, cpu_id); @@ -372,7 +378,8 @@ static int show_cpuinfo(struct seq_file *m, void *v) ppc_proc_freq % 1000000); seq_printf(m, "revision\t: %hd.%hd\n\n", maj, min); - + + preempt_enable(); return 0; } diff --git a/arch/ppc64/kernel/smp.c b/arch/ppc64/kernel/smp.c index f671515c0676..72144b6122f9 100644 --- a/arch/ppc64/kernel/smp.c +++ b/arch/ppc64/kernel/smp.c @@ -230,10 +230,237 @@ static void __devinit smp_openpic_setup_cpu(int cpu) do_openpic_setup_cpu(); } +#ifdef CONFIG_HOTPLUG_CPU +/* Get state of physical CPU. + * Return codes: + * 0 - The processor is in the RTAS stopped state + * 1 - stop-self is in progress + * 2 - The processor is not in the RTAS stopped state + * -1 - Hardware Error + * -2 - Hardware Busy, Try again later. 
+ */ +static int query_cpu_stopped(unsigned int pcpu) +{ + long cpu_status; + int status, qcss_tok; + + qcss_tok = rtas_token("query-cpu-stopped-state"); + BUG_ON(qcss_tok == RTAS_UNKNOWN_SERVICE); + status = rtas_call(qcss_tok, 1, 2, &cpu_status, pcpu); + if (status != 0) { + printk(KERN_ERR + "RTAS query-cpu-stopped-state failed: %i\n", status); + return status; + } + + return cpu_status; +} + +int __cpu_disable(void) +{ + /* FIXME: go put this in a header somewhere */ + extern void xics_migrate_irqs_away(void); + + systemcfg->processorCount--; + + /*fix boot_cpuid here*/ + if (smp_processor_id() == boot_cpuid) + boot_cpuid = any_online_cpu(cpu_online_map); + + /* FIXME: abstract this to not be platform specific later on */ + xics_migrate_irqs_away(); + return 0; +} + +void __cpu_die(unsigned int cpu) +{ + int tries; + int cpu_status; + unsigned int pcpu = get_hard_smp_processor_id(cpu); + + for (tries = 0; tries < 5; tries++) { + cpu_status = query_cpu_stopped(pcpu); + + if (cpu_status == 0) + break; + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(HZ); + } + if (cpu_status != 0) { + printk("Querying DEAD? cpu %i (%i) shows %i\n", + cpu, pcpu, cpu_status); + } + + /* Isolation and deallocation are definitely done by + * drslot_chrp_cpu. If they were not they would be + * done here. Change isolate state to Isolate and + * change allocation-state to Unusable. + */ + paca[cpu].xProcStart = 0; + + /* So we can recognize if it fails to come up next time. */ + cpu_callin_map[cpu] = 0; +} + +/* Kill this cpu */ +void cpu_die(void) +{ + local_irq_disable(); + rtas_stop_self(); + /* Should never get here... */ + BUG(); + for(;;); +} + +/* Search all cpu device nodes for an offline logical cpu. If a + * device node has a "ibm,my-drc-index" property (meaning this is an + * LPAR), paranoid-check whether we own the cpu. For each "thread" + * of a cpu, if it is offline and has the same hw index as before, + * grab that in preference. 
+ */ +static unsigned int find_physical_cpu_to_start(unsigned int old_hwindex) +{ + struct device_node *np = NULL; + unsigned int best = -1U; + + while ((np = of_find_node_by_type(np, "cpu"))) { + int nr_threads, len; + u32 *index = (u32 *)get_property(np, "ibm,my-drc-index", NULL); + u32 *tid = (u32 *) + get_property(np, "ibm,ppc-interrupt-server#s", &len); + + if (!tid) + tid = (u32 *)get_property(np, "reg", &len); + + if (!tid) + continue; + + /* If there is a drc-index, make sure that we own + * the cpu. + */ + if (index) { + int state; + int rc = rtas_get_sensor(9003, *index, &state); + if (rc != 0 || state != 1) + continue; + } + + nr_threads = len / sizeof(u32); + + while (nr_threads--) { + if (0 == query_cpu_stopped(tid[nr_threads])) { + best = tid[nr_threads]; + if (best == old_hwindex) + goto out; + } + } + } +out: + of_node_put(np); + return best; +} + +/** + * smp_startup_cpu() - start the given cpu + * + * At boot time, there is nothing to do. At run-time, call RTAS with + * the appropriate start location, if the cpu is in the RTAS stopped + * state. + * + * Returns: + * 0 - failure + * 1 - success + */ +static inline int __devinit smp_startup_cpu(unsigned int lcpu) +{ + int status; + extern void (*pseries_secondary_smp_init)(unsigned int cpu); + unsigned long start_here = __pa(pseries_secondary_smp_init); + unsigned int pcpu; + + /* At boot time the cpus are already spinning in hold + * loops, so nothing to do. */ + if (system_state == SYSTEM_BOOTING) + return 1; + + pcpu = find_physical_cpu_to_start(get_hard_smp_processor_id(lcpu)); + if (pcpu == -1U) { + printk(KERN_INFO "No more cpus available, failing\n"); + return 0; + } + + /* Fixup atomic count: it exited inside IRQ handler. */ + ((struct task_struct *)paca[lcpu].xCurrent)->thread_info->preempt_count + = 0; + /* Fixup SLB round-robin so next segment (kernel) goes in segment 0 */ + paca[lcpu].xStab_data.next_round_robin = 0; + + /* At boot this is done in prom.c. 
*/ + paca[lcpu].xHwProcNum = pcpu; + + status = rtas_call(rtas_token("start-cpu"), 3, 1, NULL, + pcpu, start_here, lcpu); + if (status != 0) { + printk(KERN_ERR "start-cpu failed: %i\n", status); + return 0; + } + return 1; +} + +static inline void look_for_more_cpus(void) +{ + int num_addr_cell, num_size_cell, len, i, maxcpus; + struct device_node *np; + unsigned int *ireg; + + /* Find the property which will tell us about how many CPUs + * we're allowed to have. */ + if ((np = find_path_device("/rtas")) == NULL) { + printk(KERN_ERR "Could not find /rtas in device tree!"); + return; + } + num_addr_cell = prom_n_addr_cells(np); + num_size_cell = prom_n_size_cells(np); + + ireg = (unsigned int *)get_property(np, "ibm,lrdr-capacity", &len); + if (ireg == NULL) { + /* FIXME: make sure not marked as lrdr_capable() */ + return; + } + + maxcpus = ireg[num_addr_cell + num_size_cell]; + /* DRENG need to account for threads here too */ + + if (maxcpus > NR_CPUS) { + printk(KERN_WARNING + "Partition configured for %d cpus, " + "operating system maximum is %d.\n", maxcpus, NR_CPUS); + maxcpus = NR_CPUS; + } else + printk(KERN_INFO "Partition configured for %d cpus.\n", + maxcpus); + + /* Make those cpus (which might appear later) possible too. */ + for (i = 0; i < maxcpus; i++) + cpu_set(i, cpu_possible_map); +} +#else /* ... 
CONFIG_HOTPLUG_CPU */ +static inline int __devinit smp_startup_cpu(unsigned int lcpu) +{ + return 1; +} +static inline void look_for_more_cpus(void) +{ +} +#endif /* CONFIG_HOTPLUG_CPU */ + static void smp_pSeries_kick_cpu(int nr) { BUG_ON(nr < 0 || nr >= NR_CPUS); + if (!smp_startup_cpu(nr)) + return; + /* The processor is currently spinning, waiting * for the xProcStart field to become non-zero * After we set xProcStart, the processor will @@ -241,7 +468,7 @@ static void smp_pSeries_kick_cpu(int nr) */ paca[nr].xProcStart = 1; } -#endif +#endif /* CONFIG_PPC_PSERIES */ static void __init smp_space_timers(unsigned int max_cpus) { @@ -462,12 +689,9 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic, int wait) { struct call_data_struct data; - int ret = -1, cpus = num_online_cpus()-1; + int ret = -1, cpus; unsigned long timeout; - if (!cpus) - return 0; - data.func = func; data.info = info; atomic_set(&data.started, 0); @@ -476,6 +700,14 @@ int smp_call_function (void (*func) (void *info), void *info, int nonatomic, atomic_set(&data.finished, 0); spin_lock(&call_lock); + /* Must grab online cpu count with preempt disabled, otherwise + * it can change. 
*/ + cpus = num_online_cpus() - 1; + if (!cpus) { + ret = 0; + goto out; + } + call_data = &data; wmb(); /* Send a message to all other CPUs and wait for them to respond */ @@ -565,8 +797,31 @@ static void __devinit smp_store_cpu_info(int id) per_cpu(pvr, id) = _get_PVR(); } +static void __init smp_create_idle(unsigned int cpu) +{ + struct pt_regs regs; + struct task_struct *p; + + /* create a process for the processor */ + /* only regs.msr is actually used, and 0 is OK for it */ + memset(®s, 0, sizeof(struct pt_regs)); + p = copy_process(CLONE_VM | CLONE_IDLETASK, + 0, ®s, 0, NULL, NULL); + if (IS_ERR(p)) + panic("failed fork for CPU %u: %li", cpu, PTR_ERR(p)); + + wake_up_forked_process(p); + init_idle(p, cpu); + unhash_process(p); + + paca[cpu].xCurrent = (u64)p; + current_set[cpu] = p->thread_info; +} + void __init smp_prepare_cpus(unsigned int max_cpus) { + unsigned int cpu; + /* * setup_cpu may need to be called on the boot cpu. We havent * spun any cpus up but lets be paranoid. @@ -593,6 +848,8 @@ void __init smp_prepare_cpus(unsigned int max_cpus) * number of msecs off until someone does a settimeofday() */ do_gtod.tb_orig_stamp = tb_last_stamp; + + look_for_more_cpus(); #endif max_cpus = smp_ops->probe(); @@ -601,20 +858,31 @@ void __init smp_prepare_cpus(unsigned int max_cpus) __save_cpu_setup(); smp_space_timers(max_cpus); + + for_each_cpu(cpu) + if (cpu != boot_cpuid) + smp_create_idle(cpu); } void __devinit smp_prepare_boot_cpu(void) { - cpu_set(smp_processor_id(), cpu_online_map); - /* FIXME: what about cpu_possible()? 
*/ + BUG_ON(smp_processor_id() != boot_cpuid); + + /* cpu_possible is set up in prom.c */ + cpu_set(boot_cpuid, cpu_online_map); + + paca[boot_cpuid].xCurrent = (u64)current; + current_set[boot_cpuid] = current->thread_info; } int __devinit __cpu_up(unsigned int cpu) { - struct pt_regs regs; - struct task_struct *p; int c; + /* At boot, don't bother with non-present cpus -JSCHOPP */ + if (system_state == SYSTEM_BOOTING && !cpu_present_at_boot(cpu)) + return -ENOENT; + paca[cpu].prof_counter = 1; paca[cpu].prof_multiplier = 1; paca[cpu].default_decr = tb_ticks_per_jiffy / decr_overclock; @@ -632,19 +900,9 @@ int __devinit __cpu_up(unsigned int cpu) paca[cpu].xStab_data.real = virt_to_abs(tmp); } - /* create a process for the processor */ - /* only regs.msr is actually used, and 0 is OK for it */ - memset(&regs, 0, sizeof(struct pt_regs)); - p = copy_process(CLONE_VM|CLONE_IDLETASK, 0, &regs, 0, NULL, NULL); - if (IS_ERR(p)) - panic("failed fork for CPU %u: %li", cpu, PTR_ERR(p)); - - wake_up_forked_process(p); - init_idle(p, cpu); - unhash_process(p); - - paca[cpu].xCurrent = (u64)p; - current_set[cpu] = p->thread_info; + /* The information for processor bringup must be written out + * to main store before we release the processor. 
*/ + mb(); /* The information for processor bringup must * be written out to main store before we release diff --git a/arch/ppc64/kernel/xics.c b/arch/ppc64/kernel/xics.c index 9696dc866540..c4d4574cc675 100644 --- a/arch/ppc64/kernel/xics.c +++ b/arch/ppc64/kernel/xics.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -372,6 +373,9 @@ irqreturn_t xics_ipi_action(int irq, void *dev_id, struct pt_regs *regs) int cpu = smp_processor_id(); ops->qirr_info(cpu, 0xff); + + WARN_ON(cpu_is_offline(cpu)); + while (xics_ipi_message[cpu].value) { if (test_and_clear_bit(PPC_MSG_CALL_FUNCTION, &xics_ipi_message[cpu].value)) { @@ -514,6 +518,9 @@ nextnode: if (systemcfg->platform == PLATFORM_PSERIES) { #ifdef CONFIG_SMP for_each_cpu(i) { + /* FIXME: Do this dynamically! --RR */ + if (!cpu_present_at_boot(i)) + continue; xics_per_cpu[i] = __ioremap((ulong)inodes[get_hard_smp_processor_id(i)].addr, (ulong)inodes[get_hard_smp_processor_id(i)].size, _PAGE_NO_CACHE); @@ -575,9 +582,7 @@ void xics_request_IPIs(void) static void xics_set_affinity(unsigned int virq, cpumask_t cpumask) { - irq_desc_t *desc = irq_desc + virq; unsigned int irq; - unsigned long flags; long status; unsigned long xics_status[2]; unsigned long newmask; @@ -589,14 +594,12 @@ static void xics_set_affinity(unsigned int virq, cpumask_t cpumask) if (irq == XICS_IPI) return; - spin_lock_irqsave(&desc->lock, flags); - status = rtas_call(ibm_get_xive, 1, 3, (void *)&xics_status, irq); if (status) { printk(KERN_ERR "xics_set_affinity: irq=%d ibm,get-xive " "returns %ld\n", irq, status); - goto out; + return; } /* For the moment only implement delivery to all cpus or one cpu */ @@ -605,7 +608,7 @@ static void xics_set_affinity(unsigned int virq, cpumask_t cpumask) } else { cpus_and(tmp, cpu_online_map, cpumask); if (cpus_empty(tmp)) - goto out; + return; newmask = get_hard_smp_processor_id(first_cpu(cpumask)); } @@ -615,9 +618,86 @@ static void xics_set_affinity(unsigned int virq, 
cpumask_t cpumask) if (status) { printk(KERN_ERR "xics_set_affinity irq=%d ibm,set-xive " "returns %ld\n", irq, status); - goto out; + return; + } +} + +#ifdef CONFIG_HOTPLUG_CPU + +/* Interrupts are disabled. */ +void xics_migrate_irqs_away(void) +{ + int set_indicator = rtas_token("set-indicator"); + const unsigned long giqs = 9005UL; /* Global Interrupt Queue Server */ + unsigned long status = 0; + unsigned int irq, cpu = smp_processor_id(); + unsigned long xics_status[2]; + unsigned long flags; + + BUG_ON(set_indicator == RTAS_UNKNOWN_SERVICE); + + /* Reject any interrupt that was queued to us... */ + ops->cppr_info(cpu, 0); + iosync(); + + /* Refuse any new interrupts... */ + rtas_call(set_indicator, 3, 1, &status, giqs, + hard_smp_processor_id(), 0UL); + WARN_ON(status != 0); + + /* Allow IPIs again... */ + ops->cppr_info(cpu, DEFAULT_PRIORITY); + iosync(); + + printk(KERN_WARNING "HOTPLUG: Migrating IRQs away\n"); + for_each_irq(irq) { + irq_desc_t *desc = get_irq_desc(irq); + + /* We need to get IPIs still. */ + if (irq_offset_down(irq) == XICS_IPI) + continue; + + /* We only need to migrate enabled IRQS */ + if (desc == NULL || desc->handler == NULL + || desc->action == NULL + || desc->handler->set_affinity == NULL) + continue; + + spin_lock_irqsave(&desc->lock, flags); + + status = rtas_call(ibm_get_xive, 1, 3, (void *)&xics_status, + irq); + if (status) { + printk(KERN_ERR "migrate_irqs_away: irq=%d " + "ibm,get-xive returns %ld\n", + irq, status); + goto unlock; + } + + /* + * We only support delivery to all cpus or to one cpu. + * The irq has to be migrated only in the single cpu + * case. 
+ */ + if (xics_status[0] != get_hard_smp_processor_id(cpu)) + goto unlock; + + printk(KERN_WARNING "IRQ %d affinity broken off cpu %u\n", + irq, cpu); + + /* Reset affinity to all cpus */ + xics_status[0] = default_distrib_server; + + status = rtas_call(ibm_set_xive, 3, 1, NULL, + irq, xics_status[0], xics_status[1]); + if (status) + printk(KERN_ERR "migrate_irqs_away irq=%d " + "ibm,set-xive returns %ld\n", + irq, status); + +unlock: + spin_unlock_irqrestore(&desc->lock, flags); } -out: - spin_unlock_irqrestore(&desc->lock, flags); } +#endif diff --git a/include/asm-ppc64/rtas.h b/include/asm-ppc64/rtas.h index 62838ce91e59..712747a2b3f9 100644 --- a/include/asm-ppc64/rtas.h +++ b/include/asm-ppc64/rtas.h @@ -219,6 +219,8 @@ extern void pSeries_log_error(char *buf, unsigned int err_type, int fatal); extern spinlock_t rtas_data_buf_lock; extern char rtas_data_buf[RTAS_DATA_BUF_SIZE]; +extern void rtas_stop_self(void); + /* RMO buffer reserved for user-space RTAS use */ extern unsigned long rtas_rmo_buf; diff --git a/include/asm-ppc64/smp.h b/include/asm-ppc64/smp.h index 22fc412bcfc9..8a96f975e496 100644 --- a/include/asm-ppc64/smp.h +++ b/include/asm-ppc64/smp.h @@ -70,6 +70,9 @@ extern cpumask_t cpu_available_map; void smp_init_iSeries(void); void smp_init_pSeries(void); +extern int __cpu_disable(void); +extern void __cpu_die(unsigned int cpu); +extern void cpu_die(void) __attribute__((noreturn)); #endif /* !(CONFIG_SMP) */ #define get_hard_smp_processor_id(CPU) (paca[(CPU)].xHwProcNum) -- cgit v1.2.3 From 69bc70b9552bcf74b35053ba91bc4c909522dcf0 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:45:56 -0700 Subject: [PATCH] ppc64: Add RTAS os-term call for panic on pSeries From: Michael Strosaker Add RTAS os-term call for panic on pSeries --- arch/ppc64/kernel/chrp_setup.c | 1 + arch/ppc64/kernel/rtas.c | 21 +++++++++++++++++++++ arch/ppc64/kernel/setup.c | 19 +++++++++++++++++++ include/asm-ppc64/machdep.h | 1 + include/asm-ppc64/rtas.h | 1 
+ 5 files changed, 43 insertions(+) (limited to 'include') diff --git a/arch/ppc64/kernel/chrp_setup.c b/arch/ppc64/kernel/chrp_setup.c index a2e281768381..4d7756de570f 100644 --- a/arch/ppc64/kernel/chrp_setup.c +++ b/arch/ppc64/kernel/chrp_setup.c @@ -267,6 +267,7 @@ chrp_init(unsigned long r3, unsigned long r4, unsigned long r5, ppc_md.restart = rtas_restart; ppc_md.power_off = rtas_power_off; ppc_md.halt = rtas_halt; + ppc_md.panic = rtas_os_term; ppc_md.get_boot_time = pSeries_get_boot_time; ppc_md.get_rtc_time = pSeries_get_rtc_time; diff --git a/arch/ppc64/kernel/rtas.c b/arch/ppc64/kernel/rtas.c index afa36ac32add..4a27c3d8312c 100644 --- a/arch/ppc64/kernel/rtas.c +++ b/arch/ppc64/kernel/rtas.c @@ -448,6 +448,27 @@ rtas_halt(void) rtas_power_off(); } +/* Must be in the RMO region, so we place it here */ +static char rtas_os_term_buf[2048]; + +void rtas_os_term(char *str) +{ + long status; + + snprintf(rtas_os_term_buf, 2048, "OS panic: %s", str); + + do { + status = rtas_call(rtas_token("ibm,os-term"), 1, 1, NULL, + __pa(rtas_os_term_buf)); + + if (status == RTAS_BUSY) + udelay(1); + else if (status != 0) + printk(KERN_EMERG "ibm,os-term call failed %ld\n", + status); + } while (status == RTAS_BUSY); +} + unsigned long rtas_rmo_buf = 0; asmlinkage int ppc_rtas(struct rtas_args __user *uargs) diff --git a/arch/ppc64/kernel/setup.c b/arch/ppc64/kernel/setup.c index 0c230d04e9d6..9c220a4c039c 100644 --- a/arch/ppc64/kernel/setup.c +++ b/arch/ppc64/kernel/setup.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -94,6 +95,13 @@ unsigned long SYSRQ_KEY; struct machdep_calls ppc_md; +static int ppc64_panic_event(struct notifier_block *, unsigned long, void *); + +static struct notifier_block ppc64_panic_block = { + notifier_call: ppc64_panic_event, + priority: INT_MIN /* may not return; must be done last */ +}; + /* * Perhaps we can put the pmac screen_info[] here * on pmac as well so we don't need the ifdef's. 
@@ -319,6 +327,14 @@ EXPORT_SYMBOL(machine_halt); unsigned long ppc_proc_freq; unsigned long ppc_tb_freq; +static int ppc64_panic_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + ppc_md.panic((char *)ptr); /* May not return */ + return NOTIFY_DONE; +} + + #ifdef CONFIG_SMP DEFINE_PER_CPU(unsigned int, pvr); #endif @@ -605,6 +621,9 @@ void __init setup_arch(char **cmdline_p) /* reboot on panic */ panic_timeout = 180; + if (ppc_md.panic) + notifier_chain_register(&panic_notifier_list, &ppc64_panic_block); + init_mm.start_code = PAGE_OFFSET; init_mm.end_code = (unsigned long) _etext; init_mm.end_data = (unsigned long) _edata; diff --git a/include/asm-ppc64/machdep.h b/include/asm-ppc64/machdep.h index a4d181d79c21..10e7e9ec6251 100644 --- a/include/asm-ppc64/machdep.h +++ b/include/asm-ppc64/machdep.h @@ -79,6 +79,7 @@ struct machdep_calls { void (*restart)(char *cmd); void (*power_off)(void); void (*halt)(void); + void (*panic)(char *str); int (*set_rtc_time)(struct rtc_time *); void (*get_rtc_time)(struct rtc_time *); diff --git a/include/asm-ppc64/rtas.h b/include/asm-ppc64/rtas.h index 712747a2b3f9..7f6139064c7c 100644 --- a/include/asm-ppc64/rtas.h +++ b/include/asm-ppc64/rtas.h @@ -175,6 +175,7 @@ extern void call_rtas_display_status(char); extern void rtas_restart(char *cmd); extern void rtas_power_off(void); extern void rtas_halt(void); +extern void rtas_os_term(char *str); extern int rtas_get_sensor(int sensor, int index, int *state); extern int rtas_get_power_level(int powerdomain, int *level); extern int rtas_set_power_level(int powerdomain, int level, int *setlevel); -- cgit v1.2.3 From b902751692396b28b9dc5b9c7266bfc32090f333 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:46:22 -0700 Subject: [PATCH] ppc64: irq cleanups From: Paul Mackerras Create and use irq_offset_up/down, get_irq_desc, for_each_irq --- arch/ppc64/kernel/chrp_setup.c | 2 +- arch/ppc64/kernel/i8259.c | 4 +-- arch/ppc64/kernel/iSeries_irq.c 
| 4 +-- arch/ppc64/kernel/iSeries_setup.h | 3 -- arch/ppc64/kernel/irq.c | 59 ++++++++++++++++++++------------------- arch/ppc64/kernel/open_pic.c | 23 +++++++++------ arch/ppc64/kernel/open_pic.h | 8 +----- arch/ppc64/kernel/prom.c | 4 +-- arch/ppc64/kernel/ras.c | 5 ++-- arch/ppc64/kernel/setup.c | 3 +- arch/ppc64/kernel/vio.c | 3 +- arch/ppc64/kernel/xics.c | 36 +++++++++++------------- include/asm-ppc64/hw_irq.h | 21 ++++++++++++-- include/asm-ppc64/irq.h | 41 ++++++++++++++++++++++----- include/asm-ppc64/smp.h | 2 ++ 15 files changed, 130 insertions(+), 88 deletions(-) (limited to 'include') diff --git a/arch/ppc64/kernel/chrp_setup.c b/arch/ppc64/kernel/chrp_setup.c index 4d7756de570f..5a1d60221c15 100644 --- a/arch/ppc64/kernel/chrp_setup.c +++ b/arch/ppc64/kernel/chrp_setup.c @@ -252,7 +252,7 @@ chrp_init(unsigned long r3, unsigned long r4, unsigned long r5, ppc_md.setup_arch = chrp_setup_arch; ppc_md.get_cpuinfo = chrp_get_cpuinfo; - if(naca->interrupt_controller == IC_OPEN_PIC) { + if (naca->interrupt_controller == IC_OPEN_PIC) { ppc_md.init_IRQ = pSeries_init_openpic; ppc_md.get_irq = openpic_get_irq; } else { diff --git a/arch/ppc64/kernel/i8259.c b/arch/ppc64/kernel/i8259.c index c1026da59fb7..2f2b9bf8cf1c 100644 --- a/arch/ppc64/kernel/i8259.c +++ b/arch/ppc64/kernel/i8259.c @@ -124,8 +124,8 @@ static void i8259_unmask_irq(unsigned int irq_nr) static void i8259_end_irq(unsigned int irq) { - if (!(irq_desc[irq].status & (IRQ_DISABLED|IRQ_INPROGRESS)) && - irq_desc[irq].action) + if (!(get_irq_desc(irq)->status & (IRQ_DISABLED|IRQ_INPROGRESS)) && + get_irq_desc(irq)->action) i8259_unmask_irq(irq); } diff --git a/arch/ppc64/kernel/iSeries_irq.c b/arch/ppc64/kernel/iSeries_irq.c index abbe9a499377..fe6d63676c09 100644 --- a/arch/ppc64/kernel/iSeries_irq.c +++ b/arch/ppc64/kernel/iSeries_irq.c @@ -122,8 +122,8 @@ void __init iSeries_activate_IRQs() int irq; unsigned long flags; - for (irq = 0; irq < NR_IRQS; irq++) { - irq_desc_t *desc = &irq_desc[irq]; + 
for_each_irq (irq) { + irq_desc_t *desc = get_irq_desc(irq); if (desc && desc->handler && desc->handler->startup) { spin_lock_irqsave(&desc->lock, flags); diff --git a/arch/ppc64/kernel/iSeries_setup.h b/arch/ppc64/kernel/iSeries_setup.h index 53776d403508..240dad4ef20c 100644 --- a/arch/ppc64/kernel/iSeries_setup.h +++ b/arch/ppc64/kernel/iSeries_setup.h @@ -19,8 +19,6 @@ #ifndef __ISERIES_SETUP_H__ #define __ISERIES_SETUP_H__ -#include /* for irq_desc_t */ - extern void iSeries_init_early(void); extern void iSeries_init(unsigned long r3, unsigned long ird_start, unsigned long ird_end, unsigned long cline_start, @@ -29,7 +27,6 @@ extern void iSeries_setup_arch(void); extern void iSeries_setup_residual(struct seq_file *m, int cpu_id); extern void iSeries_get_cpuinfo(struct seq_file *m); extern void iSeries_init_IRQ(void); -extern void iSeries_init_irq_desc(irq_desc_t *); extern int iSeries_get_irq(struct pt_regs *regs); extern void iSeries_restart(char *cmd); extern void iSeries_power_off(void); diff --git a/arch/ppc64/kernel/irq.c b/arch/ppc64/kernel/irq.c index 70d7c0ed892e..5a4d3e47241e 100644 --- a/arch/ppc64/kernel/irq.c +++ b/arch/ppc64/kernel/irq.c @@ -67,6 +67,7 @@ irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = { } }; +int __irq_offset_value; int ppc_spurious_interrupts = 0; unsigned long lpEvent_count = 0; @@ -76,7 +77,7 @@ setup_irq(unsigned int irq, struct irqaction * new) int shared = 0; unsigned long flags; struct irqaction *old, **p; - irq_desc_t *desc = irq_desc + irq; + irq_desc_t *desc = get_irq_desc(irq); /* * Some drivers like serial.c use request_irq() heavily, @@ -134,7 +135,7 @@ setup_irq(unsigned int irq, struct irqaction * new) inline void synchronize_irq(unsigned int irq) { - while (irq_desc[irq].status & IRQ_INPROGRESS) + while (get_irq_desc(irq)->status & IRQ_INPROGRESS) cpu_relax(); } @@ -148,11 +149,10 @@ EXPORT_SYMBOL(synchronize_irq); static int do_free_irq(int irq, void* dev_id) { - irq_desc_t *desc; + irq_desc_t *desc = 
get_irq_desc(irq); struct irqaction **p; unsigned long flags; - desc = irq_desc + irq; spin_lock_irqsave(&desc->lock,flags); p = &desc->action; for (;;) { @@ -247,7 +247,7 @@ EXPORT_SYMBOL(free_irq); inline void disable_irq_nosync(unsigned int irq) { - irq_desc_t *desc = irq_desc + irq; + irq_desc_t *desc = get_irq_desc(irq); unsigned long flags; spin_lock_irqsave(&desc->lock, flags); @@ -276,7 +276,7 @@ EXPORT_SYMBOL(disable_irq_nosync); void disable_irq(unsigned int irq) { - irq_desc_t *desc = irq_desc + irq; + irq_desc_t *desc = get_irq_desc(irq); disable_irq_nosync(irq); if (desc->action) synchronize_irq(irq); @@ -296,7 +296,7 @@ EXPORT_SYMBOL(disable_irq); void enable_irq(unsigned int irq) { - irq_desc_t *desc = irq_desc + irq; + irq_desc_t *desc = get_irq_desc(irq); unsigned long flags; spin_lock_irqsave(&desc->lock, flags); @@ -327,6 +327,7 @@ int show_interrupts(struct seq_file *p, void *v) { int i = *(loff_t *) v, j; struct irqaction * action; + irq_desc_t *desc; unsigned long flags; if (i == 0) { @@ -339,8 +340,9 @@ int show_interrupts(struct seq_file *p, void *v) } if (i < NR_IRQS) { - spin_lock_irqsave(&irq_desc[i].lock, flags); - action = irq_desc[i].action; + desc = get_irq_desc(i); + spin_lock_irqsave(&desc->lock, flags); + action = desc->action; if (!action || !action->handler) goto skip; seq_printf(p, "%3d: ", i); @@ -352,17 +354,17 @@ int show_interrupts(struct seq_file *p, void *v) #else seq_printf(p, "%10u ", kstat_irqs(i)); #endif /* CONFIG_SMP */ - if (irq_desc[i].handler) - seq_printf(p, " %s ", irq_desc[i].handler->typename ); + if (desc->handler) + seq_printf(p, " %s ", desc->handler->typename ); else seq_printf(p, " None "); - seq_printf(p, "%s", (irq_desc[i].status & IRQ_LEVEL) ? "Level " : "Edge "); + seq_printf(p, "%s", (desc->status & IRQ_LEVEL) ? 
"Level " : "Edge "); seq_printf(p, " %s",action->name); for (action=action->next; action; action = action->next) seq_printf(p, ", %s", action->name); seq_putc(p, '\n'); skip: - spin_unlock_irqrestore(&irq_desc[i].lock, flags); + spin_unlock_irqrestore(&desc->lock, flags); } else if (i == NR_IRQS) seq_printf(p, "BAD: %10u\n", ppc_spurious_interrupts); return 0; @@ -482,7 +484,7 @@ void ppc_irq_dispatch_handler(struct pt_regs *regs, int irq) int status; struct irqaction *action; int cpu = smp_processor_id(); - irq_desc_t *desc = irq_desc + irq; + irq_desc_t *desc = get_irq_desc(irq); irqreturn_t action_ret; kstat_cpu(cpu).irqs[irq]++; @@ -564,11 +566,11 @@ out: * The ->end() handler has to deal with interrupts which got * disabled while the handler was running. */ - if (irq_desc[irq].handler) { - if (irq_desc[irq].handler->end) - irq_desc[irq].handler->end(irq); - else if (irq_desc[irq].handler->enable) - irq_desc[irq].handler->enable(irq); + if (desc->handler) { + if (desc->handler->end) + desc->handler->end(irq); + else if (desc->handler->enable) + desc->handler->enable(irq); } spin_unlock(&desc->lock); } @@ -683,7 +685,7 @@ static struct proc_dir_entry * root_irq_dir; static struct proc_dir_entry * irq_dir [NR_IRQS]; static struct proc_dir_entry * smp_affinity_entry [NR_IRQS]; -/* Protected by irq descriptor spinlock */ +/* Protected by get_irq_desc(irq)->lock. */ #ifdef CONFIG_IRQ_ALL_CPUS cpumask_t irq_affinity [NR_IRQS] = { [0 ... 
NR_IRQS-1] = CPU_MASK_ALL }; #else /* CONFIG_IRQ_ALL_CPUS */ @@ -703,12 +705,13 @@ static int irq_affinity_read_proc (char *page, char **start, off_t off, static int irq_affinity_write_proc (struct file *file, const char *buffer, unsigned long count, void *data) { - int irq = (long)data; + unsigned int irq = (long)data; + irq_desc_t *desc = get_irq_desc(irq); int ret; cpumask_t new_value, tmp; cpumask_t allcpus = CPU_MASK_ALL; - if (!irq_desc[irq].handler->set_affinity) + if (!desc->handler->set_affinity) return -EIO; ret = cpumask_parse(buffer, count, new_value); @@ -727,7 +730,7 @@ static int irq_affinity_write_proc (struct file *file, const char *buffer, * Grab lock here so cpu_online_map can't change, and also * protect irq_affinity[]. */ - spin_lock(&irq_desc[irq].lock); + spin_lock(&desc->lock); /* * Do not allow disabling IRQs completely - it's a too easy @@ -741,11 +744,11 @@ static int irq_affinity_write_proc (struct file *file, const char *buffer, } irq_affinity[irq] = new_value; - irq_desc[irq].handler->set_affinity(irq, new_value); + desc->handler->set_affinity(irq, new_value); ret = count; out: - spin_unlock(&irq_desc[irq].lock); + spin_unlock(&desc->lock); return ret; } @@ -841,8 +844,8 @@ void init_irq_proc (void) /* * Create entries for all existing IRQs. */ - for (i = 0; i < NR_IRQS; i++) { - if (irq_desc[i].handler == NULL) + for_each_irq(i) { + if (get_irq_desc(i)->handler == NULL) continue; register_irq_proc(i); } @@ -870,7 +873,7 @@ unsigned int virt_irq_to_real_map[NR_IRQS]; * we don't end up with an interrupt number >= NR_IRQS. 
*/ #define MIN_VIRT_IRQ 3 -#define MAX_VIRT_IRQ (NR_IRQS - NUM_8259_INTERRUPTS - 1) +#define MAX_VIRT_IRQ (NR_IRQS - NUM_ISA_INTERRUPTS - 1) #define NR_VIRT_IRQS (MAX_VIRT_IRQ - MIN_VIRT_IRQ + 1) void diff --git a/arch/ppc64/kernel/open_pic.c b/arch/ppc64/kernel/open_pic.c index e97d6ddc18d9..0eed791f3eb6 100644 --- a/arch/ppc64/kernel/open_pic.c +++ b/arch/ppc64/kernel/open_pic.c @@ -67,7 +67,6 @@ static void openpic_disable_irq(u_int irq); static void openpic_initirq(u_int irq, u_int pri, u_int vector, int polarity, int is_level); static void openpic_mapirq(u_int irq, u_int cpumask); -static void openpic_set_sense(u_int irq, int sense); static void find_ISUs(void); @@ -170,7 +169,7 @@ void __init pSeries_init_openpic(void) int i; unsigned int *addrp; unsigned char* chrp_int_ack_special = 0; - unsigned char init_senses[NR_IRQS - NUM_8259_INTERRUPTS]; + unsigned char init_senses[NR_IRQS - NUM_ISA_INTERRUPTS]; int nmi_irq = -1; #if defined(CONFIG_VT) && defined(CONFIG_ADB_KEYBOARD) && defined(XMON) struct device_node *kbd; @@ -185,12 +184,12 @@ void __init pSeries_init_openpic(void) __ioremap(addrp[prom_n_addr_cells(np)-1], 1, _PAGE_NO_CACHE); /* hydra still sets OpenPIC_InitSenses to a static set of values */ if (OpenPIC_InitSenses == NULL) { - prom_get_irq_senses(init_senses, NUM_8259_INTERRUPTS, NR_IRQS); + prom_get_irq_senses(init_senses, NUM_ISA_INTERRUPTS, NR_IRQS); OpenPIC_InitSenses = init_senses; - OpenPIC_NumInitSenses = NR_IRQS - NUM_8259_INTERRUPTS; + OpenPIC_NumInitSenses = NR_IRQS - NUM_ISA_INTERRUPTS; } - openpic_init(1, NUM_8259_INTERRUPTS, chrp_int_ack_special, nmi_irq); - for ( i = 0 ; i < NUM_8259_INTERRUPTS ; i++ ) + openpic_init(1, NUM_ISA_INTERRUPTS, chrp_int_ack_special, nmi_irq); + for (i = 0; i < NUM_ISA_INTERRUPTS; i++) irq_desc[i].handler = &i8259_pic; of_node_put(np); } @@ -441,7 +440,7 @@ static int __init openpic_setup_i8259(void) if (naca->interrupt_controller == IC_OPEN_PIC) { /* Initialize the cascade */ - if 
(request_irq(NUM_8259_INTERRUPTS, no_action, SA_INTERRUPT, + if (request_irq(NUM_ISA_INTERRUPTS, no_action, SA_INTERRUPT, "82c59 cascade", NULL)) printk(KERN_ERR "Unable to get OpenPIC IRQ 0 for cascade\n"); i8259_init(); @@ -820,13 +819,21 @@ static void openpic_mapirq(u_int irq, u_int physmask) * * sense: 1 for level, 0 for edge */ -static inline void openpic_set_sense(u_int irq, int sense) +#if 0 /* not used */ +static void openpic_set_sense(u_int irq, int sense) { openpic_safe_writefield(&GET_ISU(irq).Vector_Priority, OPENPIC_SENSE_LEVEL, (sense ? OPENPIC_SENSE_LEVEL : 0)); } +static int openpic_get_sense(u_int irq) +{ + return openpic_readfield(&GET_ISU(irq).Vector_Priority, + OPENPIC_SENSE_LEVEL) != 0; +} +#endif + static void openpic_end_irq(unsigned int irq_nr) { openpic_eoi(); diff --git a/arch/ppc64/kernel/open_pic.h b/arch/ppc64/kernel/open_pic.h index cf6a31f55c71..21f0a7afb84a 100644 --- a/arch/ppc64/kernel/open_pic.h +++ b/arch/ppc64/kernel/open_pic.h @@ -14,6 +14,7 @@ #include #include +#include #define OPENPIC_SIZE 0x40000 @@ -38,11 +39,4 @@ extern void openpic_init_processor(u_int cpumask); extern void openpic_setup_ISU(int isu_num, unsigned long addr); extern void openpic_cause_IPI(u_int ipi, u_int cpumask); -extern inline int openpic_to_irq(int irq) -{ - if (systemcfg->platform == PLATFORM_POWERMAC) - return irq; - return irq += NUM_8259_INTERRUPTS; -} -/*extern int open_pic_irq_offset;*/ #endif /* _PPC64_KERNEL_OPEN_PIC_H */ diff --git a/arch/ppc64/kernel/prom.c b/arch/ppc64/kernel/prom.c index e092a13594ce..f1cfd43dd39c 100644 --- a/arch/ppc64/kernel/prom.c +++ b/arch/ppc64/kernel/prom.c @@ -2191,7 +2191,7 @@ finish_node_interrupts(struct device_node *np, unsigned long mem_start) printk(KERN_CRIT "Could not allocate interrupt " "number for %s\n", np->full_name); } else - np->intrs[i].line = openpic_to_irq(virq); + np->intrs[i].line = irq_offset_up(virq); /* We offset irq numbers for the u3 MPIC by 128 in PowerMac */ if (systemcfg->platform == 
PLATFORM_POWERMAC && ic && ic->parent) { @@ -3019,7 +3019,7 @@ static int of_finish_dynamic_node_interrupts(struct device_node *node) "number for %s\n", node->full_name); return -ENOMEM; } - node->intrs[i].line = openpic_to_irq(virq); + node->intrs[i].line = irq_offset_up(virq); if (n > 1) node->intrs[i].sense = irq[1]; if (n > 2) { diff --git a/arch/ppc64/kernel/ras.c b/arch/ppc64/kernel/ras.c index 405bf1c51db0..c21e797a3c6d 100644 --- a/arch/ppc64/kernel/ras.c +++ b/arch/ppc64/kernel/ras.c @@ -1,4 +1,3 @@ - /* * ras.c * Copyright (C) 2001 Dave Engebretsen IBM Corporation @@ -80,7 +79,7 @@ static int __init init_ras_IRQ(void) "number for %s\n", np->full_name); break; } - request_irq(virq + NUM_8259_INTERRUPTS, + request_irq(irq_offset_up(virq), ras_error_interrupt, 0, "RAS_ERROR", NULL); ireg++; @@ -98,7 +97,7 @@ static int __init init_ras_IRQ(void) " number for %s\n", np->full_name); break; } - request_irq(virq + NUM_8259_INTERRUPTS, + request_irq(irq_offset_up(virq), ras_epow_interrupt, 0, "RAS_EPOW", NULL); ireg++; diff --git a/arch/ppc64/kernel/setup.c b/arch/ppc64/kernel/setup.c index 9c220a4c039c..24b62c83479c 100644 --- a/arch/ppc64/kernel/setup.c +++ b/arch/ppc64/kernel/setup.c @@ -25,8 +25,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -224,6 +224,7 @@ void setup_system(unsigned long r3, unsigned long r4, unsigned long r5, if (systemcfg->platform & PLATFORM_PSERIES) { early_console_initialized = 1; register_console(&udbg_console); + __irq_offset_value = NUM_ISA_INTERRUPTS; finish_device_tree(); chrp_init(r3, r4, r5, r6, r7); diff --git a/arch/ppc64/kernel/vio.c b/arch/ppc64/kernel/vio.c index f5eb219233ef..fba53a437e60 100644 --- a/arch/ppc64/kernel/vio.c +++ b/arch/ppc64/kernel/vio.c @@ -26,7 +26,6 @@ #include #include #include -#include "open_pic.h" #define DBGENTER() pr_debug("%s entered\n", __FUNCTION__) @@ -256,7 +255,7 @@ struct vio_dev * __devinit vio_register_device(struct device_node *of_node) 
printk(KERN_ERR "Unable to allocate interrupt " "number for %s\n", of_node->full_name); } else - viodev->irq = openpic_to_irq(virq); + viodev->irq = irq_offset_up(virq); } /* init generic 'struct device' fields: */ diff --git a/arch/ppc64/kernel/xics.c b/arch/ppc64/kernel/xics.c index e1d557dac270..50dadf89f5f7 100644 --- a/arch/ppc64/kernel/xics.c +++ b/arch/ppc64/kernel/xics.c @@ -59,7 +59,6 @@ struct hw_interrupt_type xics_8259_pic = { static struct radix_tree_root irq_map = RADIX_TREE_INIT(GFP_KERNEL); #define XICS_IPI 2 -#define XICS_IRQ_OFFSET 0x10 #define XICS_IRQ_SPURIOUS 0 /* Want a priority other than 0. Various HW issues require this. */ @@ -217,7 +216,7 @@ xics_ops pSeriesLP_ops = { static unsigned int xics_startup(unsigned int virq) { - virq -= XICS_IRQ_OFFSET; + virq = irq_offset_down(virq); if (radix_tree_insert(&irq_map, virt_irq_to_real(virq), &virt_irq_to_real_map[virq]) == -ENOMEM) printk(KERN_CRIT "Out of memory creating real -> virtual" @@ -242,8 +241,7 @@ static void xics_enable_irq(unsigned int virq) long call_status; unsigned int server; - virq -= XICS_IRQ_OFFSET; - irq = virt_irq_to_real(virq); + irq = virt_irq_to_real(irq_offset_down(virq)); if (irq == XICS_IPI) return; @@ -301,25 +299,25 @@ static void xics_disable_irq(unsigned int virq) { unsigned int irq; - virq -= XICS_IRQ_OFFSET; - irq = virt_irq_to_real(virq); + irq = virt_irq_to_real(irq_offset_down(virq)); xics_disable_real_irq(irq); } -static void xics_end_irq(unsigned int irq) +static void xics_end_irq(unsigned int irq) { int cpu = smp_processor_id(); iosync(); - ops->xirr_info_set(cpu, ((0xff<<24) | - (virt_irq_to_real(irq-XICS_IRQ_OFFSET)))); + ops->xirr_info_set(cpu, ((0xff << 24) | + (virt_irq_to_real(irq_offset_down(irq))))); + } static void xics_mask_and_ack_irq(unsigned int irq) { int cpu = smp_processor_id(); - if (irq < XICS_IRQ_OFFSET) { + if (irq < irq_offset_value()) { i8259_pic.ack(irq); iosync(); ops->xirr_info_set(cpu, ((0xff<<24) | @@ -345,7 +343,8 @@ int 
xics_get_irq(struct pt_regs *regs) irq = i8259_irq(cpu); if (irq == -1) { /* Spurious cascaded interrupt. Still must ack xics */ - xics_end_irq(XICS_IRQ_OFFSET + xics_irq_8259_cascade); + xics_end_irq(irq_offset_up(xics_irq_8259_cascade)); + irq = -1; } } else if (vec == XICS_IRQ_SPURIOUS) { @@ -359,7 +358,7 @@ int xics_get_irq(struct pt_regs *regs) " disabling it.\n", vec); xics_disable_real_irq(vec); } else - irq += XICS_IRQ_OFFSET; + irq = irq_offset_up(irq); } return irq; } @@ -541,9 +540,9 @@ nextnode: xics_8259_pic.enable = i8259_pic.enable; xics_8259_pic.disable = i8259_pic.disable; for (i = 0; i < 16; ++i) - irq_desc[i].handler = &xics_8259_pic; + get_irq_desc(i)->handler = &xics_8259_pic; for (; i < NR_IRQS; ++i) - irq_desc[i].handler = &xics_pic; + get_irq_desc(i)->handler = &xics_pic; ops->cppr_info(boot_cpuid, 0xff); iosync(); @@ -559,7 +558,7 @@ static int __init xics_setup_i8259(void) { if (naca->interrupt_controller == IC_PPC_XIC && xics_irq_8259_cascade != -1) { - if (request_irq(xics_irq_8259_cascade + XICS_IRQ_OFFSET, + if (request_irq(irq_offset_up(xics_irq_8259_cascade), no_action, 0, "8259 cascade", 0)) printk(KERN_ERR "xics_init_IRQ: couldn't get 8259 cascade\n"); i8259_init(); @@ -574,9 +573,9 @@ void xics_request_IPIs(void) virt_irq_to_real_map[XICS_IPI] = XICS_IPI; /* IPIs are marked SA_INTERRUPT as they must run with irqs disabled */ - request_irq(XICS_IPI + XICS_IRQ_OFFSET, xics_ipi_action, SA_INTERRUPT, + request_irq(irq_offset_up(XICS_IPI), xics_ipi_action, SA_INTERRUPT, "IPI", 0); - irq_desc[XICS_IPI+XICS_IRQ_OFFSET].status |= IRQ_PER_CPU; + get_irq_desc(irq_offset_up(XICS_IPI))->status |= IRQ_PER_CPU; } #endif @@ -589,8 +588,7 @@ static void xics_set_affinity(unsigned int virq, cpumask_t cpumask) cpumask_t allcpus = CPU_MASK_ALL; cpumask_t tmp = CPU_MASK_NONE; - virq -= XICS_IRQ_OFFSET; - irq = virt_irq_to_real(virq); + irq = virt_irq_to_real(irq_offset_down(virq)); if (irq == XICS_IPI) return; diff --git a/include/asm-ppc64/hw_irq.h 
b/include/asm-ppc64/hw_irq.h index 8db7a1a70756..baea40e695ec 100644 --- a/include/asm-ppc64/hw_irq.h +++ b/include/asm-ppc64/hw_irq.h @@ -75,9 +75,24 @@ static inline void __do_save_and_cli(unsigned long *flags) #endif /* CONFIG_PPC_ISERIES */ -#define mask_irq(irq) ({if (irq_desc[irq].handler && irq_desc[irq].handler->disable) irq_desc[irq].handler->disable(irq);}) -#define unmask_irq(irq) ({if (irq_desc[irq].handler && irq_desc[irq].handler->enable) irq_desc[irq].handler->enable(irq);}) -#define ack_irq(irq) ({if (irq_desc[irq].handler && irq_desc[irq].handler->ack) irq_desc[irq].handler->ack(irq);}) +#define mask_irq(irq) \ + ({ \ + irq_desc_t *desc = get_irq_desc(irq); \ + if (desc->handler && desc->handler->disable) \ + desc->handler->disable(irq); \ + }) +#define unmask_irq(irq) \ + ({ \ + irq_desc_t *desc = get_irq_desc(irq); \ + if (desc->handler && desc->handler->enable) \ + desc->handler->enable(irq); \ + }) +#define ack_irq(irq) \ + ({ \ + irq_desc_t *desc = get_irq_desc(irq); \ + if (desc->handler && desc->handler->ack) \ + desc->handler->ack(irq); \ + }) /* Should we handle this via lost interrupts and IPIs or should we don't care like * we do now ? --BenH. diff --git a/include/asm-ppc64/irq.h b/include/asm-ppc64/irq.h index f4ed6fe326dd..949e19f96be1 100644 --- a/include/asm-ppc64/irq.h +++ b/include/asm-ppc64/irq.h @@ -11,6 +11,11 @@ #include +/* + * Maximum number of interrupt sources that we can handle. + */ +#define NR_IRQS 512 + extern void disable_irq(unsigned int); extern void disable_irq_nosync(unsigned int); extern void enable_irq(unsigned int); @@ -18,12 +23,11 @@ extern void enable_irq(unsigned int); /* this number is used when no interrupt has been assigned */ #define NO_IRQ (-1) -/* - * this is the maximum number of virtual irqs we will use. - */ -#define NR_IRQS 512 +#define get_irq_desc(irq) (&irq_desc[(irq)]) -#define NUM_8259_INTERRUPTS 16 +/* Define a way to iterate across irqs. 
*/ +#define for_each_irq(i) \ + for ((i) = 0; (i) < NR_IRQS; ++(i)) /* Interrupt numbers are virtual in case they are sparsely * distributed by the hardware. @@ -41,12 +45,35 @@ static inline unsigned int virt_irq_to_real(unsigned int virt_irq) return virt_irq_to_real_map[virt_irq]; } +/* + * Because many systems have two overlapping name spaces for + * interrupts (ISA and XICS for example), and the ISA interrupts + * have historically not been easy to renumber, we allow ISA + * interrupts to take values 0 - 15, and shift up the remaining + * interrupts by 0x10. + */ +#define NUM_ISA_INTERRUPTS 0x10 +extern int __irq_offset_value; + +static inline int irq_offset_up(int irq) +{ + return(irq + __irq_offset_value); +} + +static inline int irq_offset_down(int irq) +{ + return(irq - __irq_offset_value); +} + +static inline int irq_offset_value(void) +{ + return __irq_offset_value; +} + static __inline__ int irq_canonicalize(int irq) { return irq; } -#define NR_MASK_WORDS ((NR_IRQS + 63) / 64) - #endif /* _ASM_IRQ_H */ #endif /* __KERNEL__ */ diff --git a/include/asm-ppc64/smp.h b/include/asm-ppc64/smp.h index 8a96f975e496..3d7e3d7c7663 100644 --- a/include/asm-ppc64/smp.h +++ b/include/asm-ppc64/smp.h @@ -67,6 +67,8 @@ extern cpumask_t cpu_available_map; #endif #define PPC_MSG_DEBUGGER_BREAK 3 +extern cpumask_t irq_affinity[]; + void smp_init_iSeries(void); void smp_init_pSeries(void); -- cgit v1.2.3 From 0e75cd7813f82e49975d3ba8a1bf6113aa497547 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:46:59 -0700 Subject: [PATCH] ppc64: Update CPU features From: Anton Blanchard Update CPU features. Remove DABR feature, all cpus have it.
Add MMCRA, PMC8, SMT, COHERENT_ICACHE, LOCKLESS_TLBIE features --- arch/ppc64/kernel/cputable.c | 30 +++++++++++++++++------------- arch/ppc64/xmon/xmon.c | 9 ++------- include/asm-ppc64/cputable.h | 8 ++++++-- 3 files changed, 25 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/arch/ppc64/kernel/cputable.c b/arch/ppc64/kernel/cputable.c index 672cd6a30169..df13c89ff25c 100644 --- a/arch/ppc64/kernel/cputable.c +++ b/arch/ppc64/kernel/cputable.c @@ -48,7 +48,7 @@ struct cpu_spec cpu_specs[] = { { /* Power3 */ 0xffff0000, 0x00400000, "POWER3 (630)", CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE | - CPU_FTR_DABR | CPU_FTR_IABR, + CPU_FTR_IABR | CPU_FTR_PMC8, COMMON_USER_PPC64, 128, 128, __setup_cpu_power3, @@ -57,7 +57,7 @@ struct cpu_spec cpu_specs[] = { { /* Power3+ */ 0xffff0000, 0x00410000, "POWER3 (630+)", CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE | - CPU_FTR_DABR | CPU_FTR_IABR, + CPU_FTR_IABR | CPU_FTR_PMC8, COMMON_USER_PPC64, 128, 128, __setup_cpu_power3, @@ -66,7 +66,7 @@ struct cpu_spec cpu_specs[] = { { /* Northstar */ 0xffff0000, 0x00330000, "RS64-II (northstar)", CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE | - CPU_FTR_DABR | CPU_FTR_IABR, + CPU_FTR_IABR | CPU_FTR_PMC8 | CPU_FTR_MMCRA, COMMON_USER_PPC64, 128, 128, __setup_cpu_power3, @@ -75,7 +75,7 @@ struct cpu_spec cpu_specs[] = { { /* Pulsar */ 0xffff0000, 0x00340000, "RS64-III (pulsar)", CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE | - CPU_FTR_DABR | CPU_FTR_IABR, + CPU_FTR_IABR | CPU_FTR_PMC8 | CPU_FTR_MMCRA, COMMON_USER_PPC64, 128, 128, __setup_cpu_power3, @@ -84,7 +84,7 @@ struct cpu_spec cpu_specs[] = { { /* I-star */ 0xffff0000, 0x00360000, "RS64-III (icestar)", CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE | - CPU_FTR_DABR | CPU_FTR_IABR, + CPU_FTR_IABR | CPU_FTR_PMC8 | CPU_FTR_MMCRA, COMMON_USER_PPC64, 128, 128, __setup_cpu_power3, @@ -93,7 +93,7 @@ struct cpu_spec cpu_specs[] = { { /* S-star */ 
0xffff0000, 0x00370000, "RS64-IV (sstar)", CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE | - CPU_FTR_DABR | CPU_FTR_IABR, + CPU_FTR_IABR | CPU_FTR_PMC8 | CPU_FTR_MMCRA, COMMON_USER_PPC64, 128, 128, __setup_cpu_power3, @@ -102,7 +102,7 @@ struct cpu_spec cpu_specs[] = { { /* Power4 */ 0xffff0000, 0x00350000, "POWER4 (gp)", CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE | - CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_DABR, + CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_PMC8 | CPU_FTR_MMCRA, COMMON_USER_PPC64, 128, 128, __setup_cpu_power4, @@ -111,7 +111,7 @@ struct cpu_spec cpu_specs[] = { { /* Power4+ */ 0xffff0000, 0x00380000, "POWER4+ (gq)", CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE | - CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_DABR, + CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_PMC8 | CPU_FTR_MMCRA, COMMON_USER_PPC64, 128, 128, __setup_cpu_power4, @@ -120,7 +120,8 @@ struct cpu_spec cpu_specs[] = { { /* PPC970 */ 0xffff0000, 0x00390000, "PPC970", CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE | - CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_ALTIVEC_COMP | CPU_FTR_CAN_NAP, + CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_ALTIVEC_COMP | + CPU_FTR_CAN_NAP | CPU_FTR_PMC8 | CPU_FTR_MMCRA, COMMON_USER_PPC64 | PPC_FEATURE_HAS_ALTIVEC_COMP, 128, 128, __setup_cpu_ppc970, @@ -129,7 +130,8 @@ struct cpu_spec cpu_specs[] = { { /* PPC970FX */ 0xffff0000, 0x003c0000, "PPC970FX", CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE | - CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_ALTIVEC_COMP | CPU_FTR_CAN_NAP, + CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_ALTIVEC_COMP | + CPU_FTR_CAN_NAP | CPU_FTR_PMC8 | CPU_FTR_MMCRA, COMMON_USER_PPC64 | PPC_FEATURE_HAS_ALTIVEC_COMP, 128, 128, __setup_cpu_ppc970, @@ -138,7 +140,8 @@ struct cpu_spec cpu_specs[] = { { /* Power5 */ 0xffff0000, 0x003a0000, "POWER5 (gr)", CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE | - CPU_FTR_PPCAS_ARCH_V2, + CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_MMCRA | CPU_FTR_SMT | + CPU_FTR_COHERENT_ICACHE | CPU_FTR_LOCKLESS_TLBIE, 
COMMON_USER_PPC64, 128, 128, __setup_cpu_power4, @@ -147,7 +150,8 @@ struct cpu_spec cpu_specs[] = { { /* Power5 */ 0xffff0000, 0x003b0000, "POWER5 (gs)", CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE | - CPU_FTR_PPCAS_ARCH_V2, + CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_MMCRA | CPU_FTR_SMT | + CPU_FTR_COHERENT_ICACHE | CPU_FTR_LOCKLESS_TLBIE, COMMON_USER_PPC64, 128, 128, __setup_cpu_power4, @@ -156,7 +160,7 @@ struct cpu_spec cpu_specs[] = { { /* default match */ 0x00000000, 0x00000000, "POWER4 (compatible)", CPU_FTR_SPLIT_ID_CACHE | CPU_FTR_USE_TB | CPU_FTR_HPTE_TABLE | - CPU_FTR_PPCAS_ARCH_V2 | CPU_FTR_DABR, + CPU_FTR_PPCAS_ARCH_V2, COMMON_USER_PPC64, 128, 128, __setup_cpu_power4, diff --git a/arch/ppc64/xmon/xmon.c b/arch/ppc64/xmon/xmon.c index 7695c3ff4962..8bf490b348f1 100644 --- a/arch/ppc64/xmon/xmon.c +++ b/arch/ppc64/xmon/xmon.c @@ -452,7 +452,7 @@ insert_bpts() } } - if ((cur_cpu_spec->cpu_features & CPU_FTR_DABR) && dabr.enabled) + if (dabr.enabled) set_dabr(dabr.address); if ((cur_cpu_spec->cpu_features & CPU_FTR_IABR) && iabr.enabled) set_iabr(iabr.address); @@ -465,8 +465,7 @@ remove_bpts() struct bpt *bp; unsigned instr; - if ((cur_cpu_spec->cpu_features & CPU_FTR_DABR)) - set_dabr(0); + set_dabr(0); if ((cur_cpu_spec->cpu_features & CPU_FTR_IABR)) set_iabr(0); @@ -751,10 +750,6 @@ bpt_cmds(void) cmd = inchar(); switch (cmd) { case 'd': /* bd - hardware data breakpoint */ - if (!(cur_cpu_spec->cpu_features & CPU_FTR_DABR)) { - printf("Not implemented on this cpu\n"); - break; - } mode = 7; cmd = inchar(); if (cmd == 'r') diff --git a/include/asm-ppc64/cputable.h b/include/asm-ppc64/cputable.h index 99c3abfba704..abca635f9f9b 100644 --- a/include/asm-ppc64/cputable.h +++ b/include/asm-ppc64/cputable.h @@ -125,8 +125,12 @@ extern firmware_feature_t firmware_features_table[]; #define CPU_FTR_TLBIEL 0x0000000400000000 #define CPU_FTR_NOEXECUTE 0x0000000800000000 #define CPU_FTR_NODSISRALIGN 0x0000001000000000 -#define CPU_FTR_DABR 
0x0000002000000000 -#define CPU_FTR_IABR 0x0000004000000000 +#define CPU_FTR_IABR 0x0000002000000000 +#define CPU_FTR_MMCRA 0x0000004000000000 +#define CPU_FTR_PMC8 0x0000008000000000 +#define CPU_FTR_SMT 0x0000010000000000 +#define CPU_FTR_COHERENT_ICACHE 0x0000020000000000 +#define CPU_FTR_LOCKLESS_TLBIE 0x0000040000000000 /* Platform firmware features */ #define FW_FTR_ 0x0000000000000001 -- cgit v1.2.3 From c1a86d3b4fb53cd954a18edb6157b58584209c8b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:47:25 -0700 Subject: [PATCH] ppc64: Add some POWER5 specific optimisations From: Anton Blanchard Add some POWER5 specific optimisations: - icache is coherent, no need to explicitly flush - tlbie lock no longer required --- arch/ppc64/kernel/misc.S | 2 +- arch/ppc64/kernel/pSeries_htab.c | 32 ++++++++++++++++++++++---------- arch/ppc64/kernel/pSeries_lpar.c | 11 +++++++---- arch/ppc64/mm/hash_low.S | 2 ++ arch/ppc64/mm/init.c | 10 +++++++++- include/asm-ppc64/cacheflush.h | 10 ++++++++-- 6 files changed, 49 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/arch/ppc64/kernel/misc.S b/arch/ppc64/kernel/misc.S index a9a0a9953f7d..dced77d38ece 100644 --- a/arch/ppc64/kernel/misc.S +++ b/arch/ppc64/kernel/misc.S @@ -132,7 +132,7 @@ _GLOBAL(flush_instruction_cache) * flush all bytes from start through stop-1 inclusive */ -_GLOBAL(flush_icache_range) +_GLOBAL(__flush_icache_range) /* * Flush the data cache to memory diff --git a/arch/ppc64/kernel/pSeries_htab.c b/arch/ppc64/kernel/pSeries_htab.c index dbe18cf5715d..8f556f3c9df7 100644 --- a/arch/ppc64/kernel/pSeries_htab.c +++ b/arch/ppc64/kernel/pSeries_htab.c @@ -221,9 +221,11 @@ static long pSeries_hpte_updatepp(unsigned long slot, unsigned long newpp, if ((cur_cpu_spec->cpu_features & CPU_FTR_TLBIEL) && !large && local) { tlbiel(va); } else { - spin_lock_irqsave(&pSeries_tlbie_lock, flags); + if (!(cur_cpu_spec->cpu_features & CPU_FTR_LOCKLESS_TLBIE)) + 
spin_lock_irqsave(&pSeries_tlbie_lock, flags); tlbie(va, large); - spin_unlock_irqrestore(&pSeries_tlbie_lock, flags); + if (!(cur_cpu_spec->cpu_features & CPU_FTR_LOCKLESS_TLBIE)) + spin_unlock_irqrestore(&pSeries_tlbie_lock, flags); } return ret; @@ -255,9 +257,11 @@ static void pSeries_hpte_updateboltedpp(unsigned long newpp, unsigned long ea) set_pp_bit(newpp, hptep); /* Ensure it is out of the tlb too */ - spin_lock_irqsave(&pSeries_tlbie_lock, flags); + if (!(cur_cpu_spec->cpu_features & CPU_FTR_LOCKLESS_TLBIE)) + spin_lock_irqsave(&pSeries_tlbie_lock, flags); tlbie(va, 0); - spin_unlock_irqrestore(&pSeries_tlbie_lock, flags); + if (!(cur_cpu_spec->cpu_features & CPU_FTR_LOCKLESS_TLBIE)) + spin_unlock_irqrestore(&pSeries_tlbie_lock, flags); } static void pSeries_hpte_invalidate(unsigned long slot, unsigned long va, @@ -287,9 +291,11 @@ static void pSeries_hpte_invalidate(unsigned long slot, unsigned long va, if ((cur_cpu_spec->cpu_features & CPU_FTR_TLBIEL) && !large && local) { tlbiel(va); } else { - spin_lock_irqsave(&pSeries_tlbie_lock, flags); + if (!(cur_cpu_spec->cpu_features & CPU_FTR_LOCKLESS_TLBIE)) + spin_lock_irqsave(&pSeries_tlbie_lock, flags); tlbie(va, large); - spin_unlock_irqrestore(&pSeries_tlbie_lock, flags); + if (!(cur_cpu_spec->cpu_features & CPU_FTR_LOCKLESS_TLBIE)) + spin_unlock_irqrestore(&pSeries_tlbie_lock, flags); } } @@ -356,7 +362,8 @@ static void pSeries_flush_hash_range(unsigned long context, asm volatile("ptesync":::"memory"); } else { /* XXX double check that it is safe to take this late */ - spin_lock_irqsave(&pSeries_tlbie_lock, flags); + if (!(cur_cpu_spec->cpu_features & CPU_FTR_LOCKLESS_TLBIE)) + spin_lock_irqsave(&pSeries_tlbie_lock, flags); asm volatile("ptesync":::"memory"); @@ -365,7 +372,8 @@ static void pSeries_flush_hash_range(unsigned long context, asm volatile("eieio; tlbsync; ptesync":::"memory"); - spin_unlock_irqrestore(&pSeries_tlbie_lock, flags); + if (!(cur_cpu_spec->cpu_features & CPU_FTR_LOCKLESS_TLBIE)) 
+ spin_unlock_irqrestore(&pSeries_tlbie_lock, flags); } } @@ -384,8 +392,12 @@ void hpte_init_pSeries(void) root = of_find_node_by_path("/"); if (root) { model = get_property(root, "model", NULL); - if (strcmp(model, "CHRP IBM,9076-N81")) - ppc_md.flush_hash_range = pSeries_flush_hash_range; + if (!strcmp(model, "CHRP IBM,9076-N81")) { + of_node_put(root); + return; + } of_node_put(root); } + + ppc_md.flush_hash_range = pSeries_flush_hash_range; } diff --git a/arch/ppc64/kernel/pSeries_lpar.c b/arch/ppc64/kernel/pSeries_lpar.c index d1a28982f378..12b0fb86acad 100644 --- a/arch/ppc64/kernel/pSeries_lpar.c +++ b/arch/ppc64/kernel/pSeries_lpar.c @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -30,13 +31,13 @@ #include #include #include -#include #include #include #include #include #include #include +#include /* in pSeries_hvCall.S */ EXPORT_SYMBOL(plpar_hcall); @@ -146,7 +147,7 @@ static void tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, long npage (u64)tcenum << 12, tce.te_word ); - if(rc && printk_ratelimit()) { + if (rc && printk_ratelimit()) { printk("tce_build_pSeriesLP: plpar_tce_put failed. 
rc=%ld\n", rc); printk("\tindex = 0x%lx\n", (u64)tbl->it_index); printk("\ttcenum = 0x%lx\n", (u64)tcenum); @@ -559,12 +560,14 @@ void pSeries_lpar_flush_hash_range(unsigned long context, unsigned long number, unsigned long flags; struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch); - spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags); + if (!(cur_cpu_spec->cpu_features & CPU_FTR_LOCKLESS_TLBIE)) + spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags); for (i = 0; i < number; i++) flush_hash_page(context, batch->addr[i], batch->pte[i], local); - spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags); + if (!(cur_cpu_spec->cpu_features & CPU_FTR_LOCKLESS_TLBIE)) + spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags); } void pSeries_lpar_mm_init(void) diff --git a/arch/ppc64/mm/hash_low.S b/arch/ppc64/mm/hash_low.S index 94e187a8bc40..0d6b5c29b645 100644 --- a/arch/ppc64/mm/hash_low.S +++ b/arch/ppc64/mm/hash_low.S @@ -125,11 +125,13 @@ _GLOBAL(__hash_page) /* We eventually do the icache sync here (maybe inline that * code rather than call a C function...) 
*/ +BEGIN_FTR_SECTION BEGIN_FTR_SECTION mr r4,r30 mr r5,r7 bl .hash_page_do_lazy_icache END_FTR_SECTION_IFSET(CPU_FTR_NOEXECUTE) +END_FTR_SECTION_IFCLR(CPU_FTR_COHERENT_ICACHE) /* At this point, r3 contains new PP bits, save them in * place of "access" in the param area (sic) diff --git a/arch/ppc64/mm/init.c b/arch/ppc64/mm/init.c index 61708dc2dd50..a62225a645d1 100644 --- a/arch/ppc64/mm/init.c +++ b/arch/ppc64/mm/init.c @@ -696,6 +696,8 @@ void __init mem_init(void) */ void flush_dcache_page(struct page *page) { + if (cur_cpu_spec->cpu_features & CPU_FTR_COHERENT_ICACHE) + return; /* avoid an atomic op if possible */ if (test_bit(PG_arch_1, &page->flags)) clear_bit(PG_arch_1, &page->flags); @@ -705,6 +707,8 @@ void clear_user_page(void *page, unsigned long vaddr, struct page *pg) { clear_page(page); + if (cur_cpu_spec->cpu_features & CPU_FTR_COHERENT_ICACHE) + return; /* * We shouldnt have to do this, but some versions of glibc * require it (ld.so assumes zero filled pages are icache clean) @@ -736,6 +740,9 @@ void copy_user_page(void *vto, void *vfrom, unsigned long vaddr, return; #endif + if (cur_cpu_spec->cpu_features & CPU_FTR_COHERENT_ICACHE) + return; + /* avoid an atomic op if possible */ if (test_bit(PG_arch_1, &pg->flags)) clear_bit(PG_arch_1, &pg->flags); @@ -768,7 +775,8 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long ea, cpumask_t tmp; /* handle i-cache coherency */ - if (!(cur_cpu_spec->cpu_features & CPU_FTR_NOEXECUTE)) { + if (!(cur_cpu_spec->cpu_features & CPU_FTR_COHERENT_ICACHE) && + !(cur_cpu_spec->cpu_features & CPU_FTR_NOEXECUTE)) { unsigned long pfn = pte_pfn(pte); if (pfn_valid(pfn)) { struct page *page = pfn_to_page(pfn); diff --git a/include/asm-ppc64/cacheflush.h b/include/asm-ppc64/cacheflush.h index 7d958ac381b0..d6f24f9a1fed 100644 --- a/include/asm-ppc64/cacheflush.h +++ b/include/asm-ppc64/cacheflush.h @@ -1,8 +1,8 @@ #ifndef _PPC64_CACHEFLUSH_H #define _PPC64_CACHEFLUSH_H -/* Keep includes the same across 
arches. */ #include +#include /* * No cache flushing is required when address mappings are @@ -18,7 +18,7 @@ #define flush_cache_vunmap(start, end) do { } while (0) extern void flush_dcache_page(struct page *page); -extern void flush_icache_range(unsigned long, unsigned long); +extern void __flush_icache_range(unsigned long, unsigned long); extern void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len); @@ -35,4 +35,10 @@ do { memcpy(dst, src, len); \ extern void __flush_dcache_icache(void *page_va); +static inline void flush_icache_range(unsigned long start, unsigned long stop) +{ + if (!(cur_cpu_spec->cpu_features & CPU_FTR_COHERENT_ICACHE)) + __flush_icache_range(start, stop); +} + #endif /* _PPC64_CACHEFLUSH_H */ -- cgit v1.2.3 From 12c9ae0de28d9fef2766c4ae5a1f01b7ab6aca20 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:47:39 -0700 Subject: [PATCH] ppc64: Add PMCs to sysfs From: Anton Blanchard Add PMCs to sysfs. 
--- arch/ppc64/kernel/sysfs.c | 108 ++++++++++++++++++++++++++++++++++++++++++ include/asm-ppc64/processor.h | 29 +++++++----- 2 files changed, 125 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/arch/ppc64/kernel/sysfs.c b/arch/ppc64/kernel/sysfs.c index 7699817c3fc4..b168d4a52864 100644 --- a/arch/ppc64/kernel/sysfs.c +++ b/arch/ppc64/kernel/sysfs.c @@ -4,7 +4,113 @@ #include #include #include +#include +#include #include +#include +#include + +/* PMC stuff */ + +/* XXX convert to rusty's on_one_cpu */ +static unsigned long run_on_cpu(unsigned long cpu, + unsigned long (*func)(unsigned long), + unsigned long arg) +{ + cpumask_t old_affinity = current->cpus_allowed; + unsigned long ret; + + /* should return -EINVAL to userspace */ + if (set_cpus_allowed(current, cpumask_of_cpu(cpu))) + return 0; + + ret = func(arg); + + set_cpus_allowed(current, old_affinity); + + return ret; +} + +#define SYSFS_PMCSETUP(NAME, ADDRESS) \ +static unsigned long read_##NAME(unsigned long junk) \ +{ \ + return mfspr(ADDRESS); \ +} \ +static unsigned long write_##NAME(unsigned long val) \ +{ \ + mtspr(ADDRESS, val); \ + return 0; \ +} \ +static ssize_t show_##NAME(struct sys_device *dev, char *buf) \ +{ \ + struct cpu *cpu = container_of(dev, struct cpu, sysdev); \ + unsigned long val = run_on_cpu(cpu->sysdev.id, read_##NAME, 0); \ + return sprintf(buf, "%lx\n", val); \ +} \ +static ssize_t store_##NAME(struct sys_device *dev, const char *buf, \ + size_t count) \ +{ \ + struct cpu *cpu = container_of(dev, struct cpu, sysdev); \ + unsigned long val; \ + int ret = sscanf(buf, "%lx", &val); \ + if (ret != 1) \ + return -EINVAL; \ + run_on_cpu(cpu->sysdev.id, write_##NAME, val); \ + return count; \ +} + +SYSFS_PMCSETUP(mmcr0, SPRN_MMCR0); +SYSFS_PMCSETUP(mmcr1, SPRN_MMCR1); +SYSFS_PMCSETUP(mmcra, SPRN_MMCRA); +SYSFS_PMCSETUP(pmc1, SPRN_PMC1); +SYSFS_PMCSETUP(pmc2, SPRN_PMC2); +SYSFS_PMCSETUP(pmc3, SPRN_PMC3); +SYSFS_PMCSETUP(pmc4, SPRN_PMC4); +SYSFS_PMCSETUP(pmc5, 
SPRN_PMC5); +SYSFS_PMCSETUP(pmc6, SPRN_PMC6); +SYSFS_PMCSETUP(pmc7, SPRN_PMC7); +SYSFS_PMCSETUP(pmc8, SPRN_PMC8); +SYSFS_PMCSETUP(purr, SPRN_PURR); + +static SYSDEV_ATTR(mmcr0, 0600, show_mmcr0, store_mmcr0); +static SYSDEV_ATTR(mmcr1, 0600, show_mmcr1, store_mmcr1); +static SYSDEV_ATTR(mmcra, 0600, show_mmcra, store_mmcra); +static SYSDEV_ATTR(pmc1, 0600, show_pmc1, store_pmc1); +static SYSDEV_ATTR(pmc2, 0600, show_pmc2, store_pmc2); +static SYSDEV_ATTR(pmc3, 0600, show_pmc3, store_pmc3); +static SYSDEV_ATTR(pmc4, 0600, show_pmc4, store_pmc4); +static SYSDEV_ATTR(pmc5, 0600, show_pmc5, store_pmc5); +static SYSDEV_ATTR(pmc6, 0600, show_pmc6, store_pmc6); +static SYSDEV_ATTR(pmc7, 0600, show_pmc7, store_pmc7); +static SYSDEV_ATTR(pmc8, 0600, show_pmc8, store_pmc8); +static SYSDEV_ATTR(purr, 0600, show_purr, NULL); + +static void __init register_cpu_pmc(struct sys_device *s) +{ + sysdev_create_file(s, &attr_mmcr0); + sysdev_create_file(s, &attr_mmcr1); + + if (cur_cpu_spec->cpu_features & CPU_FTR_MMCRA) + sysdev_create_file(s, &attr_mmcra); + + sysdev_create_file(s, &attr_pmc1); + sysdev_create_file(s, &attr_pmc2); + sysdev_create_file(s, &attr_pmc3); + sysdev_create_file(s, &attr_pmc4); + sysdev_create_file(s, &attr_pmc5); + sysdev_create_file(s, &attr_pmc6); + + if (cur_cpu_spec->cpu_features & CPU_FTR_PMC8) { + sysdev_create_file(s, &attr_pmc7); + sysdev_create_file(s, &attr_pmc8); + } + + if (cur_cpu_spec->cpu_features & CPU_FTR_SMT) + sysdev_create_file(s, &attr_purr); +} + + +/* NUMA stuff */ #ifdef CONFIG_NUMA static struct node node_devices[MAX_NUMNODES]; @@ -60,6 +166,8 @@ static int __init topology_init(void) #endif register_cpu(c, cpu, parent); + register_cpu_pmc(&c->sysdev); + sysdev_create_file(&c->sysdev, &attr_physical_id); } diff --git a/include/asm-ppc64/processor.h b/include/asm-ppc64/processor.h index b8c7d26b947b..ea8bf67f7007 100644 --- a/include/asm-ppc64/processor.h +++ b/include/asm-ppc64/processor.h @@ -235,8 +235,6 @@ #define SPRN_IMMR 0x27E 
/* Internal Memory Map Register */ #define SPRN_L2CR 0x3F9 /* Level 2 Cache Control Regsiter */ #define SPRN_LR 0x008 /* Link Register */ -#define SPRN_MMCR0 0x3B8 /* Monitor Mode Control Register 0 */ -#define SPRN_MMCR1 0x3BC /* Monitor Mode Control Register 1 */ #define SPRN_PBL1 0x3FC /* Protection Bound Lower 1 */ #define SPRN_PBL2 0x3FE /* Protection Bound Lower 2 */ #define SPRN_PBU1 0x3FD /* Protection Bound Upper 1 */ @@ -244,10 +242,7 @@ #define SPRN_PID 0x3B1 /* Process ID */ #define SPRN_PIR 0x3FF /* Processor Identification Register */ #define SPRN_PIT 0x3DB /* Programmable Interval Timer */ -#define SPRN_PMC1 0x3B9 /* Performance Counter Register 1 */ -#define SPRN_PMC2 0x3BA /* Performance Counter Register 2 */ -#define SPRN_PMC3 0x3BD /* Performance Counter Register 3 */ -#define SPRN_PMC4 0x3BE /* Performance Counter Register 4 */ +#define SPRN_PURR 0x135 /* Processor Utilization of Resources Register */ #define SPRN_PVR 0x11F /* Processor Version Register */ #define SPRN_RPA 0x3D6 /* Required Physical Address Register */ #define SPRN_SDA 0x3BF /* Sampled Data Address Register */ @@ -307,17 +302,26 @@ #define WRS_SYSTEM 3 /* WDT forced system reset */ #define TSR_PIS 0x08000000 /* PIT Interrupt Status */ #define TSR_FIS 0x04000000 /* FIT Interrupt Status */ -#define SPRN_UMMCR0 0x3A8 /* User Monitor Mode Control Register 0 */ -#define SPRN_UMMCR1 0x3AC /* User Monitor Mode Control Register 0 */ -#define SPRN_UPMC1 0x3A9 /* User Performance Counter Register 1 */ -#define SPRN_UPMC2 0x3AA /* User Performance Counter Register 2 */ -#define SPRN_UPMC3 0x3AD /* User Performance Counter Register 3 */ -#define SPRN_UPMC4 0x3AE /* User Performance Counter Register 4 */ #define SPRN_USIA 0x3AB /* User Sampled Instruction Address Register */ #define SPRN_XER 0x001 /* Fixed Point Exception Register */ #define SPRN_ZPR 0x3B0 /* Zone Protection Register */ #define SPRN_VRSAVE 0x100 /* Vector save */ +/* Performance monitor SPRs */ +#define SPRN_SIAR 780 
+#define SPRN_SDAR 781 +#define SPRN_MMCRA 786 +#define SPRN_PMC1 787 +#define SPRN_PMC2 788 +#define SPRN_PMC3 789 +#define SPRN_PMC4 790 +#define SPRN_PMC5 791 +#define SPRN_PMC6 792 +#define SPRN_PMC7 793 +#define SPRN_PMC8 794 +#define SPRN_MMCR0 795 +#define SPRN_MMCR1 798 + /* Short-hand versions for a number of the above SPRNs */ #define CTR SPRN_CTR /* Counter Register */ @@ -343,6 +347,7 @@ #define __LR SPRN_LR #define PVR SPRN_PVR /* Processor Version */ #define PIR SPRN_PIR /* Processor ID */ +#define PURR SPRN_PURR /* Processor Utilization of Resource Register */ #define RPA SPRN_RPA /* Required Physical Address Register */ #define SDR1 SPRN_SDR1 /* MMU hash base register */ #define SPR0 SPRN_SPRG0 /* Supervisor Private Registers */ -- cgit v1.2.3 From f4421b9c28e02a7260d6896b25fc4ac4f158baf0 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:48:25 -0700 Subject: [PATCH] ppc64: Use enum dma_data_direction for the vio DMA api routines. From: Stephen Rothwell This patch uses enum dma_data_direction for the vio DMA api routines. This allows us to remove some includes of linux/pci.h. Also missed some pci_dma_mapping_error uses. 
--- arch/ppc64/kernel/dma.c | 16 ++++++++-------- arch/ppc64/kernel/vio.c | 23 +++++++++++------------ drivers/net/ibmveth.c | 25 ++++++++++++------------- include/asm-ppc64/vio.h | 21 ++++++++++----------- 4 files changed, 41 insertions(+), 44 deletions(-) (limited to 'include') diff --git a/arch/ppc64/kernel/dma.c b/arch/ppc64/kernel/dma.c index eb6f7996c7fe..26839a571415 100644 --- a/arch/ppc64/kernel/dma.c +++ b/arch/ppc64/kernel/dma.c @@ -77,7 +77,7 @@ dma_addr_t dma_map_single(struct device *dev, void *cpu_addr, size_t size, return pci_map_single(to_pci_dev(dev), cpu_addr, size, (int)direction); #ifdef CONFIG_PPC_PSERIES if (dev->bus == &vio_bus_type) - return vio_map_single(to_vio_dev(dev), cpu_addr, size, (int)direction); + return vio_map_single(to_vio_dev(dev), cpu_addr, size, direction); #endif BUG(); return (dma_addr_t)0; @@ -91,7 +91,7 @@ void dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, pci_unmap_single(to_pci_dev(dev), dma_addr, size, (int)direction); #ifdef CONFIG_PPC_PSERIES else if (dev->bus == &vio_bus_type) - vio_unmap_single(to_vio_dev(dev), dma_addr, size, (int)direction); + vio_unmap_single(to_vio_dev(dev), dma_addr, size, direction); #endif else BUG(); @@ -106,7 +106,7 @@ dma_addr_t dma_map_page(struct device *dev, struct page *page, return pci_map_page(to_pci_dev(dev), page, offset, size, (int)direction); #ifdef CONFIG_PPC_PSERIES if (dev->bus == &vio_bus_type) - return vio_map_page(to_vio_dev(dev), page, offset, size, (int)direction); + return vio_map_page(to_vio_dev(dev), page, offset, size, direction); #endif BUG(); return (dma_addr_t)0; @@ -120,7 +120,7 @@ void dma_unmap_page(struct device *dev, dma_addr_t dma_address, size_t size, pci_unmap_page(to_pci_dev(dev), dma_address, size, (int)direction); #ifdef CONFIG_PPC_PSERIES else if (dev->bus == &vio_bus_type) - vio_unmap_page(to_vio_dev(dev), dma_address, size, (int)direction); + vio_unmap_page(to_vio_dev(dev), dma_address, size, direction); #endif else BUG(); 
@@ -134,7 +134,7 @@ int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, return pci_map_sg(to_pci_dev(dev), sg, nents, (int)direction); #ifdef CONFIG_PPC_PSERIES if (dev->bus == &vio_bus_type) - return vio_map_sg(to_vio_dev(dev), sg, nents, (int)direction); + return vio_map_sg(to_vio_dev(dev), sg, nents, direction); #endif BUG(); return 0; @@ -148,7 +148,7 @@ void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries, pci_unmap_sg(to_pci_dev(dev), sg, nhwentries, (int)direction); #ifdef CONFIG_PPC_PSERIES else if (dev->bus == &vio_bus_type) - vio_unmap_sg(to_vio_dev(dev), sg, nhwentries, (int)direction); + vio_unmap_sg(to_vio_dev(dev), sg, nhwentries, direction); #endif else BUG(); @@ -162,7 +162,7 @@ void dma_sync_single(struct device *dev, dma_addr_t dma_handle, size_t size, pci_dma_sync_single(to_pci_dev(dev), dma_handle, size, (int)direction); #ifdef CONFIG_PPC_PSERIES else if (dev->bus == &vio_bus_type) - vio_dma_sync_single(to_vio_dev(dev), dma_handle, size, (int)direction); + vio_dma_sync_single(to_vio_dev(dev), dma_handle, size, direction); #endif else BUG(); @@ -176,7 +176,7 @@ void dma_sync_sg(struct device *dev, struct scatterlist *sg, int nelems, pci_dma_sync_sg(to_pci_dev(dev), sg, nelems, (int)direction); #ifdef CONFIG_PPC_PSERIES else if (dev->bus == &vio_bus_type) - vio_dma_sync_sg(to_vio_dev(dev), sg, nelems, (int)direction); + vio_dma_sync_sg(to_vio_dev(dev), sg, nelems, direction); #endif else BUG(); diff --git a/arch/ppc64/kernel/vio.c b/arch/ppc64/kernel/vio.c index fba53a437e60..054027f898fb 100644 --- a/arch/ppc64/kernel/vio.c +++ b/arch/ppc64/kernel/vio.c @@ -14,7 +14,6 @@ #include #include -#include #include #include #include @@ -416,14 +415,14 @@ EXPORT_SYMBOL(vio_disable_interrupts); dma_addr_t vio_map_single(struct vio_dev *dev, void *vaddr, - size_t size, int direction ) + size_t size, enum dma_data_direction direction) { struct iommu_table *tbl; dma_addr_t dma_handle = DMA_ERROR_CODE; unsigned long 
uaddr; unsigned int npages; - BUG_ON(direction == PCI_DMA_NONE); + BUG_ON(direction == DMA_NONE); uaddr = (unsigned long)vaddr; npages = PAGE_ALIGN( uaddr + size ) - ( uaddr & PAGE_MASK ); @@ -432,7 +431,7 @@ dma_addr_t vio_map_single(struct vio_dev *dev, void *vaddr, tbl = dev->iommu_table; if (tbl) { - dma_handle = iommu_alloc(tbl, vaddr, npages, direction); + dma_handle = iommu_alloc(tbl, vaddr, npages, (int)direction); dma_handle |= (uaddr & ~PAGE_MASK); } @@ -441,12 +440,12 @@ dma_addr_t vio_map_single(struct vio_dev *dev, void *vaddr, EXPORT_SYMBOL(vio_map_single); void vio_unmap_single(struct vio_dev *dev, dma_addr_t dma_handle, - size_t size, int direction) + size_t size, enum dma_data_direction direction) { struct iommu_table * tbl; unsigned int npages; - BUG_ON(direction == PCI_DMA_NONE); + BUG_ON(direction == DMA_NONE); npages = PAGE_ALIGN( dma_handle + size ) - ( dma_handle & PAGE_MASK ); npages >>= PAGE_SHIFT; @@ -458,11 +457,11 @@ void vio_unmap_single(struct vio_dev *dev, dma_addr_t dma_handle, EXPORT_SYMBOL(vio_unmap_single); int vio_map_sg(struct vio_dev *vdev, struct scatterlist *sglist, int nelems, - int direction) + enum dma_data_direction direction) { struct iommu_table *tbl; - BUG_ON(direction == PCI_DMA_NONE); + BUG_ON(direction == DMA_NONE); if (nelems == 0) return 0; @@ -471,16 +470,16 @@ int vio_map_sg(struct vio_dev *vdev, struct scatterlist *sglist, int nelems, if (!tbl) return 0; - return iommu_alloc_sg(tbl, &vdev->dev, sglist, nelems, direction); + return iommu_alloc_sg(tbl, &vdev->dev, sglist, nelems, (int)direction); } EXPORT_SYMBOL(vio_map_sg); void vio_unmap_sg(struct vio_dev *vdev, struct scatterlist *sglist, int nelems, - int direction) + enum dma_data_direction direction) { struct iommu_table *tbl; - BUG_ON(direction == PCI_DMA_NONE); + BUG_ON(direction == DMA_NONE); tbl = vdev->iommu_table; if (tbl) @@ -516,7 +515,7 @@ void *vio_alloc_consistent(struct vio_dev *dev, size_t size, /* Page allocation succeeded */ memset(ret, 0, 
npages << PAGE_SHIFT); /* Set up tces to cover the allocated range */ - tce = iommu_alloc(tbl, ret, npages, PCI_DMA_BIDIRECTIONAL); + tce = iommu_alloc(tbl, ret, npages, (int)DMA_BIDIRECTIONAL); if (tce == DMA_ERROR_CODE) { PPCDBG(PPCDBG_TCE, "vio_alloc_consistent: iommu_alloc failed\n" ); free_pages((unsigned long)ret, order); diff --git a/drivers/net/ibmveth.c b/drivers/net/ibmveth.c index c3f4944e724e..6427a25c2719 100644 --- a/drivers/net/ibmveth.c +++ b/drivers/net/ibmveth.c @@ -39,7 +39,6 @@ #include #include #include -#include #include #include #include @@ -218,7 +217,7 @@ static void ibmveth_replenish_buffer_pool(struct ibmveth_adapter *adapter, struc ibmveth_assert(index != 0xffff); ibmveth_assert(pool->skbuff[index] == NULL); - dma_addr = vio_map_single(adapter->vdev, skb->data, pool->buff_size, PCI_DMA_FROMDEVICE); + dma_addr = vio_map_single(adapter->vdev, skb->data, pool->buff_size, DMA_FROM_DEVICE); pool->dma_addr[index] = dma_addr; pool->skbuff[index] = skb; @@ -236,7 +235,7 @@ static void ibmveth_replenish_buffer_pool(struct ibmveth_adapter *adapter, struc if(lpar_rc != H_Success) { pool->skbuff[index] = NULL; pool->consumer_index--; - vio_unmap_single(adapter->vdev, pool->dma_addr[index], pool->buff_size, PCI_DMA_FROMDEVICE); + vio_unmap_single(adapter->vdev, pool->dma_addr[index], pool->buff_size, DMA_FROM_DEVICE); dev_kfree_skb_any(skb); adapter->replenish_add_buff_failure++; break; @@ -300,7 +299,7 @@ static void ibmveth_free_buffer_pool(struct ibmveth_adapter *adapter, struct ibm vio_unmap_single(adapter->vdev, pool->dma_addr[i], pool->buff_size, - PCI_DMA_FROMDEVICE); + DMA_FROM_DEVICE); dev_kfree_skb_any(skb); pool->skbuff[i] = NULL; } @@ -338,7 +337,7 @@ static void ibmveth_remove_buffer_from_pool(struct ibmveth_adapter *adapter, u64 vio_unmap_single(adapter->vdev, adapter->rx_buff_pool[pool].dma_addr[index], adapter->rx_buff_pool[pool].buff_size, - PCI_DMA_FROMDEVICE); + DMA_FROM_DEVICE); free_index = 
adapter->rx_buff_pool[pool].producer_index++ % adapter->rx_buff_pool[pool].size; adapter->rx_buff_pool[pool].free_map[free_index] = index; @@ -406,7 +405,7 @@ static void ibmveth_cleanup(struct ibmveth_adapter *adapter) { if(adapter->buffer_list_addr != NULL) { if(!vio_dma_mapping_error(adapter->buffer_list_dma)) { - vio_unmap_single(adapter->vdev, adapter->buffer_list_dma, 4096, PCI_DMA_BIDIRECTIONAL); + vio_unmap_single(adapter->vdev, adapter->buffer_list_dma, 4096, DMA_BIDIRECTIONAL); adapter->buffer_list_dma = DMA_ERROR_CODE; } free_page((unsigned long)adapter->buffer_list_addr); @@ -415,7 +414,7 @@ static void ibmveth_cleanup(struct ibmveth_adapter *adapter) if(adapter->filter_list_addr != NULL) { if(!vio_dma_mapping_error(adapter->filter_list_dma)) { - vio_unmap_single(adapter->vdev, adapter->filter_list_dma, 4096, PCI_DMA_BIDIRECTIONAL); + vio_unmap_single(adapter->vdev, adapter->filter_list_dma, 4096, DMA_BIDIRECTIONAL); adapter->filter_list_dma = DMA_ERROR_CODE; } free_page((unsigned long)adapter->filter_list_addr); @@ -424,7 +423,7 @@ static void ibmveth_cleanup(struct ibmveth_adapter *adapter) if(adapter->rx_queue.queue_addr != NULL) { if(!vio_dma_mapping_error(adapter->rx_queue.queue_dma)) { - vio_unmap_single(adapter->vdev, adapter->rx_queue.queue_dma, adapter->rx_queue.queue_len, PCI_DMA_BIDIRECTIONAL); + vio_unmap_single(adapter->vdev, adapter->rx_queue.queue_dma, adapter->rx_queue.queue_len, DMA_BIDIRECTIONAL); adapter->rx_queue.queue_dma = DMA_ERROR_CODE; } kfree(adapter->rx_queue.queue_addr); @@ -470,9 +469,9 @@ static int ibmveth_open(struct net_device *netdev) return -ENOMEM; } - adapter->buffer_list_dma = vio_map_single(adapter->vdev, adapter->buffer_list_addr, 4096, PCI_DMA_BIDIRECTIONAL); - adapter->filter_list_dma = vio_map_single(adapter->vdev, adapter->filter_list_addr, 4096, PCI_DMA_BIDIRECTIONAL); - adapter->rx_queue.queue_dma = vio_map_single(adapter->vdev, adapter->rx_queue.queue_addr, adapter->rx_queue.queue_len, 
PCI_DMA_BIDIRECTIONAL); + adapter->buffer_list_dma = vio_map_single(adapter->vdev, adapter->buffer_list_addr, 4096, DMA_BIDIRECTIONAL); + adapter->filter_list_dma = vio_map_single(adapter->vdev, adapter->filter_list_addr, 4096, DMA_BIDIRECTIONAL); + adapter->rx_queue.queue_dma = vio_map_single(adapter->vdev, adapter->rx_queue.queue_addr, adapter->rx_queue.queue_len, DMA_BIDIRECTIONAL); if((vio_dma_mapping_error(adapter->buffer_list_dma) ) || (vio_dma_mapping_error(adapter->filter_list_dma)) || @@ -673,7 +672,7 @@ static int ibmveth_start_xmit(struct sk_buff *skb, struct net_device *netdev) vio_unmap_single(adapter->vdev, desc[curfrag+1].fields.address, desc[curfrag+1].fields.length, - PCI_DMA_TODEVICE); + DMA_TO_DEVICE); curfrag++; } dev_kfree_skb(skb); @@ -710,7 +709,7 @@ static int ibmveth_start_xmit(struct sk_buff *skb, struct net_device *netdev) } do { - vio_unmap_single(adapter->vdev, desc[nfrags].fields.address, desc[nfrags].fields.length, PCI_DMA_TODEVICE); + vio_unmap_single(adapter->vdev, desc[nfrags].fields.address, desc[nfrags].fields.length, DMA_TO_DEVICE); } while(--nfrags >= 0); dev_kfree_skb(skb); diff --git a/include/asm-ppc64/vio.h b/include/asm-ppc64/vio.h index d70eeb87dfd9..7fe3a23736e7 100644 --- a/include/asm-ppc64/vio.h +++ b/include/asm-ppc64/vio.h @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -58,13 +57,13 @@ int vio_enable_interrupts(struct vio_dev *dev); int vio_disable_interrupts(struct vio_dev *dev); dma_addr_t vio_map_single(struct vio_dev *dev, void *vaddr, - size_t size, int direction); + size_t size, enum dma_data_direction direction); void vio_unmap_single(struct vio_dev *dev, dma_addr_t dma_handle, - size_t size, int direction); + size_t size, enum dma_data_direction direction); int vio_map_sg(struct vio_dev *vdev, struct scatterlist *sglist, - int nelems, int direction); + int nelems, enum dma_data_direction direction); void vio_unmap_sg(struct vio_dev *vdev, struct scatterlist *sglist, - 
int nelems, int direction); + int nelems, enum dma_data_direction direction); void *vio_alloc_consistent(struct vio_dev *dev, size_t size, dma_addr_t *dma_handle); void vio_free_consistent(struct vio_dev *dev, size_t size, void *vaddr, @@ -81,18 +80,18 @@ static inline int vio_dma_supported(struct vio_dev *hwdev, u64 mask) static inline void vio_dma_sync_single(struct vio_dev *hwdev, - dma_addr_t dma_handle, - size_t size, int direction) + dma_addr_t dma_handle, size_t size, + enum dma_data_direction direction) { - BUG_ON(direction == PCI_DMA_NONE); + BUG_ON(direction == DMA_NONE); /* nothing to do */ } static inline void vio_dma_sync_sg(struct vio_dev *hwdev, - struct scatterlist *sg, - int nelems, int direction) + struct scatterlist *sg, int nelems, + enum dma_data_direction direction) { - BUG_ON(direction == PCI_DMA_NONE); + BUG_ON(direction == DMA_NONE); /* nothing to do */ } static inline int vio_set_dma_mask(struct vio_dev *dev, u64 mask) { return -EIO; } -- cgit v1.2.3 From 9b678c1e2247e6dddf4bf245554bd3c099e456ab Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:48:41 -0700 Subject: [PATCH] ppc64: Use enum dma_data_direction for all APIs From: Stephen Rothwell This is just a cleanup to use enum dma_data_direction for all APIs except the pci_dma_ ones (since they are defined generically). Also make most of the functions in arch/ppc64/kernel/pci_iommu.c static. 
--- arch/ppc64/kernel/iSeries_iommu.c | 7 ++++--- arch/ppc64/kernel/iommu.c | 6 +++--- arch/ppc64/kernel/pSeries_iommu.c | 5 +++-- arch/ppc64/kernel/pSeries_lpar.c | 9 +++++---- arch/ppc64/kernel/pci_dma_direct.c | 9 +++++---- arch/ppc64/kernel/pci_iommu.c | 30 +++++++++++++++--------------- arch/ppc64/kernel/pmac_iommu.c | 3 ++- arch/ppc64/kernel/vio.c | 6 +++--- include/asm-ppc64/iommu.h | 6 ++++-- include/asm-ppc64/machdep.h | 3 ++- include/asm-ppc64/pci-bridge.h | 2 ++ include/asm-ppc64/pci.h | 20 ++++++++++++-------- 12 files changed, 60 insertions(+), 46 deletions(-) (limited to 'include') diff --git a/arch/ppc64/kernel/iSeries_iommu.c b/arch/ppc64/kernel/iSeries_iommu.c index 1922af2dbd43..ea4ef7497193 100644 --- a/arch/ppc64/kernel/iSeries_iommu.c +++ b/arch/ppc64/kernel/iSeries_iommu.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -69,7 +70,7 @@ extern struct list_head iSeries_Global_Device_List; static void tce_build_iSeries(struct iommu_table *tbl, long index, long npages, - unsigned long uaddr, int direction) + unsigned long uaddr, enum dma_data_direction direction) { u64 rc; union tce_entry tce; @@ -82,12 +83,12 @@ static void tce_build_iSeries(struct iommu_table *tbl, long index, long npages, /* Virtual Bus */ tce.te_bits.tb_valid = 1; tce.te_bits.tb_allio = 1; - if (direction != PCI_DMA_TODEVICE) + if (direction != DMA_TO_DEVICE) tce.te_bits.tb_rdwr = 1; } else { /* PCI Bus */ tce.te_bits.tb_rdwr = 1; /* Read allowed */ - if (direction != PCI_DMA_TODEVICE) + if (direction != DMA_TO_DEVICE) tce.te_bits.tb_pciwr = 1; } diff --git a/arch/ppc64/kernel/iommu.c b/arch/ppc64/kernel/iommu.c index e3f032bbbab4..aa6b207cd321 100644 --- a/arch/ppc64/kernel/iommu.c +++ b/arch/ppc64/kernel/iommu.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include @@ -142,7 +141,7 @@ static unsigned long iommu_range_alloc(struct iommu_table *tbl, unsigned long np } dma_addr_t iommu_alloc(struct iommu_table 
*tbl, void *page, - unsigned int npages, int direction) + unsigned int npages, enum dma_data_direction direction) { unsigned long entry, flags; dma_addr_t ret = DMA_ERROR_CODE; @@ -227,7 +226,8 @@ void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, } int iommu_alloc_sg(struct iommu_table *tbl, struct device *dev, - struct scatterlist *sglist, int nelems, int direction) + struct scatterlist *sglist, int nelems, + enum dma_data_direction direction) { dma_addr_t dma_next, dma_addr; unsigned long flags; diff --git a/arch/ppc64/kernel/pSeries_iommu.c b/arch/ppc64/kernel/pSeries_iommu.c index cfa278643ab5..367da0eb5b51 100644 --- a/arch/ppc64/kernel/pSeries_iommu.c +++ b/arch/ppc64/kernel/pSeries_iommu.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -46,7 +47,7 @@ static void tce_build_pSeries(struct iommu_table *tbl, long index, long npages, unsigned long uaddr, - int direction) + enum dma_data_direction direction) { union tce_entry t; union tce_entry *tp; @@ -54,7 +55,7 @@ static void tce_build_pSeries(struct iommu_table *tbl, long index, t.te_word = 0; t.te_rdwr = 1; // Read allowed - if (direction != PCI_DMA_TODEVICE) + if (direction != DMA_TO_DEVICE) t.te_pciwr = 1; tp = ((union tce_entry *)tbl->it_base) + index; diff --git a/arch/ppc64/kernel/pSeries_lpar.c b/arch/ppc64/kernel/pSeries_lpar.c index 12b0fb86acad..15a9eb4ed014 100644 --- a/arch/ppc64/kernel/pSeries_lpar.c +++ b/arch/ppc64/kernel/pSeries_lpar.c @@ -21,7 +21,7 @@ #include #include -#include +#include #include #include #include @@ -130,8 +130,9 @@ long plpar_put_term_char(unsigned long termno, lbuf[1]); } -static void tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages, - unsigned long uaddr, int direction ) +static void tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, + long npages, unsigned long uaddr, + enum dma_data_direction direction) { u64 rc; union tce_entry tce; @@ -139,7 +140,7 @@ static void tce_build_pSeriesLP(struct 
iommu_table *tbl, long tcenum, long npage tce.te_word = 0; tce.te_rpn = (virt_to_abs(uaddr)) >> PAGE_SHIFT; tce.te_rdwr = 1; - if (direction != PCI_DMA_TODEVICE) + if (direction != DMA_TO_DEVICE) tce.te_pciwr = 1; while (npages--) { diff --git a/arch/ppc64/kernel/pci_dma_direct.c b/arch/ppc64/kernel/pci_dma_direct.c index 1cd843237ed9..dc96c0cf2b91 100644 --- a/arch/ppc64/kernel/pci_dma_direct.c +++ b/arch/ppc64/kernel/pci_dma_direct.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -49,18 +50,18 @@ static void pci_direct_free_consistent(struct pci_dev *hwdev, size_t size, } static dma_addr_t pci_direct_map_single(struct pci_dev *hwdev, void *ptr, - size_t size, int direction) + size_t size, enum dma_data_direction direction) { return virt_to_abs(ptr); } static void pci_direct_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_addr, - size_t size, int direction) + size_t size, enum dma_data_direction direction) { } static int pci_direct_map_sg(struct pci_dev *hwdev, struct scatterlist *sg, - int nents, int direction) + int nents, enum dma_data_direction direction) { int i; @@ -73,7 +74,7 @@ static int pci_direct_map_sg(struct pci_dev *hwdev, struct scatterlist *sg, } static void pci_direct_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sg, - int nents, int direction) + int nents, enum dma_data_direction direction) { } diff --git a/arch/ppc64/kernel/pci_iommu.c b/arch/ppc64/kernel/pci_iommu.c index 8d9869173efd..0d4da23394e7 100644 --- a/arch/ppc64/kernel/pci_iommu.c +++ b/arch/ppc64/kernel/pci_iommu.c @@ -66,7 +66,7 @@ static inline struct iommu_table *devnode_table(struct pci_dev *dev) * Returns the virtual address of the buffer and sets dma_handle * to the dma address (mapping) of the first page. 
*/ -void *pci_iommu_alloc_consistent(struct pci_dev *hwdev, size_t size, +static void *pci_iommu_alloc_consistent(struct pci_dev *hwdev, size_t size, dma_addr_t *dma_handle) { struct iommu_table *tbl; @@ -100,7 +100,7 @@ void *pci_iommu_alloc_consistent(struct pci_dev *hwdev, size_t size, memset(ret, 0, size); /* Set up tces to cover the allocated range */ - mapping = iommu_alloc(tbl, ret, npages, PCI_DMA_BIDIRECTIONAL); + mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL); if (mapping == DMA_ERROR_CODE) { free_pages((unsigned long)ret, order); @@ -112,7 +112,7 @@ void *pci_iommu_alloc_consistent(struct pci_dev *hwdev, size_t size, } -void pci_iommu_free_consistent(struct pci_dev *hwdev, size_t size, +static void pci_iommu_free_consistent(struct pci_dev *hwdev, size_t size, void *vaddr, dma_addr_t dma_handle) { struct iommu_table *tbl; @@ -136,15 +136,15 @@ void pci_iommu_free_consistent(struct pci_dev *hwdev, size_t size, * need not be page aligned, the dma_addr_t returned will point to the same * byte within the page as vaddr. 
*/ -dma_addr_t pci_iommu_map_single(struct pci_dev *hwdev, void *vaddr, - size_t size, int direction) +static dma_addr_t pci_iommu_map_single(struct pci_dev *hwdev, void *vaddr, + size_t size, enum dma_data_direction direction) { struct iommu_table * tbl; dma_addr_t dma_handle = DMA_ERROR_CODE; unsigned long uaddr; unsigned int npages; - BUG_ON(direction == PCI_DMA_NONE); + BUG_ON(direction == DMA_NONE); uaddr = (unsigned long)vaddr; npages = PAGE_ALIGN(uaddr + size) - (uaddr & PAGE_MASK); @@ -167,13 +167,13 @@ dma_addr_t pci_iommu_map_single(struct pci_dev *hwdev, void *vaddr, } -void pci_iommu_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_handle, - size_t size, int direction) +static void pci_iommu_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_handle, + size_t size, enum dma_data_direction direction) { struct iommu_table *tbl; unsigned int npages; - BUG_ON(direction == PCI_DMA_NONE); + BUG_ON(direction == DMA_NONE); npages = (PAGE_ALIGN(dma_handle + size) - (dma_handle & PAGE_MASK)) >> PAGE_SHIFT; @@ -185,12 +185,12 @@ void pci_iommu_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_handle, } -int pci_iommu_map_sg(struct pci_dev *pdev, struct scatterlist *sglist, int nelems, - int direction) +static int pci_iommu_map_sg(struct pci_dev *pdev, struct scatterlist *sglist, + int nelems, enum dma_data_direction direction) { struct iommu_table * tbl; - BUG_ON(direction == PCI_DMA_NONE); + BUG_ON(direction == DMA_NONE); if (nelems == 0) return 0; @@ -202,12 +202,12 @@ int pci_iommu_map_sg(struct pci_dev *pdev, struct scatterlist *sglist, int nelem return iommu_alloc_sg(tbl, &pdev->dev, sglist, nelems, direction); } -void pci_iommu_unmap_sg(struct pci_dev *pdev, struct scatterlist *sglist, int nelems, - int direction) +static void pci_iommu_unmap_sg(struct pci_dev *pdev, struct scatterlist *sglist, + int nelems, enum dma_data_direction direction) { struct iommu_table *tbl; - BUG_ON(direction == PCI_DMA_NONE); + BUG_ON(direction == DMA_NONE); tbl = 
devnode_table(pdev); if (!tbl) diff --git a/arch/ppc64/kernel/pmac_iommu.c b/arch/ppc64/kernel/pmac_iommu.c index e04c344c127e..0e91536b73c1 100644 --- a/arch/ppc64/kernel/pmac_iommu.c +++ b/arch/ppc64/kernel/pmac_iommu.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -141,7 +142,7 @@ static void dart_flush(struct iommu_table *tbl) static void dart_build_pmac(struct iommu_table *tbl, long index, long npages, unsigned long uaddr, - int direction) + enum dma_data_direction direction) { unsigned int *dp; unsigned int rpn; diff --git a/arch/ppc64/kernel/vio.c b/arch/ppc64/kernel/vio.c index 054027f898fb..530528a3fbfb 100644 --- a/arch/ppc64/kernel/vio.c +++ b/arch/ppc64/kernel/vio.c @@ -431,7 +431,7 @@ dma_addr_t vio_map_single(struct vio_dev *dev, void *vaddr, tbl = dev->iommu_table; if (tbl) { - dma_handle = iommu_alloc(tbl, vaddr, npages, (int)direction); + dma_handle = iommu_alloc(tbl, vaddr, npages, direction); dma_handle |= (uaddr & ~PAGE_MASK); } @@ -470,7 +470,7 @@ int vio_map_sg(struct vio_dev *vdev, struct scatterlist *sglist, int nelems, if (!tbl) return 0; - return iommu_alloc_sg(tbl, &vdev->dev, sglist, nelems, (int)direction); + return iommu_alloc_sg(tbl, &vdev->dev, sglist, nelems, direction); } EXPORT_SYMBOL(vio_map_sg); @@ -515,7 +515,7 @@ void *vio_alloc_consistent(struct vio_dev *dev, size_t size, /* Page allocation succeeded */ memset(ret, 0, npages << PAGE_SHIFT); /* Set up tces to cover the allocated range */ - tce = iommu_alloc(tbl, ret, npages, (int)DMA_BIDIRECTIONAL); + tce = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL); if (tce == DMA_ERROR_CODE) { PPCDBG(PPCDBG_TCE, "vio_alloc_consistent: iommu_alloc failed\n" ); free_pages((unsigned long)ret, order); diff --git a/include/asm-ppc64/iommu.h b/include/asm-ppc64/iommu.h index 3aeadc38d5f3..a2cc850ef10a 100644 --- a/include/asm-ppc64/iommu.h +++ b/include/asm-ppc64/iommu.h @@ -25,6 +25,7 @@ #include #include #include +#include /* * IOMAP_MAX_ORDER defines 
the largest contiguous block @@ -132,14 +133,15 @@ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl); /* allocates a range of tces and sets them to the pages */ extern dma_addr_t iommu_alloc(struct iommu_table *, void *page, - unsigned int numPages, int direction); + unsigned int numPages, + enum dma_data_direction direction); extern void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, unsigned int npages); /* same with sg lists */ extern int iommu_alloc_sg(struct iommu_table *table, struct device *dev, struct scatterlist *sglist, int nelems, - int direction); + enum dma_data_direction direction); extern void iommu_free_sg(struct iommu_table *tbl, struct scatterlist *sglist, int nelems); diff --git a/include/asm-ppc64/machdep.h b/include/asm-ppc64/machdep.h index 10e7e9ec6251..bb961a029fae 100644 --- a/include/asm-ppc64/machdep.h +++ b/include/asm-ppc64/machdep.h @@ -11,6 +11,7 @@ #include #include +#include struct pt_regs; struct pci_bus; @@ -57,7 +58,7 @@ struct machdep_calls { long index, long npages, unsigned long uaddr, - int direction); + enum dma_data_direction direction); void (*tce_free)(struct iommu_table *tbl, long index, long npages); diff --git a/include/asm-ppc64/pci-bridge.h b/include/asm-ppc64/pci-bridge.h index a092b9cae621..08ba3f2b89ba 100644 --- a/include/asm-ppc64/pci-bridge.h +++ b/include/asm-ppc64/pci-bridge.h @@ -2,6 +2,8 @@ #ifndef _ASM_PCI_BRIDGE_H #define _ASM_PCI_BRIDGE_H +#include + /* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License diff --git a/include/asm-ppc64/pci.h b/include/asm-ppc64/pci.h index 9186c7d55e9d..032a2e6e8224 100644 --- a/include/asm-ppc64/pci.h +++ b/include/asm-ppc64/pci.h @@ -64,13 +64,13 @@ struct pci_dma_ops { void *vaddr, dma_addr_t dma_handle); dma_addr_t (*pci_map_single)(struct pci_dev *hwdev, void *ptr, - size_t size, int direction); + size_t size, enum dma_data_direction direction); void 
(*pci_unmap_single)(struct pci_dev *hwdev, dma_addr_t dma_addr, - size_t size, int direction); + size_t size, enum dma_data_direction direction); int (*pci_map_sg)(struct pci_dev *hwdev, struct scatterlist *sg, - int nents, int direction); + int nents, enum dma_data_direction direction); void (*pci_unmap_sg)(struct pci_dev *hwdev, struct scatterlist *sg, - int nents, int direction); + int nents, enum dma_data_direction direction); int (*pci_dma_supported)(struct pci_dev *hwdev, u64 mask); int (*pci_dac_dma_supported)(struct pci_dev *hwdev, u64 mask); }; @@ -92,25 +92,29 @@ static inline void pci_free_consistent(struct pci_dev *hwdev, size_t size, static inline dma_addr_t pci_map_single(struct pci_dev *hwdev, void *ptr, size_t size, int direction) { - return pci_dma_ops.pci_map_single(hwdev, ptr, size, direction); + return pci_dma_ops.pci_map_single(hwdev, ptr, size, + (enum dma_data_direction)direction); } static inline void pci_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_addr, size_t size, int direction) { - pci_dma_ops.pci_unmap_single(hwdev, dma_addr, size, direction); + pci_dma_ops.pci_unmap_single(hwdev, dma_addr, size, + (enum dma_data_direction)direction); } static inline int pci_map_sg(struct pci_dev *hwdev, struct scatterlist *sg, int nents, int direction) { - return pci_dma_ops.pci_map_sg(hwdev, sg, nents, direction); + return pci_dma_ops.pci_map_sg(hwdev, sg, nents, + (enum dma_data_direction)direction); } static inline void pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sg, int nents, int direction) { - pci_dma_ops.pci_unmap_sg(hwdev, sg, nents, direction); + pci_dma_ops.pci_unmap_sg(hwdev, sg, nents, + (enum dma_data_direction)direction); } static inline void pci_dma_sync_single_for_cpu(struct pci_dev *hwdev, -- cgit v1.2.3 From e1df56ff96e6b7be4a651dad58ba38cda5f0d8b3 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:48:54 -0700 Subject: [PATCH] ppc64: Consolidate some of the iommu DMA mapping routines. 
From: Stephen Rothwell This patch consolidates some of the iommu DMA mapping routines. --- arch/ppc64/kernel/iommu.c | 125 +++++++++++++++++++++++++++++++++++++++--- arch/ppc64/kernel/pci_iommu.c | 121 +++------------------------------------- arch/ppc64/kernel/vio.c | 113 +++----------------------------------- include/asm-ppc64/iommu.h | 34 ++++++------ 4 files changed, 149 insertions(+), 244 deletions(-) (limited to 'include') diff --git a/arch/ppc64/kernel/iommu.c b/arch/ppc64/kernel/iommu.c index aa6b207cd321..fb321026ea72 100644 --- a/arch/ppc64/kernel/iommu.c +++ b/arch/ppc64/kernel/iommu.c @@ -140,7 +140,7 @@ static unsigned long iommu_range_alloc(struct iommu_table *tbl, unsigned long np return n; } -dma_addr_t iommu_alloc(struct iommu_table *tbl, void *page, +static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *page, unsigned int npages, enum dma_data_direction direction) { unsigned long entry, flags; @@ -206,7 +206,7 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, __clear_bit(free_entry+i, tbl->it_map); } -void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, +static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, unsigned int npages) { unsigned long flags; @@ -225,9 +225,9 @@ void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, spin_unlock_irqrestore(&(tbl->it_lock), flags); } -int iommu_alloc_sg(struct iommu_table *tbl, struct device *dev, - struct scatterlist *sglist, int nelems, - enum dma_data_direction direction) +int iommu_map_sg(struct device *dev, struct iommu_table *tbl, + struct scatterlist *sglist, int nelems, + enum dma_data_direction direction) { dma_addr_t dma_next, dma_addr; unsigned long flags; @@ -235,6 +235,11 @@ int iommu_alloc_sg(struct iommu_table *tbl, struct device *dev, int outcount; unsigned long handle; + BUG_ON(direction == DMA_NONE); + + if ((nelems == 0) || !tbl) + return 0; + outs = s = segstart = &sglist[0]; outcount = 1; handle = 0; @@ -349,11 +354,16 @@ int 
iommu_alloc_sg(struct iommu_table *tbl, struct device *dev, } -void iommu_free_sg(struct iommu_table *tbl, struct scatterlist *sglist, - int nelems) +void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist, + int nelems, enum dma_data_direction direction) { unsigned long flags; + BUG_ON(direction == DMA_NONE); + + if (!tbl) + return; + spin_lock_irqsave(&(tbl->it_lock), flags); while (nelems--) { @@ -414,3 +424,104 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl) return tbl; } + +/* Creates TCEs for a user provided buffer. The user buffer must be + * contiguous real kernel storage (not vmalloc). The address of the buffer + * passed here is the kernel (virtual) address of the buffer. The buffer + * need not be page aligned, the dma_addr_t returned will point to the same + * byte within the page as vaddr. + */ +dma_addr_t iommu_map_single(struct iommu_table *tbl, void *vaddr, + size_t size, enum dma_data_direction direction) +{ + dma_addr_t dma_handle = DMA_ERROR_CODE; + unsigned long uaddr; + unsigned int npages; + + BUG_ON(direction == DMA_NONE); + + uaddr = (unsigned long)vaddr; + npages = PAGE_ALIGN(uaddr + size) - (uaddr & PAGE_MASK); + npages >>= PAGE_SHIFT; + + if (tbl) { + dma_handle = iommu_alloc(tbl, vaddr, npages, direction); + if (dma_handle == DMA_ERROR_CODE) { + if (printk_ratelimit()) { + printk(KERN_INFO "iommu_alloc failed, " + "tbl %p vaddr %p npages %d\n", + tbl, vaddr, npages); + } + } else + dma_handle |= (uaddr & ~PAGE_MASK); + } + + return dma_handle; +} + +void iommu_unmap_single(struct iommu_table *tbl, dma_addr_t dma_handle, + size_t size, enum dma_data_direction direction) +{ + BUG_ON(direction == DMA_NONE); + + if (tbl) + iommu_free(tbl, dma_handle, (PAGE_ALIGN(dma_handle + size) - + (dma_handle & PAGE_MASK)) >> PAGE_SHIFT); +} + +/* Allocates a contiguous real buffer and creates mappings over it. 
+ * Returns the virtual address of the buffer and sets dma_handle + * to the dma address (mapping) of the first page. Returns NULL on failure. + */ +void *iommu_alloc_consistent(struct iommu_table *tbl, size_t size, + dma_addr_t *dma_handle) +{ + void *ret = NULL; + dma_addr_t mapping; + unsigned int npages, order; + + size = PAGE_ALIGN(size); + npages = size >> PAGE_SHIFT; + order = get_order(size); + + /* + * Client asked for way too much space. This is checked later + * anyway. It is easier to debug here for the drivers than in + * the tce tables. + */ + if (order >= IOMAP_MAX_ORDER) { + printk("iommu_alloc_consistent size too large: 0x%lx\n", size); + return NULL; + } + + if (!tbl) + return NULL; + + /* Alloc enough pages (and possibly more) */ + ret = (void *)__get_free_pages(GFP_ATOMIC, order); + if (!ret) + return NULL; + memset(ret, 0, size); + + /* Set up tces to cover the allocated range */ + mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL); + if (mapping == DMA_ERROR_CODE) { + free_pages((unsigned long)ret, order); + ret = NULL; + } else + *dma_handle = mapping; + return ret; +} + +void iommu_free_consistent(struct iommu_table *tbl, size_t size, + void *vaddr, dma_addr_t dma_handle) +{ + unsigned int npages; + + if (tbl) { + size = PAGE_ALIGN(size); + npages = size >> PAGE_SHIFT; + iommu_free(tbl, dma_handle, npages); + free_pages((unsigned long)vaddr, get_order(size)); + } +} diff --git a/arch/ppc64/kernel/pci_iommu.c b/arch/ppc64/kernel/pci_iommu.c index 0d4da23394e7..4d99851fe815 100644 --- a/arch/ppc64/kernel/pci_iommu.c +++ b/arch/ppc64/kernel/pci_iommu.c @@ -43,8 +43,6 @@ #include #endif /* CONFIG_PPC_ISERIES */ -#define DBG(...) 
- static inline struct iommu_table *devnode_table(struct pci_dev *dev) { if (!dev) @@ -69,67 +67,15 @@ static inline struct iommu_table *devnode_table(struct pci_dev *dev) static void *pci_iommu_alloc_consistent(struct pci_dev *hwdev, size_t size, dma_addr_t *dma_handle) { - struct iommu_table *tbl; - void *ret = NULL; - dma_addr_t mapping; - unsigned int npages, order; - - size = PAGE_ALIGN(size); - npages = size >> PAGE_SHIFT; - order = get_order(size); - - /* Client asked for way too much space. This is checked later anyway */ - /* It is easier to debug here for the drivers than in the tce tables.*/ - if (order >= IOMAP_MAX_ORDER) { - printk("PCI_DMA: pci_alloc_consistent size too large: 0x%lx\n", - size); - return NULL; - } - - tbl = devnode_table(hwdev); - - if (!tbl) - return NULL; - - /* Alloc enough pages (and possibly more) */ - ret = (void *)__get_free_pages(GFP_ATOMIC, order); - - if (!ret) - return NULL; - - memset(ret, 0, size); - - /* Set up tces to cover the allocated range */ - mapping = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL); - - if (mapping == DMA_ERROR_CODE) { - free_pages((unsigned long)ret, order); - ret = NULL; - } else - *dma_handle = mapping; - - return ret; + return iommu_alloc_consistent(devnode_table(hwdev), size, dma_handle); } - static void pci_iommu_free_consistent(struct pci_dev *hwdev, size_t size, void *vaddr, dma_addr_t dma_handle) { - struct iommu_table *tbl; - unsigned int npages; - - size = PAGE_ALIGN(size); - npages = size >> PAGE_SHIFT; - - tbl = devnode_table(hwdev); - - if (tbl) { - iommu_free(tbl, dma_handle, npages); - free_pages((unsigned long)vaddr, get_order(size)); - } + iommu_free_consistent(devnode_table(hwdev), size, vaddr, dma_handle); } - /* Creates TCEs for a user provided buffer. The user buffer must be * contiguous real kernel storage (not vmalloc). The address of the buffer * passed here is the kernel (virtual) address of the buffer. 
The buffer @@ -139,81 +85,28 @@ static void pci_iommu_free_consistent(struct pci_dev *hwdev, size_t size, static dma_addr_t pci_iommu_map_single(struct pci_dev *hwdev, void *vaddr, size_t size, enum dma_data_direction direction) { - struct iommu_table * tbl; - dma_addr_t dma_handle = DMA_ERROR_CODE; - unsigned long uaddr; - unsigned int npages; - - BUG_ON(direction == DMA_NONE); - - uaddr = (unsigned long)vaddr; - npages = PAGE_ALIGN(uaddr + size) - (uaddr & PAGE_MASK); - npages >>= PAGE_SHIFT; - - tbl = devnode_table(hwdev); - - if (tbl) { - dma_handle = iommu_alloc(tbl, vaddr, npages, direction); - if (dma_handle == DMA_ERROR_CODE) { - if (printk_ratelimit()) { - printk(KERN_INFO "iommu_alloc failed, tbl %p vaddr %p npages %d\n", - tbl, vaddr, npages); - } - } else - dma_handle |= (uaddr & ~PAGE_MASK); - } - - return dma_handle; + return iommu_map_single(devnode_table(hwdev), vaddr, size, direction); } static void pci_iommu_unmap_single(struct pci_dev *hwdev, dma_addr_t dma_handle, size_t size, enum dma_data_direction direction) { - struct iommu_table *tbl; - unsigned int npages; - - BUG_ON(direction == DMA_NONE); - - npages = (PAGE_ALIGN(dma_handle + size) - (dma_handle & PAGE_MASK)) - >> PAGE_SHIFT; - - tbl = devnode_table(hwdev); - - if (tbl) - iommu_free(tbl, dma_handle, npages); + iommu_unmap_single(devnode_table(hwdev), dma_handle, size, direction); } static int pci_iommu_map_sg(struct pci_dev *pdev, struct scatterlist *sglist, int nelems, enum dma_data_direction direction) { - struct iommu_table * tbl; - - BUG_ON(direction == DMA_NONE); - - if (nelems == 0) - return 0; - - tbl = devnode_table(pdev); - if (!tbl) - return 0; - - return iommu_alloc_sg(tbl, &pdev->dev, sglist, nelems, direction); + return iommu_map_sg(&pdev->dev, devnode_table(pdev), sglist, + nelems, direction); } static void pci_iommu_unmap_sg(struct pci_dev *pdev, struct scatterlist *sglist, int nelems, enum dma_data_direction direction) { - struct iommu_table *tbl; - - BUG_ON(direction == 
DMA_NONE); - - tbl = devnode_table(pdev); - if (!tbl) - return; - - iommu_free_sg(tbl, sglist, nelems); + iommu_unmap_sg(devnode_table(pdev), sglist, nelems, direction); } /* We support DMA to/from any memory page via the iommu */ diff --git a/arch/ppc64/kernel/vio.c b/arch/ppc64/kernel/vio.c index 530528a3fbfb..c4c16b8cd7a9 100644 --- a/arch/ppc64/kernel/vio.c +++ b/arch/ppc64/kernel/vio.c @@ -413,145 +413,46 @@ int vio_disable_interrupts(struct vio_dev *dev) } EXPORT_SYMBOL(vio_disable_interrupts); - dma_addr_t vio_map_single(struct vio_dev *dev, void *vaddr, size_t size, enum dma_data_direction direction) { - struct iommu_table *tbl; - dma_addr_t dma_handle = DMA_ERROR_CODE; - unsigned long uaddr; - unsigned int npages; - - BUG_ON(direction == DMA_NONE); - - uaddr = (unsigned long)vaddr; - npages = PAGE_ALIGN( uaddr + size ) - ( uaddr & PAGE_MASK ); - npages >>= PAGE_SHIFT; - - tbl = dev->iommu_table; - - if (tbl) { - dma_handle = iommu_alloc(tbl, vaddr, npages, direction); - dma_handle |= (uaddr & ~PAGE_MASK); - } - - return dma_handle; + return iommu_map_single(dev->iommu_table, vaddr, size, direction); } EXPORT_SYMBOL(vio_map_single); void vio_unmap_single(struct vio_dev *dev, dma_addr_t dma_handle, size_t size, enum dma_data_direction direction) { - struct iommu_table * tbl; - unsigned int npages; - - BUG_ON(direction == DMA_NONE); - - npages = PAGE_ALIGN( dma_handle + size ) - ( dma_handle & PAGE_MASK ); - npages >>= PAGE_SHIFT; - - tbl = dev->iommu_table; - if(tbl) - iommu_free(tbl, dma_handle, npages); + iommu_unmap_single(dev->iommu_table, dma_handle, size, direction); } EXPORT_SYMBOL(vio_unmap_single); int vio_map_sg(struct vio_dev *vdev, struct scatterlist *sglist, int nelems, enum dma_data_direction direction) { - struct iommu_table *tbl; - - BUG_ON(direction == DMA_NONE); - - if (nelems == 0) - return 0; - - tbl = vdev->iommu_table; - if (!tbl) - return 0; - - return iommu_alloc_sg(tbl, &vdev->dev, sglist, nelems, direction); + return 
iommu_map_sg(&vdev->dev, vdev->iommu_table, sglist, + nelems, direction); } EXPORT_SYMBOL(vio_map_sg); void vio_unmap_sg(struct vio_dev *vdev, struct scatterlist *sglist, int nelems, enum dma_data_direction direction) { - struct iommu_table *tbl; - - BUG_ON(direction == DMA_NONE); - - tbl = vdev->iommu_table; - if (tbl) - iommu_free_sg(tbl, sglist, nelems); + iommu_unmap_sg(vdev->iommu_table, sglist, nelems, direction); } EXPORT_SYMBOL(vio_unmap_sg); void *vio_alloc_consistent(struct vio_dev *dev, size_t size, dma_addr_t *dma_handle) { - struct iommu_table * tbl; - void *ret = NULL; - unsigned int npages, order; - dma_addr_t tce; - - size = PAGE_ALIGN(size); - npages = size >> PAGE_SHIFT; - order = get_order(size); - - /* Client asked for way to much space. This is checked later anyway */ - /* It is easier to debug here for the drivers than in the tce tables.*/ - if(order >= IOMAP_MAX_ORDER) { - printk("VIO_DMA: vio_alloc_consistent size too large: 0x%lx \n", size); - return NULL; - } - - tbl = dev->iommu_table; - - if (tbl) { - /* Alloc enough pages (and possibly more) */ - ret = (void *)__get_free_pages(GFP_ATOMIC, order); - if (ret) { - /* Page allocation succeeded */ - memset(ret, 0, npages << PAGE_SHIFT); - /* Set up tces to cover the allocated range */ - tce = iommu_alloc(tbl, ret, npages, DMA_BIDIRECTIONAL); - if (tce == DMA_ERROR_CODE) { - PPCDBG(PPCDBG_TCE, "vio_alloc_consistent: iommu_alloc failed\n" ); - free_pages((unsigned long)ret, order); - ret = NULL; - } else { - *dma_handle = tce; - } - } - else PPCDBG(PPCDBG_TCE, "vio_alloc_consistent: __get_free_pages failed for size = %d\n", size); - } - else PPCDBG(PPCDBG_TCE, "vio_alloc_consistent: get_iommu_table failed for 0x%016lx\n", dev); - - PPCDBG(PPCDBG_TCE, "\tvio_alloc_consistent: dma_handle = 0x%16.16lx\n", *dma_handle); - PPCDBG(PPCDBG_TCE, "\tvio_alloc_consistent: return = 0x%16.16lx\n", ret); - return ret; + return iommu_alloc_consistent(dev->iommu_table, size, dma_handle); } 
EXPORT_SYMBOL(vio_alloc_consistent); void vio_free_consistent(struct vio_dev *dev, size_t size, void *vaddr, dma_addr_t dma_handle) { - struct iommu_table *tbl; - unsigned int npages; - - PPCDBG(PPCDBG_TCE, "vio_free_consistent:\n"); - PPCDBG(PPCDBG_TCE, "\tdev = 0x%16.16lx, size = 0x%16.16lx, dma_handle = 0x%16.16lx, vaddr = 0x%16.16lx\n", dev, size, dma_handle, vaddr); - - size = PAGE_ALIGN(size); - npages = size >> PAGE_SHIFT; - - tbl = dev->iommu_table; - - if ( tbl ) { - iommu_free(tbl, dma_handle, npages); - free_pages((unsigned long)vaddr, get_order(size)); - } + iommu_free_consistent(dev->iommu_table, size, vaddr, dma_handle); } EXPORT_SYMBOL(vio_free_consistent); diff --git a/include/asm-ppc64/iommu.h b/include/asm-ppc64/iommu.h index a2cc850ef10a..689b6adf383e 100644 --- a/include/asm-ppc64/iommu.h +++ b/include/asm-ppc64/iommu.h @@ -19,8 +19,8 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ -#ifndef _PCI_DMA_H -#define _PCI_DMA_H +#ifndef _ASM_IOMMU_H +#define _ASM_IOMMU_H #include #include @@ -131,20 +131,20 @@ extern void iommu_devnode_init(struct iSeries_Device_Node *dn); */ extern struct iommu_table *iommu_init_table(struct iommu_table * tbl); -/* allocates a range of tces and sets them to the pages */ -extern dma_addr_t iommu_alloc(struct iommu_table *, void *page, - unsigned int numPages, - enum dma_data_direction direction); -extern void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, - unsigned int npages); - -/* same with sg lists */ -extern int iommu_alloc_sg(struct iommu_table *table, struct device *dev, - struct scatterlist *sglist, int nelems, - enum dma_data_direction direction); -extern void iommu_free_sg(struct iommu_table *tbl, struct scatterlist *sglist, - int nelems); - +extern int iommu_map_sg(struct device *dev, struct iommu_table *tbl, + struct scatterlist *sglist, int nelems, + enum dma_data_direction direction); +extern void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist 
*sglist, + int nelems, enum dma_data_direction direction); + +extern void *iommu_alloc_consistent(struct iommu_table *tbl, size_t size, + dma_addr_t *dma_handle); +extern void iommu_free_consistent(struct iommu_table *tbl, size_t size, + void *vaddr, dma_addr_t dma_handle); +extern dma_addr_t iommu_map_single(struct iommu_table *tbl, void *vaddr, + size_t size, enum dma_data_direction direction); +extern void iommu_unmap_single(struct iommu_table *tbl, dma_addr_t dma_handle, + size_t size, enum dma_data_direction direction); extern void tce_init_pSeries(void); extern void tce_init_iSeries(void); @@ -154,4 +154,4 @@ extern void pci_dma_init_direct(void); extern int ppc64_iommu_off; -#endif +#endif /* _ASM_IOMMU_H */ -- cgit v1.2.3 From 53e8cdeb3bd9eea4ccd67de4a3b5b479680ca063 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:49:21 -0700 Subject: [PATCH] ppc64: replace vio_dma_mapping_error with dma_mapping_error everywhere. From: Stephen Rothwell James Bottomley is right, this was a mistake. This patch replaces vio_dma_mapping_error with dma_mapping_error everywhere. 
--- drivers/net/ibmveth.c | 16 ++++++++-------- include/asm-ppc64/vio.h | 5 ----- 2 files changed, 8 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/drivers/net/ibmveth.c b/drivers/net/ibmveth.c index 6427a25c2719..3da41374a127 100644 --- a/drivers/net/ibmveth.c +++ b/drivers/net/ibmveth.c @@ -404,7 +404,7 @@ static inline void ibmveth_rxq_harvest_buffer(struct ibmveth_adapter *adapter) static void ibmveth_cleanup(struct ibmveth_adapter *adapter) { if(adapter->buffer_list_addr != NULL) { - if(!vio_dma_mapping_error(adapter->buffer_list_dma)) { + if(!dma_mapping_error(adapter->buffer_list_dma)) { vio_unmap_single(adapter->vdev, adapter->buffer_list_dma, 4096, DMA_BIDIRECTIONAL); adapter->buffer_list_dma = DMA_ERROR_CODE; } @@ -413,7 +413,7 @@ static void ibmveth_cleanup(struct ibmveth_adapter *adapter) } if(adapter->filter_list_addr != NULL) { - if(!vio_dma_mapping_error(adapter->filter_list_dma)) { + if(!dma_mapping_error(adapter->filter_list_dma)) { vio_unmap_single(adapter->vdev, adapter->filter_list_dma, 4096, DMA_BIDIRECTIONAL); adapter->filter_list_dma = DMA_ERROR_CODE; } @@ -422,7 +422,7 @@ static void ibmveth_cleanup(struct ibmveth_adapter *adapter) } if(adapter->rx_queue.queue_addr != NULL) { - if(!vio_dma_mapping_error(adapter->rx_queue.queue_dma)) { + if(!dma_mapping_error(adapter->rx_queue.queue_dma)) { vio_unmap_single(adapter->vdev, adapter->rx_queue.queue_dma, adapter->rx_queue.queue_len, DMA_BIDIRECTIONAL); adapter->rx_queue.queue_dma = DMA_ERROR_CODE; } @@ -473,9 +473,9 @@ static int ibmveth_open(struct net_device *netdev) adapter->filter_list_dma = vio_map_single(adapter->vdev, adapter->filter_list_addr, 4096, DMA_BIDIRECTIONAL); adapter->rx_queue.queue_dma = vio_map_single(adapter->vdev, adapter->rx_queue.queue_addr, adapter->rx_queue.queue_len, DMA_BIDIRECTIONAL); - if((vio_dma_mapping_error(adapter->buffer_list_dma) ) || - (vio_dma_mapping_error(adapter->filter_list_dma)) || - 
(vio_dma_mapping_error(adapter->rx_queue.queue_dma))) { + if((dma_mapping_error(adapter->buffer_list_dma) ) || + (dma_mapping_error(adapter->filter_list_dma)) || + (dma_mapping_error(adapter->rx_queue.queue_dma))) { ibmveth_error_printk("unable to map filter or buffer list pages\n"); ibmveth_cleanup(adapter); return -ENOMEM; @@ -644,7 +644,7 @@ static int ibmveth_start_xmit(struct sk_buff *skb, struct net_device *netdev) desc[0].fields.address = vio_map_single(adapter->vdev, skb->data, desc[0].fields.length, PCI_DMA_TODEVICE); desc[0].fields.valid = 1; - if(vio_dma_mapping_error(desc[0].fields.address)) { + if(dma_mapping_error(desc[0].fields.address)) { ibmveth_error_printk("tx: unable to map initial fragment\n"); adapter->tx_map_failed++; adapter->stats.tx_dropped++; @@ -663,7 +663,7 @@ static int ibmveth_start_xmit(struct sk_buff *skb, struct net_device *netdev) desc[curfrag+1].fields.length = frag->size; desc[curfrag+1].fields.valid = 1; - if(vio_dma_mapping_error(desc[curfrag+1].fields.address)) { + if(dma_mapping_error(desc[curfrag+1].fields.address)) { ibmveth_error_printk("tx: unable to map fragment %d\n", curfrag); adapter->tx_map_failed++; adapter->stats.tx_dropped++; diff --git a/include/asm-ppc64/vio.h b/include/asm-ppc64/vio.h index 7fe3a23736e7..107201b25008 100644 --- a/include/asm-ppc64/vio.h +++ b/include/asm-ppc64/vio.h @@ -137,9 +137,4 @@ static inline struct vio_dev *to_vio_dev(struct device *dev) return container_of(dev, struct vio_dev, dev); } -static inline int vio_dma_mapping_error(dma_addr_t dma_addr) -{ - return dma_mapping_error(dma_addr); -} - #endif /* _ASM_VIO_H */ -- cgit v1.2.3 From c3a85f1fb88cfa30ab4af65348eaf4290233cac8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:49:46 -0700 Subject: [PATCH] ppc64: Oops cleanup From: Anton Blanchard Oops cleanup: - Move prototypes into system.h - Move the debugger hooks into die, all the calls sites were calling them. 
- Handle bad values passed to prregs --- arch/ppc64/kernel/traps.c | 50 +++++++++++++++++++++------------------------- arch/ppc64/mm/fault.c | 8 +------- arch/ppc64/xmon/xmon.c | 34 +++++++++++++++++++------------ include/asm-ppc64/system.h | 5 +++++ 4 files changed, 50 insertions(+), 47 deletions(-) (limited to 'include') diff --git a/arch/ppc64/kernel/traps.c b/arch/ppc64/kernel/traps.c index 6074cae369e9..82d9b8b24e61 100644 --- a/arch/ppc64/kernel/traps.c +++ b/arch/ppc64/kernel/traps.c @@ -37,9 +37,6 @@ #include #include -extern int fix_alignment(struct pt_regs *); -extern void bad_page_fault(struct pt_regs *, unsigned long, int); - #ifdef CONFIG_PPC_PSERIES /* This is true if we are using the firmware NMI handler (typically LPAR) */ extern int fwnmi_active; @@ -67,11 +64,17 @@ EXPORT_SYMBOL(__debugger_fault_handler); static spinlock_t die_lock = SPIN_LOCK_UNLOCKED; -void die(const char *str, struct pt_regs *regs, long err) +int die(const char *str, struct pt_regs *regs, long err) { static int die_counter; int nl = 0; + if (debugger_fault_handler(regs)) + return 1; + + if (debugger(regs)) + return 1; + console_verbose(); spin_lock_irq(&die_lock); bust_spinlocks(1); @@ -126,15 +129,16 @@ void die(const char *str, struct pt_regs *regs, long err) panic("Fatal exception"); } do_exit(SIGSEGV); + + return 0; } static void _exception(int signr, siginfo_t *info, struct pt_regs *regs) { if (!user_mode(regs)) { - if (debugger(regs)) + if (die("Exception in kernel mode", regs, signr)) return; - die("Exception in kernel mode", regs, signr); } force_sig_info(signr, info, current); @@ -188,8 +192,7 @@ SystemResetException(struct pt_regs *regs) } #endif - if (!debugger(regs)) - die("System Reset", regs, 0); + die("System Reset", regs, 0); /* Must die if the interrupt is not recoverable */ if (!(regs->msr & MSR_RI)) @@ -246,9 +249,6 @@ static int recover_mce(struct pt_regs *regs, struct rtas_error_log err) * * On hardware prior to Power 4 these exceptions were asynchronous 
which * means we can't tell exactly where it occurred and so we can't recover. - * - * Note that the debugger should test RI=0 and warn the user that system - * state has been corrupted. */ void MachineCheckException(struct pt_regs *regs) @@ -266,12 +266,11 @@ MachineCheckException(struct pt_regs *regs) } #endif - if (debugger_fault_handler(regs)) - return; - if (debugger(regs)) - return; + die("Machine check", regs, 0); - die("Machine check in kernel mode", regs, 0); + /* Must die if the interrupt is not recoverable */ + if (!(regs->msr & MSR_RI)) + panic("Unrecoverable Machine check"); } void @@ -397,9 +396,6 @@ ProgramCheckException(struct pt_regs *regs) { siginfo_t info; - if (debugger_fault_handler(regs)) - return; - if (regs->msr & 0x100000) { /* IEEE FP exception */ @@ -438,16 +434,18 @@ ProgramCheckException(struct pt_regs *regs) } } -void -KernelFPUnavailableException(struct pt_regs *regs) +void KernelFPUnavailableException(struct pt_regs *regs) { - die("Unrecoverable FP Unavailable Exception in Kernel", regs, 0); + printk(KERN_EMERG "Unrecoverable FP Unavailable Exception " + "%lx at %lx\n", regs->trap, regs->nip); + die("Unrecoverable FP Unavailable Exception", regs, SIGABRT); } -void -KernelAltivecUnavailableException(struct pt_regs *regs) +void KernelAltivecUnavailableException(struct pt_regs *regs) { - die("Unrecoverable VMX/Altivec Unavailable Exception in Kernel", regs, 0); + printk(KERN_EMERG "Unrecoverable VMX/Altivec Unavailable Exception " + "%lx at %lx\n", regs->trap, regs->nip); + die("Unrecoverable VMX/Altivec Unavailable Exception", regs, SIGABRT); } void @@ -539,7 +537,6 @@ void unrecoverable_exception(struct pt_regs *regs) { printk(KERN_EMERG "Unrecoverable exception %lx at %lx\n", regs->trap, regs->nip); - debugger(regs); die("Unrecoverable exception", regs, SIGABRT); } @@ -551,7 +548,6 @@ void kernel_bad_stack(struct pt_regs *regs) { printk(KERN_EMERG "Bad kernel stack pointer %lx at %lx\n", regs->gpr[1], regs->nip); - debugger(regs); 
die("Bad kernel stack pointer", regs, SIGABRT); } diff --git a/arch/ppc64/mm/fault.c b/arch/ppc64/mm/fault.c index 4fd5100acff1..73712c143825 100644 --- a/arch/ppc64/mm/fault.c +++ b/arch/ppc64/mm/fault.c @@ -37,8 +37,6 @@ #include #include -void bad_page_fault(struct pt_regs *, unsigned long, int); - /* * The error_code parameter is * - DSISR for a non-SLB data access fault, @@ -177,10 +175,8 @@ do_sigbus: * It is called from do_page_fault above and from some of the procedures * in traps.c. */ -void -bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) +void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) { - extern void die(const char *, struct pt_regs *, long); const struct exception_table_entry *entry; /* Are we prepared to handle this fault? */ @@ -190,7 +186,5 @@ bad_page_fault(struct pt_regs *regs, unsigned long address, int sig) } /* kernel has accessed a bad area */ - if (debugger(regs)) - return; die("Kernel access of bad area", regs, sig); } diff --git a/arch/ppc64/xmon/xmon.c b/arch/ppc64/xmon/xmon.c index 8bf490b348f1..90fe14bacc59 100644 --- a/arch/ppc64/xmon/xmon.c +++ b/arch/ppc64/xmon/xmon.c @@ -542,8 +542,7 @@ cmds(struct pt_regs *excp) symbol_lookup(); break; case 'r': - if (excp != NULL) - prregs(excp); /* print regs */ + prregs(excp); /* print regs */ break; case 'e': if (excp == NULL) @@ -966,8 +965,7 @@ static void backtrace(struct pt_regs *excp) spinlock_t exception_print_lock = SPIN_LOCK_UNLOCKED; -void -excprint(struct pt_regs *fp) +void excprint(struct pt_regs *fp) { unsigned long flags; @@ -1002,21 +1000,31 @@ excprint(struct pt_regs *fp) spin_unlock_irqrestore(&exception_print_lock, flags); } -void -prregs(struct pt_regs *fp) +void prregs(struct pt_regs *fp) { int n; unsigned long base; if (scanhex((void *)&base)) fp = (struct pt_regs *) base; - for (n = 0; n < 16; ++n) - printf("R%.2ld = %.16lx R%.2ld = %.16lx\n", n, fp->gpr[n], - n+16, fp->gpr[n+16]); - printf("pc = %.16lx msr = %.16lx\nlr = %.16lx 
cr = %.16lx\n", - fp->nip, fp->msr, fp->link, fp->ccr); - printf("ctr = %.16lx xer = %.16lx trap = %8lx\n", - fp->ctr, fp->xer, fp->trap); + + if (setjmp(bus_error_jmp) == 0) { + __debugger_fault_handler = handle_fault; + sync(); + for (n = 0; n < 16; ++n) + printf("R%.2ld = %.16lx R%.2ld = %.16lx\n", n, + fp->gpr[n], n+16, fp->gpr[n+16]); + printf("pc = %.16lx msr = %.16lx\nlr = %.16lx " + "cr = %.16lx\n", fp->nip, fp->msr, fp->link, fp->ccr); + printf("ctr = %.16lx xer = %.16lx trap = %8lx\n", + fp->ctr, fp->xer, fp->trap); + + sync(); + /* wait a little while to see if we get a machine check */ + __delay(200); + } else { + printf("*** Error reading regs\n"); + } } void diff --git a/include/asm-ppc64/system.h b/include/asm-ppc64/system.h index 9d732aa1256e..152a59acbc35 100644 --- a/include/asm-ppc64/system.h +++ b/include/asm-ppc64/system.h @@ -94,7 +94,12 @@ static inline int debugger_dabr_match(struct pt_regs *regs) { return 0; } static inline int debugger_fault_handler(struct pt_regs *regs) { return 0; } #endif +extern int fix_alignment(struct pt_regs *regs); +extern void bad_page_fault(struct pt_regs *regs, unsigned long address, + int sig); extern void show_regs(struct pt_regs * regs); +extern int die(const char *str, struct pt_regs *regs, long err); + extern void flush_instruction_cache(void); extern int _get_PVR(void); extern void giveup_fpu(struct task_struct *); -- cgit v1.2.3 From b7ceb1452399ef59ab14868337d2d74a9b5c4c8d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:49:59 -0700 Subject: [PATCH] ppc64: Add smt_snooze_delay cpu sysfs attribute From: Anton Blanchard Add smt_snooze_delay cpu sysfs attribute --- arch/ppc64/kernel/idle.c | 7 ++-- arch/ppc64/kernel/prom.c | 46 ------------------------ arch/ppc64/kernel/sysfs.c | 89 +++++++++++++++++++++++++++++++++++++++++++++++ include/asm-ppc64/naca.h | 6 ++-- 4 files changed, 95 insertions(+), 53 deletions(-) (limited to 'include') diff --git a/arch/ppc64/kernel/idle.c 
b/arch/ppc64/kernel/idle.c index b30aea273974..a9a501df397c 100644 --- a/arch/ppc64/kernel/idle.c +++ b/arch/ppc64/kernel/idle.c @@ -161,13 +161,14 @@ int default_idle(void) #ifdef CONFIG_PPC_PSERIES -DECLARE_PER_CPU(smt_snooze_delay); +DECLARE_PER_CPU(unsigned long, smt_snooze_delay); int dedicated_idle(void) { long oldval; struct paca_struct *lpaca = get_paca(), *ppaca; unsigned long start_snooze; + unsigned long *smt_snooze_delay = &__get_cpu_var(smt_snooze_delay); ppaca = &paca[smp_processor_id() ^ 1]; @@ -180,14 +181,14 @@ int dedicated_idle(void) if (!oldval) { set_thread_flag(TIF_POLLING_NRFLAG); start_snooze = __get_tb() + - naca->smt_snooze_delay*tb_ticks_per_usec; + *smt_snooze_delay * tb_ticks_per_usec; while (!need_resched()) { /* need_resched could be 1 or 0 at this * point. If it is 0, set it to 0, so * an IPI/Prod is sent. If it is 1, keep * it that way & schedule work. */ - if (naca->smt_snooze_delay == 0 || + if (*smt_snooze_delay == 0 || __get_tb() < start_snooze) { HMT_low(); /* Low thread priority */ continue; diff --git a/arch/ppc64/kernel/prom.c b/arch/ppc64/kernel/prom.c index f1cfd43dd39c..6748b2244e88 100644 --- a/arch/ppc64/kernel/prom.c +++ b/arch/ppc64/kernel/prom.c @@ -1254,7 +1254,6 @@ smt_setup(void) { char *p, *q; char my_smt_enabled = SMT_DYNAMIC; - unsigned long my_smt_snooze_delay; ihandle prom_options = NULL; char option[9]; unsigned long offset = reloc_offset(); @@ -1301,51 +1300,6 @@ smt_setup(void) if (!found ) my_smt_enabled = SMT_DYNAMIC; /* default to on */ - found = 0; - if (my_smt_enabled) { - if (strstr(RELOC(cmd_line), RELOC("smt-snooze-delay="))) { - for (q = RELOC(cmd_line); (p = strstr(q, RELOC("smt-snooze-delay="))) != 0; ) { - q = p + 17; - if (p > RELOC(cmd_line) && p[-1] != ' ') - continue; - found = 1; - /* Don't use simple_strtoul() because _ctype & others aren't RELOC'd */ - my_smt_snooze_delay = 0; - while (*q >= '0' && *q <= '9') { - my_smt_snooze_delay = my_smt_snooze_delay * 10 + *q - '0'; - q++; - } - } - 
} - - if (!found) { - prom_options = (ihandle)call_prom(RELOC("finddevice"), 1, 1, RELOC("/options")); - if (prom_options != (ihandle) -1) { - call_prom(RELOC("getprop"), - 4, 1, prom_options, - RELOC("ibm,smt-snooze-delay"), - option, - sizeof(option)); - if (option[0] != 0) { - found = 1; - /* Don't use simple_strtoul() because _ctype & others aren't RELOC'd */ - my_smt_snooze_delay = 0; - q = option; - while (*q >= '0' && *q <= '9') { - my_smt_snooze_delay = my_smt_snooze_delay * 10 + *q - '0'; - q++; - } - } - } - } - - if (!found) { - my_smt_snooze_delay = 0; /* default value */ - } - } else { - my_smt_snooze_delay = 0; /* default value */ - } - _naca->smt_snooze_delay = my_smt_snooze_delay; _naca->smt_state = my_smt_enabled; } diff --git a/arch/ppc64/kernel/sysfs.c b/arch/ppc64/kernel/sysfs.c index 05ef5291a737..3bcbdec74195 100644 --- a/arch/ppc64/kernel/sysfs.c +++ b/arch/ppc64/kernel/sysfs.c @@ -9,6 +9,90 @@ #include #include #include +#include + + +/* SMT stuff */ + +#ifndef CONFIG_PPC_ISERIES + +/* default to snooze disabled */ +DEFINE_PER_CPU(unsigned long, smt_snooze_delay); + +static ssize_t store_smt_snooze_delay(struct sys_device *dev, const char *buf, + size_t count) +{ + struct cpu *cpu = container_of(dev, struct cpu, sysdev); + ssize_t ret; + unsigned long snooze; + + ret = sscanf(buf, "%lu", &snooze); + if (ret != 1) + return -EINVAL; + + per_cpu(smt_snooze_delay, cpu->sysdev.id) = snooze; + + return count; +} + +static ssize_t show_smt_snooze_delay(struct sys_device *dev, char *buf) +{ + struct cpu *cpu = container_of(dev, struct cpu, sysdev); + + return sprintf(buf, "%lu\n", per_cpu(smt_snooze_delay, cpu->sysdev.id)); +} + +static SYSDEV_ATTR(smt_snooze_delay, 0644, show_smt_snooze_delay, + store_smt_snooze_delay); + +/* Only parse OF options if the matching cmdline option was not specified */ +static int smt_snooze_cmdline; + +static int __init smt_setup(void) +{ + struct device_node *options; + unsigned int *val; + unsigned int cpu; + + if 
(!cur_cpu_spec->cpu_features & CPU_FTR_SMT) + return 1; + + options = find_path_device("/options"); + if (!options) + return 1; + + val = (unsigned int *)get_property(options, "ibm,smt-snooze-delay", + NULL); + if (!smt_snooze_cmdline && val) { + for_each_cpu(cpu) + per_cpu(smt_snooze_delay, cpu) = *val; + } + + return 1; +} +__initcall(smt_setup); + +static int __init setup_smt_snooze_delay(char *str) +{ + unsigned int cpu; + int snooze; + + if (!cur_cpu_spec->cpu_features & CPU_FTR_SMT) + return 1; + + smt_snooze_cmdline = 1; + + if (get_option(&str, &snooze)) { + for_each_cpu(cpu) + per_cpu(smt_snooze_delay, cpu) = snooze; + } + + return 1; +} +__setup("smt-snooze-delay=", setup_smt_snooze_delay); + +#endif + /* PMC stuff */ @@ -235,6 +319,11 @@ static int __init topology_init(void) register_cpu_pmc(&c->sysdev); sysdev_create_file(&c->sysdev, &attr_physical_id); + +#ifndef CONFIG_PPC_ISERIES + if (cur_cpu_spec->cpu_features & CPU_FTR_SMT) + sysdev_create_file(&c->sysdev, &attr_smt_snooze_delay); +#endif } return 0; diff --git a/include/asm-ppc64/naca.h b/include/asm-ppc64/naca.h index b93cdf160dd3..a50189402420 100644 --- a/include/asm-ppc64/naca.h +++ b/include/asm-ppc64/naca.h @@ -37,12 +37,10 @@ struct naca_struct { u32 dCacheL1LinesPerPage; /* L1 d-cache lines / page 0x64 */ u32 iCacheL1LogLineSize; /* L1 i-cache line size Log2 0x68 */ u32 iCacheL1LinesPerPage; /* L1 i-cache lines / page 0x6c */ - u64 smt_snooze_delay; /* Delay (in usec) before 0x70 */ - /* entering ST mode */ - u8 smt_state; /* 0 = SMT off 0x78 */ + u8 smt_state; /* 0 = SMT off 0x70 */ /* 1 = SMT on */ /* 2 = SMT dynamic */ - u8 resv0[7]; /* Reserved 0x70 - 0x7F */ + u8 resv0[15]; /* Reserved 0x71 - 0x7F */ }; extern struct naca_struct *naca; -- cgit v1.2.3 From 9ed9e7e5eb0ab64f58ef41e36779cb96abc17076 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:50:11 -0700 Subject: [PATCH] ppc64: DMA API updates From: Anton Blanchard DMA API updates, in particular adding the new 
cache flush interfaces. --- arch/ppc64/kernel/dma.c | 24 -------------- include/asm-ppc64/dma-mapping.h | 69 +++++++++++++++++++++++++++++++---------- include/asm-ppc64/vio.h | 17 ++-------- 3 files changed, 54 insertions(+), 56 deletions(-) (limited to 'include') diff --git a/arch/ppc64/kernel/dma.c b/arch/ppc64/kernel/dma.c index f81dcd1e8da1..dec55efcc873 100644 --- a/arch/ppc64/kernel/dma.c +++ b/arch/ppc64/kernel/dma.c @@ -131,27 +131,3 @@ void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries, BUG(); } EXPORT_SYMBOL(dma_unmap_sg); - -void dma_sync_single(struct device *dev, dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction) -{ - if (dev->bus == &pci_bus_type) - pci_dma_sync_single(to_pci_dev(dev), dma_handle, size, (int)direction); - else if (dev->bus == &vio_bus_type) - vio_dma_sync_single(to_vio_dev(dev), dma_handle, size, direction); - else - BUG(); -} -EXPORT_SYMBOL(dma_sync_single); - -void dma_sync_sg(struct device *dev, struct scatterlist *sg, int nelems, - enum dma_data_direction direction) -{ - if (dev->bus == &pci_bus_type) - pci_dma_sync_sg(to_pci_dev(dev), sg, nelems, (int)direction); - else if (dev->bus == &vio_bus_type) - vio_dma_sync_sg(to_vio_dev(dev), sg, nelems, direction); - else - BUG(); -} -EXPORT_SYMBOL(dma_sync_sg); diff --git a/include/asm-ppc64/dma-mapping.h b/include/asm-ppc64/dma-mapping.h index e12753cf8861..0cdc5346f6f8 100644 --- a/include/asm-ppc64/dma-mapping.h +++ b/include/asm-ppc64/dma-mapping.h @@ -36,10 +36,43 @@ extern int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents, enum dma_data_direction direction); extern void dma_unmap_sg(struct device *dev, struct scatterlist *sg, int nhwentries, enum dma_data_direction direction); -extern void dma_sync_single(struct device *dev, dma_addr_t dma_handle, - size_t size, enum dma_data_direction direction); -extern void dma_sync_sg(struct device *dev, struct scatterlist *sg, int nelems, - enum dma_data_direction direction); 
+ +static inline void +dma_sync_single_for_cpu(struct device *dev, dma_addr_t dma_handle, size_t size, + enum dma_data_direction direction) +{ + BUG_ON(direction == DMA_NONE); + /* nothing to do */ +} + +static inline void +dma_sync_single_for_device(struct device *dev, dma_addr_t dma_handle, size_t size, + enum dma_data_direction direction) +{ + BUG_ON(direction == DMA_NONE); + /* nothing to do */ +} + +static inline void +dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg, int nelems, + enum dma_data_direction direction) +{ + BUG_ON(direction == DMA_NONE); + /* nothing to do */ +} + +static inline void +dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, int nelems, + enum dma_data_direction direction) +{ + BUG_ON(direction == DMA_NONE); + /* nothing to do */ +} + +static inline int dma_mapping_error(dma_addr_t dma_addr) +{ + return (dma_addr == DMA_ERROR_CODE); +} /* Now for the API extensions over the pci_ one */ @@ -56,27 +89,29 @@ dma_get_cache_alignment(void) } static inline void -dma_sync_single_range(struct device *dev, dma_addr_t dma_handle, - unsigned long offset, size_t size, - enum dma_data_direction direction) +dma_sync_single_range_for_cpu(struct device *dev, dma_addr_t dma_handle, + unsigned long offset, size_t size, + enum dma_data_direction direction) { - /* just sync everything, that's all the pci API can do */ - dma_sync_single(dev, dma_handle, offset+size, direction); + BUG_ON(direction == DMA_NONE); + /* nothing to do */ } static inline void -dma_cache_sync(void *vaddr, size_t size, - enum dma_data_direction direction) +dma_sync_single_range_for_device(struct device *dev, dma_addr_t dma_handle, + unsigned long offset, size_t size, + enum dma_data_direction direction) { - /* could define this in terms of the dma_cache ... 
operations, - * but if you get this on a platform, you should convert the platform - * to using the generic device DMA API */ - BUG(); + BUG_ON(direction == DMA_NONE); + /* nothing to do */ } -static inline int dma_mapping_error(dma_addr_t dma_addr) +static inline void +dma_cache_sync(void *vaddr, size_t size, + enum dma_data_direction direction) { - return (dma_addr == DMA_ERROR_CODE); + BUG_ON(direction == DMA_NONE); + /* nothing to do */ } #endif /* _ASM_DMA_MAPPING_H */ diff --git a/include/asm-ppc64/vio.h b/include/asm-ppc64/vio.h index 107201b25008..1ef9a270e7f7 100644 --- a/include/asm-ppc64/vio.h +++ b/include/asm-ppc64/vio.h @@ -78,23 +78,10 @@ static inline int vio_dma_supported(struct vio_dev *hwdev, u64 mask) vio_map_single(dev, (page_address(page) + (off)), size, dir) #define vio_unmap_page(dev,addr,sz,dir) vio_unmap_single(dev,addr,sz,dir) - -static inline void vio_dma_sync_single(struct vio_dev *hwdev, - dma_addr_t dma_handle, size_t size, - enum dma_data_direction direction) -{ - BUG_ON(direction == DMA_NONE); - /* nothing to do */ -} - -static inline void vio_dma_sync_sg(struct vio_dev *hwdev, - struct scatterlist *sg, int nelems, - enum dma_data_direction direction) +static inline int vio_set_dma_mask(struct vio_dev *dev, u64 mask) { - BUG_ON(direction == DMA_NONE); - /* nothing to do */ + return -EIO; } -static inline int vio_set_dma_mask(struct vio_dev *dev, u64 mask) { return -EIO; } extern struct bus_type vio_bus_type; -- cgit v1.2.3 From ec19a28db34aeb29720c658e5ebf4e60ccee4f6d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:50:24 -0700 Subject: [PATCH] ppc64: Remove unused rtas functions From: Joel Schopp I was looking at rtas serialization for reasons I won't go into here. While wandering through the code I found that two functions were not properly serialized. phys_call_rtas and phys_call_rtas_display_status are the functions. After looking further they are redundant and not used anywhere at all. 
--- arch/ppc64/kernel/rtas.c | 36 ------------------------------------ include/asm-ppc64/rtas.h | 2 -- 2 files changed, 38 deletions(-) (limited to 'include') diff --git a/arch/ppc64/kernel/rtas.c b/arch/ppc64/kernel/rtas.c index 4a27c3d8312c..ff0453726c9e 100644 --- a/arch/ppc64/kernel/rtas.c +++ b/arch/ppc64/kernel/rtas.c @@ -65,42 +65,6 @@ extern unsigned long reloc_offset(void); spinlock_t rtas_data_buf_lock = SPIN_LOCK_UNLOCKED; char rtas_data_buf[RTAS_DATA_BUF_SIZE]__page_aligned; -void -phys_call_rtas(int token, int nargs, int nret, ...) -{ - va_list list; - unsigned long offset = reloc_offset(); - struct rtas_args *rtas = PTRRELOC(&(get_paca()->xRtas)); - int i; - - rtas->token = token; - rtas->nargs = nargs; - rtas->nret = nret; - rtas->rets = (rtas_arg_t *)PTRRELOC(&(rtas->args[nargs])); - - va_start(list, nret); - for (i = 0; i < nargs; i++) - rtas->args[i] = (rtas_arg_t)LONG_LSW(va_arg(list, ulong)); - va_end(list); - - enter_rtas(rtas); -} - -void -phys_call_rtas_display_status(char c) -{ - unsigned long offset = reloc_offset(); - struct rtas_args *rtas = PTRRELOC(&(get_paca()->xRtas)); - - rtas->token = 10; - rtas->nargs = 1; - rtas->nret = 1; - rtas->rets = (rtas_arg_t *)PTRRELOC(&(rtas->args[1])); - rtas->args[0] = (int)c; - - enter_rtas(rtas); -} - void call_rtas_display_status(char c) { diff --git a/include/asm-ppc64/rtas.h b/include/asm-ppc64/rtas.h index 7f6139064c7c..47232af7b278 100644 --- a/include/asm-ppc64/rtas.h +++ b/include/asm-ppc64/rtas.h @@ -169,8 +169,6 @@ extern struct rtas_t rtas; extern void enter_rtas(struct rtas_args *); extern int rtas_token(const char *service); extern long rtas_call(int token, int, int, unsigned long *, ...); -extern void phys_call_rtas(int, int, int, ...); -extern void phys_call_rtas_display_status(char); extern void call_rtas_display_status(char); extern void rtas_restart(char *cmd); extern void rtas_power_off(void); -- cgit v1.2.3 From a97de48b693b787fab0a47a1cf35de001ac50a6b Mon Sep 17 00:00:00 2001 From: 
Andrew Morton Date: Sun, 11 Apr 2004 22:51:02 -0700 Subject: [PATCH] ppc44x: fix memory leak From: Matt Porter This fixes a memory leak when freeing pgds on PPC44x. --- arch/ppc/kernel/misc.S | 7 +++++-- arch/ppc/mm/pgtable.c | 4 ++-- include/asm-ppc/page.h | 3 ++- 3 files changed, 9 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/arch/ppc/kernel/misc.S b/arch/ppc/kernel/misc.S index 3f9b6a206937..bb4a5ec77429 100644 --- a/arch/ppc/kernel/misc.S +++ b/arch/ppc/kernel/misc.S @@ -738,12 +738,15 @@ _GLOBAL(__flush_dcache_icache_phys) blr /* - * Clear a page using the dcbz instruction, which doesn't cause any + * Clear pages using the dcbz instruction, which doesn't cause any * memory traffic (except to write out any cache lines which get * displaced). This only works on cacheable memory. + * + * void clear_pages(void *page, int order) ; */ -_GLOBAL(clear_page) +_GLOBAL(clear_pages) li r0,4096/L1_CACHE_LINE_SIZE + slw r0,r0,r4 mtctr r0 #ifdef CONFIG_8xx li r4, 0 diff --git a/arch/ppc/mm/pgtable.c b/arch/ppc/mm/pgtable.c index 78ea44090efa..b1b93fc18d4d 100644 --- a/arch/ppc/mm/pgtable.c +++ b/arch/ppc/mm/pgtable.c @@ -71,13 +71,13 @@ pgd_t *pgd_alloc(struct mm_struct *mm) pgd_t *ret; if ((ret = (pgd_t *)__get_free_pages(GFP_KERNEL, PGDIR_ORDER)) != NULL) - clear_page(ret); + clear_pages(ret, PGDIR_ORDER); return ret; } void pgd_free(pgd_t *pgd) { - free_page((unsigned long)pgd); + free_pages((unsigned long)pgd, PGDIR_ORDER); } pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) diff --git a/include/asm-ppc/page.h b/include/asm-ppc/page.h index e47e77327ee5..57838e8e00f1 100644 --- a/include/asm-ppc/page.h +++ b/include/asm-ppc/page.h @@ -84,7 +84,8 @@ typedef unsigned long pgprot_t; #define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK) struct page; -extern void clear_page(void *page); +extern void clear_pages(void *page, int order); +static inline void clear_page(void *page) { clear_pages(page, 0); } extern void copy_page(void 
*to, void *from); extern void clear_user_page(void *page, unsigned long vaddr, struct page *pg); extern void copy_user_page(void *to, void *from, unsigned long vaddr, -- cgit v1.2.3 From ed678f13aec6fdd86c952b05200f741aa473dba8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:51:16 -0700 Subject: [PATCH] Quota locking fixes From: Jan Kara Change locking rules in quota code to fix lock ordering especially wrt journal lock. Also some unnecessary spinlocking is removed. The locking changes are mainly: dqptr_sem, dqio_sem are acquired only when transaction is already started, dqonoff_sem before a transaction is started. This change requires some callbacks to ext3 (also implemented in this patch) to start transaction before the locks are acquired. --- fs/Kconfig | 6 +- fs/dquot.c | 204 ++++++++++++++++++++++++++--------------------- fs/ext3/super.c | 51 +++++++++--- fs/inode.c | 16 ++-- include/linux/quotaops.h | 15 +--- 5 files changed, 165 insertions(+), 127 deletions(-) (limited to 'include') diff --git a/fs/Kconfig b/fs/Kconfig index ef8e47fb1c39..c748a2ce35ee 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -417,7 +417,7 @@ config QFMT_V1 tristate "Old quota format support" depends on QUOTA help - This quota format was (is) used by kernels earlier than 2.4.??. If + This quota format was (is) used by kernels earlier than 2.4.22. If you have quota working and you don't want to convert to new quota format say Y here. @@ -426,8 +426,8 @@ config QFMT_V2 depends on QUOTA help This quota format allows using quotas with 32-bit UIDs/GIDs. If you - need this functionality say Y here. Note that you will need latest - quota utilities for new quota format with this kernel. + need this functionality say Y here. Note that you will need recent + quota utilities (>= 3.01) for new quota format with this kernel. 
config QUOTACTL bool diff --git a/fs/dquot.c b/fs/dquot.c index b7b9b5c44277..e6b39e66207a 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -85,12 +85,31 @@ * and quota formats and also dqstats structure containing statistics about the * lists. dq_data_lock protects data from dq_dqb and also mem_dqinfo structures * and also guards consistency of dquot->dq_dqb with inode->i_blocks, i_bytes. - * Note that we don't have to do the locking of i_blocks and i_bytes when the - * quota is disabled - i_sem should serialize the access. dq_data_lock should - * be always grabbed before dq_list_lock. + * i_blocks and i_bytes updates itself are guarded by i_lock acquired directly + * in inode_add_bytes() and inode_sub_bytes(). + * + * The spinlock ordering is hence: dq_data_lock > dq_list_lock > i_lock * * Note that some things (eg. sb pointer, type, id) doesn't change during * the life of the dquot structure and so needn't to be protected by a lock + * + * Any operation working on dquots via inode pointers must hold dqptr_sem. If + * operation is just reading pointers from inode (or not using them at all) the + * read lock is enough. If pointers are altered function must hold write lock. + * If operation is holding reference to dquot in other way (e.g. quotactl ops) + * it must be guarded by dqonoff_sem. + * This locking assures that: + * a) update/access to dquot pointers in inode is serialized + * b) everyone is guarded against invalidate_dquots() + * + * Each dquot has its dq_lock semaphore. Locked dquots might not be referenced + * from inodes (dquot_alloc_space() and such don't check the dq_lock). + * Currently dquot is locked only when it is being read to memory on the first + * dqget(). Write operations on dquots don't hold dq_lock as they copy data + * under dq_data_lock spinlock to internal buffers before writing. 
+ * + * Lock ordering (including journal_lock) is following: + * dqonoff_sem > journal_lock > dqptr_sem > dquot->dq_lock > dqio_sem */ spinlock_t dq_list_lock = SPIN_LOCK_UNLOCKED; spinlock_t dq_data_lock = SPIN_LOCK_UNLOCKED; @@ -169,23 +188,6 @@ static void put_quota_format(struct quota_format_type *fmt) * mechanism to locate a specific dquot. */ -/* - * Note that any operation which operates on dquot data (ie. dq_dqb) must - * hold dq_data_lock. - * - * Any operation working with dquots must hold dqptr_sem. If operation is - * just reading pointers from inodes than read lock is enough. If pointers - * are altered function must hold write lock. - * - * Locked dquots might not be referenced in inodes. Currently dquot it locked - * only once in its existence - when it's being read to memory on first dqget() - * and at that time it can't be referenced from inode. Write operations on - * dquots don't hold dquot lock as they copy data to internal buffers before - * writing anyway and copying as well as any data update should be atomic. Also - * nobody can change used entries in dquot structure as this is done only when - * quota is destroyed and invalidate_dquots() is called only when dq_count == 0. - */ - static LIST_HEAD(inuse_list); static LIST_HEAD(free_dquots); static struct list_head dquot_hash[NR_DQHASH]; @@ -286,9 +288,9 @@ static int commit_dqblk(struct dquot *dquot) } /* Invalidate all dquots on the list. Note that this function is called after - * quota is disabled so no new quota might be created. Because we hold dqptr_sem - * for writing and pointers were already removed from inodes we actually know that - * no quota for this sb+type should be held. */ + * quota is disabled so no new quota might be created. Because we hold + * dqonoff_sem and pointers were already removed from inodes we actually know + * that no quota for this sb+type should be held. 
*/ static void invalidate_dquots(struct super_block *sb, int type) { struct dquot *dquot; @@ -302,12 +304,11 @@ static void invalidate_dquots(struct super_block *sb, int type) continue; if (dquot->dq_type != type) continue; -#ifdef __DQUOT_PARANOIA - /* There should be no users of quota - we hold dqptr_sem for writing */ +#ifdef __DQUOT_PARANOIA if (atomic_read(&dquot->dq_count)) BUG(); #endif - /* Quota now have no users and it has been written on last dqput() */ + /* Quota now has no users and it has been written on last dqput() */ remove_dquot_hash(dquot); remove_free_dquot(dquot); remove_inuse(dquot); @@ -323,7 +324,7 @@ static int vfs_quota_sync(struct super_block *sb, int type) struct quota_info *dqopt = sb_dqopt(sb); int cnt; - down_read(&dqopt->dqptr_sem); + down(&dqopt->dqonoff_sem); restart: /* At this point any dirty dquot will definitely be written so we can clear dirty flag from info */ @@ -359,7 +360,7 @@ restart: spin_lock(&dq_list_lock); dqstats.syncs++; spin_unlock(&dq_list_lock); - up_read(&dqopt->dqptr_sem); + up(&dqopt->dqonoff_sem); return 0; } @@ -402,7 +403,7 @@ static int shrink_dqcache_memory(int nr, unsigned int gfp_mask) /* * Put reference to dquot * NOTE: If you change this function please check whether dqput_blocks() works right... 
- * MUST be called with dqptr_sem held + * MUST be called with either dqptr_sem or dqonoff_sem held */ static void dqput(struct dquot *dquot) { @@ -467,7 +468,7 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type) /* * Get reference to dquot - * MUST be called with dqptr_sem held + * MUST be called with either dqptr_sem or dqonoff_sem held */ static struct dquot *dqget(struct super_block *sb, unsigned int id, int type) { @@ -528,7 +529,7 @@ static int dqinit_needed(struct inode *inode, int type) return 0; } -/* This routine is guarded by dqptr_sem semaphore */ +/* This routine is guarded by dqonoff_sem semaphore */ static void add_dquot_ref(struct super_block *sb, int type) { struct list_head *p; @@ -594,7 +595,7 @@ put_it: /* Free list of dquots - called from inode.c */ /* dquots are removed from inodes, no new references can be got so we are the only ones holding reference */ -void put_dquot_list(struct list_head *tofree_head) +static void put_dquot_list(struct list_head *tofree_head) { struct list_head *act_head; struct dquot *dquot; @@ -609,6 +610,20 @@ void put_dquot_list(struct list_head *tofree_head) } } +/* Function in inode.c - remove pointers to dquots in icache */ +extern void remove_dquot_ref(struct super_block *, int, struct list_head *); + +/* Gather all references from inodes and drop them */ +static void drop_dquot_ref(struct super_block *sb, int type) +{ + LIST_HEAD(tofree_head); + + down_write(&sb_dqopt(sb)->dqptr_sem); + remove_dquot_ref(sb, type, &tofree_head); + up_write(&sb_dqopt(sb)->dqptr_sem); + put_dquot_list(&tofree_head); +} + static inline void dquot_incr_inodes(struct dquot *dquot, unsigned long number) { dquot->dq_dqb.dqb_curinodes += number; @@ -804,6 +819,9 @@ void dquot_initialize(struct inode *inode, int type) unsigned int id = 0; int cnt; + /* Solve deadlock when we recurse when holding dqptr_sem... 
*/ + if (IS_NOQUOTA(inode)) + return; down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); /* Having dqptr_sem we know NOQUOTA flags can't be altered... */ if (IS_NOQUOTA(inode)) { @@ -831,50 +849,23 @@ void dquot_initialize(struct inode *inode, int type) up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); } -/* - * Remove references to quota from inode - * This function needs dqptr_sem for writing - */ -static void dquot_drop_iupdate(struct inode *inode, struct dquot **to_drop) -{ - int cnt; - - inode->i_flags &= ~S_QUOTA; - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - to_drop[cnt] = inode->i_dquot[cnt]; - inode->i_dquot[cnt] = NODQUOT; - } -} - /* * Release all quotas referenced by inode + * Transaction must be started at an entry */ void dquot_drop(struct inode *inode) { - struct dquot *to_drop[MAXQUOTAS]; int cnt; down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); - dquot_drop_iupdate(inode, to_drop); + inode->i_flags &= ~S_QUOTA; + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (inode->i_dquot[cnt] != NODQUOT) { + dqput(inode->i_dquot[cnt]); + inode->i_dquot[cnt] = NODQUOT; + } + } up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (to_drop[cnt] != NODQUOT) - dqput(to_drop[cnt]); -} - -/* - * Release all quotas referenced by inode. - * This function assumes dqptr_sem for writing - */ -void dquot_drop_nolock(struct inode *inode) -{ - struct dquot *to_drop[MAXQUOTAS]; - int cnt; - - dquot_drop_iupdate(inode, to_drop); - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (to_drop[cnt] != NODQUOT) - dqput(to_drop[cnt]); } /* @@ -885,11 +876,17 @@ int dquot_alloc_space(struct inode *inode, qsize_t number, int warn) int cnt, ret = NO_QUOTA; char warntype[MAXQUOTAS]; + /* Solve deadlock when we recurse when holding dqptr_sem... 
*/ + if (IS_NOQUOTA(inode)) { + inode_add_bytes(inode, number); + return QUOTA_OK; + } for (cnt = 0; cnt < MAXQUOTAS; cnt++) warntype[cnt] = NOWARN; down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); spin_lock(&dq_data_lock); + /* Now recheck reliably when holding dqptr_sem */ if (IS_NOQUOTA(inode)) goto add_bytes; for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -921,9 +918,13 @@ int dquot_alloc_inode(const struct inode *inode, unsigned long number) int cnt, ret = NO_QUOTA; char warntype[MAXQUOTAS]; + /* Solve deadlock when we recurse when holding dqptr_sem... */ + if (IS_NOQUOTA(inode)) + return QUOTA_OK; for (cnt = 0; cnt < MAXQUOTAS; cnt++) warntype[cnt] = NOWARN; down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + /* Now recheck reliably when holding dqptr_sem */ if (IS_NOQUOTA(inode)) { up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); return QUOTA_OK; @@ -956,8 +957,14 @@ void dquot_free_space(struct inode *inode, qsize_t number) { unsigned int cnt; + /* Solve deadlock when we recurse when holding dqptr_sem... */ + if (IS_NOQUOTA(inode)) { + inode_sub_bytes(inode, number); + return; + } down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); spin_lock(&dq_data_lock); + /* Now recheck reliably when holding dqptr_sem */ if (IS_NOQUOTA(inode)) goto sub_bytes; for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -978,7 +985,11 @@ void dquot_free_inode(const struct inode *inode, unsigned long number) { unsigned int cnt; + /* Solve deadlock when we recurse when holding dqptr_sem... */ + if (IS_NOQUOTA(inode)) + return; down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + /* Now recheck reliably when holding dqptr_sem */ if (IS_NOQUOTA(inode)) { up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); return; @@ -1007,14 +1018,20 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) chgid = (iattr->ia_valid & ATTR_GID) && inode->i_gid != iattr->ia_gid; char warntype[MAXQUOTAS]; + /* Solve deadlock when we recurse when holding dqptr_sem... 
*/ + if (IS_NOQUOTA(inode)) + return QUOTA_OK; /* Clear the arrays */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { transfer_to[cnt] = transfer_from[cnt] = NODQUOT; warntype[cnt] = NOWARN; } + down(&sb_dqopt(inode->i_sb)->dqonoff_sem); down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + /* Now recheck reliably when holding dqptr_sem */ if (IS_NOQUOTA(inode)) { /* File without quota accounting? */ up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + up(&sb_dqopt(inode->i_sb)->dqonoff_sem); return QUOTA_OK; } /* First build the transfer_to list - here we can block on reading of dquots... */ @@ -1065,6 +1082,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) ret = QUOTA_OK; warn_put_all: spin_unlock(&dq_data_lock); + up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); flush_warnings(transfer_to, warntype); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1073,7 +1091,7 @@ warn_put_all: if (ret == NO_QUOTA && transfer_to[cnt] != NODQUOT) dqput(transfer_to[cnt]); } - up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + up(&sb_dqopt(inode->i_sb)->dqonoff_sem); return ret; } @@ -1121,9 +1139,6 @@ static inline void reset_enable_flags(struct quota_info *dqopt, int type) } } -/* Function in inode.c - remove pointers to dquots in icache */ -extern void remove_dquot_ref(struct super_block *, int); - /* * Turn quota off on a device. 
type == -1 ==> quotaoff for all types (umount) */ @@ -1137,7 +1152,6 @@ int vfs_quota_off(struct super_block *sb, int type) /* We need to serialize quota_off() for device */ down(&dqopt->dqonoff_sem); - down_write(&dqopt->dqptr_sem); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; @@ -1146,7 +1160,7 @@ int vfs_quota_off(struct super_block *sb, int type) reset_enable_flags(dqopt, cnt); /* Note: these are blocking operations */ - remove_dquot_ref(sb, cnt); + drop_dquot_ref(sb, cnt); invalidate_dquots(sb, cnt); /* * Now all dquots should be invalidated, all writes done so we should be only @@ -1168,7 +1182,6 @@ int vfs_quota_off(struct super_block *sb, int type) dqopt->info[cnt].dqi_bgrace = 0; dqopt->ops[cnt] = NULL; } - up_write(&dqopt->dqptr_sem); up(&dqopt->dqonoff_sem); out: return 0; @@ -1180,7 +1193,8 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path) struct inode *inode; struct quota_info *dqopt = sb_dqopt(sb); struct quota_format_type *fmt = find_quota_format(format_id); - int error; + int error, cnt; + struct dquot *to_drop[MAXQUOTAS]; unsigned int oldflags; if (!fmt) @@ -1202,7 +1216,6 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path) goto out_f; down(&dqopt->dqonoff_sem); - down_write(&dqopt->dqptr_sem); if (sb_has_quota_enabled(sb, type)) { error = -EBUSY; goto out_lock; @@ -1213,8 +1226,20 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path) if (!fmt->qf_ops->check_quota_file(sb, type)) goto out_file_init; /* We don't want quota and atime on quota files (deadlocks possible) */ - dquot_drop_nolock(inode); + down_write(&dqopt->dqptr_sem); inode->i_flags |= S_NOQUOTA | S_NOATIME; + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + to_drop[cnt] = inode->i_dquot[cnt]; + inode->i_dquot[cnt] = NODQUOT; + } + inode->i_flags &= ~S_QUOTA; + up_write(&dqopt->dqptr_sem); + /* We must put dquots outside of dqptr_sem because we may need to + * start 
transaction for write */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (to_drop[cnt]) + dqput(to_drop[cnt]); + } dqopt->ops[type] = fmt->qf_ops; dqopt->info[type].dqi_format = fmt; @@ -1225,7 +1250,6 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path) } up(&dqopt->dqio_sem); set_enable_flags(dqopt, type); - up_write(&dqopt->dqptr_sem); add_dquot_ref(sb, type); up(&dqopt->dqonoff_sem); @@ -1268,14 +1292,14 @@ int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d { struct dquot *dquot; - down_read(&sb_dqopt(sb)->dqptr_sem); + down(&sb_dqopt(sb)->dqonoff_sem); if (!(dquot = dqget(sb, id, type))) { - up_read(&sb_dqopt(sb)->dqptr_sem); + up(&sb_dqopt(sb)->dqonoff_sem); return -ESRCH; } do_get_dqblk(dquot, di); dqput(dquot); - up_read(&sb_dqopt(sb)->dqptr_sem); + up(&sb_dqopt(sb)->dqonoff_sem); return 0; } @@ -1337,14 +1361,14 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d { struct dquot *dquot; - down_read(&sb_dqopt(sb)->dqptr_sem); + down(&sb_dqopt(sb)->dqonoff_sem); if (!(dquot = dqget(sb, id, type))) { - up_read(&sb_dqopt(sb)->dqptr_sem); + up(&sb_dqopt(sb)->dqonoff_sem); return -ESRCH; } do_set_dqblk(dquot, di); dqput(dquot); - up_read(&sb_dqopt(sb)->dqptr_sem); + up(&sb_dqopt(sb)->dqonoff_sem); return 0; } @@ -1353,9 +1377,9 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) { struct mem_dqinfo *mi; - down_read(&sb_dqopt(sb)->dqptr_sem); + down(&sb_dqopt(sb)->dqonoff_sem); if (!sb_has_quota_enabled(sb, type)) { - up_read(&sb_dqopt(sb)->dqptr_sem); + up(&sb_dqopt(sb)->dqonoff_sem); return -ESRCH; } mi = sb_dqopt(sb)->info + type; @@ -1365,7 +1389,7 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) ii->dqi_flags = mi->dqi_flags & DQF_MASK; ii->dqi_valid = IIF_ALL; spin_unlock(&dq_data_lock); - up_read(&sb_dqopt(sb)->dqptr_sem); + up(&sb_dqopt(sb)->dqonoff_sem); return 0; } @@ -1374,9 +1398,9 @@ int vfs_set_dqinfo(struct 
super_block *sb, int type, struct if_dqinfo *ii) { struct mem_dqinfo *mi; - down_read(&sb_dqopt(sb)->dqptr_sem); + down(&sb_dqopt(sb)->dqonoff_sem); if (!sb_has_quota_enabled(sb, type)) { - up_read(&sb_dqopt(sb)->dqptr_sem); + up(&sb_dqopt(sb)->dqonoff_sem); return -ESRCH; } mi = sb_dqopt(sb)->info + type; @@ -1389,7 +1413,7 @@ int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) mi->dqi_flags = (mi->dqi_flags & ~DQF_MASK) | (ii->dqi_flags & DQF_MASK); mark_info_dirty(mi); spin_unlock(&dq_data_lock); - up_read(&sb_dqopt(sb)->dqptr_sem); + up(&sb_dqopt(sb)->dqonoff_sem); return 0; } diff --git a/fs/ext3/super.c b/fs/ext3/super.c index baf30c5045ec..e6ae6c9e0f46 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1958,6 +1958,18 @@ int ext3_statfs (struct super_block * sb, struct kstatfs * buf) #define EXT3_V0_QFMT_BLOCKS 27 static int (*old_write_dquot)(struct dquot *dquot); +static void (*old_drop_dquot)(struct inode *inode); + +static int fmt_to_blocks(int fmt) +{ + switch (fmt) { + case QFMT_VFS_OLD: + return EXT3_OLD_QFMT_BLOCKS; + case QFMT_VFS_V0: + return EXT3_V0_QFMT_BLOCKS; + } + return EXT3_MAX_TRANS_DATA; +} static int ext3_write_dquot(struct dquot *dquot) { @@ -1965,20 +1977,11 @@ static int ext3_write_dquot(struct dquot *dquot) int ret; int err; handle_t *handle; - struct quota_info *dqops = sb_dqopt(dquot->dq_sb); + struct quota_info *dqopt = sb_dqopt(dquot->dq_sb); struct inode *qinode; - switch (dqops->info[dquot->dq_type].dqi_format->qf_fmt_id) { - case QFMT_VFS_OLD: - nblocks = EXT3_OLD_QFMT_BLOCKS; - break; - case QFMT_VFS_V0: - nblocks = EXT3_V0_QFMT_BLOCKS; - break; - default: - nblocks = EXT3_MAX_TRANS_DATA; - } - qinode = dqops->files[dquot->dq_type]->f_dentry->d_inode; + nblocks = fmt_to_blocks(dqopt->info[dquot->dq_type].dqi_format->qf_fmt_id); + qinode = dqopt->files[dquot->dq_type]->f_dentry->d_inode; handle = ext3_journal_start(qinode, nblocks); if (IS_ERR(handle)) { ret = PTR_ERR(handle); @@ -1991,6 +1994,28 @@ 
static int ext3_write_dquot(struct dquot *dquot) out: return ret; } + +static void ext3_drop_dquot(struct inode *inode) +{ + int nblocks, type; + struct quota_info *dqopt = sb_dqopt(inode->i_sb); + handle_t *handle; + + for (type = 0; type < MAXQUOTAS; type++) { + if (sb_has_quota_enabled(inode->i_sb, type)) + break; + } + if (type < MAXQUOTAS) + nblocks = fmt_to_blocks(dqopt->info[type].dqi_format->qf_fmt_id); + else + nblocks = 0; /* No quota => no drop */ + handle = ext3_journal_start(inode, 2*nblocks); + if (IS_ERR(handle)) + return; + old_drop_dquot(inode); + ext3_journal_stop(handle); + return; +} #endif static struct super_block *ext3_get_sb(struct file_system_type *fs_type, @@ -2018,7 +2043,9 @@ static int __init init_ext3_fs(void) #ifdef CONFIG_QUOTA init_dquot_operations(&ext3_qops); old_write_dquot = ext3_qops.write_dquot; + old_drop_dquot = ext3_qops.drop; ext3_qops.write_dquot = ext3_write_dquot; + ext3_qops.drop = ext3_drop_dquot; #endif err = register_filesystem(&ext3_fs_type); if (err) diff --git a/fs/inode.c b/fs/inode.c index 01c5740aacdd..d367d4629f3e 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1216,15 +1216,13 @@ EXPORT_SYMBOL(inode_needs_sync); */ #ifdef CONFIG_QUOTA -/* Functions back in dquot.c */ -void put_dquot_list(struct list_head *); +/* Function back in dquot.c */ int remove_inode_dquot_ref(struct inode *, int, struct list_head *); -void remove_dquot_ref(struct super_block *sb, int type) +void remove_dquot_ref(struct super_block *sb, int type, struct list_head *tofree_head) { struct inode *inode; struct list_head *act_head; - LIST_HEAD(tofree_head); if (!sb->dq_op) return; /* nothing to do */ @@ -1234,26 +1232,24 @@ void remove_dquot_ref(struct super_block *sb, int type) list_for_each(act_head, &inode_in_use) { inode = list_entry(act_head, struct inode, i_list); if (inode->i_sb == sb && IS_QUOTAINIT(inode)) - remove_inode_dquot_ref(inode, type, &tofree_head); + remove_inode_dquot_ref(inode, type, tofree_head); } list_for_each(act_head, 
&inode_unused) { inode = list_entry(act_head, struct inode, i_list); if (inode->i_sb == sb && IS_QUOTAINIT(inode)) - remove_inode_dquot_ref(inode, type, &tofree_head); + remove_inode_dquot_ref(inode, type, tofree_head); } list_for_each(act_head, &sb->s_dirty) { inode = list_entry(act_head, struct inode, i_list); if (IS_QUOTAINIT(inode)) - remove_inode_dquot_ref(inode, type, &tofree_head); + remove_inode_dquot_ref(inode, type, tofree_head); } list_for_each(act_head, &sb->s_io) { inode = list_entry(act_head, struct inode, i_list); if (IS_QUOTAINIT(inode)) - remove_inode_dquot_ref(inode, type, &tofree_head); + remove_inode_dquot_ref(inode, type, tofree_head); } spin_unlock(&inode_lock); - - put_dquot_list(&tofree_head); } #endif diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 155c9a2af016..e5a9e6bed751 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -64,11 +64,8 @@ static __inline__ int DQUOT_PREALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t if (inode->i_sb->dq_op->alloc_space(inode, nr, 1) == NO_QUOTA) return 1; } - else { - spin_lock(&dq_data_lock); + else inode_add_bytes(inode, nr); - spin_unlock(&dq_data_lock); - } return 0; } @@ -87,11 +84,8 @@ static __inline__ int DQUOT_ALLOC_SPACE_NODIRTY(struct inode *inode, qsize_t nr) if (inode->i_sb->dq_op->alloc_space(inode, nr, 0) == NO_QUOTA) return 1; } - else { - spin_lock(&dq_data_lock); + else inode_add_bytes(inode, nr); - spin_unlock(&dq_data_lock); - } return 0; } @@ -117,11 +111,8 @@ static __inline__ void DQUOT_FREE_SPACE_NODIRTY(struct inode *inode, qsize_t nr) { if (sb_any_quota_enabled(inode->i_sb)) inode->i_sb->dq_op->free_space(inode, nr); - else { - spin_lock(&dq_data_lock); + else inode_sub_bytes(inode, nr); - spin_unlock(&dq_data_lock); - } } static __inline__ void DQUOT_FREE_SPACE(struct inode *inode, qsize_t nr) -- cgit v1.2.3 From 94b1c3ebf78bd58c2f45b78f2c24c7c939c34a9e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:52:32 -0700 
Subject: [PATCH] knfsd: Remove name_lookup.h that no one is using anymore. From: NeilBrown --- include/linux/sunrpc/name_lookup.h | 38 -------------------------------------- 1 file changed, 38 deletions(-) delete mode 100644 include/linux/sunrpc/name_lookup.h (limited to 'include') diff --git a/include/linux/sunrpc/name_lookup.h b/include/linux/sunrpc/name_lookup.h deleted file mode 100644 index 0c97ec324ada..000000000000 --- a/include/linux/sunrpc/name_lookup.h +++ /dev/null @@ -1,38 +0,0 @@ - -/* - * map between user/group name and id for a given 'client' - */ - -struct name_ent { - char name[20]; -}; -static inline int name_get_user(int uid, struct name_ent **namep) -{ - struct name_ent *n = kmalloc(sizeof(*n),GFP_KERNEL); - if (n) sprintf(n->name, "%d",uid); - *namep = n; - return n ? 0 : -ENOMEM; -} -static inline int name_get_group(int uid, struct name_ent **namep) -{ - struct name_ent *n = kmalloc(sizeof(*n),GFP_KERNEL); - if (n) sprintf(n->name, "%d",uid); - *namep = n; - return n ? 0 : -ENOMEM; -} -static inline int name_get_uid(char *name, int name_len, int *uidp) -{ - *uidp = simple_strtoul(name, NULL, 0); - return 0; -} - -static inline int name_get_gid(char *name, int name_len, int *gidp) -{ - *gidp = simple_strtoul(name, NULL, 0); - return 0; -} - -static inline void name_put(struct name_ent *ent) -{ - kfree(ent); -} -- cgit v1.2.3 From c02c0886973521cd77904d8f07aa98d99c63cb3b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:52:44 -0700 Subject: [PATCH] knfsd: Add server-side support for the nfsv4 mounted_on_fileid attribute.
From: NeilBrown --- fs/nfsd/nfs4xdr.c | 11 +++++++++++ include/linux/nfs4.h | 1 + include/linux/nfsd/nfsd.h | 2 +- 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index d19b1c6b7f45..8908bfc17184 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1588,7 +1588,18 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp, WRITE32(stat.mtime.tv_sec); WRITE32(stat.mtime.tv_nsec); } + if (bmval1 & FATTR4_WORD1_MOUNTED_ON_FILEID) { + struct dentry *mnt_pnt, *mnt_root; + if ((buflen -= 8) < 0) + goto out_resource; + mnt_root = exp->ex_mnt->mnt_root; + if (mnt_root->d_inode == dentry->d_inode) { + mnt_pnt = exp->ex_mnt->mnt_mountpoint; + WRITE64((u64) mnt_pnt->d_inode->i_ino); + } else + WRITE64((u64) stat.ino); + } *attrlenp = htonl((char *)p - (char *)attrlenp - 4); *countp = p - buffer; status = nfs_ok; diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h index e8ea2239a213..520545881a52 100644 --- a/include/linux/nfs4.h +++ b/include/linux/nfs4.h @@ -274,6 +274,7 @@ enum lock_type4 { #define FATTR4_WORD1_TIME_METADATA (1 << 20) #define FATTR4_WORD1_TIME_MODIFY (1 << 21) #define FATTR4_WORD1_TIME_MODIFY_SET (1 << 22) +#define FATTR4_WORD1_MOUNTED_ON_FILEID (1 << 23) #define NFSPROC4_NULL 0 #define NFSPROC4_COMPOUND 1 diff --git a/include/linux/nfsd/nfsd.h b/include/linux/nfsd/nfsd.h index 6e6a66208308..418356558209 100644 --- a/include/linux/nfsd/nfsd.h +++ b/include/linux/nfsd/nfsd.h @@ -278,7 +278,7 @@ static inline int is_fsid(struct svc_fh *fh, struct knfsd_fh *reffh) | FATTR4_WORD1_SPACE_AVAIL | FATTR4_WORD1_SPACE_FREE | FATTR4_WORD1_SPACE_TOTAL \ | FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS | FATTR4_WORD1_TIME_ACCESS_SET \ | FATTR4_WORD1_TIME_CREATE | FATTR4_WORD1_TIME_DELTA | FATTR4_WORD1_TIME_METADATA \ - | FATTR4_WORD1_TIME_MODIFY | FATTR4_WORD1_TIME_MODIFY_SET) + | FATTR4_WORD1_TIME_MODIFY | FATTR4_WORD1_TIME_MODIFY_SET | FATTR4_WORD1_MOUNTED_ON_FILEID) /* 
These will return ERR_INVAL if specified in GETATTR or READDIR. */ #define NFSD_WRITEONLY_ATTRS_WORD1 \ -- cgit v1.2.3 From 238a06e203a96960843faec4ec8f553f453082b9 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:53:09 -0700 Subject: [PATCH] knfsd: Export a symbol needed by auth_gss From: NeilBrown From: "J. Bruce Fields" Without this, compiling auth_gss as a module fails. --- include/linux/sunrpc/xdr.h | 1 + net/sunrpc/sunrpc_syms.c | 1 + net/sunrpc/xdr.c | 2 +- 3 files changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index 0ccaff2cdee2..2b334dc19962 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -145,6 +145,7 @@ extern void _copy_from_pages(char *, struct page **, size_t, size_t); extern void xdr_buf_from_iov(struct iovec *, struct xdr_buf *); extern int xdr_buf_subsegment(struct xdr_buf *, struct xdr_buf *, int, int); extern int xdr_buf_read_netobj(struct xdr_buf *, struct xdr_netobj *, int); +extern int read_bytes_from_xdr_buf(struct xdr_buf *buf, int base, void *obj, int len); /* * Helper structure for copying from an sk_buff.
diff --git a/net/sunrpc/sunrpc_syms.c b/net/sunrpc/sunrpc_syms.c index 9061f6498cc4..1ae41edbb0f1 100644 --- a/net/sunrpc/sunrpc_syms.c +++ b/net/sunrpc/sunrpc_syms.c @@ -134,6 +134,7 @@ EXPORT_SYMBOL(xdr_read_pages); EXPORT_SYMBOL(xdr_buf_from_iov); EXPORT_SYMBOL(xdr_buf_subsegment); EXPORT_SYMBOL(xdr_buf_read_netobj); +EXPORT_SYMBOL(read_bytes_from_xdr_buf); /* Debugging symbols */ #ifdef RPC_DEBUG diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index accfdd9284df..cae451e8db8d 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -799,7 +799,7 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf, } /* obj is assumed to point to allocated memory of size at least len: */ -static int +int read_bytes_from_xdr_buf(struct xdr_buf *buf, int base, void *obj, int len) { struct xdr_buf subbuf; -- cgit v1.2.3 From 9abdc6608d7c5e3cb09c05bd6c726d04dc59ace4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:53:24 -0700 Subject: [PATCH] knfsd: Add data integrity to server side gss From: NeilBrown From: "J. Bruce Fields" rpcsec_gss supports three security levels: 1. authentication only: sign the header of each rpc request and response. 2. integrity: sign the header and body of each rpc request and response. 3. privacy: sign the header and encrypt the body of each rpc request and response. The first 2 are already supported on the client; this adds integrity support on the server.
--- include/linux/sunrpc/svcauth_gss.h | 9 -- net/sunrpc/auth_gss/gss_krb5_mech.c | 2 + net/sunrpc/auth_gss/svcauth_gss.c | 172 ++++++++++++++++++++++++++++++++++-- 3 files changed, 168 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/svcauth_gss.h b/include/linux/sunrpc/svcauth_gss.h index 73ca6ef2c4a8..a444c9edb9e9 100644 --- a/include/linux/sunrpc/svcauth_gss.h +++ b/include/linux/sunrpc/svcauth_gss.h @@ -22,14 +22,5 @@ int gss_svc_init(void); int svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name); - -struct gss_svc_data { - /* decoded gss client cred: */ - struct rpc_gss_wire_cred clcred; - /* pointer to the beginning of the procedure-specific results, which - * may be encrypted/checksummed in svcauth_gss_release: */ - u32 *body_start; -}; - #endif /* __KERNEL__ */ #endif /* _LINUX_SUNRPC_SVCAUTH_GSS_H */ diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c index 42ceee1907d7..57c074a06970 100644 --- a/net/sunrpc/auth_gss/gss_krb5_mech.c +++ b/net/sunrpc/auth_gss/gss_krb5_mech.c @@ -236,6 +236,8 @@ static int __init init_kerberos_module(void) gss_register_triple(RPC_AUTH_GSS_KRB5I, gm, 0, RPC_GSS_SVC_INTEGRITY); if (svcauth_gss_register_pseudoflavor(RPC_AUTH_GSS_KRB5, "krb5")) printk("Failed to register %s with server!\n", "krb5"); + if (svcauth_gss_register_pseudoflavor(RPC_AUTH_GSS_KRB5I, "krb5i")) + printk("Failed to register %s with server!\n", "krb5i"); gss_mech_put(gm); return 0; } diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index 9e13aaa2bc79..2277667d3397 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -670,6 +670,68 @@ out: return stat; } +static inline int +read_u32_from_xdr_buf(struct xdr_buf *buf, int base, u32 *obj) +{ + u32 raw; + int status; + + status = read_bytes_from_xdr_buf(buf, base, &raw, sizeof(*obj)); + if (status) + return status; + *obj = ntohl(raw); + return 0; +} + +/* 
It would be nice if this bit of code could be shared with the client. + * Obstacles: + * The client shouldn't malloc(), would have to pass in own memory. + * The server uses base of head iovec as read pointer, while the + * client uses separate pointer. */ +static int +unwrap_integ_data(struct xdr_buf *buf, u32 seq, struct gss_ctx *ctx) +{ + int stat = -EINVAL; + u32 integ_len, maj_stat; + struct xdr_netobj mic; + struct xdr_buf integ_buf; + + integ_len = ntohl(svc_getu32(&buf->head[0])); + if (integ_len & 3) + goto out; + if (integ_len > buf->len) + goto out; + if (xdr_buf_subsegment(buf, &integ_buf, 0, integ_len)) + BUG(); + /* copy out mic... */ + if (read_u32_from_xdr_buf(buf, integ_len, &mic.len)) + BUG(); + if (mic.len > RPC_MAX_AUTH_SIZE) + goto out; + mic.data = kmalloc(mic.len, GFP_KERNEL); + if (!mic.data) + goto out; + if (read_bytes_from_xdr_buf(buf, integ_len + 4, mic.data, mic.len)) + goto out; + maj_stat = gss_verify_mic(ctx, &integ_buf, &mic, NULL); + if (maj_stat != GSS_S_COMPLETE) + goto out; + if (ntohl(svc_getu32(&buf->head[0])) != seq) + goto out; + stat = 0; +out: + return stat; +} + +struct gss_svc_data { + /* decoded gss client cred: */ + struct rpc_gss_wire_cred clcred; + /* pointer to the beginning of the procedure-specific results, + * which may be encrypted/checksummed in svcauth_gss_release: */ + u32 *body_start; + struct rsc *rsci; +}; + /* * Accept an rpcsec packet. 
* If context establishment, punt to user space @@ -701,6 +763,8 @@ svcauth_gss_accept(struct svc_rqst *rqstp, u32 *authp) if (!svcdata) goto auth_err; rqstp->rq_auth_data = svcdata; + svcdata->body_start = 0; + svcdata->rsci = NULL; gc = &svcdata->clcred; /* start of rpc packet is 7 u32's back from here: @@ -754,9 +818,6 @@ svcauth_gss_accept(struct svc_rqst *rqstp, u32 *authp) break; case RPC_GSS_PROC_DATA: case RPC_GSS_PROC_DESTROY: - /* integrity and privacy unsupported: */ - if (gc->gc_svc != RPC_GSS_SVC_NONE) - goto auth_err; *authp = rpcsec_gsserr_credproblem; rsci = gss_svc_searchbyctx(&gc->gc_ctx); if (!rsci) @@ -841,10 +902,28 @@ svcauth_gss_accept(struct svc_rqst *rqstp, u32 *authp) *authp = rpcsec_gsserr_ctxproblem; if (gss_write_verf(rqstp, rsci->mechctx, gc->gc_seq)) goto auth_err; - /* For use when wrapping: */ - svcdata->body_start = resv->iov_base + 1; rqstp->rq_cred = rsci->cred; get_group_info(rsci->cred.cr_group_info); + *authp = rpc_autherr_badcred; + switch (gc->gc_svc) { + case RPC_GSS_SVC_NONE: + break; + case RPC_GSS_SVC_INTEGRITY: + if (unwrap_integ_data(&rqstp->rq_arg, + gc->gc_seq, rsci->mechctx)) + goto auth_err; + svcdata->rsci = rsci; + cache_get(&rsci->h); + /* placeholders for length and seq. number: */ + svcdata->body_start = resv->iov_base + resv->iov_len; + svc_putu32(resv, 0); + svc_putu32(resv, 0); + break; + case RPC_GSS_SVC_PRIVACY: + /* currently unsupported */ + default: + goto auth_err; + } ret = SVC_OK; goto out; } @@ -867,14 +946,95 @@ out: static int svcauth_gss_release(struct svc_rqst *rqstp) { + struct gss_svc_data *gsd = (struct gss_svc_data *)rqstp->rq_auth_data; + struct rpc_gss_wire_cred *gc = &gsd->clcred; + struct xdr_buf *resbuf = &rqstp->rq_res; + struct xdr_buf integ_buf; + struct xdr_netobj mic; + struct iovec *resv; + u32 *p; + int integ_offset, integ_len; + int stat = -EINVAL; + + if (gc->gc_proc != RPC_GSS_PROC_DATA) + goto out; + /* Release can be called twice, but we only wrap once. 
*/ + if (gsd->body_start == 0) + goto out; + /* normally not set till svc_send, but we need it here: */ + resbuf->len = resbuf->head[0].iov_len + + resbuf->page_len + resbuf->tail[0].iov_len; + switch (gc->gc_svc) { + case RPC_GSS_SVC_NONE: + break; + case RPC_GSS_SVC_INTEGRITY: + p = gsd->body_start; + gsd->body_start = 0; + /* move accept_stat to right place: */ + memcpy(p, p + 2, 4); + /* don't wrap in failure case: */ + /* Note: counting on not getting here if call was not even + * accepted! */ + if (*p != rpc_success) { + resbuf->head[0].iov_len -= 2 * 4; + goto out; + } + p++; + integ_offset = (u8 *)(p + 1) - (u8 *)resbuf->head[0].iov_base; + integ_len = resbuf->len - integ_offset; + BUG_ON(integ_len % 4); + *p++ = htonl(integ_len); + *p++ = htonl(gc->gc_seq); + if (xdr_buf_subsegment(resbuf, &integ_buf, integ_offset, + integ_len)) + BUG(); + if (resbuf->page_len == 0 + && resbuf->tail[0].iov_len + RPC_MAX_AUTH_SIZE + < PAGE_SIZE) { + BUG_ON(resbuf->tail[0].iov_len); + /* Use head for everything */ + resv = &resbuf->head[0]; + } else if (resbuf->tail[0].iov_base == NULL) { + /* copied from nfsd4_encode_read */ + svc_take_page(rqstp); + resbuf->tail[0].iov_base = page_address(rqstp + ->rq_respages[rqstp->rq_resused-1]); + rqstp->rq_restailpage = rqstp->rq_resused-1; + resbuf->tail[0].iov_len = 0; + resv = &resbuf->tail[0]; + } else { + resv = &resbuf->tail[0]; + } + mic.data = (u8 *)resv->iov_base + resv->iov_len + 4; + if (gss_get_mic(gsd->rsci->mechctx, 0, &integ_buf, &mic)) + goto out_err; + svc_putu32(resv, htonl(mic.len)); + memset(mic.data + mic.len, 0, + round_up_to_quad(mic.len) - mic.len); + resv->iov_len += XDR_QUADLEN(mic.len) << 2; + /* not strictly required: */ + resbuf->len += XDR_QUADLEN(mic.len) << 2; + BUG_ON(resv->iov_len > PAGE_SIZE); + break; + case RPC_GSS_SVC_PRIVACY: + default: + goto out_err; + } + +out: + stat = 0; +out_err: if (rqstp->rq_client) auth_domain_put(rqstp->rq_client); rqstp->rq_client = NULL; if 
(rqstp->rq_cred.cr_group_info) put_group_info(rqstp->rq_cred.cr_group_info); rqstp->rq_cred.cr_group_info = NULL; + if (gsd->rsci) + rsc_put(&gsd->rsci->h, &rsc_cache); + gsd->rsci = NULL; - return 0; + return stat; } static void -- cgit v1.2.3 From c334f752d8e9d3847d4459d06f7544dea9a49923 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:53:50 -0700 Subject: [PATCH] posix message queues: code move From: Manfred Spraul cleanup of sysv ipc as a preparation for posix message queues: - replace !CONFIG_SYSVIPC wrappers for copy_semundo and exit_sem with static inline wrappers. Now the whole ipc/util.c file is only used if CONFIG_SYSVIPC is set, use makefile magic instead of #ifdef. - remove the prototypes for copy_semundo and exit_sem from kernel/fork.c - they belong into a header file. - create a new msgutil.c with the helper functions for message queues. - cleanup the helper functions: run Lindent, add __user tags. --- include/linux/msg.h | 3 -- include/linux/sem.h | 17 ++++++- ipc/Makefile | 4 +- ipc/msg.c | 105 ------------------------------------------- ipc/msgutil.c | 127 ++++++++++++++++++++++++++++++++++++++++++++++++++++ ipc/util.c | 19 -------- ipc/util.h | 10 +++++ kernel/fork.c | 4 +- 8 files changed, 155 insertions(+), 134 deletions(-) create mode 100644 ipc/msgutil.c (limited to 'include') diff --git a/include/linux/msg.h b/include/linux/msg.h index b235e862a3dd..2c4c6aa643ff 100644 --- a/include/linux/msg.h +++ b/include/linux/msg.h @@ -74,9 +74,6 @@ struct msg_msg { /* the actual message follows immediately */ }; -#define DATALEN_MSG (PAGE_SIZE-sizeof(struct msg_msg)) -#define DATALEN_SEG (PAGE_SIZE-sizeof(struct msg_msgseg)) - /* one msq_queue structure for each present queue on the system */ struct msg_queue { struct kern_ipc_perm q_perm; diff --git a/include/linux/sem.h b/include/linux/sem.h index b337c509ac29..aaf45764a56e 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -134,7 +134,22 @@ struct sysv_sem { struct 
sem_undo_list *undo_list; }; -void exit_sem(struct task_struct *p); +#ifdef CONFIG_SYSVIPC + +extern int copy_semundo(unsigned long clone_flags, struct task_struct *tsk); +extern void exit_sem(struct task_struct *tsk); + +#else +static inline int copy_semundo(unsigned long clone_flags, struct task_struct *tsk) +{ + return 0; +} + +static inline void exit_sem(struct task_struct *tsk) +{ + return; +} +#endif #endif /* __KERNEL__ */ diff --git a/ipc/Makefile b/ipc/Makefile index ccc6c64c2493..6cd32a30f03f 100644 --- a/ipc/Makefile +++ b/ipc/Makefile @@ -2,7 +2,5 @@ # Makefile for the linux ipc. # -obj-y := util.o - obj-$(CONFIG_SYSVIPC_COMPAT) += compat.o -obj-$(CONFIG_SYSVIPC) += msg.o sem.o shm.o +obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o diff --git a/ipc/msg.c b/ipc/msg.c index 709ff71bf5c1..37e2d3bb17cb 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -51,11 +51,6 @@ struct msg_sender { struct task_struct* tsk; }; -struct msg_msgseg { - struct msg_msgseg* next; - /* the next part of the message follows immediately */ -}; - #define SEARCH_ANY 1 #define SEARCH_EQUAL 2 #define SEARCH_NOTEQUAL 3 @@ -129,106 +124,6 @@ static int newque (key_t key, int msgflg) return msg_buildid(id,msq->q_perm.seq); } -static void free_msg(struct msg_msg* msg) -{ - struct msg_msgseg* seg; - - security_msg_msg_free(msg); - - seg = msg->next; - kfree(msg); - while(seg != NULL) { - struct msg_msgseg* tmp = seg->next; - kfree(seg); - seg = tmp; - } -} - -static struct msg_msg* load_msg(void* src, int len) -{ - struct msg_msg* msg; - struct msg_msgseg** pseg; - int err; - int alen; - - alen = len; - if(alen > DATALEN_MSG) - alen = DATALEN_MSG; - - msg = (struct msg_msg *) kmalloc (sizeof(*msg) + alen, GFP_KERNEL); - if(msg==NULL) - return ERR_PTR(-ENOMEM); - - msg->next = NULL; - msg->security = NULL; - - if (copy_from_user(msg+1, src, alen)) { - err = -EFAULT; - goto out_err; - } - - len -= alen; - src = ((char*)src)+alen; - pseg = &msg->next; - while(len > 0) { - struct 
msg_msgseg* seg; - alen = len; - if(alen > DATALEN_SEG) - alen = DATALEN_SEG; - seg = (struct msg_msgseg *) kmalloc (sizeof(*seg) + alen, GFP_KERNEL); - if(seg==NULL) { - err=-ENOMEM; - goto out_err; - } - *pseg = seg; - seg->next = NULL; - if(copy_from_user (seg+1, src, alen)) { - err = -EFAULT; - goto out_err; - } - pseg = &seg->next; - len -= alen; - src = ((char*)src)+alen; - } - - err = security_msg_msg_alloc(msg); - if (err) - goto out_err; - - return msg; - -out_err: - free_msg(msg); - return ERR_PTR(err); -} - -static int store_msg(void* dest, struct msg_msg* msg, int len) -{ - int alen; - struct msg_msgseg *seg; - - alen = len; - if(alen > DATALEN_MSG) - alen = DATALEN_MSG; - if(copy_to_user (dest, msg+1, alen)) - return -1; - - len -= alen; - dest = ((char*)dest)+alen; - seg = msg->next; - while(len > 0) { - alen = len; - if(alen > DATALEN_SEG) - alen = DATALEN_SEG; - if(copy_to_user (dest, seg+1, alen)) - return -1; - len -= alen; - dest = ((char*)dest)+alen; - seg=seg->next; - } - return 0; -} - static inline void ss_add(struct msg_queue* msq, struct msg_sender* mss) { mss->tsk=current; diff --git a/ipc/msgutil.c b/ipc/msgutil.c new file mode 100644 index 000000000000..e48d777de2a3 --- /dev/null +++ b/ipc/msgutil.c @@ -0,0 +1,127 @@ +/* + * linux/ipc/util.c + * Copyright (C) 1999, 2004 Manfred Spraul + * + * This file is released under GNU General Public Licence version 2 or + * (at your option) any later version. + * + * See the file COPYING for more details. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "util.h" + +struct msg_msgseg { + struct msg_msgseg* next; + /* the next part of the message follows immediately */ +}; + +#define DATALEN_MSG (PAGE_SIZE-sizeof(struct msg_msg)) +#define DATALEN_SEG (PAGE_SIZE-sizeof(struct msg_msgseg)) + +struct msg_msg *load_msg(void __user *src, int len) +{ + struct msg_msg *msg; + struct msg_msgseg **pseg; + int err; + int alen; + + alen = len; + if (alen > DATALEN_MSG) + alen = DATALEN_MSG; + + msg = (struct msg_msg *)kmalloc(sizeof(*msg) + alen, GFP_KERNEL); + if (msg == NULL) + return ERR_PTR(-ENOMEM); + + msg->next = NULL; + msg->security = NULL; + + if (copy_from_user(msg + 1, src, alen)) { + err = -EFAULT; + goto out_err; + } + + len -= alen; + src = ((char *)src) + alen; + pseg = &msg->next; + while (len > 0) { + struct msg_msgseg *seg; + alen = len; + if (alen > DATALEN_SEG) + alen = DATALEN_SEG; + seg = (struct msg_msgseg *)kmalloc(sizeof(*seg) + alen, + GFP_KERNEL); + if (seg == NULL) { + err = -ENOMEM; + goto out_err; + } + *pseg = seg; + seg->next = NULL; + if (copy_from_user(seg + 1, src, alen)) { + err = -EFAULT; + goto out_err; + } + pseg = &seg->next; + len -= alen; + src = ((char *)src) + alen; + } + + err = security_msg_msg_alloc(msg); + if (err) + goto out_err; + + return msg; + +out_err: + free_msg(msg); + return ERR_PTR(err); +} + +int store_msg(void __user *dest, struct msg_msg *msg, int len) +{ + int alen; + struct msg_msgseg *seg; + + alen = len; + if (alen > DATALEN_MSG) + alen = DATALEN_MSG; + if (copy_to_user(dest, msg + 1, alen)) + return -1; + + len -= alen; + dest = ((char *)dest) + alen; + seg = msg->next; + while (len > 0) { + alen = len; + if (alen > DATALEN_SEG) + alen = DATALEN_SEG; + if (copy_to_user(dest, seg + 1, alen)) + return -1; + len -= alen; + dest = ((char *)dest) + alen; + seg = seg->next; + } + return 0; +} + +void free_msg(struct msg_msg *msg) +{ + struct msg_msgseg *seg; + + security_msg_msg_free(msg); 
+ + seg = msg->next; + kfree(msg); + while (seg != NULL) { + struct msg_msgseg *tmp = seg->next; + kfree(seg); + seg = tmp; + } +} diff --git a/ipc/util.c b/ipc/util.c index 6d94883edae0..f74c5eef57d0 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -25,8 +25,6 @@ #include #include -#if defined(CONFIG_SYSVIPC) - #include "util.h" /** @@ -531,20 +529,3 @@ int ipc_parse_version (int *cmd) } #endif /* __ia64__ */ - -#else -/* - * Dummy functions when SYSV IPC isn't configured - */ - -int copy_semundo(unsigned long clone_flags, struct task_struct *tsk) -{ - return 0; -} - -void exit_sem(struct task_struct *tsk) -{ - return; -} - -#endif /* CONFIG_SYSVIPC */ diff --git a/ipc/util.h b/ipc/util.h index 79c8fc901317..e6434942c097 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -4,6 +4,10 @@ * * ipc helper functions (c) 1999 Manfred Spraul */ + +#ifndef _IPC_UTIL_H +#define _IPC_UTIL_H + #define USHRT_MAX 0xffff #define SEQ_MULTIPLIER (IPCMNI) @@ -62,3 +66,9 @@ void ipc64_perm_to_ipc_perm(struct ipc64_perm *in, struct ipc_perm *out); #else int ipc_parse_version (int *cmd); #endif + +extern void free_msg(struct msg_msg *msg); +extern struct msg_msg *load_msg(void __user *src, int len); +extern int store_msg(void __user *dest, struct msg_msg *msg, int len); + +#endif diff --git a/kernel/fork.c b/kernel/fork.c index 3b17a249c50d..a1f20cabbdd3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -39,9 +40,6 @@ #include #include -extern int copy_semundo(unsigned long clone_flags, struct task_struct *tsk); -extern void exit_sem(struct task_struct *tsk); - /* The idle threads do not count.. * Protected by write_lock_irq(&tasklist_lock) */ -- cgit v1.2.3 From c50142a5433ed504fff2b1af152f8f7628830dfb Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:54:03 -0700 Subject: [PATCH] posix message queues: syscall stubs From: Manfred Spraul Add -ENOSYS stubs for the posix message queue syscalls. 
The API is a direct mapping of the api from the unix spec, with two exceptions: - mq_close() doesn't exist. Message queue file descriptors can be closed with close(). - mq_notify(SIGEV_THREAD) cannot be implemented in the kernel. The kernel returns a pollable file descriptor . User space must poll (or read) this descriptor and call the notifier function if the file descriptor is signaled. --- arch/i386/kernel/entry.S | 9 +++++++++ include/asm-i386/unistd.h | 11 ++++++++++- include/linux/mqueue.h | 36 ++++++++++++++++++++++++++++++++++++ include/linux/syscalls.h | 9 +++++++++ kernel/sys.c | 6 ++++++ 5 files changed, 70 insertions(+), 1 deletion(-) create mode 100644 include/linux/mqueue.h (limited to 'include') diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S index 3024740ba84c..14e64d3ea25c 100644 --- a/arch/i386/kernel/entry.S +++ b/arch/i386/kernel/entry.S @@ -882,5 +882,14 @@ ENTRY(sys_call_table) .long sys_utimes .long sys_fadvise64_64 .long sys_ni_syscall /* sys_vserver */ + .long sys_ni_syscall /* sys_mbind */ + .long sys_ni_syscall /* 275 sys_get_mempolicy */ + .long sys_ni_syscall /* sys_set_mempolicy */ + .long sys_mq_open + .long sys_mq_unlink + .long sys_mq_timedsend + .long sys_mq_timedreceive /* 280 */ + .long sys_mq_notify + .long sys_mq_getsetattr syscall_table_size=(.-sys_call_table) diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h index a2d58a99491e..620a232084f3 100644 --- a/include/asm-i386/unistd.h +++ b/include/asm-i386/unistd.h @@ -279,8 +279,17 @@ #define __NR_utimes 271 #define __NR_fadvise64_64 272 #define __NR_vserver 273 +#define __NR_mbind 274 +#define __NR_get_mempolicy 275 +#define __NR_set_mempolicy 276 +#define __NR_mq_open 277 +#define __NR_mq_unlink (__NR_mq_open+1) +#define __NR_mq_timedsend (__NR_mq_open+2) +#define __NR_mq_timedreceive (__NR_mq_open+3) +#define __NR_mq_notify (__NR_mq_open+4) +#define __NR_mq_getsetattr (__NR_mq_open+5) -#define NR_syscalls 274 +#define NR_syscalls 283 /* 
user-visible error numbers are in the range -1 - -124: see */ diff --git a/include/linux/mqueue.h b/include/linux/mqueue.h new file mode 100644 index 000000000000..c0c5fcc89f0e --- /dev/null +++ b/include/linux/mqueue.h @@ -0,0 +1,36 @@ +/* Copyright (C) 2003 Krzysztof Benedyczak & Michal Wronski + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + It is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this software; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA + 02111-1307 USA. 
*/ + +#ifndef _LINUX_MQUEUE_H +#define _LINUX_MQUEUE_H + +#define MQ_PRIO_MAX 32768 + +typedef int mqd_t; + +struct mq_attr { + long mq_flags; /* message queue flags */ + long mq_maxmsg; /* maximum number of messages */ + long mq_msgsize; /* maximum message size */ + long mq_curmsgs; /* number of messages currently queued */ +}; + +#define NOTIFY_NONE 0 +#define NOTIFY_WOKENUP 1 +#define NOTIFY_REMOVED 2 + +#endif diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index aaf87aeacafb..7ee5f67abb5f 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -48,6 +48,8 @@ struct timex; struct timezone; struct tms; struct utimbuf; +typedef int mqd_t; +struct mq_attr; #include #include @@ -450,6 +452,13 @@ asmlinkage long sys_shmget(key_t key, size_t size, int flag); asmlinkage long sys_shmdt(char __user *shmaddr); asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf); +asmlinkage long sys_mq_open(const char __user *name, int oflag, mode_t mode, struct mq_attr __user *attr); +asmlinkage long sys_mq_unlink(const char __user *name); +asmlinkage long sys_mq_timedsend(mqd_t mqdes, const char __user *msg_ptr, size_t msg_len, unsigned int msg_prio, const struct timespec __user *abs_timeout); +asmlinkage ssize_t sys_mq_timedreceive(mqd_t mqdes, char __user *msg_ptr, size_t msg_len, unsigned int __user *msg_prio, const struct timespec __user *abs_timeout); +asmlinkage long sys_mq_notify(mqd_t mqdes, const struct sigevent __user *notification); +asmlinkage long sys_mq_getsetattr(mqd_t mqdes, const struct mq_attr __user *mqstat, struct mq_attr __user *omqstat); + asmlinkage long sys_pciconfig_iobase(long which, unsigned long bus, unsigned long devfn); asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn, unsigned long off, unsigned long len, diff --git a/kernel/sys.c b/kernel/sys.c index bc498b12edcc..7d1bf5c57aca 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -260,6 +260,12 @@ cond_syscall(sys_msgctl) 
cond_syscall(sys_shmget) cond_syscall(sys_shmdt) cond_syscall(sys_shmctl) +cond_syscall(sys_mq_open) +cond_syscall(sys_mq_unlink) +cond_syscall(sys_mq_timedsend) +cond_syscall(sys_mq_timedreceive) +cond_syscall(sys_mq_notify) +cond_syscall(sys_mq_getsetattr) /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read) -- cgit v1.2.3 From be94d44e818a56406016111fc48a1084b9f8e435 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:54:16 -0700 Subject: [PATCH] posix message queues: implementation From: Manfred Spraul Actual implementation of the posix message queues, written by Krzysztof Benedyczak and Michal Wronski. The complete implementation is dependent on CONFIG_POSIX_MQUEUE. It passed the openposix test suite with two exceptions: one mq_unlink test was bad and tested undefined behavior. And Linux succeeds mq_close(open(,,,)). The spec mandates EBADF, but we have decided to ignore that: we would have to add a new syscall just for the right error code. The patch intentionally doesn't use all helpers from fs/libfs for kernel-only filesystems: step 5 allows user space mounts of the file system. Signal changes: The patch redefines SI_MESGQ using __SI_CODE: The generic Linux ABI uses a negative value (i.e. from user) for SI_MESGQ, but the kernel internal value must be positive to pass check_kill_value. Additionally, the patch adds support into copy_siginfo_to_user to copy the "new" signal type to user space. Changes in signal code caused by POSIX message queues patch: General & rationale: mqueue-generated signals (only upon notification) must have si_code == SI_MESGQ. In fact such a signal is sent from one process which caused notification (== sent message to empty message queue) to another which requested it. Both processes can be of course unrelated in terms of uids/euids. So SI_MESGQ signals must be classified as SI_FROMKERNEL to pass check_kill_permissions (needless to say, these signals ARE from the kernel).
Signals generated by message queues notification need the same fields in siginfo struct's union _sifields as POSIX.1b signals and we can reuse its union entry. SI_MESGQ was previously defined to -3 in kernel and also in glibc. So in userspace SI_MESGQ must be still visible as -3. Solution: SI_MESGQ is defined in the same style as SI_TIMER using __SI_CODE macro. Details: Fortunately copy_siginfo_to_user copies si_code as short. So we can use remaining part of int value freely. __SI_CODE does the work. SI_MESGQ is in kernel: 6<<16 | (-3 & 0xffff) which is > 0 but to userspace is copied (short) SI_MESGQ == -3 Actual changes: Changes in include/asm-generic/siginfo.h __SI_MESGQ added in signal.h to represent inside-kernel prefix of SI_MESGQ. SI_MESGQ is redefined from -3 to __SI_CODE(__SI_MESGQ, -3) Except for the mips architecture, those changes should be arch independent (asm-generic/siginfo.h is included in arch versions). On mips SI_MESGQ is redefined to -4 in order to be compatible with IRIX. But the same schema can be used. Change in copy_siginfo_to_user: We only add one line to get the same copy semantics as for _SI_RT. This change isn't very portable - some archs have their own copy_siginfo_to_user. All of those should get a similar change (but possibly not one-line as _SI_RT case was sometimes ignored because it wasn't used yet, e.g. see ia64 signal.c). Update: mq: only fail with invalid timespec if mq_timed{send,receive} needs to block From: Jakub Jelinek POSIX requires EINVAL to be set if: "The process or thread would have blocked, and the abs_timeout parameter specified a nanoseconds field value less than zero or greater than or equal to 1000 million." but 2.6.5-mm3 returns -EINVAL even if the process or thread would not block (if the queue is not empty for timedreceive or not full for timedsend).
--- CREDITS | 17 + Documentation/filesystems/proc.txt | 25 + include/asm-generic/siginfo.h | 4 +- init/Kconfig | 18 + ipc/Makefile | 2 + ipc/mqueue.c | 1165 ++++++++++++++++++++++++++++++++++++ kernel/signal.c | 1 + 7 files changed, 1231 insertions(+), 1 deletion(-) create mode 100644 ipc/mqueue.c (limited to 'include') diff --git a/CREDITS b/CREDITS index dc9b943d10f1..52128c120f63 100644 --- a/CREDITS +++ b/CREDITS @@ -289,6 +289,15 @@ S: Via Delle Palme, 9 S: Terni 05100 S: Italy +N: Krzysztof Benedyczak +E: golbi@mat.uni.torun.pl +W: http://www.mat.uni.torun.pl/~golbi +D: POSIX message queues fs (with M. Wronski) +S: ul. Podmiejska 52 +S: Radunica +S: 83-000 Pruszcz Gdanski +S: Poland + N: Randolph Bentson E: bentson@grieg.seaslug.org W: http://www.aa.net/~bentson/ @@ -3485,6 +3494,14 @@ S: 12725 SW Millikan Way, Suite 400 S: Beaverton, OR 97005 S: USA +N: Michal Wronski +E: wrona@mat.uni.torun.pl +W: http://www.mat.uni.torun.pl/~wrona +D: POSIX message queues fs (with K. Benedyczak) +S: ul. Teczowa 23/12 +S: 80-680 Gdansk-Sobieszewo +S: Poland + N: Frank Xia E: qx@math.columbia.edu D: Xiafs filesystem [defunct] diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 22fd3adcc96e..378722d5bb70 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -38,6 +38,7 @@ Table of Contents 2.8 /proc/sys/net/ipv4 - IPV4 settings 2.9 Appletalk 2.10 IPX + 2.11 /proc/sys/fs/mqueue - POSIX message queues filesystem ------------------------------------------------------------------------------ Preface @@ -1814,6 +1815,30 @@ The /proc/net/ipx_route table holds a list of IPX routes. For each route it gives the destination network, the router node (or Directly) and the network address of the router (or Connected) for internal networks. 
+2.11 /proc/sys/fs/mqueue - POSIX message queues filesystem +---------------------------------------------------------- + +The "mqueue" filesystem provides the necessary kernel features to enable the +creation of a user space library that implements the POSIX message queues +API (as noted by the MSG tag in the POSIX 1003.1-2001 version of the System +Interfaces specification.) + +The "mqueue" filesystem contains values for determining/setting the amount of +resources used by the file system. + +/proc/sys/fs/mqueue/queues_max is a read/write file for setting/getting the +maximum number of message queues allowed on the system. + +/proc/sys/fs/mqueue/msg_max is a read/write file for setting/getting the +maximum number of messages in a queue value. In fact it is the limiting value +for another (user) limit which is set in mq_open invocation. This attribute of +a queue must be less than or equal to msg_max. + +/proc/sys/fs/mqueue/msgsize_max is a read/write file for setting/getting the +maximum message size value (it is every message queue's attribute set during +its creation).
+ + ------------------------------------------------------------------------------ Summary ------------------------------------------------------------------------------ diff --git a/include/asm-generic/siginfo.h b/include/asm-generic/siginfo.h index e95efd9e00c6..fe02b1a4d286 100644 --- a/include/asm-generic/siginfo.h +++ b/include/asm-generic/siginfo.h @@ -123,6 +123,7 @@ typedef struct siginfo { #define __SI_FAULT (3 << 16) #define __SI_CHLD (4 << 16) #define __SI_RT (5 << 16) +#define __SI_MESGQ (6 << 16) #define __SI_CODE(T,N) ((T) | ((N) & 0xffff)) #else #define __SI_KILL 0 @@ -131,6 +132,7 @@ typedef struct siginfo { #define __SI_FAULT 0 #define __SI_CHLD 0 #define __SI_RT 0 +#define __SI_MESGQ 0 #define __SI_CODE(T,N) (N) #endif @@ -142,7 +144,7 @@ typedef struct siginfo { #define SI_KERNEL 0x80 /* sent by the kernel from somewhere */ #define SI_QUEUE -1 /* sent by sigqueue */ #define SI_TIMER __SI_CODE(__SI_TIMER,-2) /* sent by timer expiration */ -#define SI_MESGQ -3 /* sent by real time mesq state change */ +#define SI_MESGQ __SI_CODE(__SI_MESGQ,-3) /* sent by real time mesq state change */ #define SI_ASYNCIO -4 /* sent by AIO completion */ #define SI_SIGIO -5 /* sent by queued SIGIO */ #define SI_TKILL -6 /* sent by tkill system call */ diff --git a/init/Kconfig b/init/Kconfig index c10fec8ebe9e..9eff25e8f6ed 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -90,6 +90,24 @@ config SYSVIPC section 6.4 of the Linux Programmer's Guide, available from . +config POSIX_MQUEUE + bool "POSIX Message Queues" + depends on EXPERIMENTAL + ---help--- + POSIX variant of message queues is a part of IPC. In POSIX message + queues every message has a priority which decides about succession + of receiving it by a process. If you want to compile and run + programs written e.g. for Solaris with use of its POSIX message + queues (functions mq_*) say Y here. 
To use this feature you will + also need mqueue library, available from + + + POSIX message queues are visible as a filesystem called 'mqueue' + and can be mounted somewhere if you want to do filesystem + operations on message queues. + + If unsure, say Y. + config BSD_PROCESS_ACCT bool "BSD Process Accounting" help diff --git a/ipc/Makefile b/ipc/Makefile index 6cd32a30f03f..913790207d85 100644 --- a/ipc/Makefile +++ b/ipc/Makefile @@ -4,3 +4,5 @@ obj-$(CONFIG_SYSVIPC_COMPAT) += compat.o obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o +obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o + diff --git a/ipc/mqueue.c b/ipc/mqueue.c new file mode 100644 index 000000000000..4de249718675 --- /dev/null +++ b/ipc/mqueue.c @@ -0,0 +1,1165 @@ +/* + * POSIX message queues filesystem for Linux. + * + * Copyright (C) 2003,2004 Krzysztof Benedyczak (golbi@mat.uni.torun.pl) + * Michal Wronski (wrona@mat.uni.torun.pl) + * + * Spinlocks: Mohamed Abbas (abbas.mohamed@intel.com) + * Lockless receive & send, fd based notify: + * Manfred Spraul (manfred@colorfullife.com) + * + * This file is released under the GPL. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "util.h" + +#define MQUEUE_MAGIC 0x19800202 +#define DIRENT_SIZE 20 +#define FILENT_SIZE 80 + +#define SEND 0 +#define RECV 1 + +#define STATE_NONE 0 +#define STATE_PENDING 1 +#define STATE_READY 2 + +#define NP_NONE ((void*)NOTIFY_NONE) +#define NP_WOKENUP ((void*)NOTIFY_WOKENUP) +#define NP_REMOVED ((void*)NOTIFY_REMOVED) +/* used by sysctl */ +#define FS_MQUEUE 1 +#define CTL_QUEUESMAX 2 +#define CTL_MSGMAX 3 +#define CTL_MSGSIZEMAX 4 + +/* default values */ +#define DFLT_QUEUESMAX 64 /* max number of message queues */ +#define DFLT_MSGMAX 40 /* max number of messages in each queue */ +#define HARD_MSGMAX (131072/sizeof(void*)) +#define DFLT_MSGSIZEMAX 16384 /* max message size */ + +struct ext_wait_queue { /* queue of sleeping tasks */ + struct task_struct *task; + struct list_head list; + struct msg_msg *msg; /* ptr of loaded message */ + int state; /* one of STATE_* values */ +}; + +struct mqueue_inode_info { + struct mq_attr attr; + struct msg_msg **messages; + + pid_t notify_owner; /* != 0 means notification registered */ + struct sigevent notify; + struct file *notify_filp; + + /* for tasks waiting for free space and messages, respectively */ + struct ext_wait_queue e_wait_q[2]; + wait_queue_head_t wait_q; + + unsigned long qsize; /* size of queue in memory (sum of all msgs) */ + spinlock_t lock; + struct inode vfs_inode; +}; + +static struct inode_operations mqueue_dir_inode_operations; +static struct file_operations mqueue_file_operations; +static struct file_operations mqueue_notify_fops; +static struct super_operations mqueue_super_ops; +static void remove_notification(struct mqueue_inode_info *info); + +static spinlock_t mq_lock; +static kmem_cache_t *mqueue_inode_cachep; +static struct vfsmount *mqueue_mnt; + +static unsigned int queues_count; +static unsigned int queues_max = DFLT_QUEUESMAX; +static unsigned int msg_max = DFLT_MSGMAX; +static 
unsigned int msgsize_max = DFLT_MSGSIZEMAX; + +static struct ctl_table_header * mq_sysctl_table; + +static inline struct mqueue_inode_info *MQUEUE_I(struct inode *inode) +{ + return container_of(inode, struct mqueue_inode_info, vfs_inode); +} + +static struct inode *mqueue_get_inode(struct super_block *sb, int mode) +{ + struct inode *inode; + + inode = new_inode(sb); + if (inode) { + inode->i_mode = mode; + inode->i_uid = current->fsuid; + inode->i_gid = current->fsgid; + inode->i_blksize = PAGE_CACHE_SIZE; + inode->i_blocks = 0; + inode->i_mtime = inode->i_ctime = inode->i_atime = + CURRENT_TIME; + + if (S_ISREG(mode)) { + struct mqueue_inode_info *info; + + inode->i_fop = &mqueue_file_operations; + inode->i_size = FILENT_SIZE; + /* mqueue specific info */ + info = MQUEUE_I(inode); + spin_lock_init(&info->lock); + init_waitqueue_head(&info->wait_q); + INIT_LIST_HEAD(&info->e_wait_q[0].list); + INIT_LIST_HEAD(&info->e_wait_q[1].list); + info->notify_owner = 0; + info->qsize = 0; + info->attr.mq_curmsgs = 0; + info->messages = NULL; + } else if (S_ISDIR(mode)) { + inode->i_nlink++; + inode->i_op = &mqueue_dir_inode_operations; + inode->i_fop = &simple_dir_operations; + } + } + return inode; +} + +static int mqueue_fill_super(struct super_block *sb, void *data, int silent) +{ + struct inode *inode; + + sb->s_flags = MS_NOUSER; + sb->s_blocksize = PAGE_CACHE_SIZE; + sb->s_blocksize_bits = PAGE_CACHE_SHIFT; + sb->s_magic = MQUEUE_MAGIC; + sb->s_op = &mqueue_super_ops; + + inode = mqueue_get_inode(sb, S_IFDIR | S_IRWXUGO); + if (!inode) + return -ENOMEM; + + sb->s_root = d_alloc_root(inode); + if (!sb->s_root) { + iput(inode); + return -ENOMEM; + } + + return 0; +} + +static struct super_block *mqueue_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, + void *data) +{ + return get_sb_single(fs_type, flags, data, mqueue_fill_super); +} + +static void init_once(void *foo, kmem_cache_t * cachep, unsigned long flags) +{ + struct mqueue_inode_info 
*p = (struct mqueue_inode_info *) foo; + + if ((flags & (SLAB_CTOR_VERIFY | SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) + inode_init_once(&p->vfs_inode); +} + +static struct inode *mqueue_alloc_inode(struct super_block *sb) +{ + struct mqueue_inode_info *ei; + + ei = kmem_cache_alloc(mqueue_inode_cachep, SLAB_KERNEL); + if (!ei) + return NULL; + return &ei->vfs_inode; +} + +static void mqueue_destroy_inode(struct inode *inode) +{ + kmem_cache_free(mqueue_inode_cachep, MQUEUE_I(inode)); +} + +static void mqueue_delete_inode(struct inode *inode) +{ + struct mqueue_inode_info *info; + int i; + + if (S_ISDIR(inode->i_mode)) { + clear_inode(inode); + return; + } + info = MQUEUE_I(inode); + spin_lock(&info->lock); + for (i = 0; i < info->attr.mq_curmsgs; i++) + free_msg(info->messages[i]); + kfree(info->messages); + spin_unlock(&info->lock); + + clear_inode(inode); + + spin_lock(&mq_lock); + queues_count--; + spin_unlock(&mq_lock); +} + +static int mqueue_create(struct inode *dir, struct dentry *dentry, + int mode, struct nameidata *nd) +{ + struct inode *inode; + int error; + + spin_lock(&mq_lock); + if (queues_count >= queues_max && !capable(CAP_SYS_RESOURCE)) { + error = -ENOSPC; + goto out_lock; + } + queues_count++; + spin_unlock(&mq_lock); + + inode = mqueue_get_inode(dir->i_sb, mode); + if (!inode) { + error = -ENOMEM; + spin_lock(&mq_lock); + queues_count--; + goto out_lock; + } + + d_instantiate(dentry, inode); + dget(dentry); + return 0; +out_lock: + spin_unlock(&mq_lock); + return error; +} + +static int mqueue_flush_file(struct file *filp) +{ + struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode); + + spin_lock(&info->lock); + if (current->tgid == info->notify_owner) + remove_notification(info); + + spin_unlock(&info->lock); + return 0; +} + +/* Adds current to info->e_wait_q[sr] before element with smaller prio */ +static void wq_add(struct mqueue_inode_info *info, int sr, + struct ext_wait_queue *ewp) +{ + struct ext_wait_queue *walk; 
+ + ewp->task = current; + + list_for_each_entry(walk, &info->e_wait_q[sr].list, list) { + if (walk->task->static_prio <= current->static_prio) { + list_add_tail(&ewp->list, &walk->list); + return; + } + } + list_add_tail(&ewp->list, &info->e_wait_q[sr].list); +} + +/* + * Puts current task to sleep. Caller must hold queue lock. After return + * lock isn't held. + * sr: SEND or RECV + */ +static int wq_sleep(struct mqueue_inode_info *info, int sr, + long timeout, struct ext_wait_queue *ewp) +{ + int retval; + signed long time; + + wq_add(info, sr, ewp); + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + + spin_unlock(&info->lock); + time = schedule_timeout(timeout); + + while (ewp->state == STATE_PENDING) + cpu_relax(); + + if (ewp->state == STATE_READY) { + retval = 0; + goto out; + } + spin_lock(&info->lock); + if (ewp->state == STATE_READY) { + retval = 0; + goto out_unlock; + } + if (signal_pending(current)) { + retval = -ERESTARTSYS; + break; + } + if (time == 0) { + retval = -ETIMEDOUT; + break; + } + } + list_del(&ewp->list); +out_unlock: + spin_unlock(&info->lock); +out: + return retval; +} + +/* + * Returns waiting task that should be serviced first or NULL if none exists + */ +static struct ext_wait_queue *wq_get_first_waiter( + struct mqueue_inode_info *info, int sr) +{ + struct list_head *ptr; + + ptr = info->e_wait_q[sr].list.prev; + if (ptr == &info->e_wait_q[sr].list) + return NULL; + return list_entry(ptr, struct ext_wait_queue, list); +} + +/* Auxiliary functions to manipulate messages' list */ +static void msg_insert(struct msg_msg *ptr, struct mqueue_inode_info *info) +{ + int k; + + k = info->attr.mq_curmsgs - 1; + while (k >= 0 && info->messages[k]->m_type >= ptr->m_type) { + info->messages[k + 1] = info->messages[k]; + k--; + } + info->attr.mq_curmsgs++; + info->qsize += ptr->m_ts; + info->messages[k + 1] = ptr; +} + +static inline struct msg_msg *msg_get(struct mqueue_inode_info *info) +{ + info->qsize -= 
info->messages[--info->attr.mq_curmsgs]->m_ts; + return info->messages[info->attr.mq_curmsgs]; +} + +/* + * The next function is only to split too long sys_mq_timedsend + */ +static void __do_notify(struct mqueue_inode_info *info) +{ + /* notification + * invoked when there is registered process and there isn't process + * waiting synchronously for message AND state of queue changed from + * empty to not empty. Here we are sure that no one is waiting + * synchronously. */ + if (info->notify_owner && info->attr.mq_curmsgs == 1) { + /* sends signal */ + if (info->notify.sigev_notify == SIGEV_SIGNAL) { + struct siginfo sig_i; + + sig_i.si_signo = info->notify.sigev_signo; + sig_i.si_errno = 0; + sig_i.si_code = SI_MESGQ; + sig_i.si_value = info->notify.sigev_value; + sig_i.si_pid = current->tgid; + sig_i.si_uid = current->uid; + + kill_proc_info(info->notify.sigev_signo, + &sig_i, info->notify_owner); + } else if (info->notify.sigev_notify == SIGEV_THREAD) { + info->notify_filp->private_data = (void*)NP_WOKENUP; + wake_up(&info->wait_q); + } + /* after notification unregisters process */ + info->notify_owner = 0; + } +} + +static long prepare_timeout(const struct timespec __user *u_arg) +{ + struct timespec ts, nowts; + long timeout; + + if (u_arg) { + if (unlikely(copy_from_user(&ts, u_arg, + sizeof(struct timespec)))) + return -EFAULT; + + if (unlikely(ts.tv_nsec < 0 || ts.tv_sec < 0 + || ts.tv_nsec >= NSEC_PER_SEC)) + return -EINVAL; + nowts = CURRENT_TIME; + /* first subtract as jiffies can't be too big */ + ts.tv_sec -= nowts.tv_sec; + if (ts.tv_nsec < nowts.tv_nsec) { + ts.tv_nsec += NSEC_PER_SEC; + ts.tv_sec--; + } + ts.tv_nsec -= nowts.tv_nsec; + if (ts.tv_sec < 0) + return 0; + + timeout = timespec_to_jiffies(&ts) + 1; + } else + return MAX_SCHEDULE_TIMEOUT; + + return timeout; +} + +/* + * File descriptor based notification, intended to be used to implement + * SIGEV_THREAD: + * SIGEV_THREAD means that a notification function should be called in the + * 
context of a new thread. The kernel can't do that. Therefore mq_notify + * calls with SIGEV_THREAD return a new file descriptor. A user space helper + * must create a new thread and then read from the given file descriptor. + * The read always returns one byte. If it's NOTIFY_WOKENUP, then it must + * call the notification function. If it's NOTIFY_REMOVED, then the + * notification was removed. The file descriptor supports poll, thus one + * supervisor thread can manage multiple message queue notifications. + * + * The implementation must support multiple outstanding notifications: + * It's possible that a new notification is added and signaled before user + * space calls mqueue_notify_read for the previous notification. + * Therefore the notification state is stored in the private_data field of + * the file descriptor. + */ +static unsigned int mqueue_notify_poll(struct file *filp, + struct poll_table_struct *poll_tab) +{ + struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode); + int retval; + + poll_wait(filp, &info->wait_q, poll_tab); + + if (filp->private_data == NP_NONE) + retval = 0; + else + retval = POLLIN | POLLRDNORM; + return retval; +} + +static ssize_t mqueue_notify_read(struct file *filp, char __user *buf, + size_t count, loff_t *ppos) +{ + struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode); + char result; + + if (!count) + return 0; + if (*ppos != 0) + return 0; + spin_lock(&info->lock); + while (filp->private_data == NP_NONE) { + DEFINE_WAIT(wait); + if (filp->f_flags & O_NONBLOCK) { + spin_unlock(&info->lock); + return -EAGAIN; + } + prepare_to_wait(&info->wait_q, &wait, TASK_INTERRUPTIBLE); + spin_unlock(&info->lock); + schedule(); + finish_wait(&info->wait_q, &wait); + spin_lock(&info->lock); + } + spin_unlock(&info->lock); + result = (char)(unsigned long)filp->private_data; + if (put_user(result, buf)) + return -EFAULT; + *ppos = 1; + return 1; +} + +static int mqueue_notify_release(struct inode *inode, struct file 
*filp) +{ + struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode); + + spin_lock(&info->lock); + if (info->notify_owner && info->notify_filp == filp) + info->notify_owner = 0; + filp->private_data = NP_REMOVED; + spin_unlock(&info->lock); + + return 0; +} + +static void remove_notification(struct mqueue_inode_info *info) +{ + if (info->notify.sigev_notify == SIGEV_THREAD) { + info->notify_filp->private_data = NP_REMOVED; + wake_up(&info->wait_q); + } + info->notify_owner = 0; +} + +/* + * Invoked when creating a new queue via sys_mq_open + */ +static struct file *do_create(struct dentry *dir, struct dentry *dentry, + int oflag, mode_t mode, struct mq_attr __user *u_attr) +{ + struct file *filp; + struct inode *inode; + struct mqueue_inode_info *info; + struct msg_msg **msgs = NULL; + struct mq_attr attr; + int ret; + + if (u_attr != NULL) { + if (copy_from_user(&attr, u_attr, sizeof(attr))) + return ERR_PTR(-EFAULT); + + if (attr.mq_maxmsg <= 0 || attr.mq_msgsize <= 0) + return ERR_PTR(-EINVAL); + if (capable(CAP_SYS_RESOURCE)) { + if (attr.mq_maxmsg > HARD_MSGMAX) + return ERR_PTR(-EINVAL); + } else { + if (attr.mq_maxmsg > msg_max || + attr.mq_msgsize > msgsize_max) + return ERR_PTR(-EINVAL); + } + } else { + attr.mq_maxmsg = DFLT_MSGMAX; + attr.mq_msgsize = DFLT_MSGSIZEMAX; + } + msgs = kmalloc(attr.mq_maxmsg * sizeof(*msgs), GFP_KERNEL); + if (!msgs) + return ERR_PTR(-ENOMEM); + + ret = vfs_create(dir->d_inode, dentry, mode, NULL); + if (ret) { + kfree(msgs); + return ERR_PTR(ret); + } + + inode = dentry->d_inode; + info = MQUEUE_I(inode); + + info->attr.mq_maxmsg = attr.mq_maxmsg; + info->attr.mq_msgsize = attr.mq_msgsize; + info->messages = msgs; + + filp = dentry_open(dentry, mqueue_mnt, oflag); + if (!IS_ERR(filp)) + dget(dentry); + + return filp; +} + +/* Opens existing queue */ +static struct file *do_open(struct dentry *dentry, int oflag) +{ +static int oflag2acc[O_ACCMODE] = { MAY_READ, MAY_WRITE, + MAY_READ | MAY_WRITE }; + struct file 
*filp; + + if ((oflag & O_ACCMODE) == (O_RDWR | O_WRONLY)) + return ERR_PTR(-EINVAL); + + if (permission(dentry->d_inode, oflag2acc[oflag & O_ACCMODE], NULL)) + return ERR_PTR(-EACCES); + + filp = dentry_open(dentry, mqueue_mnt, oflag); + + if (!IS_ERR(filp)) + dget(dentry); + + return filp; +} + +asmlinkage long sys_mq_open(const char __user *u_name, int oflag, mode_t mode, + struct mq_attr __user *u_attr) +{ + struct dentry *dentry; + struct file *filp; + char *name; + int fd, error; + + if (IS_ERR(name = getname(u_name))) + return PTR_ERR(name); + + fd = get_unused_fd(); + if (fd < 0) + goto out_putname; + + down(&mqueue_mnt->mnt_root->d_inode->i_sem); + dentry = lookup_one_len(name, mqueue_mnt->mnt_root, strlen(name)); + if (IS_ERR(dentry)) { + error = PTR_ERR(dentry); + goto out_err; + } + mntget(mqueue_mnt); + + if (oflag & O_CREAT) { + if (dentry->d_inode) { /* entry already exists */ + filp = (oflag & O_EXCL) ? ERR_PTR(-EEXIST) : + do_open(dentry, oflag); + } else { + filp = do_create(mqueue_mnt->mnt_root, dentry, + oflag, mode, u_attr); + } + } else + filp = (dentry->d_inode) ? 
do_open(dentry, oflag) : + ERR_PTR(-ENOENT); + + dput(dentry); + + if (IS_ERR(filp)) { + error = PTR_ERR(filp); + goto out_putfd; + } + + fd_install(fd, filp); + goto out_upsem; + +out_putfd: + mntput(mqueue_mnt); + put_unused_fd(fd); +out_err: + fd = error; +out_upsem: + up(&mqueue_mnt->mnt_root->d_inode->i_sem); +out_putname: + putname(name); + return fd; +} + +asmlinkage long sys_mq_unlink(const char __user *u_name) +{ + int err; + char *name; + struct dentry *dentry; + struct inode *inode = NULL; + + name = getname(u_name); + if (IS_ERR(name)) + return PTR_ERR(name); + + down(&mqueue_mnt->mnt_root->d_inode->i_sem); + dentry = lookup_one_len(name, mqueue_mnt->mnt_root, strlen(name)); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); + goto out_unlock; + } + + if (!dentry->d_inode) { + err = -ENOENT; + goto out_err; + } + + if (permission(dentry->d_inode, MAY_WRITE, NULL)) { + err = -EACCES; + goto out_err; + } + inode = dentry->d_inode; + if (inode) + atomic_inc(&inode->i_count); + + err = vfs_unlink(dentry->d_parent->d_inode, dentry); +out_err: + dput(dentry); + +out_unlock: + up(&mqueue_mnt->mnt_root->d_inode->i_sem); + putname(name); + if (inode) + iput(inode); + + return err; +} + +/* Pipelined send and receive functions. + * + * If a receiver finds no waiting message, then it registers itself in the + * list of waiting receivers. A sender checks that list before adding the new + * message into the message array. If there is a waiting receiver, then it + * bypasses the message array and directly hands the message over to the + * receiver. + * The receiver accepts the message and returns without grabbing the queue + * spinlock. Therefore an intermediate STATE_PENDING state and memory barriers + * are necessary. The same algorithm is used for sysv semaphores, see + * ipc/sem.c for more details. + * + * The same algorithm is used for senders. 
+ */ + +/* pipelined_send() - send a message directly to the task waiting in + * sys_mq_timedreceive() (without inserting message into a queue). */ +static inline void pipelined_send(struct mqueue_inode_info *info, + struct msg_msg *message, + struct ext_wait_queue *receiver) +{ + receiver->msg = message; + list_del(&receiver->list); + receiver->state = STATE_PENDING; + wake_up_process(receiver->task); + wmb(); + receiver->state = STATE_READY; +} + +/* pipelined_receive() - if there is task waiting in sys_mq_timedsend() + * gets its message and put to the queue (we have one free place for sure). */ +static inline void pipelined_receive(struct mqueue_inode_info *info) +{ + struct ext_wait_queue *sender = wq_get_first_waiter(info, SEND); + + if (!sender) + return; + + msg_insert(sender->msg, info); + list_del(&sender->list); + sender->state = STATE_PENDING; + wake_up_process(sender->task); + wmb(); + sender->state = STATE_READY; +} + +asmlinkage long sys_mq_timedsend(mqd_t mqdes, const char __user *u_msg_ptr, + size_t msg_len, unsigned int msg_prio, + const struct timespec __user *u_abs_timeout) +{ + struct file *filp; + struct inode *inode; + struct ext_wait_queue wait; + struct ext_wait_queue *receiver; + struct msg_msg *msg_ptr; + struct mqueue_inode_info *info; + long timeout; + int ret; + + if (unlikely(msg_prio >= (unsigned long) MQ_PRIO_MAX)) + return -EINVAL; + + timeout = prepare_timeout(u_abs_timeout); + + ret = -EBADF; + filp = fget(mqdes); + if (unlikely(!filp)) + goto out; + + inode = filp->f_dentry->d_inode; + if (unlikely(inode->i_sb != mqueue_mnt->mnt_sb)) + goto out_fput; + info = MQUEUE_I(inode); + + if (unlikely((filp->f_flags & O_ACCMODE) == O_RDONLY)) + goto out_fput; + + if (unlikely(msg_len > info->attr.mq_msgsize)) { + ret = -EMSGSIZE; + goto out_fput; + } + + /* First try to allocate memory, before doing anything with + * existing queues. 
*/ + msg_ptr = load_msg((void *)u_msg_ptr, msg_len); + if (unlikely(IS_ERR(msg_ptr))) { + ret = PTR_ERR(msg_ptr); + goto out_fput; + } + msg_ptr->m_ts = msg_len; + msg_ptr->m_type = msg_prio; + + spin_lock(&info->lock); + + if (info->attr.mq_curmsgs == info->attr.mq_maxmsg) { + if (filp->f_flags & O_NONBLOCK) { + spin_unlock(&info->lock); + ret = -EAGAIN; + } else if (unlikely(timeout < 0)) { + spin_unlock(&info->lock); + ret = timeout; + } else { + wait.task = current; + wait.msg = (void *) msg_ptr; + wait.state = STATE_NONE; + ret = wq_sleep(info, SEND, timeout, &wait); + if (ret < 0) + free_msg(msg_ptr); + } + } else { + receiver = wq_get_first_waiter(info, RECV); + if (receiver) { + pipelined_send(info, msg_ptr, receiver); + } else { + /* adds message to the queue */ + msg_insert(msg_ptr, info); + __do_notify(info); + } + inode->i_atime = inode->i_mtime = inode->i_ctime = + CURRENT_TIME; + spin_unlock(&info->lock); + ret = 0; + } +out_fput: + fput(filp); +out: + return ret; +} + +asmlinkage ssize_t sys_mq_timedreceive(mqd_t mqdes, char __user *u_msg_ptr, + size_t msg_len, unsigned int __user *u_msg_prio, + const struct timespec __user *u_abs_timeout) +{ + long timeout; + ssize_t ret; + struct msg_msg *msg_ptr; + struct file *filp; + struct inode *inode; + struct mqueue_inode_info *info; + struct ext_wait_queue wait; + + timeout = prepare_timeout(u_abs_timeout); + + ret = -EBADF; + filp = fget(mqdes); + if (unlikely(!filp)) + goto out; + + inode = filp->f_dentry->d_inode; + if (unlikely(inode->i_sb != mqueue_mnt->mnt_sb)) + goto out_fput; + info = MQUEUE_I(inode); + + if (unlikely((filp->f_flags & O_ACCMODE) == O_WRONLY)) + goto out_fput; + + /* checks if buffer is big enough */ + if (unlikely(msg_len < info->attr.mq_msgsize)) { + ret = -EMSGSIZE; + goto out_fput; + } + + spin_lock(&info->lock); + if (info->attr.mq_curmsgs == 0) { + if (filp->f_flags & O_NONBLOCK) { + spin_unlock(&info->lock); + ret = -EAGAIN; + msg_ptr = NULL; + } else if (unlikely(timeout < 
0)) { + spin_unlock(&info->lock); + ret = timeout; + msg_ptr = NULL; + } else { + wait.task = current; + wait.state = STATE_NONE; + ret = wq_sleep(info, RECV, timeout, &wait); + msg_ptr = wait.msg; + } + } else { + msg_ptr = msg_get(info); + + inode->i_atime = inode->i_mtime = inode->i_ctime = + CURRENT_TIME; + + /* There is now free space in queue. */ + pipelined_receive(info); + spin_unlock(&info->lock); + ret = 0; + } + if (ret == 0) { + ret = msg_ptr->m_ts; + + if ((u_msg_prio && put_user(msg_ptr->m_type, u_msg_prio)) || + store_msg(u_msg_ptr, msg_ptr, msg_ptr->m_ts)) { + ret = -EFAULT; + } + free_msg(msg_ptr); + } +out_fput: + fput(filp); +out: + return ret; +} + +/* + * Notes: the case when user wants us to deregister (with NULL as pointer + * or SIGEV_NONE) and he isn't currently owner of notification will be + * silently discarded. It isn't explicitly defined in the POSIX. + */ +asmlinkage long sys_mq_notify(mqd_t mqdes, + const struct sigevent __user *u_notification) +{ + int ret, fd; + struct file *filp, *nfilp; + struct inode *inode; + struct sigevent notification; + struct mqueue_inode_info *info; + + if (u_notification == NULL) { + notification.sigev_notify = SIGEV_NONE; + } else { + if (copy_from_user(&notification, u_notification, + sizeof(struct sigevent))) + return -EFAULT; + + if (unlikely(notification.sigev_notify != SIGEV_NONE && + notification.sigev_notify != SIGEV_SIGNAL && + notification.sigev_notify != SIGEV_THREAD)) + return -EINVAL; + if (notification.sigev_notify == SIGEV_SIGNAL && + (notification.sigev_signo < 0 || + notification.sigev_signo > _NSIG)) { + return -EINVAL; + } + } + + ret = -EBADF; + filp = fget(mqdes); + if (!filp) + goto out; + + inode = filp->f_dentry->d_inode; + if (unlikely(inode->i_sb != mqueue_mnt->mnt_sb)) + goto out_fput; + info = MQUEUE_I(inode); + + ret = 0; + if (notification.sigev_notify == SIGEV_THREAD) { + ret = get_unused_fd(); + if (ret < 0) + goto out_fput; + fd = ret; + nfilp = get_empty_filp(); + if 
(!nfilp) { + ret = -ENFILE; + goto out_dropfd; + } + nfilp->private_data = NP_NONE; + nfilp->f_op = &mqueue_notify_fops; + nfilp->f_vfsmnt = mntget(mqueue_mnt); + nfilp->f_dentry = dget(filp->f_dentry); + nfilp->f_mapping = filp->f_dentry->d_inode->i_mapping; + nfilp->f_mode = FMODE_READ; + } else { + nfilp = NULL; + fd = -1; + } + + spin_lock(&info->lock); + + if (notification.sigev_notify == SIGEV_NONE) { + if (info->notify_owner == current->tgid) { + remove_notification(info); + inode->i_atime = inode->i_ctime = CURRENT_TIME; + } + } else if (info->notify_owner) { + ret = -EBUSY; + } else if (notification.sigev_notify == SIGEV_THREAD) { + info->notify_filp = nfilp; + fd_install(fd, nfilp); + ret = fd; + fd = -1; + nfilp = NULL; + info->notify.sigev_notify = SIGEV_THREAD; + info->notify_owner = current->tgid; + inode->i_atime = inode->i_ctime = CURRENT_TIME; + } else { + info->notify.sigev_signo = notification.sigev_signo; + info->notify.sigev_value = notification.sigev_value; + info->notify.sigev_notify = SIGEV_SIGNAL; + info->notify_owner = current->tgid; + inode->i_atime = inode->i_ctime = CURRENT_TIME; + } + spin_unlock(&info->lock); +out_dropfd: + if (fd != -1) + put_unused_fd(fd); +out_fput: + fput(filp); +out: + return ret; +} + +asmlinkage long sys_mq_getsetattr(mqd_t mqdes, + const struct mq_attr __user *u_mqstat, + struct mq_attr __user *u_omqstat) +{ + int ret; + struct mq_attr mqstat, omqstat; + struct file *filp; + struct inode *inode; + struct mqueue_inode_info *info; + + if (u_mqstat != NULL) { + if (copy_from_user(&mqstat, u_mqstat, sizeof(struct mq_attr))) + return -EFAULT; + } + + ret = -EBADF; + filp = fget(mqdes); + if (!filp) + goto out; + + inode = filp->f_dentry->d_inode; + if (unlikely(inode->i_sb != mqueue_mnt->mnt_sb)) + goto out_fput; + info = MQUEUE_I(inode); + + spin_lock(&info->lock); + + omqstat = info->attr; + omqstat.mq_flags = filp->f_flags; + if (u_mqstat) { + if (mqstat.mq_flags & O_NONBLOCK) + filp->f_flags |= O_NONBLOCK; + 
else + filp->f_flags &= ~O_NONBLOCK; + + inode->i_atime = inode->i_ctime = CURRENT_TIME; + } + + spin_unlock(&info->lock); + + ret = 0; + if (u_omqstat != NULL && copy_to_user(u_omqstat, &omqstat, + sizeof(struct mq_attr))) + ret = -EFAULT; + +out_fput: + fput(filp); +out: + return ret; +} + +static struct inode_operations mqueue_dir_inode_operations = { + .lookup = simple_lookup, + .create = mqueue_create, + .unlink = simple_unlink, +}; + +static struct file_operations mqueue_file_operations = { + .flush = mqueue_flush_file, +}; + +static struct file_operations mqueue_notify_fops = { + .poll = mqueue_notify_poll, + .read = mqueue_notify_read, + .release = mqueue_notify_release, +}; + + +static struct super_operations mqueue_super_ops = { + .alloc_inode = mqueue_alloc_inode, + .destroy_inode = mqueue_destroy_inode, + .delete_inode = mqueue_delete_inode, + .drop_inode = generic_delete_inode, +}; + +static struct file_system_type mqueue_fs_type = { + .name = "mqueue", + .get_sb = mqueue_get_sb, + .kill_sb = kill_anon_super, +}; + +static int msg_max_limit_min = DFLT_MSGMAX; +static int msg_max_limit_max = HARD_MSGMAX; + +static int msg_maxsize_limit_min = DFLT_MSGSIZEMAX; +static int msg_maxsize_limit_max = INT_MAX; + +static ctl_table mq_sysctls[] = { + { + .ctl_name = CTL_QUEUESMAX, + .procname = "queues_max", + .data = &queues_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = CTL_MSGMAX, + .procname = "msg_max", + .data = &msg_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &msg_max_limit_min, + .extra2 = &msg_max_limit_max, + }, + { + .ctl_name = CTL_MSGSIZEMAX, + .procname = "msgsize_max", + .data = &msgsize_max, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &msg_maxsize_limit_min, + .extra2 = &msg_maxsize_limit_max, + }, + { .ctl_name = 0 } +}; + +static ctl_table mq_sysctl_dir[] = { + { + .ctl_name = FS_MQUEUE, 
+ .procname = "mqueue", + .mode = 0555, + .child = mq_sysctls, + }, + { .ctl_name = 0 } +}; + +static ctl_table mq_sysctl_root[] = { + { + .ctl_name = CTL_FS, + .procname = "fs", + .mode = 0555, + .child = mq_sysctl_dir, + }, + { .ctl_name = 0 } +}; + +static int __init init_mqueue_fs(void) +{ + int error; + + mqueue_inode_cachep = kmem_cache_create("mqueue_inode_cache", + sizeof(struct mqueue_inode_info), 0, + SLAB_HWCACHE_ALIGN, init_once, NULL); + if (mqueue_inode_cachep == NULL) + return -ENOMEM; + + mq_sysctl_table = register_sysctl_table(mq_sysctl_root, 0); + if (!mq_sysctl_table) { + error = -ENOMEM; + goto out_cache; + } + + error = register_filesystem(&mqueue_fs_type); + if (error) + goto out_sysctl; + + if (IS_ERR(mqueue_mnt = kern_mount(&mqueue_fs_type))) { + error = PTR_ERR(mqueue_mnt); + goto out_filesystem; + } + + /* internal initialization - not common for vfs */ + queues_count = 0; + spin_lock_init(&mq_lock); + + return 0; + +out_filesystem: + unregister_filesystem(&mqueue_fs_type); +out_sysctl: + unregister_sysctl_table(mq_sysctl_table); +out_cache: + if (kmem_cache_destroy(mqueue_inode_cachep)) { + printk(KERN_INFO + "mqueue_inode_cache: not all structures were freed\n"); + } + return error; +} + +__initcall(init_mqueue_fs); diff --git a/kernel/signal.c b/kernel/signal.c index 32992a71683b..e6b7904df68f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2047,6 +2047,7 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from) err |= __put_user(from->si_stime, &to->si_stime); break; case __SI_RT: /* This is not generated by the kernel as of now. 
*/ + case __SI_MESGQ: /* But this is */ err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); err |= __put_user(from->si_int, &to->si_int); -- cgit v1.2.3 From f3ca8d5dd5c23594bda07893ae374bed7981d473 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:54:54 -0700 Subject: [PATCH] posix message queue update From: Manfred Spraul My discussion with Ulrich had one result: - mq_setattr can accept implementation defined flags. Right now we have none, but we might add some later (e.g. switch to CLOCK_MONOTONIC for mq_timed{send,receive} or something similar). When we add flags, we might need the fields for additional information. And they don't hurt. Therefore add four __reserved fields to mq_attr. - fail mq_setattr if we get unknown flags - otherwise glibc can't detect if it's running on a future kernel that supports new features. - use memset to initialize the mq_attr structure - theoretically we could leak kernel memory. - Only set O_NONBLOCK in mq_attr, explicitly clear O_RDWR & friends. openposix uses getattr, attr |=O_NONBLOCK, setattr - a sane approach. Without clearing O_RDWR, this fails. I've retested all openposix conformance tests with the new patch - the two new FAILED tests check undefined behavior. Note that I won't have net access until Sunday - if the message queue patch breaks something important either ask Krzysztof or drop it. Ulrich had another good idea for SIGEV_THREAD, but I must think about it. It would mean less complexity in glibc, but more code in the kernel. I'm not yet convinced that it's overall better. 
--- include/linux/mqueue.h | 1 + ipc/mqueue.c | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mqueue.h b/include/linux/mqueue.h index c0c5fcc89f0e..535fe4b2f14b 100644 --- a/include/linux/mqueue.h +++ b/include/linux/mqueue.h @@ -27,6 +27,7 @@ struct mq_attr { long mq_maxmsg; /* maximum number of messages */ long mq_msgsize; /* maximum message size */ long mq_curmsgs; /* number of messages currently queued */ + long __reserved[4]; /* ignored for input, zeroed for output */ }; #define NOTIFY_NONE 0 diff --git a/ipc/mqueue.c b/ipc/mqueue.c index c9a3e652a026..b5f731781f56 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -121,7 +121,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb, int mode) INIT_LIST_HEAD(&info->e_wait_q[1].list); info->notify_owner = 0; info->qsize = 0; - info->attr.mq_curmsgs = 0; + memset(&info->attr, 0, sizeof(info->attr)); info->attr.mq_maxmsg = DFLT_MSGMAX; info->attr.mq_msgsize = DFLT_MSGSIZEMAX; info->messages = kmalloc(DFLT_MSGMAX * sizeof(struct msg_msg *), GFP_KERNEL); @@ -1082,6 +1082,8 @@ asmlinkage long sys_mq_getsetattr(mqd_t mqdes, if (u_mqstat != NULL) { if (copy_from_user(&mqstat, u_mqstat, sizeof(struct mq_attr))) return -EFAULT; + if (mqstat.mq_flags & (~O_NONBLOCK)) + return -EINVAL; } ret = -EBADF; @@ -1097,7 +1099,7 @@ asmlinkage long sys_mq_getsetattr(mqd_t mqdes, spin_lock(&info->lock); omqstat = info->attr; - omqstat.mq_flags = filp->f_flags; + omqstat.mq_flags = filp->f_flags & O_NONBLOCK; if (u_mqstat) { if (mqstat.mq_flags & O_NONBLOCK) filp->f_flags |= O_NONBLOCK; -- cgit v1.2.3 From ed6dcf4a49c1098e0701762f6cc52b194cb7f661 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:55:19 -0700 Subject: [PATCH] split netlink_unicast From: Manfred Spraul The attached patch splits netlink_unicast into three steps: - netlink_getsock{bypid,byfilp}: lookup the destination socket. 
- netlink_attachskb: perform the nonblock checks, sleep if the socket queue is longer than the limit, etc. - netlink_sendskb: actually send the skb. jamal looked over it and didn't see a problem with the netlink change. The actual use from ipc/mqueue.c is still open (just send back whatever the C library passed to mq_notify, add an nlmsghdr or perhaps even make it a specialized netlink protocol), but the attached patch is independent of the message queue change. (acked by davem) --- include/linux/netlink.h | 7 +++ net/netlink/af_netlink.c | 120 +++++++++++++++++++++++++++++++++++++---------- 2 files changed, 101 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/linux/netlink.h b/include/linux/netlink.h index 4e5ea27305a2..e5e15ddadab5 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -119,6 +119,13 @@ extern void netlink_set_err(struct sock *ssk, __u32 pid, __u32 group, int code); extern int netlink_register_notifier(struct notifier_block *nb); extern int netlink_unregister_notifier(struct notifier_block *nb); +/* finegrained unicast helpers: */ +struct sock *netlink_getsockbypid(struct sock *ssk, u32 pid); +struct sock *netlink_getsockbyfilp(struct file *filp); +int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, long timeo); +void netlink_detachskb(struct sock *sk, struct sk_buff *skb); +int netlink_sendskb(struct sock *sk, struct sk_buff *skb, int protocol); + /* * skb should fit one page. This choice is good for headerless malloc. 
* diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 38c27b9bb70a..398cd03f2d7b 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -415,38 +415,65 @@ static void netlink_overrun(struct sock *sk) } } -int netlink_unicast(struct sock *ssk, struct sk_buff *skb, u32 pid, int nonblock) +struct sock *netlink_getsockbypid(struct sock *ssk, u32 pid) { - struct sock *sk; - struct netlink_opt *nlk; - int len = skb->len; int protocol = ssk->sk_protocol; - long timeo; - DECLARE_WAITQUEUE(wait, current); - - timeo = sock_sndtimeo(ssk, nonblock); + struct sock *sock; + struct netlink_opt *nlk; -retry: - sk = netlink_lookup(protocol, pid); - if (sk == NULL) - goto no_dst; - nlk = nlk_sk(sk); + sock = netlink_lookup(protocol, pid); + if (!sock) + return ERR_PTR(-ECONNREFUSED); /* Don't bother queuing skb if kernel socket has no input function */ - if (nlk->pid == 0 && !nlk->data_ready) - goto no_dst; + nlk = nlk_sk(sock); + if (nlk->pid == 0 && !nlk->data_ready) { + sock_put(sock); + return ERR_PTR(-ECONNREFUSED); + } + return sock; +} + +struct sock *netlink_getsockbyfilp(struct file *filp) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct socket *socket; + struct sock *sock; + + if (!inode->i_sock || !(socket = SOCKET_I(inode))) + return ERR_PTR(-ENOTSOCK); + + sock = socket->sk; + if (sock->sk_family != AF_NETLINK) + return ERR_PTR(-EINVAL); + + sock_hold(sock); + return sock; +} + +/* + * Attach a skb to a netlink socket. + * The caller must hold a reference to the destination socket. On error, the + * reference is dropped. The skb is not sent to the destination, just + * all error checks are performed and memory in the queue is reserved. + * Return values: + * < 0: error. skb freed, reference to sock dropped. + * 0: continue + * 1: repeat lookup - reference dropped while waiting for socket memory. 
+ */ +int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock, long timeo) +{ + struct netlink_opt *nlk; + + nlk = nlk_sk(sk); #ifdef NL_EMULATE_DEV - if (nlk->handler) { - skb_orphan(skb); - len = nlk->handler(protocol, skb); - sock_put(sk); - return len; - } + if (nlk->handler) + return 0; #endif - if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || test_bit(0, &nlk->state)) { + DECLARE_WAITQUEUE(wait, current); if (!timeo) { if (!nlk->pid) netlink_overrun(sk); @@ -471,19 +498,60 @@ retry: kfree_skb(skb); return sock_intr_errno(timeo); } - goto retry; + return 1; } - skb_orphan(skb); skb_set_owner_r(skb, sk); + return 0; +} + +int netlink_sendskb(struct sock *sk, struct sk_buff *skb, int protocol) +{ + struct netlink_opt *nlk; + int len = skb->len; + + nlk = nlk_sk(sk); +#ifdef NL_EMULATE_DEV + if (nlk->handler) { + skb_orphan(skb); + len = nlk->handler(protocol, skb); + sock_put(sk); + return len; + } +#endif + skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk, len); sock_put(sk); return len; +} -no_dst: +void netlink_detachskb(struct sock *sk, struct sk_buff *skb) +{ kfree_skb(skb); - return -ECONNREFUSED; + sock_put(sk); +} + +int netlink_unicast(struct sock *ssk, struct sk_buff *skb, u32 pid, int nonblock) +{ + struct sock *sk; + int err; + long timeo; + + timeo = sock_sndtimeo(ssk, nonblock); +retry: + sk = netlink_getsockbypid(ssk, pid); + if (IS_ERR(sk)) { + kfree_skb(skb); + return PTR_ERR(skb); + } + err = netlink_attachskb(sk, skb, nonblock, timeo); + if (err == 1) + goto retry; + if (err) + return err; + + return netlink_sendskb(sk, skb, ssk->sk_protocol); } static __inline__ int netlink_broadcast_deliver(struct sock *sk, struct sk_buff *skb) -- cgit v1.2.3 From 34b98f223bb21673f4cab2f5079a763c34a67946 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:55:32 -0700 Subject: [PATCH] posix message queues: send notifications via netlink From: Manfred Spraul SIGEV_THREAD means that a given callback 
should be called in the context on a new thread. This must be done by the C library. The kernel must deliver a notice of the event to the C library when the callback should be called. This patch switches to a new, simpler interface: User space creates a socket with socket(PF_NETLINK, SOCK_RAW,0) and passes the fd to the mq_notify call together with a cookie. When the mq_notify() condition is satisfied, the kernel "writes" the cookie to the socket. User space then reads the cookie and calls the appropriate callback. --- include/linux/mqueue.h | 16 ++++ ipc/mqueue.c | 254 +++++++++++++++++++++---------------------------- 2 files changed, 123 insertions(+), 147 deletions(-) (limited to 'include') diff --git a/include/linux/mqueue.h b/include/linux/mqueue.h index 535fe4b2f14b..fdab3b8ee242 100644 --- a/include/linux/mqueue.h +++ b/include/linux/mqueue.h @@ -30,8 +30,24 @@ struct mq_attr { long __reserved[4]; /* ignored for input, zeroed for output */ }; +/* + * SIGEV_THREAD implementation: + * SIGEV_THREAD must be implemented in user space. If SIGEV_THREAD is passed + * to mq_notify, then + * - sigev_signo must be the file descriptor of an AF_NETLINK socket. It's not + * necessary that the socket is bound. + * - sigev_value.sival_ptr must point to a cookie that is NOTIFY_COOKIE_LEN + * bytes long. + * If the notification is triggered, then the cookie is sent to the netlink + * socket. The last byte of the cookie is replaced with the NOTIFY_?? codes: + * NOTIFY_WOKENUP if the notification got triggered, NOTIFY_REMOVED if it was + * removed, either due to a close() on the message queue fd or due to a + * mq_notify() that removed the notification. 
+ */ #define NOTIFY_NONE 0 #define NOTIFY_WOKENUP 1 #define NOTIFY_REMOVED 2 +#define NOTIFY_COOKIE_LEN 32 + #endif diff --git a/ipc/mqueue.c b/ipc/mqueue.c index f0d78fefc28b..f81441d63564 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -20,6 +20,9 @@ #include #include #include +#include +#include +#include #include "util.h" #define MQUEUE_MAGIC 0x19800202 @@ -33,9 +36,6 @@ #define STATE_PENDING 1 #define STATE_READY 2 -#define NP_NONE ((void*)NOTIFY_NONE) -#define NP_WOKENUP ((void*)NOTIFY_WOKENUP) -#define NP_REMOVED ((void*)NOTIFY_REMOVED) /* used by sysctl */ #define FS_MQUEUE 1 #define CTL_QUEUESMAX 2 @@ -48,6 +48,8 @@ #define HARD_MSGMAX (131072/sizeof(void*)) #define DFLT_MSGSIZEMAX 16384 /* max message size */ +#define NOTIFY_COOKIE_LEN 32 + struct ext_wait_queue { /* queue of sleeping tasks */ struct task_struct *task; struct list_head list; @@ -56,25 +58,26 @@ struct ext_wait_queue { /* queue of sleeping tasks */ }; struct mqueue_inode_info { - struct mq_attr attr; + spinlock_t lock; + struct inode vfs_inode; + wait_queue_head_t wait_q; + struct msg_msg **messages; + struct mq_attr attr; - pid_t notify_owner; /* != 0 means notification registered */ - struct sigevent notify; - struct file *notify_filp; + struct sigevent notify; /* notify.sigev_notify == SIGEV_NONE means */ + pid_t notify_owner; /* no notification registered */ + struct sock *notify_sock; + struct sk_buff *notify_cookie; /* for tasks waiting for free space and messages, respectively */ struct ext_wait_queue e_wait_q[2]; - wait_queue_head_t wait_q; unsigned long qsize; /* size of queue in memory (sum of all msgs) */ - spinlock_t lock; - struct inode vfs_inode; }; static struct inode_operations mqueue_dir_inode_operations; static struct file_operations mqueue_file_operations; -static struct file_operations mqueue_notify_fops; static struct super_operations mqueue_super_ops; static void remove_notification(struct mqueue_inode_info *info); @@ -119,7 +122,7 @@ static struct inode 
*mqueue_get_inode(struct super_block *sb, int mode) init_waitqueue_head(&info->wait_q); INIT_LIST_HEAD(&info->e_wait_q[0].list); INIT_LIST_HEAD(&info->e_wait_q[1].list); - info->notify_owner = 0; + info->notify.sigev_notify = SIGEV_NONE; info->qsize = 0; memset(&info->attr, 0, sizeof(info->attr)); info->attr.mq_maxmsg = DFLT_MSGMAX; @@ -283,10 +286,11 @@ static ssize_t mqueue_read_file(struct file *filp, char __user *u_data, snprintf(buffer, sizeof(buffer), "QSIZE:%-10lu NOTIFY:%-5d SIGNO:%-5d NOTIFY_PID:%-6d\n", info->qsize, - info->notify_owner ? info->notify.sigev_notify : SIGEV_NONE, - (info->notify_owner && info->notify.sigev_notify == SIGEV_SIGNAL ) ? + info->notify.sigev_notify, + (info->notify.sigev_notify == SIGEV_SIGNAL ) ? info->notify.sigev_signo : 0, - info->notify_owner); + (info->notify.sigev_notify != SIGEV_NONE) ? + info->notify_owner : 0); spin_unlock(&info->lock); buffer[sizeof(buffer)-1] = '\0'; slen = strlen(buffer)+1; @@ -299,7 +303,7 @@ static ssize_t mqueue_read_file(struct file *filp, char __user *u_data, count = slen - o; if (copy_to_user(u_data, buffer + o, count)) - return -EFAULT; + return -EFAULT; *off = o + count; filp->f_dentry->d_inode->i_atime = filp->f_dentry->d_inode->i_ctime = CURRENT_TIME; @@ -311,7 +315,8 @@ static int mqueue_flush_file(struct file *filp) struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode); spin_lock(&info->lock); - if (current->tgid == info->notify_owner) + if (info->notify.sigev_notify != SIGEV_NONE && + current->tgid == info->notify_owner) remove_notification(info); spin_unlock(&info->lock); @@ -435,6 +440,11 @@ static inline struct msg_msg *msg_get(struct mqueue_inode_info *info) return info->messages[info->attr.mq_curmsgs]; } +static inline void set_cookie(struct sk_buff *skb, char code) +{ + ((char*)skb->data)[NOTIFY_COOKIE_LEN-1] = code; +} + /* * The next function is only to split too long sys_mq_timedsend */ @@ -445,7 +455,8 @@ static void __do_notify(struct mqueue_inode_info *info) * 
waiting synchronously for message AND state of queue changed from * empty to not empty. Here we are sure that no one is waiting * synchronously. */ - if (info->notify_owner && info->attr.mq_curmsgs == 1) { + if (info->notify.sigev_notify != SIGEV_NONE && + info->attr.mq_curmsgs == 1) { /* sends signal */ if (info->notify.sigev_notify == SIGEV_SIGNAL) { struct siginfo sig_i; @@ -460,10 +471,12 @@ static void __do_notify(struct mqueue_inode_info *info) kill_proc_info(info->notify.sigev_signo, &sig_i, info->notify_owner); } else if (info->notify.sigev_notify == SIGEV_THREAD) { - info->notify_filp->private_data = (void*)NP_WOKENUP; + set_cookie(info->notify_cookie, NOTIFY_WOKENUP); + netlink_sendskb(info->notify_sock, + info->notify_cookie, 0); } /* after notification unregisters process */ - info->notify_owner = 0; + info->notify.sigev_notify = SIGEV_NONE; } wake_up(&info->wait_q); } @@ -499,90 +512,13 @@ static long prepare_timeout(const struct timespec __user *u_arg) return timeout; } -/* - * File descriptor based notification, intended to be used to implement - * SIGEV_THREAD: - * SIGEV_THREAD means that a notification function should be called in the - * context of a new thread. The kernel can't do that. Therefore mq_notify - * calls with SIGEV_THREAD return a new file descriptor. A user space helper - * must create a new thread and then read from the given file descriptor. - * The read always returns one byte. If it's NOTIFY_WOKENUP, then it must - * call the notification function. If it's NOTIFY_REMOVED, then the - * notification was removed. The file descriptor supports poll, thus one - * supervisor thread can manage multiple message queue notifications. - * - * The implementation must support multiple outstanding notifications: - * It's possible that a new notification is added and signaled before user - * space calls mqueue_notify_read for the previous notification. 
- * Therefore the notification state is stored in the private_data field of - * the file descriptor. - */ -static unsigned int mqueue_notify_poll(struct file *filp, - struct poll_table_struct *poll_tab) -{ - struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode); - int retval; - - poll_wait(filp, &info->wait_q, poll_tab); - - if (filp->private_data == NP_NONE) - retval = 0; - else - retval = POLLIN | POLLRDNORM; - return retval; -} - -static ssize_t mqueue_notify_read(struct file *filp, char __user *buf, - size_t count, loff_t *ppos) -{ - struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode); - char result; - - if (!count) - return 0; - if (*ppos != 0) - return 0; - spin_lock(&info->lock); - while (filp->private_data == NP_NONE) { - DEFINE_WAIT(wait); - if (filp->f_flags & O_NONBLOCK) { - spin_unlock(&info->lock); - return -EAGAIN; - } - prepare_to_wait(&info->wait_q, &wait, TASK_INTERRUPTIBLE); - spin_unlock(&info->lock); - schedule(); - finish_wait(&info->wait_q, &wait); - spin_lock(&info->lock); - } - spin_unlock(&info->lock); - result = (char)(unsigned long)filp->private_data; - if (put_user(result, buf)) - return -EFAULT; - *ppos = 1; - return 1; -} - -static int mqueue_notify_release(struct inode *inode, struct file *filp) -{ - struct mqueue_inode_info *info = MQUEUE_I(filp->f_dentry->d_inode); - - spin_lock(&info->lock); - if (info->notify_owner && info->notify_filp == filp) - info->notify_owner = 0; - filp->private_data = NP_REMOVED; - spin_unlock(&info->lock); - - return 0; -} - static void remove_notification(struct mqueue_inode_info *info) { if (info->notify.sigev_notify == SIGEV_THREAD) { - info->notify_filp->private_data = NP_REMOVED; - wake_up(&info->wait_q); + set_cookie(info->notify_cookie, NOTIFY_REMOVED); + netlink_sendskb(info->notify_sock, info->notify_cookie, 0); } - info->notify_owner = 0; + info->notify.sigev_notify = SIGEV_NONE; } /* @@ -780,7 +716,8 @@ out_unlock: */ /* pipelined_send() - send a message directly to 
the task waiting in - * sys_mq_timedreceive() (without inserting message into a queue). */ + * sys_mq_timedreceive() (without inserting message into a queue). + */ static inline void pipelined_send(struct mqueue_inode_info *info, struct msg_msg *message, struct ext_wait_queue *receiver) @@ -978,12 +915,16 @@ out: asmlinkage long sys_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification) { - int ret, fd; - struct file *filp, *nfilp; + int ret; + struct file *filp; + struct sock *sock; struct inode *inode; struct sigevent notification; struct mqueue_inode_info *info; + struct sk_buff *nc; + nc = NULL; + sock = NULL; if (u_notification == NULL) { notification.sigev_notify = SIGEV_NONE; } else { @@ -1000,6 +941,44 @@ asmlinkage long sys_mq_notify(mqd_t mqdes, notification.sigev_signo > _NSIG)) { return -EINVAL; } + if (notification.sigev_notify == SIGEV_THREAD) { + /* create the notify skb */ + nc = alloc_skb(NOTIFY_COOKIE_LEN, GFP_KERNEL); + ret = -ENOMEM; + if (!nc) + goto out; + ret = -EFAULT; + if (copy_from_user(nc->data, + notification.sigev_value.sival_ptr, + NOTIFY_COOKIE_LEN)) { + goto out; + } + + /* TODO: add a header? 
*/ + skb_put(nc, NOTIFY_COOKIE_LEN); + /* and attach it to the socket */ +retry: + filp = fget(notification.sigev_signo); + ret = -EBADF; + if (!filp) + goto out; + sock = netlink_getsockbyfilp(filp); + fput(filp); + if (IS_ERR(sock)) { + ret = PTR_ERR(sock); + sock = NULL; + goto out; + } + + ret = netlink_attachskb(sock, nc, 0, MAX_SCHEDULE_TIMEOUT); + if (ret == 1) + goto retry; + if (ret) { + sock = NULL; + nc = NULL; + goto out; + } + } } ret = -EBADF; @@ -1013,47 +992,33 @@ asmlinkage long sys_mq_notify(mqd_t mqdes, info = MQUEUE_I(inode); ret = 0; - if (notification.sigev_notify == SIGEV_THREAD) { - ret = get_unused_fd(); - if (ret < 0) - goto out_fput; - fd = ret; - nfilp = get_empty_filp(); - if (!nfilp) { - ret = -ENFILE; - goto out_dropfd; - } - nfilp->private_data = NP_NONE; - nfilp->f_op = &mqueue_notify_fops; - nfilp->f_vfsmnt = mntget(mqueue_mnt); - nfilp->f_dentry = dget(filp->f_dentry); - nfilp->f_mapping = filp->f_dentry->d_inode->i_mapping; - nfilp->f_flags = O_RDONLY; - nfilp->f_mode = FMODE_READ; - } else { - nfilp = NULL; - fd = -1; - } - spin_lock(&info->lock); - - if (notification.sigev_notify == SIGEV_NONE) { - if (info->notify_owner == current->tgid) { + switch (notification.sigev_notify) { + case SIGEV_NONE: + if (info->notify.sigev_notify != SIGEV_NONE && + info->notify_owner == current->tgid) { remove_notification(info); inode->i_atime = inode->i_ctime = CURRENT_TIME; } - } else if (info->notify_owner) { - ret = -EBUSY; - } else if (notification.sigev_notify == SIGEV_THREAD) { - info->notify_filp = nfilp; - fd_install(fd, nfilp); - ret = fd; - fd = -1; - nfilp = NULL; + break; + case SIGEV_THREAD: + if (info->notify.sigev_notify != SIGEV_NONE) { + ret = -EBUSY; + break; + } + info->notify_sock = sock; + info->notify_cookie = nc; + sock = NULL; + nc = NULL; info->notify.sigev_notify = SIGEV_THREAD; info->notify_owner = current->tgid; inode->i_atime = inode->i_ctime = CURRENT_TIME; - } else { + break; + case SIGEV_SIGNAL: + if 
(info->notify.sigev_notify != SIGEV_NONE) { + ret = -EBUSY; + break; + } info->notify.sigev_signo = notification.sigev_signo; info->notify.sigev_value = notification.sigev_value; info->notify.sigev_notify = SIGEV_SIGNAL; @@ -1061,12 +1026,14 @@ asmlinkage long sys_mq_notify(mqd_t mqdes, inode->i_atime = inode->i_ctime = CURRENT_TIME; } spin_unlock(&info->lock); -out_dropfd: - if (fd != -1) - put_unused_fd(fd); out_fput: fput(filp); out: + if (sock) { + netlink_detachskb(sock, nc); + } else if (nc) { + dev_kfree_skb(nc); + } return ret; } @@ -1135,13 +1102,6 @@ static struct file_operations mqueue_file_operations = { .read = mqueue_read_file, }; -static struct file_operations mqueue_notify_fops = { - .poll = mqueue_notify_poll, - .read = mqueue_notify_read, - .release = mqueue_notify_release, -}; - - static struct super_operations mqueue_super_ops = { .alloc_inode = mqueue_alloc_inode, .destroy_inode = mqueue_destroy_inode, -- cgit v1.2.3 From 87c22e8470366e81aa82bcbadaf147c4ecdfb182 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:55:45 -0700 Subject: [PATCH] compat emulation for posix message queues From: Arnd Bergmann I have tested the code with the open posix test suite and found the same four failures for both 64-bit and compat mode, most tests pass. The patch is against -mc1, but I guess it also applies to the other trees around. What worries me more than mq_attr compatibility is the conversion of struct sigevent, which might turn out really hard when more fields in there are used. AFAICS, the only other part in the kernel ABI is sys_timer_create(), so maybe it's not too late to deprecate the current structure and create a structure that can be used properly for compat syscalls. 
--- arch/ia64/ia32/ia32_signal.c | 7 +- arch/mips/kernel/signal32.c | 7 +- arch/s390/kernel/compat_signal.c | 5 +- arch/sparc64/kernel/signal32.c | 7 +- arch/x86_64/ia32/ia32_signal.c | 6 +- include/asm-ppc64/ppc32.h | 14 --- include/linux/compat.h | 17 ++++ include/linux/mqueue.h | 4 +- include/linux/posix_types.h | 1 + include/linux/syscalls.h | 1 - include/linux/types.h | 1 + ipc/Makefile | 3 +- ipc/compat_mq.c | 196 +++++++++++++++++++++++++++++++++++++++ kernel/sys.c | 5 + 14 files changed, 251 insertions(+), 23 deletions(-) create mode 100644 ipc/compat_mq.c (limited to 'include') diff --git a/arch/ia64/ia32/ia32_signal.c b/arch/ia64/ia32/ia32_signal.c index 8b1374c172b6..bb1e836fb227 100644 --- a/arch/ia64/ia32/ia32_signal.c +++ b/arch/ia64/ia32/ia32_signal.c @@ -114,7 +114,12 @@ copy_siginfo_from_user32 (siginfo_t *to, siginfo_t32 *from) err |= __get_user(to->si_band, &from->si_band); err |= __get_user(to->si_fd, &from->si_fd); break; - /* case __SI_RT: This is not generated by the kernel as of now. */ + case __SI_RT: /* This is not generated by the kernel as of now. */ + case __SI_MESGQ: + err |= __get_user(to->si_pid, &from->si_pid); + err |= __get_user(to->si_uid, &from->si_uid); + err |= __get_user(to->si_int, &from->si_int); + break; } } return err; diff --git a/arch/mips/kernel/signal32.c b/arch/mips/kernel/signal32.c index 5c1489f4fdc2..c52074f84300 100644 --- a/arch/mips/kernel/signal32.c +++ b/arch/mips/kernel/signal32.c @@ -358,7 +358,12 @@ static int copy_siginfo_to_user32(siginfo_t32 *to, siginfo_t *from) err |= __put_user(from->si_band, &to->si_band); err |= __put_user(from->si_fd, &to->si_fd); break; - /* case __SI_RT: This is not generated by the kernel as of now. */ + case __SI_RT: /* This is not generated by the kernel as of now. 
*/ + case __SI_MESGQ: + err |= __put_user(from->si_pid, &to->si_pid); + err |= __put_user(from->si_uid, &to->si_uid); + err |= __put_user(from->si_int, &to->si_int); + break; } } return err; diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c index 44fe6e477e92..373040404a5a 100644 --- a/arch/s390/kernel/compat_signal.c +++ b/arch/s390/kernel/compat_signal.c @@ -74,6 +74,10 @@ int copy_siginfo_to_user32(siginfo_t32 *to, siginfo_t *from) err |= __copy_to_user(&to->_sifields._pad, &from->_sifields._pad, SI_PAD_SIZE); else { switch (from->si_code >> 16) { + case __SI_RT: /* This is not generated by the kernel as of now. */ + case __SI_MESGQ: + err |= __put_user(from->si_int, &to->si_int); + /* fallthrough */ case __SI_KILL >> 16: err |= __put_user(from->si_pid, &to->si_pid); err |= __put_user(from->si_uid, &to->si_uid); @@ -96,7 +100,6 @@ int copy_siginfo_to_user32(siginfo_t32 *to, siginfo_t *from) break; default: break; - /* case __SI_RT: This is not generated by the kernel as of now. */ } } return err; diff --git a/arch/sparc64/kernel/signal32.c b/arch/sparc64/kernel/signal32.c index cc3019d6dd65..e2f62a666d8c 100644 --- a/arch/sparc64/kernel/signal32.c +++ b/arch/sparc64/kernel/signal32.c @@ -129,7 +129,12 @@ int copy_siginfo_to_user32(siginfo_t32 __user *to, siginfo_t *from) err |= __put_user(from->si_trapno, &to->si_trapno); err |= __put_user((long)from->si_addr, &to->si_addr); break; - /* case __SI_RT: This is not generated by the kernel as of now. */ + case __SI_RT: /* This is not generated by the kernel as of now. 
*/ + case __SI_MESGQ: + err |= __put_user(from->si_pid, &to->si_pid); + err |= __put_user(from->si_uid, &to->si_uid); + err |= __put_user(from->si_int, &to->si_int); + break; } } return err; diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c index bce5fbc5be2c..1a828de6a55d 100644 --- a/arch/x86_64/ia32/ia32_signal.c +++ b/arch/x86_64/ia32/ia32_signal.c @@ -85,7 +85,11 @@ int ia32_copy_siginfo_to_user(siginfo_t32 __user *to, siginfo_t *from) err |= __put_user(from->si_overrun, &to->si_overrun); err |= __put_user((u32)(u64)from->si_ptr, &to->si_ptr); break; - /* case __SI_RT: This is not generated by the kernel as of now. */ + case __SI_RT: /* This is not generated by the kernel as of now. */ + case __SI_MESGQ: + err |= __put_user(from->si_uid, &to->si_uid); + err |= __put_user(from->si_int, &to->si_int); + break; } } return err; diff --git a/include/asm-ppc64/ppc32.h b/include/asm-ppc64/ppc32.h index 53865a8c4f8d..7338ea298a19 100644 --- a/include/asm-ppc64/ppc32.h +++ b/include/asm-ppc64/ppc32.h @@ -141,20 +141,6 @@ struct ucontext32 { struct mcontext32 uc_mcontext; }; -typedef struct compat_sigevent { - compat_sigval_t sigev_value; - int sigev_signo; - int sigev_notify; - union { - int _pad[SIGEV_PAD_SIZE]; - int _tid; - struct { - compat_uptr_t _function; - compat_uptr_t _attribute; - } _sigev_thread; - } _sigev_un; -} compat_sigevent_t; - struct ipc_kludge_32 { unsigned int msgp; int msgtyp; diff --git a/include/linux/compat.h b/include/linux/compat.h index 7b82209ab4ab..796204f59bd9 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -13,6 +13,7 @@ #include #include +#include #define compat_jiffies_to_clock_t(x) \ (((unsigned long)(x) * COMPAT_USER_HZ) / HZ) @@ -90,6 +91,22 @@ typedef union compat_sigval { compat_uptr_t sival_ptr; } compat_sigval_t; +typedef struct compat_sigevent { + compat_sigval_t sigev_value; + compat_int_t sigev_signo; + compat_int_t sigev_notify; + union { + compat_int_t _pad[SIGEV_PAD_SIZE]; + 
compat_int_t _tid; + + struct { + compat_uptr_t _function; + compat_uptr_t _attribute; + } _sigev_thread; + } _sigev_un; +} compat_sigevent_t; + + long compat_sys_semctl(int first, int second, int third, void __user *uptr); long compat_sys_msgsnd(int first, int second, int third, void __user *uptr); long compat_sys_msgrcv(int first, int second, int msgtyp, int third, diff --git a/include/linux/mqueue.h b/include/linux/mqueue.h index fdab3b8ee242..fc40b774b913 100644 --- a/include/linux/mqueue.h +++ b/include/linux/mqueue.h @@ -18,9 +18,9 @@ #ifndef _LINUX_MQUEUE_H #define _LINUX_MQUEUE_H -#define MQ_PRIO_MAX 32768 +#include -typedef int mqd_t; +#define MQ_PRIO_MAX 32768 struct mq_attr { long mq_flags; /* message queue flags */ diff --git a/include/linux/posix_types.h b/include/linux/posix_types.h index 3ee2ed9de1db..f04c98cf44f3 100644 --- a/include/linux/posix_types.h +++ b/include/linux/posix_types.h @@ -42,6 +42,7 @@ typedef void (*__kernel_sighandler_t)(int); /* Type of a SYSV IPC key. 
*/ typedef int __kernel_key_t; +typedef int __kernel_mqd_t; #include diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 7ee5f67abb5f..89ffe55898f2 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -48,7 +48,6 @@ struct timex; struct timezone; struct tms; struct utimbuf; -typedef int mqd_t; struct mq_attr; #include diff --git a/include/linux/types.h b/include/linux/types.h index 3b407b06b48f..93f5f3653561 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -31,6 +31,7 @@ typedef __kernel_key_t key_t; typedef __kernel_suseconds_t suseconds_t; typedef __kernel_timer_t timer_t; typedef __kernel_clockid_t clockid_t; +typedef __kernel_mqd_t mqd_t; #ifdef __KERNEL__ typedef __kernel_uid32_t uid_t; diff --git a/ipc/Makefile b/ipc/Makefile index 913790207d85..0a6d626cd794 100644 --- a/ipc/Makefile +++ b/ipc/Makefile @@ -4,5 +4,6 @@ obj-$(CONFIG_SYSVIPC_COMPAT) += compat.o obj-$(CONFIG_SYSVIPC) += util.o msgutil.o msg.o sem.o shm.o -obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o +obj_mq-$(CONFIG_COMPAT) += compat_mq.o +obj-$(CONFIG_POSIX_MQUEUE) += mqueue.o msgutil.o $(obj_mq-y) diff --git a/ipc/compat_mq.c b/ipc/compat_mq.c new file mode 100644 index 000000000000..1520df89c424 --- /dev/null +++ b/ipc/compat_mq.c @@ -0,0 +1,196 @@ +/* + * ipc/compat_mq.c + * 32 bit emulation for POSIX message queue system calls + * + * Copyright (C) 2004 IBM Deutschland Entwicklung GmbH, IBM Corporation + * Author: Arnd Bergmann + */ + +#include +#include +#include +#include +#include + +#include + +struct compat_mq_attr { + compat_long_t mq_flags; /* message queue flags */ + compat_long_t mq_maxmsg; /* maximum number of messages */ + compat_long_t mq_msgsize; /* maximum message size */ + compat_long_t mq_curmsgs; /* number of messages currently queued */ + compat_long_t __reserved[4]; /* ignored for input, zeroed for output */ +}; + +static inline int get_compat_mq_attr(struct mq_attr *attr, + const struct compat_mq_attr __user 
*uattr) +{ + if (verify_area(VERIFY_READ, uattr, sizeof *uattr)) + return -EFAULT; + + return __get_user(attr->mq_flags, &uattr->mq_flags) + | __get_user(attr->mq_maxmsg, &uattr->mq_maxmsg) + | __get_user(attr->mq_msgsize, &uattr->mq_msgsize) + | __get_user(attr->mq_curmsgs, &uattr->mq_curmsgs); +} + +static inline int put_compat_mq_attr(const struct mq_attr *attr, + struct compat_mq_attr __user *uattr) +{ + if (clear_user(uattr, sizeof *uattr)) + return -EFAULT; + + return __put_user(attr->mq_flags, &uattr->mq_flags) + | __put_user(attr->mq_maxmsg, &uattr->mq_maxmsg) + | __put_user(attr->mq_msgsize, &uattr->mq_msgsize) + | __put_user(attr->mq_curmsgs, &uattr->mq_curmsgs); +} + +asmlinkage long compat_sys_mq_open(const char __user *u_name, + int oflag, compat_mode_t mode, + struct compat_mq_attr __user *u_attr) +{ + struct mq_attr attr; + mm_segment_t oldfs; + char *name; + long ret; + + if ((oflag & O_CREAT) == 0 || !u_attr) + return sys_mq_open(u_name, oflag, mode, 0); + + if (get_compat_mq_attr(&attr, u_attr)) + return -EFAULT; + + name = getname(u_name); + if (IS_ERR(name)) + return PTR_ERR(name); + + oldfs = get_fs(); + set_fs(KERNEL_DS); + ret = sys_mq_open(name, oflag, mode, &attr); + set_fs(oldfs); + + putname(name); + return ret; +} + +static struct timespec __user *compat_prepare_timeout( + const struct compat_timespec __user *u_abs_timeout) +{ + struct timespec ts, __user *u_ts; + + if (!u_abs_timeout) + return 0; + + u_ts = compat_alloc_user_space(sizeof(*u_ts)); + if (get_compat_timespec(&ts, u_abs_timeout) + || copy_to_user(u_ts, &ts, sizeof(*u_ts))) + return ERR_PTR(-EFAULT); + + return u_ts; +} + +asmlinkage long compat_sys_mq_timedsend(mqd_t mqdes, + const char __user *u_msg_ptr, + size_t msg_len, unsigned int msg_prio, + const struct compat_timespec __user *u_abs_timeout) +{ + struct timespec __user *u_ts; + + u_ts = compat_prepare_timeout(u_abs_timeout); + if (IS_ERR(u_ts)) + return -EFAULT; + + return sys_mq_timedsend(mqdes, u_msg_ptr, msg_len, 
+ msg_prio, u_ts); +} + +asmlinkage ssize_t compat_sys_mq_timedreceive(mqd_t mqdes, + char __user *u_msg_ptr, + size_t msg_len, unsigned int __user *u_msg_prio, + const struct compat_timespec __user *u_abs_timeout) +{ + struct timespec *u_ts; + + u_ts = compat_prepare_timeout(u_abs_timeout); + if (IS_ERR(u_ts)) + return -EFAULT; + + return sys_mq_timedreceive(mqdes, u_msg_ptr, msg_len, + u_msg_prio, u_ts); +} + +static int get_compat_sigevent(struct sigevent *event, + const struct compat_sigevent __user *u_event) +{ + if (verify_area(VERIFY_READ, u_event, sizeof(*u_event))) + return -EFAULT; + + return __get_user(event->sigev_value.sival_int, + &u_event->sigev_value.sival_int) + | __get_user(event->sigev_signo, &u_event->sigev_signo) + | __get_user(event->sigev_notify, &u_event->sigev_notify) + | __get_user(event->sigev_notify_thread_id, + &u_event->sigev_notify_thread_id); +} + +asmlinkage long compat_sys_mq_notify(mqd_t mqdes, + const struct compat_sigevent __user *u_notification) +{ + mm_segment_t oldfs; + struct sigevent notification; + char cookie[NOTIFY_COOKIE_LEN]; + compat_uptr_t u_cookie; + long ret; + + if (!u_notification) + return sys_mq_notify(mqdes, 0); + + if (get_compat_sigevent(&notification, u_notification)) + return -EFAULT; + + if (notification.sigev_notify == SIGEV_THREAD) { + u_cookie = (compat_uptr_t)notification.sigev_value.sival_int; + if (copy_from_user(cookie, compat_ptr(u_cookie), + NOTIFY_COOKIE_LEN)) { + return -EFAULT; + } + notification.sigev_value.sival_ptr = cookie; + } + + oldfs = get_fs(); + set_fs(KERNEL_DS); + ret = sys_mq_notify(mqdes, &notification); + set_fs(oldfs); + + return ret; +} + +asmlinkage long compat_sys_mq_getsetattr(mqd_t mqdes, + const struct compat_mq_attr __user *u_mqstat, + struct compat_mq_attr __user *u_omqstat) +{ + struct mq_attr mqstat, omqstat; + struct mq_attr *p_mqstat = 0, *p_omqstat = 0; + mm_segment_t oldfs; + long ret; + + if (u_mqstat) { + p_mqstat = &mqstat; + if (get_compat_mq_attr(p_mqstat, 
u_mqstat)) + return -EFAULT; + } + + if (u_omqstat) + p_omqstat = &omqstat; + + oldfs = get_fs(); + set_fs(KERNEL_DS); + ret = sys_mq_getsetattr(mqdes, p_mqstat, p_omqstat); + set_fs(oldfs); + + if (ret) + return ret; + + return (u_omqstat) ? put_compat_mq_attr(&omqstat, u_omqstat) : 0; +} diff --git a/kernel/sys.c b/kernel/sys.c index 7d1bf5c57aca..81f9e02f2071 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -266,6 +266,11 @@ cond_syscall(sys_mq_timedsend) cond_syscall(sys_mq_timedreceive) cond_syscall(sys_mq_notify) cond_syscall(sys_mq_getsetattr) +cond_syscall(compat_sys_mq_open) +cond_syscall(compat_sys_mq_timedsend) +cond_syscall(compat_sys_mq_timedreceive) +cond_syscall(compat_sys_mq_notify) +cond_syscall(compat_sys_mq_getsetattr) /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read) -- cgit v1.2.3 From 0ab2d6681c4e8502990523d46d928f37b764d52d Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sun, 11 Apr 2004 22:56:34 -0700 Subject: [PATCH] IPMI driver updates From: Corey Minyard - Add support for messaging through an IPMI LAN interface, which is required for some system software that already exists on other IPMI drivers. It also does some renaming and a lot of little cleanups. - Add the "System Interface" driver. The previous driver for system interfaces only supported the KCS interface, this driver supports all system interfaces defined in the IPMI standard. It also does a much better job of handling ACPI and SMBIOS tables for detecting IPMI system interfaces. 
--- Documentation/IPMI.txt | 218 +++- drivers/char/ipmi/Kconfig | 8 +- drivers/char/ipmi/Makefile | 9 +- drivers/char/ipmi/ipmi_bt_sm.c | 513 +++++++++ drivers/char/ipmi/ipmi_devintf.c | 197 ++-- drivers/char/ipmi/ipmi_kcs_intf.c | 1305 ---------------------- drivers/char/ipmi/ipmi_kcs_sm.c | 156 +-- drivers/char/ipmi/ipmi_kcs_sm.h | 70 -- drivers/char/ipmi/ipmi_msghandler.c | 1292 +++++++++++++++++++--- drivers/char/ipmi/ipmi_si_intf.c | 2052 +++++++++++++++++++++++++++++++++++ drivers/char/ipmi/ipmi_si_sm.h | 117 ++ drivers/char/ipmi/ipmi_smic_sm.c | 599 ++++++++++ drivers/char/ipmi/ipmi_watchdog.c | 122 +-- include/linux/ipmi.h | 131 ++- include/linux/ipmi_msgdefs.h | 36 +- include/linux/ipmi_smi.h | 14 +- 16 files changed, 5013 insertions(+), 1826 deletions(-) create mode 100644 drivers/char/ipmi/ipmi_bt_sm.c delete mode 100644 drivers/char/ipmi/ipmi_kcs_intf.c delete mode 100644 drivers/char/ipmi/ipmi_kcs_sm.h create mode 100644 drivers/char/ipmi/ipmi_si_intf.c create mode 100644 drivers/char/ipmi/ipmi_si_sm.h create mode 100644 drivers/char/ipmi/ipmi_smic_sm.c (limited to 'include') diff --git a/Documentation/IPMI.txt b/Documentation/IPMI.txt index 825e83cb4acc..ec8a6fa2c34b 100644 --- a/Documentation/IPMI.txt +++ b/Documentation/IPMI.txt @@ -22,6 +22,58 @@ are not familiar with IPMI itself, see the web site at http://www.intel.com/design/servers/ipmi/index.htm. IPMI is a big subject and I can't cover it all here! +Configuration +------------- + +The LinuxIPMI driver is modular, which means you have to pick several +things to have it work right depending on your hardware. Most of +these are available in the 'Character Devices' menu. + +No matter what, you must pick 'IPMI top-level message handler' to use +IPMI. What you do beyond that depends on your needs and hardware. + +The message handler does not provide any user-level interfaces. +Kernel code (like the watchdog) can still use it. 
If you need access +from userland, you need to select 'Device interface for IPMI' if you +want access through a device driver. Another interface is also +available, you may select 'IPMI sockets' in the 'Networking Support' +main menu. This provides a socket interface to IPMI. You may select +both of these at the same time, they will both work together. + +The driver interface depends on your hardware. If you have a board +with a standard interface (These will generally be either "KCS", +"SMIC", or "BT", consult your hardware manual), choose the 'IPMI SI +handler' option. A driver also exists for direct I2C access to the +IPMI management controller. Some boards support this, but it is +unknown if it will work on every board. For this, choose 'IPMI SMBus +handler', but be ready to try to do some figuring to see if it will +work. + +There is also a KCS-only driver interface supplied, but it is +deprecated in favor of the SI interface. + +You should generally enable ACPI on your system, as systems with IPMI +should have ACPI tables describing them. + +If you have a standard interface and the board manufacturer has done +their job correctly, the IPMI controller should be automatically +detected (via ACPI or SMBIOS tables) and should just work. Sadly, many +boards do not have this information. The driver attempts standard +defaults, but they may not work. If you fall into this situation, you +need to read the section below named 'The SI Driver' on how to +hand-configure your system. + +IPMI defines a standard watchdog timer. You can enable this with the +'IPMI Watchdog Timer' config option. If you compile the driver into +the kernel, then via a kernel command-line option you can have the +watchdog timer start as soon as it initializes. It also has a lot +of other options, see the 'Watchdog' section below for more details. +Note that you can also have the watchdog continue to run if it is +closed (by default it is disabled on close).
Go into the 'Watchdog +Cards' menu, enable 'Watchdog Timer Support', and enable the option +'Disable watchdog shutdown on close'. + + Basic Design ------------ @@ -41,18 +93,30 @@ ipmi_devintf - This provides a userland IOCTL interface for the IPMI driver, each open file for this device ties in to the message handler as an IPMI user. -ipmi_kcs_drv - A driver for the KCS SMI. Most system have a KCS -interface for IPMI. +ipmi_si - A driver for various system interfaces. This supports +KCS, SMIC, and may support BT in the future. Unless you have your own +custom interface, you probably need to use this. + +ipmi_smb - A driver for accessing BMCs on the SMBus. It uses the +I2C kernel driver's SMBus interfaces to send and receive IPMI messages +over the SMBus. + +af_ipmi - A network socket interface to IPMI. This doesn't take up +a character device in your system. +Note that the KCS-only interface has been removed. Much documentation for the interface is in the include files. The IPMI include files are: -ipmi.h - Contains the user interface and IOCTL interface for IPMI. +net/af_ipmi.h - Contains the socket interface. -ipmi_smi.h - Contains the interface for SMI drivers to use. +linux/ipmi.h - Contains the user interface and IOCTL interface for IPMI. -ipmi_msgdefs.h - General definitions for base IPMI messaging. +linux/ipmi_smi.h - Contains the interface for system management interfaces +(things that interface to IPMI controllers) to use. + +linux/ipmi_msgdefs.h - General definitions for base IPMI messaging. Addressing @@ -260,70 +324,131 @@ they register with the message handler. They are generally assigned in the order they register, although if an SMI unregisters and then another one registers, all bets are off. -The ipmi_smi.h defines the interface for SMIs, see that for more -details. +The ipmi_smi.h defines the interface for management interfaces, see +that for more details.
-The KCS Driver --------------- +The SI Driver +------------- -The KCS driver allows up to 4 KCS interfaces to be configured in the -system. By default, the driver will register one KCS interface at the -spec-specified I/O port 0xca2 without interrupts. You can change this -at module load time (for a module) with: +The SI driver allows up to 4 KCS or SMIC interfaces to be configured +in the system. By default, scan the ACPI tables for interfaces, and +if it doesn't find any the driver will attempt to register one KCS +interface at the spec-specified I/O port 0xca2 without interrupts. +You can change this at module load time (for a module) with: + + modprobe ipmi_si.o type=,.... + ports=,... addrs=,... + irqs=,... trydefaults=[0|1] + +Each of these except si_trydefaults is a list, the first item for the +first interface, second item for the second interface, etc. - insmod ipmi_kcs_drv.o kcs_ports=,... kcs_addrs=, - kcs_irqs=,... kcs_trydefaults=[0|1] +The si_type may be either "kcs", "smic", or "bt". If you leave it blank, it +defaults to "kcs". -The KCS driver supports two types of interfaces, ports (for I/O port -based KCS interfaces) and memory addresses (for KCS interfaces in -memory). The driver will support both of them simultaneously, setting -the port to zero (or just not specifying it) will allow the memory -address to be used. The port will override the memory address if it -is specified and non-zero. kcs_trydefaults sets whether the standard -IPMI interface at 0xca2 and any interfaces specified by ACPE are -tried. By default, the driver tries it, set this value to zero to -turn this off. +If you specify si_addrs as non-zero for an interface, the driver will +use the memory address given as the address of the device. This +overrides si_ports. + +If you specify si_ports as non-zero for an interface, the driver will +use the I/O port given as the device address. 
+ +If you specify si_irqs as non-zero for an interface, the driver will +attempt to use the given interrupt for the device. + +si_trydefaults sets whether the standard IPMI interface at 0xca2 and +any interfaces specified by ACPI are tried. By default, the driver +tries it, set this value to zero to turn this off. When compiled into the kernel, the addresses can be specified on the kernel command line as: - ipmi_kcs=:,:....,[nodefault] + ipmi_si.type=,... + ipmi_si.ports=,... ipmi_si.addrs=,... + ipmi_si.irqs=,... ipmi_si.trydefaults=[0|1] -The values is either "p" or "m" for port or memory -addresses. So for instance, a KCS interface at port 0xca2 using -interrupt 9 and a memory interface at address 0xf9827341 with no -interrupt would be specified "ipmi_kcs=p0xca2:9,m0xf9827341". -If you specify zero for in irq or don't specify it, the driver will -run polled unless the software can detect the interrupt to use in the -ACPI tables. +It works the same as the module parameters of the same names. -By default, the driver will attempt to detect a KCS device at the -spec-specified 0xca2 address and any address specified by ACPI. If -you want to turn this off, use the "nodefault" option. +By default, the driver will attempt to detect any device specified by +ACPI, and if none of those then a KCS device at the spec-specified +0xca2. If you want to turn this off, set the "trydefaults" option to +false. If you have high-res timers compiled into the kernel, the driver will use them to provide much better performance. Note that if you do not have high-res timers enabled in the kernel and you don't have interrupts enabled, the driver will run VERY slowly. Don't blame me, -the KCS interface sucks. +these interfaces suck. + + +The SMBus Driver +---------------- + +The SMBus driver allows up to 4 SMBus devices to be configured in the +system. By default, the driver will register any SMBus interfaces it finds +in the I2C address range of 0x20 to 0x4f on any adapter.
You can change this +at module load time (for a module) with: + + modprobe ipmi_smb.o + addr=,[,,[,...]] + dbg=,... + [defaultprobe=0] [dbg_probe=1] + +The addresses are specified in pairs, the first is the adapter ID and the +second is the I2C address on that adapter. + +The debug flags are bit flags for each BMC found, they are: +IPMI messages: 1, driver state: 2, timing: 4, I2C probe: 8 + +Setting smb_defaultprobe to zero disables the default probing of SMBus +interfaces at address range 0x20 to 0x4f. This means that only the +BMCs specified on the smb_addr line will be detected. + +Setting smb_dbg_probe to 1 will enable debugging of the probing and +detection process for BMCs on the SMBusses. + +Discovering the IPMI compliant BMC on the SMBus can cause devices +on the I2C bus to fail. The SMBus driver writes a "Get Device ID" IPMI +message as a block write to the I2C bus and waits for a response. +This action can be detrimental to some I2C devices. It is highly recommended +that the known I2C address be given to the SMBus driver in the smb_addr +parameter. The default address range will not be used when an smb_addr +parameter is provided. + +When compiled into the kernel, the addresses can be specified on the +kernel command line as: + + ipmi_smb.addr=,[,,[,...]] + ipmi_smb.dbg=,... + ipmi_smb.defaultprobe=0 ipmi_smb.dbg_probe=1 + +These are the same options as on the module command line. + +Note that you might need some I2C changes if CONFIG_IPMI_PANIC_EVENT +is enabled along with this, so the I2C driver knows to run to +completion during sending a panic event. Other Pieces ------------ Watchdog +-------- A watchdog timer is provided that implements the Linux-standard watchdog timer interface.
It has three module parameters that can be used to control it: - insmod ipmi_watchdog timeout= pretimeout= action= - preaction= preop= + modprobe ipmi_watchdog timeout= pretimeout= action= + preaction= preop= start_now=x The timeout is the number of seconds to the action, and the pretimeout is the amount of seconds before the reset that the pre-timeout panic will -occur (if pretimeout is zero, then pretimeout will not be enabled). +occur (if pretimeout is zero, then pretimeout will not be enabled). Note +that the pretimeout is the time before the final timeout. So if the +timeout is 50 seconds and the pretimeout is 10 seconds, then the pretimeout +will occur in 40 seconds (10 seconds before the timeout). The action may be "reset", "power_cycle", or "power_off", and specifies what to do when the timer times out, and defaults to @@ -344,16 +469,19 @@ When preop is set to "preop_give_data", one byte comes ready to read on the device when the pretimeout occurs. Select and fasync work on the device, as well. +If start_now is set to 1, the watchdog timer will start running as +soon as the driver is loaded. + When compiled into the kernel, the kernel command line is available for configuring the watchdog: - ipmi_wdog=[,[,