From baf74405636c17b7976f7a08160d4127c8dd483b Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Mon, 17 Jun 2002 19:24:40 -0700
Subject: More IDE locking fixes.

Found by Nick Piggin.
---
 drivers/ide/ioctl.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/ide/ioctl.c b/drivers/ide/ioctl.c
index b986555fd4f3..609ed7dcfa56 100644
--- a/drivers/ide/ioctl.c
+++ b/drivers/ide/ioctl.c
@@ -345,8 +345,9 @@ int ata_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned
 	if (!arg) {
 		if (ide_spin_wait_hwgroup(drive))
 			return -EBUSY;
-		else
-			return 0;
+		/* Do nothing, just unlock */
+		spin_unlock_irq(drive->channel->lock);
+		return 0;
 	}
 	return do_cmd_ioctl(drive, arg);
--
cgit v1.2.3

From 68d6275b42055147e1e885e87424ee61dffa34e8 Mon Sep 17 00:00:00 2001
From: Stelian Pop
Date: Mon, 17 Jun 2002 19:25:25 -0700
Subject: [PATCH] export pci_bus_type to modules.

This exports the pci_bus_type symbol to modules, needed by (at least)
the recent changes in pcmcia/cardbus.c.
---
 drivers/pci/pci-driver.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 0260ccf2092a..db4cdb8e3ad4 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -210,3 +210,4 @@ EXPORT_SYMBOL(pci_match_device);
 EXPORT_SYMBOL(pci_register_driver);
 EXPORT_SYMBOL(pci_unregister_driver);
 EXPORT_SYMBOL(pci_dev_driver);
+EXPORT_SYMBOL(pci_bus_type);
--
cgit v1.2.3

From c8712aebdb2455a095d596c1a5a6af358b3efde3 Mon Sep 17 00:00:00 2001
From: Andi Kleen
Date: Mon, 17 Jun 2002 19:43:55 -0700
Subject: [PATCH] change_page_attr and AGP update

Add change_page_attr to change page attributes for the kernel linear map.

Fix AGP driver to use change_page_attr for the AGP buffer.

Clean up AGP driver a bit (only tested on i386/VIA+AMD)

Change ioremap_nocache to use change_page_attr to avoid mappings with
conflicting caching attributes.
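
The calling convention this patch establishes is: change the attributes of
the linear-map pages, then flush the TLBs on every CPU. Below is a minimal
sketch of a driver-side caller, assuming only the interfaces the patch
itself adds (change_page_attr() and global_flush_tlb(), declared in
asm/cacheflush.h); the function and variable names are invented for
illustration and are not part of the patch:

#include <linux/mm.h>
#include <asm/cacheflush.h>	/* change_page_attr(), global_flush_tlb() */
#include <asm/pgtable.h>	/* PAGE_KERNEL, PAGE_KERNEL_NOCACHE */

/* Hypothetical example: give a device an uncached view of one page. */
static int map_buffer_uncached(struct page *buf_page)
{
	/* Rewrite the kernel linear-map PTE for this page as uncacheable,
	 * so no cached alias of the same memory remains. */
	int err = change_page_attr(buf_page, 1, PAGE_KERNEL_NOCACHE);
	if (err)
		return err;
	/* Per the API contract, the caller flushes stale TLB entries. */
	global_flush_tlb();
	return 0;
}

static void unmap_buffer_uncached(struct page *buf_page)
{
	/* Revert to the normal cached kernel mapping. */
	change_page_attr(buf_page, 1, PAGE_KERNEL);
	global_flush_tlb();
}

This is the same pattern the i386 AGP macros below wrap: map_page_into_agp()
and unmap_page_from_agp() call change_page_attr(), and flush_agp_mappings()
supplies the global_flush_tlb().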
--- arch/i386/mm/Makefile | 3 +- arch/i386/mm/ioremap.c | 69 ++++++++++++- arch/i386/mm/pageattr.c | 197 ++++++++++++++++++++++++++++++++++++++ drivers/char/agp/agp.h | 4 +- drivers/char/agp/agpgart_be.c | 155 +++++++++++------------------- include/asm-alpha/agp.h | 11 +++ include/asm-i386/agp.h | 23 +++++ include/asm-i386/cacheflush.h | 3 + include/asm-i386/io.h | 26 +---- include/asm-i386/page.h | 3 + include/asm-i386/pgtable-2level.h | 1 + include/asm-i386/pgtable-3level.h | 2 + include/asm-i386/pgtable.h | 3 + include/asm-ia64/agp.h | 11 +++ include/asm-sparc64/agp.h | 11 +++ include/asm-x86_64/agp.h | 23 +++++ include/asm-x86_64/cacheflush.h | 3 + include/linux/vmalloc.h | 3 + mm/vmalloc.c | 28 ++++-- 19 files changed, 443 insertions(+), 136 deletions(-) create mode 100644 arch/i386/mm/pageattr.c create mode 100644 include/asm-alpha/agp.h create mode 100644 include/asm-i386/agp.h create mode 100644 include/asm-ia64/agp.h create mode 100644 include/asm-sparc64/agp.h create mode 100644 include/asm-x86_64/agp.h diff --git a/arch/i386/mm/Makefile b/arch/i386/mm/Makefile index 73e25bd3022a..67df8b6f6594 100644 --- a/arch/i386/mm/Makefile +++ b/arch/i386/mm/Makefile @@ -9,6 +9,7 @@ O_TARGET := mm.o -obj-y := init.o fault.o ioremap.o extable.o +obj-y := init.o fault.o ioremap.o extable.o pageattr.o +export-objs := pageattr.o include $(TOPDIR)/Rules.make diff --git a/arch/i386/mm/ioremap.c b/arch/i386/mm/ioremap.c index f81fae4ff7a9..4ba5641b271f 100644 --- a/arch/i386/mm/ioremap.c +++ b/arch/i386/mm/ioremap.c @@ -10,12 +10,13 @@ #include #include +#include #include #include #include #include #include - +#include static inline void remap_area_pte(pte_t * pte, unsigned long address, unsigned long size, unsigned long phys_addr, unsigned long flags) @@ -155,6 +156,7 @@ void * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flag area = get_vm_area(size, VM_IOREMAP); if (!area) return NULL; + area->phys_addr = phys_addr; addr = area->addr; if (remap_area_pages(VMALLOC_VMADDR(addr), phys_addr, size, flags)) { vfree(addr); @@ -163,10 +165,71 @@ void * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flag return (void *) (offset + (char *)addr); } + +/** + * ioremap_nocache - map bus memory into CPU space + * @offset: bus address of the memory + * @size: size of the resource to map + * + * ioremap_nocache performs a platform specific sequence of operations to + * make bus memory CPU accessible via the readb/readw/readl/writeb/ + * writew/writel functions and the other mmio helpers. The returned + * address is not guaranteed to be usable directly as a virtual + * address. + * + * This version of ioremap ensures that the memory is marked uncachable + * on the CPU as well as honouring existing caching rules from things like + * the PCI bus. Note that there are other caches and buffers on many + * busses. In particular driver authors should read up on PCI writes + * + * It's useful if some control registers are in such an area and + * write combining or read caching is not desirable: + * + * Must be freed with iounmap. 
+ */ + +void *ioremap_nocache (unsigned long phys_addr, unsigned long size) +{ + void *p = __ioremap(phys_addr, size, _PAGE_PCD); + if (!p) + return p; + + if (phys_addr + size < virt_to_phys(high_memory)) { + struct page *ppage = virt_to_page(__va(phys_addr)); + unsigned long npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + + BUG_ON(phys_addr+size > (unsigned long)high_memory); + BUG_ON(phys_addr + size < phys_addr); + + if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) { + iounmap(p); + p = NULL; + } + } + + return p; +} + void iounmap(void *addr) { - if (addr > high_memory) - return vfree((void *) (PAGE_MASK & (unsigned long) addr)); + struct vm_struct *p; + if (addr < high_memory) + return; + p = remove_kernel_area(addr); + if (!p) { + printk("__iounmap: bad address %p\n", addr); + return; + } + + BUG_ON(p->phys_addr == 0); /* not allocated with ioremap */ + + vmfree_area_pages(VMALLOC_VMADDR(p->addr), p->size); + if (p->flags && p->phys_addr < virt_to_phys(high_memory)) { + change_page_attr(virt_to_page(__va(p->phys_addr)), + p->size >> PAGE_SHIFT, + PAGE_KERNEL); + } + kfree(p); } void __init *bt_ioremap(unsigned long phys_addr, unsigned long size) diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c new file mode 100644 index 000000000000..c5e2374b6bc7 --- /dev/null +++ b/arch/i386/mm/pageattr.c @@ -0,0 +1,197 @@ +/* + * Copyright 2002 Andi Kleen, SuSE Labs. + * Thanks to Ben LaHaise for precious feedback. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +static inline pte_t *lookup_address(unsigned long address) +{ + pgd_t *pgd = pgd_offset_k(address); + pmd_t *pmd = pmd_offset(pgd, address); + if (pmd_large(*pmd)) + return (pte_t *)pmd; + return pte_offset_kernel(pmd, address); +} + +static struct page *split_large_page(unsigned long address, pgprot_t prot) +{ + int i; + unsigned long addr; + struct page *base = alloc_pages(GFP_KERNEL, 0); + pte_t *pbase; + if (!base) + return NULL; + address = __pa(address); + addr = address & LARGE_PAGE_MASK; + pbase = (pte_t *)page_address(base); + for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) { + pbase[i] = pfn_pte(addr >> PAGE_SHIFT, + addr == address ? prot : PAGE_KERNEL); + } + return base; +} + +static void flush_kernel_map(void *dummy) +{ + /* Could use CLFLUSH here if the CPU supports it (Hammer,P4) */ + if (boot_cpu_data.x86_model >= 4) + asm volatile("wbinvd":::"memory"); + /* Flush all to work around Errata in early athlons regarding + * large page flushing. + */ + __flush_tlb_all(); +} + +static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) +{ + set_pte_atomic(kpte, pte); /* change init_mm */ +#ifndef CONFIG_X86_PAE + { + struct list_head *l; + spin_lock(&mmlist_lock); + list_for_each(l, &init_mm.mmlist) { + struct mm_struct *mm = list_entry(l, struct mm_struct, mmlist); + pmd_t *pmd = pmd_offset(pgd_offset(mm, address), address); + set_pte_atomic((pte_t *)pmd, pte); + } + spin_unlock(&mmlist_lock); + } +#endif +} + +/* + * No more special protections in this 2/4MB area - revert to a + * large page again. 
+ */ +static inline void revert_page(struct page *kpte_page, unsigned long address) +{ + pte_t *linear = (pte_t *) + pmd_offset(pgd_offset(&init_mm, address), address); + set_pmd_pte(linear, address, + pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT, + PAGE_KERNEL_LARGE)); +} + +static int +__change_page_attr(struct page *page, pgprot_t prot, struct page **oldpage) +{ + pte_t *kpte; + unsigned long address; + struct page *kpte_page; + +#ifdef CONFIG_HIGHMEM + if (page >= highmem_start_page) + BUG(); +#endif + address = (unsigned long)page_address(page); + + kpte = lookup_address(address); + kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK); + if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) { + if ((pte_val(*kpte) & _PAGE_PSE) == 0) { + pte_t old = *kpte; + pte_t standard = mk_pte(page, PAGE_KERNEL); + + set_pte_atomic(kpte, mk_pte(page, prot)); + if (pte_same(old,standard)) + atomic_inc(&kpte_page->count); + } else { + struct page *split = split_large_page(address, prot); + if (!split) + return -ENOMEM; + set_pmd_pte(kpte,address,mk_pte(split, PAGE_KERNEL)); + } + } else if ((pte_val(*kpte) & _PAGE_PSE) == 0) { + set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL)); + atomic_dec(&kpte_page->count); + } + + if (cpu_has_pse && (atomic_read(&kpte_page->count) == 1)) { + *oldpage = kpte_page; + revert_page(kpte_page, address); + } + return 0; +} + +static inline void flush_map(void) +{ +#ifdef CONFIG_SMP + smp_call_function(flush_kernel_map, NULL, 1, 1); +#endif + flush_kernel_map(NULL); +} + +struct deferred_page { + struct deferred_page *next; + struct page *fpage; +}; +static struct deferred_page *df_list; /* protected by init_mm.mmap_sem */ + +/* + * Change the page attributes of an page in the linear mapping. + * + * This should be used when a page is mapped with a different caching policy + * than write-back somewhere - some CPUs do not like it when mappings with + * different caching policies exist. This changes the page attributes of the + * in kernel linear mapping too. + * + * The caller needs to ensure that there are no conflicting mappings elsewhere. + * This function only deals with the kernel linear map. + * + * Caller must call global_flush_tlb() after this. 
+ */ +int change_page_attr(struct page *page, int numpages, pgprot_t prot) +{ + int err = 0; + struct page *fpage; + int i; + + down_write(&init_mm.mmap_sem); + for (i = 0; i < numpages; i++, page++) { + fpage = NULL; + err = __change_page_attr(page, prot, &fpage); + if (err) + break; + if (fpage) { + struct deferred_page *df; + df = kmalloc(sizeof(struct deferred_page), GFP_KERNEL); + if (!df) { + flush_map(); + __free_page(fpage); + } else { + df->next = df_list; + df->fpage = fpage; + df_list = df; + } + } + } + up_write(&init_mm.mmap_sem); + return err; +} + +void global_flush_tlb(void) +{ + struct deferred_page *df, *next_df; + + down_read(&init_mm.mmap_sem); + df = xchg(&df_list, NULL); + up_read(&init_mm.mmap_sem); + flush_map(); + for (; df; df = next_df) { + next_df = df->next; + if (df->fpage) + __free_page(df->fpage); + kfree(df); + } +} + +EXPORT_SYMBOL(change_page_attr); +EXPORT_SYMBOL(global_flush_tlb); diff --git a/drivers/char/agp/agp.h b/drivers/char/agp/agp.h index be8178161e80..94e405104df4 100644 --- a/drivers/char/agp/agp.h +++ b/drivers/char/agp/agp.h @@ -118,8 +118,8 @@ struct agp_bridge_data { int (*remove_memory) (agp_memory *, off_t, int); agp_memory *(*alloc_by_type) (size_t, int); void (*free_by_type) (agp_memory *); - unsigned long (*agp_alloc_page) (void); - void (*agp_destroy_page) (unsigned long); + void *(*agp_alloc_page) (void); + void (*agp_destroy_page) (void *); int (*suspend)(void); void (*resume)(void); diff --git a/drivers/char/agp/agpgart_be.c b/drivers/char/agp/agpgart_be.c index 10cc178c4d89..8ba761695215 100644 --- a/drivers/char/agp/agpgart_be.c +++ b/drivers/char/agp/agpgart_be.c @@ -22,6 +22,8 @@ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. * + * TODO: + * - Allocate more than order 0 pages to avoid too much linear map splitting. */ #include #include @@ -43,6 +45,7 @@ #include #include #include +#include #include #include "agp.h" @@ -59,56 +62,28 @@ EXPORT_SYMBOL(agp_enable); EXPORT_SYMBOL(agp_backend_acquire); EXPORT_SYMBOL(agp_backend_release); -static void flush_cache(void); - static struct agp_bridge_data agp_bridge; static int agp_try_unsupported __initdata = 0; - -static inline void flush_cache(void) -{ -#if defined(__i386__) || defined(__x86_64__) - asm volatile ("wbinvd":::"memory"); -#elif defined(__alpha__) || defined(__ia64__) || defined(__sparc__) - /* ??? I wonder if we'll really need to flush caches, or if the - core logic can manage to keep the system coherent. The ARM - speaks only of using `cflush' to get things in memory in - preparation for power failure. - - If we do need to call `cflush', we'll need a target page, - as we can only flush one page at a time. - - Ditto for IA-64. --davidm 00/08/07 */ - mb(); -#else -#error "Please define flush_cache." 
-#endif -} - #ifdef CONFIG_SMP -static atomic_t cpus_waiting; - static void ipi_handler(void *null) { - flush_cache(); - atomic_dec(&cpus_waiting); - while (atomic_read(&cpus_waiting) > 0) - barrier(); + flush_agp_cache(); } static void smp_flush_cache(void) { - atomic_set(&cpus_waiting, smp_num_cpus - 1); - if (smp_call_function(ipi_handler, NULL, 1, 0) != 0) + if (smp_call_function(ipi_handler, NULL, 1, 1) != 0) panic(PFX "timed out waiting for the other CPUs!\n"); - flush_cache(); - while (atomic_read(&cpus_waiting) > 0) - barrier(); + flush_agp_cache(); } #define global_cache_flush smp_flush_cache #else /* CONFIG_SMP */ -#define global_cache_flush flush_cache -#endif /* CONFIG_SMP */ +static void global_cache_flush(void) +{ + flush_agp_cache(); +} +#endif /* !CONFIG_SMP */ int agp_backend_acquire(void) { @@ -208,8 +183,7 @@ void agp_free_memory(agp_memory * curr) if (curr->page_count != 0) { for (i = 0; i < curr->page_count; i++) { curr->memory[i] &= ~(0x00000fff); - agp_bridge.agp_destroy_page((unsigned long) - phys_to_virt(curr->memory[i])); + agp_bridge.agp_destroy_page(phys_to_virt(curr->memory[i])); } } agp_free_key(curr->key); @@ -252,21 +226,22 @@ agp_memory *agp_allocate_memory(size_t page_count, u32 type) MOD_DEC_USE_COUNT; return NULL; } + for (i = 0; i < page_count; i++) { - new->memory[i] = agp_bridge.agp_alloc_page(); + void *addr = agp_bridge.agp_alloc_page(); - if (new->memory[i] == 0) { + if (addr == NULL) { /* Free this structure */ agp_free_memory(new); return NULL; } new->memory[i] = - agp_bridge.mask_memory( - virt_to_phys((void *) new->memory[i]), - type); + agp_bridge.mask_memory(virt_to_phys(addr), type); new->page_count++; } + flush_agp_mappings(); + return new; } @@ -561,6 +536,7 @@ static int agp_generic_create_gatt_table(void) agp_bridge.current_size; break; } + temp = agp_bridge.current_size; } else { agp_bridge.aperture_size_idx = i; } @@ -761,7 +737,7 @@ static void agp_generic_free_by_type(agp_memory * curr) * against a maximum value. */ -static unsigned long agp_generic_alloc_page(void) +static void *agp_generic_alloc_page(void) { struct page * page; @@ -769,24 +745,26 @@ static unsigned long agp_generic_alloc_page(void) if (page == NULL) return 0; + map_page_into_agp(page); + get_page(page); SetPageLocked(page); atomic_inc(&agp_bridge.current_memory_agp); - return (unsigned long)page_address(page); + return page_address(page); } -static void agp_generic_destroy_page(unsigned long addr) +static void agp_generic_destroy_page(void *addr) { - void *pt = (void *) addr; struct page *page; - if (pt == NULL) + if (addr == NULL) return; - page = virt_to_page(pt); + page = virt_to_page(addr); + unmap_page_from_agp(page); put_page(page); unlock_page(page); - free_page((unsigned long) pt); + free_page((unsigned long)addr); atomic_dec(&agp_bridge.current_memory_agp); } @@ -993,6 +971,7 @@ static agp_memory *intel_i810_alloc_by_type(size_t pg_count, int type) return new; } if(type == AGP_PHYS_MEMORY) { + void *addr; /* The I810 requires a physical address to program * it's mouse pointer into hardware. 
However the * Xserver still writes to it through the agp @@ -1007,17 +986,14 @@ static agp_memory *intel_i810_alloc_by_type(size_t pg_count, int type) return NULL; } MOD_INC_USE_COUNT; - new->memory[0] = agp_bridge.agp_alloc_page(); + addr = agp_bridge.agp_alloc_page(); - if (new->memory[0] == 0) { + if (addr == NULL) { /* Free this structure */ agp_free_memory(new); return NULL; } - new->memory[0] = - agp_bridge.mask_memory( - virt_to_phys((void *) new->memory[0]), - type); + new->memory[0] = agp_bridge.mask_memory(virt_to_phys(addr), type); new->page_count = 1; new->num_scratch_pages = 1; new->type = AGP_PHYS_MEMORY; @@ -1032,7 +1008,7 @@ static void intel_i810_free_by_type(agp_memory * curr) { agp_free_key(curr->key); if(curr->type == AGP_PHYS_MEMORY) { - agp_bridge.agp_destroy_page((unsigned long) + agp_bridge.agp_destroy_page( phys_to_virt(curr->memory[0])); vfree(curr->memory); } @@ -1291,7 +1267,7 @@ static agp_memory *intel_i830_alloc_by_type(size_t pg_count,int type) if (type == AGP_DCACHE_MEMORY) return(NULL); if (type == AGP_PHYS_MEMORY) { - unsigned long physical; + void *addr; /* The i830 requires a physical address to program * it's mouse pointer into hardware. However the @@ -1306,19 +1282,18 @@ static agp_memory *intel_i830_alloc_by_type(size_t pg_count,int type) if (nw == NULL) return(NULL); MOD_INC_USE_COUNT; - nw->memory[0] = agp_bridge.agp_alloc_page(); - physical = nw->memory[0]; - if (nw->memory[0] == 0) { + addr = agp_bridge.agp_alloc_page(); + if (addr == NULL) { /* free this structure */ agp_free_memory(nw); return(NULL); } - nw->memory[0] = agp_bridge.mask_memory(virt_to_phys((void *) nw->memory[0]),type); + nw->memory[0] = agp_bridge.mask_memory(virt_to_phys(addr),type); nw->page_count = 1; nw->num_scratch_pages = 1; nw->type = AGP_PHYS_MEMORY; - nw->physical = virt_to_phys((void *) physical); + nw->physical = virt_to_phys(addr); return(nw); } @@ -1849,16 +1824,17 @@ static int intel_i460_remove_memory(agp_memory * mem, off_t pg_start, int type) * Let's just hope nobody counts on the allocated AGP memory being there * before bind time (I don't think current drivers do)... */ -static unsigned long intel_i460_alloc_page(void) +static void * intel_i460_alloc_page(void) { if (intel_i460_cpk) return agp_generic_alloc_page(); /* Returning NULL would cause problems */ - return ~0UL; + /* AK: really dubious code. 
*/ + return (void *)~0UL; } -static void intel_i460_destroy_page(unsigned long page) +static void intel_i460_destroy_page(void *page) { if (intel_i460_cpk) agp_generic_destroy_page(page); @@ -3298,38 +3274,29 @@ static void ali_cache_flush(void) } } -static unsigned long ali_alloc_page(void) +static void *ali_alloc_page(void) { - struct page *page; - u32 temp; + void *adr = agp_generic_alloc_page(); + unsigned temp; - page = alloc_page(GFP_KERNEL); - if (page == NULL) + if (adr == 0) return 0; - get_page(page); - SetPageLocked(page); - atomic_inc(&agp_bridge.current_memory_agp); - - global_cache_flush(); - if (agp_bridge.type == ALI_M1541) { pci_read_config_dword(agp_bridge.dev, ALI_CACHE_FLUSH_CTRL, &temp); pci_write_config_dword(agp_bridge.dev, ALI_CACHE_FLUSH_CTRL, (((temp & ALI_CACHE_FLUSH_ADDR_MASK) | - virt_to_phys(page_address(page))) | + virt_to_phys(adr)) | ALI_CACHE_FLUSH_EN )); } - return (unsigned long)page_address(page); + return adr; } -static void ali_destroy_page(unsigned long addr) +static void ali_destroy_page(void * addr) { u32 temp; - void *pt = (void *) addr; - struct page *page; - if (pt == NULL) + if (addr == NULL) return; global_cache_flush(); @@ -3338,15 +3305,11 @@ static void ali_destroy_page(unsigned long addr) pci_read_config_dword(agp_bridge.dev, ALI_CACHE_FLUSH_CTRL, &temp); pci_write_config_dword(agp_bridge.dev, ALI_CACHE_FLUSH_CTRL, (((temp & ALI_CACHE_FLUSH_ADDR_MASK) | - virt_to_phys((void *)pt)) | + virt_to_phys(addr)) | ALI_CACHE_FLUSH_EN)); } - page = virt_to_page(pt); - put_page(page); - unlock_page(page); - free_page((unsigned long) pt); - atomic_dec(&agp_bridge.current_memory_agp); + agp_generic_destroy_page(addr); } /* Setup function */ @@ -5011,15 +4974,15 @@ static int __init agp_backend_initialize(void) } if (agp_bridge.needs_scratch_page == TRUE) { - agp_bridge.scratch_page = agp_bridge.agp_alloc_page(); + void *addr; + addr = agp_bridge.agp_alloc_page(); - if (agp_bridge.scratch_page == 0) { + if (addr == NULL) { printk(KERN_ERR PFX "unable to get memory for " "scratch page.\n"); return -ENOMEM; } - agp_bridge.scratch_page = - virt_to_phys((void *) agp_bridge.scratch_page); + agp_bridge.scratch_page = virt_to_phys(addr); agp_bridge.scratch_page = agp_bridge.mask_memory(agp_bridge.scratch_page, 0); } @@ -5064,8 +5027,7 @@ static int __init agp_backend_initialize(void) err_out: if (agp_bridge.needs_scratch_page == TRUE) { agp_bridge.scratch_page &= ~(0x00000fff); - agp_bridge.agp_destroy_page((unsigned long) - phys_to_virt(agp_bridge.scratch_page)); + agp_bridge.agp_destroy_page(phys_to_virt(agp_bridge.scratch_page)); } if (got_gatt) agp_bridge.free_gatt_table(); @@ -5084,8 +5046,7 @@ static void agp_backend_cleanup(void) if (agp_bridge.needs_scratch_page == TRUE) { agp_bridge.scratch_page &= ~(0x00000fff); - agp_bridge.agp_destroy_page((unsigned long) - phys_to_virt(agp_bridge.scratch_page)); + agp_bridge.agp_destroy_page(phys_to_virt(agp_bridge.scratch_page)); } } diff --git a/include/asm-alpha/agp.h b/include/asm-alpha/agp.h new file mode 100644 index 000000000000..ba05bdf9a211 --- /dev/null +++ b/include/asm-alpha/agp.h @@ -0,0 +1,11 @@ +#ifndef AGP_H +#define AGP_H 1 + +/* dummy for now */ + +#define map_page_into_agp(page) +#define unmap_page_from_agp(page) +#define flush_agp_mappings() +#define flush_agp_cache() mb() + +#endif diff --git a/include/asm-i386/agp.h b/include/asm-i386/agp.h new file mode 100644 index 000000000000..9ae97c09fb49 --- /dev/null +++ b/include/asm-i386/agp.h @@ -0,0 +1,23 @@ +#ifndef AGP_H +#define AGP_H 1 + 
+#include + +/* + * Functions to keep the agpgart mappings coherent with the MMU. + * The GART gives the CPU a physical alias of pages in memory. The alias region is + * mapped uncacheable. Make sure there are no conflicting mappings + * with different cachability attributes for the same page. This avoids + * data corruption on some CPUs. + */ + +#define map_page_into_agp(page) change_page_attr(page, 1, PAGE_KERNEL_NOCACHE) +#define unmap_page_from_agp(page) change_page_attr(page, 1, PAGE_KERNEL) +#define flush_agp_mappings() global_flush_tlb() + +/* Could use CLFLUSH here if the cpu supports it. But then it would + need to be called for each cacheline of the whole page so it may not be + worth it. Would need a page for it. */ +#define flush_agp_cache() asm volatile("wbinvd":::"memory") + +#endif diff --git a/include/asm-i386/cacheflush.h b/include/asm-i386/cacheflush.h index 58d027dfc5ff..319e65a7047f 100644 --- a/include/asm-i386/cacheflush.h +++ b/include/asm-i386/cacheflush.h @@ -15,4 +15,7 @@ #define flush_icache_page(vma,pg) do { } while (0) #define flush_icache_user_range(vma,pg,adr,len) do { } while (0) +void global_flush_tlb(void); +int change_page_attr(struct page *page, int numpages, pgprot_t prot); + #endif /* _I386_CACHEFLUSH_H */ diff --git a/include/asm-i386/io.h b/include/asm-i386/io.h index 44996d06ecc3..9922dd823c9c 100644 --- a/include/asm-i386/io.h +++ b/include/asm-i386/io.h @@ -121,31 +121,7 @@ static inline void * ioremap (unsigned long offset, unsigned long size) return __ioremap(offset, size, 0); } -/** - * ioremap_nocache - map bus memory into CPU space - * @offset: bus address of the memory - * @size: size of the resource to map - * - * ioremap_nocache performs a platform specific sequence of operations to - * make bus memory CPU accessible via the readb/readw/readl/writeb/ - * writew/writel functions and the other mmio helpers. The returned - * address is not guaranteed to be usable directly as a virtual - * address. - * - * This version of ioremap ensures that the memory is marked uncachable - * on the CPU as well as honouring existing caching rules from things like - * the PCI bus. Note that there are other caches and buffers on many - * busses. In paticular driver authors should read up on PCI writes - * - * It's useful if some control registers are in such an area and - * write combining or read caching is not desirable: - */ - -static inline void * ioremap_nocache (unsigned long offset, unsigned long size) -{ - return __ioremap(offset, size, _PAGE_PCD); -} - +extern void * ioremap_nocache (unsigned long offset, unsigned long size); extern void iounmap(void *addr); /* diff --git a/include/asm-i386/page.h b/include/asm-i386/page.h index 4737ef69ae18..d8e1f404c08b 100644 --- a/include/asm-i386/page.h +++ b/include/asm-i386/page.h @@ -6,6 +6,9 @@ #define PAGE_SIZE (1UL << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE-1)) +#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1)) +#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT) + #ifdef __KERNEL__ #ifndef __ASSEMBLY__ diff --git a/include/asm-i386/pgtable-2level.h b/include/asm-i386/pgtable-2level.h index e22db0cc6824..9f8bdc13adac 100644 --- a/include/asm-i386/pgtable-2level.h +++ b/include/asm-i386/pgtable-2level.h @@ -40,6 +40,7 @@ static inline int pgd_present(pgd_t pgd) { return 1; } * hook is made available. 
*/ #define set_pte(pteptr, pteval) (*(pteptr) = pteval) +#define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval) /* * (pmds are folded into pgds so this doesnt get actually called, * but the define is needed for a generic inline function.) diff --git a/include/asm-i386/pgtable-3level.h b/include/asm-i386/pgtable-3level.h index bb2eaea63fde..beb0c1bc3d30 100644 --- a/include/asm-i386/pgtable-3level.h +++ b/include/asm-i386/pgtable-3level.h @@ -49,6 +49,8 @@ static inline void set_pte(pte_t *ptep, pte_t pte) smp_wmb(); ptep->pte_low = pte.pte_low; } +#define set_pte_atomic(pteptr,pteval) \ + set_64bit((unsigned long long *)(pteptr),pte_val(pteval)) #define set_pmd(pmdptr,pmdval) \ set_64bit((unsigned long long *)(pmdptr),pmd_val(pmdval)) #define set_pgd(pgdptr,pgdval) \ diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h index f48db2beeeba..71b75fa234af 100644 --- a/include/asm-i386/pgtable.h +++ b/include/asm-i386/pgtable.h @@ -237,6 +237,9 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) #define pmd_page(pmd) \ (mem_map + (pmd_val(pmd) >> PAGE_SHIFT)) +#define pmd_large(pmd) \ + ((pmd_val(pmd) & (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT)) + /* to find an entry in a page-table-directory. */ #define pgd_index(address) ((address >> PGDIR_SHIFT) & (PTRS_PER_PGD-1)) diff --git a/include/asm-ia64/agp.h b/include/asm-ia64/agp.h new file mode 100644 index 000000000000..ba05bdf9a211 --- /dev/null +++ b/include/asm-ia64/agp.h @@ -0,0 +1,11 @@ +#ifndef AGP_H +#define AGP_H 1 + +/* dummy for now */ + +#define map_page_into_agp(page) +#define unmap_page_from_agp(page) +#define flush_agp_mappings() +#define flush_agp_cache() mb() + +#endif diff --git a/include/asm-sparc64/agp.h b/include/asm-sparc64/agp.h new file mode 100644 index 000000000000..ba05bdf9a211 --- /dev/null +++ b/include/asm-sparc64/agp.h @@ -0,0 +1,11 @@ +#ifndef AGP_H +#define AGP_H 1 + +/* dummy for now */ + +#define map_page_into_agp(page) +#define unmap_page_from_agp(page) +#define flush_agp_mappings() +#define flush_agp_cache() mb() + +#endif diff --git a/include/asm-x86_64/agp.h b/include/asm-x86_64/agp.h new file mode 100644 index 000000000000..8c2fabe80419 --- /dev/null +++ b/include/asm-x86_64/agp.h @@ -0,0 +1,23 @@ +#ifndef AGP_H +#define AGP_H 1 + +#include + +/* + * Functions to keep the agpgart mappings coherent. + * The GART gives the CPU a physical alias of memory. The alias is + * mapped uncacheable. Make sure there are no conflicting mappings + * with different cachability attributes for the same page. + */ + +#define map_page_into_agp(page) \ + change_page_attr(page, __pgprot(__PAGE_KERNEL | _PAGE_PCD)) +#define unmap_page_from_agp(page) change_page_attr(page, PAGE_KERNEL) +#define flush_agp_mappings() global_flush_tlb() + +/* Could use CLFLUSH here if the cpu supports it. But then it would + need to be called for each cacheline of the whole page so it may not be + worth it. Would need a page for it. 
*/ +#define flush_agp_cache() asm volatile("wbinvd":::"memory") + +#endif diff --git a/include/asm-x86_64/cacheflush.h b/include/asm-x86_64/cacheflush.h index 58d027dfc5ff..319e65a7047f 100644 --- a/include/asm-x86_64/cacheflush.h +++ b/include/asm-x86_64/cacheflush.h @@ -15,4 +15,7 @@ #define flush_icache_page(vma,pg) do { } while (0) #define flush_icache_user_range(vma,pg,adr,len) do { } while (0) +void global_flush_tlb(void); +int change_page_attr(struct page *page, int numpages, pgprot_t prot); + #endif /* _I386_CACHEFLUSH_H */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 4051c031a976..9cc67b500368 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -13,6 +13,7 @@ struct vm_struct { unsigned long flags; void * addr; unsigned long size; + unsigned long phys_addr; struct vm_struct * next; }; @@ -23,6 +24,8 @@ extern long vread(char *buf, char *addr, unsigned long count); extern void vmfree_area_pages(unsigned long address, unsigned long size); extern int vmalloc_area_pages(unsigned long address, unsigned long size, int gfp_mask, pgprot_t prot); +extern struct vm_struct *remove_kernel_area(void *addr); + /* * Various ways to allocate pages. */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f95ebed746b0..50cc6d13f0ff 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -195,6 +195,7 @@ struct vm_struct * get_vm_area(unsigned long size, unsigned long flags) if (addr > VMALLOC_END-size) goto out; } + area->phys_addr = 0; area->flags = flags; area->addr = (void *)addr; area->size = size; @@ -209,9 +210,25 @@ out: return NULL; } -void vfree(void * addr) +struct vm_struct *remove_kernel_area(void *addr) { struct vm_struct **p, *tmp; + write_lock(&vmlist_lock); + for (p = &vmlist ; (tmp = *p) ; p = &tmp->next) { + if (tmp->addr == addr) { + *p = tmp->next; + write_unlock(&vmlist_lock); + return tmp; + } + + } + write_unlock(&vmlist_lock); + return NULL; +} + +void vfree(void * addr) +{ + struct vm_struct *tmp; if (!addr) return; @@ -219,17 +236,12 @@ void vfree(void * addr) printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr); return; } - write_lock(&vmlist_lock); - for (p = &vmlist ; (tmp = *p) ; p = &tmp->next) { - if (tmp->addr == addr) { - *p = tmp->next; + tmp = remove_kernel_area(addr); + if (tmp) { vmfree_area_pages(VMALLOC_VMADDR(tmp->addr), tmp->size); - write_unlock(&vmlist_lock); kfree(tmp); return; } - } - write_unlock(&vmlist_lock); printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", addr); } -- cgit v1.2.3 From 88bccfb722a28584970601aa8963ddaaca3d147b Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 17 Jun 2002 19:47:44 -0700 Subject: [PATCH] Net updates / CPU hotplug infrastructure missed merge Ironically enough, both were written by me. Fixed thus. --- net/ipv4/route.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 8b1f2a159e19..464a56367e28 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2419,7 +2419,7 @@ struct ip_rt_acct *ip_rt_acct; /* This code sucks. But you should have seen it before! --RR */ /* IP route accounting ptr for this logical cpu number. 
*/ -#define IP_RT_ACCT_CPU(i) (ip_rt_acct + cpu_logical_map(i) * 256) +#define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256) static int ip_rt_acct_read(char *buffer, char **start, off_t offset, int length, int *eof, void *data) @@ -2441,6 +2441,8 @@ static int ip_rt_acct_read(char *buffer, char **start, off_t offset, /* Add the other cpus in, one int at a time */ for (i = 1; i < NR_CPUS; i++) { unsigned int j; + if (!cpu_online(i)) + continue; for (j = 0; j < length/4; j++) ((u32*)buffer)[j] += ((u32*)IP_RT_ACCT_CPU(i))[j]; } -- cgit v1.2.3 From e3e529bfc6d7ed94570b36fdf6cf3c9935e9a7c4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 17 Jun 2002 20:17:21 -0700 Subject: [PATCH] writeback tunables Adds five sysctls for tuning the writeback behaviour: dirty_async_ratio dirty_background_ratio dirty_sync_ratio dirty_expire_centisecs dirty_writeback_centisecs these are described in Documentation/filesystems/proc.txt They are basically the tradiditional knobs which we've always had... We are accreting a ton of obsolete sysctl numbers under /proc/sys/vm/. I didn't recycle these - just mark them unused and remove the obsolete documentation. --- Documentation/filesystems/proc.txt | 202 +++++-------------------------------- Documentation/sysctl/vm.txt | 143 ++------------------------ include/linux/sysctl.h | 19 ++-- include/linux/writeback.h | 6 ++ kernel/sysctl.c | 14 +++ mm/page-writeback.c | 68 ++++++++----- 6 files changed, 110 insertions(+), 342 deletions(-) diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index f93b1544c6b2..57597335536d 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -948,120 +948,43 @@ program to load modules on demand. ----------------------------------------------- The files in this directory can be used to tune the operation of the virtual -memory (VM) subsystem of the Linux kernel. In addition, one of the files -(bdflush) has some influence on disk usage. +memory (VM) subsystem of the Linux kernel. -bdflush -------- - -This file controls the operation of the bdflush kernel daemon. It currently -contains nine integer values, six of which are actually used by the kernel. -They are listed in table 2-2. - - -Table 2-2: Parameters in /proc/sys/vm/bdflush -.............................................................................. - Value Meaning - nfract Percentage of buffer cache dirty to activate bdflush - ndirty Maximum number of dirty blocks to write out per wake-cycle - nrefill Number of clean buffers to try to obtain each time we call refill - nref_dirt buffer threshold for activating bdflush when trying to refill - buffers. - dummy Unused - age_buffer Time for normal buffer to age before we flush it - age_super Time for superblock to age before we flush it - dummy Unused - dummy Unused -.............................................................................. - -nfract ------- - -This parameter governs the maximum number of dirty buffers in the buffer -cache. Dirty means that the contents of the buffer still have to be written to -disk (as opposed to a clean buffer, which can just be forgotten about). -Setting this to a higher value means that Linux can delay disk writes for a -long time, but it also means that it will have to do a lot of I/O at once when -memory becomes short. A lower value will spread out disk I/O more evenly. - -ndirty ------- - -Ndirty gives the maximum number of dirty buffers that bdflush can write to the -disk at one time. 
A high value will mean delayed, bursty I/O, while a small -value can lead to memory shortage when bdflush isn't woken up often enough. - -nrefill -------- - -This is the number of buffers that bdflush will add to the list of free -buffers when refill_freelist() is called. It is necessary to allocate free -buffers beforehand, since the buffers are often different sizes than the -memory pages and some bookkeeping needs to be done beforehand. The higher the -number, the more memory will be wasted and the less often refill_freelist() -will need to run. - -nref_dirt ---------- - -When refill_freelist() comes across more than nref_dirt dirty buffers, it will -wake up bdflush. - -age_buffer and age_super ------------------------- - -Finally, the age_buffer and age_super parameters govern the maximum time Linux -waits before writing out a dirty buffer to disk. The value is expressed in -jiffies (clockticks), the number of jiffies per second is 100. Age_buffer is -the maximum age for data blocks, while age_super is for filesystems meta data. - -buffermem ---------- - -The three values in this file control how much memory should be used for -buffer memory. The percentage is calculated as a percentage of total system -memory. - -The values are: - -min_percent ------------ +dirty_background_ratio +---------------------- -This is the minimum percentage of memory that should be spent on buffer -memory. +Contains, as a percentage of total system memory, the number of pages at which +the pdflush background writeback daemon will start writing out dirty data. -borrow_percent --------------- +dirty_async_ratio +----------------- -When Linux is short on memory, and the buffer cache uses more than it has been -allotted, the memory management (MM) subsystem will prune the buffer cache -more heavily than other memory to compensate. +Contains, as a percentage of total system memory, the number of pages at which +a process which is generating disk writes will itself start writing out dirty +data. -max_percent ------------ +dirty_sync_ratio +---------------- -This is the maximum amount of memory that can be used for buffer memory. +Contains, as a percentage of total system memory, the number of pages at which +a process which is generating disk writes will itself start writing out dirty +data and waiting upon completion of that writeout. -freepages ---------- +dirty_writeback_centisecs +------------------------- -This file contains three values: min, low and high: +The pdflush writeback daemons will periodically wake up and write `old' data +out to disk. This tunable expresses the interval between those wakeups, in +100'ths of a second. -min ---- -When the number of free pages in the system reaches this number, only the -kernel can allocate more memory. +dirty_expire_centisecs +---------------------- -low ---- -If the number of free pages falls below this point, the kernel starts swapping -aggressively. +This tunable is used to define when dirty data is old enough to be eligible +for writeout by the pdflush daemons. It is expressed in 100'ths of a second. +Data which has been dirty in-memory for longer than this interval will be +written out next time a pdflush daemon wakes up. -high ----- -The kernel tries to keep up to this amount of memory free; if memory falls -below this point, the kernel starts gently swapping in the hopes that it never -has to do really aggressive swapping. 
kswapd ------ @@ -1113,79 +1036,6 @@ On the other hand, enabling this feature can cause you to run out of memory and thrash the system to death, so large and/or important servers will want to set this value to 0. -pagecache ---------- - -This file does exactly the same job as buffermem, only this file controls the -amount of memory allowed for memory mapping and generic caching of files. - -You don't want the minimum level to be too low, otherwise your system might -thrash when memory is tight or fragmentation is high. - -pagetable_cache ---------------- - -The kernel keeps a number of page tables in a per-processor cache (this helps -a lot on SMP systems). The cache size for each processor will be between the -low and the high value. - -On a low-memory, single CPU system, you can safely set these values to 0 so -you don't waste memory. It is used on SMP systems so that the system can -perform fast pagetable allocations without having to acquire the kernel memory -lock. - -For large systems, the settings are probably fine. For normal systems they -won't hurt a bit. For small systems ( less than 16MB ram) it might be -advantageous to set both values to 0. - -swapctl -------- - -This file contains no less than 8 variables. All of these values are used by -kswapd. - -The first four variables -* sc_max_page_age, -* sc_page_advance, -* sc_page_decline and -* sc_page_initial_age -are used to keep track of Linux's page aging. Page aging is a bookkeeping -method to track which pages of memory are often used, and which pages can be -swapped out without consequences. - -When a page is swapped in, it starts at sc_page_initial_age (default 3) and -when the page is scanned by kswapd, its age is adjusted according to the -following scheme: - -* If the page was used since the last time we scanned, its age is increased - by sc_page_advance (default 3). Where the maximum value is given by - sc_max_page_age (default 20). -* Otherwise (meaning it wasn't used) its age is decreased by sc_page_decline - (default 1). - -When a page reaches age 0, it's ready to be swapped out. - -The variables sc_age_cluster_fract, sc_age_cluster_min, sc_pageout_weight and -sc_bufferout_weight, can be used to control kswapd's aggressiveness in -swapping out pages. - -Sc_age_cluster_fract is used to calculate how many pages from a process are to -be scanned by kswapd. The formula used is - -(sc_age_cluster_fract divided by 1024) times resident set size - -So if you want kswapd to scan the whole process, sc_age_cluster_fract needs to -have a value of 1024. The minimum number of pages kswapd will scan is -represented by sc_age_cluster_min, which is done so that kswapd will also scan -small processes. - -The values of sc_pageout_weight and sc_bufferout_weight are used to control -how many tries kswapd will make in order to swap out one page/buffer. These -values can be used to fine-tune the ratio between user pages and buffer/cache -memory. When you find that your Linux system is swapping out too many process -pages in order to satisfy buffer memory demands, you may want to either -increase sc_bufferout_weight, or decrease the value of sc_pageout_weight. 
- 2.5 /proc/sys/dev - Device specific parameters ---------------------------------------------- diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index bf9abe829e40..b8221db90cde 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -9,116 +9,28 @@ This file contains the documentation for the sysctl files in /proc/sys/vm and is valid for Linux kernel version 2.2. The files in this directory can be used to tune the operation -of the virtual memory (VM) subsystem of the Linux kernel, and -one of the files (bdflush) also has a little influence on disk -usage. +of the virtual memory (VM) subsystem of the Linux kernel and +the writeout of dirty data to disk. Default values and initialization routines for most of these files can be found in mm/swap.c. Currently, these files are in /proc/sys/vm: -- bdflush -- buffermem -- freepages - kswapd - overcommit_memory - page-cluster -- pagecache -- pagetable_cache +- dirty_async_ratio +- dirty_background_ratio +- dirty_expire_centisecs +- dirty_sync_ratio +- dirty_writeback_centisecs ============================================================== -bdflush: - -This file controls the operation of the bdflush kernel -daemon. The source code to this struct can be found in -linux/fs/buffer.c. It currently contains 9 integer values, -of which 4 are actually used by the kernel. - -From linux/fs/buffer.c: --------------------------------------------------------------- -union bdflush_param { - struct { - int nfract; /* Percentage of buffer cache dirty to - activate bdflush */ - int dummy1; /* old "ndirty" */ - int dummy2; /* old "nrefill" */ - int dummy3; /* unused */ - int interval; /* jiffies delay between kupdate flushes */ - int age_buffer; /* Time for normal buffer to age */ - int nfract_sync;/* Percentage of buffer cache dirty to - activate bdflush synchronously */ - int dummy4; /* unused */ - int dummy5; /* unused */ - } b_un; - unsigned int data[N_PARAM]; -} bdf_prm = {{30, 64, 64, 256, 5*HZ, 30*HZ, 60, 0, 0}}; --------------------------------------------------------------- - -int nfract: -The first parameter governs the maximum number of dirty -buffers in the buffer cache. Dirty means that the contents -of the buffer still have to be written to disk (as opposed -to a clean buffer, which can just be forgotten about). -Setting this to a high value means that Linux can delay disk -writes for a long time, but it also means that it will have -to do a lot of I/O at once when memory becomes short. A low -value will spread out disk I/O more evenly, at the cost of -more frequent I/O operations. The default value is 30%, -the minimum is 0%, and the maximum is 100%. - -int interval: -The fifth parameter, interval, is the minimum rate at -which kupdate will wake and flush. The value is expressed in -jiffies (clockticks), the number of jiffies per second is -normally 100 (Alpha is 1024). Thus, x*HZ is x seconds. The -default value is 5 seconds, the minimum is 0 seconds, and the -maximum is 600 seconds. - -int age_buffer: -The sixth parameter, age_buffer, governs the maximum time -Linux waits before writing out a dirty buffer to disk. The -value is in jiffies. The default value is 30 seconds, -the minimum is 1 second, and the maximum 6,000 seconds. - -int nfract_sync: -The seventh parameter, nfract_sync, governs the percentage -of buffer cache that is dirty before bdflush activates -synchronously. This can be viewed as the hard limit before -bdflush forces buffers to disk. 
The default is 60%, the -minimum is 0%, and the maximum is 100%. - -============================================================== -buffermem: - -The three values in this file correspond to the values in -the struct buffer_mem. It controls how much memory should -be used for buffer memory. The percentage is calculated -as a percentage of total system memory. - -The values are: -min_percent -- this is the minimum percentage of memory - that should be spent on buffer memory -borrow_percent -- UNUSED -max_percent -- UNUSED - -============================================================== -freepages: +dirty_async_ratio, dirty_background_ratio, dirty_expire_centisecs, +dirty_sync_ratio dirty_writeback_centisecs: -This file contains the values in the struct freepages. That -struct contains three members: min, low and high. - -The meaning of the numbers is: - -freepages.min When the number of free pages in the system - reaches this number, only the kernel can - allocate more memory. -freepages.low If the number of free pages gets below this - point, the kernel starts swapping aggressively. -freepages.high The kernel tries to keep up to this amount of - memory free; if memory comes below this point, - the kernel gently starts swapping in the hopes - that it never has to do real aggressive swapping. +See Documentation/filesystems/proc.txt ============================================================== @@ -180,38 +92,3 @@ The number of pages the kernel reads in at once is equal to 2 ^ page-cluster. Values above 2 ^ 5 don't make much sense for swap because we only cluster swap data in 32-page groups. -============================================================== - -pagecache: - -This file does exactly the same as buffermem, only this -file controls the struct page_cache, and thus controls -the amount of memory used for the page cache. - -In 2.2, the page cache is used for 3 main purposes: -- caching read() data from files -- caching mmap()ed data and executable files -- swap cache - -When your system is both deep in swap and high on cache, -it probably means that a lot of the swapped data is being -cached, making for more efficient swapping than possible -with the 2.0 kernel. - -============================================================== - -pagetable_cache: - -The kernel keeps a number of page tables in a per-processor -cache (this helps a lot on SMP systems). The cache size for -each processor will be between the low and the high value. - -On a low-memory, single CPU system you can safely set these -values to 0 so you don't waste the memory. On SMP systems it -is used so that the system can do fast pagetable allocations -without having to acquire the kernel memory lock. - -For large systems, the settings are probably OK. For normal -systems they won't hurt a bit. For small systems (<16MB ram) -it might be advantageous to set both values to 0. 
- diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index a5a6684f9a50..488bc05dbcc1 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -130,16 +130,21 @@ enum /* CTL_VM names: */ enum { - VM_SWAPCTL=1, /* struct: Set vm swapping control */ - VM_SWAPOUT=2, /* int: Linear or sqrt() swapout for hogs */ - VM_FREEPG=3, /* struct: Set free page thresholds */ + VM_UNUSED1=1, /* was: struct: Set vm swapping control */ + VM_UNUSED2=2, /* was; int: Linear or sqrt() swapout for hogs */ + VM_UNUSED3=3, /* was: struct: Set free page thresholds */ VM_BDFLUSH_UNUSED=4, /* Spare */ VM_OVERCOMMIT_MEMORY=5, /* Turn off the virtual memory safety limit */ - VM_BUFFERMEM=6, /* struct: Set buffer memory thresholds */ - VM_PAGECACHE=7, /* struct: Set cache memory thresholds */ + VM_UNUSED4=6, /* was: struct: Set buffer memory thresholds */ + VM_UNUSED5=7, /* was: struct: Set cache memory thresholds */ VM_PAGERDAEMON=8, /* struct: Control kswapd behaviour */ - VM_PGT_CACHE=9, /* struct: Set page table cache parameters */ - VM_PAGE_CLUSTER=10 /* int: set number of pages to swap together */ + VM_UNUSED6=9, /* was: struct: Set page table cache parameters */ + VM_PAGE_CLUSTER=10, /* int: set number of pages to swap together */ + VM_DIRTY_BACKGROUND=11, /* dirty_background_ratio */ + VM_DIRTY_ASYNC=12, /* dirty_async_ratio */ + VM_DIRTY_SYNC=13, /* dirty_sync_ratio */ + VM_DIRTY_WB_CS=14, /* dirty_writeback_centisecs */ + VM_DIRTY_EXPIRE_CS=15, /* dirty_expire_centisecs */ }; diff --git a/include/linux/writeback.h b/include/linux/writeback.h index cf706c783eda..a06b0f116ebd 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -45,6 +45,12 @@ static inline void wait_on_inode(struct inode *inode) /* * mm/page-writeback.c */ +extern int dirty_background_ratio; +extern int dirty_async_ratio; +extern int dirty_sync_ratio; +extern int dirty_writeback_centisecs; +extern int dirty_expire_centisecs; + void balance_dirty_pages(struct address_space *mapping); void balance_dirty_pages_ratelimited(struct address_space *mapping); int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7eb271716af9..f0c6215b1718 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -31,6 +31,7 @@ #include #include #include +#include #include @@ -264,6 +265,19 @@ static ctl_table vm_table[] = { &pager_daemon, sizeof(pager_daemon_t), 0644, NULL, &proc_dointvec}, {VM_PAGE_CLUSTER, "page-cluster", &page_cluster, sizeof(int), 0644, NULL, &proc_dointvec}, + {VM_DIRTY_BACKGROUND, "dirty_background_ratio", + &dirty_background_ratio, sizeof(dirty_background_ratio), + 0644, NULL, &proc_dointvec}, + {VM_DIRTY_ASYNC, "dirty_async_ratio", &dirty_async_ratio, + sizeof(dirty_async_ratio), 0644, NULL, &proc_dointvec}, + {VM_DIRTY_SYNC, "dirty_sync_ratio", &dirty_sync_ratio, + sizeof(dirty_sync_ratio), 0644, NULL, &proc_dointvec}, + {VM_DIRTY_WB_CS, "dirty_writeback_centisecs", + &dirty_writeback_centisecs, sizeof(dirty_writeback_centisecs), 0644, + NULL, &proc_dointvec}, + {VM_DIRTY_EXPIRE_CS, "dirty_expire_centisecs", + &dirty_expire_centisecs, sizeof(dirty_expire_centisecs), 0644, + NULL, &proc_dointvec}, {0} }; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 082e8fb8cb16..6d4555c3fb91 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -26,29 +26,56 @@ * The maximum number of pages to writeout in a single bdflush/kupdate * operation. 
We do this so we don't hold I_LOCK against an inode for * enormous amounts of time, which would block a userspace task which has - * been forced to throttle against that inode. + * been forced to throttle against that inode. Also, the code reevaluates + * the dirty each time it has written this many pages. */ #define MAX_WRITEBACK_PAGES 1024 /* - * Memory thresholds, in percentages - * FIXME: expose these via /proc or whatever. + * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited + * will look to see if it needs to force writeback or throttling. Probably + * should be scaled by memory size. + */ +#define RATELIMIT_PAGES 1000 + +/* + * When balance_dirty_pages decides that the caller needs to perform some + * non-background writeback, this is how many pages it will attempt to write. + * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably + * large amounts of I/O are submitted. + */ +#define SYNC_WRITEBACK_PAGES 1500 + + +/* + * Dirty memory thresholds, in percentages */ /* * Start background writeback (via pdflush) at this level */ -static int dirty_background_ratio = 40; +int dirty_background_ratio = 40; /* * The generator of dirty data starts async writeback at this level */ -static int dirty_async_ratio = 50; +int dirty_async_ratio = 50; /* * The generator of dirty data performs sync writeout at this level */ -static int dirty_sync_ratio = 60; +int dirty_sync_ratio = 60; + +/* + * The interval between `kupdate'-style writebacks. + */ +int dirty_writeback_centisecs = 5 * 100; + +/* + * The largest amount of time for which data is allowed to remain dirty + */ +int dirty_expire_centisecs = 30 * 100; + static void background_writeout(unsigned long _min_pages); @@ -84,12 +111,12 @@ void balance_dirty_pages(struct address_space *mapping) sync_thresh = (dirty_sync_ratio * tot) / 100; if (dirty_and_writeback > sync_thresh) { - int nr_to_write = 1500; + int nr_to_write = SYNC_WRITEBACK_PAGES; writeback_unlocked_inodes(&nr_to_write, WB_SYNC_LAST, NULL); get_page_state(&ps); } else if (dirty_and_writeback > async_thresh) { - int nr_to_write = 1500; + int nr_to_write = SYNC_WRITEBACK_PAGES; writeback_unlocked_inodes(&nr_to_write, WB_SYNC_NONE, NULL); get_page_state(&ps); @@ -118,7 +145,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) int cpu; cpu = get_cpu(); - if (ratelimits[cpu].count++ >= 1000) { + if (ratelimits[cpu].count++ >= RATELIMIT_PAGES) { ratelimits[cpu].count = 0; put_cpu(); balance_dirty_pages(mapping); @@ -162,17 +189,6 @@ void wakeup_bdflush(void) pdflush_operation(background_writeout, ps.nr_dirty); } -/* - * The interval between `kupdate'-style writebacks. - * - * Traditional kupdate writes back data which is 30-35 seconds old. - * This one does that, but it also writes back just 1/6th of the dirty - * data. This is to avoid great I/O storms. - * - * We chunk the writes up and yield, to permit any throttled page-allocators - * to perform their I/O against a large file. - */ -static int wb_writeback_jifs = 5 * HZ; static struct timer_list wb_timer; /* @@ -183,9 +199,9 @@ static struct timer_list wb_timer; * just walks the superblock inode list, writing back any inodes which are * older than a specific point in time. * - * Try to run once per wb_writeback_jifs jiffies. But if a writeback event - * takes longer than a wb_writeback_jifs interval, then leave a one-second - * gap. + * Try to run once per dirty_writeback_centisecs. 
But if a writeback event + * takes longer than a dirty_writeback_centisecs interval, then leave a + * one-second gap. * * older_than_this takes precedence over nr_to_write. So we'll only write back * all dirty pages if they are all attached to "old" mappings. @@ -201,9 +217,9 @@ static void wb_kupdate(unsigned long arg) sync_supers(); get_page_state(&ps); - oldest_jif = jiffies - 30*HZ; + oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100; start_jif = jiffies; - next_jif = start_jif + wb_writeback_jifs; + next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100; nr_to_write = ps.nr_dirty; writeback_unlocked_inodes(&nr_to_write, WB_SYNC_NONE, &oldest_jif); blk_run_queues(); @@ -223,7 +239,7 @@ static void wb_timer_fn(unsigned long unused) static int __init wb_timer_init(void) { init_timer(&wb_timer); - wb_timer.expires = jiffies + wb_writeback_jifs; + wb_timer.expires = jiffies + (dirty_writeback_centisecs * HZ) / 100; wb_timer.data = 0; wb_timer.function = wb_timer_fn; add_timer(&wb_timer); -- cgit v1.2.3 From afb51f819b777e9650470e066450d5f887483b19 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 17 Jun 2002 20:17:34 -0700 Subject: [PATCH] ext3 corruption fix Stephen and Neil Brown recently worked this out. It's a rare situation which only affects data=journal mode. Fix problem in data=journal mode where writeback could be left pending on a journaled, deleted disk block. If that block then gets reallocated, we can end up with an alias in which the old data can be written back to disk over the new. Thanks to Neil Brown for spotting this and coming up with the initial fix. --- fs/jbd/commit.c | 14 ++++++++++++++ fs/jbd/transaction.c | 1 + include/linux/jbd.h | 1 + 3 files changed, 16 insertions(+) diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index e4ce53b05a55..2283894a81a6 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -659,6 +659,20 @@ skip_commit: * there's no point in keeping a checkpoint record for * it. */ bh = jh2bh(jh); + + /* A buffer which has been freed while still being + * journaled by a previous transaction may end up still + * being dirty here, but we want to avoid writing back + * that buffer in the future now that the last use has + * been committed. That's not only a performance gain, + * it also stops aliasing problems if the buffer is left + * behind for writeback and gets reallocated for another + * use in a different page. */ + if (buffer_freed(bh)) { + clear_buffer_freed(bh); + clear_buffer_jbddirty(bh); + } + if (buffer_jdirty(bh)) { JBUFFER_TRACE(jh, "add to new checkpointing trans"); __journal_insert_checkpoint(jh, commit_transaction); diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 89c625bf9fa8..04f15abd8cb6 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -1861,6 +1861,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) * running transaction if that is set, but nothing * else. 
*/ JBUFFER_TRACE(jh, "on committing transaction"); + set_buffer_freed(bh); if (jh->b_next_transaction) { J_ASSERT(jh->b_next_transaction == journal->j_running_transaction); diff --git a/include/linux/jbd.h b/include/linux/jbd.h index 835d38c9dbfc..683c1247fd70 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h @@ -238,6 +238,7 @@ enum jbd_state_bits { BUFFER_FNS(JBD, jbd) BUFFER_FNS(JBDDirty, jbddirty) TAS_BUFFER_FNS(JBDDirty, jbddirty) +BUFFER_FNS(Freed, freed) static inline struct buffer_head *jh2bh(struct journal_head *jh) { -- cgit v1.2.3 From 386b1f7440e90f1b1541fc4db4dfcd34b00ccd96 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 17 Jun 2002 20:17:48 -0700 Subject: [PATCH] update_atime cleanup Remove unneeded do_atime_update(), and convert update_atime() to C. --- fs/inode.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index bc90e4232713..a3b2cd4e8a3c 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -913,16 +913,6 @@ int bmap(struct inode * inode, int block) return res; } -static inline void do_atime_update(struct inode *inode) -{ - unsigned long time = CURRENT_TIME; - if (inode->i_atime != time) { - inode->i_atime = time; - mark_inode_dirty_sync(inode); - } -} - - /** * update_atime - update the access time * @inode: inode accessed @@ -932,15 +922,19 @@ static inline void do_atime_update(struct inode *inode) * as well as the "noatime" flag and inode specific "noatime" markers. */ -void update_atime (struct inode *inode) +void update_atime(struct inode *inode) { if (inode->i_atime == CURRENT_TIME) return; - if ( IS_NOATIME (inode) ) return; - if ( IS_NODIRATIME (inode) && S_ISDIR (inode->i_mode) ) return; - if ( IS_RDONLY (inode) ) return; - do_atime_update(inode); -} /* End Function update_atime */ + if (IS_NOATIME(inode)) + return; + if (IS_NODIRATIME(inode) && S_ISDIR(inode->i_mode)) + return; + if (IS_RDONLY(inode)) + return; + inode->i_atime = CURRENT_TIME; + mark_inode_dirty_sync(inode); +} int inode_needs_sync(struct inode *inode) { -- cgit v1.2.3 From 85bfa7dce791fa3d110bd2551fcad41f52a5bc87 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 17 Jun 2002 20:18:02 -0700 Subject: [PATCH] grab_cache_page_nowait deadlock fix - If grab_cache_page_nowait() is to be called while holding a lock on a different page, it must perform memory allocations with GFP_NOFS. Otherwise it could come back onto the locked page (if it's dirty) and deadlock. Also tidy this function up a bit - the checks in there were overly paranoid. - In a few places, look to see if we can avoid a buslocked cycle and dirtying of a cacheline. --- mm/filemap.c | 68 ++++++++++++++++++++++-------------------------------------- 1 file changed, 25 insertions(+), 43 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 0b6edcc0d0eb..a31fbce9e196 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -445,8 +445,10 @@ int fail_writepage(struct page *page) { /* Only activate on memory-pressure, not fsync.. */ if (current->flags & PF_MEMALLOC) { - activate_page(page); - SetPageReferenced(page); + if (!PageActive(page)) + activate_page(page); + if (!PageReferenced(page)) + SetPageReferenced(page); } /* Set the page dirty again, unlock */ @@ -868,55 +870,35 @@ struct page *grab_cache_page(struct address_space *mapping, unsigned long index) * This is intended for speculative data generators, where the data can * be regenerated if the page couldn't be grabbed. This routine should * be safe to call while holding the lock for another page. 
+ * + * Clear __GFP_FS when allocating the page to avoid recursion into the fs + * and deadlock against the caller's locked page. */ -struct page *grab_cache_page_nowait(struct address_space *mapping, unsigned long index) +struct page * +grab_cache_page_nowait(struct address_space *mapping, unsigned long index) { - struct page *page; - - page = find_get_page(mapping, index); - - if ( page ) { - if ( !TestSetPageLocked(page) ) { - /* Page found and locked */ - /* This test is overly paranoid, but what the heck... */ - if ( unlikely(page->mapping != mapping || page->index != index) ) { - /* Someone reallocated this page under us. */ - unlock_page(page); - page_cache_release(page); - return NULL; - } else { - return page; - } - } else { - /* Page locked by someone else */ - page_cache_release(page); - return NULL; - } - } - - page = page_cache_alloc(mapping); - if (unlikely(!page)) - return NULL; /* Failed to allocate a page */ + struct page *page = find_get_page(mapping, index); - if (unlikely(add_to_page_cache_unique(page, mapping, index))) { - /* - * Someone else grabbed the page already, or - * failed to allocate a radix-tree node - */ + if (page) { + if (!TestSetPageLocked(page)) + return page; page_cache_release(page); return NULL; } - + page = alloc_pages(mapping->gfp_mask & ~__GFP_FS, 0); + if (page && add_to_page_cache_unique(page, mapping, index)) { + page_cache_release(page); + page = NULL; + } return page; } /* * Mark a page as having seen activity. * - * If it was already so marked, move it - * to the active queue and drop the referenced - * bit. Otherwise, just mark it for future - * action.. + * inactive,unreferenced -> inactive,referenced + * inactive,referenced -> active,unreferenced + * active,unreferenced -> active,referenced */ void mark_page_accessed(struct page *page) { @@ -924,10 +906,9 @@ void mark_page_accessed(struct page *page) activate_page(page); ClearPageReferenced(page); return; + } else if (!PageReferenced(page)) { + SetPageReferenced(page); } - - /* Mark the page referenced, AFTER checking for previous usage.. */ - SetPageReferenced(page); } /* @@ -2286,7 +2267,8 @@ generic_file_write(struct file *file, const char *buf, } } kunmap(page); - SetPageReferenced(page); + if (!PageReferenced(page)) + SetPageReferenced(page); unlock_page(page); page_cache_release(page); if (status < 0) -- cgit v1.2.3 From 7a1a7f5b21d9b3d494f3244ef9d3264dde126525 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 17 Jun 2002 20:18:15 -0700 Subject: [PATCH] mark_buffer_dirty() speedup mark_buffer_dirty() is showing up on Anton's graphs. Avoiding the buslocked RMW if the buffer is already dirty should fix that up. --- fs/buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/buffer.c b/fs/buffer.c index b7e31f59193b..c16959deb6ea 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1218,7 +1218,7 @@ void mark_buffer_dirty(struct buffer_head *bh) { if (!buffer_uptodate(bh)) buffer_error(); - if (!test_set_buffer_dirty(bh)) + if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh)) __set_page_dirty_nobuffers(bh->b_page); } -- cgit v1.2.3 From 374cac7a137dfd3212fb729337de479d1a410012 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 17 Jun 2002 20:18:30 -0700 Subject: [PATCH] go back to 256 requests per queue The request queue was increased from 256 slots to 512 in 2.5.20. The throughput of `dbench 128' on Randy's 384 megabyte machine fell 40%. We do need to understand why that happened, and what we can learn from it. 
But in the meanwhile I'd suggest that we go back to 256 slots so that this known problem doesn't impact people's evaluation and tuning of 2.5 performance. --- drivers/block/ll_rw_blk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index d53122b1ae46..3527afa3cae7 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -2002,8 +2002,8 @@ int __init blk_dev_init(void) queue_nr_requests = (total_ram >> 8) & ~15; /* One per quarter-megabyte */ if (queue_nr_requests < 32) queue_nr_requests = 32; - if (queue_nr_requests > 512) - queue_nr_requests = 512; + if (queue_nr_requests > 256) + queue_nr_requests = 256; /* * Batch frees according to queue length -- cgit v1.2.3 From 43967af3411e677c136fbb6901d47ab4a192e319 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 17 Jun 2002 20:18:44 -0700 Subject: [PATCH] mark_buffer_dirty_inode() speedup buffer_insert_list() is showing up on Anton's graphs. It'll be via ext2's mark_buffer_dirty_inode() against indirect blocks. If the buffer is already on an inode queue, we know that it is on the correct inode's queue so we don't need to re-add it. --- fs/buffer.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index c16959deb6ea..abe95fd38d22 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -856,8 +856,9 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode) if (mapping->assoc_mapping != buffer_mapping) BUG(); } - buffer_insert_list(&buffer_mapping->private_lock, - bh, &mapping->private_list); + if (list_empty(&bh->b_assoc_buffers)) + buffer_insert_list(&buffer_mapping->private_lock, + bh, &mapping->private_list); } EXPORT_SYMBOL(mark_buffer_dirty_inode); @@ -1243,10 +1244,17 @@ void __brelse(struct buffer_head * buf) * bforget() is like brelse(), except it discards any * potentially dirty data. */ -void __bforget(struct buffer_head * buf) +void __bforget(struct buffer_head *bh) { - clear_buffer_dirty(buf); - __brelse(buf); + clear_buffer_dirty(bh); + if (!list_empty(&bh->b_assoc_buffers)) { + struct address_space *buffer_mapping = bh->b_page->mapping; + + spin_lock(&buffer_mapping->private_lock); + list_del_init(&bh->b_assoc_buffers); + spin_unlock(&buffer_mapping->private_lock); + } + __brelse(bh); } /** -- cgit v1.2.3 From 3ab86fb0d43ce886f67521f4f0bb959901fa12c8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 17 Jun 2002 20:18:58 -0700 Subject: [PATCH] leave swapcache pages unlocked during writeout Convert swap pages so that they are PageWriteback and !PageLocked while under writeout, like all other block-backed pages. (Network filesystems aren't doing this yet - their pages are still locked while under writeout) --- fs/buffer.c | 32 +++++++------------------------- mm/shmem.c | 22 +++++++++++++++++----- mm/swap_state.c | 7 +++---- mm/swapfile.c | 16 ++++++++++++---- 4 files changed, 39 insertions(+), 38 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index abe95fd38d22..6ff9598d1e44 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -542,14 +542,6 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) */ if (page_uptodate && !PageError(page)) SetPageUptodate(page); - - /* - * swap page handling is a bit hacky. A standalone completion handler - * for swapout pages would fix that up. swapin can use this function. 
- */ - if (PageSwapCache(page) && PageWriteback(page)) - end_page_writeback(page); - unlock_page(page); return; @@ -559,8 +551,9 @@ still_busy: } /* - * Completion handler for block_write_full_page() - pages which are unlocked - * during I/O, and which have PageWriteback cleared upon I/O completion. + * Completion handler for block_write_full_page() and for brw_page() - pages + * which are unlocked during I/O, and which have PageWriteback cleared + * upon I/O completion. */ static void end_buffer_async_write(struct buffer_head *bh, int uptodate) { @@ -2281,16 +2274,6 @@ int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], * * FIXME: we need a swapper_inode->get_block function to remove * some of the bmap kludges and interface ugliness here. - * - * NOTE: unlike file pages, swap pages are locked while under writeout. - * This is to throttle processes which reuse their swapcache pages while - * they are under writeout, and to ensure that there is no I/O going on - * when the page has been successfully locked. Functions such as - * free_swap_and_cache() need to guarantee that there is no I/O in progress - * because they will be freeing up swap blocks, which may then be reused. - * - * Swap pages are also marked PageWriteback when they are being written - * so that memory allocators will throttle on them. */ int brw_page(int rw, struct page *page, struct block_device *bdev, sector_t b[], int size) @@ -2312,18 +2295,17 @@ int brw_page(int rw, struct page *page, if (rw == WRITE) { set_buffer_uptodate(bh); clear_buffer_dirty(bh); + mark_buffer_async_write(bh); + } else { + mark_buffer_async_read(bh); } - /* - * Swap pages are locked during writeout, so use - * buffer_async_read in strange ways. - */ - mark_buffer_async_read(bh); bh = bh->b_this_page; } while (bh != head); if (rw == WRITE) { BUG_ON(PageWriteback(page)); SetPageWriteback(page); + unlock_page(page); } /* Stage 2: start the IO */ diff --git a/mm/shmem.c b/mm/shmem.c index 9367252b65b0..07bdba83bdf5 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -426,15 +426,22 @@ found: swap_free(entry); ptr[offset] = (swp_entry_t) {0}; - while (inode && move_from_swap_cache(page, idx, inode->i_mapping)) { + while (inode && (PageWriteback(page) || + move_from_swap_cache(page, idx, inode->i_mapping))) { /* * Yield for kswapd, and try again - but we're still * holding the page lock - ugh! fix this up later on. * Beware of inode being unlinked or truncated: just * leave try_to_unuse to delete_from_swap_cache if so. + * + * AKPM: We now wait on writeback too. Note that it's + * the page lock which prevents new writeback from starting. 
*/ spin_unlock(&info->lock); - yield(); + if (PageWriteback(page)) + wait_on_page_writeback(page); + else + yield(); spin_lock(&info->lock); ptr = shmem_swp_entry(info, idx, 0); if (IS_ERR(ptr)) @@ -594,9 +601,14 @@ repeat: } /* We have to do this with page locked to prevent races */ - if (TestSetPageLocked(page)) + if (TestSetPageLocked(page)) goto wait_retry; - + if (PageWriteback(page)) { + spin_unlock(&info->lock); + wait_on_page_writeback(page); + unlock_page(page); + goto repeat; + } error = move_from_swap_cache(page, idx, mapping); if (error < 0) { unlock_page(page); @@ -651,7 +663,7 @@ no_space: return ERR_PTR(-ENOSPC); wait_retry: - spin_unlock (&info->lock); + spin_unlock(&info->lock); wait_on_page_locked(page); page_cache_release(page); goto repeat; diff --git a/mm/swap_state.c b/mm/swap_state.c index 5fe5a4462bbb..925e5c516b79 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -131,10 +131,9 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry) */ void __delete_from_swap_cache(struct page *page) { - if (!PageLocked(page)) - BUG(); - if (!PageSwapCache(page)) - BUG(); + BUG_ON(!PageLocked(page)); + BUG_ON(!PageSwapCache(page)); + BUG_ON(PageWriteback(page)); ClearPageDirty(page); __remove_inode_page(page); INC_CACHE_INFO(del_total); diff --git a/mm/swapfile.c b/mm/swapfile.c index 70a517bbcc16..656e94d7be05 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -298,6 +298,8 @@ int remove_exclusive_swap_page(struct page *page) BUG(); if (!PageSwapCache(page)) return 0; + if (PageWriteback(page)) + return 0; if (page_count(page) - !!PagePrivate(page) != 2) /* 2: us + cache */ return 0; @@ -311,7 +313,8 @@ int remove_exclusive_swap_page(struct page *page) if (p->swap_map[swp_offset(entry)] == 1) { /* Recheck the page count with the pagecache lock held.. */ write_lock(&swapper_space.page_lock); - if (page_count(page) - !!PagePrivate(page) == 2) { + if ((page_count(page) - !!page_has_buffers(page) == 2) && + !PageWriteback(page)) { __delete_from_swap_cache(page); /* * NOTE: if/when swap gets buffer/page coherency @@ -326,7 +329,6 @@ int remove_exclusive_swap_page(struct page *page) swap_info_put(p); if (retval) { - BUG_ON(PageWriteback(page)); if (page_has_buffers(page) && !try_to_free_buffers(page)) BUG(); swap_free(entry); @@ -352,9 +354,12 @@ void free_swap_and_cache(swp_entry_t entry) swap_info_put(p); } if (page) { + int one_user; + page_cache_get(page); + one_user = (page_count(page) - !!page_has_buffers(page) == 2); /* Only cache user (+us), or swap space full? Free it! */ - if (page_count(page) - !!PagePrivate(page) == 2 || vm_swap_full()) { + if (!PageWriteback(page) && (one_user || vm_swap_full())) { delete_from_swap_cache(page); SetPageDirty(page); } @@ -606,6 +611,7 @@ static int try_to_unuse(unsigned int type) wait_on_page_locked(page); wait_on_page_writeback(page); lock_page(page); + wait_on_page_writeback(page); /* * Remove all references to entry, without blocking. @@ -688,8 +694,10 @@ static int try_to_unuse(unsigned int type) rw_swap_page(WRITE, page); lock_page(page); } - if (PageSwapCache(page)) + if (PageSwapCache(page)) { + wait_on_page_writeback(page); delete_from_swap_cache(page); + } /* * So we could skip searching mms once swap count went -- cgit v1.2.3 From 88c4650a9ece8fef2be042fbbec2dde2d0afa1a4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 17 Jun 2002 20:19:13 -0700 Subject: [PATCH] direct-to-BIO I/O for swapcache pages This patch changes the swap I/O handling. 
The objectives are: - Remove swap special-casing - Stop using buffer_heads -> direct-to-BIO - Make S_ISREG swapfiles more robust. I've spent quite some time with swap. The first patches converted swap to use block_read/write_full_page(). These were discarded because they are still using buffer_heads, and a reasonable amount of otherwise unnecessary infrastructure had to be added to the swap code just to make it look like a regular fs. So this code just has a custom direct-to-BIO path for swap, which seems to be the most comfortable approach. A significant thing here is the introduction of "swap extents". A swap extent is a simple data structure which maps a range of swap pages onto a range of disk sectors. It is simply: struct swap_extent { struct list_head list; pgoff_t start_page; pgoff_t nr_pages; sector_t start_block; }; At swapon time (for an S_ISREG swapfile), each block in the file is bmapped() and the block numbers are parsed to generate the device's swap extent list. This extent list is quite compact - a 512 megabyte swapfile generates about 130 nodes in the list. That's about 4 kbytes of storage. The conversion from filesystem blocksize blocks into PAGE_SIZE blocks is performed at swapon time. At swapon time (for an S_ISBLK swapfile), we install a single swap extent which describes the entire device. The advantages of the swap extents are: 1: We never have to run bmap() (ie: read from disk) at swapout time. So S_ISREG swapfiles are now just as robust as S_ISBLK swapfiles. 2: All the differences between S_ISBLK swapfiles and S_ISREG swapfiles are handled at swapon time. During normal operation, we just don't care. Both types of swapfiles are handled the same way. 3: The extent lists always operate in PAGE_SIZE units. So the problems of going from fs blocksize to PAGE_SIZE are handled at swapon time and normal operating code doesn't need to care. 4: Because we don't have to fiddle with different blocksizes, we can go direct-to-BIO for swap_readpage() and swap_writepage(). This introduces the kernel-wide invariant "anonymous pages never have buffers attached", which cleans some things up nicely. All those block_flushpage() calls in the swap code simply go away. 5: The kernel no longer has to allocate both buffer_heads and BIOs to perform swapout. Just a BIO. 6: It permits us to perform swapcache writeout and throttling for GFP_NOFS allocations (a later patch). (Well, there is one sort of anon page which can have buffers: the pages which are cast adrift in truncate_complete_page() because do_invalidatepage() failed. But these pages are never added to swapcache, and nobody except the VM LRU has to deal with them). The swapfile parser in setup_swap_extents() will attempt to extract the largest possible number of PAGE_SIZE-sized and PAGE_SIZE-aligned chunks of disk from the S_ISREG swapfile. Any stray blocks (due to file discontiguities) are simply discarded - we never swap to those. If an S_ISREG swapfile is found to have any unmapped blocks (file holes) then the swapon attempt will fail. The extent list can be quite large (hundreds of nodes for a gigabyte S_ISREG swapfile). It needs to be consulted once for each page within swap_readpage() and swap_writepage(). Hence there is a risk that we could blow significant amounts of CPU walking that list. However I have implemented a "where we found the last block" cache, which is used as the starting point for the next search. 
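In outline, that cached search looks like this (a condensed sketch of the map_swap_page() implementation which appears in the diff below; the BUG_ON() sanity check is omitted here):

	sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset)
	{
		struct swap_extent *se = sis->curr_swap_extent;

		for ( ; ; ) {
			if (se->start_page <= offset &&
			    offset < se->start_page + se->nr_pages)
				return se->start_block + (offset - se->start_page);
			/* walk backwards, stepping over the list head */
			se = list_entry(se->list.prev, struct swap_extent, list);
			if (&se->list == &sis->extent_list)
				se = list_entry(se->list.prev,
						struct swap_extent, list);
			sis->curr_swap_extent = se;	/* remember for next time */
		}
	}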
Empirical testing indicates that this is wildly effective - the average length of the list walk in map_swap_page() is 0.3 iterations per page, with a 130-element list. It _could_ be that some workloads do start suffering long walks in that code, and perhaps a tree would be needed there. But I doubt that, and if this is happening then it means that we're seeking all over the disk for swap I/O, and the list walk is the least of our problems. rw_swap_page_nolock() now takes a page*, not a kernel virtual address. It has been renamed to rw_swap_page_sync() and it takes care of locking and unlocking the page itself, which is all a much better interface. Support for type 0 swap has been removed. Current versions of mkswap(8) seem to never produce v0 swap unless you explicitly ask for it, so I doubt if this will affect anyone. If you _do_ have a type 0 swapfile, swapon will fail and the message "version 0 swap is no longer supported. Use mkswap -v1 /dev/sdb3" is printed. We can remove that code for real later on. Really, all that swapfile header parsing should be pushed out to userspace. This code always uses single-page BIOs for swapin and swapout. I have an additional patch which converts swap to use mpage_writepages(), so we swap out in 16-page BIOs. It works fine, but I don't intend to submit that. There just doesn't seem to be any significant advantage to it. I can't see anything in sys_swapon()/sys_swapoff() which needs the lock_kernel() calls, so I deleted them. If you ftruncate an S_ISREG swapfile to a shorter size while it is in use, subsequent swapout will destroy the filesystem. It was always thus, but it is much, much easier to do now. Not really a kernel problem, but swapon(8) should not be allowing the kernel to use swapfiles which are modifiable by unprivileged users. --- fs/buffer.c | 74 ++------- include/linux/buffer_head.h | 1 - include/linux/swap.h | 32 +++- kernel/ksyms.c | 1 - kernel/suspend.c | 35 ++-- mm/page_io.c | 221 +++++++++++++++---------- mm/swap_state.c | 67 ++------ mm/swapfile.c | 394 +++++++++++++++++++++++++++++--------------- 8 files changed, 460 insertions(+), 365 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 6ff9598d1e44..5d9dee75f287 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -492,7 +492,7 @@ static void free_more_memory(void) } /* - * I/O completion handler for block_read_full_page() and brw_page() - pages + * I/O completion handler for block_read_full_page() - pages * which come unlocked at the end of I/O. */ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) @@ -551,9 +551,8 @@ still_busy: } /* - * Completion handler for block_write_full_page() and for brw_page() - pages - * which are unlocked during I/O, and which have PageWriteback cleared - * upon I/O completion. + * Completion handler for block_write_full_page() - pages which are unlocked + * during I/O, and which have PageWriteback cleared upon I/O completion. */ static void end_buffer_async_write(struct buffer_head *bh, int uptodate) { @@ -1360,11 +1359,11 @@ int block_invalidatepage(struct page *page, unsigned long offset) { struct buffer_head *head, *bh, *next; unsigned int curr_off = 0; + int ret = 1; - if (!PageLocked(page)) - BUG(); + BUG_ON(!PageLocked(page)); if (!page_has_buffers(page)) - return 1; + goto out; head = page_buffers(page); bh = head; @@ -1386,12 +1385,10 @@ int block_invalidatepage(struct page *page, unsigned long offset) * The get_block cached value has been unconditionally invalidated, * so real IO is not possible anymore. 
*/ - if (offset == 0) { - if (!try_to_release_page(page, 0)) - return 0; - } - - return 1; + if (offset == 0) + ret = try_to_release_page(page, 0); +out: + return ret; } EXPORT_SYMBOL(block_invalidatepage); @@ -2266,57 +2263,6 @@ int brw_kiovec(int rw, int nr, struct kiobuf *iovec[], return err ? err : transferred; } -/* - * Start I/O on a page. - * This function expects the page to be locked and may return - * before I/O is complete. You then have to check page->locked - * and page->uptodate. - * - * FIXME: we need a swapper_inode->get_block function to remove - * some of the bmap kludges and interface ugliness here. - */ -int brw_page(int rw, struct page *page, - struct block_device *bdev, sector_t b[], int size) -{ - struct buffer_head *head, *bh; - - BUG_ON(!PageLocked(page)); - - if (!page_has_buffers(page)) - create_empty_buffers(page, size, 0); - head = bh = page_buffers(page); - - /* Stage 1: lock all the buffers */ - do { - lock_buffer(bh); - bh->b_blocknr = *(b++); - bh->b_bdev = bdev; - set_buffer_mapped(bh); - if (rw == WRITE) { - set_buffer_uptodate(bh); - clear_buffer_dirty(bh); - mark_buffer_async_write(bh); - } else { - mark_buffer_async_read(bh); - } - bh = bh->b_this_page; - } while (bh != head); - - if (rw == WRITE) { - BUG_ON(PageWriteback(page)); - SetPageWriteback(page); - unlock_page(page); - } - - /* Stage 2: start the IO */ - do { - struct buffer_head *next = bh->b_this_page; - submit_bh(rw, bh); - bh = next; - } while (bh != head); - return 0; -} - /* * Sanity checks for try_to_free_buffers. */ diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 90767fc78617..fda967ab9358 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -183,7 +183,6 @@ struct buffer_head * __bread(struct block_device *, int, int); void wakeup_bdflush(void); struct buffer_head *alloc_buffer_head(int async); void free_buffer_head(struct buffer_head * bh); -int brw_page(int, struct page *, struct block_device *, sector_t [], int); void FASTCALL(unlock_buffer(struct buffer_head *bh)); /* diff --git a/include/linux/swap.h b/include/linux/swap.h index d0160265e3c5..0b448a811a39 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */ @@ -61,6 +62,21 @@ typedef struct { #ifdef __KERNEL__ +/* + * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of + * disk blocks. A list of swap extents maps the entire swapfile. (Where the + * term `swapfile' refers to either a blockdevice or an IS_REG file. Apart + * from setup, they're handled identically. + * + * We always assume that blocks are of size PAGE_SIZE. + */ +struct swap_extent { + struct list_head list; + pgoff_t start_page; + pgoff_t nr_pages; + sector_t start_block; +}; + /* * Max bad pages in the new format.. */ @@ -83,11 +99,17 @@ enum { /* * The in-memory structure used to track swap areas. + * extent_list.prev points at the lowest-index extent. That list is + * sorted. 
*/ struct swap_info_struct { unsigned int flags; spinlock_t sdev_lock; struct file *swap_file; + struct block_device *bdev; + struct list_head extent_list; + int nr_extents; + struct swap_extent *curr_swap_extent; unsigned old_block_size; unsigned short * swap_map; unsigned int lowest_bit; @@ -134,8 +156,9 @@ extern wait_queue_head_t kswapd_wait; extern int FASTCALL(try_to_free_pages(zone_t *, unsigned int, unsigned int)); /* linux/mm/page_io.c */ -extern void rw_swap_page(int, struct page *); -extern void rw_swap_page_nolock(int, swp_entry_t, char *); +int swap_readpage(struct file *file, struct page *page); +int swap_writepage(struct page *page); +int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page); /* linux/mm/page_alloc.c */ @@ -163,12 +186,13 @@ extern unsigned int nr_swapfiles; extern struct swap_info_struct swap_info[]; extern void si_swapinfo(struct sysinfo *); extern swp_entry_t get_swap_page(void); -extern void get_swaphandle_info(swp_entry_t, unsigned long *, struct inode **); extern int swap_duplicate(swp_entry_t); -extern int swap_count(struct page *); extern int valid_swaphandles(swp_entry_t, unsigned long *); extern void swap_free(swp_entry_t); extern void free_swap_and_cache(swp_entry_t); +sector_t map_swap_page(struct swap_info_struct *p, pgoff_t offset); +struct swap_info_struct *get_swap_info_struct(unsigned type); + struct swap_list_t { int head; /* head of priority-ordered swapfile list */ int next; /* swapfile to be used next */ diff --git a/kernel/ksyms.c b/kernel/ksyms.c index 9391bb0e933d..34a50cd558e9 100644 --- a/kernel/ksyms.c +++ b/kernel/ksyms.c @@ -559,7 +559,6 @@ EXPORT_SYMBOL(buffer_insert_list); EXPORT_SYMBOL(make_bad_inode); EXPORT_SYMBOL(is_bad_inode); EXPORT_SYMBOL(event); -EXPORT_SYMBOL(brw_page); #ifdef CONFIG_UID16 EXPORT_SYMBOL(overflowuid); diff --git a/kernel/suspend.c b/kernel/suspend.c index 2fcf5db57868..12e5b0f01f57 100644 --- a/kernel/suspend.c +++ b/kernel/suspend.c @@ -320,14 +320,15 @@ static void mark_swapfiles(swp_entry_t prev, int mode) { swp_entry_t entry; union diskpage *cur; - - cur = (union diskpage *)get_free_page(GFP_ATOMIC); - if (!cur) + struct page *page; + + page = alloc_page(GFP_ATOMIC); + if (!page) panic("Out of memory in mark_swapfiles"); + cur = page_address(page); /* XXX: this is dirty hack to get first page of swap file */ entry = swp_entry(root_swap, 0); - lock_page(virt_to_page((unsigned long)cur)); - rw_swap_page_nolock(READ, entry, (char *) cur); + rw_swap_page_sync(READ, entry, page); if (mode == MARK_SWAP_RESUME) { if (!memcmp("SUSP1R",cur->swh.magic.magic,6)) @@ -345,10 +346,8 @@ static void mark_swapfiles(swp_entry_t prev, int mode) cur->link.next = prev; /* prev is the first/last swap page of the resume area */ /* link.next lies *no more* in last 4 bytes of magic */ } - lock_page(virt_to_page((unsigned long)cur)); - rw_swap_page_nolock(WRITE, entry, (char *)cur); - - free_page((unsigned long)cur); + rw_swap_page_sync(WRITE, entry, page); + __free_page(page); } static void read_swapfiles(void) /* This is called before saving image */ @@ -409,6 +408,7 @@ static int write_suspend_image(void) int nr_pgdir_pages = SUSPEND_PD_PAGES(nr_copy_pages); union diskpage *cur, *buffer = (union diskpage *)get_free_page(GFP_ATOMIC); unsigned long address; + struct page *page; PRINTS( "Writing data to swap (%d pages): ", nr_copy_pages ); for (i=0; iaddress; - lock_page(virt_to_page(address)); - { - long dummy1; - struct inode *suspend_file; - get_swaphandle_info(entry, &dummy1, &suspend_file); - } - 
rw_swap_page_nolock(WRITE, entry, (char *) address); + page = virt_to_page(address); + rw_swap_page_sync(WRITE, entry, page); (pagedir_nosave+i)->swap_address = entry; } PRINTK(" done\n"); @@ -452,8 +447,8 @@ static int write_suspend_image(void) if (PAGE_SIZE % sizeof(struct pbe)) panic("I need PAGE_SIZE to be integer multiple of struct pbe, otherwise next assignment could damage pagedir"); cur->link.next = prev; - lock_page(virt_to_page((unsigned long)cur)); - rw_swap_page_nolock(WRITE, entry, (char *) cur); + page = virt_to_page((unsigned long)cur); + rw_swap_page_sync(WRITE, entry, page); prev = entry; } PRINTK(", header"); @@ -473,8 +468,8 @@ static int write_suspend_image(void) cur->link.next = prev; - lock_page(virt_to_page((unsigned long)cur)); - rw_swap_page_nolock(WRITE, entry, (char *) cur); + page = virt_to_page((unsigned long)cur); + rw_swap_page_sync(WRITE, entry, page); prev = entry; PRINTK( ", signature" ); diff --git a/mm/page_io.c b/mm/page_io.c index 942ea274dccd..3692ead4d94c 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -14,112 +14,163 @@ #include #include #include -#include -#include /* for brw_page() */ - +#include +#include #include +#include -/* - * Reads or writes a swap page. - * wait=1: start I/O and wait for completion. wait=0: start asynchronous I/O. - * - * Important prevention of race condition: the caller *must* atomically - * create a unique swap cache entry for this swap page before calling - * rw_swap_page, and must lock that page. By ensuring that there is a - * single page of memory reserved for the swap entry, the normal VM page - * lock on that page also doubles as a lock on swap entries. Having only - * one lock to deal with per swap entry (rather than locking swap and memory - * independently) also makes it easier to make certain swapping operations - * atomic, which is particularly important when we are trying to ensure - * that shared pages stay shared while being swapped. 
- */ +static int +swap_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + struct swap_info_struct *sis; + swp_entry_t entry; -static int rw_swap_page_base(int rw, swp_entry_t entry, struct page *page) + entry.val = iblock; + sis = get_swap_info_struct(swp_type(entry)); + bh_result->b_bdev = sis->bdev; + bh_result->b_blocknr = map_swap_page(sis, swp_offset(entry)); + bh_result->b_size = PAGE_SIZE; + set_buffer_mapped(bh_result); + return 0; +} + +static struct bio * +get_swap_bio(int gfp_flags, struct page *page, bio_end_io_t end_io) { - unsigned long offset; - sector_t zones[PAGE_SIZE/512]; - int zones_used; - int block_size; - struct inode *swapf = 0; - struct block_device *bdev; + struct bio *bio; + struct buffer_head bh; - if (rw == READ) { + bio = bio_alloc(gfp_flags, 1); + if (bio) { + swap_get_block(NULL, page->index, &bh, 1); + bio->bi_sector = bh.b_blocknr * (PAGE_SIZE >> 9); + bio->bi_bdev = bh.b_bdev; + bio->bi_io_vec[0].bv_page = page; + bio->bi_io_vec[0].bv_len = PAGE_SIZE; + bio->bi_io_vec[0].bv_offset = 0; + bio->bi_vcnt = 1; + bio->bi_idx = 0; + bio->bi_size = PAGE_SIZE; + bio->bi_end_io = end_io; + } + return bio; +} + +static void end_swap_bio_write(struct bio *bio) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct page *page = bio->bi_io_vec[0].bv_page; + + if (!uptodate) + SetPageError(page); + end_page_writeback(page); + bio_put(bio); +} + +static void end_swap_bio_read(struct bio *bio) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct page *page = bio->bi_io_vec[0].bv_page; + + if (!uptodate) { + SetPageError(page); ClearPageUptodate(page); - kstat.pswpin++; - } else - kstat.pswpout++; - - get_swaphandle_info(entry, &offset, &swapf); - bdev = swapf->i_bdev; - if (bdev) { - zones[0] = offset; - zones_used = 1; - block_size = PAGE_SIZE; } else { - int i, j; - unsigned int block = offset - << (PAGE_SHIFT - swapf->i_sb->s_blocksize_bits); - - block_size = swapf->i_sb->s_blocksize; - for (i=0, j=0; j< PAGE_SIZE ; i++, j += block_size) - if (!(zones[i] = bmap(swapf,block++))) { - printk("rw_swap_page: bad swap file\n"); - return 0; - } - zones_used = i; - bdev = swapf->i_sb->s_bdev; + SetPageUptodate(page); } + unlock_page(page); + bio_put(bio); +} - /* block_size == PAGE_SIZE/zones_used */ - brw_page(rw, page, bdev, zones, block_size); +/* + * We may have stale swap cache pages in memory: notice + * them here and get rid of the unnecessary final write. + */ +int swap_writepage(struct page *page) +{ + struct bio *bio; + int ret = 0; - /* Note! For consistency we do all of the logic, - * decrementing the page count, and unlocking the page in the - * swap lock map - in the IO completion handler. - */ - return 1; + if (remove_exclusive_swap_page(page)) { + unlock_page(page); + goto out; + } + bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); + if (bio == NULL) { + ret = -ENOMEM; + goto out; + } + kstat.pswpout++; + SetPageWriteback(page); + unlock_page(page); + submit_bio(WRITE, bio); +out: + return ret; } +int swap_readpage(struct file *file, struct page *page) +{ + struct bio *bio; + int ret = 0; + + ClearPageUptodate(page); + bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); + if (bio == NULL) { + ret = -ENOMEM; + goto out; + } + kstat.pswpin++; + submit_bio(READ, bio); +out: + return ret; +} /* - * A simple wrapper so the base function doesn't need to enforce - * that all swap pages go through the swap cache! 
We verify that: - * - the page is locked - * - it's marked as being swap-cache - * - it's associated with the swap inode + * swapper_space doesn't have a real inode, so it gets a special vm_writeback() + * so we don't need swap special cases in generic_vm_writeback(). + * + * Swap pages are PageLocked and PageWriteback while under writeout so that + * memory allocators will throttle against them. */ -void rw_swap_page(int rw, struct page *page) +static int swap_vm_writeback(struct page *page, int *nr_to_write) { - swp_entry_t entry; + struct address_space *mapping = page->mapping; - entry.val = page->index; - - if (!PageLocked(page)) - PAGE_BUG(page); - if (!PageSwapCache(page)) - PAGE_BUG(page); - if (!rw_swap_page_base(rw, entry, page)) - unlock_page(page); + unlock_page(page); + return generic_writepages(mapping, nr_to_write); } +struct address_space_operations swap_aops = { + vm_writeback: swap_vm_writeback, + writepage: swap_writepage, + readpage: swap_readpage, + sync_page: block_sync_page, + set_page_dirty: __set_page_dirty_nobuffers, +}; + /* - * The swap lock map insists that pages be in the page cache! - * Therefore we can't use it. Later when we can remove the need for the - * lock map and we can reduce the number of functions exported. + * A scruffy utility function to read or write an arbitrary swap page + * and wait on the I/O. */ -void rw_swap_page_nolock(int rw, swp_entry_t entry, char *buf) +int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page) { - struct page *page = virt_to_page(buf); - - if (!PageLocked(page)) - PAGE_BUG(page); - if (page->mapping) - PAGE_BUG(page); - /* needs sync_page to wait I/O completation */ + int ret; + + lock_page(page); + + BUG_ON(page->mapping); page->mapping = &swapper_space; - if (rw_swap_page_base(rw, entry, page)) - lock_page(page); - if (page_has_buffers(page) && !try_to_free_buffers(page)) - PAGE_BUG(page); + page->index = entry.val; + + if (rw == READ) { + ret = swap_readpage(NULL, page); + wait_on_page_locked(page); + } else { + ret = swap_writepage(page); + wait_on_page_writeback(page); + } page->mapping = NULL; - unlock_page(page); + if (ret == 0 && (!PageUptodate(page) || PageError(page))) + ret = -EIO; + return ret; } diff --git a/mm/swap_state.c b/mm/swap_state.c index 925e5c516b79..4513649a1208 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -14,54 +14,27 @@ #include #include #include -#include /* block_sync_page()/try_to_free_buffers() */ +#include /* block_sync_page() */ #include -/* - * We may have stale swap cache pages in memory: notice - * them here and get rid of the unnecessary final write. - */ -static int swap_writepage(struct page *page) -{ - if (remove_exclusive_swap_page(page)) { - unlock_page(page); - return 0; - } - rw_swap_page(WRITE, page); - return 0; -} - -/* - * swapper_space doesn't have a real inode, so it gets a special vm_writeback() - * so we don't need swap special cases in generic_vm_writeback(). - * - * Swap pages are PageLocked and PageWriteback while under writeout so that - * memory allocators will throttle against them. - */ -static int swap_vm_writeback(struct page *page, int *nr_to_write) -{ - struct address_space *mapping = page->mapping; - - unlock_page(page); - return generic_writepages(mapping, nr_to_write); -} - -static struct address_space_operations swap_aops = { - vm_writeback: swap_vm_writeback, - writepage: swap_writepage, - sync_page: block_sync_page, - set_page_dirty: __set_page_dirty_nobuffers, -}; - /* * swapper_inode doesn't do anything much. 
It is really only here to * avoid some special-casing in other parts of the kernel. + * + * We set i_size to "infinity" to keep the page I/O functions happy. The swap + * block allocator makes sure that allocations are in-range. A strange + * number is chosen to prevent various arith overflows elsewhere. For example, + * `lblock' in block_read_full_page(). */ static struct inode swapper_inode = { - i_mapping: &swapper_space, + i_mapping: &swapper_space, + i_size: PAGE_SIZE * 0xffffffffLL, + i_blkbits: PAGE_SHIFT, }; +extern struct address_space_operations swap_aops; + struct address_space swapper_space = { page_tree: RADIX_TREE_INIT(GFP_ATOMIC), page_lock: RW_LOCK_UNLOCKED, @@ -149,14 +122,9 @@ void delete_from_swap_cache(struct page *page) { swp_entry_t entry; - /* - * I/O should have completed and nobody can have a ref against the - * page's buffers - */ BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); - if (page_has_buffers(page) && !try_to_free_buffers(page)) - BUG(); + BUG_ON(page_has_buffers(page)); entry.val = page->index; @@ -222,16 +190,9 @@ int move_from_swap_cache(struct page *page, unsigned long index, void **pslot; int err; - /* - * Drop the buffers now, before taking the page_lock. Because - * mapping->private_lock nests outside mapping->page_lock. - * This "must" succeed. The page is locked and all I/O has completed - * and nobody else has a ref against its buffers. - */ BUG_ON(!PageLocked(page)); BUG_ON(PageWriteback(page)); - if (page_has_buffers(page) && !try_to_free_buffers(page)) - BUG(); + BUG_ON(page_has_buffers(page)); write_lock(&swapper_space.page_lock); write_lock(&mapping->page_lock); @@ -361,7 +322,7 @@ struct page * read_swap_cache_async(swp_entry_t entry) /* * Initiate read into locked page and return. */ - rw_swap_page(READ, new_page); + swap_readpage(NULL, new_page); return new_page; } } while (err != -ENOENT && err != -ENOMEM); diff --git a/mm/swapfile.c b/mm/swapfile.c index 656e94d7be05..175c812a63d6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -16,7 +16,7 @@ #include #include #include -#include /* for try_to_free_buffers() */ +#include #include #include @@ -294,13 +294,14 @@ int remove_exclusive_swap_page(struct page *page) struct swap_info_struct * p; swp_entry_t entry; - if (!PageLocked(page)) - BUG(); + BUG_ON(page_has_buffers(page)); + BUG_ON(!PageLocked(page)); + if (!PageSwapCache(page)) return 0; if (PageWriteback(page)) return 0; - if (page_count(page) - !!PagePrivate(page) != 2) /* 2: us + cache */ + if (page_count(page) != 2) /* 2: us + cache */ return 0; entry.val = page->index; @@ -313,14 +314,8 @@ int remove_exclusive_swap_page(struct page *page) if (p->swap_map[swp_offset(entry)] == 1) { /* Recheck the page count with the pagecache lock held.. */ write_lock(&swapper_space.page_lock); - if ((page_count(page) - !!page_has_buffers(page) == 2) && - !PageWriteback(page)) { + if ((page_count(page) == 2) && !PageWriteback(page)) { __delete_from_swap_cache(page); - /* - * NOTE: if/when swap gets buffer/page coherency - * like other mappings, we'll need to mark the buffers - * dirty here too. set_page_dirty(). 
- */ SetPageDirty(page); retval = 1; } @@ -329,8 +324,6 @@ int remove_exclusive_swap_page(struct page *page) swap_info_put(p); if (retval) { - if (page_has_buffers(page) && !try_to_free_buffers(page)) - BUG(); swap_free(entry); page_cache_release(page); } @@ -356,8 +349,9 @@ void free_swap_and_cache(swp_entry_t entry) if (page) { int one_user; + BUG_ON(page_has_buffers(page)); page_cache_get(page); - one_user = (page_count(page) - !!page_has_buffers(page) == 2); + one_user = (page_count(page) == 2); /* Only cache user (+us), or swap space full? Free it! */ if (!PageWriteback(page) && (one_user || vm_swap_full())) { delete_from_swap_cache(page); @@ -691,7 +685,7 @@ static int try_to_unuse(unsigned int type) * Note shmem_unuse already deleted its from swap cache. */ if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { - rw_swap_page(WRITE, page); + swap_writepage(page); lock_page(page); } if (PageSwapCache(page)) { @@ -725,6 +719,207 @@ static int try_to_unuse(unsigned int type) return retval; } +/* + * Use this swapdev's extent info to locate the (PAGE_SIZE) block which + * corresponds to page offset `offset'. + */ +sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) +{ + struct swap_extent *se = sis->curr_swap_extent; + struct swap_extent *start_se = se; + + for ( ; ; ) { + struct list_head *lh; + + if (se->start_page <= offset && + offset < (se->start_page + se->nr_pages)) { + return se->start_block + (offset - se->start_page); + } + lh = se->list.prev; + if (lh == &sis->extent_list) + lh = lh->prev; + se = list_entry(lh, struct swap_extent, list); + sis->curr_swap_extent = se; + BUG_ON(se == start_se); /* It *must* be present */ + } +} + +/* + * Free all of a swapdev's extent information + */ +static void destroy_swap_extents(struct swap_info_struct *sis) +{ + while (!list_empty(&sis->extent_list)) { + struct swap_extent *se; + + se = list_entry(sis->extent_list.next, + struct swap_extent, list); + list_del(&se->list); + kfree(se); + } + sis->nr_extents = 0; +} + +/* + * Add a block range (and the corresponding page range) into this swapdev's + * extent list. The extent list is kept sorted in block order. + * + * This function rather assumes that it is called in ascending sector_t order. + * It doesn't look for extent coalescing opportunities. + */ +static int +add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, + unsigned long nr_pages, sector_t start_block) +{ + struct swap_extent *se; + struct swap_extent *new_se; + struct list_head *lh; + + lh = sis->extent_list.next; /* The highest-addressed block */ + while (lh != &sis->extent_list) { + se = list_entry(lh, struct swap_extent, list); + if (se->start_block + se->nr_pages == start_block) { + /* Merge it */ + se->nr_pages += nr_pages; + return 0; + } + lh = lh->next; + } + + /* + * No merge. Insert a new extent, preserving ordering. + */ + new_se = kmalloc(sizeof(*se), GFP_KERNEL); + if (new_se == NULL) + return -ENOMEM; + new_se->start_page = start_page; + new_se->nr_pages = nr_pages; + new_se->start_block = start_block; + + lh = sis->extent_list.prev; /* The lowest block */ + while (lh != &sis->extent_list) { + se = list_entry(lh, struct swap_extent, list); + if (se->start_block > start_block) + break; + lh = lh->prev; + } + list_add_tail(&new_se->list, lh); + sis->nr_extents++; + return 0; +} + +/* + * A `swap extent' is a simple thing which maps a contiguous range of pages + * onto a contiguous range of disk blocks. 
An ordered list of swap extents + * is built at swapon time and is then used at swap_writepage/swap_readpage + * time for locating where on disk a page belongs. + * + * If the swapfile is an S_ISBLK block device, a single extent is installed. + * This is done so that the main operating code can treat S_ISBLK and S_ISREG + * swap files identically. + * + * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap + * extent list operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK + * swapfiles are handled *identically* after swapon time. + * + * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks + * and will parse them into an ordered extent list, in PAGE_SIZE chunks. If + * some stray blocks are found which do not fall within the PAGE_SIZE alignment + * requirements, they are simply tossed out - we will never use those blocks + * for swapping. + * + * The amount of disk space which a single swap extent represents varies. + * Typically it is in the 1-4 megabyte range. So we can have hundreds of + * extents in the list. To avoid much list walking, we cache the previous + * search location in `curr_swap_extent', and start new searches from there. + * This is extremely effective. The average number of iterations in + * map_swap_page() has been measured at about 0.3 per page. - akpm. + */ +static int setup_swap_extents(struct swap_info_struct *sis) +{ + struct inode *inode; + unsigned blocks_per_page; + unsigned long page_no; + unsigned blkbits; + sector_t probe_block; + sector_t last_block; + int ret; + + inode = sis->swap_file->f_dentry->d_inode; + if (S_ISBLK(inode->i_mode)) { + ret = add_swap_extent(sis, 0, sis->max, 0); + goto done; + } + + blkbits = inode->i_blkbits; + blocks_per_page = PAGE_SIZE >> blkbits; + + /* + * Map all the blocks into the extent list. This code doesn't try + * to be very smart. 
+ */ + probe_block = 0; + page_no = 0; + last_block = inode->i_size >> blkbits; + while ((probe_block + blocks_per_page) <= last_block && + page_no < sis->max) { + unsigned block_in_page; + sector_t first_block; + + first_block = bmap(inode, probe_block); + if (first_block == 0) + goto bad_bmap; + + /* + * It must be PAGE_SIZE aligned on-disk + */ + if (first_block & (blocks_per_page - 1)) { + probe_block++; + goto reprobe; + } + + for (block_in_page = 1; block_in_page < blocks_per_page; + block_in_page++) { + sector_t block; + + block = bmap(inode, probe_block + block_in_page); + if (block == 0) + goto bad_bmap; + if (block != first_block + block_in_page) { + /* Discontiguity */ + probe_block++; + goto reprobe; + } + } + + /* + * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks + */ + ret = add_swap_extent(sis, page_no, 1, + first_block >> (PAGE_SHIFT - blkbits)); + if (ret) + goto out; + page_no++; + probe_block += blocks_per_page; +reprobe: + continue; + } + ret = 0; + if (page_no == 0) + ret = -EINVAL; + sis->max = page_no; + sis->highest_bit = page_no - 1; +done: + sis->curr_swap_extent = list_entry(sis->extent_list.prev, + struct swap_extent, list); + goto out; +bad_bmap: + printk(KERN_ERR "swapon: swapfile has holes\n"); + ret = -EINVAL; +out: + return ret; +} + asmlinkage long sys_swapoff(const char * specialfile) { struct swap_info_struct * p = NULL; @@ -741,7 +936,6 @@ asmlinkage long sys_swapoff(const char * specialfile) if (err) goto out; - lock_kernel(); prev = -1; swap_list_lock(); for (type = swap_list.head; type >= 0; type = swap_info[type].next) { @@ -771,9 +965,7 @@ asmlinkage long sys_swapoff(const char * specialfile) total_swap_pages -= p->pages; p->flags &= ~SWP_WRITEOK; swap_list_unlock(); - unlock_kernel(); err = try_to_unuse(type); - lock_kernel(); if (err) { /* re-insert swap space back into swap_list */ swap_list_lock(); @@ -799,6 +991,7 @@ asmlinkage long sys_swapoff(const char * specialfile) swap_map = p->swap_map; p->swap_map = NULL; p->flags = 0; + destroy_swap_extents(p); swap_device_unlock(p); swap_list_unlock(); vfree(swap_map); @@ -812,7 +1005,6 @@ asmlinkage long sys_swapoff(const char * specialfile) err = 0; out_dput: - unlock_kernel(); path_release(&nd); out: return err; @@ -866,12 +1058,12 @@ int get_swaparea_info(char *buf) asmlinkage long sys_swapon(const char * specialfile, int swap_flags) { struct swap_info_struct * p; - char *name; + char *name = NULL; struct block_device *bdev = NULL; struct file *swap_file = NULL; struct address_space *mapping; unsigned int type; - int i, j, prev; + int i, prev; int error; static int least_priority = 0; union swap_header *swap_header = 0; @@ -880,10 +1072,10 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags) unsigned long maxpages = 1; int swapfilesize; unsigned short *swap_map; - + struct page *page = NULL; + if (!capable(CAP_SYS_ADMIN)) return -EPERM; - lock_kernel(); swap_list_lock(); p = swap_info; for (type = 0 ; type < nr_swapfiles ; type++,p++) @@ -896,7 +1088,9 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags) } if (type >= nr_swapfiles) nr_swapfiles = type+1; + INIT_LIST_HEAD(&p->extent_list); p->flags = SWP_USED; + p->nr_extents = 0; p->swap_file = NULL; p->old_block_size = 0; p->swap_map = NULL; @@ -917,7 +1111,6 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags) if (IS_ERR(name)) goto bad_swap_2; swap_file = filp_open(name, O_RDWR, 0); - putname(name); error = PTR_ERR(swap_file); if (IS_ERR(swap_file)) { swap_file = NULL; @@ 
-939,8 +1132,12 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags) PAGE_SIZE); if (error < 0) goto bad_swap; - } else if (!S_ISREG(swap_file->f_dentry->d_inode->i_mode)) + p->bdev = bdev; + } else if (S_ISREG(swap_file->f_dentry->d_inode->i_mode)) { + p->bdev = swap_file->f_dentry->d_inode->i_sb->s_bdev; + } else { goto bad_swap; + } mapping = swap_file->f_dentry->d_inode->i_mapping; swapfilesize = mapping->host->i_size >> PAGE_SHIFT; @@ -954,15 +1151,20 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags) goto bad_swap; } - swap_header = (void *) __get_free_page(GFP_USER); - if (!swap_header) { - printk("Unable to start swapping: out of memory :-)\n"); - error = -ENOMEM; + /* + * Read the swap header. + */ + page = read_cache_page(mapping, 0, + (filler_t *)mapping->a_ops->readpage, swap_file); + if (IS_ERR(page)) { + error = PTR_ERR(page); goto bad_swap; } - - lock_page(virt_to_page(swap_header)); - rw_swap_page_nolock(READ, swp_entry(type,0), (char *) swap_header); + wait_on_page_locked(page); + if (!PageUptodate(page)) + goto bad_swap; + kmap(page); + swap_header = page_address(page); if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10)) swap_header_version = 1; @@ -976,33 +1178,10 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags) switch (swap_header_version) { case 1: - memset(((char *) swap_header)+PAGE_SIZE-10,0,10); - j = 0; - p->lowest_bit = 0; - p->highest_bit = 0; - for (i = 1 ; i < 8*PAGE_SIZE ; i++) { - if (test_bit(i,(unsigned long *) swap_header)) { - if (!p->lowest_bit) - p->lowest_bit = i; - p->highest_bit = i; - maxpages = i+1; - j++; - } - } - nr_good_pages = j; - p->swap_map = vmalloc(maxpages * sizeof(short)); - if (!p->swap_map) { - error = -ENOMEM; - goto bad_swap; - } - for (i = 1 ; i < maxpages ; i++) { - if (test_bit(i,(unsigned long *) swap_header)) - p->swap_map[i] = 0; - else - p->swap_map[i] = SWAP_MAP_BAD; - } - break; - + printk(KERN_ERR "version 0 swap is no longer supported. " + "Use mkswap -v1 %s\n", name); + error = -EINVAL; + goto bad_swap; case 2: /* Check the swap header's sub-version and the size of the swap file and bad block lists */ @@ -1058,15 +1237,20 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags) goto bad_swap; } p->swap_map[0] = SWAP_MAP_BAD; + p->max = maxpages; + p->pages = nr_good_pages; + + if (setup_swap_extents(p)) + goto bad_swap; + swap_list_lock(); swap_device_lock(p); - p->max = maxpages; p->flags = SWP_ACTIVE; - p->pages = nr_good_pages; nr_swap_pages += nr_good_pages; total_swap_pages += nr_good_pages; - printk(KERN_INFO "Adding Swap: %dk swap-space (priority %d)\n", - nr_good_pages<<(PAGE_SHIFT-10), p->prio); + printk(KERN_INFO "Adding %dk swap on %s. Priority:%d extents:%d\n", + nr_good_pages<<(PAGE_SHIFT-10), name, + p->prio, p->nr_extents); /* insert swap space into swap_list: */ prev = -1; @@ -1100,14 +1284,18 @@ bad_swap_2: if (!(swap_flags & SWAP_FLAG_PREFER)) ++least_priority; swap_list_unlock(); + destroy_swap_extents(p); if (swap_map) vfree(swap_map); if (swap_file && !IS_ERR(swap_file)) filp_close(swap_file, NULL); out: - if (swap_header) - free_page((long) swap_header); - unlock_kernel(); + if (page && !IS_ERR(page)) { + kunmap(page); + page_cache_release(page); + } + if (name) + putname(name); return error; } @@ -1176,78 +1364,10 @@ bad_file: goto out; } -/* - * Page lock needs to be held in all cases to prevent races with - * swap file deletion. 
- */ -int swap_count(struct page *page) +struct swap_info_struct * +get_swap_info_struct(unsigned type) { - struct swap_info_struct * p; - unsigned long offset, type; - swp_entry_t entry; - int retval = 0; - - entry.val = page->index; - if (!entry.val) - goto bad_entry; - type = swp_type(entry); - if (type >= nr_swapfiles) - goto bad_file; - p = type + swap_info; - offset = swp_offset(entry); - if (offset >= p->max) - goto bad_offset; - if (!p->swap_map[offset]) - goto bad_unused; - retval = p->swap_map[offset]; -out: - return retval; - -bad_entry: - printk(KERN_ERR "swap_count: null entry!\n"); - goto out; -bad_file: - printk(KERN_ERR "swap_count: %s%08lx\n", Bad_file, entry.val); - goto out; -bad_offset: - printk(KERN_ERR "swap_count: %s%08lx\n", Bad_offset, entry.val); - goto out; -bad_unused: - printk(KERN_ERR "swap_count: %s%08lx\n", Unused_offset, entry.val); - goto out; -} - -/* - * Prior swap_duplicate protects against swap device deletion. - */ -void get_swaphandle_info(swp_entry_t entry, unsigned long *offset, - struct inode **swapf) -{ - unsigned long type; - struct swap_info_struct *p; - - type = swp_type(entry); - if (type >= nr_swapfiles) { - printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_file, entry.val); - return; - } - - p = &swap_info[type]; - *offset = swp_offset(entry); - if (*offset >= p->max && *offset != 0) { - printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_offset, entry.val); - return; - } - if (p->swap_map && !p->swap_map[*offset]) { - printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_offset, entry.val); - return; - } - if (!(p->flags & SWP_USED)) { - printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_file, entry.val); - return; - } - - *swapf = p->swap_file->f_dentry->d_inode; + return &swap_info[type]; } /* -- cgit v1.2.3 From 8504e4792403def2b9d5e1a6c91c15a1840f39c8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 17 Jun 2002 20:19:26 -0700 Subject: [PATCH] fix loop driver for large BIOs Fix the loop driver for loop-on-blockdev setups. When presented with a multipage BIO, loop_make_request overindexes the first page and corrupts kernel memory. Fix it to walk the individual pages. BTW, I suspect the IV handling in loop may be incorrect for multipage BIOs. Should we not be recalculating the IV for each page in the BIOs, or incrementing the offset by the size of the preceding pages, or such? 
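For illustration only - this is not part of the patch: if per-page IV recalculation does turn out to be needed, the minimal change to the bio_transfer() helper added below would be to step the IV forward inside the segment walk. This sketch assumes the transfer functions want an IV counted in 512-byte sectors, as loop_get_iv()'s use of bi_sector suggests; it is untested:

	__bio_for_each_segment(from_bvec, from_bio, i, 0) {
		/* kmap/kunmap and pointer setup as in bio_transfer() */
		ret |= lo_do_transfer(lo, bio_data_dir(to_bio), vto, vfrom,
				      from_bvec->bv_len, IV);
		/* advance the IV by the sectors this segment covered,
		 * so the next page gets its own IV */
		IV += from_bvec->bv_len >> 9;
	}

Whether the existing transfer modules actually expect that is exactly the open question above.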
--- drivers/block/loop.c | 83 +++++++++++++++++++++++++++++----------------------- include/linux/loop.h | 8 ----- 2 files changed, 47 insertions(+), 44 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 5689de41b771..ecc87aa5a819 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -168,6 +168,15 @@ static void figure_loop_size(struct loop_device *lo) } +static inline int lo_do_transfer(struct loop_device *lo, int cmd, char *rbuf, + char *lbuf, int size, int rblock) +{ + if (!lo->transfer) + return 0; + + return lo->transfer(lo, cmd, rbuf, lbuf, size, rblock); +} + static int do_lo_send(struct loop_device *lo, struct bio_vec *bvec, int bsize, loff_t pos) { @@ -454,20 +463,43 @@ static struct bio *loop_get_buffer(struct loop_device *lo, struct bio *rbh) out_bh: bio->bi_sector = rbh->bi_sector + (lo->lo_offset >> 9); bio->bi_rw = rbh->bi_rw; - spin_lock_irq(&lo->lo_lock); bio->bi_bdev = lo->lo_device; - spin_unlock_irq(&lo->lo_lock); return bio; } -static int loop_make_request(request_queue_t *q, struct bio *rbh) +static int +bio_transfer(struct loop_device *lo, struct bio *to_bio, + struct bio *from_bio) { - struct bio *bh = NULL; + unsigned long IV = loop_get_iv(lo, from_bio->bi_sector); + struct bio_vec *from_bvec, *to_bvec; + char *vto, *vfrom; + int ret = 0, i; + + __bio_for_each_segment(from_bvec, from_bio, i, 0) { + to_bvec = &to_bio->bi_io_vec[i]; + + kmap(from_bvec->bv_page); + kmap(to_bvec->bv_page); + vfrom = page_address(from_bvec->bv_page) + from_bvec->bv_offset; + vto = page_address(to_bvec->bv_page) + to_bvec->bv_offset; + ret |= lo_do_transfer(lo, bio_data_dir(to_bio), vto, vfrom, + from_bvec->bv_len, IV); + kunmap(from_bvec->bv_page); + kunmap(to_bvec->bv_page); + } + + return ret; +} + +static int loop_make_request(request_queue_t *q, struct bio *old_bio) +{ + struct bio *new_bio = NULL; struct loop_device *lo; unsigned long IV; - int rw = bio_rw(rbh); - int unit = minor(to_kdev_t(rbh->bi_bdev->bd_dev)); + int rw = bio_rw(old_bio); + int unit = minor(to_kdev_t(old_bio->bi_bdev->bd_dev)); if (unit >= max_loop) goto out; @@ -489,60 +521,41 @@ static int loop_make_request(request_queue_t *q, struct bio *rbh) goto err; } - blk_queue_bounce(q, &rbh); + blk_queue_bounce(q, &old_bio); /* * file backed, queue for loop_thread to handle */ if (lo->lo_flags & LO_FLAGS_DO_BMAP) { - loop_add_bio(lo, rbh); + loop_add_bio(lo, old_bio); return 0; } /* * piggy old buffer on original, and submit for I/O */ - bh = loop_get_buffer(lo, rbh); - IV = loop_get_iv(lo, rbh->bi_sector); + new_bio = loop_get_buffer(lo, old_bio); + IV = loop_get_iv(lo, old_bio->bi_sector); if (rw == WRITE) { - if (lo_do_transfer(lo, WRITE, bio_data(bh), bio_data(rbh), - bh->bi_size, IV)) + if (bio_transfer(lo, new_bio, old_bio)) goto err; } - generic_make_request(bh); + generic_make_request(new_bio); return 0; err: if (atomic_dec_and_test(&lo->lo_pending)) up(&lo->lo_bh_mutex); - loop_put_buffer(bh); + loop_put_buffer(new_bio); out: - bio_io_error(rbh); + bio_io_error(old_bio); return 0; inactive: spin_unlock_irq(&lo->lo_lock); goto out; } -static int do_bio_blockbacked(struct loop_device *lo, struct bio *bio, - struct bio *rbh) -{ - unsigned long IV = loop_get_iv(lo, rbh->bi_sector); - struct bio_vec *from; - char *vto, *vfrom; - int ret = 0, i; - - bio_for_each_segment(from, rbh, i) { - vfrom = page_address(from->bv_page) + from->bv_offset; - vto = page_address(bio->bi_io_vec[i].bv_page) + bio->bi_io_vec[i].bv_offset; - ret |= lo_do_transfer(lo, bio_data_dir(bio), vto, vfrom, - 
from->bv_len, IV); - } - - return ret; -} - static inline void loop_handle_bio(struct loop_device *lo, struct bio *bio) { int ret; @@ -556,7 +569,7 @@ static inline void loop_handle_bio(struct loop_device *lo, struct bio *bio) } else { struct bio *rbh = bio->bi_private; - ret = do_bio_blockbacked(lo, bio, rbh); + ret = bio_transfer(lo, bio, rbh); bio_endio(rbh, !ret); loop_put_buffer(bio); @@ -588,10 +601,8 @@ static int loop_thread(void *data) set_user_nice(current, -20); - spin_lock_irq(&lo->lo_lock); lo->lo_state = Lo_bound; atomic_inc(&lo->lo_pending); - spin_unlock_irq(&lo->lo_lock); /* * up sem, we are running diff --git a/include/linux/loop.h b/include/linux/loop.h index d4dc0665a92d..4dfa8b14a586 100644 --- a/include/linux/loop.h +++ b/include/linux/loop.h @@ -62,14 +62,6 @@ typedef int (* transfer_proc_t)(struct loop_device *, int cmd, char *raw_buf, char *loop_buf, int size, int real_block); -static inline int lo_do_transfer(struct loop_device *lo, int cmd, char *rbuf, - char *lbuf, int size, int rblock) -{ - if (!lo->transfer) - return 0; - - return lo->transfer(lo, cmd, rbuf, lbuf, size, rblock); -} #endif /* __KERNEL__ */ /* -- cgit v1.2.3 From 9d8e6506454723f7df81399911ee31bed63e91ce Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 17 Jun 2002 20:19:40 -0700 Subject: [PATCH] kmap_atomic fix in bio_copy() bio_copy is doing vfrom = kmap_atomic(bv->bv_page, KM_BIO_IRQ); vto = kmap_atomic(bbv->bv_page, KM_BIO_IRQ); which, if I understand atomic kmaps, is incorrect. Both source and dest will get the same pte. The patch creates a separate atomic kmap member for the destination and source of this copy. --- fs/bio.c | 8 ++++---- fs/ntfs/aops.c | 16 ++++++++-------- include/asm-i386/kmap_types.h | 9 +++++---- include/asm-ppc/kmap_types.h | 3 ++- include/asm-sparc/kmap_types.h | 3 ++- include/asm-x86_64/kmap_types.h | 3 ++- include/linux/highmem.h | 4 ++-- 7 files changed, 25 insertions(+), 21 deletions(-) diff --git a/fs/bio.c b/fs/bio.c index e89734a07bea..00cd91f8aaa1 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -284,8 +284,8 @@ struct bio *bio_copy(struct bio *bio, int gfp_mask, int copy) vto = kmap(bbv->bv_page); } else { local_irq_save(flags); - vfrom = kmap_atomic(bv->bv_page, KM_BIO_IRQ); - vto = kmap_atomic(bbv->bv_page, KM_BIO_IRQ); + vfrom = kmap_atomic(bv->bv_page, KM_BIO_SRC_IRQ); + vto = kmap_atomic(bbv->bv_page, KM_BIO_DST_IRQ); } memcpy(vto + bbv->bv_offset, vfrom + bv->bv_offset, bv->bv_len); @@ -293,8 +293,8 @@ struct bio *bio_copy(struct bio *bio, int gfp_mask, int copy) kunmap(bbv->bv_page); kunmap(bv->bv_page); } else { - kunmap_atomic(vto, KM_BIO_IRQ); - kunmap_atomic(vfrom, KM_BIO_IRQ); + kunmap_atomic(vto, KM_BIO_DST_IRQ); + kunmap_atomic(vfrom, KM_BIO_SRC_IRQ); local_irq_restore(flags); } } diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index fbff42392bab..7c20a2949e96 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -61,10 +61,10 @@ static void end_buffer_read_file_async(struct buffer_head *bh, int uptodate) if (file_ofs < ni->initialized_size) ofs = ni->initialized_size - file_ofs; - addr = kmap_atomic(page, KM_BIO_IRQ); + addr = kmap_atomic(page, KM_BIO_SRC_IRQ); memset(addr + bh_offset(bh) + ofs, 0, bh->b_size - ofs); flush_dcache_page(page); - kunmap_atomic(addr, KM_BIO_IRQ); + kunmap_atomic(addr, KM_BIO_SRC_IRQ); } } else SetPageError(page); @@ -363,10 +363,10 @@ static void end_buffer_read_mftbmp_async(struct buffer_head *bh, int uptodate) if (file_ofs < vol->mftbmp_initialized_size) ofs = vol->mftbmp_initialized_size - file_ofs; - addr = 
kmap_atomic(page, KM_BIO_IRQ); + addr = kmap_atomic(page, KM_BIO_SRC_IRQ); memset(addr + bh_offset(bh) + ofs, 0, bh->b_size - ofs); flush_dcache_page(page); - kunmap_atomic(addr, KM_BIO_IRQ); + kunmap_atomic(addr, KM_BIO_SRC_IRQ); } } else SetPageError(page); @@ -559,10 +559,10 @@ static void end_buffer_read_mst_async(struct buffer_head *bh, int uptodate) if (file_ofs < ni->initialized_size) ofs = ni->initialized_size - file_ofs; - addr = kmap_atomic(page, KM_BIO_IRQ); + addr = kmap_atomic(page, KM_BIO_SRC_IRQ); memset(addr + bh_offset(bh) + ofs, 0, bh->b_size - ofs); flush_dcache_page(page); - kunmap_atomic(addr, KM_BIO_IRQ); + kunmap_atomic(addr, KM_BIO_SRC_IRQ); } } else SetPageError(page); @@ -593,7 +593,7 @@ static void end_buffer_read_mst_async(struct buffer_head *bh, int uptodate) rec_size = ni->_IDM(index_block_size); recs = PAGE_CACHE_SIZE / rec_size; - addr = kmap_atomic(page, KM_BIO_IRQ); + addr = kmap_atomic(page, KM_BIO_SRC_IRQ); for (i = 0; i < recs; i++) { if (!post_read_mst_fixup((NTFS_RECORD*)(addr + i * rec_size), rec_size)) @@ -607,7 +607,7 @@ static void end_buffer_read_mst_async(struct buffer_head *bh, int uptodate) ni->_IDM(index_block_size_bits)) + i)); } flush_dcache_page(page); - kunmap_atomic(addr, KM_BIO_IRQ); + kunmap_atomic(addr, KM_BIO_SRC_IRQ); if (likely(!nr_err && recs)) SetPageUptodate(page); else { diff --git a/include/asm-i386/kmap_types.h b/include/asm-i386/kmap_types.h index 9a12267d3a4f..0ae7bb3c2b8d 100644 --- a/include/asm-i386/kmap_types.h +++ b/include/asm-i386/kmap_types.h @@ -15,10 +15,11 @@ D(1) KM_SKB_SUNRPC_DATA, D(2) KM_SKB_DATA_SOFTIRQ, D(3) KM_USER0, D(4) KM_USER1, -D(5) KM_BIO_IRQ, -D(6) KM_PTE0, -D(7) KM_PTE1, -D(8) KM_TYPE_NR +D(5) KM_BIO_SRC_IRQ, +D(6) KM_BIO_DST_IRQ, +D(7) KM_PTE0, +D(8) KM_PTE1, +D(9) KM_TYPE_NR }; #undef D diff --git a/include/asm-ppc/kmap_types.h b/include/asm-ppc/kmap_types.h index 99fec407abf5..bce7fd8c1ff2 100644 --- a/include/asm-ppc/kmap_types.h +++ b/include/asm-ppc/kmap_types.h @@ -11,7 +11,8 @@ enum km_type { KM_SKB_DATA_SOFTIRQ, KM_USER0, KM_USER1, - KM_BIO_IRQ, + KM_BIO_SRC_IRQ, + KM_BIO_DST_IRQ, KM_PTE0, KM_PTE1, KM_TYPE_NR diff --git a/include/asm-sparc/kmap_types.h b/include/asm-sparc/kmap_types.h index 7e9a5661c698..bab20a2a676b 100644 --- a/include/asm-sparc/kmap_types.h +++ b/include/asm-sparc/kmap_types.h @@ -7,7 +7,8 @@ enum km_type { KM_SKB_DATA_SOFTIRQ, KM_USER0, KM_USER1, - KM_BIO_IRQ, + KM_BIO_SRC_IRQ, + KM_BIO_DST_IRQ, KM_TYPE_NR }; diff --git a/include/asm-x86_64/kmap_types.h b/include/asm-x86_64/kmap_types.h index 7e9a5661c698..bab20a2a676b 100644 --- a/include/asm-x86_64/kmap_types.h +++ b/include/asm-x86_64/kmap_types.h @@ -7,7 +7,8 @@ enum km_type { KM_SKB_DATA_SOFTIRQ, KM_USER0, KM_USER1, - KM_BIO_IRQ, + KM_BIO_SRC_IRQ, + KM_BIO_DST_IRQ, KM_TYPE_NR }; diff --git a/include/linux/highmem.h b/include/linux/highmem.h index da66723d62c5..54fb9176fb75 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -38,7 +38,7 @@ static inline char *bio_kmap_irq(struct bio *bio, unsigned long *flags) * it's a highmem page */ __cli(); - addr = (unsigned long) kmap_atomic(bio_page(bio), KM_BIO_IRQ); + addr = (unsigned long) kmap_atomic(bio_page(bio), KM_BIO_SRC_IRQ); if (addr & ~PAGE_MASK) BUG(); @@ -50,7 +50,7 @@ static inline void bio_kunmap_irq(char *buffer, unsigned long *flags) { unsigned long ptr = (unsigned long) buffer & PAGE_MASK; - kunmap_atomic((void *) ptr, KM_BIO_IRQ); + kunmap_atomic((void *) ptr, KM_BIO_SRC_IRQ); __restore_flags(*flags); } -- cgit v1.2.3 From 
1704566fde4fd7ea2be6e8a4a2e0731459d0fa48 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 17 Jun 2002 20:19:54 -0700 Subject: [PATCH] ext3: clean up journal_try_to_free_buffers() Clean up ext3's journal_try_to_free_buffers(). Now that the releasepage() a_op is non-blocking and need not perform I/O, this function becomes much simpler. --- fs/jbd/transaction.c | 65 ++++++++++++++-------------------------------------- 1 file changed, 17 insertions(+), 48 deletions(-) diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 04f15abd8cb6..37c9ed30ebfd 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -1601,8 +1601,7 @@ void journal_unfile_buffer(struct journal_head *jh) * * Returns non-zero iff we were able to free the journal_head. */ -static int __journal_try_to_free_buffer(struct buffer_head *bh, - int *locked_or_dirty) +static inline int __journal_try_to_free_buffer(struct buffer_head *bh) { struct journal_head *jh; @@ -1610,12 +1609,7 @@ static int __journal_try_to_free_buffer(struct buffer_head *bh, jh = bh2jh(bh); - if (buffer_locked(bh) || buffer_dirty(bh)) { - *locked_or_dirty = 1; - goto out; - } - - if (!buffer_uptodate(bh)) /* AKPM: why? */ + if (buffer_locked(bh) || buffer_dirty(bh)) goto out; if (jh->b_next_transaction != 0) @@ -1630,8 +1624,7 @@ static int __journal_try_to_free_buffer(struct buffer_head *bh, __journal_remove_journal_head(bh); __brelse(bh); } - } - else if (jh->b_cp_transaction != 0 && jh->b_transaction == 0) { + } else if (jh->b_cp_transaction != 0 && jh->b_transaction == 0) { /* written-back checkpointed metadata buffer */ if (jh->b_jlist == BJ_None) { JBUFFER_TRACE(jh, "remove from checkpoint list"); @@ -1647,10 +1640,8 @@ out: } /* - * journal_try_to_free_buffers(). For all the buffers on this page, - * if they are fully written out ordered data, move them onto BUF_CLEAN - * so try_to_free_buffers() can reap them. Called with lru_list_lock - * not held. Does its own locking. + * journal_try_to_free_buffers(). Try to remove all this page's buffers + * from the journal. * * This complicates JBD locking somewhat. We aren't protected by the * BKL here. We wish to remove the buffer from its committing or @@ -1669,50 +1660,28 @@ out: * journal_try_to_free_buffer() is changing its state. But that * cannot happen because we never reallocate freed data as metadata * while the data is part of a transaction. Yes? - * - * This function returns non-zero if we wish try_to_free_buffers() - * to be called. We do this is the page is releasable by try_to_free_buffers(). - * We also do it if the page has locked or dirty buffers and the caller wants - * us to perform sync or async writeout. 
*/ int journal_try_to_free_buffers(journal_t *journal, - struct page *page, int gfp_mask) + struct page *page, int unused_gfp_mask) { + struct buffer_head *head; struct buffer_head *bh; - struct buffer_head *tmp; - int locked_or_dirty = 0; - int call_ttfb = 1; - int ret; + int ret = 0; J_ASSERT(PageLocked(page)); - bh = page_buffers(page); - tmp = bh; + head = page_buffers(page); + bh = head; spin_lock(&journal_datalist_lock); do { - struct buffer_head *p = tmp; - - tmp = tmp->b_this_page; - if (buffer_jbd(p)) - if (!__journal_try_to_free_buffer(p, &locked_or_dirty)) - call_ttfb = 0; - } while (tmp != bh); + if (buffer_jbd(bh) && !__journal_try_to_free_buffer(bh)) { + spin_unlock(&journal_datalist_lock); + goto busy; + } + } while ((bh = bh->b_this_page) != head); spin_unlock(&journal_datalist_lock); - - if (!(gfp_mask & (__GFP_IO|__GFP_WAIT))) - goto out; - if (!locked_or_dirty) - goto out; - /* - * The VM wants us to do writeout, or to block on IO, or both. - * So we allow try_to_free_buffers to be called even if the page - * still has journalled buffers. - */ - call_ttfb = 1; -out: - ret = 0; - if (call_ttfb) - ret = try_to_free_buffers(page); + ret = try_to_free_buffers(page); +busy: return ret; } -- cgit v1.2.3 From c67b85b06126b960a44c1dcdf809cf9e947f4a1c Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 17 Jun 2002 20:20:09 -0700 Subject: [PATCH] clean up alloc_buffer_head() alloc_buffer_head() does not need the additional argument - GFP_NOFS is always correct. --- fs/buffer.c | 14 +++++++------- fs/jbd/journal.c | 2 +- include/linux/buffer_head.h | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 5d9dee75f287..70025ee603a6 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -945,12 +945,12 @@ void invalidate_inode_buffers(struct inode *inode) * the size of each buffer.. Use the bh->b_this_page linked list to * follow the buffers created. Return NULL if unable to create more * buffers. - * The async flag is used to differentiate async IO (paging, swapping) - * from ordinary buffer allocations, and only async requests are allowed - * to sleep waiting for buffer heads. + * + * The retry flag is used to differentiate async IO (paging, swapping) + * which may not fail from ordinary buffer allocations. */ static struct buffer_head * -create_buffers(struct page * page, unsigned long size, int async) +create_buffers(struct page * page, unsigned long size, int retry) { struct buffer_head *bh, *head; long offset; @@ -959,7 +959,7 @@ try_again: head = NULL; offset = PAGE_SIZE; while ((offset -= size) >= 0) { - bh = alloc_buffer_head(async); + bh = alloc_buffer_head(); if (!bh) goto no_grow; @@ -996,7 +996,7 @@ no_grow: * become available. But we don't want tasks sleeping with * partially complete buffers, so all were released above. */ - if (!async) + if (!retry) return NULL; /* We're _really_ low on memory. Now we just @@ -2392,7 +2392,7 @@ asmlinkage long sys_bdflush(int func, long data) static kmem_cache_t *bh_cachep; static mempool_t *bh_mempool; -struct buffer_head *alloc_buffer_head(int async) +struct buffer_head *alloc_buffer_head(void) { return mempool_alloc(bh_mempool, GFP_NOFS); } diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 052dd4ef3f01..ade37ad43606 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -463,7 +463,7 @@ int journal_write_metadata_buffer(transaction_t *transaction, * Right, time to make up the new buffer_head.
*/ do { - new_bh = alloc_buffer_head(0); + new_bh = alloc_buffer_head(); if (!new_bh) { printk (KERN_NOTICE "%s: ENOMEM at alloc_buffer_head, " "trying again.\n", __FUNCTION__); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index fda967ab9358..5df63727fa7a 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -181,7 +181,7 @@ void __brelse(struct buffer_head *); void __bforget(struct buffer_head *); struct buffer_head * __bread(struct block_device *, int, int); void wakeup_bdflush(void); -struct buffer_head *alloc_buffer_head(int async); +struct buffer_head *alloc_buffer_head(void); void free_buffer_head(struct buffer_head * bh); void FASTCALL(unlock_buffer(struct buffer_head *bh)); -- cgit v1.2.3 From a28b4d4ede7e5f9bcde157417957998571b7a639 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 17 Jun 2002 20:20:24 -0700 Subject: [PATCH] take bio.h out of highmem.h highmem.h includes bio.h, so just about every compilation unit in the kernel gets to process bio.h. The patch moves the BIO-related functions out of highmem.h and into bio-related headers. The nested include is removed and all files which need to include bio.h now do so. --- drivers/block/DAC960.c | 1 + drivers/block/cciss.c | 1 + drivers/block/cpqarray.c | 1 + drivers/block/elevator.c | 1 + drivers/block/floppy.c | 1 + drivers/block/ll_rw_blk.c | 1 + drivers/block/loop.c | 1 + drivers/block/nbd.c | 1 + drivers/block/rd.c | 2 ++ drivers/block/umem.c | 1 + drivers/md/linear.c | 2 +- drivers/md/lvm.c | 1 + drivers/md/md.c | 1 + drivers/md/multipath.c | 1 + drivers/md/raid0.c | 1 + drivers/md/raid1.c | 1 + drivers/md/raid5.c | 1 + drivers/scsi/cpqfcTSinit.c | 1 + drivers/scsi/scsi_lib.c | 1 + drivers/scsi/sd.c | 1 + drivers/scsi/sr.c | 1 + fs/bio.c | 1 + fs/jfs/jfs_logmgr.c | 1 + include/linux/bio.h | 50 ++++++++++++++++++++++++++++++++++++++++++--- include/linux/blkdev.h | 7 +------ include/linux/buffer_head.h | 5 ----- include/linux/highmem.h | 44 --------------------------------------- include/linux/ide.h | 1 + include/linux/raid/raid5.h | 1 + kernel/ksyms.c | 2 +- mm/highmem.c | 5 ++++- 31 files changed, 79 insertions(+), 61 deletions(-) diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index d57dc51df3f5..210449ad1715 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 9ae961460ff2..e06fd274b653 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c index 727cdeb23c0c..fccef1bb792c 100644 --- a/drivers/block/cpqarray.c +++ b/drivers/block/cpqarray.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/block/elevator.c b/drivers/block/elevator.c index 189814dbc7d1..cd3a4254e9e3 100644 --- a/drivers/block/elevator.c +++ b/drivers/block/elevator.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 94f42b356556..aff8acff0ef3 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -165,6 +165,7 @@ static int print_unex=1; #include #include #include +#include #include #include #include diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index 3527afa3cae7..16abcb3f5481 100644 --- 
a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/block/loop.c b/drivers/block/loop.c index ecc87aa5a819..982604ff6bfd 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 67344c7fcc1a..697e825c3a91 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/block/rd.c b/drivers/block/rd.c index 4faf52c7be5c..7b60e75d5584 100644 --- a/drivers/block/rd.c +++ b/drivers/block/rd.c @@ -45,6 +45,8 @@ #include #include #include +#include +#include #include #include #include diff --git a/drivers/block/umem.c b/drivers/block/umem.c index 8c61688cab1c..44909021aa06 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 118ce821a208..48fb74e50d5c 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -20,7 +20,7 @@ #include #include - +#include #include #define MAJOR_NR MD_MAJOR diff --git a/drivers/md/lvm.c b/drivers/md/lvm.c index dfc256c6a2ec..c44a1b8a74b2 100644 --- a/drivers/md/lvm.c +++ b/drivers/md/lvm.c @@ -209,6 +209,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/md/md.c b/drivers/md/md.c index 21e20ea10be7..d23270322804 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 46f089ee8481..6db555317b13 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 430448c566af..8f149a1efe1b 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -20,6 +20,7 @@ #include #include +#include #define MAJOR_NR MD_MAJOR #define MD_DRIVER diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 43fdb75de0fe..96ad858cf033 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -23,6 +23,7 @@ */ #include +#include #define MAJOR_NR MD_MAJOR #define MD_DRIVER diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 9402b0c779b9..62873d89e395 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include diff --git a/drivers/scsi/cpqfcTSinit.c b/drivers/scsi/cpqfcTSinit.c index e6f03847c212..f38e377207c7 100644 --- a/drivers/scsi/cpqfcTSinit.c +++ b/drivers/scsi/cpqfcTSinit.c @@ -39,6 +39,7 @@ #include #include #include +#include #include // request_region() prototype #include // ioremap() //#if LINUX_VERSION_CODE >= LinuxVersionCode(2,4,7) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index fc69760ab484..bede96547efb 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 382e04ceace2..63fe305e4342 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index d536f3bc94f6..0e28dc69652b 100644 --- 
a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/bio.c b/fs/bio.c index 00cd91f8aaa1..5fdae32e35ae 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -17,6 +17,7 @@ * */ #include +#include #include #include #include diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index ea37f1c39a64..7790f413096a 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -65,6 +65,7 @@ #include #include #include /* for sync_blockdev() */ +#include #include "jfs_incore.h" #include "jfs_filsys.h" #include "jfs_metapage.h" diff --git a/include/linux/bio.h b/include/linux/bio.h index b244108a27a8..ffc38fca9c1e 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -21,6 +21,8 @@ #define __LINUX_BIO_H #include +#include + /* Platforms may set this to teach the BIO layer about IOMMU hardware. */ #include #ifndef BIO_VMERGE_BOUNDARY @@ -47,9 +49,6 @@ struct bio_vec { unsigned int bv_offset; }; -/* - * weee, c forward decl... - */ struct bio; typedef void (bio_end_io_t) (struct bio *); typedef void (bio_destructor_t) (struct bio *); @@ -206,4 +205,49 @@ extern inline void bio_init(struct bio *); extern int bio_ioctl(kdev_t, unsigned int, unsigned long); +#ifdef CONFIG_HIGHMEM +/* + * remember to add offset! and never ever reenable interrupts between a + * bio_kmap_irq and bio_kunmap_irq!! + * + * This function MUST be inlined - it plays with the CPU interrupt flags. + * Hence the `extern inline'. + */ +extern inline char *bio_kmap_irq(struct bio *bio, unsigned long *flags) +{ + unsigned long addr; + + __save_flags(*flags); + + /* + * could be low + */ + if (!PageHighMem(bio_page(bio))) + return bio_data(bio); + + /* + * it's a highmem page + */ + __cli(); + addr = (unsigned long) kmap_atomic(bio_page(bio), KM_BIO_SRC_IRQ); + + if (addr & ~PAGE_MASK) + BUG(); + + return (char *) addr + bio_offset(bio); +} + +extern inline void bio_kunmap_irq(char *buffer, unsigned long *flags) +{ + unsigned long ptr = (unsigned long) buffer & PAGE_MASK; + + kunmap_atomic((void *) ptr, KM_BIO_SRC_IRQ); + __restore_flags(*flags); +} + +#else +#define bio_kmap_irq(bio, flags) (bio_data(bio)) +#define bio_kunmap_irq(buf, flags) do { *(flags) = 0; } while (0) +#endif + #endif /* __LINUX_BIO_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ef86a3ed6e64..c0c099834df2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -246,12 +246,7 @@ extern unsigned long blk_max_low_pfn, blk_max_pfn; #define BLK_BOUNCE_ISA (ISA_DMA_THRESHOLD) extern int init_emergency_isa_pool(void); -extern void create_bounce(unsigned long pfn, int gfp, struct bio **bio_orig); - -extern inline void blk_queue_bounce(request_queue_t *q, struct bio **bio) -{ - create_bounce(q->bounce_pfn, q->bounce_gfp, bio); -} +void blk_queue_bounce(request_queue_t *q, struct bio **bio); #define rq_for_each_bio(bio, rq) \ if ((rq->bio)) \ diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 5df63727fa7a..96207c0d6dce 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -108,12 +108,7 @@ BUFFER_FNS(Async_Read, async_read) BUFFER_FNS(Async_Write, async_write) BUFFER_FNS(Boundary, boundary) -/* - * FIXME: this is used only by bh_kmap, which is used only by RAID5. 
- * Move all that stuff into raid5.c - */ #define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK) - #define touch_buffer(bh) mark_page_accessed(bh->b_page) /* If we *know* page->private refers to buffer_heads */ diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 54fb9176fb75..68c841afc622 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -2,7 +2,6 @@ #define _LINUX_HIGHMEM_H #include -#include #include #include @@ -15,45 +14,8 @@ extern struct page *highmem_start_page; /* declarations for linux/mm/highmem.c */ unsigned int nr_free_highpages(void); -extern void create_bounce(unsigned long pfn, int gfp, struct bio **bio_orig); extern void check_highmem_ptes(void); -/* - * remember to add offset! and never ever reenable interrupts between a - * bio_kmap_irq and bio_kunmap_irq!! - */ -static inline char *bio_kmap_irq(struct bio *bio, unsigned long *flags) -{ - unsigned long addr; - - __save_flags(*flags); - - /* - * could be low - */ - if (!PageHighMem(bio_page(bio))) - return bio_data(bio); - - /* - * it's a highmem page - */ - __cli(); - addr = (unsigned long) kmap_atomic(bio_page(bio), KM_BIO_SRC_IRQ); - - if (addr & ~PAGE_MASK) - BUG(); - - return (char *) addr + bio_offset(bio); -} - -static inline void bio_kunmap_irq(char *buffer, unsigned long *flags) -{ - unsigned long ptr = (unsigned long) buffer & PAGE_MASK; - - kunmap_atomic((void *) ptr, KM_BIO_SRC_IRQ); - __restore_flags(*flags); -} - #else /* CONFIG_HIGHMEM */ static inline unsigned int nr_free_highpages(void) { return 0; } @@ -65,12 +27,6 @@ static inline void *kmap(struct page *page) { return page_address(page); } #define kmap_atomic(page,idx) kmap(page) #define kunmap_atomic(page,idx) kunmap(page) -#define bh_kmap(bh) ((bh)->b_data) -#define bh_kunmap(bh) do { } while (0) - -#define bio_kmap_irq(bio, flags) (bio_data(bio)) -#define bio_kunmap_irq(buf, flags) do { *(flags) = 0; } while (0) - #endif /* CONFIG_HIGHMEM */ /* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ diff --git a/include/linux/ide.h b/include/linux/ide.h index e07d0f19fcd1..03c21c567ce4 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h index 5c25120581a7..67f7bf471798 100644 --- a/include/linux/raid/raid5.h +++ b/include/linux/raid/raid5.h @@ -3,6 +3,7 @@ #include #include +#include /* * diff --git a/kernel/ksyms.c b/kernel/ksyms.c index 34a50cd558e9..4e554c38c7f5 100644 --- a/kernel/ksyms.c +++ b/kernel/ksyms.c @@ -120,7 +120,7 @@ EXPORT_SYMBOL(vmtruncate); EXPORT_SYMBOL(find_vma); EXPORT_SYMBOL(get_unmapped_area); EXPORT_SYMBOL(init_mm); -EXPORT_SYMBOL(create_bounce); +EXPORT_SYMBOL(blk_queue_bounce); #ifdef CONFIG_HIGHMEM EXPORT_SYMBOL(kmap_high); EXPORT_SYMBOL(kunmap_high); diff --git a/mm/highmem.c b/mm/highmem.c index de5ebeb0a167..ae9c5a26376b 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -17,6 +17,7 @@ */ #include +#include #include #include #include @@ -347,13 +348,15 @@ static void bounce_end_io_read_isa(struct bio *bio) return __bounce_end_io_read(bio, isa_page_pool); } -void create_bounce(unsigned long pfn, int gfp, struct bio **bio_orig) +void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig) { struct page *page; struct bio *bio = NULL; int i, rw = bio_data_dir(*bio_orig), bio_gfp; struct bio_vec *to, *from; mempool_t *pool; + unsigned long pfn = q->bounce_pfn; + int gfp = q->bounce_gfp; BUG_ON((*bio_orig)->bi_idx); -- 
cgit v1.2.3 From 38cb52ca07ab22719026ffdfe1db794ed8e9b73b Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 17 Jun 2002 20:20:38 -0700 Subject: [PATCH] remove set_page_buffers() and clear_page_buffers() The set_page_buffers() and clear_page_buffers() macros are each used in only one place. Fold them into their callers. --- fs/buffer.c | 6 ++++-- include/linux/buffer_head.h | 10 ---------- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 70025ee603a6..a47863b3bd8a 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -152,14 +152,16 @@ __set_page_buffers(struct page *page, struct buffer_head *head) { if (page_has_buffers(page)) buffer_error(); - set_page_buffers(page, head); page_cache_get(page); + SetPagePrivate(page); + page->private = (unsigned long)head; } static inline void __clear_page_buffers(struct page *page) { - clear_page_buffers(page); + ClearPagePrivate(page); + page->private = 0; page_cache_release(page); } diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 96207c0d6dce..903850460ab5 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -119,16 +119,6 @@ BUFFER_FNS(Boundary, boundary) ((struct buffer_head *)(page)->private); \ }) #define page_has_buffers(page) PagePrivate(page) -#define set_page_buffers(page, buffers) \ - do { \ - SetPagePrivate(page); \ - page->private = (unsigned long)buffers; \ - } while (0) -#define clear_page_buffers(page) \ - do { \ - ClearPagePrivate(page); \ - page->private = 0; \ - } while (0) #define invalidate_buffers(dev) __invalidate_buffers((dev), 0) #define destroy_buffers(dev) __invalidate_buffers((dev), 1) -- cgit v1.2.3 From 493f4988d640a73337df91f2c63e94c78ecd5e97 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 17 Jun 2002 20:20:53 -0700 Subject: [PATCH] allow GFP_NOFS allocators to perform swapcache writeout One weakness which was introduced when the buffer LRU went away was that GFP_NOFS allocations became equivalent to GFP_NOIO. Because all writeback goes via writepage/writepages, which requires entry into the filesystem. However now that swapout no longer calls bmap(), we can honour GFP_NOFS's intent for swapcache pages. So if the allocation request specifies __GFP_IO and !__GFP_FS, we can wait on swapcache pages and we can perform swapcache writeout. This should strengthen the VM somewhat. --- mm/vmscan.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 91f180f2b08a..6561f2b71b35 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -392,7 +392,8 @@ shrink_cache(int nr_pages, zone_t *classzone, spin_lock(&pagemap_lru_lock); while (--max_scan >= 0 && (entry = inactive_list.prev) != &inactive_list) { - struct page * page; + struct page *page; + int may_enter_fs; if (need_resched()) { spin_unlock(&pagemap_lru_lock); @@ -426,11 +427,18 @@ shrink_cache(int nr_pages, zone_t *classzone, if (!PagePrivate(page) && (page_count(page) != 1 || !page->mapping)) goto page_mapped; + /* + * swap activity never enters the filesystem and is safe + * for GFP_NOFS allocations. + */ + may_enter_fs = (gfp_mask & __GFP_FS) || + (PageSwapCache(page) && (gfp_mask & __GFP_IO)); + /* * IO in progress? Leave it at the back of the list. 
*/ if (unlikely(PageWriteback(page))) { - if (gfp_mask & __GFP_FS) { + if (may_enter_fs) { page_cache_get(page); spin_unlock(&pagemap_lru_lock); wait_on_page_writeback(page); @@ -451,7 +459,7 @@ shrink_cache(int nr_pages, zone_t *classzone, mapping = page->mapping; if (PageDirty(page) && is_page_cache_freeable(page) && - page->mapping && (gfp_mask & __GFP_FS)) { + page->mapping && may_enter_fs) { /* * It is not critical here to write it only if * the page is unmapped beause any direct writer @@ -480,6 +488,15 @@ shrink_cache(int nr_pages, zone_t *classzone, * If the page has buffers, try to free the buffer mappings * associated with this page. If we succeed we try to free * the page as well. + * + * We do this even if the page is PageDirty(). + * try_to_release_page() does not perform I/O, but it is + * possible for a page to have PageDirty set, but it is actually + * clean (all its buffers are clean). This happens if the + * buffers were written out directly, with submit_bh(). ext3 + * will do this, as well as the blockdev mapping. + * try_to_release_page() will discover that cleanness and will + * drop the buffers and mark the page clean - it can be freed. */ if (PagePrivate(page)) { spin_unlock(&pagemap_lru_lock); -- cgit v1.2.3 From 3fb3b749ba712dd4197b585d654d233d3487d9d4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 17 Jun 2002 20:21:09 -0700 Subject: [PATCH] rename get_hash_table() to find_get_block() Renames the buffer_head lookup function `get_hash_table' to `find_get_block'. get_hash_table() is too generic a name. Plus it doesn't even use a hash any more. --- drivers/md/lvm-snap.c | 2 +- fs/buffer.c | 10 +++++----- fs/ext3/balloc.c | 4 ++-- fs/ext3/inode.c | 2 +- fs/jbd/revoke.c | 6 +++--- fs/qnx4/fsync.c | 2 +- fs/reiserfs/fix_node.c | 2 +- fs/reiserfs/journal.c | 8 ++++---- fs/ufs/truncate.c | 8 ++++---- include/linux/buffer_head.h | 6 +++--- include/linux/reiserfs_fs.h | 2 +- kernel/ksyms.c | 2 +- 12 files changed, 27 insertions(+), 27 deletions(-) diff --git a/drivers/md/lvm-snap.c b/drivers/md/lvm-snap.c index c90947fc5f89..46df5c8ff0ef 100644 --- a/drivers/md/lvm-snap.c +++ b/drivers/md/lvm-snap.c @@ -224,7 +224,7 @@ static inline void invalidate_snap_cache(unsigned long start, unsigned long nr, for (i = 0; i < nr; i++) { - bh = get_hash_table(dev, start++, blksize); + bh = find_get_block(dev, start++, blksize); if (bh) bforget(bh); } diff --git a/fs/buffer.c b/fs/buffer.c index a47863b3bd8a..dde8e7d9bae6 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -378,7 +378,7 @@ out: } /* - * Various filesystems appear to want __get_hash_table to be non-blocking. + * Various filesystems appear to want __find_get_block to be non-blocking. * But it's the page lock which protects the buffers. To get around this, * we get exclusion from try_to_free_buffers with the blockdev mapping's * private_lock. @@ -389,7 +389,7 @@ out: * private_lock is contended then so is mapping->page_lock). */ struct buffer_head * -__get_hash_table(struct block_device *bdev, sector_t block, int unused) +__find_get_block(struct block_device *bdev, sector_t block, int unused) { struct inode *bd_inode = bdev->bd_inode; struct address_space *bd_mapping = bd_inode->i_mapping; @@ -1091,7 +1091,7 @@ grow_dev_page(struct block_device *bdev, unsigned long block, /* * Link the page to the buffers and initialise them. Take the - * lock to be atomic wrt __get_hash_table(), which does not + * lock to be atomic wrt __find_get_block(), which does not * run under the page lock. 
*/ spin_lock(&inode->i_mapping->private_lock); @@ -1164,7 +1164,7 @@ __getblk(struct block_device *bdev, sector_t block, int size) for (;;) { struct buffer_head * bh; - bh = __get_hash_table(bdev, block, size); + bh = __find_get_block(bdev, block, size); if (bh) { touch_buffer(bh); return bh; @@ -1449,7 +1449,7 @@ void unmap_underlying_metadata(struct block_device *bdev, sector_t block) { struct buffer_head *old_bh; - old_bh = __get_hash_table(bdev, block, 0); + old_bh = __find_get_block(bdev, block, 0); if (old_bh) { #if 0 /* This happens. Later. */ if (buffer_dirty(old_bh)) diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index f8f6828d5f59..c5cc2178ad4a 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -352,7 +352,7 @@ do_more: #ifdef CONFIG_JBD_DEBUG { struct buffer_head *debug_bh; - debug_bh = sb_get_hash_table(sb, block + i); + debug_bh = sb_find_get_block(sb, block + i); if (debug_bh) { BUFFER_TRACE(debug_bh, "Deleted!"); if (!bh2jh(bitmap_bh)->b_committed_data) @@ -701,7 +701,7 @@ got_block: struct buffer_head *debug_bh; /* Record bitmap buffer state in the newly allocated block */ - debug_bh = sb_get_hash_table(sb, tmp); + debug_bh = sb_find_get_block(sb, tmp); if (debug_bh) { BUFFER_TRACE(debug_bh, "state when allocated"); BUFFER_TRACE2(debug_bh, bh, "bitmap state"); diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index b339c253628e..a9b2c7beb70b 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1650,7 +1650,7 @@ ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh, struct buffer_head *bh; *p = 0; - bh = sb_get_hash_table(inode->i_sb, nr); + bh = sb_find_get_block(inode->i_sb, nr); ext3_forget(handle, 0, inode, bh, nr); } } diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c index 7cecb0237988..6a6464533c35 100644 --- a/fs/jbd/revoke.c +++ b/fs/jbd/revoke.c @@ -293,7 +293,7 @@ int journal_revoke(handle_t *handle, unsigned long blocknr, bh = bh_in; if (!bh) { - bh = __get_hash_table(bdev, blocknr, journal->j_blocksize); + bh = __find_get_block(bdev, blocknr, journal->j_blocksize); if (bh) BUFFER_TRACE(bh, "found on hash"); } @@ -303,7 +303,7 @@ int journal_revoke(handle_t *handle, unsigned long blocknr, /* If there is a different buffer_head lying around in * memory anywhere... */ - bh2 = __get_hash_table(bdev, blocknr, journal->j_blocksize); + bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize); if (bh2) { /* ... and it has RevokeValid status... */ if ((bh2 != bh) && @@ -407,7 +407,7 @@ int journal_cancel_revoke(handle_t *handle, struct journal_head *jh) * state machine will get very upset later on. */ if (need_cancel) { struct buffer_head *bh2; - bh2 = __get_hash_table(bh->b_bdev, bh->b_blocknr, bh->b_size); + bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size); if (bh2) { if (bh2 != bh) clear_bit(BH_Revoked, &bh2->b_state); diff --git a/fs/qnx4/fsync.c b/fs/qnx4/fsync.c index 2bb315473ee6..df5bc75d5414 100644 --- a/fs/qnx4/fsync.c +++ b/fs/qnx4/fsync.c @@ -37,7 +37,7 @@ static int sync_block(struct inode *inode, unsigned short *block, int wait) if (!*block) return 0; tmp = *block; - bh = sb_get_hash_table(inode->i_sb, *block); + bh = sb_find_get_block(inode->i_sb, *block); if (!bh) return 0; if (*block != tmp) { diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c index 0bdb34c5acf4..1cdcd39a06bd 100644 --- a/fs/reiserfs/fix_node.c +++ b/fs/reiserfs/fix_node.c @@ -920,7 +920,7 @@ static int is_left_neighbor_in_cache( /* Get left neighbor block number. 
*/ n_left_neighbor_blocknr = B_N_CHILD_NUM(p_s_tb->FL[n_h], n_left_neighbor_position); /* Look for the left neighbor in the cache. */ - if ( (left = sb_get_hash_table(p_s_sb, n_left_neighbor_blocknr)) ) { + if ( (left = sb_find_get_block(p_s_sb, n_left_neighbor_blocknr)) ) { RFALSE( buffer_uptodate (left) && ! B_IS_IN_TREE(left), "vs-8170: left neighbor (%b %z) is not in the tree", left, left); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index c16dbdc12ca6..2cf16631e224 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -689,7 +689,7 @@ retry: count = 0 ; for (i = 0 ; atomic_read(&(jl->j_commit_left)) > 1 && i < (jl->j_len + 1) ; i++) { /* everything but commit_bh */ bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start+i) % SB_ONDISK_JOURNAL_SIZE(s); - tbh = journal_get_hash_table(s, bn) ; + tbh = journal_find_get_block(s, bn) ; /* kill this sanity check */ if (count > (orig_commit_left + 2)) { @@ -718,7 +718,7 @@ reiserfs_panic(s, "journal-539: flush_commit_list: BAD count(%d) > orig_commit_l for (i = 0 ; atomic_read(&(jl->j_commit_left)) > 1 && i < (jl->j_len + 1) ; i++) { /* everything but commit_bh */ bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s) ; - tbh = journal_get_hash_table(s, bn) ; + tbh = journal_find_get_block(s, bn) ; wait_on_buffer(tbh) ; if (!buffer_uptodate(tbh)) { @@ -2764,7 +2764,7 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th, struct super_bloc int cleaned = 0 ; if (reiserfs_dont_log(th->t_super)) { - bh = sb_get_hash_table(p_s_sb, blocknr) ; + bh = sb_find_get_block(p_s_sb, blocknr) ; if (bh && buffer_dirty (bh)) { printk ("journal_mark_freed(dont_log): dirty buffer on hash list: %lx %ld\n", bh->b_state, blocknr); BUG (); @@ -2772,7 +2772,7 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th, struct super_bloc brelse (bh); return 0 ; } - bh = sb_get_hash_table(p_s_sb, blocknr) ; + bh = sb_find_get_block(p_s_sb, blocknr) ; /* if it is journal new, we just remove it from this transaction */ if (bh && buffer_journal_new(bh)) { mark_buffer_notjournal_new(bh) ; diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index f8134d41d98e..6b87c6f26702 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -117,7 +117,7 @@ static int ufs_trunc_direct (struct inode * inode) frag1 = ufs_fragnum (frag1); frag2 = ufs_fragnum (frag2); for (j = frag1; j < frag2; j++) { - bh = sb_get_hash_table (sb, tmp + j); + bh = sb_find_get_block (sb, tmp + j); if ((bh && DATA_BUFFER_USED(bh)) || tmp != fs32_to_cpu(sb, *p)) { retry = 1; brelse (bh); @@ -140,7 +140,7 @@ next1: if (!tmp) continue; for (j = 0; j < uspi->s_fpb; j++) { - bh = sb_get_hash_table(sb, tmp + j); + bh = sb_find_get_block(sb, tmp + j); if ((bh && DATA_BUFFER_USED(bh)) || tmp != fs32_to_cpu(sb, *p)) { retry = 1; brelse (bh); @@ -179,7 +179,7 @@ next2:; ufs_panic(sb, "ufs_truncate_direct", "internal error"); frag4 = ufs_fragnum (frag4); for (j = 0; j < frag4; j++) { - bh = sb_get_hash_table (sb, tmp + j); + bh = sb_find_get_block (sb, tmp + j); if ((bh && DATA_BUFFER_USED(bh)) || tmp != fs32_to_cpu(sb, *p)) { retry = 1; brelse (bh); @@ -238,7 +238,7 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, u32 * p) if (!tmp) continue; for (j = 0; j < uspi->s_fpb; j++) { - bh = sb_get_hash_table(sb, tmp + j); + bh = sb_find_get_block(sb, tmp + j); if ((bh && DATA_BUFFER_USED(bh)) || tmp != fs32_to_cpu(sb, *ind)) { retry = 1; brelse (bh); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 
903850460ab5..4fc6bab55825 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -160,7 +160,7 @@ int fsync_dev(kdev_t); int fsync_bdev(struct block_device *); int fsync_super(struct super_block *); int fsync_no_super(struct block_device *); -struct buffer_head *__get_hash_table(struct block_device *, sector_t, int); +struct buffer_head *__find_get_block(struct block_device *, sector_t, int); struct buffer_head * __getblk(struct block_device *, sector_t, int); void __brelse(struct buffer_head *); void __bforget(struct buffer_head *); @@ -254,9 +254,9 @@ static inline struct buffer_head * sb_getblk(struct super_block *sb, int block) } static inline struct buffer_head * -sb_get_hash_table(struct super_block *sb, int block) +sb_find_get_block(struct super_block *sb, int block) { - return __get_hash_table(sb->s_bdev, block, sb->s_blocksize); + return __find_get_block(sb->s_bdev, block, sb->s_blocksize); } static inline void diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h index 4a3d16d7b8dc..29f6063b3546 100644 --- a/include/linux/reiserfs_fs.h +++ b/include/linux/reiserfs_fs.h @@ -1651,7 +1651,7 @@ extern wait_queue_head_t reiserfs_commit_thread_wait ; #define JOURNAL_BUFFER(j,n) ((j)->j_ap_blocks[((j)->j_start + (n)) % JOURNAL_BLOCK_COUNT]) // We need these to make journal.c code more readable -#define journal_get_hash_table(s, block) __get_hash_table(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize) +#define journal_find_get_block(s, block) __find_get_block(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize) #define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize) #define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize) diff --git a/kernel/ksyms.c b/kernel/ksyms.c index 4e554c38c7f5..8b2511787ccb 100644 --- a/kernel/ksyms.c +++ b/kernel/ksyms.c @@ -551,7 +551,7 @@ EXPORT_SYMBOL(file_fsync); EXPORT_SYMBOL(fsync_buffers_list); EXPORT_SYMBOL(clear_inode); EXPORT_SYMBOL(init_special_inode); -EXPORT_SYMBOL(__get_hash_table); +EXPORT_SYMBOL(__find_get_block); EXPORT_SYMBOL(new_inode); EXPORT_SYMBOL(__insert_inode_hash); EXPORT_SYMBOL(remove_inode_hash); -- cgit v1.2.3 From df01cd1777d43744c13ba18ce48739185d20103a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 17 Jun 2002 20:21:21 -0700 Subject: [PATCH] Reduce the radix tree nodes to 64 slots Reduce the radix tree nodes from 128 slots to 64. - The main reason for this is that on 64-bit/4k page machines, the slab allocator has decided that radix tree nodes will require an order-1 allocation. Shrinking the nodes to 64 slots pulls that back to an order-0 allocation. - On x86 we get fifteen 64-slot nodes per page rather than seven 128-slot nodes, for a modest memory saving. - Halving the node size will approximately halve the memory use in the worrisome really-large, really-sparse file case. Of course, the downside is longer tree walks. Each level of the tree covers six bits of pagecache index rather than seven. As ever, I am guided by Anton's profiling on the 12- and 32-way PPC boxes. radix_tree_lookup() is currently down in the noise floor. Now, there is one special case: one file which is really big and which is accessed in a random manner and which is accessed very heavily: the blockdev mapping. We _are_ showing some locking cost in __find_get_block (used to be __get_hash_table) and in its call to find_get_page(). I have a bunch of patches which introduce a generic per-cpu buffer LRU, and which remove ext2's private bitmap buffer LRUs.
I expect these patches to wipe the blockdev mapping lookup lock contention off the map, but I'm awaiting test results from Anton before deciding whether those patches are worth submitting. --- lib/radix-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 689a5448ea31..e17cd888fc3d 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -29,7 +29,7 @@ /* * Radix tree node definition. */ -#define RADIX_TREE_MAP_SHIFT 7 +#define RADIX_TREE_MAP_SHIFT 6 #define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT) #define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1) -- cgit v1.2.3 From 9343c8e266dfe2e80578c592ade7825f5183af34 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 17 Jun 2002 20:21:34 -0700 Subject: [PATCH] msync(bad address) should return -ENOMEM Heaven knows why, but that's what the opengroup say, and returning -EFAULT causes 2.5 to fail one of the Linux Test Project tests. [ENOMEM] The addresses in the range starting at addr and continuing for len bytes are outside the range allowed for the address space of a process or specify one or more pages that are not mapped. 2.4 has it right, but 2.5 doesn't. --- mm/msync.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mm/msync.c b/mm/msync.c index 2a2b31de8957..5ea980e6b1dc 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -169,7 +169,7 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) { unsigned long end; struct vm_area_struct * vma; - int unmapped_error, error = -EINVAL; + int unmapped_error, error = -ENOMEM; down_read(¤t->mm->mmap_sem); if (start & ~PAGE_MASK) @@ -185,18 +185,18 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags) goto out; /* * If the interval [start,end) covers some unmapped address ranges, - * just ignore them, but return -EFAULT at the end. + * just ignore them, but return -ENOMEM at the end. */ vma = find_vma(current->mm, start); unmapped_error = 0; for (;;) { /* Still start < end. */ - error = -EFAULT; + error = -ENOMEM; if (!vma) goto out; /* Here start < vma->vm_end. */ if (start < vma->vm_start) { - unmapped_error = -EFAULT; + unmapped_error = -ENOMEM; start = vma->vm_start; } /* Here vma->vm_start <= start < vma->vm_end. */ @@ -220,5 +220,3 @@ out: up_read(¤t->mm->mmap_sem); return error; } - - -- cgit v1.2.3 From b068ec41ff5c343330ca11a25347918accb66c2a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 17 Jun 2002 20:27:16 -0700 Subject: [PATCH] x86-64 merge x86_64 core updates. - Make it compile again (switch_to macros etc., add dummy suspend.h) - reenable strength reduce optimization - Fix ramdisk (patch from Mikael Pettersson) - Some merges from i386 - Reimplement lazy iobitmap allocation. I reimplemented it based on bcrl's idea. - Fix IPC 32bit emulation to actually work and move into own file - New fixed mtrr.c from DaveJ ported from 2.4 and reenable it. - Move tlbstate into PDA. - Add some changes that got lost during the last merge. - new memset that seems to actually work. - Align signal handler stack frames to 16 bytes. - Some more minor bugfixes. 
--- arch/x86_64/Makefile | 6 - arch/x86_64/boot/Makefile | 4 - arch/x86_64/config.in | 4 +- arch/x86_64/ia32/Makefile | 5 +- arch/x86_64/ia32/ipc32.c | 645 +++++++++++++++++++++++++++++++++++++++ arch/x86_64/ia32/sys_ia32.c | 416 ------------------------- arch/x86_64/kernel/ioport.c | 23 +- arch/x86_64/kernel/mtrr.c | 454 ++++++++++++--------------- arch/x86_64/kernel/process.c | 33 +- arch/x86_64/kernel/setup64.c | 3 + arch/x86_64/kernel/signal.c | 56 ++-- arch/x86_64/kernel/smp.c | 22 +- arch/x86_64/kernel/vsyscall.c | 2 +- arch/x86_64/kernel/x8664_ksyms.c | 2 + arch/x86_64/lib/Makefile | 2 +- arch/x86_64/lib/memset.S | 76 +++-- include/asm-x86_64/i387.h | 11 + include/asm-x86_64/ia32.h | 2 + include/asm-x86_64/ipc.h | 30 +- include/asm-x86_64/mmu_context.h | 12 +- include/asm-x86_64/msr.h | 21 +- include/asm-x86_64/mtrr.h | 42 ++- include/asm-x86_64/pda.h | 2 + include/asm-x86_64/processor.h | 11 +- include/asm-x86_64/spinlock.h | 6 +- include/asm-x86_64/string.h | 13 +- include/asm-x86_64/suspend.h | 6 + include/asm-x86_64/system.h | 7 +- include/asm-x86_64/timex.h | 2 - include/asm-x86_64/tlbflush.h | 9 - 30 files changed, 1018 insertions(+), 909 deletions(-) create mode 100644 arch/x86_64/ia32/ipc32.c create mode 100644 include/asm-x86_64/suspend.h diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile index 3968f838fe7c..46fe5228c782 100644 --- a/arch/x86_64/Makefile +++ b/arch/x86_64/Makefile @@ -43,15 +43,9 @@ CFLAGS += -mcmodel=kernel CFLAGS += -pipe # this makes reading assembly source easier CFLAGS += -fno-reorder-blocks -# needed for later gcc 3.1 CFLAGS += -finline-limit=2000 -# needed for earlier gcc 3.1 -#CFLAGS += -fno-strength-reduce #CFLAGS += -g -# prevent gcc from keeping the stack 16 byte aligned (FIXME) -#CFLAGS += -mpreferred-stack-boundary=2 - HEAD := arch/x86_64/kernel/head.o arch/x86_64/kernel/head64.o arch/x86_64/kernel/init_task.o SUBDIRS := arch/x86_64/tools $(SUBDIRS) arch/x86_64/kernel arch/x86_64/mm arch/x86_64/lib diff --git a/arch/x86_64/boot/Makefile b/arch/x86_64/boot/Makefile index a82cabc11223..9549b65aaae7 100644 --- a/arch/x86_64/boot/Makefile +++ b/arch/x86_64/boot/Makefile @@ -21,10 +21,6 @@ ROOT_DEV := CURRENT SVGA_MODE := -DSVGA_MODE=NORMAL_VGA -# If you want the RAM disk device, define this to be the size in blocks. 
- -RAMDISK := -DRAMDISK=512 - # --------------------------------------------------------------------------- BOOT_INCL = $(TOPDIR)/include/linux/config.h \ diff --git a/arch/x86_64/config.in b/arch/x86_64/config.in index 8605598747a8..829a74f439ad 100644 --- a/arch/x86_64/config.in +++ b/arch/x86_64/config.in @@ -47,8 +47,7 @@ define_bool CONFIG_EISA n define_bool CONFIG_X86_IO_APIC y define_bool CONFIG_X86_LOCAL_APIC y -#currently broken: -#bool 'MTRR (Memory Type Range Register) support' CONFIG_MTRR +bool 'MTRR (Memory Type Range Register) support' CONFIG_MTRR bool 'Symmetric multi-processing support' CONFIG_SMP if [ "$CONFIG_SMP" = "n" ]; then bool 'Preemptible Kernel' CONFIG_PREEMPT @@ -226,6 +225,7 @@ if [ "$CONFIG_DEBUG_KERNEL" != "n" ]; then bool ' Spinlock debugging' CONFIG_DEBUG_SPINLOCK bool ' Additional run-time checks' CONFIG_CHECKING bool ' Debug __init statements' CONFIG_INIT_DEBUG + bool ' Spinlock debugging' CONFIG_DEBUG_SPINLOCK fi endmenu diff --git a/arch/x86_64/ia32/Makefile b/arch/x86_64/ia32/Makefile index 45c356b60cb5..00e69a2d0060 100644 --- a/arch/x86_64/ia32/Makefile +++ b/arch/x86_64/ia32/Makefile @@ -9,8 +9,9 @@ export-objs := ia32_ioctl.o sys_ia32.o all: ia32.o O_TARGET := ia32.o -obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_ioctl.o ia32_signal.o \ - ia32_binfmt.o fpu32.o socket32.o ptrace32.o +obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_ioctl.o \ + ia32_signal.o \ + ia32_binfmt.o fpu32.o socket32.o ptrace32.o ipc32.o clean:: diff --git a/arch/x86_64/ia32/ipc32.c b/arch/x86_64/ia32/ipc32.c new file mode 100644 index 000000000000..2d322dda88ef --- /dev/null +++ b/arch/x86_64/ia32/ipc32.c @@ -0,0 +1,645 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * sys32_ipc() is the de-multiplexer for the SysV IPC calls in 32bit emulation.. + * + * This is really horribly ugly. + */ + +struct msgbuf32 { + s32 mtype; + char mtext[1]; +}; + +struct ipc_perm32 { + int key; + __kernel_uid_t32 uid; + __kernel_gid_t32 gid; + __kernel_uid_t32 cuid; + __kernel_gid_t32 cgid; + unsigned short mode; + unsigned short seq; +}; + +struct ipc64_perm32 { + unsigned key; + __kernel_uid32_t32 uid; + __kernel_gid32_t32 gid; + __kernel_uid32_t32 cuid; + __kernel_gid32_t32 cgid; + unsigned short mode; + unsigned short __pad1; + unsigned short seq; + unsigned short __pad2; + unsigned int unused1; + unsigned int unused2; +}; + +struct semid_ds32 { + struct ipc_perm32 sem_perm; /* permissions .. see ipc.h */ + __kernel_time_t32 sem_otime; /* last semop time */ + __kernel_time_t32 sem_ctime; /* last change time */ + u32 sem_base; /* ptr to first semaphore in array */ + u32 sem_pending; /* pending operations to be processed */ + u32 sem_pending_last; /* last pending operation */ + u32 undo; /* undo requests on this array */ + unsigned short sem_nsems; /* no. 
of semaphores in array */ +}; + +struct semid64_ds32 { + struct ipc64_perm32 sem_perm; + __kernel_time_t32 sem_otime; + unsigned int __unused1; + __kernel_time_t32 sem_ctime; + unsigned int __unused2; + unsigned int sem_nsems; + unsigned int __unused3; + unsigned int __unused4; +}; + +struct msqid_ds32 { + struct ipc_perm32 msg_perm; + u32 msg_first; + u32 msg_last; + __kernel_time_t32 msg_stime; + __kernel_time_t32 msg_rtime; + __kernel_time_t32 msg_ctime; + u32 wwait; + u32 rwait; + unsigned short msg_cbytes; + unsigned short msg_qnum; + unsigned short msg_qbytes; + __kernel_ipc_pid_t32 msg_lspid; + __kernel_ipc_pid_t32 msg_lrpid; +}; + +struct msqid64_ds32 { + struct ipc64_perm32 msg_perm; + __kernel_time_t32 msg_stime; + unsigned int __unused1; + __kernel_time_t32 msg_rtime; + unsigned int __unused2; + __kernel_time_t32 msg_ctime; + unsigned int __unused3; + unsigned int msg_cbytes; + unsigned int msg_qnum; + unsigned int msg_qbytes; + __kernel_pid_t32 msg_lspid; + __kernel_pid_t32 msg_lrpid; + unsigned int __unused4; + unsigned int __unused5; +}; + +struct shmid_ds32 { + struct ipc_perm32 shm_perm; + int shm_segsz; + __kernel_time_t32 shm_atime; + __kernel_time_t32 shm_dtime; + __kernel_time_t32 shm_ctime; + __kernel_ipc_pid_t32 shm_cpid; + __kernel_ipc_pid_t32 shm_lpid; + unsigned short shm_nattch; +}; + +struct shmid64_ds32 { + struct ipc64_perm32 shm_perm; + __kernel_size_t32 shm_segsz; + __kernel_time_t32 shm_atime; + unsigned int __unused1; + __kernel_time_t32 shm_dtime; + unsigned int __unused2; + __kernel_time_t32 shm_ctime; + unsigned int __unused3; + __kernel_pid_t32 shm_cpid; + __kernel_pid_t32 shm_lpid; + unsigned int shm_nattch; + unsigned int __unused4; + unsigned int __unused5; +}; + +struct shminfo64_32 { + unsigned int shmmax; + unsigned int shmmin; + unsigned int shmmni; + unsigned int shmseg; + unsigned int shmall; + unsigned int __unused1; + unsigned int __unused2; + unsigned int __unused3; + unsigned int __unused4; +}; + +struct shm_info32 { + int used_ids; + u32 shm_tot, shm_rss, shm_swp; + u32 swap_attempts, swap_successes; +}; + +struct ipc_kludge { + struct msgbuf *msgp; + int msgtyp; +}; + + +#define A(__x) ((unsigned long)(__x)) +#define AA(__x) ((unsigned long)(__x)) + +#define SEMOP 1 +#define SEMGET 2 +#define SEMCTL 3 +#define MSGSND 11 +#define MSGRCV 12 +#define MSGGET 13 +#define MSGCTL 14 +#define SHMAT 21 +#define SHMDT 22 +#define SHMGET 23 +#define SHMCTL 24 + +#define IPCOP_MASK(__x) (1UL << (__x)) + +static int +ipc_parse_version32 (int *cmd) +{ + if (*cmd & IPC_64) { + *cmd ^= IPC_64; + return IPC_64; + } else { + return IPC_OLD; + } +} + +static int +semctl32 (int first, int second, int third, void *uptr) +{ + union semun fourth; + u32 pad; + int err = 0, err2; + struct semid64_ds s; + mm_segment_t old_fs; + int version = ipc_parse_version32(&third); + + if (!uptr) + return -EINVAL; + if (get_user(pad, (u32 *)uptr)) + return -EFAULT; + if (third == SETVAL) + fourth.val = (int)pad; + else + fourth.__pad = (void *)A(pad); + switch (third) { + case IPC_INFO: + case IPC_RMID: + case IPC_SET: + case SEM_INFO: + case GETVAL: + case GETPID: + case GETNCNT: + case GETZCNT: + case GETALL: + case SETVAL: + case SETALL: + err = sys_semctl(first, second, third, fourth); + break; + + case IPC_STAT: + case SEM_STAT: + fourth.__pad = &s; + old_fs = get_fs(); + set_fs(KERNEL_DS); + err = sys_semctl(first, second|IPC_64, third, fourth); + set_fs(old_fs); + + if (version == IPC_64) { + struct semid64_ds32 *usp64 = (struct semid64_ds32 *) A(pad); + + if 
(!access_ok(VERIFY_WRITE, usp64, sizeof(*usp64))) { + err = -EFAULT; + break; + } + err2 = __put_user(s.sem_perm.key, &usp64->sem_perm.key); + err2 |= __put_user(s.sem_perm.uid, &usp64->sem_perm.uid); + err2 |= __put_user(s.sem_perm.gid, &usp64->sem_perm.gid); + err2 |= __put_user(s.sem_perm.cuid, &usp64->sem_perm.cuid); + err2 |= __put_user(s.sem_perm.cgid, &usp64->sem_perm.cgid); + err2 |= __put_user(s.sem_perm.mode, &usp64->sem_perm.mode); + err2 |= __put_user(s.sem_perm.seq, &usp64->sem_perm.seq); + err2 |= __put_user(s.sem_otime, &usp64->sem_otime); + err2 |= __put_user(s.sem_ctime, &usp64->sem_ctime); + err2 |= __put_user(s.sem_nsems, &usp64->sem_nsems); + } else { + struct semid_ds32 *usp32 = (struct semid_ds32 *) A(pad); + + if (!access_ok(VERIFY_WRITE, usp32, sizeof(*usp32))) { + err = -EFAULT; + break; + } + err2 = __put_user(s.sem_perm.key, &usp32->sem_perm.key); + err2 |= __put_user(s.sem_perm.uid, &usp32->sem_perm.uid); + err2 |= __put_user(s.sem_perm.gid, &usp32->sem_perm.gid); + err2 |= __put_user(s.sem_perm.cuid, &usp32->sem_perm.cuid); + err2 |= __put_user(s.sem_perm.cgid, &usp32->sem_perm.cgid); + err2 |= __put_user(s.sem_perm.mode, &usp32->sem_perm.mode); + err2 |= __put_user(s.sem_perm.seq, &usp32->sem_perm.seq); + err2 |= __put_user(s.sem_otime, &usp32->sem_otime); + err2 |= __put_user(s.sem_ctime, &usp32->sem_ctime); + err2 |= __put_user(s.sem_nsems, &usp32->sem_nsems); + } + if (err2) + err = -EFAULT; + break; + } + return err; +} + +static int +do_sys32_msgsnd (int first, int second, int third, void *uptr) +{ + struct msgbuf *p = kmalloc(second + sizeof(struct msgbuf) + 4, GFP_USER); + struct msgbuf32 *up = (struct msgbuf32 *)uptr; + mm_segment_t old_fs; + int err; + + if (!p) + return -ENOMEM; + err = get_user(p->mtype, &up->mtype); + err |= copy_from_user(p->mtext, &up->mtext, second); + if (err) + goto out; + old_fs = get_fs(); + set_fs(KERNEL_DS); + err = sys_msgsnd(first, p, second, third); + set_fs(old_fs); + out: + kfree(p); + return err; +} + +static int +do_sys32_msgrcv (int first, int second, int msgtyp, int third, int version, void *uptr) +{ + struct msgbuf32 *up; + struct msgbuf *p; + mm_segment_t old_fs; + int err; + + if (!version) { + struct ipc_kludge *uipck = (struct ipc_kludge *)uptr; + struct ipc_kludge ipck; + + err = -EINVAL; + if (!uptr) + goto out; + err = -EFAULT; + if (copy_from_user(&ipck, uipck, sizeof(struct ipc_kludge))) + goto out; + uptr = (void *)A(ipck.msgp); + msgtyp = ipck.msgtyp; + } + err = -ENOMEM; + p = kmalloc(second + sizeof(struct msgbuf) + 4, GFP_USER); + if (!p) + goto out; + old_fs = get_fs(); + set_fs(KERNEL_DS); + err = sys_msgrcv(first, p, second + 4, msgtyp, third); + set_fs(old_fs); + if (err < 0) + goto free_then_out; + up = (struct msgbuf32 *)uptr; + if (put_user(p->mtype, &up->mtype) || copy_to_user(&up->mtext, p->mtext, err)) + err = -EFAULT; +free_then_out: + kfree(p); +out: + return err; +} + +static int +msgctl32 (int first, int second, void *uptr) +{ + int err = -EINVAL, err2; + struct msqid_ds m; + struct msqid64_ds m64; + struct msqid_ds32 *up32 = (struct msqid_ds32 *)uptr; + struct msqid64_ds32 *up64 = (struct msqid64_ds32 *)uptr; + mm_segment_t old_fs; + int version = ipc_parse_version32(&second); + + switch (second) { + case IPC_INFO: + case IPC_RMID: + case MSG_INFO: + err = sys_msgctl(first, second, (struct msqid_ds *)uptr); + break; + + case IPC_SET: + if (version == IPC_64) { + err = get_user(m.msg_perm.uid, &up64->msg_perm.uid); + err |= get_user(m.msg_perm.gid, &up64->msg_perm.gid); + err |= 
get_user(m.msg_perm.mode, &up64->msg_perm.mode); + err |= get_user(m.msg_qbytes, &up64->msg_qbytes); + } else { + err = get_user(m.msg_perm.uid, &up32->msg_perm.uid); + err |= get_user(m.msg_perm.gid, &up32->msg_perm.gid); + err |= get_user(m.msg_perm.mode, &up32->msg_perm.mode); + err |= get_user(m.msg_qbytes, &up32->msg_qbytes); + } + if (err) + break; + old_fs = get_fs(); + set_fs(KERNEL_DS); + err = sys_msgctl(first, second, &m); + set_fs(old_fs); + break; + + case IPC_STAT: + case MSG_STAT: + old_fs = get_fs(); + set_fs(KERNEL_DS); + err = sys_msgctl(first, second|IPC_64, (void *) &m64); + set_fs(old_fs); + + if (version == IPC_64) { + if (!access_ok(VERIFY_WRITE, up64, sizeof(*up64))) { + err = -EFAULT; + break; + } + err2 = __put_user(m64.msg_perm.key, &up64->msg_perm.key); + err2 |= __put_user(m64.msg_perm.uid, &up64->msg_perm.uid); + err2 |= __put_user(m64.msg_perm.gid, &up64->msg_perm.gid); + err2 |= __put_user(m64.msg_perm.cuid, &up64->msg_perm.cuid); + err2 |= __put_user(m64.msg_perm.cgid, &up64->msg_perm.cgid); + err2 |= __put_user(m64.msg_perm.mode, &up64->msg_perm.mode); + err2 |= __put_user(m64.msg_perm.seq, &up64->msg_perm.seq); + err2 |= __put_user(m64.msg_stime, &up64->msg_stime); + err2 |= __put_user(m64.msg_rtime, &up64->msg_rtime); + err2 |= __put_user(m64.msg_ctime, &up64->msg_ctime); + err2 |= __put_user(m64.msg_cbytes, &up64->msg_cbytes); + err2 |= __put_user(m64.msg_qnum, &up64->msg_qnum); + err2 |= __put_user(m64.msg_qbytes, &up64->msg_qbytes); + err2 |= __put_user(m64.msg_lspid, &up64->msg_lspid); + err2 |= __put_user(m64.msg_lrpid, &up64->msg_lrpid); + if (err2) + err = -EFAULT; + } else { + if (!access_ok(VERIFY_WRITE, up32, sizeof(*up32))) { + err = -EFAULT; + break; + } + err2 = __put_user(m64.msg_perm.key, &up32->msg_perm.key); + err2 |= __put_user(m64.msg_perm.uid, &up32->msg_perm.uid); + err2 |= __put_user(m64.msg_perm.gid, &up32->msg_perm.gid); + err2 |= __put_user(m64.msg_perm.cuid, &up32->msg_perm.cuid); + err2 |= __put_user(m64.msg_perm.cgid, &up32->msg_perm.cgid); + err2 |= __put_user(m64.msg_perm.mode, &up32->msg_perm.mode); + err2 |= __put_user(m64.msg_perm.seq, &up32->msg_perm.seq); + err2 |= __put_user(m64.msg_stime, &up32->msg_stime); + err2 |= __put_user(m64.msg_rtime, &up32->msg_rtime); + err2 |= __put_user(m64.msg_ctime, &up32->msg_ctime); + err2 |= __put_user(m64.msg_cbytes, &up32->msg_cbytes); + err2 |= __put_user(m64.msg_qnum, &up32->msg_qnum); + err2 |= __put_user(m64.msg_qbytes, &up32->msg_qbytes); + err2 |= __put_user(m64.msg_lspid, &up32->msg_lspid); + err2 |= __put_user(m64.msg_lrpid, &up32->msg_lrpid); + if (err2) + err = -EFAULT; + } + break; + } + return err; +} + +static int +shmat32 (int first, int second, int third, int version, void *uptr) +{ + unsigned long raddr; + u32 *uaddr = (u32 *)A((u32)third); + int err; + + if (version == 1) + return -EINVAL; /* iBCS2 emulator entry point: unsupported */ + err = sys_shmat(first, uptr, second, &raddr); + if (err) + return err; + return put_user(raddr, uaddr); +} + +static int put_shmid64(struct shmid64_ds *s64p, void *uptr, int version) +{ + int err2; +#define s64 (*s64p) + if (version == IPC_64) { + struct shmid64_ds32 *up64 = (struct shmid64_ds32 *)uptr; + + if (!access_ok(VERIFY_WRITE, up64, sizeof(*up64))) + return -EFAULT; + + err2 = __put_user(s64.shm_perm.key, &up64->shm_perm.key); + err2 |= __put_user(s64.shm_perm.uid, &up64->shm_perm.uid); + err2 |= __put_user(s64.shm_perm.gid, &up64->shm_perm.gid); + err2 |= __put_user(s64.shm_perm.cuid, &up64->shm_perm.cuid); + err2 |= 
__put_user(s64.shm_perm.cgid, &up64->shm_perm.cgid); + err2 |= __put_user(s64.shm_perm.mode, &up64->shm_perm.mode); + err2 |= __put_user(s64.shm_perm.seq, &up64->shm_perm.seq); + err2 |= __put_user(s64.shm_atime, &up64->shm_atime); + err2 |= __put_user(s64.shm_dtime, &up64->shm_dtime); + err2 |= __put_user(s64.shm_ctime, &up64->shm_ctime); + err2 |= __put_user(s64.shm_segsz, &up64->shm_segsz); + err2 |= __put_user(s64.shm_nattch, &up64->shm_nattch); + err2 |= __put_user(s64.shm_cpid, &up64->shm_cpid); + err2 |= __put_user(s64.shm_lpid, &up64->shm_lpid); + } else { + struct shmid_ds32 *up32 = (struct shmid_ds32 *)uptr; + + if (!access_ok(VERIFY_WRITE, up32, sizeof(*up32))) + return -EFAULT; + + err2 = __put_user(s64.shm_perm.key, &up32->shm_perm.key); + err2 |= __put_user(s64.shm_perm.uid, &up32->shm_perm.uid); + err2 |= __put_user(s64.shm_perm.gid, &up32->shm_perm.gid); + err2 |= __put_user(s64.shm_perm.cuid, &up32->shm_perm.cuid); + err2 |= __put_user(s64.shm_perm.cgid, &up32->shm_perm.cgid); + err2 |= __put_user(s64.shm_perm.mode, &up32->shm_perm.mode); + err2 |= __put_user(s64.shm_perm.seq, &up32->shm_perm.seq); + err2 |= __put_user(s64.shm_atime, &up32->shm_atime); + err2 |= __put_user(s64.shm_dtime, &up32->shm_dtime); + err2 |= __put_user(s64.shm_ctime, &up32->shm_ctime); + err2 |= __put_user(s64.shm_segsz, &up32->shm_segsz); + err2 |= __put_user(s64.shm_nattch, &up32->shm_nattch); + err2 |= __put_user(s64.shm_cpid, &up32->shm_cpid); + err2 |= __put_user(s64.shm_lpid, &up32->shm_lpid); + } +#undef s64 + return err2 ? -EFAULT : 0; +} +static int +shmctl32 (int first, int second, void *uptr) +{ + int err = -EFAULT, err2; + struct shmid_ds s; + struct shmid64_ds s64; + mm_segment_t old_fs; + struct shm_info32 *uip = (struct shm_info32 *)uptr; + struct shm_info si; + int version = ipc_parse_version32(&second); + struct shminfo64 smi; + struct shminfo *usi32 = (struct shminfo *) uptr; + struct shminfo64_32 *usi64 = (struct shminfo64_32 *) uptr; + + switch (second) { + case IPC_INFO: + old_fs = get_fs(); + set_fs(KERNEL_DS); + err = sys_shmctl(first, second|IPC_64, (struct shmid_ds *)&smi); + set_fs(old_fs); + + if (version == IPC_64) { + if (!access_ok(VERIFY_WRITE, usi64, sizeof(*usi64))) { + err = -EFAULT; + break; + } + err2 = __put_user(smi.shmmax, &usi64->shmmax); + err2 |= __put_user(smi.shmmin, &usi64->shmmin); + err2 |= __put_user(smi.shmmni, &usi64->shmmni); + err2 |= __put_user(smi.shmseg, &usi64->shmseg); + err2 |= __put_user(smi.shmall, &usi64->shmall); + } else { + if (!access_ok(VERIFY_WRITE, usi32, sizeof(*usi32))) { + err = -EFAULT; + break; + } + err2 = __put_user(smi.shmmax, &usi32->shmmax); + err2 |= __put_user(smi.shmmin, &usi32->shmmin); + err2 |= __put_user(smi.shmmni, &usi32->shmmni); + err2 |= __put_user(smi.shmseg, &usi32->shmseg); + err2 |= __put_user(smi.shmall, &usi32->shmall); + } + if (err2) + err = -EFAULT; + break; + + case IPC_RMID: + case SHM_LOCK: + case SHM_UNLOCK: + err = sys_shmctl(first, second, (struct shmid_ds *)uptr); + break; + + case IPC_SET: + if (version == IPC_64) { + struct shmid64_ds32 *up64 = (struct shmid64_ds32 *)uptr; + err = get_user(s.shm_perm.uid, &up64->shm_perm.uid); + err |= get_user(s.shm_perm.gid, &up64->shm_perm.gid); + err |= get_user(s.shm_perm.mode, &up64->shm_perm.mode); + } else { + struct shmid_ds32 *up32 = (struct shmid_ds32 *)uptr; + err = get_user(s.shm_perm.uid, &up32->shm_perm.uid); + err |= get_user(s.shm_perm.gid, &up32->shm_perm.gid); + err |= get_user(s.shm_perm.mode, &up32->shm_perm.mode); + } + if (err) + break; 
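(An aside, not part of the patch: the set_fs(KERNEL_DS) sequence that follows here recurs in every wrapper in ipc32.c. Each compat handler stages a native structure in kernel memory, widens the address-space limit so the native syscall's user-copy checks accept a kernel pointer, calls the native implementation, restores the old limit, and only then translates the result back to the 32-bit layout. A minimal sketch of the idiom, using the 2.5-era get_fs()/set_fs() API shown above; the helper name is hypothetical:

	static int stat_via_native(int shmid, struct shmid64_ds *kbuf)
	{
		mm_segment_t old_fs = get_fs();	/* normally USER_DS */
		int err;

		set_fs(KERNEL_DS);	/* let sys_shmctl() accept the kernel pointer kbuf */
		err = sys_shmctl(shmid, IPC_STAT | IPC_64, (struct shmid_ds *)kbuf);
		set_fs(old_fs);		/* always restore before returning */
		return err;
	}

Restoring old_fs on every path is the critical part: a task left at KERNEL_DS would pass kernel addresses through later user-copy checks.)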
+ old_fs = get_fs(); + set_fs(KERNEL_DS); + err = sys_shmctl(first, second, &s); + set_fs(old_fs); + break; + + case IPC_STAT: + case SHM_STAT: + old_fs = get_fs(); + set_fs(KERNEL_DS); + err = sys_shmctl(first, second|IPC_64, (void *) &s64); + set_fs(old_fs); + + if (err < 0) + break; + err2 = put_shmid64(&s64, uptr, version); + if (err2) + err = err2; + break; + + case SHM_INFO: + old_fs = get_fs(); + set_fs(KERNEL_DS); + err = sys_shmctl(first, second, (void *)&si); + set_fs(old_fs); + if (err < 0) + break; + + if (!access_ok(VERIFY_WRITE, uip, sizeof(*uip))) { + err = -EFAULT; + break; + } + err2 = __put_user(si.used_ids, &uip->used_ids); + err2 |= __put_user(si.shm_tot, &uip->shm_tot); + err2 |= __put_user(si.shm_rss, &uip->shm_rss); + err2 |= __put_user(si.shm_swp, &uip->shm_swp); + err2 |= __put_user(si.swap_attempts, &uip->swap_attempts); + err2 |= __put_user(si.swap_successes, &uip->swap_successes); + if (err2) + err = -EFAULT; + break; + + } + return err; +} + +asmlinkage long +sys32_ipc (u32 call, int first, int second, int third, u32 ptr, u32 fifth) +{ + int version; + + version = call >> 16; /* hack for backward compatibility */ + call &= 0xffff; + + switch (call) { + case SEMOP: + /* struct sembuf is the same on 32 and 64bit :)) */ + return sys_semop(first, (struct sembuf *)AA(ptr), second); + case SEMGET: + return sys_semget(first, second, third); + case SEMCTL: + return semctl32(first, second, third, (void *)AA(ptr)); + + case MSGSND: + return do_sys32_msgsnd(first, second, third, (void *)AA(ptr)); + case MSGRCV: + return do_sys32_msgrcv(first, second, fifth, third, version, (void *)AA(ptr)); + case MSGGET: + return sys_msgget((key_t) first, second); + case MSGCTL: + return msgctl32(first, second, (void *)AA(ptr)); + + case SHMAT: + return shmat32(first, second, third, version, (void *)AA(ptr)); + break; + case SHMDT: + return sys_shmdt((char *)AA(ptr)); + case SHMGET: + return sys_shmget(first, second, third); + case SHMCTL: + return shmctl32(first, second, (void *)AA(ptr)); + + default: + return -EINVAL; + } + return -EINVAL; +} + diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c index 35060b86a54a..85aaed5ec40a 100644 --- a/arch/x86_64/ia32/sys_ia32.c +++ b/arch/x86_64/ia32/sys_ia32.c @@ -1118,422 +1118,6 @@ sys32_setrlimit(unsigned int resource, struct rlimit32 *rlim) return ret; } -/* - * sys32_ipc() is the de-multiplexer for the SysV IPC calls in 32bit emulation.. - * - * This is really horribly ugly. - */ - -struct msgbuf32 { s32 mtype; char mtext[1]; }; - -struct ipc_perm32 -{ - key_t key; - __kernel_uid_t32 uid; - __kernel_gid_t32 gid; - __kernel_uid_t32 cuid; - __kernel_gid_t32 cgid; - __kernel_mode_t32 mode; - unsigned short seq; -}; - -struct semid_ds32 { - struct ipc_perm32 sem_perm; /* permissions .. see ipc.h */ - __kernel_time_t32 sem_otime; /* last semop time */ - __kernel_time_t32 sem_ctime; /* last change time */ - u32 sem_base; /* ptr to first semaphore in array */ - u32 sem_pending; /* pending operations to be processed */ - u32 sem_pending_last; /* last pending operation */ - u32 undo; /* undo requests on this array */ - unsigned short sem_nsems; /* no. 
of semaphores in array */ -}; - -struct msqid_ds32 -{ - struct ipc_perm32 msg_perm; - u32 msg_first; - u32 msg_last; - __kernel_time_t32 msg_stime; - __kernel_time_t32 msg_rtime; - __kernel_time_t32 msg_ctime; - u32 wwait; - u32 rwait; - unsigned short msg_cbytes; - unsigned short msg_qnum; - unsigned short msg_qbytes; - __kernel_ipc_pid_t32 msg_lspid; - __kernel_ipc_pid_t32 msg_lrpid; -}; - -struct shmid_ds32 { - struct ipc_perm32 shm_perm; - int shm_segsz; - __kernel_time_t32 shm_atime; - __kernel_time_t32 shm_dtime; - __kernel_time_t32 shm_ctime; - __kernel_ipc_pid_t32 shm_cpid; - __kernel_ipc_pid_t32 shm_lpid; - unsigned short shm_nattch; -}; - -#define IPCOP_MASK(__x) (1UL << (__x)) - -static int -do_sys32_semctl(int first, int second, int third, void *uptr) -{ - union semun fourth; - u32 pad; - int err; - struct semid64_ds s; - struct semid_ds32 *usp; - mm_segment_t old_fs; - - if (!uptr) - return -EINVAL; - err = -EFAULT; - if (get_user (pad, (u32 *)uptr)) - return err; - if(third == SETVAL) - fourth.val = (int)pad; - else - fourth.__pad = (void *)A(pad); - - switch (third) { - - case IPC_INFO: - case IPC_RMID: - case IPC_SET: - case SEM_INFO: - case GETVAL: - case GETPID: - case GETNCNT: - case GETZCNT: - case GETALL: - case SETVAL: - case SETALL: - err = sys_semctl (first, second, third, fourth); - break; - - case IPC_STAT: - case SEM_STAT: - usp = (struct semid_ds32 *)A(pad); - fourth.__pad = &s; - old_fs = get_fs (); - set_fs (KERNEL_DS); - err = sys_semctl (first, second, third, fourth); - set_fs (old_fs); - if (verify_area(VERIFY_WRITE, usp, sizeof(struct semid_ds32)) || - __put_user(s.sem_perm.key, &usp->sem_perm.key) || - __put_user(s.sem_perm.uid, &usp->sem_perm.uid) || - __put_user(s.sem_perm.gid, &usp->sem_perm.gid) || - __put_user(s.sem_perm.cuid, &usp->sem_perm.cuid) || - __put_user (s.sem_perm.cgid, &usp->sem_perm.cgid) || - __put_user (s.sem_perm.mode, &usp->sem_perm.mode) || - __put_user (s.sem_perm.seq, &usp->sem_perm.seq) || - __put_user (s.sem_otime, &usp->sem_otime) || - __put_user (s.sem_ctime, &usp->sem_ctime) || - __put_user (s.sem_nsems, &usp->sem_nsems)) - return -EFAULT; - break; - - } - - return err; -} - -static int -do_sys32_msgsnd (int first, int second, int third, void *uptr) -{ - struct msgbuf *p = kmalloc (second + sizeof (struct msgbuf) - + 4, GFP_USER); - struct msgbuf32 *up = (struct msgbuf32 *)uptr; - mm_segment_t old_fs; - int err; - - if (!p) - return -ENOMEM; - err = verify_area(VERIFY_READ, up, sizeof(struct msgbuf32)); - if (err) - goto out; - err = __get_user (p->mtype, &up->mtype); - err |= __copy_from_user (p->mtext, &up->mtext, second); - if (err) - goto out; - old_fs = get_fs (); - set_fs (KERNEL_DS); - err = sys_msgsnd (first, p, second, third); - set_fs (old_fs); -out: - kfree (p); - return err; -} - -static int -do_sys32_msgrcv (int first, int second, int msgtyp, int third, - int version, void *uptr) -{ - struct msgbuf32 *up; - struct msgbuf *p; - mm_segment_t old_fs; - int err; - - if (!version) { - struct ipc_kludge *uipck = (struct ipc_kludge *)uptr; - struct ipc_kludge ipck; - - err = -EINVAL; - if (!uptr) - goto out; - err = -EFAULT; - if (copy_from_user (&ipck, uipck, sizeof (struct ipc_kludge))) - goto out; - uptr = (void *)A(ipck.msgp); - msgtyp = ipck.msgtyp; - } - err = -ENOMEM; - p = kmalloc (second + sizeof (struct msgbuf) + 4, GFP_USER); - if (!p) - goto out; - old_fs = get_fs (); - set_fs (KERNEL_DS); - err = sys_msgrcv (first, p, second + 4, msgtyp, third); - set_fs (old_fs); - if (err < 0) - goto free_then_out; - up = 
(struct msgbuf32 *)uptr; - if (verify_area(VERIFY_WRITE, up, sizeof(struct msgbuf32)) || - __put_user (p->mtype, &up->mtype) || - __copy_to_user (&up->mtext, p->mtext, err)) - err = -EFAULT; -free_then_out: - kfree (p); -out: - return err; -} - -static int -do_sys32_msgctl (int first, int second, void *uptr) -{ - int err = -EINVAL; - struct msqid_ds m; - struct msqid64_ds m64; - struct msqid_ds32 *up = (struct msqid_ds32 *)uptr; - mm_segment_t old_fs; - - switch (second) { - - case IPC_INFO: - case IPC_RMID: - case MSG_INFO: - err = sys_msgctl (first, second, (struct msqid_ds *)uptr); - break; - - case IPC_SET: - err = verify_area(VERIFY_READ, up, sizeof(struct msqid_ds32)); - if (err) - break; - err = __get_user (m.msg_perm.uid, &up->msg_perm.uid); - err |= __get_user (m.msg_perm.gid, &up->msg_perm.gid); - err |= __get_user (m.msg_perm.mode, &up->msg_perm.mode); - err |= __get_user (m.msg_qbytes, &up->msg_qbytes); - if (err) - break; - old_fs = get_fs (); - set_fs (KERNEL_DS); - err = sys_msgctl (first, second, &m); - set_fs (old_fs); - break; - - case IPC_STAT: - case MSG_STAT: - old_fs = get_fs (); - set_fs (KERNEL_DS); - err = sys_msgctl (first, second, (void *) &m64); - set_fs (old_fs); - if (verify_area(VERIFY_WRITE, up, sizeof(struct msqid_ds32)) || - __put_user (m64.msg_perm.key, &up->msg_perm.key) || - __put_user(m64.msg_perm.uid, &up->msg_perm.uid) || - __put_user(m64.msg_perm.gid, &up->msg_perm.gid) || - __put_user(m64.msg_perm.cuid, &up->msg_perm.cuid) || - __put_user(m64.msg_perm.cgid, &up->msg_perm.cgid) || - __put_user(m64.msg_perm.mode, &up->msg_perm.mode) || - __put_user(m64.msg_perm.seq, &up->msg_perm.seq) || - __put_user(m64.msg_stime, &up->msg_stime) || - __put_user(m64.msg_rtime, &up->msg_rtime) || - __put_user(m64.msg_ctime, &up->msg_ctime) || - __put_user(m64.msg_cbytes, &up->msg_cbytes) || - __put_user(m64.msg_qnum, &up->msg_qnum) || - __put_user(m64.msg_qbytes, &up->msg_qbytes) || - __put_user(m64.msg_lspid, &up->msg_lspid) || - __put_user(m64.msg_lrpid, &up->msg_lrpid)) - return -EFAULT; - break; - - } - - return err; -} - -static int -do_sys32_shmat (int first, int second, int third, int version, void *uptr) -{ - unsigned long raddr; - u32 *uaddr = (u32 *)A((u32)third); - int err = -EINVAL; - - if (version == 1) - return err; - err = sys_shmat (first, uptr, second, &raddr); - if (err) - return err; - err = put_user (raddr, uaddr); - return err; -} - -static int -do_sys32_shmctl (int first, int second, void *uptr) -{ - int err = -EFAULT; - struct shmid_ds s; - struct shmid64_ds s64; - struct shmid_ds32 *up = (struct shmid_ds32 *)uptr; - mm_segment_t old_fs; - struct shm_info32 { - int used_ids; - u32 shm_tot, shm_rss, shm_swp; - u32 swap_attempts, swap_successes; - } *uip = (struct shm_info32 *)uptr; - struct shm_info si; - - switch (second) { - - case IPC_INFO: - case IPC_RMID: - case SHM_LOCK: - case SHM_UNLOCK: - err = sys_shmctl (first, second, (struct shmid_ds *)uptr); - break; - case IPC_SET: - err = verify_area(VERIFY_READ, up, sizeof(struct shmid_ds32)); - if (err) - break; - err = __get_user (s.shm_perm.uid, &up->shm_perm.uid); - err |= __get_user (s.shm_perm.gid, &up->shm_perm.gid); - err |= __get_user (s.shm_perm.mode, &up->shm_perm.mode); - if (err) - break; - old_fs = get_fs (); - set_fs (KERNEL_DS); - err = sys_shmctl (first, second, &s); - set_fs (old_fs); - break; - - case IPC_STAT: - case SHM_STAT: - old_fs = get_fs (); - set_fs (KERNEL_DS); - err = sys_shmctl (first, second, (void *) &s64); - set_fs (old_fs); - if (err < 0) - break; - if 
(verify_area(VERIFY_WRITE, up, sizeof(struct shmid_ds32)) || - __put_user (s64.shm_perm.key, &up->shm_perm.key) || - __put_user (s64.shm_perm.uid, &up->shm_perm.uid) || - __put_user (s64.shm_perm.gid, &up->shm_perm.gid) || - __put_user (s64.shm_perm.cuid, &up->shm_perm.cuid) || - __put_user (s64.shm_perm.cgid, &up->shm_perm.cgid) || - __put_user (s64.shm_perm.mode, &up->shm_perm.mode) || - __put_user (s64.shm_perm.seq, &up->shm_perm.seq) || - __put_user (s64.shm_atime, &up->shm_atime) || - __put_user (s64.shm_dtime, &up->shm_dtime) || - __put_user (s64.shm_ctime, &up->shm_ctime) || - __put_user (s64.shm_segsz, &up->shm_segsz) || - __put_user (s64.shm_nattch, &up->shm_nattch) || - __put_user (s64.shm_cpid, &up->shm_cpid) || - __put_user (s64.shm_lpid, &up->shm_lpid)) - return -EFAULT; - break; - - case SHM_INFO: - old_fs = get_fs (); - set_fs (KERNEL_DS); - err = sys_shmctl (first, second, (void *)&si); - set_fs (old_fs); - if (err < 0) - break; - if (verify_area(VERIFY_WRITE, uip, sizeof(struct shm_info32)) || - __put_user (si.used_ids, &uip->used_ids) || - __put_user (si.shm_tot, &uip->shm_tot) || - __put_user (si.shm_rss, &uip->shm_rss) || - __put_user (si.shm_swp, &uip->shm_swp) || - __put_user (si.swap_attempts, &uip->swap_attempts) || - __put_user (si.swap_successes, &uip->swap_successes)) - return -EFAULT; - break; - - } - return err; -} - -asmlinkage long -sys32_ipc (u32 call, int first, int second, int third, u32 ptr, u32 fifth) -{ - int version, err; - - version = call >> 16; /* hack for backward compatibility */ - call &= 0xffff; - - switch (call) { - - case SEMOP: - /* struct sembuf is the same on 32 and 64bit :)) */ - err = sys_semop (first, (struct sembuf *)AA(ptr), - second); - break; - case SEMGET: - err = sys_semget (first, second, third); - break; - case SEMCTL: - err = do_sys32_semctl (first, second, third, - (void *)AA(ptr)); - break; - - case MSGSND: - err = do_sys32_msgsnd (first, second, third, - (void *)AA(ptr)); - break; - case MSGRCV: - err = do_sys32_msgrcv (first, second, fifth, third, - version, (void *)AA(ptr)); - break; - case MSGGET: - err = sys_msgget ((key_t) first, second); - break; - case MSGCTL: - err = do_sys32_msgctl (first, second, (void *)AA(ptr)); - break; - - case SHMAT: - err = do_sys32_shmat (first, second, third, - version, (void *)AA(ptr)); - break; - case SHMDT: - err = sys_shmdt ((char *)AA(ptr)); - break; - case SHMGET: - err = sys_shmget (first, second, third); - break; - case SHMCTL: - err = do_sys32_shmctl (first, second, (void *)AA(ptr)); - break; - default: - err = -EINVAL; - break; - } - - return err; -} - /* * sys_time() can be implemented in user-level using * sys_gettimeofday(). IA64 did this but i386 Linux did not diff --git a/arch/x86_64/kernel/ioport.c b/arch/x86_64/kernel/ioport.c index a0ab1a1ee68e..b8ad4c6d3709 100644 --- a/arch/x86_64/kernel/ioport.c +++ b/arch/x86_64/kernel/ioport.c @@ -14,6 +14,7 @@ #include #include #include +#include /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ static void set_bitmap(unsigned long *bitmap, short base, short extent, int new_value) @@ -61,27 +62,19 @@ asmlinkage int sys_ioperm(unsigned long from, unsigned long num, int turn_on) return -EINVAL; if (turn_on && !capable(CAP_SYS_RAWIO)) return -EPERM; - /* - * If it's the first ioperm() call in this thread's lifetime, set the - * IO bitmap up. ioperm() is much less timing critical than clone(), - * this is why we delay this operation until now: - */ - if (!t->ioperm) { - /* - * just in case ... 
- */ - memset(t->io_bitmap,0xff,(IO_BITMAP_SIZE+1)*4); - t->ioperm = 1; - /* - * this activates it in the TSS - */ + + if (!t->io_bitmap_ptr) { + t->io_bitmap_ptr = kmalloc((IO_BITMAP_SIZE+1)*4, GFP_KERNEL); + if (!t->io_bitmap_ptr) + return -ENOMEM; + memset(t->io_bitmap_ptr,0xff,(IO_BITMAP_SIZE+1)*4); tss->io_map_base = IO_BITMAP_OFFSET; } /* * do it in the per-thread copy and in the TSS ... */ - set_bitmap((unsigned long *) t->io_bitmap, from, num, !turn_on); + set_bitmap((unsigned long *) t->io_bitmap_ptr, from, num, !turn_on); set_bitmap((unsigned long *) tss->io_bitmap, from, num, !turn_on); return 0; diff --git a/arch/x86_64/kernel/mtrr.c b/arch/x86_64/kernel/mtrr.c index 1f36d262b618..b0c43563a30a 100644 --- a/arch/x86_64/kernel/mtrr.c +++ b/arch/x86_64/kernel/mtrr.c @@ -19,10 +19,14 @@ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. (For earlier history, see arch/i386/kernel/mtrr.c) - September 2001 Dave Jones + v2.00 September 2001 Dave Jones Initial rewrite for x86-64. - + Removal of non-Intel style MTRR code. + v2.01 June 2002 Dave Jones + Removal of redundant abstraction layer. + 64-bit fixes. */ + #include #include #include @@ -60,35 +64,19 @@ #include #include -#define MTRR_VERSION "2.00 (20020207)" +#define MTRR_VERSION "2.01 (20020605)" #define TRUE 1 #define FALSE 0 -#define MTRRcap_MSR 0x0fe -#define MTRRdefType_MSR 0x2ff - -#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg)) -#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1) +#define MSR_MTRRphysBase(reg) (0x200 + 2 * (reg)) +#define MSR_MTRRphysMask(reg) (0x200 + 2 * (reg) + 1) #define NUM_FIXED_RANGES 88 -#define MTRRfix64K_00000_MSR 0x250 -#define MTRRfix16K_80000_MSR 0x258 -#define MTRRfix16K_A0000_MSR 0x259 -#define MTRRfix4K_C0000_MSR 0x268 -#define MTRRfix4K_C8000_MSR 0x269 -#define MTRRfix4K_D0000_MSR 0x26a -#define MTRRfix4K_D8000_MSR 0x26b -#define MTRRfix4K_E0000_MSR 0x26c -#define MTRRfix4K_E8000_MSR 0x26d -#define MTRRfix4K_F0000_MSR 0x26e -#define MTRRfix4K_F8000_MSR 0x26f -#ifdef CONFIG_SMP #define MTRR_CHANGE_MASK_FIXED 0x01 #define MTRR_CHANGE_MASK_VARIABLE 0x02 #define MTRR_CHANGE_MASK_DEFTYPE 0x04 -#endif typedef u8 mtrr_type; @@ -97,49 +85,43 @@ typedef u8 mtrr_type; #ifdef CONFIG_SMP #define set_mtrr(reg,base,size,type) set_mtrr_smp (reg, base, size, type) #else -#define set_mtrr(reg,base,size,type) (*set_mtrr_up) (reg, base, size, type, \ - TRUE) +#define set_mtrr(reg,base,size,type) set_mtrr_up (reg, base, size, type, TRUE) #endif #if defined(CONFIG_PROC_FS) || defined(CONFIG_DEVFS_FS) #define USERSPACE_INTERFACE #endif -#ifndef USERSPACE_INTERFACE -#define compute_ascii() while (0) -#endif - #ifdef USERSPACE_INTERFACE static char *ascii_buffer; static unsigned int ascii_buf_bytes; -#endif -static unsigned int *usage_table; -static DECLARE_MUTEX (main_lock); - -/* Private functions */ -#ifdef USERSPACE_INTERFACE static void compute_ascii (void); +#else +#define compute_ascii() while (0) #endif +static unsigned int *usage_table; +static DECLARE_MUTEX (mtrr_lock); + struct set_mtrr_context { - unsigned long flags; - unsigned long deftype_lo; - unsigned long deftype_hi; - unsigned long cr4val; + u32 deftype_lo; + u32 deftype_hi; + u64 flags; + u64 cr4val; }; /* Put the processor into a state where MTRRs can be safely set */ static void set_mtrr_prepare (struct set_mtrr_context *ctxt) { - unsigned long cr0; + u64 cr0; /* Disable interrupts locally */ __save_flags(ctxt->flags); __cli(); /* Save value of CR4 and clear Page Global Enable (bit 7) */ - if (cpu_has_ge) { + if 
(cpu_has_pge) { ctxt->cr4val = read_cr4(); write_cr4(ctxt->cr4val & ~(1UL << 7)); } @@ -152,8 +134,8 @@ static void set_mtrr_prepare (struct set_mtrr_context *ctxt) wbinvd(); /* Disable MTRRs, and set the default type to uncached */ - rdmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); - wrmsr(MTRRdefType_MSR, ctxt->deftype_lo & 0xf300UL, ctxt->deftype_hi); + rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); + wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL, ctxt->deftype_hi); } @@ -164,7 +146,7 @@ static void set_mtrr_done (struct set_mtrr_context *ctxt) wbinvd(); /* Restore MTRRdefType */ - wrmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); + wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); /* Enable caches */ write_cr0(read_cr0() & 0xbfffffff); @@ -181,9 +163,9 @@ static void set_mtrr_done (struct set_mtrr_context *ctxt) /* This function returns the number of variable MTRRs */ static unsigned int get_num_var_ranges (void) { - unsigned long config, dummy; + u32 config, dummy; - rdmsr (MTRRcap_MSR, config, dummy); + rdmsr (MSR_MTRRcap, config, dummy); return (config & 0xff); } @@ -191,21 +173,21 @@ static unsigned int get_num_var_ranges (void) /* Returns non-zero if we have the write-combining memory type */ static int have_wrcomb (void) { - unsigned long config, dummy; + u32 config, dummy; - rdmsr (MTRRcap_MSR, config, dummy); + rdmsr (MSR_MTRRcap, config, dummy); return (config & (1 << 10)); } -static u32 size_or_mask, size_and_mask; +static u64 size_or_mask, size_and_mask; -static void get_mtrr (unsigned int reg, unsigned long *base, - unsigned long *size, mtrr_type * type) +static void get_mtrr (unsigned int reg, u64 *base, u32 *size, mtrr_type * type) { - unsigned long mask_lo, mask_hi, base_lo, base_hi; + u32 mask_lo, mask_hi, base_lo, base_hi; + u64 newsize; - rdmsr (MTRRphysMask_MSR (reg), mask_lo, mask_hi); + rdmsr (MSR_MTRRphysMask(reg), mask_lo, mask_hi); if ((mask_lo & 0x800) == 0) { /* Invalid (i.e. free) range */ *base = 0; @@ -214,32 +196,29 @@ static void get_mtrr (unsigned int reg, unsigned long *base, return; } - rdmsr (MTRRphysBase_MSR (reg), base_lo, base_hi); + rdmsr (MSR_MTRRphysBase(reg), base_lo, base_hi); /* Work out the shifted address mask. */ - mask_lo = size_or_mask | mask_hi << (32 - PAGE_SHIFT) - | mask_lo >> PAGE_SHIFT; - - /* This works correctly if size is a power of two, i.e. a - contiguous range. */ - *size = -mask_lo; + newsize = (u64) mask_hi << 32 | (mask_lo & ~0x800); + newsize = ~newsize+1; + *size = (u32) newsize >> PAGE_SHIFT; *base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT; *type = base_lo & 0xff; } -static void set_mtrr_up (unsigned int reg, unsigned long base, - unsigned long size, mtrr_type type, int do_safe) -/* [SUMMARY] Set variable MTRR register on the local CPU. - The register to set. - The base address of the region. - The size of the region. If this is 0 the region is disabled. - The type of the region. - If TRUE, do the change safely. If FALSE, safety measures should - be done externally. - [RETURNS] Nothing. -*/ +/* + * Set variable MTRR register on the local CPU. + * The register to set. + * The base address of the region. + * The size of the region. If this is 0 the region is disabled. + * The type of the region. + * If TRUE, do the change safely. If FALSE, safety measures should + * be done externally. 
+ */ +static void set_mtrr_up (unsigned int reg, u64 base, + u32 size, mtrr_type type, int do_safe) { struct set_mtrr_context ctxt; @@ -249,12 +228,12 @@ static void set_mtrr_up (unsigned int reg, unsigned long base, if (size == 0) { /* The invalid bit is kept in the mask, so we simply clear the relevant mask register to disable a range. */ - wrmsr (MTRRphysMask_MSR (reg), 0, 0); + wrmsr (MSR_MTRRphysMask(reg), 0, 0); } else { - wrmsr (MTRRphysBase_MSR (reg), base << PAGE_SHIFT | type, + wrmsr (MSR_MTRRphysBase(reg), base << PAGE_SHIFT | type, (base & size_and_mask) >> (32 - PAGE_SHIFT)); - wrmsr (MTRRphysMask_MSR (reg), -size << PAGE_SHIFT | 0x800, - (-size & size_and_mask) >> (32 - PAGE_SHIFT)); + wrmsr (MSR_MTRRphysMask(reg), (-size-1) << PAGE_SHIFT | 0x800, + ((-size-1) & size_and_mask) >> (32 - PAGE_SHIFT)); } if (do_safe) set_mtrr_done (&ctxt); @@ -264,41 +243,40 @@ static void set_mtrr_up (unsigned int reg, unsigned long base, #ifdef CONFIG_SMP struct mtrr_var_range { - unsigned long base_lo; - unsigned long base_hi; - unsigned long mask_lo; - unsigned long mask_hi; + u32 base_lo; + u32 base_hi; + u32 mask_lo; + u32 mask_hi; }; /* Get the MSR pair relating to a var range */ static void __init get_mtrr_var_range (unsigned int index, struct mtrr_var_range *vr) { - rdmsr (MTRRphysBase_MSR (index), vr->base_lo, vr->base_hi); - rdmsr (MTRRphysMask_MSR (index), vr->mask_lo, vr->mask_hi); + rdmsr (MSR_MTRRphysBase(index), vr->base_lo, vr->base_hi); + rdmsr (MSR_MTRRphysMask(index), vr->mask_lo, vr->mask_hi); } /* Set the MSR pair relating to a var range. Returns TRUE if changes are made */ -static int __init -set_mtrr_var_range_testing (unsigned int index, struct mtrr_var_range *vr) +static int __init set_mtrr_var_range_testing (unsigned int index, + struct mtrr_var_range *vr) { - unsigned int lo, hi; + u32 lo, hi; int changed = FALSE; - rdmsr (MTRRphysBase_MSR (index), lo, hi); - if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL) - || (vr->base_hi & 0xfUL) != (hi & 0xfUL)) { - wrmsr (MTRRphysBase_MSR (index), vr->base_lo, vr->base_hi); + rdmsr (MSR_MTRRphysBase(index), lo, hi); + if ((vr->base_lo & 0xfffff0ff) != (lo & 0xfffff0ff) + || (vr->base_hi & 0x000fffff) != (hi & 0x000fffff)) { + wrmsr (MSR_MTRRphysBase(index), vr->base_lo, vr->base_hi); changed = TRUE; } - rdmsr (MTRRphysMask_MSR (index), lo, hi); - - if ((vr->mask_lo & 0xfffff800UL) != (lo & 0xfffff800UL) - || (vr->mask_hi & 0xfUL) != (hi & 0xfUL)) { - wrmsr (MTRRphysMask_MSR (index), vr->mask_lo, vr->mask_hi); + rdmsr (MSR_MTRRphysMask(index), lo, hi); + if ((vr->mask_lo & 0xfffff800) != (lo & 0xfffff800) + || (vr->mask_hi & 0x000fffff) != (hi & 0x000fffff)) { + wrmsr (MSR_MTRRphysMask(index), vr->mask_lo, vr->mask_hi); changed = TRUE; } return changed; @@ -307,45 +285,50 @@ set_mtrr_var_range_testing (unsigned int index, struct mtrr_var_range *vr) static void __init get_fixed_ranges (mtrr_type * frs) { - unsigned long *p = (unsigned long *) frs; + u32 *p = (u32 *) frs; int i; - rdmsr (MTRRfix64K_00000_MSR, p[0], p[1]); + rdmsr (MSR_MTRRfix64K_00000, p[0], p[1]); for (i = 0; i < 2; i++) - rdmsr (MTRRfix16K_80000_MSR + i, p[2 + i * 2], p[3 + i * 2]); + rdmsr (MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]); for (i = 0; i < 8; i++) - rdmsr (MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]); + rdmsr (MSR_MTRRfix4K_C0000 + i, p[6 + i * 2], p[7 + i * 2]); } static int __init set_fixed_ranges_testing (mtrr_type * frs) { - unsigned long *p = (unsigned long *) frs; + u32 *p = (u32 *) frs; int changed = FALSE; int i; - 
unsigned long lo, hi; + u32 lo, hi; - rdmsr (MTRRfix64K_00000_MSR, lo, hi); + printk (KERN_INFO "mtrr: rdmsr 64K_00000\n"); + rdmsr (MSR_MTRRfix64K_00000, lo, hi); if (p[0] != lo || p[1] != hi) { - wrmsr (MTRRfix64K_00000_MSR, p[0], p[1]); + printk (KERN_INFO "mtrr: Writing %x:%x to 64K MSR. lohi were %x:%x\n", p[0], p[1], lo, hi); + wrmsr (MSR_MTRRfix64K_00000, p[0], p[1]); changed = TRUE; } + printk (KERN_INFO "mtrr: rdmsr 16K_80000\n"); for (i = 0; i < 2; i++) { - rdmsr (MTRRfix16K_80000_MSR + i, lo, hi); + rdmsr (MSR_MTRRfix16K_80000 + i, lo, hi); if (p[2 + i * 2] != lo || p[3 + i * 2] != hi) { - wrmsr (MTRRfix16K_80000_MSR + i, p[2 + i * 2], - p[3 + i * 2]); + printk (KERN_INFO "mtrr: Writing %x:%x to 16K MSR%d. lohi were %x:%x\n", p[2 + i * 2], p[3 + i * 2], i, lo, hi ); + wrmsr (MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]); changed = TRUE; } } + printk (KERN_INFO "mtrr: rdmsr 4K_C0000\n"); for (i = 0; i < 8; i++) { - rdmsr (MTRRfix4K_C0000_MSR + i, lo, hi); + rdmsr (MSR_MTRRfix4K_C0000 + i, lo, hi); + printk (KERN_INFO "mtrr: MTRRfix4K_C0000+%d = %x:%x\n", i, lo, hi); if (p[6 + i * 2] != lo || p[7 + i * 2] != hi) { - wrmsr (MTRRfix4K_C0000_MSR + i, p[6 + i * 2], - p[7 + i * 2]); + printk (KERN_INFO "mtrr: Writing %x:%x to 4K MSR%d. lohi were %x:%x\n", p[6 + i * 2], p[7 + i * 2], i, lo, hi); + wrmsr (MSR_MTRRfix4K_C0000 + i, p[6 + i * 2], p[7 + i * 2]); changed = TRUE; } } @@ -357,8 +340,8 @@ struct mtrr_state { unsigned int num_var_ranges; struct mtrr_var_range *var_ranges; mtrr_type fixed_ranges[NUM_FIXED_RANGES]; - unsigned char enabled; mtrr_type def_type; + unsigned char enabled; }; @@ -367,9 +350,9 @@ static void __init get_mtrr_state (struct mtrr_state *state) { unsigned int nvrs, i; struct mtrr_var_range *vrs; - unsigned long lo, dummy; + u32 lo, dummy; - nvrs = state->num_var_ranges = get_num_var_ranges (); + nvrs = state->num_var_ranges = get_num_var_ranges(); vrs = state->var_ranges = kmalloc (nvrs * sizeof (struct mtrr_var_range), GFP_KERNEL); if (vrs == NULL) @@ -379,7 +362,7 @@ static void __init get_mtrr_state (struct mtrr_state *state) get_mtrr_var_range (i, &vrs[i]); get_fixed_ranges (state->fixed_ranges); - rdmsr (MTRRdefType_MSR, lo, dummy); + rdmsr (MSR_MTRRdefType, lo, dummy); state->def_type = (lo & 0xff); state->enabled = (lo & 0xc00) >> 10; } @@ -393,17 +376,18 @@ static void __init finalize_mtrr_state (struct mtrr_state *state) } -static unsigned long __init set_mtrr_state (struct mtrr_state *state, +/* + * Set the MTRR state for this CPU. + * The MTRR state information to read. + * Some relevant CPU context. + * [NOTE] The CPU must already be in a safe state for MTRR changes. + * [RETURNS] 0 if no changes made, else a mask indication what was changed. + */ +static u64 __init set_mtrr_state (struct mtrr_state *state, struct set_mtrr_context *ctxt) -/* [SUMMARY] Set the MTRR state for this CPU. - The MTRR state information to read. - Some relevant CPU context. - [NOTE] The CPU must already be in a safe state for MTRR changes. - [RETURNS] 0 if no changes made, else a mask indication what was changed. 
-*/ { unsigned int i; - unsigned long change_mask = 0; + u64 change_mask = 0; for (i = 0; i < state->num_var_ranges; i++) if (set_mtrr_var_range_testing (i, &state->var_ranges[i])) @@ -428,16 +412,16 @@ static volatile int wait_barrier_execute = FALSE; static volatile int wait_barrier_cache_enable = FALSE; struct set_mtrr_data { - unsigned long smp_base; - unsigned long smp_size; + u64 smp_base; + u32 smp_size; unsigned int smp_reg; mtrr_type smp_type; }; +/* + * Synchronisation handler. Executed by "other" CPUs. + */ static void ipi_handler (void *info) -/* [SUMMARY] Synchronisation handler. Executed by "other" CPUs. - [RETURNS] Nothing. -*/ { struct set_mtrr_data *data = info; struct set_mtrr_context ctxt; @@ -449,7 +433,7 @@ static void ipi_handler (void *info) barrier (); /* The master has cleared me to execute */ - (*set_mtrr_up) (data->smp_reg, data->smp_base, data->smp_size, + set_mtrr_up (data->smp_reg, data->smp_base, data->smp_size, data->smp_type, FALSE); /* Notify master CPU that I've executed the function */ @@ -462,8 +446,7 @@ static void ipi_handler (void *info) } -static void set_mtrr_smp (unsigned int reg, unsigned long base, - unsigned long size, mtrr_type type) +static void set_mtrr_smp (unsigned int reg, u64 base, u32 size, mtrr_type type) { struct set_mtrr_data data; struct set_mtrr_context ctxt; @@ -490,7 +473,7 @@ static void set_mtrr_smp (unsigned int reg, unsigned long base, /* Set up for completion wait and then release other CPUs to change MTRRs */ atomic_set (&undone_count, smp_num_cpus - 1); wait_barrier_execute = FALSE; - (*set_mtrr_up) (reg, base, size, type, FALSE); + set_mtrr_up (reg, base, size, type, FALSE); /* Now wait for other CPUs to complete the function */ while (atomic_read (&undone_count) > 0) @@ -505,7 +488,7 @@ static void set_mtrr_smp (unsigned int reg, unsigned long base, /* Some BIOS's are fucked and don't set all MTRRs the same! */ -static void __init mtrr_state_warn (unsigned long mask) +static void __init mtrr_state_warn (u32 mask) { if (!mask) return; @@ -521,7 +504,7 @@ static void __init mtrr_state_warn (unsigned long mask) #endif /* CONFIG_SMP */ -static char inline * attrib_to_str (int x) +static inline char * attrib_to_str (int x) { return (x <= 6) ? mtrr_strings[x] : "?"; } @@ -551,21 +534,20 @@ static void __init init_table (void) } -static int generic_get_free_region (unsigned long base, - unsigned long size) -/* [SUMMARY] Get a free MTRR. - The starting (base) address of the region. - The size (in bytes) of the region. - [RETURNS] The index of the region on success, else -1 on error. +/* + * Get a free MTRR. + * returns the index of the region on success, else -1 on error. */ +static int get_free_region(void) { int i, max; mtrr_type ltype; - unsigned long lbase, lsize; + u64 lbase; + u32 lsize; max = get_num_var_ranges (); for (i = 0; i < max; ++i) { - (*get_mtrr) (i, &lbase, &lsize, <ype); + get_mtrr (i, &lbase, &lsize, <ype); if (lsize == 0) return i; } @@ -573,22 +555,19 @@ static int generic_get_free_region (unsigned long base, } -static int (*get_free_region) (unsigned long base, - unsigned long size) = generic_get_free_region; - /** * mtrr_add_page - Add a memory type region * @base: Physical base address of region in pages (4 KB) * @size: Physical size of region in pages (4 KB) * @type: Type of MTRR desired * @increment: If this is true do usage counting on the region + * Returns The MTRR register on success, else a negative number + * indicating the error code. 
* - * Memory type region registers control the caching on newer Intel and - * non Intel processors. This function allows drivers to request an - * MTRR is added. The details and hardware specifics of each processor's - * implementation are hidden from the caller, but nevertheless the - * caller should expect to need to provide a power of two size on an - * equivalent power of two boundary. + * Memory type region registers control the caching on newer + * processors. This function allows drivers to request an MTRR is added. + * The caller should expect to need to provide a power of two size on + * an equivalent power of two boundary. * * If the region cannot be added either because all regions are in use * or the CPU cannot support it a negative value is returned. On success @@ -596,42 +575,28 @@ static int (*get_free_region) (unsigned long base, * as a cookie only. * * On a multiprocessor machine the changes are made to all processors. - * This is required on x86 by the Intel processors. * * The available types are * * %MTRR_TYPE_UNCACHABLE - No caching - * * %MTRR_TYPE_WRBACK - Write data back in bursts whenever - * * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts - * * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes * * BUGS: Needs a quiet flag for the cases where drivers do not mind * failures and do not wish system log messages to be sent. */ -int mtrr_add_page (unsigned long base, unsigned long size, - unsigned int type, char increment) +int mtrr_add_page (u64 base, u32 size, unsigned int type, char increment) { -/* [SUMMARY] Add an MTRR entry. - The starting (base, in pages) address of the region. - The size of the region. (in pages) - The type of the new region. - If true and the region already exists, the usage count will be - incremented. - [RETURNS] The MTRR register on success, else a negative number indicating - the error code. - [NOTE] This routine uses a spinlock. -*/ int i, max; mtrr_type ltype; - unsigned long lbase, lsize, last; + u64 lbase, last; + u32 lsize; if (base + size < 0x100) { printk (KERN_WARNING - "mtrr: cannot set region below 1 MiB (0x%lx000,0x%lx000)\n", + "mtrr: cannot set region below 1 MiB (0x%lx000,0x%x000)\n", base, size); return -EINVAL; } @@ -644,7 +609,7 @@ int mtrr_add_page (unsigned long base, unsigned long size, if (lbase != last) { printk (KERN_WARNING - "mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", + "mtrr: base(0x%lx000) is not aligned on a size(0x%x000) boundary\n", base, size); return -EINVAL; } @@ -655,7 +620,7 @@ int mtrr_add_page (unsigned long base, unsigned long size, } /* If the type is WC, check that this processor supports it */ - if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb ()) { + if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) { printk (KERN_WARNING "mtrr: your processor doesn't support write-combining\n"); return -ENOSYS; @@ -669,9 +634,9 @@ int mtrr_add_page (unsigned long base, unsigned long size, increment = increment ? 
1 : 0; max = get_num_var_ranges (); /* Search for existing MTRR */ - down (&main_lock); + down (&mtrr_lock); for (i = 0; i < max; ++i) { - (*get_mtrr) (i, &lbase, &lsize, &ltype); + get_mtrr (i, &lbase, &lsize, &ltype); if (base >= lbase + lsize) continue; if ((base < lbase) && (base + size <= lbase)) @@ -679,41 +644,41 @@ int mtrr_add_page (unsigned long base, unsigned long size, /* At this point we know there is some kind of overlap/enclosure */ if ((base < lbase) || (base + size > lbase + lsize)) { - up (&main_lock); + up (&mtrr_lock); printk (KERN_WARNING - "mtrr: 0x%lx000,0x%lx000 overlaps existing" - " 0x%lx000,0x%lx000\n", base, size, lbase, - lsize); + "mtrr: 0x%lx000,0x%x000 overlaps existing" + " 0x%lx000,0x%x000\n", base, size, lbase, lsize); return -EINVAL; } /* New region is enclosed by an existing region */ if (ltype != type) { if (type == MTRR_TYPE_UNCACHABLE) continue; - up (&main_lock); + up (&mtrr_lock); printk - ("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", - base, size, attrib_to_str (ltype), + ("mtrr: type mismatch for %lx000,%x000 old: %s new: %s\n", + base, size, + attrib_to_str (ltype), attrib_to_str (type)); return -EINVAL; } if (increment) ++usage_table[i]; compute_ascii (); - up (&main_lock); + up (&mtrr_lock); return i; } /* Search for an empty MTRR */ - i = (*get_free_region) (base, size); + i = get_free_region(); if (i < 0) { - up (&main_lock); + up (&mtrr_lock); printk ("mtrr: no more MTRRs available\n"); return i; } set_mtrr (i, base, size, type); usage_table[i] = 1; compute_ascii (); - up (&main_lock); + up (&mtrr_lock); return i; } @@ -724,13 +689,13 @@ int mtrr_add_page (unsigned long base, unsigned long size, * @size: Physical size of region * @type: Type of MTRR desired * @increment: If this is true do usage counting on the region + * Return the MTRR register on success, else a negative number + * indicating the error code. * - * Memory type region registers control the caching on newer Intel and - * non Intel processors. This function allows drivers to request an - * MTRR is added. The details and hardware specifics of each processor's - * implementation are hidden from the caller, but nevertheless the - * caller should expect to need to provide a power of two size on an - * equivalent power of two boundary. + * Memory type region registers control the caching on newer processors. + * This function allows drivers to request an MTRR is added. + * The caller should expect to need to provide a power of two size on + * an equivalent power of two boundary. * * If the region cannot be added either because all regions are in use * or the CPU cannot support it a negative value is returned. On success @@ -743,33 +708,19 @@ int mtrr_add_page (unsigned long base, unsigned long size, * The available types are * * %MTRR_TYPE_UNCACHABLE - No caching - * * %MTRR_TYPE_WRBACK - Write data back in bursts whenever - * * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts - * * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes * * BUGS: Needs a quiet flag for the cases where drivers do not mind * failures and do not wish system log messages to be sent. */ -int mtrr_add (unsigned long base, unsigned long size, unsigned int type, - char increment) +int mtrr_add (u64 base, u32 size, unsigned int type, char increment) { -/* [SUMMARY] Add an MTRR entry. - The starting (base) address of the region. - The size (in bytes) of the region. - The type of the new region. - If true and the region already exists, the usage count will be - incremented.
- [RETURNS] The MTRR register on success, else a negative number indicating - the error code. -*/ - if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { printk ("mtrr: size and base must be multiples of 4 kiB\n"); - printk ("mtrr: size: 0x%lx base: 0x%lx\n", size, base); + printk ("mtrr: size: 0x%x base: 0x%lx\n", size, base); return -EINVAL; } return mtrr_add_page (base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, @@ -792,55 +743,46 @@ int mtrr_add (unsigned long base, unsigned long size, unsigned int type, * code. */ -int mtrr_del_page (int reg, unsigned long base, unsigned long size) -/* [SUMMARY] Delete MTRR/decrement usage count. - The register. If this is less than 0 then <> and <> must - be supplied. - The base address of the region. This is ignored if <> is >= 0. - The size of the region. This is ignored if <> is >= 0. - [RETURNS] The register on success, else a negative number indicating - the error code. - [NOTE] This routine uses a spinlock. -*/ +int mtrr_del_page (int reg, u64 base, u32 size) { int i, max; mtrr_type ltype; - unsigned long lbase, lsize; + u64 lbase; + u32 lsize; max = get_num_var_ranges (); - down (&main_lock); + down (&mtrr_lock); if (reg < 0) { /* Search for existing MTRR */ for (i = 0; i < max; ++i) { - (*get_mtrr) (i, &lbase, &lsize, <ype); + get_mtrr (i, &lbase, &lsize, <ype); if (lbase == base && lsize == size) { reg = i; break; } } if (reg < 0) { - up (&main_lock); - printk ("mtrr: no MTRR for %lx000,%lx000 found\n", base, - size); + up (&mtrr_lock); + printk ("mtrr: no MTRR for %lx000,%x000 found\n", base, size); return -EINVAL; } } if (reg >= max) { - up (&main_lock); + up (&mtrr_lock); printk ("mtrr: register: %d too big\n", reg); return -EINVAL; } - (*get_mtrr) (reg, &lbase, &lsize, <ype); + get_mtrr (reg, &lbase, &lsize, <ype); if (lsize < 1) { - up (&main_lock); + up (&mtrr_lock); printk ("mtrr: MTRR %d not used\n", reg); return -EINVAL; } if (usage_table[reg] < 1) { - up (&main_lock); + up (&mtrr_lock); printk ("mtrr: reg: %d has count=0\n", reg); return -EINVAL; } @@ -848,7 +790,7 @@ int mtrr_del_page (int reg, unsigned long base, unsigned long size) if (--usage_table[reg] < 1) set_mtrr (reg, 0, 0, 0); compute_ascii (); - up (&main_lock); + up (&mtrr_lock); return reg; } @@ -868,19 +810,11 @@ int mtrr_del_page (int reg, unsigned long base, unsigned long size) * code. */ -int mtrr_del (int reg, unsigned long base, unsigned long size) -/* [SUMMARY] Delete MTRR/decrement usage count. - The register. If this is less than 0 then <> and <> must - be supplied. - The base address of the region. This is ignored if <> is >= 0. - The size of the region. This is ignored if <> is >= 0. - [RETURNS] The register on success, else a negative number indicating - the error code. 
-*/ +int mtrr_del (int reg, u64 base, u32 size) { if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { printk ("mtrr: size and base must be multiples of 4 kiB\n"); - printk ("mtrr: size: 0x%lx base: 0x%lx\n", size, base); + printk ("mtrr: size: 0x%x base: 0x%lx\n", size, base); return -EINVAL; } return mtrr_del_page (reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); @@ -889,8 +823,8 @@ int mtrr_del (int reg, unsigned long base, unsigned long size) #ifdef USERSPACE_INTERFACE -static int mtrr_file_add (unsigned long base, unsigned long size, - unsigned int type, char increment, struct file *file, int page) +static int mtrr_file_add (u64 base, u32 size, unsigned int type, + struct file *file, int page) { int reg, max; unsigned int *fcount = file->private_data; @@ -910,7 +844,7 @@ static int mtrr_file_add (unsigned long base, unsigned long size, if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { printk ("mtrr: size and base must be multiples of 4 kiB\n"); - printk ("mtrr: size: 0x%lx base: 0x%lx\n", size, base); + printk ("mtrr: size: 0x%x base: 0x%lx\n", size, base); return -EINVAL; } base >>= PAGE_SHIFT; @@ -925,7 +859,7 @@ static int mtrr_file_add (unsigned long base, unsigned long size, } -static int mtrr_file_del (unsigned long base, unsigned long size, +static int mtrr_file_del (u64 base, u32 size, struct file *file, int page) { int reg; @@ -935,7 +869,7 @@ static int mtrr_file_del (unsigned long base, unsigned long size, if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { printk ("mtrr: size and base must be multiples of 4 kiB\n"); - printk ("mtrr: size: 0x%lx base: 0x%lx\n", size, base); + printk ("mtrr: size: 0x%x base: 0x%lx\n", size, base); return -EINVAL; } base >>= PAGE_SHIFT; @@ -977,9 +911,9 @@ static ssize_t mtrr_write (struct file *file, const char *buf, "disable=%d" */ { - int i, err; - unsigned long reg; - unsigned long long base, size; + int i, err, reg; + u64 base; + u32 size; char *ptr; char line[LINE_SIZE]; @@ -1027,7 +961,7 @@ static ssize_t mtrr_write (struct file *file, const char *buf, if ((base & 0xfff) || (size & 0xfff)) { printk ("mtrr: size and base must be multiples of 4 kiB\n"); - printk ("mtrr: size: 0x%Lx base: 0x%Lx\n", size, base); + printk ("mtrr: size: 0x%x base: 0x%lx\n", size, base); return -EINVAL; } @@ -1046,9 +980,7 @@ static ssize_t mtrr_write (struct file *file, const char *buf, continue; base >>= PAGE_SHIFT; size >>= PAGE_SHIFT; - err = - mtrr_add_page ((unsigned long) base, (unsigned long) size, - i, 1); + err = mtrr_add_page ((u64) base, size, i, 1); if (err < 0) return err; return len; @@ -1076,7 +1008,7 @@ static int mtrr_ioctl (struct inode *inode, struct file *file, if (copy_from_user (&sentry, (void *) arg, sizeof sentry)) return -EFAULT; err = - mtrr_file_add (sentry.base, sentry.size, sentry.type, 1, + mtrr_file_add (sentry.base, sentry.size, sentry.type, file, 0); if (err < 0) return err; @@ -1117,7 +1049,7 @@ static int mtrr_ioctl (struct inode *inode, struct file *file, return -EFAULT; if (gentry.regnum >= get_num_var_ranges ()) return -EINVAL; - (*get_mtrr) (gentry.regnum, &gentry.base, &gentry.size, &type); + get_mtrr (gentry.regnum, &gentry.base, &gentry.size, &type); /* Hide entries that go above 4GB */ if (gentry.base + gentry.size > 0x100000 @@ -1139,7 +1071,7 @@ static int mtrr_ioctl (struct inode *inode, struct file *file, if (copy_from_user (&sentry, (void *) arg, sizeof sentry)) return -EFAULT; err = - mtrr_file_add (sentry.base, sentry.size, sentry.type, 1, + mtrr_file_add (sentry.base, sentry.size, 
sentry.type, file, 1); if (err < 0) return err; @@ -1180,7 +1112,7 @@ static int mtrr_ioctl (struct inode *inode, struct file *file, return -EFAULT; if (gentry.regnum >= get_num_var_ranges ()) return -EINVAL; - (*get_mtrr) (gentry.regnum, &gentry.base, &gentry.size, &type); + get_mtrr (gentry.regnum, &gentry.base, &gentry.size, &type); gentry.type = type; if (copy_to_user ((void *) arg, &gentry, sizeof gentry)) @@ -1199,7 +1131,6 @@ static int mtrr_close (struct inode *ino, struct file *file) if (fcount == NULL) return 0; - lock_kernel (); max = get_num_var_ranges (); for (i = 0; i < max; ++i) { while (fcount[i] > 0) { @@ -1208,7 +1139,6 @@ static int mtrr_close (struct inode *ino, struct file *file) --fcount[i]; } } - unlock_kernel (); kfree (fcount); file->private_data = NULL; return 0; @@ -1234,12 +1164,13 @@ static void compute_ascii (void) char factor; int i, max; mtrr_type type; - unsigned long base, size; + u64 base; + u32 size; ascii_buf_bytes = 0; max = get_num_var_ranges (); for (i = 0; i < max; i++) { - (*get_mtrr) (i, &base, &size, &type); + get_mtrr (i, &base, &size, &type); if (size == 0) usage_table[i] = 0; else { @@ -1253,11 +1184,10 @@ static void compute_ascii (void) } sprintf (ascii_buffer + ascii_buf_bytes, - "reg%02i: base=0x%05lx000 (%4liMB), size=%4li%cB: %s, count=%d\n", + "reg%02i: base=0x%05lx000 (%4liMB), size=%4i%cB: %s, count=%d\n", i, base, base >> (20 - PAGE_SHIFT), size, factor, attrib_to_str (type), usage_table[i]); - ascii_buf_bytes += - strlen (ascii_buffer + ascii_buf_bytes); + ascii_buf_bytes += strlen (ascii_buffer + ascii_buf_bytes); } } devfs_set_file_size (devfs_handle, ascii_buf_bytes); @@ -1283,22 +1213,16 @@ static void __init mtrr_setup (void) if ((cpuid_eax (0x80000000) >= 0x80000008)) { u32 phys_addr; phys_addr = cpuid_eax (0x80000008) & 0xff; - size_or_mask = - ~((1 << (phys_addr - PAGE_SHIFT)) - 1); - size_and_mask = ~size_or_mask & 0xfff00000; - } else { - /* FIXME: This is to make it work on Athlon during debugging. */ - size_or_mask = 0xff000000; /* 36 bits */ - size_and_mask = 0x00f00000; + size_or_mask = ~((1 << (phys_addr - PAGE_SHIFT)) - 1); + size_and_mask = ~size_or_mask & 0xfffffffffff00000; } - printk ("mtrr: detected mtrr type: x86-64\n"); } } #ifdef CONFIG_SMP -static volatile unsigned long smp_changes_mask __initdata = 0; +static volatile u32 smp_changes_mask __initdata = 0; static struct mtrr_state smp_mtrr_state __initdata = { 0, 0 }; void __init mtrr_init_boot_cpu (void) @@ -1310,7 +1234,8 @@ void __init mtrr_init_boot_cpu (void) void __init mtrr_init_secondary_cpu (void) { - unsigned long mask, count; + u64 mask; + int count; struct set_mtrr_context ctxt; /* Note that this is not ideal, since the cache is only flushed/disabled @@ -1357,4 +1282,3 @@ int __init mtrr_init (void) init_table (); return 0; } - diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index e233b3557ce5..f00fff0638de 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -320,9 +321,6 @@ void show_regs(struct pt_regs * regs) printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); } -#define __STR(x) #x -#define __STR2(x) __STR(x) - extern void load_gs_index(unsigned); /* @@ -330,7 +328,13 @@ extern void load_gs_index(unsigned); */ void exit_thread(void) { - /* nothing to do ... 
*/ + struct task_struct *me = current; + if (me->thread.io_bitmap_ptr) { + kfree(me->thread.io_bitmap_ptr); + me->thread.io_bitmap_ptr = NULL; + (init_tss + smp_processor_id())->io_map_base = + INVALID_IO_BITMAP_OFFSET; + } } void flush_thread(void) @@ -392,6 +396,14 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp, unlazy_fpu(current); p->thread.i387 = current->thread.i387; + if (unlikely(me->thread.io_bitmap_ptr != NULL)) { + p->thread.io_bitmap_ptr = kmalloc((IO_BITMAP_SIZE+1)*4, GFP_KERNEL); + if (!p->thread.io_bitmap_ptr) + return -ENOMEM; + memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, + (IO_BITMAP_SIZE+1)*4); + } + return 0; } @@ -491,21 +503,14 @@ void __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Handle the IO bitmap */ - if (unlikely(prev->ioperm || next->ioperm)) { - if (next->ioperm) { + if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) { + if (next->io_bitmap_ptr) { /* * 4 cachelines copy ... not good, but not that * bad either. Anyone got something better? * This only affects processes which use ioperm(). - * [Putting the TSSs into 4k-tlb mapped regions - * and playing VM tricks to switch the IO bitmap - * is not really acceptable.] - * On x86-64 we could put multiple bitmaps into - * the GDT and just switch offsets - * This would require ugly special cases on overflow - * though -AK */ - memcpy(tss->io_bitmap, next->io_bitmap, + memcpy(tss->io_bitmap, next->io_bitmap_ptr, IO_BITMAP_SIZE*sizeof(u32)); tss->io_map_base = IO_BITMAP_OFFSET; } else { diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c index f6c296dce4b5..66ae787c8d19 100644 --- a/arch/x86_64/kernel/setup64.c +++ b/arch/x86_64/kernel/setup64.c @@ -91,6 +91,9 @@ void pda_init(int cpu) pda->me = pda; pda->cpudata_offset = 0; + pda->active_mm = &init_mm; + pda->mmu_state = 0; + asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0)); wrmsrl(MSR_GS_BASE, cpu_pda + cpu); } diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c index 98b653afe853..229592faf805 100644 --- a/arch/x86_64/kernel/signal.c +++ b/arch/x86_64/kernel/signal.c @@ -84,7 +84,6 @@ struct rt_sigframe char *pretcode; struct ucontext uc; struct siginfo info; - struct _fpstate fpstate; }; static int @@ -186,8 +185,7 @@ badframe: */ static int -setup_sigcontext(struct sigcontext *sc, struct _fpstate *fpstate, - struct pt_regs *regs, unsigned long mask) +setup_sigcontext(struct sigcontext *sc, struct pt_regs *regs, unsigned long mask) { int tmp, err = 0; struct task_struct *me = current; @@ -221,20 +219,17 @@ setup_sigcontext(struct sigcontext *sc, struct _fpstate *fpstate, err |= __put_user(mask, &sc->oldmask); err |= __put_user(me->thread.cr2, &sc->cr2); - tmp = save_i387(fpstate); - if (tmp < 0) - err = 1; - else - err |= __put_user(tmp ? fpstate : NULL, &sc->fpstate); - return err; } /* * Determine which stack to use.. 
*/ -static inline struct rt_sigframe * -get_sigframe(struct k_sigaction *ka, struct pt_regs * regs) + +#define round_down(p, r) ((void *) ((unsigned long)((p) - (r) + 1) & ~((r)-1))) + +static void * +get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size) { unsigned long rsp; @@ -247,22 +242,34 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs * regs) rsp = current->sas_ss_sp + current->sas_ss_size; } - rsp = (rsp - sizeof(struct _fpstate)) & ~(15UL); - rsp -= offsetof(struct rt_sigframe, fpstate); - - return (struct rt_sigframe *) rsp; + return round_down(rsp - size, 16); } static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, struct pt_regs * regs) { - struct rt_sigframe *frame; + struct rt_sigframe *frame = NULL; + struct _fpstate *fp = NULL; int err = 0; - frame = get_sigframe(ka, regs); + if (current->used_math) { + fp = get_stack(ka, regs, sizeof(struct _fpstate)); + frame = round_down((char *)fp - sizeof(struct rt_sigframe), 16) - 8; - if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate))) { goto give_sigsegv; + } + + if (save_i387(fp) < 0) + err |= -1; + } + + if (!frame) + frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8; + + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) { + goto give_sigsegv; + } if (ka->sa.sa_flags & SA_SIGINFO) { err |= copy_siginfo_to_user(&frame->info, info); @@ -278,14 +285,10 @@ static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, err |= __put_user(sas_ss_flags(regs->rsp), &frame->uc.uc_stack.ss_flags); err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); - err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, - regs, set->sig[0]); + err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0]); + err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate); err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); - if (err) { - goto give_sigsegv; - } - /* Set up to return from userspace. If provided, use a stub already in userspace. */ /* x86-64 should always use SA_RESTORER. */ @@ -297,7 +300,6 @@ static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, } if (err) { - printk("fault 3\n"); goto give_sigsegv; } @@ -305,7 +307,6 @@ static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, printk("%d old rip %lx old rsp %lx old rax %lx\n", current->pid,regs->rip,regs->rsp,regs->rax); #endif - /* Set up registers for signal handler */ { struct exec_domain *ed = current_thread_info()->exec_domain; @@ -320,9 +321,10 @@ static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, next argument after the signal number on the stack. */ regs->rsi = (unsigned long)&frame->info; regs->rdx = (unsigned long)&frame->uc; - regs->rsp = (unsigned long) frame; regs->rip = (unsigned long) ka->sa.sa_handler; + regs->rsp = (unsigned long)frame; + set_fs(USER_DS); regs->eflags &= ~TF_MASK; diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index 3d6e8a406b54..f0d99edfec0e 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -25,8 +25,6 @@ /* The 'big kernel lock' */ spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; -struct tlb_state cpu_tlbstate[NR_CPUS] = {[0 ... NR_CPUS-1] = { &init_mm, 0 }}; - /* * the following functions deal with sending IPIs between CPUs. 
* @@ -147,9 +145,9 @@ static spinlock_t tlbstate_lock = SPIN_LOCK_UNLOCKED; */ static void inline leave_mm (unsigned long cpu) { - if (cpu_tlbstate[cpu].state == TLBSTATE_OK) + if (read_pda(mmu_state) == TLBSTATE_OK) BUG(); - clear_bit(cpu, &cpu_tlbstate[cpu].active_mm->cpu_vm_mask); + clear_bit(cpu, &read_pda(active_mm)->cpu_vm_mask); __flush_tlb(); } @@ -164,18 +162,18 @@ static void inline leave_mm (unsigned long cpu) * the other cpus, but smp_invalidate_interrupt ignores flush ipis * for the wrong mm, and in the worst case we perform a superfluous * tlb flush. - * 1a2) set cpu_tlbstate to TLBSTATE_OK + * 1a2) set cpu mmu_state to TLBSTATE_OK * Now the smp_invalidate_interrupt won't call leave_mm if cpu0 * was in lazy tlb mode. - * 1a3) update cpu_tlbstate[].active_mm + * 1a3) update cpu active_mm * Now cpu0 accepts tlb flushes for the new mm. * 1a4) set_bit(cpu, &new_mm->cpu_vm_mask); * Now the other cpus will send tlb flush ipis. * 1a5) change cr3. * 1b) thread switch without mm change - * cpu_tlbstate[].active_mm is correct, cpu0 already handles + * cpu active_mm is correct, cpu0 already handles * flush ipis. - * 1b1) set cpu_tlbstate to TLBSTATE_OK + * 1b1) set cpu mmu_state to TLBSTATE_OK * 1b2) test_and_set the cpu bit in cpu_vm_mask. * Atomically set the bit [other cpus will start sending flush ipis], * and test the bit. @@ -188,7 +186,7 @@ static void inline leave_mm (unsigned long cpu) * runs in kernel space, the cpu could load tlb entries for user space * pages. * - * The good news is that cpu_tlbstate is local to each cpu, no + * The good news is that cpu mmu_state is local to each cpu, no * write/read ordering problems. */ @@ -216,8 +214,8 @@ asmlinkage void smp_invalidate_interrupt (void) * BUG(); */ - if (flush_mm == cpu_tlbstate[cpu].active_mm) { - if (cpu_tlbstate[cpu].state == TLBSTATE_OK) { + if (flush_mm == read_pda(active_mm)) { + if (read_pda(mmu_state) == TLBSTATE_OK) { if (flush_va == FLUSH_ALL) local_flush_tlb(); else @@ -335,7 +333,7 @@ static inline void do_flush_tlb_all_local(void) unsigned long cpu = smp_processor_id(); __flush_tlb_all(); - if (cpu_tlbstate[cpu].state == TLBSTATE_LAZY) + if (read_pda(mmu_state) == TLBSTATE_LAZY) leave_mm(cpu); } diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c index b292ca527a8a..e576e9f98ec5 100644 --- a/arch/x86_64/kernel/vsyscall.c +++ b/arch/x86_64/kernel/vsyscall.c @@ -47,7 +47,7 @@ #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) -#define NO_VSYSCALL 1 +//#define NO_VSYSCALL 1 #ifdef NO_VSYSCALL #include diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c index 9d88edb5c62d..2bbb7d8238b5 100644 --- a/arch/x86_64/kernel/x8664_ksyms.c +++ b/arch/x86_64/kernel/x8664_ksyms.c @@ -189,3 +189,5 @@ EXPORT_SYMBOL_NOVERS(do_softirq_thunk); void out_of_line_bug(void); EXPORT_SYMBOL(out_of_line_bug); + +EXPORT_SYMBOL(init_level4_pgt); diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile index 8fbcee522aeb..6791678212ed 100644 --- a/arch/x86_64/lib/Makefile +++ b/arch/x86_64/lib/Makefile @@ -12,7 +12,7 @@ obj-y = csum-partial.o csum-copy.o csum-wrappers.o delay.o \ thunk.o io.o clear_page.o copy_page.o obj-y += memcpy.o obj-y += memmove.o -#obj-y += memset.o +obj-y += memset.o obj-y += copy_user.o export-objs := io.o csum-wrappers.o csum-partial.o diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S index 1c5d73cd73b8..44ce1223d832 100644 --- a/arch/x86_64/lib/memset.S +++ b/arch/x86_64/lib/memset.S @@ -1,6 +1,4 @@ -/* Copyright
2002 Andi Kleen, SuSE Labs */ - - // #define FIX_ALIGNMENT 1 +/* Copyright 2002 Andi Kleen */ /* * ISO C memset - set a memory block to a byte value. @@ -11,51 +9,51 @@ * * rax original destination */ - .globl ____memset + .globl __memset + .globl memset .p2align -____memset: - movq %rdi,%r10 /* save destination for return address */ - movq %rdx,%r11 /* save count */ +memset: +__memset: + movq %rdi,%r10 + movq %rdx,%r11 /* expand byte value */ - movzbl %sil,%ecx /* zero extend char value */ - movabs $0x0101010101010101,%rax /* expansion pattern */ - mul %rcx /* expand with rax, clobbers rdx */ + movzbl %sil,%ecx + movabs $0x0101010101010101,%rax + mul %rcx /* with rax, clobbers rdx */ -#ifdef FIX_ALIGNMENT /* align dst */ movl %edi,%r9d - andl $7,%r9d /* test unaligned bits */ + andl $7,%r9d jnz bad_alignment after_bad_alignment: -#endif - movq %r11,%rcx /* restore count */ - shrq $6,%rcx /* divide by 64 */ - jz handle_tail /* block smaller than 64 bytes? */ - movl $64,%r8d /* CSE loop block size */ + movq %r11,%rcx + movl $64,%r8d + shrq $6,%rcx + jz handle_tail loop_64: - movnti %rax,0*8(%rdi) - movnti %rax,1*8(%rdi) - movnti %rax,2*8(%rdi) - movnti %rax,3*8(%rdi) - movnti %rax,4*8(%rdi) - movnti %rax,5*8(%rdi) - movnti %rax,6*8(%rdi) - movnti %rax,7*8(%rdi) /* clear 64 byte blocks */ - addq %r8,%rdi /* increase pointer by 64 bytes */ - loop loop_64 /* decrement rcx and if not zero loop */ + movnti %rax,(%rdi) + movnti %rax,8(%rdi) + movnti %rax,16(%rdi) + movnti %rax,24(%rdi) + movnti %rax,32(%rdi) + movnti %rax,40(%rdi) + movnti %rax,48(%rdi) + movnti %rax,56(%rdi) + addq %r8,%rdi + loop loop_64 /* Handle tail in loops. The loops should be faster than hard to predict jump tables. */ handle_tail: movl %r11d,%ecx - andl $63,%ecx - shrl $3,%ecx + andl $63&(~7),%ecx jz handle_7 + shrl $3,%ecx loop_8: - movnti %rax,(%rdi) /* long words */ + movnti %rax,(%rdi) addq $8,%rdi loop loop_8 @@ -64,22 +62,20 @@ handle_7: andl $7,%ecx jz ende loop_1: - movb %al,(%rdi) /* bytes */ - incq %rdi + movb %al,(%rdi) + addq $1,%rdi loop loop_1 ende: movq %r10,%rax ret -#ifdef FIX_ALIGNMENT bad_alignment: - andq $-8,%r11 /* shorter than 8 bytes */ - jz handle_7 /* if yes handle it in the tail code */ - movnti %rax,(%rdi) /* unaligned store of 8 bytes */ + cmpq $7,%r11 + jbe handle_7 + movnti %rax,(%rdi) /* unaligned store */ movq $8,%r8 - subq %r9,%r8 /* compute alignment (8-misalignment) */ - addq %r8,%rdi /* fix destination */ - subq %r8,%r11 /* fix count */ + subq %r9,%r8 + addq %r8,%rdi + subq %r8,%r11 jmp after_bad_alignment -#endif diff --git a/include/asm-x86_64/i387.h b/include/asm-x86_64/i387.h index edb75edb063e..2a0292c00b54 100644 --- a/include/asm-x86_64/i387.h +++ b/include/asm-x86_64/i387.h @@ -16,11 +16,22 @@ #include #include #include +#include extern void fpu_init(void); extern void init_fpu(void); int save_i387(struct _fpstate *buf); +static inline int need_signal_i387(struct task_struct *me) +{ + if (!me->used_math) + return 0; + me->used_math = 0; + if (!test_thread_flag(TIF_USEDFPU)) + return 0; + return 1; +} + /* * FPU lazy state save handling... 
*/ diff --git a/include/asm-x86_64/ia32.h b/include/asm-x86_64/ia32.h index e57c2e593007..7830bf40cfd4 100644 --- a/include/asm-x86_64/ia32.h +++ b/include/asm-x86_64/ia32.h @@ -18,7 +18,9 @@ typedef int __kernel_clock_t32; typedef int __kernel_pid_t32; typedef unsigned short __kernel_ipc_pid_t32; typedef unsigned short __kernel_uid_t32; +typedef unsigned __kernel_uid32_t32; typedef unsigned short __kernel_gid_t32; +typedef unsigned __kernel_gid32_t32; typedef unsigned short __kernel_dev_t32; typedef unsigned int __kernel_ino_t32; typedef unsigned short __kernel_mode_t32; diff --git a/include/asm-x86_64/ipc.h b/include/asm-x86_64/ipc.h index 49ea4fdc19b4..2ca5773be061 100644 --- a/include/asm-x86_64/ipc.h +++ b/include/asm-x86_64/ipc.h @@ -1,34 +1,6 @@ #ifndef __i386_IPC_H__ #define __i386_IPC_H__ -/* - * These are used to wrap system calls on x86. - * - * See arch/i386/kernel/sys_i386.c for ugly details.. - * - * (on x86-64 only used for 32bit emulation) - */ - -struct ipc_kludge { - struct msgbuf *msgp; - long msgtyp; -}; - -#define SEMOP 1 -#define SEMGET 2 -#define SEMCTL 3 -#define MSGSND 11 -#define MSGRCV 12 -#define MSGGET 13 -#define MSGCTL 14 -#define SHMAT 21 -#define SHMDT 22 -#define SHMGET 23 -#define SHMCTL 24 - -/* Used by the DIPC package, try and avoid reusing it */ -#define DIPC 25 - -#define IPCCALL(version,op) ((version)<<16 | (op)) +/* dummy */ #endif diff --git a/include/asm-x86_64/mmu_context.h b/include/asm-x86_64/mmu_context.h index e9f6d661cf4c..e21f0e6721f8 100644 --- a/include/asm-x86_64/mmu_context.h +++ b/include/asm-x86_64/mmu_context.h @@ -19,8 +19,8 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm); static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk, unsigned cpu) { - if(cpu_tlbstate[cpu].state == TLBSTATE_OK) - cpu_tlbstate[cpu].state = TLBSTATE_LAZY; + if (read_pda(mmu_state) == TLBSTATE_OK) + write_pda(mmu_state, TLBSTATE_LAZY); } #else static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk, unsigned cpu) @@ -35,8 +35,8 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, /* stop flush ipis for the previous mm */ clear_bit(cpu, &prev->cpu_vm_mask); #ifdef CONFIG_SMP - cpu_tlbstate[cpu].state = TLBSTATE_OK; - cpu_tlbstate[cpu].active_mm = next; + write_pda(mmu_state, TLBSTATE_OK); + write_pda(active_mm, next); #endif set_bit(cpu, &next->cpu_vm_mask); /* Re-load page tables */ @@ -48,8 +48,8 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, } #ifdef CONFIG_SMP else { - cpu_tlbstate[cpu].state = TLBSTATE_OK; - if(cpu_tlbstate[cpu].active_mm != next) + write_pda(mmu_state, TLBSTATE_OK); + if (read_pda(active_mm) != next) out_of_line_bug(); if(!test_and_set_bit(cpu, &next->cpu_vm_mask)) { /* We were in lazy tlb mode and leave_mm disabled diff --git a/include/asm-x86_64/msr.h b/include/asm-x86_64/msr.h index 7e522c2f4846..4085cc8c5dbe 100644 --- a/include/asm-x86_64/msr.h +++ b/include/asm-x86_64/msr.h @@ -95,6 +95,7 @@ #define MSR_IA32_PERFCTR0 0xc1 #define MSR_IA32_PERFCTR1 0xc2 +#define MSR_MTRRcap 0x0fe #define MSR_IA32_BBL_CR_CTL 0x119 #define MSR_IA32_MCG_CAP 0x179 @@ -110,6 +111,19 @@ #define MSR_IA32_LASTINTFROMIP 0x1dd #define MSR_IA32_LASTINTTOIP 0x1de +#define MSR_MTRRfix64K_00000 0x250 +#define MSR_MTRRfix16K_80000 0x258 +#define MSR_MTRRfix16K_A0000 0x259 +#define MSR_MTRRfix4K_C0000 0x268 +#define MSR_MTRRfix4K_C8000 0x269 +#define MSR_MTRRfix4K_D0000 0x26a +#define MSR_MTRRfix4K_D8000 0x26b +#define 
MSR_MTRRfix4K_E0000 0x26c +#define MSR_MTRRfix4K_E8000 0x26d +#define MSR_MTRRfix4K_F0000 0x26e +#define MSR_MTRRfix4K_F8000 0x26f +#define MSR_MTRRdefType 0x2ff + #define MSR_IA32_MC0_CTL 0x400 #define MSR_IA32_MC0_STATUS 0x401 #define MSR_IA32_MC0_ADDR 0x402 @@ -171,11 +185,4 @@ #define MSR_IA32_APICBASE_ENABLE (1<<11) #define MSR_IA32_APICBASE_BASE (0xfffff<<12) - -#define MSR_IA32_THERM_CONTROL 0x19a -#define MSR_IA32_THERM_INTERRUPT 0x19b -#define MSR_IA32_THERM_STATUS 0x19c -#define MSR_IA32_MISC_ENABLE 0x1a0 - - #endif diff --git a/include/asm-x86_64/mtrr.h b/include/asm-x86_64/mtrr.h index ff3ea870d0d6..6505d7bd6ece 100644 --- a/include/asm-x86_64/mtrr.h +++ b/include/asm-x86_64/mtrr.h @@ -30,16 +30,16 @@ struct mtrr_sentry { - unsigned long base; /* Base address */ - unsigned long size; /* Size of region */ + __u64 base; /* Base address */ + __u32 size; /* Size of region */ unsigned int type; /* Type of region */ }; struct mtrr_gentry { + __u64 base; /* Base address */ + __u32 size; /* Size of region */ unsigned int regnum; /* Register number */ - unsigned long base; /* Base address */ - unsigned long size; /* Size of region */ unsigned int type; /* Type of region */ }; @@ -81,46 +81,38 @@ static char *mtrr_strings[MTRR_NUM_TYPES] = #ifdef __KERNEL__ /* The following functions are for use by other drivers */ -# ifdef CONFIG_MTRR -extern int mtrr_add (unsigned long base, unsigned long size, - unsigned int type, char increment); -extern int mtrr_add_page (unsigned long base, unsigned long size, - unsigned int type, char increment); -extern int mtrr_del (int reg, unsigned long base, unsigned long size); -extern int mtrr_del_page (int reg, unsigned long base, unsigned long size); -extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi); -# else -static __inline__ int mtrr_add (unsigned long base, unsigned long size, +#ifdef CONFIG_MTRR +extern int mtrr_add (__u64 base, __u32 size, unsigned int type, char increment); +extern int mtrr_add_page (__u64 base, __u32 size, unsigned int type, char increment); +extern int mtrr_del (int reg, __u64 base, __u32 size); +extern int mtrr_del_page (int reg, __u64 base, __u32 size); +#else +static __inline__ int mtrr_add (__u64 base, __u32 size, unsigned int type, char increment) { return -ENODEV; } -static __inline__ int mtrr_add_page (unsigned long base, unsigned long size, +static __inline__ int mtrr_add_page (__u64 base, __u32 size, unsigned int type, char increment) { return -ENODEV; } -static __inline__ int mtrr_del (int reg, unsigned long base, - unsigned long size) +static __inline__ int mtrr_del (int reg, __u64 base, __u32 size) { return -ENODEV; } -static __inline__ int mtrr_del_page (int reg, unsigned long base, - unsigned long size) +static __inline__ int mtrr_del_page (int reg, __u64 base, __u32 size) { return -ENODEV; } - -static __inline__ void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) {;} - -# endif +#endif /* The following functions are for initialisation: don't use them! 
*/ extern int mtrr_init (void); -# if defined(CONFIG_SMP) && defined(CONFIG_MTRR) +#if defined(CONFIG_SMP) && defined(CONFIG_MTRR) extern void mtrr_init_boot_cpu (void); extern void mtrr_init_secondary_cpu (void); -# endif +#endif #endif diff --git a/include/asm-x86_64/pda.h b/include/asm-x86_64/pda.h index 7ff508346013..eb38cf70fb90 100644 --- a/include/asm-x86_64/pda.h +++ b/include/asm-x86_64/pda.h @@ -22,6 +22,8 @@ struct x8664_pda { unsigned int __local_bh_count; unsigned int __nmi_count; /* arch dependent */ struct task_struct * __ksoftirqd_task; /* waitqueue is too large */ + struct mm_struct *active_mm; + int mmu_state; } ____cacheline_aligned; #define PDA_STACKOFFSET (5*8) diff --git a/include/asm-x86_64/processor.h b/include/asm-x86_64/processor.h index 4cda0f055a5f..03875338aedf 100644 --- a/include/asm-x86_64/processor.h +++ b/include/asm-x86_64/processor.h @@ -45,21 +45,12 @@ struct cpuinfo_x86 { __u8 x86_vendor; /* CPU vendor */ __u8 x86_model; __u8 x86_mask; - /* We know that wp_works_ok = 1, hlt_works_ok = 1, hard_math = 1, - etc... */ - char wp_works_ok; /* It doesn't on 386's */ - char hlt_works_ok; /* Problems on some 486Dx4's and old 386's */ - char hard_math; - char rfu; int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */ __u32 x86_capability[NCAPINTS]; char x86_vendor_id[16]; char x86_model_id[64]; int x86_cache_size; /* in KB - valid for CPUS which support this call */ - int fdiv_bug; - int f00f_bug; - int coma_bug; unsigned long loops_per_jiffy; } ____cacheline_aligned; @@ -323,7 +314,7 @@ struct thread_struct { /* IO permissions. the bitmap could be moved into the GDT, that would make switch faster for a limited number of ioperm using tasks. -AK */ int ioperm; - u32 io_bitmap[IO_BITMAP_SIZE+1]; + u32 *io_bitmap_ptr; }; #define INIT_THREAD { \ diff --git a/include/asm-x86_64/spinlock.h b/include/asm-x86_64/spinlock.h index 6f1d71c65a68..a276217b88a3 100644 --- a/include/asm-x86_64/spinlock.h +++ b/include/asm-x86_64/spinlock.h @@ -15,7 +15,7 @@ extern int printk(const char * fmt, ...) typedef struct { volatile unsigned int lock; -#ifdef CONFIG_DEBUG_SPINLOCK +#if SPINLOCK_DEBUG unsigned magic; #endif } spinlock_t; @@ -39,7 +39,7 @@ typedef struct { * We make no fairness assumptions. They have a cost. 
*/ -#define spin_is_locked(x) (*(volatile char *)(&(x)->lock) <= 0) +#define spin_is_locked(x) (*(volatile signed char *)(&(x)->lock) <= 0) #define spin_unlock_wait(x) do { barrier(); } while(spin_is_locked(x)) #define spin_lock_string \ @@ -62,7 +62,7 @@ typedef struct { static inline int _raw_spin_trylock(spinlock_t *lock) { - char oldval; + signed char oldval; __asm__ __volatile__( "xchgb %b0,%1" :"=q" (oldval), "=m" (lock->lock) diff --git a/include/asm-x86_64/string.h index ec456eadb674..27876b9da06a 100644 --- a/include/asm-x86_64/string.h +++ b/include/asm-x86_64/string.h @@ -40,18 +40,9 @@ extern void *__memcpy(void *to, const void *from, size_t len); __ret = __builtin_memcpy((dst),(src),__len); \ __ret; }) -#if 0 + #define __HAVE_ARCH_MEMSET -extern void *__memset(void *mem, int val, size_t len); -#define memset(dst,val,len) \ - ({ size_t __len = (len); \ - void *__ret; \ - if (__builtin_constant_p(len) && __len >= 64) \ - __ret = __memset((dst),(val),__len); \ - else \ - __ret = __builtin_memset((dst),(val),__len); \ - __ret; }) -#endif +#define memset __builtin_memset #define __HAVE_ARCH_MEMMOVE void * memmove(void * dest,const void *src,size_t count); diff --git a/include/asm-x86_64/suspend.h new file mode 100644 index 000000000000..9f065f8fe33d --- /dev/null +++ b/include/asm-x86_64/suspend.h @@ -0,0 +1,6 @@ +#ifndef SUSPEND_H +#define SUSPEND_H 1 + +/* dummy for now */ + +#endif diff --git a/include/asm-x86_64/system.h index 1df84d087823..9d6c6f1f48d5 100644 --- a/include/asm-x86_64/system.h +++ b/include/asm-x86_64/system.h @@ -13,7 +13,10 @@ #define LOCK_PREFIX "" #endif -#define prepare_to_switch() do {} while(0) +#define prepare_arch_schedule(prev) do { } while(0) +#define finish_arch_schedule(prev) do { } while(0) +#define prepare_arch_switch(rq) do { } while(0) +#define finish_arch_switch(rq) spin_unlock_irq(&(rq)->lock) #define __STR(x) #x #define STR(x) __STR(x) @@ -41,7 +44,7 @@ __POP(rax) __POP(r15) __POP(r14) __POP(r13) __POP(r12) __POP(r11) __POP(r10) \ __POP(r9) __POP(r8) -#define switch_to(prev,next) \ +#define switch_to(prev,next,last) \ asm volatile(SAVE_CONTEXT \ "movq %%rsp,%[prevrsp]\n\t" \ "movq %[nextrsp],%%rsp\n\t" \ diff --git a/include/asm-x86_64/timex.h index b87680d9e51a..98bddc2d805a 100644 --- a/include/asm-x86_64/timex.h +++ b/include/asm-x86_64/timex.h @@ -48,6 +48,4 @@ static inline cycles_t get_cycles (void) extern unsigned int cpu_khz; -#define ARCH_HAS_JIFFIES_64 - #endif diff --git a/include/asm-x86_64/tlbflush.h index 3f086b2d03b3..2e811ac262af 100644 --- a/include/asm-x86_64/tlbflush.h +++ b/include/asm-x86_64/tlbflush.h @@ -106,15 +106,6 @@ static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long st #define TLBSTATE_OK 1 #define TLBSTATE_LAZY 2 -struct tlb_state -{ - struct mm_struct *active_mm; - int state; - char __cacheline_padding[24]; -}; -extern struct tlb_state cpu_tlbstate[NR_CPUS]; - - #endif #define flush_tlb_kernel_range(start, end) flush_tlb_all() -- cgit v1.2.3 From 86403107ccc75a0562ecdf21b3adc07285b83daa Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 17 Jun 2002 20:27:29 -0700 Subject: [PATCH] Move jiffies_64 down into architectures x86-64 needs its own special declaration of jiffies_64. Prepare for this by moving the jiffies_64 declaration from kernel/timer.c down into each architecture.
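A minimal sketch of the resulting arrangement (the get_jiffies_64() helper here is illustrative only, not part of this patch): each architecture now carries the bare counter, and a 32-bit architecture has to take xtime_lock around the read, because a 64-bit load is not atomic there:

	u64 jiffies_64;

	extern rwlock_t xtime_lock;

	/* Illustrative: safe 64-bit read on a 32-bit architecture */
	static inline u64 get_jiffies_64(void)
	{
		u64 j;

		read_lock_irq(&xtime_lock);
		j = jiffies_64;
		read_unlock_irq(&xtime_lock);
		return j;
	}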
--- arch/alpha/kernel/time.c | 2 ++ arch/arm/kernel/time.c | 2 ++ arch/cris/kernel/time.c | 2 ++ arch/i386/kernel/time.c | 1 + arch/ia64/kernel/time.c | 2 ++ arch/m68k/kernel/time.c | 1 + arch/mips/kernel/time.c | 2 ++ arch/mips64/kernel/syscall.c | 2 ++ arch/parisc/kernel/time.c | 2 ++ arch/ppc/kernel/time.c | 3 +++ arch/ppc64/kernel/time.c | 2 ++ arch/s390/kernel/time.c | 2 ++ arch/s390x/kernel/time.c | 2 ++ arch/sh/kernel/time.c | 2 ++ arch/sparc/kernel/time.c | 2 ++ arch/sparc64/kernel/time.c | 2 ++ kernel/timer.c | 4 ++-- 17 files changed, 33 insertions(+), 2 deletions(-) diff --git a/arch/alpha/kernel/time.c b/arch/alpha/kernel/time.c index 0be250e543e8..93a569828d70 100644 --- a/arch/alpha/kernel/time.c +++ b/arch/alpha/kernel/time.c @@ -48,6 +48,8 @@ #include "proto.h" #include "irq_impl.h" +u64 jiffies_64; + extern rwlock_t xtime_lock; extern unsigned long wall_jiffies; /* kernel/timer.c */ diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c index 7c7e03c5b6e9..cd00aacc74a9 100644 --- a/arch/arm/kernel/time.c +++ b/arch/arm/kernel/time.c @@ -32,6 +32,8 @@ #include #include +u64 jiffies_64; + extern rwlock_t xtime_lock; extern unsigned long wall_jiffies; diff --git a/arch/cris/kernel/time.c b/arch/cris/kernel/time.c index 537040f95a6d..1ee0bbfeab7e 100644 --- a/arch/cris/kernel/time.c +++ b/arch/cris/kernel/time.c @@ -44,6 +44,8 @@ #include +u64 jiffies_64; + static int have_rtc; /* used to remember if we have an RTC or not */ /* define this if you need to use print_timestamp */ diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c index 1e1eb0d3a5f7..f56251513581 100644 --- a/arch/i386/kernel/time.c +++ b/arch/i386/kernel/time.c @@ -65,6 +65,7 @@ */ #include +u64 jiffies_64; unsigned long cpu_khz; /* Detected as we calibrate the TSC */ diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index dc6500b7a167..1c348cce1fdd 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -27,6 +27,8 @@ extern rwlock_t xtime_lock; extern unsigned long wall_jiffies; extern unsigned long last_time_offset; +u64 jiffies_64; + #ifdef CONFIG_IA64_DEBUG_IRQ unsigned long last_cli_ip; diff --git a/arch/m68k/kernel/time.c b/arch/m68k/kernel/time.c index a845040b339a..54b8f68cf7e0 100644 --- a/arch/m68k/kernel/time.c +++ b/arch/m68k/kernel/time.c @@ -24,6 +24,7 @@ #include +u64 jiffies_64; static inline int set_rtc_mmss(unsigned long nowtime) { diff --git a/arch/mips/kernel/time.c b/arch/mips/kernel/time.c index e548314773de..6ea186b42155 100644 --- a/arch/mips/kernel/time.c +++ b/arch/mips/kernel/time.c @@ -32,6 +32,8 @@ #define USECS_PER_JIFFY (1000000/HZ) #define USECS_PER_JIFFY_FRAC ((1000000ULL << 32) / HZ & 0xffffffff) +u64 jiffies_64; + /* * forward reference */ diff --git a/arch/mips64/kernel/syscall.c b/arch/mips64/kernel/syscall.c index 6daab491059b..053051c63a25 100644 --- a/arch/mips64/kernel/syscall.c +++ b/arch/mips64/kernel/syscall.c @@ -32,6 +32,8 @@ #include #include +u64 jiffies_64; + extern asmlinkage void syscall_trace(void); asmlinkage int sys_pipe(abi64_no_regargs, struct pt_regs regs) diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c index 7b3de0e0ada3..e028e6f3dbe2 100644 --- a/arch/parisc/kernel/time.c +++ b/arch/parisc/kernel/time.c @@ -30,6 +30,8 @@ #include +u64 jiffies_64; + extern rwlock_t xtime_lock; static int timer_value; diff --git a/arch/ppc/kernel/time.c b/arch/ppc/kernel/time.c index 260345226022..88a4d63ffea0 100644 --- a/arch/ppc/kernel/time.c +++ b/arch/ppc/kernel/time.c @@ -70,6 +70,9 @@ #include +/* XXX 
false sharing with below? */ +u64 jiffies_64; + unsigned long disarm_decr[NR_CPUS]; extern int do_sys_settimeofday(struct timeval *tv, struct timezone *tz); diff --git a/arch/ppc64/kernel/time.c index d00224a05633..9cd390d65342 100644 --- a/arch/ppc64/kernel/time.c +++ b/arch/ppc64/kernel/time.c @@ -64,6 +64,8 @@ void smp_local_timer_interrupt(struct pt_regs *); +u64 jiffies_64; + /* keep track of when we need to update the rtc */ time_t last_rtc_update; extern rwlock_t xtime_lock; diff --git a/arch/s390/kernel/time.c index 2a135d999830..f09059ee63bd 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -39,6 +39,8 @@ #define TICK_SIZE tick +u64 jiffies_64; + static ext_int_info_t ext_int_info_timer; static uint64_t init_timer_cc; diff --git a/arch/s390x/kernel/time.c index e12e41e2eaef..b81dcb9683d7 100644 --- a/arch/s390x/kernel/time.c +++ b/arch/s390x/kernel/time.c @@ -39,6 +39,8 @@ #define TICK_SIZE tick +u64 jiffies_64; + static ext_int_info_t ext_int_info_timer; static uint64_t init_timer_cc; diff --git a/arch/sh/kernel/time.c index 62af96d4fd48..e51e0eb001d6 100644 --- a/arch/sh/kernel/time.c +++ b/arch/sh/kernel/time.c @@ -70,6 +70,8 @@ #endif /* CONFIG_CPU_SUBTYPE_ST40STB1 */ #endif /* __sh3__ or __SH4__ */ +u64 jiffies_64; + extern rwlock_t xtime_lock; extern unsigned long wall_jiffies; #define TICK_SIZE tick diff --git a/arch/sparc/kernel/time.c index 6e7935ab7c56..90d3e8528358 100644 --- a/arch/sparc/kernel/time.c +++ b/arch/sparc/kernel/time.c @@ -43,6 +43,8 @@ extern rwlock_t xtime_lock; +u64 jiffies_64; + enum sparc_clock_type sp_clock_typ; spinlock_t mostek_lock = SPIN_LOCK_UNLOCKED; unsigned long mstk48t02_regs = 0UL; diff --git a/arch/sparc64/kernel/time.c index 852c96d62319..47c794e99f4b 100644 --- a/arch/sparc64/kernel/time.c +++ b/arch/sparc64/kernel/time.c @@ -44,6 +44,8 @@ unsigned long mstk48t02_regs = 0UL; unsigned long ds1287_regs = 0UL; #endif +u64 jiffies_64; + static unsigned long mstk48t08_regs = 0UL; static unsigned long mstk48t59_regs = 0UL; diff --git a/kernel/timer.c index 0b7efa84970b..c6d6d12e04d4 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -69,11 +69,11 @@ unsigned long event; extern int do_setitimer(int, struct itimerval *, struct itimerval *); /* - * The 64-bit value is not volatile - you MUST NOT read it + * The 64-bit jiffies value is not atomic - you MUST NOT read it * without holding read_lock_irq(&xtime_lock). * jiffies is defined in the linker script... */ -u64 jiffies_64; + unsigned int * prof_buffer; unsigned long prof_len; -- cgit v1.2.3 From 30724dcd73d7d0baf54250e5e80c97b8f49e210e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 17 Jun 2002 20:27:43 -0700 Subject: [PATCH] poll/select fast path This patch streamlines poll and select by adding fast paths for a small number of descriptors passed. The majority of polls/selects seem to be of this nature. The main saving comes from not allocating two pages for wait queue and table, but from using stack allocation (up to 256 bytes) when only a few descriptors are needed. This makes it as fast as 2.0 again, and even a bit faster, because the wait queue page allocation is avoided too (except when the drivers overflow it). Select also skips a lot faster over big holes and avoids the separate pass of determining the maximum number of descriptors in the bitmap.
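A minimal sketch of the stack fast path, reusing the FAST_POLL_MAX limit from the diff below; the actual fast_poll() in fs/select.c is more involved, this only shows the allocation idea:

	/* Small nfds: keep the pollfd array on the stack, no page
	 * allocations; anything bigger takes the old kmalloc path. */
	static long poll_fast_sketch(struct pollfd *ufds, unsigned int nfds)
	{
		struct pollfd fds[FAST_POLL_MAX];

		if (nfds >= FAST_POLL_MAX)
			return -EINVAL;		/* caller falls back to slow path */
		if (copy_from_user(fds, ufds, nfds * sizeof(struct pollfd)))
			return -EFAULT;
		/* ... poll each fd, sleep if none ready, copy revents back ... */
		return 0;
	}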
A typical Linux system saves a considerable amount of unswappable memory with this patch, because it usually has 10+ daemons hanging around in poll or select, each with two pages allocated for data and the wait queue. Some other cleanups. --- fs/select.c | 307 ++++++++++++++++++++++++++++----------------- include/linux/poll.h | 49 ++++---- 2 files changed, 187 insertions(+), 169 deletions(-) diff --git a/fs/select.c index 30c29f1e49f8..6a5909a75677 100644 --- a/fs/select.c +++ b/fs/select.c @@ -12,6 +12,9 @@ * 24 January 2000 * Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation * of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian). + * + * Dec 2001 + * Stack allocation and fast path (Andi Kleen) */ #include @@ -26,21 +29,6 @@ #define ROUND_UP(x,y) (((x)+(y)-1)/(y)) #define DEFAULT_POLLMASK (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM) -struct poll_table_entry { - struct file * filp; - wait_queue_t wait; - wait_queue_head_t * wait_address; -}; - -struct poll_table_page { - struct poll_table_page * next; - struct poll_table_entry * entry; - struct poll_table_entry entries[0]; -}; - -#define POLL_TABLE_FULL(table) \ - ((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table)) - /* * Ok, Peter made a complicated, but straightforward multiple_wait() function. * I have rewritten this, taking some shortcuts: This code may not be easy to @@ -62,30 +50,39 @@ void poll_freewait(poll_table* pt) struct poll_table_page *old; entry = p->entry; - do { + while (entry > p->entries) { entry--; remove_wait_queue(entry->wait_address,&entry->wait); fput(entry->filp); - } while (entry > p->entries); old = p; p = p->next; - free_page((unsigned long) old); + if (old != &pt->inline_page) + free_page((unsigned long) old); } } void __pollwait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p) { struct poll_table_page *table = p->table; - - if (!table || POLL_TABLE_FULL(table)) { - struct poll_table_page *new_table; - - new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL); - if (!new_table) { - p->error = -ENOMEM; - __set_current_state(TASK_RUNNING); - return; + struct poll_table_page *new_table = NULL; + int sz; + + if (!table) { + new_table = &p->inline_page; + } else { + sz = (table == &p->inline_page) ?
POLL_INLINE_TABLE_LEN : PAGE_SIZE; + if ((char*)table->entry >= (char*)table + sz) { + new_table = (struct poll_table_page *)__get_free_page(GFP_KERNEL); + if (!new_table) { + p->error = -ENOMEM; + __set_current_state(TASK_RUNNING); + return; + } } + } + + if (new_table) { new_table->entry = new_table->entries; new_table->next = table; p->table = new_table; @@ -113,48 +110,6 @@ void __pollwait(struct file * filp, wait_queue_head_t * wait_address, poll_table #define BITS(fds, n) (*__IN(fds, n)|*__OUT(fds, n)|*__EX(fds, n)) -static int max_select_fd(unsigned long n, fd_set_bits *fds) -{ - unsigned long *open_fds; - unsigned long set; - int max; - - /* handle last in-complete long-word first */ - set = ~(~0UL << (n & (__NFDBITS-1))); - n /= __NFDBITS; - open_fds = current->files->open_fds->fds_bits+n; - max = 0; - if (set) { - set &= BITS(fds, n); - if (set) { - if (!(set & ~*open_fds)) - goto get_max; - return -EBADF; - } - } - while (n) { - open_fds--; - n--; - set = BITS(fds, n); - if (!set) - continue; - if (set & ~*open_fds) - return -EBADF; - if (max) - continue; -get_max: - do { - max++; - set >>= 1; - } while (set); - max += n * __NFDBITS; - } - - return max; -} - -#define BIT(i) (1UL << ((i)&(__NFDBITS-1))) -#define MEM(i,m) ((m)+(unsigned)(i)/__NFDBITS) #define ISSET(i,m) (((i)&*(m)) != 0) #define SET(i,m) (*(m) |= (i)) @@ -165,56 +120,71 @@ get_max: int do_select(int n, fd_set_bits *fds, long *timeout) { poll_table table, *wait; - int retval, i, off; + int retval, off, max, maxoff; long __timeout = *timeout; - read_lock(¤t->files->file_lock); - retval = max_select_fd(n, fds); - read_unlock(¤t->files->file_lock); - - if (retval < 0) - return retval; - n = retval; - poll_initwait(&table); wait = &table; if (!__timeout) wait = NULL; + retval = 0; + maxoff = n/BITS_PER_LONG; + max = 0; for (;;) { set_current_state(TASK_INTERRUPTIBLE); - for (i = 0 ; i < n; i++) { - unsigned long bit = BIT(i); - unsigned long mask; - struct file *file; + for (off = 0; off <= maxoff; off++) { + unsigned long val = BITS(fds, off); - off = i / __NFDBITS; - if (!(bit & BITS(fds, off))) + if (!val) continue; - file = fget(i); - mask = POLLNVAL; - if (file) { - mask = DEFAULT_POLLMASK; - if (file->f_op && file->f_op->poll) - mask = file->f_op->poll(file, wait); - fput(file); - } - if ((mask & POLLIN_SET) && ISSET(bit, __IN(fds,off))) { - SET(bit, __RES_IN(fds,off)); - retval++; - wait = NULL; - } - if ((mask & POLLOUT_SET) && ISSET(bit, __OUT(fds,off))) { - SET(bit, __RES_OUT(fds,off)); - retval++; - wait = NULL; - } - if ((mask & POLLEX_SET) && ISSET(bit, __EX(fds,off))) { - SET(bit, __RES_EX(fds,off)); - retval++; - wait = NULL; + while (val) { + int k = ffz(~val); + unsigned long mask, bit; + struct file *file; + + if (k > n%BITS_PER_LONG) + break; + + bit = (1UL << k); + val &= ~bit; + + file = fget((off * BITS_PER_LONG) + k); + mask = POLLNVAL; + if (file) { + mask = DEFAULT_POLLMASK; + if (file->f_op && file->f_op->poll) + mask = file->f_op->poll(file, wait); + fput(file); + } else { + /* This error will shadow all other results. 
+ * This matches previous linux behaviour */ + retval = -EBADF; + goto out; + } + if ((mask & POLLIN_SET) && ISSET(bit, __IN(fds,off))) { + SET(bit, __RES_IN(fds,off)); + retval++; + wait = NULL; + } + if ((mask& POLLOUT_SET) && ISSET(bit,__OUT(fds,off))) { + SET(bit, __RES_OUT(fds,off)); + retval++; + wait = NULL; + } + if ((mask & POLLEX_SET) && ISSET(bit, __EX(fds,off))) { + SET(bit, __RES_EX(fds,off)); + retval++; + wait = NULL; + } + + if (!(val &= ~bit)) + break; } } + + + maxoff = max; wait = NULL; if (retval || !__timeout || signal_pending(current)) break; @@ -224,25 +194,43 @@ int do_select(int n, fd_set_bits *fds, long *timeout) } __timeout = schedule_timeout(__timeout); } + +out: current->state = TASK_RUNNING; poll_freewait(&table); /* - * Up-to-date the caller timeout. + * Update the caller timeout. */ *timeout = __timeout; return retval; } -static void *select_bits_alloc(int size) -{ - return kmalloc(6 * size, GFP_KERNEL); -} +/* + * We do a VERIFY_WRITE here even though we are only reading this time: + * we'll write to it eventually.. + */ -static void select_bits_free(void *bits, int size) +static int get_fd_set(unsigned long nr, void *ufdset, unsigned long *fdset) { - kfree(bits); + unsigned long rounded = FDS_BYTES(nr), mask; + if (ufdset) { + int error = verify_area(VERIFY_WRITE, ufdset, rounded); + if (!error && __copy_from_user(fdset, ufdset, rounded)) + error = -EFAULT; + if (nr % __NFDBITS == 0) + mask = 0; + else { + /* This includes one bit too much according to SU; + but without this some programs hang. */ + mask = ~(~0UL << (nr%__NFDBITS)); + } + fdset[nr/__NFDBITS] &= mask; + return error; + } + memset(fdset, 0, rounded); + return 0; } /* @@ -263,6 +251,7 @@ sys_select(int n, fd_set *inp, fd_set *outp, fd_set *exp, struct timeval *tvp) char *bits; long timeout; int ret, size, max_fdset; + char stack_bits[FDS_BYTES(FAST_SELECT_MAX) * 6]; timeout = MAX_SCHEDULE_TIMEOUT; if (tvp) { @@ -297,11 +286,16 @@ sys_select(int n, fd_set *inp, fd_set *outp, fd_set *exp, struct timeval *tvp) * since we used fdset we need to allocate memory in units of * long-words. 
*/ - ret = -ENOMEM; size = FDS_BYTES(n); - bits = select_bits_alloc(size); - if (!bits) - goto out_nofds; + if (n < FAST_SELECT_MAX) { + bits = stack_bits; + } else { + ret = -ENOMEM; + bits = kmalloc(6*size, GFP_KERNEL); + if (!bits) + goto out_nofds; + } + fds.in = (unsigned long *) bits; fds.out = (unsigned long *) (bits + size); fds.ex = (unsigned long *) (bits + 2*size); @@ -313,9 +307,7 @@ sys_select(int n, fd_set *inp, fd_set *outp, fd_set *exp, struct timeval *tvp) (ret = get_fd_set(n, outp, fds.out)) || (ret = get_fd_set(n, exp, fds.ex))) goto out; - zero_fd_set(n, fds.res_in); - zero_fd_set(n, fds.res_out); - zero_fd_set(n, fds.res_ex); + memset(fds.res_in, 0, 3*size); ret = do_select(n, &fds, &timeout); @@ -326,8 +318,8 @@ sys_select(int n, fd_set *inp, fd_set *outp, fd_set *exp, struct timeval *tvp) usec = timeout % HZ; usec *= (1000000/HZ); } - put_user(sec, &tvp->tv_sec); - put_user(usec, &tvp->tv_usec); + __put_user(sec, &tvp->tv_sec); + __put_user(usec, &tvp->tv_usec); } if (ret < 0) @@ -344,8 +336,10 @@ sys_select(int n, fd_set *inp, fd_set *outp, fd_set *exp, struct timeval *tvp) set_fd_set(n, exp, fds.res_ex); out: - select_bits_free(bits, size); + if (n >= FAST_SELECT_MAX) + kfree(bits); out_nofds: + return ret; } @@ -410,12 +404,42 @@ static int do_poll(unsigned int nfds, unsigned int nchunks, unsigned int nleft, return count; } +static int fast_poll(poll_table *table, poll_table *wait, struct pollfd *ufds, + unsigned int nfds, long timeout) +{ + poll_table *pt = wait; + struct pollfd fds[FAST_POLL_MAX]; + int count, i; + + if (copy_from_user(fds, ufds, nfds * sizeof(struct pollfd))) + return -EFAULT; + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + count = 0; + do_pollfd(nfds, fds, &pt, &count); + pt = NULL; + if (count || !timeout || signal_pending(current)) + break; + count = wait->error; + if (count) + break; + timeout = schedule_timeout(timeout); + } + current->state = TASK_RUNNING; + for (i = 0; i < nfds; i++) + __put_user(fds[i].revents, &ufds[i].revents); + poll_freewait(table); + if (!count && signal_pending(current)) + return -EINTR; + return count; +} + asmlinkage long sys_poll(struct pollfd * ufds, unsigned int nfds, long timeout) { - int i, j, fdcount, err; + int i, j, err, fdcount; struct pollfd **fds; poll_table table, *wait; - int nchunks, nleft; + int nchunks, nleft; /* Do a sanity check on nfds ... 
*/ if (nfds > NR_OPEN) @@ -429,43 +453,45 @@ asmlinkage long sys_poll(struct pollfd * ufds, unsigned int nfds, long timeout) timeout = MAX_SCHEDULE_TIMEOUT; } + poll_initwait(&table); wait = &table; if (!timeout) wait = NULL; - err = -ENOMEM; - fds = NULL; - if (nfds != 0) { - fds = (struct pollfd **)kmalloc( - (1 + (nfds - 1) / POLLFD_PER_PAGE) * sizeof(struct pollfd *), - GFP_KERNEL); - if (fds == NULL) - goto out; - } + if (nfds < FAST_POLL_MAX) + return fast_poll(&table, wait, ufds, nfds, timeout); + err = -ENOMEM; + fds = (struct pollfd **)kmalloc( + (1 + (nfds - 1) / POLLFD_PER_PAGE) * sizeof(struct pollfd *), + GFP_KERNEL); + if (fds == NULL) + goto out; + nchunks = 0; nleft = nfds; - while (nleft > POLLFD_PER_PAGE) { /* allocate complete PAGE_SIZE chunks */ + while (nleft > POLLFD_PER_PAGE) { fds[nchunks] = (struct pollfd *)__get_free_page(GFP_KERNEL); if (fds[nchunks] == NULL) goto out_fds; nchunks++; nleft -= POLLFD_PER_PAGE; } - if (nleft) { /* allocate last PAGE_SIZE chunk, only nleft elements used */ + if (nleft) { fds[nchunks] = (struct pollfd *)__get_free_page(GFP_KERNEL); if (fds[nchunks] == NULL) goto out_fds; - } - + } + err = -EFAULT; for (i=0; i < nchunks; i++) if (copy_from_user(fds[i], ufds + i*POLLFD_PER_PAGE, PAGE_SIZE)) goto out_fds1; + if (nleft) { if (copy_from_user(fds[nchunks], ufds + nchunks*POLLFD_PER_PAGE, - nleft * sizeof(struct pollfd))) + nleft * sizeof(struct pollfd))) goto out_fds1; } @@ -489,8 +515,7 @@ out_fds1: out_fds: for (i=0; i < nchunks; i++) free_page((unsigned long)(fds[i])); - if (nfds != 0) - kfree(fds); + kfree(fds); out: poll_freewait(&table); return err; diff --git a/include/linux/poll.h b/include/linux/poll.h index 796aac51388a..86b1ee2d3eb3 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -10,13 +10,32 @@ #include #include -struct poll_table_page; +#define POLL_INLINE_BYTES 256 +#define FAST_SELECT_MAX 128 +#define FAST_POLL_MAX 128 +#define POLL_INLINE_ENTRIES (1+(POLL_INLINE_BYTES / sizeof(struct poll_table_entry))) + +struct poll_table_entry { + struct file * filp; + wait_queue_t wait; + wait_queue_head_t * wait_address; +}; + +struct poll_table_page { + struct poll_table_page * next; + struct poll_table_entry * entry; + struct poll_table_entry entries[0]; +}; typedef struct poll_table_struct { int error; struct poll_table_page * table; + struct poll_table_page inline_page; + struct poll_table_entry inline_table[POLL_INLINE_ENTRIES]; } poll_table; +#define POLL_INLINE_TABLE_LEN (sizeof(poll_table) - offsetof(poll_table, inline_page)) + extern void __pollwait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p); static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p) @@ -30,6 +49,7 @@ static inline void poll_initwait(poll_table* pt) pt->error = 0; pt->table = NULL; } + extern void poll_freewait(poll_table* pt); @@ -49,27 +69,6 @@ typedef struct { #define FDS_LONGS(nr) (((nr)+FDS_BITPERLONG-1)/FDS_BITPERLONG) #define FDS_BYTES(nr) (FDS_LONGS(nr)*sizeof(long)) -/* - * We do a VERIFY_WRITE here even though we are only reading this time: - * we'll write to it eventually.. - * - * Use "unsigned long" accesses to let user-mode fd_set's be long-aligned. 
- */ -static inline -int get_fd_set(unsigned long nr, void *ufdset, unsigned long *fdset) -{ - nr = FDS_BYTES(nr); - if (ufdset) { - int error; - error = verify_area(VERIFY_WRITE, ufdset, nr); - if (!error && __copy_from_user(fdset, ufdset, nr)) - error = -EFAULT; - return error; - } - memset(fdset, 0, nr); - return 0; -} - static inline void set_fd_set(unsigned long nr, void *ufdset, unsigned long *fdset) { @@ -77,12 +76,6 @@ void set_fd_set(unsigned long nr, void *ufdset, unsigned long *fdset) __copy_to_user(ufdset, fdset, FDS_BYTES(nr)); } -static inline -void zero_fd_set(unsigned long nr, unsigned long *fdset) -{ - memset(fdset, 0, FDS_BYTES(nr)); -} - extern int do_select(int n, fd_set_bits *fds, long *timeout); #endif /* KERNEL */ -- cgit v1.2.3 From 4f9d90c4a8511edfe3faa8dbc27e3d3c933a720d Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 17 Jun 2002 20:37:27 -0700 Subject: [PATCH] remove tqueue.h from sched.h This is actually part of the work I've been doing to remove BHs, but it stands by itself. --- drivers/char/random.c | 1 + drivers/pcmcia/pci_socket.c | 1 + drivers/pcmcia/yenta.c | 1 + include/linux/sched.h | 2 -- include/linux/tqueue.h | 3 +++ kernel/context.c | 1 + kernel/kmod.c | 1 + kernel/sys.c | 1 + kernel/timer.c | 1 + 9 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index db20dec287d0..9db52acb9ef2 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -252,6 +252,7 @@ #include #include #include +#include #include #include diff --git a/drivers/pcmcia/pci_socket.c b/drivers/pcmcia/pci_socket.c index d30df9b4203a..5a4b78312391 100644 --- a/drivers/pcmcia/pci_socket.c +++ b/drivers/pcmcia/pci_socket.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include diff --git a/drivers/pcmcia/yenta.c b/drivers/pcmcia/yenta.c index e5453fb455e2..40b20b945488 100644 --- a/drivers/pcmcia/yenta.c +++ b/drivers/pcmcia/yenta.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include diff --git a/include/linux/sched.h b/include/linux/sched.h index 3b43d3bb1123..9e7d80851c32 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -7,7 +7,6 @@ extern unsigned long event; #include #include -#include #include #include #include @@ -160,7 +159,6 @@ extern unsigned long cache_decay_ticks; extern signed long FASTCALL(schedule_timeout(signed long timeout)); asmlinkage void schedule(void); -extern int schedule_task(struct tq_struct *task); extern void flush_scheduled_tasks(void); extern int start_context_thread(void); extern int current_is_keventd(void); diff --git a/include/linux/tqueue.h b/include/linux/tqueue.h index 3d3047027229..d4729c518f22 100644 --- a/include/linux/tqueue.h +++ b/include/linux/tqueue.h @@ -110,6 +110,9 @@ static inline int queue_task(struct tq_struct *bh_pointer, task_queue *bh_list) return ret; } +/* Schedule a tq to run in process context */ +extern int schedule_task(struct tq_struct *task); + /* * Call all "bottom halfs" on a given list. 
*/ diff --git a/kernel/context.c index 56bada438f61..c49f914430e0 100644 --- a/kernel/context.c +++ b/kernel/context.c @@ -20,6 +20,7 @@ #include #include #include +#include <linux/tqueue.h> static DECLARE_TASK_QUEUE(tq_context); static DECLARE_WAIT_QUEUE_HEAD(context_task_wq); diff --git a/kernel/kmod.c index a9f0ddb521cc..05388d9557fa 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -28,6 +28,7 @@ #include #include #include +#include <linux/tqueue.h> #include diff --git a/kernel/sys.c index 3bd38f344817..2ba72b6c87d4 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -16,6 +16,7 @@ #include #include #include +#include <linux/tqueue.h> #include #include diff --git a/kernel/timer.c index c6d6d12e04d4..858954c871e1 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -22,6 +22,7 @@ #include #include #include +#include <linux/tqueue.h> #include #include -- cgit v1.2.3 From 94173f6818a6a2816f4104f698f56d304a397405 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Mon, 17 Jun 2002 20:37:55 -0700 Subject: [PATCH] Remove sync_timers Nobody's using it any more, kill: --- include/linux/timer.h | 2 -- kernel/timer.c | 5 ----- 2 files changed, 7 deletions(-) diff --git a/include/linux/timer.h index d6f0ce5f8740..6e1e61a4c07b 100644 --- a/include/linux/timer.h +++ b/include/linux/timer.h @@ -25,10 +25,8 @@ extern int del_timer(struct timer_list * timer); #ifdef CONFIG_SMP extern int del_timer_sync(struct timer_list * timer); -extern void sync_timers(void); #else #define del_timer_sync(t) del_timer(t) -#define sync_timers() do { } while (0) #endif /* diff --git a/kernel/timer.c index ab864eca2645..858954c871e1 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -232,11 +232,6 @@ int del_timer(struct timer_list * timer) } #ifdef CONFIG_SMP -void sync_timers(void) -{ - spin_unlock_wait(&global_bh_lock); -} - /* * SMP specific function to delete periodic timer. * Caller must disable by some means restarting the timer -- cgit v1.2.3 From f179b6ce3a03a6ee60c821766691488a6f461c52 Mon Sep 17 00:00:00 2001 From: Kai Mäkisara Date: Mon, 17 Jun 2002 20:44:18 -0700 Subject: [PATCH] 2.5.22 SCSI tape buffering changes This contains the following changes to the SCSI tape driver: - one buffer is used for each tape (no buffer pool) - buffers allocated when needed and freed when device closed - common code from read and write moved to a function - default maximum number of scatter/gather segments increased to 64 - tape status set to "no tape" after successful unload --- drivers/scsi/README.st | 46 ++--- drivers/scsi/st.c | 429 +++++++++++++++------------------------------- drivers/scsi/st_options.h | 15 +- 3 files changed, 152 insertions(+), 338 deletions(-) diff --git a/drivers/scsi/README.st b/drivers/scsi/README.st index e06a21597910..702a5b178b61 100644 --- a/drivers/scsi/README.st +++ b/drivers/scsi/README.st @@ -2,7 +2,7 @@ This file contains brief information about the SCSI tape driver. The driver is currently maintained by Kai Mäkisara (email Kai.Makisara@metla.fi) -Last modified: Tue Jan 22 21:08:57 2002 by makisara +Last modified: Tue Jun 18 18:13:50 2002 by makisara BASICS @@ -105,15 +105,19 @@ The default is BSD semantics. BUFFERING -The driver uses tape buffers allocated either at system initialization -or at run-time when needed. One buffer is used for each open tape -device. The size of the buffers is selectable at compile and/or boot -time. The buffers are used to store the data being transferred to/from -the SCSI adapter.
The following buffering options are selectable at -compile time and/or at run time (via ioctl): +The driver uses tape buffers allocated at run-time when needed; each +buffer is freed when the device file is closed. One buffer is used for each +open tape device. + +The size of the buffers is always at least one tape block. In fixed +block mode, the minimum buffer size is defined (in 1024 byte units) by +ST_FIXED_BUFFER_BLOCKS. With a small block size this allows buffering of +several blocks and using one SCSI read or write to transfer all of the +blocks. Buffering of data across write calls in fixed block mode is +allowed if ST_BUFFER_WRITES is non-zero. Buffer allocation uses chunks of +memory having sizes 2^n * (page size). Because of this the actual +buffer size may be larger than the minimum allowable buffer size. -Buffering of data across write calls in fixed block mode (define -ST_BUFFER_WRITES). Asynchronous writing. Writing the buffer contents to the tape is started and the write call returns immediately. The status is checked @@ -128,30 +132,6 @@ attempted even if the user does not want to get all of the data at this read command. Should be disabled for those drives that don't like a filemark to truncate a read request or that don't like backspacing. -The buffer size is defined (in 1024 byte units) by ST_BUFFER_BLOCKS or -at boot time. If this size is not large enough, the driver tries to -temporarily enlarge the buffer. Buffer allocation uses chunks of -memory having sizes 2^n * (page size). Because of this the actual -buffer size may be larger than the buffer size specified with -ST_BUFFER_BLOCKS. - -A small number of buffers are allocated at driver initialisation. The -maximum number of these buffers is defined by ST_MAX_BUFFERS. The -maximum can be changed with kernel or module startup options. One -buffer is allocated for each drive detected when the driver is -initialized up to the maximum. - -The driver tries to allocate new buffers at run-time if -necessary. These buffers are freed after use. If the maximum number of -initial buffers is set to zero, all buffer allocation is done at -run-time. The advantage of run-time allocation is that memory is not -wasted for buffers not being used. The disadvantage is that there may -not be memory available at the time when a buffer is needed for the -first time (once a buffer is allocated, it is not released). This risk -should not be big if the tape drive is connected to a PCI adapter that -supports scatter/gather (the allocation is not limited to "DMA memory" -and the buffer can be composed of several fragments). - The threshold for triggering asynchronous write in fixed block mode is defined by ST_WRITE_THRESHOLD. This may be optimized for each use pattern.
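A sketch of the 2^n page-chunk sizing described above; the helper name is illustrative, not the driver's actual code:

	/* Smallest power-of-two number of pages covering 'need' bytes;
	 * this rounding is why the buffer can exceed the minimum size. */
	static int st_chunk_order(size_t need)
	{
		int order = 0;

		while ((PAGE_SIZE << order) < need)
			order++;
		return order;	/* for __get_free_pages(GFP_KERNEL, order) */
	}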
The default triggers asynchronous write after three diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index f48ac845bc08..7342c3e661f3 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -12,13 +12,13 @@ Copyright 1992 - 2002 Kai Makisara email Kai.Makisara@metla.fi - Last modified: Tue Feb 5 21:25:55 2002 by makisara + Last modified: Sat Jun 15 13:01:56 2002 by makisara Some small formal changes - aeb, 950809 Last modified: 18-JAN-1998 Richard Gooch Devfs support */ -static char *verstr = "20020205"; +static char *verstr = "20020615"; #include @@ -69,7 +69,6 @@ static char *verstr = "20020205"; static int buffer_kbs; static int write_threshold_kbs; -static int max_buffers = (-1); static int max_sg_segs; MODULE_AUTHOR("Kai Makisara"); @@ -80,8 +79,6 @@ MODULE_PARM(buffer_kbs, "i"); MODULE_PARM_DESC(buffer_kbs, "Default driver buffer size (KB; 32)"); MODULE_PARM(write_threshold_kbs, "i"); MODULE_PARM_DESC(write_threshold_kbs, "Asynchronous write threshold (KB; 30)"); -MODULE_PARM(max_buffers, "i"); -MODULE_PARM_DESC(max_buffers, "Maximum number of buffer allocated at initialisation (4)"); MODULE_PARM(max_sg_segs, "i"); MODULE_PARM_DESC(max_sg_segs, "Maximum number of scatter/gather segments to use (32)"); @@ -96,9 +93,6 @@ static struct st_dev_parm { { "write_threshold_kbs", &write_threshold_kbs }, - { - "max_buffers", &max_buffers - }, { "max_sg_segs", &max_sg_segs } @@ -108,12 +102,12 @@ static struct st_dev_parm { /* The default definitions have been moved to st_options.h */ -#define ST_BUFFER_SIZE (ST_BUFFER_BLOCKS * ST_KILOBYTE) +#define ST_FIXED_BUFFER_SIZE (ST_FIXED_BUFFER_BLOCKS * ST_KILOBYTE) #define ST_WRITE_THRESHOLD (ST_WRITE_THRESHOLD_BLOCKS * ST_KILOBYTE) /* The buffer size should fit into the 24 bits for length in the 6-byte SCSI read and write commands. */ -#if ST_BUFFER_SIZE >= (2 << 24 - 1) +#if ST_FIXED_BUFFER_SIZE >= (2 << 24 - 1) #error "Buffer size should not exceed (2 << 24 - 1) bytes!" #endif @@ -121,7 +115,7 @@ DEB( static int debugging = DEBUG; ) #define MAX_RETRIES 0 #define MAX_WRITE_RETRIES 0 -#define MAX_READY_RETRIES 5 +#define MAX_READY_RETRIES 0 #define NO_TAPE NOT_READY #define ST_TIMEOUT (900 * HZ) @@ -137,18 +131,15 @@ DEB( static int debugging = DEBUG; ) #define ST_DEV_ARR_LUMP 6 static rwlock_t st_dev_arr_lock = RW_LOCK_UNLOCKED; -static int st_nbr_buffers; -static ST_buffer **st_buffers = NULL; -static int st_buffer_size = ST_BUFFER_SIZE; +static int st_fixed_buffer_size = ST_FIXED_BUFFER_SIZE; static int st_write_threshold = ST_WRITE_THRESHOLD; -static int st_max_buffers = ST_MAX_BUFFERS; static int st_max_sg_segs = ST_MAX_SG; static Scsi_Tape **scsi_tapes = NULL; static int modes_defined; -static ST_buffer *new_tape_buffer(int, int, int); +static ST_buffer *new_tape_buffer(int, int); static int enlarge_buffer(ST_buffer *, int, int); static void normalize_buffer(ST_buffer *); static int append_to_buffer(const char *, ST_buffer *, int); @@ -914,8 +905,7 @@ static int check_tape(Scsi_Tape *STp, struct file *filp) module count. 
*/ static int st_open(struct inode *inode, struct file *filp) { - int i, need_dma_buffer; - int retval = (-EIO); + int i, retval = (-EIO); Scsi_Tape *STp; ST_partstat *STps; int dev = TAPE_NR(inode->i_rdev); @@ -945,38 +935,15 @@ static int st_open(struct inode *inode, struct file *filp) goto err_out; } - /* Allocate a buffer for this user */ - need_dma_buffer = STp->restr_dma; - write_lock(&st_dev_arr_lock); - for (i = 0; i < st_nbr_buffers; i++) - if (!st_buffers[i]->in_use && - (!need_dma_buffer || st_buffers[i]->dma)) { - STp->buffer = st_buffers[i]; - (STp->buffer)->in_use = 1; - break; - } - write_unlock(&st_dev_arr_lock); - if (i >= st_nbr_buffers) { - STp->buffer = new_tape_buffer(FALSE, need_dma_buffer, TRUE); - if (STp->buffer == NULL) { - printk(KERN_WARNING "st%d: Can't allocate tape buffer.\n", dev); - retval = (-EBUSY); - goto err_out; - } + /* See that we have at least a one page buffer available */ + if (!enlarge_buffer(STp->buffer, PAGE_SIZE, STp->restr_dma)) { + printk(KERN_WARNING "st%d: Can't allocate tape buffer.\n", dev); + retval = (-EOVERFLOW); + goto err_out; } (STp->buffer)->writing = 0; (STp->buffer)->syscall_result = 0; - (STp->buffer)->use_sg = STp->device->host->sg_tablesize; - - /* Compute the usable buffer size for this SCSI adapter */ - if (!(STp->buffer)->use_sg) - (STp->buffer)->buffer_size = (STp->buffer)->sg[0].length; - else { - for (i = 0, (STp->buffer)->buffer_size = 0; i < (STp->buffer)->use_sg && - i < (STp->buffer)->sg_segs; i++) - (STp->buffer)->buffer_size += (STp->buffer)->sg[i].length; - } STp->write_prot = ((filp->f_flags & O_ACCMODE) == O_RDONLY); @@ -999,10 +966,7 @@ static int st_open(struct inode *inode, struct file *filp) return 0; err_out: - if (STp->buffer != NULL) { - (STp->buffer)->in_use = 0; - STp->buffer = NULL; - } + normalize_buffer(STp->buffer); STp->in_use = 0; STp->device->access_count--; if (STp->device->host->hostt->module) @@ -1149,16 +1113,8 @@ static int st_release(struct inode *inode, struct file *filp) if (STp->door_locked == ST_LOCKED_AUTO) st_int_ioctl(STp, MTUNLOCK, 0); - if (STp->buffer != NULL) { - normalize_buffer(STp->buffer); - write_lock(&st_dev_arr_lock); - (STp->buffer)->in_use = 0; - STp->buffer = NULL; - } - else { - write_lock(&st_dev_arr_lock); - } - + normalize_buffer(STp->buffer); + write_lock(&st_dev_arr_lock); STp->in_use = 0; write_unlock(&st_dev_arr_lock); STp->device->access_count--; @@ -1168,31 +1124,11 @@ static int st_release(struct inode *inode, struct file *filp) return result; } - -/* Write command */ -static ssize_t - st_write(struct file *filp, const char *buf, size_t count, loff_t * ppos) +/* The checks common to both reading and writing */ +static ssize_t rw_checks(Scsi_Tape *STp, struct file *filp, size_t count, loff_t *ppos) { - struct inode *inode = filp->f_dentry->d_inode; - ssize_t total; - ssize_t i, do_count, blks, transfer; + int bufsize; ssize_t retval = 0; - int write_threshold; - int doing_write = 0; - unsigned char cmd[MAX_COMMAND_SIZE]; - const char *b_point; - Scsi_Request *SRpnt = NULL; - Scsi_Tape *STp; - ST_mode *STm; - ST_partstat *STps; - int dev = TAPE_NR(inode->i_rdev); - - read_lock(&st_dev_arr_lock); - STp = scsi_tapes[dev]; - read_unlock(&st_dev_arr_lock); - - if (down_interruptible(&STp->lock)) - return -ERESTARTSYS; /* * If we are in the middle of error recovery, don't let anyone @@ -1219,13 +1155,11 @@ static ssize_t goto out; } - STm = &(STp->modes[STp->current_mode]); - if (!STm->defined) { + if (! 
STp->modes[STp->current_mode].defined) { retval = (-ENXIO); goto out; } - if (count == 0) - goto out; + /* * If there was a bus reset, block further access @@ -1236,30 +1170,20 @@ static ssize_t goto out; } + if (count == 0) + goto out; + DEB( if (!STp->in_use) { + int dev = TAPE_NR(filp->f_dentry->d_inode->i_rdev); printk(ST_DEB_MSG "st%d: Incorrect device.\n", dev); retval = (-EIO); goto out; } ) /* end DEB */ - /* Write must be integral number of blocks */ - if (STp->block_size != 0 && (count % STp->block_size) != 0) { - printk(KERN_WARNING "st%d: Write not multiple of tape block size.\n", - dev); - retval = (-EINVAL); - goto out; - } - if (STp->can_partitions && (retval = update_partition(STp)) < 0) goto out; - STps = &(STp->ps[STp->partition]); - - if (STp->write_prot) { - retval = (-EACCES); - goto out; - } if (STp->block_size == 0) { if (STp->max_block > 0 && @@ -1273,19 +1197,73 @@ static ssize_t goto out; } } - if ((STp->buffer)->buffer_blocks < 1) { - /* Fixed block mode with too small buffer */ - if (!enlarge_buffer(STp->buffer, STp->block_size, STp->restr_dma)) { + else { + /* Fixed block mode with too small buffer? */ + bufsize = STp->block_size > st_fixed_buffer_size ? + STp->block_size : st_fixed_buffer_size; + if ((STp->buffer)->buffer_size < bufsize && + !enlarge_buffer(STp->buffer, bufsize, STp->restr_dma)) { retval = (-EOVERFLOW); goto out; } - (STp->buffer)->buffer_blocks = 1; + (STp->buffer)->buffer_blocks = bufsize / STp->block_size; } if (STp->do_auto_lock && STp->door_locked == ST_UNLOCKED && !st_int_ioctl(STp, MTLOCK, 0)) STp->door_locked = ST_LOCKED_AUTO; + out: + return retval; +} + + +/* Write command */ +static ssize_t + st_write(struct file *filp, const char *buf, size_t count, loff_t * ppos) +{ + struct inode *inode = filp->f_dentry->d_inode; + ssize_t total; + ssize_t i, do_count, blks, transfer; + ssize_t retval; + int write_threshold; + int doing_write = 0; + unsigned char cmd[MAX_COMMAND_SIZE]; + const char *b_point; + Scsi_Request *SRpnt = NULL; + Scsi_Tape *STp; + ST_mode *STm; + ST_partstat *STps; + int dev = TAPE_NR(inode->i_rdev); + + read_lock(&st_dev_arr_lock); + STp = scsi_tapes[dev]; + read_unlock(&st_dev_arr_lock); + + if (down_interruptible(&STp->lock)) + return -ERESTARTSYS; + + retval = rw_checks(STp, filp, count, ppos); + if (retval || count == 0) + goto out; + + /* Write must be integral number of blocks */ + if (STp->block_size != 0 && (count % STp->block_size) != 0) { + printk(KERN_WARNING "st%d: Write not multiple of tape block size.\n", + dev); + retval = (-EINVAL); + goto out; + } + + STm = &(STp->modes[STp->current_mode]); + STps = &(STp->ps[STp->partition]); + + if (STp->write_prot) { + retval = (-EACCES); + goto out; + } + + if (STps->rw == ST_READING) { retval = flush_buffer(STp, 0); if (retval) @@ -1718,77 +1696,17 @@ static ssize_t if (down_interruptible(&STp->lock)) return -ERESTARTSYS; - /* - * If we are in the middle of error recovery, don't let anyone - * else try and use this device. Also, if error recovery fails, it - * may try and take the device offline, in which case all further - * access to the device is prohibited. - */ - if (!scsi_block_when_processing_errors(STp->device)) { - retval = (-ENXIO); - goto out; - } - - if (ppos != &filp->f_pos) { - /* "A request was outside the capabilities of the device." 
*/ - retval = (-ENXIO); + retval = rw_checks(STp, filp, count, ppos); + if (retval || count == 0) goto out; - } - if (STp->ready != ST_READY) { - if (STp->ready == ST_NO_TAPE) - retval = (-ENOMEDIUM); - else - retval = (-EIO); - goto out; - } STm = &(STp->modes[STp->current_mode]); - if (!STm->defined) { - retval = (-ENXIO); - goto out; - } - DEB( - if (!STp->in_use) { - printk(ST_DEB_MSG "st%d: Incorrect device.\n", dev); - retval = (-EIO); - goto out; - } ) /* end DEB */ - - if (STp->can_partitions && - (retval = update_partition(STp)) < 0) - goto out; - - if (STp->block_size == 0) { - if (STp->max_block > 0 && - (count < STp->min_block || count > STp->max_block)) { - retval = (-EINVAL); - goto out; - } - if (count > (STp->buffer)->buffer_size && - !enlarge_buffer(STp->buffer, count, STp->restr_dma)) { - retval = (-EOVERFLOW); - goto out; - } - } - if ((STp->buffer)->buffer_blocks < 1) { - /* Fixed block mode with too small buffer */ - if (!enlarge_buffer(STp->buffer, STp->block_size, STp->restr_dma)) { - retval = (-EOVERFLOW); - goto out; - } - (STp->buffer)->buffer_blocks = 1; - } - if (!(STm->do_read_ahead) && STp->block_size != 0 && (count % STp->block_size) != 0) { retval = (-EINVAL); /* Read must be integral number of blocks */ goto out; } - if (STp->do_auto_lock && STp->door_locked == ST_UNLOCKED && - !st_int_ioctl(STp, MTLOCK, 0)) - STp->door_locked = ST_LOCKED_AUTO; - STps = &(STp->ps[STp->partition]); if (STps->rw == ST_WRITING) { retval = flush_buffer(STp, 0); @@ -1986,7 +1904,7 @@ static int st_set_options(Scsi_Tape *STp, long options) st_log_options(STp, STm, dev); } else if (code == MT_ST_WRITE_THRESHOLD) { value = (options & ~MT_ST_OPTIONS) * ST_KILOBYTE; - if (value < 1 || value > st_buffer_size) { + if (value < 1 || value > st_fixed_buffer_size) { printk(KERN_WARNING "st%d: Write threshold %d too small or too large.\n", dev, value); @@ -2289,8 +2207,10 @@ static int do_load_unload(Scsi_Tape *STp, struct file *filp, int load_code) if (!retval) { /* SCSI command successful */ - if (!load_code) + if (!load_code) { STp->rew_at_close = 0; + STp->ready = ST_NO_TAPE; + } else { STp->rew_at_close = STp->autorew_dev; retval = check_tape(STp, filp); @@ -2619,10 +2539,14 @@ static int st_int_ioctl(Scsi_Tape *STp, unsigned int cmd_in, unsigned long arg) ioctl_result = st_int_ioctl(STp, MTBSF, 1); if (cmd_in == MTSETBLK || cmd_in == SET_DENS_AND_BLK) { + int old_block_size = STp->block_size; STp->block_size = arg & MT_ST_BLKSIZE_MASK; - if (STp->block_size != 0) + if (STp->block_size != 0) { + if (old_block_size == 0) + normalize_buffer(STp->buffer); (STp->buffer)->buffer_blocks = (STp->buffer)->buffer_size / STp->block_size; + } (STp->buffer)->buffer_bytes = (STp->buffer)->read_pointer = 0; if (cmd_in == SET_DENS_AND_BLK) STp->density = arg >> MT_ST_DENSITY_SHIFT; @@ -3372,18 +3296,11 @@ static int st_ioctl(struct inode *inode, struct file *file, /* Try to allocate a new tape buffer. Calling function must not hold dev_arr_lock. 
*/ static ST_buffer * - new_tape_buffer(int from_initialization, int need_dma, int in_use) + new_tape_buffer(int from_initialization, int need_dma) { - int i, priority, b_size, order, got = 0, segs = 0; + int i, priority, got = 0, segs = 0; ST_buffer *tb; - read_lock(&st_dev_arr_lock); - if (st_nbr_buffers >= st_template.dev_max) { - read_unlock(&st_dev_arr_lock); - return NULL; /* Should never happen */ - } - read_unlock(&st_dev_arr_lock); - if (from_initialization) priority = GFP_ATOMIC; else @@ -3391,85 +3308,19 @@ static ST_buffer * i = sizeof(ST_buffer) + (st_max_sg_segs - 1) * sizeof(struct scatterlist); tb = kmalloc(i, priority); - if (tb) { - if (need_dma) - priority |= GFP_DMA; - - /* Try to allocate the first segment up to ST_FIRST_ORDER and the - others big enough to reach the goal */ - for (b_size = PAGE_SIZE, order=0; - b_size < st_buffer_size && order < ST_FIRST_ORDER; - order++, b_size *= 2) - ; - for ( ; b_size >= PAGE_SIZE; order--, b_size /= 2) { - tb->sg[0].page = alloc_pages(priority, order); - tb->sg[0].offset = 0; - if (tb->sg[0].page != NULL) { - tb->sg[0].length = b_size; - break; - } - } - if (tb->sg[segs].page == NULL) { - kfree(tb); - tb = NULL; - } else { /* Got something, continue */ - - for (b_size = PAGE_SIZE, order=0; - st_buffer_size > - tb->sg[0].length + (ST_FIRST_SG - 1) * b_size; - order++, b_size *= 2) - ; - for (segs = 1, got = tb->sg[0].length; - got < st_buffer_size && segs < ST_FIRST_SG;) { - tb->sg[segs].page = alloc_pages(priority, order); - tb->sg[segs].offset = 0; - if (tb->sg[segs].page == NULL) { - if (st_buffer_size - got <= - (ST_FIRST_SG - segs) * b_size / 2) { - b_size /= 2; /* Large enough for the - rest of the buffers */ - order--; - continue; - } - tb->sg_segs = segs; - tb->orig_sg_segs = 0; - DEB(tb->buffer_size = got); - normalize_buffer(tb); - kfree(tb); - tb = NULL; - break; - } - tb->sg[segs].length = b_size; - got += b_size; - segs++; - } - } - } - if (!tb) { - printk(KERN_NOTICE "st: Can't allocate new tape buffer (nbr %d).\n", - st_nbr_buffers); + printk(KERN_NOTICE "st: Can't allocate new tape buffer.\n"); return NULL; } tb->sg_segs = tb->orig_sg_segs = segs; - tb->b_data = page_address(tb->sg[0].page); + if (segs > 0) + tb->b_data = page_address(tb->sg[0].page); - DEBC(printk(ST_DEB_MSG - "st: Allocated tape buffer %d (%d bytes, %d segments, dma: %d, a: %p).\n", - st_nbr_buffers, got, tb->sg_segs, need_dma, tb->b_data); - printk(ST_DEB_MSG - "st: segment sizes: first %d, last %d bytes.\n", - tb->sg[0].length, tb->sg[segs - 1].length); - ) - tb->in_use = in_use; + tb->in_use = TRUE; tb->dma = need_dma; tb->buffer_size = got; tb->writing = 0; - write_lock(&st_dev_arr_lock); - st_buffers[st_nbr_buffers++] = tb; - write_unlock(&st_dev_arr_lock); - return tb; } @@ -3479,6 +3330,9 @@ static int enlarge_buffer(ST_buffer * STbuffer, int new_size, int need_dma) { int segs, nbr, max_segs, b_size, priority, order, got; + if (new_size <= STbuffer->buffer_size) + return TRUE; + normalize_buffer(STbuffer); max_segs = STbuffer->use_sg; @@ -3492,13 +3346,14 @@ static int enlarge_buffer(ST_buffer * STbuffer, int new_size, int need_dma) if (need_dma) priority |= GFP_DMA; for (b_size = PAGE_SIZE, order=0; - b_size * nbr < new_size - STbuffer->buffer_size; + b_size < new_size - STbuffer->buffer_size; order++, b_size *= 2) ; /* empty */ for (segs = STbuffer->sg_segs, got = STbuffer->buffer_size; segs < max_segs && got < new_size;) { STbuffer->sg[segs].page = alloc_pages(priority, order); + /* printk("st: allocated %x, order %d\n", 
STbuffer->sg[segs].page, order); */ STbuffer->sg[segs].offset = 0; if (STbuffer->sg[segs].page == NULL) { if (new_size - got <= (max_segs - segs) * b_size / 2) { @@ -3518,9 +3373,10 @@ static int enlarge_buffer(ST_buffer * STbuffer, int new_size, int need_dma) STbuffer->buffer_size = got; segs++; } + STbuffer->b_data = page_address(STbuffer->sg[0].page); DEBC(printk(ST_DEB_MSG - "st: Succeeded to enlarge buffer to %d bytes (segs %d->%d, %d).\n", - got, STbuffer->orig_sg_segs, STbuffer->sg_segs, b_size)); + "st: Succeeded to enlarge buffer at %p to %d bytes (segs %d->%d, %d).\n", + STbuffer, got, STbuffer->orig_sg_segs, STbuffer->sg_segs, b_size)); return TRUE; } @@ -3535,14 +3391,14 @@ static void normalize_buffer(ST_buffer * STbuffer) for (b_size=PAGE_SIZE, order=0; b_size < STbuffer->sg[i].length; order++, b_size *= 2) ; /* empty */ + /* printk("st: freeing %x, order %d\n", STbuffer->sg[i].page, order); */ __free_pages(STbuffer->sg[i].page, order); STbuffer->buffer_size -= STbuffer->sg[i].length; } DEB( if (debugging && STbuffer->orig_sg_segs < STbuffer->sg_segs) printk(ST_DEB_MSG "st: Buffer at %p normalized to %d bytes (segs %d).\n", - page_address(STbuffer->sg[0].page), STbuffer->buffer_size, - STbuffer->sg_segs); + STbuffer, STbuffer->buffer_size, STbuffer->sg_segs); ) /* end DEB */ STbuffer->sg_segs = STbuffer->orig_sg_segs; } @@ -3619,18 +3475,16 @@ static int from_buffer(ST_buffer * st_bp, char *ubp, int do_count) static void validate_options(void) { if (buffer_kbs > 0) - st_buffer_size = buffer_kbs * ST_KILOBYTE; + st_fixed_buffer_size = buffer_kbs * ST_KILOBYTE; if (write_threshold_kbs > 0) st_write_threshold = write_threshold_kbs * ST_KILOBYTE; else if (buffer_kbs > 0) - st_write_threshold = st_buffer_size - 2048; - if (st_write_threshold > st_buffer_size) { - st_write_threshold = st_buffer_size; + st_write_threshold = st_fixed_buffer_size - 2048; + if (st_write_threshold > st_fixed_buffer_size) { + st_write_threshold = st_fixed_buffer_size; printk(KERN_WARNING "st: write_threshold limited to %d bytes.\n", st_write_threshold); } - if (max_buffers >= 0) - st_max_buffers = max_buffers; if (max_sg_segs >= ST_FIRST_SG) st_max_sg_segs = max_sg_segs; } @@ -3694,7 +3548,8 @@ static int st_attach(Scsi_Device * SDp) Scsi_Tape *tpnt; ST_mode *STm; ST_partstat *STps; - int i, mode, target_nbr, dev_num; + ST_buffer *buffer; + int i, mode, dev_num; char *stp; if (SDp->type != TYPE_TAPE) @@ -3707,6 +3562,12 @@ static int st_attach(Scsi_Device * SDp) return 1; } + buffer = new_tape_buffer(TRUE, (SDp->host)->unchecked_isa_dma); + if (buffer == NULL) { + printk(KERN_ERR "st: Can't allocate new tape buffer. 
Device not attached.\n"); + return 1; + } + write_lock(&st_dev_arr_lock); if (st_template.nr_dev >= st_template.dev_max) { Scsi_Tape **tmp_da; @@ -3745,14 +3606,6 @@ static int st_attach(Scsi_Device * SDp) } scsi_tapes = tmp_da; - memset(tmp_ba, 0, tmp_dev_max * sizeof(ST_buffer *)); - if (st_buffers != NULL) { - memcpy(tmp_ba, st_buffers, - st_template.dev_max * sizeof(ST_buffer *)); - kfree(st_buffers); - } - st_buffers = tmp_ba; - st_template.dev_max = tmp_dev_max; } @@ -3799,6 +3652,9 @@ static int st_attach(Scsi_Device * SDp) else tpnt->tape_type = MT_ISSCSI2; + buffer->use_sg = tpnt->device->host->sg_tablesize; + tpnt->buffer = buffer; + tpnt->inited = 0; tpnt->devt = mk_kdev(SCSI_TAPE_MAJOR, i); tpnt->dirty = 0; @@ -3858,18 +3714,6 @@ static int st_attach(Scsi_Device * SDp) "Attached scsi tape st%d at scsi%d, channel %d, id %d, lun %d\n", dev_num, SDp->host->host_no, SDp->channel, SDp->id, SDp->lun); - /* See if we need to allocate more static buffers */ - target_nbr = st_template.nr_dev; - if (target_nbr > st_max_buffers) - target_nbr = st_max_buffers; - for (i=st_nbr_buffers; i < target_nbr; i++) - if (!new_tape_buffer(TRUE, TRUE, FALSE)) { - printk(KERN_INFO "st: Unable to allocate new static buffer.\n"); - break; - } - /* If the previous allocation fails, we will try again when the buffer is - really needed. */ - return 0; }; @@ -3897,6 +3741,11 @@ static void st_detach(Scsi_Device * SDp) devfs_unregister (tpnt->de_n[mode]); tpnt->de_n[mode] = NULL; } + if (tpnt->buffer) { + tpnt->buffer->orig_sg_segs = 0; + normalize_buffer(tpnt->buffer); + kfree(tpnt->buffer); + } kfree(tpnt); scsi_tapes[i] = 0; SDp->attached--; @@ -3916,10 +3765,10 @@ static int __init init_st(void) validate_options(); printk(KERN_INFO - "st: Version %s, bufsize %d, wrt %d, " - "max init. bufs %d, s/g segs %d\n", - verstr, st_buffer_size, st_write_threshold, - st_max_buffers, st_max_sg_segs); + "st: Version %s, fixed bufsize %d, wrt %d, " + "s/g segs %d\n", + verstr, st_fixed_buffer_size, st_write_threshold, + st_max_sg_segs); if (devfs_register_chrdev(SCSI_TAPE_MAJOR, "st", &st_fops) >= 0) return scsi_register_device(&st_template); @@ -3939,16 +3788,6 @@ static void __exit exit_st(void) if (scsi_tapes[i]) kfree(scsi_tapes[i]); kfree(scsi_tapes); - if (st_buffers != NULL) { - for (i = 0; i < st_nbr_buffers; i++) { - if (st_buffers[i] != NULL) { - st_buffers[i]->orig_sg_segs = 0; - normalize_buffer(st_buffers[i]); - kfree(st_buffers[i]); - } - } - kfree(st_buffers); - } } st_template.dev_max = 0; printk(KERN_INFO "st: Unloaded.\n"); diff --git a/drivers/scsi/st_options.h b/drivers/scsi/st_options.h index 325bd3cb5c1e..2c412f72be13 100644 --- a/drivers/scsi/st_options.h +++ b/drivers/scsi/st_options.h @@ -3,7 +3,7 @@ Copyright 1995-2000 Kai Makisara. - Last modified: Tue Jan 22 21:52:34 2002 by makisara + Last modified: Sun May 5 15:09:56 2002 by makisara */ #ifndef _ST_OPTIONS_H @@ -30,22 +30,17 @@ SENSE. */ #define ST_DEFAULT_BLOCK 0 -/* The tape driver buffer size in kilobytes. Must be non-zero. */ -#define ST_BUFFER_BLOCKS 32 +/* The minimum tape driver buffer size in kilobytes in fixed block mode. + Must be non-zero. */ +#define ST_FIXED_BUFFER_BLOCKS 32 /* The number of kilobytes of data in the buffer that triggers an asynchronous write in fixed block mode. See also ST_ASYNC_WRITES below. */ #define ST_WRITE_THRESHOLD_BLOCKS 30 -/* The maximum number of tape buffers the driver tries to allocate at - driver initialisation. The number is also constrained by the number - of drives detected. 
If more buffers are needed, they are allocated - at run time and freed after use. */ -#define ST_MAX_BUFFERS 4 - /* Maximum number of scatter/gather segments */ -#define ST_MAX_SG 16 +#define ST_MAX_SG 64 /* The number of scatter/gather segments to allocate at first try (must be smaller or equal to the maximum). */ -- cgit v1.2.3 From 73769d9baca2040c407de7230377d440b9d11997 Mon Sep 17 00:00:00 2001 From: Paul Menage Date: Mon, 17 Jun 2002 20:46:11 -0700 Subject: [PATCH] Push BKL into ->permission() calls This patch (against 2.5.22) removes the BKL from around the call to i_op->permission() in fs/namei.c, and pushes the BKL into those filesystems that have permission() methods that require it. --- Documentation/filesystems/Locking | 42 +++++++++++++++++++-------------------- Documentation/filesystems/porting | 6 +++--- fs/coda/dir.c | 9 +++++++-- fs/intermezzo/dir.c | 27 ++++++++++++------------- fs/namei.c | 9 ++------- fs/nfs/dir.c | 4 ++++ 6 files changed, 50 insertions(+), 47 deletions(-) diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index d636ae84e508..c894fcceb996 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -50,27 +50,27 @@ prototypes: int (*removexattr) (struct dentry *, const char *); locking rules: - all may block - BKL i_sem(inode) -lookup: no yes -create: no yes -link: no yes (both) -mknod: no yes -symlink: no yes -mkdir: no yes -unlink: no yes (both) -rmdir: no yes (both) (see below) -rename: no yes (all) (see below) -readlink: no no -follow_link: no no -truncate: no yes (see below) -setattr: no yes -permission: yes no -getattr: no no -setxattr: no yes -getxattr: no yes -listxattr: no yes -removexattr: no yes + all may block, none have BKL + i_sem(inode) +lookup: yes +create: yes +link: yes (both) +mknod: yes +symlink: yes +mkdir: yes +unlink: yes (both) +rmdir: yes (both) (see below) +rename: yes (all) (see below) +readlink: no +follow_link: no +truncate: yes (see below) +setattr: yes +permission: no +getattr: no +setxattr: yes +getxattr: yes +listxattr: yes +removexattr: yes Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_sem on victim. cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem. diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting index ef49709ee8ad..85281b6f4ff0 100644 --- a/Documentation/filesystems/porting +++ b/Documentation/filesystems/porting @@ -81,9 +81,9 @@ can relax your locking. [mandatory] ->lookup(), ->truncate(), ->create(), ->unlink(), ->mknod(), ->mkdir(), -->rmdir(), ->link(), ->lseek(), ->symlink(), ->rename() and ->readdir() -are called without BKL now. Grab it on the entry, drop upon return - that -will guarantee the same locking you used to have. If your method or its +->rmdir(), ->link(), ->lseek(), ->symlink(), ->rename(), ->permission() +and ->readdir() are called without BKL now. Grab it on entry, drop upon return +- that will guarantee the same locking you used to have. If your method or its parts do not need BKL - better yet, now you can shift lock_kernel() and unlock_kernel() so that they would protect exactly what needs to be protected. 
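The per-filesystem conversions that follow all take the same shape: since the VFS no longer takes the big kernel lock around the call, a permission() method that still relies on it must take lock_kernel()/unlock_kernel() itself, funnelling every return through a single exit label so the lock is always dropped. A minimal sketch of that shape, for a hypothetical "examplefs" (the helper functions are placeholders, not from any of the patched filesystems):

	#include <linux/fs.h>
	#include <linux/smp_lock.h>	/* lock_kernel(), unlock_kernel() */

	/* Placeholder checks standing in for filesystem-specific logic. */
	static int examplefs_check_cached(struct inode *inode, int mask)
	{
		return 0;	/* cache miss */
	}

	static int examplefs_remote_access(struct inode *inode, int mask)
	{
		return 0;	/* access granted */
	}

	static int examplefs_permission(struct inode *inode, int mask)
	{
		int error = 0;

		lock_kernel();	/* previously taken by the VFS caller */

		if (examplefs_check_cached(inode, mask))
			goto out;	/* answered from cache, error stays 0 */

		error = examplefs_remote_access(inode, mask);
	 out:
		unlock_kernel();	/* single exit keeps lock/unlock balanced */
		return error;
	}

This mirrors the coda_permission() change below; methods that never needed the BKL can simply be left alone, since fs/namei.c no longer takes it on their behalf.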
diff --git a/fs/coda/dir.c b/fs/coda/dir.c index 16bd5714cecf..5c581916ecdd 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -147,21 +147,26 @@ exit: int coda_permission(struct inode *inode, int mask) { - int error; + int error = 0; if (!mask) return 0; + lock_kernel(); + coda_vfs_stat.permission++; if (coda_cache_check(inode, mask)) - return 0; + goto out; error = venus_access(inode->i_sb, coda_i2f(inode), mask); if (!error) coda_cache_enter(inode, mask); + out: + unlock_kernel(); + return error; } diff --git a/fs/intermezzo/dir.c b/fs/intermezzo/dir.c index c8a8c1988f16..cec0471800f1 100644 --- a/fs/intermezzo/dir.c +++ b/fs/intermezzo/dir.c @@ -785,13 +785,15 @@ int presto_permission(struct inode *inode, int mask) { unsigned short mode = inode->i_mode; struct presto_cache *cache; - int rc; + int rc = 0; + lock_kernel(); ENTRY; + if ( presto_can_ilookup() && !(mask & S_IWOTH)) { CDEBUG(D_CACHE, "ilookup on %ld OK\n", inode->i_ino); - EXIT; - return 0; + EXIT; + goto out; } cache = presto_get_cache(inode); @@ -803,25 +805,22 @@ int presto_permission(struct inode *inode, int mask) if ( S_ISREG(mode) && fiops && fiops->permission ) { EXIT; - return fiops->permission(inode, mask); + rc = fiops->permission(inode, mask); + goto out; } if ( S_ISDIR(mode) && diops && diops->permission ) { EXIT; - return diops->permission(inode, mask); + rc = diops->permission(inode, mask); + goto out; } } - /* The cache filesystem doesn't have its own permission function, - * but we don't want to duplicate the VFS code here. In order - * to avoid looping from permission calling this function again, - * we temporarily override the permission operation while we call - * the VFS permission function. - */ - inode->i_op->permission = NULL; - rc = permission(inode, mask); - inode->i_op->permission = &presto_permission; + rc = vfs_permission(inode, mask); EXIT; + + out: + unlock_kernel(); return rc; } diff --git a/fs/namei.c b/fs/namei.c index 506f8b5eee6b..8ac8afda4ccb 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -204,13 +204,8 @@ int vfs_permission(struct inode * inode, int mask) int permission(struct inode * inode,int mask) { - if (inode->i_op && inode->i_op->permission) { - int retval; - lock_kernel(); - retval = inode->i_op->permission(inode, mask); - unlock_kernel(); - return retval; - } + if (inode->i_op && inode->i_op->permission) + return inode->i_op->permission(inode, mask); return vfs_permission(inode, mask); } diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 1cbf3a697bda..73d57238a1cc 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1123,6 +1123,8 @@ nfs_permission(struct inode *inode, int mask) && error != -EACCES) goto out; + lock_kernel(); + error = NFS_PROTO(inode)->access(inode, mask, 0); if (error == -EACCES && NFS_CLIENT(inode)->cl_droppriv && @@ -1130,6 +1132,8 @@ nfs_permission(struct inode *inode, int mask) (current->fsuid != current->uid || current->fsgid != current->gid)) error = NFS_PROTO(inode)->access(inode, mask, 1); + unlock_kernel(); + out: return error; } -- cgit v1.2.3 From d76513b3526a32389c30843aa11d050db1b350e2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 18 Jun 2002 21:34:31 +0200 Subject: - fix preemption bug in cli(). 
--- arch/i386/kernel/irq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c index 8608a903f86d..600f7a6d2c51 100644 --- a/arch/i386/kernel/irq.c +++ b/arch/i386/kernel/irq.c @@ -360,8 +360,9 @@ void __global_cli(void) __save_flags(flags); if (flags & (1 << EFLAGS_IF_SHIFT)) { - int cpu = smp_processor_id(); + int cpu; __cli(); + cpu = smp_processor_id(); if (!local_irq_count(cpu)) get_irqlock(cpu); } -- cgit v1.2.3 From 5567614b40b53cf9abe6e8370f40d19a9206f2ed Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 18 Jun 2002 21:35:17 +0200 Subject: - sti() preemption fix. --- arch/i386/kernel/irq.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c index 600f7a6d2c51..4265cb038a5a 100644 --- a/arch/i386/kernel/irq.c +++ b/arch/i386/kernel/irq.c @@ -370,11 +370,12 @@ void __global_cli(void) void __global_sti(void) { - int cpu = smp_processor_id(); + int cpu = get_cpu(); if (!local_irq_count(cpu)) release_irqlock(cpu); __sti(); + put_cpu(); } /* -- cgit v1.2.3
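Both preemption fixes above close the same race: on a preemptible kernel, the value of smp_processor_id() is only stable while preemption is disabled, otherwise the task can migrate to another CPU between reading the id and using it. __global_cli() now reads the id only after __cli(), when the task can no longer be preempted away, and __global_sti() brackets its use of the id with the get_cpu()/put_cpu() pair, which disables and re-enables preemption. A minimal sketch of the idiom, using a hypothetical per-CPU counter (not code from the patches):

	#include <linux/smp.h>		/* get_cpu(), put_cpu(), smp_processor_id() */
	#include <linux/threads.h>	/* NR_CPUS */

	static unsigned long hits[NR_CPUS];

	static void count_hit(void)
	{
		int cpu;

		/* Racy on a preemptible kernel: the task may be migrated
		 * right after smp_processor_id() returns, so "cpu" can be
		 * stale by the time hits[cpu] is touched:
		 *
		 *	cpu = smp_processor_id();
		 *	hits[cpu]++;
		 */

		cpu = get_cpu();	/* disables preemption, returns CPU id */
		hits[cpu]++;		/* cannot migrate until put_cpu() */
		put_cpu();		/* re-enables preemption */
	}

Disabling local interrupts, as __global_cli() does, prevents migration equally well, which is why the first patch only needed to reorder the smp_processor_id() call relative to __cli().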