From d30a11004e3411909f2448546f036a011978062e Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:31:30 -0700 Subject: readahead: record mmap read-around states in file_ra_state Mmap read-around now shares the same code style and data structure with readahead code. This also removes do_page_cache_readahead(). Its last user, mmap read-around, has been changed to call ra_submit(). The no-readahead-if-congested logic is dropped along the way. Users will be pretty sensitive to slow loading of executables, so it's unfavorable to disable mmap read-around on a congested queue. [akpm@linux-foundation.org: coding-style fixes] Cc: Nick Piggin Signed-off-by: Fengguang Wu Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index ad613ed66ab0..33da7f538841 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1178,8 +1178,6 @@ void task_dirty_inc(struct task_struct *tsk); #define VM_MAX_READAHEAD 128 /* kbytes */ #define VM_MIN_READAHEAD 16 /* kbytes (includes current page) */ -int do_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read); int force_page_cache_readahead(struct address_space *mapping, struct file *filp, pgoff_t offset, unsigned long nr_to_read); @@ -1197,6 +1195,9 @@ void page_cache_async_readahead(struct address_space *mapping, unsigned long size); unsigned long max_sane_readahead(unsigned long nr); +unsigned long ra_submit(struct file_ra_state *ra, + struct address_space *mapping, + struct file *filp); /* Do stack extension */ extern int expand_stack(struct vm_area_struct *vma, unsigned long address); -- cgit v1.2.3 From d2bf6be8ab63aa84e6149aac934649aadf3828b1 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Tue, 16 Jun 2009 15:31:39 -0700 Subject: mm: clean up get_user_pages_fast() documentation Move more documentation for get_user_pages_fast into the new kerneldoc comment. Add some comments for get_user_pages as well. Also, move the get_user_pages_fast declaration up next to get_user_pages. It wasn't there initially because it was once a static inline function.
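As a rough sketch of the calling convention this kerneldoc describes (not part of the patch itself; the function name example_pin_user_buffer and its surrounding logic are made up for illustration), a driver pinning a user buffer with get_user_pages_fast() might look like this:

#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/slab.h>

/*
 * Illustrative only: pin nr_pages of a user buffer starting at uaddr,
 * touch each page through a kernel mapping, then dirty and release it.
 */
static int example_pin_user_buffer(unsigned long uaddr, int nr_pages, int write)
{
	struct page **pages;
	int i, pinned;

	pages = kmalloc(nr_pages * sizeof(*pages), GFP_KERNEL);
	if (!pages)
		return -ENOMEM;

	/* Must be called without mmap_sem held; operates on current->mm. */
	pinned = get_user_pages_fast(uaddr, nr_pages, write, pages);
	if (pinned < 0) {
		kfree(pages);
		return pinned;			/* no pages were pinned */
	}

	for (i = 0; i < pinned; i++) {		/* may be fewer than nr_pages */
		void *kaddr = kmap(pages[i]);
		/* ... copy to/from kaddr, or set up DMA on the page ... */
		kunmap(pages[i]);
		if (write)
			set_page_dirty_lock(pages[i]);
		put_page(pages[i]);		/* drop the gup reference */
	}

	kfree(pages);
	return pinned;
}

The key points match the documentation added by the patch below: the call is made without mmap_sem held, the return value may be fewer pages than requested, and every pinned page must be dirtied (if written) and then released with put_page().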
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Nick Piggin Cc: Andy Grover Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 20 +++++--------------- mm/memory.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ mm/util.c | 16 ++++++++++++---- 3 files changed, 67 insertions(+), 19 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 33da7f538841..a880161a3854 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -824,8 +824,11 @@ static inline int handle_mm_fault(struct mm_struct *mm, extern int make_pages_present(unsigned long addr, unsigned long end); extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write); -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, - int len, int write, int force, struct page **pages, struct vm_area_struct **vmas); +int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int write, int force, + struct page **pages, struct vm_area_struct **vmas); +int get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages); extern int try_to_release_page(struct page * page, gfp_t gfp_mask); extern void do_invalidatepage(struct page *page, unsigned long offset); @@ -849,19 +852,6 @@ extern int mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long start, unsigned long end, unsigned long newflags); -/* - * get_user_pages_fast provides equivalent functionality to get_user_pages, - * operating on current and current->mm (force=0 and doesn't return any vmas). - * - * get_user_pages_fast may take mmap_sem and page tables, so no assumptions - * can be made about locking. get_user_pages_fast is to be implemented in a - * way that is advantageous (vs get_user_pages()) when the user memory area is - * already faulted in and present in ptes. However if the pages have to be - * faulted in, it may turn out to be slightly slower). - */ -int get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages); - /* * A callback you can register to apply pressure to ageable caches. * diff --git a/mm/memory.c b/mm/memory.c index 4126dd16778c..891bad0613f4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1360,6 +1360,56 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, return i; } +/** + * get_user_pages() - pin user pages in memory + * @tsk: task_struct of target task + * @mm: mm_struct of target mm + * @start: starting user address + * @len: number of pages from start to pin + * @write: whether pages will be written to by the caller + * @force: whether to force write access even if user mapping is + * readonly. This will result in the page being COWed even + * in MAP_SHARED mappings. You do not want this. + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @vmas: array of pointers to vmas corresponding to each page. + * Or NULL if the caller does not require them. + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If len is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. Each page returned must be released + * with a put_page() call when it is finished with. vmas will only + * remain valid while mmap_sem is held. + * + * Must be called with mmap_sem held for read or write. 
+ * + * get_user_pages walks a process's page tables and takes a reference to + * each struct page that each user address corresponds to at a given + * instant. That is, it takes the page that would be accessed if a user + * thread accesses the given user virtual address at that instant. + * + * This does not guarantee that the page exists in the user mappings when + * get_user_pages returns, and there may even be a completely different + * page there in some cases (eg. if mmapped pagecache has been invalidated + * and subsequently re faulted). However it does guarantee that the page + * won't be freed completely. And mostly callers simply care that the page + * contains data that was valid *at some point in time*. Typically, an IO + * or similar operation cannot guarantee anything stronger anyway because + * locks can't be held over the syscall boundary. + * + * If write=0, the page must not be written to. If the page is written to, + * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called + * after the page is finished with, and before put_page is called. + * + * get_user_pages is typically used for fewer-copy IO operations, to get a + * handle on the memory by some means other than accesses via the user virtual + * addresses. The pages may be submitted for DMA to devices or accessed via + * their kernel linear mapping (via the kmap APIs). Care should be taken to + * use the correct cache flushing APIs. + * + * See also get_user_pages_fast, for performance critical applications. + */ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas) diff --git a/mm/util.c b/mm/util.c index abc65aa7cdfc..d5d2213728c5 100644 --- a/mm/util.c +++ b/mm/util.c @@ -233,13 +233,21 @@ void arch_pick_mmap_layout(struct mm_struct *mm) * @pages: array that receives pointers to the pages pinned. * Should be at least nr_pages long. * - * Attempt to pin user pages in memory without taking mm->mmap_sem. - * If not successful, it will fall back to taking the lock and - * calling get_user_pages(). - * * Returns number of pages pinned. This may be fewer than the number * requested. If nr_pages is 0 or negative, returns 0. If no pages * were pinned, returns -errno. + * + * get_user_pages_fast provides equivalent functionality to get_user_pages, + * operating on current and current->mm, with force=0 and vma=NULL. However + * unlike get_user_pages, it must be called without mmap_sem held. + * + * get_user_pages_fast may take mmap_sem and page table locks, so no + * assumptions can be made about lack of locking. get_user_pages_fast is to be + * implemented in a way that is advantageous (vs get_user_pages()) when the + * user memory area is already faulted in and present in ptes. However if the + * pages have to be faulted in, it may turn out to be slightly slower so + * callers need to carefully consider what to use. On many architectures, + * get_user_pages_fast simply falls back to get_user_pages. */ int __attribute__((weak)) get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) -- cgit v1.2.3 From 6484eb3e2a81807722c5f28efef94d8338b7b996 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 16 Jun 2009 15:31:54 -0700 Subject: page allocator: do not check NUMA node ID when the caller knows the node is valid Callers of alloc_pages_node() can optionally specify -1 as a node to mean "allocate from the current node". 
However, a number of the callers in fast paths know for a fact their node is valid. To avoid a comparison and branch, this patch adds alloc_pages_exact_node() that only checks the nid with VM_BUG_ON(). Callers that know their node is valid are then converted. Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KOSAKI Motohiro Reviewed-by: Pekka Enberg Acked-by: Paul Mundt [for the SLOB NUMA bits] Cc: Peter Zijlstra Cc: Nick Piggin Cc: Dave Hansen Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/hp/common/sba_iommu.c | 2 +- arch/ia64/kernel/mca.c | 3 +-- arch/ia64/kernel/uncached.c | 3 ++- arch/ia64/sn/pci/pci_dma.c | 3 ++- arch/powerpc/platforms/cell/ras.c | 4 ++-- arch/x86/kvm/vmx.c | 2 +- drivers/misc/sgi-gru/grufile.c | 2 +- drivers/misc/sgi-xp/xpc_uv.c | 2 +- include/linux/gfp.h | 9 +++++++++ include/linux/mm.h | 1 - kernel/profile.c | 8 ++++---- mm/filemap.c | 2 +- mm/hugetlb.c | 4 ++-- mm/mempolicy.c | 2 +- mm/migrate.c | 2 +- mm/slab.c | 4 ++-- mm/slob.c | 4 ++-- 17 files changed, 33 insertions(+), 24 deletions(-) (limited to 'include/linux/mm.h') diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c index 56ceb68eb99d..fe63b2dc9d07 100644 --- a/arch/ia64/hp/common/sba_iommu.c +++ b/arch/ia64/hp/common/sba_iommu.c @@ -1131,7 +1131,7 @@ sba_alloc_coherent (struct device *dev, size_t size, dma_addr_t *dma_handle, gfp #ifdef CONFIG_NUMA { struct page *page; - page = alloc_pages_node(ioc->node == MAX_NUMNODES ? + page = alloc_pages_exact_node(ioc->node == MAX_NUMNODES ? numa_node_id() : ioc->node, flags, get_order(size)); diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index 8f33a8840422..5b17bd402275 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -1829,8 +1829,7 @@ ia64_mca_cpu_init(void *cpu_data) data = mca_bootmem(); first_time = 0; } else - data = page_address(alloc_pages_node(numa_node_id(), - GFP_KERNEL, get_order(sz))); + data = __get_free_pages(GFP_KERNEL, get_order(sz)); if (!data) panic("Could not allocate MCA memory for cpu %d\n", cpu); diff --git a/arch/ia64/kernel/uncached.c b/arch/ia64/kernel/uncached.c index 8eff8c1d40a6..6ba72ab42fcc 100644 --- a/arch/ia64/kernel/uncached.c +++ b/arch/ia64/kernel/uncached.c @@ -98,7 +98,8 @@ static int uncached_add_chunk(struct uncached_pool *uc_pool, int nid) /* attempt to allocate a granule's worth of cached memory pages */ - page = alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + page = alloc_pages_exact_node(nid, + GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, IA64_GRANULE_SHIFT-PAGE_SHIFT); if (!page) { mutex_unlock(&uc_pool->add_chunk_mutex); diff --git a/arch/ia64/sn/pci/pci_dma.c b/arch/ia64/sn/pci/pci_dma.c index d876423e4e75..98b684928e12 100644 --- a/arch/ia64/sn/pci/pci_dma.c +++ b/arch/ia64/sn/pci/pci_dma.c @@ -90,7 +90,8 @@ static void *sn_dma_alloc_coherent(struct device *dev, size_t size, */ node = pcibus_to_node(pdev->bus); if (likely(node >=0)) { - struct page *p = alloc_pages_node(node, flags, get_order(size)); + struct page *p = alloc_pages_exact_node(node, + flags, get_order(size)); if (likely(p)) cpuaddr = page_address(p); diff --git a/arch/powerpc/platforms/cell/ras.c b/arch/powerpc/platforms/cell/ras.c index 296b5268754e..5e0a191764fc 100644 --- a/arch/powerpc/platforms/cell/ras.c +++ b/arch/powerpc/platforms/cell/ras.c @@ -122,8 +122,8 @@ static int __init cbe_ptcal_enable_on_node(int nid, int order) area->nid = nid; area->order = order; - area->pages = alloc_pages_node(area->nid, 
GFP_KERNEL | GFP_THISNODE, - area->order); + area->pages = alloc_pages_exact_node(area->nid, GFP_KERNEL|GFP_THISNODE, + area->order); if (!area->pages) { printk(KERN_WARNING "%s: no page on node %d\n", diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 32d6ae8fb60e..e770bf349ec4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1277,7 +1277,7 @@ static struct vmcs *alloc_vmcs_cpu(int cpu) struct page *pages; struct vmcs *vmcs; - pages = alloc_pages_node(node, GFP_KERNEL, vmcs_config.order); + pages = alloc_pages_exact_node(node, GFP_KERNEL, vmcs_config.order); if (!pages) return NULL; vmcs = page_address(pages); diff --git a/drivers/misc/sgi-gru/grufile.c b/drivers/misc/sgi-gru/grufile.c index bbefe77c67a9..3ce2920e2bf3 100644 --- a/drivers/misc/sgi-gru/grufile.c +++ b/drivers/misc/sgi-gru/grufile.c @@ -302,7 +302,7 @@ static int gru_init_tables(unsigned long gru_base_paddr, void *gru_base_vaddr) pnode = uv_node_to_pnode(nid); if (bid < 0 || gru_base[bid]) continue; - page = alloc_pages_node(nid, GFP_KERNEL, order); + page = alloc_pages_exact_node(nid, GFP_KERNEL, order); if (!page) goto fail; gru_base[bid] = page_address(page); diff --git a/drivers/misc/sgi-xp/xpc_uv.c b/drivers/misc/sgi-xp/xpc_uv.c index 9172fcdee4e2..c76677afda1b 100644 --- a/drivers/misc/sgi-xp/xpc_uv.c +++ b/drivers/misc/sgi-xp/xpc_uv.c @@ -232,7 +232,7 @@ xpc_create_gru_mq_uv(unsigned int mq_size, int cpu, char *irq_name, mq->mmr_blade = uv_cpu_to_blade_id(cpu); nid = cpu_to_node(cpu); - page = alloc_pages_node(nid, GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, + page = alloc_pages_exact_node(nid, GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, pg_order); if (page == NULL) { dev_err(xpc_part, "xpc_create_gru_mq_uv() failed to alloc %d " diff --git a/include/linux/gfp.h b/include/linux/gfp.h index c2d3fe03b5d2..4efa33088a82 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -5,6 +5,7 @@ #include #include #include +#include struct vm_area_struct; @@ -192,6 +193,14 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask)); } +static inline struct page *alloc_pages_exact_node(int nid, gfp_t gfp_mask, + unsigned int order) +{ + VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); + + return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask)); +} + #ifdef CONFIG_NUMA extern struct page *alloc_pages_current(gfp_t gfp_mask, unsigned order); diff --git a/include/linux/mm.h b/include/linux/mm.h index a880161a3854..7b548e7cfbd9 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -7,7 +7,6 @@ #include #include -#include #include #include #include diff --git a/kernel/profile.c b/kernel/profile.c index 28cf26ad2d24..69911b5745eb 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -365,7 +365,7 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info, node = cpu_to_node(cpu); per_cpu(cpu_profile_flip, cpu) = 0; if (!per_cpu(cpu_profile_hits, cpu)[1]) { - page = alloc_pages_node(node, + page = alloc_pages_exact_node(node, GFP_KERNEL | __GFP_ZERO, 0); if (!page) @@ -373,7 +373,7 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info, per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); } if (!per_cpu(cpu_profile_hits, cpu)[0]) { - page = alloc_pages_node(node, + page = alloc_pages_exact_node(node, GFP_KERNEL | __GFP_ZERO, 0); if (!page) @@ -564,14 +564,14 @@ static int create_hash_tables(void) int node = cpu_to_node(cpu); struct page *page; - page = alloc_pages_node(node, + page = 
alloc_pages_exact_node(node, GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, 0); if (!page) goto out_cleanup; per_cpu(cpu_profile_hits, cpu)[1] = (struct profile_hit *)page_address(page); - page = alloc_pages_node(node, + page = alloc_pages_exact_node(node, GFP_KERNEL | __GFP_ZERO | GFP_THISNODE, 0); if (!page) diff --git a/mm/filemap.c b/mm/filemap.c index 6846a902f5cf..22396713feb9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -521,7 +521,7 @@ struct page *__page_cache_alloc(gfp_t gfp) { if (cpuset_do_page_mem_spread()) { int n = cpuset_mem_spread_node(); - return alloc_pages_node(n, gfp, 0); + return alloc_pages_exact_node(n, gfp, 0); } return alloc_pages(gfp, 0); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e83ad2c9228c..2f8241f300f5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -630,7 +630,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) if (h->order >= MAX_ORDER) return NULL; - page = alloc_pages_node(nid, + page = alloc_pages_exact_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); @@ -649,7 +649,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) * Use a helper variable to find the next node and then * copy it back to hugetlb_next_nid afterwards: * otherwise there's a window in which a racer might - * pass invalid nid MAX_NUMNODES to alloc_pages_node. + * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. * But we don't need to use a spin_lock here: it really * doesn't matter if occasionally a racer chooses the * same nid as we do. Move nid forward in the mask even diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 46bdf9ddf2ba..e08e2c4da63a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -803,7 +803,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, static struct page *new_node_page(struct page *page, unsigned long node, int **x) { - return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0); + return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0); } /* diff --git a/mm/migrate.c b/mm/migrate.c index 068655d8f883..5a24923e7fd7 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -802,7 +802,7 @@ static struct page *new_page_node(struct page *p, unsigned long private, *result = &pm->status; - return alloc_pages_node(pm->node, + return alloc_pages_exact_node(pm->node, GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); } diff --git a/mm/slab.c b/mm/slab.c index 18e3164de09a..bb3254c95cd2 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1707,7 +1707,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) if (cachep->flags & SLAB_RECLAIM_ACCOUNT) flags |= __GFP_RECLAIMABLE; - page = alloc_pages_node(nodeid, flags, cachep->gfporder); + page = alloc_pages_exact_node(nodeid, flags, cachep->gfporder); if (!page) return NULL; @@ -3261,7 +3261,7 @@ retry: if (local_flags & __GFP_WAIT) local_irq_enable(); kmem_flagcheck(cache, flags); - obj = kmem_getpages(cache, local_flags, -1); + obj = kmem_getpages(cache, local_flags, numa_node_id()); if (local_flags & __GFP_WAIT) local_irq_disable(); if (obj) { diff --git a/mm/slob.c b/mm/slob.c index 12f261499925..64f6db1943bf 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -46,7 +46,7 @@ * NUMA support in SLOB is fairly simplistic, pushing most of the real * logic down to the page allocator, and simply doing the node accounting * on the upper levels. 
In the event that a node id is explicitly - provided, alloc_pages_node() with the specified node id is used + provided, alloc_pages_exact_node() with the specified node id is used instead. The common case (or when the node id isn't explicitly provided) * will default to the current node, as per numa_node_id(). * @@ -244,7 +244,7 @@ static void *slob_new_pages(gfp_t gfp, int order, int node) #ifdef CONFIG_NUMA if (node != -1) - page = alloc_pages_node(node, gfp, order); + page = alloc_pages_exact_node(node, gfp, order); else #endif page = alloc_pages(gfp, order); -- cgit v1.2.3 From 3b6748e2dd69906af3835db4dc9d1c8a3ee4c68c Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 16 Jun 2009 15:32:35 -0700 Subject: mm: introduce follow_pfn() Analogous to follow_phys(), add a helper that looks up the PFN at a user virtual address in an IO mapping or a raw PFN mapping. Signed-off-by: Johannes Weiner Cc: Christoph Hellwig Acked-by: Magnus Damm Cc: Hans Verkuil Cc: Paul Mundt Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ mm/memory.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 7b548e7cfbd9..c80dbd4a62c6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -792,6 +792,8 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); +int follow_pfn(struct vm_area_struct *vma, unsigned long address, + unsigned long *pfn); int follow_phys(struct vm_area_struct *vma, unsigned long address, unsigned int flags, unsigned long *prot, resource_size_t *phys); int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, diff --git a/mm/memory.c b/mm/memory.c index 24ff20480bb7..d5d1653d60a6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3140,6 +3140,35 @@ out: return -EINVAL; } +/** + * follow_pfn - look up PFN at a user virtual address + * @vma: memory mapping + * @address: user virtual address + * @pfn: location to store found PFN + * + * Only IO mappings and raw PFN mappings are allowed. + * + * Returns zero and the pfn at @pfn on success, -ve otherwise. + */ +int follow_pfn(struct vm_area_struct *vma, unsigned long address, + unsigned long *pfn) +{ + int ret = -EINVAL; + spinlock_t *ptl; + pte_t *ptep; + + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + return ret; + + ret = follow_pte(vma->vm_mm, address, &ptep, &ptl); + if (ret) + return ret; + *pfn = pte_pfn(*ptep); + pte_unmap_unlock(ptep, ptl); + return 0; +} +EXPORT_SYMBOL(follow_pfn); + #ifdef CONFIG_HAVE_IOREMAP_PROT int follow_phys(struct vm_area_struct *vma, unsigned long address, unsigned int flags, -- cgit v1.2.3 From bc75d33f0fc1d56e734db1f56d3cfc8097b8e0cf Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 16 Jun 2009 15:32:48 -0700 Subject: page-allocator: clean up functions related to pages_min Change the names of two functions. It doesn't affect behavior. Presently, setup_per_zone_pages_min() changes the zone's low and high watermarks as well as min. So a better name is setup_per_zone_wmarks(). That's because Mel changed zone->pages_[high/low/min] to the zone->watermark array in "page allocator: replace the watermark-related union in struct zone with a watermark[] array". * setup_per_zone_pages_min => setup_per_zone_wmarks Of course, we have to change init_per_zone_pages_min, too.
There are not pages_min any more. * init_per_zone_pages_min => init_per_zone_wmark_min [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Minchan Kim Acked-by: Mel Gorman Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/memory_hotplug.c | 2 +- mm/page_alloc.c | 17 +++++++++-------- 3 files changed, 11 insertions(+), 10 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index c80dbd4a62c6..6e11cda1ba02 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1052,7 +1052,7 @@ extern int __meminit __early_pfn_to_nid(unsigned long pfn); extern void set_dma_reserve(unsigned long new_dma_reserve); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, enum memmap_context); -extern void setup_per_zone_pages_min(void); +extern void setup_per_zone_wmarks(void); extern void mem_init(void); extern void __init mmap_init(void); extern void show_mem(void); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c083cf5fd6df..037291e15b27 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -422,7 +422,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) zone->present_pages += onlined_pages; zone->zone_pgdat->node_present_pages += onlined_pages; - setup_per_zone_pages_min(); + setup_per_zone_wmarks(); if (onlined_pages) { kswapd_run(zone_to_nid(zone)); node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5b09488d0f55..8629c84fd9ba 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4396,12 +4396,13 @@ static void setup_per_zone_lowmem_reserve(void) } /** - * setup_per_zone_pages_min - called when min_free_kbytes changes. + * setup_per_zone_wmarks - called when min_free_kbytes changes + * or when memory is hot-added * - * Ensures that the pages_{min,low,high} values for each zone are set correctly - * with respect to min_free_kbytes. + * Ensures that the watermark[min,low,high] values for each zone are set + * correctly with respect to min_free_kbytes. 
*/ -void setup_per_zone_pages_min(void) +void setup_per_zone_wmarks(void) { unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); unsigned long lowmem_pages = 0; @@ -4519,7 +4520,7 @@ static void __init setup_per_zone_inactive_ratio(void) * 8192MB: 11584k * 16384MB: 16384k */ -static int __init init_per_zone_pages_min(void) +static int __init init_per_zone_wmark_min(void) { unsigned long lowmem_kbytes; @@ -4530,12 +4531,12 @@ static int __init init_per_zone_pages_min(void) min_free_kbytes = 128; if (min_free_kbytes > 65536) min_free_kbytes = 65536; - setup_per_zone_pages_min(); + setup_per_zone_wmarks(); setup_per_zone_lowmem_reserve(); setup_per_zone_inactive_ratio(); return 0; } -module_init(init_per_zone_pages_min) +module_init(init_per_zone_wmark_min) /* * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so @@ -4547,7 +4548,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, { proc_dointvec(table, write, file, buffer, length, ppos); if (write) - setup_per_zone_pages_min(); + setup_per_zone_wmarks(); return 0; } -- cgit v1.2.3 From 96cb4df5ddf5e6d5785b5acd4003e3689b87f896 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 16 Jun 2009 15:32:49 -0700 Subject: page-allocator: add inactive ratio calculation function of each zone Factor the per-zone arithmetic inside setup_per_zone_inactive_ratio()'s loop into a separate function, calculate_zone_inactive_ratio(). This function will be used in a later patch. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Minchan Kim Reviewed-by: KOSAKI Motohiro Cc: Rik van Riel Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + mm/page_alloc.c | 28 ++++++++++++++++------------ 2 files changed, 17 insertions(+), 12 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6e11cda1ba02..f0473a88ca2e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1053,6 +1053,7 @@ extern void set_dma_reserve(unsigned long new_dma_reserve); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, enum memmap_context); extern void setup_per_zone_wmarks(void); +extern void calculate_zone_inactive_ratio(struct zone *zone); extern void mem_init(void); extern void __init mmap_init(void); extern void show_mem(void); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8629c84fd9ba..303607f1d323 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4478,22 +4478,26 @@ void setup_per_zone_wmarks(void) * 1TB 101 10GB * 10TB 320 32GB */ -static void __init setup_per_zone_inactive_ratio(void) +void calculate_zone_inactive_ratio(struct zone *zone) { - struct zone *zone; + unsigned int gb, ratio; - for_each_zone(zone) { - unsigned int gb, ratio; + /* Zone size in gigabytes */ + gb = zone->present_pages >> (30 - PAGE_SHIFT); + if (gb) + ratio = int_sqrt(10 * gb); + else + ratio = 1; - /* Zone size in gigabytes */ - gb = zone->present_pages >> (30 - PAGE_SHIFT); - if (gb) - ratio = int_sqrt(10 * gb); - else - ratio = 1; + zone->inactive_ratio = ratio; +} - zone->inactive_ratio = ratio; - } +static void __init setup_per_zone_inactive_ratio(void) +{ + struct zone *zone; + + for_each_zone(zone) + calculate_zone_inactive_ratio(zone); } /* -- cgit v1.2.3 From 168f5ac668f63dfb64439766e3ef9e866b83719d Mon Sep 17 00:00:00 2001 From: Sergei Trofimovich Date: Tue, 16 Jun 2009 15:33:02 -0700 Subject: mm cleanup: shmem_file_setup: 'char *' -> 'const char *' for name argument As the function
shmem_file_setup() does not modify, allocate, free, or pass on the given filename, mark it as const. Signed-off-by: Sergei Trofimovich Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 +- mm/shmem.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux/mm.h') diff --git a/include/linux/mm.h b/include/linux/mm.h index f0473a88ca2e..d88d6fc530ad 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -724,7 +724,7 @@ static inline int shmem_lock(struct file *file, int lock, return 0; } #endif -struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags); +struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags); int shmem_zero_setup(struct vm_area_struct *); diff --git a/mm/shmem.c b/mm/shmem.c index 47ab19182287..e89d7ec18eda 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2612,7 +2612,7 @@ int shmem_unuse(swp_entry_t entry, struct page *page) * @size: size to be set for the file * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size */ -struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) +struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) { int error; struct file *file; -- cgit v1.2.3
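To close with a hedged usage sketch of the interface touched by this last patch (not part of any commit above; everything except shmem_file_setup(), fput(), IS_ERR() and PTR_ERR() is an invented example name), a caller backing a VMA with an unlinked shmem object might look roughly like this:

#include <linux/err.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/mm.h>

/*
 * Illustrative only: give a VMA an unlinked shmem file as backing store,
 * in the style of the existing shmem_zero_setup() caller.  A real user
 * would also install the appropriate vm_ops.
 */
static int example_shmem_backing(struct vm_area_struct *vma)
{
	loff_t size = vma->vm_end - vma->vm_start;
	struct file *file;

	/* The name is only a label, e.g. visible in /proc/<pid>/maps. */
	file = shmem_file_setup("dev/example", size, vma->vm_flags);
	if (IS_ERR(file))
		return PTR_ERR(file);

	if (vma->vm_file)
		fput(vma->vm_file);	/* drop any previous backing file */
	vma->vm_file = file;
	return 0;
}

With the const-qualified prototype, passing a string literal as the name no longer provokes a discarded-qualifier warning.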