From fccbe3844c29beed4e665b1a5aafada44e133adc Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Mon, 3 Feb 2003 16:59:04 -0800
Subject: [PATCH] implement posix_fadvise64()

An implementation of posix_fadvise64(). It adds 368 bytes to my vmlinux and
is worth it.

I didn't bother doing posix_fadvise(), as userspace can implement that by
calling fadvise64().

The main reason for wanting this syscall is to provide userspace with the
ability to explicitly shoot down pagecache when streaming large files. This
is what O_STREAMING does, only posix_fadvise() is standards-based, and harder
to use.

posix_fadvise() also subsumes sys_readahead().

POSIX_FADV_WILLNEED will generally provide asynchronous readahead semantics
for small amounts of I/O, as long as things like indirect blocks are already
in core.

POSIX_FADV_RANDOM gives unprivileged applications a way of disabling
readahead on a per-fd basis, which may provide some benefit for super-seeky
access patterns such as databases.

The POSIX_FADV_* values are already implemented in glibc, and this patch
ensures that they are in sync.

A test app (fadvise.c) is available in ext3 CVS. See
http://www.zip.com.au/~akpm/linux/ext3/ for CVS details.

Ulrich has reviewed this patch (thanks).
---
 arch/i386/kernel/entry.S  |  4 +-
 include/asm-i386/unistd.h |  2 +
 include/linux/fadvise.h   | 11 ++++++
 include/linux/fs.h        |  2 +
 mm/Makefile               |  2 +-
 mm/fadvise.c              | 98 +++++++++++++++++++++++++++++++++++++++++++++++
 mm/truncate.c             | 21 +++++++---
 7 files changed, 131 insertions(+), 9 deletions(-)
 create mode 100644 include/linux/fadvise.h
 create mode 100644 mm/fadvise.c

diff --git a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
index 09954f83b483..40bd0c4e42ca 100644
--- a/arch/i386/kernel/entry.S
+++ b/arch/i386/kernel/entry.S
@@ -792,8 +792,8 @@ ENTRY(sys_call_table)
 	.long sys_io_getevents
 	.long sys_io_submit
 	.long sys_io_cancel
-	.long sys_ni_syscall	/* 250 sys_alloc_hugepages - reuse this */
-	.long sys_ni_syscall	/* was sys_free_hugepages - reuse this */
+	.long sys_fadvise64	/* 250 */
+	.long sys_ni_syscall
 	.long sys_exit_group
 	.long sys_lookup_dcookie
 	.long sys_epoll_create
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index fb5a97ec22f5..d21b3a8a4f4e 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -255,6 +255,8 @@
 #define __NR_io_getevents	247
 #define __NR_io_submit		248
 #define __NR_io_cancel		249
+#define __NR_fadvise64		250
+
 #define __NR_exit_group		252
 #define __NR_lookup_dcookie	253
 #define __NR_epoll_create	254
diff --git a/include/linux/fadvise.h b/include/linux/fadvise.h
new file mode 100644
index 000000000000..6fc656dfb93d
--- /dev/null
+++ b/include/linux/fadvise.h
@@ -0,0 +1,11 @@
+#ifndef FADVISE_H_INCLUDED
+#define FADVISE_H_INCLUDED
+
+#define POSIX_FADV_NORMAL	0 /* No further special treatment. */
+#define POSIX_FADV_RANDOM	1 /* Expect random page references. */
+#define POSIX_FADV_SEQUENTIAL	2 /* Expect sequential page references. */
+#define POSIX_FADV_WILLNEED	3 /* Will need these pages. */
+#define POSIX_FADV_DONTNEED	4 /* Don't need these pages. */
+#define POSIX_FADV_NOREUSE	5 /* Data will be accessed once. */
+
+#endif	/* FADVISE_H_INCLUDED */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 76b32526394f..f4c994d02f5d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1102,6 +1102,8 @@ extern int full_check_disk_change(struct block_device *);
 extern int __check_disk_change(dev_t);
 extern int invalidate_inodes(struct super_block *);
 extern int invalidate_device(kdev_t, int);
+extern void invalidate_mapping_pages(struct address_space *mapping,
+					pgoff_t start, pgoff_t end);
 extern void invalidate_inode_pages(struct address_space *mapping);
 extern void invalidate_inode_pages2(struct address_space *mapping);
 extern void write_inode_now(struct inode *, int);
diff --git a/mm/Makefile b/mm/Makefile
index a6dd1ab57fdf..a8de64ff3525 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -7,7 +7,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
 			   mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
 			   shmem.o vmalloc.o
 
-obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o \
+obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   page_alloc.o page-writeback.o pdflush.o readahead.o \
 			   slab.o swap.o truncate.o vcache.o vmscan.o $(mmu-y)
 
diff --git a/mm/fadvise.c b/mm/fadvise.c
new file mode 100644
index 000000000000..9503b65076a1
--- /dev/null
+++ b/mm/fadvise.c
@@ -0,0 +1,98 @@
+/*
+ * mm/fadvise.c
+ *
+ * Copyright (C) 2002, Linus Torvalds
+ *
+ * 11Jan2003	akpm@digeo.com
+ *		Initial version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/backing-dev.h>
+#include <linux/pagevec.h>
+#include <linux/fadvise.h>
+
+/*
+ * sys_fadvise64 - advise the kernel how a byte range of a file will be used
+ * @fd: open file descriptor
+ * @offset: first byte of the range
+ * @len: length of the range in bytes; zero means "through end of file"
+ * @advice: one of the POSIX_FADV_* values
+ *
+ * Returns 0, -EBADF for a bad @fd, -EINVAL for an unknown @advice or a file
+ * without a mapping or ->readpage, else do_page_cache_readahead()'s error.
+ *
+ * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
+ * deactivate the pages and clear PG_Referenced.
+ */
+int sys_fadvise64(int fd, loff_t offset, size_t len, int advice)
+{
+	struct file *file = fget(fd);
+	struct address_space *mapping;
+	struct backing_dev_info *bdi;
+	unsigned long long endbyte;	/* last byte of the range, inclusive */
+	pgoff_t start_index, end_index;	/* first and last page of the range */
+	unsigned long nrpages;
+	int ret = 0;
+
+	if (!file)
+		return -EBADF;
+
+	mapping = file->f_dentry->d_inode->i_mapping;
+	if (!mapping) {
+		/* fget() took a reference: leave via the fput() at out: */
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* A zero length, or a range which wraps, runs to the end of the file */
+	endbyte = (unsigned long long)offset + len;
+	if (!len || endbyte < len)
+		endbyte = ~0ULL;
+	else
+		endbyte--;
+	start_index = offset >> PAGE_CACHE_SHIFT;
+	end_index = endbyte >> PAGE_CACHE_SHIFT;
+
+	bdi = mapping->backing_dev_info;
+
+	switch (advice) {
+	case POSIX_FADV_NORMAL:
+		file->f_ra.ra_pages = bdi->ra_pages;
+		break;
+	case POSIX_FADV_RANDOM:
+		file->f_ra.ra_pages = 0;
+		break;
+	case POSIX_FADV_SEQUENTIAL:
+		file->f_ra.ra_pages = bdi->ra_pages * 2;
+		break;
+	case POSIX_FADV_WILLNEED:
+	case POSIX_FADV_NOREUSE:
+		if (!mapping->a_ops->readpage) {
+			ret = -EINVAL;
+			break;
+		}
+		/* The count wraps to zero when the range spans every index */
+		nrpages = end_index - start_index + 1;
+		if (!nrpages)
+			nrpages = ~0UL;
+		ret = do_page_cache_readahead(mapping, file, start_index,
+				max_sane_readahead(nrpages));
+		if (ret > 0)
+			ret = 0;
+		break;
+	case POSIX_FADV_DONTNEED:
+		/* The end argument is an inclusive page index, not a count */
+		invalidate_mapping_pages(mapping, start_index, end_index);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+out:
+	fput(file);
+	return ret;
+}
diff --git a/mm/truncate.c b/mm/truncate.c
index b1782738666b..0c1dd53ae48f 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -177,24 +177,28 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart)
 }
 
 /**
- * invalidate_inode_pages - Invalidate all the unlocked pages of one inode
- * @inode: the inode which pages we want to invalidate
+ * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode
+ * @mapping: the address_space which holds the pages to invalidate
+ * @start: the index of the first page to invalidate
+ * @end: the index of the last page to invalidate (inclusive)
  *
  * This function only removes the unlocked pages, if you want to
  * remove all the pages of one inode, you must call truncate_inode_pages.
  *
- * invalidate_inode_pages() will not block on IO activity. It will not
+ * invalidate_mapping_pages() will not block on IO activity. It will not
  * invalidate pages which are dirty, locked, under writeback or mapped into
  * pagetables.
  */
-void invalidate_inode_pages(struct address_space *mapping)
+void invalidate_mapping_pages(struct address_space *mapping,
+				pgoff_t start, pgoff_t end)
 {
 	struct pagevec pvec;
-	pgoff_t next = 0;
+	pgoff_t next = start;
 	int i;
 
 	pagevec_init(&pvec, 0);
-	while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+	while (next <= end &&
+			pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
@@ -218,6 +222,11 @@ unlock:
 	}
 }
 
+void invalidate_inode_pages(struct address_space *mapping)
+{
+	invalidate_mapping_pages(mapping, 0, ~0UL);
+}
+
 /**
  * invalidate_inode_pages2 - remove all unmapped pages from an address_space
  * @mapping - the address_space
-- 
cgit v1.2.3