author    Linus Torvalds <torvalds@home.transmeta.com>  2002-06-17 20:48:29 -0700
committer Linus Torvalds <torvalds@home.transmeta.com>  2002-06-17 20:48:29 -0700
commit    1f60ade2a44d22a67c75a165b70d66f9d4e0b76e (patch)
tree      7a8bda4c45fb3e5d255a023b030137e3b6be87ee
parent    8509486ae776be099cbedb6c37c37741ddc20ad8 (diff)
parent    3986594c6167a269053d3d88f17e53e0ca4023f8 (diff)
Merge master.kernel.org:/home/mingo/bk-sched
into home.transmeta.com:/home/torvalds/v2.5/linux
-rw-r--r--  Documentation/filesystems/Locking | 42
-rw-r--r--  Documentation/filesystems/porting | 6
-rw-r--r--  Documentation/filesystems/proc.txt | 202
-rw-r--r--  Documentation/sysctl/vm.txt | 143
-rw-r--r--  arch/alpha/kernel/time.c | 2
-rw-r--r--  arch/arm/kernel/time.c | 2
-rw-r--r--  arch/cris/kernel/time.c | 2
-rw-r--r--  arch/i386/kernel/irq.c | 6
-rw-r--r--  arch/i386/kernel/time.c | 1
-rw-r--r--  arch/i386/mm/Makefile | 3
-rw-r--r--  arch/i386/mm/ioremap.c | 69
-rw-r--r--  arch/i386/mm/pageattr.c | 197
-rw-r--r--  arch/ia64/kernel/time.c | 2
-rw-r--r--  arch/m68k/kernel/time.c | 1
-rw-r--r--  arch/mips/kernel/time.c | 2
-rw-r--r--  arch/mips64/kernel/syscall.c | 2
-rw-r--r--  arch/parisc/kernel/time.c | 2
-rw-r--r--  arch/ppc/kernel/time.c | 3
-rw-r--r--  arch/ppc64/kernel/time.c | 2
-rw-r--r--  arch/s390/kernel/time.c | 2
-rw-r--r--  arch/s390x/kernel/time.c | 2
-rw-r--r--  arch/sh/kernel/time.c | 2
-rw-r--r--  arch/sparc/kernel/time.c | 2
-rw-r--r--  arch/sparc64/kernel/time.c | 2
-rw-r--r--  arch/x86_64/Makefile | 6
-rw-r--r--  arch/x86_64/boot/Makefile | 4
-rw-r--r--  arch/x86_64/config.in | 4
-rw-r--r--  arch/x86_64/ia32/Makefile | 5
-rw-r--r--  arch/x86_64/ia32/ipc32.c | 645
-rw-r--r--  arch/x86_64/ia32/sys_ia32.c | 416
-rw-r--r--  arch/x86_64/kernel/ioport.c | 23
-rw-r--r--  arch/x86_64/kernel/mtrr.c | 454
-rw-r--r--  arch/x86_64/kernel/process.c | 33
-rw-r--r--  arch/x86_64/kernel/setup64.c | 3
-rw-r--r--  arch/x86_64/kernel/signal.c | 56
-rw-r--r--  arch/x86_64/kernel/smp.c | 22
-rw-r--r--  arch/x86_64/kernel/vsyscall.c | 2
-rw-r--r--  arch/x86_64/kernel/x8664_ksyms.c | 2
-rw-r--r--  arch/x86_64/lib/Makefile | 2
-rw-r--r--  arch/x86_64/lib/memset.S | 76
-rw-r--r--  drivers/block/DAC960.c | 1
-rw-r--r--  drivers/block/cciss.c | 1
-rw-r--r--  drivers/block/cpqarray.c | 1
-rw-r--r--  drivers/block/elevator.c | 1
-rw-r--r--  drivers/block/floppy.c | 1
-rw-r--r--  drivers/block/ll_rw_blk.c | 5
-rw-r--r--  drivers/block/loop.c | 84
-rw-r--r--  drivers/block/nbd.c | 1
-rw-r--r--  drivers/block/rd.c | 2
-rw-r--r--  drivers/block/umem.c | 1
-rw-r--r--  drivers/char/agp/agp.h | 4
-rw-r--r--  drivers/char/agp/agpgart_be.c | 155
-rw-r--r--  drivers/char/random.c | 1
-rw-r--r--  drivers/ide/ioctl.c | 5
-rw-r--r--  drivers/md/linear.c | 2
-rw-r--r--  drivers/md/lvm-snap.c | 2
-rw-r--r--  drivers/md/lvm.c | 1
-rw-r--r--  drivers/md/md.c | 1
-rw-r--r--  drivers/md/multipath.c | 1
-rw-r--r--  drivers/md/raid0.c | 1
-rw-r--r--  drivers/md/raid1.c | 1
-rw-r--r--  drivers/md/raid5.c | 1
-rw-r--r--  drivers/pci/pci-driver.c | 1
-rw-r--r--  drivers/pcmcia/pci_socket.c | 1
-rw-r--r--  drivers/pcmcia/yenta.c | 1
-rw-r--r--  drivers/scsi/README.st | 46
-rw-r--r--  drivers/scsi/cpqfcTSinit.c | 1
-rw-r--r--  drivers/scsi/scsi_lib.c | 1
-rw-r--r--  drivers/scsi/sd.c | 1
-rw-r--r--  drivers/scsi/sr.c | 1
-rw-r--r--  drivers/scsi/st.c | 429
-rw-r--r--  drivers/scsi/st_options.h | 15
-rw-r--r--  fs/bio.c | 9
-rw-r--r--  fs/buffer.c | 138
-rw-r--r--  fs/coda/dir.c | 9
-rw-r--r--  fs/ext3/balloc.c | 4
-rw-r--r--  fs/ext3/inode.c | 2
-rw-r--r--  fs/inode.c | 26
-rw-r--r--  fs/intermezzo/dir.c | 27
-rw-r--r--  fs/jbd/commit.c | 14
-rw-r--r--  fs/jbd/journal.c | 2
-rw-r--r--  fs/jbd/revoke.c | 6
-rw-r--r--  fs/jbd/transaction.c | 66
-rw-r--r--  fs/jfs/jfs_logmgr.c | 1
-rw-r--r--  fs/namei.c | 9
-rw-r--r--  fs/nfs/dir.c | 4
-rw-r--r--  fs/ntfs/aops.c | 16
-rw-r--r--  fs/qnx4/fsync.c | 2
-rw-r--r--  fs/reiserfs/fix_node.c | 2
-rw-r--r--  fs/reiserfs/journal.c | 8
-rw-r--r--  fs/select.c | 307
-rw-r--r--  fs/ufs/truncate.c | 8
-rw-r--r--  include/asm-alpha/agp.h | 11
-rw-r--r--  include/asm-i386/agp.h | 23
-rw-r--r--  include/asm-i386/cacheflush.h | 3
-rw-r--r--  include/asm-i386/io.h | 26
-rw-r--r--  include/asm-i386/kmap_types.h | 9
-rw-r--r--  include/asm-i386/page.h | 3
-rw-r--r--  include/asm-i386/pgtable-2level.h | 1
-rw-r--r--  include/asm-i386/pgtable-3level.h | 2
-rw-r--r--  include/asm-i386/pgtable.h | 3
-rw-r--r--  include/asm-ia64/agp.h | 11
-rw-r--r--  include/asm-ppc/kmap_types.h | 3
-rw-r--r--  include/asm-sparc/kmap_types.h | 3
-rw-r--r--  include/asm-sparc64/agp.h | 11
-rw-r--r--  include/asm-x86_64/agp.h | 23
-rw-r--r--  include/asm-x86_64/cacheflush.h | 3
-rw-r--r--  include/asm-x86_64/i387.h | 11
-rw-r--r--  include/asm-x86_64/ia32.h | 2
-rw-r--r--  include/asm-x86_64/ipc.h | 30
-rw-r--r--  include/asm-x86_64/kmap_types.h | 3
-rw-r--r--  include/asm-x86_64/mmu_context.h | 12
-rw-r--r--  include/asm-x86_64/msr.h | 21
-rw-r--r--  include/asm-x86_64/mtrr.h | 42
-rw-r--r--  include/asm-x86_64/pda.h | 2
-rw-r--r--  include/asm-x86_64/processor.h | 11
-rw-r--r--  include/asm-x86_64/spinlock.h | 6
-rw-r--r--  include/asm-x86_64/string.h | 13
-rw-r--r--  include/asm-x86_64/suspend.h | 6
-rw-r--r--  include/asm-x86_64/system.h | 7
-rw-r--r--  include/asm-x86_64/timex.h | 2
-rw-r--r--  include/asm-x86_64/tlbflush.h | 9
-rw-r--r--  include/linux/bio.h | 50
-rw-r--r--  include/linux/blkdev.h | 7
-rw-r--r--  include/linux/buffer_head.h | 24
-rw-r--r--  include/linux/highmem.h | 44
-rw-r--r--  include/linux/ide.h | 1
-rw-r--r--  include/linux/jbd.h | 1
-rw-r--r--  include/linux/loop.h | 8
-rw-r--r--  include/linux/poll.h | 49
-rw-r--r--  include/linux/raid/raid5.h | 1
-rw-r--r--  include/linux/reiserfs_fs.h | 2
-rw-r--r--  include/linux/sched.h | 2
-rw-r--r--  include/linux/swap.h | 32
-rw-r--r--  include/linux/sysctl.h | 19
-rw-r--r--  include/linux/timer.h | 2
-rw-r--r--  include/linux/tqueue.h | 3
-rw-r--r--  include/linux/vmalloc.h | 3
-rw-r--r--  include/linux/writeback.h | 6
-rw-r--r--  kernel/context.c | 1
-rw-r--r--  kernel/kmod.c | 1
-rw-r--r--  kernel/ksyms.c | 5
-rw-r--r--  kernel/suspend.c | 35
-rw-r--r--  kernel/sys.c | 1
-rw-r--r--  kernel/sysctl.c | 14
-rw-r--r--  kernel/timer.c | 10
-rw-r--r--  lib/radix-tree.c | 2
-rw-r--r--  mm/filemap.c | 68
-rw-r--r--  mm/highmem.c | 5
-rw-r--r--  mm/msync.c | 10
-rw-r--r--  mm/page-writeback.c | 68
-rw-r--r--  mm/page_io.c | 221
-rw-r--r--  mm/shmem.c | 22
-rw-r--r--  mm/swap_state.c | 74
-rw-r--r--  mm/swapfile.c | 404
-rw-r--r--  mm/vmalloc.c | 28
-rw-r--r--  mm/vmscan.c | 23
-rw-r--r--  net/ipv4/route.c | 4
158 files changed, 2799 insertions, 2645 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index d636ae84e508..c894fcceb996 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -50,27 +50,27 @@ prototypes:
int (*removexattr) (struct dentry *, const char *);
locking rules:
- all may block
- BKL i_sem(inode)
-lookup: no yes
-create: no yes
-link: no yes (both)
-mknod: no yes
-symlink: no yes
-mkdir: no yes
-unlink: no yes (both)
-rmdir: no yes (both) (see below)
-rename: no yes (all) (see below)
-readlink: no no
-follow_link: no no
-truncate: no yes (see below)
-setattr: no yes
-permission: yes no
-getattr: no no
-setxattr: no yes
-getxattr: no yes
-listxattr: no yes
-removexattr: no yes
+ all may block, none have BKL
+ i_sem(inode)
+lookup: yes
+create: yes
+link: yes (both)
+mknod: yes
+symlink: yes
+mkdir: yes
+unlink: yes (both)
+rmdir: yes (both) (see below)
+rename: yes (all) (see below)
+readlink: no
+follow_link: no
+truncate: yes (see below)
+setattr: yes
+permission: no
+getattr: no
+setxattr: yes
+getxattr: yes
+listxattr: yes
+removexattr: yes
Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_sem on
victim.
cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem.
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index ef49709ee8ad..85281b6f4ff0 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -81,9 +81,9 @@ can relax your locking.
[mandatory]
->lookup(), ->truncate(), ->create(), ->unlink(), ->mknod(), ->mkdir(),
-->rmdir(), ->link(), ->lseek(), ->symlink(), ->rename() and ->readdir()
-are called without BKL now. Grab it on the entry, drop upon return - that
-will guarantee the same locking you used to have. If your method or its
+->rmdir(), ->link(), ->lseek(), ->symlink(), ->rename(), ->permission()
+and ->readdir() are called without BKL now. Grab it on entry, drop upon return
+- that will guarantee the same locking you used to have. If your method or its
parts do not need BKL - better yet, now you can shift lock_kernel() and
unlock_kernel() so that they would protect exactly what needs to be
protected.
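
(Illustrative sketch only, not part of the patch: a filesystem that still depends on the old BKL semantics can preserve them by taking the kernel lock inside the method itself, as the porting note above describes. The example_permission() name and its trivial check are made up for illustration; only lock_kernel()/unlock_kernel() and the ->permission() signature come from the kernel of this era.)

	#include <linux/fs.h>
	#include <linux/smp_lock.h>
	#include <linux/errno.h>

	static int example_permission(struct inode *inode, int mask)
	{
		int err = 0;

		lock_kernel();			/* the VFS no longer holds the BKL here */
		if ((mask & MAY_WRITE) && IS_RDONLY(inode))
			err = -EROFS;		/* stand-in for the fs's real checks */
		unlock_kernel();
		return err;
	}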
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index f93b1544c6b2..57597335536d 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -948,120 +948,43 @@ program to load modules on demand.
-----------------------------------------------
The files in this directory can be used to tune the operation of the virtual
-memory (VM) subsystem of the Linux kernel. In addition, one of the files
-(bdflush) has some influence on disk usage.
+memory (VM) subsystem of the Linux kernel.
-bdflush
--------
-
-This file controls the operation of the bdflush kernel daemon. It currently
-contains nine integer values, six of which are actually used by the kernel.
-They are listed in table 2-2.
-
-
-Table 2-2: Parameters in /proc/sys/vm/bdflush
-..............................................................................
- Value Meaning
- nfract Percentage of buffer cache dirty to activate bdflush
- ndirty Maximum number of dirty blocks to write out per wake-cycle
- nrefill Number of clean buffers to try to obtain each time we call refill
- nref_dirt buffer threshold for activating bdflush when trying to refill
- buffers.
- dummy Unused
- age_buffer Time for normal buffer to age before we flush it
- age_super Time for superblock to age before we flush it
- dummy Unused
- dummy Unused
-..............................................................................
-
-nfract
-------
-
-This parameter governs the maximum number of dirty buffers in the buffer
-cache. Dirty means that the contents of the buffer still have to be written to
-disk (as opposed to a clean buffer, which can just be forgotten about).
-Setting this to a higher value means that Linux can delay disk writes for a
-long time, but it also means that it will have to do a lot of I/O at once when
-memory becomes short. A lower value will spread out disk I/O more evenly.
-
-ndirty
-------
-
-Ndirty gives the maximum number of dirty buffers that bdflush can write to the
-disk at one time. A high value will mean delayed, bursty I/O, while a small
-value can lead to memory shortage when bdflush isn't woken up often enough.
-
-nrefill
--------
-
-This is the number of buffers that bdflush will add to the list of free
-buffers when refill_freelist() is called. It is necessary to allocate free
-buffers beforehand, since the buffers are often different sizes than the
-memory pages and some bookkeeping needs to be done beforehand. The higher the
-number, the more memory will be wasted and the less often refill_freelist()
-will need to run.
-
-nref_dirt
----------
-
-When refill_freelist() comes across more than nref_dirt dirty buffers, it will
-wake up bdflush.
-
-age_buffer and age_super
-------------------------
-
-Finally, the age_buffer and age_super parameters govern the maximum time Linux
-waits before writing out a dirty buffer to disk. The value is expressed in
-jiffies (clockticks), the number of jiffies per second is 100. Age_buffer is
-the maximum age for data blocks, while age_super is for filesystems meta data.
-
-buffermem
----------
-
-The three values in this file control how much memory should be used for
-buffer memory. The percentage is calculated as a percentage of total system
-memory.
-
-The values are:
-
-min_percent
------------
+dirty_background_ratio
+----------------------
-This is the minimum percentage of memory that should be spent on buffer
-memory.
+Contains, as a percentage of total system memory, the number of pages at which
+the pdflush background writeback daemon will start writing out dirty data.
-borrow_percent
---------------
+dirty_async_ratio
+-----------------
-When Linux is short on memory, and the buffer cache uses more than it has been
-allotted, the memory management (MM) subsystem will prune the buffer cache
-more heavily than other memory to compensate.
+Contains, as a percentage of total system memory, the number of pages at which
+a process which is generating disk writes will itself start writing out dirty
+data.
-max_percent
------------
+dirty_sync_ratio
+----------------
-This is the maximum amount of memory that can be used for buffer memory.
+Contains, as a percentage of total system memory, the number of pages at which
+a process which is generating disk writes will itself start writing out dirty
+data and waiting upon completion of that writeout.
-freepages
----------
+dirty_writeback_centisecs
+-------------------------
-This file contains three values: min, low and high:
+The pdflush writeback daemons will periodically wake up and write `old' data
+out to disk. This tunable expresses the interval between those wakeups, in
+100'ths of a second.
-min
----
-When the number of free pages in the system reaches this number, only the
-kernel can allocate more memory.
+dirty_expire_centisecs
+----------------------
-low
----
-If the number of free pages falls below this point, the kernel starts swapping
-aggressively.
+This tunable is used to define when dirty data is old enough to be eligible
+for writeout by the pdflush daemons. It is expressed in 100'ths of a second.
+Data which has been dirty in-memory for longer than this interval will be
+written out next time a pdflush daemon wakes up.
-high
-----
-The kernel tries to keep up to this amount of memory free; if memory falls
-below this point, the kernel starts gently swapping in the hopes that it never
-has to do really aggressive swapping.
kswapd
------
@@ -1113,79 +1036,6 @@ On the other hand, enabling this feature can cause you to run out of memory
and thrash the system to death, so large and/or important servers will want to
set this value to 0.
-pagecache
----------
-
-This file does exactly the same job as buffermem, only this file controls the
-amount of memory allowed for memory mapping and generic caching of files.
-
-You don't want the minimum level to be too low, otherwise your system might
-thrash when memory is tight or fragmentation is high.
-
-pagetable_cache
----------------
-
-The kernel keeps a number of page tables in a per-processor cache (this helps
-a lot on SMP systems). The cache size for each processor will be between the
-low and the high value.
-
-On a low-memory, single CPU system, you can safely set these values to 0 so
-you don't waste memory. It is used on SMP systems so that the system can
-perform fast pagetable allocations without having to acquire the kernel memory
-lock.
-
-For large systems, the settings are probably fine. For normal systems they
-won't hurt a bit. For small systems ( less than 16MB ram) it might be
-advantageous to set both values to 0.
-
-swapctl
--------
-
-This file contains no less than 8 variables. All of these values are used by
-kswapd.
-
-The first four variables
-* sc_max_page_age,
-* sc_page_advance,
-* sc_page_decline and
-* sc_page_initial_age
-are used to keep track of Linux's page aging. Page aging is a bookkeeping
-method to track which pages of memory are often used, and which pages can be
-swapped out without consequences.
-
-When a page is swapped in, it starts at sc_page_initial_age (default 3) and
-when the page is scanned by kswapd, its age is adjusted according to the
-following scheme:
-
-* If the page was used since the last time we scanned, its age is increased
- by sc_page_advance (default 3). Where the maximum value is given by
- sc_max_page_age (default 20).
-* Otherwise (meaning it wasn't used) its age is decreased by sc_page_decline
- (default 1).
-
-When a page reaches age 0, it's ready to be swapped out.
-
-The variables sc_age_cluster_fract, sc_age_cluster_min, sc_pageout_weight and
-sc_bufferout_weight, can be used to control kswapd's aggressiveness in
-swapping out pages.
-
-Sc_age_cluster_fract is used to calculate how many pages from a process are to
-be scanned by kswapd. The formula used is
-
-(sc_age_cluster_fract divided by 1024) times resident set size
-
-So if you want kswapd to scan the whole process, sc_age_cluster_fract needs to
-have a value of 1024. The minimum number of pages kswapd will scan is
-represented by sc_age_cluster_min, which is done so that kswapd will also scan
-small processes.
-
-The values of sc_pageout_weight and sc_bufferout_weight are used to control
-how many tries kswapd will make in order to swap out one page/buffer. These
-values can be used to fine-tune the ratio between user pages and buffer/cache
-memory. When you find that your Linux system is swapping out too many process
-pages in order to satisfy buffer memory demands, you may want to either
-increase sc_bufferout_weight, or decrease the value of sc_pageout_weight.
-
2.5 /proc/sys/dev - Device specific parameters
----------------------------------------------
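
(Illustrative sketch only, not part of the patch: the dirty_* tunables documented above are plain integer files under /proc/sys/vm, so they can be inspected from user space like any other file. This is a minimal reader; writing a new value back works the same way with fopen(..., "w").)

	#include <stdio.h>

	int main(void)
	{
		static const char *files[] = {
			"/proc/sys/vm/dirty_background_ratio",
			"/proc/sys/vm/dirty_async_ratio",
			"/proc/sys/vm/dirty_sync_ratio",
			"/proc/sys/vm/dirty_writeback_centisecs",
			"/proc/sys/vm/dirty_expire_centisecs",
		};
		unsigned int i;
		int val;

		for (i = 0; i < sizeof(files) / sizeof(files[0]); i++) {
			FILE *f = fopen(files[i], "r");
			if (!f)
				continue;
			if (fscanf(f, "%d", &val) == 1)
				printf("%s = %d\n", files[i], val);
			fclose(f);
		}
		return 0;
	}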
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index bf9abe829e40..b8221db90cde 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -9,116 +9,28 @@ This file contains the documentation for the sysctl files in
/proc/sys/vm and is valid for Linux kernel version 2.2.
The files in this directory can be used to tune the operation
-of the virtual memory (VM) subsystem of the Linux kernel, and
-one of the files (bdflush) also has a little influence on disk
-usage.
+of the virtual memory (VM) subsystem of the Linux kernel and
+the writeout of dirty data to disk.
Default values and initialization routines for most of these
files can be found in mm/swap.c.
Currently, these files are in /proc/sys/vm:
-- bdflush
-- buffermem
-- freepages
- kswapd
- overcommit_memory
- page-cluster
-- pagecache
-- pagetable_cache
+- dirty_async_ratio
+- dirty_background_ratio
+- dirty_expire_centisecs
+- dirty_sync_ratio
+- dirty_writeback_centisecs
==============================================================
-bdflush:
-
-This file controls the operation of the bdflush kernel
-daemon. The source code to this struct can be found in
-linux/fs/buffer.c. It currently contains 9 integer values,
-of which 4 are actually used by the kernel.
-
-From linux/fs/buffer.c:
---------------------------------------------------------------
-union bdflush_param {
- struct {
- int nfract; /* Percentage of buffer cache dirty to
- activate bdflush */
- int dummy1; /* old "ndirty" */
- int dummy2; /* old "nrefill" */
- int dummy3; /* unused */
- int interval; /* jiffies delay between kupdate flushes */
- int age_buffer; /* Time for normal buffer to age */
- int nfract_sync;/* Percentage of buffer cache dirty to
- activate bdflush synchronously */
- int dummy4; /* unused */
- int dummy5; /* unused */
- } b_un;
- unsigned int data[N_PARAM];
-} bdf_prm = {{30, 64, 64, 256, 5*HZ, 30*HZ, 60, 0, 0}};
---------------------------------------------------------------
-
-int nfract:
-The first parameter governs the maximum number of dirty
-buffers in the buffer cache. Dirty means that the contents
-of the buffer still have to be written to disk (as opposed
-to a clean buffer, which can just be forgotten about).
-Setting this to a high value means that Linux can delay disk
-writes for a long time, but it also means that it will have
-to do a lot of I/O at once when memory becomes short. A low
-value will spread out disk I/O more evenly, at the cost of
-more frequent I/O operations. The default value is 30%,
-the minimum is 0%, and the maximum is 100%.
-
-int interval:
-The fifth parameter, interval, is the minimum rate at
-which kupdate will wake and flush. The value is expressed in
-jiffies (clockticks), the number of jiffies per second is
-normally 100 (Alpha is 1024). Thus, x*HZ is x seconds. The
-default value is 5 seconds, the minimum is 0 seconds, and the
-maximum is 600 seconds.
-
-int age_buffer:
-The sixth parameter, age_buffer, governs the maximum time
-Linux waits before writing out a dirty buffer to disk. The
-value is in jiffies. The default value is 30 seconds,
-the minimum is 1 second, and the maximum 6,000 seconds.
-
-int nfract_sync:
-The seventh parameter, nfract_sync, governs the percentage
-of buffer cache that is dirty before bdflush activates
-synchronously. This can be viewed as the hard limit before
-bdflush forces buffers to disk. The default is 60%, the
-minimum is 0%, and the maximum is 100%.
-
-==============================================================
-buffermem:
-
-The three values in this file correspond to the values in
-the struct buffer_mem. It controls how much memory should
-be used for buffer memory. The percentage is calculated
-as a percentage of total system memory.
-
-The values are:
-min_percent -- this is the minimum percentage of memory
- that should be spent on buffer memory
-borrow_percent -- UNUSED
-max_percent -- UNUSED
-
-==============================================================
-freepages:
+dirty_async_ratio, dirty_background_ratio, dirty_expire_centisecs,
+dirty_sync_ratio, dirty_writeback_centisecs:
-This file contains the values in the struct freepages. That
-struct contains three members: min, low and high.
-
-The meaning of the numbers is:
-
-freepages.min When the number of free pages in the system
- reaches this number, only the kernel can
- allocate more memory.
-freepages.low If the number of free pages gets below this
- point, the kernel starts swapping aggressively.
-freepages.high The kernel tries to keep up to this amount of
- memory free; if memory comes below this point,
- the kernel gently starts swapping in the hopes
- that it never has to do real aggressive swapping.
+See Documentation/filesystems/proc.txt
==============================================================
@@ -180,38 +92,3 @@ The number of pages the kernel reads in at once is equal to
2 ^ page-cluster. Values above 2 ^ 5 don't make much sense
for swap because we only cluster swap data in 32-page groups.
-==============================================================
-
-pagecache:
-
-This file does exactly the same as buffermem, only this
-file controls the struct page_cache, and thus controls
-the amount of memory used for the page cache.
-
-In 2.2, the page cache is used for 3 main purposes:
-- caching read() data from files
-- caching mmap()ed data and executable files
-- swap cache
-
-When your system is both deep in swap and high on cache,
-it probably means that a lot of the swapped data is being
-cached, making for more efficient swapping than possible
-with the 2.0 kernel.
-
-==============================================================
-
-pagetable_cache:
-
-The kernel keeps a number of page tables in a per-processor
-cache (this helps a lot on SMP systems). The cache size for
-each processor will be between the low and the high value.
-
-On a low-memory, single CPU system you can safely set these
-values to 0 so you don't waste the memory. On SMP systems it
-is used so that the system can do fast pagetable allocations
-without having to acquire the kernel memory lock.
-
-For large systems, the settings are probably OK. For normal
-systems they won't hurt a bit. For small systems (<16MB ram)
-it might be advantageous to set both values to 0.
-
diff --git a/arch/alpha/kernel/time.c b/arch/alpha/kernel/time.c
index 0be250e543e8..93a569828d70 100644
--- a/arch/alpha/kernel/time.c
+++ b/arch/alpha/kernel/time.c
@@ -48,6 +48,8 @@
#include "proto.h"
#include "irq_impl.h"
+u64 jiffies_64;
+
extern rwlock_t xtime_lock;
extern unsigned long wall_jiffies; /* kernel/timer.c */
diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c
index 7c7e03c5b6e9..cd00aacc74a9 100644
--- a/arch/arm/kernel/time.c
+++ b/arch/arm/kernel/time.c
@@ -32,6 +32,8 @@
#include <asm/irq.h>
#include <asm/leds.h>
+u64 jiffies_64;
+
extern rwlock_t xtime_lock;
extern unsigned long wall_jiffies;
diff --git a/arch/cris/kernel/time.c b/arch/cris/kernel/time.c
index 537040f95a6d..1ee0bbfeab7e 100644
--- a/arch/cris/kernel/time.c
+++ b/arch/cris/kernel/time.c
@@ -44,6 +44,8 @@
#include <asm/svinto.h>
+u64 jiffies_64;
+
static int have_rtc; /* used to remember if we have an RTC or not */
/* define this if you need to use print_timestamp */
diff --git a/arch/i386/kernel/irq.c b/arch/i386/kernel/irq.c
index 8608a903f86d..4265cb038a5a 100644
--- a/arch/i386/kernel/irq.c
+++ b/arch/i386/kernel/irq.c
@@ -360,8 +360,9 @@ void __global_cli(void)
__save_flags(flags);
if (flags & (1 << EFLAGS_IF_SHIFT)) {
- int cpu = smp_processor_id();
+ int cpu;
__cli();
+ cpu = smp_processor_id();
if (!local_irq_count(cpu))
get_irqlock(cpu);
}
@@ -369,11 +370,12 @@ void __global_cli(void)
void __global_sti(void)
{
- int cpu = smp_processor_id();
+ int cpu = get_cpu();
if (!local_irq_count(cpu))
release_irqlock(cpu);
__sti();
+ put_cpu();
}
/*
diff --git a/arch/i386/kernel/time.c b/arch/i386/kernel/time.c
index 1e1eb0d3a5f7..f56251513581 100644
--- a/arch/i386/kernel/time.c
+++ b/arch/i386/kernel/time.c
@@ -65,6 +65,7 @@
*/
#include <linux/irq.h>
+u64 jiffies_64;
unsigned long cpu_khz; /* Detected as we calibrate the TSC */
diff --git a/arch/i386/mm/Makefile b/arch/i386/mm/Makefile
index 73e25bd3022a..67df8b6f6594 100644
--- a/arch/i386/mm/Makefile
+++ b/arch/i386/mm/Makefile
@@ -9,6 +9,7 @@
O_TARGET := mm.o
-obj-y := init.o fault.o ioremap.o extable.o
+obj-y := init.o fault.o ioremap.o extable.o pageattr.o
+export-objs := pageattr.o
include $(TOPDIR)/Rules.make
diff --git a/arch/i386/mm/ioremap.c b/arch/i386/mm/ioremap.c
index f81fae4ff7a9..4ba5641b271f 100644
--- a/arch/i386/mm/ioremap.c
+++ b/arch/i386/mm/ioremap.c
@@ -10,12 +10,13 @@
#include <linux/vmalloc.h>
#include <linux/init.h>
+#include <linux/slab.h>
#include <asm/io.h>
#include <asm/pgalloc.h>
#include <asm/fixmap.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
-
+#include <asm/pgtable.h>
static inline void remap_area_pte(pte_t * pte, unsigned long address, unsigned long size,
unsigned long phys_addr, unsigned long flags)
@@ -155,6 +156,7 @@ void * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flag
area = get_vm_area(size, VM_IOREMAP);
if (!area)
return NULL;
+ area->phys_addr = phys_addr;
addr = area->addr;
if (remap_area_pages(VMALLOC_VMADDR(addr), phys_addr, size, flags)) {
vfree(addr);
@@ -163,10 +165,71 @@ void * __ioremap(unsigned long phys_addr, unsigned long size, unsigned long flag
return (void *) (offset + (char *)addr);
}
+
+/**
+ * ioremap_nocache - map bus memory into CPU space
+ * @offset: bus address of the memory
+ * @size: size of the resource to map
+ *
+ * ioremap_nocache performs a platform specific sequence of operations to
+ * make bus memory CPU accessible via the readb/readw/readl/writeb/
+ * writew/writel functions and the other mmio helpers. The returned
+ * address is not guaranteed to be usable directly as a virtual
+ * address.
+ *
+ * This version of ioremap ensures that the memory is marked uncachable
+ * on the CPU as well as honouring existing caching rules from things like
+ * the PCI bus. Note that there are other caches and buffers on many
+ * busses. In particular driver authors should read up on PCI writes
+ *
+ * It's useful if some control registers are in such an area and
+ * write combining or read caching is not desirable:
+ *
+ * Must be freed with iounmap.
+ */
+
+void *ioremap_nocache (unsigned long phys_addr, unsigned long size)
+{
+ void *p = __ioremap(phys_addr, size, _PAGE_PCD);
+ if (!p)
+ return p;
+
+ if (phys_addr + size < virt_to_phys(high_memory)) {
+ struct page *ppage = virt_to_page(__va(phys_addr));
+ unsigned long npages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
+ BUG_ON(phys_addr+size > (unsigned long)high_memory);
+ BUG_ON(phys_addr + size < phys_addr);
+
+ if (change_page_attr(ppage, npages, PAGE_KERNEL_NOCACHE) < 0) {
+ iounmap(p);
+ p = NULL;
+ }
+ }
+
+ return p;
+}
+
void iounmap(void *addr)
{
- if (addr > high_memory)
- return vfree((void *) (PAGE_MASK & (unsigned long) addr));
+ struct vm_struct *p;
+ if (addr < high_memory)
+ return;
+ p = remove_kernel_area(addr);
+ if (!p) {
+ printk("__iounmap: bad address %p\n", addr);
+ return;
+ }
+
+ BUG_ON(p->phys_addr == 0); /* not allocated with ioremap */
+
+ vmfree_area_pages(VMALLOC_VMADDR(p->addr), p->size);
+ if (p->flags && p->phys_addr < virt_to_phys(high_memory)) {
+ change_page_attr(virt_to_page(__va(p->phys_addr)),
+ p->size >> PAGE_SHIFT,
+ PAGE_KERNEL);
+ }
+ kfree(p);
}
void __init *bt_ioremap(unsigned long phys_addr, unsigned long size)
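
(Illustrative sketch only, not part of the patch: how a driver might use the new ioremap_nocache() for control registers where read caching or write combining is undesirable, as the kernel-doc comment above recommends. REG_BASE, REG_SIZE and CTRL_REG are made-up values; the mapping must be released with iounmap().)

	#include <linux/errno.h>
	#include <asm/io.h>

	#define REG_BASE	0xfebf0000UL	/* hypothetical bus address of the registers */
	#define REG_SIZE	0x1000UL
	#define CTRL_REG	0x04		/* hypothetical register offset */

	static void *example_regs;

	static int example_map_regs(void)
	{
		example_regs = ioremap_nocache(REG_BASE, REG_SIZE);
		if (!example_regs)
			return -ENOMEM;
		/* go through the mmio helpers, not the raw pointer */
		writel(0x1, example_regs + CTRL_REG);
		return 0;
	}

	static void example_unmap_regs(void)
	{
		iounmap(example_regs);		/* must be freed with iounmap */
	}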
diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c
new file mode 100644
index 000000000000..c5e2374b6bc7
--- /dev/null
+++ b/arch/i386/mm/pageattr.c
@@ -0,0 +1,197 @@
+/*
+ * Copyright 2002 Andi Kleen, SuSE Labs.
+ * Thanks to Ben LaHaise for precious feedback.
+ */
+
+#include <linux/config.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/highmem.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <asm/uaccess.h>
+#include <asm/processor.h>
+
+static inline pte_t *lookup_address(unsigned long address)
+{
+ pgd_t *pgd = pgd_offset_k(address);
+ pmd_t *pmd = pmd_offset(pgd, address);
+ if (pmd_large(*pmd))
+ return (pte_t *)pmd;
+ return pte_offset_kernel(pmd, address);
+}
+
+static struct page *split_large_page(unsigned long address, pgprot_t prot)
+{
+ int i;
+ unsigned long addr;
+ struct page *base = alloc_pages(GFP_KERNEL, 0);
+ pte_t *pbase;
+ if (!base)
+ return NULL;
+ address = __pa(address);
+ addr = address & LARGE_PAGE_MASK;
+ pbase = (pte_t *)page_address(base);
+ for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
+ pbase[i] = pfn_pte(addr >> PAGE_SHIFT,
+ addr == address ? prot : PAGE_KERNEL);
+ }
+ return base;
+}
+
+static void flush_kernel_map(void *dummy)
+{
+ /* Could use CLFLUSH here if the CPU supports it (Hammer,P4) */
+ if (boot_cpu_data.x86_model >= 4)
+ asm volatile("wbinvd":::"memory");
+ /* Flush all to work around Errata in early athlons regarding
+ * large page flushing.
+ */
+ __flush_tlb_all();
+}
+
+static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
+{
+ set_pte_atomic(kpte, pte); /* change init_mm */
+#ifndef CONFIG_X86_PAE
+ {
+ struct list_head *l;
+ spin_lock(&mmlist_lock);
+ list_for_each(l, &init_mm.mmlist) {
+ struct mm_struct *mm = list_entry(l, struct mm_struct, mmlist);
+ pmd_t *pmd = pmd_offset(pgd_offset(mm, address), address);
+ set_pte_atomic((pte_t *)pmd, pte);
+ }
+ spin_unlock(&mmlist_lock);
+ }
+#endif
+}
+
+/*
+ * No more special protections in this 2/4MB area - revert to a
+ * large page again.
+ */
+static inline void revert_page(struct page *kpte_page, unsigned long address)
+{
+ pte_t *linear = (pte_t *)
+ pmd_offset(pgd_offset(&init_mm, address), address);
+ set_pmd_pte(linear, address,
+ pfn_pte((__pa(address) & LARGE_PAGE_MASK) >> PAGE_SHIFT,
+ PAGE_KERNEL_LARGE));
+}
+
+static int
+__change_page_attr(struct page *page, pgprot_t prot, struct page **oldpage)
+{
+ pte_t *kpte;
+ unsigned long address;
+ struct page *kpte_page;
+
+#ifdef CONFIG_HIGHMEM
+ if (page >= highmem_start_page)
+ BUG();
+#endif
+ address = (unsigned long)page_address(page);
+
+ kpte = lookup_address(address);
+ kpte_page = virt_to_page(((unsigned long)kpte) & PAGE_MASK);
+ if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) {
+ if ((pte_val(*kpte) & _PAGE_PSE) == 0) {
+ pte_t old = *kpte;
+ pte_t standard = mk_pte(page, PAGE_KERNEL);
+
+ set_pte_atomic(kpte, mk_pte(page, prot));
+ if (pte_same(old,standard))
+ atomic_inc(&kpte_page->count);
+ } else {
+ struct page *split = split_large_page(address, prot);
+ if (!split)
+ return -ENOMEM;
+ set_pmd_pte(kpte,address,mk_pte(split, PAGE_KERNEL));
+ }
+ } else if ((pte_val(*kpte) & _PAGE_PSE) == 0) {
+ set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL));
+ atomic_dec(&kpte_page->count);
+ }
+
+ if (cpu_has_pse && (atomic_read(&kpte_page->count) == 1)) {
+ *oldpage = kpte_page;
+ revert_page(kpte_page, address);
+ }
+ return 0;
+}
+
+static inline void flush_map(void)
+{
+#ifdef CONFIG_SMP
+ smp_call_function(flush_kernel_map, NULL, 1, 1);
+#endif
+ flush_kernel_map(NULL);
+}
+
+struct deferred_page {
+ struct deferred_page *next;
+ struct page *fpage;
+};
+static struct deferred_page *df_list; /* protected by init_mm.mmap_sem */
+
+/*
+ * Change the page attributes of an page in the linear mapping.
+ *
+ * This should be used when a page is mapped with a different caching policy
+ * than write-back somewhere - some CPUs do not like it when mappings with
+ * different caching policies exist. This changes the page attributes of the
+ * in kernel linear mapping too.
+ *
+ * The caller needs to ensure that there are no conflicting mappings elsewhere.
+ * This function only deals with the kernel linear map.
+ *
+ * Caller must call global_flush_tlb() after this.
+ */
+int change_page_attr(struct page *page, int numpages, pgprot_t prot)
+{
+ int err = 0;
+ struct page *fpage;
+ int i;
+
+ down_write(&init_mm.mmap_sem);
+ for (i = 0; i < numpages; i++, page++) {
+ fpage = NULL;
+ err = __change_page_attr(page, prot, &fpage);
+ if (err)
+ break;
+ if (fpage) {
+ struct deferred_page *df;
+ df = kmalloc(sizeof(struct deferred_page), GFP_KERNEL);
+ if (!df) {
+ flush_map();
+ __free_page(fpage);
+ } else {
+ df->next = df_list;
+ df->fpage = fpage;
+ df_list = df;
+ }
+ }
+ }
+ up_write(&init_mm.mmap_sem);
+ return err;
+}
+
+void global_flush_tlb(void)
+{
+ struct deferred_page *df, *next_df;
+
+ down_read(&init_mm.mmap_sem);
+ df = xchg(&df_list, NULL);
+ up_read(&init_mm.mmap_sem);
+ flush_map();
+ for (; df; df = next_df) {
+ next_df = df->next;
+ if (df->fpage)
+ __free_page(df->fpage);
+ kfree(df);
+ }
+}
+
+EXPORT_SYMBOL(change_page_attr);
+EXPORT_SYMBOL(global_flush_tlb);
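
(Illustrative sketch only, not part of the patch: a caller of the new interface, following the rule stated in the comment above that global_flush_tlb() must be called after change_page_attr(). The example_make_uncached() wrapper is hypothetical; change_page_attr(), global_flush_tlb() and PAGE_KERNEL_NOCACHE come from the patch itself.)

	#include <linux/mm.h>
	#include <asm/pgtable.h>
	#include <asm/cacheflush.h>

	static int example_make_uncached(struct page *pg, int npages)
	{
		int err = change_page_attr(pg, npages, PAGE_KERNEL_NOCACHE);

		global_flush_tlb();	/* flush stale TLB/cache entries on all CPUs */
		return err;
	}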
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index dc6500b7a167..1c348cce1fdd 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -27,6 +27,8 @@ extern rwlock_t xtime_lock;
extern unsigned long wall_jiffies;
extern unsigned long last_time_offset;
+u64 jiffies_64;
+
#ifdef CONFIG_IA64_DEBUG_IRQ
unsigned long last_cli_ip;
diff --git a/arch/m68k/kernel/time.c b/arch/m68k/kernel/time.c
index a845040b339a..54b8f68cf7e0 100644
--- a/arch/m68k/kernel/time.c
+++ b/arch/m68k/kernel/time.c
@@ -24,6 +24,7 @@
#include <linux/timex.h>
+u64 jiffies_64;
static inline int set_rtc_mmss(unsigned long nowtime)
{
diff --git a/arch/mips/kernel/time.c b/arch/mips/kernel/time.c
index e548314773de..6ea186b42155 100644
--- a/arch/mips/kernel/time.c
+++ b/arch/mips/kernel/time.c
@@ -32,6 +32,8 @@
#define USECS_PER_JIFFY (1000000/HZ)
#define USECS_PER_JIFFY_FRAC ((1000000ULL << 32) / HZ & 0xffffffff)
+u64 jiffies_64;
+
/*
* forward reference
*/
diff --git a/arch/mips64/kernel/syscall.c b/arch/mips64/kernel/syscall.c
index 6daab491059b..053051c63a25 100644
--- a/arch/mips64/kernel/syscall.c
+++ b/arch/mips64/kernel/syscall.c
@@ -32,6 +32,8 @@
#include <asm/sysmips.h>
#include <asm/uaccess.h>
+u64 jiffies_64;
+
extern asmlinkage void syscall_trace(void);
asmlinkage int sys_pipe(abi64_no_regargs, struct pt_regs regs)
diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c
index 7b3de0e0ada3..e028e6f3dbe2 100644
--- a/arch/parisc/kernel/time.c
+++ b/arch/parisc/kernel/time.c
@@ -30,6 +30,8 @@
#include <linux/timex.h>
+u64 jiffies_64;
+
extern rwlock_t xtime_lock;
static int timer_value;
diff --git a/arch/ppc/kernel/time.c b/arch/ppc/kernel/time.c
index 260345226022..88a4d63ffea0 100644
--- a/arch/ppc/kernel/time.c
+++ b/arch/ppc/kernel/time.c
@@ -70,6 +70,9 @@
#include <asm/time.h>
+/* XXX false sharing with below? */
+u64 jiffies_64;
+
unsigned long disarm_decr[NR_CPUS];
extern int do_sys_settimeofday(struct timeval *tv, struct timezone *tz);
diff --git a/arch/ppc64/kernel/time.c b/arch/ppc64/kernel/time.c
index d00224a05633..9cd390d65342 100644
--- a/arch/ppc64/kernel/time.c
+++ b/arch/ppc64/kernel/time.c
@@ -64,6 +64,8 @@
void smp_local_timer_interrupt(struct pt_regs *);
+u64 jiffies_64;
+
/* keep track of when we need to update the rtc */
time_t last_rtc_update;
extern rwlock_t xtime_lock;
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index 2a135d999830..f09059ee63bd 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -39,6 +39,8 @@
#define TICK_SIZE tick
+u64 jiffies_64;
+
static ext_int_info_t ext_int_info_timer;
static uint64_t init_timer_cc;
diff --git a/arch/s390x/kernel/time.c b/arch/s390x/kernel/time.c
index e12e41e2eaef..b81dcb9683d7 100644
--- a/arch/s390x/kernel/time.c
+++ b/arch/s390x/kernel/time.c
@@ -39,6 +39,8 @@
#define TICK_SIZE tick
+u64 jiffies_64;
+
static ext_int_info_t ext_int_info_timer;
static uint64_t init_timer_cc;
diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c
index 62af96d4fd48..e51e0eb001d6 100644
--- a/arch/sh/kernel/time.c
+++ b/arch/sh/kernel/time.c
@@ -70,6 +70,8 @@
#endif /* CONFIG_CPU_SUBTYPE_ST40STB1 */
#endif /* __sh3__ or __SH4__ */
+u64 jiffies_64;
+
extern rwlock_t xtime_lock;
extern unsigned long wall_jiffies;
#define TICK_SIZE tick
diff --git a/arch/sparc/kernel/time.c b/arch/sparc/kernel/time.c
index 6e7935ab7c56..90d3e8528358 100644
--- a/arch/sparc/kernel/time.c
+++ b/arch/sparc/kernel/time.c
@@ -43,6 +43,8 @@
extern rwlock_t xtime_lock;
+u64 jiffies_64;
+
enum sparc_clock_type sp_clock_typ;
spinlock_t mostek_lock = SPIN_LOCK_UNLOCKED;
unsigned long mstk48t02_regs = 0UL;
diff --git a/arch/sparc64/kernel/time.c b/arch/sparc64/kernel/time.c
index 852c96d62319..47c794e99f4b 100644
--- a/arch/sparc64/kernel/time.c
+++ b/arch/sparc64/kernel/time.c
@@ -44,6 +44,8 @@ unsigned long mstk48t02_regs = 0UL;
unsigned long ds1287_regs = 0UL;
#endif
+u64 jiffies_64;
+
static unsigned long mstk48t08_regs = 0UL;
static unsigned long mstk48t59_regs = 0UL;
diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile
index 3968f838fe7c..46fe5228c782 100644
--- a/arch/x86_64/Makefile
+++ b/arch/x86_64/Makefile
@@ -43,15 +43,9 @@ CFLAGS += -mcmodel=kernel
CFLAGS += -pipe
# this makes reading assembly source easier
CFLAGS += -fno-reorder-blocks
-# needed for later gcc 3.1
CFLAGS += -finline-limit=2000
-# needed for earlier gcc 3.1
-#CFLAGS += -fno-strength-reduce
#CFLAGS += -g
-# prevent gcc from keeping the stack 16 byte aligned (FIXME)
-#CFLAGS += -mpreferred-stack-boundary=2
-
HEAD := arch/x86_64/kernel/head.o arch/x86_64/kernel/head64.o arch/x86_64/kernel/init_task.o
SUBDIRS := arch/x86_64/tools $(SUBDIRS) arch/x86_64/kernel arch/x86_64/mm arch/x86_64/lib
diff --git a/arch/x86_64/boot/Makefile b/arch/x86_64/boot/Makefile
index a82cabc11223..9549b65aaae7 100644
--- a/arch/x86_64/boot/Makefile
+++ b/arch/x86_64/boot/Makefile
@@ -21,10 +21,6 @@ ROOT_DEV := CURRENT
SVGA_MODE := -DSVGA_MODE=NORMAL_VGA
-# If you want the RAM disk device, define this to be the size in blocks.
-
-RAMDISK := -DRAMDISK=512
-
# ---------------------------------------------------------------------------
BOOT_INCL = $(TOPDIR)/include/linux/config.h \
diff --git a/arch/x86_64/config.in b/arch/x86_64/config.in
index 8605598747a8..829a74f439ad 100644
--- a/arch/x86_64/config.in
+++ b/arch/x86_64/config.in
@@ -47,8 +47,7 @@ define_bool CONFIG_EISA n
define_bool CONFIG_X86_IO_APIC y
define_bool CONFIG_X86_LOCAL_APIC y
-#currently broken:
-#bool 'MTRR (Memory Type Range Register) support' CONFIG_MTRR
+bool 'MTRR (Memory Type Range Register) support' CONFIG_MTRR
bool 'Symmetric multi-processing support' CONFIG_SMP
if [ "$CONFIG_SMP" = "n" ]; then
bool 'Preemptible Kernel' CONFIG_PREEMPT
@@ -226,6 +225,7 @@ if [ "$CONFIG_DEBUG_KERNEL" != "n" ]; then
bool ' Spinlock debugging' CONFIG_DEBUG_SPINLOCK
bool ' Additional run-time checks' CONFIG_CHECKING
bool ' Debug __init statements' CONFIG_INIT_DEBUG
+ bool ' Spinlock debugging' CONFIG_DEBUG_SPINLOCK
fi
endmenu
diff --git a/arch/x86_64/ia32/Makefile b/arch/x86_64/ia32/Makefile
index 45c356b60cb5..00e69a2d0060 100644
--- a/arch/x86_64/ia32/Makefile
+++ b/arch/x86_64/ia32/Makefile
@@ -9,8 +9,9 @@ export-objs := ia32_ioctl.o sys_ia32.o
all: ia32.o
O_TARGET := ia32.o
-obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_ioctl.o ia32_signal.o \
- ia32_binfmt.o fpu32.o socket32.o ptrace32.o
+obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_ioctl.o \
+ ia32_signal.o \
+ ia32_binfmt.o fpu32.o socket32.o ptrace32.o ipc32.o
clean::
diff --git a/arch/x86_64/ia32/ipc32.c b/arch/x86_64/ia32/ipc32.c
new file mode 100644
index 000000000000..2d322dda88ef
--- /dev/null
+++ b/arch/x86_64/ia32/ipc32.c
@@ -0,0 +1,645 @@
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/sem.h>
+#include <linux/msg.h>
+#include <linux/mm.h>
+#include <linux/shm.h>
+#include <linux/slab.h>
+#include <linux/ipc.h>
+#include <asm/mman.h>
+#include <asm/types.h>
+#include <asm/uaccess.h>
+#include <asm/semaphore.h>
+#include <asm/ipc.h>
+
+#include <asm/ia32.h>
+
+/*
+ * sys32_ipc() is the de-multiplexer for the SysV IPC calls in 32bit emulation..
+ *
+ * This is really horribly ugly.
+ */
+
+struct msgbuf32 {
+ s32 mtype;
+ char mtext[1];
+};
+
+struct ipc_perm32 {
+ int key;
+ __kernel_uid_t32 uid;
+ __kernel_gid_t32 gid;
+ __kernel_uid_t32 cuid;
+ __kernel_gid_t32 cgid;
+ unsigned short mode;
+ unsigned short seq;
+};
+
+struct ipc64_perm32 {
+ unsigned key;
+ __kernel_uid32_t32 uid;
+ __kernel_gid32_t32 gid;
+ __kernel_uid32_t32 cuid;
+ __kernel_gid32_t32 cgid;
+ unsigned short mode;
+ unsigned short __pad1;
+ unsigned short seq;
+ unsigned short __pad2;
+ unsigned int unused1;
+ unsigned int unused2;
+};
+
+struct semid_ds32 {
+ struct ipc_perm32 sem_perm; /* permissions .. see ipc.h */
+ __kernel_time_t32 sem_otime; /* last semop time */
+ __kernel_time_t32 sem_ctime; /* last change time */
+ u32 sem_base; /* ptr to first semaphore in array */
+ u32 sem_pending; /* pending operations to be processed */
+ u32 sem_pending_last; /* last pending operation */
+ u32 undo; /* undo requests on this array */
+ unsigned short sem_nsems; /* no. of semaphores in array */
+};
+
+struct semid64_ds32 {
+ struct ipc64_perm32 sem_perm;
+ __kernel_time_t32 sem_otime;
+ unsigned int __unused1;
+ __kernel_time_t32 sem_ctime;
+ unsigned int __unused2;
+ unsigned int sem_nsems;
+ unsigned int __unused3;
+ unsigned int __unused4;
+};
+
+struct msqid_ds32 {
+ struct ipc_perm32 msg_perm;
+ u32 msg_first;
+ u32 msg_last;
+ __kernel_time_t32 msg_stime;
+ __kernel_time_t32 msg_rtime;
+ __kernel_time_t32 msg_ctime;
+ u32 wwait;
+ u32 rwait;
+ unsigned short msg_cbytes;
+ unsigned short msg_qnum;
+ unsigned short msg_qbytes;
+ __kernel_ipc_pid_t32 msg_lspid;
+ __kernel_ipc_pid_t32 msg_lrpid;
+};
+
+struct msqid64_ds32 {
+ struct ipc64_perm32 msg_perm;
+ __kernel_time_t32 msg_stime;
+ unsigned int __unused1;
+ __kernel_time_t32 msg_rtime;
+ unsigned int __unused2;
+ __kernel_time_t32 msg_ctime;
+ unsigned int __unused3;
+ unsigned int msg_cbytes;
+ unsigned int msg_qnum;
+ unsigned int msg_qbytes;
+ __kernel_pid_t32 msg_lspid;
+ __kernel_pid_t32 msg_lrpid;
+ unsigned int __unused4;
+ unsigned int __unused5;
+};
+
+struct shmid_ds32 {
+ struct ipc_perm32 shm_perm;
+ int shm_segsz;
+ __kernel_time_t32 shm_atime;
+ __kernel_time_t32 shm_dtime;
+ __kernel_time_t32 shm_ctime;
+ __kernel_ipc_pid_t32 shm_cpid;
+ __kernel_ipc_pid_t32 shm_lpid;
+ unsigned short shm_nattch;
+};
+
+struct shmid64_ds32 {
+ struct ipc64_perm32 shm_perm;
+ __kernel_size_t32 shm_segsz;
+ __kernel_time_t32 shm_atime;
+ unsigned int __unused1;
+ __kernel_time_t32 shm_dtime;
+ unsigned int __unused2;
+ __kernel_time_t32 shm_ctime;
+ unsigned int __unused3;
+ __kernel_pid_t32 shm_cpid;
+ __kernel_pid_t32 shm_lpid;
+ unsigned int shm_nattch;
+ unsigned int __unused4;
+ unsigned int __unused5;
+};
+
+struct shminfo64_32 {
+ unsigned int shmmax;
+ unsigned int shmmin;
+ unsigned int shmmni;
+ unsigned int shmseg;
+ unsigned int shmall;
+ unsigned int __unused1;
+ unsigned int __unused2;
+ unsigned int __unused3;
+ unsigned int __unused4;
+};
+
+struct shm_info32 {
+ int used_ids;
+ u32 shm_tot, shm_rss, shm_swp;
+ u32 swap_attempts, swap_successes;
+};
+
+struct ipc_kludge {
+ struct msgbuf *msgp;
+ int msgtyp;
+};
+
+
+#define A(__x) ((unsigned long)(__x))
+#define AA(__x) ((unsigned long)(__x))
+
+#define SEMOP 1
+#define SEMGET 2
+#define SEMCTL 3
+#define MSGSND 11
+#define MSGRCV 12
+#define MSGGET 13
+#define MSGCTL 14
+#define SHMAT 21
+#define SHMDT 22
+#define SHMGET 23
+#define SHMCTL 24
+
+#define IPCOP_MASK(__x) (1UL << (__x))
+
+static int
+ipc_parse_version32 (int *cmd)
+{
+ if (*cmd & IPC_64) {
+ *cmd ^= IPC_64;
+ return IPC_64;
+ } else {
+ return IPC_OLD;
+ }
+}
+
+static int
+semctl32 (int first, int second, int third, void *uptr)
+{
+ union semun fourth;
+ u32 pad;
+ int err = 0, err2;
+ struct semid64_ds s;
+ mm_segment_t old_fs;
+ int version = ipc_parse_version32(&third);
+
+ if (!uptr)
+ return -EINVAL;
+ if (get_user(pad, (u32 *)uptr))
+ return -EFAULT;
+ if (third == SETVAL)
+ fourth.val = (int)pad;
+ else
+ fourth.__pad = (void *)A(pad);
+ switch (third) {
+ case IPC_INFO:
+ case IPC_RMID:
+ case IPC_SET:
+ case SEM_INFO:
+ case GETVAL:
+ case GETPID:
+ case GETNCNT:
+ case GETZCNT:
+ case GETALL:
+ case SETVAL:
+ case SETALL:
+ err = sys_semctl(first, second, third, fourth);
+ break;
+
+ case IPC_STAT:
+ case SEM_STAT:
+ fourth.__pad = &s;
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ err = sys_semctl(first, second|IPC_64, third, fourth);
+ set_fs(old_fs);
+
+ if (version == IPC_64) {
+ struct semid64_ds32 *usp64 = (struct semid64_ds32 *) A(pad);
+
+ if (!access_ok(VERIFY_WRITE, usp64, sizeof(*usp64))) {
+ err = -EFAULT;
+ break;
+ }
+ err2 = __put_user(s.sem_perm.key, &usp64->sem_perm.key);
+ err2 |= __put_user(s.sem_perm.uid, &usp64->sem_perm.uid);
+ err2 |= __put_user(s.sem_perm.gid, &usp64->sem_perm.gid);
+ err2 |= __put_user(s.sem_perm.cuid, &usp64->sem_perm.cuid);
+ err2 |= __put_user(s.sem_perm.cgid, &usp64->sem_perm.cgid);
+ err2 |= __put_user(s.sem_perm.mode, &usp64->sem_perm.mode);
+ err2 |= __put_user(s.sem_perm.seq, &usp64->sem_perm.seq);
+ err2 |= __put_user(s.sem_otime, &usp64->sem_otime);
+ err2 |= __put_user(s.sem_ctime, &usp64->sem_ctime);
+ err2 |= __put_user(s.sem_nsems, &usp64->sem_nsems);
+ } else {
+ struct semid_ds32 *usp32 = (struct semid_ds32 *) A(pad);
+
+ if (!access_ok(VERIFY_WRITE, usp32, sizeof(*usp32))) {
+ err = -EFAULT;
+ break;
+ }
+ err2 = __put_user(s.sem_perm.key, &usp32->sem_perm.key);
+ err2 |= __put_user(s.sem_perm.uid, &usp32->sem_perm.uid);
+ err2 |= __put_user(s.sem_perm.gid, &usp32->sem_perm.gid);
+ err2 |= __put_user(s.sem_perm.cuid, &usp32->sem_perm.cuid);
+ err2 |= __put_user(s.sem_perm.cgid, &usp32->sem_perm.cgid);
+ err2 |= __put_user(s.sem_perm.mode, &usp32->sem_perm.mode);
+ err2 |= __put_user(s.sem_perm.seq, &usp32->sem_perm.seq);
+ err2 |= __put_user(s.sem_otime, &usp32->sem_otime);
+ err2 |= __put_user(s.sem_ctime, &usp32->sem_ctime);
+ err2 |= __put_user(s.sem_nsems, &usp32->sem_nsems);
+ }
+ if (err2)
+ err = -EFAULT;
+ break;
+ }
+ return err;
+}
+
+static int
+do_sys32_msgsnd (int first, int second, int third, void *uptr)
+{
+ struct msgbuf *p = kmalloc(second + sizeof(struct msgbuf) + 4, GFP_USER);
+ struct msgbuf32 *up = (struct msgbuf32 *)uptr;
+ mm_segment_t old_fs;
+ int err;
+
+ if (!p)
+ return -ENOMEM;
+ err = get_user(p->mtype, &up->mtype);
+ err |= copy_from_user(p->mtext, &up->mtext, second);
+ if (err)
+ goto out;
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ err = sys_msgsnd(first, p, second, third);
+ set_fs(old_fs);
+ out:
+ kfree(p);
+ return err;
+}
+
+static int
+do_sys32_msgrcv (int first, int second, int msgtyp, int third, int version, void *uptr)
+{
+ struct msgbuf32 *up;
+ struct msgbuf *p;
+ mm_segment_t old_fs;
+ int err;
+
+ if (!version) {
+ struct ipc_kludge *uipck = (struct ipc_kludge *)uptr;
+ struct ipc_kludge ipck;
+
+ err = -EINVAL;
+ if (!uptr)
+ goto out;
+ err = -EFAULT;
+ if (copy_from_user(&ipck, uipck, sizeof(struct ipc_kludge)))
+ goto out;
+ uptr = (void *)A(ipck.msgp);
+ msgtyp = ipck.msgtyp;
+ }
+ err = -ENOMEM;
+ p = kmalloc(second + sizeof(struct msgbuf) + 4, GFP_USER);
+ if (!p)
+ goto out;
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ err = sys_msgrcv(first, p, second + 4, msgtyp, third);
+ set_fs(old_fs);
+ if (err < 0)
+ goto free_then_out;
+ up = (struct msgbuf32 *)uptr;
+ if (put_user(p->mtype, &up->mtype) || copy_to_user(&up->mtext, p->mtext, err))
+ err = -EFAULT;
+free_then_out:
+ kfree(p);
+out:
+ return err;
+}
+
+static int
+msgctl32 (int first, int second, void *uptr)
+{
+ int err = -EINVAL, err2;
+ struct msqid_ds m;
+ struct msqid64_ds m64;
+ struct msqid_ds32 *up32 = (struct msqid_ds32 *)uptr;
+ struct msqid64_ds32 *up64 = (struct msqid64_ds32 *)uptr;
+ mm_segment_t old_fs;
+ int version = ipc_parse_version32(&second);
+
+ switch (second) {
+ case IPC_INFO:
+ case IPC_RMID:
+ case MSG_INFO:
+ err = sys_msgctl(first, second, (struct msqid_ds *)uptr);
+ break;
+
+ case IPC_SET:
+ if (version == IPC_64) {
+ err = get_user(m.msg_perm.uid, &up64->msg_perm.uid);
+ err |= get_user(m.msg_perm.gid, &up64->msg_perm.gid);
+ err |= get_user(m.msg_perm.mode, &up64->msg_perm.mode);
+ err |= get_user(m.msg_qbytes, &up64->msg_qbytes);
+ } else {
+ err = get_user(m.msg_perm.uid, &up32->msg_perm.uid);
+ err |= get_user(m.msg_perm.gid, &up32->msg_perm.gid);
+ err |= get_user(m.msg_perm.mode, &up32->msg_perm.mode);
+ err |= get_user(m.msg_qbytes, &up32->msg_qbytes);
+ }
+ if (err)
+ break;
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ err = sys_msgctl(first, second, &m);
+ set_fs(old_fs);
+ break;
+
+ case IPC_STAT:
+ case MSG_STAT:
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ err = sys_msgctl(first, second|IPC_64, (void *) &m64);
+ set_fs(old_fs);
+
+ if (version == IPC_64) {
+ if (!access_ok(VERIFY_WRITE, up64, sizeof(*up64))) {
+ err = -EFAULT;
+ break;
+ }
+ err2 = __put_user(m64.msg_perm.key, &up64->msg_perm.key);
+ err2 |= __put_user(m64.msg_perm.uid, &up64->msg_perm.uid);
+ err2 |= __put_user(m64.msg_perm.gid, &up64->msg_perm.gid);
+ err2 |= __put_user(m64.msg_perm.cuid, &up64->msg_perm.cuid);
+ err2 |= __put_user(m64.msg_perm.cgid, &up64->msg_perm.cgid);
+ err2 |= __put_user(m64.msg_perm.mode, &up64->msg_perm.mode);
+ err2 |= __put_user(m64.msg_perm.seq, &up64->msg_perm.seq);
+ err2 |= __put_user(m64.msg_stime, &up64->msg_stime);
+ err2 |= __put_user(m64.msg_rtime, &up64->msg_rtime);
+ err2 |= __put_user(m64.msg_ctime, &up64->msg_ctime);
+ err2 |= __put_user(m64.msg_cbytes, &up64->msg_cbytes);
+ err2 |= __put_user(m64.msg_qnum, &up64->msg_qnum);
+ err2 |= __put_user(m64.msg_qbytes, &up64->msg_qbytes);
+ err2 |= __put_user(m64.msg_lspid, &up64->msg_lspid);
+ err2 |= __put_user(m64.msg_lrpid, &up64->msg_lrpid);
+ if (err2)
+ err = -EFAULT;
+ } else {
+ if (!access_ok(VERIFY_WRITE, up32, sizeof(*up32))) {
+ err = -EFAULT;
+ break;
+ }
+ err2 = __put_user(m64.msg_perm.key, &up32->msg_perm.key);
+ err2 |= __put_user(m64.msg_perm.uid, &up32->msg_perm.uid);
+ err2 |= __put_user(m64.msg_perm.gid, &up32->msg_perm.gid);
+ err2 |= __put_user(m64.msg_perm.cuid, &up32->msg_perm.cuid);
+ err2 |= __put_user(m64.msg_perm.cgid, &up32->msg_perm.cgid);
+ err2 |= __put_user(m64.msg_perm.mode, &up32->msg_perm.mode);
+ err2 |= __put_user(m64.msg_perm.seq, &up32->msg_perm.seq);
+ err2 |= __put_user(m64.msg_stime, &up32->msg_stime);
+ err2 |= __put_user(m64.msg_rtime, &up32->msg_rtime);
+ err2 |= __put_user(m64.msg_ctime, &up32->msg_ctime);
+ err2 |= __put_user(m64.msg_cbytes, &up32->msg_cbytes);
+ err2 |= __put_user(m64.msg_qnum, &up32->msg_qnum);
+ err2 |= __put_user(m64.msg_qbytes, &up32->msg_qbytes);
+ err2 |= __put_user(m64.msg_lspid, &up32->msg_lspid);
+ err2 |= __put_user(m64.msg_lrpid, &up32->msg_lrpid);
+ if (err2)
+ err = -EFAULT;
+ }
+ break;
+ }
+ return err;
+}
+
+static int
+shmat32 (int first, int second, int third, int version, void *uptr)
+{
+ unsigned long raddr;
+ u32 *uaddr = (u32 *)A((u32)third);
+ int err;
+
+ if (version == 1)
+ return -EINVAL; /* iBCS2 emulator entry point: unsupported */
+ err = sys_shmat(first, uptr, second, &raddr);
+ if (err)
+ return err;
+ return put_user(raddr, uaddr);
+}
+
+static int put_shmid64(struct shmid64_ds *s64p, void *uptr, int version)
+{
+ int err2;
+#define s64 (*s64p)
+ if (version == IPC_64) {
+ struct shmid64_ds32 *up64 = (struct shmid64_ds32 *)uptr;
+
+ if (!access_ok(VERIFY_WRITE, up64, sizeof(*up64)))
+ return -EFAULT;
+
+ err2 = __put_user(s64.shm_perm.key, &up64->shm_perm.key);
+ err2 |= __put_user(s64.shm_perm.uid, &up64->shm_perm.uid);
+ err2 |= __put_user(s64.shm_perm.gid, &up64->shm_perm.gid);
+ err2 |= __put_user(s64.shm_perm.cuid, &up64->shm_perm.cuid);
+ err2 |= __put_user(s64.shm_perm.cgid, &up64->shm_perm.cgid);
+ err2 |= __put_user(s64.shm_perm.mode, &up64->shm_perm.mode);
+ err2 |= __put_user(s64.shm_perm.seq, &up64->shm_perm.seq);
+ err2 |= __put_user(s64.shm_atime, &up64->shm_atime);
+ err2 |= __put_user(s64.shm_dtime, &up64->shm_dtime);
+ err2 |= __put_user(s64.shm_ctime, &up64->shm_ctime);
+ err2 |= __put_user(s64.shm_segsz, &up64->shm_segsz);
+ err2 |= __put_user(s64.shm_nattch, &up64->shm_nattch);
+ err2 |= __put_user(s64.shm_cpid, &up64->shm_cpid);
+ err2 |= __put_user(s64.shm_lpid, &up64->shm_lpid);
+ } else {
+ struct shmid_ds32 *up32 = (struct shmid_ds32 *)uptr;
+
+ if (!access_ok(VERIFY_WRITE, up32, sizeof(*up32)))
+ return -EFAULT;
+
+ err2 = __put_user(s64.shm_perm.key, &up32->shm_perm.key);
+ err2 |= __put_user(s64.shm_perm.uid, &up32->shm_perm.uid);
+ err2 |= __put_user(s64.shm_perm.gid, &up32->shm_perm.gid);
+ err2 |= __put_user(s64.shm_perm.cuid, &up32->shm_perm.cuid);
+ err2 |= __put_user(s64.shm_perm.cgid, &up32->shm_perm.cgid);
+ err2 |= __put_user(s64.shm_perm.mode, &up32->shm_perm.mode);
+ err2 |= __put_user(s64.shm_perm.seq, &up32->shm_perm.seq);
+ err2 |= __put_user(s64.shm_atime, &up32->shm_atime);
+ err2 |= __put_user(s64.shm_dtime, &up32->shm_dtime);
+ err2 |= __put_user(s64.shm_ctime, &up32->shm_ctime);
+ err2 |= __put_user(s64.shm_segsz, &up32->shm_segsz);
+ err2 |= __put_user(s64.shm_nattch, &up32->shm_nattch);
+ err2 |= __put_user(s64.shm_cpid, &up32->shm_cpid);
+ err2 |= __put_user(s64.shm_lpid, &up32->shm_lpid);
+ }
+#undef s64
+ return err2 ? -EFAULT : 0;
+}
+static int
+shmctl32 (int first, int second, void *uptr)
+{
+ int err = -EFAULT, err2;
+ struct shmid_ds s;
+ struct shmid64_ds s64;
+ mm_segment_t old_fs;
+ struct shm_info32 *uip = (struct shm_info32 *)uptr;
+ struct shm_info si;
+ int version = ipc_parse_version32(&second);
+ struct shminfo64 smi;
+ struct shminfo *usi32 = (struct shminfo *) uptr;
+ struct shminfo64_32 *usi64 = (struct shminfo64_32 *) uptr;
+
+ switch (second) {
+ case IPC_INFO:
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ err = sys_shmctl(first, second|IPC_64, (struct shmid_ds *)&smi);
+ set_fs(old_fs);
+
+ if (version == IPC_64) {
+ if (!access_ok(VERIFY_WRITE, usi64, sizeof(*usi64))) {
+ err = -EFAULT;
+ break;
+ }
+ err2 = __put_user(smi.shmmax, &usi64->shmmax);
+ err2 |= __put_user(smi.shmmin, &usi64->shmmin);
+ err2 |= __put_user(smi.shmmni, &usi64->shmmni);
+ err2 |= __put_user(smi.shmseg, &usi64->shmseg);
+ err2 |= __put_user(smi.shmall, &usi64->shmall);
+ } else {
+ if (!access_ok(VERIFY_WRITE, usi32, sizeof(*usi32))) {
+ err = -EFAULT;
+ break;
+ }
+ err2 = __put_user(smi.shmmax, &usi32->shmmax);
+ err2 |= __put_user(smi.shmmin, &usi32->shmmin);
+ err2 |= __put_user(smi.shmmni, &usi32->shmmni);
+ err2 |= __put_user(smi.shmseg, &usi32->shmseg);
+ err2 |= __put_user(smi.shmall, &usi32->shmall);
+ }
+ if (err2)
+ err = -EFAULT;
+ break;
+
+ case IPC_RMID:
+ case SHM_LOCK:
+ case SHM_UNLOCK:
+ err = sys_shmctl(first, second, (struct shmid_ds *)uptr);
+ break;
+
+ case IPC_SET:
+ if (version == IPC_64) {
+ struct shmid64_ds32 *up64 = (struct shmid64_ds32 *)uptr;
+ err = get_user(s.shm_perm.uid, &up64->shm_perm.uid);
+ err |= get_user(s.shm_perm.gid, &up64->shm_perm.gid);
+ err |= get_user(s.shm_perm.mode, &up64->shm_perm.mode);
+ } else {
+ struct shmid_ds32 *up32 = (struct shmid_ds32 *)uptr;
+ err = get_user(s.shm_perm.uid, &up32->shm_perm.uid);
+ err |= get_user(s.shm_perm.gid, &up32->shm_perm.gid);
+ err |= get_user(s.shm_perm.mode, &up32->shm_perm.mode);
+ }
+ if (err)
+ break;
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ err = sys_shmctl(first, second, &s);
+ set_fs(old_fs);
+ break;
+
+ case IPC_STAT:
+ case SHM_STAT:
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ err = sys_shmctl(first, second|IPC_64, (void *) &s64);
+ set_fs(old_fs);
+
+ if (err < 0)
+ break;
+ err2 = put_shmid64(&s64, uptr, version);
+ if (err2)
+ err = err2;
+ break;
+
+ case SHM_INFO:
+ old_fs = get_fs();
+ set_fs(KERNEL_DS);
+ err = sys_shmctl(first, second, (void *)&si);
+ set_fs(old_fs);
+ if (err < 0)
+ break;
+
+ if (!access_ok(VERIFY_WRITE, uip, sizeof(*uip))) {
+ err = -EFAULT;
+ break;
+ }
+ err2 = __put_user(si.used_ids, &uip->used_ids);
+ err2 |= __put_user(si.shm_tot, &uip->shm_tot);
+ err2 |= __put_user(si.shm_rss, &uip->shm_rss);
+ err2 |= __put_user(si.shm_swp, &uip->shm_swp);
+ err2 |= __put_user(si.swap_attempts, &uip->swap_attempts);
+ err2 |= __put_user(si.swap_successes, &uip->swap_successes);
+ if (err2)
+ err = -EFAULT;
+ break;
+
+ }
+ return err;
+}
+
+asmlinkage long
+sys32_ipc (u32 call, int first, int second, int third, u32 ptr, u32 fifth)
+{
+ int version;
+
+ version = call >> 16; /* hack for backward compatibility */
+ call &= 0xffff;
+
+ switch (call) {
+ case SEMOP:
+ /* struct sembuf is the same on 32 and 64bit :)) */
+ return sys_semop(first, (struct sembuf *)AA(ptr), second);
+ case SEMGET:
+ return sys_semget(first, second, third);
+ case SEMCTL:
+ return semctl32(first, second, third, (void *)AA(ptr));
+
+ case MSGSND:
+ return do_sys32_msgsnd(first, second, third, (void *)AA(ptr));
+ case MSGRCV:
+ return do_sys32_msgrcv(first, second, fifth, third, version, (void *)AA(ptr));
+ case MSGGET:
+ return sys_msgget((key_t) first, second);
+ case MSGCTL:
+ return msgctl32(first, second, (void *)AA(ptr));
+
+ case SHMAT:
+ return shmat32(first, second, third, version, (void *)AA(ptr));
+ break;
+ case SHMDT:
+ return sys_shmdt((char *)AA(ptr));
+ case SHMGET:
+ return sys_shmget(first, second, third);
+ case SHMCTL:
+ return shmctl32(first, second, (void *)AA(ptr));
+
+ default:
+ return -EINVAL;
+ }
+ return -EINVAL;
+}
+
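For reference, the SHM* cases demultiplexed above correspond to an ordinary SysV shared-memory sequence in a 32-bit program; a minimal sketch (segment size and permissions are arbitrary) that exercises the SHMGET, SHMAT, SHMCTL and SHMDT paths:

#include <stdio.h>
#include <sys/ipc.h>
#include <sys/shm.h>

int main(void)
{
        struct shmid_ds ds;
        int id = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0600);   /* SHMGET */
        char *p;

        if (id < 0)
                return 1;
        p = shmat(id, NULL, 0);                                  /* SHMAT */
        if (p == (char *)-1)
                return 1;
        shmctl(id, IPC_STAT, &ds);      /* SHMCTL: result converted via put_shmid64() */
        printf("segsz=%lu nattch=%lu\n",
               (unsigned long)ds.shm_segsz, (unsigned long)ds.shm_nattch);
        shmdt(p);                       /* SHMDT */
        shmctl(id, IPC_RMID, NULL);     /* SHMCTL: IPC_RMID passed straight through */
        return 0;
}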
diff --git a/arch/x86_64/ia32/sys_ia32.c b/arch/x86_64/ia32/sys_ia32.c
index 35060b86a54a..85aaed5ec40a 100644
--- a/arch/x86_64/ia32/sys_ia32.c
+++ b/arch/x86_64/ia32/sys_ia32.c
@@ -1119,422 +1119,6 @@ sys32_setrlimit(unsigned int resource, struct rlimit32 *rlim)
}
/*
- * sys32_ipc() is the de-multiplexer for the SysV IPC calls in 32bit emulation..
- *
- * This is really horribly ugly.
- */
-
-struct msgbuf32 { s32 mtype; char mtext[1]; };
-
-struct ipc_perm32
-{
- key_t key;
- __kernel_uid_t32 uid;
- __kernel_gid_t32 gid;
- __kernel_uid_t32 cuid;
- __kernel_gid_t32 cgid;
- __kernel_mode_t32 mode;
- unsigned short seq;
-};
-
-struct semid_ds32 {
- struct ipc_perm32 sem_perm; /* permissions .. see ipc.h */
- __kernel_time_t32 sem_otime; /* last semop time */
- __kernel_time_t32 sem_ctime; /* last change time */
- u32 sem_base; /* ptr to first semaphore in array */
- u32 sem_pending; /* pending operations to be processed */
- u32 sem_pending_last; /* last pending operation */
- u32 undo; /* undo requests on this array */
- unsigned short sem_nsems; /* no. of semaphores in array */
-};
-
-struct msqid_ds32
-{
- struct ipc_perm32 msg_perm;
- u32 msg_first;
- u32 msg_last;
- __kernel_time_t32 msg_stime;
- __kernel_time_t32 msg_rtime;
- __kernel_time_t32 msg_ctime;
- u32 wwait;
- u32 rwait;
- unsigned short msg_cbytes;
- unsigned short msg_qnum;
- unsigned short msg_qbytes;
- __kernel_ipc_pid_t32 msg_lspid;
- __kernel_ipc_pid_t32 msg_lrpid;
-};
-
-struct shmid_ds32 {
- struct ipc_perm32 shm_perm;
- int shm_segsz;
- __kernel_time_t32 shm_atime;
- __kernel_time_t32 shm_dtime;
- __kernel_time_t32 shm_ctime;
- __kernel_ipc_pid_t32 shm_cpid;
- __kernel_ipc_pid_t32 shm_lpid;
- unsigned short shm_nattch;
-};
-
-#define IPCOP_MASK(__x) (1UL << (__x))
-
-static int
-do_sys32_semctl(int first, int second, int third, void *uptr)
-{
- union semun fourth;
- u32 pad;
- int err;
- struct semid64_ds s;
- struct semid_ds32 *usp;
- mm_segment_t old_fs;
-
- if (!uptr)
- return -EINVAL;
- err = -EFAULT;
- if (get_user (pad, (u32 *)uptr))
- return err;
- if(third == SETVAL)
- fourth.val = (int)pad;
- else
- fourth.__pad = (void *)A(pad);
-
- switch (third) {
-
- case IPC_INFO:
- case IPC_RMID:
- case IPC_SET:
- case SEM_INFO:
- case GETVAL:
- case GETPID:
- case GETNCNT:
- case GETZCNT:
- case GETALL:
- case SETVAL:
- case SETALL:
- err = sys_semctl (first, second, third, fourth);
- break;
-
- case IPC_STAT:
- case SEM_STAT:
- usp = (struct semid_ds32 *)A(pad);
- fourth.__pad = &s;
- old_fs = get_fs ();
- set_fs (KERNEL_DS);
- err = sys_semctl (first, second, third, fourth);
- set_fs (old_fs);
- if (verify_area(VERIFY_WRITE, usp, sizeof(struct semid_ds32)) ||
- __put_user(s.sem_perm.key, &usp->sem_perm.key) ||
- __put_user(s.sem_perm.uid, &usp->sem_perm.uid) ||
- __put_user(s.sem_perm.gid, &usp->sem_perm.gid) ||
- __put_user(s.sem_perm.cuid, &usp->sem_perm.cuid) ||
- __put_user (s.sem_perm.cgid, &usp->sem_perm.cgid) ||
- __put_user (s.sem_perm.mode, &usp->sem_perm.mode) ||
- __put_user (s.sem_perm.seq, &usp->sem_perm.seq) ||
- __put_user (s.sem_otime, &usp->sem_otime) ||
- __put_user (s.sem_ctime, &usp->sem_ctime) ||
- __put_user (s.sem_nsems, &usp->sem_nsems))
- return -EFAULT;
- break;
-
- }
-
- return err;
-}
-
-static int
-do_sys32_msgsnd (int first, int second, int third, void *uptr)
-{
- struct msgbuf *p = kmalloc (second + sizeof (struct msgbuf)
- + 4, GFP_USER);
- struct msgbuf32 *up = (struct msgbuf32 *)uptr;
- mm_segment_t old_fs;
- int err;
-
- if (!p)
- return -ENOMEM;
- err = verify_area(VERIFY_READ, up, sizeof(struct msgbuf32));
- if (err)
- goto out;
- err = __get_user (p->mtype, &up->mtype);
- err |= __copy_from_user (p->mtext, &up->mtext, second);
- if (err)
- goto out;
- old_fs = get_fs ();
- set_fs (KERNEL_DS);
- err = sys_msgsnd (first, p, second, third);
- set_fs (old_fs);
-out:
- kfree (p);
- return err;
-}
-
-static int
-do_sys32_msgrcv (int first, int second, int msgtyp, int third,
- int version, void *uptr)
-{
- struct msgbuf32 *up;
- struct msgbuf *p;
- mm_segment_t old_fs;
- int err;
-
- if (!version) {
- struct ipc_kludge *uipck = (struct ipc_kludge *)uptr;
- struct ipc_kludge ipck;
-
- err = -EINVAL;
- if (!uptr)
- goto out;
- err = -EFAULT;
- if (copy_from_user (&ipck, uipck, sizeof (struct ipc_kludge)))
- goto out;
- uptr = (void *)A(ipck.msgp);
- msgtyp = ipck.msgtyp;
- }
- err = -ENOMEM;
- p = kmalloc (second + sizeof (struct msgbuf) + 4, GFP_USER);
- if (!p)
- goto out;
- old_fs = get_fs ();
- set_fs (KERNEL_DS);
- err = sys_msgrcv (first, p, second + 4, msgtyp, third);
- set_fs (old_fs);
- if (err < 0)
- goto free_then_out;
- up = (struct msgbuf32 *)uptr;
- if (verify_area(VERIFY_WRITE, up, sizeof(struct msgbuf32)) ||
- __put_user (p->mtype, &up->mtype) ||
- __copy_to_user (&up->mtext, p->mtext, err))
- err = -EFAULT;
-free_then_out:
- kfree (p);
-out:
- return err;
-}
-
-static int
-do_sys32_msgctl (int first, int second, void *uptr)
-{
- int err = -EINVAL;
- struct msqid_ds m;
- struct msqid64_ds m64;
- struct msqid_ds32 *up = (struct msqid_ds32 *)uptr;
- mm_segment_t old_fs;
-
- switch (second) {
-
- case IPC_INFO:
- case IPC_RMID:
- case MSG_INFO:
- err = sys_msgctl (first, second, (struct msqid_ds *)uptr);
- break;
-
- case IPC_SET:
- err = verify_area(VERIFY_READ, up, sizeof(struct msqid_ds32));
- if (err)
- break;
- err = __get_user (m.msg_perm.uid, &up->msg_perm.uid);
- err |= __get_user (m.msg_perm.gid, &up->msg_perm.gid);
- err |= __get_user (m.msg_perm.mode, &up->msg_perm.mode);
- err |= __get_user (m.msg_qbytes, &up->msg_qbytes);
- if (err)
- break;
- old_fs = get_fs ();
- set_fs (KERNEL_DS);
- err = sys_msgctl (first, second, &m);
- set_fs (old_fs);
- break;
-
- case IPC_STAT:
- case MSG_STAT:
- old_fs = get_fs ();
- set_fs (KERNEL_DS);
- err = sys_msgctl (first, second, (void *) &m64);
- set_fs (old_fs);
- if (verify_area(VERIFY_WRITE, up, sizeof(struct msqid_ds32)) ||
- __put_user (m64.msg_perm.key, &up->msg_perm.key) ||
- __put_user(m64.msg_perm.uid, &up->msg_perm.uid) ||
- __put_user(m64.msg_perm.gid, &up->msg_perm.gid) ||
- __put_user(m64.msg_perm.cuid, &up->msg_perm.cuid) ||
- __put_user(m64.msg_perm.cgid, &up->msg_perm.cgid) ||
- __put_user(m64.msg_perm.mode, &up->msg_perm.mode) ||
- __put_user(m64.msg_perm.seq, &up->msg_perm.seq) ||
- __put_user(m64.msg_stime, &up->msg_stime) ||
- __put_user(m64.msg_rtime, &up->msg_rtime) ||
- __put_user(m64.msg_ctime, &up->msg_ctime) ||
- __put_user(m64.msg_cbytes, &up->msg_cbytes) ||
- __put_user(m64.msg_qnum, &up->msg_qnum) ||
- __put_user(m64.msg_qbytes, &up->msg_qbytes) ||
- __put_user(m64.msg_lspid, &up->msg_lspid) ||
- __put_user(m64.msg_lrpid, &up->msg_lrpid))
- return -EFAULT;
- break;
-
- }
-
- return err;
-}
-
-static int
-do_sys32_shmat (int first, int second, int third, int version, void *uptr)
-{
- unsigned long raddr;
- u32 *uaddr = (u32 *)A((u32)third);
- int err = -EINVAL;
-
- if (version == 1)
- return err;
- err = sys_shmat (first, uptr, second, &raddr);
- if (err)
- return err;
- err = put_user (raddr, uaddr);
- return err;
-}
-
-static int
-do_sys32_shmctl (int first, int second, void *uptr)
-{
- int err = -EFAULT;
- struct shmid_ds s;
- struct shmid64_ds s64;
- struct shmid_ds32 *up = (struct shmid_ds32 *)uptr;
- mm_segment_t old_fs;
- struct shm_info32 {
- int used_ids;
- u32 shm_tot, shm_rss, shm_swp;
- u32 swap_attempts, swap_successes;
- } *uip = (struct shm_info32 *)uptr;
- struct shm_info si;
-
- switch (second) {
-
- case IPC_INFO:
- case IPC_RMID:
- case SHM_LOCK:
- case SHM_UNLOCK:
- err = sys_shmctl (first, second, (struct shmid_ds *)uptr);
- break;
- case IPC_SET:
- err = verify_area(VERIFY_READ, up, sizeof(struct shmid_ds32));
- if (err)
- break;
- err = __get_user (s.shm_perm.uid, &up->shm_perm.uid);
- err |= __get_user (s.shm_perm.gid, &up->shm_perm.gid);
- err |= __get_user (s.shm_perm.mode, &up->shm_perm.mode);
- if (err)
- break;
- old_fs = get_fs ();
- set_fs (KERNEL_DS);
- err = sys_shmctl (first, second, &s);
- set_fs (old_fs);
- break;
-
- case IPC_STAT:
- case SHM_STAT:
- old_fs = get_fs ();
- set_fs (KERNEL_DS);
- err = sys_shmctl (first, second, (void *) &s64);
- set_fs (old_fs);
- if (err < 0)
- break;
- if (verify_area(VERIFY_WRITE, up, sizeof(struct shmid_ds32)) ||
- __put_user (s64.shm_perm.key, &up->shm_perm.key) ||
- __put_user (s64.shm_perm.uid, &up->shm_perm.uid) ||
- __put_user (s64.shm_perm.gid, &up->shm_perm.gid) ||
- __put_user (s64.shm_perm.cuid, &up->shm_perm.cuid) ||
- __put_user (s64.shm_perm.cgid, &up->shm_perm.cgid) ||
- __put_user (s64.shm_perm.mode, &up->shm_perm.mode) ||
- __put_user (s64.shm_perm.seq, &up->shm_perm.seq) ||
- __put_user (s64.shm_atime, &up->shm_atime) ||
- __put_user (s64.shm_dtime, &up->shm_dtime) ||
- __put_user (s64.shm_ctime, &up->shm_ctime) ||
- __put_user (s64.shm_segsz, &up->shm_segsz) ||
- __put_user (s64.shm_nattch, &up->shm_nattch) ||
- __put_user (s64.shm_cpid, &up->shm_cpid) ||
- __put_user (s64.shm_lpid, &up->shm_lpid))
- return -EFAULT;
- break;
-
- case SHM_INFO:
- old_fs = get_fs ();
- set_fs (KERNEL_DS);
- err = sys_shmctl (first, second, (void *)&si);
- set_fs (old_fs);
- if (err < 0)
- break;
- if (verify_area(VERIFY_WRITE, uip, sizeof(struct shm_info32)) ||
- __put_user (si.used_ids, &uip->used_ids) ||
- __put_user (si.shm_tot, &uip->shm_tot) ||
- __put_user (si.shm_rss, &uip->shm_rss) ||
- __put_user (si.shm_swp, &uip->shm_swp) ||
- __put_user (si.swap_attempts, &uip->swap_attempts) ||
- __put_user (si.swap_successes, &uip->swap_successes))
- return -EFAULT;
- break;
-
- }
- return err;
-}
-
-asmlinkage long
-sys32_ipc (u32 call, int first, int second, int third, u32 ptr, u32 fifth)
-{
- int version, err;
-
- version = call >> 16; /* hack for backward compatibility */
- call &= 0xffff;
-
- switch (call) {
-
- case SEMOP:
- /* struct sembuf is the same on 32 and 64bit :)) */
- err = sys_semop (first, (struct sembuf *)AA(ptr),
- second);
- break;
- case SEMGET:
- err = sys_semget (first, second, third);
- break;
- case SEMCTL:
- err = do_sys32_semctl (first, second, third,
- (void *)AA(ptr));
- break;
-
- case MSGSND:
- err = do_sys32_msgsnd (first, second, third,
- (void *)AA(ptr));
- break;
- case MSGRCV:
- err = do_sys32_msgrcv (first, second, fifth, third,
- version, (void *)AA(ptr));
- break;
- case MSGGET:
- err = sys_msgget ((key_t) first, second);
- break;
- case MSGCTL:
- err = do_sys32_msgctl (first, second, (void *)AA(ptr));
- break;
-
- case SHMAT:
- err = do_sys32_shmat (first, second, third,
- version, (void *)AA(ptr));
- break;
- case SHMDT:
- err = sys_shmdt ((char *)AA(ptr));
- break;
- case SHMGET:
- err = sys_shmget (first, second, third);
- break;
- case SHMCTL:
- err = do_sys32_shmctl (first, second, (void *)AA(ptr));
- break;
- default:
- err = -EINVAL;
- break;
- }
-
- return err;
-}
-
-/*
* sys_time() can be implemented in user-level using
* sys_gettimeofday(). IA64 did this but i386 Linux did not
* so we have to implement this system call here.
diff --git a/arch/x86_64/kernel/ioport.c b/arch/x86_64/kernel/ioport.c
index a0ab1a1ee68e..b8ad4c6d3709 100644
--- a/arch/x86_64/kernel/ioport.c
+++ b/arch/x86_64/kernel/ioport.c
@@ -14,6 +14,7 @@
#include <linux/smp.h>
#include <linux/smp_lock.h>
#include <linux/stddef.h>
+#include <linux/slab.h>
/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
static void set_bitmap(unsigned long *bitmap, short base, short extent, int new_value)
@@ -61,27 +62,19 @@ asmlinkage int sys_ioperm(unsigned long from, unsigned long num, int turn_on)
return -EINVAL;
if (turn_on && !capable(CAP_SYS_RAWIO))
return -EPERM;
- /*
- * If it's the first ioperm() call in this thread's lifetime, set the
- * IO bitmap up. ioperm() is much less timing critical than clone(),
- * this is why we delay this operation until now:
- */
- if (!t->ioperm) {
- /*
- * just in case ...
- */
- memset(t->io_bitmap,0xff,(IO_BITMAP_SIZE+1)*4);
- t->ioperm = 1;
- /*
- * this activates it in the TSS
- */
+
+ if (!t->io_bitmap_ptr) {
+ t->io_bitmap_ptr = kmalloc((IO_BITMAP_SIZE+1)*4, GFP_KERNEL);
+ if (!t->io_bitmap_ptr)
+ return -ENOMEM;
+ memset(t->io_bitmap_ptr,0xff,(IO_BITMAP_SIZE+1)*4);
tss->io_map_base = IO_BITMAP_OFFSET;
}
/*
* do it in the per-thread copy and in the TSS ...
*/
- set_bitmap((unsigned long *) t->io_bitmap, from, num, !turn_on);
+ set_bitmap((unsigned long *) t->io_bitmap_ptr, from, num, !turn_on);
set_bitmap((unsigned long *) tss->io_bitmap, from, num, !turn_on);
return 0;
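The kmalloc() above means the per-thread I/O bitmap now only exists for threads that actually call ioperm(). A minimal privileged userspace sketch of that path (the port range 0x378 is just an example):

#include <stdio.h>
#include <sys/io.h>

int main(void)
{
        /* The first ioperm() in this thread allocates io_bitmap_ptr in the kernel. */
        if (ioperm(0x378, 8, 1)) {
                perror("ioperm");
                return 1;
        }
        outb(0x00, 0x378);      /* direct port I/O is now permitted */
        ioperm(0x378, 8, 0);    /* drop the access again */
        return 0;
}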
diff --git a/arch/x86_64/kernel/mtrr.c b/arch/x86_64/kernel/mtrr.c
index 1f36d262b618..b0c43563a30a 100644
--- a/arch/x86_64/kernel/mtrr.c
+++ b/arch/x86_64/kernel/mtrr.c
@@ -19,10 +19,14 @@
Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
(For earlier history, see arch/i386/kernel/mtrr.c)
- September 2001 Dave Jones <davej@suse.de>
+ v2.00 September 2001 Dave Jones <davej@suse.de>
Initial rewrite for x86-64.
-
+ Removal of non-Intel style MTRR code.
+ v2.01 June 2002 Dave Jones <davej@suse.de>
+ Removal of redundant abstraction layer.
+ 64-bit fixes.
*/
+
#include <linux/types.h>
#include <linux/errno.h>
#include <linux/sched.h>
@@ -60,35 +64,19 @@
#include <asm/hardirq.h>
#include <linux/irq.h>
-#define MTRR_VERSION "2.00 (20020207)"
+#define MTRR_VERSION "2.01 (20020605)"
#define TRUE 1
#define FALSE 0
-#define MTRRcap_MSR 0x0fe
-#define MTRRdefType_MSR 0x2ff
-
-#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg))
-#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1)
+#define MSR_MTRRphysBase(reg) (0x200 + 2 * (reg))
+#define MSR_MTRRphysMask(reg) (0x200 + 2 * (reg) + 1)
#define NUM_FIXED_RANGES 88
-#define MTRRfix64K_00000_MSR 0x250
-#define MTRRfix16K_80000_MSR 0x258
-#define MTRRfix16K_A0000_MSR 0x259
-#define MTRRfix4K_C0000_MSR 0x268
-#define MTRRfix4K_C8000_MSR 0x269
-#define MTRRfix4K_D0000_MSR 0x26a
-#define MTRRfix4K_D8000_MSR 0x26b
-#define MTRRfix4K_E0000_MSR 0x26c
-#define MTRRfix4K_E8000_MSR 0x26d
-#define MTRRfix4K_F0000_MSR 0x26e
-#define MTRRfix4K_F8000_MSR 0x26f
-#ifdef CONFIG_SMP
#define MTRR_CHANGE_MASK_FIXED 0x01
#define MTRR_CHANGE_MASK_VARIABLE 0x02
#define MTRR_CHANGE_MASK_DEFTYPE 0x04
-#endif
typedef u8 mtrr_type;
@@ -97,49 +85,43 @@ typedef u8 mtrr_type;
#ifdef CONFIG_SMP
#define set_mtrr(reg,base,size,type) set_mtrr_smp (reg, base, size, type)
#else
-#define set_mtrr(reg,base,size,type) (*set_mtrr_up) (reg, base, size, type, \
- TRUE)
+#define set_mtrr(reg,base,size,type) set_mtrr_up (reg, base, size, type, TRUE)
#endif
#if defined(CONFIG_PROC_FS) || defined(CONFIG_DEVFS_FS)
#define USERSPACE_INTERFACE
#endif
-#ifndef USERSPACE_INTERFACE
-#define compute_ascii() while (0)
-#endif
-
#ifdef USERSPACE_INTERFACE
static char *ascii_buffer;
static unsigned int ascii_buf_bytes;
-#endif
-static unsigned int *usage_table;
-static DECLARE_MUTEX (main_lock);
-
-/* Private functions */
-#ifdef USERSPACE_INTERFACE
static void compute_ascii (void);
+#else
+#define compute_ascii() while (0)
#endif
+static unsigned int *usage_table;
+static DECLARE_MUTEX (mtrr_lock);
+
struct set_mtrr_context {
- unsigned long flags;
- unsigned long deftype_lo;
- unsigned long deftype_hi;
- unsigned long cr4val;
+ u32 deftype_lo;
+ u32 deftype_hi;
+ u64 flags;
+ u64 cr4val;
};
/* Put the processor into a state where MTRRs can be safely set */
static void set_mtrr_prepare (struct set_mtrr_context *ctxt)
{
- unsigned long cr0;
+ u64 cr0;
/* Disable interrupts locally */
__save_flags(ctxt->flags);
__cli();
/* Save value of CR4 and clear Page Global Enable (bit 7) */
- if (cpu_has_ge) {
+ if (cpu_has_pge) {
ctxt->cr4val = read_cr4();
write_cr4(ctxt->cr4val & ~(1UL << 7));
}
@@ -152,8 +134,8 @@ static void set_mtrr_prepare (struct set_mtrr_context *ctxt)
wbinvd();
/* Disable MTRRs, and set the default type to uncached */
- rdmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi);
- wrmsr(MTRRdefType_MSR, ctxt->deftype_lo & 0xf300UL, ctxt->deftype_hi);
+ rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
+ wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL, ctxt->deftype_hi);
}
@@ -164,7 +146,7 @@ static void set_mtrr_done (struct set_mtrr_context *ctxt)
wbinvd();
/* Restore MTRRdefType */
- wrmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi);
+ wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
/* Enable caches */
write_cr0(read_cr0() & 0xbfffffff);
@@ -181,9 +163,9 @@ static void set_mtrr_done (struct set_mtrr_context *ctxt)
/* This function returns the number of variable MTRRs */
static unsigned int get_num_var_ranges (void)
{
- unsigned long config, dummy;
+ u32 config, dummy;
- rdmsr (MTRRcap_MSR, config, dummy);
+ rdmsr (MSR_MTRRcap, config, dummy);
return (config & 0xff);
}
@@ -191,21 +173,21 @@ static unsigned int get_num_var_ranges (void)
/* Returns non-zero if we have the write-combining memory type */
static int have_wrcomb (void)
{
- unsigned long config, dummy;
+ u32 config, dummy;
- rdmsr (MTRRcap_MSR, config, dummy);
+ rdmsr (MSR_MTRRcap, config, dummy);
return (config & (1 << 10));
}
-static u32 size_or_mask, size_and_mask;
+static u64 size_or_mask, size_and_mask;
-static void get_mtrr (unsigned int reg, unsigned long *base,
- unsigned long *size, mtrr_type * type)
+static void get_mtrr (unsigned int reg, u64 *base, u32 *size, mtrr_type * type)
{
- unsigned long mask_lo, mask_hi, base_lo, base_hi;
+ u32 mask_lo, mask_hi, base_lo, base_hi;
+ u64 newsize;
- rdmsr (MTRRphysMask_MSR (reg), mask_lo, mask_hi);
+ rdmsr (MSR_MTRRphysMask(reg), mask_lo, mask_hi);
if ((mask_lo & 0x800) == 0) {
/* Invalid (i.e. free) range */
*base = 0;
@@ -214,32 +196,29 @@ static void get_mtrr (unsigned int reg, unsigned long *base,
return;
}
- rdmsr (MTRRphysBase_MSR (reg), base_lo, base_hi);
+ rdmsr (MSR_MTRRphysBase(reg), base_lo, base_hi);
/* Work out the shifted address mask. */
- mask_lo = size_or_mask | mask_hi << (32 - PAGE_SHIFT)
- | mask_lo >> PAGE_SHIFT;
-
- /* This works correctly if size is a power of two, i.e. a
- contiguous range. */
- *size = -mask_lo;
+ newsize = (u64) mask_hi << 32 | (mask_lo & ~0x800);
+ newsize = ~newsize+1;
+ *size = (u32) newsize >> PAGE_SHIFT;
*base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT;
*type = base_lo & 0xff;
}
-static void set_mtrr_up (unsigned int reg, unsigned long base,
- unsigned long size, mtrr_type type, int do_safe)
-/* [SUMMARY] Set variable MTRR register on the local CPU.
- <reg> The register to set.
- <base> The base address of the region.
- <size> The size of the region. If this is 0 the region is disabled.
- <type> The type of the region.
- <do_safe> If TRUE, do the change safely. If FALSE, safety measures should
- be done externally.
- [RETURNS] Nothing.
-*/
+/*
+ * Set variable MTRR register on the local CPU.
+ * <reg> The register to set.
+ * <base> The base address of the region.
+ * <size> The size of the region. If this is 0 the region is disabled.
+ * <type> The type of the region.
+ * <do_safe> If TRUE, do the change safely. If FALSE, safety measures should
+ * be done externally.
+ */
+static void set_mtrr_up (unsigned int reg, u64 base,
+ u32 size, mtrr_type type, int do_safe)
{
struct set_mtrr_context ctxt;
@@ -249,12 +228,12 @@ static void set_mtrr_up (unsigned int reg, unsigned long base,
if (size == 0) {
/* The invalid bit is kept in the mask, so we simply clear the
relevant mask register to disable a range. */
- wrmsr (MTRRphysMask_MSR (reg), 0, 0);
+ wrmsr (MSR_MTRRphysMask(reg), 0, 0);
} else {
- wrmsr (MTRRphysBase_MSR (reg), base << PAGE_SHIFT | type,
+ wrmsr (MSR_MTRRphysBase(reg), base << PAGE_SHIFT | type,
(base & size_and_mask) >> (32 - PAGE_SHIFT));
- wrmsr (MTRRphysMask_MSR (reg), -size << PAGE_SHIFT | 0x800,
- (-size & size_and_mask) >> (32 - PAGE_SHIFT));
+ wrmsr (MSR_MTRRphysMask(reg), (-size-1) << PAGE_SHIFT | 0x800,
+ ((-size-1) & size_and_mask) >> (32 - PAGE_SHIFT));
}
if (do_safe)
set_mtrr_done (&ctxt);
@@ -264,41 +243,40 @@ static void set_mtrr_up (unsigned int reg, unsigned long base,
#ifdef CONFIG_SMP
struct mtrr_var_range {
- unsigned long base_lo;
- unsigned long base_hi;
- unsigned long mask_lo;
- unsigned long mask_hi;
+ u32 base_lo;
+ u32 base_hi;
+ u32 mask_lo;
+ u32 mask_hi;
};
/* Get the MSR pair relating to a var range */
static void __init get_mtrr_var_range (unsigned int index,
struct mtrr_var_range *vr)
{
- rdmsr (MTRRphysBase_MSR (index), vr->base_lo, vr->base_hi);
- rdmsr (MTRRphysMask_MSR (index), vr->mask_lo, vr->mask_hi);
+ rdmsr (MSR_MTRRphysBase(index), vr->base_lo, vr->base_hi);
+ rdmsr (MSR_MTRRphysMask(index), vr->mask_lo, vr->mask_hi);
}
/* Set the MSR pair relating to a var range. Returns TRUE if
changes are made */
-static int __init
-set_mtrr_var_range_testing (unsigned int index, struct mtrr_var_range *vr)
+static int __init set_mtrr_var_range_testing (unsigned int index,
+ struct mtrr_var_range *vr)
{
- unsigned int lo, hi;
+ u32 lo, hi;
int changed = FALSE;
- rdmsr (MTRRphysBase_MSR (index), lo, hi);
- if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL)
- || (vr->base_hi & 0xfUL) != (hi & 0xfUL)) {
- wrmsr (MTRRphysBase_MSR (index), vr->base_lo, vr->base_hi);
+ rdmsr (MSR_MTRRphysBase(index), lo, hi);
+ if ((vr->base_lo & 0xfffff0ff) != (lo & 0xfffff0ff)
+ || (vr->base_hi & 0x000fffff) != (hi & 0x000fffff)) {
+ wrmsr (MSR_MTRRphysBase(index), vr->base_lo, vr->base_hi);
changed = TRUE;
}
- rdmsr (MTRRphysMask_MSR (index), lo, hi);
-
- if ((vr->mask_lo & 0xfffff800UL) != (lo & 0xfffff800UL)
- || (vr->mask_hi & 0xfUL) != (hi & 0xfUL)) {
- wrmsr (MTRRphysMask_MSR (index), vr->mask_lo, vr->mask_hi);
+ rdmsr (MSR_MTRRphysMask(index), lo, hi);
+ if ((vr->mask_lo & 0xfffff800) != (lo & 0xfffff800)
+ || (vr->mask_hi & 0x000fffff) != (hi & 0x000fffff)) {
+ wrmsr (MSR_MTRRphysMask(index), vr->mask_lo, vr->mask_hi);
changed = TRUE;
}
return changed;
@@ -307,45 +285,50 @@ set_mtrr_var_range_testing (unsigned int index, struct mtrr_var_range *vr)
static void __init get_fixed_ranges (mtrr_type * frs)
{
- unsigned long *p = (unsigned long *) frs;
+ u32 *p = (u32 *) frs;
int i;
- rdmsr (MTRRfix64K_00000_MSR, p[0], p[1]);
+ rdmsr (MSR_MTRRfix64K_00000, p[0], p[1]);
for (i = 0; i < 2; i++)
- rdmsr (MTRRfix16K_80000_MSR + i, p[2 + i * 2], p[3 + i * 2]);
+ rdmsr (MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]);
for (i = 0; i < 8; i++)
- rdmsr (MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]);
+ rdmsr (MSR_MTRRfix4K_C0000 + i, p[6 + i * 2], p[7 + i * 2]);
}
static int __init set_fixed_ranges_testing (mtrr_type * frs)
{
- unsigned long *p = (unsigned long *) frs;
+ u32 *p = (u32 *) frs;
int changed = FALSE;
int i;
- unsigned long lo, hi;
+ u32 lo, hi;
- rdmsr (MTRRfix64K_00000_MSR, lo, hi);
+ printk (KERN_INFO "mtrr: rdmsr 64K_00000\n");
+ rdmsr (MSR_MTRRfix64K_00000, lo, hi);
if (p[0] != lo || p[1] != hi) {
- wrmsr (MTRRfix64K_00000_MSR, p[0], p[1]);
+ printk (KERN_INFO "mtrr: Writing %x:%x to 64K MSR. lohi were %x:%x\n", p[0], p[1], lo, hi);
+ wrmsr (MSR_MTRRfix64K_00000, p[0], p[1]);
changed = TRUE;
}
+ printk (KERN_INFO "mtrr: rdmsr 16K_80000\n");
for (i = 0; i < 2; i++) {
- rdmsr (MTRRfix16K_80000_MSR + i, lo, hi);
+ rdmsr (MSR_MTRRfix16K_80000 + i, lo, hi);
if (p[2 + i * 2] != lo || p[3 + i * 2] != hi) {
- wrmsr (MTRRfix16K_80000_MSR + i, p[2 + i * 2],
- p[3 + i * 2]);
+ printk (KERN_INFO "mtrr: Writing %x:%x to 16K MSR%d. lohi were %x:%x\n", p[2 + i * 2], p[3 + i * 2], i, lo, hi );
+ wrmsr (MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]);
changed = TRUE;
}
}
+ printk (KERN_INFO "mtrr: rdmsr 4K_C0000\n");
for (i = 0; i < 8; i++) {
- rdmsr (MTRRfix4K_C0000_MSR + i, lo, hi);
+ rdmsr (MSR_MTRRfix4K_C0000 + i, lo, hi);
+ printk (KERN_INFO "mtrr: MTRRfix4K_C0000+%d = %x:%x\n", i, lo, hi);
if (p[6 + i * 2] != lo || p[7 + i * 2] != hi) {
- wrmsr (MTRRfix4K_C0000_MSR + i, p[6 + i * 2],
- p[7 + i * 2]);
+ printk (KERN_INFO "mtrr: Writing %x:%x to 4K MSR%d. lohi were %x:%x\n", p[6 + i * 2], p[7 + i * 2], i, lo, hi);
+ wrmsr (MSR_MTRRfix4K_C0000 + i, p[6 + i * 2], p[7 + i * 2]);
changed = TRUE;
}
}
@@ -357,8 +340,8 @@ struct mtrr_state {
unsigned int num_var_ranges;
struct mtrr_var_range *var_ranges;
mtrr_type fixed_ranges[NUM_FIXED_RANGES];
- unsigned char enabled;
mtrr_type def_type;
+ unsigned char enabled;
};
@@ -367,9 +350,9 @@ static void __init get_mtrr_state (struct mtrr_state *state)
{
unsigned int nvrs, i;
struct mtrr_var_range *vrs;
- unsigned long lo, dummy;
+ u32 lo, dummy;
- nvrs = state->num_var_ranges = get_num_var_ranges ();
+ nvrs = state->num_var_ranges = get_num_var_ranges();
vrs = state->var_ranges
= kmalloc (nvrs * sizeof (struct mtrr_var_range), GFP_KERNEL);
if (vrs == NULL)
@@ -379,7 +362,7 @@ static void __init get_mtrr_state (struct mtrr_state *state)
get_mtrr_var_range (i, &vrs[i]);
get_fixed_ranges (state->fixed_ranges);
- rdmsr (MTRRdefType_MSR, lo, dummy);
+ rdmsr (MSR_MTRRdefType, lo, dummy);
state->def_type = (lo & 0xff);
state->enabled = (lo & 0xc00) >> 10;
}
@@ -393,17 +376,18 @@ static void __init finalize_mtrr_state (struct mtrr_state *state)
}
-static unsigned long __init set_mtrr_state (struct mtrr_state *state,
+/*
+ * Set the MTRR state for this CPU.
+ * <state> The MTRR state information to read.
+ * <ctxt> Some relevant CPU context.
+ * [NOTE] The CPU must already be in a safe state for MTRR changes.
+ * [RETURNS] 0 if no changes made, else a mask indicating what was changed.
+ */
+static u64 __init set_mtrr_state (struct mtrr_state *state,
struct set_mtrr_context *ctxt)
-/* [SUMMARY] Set the MTRR state for this CPU.
- <state> The MTRR state information to read.
- <ctxt> Some relevant CPU context.
- [NOTE] The CPU must already be in a safe state for MTRR changes.
- [RETURNS] 0 if no changes made, else a mask indication what was changed.
-*/
{
unsigned int i;
- unsigned long change_mask = 0;
+ u64 change_mask = 0;
for (i = 0; i < state->num_var_ranges; i++)
if (set_mtrr_var_range_testing (i, &state->var_ranges[i]))
@@ -428,16 +412,16 @@ static volatile int wait_barrier_execute = FALSE;
static volatile int wait_barrier_cache_enable = FALSE;
struct set_mtrr_data {
- unsigned long smp_base;
- unsigned long smp_size;
+ u64 smp_base;
+ u32 smp_size;
unsigned int smp_reg;
mtrr_type smp_type;
};
+/*
+ * Synchronisation handler. Executed by "other" CPUs.
+ */
static void ipi_handler (void *info)
-/* [SUMMARY] Synchronisation handler. Executed by "other" CPUs.
- [RETURNS] Nothing.
-*/
{
struct set_mtrr_data *data = info;
struct set_mtrr_context ctxt;
@@ -449,7 +433,7 @@ static void ipi_handler (void *info)
barrier ();
/* The master has cleared me to execute */
- (*set_mtrr_up) (data->smp_reg, data->smp_base, data->smp_size,
+ set_mtrr_up (data->smp_reg, data->smp_base, data->smp_size,
data->smp_type, FALSE);
/* Notify master CPU that I've executed the function */
@@ -462,8 +446,7 @@ static void ipi_handler (void *info)
}
-static void set_mtrr_smp (unsigned int reg, unsigned long base,
- unsigned long size, mtrr_type type)
+static void set_mtrr_smp (unsigned int reg, u64 base, u32 size, mtrr_type type)
{
struct set_mtrr_data data;
struct set_mtrr_context ctxt;
@@ -490,7 +473,7 @@ static void set_mtrr_smp (unsigned int reg, unsigned long base,
/* Set up for completion wait and then release other CPUs to change MTRRs */
atomic_set (&undone_count, smp_num_cpus - 1);
wait_barrier_execute = FALSE;
- (*set_mtrr_up) (reg, base, size, type, FALSE);
+ set_mtrr_up (reg, base, size, type, FALSE);
/* Now wait for other CPUs to complete the function */
while (atomic_read (&undone_count) > 0)
@@ -505,7 +488,7 @@ static void set_mtrr_smp (unsigned int reg, unsigned long base,
/* Some BIOS's are fucked and don't set all MTRRs the same! */
-static void __init mtrr_state_warn (unsigned long mask)
+static void __init mtrr_state_warn (u32 mask)
{
if (!mask)
return;
@@ -521,7 +504,7 @@ static void __init mtrr_state_warn (unsigned long mask)
#endif /* CONFIG_SMP */
-static char inline * attrib_to_str (int x)
+static inline char * attrib_to_str (int x)
{
return (x <= 6) ? mtrr_strings[x] : "?";
}
@@ -551,21 +534,20 @@ static void __init init_table (void)
}
-static int generic_get_free_region (unsigned long base,
- unsigned long size)
-/* [SUMMARY] Get a free MTRR.
- <base> The starting (base) address of the region.
- <size> The size (in bytes) of the region.
- [RETURNS] The index of the region on success, else -1 on error.
+/*
+ * Get a free MTRR.
+ * returns the index of the region on success, else -1 on error.
*/
+static int get_free_region(void)
{
int i, max;
mtrr_type ltype;
- unsigned long lbase, lsize;
+ u64 lbase;
+ u32 lsize;
max = get_num_var_ranges ();
for (i = 0; i < max; ++i) {
- (*get_mtrr) (i, &lbase, &lsize, &ltype);
+ get_mtrr (i, &lbase, &lsize, &ltype);
if (lsize == 0)
return i;
}
@@ -573,22 +555,19 @@ static int generic_get_free_region (unsigned long base,
}
-static int (*get_free_region) (unsigned long base,
- unsigned long size) = generic_get_free_region;
-
/**
* mtrr_add_page - Add a memory type region
* @base: Physical base address of region in pages (4 KB)
* @size: Physical size of region in pages (4 KB)
* @type: Type of MTRR desired
* @increment: If this is true do usage counting on the region
+ * Returns the MTRR register on success, else a negative number
+ * indicating the error code.
*
- * Memory type region registers control the caching on newer Intel and
- * non Intel processors. This function allows drivers to request an
- * MTRR is added. The details and hardware specifics of each processor's
- * implementation are hidden from the caller, but nevertheless the
- * caller should expect to need to provide a power of two size on an
- * equivalent power of two boundary.
+ * Memory type region registers control the caching on newer
+ * processors. This function allows drivers to request an MTRR is added.
+ * The caller should expect to need to provide a power of two size on
+ * an equivalent power of two boundary.
*
* If the region cannot be added either because all regions are in use
* or the CPU cannot support it a negative value is returned. On success
@@ -596,42 +575,28 @@ static int (*get_free_region) (unsigned long base,
* as a cookie only.
*
* On a multiprocessor machine the changes are made to all processors.
- * This is required on x86 by the Intel processors.
*
* The available types are
*
* %MTRR_TYPE_UNCACHABLE - No caching
- *
* %MTRR_TYPE_WRBACK - Write data back in bursts whenever
- *
* %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts
- *
* %MTRR_TYPE_WRTHROUGH - Cache reads but not writes
*
* BUGS: Needs a quiet flag for the cases where drivers do not mind
* failures and do not wish system log messages to be sent.
*/
-int mtrr_add_page (unsigned long base, unsigned long size,
- unsigned int type, char increment)
+int mtrr_add_page (u64 base, u32 size, unsigned int type, char increment)
{
-/* [SUMMARY] Add an MTRR entry.
- <base> The starting (base, in pages) address of the region.
- <size> The size of the region. (in pages)
- <type> The type of the new region.
- <increment> If true and the region already exists, the usage count will be
- incremented.
- [RETURNS] The MTRR register on success, else a negative number indicating
- the error code.
- [NOTE] This routine uses a spinlock.
-*/
int i, max;
mtrr_type ltype;
- unsigned long lbase, lsize, last;
+ u64 lbase, last;
+ u32 lsize;
if (base + size < 0x100) {
printk (KERN_WARNING
- "mtrr: cannot set region below 1 MiB (0x%lx000,0x%lx000)\n",
+ "mtrr: cannot set region below 1 MiB (0x%lx000,0x%x000)\n",
base, size);
return -EINVAL;
}
@@ -644,7 +609,7 @@ int mtrr_add_page (unsigned long base, unsigned long size,
if (lbase != last) {
printk (KERN_WARNING
- "mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n",
+ "mtrr: base(0x%lx000) is not aligned on a size(0x%x000) boundary\n",
base, size);
return -EINVAL;
}
@@ -655,7 +620,7 @@ int mtrr_add_page (unsigned long base, unsigned long size,
}
/* If the type is WC, check that this processor supports it */
- if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb ()) {
+ if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) {
printk (KERN_WARNING
"mtrr: your processor doesn't support write-combining\n");
return -ENOSYS;
@@ -669,9 +634,9 @@ int mtrr_add_page (unsigned long base, unsigned long size,
increment = increment ? 1 : 0;
max = get_num_var_ranges ();
/* Search for existing MTRR */
- down (&main_lock);
+ down (&mtrr_lock);
for (i = 0; i < max; ++i) {
- (*get_mtrr) (i, &lbase, &lsize, &ltype);
+ get_mtrr (i, &lbase, &lsize, &ltype);
if (base >= lbase + lsize)
continue;
if ((base < lbase) && (base + size <= lbase))
@@ -679,41 +644,41 @@ int mtrr_add_page (unsigned long base, unsigned long size,
/* At this point we know there is some kind of overlap/enclosure */
if ((base < lbase) || (base + size > lbase + lsize)) {
- up (&main_lock);
+ up (&mtrr_lock);
printk (KERN_WARNING
- "mtrr: 0x%lx000,0x%lx000 overlaps existing"
- " 0x%lx000,0x%lx000\n", base, size, lbase,
- lsize);
+ "mtrr: 0x%lx000,0x%x000 overlaps existing"
+ " 0x%lx000,0x%x000\n", base, size, lbase, lsize);
return -EINVAL;
}
/* New region is enclosed by an existing region */
if (ltype != type) {
if (type == MTRR_TYPE_UNCACHABLE)
continue;
- up (&main_lock);
+ up (&mtrr_lock);
printk
- ("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n",
- base, size, attrib_to_str (ltype),
+ ("mtrr: type mismatch for %lx000,%x000 old: %s new: %s\n",
+ base, size,
+ attrib_to_str (ltype),
attrib_to_str (type));
return -EINVAL;
}
if (increment)
++usage_table[i];
compute_ascii ();
- up (&main_lock);
+ up (&mtrr_lock);
return i;
}
/* Search for an empty MTRR */
- i = (*get_free_region) (base, size);
+ i = get_free_region();
if (i < 0) {
- up (&main_lock);
+ up (&mtrr_lock);
printk ("mtrr: no more MTRRs available\n");
return i;
}
set_mtrr (i, base, size, type);
usage_table[i] = 1;
compute_ascii ();
- up (&main_lock);
+ up (&mtrr_lock);
return i;
}
@@ -724,13 +689,13 @@ int mtrr_add_page (unsigned long base, unsigned long size,
* @size: Physical size of region
* @type: Type of MTRR desired
* @increment: If this is true do usage counting on the region
+ * Returns the MTRR register on success, else a negative number
+ * indicating the error code.
*
- * Memory type region registers control the caching on newer Intel and
- * non Intel processors. This function allows drivers to request an
- * MTRR is added. The details and hardware specifics of each processor's
- * implementation are hidden from the caller, but nevertheless the
- * caller should expect to need to provide a power of two size on an
- * equivalent power of two boundary.
+ * Memory type region registers control the caching on newer processors.
+ * This function allows drivers to request an MTRR is added.
+ * The caller should expect to need to provide a power of two size on
+ * an equivalent power of two boundary.
*
* If the region cannot be added either because all regions are in use
* or the CPU cannot support it a negative value is returned. On success
@@ -743,33 +708,19 @@ int mtrr_add_page (unsigned long base, unsigned long size,
* The available types are
*
* %MTRR_TYPE_UNCACHABLE - No caching
- *
* %MTRR_TYPE_WRBACK - Write data back in bursts whenever
- *
* %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts
- *
* %MTRR_TYPE_WRTHROUGH - Cache reads but not writes
*
* BUGS: Needs a quiet flag for the cases where drivers do not mind
* failures and do not wish system log messages to be sent.
*/
-int mtrr_add (unsigned long base, unsigned long size, unsigned int type,
- char increment)
+int mtrr_add (u64 base, u32 size, unsigned int type, char increment)
{
-/* [SUMMARY] Add an MTRR entry.
- <base> The starting (base) address of the region.
- <size> The size (in bytes) of the region.
- <type> The type of the new region.
- <increment> If true and the region already exists, the usage count will be
- incremented.
- [RETURNS] The MTRR register on success, else a negative number indicating
- the error code.
-*/
-
if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
printk ("mtrr: size and base must be multiples of 4 kiB\n");
- printk ("mtrr: size: 0x%lx base: 0x%lx\n", size, base);
+ printk ("mtrr: size: 0x%x base: 0x%lx\n", size, base);
return -EINVAL;
}
return mtrr_add_page (base >> PAGE_SHIFT, size >> PAGE_SHIFT, type,
@@ -792,55 +743,46 @@ int mtrr_add (unsigned long base, unsigned long size, unsigned int type,
* code.
*/
-int mtrr_del_page (int reg, unsigned long base, unsigned long size)
-/* [SUMMARY] Delete MTRR/decrement usage count.
- <reg> The register. If this is less than 0 then <<base>> and <<size>> must
- be supplied.
- <base> The base address of the region. This is ignored if <<reg>> is >= 0.
- <size> The size of the region. This is ignored if <<reg>> is >= 0.
- [RETURNS] The register on success, else a negative number indicating
- the error code.
- [NOTE] This routine uses a spinlock.
-*/
+int mtrr_del_page (int reg, u64 base, u32 size)
{
int i, max;
mtrr_type ltype;
- unsigned long lbase, lsize;
+ u64 lbase;
+ u32 lsize;
max = get_num_var_ranges ();
- down (&main_lock);
+ down (&mtrr_lock);
if (reg < 0) {
/* Search for existing MTRR */
for (i = 0; i < max; ++i) {
- (*get_mtrr) (i, &lbase, &lsize, &ltype);
+ get_mtrr (i, &lbase, &lsize, &ltype);
if (lbase == base && lsize == size) {
reg = i;
break;
}
}
if (reg < 0) {
- up (&main_lock);
- printk ("mtrr: no MTRR for %lx000,%lx000 found\n", base,
- size);
+ up (&mtrr_lock);
+ printk ("mtrr: no MTRR for %lx000,%x000 found\n", base, size);
return -EINVAL;
}
}
if (reg >= max) {
- up (&main_lock);
+ up (&mtrr_lock);
printk ("mtrr: register: %d too big\n", reg);
return -EINVAL;
}
- (*get_mtrr) (reg, &lbase, &lsize, &ltype);
+ get_mtrr (reg, &lbase, &lsize, &ltype);
if (lsize < 1) {
- up (&main_lock);
+ up (&mtrr_lock);
printk ("mtrr: MTRR %d not used\n", reg);
return -EINVAL;
}
if (usage_table[reg] < 1) {
- up (&main_lock);
+ up (&mtrr_lock);
printk ("mtrr: reg: %d has count=0\n", reg);
return -EINVAL;
}
@@ -848,7 +790,7 @@ int mtrr_del_page (int reg, unsigned long base, unsigned long size)
if (--usage_table[reg] < 1)
set_mtrr (reg, 0, 0, 0);
compute_ascii ();
- up (&main_lock);
+ up (&mtrr_lock);
return reg;
}
@@ -868,19 +810,11 @@ int mtrr_del_page (int reg, unsigned long base, unsigned long size)
* code.
*/
-int mtrr_del (int reg, unsigned long base, unsigned long size)
-/* [SUMMARY] Delete MTRR/decrement usage count.
- <reg> The register. If this is less than 0 then <<base>> and <<size>> must
- be supplied.
- <base> The base address of the region. This is ignored if <<reg>> is >= 0.
- <size> The size of the region. This is ignored if <<reg>> is >= 0.
- [RETURNS] The register on success, else a negative number indicating
- the error code.
-*/
+int mtrr_del (int reg, u64 base, u32 size)
{
if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
printk ("mtrr: size and base must be multiples of 4 kiB\n");
- printk ("mtrr: size: 0x%lx base: 0x%lx\n", size, base);
+ printk ("mtrr: size: 0x%x base: 0x%lx\n", size, base);
return -EINVAL;
}
return mtrr_del_page (reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT);
@@ -889,8 +823,8 @@ int mtrr_del (int reg, unsigned long base, unsigned long size)
#ifdef USERSPACE_INTERFACE
-static int mtrr_file_add (unsigned long base, unsigned long size,
- unsigned int type, char increment, struct file *file, int page)
+static int mtrr_file_add (u64 base, u32 size, unsigned int type,
+ struct file *file, int page)
{
int reg, max;
unsigned int *fcount = file->private_data;
@@ -910,7 +844,7 @@ static int mtrr_file_add (unsigned long base, unsigned long size,
if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
printk
("mtrr: size and base must be multiples of 4 kiB\n");
- printk ("mtrr: size: 0x%lx base: 0x%lx\n", size, base);
+ printk ("mtrr: size: 0x%x base: 0x%lx\n", size, base);
return -EINVAL;
}
base >>= PAGE_SHIFT;
@@ -925,7 +859,7 @@ static int mtrr_file_add (unsigned long base, unsigned long size,
}
-static int mtrr_file_del (unsigned long base, unsigned long size,
+static int mtrr_file_del (u64 base, u32 size,
struct file *file, int page)
{
int reg;
@@ -935,7 +869,7 @@ static int mtrr_file_del (unsigned long base, unsigned long size,
if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) {
printk
("mtrr: size and base must be multiples of 4 kiB\n");
- printk ("mtrr: size: 0x%lx base: 0x%lx\n", size, base);
+ printk ("mtrr: size: 0x%x base: 0x%lx\n", size, base);
return -EINVAL;
}
base >>= PAGE_SHIFT;
@@ -977,9 +911,9 @@ static ssize_t mtrr_write (struct file *file, const char *buf,
"disable=%d"
*/
{
- int i, err;
- unsigned long reg;
- unsigned long long base, size;
+ int i, err, reg;
+ u64 base;
+ u32 size;
char *ptr;
char line[LINE_SIZE];
@@ -1027,7 +961,7 @@ static ssize_t mtrr_write (struct file *file, const char *buf,
if ((base & 0xfff) || (size & 0xfff)) {
printk ("mtrr: size and base must be multiples of 4 kiB\n");
- printk ("mtrr: size: 0x%Lx base: 0x%Lx\n", size, base);
+ printk ("mtrr: size: 0x%x base: 0x%lx\n", size, base);
return -EINVAL;
}
@@ -1046,9 +980,7 @@ static ssize_t mtrr_write (struct file *file, const char *buf,
continue;
base >>= PAGE_SHIFT;
size >>= PAGE_SHIFT;
- err =
- mtrr_add_page ((unsigned long) base, (unsigned long) size,
- i, 1);
+ err = mtrr_add_page ((u64) base, size, i, 1);
if (err < 0)
return err;
return len;
@@ -1076,7 +1008,7 @@ static int mtrr_ioctl (struct inode *inode, struct file *file,
if (copy_from_user (&sentry, (void *) arg, sizeof sentry))
return -EFAULT;
err =
- mtrr_file_add (sentry.base, sentry.size, sentry.type, 1,
+ mtrr_file_add (sentry.base, sentry.size, sentry.type,
file, 0);
if (err < 0)
return err;
@@ -1117,7 +1049,7 @@ static int mtrr_ioctl (struct inode *inode, struct file *file,
return -EFAULT;
if (gentry.regnum >= get_num_var_ranges ())
return -EINVAL;
- (*get_mtrr) (gentry.regnum, &gentry.base, &gentry.size, &type);
+ get_mtrr (gentry.regnum, &gentry.base, &gentry.size, &type);
/* Hide entries that go above 4GB */
if (gentry.base + gentry.size > 0x100000
@@ -1139,7 +1071,7 @@ static int mtrr_ioctl (struct inode *inode, struct file *file,
if (copy_from_user (&sentry, (void *) arg, sizeof sentry))
return -EFAULT;
err =
- mtrr_file_add (sentry.base, sentry.size, sentry.type, 1,
+ mtrr_file_add (sentry.base, sentry.size, sentry.type,
file, 1);
if (err < 0)
return err;
@@ -1180,7 +1112,7 @@ static int mtrr_ioctl (struct inode *inode, struct file *file,
return -EFAULT;
if (gentry.regnum >= get_num_var_ranges ())
return -EINVAL;
- (*get_mtrr) (gentry.regnum, &gentry.base, &gentry.size, &type);
+ get_mtrr (gentry.regnum, &gentry.base, &gentry.size, &type);
gentry.type = type;
if (copy_to_user ((void *) arg, &gentry, sizeof gentry))
@@ -1199,7 +1131,6 @@ static int mtrr_close (struct inode *ino, struct file *file)
if (fcount == NULL)
return 0;
- lock_kernel ();
max = get_num_var_ranges ();
for (i = 0; i < max; ++i) {
while (fcount[i] > 0) {
@@ -1208,7 +1139,6 @@ static int mtrr_close (struct inode *ino, struct file *file)
--fcount[i];
}
}
- unlock_kernel ();
kfree (fcount);
file->private_data = NULL;
return 0;
@@ -1234,12 +1164,13 @@ static void compute_ascii (void)
char factor;
int i, max;
mtrr_type type;
- unsigned long base, size;
+ u64 base;
+ u32 size;
ascii_buf_bytes = 0;
max = get_num_var_ranges ();
for (i = 0; i < max; i++) {
- (*get_mtrr) (i, &base, &size, &type);
+ get_mtrr (i, &base, &size, &type);
if (size == 0)
usage_table[i] = 0;
else {
@@ -1253,11 +1184,10 @@ static void compute_ascii (void)
}
sprintf
(ascii_buffer + ascii_buf_bytes,
- "reg%02i: base=0x%05lx000 (%4liMB), size=%4li%cB: %s, count=%d\n",
+ "reg%02i: base=0x%05lx000 (%4liMB), size=%4i%cB: %s, count=%d\n",
i, base, base >> (20 - PAGE_SHIFT), size, factor,
attrib_to_str (type), usage_table[i]);
- ascii_buf_bytes +=
- strlen (ascii_buffer + ascii_buf_bytes);
+ ascii_buf_bytes += strlen (ascii_buffer + ascii_buf_bytes);
}
}
devfs_set_file_size (devfs_handle, ascii_buf_bytes);
@@ -1283,22 +1213,16 @@ static void __init mtrr_setup (void)
if ((cpuid_eax (0x80000000) >= 0x80000008)) {
u32 phys_addr;
phys_addr = cpuid_eax (0x80000008) & 0xff;
- size_or_mask =
- ~((1 << (phys_addr - PAGE_SHIFT)) - 1);
- size_and_mask = ~size_or_mask & 0xfff00000;
- } else {
- /* FIXME: This is to make it work on Athlon during debugging. */
- size_or_mask = 0xff000000; /* 36 bits */
- size_and_mask = 0x00f00000;
+ size_or_mask = ~((1 << (phys_addr - PAGE_SHIFT)) - 1);
+ size_and_mask = ~size_or_mask & 0xfffffffffff00000;
}
-
printk ("mtrr: detected mtrr type: x86-64\n");
}
}
#ifdef CONFIG_SMP
-static volatile unsigned long smp_changes_mask __initdata = 0;
+static volatile u32 smp_changes_mask __initdata = 0;
static struct mtrr_state smp_mtrr_state __initdata = { 0, 0 };
void __init mtrr_init_boot_cpu (void)
@@ -1310,7 +1234,8 @@ void __init mtrr_init_boot_cpu (void)
void __init mtrr_init_secondary_cpu (void)
{
- unsigned long mask, count;
+ u64 mask;
+ int count;
struct set_mtrr_context ctxt;
/* Note that this is not ideal, since the cache is only flushed/disabled
@@ -1357,4 +1282,3 @@ int __init mtrr_init (void)
init_table ();
return 0;
}
-
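With the reworked prototypes (mtrr_add() now takes a u64 byte base and u32 byte size), an in-kernel user such as a framebuffer driver would request write-combining roughly as in this sketch; the symbol names and the idea of caching the returned register are placeholders, not part of this patch:

#include <linux/kernel.h>
#include <asm/mtrr.h>

static int example_mtrr_reg = -1;

static void example_enable_wc(u64 base, u32 len)
{
        /* base and len must be 4 KiB multiples; returns the MTRR register or an error. */
        example_mtrr_reg = mtrr_add(base, len, MTRR_TYPE_WRCOMB, 1);
        if (example_mtrr_reg < 0)
                printk(KERN_INFO "example: no MTRR available, running uncached\n");
}

static void example_disable_wc(u64 base, u32 len)
{
        if (example_mtrr_reg >= 0)
                mtrr_del(example_mtrr_reg, base, len);
}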
diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c
index e233b3557ce5..f00fff0638de 100644
--- a/arch/x86_64/kernel/process.c
+++ b/arch/x86_64/kernel/process.c
@@ -39,6 +39,7 @@
#include <linux/reboot.h>
#include <linux/init.h>
#include <linux/ctype.h>
+#include <linux/slab.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
@@ -320,9 +321,6 @@ void show_regs(struct pt_regs * regs)
printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4);
}
-#define __STR(x) #x
-#define __STR2(x) __STR(x)
-
extern void load_gs_index(unsigned);
/*
@@ -330,7 +328,13 @@ extern void load_gs_index(unsigned);
*/
void exit_thread(void)
{
- /* nothing to do ... */
+ struct task_struct *me = current;
+ if (me->thread.io_bitmap_ptr) {
+ kfree(me->thread.io_bitmap_ptr);
+ me->thread.io_bitmap_ptr = NULL;
+ (init_tss + smp_processor_id())->io_map_base =
+ INVALID_IO_BITMAP_OFFSET;
+ }
}
void flush_thread(void)
@@ -392,6 +396,14 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long rsp,
unlazy_fpu(current);
p->thread.i387 = current->thread.i387;
+ if (unlikely(me->thread.io_bitmap_ptr != NULL)) {
+ p->thread.io_bitmap_ptr = kmalloc((IO_BITMAP_SIZE+1)*4, GFP_KERNEL);
+ if (!p->thread.io_bitmap_ptr)
+ return -ENOMEM;
+ memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr,
+ (IO_BITMAP_SIZE+1)*4);
+ }
+
return 0;
}
@@ -491,21 +503,14 @@ void __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/*
* Handle the IO bitmap
*/
- if (unlikely(prev->ioperm || next->ioperm)) {
- if (next->ioperm) {
+ if (unlikely(prev->io_bitmap_ptr || next->io_bitmap_ptr)) {
+ if (next->io_bitmap_ptr) {
/*
* 4 cachelines copy ... not good, but not that
* bad either. Anyone got something better?
* This only affects processes which use ioperm().
- * [Putting the TSSs into 4k-tlb mapped regions
- * and playing VM tricks to switch the IO bitmap
- * is not really acceptable.]
- * On x86-64 we could put multiple bitmaps into
- * the GDT and just switch offsets
- * This would require ugly special cases on overflow
- * though -AK
*/
- memcpy(tss->io_bitmap, next->io_bitmap,
+ memcpy(tss->io_bitmap, next->io_bitmap_ptr,
IO_BITMAP_SIZE*sizeof(u32));
tss->io_map_base = IO_BITMAP_OFFSET;
} else {
diff --git a/arch/x86_64/kernel/setup64.c b/arch/x86_64/kernel/setup64.c
index f6c296dce4b5..66ae787c8d19 100644
--- a/arch/x86_64/kernel/setup64.c
+++ b/arch/x86_64/kernel/setup64.c
@@ -91,6 +91,9 @@ void pda_init(int cpu)
pda->me = pda;
pda->cpudata_offset = 0;
+ pda->active_mm = &init_mm;
+ pda->mmu_state = 0;
+
asm volatile("movl %0,%%fs ; movl %0,%%gs" :: "r" (0));
wrmsrl(MSR_GS_BASE, cpu_pda + cpu);
}
diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c
index 98b653afe853..229592faf805 100644
--- a/arch/x86_64/kernel/signal.c
+++ b/arch/x86_64/kernel/signal.c
@@ -84,7 +84,6 @@ struct rt_sigframe
char *pretcode;
struct ucontext uc;
struct siginfo info;
- struct _fpstate fpstate;
};
static int
@@ -186,8 +185,7 @@ badframe:
*/
static int
-setup_sigcontext(struct sigcontext *sc, struct _fpstate *fpstate,
- struct pt_regs *regs, unsigned long mask)
+setup_sigcontext(struct sigcontext *sc, struct pt_regs *regs, unsigned long mask)
{
int tmp, err = 0;
struct task_struct *me = current;
@@ -221,20 +219,17 @@ setup_sigcontext(struct sigcontext *sc, struct _fpstate *fpstate,
err |= __put_user(mask, &sc->oldmask);
err |= __put_user(me->thread.cr2, &sc->cr2);
- tmp = save_i387(fpstate);
- if (tmp < 0)
- err = 1;
- else
- err |= __put_user(tmp ? fpstate : NULL, &sc->fpstate);
-
return err;
}
/*
* Determine which stack to use..
*/
-static inline struct rt_sigframe *
-get_sigframe(struct k_sigaction *ka, struct pt_regs * regs)
+
+#define round_down(p, r) ((void *) ((unsigned long)((p) - (r) + 1) & ~((r)-1)))
+
+static void *
+get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
{
unsigned long rsp;
@@ -247,22 +242,34 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs * regs)
rsp = current->sas_ss_sp + current->sas_ss_size;
}
- rsp = (rsp - sizeof(struct _fpstate)) & ~(15UL);
- rsp -= offsetof(struct rt_sigframe, fpstate);
-
- return (struct rt_sigframe *) rsp;
+ return round_down(rsp - size, 16);
}
static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
sigset_t *set, struct pt_regs * regs)
{
- struct rt_sigframe *frame;
+ struct rt_sigframe *frame = NULL;
+ struct _fpstate *fp = NULL;
int err = 0;
- frame = get_sigframe(ka, regs);
+ if (current->used_math) {
+ fp = get_stack(ka, regs, sizeof(struct _fpstate));
+ frame = round_down((char *)fp - sizeof(struct rt_sigframe), 16) - 8;
- if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
+ if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate))) {
goto give_sigsegv;
+ }
+
+ if (save_i387(fp) < 0)
+ err |= -1;
+ }
+
+ if (!frame)
+ frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8;
+
+ if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) {
+ goto give_sigsegv;
+ }
if (ka->sa.sa_flags & SA_SIGINFO) {
err |= copy_siginfo_to_user(&frame->info, info);
@@ -278,14 +285,10 @@ static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
err |= __put_user(sas_ss_flags(regs->rsp),
&frame->uc.uc_stack.ss_flags);
err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
- err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate,
- regs, set->sig[0]);
+ err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0]);
+ err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate);
err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
- if (err) {
- goto give_sigsegv;
- }
-
/* Set up to return from userspace. If provided, use a stub
already in userspace. */
/* x86-64 should always use SA_RESTORER. */
@@ -297,7 +300,6 @@ static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
}
if (err) {
- printk("fault 3\n");
goto give_sigsegv;
}
@@ -305,7 +307,6 @@ static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
printk("%d old rip %lx old rsp %lx old rax %lx\n", current->pid,regs->rip,regs->rsp,regs->rax);
#endif
-
/* Set up registers for signal handler */
{
struct exec_domain *ed = current_thread_info()->exec_domain;
@@ -320,9 +321,10 @@ static void setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
next argument after the signal number on the stack. */
regs->rsi = (unsigned long)&frame->info;
regs->rdx = (unsigned long)&frame->uc;
- regs->rsp = (unsigned long) frame;
regs->rip = (unsigned long) ka->sa.sa_handler;
+ regs->rsp = (unsigned long)frame;
+
set_fs(USER_DS);
regs->eflags &= ~TF_MASK;
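From userspace, the frame built by setup_rt_frame() is what an SA_SIGINFO handler receives as its second and third arguments; a small sketch that registers such a handler and inspects the delivered siginfo:

#include <signal.h>
#include <stdio.h>
#include <string.h>

static volatile sig_atomic_t seen_code;

static void handler(int sig, siginfo_t *info, void *ucontext)
{
        /* info and ucontext point into the rt_sigframe laid out on the user stack. */
        (void)sig;
        (void)ucontext;
        seen_code = info->si_code;
}

int main(void)
{
        struct sigaction sa;

        memset(&sa, 0, sizeof(sa));
        sa.sa_sigaction = handler;
        sa.sa_flags = SA_SIGINFO;
        sigemptyset(&sa.sa_mask);
        sigaction(SIGUSR1, &sa, NULL);
        raise(SIGUSR1);
        printf("si_code seen by handler: %d\n", (int)seen_code);
        return 0;
}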
diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c
index 3d6e8a406b54..f0d99edfec0e 100644
--- a/arch/x86_64/kernel/smp.c
+++ b/arch/x86_64/kernel/smp.c
@@ -25,8 +25,6 @@
/* The 'big kernel lock' */
spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
-struct tlb_state cpu_tlbstate[NR_CPUS] = {[0 ... NR_CPUS-1] = { &init_mm, 0 }};
-
/*
* the following functions deal with sending IPIs between CPUs.
*
@@ -147,9 +145,9 @@ static spinlock_t tlbstate_lock = SPIN_LOCK_UNLOCKED;
*/
static void inline leave_mm (unsigned long cpu)
{
- if (cpu_tlbstate[cpu].state == TLBSTATE_OK)
+ if (read_pda(mmu_state) == TLBSTATE_OK)
BUG();
- clear_bit(cpu, &cpu_tlbstate[cpu].active_mm->cpu_vm_mask);
+ clear_bit(cpu, &read_pda(active_mm)->cpu_vm_mask);
__flush_tlb();
}
@@ -164,18 +162,18 @@ static void inline leave_mm (unsigned long cpu)
 * the other cpus, but smp_invalidate_interrupt ignores flush ipis
 * for the wrong mm, and in the worst case we perform a superfluous
* tlb flush.
- * 1a2) set cpu_tlbstate to TLBSTATE_OK
+ * 1a2) set cpu mmu_state to TLBSTATE_OK
* Now the smp_invalidate_interrupt won't call leave_mm if cpu0
* was in lazy tlb mode.
- * 1a3) update cpu_tlbstate[].active_mm
+ * 1a3) update cpu active_mm
* Now cpu0 accepts tlb flushes for the new mm.
* 1a4) set_bit(cpu, &new_mm->cpu_vm_mask);
* Now the other cpus will send tlb flush ipis.
* 1a4) change cr3.
* 1b) thread switch without mm change
- * cpu_tlbstate[].active_mm is correct, cpu0 already handles
+ * cpu active_mm is correct, cpu0 already handles
* flush ipis.
- * 1b1) set cpu_tlbstate to TLBSTATE_OK
+ * 1b1) set cpu mmu_state to TLBSTATE_OK
* 1b2) test_and_set the cpu bit in cpu_vm_mask.
* Atomically set the bit [other cpus will start sending flush ipis],
* and test the bit.
@@ -188,7 +186,7 @@ static void inline leave_mm (unsigned long cpu)
* runs in kernel space, the cpu could load tlb entries for user space
* pages.
*
- * The good news is that cpu_tlbstate is local to each cpu, no
+ * The good news is that cpu mmu_state is local to each cpu, no
* write/read ordering problems.
*/
@@ -216,8 +214,8 @@ asmlinkage void smp_invalidate_interrupt (void)
* BUG();
*/
- if (flush_mm == cpu_tlbstate[cpu].active_mm) {
- if (cpu_tlbstate[cpu].state == TLBSTATE_OK) {
+ if (flush_mm == read_pda(active_mm)) {
+ if (read_pda(mmu_state) == TLBSTATE_OK) {
if (flush_va == FLUSH_ALL)
local_flush_tlb();
else
@@ -335,7 +333,7 @@ static inline void do_flush_tlb_all_local(void)
unsigned long cpu = smp_processor_id();
__flush_tlb_all();
- if (cpu_tlbstate[cpu].state == TLBSTATE_LAZY)
+ if (read_pda(mmu_state) == TLBSTATE_LAZY)
leave_mm(cpu);
}
diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c
index b292ca527a8a..e576e9f98ec5 100644
--- a/arch/x86_64/kernel/vsyscall.c
+++ b/arch/x86_64/kernel/vsyscall.c
@@ -47,7 +47,7 @@
#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
-#define NO_VSYSCALL 1
+//#define NO_VSYSCALL 1
#ifdef NO_VSYSCALL
#include <asm/unistd.h>
diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c
index 9d88edb5c62d..2bbb7d8238b5 100644
--- a/arch/x86_64/kernel/x8664_ksyms.c
+++ b/arch/x86_64/kernel/x8664_ksyms.c
@@ -189,3 +189,5 @@ EXPORT_SYMBOL_NOVERS(do_softirq_thunk);
void out_of_line_bug(void);
EXPORT_SYMBOL(out_of_line_bug);
+
+EXPORT_SYMBOL(init_level4_pgt);
diff --git a/arch/x86_64/lib/Makefile b/arch/x86_64/lib/Makefile
index 8fbcee522aeb..6791678212ed 100644
--- a/arch/x86_64/lib/Makefile
+++ b/arch/x86_64/lib/Makefile
@@ -12,7 +12,7 @@ obj-y = csum-partial.o csum-copy.o csum-wrappers.o delay.o \
thunk.o io.o clear_page.o copy_page.o
obj-y += memcpy.o
obj-y += memmove.o
-#obj-y += memset.o
+obj-y += memset.o
obj-y += copy_user.o
export-objs := io.o csum-wrappers.o csum-partial.o
diff --git a/arch/x86_64/lib/memset.S b/arch/x86_64/lib/memset.S
index 1c5d73cd73b8..44ce1223d832 100644
--- a/arch/x86_64/lib/memset.S
+++ b/arch/x86_64/lib/memset.S
@@ -1,6 +1,4 @@
-/* Copyright 2002 Andi Kleen, SuSE Labs */
-
- // #define FIX_ALIGNMENT 1
+/* Copyright 2002 Andi Kleen */
/*
* ISO C memset - set a memory block to a byte value.
@@ -11,51 +9,51 @@
*
* rax original destination
*/
- .globl ____memset
+ .globl __memset
+ .globl memset
.p2align
-____memset:
- movq %rdi,%r10 /* save destination for return address */
- movq %rdx,%r11 /* save count */
+memset:
+__memset:
+ movq %rdi,%r10
+ movq %rdx,%r11
/* expand byte value */
- movzbl %sil,%ecx /* zero extend char value */
- movabs $0x0101010101010101,%rax /* expansion pattern */
- mul %rcx /* expand with rax, clobbers rdx */
+ movzbl %sil,%ecx
+ movabs $0x0101010101010101,%rax
+ mul %rcx /* with rax, clobbers rdx */
-#ifdef FIX_ALIGNMENT
/* align dst */
movl %edi,%r9d
- andl $7,%r9d /* test unaligned bits */
+ andl $7,%r9d
jnz bad_alignment
after_bad_alignment:
-#endif
- movq %r11,%rcx /* restore count */
- shrq $6,%rcx /* divide by 64 */
- jz handle_tail /* block smaller than 64 bytes? */
- movl $64,%r8d /* CSE loop block size */
+ movq %r11,%rcx
+ movl $64,%r8d
+ shrq $6,%rcx
+ jz handle_tail
loop_64:
- movnti %rax,0*8(%rdi)
- movnti %rax,1*8(%rdi)
- movnti %rax,2*8(%rdi)
- movnti %rax,3*8(%rdi)
- movnti %rax,4*8(%rdi)
- movnti %rax,5*8(%rdi)
- movnti %rax,6*8(%rdi)
- movnti %rax,7*8(%rdi) /* clear 64 byte blocks */
- addq %r8,%rdi /* increase pointer by 64 bytes */
- loop loop_64 /* decrement rcx and if not zero loop */
+ movnti %rax,(%rdi)
+ movnti %rax,8(%rdi)
+ movnti %rax,16(%rdi)
+ movnti %rax,24(%rdi)
+ movnti %rax,32(%rdi)
+ movnti %rax,40(%rdi)
+ movnti %rax,48(%rdi)
+ movnti %rax,56(%rdi)
+ addq %r8,%rdi
+ loop loop_64
/* Handle tail in loops. The loops should be faster than hard
to predict jump tables. */
handle_tail:
movl %r11d,%ecx
- andl $63,%ecx
- shrl $3,%ecx
+ andl $63&(~7),%ecx
jz handle_7
+ shrl $3,%ecx
loop_8:
- movnti %rax,(%rdi) /* long words */
+ movnti %rax,(%rdi)
addq $8,%rdi
loop loop_8
@@ -64,22 +62,20 @@ handle_7:
andl $7,%ecx
jz ende
loop_1:
- movb %al,(%rdi) /* bytes */
- incq %rdi
+ movb %al,(%rdi)
+ addq $1,%rdi
loop loop_1
ende:
movq %r10,%rax
ret
-#ifdef FIX_ALIGNMENT
bad_alignment:
- andq $-8,%r11 /* shorter than 8 bytes */
- jz handle_7 /* if yes handle it in the tail code */
- movnti %rax,(%rdi) /* unaligned store of 8 bytes */
+ cmpq $7,%r11
+ jbe handle_7
+ movnti %rax,(%rdi) /* unaligned store */
movq $8,%r8
- subq %r9,%r8 /* compute alignment (8-misalignment) */
- addq %r8,%rdi /* fix destination */
- subq %r8,%r11 /* fix count */
+ subq %r9,%r8
+ addq %r8,%rdi
+ subq %r8,%r11
jmp after_bad_alignment
-#endif
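
The rewritten memset.S always compiles in the alignment fixup that used to sit behind FIX_ALIGNMENT: one unaligned 8-byte store covers the head, the pointer then advances by 8 minus the misalignment, 64-byte blocks are streamed with movnti, and the remainder is finished in 8-byte and 1-byte loops. The plain-C sketch below models only that control flow, not the non-temporal stores or the exported symbol names.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Plain-C model of the control flow in the new memset.S: expand the byte
 * into a 64-bit pattern, fix up destination alignment with one unaligned
 * 8-byte store, stream 64-byte blocks, then finish with 8-byte and 1-byte
 * tails.  A sketch of the structure only, not the kernel implementation. */
void *memset_model(void *dst, int c, size_t count)
{
        unsigned char *d = dst;
        uint64_t pat = (uint64_t)(unsigned char)c * 0x0101010101010101ULL;
        size_t mis = (uintptr_t)d & 7;

        if (mis && count > 7) {
                memcpy(d, &pat, 8);          /* one unaligned 8-byte store */
                d += 8 - mis;                /* step to the next 8-byte boundary */
                count -= 8 - mis;
        }
        while (count >= 64) {                /* 64-byte blocks */
                for (int i = 0; i < 8; i++)
                        memcpy(d + 8 * i, &pat, 8);
                d += 64;
                count -= 64;
        }
        while (count >= 8) {                 /* 8-byte tail */
                memcpy(d, &pat, 8);
                d += 8;
                count -= 8;
        }
        while (count--)                      /* byte tail */
                *d++ = (unsigned char)c;
        return dst;
}
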
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index d57dc51df3f5..210449ad1715 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -28,6 +28,7 @@
#include <linux/types.h>
#include <linux/blk.h>
#include <linux/blkdev.h>
+#include <linux/bio.h>
#include <linux/completion.h>
#include <linux/delay.h>
#include <linux/genhd.h>
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 9ae961460ff2..e06fd274b653 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -30,6 +30,7 @@
#include <linux/delay.h>
#include <linux/major.h>
#include <linux/fs.h>
+#include <linux/bio.h>
#include <linux/blkpg.h>
#include <linux/timer.h>
#include <linux/proc_fs.h>
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
index 727cdeb23c0c..fccef1bb792c 100644
--- a/drivers/block/cpqarray.c
+++ b/drivers/block/cpqarray.c
@@ -24,6 +24,7 @@
#include <linux/version.h>
#include <linux/types.h>
#include <linux/pci.h>
+#include <linux/bio.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/delay.h>
diff --git a/drivers/block/elevator.c b/drivers/block/elevator.c
index 189814dbc7d1..cd3a4254e9e3 100644
--- a/drivers/block/elevator.c
+++ b/drivers/block/elevator.c
@@ -28,6 +28,7 @@
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
+#include <linux/bio.h>
#include <linux/blk.h>
#include <linux/config.h>
#include <linux/module.h>
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 94f42b356556..aff8acff0ef3 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -165,6 +165,7 @@ static int print_unex=1;
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
+#include <linux/bio.h>
#include <linux/string.h>
#include <linux/fcntl.h>
#include <linux/delay.h>
diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c
index d53122b1ae46..16abcb3f5481 100644
--- a/drivers/block/ll_rw_blk.c
+++ b/drivers/block/ll_rw_blk.c
@@ -18,6 +18,7 @@
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/config.h>
+#include <linux/bio.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/init.h>
@@ -2002,8 +2003,8 @@ int __init blk_dev_init(void)
queue_nr_requests = (total_ram >> 8) & ~15; /* One per quarter-megabyte */
if (queue_nr_requests < 32)
queue_nr_requests = 32;
- if (queue_nr_requests > 512)
- queue_nr_requests = 512;
+ if (queue_nr_requests > 256)
+ queue_nr_requests = 256;
/*
* Batch frees according to queue length
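
The hunk above halves the cap on the per-queue request pool from 512 to 256 while keeping the one-request-per-quarter-megabyte sizing. A small worked example of that arithmetic, assuming total_ram is expressed in kilobytes (an assumption, not something the hunk states):

#include <stdio.h>

/* Worked example of the sizing rule in blk_dev_init(): one request per
 * quarter megabyte of RAM, rounded down to a multiple of 16 and clamped.
 * total_ram is assumed to be in kilobytes here. */
static int queue_nr_requests_for(unsigned long total_ram_kb)
{
        int n = (total_ram_kb >> 8) & ~15;   /* one per 256 KB */
        if (n < 32)
                n = 32;
        if (n > 256)                         /* was 512 before this patch */
                n = 256;
        return n;
}

int main(void)
{
        /* 32 MB -> 128 requests; 128 MB and anything larger -> 256 */
        printf("%d %d %d\n",
               queue_nr_requests_for(32 * 1024),
               queue_nr_requests_for(128 * 1024),
               queue_nr_requests_for(1024 * 1024));
        return 0;
}
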
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 5689de41b771..982604ff6bfd 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -60,6 +60,7 @@
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/file.h>
+#include <linux/bio.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/major.h>
@@ -168,6 +169,15 @@ static void figure_loop_size(struct loop_device *lo)
}
+static inline int lo_do_transfer(struct loop_device *lo, int cmd, char *rbuf,
+ char *lbuf, int size, int rblock)
+{
+ if (!lo->transfer)
+ return 0;
+
+ return lo->transfer(lo, cmd, rbuf, lbuf, size, rblock);
+}
+
static int
do_lo_send(struct loop_device *lo, struct bio_vec *bvec, int bsize, loff_t pos)
{
@@ -454,20 +464,43 @@ static struct bio *loop_get_buffer(struct loop_device *lo, struct bio *rbh)
out_bh:
bio->bi_sector = rbh->bi_sector + (lo->lo_offset >> 9);
bio->bi_rw = rbh->bi_rw;
- spin_lock_irq(&lo->lo_lock);
bio->bi_bdev = lo->lo_device;
- spin_unlock_irq(&lo->lo_lock);
return bio;
}
-static int loop_make_request(request_queue_t *q, struct bio *rbh)
+static int
+bio_transfer(struct loop_device *lo, struct bio *to_bio,
+ struct bio *from_bio)
{
- struct bio *bh = NULL;
+ unsigned long IV = loop_get_iv(lo, from_bio->bi_sector);
+ struct bio_vec *from_bvec, *to_bvec;
+ char *vto, *vfrom;
+ int ret = 0, i;
+
+ __bio_for_each_segment(from_bvec, from_bio, i, 0) {
+ to_bvec = &to_bio->bi_io_vec[i];
+
+ kmap(from_bvec->bv_page);
+ kmap(to_bvec->bv_page);
+ vfrom = page_address(from_bvec->bv_page) + from_bvec->bv_offset;
+ vto = page_address(to_bvec->bv_page) + to_bvec->bv_offset;
+ ret |= lo_do_transfer(lo, bio_data_dir(to_bio), vto, vfrom,
+ from_bvec->bv_len, IV);
+ kunmap(from_bvec->bv_page);
+ kunmap(to_bvec->bv_page);
+ }
+
+ return ret;
+}
+
+static int loop_make_request(request_queue_t *q, struct bio *old_bio)
+{
+ struct bio *new_bio = NULL;
struct loop_device *lo;
unsigned long IV;
- int rw = bio_rw(rbh);
- int unit = minor(to_kdev_t(rbh->bi_bdev->bd_dev));
+ int rw = bio_rw(old_bio);
+ int unit = minor(to_kdev_t(old_bio->bi_bdev->bd_dev));
if (unit >= max_loop)
goto out;
@@ -489,60 +522,41 @@ static int loop_make_request(request_queue_t *q, struct bio *rbh)
goto err;
}
- blk_queue_bounce(q, &rbh);
+ blk_queue_bounce(q, &old_bio);
/*
* file backed, queue for loop_thread to handle
*/
if (lo->lo_flags & LO_FLAGS_DO_BMAP) {
- loop_add_bio(lo, rbh);
+ loop_add_bio(lo, old_bio);
return 0;
}
/*
* piggy old buffer on original, and submit for I/O
*/
- bh = loop_get_buffer(lo, rbh);
- IV = loop_get_iv(lo, rbh->bi_sector);
+ new_bio = loop_get_buffer(lo, old_bio);
+ IV = loop_get_iv(lo, old_bio->bi_sector);
if (rw == WRITE) {
- if (lo_do_transfer(lo, WRITE, bio_data(bh), bio_data(rbh),
- bh->bi_size, IV))
+ if (bio_transfer(lo, new_bio, old_bio))
goto err;
}
- generic_make_request(bh);
+ generic_make_request(new_bio);
return 0;
err:
if (atomic_dec_and_test(&lo->lo_pending))
up(&lo->lo_bh_mutex);
- loop_put_buffer(bh);
+ loop_put_buffer(new_bio);
out:
- bio_io_error(rbh);
+ bio_io_error(old_bio);
return 0;
inactive:
spin_unlock_irq(&lo->lo_lock);
goto out;
}
-static int do_bio_blockbacked(struct loop_device *lo, struct bio *bio,
- struct bio *rbh)
-{
- unsigned long IV = loop_get_iv(lo, rbh->bi_sector);
- struct bio_vec *from;
- char *vto, *vfrom;
- int ret = 0, i;
-
- bio_for_each_segment(from, rbh, i) {
- vfrom = page_address(from->bv_page) + from->bv_offset;
- vto = page_address(bio->bi_io_vec[i].bv_page) + bio->bi_io_vec[i].bv_offset;
- ret |= lo_do_transfer(lo, bio_data_dir(bio), vto, vfrom,
- from->bv_len, IV);
- }
-
- return ret;
-}
-
static inline void loop_handle_bio(struct loop_device *lo, struct bio *bio)
{
int ret;
@@ -556,7 +570,7 @@ static inline void loop_handle_bio(struct loop_device *lo, struct bio *bio)
} else {
struct bio *rbh = bio->bi_private;
- ret = do_bio_blockbacked(lo, bio, rbh);
+ ret = bio_transfer(lo, bio, rbh);
bio_endio(rbh, !ret);
loop_put_buffer(bio);
@@ -588,10 +602,8 @@ static int loop_thread(void *data)
set_user_nice(current, -20);
- spin_lock_irq(&lo->lo_lock);
lo->lo_state = Lo_bound;
atomic_inc(&lo->lo_pending);
- spin_unlock_irq(&lo->lo_lock);
/*
* up sem, we are running
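
With the loop.c changes above, both the direct WRITE path in loop_make_request() and the loop thread go through the same bio_transfer() helper, which pairs the i-th source segment with the i-th destination segment and runs the loop transfer function on each while the pages are mapped. The sketch below models that per-segment walk in user space; struct seg and the XOR "cipher" are stand-ins for the bio_vec machinery and lo->transfer, not the kernel types.

#include <stdio.h>

/* User-space model of the new bio_transfer() helper: apply the loop
 * device's transfer function segment by segment, pairing the i-th source
 * segment with the i-th destination segment. */
struct seg { unsigned char *buf; int len; };

static int xor_transfer(unsigned char *to, const unsigned char *from,
                        int len, unsigned char key)
{
        for (int i = 0; i < len; i++)
                to[i] = from[i] ^ key;
        return 0;                            /* 0 = success, like lo->transfer */
}

static int bio_transfer_model(struct seg *to, struct seg *from, int nsegs,
                              unsigned char key)
{
        int ret = 0;

        for (int i = 0; i < nsegs; i++)      /* __bio_for_each_segment(...) */
                ret |= xor_transfer(to[i].buf, from[i].buf, from[i].len, key);
        return ret;
}

int main(void)
{
        unsigned char a[4] = { 'a', 'b', 'c', 'd' }, b[4];
        struct seg from = { a, 4 }, to = { b, 4 };

        bio_transfer_model(&to, &from, 1, 0x20);
        printf("%.4s\n", (char *)b);         /* prints "ABCD" */
        return 0;
}
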
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 67344c7fcc1a..697e825c3a91 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -39,6 +39,7 @@
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/fs.h>
+#include <linux/bio.h>
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/file.h>
diff --git a/drivers/block/rd.c b/drivers/block/rd.c
index 4faf52c7be5c..7b60e75d5584 100644
--- a/drivers/block/rd.c
+++ b/drivers/block/rd.c
@@ -45,6 +45,8 @@
#include <linux/config.h>
#include <linux/string.h>
#include <linux/slab.h>
+#include <asm/atomic.h>
+#include <linux/bio.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/devfs_fs_kernel.h>
diff --git a/drivers/block/umem.c b/drivers/block/umem.c
index 8c61688cab1c..44909021aa06 100644
--- a/drivers/block/umem.c
+++ b/drivers/block/umem.c
@@ -37,6 +37,7 @@
#include <linux/config.h>
#include <linux/sched.h>
#include <linux/fs.h>
+#include <linux/bio.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/mman.h>
diff --git a/drivers/char/agp/agp.h b/drivers/char/agp/agp.h
index be8178161e80..94e405104df4 100644
--- a/drivers/char/agp/agp.h
+++ b/drivers/char/agp/agp.h
@@ -118,8 +118,8 @@ struct agp_bridge_data {
int (*remove_memory) (agp_memory *, off_t, int);
agp_memory *(*alloc_by_type) (size_t, int);
void (*free_by_type) (agp_memory *);
- unsigned long (*agp_alloc_page) (void);
- void (*agp_destroy_page) (unsigned long);
+ void *(*agp_alloc_page) (void);
+ void (*agp_destroy_page) (void *);
int (*suspend)(void);
void (*resume)(void);
diff --git a/drivers/char/agp/agpgart_be.c b/drivers/char/agp/agpgart_be.c
index 44cbc013d91c..8ba761695215 100644
--- a/drivers/char/agp/agpgart_be.c
+++ b/drivers/char/agp/agpgart_be.c
@@ -22,6 +22,8 @@
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE
* OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
+ * TODO:
+ * - Allocate more than order 0 pages to avoid too much linear map splitting.
*/
#include <linux/config.h>
#include <linux/version.h>
@@ -43,6 +45,7 @@
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/page.h>
+#include <asm/agp.h>
#include <linux/agp_backend.h>
#include "agp.h"
@@ -59,56 +62,28 @@ EXPORT_SYMBOL(agp_enable);
EXPORT_SYMBOL(agp_backend_acquire);
EXPORT_SYMBOL(agp_backend_release);
-static void flush_cache(void);
-
static struct agp_bridge_data agp_bridge;
static int agp_try_unsupported __initdata = 0;
-
-static inline void flush_cache(void)
-{
-#if defined(__i386__) || defined(__x86_64__)
- asm volatile ("wbinvd":::"memory");
-#elif defined(__alpha__) || defined(__ia64__) || defined(__sparc__)
- /* ??? I wonder if we'll really need to flush caches, or if the
- core logic can manage to keep the system coherent. The ARM
- speaks only of using `cflush' to get things in memory in
- preparation for power failure.
-
- If we do need to call `cflush', we'll need a target page,
- as we can only flush one page at a time.
-
- Ditto for IA-64. --davidm 00/08/07 */
- mb();
-#else
-#error "Please define flush_cache."
-#endif
-}
-
#ifdef CONFIG_SMP
-static atomic_t cpus_waiting;
-
static void ipi_handler(void *null)
{
- flush_cache();
- atomic_dec(&cpus_waiting);
- while (atomic_read(&cpus_waiting) > 0)
- barrier();
+ flush_agp_cache();
}
static void smp_flush_cache(void)
{
- atomic_set(&cpus_waiting, num_online_cpus() - 1);
- if (smp_call_function(ipi_handler, NULL, 1, 0) != 0)
+ if (smp_call_function(ipi_handler, NULL, 1, 1) != 0)
panic(PFX "timed out waiting for the other CPUs!\n");
- flush_cache();
- while (atomic_read(&cpus_waiting) > 0)
- barrier();
+ flush_agp_cache();
}
#define global_cache_flush smp_flush_cache
#else /* CONFIG_SMP */
-#define global_cache_flush flush_cache
-#endif /* CONFIG_SMP */
+static void global_cache_flush(void)
+{
+ flush_agp_cache();
+}
+#endif /* !CONFIG_SMP */
int agp_backend_acquire(void)
{
@@ -208,8 +183,7 @@ void agp_free_memory(agp_memory * curr)
if (curr->page_count != 0) {
for (i = 0; i < curr->page_count; i++) {
curr->memory[i] &= ~(0x00000fff);
- agp_bridge.agp_destroy_page((unsigned long)
- phys_to_virt(curr->memory[i]));
+ agp_bridge.agp_destroy_page(phys_to_virt(curr->memory[i]));
}
}
agp_free_key(curr->key);
@@ -252,21 +226,22 @@ agp_memory *agp_allocate_memory(size_t page_count, u32 type)
MOD_DEC_USE_COUNT;
return NULL;
}
+
for (i = 0; i < page_count; i++) {
- new->memory[i] = agp_bridge.agp_alloc_page();
+ void *addr = agp_bridge.agp_alloc_page();
- if (new->memory[i] == 0) {
+ if (addr == NULL) {
/* Free this structure */
agp_free_memory(new);
return NULL;
}
new->memory[i] =
- agp_bridge.mask_memory(
- virt_to_phys((void *) new->memory[i]),
- type);
+ agp_bridge.mask_memory(virt_to_phys(addr), type);
new->page_count++;
}
+ flush_agp_mappings();
+
return new;
}
@@ -561,6 +536,7 @@ static int agp_generic_create_gatt_table(void)
agp_bridge.current_size;
break;
}
+ temp = agp_bridge.current_size;
} else {
agp_bridge.aperture_size_idx = i;
}
@@ -761,7 +737,7 @@ static void agp_generic_free_by_type(agp_memory * curr)
* against a maximum value.
*/
-static unsigned long agp_generic_alloc_page(void)
+static void *agp_generic_alloc_page(void)
{
struct page * page;
@@ -769,24 +745,26 @@ static unsigned long agp_generic_alloc_page(void)
if (page == NULL)
return 0;
+ map_page_into_agp(page);
+
get_page(page);
SetPageLocked(page);
atomic_inc(&agp_bridge.current_memory_agp);
- return (unsigned long)page_address(page);
+ return page_address(page);
}
-static void agp_generic_destroy_page(unsigned long addr)
+static void agp_generic_destroy_page(void *addr)
{
- void *pt = (void *) addr;
struct page *page;
- if (pt == NULL)
+ if (addr == NULL)
return;
- page = virt_to_page(pt);
+ page = virt_to_page(addr);
+ unmap_page_from_agp(page);
put_page(page);
unlock_page(page);
- free_page((unsigned long) pt);
+ free_page((unsigned long)addr);
atomic_dec(&agp_bridge.current_memory_agp);
}
@@ -993,6 +971,7 @@ static agp_memory *intel_i810_alloc_by_type(size_t pg_count, int type)
return new;
}
if(type == AGP_PHYS_MEMORY) {
+ void *addr;
/* The I810 requires a physical address to program
* it's mouse pointer into hardware. However the
* Xserver still writes to it through the agp
@@ -1007,17 +986,14 @@ static agp_memory *intel_i810_alloc_by_type(size_t pg_count, int type)
return NULL;
}
MOD_INC_USE_COUNT;
- new->memory[0] = agp_bridge.agp_alloc_page();
+ addr = agp_bridge.agp_alloc_page();
- if (new->memory[0] == 0) {
+ if (addr == NULL) {
/* Free this structure */
agp_free_memory(new);
return NULL;
}
- new->memory[0] =
- agp_bridge.mask_memory(
- virt_to_phys((void *) new->memory[0]),
- type);
+ new->memory[0] = agp_bridge.mask_memory(virt_to_phys(addr), type);
new->page_count = 1;
new->num_scratch_pages = 1;
new->type = AGP_PHYS_MEMORY;
@@ -1032,7 +1008,7 @@ static void intel_i810_free_by_type(agp_memory * curr)
{
agp_free_key(curr->key);
if(curr->type == AGP_PHYS_MEMORY) {
- agp_bridge.agp_destroy_page((unsigned long)
+ agp_bridge.agp_destroy_page(
phys_to_virt(curr->memory[0]));
vfree(curr->memory);
}
@@ -1291,7 +1267,7 @@ static agp_memory *intel_i830_alloc_by_type(size_t pg_count,int type)
if (type == AGP_DCACHE_MEMORY) return(NULL);
if (type == AGP_PHYS_MEMORY) {
- unsigned long physical;
+ void *addr;
/* The i830 requires a physical address to program
* it's mouse pointer into hardware. However the
@@ -1306,19 +1282,18 @@ static agp_memory *intel_i830_alloc_by_type(size_t pg_count,int type)
if (nw == NULL) return(NULL);
MOD_INC_USE_COUNT;
- nw->memory[0] = agp_bridge.agp_alloc_page();
- physical = nw->memory[0];
- if (nw->memory[0] == 0) {
+ addr = agp_bridge.agp_alloc_page();
+ if (addr == NULL) {
/* free this structure */
agp_free_memory(nw);
return(NULL);
}
- nw->memory[0] = agp_bridge.mask_memory(virt_to_phys((void *) nw->memory[0]),type);
+ nw->memory[0] = agp_bridge.mask_memory(virt_to_phys(addr),type);
nw->page_count = 1;
nw->num_scratch_pages = 1;
nw->type = AGP_PHYS_MEMORY;
- nw->physical = virt_to_phys((void *) physical);
+ nw->physical = virt_to_phys(addr);
return(nw);
}
@@ -1849,16 +1824,17 @@ static int intel_i460_remove_memory(agp_memory * mem, off_t pg_start, int type)
* Let's just hope nobody counts on the allocated AGP memory being there
* before bind time (I don't think current drivers do)...
*/
-static unsigned long intel_i460_alloc_page(void)
+static void * intel_i460_alloc_page(void)
{
if (intel_i460_cpk)
return agp_generic_alloc_page();
/* Returning NULL would cause problems */
- return ~0UL;
+ /* AK: really dubious code. */
+ return (void *)~0UL;
}
-static void intel_i460_destroy_page(unsigned long page)
+static void intel_i460_destroy_page(void *page)
{
if (intel_i460_cpk)
agp_generic_destroy_page(page);
@@ -3298,38 +3274,29 @@ static void ali_cache_flush(void)
}
}
-static unsigned long ali_alloc_page(void)
+static void *ali_alloc_page(void)
{
- struct page *page;
- u32 temp;
+ void *adr = agp_generic_alloc_page();
+ unsigned temp;
- page = alloc_page(GFP_KERNEL);
- if (page == NULL)
+ if (adr == 0)
return 0;
- get_page(page);
- SetPageLocked(page);
- atomic_inc(&agp_bridge.current_memory_agp);
-
- global_cache_flush();
-
if (agp_bridge.type == ALI_M1541) {
pci_read_config_dword(agp_bridge.dev, ALI_CACHE_FLUSH_CTRL, &temp);
pci_write_config_dword(agp_bridge.dev, ALI_CACHE_FLUSH_CTRL,
(((temp & ALI_CACHE_FLUSH_ADDR_MASK) |
- virt_to_phys(page_address(page))) |
+ virt_to_phys(adr)) |
ALI_CACHE_FLUSH_EN ));
}
- return (unsigned long)page_address(page);
+ return adr;
}
-static void ali_destroy_page(unsigned long addr)
+static void ali_destroy_page(void * addr)
{
u32 temp;
- void *pt = (void *) addr;
- struct page *page;
- if (pt == NULL)
+ if (addr == NULL)
return;
global_cache_flush();
@@ -3338,15 +3305,11 @@ static void ali_destroy_page(unsigned long addr)
pci_read_config_dword(agp_bridge.dev, ALI_CACHE_FLUSH_CTRL, &temp);
pci_write_config_dword(agp_bridge.dev, ALI_CACHE_FLUSH_CTRL,
(((temp & ALI_CACHE_FLUSH_ADDR_MASK) |
- virt_to_phys((void *)pt)) |
+ virt_to_phys(addr)) |
ALI_CACHE_FLUSH_EN));
}
- page = virt_to_page(pt);
- put_page(page);
- unlock_page(page);
- free_page((unsigned long) pt);
- atomic_dec(&agp_bridge.current_memory_agp);
+ agp_generic_destroy_page(addr);
}
/* Setup function */
@@ -5011,15 +4974,15 @@ static int __init agp_backend_initialize(void)
}
if (agp_bridge.needs_scratch_page == TRUE) {
- agp_bridge.scratch_page = agp_bridge.agp_alloc_page();
+ void *addr;
+ addr = agp_bridge.agp_alloc_page();
- if (agp_bridge.scratch_page == 0) {
+ if (addr == NULL) {
printk(KERN_ERR PFX "unable to get memory for "
"scratch page.\n");
return -ENOMEM;
}
- agp_bridge.scratch_page =
- virt_to_phys((void *) agp_bridge.scratch_page);
+ agp_bridge.scratch_page = virt_to_phys(addr);
agp_bridge.scratch_page =
agp_bridge.mask_memory(agp_bridge.scratch_page, 0);
}
@@ -5064,8 +5027,7 @@ static int __init agp_backend_initialize(void)
err_out:
if (agp_bridge.needs_scratch_page == TRUE) {
agp_bridge.scratch_page &= ~(0x00000fff);
- agp_bridge.agp_destroy_page((unsigned long)
- phys_to_virt(agp_bridge.scratch_page));
+ agp_bridge.agp_destroy_page(phys_to_virt(agp_bridge.scratch_page));
}
if (got_gatt)
agp_bridge.free_gatt_table();
@@ -5084,8 +5046,7 @@ static void agp_backend_cleanup(void)
if (agp_bridge.needs_scratch_page == TRUE) {
agp_bridge.scratch_page &= ~(0x00000fff);
- agp_bridge.agp_destroy_page((unsigned long)
- phys_to_virt(agp_bridge.scratch_page));
+ agp_bridge.agp_destroy_page(phys_to_virt(agp_bridge.scratch_page));
}
}
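
The agpgart hunks above change agp_alloc_page()/agp_destroy_page() to traffic in kernel virtual addresses (void *) rather than unsigned long, so callers convert once with virt_to_phys() before masking the value into a GATT entry; they also replace the hand-rolled cross-CPU wbinvd dance with smp_call_function(..., wait=1) plus flush_agp_cache(). Below is a user-space stand-in for the pointer-based allocation path; the helpers are fakes, not kernel functions.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

/* Sketch of the interface change: the allocator hands back a virtual
 * address (void *), the caller NULL-checks it and converts it once before
 * masking it into a table entry.  Everything here is a stand-in. */
static void *fake_alloc_page(void)
{
        return calloc(1, 4096);
}

static uintptr_t fake_virt_to_phys(void *addr)
{
        return (uintptr_t)addr;              /* identity map, for the model */
}

static uint64_t mask_memory(uintptr_t phys, int type)
{
        (void)type;
        return phys | 1;                     /* pretend bit 0 means "valid" */
}

int main(void)
{
        void *addr = fake_alloc_page();
        if (addr == NULL)                    /* NULL check replaces "== 0" */
                return 1;
        uint64_t entry = mask_memory(fake_virt_to_phys(addr), 0);
        printf("entry %#llx\n", (unsigned long long)entry);
        free(addr);
        return 0;
}
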
diff --git a/drivers/char/random.c b/drivers/char/random.c
index db20dec287d0..9db52acb9ef2 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -252,6 +252,7 @@
#include <linux/poll.h>
#include <linux/init.h>
#include <linux/fs.h>
+#include <linux/tqueue.h>
#include <asm/processor.h>
#include <asm/uaccess.h>
diff --git a/drivers/ide/ioctl.c b/drivers/ide/ioctl.c
index b986555fd4f3..609ed7dcfa56 100644
--- a/drivers/ide/ioctl.c
+++ b/drivers/ide/ioctl.c
@@ -345,8 +345,9 @@ int ata_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned
if (!arg) {
if (ide_spin_wait_hwgroup(drive))
return -EBUSY;
- else
- return 0;
+ /* Do nothing, just unlock */
+ spin_unlock_irq(drive->channel->lock);
+ return 0;
}
return do_cmd_ioctl(drive, arg);
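
The ide/ioctl.c hunk fixes an early-return path: assuming, as the added comment suggests, that ide_spin_wait_hwgroup() returns with the channel lock held on success, the old code returned 0 without ever dropping that lock. A small pthread model of the pattern, with illustrative names:

#include <pthread.h>
#include <stdio.h>

/* Model of the bug fixed above: a helper that returns 0 with a lock still
 * held on success, so every early-return path in the caller must release
 * it.  A pthread mutex stands in for the hwgroup spinlock. */
static pthread_mutex_t channel_lock = PTHREAD_MUTEX_INITIALIZER;

static int wait_hwgroup(void)
{
        pthread_mutex_lock(&channel_lock);   /* success: lock is held */
        return 0;                            /* non-zero would mean -EBUSY */
}

static int do_ioctl(unsigned long arg)
{
        if (!arg) {
                if (wait_hwgroup())
                        return -1;           /* busy: lock was not taken */
                /* Do nothing, just unlock -- the path the old code missed */
                pthread_mutex_unlock(&channel_lock);
                return 0;
        }
        return 1;                            /* stand-in for do_cmd_ioctl() */
}

int main(void)
{
        printf("%d %d\n", do_ioctl(0), do_ioctl(1));
        return 0;
}
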
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 118ce821a208..48fb74e50d5c 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -20,7 +20,7 @@
#include <linux/raid/md.h>
#include <linux/slab.h>
-
+#include <linux/bio.h>
#include <linux/raid/linear.h>
#define MAJOR_NR MD_MAJOR
diff --git a/drivers/md/lvm-snap.c b/drivers/md/lvm-snap.c
index c90947fc5f89..46df5c8ff0ef 100644
--- a/drivers/md/lvm-snap.c
+++ b/drivers/md/lvm-snap.c
@@ -224,7 +224,7 @@ static inline void invalidate_snap_cache(unsigned long start, unsigned long nr,
for (i = 0; i < nr; i++)
{
- bh = get_hash_table(dev, start++, blksize);
+ bh = find_get_block(dev, start++, blksize);
if (bh)
bforget(bh);
}
diff --git a/drivers/md/lvm.c b/drivers/md/lvm.c
index dfc256c6a2ec..c44a1b8a74b2 100644
--- a/drivers/md/lvm.c
+++ b/drivers/md/lvm.c
@@ -209,6 +209,7 @@
#include <linux/hdreg.h>
#include <linux/stat.h>
#include <linux/fs.h>
+#include <linux/bio.h>
#include <linux/proc_fs.h>
#include <linux/blkdev.h>
#include <linux/genhd.h>
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 21e20ea10be7..d23270322804 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -33,6 +33,7 @@
#include <linux/linkage.h>
#include <linux/raid/md.h>
#include <linux/sysctl.h>
+#include <linux/bio.h>
#include <linux/raid/xor.h>
#include <linux/devfs_fs_kernel.h>
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 46f089ee8481..6db555317b13 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -23,6 +23,7 @@
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/raid/multipath.h>
+#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <asm/atomic.h>
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 430448c566af..8f149a1efe1b 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -20,6 +20,7 @@
#include <linux/module.h>
#include <linux/raid/raid0.h>
+#include <linux/bio.h>
#define MAJOR_NR MD_MAJOR
#define MD_DRIVER
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 43fdb75de0fe..96ad858cf033 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -23,6 +23,7 @@
*/
#include <linux/raid/raid1.h>
+#include <linux/bio.h>
#define MAJOR_NR MD_MAJOR
#define MD_DRIVER
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 9402b0c779b9..62873d89e395 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -20,6 +20,7 @@
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/raid/raid5.h>
+#include <linux/bio.h>
#include <asm/bitops.h>
#include <asm/atomic.h>
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 0260ccf2092a..db4cdb8e3ad4 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -210,3 +210,4 @@ EXPORT_SYMBOL(pci_match_device);
EXPORT_SYMBOL(pci_register_driver);
EXPORT_SYMBOL(pci_unregister_driver);
EXPORT_SYMBOL(pci_dev_driver);
+EXPORT_SYMBOL(pci_bus_type);
diff --git a/drivers/pcmcia/pci_socket.c b/drivers/pcmcia/pci_socket.c
index d30df9b4203a..5a4b78312391 100644
--- a/drivers/pcmcia/pci_socket.c
+++ b/drivers/pcmcia/pci_socket.c
@@ -20,6 +20,7 @@
#include <linux/init.h>
#include <linux/pci.h>
#include <linux/sched.h>
+#include <linux/tqueue.h>
#include <linux/interrupt.h>
#include <pcmcia/ss.h>
diff --git a/drivers/pcmcia/yenta.c b/drivers/pcmcia/yenta.c
index e5453fb455e2..40b20b945488 100644
--- a/drivers/pcmcia/yenta.c
+++ b/drivers/pcmcia/yenta.c
@@ -6,6 +6,7 @@
#include <linux/init.h>
#include <linux/pci.h>
#include <linux/sched.h>
+#include <linux/tqueue.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/module.h>
diff --git a/drivers/scsi/README.st b/drivers/scsi/README.st
index e06a21597910..702a5b178b61 100644
--- a/drivers/scsi/README.st
+++ b/drivers/scsi/README.st
@@ -2,7 +2,7 @@ This file contains brief information about the SCSI tape driver.
The driver is currently maintained by Kai M{kisara (email
Kai.Makisara@metla.fi)
-Last modified: Tue Jan 22 21:08:57 2002 by makisara
+Last modified: Tue Jun 18 18:13:50 2002 by makisara
BASICS
@@ -105,15 +105,19 @@ The default is BSD semantics.
BUFFERING
-The driver uses tape buffers allocated either at system initialization
-or at run-time when needed. One buffer is used for each open tape
-device. The size of the buffers is selectable at compile and/or boot
-time. The buffers are used to store the data being transferred to/from
-the SCSI adapter. The following buffering options are selectable at
-compile time and/or at run time (via ioctl):
+The driver uses tape buffers allocated at run-time when needed and it
+is freed when the device file is closed. One buffer is used for each
+open tape device.
+
+The size of the buffers is always at least one tape block. In fixed
+block mode, the minimum buffer size is defined (in 1024 byte units) by
+ST_FIXED_BUFFER_BLOCKS. With small block size this allows buffering of
+several blocks and using one SCSI read or write to transfer all of the
+blocks. Buffering of data across write calls in fixed block mode is
+allowed if ST_BUFFER_WRITES is non-zero. Buffer allocation uses chunks of
+memory having sizes 2^n * (page size). Because of this the actual
+buffer size may be larger than the minimum allowable buffer size.
-Buffering of data across write calls in fixed block mode (define
-ST_BUFFER_WRITES).
Asynchronous writing. Writing the buffer contents to the tape is
started and the write call returns immediately. The status is checked
@@ -128,30 +132,6 @@ attempted even if the user does not want to get all of the data at
this read command. Should be disabled for those drives that don't like
a filemark to truncate a read request or that don't like backspacing.
-The buffer size is defined (in 1024 byte units) by ST_BUFFER_BLOCKS or
-at boot time. If this size is not large enough, the driver tries to
-temporarily enlarge the buffer. Buffer allocation uses chunks of
-memory having sizes 2^n * (page size). Because of this the actual
-buffer size may be larger than the buffer size specified with
-ST_BUFFER_BLOCKS.
-
-A small number of buffers are allocated at driver initialisation. The
-maximum number of these buffers is defined by ST_MAX_BUFFERS. The
-maximum can be changed with kernel or module startup options. One
-buffer is allocated for each drive detected when the driver is
-initialized up to the maximum.
-
-The driver tries to allocate new buffers at run-time if
-necessary. These buffers are freed after use. If the maximum number of
-initial buffers is set to zero, all buffer allocation is done at
-run-time. The advantage of run-time allocation is that memory is not
-wasted for buffers not being used. The disadvantage is that there may
-not be memory available at the time when a buffer is needed for the
-first time (once a buffer is allocated, it is not released). This risk
-should not be big if the tape drive is connected to a PCI adapter that
-supports scatter/gather (the allocation is not limited to "DMA memory"
-and the buffer can be composed of several fragments).
-
The threshold for triggering asynchronous write in fixed block mode
is defined by ST_WRITE_THRESHOLD. This may be optimized for each
use pattern. The default triggers asynchronous write after three
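
The new README text above says the per-open buffer holds at least one tape block, is at least ST_FIXED_BUFFER_BLOCKS kilobytes in fixed-block mode, and is built from chunks of 2^n pages, so the final size may exceed the requested minimum. A worked example of that rounding follows; the real driver satisfies the size with several such chunks in a scatter/gather list, but one chunk is enough to show the rounding.

#include <stdio.h>

#define PAGE_SIZE              4096
#define ST_FIXED_BUFFER_BLOCKS 32            /* 1 KB units, from st_options.h */

/* Worked example of the sizing rule described above: take the larger of the
 * tape block size and the fixed-mode minimum, then round up to a
 * power-of-two number of pages. */
static unsigned long st_buffer_bytes(unsigned long block_size)
{
        unsigned long want = ST_FIXED_BUFFER_BLOCKS * 1024;
        unsigned long chunk = PAGE_SIZE;

        if (block_size > want)
                want = block_size;
        while (chunk < want)
                chunk *= 2;                  /* 2^n * (page size) */
        return chunk;
}

int main(void)
{
        /* 512-byte blocks -> 32 KB minimum; 48 KB blocks -> rounded to 64 KB */
        printf("%lu %lu\n", st_buffer_bytes(512), st_buffer_bytes(48 * 1024));
        return 0;
}
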
diff --git a/drivers/scsi/cpqfcTSinit.c b/drivers/scsi/cpqfcTSinit.c
index e6f03847c212..f38e377207c7 100644
--- a/drivers/scsi/cpqfcTSinit.c
+++ b/drivers/scsi/cpqfcTSinit.c
@@ -39,6 +39,7 @@
#include <linux/pci.h>
#include <linux/delay.h>
#include <linux/timer.h>
+#include <linux/init.h>
#include <linux/ioport.h> // request_region() prototype
#include <linux/vmalloc.h> // ioremap()
//#if LINUX_VERSION_CODE >= LinuxVersionCode(2,4,7)
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index fc69760ab484..bede96547efb 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -23,6 +23,7 @@
#include <linux/timer.h>
#include <linux/string.h>
#include <linux/slab.h>
+#include <linux/bio.h>
#include <linux/ioport.h>
#include <linux/kernel.h>
#include <linux/stat.h>
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 382e04ceace2..63fe305e4342 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -36,6 +36,7 @@
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
+#include <linux/bio.h>
#include <linux/string.h>
#include <linux/hdreg.h>
#include <linux/errno.h>
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index d536f3bc94f6..0e28dc69652b 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -39,6 +39,7 @@
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/mm.h>
+#include <linux/bio.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/cdrom.h>
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index f48ac845bc08..7342c3e661f3 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -12,13 +12,13 @@
Copyright 1992 - 2002 Kai Makisara
email Kai.Makisara@metla.fi
- Last modified: Tue Feb 5 21:25:55 2002 by makisara
+ Last modified: Sat Jun 15 13:01:56 2002 by makisara
Some small formal changes - aeb, 950809
Last modified: 18-JAN-1998 Richard Gooch <rgooch@atnf.csiro.au> Devfs support
*/
-static char *verstr = "20020205";
+static char *verstr = "20020615";
#include <linux/module.h>
@@ -69,7 +69,6 @@ static char *verstr = "20020205";
static int buffer_kbs;
static int write_threshold_kbs;
-static int max_buffers = (-1);
static int max_sg_segs;
MODULE_AUTHOR("Kai Makisara");
@@ -80,8 +79,6 @@ MODULE_PARM(buffer_kbs, "i");
MODULE_PARM_DESC(buffer_kbs, "Default driver buffer size (KB; 32)");
MODULE_PARM(write_threshold_kbs, "i");
MODULE_PARM_DESC(write_threshold_kbs, "Asynchronous write threshold (KB; 30)");
-MODULE_PARM(max_buffers, "i");
-MODULE_PARM_DESC(max_buffers, "Maximum number of buffer allocated at initialisation (4)");
MODULE_PARM(max_sg_segs, "i");
MODULE_PARM_DESC(max_sg_segs, "Maximum number of scatter/gather segments to use (32)");
@@ -97,9 +94,6 @@ static struct st_dev_parm {
"write_threshold_kbs", &write_threshold_kbs
},
{
- "max_buffers", &max_buffers
- },
- {
"max_sg_segs", &max_sg_segs
}
};
@@ -108,12 +102,12 @@ static struct st_dev_parm {
/* The default definitions have been moved to st_options.h */
-#define ST_BUFFER_SIZE (ST_BUFFER_BLOCKS * ST_KILOBYTE)
+#define ST_FIXED_BUFFER_SIZE (ST_FIXED_BUFFER_BLOCKS * ST_KILOBYTE)
#define ST_WRITE_THRESHOLD (ST_WRITE_THRESHOLD_BLOCKS * ST_KILOBYTE)
/* The buffer size should fit into the 24 bits for length in the
6-byte SCSI read and write commands. */
-#if ST_BUFFER_SIZE >= (2 << 24 - 1)
+#if ST_FIXED_BUFFER_SIZE >= (2 << 24 - 1)
#error "Buffer size should not exceed (2 << 24 - 1) bytes!"
#endif
@@ -121,7 +115,7 @@ DEB( static int debugging = DEBUG; )
#define MAX_RETRIES 0
#define MAX_WRITE_RETRIES 0
-#define MAX_READY_RETRIES 5
+#define MAX_READY_RETRIES 0
#define NO_TAPE NOT_READY
#define ST_TIMEOUT (900 * HZ)
@@ -137,18 +131,15 @@ DEB( static int debugging = DEBUG; )
#define ST_DEV_ARR_LUMP 6
static rwlock_t st_dev_arr_lock = RW_LOCK_UNLOCKED;
-static int st_nbr_buffers;
-static ST_buffer **st_buffers = NULL;
-static int st_buffer_size = ST_BUFFER_SIZE;
+static int st_fixed_buffer_size = ST_FIXED_BUFFER_SIZE;
static int st_write_threshold = ST_WRITE_THRESHOLD;
-static int st_max_buffers = ST_MAX_BUFFERS;
static int st_max_sg_segs = ST_MAX_SG;
static Scsi_Tape **scsi_tapes = NULL;
static int modes_defined;
-static ST_buffer *new_tape_buffer(int, int, int);
+static ST_buffer *new_tape_buffer(int, int);
static int enlarge_buffer(ST_buffer *, int, int);
static void normalize_buffer(ST_buffer *);
static int append_to_buffer(const char *, ST_buffer *, int);
@@ -914,8 +905,7 @@ static int check_tape(Scsi_Tape *STp, struct file *filp)
module count. */
static int st_open(struct inode *inode, struct file *filp)
{
- int i, need_dma_buffer;
- int retval = (-EIO);
+ int i, retval = (-EIO);
Scsi_Tape *STp;
ST_partstat *STps;
int dev = TAPE_NR(inode->i_rdev);
@@ -945,38 +935,15 @@ static int st_open(struct inode *inode, struct file *filp)
goto err_out;
}
- /* Allocate a buffer for this user */
- need_dma_buffer = STp->restr_dma;
- write_lock(&st_dev_arr_lock);
- for (i = 0; i < st_nbr_buffers; i++)
- if (!st_buffers[i]->in_use &&
- (!need_dma_buffer || st_buffers[i]->dma)) {
- STp->buffer = st_buffers[i];
- (STp->buffer)->in_use = 1;
- break;
- }
- write_unlock(&st_dev_arr_lock);
- if (i >= st_nbr_buffers) {
- STp->buffer = new_tape_buffer(FALSE, need_dma_buffer, TRUE);
- if (STp->buffer == NULL) {
- printk(KERN_WARNING "st%d: Can't allocate tape buffer.\n", dev);
- retval = (-EBUSY);
- goto err_out;
- }
+ /* See that we have at least a one page buffer available */
+ if (!enlarge_buffer(STp->buffer, PAGE_SIZE, STp->restr_dma)) {
+ printk(KERN_WARNING "st%d: Can't allocate tape buffer.\n", dev);
+ retval = (-EOVERFLOW);
+ goto err_out;
}
(STp->buffer)->writing = 0;
(STp->buffer)->syscall_result = 0;
- (STp->buffer)->use_sg = STp->device->host->sg_tablesize;
-
- /* Compute the usable buffer size for this SCSI adapter */
- if (!(STp->buffer)->use_sg)
- (STp->buffer)->buffer_size = (STp->buffer)->sg[0].length;
- else {
- for (i = 0, (STp->buffer)->buffer_size = 0; i < (STp->buffer)->use_sg &&
- i < (STp->buffer)->sg_segs; i++)
- (STp->buffer)->buffer_size += (STp->buffer)->sg[i].length;
- }
STp->write_prot = ((filp->f_flags & O_ACCMODE) == O_RDONLY);
@@ -999,10 +966,7 @@ static int st_open(struct inode *inode, struct file *filp)
return 0;
err_out:
- if (STp->buffer != NULL) {
- (STp->buffer)->in_use = 0;
- STp->buffer = NULL;
- }
+ normalize_buffer(STp->buffer);
STp->in_use = 0;
STp->device->access_count--;
if (STp->device->host->hostt->module)
@@ -1149,16 +1113,8 @@ static int st_release(struct inode *inode, struct file *filp)
if (STp->door_locked == ST_LOCKED_AUTO)
st_int_ioctl(STp, MTUNLOCK, 0);
- if (STp->buffer != NULL) {
- normalize_buffer(STp->buffer);
- write_lock(&st_dev_arr_lock);
- (STp->buffer)->in_use = 0;
- STp->buffer = NULL;
- }
- else {
- write_lock(&st_dev_arr_lock);
- }
-
+ normalize_buffer(STp->buffer);
+ write_lock(&st_dev_arr_lock);
STp->in_use = 0;
write_unlock(&st_dev_arr_lock);
STp->device->access_count--;
@@ -1168,31 +1124,11 @@ static int st_release(struct inode *inode, struct file *filp)
return result;
}
-
-/* Write command */
-static ssize_t
- st_write(struct file *filp, const char *buf, size_t count, loff_t * ppos)
+/* The checks common to both reading and writing */
+static ssize_t rw_checks(Scsi_Tape *STp, struct file *filp, size_t count, loff_t *ppos)
{
- struct inode *inode = filp->f_dentry->d_inode;
- ssize_t total;
- ssize_t i, do_count, blks, transfer;
+ int bufsize;
ssize_t retval = 0;
- int write_threshold;
- int doing_write = 0;
- unsigned char cmd[MAX_COMMAND_SIZE];
- const char *b_point;
- Scsi_Request *SRpnt = NULL;
- Scsi_Tape *STp;
- ST_mode *STm;
- ST_partstat *STps;
- int dev = TAPE_NR(inode->i_rdev);
-
- read_lock(&st_dev_arr_lock);
- STp = scsi_tapes[dev];
- read_unlock(&st_dev_arr_lock);
-
- if (down_interruptible(&STp->lock))
- return -ERESTARTSYS;
/*
* If we are in the middle of error recovery, don't let anyone
@@ -1219,13 +1155,11 @@ static ssize_t
goto out;
}
- STm = &(STp->modes[STp->current_mode]);
- if (!STm->defined) {
+ if (! STp->modes[STp->current_mode].defined) {
retval = (-ENXIO);
goto out;
}
- if (count == 0)
- goto out;
+
/*
* If there was a bus reset, block further access
@@ -1236,30 +1170,20 @@ static ssize_t
goto out;
}
+ if (count == 0)
+ goto out;
+
DEB(
if (!STp->in_use) {
+ int dev = TAPE_NR(filp->f_dentry->d_inode->i_rdev);
printk(ST_DEB_MSG "st%d: Incorrect device.\n", dev);
retval = (-EIO);
goto out;
} ) /* end DEB */
- /* Write must be integral number of blocks */
- if (STp->block_size != 0 && (count % STp->block_size) != 0) {
- printk(KERN_WARNING "st%d: Write not multiple of tape block size.\n",
- dev);
- retval = (-EINVAL);
- goto out;
- }
-
if (STp->can_partitions &&
(retval = update_partition(STp)) < 0)
goto out;
- STps = &(STp->ps[STp->partition]);
-
- if (STp->write_prot) {
- retval = (-EACCES);
- goto out;
- }
if (STp->block_size == 0) {
if (STp->max_block > 0 &&
@@ -1273,19 +1197,73 @@ static ssize_t
goto out;
}
}
- if ((STp->buffer)->buffer_blocks < 1) {
- /* Fixed block mode with too small buffer */
- if (!enlarge_buffer(STp->buffer, STp->block_size, STp->restr_dma)) {
+ else {
+ /* Fixed block mode with too small buffer? */
+ bufsize = STp->block_size > st_fixed_buffer_size ?
+ STp->block_size : st_fixed_buffer_size;
+ if ((STp->buffer)->buffer_size < bufsize &&
+ !enlarge_buffer(STp->buffer, bufsize, STp->restr_dma)) {
retval = (-EOVERFLOW);
goto out;
}
- (STp->buffer)->buffer_blocks = 1;
+ (STp->buffer)->buffer_blocks = bufsize / STp->block_size;
}
if (STp->do_auto_lock && STp->door_locked == ST_UNLOCKED &&
!st_int_ioctl(STp, MTLOCK, 0))
STp->door_locked = ST_LOCKED_AUTO;
+ out:
+ return retval;
+}
+
+
+/* Write command */
+static ssize_t
+ st_write(struct file *filp, const char *buf, size_t count, loff_t * ppos)
+{
+ struct inode *inode = filp->f_dentry->d_inode;
+ ssize_t total;
+ ssize_t i, do_count, blks, transfer;
+ ssize_t retval;
+ int write_threshold;
+ int doing_write = 0;
+ unsigned char cmd[MAX_COMMAND_SIZE];
+ const char *b_point;
+ Scsi_Request *SRpnt = NULL;
+ Scsi_Tape *STp;
+ ST_mode *STm;
+ ST_partstat *STps;
+ int dev = TAPE_NR(inode->i_rdev);
+
+ read_lock(&st_dev_arr_lock);
+ STp = scsi_tapes[dev];
+ read_unlock(&st_dev_arr_lock);
+
+ if (down_interruptible(&STp->lock))
+ return -ERESTARTSYS;
+
+ retval = rw_checks(STp, filp, count, ppos);
+ if (retval || count == 0)
+ goto out;
+
+ /* Write must be integral number of blocks */
+ if (STp->block_size != 0 && (count % STp->block_size) != 0) {
+ printk(KERN_WARNING "st%d: Write not multiple of tape block size.\n",
+ dev);
+ retval = (-EINVAL);
+ goto out;
+ }
+
+ STm = &(STp->modes[STp->current_mode]);
+ STps = &(STp->ps[STp->partition]);
+
+ if (STp->write_prot) {
+ retval = (-EACCES);
+ goto out;
+ }
+
+
if (STps->rw == ST_READING) {
retval = flush_buffer(STp, 0);
if (retval)
@@ -1718,77 +1696,17 @@ static ssize_t
if (down_interruptible(&STp->lock))
return -ERESTARTSYS;
- /*
- * If we are in the middle of error recovery, don't let anyone
- * else try and use this device. Also, if error recovery fails, it
- * may try and take the device offline, in which case all further
- * access to the device is prohibited.
- */
- if (!scsi_block_when_processing_errors(STp->device)) {
- retval = (-ENXIO);
- goto out;
- }
-
- if (ppos != &filp->f_pos) {
- /* "A request was outside the capabilities of the device." */
- retval = (-ENXIO);
+ retval = rw_checks(STp, filp, count, ppos);
+ if (retval || count == 0)
goto out;
- }
- if (STp->ready != ST_READY) {
- if (STp->ready == ST_NO_TAPE)
- retval = (-ENOMEDIUM);
- else
- retval = (-EIO);
- goto out;
- }
STm = &(STp->modes[STp->current_mode]);
- if (!STm->defined) {
- retval = (-ENXIO);
- goto out;
- }
- DEB(
- if (!STp->in_use) {
- printk(ST_DEB_MSG "st%d: Incorrect device.\n", dev);
- retval = (-EIO);
- goto out;
- } ) /* end DEB */
-
- if (STp->can_partitions &&
- (retval = update_partition(STp)) < 0)
- goto out;
-
- if (STp->block_size == 0) {
- if (STp->max_block > 0 &&
- (count < STp->min_block || count > STp->max_block)) {
- retval = (-EINVAL);
- goto out;
- }
- if (count > (STp->buffer)->buffer_size &&
- !enlarge_buffer(STp->buffer, count, STp->restr_dma)) {
- retval = (-EOVERFLOW);
- goto out;
- }
- }
- if ((STp->buffer)->buffer_blocks < 1) {
- /* Fixed block mode with too small buffer */
- if (!enlarge_buffer(STp->buffer, STp->block_size, STp->restr_dma)) {
- retval = (-EOVERFLOW);
- goto out;
- }
- (STp->buffer)->buffer_blocks = 1;
- }
-
if (!(STm->do_read_ahead) && STp->block_size != 0 &&
(count % STp->block_size) != 0) {
retval = (-EINVAL); /* Read must be integral number of blocks */
goto out;
}
- if (STp->do_auto_lock && STp->door_locked == ST_UNLOCKED &&
- !st_int_ioctl(STp, MTLOCK, 0))
- STp->door_locked = ST_LOCKED_AUTO;
-
STps = &(STp->ps[STp->partition]);
if (STps->rw == ST_WRITING) {
retval = flush_buffer(STp, 0);
@@ -1986,7 +1904,7 @@ static int st_set_options(Scsi_Tape *STp, long options)
st_log_options(STp, STm, dev);
} else if (code == MT_ST_WRITE_THRESHOLD) {
value = (options & ~MT_ST_OPTIONS) * ST_KILOBYTE;
- if (value < 1 || value > st_buffer_size) {
+ if (value < 1 || value > st_fixed_buffer_size) {
printk(KERN_WARNING
"st%d: Write threshold %d too small or too large.\n",
dev, value);
@@ -2289,8 +2207,10 @@ static int do_load_unload(Scsi_Tape *STp, struct file *filp, int load_code)
if (!retval) { /* SCSI command successful */
- if (!load_code)
+ if (!load_code) {
STp->rew_at_close = 0;
+ STp->ready = ST_NO_TAPE;
+ }
else {
STp->rew_at_close = STp->autorew_dev;
retval = check_tape(STp, filp);
@@ -2619,10 +2539,14 @@ static int st_int_ioctl(Scsi_Tape *STp, unsigned int cmd_in, unsigned long arg)
ioctl_result = st_int_ioctl(STp, MTBSF, 1);
if (cmd_in == MTSETBLK || cmd_in == SET_DENS_AND_BLK) {
+ int old_block_size = STp->block_size;
STp->block_size = arg & MT_ST_BLKSIZE_MASK;
- if (STp->block_size != 0)
+ if (STp->block_size != 0) {
+ if (old_block_size == 0)
+ normalize_buffer(STp->buffer);
(STp->buffer)->buffer_blocks =
(STp->buffer)->buffer_size / STp->block_size;
+ }
(STp->buffer)->buffer_bytes = (STp->buffer)->read_pointer = 0;
if (cmd_in == SET_DENS_AND_BLK)
STp->density = arg >> MT_ST_DENSITY_SHIFT;
@@ -3372,18 +3296,11 @@ static int st_ioctl(struct inode *inode, struct file *file,
/* Try to allocate a new tape buffer. Calling function must not hold
dev_arr_lock. */
static ST_buffer *
- new_tape_buffer(int from_initialization, int need_dma, int in_use)
+ new_tape_buffer(int from_initialization, int need_dma)
{
- int i, priority, b_size, order, got = 0, segs = 0;
+ int i, priority, got = 0, segs = 0;
ST_buffer *tb;
- read_lock(&st_dev_arr_lock);
- if (st_nbr_buffers >= st_template.dev_max) {
- read_unlock(&st_dev_arr_lock);
- return NULL; /* Should never happen */
- }
- read_unlock(&st_dev_arr_lock);
-
if (from_initialization)
priority = GFP_ATOMIC;
else
@@ -3391,85 +3308,19 @@ static ST_buffer *
i = sizeof(ST_buffer) + (st_max_sg_segs - 1) * sizeof(struct scatterlist);
tb = kmalloc(i, priority);
- if (tb) {
- if (need_dma)
- priority |= GFP_DMA;
-
- /* Try to allocate the first segment up to ST_FIRST_ORDER and the
- others big enough to reach the goal */
- for (b_size = PAGE_SIZE, order=0;
- b_size < st_buffer_size && order < ST_FIRST_ORDER;
- order++, b_size *= 2)
- ;
- for ( ; b_size >= PAGE_SIZE; order--, b_size /= 2) {
- tb->sg[0].page = alloc_pages(priority, order);
- tb->sg[0].offset = 0;
- if (tb->sg[0].page != NULL) {
- tb->sg[0].length = b_size;
- break;
- }
- }
- if (tb->sg[segs].page == NULL) {
- kfree(tb);
- tb = NULL;
- } else { /* Got something, continue */
-
- for (b_size = PAGE_SIZE, order=0;
- st_buffer_size >
- tb->sg[0].length + (ST_FIRST_SG - 1) * b_size;
- order++, b_size *= 2)
- ;
- for (segs = 1, got = tb->sg[0].length;
- got < st_buffer_size && segs < ST_FIRST_SG;) {
- tb->sg[segs].page = alloc_pages(priority, order);
- tb->sg[segs].offset = 0;
- if (tb->sg[segs].page == NULL) {
- if (st_buffer_size - got <=
- (ST_FIRST_SG - segs) * b_size / 2) {
- b_size /= 2; /* Large enough for the
- rest of the buffers */
- order--;
- continue;
- }
- tb->sg_segs = segs;
- tb->orig_sg_segs = 0;
- DEB(tb->buffer_size = got);
- normalize_buffer(tb);
- kfree(tb);
- tb = NULL;
- break;
- }
- tb->sg[segs].length = b_size;
- got += b_size;
- segs++;
- }
- }
- }
-
if (!tb) {
- printk(KERN_NOTICE "st: Can't allocate new tape buffer (nbr %d).\n",
- st_nbr_buffers);
+ printk(KERN_NOTICE "st: Can't allocate new tape buffer.\n");
return NULL;
}
tb->sg_segs = tb->orig_sg_segs = segs;
- tb->b_data = page_address(tb->sg[0].page);
+ if (segs > 0)
+ tb->b_data = page_address(tb->sg[0].page);
- DEBC(printk(ST_DEB_MSG
- "st: Allocated tape buffer %d (%d bytes, %d segments, dma: %d, a: %p).\n",
- st_nbr_buffers, got, tb->sg_segs, need_dma, tb->b_data);
- printk(ST_DEB_MSG
- "st: segment sizes: first %d, last %d bytes.\n",
- tb->sg[0].length, tb->sg[segs - 1].length);
- )
- tb->in_use = in_use;
+ tb->in_use = TRUE;
tb->dma = need_dma;
tb->buffer_size = got;
tb->writing = 0;
- write_lock(&st_dev_arr_lock);
- st_buffers[st_nbr_buffers++] = tb;
- write_unlock(&st_dev_arr_lock);
-
return tb;
}
@@ -3479,6 +3330,9 @@ static int enlarge_buffer(ST_buffer * STbuffer, int new_size, int need_dma)
{
int segs, nbr, max_segs, b_size, priority, order, got;
+ if (new_size <= STbuffer->buffer_size)
+ return TRUE;
+
normalize_buffer(STbuffer);
max_segs = STbuffer->use_sg;
@@ -3492,13 +3346,14 @@ static int enlarge_buffer(ST_buffer * STbuffer, int new_size, int need_dma)
if (need_dma)
priority |= GFP_DMA;
for (b_size = PAGE_SIZE, order=0;
- b_size * nbr < new_size - STbuffer->buffer_size;
+ b_size < new_size - STbuffer->buffer_size;
order++, b_size *= 2)
; /* empty */
for (segs = STbuffer->sg_segs, got = STbuffer->buffer_size;
segs < max_segs && got < new_size;) {
STbuffer->sg[segs].page = alloc_pages(priority, order);
+ /* printk("st: allocated %x, order %d\n", STbuffer->sg[segs].page, order); */
STbuffer->sg[segs].offset = 0;
if (STbuffer->sg[segs].page == NULL) {
if (new_size - got <= (max_segs - segs) * b_size / 2) {
@@ -3518,9 +3373,10 @@ static int enlarge_buffer(ST_buffer * STbuffer, int new_size, int need_dma)
STbuffer->buffer_size = got;
segs++;
}
+ STbuffer->b_data = page_address(STbuffer->sg[0].page);
DEBC(printk(ST_DEB_MSG
- "st: Succeeded to enlarge buffer to %d bytes (segs %d->%d, %d).\n",
- got, STbuffer->orig_sg_segs, STbuffer->sg_segs, b_size));
+ "st: Succeeded to enlarge buffer at %p to %d bytes (segs %d->%d, %d).\n",
+ STbuffer, got, STbuffer->orig_sg_segs, STbuffer->sg_segs, b_size));
return TRUE;
}
@@ -3535,14 +3391,14 @@ static void normalize_buffer(ST_buffer * STbuffer)
for (b_size=PAGE_SIZE, order=0; b_size < STbuffer->sg[i].length;
order++, b_size *= 2)
; /* empty */
+ /* printk("st: freeing %x, order %d\n", STbuffer->sg[i].page, order); */
__free_pages(STbuffer->sg[i].page, order);
STbuffer->buffer_size -= STbuffer->sg[i].length;
}
DEB(
if (debugging && STbuffer->orig_sg_segs < STbuffer->sg_segs)
printk(ST_DEB_MSG "st: Buffer at %p normalized to %d bytes (segs %d).\n",
- page_address(STbuffer->sg[0].page), STbuffer->buffer_size,
- STbuffer->sg_segs);
+ STbuffer, STbuffer->buffer_size, STbuffer->sg_segs);
) /* end DEB */
STbuffer->sg_segs = STbuffer->orig_sg_segs;
}
@@ -3619,18 +3475,16 @@ static int from_buffer(ST_buffer * st_bp, char *ubp, int do_count)
static void validate_options(void)
{
if (buffer_kbs > 0)
- st_buffer_size = buffer_kbs * ST_KILOBYTE;
+ st_fixed_buffer_size = buffer_kbs * ST_KILOBYTE;
if (write_threshold_kbs > 0)
st_write_threshold = write_threshold_kbs * ST_KILOBYTE;
else if (buffer_kbs > 0)
- st_write_threshold = st_buffer_size - 2048;
- if (st_write_threshold > st_buffer_size) {
- st_write_threshold = st_buffer_size;
+ st_write_threshold = st_fixed_buffer_size - 2048;
+ if (st_write_threshold > st_fixed_buffer_size) {
+ st_write_threshold = st_fixed_buffer_size;
printk(KERN_WARNING "st: write_threshold limited to %d bytes.\n",
st_write_threshold);
}
- if (max_buffers >= 0)
- st_max_buffers = max_buffers;
if (max_sg_segs >= ST_FIRST_SG)
st_max_sg_segs = max_sg_segs;
}
@@ -3694,7 +3548,8 @@ static int st_attach(Scsi_Device * SDp)
Scsi_Tape *tpnt;
ST_mode *STm;
ST_partstat *STps;
- int i, mode, target_nbr, dev_num;
+ ST_buffer *buffer;
+ int i, mode, dev_num;
char *stp;
if (SDp->type != TYPE_TAPE)
@@ -3707,6 +3562,12 @@ static int st_attach(Scsi_Device * SDp)
return 1;
}
+ buffer = new_tape_buffer(TRUE, (SDp->host)->unchecked_isa_dma);
+ if (buffer == NULL) {
+ printk(KERN_ERR "st: Can't allocate new tape buffer. Device not attached.\n");
+ return 1;
+ }
+
write_lock(&st_dev_arr_lock);
if (st_template.nr_dev >= st_template.dev_max) {
Scsi_Tape **tmp_da;
@@ -3745,14 +3606,6 @@ static int st_attach(Scsi_Device * SDp)
}
scsi_tapes = tmp_da;
- memset(tmp_ba, 0, tmp_dev_max * sizeof(ST_buffer *));
- if (st_buffers != NULL) {
- memcpy(tmp_ba, st_buffers,
- st_template.dev_max * sizeof(ST_buffer *));
- kfree(st_buffers);
- }
- st_buffers = tmp_ba;
-
st_template.dev_max = tmp_dev_max;
}
@@ -3799,6 +3652,9 @@ static int st_attach(Scsi_Device * SDp)
else
tpnt->tape_type = MT_ISSCSI2;
+ buffer->use_sg = tpnt->device->host->sg_tablesize;
+ tpnt->buffer = buffer;
+
tpnt->inited = 0;
tpnt->devt = mk_kdev(SCSI_TAPE_MAJOR, i);
tpnt->dirty = 0;
@@ -3858,18 +3714,6 @@ static int st_attach(Scsi_Device * SDp)
"Attached scsi tape st%d at scsi%d, channel %d, id %d, lun %d\n",
dev_num, SDp->host->host_no, SDp->channel, SDp->id, SDp->lun);
- /* See if we need to allocate more static buffers */
- target_nbr = st_template.nr_dev;
- if (target_nbr > st_max_buffers)
- target_nbr = st_max_buffers;
- for (i=st_nbr_buffers; i < target_nbr; i++)
- if (!new_tape_buffer(TRUE, TRUE, FALSE)) {
- printk(KERN_INFO "st: Unable to allocate new static buffer.\n");
- break;
- }
- /* If the previous allocation fails, we will try again when the buffer is
- really needed. */
-
return 0;
};
@@ -3897,6 +3741,11 @@ static void st_detach(Scsi_Device * SDp)
devfs_unregister (tpnt->de_n[mode]);
tpnt->de_n[mode] = NULL;
}
+ if (tpnt->buffer) {
+ tpnt->buffer->orig_sg_segs = 0;
+ normalize_buffer(tpnt->buffer);
+ kfree(tpnt->buffer);
+ }
kfree(tpnt);
scsi_tapes[i] = 0;
SDp->attached--;
@@ -3916,10 +3765,10 @@ static int __init init_st(void)
validate_options();
printk(KERN_INFO
- "st: Version %s, bufsize %d, wrt %d, "
- "max init. bufs %d, s/g segs %d\n",
- verstr, st_buffer_size, st_write_threshold,
- st_max_buffers, st_max_sg_segs);
+ "st: Version %s, fixed bufsize %d, wrt %d, "
+ "s/g segs %d\n",
+ verstr, st_fixed_buffer_size, st_write_threshold,
+ st_max_sg_segs);
if (devfs_register_chrdev(SCSI_TAPE_MAJOR, "st", &st_fops) >= 0)
return scsi_register_device(&st_template);
@@ -3939,16 +3788,6 @@ static void __exit exit_st(void)
if (scsi_tapes[i])
kfree(scsi_tapes[i]);
kfree(scsi_tapes);
- if (st_buffers != NULL) {
- for (i = 0; i < st_nbr_buffers; i++) {
- if (st_buffers[i] != NULL) {
- st_buffers[i]->orig_sg_segs = 0;
- normalize_buffer(st_buffers[i]);
- kfree(st_buffers[i]);
- }
- }
- kfree(st_buffers);
- }
}
st_template.dev_max = 0;
printk(KERN_INFO "st: Unloaded.\n");
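
Much of the st.c diff above is the extraction of the checks shared by reading and writing into one rw_checks() helper that both st_read() and st_write() call before touching the tape. The sketch below shows the shape of that refactoring with simplified stand-in types and error codes, not the driver's actual checks.

#include <stdio.h>
#include <stddef.h>

/* Stand-in tape state; the real checks cover error recovery, partitions,
 * door locking and more. */
struct tape { int in_use; int block_size; int buffer_size; };

static int rw_checks(struct tape *t, size_t count)
{
        if (!t->in_use)
                return -5;                   /* -EIO */
        if (t->block_size == 0 && count > (size_t)t->buffer_size)
                return -75;                  /* -EOVERFLOW, variable block mode */
        return 0;
}

static int tape_write(struct tape *t, size_t count)
{
        int err = rw_checks(t, count);
        if (err || count == 0)
                return err;
        if (t->block_size && count % t->block_size)
                return -22;                  /* write-only check: -EINVAL */
        return 0;                            /* ... do the transfer ... */
}

static int tape_read(struct tape *t, size_t count)
{
        int err = rw_checks(t, count);
        if (err || count == 0)
                return err;
        return 0;                            /* ... do the transfer ... */
}

int main(void)
{
        struct tape t = { 1, 1024, 32768 };

        /* unaligned write rejected, block-multiple read accepted */
        printf("%d %d\n", tape_write(&t, 1500), tape_read(&t, 2048));
        return 0;
}
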
diff --git a/drivers/scsi/st_options.h b/drivers/scsi/st_options.h
index 325bd3cb5c1e..2c412f72be13 100644
--- a/drivers/scsi/st_options.h
+++ b/drivers/scsi/st_options.h
@@ -3,7 +3,7 @@
Copyright 1995-2000 Kai Makisara.
- Last modified: Tue Jan 22 21:52:34 2002 by makisara
+ Last modified: Sun May 5 15:09:56 2002 by makisara
*/
#ifndef _ST_OPTIONS_H
@@ -30,22 +30,17 @@
SENSE. */
#define ST_DEFAULT_BLOCK 0
-/* The tape driver buffer size in kilobytes. Must be non-zero. */
-#define ST_BUFFER_BLOCKS 32
+/* The minimum tape driver buffer size in kilobytes in fixed block mode.
+ Must be non-zero. */
+#define ST_FIXED_BUFFER_BLOCKS 32
/* The number of kilobytes of data in the buffer that triggers an
asynchronous write in fixed block mode. See also ST_ASYNC_WRITES
below. */
#define ST_WRITE_THRESHOLD_BLOCKS 30
-/* The maximum number of tape buffers the driver tries to allocate at
- driver initialisation. The number is also constrained by the number
- of drives detected. If more buffers are needed, they are allocated
- at run time and freed after use. */
-#define ST_MAX_BUFFERS 4
-
/* Maximum number of scatter/gather segments */
-#define ST_MAX_SG 16
+#define ST_MAX_SG 64
/* The number of scatter/gather segments to allocate at first try (must be
smaller or equal to the maximum). */
diff --git a/fs/bio.c b/fs/bio.c
index e89734a07bea..5fdae32e35ae 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -17,6 +17,7 @@
*
*/
#include <linux/mm.h>
+#include <linux/bio.h>
#include <linux/blk.h>
#include <linux/slab.h>
#include <linux/iobuf.h>
@@ -284,8 +285,8 @@ struct bio *bio_copy(struct bio *bio, int gfp_mask, int copy)
vto = kmap(bbv->bv_page);
} else {
local_irq_save(flags);
- vfrom = kmap_atomic(bv->bv_page, KM_BIO_IRQ);
- vto = kmap_atomic(bbv->bv_page, KM_BIO_IRQ);
+ vfrom = kmap_atomic(bv->bv_page, KM_BIO_SRC_IRQ);
+ vto = kmap_atomic(bbv->bv_page, KM_BIO_DST_IRQ);
}
memcpy(vto + bbv->bv_offset, vfrom + bv->bv_offset, bv->bv_len);
@@ -293,8 +294,8 @@ struct bio *bio_copy(struct bio *bio, int gfp_mask, int copy)
kunmap(bbv->bv_page);
kunmap(bv->bv_page);
} else {
- kunmap_atomic(vto, KM_BIO_IRQ);
- kunmap_atomic(vfrom, KM_BIO_IRQ);
+ kunmap_atomic(vto, KM_BIO_DST_IRQ);
+ kunmap_atomic(vfrom, KM_BIO_SRC_IRQ);
local_irq_restore(flags);
}
}
diff --git a/fs/buffer.c b/fs/buffer.c
index b7e31f59193b..dde8e7d9bae6 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -152,14 +152,16 @@ __set_page_buffers(struct page *page, struct buffer_head *head)
{
if (page_has_buffers(page))
buffer_error();
- set_page_buffers(page, head);
page_cache_get(page);
+ SetPagePrivate(page);
+ page->private = (unsigned long)head;
}
static inline void
__clear_page_buffers(struct page *page)
{
- clear_page_buffers(page);
+ ClearPagePrivate(page);
+ page->private = 0;
page_cache_release(page);
}
@@ -376,7 +378,7 @@ out:
}
/*
- * Various filesystems appear to want __get_hash_table to be non-blocking.
+ * Various filesystems appear to want __find_get_block to be non-blocking.
* But it's the page lock which protects the buffers. To get around this,
* we get exclusion from try_to_free_buffers with the blockdev mapping's
* private_lock.
@@ -387,7 +389,7 @@ out:
* private_lock is contended then so is mapping->page_lock).
*/
struct buffer_head *
-__get_hash_table(struct block_device *bdev, sector_t block, int unused)
+__find_get_block(struct block_device *bdev, sector_t block, int unused)
{
struct inode *bd_inode = bdev->bd_inode;
struct address_space *bd_mapping = bd_inode->i_mapping;
@@ -492,7 +494,7 @@ static void free_more_memory(void)
}
/*
- * I/O completion handler for block_read_full_page() and brw_page() - pages
+ * I/O completion handler for block_read_full_page() - pages
* which come unlocked at the end of I/O.
*/
static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
@@ -542,14 +544,6 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
*/
if (page_uptodate && !PageError(page))
SetPageUptodate(page);
-
- /*
- * swap page handling is a bit hacky. A standalone completion handler
- * for swapout pages would fix that up. swapin can use this function.
- */
- if (PageSwapCache(page) && PageWriteback(page))
- end_page_writeback(page);
-
unlock_page(page);
return;
@@ -856,8 +850,9 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
if (mapping->assoc_mapping != buffer_mapping)
BUG();
}
- buffer_insert_list(&buffer_mapping->private_lock,
- bh, &mapping->private_list);
+ if (list_empty(&bh->b_assoc_buffers))
+ buffer_insert_list(&buffer_mapping->private_lock,
+ bh, &mapping->private_list);
}
EXPORT_SYMBOL(mark_buffer_dirty_inode);
@@ -952,12 +947,12 @@ void invalidate_inode_buffers(struct inode *inode)
* the size of each buffer.. Use the bh->b_this_page linked list to
* follow the buffers created. Return NULL if unable to create more
* buffers.
- * The async flag is used to differentiate async IO (paging, swapping)
- * from ordinary buffer allocations, and only async requests are allowed
- * to sleep waiting for buffer heads.
+ *
+ * The retry flag is used to differentiate async IO (paging, swapping)
+ * which may not fail from ordinary buffer allocations.
*/
static struct buffer_head *
-create_buffers(struct page * page, unsigned long size, int async)
+create_buffers(struct page * page, unsigned long size, int retry)
{
struct buffer_head *bh, *head;
long offset;
@@ -966,7 +961,7 @@ try_again:
head = NULL;
offset = PAGE_SIZE;
while ((offset -= size) >= 0) {
- bh = alloc_buffer_head(async);
+ bh = alloc_buffer_head();
if (!bh)
goto no_grow;
@@ -1003,7 +998,7 @@ no_grow:
* become available. But we don't want tasks sleeping with
* partially complete buffers, so all were released above.
*/
- if (!async)
+ if (!retry)
return NULL;
/* We're _really_ low on memory. Now we just
@@ -1096,7 +1091,7 @@ grow_dev_page(struct block_device *bdev, unsigned long block,
/*
* Link the page to the buffers and initialise them. Take the
- * lock to be atomic wrt __get_hash_table(), which does not
+ * lock to be atomic wrt __find_get_block(), which does not
* run under the page lock.
*/
spin_lock(&inode->i_mapping->private_lock);
@@ -1169,7 +1164,7 @@ __getblk(struct block_device *bdev, sector_t block, int size)
for (;;) {
struct buffer_head * bh;
- bh = __get_hash_table(bdev, block, size);
+ bh = __find_get_block(bdev, block, size);
if (bh) {
touch_buffer(bh);
return bh;
@@ -1218,7 +1213,7 @@ void mark_buffer_dirty(struct buffer_head *bh)
{
if (!buffer_uptodate(bh))
buffer_error();
- if (!test_set_buffer_dirty(bh))
+ if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
__set_page_dirty_nobuffers(bh->b_page);
}
@@ -1243,10 +1238,17 @@ void __brelse(struct buffer_head * buf)
* bforget() is like brelse(), except it discards any
* potentially dirty data.
*/
-void __bforget(struct buffer_head * buf)
+void __bforget(struct buffer_head *bh)
{
- clear_buffer_dirty(buf);
- __brelse(buf);
+ clear_buffer_dirty(bh);
+ if (!list_empty(&bh->b_assoc_buffers)) {
+ struct address_space *buffer_mapping = bh->b_page->mapping;
+
+ spin_lock(&buffer_mapping->private_lock);
+ list_del_init(&bh->b_assoc_buffers);
+ spin_unlock(&buffer_mapping->private_lock);
+ }
+ __brelse(bh);
}
/**
@@ -1359,11 +1361,11 @@ int block_invalidatepage(struct page *page, unsigned long offset)
{
struct buffer_head *head, *bh, *next;
unsigned int curr_off = 0;
+ int ret = 1;
- if (!PageLocked(page))
- BUG();
+ BUG_ON(!PageLocked(page));
if (!page_has_buffers(page))
- return 1;
+ goto out;
head = page_buffers(page);
bh = head;
@@ -1385,12 +1387,10 @@ int block_invalidatepage(struct page *page, unsigned long offset)
* The get_block cached value has been unconditionally invalidated,
* so real IO is not possible anymore.
*/
- if (offset == 0) {
- if (!try_to_release_page(page, 0))
- return 0;
- }
-
- return 1;
+ if (offset == 0)
+ ret = try_to_release_page(page, 0);
+out:
+ return ret;
}
EXPORT_SYMBOL(block_invalidatepage);
@@ -1449,7 +1449,7 @@ void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
{
struct buffer_head *old_bh;
- old_bh = __get_hash_table(bdev, block, 0);
+ old_bh = __find_get_block(bdev, block, 0);
if (old_bh) {
#if 0 /* This happens. Later. */
if (buffer_dirty(old_bh))
@@ -2266,68 +2266,6 @@ int brw_kiovec(int rw, int nr, struct kiobuf *iovec[],
}
/*
- * Start I/O on a page.
- * This function expects the page to be locked and may return
- * before I/O is complete. You then have to check page->locked
- * and page->uptodate.
- *
- * FIXME: we need a swapper_inode->get_block function to remove
- * some of the bmap kludges and interface ugliness here.
- *
- * NOTE: unlike file pages, swap pages are locked while under writeout.
- * This is to throttle processes which reuse their swapcache pages while
- * they are under writeout, and to ensure that there is no I/O going on
- * when the page has been successfully locked. Functions such as
- * free_swap_and_cache() need to guarantee that there is no I/O in progress
- * because they will be freeing up swap blocks, which may then be reused.
- *
- * Swap pages are also marked PageWriteback when they are being written
- * so that memory allocators will throttle on them.
- */
-int brw_page(int rw, struct page *page,
- struct block_device *bdev, sector_t b[], int size)
-{
- struct buffer_head *head, *bh;
-
- BUG_ON(!PageLocked(page));
-
- if (!page_has_buffers(page))
- create_empty_buffers(page, size, 0);
- head = bh = page_buffers(page);
-
- /* Stage 1: lock all the buffers */
- do {
- lock_buffer(bh);
- bh->b_blocknr = *(b++);
- bh->b_bdev = bdev;
- set_buffer_mapped(bh);
- if (rw == WRITE) {
- set_buffer_uptodate(bh);
- clear_buffer_dirty(bh);
- }
- /*
- * Swap pages are locked during writeout, so use
- * buffer_async_read in strange ways.
- */
- mark_buffer_async_read(bh);
- bh = bh->b_this_page;
- } while (bh != head);
-
- if (rw == WRITE) {
- BUG_ON(PageWriteback(page));
- SetPageWriteback(page);
- }
-
- /* Stage 2: start the IO */
- do {
- struct buffer_head *next = bh->b_this_page;
- submit_bh(rw, bh);
- bh = next;
- } while (bh != head);
- return 0;
-}
-
-/*
* Sanity checks for try_to_free_buffers.
*/
static void check_ttfb_buffer(struct page *page, struct buffer_head *bh)
@@ -2456,7 +2394,7 @@ asmlinkage long sys_bdflush(int func, long data)
static kmem_cache_t *bh_cachep;
static mempool_t *bh_mempool;
-struct buffer_head *alloc_buffer_head(int async)
+struct buffer_head *alloc_buffer_head(void)
{
return mempool_alloc(bh_mempool, GFP_NOFS);
}
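
The buffer.c hunks above rename __get_hash_table() to __find_get_block() (and sb_get_hash_table() to sb_find_get_block()), keeping that lookup non-blocking while __getblk()/sb_getblk() remain the find-or-create path, and they drop the async argument from alloc_buffer_head(). A sketch of the two caller-side flavours under the new names (the superblock and block number are assumed inputs; this is illustrative, not code from the patch):

	/* Assumes <linux/buffer_head.h>. */
	static void lookup_example(struct super_block *sb, int block)
	{
		struct buffer_head *bh;

		/* Non-blocking: NULL unless the buffer is already in the page cache. */
		bh = sb_find_get_block(sb, block);
		if (bh)
			brelse(bh);

		/* Find-or-create: may sleep allocating pages and buffer heads. */
		bh = sb_getblk(sb, block);
		if (bh)
			brelse(bh);
	}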
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 16bd5714cecf..5c581916ecdd 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -147,21 +147,26 @@ exit:
int coda_permission(struct inode *inode, int mask)
{
- int error;
+ int error = 0;
if (!mask)
return 0;
+ lock_kernel();
+
coda_vfs_stat.permission++;
if (coda_cache_check(inode, mask))
- return 0;
+ goto out;
error = venus_access(inode->i_sb, coda_i2f(inode), mask);
if (!error)
coda_cache_enter(inode, mask);
+ out:
+ unlock_kernel();
+
return error;
}
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index f8f6828d5f59..c5cc2178ad4a 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -352,7 +352,7 @@ do_more:
#ifdef CONFIG_JBD_DEBUG
{
struct buffer_head *debug_bh;
- debug_bh = sb_get_hash_table(sb, block + i);
+ debug_bh = sb_find_get_block(sb, block + i);
if (debug_bh) {
BUFFER_TRACE(debug_bh, "Deleted!");
if (!bh2jh(bitmap_bh)->b_committed_data)
@@ -701,7 +701,7 @@ got_block:
struct buffer_head *debug_bh;
/* Record bitmap buffer state in the newly allocated block */
- debug_bh = sb_get_hash_table(sb, tmp);
+ debug_bh = sb_find_get_block(sb, tmp);
if (debug_bh) {
BUFFER_TRACE(debug_bh, "state when allocated");
BUFFER_TRACE2(debug_bh, bh, "bitmap state");
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index b339c253628e..a9b2c7beb70b 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1650,7 +1650,7 @@ ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh,
struct buffer_head *bh;
*p = 0;
- bh = sb_get_hash_table(inode->i_sb, nr);
+ bh = sb_find_get_block(inode->i_sb, nr);
ext3_forget(handle, 0, inode, bh, nr);
}
}
diff --git a/fs/inode.c b/fs/inode.c
index bc90e4232713..a3b2cd4e8a3c 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -913,16 +913,6 @@ int bmap(struct inode * inode, int block)
return res;
}
-static inline void do_atime_update(struct inode *inode)
-{
- unsigned long time = CURRENT_TIME;
- if (inode->i_atime != time) {
- inode->i_atime = time;
- mark_inode_dirty_sync(inode);
- }
-}
-
-
/**
* update_atime - update the access time
* @inode: inode accessed
@@ -932,15 +922,19 @@ static inline void do_atime_update(struct inode *inode)
* as well as the "noatime" flag and inode specific "noatime" markers.
*/
-void update_atime (struct inode *inode)
+void update_atime(struct inode *inode)
{
if (inode->i_atime == CURRENT_TIME)
return;
- if ( IS_NOATIME (inode) ) return;
- if ( IS_NODIRATIME (inode) && S_ISDIR (inode->i_mode) ) return;
- if ( IS_RDONLY (inode) ) return;
- do_atime_update(inode);
-} /* End Function update_atime */
+ if (IS_NOATIME(inode))
+ return;
+ if (IS_NODIRATIME(inode) && S_ISDIR(inode->i_mode))
+ return;
+ if (IS_RDONLY(inode))
+ return;
+ inode->i_atime = CURRENT_TIME;
+ mark_inode_dirty_sync(inode);
+}
int inode_needs_sync(struct inode *inode)
{
diff --git a/fs/intermezzo/dir.c b/fs/intermezzo/dir.c
index c8a8c1988f16..cec0471800f1 100644
--- a/fs/intermezzo/dir.c
+++ b/fs/intermezzo/dir.c
@@ -785,13 +785,15 @@ int presto_permission(struct inode *inode, int mask)
{
unsigned short mode = inode->i_mode;
struct presto_cache *cache;
- int rc;
+ int rc = 0;
+ lock_kernel();
ENTRY;
+
if ( presto_can_ilookup() && !(mask & S_IWOTH)) {
CDEBUG(D_CACHE, "ilookup on %ld OK\n", inode->i_ino);
- EXIT;
- return 0;
+ EXIT;
+ goto out;
}
cache = presto_get_cache(inode);
@@ -803,25 +805,22 @@ int presto_permission(struct inode *inode, int mask)
if ( S_ISREG(mode) && fiops && fiops->permission ) {
EXIT;
- return fiops->permission(inode, mask);
+ rc = fiops->permission(inode, mask);
+ goto out;
}
if ( S_ISDIR(mode) && diops && diops->permission ) {
EXIT;
- return diops->permission(inode, mask);
+ rc = diops->permission(inode, mask);
+ goto out;
}
}
- /* The cache filesystem doesn't have its own permission function,
- * but we don't want to duplicate the VFS code here. In order
- * to avoid looping from permission calling this function again,
- * we temporarily override the permission operation while we call
- * the VFS permission function.
- */
- inode->i_op->permission = NULL;
- rc = permission(inode, mask);
- inode->i_op->permission = &presto_permission;
+ rc = vfs_permission(inode, mask);
EXIT;
+
+ out:
+ unlock_kernel();
return rc;
}
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index e4ce53b05a55..2283894a81a6 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -659,6 +659,20 @@ skip_commit:
* there's no point in keeping a checkpoint record for
* it. */
bh = jh2bh(jh);
+
+ /* A buffer which has been freed while still being
+ * journaled by a previous transaction may end up still
+ * being dirty here, but we want to avoid writing back
+ * that buffer in the future now that the last use has
+ * been committed. That's not only a performance gain,
+ * it also stops aliasing problems if the buffer is left
+ * behind for writeback and gets reallocated for another
+ * use in a different page. */
+ if (buffer_freed(bh)) {
+ clear_buffer_freed(bh);
+ clear_buffer_jbddirty(bh);
+ }
+
if (buffer_jdirty(bh)) {
JBUFFER_TRACE(jh, "add to new checkpointing trans");
__journal_insert_checkpoint(jh, commit_transaction);
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c
index 052dd4ef3f01..ade37ad43606 100644
--- a/fs/jbd/journal.c
+++ b/fs/jbd/journal.c
@@ -463,7 +463,7 @@ int journal_write_metadata_buffer(transaction_t *transaction,
* Right, time to make up the new buffer_head.
*/
do {
- new_bh = alloc_buffer_head(0);
+ new_bh = alloc_buffer_head();
if (!new_bh) {
printk (KERN_NOTICE "%s: ENOMEM at alloc_buffer_head, "
"trying again.\n", __FUNCTION__);
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c
index 7cecb0237988..6a6464533c35 100644
--- a/fs/jbd/revoke.c
+++ b/fs/jbd/revoke.c
@@ -293,7 +293,7 @@ int journal_revoke(handle_t *handle, unsigned long blocknr,
bh = bh_in;
if (!bh) {
- bh = __get_hash_table(bdev, blocknr, journal->j_blocksize);
+ bh = __find_get_block(bdev, blocknr, journal->j_blocksize);
if (bh)
BUFFER_TRACE(bh, "found on hash");
}
@@ -303,7 +303,7 @@ int journal_revoke(handle_t *handle, unsigned long blocknr,
/* If there is a different buffer_head lying around in
* memory anywhere... */
- bh2 = __get_hash_table(bdev, blocknr, journal->j_blocksize);
+ bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize);
if (bh2) {
/* ... and it has RevokeValid status... */
if ((bh2 != bh) &&
@@ -407,7 +407,7 @@ int journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
* state machine will get very upset later on. */
if (need_cancel) {
struct buffer_head *bh2;
- bh2 = __get_hash_table(bh->b_bdev, bh->b_blocknr, bh->b_size);
+ bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size);
if (bh2) {
if (bh2 != bh)
clear_bit(BH_Revoked, &bh2->b_state);
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 89c625bf9fa8..37c9ed30ebfd 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1601,8 +1601,7 @@ void journal_unfile_buffer(struct journal_head *jh)
*
* Returns non-zero iff we were able to free the journal_head.
*/
-static int __journal_try_to_free_buffer(struct buffer_head *bh,
- int *locked_or_dirty)
+static inline int __journal_try_to_free_buffer(struct buffer_head *bh)
{
struct journal_head *jh;
@@ -1610,12 +1609,7 @@ static int __journal_try_to_free_buffer(struct buffer_head *bh,
jh = bh2jh(bh);
- if (buffer_locked(bh) || buffer_dirty(bh)) {
- *locked_or_dirty = 1;
- goto out;
- }
-
- if (!buffer_uptodate(bh)) /* AKPM: why? */
+ if (buffer_locked(bh) || buffer_dirty(bh))
goto out;
if (jh->b_next_transaction != 0)
@@ -1630,8 +1624,7 @@ static int __journal_try_to_free_buffer(struct buffer_head *bh,
__journal_remove_journal_head(bh);
__brelse(bh);
}
- }
- else if (jh->b_cp_transaction != 0 && jh->b_transaction == 0) {
+ } else if (jh->b_cp_transaction != 0 && jh->b_transaction == 0) {
/* written-back checkpointed metadata buffer */
if (jh->b_jlist == BJ_None) {
JBUFFER_TRACE(jh, "remove from checkpoint list");
@@ -1647,10 +1640,8 @@ out:
}
/*
- * journal_try_to_free_buffers(). For all the buffers on this page,
- * if they are fully written out ordered data, move them onto BUF_CLEAN
- * so try_to_free_buffers() can reap them. Called with lru_list_lock
- * not held. Does its own locking.
+ * journal_try_to_free_buffers(). Try to remove all this page's buffers
+ * from the journal.
*
* This complicates JBD locking somewhat. We aren't protected by the
* BKL here. We wish to remove the buffer from its committing or
@@ -1669,50 +1660,28 @@ out:
* journal_try_to_free_buffer() is changing its state. But that
* cannot happen because we never reallocate freed data as metadata
* while the data is part of a transaction. Yes?
- *
- * This function returns non-zero if we wish try_to_free_buffers()
- * to be called. We do this is the page is releasable by try_to_free_buffers().
- * We also do it if the page has locked or dirty buffers and the caller wants
- * us to perform sync or async writeout.
*/
int journal_try_to_free_buffers(journal_t *journal,
- struct page *page, int gfp_mask)
+ struct page *page, int unused_gfp_mask)
{
+ struct buffer_head *head;
struct buffer_head *bh;
- struct buffer_head *tmp;
- int locked_or_dirty = 0;
- int call_ttfb = 1;
- int ret;
+ int ret = 0;
J_ASSERT(PageLocked(page));
- bh = page_buffers(page);
- tmp = bh;
+ head = page_buffers(page);
+ bh = head;
spin_lock(&journal_datalist_lock);
do {
- struct buffer_head *p = tmp;
-
- tmp = tmp->b_this_page;
- if (buffer_jbd(p))
- if (!__journal_try_to_free_buffer(p, &locked_or_dirty))
- call_ttfb = 0;
- } while (tmp != bh);
+ if (buffer_jbd(bh) && !__journal_try_to_free_buffer(bh)) {
+ spin_unlock(&journal_datalist_lock);
+ goto busy;
+ }
+ } while ((bh = bh->b_this_page) != head);
spin_unlock(&journal_datalist_lock);
-
- if (!(gfp_mask & (__GFP_IO|__GFP_WAIT)))
- goto out;
- if (!locked_or_dirty)
- goto out;
- /*
- * The VM wants us to do writeout, or to block on IO, or both.
- * So we allow try_to_free_buffers to be called even if the page
- * still has journalled buffers.
- */
- call_ttfb = 1;
-out:
- ret = 0;
- if (call_ttfb)
- ret = try_to_free_buffers(page);
+ ret = try_to_free_buffers(page);
+busy:
return ret;
}
@@ -1861,6 +1830,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
* running transaction if that is set, but nothing
* else. */
JBUFFER_TRACE(jh, "on committing transaction");
+ set_buffer_freed(bh);
if (jh->b_next_transaction) {
J_ASSERT(jh->b_next_transaction ==
journal->j_running_transaction);
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index ea37f1c39a64..7790f413096a 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -65,6 +65,7 @@
#include <linux/smp_lock.h>
#include <linux/completion.h>
#include <linux/buffer_head.h> /* for sync_blockdev() */
+#include <linux/bio.h>
#include "jfs_incore.h"
#include "jfs_filsys.h"
#include "jfs_metapage.h"
diff --git a/fs/namei.c b/fs/namei.c
index 506f8b5eee6b..8ac8afda4ccb 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -204,13 +204,8 @@ int vfs_permission(struct inode * inode, int mask)
int permission(struct inode * inode,int mask)
{
- if (inode->i_op && inode->i_op->permission) {
- int retval;
- lock_kernel();
- retval = inode->i_op->permission(inode, mask);
- unlock_kernel();
- return retval;
- }
+ if (inode->i_op && inode->i_op->permission)
+ return inode->i_op->permission(inode, mask);
return vfs_permission(inode, mask);
}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 1cbf3a697bda..73d57238a1cc 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1123,6 +1123,8 @@ nfs_permission(struct inode *inode, int mask)
&& error != -EACCES)
goto out;
+ lock_kernel();
+
error = NFS_PROTO(inode)->access(inode, mask, 0);
if (error == -EACCES && NFS_CLIENT(inode)->cl_droppriv &&
@@ -1130,6 +1132,8 @@ nfs_permission(struct inode *inode, int mask)
(current->fsuid != current->uid || current->fsgid != current->gid))
error = NFS_PROTO(inode)->access(inode, mask, 1);
+ unlock_kernel();
+
out:
return error;
}
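
With the namei.c hunk above, permission() no longer wraps ->permission in the BKL; filesystems that still need it (coda, InterMezzo and NFS in the hunks above) now take lock_kernel() inside their own method. A sketch of the resulting shape, using a hypothetical examplefs that is not part of this patch and falling back to vfs_permission() as InterMezzo does:

	/* Assumes <linux/fs.h> and <linux/smp_lock.h>. */
	static int examplefs_permission(struct inode *inode, int mask)
	{
		int error;

		lock_kernel();
		/* per-filesystem permission work that still relies on the BKL */
		error = vfs_permission(inode, mask);
		unlock_kernel();

		return error;
	}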
diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c
index fbff42392bab..7c20a2949e96 100644
--- a/fs/ntfs/aops.c
+++ b/fs/ntfs/aops.c
@@ -61,10 +61,10 @@ static void end_buffer_read_file_async(struct buffer_head *bh, int uptodate)
if (file_ofs < ni->initialized_size)
ofs = ni->initialized_size - file_ofs;
- addr = kmap_atomic(page, KM_BIO_IRQ);
+ addr = kmap_atomic(page, KM_BIO_SRC_IRQ);
memset(addr + bh_offset(bh) + ofs, 0, bh->b_size - ofs);
flush_dcache_page(page);
- kunmap_atomic(addr, KM_BIO_IRQ);
+ kunmap_atomic(addr, KM_BIO_SRC_IRQ);
}
} else
SetPageError(page);
@@ -363,10 +363,10 @@ static void end_buffer_read_mftbmp_async(struct buffer_head *bh, int uptodate)
if (file_ofs < vol->mftbmp_initialized_size)
ofs = vol->mftbmp_initialized_size - file_ofs;
- addr = kmap_atomic(page, KM_BIO_IRQ);
+ addr = kmap_atomic(page, KM_BIO_SRC_IRQ);
memset(addr + bh_offset(bh) + ofs, 0, bh->b_size - ofs);
flush_dcache_page(page);
- kunmap_atomic(addr, KM_BIO_IRQ);
+ kunmap_atomic(addr, KM_BIO_SRC_IRQ);
}
} else
SetPageError(page);
@@ -559,10 +559,10 @@ static void end_buffer_read_mst_async(struct buffer_head *bh, int uptodate)
if (file_ofs < ni->initialized_size)
ofs = ni->initialized_size - file_ofs;
- addr = kmap_atomic(page, KM_BIO_IRQ);
+ addr = kmap_atomic(page, KM_BIO_SRC_IRQ);
memset(addr + bh_offset(bh) + ofs, 0, bh->b_size - ofs);
flush_dcache_page(page);
- kunmap_atomic(addr, KM_BIO_IRQ);
+ kunmap_atomic(addr, KM_BIO_SRC_IRQ);
}
} else
SetPageError(page);
@@ -593,7 +593,7 @@ static void end_buffer_read_mst_async(struct buffer_head *bh, int uptodate)
rec_size = ni->_IDM(index_block_size);
recs = PAGE_CACHE_SIZE / rec_size;
- addr = kmap_atomic(page, KM_BIO_IRQ);
+ addr = kmap_atomic(page, KM_BIO_SRC_IRQ);
for (i = 0; i < recs; i++) {
if (!post_read_mst_fixup((NTFS_RECORD*)(addr +
i * rec_size), rec_size))
@@ -607,7 +607,7 @@ static void end_buffer_read_mst_async(struct buffer_head *bh, int uptodate)
ni->_IDM(index_block_size_bits)) + i));
}
flush_dcache_page(page);
- kunmap_atomic(addr, KM_BIO_IRQ);
+ kunmap_atomic(addr, KM_BIO_SRC_IRQ);
if (likely(!nr_err && recs))
SetPageUptodate(page);
else {
diff --git a/fs/qnx4/fsync.c b/fs/qnx4/fsync.c
index 2bb315473ee6..df5bc75d5414 100644
--- a/fs/qnx4/fsync.c
+++ b/fs/qnx4/fsync.c
@@ -37,7 +37,7 @@ static int sync_block(struct inode *inode, unsigned short *block, int wait)
if (!*block)
return 0;
tmp = *block;
- bh = sb_get_hash_table(inode->i_sb, *block);
+ bh = sb_find_get_block(inode->i_sb, *block);
if (!bh)
return 0;
if (*block != tmp) {
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index 0bdb34c5acf4..1cdcd39a06bd 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -920,7 +920,7 @@ static int is_left_neighbor_in_cache(
/* Get left neighbor block number. */
n_left_neighbor_blocknr = B_N_CHILD_NUM(p_s_tb->FL[n_h], n_left_neighbor_position);
/* Look for the left neighbor in the cache. */
- if ( (left = sb_get_hash_table(p_s_sb, n_left_neighbor_blocknr)) ) {
+ if ( (left = sb_find_get_block(p_s_sb, n_left_neighbor_blocknr)) ) {
RFALSE( buffer_uptodate (left) && ! B_IS_IN_TREE(left),
"vs-8170: left neighbor (%b %z) is not in the tree", left, left);
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index c16dbdc12ca6..2cf16631e224 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -689,7 +689,7 @@ retry:
count = 0 ;
for (i = 0 ; atomic_read(&(jl->j_commit_left)) > 1 && i < (jl->j_len + 1) ; i++) { /* everything but commit_bh */
bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start+i) % SB_ONDISK_JOURNAL_SIZE(s);
- tbh = journal_get_hash_table(s, bn) ;
+ tbh = journal_find_get_block(s, bn) ;
/* kill this sanity check */
if (count > (orig_commit_left + 2)) {
@@ -718,7 +718,7 @@ reiserfs_panic(s, "journal-539: flush_commit_list: BAD count(%d) > orig_commit_l
for (i = 0 ; atomic_read(&(jl->j_commit_left)) > 1 &&
i < (jl->j_len + 1) ; i++) { /* everything but commit_bh */
bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s) ;
- tbh = journal_get_hash_table(s, bn) ;
+ tbh = journal_find_get_block(s, bn) ;
wait_on_buffer(tbh) ;
if (!buffer_uptodate(tbh)) {
@@ -2764,7 +2764,7 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th, struct super_bloc
int cleaned = 0 ;
if (reiserfs_dont_log(th->t_super)) {
- bh = sb_get_hash_table(p_s_sb, blocknr) ;
+ bh = sb_find_get_block(p_s_sb, blocknr) ;
if (bh && buffer_dirty (bh)) {
printk ("journal_mark_freed(dont_log): dirty buffer on hash list: %lx %ld\n", bh->b_state, blocknr);
BUG ();
@@ -2772,7 +2772,7 @@ int journal_mark_freed(struct reiserfs_transaction_handle *th, struct super_bloc
brelse (bh);
return 0 ;
}
- bh = sb_get_hash_table(p_s_sb, blocknr) ;
+ bh = sb_find_get_block(p_s_sb, blocknr) ;
/* if it is journal new, we just remove it from this transaction */
if (bh && buffer_journal_new(bh)) {
mark_buffer_notjournal_new(bh) ;
diff --git a/fs/select.c b/fs/select.c
index 30c29f1e49f8..6a5909a75677 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -12,6 +12,9 @@
* 24 January 2000
* Changed sys_poll()/do_poll() to use PAGE_SIZE chunk-based allocation
* of fds to overcome nfds < 16390 descriptors limit (Tigran Aivazian).
+ *
+ * Dec 2001
+ * Stack allocation and fast path (Andi Kleen)
*/
#include <linux/slab.h>
@@ -26,21 +29,6 @@
#define ROUND_UP(x,y) (((x)+(y)-1)/(y))
#define DEFAULT_POLLMASK (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)
-struct poll_table_entry {
- struct file * filp;
- wait_queue_t wait;
- wait_queue_head_t * wait_address;
-};
-
-struct poll_table_page {
- struct poll_table_page * next;
- struct poll_table_entry * entry;
- struct poll_table_entry entries[0];
-};
-
-#define POLL_TABLE_FULL(table) \
- ((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table))
-
/*
* Ok, Peter made a complicated, but straightforward multiple_wait() function.
* I have rewritten this, taking some shortcuts: This code may not be easy to
@@ -62,30 +50,39 @@ void poll_freewait(poll_table* pt)
struct poll_table_page *old;
entry = p->entry;
- do {
+ while (entry > p->entries) {
entry--;
remove_wait_queue(entry->wait_address,&entry->wait);
fput(entry->filp);
- } while (entry > p->entries);
+ }
old = p;
p = p->next;
- free_page((unsigned long) old);
+ if (old != &pt->inline_page)
+ free_page((unsigned long) old);
}
}
void __pollwait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
struct poll_table_page *table = p->table;
-
- if (!table || POLL_TABLE_FULL(table)) {
- struct poll_table_page *new_table;
-
- new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
- if (!new_table) {
- p->error = -ENOMEM;
- __set_current_state(TASK_RUNNING);
- return;
+ struct poll_table_page *new_table = NULL;
+ int sz;
+
+ if (!table) {
+ new_table = &p->inline_page;
+ } else {
+ sz = (table == &p->inline_page) ? POLL_INLINE_TABLE_LEN : PAGE_SIZE;
+ if ((char*)table->entry >= (char*)table + sz) {
+ new_table = (struct poll_table_page *)__get_free_page(GFP_KERNEL);
+ if (!new_table) {
+ p->error = -ENOMEM;
+ __set_current_state(TASK_RUNNING);
+ return;
+ }
}
+ }
+
+ if (new_table) {
new_table->entry = new_table->entries;
new_table->next = table;
p->table = new_table;
@@ -113,48 +110,6 @@ void __pollwait(struct file * filp, wait_queue_head_t * wait_address, poll_table
#define BITS(fds, n) (*__IN(fds, n)|*__OUT(fds, n)|*__EX(fds, n))
-static int max_select_fd(unsigned long n, fd_set_bits *fds)
-{
- unsigned long *open_fds;
- unsigned long set;
- int max;
-
- /* handle last in-complete long-word first */
- set = ~(~0UL << (n & (__NFDBITS-1)));
- n /= __NFDBITS;
- open_fds = current->files->open_fds->fds_bits+n;
- max = 0;
- if (set) {
- set &= BITS(fds, n);
- if (set) {
- if (!(set & ~*open_fds))
- goto get_max;
- return -EBADF;
- }
- }
- while (n) {
- open_fds--;
- n--;
- set = BITS(fds, n);
- if (!set)
- continue;
- if (set & ~*open_fds)
- return -EBADF;
- if (max)
- continue;
-get_max:
- do {
- max++;
- set >>= 1;
- } while (set);
- max += n * __NFDBITS;
- }
-
- return max;
-}
-
-#define BIT(i) (1UL << ((i)&(__NFDBITS-1)))
-#define MEM(i,m) ((m)+(unsigned)(i)/__NFDBITS)
#define ISSET(i,m) (((i)&*(m)) != 0)
#define SET(i,m) (*(m) |= (i))
@@ -165,56 +120,71 @@ get_max:
int do_select(int n, fd_set_bits *fds, long *timeout)
{
poll_table table, *wait;
- int retval, i, off;
+ int retval, off, max, maxoff;
long __timeout = *timeout;
- read_lock(&current->files->file_lock);
- retval = max_select_fd(n, fds);
- read_unlock(&current->files->file_lock);
-
- if (retval < 0)
- return retval;
- n = retval;
-
poll_initwait(&table);
wait = &table;
if (!__timeout)
wait = NULL;
+
retval = 0;
+ maxoff = n/BITS_PER_LONG;
+ max = 0;
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
- for (i = 0 ; i < n; i++) {
- unsigned long bit = BIT(i);
- unsigned long mask;
- struct file *file;
+ for (off = 0; off <= maxoff; off++) {
+ unsigned long val = BITS(fds, off);
- off = i / __NFDBITS;
- if (!(bit & BITS(fds, off)))
+ if (!val)
continue;
- file = fget(i);
- mask = POLLNVAL;
- if (file) {
- mask = DEFAULT_POLLMASK;
- if (file->f_op && file->f_op->poll)
- mask = file->f_op->poll(file, wait);
- fput(file);
- }
- if ((mask & POLLIN_SET) && ISSET(bit, __IN(fds,off))) {
- SET(bit, __RES_IN(fds,off));
- retval++;
- wait = NULL;
- }
- if ((mask & POLLOUT_SET) && ISSET(bit, __OUT(fds,off))) {
- SET(bit, __RES_OUT(fds,off));
- retval++;
- wait = NULL;
- }
- if ((mask & POLLEX_SET) && ISSET(bit, __EX(fds,off))) {
- SET(bit, __RES_EX(fds,off));
- retval++;
- wait = NULL;
+ while (val) {
+ int k = ffz(~val);
+ unsigned long mask, bit;
+ struct file *file;
+
+ if (k > n%BITS_PER_LONG)
+ break;
+
+ bit = (1UL << k);
+ val &= ~bit;
+
+ file = fget((off * BITS_PER_LONG) + k);
+ mask = POLLNVAL;
+ if (file) {
+ mask = DEFAULT_POLLMASK;
+ if (file->f_op && file->f_op->poll)
+ mask = file->f_op->poll(file, wait);
+ fput(file);
+ } else {
+ /* This error will shadow all other results.
+ * This matches previous linux behaviour */
+ retval = -EBADF;
+ goto out;
+ }
+ if ((mask & POLLIN_SET) && ISSET(bit, __IN(fds,off))) {
+ SET(bit, __RES_IN(fds,off));
+ retval++;
+ wait = NULL;
+ }
+ if ((mask& POLLOUT_SET) && ISSET(bit,__OUT(fds,off))) {
+ SET(bit, __RES_OUT(fds,off));
+ retval++;
+ wait = NULL;
+ }
+ if ((mask & POLLEX_SET) && ISSET(bit, __EX(fds,off))) {
+ SET(bit, __RES_EX(fds,off));
+ retval++;
+ wait = NULL;
+ }
+
+ if (!(val &= ~bit))
+ break;
}
}
+
+
+ maxoff = max;
wait = NULL;
if (retval || !__timeout || signal_pending(current))
break;
@@ -224,25 +194,43 @@ int do_select(int n, fd_set_bits *fds, long *timeout)
}
__timeout = schedule_timeout(__timeout);
}
+
+out:
current->state = TASK_RUNNING;
poll_freewait(&table);
/*
- * Up-to-date the caller timeout.
+ * Update the caller timeout.
*/
*timeout = __timeout;
return retval;
}
-static void *select_bits_alloc(int size)
-{
- return kmalloc(6 * size, GFP_KERNEL);
-}
+/*
+ * We do a VERIFY_WRITE here even though we are only reading this time:
+ * we'll write to it eventually..
+ */
-static void select_bits_free(void *bits, int size)
+static int get_fd_set(unsigned long nr, void *ufdset, unsigned long *fdset)
{
- kfree(bits);
+ unsigned long rounded = FDS_BYTES(nr), mask;
+ if (ufdset) {
+ int error = verify_area(VERIFY_WRITE, ufdset, rounded);
+ if (!error && __copy_from_user(fdset, ufdset, rounded))
+ error = -EFAULT;
+ if (nr % __NFDBITS == 0)
+ mask = 0;
+ else {
+ /* This includes one bit too much according to SU;
+ but without this some programs hang. */
+ mask = ~(~0UL << (nr%__NFDBITS));
+ }
+ fdset[nr/__NFDBITS] &= mask;
+ return error;
+ }
+ memset(fdset, 0, rounded);
+ return 0;
}
/*
@@ -263,6 +251,7 @@ sys_select(int n, fd_set *inp, fd_set *outp, fd_set *exp, struct timeval *tvp)
char *bits;
long timeout;
int ret, size, max_fdset;
+ char stack_bits[FDS_BYTES(FAST_SELECT_MAX) * 6];
timeout = MAX_SCHEDULE_TIMEOUT;
if (tvp) {
@@ -297,11 +286,16 @@ sys_select(int n, fd_set *inp, fd_set *outp, fd_set *exp, struct timeval *tvp)
* since we used fdset we need to allocate memory in units of
* long-words.
*/
- ret = -ENOMEM;
size = FDS_BYTES(n);
- bits = select_bits_alloc(size);
- if (!bits)
- goto out_nofds;
+ if (n < FAST_SELECT_MAX) {
+ bits = stack_bits;
+ } else {
+ ret = -ENOMEM;
+ bits = kmalloc(6*size, GFP_KERNEL);
+ if (!bits)
+ goto out_nofds;
+ }
+
fds.in = (unsigned long *) bits;
fds.out = (unsigned long *) (bits + size);
fds.ex = (unsigned long *) (bits + 2*size);
@@ -313,9 +307,7 @@ sys_select(int n, fd_set *inp, fd_set *outp, fd_set *exp, struct timeval *tvp)
(ret = get_fd_set(n, outp, fds.out)) ||
(ret = get_fd_set(n, exp, fds.ex)))
goto out;
- zero_fd_set(n, fds.res_in);
- zero_fd_set(n, fds.res_out);
- zero_fd_set(n, fds.res_ex);
+ memset(fds.res_in, 0, 3*size);
ret = do_select(n, &fds, &timeout);
@@ -326,8 +318,8 @@ sys_select(int n, fd_set *inp, fd_set *outp, fd_set *exp, struct timeval *tvp)
usec = timeout % HZ;
usec *= (1000000/HZ);
}
- put_user(sec, &tvp->tv_sec);
- put_user(usec, &tvp->tv_usec);
+ __put_user(sec, &tvp->tv_sec);
+ __put_user(usec, &tvp->tv_usec);
}
if (ret < 0)
@@ -344,8 +336,10 @@ sys_select(int n, fd_set *inp, fd_set *outp, fd_set *exp, struct timeval *tvp)
set_fd_set(n, exp, fds.res_ex);
out:
- select_bits_free(bits, size);
+ if (n >= FAST_SELECT_MAX)
+ kfree(bits);
out_nofds:
+
return ret;
}
@@ -410,12 +404,42 @@ static int do_poll(unsigned int nfds, unsigned int nchunks, unsigned int nleft,
return count;
}
+static int fast_poll(poll_table *table, poll_table *wait, struct pollfd *ufds,
+ unsigned int nfds, long timeout)
+{
+ poll_table *pt = wait;
+ struct pollfd fds[FAST_POLL_MAX];
+ int count, i;
+
+ if (copy_from_user(fds, ufds, nfds * sizeof(struct pollfd)))
+ return -EFAULT;
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ count = 0;
+ do_pollfd(nfds, fds, &pt, &count);
+ pt = NULL;
+ if (count || !timeout || signal_pending(current))
+ break;
+ count = wait->error;
+ if (count)
+ break;
+ timeout = schedule_timeout(timeout);
+ }
+ current->state = TASK_RUNNING;
+ for (i = 0; i < nfds; i++)
+ __put_user(fds[i].revents, &ufds[i].revents);
+ poll_freewait(table);
+ if (!count && signal_pending(current))
+ return -EINTR;
+ return count;
+}
+
asmlinkage long sys_poll(struct pollfd * ufds, unsigned int nfds, long timeout)
{
- int i, j, fdcount, err;
+ int i, j, err, fdcount;
struct pollfd **fds;
poll_table table, *wait;
- int nchunks, nleft;
+ int nchunks, nleft;
/* Do a sanity check on nfds ... */
if (nfds > NR_OPEN)
@@ -429,43 +453,45 @@ asmlinkage long sys_poll(struct pollfd * ufds, unsigned int nfds, long timeout)
timeout = MAX_SCHEDULE_TIMEOUT;
}
+
poll_initwait(&table);
wait = &table;
if (!timeout)
wait = NULL;
- err = -ENOMEM;
- fds = NULL;
- if (nfds != 0) {
- fds = (struct pollfd **)kmalloc(
- (1 + (nfds - 1) / POLLFD_PER_PAGE) * sizeof(struct pollfd *),
- GFP_KERNEL);
- if (fds == NULL)
- goto out;
- }
+ if (nfds < FAST_POLL_MAX)
+ return fast_poll(&table, wait, ufds, nfds, timeout);
+ err = -ENOMEM;
+ fds = (struct pollfd **)kmalloc(
+ (1 + (nfds - 1) / POLLFD_PER_PAGE) * sizeof(struct pollfd *),
+ GFP_KERNEL);
+ if (fds == NULL)
+ goto out;
+
nchunks = 0;
nleft = nfds;
- while (nleft > POLLFD_PER_PAGE) { /* allocate complete PAGE_SIZE chunks */
+ while (nleft > POLLFD_PER_PAGE) {
fds[nchunks] = (struct pollfd *)__get_free_page(GFP_KERNEL);
if (fds[nchunks] == NULL)
goto out_fds;
nchunks++;
nleft -= POLLFD_PER_PAGE;
}
- if (nleft) { /* allocate last PAGE_SIZE chunk, only nleft elements used */
+ if (nleft) {
fds[nchunks] = (struct pollfd *)__get_free_page(GFP_KERNEL);
if (fds[nchunks] == NULL)
goto out_fds;
- }
-
+ }
+
err = -EFAULT;
for (i=0; i < nchunks; i++)
if (copy_from_user(fds[i], ufds + i*POLLFD_PER_PAGE, PAGE_SIZE))
goto out_fds1;
+
if (nleft) {
if (copy_from_user(fds[nchunks], ufds + nchunks*POLLFD_PER_PAGE,
- nleft * sizeof(struct pollfd)))
+ nleft * sizeof(struct pollfd)))
goto out_fds1;
}
@@ -489,8 +515,7 @@ out_fds1:
out_fds:
for (i=0; i < nchunks; i++)
free_page((unsigned long)(fds[i]));
- if (nfds != 0)
- kfree(fds);
+ kfree(fds);
out:
poll_freewait(&table);
return err;
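
The rewritten do_select() loop above scans the fd bitmaps one long word at a time and uses ffz(~val) to jump straight to the next set bit instead of testing every descriptor individually. A standalone sketch of that scan (the word and its base descriptor number are assumed inputs; the ->poll() call itself is elided):

	/* Assumes <asm/bitops.h> for ffz(). */
	static void scan_fd_word(unsigned long val, int base)
	{
		while (val) {
			int k = ffz(~val);		/* index of the lowest set bit of val */
			unsigned long bit = 1UL << k;

			val &= ~bit;			/* clear it so the scan advances */
			/* the descriptor number is base + k; poll it here */
		}
	}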
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index f8134d41d98e..6b87c6f26702 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -117,7 +117,7 @@ static int ufs_trunc_direct (struct inode * inode)
frag1 = ufs_fragnum (frag1);
frag2 = ufs_fragnum (frag2);
for (j = frag1; j < frag2; j++) {
- bh = sb_get_hash_table (sb, tmp + j);
+ bh = sb_find_get_block (sb, tmp + j);
if ((bh && DATA_BUFFER_USED(bh)) || tmp != fs32_to_cpu(sb, *p)) {
retry = 1;
brelse (bh);
@@ -140,7 +140,7 @@ next1:
if (!tmp)
continue;
for (j = 0; j < uspi->s_fpb; j++) {
- bh = sb_get_hash_table(sb, tmp + j);
+ bh = sb_find_get_block(sb, tmp + j);
if ((bh && DATA_BUFFER_USED(bh)) || tmp != fs32_to_cpu(sb, *p)) {
retry = 1;
brelse (bh);
@@ -179,7 +179,7 @@ next2:;
ufs_panic(sb, "ufs_truncate_direct", "internal error");
frag4 = ufs_fragnum (frag4);
for (j = 0; j < frag4; j++) {
- bh = sb_get_hash_table (sb, tmp + j);
+ bh = sb_find_get_block (sb, tmp + j);
if ((bh && DATA_BUFFER_USED(bh)) || tmp != fs32_to_cpu(sb, *p)) {
retry = 1;
brelse (bh);
@@ -238,7 +238,7 @@ static int ufs_trunc_indirect (struct inode * inode, unsigned offset, u32 * p)
if (!tmp)
continue;
for (j = 0; j < uspi->s_fpb; j++) {
- bh = sb_get_hash_table(sb, tmp + j);
+ bh = sb_find_get_block(sb, tmp + j);
if ((bh && DATA_BUFFER_USED(bh)) || tmp != fs32_to_cpu(sb, *ind)) {
retry = 1;
brelse (bh);
diff --git a/include/asm-alpha/agp.h b/include/asm-alpha/agp.h
new file mode 100644
index 000000000000..ba05bdf9a211
--- /dev/null
+++ b/include/asm-alpha/agp.h
@@ -0,0 +1,11 @@
+#ifndef AGP_H
+#define AGP_H 1
+
+/* dummy for now */
+
+#define map_page_into_agp(page)
+#define unmap_page_from_agp(page)
+#define flush_agp_mappings()
+#define flush_agp_cache() mb()
+
+#endif
diff --git a/include/asm-i386/agp.h b/include/asm-i386/agp.h
new file mode 100644
index 000000000000..9ae97c09fb49
--- /dev/null
+++ b/include/asm-i386/agp.h
@@ -0,0 +1,23 @@
+#ifndef AGP_H
+#define AGP_H 1
+
+#include <asm/pgtable.h>
+
+/*
+ * Functions to keep the agpgart mappings coherent with the MMU.
+ * The GART gives the CPU a physical alias of pages in memory. The alias region is
+ * mapped uncacheable. Make sure there are no conflicting mappings
+ * with different cachability attributes for the same page. This avoids
+ * data corruption on some CPUs.
+ */
+
+#define map_page_into_agp(page) change_page_attr(page, 1, PAGE_KERNEL_NOCACHE)
+#define unmap_page_from_agp(page) change_page_attr(page, 1, PAGE_KERNEL)
+#define flush_agp_mappings() global_flush_tlb()
+
+/* Could use CLFLUSH here if the cpu supports it. But then it would
+ need to be called for each cacheline of the whole page so it may not be
+ worth it. Would need a page for it. */
+#define flush_agp_cache() asm volatile("wbinvd":::"memory")
+
+#endif
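
The new asm-i386/agp.h maps the AGP hooks onto change_page_attr() and global_flush_tlb(), so a page handed to the GART loses its cached kernel mapping while it is aliased through the aperture. A sketch of how agpgart-style code is expected to use the hooks (the page is an assumed input; error handling is elided):

	/* Assumes <asm/agp.h>. */
	static void agp_map_example(struct page *page)
	{
		map_page_into_agp(page);	/* remap the kernel mapping uncacheable */
		flush_agp_mappings();		/* flush TLBs so the new attribute takes effect */

		/* ... the page may now be accessed through the GART aperture ... */

		unmap_page_from_agp(page);	/* restore PAGE_KERNEL */
		flush_agp_mappings();
	}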
diff --git a/include/asm-i386/cacheflush.h b/include/asm-i386/cacheflush.h
index 58d027dfc5ff..319e65a7047f 100644
--- a/include/asm-i386/cacheflush.h
+++ b/include/asm-i386/cacheflush.h
@@ -15,4 +15,7 @@
#define flush_icache_page(vma,pg) do { } while (0)
#define flush_icache_user_range(vma,pg,adr,len) do { } while (0)
+void global_flush_tlb(void);
+int change_page_attr(struct page *page, int numpages, pgprot_t prot);
+
#endif /* _I386_CACHEFLUSH_H */
diff --git a/include/asm-i386/io.h b/include/asm-i386/io.h
index 44996d06ecc3..9922dd823c9c 100644
--- a/include/asm-i386/io.h
+++ b/include/asm-i386/io.h
@@ -121,31 +121,7 @@ static inline void * ioremap (unsigned long offset, unsigned long size)
return __ioremap(offset, size, 0);
}
-/**
- * ioremap_nocache - map bus memory into CPU space
- * @offset: bus address of the memory
- * @size: size of the resource to map
- *
- * ioremap_nocache performs a platform specific sequence of operations to
- * make bus memory CPU accessible via the readb/readw/readl/writeb/
- * writew/writel functions and the other mmio helpers. The returned
- * address is not guaranteed to be usable directly as a virtual
- * address.
- *
- * This version of ioremap ensures that the memory is marked uncachable
- * on the CPU as well as honouring existing caching rules from things like
- * the PCI bus. Note that there are other caches and buffers on many
- * busses. In paticular driver authors should read up on PCI writes
- *
- * It's useful if some control registers are in such an area and
- * write combining or read caching is not desirable:
- */
-
-static inline void * ioremap_nocache (unsigned long offset, unsigned long size)
-{
- return __ioremap(offset, size, _PAGE_PCD);
-}
-
+extern void * ioremap_nocache (unsigned long offset, unsigned long size);
extern void iounmap(void *addr);
/*
diff --git a/include/asm-i386/kmap_types.h b/include/asm-i386/kmap_types.h
index 9a12267d3a4f..0ae7bb3c2b8d 100644
--- a/include/asm-i386/kmap_types.h
+++ b/include/asm-i386/kmap_types.h
@@ -15,10 +15,11 @@ D(1) KM_SKB_SUNRPC_DATA,
D(2) KM_SKB_DATA_SOFTIRQ,
D(3) KM_USER0,
D(4) KM_USER1,
-D(5) KM_BIO_IRQ,
-D(6) KM_PTE0,
-D(7) KM_PTE1,
-D(8) KM_TYPE_NR
+D(5) KM_BIO_SRC_IRQ,
+D(6) KM_BIO_DST_IRQ,
+D(7) KM_PTE0,
+D(8) KM_PTE1,
+D(9) KM_TYPE_NR
};
#undef D
diff --git a/include/asm-i386/page.h b/include/asm-i386/page.h
index 4737ef69ae18..d8e1f404c08b 100644
--- a/include/asm-i386/page.h
+++ b/include/asm-i386/page.h
@@ -6,6 +6,9 @@
#define PAGE_SIZE (1UL << PAGE_SHIFT)
#define PAGE_MASK (~(PAGE_SIZE-1))
+#define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
+#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT)
+
#ifdef __KERNEL__
#ifndef __ASSEMBLY__
diff --git a/include/asm-i386/pgtable-2level.h b/include/asm-i386/pgtable-2level.h
index e22db0cc6824..9f8bdc13adac 100644
--- a/include/asm-i386/pgtable-2level.h
+++ b/include/asm-i386/pgtable-2level.h
@@ -40,6 +40,7 @@ static inline int pgd_present(pgd_t pgd) { return 1; }
* hook is made available.
*/
#define set_pte(pteptr, pteval) (*(pteptr) = pteval)
+#define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval)
/*
* (pmds are folded into pgds so this doesnt get actually called,
* but the define is needed for a generic inline function.)
diff --git a/include/asm-i386/pgtable-3level.h b/include/asm-i386/pgtable-3level.h
index bb2eaea63fde..beb0c1bc3d30 100644
--- a/include/asm-i386/pgtable-3level.h
+++ b/include/asm-i386/pgtable-3level.h
@@ -49,6 +49,8 @@ static inline void set_pte(pte_t *ptep, pte_t pte)
smp_wmb();
ptep->pte_low = pte.pte_low;
}
+#define set_pte_atomic(pteptr,pteval) \
+ set_64bit((unsigned long long *)(pteptr),pte_val(pteval))
#define set_pmd(pmdptr,pmdval) \
set_64bit((unsigned long long *)(pmdptr),pmd_val(pmdval))
#define set_pgd(pgdptr,pgdval) \
diff --git a/include/asm-i386/pgtable.h b/include/asm-i386/pgtable.h
index f48db2beeeba..71b75fa234af 100644
--- a/include/asm-i386/pgtable.h
+++ b/include/asm-i386/pgtable.h
@@ -237,6 +237,9 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
#define pmd_page(pmd) \
(mem_map + (pmd_val(pmd) >> PAGE_SHIFT))
+#define pmd_large(pmd) \
+ ((pmd_val(pmd) & (_PAGE_PSE|_PAGE_PRESENT)) == (_PAGE_PSE|_PAGE_PRESENT))
+
/* to find an entry in a page-table-directory. */
#define pgd_index(address) ((address >> PGDIR_SHIFT) & (PTRS_PER_PGD-1))
diff --git a/include/asm-ia64/agp.h b/include/asm-ia64/agp.h
new file mode 100644
index 000000000000..ba05bdf9a211
--- /dev/null
+++ b/include/asm-ia64/agp.h
@@ -0,0 +1,11 @@
+#ifndef AGP_H
+#define AGP_H 1
+
+/* dummy for now */
+
+#define map_page_into_agp(page)
+#define unmap_page_from_agp(page)
+#define flush_agp_mappings()
+#define flush_agp_cache() mb()
+
+#endif
diff --git a/include/asm-ppc/kmap_types.h b/include/asm-ppc/kmap_types.h
index 99fec407abf5..bce7fd8c1ff2 100644
--- a/include/asm-ppc/kmap_types.h
+++ b/include/asm-ppc/kmap_types.h
@@ -11,7 +11,8 @@ enum km_type {
KM_SKB_DATA_SOFTIRQ,
KM_USER0,
KM_USER1,
- KM_BIO_IRQ,
+ KM_BIO_SRC_IRQ,
+ KM_BIO_DST_IRQ,
KM_PTE0,
KM_PTE1,
KM_TYPE_NR
diff --git a/include/asm-sparc/kmap_types.h b/include/asm-sparc/kmap_types.h
index 7e9a5661c698..bab20a2a676b 100644
--- a/include/asm-sparc/kmap_types.h
+++ b/include/asm-sparc/kmap_types.h
@@ -7,7 +7,8 @@ enum km_type {
KM_SKB_DATA_SOFTIRQ,
KM_USER0,
KM_USER1,
- KM_BIO_IRQ,
+ KM_BIO_SRC_IRQ,
+ KM_BIO_DST_IRQ,
KM_TYPE_NR
};
diff --git a/include/asm-sparc64/agp.h b/include/asm-sparc64/agp.h
new file mode 100644
index 000000000000..ba05bdf9a211
--- /dev/null
+++ b/include/asm-sparc64/agp.h
@@ -0,0 +1,11 @@
+#ifndef AGP_H
+#define AGP_H 1
+
+/* dummy for now */
+
+#define map_page_into_agp(page)
+#define unmap_page_from_agp(page)
+#define flush_agp_mappings()
+#define flush_agp_cache() mb()
+
+#endif
diff --git a/include/asm-x86_64/agp.h b/include/asm-x86_64/agp.h
new file mode 100644
index 000000000000..8c2fabe80419
--- /dev/null
+++ b/include/asm-x86_64/agp.h
@@ -0,0 +1,23 @@
+#ifndef AGP_H
+#define AGP_H 1
+
+#include <asm/cacheflush.h>
+
+/*
+ * Functions to keep the agpgart mappings coherent.
+ * The GART gives the CPU a physical alias of memory. The alias is
+ * mapped uncacheable. Make sure there are no conflicting mappings
+ * with different cachability attributes for the same page.
+ */
+
+#define map_page_into_agp(page) \
+ change_page_attr(page, __pgprot(__PAGE_KERNEL | _PAGE_PCD))
+#define unmap_page_from_agp(page) change_page_attr(page, PAGE_KERNEL)
+#define flush_agp_mappings() global_flush_tlb()
+
+/* Could use CLFLUSH here if the cpu supports it. But then it would
+ need to be called for each cacheline of the whole page so it may not be
+ worth it. Would need a page for it. */
+#define flush_agp_cache() asm volatile("wbinvd":::"memory")
+
+#endif
diff --git a/include/asm-x86_64/cacheflush.h b/include/asm-x86_64/cacheflush.h
index 58d027dfc5ff..319e65a7047f 100644
--- a/include/asm-x86_64/cacheflush.h
+++ b/include/asm-x86_64/cacheflush.h
@@ -15,4 +15,7 @@
#define flush_icache_page(vma,pg) do { } while (0)
#define flush_icache_user_range(vma,pg,adr,len) do { } while (0)
+void global_flush_tlb(void);
+int change_page_attr(struct page *page, int numpages, pgprot_t prot);
+
#endif /* _I386_CACHEFLUSH_H */
diff --git a/include/asm-x86_64/i387.h b/include/asm-x86_64/i387.h
index edb75edb063e..2a0292c00b54 100644
--- a/include/asm-x86_64/i387.h
+++ b/include/asm-x86_64/i387.h
@@ -16,11 +16,22 @@
#include <asm/processor.h>
#include <asm/sigcontext.h>
#include <asm/user.h>
+#include <asm/thread_info.h>
extern void fpu_init(void);
extern void init_fpu(void);
int save_i387(struct _fpstate *buf);
+static inline int need_signal_i387(struct task_struct *me)
+{
+ if (!me->used_math)
+ return 0;
+ me->used_math = 0;
+ if (!test_thread_flag(TIF_USEDFPU))
+ return 0;
+ return 1;
+}
+
/*
* FPU lazy state save handling...
*/
diff --git a/include/asm-x86_64/ia32.h b/include/asm-x86_64/ia32.h
index e57c2e593007..7830bf40cfd4 100644
--- a/include/asm-x86_64/ia32.h
+++ b/include/asm-x86_64/ia32.h
@@ -18,7 +18,9 @@ typedef int __kernel_clock_t32;
typedef int __kernel_pid_t32;
typedef unsigned short __kernel_ipc_pid_t32;
typedef unsigned short __kernel_uid_t32;
+typedef unsigned __kernel_uid32_t32;
typedef unsigned short __kernel_gid_t32;
+typedef unsigned __kernel_gid32_t32;
typedef unsigned short __kernel_dev_t32;
typedef unsigned int __kernel_ino_t32;
typedef unsigned short __kernel_mode_t32;
diff --git a/include/asm-x86_64/ipc.h b/include/asm-x86_64/ipc.h
index 49ea4fdc19b4..2ca5773be061 100644
--- a/include/asm-x86_64/ipc.h
+++ b/include/asm-x86_64/ipc.h
@@ -1,34 +1,6 @@
#ifndef __i386_IPC_H__
#define __i386_IPC_H__
-/*
- * These are used to wrap system calls on x86.
- *
- * See arch/i386/kernel/sys_i386.c for ugly details..
- *
- * (on x86-64 only used for 32bit emulation)
- */
-
-struct ipc_kludge {
- struct msgbuf *msgp;
- long msgtyp;
-};
-
-#define SEMOP 1
-#define SEMGET 2
-#define SEMCTL 3
-#define MSGSND 11
-#define MSGRCV 12
-#define MSGGET 13
-#define MSGCTL 14
-#define SHMAT 21
-#define SHMDT 22
-#define SHMGET 23
-#define SHMCTL 24
-
-/* Used by the DIPC package, try and avoid reusing it */
-#define DIPC 25
-
-#define IPCCALL(version,op) ((version)<<16 | (op))
+/* dummy */
#endif
diff --git a/include/asm-x86_64/kmap_types.h b/include/asm-x86_64/kmap_types.h
index 7e9a5661c698..bab20a2a676b 100644
--- a/include/asm-x86_64/kmap_types.h
+++ b/include/asm-x86_64/kmap_types.h
@@ -7,7 +7,8 @@ enum km_type {
KM_SKB_DATA_SOFTIRQ,
KM_USER0,
KM_USER1,
- KM_BIO_IRQ,
+ KM_BIO_SRC_IRQ,
+ KM_BIO_DST_IRQ,
KM_TYPE_NR
};
diff --git a/include/asm-x86_64/mmu_context.h b/include/asm-x86_64/mmu_context.h
index e9f6d661cf4c..e21f0e6721f8 100644
--- a/include/asm-x86_64/mmu_context.h
+++ b/include/asm-x86_64/mmu_context.h
@@ -19,8 +19,8 @@ int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk, unsigned cpu)
{
- if(cpu_tlbstate[cpu].state == TLBSTATE_OK)
- cpu_tlbstate[cpu].state = TLBSTATE_LAZY;
+ if (read_pda(mmu_state) == TLBSTATE_OK)
+ write_pda(mmu_state, TLBSTATE_LAZY);
}
#else
static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk, unsigned cpu)
@@ -35,8 +35,8 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
/* stop flush ipis for the previous mm */
clear_bit(cpu, &prev->cpu_vm_mask);
#ifdef CONFIG_SMP
- cpu_tlbstate[cpu].state = TLBSTATE_OK;
- cpu_tlbstate[cpu].active_mm = next;
+ write_pda(mmu_state, TLBSTATE_OK);
+ write_pda(active_mm, next);
#endif
set_bit(cpu, &next->cpu_vm_mask);
/* Re-load page tables */
@@ -48,8 +48,8 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
}
#ifdef CONFIG_SMP
else {
- cpu_tlbstate[cpu].state = TLBSTATE_OK;
- if(cpu_tlbstate[cpu].active_mm != next)
+ write_pda(mmu_state, TLBSTATE_OK);
+ if (read_pda(active_mm) != next)
out_of_line_bug();
if(!test_and_set_bit(cpu, &next->cpu_vm_mask)) {
/* We were in lazy tlb mode and leave_mm disabled
diff --git a/include/asm-x86_64/msr.h b/include/asm-x86_64/msr.h
index 7e522c2f4846..4085cc8c5dbe 100644
--- a/include/asm-x86_64/msr.h
+++ b/include/asm-x86_64/msr.h
@@ -95,6 +95,7 @@
#define MSR_IA32_PERFCTR0 0xc1
#define MSR_IA32_PERFCTR1 0xc2
+#define MSR_MTRRcap 0x0fe
#define MSR_IA32_BBL_CR_CTL 0x119
#define MSR_IA32_MCG_CAP 0x179
@@ -110,6 +111,19 @@
#define MSR_IA32_LASTINTFROMIP 0x1dd
#define MSR_IA32_LASTINTTOIP 0x1de
+#define MSR_MTRRfix64K_00000 0x250
+#define MSR_MTRRfix16K_80000 0x258
+#define MSR_MTRRfix16K_A0000 0x259
+#define MSR_MTRRfix4K_C0000 0x268
+#define MSR_MTRRfix4K_C8000 0x269
+#define MSR_MTRRfix4K_D0000 0x26a
+#define MSR_MTRRfix4K_D8000 0x26b
+#define MSR_MTRRfix4K_E0000 0x26c
+#define MSR_MTRRfix4K_E8000 0x26d
+#define MSR_MTRRfix4K_F0000 0x26e
+#define MSR_MTRRfix4K_F8000 0x26f
+#define MSR_MTRRdefType 0x2ff
+
#define MSR_IA32_MC0_CTL 0x400
#define MSR_IA32_MC0_STATUS 0x401
#define MSR_IA32_MC0_ADDR 0x402
@@ -171,11 +185,4 @@
#define MSR_IA32_APICBASE_ENABLE (1<<11)
#define MSR_IA32_APICBASE_BASE (0xfffff<<12)
-
-#define MSR_IA32_THERM_CONTROL 0x19a
-#define MSR_IA32_THERM_INTERRUPT 0x19b
-#define MSR_IA32_THERM_STATUS 0x19c
-#define MSR_IA32_MISC_ENABLE 0x1a0
-
-
#endif
diff --git a/include/asm-x86_64/mtrr.h b/include/asm-x86_64/mtrr.h
index ff3ea870d0d6..6505d7bd6ece 100644
--- a/include/asm-x86_64/mtrr.h
+++ b/include/asm-x86_64/mtrr.h
@@ -30,16 +30,16 @@
struct mtrr_sentry
{
- unsigned long base; /* Base address */
- unsigned long size; /* Size of region */
+ __u64 base; /* Base address */
+ __u32 size; /* Size of region */
unsigned int type; /* Type of region */
};
struct mtrr_gentry
{
+ __u64 base; /* Base address */
+ __u32 size; /* Size of region */
unsigned int regnum; /* Register number */
- unsigned long base; /* Base address */
- unsigned long size; /* Size of region */
unsigned int type; /* Type of region */
};
@@ -81,46 +81,38 @@ static char *mtrr_strings[MTRR_NUM_TYPES] =
#ifdef __KERNEL__
/* The following functions are for use by other drivers */
-# ifdef CONFIG_MTRR
-extern int mtrr_add (unsigned long base, unsigned long size,
- unsigned int type, char increment);
-extern int mtrr_add_page (unsigned long base, unsigned long size,
- unsigned int type, char increment);
-extern int mtrr_del (int reg, unsigned long base, unsigned long size);
-extern int mtrr_del_page (int reg, unsigned long base, unsigned long size);
-extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi);
-# else
-static __inline__ int mtrr_add (unsigned long base, unsigned long size,
+#ifdef CONFIG_MTRR
+extern int mtrr_add (__u64 base, __u32 size, unsigned int type, char increment);
+extern int mtrr_add_page (__u64 base, __u32 size, unsigned int type, char increment);
+extern int mtrr_del (int reg, __u64 base, __u32 size);
+extern int mtrr_del_page (int reg, __u64 base, __u32 size);
+#else
+static __inline__ int mtrr_add (__u64 base, __u32 size,
unsigned int type, char increment)
{
return -ENODEV;
}
-static __inline__ int mtrr_add_page (unsigned long base, unsigned long size,
+static __inline__ int mtrr_add_page (__u64 base, __u32 size,
unsigned int type, char increment)
{
return -ENODEV;
}
-static __inline__ int mtrr_del (int reg, unsigned long base,
- unsigned long size)
+static __inline__ int mtrr_del (int reg, __u64 base, __u32 size)
{
return -ENODEV;
}
-static __inline__ int mtrr_del_page (int reg, unsigned long base,
- unsigned long size)
+static __inline__ int mtrr_del_page (int reg, __u64 base, __u32 size)
{
return -ENODEV;
}
-
-static __inline__ void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) {;}
-
-# endif
+#endif
/* The following functions are for initialisation: don't use them! */
extern int mtrr_init (void);
-# if defined(CONFIG_SMP) && defined(CONFIG_MTRR)
+#if defined(CONFIG_SMP) && defined(CONFIG_MTRR)
extern void mtrr_init_boot_cpu (void);
extern void mtrr_init_secondary_cpu (void);
-# endif
+#endif
#endif
diff --git a/include/asm-x86_64/pda.h b/include/asm-x86_64/pda.h
index 7ff508346013..eb38cf70fb90 100644
--- a/include/asm-x86_64/pda.h
+++ b/include/asm-x86_64/pda.h
@@ -22,6 +22,8 @@ struct x8664_pda {
unsigned int __local_bh_count;
unsigned int __nmi_count; /* arch dependent */
struct task_struct * __ksoftirqd_task; /* waitqueue is too large */
+ struct mm_struct *active_mm;
+ int mmu_state;
} ____cacheline_aligned;
#define PDA_STACKOFFSET (5*8)
diff --git a/include/asm-x86_64/processor.h b/include/asm-x86_64/processor.h
index 4cda0f055a5f..03875338aedf 100644
--- a/include/asm-x86_64/processor.h
+++ b/include/asm-x86_64/processor.h
@@ -45,21 +45,12 @@ struct cpuinfo_x86 {
__u8 x86_vendor; /* CPU vendor */
__u8 x86_model;
__u8 x86_mask;
- /* We know that wp_works_ok = 1, hlt_works_ok = 1, hard_math = 1,
- etc... */
- char wp_works_ok; /* It doesn't on 386's */
- char hlt_works_ok; /* Problems on some 486Dx4's and old 386's */
- char hard_math;
- char rfu;
int cpuid_level; /* Maximum supported CPUID level, -1=no CPUID */
__u32 x86_capability[NCAPINTS];
char x86_vendor_id[16];
char x86_model_id[64];
int x86_cache_size; /* in KB - valid for CPUS which support this
call */
- int fdiv_bug;
- int f00f_bug;
- int coma_bug;
unsigned long loops_per_jiffy;
} ____cacheline_aligned;
@@ -323,7 +314,7 @@ struct thread_struct {
/* IO permissions. the bitmap could be moved into the GDT, that would make
switch faster for a limited number of ioperm using tasks. -AK */
int ioperm;
- u32 io_bitmap[IO_BITMAP_SIZE+1];
+ u32 *io_bitmap_ptr;
};
#define INIT_THREAD { \
diff --git a/include/asm-x86_64/spinlock.h b/include/asm-x86_64/spinlock.h
index 6f1d71c65a68..a276217b88a3 100644
--- a/include/asm-x86_64/spinlock.h
+++ b/include/asm-x86_64/spinlock.h
@@ -15,7 +15,7 @@ extern int printk(const char * fmt, ...)
typedef struct {
volatile unsigned int lock;
-#ifdef CONFIG_DEBUG_SPINLOCK
+#if SPINLOCK_DEBUG
unsigned magic;
#endif
} spinlock_t;
@@ -39,7 +39,7 @@ typedef struct {
* We make no fairness assumptions. They have a cost.
*/
-#define spin_is_locked(x) (*(volatile char *)(&(x)->lock) <= 0)
+#define spin_is_locked(x) (*(volatile signed char *)(&(x)->lock) <= 0)
#define spin_unlock_wait(x) do { barrier(); } while(spin_is_locked(x))
#define spin_lock_string \
@@ -62,7 +62,7 @@ typedef struct {
static inline int _raw_spin_trylock(spinlock_t *lock)
{
- char oldval;
+ signed char oldval;
__asm__ __volatile__(
"xchgb %b0,%1"
:"=q" (oldval), "=m" (lock->lock)
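
The spinlock.h hunk above switches the lock-byte cast and the trylock temporary from plain char to signed char: the locked test depends on seeing negative values, and plain char is unsigned on some ABIs (or with certain compiler options), where a contended lock byte of 0xff would compare as 255 and look free. A small user-space illustration of the difference, not kernel code:

	#include <stdio.h>

	int main(void)
	{
		signed char s = (signed char)0xff;
		unsigned char u = 0xff;

		/* prints -1 and 255: only the signed byte satisfies "<= 0" */
		printf("s=%d u=%d  s<=0:%d u<=0:%d\n", s, u, s <= 0, u <= 0);
		return 0;
	}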
diff --git a/include/asm-x86_64/string.h b/include/asm-x86_64/string.h
index ec456eadb674..27876b9da06a 100644
--- a/include/asm-x86_64/string.h
+++ b/include/asm-x86_64/string.h
@@ -40,18 +40,9 @@ extern void *__memcpy(void *to, const void *from, size_t len);
__ret = __builtin_memcpy((dst),(src),__len); \
__ret; })
-#if 0
+
#define __HAVE_ARCH_MEMSET
-extern void *__memset(void *mem, int val, size_t len);
-#define memset(dst,val,len) \
- ({ size_t __len = (len); \
- void *__ret; \
- if (__builtin_constant_p(len) && __len >= 64) \
- __ret = __memset((dst),(val),__len); \
- else \
- __ret = __builtin_memset((dst),(val),__len); \
- __ret; })
-#endif
+#define memset __builtin_memset
#define __HAVE_ARCH_MEMMOVE
void * memmove(void * dest,const void *src,size_t count);
diff --git a/include/asm-x86_64/suspend.h b/include/asm-x86_64/suspend.h
new file mode 100644
index 000000000000..9f065f8fe33d
--- /dev/null
+++ b/include/asm-x86_64/suspend.h
@@ -0,0 +1,6 @@
+#ifndef SUSPEND_H
+#define SUSPEND_H 1
+
+/* dummy for now */
+
+#endif
diff --git a/include/asm-x86_64/system.h b/include/asm-x86_64/system.h
index 1df84d087823..9d6c6f1f48d5 100644
--- a/include/asm-x86_64/system.h
+++ b/include/asm-x86_64/system.h
@@ -13,7 +13,10 @@
#define LOCK_PREFIX ""
#endif
-#define prepare_to_switch() do {} while(0)
+#define prepare_arch_schedule(prev) do { } while(0)
+#define finish_arch_schedule(prev) do { } while(0)
+#define prepare_arch_switch(rq) do { } while(0)
+#define finish_arch_switch(rq) spin_unlock_irq(&(rq)->lock)
#define __STR(x) #x
#define STR(x) __STR(x)
@@ -41,7 +44,7 @@
__POP(rax) __POP(r15) __POP(r14) __POP(r13) __POP(r12) __POP(r11) __POP(r10) \
__POP(r9) __POP(r8)
-#define switch_to(prev,next) \
+#define switch_to(prev,next,last) \
asm volatile(SAVE_CONTEXT \
"movq %%rsp,%[prevrsp]\n\t" \
"movq %[nextrsp],%%rsp\n\t" \
diff --git a/include/asm-x86_64/timex.h b/include/asm-x86_64/timex.h
index b87680d9e51a..98bddc2d805a 100644
--- a/include/asm-x86_64/timex.h
+++ b/include/asm-x86_64/timex.h
@@ -48,6 +48,4 @@ static inline cycles_t get_cycles (void)
extern unsigned int cpu_khz;
-#define ARCH_HAS_JIFFIES_64
-
#endif
diff --git a/include/asm-x86_64/tlbflush.h b/include/asm-x86_64/tlbflush.h
index 3f086b2d03b3..2e811ac262af 100644
--- a/include/asm-x86_64/tlbflush.h
+++ b/include/asm-x86_64/tlbflush.h
@@ -106,15 +106,6 @@ static inline void flush_tlb_range(struct vm_area_struct * vma, unsigned long st
#define TLBSTATE_OK 1
#define TLBSTATE_LAZY 2
-struct tlb_state
-{
- struct mm_struct *active_mm;
- int state;
- char __cacheline_padding[24];
-};
-extern struct tlb_state cpu_tlbstate[NR_CPUS];
-
-
#endif
#define flush_tlb_kernel_range(start, end) flush_tlb_all()
diff --git a/include/linux/bio.h b/include/linux/bio.h
index b244108a27a8..ffc38fca9c1e 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -21,6 +21,8 @@
#define __LINUX_BIO_H
#include <linux/kdev_t.h>
+#include <linux/highmem.h>
+
/* Platforms may set this to teach the BIO layer about IOMMU hardware. */
#include <asm/io.h>
#ifndef BIO_VMERGE_BOUNDARY
@@ -47,9 +49,6 @@ struct bio_vec {
unsigned int bv_offset;
};
-/*
- * weee, c forward decl...
- */
struct bio;
typedef void (bio_end_io_t) (struct bio *);
typedef void (bio_destructor_t) (struct bio *);
@@ -206,4 +205,49 @@ extern inline void bio_init(struct bio *);
extern int bio_ioctl(kdev_t, unsigned int, unsigned long);
+#ifdef CONFIG_HIGHMEM
+/*
+ * remember to add offset! and never ever reenable interrupts between a
+ * bio_kmap_irq and bio_kunmap_irq!!
+ *
+ * This function MUST be inlined - it plays with the CPU interrupt flags.
+ * Hence the `extern inline'.
+ */
+extern inline char *bio_kmap_irq(struct bio *bio, unsigned long *flags)
+{
+ unsigned long addr;
+
+ __save_flags(*flags);
+
+ /*
+ * could be low
+ */
+ if (!PageHighMem(bio_page(bio)))
+ return bio_data(bio);
+
+ /*
+ * it's a highmem page
+ */
+ __cli();
+ addr = (unsigned long) kmap_atomic(bio_page(bio), KM_BIO_SRC_IRQ);
+
+ if (addr & ~PAGE_MASK)
+ BUG();
+
+ return (char *) addr + bio_offset(bio);
+}
+
+extern inline void bio_kunmap_irq(char *buffer, unsigned long *flags)
+{
+ unsigned long ptr = (unsigned long) buffer & PAGE_MASK;
+
+ kunmap_atomic((void *) ptr, KM_BIO_SRC_IRQ);
+ __restore_flags(*flags);
+}
+
+#else
+#define bio_kmap_irq(bio, flags) (bio_data(bio))
+#define bio_kunmap_irq(buf, flags) do { *(flags) = 0; } while (0)
+#endif
+
#endif /* __LINUX_BIO_H */
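The comment above spells out the calling contract for the new bio_kmap_irq()/bio_kunmap_irq() pair. A minimal sketch of the intended usage (illustrative only, not part of this patch; the helper name, the len parameter and the assumption that len fits inside the first segment are invented for the example, with <linux/bio.h> and <linux/string.h> assumed included):

static void copy_from_bio(struct bio *bio, char *dst, unsigned int len)
{
	unsigned long flags;
	char *src;

	src = bio_kmap_irq(bio, &flags);  /* saves flags; disables IRQs for a highmem page */
	memcpy(dst, src, len);            /* must not sleep or re-enable interrupts here */
	bio_kunmap_irq(src, &flags);      /* unmaps and restores the saved flags */
}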
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index ef86a3ed6e64..c0c099834df2 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -246,12 +246,7 @@ extern unsigned long blk_max_low_pfn, blk_max_pfn;
#define BLK_BOUNCE_ISA (ISA_DMA_THRESHOLD)
extern int init_emergency_isa_pool(void);
-extern void create_bounce(unsigned long pfn, int gfp, struct bio **bio_orig);
-
-extern inline void blk_queue_bounce(request_queue_t *q, struct bio **bio)
-{
- create_bounce(q->bounce_pfn, q->bounce_gfp, bio);
-}
+void blk_queue_bounce(request_queue_t *q, struct bio **bio);
#define rq_for_each_bio(bio, rq) \
if ((rq->bio)) \
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 90767fc78617..4fc6bab55825 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -108,12 +108,7 @@ BUFFER_FNS(Async_Read, async_read)
BUFFER_FNS(Async_Write, async_write)
BUFFER_FNS(Boundary, boundary)
-/*
- * FIXME: this is used only by bh_kmap, which is used only by RAID5.
- * Move all that stuff into raid5.c
- */
#define bh_offset(bh) ((unsigned long)(bh)->b_data & ~PAGE_MASK)
-
#define touch_buffer(bh) mark_page_accessed(bh->b_page)
/* If we *know* page->private refers to buffer_heads */
@@ -124,16 +119,6 @@ BUFFER_FNS(Boundary, boundary)
((struct buffer_head *)(page)->private); \
})
#define page_has_buffers(page) PagePrivate(page)
-#define set_page_buffers(page, buffers) \
- do { \
- SetPagePrivate(page); \
- page->private = (unsigned long)buffers; \
- } while (0)
-#define clear_page_buffers(page) \
- do { \
- ClearPagePrivate(page); \
- page->private = 0; \
- } while (0)
#define invalidate_buffers(dev) __invalidate_buffers((dev), 0)
#define destroy_buffers(dev) __invalidate_buffers((dev), 1)
@@ -175,15 +160,14 @@ int fsync_dev(kdev_t);
int fsync_bdev(struct block_device *);
int fsync_super(struct super_block *);
int fsync_no_super(struct block_device *);
-struct buffer_head *__get_hash_table(struct block_device *, sector_t, int);
+struct buffer_head *__find_get_block(struct block_device *, sector_t, int);
struct buffer_head * __getblk(struct block_device *, sector_t, int);
void __brelse(struct buffer_head *);
void __bforget(struct buffer_head *);
struct buffer_head * __bread(struct block_device *, int, int);
void wakeup_bdflush(void);
-struct buffer_head *alloc_buffer_head(int async);
+struct buffer_head *alloc_buffer_head(void);
void free_buffer_head(struct buffer_head * bh);
-int brw_page(int, struct page *, struct block_device *, sector_t [], int);
void FASTCALL(unlock_buffer(struct buffer_head *bh));
/*
@@ -270,9 +254,9 @@ static inline struct buffer_head * sb_getblk(struct super_block *sb, int block)
}
static inline struct buffer_head *
-sb_get_hash_table(struct super_block *sb, int block)
+sb_find_get_block(struct super_block *sb, int block)
{
- return __get_hash_table(sb->s_bdev, block, sb->s_blocksize);
+ return __find_get_block(sb->s_bdev, block, sb->s_blocksize);
}
static inline void
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index da66723d62c5..68c841afc622 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -2,7 +2,6 @@
#define _LINUX_HIGHMEM_H
#include <linux/config.h>
-#include <linux/bio.h>
#include <linux/fs.h>
#include <asm/cacheflush.h>
@@ -15,45 +14,8 @@ extern struct page *highmem_start_page;
/* declarations for linux/mm/highmem.c */
unsigned int nr_free_highpages(void);
-extern void create_bounce(unsigned long pfn, int gfp, struct bio **bio_orig);
extern void check_highmem_ptes(void);
-/*
- * remember to add offset! and never ever reenable interrupts between a
- * bio_kmap_irq and bio_kunmap_irq!!
- */
-static inline char *bio_kmap_irq(struct bio *bio, unsigned long *flags)
-{
- unsigned long addr;
-
- __save_flags(*flags);
-
- /*
- * could be low
- */
- if (!PageHighMem(bio_page(bio)))
- return bio_data(bio);
-
- /*
- * it's a highmem page
- */
- __cli();
- addr = (unsigned long) kmap_atomic(bio_page(bio), KM_BIO_IRQ);
-
- if (addr & ~PAGE_MASK)
- BUG();
-
- return (char *) addr + bio_offset(bio);
-}
-
-static inline void bio_kunmap_irq(char *buffer, unsigned long *flags)
-{
- unsigned long ptr = (unsigned long) buffer & PAGE_MASK;
-
- kunmap_atomic((void *) ptr, KM_BIO_IRQ);
- __restore_flags(*flags);
-}
-
#else /* CONFIG_HIGHMEM */
static inline unsigned int nr_free_highpages(void) { return 0; }
@@ -65,12 +27,6 @@ static inline void *kmap(struct page *page) { return page_address(page); }
#define kmap_atomic(page,idx) kmap(page)
#define kunmap_atomic(page,idx) kunmap(page)
-#define bh_kmap(bh) ((bh)->b_data)
-#define bh_kunmap(bh) do { } while (0)
-
-#define bio_kmap_irq(bio, flags) (bio_data(bio))
-#define bio_kunmap_irq(buf, flags) do { *(flags) = 0; } while (0)
-
#endif /* CONFIG_HIGHMEM */
/* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */
diff --git a/include/linux/ide.h b/include/linux/ide.h
index e07d0f19fcd1..03c21c567ce4 100644
--- a/include/linux/ide.h
+++ b/include/linux/ide.h
@@ -15,6 +15,7 @@
#include <linux/devfs_fs_kernel.h>
#include <linux/interrupt.h>
#include <linux/bitops.h>
+#include <linux/bio.h>
#include <asm/byteorder.h>
#include <asm/hdreg.h>
diff --git a/include/linux/jbd.h b/include/linux/jbd.h
index 835d38c9dbfc..683c1247fd70 100644
--- a/include/linux/jbd.h
+++ b/include/linux/jbd.h
@@ -238,6 +238,7 @@ enum jbd_state_bits {
BUFFER_FNS(JBD, jbd)
BUFFER_FNS(JBDDirty, jbddirty)
TAS_BUFFER_FNS(JBDDirty, jbddirty)
+BUFFER_FNS(Freed, freed)
static inline struct buffer_head *jh2bh(struct journal_head *jh)
{
diff --git a/include/linux/loop.h b/include/linux/loop.h
index d4dc0665a92d..4dfa8b14a586 100644
--- a/include/linux/loop.h
+++ b/include/linux/loop.h
@@ -62,14 +62,6 @@ typedef int (* transfer_proc_t)(struct loop_device *, int cmd,
char *raw_buf, char *loop_buf, int size,
int real_block);
-static inline int lo_do_transfer(struct loop_device *lo, int cmd, char *rbuf,
- char *lbuf, int size, int rblock)
-{
- if (!lo->transfer)
- return 0;
-
- return lo->transfer(lo, cmd, rbuf, lbuf, size, rblock);
-}
#endif /* __KERNEL__ */
/*
diff --git a/include/linux/poll.h b/include/linux/poll.h
index 796aac51388a..86b1ee2d3eb3 100644
--- a/include/linux/poll.h
+++ b/include/linux/poll.h
@@ -10,13 +10,32 @@
#include <linux/mm.h>
#include <asm/uaccess.h>
-struct poll_table_page;
+#define POLL_INLINE_BYTES 256
+#define FAST_SELECT_MAX 128
+#define FAST_POLL_MAX 128
+#define POLL_INLINE_ENTRIES (1+(POLL_INLINE_BYTES / sizeof(struct poll_table_entry)))
+
+struct poll_table_entry {
+ struct file * filp;
+ wait_queue_t wait;
+ wait_queue_head_t * wait_address;
+};
+
+struct poll_table_page {
+ struct poll_table_page * next;
+ struct poll_table_entry * entry;
+ struct poll_table_entry entries[0];
+};
typedef struct poll_table_struct {
int error;
struct poll_table_page * table;
+ struct poll_table_page inline_page;
+ struct poll_table_entry inline_table[POLL_INLINE_ENTRIES];
} poll_table;
+#define POLL_INLINE_TABLE_LEN (sizeof(poll_table) - offsetof(poll_table, inline_page))
+
extern void __pollwait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p);
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
@@ -30,6 +49,7 @@ static inline void poll_initwait(poll_table* pt)
pt->error = 0;
pt->table = NULL;
}
+
extern void poll_freewait(poll_table* pt);
@@ -49,27 +69,6 @@ typedef struct {
#define FDS_LONGS(nr) (((nr)+FDS_BITPERLONG-1)/FDS_BITPERLONG)
#define FDS_BYTES(nr) (FDS_LONGS(nr)*sizeof(long))
-/*
- * We do a VERIFY_WRITE here even though we are only reading this time:
- * we'll write to it eventually..
- *
- * Use "unsigned long" accesses to let user-mode fd_set's be long-aligned.
- */
-static inline
-int get_fd_set(unsigned long nr, void *ufdset, unsigned long *fdset)
-{
- nr = FDS_BYTES(nr);
- if (ufdset) {
- int error;
- error = verify_area(VERIFY_WRITE, ufdset, nr);
- if (!error && __copy_from_user(fdset, ufdset, nr))
- error = -EFAULT;
- return error;
- }
- memset(fdset, 0, nr);
- return 0;
-}
-
static inline
void set_fd_set(unsigned long nr, void *ufdset, unsigned long *fdset)
{
@@ -77,12 +76,6 @@ void set_fd_set(unsigned long nr, void *ufdset, unsigned long *fdset)
__copy_to_user(ufdset, fdset, FDS_BYTES(nr));
}
-static inline
-void zero_fd_set(unsigned long nr, unsigned long *fdset)
-{
- memset(fdset, 0, FDS_BYTES(nr));
-}
-
extern int do_select(int n, fd_set_bits *fds, long *timeout);
#endif /* KERNEL */
diff --git a/include/linux/raid/raid5.h b/include/linux/raid/raid5.h
index 5c25120581a7..67f7bf471798 100644
--- a/include/linux/raid/raid5.h
+++ b/include/linux/raid/raid5.h
@@ -3,6 +3,7 @@
#include <linux/raid/md.h>
#include <linux/raid/xor.h>
+#include <linux/bio.h>
/*
*
diff --git a/include/linux/reiserfs_fs.h b/include/linux/reiserfs_fs.h
index 4a3d16d7b8dc..29f6063b3546 100644
--- a/include/linux/reiserfs_fs.h
+++ b/include/linux/reiserfs_fs.h
@@ -1651,7 +1651,7 @@ extern wait_queue_head_t reiserfs_commit_thread_wait ;
#define JOURNAL_BUFFER(j,n) ((j)->j_ap_blocks[((j)->j_start + (n)) % JOURNAL_BLOCK_COUNT])
// We need these to make journal.c code more readable
-#define journal_get_hash_table(s, block) __get_hash_table(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
+#define journal_find_get_block(s, block) __find_get_block(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
#define journal_getblk(s, block) __getblk(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
#define journal_bread(s, block) __bread(SB_JOURNAL(s)->j_dev_bd, block, s->s_blocksize)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 3b43d3bb1123..9e7d80851c32 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -7,7 +7,6 @@ extern unsigned long event;
#include <linux/config.h>
#include <linux/capability.h>
-#include <linux/tqueue.h>
#include <linux/threads.h>
#include <linux/kernel.h>
#include <linux/types.h>
@@ -160,7 +159,6 @@ extern unsigned long cache_decay_ticks;
extern signed long FASTCALL(schedule_timeout(signed long timeout));
asmlinkage void schedule(void);
-extern int schedule_task(struct tq_struct *task);
extern void flush_scheduled_tasks(void);
extern int start_context_thread(void);
extern int current_is_keventd(void);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index d0160265e3c5..0b448a811a39 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -5,6 +5,7 @@
#include <linux/kdev_t.h>
#include <linux/linkage.h>
#include <linux/mmzone.h>
+#include <linux/list.h>
#include <asm/page.h>
#define SWAP_FLAG_PREFER 0x8000 /* set if swap priority specified */
@@ -62,6 +63,21 @@ typedef struct {
#ifdef __KERNEL__
/*
+ * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of
+ * disk blocks. A list of swap extents maps the entire swapfile. (Where the
+ * term `swapfile' refers to either a blockdevice or an IS_REG file. Apart
+ * from setup, they're handled identically.)
+ *
+ * We always assume that blocks are of size PAGE_SIZE.
+ */
+struct swap_extent {
+ struct list_head list;
+ pgoff_t start_page;
+ pgoff_t nr_pages;
+ sector_t start_block;
+};
+
+/*
* Max bad pages in the new format..
*/
#define __swapoffset(x) ((unsigned long)&((union swap_header *)0)->x)
@@ -83,11 +99,17 @@ enum {
/*
* The in-memory structure used to track swap areas.
+ * extent_list.prev points at the lowest-index extent. That list is
+ * sorted.
*/
struct swap_info_struct {
unsigned int flags;
spinlock_t sdev_lock;
struct file *swap_file;
+ struct block_device *bdev;
+ struct list_head extent_list;
+ int nr_extents;
+ struct swap_extent *curr_swap_extent;
unsigned old_block_size;
unsigned short * swap_map;
unsigned int lowest_bit;
@@ -134,8 +156,9 @@ extern wait_queue_head_t kswapd_wait;
extern int FASTCALL(try_to_free_pages(zone_t *, unsigned int, unsigned int));
/* linux/mm/page_io.c */
-extern void rw_swap_page(int, struct page *);
-extern void rw_swap_page_nolock(int, swp_entry_t, char *);
+int swap_readpage(struct file *file, struct page *page);
+int swap_writepage(struct page *page);
+int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page);
/* linux/mm/page_alloc.c */
@@ -163,12 +186,13 @@ extern unsigned int nr_swapfiles;
extern struct swap_info_struct swap_info[];
extern void si_swapinfo(struct sysinfo *);
extern swp_entry_t get_swap_page(void);
-extern void get_swaphandle_info(swp_entry_t, unsigned long *, struct inode **);
extern int swap_duplicate(swp_entry_t);
-extern int swap_count(struct page *);
extern int valid_swaphandles(swp_entry_t, unsigned long *);
extern void swap_free(swp_entry_t);
extern void free_swap_and_cache(swp_entry_t);
+sector_t map_swap_page(struct swap_info_struct *p, pgoff_t offset);
+struct swap_info_struct *get_swap_info_struct(unsigned type);
+
struct swap_list_t {
int head; /* head of priority-ordered swapfile list */
int next; /* swapfile to be used next */
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index a5a6684f9a50..488bc05dbcc1 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -130,16 +130,21 @@ enum
/* CTL_VM names: */
enum
{
- VM_SWAPCTL=1, /* struct: Set vm swapping control */
- VM_SWAPOUT=2, /* int: Linear or sqrt() swapout for hogs */
- VM_FREEPG=3, /* struct: Set free page thresholds */
+ VM_UNUSED1=1, /* was: struct: Set vm swapping control */
+ VM_UNUSED2=2, /* was: int: Linear or sqrt() swapout for hogs */
+ VM_UNUSED3=3, /* was: struct: Set free page thresholds */
VM_BDFLUSH_UNUSED=4, /* Spare */
VM_OVERCOMMIT_MEMORY=5, /* Turn off the virtual memory safety limit */
- VM_BUFFERMEM=6, /* struct: Set buffer memory thresholds */
- VM_PAGECACHE=7, /* struct: Set cache memory thresholds */
+ VM_UNUSED4=6, /* was: struct: Set buffer memory thresholds */
+ VM_UNUSED5=7, /* was: struct: Set cache memory thresholds */
VM_PAGERDAEMON=8, /* struct: Control kswapd behaviour */
- VM_PGT_CACHE=9, /* struct: Set page table cache parameters */
- VM_PAGE_CLUSTER=10 /* int: set number of pages to swap together */
+ VM_UNUSED6=9, /* was: struct: Set page table cache parameters */
+ VM_PAGE_CLUSTER=10, /* int: set number of pages to swap together */
+ VM_DIRTY_BACKGROUND=11, /* dirty_background_ratio */
+ VM_DIRTY_ASYNC=12, /* dirty_async_ratio */
+ VM_DIRTY_SYNC=13, /* dirty_sync_ratio */
+ VM_DIRTY_WB_CS=14, /* dirty_writeback_centisecs */
+ VM_DIRTY_EXPIRE_CS=15, /* dirty_expire_centisecs */
};
diff --git a/include/linux/timer.h b/include/linux/timer.h
index d6f0ce5f8740..6e1e61a4c07b 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -25,10 +25,8 @@ extern int del_timer(struct timer_list * timer);
#ifdef CONFIG_SMP
extern int del_timer_sync(struct timer_list * timer);
-extern void sync_timers(void);
#else
#define del_timer_sync(t) del_timer(t)
-#define sync_timers() do { } while (0)
#endif
/*
diff --git a/include/linux/tqueue.h b/include/linux/tqueue.h
index 3d3047027229..d4729c518f22 100644
--- a/include/linux/tqueue.h
+++ b/include/linux/tqueue.h
@@ -110,6 +110,9 @@ static inline int queue_task(struct tq_struct *bh_pointer, task_queue *bh_list)
return ret;
}
+/* Schedule a tq to run in process context */
+extern int schedule_task(struct tq_struct *task);
+
/*
* Call all "bottom halfs" on a given list.
*/
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 4051c031a976..9cc67b500368 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -13,6 +13,7 @@ struct vm_struct {
unsigned long flags;
void * addr;
unsigned long size;
+ unsigned long phys_addr;
struct vm_struct * next;
};
@@ -23,6 +24,8 @@ extern long vread(char *buf, char *addr, unsigned long count);
extern void vmfree_area_pages(unsigned long address, unsigned long size);
extern int vmalloc_area_pages(unsigned long address, unsigned long size,
int gfp_mask, pgprot_t prot);
+extern struct vm_struct *remove_kernel_area(void *addr);
+
/*
* Various ways to allocate pages.
*/
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index cf706c783eda..a06b0f116ebd 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -45,6 +45,12 @@ static inline void wait_on_inode(struct inode *inode)
/*
* mm/page-writeback.c
*/
+extern int dirty_background_ratio;
+extern int dirty_async_ratio;
+extern int dirty_sync_ratio;
+extern int dirty_writeback_centisecs;
+extern int dirty_expire_centisecs;
+
void balance_dirty_pages(struct address_space *mapping);
void balance_dirty_pages_ratelimited(struct address_space *mapping);
int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0);
diff --git a/kernel/context.c b/kernel/context.c
index 56bada438f61..c49f914430e0 100644
--- a/kernel/context.c
+++ b/kernel/context.c
@@ -20,6 +20,7 @@
#include <linux/unistd.h>
#include <linux/signal.h>
#include <linux/completion.h>
+#include <linux/tqueue.h>
static DECLARE_TASK_QUEUE(tq_context);
static DECLARE_WAIT_QUEUE_HEAD(context_task_wq);
diff --git a/kernel/kmod.c b/kernel/kmod.c
index a9f0ddb521cc..05388d9557fa 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -28,6 +28,7 @@
#include <linux/namespace.h>
#include <linux/completion.h>
#include <linux/file.h>
+#include <linux/tqueue.h>
#include <asm/uaccess.h>
diff --git a/kernel/ksyms.c b/kernel/ksyms.c
index 9391bb0e933d..8b2511787ccb 100644
--- a/kernel/ksyms.c
+++ b/kernel/ksyms.c
@@ -120,7 +120,7 @@ EXPORT_SYMBOL(vmtruncate);
EXPORT_SYMBOL(find_vma);
EXPORT_SYMBOL(get_unmapped_area);
EXPORT_SYMBOL(init_mm);
-EXPORT_SYMBOL(create_bounce);
+EXPORT_SYMBOL(blk_queue_bounce);
#ifdef CONFIG_HIGHMEM
EXPORT_SYMBOL(kmap_high);
EXPORT_SYMBOL(kunmap_high);
@@ -551,7 +551,7 @@ EXPORT_SYMBOL(file_fsync);
EXPORT_SYMBOL(fsync_buffers_list);
EXPORT_SYMBOL(clear_inode);
EXPORT_SYMBOL(init_special_inode);
-EXPORT_SYMBOL(__get_hash_table);
+EXPORT_SYMBOL(__find_get_block);
EXPORT_SYMBOL(new_inode);
EXPORT_SYMBOL(__insert_inode_hash);
EXPORT_SYMBOL(remove_inode_hash);
@@ -559,7 +559,6 @@ EXPORT_SYMBOL(buffer_insert_list);
EXPORT_SYMBOL(make_bad_inode);
EXPORT_SYMBOL(is_bad_inode);
EXPORT_SYMBOL(event);
-EXPORT_SYMBOL(brw_page);
#ifdef CONFIG_UID16
EXPORT_SYMBOL(overflowuid);
diff --git a/kernel/suspend.c b/kernel/suspend.c
index 2fcf5db57868..12e5b0f01f57 100644
--- a/kernel/suspend.c
+++ b/kernel/suspend.c
@@ -320,14 +320,15 @@ static void mark_swapfiles(swp_entry_t prev, int mode)
{
swp_entry_t entry;
union diskpage *cur;
-
- cur = (union diskpage *)get_free_page(GFP_ATOMIC);
- if (!cur)
+ struct page *page;
+
+ page = alloc_page(GFP_ATOMIC);
+ if (!page)
panic("Out of memory in mark_swapfiles");
+ cur = page_address(page);
/* XXX: this is dirty hack to get first page of swap file */
entry = swp_entry(root_swap, 0);
- lock_page(virt_to_page((unsigned long)cur));
- rw_swap_page_nolock(READ, entry, (char *) cur);
+ rw_swap_page_sync(READ, entry, page);
if (mode == MARK_SWAP_RESUME) {
if (!memcmp("SUSP1R",cur->swh.magic.magic,6))
@@ -345,10 +346,8 @@ static void mark_swapfiles(swp_entry_t prev, int mode)
cur->link.next = prev; /* prev is the first/last swap page of the resume area */
/* link.next lies *no more* in last 4 bytes of magic */
}
- lock_page(virt_to_page((unsigned long)cur));
- rw_swap_page_nolock(WRITE, entry, (char *)cur);
-
- free_page((unsigned long)cur);
+ rw_swap_page_sync(WRITE, entry, page);
+ __free_page(page);
}
static void read_swapfiles(void) /* This is called before saving image */
@@ -409,6 +408,7 @@ static int write_suspend_image(void)
int nr_pgdir_pages = SUSPEND_PD_PAGES(nr_copy_pages);
union diskpage *cur, *buffer = (union diskpage *)get_free_page(GFP_ATOMIC);
unsigned long address;
+ struct page *page;
PRINTS( "Writing data to swap (%d pages): ", nr_copy_pages );
for (i=0; i<nr_copy_pages; i++) {
@@ -421,13 +421,8 @@ static int write_suspend_image(void)
panic("\nPage %d: not enough swapspace on suspend device", i );
address = (pagedir_nosave+i)->address;
- lock_page(virt_to_page(address));
- {
- long dummy1;
- struct inode *suspend_file;
- get_swaphandle_info(entry, &dummy1, &suspend_file);
- }
- rw_swap_page_nolock(WRITE, entry, (char *) address);
+ page = virt_to_page(address);
+ rw_swap_page_sync(WRITE, entry, page);
(pagedir_nosave+i)->swap_address = entry;
}
PRINTK(" done\n");
@@ -452,8 +447,8 @@ static int write_suspend_image(void)
if (PAGE_SIZE % sizeof(struct pbe))
panic("I need PAGE_SIZE to be integer multiple of struct pbe, otherwise next assignment could damage pagedir");
cur->link.next = prev;
- lock_page(virt_to_page((unsigned long)cur));
- rw_swap_page_nolock(WRITE, entry, (char *) cur);
+ page = virt_to_page((unsigned long)cur);
+ rw_swap_page_sync(WRITE, entry, page);
prev = entry;
}
PRINTK(", header");
@@ -473,8 +468,8 @@ static int write_suspend_image(void)
cur->link.next = prev;
- lock_page(virt_to_page((unsigned long)cur));
- rw_swap_page_nolock(WRITE, entry, (char *) cur);
+ page = virt_to_page((unsigned long)cur);
+ rw_swap_page_sync(WRITE, entry, page);
prev = entry;
PRINTK( ", signature" );
diff --git a/kernel/sys.c b/kernel/sys.c
index 3bd38f344817..2ba72b6c87d4 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -16,6 +16,7 @@
#include <linux/init.h>
#include <linux/highuid.h>
#include <linux/fs.h>
+#include <linux/tqueue.h>
#include <linux/device.h>
#include <asm/uaccess.h>
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 7eb271716af9..f0c6215b1718 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -31,6 +31,7 @@
#include <linux/init.h>
#include <linux/sysrq.h>
#include <linux/highuid.h>
+#include <linux/writeback.h>
#include <asm/uaccess.h>
@@ -264,6 +265,19 @@ static ctl_table vm_table[] = {
&pager_daemon, sizeof(pager_daemon_t), 0644, NULL, &proc_dointvec},
{VM_PAGE_CLUSTER, "page-cluster",
&page_cluster, sizeof(int), 0644, NULL, &proc_dointvec},
+ {VM_DIRTY_BACKGROUND, "dirty_background_ratio",
+ &dirty_background_ratio, sizeof(dirty_background_ratio),
+ 0644, NULL, &proc_dointvec},
+ {VM_DIRTY_ASYNC, "dirty_async_ratio", &dirty_async_ratio,
+ sizeof(dirty_async_ratio), 0644, NULL, &proc_dointvec},
+ {VM_DIRTY_SYNC, "dirty_sync_ratio", &dirty_sync_ratio,
+ sizeof(dirty_sync_ratio), 0644, NULL, &proc_dointvec},
+ {VM_DIRTY_WB_CS, "dirty_writeback_centisecs",
+ &dirty_writeback_centisecs, sizeof(dirty_writeback_centisecs), 0644,
+ NULL, &proc_dointvec},
+ {VM_DIRTY_EXPIRE_CS, "dirty_expire_centisecs",
+ &dirty_expire_centisecs, sizeof(dirty_expire_centisecs), 0644,
+ NULL, &proc_dointvec},
{0}
};
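A brief usage note: registering these vm_table entries makes the new knobs visible as integer files under /proc/sys/vm — dirty_background_ratio, dirty_async_ratio, dirty_sync_ratio, dirty_writeback_centisecs and dirty_expire_centisecs — each read and written through proc_dointvec, and reachable via sysctl(2) using the VM_DIRTY_* ids added to include/linux/sysctl.h above.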
diff --git a/kernel/timer.c b/kernel/timer.c
index 0b7efa84970b..858954c871e1 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -22,6 +22,7 @@
#include <linux/delay.h>
#include <linux/smp_lock.h>
#include <linux/interrupt.h>
+#include <linux/tqueue.h>
#include <linux/kernel_stat.h>
#include <asm/uaccess.h>
@@ -69,11 +70,11 @@ unsigned long event;
extern int do_setitimer(int, struct itimerval *, struct itimerval *);
/*
- * The 64-bit value is not volatile - you MUST NOT read it
+ * The 64-bit jiffies value is not atomic - you MUST NOT read it
* without holding read_lock_irq(&xtime_lock).
* jiffies is defined in the linker script...
*/
-u64 jiffies_64;
+
unsigned int * prof_buffer;
unsigned long prof_len;
@@ -231,11 +232,6 @@ int del_timer(struct timer_list * timer)
}
#ifdef CONFIG_SMP
-void sync_timers(void)
-{
- spin_unlock_wait(&global_bh_lock);
-}
-
/*
* SMP specific function to delete periodic timer.
* Caller must disable by some means restarting the timer
diff --git a/lib/radix-tree.c b/lib/radix-tree.c
index 689a5448ea31..e17cd888fc3d 100644
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -29,7 +29,7 @@
/*
* Radix tree node definition.
*/
-#define RADIX_TREE_MAP_SHIFT 7
+#define RADIX_TREE_MAP_SHIFT 6
#define RADIX_TREE_MAP_SIZE (1UL << RADIX_TREE_MAP_SHIFT)
#define RADIX_TREE_MAP_MASK (RADIX_TREE_MAP_SIZE-1)
diff --git a/mm/filemap.c b/mm/filemap.c
index 0b6edcc0d0eb..a31fbce9e196 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -445,8 +445,10 @@ int fail_writepage(struct page *page)
{
/* Only activate on memory-pressure, not fsync.. */
if (current->flags & PF_MEMALLOC) {
- activate_page(page);
- SetPageReferenced(page);
+ if (!PageActive(page))
+ activate_page(page);
+ if (!PageReferenced(page))
+ SetPageReferenced(page);
}
/* Set the page dirty again, unlock */
@@ -868,55 +870,35 @@ struct page *grab_cache_page(struct address_space *mapping, unsigned long index)
* This is intended for speculative data generators, where the data can
* be regenerated if the page couldn't be grabbed. This routine should
* be safe to call while holding the lock for another page.
+ *
+ * Clear __GFP_FS when allocating the page to avoid recursion into the fs
+ * and deadlock against the caller's locked page.
*/
-struct page *grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
+struct page *
+grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
{
- struct page *page;
-
- page = find_get_page(mapping, index);
-
- if ( page ) {
- if ( !TestSetPageLocked(page) ) {
- /* Page found and locked */
- /* This test is overly paranoid, but what the heck... */
- if ( unlikely(page->mapping != mapping || page->index != index) ) {
- /* Someone reallocated this page under us. */
- unlock_page(page);
- page_cache_release(page);
- return NULL;
- } else {
- return page;
- }
- } else {
- /* Page locked by someone else */
- page_cache_release(page);
- return NULL;
- }
- }
-
- page = page_cache_alloc(mapping);
- if (unlikely(!page))
- return NULL; /* Failed to allocate a page */
+ struct page *page = find_get_page(mapping, index);
- if (unlikely(add_to_page_cache_unique(page, mapping, index))) {
- /*
- * Someone else grabbed the page already, or
- * failed to allocate a radix-tree node
- */
+ if (page) {
+ if (!TestSetPageLocked(page))
+ return page;
page_cache_release(page);
return NULL;
}
-
+ page = alloc_pages(mapping->gfp_mask & ~__GFP_FS, 0);
+ if (page && add_to_page_cache_unique(page, mapping, index)) {
+ page_cache_release(page);
+ page = NULL;
+ }
return page;
}
/*
* Mark a page as having seen activity.
*
- * If it was already so marked, move it
- * to the active queue and drop the referenced
- * bit. Otherwise, just mark it for future
- * action..
+ * inactive,unreferenced -> inactive,referenced
+ * inactive,referenced -> active,unreferenced
+ * active,unreferenced -> active,referenced
*/
void mark_page_accessed(struct page *page)
{
@@ -924,10 +906,9 @@ void mark_page_accessed(struct page *page)
activate_page(page);
ClearPageReferenced(page);
return;
+ } else if (!PageReferenced(page)) {
+ SetPageReferenced(page);
}
-
- /* Mark the page referenced, AFTER checking for previous usage.. */
- SetPageReferenced(page);
}
/*
@@ -2286,7 +2267,8 @@ generic_file_write(struct file *file, const char *buf,
}
}
kunmap(page);
- SetPageReferenced(page);
+ if (!PageReferenced(page))
+ SetPageReferenced(page);
unlock_page(page);
page_cache_release(page);
if (status < 0)
diff --git a/mm/highmem.c b/mm/highmem.c
index de5ebeb0a167..ae9c5a26376b 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -17,6 +17,7 @@
*/
#include <linux/mm.h>
+#include <linux/bio.h>
#include <linux/pagemap.h>
#include <linux/mempool.h>
#include <linux/blkdev.h>
@@ -347,13 +348,15 @@ static void bounce_end_io_read_isa(struct bio *bio)
return __bounce_end_io_read(bio, isa_page_pool);
}
-void create_bounce(unsigned long pfn, int gfp, struct bio **bio_orig)
+void blk_queue_bounce(request_queue_t *q, struct bio **bio_orig)
{
struct page *page;
struct bio *bio = NULL;
int i, rw = bio_data_dir(*bio_orig), bio_gfp;
struct bio_vec *to, *from;
mempool_t *pool;
+ unsigned long pfn = q->bounce_pfn;
+ int gfp = q->bounce_gfp;
BUG_ON((*bio_orig)->bi_idx);
diff --git a/mm/msync.c b/mm/msync.c
index 2a2b31de8957..5ea980e6b1dc 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -169,7 +169,7 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
{
unsigned long end;
struct vm_area_struct * vma;
- int unmapped_error, error = -EINVAL;
+ int unmapped_error, error = -ENOMEM;
down_read(&current->mm->mmap_sem);
if (start & ~PAGE_MASK)
@@ -185,18 +185,18 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
goto out;
/*
* If the interval [start,end) covers some unmapped address ranges,
- * just ignore them, but return -EFAULT at the end.
+ * just ignore them, but return -ENOMEM at the end.
*/
vma = find_vma(current->mm, start);
unmapped_error = 0;
for (;;) {
/* Still start < end. */
- error = -EFAULT;
+ error = -ENOMEM;
if (!vma)
goto out;
/* Here start < vma->vm_end. */
if (start < vma->vm_start) {
- unmapped_error = -EFAULT;
+ unmapped_error = -ENOMEM;
start = vma->vm_start;
}
/* Here vma->vm_start <= start < vma->vm_end. */
@@ -220,5 +220,3 @@ out:
up_read(&current->mm->mmap_sem);
return error;
}
-
-
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 082e8fb8cb16..6d4555c3fb91 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -26,29 +26,56 @@
* The maximum number of pages to writeout in a single bdflush/kupdate
* operation. We do this so we don't hold I_LOCK against an inode for
* enormous amounts of time, which would block a userspace task which has
- * been forced to throttle against that inode.
+ * been forced to throttle against that inode. Also, the code reevaluates
+ * the dirty state each time it has written this many pages.
*/
#define MAX_WRITEBACK_PAGES 1024
/*
- * Memory thresholds, in percentages
- * FIXME: expose these via /proc or whatever.
+ * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
+ * will look to see if it needs to force writeback or throttling. Probably
+ * should be scaled by memory size.
+ */
+#define RATELIMIT_PAGES 1000
+
+/*
+ * When balance_dirty_pages decides that the caller needs to perform some
+ * non-background writeback, this is how many pages it will attempt to write.
+ * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably
+ * large amounts of I/O are submitted.
+ */
+#define SYNC_WRITEBACK_PAGES 1500
+
+
+/*
+ * Dirty memory thresholds, in percentages
*/
/*
* Start background writeback (via pdflush) at this level
*/
-static int dirty_background_ratio = 40;
+int dirty_background_ratio = 40;
/*
* The generator of dirty data starts async writeback at this level
*/
-static int dirty_async_ratio = 50;
+int dirty_async_ratio = 50;
/*
* The generator of dirty data performs sync writeout at this level
*/
-static int dirty_sync_ratio = 60;
+int dirty_sync_ratio = 60;
+
+/*
+ * The interval between `kupdate'-style writebacks.
+ */
+int dirty_writeback_centisecs = 5 * 100;
+
+/*
+ * The largest amount of time for which data is allowed to remain dirty
+ */
+int dirty_expire_centisecs = 30 * 100;
+
static void background_writeout(unsigned long _min_pages);
@@ -84,12 +111,12 @@ void balance_dirty_pages(struct address_space *mapping)
sync_thresh = (dirty_sync_ratio * tot) / 100;
if (dirty_and_writeback > sync_thresh) {
- int nr_to_write = 1500;
+ int nr_to_write = SYNC_WRITEBACK_PAGES;
writeback_unlocked_inodes(&nr_to_write, WB_SYNC_LAST, NULL);
get_page_state(&ps);
} else if (dirty_and_writeback > async_thresh) {
- int nr_to_write = 1500;
+ int nr_to_write = SYNC_WRITEBACK_PAGES;
writeback_unlocked_inodes(&nr_to_write, WB_SYNC_NONE, NULL);
get_page_state(&ps);
@@ -118,7 +145,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
int cpu;
cpu = get_cpu();
- if (ratelimits[cpu].count++ >= 1000) {
+ if (ratelimits[cpu].count++ >= RATELIMIT_PAGES) {
ratelimits[cpu].count = 0;
put_cpu();
balance_dirty_pages(mapping);
@@ -162,17 +189,6 @@ void wakeup_bdflush(void)
pdflush_operation(background_writeout, ps.nr_dirty);
}
-/*
- * The interval between `kupdate'-style writebacks.
- *
- * Traditional kupdate writes back data which is 30-35 seconds old.
- * This one does that, but it also writes back just 1/6th of the dirty
- * data. This is to avoid great I/O storms.
- *
- * We chunk the writes up and yield, to permit any throttled page-allocators
- * to perform their I/O against a large file.
- */
-static int wb_writeback_jifs = 5 * HZ;
static struct timer_list wb_timer;
/*
@@ -183,9 +199,9 @@ static struct timer_list wb_timer;
* just walks the superblock inode list, writing back any inodes which are
* older than a specific point in time.
*
- * Try to run once per wb_writeback_jifs jiffies. But if a writeback event
- * takes longer than a wb_writeback_jifs interval, then leave a one-second
- * gap.
+ * Try to run once per dirty_writeback_centisecs. But if a writeback event
+ * takes longer than a dirty_writeback_centisecs interval, then leave a
+ * one-second gap.
*
* older_than_this takes precedence over nr_to_write. So we'll only write back
* all dirty pages if they are all attached to "old" mappings.
@@ -201,9 +217,9 @@ static void wb_kupdate(unsigned long arg)
sync_supers();
get_page_state(&ps);
- oldest_jif = jiffies - 30*HZ;
+ oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100;
start_jif = jiffies;
- next_jif = start_jif + wb_writeback_jifs;
+ next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100;
nr_to_write = ps.nr_dirty;
writeback_unlocked_inodes(&nr_to_write, WB_SYNC_NONE, &oldest_jif);
blk_run_queues();
@@ -223,7 +239,7 @@ static void wb_timer_fn(unsigned long unused)
static int __init wb_timer_init(void)
{
init_timer(&wb_timer);
- wb_timer.expires = jiffies + wb_writeback_jifs;
+ wb_timer.expires = jiffies + (dirty_writeback_centisecs * HZ) / 100;
wb_timer.data = 0;
wb_timer.function = wb_timer_fn;
add_timer(&wb_timer);
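A quick worked example of the centisecs-to-jiffies conversion used above: with the defaults set in this file, dirty_writeback_centisecs = 500 gives (500 * HZ) / 100 = 5*HZ jiffies, so wb_kupdate reschedules itself every five seconds, and dirty_expire_centisecs = 3000 gives 30*HZ, so only data dirtied more than thirty seconds ago is written back by the kupdate pass — matching the old hard-coded 5*HZ interval and 30*HZ age that the new sysctls replace.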
diff --git a/mm/page_io.c b/mm/page_io.c
index 942ea274dccd..3692ead4d94c 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -14,112 +14,163 @@
#include <linux/kernel_stat.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
-#include <linux/swapctl.h>
-#include <linux/buffer_head.h> /* for brw_page() */
-
+#include <linux/bio.h>
+#include <linux/buffer_head.h>
#include <asm/pgtable.h>
+#include <linux/swapops.h>
-/*
- * Reads or writes a swap page.
- * wait=1: start I/O and wait for completion. wait=0: start asynchronous I/O.
- *
- * Important prevention of race condition: the caller *must* atomically
- * create a unique swap cache entry for this swap page before calling
- * rw_swap_page, and must lock that page. By ensuring that there is a
- * single page of memory reserved for the swap entry, the normal VM page
- * lock on that page also doubles as a lock on swap entries. Having only
- * one lock to deal with per swap entry (rather than locking swap and memory
- * independently) also makes it easier to make certain swapping operations
- * atomic, which is particularly important when we are trying to ensure
- * that shared pages stay shared while being swapped.
- */
+static int
+swap_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ struct swap_info_struct *sis;
+ swp_entry_t entry;
-static int rw_swap_page_base(int rw, swp_entry_t entry, struct page *page)
+ entry.val = iblock;
+ sis = get_swap_info_struct(swp_type(entry));
+ bh_result->b_bdev = sis->bdev;
+ bh_result->b_blocknr = map_swap_page(sis, swp_offset(entry));
+ bh_result->b_size = PAGE_SIZE;
+ set_buffer_mapped(bh_result);
+ return 0;
+}
+
+static struct bio *
+get_swap_bio(int gfp_flags, struct page *page, bio_end_io_t end_io)
{
- unsigned long offset;
- sector_t zones[PAGE_SIZE/512];
- int zones_used;
- int block_size;
- struct inode *swapf = 0;
- struct block_device *bdev;
+ struct bio *bio;
+ struct buffer_head bh;
- if (rw == READ) {
+ bio = bio_alloc(gfp_flags, 1);
+ if (bio) {
+ swap_get_block(NULL, page->index, &bh, 1);
+ bio->bi_sector = bh.b_blocknr * (PAGE_SIZE >> 9);
+ bio->bi_bdev = bh.b_bdev;
+ bio->bi_io_vec[0].bv_page = page;
+ bio->bi_io_vec[0].bv_len = PAGE_SIZE;
+ bio->bi_io_vec[0].bv_offset = 0;
+ bio->bi_vcnt = 1;
+ bio->bi_idx = 0;
+ bio->bi_size = PAGE_SIZE;
+ bio->bi_end_io = end_io;
+ }
+ return bio;
+}
+
+static void end_swap_bio_write(struct bio *bio)
+{
+ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct page *page = bio->bi_io_vec[0].bv_page;
+
+ if (!uptodate)
+ SetPageError(page);
+ end_page_writeback(page);
+ bio_put(bio);
+}
+
+static void end_swap_bio_read(struct bio *bio)
+{
+ const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
+ struct page *page = bio->bi_io_vec[0].bv_page;
+
+ if (!uptodate) {
+ SetPageError(page);
ClearPageUptodate(page);
- kstat.pswpin++;
- } else
- kstat.pswpout++;
-
- get_swaphandle_info(entry, &offset, &swapf);
- bdev = swapf->i_bdev;
- if (bdev) {
- zones[0] = offset;
- zones_used = 1;
- block_size = PAGE_SIZE;
} else {
- int i, j;
- unsigned int block = offset
- << (PAGE_SHIFT - swapf->i_sb->s_blocksize_bits);
-
- block_size = swapf->i_sb->s_blocksize;
- for (i=0, j=0; j< PAGE_SIZE ; i++, j += block_size)
- if (!(zones[i] = bmap(swapf,block++))) {
- printk("rw_swap_page: bad swap file\n");
- return 0;
- }
- zones_used = i;
- bdev = swapf->i_sb->s_bdev;
+ SetPageUptodate(page);
}
+ unlock_page(page);
+ bio_put(bio);
+}
- /* block_size == PAGE_SIZE/zones_used */
- brw_page(rw, page, bdev, zones, block_size);
+/*
+ * We may have stale swap cache pages in memory: notice
+ * them here and get rid of the unnecessary final write.
+ */
+int swap_writepage(struct page *page)
+{
+ struct bio *bio;
+ int ret = 0;
- /* Note! For consistency we do all of the logic,
- * decrementing the page count, and unlocking the page in the
- * swap lock map - in the IO completion handler.
- */
- return 1;
+ if (remove_exclusive_swap_page(page)) {
+ unlock_page(page);
+ goto out;
+ }
+ bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write);
+ if (bio == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ kstat.pswpout++;
+ SetPageWriteback(page);
+ unlock_page(page);
+ submit_bio(WRITE, bio);
+out:
+ return ret;
}
+int swap_readpage(struct file *file, struct page *page)
+{
+ struct bio *bio;
+ int ret = 0;
+
+ ClearPageUptodate(page);
+ bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read);
+ if (bio == NULL) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ kstat.pswpin++;
+ submit_bio(READ, bio);
+out:
+ return ret;
+}
/*
- * A simple wrapper so the base function doesn't need to enforce
- * that all swap pages go through the swap cache! We verify that:
- * - the page is locked
- * - it's marked as being swap-cache
- * - it's associated with the swap inode
+ * swapper_space doesn't have a real inode, so it gets a special vm_writeback()
+ * so we don't need swap special cases in generic_vm_writeback().
+ *
+ * Swap pages are PageLocked and PageWriteback while under writeout so that
+ * memory allocators will throttle against them.
*/
-void rw_swap_page(int rw, struct page *page)
+static int swap_vm_writeback(struct page *page, int *nr_to_write)
{
- swp_entry_t entry;
+ struct address_space *mapping = page->mapping;
- entry.val = page->index;
-
- if (!PageLocked(page))
- PAGE_BUG(page);
- if (!PageSwapCache(page))
- PAGE_BUG(page);
- if (!rw_swap_page_base(rw, entry, page))
- unlock_page(page);
+ unlock_page(page);
+ return generic_writepages(mapping, nr_to_write);
}
+struct address_space_operations swap_aops = {
+ vm_writeback: swap_vm_writeback,
+ writepage: swap_writepage,
+ readpage: swap_readpage,
+ sync_page: block_sync_page,
+ set_page_dirty: __set_page_dirty_nobuffers,
+};
+
/*
- * The swap lock map insists that pages be in the page cache!
- * Therefore we can't use it. Later when we can remove the need for the
- * lock map and we can reduce the number of functions exported.
+ * A scruffy utility function to read or write an arbitrary swap page
+ * and wait on the I/O.
*/
-void rw_swap_page_nolock(int rw, swp_entry_t entry, char *buf)
+int rw_swap_page_sync(int rw, swp_entry_t entry, struct page *page)
{
- struct page *page = virt_to_page(buf);
-
- if (!PageLocked(page))
- PAGE_BUG(page);
- if (page->mapping)
- PAGE_BUG(page);
- /* needs sync_page to wait I/O completation */
+ int ret;
+
+ lock_page(page);
+
+ BUG_ON(page->mapping);
page->mapping = &swapper_space;
- if (rw_swap_page_base(rw, entry, page))
- lock_page(page);
- if (page_has_buffers(page) && !try_to_free_buffers(page))
- PAGE_BUG(page);
+ page->index = entry.val;
+
+ if (rw == READ) {
+ ret = swap_readpage(NULL, page);
+ wait_on_page_locked(page);
+ } else {
+ ret = swap_writepage(page);
+ wait_on_page_writeback(page);
+ }
page->mapping = NULL;
- unlock_page(page);
+ if (ret == 0 && (!PageUptodate(page) || PageError(page)))
+ ret = -EIO;
+ return ret;
}
diff --git a/mm/shmem.c b/mm/shmem.c
index 9367252b65b0..07bdba83bdf5 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -426,15 +426,22 @@ found:
swap_free(entry);
ptr[offset] = (swp_entry_t) {0};
- while (inode && move_from_swap_cache(page, idx, inode->i_mapping)) {
+ while (inode && (PageWriteback(page) ||
+ move_from_swap_cache(page, idx, inode->i_mapping))) {
/*
* Yield for kswapd, and try again - but we're still
* holding the page lock - ugh! fix this up later on.
* Beware of inode being unlinked or truncated: just
* leave try_to_unuse to delete_from_swap_cache if so.
+ *
+ * AKPM: We now wait on writeback too. Note that it's
+ * the page lock which prevents new writeback from starting.
*/
spin_unlock(&info->lock);
- yield();
+ if (PageWriteback(page))
+ wait_on_page_writeback(page);
+ else
+ yield();
spin_lock(&info->lock);
ptr = shmem_swp_entry(info, idx, 0);
if (IS_ERR(ptr))
@@ -594,9 +601,14 @@ repeat:
}
/* We have to do this with page locked to prevent races */
- if (TestSetPageLocked(page))
+ if (TestSetPageLocked(page))
goto wait_retry;
-
+ if (PageWriteback(page)) {
+ spin_unlock(&info->lock);
+ wait_on_page_writeback(page);
+ unlock_page(page);
+ goto repeat;
+ }
error = move_from_swap_cache(page, idx, mapping);
if (error < 0) {
unlock_page(page);
@@ -651,7 +663,7 @@ no_space:
return ERR_PTR(-ENOSPC);
wait_retry:
- spin_unlock (&info->lock);
+ spin_unlock(&info->lock);
wait_on_page_locked(page);
page_cache_release(page);
goto repeat;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 5fe5a4462bbb..4513649a1208 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -14,54 +14,27 @@
#include <linux/init.h>
#include <linux/pagemap.h>
#include <linux/smp_lock.h>
-#include <linux/buffer_head.h> /* block_sync_page()/try_to_free_buffers() */
+#include <linux/buffer_head.h> /* block_sync_page() */
#include <asm/pgtable.h>
/*
- * We may have stale swap cache pages in memory: notice
- * them here and get rid of the unnecessary final write.
- */
-static int swap_writepage(struct page *page)
-{
- if (remove_exclusive_swap_page(page)) {
- unlock_page(page);
- return 0;
- }
- rw_swap_page(WRITE, page);
- return 0;
-}
-
-/*
- * swapper_space doesn't have a real inode, so it gets a special vm_writeback()
- * so we don't need swap special cases in generic_vm_writeback().
- *
- * Swap pages are PageLocked and PageWriteback while under writeout so that
- * memory allocators will throttle against them.
- */
-static int swap_vm_writeback(struct page *page, int *nr_to_write)
-{
- struct address_space *mapping = page->mapping;
-
- unlock_page(page);
- return generic_writepages(mapping, nr_to_write);
-}
-
-static struct address_space_operations swap_aops = {
- vm_writeback: swap_vm_writeback,
- writepage: swap_writepage,
- sync_page: block_sync_page,
- set_page_dirty: __set_page_dirty_nobuffers,
-};
-
-/*
* swapper_inode doesn't do anything much. It is really only here to
* avoid some special-casing in other parts of the kernel.
+ *
+ * We set i_size to "infinity" to keep the page I/O functions happy. The swap
+ * block allocator makes sure that allocations are in-range. A strange
+ * number is chosen to prevent various arith overflows elsewhere. For example,
+ * `lblock' in block_read_full_page().
*/
static struct inode swapper_inode = {
- i_mapping: &swapper_space,
+ i_mapping: &swapper_space,
+ i_size: PAGE_SIZE * 0xffffffffLL,
+ i_blkbits: PAGE_SHIFT,
};
+extern struct address_space_operations swap_aops;
+
struct address_space swapper_space = {
page_tree: RADIX_TREE_INIT(GFP_ATOMIC),
page_lock: RW_LOCK_UNLOCKED,
@@ -131,10 +104,9 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry)
*/
void __delete_from_swap_cache(struct page *page)
{
- if (!PageLocked(page))
- BUG();
- if (!PageSwapCache(page))
- BUG();
+ BUG_ON(!PageLocked(page));
+ BUG_ON(!PageSwapCache(page));
+ BUG_ON(PageWriteback(page));
ClearPageDirty(page);
__remove_inode_page(page);
INC_CACHE_INFO(del_total);
@@ -150,14 +122,9 @@ void delete_from_swap_cache(struct page *page)
{
swp_entry_t entry;
- /*
- * I/O should have completed and nobody can have a ref against the
- * page's buffers
- */
BUG_ON(!PageLocked(page));
BUG_ON(PageWriteback(page));
- if (page_has_buffers(page) && !try_to_free_buffers(page))
- BUG();
+ BUG_ON(page_has_buffers(page));
entry.val = page->index;
@@ -223,16 +190,9 @@ int move_from_swap_cache(struct page *page, unsigned long index,
void **pslot;
int err;
- /*
- * Drop the buffers now, before taking the page_lock. Because
- * mapping->private_lock nests outside mapping->page_lock.
- * This "must" succeed. The page is locked and all I/O has completed
- * and nobody else has a ref against its buffers.
- */
BUG_ON(!PageLocked(page));
BUG_ON(PageWriteback(page));
- if (page_has_buffers(page) && !try_to_free_buffers(page))
- BUG();
+ BUG_ON(page_has_buffers(page));
write_lock(&swapper_space.page_lock);
write_lock(&mapping->page_lock);
@@ -362,7 +322,7 @@ struct page * read_swap_cache_async(swp_entry_t entry)
/*
* Initiate read into locked page and return.
*/
- rw_swap_page(READ, new_page);
+ swap_readpage(NULL, new_page);
return new_page;
}
} while (err != -ENOENT && err != -ENOMEM);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 70a517bbcc16..175c812a63d6 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -16,7 +16,7 @@
#include <linux/namei.h>
#include <linux/shm.h>
#include <linux/blkdev.h>
-#include <linux/buffer_head.h> /* for try_to_free_buffers() */
+#include <linux/buffer_head.h>
#include <asm/pgtable.h>
#include <linux/swapops.h>
@@ -294,11 +294,14 @@ int remove_exclusive_swap_page(struct page *page)
struct swap_info_struct * p;
swp_entry_t entry;
- if (!PageLocked(page))
- BUG();
+ BUG_ON(page_has_buffers(page));
+ BUG_ON(!PageLocked(page));
+
if (!PageSwapCache(page))
return 0;
- if (page_count(page) - !!PagePrivate(page) != 2) /* 2: us + cache */
+ if (PageWriteback(page))
+ return 0;
+ if (page_count(page) != 2) /* 2: us + cache */
return 0;
entry.val = page->index;
@@ -311,13 +314,8 @@ int remove_exclusive_swap_page(struct page *page)
if (p->swap_map[swp_offset(entry)] == 1) {
/* Recheck the page count with the pagecache lock held.. */
write_lock(&swapper_space.page_lock);
- if (page_count(page) - !!PagePrivate(page) == 2) {
+ if ((page_count(page) == 2) && !PageWriteback(page)) {
__delete_from_swap_cache(page);
- /*
- * NOTE: if/when swap gets buffer/page coherency
- * like other mappings, we'll need to mark the buffers
- * dirty here too. set_page_dirty().
- */
SetPageDirty(page);
retval = 1;
}
@@ -326,9 +324,6 @@ int remove_exclusive_swap_page(struct page *page)
swap_info_put(p);
if (retval) {
- BUG_ON(PageWriteback(page));
- if (page_has_buffers(page) && !try_to_free_buffers(page))
- BUG();
swap_free(entry);
page_cache_release(page);
}
@@ -352,9 +347,13 @@ void free_swap_and_cache(swp_entry_t entry)
swap_info_put(p);
}
if (page) {
+ int one_user;
+
+ BUG_ON(page_has_buffers(page));
page_cache_get(page);
+ one_user = (page_count(page) == 2);
/* Only cache user (+us), or swap space full? Free it! */
- if (page_count(page) - !!PagePrivate(page) == 2 || vm_swap_full()) {
+ if (!PageWriteback(page) && (one_user || vm_swap_full())) {
delete_from_swap_cache(page);
SetPageDirty(page);
}
@@ -606,6 +605,7 @@ static int try_to_unuse(unsigned int type)
wait_on_page_locked(page);
wait_on_page_writeback(page);
lock_page(page);
+ wait_on_page_writeback(page);
/*
* Remove all references to entry, without blocking.
@@ -685,11 +685,13 @@ static int try_to_unuse(unsigned int type)
* Note shmem_unuse already deleted its from swap cache.
*/
if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) {
- rw_swap_page(WRITE, page);
+ swap_writepage(page);
lock_page(page);
}
- if (PageSwapCache(page))
+ if (PageSwapCache(page)) {
+ wait_on_page_writeback(page);
delete_from_swap_cache(page);
+ }
/*
* So we could skip searching mms once swap count went
@@ -717,6 +719,207 @@ static int try_to_unuse(unsigned int type)
return retval;
}
+/*
+ * Use this swapdev's extent info to locate the (PAGE_SIZE) block which
+ * corresponds to page offset `offset'.
+ */
+sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset)
+{
+ struct swap_extent *se = sis->curr_swap_extent;
+ struct swap_extent *start_se = se;
+
+ for ( ; ; ) {
+ struct list_head *lh;
+
+ if (se->start_page <= offset &&
+ offset < (se->start_page + se->nr_pages)) {
+ return se->start_block + (offset - se->start_page);
+ }
+ lh = se->list.prev;
+ if (lh == &sis->extent_list)
+ lh = lh->prev;
+ se = list_entry(lh, struct swap_extent, list);
+ sis->curr_swap_extent = se;
+ BUG_ON(se == start_se); /* It *must* be present */
+ }
+}
+
+/*
+ * Free all of a swapdev's extent information
+ */
+static void destroy_swap_extents(struct swap_info_struct *sis)
+{
+ while (!list_empty(&sis->extent_list)) {
+ struct swap_extent *se;
+
+ se = list_entry(sis->extent_list.next,
+ struct swap_extent, list);
+ list_del(&se->list);
+ kfree(se);
+ }
+ sis->nr_extents = 0;
+}
+
+/*
+ * Add a block range (and the corresponding page range) into this swapdev's
+ * extent list. The extent list is kept sorted in block order.
+ *
+ * This function rather assumes that it is called in ascending sector_t order.
+ * It doesn't look for extent coalescing opportunities.
+ */
+static int
+add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
+ unsigned long nr_pages, sector_t start_block)
+{
+ struct swap_extent *se;
+ struct swap_extent *new_se;
+ struct list_head *lh;
+
+ lh = sis->extent_list.next; /* The highest-addressed block */
+ while (lh != &sis->extent_list) {
+ se = list_entry(lh, struct swap_extent, list);
+ if (se->start_block + se->nr_pages == start_block) {
+ /* Merge it */
+ se->nr_pages += nr_pages;
+ return 0;
+ }
+ lh = lh->next;
+ }
+
+ /*
+ * No merge. Insert a new extent, preserving ordering.
+ */
+ new_se = kmalloc(sizeof(*se), GFP_KERNEL);
+ if (new_se == NULL)
+ return -ENOMEM;
+ new_se->start_page = start_page;
+ new_se->nr_pages = nr_pages;
+ new_se->start_block = start_block;
+
+ lh = sis->extent_list.prev; /* The lowest block */
+ while (lh != &sis->extent_list) {
+ se = list_entry(lh, struct swap_extent, list);
+ if (se->start_block > start_block)
+ break;
+ lh = lh->prev;
+ }
+ list_add_tail(&new_se->list, lh);
+ sis->nr_extents++;
+ return 0;
+}
+
+/*
+ * A `swap extent' is a simple thing which maps a contiguous range of pages
+ * onto a contiguous range of disk blocks. An ordered list of swap extents
+ * is built at swapon time and is then used at swap_writepage/swap_readpage
+ * time for locating where on disk a page belongs.
+ *
+ * If the swapfile is an S_ISBLK block device, a single extent is installed.
+ * This is done so that the main operating code can treat S_ISBLK and S_ISREG
+ * swap files identically.
+ *
+ * Whether the swapdev is an S_ISREG file or an S_ISBLK blockdev, the swap
+ * extent list operates in PAGE_SIZE disk blocks. Both S_ISREG and S_ISBLK
+ * swapfiles are handled *identically* after swapon time.
+ *
+ * For S_ISREG swapfiles, setup_swap_extents() will walk all the file's blocks
+ * and will parse them into an ordered extent list, in PAGE_SIZE chunks. If
+ * some stray blocks are found which do not fall within the PAGE_SIZE alignment
+ * requirements, they are simply tossed out - we will never use those blocks
+ * for swapping.
+ *
+ * The amount of disk space which a single swap extent represents varies.
+ * Typically it is in the 1-4 megabyte range. So we can have hundreds of
+ * extents in the list. To avoid much list walking, we cache the previous
+ * search location in `curr_swap_extent', and start new searches from there.
+ * This is extremely effective. The average number of iterations in
+ * map_swap_page() has been measured at about 0.3 per page. - akpm.
+ */
+static int setup_swap_extents(struct swap_info_struct *sis)
+{
+ struct inode *inode;
+ unsigned blocks_per_page;
+ unsigned long page_no;
+ unsigned blkbits;
+ sector_t probe_block;
+ sector_t last_block;
+ int ret;
+
+ inode = sis->swap_file->f_dentry->d_inode;
+ if (S_ISBLK(inode->i_mode)) {
+ ret = add_swap_extent(sis, 0, sis->max, 0);
+ goto done;
+ }
+
+ blkbits = inode->i_blkbits;
+ blocks_per_page = PAGE_SIZE >> blkbits;
+
+ /*
+ * Map all the blocks into the extent list. This code doesn't try
+ * to be very smart.
+ */
+ probe_block = 0;
+ page_no = 0;
+ last_block = inode->i_size >> blkbits;
+ while ((probe_block + blocks_per_page) <= last_block &&
+ page_no < sis->max) {
+ unsigned block_in_page;
+ sector_t first_block;
+
+ first_block = bmap(inode, probe_block);
+ if (first_block == 0)
+ goto bad_bmap;
+
+ /*
+ * It must be PAGE_SIZE aligned on-disk
+ */
+ if (first_block & (blocks_per_page - 1)) {
+ probe_block++;
+ goto reprobe;
+ }
+
+ for (block_in_page = 1; block_in_page < blocks_per_page;
+ block_in_page++) {
+ sector_t block;
+
+ block = bmap(inode, probe_block + block_in_page);
+ if (block == 0)
+ goto bad_bmap;
+ if (block != first_block + block_in_page) {
+ /* Discontiguity */
+ probe_block++;
+ goto reprobe;
+ }
+ }
+
+ /*
+ * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
+ */
+ ret = add_swap_extent(sis, page_no, 1,
+ first_block >> (PAGE_SHIFT - blkbits));
+ if (ret)
+ goto out;
+ page_no++;
+ probe_block += blocks_per_page;
+reprobe:
+ continue;
+ }
+ ret = 0;
+ if (page_no == 0)
+ ret = -EINVAL;
+ sis->max = page_no;
+ sis->highest_bit = page_no - 1;
+done:
+ sis->curr_swap_extent = list_entry(sis->extent_list.prev,
+ struct swap_extent, list);
+ goto out;
+bad_bmap:
+ printk(KERN_ERR "swapon: swapfile has holes\n");
+ ret = -EINVAL;
+out:
+ return ret;
+}
+
asmlinkage long sys_swapoff(const char * specialfile)
{
struct swap_info_struct * p = NULL;
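To make the extent lookup added above concrete, consider a hypothetical swapdev whose first extent covers pages 0..1023 starting at disk block 8192 (numbers invented for illustration): map_swap_page() for page offset 100 finds that extent, returns 8192 + (100 - 0) = 8292, and leaves curr_swap_extent pointing at it, so the next lookup — usually for a nearby offset — starts from the same extent; this caching is why the comment above reports roughly 0.3 list iterations per page.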
@@ -733,7 +936,6 @@ asmlinkage long sys_swapoff(const char * specialfile)
if (err)
goto out;
- lock_kernel();
prev = -1;
swap_list_lock();
for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
@@ -763,9 +965,7 @@ asmlinkage long sys_swapoff(const char * specialfile)
total_swap_pages -= p->pages;
p->flags &= ~SWP_WRITEOK;
swap_list_unlock();
- unlock_kernel();
err = try_to_unuse(type);
- lock_kernel();
if (err) {
/* re-insert swap space back into swap_list */
swap_list_lock();
@@ -791,6 +991,7 @@ asmlinkage long sys_swapoff(const char * specialfile)
swap_map = p->swap_map;
p->swap_map = NULL;
p->flags = 0;
+ destroy_swap_extents(p);
swap_device_unlock(p);
swap_list_unlock();
vfree(swap_map);
@@ -804,7 +1005,6 @@ asmlinkage long sys_swapoff(const char * specialfile)
err = 0;
out_dput:
- unlock_kernel();
path_release(&nd);
out:
return err;
@@ -858,12 +1058,12 @@ int get_swaparea_info(char *buf)
asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
{
struct swap_info_struct * p;
- char *name;
+ char *name = NULL;
struct block_device *bdev = NULL;
struct file *swap_file = NULL;
struct address_space *mapping;
unsigned int type;
- int i, j, prev;
+ int i, prev;
int error;
static int least_priority = 0;
union swap_header *swap_header = 0;
@@ -872,10 +1072,10 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
unsigned long maxpages = 1;
int swapfilesize;
unsigned short *swap_map;
-
+ struct page *page = NULL;
+
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- lock_kernel();
swap_list_lock();
p = swap_info;
for (type = 0 ; type < nr_swapfiles ; type++,p++)
@@ -888,7 +1088,9 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
}
if (type >= nr_swapfiles)
nr_swapfiles = type+1;
+ INIT_LIST_HEAD(&p->extent_list);
p->flags = SWP_USED;
+ p->nr_extents = 0;
p->swap_file = NULL;
p->old_block_size = 0;
p->swap_map = NULL;
@@ -909,7 +1111,6 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
if (IS_ERR(name))
goto bad_swap_2;
swap_file = filp_open(name, O_RDWR, 0);
- putname(name);
error = PTR_ERR(swap_file);
if (IS_ERR(swap_file)) {
swap_file = NULL;
@@ -931,8 +1132,12 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
PAGE_SIZE);
if (error < 0)
goto bad_swap;
- } else if (!S_ISREG(swap_file->f_dentry->d_inode->i_mode))
+ p->bdev = bdev;
+ } else if (S_ISREG(swap_file->f_dentry->d_inode->i_mode)) {
+ p->bdev = swap_file->f_dentry->d_inode->i_sb->s_bdev;
+ } else {
goto bad_swap;
+ }
mapping = swap_file->f_dentry->d_inode->i_mapping;
swapfilesize = mapping->host->i_size >> PAGE_SHIFT;
@@ -946,15 +1151,20 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
goto bad_swap;
}
- swap_header = (void *) __get_free_page(GFP_USER);
- if (!swap_header) {
- printk("Unable to start swapping: out of memory :-)\n");
- error = -ENOMEM;
+ /*
+ * Read the swap header.
+ */
+ page = read_cache_page(mapping, 0,
+ (filler_t *)mapping->a_ops->readpage, swap_file);
+ if (IS_ERR(page)) {
+ error = PTR_ERR(page);
goto bad_swap;
}
-
- lock_page(virt_to_page(swap_header));
- rw_swap_page_nolock(READ, swp_entry(type,0), (char *) swap_header);
+ wait_on_page_locked(page);
+ if (!PageUptodate(page))
+ goto bad_swap;
+ kmap(page);
+ swap_header = page_address(page);
if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10))
swap_header_version = 1;
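The header is now read through the page cache (read_cache_page()) rather than
with rw_swap_page_nolock(); a rough user-space analogue of "read page 0 and
check the magic at the end of the page" (the path and page size are
assumptions for illustration):

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	#define PAGE_SZ 4096

	int main(void)
	{
		char page[PAGE_SZ];
		int fd = open("/swapfile", O_RDONLY);	/* hypothetical path */

		if (fd < 0 || read(fd, page, PAGE_SZ) != PAGE_SZ)
			return 1;
		/* the magic lives in the last 10 bytes of the first page */
		if (!memcmp(page + PAGE_SZ - 10, "SWAP-SPACE", 10))
			printf("version 1 swap header\n");
		else if (!memcmp(page + PAGE_SZ - 10, "SWAPSPACE2", 10))
			printf("version 2 swap header\n");
		else
			printf("no swap signature\n");
		close(fd);
		return 0;
	}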
@@ -968,33 +1178,10 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
switch (swap_header_version) {
case 1:
- memset(((char *) swap_header)+PAGE_SIZE-10,0,10);
- j = 0;
- p->lowest_bit = 0;
- p->highest_bit = 0;
- for (i = 1 ; i < 8*PAGE_SIZE ; i++) {
- if (test_bit(i,(unsigned long *) swap_header)) {
- if (!p->lowest_bit)
- p->lowest_bit = i;
- p->highest_bit = i;
- maxpages = i+1;
- j++;
- }
- }
- nr_good_pages = j;
- p->swap_map = vmalloc(maxpages * sizeof(short));
- if (!p->swap_map) {
- error = -ENOMEM;
- goto bad_swap;
- }
- for (i = 1 ; i < maxpages ; i++) {
- if (test_bit(i,(unsigned long *) swap_header))
- p->swap_map[i] = 0;
- else
- p->swap_map[i] = SWAP_MAP_BAD;
- }
- break;
-
+ printk(KERN_ERR "version 0 swap is no longer supported. "
+ "Use mkswap -v1 %s\n", name);
+ error = -EINVAL;
+ goto bad_swap;
case 2:
/* Check the swap header's sub-version and the size of
the swap file and bad block lists */
@@ -1050,15 +1237,20 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
goto bad_swap;
}
p->swap_map[0] = SWAP_MAP_BAD;
+ p->max = maxpages;
+ p->pages = nr_good_pages;
+
+ if (setup_swap_extents(p))
+ goto bad_swap;
+
swap_list_lock();
swap_device_lock(p);
- p->max = maxpages;
p->flags = SWP_ACTIVE;
- p->pages = nr_good_pages;
nr_swap_pages += nr_good_pages;
total_swap_pages += nr_good_pages;
- printk(KERN_INFO "Adding Swap: %dk swap-space (priority %d)\n",
- nr_good_pages<<(PAGE_SHIFT-10), p->prio);
+ printk(KERN_INFO "Adding %dk swap on %s. Priority:%d extents:%d\n",
+ nr_good_pages<<(PAGE_SHIFT-10), name,
+ p->prio, p->nr_extents);
/* insert swap space into swap_list: */
prev = -1;
@@ -1092,14 +1284,18 @@ bad_swap_2:
if (!(swap_flags & SWAP_FLAG_PREFER))
++least_priority;
swap_list_unlock();
+ destroy_swap_extents(p);
if (swap_map)
vfree(swap_map);
if (swap_file && !IS_ERR(swap_file))
filp_close(swap_file, NULL);
out:
- if (swap_header)
- free_page((long) swap_header);
- unlock_kernel();
+ if (page && !IS_ERR(page)) {
+ kunmap(page);
+ page_cache_release(page);
+ }
+ if (name)
+ putname(name);
return error;
}
@@ -1168,78 +1364,10 @@ bad_file:
goto out;
}
-/*
- * Page lock needs to be held in all cases to prevent races with
- * swap file deletion.
- */
-int swap_count(struct page *page)
+struct swap_info_struct *
+get_swap_info_struct(unsigned type)
{
- struct swap_info_struct * p;
- unsigned long offset, type;
- swp_entry_t entry;
- int retval = 0;
-
- entry.val = page->index;
- if (!entry.val)
- goto bad_entry;
- type = swp_type(entry);
- if (type >= nr_swapfiles)
- goto bad_file;
- p = type + swap_info;
- offset = swp_offset(entry);
- if (offset >= p->max)
- goto bad_offset;
- if (!p->swap_map[offset])
- goto bad_unused;
- retval = p->swap_map[offset];
-out:
- return retval;
-
-bad_entry:
- printk(KERN_ERR "swap_count: null entry!\n");
- goto out;
-bad_file:
- printk(KERN_ERR "swap_count: %s%08lx\n", Bad_file, entry.val);
- goto out;
-bad_offset:
- printk(KERN_ERR "swap_count: %s%08lx\n", Bad_offset, entry.val);
- goto out;
-bad_unused:
- printk(KERN_ERR "swap_count: %s%08lx\n", Unused_offset, entry.val);
- goto out;
-}
-
-/*
- * Prior swap_duplicate protects against swap device deletion.
- */
-void get_swaphandle_info(swp_entry_t entry, unsigned long *offset,
- struct inode **swapf)
-{
- unsigned long type;
- struct swap_info_struct *p;
-
- type = swp_type(entry);
- if (type >= nr_swapfiles) {
- printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_file, entry.val);
- return;
- }
-
- p = &swap_info[type];
- *offset = swp_offset(entry);
- if (*offset >= p->max && *offset != 0) {
- printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_offset, entry.val);
- return;
- }
- if (p->swap_map && !p->swap_map[*offset]) {
- printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_offset, entry.val);
- return;
- }
- if (!(p->flags & SWP_USED)) {
- printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_file, entry.val);
- return;
- }
-
- *swapf = p->swap_file->f_dentry->d_inode;
+ return &swap_info[type];
}
/*
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index f95ebed746b0..50cc6d13f0ff 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -195,6 +195,7 @@ struct vm_struct * get_vm_area(unsigned long size, unsigned long flags)
if (addr > VMALLOC_END-size)
goto out;
}
+ area->phys_addr = 0;
area->flags = flags;
area->addr = (void *)addr;
area->size = size;
@@ -209,9 +210,25 @@ out:
return NULL;
}
-void vfree(void * addr)
+struct vm_struct *remove_kernel_area(void *addr)
{
struct vm_struct **p, *tmp;
+ write_lock(&vmlist_lock);
+ for (p = &vmlist ; (tmp = *p) ; p = &tmp->next) {
+ if (tmp->addr == addr) {
+ *p = tmp->next;
+ write_unlock(&vmlist_lock);
+ return tmp;
+ }
+
+ }
+ write_unlock(&vmlist_lock);
+ return NULL;
+}
+
+void vfree(void * addr)
+{
+ struct vm_struct *tmp;
if (!addr)
return;
@@ -219,17 +236,12 @@ void vfree(void * addr)
printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr);
return;
}
- write_lock(&vmlist_lock);
- for (p = &vmlist ; (tmp = *p) ; p = &tmp->next) {
- if (tmp->addr == addr) {
- *p = tmp->next;
+ tmp = remove_kernel_area(addr);
+ if (tmp) {
vmfree_area_pages(VMALLOC_VMADDR(tmp->addr), tmp->size);
- write_unlock(&vmlist_lock);
kfree(tmp);
return;
}
- }
- write_unlock(&vmlist_lock);
printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", addr);
}
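The vfree() rewrite above factors the list search and unlink into
remove_kernel_area(), so vmlist_lock is held only across the list
manipulation and the actual teardown runs outside it. A hedged user-space
sketch of the same unlink-under-lock, free-outside-lock pattern (the names
and locking primitives are stand-ins, not the kernel's):

	#include <pthread.h>
	#include <stdlib.h>

	struct area {
		struct area *next;
		void *addr;
	};

	static struct area *arealist;
	static pthread_rwlock_t arealist_lock = PTHREAD_RWLOCK_INITIALIZER;

	static struct area *remove_area(void *addr)
	{
		struct area **p, *tmp;

		pthread_rwlock_wrlock(&arealist_lock);
		for (p = &arealist; (tmp = *p) != NULL; p = &tmp->next) {
			if (tmp->addr == addr) {
				*p = tmp->next;	/* unlink while holding the lock */
				pthread_rwlock_unlock(&arealist_lock);
				return tmp;
			}
		}
		pthread_rwlock_unlock(&arealist_lock);
		return NULL;
	}

	static void free_area(void *addr)
	{
		struct area *tmp = remove_area(addr);

		if (tmp)
			free(tmp);	/* teardown happens outside the lock */
	}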
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 91f180f2b08a..6561f2b71b35 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -392,7 +392,8 @@ shrink_cache(int nr_pages, zone_t *classzone,
spin_lock(&pagemap_lru_lock);
while (--max_scan >= 0 &&
(entry = inactive_list.prev) != &inactive_list) {
- struct page * page;
+ struct page *page;
+ int may_enter_fs;
if (need_resched()) {
spin_unlock(&pagemap_lru_lock);
@@ -427,10 +428,17 @@ shrink_cache(int nr_pages, zone_t *classzone,
goto page_mapped;
/*
+ * swap activity never enters the filesystem and is safe
+ * for GFP_NOFS allocations.
+ */
+ may_enter_fs = (gfp_mask & __GFP_FS) ||
+ (PageSwapCache(page) && (gfp_mask & __GFP_IO));
+
+ /*
* IO in progress? Leave it at the back of the list.
*/
if (unlikely(PageWriteback(page))) {
- if (gfp_mask & __GFP_FS) {
+ if (may_enter_fs) {
page_cache_get(page);
spin_unlock(&pagemap_lru_lock);
wait_on_page_writeback(page);
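The new may_enter_fs test captures the comment above in a single predicate; a
stand-alone restatement with the gfp bits modelled as plain flags (the flag
values are illustrative, not the kernel's):

	#include <stdbool.h>

	#define GFP_IO 0x1
	#define GFP_FS 0x2

	static bool may_enter_fs(bool page_is_swapcache, unsigned int gfp_mask)
	{
		/* Filesystem writeback needs __GFP_FS; swap I/O never re-enters
		 * the filesystem, so __GFP_IO is enough for swapcache pages. */
		return (gfp_mask & GFP_FS) ||
		       (page_is_swapcache && (gfp_mask & GFP_IO));
	}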
@@ -451,7 +459,7 @@ shrink_cache(int nr_pages, zone_t *classzone,
mapping = page->mapping;
if (PageDirty(page) && is_page_cache_freeable(page) &&
- page->mapping && (gfp_mask & __GFP_FS)) {
+ page->mapping && may_enter_fs) {
/*
* It is not critical here to write it only if
 * the page is unmapped because any direct writer
@@ -480,6 +488,15 @@ shrink_cache(int nr_pages, zone_t *classzone,
* If the page has buffers, try to free the buffer mappings
* associated with this page. If we succeed we try to free
* the page as well.
+ *
+ * We do this even if the page is PageDirty().
+ * try_to_release_page() does not perform I/O, but a page can
+ * have PageDirty set while actually being clean (all its
+ * buffers are clean). This happens if the
+ * buffers were written out directly, with submit_bh(). ext3
+ * will do this, as well as the blockdev mapping.
+ * try_to_release_page() will discover that cleanness and will
+ * drop the buffers and mark the page clean - it can be freed.
*/
if (PagePrivate(page)) {
spin_unlock(&pagemap_lru_lock);
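A toy model of the situation the comment describes: the page-level dirty bit
can still be set while every buffer on the page is already clean (they were
written out directly via submit_bh()), in which case dropping the buffers is
safe. Illustrative only:

	#include <stdbool.h>

	struct buffer { bool dirty; };

	/* Releasable iff no buffer holds dirty data, regardless of the
	 * page-level dirty bit, which may be stale. */
	static bool buffers_releasable(const struct buffer *bufs, int n)
	{
		int i;

		for (i = 0; i < n; i++)
			if (bufs[i].dirty)
				return false;
		return true;
	}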
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 8b1f2a159e19..464a56367e28 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2419,7 +2419,7 @@ struct ip_rt_acct *ip_rt_acct;
/* This code sucks. But you should have seen it before! --RR */
/* IP route accounting ptr for this logical cpu number. */
-#define IP_RT_ACCT_CPU(i) (ip_rt_acct + cpu_logical_map(i) * 256)
+#define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256)
static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
int length, int *eof, void *data)
@@ -2441,6 +2441,8 @@ static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
/* Add the other cpus in, one int at a time */
for (i = 1; i < NR_CPUS; i++) {
unsigned int j;
+ if (!cpu_online(i))
+ continue;
for (j = 0; j < length/4; j++)
((u32*)buffer)[j] += ((u32*)IP_RT_ACCT_CPU(i))[j];
}
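The accounting read now indexes the per-CPU blocks directly and skips CPUs
that are not online; a user-space sketch of the same accumulation (NR_CPUS
and the online map are made-up values for illustration):

	#include <stdint.h>
	#include <string.h>

	#define NR_CPUS 4
	#define WORDS   256

	static uint32_t acct[NR_CPUS][WORDS];
	static int cpu_online_map[NR_CPUS] = { 1, 1, 0, 1 };	/* cpu 2 offline */

	static void sum_counters(uint32_t *out)	/* out has WORDS entries */
	{
		int cpu;
		unsigned int j;

		memcpy(out, acct[0], sizeof(acct[0]));
		for (cpu = 1; cpu < NR_CPUS; cpu++) {
			if (!cpu_online_map[cpu])
				continue;	/* never read offline CPUs' blocks */
			for (j = 0; j < WORDS; j++)
				out[j] += acct[cpu][j];
		}
	}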