author     Linus Torvalds <torvalds@athlon.transmeta.com>  2002-02-04 20:19:17 -0800
committer  Linus Torvalds <torvalds@athlon.transmeta.com>  2002-02-04 20:19:17 -0800
commit     e2f6721a0a1b07612c0682d8240d3e9bc0a445a4
tree       aa6410ac8fc965f7d695031fa4c467347138c485
parent     269f8f707739116e68aee38d78d0cfb3d896b856
v2.4.9.14 -> v2.4.9.15
- Jan Harkes: make Coda work with arbitrary host filesystems, not just
  filesystems that use generic_file_read/write (the rewritten write path
  is reassembled below, after the fs/coda/file.c diff)
- Al Viro: block device cleanups (the new reference rules are sketched
  below, after the fs/block_dev.c diff)
- Hugh Dickins: swap device lock fixes - fix swap readahead race (the
  locking protocol is sketched below, after the mm/swapfile.c diff)
- me, Andrea: more reference bit cleanups
-rw-r--r--  Makefile               |    2
-rw-r--r--  drivers/block/rd.c     |    3
-rw-r--r--  drivers/char/raw.c     |    1
-rw-r--r--  drivers/char/tty_io.c  |    2
-rw-r--r--  drivers/ide/hptraid.c  |   21
-rw-r--r--  drivers/ide/pdcraid.c  |   19
-rw-r--r--  drivers/md/md.c        |    8
-rw-r--r--  fs/block_dev.c         |   68
-rw-r--r--  fs/coda/file.c         |   57
-rw-r--r--  fs/coda/psdev.c        |    2
-rw-r--r--  fs/devfs/base.c        |   11
-rw-r--r--  fs/devices.c           |    1
-rw-r--r--  fs/inode.c             |    9
-rw-r--r--  fs/partitions/ibm.c    |   10
-rw-r--r--  fs/super.c             |   12
-rw-r--r--  include/linux/fs.h     |    4
-rw-r--r--  include/linux/mm.h     |   32
-rw-r--r--  include/linux/swap.h   |   19
-rw-r--r--  mm/filemap.c           |    8
-rw-r--r--  mm/memory.c            |   25
-rw-r--r--  mm/shmem.c             |   50
-rw-r--r--  mm/swap.c              |    2
-rw-r--r--  mm/swap_state.c        |   70
-rw-r--r--  mm/swapfile.c          |  151
-rw-r--r--  mm/vmscan.c            |  108

25 files changed, 373 insertions(+), 322 deletions(-)
diff --git a/Makefile b/Makefile
index d5ea20273dc5..447e51566b59 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
VERSION = 2
PATCHLEVEL = 4
SUBLEVEL = 10
-EXTRAVERSION =-pre14
+EXTRAVERSION =-pre15
KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION)
diff --git a/drivers/block/rd.c b/drivers/block/rd.c
index 369a776ce33b..4bf4286168b2 100644
--- a/drivers/block/rd.c
+++ b/drivers/block/rd.c
@@ -491,7 +491,6 @@ static void __exit rd_cleanup (void)
bdev->bd_cache_openers--;
truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
blkdev_put(bdev, BDEV_FILE);
- bdput(bdev);
}
destroy_buffers(MKDEV(MAJOR_NR, i));
}
@@ -780,7 +779,7 @@ static void __init rd_load_image(kdev_t device, int offset, int unit)
if (i && (i % devblocks == 0)) {
printk("done disk #%d.\n", i/devblocks);
rotate = 0;
- if (blkdev_close(inode, &infile) != 0) {
+ if (infile.f_op->release(inode, &infile) != 0) {
printk("Error closing the disk.\n");
goto noclose_input;
}
diff --git a/drivers/char/raw.c b/drivers/char/raw.c
index c9c5b6e4ff7c..d90d9723c1f6 100644
--- a/drivers/char/raw.c
+++ b/drivers/char/raw.c
@@ -103,6 +103,7 @@ int raw_open(struct inode *inode, struct file *filp)
if (!bdev)
goto out;
+ atomic_inc(&bdev->bd_count);
rdev = to_kdev_t(bdev->bd_dev);
err = blkdev_get(bdev, filp->f_mode, 0, BDEV_RAW);
if (err)
diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index 9b2d90a6cf4e..4a688d830362 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -270,6 +270,8 @@ int tty_register_ldisc(int disc, struct tty_ldisc *new_ldisc)
return 0;
}
+EXPORT_SYMBOL(tty_register_ldisc);
+
/* Set the discipline of a tty line. */
static int tty_set_ldisc(struct tty_struct *tty, int ldisc)
{
diff --git a/drivers/ide/hptraid.c b/drivers/ide/hptraid.c
index d70fab90e451..bbdf1d70983f 100644
--- a/drivers/ide/hptraid.c
+++ b/drivers/ide/hptraid.c
@@ -279,6 +279,7 @@ static void __init probedisk(int major, int minor,int device)
int i;
struct highpoint_raid_conf *prom;
static unsigned char block[4096];
+ struct block_device *bdev;
if (maxsectors(major,minor)==0)
return;
@@ -301,12 +302,12 @@ static void __init probedisk(int major, int minor,int device)
if (i>8)
return;
- raid[device].disk[i].bdev = bdget(MKDEV(major,minor));
- if (raid[device].disk[i].bdev != NULL) {
+ bdev = bdget(MKDEV(major,minor));
+ if (bdev && blkdev_get(bdev,FMODE_READ|FMODE_WRITE,0,BDEV_RAW) == 0) {
int j=0;
struct gendisk *gd;
- /* This is supposed to prevent others from stealing our underlying disks */
- blkdev_get(raid[device].disk[i].bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_RAW);
+
+ raid[device].disk[i].bdev = bdev;
/* now blank the /proc/partitions table for the wrong partition table,
so that scripts don't accidentally mount it and crash the kernel */
/* XXX: the 0 is an utter hack --hch */
@@ -408,12 +409,12 @@ static void __exit hptraid_exit (void)
{
int i,device;
for (device = 0; device<16; device++) {
- for (i=0;i<8;i++)
- if (raid[device].disk[i].bdev) {
- blkdev_put(raid[device].disk[i].bdev, BDEV_RAW);
- bdput(raid[device].disk[i].bdev);
- raid[device].disk[i].bdev = NULL;
- }
+ for (i=0;i<8;i++) {
+ struct block_device *bdev = raid[device].disk[i].bdev;
+ raid[device].disk[i].bdev = NULL;
+ if (bdev)
+ blkdev_put(bdev, BDEV_RAW);
+ }
if (raid[device].sectors)
ataraid_release_device(device);
}
diff --git a/drivers/ide/pdcraid.c b/drivers/ide/pdcraid.c
index b12fb0be5861..f21fda755a5f 100644
--- a/drivers/ide/pdcraid.c
+++ b/drivers/ide/pdcraid.c
@@ -311,12 +311,12 @@ static void __init probedisk(int major, int minor,int device)
for (i=0;(i<prom->raid.total_disks)&&(i<8);i++) {
if ( (prom->raid.disk[i].channel== prom->raid.channel) &&
(prom->raid.disk[i].device == prom->raid.device) ) {
- raid[device].disk[i].bdev = bdget(MKDEV(major,minor));
- if (raid[device].disk[i].bdev != NULL) {
+ struct block_device *bdev = bdget(MKDEV(major,minor));
+ if (bdev && blkdev_get(bdev,FMODE_READ|FMODE_WRITE,0,BDEV_RAW) == 0) {
struct gendisk *gd;
int j;
/* This is supposed to prevent others from stealing our underlying disks */
- blkdev_get(raid[device].disk[i].bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_RAW);
+ raid[device].disk[i].bdev = bdev;
gd=get_gendisk(major);
if (gd!=NULL) {
for (j=1+(minor<<gd->minor_shift);j<((minor+1)<<gd->minor_shift);j++)
@@ -418,13 +418,12 @@ static void __exit pdcraid_exit (void)
{
int i,device;
for (device = 0; device<16; device++) {
- for (i=0;i<8;i++)
- if (raid[device].disk[i].bdev) {
- blkdev_put(raid[device].disk[i].bdev, BDEV_RAW);
- bdput(raid[device].disk[i].bdev);
- raid[device].disk[i].bdev = NULL;
-
- }
+ for (i=0;i<8;i++) {
+ struct block_device *bdev = raid[device].disk[i].bdev;
+ raid[device].disk[i].bdev = NULL;
+ if (bdev)
+ blkdev_put(bdev, BDEV_RAW);
+ }
if (raid[device].sectors)
ataraid_release_device(device);
}
diff --git a/drivers/md/md.c b/drivers/md/md.c
index d4d51fd5f43d..19c30390d089 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -649,11 +649,11 @@ static int lock_rdev (mdk_rdev_t *rdev)
static void unlock_rdev (mdk_rdev_t *rdev)
{
- if (!rdev->bdev)
- MD_BUG();
- blkdev_put(rdev->bdev, BDEV_RAW);
- bdput(rdev->bdev);
+ struct block_device *bdev = rdev->bdev;
rdev->bdev = NULL;
+ if (!bdev)
+ MD_BUG();
+ blkdev_put(bdev, BDEV_RAW);
}
void md_autodetect_dev (kdev_t dev);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 52f35224795e..b2d374ec457b 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -404,7 +404,6 @@ static int get_inode(struct block_device *bdev)
if (!inode)
return -ENOMEM;
inode->i_rdev = to_kdev_t(bdev->bd_dev);
- atomic_inc(&bdev->bd_count); /* will go away */
inode->i_bdev = bdev;
inode->i_data.a_ops = &def_blk_aops;
bdev->bd_inode = inode;
@@ -437,6 +436,7 @@ static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
{
memset(bdev, 0, sizeof(*bdev));
sema_init(&bdev->bd_sem, 1);
+ INIT_LIST_HEAD(&bdev->bd_inodes);
}
}
@@ -522,17 +522,58 @@ struct block_device *bdget(dev_t dev)
void bdput(struct block_device *bdev)
{
- if (atomic_dec_and_test(&bdev->bd_count)) {
+ if (atomic_dec_and_lock(&bdev->bd_count, &bdev_lock)) {
+ struct list_head *p;
if (bdev->bd_openers)
BUG();
if (bdev->bd_cache_openers)
BUG();
- spin_lock(&bdev_lock);
list_del(&bdev->bd_hash);
+ while ( (p = bdev->bd_inodes.next) != &bdev->bd_inodes ) {
+ struct inode *inode;
+ inode = list_entry(p, struct inode, i_devices);
+ list_del_init(p);
+ inode->i_bdev = NULL;
+ }
spin_unlock(&bdev_lock);
destroy_bdev(bdev);
}
}
+
+int bd_acquire(struct inode *inode)
+{
+ struct block_device *bdev;
+ spin_lock(&bdev_lock);
+ if (inode->i_bdev) {
+ atomic_inc(&inode->i_bdev->bd_count);
+ spin_unlock(&bdev_lock);
+ return 0;
+ }
+ spin_unlock(&bdev_lock);
+ bdev = bdget(kdev_t_to_nr(inode->i_rdev));
+ if (!bdev)
+ return -ENOMEM;
+ spin_lock(&bdev_lock);
+ if (!inode->i_bdev) {
+ inode->i_bdev = bdev;
+ list_add(&inode->i_devices, &bdev->bd_inodes);
+ } else if (inode->i_bdev != bdev)
+ BUG();
+ spin_unlock(&bdev_lock);
+ return 0;
+}
+
+/* Call when you free inode */
+
+void bd_forget(struct inode *inode)
+{
+ spin_lock(&bdev_lock);
+ if (inode->i_bdev) {
+ list_del_init(&inode->i_devices);
+ inode->i_bdev = NULL;
+ }
+ spin_unlock(&bdev_lock);
+}
static struct {
const char *name;
@@ -706,13 +747,15 @@ int blkdev_get(struct block_device *bdev, mode_t mode, unsigned flags, int kind)
}
unlock_kernel();
up(&bdev->bd_sem);
+ if (ret)
+ bdput(bdev);
return ret;
}
int blkdev_open(struct inode * inode, struct file * filp)
{
- int ret = -ENXIO;
- struct block_device *bdev = inode->i_bdev;
+ int ret;
+ struct block_device *bdev;
/*
* Preserve backwards compatibility and allow large file access
@@ -722,13 +765,15 @@ int blkdev_open(struct inode * inode, struct file * filp)
*/
filp->f_flags |= O_LARGEFILE;
+ bd_acquire(inode);
+ bdev = inode->i_bdev;
down(&bdev->bd_sem);
- if (get_inode(bdev)) {
- up(&bdev->bd_sem);
- return -ENOMEM;
- }
+ ret = get_inode(bdev);
+ if (ret)
+ goto out;
+ ret = -ENXIO;
lock_kernel();
if (!bdev->bd_op)
bdev->bd_op = get_blkfops(MAJOR(inode->i_rdev));
@@ -749,7 +794,10 @@ int blkdev_open(struct inode * inode, struct file * filp)
}
}
unlock_kernel();
+out:
up(&bdev->bd_sem);
+ if (ret)
+ bdput(bdev);
return ret;
}
@@ -777,6 +825,7 @@ int blkdev_put(struct block_device *bdev, int kind)
}
unlock_kernel();
up(&bdev->bd_sem);
+ bdput(bdev);
return ret;
}
@@ -841,6 +890,7 @@ int blkdev_close(struct inode * inode, struct file * filp)
}
unlock_kernel();
up(&bdev->bd_sem);
+ bdput(bdev);
return ret;
}
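
[Editor's note: to summarize the reference rules this file now establishes,
blkdev_get() drops the bdget() reference itself on failure, and blkdev_put()
drops it on success, so callers no longer pair each blkdev_put() with a
bdput(). A minimal sketch of the resulting open/close discipline, using only
calls from the hunks in this patch; claim_disk()/release_disk() are
hypothetical helper names, not part of the patch.]

	/* Hypothetical helpers (not in the patch) showing the new
	 * discipline, as adopted by hptraid.c, pdcraid.c and md.c above. */
	static int claim_disk(struct block_device **slot, int major, int minor)
	{
		struct block_device *bdev = bdget(MKDEV(major, minor));
		int err;

		if (!bdev)
			return -ENOMEM;
		err = blkdev_get(bdev, FMODE_READ | FMODE_WRITE, 0, BDEV_RAW);
		if (err)
			return err;	/* blkdev_get() already bdput() it */
		*slot = bdev;		/* the open holds the only reference */
		return 0;
	}

	static void release_disk(struct block_device **slot)
	{
		struct block_device *bdev = *slot;

		*slot = NULL;		/* clear the slot first, as md.c does */
		if (bdev)
			blkdev_put(bdev, BDEV_RAW);	/* drops the ref too */
	}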
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 6c2610f0ab87..fa1d150e32d0 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -31,28 +31,65 @@
int use_coda_close;
static ssize_t
-coda_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)
+coda_file_read(struct file *file, char *buf, size_t count, loff_t *ppos)
{
+ struct inode *inode = file->f_dentry->d_inode;
+ struct coda_inode_info *cii = ITOC(inode);
struct file *cfile;
+
+ cfile = cii->c_container;
+ if (!cfile) BUG();
+
+ if (!cfile->f_op || !cfile->f_op->read)
+ return -EINVAL;
+
+ return cfile->f_op->read(cfile, buf, count, ppos);
+}
+
+static ssize_t
+coda_file_write(struct file *file,const char *buf,size_t count,loff_t *ppos)
+{
struct inode *cinode, *inode = file->f_dentry->d_inode;
struct coda_inode_info *cii = ITOC(inode);
- ssize_t n;
+ struct file *cfile;
+ ssize_t ret;
+ int flags;
cfile = cii->c_container;
if (!cfile) BUG();
- if (!cfile->f_op || cfile->f_op->write != generic_file_write)
- BUG();
+ if (!cfile->f_op || !cfile->f_op->write)
+ return -EINVAL;
cinode = cfile->f_dentry->d_inode;
- down(&cinode->i_sem);
+ down(&inode->i_sem);
+ flags = cfile->f_flags;
+ cfile->f_flags |= file->f_flags & (O_APPEND | O_SYNC);
- n = generic_file_write(file, buf, count, ppos);
+ ret = cfile->f_op->write(cfile, buf, count, ppos);
+
+ cfile->f_flags = flags;
inode->i_size = cinode->i_size;
+ up(&inode->i_sem);
+
+ return ret;
+}
+
+static int
+coda_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ struct coda_inode_info *cii = ITOC(inode);
+ struct file *cfile;
+
+ cfile = cii->c_container;
+
+ if (!cfile) BUG();
- up(&cinode->i_sem);
+ if (!cfile->f_op || !cfile->f_op->mmap)
+ return -ENODEV;
- return n;
+ return cfile->f_op->mmap(cfile, vma);
}
int coda_open(struct inode *i, struct file *f)
@@ -237,9 +274,9 @@ int coda_fsync(struct file *file, struct dentry *dentry, int datasync)
struct file_operations coda_file_operations = {
llseek: generic_file_llseek,
- read: generic_file_read,
+ read: coda_file_read,
write: coda_file_write,
- mmap: generic_file_mmap,
+ mmap: coda_file_mmap,
open: coda_open,
flush: coda_flush,
release: coda_release,
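
[Editor's note: because the + and - lines of the coda_file_write() hunk above
are interleaved, here is the post-patch function reassembled from the hunk
for readability (a reconstruction, not new code): it delegates to the host
filesystem's own write method, temporarily propagating O_APPEND/O_SYNC, and
mirrors the container's size back to the Coda inode.]

	static ssize_t
	coda_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
	{
		struct inode *cinode, *inode = file->f_dentry->d_inode;
		struct coda_inode_info *cii = ITOC(inode);
		struct file *cfile;
		ssize_t ret;
		int flags;

		cfile = cii->c_container;
		if (!cfile) BUG();

		if (!cfile->f_op || !cfile->f_op->write)
			return -EINVAL;

		cinode = cfile->f_dentry->d_inode;
		down(&inode->i_sem);	/* lock the Coda inode, not the container */
		flags = cfile->f_flags;
		cfile->f_flags |= file->f_flags & (O_APPEND | O_SYNC);

		ret = cfile->f_op->write(cfile, buf, count, ppos);

		cfile->f_flags = flags;
		inode->i_size = cinode->i_size;	/* mirror the container's size */
		up(&inode->i_sem);

		return ret;
	}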
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 59c530d6d8f3..2fe942f7cad9 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -414,7 +414,7 @@ MODULE_AUTHOR("Peter J. Braam <braam@cs.cmu.edu>");
static int __init init_coda(void)
{
int status;
- printk(KERN_INFO "Coda Kernel/Venus communications, v5.3.14, coda@cs.cmu.edu\n");
+ printk(KERN_INFO "Coda Kernel/Venus communications, v5.3.15, coda@cs.cmu.edu\n");
status = init_coda_psdev();
if ( status ) {
diff --git a/fs/devfs/base.c b/fs/devfs/base.c
index 58aedeca1907..974379d14d4d 100644
--- a/fs/devfs/base.c
+++ b/fs/devfs/base.c
@@ -2291,9 +2291,16 @@ static int devfs_statfs (struct super_block *sb, struct statfs *buf)
return 0;
} /* End Function devfs_statfs */
+static void devfs_clear_inode(struct inode *inode)
+{
+ if (S_ISBLK(inode->i_mode))
+ bdput(inode->i_bdev);
+}
+
static struct super_operations devfs_sops =
{
put_inode: force_delete,
+ clear_inode: devfs_clear_inode,
statfs: devfs_statfs,
};
@@ -2351,9 +2358,7 @@ static struct inode *get_vfs_inode (struct super_block *sb,
{
inode->i_rdev = MKDEV (de->u.fcb.u.device.major,
de->u.fcb.u.device.minor);
- inode->i_bdev = bdget ( kdev_t_to_nr (inode->i_rdev) );
- inode->i_mapping->a_ops = &def_blk_aops;
- if (inode->i_bdev)
+ if (bd_acquire(inode) == 0)
{
if (!inode->i_bdev->bd_op && de->u.fcb.ops)
inode->i_bdev->bd_op = de->u.fcb.ops;
diff --git a/fs/devices.c b/fs/devices.c
index 875f0e9f364c..3b4448e8d0e8 100644
--- a/fs/devices.c
+++ b/fs/devices.c
@@ -207,7 +207,6 @@ void init_special_inode(struct inode *inode, umode_t mode, int rdev)
} else if (S_ISBLK(mode)) {
inode->i_fop = &def_blk_fops;
inode->i_rdev = to_kdev_t(rdev);
- inode->i_bdev = bdget(rdev);
} else if (S_ISFIFO(mode))
inode->i_fop = &def_fifo_fops;
else if (S_ISSOCK(mode))
diff --git a/fs/inode.c b/fs/inode.c
index e034073de731..f9783a67b454 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -106,6 +106,7 @@ static void init_once(void * foo, kmem_cache_t * cachep, unsigned long flags)
INIT_LIST_HEAD(&inode->i_dentry);
INIT_LIST_HEAD(&inode->i_dirty_buffers);
INIT_LIST_HEAD(&inode->i_dirty_data_buffers);
+ INIT_LIST_HEAD(&inode->i_devices);
sema_init(&inode->i_sem, 1);
sema_init(&inode->i_zombie, 1);
spin_lock_init(&inode->i_data.i_shared_lock);
@@ -516,11 +517,9 @@ void clear_inode(struct inode *inode)
DQUOT_DROP(inode);
if (inode->i_sb && inode->i_sb->s_op && inode->i_sb->s_op->clear_inode)
inode->i_sb->s_op->clear_inode(inode);
- if (inode->i_bdev) {
- bdput(inode->i_bdev);
- inode->i_bdev = NULL;
- }
- if (inode->i_cdev) {
+ if (inode->i_bdev)
+ bd_forget(inode);
+ else if (inode->i_cdev) {
cdput(inode->i_cdev);
inode->i_cdev = NULL;
}
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index ce313bbe89eb..8e2b6d5da229 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -47,9 +47,10 @@ get_drive_geometry(int kdev,struct hd_geometry *geo)
{
struct block_device *bdev = bdget(kdev_t_to_nr(kdev));
int rc = blkdev_get(bdev, 0, 1, BDEV_FILE);
- if ( rc == 0 )
+ if ( rc == 0 ) {
rc = ioctl_by_bdev(bdev, HDIO_GETGEO, (unsigned long)geo);
- blkdev_put(bdev,BDEV_FILE);
+ blkdev_put(bdev, BDEV_FILE);
+ }
return rc;
}
@@ -58,9 +59,10 @@ get_drive_info(int kdev,dasd_information_t *info)
{
struct block_device *bdev = bdget(kdev_t_to_nr(kdev));
int rc = blkdev_get(bdev, 0, 1, BDEV_FILE);
- if ( rc == 0 )
+ if ( rc == 0 ) {
rc = ioctl_by_bdev(bdev, BIODASDINFO, (unsigned long)(info));
- blkdev_put(bdev,BDEV_FILE);
+ blkdev_put(bdev, BDEV_FILE);
+ }
return rc;
}
diff --git a/fs/super.c b/fs/super.c
index 5541b10196fc..4736ded323c6 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -925,6 +925,7 @@ static struct super_block *get_sb_bdev(struct file_system_type *fs_type,
error = -EACCES;
if (nd.mnt->mnt_flags & MNT_NODEV)
goto out;
+ bd_acquire(inode);
bdev = inode->i_bdev;
bdops = devfs_get_ops ( devfs_get_handle_from_inode (inode) );
if (bdops) bdev->bd_op = bdops;
@@ -982,8 +983,6 @@ restart:
if (!fs_type->read_super(s, data, 0))
goto out_fail;
unlock_super(s);
- /* tell bdcache that we are going to keep this one */
- atomic_inc(&bdev->bd_count);
get_filesystem(fs_type);
path_release(&nd);
return s;
@@ -1128,10 +1127,9 @@ static void kill_super(struct super_block *sb)
sb->s_type = NULL;
unlock_super(sb);
unlock_kernel();
- if (bdev) {
+ if (bdev)
blkdev_put(bdev, BDEV_FS);
- bdput(bdev);
- } else
+ else
put_unnamed_dev(dev);
spin_lock(&sb_lock);
list_del(&sb->s_list);
@@ -1718,6 +1716,7 @@ skip_nfs:
if (!ROOT_DEV)
panic("I have no root and I want to scream");
+retry:
bdev = bdget(kdev_t_to_nr(ROOT_DEV));
if (!bdev)
panic(__FUNCTION__ ": unable to allocate root device");
@@ -1729,7 +1728,7 @@ skip_nfs:
retval = blkdev_get(bdev, mode, 0, BDEV_FS);
if (retval == -EROFS) {
root_mountflags |= MS_RDONLY;
- retval = blkdev_get(bdev, FMODE_READ, 0, BDEV_FS);
+ goto retry;
}
if (retval) {
/*
@@ -1977,6 +1976,7 @@ int __init change_root(kdev_t new_root_dev,const char *put_old)
int blivet;
struct block_device *ramdisk = old_rootmnt->mnt_sb->s_bdev;
+ atomic_inc(&ramdisk->bd_count);
blivet = blkdev_get(ramdisk, FMODE_READ, 0, BDEV_FS);
printk(KERN_NOTICE "Trying to unmount old root ... ");
if (!blivet) {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 31a2167afac8..9eca17f8ee1b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -415,6 +415,7 @@ struct block_device {
int bd_cache_openers;
const struct block_device_operations *bd_op;
struct semaphore bd_sem; /* open/close mutex */
+ struct list_head bd_inodes;
};
struct inode {
@@ -452,6 +453,7 @@ struct inode {
int i_mapping_overload;
struct dquot *i_dquot[MAXQUOTAS];
/* These three should probably be a union */
+ struct list_head i_devices;
struct pipe_inode_info *i_pipe;
struct block_device *i_bdev;
struct char_device *i_cdev;
@@ -1046,6 +1048,8 @@ enum {BDEV_FILE, BDEV_SWAP, BDEV_FS, BDEV_RAW};
extern int register_blkdev(unsigned int, const char *, struct block_device_operations *);
extern int unregister_blkdev(unsigned int, const char *);
extern struct block_device *bdget(dev_t);
+extern int bd_acquire(struct inode *inode);
+extern void bd_forget(struct inode *inode);
extern void bdput(struct block_device *);
extern struct char_device *cdget(dev_t);
extern void cdput(struct char_device *);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8f9e8bfbce88..494d025d143d 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -274,7 +274,6 @@ typedef struct page {
#define PG_active 6
#define PG_inactive 7
#define PG_slab 8
-#define PG_swap_cache 9
#define PG_skip 10
#define PG_highmem 11
#define PG_checked 12 /* kill me in 2.5.<early>. */
@@ -326,18 +325,9 @@ static inline void set_page_dirty(struct page * page)
#define SetPageDecrAfter(page) set_bit(PG_decr_after, &(page)->flags)
#define PageTestandClearDecrAfter(page) test_and_clear_bit(PG_decr_after, &(page)->flags)
#define PageSlab(page) test_bit(PG_slab, &(page)->flags)
-#define PageSwapCache(page) test_bit(PG_swap_cache, &(page)->flags)
-#define PageReserved(page) test_bit(PG_reserved, &(page)->flags)
-
#define PageSetSlab(page) set_bit(PG_slab, &(page)->flags)
-#define PageSetSwapCache(page) set_bit(PG_swap_cache, &(page)->flags)
-
-#define PageTestandSetSwapCache(page) test_and_set_bit(PG_swap_cache, &(page)->flags)
-
-#define PageClearSlab(page) clear_bit(PG_slab, &(page)->flags)
-#define PageClearSwapCache(page) clear_bit(PG_swap_cache, &(page)->flags)
-
-#define PageTestandClearSwapCache(page) test_and_clear_bit(PG_swap_cache, &(page)->flags)
+#define PageClearSlab(page) clear_bit(PG_slab, &(page)->flags)
+#define PageReserved(page) test_bit(PG_reserved, &(page)->flags)
#define PageActive(page) test_bit(PG_active, &(page)->flags)
#define SetPageActive(page) set_bit(PG_active, &(page)->flags)
@@ -465,6 +455,9 @@ extern void show_mem(void);
extern void si_meminfo(struct sysinfo * val);
extern void swapin_readahead(swp_entry_t);
+extern struct address_space swapper_space;
+#define PageSwapCache(page) ((page)->mapping == &swapper_space)
+
static inline int is_page_cache_freeable(struct page * page)
{
return page_count(page) - !!page->buffers == 1;
@@ -476,15 +469,13 @@ static inline int is_page_cache_freeable(struct page * page)
*/
static inline int exclusive_swap_page(struct page *page)
{
- unsigned int count;
-
if (!PageLocked(page))
BUG();
if (!PageSwapCache(page))
return 0;
- count = page_count(page) - !!page->buffers; /* 2: us + swap cache */
- count += swap_count(page); /* +1: just swap cache */
- return count == 3; /* =3: total */
+ if (page_count(page) - !!page->buffers != 2) /* 2: us + cache */
+ return 0;
+ return swap_count(page) == 1; /* 1: just cache */
}
extern void __free_pte(pte_t);
@@ -565,11 +556,10 @@ extern struct page *filemap_nopage(struct vm_area_struct *, unsigned long, int);
#define GFP_NOFS (__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO)
#define GFP_ATOMIC (__GFP_HIGH)
#define GFP_USER ( __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS)
-#define GFP_HIGHUSER ( __GFP_WAIT | __GFP_IO | __GFP_HIGHIO \
- | __GFP_FS | __GFP_HIGHMEM)
+#define GFP_HIGHUSER ( __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS | __GFP_HIGHMEM)
#define GFP_KERNEL (__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS)
-#define GFP_NFS (__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS)
-#define GFP_KSWAPD ( __GFP_IO | __GFP_HIGHIO | __GFP_FS)
+#define GFP_NFS (__GFP_HIGH | __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS)
+#define GFP_KSWAPD ( __GFP_WAIT | __GFP_IO | __GFP_HIGHIO | __GFP_FS)
/* Flag - indicates that the buffer will be suitable for DMA. Ignored on some
platforms, used as appropriate on others */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 0ce8a374a11d..0282b6bac60c 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -84,7 +84,6 @@ extern unsigned int nr_free_buffer_pages(void);
extern int nr_active_pages;
extern int nr_inactive_pages;
extern atomic_t nr_async_pages;
-extern struct address_space swapper_space;
extern atomic_t page_cache_size;
extern atomic_t buffermem_pages;
extern spinlock_t pagecache_lock;
@@ -122,35 +121,27 @@ extern void rw_swap_page_nolock(int, swp_entry_t, char *);
/* linux/mm/swap_state.c */
extern void show_swap_cache_info(void);
extern void add_to_swap_cache(struct page *, swp_entry_t);
-extern int swap_check_entry(unsigned long);
+extern void __delete_from_swap_cache(struct page *page);
+extern void delete_from_swap_cache(struct page *page);
+extern void free_page_and_swap_cache(struct page *page);
extern struct page * lookup_swap_cache(swp_entry_t);
extern struct page * read_swap_cache_async(swp_entry_t);
/* linux/mm/oom_kill.c */
extern void oom_kill(void);
-/*
- * Make these inline later once they are working properly.
- */
-extern void __delete_from_swap_cache(struct page *page);
-extern void delete_from_swap_cache(struct page *page);
-extern void delete_from_swap_cache_nolock(struct page *page);
-extern void free_page_and_swap_cache(struct page *page);
-
/* linux/mm/swapfile.c */
extern unsigned int nr_swapfiles;
extern struct swap_info_struct swap_info[];
extern int is_swap_partition(kdev_t);
extern void si_swapinfo(struct sysinfo *);
-extern swp_entry_t __get_swap_page(unsigned short);
+extern swp_entry_t get_swap_page(void);
extern void get_swaphandle_info(swp_entry_t, unsigned long *, kdev_t *,
struct inode **);
extern int swap_duplicate(swp_entry_t);
extern int swap_count(struct page *);
extern int valid_swaphandles(swp_entry_t, unsigned long *);
-#define get_swap_page() __get_swap_page(1)
-extern void __swap_free(swp_entry_t, unsigned short);
-#define swap_free(entry) __swap_free((entry), 1)
+extern void swap_free(swp_entry_t);
struct swap_list_t {
int head; /* head of priority-ordered swapfile list */
int next; /* swapfile to be used next */
diff --git a/mm/filemap.c b/mm/filemap.c
index 3d6817ce077c..609e3bb04d94 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1704,6 +1704,7 @@ success:
* and possibly copy it over to another page..
*/
old_page = page;
+ mark_page_accessed(page);
if (no_share) {
struct page *new_page = alloc_page(GFP_HIGHUSER);
@@ -2553,7 +2554,6 @@ repeat:
}
if (cached_page)
page_cache_release(cached_page);
- mark_page_accessed(page);
return page;
}
@@ -2571,7 +2571,10 @@ struct page *read_cache_page(struct address_space *mapping,
retry:
page = __read_cache_page(mapping, index, filler, data);
- if (IS_ERR(page) || Page_Uptodate(page))
+ if (IS_ERR(page))
+ goto out;
+ mark_page_accessed(page);
+ if (Page_Uptodate(page))
goto out;
lock_page(page);
@@ -2835,6 +2838,7 @@ generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
unlock:
kunmap(page);
/* Mark it unlocked again and drop the page.. */
+ SetPageReferenced(page);
UnlockPage(page);
page_cache_release(page);
diff --git a/mm/memory.c b/mm/memory.c
index 3987ece050c0..efd520264e75 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -85,8 +85,6 @@ void __free_pte(pte_t pte)
if (page->mapping) {
if (pte_dirty(pte))
set_page_dirty(page);
- if (pte_young(pte))
- mark_page_accessed(page);
}
free_page_and_swap_cache(page);
@@ -939,10 +937,8 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
break;
/* Recheck swapcachedness once the page is locked */
can_reuse = exclusive_swap_page(old_page);
-#if 1
if (can_reuse)
- delete_from_swap_cache_nolock(old_page);
-#endif
+ delete_from_swap_cache(old_page);
UnlockPage(old_page);
if (!can_reuse)
break;
@@ -1088,23 +1084,19 @@ void swapin_readahead(swp_entry_t entry)
unsigned long offset;
/*
- * Get the number of handles we should do readahead io to. Also,
- * grab temporary references on them, releasing them as io completes.
+ * Get the number of handles we should do readahead io to.
*/
num = valid_swaphandles(entry, &offset);
for (i = 0; i < num; offset++, i++) {
/* Don't block on I/O for read-ahead */
- if (atomic_read(&nr_async_pages) >= pager_daemon.swap_cluster
- * (1 << page_cluster)) {
- while (i++ < num)
- swap_free(SWP_ENTRY(SWP_TYPE(entry), offset++));
+ if (atomic_read(&nr_async_pages) >=
+ pager_daemon.swap_cluster << page_cluster)
break;
- }
/* Ok, do the async read-ahead now */
new_page = read_swap_cache_async(SWP_ENTRY(SWP_TYPE(entry), offset));
- if (new_page != NULL)
- page_cache_release(new_page);
- swap_free(SWP_ENTRY(SWP_TYPE(entry), offset));
+ if (!new_page)
+ break;
+ page_cache_release(new_page);
}
return;
}
@@ -1164,11 +1156,12 @@ static int do_swap_page(struct mm_struct * mm,
pte = mk_pte(page, vma->vm_page_prot);
swap_free(entry);
+ mark_page_accessed(page);
if (exclusive_swap_page(page)) {
if (vma->vm_flags & VM_WRITE)
pte = pte_mkwrite(pte);
pte = pte_mkdirty(pte);
- delete_from_swap_cache_nolock(page);
+ delete_from_swap_cache(page);
}
UnlockPage(page);
diff --git a/mm/shmem.c b/mm/shmem.c
index 1e4efc4a0ff5..a24c868edca0 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -234,45 +234,55 @@ static int shmem_writepage(struct page * page)
int error;
struct shmem_inode_info *info;
swp_entry_t *entry, swap;
+ struct address_space *mapping;
+ unsigned long index;
struct inode *inode;
if (!PageLocked(page))
BUG();
-
- inode = page->mapping->host;
+
+ mapping = page->mapping;
+ index = page->index;
+ inode = mapping->host;
info = &inode->u.shmem_i;
- swap = __get_swap_page(2);
- error = -ENOMEM;
- if (!swap.val) {
- activate_page(page);
- SetPageDirty(page);
- goto out;
- }
spin_lock(&info->lock);
- entry = shmem_swp_entry(info, page->index);
- if (IS_ERR(entry)) /* this had been allocted on page allocation */
+ entry = shmem_swp_entry(info, index);
+ if (IS_ERR(entry)) /* this had been allocated on page allocation */
BUG();
- shmem_recalc_inode(page->mapping->host);
- error = -EAGAIN;
+ shmem_recalc_inode(inode);
if (entry->val)
BUG();
- *entry = swap;
- error = 0;
- /* Remove the from the page cache */
+ /* Remove it from the page cache */
lru_cache_del(page);
remove_inode_page(page);
+ swap_list_lock();
+ swap = get_swap_page();
+
+ if (!swap.val) {
+ swap_list_unlock();
+ /* Add it back to the page cache */
+ add_to_page_cache_locked(page, mapping, index);
+ activate_page(page);
+ SetPageDirty(page);
+ error = -ENOMEM;
+ goto out;
+ }
+
/* Add it to the swap cache */
add_to_swap_cache(page, swap);
- page_cache_release(page);
- info->swapped++;
+ swap_list_unlock();
- spin_unlock(&info->lock);
set_page_dirty(page);
+ info->swapped++;
+ *entry = swap;
+ error = 0;
out:
+ spin_unlock(&info->lock);
UnlockPage(page);
+ page_cache_release(page);
return error;
}
@@ -356,7 +366,7 @@ repeat:
swap_free(*entry);
*entry = (swp_entry_t) {0};
- delete_from_swap_cache_nolock(page);
+ delete_from_swap_cache(page);
flags = page->flags & ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_referenced | 1 << PG_arch_1);
page->flags = flags | (1 << PG_dirty);
add_to_page_cache_locked(page, mapping, idx);
diff --git a/mm/swap.c b/mm/swap.c
index 18b504deb45c..37b9ea1babb6 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -54,7 +54,6 @@ void deactivate_page_nolock(struct page * page)
del_page_from_active_list(page);
add_page_to_inactive_list(page);
}
- ClearPageReferenced(page);
}
void deactivate_page(struct page * page)
@@ -73,7 +72,6 @@ void activate_page_nolock(struct page * page)
del_page_from_inactive_list(page);
add_page_to_active_list(page);
}
- SetPageReferenced(page);
}
void activate_page(struct page * page)
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 0963ca7a9f41..ed712d227fc9 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -23,17 +23,11 @@
*/
static int swap_writepage(struct page *page)
{
- /* One for the page cache, one for this user, one for page->buffers */
- if (page_count(page) > 2 + !!page->buffers)
- goto in_use;
- if (swap_count(page) > 1)
- goto in_use;
-
- delete_from_swap_cache_nolock(page);
- UnlockPage(page);
- return 0;
-
-in_use:
+ if (exclusive_swap_page(page)) {
+ delete_from_swap_cache(page);
+ UnlockPage(page);
+ return 0;
+ }
rw_swap_page(WRITE, page);
return 0;
}
@@ -75,8 +69,6 @@ void add_to_swap_cache(struct page *page, swp_entry_t entry)
#endif
if (!PageLocked(page))
BUG();
- if (PageTestandSetSwapCache(page))
- BUG();
if (page->mapping)
BUG();
@@ -92,51 +84,42 @@ void add_to_swap_cache(struct page *page, swp_entry_t entry)
*/
void __delete_from_swap_cache(struct page *page)
{
- struct address_space *mapping = page->mapping;
- swp_entry_t entry;
-
#ifdef SWAP_CACHE_INFO
swap_cache_del_total++;
#endif
- if (mapping != &swapper_space)
+ if (!PageLocked(page))
BUG();
- if (!PageSwapCache(page) || !PageLocked(page))
+ if (!PageSwapCache(page))
BUG();
- entry.val = page->index;
- PageClearSwapCache(page);
ClearPageDirty(page);
__remove_inode_page(page);
- swap_free(entry);
}
/*
- * This will never put the page into the free list, the caller has
- * a reference on the page.
+ * This must be called only on pages that have
+ * been verified to be in the swap cache and locked.
+ * It will never put the page into the free list,
+ * the caller has a reference on the page.
*/
-void delete_from_swap_cache_nolock(struct page *page)
+void delete_from_swap_cache(struct page *page)
{
+ swp_entry_t entry;
+
if (!PageLocked(page))
BUG();
if (block_flushpage(page, 0))
lru_cache_del(page);
+ entry.val = page->index;
+
spin_lock(&pagecache_lock);
__delete_from_swap_cache(page);
spin_unlock(&pagecache_lock);
- page_cache_release(page);
-}
-/*
- * This must be called only on pages that have
- * been verified to be in the swap cache and locked.
- */
-void delete_from_swap_cache(struct page *page)
-{
- lock_page(page);
- delete_from_swap_cache_nolock(page);
- UnlockPage(page);
+ swap_free(entry);
+ page_cache_release(page);
}
/*
@@ -156,7 +139,7 @@ void free_page_and_swap_cache(struct page *page)
*/
if (PageSwapCache(page) && !TryLockPage(page)) {
if (exclusive_swap_page(page))
- delete_from_swap_cache_nolock(page);
+ delete_from_swap_cache(page);
UnlockPage(page);
}
page_cache_release(page);
@@ -213,19 +196,24 @@ struct page * read_swap_cache_async(swp_entry_t entry)
new_page = alloc_page(GFP_HIGHUSER);
if (!new_page)
goto out; /* Out of memory */
+ if (TryLockPage(new_page))
+ BUG();
/*
* Check the swap cache again, in case we stalled above.
- * The BKL is guarding against races between this check
+ * swap_list_lock is guarding against races between this check
* and where the new page is added to the swap cache below.
+ * It is also guarding against race where try_to_swap_out
+ * allocates entry with get_swap_page then adds to cache.
*/
+ swap_list_lock();
found_page = __find_get_page(&swapper_space, entry.val, hash);
if (found_page)
goto out_free_page;
/*
* Make sure the swap entry is still in use. It could have gone
- * while caller waited for BKL, or while allocating page above,
+ * since caller dropped page_table_lock, while allocating page above,
* or while allocating page in prior call via swapin_readahead.
*/
if (!swap_duplicate(entry)) /* Account for the swap cache */
@@ -234,13 +222,15 @@ struct page * read_swap_cache_async(swp_entry_t entry)
/*
* Add it to the swap cache and read its contents.
*/
- if (TryLockPage(new_page))
- BUG();
add_to_swap_cache(new_page, entry);
+ swap_list_unlock();
+
rw_swap_page(READ, new_page);
return new_page;
out_free_page:
+ swap_list_unlock();
+ UnlockPage(new_page);
page_cache_release(new_page);
out:
return found_page;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index c9783aa2dd70..f3b73b43abe6 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -14,6 +14,7 @@
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/shm.h>
+#include <linux/compiler.h>
#include <asm/pgtable.h>
@@ -33,7 +34,7 @@ struct swap_info_struct swap_info[MAX_SWAPFILES];
#define SWAPFILE_CLUSTER 256
-static inline int scan_swap_map(struct swap_info_struct *si, unsigned short count)
+static inline int scan_swap_map(struct swap_info_struct *si)
{
unsigned long offset;
/*
@@ -86,7 +87,8 @@ static inline int scan_swap_map(struct swap_info_struct *si, unsigned short coun
si->lowest_bit = si->max;
si->highest_bit = 0;
}
- si->swap_map[offset] = count;
+ /* Initial count 1 for user reference + 1 for swap cache */
+ si->swap_map[offset] = 2;
nr_swap_pages--;
si->cluster_next = offset+1;
return offset;
@@ -96,7 +98,12 @@ static inline int scan_swap_map(struct swap_info_struct *si, unsigned short coun
return 0;
}
-swp_entry_t __get_swap_page(unsigned short count)
+/*
+ * Callers of get_swap_page must hold swap_list_lock across the call,
+ * and across the following add_to_swap_cache, to guard against races
+ * with read_swap_cache_async.
+ */
+swp_entry_t get_swap_page(void)
{
struct swap_info_struct * p;
unsigned long offset;
@@ -104,20 +111,17 @@ swp_entry_t __get_swap_page(unsigned short count)
int type, wrapped = 0;
entry.val = 0; /* Out of memory */
- if (count >= SWAP_MAP_MAX)
- goto bad_count;
- swap_list_lock();
type = swap_list.next;
if (type < 0)
goto out;
- if (nr_swap_pages == 0)
+ if (nr_swap_pages <= 0)
goto out;
while (1) {
p = &swap_info[type];
if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
swap_device_lock(p);
- offset = scan_swap_map(p, count);
+ offset = scan_swap_map(p);
swap_device_unlock(p);
if (offset) {
entry = SWP_ENTRY(type,offset);
@@ -142,21 +146,14 @@ swp_entry_t __get_swap_page(unsigned short count)
goto out; /* out of swap space */
}
out:
- swap_list_unlock();
- return entry;
-
-bad_count:
- printk(KERN_ERR "get_swap_page: bad count %hd from %p\n",
- count, __builtin_return_address(0));
return entry;
}
-
/*
* Caller has made sure that the swapdevice corresponding to entry
* is still around or has not been recycled.
*/
-void __swap_free(swp_entry_t entry, unsigned short count)
+void swap_free(swp_entry_t entry)
{
struct swap_info_struct * p;
unsigned long offset, type;
@@ -180,9 +177,7 @@ void __swap_free(swp_entry_t entry, unsigned short count)
swap_list.next = type;
swap_device_lock(p);
if (p->swap_map[offset] < SWAP_MAP_MAX) {
- if (p->swap_map[offset] < count)
- goto bad_count;
- if (!(p->swap_map[offset] -= count)) {
+ if (!--(p->swap_map[offset])) {
if (offset < p->lowest_bit)
p->lowest_bit = offset;
if (offset > p->highest_bit)
@@ -207,11 +202,6 @@ bad_offset:
bad_free:
printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val);
goto out;
-bad_count:
- swap_device_unlock(p);
- swap_list_unlock();
- printk(KERN_ERR "swap_free: Bad count %hd current count %hd\n", count, p->swap_map[offset]);
- goto out;
}
/*
@@ -229,9 +219,9 @@ static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address,
{
pte_t pte = *dir;
- if (pte_to_swp_entry(pte).val != entry.val)
+ if (likely(pte_to_swp_entry(pte).val != entry.val))
return;
- if (pte_none(pte) || pte_present(pte))
+ if (unlikely(pte_none(pte) || pte_present(pte)))
return;
get_page(page);
set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
@@ -458,7 +448,7 @@ static int try_to_unuse(unsigned int type)
*/
lock_page(page);
if (PageSwapCache(page))
- delete_from_swap_cache_nolock(page);
+ delete_from_swap_cache(page);
SetPageDirty(page);
UnlockPage(page);
flush_page_to_ram(page);
@@ -567,14 +557,8 @@ asmlinkage long sys_swapoff(const char * specialfile)
for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
p = swap_info + type;
if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) {
- if (p->swap_file) {
- if (p->swap_file == nd.dentry)
- break;
- } else {
- if (S_ISBLK(nd.dentry->d_inode->i_mode)
- && (p->swap_device == nd.dentry->d_inode->i_rdev))
- break;
- }
+ if (p->swap_file == nd.dentry)
+ break;
}
prev = type;
}
@@ -616,19 +600,21 @@ asmlinkage long sys_swapoff(const char * specialfile)
goto out_dput;
}
if (p->swap_device)
- blkdev_put(nd.dentry->d_inode->i_bdev, BDEV_SWAP);
+ blkdev_put(p->swap_file->d_inode->i_bdev, BDEV_SWAP);
path_release(&nd);
swap_list_lock();
- nd.dentry = p->swap_file;
- p->swap_file = NULL;
+ swap_device_lock(p);
nd.mnt = p->swap_vfsmnt;
+ nd.dentry = p->swap_file;
p->swap_vfsmnt = NULL;
+ p->swap_file = NULL;
p->swap_device = 0;
p->max = 0;
swap_map = p->swap_map;
p->swap_map = NULL;
p->flags = 0;
+ swap_device_unlock(p);
swap_list_unlock();
vfree(swap_map);
err = 0;
@@ -711,6 +697,7 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
unsigned long maxpages = 1;
int swapfilesize;
struct block_device *bdev = NULL;
+ unsigned short *swap_map;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -760,6 +747,7 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
p->swap_device = dev;
set_blocksize(dev, PAGE_SIZE);
+ bd_acquire(swap_inode);
bdev = swap_inode->i_bdev;
bdops = devfs_get_ops(devfs_get_handle_from_inode(swap_inode));
if (bdops) bdev->bd_op = bdops;
@@ -772,29 +760,24 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
if (!dev || (blk_size[MAJOR(dev)] &&
!blk_size[MAJOR(dev)][MINOR(dev)]))
goto bad_swap;
- error = -EBUSY;
- for (i = 0 ; i < nr_swapfiles ; i++) {
- if (i == type)
- continue;
- if (dev == swap_info[i].swap_device)
- goto bad_swap;
- }
swapfilesize = 0;
if (blk_size[MAJOR(dev)])
swapfilesize = blk_size[MAJOR(dev)][MINOR(dev)]
>> (PAGE_SHIFT - 10);
- } else if (S_ISREG(swap_inode->i_mode)) {
- error = -EBUSY;
- for (i = 0 ; i < nr_swapfiles ; i++) {
- if (i == type || !swap_info[i].swap_file)
- continue;
- if (swap_inode == swap_info[i].swap_file->d_inode)
- goto bad_swap;
- }
+ } else if (S_ISREG(swap_inode->i_mode))
swapfilesize = swap_inode->i_size >> PAGE_SHIFT;
- } else
+ else
goto bad_swap;
+ error = -EBUSY;
+ for (i = 0 ; i < nr_swapfiles ; i++) {
+ struct swap_info_struct *q = &swap_info[i];
+ if (i == type || !q->swap_file)
+ continue;
+ if (swap_inode->i_mapping == q->swap_file->d_inode->i_mapping)
+ goto bad_swap;
+ }
+
swap_header = (void *) __get_free_page(GFP_USER);
if (!swap_header) {
printk("Unable to start swapping: out of memory :-)\n");
@@ -900,6 +883,7 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
}
p->swap_map[0] = SWAP_MAP_BAD;
swap_list_lock();
+ swap_device_lock(p);
p->max = maxpages;
p->flags = SWP_WRITEOK;
p->pages = nr_good_pages;
@@ -922,6 +906,7 @@ asmlinkage long sys_swapon(const char * specialfile, int swap_flags)
} else {
swap_info[prev].next = p - swap_info;
}
+ swap_device_unlock(p);
swap_list_unlock();
error = 0;
goto out;
@@ -929,11 +914,10 @@ bad_swap:
if (bdev)
blkdev_put(bdev, BDEV_SWAP);
bad_swap_2:
- if (p->swap_map)
- vfree(p->swap_map);
+ swap_list_lock();
+ swap_map = p->swap_map;
nd.mnt = p->swap_vfsmnt;
nd.dentry = p->swap_file;
- swap_list_lock();
p->swap_device = 0;
p->swap_file = NULL;
p->swap_vfsmnt = NULL;
@@ -942,6 +926,8 @@ bad_swap_2:
if (!(swap_flags & SWAP_FLAG_PREFER))
++least_priority;
swap_list_unlock();
+ if (swap_map)
+ vfree(swap_map);
path_release(&nd);
out:
if (swap_header)
@@ -987,43 +973,31 @@ int swap_duplicate(swp_entry_t entry)
unsigned long offset, type;
int result = 0;
- /* Swap entry 0 is illegal */
- if (!entry.val)
- goto out;
type = SWP_TYPE(entry);
if (type >= nr_swapfiles)
goto bad_file;
p = type + swap_info;
offset = SWP_OFFSET(entry);
- if (offset >= p->max)
- goto bad_offset;
- if (!p->swap_map[offset])
- goto bad_unused;
- /*
- * Entry is valid, so increment the map count.
- */
+
swap_device_lock(p);
- if (p->swap_map[offset] < SWAP_MAP_MAX)
- p->swap_map[offset]++;
- else {
- if (swap_overflow++ < 5)
- printk(KERN_WARNING "swap_dup: swap entry overflow\n");
- p->swap_map[offset] = SWAP_MAP_MAX;
+ if (offset < p->max && p->swap_map[offset]) {
+ if (p->swap_map[offset] < SWAP_MAP_MAX - 1) {
+ p->swap_map[offset]++;
+ result = 1;
+ } else if (p->swap_map[offset] <= SWAP_MAP_MAX) {
+ if (swap_overflow++ < 5)
+ printk(KERN_WARNING "swap_dup: swap entry overflow\n");
+ p->swap_map[offset] = SWAP_MAP_MAX;
+ result = 1;
+ }
}
swap_device_unlock(p);
- result = 1;
out:
return result;
bad_file:
printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val);
goto out;
-bad_offset:
- /* Don't report: can happen in read_swap_cache_async after swapoff */
- goto out;
-bad_unused:
- /* Don't report: can happen in read_swap_cache_async after blocking */
- goto out;
}
/*
@@ -1068,7 +1042,7 @@ bad_unused:
}
/*
- * Kernel_lock protects against swap device deletion.
+ * Prior swap_duplicate protects against swap device deletion.
*/
void get_swaphandle_info(swp_entry_t entry, unsigned long *offset,
kdev_t *dev, struct inode **swapf)
@@ -1108,8 +1082,8 @@ void get_swaphandle_info(swp_entry_t entry, unsigned long *offset,
}
/*
- * Kernel_lock protects against swap device deletion. Grab an extra
- * reference on the swaphandle so that it dos not become unused.
+ * swap_device_lock prevents swap_map being freed. Don't grab an extra
+ * reference on the swaphandle, it doesn't matter if it becomes unused.
*/
int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
{
@@ -1117,20 +1091,23 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset)
unsigned long toff;
struct swap_info_struct *swapdev = SWP_TYPE(entry) + swap_info;
- *offset = SWP_OFFSET(entry);
- toff = *offset = (*offset >> page_cluster) << page_cluster;
+ if (!page_cluster) /* no readahead */
+ return 0;
+ toff = (SWP_OFFSET(entry) >> page_cluster) << page_cluster;
+ if (!toff) /* first page is swap header */
+ toff++, i--;
+ *offset = toff;
swap_device_lock(swapdev);
do {
/* Don't read-ahead past the end of the swap area */
if (toff >= swapdev->max)
break;
- /* Don't read in bad or busy pages */
+ /* Don't read in free or bad pages */
if (!swapdev->swap_map[toff])
break;
if (swapdev->swap_map[toff] == SWAP_MAP_BAD)
break;
- swapdev->swap_map[toff]++;
toff++;
ret++;
} while (--i);
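
[Editor's note: as the new comment above get_swap_page() states, callers must
hold swap_list_lock() across both the allocation and the following
add_to_swap_cache(). A hedged sketch of that protocol as a hypothetical
helper — the calls are from the hunks in this patch, the helper itself is
illustrative — matching what shmem_writepage() above and try_to_swap_out()
in mm/vmscan.c below now do.]

	/* Hypothetical helper (not in the patch): move a locked page into
	 * the swap cache under the new protocol.  Holding swap_list_lock()
	 * across get_swap_page() and add_to_swap_cache() closes the race
	 * with read_swap_cache_async(); the entry's map count starts at 2
	 * (user reference + swap cache). */
	static int add_locked_page_to_swap(struct page *page)
	{
		swp_entry_t entry;

		swap_list_lock();
		entry = get_swap_page();
		if (!entry.val) {
			swap_list_unlock();
			return -ENOMEM;	/* no swap space left */
		}
		add_to_swap_cache(page, entry);
		swap_list_unlock();
		set_page_dirty(page);	/* written later by swap_writepage() */
		return 0;
	}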
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b6d222ea4e2e..1c3f6b3e2f92 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -52,14 +52,9 @@ static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct*
/* Don't look at this pte if it's been accessed recently. */
if (ptep_test_and_clear_young(page_table)) {
flush_tlb_page(vma, address);
- mark_page_accessed(page);
return 0;
}
- /* Don't bother with it if the page is otherwise active */
- if (PageActive(page))
- return 0;
-
if (TryLockPage(page))
return 0;
@@ -85,8 +80,8 @@ static inline int try_to_swap_out(struct mm_struct * mm, struct vm_area_struct*
entry.val = page->index;
if (pte_dirty(pte))
set_page_dirty(page);
-set_swap_pte:
swap_duplicate(entry);
+set_swap_pte:
set_pte(page_table, swp_entry_to_pte(entry));
drop_pte:
mm->rss--;
@@ -130,16 +125,18 @@ drop_pte:
* we have the swap cache set up to associate the
* page with that swap entry.
*/
+ swap_list_lock();
entry = get_swap_page();
- if (!entry.val)
- goto out_unlock_restore; /* No swap space left */
-
- /* Add it to the swap cache and mark it dirty */
- add_to_swap_cache(page, entry);
- set_page_dirty(page);
- goto set_swap_pte;
+ if (entry.val) {
+ /* Add it to the swap cache and mark it dirty */
+ add_to_swap_cache(page, entry);
+ swap_list_unlock();
+ set_page_dirty(page);
+ goto set_swap_pte;
+ }
-out_unlock_restore:
+ /* No swap space left */
+ swap_list_unlock();
set_pte(page_table, pte);
UnlockPage(page);
return 0;
@@ -243,9 +240,9 @@ static inline int swap_out_vma(struct mm_struct * mm, struct vm_area_struct * vm
struct mm_struct *swap_mm = &init_mm;
/*
- * Returns non-zero if we scanned all `count' pages
+ * Returns remaining count of pages to be swapped out by followup call.
*/
-static inline int swap_out_mm(struct mm_struct * mm, int count, int * race, zone_t * classzone)
+static inline int swap_out_mm(struct mm_struct * mm, int count, int * mmcounter, zone_t * classzone)
{
unsigned long address;
struct vm_area_struct* vma;
@@ -255,11 +252,12 @@ static inline int swap_out_mm(struct mm_struct * mm, int count, int * race, zone
* and ptes.
*/
spin_lock(&mm->page_table_lock);
- *race = 1;
- if (swap_mm != mm)
- goto out_unlock;
- *race = 0;
address = mm->swap_address;
+ if (address == TASK_SIZE || swap_mm != mm) {
+ /* We raced: don't count this mm but try again */
+ ++*mmcounter;
+ goto out_unlock;
+ }
vma = find_vma(mm, address);
if (vma) {
if (address < vma->vm_start)
@@ -267,31 +265,26 @@ static inline int swap_out_mm(struct mm_struct * mm, int count, int * race, zone
for (;;) {
count = swap_out_vma(mm, vma, address, count, classzone);
- if (!count)
- goto out_unlock;
vma = vma->vm_next;
if (!vma)
break;
+ if (!count)
+ goto out_unlock;
address = vma->vm_start;
}
}
- /* Reset to 0 when we reach the end of address space */
- mm->swap_address = 0;
-
- spin_lock(&mmlist_lock);
- swap_mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist);
- spin_unlock(&mmlist_lock);
+ /* Indicate that we reached the end of address space */
+ mm->swap_address = TASK_SIZE;
out_unlock:
spin_unlock(&mm->page_table_lock);
-
return count;
}
static int FASTCALL(swap_out(unsigned int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages));
static int swap_out(unsigned int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages)
{
- int counter, race;
+ int counter;
struct mm_struct *mm;
/* Then, look at the other mm's */
@@ -304,9 +297,10 @@ static int swap_out(unsigned int priority, zone_t * classzone, unsigned int gfp_
spin_lock(&mmlist_lock);
mm = swap_mm;
- if (mm == &init_mm) {
+ while (mm->swap_address == TASK_SIZE || mm == &init_mm) {
+ mm->swap_address = 0;
mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist);
- if (mm == &init_mm)
+ if (mm == swap_mm)
goto empty;
swap_mm = mm;
}
@@ -315,13 +309,13 @@ static int swap_out(unsigned int priority, zone_t * classzone, unsigned int gfp_
atomic_inc(&mm->mm_users);
spin_unlock(&mmlist_lock);
- nr_pages = swap_out_mm(mm, nr_pages, &race, classzone);
+ nr_pages = swap_out_mm(mm, nr_pages, &counter, classzone);
mmput(mm);
if (!nr_pages)
return 1;
- } while (race || --counter >= 0);
+ } while (--counter >= 0);
return 0;
@@ -330,15 +324,15 @@ empty:
return 0;
}
-static int FASTCALL(shrink_cache(struct list_head * lru, int * max_scan, int this_max_scan, int nr_pages, zone_t * classzone, unsigned int gfp_mask));
-static int shrink_cache(struct list_head * lru, int * max_scan, int this_max_scan, int nr_pages, zone_t * classzone, unsigned int gfp_mask)
+static int FASTCALL(shrink_cache(int nr_pages, int max_scan, zone_t * classzone, unsigned int gfp_mask));
+static int shrink_cache(int nr_pages, int max_scan, zone_t * classzone, unsigned int gfp_mask)
{
struct list_head * entry;
- int __max_scan = *max_scan;
spin_lock(&pagemap_lru_lock);
- while (__max_scan && this_max_scan && (entry = lru->prev) != lru) {
+ while (max_scan && (entry = inactive_list.prev) != &inactive_list) {
struct page * page;
+ swp_entry_t swap;
if (unlikely(current->need_resched)) {
spin_unlock(&pagemap_lru_lock);
@@ -353,18 +347,16 @@ static int shrink_cache(struct list_head * lru, int * max_scan, int this_max_sca
if (unlikely(!PageInactive(page) && !PageActive(page)))
BUG();
- this_max_scan--;
-
list_del(entry);
- list_add(entry, lru);
+ list_add(entry, &inactive_list);
if (PageTestandClearReferenced(page))
continue;
+ max_scan--;
+
if (unlikely(!memclass(page->zone, classzone)))
continue;
- __max_scan--;
-
/* Racy check to avoid trylocking when not worthwhile */
if (!page->buffers && page_count(page) != 1)
continue;
@@ -479,14 +471,24 @@ static int shrink_cache(struct list_head * lru, int * max_scan, int this_max_sca
}
/* point of no return */
- if (likely(!PageSwapCache(page)))
+ if (likely(!PageSwapCache(page))) {
+ swap.val = 0;
__remove_inode_page(page);
- else
+ } else {
+ swap.val = page->index;
__delete_from_swap_cache(page);
+ }
spin_unlock(&pagecache_lock);
__lru_cache_del(page);
+ if (unlikely(swap.val != 0)) {
+ /* must drop lru lock if getting swap_list lock */
+ spin_unlock(&pagemap_lru_lock);
+ swap_free(swap);
+ spin_lock(&pagemap_lru_lock);
+ }
+
UnlockPage(page);
/* effectively free the page here */
@@ -498,7 +500,6 @@ static int shrink_cache(struct list_head * lru, int * max_scan, int this_max_sca
}
spin_unlock(&pagemap_lru_lock);
- *max_scan = __max_scan;
return nr_pages;
}
@@ -509,14 +510,10 @@ static int shrink_cache(struct list_head * lru, int * max_scan, int this_max_sca
* We move them the other way when we see the
* reference bit on the page.
*/
-static void balance_inactive(int nr_pages)
+static void refill_inactive(int nr_pages)
{
struct list_head * entry;
- /* If we have more inactive pages than active don't do anything */
- if (nr_active_pages < nr_inactive_pages)
- return;
-
spin_lock(&pagemap_lru_lock);
entry = active_list.prev;
while (nr_pages-- && entry != &active_list) {
@@ -541,14 +538,17 @@ static void balance_inactive(int nr_pages)
static int FASTCALL(shrink_caches(int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages));
static int shrink_caches(int priority, zone_t * classzone, unsigned int gfp_mask, int nr_pages)
{
- int max_scan = (nr_inactive_pages + nr_active_pages / DEF_PRIORITY) / priority;
+ int max_scan = nr_inactive_pages / priority;
nr_pages -= kmem_cache_reap(gfp_mask);
if (nr_pages <= 0)
return 0;
- balance_inactive(nr_pages);
- nr_pages = shrink_cache(&inactive_list, &max_scan, nr_inactive_pages, nr_pages, classzone, gfp_mask);
+ /* Do we want to age the active list? */
+ if (nr_inactive_pages < nr_active_pages*2)
+ refill_inactive(nr_pages);
+
+ nr_pages = shrink_cache(nr_pages, max_scan, classzone, gfp_mask);
if (nr_pages <= 0)
return 0;