From 2e0376aabc38f32620c2f52a7ac596ecd632f165 Mon Sep 17 00:00:00 2001 From: Douglas Gilbert Date: Tue, 13 Aug 2002 23:54:08 -0500 Subject: This version of sg for the lk 2.5 series re-adds direct IO support using work done by Kai Makisara (on st driver, posted 2002/7/29). Changelog: Changes since 3.5.26 (20020708) - re-add direct IO using Kai Makisara's work - re-tab to 8, start using C99-isms - simplify memory management Like Kai's patch, this one needs kernel/ksyms.c altered to export get_user_pages(). Kai's worker routines st_map_user_pages() and st_unmap_user_pages() are duplicated as is. Hopefully these routines will find a home in a library soon. The re-tabbing makes the patches rather large so here are 2 urls: This tarball contains sg.h and sg.c http://www.torque.net/sg/p/sg3527.tgz This gzipped patch is against lk 2.5.31 and touches kernel/ksyms.c as well http://www.torque.net/sg/p/sg_3527_lk2531.diff.gz Testing is ongoing, everything works apart from "zero copy" copy. That uses mmap-ed IO on the read side and direct IO on the write side. Not too many people would be using that I suspect. Doug Gilbert --- include/scsi/sg.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/scsi/sg.h b/include/scsi/sg.h index 5d609266c70c..9ffc9f960c7e 100644 --- a/include/scsi/sg.h +++ b/include/scsi/sg.h @@ -11,9 +11,13 @@ Original driver (sg.h): Version 2 and 3 extensions to driver: * Copyright (C) 1998 - 2002 Douglas Gilbert - Version: 3.5.26 (20020708) + Version: 3.5.27 (20020812) This version is for 2.5 series kernels. 
+ Changes since 3.5.26 (20020708) + - re-add direct IO using Kai Makisara's work + - re-tab to 8, start using C99-isms + - simplify memory management Changes since 3.5.25 (20020504) - driverfs additions - copy_to/from_user() fixes [William Stinson] -- cgit v1.2.3 From 16dc2073cc61ce5ede5392cf23ebaeea8a988f4a Mon Sep 17 00:00:00 2001 From: David Brownell Date: Wed, 14 Aug 2002 00:11:37 -0700 Subject: [PATCH] USB core cleanups Moves some functions that are only used by usbfs to be private, and documents some of the interface issues that need to be cleaned up. --- drivers/usb/core/devio.c | 20 +++++--- drivers/usb/core/hcd.h | 10 ++++ drivers/usb/core/hub.h | 4 ++ drivers/usb/core/message.c | 2 + drivers/usb/core/usb.c | 117 +++++++++++++++++---------------------------- include/linux/usb.h | 17 ++----- 6 files changed, 78 insertions(+), 92 deletions(-) (limited to 'include') diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index e80a3ed170f8..0b81ee069ec0 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -46,6 +46,7 @@ #include #include +#include "hcd.h" /* for usbcore internals */ struct async { struct list_head asynclist; @@ -724,7 +725,7 @@ static int proc_resetdevice(struct dev_state *ps) lock_kernel(); if (intf->driver && ps->dev) { - usb_bind_driver(intf->driver,ps->dev, i); + usb_bind_driver (intf->driver, intf); } unlock_kernel(); } @@ -1062,8 +1063,8 @@ static int proc_ioctl (struct dev_state *ps, void *arg) int size; void *buf = 0; int retval = 0; - struct usb_interface *ifp = 0; - struct usb_driver *driver = 0; + struct usb_interface *ifp = 0; + struct usb_driver *driver = 0; /* get input parameters and alloc buffer */ if (copy_from_user(&ctrl, (void *) arg, sizeof (ctrl))) @@ -1102,10 +1103,10 @@ static int proc_ioctl (struct dev_state *ps, void *arg) unlock_kernel(); break; - /* let kernel drivers try to (re)bind to the interface */ - case USBDEVFS_CONNECT: - usb_find_interface_driver_for_ifnum (ps->dev, ctrl.ifno); - 
break; + /* let kernel drivers try to (re)bind to the interface */ + case USBDEVFS_CONNECT: + usb_find_interface_driver (ps->dev, ifp); + break; /* talk directly to the interface's driver */ default: @@ -1144,6 +1145,11 @@ static int proc_ioctl (struct dev_state *ps, void *arg) return retval; } +/* + * NOTE: All requests here that have interface numbers as parameters + * are assuming that somehow the configuration has been prevented from + * changing. But there's no mechanism to ensure that... + */ static int usbdev_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) { struct dev_state *ps = (struct dev_state *)file->private_data; diff --git a/drivers/usb/core/hcd.h b/drivers/usb/core/hcd.h index e33a8b409ee4..4e3efb0a3a2b 100644 --- a/drivers/usb/core/hcd.h +++ b/drivers/usb/core/hcd.h @@ -348,6 +348,16 @@ extern struct semaphore usb_bus_list_lock; extern void usb_bus_get (struct usb_bus *bus); extern void usb_bus_put (struct usb_bus *bus); +extern struct usb_interface *usb_ifnum_to_if (struct usb_device *dev, + unsigned ifnum); + +extern int usb_find_interface_driver (struct usb_device *dev, + struct usb_interface *interface); + +/* for probe/disconnect with correct module usage counting */ +void *usb_bind_driver(struct usb_driver *driver, struct usb_interface *intf); +void usb_unbind_driver(struct usb_device *device, struct usb_interface *intf); + /*-------------------------------------------------------------------------*/ /* hub.h ... 
DeviceRemovable in 2.4.2-ac11, gone in 2.4.10 */ diff --git a/drivers/usb/core/hub.h b/drivers/usb/core/hub.h index f709bb86d8a1..6b7a18e8f378 100644 --- a/drivers/usb/core/hub.h +++ b/drivers/usb/core/hub.h @@ -123,6 +123,10 @@ struct usb_hub_status { * Hub descriptor * See USB 2.0 spec Table 11-13 */ + +#define USB_DT_HUB (USB_TYPE_CLASS | 0x09) +#define USB_DT_HUB_NONVAR_SIZE 7 + struct usb_hub_descriptor { __u8 bDescLength; __u8 bDescriptorType; diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c index b7955bc73ae0..91ac190c9569 100644 --- a/drivers/usb/core/message.c +++ b/drivers/usb/core/message.c @@ -8,6 +8,8 @@ #include #include +#include "hcd.h" /* for usbcore internals */ + struct usb_api_data { wait_queue_head_t wqh; int done; diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c index 22aaddd711d9..718389719dc4 100644 --- a/drivers/usb/core/usb.c +++ b/drivers/usb/core/usb.c @@ -52,7 +52,6 @@ extern void usb_major_cleanup(void); * Prototypes for the device driver probing/loading functions */ static void usb_find_drivers(struct usb_device *); -static int usb_find_interface_driver(struct usb_device *, unsigned int); static void usb_check_support(struct usb_device *); /* @@ -119,7 +118,7 @@ void usb_scan_devices(void) } /** - * usb_unbind_driver - disconnects a driver from a device + * usb_unbind_driver - disconnects a driver from a device (usbcore-internal) * @device: usb device to be disconnected * @intf: interface of the device to be disconnected * Context: BKL held @@ -164,21 +163,27 @@ void usb_unbind_driver(struct usb_device *device, struct usb_interface *intf) } /** - * usb_bind_driver - connect a driver to a device's interface - * @driver: device driver to be bound to a devices interface - * @dev: device to be bound - * @ifnum: index number of the interface to be used + * usb_bind_driver - connect a driver to a device's interface (usbcore-internal) + * @driver: device driver to be bound to interface + * @interface: 
interface that the driver will be using + * Context: BKL held + * + * Does a safe binding of a driver to one of a device's interfaces. + * Returns the driver's data for the binding, or null indicating + * that the driver did not bind to this interface. * - * Does a save binding of a driver to a device's interface - * Returns a pointer to the drivers private description of the binding + * This differs from usb_driver_claim_interface(), which is called from + * drivers and neither calls the driver's probe() entry nor does any + * locking to guard against removing driver modules. */ - -void *usb_bind_driver(struct usb_driver *driver, struct usb_device *dev, unsigned int ifnum) +void * +usb_bind_driver (struct usb_driver *driver, struct usb_interface *interface) { int i,m; void *private = NULL; const struct usb_device_id *id; - struct usb_interface *interface; + struct usb_device *dev = interface_to_usbdev (interface); + int ifnum; if (driver->owner) { m = try_inc_mod_count(driver->owner); @@ -187,7 +192,14 @@ void *usb_bind_driver(struct usb_driver *driver, struct usb_device *dev, unsigne unlock_kernel(); } - interface = &dev->actconfig->interface[ifnum]; + // START TEMPORARY + // driver->probe() hasn't yet changed to take interface not dev+ifnum, + // so we still need ifnum here. + for (ifnum = 0; ifnum < dev->actconfig->bNumInterfaces; ifnum++) + if (&dev->actconfig->interface [ifnum] == interface) + break; + BUG_ON (ifnum == dev->actconfig->bNumInterfaces); + // END TEMPORARY id = driver->id_table; /* new style driver? 
*/ @@ -252,7 +264,7 @@ static void usb_drivers_purge(struct usb_driver *driver,struct usb_device *dev) * This will go through the list looking for another * driver that can handle the device */ - usb_find_interface_driver(dev, i); + usb_find_interface_driver(dev, interface); } } } @@ -294,29 +306,7 @@ void usb_deregister(struct usb_driver *driver) } /** - * usb_ifnum_to_ifpos - convert the interface number to the interface position - * @dev: the device to use - * @ifnum: the interface number (bInterfaceNumber); not interface position - * - * This is used to convert the interface _number_ (as in - * interface.bInterfaceNumber) to the interface _position_ (as in - * dev->actconfig->interface + position). Note that the number is the same as - * the position for all interfaces _except_ devices with interfaces not - * sequentially numbered (e.g., 0, 2, 3, etc). - */ -int usb_ifnum_to_ifpos(struct usb_device *dev, unsigned ifnum) -{ - int i; - - for (i = 0; i < dev->actconfig->bNumInterfaces; i++) - if (dev->actconfig->interface[i].altsetting[0].bInterfaceNumber == ifnum) - return i; - - return -EINVAL; -} - -/** - * usb_ifnum_to_if - get the interface object with a given interface number + * usb_ifnum_to_if - get the interface object with a given interface number (usbcore-internal) * @dev: the device whose current configuration is considered * @ifnum: the desired interface * @@ -392,7 +382,8 @@ static void usb_check_support(struct usb_device *dev) /* now we check this device */ if (dev->devnum > 0) for (i = 0; i < dev->actconfig->bNumInterfaces; i++) - usb_find_interface_driver(dev, i); + usb_find_interface_driver (dev, + dev->actconfig->interface + i); } @@ -605,7 +596,7 @@ usb_match_id(struct usb_device *dev, struct usb_interface *interface, } /* - * This entrypoint gets called for each new device. + * This entrypoint gets called for unclaimed interfaces. * * We now walk the list of registered USB drivers, * looking for one that will accept this interface. 
@@ -620,21 +611,27 @@ usb_match_id(struct usb_device *dev, struct usb_interface *interface, * * Returns: 0 if a driver accepted the interface, -1 otherwise */ -static int usb_find_interface_driver(struct usb_device *dev, unsigned ifnum) +int usb_find_interface_driver ( + struct usb_device *dev, + struct usb_interface *interface +) { struct list_head *tmp; - struct usb_interface *interface; void *private; struct usb_driver *driver; + int ifnum; - if ((!dev) || (ifnum >= dev->actconfig->bNumInterfaces)) { - err("bad find_interface_driver params"); - return -1; - } - down(&dev->serialize); - interface = dev->actconfig->interface + ifnum; + /* FIXME It's just luck that for some devices with drivers that set + * configuration in probe(), the interface numbers still make sense. + * That's one of several unsafe assumptions involved in configuring + * devices, and in binding drivers to their interfaces. + */ + for (ifnum = 0; ifnum < dev->actconfig->bNumInterfaces; ifnum++) + if (&dev->actconfig->interface [ifnum] == interface) + break; + BUG_ON (ifnum == dev->actconfig->bNumInterfaces); if (usb_interface_claimed(interface)) goto out_err; @@ -645,7 +642,7 @@ static int usb_find_interface_driver(struct usb_device *dev, unsigned ifnum) driver = list_entry(tmp, struct usb_driver, driver_list); tmp = tmp->next; - private = usb_bind_driver(driver, dev, ifnum); + private = usb_bind_driver(driver, interface); /* probe() may have changed the config on us */ interface = dev->actconfig->interface + ifnum; @@ -664,25 +661,6 @@ out_err: return -1; } -/** - * usb_find_interface_driver_for_ifnum - finds a usb interface driver for the specified ifnum - * @dev: the device to use - * @ifnum: the interface number (bInterfaceNumber); not interface position! - * - * This converts a ifnum to ifpos via a call to usb_ifnum_to_ifpos and then - * calls usb_find_interface_driver() with the found ifpos. Note - * usb_find_interface_driver's ifnum parameter is actually interface position. 
- */ -int usb_find_interface_driver_for_ifnum(struct usb_device *dev, unsigned ifnum) -{ - int ifpos = usb_ifnum_to_ifpos(dev, ifnum); - - if (0 > ifpos) - return -EINVAL; - - return usb_find_interface_driver(dev, ifpos); -} - #ifdef CONFIG_HOTPLUG /* @@ -956,7 +934,7 @@ static void usb_find_drivers(struct usb_device *dev) /* if this interface hasn't already been claimed */ if (!usb_interface_claimed(interface)) { - if (usb_find_interface_driver(dev, ifnum)) + if (usb_find_interface_driver(dev, interface)) rejected++; else claimed++; @@ -1655,8 +1633,6 @@ module_exit(usb_exit); * These symbols are exported for device (or host controller) * driver modules to use. */ -EXPORT_SYMBOL(usb_ifnum_to_ifpos); -EXPORT_SYMBOL(usb_ifnum_to_if); EXPORT_SYMBOL(usb_epnum_to_ep_desc); EXPORT_SYMBOL(usb_register); @@ -1668,7 +1644,6 @@ EXPORT_SYMBOL(usb_free_dev); EXPORT_SYMBOL(usb_get_dev); EXPORT_SYMBOL(usb_hub_tt_clear_buffer); -EXPORT_SYMBOL(usb_find_interface_driver_for_ifnum); EXPORT_SYMBOL(usb_driver_claim_interface); EXPORT_SYMBOL(usb_interface_claimed); EXPORT_SYMBOL(usb_driver_release_interface); @@ -1679,8 +1654,6 @@ EXPORT_SYMBOL(usb_new_device); EXPORT_SYMBOL(usb_reset_device); EXPORT_SYMBOL(usb_connect); EXPORT_SYMBOL(usb_disconnect); -EXPORT_SYMBOL(usb_bind_driver); -EXPORT_SYMBOL(usb_unbind_driver); EXPORT_SYMBOL(__usb_get_extra_descriptor); diff --git a/include/linux/usb.h b/include/linux/usb.h index 7f44f0ff05ef..cc7f1aa7520f 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -167,10 +167,6 @@ struct usb_device; #define USB_DT_OTHER_SPEED_CONFIG 0x07 #define USB_DT_INTERFACE_POWER 0x08 -// FIXME should be internal to hub driver -#define USB_DT_HUB (USB_TYPE_CLASS | 0x09) -#define USB_DT_HUB_NONVAR_SIZE 7 - /* * Descriptor sizes per descriptor type */ @@ -258,6 +254,8 @@ struct usb_interface { void *private_data; }; #define to_usb_interface(d) container_of(d, struct usb_interface, dev) +#define interface_to_usbdev(intf) \ + container_of(intf->dev.parent, 
struct usb_device, dev) /* USB_DT_CONFIG: Configuration descriptor information. * @@ -303,10 +301,8 @@ struct usb_qualifier_descriptor { __u8 bRESERVED; } __attribute__ ((packed)); -/* helpers for driver access to descriptors */ -extern int usb_ifnum_to_ifpos(struct usb_device *dev, unsigned ifnum); -extern struct usb_interface * - usb_ifnum_to_if(struct usb_device *dev, unsigned ifnum); +// FIXME remove; exported only for drivers/usb/misc/auserwald.c +// prefer usb_device->epnum[0..31] extern struct usb_endpoint_descriptor * usb_epnum_to_ep_desc(struct usb_device *dev, unsigned epnum); @@ -435,10 +431,6 @@ extern void usb_free_dev(struct usb_device *); /* for when layers above USB add new non-USB drivers */ extern void usb_scan_devices(void); -/* for probe/disconnect with correct module usage counting */ -void *usb_bind_driver(struct usb_driver *driver, struct usb_device *dev, unsigned int ifnum); -void usb_unbind_driver(struct usb_device *device, struct usb_interface *intf); - /* mostly for devices emulating SCSI over USB */ extern int usb_reset_device(struct usb_device *dev); @@ -446,7 +438,6 @@ extern int usb_reset_device(struct usb_device *dev); extern int usb_get_current_frame_number (struct usb_device *usb_dev); /* used these for multi-interface device registration */ -extern int usb_find_interface_driver_for_ifnum(struct usb_device *dev, unsigned int ifnum); extern void usb_driver_claim_interface(struct usb_driver *driver, struct usb_interface *iface, void* priv); extern int usb_interface_claimed(struct usb_interface *iface); -- cgit v1.2.3 From f601a8a6374d5e1fdc2a9641dde46709b6dea6dc Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 14 Aug 2002 00:30:03 -0700 Subject: USB: changed usb_match_id to not need the usb_device pointer. 
--- drivers/usb/core/usb.c | 18 +++++++++--------- drivers/usb/serial/usbserial.c | 2 +- drivers/usb/storage/scsiglue.c | 2 +- include/linux/usb.h | 3 +-- 4 files changed, 12 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c index 718389719dc4..67548c9fe50a 100644 --- a/drivers/usb/core/usb.c +++ b/drivers/usb/core/usb.c @@ -206,7 +206,7 @@ usb_bind_driver (struct usb_driver *driver, struct usb_interface *interface) if (id) { for (i = 0; i < interface->num_altsetting; i++) { interface->act_altsetting = i; - id = usb_match_id(dev, interface, id); + id = usb_match_id(interface, id); if (id) { down(&driver->serialize); private = driver->probe(dev,ifnum,id); @@ -466,7 +466,6 @@ void usb_driver_release_interface(struct usb_driver *driver, struct usb_interfac /** * usb_match_id - find first usb_device_id matching device or interface - * @dev: the device whose descriptors are considered when matching * @interface: the interface of interest * @id: array of usb_device_id structures, terminated by zero entry * @@ -528,15 +527,18 @@ void usb_driver_release_interface(struct usb_driver *driver, struct usb_interfac * its associated class and subclass. */ const struct usb_device_id * -usb_match_id(struct usb_device *dev, struct usb_interface *interface, - const struct usb_device_id *id) +usb_match_id(struct usb_interface *interface, const struct usb_device_id *id) { - struct usb_interface_descriptor *intf = 0; + struct usb_interface_descriptor *intf; + struct usb_device *dev; /* proc_connectinfo in devio.c may call us with id == NULL. 
*/ if (id == NULL) return NULL; + intf = &interface->altsetting [interface->act_altsetting]; + dev = interface_to_usbdev(interface); + /* It is important to check that id->driver_info is nonzero, since an entry that is all zeroes except for a nonzero id->driver_info is the way to create an entry that @@ -575,19 +577,17 @@ usb_match_id(struct usb_device *dev, struct usb_interface *interface, (id->bDeviceProtocol != dev->descriptor.bDeviceProtocol)) continue; - intf = &interface->altsetting [interface->act_altsetting]; - if ((id->match_flags & USB_DEVICE_ID_MATCH_INT_CLASS) && (id->bInterfaceClass != intf->bInterfaceClass)) continue; if ((id->match_flags & USB_DEVICE_ID_MATCH_INT_SUBCLASS) && (id->bInterfaceSubClass != intf->bInterfaceSubClass)) - continue; + continue; if ((id->match_flags & USB_DEVICE_ID_MATCH_INT_PROTOCOL) && (id->bInterfaceProtocol != intf->bInterfaceProtocol)) - continue; + continue; return id; } diff --git a/drivers/usb/serial/usbserial.c b/drivers/usb/serial/usbserial.c index c232a4ec3010..a5cc1450d24a 100644 --- a/drivers/usb/serial/usbserial.c +++ b/drivers/usb/serial/usbserial.c @@ -1193,7 +1193,7 @@ static void * usb_serial_probe(struct usb_device *dev, unsigned int ifnum, interface = &dev->actconfig->interface[ifnum]; list_for_each (tmp, &usb_serial_driver_list) { type = list_entry(tmp, struct usb_serial_device_type, driver_list); - id_pattern = usb_match_id(dev, interface, type->id_table); + id_pattern = usb_match_id(interface, type->id_table); if (id_pattern != NULL) { dbg("descriptor matches"); found = 1; diff --git a/drivers/usb/storage/scsiglue.c b/drivers/usb/storage/scsiglue.c index b223f089723e..c2cf254e5383 100644 --- a/drivers/usb/storage/scsiglue.c +++ b/drivers/usb/storage/scsiglue.c @@ -265,7 +265,7 @@ static int bus_reset( Scsi_Cmnd *srb ) US_DEBUGPX("simulating disconnect/reconnect.\n"); down(&intf->driver->serialize); intf->driver->disconnect(pusb_dev_save, intf->private_data); - id = usb_match_id(pusb_dev_save, intf, 
intf->driver->id_table); + id = usb_match_id(intf, intf->driver->id_table); intf->driver->probe(pusb_dev_save, i, id); up(&intf->driver->serialize); } diff --git a/include/linux/usb.h b/include/linux/usb.h index cc7f1aa7520f..ad34a45f28c4 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -443,8 +443,7 @@ extern void usb_driver_claim_interface(struct usb_driver *driver, extern int usb_interface_claimed(struct usb_interface *iface); extern void usb_driver_release_interface(struct usb_driver *driver, struct usb_interface *iface); -const struct usb_device_id *usb_match_id(struct usb_device *dev, - struct usb_interface *interface, +const struct usb_device_id *usb_match_id(struct usb_interface *interface, const struct usb_device_id *id); /** -- cgit v1.2.3 From 86ae817e395b064f6cb64239b9b9f4eb9fcac25e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Wed, 14 Aug 2002 20:54:13 -0700 Subject: [PATCH] init_tasks is not defined anywhere. It's referenced by mips and mips64 (both far out of date), but never actually defined anywhere. --- include/linux/sched.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index fcd107c3d29c..767d1b7bd0ec 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -445,7 +445,6 @@ extern union thread_union init_thread_union; extern struct task_struct init_task; extern struct mm_struct init_mm; -extern struct task_struct *init_tasks[NR_CPUS]; /* PID hashing. (shouldnt this be dynamic?) */ #define PIDHASH_SZ (4096 >> 2) -- cgit v1.2.3 From 6a85ced0a7500dbf574277784536ba1c550daf5a Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Wed, 14 Aug 2002 21:14:22 -0700 Subject: [PATCH] Modular x86 MTRR driver. This patch from Pat Mochel cleans up the hell that was mtrr.c into something a lot more modular and easy to understand, by doing the implementation-per-file as has been done to various other things by Pat and myself over the last months. 
It's functionally identical from a kernel internal point of view, and a userspace point of view, and is basically just a very large code clean up. --- arch/i386/kernel/Makefile | 3 +- arch/i386/kernel/cpu/Makefile | 2 + arch/i386/kernel/cpu/mtrr/Makefile | 8 + arch/i386/kernel/cpu/mtrr/amd.c | 121 ++ arch/i386/kernel/cpu/mtrr/centaur.c | 219 ++++ arch/i386/kernel/cpu/mtrr/changelog | 229 ++++ arch/i386/kernel/cpu/mtrr/cyrix.c | 378 ++++++ arch/i386/kernel/cpu/mtrr/generic.c | 184 +++ arch/i386/kernel/cpu/mtrr/if.c | 396 ++++++ arch/i386/kernel/cpu/mtrr/main.c | 628 ++++++++++ arch/i386/kernel/cpu/mtrr/mtrr.h | 98 ++ arch/i386/kernel/cpu/mtrr/state.c | 338 +++++ arch/i386/kernel/mtrr.c | 2303 ----------------------------------- arch/i386/kernel/smpboot.c | 4 - include/linux/smp.h | 1 + 15 files changed, 2603 insertions(+), 2309 deletions(-) create mode 100644 arch/i386/kernel/cpu/mtrr/Makefile create mode 100644 arch/i386/kernel/cpu/mtrr/amd.c create mode 100644 arch/i386/kernel/cpu/mtrr/centaur.c create mode 100644 arch/i386/kernel/cpu/mtrr/changelog create mode 100644 arch/i386/kernel/cpu/mtrr/cyrix.c create mode 100644 arch/i386/kernel/cpu/mtrr/generic.c create mode 100644 arch/i386/kernel/cpu/mtrr/if.c create mode 100644 arch/i386/kernel/cpu/mtrr/main.c create mode 100644 arch/i386/kernel/cpu/mtrr/mtrr.h create mode 100644 arch/i386/kernel/cpu/mtrr/state.c delete mode 100644 arch/i386/kernel/mtrr.c (limited to 'include') diff --git a/arch/i386/kernel/Makefile b/arch/i386/kernel/Makefile index cf333bbeb083..b4705a70ab3c 100644 --- a/arch/i386/kernel/Makefile +++ b/arch/i386/kernel/Makefile @@ -6,7 +6,7 @@ EXTRA_TARGETS := kernel.o head.o init_task.o O_TARGET := kernel.o -export-objs := mca.o mtrr.o i386_ksyms.o time.o +export-objs := mca.o msr.o i386_ksyms.o time.o obj-y := process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \ ptrace.o i8259.o ioport.o ldt.o setup.o time.o sys_i386.o \ @@ -15,7 +15,6 @@ obj-y := process.o semaphore.o signal.o entry.o 
traps.o irq.o vm86.o \ obj-y += cpu/ obj-$(CONFIG_MCA) += mca.o -obj-$(CONFIG_MTRR) += mtrr.o obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_MICROCODE) += microcode.o diff --git a/arch/i386/kernel/cpu/Makefile b/arch/i386/kernel/cpu/Makefile index 18b9d13323d4..5260e209d395 100644 --- a/arch/i386/kernel/cpu/Makefile +++ b/arch/i386/kernel/cpu/Makefile @@ -13,4 +13,6 @@ obj-y += rise.o obj-y += nexgen.o obj-y += umc.o +obj-$(CONFIG_MTRR) += mtrr/ + include $(TOPDIR)/Rules.make diff --git a/arch/i386/kernel/cpu/mtrr/Makefile b/arch/i386/kernel/cpu/mtrr/Makefile new file mode 100644 index 000000000000..61698bbf6e3c --- /dev/null +++ b/arch/i386/kernel/cpu/mtrr/Makefile @@ -0,0 +1,8 @@ +obj-y := main.o if.o generic.o state.o +obj-y += amd.o +obj-y += cyrix.o +obj-y += centaur.o + +export-objs := main.o + +include $(TOPDIR)/Rules.make diff --git a/arch/i386/kernel/cpu/mtrr/amd.c b/arch/i386/kernel/cpu/mtrr/amd.c new file mode 100644 index 000000000000..0949cdbf848a --- /dev/null +++ b/arch/i386/kernel/cpu/mtrr/amd.c @@ -0,0 +1,121 @@ +#include +#include +#include +#include + +#include "mtrr.h" + +static void +amd_get_mtrr(unsigned int reg, unsigned long *base, + unsigned long *size, mtrr_type * type) +{ + unsigned long low, high; + + rdmsr(MSR_K6_UWCCR, low, high); + /* Upper dword is region 1, lower is region 0 */ + if (reg == 1) + low = high; + /* The base masks off on the right alignment */ + *base = (low & 0xFFFE0000) >> PAGE_SHIFT; + *type = 0; + if (low & 1) + *type = MTRR_TYPE_UNCACHABLE; + if (low & 2) + *type = MTRR_TYPE_WRCOMB; + if (!(low & 3)) { + *size = 0; + return; + } + /* + * This needs a little explaining. The size is stored as an + * inverted mask of bits of 128K granularity 15 bits long offset + * 2 bits + * + * So to get a size we do invert the mask and add 1 to the lowest + * mask bit (4 as its 2 bits in). 
This gives us a size we then shift + * to turn into 128K blocks + * + * eg 111 1111 1111 1100 is 512K + * + * invert 000 0000 0000 0011 + * +1 000 0000 0000 0100 + * *128K ... + */ + low = (~low) & 0x1FFFC; + *size = (low + 4) << (15 - PAGE_SHIFT); + return; +} + +static void amd_set_mtrr(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type) +/* [SUMMARY] Set variable MTRR register on the local CPU. + The register to set. + The base address of the region. + The size of the region. If this is 0 the region is disabled. + The type of the region. + If TRUE, do the change safely. If FALSE, safety measures should + be done externally. + [RETURNS] Nothing. +*/ +{ + u32 regs[2]; + + /* + * Low is MTRR0 , High MTRR 1 + */ + rdmsr(MSR_K6_UWCCR, regs[0], regs[1]); + /* + * Blank to disable + */ + if (size == 0) + regs[reg] = 0; + else + /* Set the register to the base, the type (off by one) and an + inverted bitmask of the size The size is the only odd + bit. We are fed say 512K We invert this and we get 111 1111 + 1111 1011 but if you subtract one and invert you get the + desired 111 1111 1111 1100 mask + + But ~(x - 1) == ~x + 1 == -x. Two's complement rocks! */ + regs[reg] = (-size >> (15 - PAGE_SHIFT) & 0x0001FFFC) + | (base << PAGE_SHIFT) | (type + 1); + + /* + * The writeback rule is quite specific. See the manual. 
Its + * disable local interrupts, write back the cache, set the mtrr + */ + wbinvd(); + wrmsr(MSR_K6_UWCCR, regs[0], regs[1]); +} + +static int amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type) +{ + /* Apply the K6 block alignment and size rules + In order + o Uncached or gathering only + o 128K or bigger block + o Power of 2 block + o base suitably aligned to the power + */ + if (type > MTRR_TYPE_WRCOMB || size < (1 << (17 - PAGE_SHIFT)) + || (size & ~(size - 1)) - size || (base & (size - 1))) + return -EINVAL; + return 0; +} + +static struct mtrr_ops amd_mtrr_ops = { + .vendor = X86_VENDOR_AMD, + .set = amd_set_mtrr, + .get = amd_get_mtrr, + .get_free_region = generic_get_free_region, + .validate_add_page = amd_validate_add_page, + .have_wrcomb = positive_have_wrcomb, +}; + +int __init amd_init_mtrr(void) +{ + set_mtrr_ops(&amd_mtrr_ops); + return 0; +} + +//arch_initcall(amd_mtrr_init); diff --git a/arch/i386/kernel/cpu/mtrr/centaur.c b/arch/i386/kernel/cpu/mtrr/centaur.c new file mode 100644 index 000000000000..a473fb1bdeeb --- /dev/null +++ b/arch/i386/kernel/cpu/mtrr/centaur.c @@ -0,0 +1,219 @@ +#include +#include +#include +#include +#include "mtrr.h" + +static struct { + unsigned long high; + unsigned long low; +} centaur_mcr[8]; + +static u8 centaur_mcr_reserved; +static u8 centaur_mcr_type; /* 0 for winchip, 1 for winchip2 */ + +/* + * Report boot time MCR setups + */ + +static int +centaur_get_free_region(unsigned long base, unsigned long size) +/* [SUMMARY] Get a free MTRR. + The starting (base) address of the region. + The size (in bytes) of the region. + [RETURNS] The index of the region on success, else -1 on error. 
+*/ +{ + int i, max; + mtrr_type ltype; + unsigned long lbase, lsize; + + max = num_var_ranges; + for (i = 0; i < max; ++i) { + if (centaur_mcr_reserved & (1 << i)) + continue; + mtrr_if->get(i, &lbase, &lsize, &ltype); + if (lsize == 0) + return i; + } + return -ENOSPC; +} + +void +mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) +{ + centaur_mcr[mcr].low = lo; + centaur_mcr[mcr].high = hi; +} + +static void +centaur_get_mcr(unsigned int reg, unsigned long *base, + unsigned long *size, mtrr_type * type) +{ + *base = centaur_mcr[reg].high >> PAGE_SHIFT; + *size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT; + *type = MTRR_TYPE_WRCOMB; /* If it is there, it is write-combining */ + if (centaur_mcr_type == 1 && ((centaur_mcr[reg].low & 31) & 2)) + *type = MTRR_TYPE_UNCACHABLE; + if (centaur_mcr_type == 1 && (centaur_mcr[reg].low & 31) == 25) + *type = MTRR_TYPE_WRBACK; + if (centaur_mcr_type == 0 && (centaur_mcr[reg].low & 31) == 31) + *type = MTRR_TYPE_WRBACK; + +} + +static void centaur_set_mcr(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type) +{ + unsigned long low, high; + + if (size == 0) { + /* Disable */ + high = low = 0; + } else { + high = base << PAGE_SHIFT; + if (centaur_mcr_type == 0) + low = -size << PAGE_SHIFT | 0x1f; /* only support write-combining... */ + else { + if (type == MTRR_TYPE_UNCACHABLE) + low = -size << PAGE_SHIFT | 0x02; /* NC */ + else + low = -size << PAGE_SHIFT | 0x09; /* WWO,WC */ + } + } + centaur_mcr[reg].high = high; + centaur_mcr[reg].low = low; + wrmsr(MSR_IDT_MCR0 + reg, low, high); +} +/* + * Initialise the later (saner) Winchip MCR variant. In this version + * the BIOS can pass us the registers it has used (but not their values) + * and the control register is read/write + */ + +static void __init +centaur_mcr1_init(void) +{ + unsigned i; + u32 lo, hi; + + /* Unfortunately, MCR's are read-only, so there is no way to + * find out what the bios might have done. 
+ */ + + rdmsr(MSR_IDT_MCR_CTRL, lo, hi); + if (((lo >> 17) & 7) == 1) { /* Type 1 Winchip2 MCR */ + lo &= ~0x1C0; /* clear key */ + lo |= 0x040; /* set key to 1 */ + wrmsr(MSR_IDT_MCR_CTRL, lo, hi); /* unlock MCR */ + } + + centaur_mcr_type = 1; + + /* + * Clear any unconfigured MCR's. + */ + + for (i = 0; i < 8; ++i) { + if (centaur_mcr[i].high == 0 && centaur_mcr[i].low == 0) { + if (!(lo & (1 << (9 + i)))) + wrmsr(MSR_IDT_MCR0 + i, 0, 0); + else + /* + * If the BIOS set up an MCR we cannot see it + * but we don't wish to obliterate it + */ + centaur_mcr_reserved |= (1 << i); + } + } + /* + * Throw the main write-combining switch... + * However if OOSTORE is enabled then people have already done far + * cleverer things and we should behave. + */ + + lo |= 15; /* Write combine enables */ + wrmsr(MSR_IDT_MCR_CTRL, lo, hi); +} + +/* + * Initialise the original winchip with read only MCR registers + * no used bitmask for the BIOS to pass on and write only control + */ + +static void __init +centaur_mcr0_init(void) +{ + unsigned i; + + /* Unfortunately, MCR's are read-only, so there is no way to + * find out what the bios might have done. + */ + + /* Clear any unconfigured MCR's. + * This way we are sure that the centaur_mcr array contains the actual + * values. The disadvantage is that any BIOS tweaks are thus undone. 
+ * + */ + for (i = 0; i < 8; ++i) { + if (centaur_mcr[i].high == 0 && centaur_mcr[i].low == 0) + wrmsr(MSR_IDT_MCR0 + i, 0, 0); + } + + wrmsr(MSR_IDT_MCR_CTRL, 0x01F0001F, 0); /* Write only */ +} + +/* + * Initialise Winchip series MCR registers + */ + +static void __init +centaur_mcr_init(void) +{ + struct set_mtrr_context ctxt; + + set_mtrr_prepare_save(&ctxt); + set_mtrr_cache_disable(&ctxt); + + if (boot_cpu_data.x86_model == 4) + centaur_mcr0_init(); + else if (boot_cpu_data.x86_model == 8 || boot_cpu_data.x86_model == 9) + centaur_mcr1_init(); + + set_mtrr_done(&ctxt); +} + +static int centaur_validate_add_page(unsigned long base, + unsigned long size, unsigned int type) +{ + /* + * FIXME: Winchip2 supports uncached + */ + if (type != MTRR_TYPE_WRCOMB && + (centaur_mcr_type == 0 || type != MTRR_TYPE_UNCACHABLE)) { + printk(KERN_WARNING + "mtrr: only write-combining%s supported\n", + centaur_mcr_type ? " and uncacheable are" + : " is"); + return -EINVAL; + } + return 0; +} + +static struct mtrr_ops centaur_mtrr_ops = { + .vendor = X86_VENDOR_CENTAUR, + .init = centaur_mcr_init, + .set = centaur_set_mcr, + .get = centaur_get_mcr, + .get_free_region = centaur_get_free_region, + .validate_add_page = centaur_validate_add_page, + .have_wrcomb = positive_have_wrcomb, +}; + +int __init centaur_init_mtrr(void) +{ + set_mtrr_ops(¢aur_mtrr_ops); + return 0; +} + +//arch_initcall(centaur_init_mtrr); diff --git a/arch/i386/kernel/cpu/mtrr/changelog b/arch/i386/kernel/cpu/mtrr/changelog new file mode 100644 index 000000000000..af1368535955 --- /dev/null +++ b/arch/i386/kernel/cpu/mtrr/changelog @@ -0,0 +1,229 @@ + ChangeLog + + Prehistory Martin Tischhäuser + Initial register-setting code (from proform-1.0). + 19971216 Richard Gooch + Original version for /proc/mtrr interface, SMP-safe. + v1.0 + 19971217 Richard Gooch + Bug fix for ioctls()'s. + Added sample code in Documentation/mtrr.txt + v1.1 + 19971218 Richard Gooch + Disallow overlapping regions. 
+ 19971219 Jens Maurer + Register-setting fixups. + v1.2 + 19971222 Richard Gooch + Fixups for kernel 2.1.75. + v1.3 + 19971229 David Wragg + Register-setting fixups and conformity with Intel conventions. + 19971229 Richard Gooch + Cosmetic changes and wrote this ChangeLog ;-) + 19980106 Richard Gooch + Fixups for kernel 2.1.78. + v1.4 + 19980119 David Wragg + Included passive-release enable code (elsewhere in PCI setup). + v1.5 + 19980131 Richard Gooch + Replaced global kernel lock with private spinlock. + v1.6 + 19980201 Richard Gooch + Added wait for other CPUs to complete changes. + v1.7 + 19980202 Richard Gooch + Bug fix in definition of set_mtrr() for UP. + v1.8 + 19980319 Richard Gooch + Fixups for kernel 2.1.90. + 19980323 Richard Gooch + Move SMP BIOS fixup before secondary CPUs call calibrate_delay() + v1.9 + 19980325 Richard Gooch + Fixed test for overlapping regions: confused by adjacent regions + 19980326 Richard Gooch + Added wbinvd in set_mtrr_prepare(). + 19980401 Richard Gooch + Bug fix for non-SMP compilation. + 19980418 David Wragg + Fixed-MTRR synchronisation for SMP and use atomic operations + instead of spinlocks. + 19980418 Richard Gooch + Differentiate different MTRR register classes for BIOS fixup. + v1.10 + 19980419 David Wragg + Bug fix in variable MTRR synchronisation. + v1.11 + 19980419 Richard Gooch + Fixups for kernel 2.1.97. + v1.12 + 19980421 Richard Gooch + Safer synchronisation across CPUs when changing MTRRs. + v1.13 + 19980423 Richard Gooch + Bugfix for SMP systems without MTRR support. + v1.14 + 19980427 Richard Gooch + Trap calls to mtrr_add() and mtrr_del() on non-MTRR machines. + v1.15 + 19980427 Richard Gooch + Use atomic bitops for setting SMP change mask. + v1.16 + 19980428 Richard Gooch + Removed spurious diagnostic message. + v1.17 + 19980429 Richard Gooch + Moved register-setting macros into this file. + Moved setup code from init/main.c to i386-specific areas. + v1.18 + 19980502 Richard Gooch + Moved MTRR detection outside conditionals in mtrr_init().
+ v1.19 + 19980502 Richard Gooch + Documentation improvement: mention Pentium II and AGP. + v1.20 + 19980521 Richard Gooch + Only manipulate interrupt enable flag on local CPU. + Allow enclosed uncachable regions. + v1.21 + 19980611 Richard Gooch + Always define main_lock. + v1.22 + 19980901 Richard Gooch + Removed module support in order to tidy up code. + Added sanity check for mtrr_add()/mtrr_del() before mtrr_init(). + Created addition queue for prior to SMP commence. + v1.23 + 19980902 Richard Gooch + Ported patch to kernel 2.1.120-pre3. + v1.24 + 19980910 Richard Gooch + Removed sanity checks and addition queue: Linus prefers an OOPS. + v1.25 + 19981001 Richard Gooch + Fixed harmless compiler warning in include/asm-i386/mtrr.h + Fixed version numbering and history for v1.23 -> v1.24. + v1.26 + 19990118 Richard Gooch + Added devfs support. + v1.27 + 19990123 Richard Gooch + Changed locking to spin with reschedule. + Made use of new smp_call_function(). + v1.28 + 19990201 Zoltán Böszörményi + Extended the driver to be able to use Cyrix style ARRs. + 19990204 Richard Gooch + Restructured Cyrix support. + v1.29 + 19990204 Zoltán Böszörményi + Refined ARR support: enable MAPEN in set_mtrr_prepare() + and disable MAPEN in set_mtrr_done(). + 19990205 Richard Gooch + Minor cleanups. + v1.30 + 19990208 Zoltán Böszörményi + Protect plain 6x86s (and other processors without the + Page Global Enable feature) against accessing CR4 in + set_mtrr_prepare() and set_mtrr_done(). + 19990210 Richard Gooch + Turned set_mtrr_up() and get_mtrr() into function pointers. + v1.31 + 19990212 Zoltán Böszörményi + Major rewrite of cyrix_arr_init(): do not touch ARRs, + leave them as the BIOS have set them up. + Enable usage of all 8 ARRs. + Avoid multiplications by 3 everywhere and other + code clean ups/speed ups. + 19990213 Zoltán Böszörményi + Set up other Cyrix processors identical to the boot cpu. + Since Cyrix don't support Intel APIC, this is l'art pour l'art. + Weigh ARRs by size: + If size <= 32M is given, set up ARR# we were given.
+ If size > 32M is given, set up ARR7 only if it is free, + fail otherwise. + 19990214 Zoltán Böszörményi + Also check for size >= 256K if we are to set up ARR7, + mtrr_add() returns the value it gets from set_mtrr() + 19990218 Zoltán Böszörményi + Remove Cyrix "coma bug" workaround from here. + Moved to linux/arch/i386/kernel/setup.c and + linux/include/asm-i386/bugs.h + 19990228 Richard Gooch + Added MTRRIOC_KILL_ENTRY ioctl(2) + Trap for counter underflow in mtrr_file_del(). + Trap for 4 MiB aligned regions for PPro, stepping <= 7. + 19990301 Richard Gooch + Created get_free_region() hook. + 19990305 Richard Gooch + Temporarily disable AMD support now MTRR capability flag is set. + v1.32 + 19990308 Zoltán Böszörményi + Adjust my changes (19990212-19990218) to Richard Gooch's + latest changes. (19990228-19990305) + v1.33 + 19990309 Richard Gooch + Fixed typo in printk() message. + 19990310 Richard Gooch + Support K6-II/III based on Alan Cox's patches. + v1.34 + 19990511 Bart Hartgers + Support Centaur C6 MCR's. + 19990512 Richard Gooch + Minor cleanups. + v1.35 + 19990707 Zoltán Böszörményi + Check whether ARR3 is protected in cyrix_get_free_region() + and mtrr_del(). The code won't attempt to delete or change it + from now on if the BIOS protected ARR3. It silently skips ARR3 + in cyrix_get_free_region() or returns with an error code from + mtrr_del(). + 19990711 Zoltán Böszörményi + Reset some bits in the CCRs in cyrix_arr_init() to disable SMM + if ARR3 isn't protected. This is needed because if SMM is active + and ARR3 isn't protected then deleting and setting ARR3 again + may lock up the processor. With SMM entirely disabled, it does + not happen. + 19990812 Zoltán Böszörményi + Rearrange switch() statements so the driver accommodates to + the fact that the AMD Athlon handles its MTRRs the same way + as Intel does. + 19990814 Zoltán Böszörményi + Double check for Intel in mtrr_add()'s big switch() because + that revision check is only valid for Intel CPUs.
+ 19990819 Alan Cox + Tested Zoltan's changes on a pre production Athlon - 100% + success. + 19991008 Manfred Spraul + replaced spin_lock_reschedule() with a normal semaphore. + v1.36 + 20000221 Richard Gooch + Compile fix if procfs and devfs not enabled. + Formatting changes. + v1.37 + 20001109 H. Peter Anvin + Use the new centralized CPU feature detects. + + v1.38 + 20010309 Dave Jones + Add support for Cyrix III. + + v1.39 + 20010312 Dave Jones + Ugh, I broke AMD support. + Reworked fix by Troels Walsted Hansen + + v1.40 + 20010327 Dave Jones + Adapted Cyrix III support to include VIA C3. + + v2.0 + 20020306 Patrick Mochel + Split mtrr.c -> mtrr/*.c + Converted to Linux Kernel Coding Style + Fixed several minor nits in form + Moved some SMP-only functions out, so they can be used + for power management in the future. + TODO: Fix user interface cruft. diff --git a/arch/i386/kernel/cpu/mtrr/cyrix.c b/arch/i386/kernel/cpu/mtrr/cyrix.c new file mode 100644 index 000000000000..ace423c0a83c --- /dev/null +++ b/arch/i386/kernel/cpu/mtrr/cyrix.c @@ -0,0 +1,378 @@ +#include +#include +#include +#include +#include +#include "mtrr.h" + +int arr3_protected; + +static void +cyrix_get_arr(unsigned int reg, unsigned long *base, + unsigned long *size, mtrr_type * type) +{ + unsigned long flags; + unsigned char arr, ccr3, rcr, shift; + + arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */ + + /* Save flags and disable interrupts */ + local_irq_save(flags); + + ccr3 = getCx86(CX86_CCR3); + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ + ((unsigned char *) base)[3] = getCx86(arr); + ((unsigned char *) base)[2] = getCx86(arr + 1); + ((unsigned char *) base)[1] = getCx86(arr + 2); + rcr = getCx86(CX86_RCR_BASE + reg); + setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ + + /* Enable interrupts if it was enabled previously */ + local_irq_restore(flags); + shift = ((unsigned char *) base)[1] & 0x0f; + *base >>= PAGE_SHIFT; + + /* Power of two, at least 
4K on ARR0-ARR6, 256K on ARR7 + * Note: shift==0xf means 4G, this is unsupported. + */ + if (shift) + *size = (reg < 7 ? 0x1UL : 0x40UL) << (shift - 1); + else + *size = 0; + + /* Bit 0 is Cache Enable on ARR7, Cache Disable on ARR0-ARR6 */ + if (reg < 7) { + switch (rcr) { + case 1: + *type = MTRR_TYPE_UNCACHABLE; + break; + case 8: + *type = MTRR_TYPE_WRBACK; + break; + case 9: + *type = MTRR_TYPE_WRCOMB; + break; + case 24: + default: + *type = MTRR_TYPE_WRTHROUGH; + break; + } + } else { + switch (rcr) { + case 0: + *type = MTRR_TYPE_UNCACHABLE; + break; + case 8: + *type = MTRR_TYPE_WRCOMB; + break; + case 9: + *type = MTRR_TYPE_WRBACK; + break; + case 25: + default: + *type = MTRR_TYPE_WRTHROUGH; + break; + } + } +} + +static int +cyrix_get_free_region(unsigned long base, unsigned long size) +/* [SUMMARY] Get a free ARR. + The starting (base) address of the region. + The size (in bytes) of the region. + [RETURNS] The index of the region on success, else -1 on error. +*/ +{ + int i; + mtrr_type ltype; + unsigned long lbase, lsize; + + /* If we are to set up a region >32M then look at ARR7 immediately */ + if (size > 0x2000) { + cyrix_get_arr(7, &lbase, &lsize, <ype); + if (lsize == 0) + return 7; + /* Else try ARR0-ARR6 first */ + } else { + for (i = 0; i < 7; i++) { + cyrix_get_arr(i, &lbase, &lsize, <ype); + if ((i == 3) && arr3_protected) + continue; + if (lsize == 0) + return i; + } + /* ARR0-ARR6 isn't free, try ARR7 but its size must be at least 256K */ + cyrix_get_arr(i, &lbase, &lsize, <ype); + if ((lsize == 0) && (size >= 0x40)) + return i; + } + return -ENOSPC; +} + +static void cyrix_set_arr(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type) +{ + unsigned char arr, arr_type, arr_size; + u32 cr0, ccr3; + u32 cr4 = 0; + + arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */ + + /* count down from 32M (ARR0-ARR6) or from 2G (ARR7) */ + if (reg >= 7) + size >>= 6; + + size &= 0x7fff; /* make sure arr_size <= 
14 */ + for (arr_size = 0; size; arr_size++, size >>= 1) ; + + if (reg < 7) { + switch (type) { + case MTRR_TYPE_UNCACHABLE: + arr_type = 1; + break; + case MTRR_TYPE_WRCOMB: + arr_type = 9; + break; + case MTRR_TYPE_WRTHROUGH: + arr_type = 24; + break; + default: + arr_type = 8; + break; + } + } else { + switch (type) { + case MTRR_TYPE_UNCACHABLE: + arr_type = 0; + break; + case MTRR_TYPE_WRCOMB: + arr_type = 8; + break; + case MTRR_TYPE_WRTHROUGH: + arr_type = 25; + break; + default: + arr_type = 9; + break; + } + } + + /* Save value of CR4 and clear Page Global Enable (bit 7) */ + if ( cpu_has_pge ) { + cr4 = read_cr4(); + write_cr4(cr4 & (unsigned char) ~(1 << 7)); + } + + /* Disable and flush caches. Note that wbinvd flushes the TLBs as + a side-effect */ + cr0 = read_cr0() | 0x40000000; + wbinvd(); + write_cr0(cr0); + wbinvd(); + + /* Cyrix ARRs - everything else were excluded at the top */ + ccr3 = getCx86(CX86_CCR3); + + /* Cyrix ARRs - everything else were excluded at the top */ + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); + + base <<= PAGE_SHIFT; + setCx86(arr, ((unsigned char *) &base)[3]); + setCx86(arr + 1, ((unsigned char *) &base)[2]); + setCx86(arr + 2, (((unsigned char *) &base)[1]) | arr_size); + setCx86(CX86_RCR_BASE + reg, arr_type); + + /* Flush caches and TLBs */ + wbinvd(); + + /* Cyrix ARRs - everything else was excluded at the top */ + setCx86(CX86_CCR3, ccr3); + + /* Enable caches */ + write_cr0(read_cr0() & 0xbfffffff); + + /* Restore value of CR4 */ + if ( cpu_has_pge ) + write_cr4(cr4); +} + +typedef struct { + unsigned long base; + unsigned long size; + mtrr_type type; +} arr_state_t; + +arr_state_t arr_state[8] __initdata = { + {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, + {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL}, {0UL, 0UL, 0UL} +}; + +unsigned char ccr_state[7] __initdata = { 0, 0, 0, 0, 0, 0, 0 }; + +static void __init +cyrix_arr_init_secondary(void) +{ + int i; + u32 cr0, ccr3, cr4 = 0; + + /* 
flush cache and enable MAPEN */ + /* Save value of CR4 and clear Page Global Enable (bit 7) */ + if ( cpu_has_pge ) { + cr4 = read_cr4(); + write_cr4(cr4 & (unsigned char) ~(1 << 7)); + } + + /* Disable and flush caches. Note that wbinvd flushes the TLBs as + a side-effect */ + cr0 = read_cr0() | 0x40000000; + wbinvd(); + write_cr0(cr0); + wbinvd(); + + /* Cyrix ARRs - everything else were excluded at the top */ + ccr3 = getCx86(CX86_CCR3); + + /* Cyrix ARRs - everything else were excluded at the top */ + setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); + + /* the CCRs are not contiguous */ + for (i = 0; i < 4; i++) + setCx86(CX86_CCR0 + i, ccr_state[i]); + for (; i < 7; i++) + setCx86(CX86_CCR4 + i, ccr_state[i]); + for (i = 0; i < 8; i++) + cyrix_set_arr(i, arr_state[i].base, + arr_state[i].size, arr_state[i].type); + + /* Flush caches and TLBs */ + wbinvd(); + + /* Cyrix ARRs - everything else was excluded at the top */ + setCx86(CX86_CCR3, ccr3); + + /* Enable caches */ + write_cr0(read_cr0() & 0xbfffffff); + + /* Restore value of CR4 */ + if ( cpu_has_pge ) + write_cr4(cr4); +} + +/* + * On Cyrix 6x86(MX) and M II the ARR3 is special: it has connection + * with the SMM (System Management Mode) mode. So we need the following: + * Check whether SMI_LOCK (CCR3 bit 0) is set + * if it is set, write a warning message: ARR3 cannot be changed! 
+ * (it cannot be changed until the next processor reset) + * if it is reset, then we can change it, set all the needed bits: + * - disable access to SMM memory through ARR3 range (CCR1 bit 7 reset) + * - disable access to SMM memory (CCR1 bit 2 reset) + * - disable SMM mode (CCR1 bit 1 reset) + * - disable write protection of ARR3 (CCR6 bit 1 reset) + * - (maybe) disable ARR3 + * Just to be sure, we enable ARR usage by the processor (CCR5 bit 5 set) + */ +static void __init +cyrix_arr_init(void) +{ + struct set_mtrr_context ctxt; + unsigned char ccr[7]; + int ccrc[7] = { 0, 0, 0, 0, 0, 0, 0 }; +#ifdef CONFIG_SMP + int i; +#endif + + /* flush cache and enable MAPEN */ + set_mtrr_prepare_save(&ctxt); + set_mtrr_cache_disable(&ctxt); + + /* Save all CCRs locally */ + ccr[0] = getCx86(CX86_CCR0); + ccr[1] = getCx86(CX86_CCR1); + ccr[2] = getCx86(CX86_CCR2); + ccr[3] = ctxt.ccr3; + ccr[4] = getCx86(CX86_CCR4); + ccr[5] = getCx86(CX86_CCR5); + ccr[6] = getCx86(CX86_CCR6); + + if (ccr[3] & 1) { + ccrc[3] = 1; + arr3_protected = 1; + } else { + /* Disable SMM mode (bit 1), access to SMM memory (bit 2) and + * access to SMM memory through ARR3 (bit 7). + */ + if (ccr[1] & 0x80) { + ccr[1] &= 0x7f; + ccrc[1] |= 0x80; + } + if (ccr[1] & 0x04) { + ccr[1] &= 0xfb; + ccrc[1] |= 0x04; + } + if (ccr[1] & 0x02) { + ccr[1] &= 0xfd; + ccrc[1] |= 0x02; + } + arr3_protected = 0; + if (ccr[6] & 0x02) { + ccr[6] &= 0xfd; + ccrc[6] = 1; /* Disable write protection of ARR3 */ + setCx86(CX86_CCR6, ccr[6]); + } + /* Disable ARR3. This is safe now that we disabled SMM. */ + /* cyrix_set_arr_up (3, 0, 0, 0, FALSE); */ + } + /* If we changed CCR1 in memory, change it in the processor, too. 
*/ + if (ccrc[1]) + setCx86(CX86_CCR1, ccr[1]); + + /* Enable ARR usage by the processor */ + if (!(ccr[5] & 0x20)) { + ccr[5] |= 0x20; + ccrc[5] = 1; + setCx86(CX86_CCR5, ccr[5]); + } +#ifdef CONFIG_SMP + for (i = 0; i < 7; i++) + ccr_state[i] = ccr[i]; + for (i = 0; i < 8; i++) + cyrix_get_arr(i, + &arr_state[i].base, &arr_state[i].size, + &arr_state[i].type); +#endif + + set_mtrr_done(&ctxt); /* flush cache and disable MAPEN */ + + if (ccrc[5]) + printk("mtrr: ARR usage was not enabled, enabled manually\n"); + if (ccrc[3]) + printk("mtrr: ARR3 cannot be changed\n"); +/* + if ( ccrc[1] & 0x80) printk ("mtrr: SMM memory access through ARR3 disabled\n"); + if ( ccrc[1] & 0x04) printk ("mtrr: SMM memory access disabled\n"); + if ( ccrc[1] & 0x02) printk ("mtrr: SMM mode disabled\n"); +*/ + if (ccrc[6]) + printk("mtrr: ARR3 was write protected, unprotected\n"); +} + +static struct mtrr_ops cyrix_mtrr_ops = { + .vendor = X86_VENDOR_CYRIX, + .init = cyrix_arr_init, + .init_secondary = cyrix_arr_init_secondary, + .set = cyrix_set_arr, + .get = cyrix_get_arr, + .get_free_region = cyrix_get_free_region, + .validate_add_page = generic_validate_add_page, + .have_wrcomb = positive_have_wrcomb, +}; + +int __init cyrix_init_mtrr(void) +{ + set_mtrr_ops(&cyrix_mtrr_ops); + return 0; +} + +//arch_initcall(cyrix_init_mtrr); diff --git a/arch/i386/kernel/cpu/mtrr/generic.c b/arch/i386/kernel/cpu/mtrr/generic.c new file mode 100644 index 000000000000..05f2d807a915 --- /dev/null +++ b/arch/i386/kernel/cpu/mtrr/generic.c @@ -0,0 +1,184 @@ +#include +#include +#include +#include +#include +#include +#include "mtrr.h" + + +int generic_get_free_region(unsigned long base, unsigned long size) +/* [SUMMARY] Get a free MTRR. + The starting (base) address of the region. + The size (in bytes) of the region. + [RETURNS] The index of the region on success, else -1 on error. 
+*/ +{ + int i, max; + mtrr_type ltype; + unsigned long lbase, lsize; + + max = num_var_ranges; + for (i = 0; i < max; ++i) { + mtrr_if->get(i, &lbase, &lsize, <ype); + if (lsize == 0) + return i; + } + return -ENOSPC; +} + + +void generic_get_mtrr(unsigned int reg, unsigned long *base, + unsigned long *size, mtrr_type * type) +{ + unsigned long mask_lo, mask_hi, base_lo, base_hi; + + rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); + if ((mask_lo & 0x800) == 0) { + /* Invalid (i.e. free) range */ + *base = 0; + *size = 0; + *type = 0; + return; + } + + rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi); + + /* Work out the shifted address mask. */ + mask_lo = size_or_mask | mask_hi << (32 - PAGE_SHIFT) + | mask_lo >> PAGE_SHIFT; + + /* This works correctly if size is a power of two, i.e. a + contiguous range. */ + *size = -mask_lo; + *base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT; + *type = base_lo & 0xff; +} + +void generic_set_mtrr(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type) +/* [SUMMARY] Set variable MTRR register on the local CPU. + The register to set. + The base address of the region. + The size of the region. If this is 0 the region is disabled. + The type of the region. + If TRUE, do the change safely. If FALSE, safety measures should + be done externally. + [RETURNS] Nothing. +*/ +{ + u32 cr0, cr4 = 0; + u32 deftype_lo, deftype_hi; + + /* Save value of CR4 and clear Page Global Enable (bit 7) */ + if ( cpu_has_pge ) { + cr4 = read_cr4(); + write_cr4(cr4 & (unsigned char) ~(1 << 7)); + } + + /* Disable and flush caches. 
Note that wbinvd flushes the TLBs as + a side-effect */ + cr0 = read_cr0() | 0x40000000; + wbinvd(); + write_cr0(cr0); + wbinvd(); + + /* Save MTRR state */ + rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); + + /* Disable MTRRs, and set the default type to uncached */ + wrmsr(MTRRdefType_MSR, deftype_lo & 0xf300UL, deftype_hi); + + if (size == 0) { + /* The invalid bit is kept in the mask, so we simply clear the + relevant mask register to disable a range. */ + wrmsr(MTRRphysMask_MSR(reg), 0, 0); + } else { + wrmsr(MTRRphysBase_MSR(reg), base << PAGE_SHIFT | type, + (base & size_and_mask) >> (32 - PAGE_SHIFT)); + wrmsr(MTRRphysMask_MSR(reg), -size << PAGE_SHIFT | 0x800, + (-size & size_and_mask) >> (32 - PAGE_SHIFT)); + } + + /* Flush caches and TLBs */ + wbinvd(); + + /* Intel (P6) standard MTRRs */ + wrmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); + + /* Enable caches */ + write_cr0(read_cr0() & 0xbfffffff); + + /* Restore value of CR4 */ + if ( cpu_has_pge ) + write_cr4(cr4); +} + +int generic_validate_add_page(unsigned long base, unsigned long size, unsigned int type) +{ + unsigned long lbase, last; + + /* For Intel PPro stepping <= 7, must be 4 MiB aligned + and not touch 0x70000000->0x7003FFFF */ + if (is_cpu(INTEL) && boot_cpu_data.x86 == 6 && + boot_cpu_data.x86_model == 1 && + boot_cpu_data.x86_mask <= 7) { + if (base & ((1 << (22 - PAGE_SHIFT)) - 1)) { + printk(KERN_WARNING + "mtrr: base(0x%lx000) is not 4 MiB aligned\n", + base); + return -EINVAL; + } + if (!(base + size < 0x70000000 || base > 0x7003FFFF) && + (type == MTRR_TYPE_WRCOMB + || type == MTRR_TYPE_WRBACK)) { + printk(KERN_WARNING + "mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); + return -EINVAL; + } + } + + if (base + size < 0x100) { + printk(KERN_WARNING + "mtrr: cannot set region below 1 MiB (0x%lx000,0x%lx000)\n", + base, size); + return -EINVAL; + } + /* Check upper bits of base and last are equal and lower bits are 0 + for base and 1 for last */ + last = 
base + size - 1; + for (lbase = base; !(lbase & 1) && (last & 1); + lbase = lbase >> 1, last = last >> 1) ; + if (lbase != last) { + printk(KERN_WARNING + "mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", + base, size); + return -EINVAL; + } + return 0; +} + + +int generic_have_wrcomb(void) +{ + unsigned long config, dummy; + rdmsr(MTRRcap_MSR, config, dummy); + return (config & (1 << 10)); +} + +int positive_have_wrcomb(void) +{ + return 1; +} + +/* generic structure... + */ +struct mtrr_ops generic_mtrr_ops = { + .use_intel_if = 1, + .init_secondary = generic_init_secondary, + .get = generic_get_mtrr, + .get_free_region = generic_get_free_region, + .set = generic_set_mtrr, + .validate_add_page = generic_validate_add_page, + .have_wrcomb = generic_have_wrcomb, +}; diff --git a/arch/i386/kernel/cpu/mtrr/if.c b/arch/i386/kernel/cpu/mtrr/if.c new file mode 100644 index 000000000000..ffa199da2f67 --- /dev/null +++ b/arch/i386/kernel/cpu/mtrr/if.c @@ -0,0 +1,396 @@ +#include +#include +#include +#include +#include +#include + +/* What kind of fucking hack is this? 
*/ +#define MTRR_NEED_STRINGS + +#include +#include "mtrr.h" + +static char *ascii_buffer; +static unsigned int ascii_buf_bytes; + +extern unsigned int *usage_table; + +#define LINE_SIZE 80 + +static int +mtrr_file_add(unsigned long base, unsigned long size, + unsigned int type, char increment, struct file *file, int page) +{ + int reg, max; + unsigned int *fcount = file->private_data; + + max = num_var_ranges; + if (fcount == NULL) { + if ((fcount = + kmalloc(max * sizeof *fcount, GFP_KERNEL)) == NULL) { + printk("mtrr: could not allocate\n"); + return -ENOMEM; + } + memset(fcount, 0, max * sizeof *fcount); + file->private_data = fcount; + } + if (!page) { + if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { + printk + ("mtrr: size and base must be multiples of 4 kiB\n"); + printk("mtrr: size: 0x%lx base: 0x%lx\n", size, base); + return -EINVAL; + } + base >>= PAGE_SHIFT; + size >>= PAGE_SHIFT; + } + reg = mtrr_add_page(base, size, type, 1); + if (reg >= 0) + ++fcount[reg]; + return reg; +} + +static int +mtrr_file_del(unsigned long base, unsigned long size, + struct file *file, int page) +{ + int reg; + unsigned int *fcount = file->private_data; + + if (!page) { + if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { + printk + ("mtrr: size and base must be multiples of 4 kiB\n"); + printk("mtrr: size: 0x%lx base: 0x%lx\n", size, base); + return -EINVAL; + } + base >>= PAGE_SHIFT; + size >>= PAGE_SHIFT; + } + reg = mtrr_del_page(-1, base, size); + if (reg < 0) + return reg; + if (fcount == NULL) + return reg; + if (fcount[reg] < 1) + return -EINVAL; + --fcount[reg]; + return reg; +} + +static ssize_t +mtrr_read(struct file *file, char *buf, size_t len, loff_t * ppos) +{ + if (*ppos >= ascii_buf_bytes) + return 0; + if (*ppos + len > ascii_buf_bytes) + len = ascii_buf_bytes - *ppos; + if (copy_to_user(buf, ascii_buffer + *ppos, len)) + return -EFAULT; + *ppos += len; + return len; +} + +static ssize_t +mtrr_write(struct file *file, const char *buf, 
size_t len, loff_t * ppos) +/* Format of control line: + "base=%Lx size=%Lx type=%s" OR: + "disable=%d" +*/ +{ + int i, err; + unsigned long reg; + unsigned long long base, size; + char *ptr; + char line[LINE_SIZE]; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + /* Can't seek (pwrite) on this device */ + if (ppos != &file->f_pos) + return -ESPIPE; + memset(line, 0, LINE_SIZE); + if (len > LINE_SIZE) + len = LINE_SIZE; + if (copy_from_user(line, buf, len - 1)) + return -EFAULT; + ptr = line + strlen(line) - 1; + if (*ptr == '\n') + *ptr = '\0'; + if (!strncmp(line, "disable=", 8)) { + reg = simple_strtoul(line + 8, &ptr, 0); + err = mtrr_del_page(reg, 0, 0); + if (err < 0) + return err; + return len; + } + if (strncmp(line, "base=", 5)) { + printk("mtrr: no \"base=\" in line: \"%s\"\n", line); + return -EINVAL; + } + base = simple_strtoull(line + 5, &ptr, 0); + for (; isspace(*ptr); ++ptr) ; + if (strncmp(ptr, "size=", 5)) { + printk("mtrr: no \"size=\" in line: \"%s\"\n", line); + return -EINVAL; + } + size = simple_strtoull(ptr + 5, &ptr, 0); + if ((base & 0xfff) || (size & 0xfff)) { + printk("mtrr: size and base must be multiples of 4 kiB\n"); + printk("mtrr: size: 0x%Lx base: 0x%Lx\n", size, base); + return -EINVAL; + } + for (; isspace(*ptr); ++ptr) ; + if (strncmp(ptr, "type=", 5)) { + printk("mtrr: no \"type=\" in line: \"%s\"\n", line); + return -EINVAL; + } + ptr += 5; + for (; isspace(*ptr); ++ptr) ; + for (i = 0; i < MTRR_NUM_TYPES; ++i) { +// if (strcmp(ptr, mtrr_strings[i])) + continue; + base >>= PAGE_SHIFT; + size >>= PAGE_SHIFT; + err = + mtrr_add_page((unsigned long) base, (unsigned long) size, i, + 1); + if (err < 0) + return err; + return len; + } + printk("mtrr: illegal type: \"%s\"\n", ptr); + return -EINVAL; +} + +static int +mtrr_ioctl(struct inode *inode, struct file *file, + unsigned int cmd, unsigned long arg) +{ + int err; + mtrr_type type; + struct mtrr_sentry sentry; + struct mtrr_gentry gentry; + + switch (cmd) { + default: + 
return -ENOIOCTLCMD; + case MTRRIOC_ADD_ENTRY: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&sentry, (void *) arg, sizeof sentry)) + return -EFAULT; + err = + mtrr_file_add(sentry.base, sentry.size, sentry.type, 1, + file, 0); + if (err < 0) + return err; + break; + case MTRRIOC_SET_ENTRY: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&sentry, (void *) arg, sizeof sentry)) + return -EFAULT; + err = mtrr_add(sentry.base, sentry.size, sentry.type, 0); + if (err < 0) + return err; + break; + case MTRRIOC_DEL_ENTRY: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&sentry, (void *) arg, sizeof sentry)) + return -EFAULT; + err = mtrr_file_del(sentry.base, sentry.size, file, 0); + if (err < 0) + return err; + break; + case MTRRIOC_KILL_ENTRY: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&sentry, (void *) arg, sizeof sentry)) + return -EFAULT; + err = mtrr_del(-1, sentry.base, sentry.size); + if (err < 0) + return err; + break; + case MTRRIOC_GET_ENTRY: + if (copy_from_user(&gentry, (void *) arg, sizeof gentry)) + return -EFAULT; + if (gentry.regnum >= num_var_ranges) + return -EINVAL; + mtrr_if->get(gentry.regnum, &gentry.base, &gentry.size, &type); + + /* Hide entries that go above 4GB */ + if (gentry.base + gentry.size > 0x100000 + || gentry.size == 0x100000) + gentry.base = gentry.size = gentry.type = 0; + else { + gentry.base <<= PAGE_SHIFT; + gentry.size <<= PAGE_SHIFT; + gentry.type = type; + } + + if (copy_to_user((void *) arg, &gentry, sizeof gentry)) + return -EFAULT; + break; + case MTRRIOC_ADD_PAGE_ENTRY: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&sentry, (void *) arg, sizeof sentry)) + return -EFAULT; + err = + mtrr_file_add(sentry.base, sentry.size, sentry.type, 1, + file, 1); + if (err < 0) + return err; + break; + case MTRRIOC_SET_PAGE_ENTRY: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&sentry, (void *) arg, sizeof 
sentry)) + return -EFAULT; + err = mtrr_add_page(sentry.base, sentry.size, sentry.type, 0); + if (err < 0) + return err; + break; + case MTRRIOC_DEL_PAGE_ENTRY: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&sentry, (void *) arg, sizeof sentry)) + return -EFAULT; + err = mtrr_file_del(sentry.base, sentry.size, file, 1); + if (err < 0) + return err; + break; + case MTRRIOC_KILL_PAGE_ENTRY: + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&sentry, (void *) arg, sizeof sentry)) + return -EFAULT; + err = mtrr_del_page(-1, sentry.base, sentry.size); + if (err < 0) + return err; + break; + case MTRRIOC_GET_PAGE_ENTRY: + if (copy_from_user(&gentry, (void *) arg, sizeof gentry)) + return -EFAULT; + if (gentry.regnum >= num_var_ranges) + return -EINVAL; + mtrr_if->get(gentry.regnum, &gentry.base, &gentry.size, &type); + gentry.type = type; + + if (copy_to_user((void *) arg, &gentry, sizeof gentry)) + return -EFAULT; + break; + } + return 0; +} + +static int +mtrr_close(struct inode *ino, struct file *file) +{ + int i, max; + unsigned int *fcount = file->private_data; + + if (fcount == NULL) + return 0; + max = num_var_ranges; + for (i = 0; i < max; ++i) { + while (fcount[i] > 0) { + if (mtrr_del(i, 0, 0) < 0) + printk("mtrr: reg %d not used\n", i); + --fcount[i]; + } + } + kfree(fcount); + file->private_data = NULL; + return 0; +} + +static struct file_operations mtrr_fops = { + .owner = THIS_MODULE, + .read = mtrr_read, + .write = mtrr_write, + .ioctl = mtrr_ioctl, + .release = mtrr_close, +}; + +# ifdef CONFIG_PROC_FS + +static struct proc_dir_entry *proc_root_mtrr; + +# endif /* CONFIG_PROC_FS */ + +static devfs_handle_t devfs_handle; + +char * attrib_to_str(int x) +{ + return (x <= 6) ? 
mtrr_strings[x] : "?"; +} + +void compute_ascii(void) +{ + char factor; + int i, max; + mtrr_type type; + unsigned long base, size; + + ascii_buf_bytes = 0; + max = num_var_ranges; + for (i = 0; i < max; i++) { + mtrr_if->get(i, &base, &size, &type); + if (size == 0) + usage_table[i] = 0; + else { + if (size < (0x100000 >> PAGE_SHIFT)) { + /* less than 1MB */ + factor = 'K'; + size <<= PAGE_SHIFT - 10; + } else { + factor = 'M'; + size >>= 20 - PAGE_SHIFT; + } + sprintf + (ascii_buffer + ascii_buf_bytes, + "reg%02i: base=0x%05lx000 (%4liMB), size=%4li%cB: %s, count=%d\n", + i, base, base >> (20 - PAGE_SHIFT), size, factor, + attrib_to_str(type), usage_table[i]); + ascii_buf_bytes += + strlen(ascii_buffer + ascii_buf_bytes); + } + } + devfs_set_file_size(devfs_handle, ascii_buf_bytes); +# ifdef CONFIG_PROC_FS + if (proc_root_mtrr) + proc_root_mtrr->size = ascii_buf_bytes; +# endif /* CONFIG_PROC_FS */ +} + +static int __init mtrr_if_init(void) +{ + int max = num_var_ranges; + + if ((ascii_buffer = kmalloc(max * LINE_SIZE, GFP_KERNEL)) == NULL) { + printk("mtrr: could not allocate\n"); + return -ENOMEM; + } + ascii_buf_bytes = 0; + compute_ascii(); +#ifdef CONFIG_PROC_FS + proc_root_mtrr = + create_proc_entry("mtrr", S_IWUSR | S_IRUGO, &proc_root); + if (proc_root_mtrr) { + proc_root_mtrr->owner = THIS_MODULE; + proc_root_mtrr->proc_fops = &mtrr_fops; + } +#endif +#ifdef USERSPACE_INTERFACE + devfs_handle = devfs_register(NULL, "cpu/mtrr", DEVFS_FL_DEFAULT, 0, 0, + S_IFREG | S_IRUGO | S_IWUSR, + &mtrr_fops, NULL); +#endif + return 0; +} + +arch_initcall(mtrr_if_init); diff --git a/arch/i386/kernel/cpu/mtrr/main.c b/arch/i386/kernel/cpu/mtrr/main.c new file mode 100644 index 000000000000..8f0d8a3c8bd7 --- /dev/null +++ b/arch/i386/kernel/cpu/mtrr/main.c @@ -0,0 +1,628 @@ +/* Generic MTRR (Memory Type Range Register) driver. 
+ + Copyright (C) 1997-2000 Richard Gooch + Copyright (c) 2002 Patrick Mochel + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with this library; if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + Richard Gooch may be reached by email at rgooch@atnf.csiro.au + The postal address is: + Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia. + + Source: "Pentium Pro Family Developer's Manual, Volume 3: + Operating System Writer's Guide" (Intel document number 242692), + section 11.11.7 + + This was cleaned and made readable by Patrick Mochel + on 6-7 March 2002. + Source: Intel Architecture Software Developers Manual, Volume 3: + System Programming Guide; Section 9.11. (1997 edition - PPro). 
+*/ + +#include +#include +#include +#include + +#define MTRR_NEED_STRINGS +#include + +#include +#include +#include +#include "mtrr.h" + +#define MTRR_VERSION "2.0 (20020519)" + +u32 num_var_ranges = 0; + +unsigned int *usage_table; +static DECLARE_MUTEX(main_lock); + +u32 size_or_mask, size_and_mask; + +static struct mtrr_ops * mtrr_ops[X86_VENDOR_NUM] = {}; +struct mtrr_ops * mtrr_if = NULL; + +__initdata char *mtrr_if_name[] = { + "none", "Intel", "AMD K6", "Cyrix ARR", "Centaur MCR" +}; + +static void set_mtrr(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type); + +static unsigned int arr3_protected; + +void set_mtrr_ops(struct mtrr_ops * ops) +{ + if (ops->vendor && ops->vendor < X86_VENDOR_NUM) + mtrr_ops[ops->vendor] = ops; +} + +/* Returns non-zero if we have the write-combining memory type */ +static int have_wrcomb(void) +{ + struct pci_dev *dev = NULL; + + /* WTF is this? + * Someone, please shoot me. + */ + + /* ServerWorks LE chipsets have problems with write-combining + Don't allow it and leave room for other chipsets to be tagged */ + + if ((dev = pci_find_class(PCI_CLASS_BRIDGE_HOST << 8, NULL)) != NULL) { + if ((dev->vendor == PCI_VENDOR_ID_SERVERWORKS) && + (dev->device == PCI_DEVICE_ID_SERVERWORKS_LE)) { + printk(KERN_INFO + "mtrr: Serverworks LE detected. Write-combining disabled.\n"); + return 0; + } + } + return (mtrr_if->have_wrcomb ? mtrr_if->have_wrcomb() : 0); +} + +/* This function returns the number of variable MTRRs */ +void __init set_num_var_ranges(void) +{ + unsigned long config = 0, dummy; + + if (use_intel()) { + rdmsr(MTRRcap_MSR, config, dummy); + } else if (is_cpu(AMD)) + config = 2; + else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) + config = 8; + num_var_ranges = config & 0xff; +} + +static char * attrib_to_str(int x) +{ + return (x <= 6) ? 
mtrr_strings[x] : "?"; +} + +static void init_table(void) +{ + int i, max; + + max = num_var_ranges; + if ((usage_table = kmalloc(max * sizeof *usage_table, GFP_KERNEL)) + == NULL) { + printk("mtrr: could not allocate\n"); + return; + } + for (i = 0; i < max; i++) + usage_table[i] = 1; +#ifdef USERSPACE_INTERFACE + if ((ascii_buffer = kmalloc(max * LINE_SIZE, GFP_KERNEL)) == NULL) { + printk("mtrr: could not allocate\n"); + return; + } + ascii_buf_bytes = 0; + compute_ascii(); +#endif +} + +struct set_mtrr_data { + atomic_t count; + atomic_t gate; + unsigned long smp_base; + unsigned long smp_size; + unsigned int smp_reg; + mtrr_type smp_type; +}; + +#ifdef CONFIG_SMP + +static void ipi_handler(void *info) +/* [SUMMARY] Synchronisation handler. Executed by "other" CPUs. + [RETURNS] Nothing. +*/ +{ + struct set_mtrr_data *data = info; + unsigned long flags; + + local_irq_save(flags); + + atomic_dec(&data->count); + while(!atomic_read(&data->gate)) { + cpu_relax(); + barrier(); + } + + /* The master has cleared me to execute */ + mtrr_if->set(data->smp_reg, data->smp_base, + data->smp_size, data->smp_type); + + atomic_dec(&data->count); + while(atomic_read(&data->gate)) { + cpu_relax(); + barrier(); + } + local_irq_restore(flags); +} + +#endif + +/** + * set_mtrr - update mtrrs on all processors + * @reg: mtrr in question + * @base: mtrr base + * @size: mtrr size + * @type: mtrr type + * + * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: + * + * 1. Send IPI to do the following: + * 2. Disable Interrupts + * 3. Wait for all procs to do so + * 4. Enter no-fill cache mode + * 5. Flush caches + * 6. Clear PGE bit + * 7. Flush all TLBs + * 8. Disable all range registers + * 9. Update the MTRRs + * 10. Enable all range registers + * 11. Flush all TLBs and caches again + * 12. Enter normal cache mode and reenable caching + * 13. Set PGE + * 14. Wait for buddies to catch up + * 15. Enable interrupts. + * + * What does that mean for us? 
Well, first we set data.count to the number + * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait + * until it hits 0 and proceed. We set the data.gate flag and reset data.count. + * Meanwhile, they are waiting for that flag to be set. Once it's set, each + * CPU goes through the transition of updating MTRRs. The CPU vendors may each do it + * differently, so we call mtrr_if->set() callback and let them take care of it. + * When they're done, they again decrement data->count and wait for data.gate to + * be reset. + * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag. + * Everyone then enables interrupts and we all continue on. + * + * Note that the mechanism is the same for UP systems, too; all the SMP stuff + * becomes nops. + */ +static void set_mtrr(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type) +{ + struct set_mtrr_data data; + unsigned long flags; + + data.smp_reg = reg; + data.smp_base = base; + data.smp_size = size; + data.smp_type = type; + atomic_set(&data.count, num_booting_cpus() - 1); + atomic_set(&data.gate,0); + + /* Start the ball rolling on other CPUs */ + if (smp_call_function(ipi_handler, &data, 1, 0) != 0) + panic("mtrr: timed out waiting for other CPUs\n"); + + local_irq_save(flags); + + while(atomic_read(&data.count)) { + cpu_relax(); + barrier(); + } + /* ok, reset count and toggle gate */ + atomic_set(&data.count, num_booting_cpus() - 1); + atomic_set(&data.gate,1); + + /* do our MTRR business */ + mtrr_if->set(reg,base,size,type); + + /* wait for the others */ + while(atomic_read(&data.count)) { + cpu_relax(); + barrier(); + } + local_irq_restore(flags); + atomic_set(&data.gate,0); +} + +/** + * mtrr_add_page - Add a memory type region + * @base: Physical base address of region in pages (4 KB) + * @size: Physical size of region in pages (4 KB) + * @type: Type of MTRR desired + * @increment: If this is true do usage counting on the region + * + * Memory type 
region registers control the caching on newer Intel and + * non Intel processors. This function allows drivers to request an + * MTRR is added. The details and hardware specifics of each processor's + * implementation are hidden from the caller, but nevertheless the + * caller should expect to need to provide a power of two size on an + * equivalent power of two boundary. + * + * If the region cannot be added either because all regions are in use + * or the CPU cannot support it a negative value is returned. On success + * the register number for this entry is returned, but should be treated + * as a cookie only. + * + * On a multiprocessor machine the changes are made to all processors. + * This is required on x86 by the Intel processors. + * + * The available types are + * + * %MTRR_TYPE_UNCACHABLE - No caching + * + * %MTRR_TYPE_WRBACK - Write data back in bursts whenever + * + * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts + * + * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes + * + * BUGS: Needs a quiet flag for the cases where drivers do not mind + * failures and do not wish system log messages to be sent. 
+ */ + +int mtrr_add_page(unsigned long base, unsigned long size, + unsigned int type, char increment) +{ + int i; + mtrr_type ltype; + unsigned long lbase, lsize; + int error; + + if (!mtrr_if) + return -ENXIO; + + if ((error = mtrr_if->validate_add_page(base,size,type))) + return error; + + if (type >= MTRR_NUM_TYPES) { + printk("mtrr: type: %u illegal\n", type); + return -EINVAL; + } + + /* If the type is WC, check that this processor supports it */ + if ((type == MTRR_TYPE_WRCOMB) && !have_wrcomb()) { + printk(KERN_WARNING + "mtrr: your processor doesn't support write-combining\n"); + return -ENOSYS; + } + + if (base & size_or_mask || size & size_or_mask) { + printk("mtrr: base or size exceeds the MTRR width\n"); + return -EINVAL; + } + + error = -EINVAL; + + /* Search for existing MTRR */ + down(&main_lock); + for (i = 0; i < num_var_ranges; ++i) { + mtrr_if->get(i, &lbase, &lsize, &ltype); + if (base >= lbase + lsize) + continue; + if ((base < lbase) && (base + size <= lbase)) + continue; + /* At this point we know there is some kind of overlap/enclosure */ + if ((base < lbase) || (base + size > lbase + lsize)) { + printk(KERN_WARNING + "mtrr: 0x%lx000,0x%lx000 overlaps existing" + " 0x%lx000,0x%lx000\n", base, size, lbase, + lsize); + goto out; + } + /* New region is enclosed by an existing region */ + if (ltype != type) { + if (type == MTRR_TYPE_UNCACHABLE) + continue; + printk ("mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", + base, size, attrib_to_str(ltype), + attrib_to_str(type)); + goto out; + } + if (increment) + ++usage_table[i]; + compute_ascii(); + error = i; + goto out; + } + /* Search for an empty MTRR */ + i = mtrr_if->get_free_region(base, size); + if (i >= 0) { + set_mtrr(i, base, size, type); + usage_table[i] = 1; + compute_ascii(); + } else + printk("mtrr: no more MTRRs available\n"); + error = i; + out: + up(&main_lock); + return error; +} + +/** + * mtrr_add - Add a memory type region + * @base: Physical base address of region + * 
@size: Physical size of region + * @type: Type of MTRR desired + * @increment: If this is true do usage counting on the region + * + * Memory type region registers control the caching on newer Intel and + * non Intel processors. This function allows drivers to request an + * MTRR is added. The details and hardware specifics of each processor's + * implementation are hidden from the caller, but nevertheless the + * caller should expect to need to provide a power of two size on an + * equivalent power of two boundary. + * + * If the region cannot be added either because all regions are in use + * or the CPU cannot support it a negative value is returned. On success + * the register number for this entry is returned, but should be treated + * as a cookie only. + * + * On a multiprocessor machine the changes are made to all processors. + * This is required on x86 by the Intel processors. + * + * The available types are + * + * %MTRR_TYPE_UNCACHABLE - No caching + * + * %MTRR_TYPE_WRBACK - Write data back in bursts whenever + * + * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts + * + * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes + * + * BUGS: Needs a quiet flag for the cases where drivers do not mind + * failures and do not wish system log messages to be sent. + */ + +int +mtrr_add(unsigned long base, unsigned long size, unsigned int type, + char increment) +{ + if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { + printk("mtrr: size and base must be multiples of 4 kiB\n"); + printk("mtrr: size: 0x%lx base: 0x%lx\n", size, base); + return -EINVAL; + } + return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, + increment); +} + +/** + * mtrr_del_page - delete a memory type region + * @reg: Register returned by mtrr_add + * @base: Physical base address + * @size: Size of region + * + * If register is supplied then base and size are ignored. This is + * how drivers should call it. + * + * Releases an MTRR region. 
If the usage count drops to zero the + * register is freed and the region returns to default state. + * On success the register is returned, on failure a negative error + * code. + */ + +int mtrr_del_page(int reg, unsigned long base, unsigned long size) +{ + int i, max; + mtrr_type ltype; + unsigned long lbase, lsize; + int error = -EINVAL; + + if (!mtrr_if) + return -ENXIO; + + max = num_var_ranges; + down(&main_lock); + if (reg < 0) { + /* Search for existing MTRR */ + for (i = 0; i < max; ++i) { + mtrr_if->get(i, &lbase, &lsize, &ltype); + if (lbase == base && lsize == size) { + reg = i; + break; + } + } + if (reg < 0) { + printk("mtrr: no MTRR for %lx000,%lx000 found\n", base, + size); + goto out; + } + } + if (reg >= max) { + printk("mtrr: register: %d too big\n", reg); + goto out; + } + if (is_cpu(CYRIX) && !use_intel()) { + if ((reg == 3) && arr3_protected) { + printk("mtrr: ARR3 cannot be changed\n"); + goto out; + } + } + mtrr_if->get(reg, &lbase, &lsize, &ltype); + if (lsize < 1) { + printk("mtrr: MTRR %d not used\n", reg); + goto out; + } + if (usage_table[reg] < 1) { + printk("mtrr: reg: %d has count=0\n", reg); + goto out; + } + if (--usage_table[reg] < 1) + set_mtrr(reg, 0, 0, 0); + compute_ascii(); + error = reg; + out: + up(&main_lock); + return error; +} +/** + * mtrr_del - delete a memory type region + * @reg: Register returned by mtrr_add + * @base: Physical base address + * @size: Size of region + * + * If register is supplied then base and size are ignored. This is + * how drivers should call it. + * + * Releases an MTRR region. If the usage count drops to zero the + * register is freed and the region returns to default state. + * On success the register is returned, on failure a negative error + * code. 
+ */ + +int +mtrr_del(int reg, unsigned long base, unsigned long size) +{ + if ((base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1))) { + printk("mtrr: size and base must be multiples of 4 kiB\n"); + printk("mtrr: size: 0x%lx base: 0x%lx\n", size, base); + return -EINVAL; + } + return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); +} + +EXPORT_SYMBOL(mtrr_add); +EXPORT_SYMBOL(mtrr_del); + +/* HACK ALERT! + * These should be called implicitly, but we can't yet until all the initcall + * stuff is done... + */ +extern void amd_init_mtrr(void); +extern void cyrix_init_mtrr(void); +extern void centaur_init_mtrr(void); + +static void __init init_ifs(void) +{ + amd_init_mtrr(); + cyrix_init_mtrr(); + centaur_init_mtrr(); +} + +/** + * mtrr_init - initialize mtrrs on the boot CPU + * + * This needs to be called early; before any of the other CPUs are + * initialized (i.e. before smp_init()). + * + */ +int __init mtrr_init(void) +{ + init_ifs(); + + if ( cpu_has_mtrr ) { + mtrr_if = &generic_mtrr_ops; + size_or_mask = 0xff000000; /* 36 bits */ + size_and_mask = 0x00f00000; + + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + /* The original Athlon docs said that + total addressable memory is 44 bits wide. + It was not really clear whether its MTRRs + follow this or not. (Read: 44 or 36 bits). + However, "x86-64_overview.pdf" explicitly + states that "previous implementations support + 36 bit MTRRs" and also provides a way to + query the width (in bits) of the physical + addressable memory on the Hammer family. 
+ */ + if (boot_cpu_data.x86 == 7 + && (cpuid_eax(0x80000000) >= 0x80000008)) { + u32 phys_addr; + phys_addr = cpuid_eax(0x80000008) & 0xff; + size_or_mask = + ~((1 << (phys_addr - PAGE_SHIFT)) - 1); + size_and_mask = ~size_or_mask & 0xfff00000; + } + /* Athlon MTRRs use an Intel-compatible interface for + * getting and setting */ + break; + case X86_VENDOR_CENTAUR: + if (boot_cpu_data.x86 == 6) { + /* VIA Cyrix family have Intel style MTRRs, but don't support PAE */ + size_or_mask = 0xfff00000; /* 32 bits */ + size_and_mask = 0; + } + break; + + default: + break; + } + } else { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + if ( cpu_has_k6_mtrr ) { + /* Pre-Athlon (K6) AMD CPU MTRRs */ + mtrr_if = mtrr_ops[X86_VENDOR_AMD]; + size_or_mask = 0xfff00000; /* 32 bits */ + size_and_mask = 0; + } + break; + case X86_VENDOR_CENTAUR: + if ( cpu_has_centaur_mcr ) { + mtrr_if = mtrr_ops[X86_VENDOR_CENTAUR]; + size_or_mask = 0xfff00000; /* 32 bits */ + size_and_mask = 0; + } + break; + case X86_VENDOR_CYRIX: + if ( cpu_has_cyrix_arr ) { + mtrr_if = mtrr_ops[X86_VENDOR_CYRIX]; + size_or_mask = 0xfff00000; /* 32 bits */ + size_and_mask = 0; + } + break; + default: + break; + } + } + if (mtrr_if) { + set_num_var_ranges(); + if (use_intel()) { + /* Only for Intel MTRRs */ + get_mtrr_state(); + } + init_table(); + } +#if 0 + printk("mtrr: v%s Richard Gooch (rgooch@atnf.csiro.au)\n" + "mtrr: detected mtrr type: %s\n", + MTRR_VERSION, mtrr_if_name[mtrr_if]); +#endif + return mtrr_if ? -ENXIO : 0; +} + +//subsys_initcall(mtrr_init); + diff --git a/arch/i386/kernel/cpu/mtrr/mtrr.h b/arch/i386/kernel/cpu/mtrr/mtrr.h new file mode 100644 index 000000000000..5c1aa8ab552e --- /dev/null +++ b/arch/i386/kernel/cpu/mtrr/mtrr.h @@ -0,0 +1,98 @@ +/* + * local mtrr defines. 
+ */ + +#ifndef TRUE +#define TRUE 1 +#define FALSE 0 +#endif + +#define MTRRcap_MSR 0x0fe +#define MTRRdefType_MSR 0x2ff + +#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg)) +#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1) + +#define NUM_FIXED_RANGES 88 +#define MTRRfix64K_00000_MSR 0x250 +#define MTRRfix16K_80000_MSR 0x258 +#define MTRRfix16K_A0000_MSR 0x259 +#define MTRRfix4K_C0000_MSR 0x268 +#define MTRRfix4K_C8000_MSR 0x269 +#define MTRRfix4K_D0000_MSR 0x26a +#define MTRRfix4K_D8000_MSR 0x26b +#define MTRRfix4K_E0000_MSR 0x26c +#define MTRRfix4K_E8000_MSR 0x26d +#define MTRRfix4K_F0000_MSR 0x26e +#define MTRRfix4K_F8000_MSR 0x26f + +#define MTRR_CHANGE_MASK_FIXED 0x01 +#define MTRR_CHANGE_MASK_VARIABLE 0x02 +#define MTRR_CHANGE_MASK_DEFTYPE 0x04 + +/* In the Intel processor's MTRR interface, the MTRR type is always held in + an 8 bit field: */ +typedef u8 mtrr_type; + +struct mtrr_ops { + u32 vendor; + u32 use_intel_if; + void (*init)(void); + void (*init_secondary)(void); + void (*set)(unsigned int reg, unsigned long base, + unsigned long size, mtrr_type type); + void (*get)(unsigned int reg, unsigned long *base, + unsigned long *size, mtrr_type * type); + int (*get_free_region) (unsigned long base, unsigned long size); + + int (*validate_add_page)(unsigned long base, unsigned long size, + unsigned int type); + int (*have_wrcomb)(void); +}; + +extern int generic_get_free_region(unsigned long base, unsigned long size); +extern void generic_init_secondary(void); +extern int generic_validate_add_page(unsigned long base, unsigned long size, + unsigned int type); + +extern struct mtrr_ops generic_mtrr_ops; + +extern int generic_have_wrcomb(void); +extern int positive_have_wrcomb(void); + +/* library functions for processor-specific routines */ +struct set_mtrr_context { + unsigned long flags; + unsigned long deftype_lo; + unsigned long deftype_hi; + unsigned long cr4val; + unsigned long ccr3; +}; + +struct mtrr_var_range { + unsigned long base_lo; + unsigned 
long base_hi; + unsigned long mask_lo; + unsigned long mask_hi; +}; + +void set_mtrr_done(struct set_mtrr_context *ctxt); +void set_mtrr_cache_disable(struct set_mtrr_context *ctxt); +void set_mtrr_prepare_save(struct set_mtrr_context *ctxt); + +void get_mtrr_state(void); + +extern void set_mtrr_ops(struct mtrr_ops * ops); + +/* Don't even ask... */ +extern void compute_ascii(void); + +extern u32 size_or_mask, size_and_mask; +extern struct mtrr_ops * mtrr_if; + +#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) +#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1) + +extern unsigned int num_var_ranges; + +extern char * mtrr_if_name[]; diff --git a/arch/i386/kernel/cpu/mtrr/state.c b/arch/i386/kernel/cpu/mtrr/state.c new file mode 100644 index 000000000000..a2eef500f7bc --- /dev/null +++ b/arch/i386/kernel/cpu/mtrr/state.c @@ -0,0 +1,338 @@ +#include +#include +#include +#include +#include +#include +#include "mtrr.h" + +struct mtrr_state { + struct mtrr_var_range *var_ranges; + mtrr_type fixed_ranges[NUM_FIXED_RANGES]; + unsigned char enabled; + mtrr_type def_type; +}; + +static unsigned long smp_changes_mask __initdata = 0; +struct mtrr_state mtrr_state = {}; + +static int __init set_fixed_ranges(mtrr_type * frs) +{ + unsigned long *p = (unsigned long *) frs; + int changed = FALSE; + int i; + unsigned long lo, hi; + + rdmsr(MTRRfix64K_00000_MSR, lo, hi); + if (p[0] != lo || p[1] != hi) { + wrmsr(MTRRfix64K_00000_MSR, p[0], p[1]); + changed = TRUE; + } + + for (i = 0; i < 2; i++) { + rdmsr(MTRRfix16K_80000_MSR + i, lo, hi); + if (p[2 + i * 2] != lo || p[3 + i * 2] != hi) { + wrmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], + p[3 + i * 2]); + changed = TRUE; + } + } + + for (i = 0; i < 8; i++) { + rdmsr(MTRRfix4K_C0000_MSR + i, lo, hi); + if (p[6 + i * 2] != lo || p[7 + i * 2] != hi) { + wrmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], + p[7 + i * 2]); + changed = TRUE; + } + } + return changed; +} + +/* Set the MSR pair relating to a var range. 
Returns TRUE if + changes are made */ +static int __init set_mtrr_var_ranges(unsigned int index, struct mtrr_var_range *vr) +{ + unsigned int lo, hi; + int changed = FALSE; + + rdmsr(MTRRphysBase_MSR(index), lo, hi); + if ((vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL) + || (vr->base_hi & 0xfUL) != (hi & 0xfUL)) { + wrmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi); + changed = TRUE; + } + + rdmsr(MTRRphysMask_MSR(index), lo, hi); + + if ((vr->mask_lo & 0xfffff800UL) != (lo & 0xfffff800UL) + || (vr->mask_hi & 0xfUL) != (hi & 0xfUL)) { + wrmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); + changed = TRUE; + } + return changed; +} + +static unsigned long set_mtrr_state(u32 deftype_lo, u32 deftype_hi) +/* [SUMMARY] Set the MTRR state for this CPU. + The MTRR state information to read. + Some relevant CPU context. + [NOTE] The CPU must already be in a safe state for MTRR changes. + [RETURNS] 0 if no changes made, else a mask indication what was changed. +*/ +{ + unsigned int i; + unsigned long change_mask = 0; + + for (i = 0; i < num_var_ranges; i++) + if (set_mtrr_var_ranges(i, &mtrr_state.var_ranges[i])) + change_mask |= MTRR_CHANGE_MASK_VARIABLE; + + if (set_fixed_ranges(mtrr_state.fixed_ranges)) + change_mask |= MTRR_CHANGE_MASK_FIXED; + + /* Set_mtrr_restore restores the old value of MTRRdefType, + so to set it we fiddle with the saved value */ + if ((deftype_lo & 0xff) != mtrr_state.def_type + || ((deftype_lo & 0xc00) >> 10) != mtrr_state.enabled) { + deftype_lo |= (mtrr_state.def_type | mtrr_state.enabled << 10); + change_mask |= MTRR_CHANGE_MASK_DEFTYPE; + } + + return change_mask; +} + + +/* Some BIOS's are fucked and don't set all MTRRs the same! 
*/ +static void __init mtrr_state_warn(void) +{ + unsigned long mask = smp_changes_mask; + if (!mask) + return; + if (mask & MTRR_CHANGE_MASK_FIXED) + printk + ("mtrr: your CPUs had inconsistent fixed MTRR settings\n"); + if (mask & MTRR_CHANGE_MASK_VARIABLE) + printk + ("mtrr: your CPUs had inconsistent variable MTRR settings\n"); + if (mask & MTRR_CHANGE_MASK_DEFTYPE) + printk + ("mtrr: your CPUs had inconsistent MTRRdefType settings\n"); + printk("mtrr: probably your BIOS does not setup all CPUs\n"); +} + +/* Free resources associated with a struct mtrr_state */ +static void __init finalize_mtrr_state(void) +{ + if (mtrr_state.var_ranges) + kfree(mtrr_state.var_ranges); + mtrr_state.var_ranges = NULL; +} + +/* Get the MSR pair relating to a var range */ +static void __init +get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) +{ + rdmsr(MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi); + rdmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); +} + +static void __init +get_fixed_ranges(mtrr_type * frs) +{ + unsigned long *p = (unsigned long *) frs; + int i; + + rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]); + + for (i = 0; i < 2; i++) + rdmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], p[3 + i * 2]); + for (i = 0; i < 8; i++) + rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]); +} + +/* Grab all of the MTRR state for this CPU into *state */ +void get_mtrr_state(void) +{ + unsigned int i; + struct mtrr_var_range *vrs; + unsigned long lo, dummy; + + if (!mtrr_state.var_ranges) { + mtrr_state.var_ranges = kmalloc(num_var_ranges * sizeof (struct mtrr_var_range), + GFP_KERNEL); + if (!mtrr_state.var_ranges) + return; + } + vrs = mtrr_state.var_ranges; + + for (i = 0; i < num_var_ranges; i++) + get_mtrr_var_range(i, &vrs[i]); + get_fixed_ranges(mtrr_state.fixed_ranges); + + rdmsr(MTRRdefType_MSR, lo, dummy); + mtrr_state.def_type = (lo & 0xff); + mtrr_state.enabled = (lo & 0xc00) >> 10; +} + + +/* Put the processor into a state where MTRRs can be safely 
set */ +void set_mtrr_prepare_save(struct set_mtrr_context *ctxt) +{ + unsigned int cr0; + + /* Disable interrupts locally */ + local_irq_save(ctxt->flags); + + if (use_intel() || is_cpu(CYRIX)) { + + /* Save value of CR4 and clear Page Global Enable (bit 7) */ + if ( cpu_has_pge ) { + ctxt->cr4val = read_cr4(); + write_cr4(ctxt->cr4val & (unsigned char) ~(1 << 7)); + } + + /* Disable and flush caches. Note that wbinvd flushes the TLBs as + a side-effect */ + cr0 = read_cr0() | 0x40000000; + wbinvd(); + write_cr0(cr0); + wbinvd(); + + if (use_intel()) + /* Save MTRR state */ + rdmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); + else + /* Cyrix ARRs - everything else were excluded at the top */ + ctxt->ccr3 = getCx86(CX86_CCR3); + } +} + +void set_mtrr_cache_disable(struct set_mtrr_context *ctxt) +{ + if (use_intel()) + /* Disable MTRRs, and set the default type to uncached */ + wrmsr(MTRRdefType_MSR, ctxt->deftype_lo & 0xf300UL, + ctxt->deftype_hi); + else if (is_cpu(CYRIX)) + /* Cyrix ARRs - everything else were excluded at the top */ + setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10); +} + +/* Restore the processor after a set_mtrr_prepare */ +void set_mtrr_done(struct set_mtrr_context *ctxt) +{ + if (use_intel() || is_cpu(CYRIX)) { + + /* Flush caches and TLBs */ + wbinvd(); + + /* Restore MTRRdefType */ + if (use_intel()) + /* Intel (P6) standard MTRRs */ + wrmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); + else + /* Cyrix ARRs - everything else was excluded at the top */ + setCx86(CX86_CCR3, ctxt->ccr3); + + /* Enable caches */ + write_cr0(read_cr0() & 0xbfffffff); + + /* Restore value of CR4 */ + if ( cpu_has_pge ) + write_cr4(ctxt->cr4val); + } + /* Re-enable interrupts locally (if enabled previously) */ + local_irq_restore(ctxt->flags); +} + +void __init generic_init_secondary(void) +{ + u32 cr0, cr4 = 0; + u32 deftype_lo, deftype_hi; + unsigned long mask, count; + + /* Note that this is not ideal, since the cache is only flushed/disabled 
+ for this CPU while the MTRRs are changed, but changing this requires + more invasive changes to the way the kernel boots */ + + /* Save value of CR4 and clear Page Global Enable (bit 7) */ + if ( cpu_has_pge ) { + cr4 = read_cr4(); + write_cr4(cr4 & (unsigned char) ~(1 << 7)); + } + + /* Disable and flush caches. Note that wbinvd flushes the TLBs as + a side-effect */ + cr0 = read_cr0() | 0x40000000; + wbinvd(); + write_cr0(cr0); + wbinvd(); + + /* Save MTRR state */ + rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); + + /* Disable MTRRs, and set the default type to uncached */ + wrmsr(MTRRdefType_MSR, deftype_lo & 0xf300UL, deftype_hi); + + /* Actually set the state */ + mask = set_mtrr_state(deftype_lo,deftype_hi); + + /* Flush caches and TLBs */ + wbinvd(); + + /* Intel (P6) standard MTRRs */ + wrmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); + + /* Enable caches */ + write_cr0(read_cr0() & 0xbfffffff); + + /* Restore value of CR4 */ + if ( cpu_has_pge ) + write_cr4(cr4); + + /* Use the atomic bitops to update the global mask */ + for (count = 0; count < sizeof mask * 8; ++count) { + if (mask & 0x01) + set_bit(count, &smp_changes_mask); + mask >>= 1; + } +} + +/** + * mtrr_init_secondary - setup AP MTRR state + * + * Yes, this code is exactly the same as the set_mtrr code, except for the + * piece in the middle - you set all the ranges at once, instead of one + * register at a time. + * Shoot me. + */ +void __init mtrr_init_secondary_cpu(void) +{ + unsigned long flags; + + if (!mtrr_if || !mtrr_if->init_secondary) { + /* I see no MTRRs I can support in SMP mode... */ + printk("mtrr: SMP support incomplete for this vendor\n"); + return; + } + + local_irq_save(flags); + mtrr_if->init_secondary(); + local_irq_restore(flags); +} + +/** + * mtrr_final_init - finalize initialization sequence. 
+ */ +static int __init mtrr_finalize_state(void) +{ + if (use_intel()) { + finalize_mtrr_state(); + mtrr_state_warn(); + } + return 0; +} + +arch_initcall(mtrr_finalize_state); + diff --git a/arch/i386/kernel/mtrr.c b/arch/i386/kernel/mtrr.c deleted file mode 100644 index 79e925e8c59a..000000000000 --- a/arch/i386/kernel/mtrr.c +++ /dev/null @@ -1,2303 +0,0 @@ -/* Generic MTRR (Memory Type Range Register) driver. - - Copyright (C) 1997-2000 Richard Gooch - - This library is free software; you can redistribute it and/or - modify it under the terms of the GNU Library General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later version. - - This library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Library General Public License for more details. - - You should have received a copy of the GNU Library General Public - License along with this library; if not, write to the Free - Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - - Richard Gooch may be reached by email at rgooch@atnf.csiro.au - The postal address is: - Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia. - - Source: "Pentium Pro Family Developer's Manual, Volume 3: - Operating System Writer's Guide" (Intel document number 242692), - section 11.11.7 - - ChangeLog - - Prehistory Martin Tischhäuser - Initial register-setting code (from proform-1.0). - 19971216 Richard Gooch - Original version for /proc/mtrr interface, SMP-safe. - v1.0 - 19971217 Richard Gooch - Bug fix for ioctls()'s. - Added sample code in Documentation/mtrr.txt - v1.1 - 19971218 Richard Gooch - Disallow overlapping regions. - 19971219 Jens Maurer - Register-setting fixups. - v1.2 - 19971222 Richard Gooch - Fixups for kernel 2.1.75. 
- v1.3 - 19971229 David Wragg - Register-setting fixups and conformity with Intel conventions. - 19971229 Richard Gooch - Cosmetic changes and wrote this ChangeLog ;-) - 19980106 Richard Gooch - Fixups for kernel 2.1.78. - v1.4 - 19980119 David Wragg - Included passive-release enable code (elsewhere in PCI setup). - v1.5 - 19980131 Richard Gooch - Replaced global kernel lock with private spinlock. - v1.6 - 19980201 Richard Gooch - Added wait for other CPUs to complete changes. - v1.7 - 19980202 Richard Gooch - Bug fix in definition of for UP. - v1.8 - 19980319 Richard Gooch - Fixups for kernel 2.1.90. - 19980323 Richard Gooch - Move SMP BIOS fixup before secondary CPUs call - v1.9 - 19980325 Richard Gooch - Fixed test for overlapping regions: confused by adjacent regions - 19980326 Richard Gooch - Added wbinvd in . - 19980401 Richard Gooch - Bug fix for non-SMP compilation. - 19980418 David Wragg - Fixed-MTRR synchronisation for SMP and use atomic operations - instead of spinlocks. - 19980418 Richard Gooch - Differentiate different MTRR register classes for BIOS fixup. - v1.10 - 19980419 David Wragg - Bug fix in variable MTRR synchronisation. - v1.11 - 19980419 Richard Gooch - Fixups for kernel 2.1.97. - v1.12 - 19980421 Richard Gooch - Safer synchronisation across CPUs when changing MTRRs. - v1.13 - 19980423 Richard Gooch - Bugfix for SMP systems without MTRR support. - v1.14 - 19980427 Richard Gooch - Trap calls to and on non-MTRR machines. - v1.15 - 19980427 Richard Gooch - Use atomic bitops for setting SMP change mask. - v1.16 - 19980428 Richard Gooch - Removed spurious diagnostic message. - v1.17 - 19980429 Richard Gooch - Moved register-setting macros into this file. - Moved setup code from init/main.c to i386-specific areas. - v1.18 - 19980502 Richard Gooch - Moved MTRR detection outside conditionals in . - v1.19 - 19980502 Richard Gooch - Documentation improvement: mention Pentium II and AGP. 
- v1.20 - 19980521 Richard Gooch - Only manipulate interrupt enable flag on local CPU. - Allow enclosed uncachable regions. - v1.21 - 19980611 Richard Gooch - Always define . - v1.22 - 19980901 Richard Gooch - Removed module support in order to tidy up code. - Added sanity check for / before . - Created addition queue for prior to SMP commence. - v1.23 - 19980902 Richard Gooch - Ported patch to kernel 2.1.120-pre3. - v1.24 - 19980910 Richard Gooch - Removed sanity checks and addition queue: Linus prefers an OOPS. - v1.25 - 19981001 Richard Gooch - Fixed harmless compiler warning in include/asm-i386/mtrr.h - Fixed version numbering and history for v1.23 -> v1.24. - v1.26 - 19990118 Richard Gooch - Added devfs support. - v1.27 - 19990123 Richard Gooch - Changed locking to spin with reschedule. - Made use of new . - v1.28 - 19990201 Zoltán Böszörményi - Extended the driver to be able to use Cyrix style ARRs. - 19990204 Richard Gooch - Restructured Cyrix support. - v1.29 - 19990204 Zoltán Böszörményi - Refined ARR support: enable MAPEN in set_mtrr_prepare() - and disable MAPEN in set_mtrr_done(). - 19990205 Richard Gooch - Minor cleanups. - v1.30 - 19990208 Zoltán Böszörményi - Protect plain 6x86s (and other processors without the - Page Global Enable feature) against accessing CR4 in - set_mtrr_prepare() and set_mtrr_done(). - 19990210 Richard Gooch - Turned and into function pointers. - v1.31 - 19990212 Zoltán Böszörményi - Major rewrite of cyrix_arr_init(): do not touch ARRs, - leave them as the BIOS have set them up. - Enable usage of all 8 ARRs. - Avoid multiplications by 3 everywhere and other - code clean ups/speed ups. - 19990213 Zoltán Böszörményi - Set up other Cyrix processors identical to the boot cpu. - Since Cyrix don't support Intel APIC, this is l'art pour l'art. - Weigh ARRs by size: - If size <= 32M is given, set up ARR# we were given. - If size > 32M is given, set up ARR7 only if it is free, - fail otherwise. 
- 19990214 Zoltán Böszörményi - Also check for size >= 256K if we are to set up ARR7, - mtrr_add() returns the value it gets from set_mtrr() - 19990218 Zoltán Böszörményi - Remove Cyrix "coma bug" workaround from here. - Moved to linux/arch/i386/kernel/setup.c and - linux/include/asm-i386/bugs.h - 19990228 Richard Gooch - Added MTRRIOC_KILL_ENTRY ioctl(2) - Trap for counter underflow in . - Trap for 4 MiB aligned regions for PPro, stepping <= 7. - 19990301 Richard Gooch - Created hook. - 19990305 Richard Gooch - Temporarily disable AMD support now MTRR capability flag is set. - v1.32 - 19990308 Zoltán Böszörményi - Adjust my changes (19990212-19990218) to Richard Gooch's - latest changes. (19990228-19990305) - v1.33 - 19990309 Richard Gooch - Fixed typo in message. - 19990310 Richard Gooch - Support K6-II/III based on Alan Cox's patches. - v1.34 - 19990511 Bart Hartgers - Support Centaur C6 MCR's. - 19990512 Richard Gooch - Minor cleanups. - v1.35 - 19990707 Zoltán Böszörményi - Check whether ARR3 is protected in cyrix_get_free_region() - and mtrr_del(). The code won't attempt to delete or change it - from now on if the BIOS protected ARR3. It silently skips ARR3 - in cyrix_get_free_region() or returns with an error code from - mtrr_del(). - 19990711 Zoltán Böszörményi - Reset some bits in the CCRs in cyrix_arr_init() to disable SMM - if ARR3 isn't protected. This is needed because if SMM is active - and ARR3 isn't protected then deleting and setting ARR3 again - may lock up the processor. With SMM entirely disabled, it does - not happen. - 19990812 Zoltán Böszörményi - Rearrange switch() statements so the driver accomodates to - the fact that the AMD Athlon handles its MTRRs the same way - as Intel does. - 19990814 Zoltán Böszörményi - Double check for Intel in mtrr_add()'s big switch() because - that revision check is only valid for Intel CPUs. - 19990819 Alan Cox - Tested Zoltan's changes on a pre production Athlon - 100% - success. 
- 19991008 Manfred Spraul - replaced spin_lock_reschedule() with a normal semaphore. - v1.36 - 20000221 Richard Gooch - Compile fix if procfs and devfs not enabled. - Formatting changes. - v1.37 - 20001109 H. Peter Anvin - Use the new centralized CPU feature detects. - - v1.38 - 20010309 Dave Jones - Add support for Cyrix III. - - v1.39 - 20010312 Dave Jones - Ugh, I broke AMD support. - Reworked fix by Troels Walsted Hansen - - v1.40 - 20010327 Dave Jones - Adapted Cyrix III support to include VIA C3. - -*/ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#define MTRR_NEED_STRINGS -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#define MTRR_VERSION "1.40 (20010327)" - -#define TRUE 1 -#define FALSE 0 - -/* - * The code assumes all processors support the same MTRR - * interface. This is generally a good assumption, but could - * potentially be a problem. 
- */ -enum mtrr_if_type { - MTRR_IF_NONE, /* No MTRRs supported */ - MTRR_IF_INTEL, /* Intel (P6) standard MTRRs */ - MTRR_IF_AMD_K6, /* AMD pre-Athlon MTRRs */ - MTRR_IF_CYRIX_ARR, /* Cyrix ARRs */ - MTRR_IF_CENTAUR_MCR, /* Centaur MCRs */ -} mtrr_if = MTRR_IF_NONE; - -static __initdata char *mtrr_if_name[] = { - "none", "Intel", "AMD K6", "Cyrix ARR", "Centaur MCR" -}; - -#define MTRRcap_MSR 0x0fe -#define MTRRdefType_MSR 0x2ff - -#define MTRRphysBase_MSR(reg) (0x200 + 2 * (reg)) -#define MTRRphysMask_MSR(reg) (0x200 + 2 * (reg) + 1) - -#define NUM_FIXED_RANGES 88 -#define MTRRfix64K_00000_MSR 0x250 -#define MTRRfix16K_80000_MSR 0x258 -#define MTRRfix16K_A0000_MSR 0x259 -#define MTRRfix4K_C0000_MSR 0x268 -#define MTRRfix4K_C8000_MSR 0x269 -#define MTRRfix4K_D0000_MSR 0x26a -#define MTRRfix4K_D8000_MSR 0x26b -#define MTRRfix4K_E0000_MSR 0x26c -#define MTRRfix4K_E8000_MSR 0x26d -#define MTRRfix4K_F0000_MSR 0x26e -#define MTRRfix4K_F8000_MSR 0x26f - -#ifdef CONFIG_SMP -# define MTRR_CHANGE_MASK_FIXED 0x01 -# define MTRR_CHANGE_MASK_VARIABLE 0x02 -# define MTRR_CHANGE_MASK_DEFTYPE 0x04 -#endif - -/* In the Intel processor's MTRR interface, the MTRR type is always held in - an 8 bit field: */ -typedef u8 mtrr_type; - -#define LINE_SIZE 80 -#define JIFFIE_TIMEOUT 100 - -#ifdef CONFIG_SMP -# define set_mtrr(reg,base,size,type) set_mtrr_smp (reg, base, size, type) -#else -# define set_mtrr(reg,base,size,type) (*set_mtrr_up) (reg, base, size, type, \ - TRUE) -#endif - -#if defined(CONFIG_PROC_FS) || defined(CONFIG_DEVFS_FS) -# define USERSPACE_INTERFACE -#endif - -#ifndef USERSPACE_INTERFACE -# define compute_ascii() while (0) -#endif - -#ifdef USERSPACE_INTERFACE -static char *ascii_buffer; -static unsigned int ascii_buf_bytes; -#endif -static unsigned int *usage_table; -static DECLARE_MUTEX(main_lock); - -/* Private functions */ -#ifdef USERSPACE_INTERFACE -static void compute_ascii (void); -#endif - - -struct set_mtrr_context -{ - unsigned long flags; - unsigned long 
deftype_lo; - unsigned long deftype_hi; - unsigned long cr4val; - unsigned long ccr3; -}; - -static int arr3_protected; - -/* Put the processor into a state where MTRRs can be safely set */ -static void set_mtrr_prepare_save (struct set_mtrr_context *ctxt) -{ - /* Disable interrupts locally */ - local_irq_save(ctxt->flags); - - if ( mtrr_if != MTRR_IF_INTEL && mtrr_if != MTRR_IF_CYRIX_ARR ) - return; - - /* Save value of CR4 and clear Page Global Enable (bit 7) */ - if ( cpu_has_pge ) { - ctxt->cr4val = read_cr4(); - write_cr4(ctxt->cr4val & (unsigned char) ~(1<<7)); - } - - /* Disable and flush caches. Note that wbinvd flushes the TLBs as - a side-effect */ - { - unsigned int cr0 = read_cr0() | 0x40000000; - wbinvd(); - write_cr0( cr0 ); - wbinvd(); - } - - if ( mtrr_if == MTRR_IF_INTEL ) { - /* Save MTRR state */ - rdmsr (MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); - } else { - /* Cyrix ARRs - everything else were excluded at the top */ - ctxt->ccr3 = getCx86 (CX86_CCR3); - } -} /* End Function set_mtrr_prepare_save */ - -static void set_mtrr_cache_disable (struct set_mtrr_context *ctxt) -{ - if ( mtrr_if != MTRR_IF_INTEL && mtrr_if != MTRR_IF_CYRIX_ARR ) - return; - - if ( mtrr_if == MTRR_IF_INTEL ) { - /* Disable MTRRs, and set the default type to uncached */ - wrmsr (MTRRdefType_MSR, ctxt->deftype_lo & 0xf300UL, ctxt->deftype_hi); - } else { - /* Cyrix ARRs - everything else were excluded at the top */ - setCx86 (CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10); - } -} /* End Function set_mtrr_cache_disable */ - -/* Restore the processor after a set_mtrr_prepare */ -static void set_mtrr_done (struct set_mtrr_context *ctxt) -{ - if ( mtrr_if != MTRR_IF_INTEL && mtrr_if != MTRR_IF_CYRIX_ARR ) { - local_irq_restore (ctxt->flags); - return; - } - - /* Flush caches and TLBs */ - wbinvd(); - - /* Restore MTRRdefType */ - if ( mtrr_if == MTRR_IF_INTEL ) { - /* Intel (P6) standard MTRRs */ - wrmsr (MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); - } else { - /* 
Cyrix ARRs - everything else was excluded at the top */ - setCx86 (CX86_CCR3, ctxt->ccr3); - } - - /* Enable caches */ - write_cr0( read_cr0() & 0xbfffffff ); - - /* Restore value of CR4 */ - if ( cpu_has_pge ) - write_cr4(ctxt->cr4val); - - /* Re-enable interrupts locally (if enabled previously) */ - local_irq_restore (ctxt->flags); -} /* End Function set_mtrr_done */ - -/* This function returns the number of variable MTRRs */ -static unsigned int get_num_var_ranges (void) -{ - unsigned long config, dummy; - - switch ( mtrr_if ) - { - case MTRR_IF_INTEL: - rdmsr (MTRRcap_MSR, config, dummy); - return (config & 0xff); - case MTRR_IF_AMD_K6: - return 2; - case MTRR_IF_CYRIX_ARR: - return 8; - case MTRR_IF_CENTAUR_MCR: - return 8; - default: - return 0; - } -} /* End Function get_num_var_ranges */ - -/* Returns non-zero if we have the write-combining memory type */ -static int have_wrcomb (void) -{ - unsigned long config, dummy; - struct pci_dev *dev = NULL; - - /* ServerWorks LE chipsets have problems with write-combining - Don't allow it and leave room for other chipsets to be tagged */ - - if ((dev = pci_find_class(PCI_CLASS_BRIDGE_HOST << 8, NULL)) != NULL) { - if ((dev->vendor == PCI_VENDOR_ID_SERVERWORKS) && - (dev->device == PCI_DEVICE_ID_SERVERWORKS_LE)) { - printk (KERN_INFO "mtrr: Serverworks LE detected. Write-combining disabled.\n"); - return 0; - } - } - - switch ( mtrr_if ) - { - case MTRR_IF_INTEL: - rdmsr (MTRRcap_MSR, config, dummy); - return (config & (1<<10)); - case MTRR_IF_AMD_K6: - case MTRR_IF_CENTAUR_MCR: - case MTRR_IF_CYRIX_ARR: - return 1; - default: - return 0; - } -} /* End Function have_wrcomb */ - -static u32 size_or_mask, size_and_mask; - -static void intel_get_mtrr (unsigned int reg, unsigned long *base, - unsigned long *size, mtrr_type *type) -{ - unsigned long mask_lo, mask_hi, base_lo, base_hi; - - rdmsr (MTRRphysMask_MSR(reg), mask_lo, mask_hi); - if ( (mask_lo & 0x800) == 0 ) - { - /* Invalid (i.e. 
free) range */ - *base = 0; - *size = 0; - *type = 0; - return; - } - - rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi); - - /* Work out the shifted address mask. */ - mask_lo = size_or_mask | mask_hi << (32 - PAGE_SHIFT) - | mask_lo >> PAGE_SHIFT; - - /* This works correctly if size is a power of two, i.e. a - contiguous range. */ - *size = -mask_lo; - *base = base_hi << (32 - PAGE_SHIFT) | base_lo >> PAGE_SHIFT; - *type = base_lo & 0xff; -} /* End Function intel_get_mtrr */ - -static void cyrix_get_arr (unsigned int reg, unsigned long *base, - unsigned long *size, mtrr_type *type) -{ - unsigned long flags; - unsigned char arr, ccr3, rcr, shift; - - arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */ - - /* Save flags and disable interrupts */ - local_irq_save(flags); - - ccr3 = getCx86 (CX86_CCR3); - setCx86 (CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ - ((unsigned char *) base)[3] = getCx86 (arr); - ((unsigned char *) base)[2] = getCx86 (arr+1); - ((unsigned char *) base)[1] = getCx86 (arr+2); - rcr = getCx86(CX86_RCR_BASE + reg); - setCx86 (CX86_CCR3, ccr3); /* disable MAPEN */ - - /* Enable interrupts if it was enabled previously */ - local_irq_restore (flags); - shift = ((unsigned char *) base)[1] & 0x0f; - *base >>= PAGE_SHIFT; - - /* Power of two, at least 4K on ARR0-ARR6, 256K on ARR7 - * Note: shift==0xf means 4G, this is unsupported. - */ - if (shift) - *size = (reg < 7 ? 
0x1UL : 0x40UL) << (shift - 1); - else - *size = 0; - - /* Bit 0 is Cache Enable on ARR7, Cache Disable on ARR0-ARR6 */ - if (reg < 7) - { - switch (rcr) - { - case 1: *type = MTRR_TYPE_UNCACHABLE; break; - case 8: *type = MTRR_TYPE_WRBACK; break; - case 9: *type = MTRR_TYPE_WRCOMB; break; - case 24: - default: *type = MTRR_TYPE_WRTHROUGH; break; - } - } else - { - switch (rcr) - { - case 0: *type = MTRR_TYPE_UNCACHABLE; break; - case 8: *type = MTRR_TYPE_WRCOMB; break; - case 9: *type = MTRR_TYPE_WRBACK; break; - case 25: - default: *type = MTRR_TYPE_WRTHROUGH; break; - } - } -} /* End Function cyrix_get_arr */ - -static void amd_get_mtrr (unsigned int reg, unsigned long *base, - unsigned long *size, mtrr_type *type) -{ - unsigned long low, high; - - rdmsr (MSR_K6_UWCCR, low, high); - /* Upper dword is region 1, lower is region 0 */ - if (reg == 1) low = high; - /* The base masks off on the right alignment */ - *base = (low & 0xFFFE0000) >> PAGE_SHIFT; - *type = 0; - if (low & 1) *type = MTRR_TYPE_UNCACHABLE; - if (low & 2) *type = MTRR_TYPE_WRCOMB; - if ( !(low & 3) ) - { - *size = 0; - return; - } - /* - * This needs a little explaining. The size is stored as an - * inverted mask of bits of 128K granularity 15 bits long offset - * 2 bits - * - * So to get a size we do invert the mask and add 1 to the lowest - * mask bit (4 as its 2 bits in). This gives us a size we then shift - * to turn into 128K blocks - * - * eg 111 1111 1111 1100 is 512K - * - * invert 000 0000 0000 0011 - * +1 000 0000 0000 0100 - * *128K ... 
- */ - low = (~low) & 0x1FFFC; - *size = (low + 4) << (15 - PAGE_SHIFT); - return; -} /* End Function amd_get_mtrr */ - -static struct -{ - unsigned long high; - unsigned long low; -} centaur_mcr[8]; - -static u8 centaur_mcr_reserved; -static u8 centaur_mcr_type; /* 0 for winchip, 1 for winchip2 */ - -/* - * Report boot time MCR setups - */ - -void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi) -{ - centaur_mcr[mcr].low = lo; - centaur_mcr[mcr].high = hi; -} - -static void centaur_get_mcr (unsigned int reg, unsigned long *base, - unsigned long *size, mtrr_type *type) -{ - *base = centaur_mcr[reg].high >> PAGE_SHIFT; - *size = -(centaur_mcr[reg].low & 0xfffff000) >> PAGE_SHIFT; - *type = MTRR_TYPE_WRCOMB; /* If it is there, it is write-combining */ - if(centaur_mcr_type==1 && ((centaur_mcr[reg].low&31)&2)) - *type = MTRR_TYPE_UNCACHABLE; - if(centaur_mcr_type==1 && (centaur_mcr[reg].low&31)==25) - *type = MTRR_TYPE_WRBACK; - if(centaur_mcr_type==0 && (centaur_mcr[reg].low&31)==31) - *type = MTRR_TYPE_WRBACK; - -} /* End Function centaur_get_mcr */ - -static void (*get_mtrr) (unsigned int reg, unsigned long *base, - unsigned long *size, mtrr_type *type); - -static void intel_set_mtrr_up (unsigned int reg, unsigned long base, - unsigned long size, mtrr_type type, int do_safe) -/* [SUMMARY] Set variable MTRR register on the local CPU. - The register to set. - The base address of the region. - The size of the region. If this is 0 the region is disabled. - The type of the region. - If TRUE, do the change safely. If FALSE, safety measures should - be done externally. - [RETURNS] Nothing. -*/ -{ - struct set_mtrr_context ctxt; - - if (do_safe) { - set_mtrr_prepare_save (&ctxt); - set_mtrr_cache_disable (&ctxt); - } - if (size == 0) - { - /* The invalid bit is kept in the mask, so we simply clear the - relevant mask register to disable a range. 
*/ - wrmsr (MTRRphysMask_MSR (reg), 0, 0); - } - else - { - wrmsr (MTRRphysBase_MSR (reg), base << PAGE_SHIFT | type, - (base & size_and_mask) >> (32 - PAGE_SHIFT)); - wrmsr (MTRRphysMask_MSR (reg), -size << PAGE_SHIFT | 0x800, - (-size & size_and_mask) >> (32 - PAGE_SHIFT)); - } - if (do_safe) set_mtrr_done (&ctxt); -} /* End Function intel_set_mtrr_up */ - -static void cyrix_set_arr_up (unsigned int reg, unsigned long base, - unsigned long size, mtrr_type type, int do_safe) -{ - struct set_mtrr_context ctxt; - unsigned char arr, arr_type, arr_size; - - arr = CX86_ARR_BASE + (reg << 1) + reg; /* avoid multiplication by 3 */ - - /* count down from 32M (ARR0-ARR6) or from 2G (ARR7) */ - if (reg >= 7) - size >>= 6; - - size &= 0x7fff; /* make sure arr_size <= 14 */ - for(arr_size = 0; size; arr_size++, size >>= 1); - - if (reg<7) - { - switch (type) { - case MTRR_TYPE_UNCACHABLE: arr_type = 1; break; - case MTRR_TYPE_WRCOMB: arr_type = 9; break; - case MTRR_TYPE_WRTHROUGH: arr_type = 24; break; - default: arr_type = 8; break; - } - } - else - { - switch (type) - { - case MTRR_TYPE_UNCACHABLE: arr_type = 0; break; - case MTRR_TYPE_WRCOMB: arr_type = 8; break; - case MTRR_TYPE_WRTHROUGH: arr_type = 25; break; - default: arr_type = 9; break; - } - } - - if (do_safe) { - set_mtrr_prepare_save (&ctxt); - set_mtrr_cache_disable (&ctxt); - } - base <<= PAGE_SHIFT; - setCx86(arr, ((unsigned char *) &base)[3]); - setCx86(arr+1, ((unsigned char *) &base)[2]); - setCx86(arr+2, (((unsigned char *) &base)[1]) | arr_size); - setCx86(CX86_RCR_BASE + reg, arr_type); - if (do_safe) set_mtrr_done (&ctxt); -} /* End Function cyrix_set_arr_up */ - -static void amd_set_mtrr_up (unsigned int reg, unsigned long base, - unsigned long size, mtrr_type type, int do_safe) -/* [SUMMARY] Set variable MTRR register on the local CPU. - The register to set. - The base address of the region. - The size of the region. If this is 0 the region is disabled. - The type of the region. 
- If TRUE, do the change safely. If FALSE, safety measures should - be done externally. - [RETURNS] Nothing. -*/ -{ - u32 regs[2]; - struct set_mtrr_context ctxt; - - if (do_safe) { - set_mtrr_prepare_save (&ctxt); - set_mtrr_cache_disable (&ctxt); - } - /* - * Low is MTRR0 , High MTRR 1 - */ - rdmsr (MSR_K6_UWCCR, regs[0], regs[1]); - /* - * Blank to disable - */ - if (size == 0) - regs[reg] = 0; - else - /* Set the register to the base, the type (off by one) and an - inverted bitmask of the size The size is the only odd - bit. We are fed say 512K We invert this and we get 111 1111 - 1111 1011 but if you subtract one and invert you get the - desired 111 1111 1111 1100 mask - - But ~(x - 1) == ~x + 1 == -x. Two's complement rocks! */ - regs[reg] = (-size>>(15-PAGE_SHIFT) & 0x0001FFFC) - | (base<base_lo, vr->base_hi); - rdmsr (MTRRphysMask_MSR (index), vr->mask_lo, vr->mask_hi); -} /* End Function get_mtrr_var_range */ - - -/* Set the MSR pair relating to a var range. Returns TRUE if - changes are made */ -static int __init set_mtrr_var_range_testing (unsigned int index, - struct mtrr_var_range *vr) -{ - unsigned int lo, hi; - int changed = FALSE; - - rdmsr(MTRRphysBase_MSR(index), lo, hi); - if ( (vr->base_lo & 0xfffff0ffUL) != (lo & 0xfffff0ffUL) - || (vr->base_hi & 0xfUL) != (hi & 0xfUL) ) - { - wrmsr (MTRRphysBase_MSR(index), vr->base_lo, vr->base_hi); - changed = TRUE; - } - - rdmsr (MTRRphysMask_MSR(index), lo, hi); - - if ( (vr->mask_lo & 0xfffff800UL) != (lo & 0xfffff800UL) - || (vr->mask_hi & 0xfUL) != (hi & 0xfUL) ) - { - wrmsr(MTRRphysMask_MSR(index), vr->mask_lo, vr->mask_hi); - changed = TRUE; - } - return changed; -} /* End Function set_mtrr_var_range_testing */ - -static void __init get_fixed_ranges(mtrr_type *frs) -{ - unsigned long *p = (unsigned long *)frs; - int i; - - rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]); - - for (i = 0; i < 2; i++) - rdmsr(MTRRfix16K_80000_MSR + i, p[2 + i*2], p[3 + i*2]); - for (i = 0; i < 8; i++) - rdmsr(MTRRfix4K_C0000_MSR 
+ i, p[6 + i*2], p[7 + i*2]); -} /* End Function get_fixed_ranges */ - -static int __init set_fixed_ranges_testing(mtrr_type *frs) -{ - unsigned long *p = (unsigned long *)frs; - int changed = FALSE; - int i; - unsigned long lo, hi; - - rdmsr(MTRRfix64K_00000_MSR, lo, hi); - if (p[0] != lo || p[1] != hi) - { - wrmsr (MTRRfix64K_00000_MSR, p[0], p[1]); - changed = TRUE; - } - - for (i = 0; i < 2; i++) - { - rdmsr (MTRRfix16K_80000_MSR + i, lo, hi); - if (p[2 + i*2] != lo || p[3 + i*2] != hi) - { - wrmsr (MTRRfix16K_80000_MSR + i, p[2 + i*2], p[3 + i*2]); - changed = TRUE; - } - } - - for (i = 0; i < 8; i++) - { - rdmsr (MTRRfix4K_C0000_MSR + i, lo, hi); - if (p[6 + i*2] != lo || p[7 + i*2] != hi) - { - wrmsr(MTRRfix4K_C0000_MSR + i, p[6 + i*2], p[7 + i*2]); - changed = TRUE; - } - } - return changed; -} /* End Function set_fixed_ranges_testing */ - -struct mtrr_state -{ - unsigned int num_var_ranges; - struct mtrr_var_range *var_ranges; - mtrr_type fixed_ranges[NUM_FIXED_RANGES]; - unsigned char enabled; - mtrr_type def_type; -}; - - -/* Grab all of the MTRR state for this CPU into *state */ -static void __init get_mtrr_state(struct mtrr_state *state) -{ - unsigned int nvrs, i; - struct mtrr_var_range *vrs; - unsigned long lo, dummy; - - nvrs = state->num_var_ranges = get_num_var_ranges(); - vrs = state->var_ranges - = kmalloc (nvrs * sizeof (struct mtrr_var_range), GFP_KERNEL); - if (vrs == NULL) - nvrs = state->num_var_ranges = 0; - - for (i = 0; i < nvrs; i++) - get_mtrr_var_range (i, &vrs[i]); - get_fixed_ranges (state->fixed_ranges); - - rdmsr (MTRRdefType_MSR, lo, dummy); - state->def_type = (lo & 0xff); - state->enabled = (lo & 0xc00) >> 10; -} /* End Function get_mtrr_state */ - - -/* Free resources associated with a struct mtrr_state */ -static void __init finalize_mtrr_state(struct mtrr_state *state) -{ - if (state->var_ranges) kfree (state->var_ranges); -} /* End Function finalize_mtrr_state */ - - -static unsigned long __init set_mtrr_state (struct 
mtrr_state *state, - struct set_mtrr_context *ctxt) -/* [SUMMARY] Set the MTRR state for this CPU. - The MTRR state information to read. - Some relevant CPU context. - [NOTE] The CPU must already be in a safe state for MTRR changes. - [RETURNS] 0 if no changes made, else a mask indication what was changed. -*/ -{ - unsigned int i; - unsigned long change_mask = 0; - - for (i = 0; i < state->num_var_ranges; i++) - if ( set_mtrr_var_range_testing (i, &state->var_ranges[i]) ) - change_mask |= MTRR_CHANGE_MASK_VARIABLE; - - if ( set_fixed_ranges_testing(state->fixed_ranges) ) - change_mask |= MTRR_CHANGE_MASK_FIXED; - /* Set_mtrr_restore restores the old value of MTRRdefType, - so to set it we fiddle with the saved value */ - if ( (ctxt->deftype_lo & 0xff) != state->def_type - || ( (ctxt->deftype_lo & 0xc00) >> 10 ) != state->enabled) - { - ctxt->deftype_lo |= (state->def_type | state->enabled << 10); - change_mask |= MTRR_CHANGE_MASK_DEFTYPE; - } - - return change_mask; -} /* End Function set_mtrr_state */ - - -static atomic_t undone_count; -static volatile int wait_barrier_cache_disable = FALSE; -static volatile int wait_barrier_execute = FALSE; -static volatile int wait_barrier_cache_enable = FALSE; - -struct set_mtrr_data -{ - unsigned long smp_base; - unsigned long smp_size; - unsigned int smp_reg; - mtrr_type smp_type; -}; - -static void ipi_handler (void *info) -/* [SUMMARY] Synchronisation handler. Executed by "other" CPUs. - [RETURNS] Nothing. 
-*/ -{ - struct set_mtrr_data *data = info; - struct set_mtrr_context ctxt; - set_mtrr_prepare_save (&ctxt); - /* Notify master that I've flushed and disabled my cache */ - atomic_dec (&undone_count); - while (wait_barrier_cache_disable) { rep_nop(); barrier(); } - set_mtrr_cache_disable (&ctxt); - /* Notify master that I've flushed and disabled my cache */ - atomic_dec (&undone_count); - while (wait_barrier_execute) { rep_nop(); barrier(); } - /* The master has cleared me to execute */ - (*set_mtrr_up) (data->smp_reg, data->smp_base, data->smp_size, - data->smp_type, FALSE); - /* Notify master CPU that I've executed the function */ - atomic_dec (&undone_count); - /* Wait for master to clear me to enable cache and return */ - while (wait_barrier_cache_enable) { rep_nop(); barrier(); } - set_mtrr_done (&ctxt); -} /* End Function ipi_handler */ - -static void set_mtrr_smp (unsigned int reg, unsigned long base, - unsigned long size, mtrr_type type) -{ - struct set_mtrr_data data; - struct set_mtrr_context ctxt; - - data.smp_reg = reg; - data.smp_base = base; - data.smp_size = size; - data.smp_type = type; - wait_barrier_cache_disable = TRUE; - wait_barrier_execute = TRUE; - wait_barrier_cache_enable = TRUE; - atomic_set (&undone_count, num_booting_cpus() - 1); - /* Start the ball rolling on other CPUs */ - if (smp_call_function (ipi_handler, &data, 1, 0) != 0) - panic ("mtrr: timed out waiting for other CPUs\n"); - /* Flush and disable the local CPU's cache */ - set_mtrr_prepare_save (&ctxt); - /* Wait for all other CPUs to flush and disable their caches */ - while (atomic_read (&undone_count) > 0) { rep_nop(); barrier(); } - /* Set up for completion wait and then release other CPUs to change MTRRs*/ - atomic_set (&undone_count, num_booting_cpus() - 1); - wait_barrier_cache_disable = FALSE; - set_mtrr_cache_disable (&ctxt); - - /* Wait for all other CPUs to flush and disable their caches */ - while (atomic_read (&undone_count) > 0) { rep_nop(); barrier(); } - /* Set 
up for completion wait and then release other CPUs to change MTRRs*/ - atomic_set (&undone_count, num_booting_cpus() - 1); - wait_barrier_execute = FALSE; - (*set_mtrr_up) (reg, base, size, type, FALSE); - /* Now wait for other CPUs to complete the function */ - while (atomic_read (&undone_count) > 0) { rep_nop(); barrier(); } - /* Now all CPUs should have finished the function. Release the barrier to - allow them to re-enable their caches and return from their interrupt, - then enable the local cache and return */ - wait_barrier_cache_enable = FALSE; - set_mtrr_done (&ctxt); -} /* End Function set_mtrr_smp */ - - -/* Some BIOS's are fucked and don't set all MTRRs the same! */ -static void __init mtrr_state_warn(unsigned long mask) -{ - if (!mask) return; - if (mask & MTRR_CHANGE_MASK_FIXED) - printk ("mtrr: your CPUs had inconsistent fixed MTRR settings\n"); - if (mask & MTRR_CHANGE_MASK_VARIABLE) - printk ("mtrr: your CPUs had inconsistent variable MTRR settings\n"); - if (mask & MTRR_CHANGE_MASK_DEFTYPE) - printk ("mtrr: your CPUs had inconsistent MTRRdefType settings\n"); - printk ("mtrr: probably your BIOS does not setup all CPUs\n"); -} /* End Function mtrr_state_warn */ - -#endif /* CONFIG_SMP */ - -static char *attrib_to_str (int x) -{ - return (x <= 6) ? mtrr_strings[x] : "?"; -} /* End Function attrib_to_str */ - -static void init_table (void) -{ - int i, max; - - max = get_num_var_ranges (); - if ( ( usage_table = kmalloc (max * sizeof *usage_table, GFP_KERNEL) ) - == NULL ) - { - printk ("mtrr: could not allocate\n"); - return; - } - for (i = 0; i < max; i++) usage_table[i] = 1; -#ifdef USERSPACE_INTERFACE - if ( ( ascii_buffer = kmalloc (max * LINE_SIZE, GFP_KERNEL) ) == NULL ) - { - printk ("mtrr: could not allocate\n"); - return; - } - ascii_buf_bytes = 0; - compute_ascii (); -#endif -} /* End Function init_table */ - -static int generic_get_free_region (unsigned long base, unsigned long size) -/* [SUMMARY] Get a free MTRR. 
- The starting (base) address of the region. - The size (in bytes) of the region. - [RETURNS] The index of the region on success, else -1 on error. -*/ -{ - int i, max; - mtrr_type ltype; - unsigned long lbase, lsize; - - max = get_num_var_ranges (); - for (i = 0; i < max; ++i) - { - (*get_mtrr) (i, &lbase, &lsize, <ype); - if (lsize == 0) return i; - } - return -ENOSPC; -} /* End Function generic_get_free_region */ - -static int centaur_get_free_region (unsigned long base, unsigned long size) -/* [SUMMARY] Get a free MTRR. - The starting (base) address of the region. - The size (in bytes) of the region. - [RETURNS] The index of the region on success, else -1 on error. -*/ -{ - int i, max; - mtrr_type ltype; - unsigned long lbase, lsize; - - max = get_num_var_ranges (); - for (i = 0; i < max; ++i) - { - if(centaur_mcr_reserved & (1< The starting (base) address of the region. - The size (in bytes) of the region. - [RETURNS] The index of the region on success, else -1 on error. -*/ -{ - int i; - mtrr_type ltype; - unsigned long lbase, lsize; - - /* If we are to set up a region >32M then look at ARR7 immediately */ - if (size > 0x2000) - { - cyrix_get_arr (7, &lbase, &lsize, <ype); - if (lsize == 0) return 7; - /* Else try ARR0-ARR6 first */ - } - else - { - for (i = 0; i < 7; i++) - { - cyrix_get_arr (i, &lbase, &lsize, <ype); - if ((i == 3) && arr3_protected) continue; - if (lsize == 0) return i; - } - /* ARR0-ARR6 isn't free, try ARR7 but its size must be at least 256K */ - cyrix_get_arr (i, &lbase, &lsize, <ype); - if ((lsize == 0) && (size >= 0x40)) return i; - } - return -ENOSPC; -} /* End Function cyrix_get_free_region */ - -static int (*get_free_region) (unsigned long base, - unsigned long size) = generic_get_free_region; - -/** - * mtrr_add_page - Add a memory type region - * @base: Physical base address of region in pages (4 KB) - * @size: Physical size of region in pages (4 KB) - * @type: Type of MTRR desired - * @increment: If this is true do usage 
counting on the region - * - * Memory type region registers control the caching on newer Intel and - * non Intel processors. This function allows drivers to request an - * MTRR is added. The details and hardware specifics of each processor's - * implementation are hidden from the caller, but nevertheless the - * caller should expect to need to provide a power of two size on an - * equivalent power of two boundary. - * - * If the region cannot be added either because all regions are in use - * or the CPU cannot support it a negative value is returned. On success - * the register number for this entry is returned, but should be treated - * as a cookie only. - * - * On a multiprocessor machine the changes are made to all processors. - * This is required on x86 by the Intel processors. - * - * The available types are - * - * %MTRR_TYPE_UNCACHABLE - No caching - * - * %MTRR_TYPE_WRBACK - Write data back in bursts whenever - * - * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts - * - * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes - * - * BUGS: Needs a quiet flag for the cases where drivers do not mind - * failures and do not wish system log messages to be sent. - */ - -int mtrr_add_page(unsigned long base, unsigned long size, unsigned int type, char increment) -{ -/* [SUMMARY] Add an MTRR entry. - The starting (base, in pages) address of the region. - The size of the region. (in pages) - The type of the new region. - If true and the region already exists, the usage count will be - incremented. - [RETURNS] The MTRR register on success, else a negative number indicating - the error code. - [NOTE] This routine uses a spinlock. 
-*/ - int i, max; - mtrr_type ltype; - unsigned long lbase, lsize, last; - - switch ( mtrr_if ) - { - case MTRR_IF_NONE: - return -ENXIO; /* No MTRRs whatsoever */ - - case MTRR_IF_AMD_K6: - /* Apply the K6 block alignment and size rules - In order - o Uncached or gathering only - o 128K or bigger block - o Power of 2 block - o base suitably aligned to the power - */ - if ( type > MTRR_TYPE_WRCOMB || size < (1 << (17-PAGE_SHIFT)) || - (size & ~(size-1))-size || ( base & (size-1) ) ) - return -EINVAL; - break; - - case MTRR_IF_INTEL: - /* For Intel PPro stepping <= 7, must be 4 MiB aligned - and not touch 0x70000000->0x7003FFFF */ - if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - boot_cpu_data.x86 == 6 && - boot_cpu_data.x86_model == 1 && - boot_cpu_data.x86_mask <= 7 ) - { - if ( base & ((1 << (22-PAGE_SHIFT))-1) ) - { - printk (KERN_WARNING "mtrr: base(0x%lx000) is not 4 MiB aligned\n", base); - return -EINVAL; - } - if (!(base + size < 0x70000000 || base > 0x7003FFFF) && - (type == MTRR_TYPE_WRCOMB || type == MTRR_TYPE_WRBACK)) - { - printk (KERN_WARNING "mtrr: writable mtrr between 0x70000000 and 0x7003FFFF may hang the CPU.\n"); - return -EINVAL; - } - } - /* Fall through */ - - case MTRR_IF_CYRIX_ARR: - case MTRR_IF_CENTAUR_MCR: - if ( mtrr_if == MTRR_IF_CENTAUR_MCR ) - { - /* - * FIXME: Winchip2 supports uncached - */ - if (type != MTRR_TYPE_WRCOMB && (centaur_mcr_type == 0 || type != MTRR_TYPE_UNCACHABLE)) - { - printk (KERN_WARNING "mtrr: only write-combining%s supported\n", - centaur_mcr_type?" 
and uncacheable are":" is"); - return -EINVAL; - } - } - else if (base + size < 0x100) - { - printk (KERN_WARNING "mtrr: cannot set region below 1 MiB (0x%lx000,0x%lx000)\n", - base, size); - return -EINVAL; - } - /* Check upper bits of base and last are equal and lower bits are 0 - for base and 1 for last */ - last = base + size - 1; - for (lbase = base; !(lbase & 1) && (last & 1); - lbase = lbase >> 1, last = last >> 1); - if (lbase != last) - { - printk (KERN_WARNING "mtrr: base(0x%lx000) is not aligned on a size(0x%lx000) boundary\n", - base, size); - return -EINVAL; - } - break; - - default: - return -EINVAL; - } - - if (type >= MTRR_NUM_TYPES) - { - printk ("mtrr: type: %u illegal\n", type); - return -EINVAL; - } - - /* If the type is WC, check that this processor supports it */ - if ( (type == MTRR_TYPE_WRCOMB) && !have_wrcomb () ) - { - printk (KERN_WARNING "mtrr: your processor doesn't support write-combining\n"); - return -ENOSYS; - } - - if ( base & size_or_mask || size & size_or_mask ) - { - printk ("mtrr: base or size exceeds the MTRR width\n"); - return -EINVAL; - } - - increment = increment ? 
1 : 0; - max = get_num_var_ranges (); - /* Search for existing MTRR */ - down(&main_lock); - for (i = 0; i < max; ++i) - { - (*get_mtrr) (i, &lbase, &lsize, <ype); - if (base >= lbase + lsize) continue; - if ( (base < lbase) && (base + size <= lbase) ) continue; - /* At this point we know there is some kind of overlap/enclosure */ - if ( (base < lbase) || (base + size > lbase + lsize) ) - { - up(&main_lock); - printk (KERN_WARNING "mtrr: 0x%lx000,0x%lx000 overlaps existing" - " 0x%lx000,0x%lx000\n", - base, size, lbase, lsize); - return -EINVAL; - } - /* New region is enclosed by an existing region */ - if (ltype != type) - { - if (type == MTRR_TYPE_UNCACHABLE) continue; - up(&main_lock); - printk ( "mtrr: type mismatch for %lx000,%lx000 old: %s new: %s\n", - base, size, attrib_to_str (ltype), attrib_to_str (type) ); - return -EINVAL; - } - if (increment) ++usage_table[i]; - compute_ascii (); - up(&main_lock); - return i; - } - /* Search for an empty MTRR */ - i = (*get_free_region) (base, size); - if (i < 0) - { - up(&main_lock); - printk ("mtrr: no more MTRRs available\n"); - return i; - } - set_mtrr (i, base, size, type); - usage_table[i] = 1; - compute_ascii (); - up(&main_lock); - return i; -} /* End Function mtrr_add_page */ - -/** - * mtrr_add - Add a memory type region - * @base: Physical base address of region - * @size: Physical size of region - * @type: Type of MTRR desired - * @increment: If this is true do usage counting on the region - * - * Memory type region registers control the caching on newer Intel and - * non Intel processors. This function allows drivers to request an - * MTRR is added. The details and hardware specifics of each processor's - * implementation are hidden from the caller, but nevertheless the - * caller should expect to need to provide a power of two size on an - * equivalent power of two boundary. 
- * - * If the region cannot be added either because all regions are in use - * or the CPU cannot support it a negative value is returned. On success - * the register number for this entry is returned, but should be treated - * as a cookie only. - * - * On a multiprocessor machine the changes are made to all processors. - * This is required on x86 by the Intel processors. - * - * The available types are - * - * %MTRR_TYPE_UNCACHABLE - No caching - * - * %MTRR_TYPE_WRBACK - Write data back in bursts whenever - * - * %MTRR_TYPE_WRCOMB - Write data back soon but allow bursts - * - * %MTRR_TYPE_WRTHROUGH - Cache reads but not writes - * - * BUGS: Needs a quiet flag for the cases where drivers do not mind - * failures and do not wish system log messages to be sent. - */ - -int mtrr_add(unsigned long base, unsigned long size, unsigned int type, char increment) -{ -/* [SUMMARY] Add an MTRR entry. - The starting (base) address of the region. - The size (in bytes) of the region. - The type of the new region. - If true and the region already exists, the usage count will be - incremented. - [RETURNS] The MTRR register on success, else a negative number indicating - the error code. -*/ - - if ( (base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1)) ) - { - printk ("mtrr: size and base must be multiples of 4 kiB\n"); - printk ("mtrr: size: 0x%lx base: 0x%lx\n", size, base); - return -EINVAL; - } - return mtrr_add_page(base >> PAGE_SHIFT, size >> PAGE_SHIFT, type, increment); -} /* End Function mtrr_add */ - -/** - * mtrr_del_page - delete a memory type region - * @reg: Register returned by mtrr_add - * @base: Physical base address - * @size: Size of region - * - * If register is supplied then base and size are ignored. This is - * how drivers should call it. - * - * Releases an MTRR region. If the usage count drops to zero the - * register is freed and the region returns to default state. - * On success the register is returned, on failure a negative error - * code. 
- */ - -int mtrr_del_page (int reg, unsigned long base, unsigned long size) -/* [SUMMARY] Delete MTRR/decrement usage count. - The register. If this is less than 0 then <> and <> must - be supplied. - The base address of the region. This is ignored if <> is >= 0. - The size of the region. This is ignored if <> is >= 0. - [RETURNS] The register on success, else a negative number indicating - the error code. - [NOTE] This routine uses a spinlock. -*/ -{ - int i, max; - mtrr_type ltype; - unsigned long lbase, lsize; - - if ( mtrr_if == MTRR_IF_NONE ) return -ENXIO; - - max = get_num_var_ranges (); - down (&main_lock); - if (reg < 0) - { - /* Search for existing MTRR */ - for (i = 0; i < max; ++i) - { - (*get_mtrr) (i, &lbase, &lsize, <ype); - if (lbase == base && lsize == size) - { - reg = i; - break; - } - } - if (reg < 0) - { - up(&main_lock); - printk ("mtrr: no MTRR for %lx000,%lx000 found\n", base, size); - return -EINVAL; - } - } - if (reg >= max) - { - up (&main_lock); - printk ("mtrr: register: %d too big\n", reg); - return -EINVAL; - } - if ( mtrr_if == MTRR_IF_CYRIX_ARR ) - { - if ( (reg == 3) && arr3_protected ) - { - up (&main_lock); - printk ("mtrr: ARR3 cannot be changed\n"); - return -EINVAL; - } - } - (*get_mtrr) (reg, &lbase, &lsize, <ype); - if (lsize < 1) - { - up (&main_lock); - printk ("mtrr: MTRR %d not used\n", reg); - return -EINVAL; - } - if (usage_table[reg] < 1) - { - up (&main_lock); - printk ("mtrr: reg: %d has count=0\n", reg); - return -EINVAL; - } - if (--usage_table[reg] < 1) set_mtrr (reg, 0, 0, 0); - compute_ascii (); - up (&main_lock); - return reg; -} /* End Function mtrr_del_page */ - -/** - * mtrr_del - delete a memory type region - * @reg: Register returned by mtrr_add - * @base: Physical base address - * @size: Size of region - * - * If register is supplied then base and size are ignored. This is - * how drivers should call it. - * - * Releases an MTRR region. 
If the usage count drops to zero the - * register is freed and the region returns to default state. - * On success the register is returned, on failure a negative error - * code. - */ - -int mtrr_del (int reg, unsigned long base, unsigned long size) -/* [SUMMARY] Delete MTRR/decrement usage count. - The register. If this is less than 0 then <> and <> must - be supplied. - The base address of the region. This is ignored if <> is >= 0. - The size of the region. This is ignored if <> is >= 0. - [RETURNS] The register on success, else a negative number indicating - the error code. -*/ -{ - if ( (base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1)) ) - { - printk ("mtrr: size and base must be multiples of 4 kiB\n"); - printk ("mtrr: size: 0x%lx base: 0x%lx\n", size, base); - return -EINVAL; - } - return mtrr_del_page(reg, base >> PAGE_SHIFT, size >> PAGE_SHIFT); -} - -#ifdef USERSPACE_INTERFACE - -static int mtrr_file_add (unsigned long base, unsigned long size, - unsigned int type, char increment, struct file *file, int page) -{ - int reg, max; - unsigned int *fcount = file->private_data; - - max = get_num_var_ranges (); - if (fcount == NULL) - { - if ( ( fcount = kmalloc (max * sizeof *fcount, GFP_KERNEL) ) == NULL ) - { - printk ("mtrr: could not allocate\n"); - return -ENOMEM; - } - memset (fcount, 0, max * sizeof *fcount); - file->private_data = fcount; - } - if (!page) { - if ( (base & (PAGE_SIZE - 1)) || (size & (PAGE_SIZE - 1)) ) - { - printk ("mtrr: size and base must be multiples of 4 kiB\n"); - printk ("mtrr: size: 0x%lx base: 0x%lx\n", size, base); - return -EINVAL; - } - base >>= PAGE_SHIFT; - size >>= PAGE_SHIFT; - } - reg = mtrr_add_page (base, size, type, 1); - if (reg >= 0) ++fcount[reg]; - return reg; -} /* End Function mtrr_file_add */ - -static int mtrr_file_del (unsigned long base, unsigned long size, - struct file *file, int page) -{ - int reg; - unsigned int *fcount = file->private_data; - - if (!page) { - if ( (base & (PAGE_SIZE - 1)) || (size & 
(PAGE_SIZE - 1)) ) - { - printk ("mtrr: size and base must be multiples of 4 kiB\n"); - printk ("mtrr: size: 0x%lx base: 0x%lx\n", size, base); - return -EINVAL; - } - base >>= PAGE_SHIFT; - size >>= PAGE_SHIFT; - } - reg = mtrr_del_page (-1, base, size); - if (reg < 0) return reg; - if (fcount == NULL) return reg; - if (fcount[reg] < 1) return -EINVAL; - --fcount[reg]; - return reg; -} /* End Function mtrr_file_del */ - -static ssize_t mtrr_read (struct file *file, char *buf, size_t len, - loff_t *ppos) -{ - if (*ppos >= ascii_buf_bytes) return 0; - if (*ppos + len > ascii_buf_bytes) len = ascii_buf_bytes - *ppos; - if ( copy_to_user (buf, ascii_buffer + *ppos, len) ) return -EFAULT; - *ppos += len; - return len; -} /* End Function mtrr_read */ - -static ssize_t mtrr_write (struct file *file, const char *buf, size_t len, - loff_t *ppos) -/* Format of control line: - "base=%Lx size=%Lx type=%s" OR: - "disable=%d" -*/ -{ - int i, err; - unsigned long reg; - unsigned long long base, size; - char *ptr; - char line[LINE_SIZE]; - - if ( !capable(CAP_SYS_ADMIN)) return -EPERM; - /* Can't seek (pwrite) on this device */ - if (ppos != &file->f_pos) return -ESPIPE; - memset (line, 0, LINE_SIZE); - if (len > LINE_SIZE) len = LINE_SIZE; - if ( copy_from_user (line, buf, len - 1) ) return -EFAULT; - ptr = line + strlen (line) - 1; - if (*ptr == '\n') *ptr = '\0'; - if ( !strncmp (line, "disable=", 8) ) - { - reg = simple_strtoul (line + 8, &ptr, 0); - err = mtrr_del_page (reg, 0, 0); - if (err < 0) return err; - return len; - } - if ( strncmp (line, "base=", 5) ) - { - printk ("mtrr: no \"base=\" in line: \"%s\"\n", line); - return -EINVAL; - } - base = simple_strtoull (line + 5, &ptr, 0); - for (; isspace (*ptr); ++ptr); - if ( strncmp (ptr, "size=", 5) ) - { - printk ("mtrr: no \"size=\" in line: \"%s\"\n", line); - return -EINVAL; - } - size = simple_strtoull (ptr + 5, &ptr, 0); - if ( (base & 0xfff) || (size & 0xfff) ) - { - printk ("mtrr: size and base must be multiples 
of 4 kiB\n"); - printk ("mtrr: size: 0x%Lx base: 0x%Lx\n", size, base); - return -EINVAL; - } - for (; isspace (*ptr); ++ptr); - if ( strncmp (ptr, "type=", 5) ) - { - printk ("mtrr: no \"type=\" in line: \"%s\"\n", line); - return -EINVAL; - } - ptr += 5; - for (; isspace (*ptr); ++ptr); - for (i = 0; i < MTRR_NUM_TYPES; ++i) - { - if ( strcmp (ptr, mtrr_strings[i]) ) continue; - base >>= PAGE_SHIFT; - size >>= PAGE_SHIFT; - err = mtrr_add_page ((unsigned long)base, (unsigned long)size, i, 1); - if (err < 0) return err; - return len; - } - printk ("mtrr: illegal type: \"%s\"\n", ptr); - return -EINVAL; -} /* End Function mtrr_write */ - -static int mtrr_ioctl (struct inode *inode, struct file *file, - unsigned int cmd, unsigned long arg) -{ - int err; - mtrr_type type; - struct mtrr_sentry sentry; - struct mtrr_gentry gentry; - - switch (cmd) - { - default: - return -ENOIOCTLCMD; - case MTRRIOC_ADD_ENTRY: - if ( ! capable(CAP_SYS_ADMIN) ) return -EPERM; - if ( copy_from_user (&sentry, (void *) arg, sizeof sentry) ) - return -EFAULT; - err = mtrr_file_add (sentry.base, sentry.size, sentry.type, 1, file, 0); - if (err < 0) return err; - break; - case MTRRIOC_SET_ENTRY: - if ( !capable(CAP_SYS_ADMIN) ) return -EPERM; - if ( copy_from_user (&sentry, (void *) arg, sizeof sentry) ) - return -EFAULT; - err = mtrr_add (sentry.base, sentry.size, sentry.type, 0); - if (err < 0) return err; - break; - case MTRRIOC_DEL_ENTRY: - if ( !capable(CAP_SYS_ADMIN) ) return -EPERM; - if ( copy_from_user (&sentry, (void *) arg, sizeof sentry) ) - return -EFAULT; - err = mtrr_file_del (sentry.base, sentry.size, file, 0); - if (err < 0) return err; - break; - case MTRRIOC_KILL_ENTRY: - if ( !capable(CAP_SYS_ADMIN) ) return -EPERM; - if ( copy_from_user (&sentry, (void *) arg, sizeof sentry) ) - return -EFAULT; - err = mtrr_del (-1, sentry.base, sentry.size); - if (err < 0) return err; - break; - case MTRRIOC_GET_ENTRY: - if ( copy_from_user (&gentry, (void *) arg, sizeof gentry) ) - 
return -EFAULT; - if ( gentry.regnum >= get_num_var_ranges () ) return -EINVAL; - (*get_mtrr) (gentry.regnum, &gentry.base, &gentry.size, &type); - - /* Hide entries that go above 4GB */ - if (gentry.base + gentry.size > 0x100000 || gentry.size == 0x100000) - gentry.base = gentry.size = gentry.type = 0; - else { - gentry.base <<= PAGE_SHIFT; - gentry.size <<= PAGE_SHIFT; - gentry.type = type; - } - - if ( copy_to_user ( (void *) arg, &gentry, sizeof gentry) ) - return -EFAULT; - break; - case MTRRIOC_ADD_PAGE_ENTRY: - if ( !capable(CAP_SYS_ADMIN) ) return -EPERM; - if ( copy_from_user (&sentry, (void *) arg, sizeof sentry) ) - return -EFAULT; - err = mtrr_file_add (sentry.base, sentry.size, sentry.type, 1, file, 1); - if (err < 0) return err; - break; - case MTRRIOC_SET_PAGE_ENTRY: - if ( !capable(CAP_SYS_ADMIN) ) return -EPERM; - if ( copy_from_user (&sentry, (void *) arg, sizeof sentry) ) - return -EFAULT; - err = mtrr_add_page (sentry.base, sentry.size, sentry.type, 0); - if (err < 0) return err; - break; - case MTRRIOC_DEL_PAGE_ENTRY: - if ( !capable(CAP_SYS_ADMIN) ) return -EPERM; - if ( copy_from_user (&sentry, (void *) arg, sizeof sentry) ) - return -EFAULT; - err = mtrr_file_del (sentry.base, sentry.size, file, 1); - if (err < 0) return err; - break; - case MTRRIOC_KILL_PAGE_ENTRY: - if ( !capable(CAP_SYS_ADMIN) ) return -EPERM; - if ( copy_from_user (&sentry, (void *) arg, sizeof sentry) ) - return -EFAULT; - err = mtrr_del_page (-1, sentry.base, sentry.size); - if (err < 0) return err; - break; - case MTRRIOC_GET_PAGE_ENTRY: - if ( copy_from_user (&gentry, (void *) arg, sizeof gentry) ) - return -EFAULT; - if ( gentry.regnum >= get_num_var_ranges () ) return -EINVAL; - (*get_mtrr) (gentry.regnum, &gentry.base, &gentry.size, &type); - gentry.type = type; - - if ( copy_to_user ( (void *) arg, &gentry, sizeof gentry) ) - return -EFAULT; - break; - } - return 0; -} /* End Function mtrr_ioctl */ - -static int mtrr_close (struct inode *ino, struct file *file) 
-{ - int i, max; - unsigned int *fcount = file->private_data; - - if (fcount == NULL) return 0; - max = get_num_var_ranges (); - for (i = 0; i < max; ++i) - { - while (fcount[i] > 0) - { - if (mtrr_del (i, 0, 0) < 0) printk ("mtrr: reg %d not used\n", i); - --fcount[i]; - } - } - kfree (fcount); - file->private_data = NULL; - return 0; -} /* End Function mtrr_close */ - -static struct file_operations mtrr_fops = -{ - .owner = THIS_MODULE, - .read = mtrr_read, - .write = mtrr_write, - .ioctl = mtrr_ioctl, - .release = mtrr_close, -}; - -# ifdef CONFIG_PROC_FS - -static struct proc_dir_entry *proc_root_mtrr; - -# endif /* CONFIG_PROC_FS */ - -static devfs_handle_t devfs_handle; - -static void compute_ascii (void) -{ - char factor; - int i, max; - mtrr_type type; - unsigned long base, size; - - ascii_buf_bytes = 0; - max = get_num_var_ranges (); - for (i = 0; i < max; i++) - { - (*get_mtrr) (i, &base, &size, &type); - if (size == 0) usage_table[i] = 0; - else - { - if (size < (0x100000 >> PAGE_SHIFT)) - { - /* less than 1MB */ - factor = 'K'; - size <<= PAGE_SHIFT - 10; - } - else - { - factor = 'M'; - size >>= 20 - PAGE_SHIFT; - } - sprintf - (ascii_buffer + ascii_buf_bytes, - "reg%02i: base=0x%05lx000 (%4liMB), size=%4li%cB: %s, count=%d\n", - i, base, base >> (20 - PAGE_SHIFT), size, factor, - attrib_to_str (type), usage_table[i]); - ascii_buf_bytes += strlen (ascii_buffer + ascii_buf_bytes); - } - } - devfs_set_file_size (devfs_handle, ascii_buf_bytes); -# ifdef CONFIG_PROC_FS - if (proc_root_mtrr) - proc_root_mtrr->size = ascii_buf_bytes; -# endif /* CONFIG_PROC_FS */ -} /* End Function compute_ascii */ - -#endif /* USERSPACE_INTERFACE */ - -EXPORT_SYMBOL(mtrr_add); -EXPORT_SYMBOL(mtrr_del); - -#ifdef CONFIG_SMP - -typedef struct -{ - unsigned long base; - unsigned long size; - mtrr_type type; -} arr_state_t; - -arr_state_t arr_state[8] __initdata = -{ - {0UL,0UL,0UL}, {0UL,0UL,0UL}, {0UL,0UL,0UL}, {0UL,0UL,0UL}, - {0UL,0UL,0UL}, {0UL,0UL,0UL}, {0UL,0UL,0UL}, 
{0UL,0UL,0UL} -}; - -unsigned char ccr_state[7] __initdata = { 0, 0, 0, 0, 0, 0, 0 }; - -static void __init cyrix_arr_init_secondary(void) -{ - struct set_mtrr_context ctxt; - int i; - - /* flush cache and enable MAPEN */ - set_mtrr_prepare_save (&ctxt); - set_mtrr_cache_disable (&ctxt); - - /* the CCRs are not contiguous */ - for(i=0; i<4; i++) setCx86(CX86_CCR0 + i, ccr_state[i]); - for( ; i<7; i++) setCx86(CX86_CCR4 + i, ccr_state[i]); - for(i=0; i<8; i++) - cyrix_set_arr_up(i, - arr_state[i].base, arr_state[i].size, arr_state[i].type, FALSE); - - set_mtrr_done (&ctxt); /* flush cache and disable MAPEN */ -} /* End Function cyrix_arr_init_secondary */ - -#endif - -/* - * On Cyrix 6x86(MX) and M II the ARR3 is special: it has connection - * with the SMM (System Management Mode) mode. So we need the following: - * Check whether SMI_LOCK (CCR3 bit 0) is set - * if it is set, write a warning message: ARR3 cannot be changed! - * (it cannot be changed until the next processor reset) - * if it is reset, then we can change it, set all the needed bits: - * - disable access to SMM memory through ARR3 range (CCR1 bit 7 reset) - * - disable access to SMM memory (CCR1 bit 2 reset) - * - disable SMM mode (CCR1 bit 1 reset) - * - disable write protection of ARR3 (CCR6 bit 1 reset) - * - (maybe) disable ARR3 - * Just to be sure, we enable ARR usage by the processor (CCR5 bit 5 set) - */ -static void __init cyrix_arr_init(void) -{ - struct set_mtrr_context ctxt; - unsigned char ccr[7]; - int ccrc[7] = { 0, 0, 0, 0, 0, 0, 0 }; -#ifdef CONFIG_SMP - int i; -#endif - - /* flush cache and enable MAPEN */ - set_mtrr_prepare_save (&ctxt); - set_mtrr_cache_disable (&ctxt); - - /* Save all CCRs locally */ - ccr[0] = getCx86 (CX86_CCR0); - ccr[1] = getCx86 (CX86_CCR1); - ccr[2] = getCx86 (CX86_CCR2); - ccr[3] = ctxt.ccr3; - ccr[4] = getCx86 (CX86_CCR4); - ccr[5] = getCx86 (CX86_CCR5); - ccr[6] = getCx86 (CX86_CCR6); - - if (ccr[3] & 1) - { - ccrc[3] = 1; - arr3_protected = 1; - } - else - 
{ - /* Disable SMM mode (bit 1), access to SMM memory (bit 2) and - * access to SMM memory through ARR3 (bit 7). - */ - if (ccr[1] & 0x80) { ccr[1] &= 0x7f; ccrc[1] |= 0x80; } - if (ccr[1] & 0x04) { ccr[1] &= 0xfb; ccrc[1] |= 0x04; } - if (ccr[1] & 0x02) { ccr[1] &= 0xfd; ccrc[1] |= 0x02; } - arr3_protected = 0; - if (ccr[6] & 0x02) { - ccr[6] &= 0xfd; ccrc[6] = 1; /* Disable write protection of ARR3 */ - setCx86 (CX86_CCR6, ccr[6]); - } - /* Disable ARR3. This is safe now that we disabled SMM. */ - /* cyrix_set_arr_up (3, 0, 0, 0, FALSE); */ - } - /* If we changed CCR1 in memory, change it in the processor, too. */ - if (ccrc[1]) setCx86 (CX86_CCR1, ccr[1]); - - /* Enable ARR usage by the processor */ - if (!(ccr[5] & 0x20)) - { - ccr[5] |= 0x20; ccrc[5] = 1; - setCx86 (CX86_CCR5, ccr[5]); - } - -#ifdef CONFIG_SMP - for(i=0; i<7; i++) ccr_state[i] = ccr[i]; - for(i=0; i<8; i++) - cyrix_get_arr(i, - &arr_state[i].base, &arr_state[i].size, &arr_state[i].type); -#endif - - set_mtrr_done (&ctxt); /* flush cache and disable MAPEN */ - - if ( ccrc[5] ) printk ("mtrr: ARR usage was not enabled, enabled manually\n"); - if ( ccrc[3] ) printk ("mtrr: ARR3 cannot be changed\n"); -/* - if ( ccrc[1] & 0x80) printk ("mtrr: SMM memory access through ARR3 disabled\n"); - if ( ccrc[1] & 0x04) printk ("mtrr: SMM memory access disabled\n"); - if ( ccrc[1] & 0x02) printk ("mtrr: SMM mode disabled\n"); -*/ - if ( ccrc[6] ) printk ("mtrr: ARR3 was write protected, unprotected\n"); -} /* End Function cyrix_arr_init */ - -/* - * Initialise the later (saner) Winchip MCR variant. In this version - * the BIOS can pass us the registers it has used (but not their values) - * and the control register is read/write - */ - -static void __init centaur_mcr1_init(void) -{ - unsigned i; - u32 lo, hi; - - /* Unfortunately, MCR's are read-only, so there is no way to - * find out what the bios might have done. 
- */ - - rdmsr(MSR_IDT_MCR_CTRL, lo, hi); - if(((lo>>17)&7)==1) /* Type 1 Winchip2 MCR */ - { - lo&= ~0x1C0; /* clear key */ - lo|= 0x040; /* set key to 1 */ - wrmsr(MSR_IDT_MCR_CTRL, lo, hi); /* unlock MCR */ - } - - centaur_mcr_type = 1; - - /* - * Clear any unconfigured MCR's. - */ - - for (i = 0; i < 8; ++i) - { - if(centaur_mcr[i]. high == 0 && centaur_mcr[i].low == 0) - { - if(!(lo & (1<<(9+i)))) - wrmsr (MSR_IDT_MCR0 + i , 0, 0); - else - /* - * If the BIOS set up an MCR we cannot see it - * but we don't wish to obliterate it - */ - centaur_mcr_reserved |= (1<= 0x80000008)) { - u32 phys_addr; - phys_addr = cpuid_eax(0x80000008) & 0xff ; - size_or_mask = ~((1 << (phys_addr - PAGE_SHIFT)) - 1); - size_and_mask = ~size_or_mask & 0xfff00000; - break; - } - size_or_mask = 0xff000000; /* 36 bits */ - size_and_mask = 0x00f00000; - break; - - case X86_VENDOR_CENTAUR: - /* VIA Cyrix family have Intel style MTRRs, but don't support PAE */ - if (boot_cpu_data.x86 == 6) { - size_or_mask = 0xfff00000; /* 32 bits */ - size_and_mask = 0; - } - break; - - default: - /* Intel, etc. 
*/ - size_or_mask = 0xff000000; /* 36 bits */ - size_and_mask = 0x00f00000; - break; - } - - } else if ( cpu_has_k6_mtrr ) { - /* Pre-Athlon (K6) AMD CPU MTRRs */ - mtrr_if = MTRR_IF_AMD_K6; - get_mtrr = amd_get_mtrr; - set_mtrr_up = amd_set_mtrr_up; - size_or_mask = 0xfff00000; /* 32 bits */ - size_and_mask = 0; - } else if ( cpu_has_cyrix_arr ) { - /* Cyrix ARRs */ - mtrr_if = MTRR_IF_CYRIX_ARR; - get_mtrr = cyrix_get_arr; - set_mtrr_up = cyrix_set_arr_up; - get_free_region = cyrix_get_free_region; - cyrix_arr_init(); - size_or_mask = 0xfff00000; /* 32 bits */ - size_and_mask = 0; - } else if ( cpu_has_centaur_mcr ) { - /* Centaur MCRs */ - mtrr_if = MTRR_IF_CENTAUR_MCR; - get_mtrr = centaur_get_mcr; - set_mtrr_up = centaur_set_mcr_up; - get_free_region = centaur_get_free_region; - centaur_mcr_init(); - size_or_mask = 0xfff00000; /* 32 bits */ - size_and_mask = 0; - } else { - /* No supported MTRR interface */ - mtrr_if = MTRR_IF_NONE; - } - - printk ("mtrr: v%s Richard Gooch (rgooch@atnf.csiro.au)\n" - "mtrr: detected mtrr type: %s\n", - MTRR_VERSION, mtrr_if_name[mtrr_if]); - - return (mtrr_if != MTRR_IF_NONE); -} /* End Function mtrr_setup */ - -#ifdef CONFIG_SMP - -static volatile unsigned long smp_changes_mask __initdata = 0; -static struct mtrr_state smp_mtrr_state __initdata = {0, 0}; - -void __init mtrr_init_boot_cpu(void) -{ - if ( !mtrr_setup () ) - return; - - if ( mtrr_if == MTRR_IF_INTEL ) { - /* Only for Intel MTRRs */ - get_mtrr_state (&smp_mtrr_state); - } -} /* End Function mtrr_init_boot_cpu */ - -static void __init intel_mtrr_init_secondary_cpu(void) -{ - unsigned long mask, count; - struct set_mtrr_context ctxt; - - /* Note that this is not ideal, since the cache is only flushed/disabled - for this CPU while the MTRRs are changed, but changing this requires - more invasive changes to the way the kernel boots */ - set_mtrr_prepare_save (&ctxt); - set_mtrr_cache_disable (&ctxt); - mask = set_mtrr_state (&smp_mtrr_state, &ctxt); - set_mtrr_done 
(&ctxt); - /* Use the atomic bitops to update the global mask */ - for (count = 0; count < sizeof mask * 8; ++count) - { - if (mask & 0x01) set_bit (count, &smp_changes_mask); - mask >>= 1; - } -} /* End Function intel_mtrr_init_secondary_cpu */ - -void __init mtrr_init_secondary_cpu(void) -{ - switch ( mtrr_if ) { - case MTRR_IF_INTEL: - /* Intel (P6) standard MTRRs */ - intel_mtrr_init_secondary_cpu(); - break; - case MTRR_IF_CYRIX_ARR: - /* This is _completely theoretical_! - * I assume here that one day Cyrix will support Intel APIC. - * In reality on non-Intel CPUs we won't even get to this routine. - * Hopefully no one will plug two Cyrix processors in a dual P5 board. - * :-) - */ - cyrix_arr_init_secondary (); - break; - case MTRR_IF_NONE: - break; - default: - /* I see no MTRRs I can support in SMP mode... */ - printk ("mtrr: SMP support incomplete for this vendor\n"); - } -} /* End Function mtrr_init_secondary_cpu */ -#endif /* CONFIG_SMP */ - -int __init mtrr_init(void) -{ -#ifdef CONFIG_SMP - /* mtrr_setup() should already have been called from mtrr_init_boot_cpu() */ - - if ( mtrr_if == MTRR_IF_INTEL ) { - finalize_mtrr_state (&smp_mtrr_state); - mtrr_state_warn (smp_changes_mask); - } -#else - if ( !mtrr_setup() ) - return 0; /* MTRRs not supported? 
*/ -#endif - -#ifdef CONFIG_PROC_FS - proc_root_mtrr = create_proc_entry ("mtrr", S_IWUSR | S_IRUGO, &proc_root); - if (proc_root_mtrr) { - proc_root_mtrr->owner = THIS_MODULE; - proc_root_mtrr->proc_fops = &mtrr_fops; - } -#endif -#ifdef USERSPACE_INTERFACE - devfs_handle = devfs_register (NULL, "cpu/mtrr", DEVFS_FL_DEFAULT, 0, 0, - S_IFREG | S_IRUGO | S_IWUSR, - &mtrr_fops, NULL); -#endif - init_table (); - return 0; -} /* End Function mtrr_init */ - -/* - * Local Variables: - * mode:c - * c-file-style:"k&r" - * c-basic-offset:4 - * End: - */ diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index b8c97ff93fb2..a2e4b515708b 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -968,10 +968,6 @@ static void __init smp_boot_cpus(unsigned int max_cpus) { int apicid, cpu, bit; -#ifdef CONFIG_MTRR - /* Must be done before other processors booted */ - mtrr_init_boot_cpu (); -#endif /* * Initialize the logical to physical CPU number mapping * and the per-CPU profiling counter/multiplier diff --git a/include/linux/smp.h b/include/linux/smp.h index 6f6c6ed7a239..8ed8547e5212 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -96,6 +96,7 @@ static inline void smp_send_reschedule_all(void) { } #define cpu_online_map 1 #define cpu_online(cpu) ({ cpu; 1; }) #define num_online_cpus() 1 +#define num_booting_cpus() 1 struct notifier_block; -- cgit v1.2.3 From 6a95284048359fb4e1c96e02c6be0be9bdc71d6c Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 14 Aug 2002 21:20:44 -0700 Subject: [PATCH] pagevec infrastructure This is the first patch in a series of eight which address pagemap_lru_lock contention, and which simplify the VM locking hierarchy. Most testing has been done with all eight patches applied, so it would be best not to cherrypick, please. The workload which was optimised was: 4x500MHz PIII CPUs, mem=512m, six disks, six filesystems, six processes each flat-out writing a large file onto one of the disks. 
i.e., heavy page replacement load. The frequency with which pagemap_lru_lock is taken is reduced by 90%. Lockmeter claims that pagemap_lru_lock contention on the 4-way has been reduced by 98%. Total amount of system time lost to lock spinning went from 2.5% to 0.85%. Anton ran a similar test on 8-way PPC; the reduction in system time was around 25%, and the reduction in time spent playing with pagemap_lru_lock was 80%. http://samba.org/~anton/linux/2.5.30/standard/ versus http://samba.org/~anton/linux/2.5.30/akpm/ Throughput changes on uniprocessor are modest: a 1% speedup with this workload due to shortened code paths and improved cache locality. The patches do two main things: 1: In almost all places where the kernel was doing something with lots of pages one-at-a-time, convert the code to do the same thing sixteen-pages-at-a-time. Take the lock once rather than sixteen times. Take the lock for the minimum possible time. 2: Multithread the pagecache reclaim function: don't hold pagemap_lru_lock while reclaiming pagecache pages. That function was massively expensive. One fallout from this work is that we never take any other locks while holding pagemap_lru_lock. So this lock conceptually disappears from the VM locking hierarchy. So. This is all basically a code tweak to improve kernel scalability. It does it by optimising the existing design, rather than by redesign. There is little conceptual change to how the VM works. This is as far as I can tweak it. It seems that the results are now acceptable on SMP. But things are still bad on NUMA. It is expected that the per-zone LRU and per-zone LRU lock patches will fix NUMA as well, but that has yet to be tested. This first patch introduces `struct pagevec', which is the basic unit of batched work. It is simply: struct pagevec { unsigned nr; struct page *pages[16]; }; pagevecs are used in the following patches to get the VM away from page-at-a-time operations.
This patch includes all the pagevec library functions which are used in later patches. --- include/linux/pagevec.h | 76 +++++++++++++++++++++++ mm/page_alloc.c | 9 +++ mm/swap.c | 160 +++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 236 insertions(+), 9 deletions(-) create mode 100644 include/linux/pagevec.h (limited to 'include') diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h new file mode 100644 index 000000000000..7d091aea7543 --- /dev/null +++ b/include/linux/pagevec.h @@ -0,0 +1,76 @@ +/* + * include/linux/pagevec.h + * + * In many places it is efficient to batch an operation up against multiple + * pages. A pagevec is a multipage container which is used for that. + */ + +#define PAGEVEC_SIZE 16 + +struct page; + +struct pagevec { + unsigned nr; + struct page *pages[PAGEVEC_SIZE]; +}; + +void __pagevec_release(struct pagevec *pvec); +void __pagevec_release_nonlru(struct pagevec *pvec); +void __pagevec_free(struct pagevec *pvec); +void __pagevec_lru_add(struct pagevec *pvec); +void __pagevec_lru_del(struct pagevec *pvec); +void pagevec_deactivate_inactive(struct pagevec *pvec); + +static inline void pagevec_init(struct pagevec *pvec) +{ + pvec->nr = 0; +} + +static inline unsigned pagevec_count(struct pagevec *pvec) +{ + return pvec->nr; +} + +static inline unsigned pagevec_space(struct pagevec *pvec) +{ + return PAGEVEC_SIZE - pvec->nr; +} + +/* + * Add a page to a pagevec. Returns the number of slots still available. 
+ */ +static inline unsigned pagevec_add(struct pagevec *pvec, struct page *page) +{ + pvec->pages[pvec->nr++] = page; + return pagevec_space(pvec); +} + +static inline void pagevec_release(struct pagevec *pvec) +{ + if (pagevec_count(pvec)) + __pagevec_release(pvec); +} + +static inline void pagevec_release_nonlru(struct pagevec *pvec) +{ + if (pagevec_count(pvec)) + __pagevec_release_nonlru(pvec); +} + +static inline void pagevec_free(struct pagevec *pvec) +{ + if (pagevec_count(pvec)) + __pagevec_free(pvec); +} + +static inline void pagevec_lru_add(struct pagevec *pvec) +{ + if (pagevec_count(pvec)) + __pagevec_lru_add(pvec); +} + +static inline void pagevec_lru_del(struct pagevec *pvec) +{ + if (pagevec_count(pvec)) + __pagevec_lru_del(pvec); +} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0e8696dfa898..2f51b6ac9df5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -22,6 +22,7 @@ #include #include #include +#include unsigned long totalram_pages; unsigned long totalhigh_pages; @@ -458,6 +459,14 @@ void page_cache_release(struct page *page) } } +void __pagevec_free(struct pagevec *pvec) +{ + int i = pagevec_count(pvec); + + while (--i >= 0) + __free_pages_ok(pvec->pages[i], 0); +} + void __free_pages(struct page *page, unsigned int order) { if (!PageReserved(page) && put_page_testzero(page)) diff --git a/mm/swap.c b/mm/swap.c index 6dcaf7567559..0d2be8e015d5 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -17,11 +17,9 @@ #include #include #include +#include #include - -#include -#include /* for copy_to/from_user */ -#include +#include /* How many pages do we try to swap or page in/out together? */ int page_cluster; @@ -38,6 +36,9 @@ static inline void activate_page_nolock(struct page * page) } } +/* + * FIXME: speed this up? 
+ */ void activate_page(struct page * page) { spin_lock(&pagemap_lru_lock); @@ -51,9 +52,10 @@ void activate_page(struct page * page) */ void lru_cache_add(struct page * page) { - if (!TestSetPageLRU(page)) { + if (!PageLRU(page)) { spin_lock(&pagemap_lru_lock); - add_page_to_inactive_list(page); + if (!TestSetPageLRU(page)) + add_page_to_inactive_list(page); spin_unlock(&pagemap_lru_lock); } } @@ -68,11 +70,10 @@ void lru_cache_add(struct page * page) void __lru_cache_del(struct page * page) { if (TestClearPageLRU(page)) { - if (PageActive(page)) { + if (PageActive(page)) del_page_from_active_list(page); - } else { + else del_page_from_inactive_list(page); - } } } @@ -87,6 +88,147 @@ void lru_cache_del(struct page * page) spin_unlock(&pagemap_lru_lock); } +/* + * Batched page_cache_release(). Decrement the reference count on all the + * pagevec's pages. If it fell to zero then remove the page from the LRU and + * free it. + * + * Avoid taking pagemap_lru_lock if possible, but if it is taken, retain it + * for the remainder of the operation. + * + * The locking in this function is against shrink_cache(): we recheck the + * page count inside the lock to see whether shrink_cache grabbed the page + * via the LRU. If it did, give up: shrink_cache will free it. + * + * This function reinitialises the caller's pagevec. 
+ */ +void __pagevec_release(struct pagevec *pvec) +{ + int i; + int lock_held = 0; + struct pagevec pages_to_free; + + pagevec_init(&pages_to_free); + for (i = 0; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + + if (!put_page_testzero(page)) + continue; + + if (!lock_held && PageLRU(page)) { + spin_lock(&pagemap_lru_lock); + lock_held = 1; + } + + if (TestClearPageLRU(page)) { + if (PageActive(page)) + del_page_from_active_list(page); + else + del_page_from_inactive_list(page); + } + if (page_count(page) == 0) + pagevec_add(&pages_to_free, page); + } + if (lock_held) + spin_unlock(&pagemap_lru_lock); + + pagevec_free(&pages_to_free); + pagevec_init(pvec); +} + +/* + * pagevec_release() for pages which are known to not be on the LRU + * + * This function reinitialises the caller's pagevec. + */ +void __pagevec_release_nonlru(struct pagevec *pvec) +{ + int i; + struct pagevec pages_to_free; + + pagevec_init(&pages_to_free); + for (i = 0; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + + BUG_ON(PageLRU(page)); + if (put_page_testzero(page)) + pagevec_add(&pages_to_free, page); + } + pagevec_free(&pages_to_free); + pagevec_init(pvec); +} + +/* + * Move all the inactive pages to the head of the inactive list + * and release them. Reinitialises the caller's pagevec. + */ +void pagevec_deactivate_inactive(struct pagevec *pvec) +{ + int i; + int lock_held = 0; + + if (pagevec_count(pvec) == 0) + return; + for (i = 0; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + + if (!lock_held) { + if (PageActive(page) || !PageLRU(page)) + continue; + spin_lock(&pagemap_lru_lock); + lock_held = 1; + } + if (!PageActive(page) && PageLRU(page)) + list_move(&page->lru, &inactive_list); + } + if (lock_held) + spin_unlock(&pagemap_lru_lock); + __pagevec_release(pvec); +} + +/* + * Add the passed pages to the inactive_list, then drop the caller's refcount + * on them. Reinitialises the caller's pagevec. 
+ */ +void __pagevec_lru_add(struct pagevec *pvec) +{ + int i; + + spin_lock(&pagemap_lru_lock); + for (i = 0; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + + if (TestSetPageLRU(page)) + BUG(); + add_page_to_inactive_list(page); + } + spin_unlock(&pagemap_lru_lock); + pagevec_release(pvec); +} + +/* + * Remove the passed pages from the LRU, then drop the caller's refcount on + * them. Reinitialises the caller's pagevec. + */ +void __pagevec_lru_del(struct pagevec *pvec) +{ + int i; + + spin_lock(&pagemap_lru_lock); + for (i = 0; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + + if (!TestClearPageLRU(page)) + BUG(); + if (PageActive(page)) + del_page_from_active_list(page); + else + del_page_from_inactive_list(page); + } + spin_unlock(&pagemap_lru_lock); + pagevec_release(pvec); +} + /* * Perform any setup for the swap system */ -- cgit v1.2.3 From 3aa1dc772547672e6ff453117d169c47a5a7cbc5 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 14 Aug 2002 21:20:48 -0700 Subject: [PATCH] multithread page reclaim This patch multithreads the main page reclaim function, shrink_cache(). This function used to run under pagemap_lru_lock. Instead, we grab that lock, put 32 pages from the LRU into a private list, drop the pagemap_lru_lock and then proceed to attempt to free those pages. Any pages which were successfully reclaimed are batch-freed. Pages which were not reclaimed are re-added to the LRU. This patch reduces pagemap_lru_lock contention on the 4-way by a factor of thirty. The shrink_cache() code has been simplified somewhat. refill_inactive() was being called too often - often just to process two or three pages. Fiddled with that so it processes pages at the same rate, but works on 32 pages at a time. Added a couple of mark_page_accessed() calls into mm/memory.c from 2.4. They seem appropriate.
Change the shrink_caches() logic so that it will still trickle through the active list (via refill_inactive) even if the inactive list is much larger than the active list. --- include/linux/mm.h | 1 + include/linux/page-flags.h | 2 + include/linux/swap.h | 9 +- mm/filemap.c | 3 +- mm/memory.c | 2 + mm/swap_state.c | 1 + mm/vmscan.c | 519 +++++++++++++++++++++++++++------------------ 7 files changed, 322 insertions(+), 215 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index baafd9a57b25..df42d899e41f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -195,6 +195,7 @@ struct page { */ #define get_page(p) atomic_inc(&(p)->count) #define put_page(p) __free_page(p) +#define __put_page(p) atomic_dec(&(p)->count) #define put_page_testzero(p) atomic_dec_and_test(&(p)->count) #define page_count(p) atomic_read(&(p)->count) #define set_page_count(p,v) atomic_set(&(p)->count, v) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index f6b48a987cd4..9801b15876d9 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -154,6 +154,7 @@ extern void get_page_state(struct page_state *ret); ret; \ }) +#define SetPageLRU(page) set_bit(PG_lru, &(page)->flags) #define PageLRU(page) test_bit(PG_lru, &(page)->flags) #define TestSetPageLRU(page) test_and_set_bit(PG_lru, &(page)->flags) #define TestClearPageLRU(page) test_and_clear_bit(PG_lru, &(page)->flags) @@ -161,6 +162,7 @@ extern void get_page_state(struct page_state *ret); #define PageActive(page) test_bit(PG_active, &(page)->flags) #define SetPageActive(page) set_bit(PG_active, &(page)->flags) #define ClearPageActive(page) clear_bit(PG_active, &(page)->flags) +#define TestClearPageActive(page) test_and_clear_bit(PG_active, &(page)->flags) #define PageSlab(page) test_bit(PG_slab, &(page)->flags) #define SetPageSlab(page) set_bit(PG_slab, &(page)->flags) diff --git a/include/linux/swap.h b/include/linux/swap.h index b3bae533a6a4..8dbd9d7e401d 
100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -227,12 +227,17 @@ do { \ BUG(); \ } while (0) +#define __add_page_to_active_list(page) \ +do { \ + list_add(&(page)->lru, &active_list); \ + inc_page_state(nr_active); \ +} while (0) + #define add_page_to_active_list(page) \ do { \ DEBUG_LRU_PAGE(page); \ SetPageActive(page); \ - list_add(&(page)->lru, &active_list); \ - inc_page_state(nr_active); \ + __add_page_to_active_list(page); \ } while (0) #define add_page_to_inactive_list(page) \ diff --git a/mm/filemap.c b/mm/filemap.c index c06901afd44c..68ad674fa961 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -545,7 +545,8 @@ int add_to_page_cache(struct page *page, page_cache_get(page); } write_unlock(&mapping->page_lock); - if (!error) + /* Anon pages are already on the LRU */ + if (!error && !PageSwapCache(page)) lru_cache_add(page); return error; } diff --git a/mm/memory.c b/mm/memory.c index f1c4f9c2dcbd..006644503383 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1180,6 +1180,7 @@ static int do_swap_page(struct mm_struct * mm, KERNEL_STAT_INC(pgmajfault); } + mark_page_accessed(page); lock_page(page); /* @@ -1257,6 +1258,7 @@ static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, flush_page_to_ram(page); entry = pte_mkwrite(pte_mkdirty(mk_pte(page, vma->vm_page_prot))); lru_cache_add(page); + mark_page_accessed(page); } set_pte(page_table, entry); diff --git a/mm/swap_state.c b/mm/swap_state.c index 62c3448c2b34..332e0d26732d 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -381,6 +381,7 @@ struct page * read_swap_cache_async(swp_entry_t entry) /* * Initiate read into locked page and return. 
*/ + lru_cache_add(new_page); swap_readpage(NULL, new_page); return new_page; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 21ca8d248bef..45aad6a76ae4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -23,6 +23,7 @@ #include #include #include /* for try_to_release_page() */ +#include #include #include @@ -36,10 +37,35 @@ */ #define DEF_PRIORITY (6) -static inline int is_page_cache_freeable(struct page * page) -{ - return page_count(page) - !!PagePrivate(page) == 1; -} +#ifdef ARCH_HAS_PREFETCH +#define prefetch_prev_lru_page(_page, _base, _field) \ + do { \ + if ((_page)->lru.prev != _base) { \ + struct page *prev; \ + \ + prev = list_entry(_page->lru.prev, \ + struct page, lru); \ + prefetch(&prev->_field); \ + } \ + } while (0) +#else +#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0) +#endif + +#ifdef ARCH_HAS_PREFETCHW +#define prefetchw_prev_lru_page(_page, _base, _field) \ + do { \ + if ((_page)->lru.prev != _base) { \ + struct page *prev; \ + \ + prev = list_entry(_page->lru.prev, \ + struct page, lru); \ + prefetchw(&prev->_field); \ + } \ + } while (0) +#else +#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0) +#endif /* Must be called with page's pte_chain_lock held. 
*/ static inline int page_mapping_inuse(struct page * page) @@ -61,89 +87,49 @@ static inline int page_mapping_inuse(struct page * page) return 0; } -static int -shrink_cache(int nr_pages, zone_t *classzone, - unsigned int gfp_mask, int priority, int max_scan) +static inline int is_page_cache_freeable(struct page *page) +{ + return page_count(page) - !!PagePrivate(page) == 2; +} + +static /* inline */ int +shrink_list(struct list_head *page_list, int nr_pages, zone_t *classzone, + unsigned int gfp_mask, int priority, int *max_scan) { - struct list_head * entry; struct address_space *mapping; + LIST_HEAD(ret_pages); + struct pagevec freed_pvec; + const int nr_pages_in = nr_pages; + int pgactivate = 0; - spin_lock(&pagemap_lru_lock); - while (--max_scan >= 0 && - (entry = inactive_list.prev) != &inactive_list) { + pagevec_init(&freed_pvec); + while (!list_empty(page_list)) { struct page *page; int may_enter_fs; - if (need_resched()) { - spin_unlock(&pagemap_lru_lock); - __set_current_state(TASK_RUNNING); - schedule(); - spin_lock(&pagemap_lru_lock); - continue; - } - - page = list_entry(entry, struct page, lru); - - if (unlikely(!PageLRU(page))) - BUG(); - if (unlikely(PageActive(page))) - BUG(); - - list_del(entry); - list_add(entry, &inactive_list); - KERNEL_STAT_INC(pgscan); - - /* - * Zero page counts can happen because we unlink the pages - * _after_ decrementing the usage count.. - */ - if (unlikely(!page_count(page))) - continue; - + page = list_entry(page_list->prev, struct page, lru); + list_del(&page->lru); if (!memclass(page_zone(page), classzone)) - continue; - - /* - * swap activity never enters the filesystem and is safe - * for GFP_NOFS allocations. - */ - may_enter_fs = (gfp_mask & __GFP_FS) || - (PageSwapCache(page) && (gfp_mask & __GFP_IO)); - - /* - * IO in progress? Leave it at the back of the list. 
- */ - if (unlikely(PageWriteback(page))) { - if (may_enter_fs) { - page_cache_get(page); - spin_unlock(&pagemap_lru_lock); - wait_on_page_writeback(page); - page_cache_release(page); - spin_lock(&pagemap_lru_lock); - } - continue; - } + goto keep; if (TestSetPageLocked(page)) - continue; + goto keep; - if (PageWriteback(page)) { /* The non-racy check */ - unlock_page(page); - continue; + BUG_ON(PageActive(page)); + may_enter_fs = (gfp_mask & __GFP_FS) || + (PageSwapCache(page) && (gfp_mask & __GFP_IO)); + if (PageWriteback(page)) { + if (may_enter_fs) + wait_on_page_writeback(page); /* throttling */ + else + goto keep_locked; } - /* - * The page is in active use or really unfreeable. Move to - * the active list. - */ pte_chain_lock(page); if (page_referenced(page) && page_mapping_inuse(page)) { - del_page_from_inactive_list(page); - add_page_to_active_list(page); + /* In active use or really unfreeable. Activate it. */ pte_chain_unlock(page); - unlock_page(page); - KERNEL_STAT_INC(pgactivate); - continue; + goto activate_locked; } /* @@ -153,18 +139,9 @@ shrink_cache(int nr_pages, zone_t *classzone, * XXX: implement swap clustering ? 
*/ if (page->pte.chain && !page->mapping && !PagePrivate(page)) { - page_cache_get(page); pte_chain_unlock(page); - spin_unlock(&pagemap_lru_lock); - if (!add_to_swap(page)) { - activate_page(page); - unlock_page(page); - page_cache_release(page); - spin_lock(&pagemap_lru_lock); - continue; - } - page_cache_release(page); - spin_lock(&pagemap_lru_lock); + if (!add_to_swap(page)) + goto activate_locked; pte_chain_lock(page); } @@ -174,30 +151,22 @@ shrink_cache(int nr_pages, zone_t *classzone, */ if (page->pte.chain) { switch (try_to_unmap(page)) { - case SWAP_ERROR: - case SWAP_FAIL: - goto page_active; - case SWAP_AGAIN: - pte_chain_unlock(page); - unlock_page(page); - continue; - case SWAP_SUCCESS: - ; /* try to free the page below */ + case SWAP_ERROR: + case SWAP_FAIL: + pte_chain_unlock(page); + goto activate_locked; + case SWAP_AGAIN: + pte_chain_unlock(page); + goto keep_locked; + case SWAP_SUCCESS: + ; /* try to free the page below */ } } pte_chain_unlock(page); mapping = page->mapping; if (PageDirty(page) && is_page_cache_freeable(page) && - page->mapping && may_enter_fs) { - /* - * It is not critical here to write it only if - * the page is unmapped beause any direct writer - * like O_DIRECT would set the page's dirty bitflag - * on the physical page after having successfully - * pinned it and after the I/O to the page is finished, - * so the direct writes to the page cannot get lost. 
- */ + mapping && may_enter_fs) { int (*writeback)(struct page *, int *); const int cluster_size = SWAP_CLUSTER_MAX; int nr_to_write = cluster_size; @@ -205,13 +174,9 @@ shrink_cache(int nr_pages, zone_t *classzone, writeback = mapping->a_ops->vm_writeback; if (writeback == NULL) writeback = generic_vm_writeback; - page_cache_get(page); - spin_unlock(&pagemap_lru_lock); (*writeback)(page, &nr_to_write); - max_scan -= (cluster_size - nr_to_write); - page_cache_release(page); - spin_lock(&pagemap_lru_lock); - continue; + *max_scan -= (cluster_size - nr_to_write); + goto keep; } /* @@ -227,162 +192,292 @@ shrink_cache(int nr_pages, zone_t *classzone, * will do this, as well as the blockdev mapping. * try_to_release_page() will discover that cleanness and will * drop the buffers and mark the page clean - it can be freed. + * + * Rarely, pages can have buffers and no ->mapping. These are + * the pages which were not successfully invalidated in + * truncate_complete_page(). We try to drop those buffers here + * and if that worked, and the page is no longer mapped into + * process address space (page_count == 0) it can be freed. + * Otherwise, leave the page on the LRU so it is swappable. */ if (PagePrivate(page)) { - spin_unlock(&pagemap_lru_lock); - - /* avoid to free a locked page */ - page_cache_get(page); + if (!try_to_release_page(page, 0)) + goto keep_locked; + if (!mapping && page_count(page) == 1) + goto free_it; + } - if (try_to_release_page(page, gfp_mask)) { - if (!mapping) { - /* effectively free the page here */ - unlock_page(page); - page_cache_release(page); - - spin_lock(&pagemap_lru_lock); - if (--nr_pages) - continue; - break; - } else { - /* - * The page is still in pagecache so undo the stuff - * before the try_to_release_page since we've not - * finished and we can now try the next step. 
- */ - page_cache_release(page); - - spin_lock(&pagemap_lru_lock); - } - } else { - /* failed to drop the buffers so stop here */ - unlock_page(page); - page_cache_release(page); + if (!mapping) + goto keep_locked; /* truncate got there first */ - spin_lock(&pagemap_lru_lock); - continue; - } - } + write_lock(&mapping->page_lock); /* - * This is the non-racy check for busy page. + * The non-racy check for busy page. It is critical to check + * PageDirty _after_ making sure that the page is freeable and + * not in use by anybody. (pagecache + us == 2) */ - if (mapping) { - write_lock(&mapping->page_lock); - if (is_page_cache_freeable(page)) - goto page_freeable; + if (page_count(page) != 2 || PageDirty(page)) { write_unlock(&mapping->page_lock); - } - unlock_page(page); - continue; -page_freeable: - /* - * It is critical to check PageDirty _after_ we made sure - * the page is freeable* so not in use by anybody. - */ - if (PageDirty(page)) { - write_unlock(&mapping->page_lock); - unlock_page(page); - continue; + goto keep_locked; } - /* point of no return */ - if (likely(!PageSwapCache(page))) { - __remove_from_page_cache(page); - write_unlock(&mapping->page_lock); - } else { - swp_entry_t swap; - swap.val = page->index; + if (PageSwapCache(page)) { + swp_entry_t swap = { .val = page->index }; __delete_from_swap_cache(page); write_unlock(&mapping->page_lock); swap_free(swap); + } else { + __remove_from_page_cache(page); + write_unlock(&mapping->page_lock); } - - __lru_cache_del(page); + __put_page(page); /* The pagecache ref */ +free_it: unlock_page(page); + nr_pages--; + if (!pagevec_add(&freed_pvec, page)) + __pagevec_release_nonlru(&freed_pvec); + continue; - /* effectively free the page here */ - page_cache_release(page); - KERNEL_STAT_INC(pgsteal); - if (--nr_pages) - continue; - goto out; -page_active: - /* - * OK, we don't know what to do with the page. - * It's no use keeping it here, so we move it to - * the active list. 
- */ - del_page_from_inactive_list(page); - add_page_to_active_list(page); - pte_chain_unlock(page); +activate_locked: + SetPageActive(page); + pgactivate++; +keep_locked: unlock_page(page); - KERNEL_STAT_INC(pgactivate); +keep: + list_add(&page->lru, &ret_pages); + BUG_ON(PageLRU(page)); } -out: spin_unlock(&pagemap_lru_lock); + list_splice(&ret_pages, page_list); + if (pagevec_count(&freed_pvec)) + __pagevec_release_nonlru(&freed_pvec); + KERNEL_STAT_ADD(pgsteal, nr_pages_in - nr_pages); + KERNEL_STAT_ADD(pgactivate, pgactivate); return nr_pages; } /* - * This moves pages from the active list to - * the inactive list. + * pagemap_lru_lock is heavily contended. We relieve it by quickly privatising + * a batch of pages and working on them outside the lock. Any pages which were + * not freed will be added back to the LRU. + * + * shrink_cache() is passed the number of pages to try to free, and returns + * the number which are yet-to-free. * - * We move them the other way if the page is - * referenced by one or more processes, from rmap + * For pagecache intensive workloads, the first loop here is the hottest spot + * in the kernel (apart from the copy_*_user functions). */ -static void refill_inactive(int nr_pages) +static /* inline */ int +shrink_cache(int nr_pages, zone_t *classzone, + unsigned int gfp_mask, int priority, int max_scan) { - struct list_head * entry; + LIST_HEAD(page_list); + struct pagevec pvec; + int nr_to_process; + + /* + * Try to ensure that we free `nr_pages' pages in one pass of the loop.
+ */ + nr_to_process = nr_pages; + if (nr_to_process < SWAP_CLUSTER_MAX) + nr_to_process = SWAP_CLUSTER_MAX; + + pagevec_init(&pvec); spin_lock(&pagemap_lru_lock); - entry = active_list.prev; - while (nr_pages-- && entry != &active_list) { - struct page * page; + while (max_scan > 0 && nr_pages > 0) { + struct page *page; + int n = 0; - page = list_entry(entry, struct page, lru); - entry = entry->prev; + while (n < nr_to_process && !list_empty(&inactive_list)) { + page = list_entry(inactive_list.prev, struct page, lru); - KERNEL_STAT_INC(pgscan); + prefetchw_prev_lru_page(page, &inactive_list, flags); - pte_chain_lock(page); - if (page->pte.chain && page_referenced(page)) { + if (!TestClearPageLRU(page)) + BUG(); + list_del(&page->lru); + if (page_count(page) == 0) { + /* It is currently in pagevec_release() */ + SetPageLRU(page); + list_add(&page->lru, &inactive_list); + continue; + } + list_add(&page->lru, &page_list); + page_cache_get(page); + n++; + } + spin_unlock(&pagemap_lru_lock); + + if (list_empty(&page_list)) + goto done; + + max_scan -= n; + mod_page_state(nr_inactive, -n); + KERNEL_STAT_ADD(pgscan, n); + nr_pages = shrink_list(&page_list, nr_pages, classzone, + gfp_mask, priority, &max_scan); + + if (nr_pages <= 0 && list_empty(&page_list)) + goto done; + + spin_lock(&pagemap_lru_lock); + /* + * Put back any unfreeable pages. + */ + while (!list_empty(&page_list)) { + page = list_entry(page_list.prev, struct page, lru); + if (TestSetPageLRU(page)) + BUG(); list_del(&page->lru); - list_add(&page->lru, &active_list); + if (PageActive(page)) + __add_page_to_active_list(page); + else + add_page_to_inactive_list(page); + if (!pagevec_add(&pvec, page)) { + spin_unlock(&pagemap_lru_lock); + __pagevec_release(&pvec); + spin_lock(&pagemap_lru_lock); + } + } + } + spin_unlock(&pagemap_lru_lock); +done: + pagevec_release(&pvec); + return nr_pages; +} + +/* + * This moves pages from the active list to the inactive list. 
+ * + * We move them the other way if the page is referenced by one or more + * processes, from rmap. + * + * If the pages are mostly unmapped, the processing is fast and it is + * appropriate to hold pagemap_lru_lock across the whole operation. But if + * the pages are mapped, the processing is slow (page_referenced()) so we + * should drop pagemap_lru_lock around each page. It's impossible to balance + * this, so instead we remove the pages from the LRU while processing them. + * It is safe to rely on PG_active against the non-LRU pages in here because + * nobody will play with that bit on a non-LRU page. + * + * The downside is that we have to touch page->count against each page. + * But we had to alter page->flags anyway. + */ +static /* inline */ void refill_inactive(const int nr_pages_in) +{ + int pgdeactivate = 0; + int nr_pages = nr_pages_in; + LIST_HEAD(l_hold); /* The pages which were snipped off */ + LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ + LIST_HEAD(l_active); /* Pages to go onto the active_list */ + struct page *page; + struct pagevec pvec; + + spin_lock(&pagemap_lru_lock); + while (nr_pages && !list_empty(&active_list)) { + page = list_entry(active_list.prev, struct page, lru); + prefetchw_prev_lru_page(page, &active_list, flags); + if (!TestClearPageLRU(page)) + BUG(); + page_cache_get(page); + list_move(&page->lru, &l_hold); + nr_pages--; + } + spin_unlock(&pagemap_lru_lock); + + while (!list_empty(&l_hold)) { + page = list_entry(l_hold.prev, struct page, lru); + list_del(&page->lru); + if (page->pte.chain) { + if (test_and_set_bit(PG_chainlock, &page->flags)) { + list_add(&page->lru, &l_active); + continue; + } + if (page->pte.chain && page_referenced(page)) { + pte_chain_unlock(page); + list_add(&page->lru, &l_active); + continue; + } pte_chain_unlock(page); - continue; } - del_page_from_active_list(page); - add_page_to_inactive_list(page); - pte_chain_unlock(page); - KERNEL_STAT_INC(pgdeactivate); + list_add(&page->lru, 
&l_inactive); + pgdeactivate++; + } + + pagevec_init(&pvec); + spin_lock(&pagemap_lru_lock); + while (!list_empty(&l_inactive)) { + page = list_entry(l_inactive.prev, struct page, lru); + prefetchw_prev_lru_page(page, &l_inactive, flags); + if (TestSetPageLRU(page)) + BUG(); + if (!TestClearPageActive(page)) + BUG(); + list_move(&page->lru, &inactive_list); + if (!pagevec_add(&pvec, page)) { + spin_unlock(&pagemap_lru_lock); + __pagevec_release(&pvec); + spin_lock(&pagemap_lru_lock); + } + } + while (!list_empty(&l_active)) { + page = list_entry(l_active.prev, struct page, lru); + prefetchw_prev_lru_page(page, &l_active, flags); + if (TestSetPageLRU(page)) + BUG(); + BUG_ON(!PageActive(page)); + list_move(&page->lru, &active_list); + if (!pagevec_add(&pvec, page)) { + spin_unlock(&pagemap_lru_lock); + __pagevec_release(&pvec); + spin_lock(&pagemap_lru_lock); + } } spin_unlock(&pagemap_lru_lock); + pagevec_release(&pvec); + + mod_page_state(nr_active, -pgdeactivate); + mod_page_state(nr_inactive, pgdeactivate); + KERNEL_STAT_ADD(pgscan, nr_pages_in - nr_pages); + KERNEL_STAT_ADD(pgdeactivate, pgdeactivate); } -static int FASTCALL(shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages)); -static int shrink_caches(zone_t * classzone, int priority, unsigned int gfp_mask, int nr_pages) +static /* inline */ int +shrink_caches(zone_t *classzone, int priority, + unsigned int gfp_mask, int nr_pages) { - int chunk_size = nr_pages; unsigned long ratio; struct page_state ps; int max_scan; + static atomic_t nr_to_refill = ATOMIC_INIT(0); - nr_pages -= kmem_cache_reap(gfp_mask); - if (nr_pages <= 0) - return 0; - - nr_pages = chunk_size; + if (kmem_cache_reap(gfp_mask) >= nr_pages) + return 0; /* - * Try to keep the active list 2/3 of the size of the cache + * Try to keep the active list 2/3 of the size of the cache. And + * make sure that refill_inactive is given a decent number of pages. + * + * The "ratio+1" here is important. 
With pagecache-intensive workloads + * the inactive list is huge, and `ratio' evaluates to zero all the + * time. Which pins the active list memory. So we add one to `ratio' + * just to make sure that the kernel will slowly sift through the + * active list. */ get_page_state(&ps); ratio = (unsigned long)nr_pages * ps.nr_active / ((ps.nr_inactive | 1) * 2); - refill_inactive(ratio); + atomic_add(ratio+1, &nr_to_refill); + if (atomic_read(&nr_to_refill) > SWAP_CLUSTER_MAX) { + atomic_sub(SWAP_CLUSTER_MAX, &nr_to_refill); + refill_inactive(SWAP_CLUSTER_MAX); + } + max_scan = ps.nr_inactive / priority; nr_pages = shrink_cache(nr_pages, classzone, gfp_mask, priority, max_scan); + if (nr_pages <= 0) return 0; -- cgit v1.2.3 From 9eb76ee2a6f64fe412bef315eccbb1dd63a203ae Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 14 Aug 2002 21:20:57 -0700 Subject: [PATCH] batched addition of pages to the LRU The patch goes through the various places which were calling lru_cache_add() against bulk pages and batches them up. Also. This whole patch series improves the behaviour of the system under heavy writeback load. There is a reduction in page allocation failures, some reduction in loss of interactivity due to page allocators getting stuck on writeback from the VM. (This is still bad though). I think it's due to the change here in mpage_writepages(). That function was originally unconditionally refiling written-back pages to the head of the inactive list. The theory being that they should be moved out of the way of page allocators, who would end up waiting on them. It appears that this simply had the effect of pushing dirty, unwritten data closer to the tail of the inactive list, making things worse. So instead, if the caller is (typically) balance_dirty_pages() then leave the pages where they are on the LRU. If the caller is PF_MEMALLOC then the pages *have* to be refiled. 
This is because VM writeback is clustered along mapping->dirty_pages, and it's almost certain that the pages which are being written are near the tail of the LRU. If they were left there, page allocators would block on them too soon. It would effectively become a synchronous write. --- fs/mpage.c | 14 +++++++++++--- include/linux/pagemap.h | 2 ++ mm/filemap.c | 49 ++++++++++++++++++++++++++++++++++++------------- mm/readahead.c | 13 +++++++++++-- mm/shmem.c | 2 +- mm/swap_state.c | 6 ++++-- 6 files changed, 65 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/fs/mpage.c b/fs/mpage.c index f098220fcf8e..b4a678288565 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -263,18 +263,25 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, struct bio *bio = NULL; unsigned page_idx; sector_t last_block_in_bio = 0; + struct pagevec lru_pvec; + pagevec_init(&lru_pvec); for (page_idx = 0; page_idx < nr_pages; page_idx++) { struct page *page = list_entry(pages->prev, struct page, list); prefetchw(&page->flags); list_del(&page->list); - if (!add_to_page_cache(page, mapping, page->index)) + if (!add_to_page_cache(page, mapping, page->index)) { bio = do_mpage_readpage(bio, page, nr_pages - page_idx, &last_block_in_bio, get_block); - page_cache_release(page); + if (!pagevec_add(&lru_pvec, page)) + __pagevec_lru_add(&lru_pvec); + } else { + page_cache_release(page); + } } + pagevec_lru_add(&lru_pvec); BUG_ON(!list_empty(pages)); if (bio) mpage_bio_submit(READ, bio); @@ -566,7 +573,8 @@ mpage_writepages(struct address_space *mapping, bio = mpage_writepage(bio, page, get_block, &last_block_in_bio, &ret); } - if (!PageActive(page) && PageLRU(page)) { + if ((current->flags & PF_MEMALLOC) && + !PageActive(page) && PageLRU(page)) { if (!pagevec_add(&pvec, page)) pagevec_deactivate_inactive(&pvec); page = NULL; diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index b559ccd68520..69e214920908 100644 --- a/include/linux/pagemap.h +++ 
b/include/linux/pagemap.h @@ -58,6 +58,8 @@ extern struct page * read_cache_page(struct address_space *mapping, extern int add_to_page_cache(struct page *page, struct address_space *mapping, unsigned long index); +extern int add_to_page_cache_lru(struct page *page, + struct address_space *mapping, unsigned long index); extern void remove_from_page_cache(struct page *page); extern void __remove_from_page_cache(struct page *page); diff --git a/mm/filemap.c b/mm/filemap.c index 68ad674fa961..454786425aa0 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -21,6 +21,7 @@ #include #include #include +#include #include /* * This is needed for the following functions: @@ -530,27 +531,37 @@ int filemap_fdatawait(struct address_space * mapping) * In the case of swapcache, try_to_swap_out() has already locked the page, so * SetPageLocked() is ugly-but-OK there too. The required page state has been * set up by swap_out_add_to_swap_cache(). + * + * This function does not add the page to the LRU. The caller must do that. */ int add_to_page_cache(struct page *page, - struct address_space *mapping, unsigned long offset) + struct address_space *mapping, pgoff_t offset) { int error; + page_cache_get(page); write_lock(&mapping->page_lock); error = radix_tree_insert(&mapping->page_tree, offset, page); if (!error) { SetPageLocked(page); ClearPageDirty(page); ___add_to_page_cache(page, mapping, offset); - page_cache_get(page); + } else { + page_cache_release(page); } write_unlock(&mapping->page_lock); - /* Anon pages are already on the LRU */ - if (!error && !PageSwapCache(page)) - lru_cache_add(page); return error; } +int add_to_page_cache_lru(struct page *page, + struct address_space *mapping, pgoff_t offset) +{ + int ret = add_to_page_cache(page, mapping, offset); + if (ret == 0) + lru_cache_add(page); + return ret; +} + /* * This adds the requested page to the page cache if it isn't already there, * and schedules an I/O to read in its contents from disk. 
@@ -566,7 +577,7 @@ static int page_cache_read(struct file * file, unsigned long offset) if (!page) return -ENOMEM; - error = add_to_page_cache(page, mapping, offset); + error = add_to_page_cache_lru(page, mapping, offset); if (!error) { error = mapping->a_ops->readpage(file, page); page_cache_release(page); @@ -797,7 +808,7 @@ repeat: if (!cached_page) return NULL; } - err = add_to_page_cache(cached_page, mapping, index); + err = add_to_page_cache_lru(cached_page, mapping, index); if (!err) { page = cached_page; cached_page = NULL; @@ -830,7 +841,7 @@ grab_cache_page_nowait(struct address_space *mapping, unsigned long index) return NULL; } page = alloc_pages(mapping->gfp_mask & ~__GFP_FS, 0); - if (page && add_to_page_cache(page, mapping, index)) { + if (page && add_to_page_cache_lru(page, mapping, index)) { page_cache_release(page); page = NULL; } @@ -994,7 +1005,7 @@ no_cached_page: break; } } - error = add_to_page_cache(cached_page, mapping, index); + error = add_to_page_cache_lru(cached_page, mapping, index); if (error) { if (error == -EEXIST) goto find_page; @@ -1704,7 +1715,7 @@ repeat: if (!cached_page) return ERR_PTR(-ENOMEM); } - err = add_to_page_cache(cached_page, mapping, index); + err = add_to_page_cache_lru(cached_page, mapping, index); if (err == -EEXIST) goto repeat; if (err < 0) { @@ -1764,8 +1775,14 @@ retry: return page; } -static inline struct page * __grab_cache_page(struct address_space *mapping, - unsigned long index, struct page **cached_page) +/* + * If the page was newly created, increment its refcount and add it to the + * caller's lru-buffering pagevec. This function is specifically for + * generic_file_write(). 
+ */ +static inline struct page * +__grab_cache_page(struct address_space *mapping, unsigned long index, + struct page **cached_page, struct pagevec *lru_pvec) { int err; struct page *page; @@ -1782,6 +1799,9 @@ repeat: goto repeat; if (err == 0) { page = *cached_page; + page_cache_get(page); + if (!pagevec_add(lru_pvec, page)) + __pagevec_lru_add(lru_pvec); *cached_page = NULL; } } @@ -1828,6 +1848,7 @@ ssize_t generic_file_write_nolock(struct file *file, const char *buf, int err; unsigned bytes; time_t time_now; + struct pagevec lru_pvec; if (unlikely((ssize_t)count < 0)) return -EINVAL; @@ -1949,6 +1970,7 @@ ssize_t generic_file_write_nolock(struct file *file, const char *buf, goto out_status; } + pagevec_init(&lru_pvec); do { unsigned long index; unsigned long offset; @@ -1972,7 +1994,7 @@ ssize_t generic_file_write_nolock(struct file *file, const char *buf, __get_user(dummy, buf+bytes-1); } - page = __grab_cache_page(mapping, index, &cached_page); + page = __grab_cache_page(mapping, index, &cached_page, &lru_pvec); if (!page) { status = -ENOMEM; break; @@ -2034,6 +2056,7 @@ ssize_t generic_file_write_nolock(struct file *file, const char *buf, out_status: err = written ? 
written : status; out: + pagevec_lru_add(&lru_pvec); return err; } diff --git a/mm/readahead.c b/mm/readahead.c index 194f56db2ae4..209c9813525d 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -12,6 +12,7 @@ #include #include #include +#include struct backing_dev_info default_backing_dev_info = { .ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE, @@ -36,6 +37,9 @@ read_pages(struct file *file, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { unsigned page_idx; + struct pagevec lru_pvec; + + pagevec_init(&lru_pvec); if (mapping->a_ops->readpages) return mapping->a_ops->readpages(mapping, pages, nr_pages); @@ -43,10 +47,15 @@ read_pages(struct file *file, struct address_space *mapping, for (page_idx = 0; page_idx < nr_pages; page_idx++) { struct page *page = list_entry(pages->prev, struct page, list); list_del(&page->list); - if (!add_to_page_cache(page, mapping, page->index)) + if (!add_to_page_cache(page, mapping, page->index)) { + if (!pagevec_add(&lru_pvec, page)) + __pagevec_lru_add(&lru_pvec); mapping->a_ops->readpage(file, page); - page_cache_release(page); + } else { + page_cache_release(page); + } } + pagevec_lru_add(&lru_pvec); return 0; } diff --git a/mm/shmem.c b/mm/shmem.c index abb0685049e2..e4f8340f7a5d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -668,7 +668,7 @@ repeat: page = page_cache_alloc(mapping); if (!page) goto no_mem; - error = add_to_page_cache(page, mapping, idx); + error = add_to_page_cache_lru(page, mapping, idx); if (error < 0) { page_cache_release(page); goto no_mem; diff --git a/mm/swap_state.c b/mm/swap_state.c index 332e0d26732d..a69e81415c2b 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -71,6 +71,9 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry) return -ENOENT; } error = add_to_page_cache(page, &swapper_space, entry.val); + /* + * Anon pages are already on the LRU, we don't run lru_cache_add here. 
+ */ if (error != 0) { swap_free(entry); if (error == -EEXIST) @@ -275,8 +278,7 @@ int move_from_swap_cache(struct page *page, unsigned long index, SetPageDirty(page); ___add_to_page_cache(page, mapping, index); /* fix that up */ - list_del(&page->list); - list_add(&page->list, &mapping->dirty_pages); + list_move(&page->list, &mapping->dirty_pages); write_unlock(&mapping->page_lock); write_unlock(&swapper_space.page_lock); -- cgit v1.2.3 From aaba9265318483297267400fbfce1c399b3ac018 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 14 Aug 2002 21:21:05 -0700 Subject: [PATCH] make pagemap_lru_lock irq-safe It is expensive for a CPU to take an interrupt while holding the page LRU lock, because other CPUs will pile up on the lock while the interrupt runs. Disabling interrupts while holding the lock reduces contention by an additional 30% on 4-way. This is when the only source of interrupts is disk completion. The improvement will be higher with more CPUs and it will be higher if there is networking happening. The maximum hold time of this lock is 17 microseconds on 500 MHz PIII, which is well inside the kernel's maximum interrupt latency (which was 100 usecs when I last looked, a year ago). This optimisation is not needed on uniprocessor, but the patch disables IRQs while holding pagemap_lru_lock anyway, so it becomes an irq-safe spinlock, and pages can be moved from the LRU in interrupt context. pagemap_lru_lock has been renamed to _pagemap_lru_lock to pick up any missed uses, and to reliably break any out-of-tree patches which may be using the old semantics. 
--- include/linux/swap.h | 2 +- mm/filemap.c | 2 +- mm/swap.c | 28 ++++++++++++++-------------- mm/vmscan.c | 28 ++++++++++++++-------------- 4 files changed, 30 insertions(+), 30 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 8dbd9d7e401d..e09e96170182 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -211,7 +211,7 @@ extern struct swap_list_t swap_list; asmlinkage long sys_swapoff(const char *); asmlinkage long sys_swapon(const char *, int); -extern spinlock_t pagemap_lru_lock; +extern spinlock_t _pagemap_lru_lock; extern void FASTCALL(mark_page_accessed(struct page *)); diff --git a/mm/filemap.c b/mm/filemap.c index b6ee6f656254..b8cc0c5d0d87 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -62,7 +62,7 @@ * ->inode_lock (__mark_inode_dirty) * ->sb_lock (fs/fs-writeback.c) */ -spinlock_t pagemap_lru_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; +spinlock_t _pagemap_lru_lock __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; /* * Remove a page from the page cache and free it. 
Caller has to make diff --git a/mm/swap.c b/mm/swap.c index d1e6925eed70..8f9e4b76f02a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -41,9 +41,9 @@ static inline void activate_page_nolock(struct page * page) */ void activate_page(struct page * page) { - spin_lock(&pagemap_lru_lock); + spin_lock_irq(&_pagemap_lru_lock); activate_page_nolock(page); - spin_unlock(&pagemap_lru_lock); + spin_unlock_irq(&_pagemap_lru_lock); } /** @@ -53,10 +53,10 @@ void activate_page(struct page * page) void lru_cache_add(struct page * page) { if (!PageLRU(page)) { - spin_lock(&pagemap_lru_lock); + spin_lock_irq(&_pagemap_lru_lock); if (!TestSetPageLRU(page)) add_page_to_inactive_list(page); - spin_unlock(&pagemap_lru_lock); + spin_unlock_irq(&_pagemap_lru_lock); } } @@ -83,9 +83,9 @@ void __lru_cache_del(struct page * page) */ void lru_cache_del(struct page * page) { - spin_lock(&pagemap_lru_lock); + spin_lock_irq(&_pagemap_lru_lock); __lru_cache_del(page); - spin_unlock(&pagemap_lru_lock); + spin_unlock_irq(&_pagemap_lru_lock); } /* @@ -116,7 +116,7 @@ void __pagevec_release(struct pagevec *pvec) continue; if (!lock_held) { - spin_lock(&pagemap_lru_lock); + spin_lock_irq(&_pagemap_lru_lock); lock_held = 1; } @@ -130,7 +130,7 @@ void __pagevec_release(struct pagevec *pvec) pagevec_add(&pages_to_free, page); } if (lock_held) - spin_unlock(&pagemap_lru_lock); + spin_unlock_irq(&_pagemap_lru_lock); pagevec_free(&pages_to_free); pagevec_init(pvec); @@ -175,14 +175,14 @@ void pagevec_deactivate_inactive(struct pagevec *pvec) if (!lock_held) { if (PageActive(page) || !PageLRU(page)) continue; - spin_lock(&pagemap_lru_lock); + spin_lock_irq(&_pagemap_lru_lock); lock_held = 1; } if (!PageActive(page) && PageLRU(page)) list_move(&page->lru, &inactive_list); } if (lock_held) - spin_unlock(&pagemap_lru_lock); + spin_unlock_irq(&_pagemap_lru_lock); __pagevec_release(pvec); } @@ -194,7 +194,7 @@ void __pagevec_lru_add(struct pagevec *pvec) { int i; - spin_lock(&pagemap_lru_lock); + 
spin_lock_irq(&_pagemap_lru_lock); for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; @@ -202,7 +202,7 @@ void __pagevec_lru_add(struct pagevec *pvec) BUG(); add_page_to_inactive_list(page); } - spin_unlock(&pagemap_lru_lock); + spin_unlock_irq(&_pagemap_lru_lock); pagevec_release(pvec); } @@ -214,7 +214,7 @@ void __pagevec_lru_del(struct pagevec *pvec) { int i; - spin_lock(&pagemap_lru_lock); + spin_lock_irq(&_pagemap_lru_lock); for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; @@ -225,7 +225,7 @@ void __pagevec_lru_del(struct pagevec *pvec) else del_page_from_inactive_list(page); } - spin_unlock(&pagemap_lru_lock); + spin_unlock_irq(&_pagemap_lru_lock); pagevec_release(pvec); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 45aad6a76ae4..5c8f56420d26 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -284,7 +284,7 @@ shrink_cache(int nr_pages, zone_t *classzone, pagevec_init(&pvec); - spin_lock(&pagemap_lru_lock); + spin_lock_irq(&_pagemap_lru_lock); while (max_scan > 0 && nr_pages > 0) { struct page *page; int n = 0; @@ -307,7 +307,7 @@ shrink_cache(int nr_pages, zone_t *classzone, page_cache_get(page); n++; } - spin_unlock(&pagemap_lru_lock); + spin_unlock_irq(&_pagemap_lru_lock); if (list_empty(&page_list)) goto done; @@ -321,7 +321,7 @@ shrink_cache(int nr_pages, zone_t *classzone, if (nr_pages <= 0 && list_empty(&page_list)) goto done; - spin_lock(&pagemap_lru_lock); + spin_lock_irq(&_pagemap_lru_lock); /* * Put back any unfreeable pages. 
*/ @@ -335,13 +335,13 @@ shrink_cache(int nr_pages, zone_t *classzone, else add_page_to_inactive_list(page); if (!pagevec_add(&pvec, page)) { - spin_unlock(&pagemap_lru_lock); + spin_unlock_irq(&_pagemap_lru_lock); __pagevec_release(&pvec); - spin_lock(&pagemap_lru_lock); + spin_lock_irq(&_pagemap_lru_lock); } } } - spin_unlock(&pagemap_lru_lock); + spin_unlock_irq(&_pagemap_lru_lock); done: pagevec_release(&pvec); return nr_pages; @@ -374,7 +374,7 @@ static /* inline */ void refill_inactive(const int nr_pages_in) struct page *page; struct pagevec pvec; - spin_lock(&pagemap_lru_lock); + spin_lock_irq(&_pagemap_lru_lock); while (nr_pages && !list_empty(&active_list)) { page = list_entry(active_list.prev, struct page, lru); prefetchw_prev_lru_page(page, &active_list, flags); @@ -384,7 +384,7 @@ static /* inline */ void refill_inactive(const int nr_pages_in) list_move(&page->lru, &l_hold); nr_pages--; } - spin_unlock(&pagemap_lru_lock); + spin_unlock_irq(&_pagemap_lru_lock); while (!list_empty(&l_hold)) { page = list_entry(l_hold.prev, struct page, lru); @@ -406,7 +406,7 @@ static /* inline */ void refill_inactive(const int nr_pages_in) } pagevec_init(&pvec); - spin_lock(&pagemap_lru_lock); + spin_lock_irq(&_pagemap_lru_lock); while (!list_empty(&l_inactive)) { page = list_entry(l_inactive.prev, struct page, lru); prefetchw_prev_lru_page(page, &l_inactive, flags); @@ -416,9 +416,9 @@ static /* inline */ void refill_inactive(const int nr_pages_in) BUG(); list_move(&page->lru, &inactive_list); if (!pagevec_add(&pvec, page)) { - spin_unlock(&pagemap_lru_lock); + spin_unlock_irq(&_pagemap_lru_lock); __pagevec_release(&pvec); - spin_lock(&pagemap_lru_lock); + spin_lock_irq(&_pagemap_lru_lock); } } while (!list_empty(&l_active)) { @@ -429,12 +429,12 @@ static /* inline */ void refill_inactive(const int nr_pages_in) BUG_ON(!PageActive(page)); list_move(&page->lru, &active_list); if (!pagevec_add(&pvec, page)) { - spin_unlock(&pagemap_lru_lock); + 
spin_unlock_irq(&_pagemap_lru_lock); __pagevec_release(&pvec); - spin_lock(&pagemap_lru_lock); + spin_lock_irq(&_pagemap_lru_lock); } } - spin_unlock(&pagemap_lru_lock); + spin_unlock_irq(&_pagemap_lru_lock); pagevec_release(&pvec); mod_page_state(nr_active, -pgdeactivate); -- cgit v1.2.3 From eed29d66442c0e6babcea33ab03f02cdf49e62af Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 14 Aug 2002 21:21:10 -0700 Subject: [PATCH] pagemap_lru_lock wrapup Some fallout from the pagemap_lru_lock changes: - lru_cache_del() is no longer used. Kill it. - page_cache_release() almost never actually frees pages. So inline page_cache_release() and move its rarely-called slow path into (the misnamed) mm/swap.c - update the locking comment in filemap.c. pagemap_lru_lock used to be one of the outermost locks in the VM locking hierarchy. Now, we never take any other locks while holding pagemap_lru_lock. So it doesn't have any relationship with anything. - put_page() now removes pages from the LRU on the final put. The lock is interrupt safe. --- include/linux/mm.h | 7 ++++++- include/linux/pagemap.h | 8 ++++++-- include/linux/swap.h | 2 -- kernel/ksyms.c | 2 +- mm/filemap.c | 1 - mm/page_alloc.c | 10 +--------- mm/swap.c | 31 ++++++++++++------------------- mm/vmscan.c | 6 ++++++ 8 files changed, 32 insertions(+), 35 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index df42d899e41f..4d7de397481f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -194,11 +194,16 @@ struct page { * routine so they can be sure the page doesn't go away from under them. 
*/ #define get_page(p) atomic_inc(&(p)->count) -#define put_page(p) __free_page(p) #define __put_page(p) atomic_dec(&(p)->count) #define put_page_testzero(p) atomic_dec_and_test(&(p)->count) #define page_count(p) atomic_read(&(p)->count) #define set_page_count(p,v) atomic_set(&(p)->count, v) +extern void FASTCALL(__page_cache_release(struct page *)); +#define put_page(p) \ + do { \ + if (put_page_testzero(p)) \ + __page_cache_release(p); \ + } while (0) /* * Multiple processes may "see" the same page. E.g. for untouched diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 69e214920908..ff10783f8632 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -23,14 +23,18 @@ #define PAGE_CACHE_ALIGN(addr) (((addr)+PAGE_CACHE_SIZE-1)&PAGE_CACHE_MASK) #define page_cache_get(x) get_page(x) -extern void FASTCALL(page_cache_release(struct page *)); + +static inline void page_cache_release(struct page *page) +{ + if (!PageReserved(page) && put_page_testzero(page)) + __page_cache_release(page); +} static inline struct page *page_cache_alloc(struct address_space *x) { return alloc_pages(x->gfp_mask, 0); } - typedef int filler_t(void *, struct page *); extern struct page * find_get_page(struct address_space *mapping, diff --git a/include/linux/swap.h b/include/linux/swap.h index e09e96170182..a7f1f96ff9f1 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -156,8 +156,6 @@ extern int FASTCALL(page_over_rsslimit(struct page *)); /* linux/mm/swap.c */ extern void FASTCALL(lru_cache_add(struct page *)); -extern void FASTCALL(__lru_cache_del(struct page *)); -extern void FASTCALL(lru_cache_del(struct page *)); extern void FASTCALL(activate_page(struct page *)); diff --git a/kernel/ksyms.c b/kernel/ksyms.c index 011b4983ce91..dff658338907 100644 --- a/kernel/ksyms.c +++ b/kernel/ksyms.c @@ -93,7 +93,7 @@ EXPORT_SYMBOL(__alloc_pages); EXPORT_SYMBOL(alloc_pages_node); EXPORT_SYMBOL(__get_free_pages); EXPORT_SYMBOL(get_zeroed_page); 
-EXPORT_SYMBOL(page_cache_release); +EXPORT_SYMBOL(__page_cache_release); EXPORT_SYMBOL(__free_pages); EXPORT_SYMBOL(free_pages); EXPORT_SYMBOL(num_physpages); diff --git a/mm/filemap.c b/mm/filemap.c index b8cc0c5d0d87..7efaf57a0b50 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -53,7 +53,6 @@ /* * Lock ordering: * - * pagemap_lru_lock * ->i_shared_lock (vmtruncate) * ->private_lock (__free_pte->__set_page_dirty_buffers) * ->swap_list_lock diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2f51b6ac9df5..c8de0bcaa9f4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -90,6 +90,7 @@ static void __free_pages_ok (struct page *page, unsigned int order) KERNEL_STAT_ADD(pgfree, 1<mapping != NULL); BUG_ON(PageLocked(page)); @@ -450,15 +451,6 @@ unsigned long get_zeroed_page(unsigned int gfp_mask) return 0; } -void page_cache_release(struct page *page) -{ - if (!PageReserved(page) && put_page_testzero(page)) { - if (PageLRU(page)) - lru_cache_del(page); - __free_pages_ok(page, 0); - } -} - void __pagevec_free(struct pagevec *pvec) { int i = pagevec_count(pvec); diff --git a/mm/swap.c b/mm/swap.c index 8f9e4b76f02a..38f35888c777 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -60,32 +60,25 @@ void lru_cache_add(struct page * page) } } -/** - * __lru_cache_del: remove a page from the page lists - * @page: the page to add - * - * This function is for when the caller already holds - * the pagemap_lru_lock. +/* + * This path almost never happens - pages are normally freed via pagevecs. 
*/ -void __lru_cache_del(struct page * page) +void __page_cache_release(struct page *page) { - if (TestClearPageLRU(page)) { + BUG_ON(page_count(page) != 0); + if (PageLRU(page)) { + unsigned long flags; + + spin_lock_irqsave(&_pagemap_lru_lock, flags); + if (!TestClearPageLRU(page)) + BUG(); if (PageActive(page)) del_page_from_active_list(page); else del_page_from_inactive_list(page); + spin_unlock_irqrestore(&_pagemap_lru_lock, flags); } -} - -/** - * lru_cache_del: remove a page from the page lists - * @page: the page to remove - */ -void lru_cache_del(struct page * page) -{ - spin_lock_irq(&_pagemap_lru_lock); - __lru_cache_del(page); - spin_unlock_irq(&_pagemap_lru_lock); + __free_page(page); } /* diff --git a/mm/vmscan.c b/mm/vmscan.c index 5c8f56420d26..d6afbf198b45 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -165,6 +165,12 @@ shrink_list(struct list_head *page_list, int nr_pages, zone_t *classzone, pte_chain_unlock(page); mapping = page->mapping; + /* + * FIXME: this is CPU-inefficient for shared mappings. + * try_to_unmap() will set the page dirty and ->vm_writeback + * will write it. So we're back to page-at-a-time writepage + * in LRU order. + */ if (PageDirty(page) && is_page_cache_freeable(page) && mapping && may_enter_fs) { int (*writeback)(struct page *, int *); -- cgit v1.2.3 From 44260240ce0d1e19e84138ac775811574a9e1326 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 14 Aug 2002 21:21:15 -0700 Subject: [PATCH] deferred and batched addition of pages to the LRU The remaining source of page-at-a-time activity against pagemap_lru_lock is the anonymous pagefault path, which cannot be changed to operate against multiple pages at a time. But what we can do is to batch up just its adding of pages to the LRU, via buffering and deferral. This patch is based on work from Bill Irwin. The patch changes lru_cache_add to put the pages into a per-CPU pagevec. They are added to the LRU 16-at-a-time. 
And in the page reclaim code, purge the local CPU's buffer before starting. This is mainly to decrease the chances of pages staying off the LRU for very long periods: if the machine is under memory pressure, CPUs will spill their pages onto the LRU promptly. A consequence of this change is that we can have up to 15*num_cpus pages which are not on the LRU. Which could have a slight effect on VM accuracy, but I find that doubtful. If the system is under memory pressure the pages will be added to the LRU promptly, and these pages are the most-recently-touched ones - the VM isn't very interested in them anyway. This optimisation could be made SMP-specific, but I felt it best to turn it on for UP as well for consistency and better testing coverage. --- include/linux/pagevec.h | 1 + mm/swap.c | 25 ++++++++++++++++++------- mm/vmscan.c | 2 ++ 3 files changed, 21 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 7d091aea7543..36017cf91bbf 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -19,6 +19,7 @@ void __pagevec_release_nonlru(struct pagevec *pvec); void __pagevec_free(struct pagevec *pvec); void __pagevec_lru_add(struct pagevec *pvec); void __pagevec_lru_del(struct pagevec *pvec); +void lru_add_drain(void); void pagevec_deactivate_inactive(struct pagevec *pvec); static inline void pagevec_init(struct pagevec *pvec) diff --git a/mm/swap.c b/mm/swap.c index 38f35888c777..f9f7d4adaeec 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -50,14 +50,25 @@ void activate_page(struct page * page) * lru_cache_add: add a page to the page lists * @page: the page to add */ -void lru_cache_add(struct page * page) +static struct pagevec lru_add_pvecs[NR_CPUS]; + +void lru_cache_add(struct page *page) { - if (!PageLRU(page)) { - spin_lock_irq(&_pagemap_lru_lock); - if (!TestSetPageLRU(page)) - add_page_to_inactive_list(page); - spin_unlock_irq(&_pagemap_lru_lock); - } + struct pagevec *pvec = 
&lru_add_pvecs[get_cpu()]; + + page_cache_get(page); + if (!pagevec_add(pvec, page)) + __pagevec_lru_add(pvec); + put_cpu(); +} + +void lru_add_drain(void) +{ + struct pagevec *pvec = &lru_add_pvecs[get_cpu()]; + + if (pagevec_count(pvec)) + __pagevec_lru_add(pvec); + put_cpu(); } /* diff --git a/mm/vmscan.c b/mm/vmscan.c index d6afbf198b45..53b337114308 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -290,6 +290,7 @@ shrink_cache(int nr_pages, zone_t *classzone, pagevec_init(&pvec); + lru_add_drain(); spin_lock_irq(&_pagemap_lru_lock); while (max_scan > 0 && nr_pages > 0) { struct page *page; @@ -380,6 +381,7 @@ static /* inline */ void refill_inactive(const int nr_pages_in) struct page *page; struct pagevec pvec; + lru_add_drain(); spin_lock_irq(&_pagemap_lru_lock); while (nr_pages && !list_empty(&active_list)) { page = list_entry(active_list.prev, struct page, lru); -- cgit v1.2.3 From fb9100d0a6a949aaaeaa1957021207ac6600b5ce Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 14 Aug 2002 21:29:56 -0700 Subject: [PATCH] Clean up the RPC socket slot allocation code [2/2] Patch by Chuck Lever. Remove the timeout logic from call_reserve. This improves the overall RPC call ordering, and ensures that soft tasks don't time out and give up before they have attempted to send their message down the socket. 
--- include/linux/sunrpc/xprt.h | 5 ++-- net/sunrpc/clnt.c | 61 ++++++++++++------------------------------ net/sunrpc/xprt.c | 65 ++++++++++++++------------------------------- 3 files changed, 39 insertions(+), 92 deletions(-) (limited to 'include') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index c9b93c6a7a27..2ce2c8223384 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -57,8 +57,7 @@ struct rpc_timeout { unsigned long to_current, /* current timeout */ to_initval, /* initial timeout */ to_maxval, /* max timeout */ - to_increment, /* if !exponential */ - to_resrvval; /* reserve timeout */ + to_increment; /* if !exponential */ short to_retries; /* max # of retries */ unsigned char to_exponential; }; @@ -173,7 +172,7 @@ void xprt_default_timeout(struct rpc_timeout *, int); void xprt_set_timeout(struct rpc_timeout *, unsigned int, unsigned long); -int xprt_reserve(struct rpc_task *); +void xprt_reserve(struct rpc_task *); void xprt_transmit(struct rpc_task *); void xprt_receive(struct rpc_task *); int xprt_adjust_timeout(struct rpc_timeout *); diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 76d9eaa767ed..048978395042 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -394,8 +394,6 @@ call_start(struct rpc_task *task) static void call_reserve(struct rpc_task *task) { - struct rpc_clnt *clnt = task->tk_client; - dprintk("RPC: %4d call_reserve\n", task->tk_pid); if (!rpcauth_uptodatecred(task)) { @@ -405,7 +403,6 @@ call_reserve(struct rpc_task *task) task->tk_status = 0; task->tk_action = call_reserveresult; - task->tk_timeout = clnt->cl_timeout.to_resrvval; xprt_reserve(task); } @@ -448,17 +445,10 @@ call_reserveresult(struct rpc_task *task) } switch (status) { - case -EAGAIN: - case -ENOBUFS: - task->tk_timeout = task->tk_client->cl_timeout.to_resrvval; + case -EAGAIN: /* woken up; retry */ task->tk_action = call_reserve; return; - case -ETIMEDOUT: - dprintk("RPC: timed out while reserving 
request slot\n"); - task->tk_action = call_timeout; - return; - case -EIO: - /* probably a shutdown */ + case -EIO: /* probably a shutdown */ break; default: printk(KERN_ERR "%s: unrecognized error %d, exiting\n", @@ -560,6 +550,9 @@ call_bind(struct rpc_task *task) struct rpc_clnt *clnt = task->tk_client; struct rpc_xprt *xprt = clnt->cl_xprt; + dprintk("RPC: %4d call_bind xprt %p %s connected\n", task->tk_pid, + xprt, (xprt_connected(xprt) ? "is" : "is not")); + task->tk_action = (xprt_connected(xprt)) ? call_transmit : call_reconnect; if (!clnt->cl_port) { @@ -696,20 +689,15 @@ static void call_timeout(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; - struct rpc_rqst *req = task->tk_rqstp; - - if (req) { - struct rpc_timeout *to = &req->rq_timeout; + struct rpc_timeout *to = &task->tk_rqstp->rq_timeout; - if (xprt_adjust_timeout(to)) { - dprintk("RPC: %4d call_timeout (minor timeo)\n", - task->tk_pid); - goto minor_timeout; - } - to->to_retries = clnt->cl_timeout.to_retries; + if (xprt_adjust_timeout(to)) { + dprintk("RPC: %4d call_timeout (minor)\n", task->tk_pid); + goto retry; } + to->to_retries = clnt->cl_timeout.to_retries; - dprintk("RPC: %4d call_timeout (major timeo)\n", task->tk_pid); + dprintk("RPC: %4d call_timeout (major)\n", task->tk_pid); if (clnt->cl_softrtry) { if (clnt->cl_chatty && !task->tk_exit) printk(KERN_NOTICE "%s: server %s not responding, timed out\n", @@ -717,33 +705,18 @@ call_timeout(struct rpc_task *task) rpc_exit(task, -EIO); return; } + if (clnt->cl_chatty && !(task->tk_flags & RPC_CALL_MAJORSEEN) && rpc_ntimeo(&clnt->cl_rtt) > 7) { task->tk_flags |= RPC_CALL_MAJORSEEN; - if (req) - printk(KERN_NOTICE "%s: server %s not responding, still trying\n", - clnt->cl_protname, clnt->cl_server); -#ifdef RPC_DEBUG - else - printk(KERN_NOTICE "%s: task %d can't get a request slot\n", - clnt->cl_protname, task->tk_pid); -#endif + printk(KERN_NOTICE "%s: server %s not responding, still trying\n", + clnt->cl_protname, 
clnt->cl_server); } if (clnt->cl_autobind) clnt->cl_port = 0; -minor_timeout: - if (!req) - task->tk_action = call_reserve; - else if (!clnt->cl_port) { - task->tk_action = call_bind; - clnt->cl_stats->rpcretrans++; - } else if (!xprt_connected(clnt->cl_xprt)) { - task->tk_action = call_reconnect; - clnt->cl_stats->rpcretrans++; - } else { - task->tk_action = call_transmit; - clnt->cl_stats->rpcretrans++; - } +retry: + clnt->cl_stats->rpcretrans++; + task->tk_action = call_bind; task->tk_status = 0; } diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 8f5b4eb9e259..5b05e9c9a6dc 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -84,7 +84,7 @@ */ static void xprt_request_init(struct rpc_task *, struct rpc_xprt *); static void do_xprt_transmit(struct rpc_task *); -static void xprt_reserve_status(struct rpc_task *task); +static inline void do_xprt_reserve(struct rpc_task *); static void xprt_disconnect(struct rpc_xprt *); static void xprt_reconn_status(struct rpc_task *task); static struct socket *xprt_create_socket(int, struct rpc_timeout *); @@ -1179,61 +1179,39 @@ do_xprt_transmit(struct rpc_task *task) /* * Reserve an RPC call slot. */ -int +void xprt_reserve(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; - /* We already have an initialized request. 
*/ - if (task->tk_rqstp) - return 0; - - spin_lock(&xprt->xprt_lock); - xprt_reserve_status(task); - if (task->tk_rqstp) { - task->tk_timeout = 0; - } else if (!task->tk_timeout) { - task->tk_status = -ENOBUFS; - } else { - dprintk("RPC: xprt_reserve waiting on backlog\n"); - task->tk_status = -EAGAIN; - rpc_sleep_on(&xprt->backlog, task, NULL, NULL); + task->tk_status = -EIO; + if (!xprt->shutdown) { + spin_lock(&xprt->xprt_lock); + do_xprt_reserve(task); + spin_unlock(&xprt->xprt_lock); } - spin_unlock(&xprt->xprt_lock); - dprintk("RPC: %4d xprt_reserve returns %d\n", - task->tk_pid, task->tk_status); - return task->tk_status; } -/* - * Reservation callback - */ -static void -xprt_reserve_status(struct rpc_task *task) +static inline void +do_xprt_reserve(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; - struct rpc_rqst *req; - if (xprt->shutdown) { - task->tk_status = -EIO; - } else if (task->tk_status < 0) { - /* NOP */ - } else if (task->tk_rqstp) { - /* We've already been given a request slot: NOP */ - } else { - if (!(req = xprt->free)) - goto out_nofree; - /* OK: There's room for us. 
Grab a free slot */ - xprt->free = req->rq_next; - req->rq_next = NULL; + task->tk_status = 0; + if (task->tk_rqstp) + return; + if (xprt->free) { + struct rpc_rqst *req = xprt->free; + xprt->free = req->rq_next; + req->rq_next = NULL; task->tk_rqstp = req; xprt_request_init(task, xprt); + return; } - - return; - -out_nofree: + dprintk("RPC: waiting for request slot\n"); task->tk_status = -EAGAIN; + task->tk_timeout = 0; + rpc_sleep_on(&xprt->backlog, task, NULL, NULL); } /* @@ -1249,7 +1227,6 @@ xprt_request_init(struct rpc_task *task, struct rpc_xprt *xprt) xid = CURRENT_TIME << 12; dprintk("RPC: %4d reserved req %p xid %08x\n", task->tk_pid, req, xid); - task->tk_status = 0; req->rq_timeout = xprt->timeout; req->rq_task = task; req->rq_xprt = xprt; @@ -1311,7 +1288,6 @@ xprt_set_timeout(struct rpc_timeout *to, unsigned int retr, unsigned long incr) to->to_initval = to->to_increment = incr; to->to_maxval = incr * retr; - to->to_resrvval = incr * retr; to->to_retries = retr; to->to_exponential = 0; } @@ -1352,7 +1328,6 @@ xprt_setup(struct socket *sock, int proto, if (to) { xprt->timeout = *to; xprt->timeout.to_current = to->to_initval; - xprt->timeout.to_resrvval = to->to_maxval << 1; } else xprt_default_timeout(&xprt->timeout, xprt->prot); -- cgit v1.2.3