From 74ba685ef4520523ddd4e7aeeb8a37f2533eed49 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Wed, 3 Sep 2003 01:07:47 +0100 Subject: [CPUFREQ] Move kernel/cpufreq.c to drivers/cpufreq/cpufreq.c Also remove $Id$ tag. No other code change. --- kernel/Makefile | 1 - kernel/cpufreq.c | 963 ------------------------------------------------------- 2 files changed, 964 deletions(-) delete mode 100644 kernel/cpufreq.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 9bf11ae7195b..638a2f6c341c 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -15,7 +15,6 @@ obj-$(CONFIG_UID16) += uid16.o obj-$(CONFIG_MODULES) += ksyms.o module.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_PM) += power/ -obj-$(CONFIG_CPU_FREQ) += cpufreq.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_IKCONFIG) += configs.o diff --git a/kernel/cpufreq.c b/kernel/cpufreq.c deleted file mode 100644 index 7f80c321c785..000000000000 --- a/kernel/cpufreq.c +++ /dev/null @@ -1,963 +0,0 @@ -/* - * linux/kernel/cpufreq.c - * - * Copyright (C) 2001 Russell King - * (C) 2002 - 2003 Dominik Brodowski - * - * $Id: cpufreq.c,v 1.59 2003/01/20 17:31:48 db Exp $ - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - * - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/** - * The "cpufreq driver" - the arch- or hardware-dependend low - * level driver of CPUFreq support, and its spinlock. This lock - * also protects the cpufreq_cpu_data array. - */ -static struct cpufreq_driver *cpufreq_driver; -static struct cpufreq_policy *cpufreq_cpu_data[NR_CPUS]; -static spinlock_t cpufreq_driver_lock = SPIN_LOCK_UNLOCKED; - -/* internal prototype */ -static int __cpufreq_governor(struct cpufreq_policy *policy, unsigned int event); - - -/** - * Two notifier lists: the "policy" list is involved in the - * validation process for a new CPU frequency policy; the - * "transition" list for kernel code that needs to handle - * changes to devices when the CPU clock speed changes. - * The mutex locks both lists. 
- */ -static struct notifier_block *cpufreq_policy_notifier_list; -static struct notifier_block *cpufreq_transition_notifier_list; -static DECLARE_RWSEM (cpufreq_notifier_rwsem); - - -static LIST_HEAD(cpufreq_governor_list); -static DECLARE_MUTEX (cpufreq_governor_sem); - -static struct cpufreq_policy * cpufreq_cpu_get(unsigned int cpu) -{ - struct cpufreq_policy *data; - unsigned long flags; - - if (cpu >= NR_CPUS) - goto err_out; - - /* get the cpufreq driver */ - spin_lock_irqsave(&cpufreq_driver_lock, flags); - - if (!cpufreq_driver) - goto err_out_unlock; - - if (!try_module_get(cpufreq_driver->owner)) - goto err_out_unlock; - - - /* get the CPU */ - data = cpufreq_cpu_data[cpu]; - - if (!data) - goto err_out_put_module; - - if (!kobject_get(&data->kobj)) - goto err_out_put_module; - - - spin_unlock_irqrestore(&cpufreq_driver_lock, flags); - - return data; - - err_out_put_module: - module_put(cpufreq_driver->owner); - err_out_unlock: - spin_unlock_irqrestore(&cpufreq_driver_lock, flags); - err_out: - return NULL; -} - -static void cpufreq_cpu_put(struct cpufreq_policy *data) -{ - kobject_put(&data->kobj); - module_put(cpufreq_driver->owner); -} - -/********************************************************************* - * SYSFS INTERFACE * - *********************************************************************/ - -/** - * cpufreq_parse_governor - parse a governor string - */ -int cpufreq_parse_governor (char *str_governor, unsigned int *policy, - struct cpufreq_governor **governor) -{ - if (!strnicmp(str_governor, "performance", CPUFREQ_NAME_LEN)) { - *policy = CPUFREQ_POLICY_PERFORMANCE; - return 0; - } else if (!strnicmp(str_governor, "powersave", CPUFREQ_NAME_LEN)) { - *policy = CPUFREQ_POLICY_POWERSAVE; - return 0; - } else { - struct cpufreq_governor *t; - down(&cpufreq_governor_sem); - if (!cpufreq_driver || !cpufreq_driver->target) - goto out; - list_for_each_entry(t, &cpufreq_governor_list, governor_list) { - if (!strnicmp(str_governor,t->name,CPUFREQ_NAME_LEN)) { - *governor = t; - *policy = CPUFREQ_POLICY_GOVERNOR; - up(&cpufreq_governor_sem); - return 0; - } - } - out: - up(&cpufreq_governor_sem); - } - return -EINVAL; -} -EXPORT_SYMBOL_GPL(cpufreq_parse_governor); - - -/* drivers/base/cpu.c */ -extern struct sysdev_class cpu_sysdev_class; - - -/** - * cpufreq_per_cpu_attr_read() / show_##file_name() - print out cpufreq information - * - * Write out information from cpufreq_driver->policy[cpu]; object must be - * "unsigned int". - */ - -#define show_one(file_name, object) \ -static ssize_t show_##file_name \ -(struct cpufreq_policy * policy, char *buf) \ -{ \ - return sprintf (buf, "%u\n", policy->object); \ -} - -show_one(cpuinfo_min_freq, cpuinfo.min_freq); -show_one(cpuinfo_max_freq, cpuinfo.max_freq); -show_one(scaling_min_freq, min); -show_one(scaling_max_freq, max); - -/** - * cpufreq_per_cpu_attr_write() / store_##file_name() - sysfs write access - */ -#define store_one(file_name, object) \ -static ssize_t store_##file_name \ -(struct cpufreq_policy * policy, const char *buf, size_t count) \ -{ \ - unsigned int ret = -EINVAL; \ - struct cpufreq_policy new_policy; \ - \ - ret = cpufreq_get_policy(&new_policy, policy->cpu); \ - if (ret) \ - return -EINVAL; \ - \ - ret = sscanf (buf, "%u", &new_policy.object); \ - if (ret != 1) \ - return -EINVAL; \ - \ - ret = cpufreq_set_policy(&new_policy); \ - \ - return ret ? 
ret : count; \ -} - -store_one(scaling_min_freq,min); -store_one(scaling_max_freq,max); - -/** - * show_scaling_governor - show the current policy for the specified CPU - */ -static ssize_t show_scaling_governor (struct cpufreq_policy * policy, char *buf) -{ - switch (policy->policy) { - case CPUFREQ_POLICY_POWERSAVE: - return sprintf(buf, "powersave\n"); - case CPUFREQ_POLICY_PERFORMANCE: - return sprintf(buf, "performance\n"); - case CPUFREQ_POLICY_GOVERNOR: - return snprintf(buf, CPUFREQ_NAME_LEN, "%s\n", policy->governor->name); - default: - return -EINVAL; - } -} - - -/** - * store_scaling_governor - store policy for the specified CPU - */ -static ssize_t store_scaling_governor (struct cpufreq_policy * policy, - const char *buf, size_t count) -{ - unsigned int ret = -EINVAL; - char str_governor[16]; - struct cpufreq_policy new_policy; - - ret = cpufreq_get_policy(&new_policy, policy->cpu); - if (ret) - return ret; - - ret = sscanf (buf, "%15s", str_governor); - if (ret != 1) - return -EINVAL; - - if (cpufreq_parse_governor(str_governor, &new_policy.policy, &new_policy.governor)) - return -EINVAL; - - ret = cpufreq_set_policy(&new_policy); - - return ret ? ret : count; -} - -/** - * show_scaling_driver - show the cpufreq driver currently loaded - */ -static ssize_t show_scaling_driver (struct cpufreq_policy * policy, char *buf) -{ - return snprintf(buf, CPUFREQ_NAME_LEN, "%s\n", cpufreq_driver->name); -} - -/** - * show_scaling_available_governors - show the available CPUfreq governors - */ -static ssize_t show_scaling_available_governors (struct cpufreq_policy * policy, - char *buf) -{ - ssize_t i = 0; - struct cpufreq_governor *t; - - i += sprintf(buf, "performance powersave"); - - if (!cpufreq_driver->target) - goto out; - - list_for_each_entry(t, &cpufreq_governor_list, governor_list) { - if (i >= (ssize_t) ((PAGE_SIZE / sizeof(char)) - (CPUFREQ_NAME_LEN + 2))) - goto out; - i += snprintf(&buf[i], CPUFREQ_NAME_LEN, " %s", t->name); - } - out: - i += sprintf(&buf[i], "\n"); - return i; -} - - -#define define_one_ro(_name) \ -struct freq_attr _name = { \ - .attr = { .name = __stringify(_name), .mode = 0444 }, \ - .show = show_##_name, \ -} - -#define define_one_rw(_name) \ -struct freq_attr _name = { \ - .attr = { .name = __stringify(_name), .mode = 0644 }, \ - .show = show_##_name, \ - .store = store_##_name, \ -} - -define_one_ro(cpuinfo_min_freq); -define_one_ro(cpuinfo_max_freq); -define_one_ro(scaling_available_governors); -define_one_ro(scaling_driver); -define_one_rw(scaling_min_freq); -define_one_rw(scaling_max_freq); -define_one_rw(scaling_governor); - -static struct attribute * default_attrs[] = { - &cpuinfo_min_freq.attr, - &cpuinfo_max_freq.attr, - &scaling_min_freq.attr, - &scaling_max_freq.attr, - &scaling_governor.attr, - &scaling_driver.attr, - &scaling_available_governors.attr, - NULL -}; - -#define to_policy(k) container_of(k,struct cpufreq_policy,kobj) -#define to_attr(a) container_of(a,struct freq_attr,attr) - -static ssize_t show(struct kobject * kobj, struct attribute * attr ,char * buf) -{ - struct cpufreq_policy * policy = to_policy(kobj); - struct freq_attr * fattr = to_attr(attr); - ssize_t ret; - policy = cpufreq_cpu_get(policy->cpu); - if (!policy) - return -EINVAL; - ret = fattr->show ? 
fattr->show(policy,buf) : 0; - cpufreq_cpu_put(policy); - return ret; -} - -static ssize_t store(struct kobject * kobj, struct attribute * attr, - const char * buf, size_t count) -{ - struct cpufreq_policy * policy = to_policy(kobj); - struct freq_attr * fattr = to_attr(attr); - ssize_t ret; - policy = cpufreq_cpu_get(policy->cpu); - if (!policy) - return -EINVAL; - ret = fattr->store ? fattr->store(policy,buf,count) : 0; - cpufreq_cpu_put(policy); - return ret; -} - -static void cpufreq_sysfs_release(struct kobject * kobj) -{ - struct cpufreq_policy * policy = to_policy(kobj); - complete(&policy->kobj_unregister); -} - -static struct sysfs_ops sysfs_ops = { - .show = show, - .store = store, -}; - -static struct kobj_type ktype_cpufreq = { - .sysfs_ops = &sysfs_ops, - .default_attrs = default_attrs, - .release = cpufreq_sysfs_release, -}; - - -/** - * cpufreq_add_dev - add a CPU device - * - * Adds the cpufreq interface for a CPU device. - */ -static int cpufreq_add_dev (struct sys_device * sys_dev) -{ - unsigned int cpu = sys_dev->id; - int ret = 0; - struct cpufreq_policy new_policy; - struct cpufreq_policy *policy; - struct freq_attr **drv_attr; - unsigned long flags; - - if (!try_module_get(cpufreq_driver->owner)) - return -EINVAL; - - policy = kmalloc(sizeof(struct cpufreq_policy), GFP_KERNEL); - if (!policy) - return -ENOMEM; - memset(policy, 0, sizeof(struct cpufreq_policy)); - - policy->cpu = cpu; - init_MUTEX_LOCKED(&policy->lock); - init_completion(&policy->kobj_unregister); - - /* call driver. From then on the cpufreq must be able - * to accept all calls to ->verify and ->setpolicy for this CPU - */ - ret = cpufreq_driver->init(policy); - if (ret) - goto err_out; - - memcpy(&new_policy, policy, sizeof(struct cpufreq_policy)); - - /* prepare interface data */ - policy->kobj.parent = &sys_dev->kobj; - policy->kobj.ktype = &ktype_cpufreq; - strlcpy(policy->kobj.name, "cpufreq", KOBJ_NAME_LEN); - - ret = kobject_register(&policy->kobj); - if (ret) - goto err_out; - - /* set up files for this cpu device */ - drv_attr = cpufreq_driver->attr; - while ((drv_attr) && (*drv_attr)) { - sysfs_create_file(&policy->kobj, &((*drv_attr)->attr)); - drv_attr++; - } - - spin_lock_irqsave(&cpufreq_driver_lock, flags); - cpufreq_cpu_data[cpu] = policy; - spin_unlock_irqrestore(&cpufreq_driver_lock, flags); - - up(&policy->lock); - - /* set default policy */ - ret = cpufreq_set_policy(&new_policy); - if (ret) - goto err_out_unregister; - - module_put(cpufreq_driver->owner); - return 0; - - - err_out_unregister: - spin_lock_irqsave(&cpufreq_driver_lock, flags); - cpufreq_cpu_data[cpu] = NULL; - spin_unlock_irqrestore(&cpufreq_driver_lock, flags); - - kobject_unregister(&policy->kobj); - wait_for_completion(&policy->kobj_unregister); - - err_out: - kfree(policy); - module_put(cpufreq_driver->owner); - return ret; -} - - -/** - * cpufreq_remove_dev - remove a CPU device - * - * Removes the cpufreq interface for a CPU device. 
- */ -static int cpufreq_remove_dev (struct sys_device * sys_dev) -{ - unsigned int cpu = sys_dev->id; - unsigned long flags; - struct cpufreq_policy *data; - - spin_lock_irqsave(&cpufreq_driver_lock, flags); - data = cpufreq_cpu_data[cpu]; - - if (!data) { - spin_unlock_irqrestore(&cpufreq_driver_lock, flags); - return -EINVAL; - } - cpufreq_cpu_data[cpu] = NULL; - spin_unlock_irqrestore(&cpufreq_driver_lock, flags); - - if (!kobject_get(&data->kobj)) - return -EFAULT; - - kobject_unregister(&data->kobj); - - kobject_put(&data->kobj); - - /* we need to make sure that the underlying kobj is actually - * not referenced anymore by anybody before we proceed with - * unloading. - */ - wait_for_completion(&data->kobj_unregister); - - if (cpufreq_driver->target) - __cpufreq_governor(data, CPUFREQ_GOV_STOP); - - if (cpufreq_driver->exit) - cpufreq_driver->exit(data); - - kfree(data); - - return 0; -} - -/** - * cpufreq_resume - restore the CPU clock frequency after resume - * - * Restore the CPU clock frequency so that our idea of the current - * frequency reflects the actual hardware. - */ -static int cpufreq_resume(struct sys_device * sysdev) -{ - int cpu = sysdev->id; - unsigned int ret = 0; - struct cpufreq_policy *cpu_policy; - - if (!cpu_online(cpu)) - return 0; - - /* we may be lax here as interrupts are off. Nonetheless - * we need to grab the correct cpu policy, as to check - * whether we really run on this CPU. - */ - - cpu_policy = cpufreq_cpu_get(cpu); - if (!cpu_policy) - return -EINVAL; - - if (cpufreq_driver->setpolicy) - ret = cpufreq_driver->setpolicy(cpu_policy); - else - /* CPUFREQ_RELATION_H or CPUFREQ_RELATION_L have the same effect here, as cpu_policy->cur is known - * to be a valid and exact target frequency - */ - ret = cpufreq_driver->target(cpu_policy, cpu_policy->cur, CPUFREQ_RELATION_H); - - cpufreq_cpu_put(cpu_policy); - - return ret; -} - -static struct sysdev_driver cpufreq_sysdev_driver = { - .add = cpufreq_add_dev, - .remove = cpufreq_remove_dev, - .resume = cpufreq_resume, -}; - - -/********************************************************************* - * NOTIFIER LISTS INTERFACE * - *********************************************************************/ - -/** - * cpufreq_register_notifier - register a driver with cpufreq - * @nb: notifier function to register - * @list: CPUFREQ_TRANSITION_NOTIFIER or CPUFREQ_POLICY_NOTIFIER - * - * Add a driver to one of two lists: either a list of drivers that - * are notified about clock rate changes (once before and once after - * the transition), or a list of drivers that are notified about - * changes in cpufreq policy. - * - * This function may sleep, and has the same return conditions as - * notifier_chain_register. - */ -int cpufreq_register_notifier(struct notifier_block *nb, unsigned int list) -{ - int ret; - - down_write(&cpufreq_notifier_rwsem); - switch (list) { - case CPUFREQ_TRANSITION_NOTIFIER: - ret = notifier_chain_register(&cpufreq_transition_notifier_list, nb); - break; - case CPUFREQ_POLICY_NOTIFIER: - ret = notifier_chain_register(&cpufreq_policy_notifier_list, nb); - break; - default: - ret = -EINVAL; - } - up_write(&cpufreq_notifier_rwsem); - - return ret; -} -EXPORT_SYMBOL(cpufreq_register_notifier); - - -/** - * cpufreq_unregister_notifier - unregister a driver with cpufreq - * @nb: notifier block to be unregistered - * @list: CPUFREQ_TRANSITION_NOTIFIER or CPUFREQ_POLICY_NOTIFIER - * - * Remove a driver from the CPU frequency notifier list. 
- * - * This function may sleep, and has the same return conditions as - * notifier_chain_unregister. - */ -int cpufreq_unregister_notifier(struct notifier_block *nb, unsigned int list) -{ - int ret; - - down_write(&cpufreq_notifier_rwsem); - switch (list) { - case CPUFREQ_TRANSITION_NOTIFIER: - ret = notifier_chain_unregister(&cpufreq_transition_notifier_list, nb); - break; - case CPUFREQ_POLICY_NOTIFIER: - ret = notifier_chain_unregister(&cpufreq_policy_notifier_list, nb); - break; - default: - ret = -EINVAL; - } - up_write(&cpufreq_notifier_rwsem); - - return ret; -} -EXPORT_SYMBOL(cpufreq_unregister_notifier); - - -/********************************************************************* - * GOVERNORS * - *********************************************************************/ - - -int __cpufreq_driver_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - return cpufreq_driver->target(policy, target_freq, relation); -} -EXPORT_SYMBOL_GPL(__cpufreq_driver_target); - - -int cpufreq_driver_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation) -{ - unsigned int ret; - - policy = cpufreq_cpu_get(policy->cpu); - if (!policy) - return -EINVAL; - - down(&policy->lock); - - ret = __cpufreq_driver_target(policy, target_freq, relation); - - up(&policy->lock); - - cpufreq_cpu_put(policy); - - return ret; -} -EXPORT_SYMBOL_GPL(cpufreq_driver_target); - - -static int __cpufreq_governor(struct cpufreq_policy *policy, unsigned int event) -{ - int ret = 0; - - switch (policy->policy) { - case CPUFREQ_POLICY_POWERSAVE: - if ((event == CPUFREQ_GOV_LIMITS) || (event == CPUFREQ_GOV_START)) { - ret = __cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L); - } - break; - case CPUFREQ_POLICY_PERFORMANCE: - if ((event == CPUFREQ_GOV_LIMITS) || (event == CPUFREQ_GOV_START)) { - ret = __cpufreq_driver_target(policy, policy->max, CPUFREQ_RELATION_H); - } - break; - case CPUFREQ_POLICY_GOVERNOR: - ret = -EINVAL; - if (!try_module_get(policy->governor->owner)) - break; - ret = policy->governor->governor(policy, event); - /* we keep one module reference alive for each CPU governed by this CPU */ - if ((event != CPUFREQ_GOV_START) || ret) - module_put(policy->governor->owner); - if ((event == CPUFREQ_GOV_STOP) && !ret) - module_put(policy->governor->owner); - break; - default: - ret = -EINVAL; - } - - return ret; -} - - -int cpufreq_governor(unsigned int cpu, unsigned int event) -{ - int ret = 0; - struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); - - if (!policy) - return -EINVAL; - - down(&policy->lock); - ret = __cpufreq_governor(policy, event); - up(&policy->lock); - - cpufreq_cpu_put(policy); - - return ret; -} -EXPORT_SYMBOL_GPL(cpufreq_governor); - - -int cpufreq_register_governor(struct cpufreq_governor *governor) -{ - struct cpufreq_governor *t; - - if (!governor) - return -EINVAL; - - if (!strnicmp(governor->name,"powersave",CPUFREQ_NAME_LEN)) - return -EBUSY; - if (!strnicmp(governor->name,"performance",CPUFREQ_NAME_LEN)) - return -EBUSY; - - down(&cpufreq_governor_sem); - - list_for_each_entry(t, &cpufreq_governor_list, governor_list) { - if (!strnicmp(governor->name,t->name,CPUFREQ_NAME_LEN)) { - up(&cpufreq_governor_sem); - return -EBUSY; - } - } - list_add(&governor->governor_list, &cpufreq_governor_list); - - up(&cpufreq_governor_sem); - - return 0; -} -EXPORT_SYMBOL_GPL(cpufreq_register_governor); - - -void cpufreq_unregister_governor(struct cpufreq_governor *governor) -{ - if (!governor) - return; - - 
down(&cpufreq_governor_sem); - list_del(&governor->governor_list); - up(&cpufreq_governor_sem); - return; -} -EXPORT_SYMBOL_GPL(cpufreq_unregister_governor); - - - -/********************************************************************* - * POLICY INTERFACE * - *********************************************************************/ - -/** - * cpufreq_get_policy - get the current cpufreq_policy - * @policy: struct cpufreq_policy into which the current cpufreq_policy is written - * - * Reads the current cpufreq policy. - */ -int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu) -{ - struct cpufreq_policy *cpu_policy; - if (!policy) - return -EINVAL; - - cpu_policy = cpufreq_cpu_get(cpu); - if (!cpu_policy) - return -EINVAL; - - down(&cpu_policy->lock); - memcpy(policy, cpu_policy, sizeof(struct cpufreq_policy)); - up(&cpu_policy->lock); - - cpufreq_cpu_put(cpu_policy); - - return 0; -} -EXPORT_SYMBOL(cpufreq_get_policy); - - -/** - * cpufreq_set_policy - set a new CPUFreq policy - * @policy: policy to be set. - * - * Sets a new CPU frequency and voltage scaling policy. - */ -int cpufreq_set_policy(struct cpufreq_policy *policy) -{ - int ret = 0; - struct cpufreq_policy *data; - - if (!policy) - return -EINVAL; - - data = cpufreq_cpu_get(policy->cpu); - if (!data) - return -EINVAL; - - /* lock this CPU */ - down(&data->lock); - - memcpy(&policy->cpuinfo, - &data->cpuinfo, - sizeof(struct cpufreq_cpuinfo)); - - /* verify the cpu speed can be set within this limit */ - ret = cpufreq_driver->verify(policy); - if (ret) - goto error_out; - - down_read(&cpufreq_notifier_rwsem); - - /* adjust if necessary - all reasons */ - notifier_call_chain(&cpufreq_policy_notifier_list, CPUFREQ_ADJUST, - policy); - - /* adjust if necessary - hardware incompatibility*/ - notifier_call_chain(&cpufreq_policy_notifier_list, CPUFREQ_INCOMPATIBLE, - policy); - - /* verify the cpu speed can be set within this limit, - which might be different to the first one */ - ret = cpufreq_driver->verify(policy); - if (ret) { - up_read(&cpufreq_notifier_rwsem); - goto error_out; - } - - /* notification of the new policy */ - notifier_call_chain(&cpufreq_policy_notifier_list, CPUFREQ_NOTIFY, - policy); - - up_read(&cpufreq_notifier_rwsem); - - data->min = policy->min; - data->max = policy->max; - - if (cpufreq_driver->setpolicy) { - data->policy = policy->policy; - ret = cpufreq_driver->setpolicy(policy); - } else { - if ((policy->policy != data->policy) || - ((policy->policy == CPUFREQ_POLICY_GOVERNOR) && (policy->governor != data->governor))) { - /* save old, working values */ - unsigned int old_pol = data->policy; - struct cpufreq_governor *old_gov = data->governor; - - /* end old governor */ - __cpufreq_governor(data, CPUFREQ_GOV_STOP); - - /* start new governor */ - data->policy = policy->policy; - data->governor = policy->governor; - if (__cpufreq_governor(data, CPUFREQ_GOV_START)) { - /* new governor failed, so re-start old one */ - data->policy = old_pol; - data->governor = old_gov; - __cpufreq_governor(data, CPUFREQ_GOV_START); - } - /* might be a policy change, too, so fall through */ - } - __cpufreq_governor(data, CPUFREQ_GOV_LIMITS); - } - - error_out: - up(&data->lock); - cpufreq_cpu_put(data); - - return ret; -} -EXPORT_SYMBOL(cpufreq_set_policy); - - - -/********************************************************************* - * EXTERNALLY AFFECTING FREQUENCY CHANGES * - *********************************************************************/ - -/** - * adjust_jiffies - adjust the system "loops_per_jiffy" 
- * - * This function alters the system "loops_per_jiffy" for the clock - * speed change. Note that loops_per_jiffy cannot be updated on SMP - * systems as each CPU might be scaled differently. So, use the arch - * per-CPU loops_per_jiffy value wherever possible. - */ -#ifndef CONFIG_SMP -static unsigned long l_p_j_ref; -static unsigned int l_p_j_ref_freq; - -static inline void adjust_jiffies(unsigned long val, struct cpufreq_freqs *ci) -{ - if (!l_p_j_ref_freq) { - l_p_j_ref = loops_per_jiffy; - l_p_j_ref_freq = ci->old; - } - if ((val == CPUFREQ_PRECHANGE && ci->old < ci->new) || - (val == CPUFREQ_POSTCHANGE && ci->old > ci->new)) - loops_per_jiffy = cpufreq_scale(l_p_j_ref, l_p_j_ref_freq, ci->new); -} -#else -#define adjust_jiffies(x...) do {} while (0) -#endif - - -/** - * cpufreq_notify_transition - call notifier chain and adjust_jiffies on frequency transition - * - * This function calls the transition notifiers and the "adjust_jiffies" function. It is called - * twice on all CPU frequency changes that have external effects. - */ -void cpufreq_notify_transition(struct cpufreq_freqs *freqs, unsigned int state) -{ - if (irqs_disabled()) - return; /* Only valid if we're in the resume process where - * everyone knows what CPU frequency we are at */ - - down_read(&cpufreq_notifier_rwsem); - switch (state) { - case CPUFREQ_PRECHANGE: - notifier_call_chain(&cpufreq_transition_notifier_list, CPUFREQ_PRECHANGE, freqs); - adjust_jiffies(CPUFREQ_PRECHANGE, freqs); - break; - case CPUFREQ_POSTCHANGE: - adjust_jiffies(CPUFREQ_POSTCHANGE, freqs); - notifier_call_chain(&cpufreq_transition_notifier_list, CPUFREQ_POSTCHANGE, freqs); - cpufreq_cpu_data[freqs->cpu]->cur = freqs->new; - break; - } - up_read(&cpufreq_notifier_rwsem); -} -EXPORT_SYMBOL_GPL(cpufreq_notify_transition); - - - -/********************************************************************* - * REGISTER / UNREGISTER CPUFREQ DRIVER * - *********************************************************************/ - -/** - * cpufreq_register_driver - register a CPU Frequency driver - * @driver_data: A struct cpufreq_driver containing the values# - * submitted by the CPU Frequency driver. - * - * Registers a CPU Frequency driver to this core code. This code - * returns zero on success, -EBUSY when another driver got here first - * (and isn't unregistered in the meantime). - * - */ -int cpufreq_register_driver(struct cpufreq_driver *driver_data) -{ - unsigned long flags; - - if (!driver_data || !driver_data->verify || !driver_data->init || - ((!driver_data->setpolicy) && (!driver_data->target))) - return -EINVAL; - - spin_lock_irqsave(&cpufreq_driver_lock, flags); - if (cpufreq_driver) { - spin_unlock_irqrestore(&cpufreq_driver_lock, flags); - return -EBUSY; - } - cpufreq_driver = driver_data; - spin_unlock_irqrestore(&cpufreq_driver_lock, flags); - - return sysdev_driver_register(&cpu_sysdev_class,&cpufreq_sysdev_driver); -} -EXPORT_SYMBOL_GPL(cpufreq_register_driver); - - -/** - * cpufreq_unregister_driver - unregister the current CPUFreq driver - * - * Unregister the current CPUFreq driver. Only call this if you have - * the right to do so, i.e. if you have succeeded in initialising before! - * Returns zero if successful, and -EINVAL if the cpufreq_driver is - * currently not initialised. 
- */
-int cpufreq_unregister_driver(struct cpufreq_driver *driver)
-{
-	unsigned long flags;
-
-	if (!cpufreq_driver || (driver != cpufreq_driver))
-		return -EINVAL;
-
-	sysdev_driver_unregister(&cpu_sysdev_class, &cpufreq_sysdev_driver);
-
-	spin_lock_irqsave(&cpufreq_driver_lock, flags);
-	cpufreq_driver = NULL;
-	spin_unlock_irqrestore(&cpufreq_driver_lock, flags);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(cpufreq_unregister_driver);
-- cgit v1.2.3


From 968f11a8688e1be78719154d05bcab061bbfde2b Mon Sep 17 00:00:00 2001
From: Jamie Lokier
Date: Thu, 4 Sep 2003 18:00:45 -0700
Subject: [PATCH] Unpinned futexes v2: indexing changes

This changes the way futexes are indexed, so that they don't pin pages.
It also fixes some bugs with private mappings and COW pages.

Currently, all futexes look up the page at the userspace address and pin
it, using the pair (page,offset) as an index into a table of waiting
futexes.  Any page with a futex waiting on it remains pinned in RAM,
which is a problem when many futexes are used, especially with FUTEX_FD.

Another problem is that the page is not always the correct one, if it can
be changed later by a COW (copy on write) operation.  This can happen
when waiting on a futex without writing to it after fork(), exec() or
mmap(), if the page is then written to before attempting to wake a futex
at the same address.

There are two symptoms of the COW problem:

 - The wrong process can receive wakeups
 - A process can fail to receive required wakeups.

This patch fixes both by changing the indexing so that VM_SHARED mappings
use the triple (inode,offset,index), and private mappings use the pair
(mm,virtual_address).

The former correctly handles all shared mappings, including tmpfs and
therefore all kinds of shared memory (IPC shm, /dev/shm and
MAP_ANON|MAP_SHARED).  This works because every mapping which is
VM_SHARED has an associated non-zero vma->vm_file, and hence inode.
(This is ensured in do_mmap_pgoff, where it calls shmem_zero_setup).

The latter handles all private mappings, both files and anonymous.  It
isn't affected by COW, because it doesn't care about the actual pages,
just the virtual address.

The patch has a few bonuses:

1. It removes the vcache implementation, as only futexes were using it,
   and they don't any more.

2. Removing the vcache should make COW page faults a bit faster.

3. Futex operations no longer take the page table lock, walk the page
   table, fault in pages that aren't mapped in the page table, or do a
   vcache hash lookup - they are mostly a simple offset calculation with
   one hash for the futex table.  So they should be noticeably faster.

Special thanks to Hugh Dickins, Andrew Morton and Rusty Russell for
insightful feedback.  All suggestions are included.
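
[Editor's note: the sketch below illustrates the indexing scheme described
above; it is not code from this patch. The real types and the
get_futex_key() logic are in the diff that follows, and the field layout,
hash function, and all names here are simplified assumptions.]

    #include <stdio.h>

    #define PAGE_SIZE      4096UL
    #define FUTEX_HASHBITS 8

    /* A futex is identified either by (inode, page index) for shared
     * mappings or by (mm, aligned virtual address) for private ones,
     * plus the offset within the page; no page is ever pinned. */
    union futex_key {
            struct { unsigned long pgoff; void *inode; int offset; } shared;
            struct { unsigned long uaddr; void *mm;    int offset; } private;
            struct { unsigned long word;  void *ptr;   int offset; } both;
    };

    /* Two keys denote the same futex iff all three words are equal. */
    static int match_futex(union futex_key *a, union futex_key *b)
    {
            return a->both.word == b->both.word &&
                   a->both.ptr == b->both.ptr &&
                   a->both.offset == b->both.offset;
    }

    /* Pick a hash bucket from the key words alone (the kernel uses
     * hash_long(); a simple mask stands in for it here). */
    static unsigned long hash_futex(union futex_key *key)
    {
            unsigned long h = key->both.word +
                              (unsigned long) key->both.ptr +
                              key->both.offset;
            return h & ((1UL << FUTEX_HASHBITS) - 1);
    }

    int main(void)
    {
            int fake_mm;                       /* stands in for current->mm */
            unsigned long uaddr = 0x8004a04c;  /* hypothetical futex address */
            union futex_key key;

            /* Build a private-mapping key the way get_futex_key() would. */
            key.private.offset = (int) (uaddr % PAGE_SIZE);
            key.private.uaddr = uaddr - key.private.offset;
            key.private.mm = &fake_mm;

            printf("bucket %lu, matches itself: %d\n",
                   hash_futex(&key), match_futex(&key, &key));
            return 0;
    }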
--- include/linux/mm.h | 1 + include/linux/vcache.h | 26 --- kernel/futex.c | 427 ++++++++++++++++++++++++++----------------------- mm/Makefile | 2 +- mm/fremap.c | 31 +++- mm/memory.c | 2 - mm/vcache.c | 90 ----------- 7 files changed, 249 insertions(+), 330 deletions(-) delete mode 100644 include/linux/vcache.h delete mode 100644 mm/vcache.c (limited to 'kernel') diff --git a/include/linux/mm.h b/include/linux/mm.h index 445fc58751c7..9f9743146b54 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -110,6 +110,7 @@ struct vm_area_struct { #define VM_RESERVED 0x00080000 /* Don't unmap it from swap_out */ #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ +#define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS diff --git a/include/linux/vcache.h b/include/linux/vcache.h deleted file mode 100644 index 5708fe6a908a..000000000000 --- a/include/linux/vcache.h +++ /dev/null @@ -1,26 +0,0 @@ -/* - * virtual => physical mapping cache support. - */ -#ifndef _LINUX_VCACHE_H -#define _LINUX_VCACHE_H - -typedef struct vcache_s { - unsigned long address; - struct mm_struct *mm; - struct list_head hash_entry; - void (*callback)(struct vcache_s *data, struct page *new_page); -} vcache_t; - -extern spinlock_t vcache_lock; - -extern void __attach_vcache(vcache_t *vcache, - unsigned long address, - struct mm_struct *mm, - void (*callback)(struct vcache_s *data, struct page *new_page)); - -extern void __detach_vcache(vcache_t *vcache); - -extern void invalidate_vcache(unsigned long address, struct mm_struct *mm, - struct page *new_page); - -#endif diff --git a/kernel/futex.c b/kernel/futex.c index 4557addfc6d6..a4feceee661a 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -5,6 +5,9 @@ * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar * (C) Copyright 2003 Red Hat Inc, All Rights Reserved * + * Removed page pinning, fix privately mapped COW pages and other cleanups + * (C) Copyright 2003 Jamie Lokier + * * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly * enough at me, Linus for the original (flawed) idea, Matthew * Kirkwood for proof-of-concept implementation. @@ -33,11 +36,31 @@ #include #include #include -#include #include +#include #define FUTEX_HASHBITS 8 +/* + * Futexes are matched on equal values of this key. + * The key type depends on whether it's a shared or private mapping. + */ +union futex_key { + struct { + unsigned long pgoff; + struct inode *inode; + } shared; + struct { + unsigned long uaddr; + struct mm_struct *mm; + } private; + struct { + unsigned long word; + void *ptr; + } both; + int offset; +}; + /* * We use this hashed waitqueue instead of a normal wait_queue_t, so * we can wake only the relevant ones (hashed queues may be shared): @@ -46,12 +69,8 @@ struct futex_q { struct list_head list; wait_queue_head_t waiters; - /* Page struct and offset within it. */ - struct page *page; - int offset; - - /* the virtual => physical COW-safe cache */ - vcache_t vcache; + /* Key which the futex is hashed on. */ + union futex_key key; /* For fd, sigio sent using these. 
*/ int fd; @@ -66,111 +85,149 @@ static spinlock_t futex_lock = SPIN_LOCK_UNLOCKED; static struct vfsmount *futex_mnt; /* - * These are all locks that are necessery to look up a physical - * mapping safely, and modify/search the futex hash, atomically: + * We hash on the keys returned from get_futex_key (see below). */ -static inline void lock_futex_mm(void) +static inline struct list_head *hash_futex(union futex_key *key) { - spin_lock(¤t->mm->page_table_lock); - spin_lock(&vcache_lock); - spin_lock(&futex_lock); -} - -static inline void unlock_futex_mm(void) -{ - spin_unlock(&futex_lock); - spin_unlock(&vcache_lock); - spin_unlock(¤t->mm->page_table_lock); + return &futex_queues[hash_long(key->both.word + + (unsigned long) key->both.ptr + + key->offset, FUTEX_HASHBITS)]; } /* - * The physical page is shared, so we can hash on its address: + * Return 1 if two futex_keys are equal, 0 otherwise. */ -static inline struct list_head *hash_futex(struct page *page, int offset) +static inline int match_futex(union futex_key *key1, union futex_key *key2) { - return &futex_queues[hash_long((unsigned long)page + offset, - FUTEX_HASHBITS)]; + return (key1->both.word == key2->both.word + && key1->both.ptr == key2->both.ptr + && key1->offset == key2->offset); } /* - * Get kernel address of the user page and pin it. + * Get parameters which are the keys for a futex. + * + * For shared mappings, it's (page->index, vma->vm_file->f_dentry->d_inode, + * offset_within_page). For private mappings, it's (uaddr, current->mm). + * We can usually work out the index without swapping in the page. * - * Must be called with (and returns with) all futex-MM locks held. + * Returns: 0, or negative error code. + * The key words are stored in *key on success. + * + * Should be called with ¤t->mm->mmap_sem, + * but NOT &futex_lock or ¤t->mm->page_table_lock. */ -static inline struct page *__pin_page_atomic (struct page *page) -{ - if (!PageReserved(page)) - get_page(page); - return page; -} - -static struct page *__pin_page(unsigned long addr) +static int get_futex_key(unsigned long uaddr, union futex_key *key) { struct mm_struct *mm = current->mm; - struct page *page, *tmp; + struct vm_area_struct *vma; + struct page *page; int err; /* - * Do a quick atomic lookup first - this is the fastpath. + * The futex address must be "naturally" aligned. + */ + key->offset = uaddr % PAGE_SIZE; + if (unlikely((key->offset % sizeof(u32)) != 0)) + return -EINVAL; + uaddr -= key->offset; + + /* + * The futex is hashed differently depending on whether + * it's in a shared or private mapping. So check vma first. + */ + vma = find_extend_vma(mm, uaddr); + if (unlikely(!vma)) + return -EFAULT; + + /* + * Permissions. */ - page = follow_page(mm, addr, 0); - if (likely(page != NULL)) - return __pin_page_atomic(page); + if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ)) + return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES; /* - * No luck - need to fault in the page: + * Private mappings are handled in a simple way. + * + * NOTE: When userspace waits on a MAP_SHARED mapping, even if + * it's a read-only handle, it's expected that futexes attach to + * the object not the particular process. Therefore we use + * VM_MAYSHARE here, not VM_SHARED which is restricted to shared + * mappings of _writable_ handles. */ -repeat_lookup: + if (likely(!(vma->vm_flags & VM_MAYSHARE))) { + key->private.mm = mm; + key->private.uaddr = uaddr; + return 0; + } - unlock_futex_mm(); + /* + * Linear mappings are also simple. 
+ */ + key->shared.inode = vma->vm_file->f_dentry->d_inode; + if (likely(!(vma->vm_flags & VM_NONLINEAR))) { + key->shared.pgoff = (((uaddr - vma->vm_start) >> PAGE_SHIFT) + + vma->vm_pgoff); + return 0; + } - down_read(&mm->mmap_sem); - err = get_user_pages(current, mm, addr, 1, 0, 0, &page, NULL); - up_read(&mm->mmap_sem); + /* + * We could walk the page table to read the non-linear + * pte, and get the page index without fetching the page + * from swap. But that's a lot of code to duplicate here + * for a rare case, so we simply fetch the page. + */ - lock_futex_mm(); + /* + * Do a quick atomic lookup first - this is the fastpath. + */ + spin_lock(¤t->mm->page_table_lock); + page = follow_page(mm, uaddr, 0); + if (likely(page != NULL)) { + key->shared.pgoff = + page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + spin_unlock(¤t->mm->page_table_lock); + return 0; + } + spin_unlock(¤t->mm->page_table_lock); - if (err < 0) - return NULL; /* - * Since the faulting happened with locks released, we have to - * check for races: + * Do it the general way. */ - tmp = follow_page(mm, addr, 0); - if (tmp != page) { + err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL); + if (err >= 0) { + key->shared.pgoff = + page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); put_page(page); - goto repeat_lookup; } - - return page; + return err; } + /* * Wake up all waiters hashed on the physical page that is mapped * to this virtual address: */ -static inline int futex_wake(unsigned long uaddr, int offset, int num) +static inline int futex_wake(unsigned long uaddr, int num) { struct list_head *i, *next, *head; - struct page *page; - int ret = 0; + union futex_key key; + int ret; - lock_futex_mm(); + down_read(¤t->mm->mmap_sem); - page = __pin_page(uaddr - offset); - if (!page) { - unlock_futex_mm(); - return -EFAULT; - } + ret = get_futex_key(uaddr, &key); + if (unlikely(ret != 0)) + goto out; - head = hash_futex(page, offset); + head = hash_futex(&key); + spin_lock(&futex_lock); list_for_each_safe(i, next, head) { struct futex_q *this = list_entry(i, struct futex_q, list); - if (this->page == page && this->offset == offset) { + if (match_futex (&this->key, &key)) { list_del_init(i); - __detach_vcache(&this->vcache); wake_up_all(&this->waiters); if (this->filp) send_sigio(&this->filp->f_owner, this->fd, POLL_IN); @@ -179,113 +236,74 @@ static inline int futex_wake(unsigned long uaddr, int offset, int num) break; } } + spin_unlock(&futex_lock); - unlock_futex_mm(); - put_page(page); - +out: + up_read(¤t->mm->mmap_sem); return ret; } -/* - * This gets called by the COW code, we have to rehash any - * futexes that were pending on the old physical page, and - * rehash it to the new physical page. The pagetable_lock - * and vcache_lock is already held: - */ -static void futex_vcache_callback(vcache_t *vcache, struct page *new_page) -{ - struct futex_q *q = container_of(vcache, struct futex_q, vcache); - struct list_head *head = hash_futex(new_page, q->offset); - - spin_lock(&futex_lock); - - if (!list_empty(&q->list)) { - put_page(q->page); - q->page = new_page; - __pin_page_atomic(new_page); - list_del(&q->list); - list_add_tail(&q->list, head); - } - - spin_unlock(&futex_lock); -} - /* * Requeue all waiters hashed on one physical page to another * physical page. 
*/ -static inline int futex_requeue(unsigned long uaddr1, int offset1, - unsigned long uaddr2, int offset2, int nr_wake, int nr_requeue) +static inline int futex_requeue(unsigned long uaddr1, unsigned long uaddr2, + int nr_wake, int nr_requeue) { struct list_head *i, *next, *head1, *head2; - struct page *page1 = NULL, *page2 = NULL; - int ret = 0; + union futex_key key1, key2; + int ret; - lock_futex_mm(); + down_read(¤t->mm->mmap_sem); - page1 = __pin_page(uaddr1 - offset1); - if (!page1) + ret = get_futex_key(uaddr1, &key1); + if (unlikely(ret != 0)) goto out; - page2 = __pin_page(uaddr2 - offset2); - if (!page2) + ret = get_futex_key(uaddr2, &key2); + if (unlikely(ret != 0)) goto out; - head1 = hash_futex(page1, offset1); - head2 = hash_futex(page2, offset2); + head1 = hash_futex(&key1); + head2 = hash_futex(&key2); + spin_lock(&futex_lock); list_for_each_safe(i, next, head1) { struct futex_q *this = list_entry(i, struct futex_q, list); - if (this->page == page1 && this->offset == offset1) { + if (match_futex (&this->key, &key1)) { list_del_init(i); - __detach_vcache(&this->vcache); if (++ret <= nr_wake) { wake_up_all(&this->waiters); if (this->filp) send_sigio(&this->filp->f_owner, this->fd, POLL_IN); } else { - put_page(this->page); - __pin_page_atomic (page2); list_add_tail(i, head2); - __attach_vcache(&this->vcache, uaddr2, - current->mm, futex_vcache_callback); - this->offset = offset2; - this->page = page2; + this->key = key2; if (ret - nr_wake >= nr_requeue) break; } } } + spin_unlock(&futex_lock); out: - unlock_futex_mm(); - - if (page1) - put_page(page1); - if (page2) - put_page(page2); - + up_read(¤t->mm->mmap_sem); return ret; } -static inline void __queue_me(struct futex_q *q, struct page *page, - unsigned long uaddr, int offset, - int fd, struct file *filp) +static inline void queue_me(struct futex_q *q, union futex_key *key, + int fd, struct file *filp) { - struct list_head *head = hash_futex(page, offset); + struct list_head *head = hash_futex(key); - q->offset = offset; + q->key = *key; q->fd = fd; q->filp = filp; - q->page = page; + spin_lock(&futex_lock); list_add_tail(&q->list, head); - /* - * We register a futex callback to this virtual address, - * to make sure a COW properly rehashes the futex-queue. - */ - __attach_vcache(&q->vcache, uaddr, current->mm, futex_vcache_callback); + spin_unlock(&futex_lock); } /* Return 1 if we were still queued (ie. 0 means we were woken) */ @@ -293,83 +311,107 @@ static inline int unqueue_me(struct futex_q *q) { int ret = 0; - spin_lock(&vcache_lock); spin_lock(&futex_lock); if (!list_empty(&q->list)) { list_del(&q->list); - __detach_vcache(&q->vcache); ret = 1; } spin_unlock(&futex_lock); - spin_unlock(&vcache_lock); return ret; } -static inline int futex_wait(unsigned long uaddr, - int offset, - int val, - unsigned long time) +static inline int futex_wait(unsigned long uaddr, int val, unsigned long time) { DECLARE_WAITQUEUE(wait, current); - int ret = 0, curval; - struct page *page; + int ret, curval; + union futex_key key; struct futex_q q; + try_again: init_waitqueue_head(&q.waiters); - lock_futex_mm(); + down_read(¤t->mm->mmap_sem); - page = __pin_page(uaddr - offset); - if (!page) { - unlock_futex_mm(); - return -EFAULT; - } - __queue_me(&q, page, uaddr, offset, -1, NULL); + ret = get_futex_key(uaddr, &key); + if (unlikely(ret != 0)) + goto out_release_sem; + + queue_me(&q, &key, -1, NULL); /* - * Page is pinned, but may no longer be in this address space. - * It cannot schedule, so we access it with the spinlock held. 
+ * Access the page after the futex is queued. + * We hold the mmap semaphore, so the mapping cannot have changed + * since we looked it up. */ if (get_user(curval, (int *)uaddr) != 0) { - unlock_futex_mm(); ret = -EFAULT; - goto out; + goto out_unqueue; } if (curval != val) { - unlock_futex_mm(); ret = -EWOULDBLOCK; - goto out; + goto out_unqueue; } + + /* + * Now the futex is queued and we have checked the data, we + * don't want to hold mmap_sem while we sleep. + */ + up_read(¤t->mm->mmap_sem); + /* - * The get_user() above might fault and schedule so we - * cannot just set TASK_INTERRUPTIBLE state when queueing - * ourselves into the futex hash. This code thus has to + * There might have been scheduling since the queue_me(), as we + * cannot hold a spinlock across the get_user() in case it + * faults. So we cannot just set TASK_INTERRUPTIBLE state when + * queueing ourselves into the futex hash. This code thus has to * rely on the futex_wake() code doing a wakeup after removing * the waiter from the list. */ add_wait_queue(&q.waiters, &wait); + spin_lock(&futex_lock); set_current_state(TASK_INTERRUPTIBLE); - if (!list_empty(&q.list)) { - unlock_futex_mm(); - time = schedule_timeout(time); + + if (unlikely(list_empty(&q.list))) { + /* + * We were woken already. + */ + spin_unlock(&futex_lock); + set_current_state(TASK_RUNNING); + return 0; } + + spin_unlock(&futex_lock); + time = schedule_timeout(time); set_current_state(TASK_RUNNING); + /* * NOTE: we don't remove ourselves from the waitqueue because * we are the only user of it. */ - if (time == 0) { - ret = -ETIMEDOUT; - goto out; - } + + /* + * Were we woken or interrupted for a valid reason? + */ + ret = unqueue_me(&q); + if (ret == 0) + return 0; + if (time == 0) + return -ETIMEDOUT; if (signal_pending(current)) - ret = -EINTR; -out: - /* Were we woken up anyway? */ + return -EINTR; + + /* + * No, it was a spurious wakeup. Try again. Should never happen. :) + */ + goto try_again; + + out_unqueue: + /* + * Were we unqueued anyway? + */ if (!unqueue_me(&q)) ret = 0; - put_page(q.page); - + out_release_sem: + up_read(¤t->mm->mmap_sem); return ret; } @@ -378,7 +420,6 @@ static int futex_close(struct inode *inode, struct file *filp) struct futex_q *q = filp->private_data; unqueue_me(q); - put_page(q->page); kfree(filp->private_data); return 0; } @@ -406,12 +447,12 @@ static struct file_operations futex_fops = { /* Signal allows caller to avoid the race which would occur if they set the sigio stuff up afterwards. 
*/ -static int futex_fd(unsigned long uaddr, int offset, int signal) +static int futex_fd(unsigned long uaddr, int signal) { - struct page *page = NULL; struct futex_q *q; + union futex_key key; struct file *filp; - int ret; + int ret, err; ret = -EINVAL; if (signal < 0 || signal > _NSIG) @@ -450,69 +491,47 @@ static int futex_fd(unsigned long uaddr, int offset, int signal) goto out; } - lock_futex_mm(); - - page = __pin_page(uaddr - offset); - if (!page) { - unlock_futex_mm(); + down_read(¤t->mm->mmap_sem); + err = get_futex_key(uaddr, &key); + up_read(¤t->mm->mmap_sem); + if (unlikely(err != 0)) { put_unused_fd(ret); put_filp(filp); kfree(q); - return -EFAULT; + return err; } init_waitqueue_head(&q->waiters); filp->private_data = q; - __queue_me(q, page, uaddr, offset, ret, filp); - - unlock_futex_mm(); + queue_me(q, &key, ret, filp); /* Now we map fd to filp, so userspace can access it */ fd_install(ret, filp); - page = NULL; out: - if (page) - put_page(page); return ret; } long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, unsigned long uaddr2, int val2) { - unsigned long pos_in_page; int ret; - pos_in_page = uaddr % PAGE_SIZE; - - /* Must be "naturally" aligned */ - if (pos_in_page % sizeof(u32)) - return -EINVAL; - switch (op) { case FUTEX_WAIT: - ret = futex_wait(uaddr, pos_in_page, val, timeout); + ret = futex_wait(uaddr, val, timeout); break; case FUTEX_WAKE: - ret = futex_wake(uaddr, pos_in_page, val); + ret = futex_wake(uaddr, val); break; case FUTEX_FD: /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */ - ret = futex_fd(uaddr, pos_in_page, val); + ret = futex_fd(uaddr, val); break; case FUTEX_REQUEUE: - { - unsigned long pos_in_page2 = uaddr2 % PAGE_SIZE; - - /* Must be "naturally" aligned */ - if (pos_in_page2 % sizeof(u32)) - return -EINVAL; - - ret = futex_requeue(uaddr, pos_in_page, uaddr2, pos_in_page2, - val, val2); + ret = futex_requeue(uaddr, uaddr2, val, val2); break; - } default: ret = -ENOSYS; } diff --git a/mm/Makefile b/mm/Makefile index a8de64ff3525..c66aba5886f8 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -9,6 +9,6 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ page_alloc.o page-writeback.o pdflush.o readahead.o \ - slab.o swap.o truncate.o vcache.o vmscan.o $(mmu-y) + slab.o swap.o truncate.o vmscan.o $(mmu-y) obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o diff --git a/mm/fremap.c b/mm/fremap.c index 8f96af82b4e8..b19bdde07bb6 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -144,9 +144,10 @@ long sys_remap_file_pages(unsigned long start, unsigned long size, return err; #endif - down_read(&mm->mmap_sem); - + /* We need down_write() to change vma->vm_flags. */ + down_write(&mm->mmap_sem); vma = find_vma(mm, start); + /* * Make sure the vma is shared, that it supports prefaulting, * and that the remapped range is valid and fully within @@ -155,11 +156,27 @@ long sys_remap_file_pages(unsigned long start, unsigned long size, if (vma && (vma->vm_flags & VM_SHARED) && vma->vm_ops && vma->vm_ops->populate && end > start && start >= vma->vm_start && - end <= vma->vm_end) - err = vma->vm_ops->populate(vma, start, size, vma->vm_page_prot, - pgoff, flags & MAP_NONBLOCK); - - up_read(&mm->mmap_sem); + end <= vma->vm_end) { + + /* Must set VM_NONLINEAR before any pages are populated. 
*/ + if (pgoff != ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff) + vma->vm_flags |= VM_NONLINEAR; + + /* ->populate can take a long time, so downgrade the lock. */ + downgrade_write(&mm->mmap_sem); + err = vma->vm_ops->populate(vma, start, size, + vma->vm_page_prot, + pgoff, flags & MAP_NONBLOCK); + + /* + * We can't clear VM_NONLINEAR because we'd have to do + * it after ->populate completes, and that would prevent + * downgrading the lock. (Locks can't be upgraded). + */ + up_read(&mm->mmap_sem); + } else { + up_write(&mm->mmap_sem); + } return err; } diff --git a/mm/memory.c b/mm/memory.c index 61b782f40df9..980953dbbfb4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -43,7 +43,6 @@ #include #include #include -#include #include #include @@ -962,7 +961,6 @@ static inline void establish_pte(struct vm_area_struct * vma, unsigned long addr static inline void break_cow(struct vm_area_struct * vma, struct page * new_page, unsigned long address, pte_t *page_table) { - invalidate_vcache(address, vma->vm_mm, new_page); flush_cache_page(vma, address); establish_pte(vma, address, page_table, pte_mkwrite(pte_mkdirty(mk_pte(new_page, vma->vm_page_prot)))); } diff --git a/mm/vcache.c b/mm/vcache.c deleted file mode 100644 index 599e0f25490d..000000000000 --- a/mm/vcache.c +++ /dev/null @@ -1,90 +0,0 @@ -/* - * linux/mm/vcache.c - * - * virtual => physical page mapping cache. Users of this mechanism - * register callbacks for a given (virt,mm,phys) page mapping, and - * the kernel guarantees to call back when this mapping is invalidated. - * (ie. upon COW or unmap.) - * - * Started by Ingo Molnar, Copyright (C) 2002 - */ - -#include -#include -#include -#include - -#define VCACHE_HASHBITS 8 -#define VCACHE_HASHSIZE (1 << VCACHE_HASHBITS) - -spinlock_t vcache_lock = SPIN_LOCK_UNLOCKED; - -static struct list_head hash[VCACHE_HASHSIZE]; - -static struct list_head *hash_vcache(unsigned long address, - struct mm_struct *mm) -{ - return &hash[hash_long(address + (unsigned long)mm, VCACHE_HASHBITS)]; -} - -void __attach_vcache(vcache_t *vcache, - unsigned long address, - struct mm_struct *mm, - void (*callback)(struct vcache_s *data, struct page *new)) -{ - struct list_head *hash_head; - - address &= PAGE_MASK; - vcache->address = address; - vcache->mm = mm; - vcache->callback = callback; - - hash_head = hash_vcache(address, mm); - - list_add_tail(&vcache->hash_entry, hash_head); -} - -void __detach_vcache(vcache_t *vcache) -{ - list_del_init(&vcache->hash_entry); -} - -void invalidate_vcache(unsigned long address, struct mm_struct *mm, - struct page *new_page) -{ - struct list_head *l, *hash_head; - vcache_t *vcache; - - address &= PAGE_MASK; - - hash_head = hash_vcache(address, mm); - /* - * This is safe, because this path is called with the pagetable - * lock held. So while other mm's might add new entries in - * parallel, *this* mm is locked out, so if the list is empty - * now then we do not have to take the vcache lock to see it's - * really empty. 
- */ - if (likely(list_empty(hash_head))) - return; - - spin_lock(&vcache_lock); - list_for_each(l, hash_head) { - vcache = list_entry(l, vcache_t, hash_entry); - if (vcache->address != address || vcache->mm != mm) - continue; - vcache->callback(vcache, new_page); - } - spin_unlock(&vcache_lock); -} - -static int __init vcache_init(void) -{ - unsigned int i; - - for (i = 0; i < VCACHE_HASHSIZE; i++) - INIT_LIST_HEAD(hash + i); - return 0; -} -__initcall(vcache_init); - -- cgit v1.2.3 From 25a6ca892403f9d5dc2e1ed28db13e31b9fea2d2 Mon Sep 17 00:00:00 2001 From: Alexander Viro Date: Thu, 4 Sep 2003 20:53:47 -0700 Subject: [PATCH] large dev_t - second series (5/15) cdevname() killed, there was only one remaining user (tty_paranoia_check()) and in that case cdevname() was worse than plain major:minor (basically, it's "you've got corrupted inode that was supposed to belong to tty device; here's what I'd found in ->i_rdev") --- drivers/char/tty_io.c | 13 ++++++------- fs/char_dev.c | 21 --------------------- include/linux/fs.h | 1 - kernel/ksyms.c | 1 - 4 files changed, 6 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index 5b4693c5f990..83640a923209 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -181,17 +181,16 @@ inline int tty_paranoia_check(struct tty_struct *tty, struct inode *inode, const char *routine) { #ifdef TTY_PARANOIA_CHECK - static const char badmagic[] = KERN_WARNING - "Warning: bad magic number for tty struct (%s) in %s\n"; - static const char badtty[] = KERN_WARNING - "Warning: null TTY for (%s) in %s\n"; - if (!tty) { - printk(badtty, cdevname(inode->i_rdev), routine); + printk(KERN_WARNING + "null TTY for (%d:%d) in %s\n", + imajor(inode), iminor(inode), routine); return 1; } if (tty->magic != TTY_MAGIC) { - printk(badmagic, cdevname(inode->i_rdev), routine); + printk(KERN_WARNING + "bad magic number for tty struct (%d:%d) in %s\n", + imajor(inode), iminor(inode), routine); return 1; } #endif diff --git a/fs/char_dev.c b/fs/char_dev.c index 3c6258c73384..e41bf7caa5ee 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -328,27 +328,6 @@ struct file_operations def_chr_fops = { .open = chrdev_open, }; -const char *cdevname(kdev_t dev) -{ - static char buffer[40]; - const char *name = "unknown-char"; - unsigned int major = major(dev); - unsigned int minor = minor(dev); - int i = major_to_index(major); - struct char_device_struct *cd; - - read_lock(&chrdevs_lock); - for (cd = chrdevs[i]; cd; cd = cd->next) - if (cd->major == major) - break; - if (cd) - name = cd->name; - sprintf(buffer, "%s(%d,%d)", name, major, minor); - read_unlock(&chrdevs_lock); - - return buffer; -} - static struct kobject *exact_match(dev_t dev, int *part, void *data) { struct cdev *p = data; diff --git a/include/linux/fs.h b/include/linux/fs.h index 87ae270bf96c..fa2bd2dd7600 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1156,7 +1156,6 @@ extern struct block_device *lookup_bdev(const char *); extern struct block_device *open_bdev_excl(const char *, int, int, void *); extern void close_bdev_excl(struct block_device *, int); -extern const char * cdevname(kdev_t); extern void init_special_inode(struct inode *, umode_t, dev_t); /* Invalid inode operations -- fs/bad_inode.c */ diff --git a/kernel/ksyms.c b/kernel/ksyms.c index e503bd8b0349..9f61a0496c2a 100644 --- a/kernel/ksyms.c +++ b/kernel/ksyms.c @@ -511,7 +511,6 @@ EXPORT_SYMBOL(vsnprintf); EXPORT_SYMBOL(vsscanf); EXPORT_SYMBOL(__bdevname); 
EXPORT_SYMBOL(bdevname); -EXPORT_SYMBOL(cdevname); EXPORT_SYMBOL(simple_strtoull); EXPORT_SYMBOL(simple_strtoul); EXPORT_SYMBOL(simple_strtol); -- cgit v1.2.3
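
[Editor's note: an illustration only, not code from the patch. The plain
major:minor formatting that replaces cdevname() has a rough userspace
analogue in the major()/minor() macros from sys/sysmacros.h; in the
kernel, imajor() and iminor() extract the same two values from
inode->i_rdev, as the tty_io.c hunk above shows.]

    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/sysmacros.h>  /* userspace major()/minor()/makedev() */

    int main(void)
    {
            /* Hypothetical device number; a (4:64) pair is what a
             * ttyS0-style character device would carry in i_rdev. */
            dev_t dev = makedev(4, 64);

            printf("bad magic number for tty struct (%u:%u) in %s\n",
                   major(dev), minor(dev),
                   "tty_open" /* hypothetical caller name */);
            return 0;
    }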