From 85b2148a13080ee8244d9bb60b42da7dee7725a3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 Sep 2002 22:50:18 -0700 Subject: [PATCH] deadline scheduler This introduces the deadline-ioscheduler, making it the default. 2nd patch coming that deletes elevator_linus in a minute. This one has read_expire at 500ms, and writes_starved at 2. --- include/linux/elevator.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include') diff --git a/include/linux/elevator.h b/include/linux/elevator.h index e98168f92e67..8e4ca13c458a 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -59,6 +59,12 @@ extern elevator_t elevator_linus; #define elv_linus_sequence(rq) ((long)(rq)->elevator_private) #define ELV_LINUS_SEEK_COST 16 +/* + * deadline i/o scheduler. uses request time outs to prevent indefinite + * starvation + */ +extern elevator_t iosched_deadline; + /* * use the /proc/iosched interface, all the below is history -> */ -- cgit v1.2.3 From 2684cd69c942e9b63d094059992e46a988fdb7f1 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 24 Sep 2002 22:50:31 -0700 Subject: [PATCH] remove elevator_linus Patch killing off elevator_linus for good. Sniffle. --- drivers/block/elevator.c | 118 ----------------------------------------------- include/linux/elevator.h | 8 ---- 2 files changed, 126 deletions(-) (limited to 'include') diff --git a/drivers/block/elevator.c b/drivers/block/elevator.c index 10729a1f0c1c..68f2ded9d86e 100644 --- a/drivers/block/elevator.c +++ b/drivers/block/elevator.c @@ -157,114 +157,6 @@ inline int elv_try_last_merge(request_queue_t *q, struct request **req, return ret; } -static int bio_rq_before(struct bio *bio, struct request *rq) -{ - if (!kdev_same(to_kdev_t(bio->bi_bdev->bd_dev), rq->rq_dev)) - return 0; - return bio->bi_sector < rq->sector; -} - -/* - * elevator_linux starts here - */ -int elevator_linus_merge(request_queue_t *q, struct request **req, - struct bio *bio) -{ - struct list_head *entry, *good; - struct request *__rq; - int ret; - - if ((ret = elv_try_last_merge(q, req, bio))) - return ret; - - entry = &q->queue_head; - good = &q->queue_head; - ret = ELEVATOR_NO_MERGE; - while ((entry = entry->prev) != &q->queue_head) { - __rq = list_entry_rq(entry); - - if (__rq->flags & (REQ_BARRIER | REQ_STARTED)) - break; - if (!(__rq->flags & REQ_CMD)) - break; - - if (bio_data_dir(bio) != rq_data_dir(__rq)) { - if (bio_data_dir(bio) == WRITE) - break; - good = entry->prev; - continue; - } - - ret = elv_try_merge(__rq, bio); - if (ret) { - *req = __rq; - q->last_merge = &__rq->queuelist; - return ret; - } - - if (bio_rq_before(bio, __rq)) - good = entry->prev; - - } - - if (good != &q->queue_head) - *req = list_entry_rq(good); - - return ELEVATOR_NO_MERGE; -} - -void elevator_linus_merge_req(request_queue_t *q, struct request *req, - struct request *next) -{ - if (elv_linus_sequence(next) < elv_linus_sequence(req)) - elv_linus_sequence(req) = elv_linus_sequence(next); -} - -void elevator_linus_add_request(request_queue_t *q, struct request *rq, - struct list_head *insert_here) -{ - elevator_t *e = &q->elevator; - int lat = 0, *latency = e->elevator_data; - - if (!insert_here) - insert_here = q->queue_head.prev; - - if (!(rq->flags & REQ_BARRIER)) - lat = latency[rq_data_dir(rq)]; - - elv_linus_sequence(rq) = lat; - - list_add(&rq->queuelist, insert_here); - - /* - * new merges must not precede this barrier - */ - if (rq->flags & REQ_BARRIER) - q->last_merge = NULL; - else if (!q->last_merge) - q->last_merge = &rq->queuelist; -} - -int 
elevator_linus_init(request_queue_t *q, elevator_t *e) -{ - int *latency; - - latency = kmalloc(2 * sizeof(int), GFP_KERNEL); - if (!latency) - return -ENOMEM; - - latency[READ] = 1024; - latency[WRITE] = 2048; - - e->elevator_data = latency; - return 0; -} - -void elevator_linus_exit(request_queue_t *q, elevator_t *e) -{ - kfree(e->elevator_data); -} - /* * elevator noop * @@ -442,15 +334,6 @@ inline struct list_head *elv_get_sort_head(request_queue_t *q, return &q->queue_head; } -elevator_t elevator_linus = { - elevator_merge_fn: elevator_linus_merge, - elevator_merge_req_fn: elevator_linus_merge_req, - elevator_next_req_fn: elevator_noop_next_request, - elevator_add_req_fn: elevator_linus_add_request, - elevator_init_fn: elevator_linus_init, - elevator_exit_fn: elevator_linus_exit, -}; - elevator_t elevator_noop = { elevator_merge_fn: elevator_noop_merge, elevator_next_req_fn: elevator_noop_next_request, @@ -459,7 +342,6 @@ elevator_t elevator_noop = { module_init(elevator_global_init); -EXPORT_SYMBOL(elevator_linus); EXPORT_SYMBOL(elevator_noop); EXPORT_SYMBOL(__elv_add_request); diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 8e4ca13c458a..c5cc69788530 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -51,14 +51,6 @@ extern inline struct list_head *elv_get_sort_head(request_queue_t *, struct requ */ extern elevator_t elevator_noop; -/* - * elevator linus. based on linus ideas of starvation control, using - * sequencing to manage inserts and merges. - */ -extern elevator_t elevator_linus; -#define elv_linus_sequence(rq) ((long)(rq)->elevator_private) -#define ELV_LINUS_SEEK_COST 16 - /* * deadline i/o scheduler. uses request time outs to prevent indefinite * starvation -- cgit v1.2.3 From 5dd6a6e5cdad7a7018accce3ccd888dea2667405 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 24 Sep 2002 22:50:59 -0700 Subject: [PATCH] exit-fix-2.5.38-E3 This fixes a number of bugs in the thread-release code: - notify parents only if the group leader is a zombie, and if it's not a detached thread. - do not reparent children to zombie tasks. - introduce the TASK_DEAD state for tasks, to serialize the task-release path. (to some it might be confusing that tasks are zombies first, then dead :-) - simplify tasklist_lock usage in release_task(). the effect of the above bugs ranged from unkillable hung zombies to kernel crashes. None of those happens with the patch applied. 
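The TASK_DEAD transition is what serializes the release path: in sys_wait4() the xchg() that moves the child from TASK_ZOMBIE to TASK_DEAD can succeed for only one thread, and everyone else racing on the same child sees TASK_DEAD and skips it. A minimal userspace sketch of that claim-by-exchange idea, using C11 atomics and invented names (an analogy, not the kernel code):

/* Sketch only: models the "only one reaper wins" handshake with C11
 * atomics. TASK_*_S, struct task and claim_for_release() are invented. */
#include <stdatomic.h>
#include <stdio.h>

enum { TASK_RUNNING_S, TASK_ZOMBIE_S, TASK_DEAD_S };

struct task {
        _Atomic int state;
};

/* Returns 1 only for the caller whose exchange observed TASK_ZOMBIE_S. */
static int claim_for_release(struct task *t)
{
        int old = atomic_exchange(&t->state, TASK_DEAD_S);
        return old == TASK_ZOMBIE_S;
}

int main(void)
{
        struct task child = { TASK_ZOMBIE_S };

        printf("first reaper wins:  %d\n", claim_for_release(&child));  /* 1 */
        printf("second reaper wins: %d\n", claim_for_release(&child));  /* 0 */
        return 0;
}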
--- include/linux/sched.h | 5 +++-- kernel/exit.c | 46 +++++++++++++++++++++++++++++++--------------- 2 files changed, 34 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index f1346010d73e..471dcb9c108d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -100,8 +100,9 @@ extern unsigned long nr_uninterruptible(void); #define TASK_RUNNING 0 #define TASK_INTERRUPTIBLE 1 #define TASK_UNINTERRUPTIBLE 2 -#define TASK_ZOMBIE 4 -#define TASK_STOPPED 8 +#define TASK_STOPPED 4 +#define TASK_ZOMBIE 8 +#define TASK_DEAD 16 #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) diff --git a/kernel/exit.c b/kernel/exit.c index 7189e9bce6d4..6ed07def4c62 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -32,6 +32,7 @@ int getrusage(struct task_struct *, int, struct rusage *); static struct dentry * __unhash_process(struct task_struct *p) { struct dentry *proc_dentry; + nr_threads--; detach_pid(p, PIDTYPE_PID); detach_pid(p, PIDTYPE_TGID); @@ -57,31 +58,31 @@ static struct dentry * __unhash_process(struct task_struct *p) void release_task(struct task_struct * p) { struct dentry *proc_dentry; + task_t *leader; - if (p->state != TASK_ZOMBIE) + if (p->state < TASK_ZOMBIE) BUG(); if (p != current) wait_task_inactive(p); atomic_dec(&p->user->processes); security_ops->task_free_security(p); free_uid(p->user); - if (unlikely(p->ptrace)) { - write_lock_irq(&tasklist_lock); + write_lock_irq(&tasklist_lock); + if (unlikely(p->ptrace)) __ptrace_unlink(p); - write_unlock_irq(&tasklist_lock); - } BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); - write_lock_irq(&tasklist_lock); __exit_sighand(p); proc_dentry = __unhash_process(p); /* * If we are the last non-leader member of the thread * group, and the leader is zombie, then notify the - * group leader's parent process. + * group leader's parent process. (if it wants notification.) */ - if (p->group_leader != p && thread_group_empty(p)) - do_notify_parent(p->group_leader, p->group_leader->exit_signal); + leader = p->group_leader; + if (leader != p && thread_group_empty(leader) && + leader->state == TASK_ZOMBIE && leader->exit_signal != -1) + do_notify_parent(leader, leader->exit_signal); p->parent->cutime += p->utime + p->cutime; p->parent->cstime += p->stime + p->cstime; @@ -159,7 +160,7 @@ static int __will_become_orphaned_pgrp(int pgrp, task_t *ignored_task) for_each_task_pid(pgrp, PIDTYPE_PGID, p, l, pid) { if (p == ignored_task - || p->state == TASK_ZOMBIE + || p->state >= TASK_ZOMBIE || p->real_parent->pid == 1) continue; if (p->real_parent->pgrp != pgrp @@ -435,8 +436,11 @@ void exit_mm(struct task_struct *tsk) static inline void choose_new_parent(task_t *p, task_t *reaper, task_t *child_reaper) { - /* Make sure we're not reparenting to ourselves. */ - if (p == reaper) + /* + * Make sure we're not reparenting to ourselves and that + * the parent is not a zombie. 
+ */ + if (p == reaper || reaper->state >= TASK_ZOMBIE) p->real_parent = child_reaper; else p->real_parent = reaper; @@ -774,9 +778,10 @@ static int eligible_child(pid_t pid, int options, task_t *p) asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru) { - int flag, retval; DECLARE_WAITQUEUE(wait, current); struct task_struct *tsk; + unsigned long state; + int flag, retval; if (options & ~(WNOHANG|WUNTRACED|__WNOTHREAD|__WCLONE|__WALL)) return -EINVAL; @@ -827,7 +832,15 @@ repeat: */ if (ret == 2) continue; + /* + * Try to move the task's state to DEAD + * only one thread is allowed to do this: + */ + state = xchg(&p->state, TASK_DEAD); + if (state != TASK_ZOMBIE) + continue; read_unlock(&tasklist_lock); + retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; if (!retval && stat_addr) { if (p->sig->group_exit) @@ -835,13 +848,16 @@ repeat: else retval = put_user(p->exit_code, stat_addr); } - if (retval) - goto end_wait4; + if (retval) { + p->state = TASK_ZOMBIE; + goto end_wait4; + } retval = p->pid; if (p->real_parent != p->parent) { write_lock_irq(&tasklist_lock); __ptrace_unlink(p); do_notify_parent(p, SIGCHLD); + p->state = TASK_ZOMBIE; write_unlock_irq(&tasklist_lock); } else release_task(p); -- cgit v1.2.3 From bce5aeb50b4b996e3a20d4df247714dc3d8d6b69 Mon Sep 17 00:00:00 2001 From: "Martin J. Bligh" Date: Wed, 25 Sep 2002 07:05:01 -0700 Subject: [PATCH] NUMA-Q fixes - Remove the const that someone incorrectly stuck in there, it type conflicts. Alan has a better plan for fixing this long term, but this fixes the compile warning for now. - Move the printk of the xquad_portio setup *after* we put something in the variable so it actually prints something useful, not 0 ;-) - To derive the size of the xquad_portio area, multiply the number of nodes by the size of each nodes, not the size of two nodes (and remove define). Doh! --- arch/i386/boot/compressed/misc.c | 2 +- arch/i386/kernel/smpboot.c | 6 +++--- include/asm-i386/io.h | 1 - 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/arch/i386/boot/compressed/misc.c b/arch/i386/boot/compressed/misc.c index 42ce2febe8b7..fcec73a7e379 100644 --- a/arch/i386/boot/compressed/misc.c +++ b/arch/i386/boot/compressed/misc.c @@ -121,7 +121,7 @@ static int vidport; static int lines, cols; #ifdef CONFIG_MULTIQUAD -static void * const xquad_portio = NULL; +static void * xquad_portio = NULL; #endif #include "../../../../lib/inflate.c" diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c index 8a04f3d2c8aa..9d513dc1ceb2 100644 --- a/arch/i386/kernel/smpboot.c +++ b/arch/i386/kernel/smpboot.c @@ -1060,11 +1060,11 @@ static void __init smp_boot_cpus(unsigned int max_cpus) if (clustered_apic_mode && (numnodes > 1)) { printk("Remapping cross-quad port I/O for %d quads\n", numnodes); + xquad_portio = ioremap (XQUAD_PORTIO_BASE, + numnodes * XQUAD_PORTIO_QUAD); printk("xquad_portio vaddr 0x%08lx, len %08lx\n", (u_long) xquad_portio, - (u_long) numnodes * XQUAD_PORTIO_LEN); - xquad_portio = ioremap (XQUAD_PORTIO_BASE, - numnodes * XQUAD_PORTIO_LEN); + (u_long) numnodes * XQUAD_PORTIO_QUAD); } /* diff --git a/include/asm-i386/io.h b/include/asm-i386/io.h index b2afa09c062f..41bc8ef0bdf8 100644 --- a/include/asm-i386/io.h +++ b/include/asm-i386/io.h @@ -40,7 +40,6 @@ #define XQUAD_PORTIO_BASE 0xfe400000 #define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. 
*/ -#define XQUAD_PORTIO_LEN 0x80000 /* Only remapping first 2 quads */ #ifdef __KERNEL__ -- cgit v1.2.3 From 3da08d6c052734e186e835dc05ff9a33746c21e4 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 25 Sep 2002 07:20:08 -0700 Subject: [PATCH] prepare_to_wait/finish_wait sleep/wakeup API This is worth a whopping 2% on specweb on an 8-way. Which is faintly surprising because __wake_up and other wait/wakeup functions are not apparent in the specweb profiles which I've seen. The main objective of this is to reduce the CPU cost of the wait/wakeup operation. When a task is woken up, its waitqueue is removed from the waitqueue_head by the waker (ie: immediately), rather than by the woken process. This means that a subsequent wakeup does not need to revisit the just-woken task. It also means that the just-woken task does not need to take the waitqueue_head's lock, which may well reside in another CPU's cache. I have no decent measurements on the effect of this change - possibly a 20-30% drop in __wake_up cost in Badari's 40-dds-to-40-disks test (it was the most expensive function), but it's inconclusive. And no quantitative testing of which I am aware has been performed by networking people. The API is very simple to use (Linus thought it up): my_func(waitqueue_head_t *wqh) { DEFINE_WAIT(wait); prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); if (!some_test) schedule(); finish_wait(wqh, &wait); } or: DEFINE_WAIT(wait); while (!some_test_1) { prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); if (!some_test_2) schedule(); ... } finish_wait(wqh, &wait); You need to bear in mind that once prepare_to_wait has been performed, your task could be removed from the waitqueue_head and placed into TASK_RUNNING at any time. You don't know whether or not you're still on the waitqueue_head. Running prepare_to_wait() when you're already on the waitqueue_head is fine - it will do the right thing. Running finish_wait() when you're actually not on the waitqueue_head is fine. Running finish_wait() when you've _never_ been on the waitqueue_head is fine, as long as the DEFINE_WAIT() macro was used to initialise the waitqueue. You don't need to fiddle with current->state. prepare_to_wait() and finish_wait() will do that. finish_wait() will always return in state TASK_RUNNING. There are plenty of usage examples in vm-wakeups.patch and tcp-wakeups.patch.
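One thing the skeletons above leave implicit is the waker side and the ordering that makes the loop safe: the producer must make the condition true before calling wake_up(), and the sleeper re-tests the condition after prepare_to_wait() has queued it and set its state, so a wakeup cannot fall into the gap between the test and schedule(). A minimal kernel-style sketch of both halves (my_flag and my_wqh are invented names, not part of this patch):

/* Sketch only: my_wqh and my_flag are illustrative. */
#include <linux/wait.h>
#include <linux/sched.h>

static DECLARE_WAIT_QUEUE_HEAD(my_wqh);
static int my_flag;

void wait_for_flag(void)
{
        DEFINE_WAIT(wait);

        while (!my_flag) {
                prepare_to_wait(&my_wqh, &wait, TASK_UNINTERRUPTIBLE);
                /* re-test after we are queued: a wake_up() from here on
                 * either finds us on my_wqh or has already made us runnable */
                if (!my_flag)
                        schedule();
        }
        finish_wait(&my_wqh, &wait);
}

void set_flag(void)
{
        my_flag = 1;            /* make the condition true first... */
        wake_up(&my_wqh);       /* ...then wake any sleepers */
}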
--- include/linux/wait.h | 26 ++++++++++++++++++++++++++ kernel/fork.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/ksyms.c | 4 ++++ 3 files changed, 76 insertions(+) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index 8664b02f230d..b6ce459f8792 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -119,6 +119,32 @@ static inline void __remove_wait_queue(wait_queue_head_t *head, _raced; \ }) +/* + * Waitqueue's which are removed from the waitqueue_head at wakeup time + */ +void FASTCALL(prepare_to_wait(wait_queue_head_t *q, + wait_queue_t *wait, int state)); +void FASTCALL(prepare_to_wait_exclusive(wait_queue_head_t *q, + wait_queue_t *wait, int state)); +void FASTCALL(finish_wait(wait_queue_head_t *q, wait_queue_t *wait)); +int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync); + +#define DEFINE_WAIT(name) \ + wait_queue_t name = { \ + .task = current, \ + .func = autoremove_wake_function, \ + .task_list = { .next = &name.task_list, \ + .prev = &name.task_list, \ + }, \ + } + +#define init_wait(wait) \ + do { \ + wait->task = current; \ + wait->func = autoremove_wake_function; \ + INIT_LIST_HEAD(&wait->task_list); \ + } while (0) + #endif /* __KERNEL__ */ #endif diff --git a/kernel/fork.c b/kernel/fork.c index 062a4d1f9c3e..5880309f3fee 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -103,6 +103,52 @@ void remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait) spin_unlock_irqrestore(&q->lock, flags); } +void prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state) +{ + unsigned long flags; + + __set_current_state(state); + wait->flags &= ~WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + if (list_empty(&wait->task_list)) + __add_wait_queue(q, wait); + spin_unlock_irqrestore(&q->lock, flags); +} + +void +prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state) +{ + unsigned long flags; + + __set_current_state(state); + wait->flags |= WQ_FLAG_EXCLUSIVE; + spin_lock_irqsave(&q->lock, flags); + if (list_empty(&wait->task_list)) + __add_wait_queue_tail(q, wait); + spin_unlock_irqrestore(&q->lock, flags); +} + +void finish_wait(wait_queue_head_t *q, wait_queue_t *wait) +{ + unsigned long flags; + + __set_current_state(TASK_RUNNING); + if (!list_empty(&wait->task_list)) { + spin_lock_irqsave(&q->lock, flags); + list_del_init(&wait->task_list); + spin_unlock_irqrestore(&q->lock, flags); + } +} + +int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync) +{ + int ret = default_wake_function(wait, mode, sync); + + if (ret) + list_del_init(&wait->task_list); + return ret; +} + void __init fork_init(unsigned long mempages) { /* create a slab on which task_structs can be allocated */ diff --git a/kernel/ksyms.c b/kernel/ksyms.c index 557ae8f7ded2..cd69b97e8e9d 100644 --- a/kernel/ksyms.c +++ b/kernel/ksyms.c @@ -400,6 +400,10 @@ EXPORT_SYMBOL(irq_stat); EXPORT_SYMBOL(add_wait_queue); EXPORT_SYMBOL(add_wait_queue_exclusive); EXPORT_SYMBOL(remove_wait_queue); +EXPORT_SYMBOL(prepare_to_wait); +EXPORT_SYMBOL(prepare_to_wait_exclusive); +EXPORT_SYMBOL(finish_wait); +EXPORT_SYMBOL(autoremove_wake_function); /* completion handling */ EXPORT_SYMBOL(wait_for_completion); -- cgit v1.2.3 From dfdacf598759e7027914d50a77e8cd3a98bf7481 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 25 Sep 2002 07:20:13 -0700 Subject: [PATCH] use prepare_to_wait in VM/VFS This uses the new wakeup machinery in some hot parts of the VFS and block layers. 
wait_on_buffer(), wait_on_page(), lock_page(), blk_congestion_wait(). Also in get_request_wait(), although the benefit for exclusive wakeups will be lower. --- drivers/block/ll_rw_blk.c | 17 ++++++--------- fs/buffer.c | 16 ++++++-------- include/linux/pagemap.h | 8 ++++++- mm/filemap.c | 55 +++++++++++++++++------------------------------ 4 files changed, 40 insertions(+), 56 deletions(-) (limited to 'include') diff --git a/drivers/block/ll_rw_blk.c b/drivers/block/ll_rw_blk.c index dc521fe7bcaf..a2595200d838 100644 --- a/drivers/block/ll_rw_blk.c +++ b/drivers/block/ll_rw_blk.c @@ -1233,24 +1233,23 @@ static struct request *get_request(request_queue_t *q, int rw) */ static struct request *get_request_wait(request_queue_t *q, int rw) { - DECLARE_WAITQUEUE(wait, current); + DEFINE_WAIT(wait); struct request_list *rl = &q->rq[rw]; struct request *rq; spin_lock_prefetch(q->queue_lock); generic_unplug_device(q); - add_wait_queue_exclusive(&rl->wait, &wait); do { - set_current_state(TASK_UNINTERRUPTIBLE); + prepare_to_wait_exclusive(&rl->wait, &wait, + TASK_UNINTERRUPTIBLE); if (!rl->count) schedule(); + finish_wait(&rl->wait, &wait); spin_lock_irq(q->queue_lock); rq = get_request(q, rw); spin_unlock_irq(q->queue_lock); } while (rq == NULL); - remove_wait_queue(&rl->wait, &wait); - current->state = TASK_RUNNING; return rq; } @@ -1460,18 +1459,16 @@ void blk_put_request(struct request *req) */ void blk_congestion_wait(int rw, long timeout) { - DECLARE_WAITQUEUE(wait, current); + DEFINE_WAIT(wait); struct congestion_state *cs = &congestion_states[rw]; if (atomic_read(&cs->nr_congested_queues) == 0) return; blk_run_queues(); - set_current_state(TASK_UNINTERRUPTIBLE); - add_wait_queue(&cs->wqh, &wait); + prepare_to_wait(&cs->wqh, &wait, TASK_UNINTERRUPTIBLE); if (atomic_read(&cs->nr_congested_queues) != 0) schedule_timeout(timeout); - set_current_state(TASK_RUNNING); - remove_wait_queue(&cs->wqh, &wait); + finish_wait(&cs->wqh, &wait); } /* diff --git a/fs/buffer.c b/fs/buffer.c index 0b9766099e3d..3b8477f2aca7 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -128,22 +128,18 @@ void unlock_buffer(struct buffer_head *bh) */ void __wait_on_buffer(struct buffer_head * bh) { - wait_queue_head_t *wq = bh_waitq_head(bh); - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); + wait_queue_head_t *wqh = bh_waitq_head(bh); + DEFINE_WAIT(wait); get_bh(bh); - add_wait_queue(wq, &wait); do { + prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); blk_run_queues(); - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - if (!buffer_locked(bh)) - break; - schedule(); + if (buffer_locked(bh)) + schedule(); } while (buffer_locked(bh)); - tsk->state = TASK_RUNNING; - remove_wait_queue(wq, &wait); put_bh(bh); + finish_wait(wqh, &wait); } static inline void diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 43390b2e2ef4..bfc986131fe6 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -74,9 +74,15 @@ static inline void ___add_to_page_cache(struct page *page, inc_page_state(nr_pagecache); } -extern void FASTCALL(lock_page(struct page *page)); +extern void FASTCALL(__lock_page(struct page *page)); extern void FASTCALL(unlock_page(struct page *page)); +static inline void lock_page(struct page *page) +{ + if (TestSetPageLocked(page)) + __lock_page(page); +} + /* * This is exported only for wait_on_page_locked/wait_on_page_writeback. * Never use this directly! 
diff --git a/mm/filemap.c b/mm/filemap.c index 9118a5794f27..f45168a04974 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -632,19 +632,15 @@ static inline wait_queue_head_t *page_waitqueue(struct page *page) void wait_on_page_bit(struct page *page, int bit_nr) { wait_queue_head_t *waitqueue = page_waitqueue(page); - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); + DEFINE_WAIT(wait); - add_wait_queue(waitqueue, &wait); do { - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - if (!test_bit(bit_nr, &page->flags)) - break; + prepare_to_wait(waitqueue, &wait, TASK_UNINTERRUPTIBLE); sync_page(page); - schedule(); + if (test_bit(bit_nr, &page->flags)) + schedule(); } while (test_bit(bit_nr, &page->flags)); - __set_task_state(tsk, TASK_RUNNING); - remove_wait_queue(waitqueue, &wait); + finish_wait(waitqueue, &wait); } EXPORT_SYMBOL(wait_on_page_bit); @@ -690,38 +686,27 @@ void end_page_writeback(struct page *page) EXPORT_SYMBOL(end_page_writeback); /* - * Get a lock on the page, assuming we need to sleep - * to get it.. + * Get a lock on the page, assuming we need to sleep to get it. + * + * Ugly: running sync_page() in state TASK_UNINTERRUPTIBLE is scary. If some + * random driver's requestfn sets TASK_RUNNING, we could busywait. However + * chances are that on the second loop, the block layer's plug list is empty, + * so sync_page() will then return in state TASK_UNINTERRUPTIBLE. */ -static void __lock_page(struct page *page) +void __lock_page(struct page *page) { - wait_queue_head_t *waitqueue = page_waitqueue(page); - struct task_struct *tsk = current; - DECLARE_WAITQUEUE(wait, tsk); + wait_queue_head_t *wqh = page_waitqueue(page); + DEFINE_WAIT(wait); - add_wait_queue_exclusive(waitqueue, &wait); - for (;;) { - set_task_state(tsk, TASK_UNINTERRUPTIBLE); - if (PageLocked(page)) { - sync_page(page); + while (TestSetPageLocked(page)) { + prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); + sync_page(page); + if (PageLocked(page)) schedule(); - } - if (!TestSetPageLocked(page)) - break; } - __set_task_state(tsk, TASK_RUNNING); - remove_wait_queue(waitqueue, &wait); -} - -/* - * Get an exclusive lock on the page, optimistically - * assuming it's not locked.. - */ -void lock_page(struct page *page) -{ - if (TestSetPageLocked(page)) - __lock_page(page); + finish_wait(wqh, &wait); } +EXPORT_SYMBOL(__lock_page); /* * a rather lightweight function, finding and getting a reference to a -- cgit v1.2.3 From b65bbded3935b896d55cb6b3e420a085d3089368 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 25 Sep 2002 07:20:18 -0700 Subject: [PATCH] slab reclaim balancing A patch from Ed Tomlinson which improves the way in which the kernel reclaims slab objects. The theory is: a cached object's usefulness is measured in terms of the number of disk seeks which it saves. Furthermore, we assume that one dentry or inode saves as many seeks as one pagecache page. So we reap slab objects at the same rate as we reclaim pages. For each 1% of reclaimed pagecache we reclaim 1% of slab. (Actually, we _scan_ 1% of slab for each 1% of scanned pages). Furthermore we assume that one swapout costs twice as many seeks as one pagecache page, and twice as many seeks as one slab object. So we double the pressure on slab when anonymous pages are being considered for eviction. The code works nicely, and smoothly. Possibly it does not shrink slab hard enough, but that is now very easy to tune up and down. It is just: ratio *= 3; in shrink_caches(). Slab caches no longer hold onto completely empty pages. 
Instead, pages are freed as soon as they have zero objects. This is possibly a performance hit for slabs which have constructors, but it's doubtful. Most allocations after a batch of frees are satisfied from inside internally-fragmented pages and by the time slab gets back onto using the wholly-empty pages they'll be cache-cold. slab would be better off going and requesting a new, cache-warm page and reconstructing the objects therein. (Once we have the per-cpu hot-page allocator in place. It's happening). As a consequence of the above, kmem_cache_shrink() is now unused. No great loss there - the serialising effect of kmem_cache_shrink and its semaphore in front of page reclaim was measurably bad. Still todo: - batch up the shrinking so we don't call into prune_dcache and friends at high frequency asking for a tiny number of objects. - Maybe expose the shrink ratio via a tunable. - clean up slab.c - highmem page reclaim in prune_icache: highmem pages can pin inodes. --- fs/dcache.c | 30 +++++++---------------------- fs/dquot.c | 19 +++++-------------- fs/inode.c | 29 +++++++++------------------- include/linux/dcache.h | 2 +- include/linux/mm.h | 1 + mm/page_alloc.c | 11 +++++++++++ mm/slab.c | 8 ++++++-- mm/vmscan.c | 51 +++++++++++++++++++++++++++++++++++--------------- 8 files changed, 76 insertions(+), 75 deletions(-) (limited to 'include') diff --git a/fs/dcache.c b/fs/dcache.c index ac127d32eed9..1715f006ccd4 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -329,12 +329,11 @@ static inline void prune_one_dentry(struct dentry * dentry) void prune_dcache(int count) { spin_lock(&dcache_lock); - for (;;) { + for (; count ; count--) { struct dentry *dentry; struct list_head *tmp; tmp = dentry_unused.prev; - if (tmp == &dentry_unused) break; list_del_init(tmp); @@ -349,12 +348,8 @@ void prune_dcache(int count) dentry_stat.nr_unused--; /* Unused dentry with a count? */ - if (atomic_read(&dentry->d_count)) - BUG(); - + BUG_ON(atomic_read(&dentry->d_count)); prune_one_dentry(dentry); - if (!--count) - break; } spin_unlock(&dcache_lock); } @@ -573,19 +568,11 @@ void shrink_dcache_anon(struct list_head *head) /* * This is called from kswapd when we think we need some - * more memory, but aren't really sure how much. So we - * carefully try to free a _bit_ of our dcache, but not - * too much. - * - * Priority: - * 1 - very urgent: shrink everything - * ... - * 6 - base-level: try to shrink a bit. + * more memory. */ -int shrink_dcache_memory(int priority, unsigned int gfp_mask) +int shrink_dcache_memory(int ratio, unsigned int gfp_mask) { - int count = 0; - + int entries = dentry_stat.nr_dentry / ratio + 1; /* * Nasty deadlock avoidance. * @@ -600,11 +587,8 @@ int shrink_dcache_memory(int priority, unsigned int gfp_mask) if (!(gfp_mask & __GFP_FS)) return 0; - count = dentry_stat.nr_unused / priority; - - prune_dcache(count); - kmem_cache_shrink(dentry_cache); - return 0; + prune_dcache(entries); + return entries; } #define NAME_ALLOC_LEN(len) ((len+16) & ~15) diff --git a/fs/dquot.c b/fs/dquot.c index 58095d92cbee..3b1efaef018a 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -480,26 +480,17 @@ static void prune_dqcache(int count) /* * This is called from kswapd when we think we need some - * more memory, but aren't really sure how much. So we - * carefully try to free a _bit_ of our dqcache, but not - * too much. - * - * Priority: - * 1 - very urgent: shrink everything - * ... - * 6 - base-level: try to shrink a bit. 
+ * more memory */ -int shrink_dqcache_memory(int priority, unsigned int gfp_mask) +int shrink_dqcache_memory(int ratio, unsigned int gfp_mask) { - int count = 0; + int entries = dqstats.allocated_dquots / ratio + 1; lock_kernel(); - count = dqstats.free_dquots / priority; - prune_dqcache(count); + prune_dqcache(entries); unlock_kernel(); - kmem_cache_shrink(dquot_cachep); - return 0; + return entries; } /* diff --git a/fs/inode.c b/fs/inode.c index 89c96e221043..c07e1e7e1a35 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -386,10 +386,11 @@ void prune_icache(int goal) count = 0; entry = inode_unused.prev; - while (entry != &inode_unused) - { + for(; goal; goal--) { struct list_head *tmp = entry; + if (entry == &inode_unused) + break; entry = entry->prev; inode = INODE(tmp); if (inode->i_state & (I_FREEING|I_CLEAR|I_LOCK)) @@ -403,8 +404,6 @@ void prune_icache(int goal) list_add(tmp, freeable); inode->i_state |= I_FREEING; count++; - if (!--goal) - break; } inodes_stat.nr_unused -= count; spin_unlock(&inode_lock); @@ -414,19 +413,11 @@ void prune_icache(int goal) /* * This is called from kswapd when we think we need some - * more memory, but aren't really sure how much. So we - * carefully try to free a _bit_ of our icache, but not - * too much. - * - * Priority: - * 1 - very urgent: shrink everything - * ... - * 6 - base-level: try to shrink a bit. + * more memory. */ -int shrink_icache_memory(int priority, int gfp_mask) +int shrink_icache_memory(int ratio, unsigned int gfp_mask) { - int count = 0; - + int entries = inodes_stat.nr_inodes / ratio + 1; /* * Nasty deadlock avoidance.. * @@ -437,12 +428,10 @@ int shrink_icache_memory(int priority, int gfp_mask) if (!(gfp_mask & __GFP_FS)) return 0; - count = inodes_stat.nr_unused / priority; - - prune_icache(count); - kmem_cache_shrink(inode_cachep); - return 0; + prune_icache(entries); + return entries; } +EXPORT_SYMBOL(shrink_icache_memory); /* * Called with the inode lock held. 
diff --git a/include/linux/dcache.h b/include/linux/dcache.h index f99a03f17e60..a64a657545fe 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -186,7 +186,7 @@ extern int shrink_dcache_memory(int, unsigned int); extern void prune_dcache(int); /* icache memory management (defined in linux/fs/inode.c) */ -extern int shrink_icache_memory(int, int); +extern int shrink_icache_memory(int, unsigned int); extern void prune_icache(int); /* quota cache memory management (defined in linux/fs/dquot.c) */ diff --git a/include/linux/mm.h b/include/linux/mm.h index c63e4947387f..482db998aca7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -524,6 +524,7 @@ extern struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned lon extern struct page * vmalloc_to_page(void *addr); extern unsigned long get_page_cache_size(void); +extern unsigned int nr_used_zone_pages(void); #endif /* __KERNEL__ */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 435a12dd1574..a1cce719581d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -479,6 +479,17 @@ unsigned int nr_free_pages(void) return sum; } +unsigned int nr_used_zone_pages(void) +{ + unsigned int pages = 0; + struct zone *zone; + + for_each_zone(zone) + pages += zone->nr_active + zone->nr_inactive; + + return pages; +} + static unsigned int nr_free_zone_pages(int offset) { pg_data_t *pgdat; diff --git a/mm/slab.c b/mm/slab.c index 549cd2f465ea..962598c0b1b7 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1496,7 +1496,11 @@ static inline void kmem_cache_free_one(kmem_cache_t *cachep, void *objp) if (unlikely(!--slabp->inuse)) { /* Was partial or full, now empty. */ list_del(&slabp->list); - list_add(&slabp->list, &cachep->slabs_free); +/* list_add(&slabp->list, &cachep->slabs_free); */ + if (unlikely(list_empty(&cachep->slabs_partial))) + list_add(&slabp->list, &cachep->slabs_partial); + else + kmem_slab_destroy(cachep, slabp); } else if (unlikely(inuse == cachep->num)) { /* Was full. */ list_del(&slabp->list); @@ -1970,7 +1974,7 @@ static int s_show(struct seq_file *m, void *p) } list_for_each(q,&cachep->slabs_partial) { slabp = list_entry(q, slab_t, list); - if (slabp->inuse == cachep->num || !slabp->inuse) + if (slabp->inuse == cachep->num) BUG(); active_objs += slabp->inuse; active_slabs++; diff --git a/mm/vmscan.c b/mm/vmscan.c index 5eade9423f0d..4302f698a7a4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -70,6 +70,10 @@ #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0) #endif +#ifndef CONFIG_QUOTA +#define shrink_dqcache_memory(ratio, gfp_mask) do { } while (0) +#endif + /* Must be called with page's pte_chain_lock held. 
*/ static inline int page_mapping_inuse(struct page * page) { @@ -97,7 +101,7 @@ static inline int is_page_cache_freeable(struct page *page) static /* inline */ int shrink_list(struct list_head *page_list, int nr_pages, - unsigned int gfp_mask, int *max_scan) + unsigned int gfp_mask, int *max_scan, int *nr_mapped) { struct address_space *mapping; LIST_HEAD(ret_pages); @@ -116,6 +120,10 @@ shrink_list(struct list_head *page_list, int nr_pages, if (TestSetPageLocked(page)) goto keep; + /* Double the slab pressure for mapped and swapcache pages */ + if (page_mapped(page) || PageSwapCache(page)) + (*nr_mapped)++; + BUG_ON(PageActive(page)); may_enter_fs = (gfp_mask & __GFP_FS) || (PageSwapCache(page) && (gfp_mask & __GFP_IO)); @@ -320,7 +328,7 @@ keep: */ static /* inline */ int shrink_cache(int nr_pages, struct zone *zone, - unsigned int gfp_mask, int max_scan) + unsigned int gfp_mask, int max_scan, int *nr_mapped) { LIST_HEAD(page_list); struct pagevec pvec; @@ -371,7 +379,8 @@ shrink_cache(int nr_pages, struct zone *zone, max_scan -= nr_scan; KERNEL_STAT_ADD(pgscan, nr_scan); - nr_pages = shrink_list(&page_list,nr_pages,gfp_mask,&max_scan); + nr_pages = shrink_list(&page_list, nr_pages, + gfp_mask, &max_scan, nr_mapped); if (nr_pages <= 0 && list_empty(&page_list)) goto done; @@ -522,14 +531,10 @@ refill_inactive_zone(struct zone *zone, const int nr_pages_in) static /* inline */ int shrink_zone(struct zone *zone, int max_scan, - unsigned int gfp_mask, int nr_pages) + unsigned int gfp_mask, int nr_pages, int *nr_mapped) { unsigned long ratio; - /* This is bogus for ZONE_HIGHMEM? */ - if (kmem_cache_reap(gfp_mask) >= nr_pages) - return 0; - /* * Try to keep the active list 2/3 of the size of the cache. And * make sure that refill_inactive is given a decent number of pages. @@ -547,7 +552,8 @@ shrink_zone(struct zone *zone, int max_scan, atomic_sub(SWAP_CLUSTER_MAX, &zone->refill_counter); refill_inactive_zone(zone, SWAP_CLUSTER_MAX); } - nr_pages = shrink_cache(nr_pages, zone, gfp_mask, max_scan); + nr_pages = shrink_cache(nr_pages, zone, gfp_mask, + max_scan, nr_mapped); return nr_pages; } @@ -557,6 +563,9 @@ shrink_caches(struct zone *classzone, int priority, { struct zone *first_classzone; struct zone *zone; + int ratio; + int nr_mapped = 0; + int pages = nr_used_zone_pages(); first_classzone = classzone->zone_pgdat->node_zones; for (zone = classzone; zone >= first_classzone; zone--) { @@ -581,16 +590,28 @@ shrink_caches(struct zone *classzone, int priority, max_scan = zone->nr_inactive >> priority; if (max_scan < to_reclaim * 2) max_scan = to_reclaim * 2; - unreclaimed = shrink_zone(zone, max_scan, gfp_mask, to_reclaim); + unreclaimed = shrink_zone(zone, max_scan, + gfp_mask, to_reclaim, &nr_mapped); nr_pages -= to_reclaim - unreclaimed; *total_scanned += max_scan; } - shrink_dcache_memory(priority, gfp_mask); - shrink_icache_memory(1, gfp_mask); -#ifdef CONFIG_QUOTA - shrink_dqcache_memory(DEF_PRIORITY, gfp_mask); -#endif + /* + * Here we assume it costs one seek to replace a lru page and that + * it also takes a seek to recreate a cache object. With this in + * mind we age equal percentages of the lru and ageable caches. + * This should balance the seeks generated by these structures. + * + * NOTE: for now I do this for all zones. If we find this is too + * aggressive on large boxes we may want to exclude ZONE_HIGHMEM + * + * If we're encountering mapped pages on the LRU then increase the + * pressure on slab to avoid swapping. 
+ */ + ratio = (pages / (*total_scanned + nr_mapped + 1)) + 1; + shrink_dcache_memory(ratio, gfp_mask); + shrink_icache_memory(ratio, gfp_mask); + shrink_dqcache_memory(ratio, gfp_mask); return nr_pages; } -- cgit v1.2.3 From 4f3e8109bd947edd8e620f6f6439ecc0f4f7d996 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 25 Sep 2002 07:20:23 -0700 Subject: [PATCH] increase traffic on linux-kernel [This has four scalps already. Thomas Molina has agreed to track things as they are identified ] Infrastructure to detect sleep-inside-spinlock bugs. Really only useful if compiled with CONFIG_PREEMPT=y. It prints out a whiny message and a stack backtrace if someone calls a function which might sleep from within an atomic region. This patch generates a storm of output at boot, due to drivers/ide/ide-probe.c:init_irq() calling lots of things which it shouldn't under ide_lock. It'll find other bugs too. --- include/asm-i386/semaphore.h | 4 ++-- include/linux/kernel.h | 7 +++++++ include/linux/rwsem.h | 2 ++ kernel/ksyms.c | 4 +++- kernel/sched.c | 17 +++++++++++++++++ mm/page_alloc.c | 3 +++ mm/slab.c | 3 +++ 7 files changed, 37 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/asm-i386/semaphore.h b/include/asm-i386/semaphore.h index a0ce1b8dba69..9c456727e8a3 100644 --- a/include/asm-i386/semaphore.h +++ b/include/asm-i386/semaphore.h @@ -116,7 +116,7 @@ static inline void down(struct semaphore * sem) #if WAITQUEUE_DEBUG CHECK_MAGIC(sem->__magic); #endif - + might_sleep(); __asm__ __volatile__( "# atomic down operation\n\t" LOCK "decl %0\n\t" /* --sem->count */ @@ -142,7 +142,7 @@ static inline int down_interruptible(struct semaphore * sem) #if WAITQUEUE_DEBUG CHECK_MAGIC(sem->__magic); #endif - + might_sleep(); __asm__ __volatile__( "# atomic interruptible down operation\n\t" LOCK "decl %1\n\t" /* --sem->count */ diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 5efa540d55f8..44c38b134498 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -40,6 +40,13 @@ struct completion; +#ifdef CONFIG_DEBUG_KERNEL +void __might_sleep(char *file, int line); +#define might_sleep() __might_sleep(__FILE__, __LINE__) +#else +#define might_sleep() do {} while(0) +#endif + extern struct notifier_block *panic_notifier_list; NORET_TYPE void panic(const char * fmt, ...) 
__attribute__ ((NORET_AND format (printf, 1, 2))); diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 4a7e2bb0d7c4..bfb988885002 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -40,6 +40,7 @@ extern void FASTCALL(rwsemtrace(struct rw_semaphore *sem, const char *str)); */ static inline void down_read(struct rw_semaphore *sem) { + might_sleep(); rwsemtrace(sem,"Entering down_read"); __down_read(sem); rwsemtrace(sem,"Leaving down_read"); @@ -62,6 +63,7 @@ static inline int down_read_trylock(struct rw_semaphore *sem) */ static inline void down_write(struct rw_semaphore *sem) { + might_sleep(); rwsemtrace(sem,"Entering down_write"); __down_write(sem); rwsemtrace(sem,"Leaving down_write"); diff --git a/kernel/ksyms.c b/kernel/ksyms.c index cd69b97e8e9d..0409fc676f29 100644 --- a/kernel/ksyms.c +++ b/kernel/ksyms.c @@ -497,7 +497,9 @@ EXPORT_SYMBOL(jiffies_64); EXPORT_SYMBOL(xtime); EXPORT_SYMBOL(do_gettimeofday); EXPORT_SYMBOL(do_settimeofday); - +#ifdef CONFIG_DEBUG_KERNEL +EXPORT_SYMBOL(__might_sleep); +#endif #if !defined(__ia64__) EXPORT_SYMBOL(loops_per_jiffy); #endif diff --git a/kernel/sched.c b/kernel/sched.c index 304f90fd4bdf..9965e5f7549e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2150,3 +2150,20 @@ void __init sched_init(void) enter_lazy_tlb(&init_mm, current, smp_processor_id()); } +#ifdef CONFIG_DEBUG_KERNEL +void __might_sleep(char *file, int line) +{ +#if defined(in_atomic) + static unsigned long prev_jiffy; /* ratelimiting */ + + if (in_atomic()) { + if (time_before(jiffies, prev_jiffy + HZ)) + return; + prev_jiffy = jiffies; + printk("Sleeping function called from illegal" + " context at %s:%d\n", file, line); + dump_stack(); + } +#endif +} +#endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a1cce719581d..ab3284a3b78a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -321,6 +321,9 @@ __alloc_pages(unsigned int gfp_mask, unsigned int order, struct page * page; int freed, i; + if (gfp_mask & __GFP_WAIT) + might_sleep(); + KERNEL_STAT_ADD(pgalloc, 1<zones; /* the list of zones suitable for gfp_mask */ diff --git a/mm/slab.c b/mm/slab.c index 962598c0b1b7..a6bd0a98734b 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1370,6 +1370,9 @@ static inline void * __kmem_cache_alloc (kmem_cache_t *cachep, int flags) unsigned long save_flags; void* objp; + if (flags & __GFP_WAIT) + might_sleep(); + kmem_cache_alloc_head(cachep, flags); try_again: local_irq_save(save_flags); -- cgit v1.2.3
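The same annotation is cheap to adopt outside the core code: any helper that can block may call might_sleep() on its sleeping path, turning a sleep-inside-spinlock bug into an immediate, ratelimited warning with a backtrace instead of a hard-to-reproduce hang. A minimal sketch of a driver-style helper following the conditional idiom used in __alloc_pages() above (struct my_dev, cmd_sem and pending_cmd are invented):

/* Sketch only: my_dev and its fields are illustrative, not from the patch. */
#include <linux/kernel.h>       /* might_sleep() */
#include <linux/errno.h>
#include <asm/semaphore.h>

struct my_dev {
        struct semaphore cmd_sem;
        int pending_cmd;
};

static int my_dev_do_command(struct my_dev *dev, int cmd, int can_wait)
{
        if (can_wait) {
                might_sleep();          /* only the blocking path is checked,
                                         * as in __alloc_pages() */
                down(&dev->cmd_sem);    /* may sleep */
        } else if (down_trylock(&dev->cmd_sem))
                return -EAGAIN;

        dev->pending_cmd = cmd;
        up(&dev->cmd_sem);
        return 0;
}

Note that with this patch down() itself calls might_sleep() on i386, so the explicit call above mainly documents that the helper as a whole may block when can_wait is set.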