diff -Nur linux-2.4.33-imedia/CREDITS linux-2.4.33-imedia-patching/CREDITS --- linux-2.4.33-imedia/CREDITS 2005-01-19 16:09:22.000000000 +0200 +++ linux-2.4.33-imedia-patching/CREDITS 2006-01-26 15:19:42.000000000 +0200 @@ -1007,8 +1007,8 @@ N: Nigel Gamble E: nigel@nrg.org -E: nigel@sgi.com D: Interrupt-driven printer driver +D: Preemptible kernel S: 120 Alley Way S: Mountain View, California 94040 S: USA diff -Nur linux-2.4.33-imedia/Documentation/Configure.help linux-2.4.33-imedia-patching/Documentation/Configure.help --- linux-2.4.33-imedia/Documentation/Configure.help 2006-01-11 20:29:27.000000000 +0200 +++ linux-2.4.33-imedia-patching/Documentation/Configure.help 2006-01-26 15:19:42.000000000 +0200 @@ -296,6 +296,17 @@ If you have a system with several CPUs, you do not need to say Y here: the local APIC will be used automatically. +Preemptible Kernel +CONFIG_PREEMPT + This option reduces the latency of the kernel when reacting to + real-time or interactive events by allowing a low priority process to + be preempted even if it is in kernel mode executing a system call. + This allows applications to run more reliably even when the system is + under load. + + Say Y here if you are building a kernel for a desktop, embedded or + real-time system. Say N if you are unsure. + Kernel math emulation CONFIG_MATH_EMULATION Linux can emulate a math coprocessor (used for floating point diff -Nur linux-2.4.33-imedia/Documentation/preempt-locking.txt linux-2.4.33-imedia-patching/Documentation/preempt-locking.txt --- linux-2.4.33-imedia/Documentation/preempt-locking.txt 1970-01-01 02:00:00.000000000 +0200 +++ linux-2.4.33-imedia-patching/Documentation/preempt-locking.txt 2006-01-26 15:19:42.000000000 +0200 @@ -0,0 +1,104 @@ + Proper Locking Under a Preemptible Kernel: + Keeping Kernel Code Preempt-Safe + Robert Love + Last Updated: 22 Jan 2002 + + +INTRODUCTION + + +A preemptible kernel creates new locking issues. The issues are the same as +those under SMP: concurrency and reentrancy. Thankfully, the Linux preemptible +kernel model leverages existing SMP locking mechanisms. Thus, the kernel +requires explicit additional locking for very few additional situations. + +This document is for all kernel hackers. Developing code in the kernel +requires protecting these situations. + + +RULE #1: Per-CPU data structures need explicit protection + + +Two similar problems arise. An example code snippet: + + struct this_needs_locking tux[NR_CPUS]; + tux[smp_processor_id()] = some_value; + /* task is preempted here... */ + something = tux[smp_processor_id()]; + +First, since the data is per-CPU, it may not have explicit SMP locking, but +require it otherwise. Second, when a preempted task is finally rescheduled, +the previous value of smp_processor_id may not equal the current. You must +protect these situations by disabling preemption around them. + + +RULE #2: CPU state must be protected. + + +Under preemption, the state of the CPU must be protected. This is arch- +dependent, but includes CPU structures and state not preserved over a context +switch. For example, on x86, entering and exiting FPU mode is now a critical +section that must occur while preemption is disabled. Think what would happen +if the kernel is executing a floating-point instruction and is then preempted. +Remember, the kernel does not save FPU state except for user tasks. Therefore, +upon preemption, the FPU registers will be sold to the lowest bidder. Thus, +preemption must be disabled around such regions. 
+ +Note, some FPU functions are already explicitly preempt safe. For example, +kernel_fpu_begin and kernel_fpu_end will disable and enable preemption. +However, math_state_restore must be called with preemption disabled. + + +RULE #3: Lock acquire and release must be performed by same task + + +A lock acquired in one task must be released by the same task. This +means you can't do oddball things like acquire a lock and go off to +play while another task releases it. If you want to do something +like this, acquire and release the task in the same code path and +have the caller wait on an event by the other task. + + +SOLUTION + + +Data protection under preemption is achieved by disabling preemption for the +duration of the critical region. + +preempt_enable() decrement the preempt counter +preempt_disable() increment the preempt counter +preempt_enable_no_resched() decrement, but do not immediately preempt +preempt_get_count() return the preempt counter + +The functions are nestable. In other words, you can call preempt_disable +n-times in a code path, and preemption will not be reenabled until the n-th +call to preempt_enable. The preempt statements define to nothing if +preemption is not enabled. + +Note that you do not need to explicitly prevent preemption if you are holding +any locks or interrupts are disabled, since preemption is implicitly disabled +in those cases. + +Example: + + cpucache_t *cc; /* this is per-CPU */ + preempt_disable(); + cc = cc_data(searchp); + if (cc && cc->avail) { + __free_block(searchp, cc_entry(cc), cc->avail); + cc->avail = 0; + } + preempt_enable(); + return 0; + +Notice how the preemption statements must encompass every reference of the +critical variables. Another example: + + int buf[NR_CPUS]; + set_cpu_val(buf); + if (buf[smp_processor_id()] == -1) printf(KERN_INFO "wee!\n"); + spin_lock(&buf_lock); + /* ... */ + +This code is not preempt-safe, but see how easily we can fix it by simply +moving the spin_lock up two lines. diff -Nur linux-2.4.33-imedia/Documentation/sched-coding.txt linux-2.4.33-imedia-patching/Documentation/sched-coding.txt --- linux-2.4.33-imedia/Documentation/sched-coding.txt 1970-01-01 02:00:00.000000000 +0200 +++ linux-2.4.33-imedia-patching/Documentation/sched-coding.txt 2006-01-26 15:19:42.000000000 +0200 @@ -0,0 +1,126 @@ + Reference for various scheduler-related methods in the O(1) scheduler + Robert Love , MontaVista Software + + +Note most of these methods are local to kernel/sched.c - this is by design. +The scheduler is meant to be self-contained and abstracted away. This document +is primarily for understanding the scheduler, not interfacing to it. Some of +the discussed interfaces, however, are general process/scheduling methods. +They are typically defined in include/linux/sched.h. + + +Main Scheduling Methods +----------------------- + +void load_balance(runqueue_t *this_rq, int idle) + Attempts to pull tasks from one cpu to another to balance cpu usage, + if needed. This method is called explicitly if the runqueues are + inbalanced or periodically by the timer tick. Prior to calling, + the current runqueue must be locked and interrupts disabled. + +void schedule() + The main scheduling function. Upon return, the highest priority + process will be active. + + +Locking +------- + +Each runqueue has its own lock, rq->lock. When multiple runqueues need +to be locked, lock acquires must be ordered by ascending &runqueue value. 
+ +A specific runqueue is locked via + + task_rq_lock(task_t pid, unsigned long *flags) + +which disables preemption, disables interrupts, and locks the runqueue pid is +running on. Likewise, + + task_rq_unlock(task_t pid, unsigned long *flags) + +unlocks the runqueue pid is running on, restores interrupts to their previous +state, and reenables preemption. + +The routines + + double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) + +and + + double_rq_unlock(runqueue_t *rq1, runqueue_t rq2) + +safely lock and unlock, respectively, the two specified runqueues. They do +not, however, disable and restore interrupts. Users are required to do so +manually before and after calls. + + +Values +------ + +MAX_PRIO + The maximum priority of the system, stored in the task as task->prio. + Lower priorities are higher. Normal (non-RT) priorities range from + MAX_RT_PRIO to (MAX_PRIO - 1). +MAX_RT_PRIO + The maximum real-time priority of the system. Valid RT priorities + range from 0 to (MAX_RT_PRIO - 1). +MAX_USER_RT_PRIO + The maximum real-time priority that is exported to user-space. Should + always be equal to or less than MAX_RT_PRIO. Setting it less allows + kernel threads to have higher priorities than any user-space task. +MIN_TIMESLICE +MAX_TIMESLICE + Respectively, the minimum and maximum timeslices (quanta) of a process. + +Data +---- + +struct runqueue + The main per-CPU runqueue data structure. +struct task_struct + The main per-process data structure. + + +General Methods +--------------- + +cpu_rq(cpu) + Returns the runqueue of the specified cpu. +this_rq() + Returns the runqueue of the current cpu. +task_rq(pid) + Returns the runqueue which holds the specified pid. +cpu_curr(cpu) + Returns the task currently running on the given cpu. +rt_task(pid) + Returns true if pid is real-time, false if not. + + +Process Control Methods +----------------------- + +void set_user_nice(task_t *p, long nice) + Sets the "nice" value of task p to the given value. +int setscheduler(pid_t pid, int policy, struct sched_param *param) + Sets the scheduling policy and parameters for the given pid. +void set_cpus_allowed(task_t *p, unsigned long new_mask) + Sets a given task's CPU affinity and migrates it to a proper cpu. + Callers must have a valid reference to the task and assure the + task not exit prematurely. No locks can be held during the call. +set_task_state(tsk, state_value) + Sets the given task's state to the given value. +set_current_state(state_value) + Sets the current task's state to the given value. +void set_tsk_need_resched(struct task_struct *tsk) + Sets need_resched in the given task. +void clear_tsk_need_resched(struct task_struct *tsk) + Clears need_resched in the given task. +void set_need_resched() + Sets need_resched in the current task. +void clear_need_resched() + Clears need_resched in the current task. +int need_resched() + Returns true if need_resched is set in the current task, false + otherwise. +yield() + Place the current process at the end of the runqueue and call schedule. diff -Nur linux-2.4.33-imedia/Documentation/sched-design.txt linux-2.4.33-imedia-patching/Documentation/sched-design.txt --- linux-2.4.33-imedia/Documentation/sched-design.txt 1970-01-01 02:00:00.000000000 +0200 +++ linux-2.4.33-imedia-patching/Documentation/sched-design.txt 2006-01-26 15:19:42.000000000 +0200 @@ -0,0 +1,165 @@ + Goals, Design and Implementation of the + new ultra-scalable O(1) scheduler + + + This is an edited version of an email Ingo Molnar sent to + lkml on 4 Jan 2002. 
It describes the goals, design, and + implementation of Ingo's new ultra-scalable O(1) scheduler. + Last Updated: 18 April 2002. + + +Goal +==== + +The main goal of the new scheduler is to keep all the good things we know +and love about the current Linux scheduler: + + - good interactive performance even during high load: if the user + types or clicks then the system must react instantly and must execute + the user tasks smoothly, even during considerable background load. + + - good scheduling/wakeup performance with 1-2 runnable processes. + + - fairness: no process should stay without any timeslice for any + unreasonable amount of time. No process should get an unjustly high + amount of CPU time. + + - priorities: less important tasks can be started with lower priority, + more important tasks with higher priority. + + - SMP efficiency: no CPU should stay idle if there is work to do. + + - SMP affinity: processes which run on one CPU should stay affine to + that CPU. Processes should not bounce between CPUs too frequently. + + - plus additional scheduler features: RT scheduling, CPU binding. + +and the goal is also to add a few new things: + + - fully O(1) scheduling. Are you tired of the recalculation loop + blowing the L1 cache away every now and then? Do you think the goodness + loop is taking a bit too long to finish if there are lots of runnable + processes? This new scheduler takes no prisoners: wakeup(), schedule(), + the timer interrupt are all O(1) algorithms. There is no recalculation + loop. There is no goodness loop either. + + - 'perfect' SMP scalability. With the new scheduler there is no 'big' + runqueue_lock anymore - it's all per-CPU runqueues and locks - two + tasks on two separate CPUs can wake up, schedule and context-switch + completely in parallel, without any interlocking. All + scheduling-relevant data is structured for maximum scalability. + + - better SMP affinity. The old scheduler has a particular weakness that + causes the random bouncing of tasks between CPUs if/when higher + priority/interactive tasks, this was observed and reported by many + people. The reason is that the timeslice recalculation loop first needs + every currently running task to consume its timeslice. But when this + happens on eg. an 8-way system, then this property starves an + increasing number of CPUs from executing any process. Once the last + task that has a timeslice left has finished using up that timeslice, + the recalculation loop is triggered and other CPUs can start executing + tasks again - after having idled around for a number of timer ticks. + The more CPUs, the worse this effect. + + Furthermore, this same effect causes the bouncing effect as well: + whenever there is such a 'timeslice squeeze' of the global runqueue, + idle processors start executing tasks which are not affine to that CPU. + (because the affine tasks have finished off their timeslices already.) + + The new scheduler solves this problem by distributing timeslices on a + per-CPU basis, without having any global synchronization or + recalculation. + + - batch scheduling. A significant proportion of computing-intensive tasks + benefit from batch-scheduling, where timeslices are long and processes + are roundrobin scheduled. The new scheduler does such batch-scheduling + of the lowest priority tasks - so nice +19 jobs will get + 'batch-scheduled' automatically. With this scheduler, nice +19 jobs are + in essence SCHED_IDLE, from an interactiveness point of view. 
+ + - handle extreme loads more smoothly, without breakdown and scheduling + storms. + + - O(1) RT scheduling. For those RT folks who are paranoid about the + O(nr_running) property of the goodness loop and the recalculation loop. + + - run fork()ed children before the parent. Andrea has pointed out the + advantages of this a few months ago, but patches for this feature + do not work with the old scheduler as well as they should, + because idle processes often steal the new child before the fork()ing + CPU gets to execute it. + + +Design +====== + +the core of the new scheduler are the following mechanizms: + + - *two*, priority-ordered 'priority arrays' per CPU. There is an 'active' + array and an 'expired' array. The active array contains all tasks that + are affine to this CPU and have timeslices left. The expired array + contains all tasks which have used up their timeslices - but this array + is kept sorted as well. The active and expired array is not accessed + directly, it's accessed through two pointers in the per-CPU runqueue + structure. If all active tasks are used up then we 'switch' the two + pointers and from now on the ready-to-go (former-) expired array is the + active array - and the empty active array serves as the new collector + for expired tasks. + + - there is a 64-bit bitmap cache for array indices. Finding the highest + priority task is thus a matter of two x86 BSFL bit-search instructions. + +the split-array solution enables us to have an arbitrary number of active +and expired tasks, and the recalculation of timeslices can be done +immediately when the timeslice expires. Because the arrays are always +access through the pointers in the runqueue, switching the two arrays can +be done very quickly. + +this is a hybride priority-list approach coupled with roundrobin +scheduling and the array-switch method of distributing timeslices. + + - there is a per-task 'load estimator'. + +one of the toughest things to get right is good interactive feel during +heavy system load. While playing with various scheduler variants i found +that the best interactive feel is achieved not by 'boosting' interactive +tasks, but by 'punishing' tasks that want to use more CPU time than there +is available. This method is also much easier to do in an O(1) fashion. + +to establish the actual 'load' the task contributes to the system, a +complex-looking but pretty accurate method is used: there is a 4-entry +'history' ringbuffer of the task's activities during the last 4 seconds. +This ringbuffer is operated without much overhead. The entries tell the +scheduler a pretty accurate load-history of the task: has it used up more +CPU time or less during the past N seconds. [the size '4' and the interval +of 4x 1 seconds was found by lots of experimentation - this part is +flexible and can be changed in both directions.] + +the penalty a task gets for generating more load than the CPU can handle +is a priority decrease - there is a maximum amount to this penalty +relative to their static priority, so even fully CPU-bound tasks will +observe each other's priorities, and will share the CPU accordingly. + +the SMP load-balancer can be extended/switched with additional parallel +computing and cache hierarchy concepts: NUMA scheduling, multi-core CPUs +can be supported easily by changing the load-balancer. Right now it's +tuned for my SMP systems. + +i skipped the prev->mm == next->mm advantage - no workload i know of shows +any sensitivity to this. 
It can be added back by sacrificing O(1) +schedule() [the current and one-lower priority list can be searched for a +that->mm == current->mm condition], but costs a fair number of cycles +during a number of important workloads, so i wanted to avoid this as much +as possible. + +- the SMP idle-task startup code was still racy and the new scheduler +triggered this. So i streamlined the idle-setup code a bit. We do not call +into schedule() before all processors have started up fully and all idle +threads are in place. + +- the patch also cleans up a number of aspects of sched.c - moves code +into other areas of the kernel where it's appropriate, and simplifies +certain code paths and data constructs. As a result, the new scheduler's +code is smaller than the old one. + + Ingo diff -Nur linux-2.4.33-imedia/MAINTAINERS linux-2.4.33-imedia-patching/MAINTAINERS --- linux-2.4.33-imedia/MAINTAINERS 2005-11-16 21:12:54.000000000 +0200 +++ linux-2.4.33-imedia-patching/MAINTAINERS 2006-01-26 15:19:43.000000000 +0200 @@ -1535,6 +1535,14 @@ W: http://prism54.org S: Maintained +PREEMPTIBLE KERNEL +P: Robert M. Love +M: rml@tech9.net +L: linux-kernel@vger.kernel.org +L: kpreempt-tech@lists.sourceforge.net +W: http://tech9.net/rml/linux +S: Supported + PROMISE DC4030 CACHING DISK CONTROLLER DRIVER P: Peter Denison M: promise@pnd-pc.demon.co.uk @@ -1624,6 +1632,14 @@ L: linux-kernel@vger.kernel.org S: Maintained +SCHEDULER +P: Ingo Molnar +M: mingo@elte.hu +P: Robert Love +M: rml@tech9.net +L: linux-kernel@vger.kernel.org +S: Maintained + SC1200 WDT DRIVER P: Zwane Mwaikambo M: zwane@commfireservices.com diff -Nur linux-2.4.33-imedia/Makefile linux-2.4.33-imedia-patching/Makefile --- linux-2.4.33-imedia/Makefile 2006-01-11 20:29:28.000000000 +0200 +++ linux-2.4.33-imedia-patching/Makefile 2006-01-26 15:19:53.000000000 +0200 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 4 SUBLEVEL = 33 -EXTRAVERSION = -pre1 +EXTRAVERSION = -imedia KERNELRELEASE=$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)$(EXTRAVERSION) diff -Nur linux-2.4.33-imedia/arch/alpha/kernel/entry.S linux-2.4.33-imedia-patching/arch/alpha/kernel/entry.S --- linux-2.4.33-imedia/arch/alpha/kernel/entry.S 2003-06-13 17:51:29.000000000 +0300 +++ linux-2.4.33-imedia-patching/arch/alpha/kernel/entry.S 2006-01-26 15:19:42.000000000 +0200 @@ -690,6 +690,7 @@ .end entSys .globl ret_from_fork +#if CONFIG_SMP .align 3 .ent ret_from_fork ret_from_fork: @@ -697,6 +698,9 @@ mov $17,$16 jsr $31,schedule_tail .end ret_from_fork +#else +ret_from_fork = ret_from_sys_call +#endif .align 3 .ent reschedule diff -Nur linux-2.4.33-imedia/arch/alpha/kernel/process.c linux-2.4.33-imedia-patching/arch/alpha/kernel/process.c --- linux-2.4.33-imedia/arch/alpha/kernel/process.c 2003-08-25 14:44:39.000000000 +0300 +++ linux-2.4.33-imedia-patching/arch/alpha/kernel/process.c 2006-01-26 15:19:42.000000000 +0200 @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -74,9 +75,6 @@ cpu_idle(void) { /* An endless idle loop with no priority at all. */ - current->nice = 20; - current->counter = -100; - while (1) { /* FIXME -- EV6 and LCA45 know how to power down the CPU. 
*/ @@ -186,6 +184,7 @@ args.mode = mode; args.restart_cmd = restart_cmd; #ifdef CONFIG_SMP + preempt_disable(); smp_call_function(common_shutdown_1, &args, 1, 0); #endif common_shutdown_1(&args); diff -Nur linux-2.4.33-imedia/arch/alpha/kernel/smp.c linux-2.4.33-imedia-patching/arch/alpha/kernel/smp.c --- linux-2.4.33-imedia/arch/alpha/kernel/smp.c 2003-06-13 17:51:29.000000000 +0300 +++ linux-2.4.33-imedia-patching/arch/alpha/kernel/smp.c 2006-01-26 15:19:42.000000000 +0200 @@ -81,6 +81,7 @@ int smp_num_probed; /* Internal processor count */ int smp_num_cpus = 1; /* Number that came online. */ int smp_threads_ready; /* True once the per process idle is forked. */ +unsigned long cache_decay_ticks; int __cpu_number_map[NR_CPUS]; int __cpu_logical_map[NR_CPUS]; @@ -155,11 +156,6 @@ { int cpuid = hard_smp_processor_id(); - if (current != init_tasks[cpu_number_map(cpuid)]) { - printk("BUG: smp_calling: cpu %d current %p init_tasks[cpu_number_map(cpuid)] %p\n", - cpuid, current, init_tasks[cpu_number_map(cpuid)]); - } - DBGS(("CALLIN %d state 0x%lx\n", cpuid, current->state)); /* Turn on machine checks. */ @@ -217,9 +213,6 @@ DBGS(("smp_callin: commencing CPU %d current %p\n", cpuid, current)); - /* Setup the scheduler for this processor. */ - init_idle(); - /* ??? This should be in init_idle. */ atomic_inc(&init_mm.mm_count); current->active_mm = &init_mm; @@ -449,14 +442,11 @@ if (idle == &init_task) panic("idle process is init_task for CPU %d", cpuid); - idle->processor = cpuid; - idle->cpus_runnable = 1 << cpuid; /* we schedule the first task manually */ + init_idle(idle, cpuid); + unhash_process(idle); + __cpu_logical_map[cpunum] = cpuid; __cpu_number_map[cpuid] = cpunum; - - del_from_runqueue(idle); - unhash_process(idle); - init_tasks[cpunum] = idle; DBGS(("smp_boot_one_cpu: CPU %d state 0x%lx flags 0x%lx\n", cpuid, idle->state, idle->flags)); @@ -563,13 +553,10 @@ __cpu_number_map[boot_cpuid] = 0; __cpu_logical_map[0] = boot_cpuid; - current->processor = boot_cpuid; smp_store_cpu_info(boot_cpuid); smp_setup_percpu_timer(boot_cpuid); - init_idle(); - /* ??? This should be in init_idle. 
*/ atomic_inc(&init_mm.mm_count); current->active_mm = &init_mm; diff -Nur linux-2.4.33-imedia/arch/arm/kernel/process.c linux-2.4.33-imedia-patching/arch/arm/kernel/process.c --- linux-2.4.33-imedia/arch/arm/kernel/process.c 2003-08-25 14:44:39.000000000 +0300 +++ linux-2.4.33-imedia-patching/arch/arm/kernel/process.c 2006-01-26 15:19:42.000000000 +0200 @@ -87,8 +87,6 @@ { /* endless idle loop with no priority at all */ init_idle(); - current->nice = 20; - current->counter = -100; while (1) { void (*idle)(void) = pm_idle; diff -Nur linux-2.4.33-imedia/arch/cris/kernel/process.c linux-2.4.33-imedia-patching/arch/cris/kernel/process.c --- linux-2.4.33-imedia/arch/cris/kernel/process.c 2003-08-25 14:44:39.000000000 +0300 +++ linux-2.4.33-imedia-patching/arch/cris/kernel/process.c 2006-01-26 15:19:42.000000000 +0200 @@ -163,8 +163,6 @@ { /* endless idle loop with no priority at all */ init_idle(); - current->nice = 20; - current->counter = -100; while(1) { void (*idle)(void) = pm_idle; diff -Nur linux-2.4.33-imedia/arch/i386/config.in linux-2.4.33-imedia-patching/arch/i386/config.in --- linux-2.4.33-imedia/arch/i386/config.in 2004-11-17 13:54:21.000000000 +0200 +++ linux-2.4.33-imedia-patching/arch/i386/config.in 2006-01-26 15:19:42.000000000 +0200 @@ -25,6 +25,9 @@ mainmenu_option next_comment comment 'Processor type and features' +bool 'Low latency scheduling' CONFIG_LOLAT +dep_bool 'Control low latency with sysctl' CONFIG_LOLAT_SYSCTL $CONFIG_LOLAT + choice 'Processor family' \ "386 CONFIG_M386 \ 486 CONFIG_M486 \ @@ -225,6 +228,7 @@ bool 'Math emulation' CONFIG_MATH_EMULATION bool 'MTRR (Memory Type Range Register) support' CONFIG_MTRR bool 'Symmetric multi-processing support' CONFIG_SMP +bool 'Preemptible Kernel' CONFIG_PREEMPT if [ "$CONFIG_SMP" != "y" ]; then bool 'Local APIC support on uniprocessors' CONFIG_X86_UP_APIC dep_bool 'IO-APIC support on uniprocessors' CONFIG_X86_UP_IOAPIC $CONFIG_X86_UP_APIC @@ -258,9 +262,12 @@ fi fi -if [ "$CONFIG_SMP" = "y" -a "$CONFIG_X86_CMPXCHG" = "y" ]; then - define_bool CONFIG_HAVE_DEC_LOCK y +if [ "$CONFIG_SMP" = "y" -o "$CONFIG_PREEMPT" = "y" ]; then + if [ "$CONFIG_X86_CMPXCHG" = "y" ]; then + define_bool CONFIG_HAVE_DEC_LOCK y + fi fi + endmenu mainmenu_option next_comment diff -Nur linux-2.4.33-imedia/arch/i386/defconfig linux-2.4.33-imedia-patching/arch/i386/defconfig --- linux-2.4.33-imedia/arch/i386/defconfig 2005-01-19 16:09:25.000000000 +0200 +++ linux-2.4.33-imedia-patching/arch/i386/defconfig 2006-01-26 15:19:42.000000000 +0200 @@ -64,7 +64,9 @@ # CONFIG_HIGHMEM is not set # CONFIG_MATH_EMULATION is not set # CONFIG_MTRR is not set -CONFIG_SMP=y +# CONFIG_SMP is not set +CONFIG_PREEMPT=y +CONFIG_LOWLAT=y CONFIG_NR_CPUS=32 # CONFIG_X86_NUMA is not set # CONFIG_X86_TSC_DISABLE is not set diff -Nur linux-2.4.33-imedia/arch/i386/kernel/cpuid.c linux-2.4.33-imedia-patching/arch/i386/kernel/cpuid.c --- linux-2.4.33-imedia/arch/i386/kernel/cpuid.c 2001-10-11 19:04:57.000000000 +0300 +++ linux-2.4.33-imedia-patching/arch/i386/kernel/cpuid.c 2006-01-26 15:19:42.000000000 +0200 @@ -60,7 +60,8 @@ static inline void do_cpuid(int cpu, u32 reg, u32 *data) { struct cpuid_command cmd; - + + preempt_disable(); if ( cpu == smp_processor_id() ) { cpuid(reg, &data[0], &data[1], &data[2], &data[3]); } else { @@ -70,6 +71,7 @@ smp_call_function(cpuid_smp_cpuid, &cmd, 1, 1); } + preempt_enable(); } #else /* ! 
CONFIG_SMP */ diff -Nur linux-2.4.33-imedia/arch/i386/kernel/entry.S linux-2.4.33-imedia-patching/arch/i386/kernel/entry.S --- linux-2.4.33-imedia/arch/i386/kernel/entry.S 2003-06-13 17:51:29.000000000 +0300 +++ linux-2.4.33-imedia-patching/arch/i386/kernel/entry.S 2006-01-26 15:19:42.000000000 +0200 @@ -73,16 +73,36 @@ * these are offsets into the task-struct. */ state = 0 -flags = 4 +preempt_count = 4 sigpending = 8 addr_limit = 12 exec_domain = 16 need_resched = 20 tsk_ptrace = 24 -processor = 52 +cpu = 32 + +/* These are offsets into the irq_stat structure + * There is one per cpu and it is aligned to 32 + * byte boundry (we put that here as a shift count) + */ +irq_array_shift = CONFIG_X86_L1_CACHE_SHIFT + +irq_stat_local_irq_count = 4 +irq_stat_local_bh_count = 8 ENOSYS = 38 +#ifdef CONFIG_SMP +#define GET_CPU_INDX movl cpu(%ebx),%eax; \ + shll $irq_array_shift,%eax +#define GET_CURRENT_CPU_INDX GET_CURRENT(%ebx); \ + GET_CPU_INDX +#define CPU_INDX (,%eax) +#else +#define GET_CPU_INDX +#define GET_CURRENT_CPU_INDX GET_CURRENT(%ebx) +#define CPU_INDX +#endif #define SAVE_ALL \ cld; \ @@ -184,9 +204,11 @@ ENTRY(ret_from_fork) +#if CONFIG_SMP pushl %ebx call SYMBOL_NAME(schedule_tail) addl $4, %esp +#endif GET_CURRENT(%ebx) testb $0x02,tsk_ptrace(%ebx) # PT_TRACESYS jne tracesys_exit @@ -255,16 +277,43 @@ ALIGN ENTRY(ret_from_intr) GET_CURRENT(%ebx) +#ifdef CONFIG_PREEMPT + cli + decl preempt_count(%ebx) +#endif ret_from_exception: movl EFLAGS(%esp),%eax # mix EFLAGS and CS movb CS(%esp),%al testl $(VM_MASK | 3),%eax # return to VM86 mode or non-supervisor? jne ret_from_sys_call +#ifdef CONFIG_PREEMPT + cmpl $0,preempt_count(%ebx) + jnz restore_all + cmpl $0,need_resched(%ebx) + jz restore_all + movl SYMBOL_NAME(irq_stat)+irq_stat_local_bh_count CPU_INDX,%ecx + addl SYMBOL_NAME(irq_stat)+irq_stat_local_irq_count CPU_INDX,%ecx + jnz restore_all + incl preempt_count(%ebx) + sti + call SYMBOL_NAME(preempt_schedule) + jmp ret_from_intr +#else jmp restore_all +#endif ALIGN reschedule: - call SYMBOL_NAME(schedule) # test + movl EFLAGS(%esp),%eax # mix EFLAGS and CS + movb CS(%esp),%al + testl $(VM_MASK | 3),%eax # return to VM86 mode or non-supervisor? 
+ jne userspace_resched + + call SYMBOL_NAME(schedule) + jmp ret_from_sys_call + +userspace_resched: + call SYMBOL_NAME(schedule_userspace) jmp ret_from_sys_call ENTRY(divide_error) @@ -297,6 +346,9 @@ GET_CURRENT(%ebx) call *%edi addl $8,%esp +#ifdef CONFIG_PREEMPT + cli +#endif jmp ret_from_exception ENTRY(coprocessor_error) @@ -316,12 +368,18 @@ movl %cr0,%eax testl $0x4,%eax # EM (math emulation bit) jne device_not_available_emulate +#ifdef CONFIG_PREEMPT + cli +#endif call SYMBOL_NAME(math_state_restore) jmp ret_from_exception device_not_available_emulate: pushl $0 # temporary storage for ORIG_EIP call SYMBOL_NAME(math_emulate) addl $4,%esp +#ifdef CONFIG_PREEMPT + cli +#endif jmp ret_from_exception ENTRY(debug) @@ -645,8 +703,8 @@ .long SYMBOL_NAME(sys_tkill) .long SYMBOL_NAME(sys_sendfile64) .long SYMBOL_NAME(sys_ni_syscall) /* 240 reserved for futex */ - .long SYMBOL_NAME(sys_ni_syscall) /* reserved for sched_setaffinity */ - .long SYMBOL_NAME(sys_ni_syscall) /* reserved for sched_getaffinity */ + .long SYMBOL_NAME(sys_sched_setaffinity) + .long SYMBOL_NAME(sys_sched_getaffinity) .long SYMBOL_NAME(sys_ni_syscall) /* sys_set_thread_area */ .long SYMBOL_NAME(sys_ni_syscall) /* sys_get_thread_area */ .long SYMBOL_NAME(sys_ni_syscall) /* 245 sys_io_setup */ diff -Nur linux-2.4.33-imedia/arch/i386/kernel/i387.c linux-2.4.33-imedia-patching/arch/i386/kernel/i387.c --- linux-2.4.33-imedia/arch/i386/kernel/i387.c 2005-04-04 04:42:19.000000000 +0300 +++ linux-2.4.33-imedia-patching/arch/i386/kernel/i387.c 2006-01-26 15:19:42.000000000 +0200 @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -89,6 +90,8 @@ { struct task_struct *tsk = current; + preempt_disable(); + if (tsk->flags & PF_USEDFPU) { __save_init_fpu(tsk); return; diff -Nur linux-2.4.33-imedia/arch/i386/kernel/ioport.c linux-2.4.33-imedia-patching/arch/i386/kernel/ioport.c --- linux-2.4.33-imedia/arch/i386/kernel/ioport.c 2003-06-13 17:51:29.000000000 +0300 +++ linux-2.4.33-imedia-patching/arch/i386/kernel/ioport.c 2006-01-26 15:19:42.000000000 +0200 @@ -55,7 +55,7 @@ asmlinkage int sys_ioperm(unsigned long from, unsigned long num, int turn_on) { struct thread_struct * t = ¤t->thread; - struct tss_struct * tss = init_tss + smp_processor_id(); + struct tss_struct * tss; if ((from + num <= from) || (from + num > IO_BITMAP_SIZE*32)) return -EINVAL; @@ -66,6 +66,8 @@ * IO bitmap up. ioperm() is much less timing critical than clone(), * this is why we delay this operation until now: */ + preempt_disable(); + tss = init_tss + smp_processor_id(); if (!t->ioperm) { /* * just in case ... 
@@ -84,6 +86,7 @@ memcpy(tss->io_bitmap, t->io_bitmap, IO_BITMAP_BYTES); tss->bitmap = IO_BITMAP_OFFSET; /* Activate it in the TSS */ } + preempt_enable(); return 0; } diff -Nur linux-2.4.33-imedia/arch/i386/kernel/irq.c linux-2.4.33-imedia-patching/arch/i386/kernel/irq.c --- linux-2.4.33-imedia/arch/i386/kernel/irq.c 2003-11-28 20:26:19.000000000 +0200 +++ linux-2.4.33-imedia-patching/arch/i386/kernel/irq.c 2006-01-26 15:19:42.000000000 +0200 @@ -284,9 +284,11 @@ show("wait_on_irq"); count = ~0; } + preempt_disable(); __sti(); SYNC_OTHER_CORES(cpu); __cli(); + preempt_enable_no_resched(); if (irqs_running()) continue; if (global_irq_lock) @@ -360,8 +362,9 @@ __save_flags(flags); if (flags & (1 << EFLAGS_IF_SHIFT)) { - int cpu = smp_processor_id(); + int cpu; __cli(); + cpu = smp_processor_id(); if (!local_irq_count(cpu)) get_irqlock(cpu); } @@ -369,11 +372,14 @@ void __global_sti(void) { - int cpu = smp_processor_id(); + int cpu; + preempt_disable(); + cpu = smp_processor_id(); if (!local_irq_count(cpu)) release_irqlock(cpu); __sti(); + preempt_enable(); } /* @@ -388,13 +394,15 @@ int retval; int local_enabled; unsigned long flags; - int cpu = smp_processor_id(); + int cpu; __save_flags(flags); local_enabled = (flags >> EFLAGS_IF_SHIFT) & 1; /* default to local */ retval = 2 + local_enabled; + preempt_disable(); + cpu = smp_processor_id(); /* check for global flags if we're not in an interrupt */ if (!local_irq_count(cpu)) { if (local_enabled) @@ -402,6 +410,7 @@ if (global_irq_holder == cpu) retval = 0; } + preempt_enable(); return retval; } diff -Nur linux-2.4.33-imedia/arch/i386/kernel/ldt.c linux-2.4.33-imedia-patching/arch/i386/kernel/ldt.c --- linux-2.4.33-imedia/arch/i386/kernel/ldt.c 2004-02-18 15:36:30.000000000 +0200 +++ linux-2.4.33-imedia-patching/arch/i386/kernel/ldt.c 2006-01-26 15:19:42.000000000 +0200 @@ -190,6 +190,7 @@ goto out; } + preempt_disable(); down(&mm->context.sem); if (ldt_info.entry_number >= mm->context.size) { error = alloc_ldt(¤t->mm->context, ldt_info.entry_number+1, 1); @@ -236,6 +237,7 @@ out_unlock: up(&mm->context.sem); + preempt_enable(); out: return error; } diff -Nur linux-2.4.33-imedia/arch/i386/kernel/microcode.c linux-2.4.33-imedia-patching/arch/i386/kernel/microcode.c --- linux-2.4.33-imedia/arch/i386/kernel/microcode.c 2005-01-19 16:09:25.000000000 +0200 +++ linux-2.4.33-imedia-patching/arch/i386/kernel/microcode.c 2006-01-26 15:19:42.000000000 +0200 @@ -412,11 +412,14 @@ goto out_free; } + preempt_disable(); if (smp_call_function(do_update_one, NULL, 1, 1) != 0) { printk(KERN_ERR "microcode: Error! 
Could not run on all processors\n"); + preempt_enable(); error = -EIO; } do_update_one(NULL); + preempt_enable(); out_free: for (i = 0; i < smp_num_cpus; i++) { diff -Nur linux-2.4.33-imedia/arch/i386/kernel/msr.c linux-2.4.33-imedia-patching/arch/i386/kernel/msr.c --- linux-2.4.33-imedia/arch/i386/kernel/msr.c 2001-10-11 19:04:57.000000000 +0300 +++ linux-2.4.33-imedia-patching/arch/i386/kernel/msr.c 2006-01-26 15:19:42.000000000 +0200 @@ -114,8 +114,9 @@ { struct msr_command cmd; + preempt_disable(); if ( cpu == smp_processor_id() ) { - return wrmsr_eio(reg, eax, edx); + cmd.err = wrmsr_eio(reg, eax, edx); } else { cmd.cpu = cpu; cmd.reg = reg; @@ -123,16 +124,19 @@ cmd.data[1] = edx; smp_call_function(msr_smp_wrmsr, &cmd, 1, 1); - return cmd.err; } + + preempt_enable(); + return cmd.err; } static inline int do_rdmsr(int cpu, u32 reg, u32 *eax, u32 *edx) { struct msr_command cmd; + preempt_disable(); if ( cpu == smp_processor_id() ) { - return rdmsr_eio(reg, eax, edx); + cmd.err = rdmsr_eio(reg, eax, edx); } else { cmd.cpu = cpu; cmd.reg = reg; @@ -141,9 +145,10 @@ *eax = cmd.data[0]; *edx = cmd.data[1]; - - return cmd.err; } + + preempt_enable(); + return cmd.err; } #else /* ! CONFIG_SMP */ diff -Nur linux-2.4.33-imedia/arch/i386/kernel/mtrr.c linux-2.4.33-imedia-patching/arch/i386/kernel/mtrr.c --- linux-2.4.33-imedia/arch/i386/kernel/mtrr.c 2005-06-01 03:56:56.000000000 +0300 +++ linux-2.4.33-imedia-patching/arch/i386/kernel/mtrr.c 2006-01-26 15:19:42.000000000 +0200 @@ -1065,6 +1065,9 @@ wait_barrier_execute = TRUE; wait_barrier_cache_enable = TRUE; atomic_set (&undone_count, smp_num_cpus - 1); + + preempt_disable(); + /* Start the ball rolling on other CPUs */ if (smp_call_function (ipi_handler, &data, 1, 0) != 0) panic ("mtrr: timed out waiting for other CPUs\n"); @@ -1090,6 +1093,9 @@ then enable the local cache and return */ wait_barrier_cache_enable = FALSE; set_mtrr_done (&ctxt); + + preempt_enable(); + } /* End Function set_mtrr_smp */ diff -Nur linux-2.4.33-imedia/arch/i386/kernel/process.c linux-2.4.33-imedia-patching/arch/i386/kernel/process.c --- linux-2.4.33-imedia/arch/i386/kernel/process.c 2005-11-16 21:12:54.000000000 +0200 +++ linux-2.4.33-imedia-patching/arch/i386/kernel/process.c 2006-01-26 15:19:42.000000000 +0200 @@ -126,15 +126,12 @@ void cpu_idle (void) { /* endless idle loop with no priority at all */ - init_idle(); - current->nice = 20; - current->counter = -100; while (1) { void (*idle)(void) = pm_idle; if (!idle) idle = default_idle; - while (!current->need_resched) + if (!current->need_resched) idle(); schedule(); check_pgt_cache(); @@ -665,15 +662,17 @@ asm volatile("mov %%gs,%0":"=m" (prev->gs)); /* - * Restore %fs and %gs. + * Restore %fs and %gs if needed. */ - loadsegment(fs, next->fs); - loadsegment(gs, next->gs); + if (unlikely(prev->fs | prev->gs | next->fs | next->gs)) { + loadsegment(fs, next->fs); + loadsegment(gs, next->gs); + } /* * Now maybe reload the debug registers */ - if (next->debugreg[7]){ + if (unlikely(next->debugreg[7])) { loaddebug(next, 0); loaddebug(next, 1); loaddebug(next, 2); @@ -683,7 +682,7 @@ loaddebug(next, 7); } - if (prev->ioperm || next->ioperm) { + if (unlikely(prev->ioperm || next->ioperm)) { if (next->ioperm) { /* * 4 cachelines copy ... 
not good, but not that diff -Nur linux-2.4.33-imedia/arch/i386/kernel/setup.c linux-2.4.33-imedia-patching/arch/i386/kernel/setup.c --- linux-2.4.33-imedia/arch/i386/kernel/setup.c 2005-04-04 04:42:19.000000000 +0300 +++ linux-2.4.33-imedia-patching/arch/i386/kernel/setup.c 2006-01-26 15:19:42.000000000 +0200 @@ -3224,9 +3224,10 @@ load_TR(nr); load_LDT(&init_mm.context); - /* - * Clear all 6 debug registers: - */ + /* Clear %fs and %gs. */ + asm volatile ("xorl %eax, %eax; movl %eax, %fs; movl %eax, %gs"); + + /* Clear all 6 debug registers: */ #define CD(register) __asm__("movl %0,%%db" #register ::"r"(0) ); diff -Nur linux-2.4.33-imedia/arch/i386/kernel/smp.c linux-2.4.33-imedia-patching/arch/i386/kernel/smp.c --- linux-2.4.33-imedia/arch/i386/kernel/smp.c 2004-11-17 13:54:21.000000000 +0200 +++ linux-2.4.33-imedia-patching/arch/i386/kernel/smp.c 2006-01-26 15:19:42.000000000 +0200 @@ -360,10 +360,14 @@ asmlinkage void smp_invalidate_interrupt (void) { - unsigned long cpu = smp_processor_id(); + unsigned long cpu; + + preempt_disable(); + + cpu = smp_processor_id(); if (!test_bit(cpu, &flush_cpumask)) - return; + goto out; /* * This was a BUG() but until someone can quote me the * line from the intel manual that guarantees an IPI to @@ -384,6 +388,8 @@ } ack_APIC_irq(); clear_bit(cpu, &flush_cpumask); +out: + preempt_enable(); } static void flush_tlb_others (unsigned long cpumask, struct mm_struct *mm, @@ -427,23 +433,28 @@ flush_mm = NULL; flush_va = 0; - spin_unlock(&tlbstate_lock); + _raw_spin_unlock(&tlbstate_lock); } void flush_tlb_current_task(void) { struct mm_struct *mm = current->mm; - unsigned long cpu_mask = mm->cpu_vm_mask & ~(1 << smp_processor_id()); + unsigned long cpu_mask; + preempt_disable(); + cpu_mask = mm->cpu_vm_mask & ~(1UL << smp_processor_id()); local_flush_tlb(); if (cpu_mask) flush_tlb_others(cpu_mask, mm, FLUSH_ALL); + preempt_enable(); } void flush_tlb_mm (struct mm_struct * mm) { - unsigned long cpu_mask = mm->cpu_vm_mask & ~(1 << smp_processor_id()); + unsigned long cpu_mask; + preempt_disable(); + cpu_mask = mm->cpu_vm_mask & ~(1UL << smp_processor_id()); if (current->active_mm == mm) { if (current->mm) local_flush_tlb(); @@ -452,13 +463,16 @@ } if (cpu_mask) flush_tlb_others(cpu_mask, mm, FLUSH_ALL); + preempt_enable(); } void flush_tlb_page(struct vm_area_struct * vma, unsigned long va) { struct mm_struct *mm = vma->vm_mm; - unsigned long cpu_mask = mm->cpu_vm_mask & ~(1 << smp_processor_id()); + unsigned long cpu_mask; + preempt_disable(); + cpu_mask = mm->cpu_vm_mask & ~(1UL << smp_processor_id()); if (current->active_mm == mm) { if(current->mm) __flush_tlb_one(va); @@ -468,6 +482,7 @@ if (cpu_mask) flush_tlb_others(cpu_mask, mm, va); + preempt_enable(); } static inline void do_flush_tlb_all_local(void) @@ -486,9 +501,11 @@ void flush_tlb_all(void) { + preempt_disable(); smp_call_function (flush_tlb_all_ipi,0,1,1); do_flush_tlb_all_local(); + preempt_enable(); } /* @@ -503,6 +520,17 @@ } /* + * this function sends a reschedule IPI to all (other) CPUs. + * This should only be used if some 'global' task became runnable, + * such as a RT task, that must be handled now. The first CPU + * that manages to grab the task will run it. + */ +void fastcall smp_send_reschedule_all(void) +{ + send_IPI_allbutself(RESCHEDULE_VECTOR); +} + +/* * Structure and data for smp_call_function(). This is designed to minimise * static memory requirements. It also looks cleaner. 
*/ @@ -564,7 +592,7 @@ if (wait) while (atomic_read(&data.finished) != cpus) barrier(); - spin_unlock(&call_lock); + _raw_spin_unlock(&call_lock); return 0; } @@ -572,7 +600,7 @@ static void stop_this_cpu (void * dummy) { /* - * Remove this CPU: + * Remove this CPU: assumes preemption is disabled */ clear_bit(smp_processor_id(), &cpu_online_map); __cli(); diff -Nur linux-2.4.33-imedia/arch/i386/kernel/smpboot.c linux-2.4.33-imedia-patching/arch/i386/kernel/smpboot.c --- linux-2.4.33-imedia/arch/i386/kernel/smpboot.c 2004-04-14 16:05:25.000000000 +0300 +++ linux-2.4.33-imedia-patching/arch/i386/kernel/smpboot.c 2006-01-26 15:19:42.000000000 +0200 @@ -353,7 +353,7 @@ * (This works even if the APIC is not enabled.) */ phys_id = GET_APIC_ID(apic_read(APIC_ID)); - cpuid = current->processor; + cpuid = cpu(); if (test_and_set_bit(cpuid, &cpu_online_map)) { printk("huh, phys CPU#%d, CPU#%d already present??\n", phys_id, cpuid); @@ -423,6 +423,7 @@ */ smp_store_cpu_info(cpuid); + disable_APIC_timer(); /* * Allow the master to continue. */ @@ -453,6 +454,7 @@ smp_callin(); while (!atomic_read(&smp_commenced)) rep_nop(); + enable_APIC_timer(); /* * low-memory mappings have been cleared, flush them from * the local TLBs too. @@ -791,16 +793,13 @@ if (!idle) panic("No idle process for CPU %d", cpu); - idle->processor = cpu; - idle->cpus_runnable = 1 << cpu; /* we schedule the first task manually */ + init_idle(idle, cpu); map_cpu_to_boot_apicid(cpu, apicid); idle->thread.eip = (unsigned long) start_secondary; - del_from_runqueue(idle); unhash_process(idle); - init_tasks[cpu] = idle; /* start_eip had better be page-aligned! */ start_eip = setup_trampoline(); @@ -913,6 +912,7 @@ } cycles_t cacheflush_time; +unsigned long cache_decay_ticks; static void smp_tune_scheduling (void) { @@ -946,9 +946,13 @@ cacheflush_time = (cpu_khz>>10) * (cachesize<<10) / bandwidth; } + cache_decay_ticks = (long)cacheflush_time/cpu_khz * HZ / 1000; + printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n", (long)cacheflush_time/(cpu_khz/1000), ((long)cacheflush_time*100/(cpu_khz/1000)) % 100); + printk("task migration cache decay timeout: %ld msecs.\n", + (cache_decay_ticks + 1) * 1000 / HZ); } /* @@ -1014,8 +1018,7 @@ map_cpu_to_boot_apicid(0, boot_cpu_apicid); global_irq_holder = 0; - current->processor = 0; - init_idle(); + current->cpu = 0; smp_tune_scheduling(); /* diff -Nur linux-2.4.33-imedia/arch/i386/kernel/traps.c linux-2.4.33-imedia-patching/arch/i386/kernel/traps.c --- linux-2.4.33-imedia/arch/i386/kernel/traps.c 2005-11-16 21:12:54.000000000 +0200 +++ linux-2.4.33-imedia-patching/arch/i386/kernel/traps.c 2006-01-26 15:19:42.000000000 +0200 @@ -750,6 +750,8 @@ * * Careful.. There are problems with IBM-designed IRQ13 behaviour. * Don't touch unless you *really* know how it works. + * + * Must be called with kernel preemption disabled. 
*/ asmlinkage void math_state_restore(struct pt_regs regs) { diff -Nur linux-2.4.33-imedia/arch/i386/lib/dec_and_lock.c linux-2.4.33-imedia-patching/arch/i386/lib/dec_and_lock.c --- linux-2.4.33-imedia/arch/i386/lib/dec_and_lock.c 2000-07-08 04:20:16.000000000 +0300 +++ linux-2.4.33-imedia-patching/arch/i386/lib/dec_and_lock.c 2006-01-26 15:19:42.000000000 +0200 @@ -8,6 +8,7 @@ */ #include +#include #include int atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock) diff -Nur linux-2.4.33-imedia/arch/i386/mm/init.c linux-2.4.33-imedia-patching/arch/i386/mm/init.c --- linux-2.4.33-imedia/arch/i386/mm/init.c 2004-04-14 16:05:25.000000000 +0300 +++ linux-2.4.33-imedia-patching/arch/i386/mm/init.c 2006-01-26 15:19:42.000000000 +0200 @@ -46,6 +46,7 @@ int do_check_pgt_cache(int low, int high) { int freed = 0; + preempt_disable(); if(pgtable_cache_size > high) { do { if (pgd_quicklist) { @@ -62,6 +63,7 @@ } } while(pgtable_cache_size > low); } + preempt_enable(); return freed; } diff -Nur linux-2.4.33-imedia/arch/ia64/kernel/entry.S linux-2.4.33-imedia-patching/arch/ia64/kernel/entry.S --- linux-2.4.33-imedia/arch/ia64/kernel/entry.S 2005-04-04 04:42:19.000000000 +0300 +++ linux-2.4.33-imedia-patching/arch/ia64/kernel/entry.S 2006-01-26 15:19:42.000000000 +0200 @@ -175,7 +175,7 @@ mov r8=r13 // return pointer to previously running task mov r13=in0 // set "current" pointer ;; - ssm psr.i // renable psr.i AFTER the ic bit is serialized + ssm psr.i DO_LOAD_SWITCH_STACK #ifdef CONFIG_SMP diff -Nur linux-2.4.33-imedia/arch/ia64/kernel/process.c linux-2.4.33-imedia-patching/arch/ia64/kernel/process.c --- linux-2.4.33-imedia/arch/ia64/kernel/process.c 2005-04-04 04:42:19.000000000 +0300 +++ linux-2.4.33-imedia-patching/arch/ia64/kernel/process.c 2006-01-26 15:19:42.000000000 +0200 @@ -147,9 +147,6 @@ cpu_idle (void *unused) { init_idle(); - current->nice = 20; - current->counter = -100; - /* endless idle loop with no priority at all */ while (1) { diff -Nur linux-2.4.33-imedia/arch/m68k/kernel/process.c linux-2.4.33-imedia-patching/arch/m68k/kernel/process.c --- linux-2.4.33-imedia/arch/m68k/kernel/process.c 2003-06-13 17:51:31.000000000 +0300 +++ linux-2.4.33-imedia-patching/arch/m68k/kernel/process.c 2006-01-26 15:19:42.000000000 +0200 @@ -80,8 +80,6 @@ { /* endless idle loop with no priority at all */ init_idle(); - current->nice = 20; - current->counter = -100; idle(); } diff -Nur linux-2.4.33-imedia/arch/mips/config-shared.in linux-2.4.33-imedia-patching/arch/mips/config-shared.in --- linux-2.4.33-imedia/arch/mips/config-shared.in 2005-01-19 16:09:27.000000000 +0200 +++ linux-2.4.33-imedia-patching/arch/mips/config-shared.in 2006-01-26 15:19:42.000000000 +0200 @@ -838,6 +838,7 @@ define_bool CONFIG_HOTPLUG_PCI n fi +dep_bool 'Preemptible Kernel' CONFIG_PREEMPT $CONFIG_NEW_IRQ bool 'System V IPC' CONFIG_SYSVIPC bool 'BSD Process Accounting' CONFIG_BSD_PROCESS_ACCT bool 'Sysctl support' CONFIG_SYSCTL diff -Nur linux-2.4.33-imedia/arch/mips/kernel/i8259.c linux-2.4.33-imedia-patching/arch/mips/kernel/i8259.c --- linux-2.4.33-imedia/arch/mips/kernel/i8259.c 2003-08-25 14:44:40.000000000 +0300 +++ linux-2.4.33-imedia-patching/arch/mips/kernel/i8259.c 2006-01-26 15:19:42.000000000 +0200 @@ -8,6 +8,7 @@ * Copyright (C) 1992 Linus Torvalds * Copyright (C) 1994 - 2000 Ralf Baechle */ +#include #include #include #include diff -Nur linux-2.4.33-imedia/arch/mips/kernel/irq.c linux-2.4.33-imedia-patching/arch/mips/kernel/irq.c --- linux-2.4.33-imedia/arch/mips/kernel/irq.c 2004-02-18 15:36:30.000000000 +0200 
+++ linux-2.4.33-imedia-patching/arch/mips/kernel/irq.c 2006-01-26 15:19:42.000000000 +0200 @@ -8,6 +8,8 @@ * Copyright (C) 1992 Linus Torvalds * Copyright (C) 1994 - 2000 Ralf Baechle */ + +#include #include #include #include @@ -19,11 +21,13 @@ #include #include #include -#include +#include +#include #include #include #include +#include /* * Controller mappings for all interrupt sources: @@ -429,6 +433,8 @@ struct irqaction * action; unsigned int status; + preempt_disable(); + kstat.irqs[cpu][irq]++; spin_lock(&desc->lock); desc->handler->ack(irq); @@ -490,6 +496,27 @@ if (softirq_pending(cpu)) do_softirq(); + +#if defined(CONFIG_PREEMPT) + while (--current->preempt_count == 0) { + db_assert(intr_off()); + db_assert(!in_interrupt()); + + if (current->need_resched == 0) { + break; + } + + current->preempt_count ++; + sti(); + if (user_mode(regs)) { + schedule(); + } else { + preempt_schedule(); + } + cli(); + } +#endif + return 1; } diff -Nur linux-2.4.33-imedia/arch/mips/kernel/process.c linux-2.4.33-imedia-patching/arch/mips/kernel/process.c --- linux-2.4.33-imedia/arch/mips/kernel/process.c 2003-08-25 14:44:40.000000000 +0300 +++ linux-2.4.33-imedia-patching/arch/mips/kernel/process.c 2006-01-26 15:19:42.000000000 +0200 @@ -38,8 +38,6 @@ ATTRIB_NORET void cpu_idle(void) { /* endless idle loop with no priority at all */ - current->nice = 20; - current->counter = -100; init_idle(); while (1) { diff -Nur linux-2.4.33-imedia/arch/mips/mm/extable.c linux-2.4.33-imedia-patching/arch/mips/mm/extable.c --- linux-2.4.33-imedia/arch/mips/mm/extable.c 2002-11-29 01:53:10.000000000 +0200 +++ linux-2.4.33-imedia-patching/arch/mips/mm/extable.c 2006-01-26 15:19:42.000000000 +0200 @@ -3,6 +3,7 @@ */ #include #include +#include #include #include diff -Nur linux-2.4.33-imedia/arch/mips64/kernel/process.c linux-2.4.33-imedia-patching/arch/mips64/kernel/process.c --- linux-2.4.33-imedia/arch/mips64/kernel/process.c 2003-08-25 14:44:40.000000000 +0300 +++ linux-2.4.33-imedia-patching/arch/mips64/kernel/process.c 2006-01-26 15:19:42.000000000 +0200 @@ -39,8 +39,7 @@ { /* endless idle loop with no priority at all */ init_idle(); - current->nice = 20; - current->counter = -100; + while (1) { while (!current->need_resched) if (cpu_wait) diff -Nur linux-2.4.33-imedia/arch/parisc/kernel/process.c linux-2.4.33-imedia-patching/arch/parisc/kernel/process.c --- linux-2.4.33-imedia/arch/parisc/kernel/process.c 2003-06-13 17:51:31.000000000 +0300 +++ linux-2.4.33-imedia-patching/arch/parisc/kernel/process.c 2006-01-26 15:19:42.000000000 +0200 @@ -65,8 +65,6 @@ { /* endless idle loop with no priority at all */ init_idle(); - current->nice = 20; - current->counter = -100; while (1) { while (!current->need_resched) { diff -Nur linux-2.4.33-imedia/arch/ppc/8xx_io/uart.c linux-2.4.33-imedia-patching/arch/ppc/8xx_io/uart.c --- linux-2.4.33-imedia/arch/ppc/8xx_io/uart.c 2005-01-19 16:09:35.000000000 +0200 +++ linux-2.4.33-imedia-patching/arch/ppc/8xx_io/uart.c 2006-01-26 15:19:42.000000000 +0200 @@ -1849,7 +1849,6 @@ printk("lsr = %d (jiff=%lu)...", lsr, jiffies); #endif current->state = TASK_INTERRUPTIBLE; -/* current->counter = 0; make us low-priority */ schedule_timeout(char_time); if (signal_pending(current)) break; diff -Nur linux-2.4.33-imedia/arch/ppc/config.in linux-2.4.33-imedia-patching/arch/ppc/config.in --- linux-2.4.33-imedia/arch/ppc/config.in 2004-08-08 02:26:04.000000000 +0300 +++ linux-2.4.33-imedia-patching/arch/ppc/config.in 2006-01-26 15:19:42.000000000 +0200 @@ -167,6 +167,8 @@ int 'Maximum number of 
CPUs (2-32)' CONFIG_NR_CPUS 32 fi +bool 'Preemptible kernel support' CONFIG_PREEMPT + if [ "$CONFIG_6xx" = "y" -a "$CONFIG_8260" = "n" ];then bool 'AltiVec Support' CONFIG_ALTIVEC bool 'Thermal Management Support' CONFIG_TAU diff -Nur linux-2.4.33-imedia/arch/ppc/kernel/entry.S linux-2.4.33-imedia-patching/arch/ppc/kernel/entry.S --- linux-2.4.33-imedia/arch/ppc/kernel/entry.S 2004-04-14 16:05:27.000000000 +0300 +++ linux-2.4.33-imedia-patching/arch/ppc/kernel/entry.S 2006-01-26 15:19:42.000000000 +0200 @@ -273,7 +273,9 @@ .globl ret_from_fork ret_from_fork: +#ifdef CONFIG_SMP bl schedule_tail +#endif lwz r0,TASK_PTRACE(r2) andi. r0,r0,PT_TRACESYS bnel- syscall_trace @@ -287,6 +289,46 @@ */ cmpi 0,r3,0 beq restore +#ifdef CONFIG_PREEMPT + lwz r3,PREEMPT_COUNT(r2) + cmpi 0,r3,1 + bge ret_from_except + lwz r5,_MSR(r1) + andi. r5,r5,MSR_PR + bne ret_from_except + lwz r5,NEED_RESCHED(r2) + cmpi 0,r5,0 + beq ret_from_except + lis r3,irq_stat@h + ori r3,r3,irq_stat@l +#ifdef CONFIG_SMP + lwz r5,CPU(r2) + rlwinm r5,r5,5,0,26 + add r3,r3,r5 +#endif + lwz r5,4(r3) + lwz r3,8(r3) + add r3,r3,r5 + cmpi 0,r3,0 + bne ret_from_except + lwz r3,PREEMPT_COUNT(r2) + addi r3,r3,1 + stw r3,PREEMPT_COUNT(r2) + mfmsr r0 + ori r0,r0,MSR_EE + mtmsr r0 + sync + bl preempt_schedule + mfmsr r0 + rlwinm r0,r0,0,17,15 + mtmsr r0 + sync + lwz r3,PREEMPT_COUNT(r2) + subi r3,r3,1 + stw r3,PREEMPT_COUNT(r2) + li r3,1 + b ret_from_intercept +#endif /* CONFIG_PREEMPT */ .globl ret_from_except ret_from_except: lwz r3,_MSR(r1) /* Returning to user mode? */ diff -Nur linux-2.4.33-imedia/arch/ppc/kernel/idle.c linux-2.4.33-imedia-patching/arch/ppc/kernel/idle.c --- linux-2.4.33-imedia/arch/ppc/kernel/idle.c 2003-08-25 14:44:40.000000000 +0300 +++ linux-2.4.33-imedia-patching/arch/ppc/kernel/idle.c 2006-01-26 15:19:42.000000000 +0200 @@ -46,9 +46,7 @@ do_power_save = 1; /* endless loop with no priority at all */ - current->nice = 20; - current->counter = -100; - init_idle(); + init_idle(current, smp_processor_id()); for (;;) { #ifdef CONFIG_SMP if (!do_power_save) { @@ -64,13 +62,12 @@ } } #endif +#ifdef CONFIG_6xx if (do_power_save && !current->need_resched) +#endif power_save(); - - if (current->need_resched) { - schedule(); - check_pgt_cache(); - } + schedule(); + check_pgt_cache(); } return 0; } diff -Nur linux-2.4.33-imedia/arch/ppc/kernel/irq.c linux-2.4.33-imedia-patching/arch/ppc/kernel/irq.c --- linux-2.4.33-imedia/arch/ppc/kernel/irq.c 2003-11-28 20:26:19.000000000 +0200 +++ linux-2.4.33-imedia-patching/arch/ppc/kernel/irq.c 2006-01-26 15:19:42.000000000 +0200 @@ -551,6 +551,34 @@ return 1; /* lets ret_from_int know we can do checks */ } +#ifdef CONFIG_PREEMPT +int +preempt_intercept(struct pt_regs *regs) +{ + int ret; + + preempt_disable(); + + switch(regs->trap) { + case 0x500: + ret = do_IRQ(regs); + break; +#if !defined(CONFIG_4xx) || defined(CONFIG_440) + case 0x900: +#else + case 0x1000: +#endif + ret = timer_interrupt(regs); + break; + default: + BUG(); + } + + preempt_enable(); + return ret; +} +#endif /* CONFIG_PREEMPT */ + unsigned long probe_irq_on (void) { return 0; @@ -647,11 +675,13 @@ show("wait_on_irq"); count = ~0; } + preempt_disable(); __sti(); /* don't worry about the lock race Linus found * on intel here. -- Cort */ __cli(); + preempt_enable_no_resched(); if (atomic_read(&global_irq_count)) continue; if (global_irq_lock) @@ -727,6 +757,8 @@ global_irq_holder = cpu; } +#define EFLAGS_IF_SHIFT 15 + /* * A global "cli()" while in an interrupt context * turns into just a local cli(). 
Interrupts @@ -744,9 +776,10 @@ unsigned long flags; __save_flags(flags); - if (flags & (1 << 15)) { - int cpu = smp_processor_id(); + if (flags & (1 << EFLAGS_IF_SHIFT)) { + int cpu; __cli(); + cpu = smp_processor_id(); if (!local_irq_count(cpu)) get_irqlock(cpu); } @@ -754,11 +787,14 @@ void __global_sti(void) { - int cpu = smp_processor_id(); + int cpu; + preempt_disable(); + cpu = smp_processor_id(); if (!local_irq_count(cpu)) release_irqlock(cpu); __sti(); + preempt_enable(); } /* @@ -773,19 +809,23 @@ int retval; int local_enabled; unsigned long flags; + int cpu; __save_flags(flags); - local_enabled = (flags >> 15) & 1; + local_enabled = (flags >> EFLAGS_IF_SHIFT) & 1; /* default to local */ retval = 2 + local_enabled; /* check for global flags if we're not in an interrupt */ - if (!local_irq_count(smp_processor_id())) { + preempt_disable(); + cpu = smp_processor_id(); + if (!local_irq_count(cpu)) { if (local_enabled) retval = 1; - if (global_irq_holder == (unsigned char) smp_processor_id()) + if (global_irq_holder == cpu) retval = 0; } + preempt_enable(); return retval; } diff -Nur linux-2.4.33-imedia/arch/ppc/kernel/mk_defs.c linux-2.4.33-imedia-patching/arch/ppc/kernel/mk_defs.c --- linux-2.4.33-imedia/arch/ppc/kernel/mk_defs.c 2003-11-28 20:26:19.000000000 +0200 +++ linux-2.4.33-imedia-patching/arch/ppc/kernel/mk_defs.c 2006-01-26 15:19:42.000000000 +0200 @@ -34,11 +34,13 @@ /*DEFINE(KERNELBASE, KERNELBASE);*/ DEFINE(STATE, offsetof(struct task_struct, state)); DEFINE(NEXT_TASK, offsetof(struct task_struct, next_task)); - DEFINE(COUNTER, offsetof(struct task_struct, counter)); - DEFINE(PROCESSOR, offsetof(struct task_struct, processor)); + DEFINE(CPU, offsetof(struct task_struct, cpu)); DEFINE(SIGPENDING, offsetof(struct task_struct, sigpending)); DEFINE(THREAD, offsetof(struct task_struct, thread)); DEFINE(MM, offsetof(struct task_struct, mm)); +#ifdef CONFIG_PREEMPT + DEFINE(PREEMPT_COUNT, offsetof(struct task_struct, preempt_count)); +#endif DEFINE(ACTIVE_MM, offsetof(struct task_struct, active_mm)); DEFINE(TASK_STRUCT_SIZE, sizeof(struct task_struct)); DEFINE(KSP, offsetof(struct thread_struct, ksp)); diff -Nur linux-2.4.33-imedia/arch/ppc/kernel/open_pic.c linux-2.4.33-imedia-patching/arch/ppc/kernel/open_pic.c --- linux-2.4.33-imedia/arch/ppc/kernel/open_pic.c 2004-02-18 15:36:30.000000000 +0200 +++ linux-2.4.33-imedia-patching/arch/ppc/kernel/open_pic.c 2006-01-26 15:19:42.000000000 +0200 @@ -601,19 +601,24 @@ void __init do_openpic_setup_cpu(void) { int i; - u32 msk = 1 << smp_hw_index[smp_processor_id()]; +#ifdef CONFIG_IRQ_ALL_CPUS + u32 msk; +#endif /* CONFIG_IRQ_ALL_CPUS */ spin_lock(&openpic_setup_lock); #ifdef CONFIG_IRQ_ALL_CPUS + msk = 1 << smp_hw_index[smp_processor_id()]; + /* let the openpic know we want intrs. default affinity * is 0xffffffff until changed via /proc * That's how it's done on x86. If we want it differently, then * we should make sure we also change the default values of irq_affinity * in irq.c. 
*/ - for (i = 0; i < NumSources; i++) + for (i = 0; i < NumSources; i++) { openpic_mapirq(i, msk, ~0U); + } #endif /* CONFIG_IRQ_ALL_CPUS */ openpic_set_priority(0); diff -Nur linux-2.4.33-imedia/arch/ppc/kernel/setup.c linux-2.4.33-imedia-patching/arch/ppc/kernel/setup.c --- linux-2.4.33-imedia/arch/ppc/kernel/setup.c 2004-04-14 16:05:27.000000000 +0300 +++ linux-2.4.33-imedia-patching/arch/ppc/kernel/setup.c 2006-01-26 15:19:42.000000000 +0200 @@ -502,6 +502,20 @@ strcpy(cmd_line, CONFIG_CMDLINE); #endif /* CONFIG_CMDLINE */ +#ifdef CONFIG_PREEMPT + /* Override the irq routines for external & timer interrupts here, + * as the MMU has only been minimally setup at this point and + * there are no protections on page zero. + */ + { + extern int preempt_intercept(struct pt_regs *); + + do_IRQ_intercept = (unsigned long) &preempt_intercept; + timer_interrupt_intercept = (unsigned long) &preempt_intercept; + + } +#endif /* CONFIG_PREEMPT */ + platform_init(r3, r4, r5, r6, r7); if (ppc_md.progress) diff -Nur linux-2.4.33-imedia/arch/ppc/kernel/smp.c linux-2.4.33-imedia-patching/arch/ppc/kernel/smp.c --- linux-2.4.33-imedia/arch/ppc/kernel/smp.c 2003-08-25 14:44:40.000000000 +0300 +++ linux-2.4.33-imedia-patching/arch/ppc/kernel/smp.c 2006-01-26 15:19:42.000000000 +0200 @@ -294,8 +294,6 @@ cpu_callin_map[0] = 1; current->processor = 0; - init_idle(); - for (i = 0; i < NR_CPUS; i++) { prof_counter[i] = 1; prof_multiplier[i] = 1; @@ -351,7 +349,8 @@ p = init_task.prev_task; if (!p) panic("No idle task for CPU %d", i); - del_from_runqueue(p); + init_idle(p, i); + unhash_process(p); init_tasks[i] = p; diff -Nur linux-2.4.33-imedia/arch/ppc/kernel/temp.c linux-2.4.33-imedia-patching/arch/ppc/kernel/temp.c --- linux-2.4.33-imedia/arch/ppc/kernel/temp.c 2003-08-25 14:44:40.000000000 +0300 +++ linux-2.4.33-imedia-patching/arch/ppc/kernel/temp.c 2006-01-26 15:19:42.000000000 +0200 @@ -138,7 +138,7 @@ static void tau_timeout(void * info) { - unsigned long cpu = smp_processor_id(); + unsigned long cpu; unsigned long flags; int size; int shrink; @@ -146,6 +146,8 @@ /* disabling interrupts *should* be okay */ save_flags(flags); cli(); + cpu = smp_processor_id(); + #ifndef CONFIG_TAU_INT TAUupdate(cpu); #endif @@ -191,13 +193,15 @@ static void tau_timeout_smp(unsigned long unused) { - /* schedule ourselves to be run again */ mod_timer(&tau_timer, jiffies + shrink_timer) ; + + preempt_disable(); #ifdef CONFIG_SMP smp_call_function(tau_timeout, NULL, 1, 0); #endif tau_timeout(NULL); + preempt_enable(); } /* diff -Nur linux-2.4.33-imedia/arch/ppc/lib/dec_and_lock.c linux-2.4.33-imedia-patching/arch/ppc/lib/dec_and_lock.c --- linux-2.4.33-imedia/arch/ppc/lib/dec_and_lock.c 2001-11-16 20:10:08.000000000 +0200 +++ linux-2.4.33-imedia-patching/arch/ppc/lib/dec_and_lock.c 2006-01-26 15:19:42.000000000 +0200 @@ -1,4 +1,5 @@ #include +#include #include #include #include diff -Nur linux-2.4.33-imedia/arch/ppc/mm/init.c linux-2.4.33-imedia-patching/arch/ppc/mm/init.c --- linux-2.4.33-imedia/arch/ppc/mm/init.c 2003-11-28 20:26:19.000000000 +0200 +++ linux-2.4.33-imedia-patching/arch/ppc/mm/init.c 2006-01-26 15:19:42.000000000 +0200 @@ -126,6 +126,9 @@ int do_check_pgt_cache(int low, int high) { int freed = 0; + + preempt_disable(); + if (pgtable_cache_size > high) { do { if (pgd_quicklist) { @@ -138,6 +141,9 @@ } } while (pgtable_cache_size > low); } + + preempt_enable(); + return freed; } diff -Nur linux-2.4.33-imedia/arch/ppc/mm/tlb.c linux-2.4.33-imedia-patching/arch/ppc/mm/tlb.c --- 
linux-2.4.33-imedia/arch/ppc/mm/tlb.c 2003-08-25 14:44:40.000000000 +0300 +++ linux-2.4.33-imedia-patching/arch/ppc/mm/tlb.c 2006-01-26 15:19:42.000000000 +0200 @@ -58,11 +58,14 @@ * we can and should dispense with flush_tlb_all(). * -- paulus. */ + + preempt_disable(); local_flush_tlb_range(&init_mm, TASK_SIZE, ~0UL); #ifdef CONFIG_SMP smp_send_tlb_invalidate(0); #endif /* CONFIG_SMP */ + preempt_enable(); } /* @@ -73,8 +76,10 @@ void local_flush_tlb_mm(struct mm_struct *mm) { + preempt_disable(); if (Hash == 0) { _tlbia(); + preempt_enable(); return; } @@ -88,6 +93,7 @@ #ifdef CONFIG_SMP smp_send_tlb_invalidate(0); #endif + preempt_enable(); } void @@ -97,8 +103,10 @@ pmd_t *pmd; pte_t *pte; + preempt_disable(); if (Hash == 0) { _tlbie(vmaddr); + preempt_enable(); return; } mm = (vmaddr < TASK_SIZE)? vma->vm_mm: &init_mm; @@ -111,6 +119,7 @@ #ifdef CONFIG_SMP smp_send_tlb_invalidate(0); #endif + preempt_enable(); } @@ -127,13 +136,17 @@ unsigned long pmd_end; unsigned int ctx = mm->context; + preempt_disable(); if (Hash == 0) { _tlbia(); + preempt_enable(); return; } start &= PAGE_MASK; - if (start >= end) + if (start >= end) { + preempt_enable(); return; + } pmd = pmd_offset(pgd_offset(mm, start), start); do { pmd_end = (start + PGDIR_SIZE) & PGDIR_MASK; @@ -156,4 +169,5 @@ #ifdef CONFIG_SMP smp_send_tlb_invalidate(0); #endif + preempt_enable(); } diff -Nur linux-2.4.33-imedia/arch/ppc64/kernel/idle.c linux-2.4.33-imedia-patching/arch/ppc64/kernel/idle.c --- linux-2.4.33-imedia/arch/ppc64/kernel/idle.c 2004-02-18 15:36:30.000000000 +0200 +++ linux-2.4.33-imedia-patching/arch/ppc64/kernel/idle.c 2006-01-26 15:19:42.000000000 +0200 @@ -87,15 +87,12 @@ long oldval; unsigned long CTRL; - /* endless loop with no priority at all */ - current->nice = 20; - current->counter = -100; - /* ensure iSeries run light will be out when idle */ current->thread.flags &= ~PPC_FLAG_RUN_LIGHT; CTRL = mfspr(CTRLF); CTRL &= ~RUNLATCH; mtspr(CTRLT, CTRL); + /* endless loop with no priority at all */ init_idle(); lpaca = get_paca(); @@ -133,8 +130,7 @@ { long oldval; - current->nice = 20; - current->counter = -100; + /* endless loop with no priority at all */ init_idle(); for (;;) { @@ -161,8 +157,6 @@ unsigned long start_snooze; ppaca = &paca[(lpaca->xPacaIndex) ^ 1]; - current->nice = 20; - current->counter = -100; init_idle(); for (;;) { @@ -240,9 +234,6 @@ struct paca_struct *lpaca = get_paca(); /* endless loop with no priority at all */ - current->nice = 20; - current->counter = -100; - init_idle(); for (;;) { diff -Nur linux-2.4.33-imedia/arch/s390/kernel/asm-offsets.c linux-2.4.33-imedia-patching/arch/s390/kernel/asm-offsets.c --- linux-2.4.33-imedia/arch/s390/kernel/asm-offsets.c 2002-08-03 03:39:43.000000000 +0300 +++ linux-2.4.33-imedia-patching/arch/s390/kernel/asm-offsets.c 2006-01-26 15:19:42.000000000 +0200 @@ -26,7 +26,7 @@ DEFINE(__TASK_need_resched, offsetof(struct task_struct, need_resched),); DEFINE(__TASK_ptrace, offsetof(struct task_struct, ptrace),); - DEFINE(__TASK_processor, offsetof(struct task_struct, processor),); + DEFINE(__TASK_processor, offsetof(struct task_struct, cpu),); return 0; } diff -Nur linux-2.4.33-imedia/arch/s390/kernel/bitmap.S linux-2.4.33-imedia-patching/arch/s390/kernel/bitmap.S --- linux-2.4.33-imedia/arch/s390/kernel/bitmap.S 2000-05-12 21:41:44.000000000 +0300 +++ linux-2.4.33-imedia-patching/arch/s390/kernel/bitmap.S 2006-01-26 15:19:42.000000000 +0200 @@ -35,3 +35,21 @@ .byte 0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4 .byte 0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,8 + .globl 
_sb_findmap +_sb_findmap: + .byte 8,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 7,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 diff -Nur linux-2.4.33-imedia/arch/s390/kernel/entry.S linux-2.4.33-imedia-patching/arch/s390/kernel/entry.S --- linux-2.4.33-imedia/arch/s390/kernel/entry.S 2004-11-17 13:54:21.000000000 +0200 +++ linux-2.4.33-imedia-patching/arch/s390/kernel/entry.S 2006-01-26 15:19:42.000000000 +0200 @@ -254,13 +254,14 @@ ret_from_fork: basr %r13,0 l %r13,.Lentry_base-.(%r13) # setup base pointer to &entry_base + # not saving R14 here because we go to sysc_return ultimately + l %r1,BASED(.Lschedtail) + basr %r14,%r1 # call schedule_tail (unlock stuff) GET_CURRENT # load pointer to task_struct to R9 stosm 24(%r15),0x03 # reenable interrupts sr %r0,%r0 # child returns 0 st %r0,SP_R2(%r15) # store return value (change R2 on stack) - l %r1,BASED(.Lschedtail) - la %r14,BASED(sysc_return) - br %r1 # call schedule_tail, return to sysc_return + b BASED(sysc_return) # # clone, fork, vfork, exec and sigreturn need glue, diff -Nur linux-2.4.33-imedia/arch/s390/kernel/process.c linux-2.4.33-imedia-patching/arch/s390/kernel/process.c --- linux-2.4.33-imedia/arch/s390/kernel/process.c 2004-02-18 15:36:30.000000000 +0200 +++ linux-2.4.33-imedia-patching/arch/s390/kernel/process.c 2006-01-26 15:19:42.000000000 +0200 @@ -50,15 +50,11 @@ * The idle loop on a S390... 
*/ -int cpu_idle(void *unused) +int cpu_idle(void) { psw_t wait_psw; unsigned long reg; - /* endless idle loop with no priority at all */ - init_idle(); - current->nice = 20; - current->counter = -100; while (1) { __cli(); if (current->need_resched) { @@ -96,7 +92,7 @@ { struct task_struct *tsk = current; - printk("CPU: %d %s\n", tsk->processor, print_tainted()); + printk("CPU: %d %s\n", tsk->cpu, print_tainted()); printk("Process %s (pid: %d, task: %08lx, ksp: %08x)\n", current->comm, current->pid, (unsigned long) tsk, tsk->thread.ksp); diff -Nur linux-2.4.33-imedia/arch/s390/kernel/smp.c linux-2.4.33-imedia-patching/arch/s390/kernel/smp.c --- linux-2.4.33-imedia/arch/s390/kernel/smp.c 2004-11-17 13:54:21.000000000 +0200 +++ linux-2.4.33-imedia-patching/arch/s390/kernel/smp.c 2006-01-26 15:19:42.000000000 +0200 @@ -38,7 +38,7 @@ #include /* prototypes */ -extern int cpu_idle(void * unused); +extern int cpu_idle(void); extern __u16 boot_cpu_addr; extern volatile int __cpu_logical_map[]; @@ -56,6 +56,7 @@ spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; unsigned long cpu_online_map; +unsigned long cache_decay_ticks; /* * Setup routine for controlling SMP activation @@ -512,7 +513,7 @@ { int curr_cpu; - current->processor = 0; + current->cpu = 0; smp_num_cpus = 1; cpu_online_map = 1; for (curr_cpu = 0; @@ -553,7 +554,7 @@ pfault_init(); #endif /* cpu_idle will call schedule for us */ - return cpu_idle(NULL); + return cpu_idle(); } /* @@ -591,12 +592,9 @@ idle = init_task.prev_task; if (!idle) panic("No idle process for CPU %d",cpu); - idle->processor = cpu; - idle->cpus_runnable = 1 << cpu; /* we schedule the first task manually */ + init_idle(idle, cpu); - del_from_runqueue(idle); unhash_process(idle); - init_tasks[cpu] = idle; cpu_lowcore = get_cpu_lowcore(cpu); cpu_lowcore->save_area[15] = idle->thread.ksp; @@ -648,6 +646,8 @@ panic("Couldn't request external interrupt 0x1202"); smp_count_cpus(); memset(lowcore_ptr,0,sizeof(lowcore_ptr)); + + cache_decay_ticks = (200 * HZ) / 1000; /* Is 200ms ok? Robus? XXX */ /* * Initialize the logical to physical CPU number mapping diff -Nur linux-2.4.33-imedia/arch/s390/kernel/traps.c linux-2.4.33-imedia-patching/arch/s390/kernel/traps.c --- linux-2.4.33-imedia/arch/s390/kernel/traps.c 2002-11-29 01:53:11.000000000 +0200 +++ linux-2.4.33-imedia-patching/arch/s390/kernel/traps.c 2006-01-26 15:19:42.000000000 +0200 @@ -142,7 +142,7 @@ * We can't print the backtrace of a running process. It is * unreliable at best and can cause kernel oopses. 
*/ - if (task_has_cpu(tsk)) + if (tsk->state == TASK_RUNNING) return; show_trace((unsigned long *) tsk->thread.ksp); } diff -Nur linux-2.4.33-imedia/arch/s390x/kernel/asm-offsets.c linux-2.4.33-imedia-patching/arch/s390x/kernel/asm-offsets.c --- linux-2.4.33-imedia/arch/s390x/kernel/asm-offsets.c 2002-08-03 03:39:43.000000000 +0300 +++ linux-2.4.33-imedia-patching/arch/s390x/kernel/asm-offsets.c 2006-01-26 15:19:42.000000000 +0200 @@ -26,7 +26,7 @@ DEFINE(__TASK_need_resched, offsetof(struct task_struct, need_resched),); DEFINE(__TASK_ptrace, offsetof(struct task_struct, ptrace),); - DEFINE(__TASK_processor, offsetof(struct task_struct, processor),); + DEFINE(__TASK_processor, offsetof(struct task_struct, cpu),); return 0; } diff -Nur linux-2.4.33-imedia/arch/s390x/kernel/bitmap.S linux-2.4.33-imedia-patching/arch/s390x/kernel/bitmap.S --- linux-2.4.33-imedia/arch/s390x/kernel/bitmap.S 2001-02-14 00:13:44.000000000 +0200 +++ linux-2.4.33-imedia-patching/arch/s390x/kernel/bitmap.S 2006-01-26 15:19:42.000000000 +0200 @@ -35,3 +35,21 @@ .byte 0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4 .byte 0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,8 + .globl _sb_findmap +_sb_findmap: + .byte 8,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 7,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 + .byte 4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0 diff -Nur linux-2.4.33-imedia/arch/s390x/kernel/entry.S linux-2.4.33-imedia-patching/arch/s390x/kernel/entry.S --- linux-2.4.33-imedia/arch/s390x/kernel/entry.S 2004-11-17 13:54:21.000000000 +0200 +++ linux-2.4.33-imedia-patching/arch/s390x/kernel/entry.S 2006-01-26 15:19:42.000000000 +0200 @@ -240,11 +240,11 @@ # .globl ret_from_fork ret_from_fork: + brasl %r14,schedule_tail GET_CURRENT # load pointer to task_struct to R9 stosm 48(%r15),0x03 # reenable interrupts xc SP_R2(8,%r15),SP_R2(%r15) # child returns 0 - larl %r14,sysc_return - jg schedule_tail # return to sysc_return + j sysc_return # # clone, fork, vfork, exec and sigreturn need glue, diff -Nur linux-2.4.33-imedia/arch/s390x/kernel/process.c linux-2.4.33-imedia-patching/arch/s390x/kernel/process.c --- linux-2.4.33-imedia/arch/s390x/kernel/process.c 2004-02-18 15:36:30.000000000 +0200 +++ linux-2.4.33-imedia-patching/arch/s390x/kernel/process.c 2006-01-26 15:19:42.000000000 +0200 @@ -55,10 +55,6 @@ psw_t wait_psw; unsigned long reg; - /* endless idle loop with no priority at all */ - init_idle(); - current->nice = 20; - current->counter = -100; while (1) { __cli(); if (current->need_resched) { @@ -93,7 +89,7 @@ { struct task_struct *tsk = current; - printk("CPU: %d %s\n", tsk->processor, print_tainted()); + printk("CPU: %d %s\n", tsk->cpu, print_tainted()); printk("Process %s (pid: %d, task: %016lx, ksp: %016lx)\n", current->comm, current->pid, (unsigned long) tsk, tsk->thread.ksp); diff -Nur linux-2.4.33-imedia/arch/s390x/kernel/smp.c linux-2.4.33-imedia-patching/arch/s390x/kernel/smp.c --- linux-2.4.33-imedia/arch/s390x/kernel/smp.c 2004-11-17 13:54:21.000000000 +0200 +++ linux-2.4.33-imedia-patching/arch/s390x/kernel/smp.c 2006-01-26 15:19:42.000000000 +0200 @@ 
-38,7 +38,7 @@ #include /* prototypes */ -extern int cpu_idle(void * unused); +extern int cpu_idle(void); extern __u16 boot_cpu_addr; extern volatile int __cpu_logical_map[]; @@ -56,6 +56,7 @@ spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; unsigned long cpu_online_map; +unsigned long cache_decay_ticks; /* * Setup routine for controlling SMP activation @@ -494,7 +495,7 @@ { int curr_cpu; - current->processor = 0; + current->cpu = 0; smp_num_cpus = 1; cpu_online_map = 1; for (curr_cpu = 0; @@ -534,7 +535,7 @@ pfault_init(); #endif /* cpu_idle will call schedule for us */ - return cpu_idle(NULL); + return cpu_idle(); } /* @@ -572,12 +573,9 @@ idle = init_task.prev_task; if (!idle) panic("No idle process for CPU %d",cpu); - idle->processor = cpu; - idle->cpus_runnable = 1 << cpu; /* we schedule the first task manually */ + init_idle(idle, cpu); - del_from_runqueue(idle); unhash_process(idle); - init_tasks[cpu] = idle; cpu_lowcore = get_cpu_lowcore(cpu); cpu_lowcore->save_area[15] = idle->thread.ksp; @@ -631,6 +629,8 @@ smp_count_cpus(); memset(lowcore_ptr,0,sizeof(lowcore_ptr)); + cache_decay_ticks = (200 * HZ) / 1000; /* Is 200ms ok? Robus? XXX */ + /* * Initialize the logical to physical CPU number mapping */ diff -Nur linux-2.4.33-imedia/arch/s390x/kernel/traps.c linux-2.4.33-imedia-patching/arch/s390x/kernel/traps.c --- linux-2.4.33-imedia/arch/s390x/kernel/traps.c 2002-11-29 01:53:11.000000000 +0200 +++ linux-2.4.33-imedia-patching/arch/s390x/kernel/traps.c 2006-01-26 15:19:42.000000000 +0200 @@ -144,7 +144,7 @@ * We can't print the backtrace of a running process. It is * unreliable at best and can cause kernel oopses. */ - if (task_has_cpu(tsk)) + if (tsk->state == TASK_RUNNING) return; show_trace((unsigned long *) tsk->thread.ksp); } diff -Nur linux-2.4.33-imedia/arch/sh/kernel/process.c linux-2.4.33-imedia-patching/arch/sh/kernel/process.c --- linux-2.4.33-imedia/arch/sh/kernel/process.c 2003-08-25 14:44:40.000000000 +0300 +++ linux-2.4.33-imedia-patching/arch/sh/kernel/process.c 2006-01-26 15:19:42.000000000 +0200 @@ -42,8 +42,6 @@ { /* endless idle loop with no priority at all */ init_idle(); - current->nice = 20; - current->counter = -100; while (1) { if (hlt_counter) { diff -Nur linux-2.4.33-imedia/arch/sparc/kernel/process.c linux-2.4.33-imedia-patching/arch/sparc/kernel/process.c --- linux-2.4.33-imedia/arch/sparc/kernel/process.c 2005-04-04 04:42:19.000000000 +0300 +++ linux-2.4.33-imedia-patching/arch/sparc/kernel/process.c 2006-01-26 15:19:42.000000000 +0200 @@ -80,8 +80,6 @@ goto out; /* endless idle loop with no priority at all */ - current->nice = 20; - current->counter = -100; init_idle(); for (;;) { @@ -134,8 +132,6 @@ int cpu_idle(void) { /* endless idle loop with no priority at all */ - current->nice = 20; - current->counter = -100; init_idle(); while(1) { diff -Nur linux-2.4.33-imedia/arch/sparc64/kernel/process.c linux-2.4.33-imedia-patching/arch/sparc64/kernel/process.c --- linux-2.4.33-imedia/arch/sparc64/kernel/process.c 2006-01-11 20:29:26.000000000 +0200 +++ linux-2.4.33-imedia-patching/arch/sparc64/kernel/process.c 2006-01-26 15:19:42.000000000 +0200 @@ -54,8 +54,6 @@ return -EPERM; /* endless idle loop with no priority at all */ - current->nice = 20; - current->counter = -100; init_idle(); for (;;) { @@ -84,8 +82,6 @@ #define unidle_me() (cpu_data[current->processor].idle_volume = 0) int cpu_idle(void) { - current->nice = 20; - current->counter = -100; init_idle(); while(1) { diff -Nur linux-2.4.33-imedia/drivers/block/ll_rw_blk.c 
linux-2.4.33-imedia-patching/drivers/block/ll_rw_blk.c --- linux-2.4.33-imedia/drivers/block/ll_rw_blk.c 2004-11-17 13:54:21.000000000 +0200 +++ linux-2.4.33-imedia-patching/drivers/block/ll_rw_blk.c 2006-01-26 15:19:42.000000000 +0200 @@ -1328,6 +1328,7 @@ kstat.pgpgin += count; break; } + conditional_schedule(); } /** diff -Nur linux-2.4.33-imedia/drivers/char/drm-4.0/tdfx_drv.c linux-2.4.33-imedia-patching/drivers/char/drm-4.0/tdfx_drv.c --- linux-2.4.33-imedia/drivers/char/drm-4.0/tdfx_drv.c 2004-02-18 15:36:31.000000000 +0200 +++ linux-2.4.33-imedia-patching/drivers/char/drm-4.0/tdfx_drv.c 2006-01-26 15:19:42.000000000 +0200 @@ -554,7 +554,6 @@ lock.context, current->pid, j, dev->lock.lock_time, jiffies); current->state = TASK_INTERRUPTIBLE; - current->policy |= SCHED_YIELD; schedule_timeout(DRM_LOCK_SLICE-j); DRM_DEBUG("jiffies=%d\n", jiffies); } diff -Nur linux-2.4.33-imedia/drivers/char/mem.c linux-2.4.33-imedia-patching/drivers/char/mem.c --- linux-2.4.33-imedia/drivers/char/mem.c 2004-08-08 02:26:04.000000000 +0300 +++ linux-2.4.33-imedia-patching/drivers/char/mem.c 2006-01-26 15:19:42.000000000 +0200 @@ -401,7 +401,7 @@ if (count > size) count = size; - zap_page_range(mm, addr, count); + zap_page_range(mm, addr, count, 0); zeromap_page_range(addr, count, PAGE_COPY); size -= count; diff -Nur linux-2.4.33-imedia/drivers/char/mwave/mwavedd.c linux-2.4.33-imedia-patching/drivers/char/mwave/mwavedd.c --- linux-2.4.33-imedia/drivers/char/mwave/mwavedd.c 2003-06-13 17:51:33.000000000 +0300 +++ linux-2.4.33-imedia-patching/drivers/char/mwave/mwavedd.c 2006-01-26 15:19:42.000000000 +0200 @@ -279,7 +279,6 @@ pDrvData->IPCs[ipcnum].bIsHere = FALSE; pDrvData->IPCs[ipcnum].bIsEnabled = TRUE; #if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,0) - current->nice = -20; /* boost to provide priority timing */ #else current->priority = 0x28; /* boost to provide priority timing */ #endif diff -Nur linux-2.4.33-imedia/drivers/char/random.c linux-2.4.33-imedia-patching/drivers/char/random.c --- linux-2.4.33-imedia/drivers/char/random.c 2005-06-01 03:56:56.000000000 +0300 +++ linux-2.4.33-imedia-patching/drivers/char/random.c 2006-01-26 15:19:42.000000000 +0200 @@ -1378,6 +1378,11 @@ buf += i; ret += i; add_timer_randomness(&extract_timer_state, nbytes); +#if LOWLATENCY_NEEDED + /* This can happen in softirq's, but that's what we want */ + if (conditional_schedule_needed()) + break; +#endif } /* Wipe data just returned from memory */ diff -Nur linux-2.4.33-imedia/drivers/char/serial_txx927.c linux-2.4.33-imedia-patching/drivers/char/serial_txx927.c --- linux-2.4.33-imedia/drivers/char/serial_txx927.c 2005-01-19 16:09:51.000000000 +0200 +++ linux-2.4.33-imedia-patching/drivers/char/serial_txx927.c 2006-01-26 15:19:42.000000000 +0200 @@ -1526,7 +1526,6 @@ printk("cisr = %d (jiff=%lu)...", cisr, jiffies); #endif current->state = TASK_INTERRUPTIBLE; - current->counter = 0; /* make us low-priority */ schedule_timeout(char_time); if (signal_pending(current)) break; diff -Nur linux-2.4.33-imedia/drivers/i2c/i2c-algo-bit.c linux-2.4.33-imedia-patching/drivers/i2c/i2c-algo-bit.c --- linux-2.4.33-imedia/drivers/i2c/i2c-algo-bit.c 2005-04-04 04:42:19.000000000 +0300 +++ linux-2.4.33-imedia-patching/drivers/i2c/i2c-algo-bit.c 2006-01-26 15:19:42.000000000 +0200 @@ -367,6 +367,7 @@ return (retval<0)? retval : -EFAULT; /* got a better one ?? 
*/ } + conditional_schedule(); #if 0 /* from asm/delay.h */ __delay(adap->mdelay * (loops_per_sec / 1000) ); diff -Nur linux-2.4.33-imedia/drivers/i2c/i2c-core.c linux-2.4.33-imedia-patching/drivers/i2c/i2c-core.c --- linux-2.4.33-imedia/drivers/i2c/i2c-core.c 2005-06-01 03:56:56.000000000 +0300 +++ linux-2.4.33-imedia-patching/drivers/i2c/i2c-core.c 2006-01-26 15:19:42.000000000 +0200 @@ -724,6 +724,8 @@ { int ret; + conditional_schedule(); + if (adap->algo->master_xfer) { DEB2(printk(KERN_DEBUG "i2c-core.o: master_xfer: %s with %d msgs.\n", adap->name,num)); @@ -746,6 +748,8 @@ struct i2c_adapter *adap=client->adapter; struct i2c_msg msg; + conditional_schedule(); + if (client->adapter->algo->master_xfer) { msg.addr = client->addr; msg.flags = client->flags & I2C_M_TEN; @@ -775,6 +779,9 @@ struct i2c_adapter *adap=client->adapter; struct i2c_msg msg; int ret; + + conditional_schedule(); + if (client->adapter->algo->master_xfer) { msg.addr = client->addr; msg.flags = client->flags & I2C_M_TEN; diff -Nur linux-2.4.33-imedia/drivers/ieee1394/csr.c linux-2.4.33-imedia-patching/drivers/ieee1394/csr.c --- linux-2.4.33-imedia/drivers/ieee1394/csr.c 2004-02-18 15:36:31.000000000 +0200 +++ linux-2.4.33-imedia-patching/drivers/ieee1394/csr.c 2006-01-26 15:19:42.000000000 +0200 @@ -18,6 +18,7 @@ */ #include +#include #include /* needed for MODULE_PARM */ #include #include diff -Nur linux-2.4.33-imedia/drivers/md/md.c linux-2.4.33-imedia-patching/drivers/md/md.c --- linux-2.4.33-imedia/drivers/md/md.c 2005-11-16 21:12:54.000000000 +0200 +++ linux-2.4.33-imedia-patching/drivers/md/md.c 2006-01-26 15:19:42.000000000 +0200 @@ -2955,8 +2955,6 @@ * bdflush, otherwise bdflush will deadlock if there are too * many dirty RAID5 blocks. */ - current->policy = SCHED_OTHER; - current->nice = -20; md_unlock_kernel(); complete(thread->event); @@ -3480,11 +3478,6 @@ "(but not more than %d KB/sec) for reconstruction.\n", sysctl_speed_limit_max); - /* - * Resync has low priority. 
- */ - current->nice = 19; - is_mddev_idle(mddev); /* this also initializes IO event counters */ for (m = 0; m < SYNC_MARKS; m++) { mark[m] = jiffies; @@ -3562,16 +3555,13 @@ currspeed = (j-mddev->resync_mark_cnt)/2/((jiffies-mddev->resync_mark)/HZ +1) +1; if (currspeed > sysctl_speed_limit_min) { - current->nice = 19; - if ((currspeed > sysctl_speed_limit_max) || !is_mddev_idle(mddev)) { current->state = TASK_INTERRUPTIBLE; md_schedule_timeout(HZ/4); goto repeat; } - } else - current->nice = -20; + } } printk(KERN_INFO "md: md%d: sync done.\n",mdidx(mddev)); err = 0; diff -Nur linux-2.4.33-imedia/drivers/sound/sound_core.c linux-2.4.33-imedia-patching/drivers/sound/sound_core.c --- linux-2.4.33-imedia/drivers/sound/sound_core.c 2001-09-30 22:26:08.000000000 +0300 +++ linux-2.4.33-imedia-patching/drivers/sound/sound_core.c 2006-01-26 15:19:42.000000000 +0200 @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include diff -Nur linux-2.4.33-imedia/drivers/video/fbcon-cfb16.c linux-2.4.33-imedia-patching/drivers/video/fbcon-cfb16.c --- linux-2.4.33-imedia/drivers/video/fbcon-cfb16.c 2001-10-15 23:47:13.000000000 +0300 +++ linux-2.4.33-imedia-patching/drivers/video/fbcon-cfb16.c 2006-01-26 15:19:42.000000000 +0200 @@ -189,6 +189,7 @@ case 4: case 8: while (count--) { + conditional_schedule(); c = scr_readw(s++) & p->charmask; cdat = p->fontdata + c * fontheight(p); for (rows = fontheight(p), dest = dest0; rows--; dest += bytes) { @@ -206,6 +207,7 @@ case 12: case 16: while (count--) { + conditional_schedule(); c = scr_readw(s++) & p->charmask; cdat = p->fontdata + (c * fontheight(p) << 1); for (rows = fontheight(p), dest = dest0; rows--; dest += bytes) { diff -Nur linux-2.4.33-imedia/fs/adfs/map.c linux-2.4.33-imedia-patching/fs/adfs/map.c --- linux-2.4.33-imedia/fs/adfs/map.c 2001-10-25 23:53:53.000000000 +0300 +++ linux-2.4.33-imedia-patching/fs/adfs/map.c 2006-01-26 15:19:42.000000000 +0200 @@ -12,6 +12,7 @@ #include #include #include +#include #include "adfs.h" diff -Nur linux-2.4.33-imedia/fs/binfmt_elf.c linux-2.4.33-imedia-patching/fs/binfmt_elf.c --- linux-2.4.33-imedia/fs/binfmt_elf.c 2006-01-11 20:29:27.000000000 +0200 +++ linux-2.4.33-imedia-patching/fs/binfmt_elf.c 2006-01-26 15:19:42.000000000 +0200 @@ -1267,7 +1267,7 @@ psinfo.pr_state = i; psinfo.pr_sname = (i < 0 || i > 5) ? '.' 
: "RSDZTD"[i]; psinfo.pr_zomb = psinfo.pr_sname == 'Z'; - psinfo.pr_nice = current->nice; + psinfo.pr_nice = task_nice(current); psinfo.pr_flag = current->flags; psinfo.pr_uid = NEW_TO_OLD_UID(current->uid); psinfo.pr_gid = NEW_TO_OLD_GID(current->gid); diff -Nur linux-2.4.33-imedia/fs/buffer.c linux-2.4.33-imedia-patching/fs/buffer.c --- linux-2.4.33-imedia/fs/buffer.c 2005-11-16 21:12:54.000000000 +0200 +++ linux-2.4.33-imedia-patching/fs/buffer.c 2006-01-26 15:19:42.000000000 +0200 @@ -124,7 +124,7 @@ int dummy5; /* unused */ } b_un; unsigned int data[N_PARAM]; -} bdf_prm = {{30, 500, 0, 0, 5*HZ, 30*HZ, 60, 20, 0}}; +} bdf_prm = {{50, 500, 0, 0, 5*HZ, 30*HZ, 60, 20, 0}}; /* These are the min and max parameter values that we will allow to be assigned */ int bdflush_min[N_PARAM] = { 0, 1, 0, 0, 0, 1*HZ, 0, 0, 0}; @@ -261,8 +261,10 @@ if (dev != NODEV && bh->b_dev != dev) continue; - if (test_and_set_bit(BH_Lock, &bh->b_state)) + if (test_and_set_bit(BH_Lock, &bh->b_state)) { + __refile_buffer(bh); continue; + } if (buffer_delay(bh)) { if (write_buffer_delay(bh)) { if (count) @@ -278,6 +280,7 @@ spin_unlock(&lru_list_lock); write_locked_buffers(array, count); + conditional_schedule(); return -EAGAIN; } unlock_buffer(bh); @@ -311,12 +314,19 @@ struct buffer_head * next; int nr; - next = lru_list[index]; nr = nr_buffers_type[index]; +repeat: + next = lru_list[index]; while (next && --nr >= 0) { struct buffer_head *bh = next; next = bh->b_next_free; + if (conditional_schedule_needed()) { + spin_unlock(&lru_list_lock); + unconditional_schedule(); + spin_lock(&lru_list_lock); + goto repeat; + } if (!buffer_locked(bh)) { if (refile) __refile_buffer(bh); @@ -324,7 +334,6 @@ } if (dev != NODEV && bh->b_dev != dev) continue; - get_bh(bh); spin_unlock(&lru_list_lock); wait_on_buffer (bh); @@ -357,6 +366,15 @@ { int err = 0; +#if LOWLATENCY_NEEDED + /* + * syncing devA when there are lots of buffers dirty against + * devB is expensive. + */ + if (enable_lowlatency) + dev = NODEV; +#endif + /* One pass for no-wait, three for wait: * 0) write out all dirty, unlocked buffers; * 1) wait for all dirty locked buffers; @@ -724,6 +742,7 @@ int i, nlist, slept; struct buffer_head * bh, * bh_next; kdev_t dev = to_kdev_t(bdev->bd_dev); /* will become bdev */ + int lolat_retry = 0; retry: slept = 0; @@ -741,6 +760,17 @@ /* Not hashed? 
*/ if (!bh->b_pprev) continue; + + if (lolat_retry < 10 && conditional_schedule_needed()) { + get_bh(bh); + spin_unlock(&lru_list_lock); + unconditional_schedule(); + spin_lock(&lru_list_lock); + put_bh(bh); + slept = 1; + lolat_retry++; + } + if (buffer_locked(bh)) { get_bh(bh); spin_unlock(&lru_list_lock); @@ -892,12 +922,18 @@ struct buffer_head *bh; struct list_head tmp; int err = 0, err2; - + DEFINE_RESCHED_COUNT; + INIT_LIST_HEAD(&tmp); - +repeat: spin_lock(&lru_list_lock); while (!list_empty(list)) { + if (conditional_schedule_needed()) { + spin_unlock(&lru_list_lock); + unconditional_schedule(); + goto repeat; + } bh = BH_ENTRY(list->next); list_del(&bh->b_inode_buffers); if (!buffer_dirty(bh) && !buffer_locked(bh)) @@ -922,8 +958,18 @@ spin_lock(&lru_list_lock); } } + if (TEST_RESCHED_COUNT(32)) { + RESET_RESCHED_COUNT(); + if (conditional_schedule_needed()) { + spin_unlock(&lru_list_lock); + unconditional_schedule(); + spin_lock(&lru_list_lock); + } + } } + RESET_RESCHED_COUNT(); + while (!list_empty(&tmp)) { bh = BH_ENTRY(tmp.prev); remove_inode_queue(bh); @@ -933,6 +979,7 @@ if (!buffer_uptodate(bh)) err = -EIO; brelse(bh); + conditional_schedule(); spin_lock(&lru_list_lock); } @@ -960,11 +1007,20 @@ struct buffer_head *bh; struct list_head *p; int err = 0; + DEFINE_RESCHED_COUNT; +repeat: + conditional_schedule(); spin_lock(&lru_list_lock); - repeat: list_for_each_prev(p, list) { + if (TEST_RESCHED_COUNT(32)) { + RESET_RESCHED_COUNT(); + if (conditional_schedule_needed()) { + spin_unlock(&lru_list_lock); + goto repeat; + } + } bh = BH_ENTRY(p); if (buffer_locked(bh)) { get_bh(bh); @@ -973,7 +1029,6 @@ if (!buffer_uptodate(bh)) err = -EIO; brelse(bh); - spin_lock(&lru_list_lock); goto repeat; } } @@ -990,12 +1045,24 @@ void invalidate_inode_buffers(struct inode *inode) { struct list_head * entry; - + +repeat: + conditional_schedule(); spin_lock(&lru_list_lock); - while ((entry = inode->i_dirty_buffers.next) != &inode->i_dirty_buffers) + while ((entry = inode->i_dirty_buffers.next) != &inode->i_dirty_buffers) { + if (conditional_schedule_needed()) { + spin_unlock(&lru_list_lock); + goto repeat; + } remove_inode_queue(BH_ENTRY(entry)); - while ((entry = inode->i_dirty_data_buffers.next) != &inode->i_dirty_data_buffers) + } + while ((entry = inode->i_dirty_data_buffers.next) != &inode->i_dirty_data_buffers) { + if (conditional_schedule_needed()) { + spin_unlock(&lru_list_lock); + goto repeat; + } remove_inode_queue(BH_ENTRY(entry)); + } spin_unlock(&lru_list_lock); } @@ -1018,6 +1085,7 @@ bh = get_hash_table(dev, block, size); if (bh) { touch_buffer(bh); + conditional_schedule(); return bh; } diff -Nur linux-2.4.33-imedia/fs/dcache.c linux-2.4.33-imedia-patching/fs/dcache.c --- linux-2.4.33-imedia/fs/dcache.c 2006-01-11 20:29:27.000000000 +0200 +++ linux-2.4.33-imedia-patching/fs/dcache.c 2006-01-26 15:19:42.000000000 +0200 @@ -324,11 +324,23 @@ void prune_dcache(int count) { + DEFINE_RESCHED_COUNT; + +redo: spin_lock(&dcache_lock); for (;;) { struct dentry *dentry; struct list_head *tmp; + if (TEST_RESCHED_COUNT(100)) { + RESET_RESCHED_COUNT(); + if (conditional_schedule_needed()) { + spin_unlock(&dcache_lock); + unconditional_schedule(); + goto redo; + } + } + tmp = dentry_unused.prev; if (tmp == &dentry_unused) @@ -483,6 +495,7 @@ struct dentry *this_parent = parent; struct list_head *next; int found = 0; + DEFINE_RESCHED_COUNT; spin_lock(&dcache_lock); repeat: @@ -497,6 +510,13 @@ list_add(&dentry->d_lru, dentry_unused.prev); found++; } + + if (TEST_RESCHED_COUNT(500) && found 
> 10) { + if (conditional_schedule_needed()) /* Typically sys_rmdir() */ + goto out; + RESET_RESCHED_COUNT(); + } + /* * Descend a level if the d_subdirs list is non-empty. */ @@ -521,6 +541,7 @@ #endif goto resume; } +out: spin_unlock(&dcache_lock); return found; } @@ -536,8 +557,10 @@ { int found; - while ((found = select_parent(parent)) != 0) + while ((found = select_parent(parent)) != 0) { prune_dcache(found); + conditional_schedule(); /* Typically sys_rmdir() */ + } } /* @@ -569,7 +592,7 @@ if (!(gfp_mask & __GFP_FS)) return 0; - count = dentry_stat.nr_unused / priority; + count = dentry_stat.nr_unused * priority / 100; prune_dcache(count); return kmem_cache_shrink(dentry_cache); diff -Nur linux-2.4.33-imedia/fs/exec.c linux-2.4.33-imedia-patching/fs/exec.c --- linux-2.4.33-imedia/fs/exec.c 2005-01-19 16:10:10.000000000 +0200 +++ linux-2.4.33-imedia-patching/fs/exec.c 2006-01-26 15:19:42.000000000 +0200 @@ -245,7 +245,7 @@ memset(kaddr+offset+len, 0, PAGE_SIZE-offset-len); } - err = copy_from_user(kaddr+offset, str, bytes_to_copy); + err = ll_copy_from_user(kaddr+offset, str, bytes_to_copy); if (err) { ret = -EFAULT; goto out; @@ -459,8 +459,8 @@ active_mm = current->active_mm; current->mm = mm; current->active_mm = mm; - task_unlock(current); activate_mm(active_mm, mm); + task_unlock(current); mm_release(); if (old_mm) { if (active_mm != old_mm) BUG(); diff -Nur linux-2.4.33-imedia/fs/ext2/dir.c linux-2.4.33-imedia-patching/fs/ext2/dir.c --- linux-2.4.33-imedia/fs/ext2/dir.c 2005-04-04 04:42:20.000000000 +0300 +++ linux-2.4.33-imedia-patching/fs/ext2/dir.c 2006-01-26 15:19:42.000000000 +0200 @@ -153,6 +153,7 @@ struct address_space *mapping = dir->i_mapping; struct page *page = read_cache_page(mapping, n, (filler_t*)mapping->a_ops->readpage, NULL); + conditional_schedule(); /* Scanning large directories */ if (!IS_ERR(page)) { wait_on_page(page); kmap(page); diff -Nur linux-2.4.33-imedia/fs/ext2/inode.c linux-2.4.33-imedia-patching/fs/ext2/inode.c --- linux-2.4.33-imedia/fs/ext2/inode.c 2004-08-08 02:26:05.000000000 +0300 +++ linux-2.4.33-imedia-patching/fs/ext2/inode.c 2006-01-26 15:19:42.000000000 +0200 @@ -727,8 +727,13 @@ { unsigned long block_to_free = 0, count = 0; unsigned long nr; + DEFINE_RESCHED_COUNT; for ( ; p < q ; p++) { + if (TEST_RESCHED_COUNT(32)) { + RESET_RESCHED_COUNT(); + conditional_schedule(); + } nr = le32_to_cpu(*p); if (nr) { *p = 0; @@ -771,6 +776,7 @@ if (depth--) { int addr_per_block = EXT2_ADDR_PER_BLOCK(inode->i_sb); for ( ; p < q ; p++) { + conditional_schedule(); /* Deleting large files */ nr = le32_to_cpu(*p); if (!nr) continue; diff -Nur linux-2.4.33-imedia/fs/ext3/balloc.c linux-2.4.33-imedia-patching/fs/ext3/balloc.c --- linux-2.4.33-imedia/fs/ext3/balloc.c 2003-06-13 17:51:37.000000000 +0300 +++ linux-2.4.33-imedia-patching/fs/ext3/balloc.c 2006-01-26 15:19:42.000000000 +0200 @@ -363,6 +363,9 @@ } } #endif + /* superblock lock is held, so this is safe */ + conditional_schedule(); + BUFFER_TRACE(bitmap_bh, "clear bit"); if (!ext3_clear_bit (bit + i, bitmap_bh->b_data)) { ext3_error(sb, __FUNCTION__, diff -Nur linux-2.4.33-imedia/fs/ext3/inode.c linux-2.4.33-imedia-patching/fs/ext3/inode.c --- linux-2.4.33-imedia/fs/ext3/inode.c 2004-08-08 02:26:05.000000000 +0300 +++ linux-2.4.33-imedia-patching/fs/ext3/inode.c 2006-01-26 15:19:42.000000000 +0200 @@ -930,6 +930,8 @@ prev_blocks = inode->i_blocks; + conditional_schedule(); /* Reading large directories */ + bh = ext3_getblk (handle, inode, block, create, err); if (!bh) return bh; @@ -1633,6 +1635,7 
@@ */ for (p = first; p < last; p++) { u32 nr = le32_to_cpu(*p); + conditional_schedule(); if (nr) { struct buffer_head *bh; @@ -1687,6 +1690,7 @@ } for (p = first; p < last; p++) { + conditional_schedule(); nr = le32_to_cpu(*p); if (nr) { /* accumulate blocks to free if they're contiguous */ diff -Nur linux-2.4.33-imedia/fs/ext3/namei.c linux-2.4.33-imedia-patching/fs/ext3/namei.c --- linux-2.4.33-imedia/fs/ext3/namei.c 2003-06-13 17:51:37.000000000 +0300 +++ linux-2.4.33-imedia-patching/fs/ext3/namei.c 2006-01-26 15:19:42.000000000 +0200 @@ -157,6 +157,7 @@ if ((bh = bh_use[ra_ptr++]) == NULL) goto next; wait_on_buffer(bh); + conditional_schedule(); if (!buffer_uptodate(bh)) { /* read error, skip block & hope for the best */ brelse(bh); diff -Nur linux-2.4.33-imedia/fs/fat/cache.c linux-2.4.33-imedia-patching/fs/fat/cache.c --- linux-2.4.33-imedia/fs/fat/cache.c 2001-10-12 23:48:42.000000000 +0300 +++ linux-2.4.33-imedia-patching/fs/fat/cache.c 2006-01-26 15:19:42.000000000 +0200 @@ -14,6 +14,7 @@ #include #include #include +#include #if 0 # define PRINTK(x) printk x diff -Nur linux-2.4.33-imedia/fs/inode.c linux-2.4.33-imedia-patching/fs/inode.c --- linux-2.4.33-imedia/fs/inode.c 2005-11-16 21:12:54.000000000 +0200 +++ linux-2.4.33-imedia-patching/fs/inode.c 2006-01-26 15:19:42.000000000 +0200 @@ -348,6 +348,8 @@ filemap_fdatawait(inode->i_mapping); + conditional_schedule(); + spin_lock(&inode_lock); inode->i_state &= ~I_LOCK; __refile_inode(inode); @@ -650,6 +652,7 @@ while (!list_empty(head)) { struct inode *inode; + conditional_schedule(); inode = list_entry(head->next, struct inode, i_list); list_del(&inode->i_list); @@ -686,9 +689,22 @@ if (tmp == head) break; inode = list_entry(tmp, struct inode, i_list); + + if (conditional_schedule_needed()) { + atomic_inc(&inode->i_count); + spin_unlock(&inode_lock); + unconditional_schedule(); + spin_lock(&inode_lock); + atomic_dec(&inode->i_count); + } + if (inode->i_sb != sb) continue; + atomic_inc(&inode->i_count); + spin_unlock(&inode_lock); invalidate_inode_buffers(inode); + spin_lock(&inode_lock); + atomic_dec(&inode->i_count); if (!atomic_read(&inode->i_count)) { list_del_init(&inode->i_hash); list_del(&inode->i_list); @@ -798,15 +814,28 @@ int avg_pages; #endif struct inode * inode; + int nr_to_scan = inodes_stat.nr_unused; +resume: spin_lock(&inode_lock); - count = 0; entry = inode_unused.prev; - while (entry != &inode_unused) - { + while (entry != &inode_unused && nr_to_scan--) { struct list_head *tmp = entry; + if (conditional_schedule_needed()) { + /* + * Need to drop the lock. Reposition + * the list head so we start here next time. + * This can corrupt the LRU nature of the + * unused list, but this isn't very important. + */ + list_del(&inode_unused); + list_add(&inode_unused, entry); + spin_unlock(&inode_lock); + unconditional_schedule(); + goto resume; + } entry = entry->prev; inode = INODE(tmp); if (inode->i_state & (I_FREEING|I_CLEAR|I_LOCK)) @@ -914,7 +943,7 @@ if (!(gfp_mask & __GFP_FS)) return 0; - count = inodes_stat.nr_unused / priority; + count = inodes_stat.nr_unused * priority / 100; prune_icache(count); return kmem_cache_shrink(inode_cachep); @@ -1008,6 +1037,8 @@ if (inode) { struct inode * old; + conditional_schedule(); /* sync_old_buffers */ + spin_lock(&inode_lock); /* We released the lock, so.. 
old = find_inode(sb, ino, head, find_actor, opaque); diff -Nur linux-2.4.33-imedia/fs/jbd/checkpoint.c linux-2.4.33-imedia-patching/fs/jbd/checkpoint.c --- linux-2.4.33-imedia/fs/jbd/checkpoint.c 2002-11-29 01:53:15.000000000 +0200 +++ linux-2.4.33-imedia-patching/fs/jbd/checkpoint.c 2006-01-26 15:19:42.000000000 +0200 @@ -431,7 +431,11 @@ { transaction_t *transaction, *last_transaction, *next_transaction; int ret = 0; + int ll_retries = 4; /* lowlatency addition */ +restart: + if (ll_retries-- == 0) + goto out; transaction = journal->j_checkpoint_transactions; if (transaction == 0) goto out; @@ -451,6 +455,12 @@ jh = next_jh; next_jh = jh->b_cpnext; ret += __try_to_free_cp_buf(jh); + if (conditional_schedule_needed()) { + spin_unlock(&journal_datalist_lock); + unconditional_schedule(); + spin_lock(&journal_datalist_lock); + goto restart; + } } while (jh != last_jh); } } while (transaction != last_transaction); diff -Nur linux-2.4.33-imedia/fs/jbd/commit.c linux-2.4.33-imedia-patching/fs/jbd/commit.c --- linux-2.4.33-imedia/fs/jbd/commit.c 2004-02-18 15:36:31.000000000 +0200 +++ linux-2.4.33-imedia-patching/fs/jbd/commit.c 2006-01-26 15:19:42.000000000 +0200 @@ -257,6 +257,16 @@ __journal_remove_journal_head(bh); refile_buffer(bh); release_buffer_page(bh); + if (conditional_schedule_needed()) { + if (commit_transaction->t_sync_datalist) + commit_transaction->t_sync_datalist = + next_jh; + if (bufs) + break; + spin_unlock(&journal_datalist_lock); + unconditional_schedule(); + goto write_out_data; + } } } if (bufs == ARRAY_SIZE(wbuf)) { @@ -280,8 +290,7 @@ journal_brelse_array(wbuf, bufs); lock_journal(journal); spin_lock(&journal_datalist_lock); - if (bufs) - goto write_out_data_locked; + goto write_out_data_locked; } /* @@ -317,6 +326,15 @@ */ while ((jh = commit_transaction->t_async_datalist)) { struct buffer_head *bh = jh2bh(jh); + + if (conditional_schedule_needed()) { + spin_unlock(&journal_datalist_lock); + unlock_journal(journal); + unconditional_schedule(); + lock_journal(journal); + spin_lock(&journal_datalist_lock); + continue; /* List may have changed */ + } if (__buffer_state(bh, Freed)) { BUFFER_TRACE(bh, "Cleaning freed buffer"); clear_bit(BH_Freed, &bh->b_state); @@ -536,6 +554,8 @@ wait_for_iobuf: while (commit_transaction->t_iobuf_list != NULL) { struct buffer_head *bh; + + conditional_schedule(); jh = commit_transaction->t_iobuf_list->b_tprev; bh = jh2bh(jh); if (buffer_locked(bh)) { @@ -695,6 +715,8 @@ struct buffer_head *bh; int was_freed = 0; + conditional_schedule(); /* journal is locked */ + jh = commit_transaction->t_forget; J_ASSERT_JH(jh, jh->b_transaction == commit_transaction || jh->b_transaction == journal->j_running_transaction); diff -Nur linux-2.4.33-imedia/fs/jffs2/background.c linux-2.4.33-imedia-patching/fs/jffs2/background.c --- linux-2.4.33-imedia/fs/jffs2/background.c 2001-10-25 10:07:09.000000000 +0300 +++ linux-2.4.33-imedia-patching/fs/jffs2/background.c 2006-01-26 15:19:42.000000000 +0200 @@ -106,9 +106,6 @@ sprintf(current->comm, "jffs2_gcd_mtd%d", c->mtd->index); - /* FIXME in the 2.2 backport */ - current->nice = 10; - for (;;) { spin_lock_irq(&current->sigmask_lock); siginitsetinv (&current->blocked, sigmask(SIGHUP) | sigmask(SIGKILL) | sigmask(SIGSTOP) | sigmask(SIGCONT)); diff -Nur linux-2.4.33-imedia/fs/nfsd/nfssvc.c linux-2.4.33-imedia-patching/fs/nfsd/nfssvc.c --- linux-2.4.33-imedia/fs/nfsd/nfssvc.c 2002-11-29 01:53:15.000000000 +0200 +++ linux-2.4.33-imedia-patching/fs/nfsd/nfssvc.c 2006-01-26 15:19:42.000000000 +0200 @@ -250,6 +250,7 @@
svc_exit_thread(rqstp); /* Release module */ + unlock_kernel(); MOD_DEC_USE_COUNT; } diff -Nur linux-2.4.33-imedia/fs/nls/nls_base.c linux-2.4.33-imedia-patching/fs/nls/nls_base.c --- linux-2.4.33-imedia/fs/nls/nls_base.c 2002-08-03 03:39:45.000000000 +0300 +++ linux-2.4.33-imedia-patching/fs/nls/nls_base.c 2006-01-26 15:19:42.000000000 +0200 @@ -18,6 +18,7 @@ #ifdef CONFIG_KMOD #include #endif +#include #include static struct nls_table *tables; diff -Nur linux-2.4.33-imedia/fs/proc/array.c linux-2.4.33-imedia-patching/fs/proc/array.c --- linux-2.4.33-imedia/fs/proc/array.c 2005-01-19 16:10:11.000000000 +0200 +++ linux-2.4.33-imedia-patching/fs/proc/array.c 2006-01-26 15:19:42.000000000 +0200 @@ -345,9 +345,8 @@ /* scale priority and nice values from timeslices to -20..20 */ /* to make it look like a "normal" Unix priority/nice value */ - priority = task->counter; - priority = 20 - (priority * 10 + DEF_COUNTER / 2) / DEF_COUNTER; - nice = task->nice; + priority = task_prio(task); + nice = task_nice(task); read_lock(&tasklist_lock); ppid = task->pid ? task->p_opptr->pid : 0; @@ -397,7 +396,7 @@ task->nswap, task->cnswap, task->exit_signal, - task->processor); + task->cpu); if(mm) mmput(mm); return res; @@ -422,9 +421,11 @@ if (end > PMD_SIZE) end = PMD_SIZE; do { - pte_t page = *pte; + pte_t page; struct page *ptpage; + conditional_schedule(); /* For `top' and `ps' */ + page = *pte; address += PAGE_SIZE; pte++; if (pte_none(page)) diff -Nur linux-2.4.33-imedia/fs/proc/generic.c linux-2.4.33-imedia-patching/fs/proc/generic.c --- linux-2.4.33-imedia/fs/proc/generic.c 2005-01-19 16:10:11.000000000 +0200 +++ linux-2.4.33-imedia-patching/fs/proc/generic.c 2006-01-26 15:19:42.000000000 +0200 @@ -101,6 +101,8 @@ retval = n; break; } + + conditional_schedule(); /* Some /proc files are large */ /* This is a hack to allow mangling of file pos independent * of actual bytes read. Simply place the data at page, diff -Nur linux-2.4.33-imedia/fs/proc/proc_misc.c linux-2.4.33-imedia-patching/fs/proc/proc_misc.c --- linux-2.4.33-imedia/fs/proc/proc_misc.c 2004-08-08 02:26:06.000000000 +0300 +++ linux-2.4.33-imedia-patching/fs/proc/proc_misc.c 2006-01-26 15:19:42.000000000 +0200 @@ -109,11 +109,11 @@ a = avenrun[0] + (FIXED_1/200); b = avenrun[1] + (FIXED_1/200); c = avenrun[2] + (FIXED_1/200); - len = sprintf(page,"%d.%02d %d.%02d %d.%02d %d/%d %d\n", + len = sprintf(page,"%d.%02d %d.%02d %d.%02d %ld/%d %d\n", LOAD_INT(a), LOAD_FRAC(a), LOAD_INT(b), LOAD_FRAC(b), LOAD_INT(c), LOAD_FRAC(c), - nr_running, nr_threads, last_pid); + nr_running(), nr_threads, last_pid); return proc_calc_metrics(page, start, off, count, eof, len); } @@ -125,7 +125,7 @@ int len; uptime = jiffies; - idle = init_tasks[0]->times.tms_utime + init_tasks[0]->times.tms_stime; + idle = init_task.times.tms_utime + init_task.times.tms_stime; /* The formula for the fraction parts really is ((t * 100) / HZ) % 100, but that would overflow about every five days at HZ == 100. 
@@ -374,10 +374,10 @@ } proc_sprintf(page, &off, &len, - "\nctxt %u\n" + "\nctxt %lu\n" "btime %lu\n" "processes %lu\n", - kstat.context_swtch, + nr_context_switches(), xtime.tv_sec - jif / HZ, total_forks); diff -Nur linux-2.4.33-imedia/fs/reiserfs/buffer2.c linux-2.4.33-imedia-patching/fs/reiserfs/buffer2.c --- linux-2.4.33-imedia/fs/reiserfs/buffer2.c 2003-08-25 14:44:43.000000000 +0300 +++ linux-2.4.33-imedia-patching/fs/reiserfs/buffer2.c 2006-01-26 15:19:42.000000000 +0200 @@ -40,6 +40,8 @@ } } +extern unsigned long nr_context_switches(void); + /* * reiserfs_bread() reads a specified block and returns the buffer that contains * it. It returns NULL if the block was unreadable. @@ -51,11 +53,12 @@ struct buffer_head * reiserfs_bread (struct super_block *super, int n_block, int n_size) { struct buffer_head *result; - PROC_EXP( unsigned int ctx_switches = kstat.context_swtch ); + PROC_EXP( unsigned int ctx_switches = nr_context_switches() ); result = bread (super -> s_dev, n_block, n_size); + conditional_schedule(); PROC_INFO_INC( super, breads ); - PROC_EXP( if( kstat.context_swtch != ctx_switches ) + PROC_EXP( if( nr_context_switches() != ctx_switches ) PROC_INFO_INC( super, bread_miss ) ); return result; } diff -Nur linux-2.4.33-imedia/fs/reiserfs/journal.c linux-2.4.33-imedia-patching/fs/reiserfs/journal.c --- linux-2.4.33-imedia/fs/reiserfs/journal.c 2004-08-08 02:26:06.000000000 +0300 +++ linux-2.4.33-imedia-patching/fs/reiserfs/journal.c 2006-01-26 15:19:42.000000000 +0200 @@ -577,6 +577,7 @@ /* lock the current transaction */ inline static void lock_journal(struct super_block *p_s_sb) { PROC_INFO_INC( p_s_sb, journal.lock_journal ); + conditional_schedule(); while(atomic_read(&(SB_JOURNAL(p_s_sb)->j_wlock)) > 0) { PROC_INFO_INC( p_s_sb, journal.lock_journal_wait ); sleep_on(&(SB_JOURNAL(p_s_sb)->j_wait)) ; @@ -707,6 +708,7 @@ mark_buffer_dirty(tbh) ; } ll_rw_block(WRITE, 1, &tbh) ; + conditional_schedule(); count++ ; put_bh(tbh) ; /* once for our get_hash */ } @@ -836,6 +838,7 @@ set_bit(BH_Dirty, &(SB_JOURNAL(p_s_sb)->j_header_bh->b_state)) ; ll_rw_block(WRITE, 1, &(SB_JOURNAL(p_s_sb)->j_header_bh)) ; wait_on_buffer((SB_JOURNAL(p_s_sb)->j_header_bh)) ; + conditional_schedule(); if (!buffer_uptodate(SB_JOURNAL(p_s_sb)->j_header_bh)) { reiserfs_warning( p_s_sb, "reiserfs: journal-837: IO error during journal replay\n" ); return -EIO ; @@ -2363,6 +2366,7 @@ } int journal_begin(struct reiserfs_transaction_handle *th, struct super_block * p_s_sb, unsigned long nblocks) { + conditional_schedule(); return do_journal_begin_r(th, p_s_sb, nblocks, 0) ; } @@ -2503,6 +2507,7 @@ } int journal_end(struct reiserfs_transaction_handle *th, struct super_block *p_s_sb, unsigned long nblocks) { + conditional_schedule(); return do_journal_end(th, p_s_sb, nblocks, 0) ; } @@ -2974,6 +2979,7 @@ RFALSE( buffer_locked(bh) && cur_tb != NULL, "waiting while do_balance was running\n") ; wait_on_buffer(bh) ; + conditional_schedule(); } PROC_INFO_INC( p_s_sb, journal.prepare_retry ); retry_count++ ; @@ -3148,6 +3154,7 @@ /* copy all the real blocks into log area. 
dirty log blocks */ if (test_bit(BH_JDirty, &cn->bh->b_state)) { struct buffer_head *tmp_bh ; + conditional_schedule(); tmp_bh = journal_getblk(p_s_sb, SB_ONDISK_JOURNAL_1st_BLOCK(p_s_sb) + ((cur_write_start + jindex) % SB_ONDISK_JOURNAL_SIZE(p_s_sb))) ; mark_buffer_uptodate(tmp_bh, 1) ; diff -Nur linux-2.4.33-imedia/fs/reiserfs/stree.c linux-2.4.33-imedia-patching/fs/reiserfs/stree.c --- linux-2.4.33-imedia/fs/reiserfs/stree.c 2003-08-25 14:44:43.000000000 +0300 +++ linux-2.4.33-imedia-patching/fs/reiserfs/stree.c 2006-01-26 15:19:42.000000000 +0200 @@ -652,9 +652,8 @@ stop at leaf level - set to DISK_LEAF_NODE_LEVEL */ ) { - int n_block_number = SB_ROOT_BLOCK (p_s_sb), - expected_level = SB_TREE_HEIGHT (p_s_sb), - n_block_size = p_s_sb->s_blocksize; + int n_block_number, expected_level; + int n_block_size = p_s_sb->s_blocksize; struct buffer_head * p_s_bh; struct path_element * p_s_last_element; int n_node_level, n_retval; @@ -666,7 +665,8 @@ #endif PROC_INFO_INC( p_s_sb, search_by_key ); - + conditional_schedule(); + /* As we add each node to a path we increase its count. This means that we must be careful to release all nodes in a path before we either discard the path struct or re-use the path struct, as we do here. */ @@ -678,6 +678,8 @@ /* With each iteration of this loop we search through the items in the current node, and calculate the next current node(next path element) for the next iteration of this loop.. */ + n_block_number = SB_ROOT_BLOCK (p_s_sb); + expected_level = SB_TREE_HEIGHT (p_s_sb); while ( 1 ) { #ifdef CONFIG_REISERFS_CHECK @@ -1104,6 +1106,8 @@ for (n_counter = *p_n_removed; n_counter < n_unfm_number; n_counter++, p_n_unfm_pointer-- ) { + conditional_schedule(); + if (item_moved (&s_ih, p_s_path)) { need_research = 1 ; break; diff -Nur linux-2.4.33-imedia/include/asm-alpha/bitops.h linux-2.4.33-imedia-patching/include/asm-alpha/bitops.h --- linux-2.4.33-imedia/include/asm-alpha/bitops.h 2001-10-13 01:35:54.000000000 +0300 +++ linux-2.4.33-imedia-patching/include/asm-alpha/bitops.h 2006-01-26 15:19:42.000000000 +0200 @@ -3,6 +3,7 @@ #include #include +#include /* * Copyright 1994, Linus Torvalds. @@ -60,25 +61,25 @@ __asm__ __volatile__( "1: ldl_l %0,%3\n" - " and %0,%2,%0\n" + " bic %0,%2,%0\n" " stl_c %0,%1\n" " beq %0,2f\n" ".subsection 2\n" "2: br 1b\n" ".previous" :"=&r" (temp), "=m" (*m) - :"Ir" (~(1UL << (nr & 31))), "m" (*m)); + :"Ir" (1UL << (nr & 31)), "m" (*m)); } /* * WARNING: non atomic version. */ static __inline__ void -__change_bit(unsigned long nr, volatile void * addr) +__clear_bit(unsigned long nr, volatile void * addr) { int *m = ((int *) addr) + (nr >> 5); - *m ^= 1 << (nr & 31); + *m &= ~(1 << (nr & 31)); } static inline void @@ -99,6 +100,17 @@ :"Ir" (1UL << (nr & 31)), "m" (*m)); } +/* + * WARNING: non atomic version. + */ +static __inline__ void +__change_bit(unsigned long nr, volatile void * addr) +{ + int *m = ((int *) addr) + (nr >> 5); + + *m ^= 1 << (nr & 31); +} + static inline int test_and_set_bit(unsigned long nr, volatile void *addr) { @@ -181,20 +193,6 @@ return (old & mask) != 0; } -/* - * WARNING: non atomic version. - */ -static __inline__ int -__test_and_change_bit(unsigned long nr, volatile void * addr) -{ - unsigned long mask = 1 << (nr & 0x1f); - int *m = ((int *) addr) + (nr >> 5); - int old = *m; - - *m = old ^ mask; - return (old & mask) != 0; -} - static inline int test_and_change_bit(unsigned long nr, volatile void * addr) { @@ -220,6 +218,20 @@ return oldbit != 0; } +/* + * WARNING: non atomic version. 
+ */ +static __inline__ int +__test_and_change_bit(unsigned long nr, volatile void * addr) +{ + unsigned long mask = 1 << (nr & 0x1f); + int *m = ((int *) addr) + (nr >> 5); + int old = *m; + + *m = old ^ mask; + return (old & mask) != 0; +} + static inline int test_bit(int nr, volatile void * addr) { @@ -235,12 +247,15 @@ */ static inline unsigned long ffz_b(unsigned long x) { - unsigned long sum = 0; + unsigned long sum, x1, x2, x4; x = ~x & -~x; /* set first 0 bit, clear others */ - if (x & 0xF0) sum += 4; - if (x & 0xCC) sum += 2; - if (x & 0xAA) sum += 1; + x1 = x & 0xAA; + x2 = x & 0xCC; + x4 = x & 0xF0; + sum = x2 ? 2 : 0; + sum += (x4 != 0) * 4; + sum += (x1 != 0); return sum; } @@ -257,24 +272,46 @@ __asm__("cmpbge %1,%2,%0" : "=r"(bits) : "r"(word), "r"(~0UL)); qofs = ffz_b(bits); - __asm__("extbl %1,%2,%0" : "=r"(bits) : "r"(word), "r"(qofs)); + bits = __kernel_extbl(word, qofs); bofs = ffz_b(bits); return qofs*8 + bofs; #endif } +/* + * __ffs = Find First set bit in word. Undefined if no set bit exists. + */ +static inline unsigned long __ffs(unsigned long word) +{ +#if defined(__alpha_cix__) && defined(__alpha_fix__) + /* Whee. EV67 can calculate it directly. */ + unsigned long result; + __asm__("cttz %1,%0" : "=r"(result) : "r"(word)); + return result; +#else + unsigned long bits, qofs, bofs; + + __asm__("cmpbge $31,%1,%0" : "=r"(bits) : "r"(word)); + qofs = ffz_b(bits); + bits = __kernel_extbl(word, qofs); + bofs = ffz_b(~bits); + + return qofs*8 + bofs; +#endif +} + #ifdef __KERNEL__ /* * ffs: find first bit set. This is defined the same way as * the libc and compiler builtin ffs routines, therefore - * differs in spirit from the above ffz (man ffs). + * differs in spirit from the above __ffs. */ static inline int ffs(int word) { - int result = ffz(~word); + int result = __ffs(word); return word ? result+1 : 0; } @@ -316,6 +353,14 @@ #define hweight16(x) hweight64((x) & 0xfffful) #define hweight8(x) hweight64((x) & 0xfful) #else +static inline unsigned long hweight64(unsigned long w) +{ + unsigned long result; + for (result = 0; w ; w >>= 1) + result += (w & 1); + return result; +} + #define hweight32(x) generic_hweight32(x) #define hweight16(x) generic_hweight16(x) #define hweight8(x) generic_hweight8(x) @@ -369,9 +414,32 @@ */ #define find_first_zero_bit(addr, size) \ find_next_zero_bit((addr), (size), 0) +#define find_first_bit(addr, size) \ + find_next_bit((addr), (size), 0) #ifdef __KERNEL__ +/* + * Every architecture must define this function. It's the fastest + * way of searching a 140-bit bitmap where the first 100 bits are + * unlikely to be set. It's guaranteed that at least one of the 140 + * bits is set. + */ +static inline unsigned long +sched_find_first_bit(unsigned long b[3]) +{ + unsigned long b0 = b[0], b1 = b[1], b2 = b[2]; + unsigned long ofs; + + ofs = (b1 ? 64 : 128); + b1 = (b1 ? b1 : b2); + ofs = (b0 ? 0 : ofs); + b0 = (b0 ? b0 : b1); + + return __ffs(b0) + ofs; +} + + #define ext2_set_bit __test_and_set_bit #define ext2_clear_bit __test_and_clear_bit #define ext2_test_bit test_bit diff -Nur linux-2.4.33-imedia/include/asm-arm/bitops.h linux-2.4.33-imedia-patching/include/asm-arm/bitops.h --- linux-2.4.33-imedia/include/asm-arm/bitops.h 2003-08-25 14:44:43.000000000 +0300 +++ linux-2.4.33-imedia-patching/include/asm-arm/bitops.h 2006-01-26 15:19:42.000000000 +0200 @@ -2,6 +2,8 @@ * Copyright 1995, Russell King. * Various bits and pieces copyrights include: * Linus Torvalds (test_bit). 
+ * Big endian support: Copyright 2001, Nicolas Pitre + * reworked by rmk. * * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1). * @@ -17,81 +19,267 @@ #ifdef __KERNEL__ +#include + #define smp_mb__before_clear_bit() do { } while (0) #define smp_mb__after_clear_bit() do { } while (0) /* - * Function prototypes to keep gcc -Wall happy. + * These functions are the basis of our bit ops. + * First, the atomic bitops. + * + * The endian issue for these functions is handled by the macros below. */ -extern void set_bit(int nr, volatile void * addr); +static inline void +____atomic_set_bit_mask(unsigned int mask, volatile unsigned char *p) +{ + unsigned long flags; + + local_irq_save(flags); + *p |= mask; + local_irq_restore(flags); +} + +static inline void +____atomic_clear_bit_mask(unsigned int mask, volatile unsigned char *p) +{ + unsigned long flags; + + local_irq_save(flags); + *p &= ~mask; + local_irq_restore(flags); +} + +static inline void +____atomic_change_bit_mask(unsigned int mask, volatile unsigned char *p) +{ + unsigned long flags; + + local_irq_save(flags); + *p ^= mask; + local_irq_restore(flags); +} -static inline void __set_bit(int nr, volatile void *addr) +static inline int +____atomic_test_and_set_bit_mask(unsigned int mask, volatile unsigned char *p) { - ((unsigned char *) addr)[nr >> 3] |= (1U << (nr & 7)); + unsigned long flags; + unsigned int res; + + local_irq_save(flags); + res = *p; + *p = res | mask; + local_irq_restore(flags); + + return res & mask; } -extern void clear_bit(int nr, volatile void * addr); +static inline int +____atomic_test_and_clear_bit_mask(unsigned int mask, volatile unsigned char *p) +{ + unsigned long flags; + unsigned int res; + + local_irq_save(flags); + res = *p; + *p = res & ~mask; + local_irq_restore(flags); + + return res & mask; +} -static inline void __clear_bit(int nr, volatile void *addr) +static inline int +____atomic_test_and_change_bit_mask(unsigned int mask, volatile unsigned char *p) { - ((unsigned char *) addr)[nr >> 3] &= ~(1U << (nr & 7)); + unsigned long flags; + unsigned int res; + + local_irq_save(flags); + res = *p; + *p = res ^ mask; + local_irq_restore(flags); + + return res & mask; } -extern void change_bit(int nr, volatile void * addr); +/* + * Now the non-atomic variants. We let the compiler handle all optimisations + * for these. 
+ */ +static inline void ____nonatomic_set_bit(int nr, volatile void *p) +{ + ((unsigned char *) p)[nr >> 3] |= (1U << (nr & 7)); +} -static inline void __change_bit(int nr, volatile void *addr) +static inline void ____nonatomic_clear_bit(int nr, volatile void *p) { - ((unsigned char *) addr)[nr >> 3] ^= (1U << (nr & 7)); + ((unsigned char *) p)[nr >> 3] &= ~(1U << (nr & 7)); } -extern int test_and_set_bit(int nr, volatile void * addr); +static inline void ____nonatomic_change_bit(int nr, volatile void *p) +{ + ((unsigned char *) p)[nr >> 3] ^= (1U << (nr & 7)); +} -static inline int __test_and_set_bit(int nr, volatile void *addr) +static inline int ____nonatomic_test_and_set_bit(int nr, volatile void *p) { unsigned int mask = 1 << (nr & 7); unsigned int oldval; - oldval = ((unsigned char *) addr)[nr >> 3]; - ((unsigned char *) addr)[nr >> 3] = oldval | mask; + oldval = ((unsigned char *) p)[nr >> 3]; + ((unsigned char *) p)[nr >> 3] = oldval | mask; return oldval & mask; } -extern int test_and_clear_bit(int nr, volatile void * addr); - -static inline int __test_and_clear_bit(int nr, volatile void *addr) +static inline int ____nonatomic_test_and_clear_bit(int nr, volatile void *p) { unsigned int mask = 1 << (nr & 7); unsigned int oldval; - oldval = ((unsigned char *) addr)[nr >> 3]; - ((unsigned char *) addr)[nr >> 3] = oldval & ~mask; + oldval = ((unsigned char *) p)[nr >> 3]; + ((unsigned char *) p)[nr >> 3] = oldval & ~mask; return oldval & mask; } -extern int test_and_change_bit(int nr, volatile void * addr); - -static inline int __test_and_change_bit(int nr, volatile void *addr) +static inline int ____nonatomic_test_and_change_bit(int nr, volatile void *p) { unsigned int mask = 1 << (nr & 7); unsigned int oldval; - oldval = ((unsigned char *) addr)[nr >> 3]; - ((unsigned char *) addr)[nr >> 3] = oldval ^ mask; + oldval = ((unsigned char *) p)[nr >> 3]; + ((unsigned char *) p)[nr >> 3] = oldval ^ mask; return oldval & mask; } -extern int find_first_zero_bit(void * addr, unsigned size); -extern int find_next_zero_bit(void * addr, int size, int offset); - /* * This routine doesn't need to be atomic. */ -static inline int test_bit(int nr, const void * addr) +static inline int ____test_bit(int nr, const void * p) { - return (((unsigned char *) addr)[nr >> 3] >> (nr & 7)) & 1; + return (((unsigned char *) p)[nr >> 3] >> (nr & 7)) & 1; } /* + * A note about Endian-ness. + * ------------------------- + * + * When the ARM is put into big endian mode via CR15, the processor + * merely swaps the order of bytes within words, thus: + * + * ------------ physical data bus bits ----------- + * D31 ... D24 D23 ... D16 D15 ... D8 D7 ... D0 + * little byte 3 byte 2 byte 1 byte 0 + * big byte 0 byte 1 byte 2 byte 3 + * + * This means that reading a 32-bit word at address 0 returns the same + * value irrespective of the endian mode bit. + * + * Peripheral devices should be connected with the data bus reversed in + * "Big Endian" mode. ARM Application Note 61 is applicable, and is + * available from http://www.arm.com/. + * + * The following assumes that the data bus connectivity for big endian + * mode has been followed. + * + * Note that bit 0 is defined to be 32-bit word bit 0, not byte 0 bit 0. + */ + +/* + * Little endian assembly bitops. nr = 0 -> byte 0 bit 0. 
+ */ +extern void _set_bit_le(int nr, volatile void * p); +extern void _clear_bit_le(int nr, volatile void * p); +extern void _change_bit_le(int nr, volatile void * p); +extern int _test_and_set_bit_le(int nr, volatile void * p); +extern int _test_and_clear_bit_le(int nr, volatile void * p); +extern int _test_and_change_bit_le(int nr, volatile void * p); +extern int _find_first_zero_bit_le(void * p, unsigned size); +extern int _find_next_zero_bit_le(void * p, int size, int offset); + +/* + * Big endian assembly bitops. nr = 0 -> byte 3 bit 0. + */ +extern void _set_bit_be(int nr, volatile void * p); +extern void _clear_bit_be(int nr, volatile void * p); +extern void _change_bit_be(int nr, volatile void * p); +extern int _test_and_set_bit_be(int nr, volatile void * p); +extern int _test_and_clear_bit_be(int nr, volatile void * p); +extern int _test_and_change_bit_be(int nr, volatile void * p); +extern int _find_first_zero_bit_be(void * p, unsigned size); +extern int _find_next_zero_bit_be(void * p, int size, int offset); + + +/* + * The __* form of bitops are non-atomic and may be reordered. + */ +#define ATOMIC_BITOP_LE(name,nr,p) \ + (__builtin_constant_p(nr) ? \ + ____atomic_##name##_mask(1 << ((nr) & 7), \ + ((unsigned char *)(p)) + ((nr) >> 3)) : \ + _##name##_le(nr,p)) + +#define ATOMIC_BITOP_BE(name,nr,p) \ + (__builtin_constant_p(nr) ? \ + ____atomic_##name##_mask(1 << ((nr) & 7), \ + ((unsigned char *)(p)) + (((nr) >> 3) ^ 3)) : \ + _##name##_be(nr,p)) + +#define NONATOMIC_BITOP_LE(name,nr,p) \ + (____nonatomic_##name(nr, p)) + +#define NONATOMIC_BITOP_BE(name,nr,p) \ + (____nonatomic_##name(nr ^ 0x18, p)) + +#ifndef __ARMEB__ +/* + * These are the little endian, atomic definitions. + */ +#define set_bit(nr,p) ATOMIC_BITOP_LE(set_bit,nr,p) +#define clear_bit(nr,p) ATOMIC_BITOP_LE(clear_bit,nr,p) +#define change_bit(nr,p) ATOMIC_BITOP_LE(change_bit,nr,p) +#define test_and_set_bit(nr,p) ATOMIC_BITOP_LE(test_and_set_bit,nr,p) +#define test_and_clear_bit(nr,p) ATOMIC_BITOP_LE(test_and_clear_bit,nr,p)+#define test_and_change_bit(nr,p) ATOMIC_BITOP_LE(test_and_change_bit,nr,p) +#define test_bit(nr,p) ____test_bit(nr,p) +#define find_first_zero_bit(p,sz) _find_first_zero_bit_le(p,sz) +#define find_next_zero_bit(p,sz,off) _find_next_zero_bit_le(p,sz,off) + +/* + * These are the little endian, non-atomic definitions. + */ +#define __set_bit(nr,p) NONATOMIC_BITOP_LE(set_bit,nr,p)+#define __clear_bit(nr,p) NONATOMIC_BITOP_LE(clear_bit,nr,p) +#define __change_bit(nr,p) NONATOMIC_BITOP_LE(change_bit,nr,p) +#define __test_and_set_bit(nr,p) NONATOMIC_BITOP_LE(test_and_set_bit,nr,p) +#define __test_and_clear_bit(nr,p) NONATOMIC_BITOP_LE(test_and_clear_bit,nr,p) +#define __test_and_change_bit(nr,p) NONATOMIC_BITOP_LE(test_and_change_bit,nr,p) +#define __test_bit(nr,p) ____test_bit(nr,p) + +#else + +/* + * These are the big endian, atomic definitions. + */ +#define set_bit(nr,p) ATOMIC_BITOP_BE(set_bit,nr,p) +#define clear_bit(nr,p) ATOMIC_BITOP_BE(clear_bit,nr,p) +#define change_bit(nr,p) ATOMIC_BITOP_BE(change_bit,nr,p) +#define test_and_set_bit(nr,p) ATOMIC_BITOP_BE(test_and_set_bit,nr,p) +#define test_and_clear_bit(nr,p) ATOMIC_BITOP_BE(test_and_clear_bit,nr,p)+#define test_and_change_bit(nr,p) ATOMIC_BITOP_BE(test_and_change_bit,nr,p) +#define test_bit(nr,p) ____test_bit((nr) ^ 0x18, p) +#define find_first_zero_bit(p,sz) _find_first_zero_bit_be(p,sz) +#define find_next_zero_bit(p,sz,off) _find_next_zero_bit_be(p,sz,off) + +/* + * These are the big endian, non-atomic definitions. 
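The ATOMIC_BITOP_LE/BE macros above choose an implementation at compile time: when the bit number is a compile-time constant, GCC's __builtin_constant_p() lets the mask and byte offset be folded and the inline IRQ-masking helpers are used; otherwise the out-of-line assembly routines (_set_bit_le() and friends) are called. A reduced user-space sketch of the same dispatch pattern, with inline_path()/asm_path() standing in for the two real implementations:

  #include <stdio.h>

  static void inline_path(unsigned int mask, unsigned char *p)
  {
          *p |= mask;
          puts("constant nr: inline mask form");
  }

  static void asm_path(int nr, void *p)
  {
          ((unsigned char *)p)[nr >> 3] |= 1U << (nr & 7);
          puts("variable nr: out-of-line routine");
  }

  /* Same shape as ATOMIC_BITOP_LE(): constant nr -> precomputed mask and
   * byte pointer, variable nr -> call the general routine. */
  #define SET_BIT_LE(nr, p)                                               \
          (__builtin_constant_p(nr) ?                                     \
                  inline_path(1U << ((nr) & 7),                           \
                              (unsigned char *)(p) + ((nr) >> 3)) :       \
                  asm_path(nr, p))

  int main(void)
  {
          unsigned char map[4] = { 0 };
          volatile int n = 9;     /* volatile so the compiler cannot fold it */

          SET_BIT_LE(5, map);     /* constant nr: inline path */
          SET_BIT_LE(n, map);     /* variable nr: general path */
          printf("map[0]=0x%02x map[1]=0x%02x\n", map[0], map[1]);
          return 0;
  }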
+ */ +#define __set_bit(nr,p) NONATOMIC_BITOP_BE(set_bit,nr,p)+#define __clear_bit(nr,p) NONATOMIC_BITOP_BE(clear_bit,nr,p) +#define __change_bit(nr,p) NONATOMIC_BITOP_BE(change_bit,nr,p) +#define __test_and_set_bit(nr,p) NONATOMIC_BITOP_BE(test_and_set_bit,nr,p) +#define __test_and_clear_bit(nr,p) NONATOMIC_BITOP_BE(test_and_clear_bit,nr,p) +#define __test_and_change_bit(nr,p) NONATOMIC_BITOP_BE(test_and_change_bit,nr,p) +#define __test_bit(nr,p) ____test_bit((nr) ^ 0x18, p) + +#endif + +/* * ffz = Find First Zero in word. Undefined if no zero exists, * so code should check against ~0UL first.. */ @@ -110,6 +298,29 @@ } /* + * ffz = Find First Zero in word. Undefined if no zero exists, + * so code should check against ~0UL first.. + */ +static inline unsigned long __ffs(unsigned long word) +{ + int k; + + k = 31; + if (word & 0x0000ffff) { k -= 16; word <<= 16; } + if (word & 0x00ff0000) { k -= 8; word <<= 8; } + if (word & 0x0f000000) { k -= 4; word <<= 4; } + if (word & 0x30000000) { k -= 2; word <<= 2; } + if (word & 0x40000000) { k -= 1; } + return k; +} + +/* + * fls: find last bit set. + */ + +#define fls(x) generic_fls(x) + +/* * ffs: find first bit set. This is defined the same way as * the libc and compiler builtin ffs routines, therefore * differs in spirit from the above ffz (man ffs). @@ -118,6 +329,22 @@ #define ffs(x) generic_ffs(x) /* + * Find first bit set in a 168-bit bitmap, where the first + * 128 bits are unlikely to be set. + */ +static inline int sched_find_first_bit(unsigned long *b) +{ + unsigned long v; + unsigned int off; + + for (off = 0; v = b[off], off < 4; off++) { + if (unlikely(v)) + break; + } + return __ffs(v) + off * 32; +} + +/* * hweightN: returns the hamming weight (i.e. the number * of bits set) of a N-bit word */ @@ -126,18 +353,25 @@ #define hweight16(x) generic_hweight16(x) #define hweight8(x) generic_hweight8(x) -#define ext2_set_bit test_and_set_bit -#define ext2_clear_bit test_and_clear_bit -#define ext2_test_bit test_bit -#define ext2_find_first_zero_bit find_first_zero_bit -#define ext2_find_next_zero_bit find_next_zero_bit - -/* Bitmap functions for the minix filesystem. */ -#define minix_test_and_set_bit(nr,addr) test_and_set_bit(nr,addr) -#define minix_set_bit(nr,addr) set_bit(nr,addr) -#define minix_test_and_clear_bit(nr,addr) test_and_clear_bit(nr,addr) -#define minix_test_bit(nr,addr) test_bit(nr,addr) -#define minix_find_first_zero_bit(addr,size) find_first_zero_bit(addr,size) +/* + * Ext2 is defined to use little-endian byte ordering. + * These do not need to be atomic. + */ +#define ext2_set_bit(nr,p) NONATOMIC_BITOP_LE(test_and_set_bit,nr,p) +#define ext2_clear_bit(nr,p) NONATOMIC_BITOP_LE(test_and_clear_bit,nr,p) +#define ext2_test_bit(nr,p) __test_bit(nr,p) +#define ext2_find_first_zero_bit(p,sz) _find_first_zero_bit_le(p,sz) +#define ext2_find_next_zero_bit(p,sz,off) _find_next_zero_bit_le(p,sz,off) + +/* + * Minix is defined to use little-endian byte ordering. + * These do not need to be atomic. 
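The __ffs() added above returns the index of the least significant set bit (the comment over it repeats the ffz text, but the routine searches for a one, not a zero) by halving the search window: 16, 8, 4, 2, then 1 bits. The same algorithm, lifted into a stand-alone program and checked against every single-bit input (my_ffs() is just a local copy for the check):

  #include <assert.h>
  #include <stdio.h>

  /* Same window-narrowing algorithm as the __ffs() above. */
  static unsigned long my_ffs(unsigned long word)
  {
          int k = 31;

          if (word & 0x0000ffff) { k -= 16; word <<= 16; }
          if (word & 0x00ff0000) { k -= 8;  word <<= 8;  }
          if (word & 0x0f000000) { k -= 4;  word <<= 4;  }
          if (word & 0x30000000) { k -= 2;  word <<= 2;  }
          if (word & 0x40000000) { k -= 1; }
          return k;
  }

  int main(void)
  {
          unsigned int bit;

          for (bit = 0; bit < 32; bit++)
                  assert(my_ffs(1UL << bit) == bit);

          /* lowest set bit of 0x00400100 is bit 8 */
          printf("__ffs(0x%08lx) = %lu\n", 0x00400100UL, my_ffs(0x00400100UL));
          return 0;
  }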
+ */ +#define minix_set_bit(nr,p) NONATOMIC_BITOP_LE(set_bit,nr,p) +#define minix_test_bit(nr,p) __test_bit(nr,p) +#define minix_test_and_set_bit(nr,p) NONATOMIC_BITOP_LE(test_and_set_bit,nr,p) +#define minix_test_and_clear_bit(nr,p) NONATOMIC_BITOP_LE(test_and_clear_bit,nr,p) +#define minix_find_first_zero_bit(p,sz) _find_first_zero_bit_le(p,sz) #endif /* __KERNEL__ */ diff -Nur linux-2.4.33-imedia/include/asm-cris/bitops.h linux-2.4.33-imedia-patching/include/asm-cris/bitops.h --- linux-2.4.33-imedia/include/asm-cris/bitops.h 2004-02-18 15:36:32.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/asm-cris/bitops.h 2006-01-26 15:19:42.000000000 +0200 @@ -22,6 +22,7 @@ /* We use generic_ffs so get it; include guards resolve the possible mutually inclusion. */ #include +#include /* * Some hacks to defeat gcc over-optimizations.. @@ -375,7 +376,45 @@ #define minix_test_bit(nr,addr) test_bit(nr,addr) #define minix_find_first_zero_bit(addr,size) find_first_zero_bit(addr,size) -#endif /* __KERNEL__ */ +#if 0 +/* TODO: see below */ +#define sched_find_first_zero_bit(addr) find_first_zero_bit(addr, 168) + +#else +/* TODO: left out pending where to put it.. (there are .h dependencies) */ + + /* + * Every architecture must define this function. It's the fastest + * way of searching a 168-bit bitmap where the first 128 bits are + * unlikely to be set. It's guaranteed that at least one of the 168 + * bits is cleared. + */ +#if 0 +#if MAX_RT_PRIO != 128 || MAX_PRIO != 168 +# error update this function. +#endif +#else +#define MAX_RT_PRIO 128 +#define MAX_PRIO 168 +#endif + +static inline int sched_find_first_zero_bit(char *bitmap) +{ + unsigned int *b = (unsigned int *)bitmap; + unsigned int rt; + + rt = b[0] & b[1] & b[2] & b[3]; + if (unlikely(rt != 0xffffffff)) + return find_first_zero_bit(bitmap, MAX_RT_PRIO); + + if (b[4] != ~0) + return ffz(b[4]) + MAX_RT_PRIO; + return ffz(b[5]) + 32 + MAX_RT_PRIO; +} +#undef MAX_PRIO +#undef MAX_RT_PRIO +#endif +#endif /* __KERNEL__ */ #endif /* _CRIS_BITOPS_H */ diff -Nur linux-2.4.33-imedia/include/asm-generic/bitops.h linux-2.4.33-imedia-patching/include/asm-generic/bitops.h --- linux-2.4.33-imedia/include/asm-generic/bitops.h 2000-11-28 03:47:38.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/asm-generic/bitops.h 2006-01-26 15:19:42.000000000 +0200 @@ -51,6 +51,12 @@ return ((mask & *addr) != 0); } +/* + * fls: find last bit set. + */ + +#define fls(x) generic_fls(x) + #ifdef __KERNEL__ /* diff -Nur linux-2.4.33-imedia/include/asm-i386/bitops.h linux-2.4.33-imedia-patching/include/asm-i386/bitops.h --- linux-2.4.33-imedia/include/asm-i386/bitops.h 2006-01-11 19:27:16.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/asm-i386/bitops.h 2006-01-26 15:19:42.000000000 +0200 @@ -6,6 +6,7 @@ */ #include +#include /* * These have to be done with inline assembly: that way the bit-setting @@ -75,6 +76,14 @@ :"=m" (ADDR) :"Ir" (nr)); } + +static __inline__ void __clear_bit(int nr, volatile void * addr) +{ + __asm__ __volatile__( + "btrl %1,%0" + :"=m" (ADDR) + :"Ir" (nr)); +} #define smp_mb__before_clear_bit() barrier() #define smp_mb__after_clear_bit() barrier() @@ -283,6 +292,34 @@ } /** + * find_first_bit - find the first set bit in a memory region + * @addr: The address to start the search at + * @size: The maximum size to search + * + * Returns the bit-number of the first set bit, not the number of the byte + * containing a bit. 
+ */ +static __inline__ int find_first_bit(void * addr, unsigned size) +{ + int d0, d1; + int res; + + /* This looks at memory. Mark it volatile to tell gcc not to move it around */ + __asm__ __volatile__( + "xorl %%eax,%%eax\n\t" + "repe; scasl\n\t" + "jz 1f\n\t" + "leal -4(%%edi),%%edi\n\t" + "bsfl (%%edi),%%eax\n" + "1:\tsubl %%ebx,%%edi\n\t" + "shll $3,%%edi\n\t" + "addl %%edi,%%eax" + :"=a" (res), "=&c" (d0), "=&D" (d1) + :"1" ((size + 31) >> 5), "2" (addr), "b" (addr)); + return res; +} + +/** * find_next_zero_bit - find the first zero bit in a memory region * @addr: The address to base the search on * @offset: The bitnumber to start searching at @@ -295,7 +332,7 @@ if (bit) { /* - * Look for zero in first byte + * Look for zero in first 32 bits. */ __asm__("bsfl %1,%0\n\t" "jne 1f\n\t" @@ -316,6 +353,39 @@ } /** + * find_next_bit - find the first set bit in a memory region + * @addr: The address to base the search on + * @offset: The bitnumber to start searching at + * @size: The maximum size to search + */ +static __inline__ int find_next_bit (void * addr, int size, int offset) +{ + unsigned long * p = ((unsigned long *) addr) + (offset >> 5); + int set = 0, bit = offset & 31, res; + + if (bit) { + /* + * Look for nonzero in the first 32 bits: + */ + __asm__("bsfl %1,%0\n\t" + "jne 1f\n\t" + "movl $32, %0\n" + "1:" + : "=r" (set) + : "r" (*p >> bit)); + if (set < (32 - bit)) + return set + offset; + set = 32 - bit; + p++; + } + /* + * No set bit yet, search remaining full words for a bit + */ + res = find_first_bit (p, size - 32 * (p - (unsigned long *) addr)); + return (offset + set + res); +} + +/** * ffz - find first zero in word. * @word: The word to search * @@ -329,8 +399,41 @@ return word; } +/** + * __ffs - find first bit in word. + * @word: The word to search + * + * Undefined if no bit exists, so code should check against 0 first. + */ +static __inline__ unsigned long __ffs(unsigned long word) +{ + __asm__("bsfl %1,%0" + :"=r" (word) + :"rm" (word)); + return word; +} + #ifdef __KERNEL__ +/* + * Every architecture must define this function. It's the fastest + * way of searching a 140-bit bitmap where the first 100 bits are + * unlikely to be set. It's guaranteed that at least one of the 140 + * bits is cleared. + */ +static inline int _sched_find_first_bit(unsigned long *b) +{ + if (unlikely(b[0])) + return __ffs(b[0]); + if (unlikely(b[1])) + return __ffs(b[1]) + 32; + if (unlikely(b[2])) + return __ffs(b[2]) + 64; + if (b[3]) + return __ffs(b[3]) + 96; + return __ffs(b[4]) + 128; +} + /** * ffs - find first bit set * @x: the word to search diff -Nur linux-2.4.33-imedia/include/asm-i386/desc.h linux-2.4.33-imedia-patching/include/asm-i386/desc.h --- linux-2.4.33-imedia/include/asm-i386/desc.h 2004-02-18 15:36:32.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/asm-i386/desc.h 2006-01-26 15:19:42.000000000 +0200 @@ -71,9 +71,12 @@ static inline void clear_LDT(void) { - int cpu = smp_processor_id(); + int cpu; + preempt_disable(); + cpu = smp_processor_id(); set_ldt_desc(cpu, &default_ldt[0], 5); __load_LDT(cpu); + preempt_enable(); } /* diff -Nur linux-2.4.33-imedia/include/asm-i386/hardirq.h linux-2.4.33-imedia-patching/include/asm-i386/hardirq.h --- linux-2.4.33-imedia/include/asm-i386/hardirq.h 2006-01-11 19:27:17.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/asm-i386/hardirq.h 2006-01-26 15:19:42.000000000 +0200 @@ -19,12 +19,16 @@ /* * Are we in an interrupt context? Either doing bottom half - * or hardware interrupt processing? 
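_sched_find_first_bit() above is the O(1) scheduler's priority lookup: one bit per priority level, 140 levels held in five 32-bit words, with the first 100 levels (the real-time range) usually empty. A user-space model of how such a lookup is used (the array layout and 32-bit word size are assumptions of the sketch, and bsf32() is a portable stand-in for __ffs()):

  #include <stdio.h>
  #include <stdint.h>

  #define NR_PRIO 140

  static int bsf32(uint32_t w)            /* index of lowest set bit, w != 0 */
  {
          int i = 0;

          while (!(w & 1)) { w >>= 1; i++; }
          return i;
  }

  static int sched_find_first_bit(const uint32_t *b)
  {
          /* Same shape as the x86 version above: test word by word,
           * expecting the low-priority (real-time) words to be empty. */
          if (b[0]) return bsf32(b[0]);
          if (b[1]) return bsf32(b[1]) + 32;
          if (b[2]) return bsf32(b[2]) + 64;
          if (b[3]) return bsf32(b[3]) + 96;
          return bsf32(b[4]) + 128;
  }

  int main(void)
  {
          uint32_t bitmap[5] = { 0 };

          bitmap[120 / 32] |= 1U << (120 % 32);   /* one runnable task at prio 120 */
          printf("highest-priority runnable level: %d\n",
                 sched_find_first_bit(bitmap));
          return 0;
  }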
+ * or hardware interrupt processing? Note the preempt check, + * this is both a bugfix and an optimization. If we are + * preemptible, we cannot be in an interrupt. */ -#define in_interrupt() ({ int __cpu = smp_processor_id(); \ - (local_irq_count(__cpu) + local_bh_count(__cpu) != 0); }) +#define in_interrupt() (preempt_is_disabled() && \ + ({unsigned long __cpu = smp_processor_id(); \ + (local_irq_count(__cpu) + local_bh_count(__cpu) != 0); })) -#define in_irq() (local_irq_count(smp_processor_id()) != 0) +#define in_irq() (preempt_is_disabled() && \ + (local_irq_count(smp_processor_id()) != 0)) #ifndef CONFIG_SMP @@ -36,6 +40,8 @@ #define synchronize_irq() barrier() +#define release_irqlock(cpu) do { } while (0) + #else #include diff -Nur linux-2.4.33-imedia/include/asm-i386/highmem.h linux-2.4.33-imedia-patching/include/asm-i386/highmem.h --- linux-2.4.33-imedia/include/asm-i386/highmem.h 2006-01-11 19:27:18.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/asm-i386/highmem.h 2006-01-26 15:19:42.000000000 +0200 @@ -91,6 +91,7 @@ enum fixed_addresses idx; unsigned long vaddr; + preempt_disable(); if (page < highmem_start_page) return page_address(page); @@ -112,8 +113,10 @@ unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); - if (vaddr < FIXADDR_START) // FIXME + if (vaddr < FIXADDR_START) { // FIXME + preempt_enable(); return; + } if (vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx)) out_of_line_bug(); @@ -125,6 +128,8 @@ pte_clear(kmap_pte-idx); __flush_tlb_one(vaddr); #endif + + preempt_enable(); } #endif /* __KERNEL__ */ diff -Nur linux-2.4.33-imedia/include/asm-i386/hw_irq.h linux-2.4.33-imedia-patching/include/asm-i386/hw_irq.h --- linux-2.4.33-imedia/include/asm-i386/hw_irq.h 2006-01-11 19:27:17.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/asm-i386/hw_irq.h 2006-01-26 15:19:42.000000000 +0200 @@ -95,6 +95,18 @@ #define __STR(x) #x #define STR(x) __STR(x) +#define GET_CURRENT \ + "movl %esp, %ebx\n\t" \ + "andl $-8192, %ebx\n\t" + +#ifdef CONFIG_PREEMPT +#define BUMP_LOCK_COUNT \ + GET_CURRENT \ + "incl 4(%ebx)\n\t" +#else +#define BUMP_LOCK_COUNT +#endif + #define SAVE_ALL \ "cld\n\t" \ "pushl %es\n\t" \ @@ -108,15 +120,12 @@ "pushl %ebx\n\t" \ "movl $" STR(__KERNEL_DS) ",%edx\n\t" \ "movl %edx,%ds\n\t" \ - "movl %edx,%es\n\t" + "movl %edx,%es\n\t" \ + BUMP_LOCK_COUNT #define IRQ_NAME2(nr) nr##_interrupt(void) #define IRQ_NAME(nr) IRQ_NAME2(IRQ##nr) -#define GET_CURRENT \ - "movl %esp, %ebx\n\t" \ - "andl $-8192, %ebx\n\t" - /* * SMP has a few special interrupts for IPI messages */ diff -Nur linux-2.4.33-imedia/include/asm-i386/i387.h linux-2.4.33-imedia-patching/include/asm-i386/i387.h --- linux-2.4.33-imedia/include/asm-i386/i387.h 2006-01-11 19:31:56.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/asm-i386/i387.h 2006-01-26 15:19:42.000000000 +0200 @@ -12,6 +12,7 @@ #define __ASM_I386_I387_H #include +#include #include #include #include @@ -24,7 +25,7 @@ extern void restore_fpu( struct task_struct *tsk ); extern void kernel_fpu_begin(void); -#define kernel_fpu_end() stts() +#define kernel_fpu_end() do { stts(); preempt_enable(); } while(0) #define unlazy_fpu( tsk ) do { \ diff -Nur linux-2.4.33-imedia/include/asm-i386/mmu_context.h linux-2.4.33-imedia-patching/include/asm-i386/mmu_context.h --- linux-2.4.33-imedia/include/asm-i386/mmu_context.h 2006-01-11 19:27:17.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/asm-i386/mmu_context.h 2006-01-26 15:19:42.000000000 +0200 @@ 
-29,7 +29,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk, unsigned cpu) { - if (prev != next) { + if (likely(prev != next)) { /* stop flush ipis for the previous mm */ clear_bit(cpu, &prev->cpu_vm_mask); #ifdef CONFIG_SMP diff -Nur linux-2.4.33-imedia/include/asm-i386/pgalloc.h linux-2.4.33-imedia-patching/include/asm-i386/pgalloc.h --- linux-2.4.33-imedia/include/asm-i386/pgalloc.h 2006-01-11 19:27:17.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/asm-i386/pgalloc.h 2006-01-26 15:19:42.000000000 +0200 @@ -75,20 +75,26 @@ { unsigned long *ret; + preempt_disable(); if ((ret = pgd_quicklist) != NULL) { pgd_quicklist = (unsigned long *)(*ret); ret[0] = 0; pgtable_cache_size--; - } else + preempt_enable(); + } else { + preempt_enable(); ret = (unsigned long *)get_pgd_slow(); + } return (pgd_t *)ret; } static inline void free_pgd_fast(pgd_t *pgd) { + preempt_disable(); *(unsigned long *)pgd = (unsigned long) pgd_quicklist; pgd_quicklist = (unsigned long *) pgd; pgtable_cache_size++; + preempt_enable(); } static inline void free_pgd_slow(pgd_t *pgd) @@ -119,19 +125,23 @@ { unsigned long *ret; + preempt_disable(); if ((ret = (unsigned long *)pte_quicklist) != NULL) { pte_quicklist = (unsigned long *)(*ret); ret[0] = ret[1]; pgtable_cache_size--; } + preempt_enable(); return (pte_t *)ret; } static inline void pte_free_fast(pte_t *pte) { + preempt_disable(); *(unsigned long *)pte = (unsigned long) pte_quicklist; pte_quicklist = (unsigned long *) pte; pgtable_cache_size++; + preempt_enable(); } static __inline__ void pte_free_slow(pte_t *pte) diff -Nur linux-2.4.33-imedia/include/asm-i386/smp.h linux-2.4.33-imedia-patching/include/asm-i386/smp.h --- linux-2.4.33-imedia/include/asm-i386/smp.h 2006-01-11 19:27:17.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/asm-i386/smp.h 2006-01-26 15:19:42.000000000 +0200 @@ -40,6 +40,7 @@ extern void smp_flush_tlb(void); extern void smp_message_irq(int cpl, void *dev_id, struct pt_regs *regs); extern void fastcall smp_send_reschedule(int cpu); +extern void fastcall smp_send_reschedule_all(void); extern void smp_invalidate_rcv(void); /* Process an NMI */ extern void (*mtrr_hook) (void); extern void zap_low_mappings (void); @@ -81,7 +82,7 @@ * so this is correct in the x86 case. */ -#define smp_processor_id() (current->processor) +#define smp_processor_id() (current->cpu) static __inline int hard_smp_processor_id(void) { @@ -99,17 +100,5 @@ #define NO_PROC_ID 0xFF /* No processor magic marker */ -/* - * This magic constant controls our willingness to transfer - * a process across CPUs. Such a transfer incurs misses on the L1 - * cache, and on a P6 or P5 with multiple L2 caches L2 hits. My - * gut feeling is this will vary by board in value. For a board - * with separate L2 cache it probably depends also on the RSS, and - * for a board with shared L2 cache it ought to decay fast as other - * processes are run. 
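The pgalloc changes above bracket the pgd/pte quicklists with preempt_disable()/preempt_enable(): the quicklists are per-CPU, so a task preempted between loading the list head and storing the new one could resume on another CPU and corrupt both lists. A reduced user-space model of the protected pattern (the preempt_* macros here are no-op stand-ins for the real per-task counter operations):

  #include <stdio.h>

  /* Stand-ins for the kernel primitives; in the kernel these raise and
   * lower the preempt counter so the scheduler cannot switch tasks while
   * the per-CPU list head is being read and rewritten. */
  #define preempt_disable()  do { } while (0)
  #define preempt_enable()   do { } while (0)

  struct page_chunk {
          struct page_chunk *next;
  };

  static struct page_chunk *quicklist;    /* models one CPU's pgd_quicklist */

  static void free_fast(struct page_chunk *p)
  {
          preempt_disable();              /* RMW of the per-CPU list head */
          p->next = quicklist;
          quicklist = p;
          preempt_enable();
  }

  static struct page_chunk *alloc_fast(void)
  {
          struct page_chunk *p;

          preempt_disable();
          p = quicklist;
          if (p)
                  quicklist = p->next;
          preempt_enable();
          return p;
  }

  int main(void)
  {
          struct page_chunk a, b;

          free_fast(&a);
          free_fast(&b);
          printf("popped %p then %p\n", (void *)alloc_fast(), (void *)alloc_fast());
          return 0;
  }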
- */ - -#define PROC_CHANGE_PENALTY 15 /* Schedule penalty */ - #endif #endif diff -Nur linux-2.4.33-imedia/include/asm-i386/smplock.h linux-2.4.33-imedia-patching/include/asm-i386/smplock.h --- linux-2.4.33-imedia/include/asm-i386/smplock.h 2006-01-11 19:27:17.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/asm-i386/smplock.h 2006-01-26 15:19:42.000000000 +0200 @@ -14,7 +14,15 @@ extern spinlock_cacheline_t kernel_flag_cacheline; #define kernel_flag kernel_flag_cacheline.lock +#ifdef CONFIG_SMP #define kernel_locked() spin_is_locked(&kernel_flag) +#else +#ifdef CONFIG_PREEMPT +#define kernel_locked() preempt_get_count() +#else +#define kernel_locked() 1 +#endif +#endif /* * Release global kernel lock and global interrupt lock @@ -46,6 +54,11 @@ */ static __inline__ void lock_kernel(void) { +#ifdef CONFIG_PREEMPT + if (current->lock_depth == -1) + spin_lock(&kernel_flag); + ++current->lock_depth; +#else #if 1 if (!++current->lock_depth) spin_lock(&kernel_flag); @@ -58,6 +71,7 @@ :"=m" (__dummy_lock(&kernel_flag)), "=m" (current->lock_depth)); #endif +#endif } static __inline__ void unlock_kernel(void) diff -Nur linux-2.4.33-imedia/include/asm-i386/softirq.h linux-2.4.33-imedia-patching/include/asm-i386/softirq.h --- linux-2.4.33-imedia/include/asm-i386/softirq.h 2006-01-11 19:27:17.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/asm-i386/softirq.h 2006-01-26 15:19:42.000000000 +0200 @@ -5,14 +5,15 @@ #include #define __cpu_bh_enable(cpu) \ - do { barrier(); local_bh_count(cpu)--; } while (0) + do { barrier(); local_bh_count(cpu)--; preempt_enable(); } while (0) #define cpu_bh_disable(cpu) \ - do { local_bh_count(cpu)++; barrier(); } while (0) + do { preempt_disable(); local_bh_count(cpu)++; barrier(); } while (0) #define local_bh_disable() cpu_bh_disable(smp_processor_id()) #define __local_bh_enable() __cpu_bh_enable(smp_processor_id()) -#define in_softirq() (local_bh_count(smp_processor_id()) != 0) +#define in_softirq() ( preempt_is_disabled() & \ + (local_bh_count(smp_processor_id()) != 0)) /* * NOTE: this assembly code assumes: @@ -22,7 +23,7 @@ * If you change the offsets in irq_stat then you have to * update this code as well. 
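Under CONFIG_PREEMPT, lock_kernel() above takes the BKL spinlock only on the -1 to 0 transition of current->lock_depth and bumps the depth afterwards, so the depth counter never claims a lock that has not actually been taken yet, which matters once lock_kernel() itself can be preempted. A stand-alone model of the depth accounting (plain globals stand in for the spinlock and the task field; unlock_kernel() follows the existing kernel logic):

  #include <assert.h>
  #include <stdio.h>

  static int bkl_held;                    /* models spin_is_locked(&kernel_flag) */
  static int lock_depth = -1;             /* models current->lock_depth */

  #define spin_lock(l)    (*(l) = 1)      /* stand-ins for the real spinlock ops */
  #define spin_unlock(l)  (*(l) = 0)

  static void lock_kernel(void)
  {
          if (lock_depth == -1)           /* outermost entry: really take the lock */
                  spin_lock(&bkl_held);
          ++lock_depth;                   /* depth bumped only after the lock is held */
  }

  static void unlock_kernel(void)
  {
          if (--lock_depth < 0)           /* innermost exit: really drop the lock */
                  spin_unlock(&bkl_held);
  }

  int main(void)
  {
          lock_kernel();          /* depth -1 -> 0, lock taken */
          lock_kernel();          /* nested: depth 0 -> 1, lock already held */
          assert(bkl_held);
          unlock_kernel();        /* depth 1 -> 0, still held */
          assert(bkl_held);
          unlock_kernel();        /* depth 0 -> -1, released */
          assert(!bkl_held);
          puts("BKL depth accounting ok");
          return 0;
  }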
*/ -#define local_bh_enable() \ +#define _local_bh_enable() \ do { \ unsigned int *ptr = &local_bh_count(smp_processor_id()); \ \ @@ -45,4 +46,6 @@ /* no registers clobbered */ ); \ } while (0) +#define local_bh_enable() do { _local_bh_enable(); preempt_enable(); } while (0) + #endif /* __ASM_SOFTIRQ_H */ diff -Nur linux-2.4.33-imedia/include/asm-i386/spinlock.h linux-2.4.33-imedia-patching/include/asm-i386/spinlock.h --- linux-2.4.33-imedia/include/asm-i386/spinlock.h 2006-01-11 19:27:16.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/asm-i386/spinlock.h 2006-01-26 15:19:42.000000000 +0200 @@ -77,7 +77,7 @@ :"=m" (lock->lock) : : "memory" -static inline void spin_unlock(spinlock_t *lock) +static inline void _raw_spin_unlock(spinlock_t *lock) { #if SPINLOCK_DEBUG if (lock->magic != SPINLOCK_MAGIC) @@ -97,7 +97,7 @@ :"=q" (oldval), "=m" (lock->lock) \ :"0" (oldval) : "memory" -static inline void spin_unlock(spinlock_t *lock) +static inline void _raw_spin_unlock(spinlock_t *lock) { char oldval = 1; #if SPINLOCK_DEBUG @@ -113,7 +113,7 @@ #endif -static inline int spin_trylock(spinlock_t *lock) +static inline int _raw_spin_trylock(spinlock_t *lock) { char oldval; __asm__ __volatile__( @@ -123,7 +123,7 @@ return oldval > 0; } -static inline void spin_lock(spinlock_t *lock) +static inline void _raw_spin_lock(spinlock_t *lock) { #if SPINLOCK_DEBUG __label__ here; @@ -179,7 +179,7 @@ */ /* the spinlock helpers are in arch/i386/kernel/semaphore.c */ -static inline void read_lock(rwlock_t *rw) +static inline void _raw_read_lock(rwlock_t *rw) { #if SPINLOCK_DEBUG if (rw->magic != RWLOCK_MAGIC) @@ -188,7 +188,7 @@ __build_read_lock(rw, "__read_lock_failed"); } -static inline void write_lock(rwlock_t *rw) +static inline void _raw_write_lock(rwlock_t *rw) { #if SPINLOCK_DEBUG if (rw->magic != RWLOCK_MAGIC) @@ -197,10 +197,10 @@ __build_write_lock(rw, "__write_lock_failed"); } -#define read_unlock(rw) asm volatile("lock ; incl %0" :"=m" ((rw)->lock) : : "memory") -#define write_unlock(rw) asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ",%0":"=m" ((rw)->lock) : : "memory") +#define _raw_read_unlock(rw) asm volatile("lock ; incl %0" :"=m" ((rw)->lock) : : "memory") +#define _raw_write_unlock(rw) asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ",%0":"=m" ((rw)->lock) : : "memory") -static inline int write_trylock(rwlock_t *lock) +static inline int _raw_write_trylock(rwlock_t *lock) { atomic_t *count = (atomic_t *)lock; if (atomic_sub_and_test(RW_LOCK_BIAS, count)) diff -Nur linux-2.4.33-imedia/include/asm-i386/system.h linux-2.4.33-imedia-patching/include/asm-i386/system.h --- linux-2.4.33-imedia/include/asm-i386/system.h 2006-01-11 19:27:16.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/asm-i386/system.h 2006-01-26 15:19:42.000000000 +0200 @@ -12,25 +12,22 @@ struct task_struct; /* one of the stranger aspects of C forward declarations.. 
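Renaming the i386 primitives to _raw_spin_* above frees the spin_lock()/spin_unlock() names for generic wrappers, presumably supplied in include/linux/spinlock.h elsewhere in this patch, that bracket the raw operation with preemption control. A compilable sketch of that intended layering (all helpers here are simplified stand-ins, not the real implementations):

  #include <stdio.h>

  static int preempt_count;               /* models the per-task preempt counter */
  #define preempt_disable()  (preempt_count++)
  #define preempt_enable()   (preempt_count--)

  typedef struct { volatile int locked; } spinlock_t;

  static void _raw_spin_lock(spinlock_t *l)   { l->locked = 1; }
  static void _raw_spin_unlock(spinlock_t *l) { l->locked = 0; }

  /* Rough shape of the generic wrappers layered on top of the raw ops:
   * preemption stays off for as long as the lock is held. */
  #define spin_lock(l)    do { preempt_disable(); _raw_spin_lock(l); } while (0)
  #define spin_unlock(l)  do { _raw_spin_unlock(l); preempt_enable(); } while (0)

  int main(void)
  {
          spinlock_t lock = { 0 };

          spin_lock(&lock);
          printf("held: locked=%d preempt_count=%d\n", lock.locked, preempt_count);
          spin_unlock(&lock);
          printf("released: locked=%d preempt_count=%d\n", lock.locked, preempt_count);
          return 0;
  }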
*/ extern void FASTCALL(__switch_to(struct task_struct *prev, struct task_struct *next)); -#define prepare_to_switch() do { } while(0) #define switch_to(prev,next,last) do { \ asm volatile("pushl %%esi\n\t" \ "pushl %%edi\n\t" \ "pushl %%ebp\n\t" \ "movl %%esp,%0\n\t" /* save ESP */ \ - "movl %3,%%esp\n\t" /* restore ESP */ \ + "movl %2,%%esp\n\t" /* restore ESP */ \ "movl $1f,%1\n\t" /* save EIP */ \ - "pushl %4\n\t" /* restore EIP */ \ + "pushl %3\n\t" /* restore EIP */ \ "jmp __switch_to\n" \ "1:\t" \ "popl %%ebp\n\t" \ "popl %%edi\n\t" \ "popl %%esi\n\t" \ - :"=m" (prev->thread.esp),"=m" (prev->thread.eip), \ - "=b" (last) \ + :"=m" (prev->thread.esp),"=m" (prev->thread.eip) \ :"m" (next->thread.esp),"m" (next->thread.eip), \ - "a" (prev), "d" (next), \ - "b" (prev)); \ + "a" (prev), "d" (next)); \ } while (0) #define _set_base(addr,base) do { unsigned long __pr; \ @@ -323,6 +320,13 @@ #define __save_and_cli(x) do { __save_flags(x); __cli(); } while(0); #define __save_and_sti(x) do { __save_flags(x); __sti(); } while(0); +#define irqs_disabled() \ +({ \ + unsigned long flags; \ + __save_flags(flags); \ + !(flags & (1<<9)); \ +}) + /* For spinlocks etc */ #if 0 #define local_irq_save(x) __asm__ __volatile__("pushfl ; popl %0 ; cli":"=g" (x): /* no input */ :"memory") diff -Nur linux-2.4.33-imedia/include/asm-ia64/bitops.h linux-2.4.33-imedia-patching/include/asm-ia64/bitops.h --- linux-2.4.33-imedia/include/asm-ia64/bitops.h 2003-11-28 20:26:21.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/asm-ia64/bitops.h 2006-01-26 15:19:42.000000000 +0200 @@ -4,6 +4,9 @@ /* * Copyright (C) 1998-2003 Hewlett-Packard Co * David Mosberger-Tang + * + * 02/06/02 find_next_bit() and find_first_bit() added from Erich Focht's ia64 O(1) + * scheduler patch */ #include @@ -91,6 +94,17 @@ } /** + * __clear_bit - Clears a bit in memory (non-atomic version) + */ +static __inline__ void +__clear_bit (int nr, volatile void *addr) +{ + volatile __u32 *p = (__u32 *) addr + (nr >> 5); + __u32 m = 1 << (nr & 31); + *p &= ~m; +} + +/** * change_bit - Toggle a bit in memory * @nr: Bit to clear * @addr: Address to start counting from @@ -266,12 +280,11 @@ } /** - * ffz - find the first zero bit in a memory region - * @x: The address to start the search at + * ffz - find the first zero bit in a long word + * @x: The long word to find the bit in * - * Returns the bit-number (0..63) of the first (least significant) zero bit, not - * the number of the byte containing a bit. Undefined if no zero exists, so - * code should check against ~0UL first... + * Returns the bit-number (0..63) of the first (least significant) zero bit. Undefined if + * no zero exists, so code should check against ~0UL first... */ static inline unsigned long ffz (unsigned long x) @@ -297,6 +310,21 @@ return result; } +/** + * __ffs - find first bit in word. + * @x: The word to search + * + * Undefined if no bit exists, so code should check against 0 first. + */ +static __inline__ unsigned long +__ffs (unsigned long x) +{ + unsigned long result; + + __asm__ ("popcnt %0=%1" : "=r" (result) : "r" ((x - 1) & ~x)); + return result; +} + #ifdef __KERNEL__ /* @@ -313,6 +341,12 @@ return exp - 0xffff; } +static int +fls (int x) +{ + return ia64_fls((unsigned int) x); +} + /* * ffs: find first bit set. 
This is defined the same way as the libc and compiler builtin * ffs routines, therefore differs in spirit from the above ffz (man ffs): it operates on @@ -385,8 +419,53 @@ */ #define find_first_zero_bit(addr, size) find_next_zero_bit((addr), (size), 0) +/* + * Find next bit in a bitmap reasonably efficiently.. + */ +static inline int +find_next_bit (void *addr, unsigned long size, unsigned long offset) +{ + unsigned long *p = ((unsigned long *) addr) + (offset >> 6); + unsigned long result = offset & ~63UL; + unsigned long tmp; + + if (offset >= size) + return size; + size -= result; + offset &= 63UL; + if (offset) { + tmp = *(p++); + tmp &= ~0UL << offset; + if (size < 64) + goto found_first; + if (tmp) + goto found_middle; + size -= 64; + result += 64; + } + while (size & ~63UL) { + if ((tmp = *(p++))) + goto found_middle; + result += 64; + size -= 64; + } + if (!size) + return result; + tmp = *p; + found_first: + tmp &= ~0UL >> (64-size); + if (tmp == 0UL) /* Are any bits set? */ + return result + size; /* Nope. */ + found_middle: + return result + __ffs(tmp); +} + +#define find_first_bit(addr, size) find_next_bit((addr), (size), 0) + #ifdef __KERNEL__ +#define __clear_bit(nr, addr) clear_bit(nr, addr) + #define ext2_set_bit test_and_set_bit #define ext2_clear_bit test_and_clear_bit #define ext2_test_bit test_bit @@ -400,6 +479,16 @@ #define minix_test_bit(nr,addr) test_bit(nr,addr) #define minix_find_first_zero_bit(addr,size) find_first_zero_bit(addr,size) +static inline int +sched_find_first_bit (unsigned long *b) +{ + if (unlikely(b[0])) + return __ffs(b[0]); + if (unlikely(b[1])) + return 64 + __ffs(b[1]); + return __ffs(b[2]) + 128; +} + #endif /* __KERNEL__ */ #endif /* _ASM_IA64_BITOPS_H */ diff -Nur linux-2.4.33-imedia/include/asm-m68k/bitops.h linux-2.4.33-imedia-patching/include/asm-m68k/bitops.h --- linux-2.4.33-imedia/include/asm-m68k/bitops.h 2004-02-18 15:36:32.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/asm-m68k/bitops.h 2006-01-26 15:19:42.000000000 +0200 @@ -97,6 +97,7 @@ (__builtin_constant_p(nr) ? \ __constant_clear_bit(nr, vaddr) : \ __generic_clear_bit(nr, vaddr)) +#define __clear_bit(nr,vaddr) clear_bit(nr,vaddr) static inline void __constant_clear_bit(int nr, volatile void *vaddr) { @@ -238,6 +239,28 @@ return 32 - cnt; } +#define __ffs(x) (ffs(x) - 1) + + +/* + * Every architecture must define this function. It's the fastest + * way of searching a 140-bit bitmap where the first 100 bits are + * unlikely to be set. It's guaranteed that at least one of the 140 + * bits is cleared. + */ +static inline int sched_find_first_bit(unsigned long *b) +{ + if (unlikely(b[0])) + return __ffs(b[0]); + if (unlikely(b[1])) + return __ffs(b[1]) + 32; + if (unlikely(b[2])) + return __ffs(b[2]) + 64; + if (b[3]) + return __ffs(b[3]) + 96; + return __ffs(b[4]) + 128; +} + /* * hweightN: returns the hamming weight (i.e. the number diff -Nur linux-2.4.33-imedia/include/asm-mips/bitops.h linux-2.4.33-imedia-patching/include/asm-mips/bitops.h --- linux-2.4.33-imedia/include/asm-mips/bitops.h 2004-02-18 15:36:32.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/asm-mips/bitops.h 2006-01-26 15:19:42.000000000 +0200 @@ -51,6 +51,8 @@ #ifdef CONFIG_CPU_HAS_LLSC +#include + /* * These functions for MIPS ISA > 1 are interrupt and SMP proof and * interrupt friendly @@ -593,21 +595,30 @@ * * Undefined if no zero exists, so code should check against ~0UL first. 
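Several of the ports above lean on simple identities between these helpers: ffz(x) equals __ffs(~x), and the m68k port defines __ffs(x) as ffs(x) - 1, ffs() being the 1-based libc-style routine. A quick stand-alone check of both identities (my_ffs0() is a portable reference, not kernel code):

  #include <assert.h>
  #include <stdio.h>
  #include <strings.h>            /* ffs() */

  /* Portable 0-based "lowest set bit" used as the reference. */
  static unsigned int my_ffs0(unsigned long x)
  {
          unsigned int i = 0;

          while (!(x & 1UL)) { x >>= 1; i++; }
          return i;
  }

  int main(void)
  {
          unsigned long x = 0xffffff0fUL;  /* arbitrary test word, not all ones */

          /* ffz(x) finds the lowest clear bit, i.e. the lowest set bit of ~x. */
          assert(my_ffs0(~x) == 4);

          /* libc ffs() is 1-based, so __ffs(x) == ffs(x) - 1 for x != 0. */
          assert((unsigned int)ffs(0x90) - 1 == my_ffs0(0x90));

          printf("ffz(0x%lx) = %u, __ffs(0x90) = %u\n",
                 x, my_ffs0(~x), my_ffs0(0x90));
          return 0;
  }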
*/ -static __inline__ unsigned long ffz(unsigned long word) +extern __inline__ unsigned long ffz(unsigned long word) { - int b = 0, s; + unsigned int __res; + unsigned int mask = 1; - word = ~word; - s = 16; if (word << 16 != 0) s = 0; b += s; word >>= s; - s = 8; if (word << 24 != 0) s = 0; b += s; word >>= s; - s = 4; if (word << 28 != 0) s = 0; b += s; word >>= s; - s = 2; if (word << 30 != 0) s = 0; b += s; word >>= s; - s = 1; if (word << 31 != 0) s = 0; b += s; + __asm__ ( + ".set\tnoreorder\n\t" + ".set\tnoat\n\t" + "move\t%0,$0\n" + "1:\tand\t$1,%2,%1\n\t" + "beqz\t$1,2f\n\t" + "sll\t%1,1\n\t" + "bnez\t%1,1b\n\t" + "addiu\t%0,1\n\t" + ".set\tat\n\t" + ".set\treorder\n" + "2:\n\t" + : "=&r" (__res), "=r" (mask) + : "r" (word), "1" (mask) + : "$1"); - return b; + return __res; } - #ifdef __KERNEL__ /* diff -Nur linux-2.4.33-imedia/include/asm-mips/smplock.h linux-2.4.33-imedia-patching/include/asm-mips/smplock.h --- linux-2.4.33-imedia/include/asm-mips/smplock.h 2004-02-18 15:36:32.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/asm-mips/smplock.h 2006-01-26 15:19:42.000000000 +0200 @@ -8,12 +8,21 @@ #ifndef __ASM_SMPLOCK_H #define __ASM_SMPLOCK_H +#include #include #include extern spinlock_t kernel_flag; +#ifdef CONFIG_SMP #define kernel_locked() spin_is_locked(&kernel_flag) +#else +#ifdef CONFIG_PREEMPT +#define kernel_locked() preempt_get_count() +#else +#define kernel_locked() 1 +#endif +#endif /* * Release global kernel lock and global interrupt lock @@ -45,8 +54,14 @@ */ static __inline__ void lock_kernel(void) { +#ifdef CONFIG_PREEMPT + if (current->lock_depth == -1) + spin_lock(&kernel_flag); + ++current->lock_depth; +#else if (!++current->lock_depth) spin_lock(&kernel_flag); +#endif } static __inline__ void unlock_kernel(void) diff -Nur linux-2.4.33-imedia/include/asm-mips/softirq.h linux-2.4.33-imedia-patching/include/asm-mips/softirq.h --- linux-2.4.33-imedia/include/asm-mips/softirq.h 2002-11-29 01:53:15.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/asm-mips/softirq.h 2006-01-26 15:19:42.000000000 +0200 @@ -15,6 +15,7 @@ static inline void cpu_bh_disable(int cpu) { + preempt_disable(); local_bh_count(cpu)++; barrier(); } @@ -23,6 +24,7 @@ { barrier(); local_bh_count(cpu)--; + preempt_enable(); } @@ -36,6 +38,7 @@ cpu = smp_processor_id(); \ if (!--local_bh_count(cpu) && softirq_pending(cpu)) \ do_softirq(); \ + preempt_enable(); \ } while (0) #define in_softirq() (local_bh_count(smp_processor_id()) != 0) diff -Nur linux-2.4.33-imedia/include/asm-mips/system.h linux-2.4.33-imedia-patching/include/asm-mips/system.h --- linux-2.4.33-imedia/include/asm-mips/system.h 2004-02-18 15:36:32.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/asm-mips/system.h 2006-01-26 15:19:42.000000000 +0200 @@ -333,4 +333,18 @@ #define die_if_kernel(msg, regs) \ __die_if_kernel(msg, regs, __FILE__ ":", __FUNCTION__, __LINE__) +extern __inline__ int intr_on(void) +{ + unsigned long flags; + save_flags(flags); + return flags & 1; +} + +extern __inline__ int intr_off(void) +{ + return ! 
intr_on(); +} + +#define irqs_disabled() intr_off() + #endif /* _ASM_SYSTEM_H */ diff -Nur linux-2.4.33-imedia/include/asm-ppc/bitops.h linux-2.4.33-imedia-patching/include/asm-ppc/bitops.h --- linux-2.4.33-imedia/include/asm-ppc/bitops.h 2003-08-25 14:44:44.000000000 +0300 +++ linux-2.4.33-imedia-patching/include/asm-ppc/bitops.h 2006-01-26 15:19:42.000000000 +0200 @@ -7,9 +7,18 @@ #define _PPC_BITOPS_H #include +#include #include #include +#ifdef CONFIG_IBM405_ERR77 +#define PPC405_ERR77(ra,rb) dcbt ra, rb; +#define PPC405_ERR77_SYNC sync; +#else +#define PPC405_ERR77(ra,rb) +#define PPC405_ERR77_SYNC +#endif + /* * The test_and_*_bit operations are taken to imply a memory barrier * on SMP systems. @@ -26,7 +35,7 @@ * These used to be if'd out here because using : "cc" as a constraint * resulted in errors from egcs. Things appear to be OK with gcc-2.95. */ -static __inline__ void set_bit(int nr, volatile void * addr) +static __inline__ void set_bit(int nr, volatile unsigned long * addr) { unsigned long old; unsigned long mask = 1 << (nr & 0x1f); @@ -46,7 +55,7 @@ /* * non-atomic version */ -static __inline__ void __set_bit(int nr, volatile void *addr) +static __inline__ void __set_bit(int nr, volatile unsigned long *addr) { unsigned long mask = 1 << (nr & 0x1f); unsigned long *p = ((unsigned long *)addr) + (nr >> 5); @@ -60,7 +69,7 @@ #define smp_mb__before_clear_bit() smp_mb() #define smp_mb__after_clear_bit() smp_mb() -static __inline__ void clear_bit(int nr, volatile void *addr) +static __inline__ void clear_bit(int nr, volatile unsigned long *addr) { unsigned long old; unsigned long mask = 1 << (nr & 0x1f); @@ -80,7 +89,7 @@ /* * non-atomic version */ -static __inline__ void __clear_bit(int nr, volatile void *addr) +static __inline__ void __clear_bit(int nr, volatile unsigned long *addr) { unsigned long mask = 1 << (nr & 0x1f); unsigned long *p = ((unsigned long *)addr) + (nr >> 5); @@ -88,7 +97,7 @@ *p &= ~mask; } -static __inline__ void change_bit(int nr, volatile void *addr) +static __inline__ void change_bit(int nr, volatile unsigned long *addr) { unsigned long old; unsigned long mask = 1 << (nr & 0x1f); @@ -108,7 +117,7 @@ /* * non-atomic version */ -static __inline__ void __change_bit(int nr, volatile void *addr) +static __inline__ void __change_bit(int nr, volatile unsigned long *addr) { unsigned long mask = 1 << (nr & 0x1f); unsigned long *p = ((unsigned long *)addr) + (nr >> 5); @@ -119,7 +128,7 @@ /* * test_and_*_bit do imply a memory barrier (?) 
*/ -static __inline__ int test_and_set_bit(int nr, volatile void *addr) +static __inline__ int test_and_set_bit(int nr, volatile unsigned long *addr) { unsigned int old, t; unsigned int mask = 1 << (nr & 0x1f); @@ -142,7 +151,7 @@ /* * non-atomic version */ -static __inline__ int __test_and_set_bit(int nr, volatile void *addr) +static __inline__ int __test_and_set_bit(int nr, volatile unsigned long *addr) { unsigned long mask = 1 << (nr & 0x1f); unsigned long *p = ((unsigned long *)addr) + (nr >> 5); @@ -152,7 +161,7 @@ return (old & mask) != 0; } -static __inline__ int test_and_clear_bit(int nr, volatile void *addr) +static __inline__ int test_and_clear_bit(int nr, volatile unsigned long *addr) { unsigned int old, t; unsigned int mask = 1 << (nr & 0x1f); @@ -175,7 +184,7 @@ /* * non-atomic version */ -static __inline__ int __test_and_clear_bit(int nr, volatile void *addr) +static __inline__ int __test_and_clear_bit(int nr, volatile unsigned long *addr) { unsigned long mask = 1 << (nr & 0x1f); unsigned long *p = ((unsigned long *)addr) + (nr >> 5); @@ -185,7 +194,7 @@ return (old & mask) != 0; } -static __inline__ int test_and_change_bit(int nr, volatile void *addr) +static __inline__ int test_and_change_bit(int nr, volatile unsigned long *addr) { unsigned int old, t; unsigned int mask = 1 << (nr & 0x1f); @@ -208,7 +217,7 @@ /* * non-atomic version */ -static __inline__ int __test_and_change_bit(int nr, volatile void *addr) +static __inline__ int __test_and_change_bit(int nr, volatile unsigned long *addr) { unsigned long mask = 1 << (nr & 0x1f); unsigned long *p = ((unsigned long *)addr) + (nr >> 5); @@ -218,7 +227,7 @@ return (old & mask) != 0; } -static __inline__ int test_bit(int nr, __const__ volatile void *addr) +static __inline__ int test_bit(int nr, __const__ volatile unsigned long *addr) { __const__ unsigned int *p = (__const__ unsigned int *) addr; @@ -226,7 +235,7 @@ } /* Return the bit position of the most significant 1 bit in a word */ -static __inline__ int __ilog2(unsigned int x) +static __inline__ int __ilog2(unsigned long x) { int lz; @@ -234,13 +243,18 @@ return 31 - lz; } -static __inline__ int ffz(unsigned int x) +static __inline__ int ffz(unsigned long x) { if ((x = ~x) == 0) return 32; return __ilog2(x & -x); } +static inline int __ffs(unsigned long x) +{ + return __ilog2(x & -x); +} + /* * ffs: find first bit set. This is defined the same way as * the libc and compiler builtin ffs routines, therefore @@ -252,6 +266,18 @@ } /* + * fls: find last (most-significant) bit set. + * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. + */ +static __inline__ int fls(unsigned int x) +{ + int lz; + + asm ("cntlzw %0,%1" : "=r" (lz) : "r" (x)); + return 32 - lz; +} + +/* * hweightN: returns the hamming weight (i.e. the number * of bits set) of a N-bit word */ @@ -261,13 +287,95 @@ #define hweight8(x) generic_hweight8(x) /* + * Find the first bit set in a 140-bit bitmap. + * The first 100 bits are unlikely to be set. 
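The PowerPC __ffs() above uses the two's-complement identity x & -x, which isolates the lowest set bit, and then takes its log2 via cntlzw (__ilog2(y) = 31 - clz(y)). A portable check of that identity (clz32() stands in for the cntlzw instruction):

  #include <assert.h>
  #include <stdio.h>

  /* Portable count-leading-zeros for a non-zero 32-bit value. */
  static int clz32(unsigned int x)
  {
          int n = 0;

          while (!(x & 0x80000000u)) { x <<= 1; n++; }
          return n;
  }

  static int ilog2_32(unsigned int x) { return 31 - clz32(x); }
  static int ffs0_32(unsigned int x)  { return ilog2_32(x & -x); }  /* __ffs analogue */

  int main(void)
  {
          unsigned int bit;

          for (bit = 0; bit < 32; bit++)
                  assert(ffs0_32(1u << bit) == (int)bit);

          /* x & -x keeps only the lowest set bit: */
          printf("0x%08x & -0x%08x = 0x%08x, __ffs = %d\n",
                 0x00a04000u, 0x00a04000u, 0x00a04000u & -0x00a04000u,
                 ffs0_32(0x00a04000u));
          return 0;
  }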
+ */ +static inline int sched_find_first_bit(unsigned long *b) +{ + if (unlikely(b[0])) + return __ffs(b[0]); + if (unlikely(b[1])) + return __ffs(b[1]) + 32; + if (unlikely(b[2])) + return __ffs(b[2]) + 64; + if (b[3]) + return __ffs(b[3]) + 96; + return __ffs(b[4]) + 128; +} + +/** + * find_next_bit - find the next set bit in a memory region + * @addr: The address to base the search on + * @offset: The bitnumber to start searching at + * @size: The maximum size to search + */ +static __inline__ unsigned long find_next_bit(unsigned long *addr, + unsigned long size, unsigned long offset) +{ + unsigned int *p = ((unsigned int *) addr) + (offset >> 5); + unsigned int result = offset & ~31UL; + unsigned int tmp; + + if (offset >= size) + return size; + size -= result; + offset &= 31UL; + if (offset) { + tmp = *p++; + tmp &= ~0UL << offset; + if (size < 32) + goto found_first; + if (tmp) + goto found_middle; + size -= 32; + result += 32; + } + while (size >= 32) { + if ((tmp = *p++) != 0) + goto found_middle; + result += 32; + size -= 32; + } + if (!size) + return result; + tmp = *p; + +found_first: + tmp &= ~0UL >> (32 - size); + if (tmp == 0UL) /* Are any bits set? */ + return result + size; /* Nope. */ +found_middle: + return result + __ffs(tmp); +} + +/** + * find_first_bit - find the first set bit in a memory region + * @addr: The address to start the search at + * @size: The maximum size to search + * + * Returns the bit-number of the first set bit, not the number of the byte + * containing a bit. + */ +#define find_first_bit(addr, size) \ + find_next_bit((addr), (size), 0) + +/* + * Every architecture must define this function. It's the fastest + * way of searching a 140-bit bitmap where the first 100 bits are + * unlikely to be set. It's guaranteed that at least one of the 140 + * bits is cleared. + */ +#define _sched_find_first_bit(map) \ + find_first_bit(map, MAX_PRIO) + +/* * This implementation of find_{first,next}_zero_bit was stolen from * Linus' asm-alpha/bitops.h. */ #define find_first_zero_bit(addr, size) \ find_next_zero_bit((addr), (size), 0) -static __inline__ unsigned long find_next_zero_bit(void * addr, +static __inline__ unsigned long find_next_zero_bit(unsigned long * addr, unsigned long size, unsigned long offset) { unsigned int * p = ((unsigned int *) addr) + (offset >> 5); @@ -306,8 +414,8 @@ } -#define ext2_set_bit(nr, addr) __test_and_set_bit((nr) ^ 0x18, addr) -#define ext2_clear_bit(nr, addr) __test_and_clear_bit((nr) ^ 0x18, addr) +#define ext2_set_bit(nr, addr) __test_and_set_bit((nr) ^ 0x18, (unsigned long *)(addr)) +#define ext2_clear_bit(nr, addr) __test_and_clear_bit((nr) ^ 0x18, (unsigned long *)(addr)) static __inline__ int ext2_test_bit(int nr, __const__ void * addr) { diff -Nur linux-2.4.33-imedia/include/asm-ppc/dma.h linux-2.4.33-imedia-patching/include/asm-ppc/dma.h --- linux-2.4.33-imedia/include/asm-ppc/dma.h 2003-06-13 17:51:38.000000000 +0300 +++ linux-2.4.33-imedia-patching/include/asm-ppc/dma.h 2006-01-26 15:19:42.000000000 +0200 @@ -11,6 +11,7 @@ #include #include #include +#include #include /* diff -Nur linux-2.4.33-imedia/include/asm-ppc/hardirq.h linux-2.4.33-imedia-patching/include/asm-ppc/hardirq.h --- linux-2.4.33-imedia/include/asm-ppc/hardirq.h 2003-08-25 14:44:44.000000000 +0300 +++ linux-2.4.33-imedia-patching/include/asm-ppc/hardirq.h 2006-01-26 15:19:42.000000000 +0200 @@ -31,10 +31,12 @@ * Are we in an interrupt context? Either doing bottom half * or hardware interrupt processing? 
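As on i386 earlier in the patch, in_interrupt() and in_irq() below gain a leading preempt_is_disabled() test. Interrupt and bottom-half handlers always run with preemption disabled, so a fully preemptible task can answer "not in interrupt" without touching the per-CPU counters, and the counters are only sampled while the task cannot migrate off its CPU. A reduced single-CPU model of the test (plain globals stand in for the per-CPU and per-task counters):

  #include <stdio.h>

  static int preempt_count;       /* > 0 means preemption is disabled */
  static int irq_count;
  static int bh_count;

  #define preempt_is_disabled()   (preempt_count != 0)

  /* Same shape as the patched macro: the counters are looked at only when
   * preemption is off (so the CPU cannot change under us), and a fully
   * preemptible task is by definition not in interrupt context. */
  #define in_interrupt()  (preempt_is_disabled() && (irq_count + bh_count != 0))

  int main(void)
  {
          printf("task context:      in_interrupt() = %d\n", in_interrupt());

          preempt_count++;        /* what interrupt entry does via the preempt counter */
          irq_count++;
          printf("interrupt context: in_interrupt() = %d\n", in_interrupt());

          irq_count--;
          preempt_count--;
          return 0;
  }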
*/ -#define in_interrupt() ({ int __cpu = smp_processor_id(); \ - (local_irq_count(__cpu) + local_bh_count(__cpu) != 0); }) +#define in_interrupt() (preempt_is_disabled() && \ + ({ unsigned long __cpu = smp_processor_id(); \ + (local_irq_count(__cpu) + local_bh_count(__cpu) != 0); })) -#define in_irq() (local_irq_count(smp_processor_id()) != 0) +#define in_irq() (preempt_is_disabled() && \ + (local_irq_count(smp_processor_id()) != 0)) #ifndef CONFIG_SMP @@ -45,6 +47,7 @@ #define hardirq_exit(cpu) (local_irq_count(cpu)--) #define synchronize_irq() do { } while (0) +#define release_irqlock(cpu) do { } while (0) #else /* CONFIG_SMP */ diff -Nur linux-2.4.33-imedia/include/asm-ppc/highmem.h linux-2.4.33-imedia-patching/include/asm-ppc/highmem.h --- linux-2.4.33-imedia/include/asm-ppc/highmem.h 2003-11-28 20:26:21.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/asm-ppc/highmem.h 2006-01-26 15:19:42.000000000 +0200 @@ -84,6 +84,7 @@ unsigned int idx; unsigned long vaddr; + preempt_disable(); if (page < highmem_start_page) return page_address(page); @@ -105,8 +106,10 @@ unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; unsigned int idx = type + KM_TYPE_NR*smp_processor_id(); - if (vaddr < KMAP_FIX_BEGIN) // FIXME + if (vaddr < KMAP_FIX_BEGIN) { // FIXME + preempt_enable(); return; + } if (vaddr != KMAP_FIX_BEGIN + idx * PAGE_SIZE) BUG(); @@ -118,6 +121,7 @@ pte_clear(kmap_pte+idx); flush_tlb_page(0, vaddr); #endif + preempt_enable(); } #endif /* __KERNEL__ */ diff -Nur linux-2.4.33-imedia/include/asm-ppc/hw_irq.h linux-2.4.33-imedia-patching/include/asm-ppc/hw_irq.h --- linux-2.4.33-imedia/include/asm-ppc/hw_irq.h 2003-06-13 17:51:38.000000000 +0300 +++ linux-2.4.33-imedia-patching/include/asm-ppc/hw_irq.h 2006-01-26 15:19:42.000000000 +0200 @@ -20,6 +20,12 @@ #define __save_and_cli(flags) ({__save_flags(flags);__cli();}) #define __save_and_sti(flags) ({__save_flags(flags);__sti();}) +#define mfmsr() ({unsigned int rval; \ + asm volatile("mfmsr %0" : "=r" (rval)); rval;}) +#define mtmsr(v) asm volatile("mtmsr %0" : : "r" (v)) + +#define irqs_disabled() ((mfmsr() & MSR_EE) == 0) + extern void do_lost_interrupts(unsigned long); #define mask_irq(irq) ({if (irq_desc[irq].handler && irq_desc[irq].handler->disable) irq_desc[irq].handler->disable(irq);}) diff -Nur linux-2.4.33-imedia/include/asm-ppc/mmu_context.h linux-2.4.33-imedia-patching/include/asm-ppc/mmu_context.h --- linux-2.4.33-imedia/include/asm-ppc/mmu_context.h 2003-06-13 17:51:38.000000000 +0300 +++ linux-2.4.33-imedia-patching/include/asm-ppc/mmu_context.h 2006-01-26 15:19:43.000000000 +0200 @@ -155,6 +155,10 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk, int cpu) { +#ifdef CONFIG_PREEMPT + if (preempt_get_count() == 0) + BUG(); +#endif tsk->thread.pgdir = next->pgd; get_mmu_context(next); set_context(next->context, next->pgd); diff -Nur linux-2.4.33-imedia/include/asm-ppc/pgalloc.h linux-2.4.33-imedia-patching/include/asm-ppc/pgalloc.h --- linux-2.4.33-imedia/include/asm-ppc/pgalloc.h 2003-11-28 20:26:21.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/asm-ppc/pgalloc.h 2006-01-26 15:19:43.000000000 +0200 @@ -72,20 +72,26 @@ { unsigned long *ret; + preempt_disable(); if ((ret = pgd_quicklist) != NULL) { pgd_quicklist = (unsigned long *)(*ret); ret[0] = 0; pgtable_cache_size--; - } else + preempt_enable(); + } else { + preempt_enable(); ret = (unsigned long *)get_pgd_slow(); + } return (pgd_t *)ret; } extern __inline__ void free_pgd_fast(pgd_t *pgd) { + 
preempt_disable(); *(unsigned long **)pgd = pgd_quicklist; pgd_quicklist = (unsigned long *) pgd; pgtable_cache_size++; + preempt_enable(); } extern __inline__ void free_pgd_slow(pgd_t *pgd) @@ -124,19 +130,23 @@ { unsigned long *ret; + preempt_disable(); if ((ret = pte_quicklist) != NULL) { pte_quicklist = (unsigned long *)(*ret); ret[0] = 0; pgtable_cache_size--; } + preempt_enable(); return (pte_t *)ret; } extern __inline__ void pte_free_fast(pte_t *pte) { + preempt_disable(); *(unsigned long **)pte = pte_quicklist; pte_quicklist = (unsigned long *) pte; pgtable_cache_size++; + preempt_enable(); } extern __inline__ void pte_free_slow(pte_t *pte) diff -Nur linux-2.4.33-imedia/include/asm-ppc/smplock.h linux-2.4.33-imedia-patching/include/asm-ppc/smplock.h --- linux-2.4.33-imedia/include/asm-ppc/smplock.h 2003-06-13 17:51:38.000000000 +0300 +++ linux-2.4.33-imedia-patching/include/asm-ppc/smplock.h 2006-01-26 15:19:43.000000000 +0200 @@ -12,7 +12,15 @@ extern spinlock_t kernel_flag; +#ifdef CONFIG_SMP #define kernel_locked() spin_is_locked(&kernel_flag) +#else +#ifdef CONFIG_PREEMPT +#define kernel_locked() preempt_get_count() +#else +#define kernel_locked() 1 +#endif +#endif /* * Release global kernel lock and global interrupt lock @@ -44,8 +52,14 @@ */ static __inline__ void lock_kernel(void) { +#ifdef CONFIG_PREEMPT + if (current->lock_depth == -1) + spin_lock(&kernel_flag); + ++current->lock_depth; +#else if (!++current->lock_depth) spin_lock(&kernel_flag); +#endif } static __inline__ void unlock_kernel(void) diff -Nur linux-2.4.33-imedia/include/asm-ppc/softirq.h linux-2.4.33-imedia-patching/include/asm-ppc/softirq.h --- linux-2.4.33-imedia/include/asm-ppc/softirq.h 2003-06-13 17:51:38.000000000 +0300 +++ linux-2.4.33-imedia-patching/include/asm-ppc/softirq.h 2006-01-26 15:19:43.000000000 +0200 @@ -7,6 +7,7 @@ #define local_bh_disable() \ do { \ + preempt_disable(); \ local_bh_count(smp_processor_id())++; \ barrier(); \ } while (0) @@ -15,9 +16,10 @@ do { \ barrier(); \ local_bh_count(smp_processor_id())--; \ + preempt_enable(); \ } while (0) -#define local_bh_enable() \ +#define _local_bh_enable() \ do { \ if (!--local_bh_count(smp_processor_id()) \ && softirq_pending(smp_processor_id())) { \ @@ -25,7 +27,14 @@ } \ } while (0) -#define in_softirq() (local_bh_count(smp_processor_id()) != 0) +#define local_bh_enable() \ +do { \ + _local_bh_enable(); \ + preempt_enable(); \ +} while (0) + +#define in_softirq() (preempt_is_disabled() && \ + (local_bh_count(smp_processor_id()) != 0)) #endif /* __ASM_SOFTIRQ_H */ #endif /* __KERNEL__ */ diff -Nur linux-2.4.33-imedia/include/asm-ppc64/bitops.h linux-2.4.33-imedia-patching/include/asm-ppc64/bitops.h --- linux-2.4.33-imedia/include/asm-ppc64/bitops.h 2003-06-13 17:51:38.000000000 +0300 +++ linux-2.4.33-imedia-patching/include/asm-ppc64/bitops.h 2006-01-26 15:19:43.000000000 +0200 @@ -41,12 +41,12 @@ #define smp_mb__before_clear_bit() smp_mb() #define smp_mb__after_clear_bit() smp_mb() -static __inline__ int test_bit(unsigned long nr, __const__ volatile void *addr) +static __inline__ int test_bit(unsigned long nr, __const__ volatile unsigned long *addr) { return (1UL & (((__const__ long *) addr)[nr >> 6] >> (nr & 63))); } -static __inline__ void set_bit(unsigned long nr, volatile void *addr) +static __inline__ void set_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long old; unsigned long mask = 1UL << (nr & 0x3f); @@ -62,7 +62,7 @@ : "cc"); } -static __inline__ void clear_bit(unsigned long nr, volatile void *addr) +static 
__inline__ void clear_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long old; unsigned long mask = 1UL << (nr & 0x3f); @@ -78,7 +78,7 @@ : "cc"); } -static __inline__ void change_bit(unsigned long nr, volatile void *addr) +static __inline__ void change_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long old; unsigned long mask = 1UL << (nr & 0x3f); @@ -94,7 +94,7 @@ : "cc"); } -static __inline__ int test_and_set_bit(unsigned long nr, volatile void *addr) +static __inline__ int test_and_set_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long old, t; unsigned long mask = 1UL << (nr & 0x3f); @@ -114,7 +114,7 @@ return (old & mask) != 0; } -static __inline__ int test_and_clear_bit(unsigned long nr, volatile void *addr) +static __inline__ int test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long old, t; unsigned long mask = 1UL << (nr & 0x3f); @@ -134,7 +134,7 @@ return (old & mask) != 0; } -static __inline__ int test_and_change_bit(unsigned long nr, volatile void *addr) +static __inline__ int test_and_change_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long old, t; unsigned long mask = 1UL << (nr & 0x3f); @@ -157,7 +157,7 @@ /* * non-atomic versions */ -static __inline__ void __set_bit(unsigned long nr, volatile void *addr) +static __inline__ void __set_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = 1UL << (nr & 0x3f); unsigned long *p = ((unsigned long *)addr) + (nr >> 6); @@ -165,7 +165,7 @@ *p |= mask; } -static __inline__ void __clear_bit(unsigned long nr, volatile void *addr) +static __inline__ void __clear_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = 1UL << (nr & 0x3f); unsigned long *p = ((unsigned long *)addr) + (nr >> 6); @@ -173,7 +173,7 @@ *p &= ~mask; } -static __inline__ void __change_bit(unsigned long nr, volatile void *addr) +static __inline__ void __change_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = 1UL << (nr & 0x3f); unsigned long *p = ((unsigned long *)addr) + (nr >> 6); @@ -181,7 +181,7 @@ *p ^= mask; } -static __inline__ int __test_and_set_bit(unsigned long nr, volatile void *addr) +static __inline__ int __test_and_set_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = 1UL << (nr & 0x3f); unsigned long *p = ((unsigned long *)addr) + (nr >> 6); @@ -191,7 +191,7 @@ return (old & mask) != 0; } -static __inline__ int __test_and_clear_bit(unsigned long nr, volatile void *addr) +static __inline__ int __test_and_clear_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = 1UL << (nr & 0x3f); unsigned long *p = ((unsigned long *)addr) + (nr >> 6); @@ -201,7 +201,7 @@ return (old & mask) != 0; } -static __inline__ int __test_and_change_bit(unsigned long nr, volatile void *addr) +static __inline__ int __test_and_change_bit(unsigned long nr, volatile unsigned long *addr) { unsigned long mask = 1UL << (nr & 0x3f); unsigned long *p = ((unsigned long *)addr) + (nr >> 6); diff -Nur linux-2.4.33-imedia/include/asm-s390/bitops.h linux-2.4.33-imedia-patching/include/asm-s390/bitops.h --- linux-2.4.33-imedia/include/asm-s390/bitops.h 2002-08-03 03:39:45.000000000 +0300 +++ linux-2.4.33-imedia-patching/include/asm-s390/bitops.h 2006-01-26 15:19:43.000000000 +0200 @@ -47,272 +47,217 @@ extern const char _oi_bitmap[]; extern const char _ni_bitmap[]; extern const char _zb_findmap[]; +extern const char _sb_findmap[]; #ifdef CONFIG_SMP /* * SMP save set_bit routine based 
on compare and swap (CS) */ -static __inline__ void set_bit_cs(int nr, volatile void * addr) +static inline void set_bit_cs(int nr, volatile void *ptr) { - unsigned long bits, mask; - __asm__ __volatile__( + unsigned long addr, old, new, mask; + + addr = (unsigned long) ptr; #if ALIGN_CS == 1 - " lhi %2,3\n" /* CS must be aligned on 4 byte b. */ - " nr %2,%1\n" /* isolate last 2 bits of address */ - " xr %1,%2\n" /* make addr % 4 == 0 */ - " sll %2,3\n" - " ar %0,%2\n" /* add alignement to bitnr */ + addr ^= addr & 3; /* align address to 4 */ + nr += (addr & 3) << 3; /* add alignment to bit number */ #endif - " lhi %2,31\n" - " nr %2,%0\n" /* make shift value */ - " xr %0,%2\n" - " srl %0,3\n" - " lhi %3,1\n" - " la %1,0(%0,%1)\n" /* calc. address for CS */ - " sll %3,0(%2)\n" /* make OR mask */ - " l %0,0(%1)\n" - "0: lr %2,%0\n" /* CS loop starts here */ - " or %2,%3\n" /* set bit */ - " cs %0,%2,0(%1)\n" - " jl 0b" - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) : - : "cc", "memory" ); + addr += (nr ^ (nr & 31)) >> 3; /* calculate address for CS */ + mask = 1UL << (nr & 31); /* make OR mask */ + asm volatile( + " l %0,0(%4)\n" + "0: lr %1,%0\n" + " or %1,%3\n" + " cs %0,%1,0(%4)\n" + " jl 0b" + : "=&d" (old), "=&d" (new), "+m" (*(unsigned int *) addr) + : "d" (mask), "a" (addr) + : "cc" ); } /* * SMP save clear_bit routine based on compare and swap (CS) */ -static __inline__ void clear_bit_cs(int nr, volatile void * addr) +static inline void clear_bit_cs(int nr, volatile void *ptr) { - static const int minusone = -1; - unsigned long bits, mask; - __asm__ __volatile__( + unsigned long addr, old, new, mask; + + addr = (unsigned long) ptr; #if ALIGN_CS == 1 - " lhi %2,3\n" /* CS must be aligned on 4 byte b. */ - " nr %2,%1\n" /* isolate last 2 bits of address */ - " xr %1,%2\n" /* make addr % 4 == 0 */ - " sll %2,3\n" - " ar %0,%2\n" /* add alignement to bitnr */ + addr ^= addr & 3; /* align address to 4 */ + nr += (addr & 3) << 3; /* add alignment to bit number */ #endif - " lhi %2,31\n" - " nr %2,%0\n" /* make shift value */ - " xr %0,%2\n" - " srl %0,3\n" - " lhi %3,1\n" - " la %1,0(%0,%1)\n" /* calc. address for CS */ - " sll %3,0(%2)\n" - " x %3,%4\n" /* make AND mask */ - " l %0,0(%1)\n" - "0: lr %2,%0\n" /* CS loop starts here */ - " nr %2,%3\n" /* clear bit */ - " cs %0,%2,0(%1)\n" - " jl 0b" - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) - : "m" (minusone) : "cc", "memory" ); + addr += (nr ^ (nr & 31)) >> 3; /* calculate address for CS */ + mask = ~(1UL << (nr & 31)); /* make AND mask */ + asm volatile( + " l %0,0(%4)\n" + "0: lr %1,%0\n" + " nr %1,%3\n" + " cs %0,%1,0(%4)\n" + " jl 0b" + : "=&d" (old), "=&d" (new), "+m" (*(unsigned int *) addr) + : "d" (mask), "a" (addr) + : "cc" ); } /* * SMP save change_bit routine based on compare and swap (CS) */ -static __inline__ void change_bit_cs(int nr, volatile void * addr) +static inline void change_bit_cs(int nr, volatile void *ptr) { - unsigned long bits, mask; - __asm__ __volatile__( + unsigned long addr, old, new, mask; + + addr = (unsigned long) ptr; #if ALIGN_CS == 1 - " lhi %2,3\n" /* CS must be aligned on 4 byte b. */ - " nr %2,%1\n" /* isolate last 2 bits of address */ - " xr %1,%2\n" /* make addr % 4 == 0 */ - " sll %2,3\n" - " ar %0,%2\n" /* add alignement to bitnr */ + addr ^= addr & 3; /* align address to 4 */ + nr += (addr & 3) << 3; /* add alignment to bit number */ #endif - " lhi %2,31\n" - " nr %2,%0\n" /* make shift value */ - " xr %0,%2\n" - " srl %0,3\n" - " lhi %3,1\n" - " la %1,0(%0,%1)\n" /* calc. 
address for CS */ - " sll %3,0(%2)\n" /* make XR mask */ - " l %0,0(%1)\n" - "0: lr %2,%0\n" /* CS loop starts here */ - " xr %2,%3\n" /* change bit */ - " cs %0,%2,0(%1)\n" - " jl 0b" - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) : - : "cc", "memory" ); + addr += (nr ^ (nr & 31)) >> 3; /* calculate address for CS */ + mask = 1UL << (nr & 31); /* make XOR mask */ + asm volatile( + " l %0,0(%4)\n" + "0: lr %1,%0\n" + " xr %1,%3\n" + " cs %0,%1,0(%4)\n" + " jl 0b" + : "=&d" (old), "=&d" (new), "+m" (*(unsigned int *) addr) + : "d" (mask), "a" (addr) + : "cc" ); } /* * SMP save test_and_set_bit routine based on compare and swap (CS) */ -static __inline__ int test_and_set_bit_cs(int nr, volatile void * addr) +static inline int test_and_set_bit_cs(int nr, volatile void *ptr) { - unsigned long bits, mask; - __asm__ __volatile__( + unsigned long addr, old, new, mask; + + addr = (unsigned long) ptr; #if ALIGN_CS == 1 - " lhi %2,3\n" /* CS must be aligned on 4 byte b. */ - " nr %2,%1\n" /* isolate last 2 bits of address */ - " xr %1,%2\n" /* make addr % 4 == 0 */ - " sll %2,3\n" - " ar %0,%2\n" /* add alignement to bitnr */ + addr ^= addr & 3; /* align address to 4 */ + nr += (addr & 3) << 3; /* add alignment to bit number */ #endif - " lhi %2,31\n" - " nr %2,%0\n" /* make shift value */ - " xr %0,%2\n" - " srl %0,3\n" - " lhi %3,1\n" - " la %1,0(%0,%1)\n" /* calc. address for CS */ - " sll %3,0(%2)\n" /* make OR mask */ - " l %0,0(%1)\n" - "0: lr %2,%0\n" /* CS loop starts here */ - " or %2,%3\n" /* set bit */ - " cs %0,%2,0(%1)\n" - " jl 0b\n" - " nr %0,%3\n" /* isolate old bit */ - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) : - : "cc", "memory" ); - return nr != 0; + addr += (nr ^ (nr & 31)) >> 3; /* calculate address for CS */ + mask = 1UL << (nr & 31); /* make OR/test mask */ + asm volatile( + " l %0,0(%4)\n" + "0: lr %1,%0\n" + " or %1,%3\n" + " cs %0,%1,0(%4)\n" + " jl 0b" + : "=&d" (old), "=&d" (new), "+m" (*(unsigned int *) addr) + : "d" (mask), "a" (addr) + : "cc" ); + return (old & mask) != 0; } /* * SMP save test_and_clear_bit routine based on compare and swap (CS) */ -static __inline__ int test_and_clear_bit_cs(int nr, volatile void * addr) +static inline int test_and_clear_bit_cs(int nr, volatile void *ptr) { - static const int minusone = -1; - unsigned long bits, mask; - __asm__ __volatile__( + unsigned long addr, old, new, mask; + + addr = (unsigned long) ptr; #if ALIGN_CS == 1 - " lhi %2,3\n" /* CS must be aligned on 4 byte b. */ - " nr %2,%1\n" /* isolate last 2 bits of address */ - " xr %1,%2\n" /* make addr % 4 == 0 */ - " sll %2,3\n" - " ar %0,%2\n" /* add alignement to bitnr */ + addr ^= addr & 3; /* align address to 4 */ + nr += (addr & 3) << 3; /* add alignment to bit number */ #endif - " lhi %2,31\n" - " nr %2,%0\n" /* make shift value */ - " xr %0,%2\n" - " srl %0,3\n" - " lhi %3,1\n" - " la %1,0(%0,%1)\n" /* calc. 
address for CS */ - " sll %3,0(%2)\n" - " l %0,0(%1)\n" - " x %3,%4\n" /* make AND mask */ - "0: lr %2,%0\n" /* CS loop starts here */ - " nr %2,%3\n" /* clear bit */ - " cs %0,%2,0(%1)\n" - " jl 0b\n" - " x %3,%4\n" - " nr %0,%3\n" /* isolate old bit */ - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) - : "m" (minusone) : "cc", "memory" ); - return nr; + addr += (nr ^ (nr & 31)) >> 3; /* calculate address for CS */ + mask = ~(1UL << (nr & 31)); /* make AND mask */ + asm volatile( + " l %0,0(%4)\n" + "0: lr %1,%0\n" + " nr %1,%3\n" + " cs %0,%1,0(%4)\n" + " jl 0b" + : "=&d" (old), "=&d" (new), "+m" (*(unsigned int *) addr) + : "d" (mask), "a" (addr) + : "cc" ); + return (old ^ new) != 0; } /* * SMP save test_and_change_bit routine based on compare and swap (CS) */ -static __inline__ int test_and_change_bit_cs(int nr, volatile void * addr) +static inline int test_and_change_bit_cs(int nr, volatile void *ptr) { - unsigned long bits, mask; - __asm__ __volatile__( + unsigned long addr, old, new, mask; + + addr = (unsigned long) ptr; #if ALIGN_CS == 1 - " lhi %2,3\n" /* CS must be aligned on 4 byte b. */ - " nr %2,%1\n" /* isolate last 2 bits of address */ - " xr %1,%2\n" /* make addr % 4 == 0 */ - " sll %2,3\n" - " ar %0,%2\n" /* add alignement to bitnr */ + addr ^= addr & 3; /* align address to 4 */ + nr += (addr & 3) << 3; /* add alignment to bit number */ #endif - " lhi %2,31\n" - " nr %2,%0\n" /* make shift value */ - " xr %0,%2\n" - " srl %0,3\n" - " lhi %3,1\n" - " la %1,0(%0,%1)\n" /* calc. address for CS */ - " sll %3,0(%2)\n" /* make OR mask */ - " l %0,0(%1)\n" - "0: lr %2,%0\n" /* CS loop starts here */ - " xr %2,%3\n" /* change bit */ - " cs %0,%2,0(%1)\n" - " jl 0b\n" - " nr %0,%3\n" /* isolate old bit */ - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) : - : "cc", "memory" ); - return nr != 0; + addr += (nr ^ (nr & 31)) >> 3; /* calculate address for CS */ + mask = 1UL << (nr & 31); /* make XOR mask */ + asm volatile( + " l %0,0(%4)\n" + "0: lr %1,%0\n" + " xr %1,%3\n" + " cs %0,%1,0(%4)\n" + " jl 0b" + : "=&d" (old), "=&d" (new), "+m" (*(unsigned int *) addr) + : "d" (mask), "a" (addr) + : "cc" ); + return (old & mask) != 0; } #endif /* CONFIG_SMP */ /* * fast, non-SMP set_bit routine */ -static __inline__ void __set_bit(int nr, volatile void * addr) +static inline void __set_bit(int nr, volatile void *ptr) { - unsigned long reg1, reg2; - __asm__ __volatile__( - " lhi %1,24\n" - " lhi %0,7\n" - " xr %1,%2\n" - " nr %0,%2\n" - " srl %1,3\n" - " la %1,0(%1,%3)\n" - " la %0,0(%0,%4)\n" - " oc 0(1,%1),0(%0)" - : "=&a" (reg1), "=&a" (reg2) - : "r" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" ); -} - -static __inline__ void -__constant_set_bit(const int nr, volatile void * addr) -{ - switch (nr&7) { - case 0: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x01" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory"); - break; - case 1: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x02" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 2: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x04" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 3: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x08" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 4: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x10" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 5: - 
__asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x20" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 6: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x40" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 7: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x80" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - } + unsigned long addr; + + addr = (unsigned long) ptr + ((nr ^ 24) >> 3); + asm volatile("oc 0(1,%1),0(%2)" + : "+m" (*(char *) addr) + : "a" (addr), "a" (_oi_bitmap + (nr & 7)) + : "cc" ); +} + +static inline void +__constant_set_bit(const int nr, volatile void *ptr) +{ + unsigned long addr; + + addr = ((unsigned long) ptr) + ((nr >> 3) ^ 3); + switch (nr&7) { + case 0: + asm volatile ("oi 0(%1),0x01" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 1: + asm volatile ("oi 0(%1),0x02" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 2: + asm volatile ("oi 0(%1),0x04" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 3: + asm volatile ("oi 0(%1),0x08" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 4: + asm volatile ("oi 0(%1),0x10" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 5: + asm volatile ("oi 0(%1),0x20" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 6: + asm volatile ("oi 0(%1),0x40" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 7: + asm volatile ("oi 0(%1),0x80" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + } } #define set_bit_simple(nr,addr) \ @@ -323,76 +268,58 @@ /* * fast, non-SMP clear_bit routine */ -static __inline__ void -__clear_bit(int nr, volatile void * addr) +static inline void +__clear_bit(int nr, volatile void *ptr) { - unsigned long reg1, reg2; - __asm__ __volatile__( - " lhi %1,24\n" - " lhi %0,7\n" - " xr %1,%2\n" - " nr %0,%2\n" - " srl %1,3\n" - " la %1,0(%1,%3)\n" - " la %0,0(%0,%4)\n" - " nc 0(1,%1),0(%0)" - : "=&a" (reg1), "=&a" (reg2) - : "r" (nr), "a" (addr), "a" (&_ni_bitmap) : "cc", "memory" ); -} - -static __inline__ void -__constant_clear_bit(const int nr, volatile void * addr) -{ - switch (nr&7) { - case 0: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xFE" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 1: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xFD" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 2: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xFB" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 3: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xF7" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 4: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xEF" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "cc", "memory" ); - break; - case 5: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xDF" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 6: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xBF" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 7: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0x7F" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - } + unsigned long addr; + + addr = (unsigned long) ptr + ((nr ^ 24) >> 3); 
+ asm volatile("nc 0(1,%1),0(%2)" + : "+m" (*(char *) addr) + : "a" (addr), "a" (_ni_bitmap + (nr & 7)) + : "cc" ); +} + +static inline void +__constant_clear_bit(const int nr, volatile void *ptr) +{ + unsigned long addr; + + addr = ((unsigned long) ptr) + ((nr >> 3) ^ 3); + switch (nr&7) { + case 0: + asm volatile ("ni 0(%1),0xFE" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 1: + asm volatile ("ni 0(%1),0xFD" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 2: + asm volatile ("ni 0(%1),0xFB" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 3: + asm volatile ("ni 0(%1),0xF7" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 4: + asm volatile ("ni 0(%1),0xEF" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 5: + asm volatile ("ni 0(%1),0xDF" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 6: + asm volatile ("ni 0(%1),0xBF" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 7: + asm volatile ("ni 0(%1),0x7F" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + } } #define clear_bit_simple(nr,addr) \ @@ -403,75 +330,57 @@ /* * fast, non-SMP change_bit routine */ -static __inline__ void __change_bit(int nr, volatile void * addr) +static inline void __change_bit(int nr, volatile void *ptr) { - unsigned long reg1, reg2; - __asm__ __volatile__( - " lhi %1,24\n" - " lhi %0,7\n" - " xr %1,%2\n" - " nr %0,%2\n" - " srl %1,3\n" - " la %1,0(%1,%3)\n" - " la %0,0(%0,%4)\n" - " xc 0(1,%1),0(%0)" - : "=&a" (reg1), "=&a" (reg2) - : "r" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" ); -} - -static __inline__ void -__constant_change_bit(const int nr, volatile void * addr) -{ - switch (nr&7) { - case 0: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x01" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "cc", "memory" ); - break; - case 1: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x02" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "cc", "memory" ); - break; - case 2: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x04" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "cc", "memory" ); - break; - case 3: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x08" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "cc", "memory" ); - break; - case 4: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x10" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "cc", "memory" ); - break; - case 5: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x20" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 6: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x40" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - case 7: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x80" - : "=m" (*((volatile char *) addr + ((nr>>3)^3))) - : : "1", "cc", "memory" ); - break; - } + unsigned long addr; + + addr = (unsigned long) ptr + ((nr ^ 24) >> 3); + asm volatile("xc 0(1,%1),0(%2)" + : "+m" (*(char *) addr) + : "a" (addr), "a" (_oi_bitmap + (nr & 7)) + : "cc" ); +} + +static inline void +__constant_change_bit(const int nr, volatile void *ptr) +{ + unsigned long addr; + + addr = ((unsigned long) ptr) + ((nr >> 3) ^ 3); + switch (nr&7) { + case 0: + asm volatile ("xi 0(%1),0x01" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 1: + asm volatile ("xi 0(%1),0x02" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 2: + asm volatile ("xi 0(%1),0x04" + : 
"+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 3: + asm volatile ("xi 0(%1),0x08" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 4: + asm volatile ("xi 0(%1),0x10" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 5: + asm volatile ("xi 0(%1),0x20" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 6: + asm volatile ("xi 0(%1),0x40" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 7: + asm volatile ("xi 0(%1),0x80" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + } } #define change_bit_simple(nr,addr) \ @@ -482,74 +391,54 @@ /* * fast, non-SMP test_and_set_bit routine */ -static __inline__ int test_and_set_bit_simple(int nr, volatile void * addr) +static inline int test_and_set_bit_simple(int nr, volatile void *ptr) { - unsigned long reg1, reg2; - int oldbit; - __asm__ __volatile__( - " lhi %1,24\n" - " lhi %2,7\n" - " xr %1,%3\n" - " nr %2,%3\n" - " srl %1,3\n" - " la %1,0(%1,%4)\n" - " ic %0,0(%1)\n" - " srl %0,0(%2)\n" - " la %2,0(%2,%5)\n" - " oc 0(1,%1),0(%2)" - : "=d&" (oldbit), "=&a" (reg1), "=&a" (reg2) - : "r" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" ); - return oldbit & 1; + unsigned long addr; + unsigned char ch; + + addr = (unsigned long) ptr + ((nr ^ 24) >> 3); + ch = *(unsigned char *) addr; + asm volatile("oc 0(1,%1),0(%2)" + : "+m" (*(char *) addr) + : "a" (addr), "a" (_oi_bitmap + (nr & 7)) + : "cc" ); + return (ch >> (nr & 7)) & 1; } #define __test_and_set_bit(X,Y) test_and_set_bit_simple(X,Y) /* * fast, non-SMP test_and_clear_bit routine */ -static __inline__ int test_and_clear_bit_simple(int nr, volatile void * addr) +static inline int test_and_clear_bit_simple(int nr, volatile void *ptr) { - unsigned long reg1, reg2; - int oldbit; + unsigned long addr; + unsigned char ch; - __asm__ __volatile__( - " lhi %1,24\n" - " lhi %2,7\n" - " xr %1,%3\n" - " nr %2,%3\n" - " srl %1,3\n" - " la %1,0(%1,%4)\n" - " ic %0,0(%1)\n" - " srl %0,0(%2)\n" - " la %2,0(%2,%5)\n" - " nc 0(1,%1),0(%2)" - : "=d&" (oldbit), "=&a" (reg1), "=&a" (reg2) - : "r" (nr), "a" (addr), "a" (&_ni_bitmap) : "cc", "memory" ); - return oldbit & 1; + addr = (unsigned long) ptr + ((nr ^ 24) >> 3); + ch = *(unsigned char *) addr; + asm volatile("nc 0(1,%1),0(%2)" + : "+m" (*(char *) addr) + : "a" (addr), "a" (_ni_bitmap + (nr & 7)) + : "cc" ); + return (ch >> (nr & 7)) & 1; } #define __test_and_clear_bit(X,Y) test_and_clear_bit_simple(X,Y) /* * fast, non-SMP test_and_change_bit routine */ -static __inline__ int test_and_change_bit_simple(int nr, volatile void * addr) +static inline int test_and_change_bit_simple(int nr, volatile void *ptr) { - unsigned long reg1, reg2; - int oldbit; + unsigned long addr; + unsigned char ch; - __asm__ __volatile__( - " lhi %1,24\n" - " lhi %2,7\n" - " xr %1,%3\n" - " nr %2,%1\n" - " srl %1,3\n" - " la %1,0(%1,%4)\n" - " ic %0,0(%1)\n" - " srl %0,0(%2)\n" - " la %2,0(%2,%5)\n" - " xc 0(1,%1),0(%2)" - : "=d&" (oldbit), "=&a" (reg1), "=&a" (reg2) - : "r" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" ); - return oldbit & 1; + addr = (unsigned long) ptr + ((nr ^ 24) >> 3); + ch = *(unsigned char *) addr; + asm volatile("xc 0(1,%1),0(%2)" + : "+m" (*(char *) addr) + : "a" (addr), "a" (_oi_bitmap + (nr & 7)) + : "cc" ); + return (ch >> (nr & 7)) & 1; } #define __test_and_change_bit(X,Y) test_and_change_bit_simple(X,Y) @@ -574,25 +463,17 @@ * This routine doesn't need to be atomic. 
*/ -static __inline__ int __test_bit(int nr, volatile void * addr) +static inline int __test_bit(int nr, volatile void *ptr) { - unsigned long reg1, reg2; - int oldbit; + unsigned long addr; + unsigned char ch; - __asm__ __volatile__( - " lhi %2,24\n" - " lhi %1,7\n" - " xr %2,%3\n" - " nr %1,%3\n" - " srl %2,3\n" - " ic %0,0(%2,%4)\n" - " srl %0,0(%1)" - : "=d&" (oldbit), "=&a" (reg1), "=&a" (reg2) - : "r" (nr), "a" (addr) : "cc" ); - return oldbit & 1; + addr = (unsigned long) ptr + ((nr ^ 24) >> 3); + ch = *(unsigned char *) addr; + return (ch >> (nr & 7)) & 1; } -static __inline__ int __constant_test_bit(int nr, volatile void * addr) { +static inline int __constant_test_bit(int nr, volatile void * addr) { return (((volatile char *) addr)[(nr>>3)^3] & (1<<(nr&7))) != 0; } @@ -604,7 +485,7 @@ /* * Find-bit routines.. */ -static __inline__ int find_first_zero_bit(void * addr, unsigned size) +static inline int find_first_zero_bit(void * addr, unsigned size) { unsigned long cmp, count; int res; @@ -642,7 +523,45 @@ return (res < size) ? res : size; } -static __inline__ int find_next_zero_bit (void * addr, int size, int offset) +static inline int find_first_bit(void * addr, unsigned size) +{ + unsigned long cmp, count; + int res; + + if (!size) + return 0; + __asm__(" slr %1,%1\n" + " lr %2,%3\n" + " slr %0,%0\n" + " ahi %2,31\n" + " srl %2,5\n" + "0: c %1,0(%0,%4)\n" + " jne 1f\n" + " ahi %0,4\n" + " brct %2,0b\n" + " lr %0,%3\n" + " j 4f\n" + "1: l %2,0(%0,%4)\n" + " sll %0,3\n" + " lhi %1,0xff\n" + " tml %2,0xffff\n" + " jnz 2f\n" + " ahi %0,16\n" + " srl %2,16\n" + "2: tml %2,0x00ff\n" + " jnz 3f\n" + " ahi %0,8\n" + " srl %2,8\n" + "3: nr %2,%1\n" + " ic %2,0(%2,%5)\n" + " alr %0,%2\n" + "4:" + : "=&a" (res), "=&d" (cmp), "=&a" (count) + : "a" (size), "a" (addr), "a" (&_sb_findmap) : "cc" ); + return (res < size) ? res : size; +} + +static inline int find_next_zero_bit (void * addr, int size, int offset) { unsigned long * p = ((unsigned long *) addr) + (offset >> 5); unsigned long bitvec, reg; @@ -680,11 +599,49 @@ return (offset + res); } +static inline int find_next_bit (void * addr, int size, int offset) +{ + unsigned long * p = ((unsigned long *) addr) + (offset >> 5); + unsigned long bitvec, reg; + int set, bit = offset & 31, res; + + if (bit) { + /* + * Look for set bit in first word + */ + bitvec = (*p) >> bit; + __asm__(" slr %0,%0\n" + " lhi %2,0xff\n" + " tml %1,0xffff\n" + " jnz 0f\n" + " ahi %0,16\n" + " srl %1,16\n" + "0: tml %1,0x00ff\n" + " jnz 1f\n" + " ahi %0,8\n" + " srl %1,8\n" + "1: nr %1,%2\n" + " ic %1,0(%1,%3)\n" + " alr %0,%1" + : "=&d" (set), "+a" (bitvec), "=&d" (reg) + : "a" (&_sb_findmap) : "cc" ); + if (set < (32 - bit)) + return set + offset; + offset += 32 - bit; + p++; + } + /* + * No set bit yet, search remaining full words for a bit + */ + res = find_first_bit (p, size - 32 * (p - (unsigned long *) addr)); + return (offset + res); +} + /* * ffz = Find First Zero in word. Undefined if no zero exists, * so code should check against ~0UL first.. */ -static __inline__ unsigned long ffz(unsigned long word) +static inline unsigned long ffz(unsigned long word) { unsigned long reg; int result; @@ -708,40 +665,109 @@ } /* + * __ffs = find first bit in word. Undefined if no bit exists, + * so code should check against 0UL first.. 
+ */ +static inline unsigned long __ffs(unsigned long word) +{ + unsigned long reg, result; + + __asm__(" slr %0,%0\n" + " lhi %2,0xff\n" + " tml %1,0xffff\n" + " jnz 0f\n" + " ahi %0,16\n" + " srl %1,16\n" + "0: tml %1,0x00ff\n" + " jnz 1f\n" + " ahi %0,8\n" + " srl %1,8\n" + "1: nr %1,%2\n" + " ic %1,0(%1,%3)\n" + " alr %0,%1" + : "=&d" (result), "+a" (word), "=&d" (reg) + : "a" (&_sb_findmap) : "cc" ); + return result; +} + +/* + * Every architecture must define this function. It's the fastest + * way of searching a 140-bit bitmap where the first 100 bits are + * unlikely to be set. It's guaranteed that at least one of the 140 + * bits is cleared. + */ +static inline int sched_find_first_bit(unsigned long *b) +{ + return find_first_bit(b, 140); +} + +/* * ffs: find first bit set. This is defined the same way as * the libc and compiler builtin ffs routines, therefore * differs in spirit from the above ffz (man ffs). */ -extern int __inline__ ffs (int x) +extern int inline ffs (int x) { - int r; + int r = 1; if (x == 0) - return 0; - __asm__(" slr %0,%0\n" - " tml %1,0xffff\n" + return 0; + __asm__(" tml %1,0xffff\n" " jnz 0f\n" - " ahi %0,16\n" " srl %1,16\n" + " ahi %0,16\n" "0: tml %1,0x00ff\n" " jnz 1f\n" - " ahi %0,8\n" " srl %1,8\n" + " ahi %0,8\n" "1: tml %1,0x000f\n" " jnz 2f\n" - " ahi %0,4\n" " srl %1,4\n" + " ahi %0,4\n" "2: tml %1,0x0003\n" " jnz 3f\n" - " ahi %0,2\n" " srl %1,2\n" + " ahi %0,2\n" "3: tml %1,0x0001\n" " jnz 4f\n" " ahi %0,1\n" "4:" : "=&d" (r), "+d" (x) : : "cc" ); - return r+1; + return r; +} + +/* + * fls: find last bit set. + */ +extern __inline__ int fls(int x) +{ + int r = 32; + + if (x == 0) + return 0; + __asm__(" tmh %1,0xffff\n" + " jz 0f\n" + " sll %1,16\n" + " ahi %0,-16\n" + "0: tmh %1,0xff00\n" + " jz 1f\n" + " sll %1,8\n" + " ahi %0,-8\n" + "1: tmh %1,0xf000\n" + " jz 2f\n" + " sll %1,4\n" + " ahi %0,-4\n" + "2: tmh %1,0xc000\n" + " jz 3f\n" + " sll %1,2\n" + " ahi %0,-2\n" + "3: tmh %1,0x8000\n" + " jz 4f\n" + " ahi %0,-1\n" + "4:" + : "+d" (r), "+d" (x) : : "cc" ); + return r; } /* @@ -769,7 +795,7 @@ #define ext2_set_bit(nr, addr) test_and_set_bit((nr)^24, addr) #define ext2_clear_bit(nr, addr) test_and_clear_bit((nr)^24, addr) #define ext2_test_bit(nr, addr) test_bit((nr)^24, addr) -static __inline__ int ext2_find_first_zero_bit(void *vaddr, unsigned size) +static inline int ext2_find_first_zero_bit(void *vaddr, unsigned size) { unsigned long cmp, count; int res; @@ -808,7 +834,7 @@ return (res < size) ? res : size; } -static __inline__ int +static inline int ext2_find_next_zero_bit(void *vaddr, unsigned size, unsigned offset) { unsigned long *addr = vaddr; diff -Nur linux-2.4.33-imedia/include/asm-s390x/bitops.h linux-2.4.33-imedia-patching/include/asm-s390x/bitops.h --- linux-2.4.33-imedia/include/asm-s390x/bitops.h 2002-08-03 03:39:45.000000000 +0300 +++ linux-2.4.33-imedia-patching/include/asm-s390x/bitops.h 2006-01-26 15:19:43.000000000 +0200 @@ -51,271 +51,220 @@ extern const char _oi_bitmap[]; extern const char _ni_bitmap[]; extern const char _zb_findmap[]; +extern const char _sb_findmap[]; #ifdef CONFIG_SMP /* * SMP save set_bit routine based on compare and swap (CS) */ -static __inline__ void set_bit_cs(unsigned long nr, volatile void * addr) +static inline void set_bit_cs(unsigned long nr, volatile void *ptr) { - unsigned long bits, mask; - __asm__ __volatile__( + unsigned long addr, old, new, mask; + + addr = (unsigned long) ptr; #if ALIGN_CS == 1 - " lghi %2,7\n" /* CS must be aligned on 4 byte b. 
*/ - " ngr %2,%1\n" /* isolate last 2 bits of address */ - " xgr %1,%2\n" /* make addr % 4 == 0 */ - " sllg %2,%2,3\n" - " agr %0,%2\n" /* add alignement to bitnr */ + addr ^= addr & 7; /* align address to 8 */ + nr += (addr & 7) << 3; /* add alignment to bit number */ #endif - " lghi %2,63\n" - " nr %2,%0\n" /* make shift value */ - " xr %0,%2\n" - " srlg %0,%0,3\n" - " lghi %3,1\n" - " la %1,0(%0,%1)\n" /* calc. address for CS */ - " sllg %3,%3,0(%2)\n" /* make OR mask */ - " lg %0,0(%1)\n" - "0: lgr %2,%0\n" /* CS loop starts here */ - " ogr %2,%3\n" /* set bit */ - " csg %0,%2,0(%1)\n" - " jl 0b" - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) : - : "cc", "memory" ); + addr += (nr ^ (nr & 63)) >> 3; /* calculate address for CS */ + mask = 1UL << (nr & 63); /* make OR mask */ + asm volatile( + " lg %0,0(%4)\n" + "0: lgr %1,%0\n" + " ogr %1,%3\n" + " csg %0,%1,0(%4)\n" + " jl 0b" + : "=&d" (old), "=&d" (new), "+m" (*(unsigned long *) addr) + : "d" (mask), "a" (addr) + : "cc" ); } /* * SMP save clear_bit routine based on compare and swap (CS) */ -static __inline__ void clear_bit_cs(unsigned long nr, volatile void * addr) +static inline void clear_bit_cs(unsigned long nr, volatile void *ptr) { - unsigned long bits, mask; - __asm__ __volatile__( + unsigned long addr, old, new, mask; + + addr = (unsigned long) ptr; #if ALIGN_CS == 1 - " lghi %2,7\n" /* CS must be aligned on 4 byte b. */ - " ngr %2,%1\n" /* isolate last 2 bits of address */ - " xgr %1,%2\n" /* make addr % 4 == 0 */ - " sllg %2,%2,3\n" - " agr %0,%2\n" /* add alignement to bitnr */ + addr ^= addr & 7; /* align address to 8 */ + nr += (addr & 7) << 3; /* add alignment to bit number */ #endif - " lghi %2,63\n" - " nr %2,%0\n" /* make shift value */ - " xr %0,%2\n" - " srlg %0,%0,3\n" - " lghi %3,-2\n" - " la %1,0(%0,%1)\n" /* calc. address for CS */ - " lghi %3,-2\n" - " rllg %3,%3,0(%2)\n" /* make AND mask */ - " lg %0,0(%1)\n" - "0: lgr %2,%0\n" /* CS loop starts here */ - " ngr %2,%3\n" /* clear bit */ - " csg %0,%2,0(%1)\n" - " jl 0b" - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) : - : "cc", "memory" ); + addr += (nr ^ (nr & 63)) >> 3; /* calculate address for CS */ + mask = ~(1UL << (nr & 63)); /* make AND mask */ + asm volatile( + " lg %0,0(%4)\n" + "0: lgr %1,%0\n" + " ngr %1,%3\n" + " csg %0,%1,0(%4)\n" + " jl 0b" + : "=&d" (old), "=&d" (new), "+m" (*(unsigned long *) addr) + : "d" (mask), "a" (addr) + : "cc" ); } /* * SMP save change_bit routine based on compare and swap (CS) */ -static __inline__ void change_bit_cs(unsigned long nr, volatile void * addr) +static inline void change_bit_cs(unsigned long nr, volatile void *ptr) { - unsigned long bits, mask; - __asm__ __volatile__( + unsigned long addr, old, new, mask; + + addr = (unsigned long) ptr; #if ALIGN_CS == 1 - " lghi %2,7\n" /* CS must be aligned on 4 byte b. */ - " ngr %2,%1\n" /* isolate last 2 bits of address */ - " xgr %1,%2\n" /* make addr % 4 == 0 */ - " sllg %2,%2,3\n" - " agr %0,%2\n" /* add alignement to bitnr */ + addr ^= addr & 7; /* align address to 8 */ + nr += (addr & 7) << 3; /* add alignment to bit number */ #endif - " lghi %2,63\n" - " nr %2,%0\n" /* make shift value */ - " xr %0,%2\n" - " srlg %0,%0,3\n" - " lghi %3,1\n" - " la %1,0(%0,%1)\n" /* calc. 
address for CS */ - " sllg %3,%3,0(%2)\n" /* make XR mask */ - " lg %0,0(%1)\n" - "0: lgr %2,%0\n" /* CS loop starts here */ - " xgr %2,%3\n" /* change bit */ - " csg %0,%2,0(%1)\n" - " jl 0b" - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) : - : "cc", "memory" ); + addr += (nr ^ (nr & 63)) >> 3; /* calculate address for CS */ + mask = 1UL << (nr & 63); /* make XOR mask */ + asm volatile( + " lg %0,0(%4)\n" + "0: lgr %1,%0\n" + " xgr %1,%3\n" + " csg %0,%1,0(%4)\n" + " jl 0b" + : "=&d" (old), "=&d" (new), "+m" (*(unsigned long *) addr) + : "d" (mask), "a" (addr) + : "cc" ); } /* * SMP save test_and_set_bit routine based on compare and swap (CS) */ -static __inline__ int -test_and_set_bit_cs(unsigned long nr, volatile void * addr) +static inline int +test_and_set_bit_cs(unsigned long nr, volatile void *ptr) { - unsigned long bits, mask; - __asm__ __volatile__( + unsigned long addr, old, new, mask; + + addr = (unsigned long) ptr; #if ALIGN_CS == 1 - " lghi %2,7\n" /* CS must be aligned on 4 byte b. */ - " ngr %2,%1\n" /* isolate last 2 bits of address */ - " xgr %1,%2\n" /* make addr % 4 == 0 */ - " sllg %2,%2,3\n" - " agr %0,%2\n" /* add alignement to bitnr */ + addr ^= addr & 7; /* align address to 8 */ + nr += (addr & 7) << 3; /* add alignment to bit number */ #endif - " lghi %2,63\n" - " nr %2,%0\n" /* make shift value */ - " xr %0,%2\n" - " srlg %0,%0,3\n" - " lghi %3,1\n" - " la %1,0(%0,%1)\n" /* calc. address for CS */ - " sllg %3,%3,0(%2)\n" /* make OR mask */ - " lg %0,0(%1)\n" - "0: lgr %2,%0\n" /* CS loop starts here */ - " ogr %2,%3\n" /* set bit */ - " csg %0,%2,0(%1)\n" - " jl 0b\n" - " ngr %0,%3\n" /* isolate old bit */ - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) : - : "cc", "memory" ); - return nr != 0; + addr += (nr ^ (nr & 63)) >> 3; /* calculate address for CS */ + mask = 1UL << (nr & 63); /* make OR/test mask */ + asm volatile( + " lg %0,0(%4)\n" + "0: lgr %1,%0\n" + " ogr %1,%3\n" + " csg %0,%1,0(%4)\n" + " jl 0b" + : "=&d" (old), "=&d" (new), "+m" (*(unsigned long *) addr) + : "d" (mask), "a" (addr) + : "cc" ); + return (old & mask) != 0; } /* * SMP save test_and_clear_bit routine based on compare and swap (CS) */ -static __inline__ int -test_and_clear_bit_cs(unsigned long nr, volatile void * addr) +static inline int +test_and_clear_bit_cs(unsigned long nr, volatile void *ptr) { - unsigned long bits, mask; - __asm__ __volatile__( + unsigned long addr, old, new, mask; + + addr = (unsigned long) ptr; #if ALIGN_CS == 1 - " lghi %2,7\n" /* CS must be aligned on 4 byte b. */ - " ngr %2,%1\n" /* isolate last 2 bits of address */ - " xgr %1,%2\n" /* make addr % 4 == 0 */ - " sllg %2,%2,3\n" - " agr %0,%2\n" /* add alignement to bitnr */ + addr ^= addr & 7; /* align address to 8 */ + nr += (addr & 7) << 3; /* add alignment to bit number */ #endif - " lghi %2,63\n" - " nr %2,%0\n" /* make shift value */ - " xr %0,%2\n" - " srlg %0,%0,3\n" - " lghi %3,-2\n" - " la %1,0(%0,%1)\n" /* calc. 
address for CS */ - " rllg %3,%3,0(%2)\n" /* make AND mask */ - " lg %0,0(%1)\n" - "0: lgr %2,%0\n" /* CS loop starts here */ - " ngr %2,%3\n" /* clear bit */ - " csg %0,%2,0(%1)\n" - " jl 0b\n" - " xgr %0,%2\n" /* isolate old bit */ - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) : - : "cc", "memory" ); - return nr != 0; + addr += (nr ^ (nr & 63)) >> 3; /* calculate address for CS */ + mask = ~(1UL << (nr & 63)); /* make AND mask */ + asm volatile( + " lg %0,0(%4)\n" + "0: lgr %1,%0\n" + " ngr %1,%3\n" + " csg %0,%1,0(%4)\n" + " jl 0b" + : "=&d" (old), "=&d" (new), "+m" (*(unsigned long *) addr) + : "d" (mask), "a" (addr) + : "cc" ); + return (old ^ new) != 0; } /* * SMP save test_and_change_bit routine based on compare and swap (CS) */ -static __inline__ int -test_and_change_bit_cs(unsigned long nr, volatile void * addr) +static inline int +test_and_change_bit_cs(unsigned long nr, volatile void *ptr) { - unsigned long bits, mask; - __asm__ __volatile__( + unsigned long addr, old, new, mask; + + addr = (unsigned long) ptr; #if ALIGN_CS == 1 - " lghi %2,7\n" /* CS must be aligned on 4 byte b. */ - " ngr %2,%1\n" /* isolate last 2 bits of address */ - " xgr %1,%2\n" /* make addr % 4 == 0 */ - " sllg %2,%2,3\n" - " agr %0,%2\n" /* add alignement to bitnr */ + addr ^= addr & 7; /* align address to 8 */ + nr += (addr & 7) << 3; /* add alignment to bit number */ #endif - " lghi %2,63\n" - " nr %2,%0\n" /* make shift value */ - " xr %0,%2\n" - " srlg %0,%0,3\n" - " lghi %3,1\n" - " la %1,0(%0,%1)\n" /* calc. address for CS */ - " sllg %3,%3,0(%2)\n" /* make OR mask */ - " lg %0,0(%1)\n" - "0: lgr %2,%0\n" /* CS loop starts here */ - " xgr %2,%3\n" /* change bit */ - " csg %0,%2,0(%1)\n" - " jl 0b\n" - " ngr %0,%3\n" /* isolate old bit */ - : "+a" (nr), "+a" (addr), "=&a" (bits), "=&d" (mask) : - : "cc", "memory" ); - return nr != 0; + addr += (nr ^ (nr & 63)) >> 3; /* calculate address for CS */ + mask = 1UL << (nr & 63); /* make XOR mask */ + asm volatile( + " lg %0,0(%4)\n" + "0: lgr %1,%0\n" + " xgr %1,%3\n" + " csg %0,%1,0(%4)\n" + " jl 0b" + : "=&d" (old), "=&d" (new), "+m" (*(unsigned long *) addr) + : "d" (mask), "a" (addr) + : "cc" ); + return (old & mask) != 0; } #endif /* CONFIG_SMP */ /* * fast, non-SMP set_bit routine */ -static __inline__ void __set_bit(unsigned long nr, volatile void * addr) +static inline void __set_bit(unsigned long nr, volatile void *ptr) { - unsigned long reg1, reg2; - __asm__ __volatile__( - " lghi %1,56\n" - " lghi %0,7\n" - " xgr %1,%2\n" - " nr %0,%2\n" - " srlg %1,%1,3\n" - " la %1,0(%1,%3)\n" - " la %0,0(%0,%4)\n" - " oc 0(1,%1),0(%0)" - : "=&a" (reg1), "=&a" (reg2) - : "a" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" ); -} - -static __inline__ void -__constant_set_bit(const unsigned long nr, volatile void * addr) -{ - switch (nr&7) { - case 0: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x01" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory"); - break; - case 1: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x02" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 2: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x04" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 3: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x08" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 4: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x10" - : "=m" (*((volatile char *) addr + 
((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 5: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x20" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 6: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x40" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 7: - __asm__ __volatile__ ("la 1,%0\n\t" - "oi 0(1),0x80" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - } + unsigned long addr; + + addr = (unsigned long) ptr + ((nr ^ 56) >> 3); + asm volatile("oc 0(1,%1),0(%2)" + : "+m" (*(char *) addr) + : "a" (addr), "a" (_oi_bitmap + (nr & 7)) + : "cc" ); +} + +static inline void +__constant_set_bit(const unsigned long nr, volatile void *ptr) +{ + unsigned long addr; + + addr = ((unsigned long) ptr) + ((nr >> 3) ^ 7); + switch (nr&7) { + case 0: + asm volatile ("oi 0(%1),0x01" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 1: + asm volatile ("oi 0(%1),0x02" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 2: + asm volatile ("oi 0(%1),0x04" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 3: + asm volatile ("oi 0(%1),0x08" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 4: + asm volatile ("oi 0(%1),0x10" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 5: + asm volatile ("oi 0(%1),0x20" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 6: + asm volatile ("oi 0(%1),0x40" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 7: + asm volatile ("oi 0(%1),0x80" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + } } #define set_bit_simple(nr,addr) \ @@ -326,76 +275,58 @@ /* * fast, non-SMP clear_bit routine */ -static __inline__ void -__clear_bit(unsigned long nr, volatile void * addr) +static inline void +__clear_bit(unsigned long nr, volatile void *ptr) { - unsigned long reg1, reg2; - __asm__ __volatile__( - " lghi %1,56\n" - " lghi %0,7\n" - " xgr %1,%2\n" - " nr %0,%2\n" - " srlg %1,%1,3\n" - " la %1,0(%1,%3)\n" - " la %0,0(%0,%4)\n" - " nc 0(1,%1),0(%0)" - : "=&a" (reg1), "=&a" (reg2) - : "d" (nr), "a" (addr), "a" (&_ni_bitmap) : "cc", "memory" ); -} - -static __inline__ void -__constant_clear_bit(const unsigned long nr, volatile void * addr) -{ - switch (nr&7) { - case 0: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xFE" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 1: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xFD" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 2: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xFB" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 3: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xF7" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 4: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xEF" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "cc", "memory" ); - break; - case 5: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xDF" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 6: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0xBF" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 7: - __asm__ __volatile__ ("la 1,%0\n\t" - "ni 0(1),0x7F" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : 
: "1", "cc", "memory" ); - break; - } + unsigned long addr; + + addr = (unsigned long) ptr + ((nr ^ 56) >> 3); + asm volatile("nc 0(1,%1),0(%2)" + : "+m" (*(char *) addr) + : "a" (addr), "a" (_ni_bitmap + (nr & 7)) + : "cc" ); +} + +static inline void +__constant_clear_bit(const unsigned long nr, volatile void *ptr) +{ + unsigned long addr; + + addr = ((unsigned long) ptr) + ((nr >> 3) ^ 7); + switch (nr&7) { + case 0: + asm volatile ("ni 0(%1),0xFE" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 1: + asm volatile ("ni 0(%1),0xFD" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 2: + asm volatile ("ni 0(%1),0xFB" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 3: + asm volatile ("ni 0(%1),0xF7" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 4: + asm volatile ("ni 0(%1),0xEF" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 5: + asm volatile ("ni 0(%1),0xDF" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 6: + asm volatile ("ni 0(%1),0xBF" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 7: + asm volatile ("ni 0(%1),0x7F" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + } } #define clear_bit_simple(nr,addr) \ @@ -406,75 +337,57 @@ /* * fast, non-SMP change_bit routine */ -static __inline__ void __change_bit(unsigned long nr, volatile void * addr) +static inline void __change_bit(unsigned long nr, volatile void *ptr) { - unsigned long reg1, reg2; - __asm__ __volatile__( - " lghi %1,56\n" - " lghi %0,7\n" - " xgr %1,%2\n" - " nr %0,%2\n" - " srlg %1,%1,3\n" - " la %1,0(%1,%3)\n" - " la %0,0(%0,%4)\n" - " xc 0(1,%1),0(%0)" - : "=&a" (reg1), "=&a" (reg2) - : "d" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" ); -} - -static __inline__ void -__constant_change_bit(const unsigned long nr, volatile void * addr) -{ - switch (nr&7) { - case 0: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x01" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "cc", "memory" ); - break; - case 1: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x02" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "cc", "memory" ); - break; - case 2: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x04" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "cc", "memory" ); - break; - case 3: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x08" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "cc", "memory" ); - break; - case 4: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x10" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "cc", "memory" ); - break; - case 5: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x20" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 6: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x40" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - case 7: - __asm__ __volatile__ ("la 1,%0\n\t" - "xi 0(1),0x80" - : "=m" (*((volatile char *) addr + ((nr>>3)^7))) - : : "1", "cc", "memory" ); - break; - } + unsigned long addr; + + addr = (unsigned long) ptr + ((nr ^ 56) >> 3); + asm volatile("xc 0(1,%1),0(%2)" + : "+m" (*(char *) addr) + : "a" (addr), "a" (_oi_bitmap + (nr & 7)) + : "cc" ); +} + +static inline void +__constant_change_bit(const unsigned long nr, volatile void *ptr) +{ + unsigned long addr; + + addr = ((unsigned long) ptr) + ((nr >> 3) ^ 7); + switch (nr&7) { + case 0: + asm volatile ("xi 0(%1),0x01" + : "+m" (*(char *) addr) : "a" 
(addr) : "cc" ); + break; + case 1: + asm volatile ("xi 0(%1),0x02" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 2: + asm volatile ("xi 0(%1),0x04" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 3: + asm volatile ("xi 0(%1),0x08" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 4: + asm volatile ("xi 0(%1),0x10" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 5: + asm volatile ("xi 0(%1),0x20" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 6: + asm volatile ("xi 0(%1),0x40" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + case 7: + asm volatile ("xi 0(%1),0x80" + : "+m" (*(char *) addr) : "a" (addr) : "cc" ); + break; + } } #define change_bit_simple(nr,addr) \ @@ -485,77 +398,57 @@ /* * fast, non-SMP test_and_set_bit routine */ -static __inline__ int -test_and_set_bit_simple(unsigned long nr, volatile void * addr) +static inline int +test_and_set_bit_simple(unsigned long nr, volatile void *ptr) { - unsigned long reg1, reg2; - int oldbit; - __asm__ __volatile__( - " lghi %1,56\n" - " lghi %2,7\n" - " xgr %1,%3\n" - " nr %2,%3\n" - " srlg %1,%1,3\n" - " la %1,0(%1,%4)\n" - " ic %0,0(%1)\n" - " srl %0,0(%2)\n" - " la %2,0(%2,%5)\n" - " oc 0(1,%1),0(%2)" - : "=&d" (oldbit), "=&a" (reg1), "=&a" (reg2) - : "d" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" ); - return oldbit & 1; + unsigned long addr; + unsigned char ch; + + addr = (unsigned long) ptr + ((nr ^ 56) >> 3); + ch = *(unsigned char *) addr; + asm volatile("oc 0(1,%1),0(%2)" + : "+m" (*(char *) addr) + : "a" (addr), "a" (_oi_bitmap + (nr & 7)) + : "cc" ); + return (ch >> (nr & 7)) & 1; } #define __test_and_set_bit(X,Y) test_and_set_bit_simple(X,Y) /* * fast, non-SMP test_and_clear_bit routine */ -static __inline__ int -test_and_clear_bit_simple(unsigned long nr, volatile void * addr) +static inline int +test_and_clear_bit_simple(unsigned long nr, volatile void *ptr) { - unsigned long reg1, reg2; - int oldbit; + unsigned long addr; + unsigned char ch; - __asm__ __volatile__( - " lghi %1,56\n" - " lghi %2,7\n" - " xgr %1,%3\n" - " nr %2,%3\n" - " srlg %1,%1,3\n" - " la %1,0(%1,%4)\n" - " ic %0,0(%1)\n" - " srl %0,0(%2)\n" - " la %2,0(%2,%5)\n" - " nc 0(1,%1),0(%2)" - : "=&d" (oldbit), "=&a" (reg1), "=&a" (reg2) - : "d" (nr), "a" (addr), "a" (&_ni_bitmap) : "cc", "memory" ); - return oldbit & 1; + addr = (unsigned long) ptr + ((nr ^ 56) >> 3); + ch = *(unsigned char *) addr; + asm volatile("nc 0(1,%1),0(%2)" + : "+m" (*(char *) addr) + : "a" (addr), "a" (_ni_bitmap + (nr & 7)) + : "cc" ); + return (ch >> (nr & 7)) & 1; } #define __test_and_clear_bit(X,Y) test_and_clear_bit_simple(X,Y) /* * fast, non-SMP test_and_change_bit routine */ -static __inline__ int -test_and_change_bit_simple(unsigned long nr, volatile void * addr) +static inline int +test_and_change_bit_simple(unsigned long nr, volatile void *ptr) { - unsigned long reg1, reg2; - int oldbit; + unsigned long addr; + unsigned char ch; - __asm__ __volatile__( - " lghi %1,56\n" - " lghi %2,7\n" - " xgr %1,%3\n" - " nr %2,%3\n" - " srlg %1,%1,3\n" - " la %1,0(%1,%4)\n" - " ic %0,0(%1)\n" - " srl %0,0(%2)\n" - " la %2,0(%2,%5)\n" - " xc 0(1,%1),0(%2)" - : "=&d" (oldbit), "=&a" (reg1), "=&a" (reg2) - : "d" (nr), "a" (addr), "a" (&_oi_bitmap) : "cc", "memory" ); - return oldbit & 1; + addr = (unsigned long) ptr + ((nr ^ 56) >> 3); + ch = *(unsigned char *) addr; + asm volatile("xc 0(1,%1),0(%2)" + : "+m" (*(char *) addr) + : "a" (addr), "a" (_oi_bitmap + (nr & 7)) + : "cc" ); + 
return (ch >> (nr & 7)) & 1; } #define __test_and_change_bit(X,Y) test_and_change_bit_simple(X,Y) @@ -580,26 +473,18 @@ * This routine doesn't need to be atomic. */ -static __inline__ int __test_bit(unsigned long nr, volatile void * addr) +static inline int __test_bit(unsigned long nr, volatile void *ptr) { - unsigned long reg1, reg2; - int oldbit; + unsigned long addr; + unsigned char ch; - __asm__ __volatile__( - " lghi %2,56\n" - " lghi %1,7\n" - " xgr %2,%3\n" - " nr %1,%3\n" - " srlg %2,%2,3\n" - " ic %0,0(%2,%4)\n" - " srl %0,0(%1)\n" - : "=&d" (oldbit), "=&a" (reg1), "=&a" (reg2) - : "d" (nr), "a" (addr) : "cc" ); - return oldbit & 1; + addr = (unsigned long) ptr + ((nr ^ 56) >> 3); + ch = *(unsigned char *) addr; + return (ch >> (nr & 7)) & 1; } -static __inline__ int -__constant_test_bit(unsigned long nr, volatile void * addr) { +static inline int +__constant_test_bit(unsigned long nr, volatile void *addr) { return (((volatile char *) addr)[(nr>>3)^7] & (1<<(nr&7))) != 0; } @@ -611,7 +496,7 @@ /* * Find-bit routines.. */ -static __inline__ unsigned long +static inline unsigned long find_first_zero_bit(void * addr, unsigned long size) { unsigned long res, cmp, count; @@ -653,7 +538,49 @@ return (res < size) ? res : size; } -static __inline__ unsigned long +static inline unsigned long +find_first_bit(void * addr, unsigned long size) +{ + unsigned long res, cmp, count; + + if (!size) + return 0; + __asm__(" slgr %1,%1\n" + " lgr %2,%3\n" + " slgr %0,%0\n" + " aghi %2,63\n" + " srlg %2,%2,6\n" + "0: cg %1,0(%0,%4)\n" + " jne 1f\n" + " aghi %0,8\n" + " brct %2,0b\n" + " lgr %0,%3\n" + " j 5f\n" + "1: lg %2,0(%0,%4)\n" + " sllg %0,%0,3\n" + " clr %2,%1\n" + " jne 2f\n" + " aghi %0,32\n" + " srlg %2,%2,32\n" + "2: lghi %1,0xff\n" + " tmll %2,0xffff\n" + " jnz 3f\n" + " aghi %0,16\n" + " srl %2,16\n" + "3: tmll %2,0x00ff\n" + " jnz 4f\n" + " aghi %0,8\n" + " srl %2,8\n" + "4: ngr %2,%1\n" + " ic %2,0(%2,%5)\n" + " algr %0,%2\n" + "5:" + : "=&a" (res), "=&d" (cmp), "=&a" (count) + : "a" (size), "a" (addr), "a" (&_sb_findmap) : "cc" ); + return (res < size) ? res : size; +} + +static inline unsigned long find_next_zero_bit (void * addr, unsigned long size, unsigned long offset) { unsigned long * p = ((unsigned long *) addr) + (offset >> 6); @@ -697,14 +624,56 @@ return (offset + res); } +static inline unsigned long +find_next_bit (void * addr, unsigned long size, unsigned long offset) +{ + unsigned long * p = ((unsigned long *) addr) + (offset >> 6); + unsigned long bitvec, reg; + unsigned long set, bit = offset & 63, res; + + if (bit) { + /* + * Look for zero in first word + */ + bitvec = (*p) >> bit; + __asm__(" slgr %0,%0\n" + " ltr %1,%1\n" + " jnz 0f\n" + " aghi %0,32\n" + " srlg %1,%1,32\n" + "0: lghi %2,0xff\n" + " tmll %1,0xffff\n" + " jnz 1f\n" + " aghi %0,16\n" + " srlg %1,%1,16\n" + "1: tmll %1,0x00ff\n" + " jnz 2f\n" + " aghi %0,8\n" + " srlg %1,%1,8\n" + "2: ngr %1,%2\n" + " ic %1,0(%1,%3)\n" + " algr %0,%1" + : "=&d" (set), "+a" (bitvec), "=&d" (reg) + : "a" (&_sb_findmap) : "cc" ); + if (set < (64 - bit)) + return set + offset; + offset += 64 - bit; + p++; + } + /* + * No set bit yet, search remaining full words for a bit + */ + res = find_first_bit (p, size - 64 * (p - (unsigned long *) addr)); + return (offset + res); +} + /* * ffz = Find First Zero in word. Undefined if no zero exists, * so code should check against ~0UL first.. 
*/ -static __inline__ unsigned long ffz(unsigned long word) +static inline unsigned long ffz(unsigned long word) { - unsigned long reg; - int result; + unsigned long reg, result; __asm__(" lhi %2,-1\n" " slgr %0,%0\n" @@ -730,40 +699,112 @@ } /* + * __ffs = find first bit in word. Undefined if no bit exists, + * so code should check against 0UL first.. + */ +static inline unsigned long __ffs (unsigned long word) +{ + unsigned long reg, result; + + __asm__(" slgr %0,%0\n" + " ltr %1,%1\n" + " jnz 0f\n" + " aghi %0,32\n" + " srlg %1,%1,32\n" + "0: lghi %2,0xff\n" + " tmll %1,0xffff\n" + " jnz 1f\n" + " aghi %0,16\n" + " srlg %1,%1,16\n" + "1: tmll %1,0x00ff\n" + " jnz 2f\n" + " aghi %0,8\n" + " srlg %1,%1,8\n" + "2: ngr %1,%2\n" + " ic %1,0(%1,%3)\n" + " algr %0,%1" + : "=&d" (result), "+a" (word), "=&d" (reg) + : "a" (&_sb_findmap) : "cc" ); + return result; +} + +/* + * Every architecture must define this function. It's the fastest + * way of searching a 140-bit bitmap where the first 100 bits are + * unlikely to be set. It's guaranteed that at least one of the 140 + * bits is cleared. + */ +static inline int sched_find_first_bit(unsigned long *b) +{ + return find_first_bit(b, 140); +} + +/* * ffs: find first bit set. This is defined the same way as * the libc and compiler builtin ffs routines, therefore * differs in spirit from the above ffz (man ffs). */ - -extern int __inline__ ffs (int x) +extern int inline ffs (int x) { - int r; + int r = 1; if (x == 0) - return 0; - __asm__(" slr %0,%0\n" - " tml %1,0xffff\n" + return 0; + __asm__(" tml %1,0xffff\n" " jnz 0f\n" - " ahi %0,16\n" " srl %1,16\n" + " ahi %0,16\n" "0: tml %1,0x00ff\n" " jnz 1f\n" - " ahi %0,8\n" " srl %1,8\n" + " ahi %0,8\n" "1: tml %1,0x000f\n" " jnz 2f\n" - " ahi %0,4\n" " srl %1,4\n" + " ahi %0,4\n" "2: tml %1,0x0003\n" " jnz 3f\n" - " ahi %0,2\n" " srl %1,2\n" + " ahi %0,2\n" "3: tml %1,0x0001\n" " jnz 4f\n" " ahi %0,1\n" "4:" : "=&d" (r), "+d" (x) : : "cc" ); - return r+1; + return r; +} + +/* + * fls: find last bit set. + */ +extern __inline__ int fls(int x) +{ + int r = 32; + + if (x == 0) + return 0; + __asm__(" tmh %1,0xffff\n" + " jz 0f\n" + " sll %1,16\n" + " ahi %0,-16\n" + "0: tmh %1,0xff00\n" + " jz 1f\n" + " sll %1,8\n" + " ahi %0,-8\n" + "1: tmh %1,0xf000\n" + " jz 2f\n" + " sll %1,4\n" + " ahi %0,-4\n" + "2: tmh %1,0xc000\n" + " jz 3f\n" + " sll %1,2\n" + " ahi %0,-2\n" + "3: tmh %1,0x8000\n" + " jz 4f\n" + " ahi %0,-1\n" + "4:" + : "+d" (r), "+d" (x) : : "cc" ); + return r; } /* @@ -791,7 +832,7 @@ #define ext2_set_bit(nr, addr) test_and_set_bit((nr)^56, addr) #define ext2_clear_bit(nr, addr) test_and_clear_bit((nr)^56, addr) #define ext2_test_bit(nr, addr) test_bit((nr)^56, addr) -static __inline__ unsigned long +static inline unsigned long ext2_find_first_zero_bit(void *vaddr, unsigned long size) { unsigned long res, cmp, count; @@ -833,7 +874,7 @@ return (res < size) ? 
res : size; } -static __inline__ unsigned long +static inline unsigned long ext2_find_next_zero_bit(void *vaddr, unsigned long size, unsigned long offset) { unsigned long *addr = vaddr; diff -Nur linux-2.4.33-imedia/include/asm-sparc64/system.h linux-2.4.33-imedia-patching/include/asm-sparc64/system.h --- linux-2.4.33-imedia/include/asm-sparc64/system.h 2006-01-11 20:29:28.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/asm-sparc64/system.h 2006-01-26 15:19:43.000000000 +0200 @@ -186,7 +186,11 @@ #define flush_user_windows flushw_user #define flush_register_windows flushw_all -#define prepare_to_switch flushw_all + +#define prepare_arch_schedule(prev) task_lock(prev) +#define finish_arch_schedule(prev) task_unlock(prev) +#define prepare_arch_switch(rq) do { spin_unlock(&(rq)->lock); flushw_all(); } while (0) +#define finish_arch_switch(rq) __sti() #ifndef CONFIG_DEBUG_SPINLOCK #define CHECK_LOCKS(PREV) do { } while(0) diff -Nur linux-2.4.33-imedia/include/linux/brlock.h linux-2.4.33-imedia-patching/include/linux/brlock.h --- linux-2.4.33-imedia/include/linux/brlock.h 2006-01-11 19:27:30.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/linux/brlock.h 2006-01-26 15:19:43.000000000 +0200 @@ -125,11 +125,11 @@ } #else -# define br_read_lock(idx) ((void)(idx)) -# define br_read_unlock(idx) ((void)(idx)) -# define br_write_lock(idx) ((void)(idx)) -# define br_write_unlock(idx) ((void)(idx)) -#endif +# define br_read_lock(idx) ({ (void)(idx); preempt_disable(); }) +# define br_read_unlock(idx) ({ (void)(idx); preempt_enable(); }) +# define br_write_lock(idx) ({ (void)(idx); preempt_disable(); }) +# define br_write_unlock(idx) ({ (void)(idx); preempt_enable(); }) +#endif /* CONFIG_SMP */ /* * Now enumerate all of the possible sw/hw IRQ protected diff -Nur linux-2.4.33-imedia/include/linux/dcache.h linux-2.4.33-imedia-patching/include/linux/dcache.h --- linux-2.4.33-imedia/include/linux/dcache.h 2006-01-11 19:27:16.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/linux/dcache.h 2006-01-26 15:19:43.000000000 +0200 @@ -127,31 +127,6 @@ extern spinlock_t dcache_lock; -/** - * d_drop - drop a dentry - * @dentry: dentry to drop - * - * d_drop() unhashes the entry from the parent - * dentry hashes, so that it won't be found through - * a VFS lookup any more. Note that this is different - * from deleting the dentry - d_delete will try to - * mark the dentry negative if possible, giving a - * successful _negative_ lookup, while d_drop will - * just make the cache lookup fail. - * - * d_drop() is used mainly for stuff that wants - * to invalidate a dentry for some reason (NFS - * timeouts or autofs deletes). - */ - -static __inline__ void d_drop(struct dentry * dentry) -{ - spin_lock(&dcache_lock); - list_del(&dentry->d_hash); - INIT_LIST_HEAD(&dentry->d_hash); - spin_unlock(&dcache_lock); -} - static __inline__ int dname_external(struct dentry *d) { return d->d_name.name != d->d_iname; @@ -276,3 +251,34 @@ #endif /* __KERNEL__ */ #endif /* __LINUX_DCACHE_H */ + +#if !defined(__LINUX_DCACHE_H_INLINES) && defined(_TASK_STRUCT_DEFINED) +#define __LINUX_DCACHE_H_INLINES + +#ifdef __KERNEL__ +/** + * d_drop - drop a dentry + * @dentry: dentry to drop + * + * d_drop() unhashes the entry from the parent + * dentry hashes, so that it won't be found through + * a VFS lookup any more. Note that this is different + * from deleting the dentry - d_delete will try to + * mark the dentry negative if possible, giving a + * successful _negative_ lookup, while d_drop will + * just make the cache lookup fail.
+ * + * d_drop() is used mainly for stuff that wants + * to invalidate a dentry for some reason (NFS + * timeouts or autofs deletes). + */ + +static __inline__ void d_drop(struct dentry * dentry) +{ + spin_lock(&dcache_lock); + list_del(&dentry->d_hash); + INIT_LIST_HEAD(&dentry->d_hash); + spin_unlock(&dcache_lock); +} +#endif +#endif diff -Nur linux-2.4.33-imedia/include/linux/fs_struct.h linux-2.4.33-imedia-patching/include/linux/fs_struct.h --- linux-2.4.33-imedia/include/linux/fs_struct.h 2001-07-14 01:10:44.000000000 +0300 +++ linux-2.4.33-imedia-patching/include/linux/fs_struct.h 2006-01-26 15:19:43.000000000 +0200 @@ -20,6 +20,15 @@ extern void exit_fs(struct task_struct *); extern void set_fs_altroot(void); +struct fs_struct *copy_fs_struct(struct fs_struct *old); +void put_fs_struct(struct fs_struct *fs); + +#endif +#endif + +#if !defined(_LINUX_FS_STRUCT_H_INLINES) && defined(_TASK_STRUCT_DEFINED) +#define _LINUX_FS_STRUCT_H_INLINES +#ifdef __KERNEL__ /* * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values. * It can block. Requires the big lock held. @@ -65,9 +74,5 @@ mntput(old_pwdmnt); } } - -struct fs_struct *copy_fs_struct(struct fs_struct *old); -void put_fs_struct(struct fs_struct *fs); - #endif #endif diff -Nur linux-2.4.33-imedia/include/linux/highmem.h linux-2.4.33-imedia-patching/include/linux/highmem.h --- linux-2.4.33-imedia/include/linux/highmem.h 2006-01-11 19:27:18.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/linux/highmem.h 2006-01-26 15:19:43.000000000 +0200 @@ -33,18 +33,8 @@ { unsigned long addr; - __save_flags(*flags); + local_irq_save(*flags); - /* - * could be low - */ - if (!PageHighMem(bh->b_page)) - return bh->b_data; - - /* - * it's a highmem page - */ - __cli(); addr = (unsigned long) kmap_atomic(bh->b_page, KM_BH_IRQ); if (addr & ~PAGE_MASK) @@ -58,7 +48,7 @@ unsigned long ptr = (unsigned long) buffer & PAGE_MASK; kunmap_atomic((void *) ptr, KM_BH_IRQ); - __restore_flags(*flags); + local_irq_restore(*flags); } #else /* CONFIG_HIGHMEM */ diff -Nur linux-2.4.33-imedia/include/linux/kernel_stat.h linux-2.4.33-imedia-patching/include/linux/kernel_stat.h --- linux-2.4.33-imedia/include/linux/kernel_stat.h 2006-01-11 19:27:17.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/linux/kernel_stat.h 2006-01-26 15:19:43.000000000 +0200 @@ -31,9 +31,10 @@ #elif !defined(CONFIG_ARCH_S390) unsigned int irqs[NR_CPUS][NR_IRQS]; #endif - unsigned int context_swtch; }; +extern unsigned long nr_context_switches(void); + extern struct kernel_stat kstat; extern unsigned long nr_context_switches(void); diff -Nur linux-2.4.33-imedia/include/linux/list.h linux-2.4.33-imedia-patching/include/linux/list.h --- linux-2.4.33-imedia/include/linux/list.h 2006-01-11 19:27:16.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/linux/list.h 2006-01-26 15:19:43.000000000 +0200 @@ -19,6 +19,8 @@ struct list_head *next, *prev; }; +typedef struct list_head list_t; + #define LIST_HEAD_INIT(name) { &(name), &(name) } #define LIST_HEAD(name) \ diff -Nur linux-2.4.33-imedia/include/linux/low-latency.h linux-2.4.33-imedia-patching/include/linux/low-latency.h --- linux-2.4.33-imedia/include/linux/low-latency.h 1970-01-01 02:00:00.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/linux/low-latency.h 2006-01-26 15:19:43.000000000 +0200 @@ -0,0 +1,109 @@ +/* + * include/linux/low-latency.h + * + * Andrew Morton + */ + +#ifndef LOW_LATENCY_H_INCLUDED +#define LOW_LATENCY_H_INCLUDED + +#if defined(CONFIG_LOLAT) +#define LOWLATENCY_NEEDED 1 +#else 
+#define LOWLATENCY_NEEDED 0 +#endif + +#if LOWLATENCY_NEEDED + +#include /* For ____cacheline_aligned */ + +#ifdef CONFIG_LOLAT_SYSCTL +extern struct low_latency_enable_struct { + int yep; +} ____cacheline_aligned __enable_lowlatency; +#define enable_lowlatency __enable_lowlatency.yep + +#else +#define enable_lowlatency 1 +#endif + +/* + * Set this non-zero to generate low-latency instrumentation + */ +#define LOWLATENCY_DEBUG 0 + +/* + * Set this non-zero for robustness testing + */ +#define LOWLATENCY_ALWAYS_SCHEDULE 0 + +#if LOWLATENCY_DEBUG + +#if LOWLATENCY_ALWAYS_SCHEDULE +#define conditional_schedule_needed() ((enable_lowlatency == 2) || (enable_lowlatency && current->need_resched)) +#else +#define conditional_schedule_needed() (enable_lowlatency && current->need_resched) +#endif + +struct lolat_stats_t { + unsigned long count; + int visited; + const char *file; + int line; + struct lolat_stats_t *next; +}; + +void set_running_and_schedule(struct lolat_stats_t *stats); + +#define unconditional_schedule() \ + do { \ + static struct lolat_stats_t stats = { \ + file: __FILE__, \ + line: __LINE__, \ + }; \ + set_running_and_schedule(&stats); \ + } while (0) + +extern void show_lolat_stats(void); + +#else /* LOWLATENCY_DEBUG */ + +#if LOWLATENCY_ALWAYS_SCHEDULE +#define conditional_schedule_needed() 1 +#else +#define conditional_schedule_needed() (current->need_resched) +#endif + +void set_running_and_schedule(void); +#define unconditional_schedule() set_running_and_schedule() + +#endif /* LOWLATENCY_DEBUG */ + +#define conditional_schedule() \ + do { \ + if (conditional_schedule_needed()) \ + unconditional_schedule(); \ + } while (0) + +#define DEFINE_RESCHED_COUNT int resched_count = 0 +#define TEST_RESCHED_COUNT(n) (enable_lowlatency && (++resched_count > (n))) +#define RESET_RESCHED_COUNT() resched_count = 0 +extern int ll_copy_to_user(void *to_user, const void *from, unsigned long len); +extern int ll_copy_from_user(void *to, const void *from_user, unsigned long len); + +#else /* LOWLATENCY_NEEDED */ + +#define conditional_schedule_needed() 0 +#define conditional_schedule() +#define unconditional_schedule() + +#define DEFINE_RESCHED_COUNT +#define TEST_RESCHED_COUNT(n) 0 +#define RESET_RESCHED_COUNT() +#define ll_copy_to_user(to_user, from, len) copy_to_user((to_user), (from), (len)) +#define ll_copy_from_user(to, from_user, len) copy_from_user((to), (from_user), (len)) + +#endif /* LOWLATENCY_NEEDED */ + +#endif /* LOW_LATENCY_H_INCLUDED */ + diff -Nur linux-2.4.33-imedia/include/linux/mm.h linux-2.4.33-imedia-patching/include/linux/mm.h --- linux-2.4.33-imedia/include/linux/mm.h 2006-01-11 19:27:17.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/linux/mm.h 2006-01-26 15:19:43.000000000 +0200 @@ -124,6 +124,8 @@ */ extern pgprot_t protection_map[16]; +/* Actions for zap_page_range() */ +#define ZPR_COND_RESCHED 1 /* Do a conditional_schedule() occasionally */ /* * These are the virtual MM functions - opening of an area, closing and @@ -487,7 +489,7 @@ extern void shmem_lock(struct file * file, int lock); extern int shmem_zero_setup(struct vm_area_struct *); -extern void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size); +extern void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, int actions); extern int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); extern int remap_page_range(unsigned long from, unsigned long to, unsigned long size, pgprot_t prot); extern int 
zeromap_page_range(unsigned long from, unsigned long size, pgprot_t prot); diff -Nur linux-2.4.33-imedia/include/linux/reiserfs_fs.h linux-2.4.33-imedia-patching/include/linux/reiserfs_fs.h --- linux-2.4.33-imedia/include/linux/reiserfs_fs.h 2006-01-11 20:35:37.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/linux/reiserfs_fs.h 2006-01-26 15:19:43.000000000 +0200 @@ -1329,8 +1329,8 @@ #define fs_generation(s) ((s)->u.reiserfs_sb.s_generation_counter) #define get_generation(s) atomic_read (&fs_generation(s)) #define FILESYSTEM_CHANGED_TB(tb) (get_generation((tb)->tb_sb) != (tb)->fs_gen) -#define fs_changed(gen,s) (gen != get_generation (s)) - +#define __fs_changed(gen,s) (gen != get_generation (s)) +#define fs_changed(gen,s) ({conditional_schedule(); __fs_changed(gen,s);}) /***************************************************************************/ /* FIXATE NODES */ diff -Nur linux-2.4.33-imedia/include/linux/sched.h linux-2.4.33-imedia-patching/include/linux/sched.h --- linux-2.4.33-imedia/include/linux/sched.h 2006-01-11 19:27:17.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/linux/sched.h 2006-01-26 15:19:43.000000000 +0200 @@ -26,6 +26,7 @@ #include #include #include +#include struct exec_domain; @@ -73,16 +74,16 @@ #define CT_TO_SECS(x) ((x) / HZ) #define CT_TO_USECS(x) (((x) % HZ) * 1000000/HZ) -extern int nr_running, nr_threads; +extern int nr_threads; extern int last_pid; +extern unsigned long nr_running(void); +extern unsigned long nr_uninterruptible(void); #include #include #include #include -#ifdef __KERNEL__ #include -#endif #include @@ -91,6 +92,11 @@ #define TASK_UNINTERRUPTIBLE 2 #define TASK_ZOMBIE 4 #define TASK_STOPPED 8 +#define PREEMPT_ACTIVE 0x4000000 + +#define task_cpu(p) ((p)->cpu) +#define set_task_cpu(p, c) do { (p)->cpu = (c); } while (0) +#define cpu_online(i) ((i) < smp_num_cpus) #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) @@ -105,15 +111,13 @@ /* * Scheduling policies */ -#define SCHED_OTHER 0 +#define SCHED_NORMAL 0 #define SCHED_FIFO 1 #define SCHED_RR 2 +#define SCHED_BATCH 3 -/* - * This is an additional bit set when we want to - * yield the CPU for one re-schedule.. - */ -#define SCHED_YIELD 0x10 +/* compatibility */ +#define SCHED_OTHER SCHED_NORMAL struct sched_param { int sched_priority; @@ -132,21 +136,30 @@ * a separate lock). 
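+ *
+ * (With the O(1) scheduler there is no global runqueue_lock any
+ * more: each CPU's runqueue carries its own rq->lock - see the
+ * runqueue locking comments in kernel/sched.c.)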
*/ extern rwlock_t tasklist_lock; -extern spinlock_t runqueue_lock; extern spinlock_t mmlist_lock; +typedef struct task_struct task_t; + extern void sched_init(void); -extern void init_idle(void); +extern void init_idle(task_t *idle, int cpu); extern void show_state(void); extern void cpu_init (void); extern void trap_init(void); extern void update_process_times(int user); -extern void update_one_process(struct task_struct *p, unsigned long user, +extern void update_one_process(task_t *p, unsigned long user, unsigned long system, int cpu); +extern void scheduler_tick(int user_tick, int system); +extern void migration_init(void); +extern unsigned long cache_decay_ticks; +extern int set_user(uid_t new_ruid, int dumpclear); #define MAX_SCHEDULE_TIMEOUT LONG_MAX extern signed long FASTCALL(schedule_timeout(signed long timeout)); asmlinkage void schedule(void); +asmlinkage void schedule_userspace(void); +#ifdef CONFIG_PREEMPT +asmlinkage void preempt_schedule(void); +#endif extern int schedule_task(struct tq_struct *task); extern void flush_scheduled_tasks(void); @@ -160,6 +173,36 @@ #endif /* + * Priority of a process goes from 0..MAX_PRIO-1, valid RT + * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL tasks are + * in the range MAX_RT_PRIO..MAX_PRIO-1. Priority values + * are inverted: lower p->prio value means higher priority. + * + * The MAX_RT_USER_PRIO value allows the actual maximum + * RT priority to be separate from the value exported to + * user-space. This allows kernel threads to set their + * priority to a value higher than any user task. Note: + * MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO. + */ + +#define MAX_USER_RT_PRIO 100 +#define MAX_RT_PRIO MAX_USER_RT_PRIO + +#define MAX_PRIO (MAX_RT_PRIO + 40) + +/* + * The maximum RT priority is configurable. If the resulting + * bitmap is 160-bits , we can use a hand-coded routine which + * is optimal. Otherwise, we fall back on a generic routine for + * finding the first set bit from an arbitrarily-sized bitmap. + */ +#if MAX_PRIO < 160 && MAX_PRIO > 127 +#define sched_find_first_bit(map) _sched_find_first_bit(map) +#else +#define sched_find_first_bit(map) find_first_bit(map, MAX_PRIO) +#endif + +/* * The default fd array needs to be at least BITS_PER_LONG, * as this is the granularity returned by copy_fdset(). */ @@ -280,12 +323,14 @@ extern struct user_struct root_user; #define INIT_USER (&root_user) +typedef struct prio_array prio_array_t; + struct task_struct { /* * offsets of these are hardcoded elsewhere - touch with care */ volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ - unsigned long flags; /* per process flags, defined below */ + int preempt_count; /* 0 => preemptable, <0 => BUG */ int sigpending; mm_segment_t addr_limit; /* thread address space: 0-0xBFFFFFFF for user-thead @@ -297,36 +342,30 @@ int lock_depth; /* Lock depth */ -/* - * offset 32 begins here on 32-bit platforms. We keep - * all fields in a single cacheline that are needed for - * the goodness() loop in schedule(). - */ - long counter; - long nice; - unsigned long policy; - struct mm_struct *mm; - int processor; - /* - * cpus_runnable is ~0 if the process is not running on any - * CPU. It's (1 << cpu) if it's running on a CPU. This mask - * is updated under the runqueue lock. - * - * To determine whether a process might run on a CPU, this - * mask is AND-ed with cpus_allowed. - */ - unsigned long cpus_runnable, cpus_allowed; /* - * (only the 'next' pointer fits into the cacheline, but - * that's just fine.) 
+ * offset 32 begins here on 32-bit platforms. */ - struct list_head run_list; - unsigned long sleep_time; + unsigned int cpu; + int prio, static_prio; + list_t run_list; + prio_array_t *array; + + unsigned long sleep_avg; + long interactive_credit; + unsigned long timestamp; + int activated; - struct task_struct *next_task, *prev_task; - struct mm_struct *active_mm; + unsigned long policy; + unsigned long cpus_allowed; + unsigned int time_slice, first_time_slice; + + task_t *next_task, *prev_task; + + struct mm_struct *mm, *active_mm; struct list_head local_pages; + unsigned int allocation_order, nr_local_pages; + unsigned long flags; /* task state */ struct linux_binfmt *binfmt; @@ -348,12 +387,12 @@ * older sibling, respectively. (p->father can be replaced with * p->p_pptr->pid) */ - struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr; + task_t *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr; struct list_head thread_group; /* PID hash table linkage. */ - struct task_struct *pidhash_next; - struct task_struct **pidhash_pprev; + task_t *pidhash_next; + task_t **pidhash_pprev; wait_queue_head_t wait_chldexit; /* for wait4() */ struct completion *vfork_done; /* for vfork() */ @@ -433,6 +472,8 @@ #define PF_FREE_PAGES 0x00002000 /* per process page freeing */ #define PF_NOIO 0x00004000 /* avoid generating further I/O */ #define PF_FSTRANS 0x00008000 /* inside a filesystem transaction */ +#define PF_BATCH 0x00080000 /* batch-priority process */ + #define PF_USEDFPU 0x00100000 /* task used FPU this quantum (SMP) */ @@ -454,9 +495,16 @@ */ #define _STK_LIM (8*1024*1024) -#define DEF_COUNTER (10*HZ/100) /* 100 ms time slice */ -#define MAX_COUNTER (20*HZ/100) -#define DEF_NICE (0) +#if CONFIG_SMP +extern void set_cpus_allowed(task_t *p, unsigned long new_mask); +#else +#define set_cpus_allowed(p, new_mask) do { } while (0) +#endif + +extern void set_user_nice(task_t *p, long nice); +extern int task_prio(task_t *p); +extern int task_nice(task_t *p); +extern int idle_cpu(int cpu); extern void yield(void); @@ -477,14 +525,14 @@ addr_limit: KERNEL_DS, \ exec_domain: &default_exec_domain, \ lock_depth: -1, \ - counter: DEF_COUNTER, \ - nice: DEF_NICE, \ - policy: SCHED_OTHER, \ + prio: MAX_PRIO-20, \ + static_prio: MAX_PRIO-20, \ + policy: SCHED_NORMAL, \ + cpus_allowed: -1, \ mm: NULL, \ active_mm: &init_mm, \ - cpus_runnable: ~0UL, \ - cpus_allowed: ~0UL, \ run_list: LIST_HEAD_INIT(tsk.run_list), \ + time_slice: HZ, \ next_task: &tsk, \ prev_task: &tsk, \ p_opptr: &tsk, \ @@ -518,24 +566,24 @@ #endif union task_union { - struct task_struct task; + task_t task; unsigned long stack[INIT_TASK_SIZE/sizeof(long)]; }; extern union task_union init_task_union; extern struct mm_struct init_mm; -extern struct task_struct *init_tasks[NR_CPUS]; +extern task_t *init_tasks[NR_CPUS]; /* PID hashing. (shouldnt this be dynamic?) 
*/ #define PIDHASH_SZ (4096 >> 2) -extern struct task_struct *pidhash[PIDHASH_SZ]; +extern task_t *pidhash[PIDHASH_SZ]; #define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1)) -static inline void hash_pid(struct task_struct *p) +static inline void hash_pid(task_t *p) { - struct task_struct **htable = &pidhash[pid_hashfn(p->pid)]; + task_t **htable = &pidhash[pid_hashfn(p->pid)]; if((p->pidhash_next = *htable) != NULL) (*htable)->pidhash_pprev = &p->pidhash_next; @@ -543,16 +591,16 @@ p->pidhash_pprev = htable; } -static inline void unhash_pid(struct task_struct *p) +static inline void unhash_pid(task_t *p) { if(p->pidhash_next) p->pidhash_next->pidhash_pprev = p->pidhash_pprev; *p->pidhash_pprev = p->pidhash_next; } -static inline struct task_struct *find_task_by_pid(int pid) +static inline task_t *find_task_by_pid(int pid) { - struct task_struct *p, **htable = &pidhash[pid_hashfn(pid)]; + task_t *p, **htable = &pidhash[pid_hashfn(pid)]; for(p = *htable; p && p->pid != pid; p = p->pidhash_next) ; @@ -560,19 +608,6 @@ return p; } -#define task_has_cpu(tsk) ((tsk)->cpus_runnable != ~0UL) - -static inline void task_set_cpu(struct task_struct *tsk, unsigned int cpu) -{ - tsk->processor = cpu; - tsk->cpus_runnable = 1UL << cpu; -} - -static inline void task_release_cpu(struct task_struct *tsk) -{ - tsk->cpus_runnable = ~0UL; -} - /* per-UID process charging. */ extern struct user_struct * alloc_uid(uid_t); extern void free_uid(struct user_struct *); @@ -600,47 +635,51 @@ extern void FASTCALL(interruptible_sleep_on(wait_queue_head_t *q)); extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q, signed long timeout)); -extern int FASTCALL(wake_up_process(struct task_struct * tsk)); +extern int FASTCALL(wake_up_process(task_t * tsk)); +extern void FASTCALL(wake_up_forked_process(task_t * tsk)); +extern void FASTCALL(sched_exit(task_t * p)); #define wake_up(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1) #define wake_up_nr(x, nr) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr) #define wake_up_all(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 0) -#define wake_up_sync(x) __wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1) -#define wake_up_sync_nr(x, nr) __wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr) #define wake_up_interruptible(x) __wake_up((x),TASK_INTERRUPTIBLE, 1) #define wake_up_interruptible_nr(x, nr) __wake_up((x),TASK_INTERRUPTIBLE, nr) #define wake_up_interruptible_all(x) __wake_up((x),TASK_INTERRUPTIBLE, 0) -#define wake_up_interruptible_sync(x) __wake_up_sync((x),TASK_INTERRUPTIBLE, 1) -#define wake_up_interruptible_sync_nr(x, nr) __wake_up_sync((x),TASK_INTERRUPTIBLE, nr) +#ifdef CONFIG_SMP +#define wake_up_interruptible_sync(x) __wake_up_sync((x),TASK_INTERRUPTIBLE, 1) +#else +#define wake_up_interruptible_sync(x) __wake_up((x),TASK_INTERRUPTIBLE, 1) +#endif + asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru); extern int in_group_p(gid_t); extern int in_egroup_p(gid_t); extern void proc_caches_init(void); -extern void flush_signals(struct task_struct *); -extern void flush_signal_handlers(struct task_struct *); +extern void flush_signals(task_t *); +extern void flush_signal_handlers(task_t *); extern void sig_exit(int, int, struct siginfo *); extern int dequeue_signal(sigset_t *, siginfo_t *); extern void block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask); extern void unblock_all_signals(void); -extern int send_sig_info(int, 
struct siginfo *, struct task_struct *); -extern int force_sig_info(int, struct siginfo *, struct task_struct *); +extern int send_sig_info(int, struct siginfo *, task_t *); +extern int force_sig_info(int, struct siginfo *, task_t *); extern int kill_pg_info(int, struct siginfo *, pid_t); extern int kill_sl_info(int, struct siginfo *, pid_t); extern int kill_proc_info(int, struct siginfo *, pid_t); -extern void notify_parent(struct task_struct *, int); -extern void do_notify_parent(struct task_struct *, int); -extern void force_sig(int, struct task_struct *); -extern int send_sig(int, struct task_struct *, int); +extern void notify_parent(task_t *, int); +extern void do_notify_parent(task_t *, int); +extern void force_sig(int, task_t *); +extern int send_sig(int, task_t *, int); extern int kill_pg(pid_t, int, int); extern int kill_sl(pid_t, int, int); extern int kill_proc(pid_t, int, int); extern int do_sigaction(int, const struct k_sigaction *, struct k_sigaction *); extern int do_sigaltstack(const stack_t *, stack_t *, unsigned long); -static inline int signal_pending(struct task_struct *p) +static inline int signal_pending(task_t *p) { return (p->sigpending != 0); } @@ -679,7 +718,7 @@ This is required every time the blocked sigset_t changes. All callers should have t->sigmask_lock. */ -static inline void recalc_sigpending(struct task_struct *t) +static inline void recalc_sigpending(task_t *t) { t->sigpending = has_pending_signals(&t->pending.signal, &t->blocked); } @@ -786,16 +825,17 @@ extern int expand_fdset(struct files_struct *, int nr); extern void free_fdset(fd_set *, int); -extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *); +extern int copy_thread(int, unsigned long, unsigned long, unsigned long, task_t *, struct pt_regs *); extern void flush_thread(void); extern void exit_thread(void); -extern void exit_mm(struct task_struct *); -extern void exit_files(struct task_struct *); -extern void exit_sighand(struct task_struct *); +extern void exit_mm(task_t *); +extern void exit_files(task_t *); +extern void exit_sighand(task_t *); extern void reparent_to_init(void); extern void daemonize(void); +extern task_t *child_reaper; extern int do_execve(char *, char **, char **, struct pt_regs *); extern int do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long); @@ -809,6 +849,9 @@ extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags); +extern void wait_task_inactive(task_t * p); +extern void kick_if_running(task_t * p); + #define __wait_event(wq, condition) \ do { \ wait_queue_t __wait; \ @@ -890,27 +933,12 @@ for (task = next_thread(current) ; task != current ; task = next_thread(task)) #define next_thread(p) \ - list_entry((p)->thread_group.next, struct task_struct, thread_group) + list_entry((p)->thread_group.next, task_t, thread_group) #define thread_group_leader(p) (p->pid == p->tgid) -static inline void del_from_runqueue(struct task_struct * p) -{ - nr_running--; - p->sleep_time = jiffies; - list_del(&p->run_list); - p->run_list.next = NULL; -} - -static inline int task_on_runqueue(struct task_struct *p) -{ - return (p->run_list.next != NULL); -} - -static inline void unhash_process(struct task_struct *p) +static inline void unhash_process(task_t *p) { - if (task_on_runqueue(p)) - out_of_line_bug(); write_lock_irq(&tasklist_lock); nr_threads--; unhash_pid(p); @@ -920,12 +948,12 @@ } /* Protects ->fs, ->files, ->mm, and synchronises with wait4(). 
Nests inside tasklist_lock */ -static inline void task_lock(struct task_struct *p) +static inline void task_lock(task_t *p) { spin_lock(&p->alloc_lock); } -static inline void task_unlock(struct task_struct *p) +static inline void task_unlock(task_t *p) { spin_unlock(&p->alloc_lock); } @@ -961,5 +989,30 @@ __cond_resched(); } +static inline void set_need_resched(void) +{ + current->need_resched = 1; +} + +static inline void clear_need_resched(void) +{ + current->need_resched = 0; +} + +static inline void set_tsk_need_resched(struct task_struct *tsk) +{ + tsk->need_resched = 1; +} + +static inline void clear_tsk_need_resched(struct task_struct *tsk) +{ + tsk->need_resched = 0; +} + +#define _TASK_STRUCT_DEFINED +#include +#include +#include + #endif /* __KERNEL__ */ #endif diff -Nur linux-2.4.33-imedia/include/linux/smp.h linux-2.4.33-imedia-patching/include/linux/smp.h --- linux-2.4.33-imedia/include/linux/smp.h 2006-01-11 19:27:17.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/linux/smp.h 2006-01-26 15:19:43.000000000 +0200 @@ -81,11 +81,21 @@ #define smp_processor_id() 0 #define hard_smp_processor_id() 0 #define smp_threads_ready 1 +#ifndef CONFIG_PREEMPT #define kernel_lock() +#endif #define cpu_logical_map(cpu) 0 #define cpu_number_map(cpu) 0 #define smp_call_function(func,info,retry,wait) ({ 0; }) #define cpu_online_map 1 +static inline void smp_send_reschedule(int cpu) { } +static inline void smp_send_reschedule_all(void) { } #endif + +/* + * Common definitions: + */ +#define cpu() smp_processor_id() + #endif diff -Nur linux-2.4.33-imedia/include/linux/smp_lock.h linux-2.4.33-imedia-patching/include/linux/smp_lock.h --- linux-2.4.33-imedia/include/linux/smp_lock.h 2006-01-11 19:27:17.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/linux/smp_lock.h 2006-01-26 15:19:43.000000000 +0200 @@ -3,7 +3,7 @@ #include -#ifndef CONFIG_SMP +#if !defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT) #define lock_kernel() do { } while(0) #define unlock_kernel() do { } while(0) diff -Nur linux-2.4.33-imedia/include/linux/spinlock.h linux-2.4.33-imedia-patching/include/linux/spinlock.h --- linux-2.4.33-imedia/include/linux/spinlock.h 2006-01-11 19:27:16.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/linux/spinlock.h 2006-01-26 15:19:43.000000000 +0200 @@ -2,6 +2,7 @@ #define __LINUX_SPINLOCK_H #include +#include #include @@ -64,8 +65,10 @@ #if (DEBUG_SPINLOCKS < 1) +#ifndef CONFIG_PREEMPT #define atomic_dec_and_lock(atomic,lock) atomic_dec_and_test(atomic) #define ATOMIC_DEC_AND_LOCK +#endif /* * Your basic spinlocks, allowing only a single CPU anywhere @@ -82,11 +85,11 @@ #endif #define spin_lock_init(lock) do { } while(0) -#define spin_lock(lock) (void)(lock) /* Not "unused variable". */ +#define _raw_spin_lock(lock) (void)(lock) /* Not "unused variable". */ #define spin_is_locked(lock) (0) -#define spin_trylock(lock) ({1; }) +#define _raw_spin_trylock(lock) ({1; }) #define spin_unlock_wait(lock) do { } while(0) -#define spin_unlock(lock) do { } while(0) +#define _raw_spin_unlock(lock) do { } while(0) #elif (DEBUG_SPINLOCKS < 2) @@ -146,13 +149,78 @@ #endif #define rwlock_init(lock) do { } while(0) -#define read_lock(lock) (void)(lock) /* Not "unused variable". */ -#define read_unlock(lock) (void)(lock) /* Not "unused variable". */ -#define write_lock(lock) (void)(lock) /* Not "unused variable". */ -#define write_unlock(lock) do { } while(0) +#define _raw_read_lock(lock) (void)(lock) /* Not "unused variable". 
*/ +#define _raw_read_unlock(lock) (void)(lock) /* Not "unused variable". */ +#define _raw_write_lock(lock) (void)(lock) /* Not "unused variable". */ +#define _raw_write_unlock(lock) do { } while(0) #endif /* !SMP */ +#ifdef CONFIG_PREEMPT + +#define preempt_get_count() (current->preempt_count) +#define preempt_is_disabled() (preempt_get_count() != 0) + +#define preempt_disable() \ +do { \ + ++current->preempt_count; \ + barrier(); \ +} while (0) + +#define preempt_enable_no_resched() \ +do { \ + --current->preempt_count; \ + barrier(); \ +} while (0) + +#define preempt_enable() \ +do { \ + --current->preempt_count; \ + barrier(); \ + if (unlikely(current->preempt_count < current->need_resched)) \ + preempt_schedule(); \ +} while (0) + +#define spin_lock(lock) \ +do { \ + preempt_disable(); \ + _raw_spin_lock(lock); \ +} while(0) + +#define spin_trylock(lock) ({preempt_disable(); _raw_spin_trylock(lock) ? \ + 1 : ({preempt_enable(); 0;});}) +#define spin_unlock(lock) \ +do { \ + _raw_spin_unlock(lock); \ + preempt_enable(); \ +} while (0) + +#define read_lock(lock) ({preempt_disable(); _raw_read_lock(lock);}) +#define read_unlock(lock) ({_raw_read_unlock(lock); preempt_enable();}) +#define write_lock(lock) ({preempt_disable(); _raw_write_lock(lock);}) +#define write_unlock(lock) ({_raw_write_unlock(lock); preempt_enable();}) +#define write_trylock(lock) ({preempt_disable();_raw_write_trylock(lock) ? \ + 1 : ({preempt_enable(); 0;});}) + +#else + +#define preempt_get_count() (0) +#define preempt_is_disabled() (1) +#define preempt_disable() do { } while (0) +#define preempt_enable_no_resched() do {} while(0) +#define preempt_enable() do { } while (0) + +#define spin_lock(lock) _raw_spin_lock(lock) +#define spin_trylock(lock) _raw_spin_trylock(lock) +#define spin_unlock(lock) _raw_spin_unlock(lock) + +#define read_lock(lock) _raw_read_lock(lock) +#define read_unlock(lock) _raw_read_unlock(lock) +#define write_lock(lock) _raw_write_lock(lock) +#define write_unlock(lock) _raw_write_unlock(lock) +#define write_trylock(lock) _raw_write_trylock(lock) +#endif + /* "lock on reference count zero" */ #ifndef ATOMIC_DEC_AND_LOCK #include diff -Nur linux-2.4.33-imedia/include/linux/sysctl.h linux-2.4.33-imedia-patching/include/linux/sysctl.h --- linux-2.4.33-imedia/include/linux/sysctl.h 2006-01-11 20:29:28.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/linux/sysctl.h 2006-01-26 15:19:43.000000000 +0200 @@ -125,6 +125,7 @@ KERN_CORE_USES_PID=52, /* int: use core or core.%pid */ KERN_TAINTED=53, /* int: various kernel tainted flags */ KERN_CADPID=54, /* int: PID of the process to notify on CAD */ + KERN_LOWLATENCY=55, /* int: enable low latency scheduling */ KERN_CORE_PATTERN=56, /* string: pattern for core-files */ KERN_PPC_L3CR=57, /* l3cr register on PPC */ KERN_EXCEPTION_TRACE=58, /* boolean: exception trace */ diff -Nur linux-2.4.33-imedia/include/linux/tqueue.h linux-2.4.33-imedia-patching/include/linux/tqueue.h --- linux-2.4.33-imedia/include/linux/tqueue.h 2006-01-11 19:27:17.000000000 +0200 +++ linux-2.4.33-imedia-patching/include/linux/tqueue.h 2006-01-26 15:19:43.000000000 +0200 @@ -94,6 +94,22 @@ extern spinlock_t tqueue_lock; /* + * Call all "bottom halfs" on a given list. 
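+ * (run_task_queue() below is the cheap inline path: it only calls
+ * into __run_task_queue() when TQ_ACTIVE() reports the list is
+ * non-empty.)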
+ */ + +extern void __run_task_queue(task_queue *list); + +static inline void run_task_queue(task_queue *list) +{ + if (TQ_ACTIVE(*list)) + __run_task_queue(list); +} + +#endif /* _LINUX_TQUEUE_H */ + +#if !defined(_LINUX_TQUEUE_H_INLINES) && defined(_TASK_STRUCT_DEFINED) +#define _LINUX_TQUEUE_H_INLINES +/* * Queue a task on a tq. Return non-zero if it was successfully * added. */ @@ -109,17 +125,4 @@ } return ret; } - -/* - * Call all "bottom halfs" on a given list. - */ - -extern void __run_task_queue(task_queue *list); - -static inline void run_task_queue(task_queue *list) -{ - if (TQ_ACTIVE(*list)) - __run_task_queue(list); -} - -#endif /* _LINUX_TQUEUE_H */ +#endif diff -Nur linux-2.4.33-imedia/init/main.c linux-2.4.33-imedia-patching/init/main.c --- linux-2.4.33-imedia/init/main.c 2004-11-17 13:54:22.000000000 +0200 +++ linux-2.4.33-imedia-patching/init/main.c 2006-01-26 15:19:43.000000000 +0200 @@ -298,8 +298,6 @@ extern void setup_arch(char **); extern void cpu_idle(void); -unsigned long wait_init_idle; - #ifndef CONFIG_SMP #ifdef CONFIG_X86_LOCAL_APIC @@ -313,29 +311,19 @@ #else - /* Called by boot processor to activate the rest. */ static void __init smp_init(void) { /* Get other processors into their bootup holding patterns. */ smp_boot_cpus(); - wait_init_idle = cpu_online_map; - clear_bit(current->processor, &wait_init_idle); /* Don't wait on me! */ smp_threads_ready=1; smp_commence(); - - /* Wait for the other cpus to set up their idle processes */ - printk("Waiting on wait_init_idle (map = 0x%lx)\n", wait_init_idle); - while (wait_init_idle) { - cpu_relax(); - barrier(); - } - printk("All processors have done init_idle\n"); } #endif + /* * We need to finalize in a non-__init function or else race conditions * between the root thread and the init thread may cause start_kernel to @@ -347,7 +335,6 @@ { kernel_thread(init, NULL, CLONE_FS | CLONE_FILES | CLONE_SIGNAL); unlock_kernel(); - current->need_resched = 1; cpu_idle(); } @@ -434,6 +421,7 @@ acpi_early_init(); /* before LAPIC and SMP init */ printk("POSIX conformance testing by UNIFIX\n"); + init_idle(current, smp_processor_id()); /* * We count on the initial thread going ok * Like idlers init is an unlocked kernel thread, which will @@ -471,6 +459,10 @@ */ static void __init do_basic_setup(void) { + /* Start the per-CPU migration threads */ +#if CONFIG_SMP + migration_init(); +#endif /* * Tell the world that we're going to be the grim diff -Nur linux-2.4.33-imedia/kernel/capability.c linux-2.4.33-imedia-patching/kernel/capability.c --- linux-2.4.33-imedia/kernel/capability.c 2000-06-24 07:06:37.000000000 +0300 +++ linux-2.4.33-imedia-patching/kernel/capability.c 2006-01-26 15:19:43.000000000 +0200 @@ -8,6 +8,8 @@ #include #include +unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */ + kernel_cap_t cap_bset = CAP_INIT_EFF_SET; /* Note: never hold tasklist_lock while spinning for this one */ diff -Nur linux-2.4.33-imedia/kernel/exit.c linux-2.4.33-imedia-patching/kernel/exit.c --- linux-2.4.33-imedia/kernel/exit.c 2002-11-29 01:53:15.000000000 +0200 +++ linux-2.4.33-imedia-patching/kernel/exit.c 2006-01-26 15:19:43.000000000 +0200 @@ -28,49 +28,22 @@ static void release_task(struct task_struct * p) { - if (p != current) { + if (p == current) + BUG(); #ifdef CONFIG_SMP - /* - * Wait to make sure the process isn't on the - * runqueue (active on some other CPU still) - */ - for (;;) { - task_lock(p); - if (!task_has_cpu(p)) - break; - task_unlock(p); - do { - cpu_relax(); - barrier(); - } while 
(task_has_cpu(p)); - } - task_unlock(p); + wait_task_inactive(p); #endif - atomic_dec(&p->user->processes); - free_uid(p->user); - unhash_process(p); - - release_thread(p); - current->cmin_flt += p->min_flt + p->cmin_flt; - current->cmaj_flt += p->maj_flt + p->cmaj_flt; - current->cnswap += p->nswap + p->cnswap; - /* - * Potentially available timeslices are retrieved - * here - this way the parent does not get penalized - * for creating too many processes. - * - * (this cannot be used to artificially 'generate' - * timeslices, because any timeslice recovered here - * was given away by the parent in the first place.) - */ - current->counter += p->counter; - if (current->counter >= MAX_COUNTER) - current->counter = MAX_COUNTER; - p->pid = 0; - free_task_struct(p); - } else { - printk("task releasing itself\n"); - } + atomic_dec(&p->user->processes); + free_uid(p->user); + unhash_process(p); + + release_thread(p); + current->cmin_flt += p->min_flt + p->cmin_flt; + current->cmaj_flt += p->maj_flt + p->cmaj_flt; + current->cnswap += p->nswap + p->cnswap; + sched_exit(p); + p->pid = 0; + free_task_struct(p); } /* @@ -150,6 +123,79 @@ return retval; } +/** + * reparent_to_init() - Reparent the calling kernel thread to the init task. + * + * If a kernel thread is launched as a result of a system call, or if + * it ever exits, it should generally reparent itself to init so that + * it is correctly cleaned up on exit. + * + * The various task state such as scheduling policy and priority may have + * been inherited from a user process, so we reset them to sane values here. + * + * NOTE that reparent_to_init() gives the caller full capabilities. + */ +void reparent_to_init(void) +{ + write_lock_irq(&tasklist_lock); + + /* Reparent to init */ + REMOVE_LINKS(current); + current->p_pptr = child_reaper; + current->p_opptr = child_reaper; + SET_LINKS(current); + + /* Set the exit signal to SIGCHLD so we signal init on exit */ + current->exit_signal = SIGCHLD; + + current->ptrace = 0; + if ((current->policy == SCHED_OTHER) && (task_nice(current) < 0)) + set_user_nice(current, 0); + /* cpus_allowed? */ + /* rt_priority? */ + /* signals? */ + current->cap_effective = CAP_INIT_EFF_SET; + current->cap_inheritable = CAP_INIT_INH_SET; + current->cap_permitted = CAP_FULL_SET; + current->keep_capabilities = 0; + memcpy(current->rlim, init_task.rlim, sizeof(*(current->rlim))); + current->user = INIT_USER; + + write_unlock_irq(&tasklist_lock); +} + +/* + * Put all the gunge required to become a kernel thread without + * attached user resources in one place where it belongs. + */ + +void daemonize(void) +{ + struct fs_struct *fs; + + + /* + * If we were started as result of loading a module, close all of the + * user space pages. We don't need them, and if we didn't close them + * they would be locked into memory. + */ + exit_mm(current); + + current->session = 1; + current->pgrp = 1; + current->tty = NULL; + + /* Become as one with the init task */ + + exit_fs(current); /* current->fs->count--; */ + fs = init_task.fs; + current->fs = fs; + atomic_inc(&fs->count); + exit_files(current); + current->files = init_task.files; + atomic_inc(¤t->files->count); + } + /* * When we die, we re-parent all our children. 
* Try to give them to another thread in our thread @@ -196,6 +242,7 @@ } i++; set >>= 1; + conditional_schedule(); /* sys_exit, many files open */ } } } @@ -282,7 +329,9 @@ current->mm = NULL; /* active_mm is still 'mm' */ atomic_inc(&mm->mm_count); + preempt_disable(); enter_lazy_tlb(mm, current, smp_processor_id()); + preempt_enable(); return mm; } @@ -313,8 +362,8 @@ /* more a memory barrier than a real lock */ task_lock(tsk); tsk->mm = NULL; - task_unlock(tsk); enter_lazy_tlb(mm, current, smp_processor_id()); + task_unlock(tsk); mmput(mm); } } @@ -435,6 +484,13 @@ tsk->flags |= PF_EXITING; del_timer_sync(&tsk->real_timer); +#if 0 + if (unlikely(preempt_get_count())) + printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", + current->comm, current->pid, + preempt_get_count()); +#endif + fake_volatile: #ifdef CONFIG_BSD_PROCESS_ACCT acct_process(code); diff -Nur linux-2.4.33-imedia/kernel/fork.c linux-2.4.33-imedia-patching/kernel/fork.c --- linux-2.4.33-imedia/kernel/fork.c 2005-01-19 16:10:13.000000000 +0200 +++ linux-2.4.33-imedia-patching/kernel/fork.c 2006-01-26 15:19:43.000000000 +0200 @@ -31,7 +31,6 @@ /* The idle threads do not count.. */ int nr_threads; -int nr_running; int max_threads; unsigned long total_forks; /* Handle normal Linux uptimes. */ @@ -39,6 +38,8 @@ struct task_struct *pidhash[PIDHASH_SZ]; +rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */ + void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait) { unsigned long flags; @@ -700,6 +701,13 @@ if (p->binfmt && p->binfmt->module) __MOD_INC_USE_COUNT(p->binfmt->module); +#ifdef CONFIG_PREEMPT + /* + * Continue with preemption disabled as part of the context + * switch, so start with preempt_count set to 1. + */ + p->preempt_count = 1; +#endif p->did_exec = 0; p->swappable = 0; p->state = TASK_UNINTERRUPTIBLE; @@ -709,8 +717,7 @@ if (p->pid == 0 && current->pid != 0) goto bad_fork_cleanup; - p->run_list.next = NULL; - p->run_list.prev = NULL; + INIT_LIST_HEAD(&p->run_list); p->p_cptr = NULL; init_waitqueue_head(&p->wait_chldexit); @@ -736,14 +743,15 @@ #ifdef CONFIG_SMP { int i; - p->cpus_runnable = ~0UL; - p->processor = current->processor; + /* ?? should we just memset this ?? */ for(i = 0; i < smp_num_cpus; i++) - p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0; + p->per_cpu_utime[cpu_logical_map(i)] = + p->per_cpu_stime[cpu_logical_map(i)] = 0; spin_lock_init(&p->sigmask_lock); } #endif + p->array = NULL; p->lock_depth = -1; /* -1 = no lock */ p->start_time = jiffies; @@ -778,15 +786,27 @@ p->pdeath_signal = 0; /* - * "share" dynamic priority between parent and child, thus the - * total amount of dynamic priorities in the system doesn't change, - * more scheduling fairness. This is only important in the first - * timeslice, on the long run the scheduling behaviour is unchanged. - */ - p->counter = (current->counter + 1) >> 1; - current->counter >>= 1; - if (!current->counter) - current->need_resched = 1; + * Share the timeslice between parent and child, thus the + * total amount of pending timeslices in the system doesnt change, + * resulting in more scheduling fairness. + */ + __cli(); + if (!current->time_slice) + BUG(); + p->time_slice = (current->time_slice + 1) >> 1; + p->first_time_slice = 1; + current->time_slice >>= 1; + if (!current->time_slice) { + /* + * This case is rare, it happens when the parent has only + * a single jiffy left from its timeslice. Taking the + * runqueue lock is not a problem. 
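+ *
+ * (In the common case just above, a parent with e.g. 11 ticks
+ * left hands (11 + 1) >> 1 = 6 to the child and keeps 11 >> 1 = 5;
+ * the sum never grows, so fork() cannot be used to manufacture
+ * extra CPU time. Here the single remaining tick is charged via
+ * scheduler_tick() so the normal end-of-slice accounting runs
+ * instead of leaving the parent with a zero-length slice.)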
+ */ + current->time_slice = 1; + scheduler_tick(0,0); + } + p->timestamp = jiffies; + __sti(); /* * Ok, add it to the run-queues and make it @@ -823,10 +843,16 @@ if (p->ptrace & PT_PTRACED) send_sig(SIGSTOP, p, 1); - wake_up_process(p); /* do this last */ + wake_up_forked_process(p); /* do this last */ ++total_forks; if (clone_flags & CLONE_VFORK) wait_for_completion(&vfork); + else + /* + * Let the child process run first, to avoid most of the + * COW overhead when the child exec()s afterwards. + */ + current->need_resched = 1; fork_out: return retval; diff -Nur linux-2.4.33-imedia/kernel/ksyms.c linux-2.4.33-imedia-patching/kernel/ksyms.c --- linux-2.4.33-imedia/kernel/ksyms.c 2004-02-18 15:36:32.000000000 +0200 +++ linux-2.4.33-imedia-patching/kernel/ksyms.c 2006-01-26 15:19:43.000000000 +0200 @@ -461,30 +461,44 @@ /* process management */ EXPORT_SYMBOL(complete_and_exit); EXPORT_SYMBOL(__wake_up); -EXPORT_SYMBOL(__wake_up_sync); EXPORT_SYMBOL(wake_up_process); EXPORT_SYMBOL(sleep_on); EXPORT_SYMBOL(sleep_on_timeout); EXPORT_SYMBOL(interruptible_sleep_on); EXPORT_SYMBOL(interruptible_sleep_on_timeout); EXPORT_SYMBOL(schedule); +#ifdef CONFIG_PREEMPT +EXPORT_SYMBOL(preempt_schedule); +#endif EXPORT_SYMBOL(schedule_timeout); #if CONFIG_SMP EXPORT_SYMBOL(set_cpus_allowed); +EXPORT_SYMBOL(__wake_up_sync); #endif EXPORT_SYMBOL(yield); +EXPORT_SYMBOL(set_user_nice); +EXPORT_SYMBOL(task_nice); +EXPORT_SYMBOL_GPL(idle_cpu); EXPORT_SYMBOL(__cond_resched); EXPORT_SYMBOL(jiffies); EXPORT_SYMBOL(xtime); EXPORT_SYMBOL(do_gettimeofday); EXPORT_SYMBOL(do_settimeofday); +#if LOWLATENCY_NEEDED +EXPORT_SYMBOL(set_running_and_schedule); +#ifdef CONFIG_LOLAT_SYSCTL +EXPORT_SYMBOL(__enable_lowlatency); +#endif +#endif + #if !defined(__ia64__) EXPORT_SYMBOL(loops_per_jiffy); #endif EXPORT_SYMBOL(kstat); EXPORT_SYMBOL(nr_running); +EXPORT_SYMBOL(nr_context_switches); /* misc */ EXPORT_SYMBOL(panic); diff -Nur linux-2.4.33-imedia/kernel/module.c linux-2.4.33-imedia-patching/kernel/module.c --- linux-2.4.33-imedia/kernel/module.c 2003-08-25 14:44:44.000000000 +0300 +++ linux-2.4.33-imedia-patching/kernel/module.c 2006-01-26 15:19:43.000000000 +0200 @@ -1187,6 +1187,11 @@ return ERR_PTR(-ENOMEM); lock_kernel(); for (v = module_list, n = *pos; v; n -= v->nsyms, v = v->next) { +#if 0 + /* We can't actually do this, because we'd create a + * race against module unload. Need a semaphore. */ + conditional_schedule(); +#endif if (n < v->nsyms) { p->mod = v; p->index = n; diff -Nur linux-2.4.33-imedia/kernel/ptrace.c linux-2.4.33-imedia-patching/kernel/ptrace.c --- linux-2.4.33-imedia/kernel/ptrace.c 2006-01-11 20:29:28.000000000 +0200 +++ linux-2.4.33-imedia-patching/kernel/ptrace.c 2006-01-26 15:19:43.000000000 +0200 @@ -32,20 +32,7 @@ if (child->state != TASK_STOPPED) return -ESRCH; #ifdef CONFIG_SMP - /* Make sure the child gets off its CPU.. 
*/ - for (;;) { - task_lock(child); - if (!task_has_cpu(child)) - break; - task_unlock(child); - do { - if (child->state != TASK_STOPPED) - return -ESRCH; - barrier(); - cpu_relax(); - } while (task_has_cpu(child)); - } - task_unlock(child); + wait_task_inactive(child); #endif } diff -Nur linux-2.4.33-imedia/kernel/sched.c linux-2.4.33-imedia-patching/kernel/sched.c --- linux-2.4.33-imedia/kernel/sched.c 2004-11-17 13:54:22.000000000 +0200 +++ linux-2.4.33-imedia-patching/kernel/sched.c 2006-01-26 15:19:43.000000000 +0200 @@ -1,341 +1,558 @@ /* - * linux/kernel/sched.c + * kernel/sched.c * * Kernel scheduler and related syscalls * - * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 1991-2002 Linus Torvalds * * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and * make semaphores SMP safe * 1998-11-19 Implemented schedule_timeout() and related stuff * by Andrea Arcangeli * 1998-12-28 Implemented better SMP scheduling by Ingo Molnar + * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: + * hybrid priority-list and round-robin design with + * an array-switch method of distributing timeslices + * and per-CPU runqueues. Additional code by Davide + * Libenzi, Robert Love, and Rusty Russell. */ -/* - * 'sched.c' is the main kernel file. It contains scheduling primitives - * (sleep_on, wakeup, schedule etc) as well as a number of simple system - * call functions (type getpid()), which just extract a field from - * current-task - */ - -#include #include +#include #include +#include +#include #include -#include +#include #include -#include #include -#include -#include - -#include -#include - -extern void timer_bh(void); -extern void tqueue_bh(void); -extern void immediate_bh(void); +#include /* - * scheduler variables - */ + * Convert user-nice values [ -20 ... 0 ... 19 ] + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], + * and back. + */ +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) +#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) -unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */ +/* + * 'User priority' is the nice value converted to something we + * can work with better when scaling various scheduler parameters, + * it's a [ 0 ... 39 ] range. + */ +#define USER_PRIO(p) ((p)-MAX_RT_PRIO) +#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) +#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) +#define AVG_TIMESLICE (MAX_TIMESLICE * MAX_PRIO/MAX_USER_PRIO) -extern void mem_use(void); +/* + * These are the 'tuning knobs' of the scheduler: + * + * Minimum timeslice is 10 msecs, default timeslice is 100 msecs, + * maximum timeslice is 200 msecs. Timeslices get refilled after + * they expire. + */ +#define MIN_TIMESLICE ((10 * HZ) / 1000 ?: 1) +#define MAX_TIMESLICE ((200 * HZ) / 1000 ?: 1) +#define ON_RUNQUEUE_WEIGHT 30 +#define CHILD_PENALTY 95 +#define PARENT_PENALTY 100 +#define EXIT_WEIGHT 3 +#define PRIO_BONUS_RATIO 25 +#define INTERACTIVE_DELTA 2 +#define CREDIT_LIMIT 100 +#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100) +#define MAX_SLEEP_AVG (AVG_TIMESLICE * MAX_BONUS) +#define STARVATION_LIMIT (MAX_SLEEP_AVG) /* - * Scheduling quanta. + * If a task is 'interactive' then we reinsert it in the active + * array after it has expired its current timeslice. (it will not + * continue to run immediately, it will still roundrobin with + * other interactive tasks.) + * + * This part scales the interactivity limit depending on niceness. * - * NOTE! 
The unix "nice" value influences how long a process - * gets. The nice value ranges from -20 to +19, where a -20 - * is a "high-priority" task, and a "+10" is a low-priority - * task. + * We scale it linearly, offset by the INTERACTIVE_DELTA delta. + * Here are a few examples of different nice levels: * - * We want the time-slice to be around 50ms or so, so this - * calculation depends on the value of HZ. + * TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0] + * TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0] + * TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0] + * TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0] + * TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0] + * + * (the X axis represents the possible -5 ... 0 ... +5 dynamic + * priority range a task can explore, a value of '1' means the + * task is rated interactive.) + * + * Ie. nice +19 tasks can never get 'interactive' enough to be + * reinserted into the active array. And only heavily CPU-hog nice -20 + * tasks will be expired. Default nice 0 tasks are somewhere between, + * it takes some effort for them to get interactive, but it's not + * too hard. */ -#if HZ < 200 -#define TICK_SCALE(x) ((x) >> 2) -#elif HZ < 400 -#define TICK_SCALE(x) ((x) >> 1) -#elif HZ < 800 -#define TICK_SCALE(x) (x) -#elif HZ < 1600 -#define TICK_SCALE(x) ((x) << 1) -#else -#define TICK_SCALE(x) ((x) << 2) -#endif -#define NICE_TO_TICKS(nice) (TICK_SCALE(20-(nice))+1) +#define SCALE(v1,v1_max,v2_max) \ + (v1) * (v2_max) / (v1_max) +#define DELTA(p) \ + (SCALE(TASK_NICE(p), 40, MAX_USER_PRIO*PRIO_BONUS_RATIO/100) + \ + INTERACTIVE_DELTA) -/* - * Init task must be ok at boot for the ix86 as we will check its signals - * via the SMP irq return path. - */ - -struct task_struct * init_tasks[NR_CPUS] = {&init_task, }; +#define TASK_INTERACTIVE(p) \ + ((p)->prio <= (p)->static_prio - DELTA(p)) -/* - * The tasklist_lock protects the linked list of processes. - * - * The runqueue_lock locks the parts that actually access - * and change the run-queues, and have to be interrupt-safe. - * - * If both locks are to be concurrently held, the runqueue_lock - * nests inside the tasklist_lock. - * - * task->alloc_lock nests inside tasklist_lock. - */ -spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED; /* inner */ -rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */ +#define CURRENT_BONUS(p) \ + ((p)->sleep_avg * MAX_BONUS / MAX_SLEEP_AVG) + +#define TIMESLICE_GRANULARITY(p) (MIN_TIMESLICE * \ + (1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1))) -static LIST_HEAD(runqueue_head); +#define JUST_INTERACTIVE_SLEEP(p) \ + (MAX_SLEEP_AVG * (MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1) + +#define HIGH_CREDIT(p) \ + ((p)->interactive_credit > CREDIT_LIMIT) + +#define LOW_CREDIT(p) \ + ((p)->interactive_credit < -CREDIT_LIMIT) /* - * We align per-CPU scheduling data on cacheline boundaries, - * to prevent cacheline ping-pong. + * BASE_TIMESLICE scales user-nice values [ -20 ... 19 ] + * to time slice values. + * + * The higher a process's priority, the bigger timeslices + * it gets during one round of execution. But even the lowest + * priority process gets MIN_TIMESLICE worth of execution time. + * + * task_timeslice() is the interface that is used by the scheduler. + * SCHED_BATCH tasks get longer timeslices to make use of better + * caching. They are inherently noninteractive and they are + * immediately preempted by SCHED_NORMAL tasks so there is no + * downside in using shorter timeslices. 
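+ *
+ * A rough worked example, assuming HZ=100 (one tick = 10 msecs):
+ *
+ *	nice   0, static_prio 120: 20 * (140-120)/40 = 10 ticks (100 msecs)
+ *	nice -20, static_prio 100: 20 * (140-100)/40 = 20 ticks (200 msecs)
+ *	nice +19, static_prio 139: rounds down to 0, clamped to MIN_TIMESLICE
+ *
+ * and a SCHED_BATCH task at nice 0 gets 20x that, i.e. roughly two
+ * seconds per slice.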
*/ -static union { - struct schedule_data { - struct task_struct * curr; - cycles_t last_schedule; - } schedule_data; - char __pad [SMP_CACHE_BYTES]; -} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}}; - -#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr -#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule -struct kernel_stat kstat; -extern struct task_struct *child_reaper; +#define BASE_TIMESLICE(p) \ + (MAX_TIMESLICE * (MAX_PRIO-(p)->static_prio)/MAX_USER_PRIO) -#ifdef CONFIG_SMP +static inline unsigned int task_timeslice(task_t *p) +{ + unsigned int time_slice = BASE_TIMESLICE(p); -#define idle_task(cpu) (init_tasks[cpu_number_map(cpu)]) -#define can_schedule(p,cpu) \ - ((p)->cpus_runnable & (p)->cpus_allowed & (1UL << cpu)) + if (time_slice < MIN_TIMESLICE) + time_slice = MIN_TIMESLICE; + if (p->policy == SCHED_BATCH) + return time_slice * 20; + else + return time_slice; +} -#else +/* + * These are the runqueue data structures: + */ -#define idle_task(cpu) (&init_task) -#define can_schedule(p,cpu) (1) +#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) -#endif +typedef struct runqueue runqueue_t; -void scheduling_functions_start_here(void) { } +struct prio_array { + int nr_active; + unsigned long bitmap[BITMAP_SIZE]; + list_t queue[MAX_PRIO]; +}; /* - * This is the function that decides how desirable a process is.. - * You can weigh different processes against each other depending - * on what CPU they've run on lately etc to try to handle cache - * and TLB miss penalties. + * This is the main, per-CPU runqueue data structure. * - * Return values: - * -1000: never select this - * 0: out of time, recalculate counters (but it might still be - * selected) - * +ve: "goodness" value (the larger, the better) - * +1000: realtime process, select this. + * Locking rule: those places that want to lock multiple runqueues + * (such as the load balancing or the process migration code), lock + * acquire operations must be ordered by ascending &runqueue. */ +struct runqueue { + spinlock_t lock; + unsigned long nr_running, nr_switches, expired_timestamp, + nr_uninterruptible; + task_t *curr, *idle; + prio_array_t *active, *expired, arrays[2]; + int best_expired_prio, prev_nr_running[NR_CPUS]; -static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm) -{ - int weight; + task_t *migration_thread; + list_t migration_queue; /* - * select the current process after every other - * runnable process, but before the idle thread. - * Also, dont trigger a counter recalculation. + * The batch queue is a secondary ready-queue: */ - weight = -1; - if (p->policy & SCHED_YIELD) - goto out; + unsigned long nr_batch; + list_t batch_queue; /* - * Non-RT process - normal case first. + * Per-CPU idle CPU time tracking: + * + * - idle_ticks_left counts back from HZ to 0. + * - idle_count is the number of idle ticks in the last second. + * - once it reaches 0, a new idle_avg is calculated. */ - if (p->policy == SCHED_OTHER) { - /* - * Give the process a first-approximation goodness value - * according to the number of clock-ticks it has left. - * - * Don't do any other calculations if the time slice is - * over.. - */ - weight = p->counter; - if (!weight) - goto out; - -#ifdef CONFIG_SMP - /* Give a largish advantage to the same processor... 
*/ - /* (this is equivalent to penalizing other processors) */ - if (p->processor == this_cpu) - weight += PROC_CHANGE_PENALTY; + #define IDLE_TICKS (HZ) + + unsigned int idle_ticks_left, idle_count, idle_avg; + +} ____cacheline_aligned; + +static struct runqueue runqueues[NR_CPUS] __cacheline_aligned; + +#define cpu_rq(cpu) (runqueues + (cpu)) +#define this_rq() cpu_rq(smp_processor_id()) +#define task_rq(p) cpu_rq(task_cpu(p)) +#define cpu_curr(cpu) (cpu_rq(cpu)->curr) +#define rt_task(p) ((p)->prio < MAX_RT_PRIO) + +/* + * Default context-switch locking: + */ +#ifndef prepare_arch_switch +# define prepare_arch_switch(rq, next) do { } while(0) +# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) +# define task_running(rq, p) ((rq)->curr == (p)) #endif - /* .. and a slight advantage to the current MM */ - if (p->mm == this_mm || !p->mm) - weight += 1; - weight += 20 - p->nice; - goto out; +/* + * task_rq_lock - lock the runqueue a given task resides on and disable + * interrupts. Note the ordering: we can safely lookup the task_rq without + * explicitly disabling preemption. + */ +static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) +{ + struct runqueue *rq; + +repeat_lock_task: + preempt_disable(); + local_irq_save(*flags); + rq = task_rq(p); + spin_lock(&rq->lock); + if (unlikely(rq != task_rq(p))) { + spin_unlock_irqrestore(&rq->lock, *flags); + preempt_enable(); + goto repeat_lock_task; } + return rq; +} - /* - * Realtime process, select the first one on the - * runqueue (taking priorities within processes - * into account). - */ - weight = 1000 + p->rt_priority; -out: - return weight; +static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) +{ + spin_unlock_irqrestore(&rq->lock, *flags); + preempt_enable(); } /* - * the 'goodness value' of replacing a process on a given CPU. - * positive value means 'replace', zero or negative means 'dont'. + * rq_lock - lock a given runqueue and disable interrupts. */ -static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu) +static inline runqueue_t *this_rq_lock(void) { - return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm); + runqueue_t *rq; + + local_irq_disable(); + rq = this_rq(); + spin_lock(&rq->lock); + + return rq; +} + +static inline void rq_unlock(runqueue_t *rq) +{ + spin_unlock(&rq->lock); + local_irq_enable(); } /* - * This is ugly, but reschedule_idle() is very timing-critical. - * We are called with the runqueue spinlock held and we must - * not claim the tasklist_lock. + * Adding/removing a task to/from a priority array: */ -static FASTCALL(void reschedule_idle(struct task_struct * p)); +static inline void dequeue_task(struct task_struct *p, prio_array_t *array) +{ + array->nr_active--; + list_del(&p->run_list); + if (list_empty(array->queue + p->prio)) + __clear_bit(p->prio, array->bitmap); +} -static void fastcall reschedule_idle(struct task_struct * p) +static inline void enqueue_task(struct task_struct *p, prio_array_t *array) { -#ifdef CONFIG_SMP - int this_cpu = smp_processor_id(); - struct task_struct *tsk, *target_tsk; - int cpu, best_cpu, i, max_prio; - cycles_t oldest_idle; + list_add_tail(&p->run_list, array->queue + p->prio); + __set_bit(p->prio, array->bitmap); + array->nr_active++; + p->array = array; +} - /* - * shortcut if the woken up task's last CPU is - * idle now. 
- */ - best_cpu = p->processor; - if (can_schedule(p, best_cpu)) { - tsk = idle_task(best_cpu); - if (cpu_curr(best_cpu) == tsk) { - int need_resched; -send_now_idle: - /* - * If need_resched == -1 then we can skip sending - * the IPI altogether, tsk->need_resched is - * actively watched by the idle thread. - */ - need_resched = tsk->need_resched; - tsk->need_resched = 1; - if ((best_cpu != this_cpu) && !need_resched) - smp_send_reschedule(best_cpu); - return; - } - } +static inline int effective_prio(task_t *p) +{ + int bonus, prio; + + if (rt_task(p)) + return p->prio; /* - * We know that the preferred CPU has a cache-affine current - * process, lets try to find a new idle CPU for the woken-up - * process. Select the least recently active idle CPU. (that - * one will have the least active cache context.) Also find - * the executing process which has the least priority. - */ - oldest_idle = (cycles_t) -1; - target_tsk = NULL; - max_prio = 0; - - for (i = 0; i < smp_num_cpus; i++) { - cpu = cpu_logical_map(i); - if (!can_schedule(p, cpu)) - continue; - tsk = cpu_curr(cpu); + * Here we scale the actual sleep average [0 .... MAX_SLEEP_AVG] + * into the -5 ... 0 ... +5 bonus/penalty range. + * + * We use 25% of the full 0...39 priority range so that: + * + * 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs. + * 2) nice -20 CPU hogs do not get preempted by nice 0 tasks. + * + * Both properties are important to certain workloads. + */ + bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; + + prio = p->static_prio - bonus; + if (prio < MAX_RT_PRIO) + prio = MAX_RT_PRIO; + if (prio > MAX_PRIO-1) + prio = MAX_PRIO-1; + return prio; +} + +/* + * __activate_task - move a task to the runqueue. + */ +static inline void __activate_task(task_t *p, runqueue_t *rq) +{ + enqueue_task(p, rq->active); + rq->nr_running++; +} + +static void recalc_task_prio(task_t *p, unsigned long now) +{ + unsigned long sleep_time = now - p->timestamp; + + if (sleep_time > MAX_SLEEP_AVG) + sleep_time = MAX_SLEEP_AVG; + + if (likely(sleep_time > 0)) { /* - * We use the first available idle CPU. This creates - * a priority list between idle CPUs, but this is not - * a problem. + * User tasks that sleep a long time are categorised as + * idle and will get just interactive status to stay active & + * prevent them suddenly becoming cpu hogs and starving + * other processes. */ - if (tsk == idle_task(cpu)) { -#if defined(__i386__) && defined(CONFIG_SMP) - /* - * Check if two siblings are idle in the same - * physical package. Use them if found. + if (p->mm && p->activated != -1 && + sleep_time > JUST_INTERACTIVE_SLEEP(p)){ + p->sleep_avg = MAX_SLEEP_AVG - + AVG_TIMESLICE; + if (!HIGH_CREDIT(p)) + p->interactive_credit++; + } else { + /* + * The lower the sleep avg a task has the more + * rapidly it will rise with sleep time. + */ + sleep_time *= (MAX_BONUS - CURRENT_BONUS(p)) ? : 1; + + /* + * Tasks with low interactive_credit are limited to + * one timeslice worth of sleep avg bonus. 
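+ * ("Low credit" means interactive_credit has dropped below
+ * -CREDIT_LIMIT; capping the bonus at one timeslice keeps a CPU
+ * hog that sleeps only occasionally from buying itself
+ * interactive status with a single long nap.)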
+ */ + if (LOW_CREDIT(p) && + sleep_time > task_timeslice(p)) + sleep_time = task_timeslice(p); + + /* + * Non high_credit tasks waking from uninterruptible + * sleep are limited in their sleep_avg rise as they + * are likely to be cpu hogs waiting on I/O */ - if (smp_num_siblings == 2) { - if (cpu_curr(cpu_sibling_map[cpu]) == - idle_task(cpu_sibling_map[cpu])) { - oldest_idle = last_schedule(cpu); - target_tsk = tsk; - break; - } - - } -#endif - if (last_schedule(cpu) < oldest_idle) { - oldest_idle = last_schedule(cpu); - target_tsk = tsk; + if (p->activated == -1 && !HIGH_CREDIT(p) && p->mm){ + if (p->sleep_avg >= JUST_INTERACTIVE_SLEEP(p)) + sleep_time = 0; + else if (p->sleep_avg + sleep_time >= + JUST_INTERACTIVE_SLEEP(p)){ + p->sleep_avg = + JUST_INTERACTIVE_SLEEP(p); + sleep_time = 0; + } } - } else { - if (oldest_idle == (cycles_t)-1) { - int prio = preemption_goodness(tsk, p, cpu); - if (prio > max_prio) { - max_prio = prio; - target_tsk = tsk; - } + /* + * This code gives a bonus to interactive tasks. + * + * The boost works by updating the 'average sleep time' + * value here, based on ->timestamp. The more time a task + * spends sleeping, the higher the average gets - and the + * higher the priority boost gets as well. + */ + p->sleep_avg += sleep_time; + + if (p->sleep_avg > MAX_SLEEP_AVG){ + p->sleep_avg = MAX_SLEEP_AVG; + if (!HIGH_CREDIT(p)) + p->interactive_credit++; } } } - tsk = target_tsk; - if (tsk) { - if (oldest_idle != (cycles_t)-1) { - best_cpu = tsk->processor; - goto send_now_idle; + + p->prio = effective_prio(p); +} + +static inline void activate_task(task_t *p, runqueue_t *rq) +{ + recalc_task_prio(p, jiffies); + + /* + * This checks to make sure it's not an uninterruptible task + * that is now waking up. + */ + if (!p->activated){ + /* + * Tasks which were woken up by interrupts (ie. hw events) + * are most likely of interactive nature. So we give them + * the credit of extending their sleep time to the period + * of time they spend on the runqueue, waiting for execution + * on a CPU, first time around: + */ + if (in_interrupt()) + p->activated = 2; + else + /* + * Normal first-time wakeups get a credit too for on-runqueue + * time, but it will be weighted down: + */ + p->activated = 1; } - tsk->need_resched = 1; - if (tsk->processor != this_cpu) - smp_send_reschedule(tsk->processor); - } - return; - + p->timestamp = jiffies; + + __activate_task(p, rq); +} -#else /* UP */ - int this_cpu = smp_processor_id(); +static inline void activate_batch_task(task_t *p, runqueue_t *rq) +{ + rq->nr_batch--; + list_del(&p->run_list); + activate_task(p, rq); + p->flags &= ~PF_BATCH; +} + +static inline void deactivate_task(struct task_struct *p, runqueue_t *rq) +{ + rq->nr_running--; + if (p->state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible++; + dequeue_task(p, p->array); + p->array = NULL; +} + +static inline void deactivate_batch_task(task_t *p, runqueue_t *rq) +{ + prio_array_t *array = p->array; + + deactivate_task(p, rq); + rq->nr_batch++; + if (array == rq->expired) + list_add_tail(&p->run_list, &rq->batch_queue); + else + list_add(&p->run_list, &rq->batch_queue); + /* + * Via this bit we can tell whether a task is in the batchqueue, + * this information is not available in any other cheap way. 
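+ * (kick_if_running() below relies on this bit: when a signal is
+ * delivered to a SCHED_BATCH task that is parked here, it calls
+ * activate_batch_task() to put the task back on the runqueue so
+ * the signal can be handled.)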
+ */ + p->flags |= PF_BATCH; +} + +static inline void resched_task(task_t *p) +{ +#if CONFIG_SMP + int need_resched; struct task_struct *tsk; + int i,cpu; - tsk = cpu_curr(this_cpu); - if (preemption_goodness(tsk, p, this_cpu) > 0) - tsk->need_resched = 1; + preempt_disable(); + need_resched = p->need_resched; + wmb(); + set_tsk_need_resched(p); + if (!need_resched && (p->cpu != smp_processor_id())) + smp_send_reschedule(p->cpu); + preempt_enable(); +#if LOWLATENCY_NEEDED + if (enable_lowlatency && (p->policy != SCHED_OTHER)) { + struct task_struct *t; + for (i = 0; i < smp_num_cpus; i++) { + cpu = cpu_logical_map(i); + t = cpu_curr(cpu); + if (t != tsk) + t->need_resched = 1; + } + } +#endif +#else + set_tsk_need_resched(p); #endif } +#ifdef CONFIG_SMP + /* - * Careful! - * - * This has to add the process to the _end_ of the - * run-queue, not the beginning. The goodness value will - * determine whether this process will run next. This is - * important to get SCHED_FIFO and SCHED_RR right, where - * a process that is either pre-empted or its time slice - * has expired, should be moved to the tail of the run - * queue for its priority - Bhavesh Davda + * Wait for a process to unschedule. This is used by the exit() and + * ptrace() code. */ -static inline void add_to_runqueue(struct task_struct * p) +void wait_task_inactive(task_t * p) { - list_add_tail(&p->run_list, &runqueue_head); - nr_running++; + unsigned long flags; + runqueue_t *rq; + +repeat: + preempt_disable(); + rq = task_rq(p); + if (unlikely(task_running(rq, p))) { + cpu_relax(); + /* + * enable/disable preemption just to make this + * a preemption point - we are busy-waiting + * anyway. + */ + preempt_enable(); + goto repeat; + } + rq = task_rq_lock(p, &flags); + if (unlikely(task_running(rq, p))) { + task_rq_unlock(rq, &flags); + preempt_enable(); + goto repeat; + } + task_rq_unlock(rq, &flags); + preempt_enable(); } +#endif -static inline void move_last_runqueue(struct task_struct * p) +/* + * Kick the remote CPU if the task is running currently, + * this code is used by the signal code to signal tasks + * which are in user-mode as quickly as possible. + * + * (Note that we do this lockless - if the task does anything + * while the message is in flight then it will notice the + * sigpending condition anyway.) + * + * this code also activates batch processes if they get a signal. + */ +void kick_if_running(task_t * p) { - list_del(&p->run_list); - list_add_tail(&p->run_list, &runqueue_head); + if (task_running(task_rq(p), p) && (p->cpu != smp_processor_id())) + resched_task(p); + /* + * If batch processes get signals but are not running currently + * then give them a chance to handle the signal. (the kernel + * side signal handling code will run for sure, the userspace + * part depends on system load and might be delayed indefinitely.) + */ + if (p->policy == SCHED_BATCH) { + unsigned long flags; + runqueue_t *rq; + + rq = task_rq_lock(p, &flags); + if (p->flags & PF_BATCH) + activate_batch_task(p, rq); + task_rq_unlock(rq, &flags); + } } /* @@ -345,416 +562,902 @@ * progress), and as such you're allowed to do the simpler * "current->state = TASK_RUNNING" to mark yourself runnable * without the overhead of this. + * + * returns failure only if the task is already active. 
*/ -static inline int try_to_wake_up(struct task_struct * p, int synchronous) +static int try_to_wake_up(task_t * p, int sync) { unsigned long flags; int success = 0; + long old_state; + runqueue_t *rq; - /* - * We want the common case fall through straight, thus the goto. - */ - spin_lock_irqsave(&runqueue_lock, flags); +repeat_lock_task: + rq = task_rq_lock(p, &flags); + old_state = p->state; + if (!p->array) { + /* + * Fast-migrate the task if it's not running or runnable + * currently. Do not violate hard affinity. + */ + if (unlikely(sync && !task_running(rq, p) && + (task_cpu(p) != smp_processor_id()) && + (p->cpus_allowed & (1UL << smp_processor_id())))) { + + set_task_cpu(p, smp_processor_id()); + + task_rq_unlock(rq, &flags); + goto repeat_lock_task; + } + if (old_state == TASK_UNINTERRUPTIBLE){ + /* + * Limit tasks waking from UNINTERRUPTIBLE SLEEP to + * just interactive state to prevent cpu hogs getting + * interactive state during disk i/o + */ + if (p->mm) + p->activated = -1; + rq->nr_uninterruptible--; + } + activate_task(p, rq); + + if (p->prio < rq->curr->prio || rq->curr->policy == SCHED_BATCH) + resched_task(rq->curr); + success = 1; + } p->state = TASK_RUNNING; - if (task_on_runqueue(p)) - goto out; - add_to_runqueue(p); - if (!synchronous || !(p->cpus_allowed & (1UL << smp_processor_id()))) - reschedule_idle(p); - success = 1; -out: - spin_unlock_irqrestore(&runqueue_lock, flags); + task_rq_unlock(rq, &flags); + return success; } -inline int fastcall wake_up_process(struct task_struct * p) +int fastcall wake_up_process(task_t * p) { return try_to_wake_up(p, 0); } -static void process_timeout(unsigned long __data) +void fastcall wake_up_forked_process(task_t * p) { - struct task_struct * p = (struct task_struct *) __data; + runqueue_t *rq; + preempt_disable(); + rq = this_rq_lock(); + + p->state = TASK_RUNNING; + /* + * We decrease the sleep average of forking parents + * and children as well, to keep max-interactive tasks + * from forking tasks that are max-interactive. + */ + current->sleep_avg = CURRENT_BONUS(current) * + PARENT_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS; + p->sleep_avg = CURRENT_BONUS(p) * + CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS; + + p->interactive_credit = 0; - wake_up_process(p); + p->prio = effective_prio(p); + + set_task_cpu(p, smp_processor_id()); + activate_task(p, rq); + + rq_unlock(rq); + preempt_enable(); } -/** - * schedule_timeout - sleep until timeout - * @timeout: timeout value in jiffies - * - * Make the current task sleep until @timeout jiffies have - * elapsed. The routine will return immediately unless - * the current task state has been set (see set_current_state()). - * - * You can set the task state as follows - - * - * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to - * pass before the routine returns. The routine will return 0 - * - * %TASK_INTERRUPTIBLE - the routine may return early if a signal is - * delivered to the current task. In this case the remaining time - * in jiffies will be returned, or 0 if the timer expired in time - * - * The current task state is guaranteed to be TASK_RUNNING when this - * routine returns. - * - * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule - * the CPU away without a bound on the timeout. In this case the return - * value will be %MAX_SCHEDULE_TIMEOUT. +/* + * Potentially available exiting-child timeslices are + * retrieved here - this way the parent does not get + * penalized for creating too many processes. 
* - * In all cases the return value is guaranteed to be non-negative. + * (this cannot be used to 'generate' timeslices + * artificially, because any timeslice recovered here + * was given away by the parent in the first place.) */ -signed long fastcall schedule_timeout(signed long timeout) +void fastcall sched_exit(task_t * p) { - struct timer_list timer; - unsigned long expire; + __cli(); + if (p->first_time_slice) { + current->time_slice += p->time_slice; + if (unlikely(current->time_slice > MAX_TIMESLICE)) + current->time_slice = MAX_TIMESLICE; + } + __sti(); + /* + * If the child was a (relative-) CPU hog then decrease + * the sleep_avg of the parent as well. + */ + if (p->sleep_avg < current->sleep_avg) + current->sleep_avg = (current->sleep_avg * EXIT_WEIGHT + + p->sleep_avg) / (EXIT_WEIGHT + 1); +} - switch (timeout) - { - case MAX_SCHEDULE_TIMEOUT: - /* - * These two special cases are useful to be comfortable - * in the caller. Nothing more. We could take - * MAX_SCHEDULE_TIMEOUT from one of the negative value - * but I' d like to return a valid offset (>=0) to allow - * the caller to do everything it want with the retval. - */ - schedule(); - goto out; - default: - /* - * Another bit of PARANOID. Note that the retval will be - * 0 since no piece of kernel is supposed to do a check - * for a negative retval of schedule_timeout() (since it - * should never happens anyway). You just have the printk() - * that will tell you if something is gone wrong and where. - */ - if (timeout < 0) - { - printk(KERN_ERR "schedule_timeout: wrong timeout " - "value %lx from %p\n", timeout, - __builtin_return_address(0)); - current->state = TASK_RUNNING; - goto out; - } +#if CONFIG_SMP || CONFIG_PREEMPT +asmlinkage void schedule_tail(task_t *prev) +{ + finish_arch_switch(this_rq(), prev); +} +#endif + +static inline task_t * context_switch(task_t *prev, task_t *next) +{ + struct mm_struct *mm = next->mm; + struct mm_struct *oldmm = prev->active_mm; + + if (unlikely(!mm)) { + next->active_mm = oldmm; + atomic_inc(&oldmm->mm_count); + enter_lazy_tlb(oldmm, next, smp_processor_id()); + } else + switch_mm(oldmm, mm, next, smp_processor_id()); + + if (unlikely(!prev->mm)) { + prev->active_mm = NULL; + mmdrop(oldmm); } - expire = timeout + jiffies; + /* Here we just switch the register state and the stack. */ + switch_to(prev, next, prev); - init_timer(&timer); - timer.expires = expire; - timer.data = (unsigned long) current; - timer.function = process_timeout; + return prev; +} - add_timer(&timer); - schedule(); - del_timer_sync(&timer); +unsigned long nr_running(void) +{ + unsigned long i, sum = 0; + + for (i = 0; i < NR_CPUS; i++) + sum += cpu_rq(i)->nr_running; + + return sum; +} + +unsigned long nr_uninterruptible(void) +{ + unsigned long i, sum = 0; - timeout = expire - jiffies; + for (i = 0; i < NR_CPUS; i++) + sum += cpu_rq(i)->nr_uninterruptible; - out: - return timeout < 0 ? 0 : timeout; + return sum; +} + +unsigned long nr_context_switches(void) +{ + unsigned long i, sum = 0; + + for (i = 0; i < NR_CPUS; i++) + sum += cpu_rq(i)->nr_switches; + + return sum; } /* - * schedule_tail() is getting called from the fork return path. This - * cleans up all remaining scheduler things, without impacting the - * common case. + * double_rq_lock - safely lock two runqueues + * + * Note this does not disable interrupts like task_rq_lock, + * you need to do so manually before calling. 
*/ -static inline void __schedule_tail(struct task_struct *prev) +static inline void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) { -#ifdef CONFIG_SMP - int policy; + if (rq1 == rq2) + spin_lock(&rq1->lock); + else { + if (rq1 < rq2) { + spin_lock(&rq1->lock); + spin_lock(&rq2->lock); + } else { + spin_lock(&rq2->lock); + spin_lock(&rq1->lock); + } + } +} - /* - * prev->policy can be written from here only before `prev' - * can be scheduled (before setting prev->cpus_runnable to ~0UL). - * Of course it must also be read before allowing prev - * to be rescheduled, but since the write depends on the read - * to complete, wmb() is enough. (the spin_lock() acquired - * before setting cpus_runnable is not enough because the spin_lock() - * common code semantics allows code outside the critical section - * to enter inside the critical section) +/* + * double_rq_unlock - safely unlock two runqueues + * + * Note this does not restore interrupts like task_rq_unlock, + * you need to do so manually after calling. + */ +static inline void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) +{ + spin_unlock(&rq1->lock); + if (rq1 != rq2) + spin_unlock(&rq2->lock); +} + +#if CONFIG_SMP + +/* + * Batch balancing is much simpler since it's optimized for + * CPU-intensive workloads. The balancer keeps the batch-queue + * length as close to the average length as possible. It weighs + * runqueue distribution based on the idle percentage of each + * CPU - this way statistical fairness of timeslice distribution + * is preserved, in the long run it does not matter whether a + * batch task is queued to a busy CPU or not, it will get an + * equal share of all available idle CPU time. + * + * CPU-intensive SCHED_BATCH processes have a much lower + * fork()/exit() flux, so the balancing does not have to + * be prepared for high statistical fluctuations in queue + * length. + */ +static inline void load_balance_batch(runqueue_t *this_rq, int this_cpu) +{ + int i, nr_batch, nr_idle, goal, rq_goal; + runqueue_t *rq_src; + + /* + * First the unlocked fastpath - is there any work to do? + * fastpath #1: no batch processes in the system, + * fastpath #2: no idle time available in the system. + * fastpath #3: no balancing needed for the current queue. */ - policy = prev->policy; - prev->policy = policy & ~SCHED_YIELD; - wmb(); + nr_batch = 0; + nr_idle = 0; - /* - * fast path falls through. We have to clear cpus_runnable before - * checking prev->state to avoid a wakeup race. Protect against - * the task exiting early. - */ - task_lock(prev); - task_release_cpu(prev); - mb(); - if (prev->state == TASK_RUNNING) - goto needs_resched; + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_online(i)) + continue; -out_unlock: - task_unlock(prev); /* Synchronise here with release_task() if prev is TASK_ZOMBIE */ - return; + nr_batch += cpu_rq(i)->nr_batch; + nr_idle += cpu_rq(i)->idle_avg; + } + if (!nr_batch || !nr_idle) + return; + + goal = this_rq->idle_avg * nr_batch / nr_idle; + if (this_rq->nr_batch >= goal) + return; /* - * Slow path - we 'push' the previous process and - * reschedule_idle() will attempt to find a new - * processor for it. (but it might preempt the - * current process as well.) We must take the runqueue - * lock and re-check prev->state to be correct. It might - * still happen that this process has a preemption - * 'in progress' already - but this is not a problem and - * might happen in other circumstances as well. + * The slow path - the local batch-queue is too short and + * needs balancing. 
We unlock the runqueue (but keep + * interrupts disabled) to simplify locking. (It does not + * matter if the runqueues change meanwhile - this is all + * statistical balancing so only the long run effects matter.) */ -needs_resched: - { - unsigned long flags; + spin_unlock(&this_rq->lock); - /* - * Avoid taking the runqueue lock in cases where - * no preemption-check is necessery: - */ - if ((prev == idle_task(smp_processor_id())) || - (policy & SCHED_YIELD)) - goto out_unlock; + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_online(i) || (i == this_cpu)) + continue; - spin_lock_irqsave(&runqueue_lock, flags); - if ((prev->state == TASK_RUNNING) && !task_has_cpu(prev)) - reschedule_idle(prev); - spin_unlock_irqrestore(&runqueue_lock, flags); - goto out_unlock; + rq_src = cpu_rq(i); + double_rq_lock(this_rq, rq_src); + + rq_goal = rq_src->idle_avg * nr_batch / nr_idle; + + if (rq_src->nr_batch > rq_goal) { + /* + * Migrate a single batch-process. + */ + list_t *tmp = rq_src->batch_queue.prev; + + list_del(tmp); + list_add_tail(tmp, &this_rq->batch_queue); + rq_src->nr_batch--; + this_rq->nr_batch++; + set_task_cpu(list_entry(tmp, task_t, run_list), this_cpu); + } + + double_rq_unlock(this_rq, rq_src); + if (this_rq->nr_batch >= goal) + break; } -#else - prev->policy &= ~SCHED_YIELD; -#endif /* CONFIG_SMP */ + spin_lock(&this_rq->lock); +} +/* + * Lock the busiest runqueue as well, this_rq is locked already. + * Recalculate nr_running if we have to drop the runqueue lock. + */ +static inline unsigned int double_lock_balance(runqueue_t *this_rq, + runqueue_t *busiest, int this_cpu, int idle, unsigned int nr_running) +{ + if (unlikely(!spin_trylock(&busiest->lock))) { + if (busiest < this_rq) { + spin_unlock(&this_rq->lock); + spin_lock(&busiest->lock); + spin_lock(&this_rq->lock); + /* Need to recalculate nr_running */ + if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu])) + nr_running = this_rq->nr_running; + else + nr_running = this_rq->prev_nr_running[this_cpu]; + } else + spin_lock(&busiest->lock); + } + return nr_running; +} + +static inline runqueue_t *find_busiest_queue(runqueue_t *this_rq, int this_cpu, int idle, int *imbalance) +{ + int nr_running, load, max_load, i; + runqueue_t *busiest, *rq_src; + + /* + * We search all runqueues to find the most busy one. + * We do this lockless to reduce cache-bouncing overhead, + * we re-check the 'best' source CPU later on again, with + * the lock held. + * + * We fend off statistical fluctuations in runqueue lengths by + * saving the runqueue length during the previous load-balancing + * operation and using the smaller one the current and saved lengths. + * If a runqueue is long enough for a longer amount of time then + * we recognize it and pull tasks from it. + * + * The 'current runqueue length' is a statistical maximum variable, + * for that one we take the longer one - to avoid fluctuations in + * the other direction. So for a load-balance to happen it needs + * stable long runqueue on the target CPU and stable short runqueue + * on the local runqueue. + * + * We make an exception if this CPU is about to become idle - in + * that case we are less picky about moving a task across CPUs and + * take what can be taken. 
+ */ + if (idle || (this_rq->nr_running > this_rq->prev_nr_running[this_cpu])) + nr_running = this_rq->nr_running; + else + nr_running = this_rq->prev_nr_running[this_cpu]; + + busiest = NULL; + max_load = 1; + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_online(i)) + continue; + + rq_src = cpu_rq(i); + if (idle || (rq_src->nr_running < this_rq->prev_nr_running[i])) + load = rq_src->nr_running; + else + load = this_rq->prev_nr_running[i]; + this_rq->prev_nr_running[i] = rq_src->nr_running; + + if ((load > max_load) && (rq_src != this_rq)) { + busiest = rq_src; + max_load = load; + } + } + + if (likely(!busiest)) + goto out; + + *imbalance = (max_load - nr_running) / 2; + + /* It needs an at least ~25% imbalance to trigger balancing. */ + if (!idle && (*imbalance < (max_load + 3)/4)) { + busiest = NULL; + goto out; + } + + nr_running = double_lock_balance(this_rq, busiest, this_cpu, idle, nr_running); + /* + * Make sure nothing changed since we checked the + * runqueue length. + */ + if (busiest->nr_running <= nr_running + 1) { + spin_unlock(&busiest->lock); + busiest = NULL; + } +out: + return busiest; } -asmlinkage void schedule_tail(struct task_struct *prev) +/* + * Move a task from a remote runqueue to the local runqueue. + * Both runqueues must be locked. + */ +static inline void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, runqueue_t *this_rq, int this_cpu) { - __schedule_tail(prev); + dequeue_task(p, src_array); + src_rq->nr_running--; + set_task_cpu(p, this_cpu); + this_rq->nr_running++; + enqueue_task(p, this_rq->active); + /* + * Note that idle threads have a prio of MAX_PRIO, for this test + * to be always true for them. + */ + if (p->prio < this_rq->curr->prio) + set_need_resched(); } /* - * 'schedule()' is the scheduler function. It's a very simple and nice - * scheduler: it's not perfect, but certainly works for most things. - * - * The goto is "interesting". + * Current runqueue is empty, or rebalance tick: if there is an + * inbalance (current runqueue is too short) then pull from + * busiest runqueue(s). * - * NOTE!! Task 0 is the 'idle' task, which gets called when no other - * tasks can run. It can not be killed, and it cannot sleep. The 'state' - * information in task[0] is never used. + * We call this with the current runqueue locked, + * irqs disabled. */ -asmlinkage void schedule(void) +static void load_balance(runqueue_t *this_rq, int idle) { - struct schedule_data * sched_data; - struct task_struct *prev, *next, *p; - struct list_head *tmp; - int this_cpu, c; + int imbalance, idx, this_cpu = smp_processor_id(); + runqueue_t *busiest; + prio_array_t *array; + list_t *head, *curr; + task_t *tmp; + busiest = find_busiest_queue(this_rq, this_cpu, idle, &imbalance); + if (!busiest) + goto balance_batch; - spin_lock_prefetch(&runqueue_lock); + /* + * We first consider expired tasks. Those will likely not be + * executed in the near future, and they are most likely to + * be cache-cold, thus switching CPUs has the least effect + * on them. 
+ */ + if (busiest->expired->nr_active) + array = busiest->expired; + else + array = busiest->active; - BUG_ON(!current->active_mm); -need_resched_back: - prev = current; - this_cpu = prev->processor; +new_array: + /* Start searching at priority 0: */ + idx = 0; +skip_bitmap: + if (!idx) + idx = sched_find_first_bit(array->bitmap); + else + idx = find_next_bit(array->bitmap, MAX_PRIO, idx); + if (idx == MAX_PRIO) { + if (array == busiest->expired) { + array = busiest->active; + goto new_array; + } + goto out_unlock; + } - if (unlikely(in_interrupt())) { - printk("Scheduling in interrupt\n"); - BUG(); + head = array->queue + idx; + curr = head->prev; +skip_queue: + tmp = list_entry(curr, task_t, run_list); + + /* + * We do not migrate tasks that are: + * 1) running (obviously), or + * 2) cannot be migrated to this CPU due to cpus_allowed, or + * 3) are cache-hot on their current CPU. + */ + +#define CAN_MIGRATE_TASK(p,rq,this_cpu) \ + ((jiffies - (p)->timestamp > cache_decay_ticks) && \ + !task_running(rq, p) && \ + ((p)->cpus_allowed & (1UL << (this_cpu)))) + + curr = curr->prev; + + if (!CAN_MIGRATE_TASK(tmp, busiest, this_cpu)) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; + } + pull_task(busiest, array, tmp, this_rq, this_cpu); + if (!idle && --imbalance) { + if (curr != head) + goto skip_queue; + idx++; + goto skip_bitmap; } +out_unlock: + spin_unlock(&busiest->lock); +balance_batch: + load_balance_batch(this_rq, this_cpu); +} + +/* + * One of the idle_cpu_tick() or the busy_cpu_tick() function will + * gets called every timer tick, on every CPU. Our balancing action + * frequency and balancing agressivity depends on whether the CPU is + * idle or not. + * + * busy-rebalance every 250 msecs. idle-rebalance every 1 msec. (or on + * systems with HZ=100, every 10 msecs.) + */ +#define BUSY_REBALANCE_TICK (HZ/4 ?: 1) +#define IDLE_REBALANCE_TICK (HZ/1000 ?: 1) + +static inline void idle_tick(runqueue_t *rq) +{ + if (jiffies % IDLE_REBALANCE_TICK) + return; + spin_lock(&rq->lock); + load_balance(rq, 1); + spin_unlock(&rq->lock); +} - release_kernel_lock(prev, this_cpu); +#endif - /* - * 'sched_data' is protected by the fact that we can run - * only one process per CPU. - */ - sched_data = & aligned_data[this_cpu].schedule_data; +/* + * We place interactive tasks back into the active array, if possible. + * + * To guarantee that this does not starve expired tasks we ignore the + * interactivity of a task if the first expired task had to wait more + * than a 'reasonable' amount of time. This deadline timeout is + * load-dependent, as the frequency of array switched decreases with + * increasing number of running tasks. We also ignore the interactivity + * if a better static_prio task has expired: + */ +#define EXPIRED_STARVING(rq) \ + (( STARVATION_LIMIT && ((rq)->expired_timestamp && \ + (jiffies - (rq)->expired_timestamp >= \ + STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ + ((rq)->curr->static_prio > (rq)->best_expired_prio)) - spin_lock_irq(&runqueue_lock); +/* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. 
+ */ +void scheduler_tick(int user_ticks, int sys_ticks) +{ + int cpu = smp_processor_id(); + runqueue_t *rq = this_rq(); + task_t *p = current; + +#if CONFIG_SMP + if (user_ticks || sys_ticks) { + /* + * This code is rare, triggered only once per second: + */ + if (--rq->idle_ticks_left <= 0) { + /* + * Maintain a simple running average: + */ + rq->idle_avg += rq->idle_count; + rq->idle_avg >>= 1; + + rq->idle_ticks_left = IDLE_TICKS; + rq->idle_count = 0; - /* move an exhausted RR process to be last.. */ - if (unlikely(prev->policy == SCHED_RR)) - if (!prev->counter) { - prev->counter = NICE_TO_TICKS(prev->nice); - move_last_runqueue(prev); } + } + if (p == rq->idle || p->policy == SCHED_BATCH) + rq->idle_count++; +#endif + if (p == rq->idle) { + if (local_bh_count(cpu) || local_irq_count(cpu) > 1) + kstat.per_cpu_system[cpu] += sys_ticks; +#if CONFIG_SMP + idle_tick(rq); +#endif + return; + } + if (TASK_NICE(p) > 0 || p->policy == SCHED_BATCH) + kstat.per_cpu_nice[cpu] += user_ticks; + else + kstat.per_cpu_user[cpu] += user_ticks; + kstat.per_cpu_system[cpu] += sys_ticks; - switch (prev->state) { - case TASK_INTERRUPTIBLE: - if (signal_pending(prev)) { - prev->state = TASK_RUNNING; - break; - } - default: - del_from_runqueue(prev); - case TASK_RUNNING:; + /* Task might have expired already, but not scheduled off yet */ + if (p->array != rq->active) { + set_tsk_need_resched(p); + return; + } + spin_lock(&rq->lock); + if (unlikely(rt_task(p))) { + /* + * RR tasks need a special form of timeslice management. + * FIFO tasks have no timeslices. + */ + if ((p->policy == SCHED_RR) && !--p->time_slice) { + p->time_slice = task_timeslice(p); + p->first_time_slice = 0; + set_tsk_need_resched(p); + + /* put it at the end of the queue: */ + dequeue_task(p, rq->active); + enqueue_task(p, rq->active); + } + goto out; + } + /* + * The task was running during this tick - update the + * time slice counter and the sleep average. Note: we + * do not update a process's priority until it either + * goes to sleep or uses up its timeslice. This makes + * it possible for interactive tasks to use up their + * timeslices at their highest priority levels. + */ + if (!--p->time_slice) { + dequeue_task(p, rq->active); + set_tsk_need_resched(p); + p->prio = effective_prio(p); + p->time_slice = task_timeslice(p); + p->first_time_slice = 0; + + if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { + if (!rq->expired_timestamp) + rq->expired_timestamp = jiffies; + enqueue_task(p, rq->expired); + if (p->static_prio < rq->best_expired_prio) + rq->best_expired_prio = p->static_prio; + } else + enqueue_task(p, rq->active); + } else { + /* + * Prevent a too long timeslice allowing a task to monopolize + * the CPU. We do this by splitting up the timeslice into + * smaller pieces. + * + * Note: this does not mean the task's timeslices expire or + * get lost in any way, they just might be preempted by + * another task of equal priority. (one with higher + * priority would have preempted this task already.) We + * requeue this task to the end of the list on this priority + * level, which is in essence a round-robin of tasks with + * equal priority. 
+ */ + if (p->mm && TASK_INTERACTIVE(p) && !((task_timeslice(p) - + p->time_slice) % TIMESLICE_GRANULARITY(p)) && + (p->time_slice > MIN_TIMESLICE) && + (p->array == rq->active)) { + dequeue_task(p, rq->active); + set_tsk_need_resched(p); + p->prio = effective_prio(p); + enqueue_task(p, rq->active); + } + + } +out: +#if CONFIG_SMP + if (!(jiffies % BUSY_REBALANCE_TICK)) + load_balance(rq, 0); +#endif + spin_unlock(&rq->lock); +} + +void scheduling_functions_start_here(void) { } + +/* + * This function is called by the lowlevel kernel entry code if + * pure userspace code is preempted. Such processes, if SCHED_BATCH, + * are candidates for batch scheduling. Every other process (including + * kernel-mode SCHED_BATCH processes) is scheduled in a non-batch way. + */ +asmlinkage void schedule_userspace(void) +{ + runqueue_t *rq; + + if (current->policy != SCHED_BATCH) { + schedule(); + return; } - prev->need_resched = 0; /* - * this is the scheduler proper: + * Only handle batch tasks that are runnable. */ + if (current->state == TASK_RUNNING) { + rq = this_rq_lock(); + deactivate_batch_task(current, rq); + + // we can keep irqs disabled: + spin_unlock(&rq->lock); + } + + schedule(); +} + +/* + * 'schedule()' is the main scheduler function. + */ +asmlinkage void schedule(void) +{ + task_t *prev, *next; + runqueue_t *rq; + prio_array_t *array; + list_t *queue; + unsigned long run_time; + int idx; + if (unlikely(in_interrupt())) + BUG(); -repeat_schedule: - /* - * Default process to select.. - */ - next = idle_task(this_cpu); - c = -1000; - list_for_each(tmp, &runqueue_head) { - p = list_entry(tmp, struct task_struct, run_list); - if (can_schedule(p, this_cpu)) { - int weight = goodness(p, this_cpu, prev->active_mm); - if (weight > c) - c = weight, next = p; - } - } +need_resched: + preempt_disable(); + prev = current; + rq = this_rq(); - /* Do we need to re-calculate counters? */ - if (unlikely(!c)) { - struct task_struct *p; - - spin_unlock_irq(&runqueue_lock); - read_lock(&tasklist_lock); - for_each_task(p) - p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice); - read_unlock(&tasklist_lock); - spin_lock_irq(&runqueue_lock); - goto repeat_schedule; - } + release_kernel_lock(prev, smp_processor_id()); + if (likely(jiffies - prev->timestamp < MAX_SLEEP_AVG)) + run_time = jiffies - prev->timestamp; + else + run_time = MAX_SLEEP_AVG; /* - * from this point on nothing can prevent us from - * switching to the next task, save this fact in - * sched_data. + * Tasks with interactive credits get charged less run_time + * at high sleep_avg to delay them losing their interactive + * status */ - sched_data->curr = next; - task_set_cpu(next, this_cpu); - spin_unlock_irq(&runqueue_lock); - - if (unlikely(prev == next)) { - /* We won't go through the normal tail, so do this by hand */ - prev->policy &= ~SCHED_YIELD; - goto same_process; - } - -#ifdef CONFIG_SMP - /* - * maintain the per-process 'last schedule' value. - * (this has to be recalculated even if we reschedule to - * the same process) Currently this is only used on SMP, - * and it's approximate, so we do not have to maintain - * it while holding the runqueue spinlock. - */ - sched_data->last_schedule = get_cycles(); + if (HIGH_CREDIT(prev)) + run_time /= (CURRENT_BONUS(prev) ? : 1); /* - * We drop the scheduler lock early (it's a global spinlock), - * thus we have to lock the previous process from getting - * rescheduled during switch_to(). + * Ensure everything gets charged at least one tick. 
*/ + if (!run_time) + run_time = 1; -#endif /* CONFIG_SMP */ + spin_lock_irq(&rq->lock); - kstat.context_swtch++; +#ifdef CONFIG_PREEMPT /* - * there are 3 processes which are affected by a context switch: - * - * prev == .... ==> (last => next) - * - * It's the 'much more previous' 'prev' that is on next's stack, - * but prev is set to (the just run) 'last' process by switch_to(). - * This might sound slightly confusing but makes tons of sense. + * if entering from preempt_schedule, off a kernel preemption, + * go straight to picking the next task. */ - prepare_to_switch(); - { - struct mm_struct *mm = next->mm; - struct mm_struct *oldmm = prev->active_mm; - if (!mm) { - BUG_ON(next->active_mm); - next->active_mm = oldmm; - atomic_inc(&oldmm->mm_count); - enter_lazy_tlb(oldmm, next, this_cpu); - } else { - BUG_ON(next->active_mm != mm); - switch_mm(oldmm, mm, next, this_cpu); + if (unlikely(preempt_get_count() & PREEMPT_ACTIVE)) + goto pick_next_task; +#endif + switch (prev->state) { + case TASK_INTERRUPTIBLE: + if (unlikely(signal_pending(prev))) { + prev->state = TASK_RUNNING; + break; } + default: + deactivate_task(prev, rq); + case TASK_RUNNING: + ; + } +pick_next_task: + if (unlikely(!rq->nr_running)) { +#if CONFIG_SMP + load_balance(rq, 1); + if (rq->nr_running) + goto pick_next_task; +#endif + /* + * Pick a task from the batch queue if available. + */ + if (rq->nr_batch) { + list_t *tmp = rq->batch_queue.next; - if (!prev->mm) { - prev->active_mm = NULL; - mmdrop(oldmm); - } + next = list_entry(tmp, task_t, run_list); + activate_batch_task(next, rq); + } else + next = rq->idle; + rq->expired_timestamp = 0; + goto switch_tasks; } - /* - * This just switches the register state and the - * stack. - */ - switch_to(prev, next, prev); - __schedule_tail(prev); + array = rq->active; + if (unlikely(!array->nr_active)) { + /* + * Switch the active and expired arrays. 
+ */ + rq->active = rq->expired; + rq->expired = array; + array = rq->active; + rq->expired_timestamp = 0; + rq->best_expired_prio = MAX_PRIO; + } + + idx = sched_find_first_bit(array->bitmap); + queue = array->queue + idx; + next = list_entry(queue->next, task_t, run_list); + + if (next->activated > 0) { + unsigned long delta = jiffies - next->timestamp; + + if (next->activated == 1) + delta = delta * ON_RUNQUEUE_WEIGHT / 100; + + array = next->array; + dequeue_task(next, array); + recalc_task_prio(next, next->timestamp + delta); + enqueue_task(next, array); + } + next->activated = 0; +switch_tasks: + prefetch(next); + clear_tsk_need_resched(prev); + + prev->sleep_avg -= run_time; + if ((long)prev->sleep_avg <= 0){ + prev->sleep_avg = 0; + if (!(HIGH_CREDIT(prev) || LOW_CREDIT(prev))) + prev->interactive_credit--; + } + prev->timestamp = jiffies; + + if (likely(prev != next)) { + rq->nr_switches++; + rq->curr = next; + + prepare_arch_switch(rq, next); + prev = context_switch(prev, next); + barrier(); + rq = this_rq(); + finish_arch_switch(rq, prev); + } else + spin_unlock_irq(&rq->lock); -same_process: reacquire_kernel_lock(current); - if (current->need_resched) - goto need_resched_back; - return; + preempt_enable_no_resched(); + if (need_resched()) + goto need_resched; +} + +#ifdef CONFIG_PREEMPT +/* + * this is is the entry point to schedule() from in-kernel preemption + */ +asmlinkage void preempt_schedule(void) +{ + if (unlikely(irqs_disabled())) + return; + +need_resched: + current->preempt_count += PREEMPT_ACTIVE; + schedule(); + current->preempt_count -= PREEMPT_ACTIVE; + + /* we could miss a preemption opportunity between schedule and now */ + barrier(); + if (unlikely(current->need_resched)) + goto need_resched; } +#endif /* CONFIG_PREEMPT */ /* - * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just wake everything - * up. If it's an exclusive wakeup (nr_exclusive == small +ve number) then we wake all the - * non-exclusive tasks and one exclusive task. + * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just + * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve + * number) then we wake all the non-exclusive tasks and one exclusive task. * * There are circumstances in which we can try to wake a task which has already - * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns zero - * in this (rare) case, and we handle it by contonuing to scan the queue. + * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns + * zero in this (rare) case, and we handle it by continuing to scan the queue. 
*/ -static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, const int sync) +static inline void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int sync) { struct list_head *tmp; - struct task_struct *p; - - CHECK_MAGIC_WQHEAD(q); - WQ_CHECK_LIST_HEAD(&q->task_list); - - list_for_each(tmp,&q->task_list) { - unsigned int state; - wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); + unsigned int state; + wait_queue_t *curr; + task_t *p; - CHECK_MAGIC(curr->__magic); + list_for_each(tmp, &q->task_list) { + curr = list_entry(tmp, wait_queue_t, task_list); p = curr->task; state = p->state; - if (state & mode) { - WQ_NOTE_WAKER(curr); - if (try_to_wake_up(p, sync) && (curr->flags&WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) + if ((state & mode) && try_to_wake_up(p, sync) && + ((curr->flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)) break; - } } } -void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, int nr) +void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) { - if (q) { - unsigned long flags; - wq_read_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr, 0); - wq_read_unlock_irqrestore(&q->lock, flags); - } + unsigned long flags; + + if (unlikely(!q)) + return; + + spin_lock_irqsave(&q->lock, flags); + __wake_up_common(q, mode, nr_exclusive, 0); + spin_unlock_irqrestore(&q->lock, flags); } -void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr) +#if CONFIG_SMP + +void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) { - if (q) { - unsigned long flags; - wq_read_lock_irqsave(&q->lock, flags); - __wake_up_common(q, mode, nr, 1); - wq_read_unlock_irqrestore(&q->lock, flags); - } + unsigned long flags; + + if (unlikely(!q)) + return; + + spin_lock_irqsave(&q->lock, flags); + if (likely(nr_exclusive)) + __wake_up_common(q, mode, nr_exclusive, 1); + else + __wake_up_common(q, mode, nr_exclusive, 0); + spin_unlock_irqrestore(&q->lock, flags); } +#endif + void fastcall complete(struct completion *x) { unsigned long flags; @@ -791,14 +1494,14 @@ init_waitqueue_entry(&wait, current); #define SLEEP_ON_HEAD \ - wq_write_lock_irqsave(&q->lock,flags); \ + spin_lock_irqsave(&q->lock,flags); \ __add_wait_queue(q, &wait); \ - wq_write_unlock(&q->lock); + spin_unlock(&q->lock); #define SLEEP_ON_TAIL \ - wq_write_lock_irq(&q->lock); \ + spin_lock_irq(&q->lock); \ __remove_wait_queue(q, &wait); \ - wq_write_unlock_irqrestore(&q->lock,flags); + spin_unlock_irqrestore(&q->lock,flags); void fastcall interruptible_sleep_on(wait_queue_head_t *q) { @@ -850,43 +1553,41 @@ void scheduling_functions_end_here(void) { } -#if CONFIG_SMP -/** - * set_cpus_allowed() - change a given task's processor affinity - * @p: task to bind - * @new_mask: bitmask of allowed processors - * - * Upon return, the task is running on a legal processor. Note the caller - * must have a valid reference to the task: it must not exit() prematurely. - * This call can sleep; do not hold locks on call. - */ -void set_cpus_allowed(struct task_struct *p, unsigned long new_mask) +void set_user_nice(task_t *p, long nice) { - new_mask &= cpu_online_map; - BUG_ON(!new_mask); - - p->cpus_allowed = new_mask; + unsigned long flags; + prio_array_t *array; + runqueue_t *rq; + if (TASK_NICE(p) == nice || nice < -20 || nice > 19) + return; /* - * If the task is on a no-longer-allowed processor, we need to move - * it. 
If the task is not current, then set need_resched and send - * its processor an IPI to reschedule. - */ - if (!(p->cpus_runnable & p->cpus_allowed)) { - if (p != current) { - p->need_resched = 1; - smp_send_reschedule(p->processor); - } + * We have to be careful, if called from sys_setpriority(), + * the task might be in the middle of scheduling on another CPU. + */ + rq = task_rq_lock(p, &flags); + if (rt_task(p)) { + p->static_prio = NICE_TO_PRIO(nice); + goto out_unlock; + } + array = p->array; + if (array) + dequeue_task(p, array); + p->static_prio = NICE_TO_PRIO(nice); + p->prio = NICE_TO_PRIO(nice); + if (array) { + enqueue_task(p, array); /* - * Wait until we are on a legal processor. If the task is - * current, then we should be on a legal processor the next - * time we reschedule. Otherwise, we need to wait for the IPI. + * If the task is running and lowered its priority, + * or increased its priority then reschedule its CPU: */ - while (!(p->cpus_runnable & p->cpus_allowed)) - schedule(); + if ((NICE_TO_PRIO(nice) < p->static_prio) || + task_running(rq, p)) + resched_task(rq->curr); } +out_unlock: + task_rq_unlock(rq, &flags); } -#endif /* CONFIG_SMP */ #ifndef __alpha__ @@ -898,7 +1599,7 @@ asmlinkage long sys_nice(int increment) { - long newprio; + long nice; /* * Setpriority might change our priority at the same moment. @@ -914,34 +1615,52 @@ if (increment > 40) increment = 40; - newprio = current->nice + increment; - if (newprio < -20) - newprio = -20; - if (newprio > 19) - newprio = 19; - current->nice = newprio; + nice = PRIO_TO_NICE(current->static_prio) + increment; + if (nice < -20) + nice = -20; + if (nice > 19) + nice = 19; + set_user_nice(current, nice); return 0; } #endif -static inline struct task_struct *find_process_by_pid(pid_t pid) +/* + * This is the priority value as seen by users in /proc + * + * RT tasks are offset by -200. Normal tasks are centered + * around 0, value goes from -16 to +15. + */ +int task_prio(task_t *p) +{ + return p->prio - MAX_USER_RT_PRIO; +} + +int task_nice(task_t *p) +{ + return TASK_NICE(p); +} + +int idle_cpu(int cpu) { - struct task_struct *tsk = current; + return cpu_curr(cpu) == cpu_rq(cpu)->idle; +} - if (pid) - tsk = find_task_by_pid(pid); - return tsk; +static inline task_t *find_process_by_pid(pid_t pid) +{ + return pid ? find_task_by_pid(pid) : current; } -static int setscheduler(pid_t pid, int policy, - struct sched_param *param) +static int setscheduler(pid_t pid, int policy, struct sched_param *param) { struct sched_param lp; - struct task_struct *p; - int retval; + int retval = -EINVAL; + prio_array_t *array; + unsigned long flags; + runqueue_t *rq; + task_t *p; - retval = -EINVAL; if (!param || pid < 0) goto out_nounlock; @@ -953,56 +1672,73 @@ * We play safe to avoid deadlocks. */ read_lock_irq(&tasklist_lock); - spin_lock(&runqueue_lock); p = find_process_by_pid(pid); retval = -ESRCH; if (!p) - goto out_unlock; - + goto out_unlock_tasklist; + + /* + * To be able to change p->policy safely, the apropriate + * runqueue lock must be held. + */ + rq = task_rq_lock(p, &flags); + if (policy < 0) policy = p->policy; else { retval = -EINVAL; if (policy != SCHED_FIFO && policy != SCHED_RR && - policy != SCHED_OTHER) + policy != SCHED_NORMAL && policy != SCHED_BATCH) goto out_unlock; } - + /* - * Valid priorities for SCHED_FIFO and SCHED_RR are 1..99, valid - * priority for SCHED_OTHER is 0. + * Valid priorities for SCHED_FIFO and SCHED_RR are + * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and + * SCHED_BATCH is 0. 
*/ retval = -EINVAL; - if (lp.sched_priority < 0 || lp.sched_priority > 99) + if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1) goto out_unlock; - if ((policy == SCHED_OTHER) != (lp.sched_priority == 0)) + if ((policy == SCHED_NORMAL || policy == SCHED_BATCH) != + (lp.sched_priority == 0)) goto out_unlock; retval = -EPERM; - if ((policy == SCHED_FIFO || policy == SCHED_RR) && + if ((policy == SCHED_FIFO || policy == SCHED_RR) && !capable(CAP_SYS_NICE)) goto out_unlock; if ((current->euid != p->euid) && (current->euid != p->uid) && !capable(CAP_SYS_NICE)) goto out_unlock; + if (p->flags & PF_BATCH) + activate_batch_task(p, rq); + array = p->array; + if (array) + deactivate_task(p, task_rq(p)); retval = 0; p->policy = policy; p->rt_priority = lp.sched_priority; - - current->need_resched = 1; + if (policy != SCHED_NORMAL && policy != SCHED_BATCH) + p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority; + else + p->prio = p->static_prio; + if (array) + activate_task(p, task_rq(p)); out_unlock: - spin_unlock(&runqueue_lock); + task_rq_unlock(rq, &flags); +out_unlock_tasklist: read_unlock_irq(&tasklist_lock); out_nounlock: return retval; } -asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, +asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, struct sched_param *param) { return setscheduler(pid, policy, param); @@ -1015,10 +1751,9 @@ asmlinkage long sys_sched_getscheduler(pid_t pid) { - struct task_struct *p; - int retval; + int retval = -EINVAL; + task_t *p; - retval = -EINVAL; if (pid < 0) goto out_nounlock; @@ -1026,7 +1761,7 @@ read_lock(&tasklist_lock); p = find_process_by_pid(pid); if (p) - retval = p->policy & ~SCHED_YIELD; + retval = p->policy; read_unlock(&tasklist_lock); out_nounlock: @@ -1035,11 +1770,10 @@ asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param *param) { - struct task_struct *p; struct sched_param lp; - int retval; + int retval = -EINVAL; + task_t *p; - retval = -EINVAL; if (!param || pid < 0) goto out_nounlock; @@ -1064,44 +1798,121 @@ return retval; } -asmlinkage long sys_sched_yield(void) +/** + * sys_sched_setaffinity - set the cpu affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to the new cpu mask + */ +asmlinkage int sys_sched_setaffinity(pid_t pid, unsigned int len, + unsigned long *user_mask_ptr) { + unsigned long new_mask; + int retval; + task_t *p; + + if (len < sizeof(new_mask)) + return -EINVAL; + + if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask))) + return -EFAULT; + + new_mask &= cpu_online_map; + if (!new_mask) + return -EINVAL; + + read_lock(&tasklist_lock); + + p = find_process_by_pid(pid); + if (!p) { + read_unlock(&tasklist_lock); + return -ESRCH; + } + /* - * Trick. sched_yield() first counts the number of truly - * 'pending' runnable processes, then returns if it's - * only the current processes. (This test does not have - * to be atomic.) In threaded applications this optimization - * gets triggered quite often. + * It is not safe to call set_cpus_allowed with the + * tasklist_lock held. We will bump the task_struct's + * usage count and then drop tasklist_lock. 
*/ + get_task_struct(p); + read_unlock(&tasklist_lock); - int nr_pending = nr_running; + retval = -EPERM; + if ((current->euid != p->euid) && (current->euid != p->uid) && + !capable(CAP_SYS_NICE)) + goto out_unlock; -#if CONFIG_SMP - int i; + retval = 0; + set_cpus_allowed(p, new_mask); - // Subtract non-idle processes running on other CPUs. - for (i = 0; i < smp_num_cpus; i++) { - int cpu = cpu_logical_map(i); - if (aligned_data[cpu].schedule_data.curr != idle_task(cpu)) - nr_pending--; - } -#else - // on UP this process is on the runqueue as well - nr_pending--; -#endif - if (nr_pending) { - /* - * This process can only be rescheduled by us, - * so this is safe without any locking. - */ - if (current->policy == SCHED_OTHER) - current->policy |= SCHED_YIELD; - current->need_resched = 1; - - spin_lock_irq(&runqueue_lock); - move_last_runqueue(current); - spin_unlock_irq(&runqueue_lock); +out_unlock: + free_task_struct(p); + return retval; +} + +/** + * sys_sched_getaffinity - get the cpu affinity of a process + * @pid: pid of the process + * @len: length in bytes of the bitmask pointed to by user_mask_ptr + * @user_mask_ptr: user-space pointer to hold the current cpu mask + */ +asmlinkage int sys_sched_getaffinity(pid_t pid, unsigned int len, + unsigned long *user_mask_ptr) +{ + unsigned int real_len; + unsigned long mask; + int retval; + task_t *p; + + real_len = sizeof(mask); + if (len < real_len) + return -EINVAL; + + read_lock(&tasklist_lock); + + retval = -ESRCH; + p = find_process_by_pid(pid); + if (!p) + goto out_unlock; + + retval = 0; + mask = p->cpus_allowed & cpu_online_map; + +out_unlock: + read_unlock(&tasklist_lock); + if (retval) + return retval; + if (copy_to_user(user_mask_ptr, &mask, real_len)) + return -EFAULT; + return real_len; +} + +asmlinkage long sys_sched_yield(void) +{ + runqueue_t *rq; + prio_array_t *array; + preempt_disable(); + rq = this_rq_lock(); + array = current->array; + + /* + * We implement yielding by moving the task into the expired + * queue. + * + * (special rule: RT tasks will just roundrobin in the active + * array.) + */ + if (likely(!rt_task(current))) { + dequeue_task(current, array); + enqueue_task(current, rq->expired); + } else { + list_del(¤t->run_list); + list_add_tail(¤t->run_list, array->queue + current->prio); } + spin_unlock(&rq->lock); + + schedule(); + return 0; } @@ -1131,9 +1942,10 @@ switch (policy) { case SCHED_FIFO: case SCHED_RR: - ret = 99; + ret = MAX_USER_RT_PRIO-1; break; - case SCHED_OTHER: + case SCHED_NORMAL: + case SCHED_BATCH: ret = 0; break; } @@ -1149,7 +1961,8 @@ case SCHED_RR: ret = 1; break; - case SCHED_OTHER: + case SCHED_NORMAL: + case SCHED_BATCH: ret = 0; } return ret; @@ -1157,9 +1970,9 @@ asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval) { - struct timespec t; - struct task_struct *p; int retval = -EINVAL; + struct timespec t; + task_t *p; if (pid < 0) goto out_nounlock; @@ -1168,8 +1981,8 @@ read_lock(&tasklist_lock); p = find_process_by_pid(pid); if (p) - jiffies_to_timespec(p->policy & SCHED_FIFO ? 0 : NICE_TO_TICKS(p->nice), - &t); + jiffies_to_timespec(p->policy & SCHED_FIFO ? + 0 : task_timeslice(p), &t); read_unlock(&tasklist_lock); if (p) retval = copy_to_user(interval, &t, sizeof(t)) ? 
-EFAULT : 0; @@ -1177,14 +1990,14 @@ return retval; } -static void show_task(struct task_struct * p) +static void show_task(task_t * p) { unsigned long free = 0; int state; static const char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" }; printk("%-13.13s ", p->comm); - state = p->state ? ffz(~p->state) + 1 : 0; + state = p->state ? __ffs(p->state) + 1 : 0; if (((unsigned) state) < sizeof(stat_nam)/sizeof(char *)) printk(stat_nam[state]); else @@ -1225,7 +2038,7 @@ printk(" (NOTLB)\n"); { - extern void show_trace_task(struct task_struct *tsk); + extern void show_trace_task(task_t *tsk); show_trace_task(p); } } @@ -1247,7 +2060,7 @@ void show_state(void) { - struct task_struct *p; + task_t *p; #if (BITS_PER_LONG == 32) printk("\n" @@ -1270,121 +2083,251 @@ read_unlock(&tasklist_lock); } -/** - * reparent_to_init() - Reparent the calling kernel thread to the init task. - * - * If a kernel thread is launched as a result of a system call, or if - * it ever exits, it should generally reparent itself to init so that - * it is correctly cleaned up on exit. - * - * The various task state such as scheduling policy and priority may have - * been inherited fro a user process, so we reset them to sane values here. - * - * NOTE that reparent_to_init() gives the caller full capabilities. - */ -void reparent_to_init(void) +void __init init_idle(task_t *idle, int cpu) { - struct task_struct *this_task = current; + runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(task_cpu(idle)); + unsigned long flags; + + __save_flags(flags); + __cli(); + double_rq_lock(idle_rq, rq); + + idle_rq->curr = idle_rq->idle = idle; + deactivate_task(idle, rq); + idle->array = NULL; + idle->prio = MAX_PRIO; + idle->state = TASK_RUNNING; + set_task_cpu(idle, cpu); + double_rq_unlock(idle_rq, rq); + set_tsk_need_resched(idle); + __restore_flags(flags); + + /* Set the preempt count _outside_ the spinlocks! */ + idle->preempt_count = (idle->lock_depth >= 0); +} - write_lock_irq(&tasklist_lock); +#if CONFIG_SMP - /* Reparent to init */ - REMOVE_LINKS(this_task); - this_task->p_pptr = child_reaper; - this_task->p_opptr = child_reaper; - SET_LINKS(this_task); +/* + * This is how migration works: + * + * 1) we queue a migration_req_t structure in the source CPU's + * runqueue and wake up that CPU's migration thread. + * 2) we down() the locked semaphore => thread blocks. + * 3) migration thread wakes up (implicitly it forces the migrated + * thread off the CPU) + * 4) it gets the migration request and checks whether the migrated + * task is still in the wrong runqueue. + * 5) if it's in the wrong runqueue then the migration thread removes + * it and puts it into the right queue. + * 6) migration thread up()s the semaphore. + * 7) we wake up and the migration is done. + */ + +typedef struct { + list_t list; + task_t *task; + struct semaphore sem; +} migration_req_t; - /* Set the exit signal to SIGCHLD so we signal init on exit */ - this_task->exit_signal = SIGCHLD; +/* + * Change a given task's CPU affinity. Migrate the process to a + * proper CPU and schedule it away if the CPU it's executing on + * is removed from the allowed bitmask. + * + * NOTE: the caller must have a valid reference to the task, the + * task must not exit() & deallocate itself prematurely. The + * call is not atomic; no spinlocks may be held. 
+ */ +void set_cpus_allowed(task_t *p, unsigned long new_mask) +{ + unsigned long flags; + migration_req_t req; + runqueue_t *rq; - /* We also take the runqueue_lock while altering task fields - * which affect scheduling decisions */ - spin_lock(&runqueue_lock); + new_mask &= cpu_online_map; + if (!new_mask) + BUG(); - this_task->ptrace = 0; - this_task->nice = DEF_NICE; - this_task->policy = SCHED_OTHER; - /* cpus_allowed? */ - /* rt_priority? */ - /* signals? */ - this_task->cap_effective = CAP_INIT_EFF_SET; - this_task->cap_inheritable = CAP_INIT_INH_SET; - this_task->cap_permitted = CAP_FULL_SET; - this_task->keep_capabilities = 0; - memcpy(this_task->rlim, init_task.rlim, sizeof(*(this_task->rlim))); - switch_uid(INIT_USER); + preempt_disable(); + rq = task_rq_lock(p, &flags); + p->cpus_allowed = new_mask; + /* + * Can the task run on the task's current CPU? If not then + * migrate the process off to a proper CPU. + */ + if (new_mask & (1UL << task_cpu(p))) { + task_rq_unlock(rq, &flags); + goto out; + } + /* + * If the task is not on a runqueue (and not running), then + * it is sufficient to simply update the task's cpu field. + */ + if (!p->array && !task_running(rq, p)) { + set_task_cpu(p, __ffs(p->cpus_allowed)); + task_rq_unlock(rq, &flags); + goto out; + } + init_MUTEX_LOCKED(&req.sem); + req.task = p; + list_add(&req.list, &rq->migration_queue); + task_rq_unlock(rq, &flags); + wake_up_process(rq->migration_thread); - spin_unlock(&runqueue_lock); - write_unlock_irq(&tasklist_lock); + down(&req.sem); +out: + preempt_enable(); + return; } -/* - * Put all the gunge required to become a kernel thread without - * attached user resources in one place where it belongs. - */ +static __initdata int master_migration_thread; -void daemonize(void) +static int migration_thread(void * bind_cpu) { - struct fs_struct *fs; + int cpu = (int) (long) bind_cpu; + struct sched_param param = { sched_priority: MAX_RT_PRIO-1 }; + runqueue_t *rq; + int ret; + daemonize(); + sigfillset(¤t->blocked); + set_fs(KERNEL_DS); /* - * If we were started as result of loading a module, close all of the - * user space pages. We don't need them, and if we didn't close them - * they would be locked into memory. + * The first migration thread is started on the boot CPU, it + * migrates the other migration threads to their destination CPUs. 
*/ - exit_mm(current); + if (cpu != master_migration_thread) { + while (!cpu_rq(master_migration_thread)->migration_thread) + yield(); + set_cpus_allowed(current, 1UL << cpu); + } + printk("migration_task %d on cpu=%d\n", cpu, smp_processor_id()); + ret = setscheduler(0, SCHED_FIFO, ¶m); - current->session = 1; - current->pgrp = 1; - current->tty = NULL; + rq = this_rq(); + rq->migration_thread = current; - /* Become as one with the init task */ + sprintf(current->comm, "migration_CPU%d", smp_processor_id()); - exit_fs(current); /* current->fs->count--; */ - fs = init_task.fs; - current->fs = fs; - atomic_inc(&fs->count); - exit_files(current); - current->files = init_task.files; - atomic_inc(¤t->files->count); -} + for (;;) { + runqueue_t *rq_src, *rq_dest; + struct list_head *head; + int cpu_src, cpu_dest; + migration_req_t *req; + unsigned long flags; + task_t *p; + + spin_lock_irqsave(&rq->lock, flags); + head = &rq->migration_queue; + current->state = TASK_INTERRUPTIBLE; + if (list_empty(head)) { + spin_unlock_irqrestore(&rq->lock, flags); + schedule(); + continue; + } + req = list_entry(head->next, migration_req_t, list); + list_del_init(head->next); + spin_unlock_irqrestore(&rq->lock, flags); + + p = req->task; + cpu_dest = __ffs(p->cpus_allowed); + rq_dest = cpu_rq(cpu_dest); +repeat: + cpu_src = task_cpu(p); + rq_src = cpu_rq(cpu_src); + + local_irq_save(flags); + double_rq_lock(rq_src, rq_dest); + if (task_cpu(p) != cpu_src) { + double_rq_unlock(rq_src, rq_dest); + local_irq_restore(flags); + goto repeat; + } + if (rq_src == rq) { + set_task_cpu(p, cpu_dest); + if (p->array) { + deactivate_task(p, rq_src); + activate_task(p, rq_dest); + } + } + double_rq_unlock(rq_src, rq_dest); + local_irq_restore(flags); -extern unsigned long wait_init_idle; + up(&req->sem); + } +} -void __init init_idle(void) +void __init migration_init(void) { - struct schedule_data * sched_data; - sched_data = &aligned_data[smp_processor_id()].schedule_data; + int cpu; + + master_migration_thread = smp_processor_id(); + current->cpus_allowed = 1UL << master_migration_thread; + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + if (!cpu_online(cpu)) + continue; + if (kernel_thread(migration_thread, (void *) (long) cpu, + CLONE_FS | CLONE_FILES | CLONE_SIGNAL) < 0) + BUG(); + } + current->cpus_allowed = -1L; - if (current != &init_task && task_on_runqueue(current)) { - printk("UGH! 
(%d:%d) was on the runqueue, removing.\n", - smp_processor_id(), current->pid); - del_from_runqueue(current); + for (cpu = 0; cpu < NR_CPUS; cpu++) { + if (!cpu_online(cpu)) + continue; + while (!cpu_rq(cpu)->migration_thread) + schedule_timeout(2); } - sched_data->curr = current; - sched_data->last_schedule = get_cycles(); - clear_bit(current->processor, &wait_init_idle); } +#endif -extern void init_timervecs (void); +extern void init_timervecs(void); +extern void timer_bh(void); +extern void tqueue_bh(void); +extern void immediate_bh(void); void __init sched_init(void) { + runqueue_t *rq; + int i, j, k; + + for (i = 0; i < NR_CPUS; i++) { + prio_array_t *array; + + rq = cpu_rq(i); + rq->active = rq->arrays; + rq->expired = rq->arrays + 1; + rq->best_expired_prio = MAX_PRIO; + + spin_lock_init(&rq->lock); + INIT_LIST_HEAD(&rq->migration_queue); + INIT_LIST_HEAD(&rq->batch_queue); + rq->idle_ticks_left = IDLE_TICKS; + + for (j = 0; j < 2; j++) { + array = rq->arrays + j; + for (k = 0; k < MAX_PRIO; k++) { + INIT_LIST_HEAD(array->queue + k); + __clear_bit(k, array->bitmap); + } + // delimiter for bitsearch + __set_bit(MAX_PRIO, array->bitmap); + } + } /* * We have to do a little magic to get the first * process right in SMP mode. */ - int cpu = smp_processor_id(); - int nr; - - init_task.processor = cpu; - - for(nr = 0; nr < PIDHASH_SZ; nr++) - pidhash[nr] = NULL; + rq = this_rq(); + rq->curr = current; + rq->idle = current; + set_task_cpu(current, smp_processor_id()); + wake_up_process(current); init_timervecs(); - init_bh(TIMER_BH, timer_bh); init_bh(TQUEUE_BH, tqueue_bh); init_bh(IMMEDIATE_BH, immediate_bh); @@ -1393,5 +2336,94 @@ * The boot idle thread does lazy MMU switching as well: */ atomic_inc(&init_mm.mm_count); - enter_lazy_tlb(&init_mm, current, cpu); + enter_lazy_tlb(&init_mm, current, smp_processor_id()); } + +#if LOWLATENCY_NEEDED +#if LOWLATENCY_DEBUG + +static struct lolat_stats_t *lolat_stats_head; +static spinlock_t lolat_stats_lock = SPIN_LOCK_UNLOCKED; + +void set_running_and_schedule(struct lolat_stats_t *stats) +{ + spin_lock(&lolat_stats_lock); + if (stats->visited == 0) { + stats->visited = 1; + stats->next = lolat_stats_head; + lolat_stats_head = stats; + } + stats->count++; + spin_unlock(&lolat_stats_lock); + + if (current->state != TASK_RUNNING) + set_current_state(TASK_RUNNING); + schedule(); +} + +void show_lolat_stats(void) +{ + struct lolat_stats_t *stats = lolat_stats_head; + + printk("Low latency scheduling stats:\n"); + while (stats) { + printk("%s:%d: %lu\n", stats->file, stats->line, stats->count); + stats->count = 0; + stats = stats->next; + } +} + +#else /* LOWLATENCY_DEBUG */ + +void set_running_and_schedule() +{ + if (current->state != TASK_RUNNING) + __set_current_state(TASK_RUNNING); + schedule(); +} + +#endif /* LOWLATENCY_DEBUG */ + +int ll_copy_to_user(void *to_user, const void *from, unsigned long len) +{ + while (len) { + unsigned long n_to_copy = len; + unsigned long remainder; + + if (n_to_copy > 4096) + n_to_copy = 4096; + remainder = copy_to_user(to_user, from, n_to_copy); + if (remainder) + return remainder + len; + to_user = ((char *)to_user) + n_to_copy; + from = ((char *)from) + n_to_copy; + len -= n_to_copy; + conditional_schedule(); + } + return 0; +} + +int ll_copy_from_user(void *to, const void *from_user, unsigned long len) +{ + while (len) { + unsigned long n_to_copy = len; + unsigned long remainder; + + if (n_to_copy > 4096) + n_to_copy = 4096; + remainder = copy_from_user(to, from_user, n_to_copy); + if (remainder) + return 
remainder + len;
+        to = ((char *)to) + n_to_copy;
+        from_user = ((char *)from_user) + n_to_copy;
+        len -= n_to_copy;
+        conditional_schedule();
+    }
+    return 0;
+}
+
+#ifdef CONFIG_LOLAT_SYSCTL
+struct low_latency_enable_struct __enable_lowlatency = { 0, };
+#endif
+
+#endif /* LOWLATENCY_NEEDED */
diff -Nur linux-2.4.33-imedia/kernel/signal.c linux-2.4.33-imedia-patching/kernel/signal.c
--- linux-2.4.33-imedia/kernel/signal.c 2004-02-18 15:36:32.000000000 +0200
+++ linux-2.4.33-imedia-patching/kernel/signal.c 2006-01-26 15:19:43.000000000 +0200
@@ -507,11 +507,9 @@
      * process of changing - but no harm is done by that
      * other than doing an extra (lightweight) IPI interrupt.
      */
-    spin_lock(&runqueue_lock);
-    if (task_has_cpu(t) && t->processor != smp_processor_id())
-        smp_send_reschedule(t->processor);
-    spin_unlock(&runqueue_lock);
-#endif /* CONFIG_SMP */
+    if ((t->state == TASK_RUNNING) && (t->cpu != cpu()))
+        kick_if_running(t);
+#endif
     if (t->state & TASK_INTERRUPTIBLE) {
         wake_up_process(t);
diff -Nur linux-2.4.33-imedia/kernel/softirq.c linux-2.4.33-imedia-patching/kernel/softirq.c
--- linux-2.4.33-imedia/kernel/softirq.c 2004-11-17 13:54:22.000000000 +0200
+++ linux-2.4.33-imedia-patching/kernel/softirq.c 2006-01-26 15:19:43.000000000 +0200
@@ -60,7 +60,7 @@
 asmlinkage void do_softirq()
 {
-    int cpu = smp_processor_id();
+    int cpu;
     __u32 pending;
     unsigned long flags;
     __u32 mask;
@@ -70,6 +70,8 @@
     local_irq_save(flags);
+    cpu = smp_processor_id();
+
     pending = softirq_pending(cpu);
     if (pending) {
@@ -99,10 +101,11 @@
         mask &= ~pending;
         goto restart;
     }
-    __local_bh_enable();
     if (pending)
         wakeup_softirqd(cpu);
+
+    __local_bh_enable();
 }
 local_irq_restore(flags);
@@ -151,10 +154,11 @@
 void fastcall __tasklet_schedule(struct tasklet_struct *t)
 {
-    int cpu = smp_processor_id();
+    int cpu;
     unsigned long flags;
     local_irq_save(flags);
+    cpu = smp_processor_id();
     t->next = tasklet_vec[cpu].list;
     tasklet_vec[cpu].list = t;
     cpu_raise_softirq(cpu, TASKLET_SOFTIRQ);
@@ -175,10 +179,11 @@
 static void tasklet_action(struct softirq_action *a)
 {
-    int cpu = smp_processor_id();
+    int cpu;
     struct tasklet_struct *list;
     local_irq_disable();
+    cpu = smp_processor_id();
     list = tasklet_vec[cpu].list;
     tasklet_vec[cpu].list = NULL;
     local_irq_enable();
@@ -209,10 +214,11 @@
 static void tasklet_hi_action(struct softirq_action *a)
 {
-    int cpu = smp_processor_id();
+    int cpu;
     struct tasklet_struct *list;
     local_irq_disable();
+    cpu = smp_processor_id();
     list = tasklet_hi_vec[cpu].list;
     tasklet_hi_vec[cpu].list = NULL;
     local_irq_enable();
@@ -364,13 +370,13 @@
     int cpu = cpu_logical_map(bind_cpu);
     daemonize();
-    current->nice = 19;
+    set_user_nice(current, 19);
     sigfillset(&current->blocked);
     /* Migrate to the right CPU */
-    current->cpus_allowed = 1UL << cpu;
-    while (smp_processor_id() != cpu)
-        schedule();
+    set_cpus_allowed(current, 1UL << cpu);
+    if (cpu() != cpu)
+        BUG();
     sprintf(current->comm, "ksoftirqd_CPU%d", bind_cpu);
@@ -395,7 +401,7 @@
     }
 }
-static __init int spawn_ksoftirqd(void)
+__init int spawn_ksoftirqd(void)
 {
     int cpu;
diff -Nur linux-2.4.33-imedia/kernel/sys.c linux-2.4.33-imedia-patching/kernel/sys.c
--- linux-2.4.33-imedia/kernel/sys.c 2003-11-28 20:26:21.000000000 +0200
+++ linux-2.4.33-imedia-patching/kernel/sys.c 2006-01-26 15:19:43.000000000 +0200
@@ -225,7 +225,7 @@
         error = -ESRCH;
         if (niceval < -20)
             niceval = -20;
-        if (niceval > 19)
+        if (niceval > 18)
             niceval = 19;
         read_lock(&tasklist_lock);
@@ -239,10 +239,13 @@
         }
         if (error == -ESRCH)
             error = 0;
-        if (niceval < p->nice && !capable(CAP_SYS_NICE))
+
if (niceval < task_nice(p) && !capable(CAP_SYS_NICE)) error = -EACCES; - else - p->nice = niceval; + else { + if (niceval > 18 && p->mm) + p->policy = SCHED_BATCH; + set_user_nice(p, niceval); + } } read_unlock(&tasklist_lock); @@ -268,7 +271,7 @@ long niceval; if (!proc_sel(p, which, who)) continue; - niceval = 20 - p->nice; + niceval = 20 - task_nice(p); if (niceval > retval) retval = niceval; } @@ -320,6 +323,7 @@ notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); printk(KERN_EMERG "System halted.\n"); machine_halt(); + unlock_kernel(); do_exit(0); break; @@ -327,6 +331,7 @@ notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); printk(KERN_EMERG "Power down.\n"); machine_power_off(); + unlock_kernel(); do_exit(0); break; diff -Nur linux-2.4.33-imedia/kernel/sysctl.c linux-2.4.33-imedia-patching/kernel/sysctl.c --- linux-2.4.33-imedia/kernel/sysctl.c 2006-01-11 20:29:28.000000000 +0200 +++ linux-2.4.33-imedia-patching/kernel/sysctl.c 2006-01-26 15:19:43.000000000 +0200 @@ -278,6 +278,10 @@ {KERN_EXCEPTION_TRACE,"exception-trace", &exception_trace,sizeof(int),0644,NULL,&proc_dointvec}, #endif +#ifdef CONFIG_LOLAT_SYSCTL + {KERN_LOWLATENCY, "lowlatency", &enable_lowlatency, sizeof (int), + 0644, NULL, &proc_dointvec}, +#endif {0} }; diff -Nur linux-2.4.33-imedia/kernel/timer.c linux-2.4.33-imedia-patching/kernel/timer.c --- linux-2.4.33-imedia/kernel/timer.c 2002-11-29 01:53:15.000000000 +0200 +++ linux-2.4.33-imedia-patching/kernel/timer.c 2006-01-26 15:19:43.000000000 +0200 @@ -25,6 +25,8 @@ #include +struct kernel_stat kstat; + /* * Timekeeping variables */ @@ -598,25 +600,7 @@ int cpu = smp_processor_id(), system = user_tick ^ 1; update_one_process(p, user_tick, system, cpu); - if (p->pid) { - if (--p->counter <= 0) { - p->counter = 0; - /* - * SCHED_FIFO is priority preemption, so this is - * not the place to decide whether to reschedule a - * SCHED_FIFO task or not - Bhavesh Davda - */ - if (p->policy != SCHED_FIFO) { - p->need_resched = 1; - } - } - if (p->nice > 0) - kstat.per_cpu_nice[cpu] += user_tick; - else - kstat.per_cpu_user[cpu] += user_tick; - kstat.per_cpu_system[cpu] += system; - } else if (local_bh_count(cpu) || local_irq_count(cpu) > 1) - kstat.per_cpu_system[cpu] += system; + scheduler_tick(user_tick, system); } /* @@ -624,17 +608,7 @@ */ static unsigned long count_active_tasks(void) { - struct task_struct *p; - unsigned long nr = 0; - - read_lock(&tasklist_lock); - for_each_task(p) { - if ((p->state == TASK_RUNNING || - (p->state & TASK_UNINTERRUPTIBLE))) - nr += FIXED_1; - } - read_unlock(&tasklist_lock); - return nr; + return (nr_running() + nr_uninterruptible()) * FIXED_1; } /* @@ -827,6 +801,89 @@ #endif +static void process_timeout(unsigned long __data) +{ + wake_up_process((task_t *)__data); +} + +/** + * schedule_timeout - sleep until timeout + * @timeout: timeout value in jiffies + * + * Make the current task sleep until @timeout jiffies have + * elapsed. The routine will return immediately unless + * the current task state has been set (see set_current_state()). + * + * You can set the task state as follows - + * + * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to + * pass before the routine returns. The routine will return 0 + * + * %TASK_INTERRUPTIBLE - the routine may return early if a signal is + * delivered to the current task. 
In this case the remaining time + * in jiffies will be returned, or 0 if the timer expired in time + * + * The current task state is guaranteed to be TASK_RUNNING when this + * routine returns. + * + * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule + * the CPU away without a bound on the timeout. In this case the return + * value will be %MAX_SCHEDULE_TIMEOUT. + * + * In all cases the return value is guaranteed to be non-negative. + */ +signed long fastcall schedule_timeout(signed long timeout) +{ + struct timer_list timer; + unsigned long expire; + + switch (timeout) + { + case MAX_SCHEDULE_TIMEOUT: + /* + * These two special cases are useful to be comfortable + * in the caller. Nothing more. We could take + * MAX_SCHEDULE_TIMEOUT from one of the negative value + * but I' d like to return a valid offset (>=0) to allow + * the caller to do everything it want with the retval. + */ + schedule(); + goto out; + default: + /* + * Another bit of PARANOID. Note that the retval will be + * 0 since no piece of kernel is supposed to do a check + * for a negative retval of schedule_timeout() (since it + * should never happens anyway). You just have the printk() + * that will tell you if something is gone wrong and where. + */ + if (timeout < 0) + { + printk(KERN_ERR "schedule_timeout: wrong timeout " + "value %lx from %p\n", timeout, + __builtin_return_address(0)); + current->state = TASK_RUNNING; + goto out; + } + } + + expire = timeout + jiffies; + + init_timer(&timer); + timer.expires = expire; + timer.data = (unsigned long) current; + timer.function = process_timeout; + + add_timer(&timer); + schedule(); + del_timer_sync(&timer); + + timeout = expire - jiffies; + +out: + return timeout < 0 ? 0 : timeout; +} + /* Thread ID - the internal kernel "pid" */ asmlinkage long sys_gettid(void) { @@ -873,4 +930,3 @@ } return 0; } - diff -Nur linux-2.4.33-imedia/lib/dec_and_lock.c linux-2.4.33-imedia-patching/lib/dec_and_lock.c --- linux-2.4.33-imedia/lib/dec_and_lock.c 2001-10-03 19:11:26.000000000 +0300 +++ linux-2.4.33-imedia-patching/lib/dec_and_lock.c 2006-01-26 15:19:43.000000000 +0200 @@ -1,5 +1,6 @@ #include #include +#include #include /* diff -Nur linux-2.4.33-imedia/mm/filemap.c linux-2.4.33-imedia-patching/mm/filemap.c --- linux-2.4.33-imedia/mm/filemap.c 2005-06-01 03:56:56.000000000 +0300 +++ linux-2.4.33-imedia-patching/mm/filemap.c 2006-01-26 15:19:43.000000000 +0200 @@ -185,7 +185,9 @@ { struct list_head *head, *curr; struct page * page; + int ll_count = 100; +restart: head = &inode->i_mapping->clean_pages; spin_lock(&pagemap_lru_lock); @@ -196,6 +198,14 @@ page = list_entry(curr, struct page, list); curr = curr->next; + if (conditional_schedule_needed() && ll_count) { + spin_unlock(&pagecache_lock); + spin_unlock(&pagemap_lru_lock); + unconditional_schedule(); + ll_count--; + goto restart; + } + /* We cannot invalidate something in dirty.. 
*/ if (PageDirty(page)) continue; @@ -259,8 +269,8 @@ page_cache_release(page); } -static int FASTCALL(truncate_list_pages(struct list_head *, unsigned long, unsigned *)); -static int fastcall truncate_list_pages(struct list_head *head, unsigned long start, unsigned *partial) +static int FASTCALL(truncate_list_pages(struct list_head *, unsigned long, unsigned *, int *)); +static int fastcall truncate_list_pages(struct list_head *head, unsigned long start, unsigned *partial, int *restart_count) { struct list_head *curr; struct page * page; @@ -271,6 +281,17 @@ while (curr != head) { unsigned long offset; + if (conditional_schedule_needed() && *restart_count) { + (*restart_count)--; + list_del(head); + list_add(head, curr); /* Restart on this page */ + spin_unlock(&pagecache_lock); + unconditional_schedule(); + spin_lock(&pagecache_lock); + unlocked = 1; + goto restart; + } + page = list_entry(curr, struct page, list); offset = page->index; @@ -303,13 +324,11 @@ } else wait_on_page(page); - page_cache_release(page); - - if (current->need_resched) { - __set_current_state(TASK_RUNNING); - schedule(); + if (LOWLATENCY_NEEDED) { + *restart_count = 4; /* We made progress */ } + page_cache_release(page); spin_lock(&pagecache_lock); goto restart; } @@ -332,13 +351,14 @@ { unsigned long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); + int restart_count = 4; int unlocked; spin_lock(&pagecache_lock); do { - unlocked = truncate_list_pages(&mapping->clean_pages, start, &partial); - unlocked |= truncate_list_pages(&mapping->dirty_pages, start, &partial); - unlocked |= truncate_list_pages(&mapping->locked_pages, start, &partial); + unlocked = truncate_list_pages(&mapping->clean_pages, start, &partial, &restart_count); + unlocked |= truncate_list_pages(&mapping->dirty_pages, start, &partial, &restart_count); + unlocked |= truncate_list_pages(&mapping->locked_pages, start, &partial, &restart_count); } while (unlocked); /* Traversed all three lists without dropping the lock */ spin_unlock(&pagecache_lock); @@ -483,6 +503,7 @@ page_cache_get(page); spin_unlock(&pagecache_lock); + conditional_schedule(); /* sys_msync() (only used by minixfs, udf) */ lock_page(page); /* The buffers could have been free'd while we waited for the page lock */ @@ -612,12 +633,14 @@ list_del(&page->list); list_add(&page->list, &mapping->locked_pages); - if (!PageDirty(page)) - continue; - page_cache_get(page); spin_unlock(&pagecache_lock); + conditional_schedule(); /* sys_msync() */ + + if (!PageDirty(page)) + goto clean; + lock_page(page); if (PageDirty(page)) { @@ -628,7 +651,7 @@ ret = err; } else UnlockPage(page); - +clean: page_cache_release(page); spin_lock(&pagecache_lock); } @@ -646,7 +669,8 @@ int filemap_fdatawait(struct address_space * mapping) { int ret = 0; - + DEFINE_RESCHED_COUNT; +restart: spin_lock(&pagecache_lock); while (!list_empty(&mapping->locked_pages)) { @@ -655,6 +679,17 @@ list_del(&page->list); list_add(&page->list, &mapping->clean_pages); + if (TEST_RESCHED_COUNT(32)) { + RESET_RESCHED_COUNT(); + if (conditional_schedule_needed()) { + page_cache_get(page); + spin_unlock(&pagecache_lock); + unconditional_schedule(); + page_cache_release(page); + goto restart; + } + } + if (!PageLocked(page)) continue; @@ -764,8 +799,10 @@ spin_lock(&pagecache_lock); page = __find_page_nolock(mapping, offset, *hash); spin_unlock(&pagecache_lock); - if (page) + if (page) { + conditional_schedule(); return 0; + } page = page_cache_alloc(mapping); if (!page) @@ 
-1035,6 +1072,11 @@ * the hash-list needs a held write-lock. */ repeat: + if (conditional_schedule_needed()) { + spin_unlock(&pagecache_lock); + unconditional_schedule(); + spin_lock(&pagecache_lock); + } page = __find_page_nolock(mapping, offset, hash); if (page) { page_cache_get(page); @@ -1488,6 +1530,8 @@ page_cache_get(page); spin_unlock(&pagecache_lock); + conditional_schedule(); /* sys_read() */ + if (!Page_Uptodate(page)) goto page_not_up_to_date; generic_file_readahead(reada_ok, filp, inode, page); @@ -2247,6 +2291,12 @@ address += PAGE_SIZE; pte++; } while (address && (address < end)); + + if (conditional_schedule_needed()) { + spin_unlock(&vma->vm_mm->page_table_lock); + unconditional_schedule(); /* syncing large mapped files */ + spin_lock(&vma->vm_mm->page_table_lock); + } return error; } @@ -2658,7 +2708,9 @@ if (vma->vm_flags & VM_LOCKED) return -EINVAL; - zap_page_range(vma->vm_mm, start, end - start); + zap_page_range(vma->vm_mm, start, end - start, + ZPR_COND_RESCHED); /* sys_madvise(MADV_DONTNEED) */ + return 0; } @@ -3228,6 +3280,9 @@ goto sync_failure; page_fault = __copy_from_user(kaddr+offset, buf, bytes); flush_dcache_page(page); + + conditional_schedule(); + status = mapping->a_ops->commit_write(file, page, offset, offset+bytes); if (page_fault) goto fail_write; diff -Nur linux-2.4.33-imedia/mm/memory.c linux-2.4.33-imedia-patching/mm/memory.c --- linux-2.4.33-imedia/mm/memory.c 2005-04-04 04:42:20.000000000 +0300 +++ linux-2.4.33-imedia-patching/mm/memory.c 2006-01-26 15:19:43.000000000 +0200 @@ -357,7 +357,7 @@ /* * remove user pages in a given range. */ -void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size) +static void do_zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size) { mmu_gather_t *tlb; pgd_t * dir; @@ -478,6 +478,10 @@ struct page *map; while (!(map = follow_page(mm, start, write))) { spin_unlock(&mm->page_table_lock); + + /* Pinning down many physical pages (kiobufs, mlockall) */ + conditional_schedule(); + switch (handle_mm_fault(mm, vma, start, write)) { case 1: tsk->min_flt++; @@ -641,6 +645,21 @@ iobuf->locked = 0; } +#define MAX_ZAP_BYTES 256*PAGE_SIZE + +void zap_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, int actions) +{ + while (size) { + unsigned long chunk = size; + if (actions & ZPR_COND_RESCHED && chunk > MAX_ZAP_BYTES) + chunk = MAX_ZAP_BYTES; + do_zap_page_range(mm, address, chunk); + if (actions & ZPR_COND_RESCHED) + conditional_schedule(); + address += chunk; + size -= chunk; + } +} /* * Lock down all of the pages of a kiovec for IO. @@ -750,11 +769,18 @@ return 0; } -static inline void zeromap_pte_range(pte_t * pte, unsigned long address, - unsigned long size, pgprot_t prot) +static inline void zeromap_pte_range(struct mm_struct *mm, pte_t * pte, + unsigned long address, unsigned long size, + pgprot_t prot) { unsigned long end; + if (conditional_schedule_needed()) { + spin_unlock(&mm->page_table_lock); + unconditional_schedule(); /* mmap(/dev/zero) */ + spin_lock(&mm->page_table_lock); + } + address &= ~PMD_MASK; end = address + size; if (end > PMD_SIZE) @@ -782,7 +808,7 @@ pte_t * pte = pte_alloc(mm, pmd, address); if (!pte) return -ENOMEM; - zeromap_pte_range(pte, address, end - address, prot); + zeromap_pte_range(mm, pte, address, end - address, prot); address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); @@ -1017,7 +1043,7 @@ /* mapping wholly truncated? 
*/ if (mpnt->vm_pgoff >= pgoff) { - zap_page_range(mm, start, len); + zap_page_range(mm, start, len, 0); continue; } @@ -1030,7 +1056,7 @@ /* Ok, partially affected.. */ start += diff << PAGE_SHIFT; len = (len - diff) << PAGE_SHIFT; - zap_page_range(mm, start, len); + zap_page_range(mm, start, len, 0); } while ((mpnt = mpnt->vm_next_share) != NULL); } diff -Nur linux-2.4.33-imedia/mm/mmap.c linux-2.4.33-imedia-patching/mm/mmap.c --- linux-2.4.33-imedia/mm/mmap.c 2005-01-19 16:10:13.000000000 +0200 +++ linux-2.4.33-imedia-patching/mm/mmap.c 2006-01-26 15:19:43.000000000 +0200 @@ -600,7 +600,7 @@ fput(file); /* Undo any partial mapping done by a device driver. */ - zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start); + zap_page_range(mm, vma->vm_start, vma->vm_end - vma->vm_start, 0); free_vma: kmem_cache_free(vm_area_cachep, vma); return error; @@ -1000,7 +1000,7 @@ remove_shared_vm_struct(mpnt); mm->map_count--; - zap_page_range(mm, st, size); + zap_page_range(mm, st, size, ZPR_COND_RESCHED); /* sys_munmap() */ /* * Fix the mapping, and free the old area if it wasn't reused. @@ -1175,7 +1175,7 @@ } mm->map_count--; remove_shared_vm_struct(mpnt); - zap_page_range(mm, start, size); + zap_page_range(mm, start, size, ZPR_COND_RESCHED); /* sys_exit() */ if (mpnt->vm_file) fput(mpnt->vm_file); kmem_cache_free(vm_area_cachep, mpnt); diff -Nur linux-2.4.33-imedia/mm/mremap.c linux-2.4.33-imedia-patching/mm/mremap.c --- linux-2.4.33-imedia/mm/mremap.c 2005-01-19 16:10:13.000000000 +0200 +++ linux-2.4.33-imedia-patching/mm/mremap.c 2006-01-26 15:19:43.000000000 +0200 @@ -122,7 +122,7 @@ flush_cache_range(mm, new_addr, new_addr + len); while ((offset += PAGE_SIZE) < len) move_one_page(mm, new_addr + offset, old_addr + offset); - zap_page_range(mm, new_addr, len); + zap_page_range(mm, new_addr, len, 0); return -1; } diff -Nur linux-2.4.33-imedia/mm/oom_kill.c linux-2.4.33-imedia-patching/mm/oom_kill.c --- linux-2.4.33-imedia/mm/oom_kill.c 2004-11-17 13:54:22.000000000 +0200 +++ linux-2.4.33-imedia-patching/mm/oom_kill.c 2006-01-26 15:19:43.000000000 +0200 @@ -86,7 +86,7 @@ * Niced processes are most likely less important, so double * their badness points. */ - if (p->nice > 0) + if (task_nice(p) > 0) points *= 2; /* @@ -150,7 +150,7 @@ * all the memory it needs. That way it should be able to * exit() and clear out its resources quickly... */ - p->counter = 5 * HZ; + p->time_slice = HZ; p->flags |= PF_MEMALLOC | PF_MEMDIE; /* This process has hardware access, be more careful. */ diff -Nur linux-2.4.33-imedia/mm/slab.c linux-2.4.33-imedia-patching/mm/slab.c --- linux-2.4.33-imedia/mm/slab.c 2004-11-17 13:54:22.000000000 +0200 +++ linux-2.4.33-imedia-patching/mm/slab.c 2006-01-26 15:19:43.000000000 +0200 @@ -49,7 +49,8 @@ * constructors and destructors are called without any locking. * Several members in kmem_cache_t and slab_t never change, they * are accessed without any locking. - * The per-cpu arrays are never accessed from the wrong cpu, no locking. + * The per-cpu arrays are never accessed from the wrong cpu, no locking, + * and local interrupts are disabled so slab code is preempt-safe. * The non-constant members are protected with a per-cache irq spinlock. 
* * Further notes from the original documentation: @@ -858,12 +859,14 @@ */ static void smp_call_function_all_cpus(void (*func) (void *arg), void *arg) { + preempt_disable(); local_irq_disable(); func(arg); local_irq_enable(); if (smp_call_function(func, arg, 1, 1)) BUG(); + preempt_enable(); } typedef struct ccupdate_struct_s { @@ -935,6 +938,7 @@ list_del(&slabp->list); spin_unlock_irq(&cachep->spinlock); + conditional_schedule(); kmem_slab_destroy(cachep, slabp); ret++; spin_lock_irq(&cachep->spinlock); @@ -1851,6 +1855,7 @@ */ spin_unlock_irq(&best_cachep->spinlock); kmem_slab_destroy(best_cachep, slabp); + conditional_schedule(); /* try_to_free_pages() */ spin_lock_irq(&best_cachep->spinlock); } spin_unlock_irq(&best_cachep->spinlock); diff -Nur linux-2.4.33-imedia/mm/swapfile.c linux-2.4.33-imedia-patching/mm/swapfile.c --- linux-2.4.33-imedia/mm/swapfile.c 2005-04-04 04:42:20.000000000 +0300 +++ linux-2.4.33-imedia-patching/mm/swapfile.c 2006-01-26 15:19:43.000000000 +0200 @@ -834,7 +834,7 @@ len += sprintf(buf + len, "partition\t"); usedswap = 0; - for (j = 0; j < ptr->max; ++j) + for (j = 0; j < ptr->max; ++j) { switch (ptr->swap_map[j]) { case SWAP_MAP_BAD: case 0: @@ -842,6 +842,8 @@ default: usedswap++; } + conditional_schedule(); + } len += sprintf(buf + len, "%d\t%d\t%d\n", ptr->pages << (PAGE_SHIFT - 10), usedswap << (PAGE_SHIFT - 10), ptr->prio); } @@ -1140,6 +1142,11 @@ if (swap_info[i].flags != SWP_USED) continue; for (j = 0; j < swap_info[i].max; ++j) { + if (conditional_schedule_needed()) { + swap_list_unlock(); + conditional_schedule(); + swap_list_lock(); + } switch (swap_info[i].swap_map[j]) { case 0: case SWAP_MAP_BAD: diff -Nur linux-2.4.33-imedia/mm/vmscan.c linux-2.4.33-imedia-patching/mm/vmscan.c --- linux-2.4.33-imedia/mm/vmscan.c 2005-11-16 21:12:54.000000000 +0200 +++ linux-2.4.33-imedia-patching/mm/vmscan.c 2006-01-26 15:19:43.000000000 +0200 @@ -210,6 +210,7 @@ { pte_t * pte; unsigned long pmd_end; + DEFINE_RESCHED_COUNT; if (pmd_none(*dir)) return count; @@ -235,11 +236,17 @@ address += PAGE_SIZE; break; } + if (TEST_RESCHED_COUNT(4)) { + if (conditional_schedule_needed()) + goto out; + RESET_RESCHED_COUNT(); + } } } address += PAGE_SIZE; pte++; } while (address && (address < end)); +out: mm->swap_address = address; return count; } @@ -268,6 +275,8 @@ count = swap_out_pmd(mm, vma, pmd, address, end, count, classzone); if (!count) break; + if (conditional_schedule_needed()) + return count; address = (address + PMD_SIZE) & PMD_MASK; pmd++; } while (address && (address < end)); @@ -292,6 +301,8 @@ count = swap_out_pgd(mm, vma, pgdir, address, end, count, classzone); if (!count) break; + if (conditional_schedule_needed()) + return count; address = (address + PGDIR_SIZE) & PGDIR_MASK; pgdir++; } while (address && (address < end)); @@ -313,6 +324,7 @@ * Find the proper vm-area after freezing the vma chain * and ptes. 
*/
+continue_scan:
     spin_lock(&mm->page_table_lock);
     address = mm->swap_address;
     if (address == TASK_SIZE || swap_mm != mm) {
@@ -330,6 +342,12 @@
             vma = vma->vm_next;
             if (!vma)
                 break;
+            if (conditional_schedule_needed()) {    /* Scanning a large vma */
+                spin_unlock(&mm->page_table_lock);
+                unconditional_schedule();
+                /* Continue from where we left off */
+                goto continue_scan;
+            }
             if (!count)
                 goto out_unlock;
             address = vma->vm_start;
diff -Nur linux-2.4.33-imedia/net/bluetooth/bnep/core.c linux-2.4.33-imedia-patching/net/bluetooth/bnep/core.c
--- linux-2.4.33-imedia/net/bluetooth/bnep/core.c 2004-08-08 02:26:06.000000000 +0300
+++ linux-2.4.33-imedia-patching/net/bluetooth/bnep/core.c 2006-01-26 15:19:43.000000000 +0200
@@ -479,7 +479,7 @@
     sigfillset(&current->blocked);
     flush_signals(current);
-    current->nice = -15;
+    set_user_nice(current, -15);
     set_fs(KERNEL_DS);
diff -Nur linux-2.4.33-imedia/net/bluetooth/cmtp/core.c linux-2.4.33-imedia-patching/net/bluetooth/cmtp/core.c
--- linux-2.4.33-imedia/net/bluetooth/cmtp/core.c 2003-08-25 14:44:44.000000000 +0300
+++ linux-2.4.33-imedia-patching/net/bluetooth/cmtp/core.c 2006-01-26 15:19:43.000000000 +0200
@@ -298,7 +298,7 @@
     sigfillset(&current->blocked);
     flush_signals(current);
-    current->nice = -15;
+    set_user_nice(current, -15);
     set_fs(KERNEL_DS);
diff -Nur linux-2.4.33-imedia/net/core/dev.c linux-2.4.33-imedia-patching/net/core/dev.c
--- linux-2.4.33-imedia/net/core/dev.c 2005-04-04 04:42:20.000000000 +0300
+++ linux-2.4.33-imedia-patching/net/core/dev.c 2006-01-26 15:19:43.000000000 +0200
@@ -1093,9 +1093,15 @@
         int cpu = smp_processor_id();
         if (dev->xmit_lock_owner != cpu) {
+            /*
+             * The spin_lock effectively does a preempt lock, but
+             * we are about to drop that...
+             */
+            preempt_disable();
             spin_unlock(&dev->queue_lock);
             spin_lock(&dev->xmit_lock);
             dev->xmit_lock_owner = cpu;
+            preempt_enable();
             if (!netif_queue_stopped(dev)) {
                 if (netdev_nit)
@@ -1274,7 +1280,7 @@
 int netif_rx(struct sk_buff *skb)
 {
-    int this_cpu = smp_processor_id();
+    int this_cpu;
     struct softnet_data *queue;
     unsigned long flags;
@@ -1284,9 +1290,10 @@
     /* The code is rearranged so that the path is the most short when CPU is congested, but is still operating.
*/ - queue = &softnet_data[this_cpu]; local_irq_save(flags); + this_cpu = smp_processor_id(); + queue = &softnet_data[this_cpu]; netdev_rx_stat[this_cpu].total++; if (queue->input_pkt_queue.qlen <= netdev_max_backlog) { diff -Nur linux-2.4.33-imedia/net/core/iovec.c linux-2.4.33-imedia-patching/net/core/iovec.c --- linux-2.4.33-imedia/net/core/iovec.c 2001-09-10 17:57:00.000000000 +0300 +++ linux-2.4.33-imedia-patching/net/core/iovec.c 2006-01-26 15:19:43.000000000 +0200 @@ -88,7 +88,7 @@ if(iov->iov_len) { int copy = min_t(unsigned int, iov->iov_len, len); - if (copy_to_user(iov->iov_base, kdata, copy)) + if (ll_copy_to_user(iov->iov_base, kdata, copy)) goto out; kdata+=copy; len-=copy; diff -Nur linux-2.4.33-imedia/net/core/skbuff.c linux-2.4.33-imedia-patching/net/core/skbuff.c --- linux-2.4.33-imedia/net/core/skbuff.c 2003-08-25 14:44:44.000000000 +0300 +++ linux-2.4.33-imedia-patching/net/core/skbuff.c 2006-01-26 15:19:43.000000000 +0200 @@ -111,33 +111,37 @@ static __inline__ struct sk_buff *skb_head_from_pool(void) { - struct sk_buff_head *list = &skb_head_pool[smp_processor_id()].list; + struct sk_buff_head *list; + struct sk_buff *skb = NULL; + unsigned long flags; - if (skb_queue_len(list)) { - struct sk_buff *skb; - unsigned long flags; + local_irq_save(flags); - local_irq_save(flags); + list = &skb_head_pool[smp_processor_id()].list; + + if (skb_queue_len(list)) skb = __skb_dequeue(list); - local_irq_restore(flags); - return skb; - } - return NULL; + + local_irq_restore(flags); + return skb; } static __inline__ void skb_head_to_pool(struct sk_buff *skb) { - struct sk_buff_head *list = &skb_head_pool[smp_processor_id()].list; + struct sk_buff_head *list; + unsigned long flags; - if (skb_queue_len(list) < sysctl_hot_list_len) { - unsigned long flags; + local_irq_save(flags); + list = &skb_head_pool[smp_processor_id()].list; - local_irq_save(flags); + if (skb_queue_len(list) < sysctl_hot_list_len) { __skb_queue_head(list, skb); local_irq_restore(flags); return; } + + local_irq_restore(flags); kmem_cache_free(skbuff_head_cache, skb); } diff -Nur linux-2.4.33-imedia/net/ipv4/tcp_minisocks.c linux-2.4.33-imedia-patching/net/ipv4/tcp_minisocks.c --- linux-2.4.33-imedia/net/ipv4/tcp_minisocks.c 2004-11-17 13:54:22.000000000 +0200 +++ linux-2.4.33-imedia-patching/net/ipv4/tcp_minisocks.c 2006-01-26 15:19:43.000000000 +0200 @@ -433,6 +433,9 @@ { struct tcp_tw_bucket *tw; int killed = 0; +#if LOWLATENCY_NEEDED + int max_killed = 0; +#endif /* NOTE: compare this to previous version where lock * was released after detaching chain. It was racy, @@ -446,6 +449,13 @@ goto out; while((tw = tcp_tw_death_row[tcp_tw_death_row_slot]) != NULL) { +#if LOWLATENCY_NEEDED + /* This loop takes ~6 usecs per iteration. 
*/ + if (killed > 100) { + max_killed = 1; + break; + } +#endif tcp_tw_death_row[tcp_tw_death_row_slot] = tw->next_death; if (tw->next_death) tw->next_death->pprev_death = tw->pprev_death; @@ -458,12 +468,24 @@ killed++; spin_lock(&tw_death_lock); + + } + +#if LOWLATENCY_NEEDED + if (max_killed) { /* More to do: do it soon */ + mod_timer(&tcp_tw_timer, jiffies+2); + tcp_tw_count -= killed; + } + else +#endif + { + tcp_tw_death_row_slot = + ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1)); + + if ((tcp_tw_count -= killed) != 0) + mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD); } - tcp_tw_death_row_slot = - ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1)); - if ((tcp_tw_count -= killed) != 0) - mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD); net_statistics[smp_processor_id()*2].TimeWaited += killed; out: spin_unlock(&tw_death_lock); diff -Nur linux-2.4.33-imedia/net/socket.c linux-2.4.33-imedia-patching/net/socket.c --- linux-2.4.33-imedia/net/socket.c 2005-01-19 16:10:14.000000000 +0200 +++ linux-2.4.33-imedia-patching/net/socket.c 2006-01-26 15:19:43.000000000 +0200 @@ -132,7 +132,7 @@ static struct net_proto_family *net_families[NPROTO]; -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT) static atomic_t net_family_lockct = ATOMIC_INIT(0); static spinlock_t net_family_lock = SPIN_LOCK_UNLOCKED; diff -Nur linux-2.4.33-imedia/net/sunrpc/pmap_clnt.c linux-2.4.33-imedia-patching/net/sunrpc/pmap_clnt.c --- linux-2.4.33-imedia/net/sunrpc/pmap_clnt.c 2002-08-03 03:39:46.000000000 +0300 +++ linux-2.4.33-imedia-patching/net/sunrpc/pmap_clnt.c 2006-01-26 15:19:43.000000000 +0200 @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include
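For reference, the lock-break idiom that the mm/ and net/ hunks above keep applying
can be distilled as follows. This is an illustrative sketch only, not code from the
patch: the lock, list, item and function names (my_lock, my_list, struct my_item,
scan_items) are made up for the example, and only conditional_schedule_needed() and
unconditional_schedule() come from the low-latency patch itself.

	#include <linux/list.h>
	#include <linux/spinlock.h>
	#include <linux/sched.h>

	static spinlock_t my_lock = SPIN_LOCK_UNLOCKED;
	static LIST_HEAD(my_list);

	struct my_item {
		struct list_head list;
		int dirty;
	};

	static void scan_items(void)
	{
		struct list_head *curr;
		int restarts = 100;	/* bound the lock breaks, like ll_count above */

	restart:
		spin_lock(&my_lock);
		list_for_each(curr, &my_list) {
			struct my_item *item = list_entry(curr, struct my_item, list);

			if (conditional_schedule_needed() && restarts) {
				/*
				 * A higher priority task wants the CPU: drop the
				 * lock, schedule, then rescan from the head since
				 * the list may have changed while we slept.
				 */
				restarts--;
				spin_unlock(&my_lock);
				unconditional_schedule();
				goto restart;
			}
			item->dirty = 0;
		}
		spin_unlock(&my_lock);
	}

The restart counter matters: without it, a scan of a list that other CPUs keep
refilling could break the lock and restart forever, which is why invalidate_inode_pages()
and truncate_list_pages() above cap the number of restarts as well.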