/*
 *  linux/kernel/rtsched.h
 *
 *  NOTE: This is a .h file that is mostly source, not the usual convention.
 *  It is coded this way to allow the depend rules to correctly set
 *  up the make file dependencies.  This is an alternate scheduler
 *  that replaces the core scheduler in sched.c.  It does not, however,
 *  replace most of the static support functions that call schedule.
 *  By making this an include file for sched.c, all of those functions
 *  are retained without the need for duplicate code and its attendant
 *  support issues.  At the same time, keeping it a separate file allows
 *  diff and patch to work most cleanly and correctly.
 *
 *  Kernel scheduler and related syscalls
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *  Copyright (C) 2000, 2001  MontaVista Software Inc.
 *
 *  1998-12-28  Implemented better SMP scheduling by Ingo Molnar
 *  2000-03-15  Added the Real Time run queue support by George Anzinger
 *  2000-08-29  Added code to do lazy recalculation of counters
 *              by George Anzinger
 */

/*
 * 'sched.c' is the main kernel file.  It contains scheduling primitives
 * (sleep_on, wakeup, schedule etc) as well as a number of simple system
 * call functions (type getpid()), which just extract a field from
 * current-task.
 */

#ifndef preempt_disable
#define preempt_disable()
#define preempt_enable()
#define preempt_is_disabled() 0
#define preempt_enable_no_resched()
#endif

/*
 * scheduler variables
 */

#define VERSION_DATE "<20011203.1609.50>"

/*
 * We align per-CPU scheduling data on cacheline boundaries,
 * to prevent cacheline ping-pong.
 */
static union {
        struct schedule_data {
                struct task_struct *curr;
                cycles_t last_schedule;
                struct list_head schedule_data_list;
                int cpu, effprio;
        } schedule_data;
        char __pad[SMP_CACHE_BYTES];
} aligned_data[NR_CPUS] __cacheline_aligned = { {{&init_task, 0, {0, 0}, 0, 0}}};

#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr

static void newprio_ready_q(struct task_struct *tptr, int newprio);
#ifdef CONFIG_SMP
static void newprio_executing(struct task_struct *tptr, int newprio);
static struct list_head hed_cpu_prio __cacheline_aligned =
        LIST_HEAD_INIT(hed_cpu_prio);
#endif

/*
 * task_on_rq tests for a task actually in the ready queue.
 * task_on_runqueue tests for a task either on the ready queue or being
 * executed (by virtue of our setting a running task's run_list.next to 1).
 */
#define task_on_rq(p) ((unsigned)p->run_list.next > 1)

static struct list_head rq[MAX_PRI+1] ____cacheline_aligned;

static struct ready_queue {
        int recalc;     /* # of counter recalculations on SCHED_OTHER */
        int ticks;      /* # of ticks for all in SCHED_OTHER ready Q */
} runq ____cacheline_aligned;

/*
 * Set the bit map up with guard bits below.  This will result in
 * priority -1 if there are no tasks in the ready queue, which will
 * happen as we are not putting the idle tasks in the ready queue.
 */
static struct {
        int guard;
        int rq_bit_ary[(MAX_PRI/32) + 1];
} rq_bits = {-1, {0, 0, 0, 0}};

#define rq_bit_map rq_bits.rq_bit_ary

static int high_prio = 0;

#define Rdy_Q_Hed(pri) &rq[pri]

#define PREEMPTION_THRESHOLD 1
#define NOT_RT 0                /* Use priority zero for non-RT processes */

#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule

struct kernel_stat kstat;
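/*
 * Illustrative sketch (not compiled) of how the priority bit map and the
 * guard word cooperate.  Priority p lives in word p>>5, at bit 31-(p&0x1f),
 * so within a word the highest priority occupies the lowest bit number.
 * The scan in get_next_task() walks the words downward; when every word is
 * zero it runs into the guard (-1, all bits set) and the arithmetic leaves
 * the result at -1, below even the lowest real priority.  The helper name
 * and the worked numbers below are examples only, assuming MAX_PRI of 127
 * and 32-bit words.
 */
#if 0
static int example_scan_bit_map(void)
{
        int *word, prio;

        /* Start at the top word; 31 + 3*32 == 127 is the highest priority. */
        for (word = &rq_bit_map[MAX_PRI/32], prio = 31 + (MAX_PRI/32)*32;
             *word == 0; prio -= 32, word--)
                ;                       /* falls off onto rq_bits.guard */

        /* ffz(~*word) is the lowest set bit: the highest priority present. */
        prio -= ffz(~*word);

        /*
         * All queues empty: four zero words are skipped, prio reaches
         * 127 - 128 == -1, the guard stops the loop and ffz(~(-1)) == 0,
         * so -1 is returned.  With only priority 5 queued, word 0 has bit
         * 26 set, the loop stops with prio == 31 and 31 - ffz(~word0) == 5.
         */
        return prio;
}
#endif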
#ifdef CONFIG_SMP

/*
 * At the moment, we will ignore cpus_allowed, primarily because if it were
 * used, we would have a conflict in the runq.ticks count (i.e. since we
 * are not scheduling some tasks, the count would not reflect what is
 * really on the list).  Oh, and also, nowhere is there code in the
 * kernel to set cpus_allowed to anything but -1.  In the long run, we
 * would like to try separate lists for each cpu, at which point
 * cpus_allowed could be used to direct the task to the proper list.
 * Well, darn, now there is code that messes with cpus_allowed.  We will
 * change sometime soon....
 */
#define idle_task(cpu) (init_tasks[cpu_number_map(cpu)])
#define can_schedule(p,cpu) \
        ((p)->cpus_runnable & (p)->cpus_allowed & (1 << cpu))

#else

#define idle_task(cpu) (&init_task)
#define can_schedule(p,cpu) (1)

#endif

void scheduling_functions_start_here(void) { }

/*
 * This is the function that decides how desirable a process is..
 * You can weigh different processes against each other depending
 * on what CPU they've run on lately etc to try to handle cache
 * and TLB miss penalties.
 *
 * Return values:
 *       -1000: never select this
 *           0: out of time, recalculate counters (but it might still be
 *              selected)
 *         +ve: "goodness" value (the larger, the better)
 */
static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
{
        int weight;

        /*
         * goodness() is NEVER called for real time processes!
         * For a real time process we select the first one on the
         * runqueue (taking priorities within processes into account).
         */

        /*
         * Give the process a first-approximation goodness value
         * according to the number of clock-ticks it has left.
         *
         * Don't do any other calculations if the time slice is
         * over or if this is an idle task.
         */
        weight = p->counter;
        if (weight <= 0)
                goto out;

#ifdef CONFIG_SMP
        /* Give a largish advantage to the same processor...   */
        /* (this is equivalent to penalizing other processors) */
        if (p->processor == this_cpu)
                weight += PROC_CHANGE_PENALTY;
#endif

        /* .. and a slight advantage to the current MM */
        if (p->mm == this_mm || !p->mm)
                weight += 1;
        weight += 20 - p->nice;

out:
        return weight;
}

/*
 * the 'goodness value' of replacing a process on a given CPU.
 * positive value means 'replace', zero or negative means 'don't'.
 */
static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu)
{
        return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm);
}
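/*
 * Illustrative sketch (not compiled) of how goodness() ranks two
 * SCHED_OTHER contenders.  The helper and the numbers in the comment are
 * examples only; PROC_CHANGE_PENALTY is the arch-defined value and
 * NICE_TO_TICKS depends on HZ.
 */
#if 0
static int example_prefer(struct task_struct *a, struct task_struct *b,
                          int this_cpu, struct mm_struct *this_mm)
{
        /*
         * A task with counter == 6, nice == 0 and mm == this_mm scores
         * 6 + 20 + 1, plus PROC_CHANGE_PENALTY if it last ran on this_cpu.
         * A task whose counter has run down to 0 scores 0, which is what
         * pushes get_next_task() into a recalculation pass, and the idle
         * task is never scored at all.
         */
        return goodness(a, this_cpu, this_mm) >= goodness(b, this_cpu, this_mm);
}
#endif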
/*
 * This is ugly, but reschedule_idle() is very timing-critical.
 * We are called with the runqueue spinlock held and we must
 * not claim the tasklist_lock.
 */
static FASTCALL(void reschedule_idle(struct task_struct * p));

static void reschedule_idle(struct task_struct * p)
{
#ifdef CONFIG_SMP
        int this_cpu = smp_processor_id(), target_cpu;
        struct task_struct *target_tsk;
        struct list_head *cptr;
        struct schedule_data *sch;
        int best_cpu;

        /*
         * shortcut if the woken up task's last CPU is
         * idle now.
         */
        best_cpu = p->processor;
        target_tsk = idle_task(best_cpu);
        if (cpu_curr(best_cpu) == target_tsk)
                goto preempt_now;

        /*
         * For real time, the choice is simple.  We just check
         * if the most available processor is working on a lower
         * priority task.  If so we bounce it, if not, there is
         * nothing more important than what we are doing.
         * Note that this will pick up any idle cpu(s) we may
         * have as they will have an effprio of -1.
         */
        cptr = hed_cpu_prio.prev;
        sch = list_entry(cptr, struct schedule_data, schedule_data_list);
        target_tsk = sch->curr;
        if (p->effprio > sch->effprio) {
                goto preempt_now;
        }
        /*
         * If all cpus are doing real time and we failed
         * above, then there is no help for this task.
         */
        if (sch->effprio)
                goto out_no_target;
        /*
         * Non-real time contender and one or more processors
         * doing non-real time things.  So we have a non-real
         * time task contending among other non-real time tasks
         * on one or more processors.  We know we have no idle cpus.
         */
        /*
         * No CPU is idle, but maybe this process has enough priority
         * to preempt its preferred CPU.
         */
        target_tsk = cpu_curr(best_cpu);
        if (target_tsk->effprio == 0 &&
            preemption_goodness(target_tsk, p, best_cpu) > 0)
                goto preempt_now;

        for (; cptr != &hed_cpu_prio; cptr = cptr->prev) {
                sch = list_entry(cptr, struct schedule_data, schedule_data_list);
                if (sch->effprio != 0)
                        break;
                if (sch->cpu != best_cpu) {
                        target_tsk = sch->curr;
                        if (preemption_goodness(target_tsk, p, sch->cpu) >
                            PREEMPTION_THRESHOLD)
                                goto preempt_now;
                }
        }

out_no_target:
        return;

preempt_now:
        target_cpu = target_tsk->processor;
        target_tsk->need_resched = 1;
        /*
         * the APIC stuff can go outside of the lock because
         * it uses no task information, only CPU#.
         */
        if ((target_cpu != this_cpu) &&
            (target_tsk != idle_task(target_cpu)))
                smp_send_reschedule(target_cpu);
        return;
#else /* UP */
        struct task_struct *tsk;

        tsk = cpu_curr(0);
        if ((high_prio > tsk->effprio) ||
            (!tsk->effprio &&
             preemption_goodness(tsk, p, 0) > PREEMPTION_THRESHOLD)) {
                tsk->need_resched = 1;
        }
#endif
}

/*
 * This routine maintains the list of smp processors.  This is
 * a bidirectional list maintained in priority order.  The above
 * code uses this list to find a processor to use for a new task.
 * The search will be backward through the list as we want to take
 * the lowest priority cpu first.  We put equal priorities such that
 * the new one will be ahead of the old, so the new should stay
 * around a bit longer.
 */
#ifdef CONFIG_SMP
static inline void re_queue_cpu(struct task_struct *next,
                                struct schedule_data *sch)
{
        struct list_head *cpuptr;

        list_del(&sch->schedule_data_list);
        sch->effprio = next->effprio;
        cpuptr = hed_cpu_prio.next;
        while (cpuptr != &hed_cpu_prio &&
               sch->effprio <
               list_entry(cpuptr, struct schedule_data, schedule_data_list)->effprio)
                cpuptr = cpuptr->next;
        list_add_tail(&sch->schedule_data_list, cpuptr);
        next->newprio = &newprio_executing;
}
#else
#define re_queue_cpu(a,b)
#endif

/*
 * Careful!
 *
 * This has to add the process to the _beginning_ of the
 * run-queue, not the end.  See the comment about "This is
 * subtle" in the scheduler proper..
 *
 * For real time tasks we do this a bit differently.  We
 * keep a priority list of ready tasks.  We remove tasks
 * from this list when they are running, so a running real
 * time task will not be in either the ready list or the run
 * queue.  Also, in the name of speed and real time, only
 * priority is important, so we spend a few bytes on the queue.
 * We have a doubly linked list for each priority.  This makes
 * insertion and removal very fast.  We also keep a bit map of
 * the priority queues where a bit says if the queue is empty
 * or not.  We also keep loose track of the highest priority
 * queue that is currently occupied.  This high_prio mark
 * is updated when a higher priority task enters the ready
 * queue and only goes down when we look for a task in the
 * ready queue at high_prio and find none.  Then, and only
 * then, we examine the bit map to find the true high_prio.
 */
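/*
 * Illustrative sketch (not compiled) of the high_prio watermark policy
 * described above: insertion raises the mark eagerly, while removal never
 * lowers it; the mark only falls when get_next_task() looks at
 * rq[high_prio], finds it empty, and rescans the bit map.  The helper and
 * the priorities below are examples only.
 */
#if 0
static void example_watermark(struct task_struct *rt50, struct task_struct *rt90)
{
        /* Queueing a priority 90 task above a priority 50 one: */
        add_to_runqueue(rt50, 0);       /* high_prio becomes 50 */
        add_to_runqueue(rt90, 0);       /* high_prio becomes 90 */

        /*
         * When the priority 90 task is dispatched it is deleted from
         * rq[90], but high_prio stays at 90.  The next call to
         * get_next_task() sees rq[90] empty and walks the bit map, which
         * brings high_prio back down to 50.
         */
}
#endif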
#define BF 31                   /* bit flip constant */
#define set_rq_bit(bit)   set_bit(BF-((bit)&0x1f), &rq_bit_map[(bit) >> 5])
#define clear_rq_bit(bit) clear_bit(BF-((bit)&0x1f), &rq_bit_map[(bit) >> 5])

static inline void _del_from_runqueue(struct task_struct * p)
{
        nr_running--;
        list_del(&p->run_list);
        if (list_empty(Rdy_Q_Hed(p->effprio))) {
                clear_rq_bit(p->effprio);
        }
        /* p->run_list.next = NULL; != 0 prevents requeue */
        p->run_list.next = NULL;
        p->newprio = NULL;
        if (!p->effprio)
                runq.ticks -= p->counter;
}

/* Exported for main.c, also used in init code here */
void __del_from_runqueue(struct task_struct * p)
{
        _del_from_runqueue(p);
}

static inline struct task_struct * get_next_task(struct task_struct * prev,
                                                 int this_cpu)
{
        struct list_head *next, *rqptr;
        struct task_struct *it = 0;
        int *i, c, oldcounter;

repeat_schedule:
        rqptr = Rdy_Q_Hed(high_prio);
        next = rqptr->next;
        if (unlikely(next == rqptr)) {
                for (i = &rq_bit_map[MAX_PRI/32],
                     high_prio = BF + ((MAX_PRI/32)*32);
                     (*i == 0); high_prio -= 32, i--)
                        ;
                high_prio -= ffz(~*i);
                if (unlikely(high_prio < 0)) {
                        /*
                         * No tasks to run, return this cpu's idle task.
                         * It is not in the ready queue, so no need to
                         * remove it.  But first make sure its priority
                         * keeps it out of the way.
                         */
                        high_prio = 0;
                        it = idle_task(this_cpu);
                        it->effprio = -1;
                        return it;
                }
                goto repeat_schedule;
        }
        /*
         * If there is only one task on the list, it is a no brainer.
         * But really, this also prevents us from looping on recalculation
         * if the one and only task is trying to yield.  These sorts of
         * loops are NOT_FUN.  Note: we use likely() to tilt toward
         * real-time tasks, even though they are, usually, unlikely.  We
         * are, after all, a real time scheduler.
         */
        if (likely(high_prio || next->next == rqptr)) {
                it = list_entry(next, struct task_struct, run_list);
back_from_figure_non_rt_next:
                _del_from_runqueue(it);
                return it;
        }
        /*
         * Here we set up a SCHED_OTHER yield.  Note that for other policies
         * yield is handled elsewhere.  This means we can use == and =
         * instead of & and &= to test and clear the flag.  If the prev
         * task has all the runq.ticks, then we just do the recalculation
         * version and let the winner take all (yield fails).  Otherwise
         * we force the counter to zero for the loop and put it back
         * after we have found some other task.  We must remember to update
         * runq.ticks during all this.  Also, we don't give it all back
         * if the yielder has more than the next guy.
         */
        oldcounter = 0;
        if (unlikely(prev->policy == (SCHED_YIELD | SCHED_OTHER))) {
                if (unlikely(prev->counter == runq.ticks)) {
                        prev->policy = SCHED_OTHER;
                        runq.ticks = 0;
                } else {
                        oldcounter = prev->counter;
                        prev->counter = 0;
                }
        }
        c = -1000;
        if (likely(runq.ticks > 0)) {
                do {
                        int weight;
                        struct task_struct *p =
                                list_entry(next, struct task_struct, run_list);
                        /* if (can_schedule(p, this_cpu)) */ {
                                weight = goodness(p, this_cpu, prev->active_mm);
                                if (weight > c)
                                        c = weight, it = p;
                        }
                        next = next->next;
                } while (next != rqptr);
                /*
                 * If we get out of sync with the runq.ticks counter,
                 * force it to 0 and catch it next time around.  Note we
                 * catch a negative counter on entry.
                 */
                if (unlikely(c <= 0)) {
                        runq.ticks = 0;
                }
        } else {
#ifdef CONFIG_SMP
                /*
                 * Here we update the tasks that are current on other
                 * processors.
                 */
                struct list_head *wkptr,
                        *cptr = &aligned_data[(this_cpu)].schedule_data.schedule_data_list;

                runq.ticks = 0;
                list_for_each (wkptr, &hed_cpu_prio) {
                        struct task_struct *p;
                        if (cptr == wkptr)
                                continue;
                        p = list_entry(wkptr, struct schedule_data,
                                       schedule_data_list)->curr;
                        if (p->effprio == 0) {
                                p->counter = (p->counter >> 1) +
                                        NICE_TO_TICKS(p->nice);
                                p->counter_recalc++;
                        }
                }
#else
                runq.ticks = 0;
#endif
                runq.recalc++;
                do {
                        int weight;
                        struct task_struct *p =
                                list_entry(next, struct task_struct, run_list);
                        runq.ticks += p->counter = NICE_TO_TICKS(p->nice);
                        p->counter_recalc++;
                        /* if (can_schedule(p, this_cpu)) */ {
                                weight = goodness(p, this_cpu, prev->active_mm);
                                if (weight > c)
                                        c = weight, it = p;
                        }
                        next = next->next;
                } while (next != rqptr);
        }
        /*
         * Undo the stuff we did for SCHED_YIELD.  We know we did something
         * if oldcounter != 0.
         */
        if (unlikely(oldcounter)) {
                prev->counter = (it->counter < oldcounter) ?
                        it->counter : oldcounter;
                runq.ticks += prev->counter - oldcounter;
                prev->policy &= ~SCHED_YIELD;
        }
        goto back_from_figure_non_rt_next;
}
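/*
 * Illustrative sketch (not compiled) of the lazy counter recalculation.
 * runq.recalc counts how many SCHED_OTHER recalculation passes the loop
 * above has made; each task remembers the pass it last saw in
 * p->counter_recalc.  A task that was asleep for `diff' passes is caught
 * up in one step by add_to_runqueue() below, using the closed form of
 * applying "counter = counter/2 + NICE_TO_TICKS(nice)" diff times: the
 * counter decays toward c = 2*NICE_TO_TICKS(nice), and the deficit below
 * c is halved once per missed pass.  The numbers below are an example
 * only and assume NICE_TO_TICKS(0) == 6 (HZ == 100).
 */
#if 0
static int example_catch_up(int counter, int diff, int nice)
{
        int c = NICE_TO_TICKS(nice) << 1;

        /*
         * counter == 2, diff == 3, nice == 0:  12 + ((2 - 12) >> 3) == 10,
         * the same value three per-pass recalculations would produce
         * (2 -> 7 -> 9 -> 10).  After more than 8 missed passes the task
         * is simply treated as fully recharged (c - 1).
         */
        return diff > 8 ? c - 1 : c + ((counter - c) >> diff);
}
#endif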
/* Add to the head of the run queue */
static inline void add_to_runqueue(struct task_struct * p, int cpu)
{
        struct list_head *next;
        int prio;

        /* idle tasks don't get put in the list */
        if (unlikely(p == idle_task(cpu)))
                return;
        prio = p->effprio;
        next = Rdy_Q_Hed(prio);
        if (list_empty(next)) {                 /* an empty queue */
                set_rq_bit(prio);
                if (high_prio < prio) {
                        high_prio = prio;
                }
        }
        list_add(&p->run_list, next);
        p->newprio = newprio_ready_q;
        if (likely(!p->effprio)) {
                int diff, c;
                if ((diff = runq.recalc - p->counter_recalc) != 0) {
                        p->counter_recalc = runq.recalc;
                        c = NICE_TO_TICKS(p->nice) << 1;
                        p->counter = diff > 8 ? c - 1 :  /* max priority */
                                c + ((p->counter - c) >> diff);
                }
                runq.ticks += p->counter;
        }
        nr_running++;
}

/*
 * This function is only called from schedule() so it need not worry
 * about updating the counter as it should never be out of date.
 * If you change this, remember to do the update.
 */
static inline void add_last_runqueue(struct task_struct * p)
{
        struct list_head *next = Rdy_Q_Hed(p->effprio);

        if (list_empty(next)) {                 /* empty list, set the bit */
                set_rq_bit(p->effprio);
                if (p->effprio > high_prio) {
                        high_prio = p->effprio;
                }
        }
        list_add_tail(&p->run_list, next);
        p->newprio = newprio_ready_q;
        if (!p->effprio)
                runq.ticks += p->counter;
        nr_running++;
}

static inline void move_first_runqueue(struct task_struct * p)
{
        list_del(&p->run_list);
        list_add_tail(&p->run_list, Rdy_Q_Hed(p->effprio));
}

/*
 * When we have a task in some queue by priority, we need
 * to provide a way to change that priority.  Depending on the
 * queue, we must do different things.  We handle this by putting
 * a function address in the task_struct (newprio()).
 *
 * First a front end routine to take care of the case where the task
 * is not in any priority queue.  We take the runqueue_lock
 * here, so the caller must not.  Since we may be called
 * recursively, protect against a deadlock.
 */
static struct task_struct *newprio_inuse;
static int newprio_inuse_count;

void set_newprio(struct task_struct * tptr, int newprio)
{
        if (newprio_inuse != current) {
                spin_lock_irq(&runqueue_lock);
                newprio_inuse = current;
        }
        newprio_inuse_count++;

        if (!tptr->newprio) {
                tptr->effprio = newprio;
        } else if (tptr->effprio != newprio) {
                tptr->newprio(tptr, newprio);
        }
        if (!--newprio_inuse_count) {
                spin_unlock_irq(&runqueue_lock);
                newprio_inuse = 0;
        }
}
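/*
 * Illustrative sketch (not compiled) of how other kernel code is expected
 * to use set_newprio(): callers never touch effprio or the ready queue
 * directly, they just hand the task and the new effective priority to
 * set_newprio(), which dispatches through tptr->newprio to the routine
 * matching wherever the task currently lives (ready queue, executing on
 * some CPU, or no queue at all).  The priority-inheritance caller below is
 * an assumed example, not code elsewhere in this file.
 */
#if 0
static void example_priority_inherit(struct task_struct *owner,
                                     struct task_struct *waiter)
{
        /*
         * Boost a mutex owner to the waiter's effective priority.  If the
         * owner is sitting in the ready queue, newprio_ready_q() requeues
         * it; if it is executing on some CPU, newprio_executing() updates
         * that CPU's entry and kicks it if it must be preempted.
         */
        if (waiter->effprio > owner->effprio)
                set_newprio(owner, waiter->effprio);
}
#endif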
/*
 * Here are the routines we use for the ready queue and an executing
 * process.  Note that the executing process may fall out of favor
 * as a result of the change.  We do the right thing.  Note that newprio
 * is not cleared, so we test here to see if the task is still running.
 */
static void newprio_ready_q(struct task_struct * tptr, int newprio)
{
        _del_from_runqueue(tptr);
        tptr->effprio = newprio;
        add_to_runqueue(tptr, 0);
        reschedule_idle(tptr);
}

#ifdef CONFIG_SMP
static void newprio_executing(struct task_struct *tptr, int newprio)
{
        int cpu;
        struct schedule_data *sched_data;

        if (!newprio || newprio < tptr->effprio) {
                tptr->need_resched = 1;
        }
        cpu = tptr->processor;
        sched_data = &aligned_data[cpu].schedule_data;
        tptr->effprio = newprio;
        if (sched_data->curr != tptr)
                return;                 /* if not executing, out of here */
        re_queue_cpu(tptr, sched_data);
        if ((cpu != smp_processor_id()) && tptr->need_resched)
                smp_send_reschedule(cpu);
}
#endif

/*
 * Wake up a process.  Put it on the ready queue if it's not
 * already there.  The "current" process is not on the
 * ready queue (it makes it much easier to figure out if we
 * need to preempt, esp. the real time case).  It is possible
 * to wake the current process.  This happens when it is woken
 * before schedule has had a chance to put it properly to
 * sleep.  If schedule did not turn on interrupts in the middle of
 * things this would all be ok; however, it does, so we have the
 * possibility of being in that window.
 * The "current" process is never on the
 * run-queue (except when the actual re-schedule is in
 * progress), and as such you're allowed to do the simpler
 * "current->state = TASK_RUNNING" to mark yourself runnable
 * without the overhead of this.
 */
static inline int try_to_wake_up(struct task_struct * p, int synchronous)
{
        unsigned long flags;
        int success = 0;

        TRACE_PROCESS(TRACE_EV_PROCESS_WAKEUP, p->pid, p->state);

        /*
         * We want the common case to fall straight through, thus the goto.
         */
        spin_lock_irqsave(&runqueue_lock, flags);
        p->state = TASK_RUNNING;
        if (task_on_runqueue(p))
                goto out;
        add_to_runqueue(p, 0);
        if (!synchronous /*|| !(p->cpus_allowed & (1 << smp_processor_id())*/)
                reschedule_idle(p);
        success = 1;
out:
        spin_unlock_irqrestore(&runqueue_lock, flags);
        return success;
}

inline int wake_up_process(struct task_struct * p)
{
        return try_to_wake_up(p, 0);
}

/*
 * schedule_tail() is getting called from the fork return path.  This
 * cleans up all remaining scheduler things, without impacting the
 * common case.
 */
static inline void __schedule_tail(struct task_struct *prev)
{
#ifdef CONFIG_SMP
        /*
         * fast path falls through.  We have to clear cpus_runnable before
         * checking prev->state to avoid a wakeup race.  Protect against
         * the task exiting early.
         */
        task_lock(prev);
        task_release_cpu(prev);
        mb();
        if (task_on_rq(prev))
                goto needs_resched;

out_unlock:
        task_unlock(prev);      /* Synchronise here with release_task() if prev is TASK_ZOMBIE */
        return;

        /*
         * Slow path - we 'push' the previous process and
         * reschedule_idle() will attempt to find a new
         * processor for it. (but it might preempt the
         * current process as well.)  We must take the runqueue
         * lock and re-check prev->state to be correct.  It might
         * still happen that this process has a preemption
         * 'in progress' already - but this is not a problem and
         * might happen in other circumstances as well.
         */
needs_resched:
        {
                unsigned long flags;

                /*
                 * Avoid taking the runqueue lock in cases where
                 * no preemption-check is necessary.
                 * Note: the idle task is NEVER on the ready queue, so
                 * there is no need to check if prev was idle.
                 */
                spin_lock_irqsave(&runqueue_lock, flags);
                if (task_on_rq(prev) /* && !task_has_cpu(prev)*/)
                        reschedule_idle(prev);
                spin_unlock_irqrestore(&runqueue_lock, flags);
                goto out_unlock;
        }
#define smp_label_a _smp_label_a:
#define smp_label_b _smp_label_b:
#else
        prev->policy &= ~SCHED_YIELD;
#define smp_label_a
#define smp_label_b
#endif /* CONFIG_SMP */
}

asmlinkage void schedule_tail(struct task_struct *prev)
{
        __schedule_tail(prev);
        preempt_enable();
}
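/*
 * Illustrative sketch (not compiled) of the classic sleep/wakeup pattern
 * that try_to_wake_up() above supports.  The sleeping side marks itself
 * non-runnable and calls schedule(); the waking side only has to call
 * wake_up_process(), which puts the task back on the ready queue and lets
 * reschedule_idle() decide whether anything should be preempted.  The two
 * helpers and the `wakeup_pending' flag are assumed names for the example,
 * not part of this file.
 */
#if 0
static volatile int wakeup_pending;
static struct task_struct *sleeper;

static void example_sleep_until_woken(void)
{
        sleeper = current;
        while (!wakeup_pending) {
                set_current_state(TASK_INTERRUPTIBLE);
                if (wakeup_pending)     /* re-check after marking ourselves */
                        break;
                schedule();
        }
        set_current_state(TASK_RUNNING);
}

static void example_wake(void)          /* e.g. from an interrupt handler */
{
        wakeup_pending = 1;
        wake_up_process(sleeper);
}
#endif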
/*
 * 'schedule()' is the scheduler function.  It's a very simple and nice
 * scheduler: it's not perfect, but certainly works for most things.
 *
 * The goto is "interesting".
 *
 * NOTE!! Task 0 is the 'idle' task, which gets called when no other
 * tasks can run.  It can not be killed, and it cannot sleep.  The 'state'
 * information in task[0] is never used.
 */
asmlinkage void schedule(void)
{
        struct schedule_data * sched_data;
        struct task_struct *prev, *next;
        int this_cpu;

        spin_lock_prefetch(&runqueue_lock);

try_try_again:
        preempt_disable();
        if (unlikely(!current->active_mm))
                BUG();
        prev = current;
        this_cpu = prev->processor;

        if (unlikely(in_interrupt())) {
                printk("Scheduling in interrupt\n");
                BUG();
        }
        release_kernel_lock(prev, this_cpu);

        /*
         * 'sched_data' is protected by the fact that we can run
         * only one process per CPU.
         */
        sched_data = &aligned_data[this_cpu].schedule_data;

        spin_lock_irq(&runqueue_lock);

#ifdef CONFIG_PREEMPT
        /*
         * Note that this is an '&' NOT an '&&'...
         */
        if (preempt_is_disabled() & PREEMPT_ACTIVE)
                goto sw_TASK_RUNNING;
#endif
        if (prev->state == TASK_INTERRUPTIBLE) {
                //case TASK_INTERRUPTIBLE:
                if (likely(!signal_pending(prev))) {
                        goto sw_default;
                }
                prev->state = TASK_RUNNING;
        }
        if (prev->state != TASK_RUNNING) {
                goto sw_default;
        }
        //case TASK_RUNNING:
#ifdef CONFIG_PREEMPT
sw_TASK_RUNNING:
#endif
        /*
         * move an exhausted RR process to be last..
         * Do the same for yields.
         */
        if (!prev->counter && (prev->policy & SCHED_RR))
                goto move_rr_last;
        if (prev->policy & SCHED_YIELD)
                goto move_yield_last;
        /*
         * There is a case where current is already
         * in the ready queue.  That is where it was
         * on the way out, but the wait already
         * expired, so wake_up_process has already
         * done it.  In this case, we don't!!
         */
        if (!task_on_rq(prev))
                add_to_runqueue(prev, this_cpu);
        goto move_rr_back;

        //default:
sw_default:
        prev->sleep_time = jiffies;
        prev->run_list.next = 0;

move_rr_back:
        prev->need_resched = 0;
smp_label_a
        next = get_next_task(prev, this_cpu);
smp_label_b
        next->run_list.next = (struct list_head *)1;
        sched_data->curr = next;
        re_queue_cpu(next, sched_data);
        spin_unlock_irq(&runqueue_lock);

        if (unlikely(prev == next)) {
                goto same_process;
        }

#ifdef CONFIG_SMP
        /*
         * maintain the per-process 'last schedule' value.
         * (this has to be recalculated even if we reschedule to
         * the same process)  Currently this is only used on SMP,
         * and it's approximate, so we do not have to maintain
         * it while holding the runqueue spinlock.
         */
        sched_data->last_schedule = get_cycles();

        /*
         * We drop the scheduler lock early (it's a global spinlock),
         * thus we have to lock the previous process from getting
         * rescheduled during switch_to() (since we are still on its stack).
         *
         * Here is how we do it.  The cpus_runnable flag will be held until
         * the task is truly available.  On the other hand, this task
         * is put in the ready queue under the above runqueue_lock, so
         * it may be picked up by another cpu.  Suppose that cpu is this
         * one.  Now the prior cpu left the task in the ready queue and
         * we have just plucked it from there.
         * No conflict so far, but if cpus_runnable is not clear, the
         * other cpu is still in the switch code.  There are no locks
         * there SAVE THIS ONE!!!  Oh woe is me!
         * At the same time, under these conditions, i.e. a task is
         * coming out of the ready queue before we actually switch, it
         * would be good to not switch cpus.  So let's define a "wanted"
         * bit in the cpus_runnable member.  Oops, it is now a cpu bit
         * mask, so, since only a few folks look at it, we will fudge it
         * a bit.  Choose an addition that is more than one bit away from
         * a single bit.
         *
         * We will spin here waiting for cpus_runnable to go to zero.  Until
         * this happens, we must not change the processor value as
         * interrupt code depends on this being right for "current".
         */
#define WANTED 10
#define TAKEN  20
        {
                unsigned long cur_cpus_runnable = next->cpus_runnable;

                atomic_add(WANTED, (atomic_t *)&next->cpus_runnable);
                /*
                 * It is either "WANTED + cur_cpus_runnable", which means we
                 * need to wait, or it is:
                 * A. The old cpu_id + WANTED, or
                 * B. WANTED - 1, which means it cleared (or was clear), or
                 * C. TAKEN + cur_cpus_runnable.
                 */
                while ((cur_cpus_runnable != ~0UL) &&
                       (volatile int)next->cpus_runnable ==
                       WANTED + cur_cpus_runnable) {
                        unsigned long my_cpu = 1 << this_cpu;

                        barrier();
                        /*
                         * OK, so while we wait, let's look in on prev and
                         * see if he is wanted.
                         */
                        if ((volatile int)prev->cpus_runnable != my_cpu) {
                                /*
                                 * Another cpu wants the task we have yet to
                                 * switch away from.  Let's steal it back.
                                 * Once WANTED is set on prev, we can clear it
                                 * either here or in schedule_tail.  The other
                                 * cpu can clear it by coming here, where it
                                 * will be known by him as next...
                                 * Here, we set it to (TAKEN + my_cpu); in
                                 * schedule_tail it is set to my_cpu.
                                 */
                                spin_lock_irq(&runqueue_lock);
                                if ((volatile int)prev->cpus_runnable != my_cpu) {
                                        spin_unlock_irq(&runqueue_lock);
                                        continue;
                                }
                                /*
                                 * Three possibilities on the state of next:
                                 * 1.) cpus_runnable has gone to ~0UL.  Means
                                 *     the prior cpu has finished and is not
                                 *     interested.  So put back in the ready
                                 *     queue.
                                 * 2.) The other cpu noticed our interest and
                                 *     stole it back (cpus_runnable will be
                                 *     TAKEN + his flag).  Do nothing.
                                 * 3.) No change, put back in the ready queue.
                                 * Note, case 3 presents a bit of a race on our
                                 * clearing the WANTED bit.  So, we subtract
                                 * and, if the result is negative, set it back
                                 * to the cleared value (~0UL).
                                 */
                                if ((volatile int)next->cpus_runnable !=
                                    cur_cpus_runnable + TAKEN) {
                                        atomic_add(-WANTED,
                                                   (atomic_t *)&next->cpus_runnable);
                                        if ((volatile int)next->cpus_runnable < 0) {
                                                next->cpus_runnable = ~0UL;
                                        }
                                        add_to_runqueue(next, this_cpu);
                                }
                                /*
                                 * So much for "next".  Now let's take prev.
                                 * Setting cpus_runnable to TAKEN + old will
                                 * pop the waiter out of the wait loop.
                                 * We then wait for him to clear TAKEN to
                                 * complete the handshake.  We handshake here
                                 * to keep the other cpu from seeing some later
                                 * state that may be wrong.
                                 */
                                prev->cpus_runnable = TAKEN + my_cpu;
                                next = prev;
                                spin_unlock_irq(&runqueue_lock);
                                while ((volatile int)prev->cpus_runnable ==
                                       TAKEN + my_cpu) {
                                        barrier();
                                }
                                spin_lock_irq(&runqueue_lock);
                                goto _smp_label_b;
                        }
                }
                /*
                 * If we popped out of the while because cpus_runnable has
                 * TAKEN set, it means the prior owner stole back the task.
                 * Time to rescan the ready queue (after clearing the TAKEN
                 * bit to complete the handshake).
                 * The other possibilities are:
                 *   cpus_runnable = WANTED - 1 (it was clear when we started)
                 *   cpus_runnable = -1 (it was his, but the other cpu
                 *                       finished, setting -1)
                 */
                if ((volatile int)next->cpus_runnable ==
                    TAKEN + cur_cpus_runnable) {
                        atomic_add(-TAKEN, (atomic_t *)&next->cpus_runnable);
                        spin_lock_irq(&runqueue_lock);
                        goto _smp_label_a;
                }
        }
        /*
         * Gosh, wasn't that fun!
         */
        task_set_cpu(next, this_cpu);
#endif /* CONFIG_SMP */

        /*
         * An interesting problem here.  Since we turned on interrupts,
         * we could now have the need_resched flag set in prev.  Actually
         * this can only happen on interrupt, and then only be meaningful
         * if it is done by a wakeup() call to reschedule_idle().  This
         * is covered, as that code will set the need_resched flag in the
         * task found by cpu_curr(), which comes from the cpu structs
         * which we have already updated.
         * The remaining problems come from left over timeouts against
         * prev, but he was the target and he is gone now... unless
         * we did not really switch.  So in the switch path we will
         * clear the need_resched flag, not in the no switch path.
         */
        kstat.context_swtch++;
        /*
         * there are 3 processes which are affected by a context switch:
         *
         * prev == .... ==> (last => next)
         *
         * It's the 'much more previous' 'prev' that is on next's stack,
         * but prev is set to (the just run) 'last' process by switch_to().
         * This might sound slightly confusing but makes tons of sense.
         */
        prepare_to_switch();
        {
                struct mm_struct *mm = next->mm;
                struct mm_struct *oldmm = prev->active_mm;
                if (!mm) {
                        if (next->active_mm)
                                BUG();
                        next->active_mm = oldmm;
                        atomic_inc(&oldmm->mm_count);
                        enter_lazy_tlb(oldmm, next, this_cpu);
                } else {
                        if (next->active_mm != mm)
                                BUG();
                        switch_mm(oldmm, mm, next, this_cpu);
                }

                if (!prev->mm) {
                        prev->active_mm = NULL;
                        mmdrop(oldmm);
                }
        }
        TRACE_SCHEDCHANGE(prev->pid, next->pid, prev->state);

        /*
         * This just switches the register state and the
         * stack.
         */
        switch_to(prev, next, prev);
        __schedule_tail(prev);
        prev->need_resched = 0;

same_process:
        reacquire_kernel_lock(current);
        preempt_enable_no_resched();
        if (!current->need_resched)
                return;
        /* The task managed to get its need_resched flag set already! */
        goto try_try_again;

move_rr_last:
        prev->counter = NICE_TO_TICKS(prev->nice);
move_yield_last:
        if (prev->effprio)      /* non-real time tasks get cleared later */
                prev->policy &= ~SCHED_YIELD;
        add_last_runqueue(prev);
        goto move_rr_back;
}

static inline struct task_struct *find_process_by_pid(pid_t pid);

static int setscheduler(pid_t pid, int policy, struct sched_param *param)
{
        struct sched_param lp;
        struct task_struct *p;
        int retval;

        retval = -EINVAL;
        if (!param || pid < 0)
                goto out_nounlock;

        retval = -EFAULT;
        if (copy_from_user(&lp, param, sizeof(struct sched_param)))
                goto out_nounlock;

        /*
         * We play safe to avoid deadlocks.
         */
        read_lock_irq(&tasklist_lock);
        spin_lock(&runqueue_lock);

        p = find_process_by_pid(pid);

        retval = -ESRCH;
        if (!p)
                goto out_unlock;

        if (policy < 0)
                policy = p->policy;
        else {
                retval = -EINVAL;
                if (policy != SCHED_FIFO && policy != SCHED_RR &&
                    policy != SCHED_OTHER)
                        goto out_unlock;
        }

        /*
         * Valid priorities for SCHED_FIFO and SCHED_RR are 1..MAX_PRI;
         * the valid priority for SCHED_OTHER is 0.
         */
        retval = -EINVAL;
        if (lp.sched_priority < 0 || lp.sched_priority > MAX_PRI)
                goto out_unlock;
        if ((policy == SCHED_OTHER) != (lp.sched_priority == 0))
                goto out_unlock;

        retval = -EPERM;
        if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
            !capable(CAP_SYS_NICE))
                goto out_unlock;
        if ((current->euid != p->euid) && (current->euid != p->uid) &&
            !capable(CAP_SYS_NICE))
                goto out_unlock;

        retval = 0;
        p->policy = policy;
        if (policy == SCHED_FIFO) {
                p->counter = -100;      /* we don't count down negative counters */
        } else {
                p->counter = NICE_TO_TICKS(p->nice);
        }
        p->rt_priority = lp.sched_priority;
        spin_unlock_irq(&runqueue_lock);
        set_newprio(p, lp.sched_priority);
        goto out_readunlock;

out_unlock:
        spin_unlock_irq(&runqueue_lock);
out_readunlock:
        read_unlock(&tasklist_lock);
out_nounlock:
        return retval;
}

asmlinkage long sys_sched_yield(void)
{
        /*
         * Trick.  sched_yield() first checks to see if it will be REALLY
         * lonely in the ready queue.  It just returns if it is the only
         * game in town.  The multiple ready queues really help here.
         * (This test does not have to be atomic.)  In threaded
         * applications this optimization gets triggered quite often.
         */
        if (!list_empty(Rdy_Q_Hed(current->effprio))) {
                /*
                 * I think this is safe, as only the current task can get
                 * here and only the current task will be clearing this bit.
                 */
                current->policy |= SCHED_YIELD;
                schedule();
        }
        return 0;
}

/*
 * Seems to be the first place we hear about a given cpu as it comes up.
 * A new (including the first) cpu is reporting for duty.  Since he is
 * already running, we must patch him into the processor queue.
 * We get here the first time the processor enters the idle code and also
 * one more time for the boot cpu, so... be careful to not redo what is
 * already done.  Also note that the fork that created the task put it
 * in the ready queue, so we need to take it out, except that the initial
 * cpu's task was not created by a fork.  No matter, the removal code works
 * even then.
 * We give the idle task priority -1 to keep it out of the way of tasks
 * that have real work to do.
 */
extern unsigned long wait_init_idle;

void __init init_idle(void)
{
        struct schedule_data * sched_data;
        int cpu = smp_processor_id();

        sched_data = &aligned_data[cpu].schedule_data;
        if (task_on_rq(current)) {
                del_from_runqueue(current);
        }
        sched_data->curr = current;
        sched_data->last_schedule = get_cycles();
        current->effprio = current->rt_priority = 0;
        sched_data->effprio = -1;       /* idle flag */
        sched_data->cpu = cpu;
        clear_bit(current->processor, &wait_init_idle);
#ifdef CONFIG_SMP
        if (!sched_data->schedule_data_list.next) {
                list_add_tail(&sched_data->schedule_data_list, &hed_cpu_prio);
        }
#endif
#ifdef CONFIG_PREEMPT
        if (current->processor)
                current->preempt_count = 0;
#endif
}

extern void init_timervecs(void);

void __init sched_init(void)
{
        /*
         * We have to do a little magic to get the first
         * process right in SMP mode.
         */
        int cpu = smp_processor_id();
        int nr;
        int i;

        init_task.processor = cpu;

        /* Init the ready queue */
        for (i = 0; i <= MAX_PRI; i++) {
                INIT_LIST_HEAD(Rdy_Q_Hed(i));
        }

        for (nr = 0; nr < PIDHASH_SZ; nr++)
                pidhash[nr] = NULL;

        printk("rtsched version " VERSION_DATE "\n");

        init_timervecs();

        init_bh(TIMER_BH, timer_bh);
        init_bh(TQUEUE_BH, tqueue_bh);
        init_bh(IMMEDIATE_BH, immediate_bh);

        /*
         * The boot idle thread does lazy MMU switching as well:
         */
        atomic_inc(&init_mm.mm_count);
        enter_lazy_tlb(&init_mm, current, cpu);
}
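/*
 * Illustrative user space sketch (not compiled as part of this file) of
 * the policy/priority rules enforced by setscheduler() above: SCHED_FIFO
 * and SCHED_RR require CAP_SYS_NICE and a priority of 1..MAX_PRI, while
 * SCHED_OTHER only accepts priority 0.  The value 50 is an arbitrary
 * example.
 */
#if 0
#include <sched.h>
#include <stdio.h>

int main(void)
{
        struct sched_param sp;

        /* Becomes a real time FIFO task; fails with EPERM without CAP_SYS_NICE. */
        sp.sched_priority = 50;
        if (sched_setscheduler(0, SCHED_FIFO, &sp) != 0)
                perror("sched_setscheduler(SCHED_FIFO)");

        /* Returning to SCHED_OTHER requires the priority to be 0 again. */
        sp.sched_priority = 0;
        if (sched_setscheduler(0, SCHED_OTHER, &sp) != 0)
                perror("sched_setscheduler(SCHED_OTHER)");
        return 0;
}
#endif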