/*
* kernel/sched.c
*
* Kernel scheduler and related syscalls
*
* Copyright (C) 1991-2002 Linus Torvalds
*
* 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
* make semaphores SMP safe
* 1998-11-19 Implemented schedule_timeout() and related stuff
* by Andrea Arcangeli
* 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
* hybrid priority-list and round-robin design with
* an array-switch method of distributing timeslices
* and per-CPU runqueues. Cleanups and useful suggestions
* by Davide Libenzi, preemptible kernel bits by Robert Love.
* 2003-09-03 Interactivity tuning by Con Kolivas.
* 2004-04-02 Scheduler domains code by Nick Piggin
*/
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <asm/uaccess.h>
#include <linux/highmem.h>
#include <linux/smp_lock.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
#include <linux/freezer.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/timer.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/kthread.h>
#include <linux/seq_file.h>
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <linux/reciprocal_div.h>
#include <asm/tlb.h>
#include <asm/unistd.h>
/*
* Scheduler clock - returns current time in nanosec units.
* This is the default implementation.
* Architectures and sub-architectures can override this.
*/
unsigned long long __attribute__((weak)) sched_clock(void)
{
return (unsigned long long)jiffies * (1000000000 / HZ);
}
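/*
* Note: this fallback only has jiffy resolution, e.g. with HZ=1000 it
* advances in 1000000 ns (1 ms) steps and with HZ=250 in 4 ms steps.
*/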
/*
* Convert user-nice values [ -20 ... 0 ... 19 ]
* to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
* and back.
*/
#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio)
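/*
* Worked example, assuming the usual MAX_RT_PRIO of 100 and MAX_PRIO of
* 140: NICE_TO_PRIO(-20) == 100, NICE_TO_PRIO(0) == 120 and
* NICE_TO_PRIO(19) == 139, i.e. all of nice-space maps onto the
* [100..139] static priority band just above the RT priorities.
*/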
/*
* 'User priority' is the nice value converted to something we
* can work with better when scaling various scheduler parameters;
* it's a [ 0 ... 39 ] range.
*/
#define USER_PRIO(p) ((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
/*
* Some helpers for converting nanosecond timing to jiffy resolution
*/
#define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ))
#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
/*
* These are the 'tuning knobs' of the scheduler:
*
* Minimum timeslice is 5 msecs (or 1 jiffy, whichever is larger),
* default timeslice is 100 msecs, maximum timeslice is 800 msecs.
* Timeslices get refilled after they expire.
*/
#define MIN_TIMESLICE max(5 * HZ / 1000, 1)
#define DEF_TIMESLICE (100 * HZ / 1000)
#define ON_RUNQUEUE_WEIGHT 30
#define CHILD_PENALTY 95
#define PARENT_PENALTY 100
#define EXIT_WEIGHT 3
#define PRIO_BONUS_RATIO 25
#define MAX_BONUS (MAX_USER_PRIO * PRIO_BONUS_RATIO / 100)
#define INTERACTIVE_DELTA 2
#define MAX_SLEEP_AVG (DEF_TIMESLICE * MAX_BONUS)
#define STARVATION_LIMIT (MAX_SLEEP_AVG)
#define NS_MAX_SLEEP_AVG (JIFFIES_TO_NS(MAX_SLEEP_AVG))
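/*
* With the values above and assuming HZ=1000: MAX_BONUS is
* 40 * 25 / 100 == 10, MAX_SLEEP_AVG is 100 * 10 == 1000 jiffies (one
* second) and NS_MAX_SLEEP_AVG is that same second expressed in
* nanoseconds. These derived constants are what the interactivity
* estimator below actually works with.
*/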
/*
* If a task is 'interactive' then we reinsert it in the active
* array after it has expired its current timeslice. (it will not
* continue to run immediately, it will still round-robin with
* other interactive tasks.)
*
* This part scales the interactivity limit depending on niceness.
*
* We scale it linearly, offset by the INTERACTIVE_DELTA delta.
* Here are a few examples of different nice levels:
*
* TASK_INTERACTIVE(-20): [1,1,1,1,1,1,1,1,1,0,0]
* TASK_INTERACTIVE(-10): [1,1,1,1,1,1,1,0,0,0,0]
* TASK_INTERACTIVE( 0): [1,1,1,1,0,0,0,0,0,0,0]
* TASK_INTERACTIVE( 10): [1,1,0,0,0,0,0,0,0,0,0]
* TASK_INTERACTIVE( 19): [0,0,0,0,0,0,0,0,0,0,0]
*
* (the X axis represents the possible -5 ... 0 ... +5 dynamic
* priority range a task can explore; a value of '1' means the
* task is rated interactive.)
*
* I.e. nice +19 tasks can never get 'interactive' enough to be
* reinserted into the active array, and only heavily CPU-bound nice -20
* tasks will be expired. Default nice 0 tasks are somewhere in between;
* it takes some effort for them to get interactive, but it's not
* too hard.
*/
#define CURRENT_BONUS(p) \
(NS_TO_JIFFIES((p)->sleep_avg) * MAX_BONUS / \
MAX_SLEEP_AVG)
#define GRANULARITY (10 * HZ / 1000 ? : 1)
#ifdef CONFIG_SMP
#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
(1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)) * \
num_online_cpus())
#else
#define TIMESLICE_GRANULARITY(p) (GRANULARITY * \
(1 << (((MAX_BONUS - CURRENT_BONUS(p)) ? : 1) - 1)))
#endif
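/*
* Worked example, assuming HZ=1000: GRANULARITY is 10 jiffies, so
* TIMESLICE_GRANULARITY(p) evaluates to 10 jiffies for a maximally
* interactive task (CURRENT_BONUS == MAX_BONUS), scaled by
* num_online_cpus() on SMP, and roughly doubles for each further bonus
* point a task falls below the maximum.
*/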
#define SCALE(v1,v1_max,v2_max) \
(v1) * (v2_max) / (v1_max)
#define DELTA(p) \
(SCALE(TASK_NICE(p) + 20, 40, MAX_BONUS) - 20 * MAX_BONUS / 40 + \
INTERACTIVE_DELTA)
#define TASK_INTERACTIVE(p) \
((p)->prio <= (p)->static_prio - DELTA(p))
#define INTERACTIVE_SLEEP(p) \
(JIFFIES_TO_NS(MAX_SLEEP_AVG * \
(MAX_BONUS / 2 + DELTA((p)) + 1) / MAX_BONUS - 1))
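/*
* Worked example for a nice-0 task, with the usual MAX_BONUS of 10:
* DELTA(p) is SCALE(20, 40, 10) - 20 * 10 / 40 + INTERACTIVE_DELTA
* == 5 - 5 + 2 == 2, so TASK_INTERACTIVE() requires
* prio <= static_prio - 2, i.e. a sleep bonus of at least 7 out of 10,
* which matches the nice-0 row of the table above.
*/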
#define TASK_PREEMPTS_CURR(p, rq) \
((p)->prio < (rq)->curr->prio)
#define SCALE_PRIO(x, prio) \
max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
static unsigned int static_prio_timeslice(int static_prio)
{
if (static_prio < NICE_TO_PRIO(0))
return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
else
return SCALE_PRIO(DEF_TIMESLICE, static_prio);
}
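/*
* Worked example, assuming HZ=1000 and the usual 100..139 static
* priority range: a nice 0 task (static_prio 120) gets
* 100 * (140 - 120) / 20 == 100 jiffies (100ms), a nice -20 task gets
* 400 * (140 - 100) / 20 == 800 jiffies (800ms), and a nice +19 task
* bottoms out at MIN_TIMESLICE == 5 jiffies (5ms), matching the
* [800ms ... 100ms ... 5ms] range quoted below.
*/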
#ifdef CONFIG_SMP
/*
* Divide a load by a sched group cpu_power: (load / sg->__cpu_power)
* Since cpu_power is a 'constant', we can use a reciprocal divide.
*/
static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load)
{
return reciprocal_divide(load, sg->reciprocal_cpu_power);
}
/*
* Each time a sched group cpu_power is changed,
* we must compute its reciprocal value
*/
static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val)
{
sg->__cpu_power += val;
sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power);
}
#endif
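/*
* Roughly speaking, reciprocal_value() precomputes a fixed-point
* approximation of 2^32 / __cpu_power, so that the division above can
* be done with a multiply and a shift instead of a hardware divide.
* This only pays off because __cpu_power changes rarely compared to
* how often loads are divided by it.
*/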
/*
* task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ]
* to time slice values: [800ms ... 100ms ... 5ms]
*
* The higher a thread's priority, the bigger the timeslices
* it gets during one round of execution. But even the lowest
* priority thread gets MIN_TIMESLICE worth of execution time.
*/
static inline unsigned int task_timeslice(struct task_struct *p)
{
return static_prio_timeslice(p->static_prio);
}
/*
* These are the runqueue data structures:
*/
struct prio_array {
unsigned int nr_active;
DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
struct list_head queue[MAX_PRIO];
};
/*
* This is the main, per-CPU runqueue data structure.
*
* Locking rule: code that wants to lock multiple runqueues (such as
* the load balancing or the thread migration code) must acquire the
* locks in ascending &runqueue order.
*/
struct rq {
spinlock_t lock;
/*
* nr_running and cpu_load should be in the same cacheline because
* remote CPUs use both these fields when doing load calculation.
*/
unsigned long nr_running;
unsigned long raw_weighted_load;
#ifdef CONFIG_SMP
unsigned long cpu_load[3];
unsigned char idle_at_tick;
#ifdef CONFIG_NO_HZ
unsigned char in_nohz_recently;
#endif
#endif
unsigned long long nr_switches;
/*
* This is part of a global counter where only the total sum
* over all CPUs matters. A task can increase this counter on
* one CPU and, if it gets migrated afterwards, decrease it on
* another CPU. Always updated under the runqueue lock:
*/
unsigned long nr_uninterruptible;
unsigned long expired_timestamp;
/* Cached timestamp set by update_cpu_clock() */
unsigned long long most_recent_timestamp;
struct task_struct *curr, *idle;
unsigned long next_balance;
struct mm_struct *prev_mm;
struct prio_array *active, *expired, arrays[2];
int best_expired_prio;
atomic_t nr_iowait;
#ifdef CONFIG_SMP
struct sched_domain *sd;
/* For active balancing */
int active_balance;
int push_cpu;
int cpu; /* cpu of this runqueue */
struct task_struct *migration_thread;
struct list_head migration_queue;
#endif
#ifdef CONFIG_SCHEDSTATS
/* latency stats */
struct sched_info rq_sched_info;
/* sys_sched_yield() stats */
unsigned long yld_exp_empty;
unsigned long yld_act_empty;
unsigned long yld_both_empty;
unsigned long yld_cnt;
/* schedule() stats */
unsigned long sched_switch;
unsigned long sched_cnt;
unsigned long sched_goidle;
/* try_to_wake_up() stats */
unsigned long ttwu_cnt;
unsigned long ttwu_local;
#endif
struct lock_class_key rq_lock_key;
};
static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp;
static DEFINE_MUTEX(sched_hotcpu_mutex);
static inline int cpu_of(struct rq *rq)
{
#ifdef CONFIG_SMP
return rq->cpu;
#else
return 0;
#endif
}
/*
* The domain tree (rq->sd) is protected by RCU's quiescent state transition.
* See detach_destroy_domains: synchronize_sched for details.
*
* The domain tree of any CPU may only be accessed from within
* preempt-disabled sections.
*/
#define for_each_domain(cpu, __sd) \
for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
#define this_rq() (&__get_cpu_var(runqueues))
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
#ifndef prepare_arch_switch
# define prepare_arch_switch(next) do { } while (0)
#endif
#ifndef finish_arch_switch
# define finish_arch_switch(prev) do { } while (0)
#endif
#ifndef __ARCH_WANT_UNLOCKED_CTXSW
static inline int task_running(struct rq *rq, struct task_struct *p)
{
return rq->curr == p;
}
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
}
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_DEBUG_SPINLOCK
/* this is a valid case when another task releases the spinlock */
rq->lock.owner = current;
#endif
/*
* If we are tracking spinlock dependencies then we have to
* fix up the runqueue lock - which gets 'carried over' from
* prev into current:
*/
spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
spin_unlock_irq(&rq->lock);
}
#else /* __ARCH_WANT_UNLOCKED_CTXSW */
static inline int task_running(struct rq *rq, struct task_struct *p)
{
#ifdef CONFIG_SMP
return p->oncpu;
#else
return rq->curr == p;
#endif
}
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
#ifdef CONFIG_SMP
/*
* We can optimise this out completely for !SMP, because the
* SMP rebalancing from interrupt is the only thing that cares
* here.
*/
next->oncpu = 1;
#endif
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
spin_unlock_irq(&rq->lock);
#else
spin_unlock(&rq->lock);
#endif
}
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
/*
* After ->oncpu is cleared, the task can be moved to a different CPU.
* We must ensure this doesn't happen until the switch is completely
* finished.
*/
smp_wmb();
prev->oncpu = 0;
#endif
#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
local_irq_enable();
#endif
}
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
/*
* __task_rq_lock - lock the runqueue a given task resides on.
* Must be called with interrupts disabled.
*/
static inline struct rq *__task_rq_lock(struct task_struct *p)
__acquires(rq->lock)
{
struct rq *rq;
repeat_lock_task:
rq = task_rq(p);
spin_lock(&rq->lock);
if (unlikely(rq != task_rq(p))) {
spin_unlock(&rq->lock);
goto repeat_lock_task;
}
return rq;
}
/*
* task_rq_lock - lock the runqueue a given task resides on and disable
* interrupts. Note the ordering: we can safely look up the task_rq without
* explicitly disabling preemption.
*/
static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
__acquires(rq->lock)
{
struct rq *rq;
repeat_lock_task:
local_irq_save(*flags);
rq = task_rq(p);
spin_lock(&rq->lock);
if (unlikely(rq != task_rq(p))) {
spin_unlock_irqrestore(&rq->lock, *flags);
goto repeat_lock_task;
}
return rq;
}
static inline void __task_rq_unlock(struct rq *rq)
__releases(rq->lock)
{
spin_unlock(&rq->lock);
}
static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
__releases(rq->lock)
{
spin_unlock_irqrestore(&rq->lock, *flags);
}
#ifdef CONFIG_SCHEDSTATS
/*
* bump this up when changing the output format or the meaning of an existing
* format, so that tools can adapt (or abort)
*/
#define SCHEDSTAT_VERSION 14
static int show_schedstat(struct seq_file *seq, void *v)
{
int cpu;
seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
seq_printf(seq, "timestamp %lu\n", jiffies);
for_each_online_cpu(cpu) {
struct rq *rq = cpu_rq(cpu);
#ifdef CONFIG_SMP
struct sched_domain *sd;
int dcnt = 0;
#endif
/* runqueue-specific stats */
seq_printf(seq,
"cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu",
cpu, rq->yld_both_empty,
rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt,
rq->sched_switch, rq->sched_cnt, rq->sched_goidle,
rq->ttwu_cnt, rq->ttwu_local,
rq->rq_sched_info.cpu_time,
rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt);
seq_printf(seq, "\n");
#ifdef CONFIG_SMP
/* domain-specific stats */
preempt_disable();
for_each_domain(cpu, sd) {
enum idle_type itype;
char mask_str[NR_CPUS];
cpumask_scnprintf(mask_str, NR_CPUS, sd->span);
seq_printf(seq, "domain%d %s", dcnt++, mask_str);
for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES;
itype++) {
seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu "
"%lu",
sd->lb_cnt[itype],
sd->lb_balanced[itype],
sd->lb_failed[itype],
sd->lb_imbalance[itype],
sd->lb_gained[itype],
sd->lb_hot_gained[itype],
sd->lb_nobusyq[itype],
sd->lb_nobusyg[itype]);
}
seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu"
" %lu %lu %lu\n",
sd->alb_cnt, sd->alb_failed, sd->alb_pushed,
sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed,
sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed,
sd->ttwu_wake_remote, sd->ttwu_move_affine,
sd->ttwu_move_balance);
}
preempt_enable();
#endif
}
return 0;
}
static int schedstat_open(struct inode *inode, struct file *file)
{
unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32);
char *buf = kmalloc(size, GFP_KERNEL);
struct seq_file *m;
int res;
if (!buf)
return -ENOMEM;
res = single_open(file, show_schedstat, NULL);
if (!res) {
m = file->private_data;
m->buf = buf;
m->size = size;
} else
kfree(buf);
return res;
}
const struct file_operations proc_schedstat_operations = {
.open = schedstat_open,
.read = seq_read,
.llseek = seq_lseek,
.release = single_release,
};
/*
* Expects runqueue lock to be held for atomicity of update
*/
static inline void
rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
{
if (rq) {
rq->rq_sched_info.run_delay += delta_jiffies;
rq->rq_sched_info.pcnt++;
}
}
/*
* Expects runqueue lock to be held for atomicity of update
*/
static inline void
rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
{
if (rq)
rq->rq_sched_info.cpu_time += delta_jiffies;
}
# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
#else /* !CONFIG_SCHEDSTATS */
static inline void
rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
{}
static inline void
rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
{}
# define schedstat_inc(rq, field) do { } while (0)
# define schedstat_add(rq, field, amt) do { } while (0)
#endif
/*
* this_rq_lock - lock this runqueue and disable interrupts.
*/
static inline struct rq *this_rq_lock(void)
__acquires(rq->lock)
{
struct rq *rq;
local_irq_disable();
rq = this_rq();
spin_lock(&rq->lock);
return rq;
}
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
/*
* Called when a process is dequeued from the active array and given
* the cpu. We should note that with the exception of interactive
* tasks, the expired queue will become the active queue after the active
* queue is empty, without explicitly dequeuing and requeuing tasks in the
* expired queue. (Interactive tasks may be requeued directly to the
* active queue, thus delaying tasks in the expired queue from running;
* see scheduler_tick()).
*
* This function is only called from sched_info_arrive(), rather than
* dequeue_task(). Even though a task may be queued and dequeued multiple
* times as it is shuffled about, we're really interested in knowing how
* long it was from the *first* time it was queued to the time that it
* finally hit a cpu.
*/
static inline void sched_info_dequeued(struct task_struct *t)
{
t->sched_info.last_queued = 0;
}
/*
* Called when a task finally hits the cpu. We can now calculate how
* long it was waiting to run. We also note when it began so that we
* can keep stats on how long its timeslice is.
*/
static void sched_info_arrive(struct task_struct *t)
{
unsigned long now = jiffies, delta_jiffies = 0;
if (t->sched_info.last_queued)
delta_jiffies = now - t->sched_info.last_queued;
sched_info_dequeued(t);
t->sched_info.run_delay += delta_jiffies;
t->sched_info.last_arrival = now;
t->sched_info.pcnt++;
rq_sched_info_arrive(task_rq(t), delta_jiffies);
}
/*
* Called when a process is queued into either the active or expired
* array. The time is noted and later used to determine how long the
* task had to wait to reach the cpu. Since the expired queue will
* become the active queue after the active queue is empty, without
* dequeuing and requeuing any tasks, we are interested in queuing to either. It
* is unusual but not impossible for tasks to be dequeued and immediately
* requeued in the same or another array: this can happen in sched_yield(),
* set_user_nice(), and even load_balance() as it moves tasks from runqueue
* to runqueue.
*
* This function is only called from enqueue_task(), but also only updates
* the timestamp if it is not already set. It's assumed that
* sched_info_dequeued() will clear that stamp when appropriate.
*/
static inline void sched_info_queued(struct task_struct *t)
{
if (unlikely(sched_info_on()))
if (!t->sched_info.last_queued)
t->sched_info.last_queued = jiffies;
}
/*
* Called when a process ceases being the active-running process, either
* voluntarily or involuntarily. Now we can calculate how long we ran.
*/
static inline void sched_info_depart(struct task_struct *t)
{
unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival;
t->sched_info.cpu_time += delta_jiffies;
rq_sched_info_depart(task_rq(t), delta_jiffies);
}
/*
* Called when tasks are switched involuntarily, typically due to expiring
* their time slice. (This may also be called when switching to or from
* the idle task.) We are only called when prev != next.
*/
static inline void
__sched_info_switch(struct task_struct *prev, struct task_struct *next)
{
struct rq *rq = task_rq(prev);
/*
* prev now departs the cpu. It's not interesting to record
* stats about how efficient we were at scheduling the idle
* process, however.
*/
if (prev != rq->idle)
sched_info_depart(prev);
if (next != rq->idle)
sched_info_arrive(next);
}
static inline void
sched_info_switch(struct task_struct *prev, struct task_struct *next)
{
if (unlikely(sched_info_on()))
__sched_info_switch(prev, next);
}
#else
#define sched_info_queued(t) do { } while (0)
#define sched_info_switch(t, next) do { } while (0)
#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
/*
* Adding/removing a task to/from a priority array:
*/
static void dequeue_task(struct task_struct *p, struct prio_array *array)
{
array->nr_active--;
list_del(&p->run_list);
if (list_empty(array->queue + p->prio))
__clear_bit(p->prio, array->bitmap);
}
static void enqueue_task(struct task_struct *p, struct prio_array *array)
{
sched_info_queued(p);
list_add_tail(&p->run_list, array->queue + p->prio);
__set_bit(p->prio, array->bitmap);
array->nr_active++;
p->array = array;
}
/*
* Put task to the end of the run list without the overhead of dequeue
* followed by enqueue.
*/
static void requeue_task(struct task_struct *p, struct prio_array *array)
{
list_move_tail(&p->run_list, array->queue + p->prio);
}
static inline void
enqueue_task_head(struct task_struct *p, struct prio_array *array)
{
list_add(&p->run_list, array->queue + p->prio);
__set_bit(p->prio, array->bitmap);
array->nr_active++;
p->array = array;
}
/*
* __normal_prio - return the priority that is based on the static
* priority but is modified by bonuses/penalties.
*
* We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
* into the -5 ... 0 ... +5 bonus/penalty range.
*
* We use 25% of the full 0...39 priority range so that:
*
* 1) nice +19 interactive tasks do not preempt nice 0 CPU hogs.
* 2) nice -20 CPU hogs do not get preempted by nice 0 tasks.
*
* Both properties are important to certain workloads.
*/
static inline int __normal_prio(struct task_struct *p)
{
int bonus, prio;
bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
prio = p->static_prio - bonus;
if (prio < MAX_RT_PRIO)
prio = MAX_RT_PRIO;
if (prio > MAX_PRIO-1)
prio = MAX_PRIO-1;
return prio;
}
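/*
* Worked example: a task whose sleep_avg sits at the NS_MAX_SLEEP_AVG
* ceiling gets CURRENT_BONUS == MAX_BONUS, i.e. bonus == +5 and
* prio == static_prio - 5, while a pure CPU hog with sleep_avg == 0
* gets bonus == -5 and prio == static_prio + 5. A half-full sleep_avg
* leaves the static priority unchanged.
*/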
/*
* To aid in avoiding the subversion of "niceness" due to uneven distribution
* of tasks with abnormal "nice" values across CPUs, the contribution that
* each task makes to its run queue's load is weighted according to its
* scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
* scaled version of the new time slice allocation that they receive on time
* slice expiry etc.
*/
/*
* Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
* If static_prio_timeslice() is ever changed to break this assumption then
* this code will need modification
*/
#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
#define LOAD_WEIGHT(lp) \
(((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
#define PRIO_TO_LOAD_WEIGHT(prio) \
LOAD_WEIGHT(static_prio_timeslice(prio))
#define RTPRIO_TO_LOAD_WEIGHT(rp) \
(PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
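/*
* By construction a nice-0 SCHED_NORMAL task therefore contributes
* exactly SCHED_LOAD_SCALE to raw_weighted_load. With the timeslice
* values above, a nice -20 task weighs 8x that (800ms vs 100ms slice)
* and a nice +19 task only 1/20th of it (5ms slice, assuming HZ=1000).
*/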
static void set_load_weight(struct task_struct *p)
{
if (has_rt_policy(p)) {
#ifdef CONFIG_SMP
if (p == task_rq(p)->migration_thread)
/*
* The migration thread does the actual balancing.
* Giving its load any weight will skew balancing
* adversely.
*/
p->load_weight = 0;
else
#endif
p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
} else
p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
}
static inline void
inc_raw_weighted_load(struct rq *rq, const struct task_struct *p)
{
rq->raw_weighted_load += p->load_weight;
}
static inline void
dec_raw_weighted_load(struct rq *rq, const struct task_struct *p)
{
rq->raw_weighted_load -= p->load_weight;
}
static inline void inc_nr_running(struct task_struct *p, struct rq *rq)
{
rq->nr_running++;
inc_raw_weighted_load(rq, p);
}
static inline void dec_nr_running(struct task_struct *p, struct rq *rq)
{
rq->nr_running--;
dec_raw_weighted_load(rq, p);
}
/*
* Calculate the expected normal priority: i.e. priority
* without taking RT-inheritance into account. Might be
* boosted by interactivity modifiers. Changes upon fork,
* setprio syscalls, and whenever the interactivity
* estimator recalculates.
*/
static inline int normal_prio(struct task_struct *p)
{
int prio;
if (has_rt_policy(p))
prio = MAX_RT_PRIO-1 - p->rt_priority;
else
prio = __normal_prio(p);
return prio;
}
/*
* Calculate the current priority, i.e. the priority
* taken into account by the scheduler. This value might
* be boosted by RT tasks, or might be boosted by
* interactivity modifiers. Will be RT if the task got
* RT-boosted. If not then it returns p->normal_prio.
*/
static int effective_prio(struct task_struct *p)
{
p->normal_prio = normal_prio(p);
/*
* If we are RT tasks or we were boosted to RT priority,
* keep the priority unchanged. Otherwise, update priority
* to the normal priority:
*/
if (!rt_prio(p->prio))
return p->normal_prio;
return p->prio;
}
/*
* __activate_task - move a task to the runqueue.
*/
static void __activate_task(struct task_struct *p, struct rq *rq)
{
struct prio_array *target = rq->active;
if (batch_task(p))
target = rq->expired;
enqueue_task(p, target);
inc_nr_running(p, rq);
}
/*
* __activate_idle_task - move the idle task to the _front_ of the runqueue.
*/
static inline void __activate_idle_task(struct task_struct *p, struct rq *rq)
{
enqueue_task_head(p, rq->active);
inc_nr_running(p, rq);
}
/*
* Recalculate p->normal_prio and p->prio after having slept,
* updating the sleep-average too:
*/
static int recalc_task_prio(struct task_struct *p, unsigned long long now)
{
/* Caller must always ensure 'now >= p->timestamp' */
unsigned long sleep_time = now - p->timestamp;
if (batch_task(p))
sleep_time = 0;
if (likely(sleep_time > 0)) {
/*
* This ceiling is set to the lowest priority that would allow
* a task to be reinserted into the active array on timeslice
* completion.
*/
unsigned long ceiling = INTERACTIVE_SLEEP(p);
if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) {
/*
* Prevents user tasks from achieving best priority
* with one single large enough sleep.
*/
p->sleep_avg = ceiling;
/*
* Using INTERACTIVE_SLEEP() as a ceiling places a
* nice(0) task 1ms sleep away from promotion, and
* gives it 700ms to round-robin with no chance of
* being demoted. This is more than generous, so
* mark this sleep as non-interactive to prevent the
* on-runqueue bonus logic from intervening should
* this task not receive cpu immediately.
*/
p->sleep_type = SLEEP_NONINTERACTIVE;
} else {
/*
* Tasks waking from uninterruptible sleep are
* limited in their sleep_avg rise as they
* are likely to be waiting on I/O
*/
if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
if (p->sleep_avg >= ceiling)
sleep_time = 0;
else if (p->sleep_avg + sleep_time >=
ceiling) {
p->sleep_avg = ceiling;
sleep_time = 0;
}
}
/*
* This code gives a bonus to interactive tasks.
*
* The boost works by updating the 'average sleep time'
* value here, based on ->timestamp. The more time a
* task spends sleeping, the higher the average gets -
* and the higher the priority boost gets as well.
*/
p->sleep_avg += sleep_time;
}
if (p->sleep_avg > NS_MAX_SLEEP_AVG)
p->sleep_avg = NS_MAX_SLEEP_AVG;
}
return effective_prio(p);
}
/*
* activate_task - move a task to the runqueue and do priority recalculation
*
* Update all the scheduling statistics stuff. (sleep average
* calculation, priority modifiers, etc.)
*/
static void activate_task(struct task_struct *p, struct rq *rq, int local)
{
unsigned long long now;
if (rt_task(p))
goto out;
now = sched_clock();
#ifdef CONFIG_SMP
if (!local) {
/* Compensate for drifting sched_clock */
struct rq *this_rq = this_rq();
now = (now - this_rq->most_recent_timestamp)
+ rq->most_recent_timestamp;
}
#endif
/*
* Sleep time is in units of nanosecs, so shift by 20 to get a
* milliseconds-range estimation of the amount of time that the task
* spent sleeping:
*/
if (unlikely(prof_on == SLEEP_PROFILING)) {
if (p->state == TASK_UNINTERRUPTIBLE)
profile_hits(SLEEP_PROFILING, (void *)get_wchan(p),
(now - p->timestamp) >> 20);
}
p->prio = recalc_task_prio(p, now);
/*
* This checks to make sure it's not an uninterruptible task
* that is now waking up.
*/
if (p->sleep_type == SLEEP_NORMAL) {
/*
* Tasks which were woken up by interrupts (i.e. hw events)
* are most likely of interactive nature. So we give them
* the credit of extending their sleep time to the period
* of time they spend on the runqueue, waiting for execution
* on a CPU, first time around:
*/
if (in_interrupt())
p->sleep_type = SLEEP_INTERRUPTED;
else {
/*
* Normal first-time wakeups get a credit too for
* on-runqueue time, but it will be weighted down:
*/
p->sleep_type = SLEEP_INTERACTIVE;
}
}
p->timestamp = now;
out:
__activate_task(p, rq);
}
/*
* deactivate_task - remove a task from the runqueue.
*/
static void deactivate_task(struct task_struct *p, struct rq *rq)
{
dec_nr_running(p, rq);
dequeue_task(p, p->array);
p->array = NULL;
}
/*
* resched_task - mark a task 'to be rescheduled now'.
*
* On UP this means the setting of the need_resched flag, on SMP it
* might also involve a cross-CPU call to trigger the scheduler on
* the target CPU.
*/
#ifdef CONFIG_SMP
#ifndef tsk_is_polling
#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
#endif
static void resched_task(struct task_struct *p)
{
int cpu;
assert_spin_locked(&task_rq(p)->lock);
if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
return;
set_tsk_thread_flag(p, TIF_NEED_RESCHED);
cpu = task_cpu(p);
if (cpu == smp_processor_id())
return;
/* NEED_RESCHED must be visible before we test polling */
smp_mb();
if (!tsk_is_polling(p))
smp_send_reschedule(cpu);
}
static void resched_cpu(int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
if (!spin_trylock_irqsave(&rq->lock, flags))
return;
resched_task(cpu_curr(cpu));
spin_unlock_irqrestore(&rq->lock, flags);
}
#else
static inline void resched_task(struct task_struct *p)
{
assert_spin_locked(&task_rq(p)->lock);
set_tsk_need_resched(p);
}
#endif
/**
* task_curr - is this task currently executing on a CPU?
* @p: the task in question.
*/
inline int task_curr(const struct task_struct *p)
{
return cpu_curr(task_cpu(p)) == p;
}
/* Used instead of source_load when we know the type == 0 */
unsigned long weighted_cpuload(const int cpu)
{
return cpu_rq(cpu)->raw_weighted_load;
}
#ifdef CONFIG_SMP
struct migration_req {
struct list_head list;
struct task_struct *task;
int dest_cpu;
struct completion done;
};
/*
* The task's runqueue lock must be held.
* Returns true if you have to wait for the migration thread.
*/
static int
migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
{
struct rq *rq = task_rq(p);
/*
* If the task is not on a runqueue (and not running), then
* it is sufficient to simply update the task's cpu field.
*/
if (!p->array && !task_running(rq, p)) {
set_task_cpu(p, dest_cpu);
return 0;
}
init_completion(&req->done);
req->task = p;
req->dest_cpu = dest_cpu;
list_add(&req->list, &rq->migration_queue);
return 1;
}
/*
* wait_task_inactive - wait for a thread to unschedule.
*
* The caller must ensure that the task *will* unschedule sometime soon,
* else this function might spin for a *long* time. This function can't
* be called with interrupts off, or it may introduce deadlock with
* smp_call_function() if an IPI is sent by the same process we are
* waiting to become inactive.
*/
void wait_task_inactive(struct task_struct *p)
{
unsigned long flags;
struct rq *rq;
struct prio_array *array;
int running;
repeat:
/*
* We do the initial early heuristics without holding
* any task-queue locks at all. We'll only try to get
* the runqueue lock when things look like they will
* work out!
*/
rq = task_rq(p);
/*
* If the task is actively running on another CPU
* still, just relax and busy-wait without holding
* any locks.
*
* NOTE! Since we don't hold any locks, it's not
* even certain that "rq" stays as the right runqueue!
* But we don't care, since "task_running()" will
* return false if the runqueue has changed and p
* is actually now running somewhere else!
*/
while (task_running(rq, p))
cpu_relax();
/*
* Ok, time to look more closely! We need the rq
* lock now, to be *sure*. If we're wrong, we'll
* just go back and repeat.
*/
rq = task_rq_lock(p, &flags);
running = task_running(rq, p);
array = p->array;
task_rq_unlock(rq, &flags);
/*
* Was it really running after all now that we
* checked with the proper locks actually held?
*
* Oops. Go back and try again..
*/
if (unlikely(running)) {
cpu_relax();
goto repeat;
}
/*
* It's not enough that it's not actively running,
* it must be off the runqueue _entirely_, and not
* preempted!
*
* So if it was still runnable (but just not actively
* running right now), it's preempted, and we should
* yield - it could be a while.
*/
if (unlikely(array)) {
yield();
goto repeat;
}
/*
* Ahh, all good. It wasn't running, and it wasn't
* runnable, which means that it will never become
* running in the future either. We're all done!
*/
}
/***
* kick_process - kick a running thread to enter/exit the kernel
* @p: the to-be-kicked thread
*
* Cause a process which is running on another CPU to enter
* kernel-mode, without any delay. (to get signals handled.)
*
* NOTE: this function doesn't have to take the runqueue lock,
* because all it wants to ensure is that the remote task enters
* the kernel. If the IPI races and the task has been migrated
* to another CPU then no harm is done and the purpose has been
* achieved as well.
*/
void kick_process(struct task_struct *p)
{
int cpu;
preempt_disable();
cpu = task_cpu(p);
if ((cpu != smp_processor_id()) && task_curr(p))
smp_send_reschedule(cpu);
preempt_enable();
}
/*
* Return a low guess at the load of a migration-source cpu weighted
* according to the scheduling class and "nice" value.
*
* We want to under-estimate the load of migration sources, to
* balance conservatively.
*/
static inline unsigned long source_load(int cpu, int type)
{
struct rq *rq = cpu_rq(cpu);
if (type == 0)
return rq->raw_weighted_load;
return min(rq->cpu_load[type-1], rq->raw_weighted_load);
}
/*
* Return a high guess at the load of a migration-target cpu weighted
* according to the scheduling class and "nice" value.
*/
static inline unsigned long target_load(int cpu, int type)
{
struct rq *rq = cpu_rq(cpu);
if (type == 0)
return rq->raw_weighted_load;
return max(rq->cpu_load[type-1], rq->raw_weighted_load);
}
/*
* Return the average load per task on the cpu's run queue
*/
static inline unsigned long cpu_avg_load_per_task(int cpu)
{
struct rq *rq = cpu_rq(cpu);
unsigned long n = rq->nr_running;
return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE;
}
/*
* find_idlest_group finds and returns the least busy CPU group within the
* domain.
*/
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
{
struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups;
unsigned long min_load = ULONG_MAX, this_load = 0;
int load_idx = sd->forkexec_idx;
int imbalance = 100 + (sd->imbalance_pct-100)/2;
do {
unsigned long load, avg_load;
int local_group;
int i;
/* Skip over this group if it has no CPUs allowed */
if (!cpus_intersects(group->cpumask, p->cpus_allowed))
goto nextgroup;
local_group = cpu_isset(this_cpu, group->cpumask);
/* Tally up the load of all CPUs in the group */
avg_load = 0;
for_each_cpu_mask(i, group->cpumask) {
/* Bias balancing toward cpus of our domain */
if (local_group)
load = source_load(i, load_idx);
else
load = target_load(i, load_idx);
avg_load += load;
}
/* Adjust by relative CPU power of the group */
avg_load = sg_div_cpu_power(group,
avg_load * SCHED_LOAD_SCALE);
if (local_group) {
this_load = avg_load;
this = group;
} else if (avg_load < min_load) {
min_load = avg_load;
idlest = group;
}
nextgroup:
group = group->next;
} while (group != sd->groups);
if (!idlest || 100*this_load < imbalance*min_load)
return NULL;
return idlest;
}
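/*
* The imbalance check above is deliberately conservative: with the
* common imbalance_pct of 125, imbalance is 112, so the idlest remote
* group is only returned when the local group carries at least ~12%
* more load than it; otherwise we stay local and return NULL.
*/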
/*
* find_idlest_cpu - find the idlest cpu among the cpus in group.
*/
static int
find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
cpumask_t tmp;
unsigned long load, min_load = ULONG_MAX;
int idlest = -1;
int i;
/* Traverse only the allowed CPUs */
cpus_and(tmp, group->cpumask, p->cpus_allowed);
for_each_cpu_mask(i, tmp) {
load = weighted_cpuload(i);
if (load < min_load || (load == min_load && i == this_cpu)) {
min_load = load;
idlest = i;
}
}
return idlest;
}
/*
* sched_balance_self: balance the current task (running on cpu) in domains
* that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
* SD_BALANCE_EXEC.
*
* Balance, ie. select the least loaded group.
*
* Returns the target CPU number, or the same CPU if no balancing is needed.
*
* preempt must be disabled.
*/
static int sched_balance_self(int cpu, int flag)
{
struct task_struct *t = current;
struct sched_domain *tmp, *sd = NULL;
for_each_domain(cpu, tmp) {
/*
* If power savings logic is enabled for a domain, stop there.
*/
if (tmp->flags & SD_POWERSAVINGS_BALANCE)
break;
if (tmp->flags & flag)
sd = tmp;
}
while (sd) {
cpumask_t span;
struct sched_group *group;
int new_cpu, weight;
if (!(sd->flags & flag)) {
sd = sd->child;
continue;
}
span = sd->span;
group = find_idlest_group(sd, t, cpu);
if (!group) {
sd = sd->child;
continue;
}
new_cpu = find_idlest_cpu(group, t, cpu);
if (new_cpu == -1 || new_cpu == cpu) {
/* Now try balancing at a lower domain level of cpu */
sd = sd->child;
continue;
}
/* Now try balancing at a lower domain level of new_cpu */
cpu = new_cpu;
sd = NULL;
weight = cpus_weight(span);
for_each_domain(cpu, tmp) {
if (weight <= cpus_weight(tmp->span))
break;
if (tmp->flags & flag)
sd = tmp;
}
/* while loop will break here if sd == NULL */
}
return cpu;
}
#endif /* CONFIG_SMP */
/*
* wake_idle() will wake a task on an idle cpu if task->cpu is
* not idle and an idle cpu is available. The span of cpus to
* search starts with cpus closest then further out as needed,
* so we always favor a closer, idle cpu.
*
* Returns the CPU we should wake onto.
*/
#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
static int wake_idle(int cpu, struct task_struct *p)
{
cpumask_t tmp;
struct sched_domain *sd;
int i;
/*
* If it is idle, then it is the best cpu to run this task.
*
* This cpu is also the best if it already has more than one task.
* Siblings must also be busy (in most cases), as they didn't already
* pick up the extra load from this cpu, so we need not check the
* sibling runqueue info. This avoids the checks and the cache-miss
* penalties associated with that.
*/
if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
return cpu;
for_each_domain(cpu, sd) {
if (sd->flags & SD_WAKE_IDLE) {
cpus_and(tmp, sd->span, p->cpus_allowed);
for_each_cpu_mask(i, tmp) {
if (idle_cpu(i))
return i;
}
}
else
break;
}
return cpu;
}
#else
static inline int wake_idle(int cpu, struct task_struct *p)
{
return cpu;
}
#endif
/***
* try_to_wake_up - wake up a thread
* @p: the to-be-woken-up thread
* @state: the mask of task states that can be woken
* @sync: do a synchronous wakeup?
*
* Put it on the run-queue if it's not already there. The "current"
* thread is always on the run-queue (except when the actual
* re-schedule is in progress), and as such you're allowed to do
* the simpler "current->state = TASK_RUNNING" to mark yourself
* runnable without the overhead of this.
*
* returns failure only if the task is already active.
*/
static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
{
int cpu, this_cpu, success = 0;
unsigned long flags;
long old_state;
struct rq *rq;
#ifdef CONFIG_SMP
struct sched_domain *sd, *this_sd = NULL;
unsigned long load, this_load;
int new_cpu;
#endif
rq = task_rq_lock(p, &flags);
old_state = p->state;
if (!(old_state & state))
goto out;
if (p->array)
goto out_running;
cpu = task_cpu(p);
this_cpu = smp_processor_id();
#ifdef CONFIG_SMP
if (unlikely(task_running(rq, p)))
goto out_activate;
new_cpu = cpu;
schedstat_inc(rq, ttwu_cnt);
if (cpu == this_cpu) {
schedstat_inc(rq, ttwu_local);
goto out_set_cpu;
}
for_each_domain(this_cpu, sd) {
if (cpu_isset(cpu, sd->span)) {
schedstat_inc(