--- zzzz-none-000/linux-3.10.107/include/linux/sched.h 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/include/linux/sched.h 2021-02-04 17:41:59.000000000 +0000 @@ -3,6 +3,8 @@ #include +#include + struct sched_param { int sched_priority; @@ -16,19 +18,22 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include #include -#include +#include #include #include +#include #include #include #include @@ -52,22 +57,87 @@ #include #include #include +#include +#include #include -struct exec_domain; +#define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */ + +/* + * Extended scheduling parameters data structure. + * + * This is needed because the original struct sched_param can not be + * altered without introducing ABI issues with legacy applications + * (e.g., in sched_getparam()). + * + * However, the possibility of specifying more than just a priority for + * the tasks may be useful for a wide variety of application fields, e.g., + * multimedia, streaming, automation and control, and many others. + * + * This variant (sched_attr) is meant at describing a so-called + * sporadic time-constrained task. In such model a task is specified by: + * - the activation period or minimum instance inter-arrival time; + * - the maximum (or average, depending on the actual scheduling + * discipline) computation time of all instances, a.k.a. runtime; + * - the deadline (relative to the actual activation time) of each + * instance. + * Very briefly, a periodic (sporadic) task asks for the execution of + * some specific computation --which is typically called an instance-- + * (at most) every period. Moreover, each instance typically lasts no more + * than the runtime and must be completed by time instant t equal to + * the instance activation time + the deadline. + * + * This is reflected by the actual fields of the sched_attr structure: + * + * @size size of the structure, for fwd/bwd compat. + * + * @sched_policy task's scheduling policy + * @sched_flags for customizing the scheduler behaviour + * @sched_nice task's nice value (SCHED_NORMAL/BATCH) + * @sched_priority task's static priority (SCHED_FIFO/RR) + * @sched_deadline representative of the task's deadline + * @sched_runtime representative of the task's runtime + * @sched_period representative of the task's period + * + * Given this task model, there are a multiplicity of scheduling algorithms + * and policies, that can be used to ensure all the tasks will make their + * timing constraints. + * + * As of now, the SCHED_DEADLINE policy (sched_dl scheduling class) is the + * only user of this new interface. More information about the algorithm + * available in the scheduling class file or in Documentation/. + */ +struct sched_attr { + u32 size; + + u32 sched_policy; + u64 sched_flags; + + /* SCHED_NORMAL, SCHED_BATCH */ + s32 sched_nice; + + /* SCHED_FIFO, SCHED_RR */ + u32 sched_priority; + + /* SCHED_DEADLINE */ + u64 sched_runtime; + u64 sched_deadline; + u64 sched_period; +}; + struct futex_pi_state; struct robust_list_head; struct bio_list; struct fs_struct; struct perf_event_context; struct blk_plug; +struct filename; +struct nameidata; -/* - * List of flags we want to share for kernel threads, - * if only because they are not used by them anyway. - */ -#define CLONE_KERNEL (CLONE_FS | CLONE_FILES | CLONE_SIGHAND) +#define VMACACHE_BITS 2 +#define VMACACHE_SIZE (1U << VMACACHE_BITS) +#define VMACACHE_MASK (VMACACHE_SIZE - 1) /* * These are the constant used to fake the fixed-point load-average @@ -99,21 +169,18 @@ DECLARE_PER_CPU(unsigned long, process_counts); extern int nr_processes(void); extern unsigned long nr_running(void); +extern bool single_task_running(void); extern unsigned long nr_iowait(void); extern unsigned long nr_iowait_cpu(int cpu); -extern unsigned long this_cpu_load(void); - +extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load); extern void calc_global_load(unsigned long ticks); -extern void update_cpu_load_nohz(void); -/* Notifier for when a task gets migrated to a new CPU */ -struct task_migration_notifier { - struct task_struct *task; - int from_cpu; - int to_cpu; -}; -extern void register_task_migration_notifier(struct notifier_block *n); +#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) +extern void update_cpu_load_nohz(void); +#else +static inline void update_cpu_load_nohz(void) { } +#endif extern unsigned long get_parent_ip(unsigned long addr); @@ -125,8 +192,6 @@ #ifdef CONFIG_SCHED_DEBUG extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m); extern void proc_sched_set_task(struct task_struct *p); -extern void -print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); #endif /* @@ -145,16 +210,18 @@ #define __TASK_STOPPED 4 #define __TASK_TRACED 8 /* in tsk->exit_state */ -#define EXIT_ZOMBIE 16 -#define EXIT_DEAD 32 +#define EXIT_DEAD 16 +#define EXIT_ZOMBIE 32 +#define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD) /* in tsk->state again */ #define TASK_DEAD 64 #define TASK_WAKEKILL 128 #define TASK_WAKING 256 #define TASK_PARKED 512 -#define TASK_STATE_MAX 1024 +#define TASK_NOLOAD 1024 +#define TASK_STATE_MAX 2048 -#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP" +#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPN" extern char ___assert_task_state[1 - 2*!!( sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)]; @@ -164,6 +231,8 @@ #define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED) #define TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED) +#define TASK_IDLE (TASK_UNINTERRUPTIBLE | TASK_NOLOAD) + /* Convenience macros for the sake of wake_up */ #define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE) #define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED) @@ -171,21 +240,58 @@ /* get_task_state() */ #define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \ TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \ - __TASK_TRACED) + __TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD) #define task_is_traced(task) ((task->state & __TASK_TRACED) != 0) #define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0) -#define task_is_dead(task) ((task)->exit_state != 0) #define task_is_stopped_or_traced(task) \ ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) #define task_contributes_to_load(task) \ ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \ - (task->flags & PF_FROZEN) == 0) + (task->flags & PF_FROZEN) == 0 && \ + (task->state & TASK_NOLOAD) == 0) + +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP + +#define __set_task_state(tsk, state_value) \ + do { \ + (tsk)->task_state_change = _THIS_IP_; \ + (tsk)->state = (state_value); \ + } while (0) +#define set_task_state(tsk, state_value) \ + do { \ + (tsk)->task_state_change = _THIS_IP_; \ + smp_store_mb((tsk)->state, (state_value)); \ + } while (0) + +/* + * set_current_state() includes a barrier so that the write of current->state + * is correctly serialised wrt the caller's subsequent test of whether to + * actually sleep: + * + * set_current_state(TASK_UNINTERRUPTIBLE); + * if (do_i_need_to_sleep()) + * schedule(); + * + * If the caller does not need such serialisation then use __set_current_state() + */ +#define __set_current_state(state_value) \ + do { \ + current->task_state_change = _THIS_IP_; \ + current->state = (state_value); \ + } while (0) +#define set_current_state(state_value) \ + do { \ + current->task_state_change = _THIS_IP_; \ + smp_store_mb(current->state, (state_value)); \ + } while (0) + +#else #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) #define set_task_state(tsk, state_value) \ - set_mb((tsk)->state, (state_value)) + smp_store_mb((tsk)->state, (state_value)) /* * set_current_state() includes a barrier so that the write of current->state @@ -198,10 +304,12 @@ * * If the caller does not need such serialisation then use __set_current_state() */ -#define __set_current_state(state_value) \ +#define __set_current_state(state_value) \ do { current->state = (state_value); } while (0) -#define set_current_state(state_value) \ - set_mb(current->state, (state_value)) +#define set_current_state(state_value) \ + smp_store_mb(current->state, (state_value)) + +#endif /* Task command name length */ #define TASK_COMM_LEN 16 @@ -229,6 +337,8 @@ extern void init_idle(struct task_struct *idle, int cpu); extern void init_idle_bootup_task(struct task_struct *idle); +extern cpumask_var_t cpu_isolated_map; + extern int runqueue_is_locked(int cpu); #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) @@ -259,9 +369,6 @@ */ extern void show_stack(struct task_struct *task, unsigned long *sp); -void io_schedule(void); -long io_schedule_timeout(long timeout); - extern void cpu_init (void); extern void trap_init(void); extern void update_process_times(int user); @@ -277,6 +384,7 @@ void __user *buffer, size_t *lenp, loff_t *ppos); extern unsigned int softlockup_panic; +extern unsigned int hardlockup_panic; void lockup_detector_init(void); #else static inline void touch_softlockup_watchdog(void) @@ -293,6 +401,14 @@ } #endif +#ifdef CONFIG_DETECT_HUNG_TASK +void reset_hung_task_detector(void); +#else +static inline void reset_hung_task_detector(void) +{ +} +#endif + /* Attach to any functions which should be ignored in wchan output. */ #define __sched __attribute__((__section__(".sched.text"))) @@ -310,6 +426,13 @@ asmlinkage void schedule(void); extern void schedule_preempt_disabled(void); +extern long io_schedule_timeout(long timeout); + +static inline void io_schedule(void) +{ + io_schedule_timeout(MAX_SCHEDULE_TIMEOUT); +} + struct nsproxy; struct user_namespace; @@ -322,28 +445,37 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); -extern void arch_unmap_area(struct mm_struct *, unsigned long); -extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long); #else static inline void arch_pick_mmap_layout(struct mm_struct *mm) {} #endif - -extern void set_dumpable(struct mm_struct *mm, int value); -extern int get_dumpable(struct mm_struct *mm); - #define SUID_DUMP_DISABLE 0 /* No setuid dumping */ #define SUID_DUMP_USER 1 /* Dump as user of process */ #define SUID_DUMP_ROOT 2 /* Dump as root */ /* mm flags */ -/* dumpable bits */ -#define MMF_DUMPABLE 0 /* core dump is permitted */ -#define MMF_DUMP_SECURELY 1 /* core file is readable only by root */ +/* for SUID_DUMP_* above */ #define MMF_DUMPABLE_BITS 2 #define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1) +extern void set_dumpable(struct mm_struct *mm, int value); +/* + * This returns the actual value of the suid_dumpable flag. For things + * that are using this for checking for privilege transitions, it must + * test against SUID_DUMP_USER rather than treating it as a boolean + * value. + */ +static inline int __get_dumpable(unsigned long mm_flags) +{ + return mm_flags & MMF_DUMPABLE_MASK; +} + +static inline int get_dumpable(struct mm_struct *mm) +{ + return __get_dumpable(mm->flags); +} + /* coredump filter bits */ #define MMF_DUMP_ANON_PRIVATE 2 #define MMF_DUMP_ANON_SHARED 3 @@ -352,9 +484,11 @@ #define MMF_DUMP_ELF_HEADERS 6 #define MMF_DUMP_HUGETLB_PRIVATE 7 #define MMF_DUMP_HUGETLB_SHARED 8 +#define MMF_DUMP_DAX_PRIVATE 9 +#define MMF_DUMP_DAX_SHARED 10 #define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS -#define MMF_DUMP_FILTER_BITS 7 +#define MMF_DUMP_FILTER_BITS 9 #define MMF_DUMP_FILTER_MASK \ (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) #define MMF_DUMP_FILTER_DEFAULT \ @@ -399,39 +533,49 @@ }; /** - * struct cputime - snaphsot of system and user cputime + * struct prev_cputime - snaphsot of system and user cputime * @utime: time spent in user mode * @stime: time spent in system mode + * @lock: protects the above two fields * - * Gathers a generic snapshot of user and system time. + * Stores previous user/system time values such that we can guarantee + * monotonicity. */ -struct cputime { +struct prev_cputime { +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE cputime_t utime; cputime_t stime; + raw_spinlock_t lock; +#endif }; +static inline void prev_cputime_init(struct prev_cputime *prev) +{ +#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE + prev->utime = prev->stime = 0; + raw_spin_lock_init(&prev->lock); +#endif +} + /** * struct task_cputime - collected CPU time counts * @utime: time spent in user mode, in &cputime_t units * @stime: time spent in kernel mode, in &cputime_t units * @sum_exec_runtime: total time spent on the CPU, in nanoseconds * - * This is an extension of struct cputime that includes the total runtime - * spent by the task from the scheduler point of view. - * - * As a result, this structure groups together three kinds of CPU time - * that are tracked for threads and thread groups. Most things considering - * CPU time want to group these counts together and treat all three - * of them in parallel. + * This structure groups together three kinds of CPU time that are tracked for + * threads and thread groups. Most things considering CPU time want to group + * these counts together and treat all three of them in parallel. */ struct task_cputime { cputime_t utime; cputime_t stime; unsigned long long sum_exec_runtime; }; + /* Alternate field names when used to cache expirations. */ -#define prof_exp stime #define virt_exp utime +#define prof_exp stime #define sched_exp sum_exec_runtime #define INIT_CPUTIME \ @@ -442,28 +586,58 @@ } /* - * Disable preemption until the scheduler is running. - * Reset by start_kernel()->sched_init()->init_idle(). + * This is the atomic variant of task_cputime, which can be used for + * storing and updating task_cputime statistics without locking. + */ +struct task_cputime_atomic { + atomic64_t utime; + atomic64_t stime; + atomic64_t sum_exec_runtime; +}; + +#define INIT_CPUTIME_ATOMIC \ + (struct task_cputime_atomic) { \ + .utime = ATOMIC64_INIT(0), \ + .stime = ATOMIC64_INIT(0), \ + .sum_exec_runtime = ATOMIC64_INIT(0), \ + } + +#define PREEMPT_DISABLED (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED) + +/* + * Disable preemption until the scheduler is running -- use an unconditional + * value so that it also works on !PREEMPT_COUNT kernels. + * + * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count(). + */ +#define INIT_PREEMPT_COUNT PREEMPT_OFFSET + +/* + * Initial preempt_count value; reflects the preempt_count schedule invariant + * which states that during context switches: * - * We include PREEMPT_ACTIVE to avoid cond_resched() from working - * before the scheduler is active -- see should_resched(). + * preempt_count() == 2*PREEMPT_DISABLE_OFFSET + * + * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels. + * Note: See finish_task_switch(). */ -#define INIT_PREEMPT_COUNT (1 + PREEMPT_ACTIVE) +#define FORK_PREEMPT_COUNT (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED) /** * struct thread_group_cputimer - thread group interval timer counts - * @cputime: thread group interval timers. - * @running: non-zero when there are timers running and - * @cputime receives updates. - * @lock: lock for fields in this struct. + * @cputime_atomic: atomic thread group interval timers. + * @running: true when there are timers running and + * @cputime_atomic receives updates. + * @checking_timer: true when a thread in the group is in the + * process of checking for thread group timers. * * This structure contains the version of task_cputime, above, that is * used for thread group CPU timer calculations. */ struct thread_group_cputimer { - struct task_cputime cputime; - int running; - raw_spinlock_t lock; + struct task_cputime_atomic cputime_atomic; + bool running; + bool checking_timer; }; #include @@ -559,12 +733,11 @@ * Live threads maintain their own counters and add to these * in __exit_signal, except for the group leader. */ + seqlock_t stats_lock; cputime_t utime, stime, cutime, cstime; cputime_t gtime; cputime_t cgtime; -#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE - struct cputime prev_cputime; -#endif + struct prev_cputime prev_cputime; unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; unsigned long inblock, oublock, cinblock, coublock; @@ -601,18 +774,6 @@ unsigned audit_tty_log_passwd; struct tty_audit_buf *tty_audit_buf; #endif -#ifdef CONFIG_CGROUPS - /* - * group_rwsem prevents new tasks from entering the threadgroup and - * member tasks from exiting,a more specifically, setting of - * PF_EXITING. fork and exit paths are protected with this rwsem - * using threadgroup_change_begin/end(). Users which require - * threadgroup to remain stable should use threadgroup_[un]lock() - * which also takes care of exec path. Currently, cgroup is the - * only user. - */ - struct rw_semaphore group_rwsem; -#endif oom_flags_t oom_flags; short oom_score_adj; /* OOM kill score adjustment */ @@ -653,7 +814,6 @@ struct user_struct { atomic_t __count; /* reference count */ atomic_t processes; /* How many processes does this user have? */ - atomic_t files; /* How many open files does this user have? */ atomic_t sigpending; /* How many pending signals does this user have? */ #ifdef CONFIG_INOTIFY_USER atomic_t inotify_watches; /* How many inotify watches does this user have? */ @@ -682,7 +842,7 @@ struct hlist_node uidhash_node; kuid_t uid; -#ifdef CONFIG_PERF_EVENTS +#if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL) atomic_long_t locked_vm; #endif }; @@ -698,7 +858,7 @@ struct backing_dev_info; struct reclaim_state; -#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) +#ifdef CONFIG_SCHED_INFO struct sched_info { /* cumulative counters */ unsigned long pcount; /* # of times run on this cpu */ @@ -708,7 +868,7 @@ unsigned long long last_arrival,/* when we last ran on a cpu */ last_queued; /* when we were last queued to run */ }; -#endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */ +#endif /* CONFIG_SCHED_INFO */ #ifdef CONFIG_TASK_DELAY_ACCT struct task_delay_info { @@ -730,7 +890,7 @@ * associated with the operation is added to XXX_delay. * XXX_delay contains the accumulated delay time in nanoseconds. */ - struct timespec blkio_start, blkio_end; /* Shared by blkio, swapin */ + u64 blkio_start; /* Shared by blkio, swapin */ u64 blkio_delay; /* wait for sync block io completion */ u64 swapin_delay; /* wait for swapin block io completion */ u32 blkio_count; /* total count of the number of sync block */ @@ -738,7 +898,7 @@ u32 swapin_count; /* total count of the number of swapin block */ /* io operations performed */ - struct timespec freepages_start, freepages_end; + u64 freepages_start; u64 freepages_delay; /* wait for memory reclaim */ u32 freepages_count; /* total count of memory reclaim */ }; @@ -764,10 +924,54 @@ }; /* - * Increase resolution of cpu_power calculations + * Increase resolution of cpu_capacity calculations */ -#define SCHED_POWER_SHIFT 10 -#define SCHED_POWER_SCALE (1L << SCHED_POWER_SHIFT) +#define SCHED_CAPACITY_SHIFT 10 +#define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT) + +/* + * Wake-queues are lists of tasks with a pending wakeup, whose + * callers have already marked the task as woken internally, + * and can thus carry on. A common use case is being able to + * do the wakeups once the corresponding user lock as been + * released. + * + * We hold reference to each task in the list across the wakeup, + * thus guaranteeing that the memory is still valid by the time + * the actual wakeups are performed in wake_up_q(). + * + * One per task suffices, because there's never a need for a task to be + * in two wake queues simultaneously; it is forbidden to abandon a task + * in a wake queue (a call to wake_up_q() _must_ follow), so if a task is + * already in a wake queue, the wakeup will happen soon and the second + * waker can just skip it. + * + * The WAKE_Q macro declares and initializes the list head. + * wake_up_q() does NOT reinitialize the list; it's expected to be + * called near the end of a function, where the fact that the queue is + * not used again will be easy to see by inspection. + * + * Note that this can cause spurious wakeups. schedule() callers + * must ensure the call is done inside a loop, confirming that the + * wakeup condition has in fact occurred. + */ +struct wake_q_node { + struct wake_q_node *next; +}; + +struct wake_q_head { + struct wake_q_node *first; + struct wake_q_node **lastp; +}; + +#define WAKE_Q_TAIL ((struct wake_q_node *) 0x01) + +#define WAKE_Q(name) \ + struct wake_q_head name = { WAKE_Q_TAIL, &name.first } + +extern void wake_q_add(struct wake_q_head *head, + struct task_struct *task); +extern void wake_up_q(struct wake_q_head *head); /* * sched-domains (multiprocessor balancing) declarations: @@ -779,14 +983,35 @@ #define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */ #define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */ #define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */ -#define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */ +#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu power */ +#define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */ #define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */ #define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */ #define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */ #define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */ #define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */ +#define SD_NUMA 0x4000 /* cross-node balancing */ + +#ifdef CONFIG_SCHED_SMT +static inline int cpu_smt_flags(void) +{ + return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES; +} +#endif -extern int __weak arch_sd_sibiling_asym_packing(void); +#ifdef CONFIG_SCHED_MC +static inline int cpu_core_flags(void) +{ + return SD_SHARE_PKG_RESOURCES; +} +#endif + +#ifdef CONFIG_NUMA +static inline int cpu_numa_flags(void) +{ + return SD_NUMA; +} +#endif struct sched_domain_attr { int relax_domain_level; @@ -826,7 +1051,9 @@ unsigned int balance_interval; /* initialise to 1. units in ms. */ unsigned int nr_balance_failed; /* initialise to 0 */ - u64 last_update; + /* idle_balance() stats */ + u64 max_newidle_lb_cost; + unsigned long next_decay_max_lb_cost; #ifdef CONFIG_SCHEDSTATS /* load_balance() stats */ @@ -892,6 +1119,37 @@ bool cpus_share_cache(int this_cpu, int that_cpu); +typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); +typedef int (*sched_domain_flags_f)(void); + +#define SDTL_OVERLAP 0x01 + +struct sd_data { + struct sched_domain **__percpu sd; + struct sched_group **__percpu sg; + struct sched_group_capacity **__percpu sgc; +}; + +struct sched_domain_topology_level { + sched_domain_mask_f mask; + sched_domain_flags_f sd_flags; + int flags; + int numa_level; + struct sd_data data; +#ifdef CONFIG_SCHED_DEBUG + char *name; +#endif +}; + +extern void set_sched_topology(struct sched_domain_topology_level *tl); +extern void wake_up_if_idle(int cpu); + +#ifdef CONFIG_SCHED_DEBUG +# define SD_INIT_NAME(type) .name = #type +#else +# define SD_INIT_NAME(type) +#endif + #else /* CONFIG_SMP */ struct sched_domain_attr; @@ -925,19 +1183,28 @@ struct uts_namespace; struct load_weight { - unsigned long weight, inv_weight; + unsigned long weight; + u32 inv_weight; }; +/* + * The load_avg/util_avg accumulates an infinite geometric series. + * 1) load_avg factors frequency scaling into the amount of time that a + * sched_entity is runnable on a rq into its weight. For cfs_rq, it is the + * aggregated such weights of all runnable and blocked sched_entities. + * 2) util_avg factors frequency and cpu scaling into the amount of time + * that a sched_entity is running on a CPU, in the range [0..SCHED_LOAD_SCALE]. + * For cfs_rq, it is the aggregated such times of all runnable and + * blocked sched_entities. + * The 64 bit load_sum can: + * 1) for cfs_rq, afford 4353082796 (=2^64/47742/88761) entities with + * the highest weight (=88761) always runnable, we should not overflow + * 2) for entity, support any load.weight always runnable + */ struct sched_avg { - /* - * These sums represent an infinite geometric series and so are bound - * above by 1024/(1-y). Thus we only need a u32 to store them for for all - * choices of y < 1-2^(-32)*1024. - */ - u32 runnable_avg_sum, runnable_avg_period; - u64 last_runnable_update; - s64 decay_count; - unsigned long load_avg_contrib; + u64 last_update_time, load_sum; + u32 util_sum, period_contrib; + unsigned long load_avg, util_avg; }; #ifdef CONFIG_SCHEDSTATS @@ -994,6 +1261,7 @@ #endif #ifdef CONFIG_FAIR_GROUP_SCHED + int depth; struct sched_entity *parent; /* rq on which this entity is (to be) queued: */ struct cfs_rq *cfs_rq; @@ -1001,13 +1269,8 @@ struct cfs_rq *my_q; #endif -/* - * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be - * removed when useful for applications beyond shares distribution (e.g. - * load-balance). - */ -#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) - /* Per-entity load-tracking */ +#ifdef CONFIG_SMP + /* Per entity load average tracking */ struct sched_avg avg; #endif }; @@ -1028,7 +1291,64 @@ #endif }; +struct sched_dl_entity { + struct rb_node rb_node; + + /* + * Original scheduling parameters. Copied here from sched_attr + * during sched_setattr(), they will remain the same until + * the next sched_setattr(). + */ + u64 dl_runtime; /* maximum runtime for each instance */ + u64 dl_deadline; /* relative deadline of each instance */ + u64 dl_period; /* separation of two instances (period) */ + u64 dl_bw; /* dl_runtime / dl_deadline */ + /* + * Actual scheduling parameters. Initialized with the values above, + * they are continously updated during task execution. Note that + * the remaining runtime could be < 0 in case we are in overrun. + */ + s64 runtime; /* remaining runtime for this instance */ + u64 deadline; /* absolute deadline for this instance */ + unsigned int flags; /* specifying the scheduler behaviour */ + + /* + * Some bool flags: + * + * @dl_throttled tells if we exhausted the runtime. If so, the + * task has to wait for a replenishment to be performed at the + * next firing of dl_timer. + * + * @dl_new tells if a new instance arrived. If so we must + * start executing it with full runtime and reset its absolute + * deadline; + * + * @dl_boosted tells if we are boosted due to DI. If so we are + * outside bandwidth enforcement mechanism (but only until we + * exit the critical section); + * + * @dl_yielded tells if task gave up the cpu before consuming + * all its available runtime during the last job. + */ + int dl_throttled, dl_new, dl_boosted, dl_yielded; + + /* + * Bandwidth enforcement timer. Each -deadline task has its + * own bandwidth to be enforced, thus we need one timer per task. + */ + struct hrtimer dl_timer; +}; + +union rcu_special { + struct { + u8 blocked; + u8 need_qs; + u8 exp_need_qs; + u8 pad; /* Otherwise the compiler can store garbage here. */ + } b; /* Bits. */ + u32 s; /* Set of bits. */ +}; struct rcu_node; enum perf_event_task_context { @@ -1038,7 +1358,33 @@ perf_nr_task_contexts, }; +/* Track pages that require TLB flushes */ +struct tlbflush_unmap_batch { + /* + * Each bit set is a CPU that potentially has a TLB entry for one of + * the PFNs being flushed. See set_tlb_ubc_flush_pending(). + */ + struct cpumask cpumask; + + /* True if any bit in cpumask is set */ + bool flush_required; + + /* + * If true then the PTE was dirty when unmapped. The entry must be + * flushed before IO is initiated or a stale TLB entry potentially + * allows an update without redirtying the page. + */ + bool writable; +}; + struct task_struct { +#ifdef CONFIG_THREAD_INFO_IN_TASK + /* + * For reasons of header soup (see current_thread_info()), this + * must be the first element of task_struct. + */ + struct thread_info thread_info; +#endif volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ void *stack; atomic_t usage; @@ -1048,6 +1394,14 @@ #ifdef CONFIG_SMP struct llist_node wake_entry; int on_cpu; +#ifdef CONFIG_THREAD_INFO_IN_TASK + unsigned int cpu; /* current CPU */ +#endif + unsigned int wakee_flips; + unsigned long wakee_flip_decay_ts; + struct task_struct *last_wakee; + + int wake_cpu; #endif int on_rq; @@ -1059,21 +1413,13 @@ #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; #endif + struct sched_dl_entity dl; #ifdef CONFIG_PREEMPT_NOTIFIERS /* list of struct preempt_notifier: */ struct hlist_head preempt_notifiers; #endif - /* - * fpu_counter contains the number of consecutive context switches - * that the FPU is used. If this is over a threshold, the lazy fpu - * saving becomes unlazy to save the trap. This is an unsigned char - * so that after 256 times the counter wraps and the behavior turns - * lazy again; this to deal with bursty apps that only use FPU for - * a short time - */ - unsigned char fpu_counter; #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; #endif @@ -1084,29 +1430,31 @@ #ifdef CONFIG_PREEMPT_RCU int rcu_read_lock_nesting; - char rcu_read_unlock_special; + union rcu_special rcu_read_unlock_special; struct list_head rcu_node_entry; -#endif /* #ifdef CONFIG_PREEMPT_RCU */ -#ifdef CONFIG_TREE_PREEMPT_RCU struct rcu_node *rcu_blocked_node; -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ -#ifdef CONFIG_RCU_BOOST - struct rt_mutex *rcu_boost_mutex; -#endif /* #ifdef CONFIG_RCU_BOOST */ +#endif /* #ifdef CONFIG_PREEMPT_RCU */ +#ifdef CONFIG_TASKS_RCU + unsigned long rcu_tasks_nvcsw; + bool rcu_tasks_holdout; + struct list_head rcu_tasks_holdout_list; + int rcu_tasks_idle_cpu; +#endif /* #ifdef CONFIG_TASKS_RCU */ -#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) +#ifdef CONFIG_SCHED_INFO struct sched_info sched_info; #endif struct list_head tasks; #ifdef CONFIG_SMP struct plist_node pushable_tasks; + struct rb_node pushable_dl_tasks; #endif struct mm_struct *mm, *active_mm; -#ifdef CONFIG_COMPAT_BRK - unsigned brk_randomized:1; -#endif + /* per-thread vma caching */ + u64 vmacache_seqnum; + struct vm_area_struct *vmacache[VMACACHE_SIZE]; #if defined(SPLIT_RSS_COUNTING) struct task_rss_stat rss_stat; #endif @@ -1114,22 +1462,33 @@ int exit_state; int exit_code, exit_signal; int pdeath_signal; /* The signal sent when the parent dies */ - unsigned int jobctl; /* JOBCTL_*, siglock protected */ + unsigned long jobctl; /* JOBCTL_*, siglock protected */ /* Used for emulating ABI behavior of previous Linux versions */ unsigned int personality; - unsigned did_exec:1; - unsigned in_execve:1; /* Tell the LSMs that the process is doing an - * execve */ + /* scheduler bits, serialized by scheduler locks */ + unsigned sched_reset_on_fork:1; + unsigned sched_contributes_to_load:1; + unsigned sched_migrated:1; + unsigned :0; /* force alignment to the next boundary */ + + /* unserialized, strictly 'current' */ + unsigned in_execve:1; /* bit to tell LSMs we're in execve */ unsigned in_iowait:1; +#ifdef CONFIG_MEMCG + unsigned memcg_may_oom:1; +#endif +#ifdef CONFIG_MEMCG_KMEM + unsigned memcg_kmem_skip_account:1; +#endif +#ifdef CONFIG_COMPAT_BRK + unsigned brk_randomized:1; +#endif - /* task may not gain privileges */ - unsigned no_new_privs:1; + unsigned long atomic_flags; /* Flags needing atomic access. */ - /* Revert to default priority/policy when forking */ - unsigned sched_reset_on_fork:1; - unsigned sched_contributes_to_load:1; + struct restart_block restart_block; pid_t pid; pid_t tgid; @@ -1171,9 +1530,7 @@ cputime_t utime, stime, utimescaled, stimescaled; cputime_t gtime; -#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE - struct cputime prev_cputime; -#endif + struct prev_cputime prev_cputime; #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN seqlock_t vtime_seqlock; unsigned long long vtime_snap; @@ -1184,8 +1541,8 @@ } vtime_snap_whence; #endif unsigned long nvcsw, nivcsw; /* context switch counts */ - struct timespec start_time; /* monotonic time */ - struct timespec real_start_time; /* boot based time */ + u64 start_time; /* monotonic time in nsec */ + u64 real_start_time; /* boot based time in nsec */ /* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ unsigned long min_flt, maj_flt; @@ -1193,6 +1550,7 @@ struct list_head cpu_timers[3]; /* process credentials */ + const struct cred __rcu *ptracer_cred; /* Tracer's credentials at attach */ const struct cred __rcu *real_cred; /* objective and real subjective task * credentials (COW) */ const struct cred __rcu *cred; /* effective (overridable) subjective task @@ -1202,17 +1560,16 @@ it with task_lock()) - initialized normally by setup_new_exec */ /* file system info */ - int link_count, total_link_count; + struct nameidata *nameidata; #ifdef CONFIG_SYSVIPC /* ipc stuff */ struct sysv_sem sysvsem; + struct sysv_shm sysvshm; #endif #ifdef CONFIG_DETECT_HUNG_TASK /* hung task detection */ unsigned long last_switch_count; #endif -/* CPU-specific state of this task */ - struct thread_struct thread; /* filesystem information */ struct fs_struct *fs; /* open file information */ @@ -1229,9 +1586,7 @@ unsigned long sas_ss_sp; size_t sas_ss_size; - int (*notifier)(void *priv); - void *notifier_data; - sigset_t *notifier_mask; + struct callback_head *task_works; struct audit_context *audit_context; @@ -1251,9 +1606,12 @@ /* Protection of the PI data structures: */ raw_spinlock_t pi_lock; + struct wake_q_node wake_q; + #ifdef CONFIG_RT_MUTEXES /* PI waiters blocked on a rt_mutex held by this task */ - struct plist_head pi_waiters; + struct rb_root pi_waiters; + struct rb_node *pi_waiters_leftmost; /* Deadlock detection and priority inheritance handling */ struct rt_mutex_waiter *pi_blocked_on; #endif @@ -1337,6 +1695,9 @@ struct mutex perf_event_mutex; struct list_head perf_event_list; #endif +#ifdef CONFIG_DEBUG_PREEMPT + unsigned long preempt_disable_ip; +#endif #ifdef CONFIG_NUMA struct mempolicy *mempolicy; /* Protected by alloc_lock */ short il_next; @@ -1344,12 +1705,50 @@ #endif #ifdef CONFIG_NUMA_BALANCING int numa_scan_seq; - int numa_migrate_seq; unsigned int numa_scan_period; + unsigned int numa_scan_period_max; + int numa_preferred_nid; + unsigned long numa_migrate_retry; u64 node_stamp; /* migration stamp */ + u64 last_task_numa_placement; + u64 last_sum_exec_runtime; struct callback_head numa_work; + + struct list_head numa_entry; + struct numa_group *numa_group; + + /* + * numa_faults is an array split into four regions: + * faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer + * in this precise order. + * + * faults_memory: Exponential decaying average of faults on a per-node + * basis. Scheduling placement decisions are made based on these + * counts. The values remain static for the duration of a PTE scan. + * faults_cpu: Track the nodes the process was running on when a NUMA + * hinting fault was incurred. + * faults_memory_buffer and faults_cpu_buffer: Record faults per node + * during the current scan window. When the scan completes, the counts + * in faults_memory and faults_cpu decay and these values are copied. + */ + unsigned long *numa_faults; + unsigned long total_numa_faults; + + /* + * numa_faults_locality tracks if faults recorded during the last + * scan window were remote/local or failed to migrate. The task scan + * period is adapted based on the locality of the faults with different + * weights depending on whether they were shared or private faults + */ + unsigned long numa_faults_locality[3]; + + unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH + struct tlbflush_unmap_batch tlb_ubc; +#endif + struct rcu_head rcu; /* @@ -1384,6 +1783,9 @@ unsigned long timer_slack_ns; unsigned long default_timer_slack_ns; +#ifdef CONFIG_KASAN + unsigned int kasan_depth; +#endif #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* Index of current stored address in ret_stack */ int curr_ret_stack; @@ -1405,23 +1807,13 @@ /* bitmask and counter of trace recursion */ unsigned long trace_recursion; #endif /* CONFIG_TRACING */ -#ifdef CONFIG_MEMCG /* memcg uses this to do batch job */ - struct memcg_batch_info { - int do_batch; /* incremented when batch uncharge started */ - struct mem_cgroup *memcg; /* target memcg of uncharge */ - unsigned long nr_pages; /* uncharged usage */ - unsigned long memsw_nr_pages; /* uncharged mem+swap usage */ - } memcg_batch; - unsigned int memcg_kmem_skip_account; - struct memcg_oom_info { - struct mem_cgroup *memcg; - gfp_t gfp_mask; - int order; - unsigned int may_oom:1; - } memcg_oom; -#endif -#ifdef CONFIG_HAVE_HW_BREAKPOINT - atomic_t ptrace_bp_refcnt; +#ifdef CONFIG_MEMCG + struct mem_cgroup *memcg_in_oom; + gfp_t memcg_oom_gfp_mask; + int memcg_oom_order; + + /* number of pages to reclaim on returning to userland */ + unsigned int memcg_nr_pages_over_high; #endif #ifdef CONFIG_UPROBES struct uprobe_task *utask; @@ -1430,21 +1822,62 @@ unsigned int sequential_io; unsigned int sequential_io_avg; #endif +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP + unsigned long task_state_change; +#endif + int pagefault_disabled; +/* CPU-specific state of this task */ + struct thread_struct thread; +/* + * WARNING: on x86, 'thread_struct' contains a variable-sized + * structure. It *MUST* be at the end of 'task_struct'. + * + * Do not put anything below here! + */ }; +#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT +extern int arch_task_struct_size __read_mostly; +#else +# define arch_task_struct_size (sizeof(struct task_struct)) +#endif + /* Future-safe accessor for struct task_struct's cpus_allowed. */ #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) +#define TNF_MIGRATED 0x01 +#define TNF_NO_GROUP 0x02 +#define TNF_SHARED 0x04 +#define TNF_FAULT_LOCAL 0x08 +#define TNF_MIGRATE_FAIL 0x10 + #ifdef CONFIG_NUMA_BALANCING -extern void task_numa_fault(int node, int pages, bool migrated); +extern void task_numa_fault(int last_node, int node, int pages, int flags); +extern pid_t task_numa_group_id(struct task_struct *p); extern void set_numabalancing_state(bool enabled); +extern void task_numa_free(struct task_struct *p); +extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page, + int src_nid, int dst_cpu); #else -static inline void task_numa_fault(int node, int pages, bool migrated) +static inline void task_numa_fault(int last_node, int node, int pages, + int flags) { } +static inline pid_t task_numa_group_id(struct task_struct *p) +{ + return 0; +} static inline void set_numabalancing_state(bool enabled) { } +static inline void task_numa_free(struct task_struct *p) +{ +} +static inline bool should_numa_migrate_memory(struct task_struct *p, + struct page *page, int src_nid, int dst_cpu) +{ + return true; +} #endif static inline struct pid *task_pid(struct task_struct *task) @@ -1520,6 +1953,24 @@ } +static inline int pid_alive(const struct task_struct *p); +static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns) +{ + pid_t pid = 0; + + rcu_read_lock(); + if (pid_alive(tsk)) + pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns); + rcu_read_unlock(); + + return pid; +} + +static inline pid_t task_ppid_nr(const struct task_struct *tsk) +{ + return task_ppid_nr_ns(tsk, &init_pid_ns); +} + static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) { @@ -1556,21 +2007,26 @@ * Test if a process is not yet dead (at most zombie state) * If pid_alive fails, then pointers within the task structure * can be stale and must not be dereferenced. + * + * Return: 1 if the process is alive. 0 otherwise. */ -static inline int pid_alive(struct task_struct *p) +static inline int pid_alive(const struct task_struct *p) { return p->pids[PIDTYPE_PID].pid != NULL; } /** - * is_global_init - check if a task structure is init + * is_global_init - check if a task structure is init. Since init + * is free to have sub-threads we need to check tgid. * @tsk: Task structure to be checked. * * Check if a task structure is the first user space task the kernel created. + * + * Return: 1 if the task structure is init. 0 otherwise. */ static inline int is_global_init(struct task_struct *tsk) { - return tsk->pid == 1; + return task_tgid_nr(tsk) == 1; } extern struct pid *cad_pid; @@ -1645,13 +2101,11 @@ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */ #define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */ -#define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */ -#define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */ #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ -#define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */ #define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */ #define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */ +#define PF_SUSPEND_TASK 0x80000000 /* this thread called freeze_processes and should not be frozen */ /* * Only the _current_ task can read/write to tsk->flags, but other @@ -1700,6 +2154,33 @@ current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags; } +/* Per-process atomic flags. */ +#define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. */ +#define PFA_SPREAD_PAGE 1 /* Spread page cache over cpuset */ +#define PFA_SPREAD_SLAB 2 /* Spread some slab caches over cpuset */ + + +#define TASK_PFA_TEST(name, func) \ + static inline bool task_##func(struct task_struct *p) \ + { return test_bit(PFA_##name, &p->atomic_flags); } +#define TASK_PFA_SET(name, func) \ + static inline void task_set_##func(struct task_struct *p) \ + { set_bit(PFA_##name, &p->atomic_flags); } +#define TASK_PFA_CLEAR(name, func) \ + static inline void task_clear_##func(struct task_struct *p) \ + { clear_bit(PFA_##name, &p->atomic_flags); } + +TASK_PFA_TEST(NO_NEW_PRIVS, no_new_privs) +TASK_PFA_SET(NO_NEW_PRIVS, no_new_privs) + +TASK_PFA_TEST(SPREAD_PAGE, spread_page) +TASK_PFA_SET(SPREAD_PAGE, spread_page) +TASK_PFA_CLEAR(SPREAD_PAGE, spread_page) + +TASK_PFA_TEST(SPREAD_SLAB, spread_slab) +TASK_PFA_SET(SPREAD_SLAB, spread_slab) +TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab) + /* * task->jobctl flags */ @@ -1713,49 +2194,38 @@ #define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */ #define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */ -#define JOBCTL_STOP_DEQUEUED (1 << JOBCTL_STOP_DEQUEUED_BIT) -#define JOBCTL_STOP_PENDING (1 << JOBCTL_STOP_PENDING_BIT) -#define JOBCTL_STOP_CONSUME (1 << JOBCTL_STOP_CONSUME_BIT) -#define JOBCTL_TRAP_STOP (1 << JOBCTL_TRAP_STOP_BIT) -#define JOBCTL_TRAP_NOTIFY (1 << JOBCTL_TRAP_NOTIFY_BIT) -#define JOBCTL_TRAPPING (1 << JOBCTL_TRAPPING_BIT) -#define JOBCTL_LISTENING (1 << JOBCTL_LISTENING_BIT) +#define JOBCTL_STOP_DEQUEUED (1UL << JOBCTL_STOP_DEQUEUED_BIT) +#define JOBCTL_STOP_PENDING (1UL << JOBCTL_STOP_PENDING_BIT) +#define JOBCTL_STOP_CONSUME (1UL << JOBCTL_STOP_CONSUME_BIT) +#define JOBCTL_TRAP_STOP (1UL << JOBCTL_TRAP_STOP_BIT) +#define JOBCTL_TRAP_NOTIFY (1UL << JOBCTL_TRAP_NOTIFY_BIT) +#define JOBCTL_TRAPPING (1UL << JOBCTL_TRAPPING_BIT) +#define JOBCTL_LISTENING (1UL << JOBCTL_LISTENING_BIT) #define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY) #define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK) extern bool task_set_jobctl_pending(struct task_struct *task, - unsigned int mask); + unsigned long mask); extern void task_clear_jobctl_trapping(struct task_struct *task); extern void task_clear_jobctl_pending(struct task_struct *task, - unsigned int mask); - -#ifdef CONFIG_PREEMPT_RCU - -#define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */ -#define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */ + unsigned long mask); static inline void rcu_copy_process(struct task_struct *p) { +#ifdef CONFIG_PREEMPT_RCU p->rcu_read_lock_nesting = 0; - p->rcu_read_unlock_special = 0; -#ifdef CONFIG_TREE_PREEMPT_RCU + p->rcu_read_unlock_special.s = 0; p->rcu_blocked_node = NULL; -#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ -#ifdef CONFIG_RCU_BOOST - p->rcu_boost_mutex = NULL; -#endif /* #ifdef CONFIG_RCU_BOOST */ INIT_LIST_HEAD(&p->rcu_node_entry); +#endif /* #ifdef CONFIG_PREEMPT_RCU */ +#ifdef CONFIG_TASKS_RCU + p->rcu_tasks_holdout = false; + INIT_LIST_HEAD(&p->rcu_tasks_holdout_list); + p->rcu_tasks_idle_cpu = -1; +#endif /* #ifdef CONFIG_TASKS_RCU */ } -#else - -static inline void rcu_copy_process(struct task_struct *p) -{ -} - -#endif - static inline void tsk_restore_flags(struct task_struct *task, unsigned long orig_flags, unsigned long flags) { @@ -1763,6 +2233,10 @@ task->flags |= orig_flags & flags; } +extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, + const struct cpumask *trial); +extern int task_can_attach(struct task_struct *p, + const struct cpumask *cs_cpus_allowed); #ifdef CONFIG_SMP extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); @@ -1791,13 +2265,6 @@ static inline void calc_load_exit_idle(void) { } #endif /* CONFIG_NO_HZ_COMMON */ -#ifndef CONFIG_CPUMASK_OFFSTACK -static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) -{ - return set_cpus_allowed_ptr(p, &new_mask); -} -#endif - /* * Do not use outside of architecture code which knows its limitations. * @@ -1812,6 +2279,7 @@ */ extern u64 cpu_clock(int cpu); extern u64 local_clock(void); +extern u64 running_clock(void); extern u64 sched_clock_cpu(int cpu); @@ -1836,7 +2304,9 @@ * but then during bootup it turns out that sched_clock() * is reliable after all: */ -extern int sched_clock_stable; +extern int sched_clock_stable(void); +extern void set_sched_clock_stable(void); +extern void clear_sched_clock_stable(void); extern void sched_clock_tick(void); extern void sched_clock_idle_sleep_event(void); @@ -1904,10 +2374,19 @@ static inline void sched_autogroup_exit(struct signal_struct *sig) { } #endif -extern bool yield_to(struct task_struct *p, bool preempt); +extern int yield_to(struct task_struct *p, bool preempt); extern void set_user_nice(struct task_struct *p, long nice); extern int task_prio(const struct task_struct *p); -extern int task_nice(const struct task_struct *p); +/** + * task_nice - return the nice value of a given task. + * @p: the task in question. + * + * Return: The nice value [ -20 ... 0 ... 19 ]. + */ +static inline int task_nice(const struct task_struct *p) +{ + return PRIO_TO_NICE((p)->static_prio); +} extern int can_nice(const struct task_struct *p, const int nice); extern int task_curr(const struct task_struct *p); extern int idle_cpu(int cpu); @@ -1915,10 +2394,14 @@ const struct sched_param *); extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); +extern int sched_setattr(struct task_struct *, + const struct sched_attr *); extern struct task_struct *idle_task(int cpu); /** * is_idle_task - is the specified task an idle task? * @p: the task in question. + * + * Return: 1 if @p is an idle task. 0 otherwise. */ static inline bool is_idle_task(const struct task_struct *p) { @@ -1929,13 +2412,10 @@ void yield(void); -/* - * The default (Linux) execution domain. - */ -extern struct exec_domain default_exec_domain; - union thread_union { +#ifndef CONFIG_THREAD_INFO_IN_TASK struct thread_info thread_info; +#endif unsigned long stack[THREAD_SIZE/sizeof(long)]; }; @@ -1971,8 +2451,6 @@ extern struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns); -extern void __set_special_pids(struct pid *pid); - /* per-UID process charging. */ extern struct user_struct * alloc_uid(kuid_t); static inline struct user_struct *get_uid(struct user_struct *u) @@ -1994,31 +2472,38 @@ #else static inline void kick_process(struct task_struct *tsk) { } #endif -extern void sched_fork(struct task_struct *p); +extern int sched_fork(unsigned long clone_flags, struct task_struct *p); extern void sched_dead(struct task_struct *p); extern void proc_caches_init(void); extern void flush_signals(struct task_struct *); -extern void __flush_signals(struct task_struct *); extern void ignore_signals(struct task_struct *); extern void flush_signal_handlers(struct task_struct *, int force_default); extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info); -static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) +static inline int kernel_dequeue_signal(siginfo_t *info) { - unsigned long flags; + struct task_struct *tsk = current; + siginfo_t __info; int ret; - spin_lock_irqsave(&tsk->sighand->siglock, flags); - ret = dequeue_signal(tsk, mask, info); - spin_unlock_irqrestore(&tsk->sighand->siglock, flags); + spin_lock_irq(&tsk->sighand->siglock); + ret = dequeue_signal(tsk, &tsk->blocked, info ?: &__info); + spin_unlock_irq(&tsk->sighand->siglock); return ret; } -extern void block_all_signals(int (*notifier)(void *priv), void *priv, - sigset_t *mask); -extern void unblock_all_signals(void); +static inline void kernel_signal_stop(void) +{ + spin_lock_irq(¤t->sighand->siglock); + if (current->jobctl & JOBCTL_STOP_DEQUEUED) + __set_current_state(TASK_STOPPED); + spin_unlock_irq(¤t->sighand->siglock); + + schedule(); +} + extern void release_task(struct task_struct * p); extern int send_sig_info(int, struct siginfo *, struct task_struct *); extern int force_sigsegv(int, struct task_struct *); @@ -2080,8 +2565,10 @@ static inline int sas_ss_flags(unsigned long sp) { - return (current->sas_ss_size == 0 ? SS_DISABLE - : on_sig_stack(sp) ? SS_ONSTACK : 0); + if (!current->sas_ss_size) + return SS_DISABLE; + + return on_sig_stack(sp) ? SS_ONSTACK : 0; } static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig) @@ -2120,11 +2607,23 @@ extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode); /* Remove the current tasks stale references to the old mm_struct */ extern void mm_release(struct task_struct *, struct mm_struct *); -/* Allocate a new mm structure and copy contents from tsk->mm */ -extern struct mm_struct *dup_mm(struct task_struct *tsk); +#ifdef CONFIG_HAVE_COPY_THREAD_TLS +extern int copy_thread_tls(unsigned long, unsigned long, unsigned long, + struct task_struct *, unsigned long); +#else extern int copy_thread(unsigned long, unsigned long, unsigned long, struct task_struct *); + +/* Architectures that haven't opted into copy_thread_tls get the tls argument + * via pt_regs, so ignore the tls argument passed via C. */ +static inline int copy_thread_tls( + unsigned long clone_flags, unsigned long sp, unsigned long arg, + struct task_struct *p, unsigned long tls) +{ + return copy_thread(clone_flags, sp, arg, p); +} +#endif extern void flush_thread(void); extern void exit_thread(void); @@ -2136,17 +2635,23 @@ extern void do_group_exit(int); -extern int allow_signal(int); -extern int disallow_signal(int); - -extern int do_execve(const char *, +extern int do_execve(struct filename *, const char __user * const __user *, const char __user * const __user *); +extern int do_execveat(int, struct filename *, + const char __user * const __user *, + const char __user * const __user *, + int); +extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long); extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *); struct task_struct *fork_idle(int); extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); -extern void set_task_comm(struct task_struct *tsk, char *from); +extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec); +static inline void set_task_comm(struct task_struct *tsk, const char *from) +{ + __set_task_comm(tsk, from, false); +} extern char *get_task_comm(char *to, struct task_struct *tsk); #ifdef CONFIG_SMP @@ -2161,6 +2666,9 @@ } #endif +#define tasklist_empty() \ + list_empty(&init_task.tasks) + #define next_task(p) \ list_entry_rcu((p)->tasks.next, struct task_struct, tasks) @@ -2269,58 +2777,54 @@ spin_unlock_irqrestore(&tsk->sighand->siglock, *flags); } -#ifdef CONFIG_CGROUPS +/** + * threadgroup_change_begin - mark the beginning of changes to a threadgroup + * @tsk: task causing the changes + * + * All operations which modify a threadgroup - a new thread joining the + * group, death of a member thread (the assertion of PF_EXITING) and + * exec(2) dethreading the process and replacing the leader - are wrapped + * by threadgroup_change_{begin|end}(). This is to provide a place which + * subsystems needing threadgroup stability can hook into for + * synchronization. + */ static inline void threadgroup_change_begin(struct task_struct *tsk) { - down_read(&tsk->signal->group_rwsem); -} -static inline void threadgroup_change_end(struct task_struct *tsk) -{ - up_read(&tsk->signal->group_rwsem); + might_sleep(); + cgroup_threadgroup_change_begin(tsk); } /** - * threadgroup_lock - lock threadgroup - * @tsk: member task of the threadgroup to lock - * - * Lock the threadgroup @tsk belongs to. No new task is allowed to enter - * and member tasks aren't allowed to exit (as indicated by PF_EXITING) or - * change ->group_leader/pid. This is useful for cases where the threadgroup - * needs to stay stable across blockable operations. - * - * fork and exit paths explicitly call threadgroup_change_{begin|end}() for - * synchronization. While held, no new task will be added to threadgroup - * and no existing live task will have its PF_EXITING set. + * threadgroup_change_end - mark the end of changes to a threadgroup + * @tsk: task causing the changes * - * de_thread() does threadgroup_change_{begin|end}() when a non-leader - * sub-thread becomes a new leader. + * See threadgroup_change_begin(). */ -static inline void threadgroup_lock(struct task_struct *tsk) +static inline void threadgroup_change_end(struct task_struct *tsk) { - down_write(&tsk->signal->group_rwsem); + cgroup_threadgroup_change_end(tsk); } -/** - * threadgroup_unlock - unlock threadgroup - * @tsk: member task of the threadgroup to unlock - * - * Reverse threadgroup_lock(). - */ -static inline void threadgroup_unlock(struct task_struct *tsk) +#ifdef CONFIG_THREAD_INFO_IN_TASK + +static inline struct thread_info *task_thread_info(struct task_struct *task) { - up_write(&tsk->signal->group_rwsem); + return &task->thread_info; +} +static inline void *task_stack_page(const struct task_struct *task) +{ + return task->stack; +} +#define setup_thread_stack(new,old) do { } while(0) +static inline unsigned long *end_of_stack(const struct task_struct *task) +{ + return task->stack; } -#else -static inline void threadgroup_change_begin(struct task_struct *tsk) {} -static inline void threadgroup_change_end(struct task_struct *tsk) {} -static inline void threadgroup_lock(struct task_struct *tsk) {} -static inline void threadgroup_unlock(struct task_struct *tsk) {} -#endif -#ifndef __HAVE_THREAD_FUNCTIONS +#elif !defined(__HAVE_THREAD_FUNCTIONS) #define task_thread_info(task) ((struct thread_info *)(task)->stack) -#define task_stack_page(task) ((task)->stack) +#define task_stack_page(task) ((void *)(task)->stack) static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org) { @@ -2328,12 +2832,27 @@ task_thread_info(p)->task = p; } +/* + * Return the address of the last usable long on the stack. + * + * When the stack grows down, this is just above the thread + * info struct. Going any lower will corrupt the threadinfo. + * + * When the stack grows up, this is the highest address. + * Beyond that position, we corrupt data on the next page. + */ static inline unsigned long *end_of_stack(struct task_struct *p) { +#ifdef CONFIG_STACK_GROWSUP + return (unsigned long *)((unsigned long)task_thread_info(p) + THREAD_SIZE) - 1; +#else return (unsigned long *)(task_thread_info(p) + 1); +#endif } #endif +#define task_stack_end_corrupted(task) \ + (*(end_of_stack(task)) != STACK_END_MAGIC) static inline int object_is_on_stack(void *obj) { @@ -2356,6 +2875,7 @@ return (unsigned long)n - (unsigned long)end_of_stack(p); } #endif +extern void set_task_stack_end_magic(struct task_struct *tsk); /* set thread flags in other task's structures * - see asm/thread_info.h for TIF_xxxx flags available @@ -2431,11 +2951,6 @@ return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p); } -static inline int need_resched(void) -{ - return unlikely(test_thread_flag(TIF_NEED_RESCHED)); -} - /* * cond_resched() and cond_resched_lock(): latency reduction via * explicit rescheduling in places that are safe. The return @@ -2446,30 +2961,33 @@ extern int _cond_resched(void); #define cond_resched() ({ \ - __might_sleep(__FILE__, __LINE__, 0); \ + ___might_sleep(__FILE__, __LINE__, 0); \ _cond_resched(); \ }) extern int __cond_resched_lock(spinlock_t *lock); -#ifdef CONFIG_PREEMPT_COUNT -#define PREEMPT_LOCK_OFFSET PREEMPT_OFFSET -#else -#define PREEMPT_LOCK_OFFSET 0 -#endif - #define cond_resched_lock(lock) ({ \ - __might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \ + ___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\ __cond_resched_lock(lock); \ }) extern int __cond_resched_softirq(void); #define cond_resched_softirq() ({ \ - __might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \ + ___might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \ __cond_resched_softirq(); \ }) +static inline void cond_resched_rcu(void) +{ +#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU) + rcu_read_unlock(); + cond_resched(); + rcu_read_lock(); +#endif +} + /* * Does a critical section need to be broken due to another * task waiting?: (technically does not depend on CONFIG_PREEMPT, @@ -2486,18 +3004,17 @@ /* * Idle thread specific functions to determine the need_resched - * polling state. We have two versions, one based on TS_POLLING in - * thread_info.status and one based on TIF_POLLING_NRFLAG in - * thread_info.flags + * polling state. */ -#ifdef TS_POLLING +#ifdef TIF_POLLING_NRFLAG static inline int tsk_is_polling(struct task_struct *p) { - return task_thread_info(p)->status & TS_POLLING; + return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG); } + static inline void __current_set_polling(void) { - current_thread_info()->status |= TS_POLLING; + set_thread_flag(TIF_POLLING_NRFLAG); } static inline bool __must_check current_set_polling_and_test(void) @@ -2506,16 +3023,16 @@ /* * Polling state must be visible before we test NEED_RESCHED, - * paired by resched_task() + * paired by resched_curr() */ - smp_mb(); + smp_mb__after_atomic(); return unlikely(tif_need_resched()); } static inline void __current_clr_polling(void) { - current_thread_info()->status &= ~TS_POLLING; + clear_thread_flag(TIF_POLLING_NRFLAG); } static inline bool __must_check current_clr_polling_and_test(void) @@ -2524,70 +3041,47 @@ /* * Polling state must be visible before we test NEED_RESCHED, - * paired by resched_task() + * paired by resched_curr() */ - smp_mb(); + smp_mb__after_atomic(); return unlikely(tif_need_resched()); } -#elif defined(TIF_POLLING_NRFLAG) -static inline int tsk_is_polling(struct task_struct *p) -{ - return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG); -} -static inline void __current_set_polling(void) -{ - set_thread_flag(TIF_POLLING_NRFLAG); -} +#else +static inline int tsk_is_polling(struct task_struct *p) { return 0; } +static inline void __current_set_polling(void) { } +static inline void __current_clr_polling(void) { } static inline bool __must_check current_set_polling_and_test(void) { - __current_set_polling(); - - /* - * Polling state must be visible before we test NEED_RESCHED, - * paired by resched_task() - * - * XXX: assumes set/clear bit are identical barrier wise. - */ - smp_mb__after_clear_bit(); - return unlikely(tif_need_resched()); } - -static inline void __current_clr_polling(void) +static inline bool __must_check current_clr_polling_and_test(void) { - clear_thread_flag(TIF_POLLING_NRFLAG); + return unlikely(tif_need_resched()); } +#endif -static inline bool __must_check current_clr_polling_and_test(void) +static inline void current_clr_polling(void) { __current_clr_polling(); /* - * Polling state must be visible before we test NEED_RESCHED, - * paired by resched_task() + * Ensure we check TIF_NEED_RESCHED after we clear the polling bit. + * Once the bit is cleared, we'll get IPIs with every new + * TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also + * fold. */ - smp_mb__after_clear_bit(); + smp_mb(); /* paired with resched_curr() */ - return unlikely(tif_need_resched()); + preempt_fold_need_resched(); } -#else -static inline int tsk_is_polling(struct task_struct *p) { return 0; } -static inline void __current_set_polling(void) { } -static inline void __current_clr_polling(void) { } - -static inline bool __must_check current_set_polling_and_test(void) +static __always_inline bool need_resched(void) { return unlikely(tif_need_resched()); } -static inline bool __must_check current_clr_polling_and_test(void) -{ - return unlikely(tif_need_resched()); -} -#endif /* * Thread group CPU time accounting. @@ -2595,11 +3089,6 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times); void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times); -static inline void thread_group_cputime_init(struct signal_struct *sig) -{ - raw_spin_lock_init(&sig->cputimer.lock); -} - /* * Reevaluate whether the task has signals pending delivery. * Wake the task if so. @@ -2627,7 +3116,16 @@ static inline unsigned int task_cpu(const struct task_struct *p) { +#ifdef CONFIG_THREAD_INFO_IN_TASK + return p->cpu; +#else return task_thread_info(p)->cpu; +#endif +} + +static inline int task_node(const struct task_struct *p) +{ + return cpu_to_node(task_cpu(p)); } extern void set_task_cpu(struct task_struct *p, unsigned int cpu); @@ -2697,29 +3195,24 @@ #define TASK_SIZE_OF(tsk) TASK_SIZE #endif -#ifdef CONFIG_MM_OWNER +#ifdef CONFIG_MEMCG extern void mm_update_next_owner(struct mm_struct *mm); -extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); #else static inline void mm_update_next_owner(struct mm_struct *mm) { } - -static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p) -{ -} -#endif /* CONFIG_MM_OWNER */ +#endif /* CONFIG_MEMCG */ static inline unsigned long task_rlimit(const struct task_struct *tsk, unsigned int limit) { - return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_cur); + return READ_ONCE(tsk->signal->rlim[limit].rlim_cur); } static inline unsigned long task_rlimit_max(const struct task_struct *tsk, unsigned int limit) { - return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_max); + return READ_ONCE(tsk->signal->rlim[limit].rlim_max); } static inline unsigned long rlimit(unsigned int limit)