--- zzzz-none-000/linux-3.10.107/kernel/sched/cputime.c	2017-06-27 09:49:32.000000000 +0000
+++ scorpion-7490-727/linux-3.10.107/kernel/sched/cputime.c	2021-02-04 17:41:59.000000000 +0000
@@ -22,6 +22,7 @@
  */
 DEFINE_PER_CPU(u64, cpu_hardirq_time);
 DEFINE_PER_CPU(u64, cpu_softirq_time);
+DEFINE_PER_CPU(u64, cpu_ksoftirqd_time);
 
 static DEFINE_PER_CPU(u64, irq_start_time);
 static int sched_clock_irqtime;
@@ -68,8 +69,19 @@
 	 */
 	if (hardirq_count())
 		__this_cpu_add(cpu_hardirq_time, delta);
-	else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
-		__this_cpu_add(cpu_softirq_time, delta);
+	else if (softirq_count()) {
+		/*
+		 * With the original IRQ time accounting patch,
+		 * cpu_softirq_time does not account for softirq time spent
+		 * while ksoftirqd is running, so we store that time in
+		 * cpu_ksoftirqd_time. We later use it along with
+		 * cpu_softirq_time to update cpustat[CPUTIME_SOFTIRQ].
+		 */
+		if (curr == this_cpu_ksoftirqd())
+			__this_cpu_add(cpu_ksoftirqd_time, delta);
+		else
+			__this_cpu_add(cpu_softirq_time, delta);
+	}
 
 	irq_time_write_end();
 	local_irq_restore(flags);
@@ -99,7 +111,22 @@
 	int ret = 0;
 
 	local_irq_save(flags);
-	latest_ns = this_cpu_read(cpu_softirq_time);
+	/*
+	 * When ksoftirqd runs, cpu_softirq_time is not updated (so that the
+	 * time is accounted to the ksoftirqd task itself), but
+	 * cpustat[CPUTIME_SOFTIRQ] continues to be incremented. The
+	 * difference between cpustat[CPUTIME_SOFTIRQ] and cpu_softirq_time
+	 * therefore grows, proportionally to how long ksoftirqd runs. After
+	 * ksoftirqd sleeps, cpu_softirq_time starts being updated again, but
+	 * because of the accumulated difference it takes a while to catch up
+	 * with cpustat[CPUTIME_SOFTIRQ]. During that period this function
+	 * returns 0, which causes incorrect softirq values in /proc/stat.
+	 * The solution is to plug the difference using another variable
+	 * (cpu_ksoftirqd_time), which is updated while ksoftirqd runs.
+	 */
+	latest_ns = this_cpu_read(cpu_softirq_time) +
+		    this_cpu_read(cpu_ksoftirqd_time);
 	if (nsecs_to_cputime64(latest_ns) > cpustat[CPUTIME_SOFTIRQ])
 		ret = 1;
 	local_irq_restore(flags);
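
To make the comparison above concrete, here is a small standalone C sketch with made-up
numbers: units are simplified to milliseconds, the variable names only mirror the patch,
and the cpustat/tick machinery is reduced to plain variables, so this is an illustration
of the check rather than the kernel's actual code path. It shows why the old check can keep
reporting nothing long after ksoftirqd has handled softirqs, while summing in
cpu_ksoftirqd_time restores the expected /proc/stat behaviour.

#include <stdio.h>

int main(void)
{
	/* Made-up state, in milliseconds for readability. */
	unsigned long long cpustat_softirq    = 100; /* softirq time already shown in /proc/stat */
	unsigned long long cpu_softirq_time   = 100; /* ns-based softirq time outside ksoftirqd */
	unsigned long long cpu_ksoftirqd_time = 0;   /* softirq time spent in ksoftirqd */

	/*
	 * ksoftirqd handles softirqs for 40ms: its ticks are charged to
	 * cpustat[CPUTIME_SOFTIRQ] directly, while cpu_softirq_time is left
	 * alone so that the time is attributed to the ksoftirqd task.
	 */
	cpustat_softirq    += 40;
	cpu_ksoftirqd_time += 40;

	/* Then 10ms of softirq work happens in plain interrupt context. */
	cpu_softirq_time += 10;

	printf("old check: %s\n", cpu_softirq_time > cpustat_softirq ?
	       "charge tick to softirq" : "nothing to report");
	printf("new check: %s\n",
	       cpu_softirq_time + cpu_ksoftirqd_time > cpustat_softirq ?
	       "charge tick to softirq" : "nothing to report");
	return 0;
}
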
@@ -121,7 +148,7 @@
 	 * is the only cgroup, then nothing else should be necessary.
 	 *
 	 */
-	__get_cpu_var(kernel_cpustat).cpustat[index] += tmp;
+	__this_cpu_add(kernel_cpustat.cpustat[index], tmp);
 
 	cpuacct_account_field(p, index, tmp);
 }
@@ -142,7 +169,7 @@
 	p->utimescaled += cputime_scaled;
 	account_group_user_time(p, cputime);
 
-	index = (TASK_NICE(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
+	index = (task_nice(p) > 0) ? CPUTIME_NICE : CPUTIME_USER;
 
 	/* Add user time to cpustat. */
 	task_group_account_field(p, index, (__force u64) cputime);
@@ -169,7 +196,7 @@
 	p->gtime += cputime;
 
 	/* Add guest time to cpustat. */
-	if (TASK_NICE(p) > 0) {
+	if (task_nice(p) > 0) {
 		cpustat[CPUTIME_NICE] += (__force u64) cputime;
 		cpustat[CPUTIME_GUEST_NICE] += (__force u64) cputime;
 	} else {
@@ -258,16 +285,22 @@
 {
 #ifdef CONFIG_PARAVIRT
 	if (static_key_false(&paravirt_steal_enabled)) {
-		u64 steal, st = 0;
+		u64 steal;
+		unsigned long steal_jiffies;
 
 		steal = paravirt_steal_clock(smp_processor_id());
 		steal -= this_rq()->prev_steal_time;
 
-		st = steal_ticks(steal);
-		this_rq()->prev_steal_time += st * TICK_NSEC;
+		/*
+		 * steal is in nsecs but our caller is expecting steal
+		 * time in jiffies. Let's cast the result to jiffies
+		 * granularity and account the rest on the next rounds.
+		 */
+		steal_jiffies = nsecs_to_jiffies(steal);
+		this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies);
 
-		account_steal_time(st);
-		return st;
+		account_steal_time(jiffies_to_cputime(steal_jiffies));
+		return steal_jiffies;
 	}
 #endif
 	return false;
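
The remainder handling in the hunk above is easier to follow with concrete numbers. The
sketch below is a minimal userspace model of the same bookkeeping, not the kernel
implementation: the HZ value, the tick length, the plain division standing in for
nsecs_to_jiffies(), and the account_steal() helper are all illustrative assumptions. Only
whole jiffies are accounted on each call; the sub-jiffy remainder stays in the "steal clock
minus prev_steal_time" difference and is picked up on a later round.

#include <stdio.h>
#include <stdint.h>

#define HZ        100                    /* assumed tick rate */
#define TICK_NSEC (1000000000ULL / HZ)   /* 10 ms per jiffy */

static uint64_t steal_clock;      /* total steal time reported by the hypervisor, in ns */
static uint64_t prev_steal_time;  /* portion already folded into whole jiffies, in ns */

/* Account whole jiffies of steal time; leave the sub-jiffy rest for later. */
static unsigned long account_steal(void)
{
	uint64_t steal = steal_clock - prev_steal_time;
	unsigned long steal_jiffies = steal / TICK_NSEC;   /* models nsecs_to_jiffies() */

	/* Only advance prev_steal_time by what was actually accounted. */
	prev_steal_time += (uint64_t)steal_jiffies * TICK_NSEC;
	return steal_jiffies;
}

int main(void)
{
	/* 25 ms of steal: 2 jiffies accounted now, 5 ms carried over. */
	steal_clock += 25000000ULL;
	printf("round 1: %lu jiffies\n", account_steal());

	/* Another 7 ms: 5 + 7 = 12 ms pending, so 1 jiffy now, 2 ms carried. */
	steal_clock += 7000000ULL;
	printf("round 2: %lu jiffies\n", account_steal());
	return 0;
}
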
@@ -282,24 +315,29 @@
 	struct signal_struct *sig = tsk->signal;
 	cputime_t utime, stime;
 	struct task_struct *t;
-
-	times->utime = sig->utime;
-	times->stime = sig->stime;
-	times->sum_exec_runtime = sig->sum_sched_runtime;
+	unsigned int seq, nextseq;
+	unsigned long flags;
 
 	rcu_read_lock();
-	/* make sure we can trust tsk->thread_group list */
-	if (!likely(pid_alive(tsk)))
-		goto out;
-
-	t = tsk;
+	/* Attempt a lockless read on the first round. */
+	nextseq = 0;
 	do {
-		task_cputime(t, &utime, &stime);
-		times->utime += utime;
-		times->stime += stime;
-		times->sum_exec_runtime += task_sched_runtime(t);
-	} while_each_thread(tsk, t);
-out:
+		seq = nextseq;
+		flags = read_seqbegin_or_lock_irqsave(&sig->stats_lock, &seq);
+		times->utime = sig->utime;
+		times->stime = sig->stime;
+		times->sum_exec_runtime = sig->sum_sched_runtime;
+
+		for_each_thread(tsk, t) {
+			task_cputime(t, &utime, &stime);
+			times->utime += utime;
+			times->stime += stime;
+			times->sum_exec_runtime += task_sched_runtime(t);
+		}
+		/* If lockless access failed, take the lock. */
+		nextseq = 1;
+	} while (need_seqretry(&sig->stats_lock, seq));
+	done_seqretry_irqrestore(&sig->stats_lock, seq, flags);
 	rcu_read_unlock();
 }
@@ -378,11 +416,8 @@
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
 
 #ifndef __ARCH_HAS_VTIME_TASK_SWITCH
-void vtime_task_switch(struct task_struct *prev)
+void vtime_common_task_switch(struct task_struct *prev)
 {
-	if (!vtime_accounting_enabled())
-		return;
-
 	if (is_idle_task(prev))
 		vtime_account_idle(prev);
 	else
@@ -404,11 +439,8 @@
  * vtime_account().
  */
 #ifndef __ARCH_HAS_VTIME_ACCOUNT
-void vtime_account_irq_enter(struct task_struct *tsk)
+void vtime_common_account_irq_enter(struct task_struct *tsk)
 {
-	if (!vtime_accounting_enabled())
-		return;
-
 	if (!in_interrupt()) {
 		/*
 		 * If we interrupted user, context_tracking_in_user()
@@ -428,7 +460,7 @@
 	}
 	vtime_account_system(tsk);
 }
-EXPORT_SYMBOL_GPL(vtime_account_irq_enter);
+EXPORT_SYMBOL_GPL(vtime_common_account_irq_enter);
 #endif /* __ARCH_HAS_VTIME_ACCOUNT */
 #endif /* CONFIG_VIRT_CPU_ACCOUNTING */
@@ -439,6 +471,7 @@
 	*ut = p->utime;
 	*st = p->stime;
 }
+EXPORT_SYMBOL_GPL(task_cputime_adjusted);
 
 void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
@@ -515,9 +548,8 @@
 
 	for (;;) {
 		/* Make sure "rtime" is the bigger of stime/rtime */
-		if (stime > rtime) {
-			u64 tmp = rtime; rtime = stime; stime = tmp;
-		}
+		if (stime > rtime)
+			swap(rtime, stime);
 
 		/* Make sure 'total' fits in 32 bits */
 		if (total >> 32)
@@ -551,37 +583,43 @@
 }
 
 /*
- * Adjust tick based cputime random precision against scheduler
- * runtime accounting.
+ * Adjust tick based cputime random precision against scheduler runtime
+ * accounting.
+ *
+ * Tick based cputime accounting depends on random scheduling timeslices of a
+ * task to be interrupted or not by the timer. Depending on these
+ * circumstances, the number of these interrupts may be overestimated or
+ * underestimated, matching the real user and system cputime with a variable
+ * precision.
+ *
+ * Fix this by scaling these tick based values against the total runtime
+ * accounted by the CFS scheduler.
+ *
+ * This code provides the following guarantees:
+ *
+ *   stime + utime == rtime
+ *   stime_i+1 >= stime_i, utime_i+1 >= utime_i
+ *
+ * Assuming that rtime_i+1 >= rtime_i.
 */
 static void cputime_adjust(struct task_cputime *curr,
-			   struct cputime *prev,
+			   struct prev_cputime *prev,
 			   cputime_t *ut, cputime_t *st)
 {
 	cputime_t rtime, stime, utime;
+	unsigned long flags;
 
-	if (vtime_accounting_enabled()) {
-		*ut = curr->utime;
-		*st = curr->stime;
-		return;
-	}
-
-	/*
-	 * Tick based cputime accounting depend on random scheduling
-	 * timeslices of a task to be interrupted or not by the timer.
-	 * Depending on these circumstances, the number of these interrupts
-	 * may be over or under-optimistic, matching the real user and system
-	 * cputime with a variable precision.
-	 *
-	 * Fix this by scaling these tick based values against the total
-	 * runtime accounted by the CFS scheduler.
-	 */
+	/* Serialize concurrent callers such that we can honour our guarantees */
+	raw_spin_lock_irqsave(&prev->lock, flags);
 	rtime = nsecs_to_cputime(curr->sum_exec_runtime);
 
 	/*
-	 * Update userspace visible utime/stime values only if actual execution
-	 * time is bigger than already exported. Note that can happen, that we
-	 * provided bigger values due to scaling inaccuracy on big numbers.
+	 * This is possible under two circumstances:
+	 *	- rtime isn't monotonic after all (a bug);
+	 *	- we got reordered by the lock.
+	 *
+	 * In both cases this acts as a filter such that the rest of the code
+	 * can assume it is monotonic regardless of anything else.
	 */
 	if (prev->stime + prev->utime >= rtime)
 		goto out;
@@ -589,29 +627,53 @@
 	stime = curr->stime;
 	utime = curr->utime;
 
-	if (utime == 0) {
-		stime = rtime;
-	} else if (stime == 0) {
+	/*
+	 * If either stime or utime is zero, assume all runtime belongs to
+	 * the other (to userspace when both are zero). Once a task gets
+	 * some ticks, the monotonicity code at 'update' will ensure things
+	 * converge to the observed ratio.
+	 */
+	if (stime == 0) {
 		utime = rtime;
-	} else {
-		cputime_t total = stime + utime;
+		goto update;
+	}
 
-		stime = scale_stime((__force u64)stime,
-				    (__force u64)rtime, (__force u64)total);
-		utime = rtime - stime;
+	if (utime == 0) {
+		stime = rtime;
+		goto update;
 	}
 
+	stime = scale_stime((__force u64)stime, (__force u64)rtime,
+			    (__force u64)(stime + utime));
+
+update:
 	/*
-	 * If the tick based count grows faster than the scheduler one,
-	 * the result of the scaling may go backward.
-	 * Let's enforce monotonicity.
+	 * Make sure stime doesn't go backwards; this preserves monotonicity
+	 * for utime because rtime is monotonic.
+	 *
+	 *	utime_i+1 = rtime_i+1 - stime_i
+	 *	          = rtime_i+1 - (rtime_i - utime_i)
+	 *	          = (rtime_i+1 - rtime_i) + utime_i
+	 *	          >= utime_i
	 */
-	prev->stime = max(prev->stime, stime);
-	prev->utime = max(prev->utime, utime);
+	if (stime < prev->stime)
+		stime = prev->stime;
+	utime = rtime - stime;
 
+	/*
+	 * Make sure utime doesn't go backwards; this still preserves
+	 * monotonicity for stime, analogous argument to above.
+	 */
+	if (utime < prev->utime) {
+		utime = prev->utime;
+		stime = rtime - utime;
+	}
+
+	prev->stime = stime;
+	prev->utime = utime;
 out:
 	*ut = prev->utime;
 	*st = prev->stime;
+	raw_spin_unlock_irqrestore(&prev->lock, flags);
 }
 
 void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
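
As a worked example of the guarantees above, the standalone sketch below re-implements the
core of the adjustment in plain C. The names struct prev_times and adjust() are made up, a
plain division stands in for scale_stime(), and the locking is omitted, so this illustrates
the arithmetic rather than the kernel function. The second call shows the clamp in action:
the tick ratio would pull stime backwards, so stime is held at its previous value and all of
the new runtime goes to utime, keeping both values monotonic and their sum equal to rtime.

#include <stdio.h>
#include <stdint.h>

/* Values reported last time; the adjustment never lets these go backwards. */
struct prev_times {
	uint64_t stime;
	uint64_t utime;
};

static void adjust(struct prev_times *prev, uint64_t tick_stime,
		   uint64_t tick_utime, uint64_t rtime,
		   uint64_t *st, uint64_t *ut)
{
	uint64_t stime, utime;

	/* Nothing new to hand out (or rtime went backwards): keep old values. */
	if (prev->stime + prev->utime >= rtime)
		goto out;

	stime = tick_stime;
	utime = tick_utime;

	if (stime == 0) {			/* no system ticks: all userspace */
		utime = rtime;
		goto update;
	}
	if (utime == 0) {			/* no user ticks: all system */
		stime = rtime;
		goto update;
	}
	/* Split rtime in the observed stime:utime ratio, as scale_stime() does
	 * (ignoring the 32/64-bit overflow care the kernel version takes). */
	stime = rtime * stime / (stime + utime);

update:
	if (stime < prev->stime)		/* stime must not go backwards... */
		stime = prev->stime;
	utime = rtime - stime;
	if (utime < prev->utime) {		/* ...and neither must utime */
		utime = prev->utime;
		stime = rtime - utime;
	}
	prev->stime = stime;
	prev->utime = utime;
out:
	*st = prev->stime;
	*ut = prev->utime;
}

int main(void)
{
	struct prev_times prev = { 0, 0 };
	uint64_t st, ut;

	/* Ticks say 3:1 system:user over 400 units of runtime -> 300/100. */
	adjust(&prev, 30, 10, 400, &st, &ut);
	printf("stime=%llu utime=%llu\n",
	       (unsigned long long)st, (unsigned long long)ut);

	/*
	 * More runtime, but the tick ratio now claims much less system time.
	 * A plain rescale would give stime = 440*30/80 = 165 < 300, so the
	 * clamp keeps stime at 300 and hands all of the growth to utime.
	 */
	adjust(&prev, 30, 50, 440, &st, &ut);
	printf("stime=%llu utime=%llu\n",
	       (unsigned long long)st, (unsigned long long)ut);
	return 0;
}
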
@@ -623,10 +685,8 @@
 	task_cputime(p, &cputime.utime, &cputime.stime);
 	cputime_adjust(&cputime, &p->prev_cputime, ut, st);
 }
+EXPORT_SYMBOL_GPL(task_cputime_adjusted);
 
-/*
- * Must be called with siglock held.
- */
 void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
 	struct task_cputime cputime;
@@ -668,23 +728,17 @@
 
 void vtime_account_system(struct task_struct *tsk)
 {
-	if (!vtime_accounting_enabled())
-		return;
-
 	write_seqlock(&tsk->vtime_seqlock);
 	__vtime_account_system(tsk);
 	write_sequnlock(&tsk->vtime_seqlock);
 }
 
-void vtime_account_irq_exit(struct task_struct *tsk)
+void vtime_gen_account_irq_exit(struct task_struct *tsk)
 {
-	if (!vtime_accounting_enabled())
-		return;
-
 	write_seqlock(&tsk->vtime_seqlock);
+	__vtime_account_system(tsk);
 	if (context_tracking_in_user())
 		tsk->vtime_snap_whence = VTIME_USER;
-	__vtime_account_system(tsk);
 	write_sequnlock(&tsk->vtime_seqlock);
 }
@@ -692,12 +746,8 @@
 {
 	cputime_t delta_cpu;
 
-	if (!vtime_accounting_enabled())
-		return;
-
-	delta_cpu = get_vtime_delta(tsk);
-
 	write_seqlock(&tsk->vtime_seqlock);
+	delta_cpu = get_vtime_delta(tsk);
 	tsk->vtime_snap_whence = VTIME_SYS;
 	account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu));
 	write_sequnlock(&tsk->vtime_seqlock);
@@ -705,22 +755,27 @@
 
 void vtime_user_enter(struct task_struct *tsk)
 {
-	if (!vtime_accounting_enabled())
-		return;
-
 	write_seqlock(&tsk->vtime_seqlock);
-	tsk->vtime_snap_whence = VTIME_USER;
 	__vtime_account_system(tsk);
+	tsk->vtime_snap_whence = VTIME_USER;
 	write_sequnlock(&tsk->vtime_seqlock);
 }
 
 void vtime_guest_enter(struct task_struct *tsk)
 {
+	/*
+	 * The flags must be updated under the lock, together with
+	 * the vtime_snap flush and update.
+	 * That enforces the right ordering and update sequence
+	 * synchronization against the reader (task_gtime()),
+	 * which can thus safely catch up with a tickless delta.
+	 */
 	write_seqlock(&tsk->vtime_seqlock);
 	__vtime_account_system(tsk);
 	current->flags |= PF_VCPU;
 	write_sequnlock(&tsk->vtime_seqlock);
 }
+EXPORT_SYMBOL_GPL(vtime_guest_enter);
 
 void vtime_guest_exit(struct task_struct *tsk)
 {
@@ -729,6 +784,7 @@
 	current->flags &= ~PF_VCPU;
 	write_sequnlock(&tsk->vtime_seqlock);
 }
+EXPORT_SYMBOL_GPL(vtime_guest_exit);
 
 void vtime_account_idle(struct task_struct *tsk)
 {
@@ -737,11 +793,6 @@
 	account_idle_time(delta_cpu);
 }
 
-bool vtime_accounting_enabled(void)
-{
-	return context_tracking_active();
-}
-
 void arch_vtime_task_switch(struct task_struct *prev)
 {
 	write_seqlock(&prev->vtime_seqlock);
@@ -769,6 +820,9 @@
 	unsigned int seq;
 	cputime_t gtime;
 
+	if (!context_tracking_is_enabled())
+		return t->gtime;
+
 	do {
 		seq = read_seqbegin(&t->vtime_seqlock);