--- zzzz-none-000/linux-3.10.107/kernel/exit.c 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/kernel/exit.c 2021-02-04 17:41:59.000000000 +0000 @@ -59,7 +59,7 @@ #include #include -static void exit_mm(struct task_struct * tsk); +static void exit_mm(struct task_struct *tsk); static void __unhash_process(struct task_struct *p, bool group_dead) { @@ -115,32 +115,30 @@ if (tsk == sig->curr_target) sig->curr_target = next_thread(tsk); - /* - * Accumulate here the counters for all threads but the - * group leader as they die, so they can be added into - * the process-wide totals when those are taken. - * The group leader stays around as a zombie as long - * as there are other threads. When it gets reaped, - * the exit.c code will add its counts into these totals. - * We won't ever get here for the group leader, since it - * will have been the last reference on the signal_struct. - */ - task_cputime(tsk, &utime, &stime); - sig->utime += utime; - sig->stime += stime; - sig->gtime += task_gtime(tsk); - sig->min_flt += tsk->min_flt; - sig->maj_flt += tsk->maj_flt; - sig->nvcsw += tsk->nvcsw; - sig->nivcsw += tsk->nivcsw; - sig->inblock += task_io_get_inblock(tsk); - sig->oublock += task_io_get_oublock(tsk); - task_io_accounting_add(&sig->ioac, &tsk->ioac); - sig->sum_sched_runtime += tsk->se.sum_exec_runtime; } + /* + * Accumulate here the counters for all threads as they die. We could + * skip the group leader because it is the last user of signal_struct, + * but we want to avoid the race with thread_group_cputime() which can + * see the empty ->thread_head list. + */ + task_cputime(tsk, &utime, &stime); + write_seqlock(&sig->stats_lock); + sig->utime += utime; + sig->stime += stime; + sig->gtime += task_gtime(tsk); + sig->min_flt += tsk->min_flt; + sig->maj_flt += tsk->maj_flt; + sig->nvcsw += tsk->nvcsw; + sig->nivcsw += tsk->nivcsw; + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); + sig->sum_sched_runtime += tsk->se.sum_exec_runtime; sig->nr_threads--; __unhash_process(tsk, group_dead); + write_sequnlock(&sig->stats_lock); /* * Do this under ->siglock, we can race with another thread @@ -151,7 +149,7 @@ spin_unlock(&sighand->siglock); __cleanup_sighand(sighand); - clear_tsk_thread_flag(tsk,TIF_SIGPENDING); + clear_tsk_thread_flag(tsk, TIF_SIGPENDING); if (group_dead) { flush_sigqueue(&sig->shared_pending); tty_kref_put(tty); @@ -168,7 +166,7 @@ } -void release_task(struct task_struct * p) +void release_task(struct task_struct *p) { struct task_struct *leader; int zap_leader; @@ -192,7 +190,8 @@ */ zap_leader = 0; leader = p->group_leader; - if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) { + if (leader != p && thread_group_empty(leader) + && leader->exit_state == EXIT_ZOMBIE) { /* * If we were the last child thread and the leader has * exited already, and the leader's parent ignores SIGCHLD, @@ -213,27 +212,6 @@ } /* - * This checks not only the pgrp, but falls back on the pid if no - * satisfactory pgrp is found. I dunno - gdb doesn't work correctly - * without this... - * - * The caller must hold rcu lock or the tasklist lock. - */ -struct pid *session_of_pgrp(struct pid *pgrp) -{ - struct task_struct *p; - struct pid *sid = NULL; - - p = pid_task(pgrp, PIDTYPE_PGID); - if (p == NULL) - p = pid_task(pgrp, PIDTYPE_PID); - if (p != NULL) - sid = task_session(p); - - return sid; -} - -/* * Determine if a process group is "orphaned", according to the POSIX * definition in 2.2.2.52. Orphaned process groups are not to be affected * by terminal-generated stop signals. Newly orphaned process groups are @@ -241,7 +219,8 @@ * * "I ask you, have you ever known what it is to be an orphan?" */ -static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) +static int will_become_orphaned_pgrp(struct pid *pgrp, + struct task_struct *ignored_task) { struct task_struct *p; @@ -294,9 +273,9 @@ struct task_struct *ignored_task = tsk; if (!parent) - /* exit: our father is in a different pgrp than - * we are and we were the only connection outside. - */ + /* exit: our father is in a different pgrp than + * we are and we were the only connection outside. + */ parent = tsk->real_parent; else /* reparent: our child is in a different pgrp than @@ -313,57 +292,7 @@ } } -void __set_special_pids(struct pid *pid) -{ - struct task_struct *curr = current->group_leader; - - if (task_session(curr) != pid) - change_pid(curr, PIDTYPE_SID, pid); - - if (task_pgrp(curr) != pid) - change_pid(curr, PIDTYPE_PGID, pid); -} - -/* - * Let kernel threads use this to say that they allow a certain signal. - * Must not be used if kthread was cloned with CLONE_SIGHAND. - */ -int allow_signal(int sig) -{ - if (!valid_signal(sig) || sig < 1) - return -EINVAL; - - spin_lock_irq(¤t->sighand->siglock); - /* This is only needed for daemonize()'ed kthreads */ - sigdelset(¤t->blocked, sig); - /* - * Kernel threads handle their own signals. Let the signal code - * know it'll be handled, so that they don't get converted to - * SIGKILL or just silently dropped. - */ - current->sighand->action[(sig)-1].sa.sa_handler = (void __user *)2; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - return 0; -} - -EXPORT_SYMBOL(allow_signal); - -int disallow_signal(int sig) -{ - if (!valid_signal(sig) || sig < 1) - return -EINVAL; - - spin_lock_irq(¤t->sighand->siglock); - current->sighand->action[(sig)-1].sa.sa_handler = SIG_IGN; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); - return 0; -} - -EXPORT_SYMBOL(disallow_signal); - -#ifdef CONFIG_MM_OWNER +#ifdef CONFIG_MEMCG /* * A task is exiting. If it owned this mm, find a new owner for the mm. */ @@ -406,14 +335,18 @@ } /* - * Search through everything else. We should not get - * here often + * Search through everything else, we should not get here often. */ - do_each_thread(g, c) { - if (c->mm == mm) - goto assign_new_owner; - } while_each_thread(g, c); - + for_each_process(g) { + if (g->flags & PF_KTHREAD) + continue; + for_each_thread(g, c) { + if (c->mm == mm) + goto assign_new_owner; + if (c->mm) + break; + } + } read_unlock(&tasklist_lock); /* * We found no owner yet mm_users > 1: this implies that we are @@ -445,13 +378,13 @@ task_unlock(c); put_task_struct(c); } -#endif /* CONFIG_MM_OWNER */ +#endif /* CONFIG_MEMCG */ /* * Turn us into a lazy TLB process if we * aren't already.. */ -static void exit_mm(struct task_struct * tsk) +static void exit_mm(struct task_struct *tsk) { struct mm_struct *mm = tsk->mm; struct core_state *core_state; @@ -471,6 +404,7 @@ core_state = mm->core_state; if (core_state) { struct core_thread self; + up_read(&mm->mmap_sem); self.task = tsk; @@ -501,6 +435,46 @@ task_unlock(tsk); mm_update_next_owner(mm); mmput(mm); + if (test_thread_flag(TIF_MEMDIE)) + exit_oom_victim(); +} + +static struct task_struct *find_alive_thread(struct task_struct *p) +{ + struct task_struct *t; + + for_each_thread(p, t) { + if (!(t->flags & PF_EXITING)) + return t; + } + return NULL; +} + +static struct task_struct *find_child_reaper(struct task_struct *father) + __releases(&tasklist_lock) + __acquires(&tasklist_lock) +{ + struct pid_namespace *pid_ns = task_active_pid_ns(father); + struct task_struct *reaper = pid_ns->child_reaper; + + if (likely(reaper != father)) + return reaper; + + reaper = find_alive_thread(father); + if (reaper) { + pid_ns->child_reaper = reaper; + return reaper; + } + + write_unlock_irq(&tasklist_lock); + if (unlikely(pid_ns == &init_pid_ns)) { + panic("Attempted to kill init! exitcode=0x%08x\n", + father->signal->group_exit_code ?: father->exit_code); + } + zap_pid_ns_processes(pid_ns); + write_lock_irq(&tasklist_lock); + + return father; } /* @@ -510,58 +484,36 @@ * child_subreaper for its children (like a service manager) * 3. give it to the init process (PID 1) in our pid namespace */ -static struct task_struct *find_new_reaper(struct task_struct *father) - __releases(&tasklist_lock) - __acquires(&tasklist_lock) +static struct task_struct *find_new_reaper(struct task_struct *father, + struct task_struct *child_reaper) { - struct pid_namespace *pid_ns = task_active_pid_ns(father); - struct task_struct *thread; + struct task_struct *thread, *reaper; - thread = father; - while_each_thread(father, thread) { - if (thread->flags & PF_EXITING) - continue; - if (unlikely(pid_ns->child_reaper == father)) - pid_ns->child_reaper = thread; + thread = find_alive_thread(father); + if (thread) return thread; - } - - if (unlikely(pid_ns->child_reaper == father)) { - write_unlock_irq(&tasklist_lock); - if (unlikely(pid_ns == &init_pid_ns)) { - panic("Attempted to kill init! exitcode=0x%08x\n", - father->signal->group_exit_code ?: - father->exit_code); - } - - zap_pid_ns_processes(pid_ns); - write_lock_irq(&tasklist_lock); - } else if (father->signal->has_child_subreaper) { - struct task_struct *reaper; + if (father->signal->has_child_subreaper) { /* - * Find the first ancestor marked as child_subreaper. - * Note that the code below checks same_thread_group(reaper, - * pid_ns->child_reaper). This is what we need to DTRT in a - * PID namespace. However we still need the check above, see - * http://marc.info/?l=linux-kernel&m=131385460420380 + * Find the first ->is_child_subreaper ancestor in our pid_ns. + * We start from father to ensure we can not look into another + * namespace, this is safe because all its threads are dead. */ - for (reaper = father->real_parent; - reaper != &init_task; + for (reaper = father; + !same_thread_group(reaper, child_reaper); reaper = reaper->real_parent) { - if (same_thread_group(reaper, pid_ns->child_reaper)) + /* call_usermodehelper() descendants need this check */ + if (reaper == &init_task) break; if (!reaper->signal->is_child_subreaper) continue; - thread = reaper; - do { - if (!(thread->flags & PF_EXITING)) - return reaper; - } while_each_thread(reaper, thread); + thread = find_alive_thread(reaper); + if (thread) + return thread; } } - return pid_ns->child_reaper; + return child_reaper; } /* @@ -570,74 +522,64 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, struct list_head *dead) { - list_move_tail(&p->sibling, &p->real_parent->children); - /* - * If this is a threaded reparent there is no need to - * notify anyone anything has happened. - */ - if (same_thread_group(p->real_parent, father)) + if (unlikely(p->exit_state == EXIT_DEAD)) return; - /* - * We don't want people slaying init. - * - * Note: we do this even if it is EXIT_DEAD, wait_task_zombie() - * can change ->exit_state to EXIT_ZOMBIE. If this is the final - * state, do_notify_parent() was already called and ->exit_signal - * doesn't matter. - */ + /* We don't want people slaying init. */ p->exit_signal = SIGCHLD; - if (p->exit_state == EXIT_DEAD) - return; - /* If it has exited notify the new parent about this child's death. */ if (!p->ptrace && p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { if (do_notify_parent(p, p->exit_signal)) { p->exit_state = EXIT_DEAD; - list_move_tail(&p->sibling, dead); + list_add(&p->ptrace_entry, dead); } } kill_orphaned_pgrp(p, father); } -static void forget_original_parent(struct task_struct *father) +/* + * This does two things: + * + * A. Make init inherit all the child processes + * B. Check to see if any process groups have become orphaned + * as a result of our exiting, and if they have any stopped + * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) + */ +static void forget_original_parent(struct task_struct *father, + struct list_head *dead) { - struct task_struct *p, *n, *reaper; - LIST_HEAD(dead_children); + struct task_struct *p, *t, *reaper; - write_lock_irq(&tasklist_lock); - /* - * Note that exit_ptrace() and find_new_reaper() might - * drop tasklist_lock and reacquire it. - */ - exit_ptrace(father); - reaper = find_new_reaper(father); + if (unlikely(!list_empty(&father->ptraced))) + exit_ptrace(father, dead); + + /* Can drop and reacquire tasklist_lock */ + reaper = find_child_reaper(father); + if (list_empty(&father->children)) + return; - list_for_each_entry_safe(p, n, &father->children, sibling) { - struct task_struct *t = p; - do { + reaper = find_new_reaper(father, reaper); + list_for_each_entry(p, &father->children, sibling) { + for_each_thread(p, t) { t->real_parent = reaper; - if (t->parent == father) { - BUG_ON(t->ptrace); + BUG_ON((!t->ptrace) != (t->parent == father)); + if (likely(!t->ptrace)) t->parent = t->real_parent; - } if (t->pdeath_signal) group_send_sig_info(t->pdeath_signal, SEND_SIG_NOINFO, t); - } while_each_thread(p, t); - reparent_leader(father, p, &dead_children); - } - write_unlock_irq(&tasklist_lock); - - BUG_ON(!list_empty(&father->children)); - - list_for_each_entry_safe(p, n, &dead_children, sibling) { - list_del_init(&p->sibling); - release_task(p); + } + /* + * If this is a threaded reparent there is no need to + * notify anyone anything has happened. + */ + if (!same_thread_group(reaper, father)) + reparent_leader(father, p, dead); } + list_splice_tail_init(&father->children, &reaper->children); } /* @@ -647,18 +589,12 @@ static void exit_notify(struct task_struct *tsk, int group_dead) { bool autoreap; - - /* - * This does two things: - * - * A. Make init inherit all the child processes - * B. Check to see if any process groups have become orphaned - * as a result of our exiting, and if they have any stopped - * jobs, send them a SIGHUP and then a SIGCONT. (POSIX 3.2.2.2) - */ - forget_original_parent(tsk); + struct task_struct *p, *n; + LIST_HEAD(dead); write_lock_irq(&tasklist_lock); + forget_original_parent(tsk, &dead); + if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); @@ -676,15 +612,18 @@ } tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE; + if (tsk->exit_state == EXIT_DEAD) + list_add(&tsk->ptrace_entry, &dead); /* mt-exec, de_thread() is waiting for group leader */ if (unlikely(tsk->signal->notify_count < 0)) wake_up_process(tsk->signal->group_exit_task); write_unlock_irq(&tasklist_lock); - /* If the process is dead, release it - nobody will wait for it */ - if (autoreap) - release_task(tsk); + list_for_each_entry_safe(p, n, &dead, ptrace_entry) { + list_del_init(&p->ptrace_entry); + release_task(p); + } } #ifdef CONFIG_DEBUG_STACK_USAGE @@ -701,9 +640,8 @@ spin_lock(&low_water_lock); if (free < lowest_to_date) { - printk(KERN_WARNING "%s (%d) used greatest stack depth: " - "%lu bytes left\n", - current->comm, task_pid_nr(current), free); + pr_warn("%s (%d) used greatest stack depth: %lu bytes left\n", + current->comm, task_pid_nr(current), free); lowest_to_date = free; } spin_unlock(&low_water_lock); @@ -716,6 +654,7 @@ { struct task_struct *tsk = current; int group_dead; + TASKS_RCU(int tasks_rcu_i); profile_task_exit(tsk); @@ -744,8 +683,7 @@ * leave this task alone and wait for reboot. */ if (unlikely(tsk->flags & PF_EXITING)) { - printk(KERN_ALERT - "Fixing recursive fault but reboot is needed!\n"); + pr_alert("Fixing recursive fault but reboot is needed!\n"); /* * We can do this unlocked here. The futex code uses * this flag just to verify whether the pi state @@ -768,15 +706,17 @@ smp_mb(); raw_spin_unlock_wait(&tsk->pi_lock); - if (unlikely(in_atomic())) - printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", - current->comm, task_pid_nr(current), - preempt_count()); + if (unlikely(in_atomic())) { + pr_info("note: %s[%d] exited with preempt_count %d\n", + current->comm, task_pid_nr(current), + preempt_count()); + preempt_count_set(PREEMPT_ENABLED); + } - acct_update_integrals(tsk); /* sync mm's RSS info before statistics gathering */ if (tsk->mm) sync_mm_rss(tsk->mm); + acct_update_integrals(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); @@ -806,7 +746,6 @@ disassociate_ctty(1); exit_task_namespaces(tsk); exit_task_work(tsk); - check_stack_usage(); exit_thread(); /* @@ -817,17 +756,18 @@ */ perf_event_exit_task(tsk); - cgroup_exit(tsk, 1); + cgroup_exit(tsk); - module_put(task_thread_info(tsk)->exec_domain->module); - - proc_exit_connector(tsk); /* * FIXME: do that only when needed, using sched_exit tracepoint */ - ptrace_put_breakpoints(tsk); + flush_ptrace_hw_breakpoint(tsk); + TASKS_RCU(preempt_disable()); + TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu)); + TASKS_RCU(preempt_enable()); exit_notify(tsk, group_dead); + proc_exit_connector(tsk); #ifdef CONFIG_NUMA task_lock(tsk); mpol_put(tsk->mempolicy); @@ -841,7 +781,7 @@ /* * Make sure we are holding no locks: */ - debug_check_no_locks_held(tsk); + debug_check_no_locks_held(); /* * We can do this unlocked here. The futex code uses this flag * just to verify whether the pi state cleanup has been done @@ -860,10 +800,12 @@ validate_creds_for_do_exit(tsk); + check_stack_usage(); preempt_disable(); if (tsk->nr_dirtied) __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); exit_rcu(); + TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); /* * The setting of TASK_RUNNING by try_to_wake_up() may be delayed @@ -889,7 +831,6 @@ for (;;) cpu_relax(); /* For when BUG is null */ } - EXPORT_SYMBOL_GPL(do_exit); void complete_and_exit(struct completion *comp, long code) @@ -899,7 +840,6 @@ do_exit(code); } - EXPORT_SYMBOL(complete_and_exit); SYSCALL_DEFINE1(exit, int, error_code) @@ -922,6 +862,7 @@ exit_code = sig->group_exit_code; else if (!thread_group_empty(current)) { struct sighand_struct *const sighand = current->sighand; + spin_lock_irq(&sighand->siglock); if (signal_group_exit(sig)) /* Another thread got here before we took the lock. */ @@ -977,17 +918,28 @@ task_pid_type(p, wo->wo_type) == wo->wo_pid; } -static int eligible_child(struct wait_opts *wo, struct task_struct *p) +static int +eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p) { if (!eligible_pid(wo, p)) return 0; - /* Wait for all children (clone and not) if __WALL is set; - * otherwise, wait for clone children *only* if __WCLONE is - * set; otherwise, wait for non-clone children *only*. (Note: - * A "clone" child here is one that reports to its parent - * using a signal other than SIGCHLD.) */ - if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE)) - && !(wo->wo_flags & __WALL)) + + /* + * Wait for all children (clone and not) if __WALL is set or + * if it is traced by us. + */ + if (ptrace || (wo->wo_flags & __WALL)) + return 1; + + /* + * Otherwise, wait for clone children *only* if __WCLONE is set; + * otherwise, wait for non-clone children *only*. + * + * Note: a "clone" child here is one that reports to its parent + * using a signal other than SIGCHLD, or a non-leader thread which + * we can only see if it is traced by us. + */ + if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE)) return 0; return 1; @@ -1029,8 +981,7 @@ */ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) { - unsigned long state; - int retval, status, traced; + int state, retval, status; pid_t pid = task_pid_vnr(p); uid_t uid = from_kuid_munged(current_user_ns(), task_uid(p)); struct siginfo __user *infop; @@ -1044,6 +995,8 @@ get_task_struct(p); read_unlock(&tasklist_lock); + sched_annotate_sleep(); + if ((exit_code & 0x7f) == 0) { why = CLD_EXITED; status = exit_code >> 8; @@ -1053,25 +1006,25 @@ } return wait_noreap_copyout(wo, p, pid, uid, why, status); } - /* - * Try to move the task's state to DEAD - * only one thread is allowed to do this: + * Move the task's state to DEAD/TRACE, only one thread can do this. */ - state = xchg(&p->exit_state, EXIT_DEAD); - if (state != EXIT_ZOMBIE) { - BUG_ON(state != EXIT_DEAD); + state = (ptrace_reparented(p) && thread_group_leader(p)) ? + EXIT_TRACE : EXIT_DEAD; + if (cmpxchg(&p->exit_state, EXIT_ZOMBIE, state) != EXIT_ZOMBIE) return 0; - } + /* + * We own this thread, nobody else can reap it. + */ + read_unlock(&tasklist_lock); + sched_annotate_sleep(); - traced = ptrace_reparented(p); /* - * It can be ptraced but not reparented, check - * thread_group_leader() to filter out sub-threads. + * Check thread_group_leader() to exclude the traced sub-threads. */ - if (likely(!traced) && thread_group_leader(p)) { - struct signal_struct *psig; - struct signal_struct *sig; + if (state == EXIT_DEAD && thread_group_leader(p)) { + struct signal_struct *sig = p->signal; + struct signal_struct *psig = current->signal; unsigned long maxrss; cputime_t tgutime, tgstime; @@ -1083,21 +1036,21 @@ * accumulate in the parent's signal_struct c* fields. * * We don't bother to take a lock here to protect these - * p->signal fields, because they are only touched by - * __exit_signal, which runs with tasklist_lock - * write-locked anyway, and so is excluded here. We do - * need to protect the access to parent->signal fields, - * as other threads in the parent group can be right - * here reaping other children at the same time. + * p->signal fields because the whole thread group is dead + * and nobody can change them. + * + * psig->stats_lock also protects us from our sub-theads + * which can reap other children at the same time. Until + * we change k_getrusage()-like users to rely on this lock + * we have to take ->siglock as well. * - * We use thread_group_cputime_adjusted() to get times for the thread - * group, which consolidates times for all threads in the - * group including the group leader. + * We use thread_group_cputime_adjusted() to get times for + * the thread group, which consolidates times for all threads + * in the group including the group leader. */ thread_group_cputime_adjusted(p, &tgutime, &tgstime); - spin_lock_irq(&p->real_parent->sighand->siglock); - psig = p->real_parent->signal; - sig = p->signal; + spin_lock_irq(¤t->sighand->siglock); + write_seqlock(&psig->stats_lock); psig->cutime += tgutime + sig->cutime; psig->cstime += tgstime + sig->cstime; psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime; @@ -1120,15 +1073,10 @@ psig->cmaxrss = maxrss; task_io_accounting_add(&psig->ioac, &p->ioac); task_io_accounting_add(&psig->ioac, &sig->ioac); - spin_unlock_irq(&p->real_parent->sighand->siglock); + write_sequnlock(&psig->stats_lock); + spin_unlock_irq(¤t->sighand->siglock); } - /* - * Now we are sure this task is interesting, and no other - * thread can reap it because we set its state to EXIT_DEAD. - */ - read_unlock(&tasklist_lock); - retval = wo->wo_rusage ? getrusage(p, RUSAGE_BOTH, wo->wo_rusage) : 0; status = (p->signal->flags & SIGNAL_GROUP_EXIT) @@ -1162,22 +1110,19 @@ if (!retval) retval = pid; - if (traced) { + if (state == EXIT_TRACE) { write_lock_irq(&tasklist_lock); /* We dropped tasklist, ptracer could die and untrace */ ptrace_unlink(p); - /* - * If this is not a sub-thread, notify the parent. - * If parent wants a zombie, don't release it now. - */ - if (thread_group_leader(p) && - !do_notify_parent(p, p->exit_signal)) { - p->exit_state = EXIT_ZOMBIE; - p = NULL; - } + + /* If parent wants a zombie, don't release it now */ + state = EXIT_ZOMBIE; + if (do_notify_parent(p, p->exit_signal)) + state = EXIT_DEAD; + p->exit_state = state; write_unlock_irq(&tasklist_lock); } - if (p != NULL) + if (state == EXIT_DEAD) release_task(p); return retval; @@ -1262,6 +1207,7 @@ pid = task_pid_vnr(p); why = ptrace ? CLD_TRAPPED : CLD_STOPPED; read_unlock(&tasklist_lock); + sched_annotate_sleep(); if (unlikely(wo->wo_flags & WNOWAIT)) return wait_noreap_copyout(wo, p, pid, uid, why, exit_code); @@ -1324,6 +1270,7 @@ pid = task_pid_vnr(p); get_task_struct(p); read_unlock(&tasklist_lock); + sched_annotate_sleep(); if (!wo->wo_info) { retval = wo->wo_rusage @@ -1354,7 +1301,18 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace, struct task_struct *p) { - int ret = eligible_child(wo, p); + /* + * We can race with wait_task_zombie() from another thread. + * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition + * can't confuse the checks below. + */ + int exit_state = ACCESS_ONCE(p->exit_state); + int ret; + + if (unlikely(exit_state == EXIT_DEAD)) + return 0; + + ret = eligible_child(wo, ptrace, p); if (!ret) return ret; @@ -1372,33 +1330,44 @@ return 0; } - /* dead body doesn't have much to contribute */ - if (unlikely(p->exit_state == EXIT_DEAD)) { + if (unlikely(exit_state == EXIT_TRACE)) { /* - * But do not ignore this task until the tracer does - * wait_task_zombie()->do_notify_parent(). + * ptrace == 0 means we are the natural parent. In this case + * we should clear notask_error, debugger will notify us. */ - if (likely(!ptrace) && unlikely(ptrace_reparented(p))) + if (likely(!ptrace)) wo->notask_error = 0; return 0; } - /* slay zombie? */ - if (p->exit_state == EXIT_ZOMBIE) { + if (likely(!ptrace) && unlikely(p->ptrace)) { /* - * A zombie ptracee is only visible to its ptracer. - * Notification and reaping will be cascaded to the real - * parent when the ptracer detaches. + * If it is traced by its real parent's group, just pretend + * the caller is ptrace_do_wait() and reap this child if it + * is zombie. + * + * This also hides group stop state from real parent; otherwise + * a single stop can be reported twice as group and ptrace stop. + * If a ptracer wants to distinguish these two events for its + * own children it should create a separate process which takes + * the role of real parent. */ - if (likely(!ptrace) && unlikely(p->ptrace)) { - /* it will become visible, clear notask_error */ - wo->notask_error = 0; - return 0; - } + if (!ptrace_reparented(p)) + ptrace = 1; + } + /* slay zombie? */ + if (exit_state == EXIT_ZOMBIE) { /* we don't reap group leaders with subthreads */ - if (!delay_group_leader(p)) - return wait_task_zombie(wo, p); + if (!delay_group_leader(p)) { + /* + * A zombie ptracee is only visible to its ptracer. + * Notification and reaping will be cascaded to the + * real parent when the ptracer detaches. + */ + if (unlikely(ptrace) || likely(!p->ptrace)) + return wait_task_zombie(wo, p); + } /* * Allow access to stopped/continued state via zombie by @@ -1424,19 +1393,6 @@ wo->notask_error = 0; } else { /* - * If @p is ptraced by a task in its real parent's group, - * hide group stop/continued state when looking at @p as - * the real parent; otherwise, a single stop can be - * reported twice as group and ptrace stops. - * - * If a ptracer wants to distinguish the two events for its - * own children, it should create a separate process which - * takes the role of real parent. - */ - if (likely(!ptrace) && p->ptrace && !ptrace_reparented(p)) - return 0; - - /* * @p is alive and it's gonna stop, continue or exit, so * there always is something to wait for. */ @@ -1474,6 +1430,7 @@ list_for_each_entry(p, &tsk->children, sibling) { int ret = wait_consider_task(wo, 0, p); + if (ret) return ret; } @@ -1487,6 +1444,7 @@ list_for_each_entry(p, &tsk->ptraced, ptrace_entry) { int ret = wait_consider_task(wo, 1, p); + if (ret) return ret; } @@ -1528,7 +1486,7 @@ add_wait_queue(¤t->signal->wait_chldexit, &wo->child_wait); repeat: /* - * If there is nothing that can match our critiera just get out. + * If there is nothing that can match our criteria, just get out. * We will clear ->notask_error to zero if we see any child that * might later match our criteria, even if we are not able to reap * it yet.