--- zzzz-none-000/linux-3.10.107/mm/memory-failure.c	2017-06-27 09:49:32.000000000 +0000
+++ scorpion-7490-727/linux-3.10.107/mm/memory-failure.c	2021-02-04 17:41:59.000000000 +0000
@@ -20,6 +20,14 @@
  * this code has to be extremely careful. Generally it tries to use
  * normal locking rules, as in get the standard locks, even if that means
  * the error handling takes potentially a long time.
+ *
+ * It can be very tempting to add handling for obscure cases here.
+ * In general any code for handling new cases should only be added iff:
+ * - You know how to test it.
+ * - You have a test that can be added to mce-test
+ *   https://git.kernel.org/cgit/utils/cpu/mce/mce-test.git/
+ * - The case actually shows up as a frequent (top 10) page state in
+ *   tools/vm/page-types when running a real workload.
  *
  * There are several operations here with exponential complexity because
  * of unsuitable VM data structures. For example the operation to map back
@@ -28,13 +36,6 @@
  * are rare we hope to get away with this. This avoids impacting the core
  * VM.
  */
-
-/*
- * Notebook:
- * - hugetlb needs more code
- * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
- * - pass bad pages to kdump next kernel
- */
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/page-flags.h>
@@ -55,7 +56,9 @@
 #include <linux/memory_hotplug.h>
 #include <linux/mm_inline.h>
 #include <linux/kfifo.h>
+#include <linux/ratelimit.h>
 #include "internal.h"
+#include "ras/ras_event.h"
 
 int sysctl_memory_failure_early_kill __read_mostly = 0;
 
@@ -128,31 +131,15 @@
  * can only guarantee that the page either belongs to the memcg tasks, or is
  * a freed page.
  */
-#ifdef CONFIG_MEMCG_SWAP
+#ifdef CONFIG_MEMCG
 u64 hwpoison_filter_memcg;
 EXPORT_SYMBOL_GPL(hwpoison_filter_memcg);
 static int hwpoison_filter_task(struct page *p)
 {
-	struct mem_cgroup *mem;
-	struct cgroup_subsys_state *css;
-	unsigned long ino;
-
 	if (!hwpoison_filter_memcg)
 		return 0;
 
-	mem = try_get_mem_cgroup_from_page(p);
-	if (!mem)
-		return -EINVAL;
-
-	css = mem_cgroup_css(mem);
-	/* root_mem_cgroup has NULL dentries */
-	if (!css->cgroup->dentry)
-		return -EINVAL;
-
-	ino = css->cgroup->dentry->d_inode->i_ino;
-	css_put(css);
-
-	if (ino != hwpoison_filter_memcg)
+	if (page_cgroup_ino(p) != hwpoison_filter_memcg)
 		return -EINVAL;
 
 	return 0;
@@ -206,7 +193,7 @@
 #ifdef __ARCH_SI_TRAPNO
 	si.si_trapno = trapno;
 #endif
-	si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT;
+	si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
 
 	if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
 		si.si_code = BUS_MCEERR_AR;
@@ -237,27 +224,17 @@
 		lru_add_drain_all();
 		if (PageLRU(p))
 			return;
-		drain_all_pages();
+		drain_all_pages(page_zone(p));
 		if (PageLRU(p) || is_free_buddy_page(p))
 			return;
 	}
 
 	/*
-	 * Only call shrink_slab here (which would also shrink other caches) if
-	 * access is not potentially fatal.
+	 * Only call shrink_node_slabs here (which would also shrink
+	 * other caches) if access is not potentially fatal.
 	 */
-	if (access) {
-		int nr;
-		do {
-			struct shrink_control shrink = {
-				.gfp_mask = GFP_KERNEL,
-			};
-
-			nr = shrink_slab(&shrink, 1000, 1000);
-			if (page_count(p) == 1)
-				break;
-		} while (nr > 10);
-	}
+	if (access)
+		drop_slab_node(page_to_nid(p));
 }
 EXPORT_SYMBOL_GPL(shake_page);
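
The kill_proc() change above feeds straight into the userspace ABI: the SIGBUS carries BUS_MCEERR_AR (the process touched the poison) or BUS_MCEERR_AO (poison found asynchronously), with si_addr and si_addr_lsb describing the affected range. A minimal consumer might look like the following sketch (not part of the patch; assumes a glibc that exposes BUS_MCEERR_* and si_addr_lsb, and fprintf() here is illustrative rather than strictly async-signal-safe):

/*
 * Illustrative userspace consumer: BUS_MCEERR_AO arrives only if the
 * task opted in to early kill; si_addr_lsb gives the poisoned range as
 * an address LSB, e.g. PAGE_SHIFT for a 4k page, larger for huge pages.
 */
#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/prctl.h>
#include <unistd.h>

static void sigbus_handler(int sig, siginfo_t *si, void *uctx)
{
	if (si->si_code == BUS_MCEERR_AO) {
		/* Poison in [si_addr, si_addr + (1UL << si_addr_lsb)):
		 * we were not touching it, so we may rebuild or drop it. */
		fprintf(stderr, "AO poison at %p, lsb %d\n",
			si->si_addr, (int)si->si_addr_lsb);
	} else if (si->si_code == BUS_MCEERR_AR) {
		/* We consumed the poison; this context cannot continue. */
		_exit(EXIT_FAILURE);
	}
}

int main(void)
{
	struct sigaction sa = {
		.sa_sigaction	= sigbus_handler,
		.sa_flags	= SA_SIGINFO,
	};

	sigaction(SIGBUS, &sa, NULL);
	/* Ask for early BUS_MCEERR_AO delivery (PF_MCE_EARLY below). */
	if (prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0))
		perror("prctl(PR_MCE_KILL)");
	pause();
	return 0;
}
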
@@ -382,15 +359,44 @@
 	}
 }
 
-static int task_early_kill(struct task_struct *tsk, int force_early)
+/*
+ * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO)
+ * on behalf of the thread group. Return task_struct of the (first found)
+ * dedicated thread if found, and return NULL otherwise.
+ *
+ * We already hold read_lock(&tasklist_lock) in the caller, so we don't
+ * have to call rcu_read_lock/unlock() in this function.
+ */
+static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
+{
+	struct task_struct *t;
+
+	for_each_thread(tsk, t)
+		if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY))
+			return t;
+	return NULL;
+}
+
+/*
+ * Determine whether a given process is "early kill" process which expects
+ * to be signaled when some page under the process is hwpoisoned.
+ * Return task_struct of the dedicated thread (main thread unless explicitly
+ * specified) if the process is "early kill," and otherwise returns NULL.
+ */
+static struct task_struct *task_early_kill(struct task_struct *tsk,
+					   int force_early)
 {
+	struct task_struct *t;
+
 	if (!tsk->mm)
-		return 0;
+		return NULL;
 	if (force_early)
-		return 1;
-	if (tsk->flags & PF_MCE_PROCESS)
-		return !!(tsk->flags & PF_MCE_EARLY);
-	return sysctl_memory_failure_early_kill;
+		return tsk;
+	t = find_early_kill_thread(tsk);
+	if (t)
+		return t;
+	if (sysctl_memory_failure_early_kill)
+		return tsk;
+	return NULL;
 }
 
 /*
@@ -408,20 +414,21 @@
 	if (av == NULL)	/* Not actually mapped anymore */
 		return;
 
-	pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+	pgoff = page_to_pgoff(page);
 	read_lock(&tasklist_lock);
 	for_each_process (tsk) {
 		struct anon_vma_chain *vmac;
+		struct task_struct *t = task_early_kill(tsk, force_early);
 
-		if (!task_early_kill(tsk, force_early))
+		if (!t)
 			continue;
 		anon_vma_interval_tree_foreach(vmac, &av->rb_root,
 					       pgoff, pgoff) {
 			vma = vmac->vma;
 			if (!page_mapped_in_vma(page, vma))
 				continue;
-			if (vma->vm_mm == tsk->mm)
-				add_to_kill(tsk, page, vma, to_kill, tkc);
+			if (vma->vm_mm == t->mm)
+				add_to_kill(t, page, vma, to_kill, tkc);
 		}
 	}
 	read_unlock(&tasklist_lock);
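
The find_early_kill_thread() change above is what lets a process nominate one thread to field all BUS_MCEERR_AO reports for the thread group. A hedged sketch of that usage pattern (hypothetical program, not from this patch): block SIGBUS everywhere, have the dedicated thread set PR_MCE_KILL_EARLY and drain the queued reports with sigwaitinfo(). A synchronous BUS_MCEERR_AR is force-delivered regardless, so real code keeps a handler installed as well:

/*
 * Sketch: one dedicated listener thread for asynchronous poison reports.
 */
#define _GNU_SOURCE
#include <pthread.h>
#include <signal.h>
#include <stdio.h>
#include <sys/prctl.h>

static void *mce_listener(void *unused)
{
	sigset_t set;
	siginfo_t si;

	(void)unused;
	/* Only this thread gets PF_MCE_EARLY, so BUS_MCEERR_AO lands here. */
	prctl(PR_MCE_KILL, PR_MCE_KILL_SET, PR_MCE_KILL_EARLY, 0, 0);
	sigemptyset(&set);
	sigaddset(&set, SIGBUS);
	for (;;) {
		if (sigwaitinfo(&set, &si) > 0 && si.si_code == BUS_MCEERR_AO)
			printf("lost page at %p, rebuilding\n", si.si_addr);
	}
	return NULL;
}

int main(void)
{
	sigset_t set;
	pthread_t tid;

	/* Block SIGBUS process-wide; the mask is inherited by threads. */
	sigemptyset(&set);
	sigaddset(&set, SIGBUS);
	pthread_sigmask(SIG_BLOCK, &set, NULL);
	pthread_create(&tid, NULL, mce_listener, NULL);
	pthread_join(tid, NULL);	/* real code would do work here */
	return 0;
}
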
@@ -438,14 +445,14 @@
 	struct task_struct *tsk;
 	struct address_space *mapping = page->mapping;
 
-	mutex_lock(&mapping->i_mmap_mutex);
+	i_mmap_lock_read(mapping);
 	read_lock(&tasklist_lock);
 	for_each_process(tsk) {
-		pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
+		pgoff_t pgoff = page_to_pgoff(page);
+		struct task_struct *t = task_early_kill(tsk, force_early);
 
-		if (!task_early_kill(tsk, force_early))
+		if (!t)
 			continue;
-
 		vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff,
 				      pgoff) {
 			/*
@@ -455,12 +462,12 @@
 			 * Assume applications who requested early kill want
 			 * to be informed of all such data corruptions.
 			 */
-			if (vma->vm_mm == tsk->mm)
-				add_to_kill(tsk, page, vma, to_kill, tkc);
+			if (vma->vm_mm == t->mm)
+				add_to_kill(t, page, vma, to_kill, tkc);
 		}
 	}
 	read_unlock(&tasklist_lock);
-	mutex_unlock(&mapping->i_mmap_mutex);
+	i_mmap_unlock_read(mapping);
 }
 
 /*
@@ -487,22 +494,34 @@
 	kfree(tk);
 }
 
-/*
- * Error handlers for various types of pages.
- */
-
-enum outcome {
-	IGNORED,	/* Error: cannot be handled */
-	FAILED,		/* Error: handling failed */
-	DELAYED,	/* Will be handled later */
-	RECOVERED,	/* Successfully recovered */
+static const char *action_name[] = {
+	[MF_IGNORED] = "Ignored",
+	[MF_FAILED] = "Failed",
+	[MF_DELAYED] = "Delayed",
+	[MF_RECOVERED] = "Recovered",
 };
 
-static const char *action_name[] = {
-	[IGNORED] = "Ignored",
-	[FAILED] = "Failed",
-	[DELAYED] = "Delayed",
-	[RECOVERED] = "Recovered",
+static const char * const action_page_types[] = {
+	[MF_MSG_KERNEL]			= "reserved kernel page",
+	[MF_MSG_KERNEL_HIGH_ORDER]	= "high-order kernel page",
+	[MF_MSG_SLAB]			= "kernel slab page",
+	[MF_MSG_DIFFERENT_COMPOUND]	= "different compound page after locking",
+	[MF_MSG_POISONED_HUGE]		= "huge page already hardware poisoned",
+	[MF_MSG_HUGE]			= "huge page",
+	[MF_MSG_FREE_HUGE]		= "free huge page",
+	[MF_MSG_UNMAP_FAILED]		= "unmapping failed page",
+	[MF_MSG_DIRTY_SWAPCACHE]	= "dirty swapcache page",
+	[MF_MSG_CLEAN_SWAPCACHE]	= "clean swapcache page",
+	[MF_MSG_DIRTY_MLOCKED_LRU]	= "dirty mlocked LRU page",
+	[MF_MSG_CLEAN_MLOCKED_LRU]	= "clean mlocked LRU page",
+	[MF_MSG_DIRTY_UNEVICTABLE_LRU]	= "dirty unevictable LRU page",
+	[MF_MSG_CLEAN_UNEVICTABLE_LRU]	= "clean unevictable LRU page",
+	[MF_MSG_DIRTY_LRU]		= "dirty LRU page",
+	[MF_MSG_CLEAN_LRU]		= "clean LRU page",
+	[MF_MSG_TRUNCATED_LRU]		= "already truncated LRU page",
+	[MF_MSG_BUDDY]			= "free buddy page",
+	[MF_MSG_BUDDY_2ND]		= "free buddy page (2nd try)",
+	[MF_MSG_UNKNOWN]		= "unknown page",
 };
 
 /*
@@ -536,7 +555,7 @@
  */
 static int me_kernel(struct page *p, unsigned long pfn)
 {
-	return IGNORED;
+	return MF_IGNORED;
 }
 
 /*
@@ -545,7 +564,7 @@
 static int me_unknown(struct page *p, unsigned long pfn)
 {
 	printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn);
-	return FAILED;
+	return MF_FAILED;
 }
 
 /*
@@ -554,7 +573,7 @@
 static int me_pagecache_clean(struct page *p, unsigned long pfn)
 {
 	int err;
-	int ret = FAILED;
+	int ret = MF_FAILED;
 	struct address_space *mapping;
 
 	delete_from_lru_cache(p);
@@ -564,7 +583,7 @@
 	 * should be the one m_f() holds.
 	 */
 	if (PageAnon(p))
-		return RECOVERED;
+		return MF_RECOVERED;
 
 	/*
 	 * Now truncate the page in the page cache. This is really
@@ -578,7 +597,7 @@
 		/*
 		 * Page has been teared down in the meanwhile
 		 */
-		return FAILED;
+		return MF_FAILED;
 	}
 
 	/*
@@ -595,7 +614,7 @@
 		    !try_to_release_page(p, GFP_NOIO)) {
 			pr_info("MCE %#lx: failed to release buffers\n", pfn);
 		} else {
-			ret = RECOVERED;
+			ret = MF_RECOVERED;
 		}
 	} else {
 		/*
@@ -603,7 +622,7 @@
 		 * This fails on dirty or anything with private pages
 		 */
 		if (invalidate_inode_page(p))
-			ret = RECOVERED;
+			ret = MF_RECOVERED;
 		else
 			printk(KERN_INFO "MCE %#lx: Failed to invalidate\n",
 				pfn);
@@ -612,7 +631,7 @@
 }
 
 /*
- * Dirty cache page page
+ * Dirty pagecache page
  * Issues: when the error hit a hole page the error is not properly
  * propagated.
  */
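
For applications, the "propagated" error above surfaces through the normal writeback error channel: the mapping is flagged (AS_EIO) and a later fsync() or close() on the file reports EIO. A small illustration of the check every careful writer makes (sketch only; nothing here can actually trigger poison):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int fd = open("data.tmp", O_CREAT | O_WRONLY | O_TRUNC, 0600);

	if (fd < 0 || write(fd, "payload", 7) != 7)
		return 1;
	/*
	 * If memory failure threw away a dirty page of this file, the
	 * error is reported on the next fsync(); an application that
	 * skips this check loses the data silently.
	 */
	if (fsync(fd) < 0)
		fprintf(stderr, "writeback failed: %s\n", strerror(errno));
	if (close(fd) < 0)
		perror("close");
	unlink("data.tmp");
	return 0;
}
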
@@ -689,9 +708,9 @@
 	ClearPageUptodate(p);
 
 	if (!delete_from_lru_cache(p))
-		return DELAYED;
+		return MF_DELAYED;
 	else
-		return FAILED;
+		return MF_FAILED;
 }
 
 static int me_swapcache_clean(struct page *p, unsigned long pfn)
@@ -699,9 +718,9 @@
 	delete_from_swap_cache(p);
 
 	if (!delete_from_lru_cache(p))
-		return RECOVERED;
+		return MF_RECOVERED;
 	else
-		return FAILED;
+		return MF_FAILED;
 }
 
 /*
@@ -714,6 +733,10 @@
 {
 	int res = 0;
 	struct page *hpage = compound_head(p);
+
+	if (!PageHuge(hpage))
+		return MF_DELAYED;
+
 	/*
 	 * We can safely recover from error on free or reserved (i.e.
	 * not in-use) hugepage by dequeuing it from freelist.
@@ -727,9 +750,9 @@
 	if (!(page_mapping(hpage) || PageAnon(hpage))) {
 		res = dequeue_hwpoisoned_huge_page(hpage);
 		if (!res)
-			return RECOVERED;
+			return MF_RECOVERED;
 	}
-	return DELAYED;
+	return MF_DELAYED;
 }
 
 /*
@@ -753,18 +776,16 @@
 #define lru		(1UL << PG_lru)
 #define swapbacked	(1UL << PG_swapbacked)
 #define head		(1UL << PG_head)
-#define tail		(1UL << PG_tail)
-#define compound	(1UL << PG_compound)
 #define slab		(1UL << PG_slab)
 #define reserved	(1UL << PG_reserved)
 
 static struct page_state {
 	unsigned long mask;
 	unsigned long res;
-	char *msg;
+	enum mf_action_page_type type;
 	int (*action)(struct page *p, unsigned long pfn);
 } error_states[] = {
-	{ reserved,	reserved,	"reserved kernel",	me_kernel },
+	{ reserved,	reserved,	MF_MSG_KERNEL,	me_kernel },
 	/*
 	 * free pages are specially detected outside this table:
 	 * PG_buddy pages only make a small fraction of all free pages.
@@ -775,31 +796,26 @@
 	 * currently unused objects without touching them. But just
 	 * treat it as standard kernel for now.
 	 */
-	{ slab,		slab,		"kernel slab",	me_kernel },
+	{ slab,		slab,		MF_MSG_SLAB,	me_kernel },
 
-#ifdef CONFIG_PAGEFLAGS_EXTENDED
-	{ head,		head,		"huge",		me_huge_page },
-	{ tail,		tail,		"huge",		me_huge_page },
-#else
-	{ compound,	compound,	"huge",		me_huge_page },
-#endif
+	{ head,		head,		MF_MSG_HUGE,		me_huge_page },
 
-	{ sc|dirty,	sc|dirty,	"dirty swapcache",	me_swapcache_dirty },
-	{ sc|dirty,	sc,		"clean swapcache",	me_swapcache_clean },
+	{ sc|dirty,	sc|dirty,	MF_MSG_DIRTY_SWAPCACHE,	me_swapcache_dirty },
+	{ sc|dirty,	sc,		MF_MSG_CLEAN_SWAPCACHE,	me_swapcache_clean },
 
-	{ mlock|dirty,	mlock|dirty,	"dirty mlocked LRU",	me_pagecache_dirty },
-	{ mlock|dirty,	mlock,		"clean mlocked LRU",	me_pagecache_clean },
+	{ mlock|dirty,	mlock|dirty,	MF_MSG_DIRTY_MLOCKED_LRU,	me_pagecache_dirty },
+	{ mlock|dirty,	mlock,		MF_MSG_CLEAN_MLOCKED_LRU,	me_pagecache_clean },
 
-	{ unevict|dirty, unevict|dirty,	"dirty unevictable LRU", me_pagecache_dirty },
-	{ unevict|dirty, unevict,	"clean unevictable LRU", me_pagecache_clean },
+	{ unevict|dirty, unevict|dirty,	MF_MSG_DIRTY_UNEVICTABLE_LRU,	me_pagecache_dirty },
+	{ unevict|dirty, unevict,	MF_MSG_CLEAN_UNEVICTABLE_LRU,	me_pagecache_clean },
 
-	{ lru|dirty,	lru|dirty,	"dirty LRU",	me_pagecache_dirty },
-	{ lru|dirty,	lru,		"clean LRU",	me_pagecache_clean },
+	{ lru|dirty,	lru|dirty,	MF_MSG_DIRTY_LRU,	me_pagecache_dirty },
+	{ lru|dirty,	lru,		MF_MSG_CLEAN_LRU,	me_pagecache_clean },
 
 	/*
 	 * Catchall entry: must be at end.
 	 */
-	{ 0,		0,		"unknown page state",	me_unknown },
+	{ 0,		0,		MF_MSG_UNKNOWN,	me_unknown },
 };
 
 #undef dirty
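
The error_states[] lookup semantics are easy to miss: entries are scanned in order and the first (flags & mask) == res hit wins, with the { 0, 0, ... } catch-all guaranteeing termination of the open-ended loop. A standalone toy model of that classifier (made-up flag bits, not the real PG_* values):

#include <stdio.h>

#define dirty	0x1UL
#define lru	0x2UL
#define sc	0x4UL	/* "swapcache" */

static const struct state {
	unsigned long mask, res;
	const char *type;
} states[] = {
	{ sc|dirty,	sc|dirty,	"dirty swapcache page" },
	{ sc|dirty,	sc,		"clean swapcache page" },
	{ lru|dirty,	lru|dirty,	"dirty LRU page" },
	{ lru|dirty,	lru,		"clean LRU page" },
	{ 0,		0,		"unknown page" },	/* catch-all */
};

static const char *classify(unsigned long flags)
{
	const struct state *ps;

	for (ps = states; ; ps++)		/* mirrors the kernel loop */
		if ((flags & ps->mask) == ps->res)
			return ps->type;	/* first match wins */
}

int main(void)
{
	printf("%s\n", classify(lru | dirty));	/* dirty LRU page */
	printf("%s\n", classify(lru));		/* clean LRU page */
	printf("%s\n", classify(0x80));		/* unknown page */
	return 0;
}
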
@@ -819,10 +835,13 @@
  * "Dirty/Clean" indication is not 100% accurate due to the possibility of
  * setting PG_dirty outside page lock. See also comment above set_page_dirty().
  */
-static void action_result(unsigned long pfn, char *msg, int result)
+static void action_result(unsigned long pfn, enum mf_action_page_type type,
+			  enum mf_result result)
 {
-	pr_err("MCE %#lx: %s page recovery: %s\n",
-		pfn, msg, action_name[result]);
+	trace_memory_failure_event(pfn, type, result);
+
+	pr_err("MCE %#lx: recovery action for %s: %s\n",
+		pfn, action_page_types[type], action_name[result]);
 }
 
 static int page_action(struct page_state *ps, struct page *p,
@@ -832,25 +851,91 @@
 	int count;
 
 	result = ps->action(p, pfn);
-	action_result(pfn, ps->msg, result);
 
 	count = page_count(p) - 1;
-	if (ps->action == me_swapcache_dirty && result == DELAYED)
+	if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
 		count--;
 	if (count != 0) {
 		printk(KERN_ERR
-		       "MCE %#lx: %s page still referenced by %d users\n",
-		       pfn, ps->msg, count);
-		result = FAILED;
+		       "MCE %#lx: %s still referenced by %d users\n",
+		       pfn, action_page_types[ps->type], count);
+		result = MF_FAILED;
 	}
+	action_result(pfn, ps->type, result);
 
 	/* Could do more checks here if page looks ok */
 	/*
	 * Could adjust zone counters here to correct for the missing page.
	 */
 
-	return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
+	return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
+}
+
+/**
+ * get_hwpoison_page() - Get refcount for memory error handling:
+ * @page:	raw error page (hit by memory error)
+ *
+ * Return: return 0 if failed to grab the refcount, otherwise true (some
+ * non-zero value.)
+ */
+int get_hwpoison_page(struct page *page)
+{
+	struct page *head = compound_head(page);
+
+	if (PageHuge(head))
+		return get_page_unless_zero(head);
+
+	/*
+	 * Thp tail page has special refcounting rule (refcount of tail pages
+	 * is stored in ->_mapcount,) so we can't call get_page_unless_zero()
+	 * directly for tail pages.
+	 */
+	if (PageTransHuge(head)) {
+		/*
+		 * Non anonymous thp exists only in allocation/free time. We
+		 * can't handle such a case correctly, so let's give it up.
+		 * This should be better than triggering BUG_ON when kernel
+		 * tries to touch the "partially handled" page.
+		 */
+		if (!PageAnon(head)) {
+			pr_err("MCE: %#lx: non anonymous thp\n",
+				page_to_pfn(page));
+			return 0;
+		}
+
+		if (get_page_unless_zero(head)) {
+			if (PageTail(page))
+				get_page(page);
+			return 1;
+		} else {
+			return 0;
+		}
+	}
+
+	return get_page_unless_zero(page);
+}
+EXPORT_SYMBOL_GPL(get_hwpoison_page);
+
+/**
+ * put_hwpoison_page() - Put refcount for memory error handling:
+ * @page:	raw error page (hit by memory error)
+ */
+void put_hwpoison_page(struct page *page)
+{
+	struct page *head = compound_head(page);
+
+	if (PageHuge(head)) {
+		put_page(head);
+		return;
+	}
+
+	if (PageTransHuge(head))
+		if (page != head)
+			put_page(head);
+
+	put_page(page);
 }
+EXPORT_SYMBOL_GPL(put_hwpoison_page);
 
 /*
  * Do all that is necessary to remove user space mappings. Unmap
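
get_hwpoison_page() above ultimately rests on get_page_unless_zero(), that is, "take a reference only if the object is provably still live". A userspace model of that inc-not-zero primitive with C11 atomics (a sketch of the idea, not the kernel's implementation):

#include <stdatomic.h>
#include <stdio.h>

static int get_unless_zero(atomic_int *refcount)
{
	int old = atomic_load(refcount);

	while (old != 0)
		if (atomic_compare_exchange_weak(refcount, &old, old + 1))
			return 1;	/* ref taken; object is pinned */
	return 0;			/* already freed; leave it alone */
}

int main(void)
{
	atomic_int live = 2, dead = 0;

	printf("live: %d (count now %d)\n",
	       get_unless_zero(&live), (int)live);	/* 1, 3 */
	printf("dead: %d (count still %d)\n",
	       get_unless_zero(&dead), (int)dead);	/* 0, 0 */
	return 0;
}
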
@@ -865,10 +950,15 @@
 	int ret;
 	int kill = 1, forcekill;
 	struct page *hpage = *hpagep;
-	struct page *ppage;
 
+	/*
+	 * Here we are interested only in user-mapped pages, so skip any
+	 * other types of pages.
+	 */
 	if (PageReserved(p) || PageSlab(p))
 		return SWAP_SUCCESS;
+	if (!(PageLRU(hpage) || PageHuge(p)))
+		return SWAP_SUCCESS;
 
 	/*
 	 * This check implies we don't kill processes if their pages
@@ -877,8 +967,10 @@
 	if (!page_mapped(hpage))
 		return SWAP_SUCCESS;
 
-	if (PageKsm(p))
+	if (PageKsm(p)) {
+		pr_err("MCE %#lx: can't handle KSM pages.\n", pfn);
 		return SWAP_FAIL;
+	}
 
 	if (PageSwapCache(p)) {
 		printk(KERN_ERR
@@ -907,59 +999,6 @@
 	}
 
 	/*
-	 * ppage: poisoned page
-	 *   if p is regular page(4k page)
-	 *        ppage == real poisoned page;
-	 *   else p is hugetlb or THP, ppage == head page.
-	 */
-	ppage = hpage;
-
-	if (PageTransHuge(hpage)) {
-		/*
-		 * Verify that this isn't a hugetlbfs head page, the check for
-		 * PageAnon is just for avoid tripping a split_huge_page
-		 * internal debug check, as split_huge_page refuses to deal with
-		 * anything that isn't an anon page. PageAnon can't go away fro
-		 * under us because we hold a refcount on the hpage, without a
-		 * refcount on the hpage. split_huge_page can't be safely called
-		 * in the first place, having a refcount on the tail isn't
-		 * enough * to be safe.
-		 */
-		if (!PageHuge(hpage) && PageAnon(hpage)) {
-			if (unlikely(split_huge_page(hpage))) {
-				/*
-				 * FIXME: if splitting THP is failed, it is
-				 * better to stop the following operation rather
-				 * than causing panic by unmapping. System might
-				 * survive if the page is freed later.
-				 */
-				printk(KERN_INFO
-					"MCE %#lx: failed to split THP\n", pfn);
-
-				BUG_ON(!PageHWPoison(p));
-				return SWAP_FAIL;
-			}
-			/*
-			 * We pinned the head page for hwpoison handling,
-			 * now we split the thp and we are interested in
-			 * the hwpoisoned raw page, so move the refcount
-			 * to it. Similarly, page lock is shifted.
-			 */
-			if (hpage != p) {
-				if (!(flags & MF_COUNT_INCREASED)) {
-					put_page(hpage);
-					get_page(p);
-				}
-				lock_page(p);
-				unlock_page(hpage);
-				*hpagep = p;
-			}
-			/* THP is split, so ppage should be the real poisoned page. */
-			ppage = p;
-		}
-	}
-
-	/*
 	 * First collect all the processes that have the page
 	 * mapped in dirty form.  This has to be done before try_to_unmap,
 	 * because ttu takes the rmap data structures down.
@@ -968,12 +1007,12 @@
 	 * there's nothing that can be done.
 	 */
 	if (kill)
-		collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED);
+		collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
 
-	ret = try_to_unmap(ppage, ttu);
+	ret = try_to_unmap(hpage, ttu);
 	if (ret != SWAP_SUCCESS)
 		printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
-				pfn, page_mapcount(ppage));
+				pfn, page_mapcount(hpage));
 
 	/*
 	 * Now that the dirty bit has been propagated to the
@@ -985,7 +1024,7 @@
 	 * use a more force-full uncatchable kill to prevent
 	 * any accesses to the poisoned memory.
 	 */
-	forcekill = PageDirty(ppage) || (flags & MF_MUST_KILL);
+	forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
 	kill_procs(&tokill, forcekill, trapno,
 		      ret != SWAP_SUCCESS, p, pfn, flags);
 
@@ -995,7 +1034,7 @@
 static void set_page_hwpoison_huge_page(struct page *hpage)
 {
 	int i;
-	int nr_pages = 1 << compound_trans_order(hpage);
+	int nr_pages = 1 << compound_order(hpage);
 	for (i = 0; i < nr_pages; i++)
 		SetPageHWPoison(hpage + i);
 }
@@ -1003,7 +1042,7 @@
 static void clear_page_hwpoison_huge_page(struct page *hpage)
 {
 	int i;
-	int nr_pages = 1 << compound_trans_order(hpage);
+	int nr_pages = 1 << compound_order(hpage);
 	for (i = 0; i < nr_pages; i++)
 		ClearPageHWPoison(hpage + i);
 }
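
The 1 << compound_order(hpage) computations above mark every base page of a compound page: an order-N page spans 2^N base pages, so for example an order-9 x86 hugepage poisons 512 PG_hwpoison bits at once. A quick arithmetic check (assumes 4 KiB base pages):

#include <stdio.h>

int main(void)
{
	unsigned int order;

	for (order = 0; order <= 9; order++) {
		unsigned long nr_pages = 1UL << order;

		/* order 9 -> 512 pages -> 2048 KiB, i.e. one 2 MiB THP */
		printf("order %u: %4lu pages, %8lu KiB\n",
		       order, nr_pages, nr_pages * 4);
	}
	return 0;
}
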
@@ -1031,6 +1070,7 @@
 	struct page_state *ps;
 	struct page *p;
 	struct page *hpage;
+	struct page *orig_head;
 	int res;
 	unsigned int nr_pages;
 	unsigned long page_flags;
@@ -1046,7 +1086,7 @@
 	}
 
 	p = pfn_to_page(pfn);
-	hpage = compound_head(p);
+	orig_head = hpage = compound_head(p);
 	if (TestSetPageHWPoison(p)) {
 		printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
 		return 0;
@@ -1063,7 +1103,7 @@
 		nr_pages = 1 << compound_order(hpage);
 	else /* normal page or thp */
 		nr_pages = 1;
-	atomic_long_add(nr_pages, &num_poisoned_pages);
+	num_poisoned_pages_add(nr_pages);
 
 	/*
 	 * We need/can do nothing about count=0 pages.
@@ -1079,10 +1119,9 @@
 	 * In fact it's dangerous to directly bump up page count from 0,
 	 * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
 	 */
-	if (!(flags & MF_COUNT_INCREASED) &&
-		!get_page_unless_zero(hpage)) {
+	if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
 		if (is_free_buddy_page(p)) {
-			action_result(pfn, "free buddy", DELAYED);
+			action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
 			return 0;
 		} else if (PageHuge(hpage)) {
 			/*
@@ -1092,21 +1131,36 @@
 			if (PageHWPoison(hpage)) {
 				if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
 				    || (p != hpage && TestSetPageHWPoison(hpage))) {
-					atomic_long_sub(nr_pages, &num_poisoned_pages);
+					num_poisoned_pages_sub(nr_pages);
 					unlock_page(hpage);
 					return 0;
 				}
 			}
 			set_page_hwpoison_huge_page(hpage);
 			res = dequeue_hwpoisoned_huge_page(hpage);
-			action_result(pfn, "free huge",
-				      res ? IGNORED : DELAYED);
+			action_result(pfn, MF_MSG_FREE_HUGE,
+				      res ? MF_IGNORED : MF_DELAYED);
 			unlock_page(hpage);
 			return res;
 		} else {
-			action_result(pfn, "high order kernel", IGNORED);
+			action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
+			return -EBUSY;
+		}
+	}
+
+	if (!PageHuge(p) && PageTransHuge(hpage)) {
+		if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) {
+			if (!PageAnon(hpage))
+				pr_err("MCE: %#lx: non anonymous thp\n", pfn);
+			else
+				pr_err("MCE: %#lx: thp split failed\n", pfn);
+			if (TestClearPageHWPoison(p))
+				num_poisoned_pages_sub(nr_pages);
+			put_hwpoison_page(p);
 			return -EBUSY;
 		}
+		VM_BUG_ON_PAGE(!page_count(p), p);
+		hpage = compound_head(p);
 	}
 
 	/*
@@ -1118,29 +1172,34 @@
	 * walked by the page reclaim code, however that's not a big loss.
	 */
 	if (!PageHuge(p)) {
-		if (!PageLRU(hpage))
-			shake_page(hpage, 0);
-		if (!PageLRU(hpage)) {
+		if (!PageLRU(p))
+			shake_page(p, 0);
+		if (!PageLRU(p)) {
 			/*
 			 * shake_page could have turned it free.
 			 */
 			if (is_free_buddy_page(p)) {
-				action_result(pfn, "free buddy, 2nd try",
-						DELAYED);
+				if (flags & MF_COUNT_INCREASED)
+					action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
+				else
+					action_result(pfn, MF_MSG_BUDDY_2ND,
						      MF_DELAYED);
 				return 0;
 			}
-			action_result(pfn, "non LRU", IGNORED);
-			put_page(p);
-			return -EBUSY;
 		}
 	}
 
+	lock_page(hpage);
+
 	/*
-	 * Lock the page and wait for writeback to finish.
-	 * It's very difficult to mess with pages currently under IO
-	 * and in many cases impossible, so we just avoid it here.
+	 * The page could have changed compound pages during the locking.
+	 * If this happens just bail out.
 	 */
-	lock_page(hpage);
+	if (PageCompound(p) && compound_head(p) != orig_head) {
+		action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED);
+		res = -EBUSY;
+		goto out;
+	}
 
 	/*
	 * We use page flags to determine what action should be taken, but
@@ -1156,28 +1215,30 @@
	 */
 	if (!PageHWPoison(p)) {
 		printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
-		atomic_long_sub(nr_pages, &num_poisoned_pages);
-		put_page(hpage);
-		res = 0;
-		goto out;
+		num_poisoned_pages_sub(nr_pages);
+		unlock_page(hpage);
+		put_hwpoison_page(hpage);
+		return 0;
 	}
 	if (hwpoison_filter(p)) {
 		if (TestClearPageHWPoison(p))
-			atomic_long_sub(nr_pages, &num_poisoned_pages);
+			num_poisoned_pages_sub(nr_pages);
 		unlock_page(hpage);
-		put_page(hpage);
+		put_hwpoison_page(hpage);
 		return 0;
 	}
 
+	if (!PageHuge(p) && !PageTransTail(p) && !PageLRU(p))
+		goto identify_page_state;
+
 	/*
	 * For error on the tail page, we should set PG_hwpoison
	 * on the head page to show that the hugepage is hwpoisoned
	 */
 	if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) {
-		action_result(pfn, "hugepage already hardware poisoned",
-				IGNORED);
+		action_result(pfn, MF_MSG_POISONED_HUGE, MF_IGNORED);
 		unlock_page(hpage);
-		put_page(hpage);
+		put_hwpoison_page(hpage);
 		return 0;
 	}
 
 	/*
@@ -1189,6 +1250,10 @@
 	if (PageHuge(p))
 		set_page_hwpoison_huge_page(hpage);
 
+	/*
+	 * It's very difficult to mess with pages currently under IO
+	 * and in many cases impossible, so we just avoid it here.
+	 */
 	wait_on_page_writeback(p);
 
 	/*
@@ -1200,7 +1265,7 @@
	 */
 	if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
	    != SWAP_SUCCESS) {
-		printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn);
+		action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
 		res = -EBUSY;
 		goto out;
 	}
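
The hwpoison_filter() calls a few hunks up are the hooks used by the hwpoison-inject test module, which is also the usual way to exercise this whole path by hand (see the mce-test note at the top of the file). A hypothetical injector, assuming CONFIG_HWPOISON_INJECT and debugfs mounted in the usual place; the twin file unpoison-pfn reverses the operation via unpoison_memory():

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char buf[32];
	int fd, len;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <pfn>\n", argv[0]);
		return 1;
	}
	/* Runs memory_failure() on the PFN, subject to the
	 * hwpoison_filter_* knobs in the same debugfs directory. */
	fd = open("/sys/kernel/debug/hwpoison/corrupt-pfn", O_WRONLY);
	if (fd < 0) {
		perror("open");	/* module not loaded, or no debugfs */
		return 1;
	}
	len = snprintf(buf, sizeof(buf), "%lu\n", strtoul(argv[1], NULL, 0));
	if (write(fd, buf, len) != len)
		perror("write");
	close(fd);
	return 0;
}
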
@@ -1209,11 +1274,12 @@
 	 * Torn down by someone else?
 	 */
 	if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) {
-		action_result(pfn, "already truncated LRU", IGNORED);
+		action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED);
 		res = -EBUSY;
 		goto out;
 	}
 
+identify_page_state:
 	res = -EBUSY;
 	/*
	 * The first check uses the current page flags which may not have any
@@ -1223,6 +1289,9 @@
 	for (ps = error_states;; ps++)
 		if ((p->flags & ps->mask) == ps->res)
 			break;
+
+	page_flags |= (p->flags & (1UL << PG_dirty));
+
 	if (!ps->mask)
 		for (ps = error_states;; ps++)
 			if ((page_flags & ps->mask) == ps->res)
@@ -1281,10 +1350,10 @@
 
 	mf_cpu = &get_cpu_var(memory_failure_cpu);
 	spin_lock_irqsave(&mf_cpu->lock, proc_flags);
-	if (kfifo_put(&mf_cpu->fifo, &entry))
+	if (kfifo_put(&mf_cpu->fifo, entry))
 		schedule_work_on(smp_processor_id(), &mf_cpu->work);
 	else
-		pr_err("Memory failure: buffer overflow when queuing memory failure at 0x%#lx\n",
+		pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
 		       pfn);
 	spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
 	put_cpu_var(memory_failure_cpu);
@@ -1298,14 +1367,17 @@
 	unsigned long proc_flags;
 	int gotten;
 
-	mf_cpu = &__get_cpu_var(memory_failure_cpu);
+	mf_cpu = this_cpu_ptr(&memory_failure_cpu);
 	for (;;) {
 		spin_lock_irqsave(&mf_cpu->lock, proc_flags);
 		gotten = kfifo_get(&mf_cpu->fifo, &entry);
 		spin_unlock_irqrestore(&mf_cpu->lock, proc_flags);
 		if (!gotten)
 			break;
-		memory_failure(entry.pfn, entry.trapno, entry.flags);
+		if (entry.flags & MF_SOFT_OFFLINE)
+			soft_offline_page(pfn_to_page(entry.pfn), entry.flags);
+		else
+			memory_failure(entry.pfn, entry.trapno, entry.flags);
 	}
 }
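
memory_failure_queue() and memory_failure_work_func() above form a classic bounded-ring handoff: producers, possibly running in machine-check context, stash an entry and poke a worker, which drains until empty; kfifo_put() now taking the entry by value reflects the newer kfifo API. A userspace model of the shape of it (a mutex standing in for the spinlock; hypothetical sizes and types):

#include <pthread.h>
#include <stdio.h>

#define FIFO_SIZE 16		/* power of two, like the kfifo */

struct entry { unsigned long pfn; int flags; };

static struct {
	struct entry ring[FIFO_SIZE];
	unsigned int head, tail;
	pthread_mutex_t lock;
} fifo = { .lock = PTHREAD_MUTEX_INITIALIZER };

static int fifo_put(struct entry e)	/* by value, as in the new API */
{
	int ok = 0;

	pthread_mutex_lock(&fifo.lock);
	if (fifo.head - fifo.tail < FIFO_SIZE) {
		fifo.ring[fifo.head++ % FIFO_SIZE] = e;
		ok = 1;
	}
	pthread_mutex_unlock(&fifo.lock);
	return ok;	/* 0 == overflow, like the pr_err path above */
}

static int fifo_get(struct entry *e)
{
	int ok = 0;

	pthread_mutex_lock(&fifo.lock);
	if (fifo.tail != fifo.head) {
		*e = fifo.ring[fifo.tail++ % FIFO_SIZE];
		ok = 1;
	}
	pthread_mutex_unlock(&fifo.lock);
	return ok;
}

int main(void)
{
	struct entry e;

	fifo_put((struct entry){ .pfn = 0x1234, .flags = 0 });
	while (fifo_get(&e))	/* the work function's drain loop */
		printf("handling pfn %#lx\n", e.pfn);
	return 0;
}
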
@@ -1325,6 +1397,12 @@
 }
 core_initcall(memory_failure_init);
 
+#define unpoison_pr_info(fmt, pfn, rs)			\
+({							\
+	if (__ratelimit(rs))				\
+		pr_info(fmt, pfn);			\
+})
+
 /**
  * unpoison_memory - Unpoison a previously poisoned page
  * @pfn: Page number of the to be unpoisoned page
@@ -1343,6 +1421,8 @@
 	struct page *p;
 	int freeit = 0;
 	unsigned int nr_pages;
+	static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
+					DEFAULT_RATELIMIT_BURST);
 
 	if (!pfn_valid(pfn))
 		return -ENXIO;
@@ -1351,13 +1431,43 @@
 	page = compound_head(p);
 
 	if (!PageHWPoison(p)) {
-		pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
+		unpoison_pr_info("MCE: Page was already unpoisoned %#lx\n",
+				 pfn, &unpoison_rs);
+		return 0;
+	}
+
+	if (page_count(page) > 1) {
+		unpoison_pr_info("MCE: Someone grabs the hwpoison page %#lx\n",
+				 pfn, &unpoison_rs);
+		return 0;
+	}
+
+	if (page_mapped(page)) {
+		unpoison_pr_info("MCE: Someone maps the hwpoison page %#lx\n",
+				 pfn, &unpoison_rs);
+		return 0;
+	}
+
+	if (page_mapping(page)) {
+		unpoison_pr_info("MCE: the hwpoison page has non-NULL mapping %#lx\n",
+				 pfn, &unpoison_rs);
 		return 0;
 	}
 
-	nr_pages = 1 << compound_trans_order(page);
+	/*
+	 * unpoison_memory() can encounter thp only when the thp is being
+	 * worked by memory_failure() and the page lock is not held yet.
+	 * In such case, we yield to memory_failure() and make unpoison fail.
+	 */
+	if (!PageHuge(page) && PageTransHuge(page)) {
+		unpoison_pr_info("MCE: Memory failure is now running on %#lx\n",
				 pfn, &unpoison_rs);
+		return 0;
+	}
+
+	nr_pages = 1 << compound_order(page);
 
-	if (!get_page_unless_zero(page)) {
+	if (!get_hwpoison_page(p)) {
 		/*
		 * Since HWPoisoned hugepage should have non-zero refcount,
		 * race between memory failure and unpoison seems to happen.
@@ -1365,12 +1475,14 @@
		 * to the end.
		 */
 		if (PageHuge(page)) {
-			pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
+			unpoison_pr_info("MCE: Memory failure is now running on free hugepage %#lx\n",
+					 pfn, &unpoison_rs);
 			return 0;
 		}
 		if (TestClearPageHWPoison(p))
-			atomic_long_sub(nr_pages, &num_poisoned_pages);
-		pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
+			num_poisoned_pages_dec();
+		unpoison_pr_info("MCE: Software-unpoisoned free page %#lx\n",
+				 pfn, &unpoison_rs);
 		return 0;
 	}
 
@@ -1382,17 +1494,18 @@
	 * the free buddy page pool.
	 */
 	if (TestClearPageHWPoison(page)) {
-		pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
-		atomic_long_sub(nr_pages, &num_poisoned_pages);
+		unpoison_pr_info("MCE: Software-unpoisoned page %#lx\n",
+				 pfn, &unpoison_rs);
+		num_poisoned_pages_sub(nr_pages);
 		freeit = 1;
 		if (PageHuge(page))
 			clear_page_hwpoison_huge_page(page);
 	}
 	unlock_page(page);
 
-	put_page(page);
-	if (freeit)
-		put_page(page);
+	put_hwpoison_page(page);
+	if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
+		put_hwpoison_page(page);
 
 	return 0;
 }
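
The unpoison_pr_info() wrapper above guards against log flooding with __ratelimit(): at most DEFAULT_RATELIMIT_BURST messages per DEFAULT_RATELIMIT_INTERVAL window. A userspace model of that window-plus-burst policy (a sketch of the idea; the kernel's version also counts and reports the suppressed callbacks):

#include <stdio.h>
#include <time.h>

struct ratelimit { time_t begin; int interval, burst, printed; };

static int ratelimit_ok(struct ratelimit *rs)
{
	time_t now = time(NULL);

	if (now - rs->begin >= rs->interval) {	/* start a new window */
		rs->begin = now;
		rs->printed = 0;
	}
	if (rs->printed < rs->burst) {
		rs->printed++;
		return 1;			/* message allowed */
	}
	return 0;				/* suppressed */
}

int main(void)
{
	/* Mirrors DEFAULT_RATELIMIT_INTERVAL (5 s) and _BURST (10). */
	struct ratelimit rs = { .interval = 5, .burst = 10 };
	int i, shown = 0;

	for (i = 0; i < 1000; i++)
		if (ratelimit_ok(&rs))
			shown++;
	printf("printed %d of 1000 messages\n", shown);	/* 10 */
	return 0;
}
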
@@ -1405,7 +1518,7 @@
 		return alloc_huge_page_node(page_hstate(compound_head(p)),
 						   nid);
 	else
-		return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
+		return __alloc_pages_node(nid, GFP_HIGHUSER_MOVABLE, 0);
 }
 
 /*
@@ -1422,22 +1535,10 @@
 		return 1;
 
 	/*
-	 * The lock_memory_hotplug prevents a race with memory hotplug.
-	 * This is a big hammer, a better would be nicer.
-	 */
-	lock_memory_hotplug();
-
-	/*
-	 * Isolate the page, so that it doesn't get reallocated if it
-	 * was free. This flag should be kept set until the source page
-	 * is freed and PG_hwpoison on it is set.
-	 */
-	set_migratetype_isolate(p, true);
-	/*
	 * When the target page is a free hugepage, just remove it
	 * from free hugepage list.
	 */
-	if (!get_page_unless_zero(compound_head(p))) {
+	if (!get_hwpoison_page(p)) {
 		if (PageHuge(p)) {
 			pr_info("%s: %#lx free huge page\n", __func__, pfn);
 			ret = 0;
@@ -1453,7 +1554,6 @@
 			/* Not a free page */
 			ret = 1;
 	}
-	unlock_memory_hotplug();
 	return ret;
 }
 
@@ -1465,7 +1565,7 @@
 	/*
	 * Try to free it.
	 */
-	put_page(page);
+	put_hwpoison_page(page);
 	shake_page(page, 1);
 
 	/*
@@ -1474,7 +1574,7 @@
 	ret = __get_any_page(page, pfn, 0);
 	if (ret == 1 && !PageLRU(page)) {
 		/* Drop page reference which is from __get_any_page() */
-		put_page(page);
+		put_hwpoison_page(page);
 		pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
 			pfn, page->flags);
 		return -EIO;
@@ -1488,6 +1588,7 @@
 	int ret;
 	unsigned long pfn = page_to_pfn(page);
 	struct page *hpage = compound_head(page);
+	LIST_HEAD(pagelist);
 
 	/*
	 * This double-check of PageHWPoison is to avoid the race with
@@ -1496,99 +1597,50 @@
 	lock_page(hpage);
 	if (PageHWPoison(hpage)) {
 		unlock_page(hpage);
-		put_page(hpage);
+		put_hwpoison_page(hpage);
 		pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
 		return -EBUSY;
 	}
 	unlock_page(hpage);
 
-	/* Keep page count to indicate a given hugepage is isolated. */
-	ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL,
-				MIGRATE_SYNC);
-	put_page(hpage);
+	ret = isolate_huge_page(hpage, &pagelist);
+	/*
+	 * get_any_page() and isolate_huge_page() takes a refcount each,
+	 * so need to drop one here.
+	 */
+	put_hwpoison_page(hpage);
+	if (!ret) {
+		pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
+		return -EBUSY;
+	}
+
+	ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
+				MIGRATE_SYNC, MR_MEMORY_FAILURE);
 	if (ret) {
 		pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
 			pfn, ret, page->flags);
+		/*
		 * We know that soft_offline_huge_page() tries to migrate
		 * only one hugepage pointed to by hpage, so we need not
		 * run through the pagelist here.
		 */
+		putback_active_hugepage(hpage);
+		if (ret > 0)
+			ret = -EIO;
 	} else {
 		/* overcommit hugetlb page will be freed to buddy */
 		if (PageHuge(page)) {
 			set_page_hwpoison_huge_page(hpage);
 			dequeue_hwpoisoned_huge_page(hpage);
-			atomic_long_add(1 << compound_order(hpage),
-					&num_poisoned_pages);
+			num_poisoned_pages_add(1 << compound_order(hpage));
 		} else {
 			SetPageHWPoison(page);
-			atomic_long_inc(&num_poisoned_pages);
+			num_poisoned_pages_inc();
 		}
 	}
 	return ret;
 }
 
-static int __soft_offline_page(struct page *page, int flags);
-
-/**
- * soft_offline_page - Soft offline a page.
- * @page: page to offline
- * @flags: flags. Same as memory_failure().
- *
- * Returns 0 on success, otherwise negated errno.
- *
- * Soft offline a page, by migration or invalidation,
- * without killing anything. This is for the case when
- * a page is not corrupted yet (so it's still valid to access),
- * but has had a number of corrected errors and is better taken
- * out.
- *
- * The actual policy on when to do that is maintained by
- * user space.
- *
- * This should never impact any application or cause data loss,
- * however it might take some time.
- *
- * This is not a 100% solution for all memory, but tries to be
- * ``good enough'' for the majority of memory.
- */
-int soft_offline_page(struct page *page, int flags)
-{
-	int ret;
-	unsigned long pfn = page_to_pfn(page);
-	struct page *hpage = compound_head(page);
-
-	if (PageHWPoison(page)) {
-		pr_info("soft offline: %#lx page already poisoned\n", pfn);
-		return -EBUSY;
-	}
-	if (!PageHuge(page) && PageTransHuge(hpage)) {
-		if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
-			pr_info("soft offline: %#lx: failed to split THP\n",
-				pfn);
-			return -EBUSY;
-		}
-	}
-
-	ret = get_any_page(page, pfn, flags);
-	if (ret < 0)
-		return ret;
-	if (ret) { /* for in-use pages */
-		if (PageHuge(page))
-			ret = soft_offline_huge_page(page, flags);
-		else
-			ret = __soft_offline_page(page, flags);
-	} else { /* for free pages */
-		if (PageHuge(page)) {
-			set_page_hwpoison_huge_page(hpage);
-			dequeue_hwpoisoned_huge_page(hpage);
-			atomic_long_add(1 << compound_trans_order(hpage),
-					&num_poisoned_pages);
-		} else {
-			SetPageHWPoison(page);
-			atomic_long_inc(&num_poisoned_pages);
-		}
-	}
-	unset_migratetype_isolate(page, MIGRATE_MOVABLE);
-	return ret;
-}
-
 static int __soft_offline_page(struct page *page, int flags)
 {
 	int ret;
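
Soft offline, including the hugepage path just shown, is reachable from userspace by writing a physical address to /sys/devices/system/memory/soft_offline_page (root only; the sibling hard_offline_page attribute drives memory_failure() instead). A hedged sketch:

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	char buf[32];
	int fd, len;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <phys-addr>\n", argv[0]);
		return 1;
	}
	/* Takes a physical address (not a PFN); the kernel shifts it
	 * down by PAGE_SHIFT itself. */
	fd = open("/sys/devices/system/memory/soft_offline_page", O_WRONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	len = snprintf(buf, sizeof(buf), "%#llx\n",
		       (unsigned long long)strtoull(argv[1], NULL, 0));
	if (write(fd, buf, len) != len)
		perror("write");
	close(fd);
	return 0;
}
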
@@ -1604,7 +1656,7 @@
 	wait_on_page_writeback(page);
 	if (PageHWPoison(page)) {
 		unlock_page(page);
-		put_page(page);
+		put_hwpoison_page(page);
 		pr_info("soft offline: %#lx page already poisoned\n", pfn);
 		return -EBUSY;
 	}
@@ -1619,10 +1671,10 @@
	 * would need to fix isolation locking first.
	 */
 	if (ret == 1) {
-		put_page(page);
+		put_hwpoison_page(page);
 		pr_info("soft_offline: %#lx: invalidated\n", pfn);
 		SetPageHWPoison(page);
-		atomic_long_inc(&num_poisoned_pages);
+		num_poisoned_pages_inc();
 		return 0;
 	}
@@ -1636,38 +1688,26 @@
	 * Drop page reference which is came from get_any_page()
	 * successful isolate_lru_page() already took another one.
	 */
-	put_page(page);
+	put_hwpoison_page(page);
 	if (!ret) {
 		LIST_HEAD(pagelist);
 		inc_zone_page_state(page, NR_ISOLATED_ANON +
					page_is_file_cache(page));
 		list_add(&page->lru, &pagelist);
-		ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL,
+		ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
					MIGRATE_SYNC, MR_MEMORY_FAILURE);
 		if (ret) {
-			putback_lru_pages(&pagelist);
+			if (!list_empty(&pagelist)) {
+				list_del(&page->lru);
+				dec_zone_page_state(page, NR_ISOLATED_ANON +
						page_is_file_cache(page));
+				putback_lru_page(page);
+			}
+
 			pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
 				pfn, ret, page->flags);
 			if (ret > 0)
 				ret = -EIO;
-		} else {
-			/*
-			 * After page migration succeeds, the source page can
-			 * be trapped in pagevec and actual freeing is delayed.
-			 * Freeing code works differently based on PG_hwpoison,
-			 * so there's a race. We need to make sure that the
-			 * source page should be freed back to buddy before
-			 * setting PG_hwpoison.
-			 */
-			if (!is_free_buddy_page(page))
-				lru_add_drain_all();
-			if (!is_free_buddy_page(page))
-				drain_all_pages();
-			SetPageHWPoison(page);
-			if (!is_free_buddy_page(page))
-				pr_info("soft offline: %#lx: page leaked\n",
-					pfn);
-			atomic_long_inc(&num_poisoned_pages);
 		}
 	} else {
 		pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
@@ -1675,3 +1715,69 @@
 	}
 	return ret;
 }
+
+/**
+ * soft_offline_page - Soft offline a page.
+ * @page: page to offline
+ * @flags: flags. Same as memory_failure().
+ *
+ * Returns 0 on success, otherwise negated errno.
+ *
+ * Soft offline a page, by migration or invalidation,
+ * without killing anything. This is for the case when
+ * a page is not corrupted yet (so it's still valid to access),
+ * but has had a number of corrected errors and is better taken
+ * out.
+ *
+ * The actual policy on when to do that is maintained by
+ * user space.
+ *
+ * This should never impact any application or cause data loss,
+ * however it might take some time.
+ *
+ * This is not a 100% solution for all memory, but tries to be
+ * ``good enough'' for the majority of memory.
+ */
+int soft_offline_page(struct page *page, int flags)
+{
+	int ret;
+	unsigned long pfn = page_to_pfn(page);
+	struct page *hpage = compound_head(page);
+
+	if (PageHWPoison(page)) {
+		pr_info("soft offline: %#lx page already poisoned\n", pfn);
+		if (flags & MF_COUNT_INCREASED)
+			put_hwpoison_page(page);
+		return -EBUSY;
+	}
+	if (!PageHuge(page) && PageTransHuge(hpage)) {
+		if (PageAnon(hpage) && unlikely(split_huge_page(hpage))) {
+			pr_info("soft offline: %#lx: failed to split THP\n",
+				pfn);
+			if (flags & MF_COUNT_INCREASED)
+				put_hwpoison_page(page);
+			return -EBUSY;
+		}
+	}
+
+	get_online_mems();
+
+	ret = get_any_page(page, pfn, flags);
+	put_online_mems();
+	if (ret > 0) { /* for in-use pages */
+		if (PageHuge(page))
+			ret = soft_offline_huge_page(page, flags);
+		else
+			ret = __soft_offline_page(page, flags);
+	} else if (ret == 0) { /* for free pages */
+		if (PageHuge(page)) {
+			set_page_hwpoison_huge_page(hpage);
+			if (!dequeue_hwpoisoned_huge_page(hpage))
+				num_poisoned_pages_add(1 << compound_order(hpage));
+		} else {
+			if (!TestSetPageHWPoison(page))
+				num_poisoned_pages_inc();
+		}
+	}
+	return ret;
+}
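
For self-testing against one's own mappings, the madvise() interface drives the same entry points: MADV_SOFT_OFFLINE ends up in soft_offline_page() and MADV_HWPOISON in memory_failure(). A sketch that soft-offlines one of its own anonymous pages (needs CAP_SYS_ADMIN; assumes the libc headers expose MADV_SOFT_OFFLINE, otherwise it comes from <linux/mman.h>):

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long psize = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, psize, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	memset(p, 0xaa, psize);			/* fault the page in */
	if (madvise(p, psize, MADV_SOFT_OFFLINE)) {
		perror("madvise(MADV_SOFT_OFFLINE)");
		return 1;
	}
	/* Contents were migrated; the old physical page is now offline.
	 * MADV_HWPOISON would instead hard-poison it and deliver SIGBUS
	 * on the next touch. */
	printf("still readable after offline: %#x\n", p[0] & 0xff);
	munmap(p, psize);
	return 0;
}
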