--- zzzz-none-000/linux-3.10.107/mm/vmscan.c 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/mm/vmscan.c 2021-02-04 17:41:59.000000000 +0000 @@ -11,6 +11,8 @@ * Multiqueue VM started 5.8.00, Rik van Riel. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -43,6 +45,7 @@ #include #include #include +#include #include #include @@ -56,32 +59,20 @@ #include struct scan_control { - /* Incremented by the number of inactive pages that were scanned */ - unsigned long nr_scanned; - - /* Number of pages freed so far during a call to shrink_zones() */ - unsigned long nr_reclaimed; - /* How many pages shrink_list() should reclaim */ unsigned long nr_to_reclaim; - unsigned long hibernation_mode; - /* This context's GFP mask */ gfp_t gfp_mask; - int may_writepage; - - /* Can mapped pages be reclaimed? */ - int may_unmap; - - /* Can pages be swapped as part of reclaim? */ - int may_swap; - + /* Allocation order */ int order; - /* Scan (total_size >> priority) pages at once */ - int priority; + /* + * Nodemask of nodes allowed by the caller. If NULL, all nodes + * are scanned. + */ + nodemask_t *nodemask; /* * The memory cgroup that hit its limit and as a result is the @@ -89,11 +80,30 @@ */ struct mem_cgroup *target_mem_cgroup; - /* - * Nodemask of nodes allowed by the caller. If NULL, all nodes - * are scanned. - */ - nodemask_t *nodemask; + /* Scan (total_size >> priority) pages at once */ + int priority; + + unsigned int may_writepage:1; + + /* Can mapped pages be reclaimed? */ + unsigned int may_unmap:1; + + /* Can pages be swapped as part of reclaim? */ + unsigned int may_swap:1; + + /* Can cgroups be reclaimed below their normal consumption range? */ + unsigned int may_thrash:1; + + unsigned int hibernation_mode:1; + + /* One of the zones is ready for compaction */ + unsigned int compaction_ready:1; + + /* Incremented by the number of inactive pages that were scanned */ + unsigned long nr_scanned; + + /* Number of pages freed so far during a call to shrink_zones() */ + unsigned long nr_reclaimed; }; #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) @@ -130,7 +140,11 @@ * From 0 .. 100. Higher means more swappy. */ int vm_swappiness = 60; -unsigned long vm_total_pages; /* The total number of pages which the VM controls */ +/* + * The total number of pages which are beyond the high watermark within all + * zones. + */ +unsigned long vm_total_pages; static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); @@ -140,13 +154,64 @@ { return !sc->target_mem_cgroup; } + +/** + * sane_reclaim - is the usual dirty throttling mechanism operational? + * @sc: scan_control in question + * + * The normal page dirty throttling mechanism in balance_dirty_pages() is + * completely broken with the legacy memcg and direct stalling in + * shrink_page_list() is used for throttling instead, which lacks all the + * niceties such as fairness, adaptive pausing, bandwidth proportional + * allocation and configurability. + * + * This function tests whether the vmscan currently in progress can assume + * that the normal dirty throttling mechanism is operational. + */ +static bool sane_reclaim(struct scan_control *sc) +{ + struct mem_cgroup *memcg = sc->target_mem_cgroup; + + if (!memcg) + return true; +#ifdef CONFIG_CGROUP_WRITEBACK + if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return true; +#endif + return false; +} #else static bool global_reclaim(struct scan_control *sc) { return true; } + +static bool sane_reclaim(struct scan_control *sc) +{ + return true; +} #endif +static unsigned long zone_reclaimable_pages(struct zone *zone) +{ + unsigned long nr; + + nr = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_FILE); + + if (get_nr_swap_pages() > 0) + nr += zone_page_state(zone, NR_ACTIVE_ANON) + + zone_page_state(zone, NR_INACTIVE_ANON); + + return nr; +} + +bool zone_reclaimable(struct zone *zone) +{ + return zone_page_state(zone, NR_PAGES_SCANNED) < + zone_reclaimable_pages(zone) * 6; +} + static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru) { if (!mem_cgroup_disabled()) @@ -156,14 +221,31 @@ } /* - * Add a shrinker callback to be called from the vm + * Add a shrinker callback to be called from the vm. */ -void register_shrinker(struct shrinker *shrinker) +int register_shrinker(struct shrinker *shrinker) { - atomic_long_set(&shrinker->nr_in_batch, 0); + size_t size = sizeof(*shrinker->nr_deferred); + + /* + * If we only have one possible node in the system anyway, save + * ourselves the trouble and disable NUMA aware behavior. This way we + * will save memory and some small loop time later. + */ + if (nr_node_ids == 1) + shrinker->flags &= ~SHRINKER_NUMA_AWARE; + + if (shrinker->flags & SHRINKER_NUMA_AWARE) + size *= nr_node_ids; + + shrinker->nr_deferred = kzalloc(size, GFP_KERNEL); + if (!shrinker->nr_deferred) + return -ENOMEM; + down_write(&shrinker_rwsem); list_add_tail(&shrinker->list, &shrinker_list); up_write(&shrinker_rwsem); + return 0; } EXPORT_SYMBOL(register_shrinker); @@ -175,146 +257,228 @@ down_write(&shrinker_rwsem); list_del(&shrinker->list); up_write(&shrinker_rwsem); + kfree(shrinker->nr_deferred); } EXPORT_SYMBOL(unregister_shrinker); -static inline int do_shrinker_shrink(struct shrinker *shrinker, - struct shrink_control *sc, - unsigned long nr_to_scan) -{ - sc->nr_to_scan = nr_to_scan; - return (*shrinker->shrink)(shrinker, sc); +#define SHRINK_BATCH 128 + +static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, + struct shrinker *shrinker, + unsigned long nr_scanned, + unsigned long nr_eligible) +{ + unsigned long freed = 0; + unsigned long long delta; + long total_scan; + long freeable; + long nr; + long new_nr; + int nid = shrinkctl->nid; + long batch_size = shrinker->batch ? shrinker->batch + : SHRINK_BATCH; + long scanned = 0, next_deferred; + + freeable = shrinker->count_objects(shrinker, shrinkctl); + if (freeable == 0) + return 0; + + /* + * copy the current shrinker scan count into a local variable + * and zero it so that other concurrent shrinker invocations + * don't also do this scanning work. + */ + nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); + + total_scan = nr; + delta = (4 * nr_scanned) / shrinker->seeks; + delta *= freeable; + do_div(delta, nr_eligible + 1); + total_scan += delta; + if (total_scan < 0) { + pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", + shrinker->scan_objects, total_scan); + total_scan = freeable; + next_deferred = nr; + } else + next_deferred = total_scan; + + /* + * We need to avoid excessive windup on filesystem shrinkers + * due to large numbers of GFP_NOFS allocations causing the + * shrinkers to return -1 all the time. This results in a large + * nr being built up so when a shrink that can do some work + * comes along it empties the entire cache due to nr >>> + * freeable. This is bad for sustaining a working set in + * memory. + * + * Hence only allow the shrinker to scan the entire cache when + * a large delta change is calculated directly. + */ + if (delta < freeable / 4) + total_scan = min(total_scan, freeable / 2); + + /* + * Avoid risking looping forever due to too large nr value: + * never try to free more than twice the estimate number of + * freeable entries. + */ + if (total_scan > freeable * 2) + total_scan = freeable * 2; + + trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, + nr_scanned, nr_eligible, + freeable, delta, total_scan); + + /* + * Normally, we should not scan less than batch_size objects in one + * pass to avoid too frequent shrinker calls, but if the slab has less + * than batch_size objects in total and we are really tight on memory, + * we will try to reclaim all available objects, otherwise we can end + * up failing allocations although there are plenty of reclaimable + * objects spread over several slabs with usage less than the + * batch_size. + * + * We detect the "tight on memory" situations by looking at the total + * number of objects we want to scan (total_scan). If it is greater + * than the total number of objects on slab (freeable), we must be + * scanning at high prio and therefore should try to reclaim as much as + * possible. + */ + while (total_scan >= batch_size || + total_scan >= freeable) { + unsigned long ret; + unsigned long nr_to_scan = min(batch_size, total_scan); + + shrinkctl->nr_to_scan = nr_to_scan; + ret = shrinker->scan_objects(shrinker, shrinkctl); + if (ret == SHRINK_STOP) + break; + freed += ret; + + count_vm_events(SLABS_SCANNED, nr_to_scan); + total_scan -= nr_to_scan; + scanned += nr_to_scan; + + cond_resched(); + } + + if (next_deferred >= scanned) + next_deferred -= scanned; + else + next_deferred = 0; + /* + * move the unused scan count back into the shrinker in a + * manner that handles concurrent updates. If we exhausted the + * scan, there is no need to do an update. + */ + if (next_deferred > 0) + new_nr = atomic_long_add_return(next_deferred, + &shrinker->nr_deferred[nid]); + else + new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); + + trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan); + return freed; } -#define SHRINK_BATCH 128 -/* - * Call the shrink functions to age shrinkable caches - * - * Here we assume it costs one seek to replace a lru page and that it also - * takes a seek to recreate a cache object. With this in mind we age equal - * percentages of the lru and ageable caches. This should balance the seeks - * generated by these structures. - * - * If the vm encountered mapped pages on the LRU it increase the pressure on - * slab to avoid swapping. - * - * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits. - * - * `lru_pages' represents the number of on-LRU pages in all the zones which - * are eligible for the caller's allocation attempt. It is used for balancing - * slab reclaim versus page reclaim. - * - * Returns the number of slab objects which we shrunk. - */ -unsigned long shrink_slab(struct shrink_control *shrink, - unsigned long nr_pages_scanned, - unsigned long lru_pages) +/** + * shrink_slab - shrink slab caches + * @gfp_mask: allocation context + * @nid: node whose slab caches to target + * @memcg: memory cgroup whose slab caches to target + * @nr_scanned: pressure numerator + * @nr_eligible: pressure denominator + * + * Call the shrink functions to age shrinkable caches. + * + * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set, + * unaware shrinkers will receive a node id of 0 instead. + * + * @memcg specifies the memory cgroup to target. If it is not NULL, + * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan + * objects from the memory cgroup specified. Otherwise all shrinkers + * are called, and memcg aware shrinkers are supposed to scan the + * global list then. + * + * @nr_scanned and @nr_eligible form a ratio that indicate how much of + * the available objects should be scanned. Page reclaim for example + * passes the number of pages scanned and the number of pages on the + * LRU lists that it considered on @nid, plus a bias in @nr_scanned + * when it encountered mapped pages. The ratio is further biased by + * the ->seeks setting of the shrink function, which indicates the + * cost to recreate an object relative to that of an LRU page. + * + * Returns the number of reclaimed slab objects. + */ +static unsigned long shrink_slab(gfp_t gfp_mask, int nid, + struct mem_cgroup *memcg, + unsigned long nr_scanned, + unsigned long nr_eligible) { struct shrinker *shrinker; - unsigned long ret = 0; + unsigned long freed = 0; - if (nr_pages_scanned == 0) - nr_pages_scanned = SWAP_CLUSTER_MAX; + if (memcg && !memcg_kmem_is_active(memcg)) + return 0; + + if (nr_scanned == 0) + nr_scanned = SWAP_CLUSTER_MAX; if (!down_read_trylock(&shrinker_rwsem)) { - /* Assume we'll be able to shrink next time */ - ret = 1; + /* + * If we would return 0, our callers would understand that we + * have nothing else to shrink and give up trying. By returning + * 1 we keep it going and assume we'll be able to shrink next + * time. + */ + freed = 1; goto out; } list_for_each_entry(shrinker, &shrinker_list, list) { - unsigned long long delta; - long total_scan; - long max_pass; - int shrink_ret = 0; - long nr; - long new_nr; - long batch_size = shrinker->batch ? shrinker->batch - : SHRINK_BATCH; + struct shrink_control sc = { + .gfp_mask = gfp_mask, + .nid = nid, + .memcg = memcg, + }; - max_pass = do_shrinker_shrink(shrinker, shrink, 0); - if (max_pass <= 0) + if (memcg && !(shrinker->flags & SHRINKER_MEMCG_AWARE)) continue; - /* - * copy the current shrinker scan count into a local variable - * and zero it so that other concurrent shrinker invocations - * don't also do this scanning work. - */ - nr = atomic_long_xchg(&shrinker->nr_in_batch, 0); - - total_scan = nr; - delta = (4 * nr_pages_scanned) / shrinker->seeks; - delta *= max_pass; - do_div(delta, lru_pages + 1); - total_scan += delta; - if (total_scan < 0) { - printk(KERN_ERR "shrink_slab: %pF negative objects to " - "delete nr=%ld\n", - shrinker->shrink, total_scan); - total_scan = max_pass; - } + if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) + sc.nid = 0; - /* - * We need to avoid excessive windup on filesystem shrinkers - * due to large numbers of GFP_NOFS allocations causing the - * shrinkers to return -1 all the time. This results in a large - * nr being built up so when a shrink that can do some work - * comes along it empties the entire cache due to nr >>> - * max_pass. This is bad for sustaining a working set in - * memory. - * - * Hence only allow the shrinker to scan the entire cache when - * a large delta change is calculated directly. - */ - if (delta < max_pass / 4) - total_scan = min(total_scan, max_pass / 2); - - /* - * Avoid risking looping forever due to too large nr value: - * never try to free more than twice the estimate number of - * freeable entries. - */ - if (total_scan > max_pass * 2) - total_scan = max_pass * 2; + freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible); + } - trace_mm_shrink_slab_start(shrinker, shrink, nr, - nr_pages_scanned, lru_pages, - max_pass, delta, total_scan); + up_read(&shrinker_rwsem); +out: + cond_resched(); + return freed; +} - while (total_scan >= batch_size) { - int nr_before; +void drop_slab_node(int nid) +{ + unsigned long freed; - nr_before = do_shrinker_shrink(shrinker, shrink, 0); - shrink_ret = do_shrinker_shrink(shrinker, shrink, - batch_size); - if (shrink_ret == -1) - break; - if (shrink_ret < nr_before) - ret += nr_before - shrink_ret; - count_vm_events(SLABS_SCANNED, batch_size); - total_scan -= batch_size; + do { + struct mem_cgroup *memcg = NULL; - cond_resched(); - } + freed = 0; + do { + freed += shrink_slab(GFP_KERNEL, nid, memcg, + 1000, 1000); + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); + } while (freed > 10); +} - /* - * move the unused scan count back into the shrinker in a - * manner that handles concurrent updates. If we exhausted the - * scan, there is no need to do an update. - */ - if (total_scan > 0) - new_nr = atomic_long_add_return(total_scan, - &shrinker->nr_in_batch); - else - new_nr = atomic_long_read(&shrinker->nr_in_batch); +void drop_slab(void) +{ + int nid; - trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr); - } - up_read(&shrinker_rwsem); -out: - cond_resched(); - return ret; + for_each_online_node(nid) + drop_slab_node(nid); } static inline int is_page_cache_freeable(struct page *page) @@ -327,14 +491,13 @@ return page_count(page) - page_has_private(page) == 2; } -static int may_write_to_queue(struct backing_dev_info *bdi, - struct scan_control *sc) +static int may_write_to_inode(struct inode *inode, struct scan_control *sc) { if (current->flags & PF_SWAPWRITE) return 1; - if (!bdi_write_congested(bdi)) + if (!inode_write_congested(inode)) return 1; - if (bdi == current->backing_dev_info) + if (inode_to_bdi(inode) == current->backing_dev_info) return 1; return 0; } @@ -386,7 +549,7 @@ * stalls if we need to run get_block(). We could test * PagePrivate for that. * - * If this process is currently in __generic_file_aio_write() against + * If this process is currently in __generic_file_write_iter() against * this page's queue, we can perform writeback even if that * will block. * @@ -405,7 +568,7 @@ if (page_has_private(page)) { if (try_to_free_buffers(page)) { ClearPageDirty(page); - printk("%s: orphaned page\n", __func__); + pr_info("%s: orphaned page\n", __func__); return PAGE_CLEAN; } } @@ -413,7 +576,7 @@ } if (mapping->a_ops->writepage == NULL) return PAGE_ACTIVATE; - if (!may_write_to_queue(mapping->backing_dev_info, sc)) + if (!may_write_to_inode(mapping->host, sc)) return PAGE_KEEP; if (clear_page_dirty_for_io(page)) { @@ -451,12 +614,17 @@ * Same as remove_mapping, but if the page is removed from the mapping, it * gets returned with a refcount of 0. */ -static int __remove_mapping(struct address_space *mapping, struct page *page) +static int __remove_mapping(struct address_space *mapping, struct page *page, + bool reclaimed) { + unsigned long flags; + struct mem_cgroup *memcg; + BUG_ON(!PageLocked(page)); BUG_ON(mapping != page_mapping(page)); - spin_lock_irq(&mapping->tree_lock); + memcg = mem_cgroup_begin_page_stat(page); + spin_lock_irqsave(&mapping->tree_lock, flags); /* * The non racy check for a busy page. * @@ -492,17 +660,32 @@ if (PageSwapCache(page)) { swp_entry_t swap = { .val = page_private(page) }; + mem_cgroup_swapout(page, swap); __delete_from_swap_cache(page); - spin_unlock_irq(&mapping->tree_lock); - swapcache_free(swap, page); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + mem_cgroup_end_page_stat(memcg); + swapcache_free(swap); } else { void (*freepage)(struct page *); + void *shadow = NULL; freepage = mapping->a_ops->freepage; - - __delete_from_page_cache(page); - spin_unlock_irq(&mapping->tree_lock); - mem_cgroup_uncharge_cache_page(page); + /* + * Remember a shadow entry for reclaimed file cache in + * order to detect refaults, thus thrashing, later on. + * + * But don't store shadows in an address space that is + * already exiting. This is not just an optizimation, + * inode reclaim needs to empty out the radix tree or + * the nodes are lost. Don't plant shadows behind its + * back. + */ + if (reclaimed && page_is_file_cache(page) && + !mapping_exiting(mapping)) + shadow = workingset_eviction(mapping, page); + __delete_from_page_cache(page, shadow, memcg); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + mem_cgroup_end_page_stat(memcg); if (freepage != NULL) freepage(page); @@ -511,7 +694,8 @@ return 1; cannot_free: - spin_unlock_irq(&mapping->tree_lock); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + mem_cgroup_end_page_stat(memcg); return 0; } @@ -523,7 +707,7 @@ */ int remove_mapping(struct address_space *mapping, struct page *page) { - if (__remove_mapping(mapping, page)) { + if (__remove_mapping(mapping, page, false)) { /* * Unfreezing the refcount with 1 rather than 2 effectively * drops the pagecache ref for us without requiring another @@ -546,11 +730,10 @@ */ void putback_lru_page(struct page *page) { - int lru; - int active = !!TestClearPageActive(page); + bool is_unevictable; int was_unevictable = PageUnevictable(page); - VM_BUG_ON(PageLRU(page)); + VM_BUG_ON_PAGE(PageLRU(page), page); redo: ClearPageUnevictable(page); @@ -562,14 +745,14 @@ * unevictable page on [in]active list. * We know how to handle that. */ - lru = active + page_lru_base_type(page); - lru_cache_add_lru(page, lru); + is_unevictable = false; + lru_cache_add(page); } else { /* * Put unevictable pages directly on zone's unevictable * list. */ - lru = LRU_UNEVICTABLE; + is_unevictable = true; add_page_to_unevictable_list(page); /* * When racing with an mlock or AS_UNEVICTABLE clearing @@ -589,7 +772,7 @@ * page is on unevictable list, it never be freed. To avoid that, * check after we added it to the list, again. */ - if (lru == LRU_UNEVICTABLE && page_evictable(page)) { + if (is_unevictable && page_evictable(page)) { if (!isolate_lru_page(page)) { put_page(page); goto redo; @@ -600,9 +783,9 @@ */ } - if (was_unevictable && lru != LRU_UNEVICTABLE) + if (was_unevictable && !is_unevictable) count_vm_event(UNEVICTABLE_PGRESCUED); - else if (!was_unevictable && lru == LRU_UNEVICTABLE) + else if (!was_unevictable && is_unevictable) count_vm_event(UNEVICTABLE_PGCULLED); put_page(page); /* drop ref from isolate */ @@ -670,6 +853,35 @@ return PAGEREF_RECLAIM; } +/* Check if a page is dirty or under writeback */ +static void page_check_dirty_writeback(struct page *page, + bool *dirty, bool *writeback) +{ + struct address_space *mapping; + + /* + * Anonymous pages are not handled by flushers and must be written + * from reclaim context. Do not stall reclaim based on them + */ + if (!page_is_file_cache(page)) { + *dirty = false; + *writeback = false; + return; + } + + /* By default assume that the page flags are accurate */ + *dirty = PageDirty(page); + *writeback = PageWriteback(page); + + /* Verify dirty/writeback state if the filesystem supports it */ + if (!page_has_private(page)) + return; + + mapping = page_mapping(page); + if (mapping && mapping->a_ops->is_dirty_writeback) + mapping->a_ops->is_dirty_writeback(page, dirty, writeback); +} + /* * shrink_page_list() returns the number of reclaimed pages */ @@ -678,25 +890,30 @@ struct scan_control *sc, enum ttu_flags ttu_flags, unsigned long *ret_nr_dirty, + unsigned long *ret_nr_unqueued_dirty, + unsigned long *ret_nr_congested, unsigned long *ret_nr_writeback, + unsigned long *ret_nr_immediate, bool force_reclaim) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); int pgactivate = 0; + unsigned long nr_unqueued_dirty = 0; unsigned long nr_dirty = 0; unsigned long nr_congested = 0; unsigned long nr_reclaimed = 0; unsigned long nr_writeback = 0; + unsigned long nr_immediate = 0; cond_resched(); - mem_cgroup_uncharge_start(); while (!list_empty(page_list)) { struct address_space *mapping; struct page *page; int may_enter_fs; enum page_references references = PAGEREF_RECLAIM_CLEAN; + bool dirty, writeback; cond_resched(); @@ -706,8 +923,8 @@ if (!trylock_page(page)) goto keep; - VM_BUG_ON(PageActive(page)); - VM_BUG_ON(page_zone(page) != zone); + VM_BUG_ON_PAGE(PageActive(page), page); + VM_BUG_ON_PAGE(page_zone(page) != zone, page); sc->nr_scanned++; @@ -724,20 +941,74 @@ may_enter_fs = (sc->gfp_mask & __GFP_FS) || (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); + /* + * The number of dirty pages determines if a zone is marked + * reclaim_congested which affects wait_iff_congested. kswapd + * will stall and start writing pages if the tail of the LRU + * is all dirty unqueued pages. + */ + page_check_dirty_writeback(page, &dirty, &writeback); + if (dirty || writeback) + nr_dirty++; + + if (dirty && !writeback) + nr_unqueued_dirty++; + + /* + * Treat this page as congested if the underlying BDI is or if + * pages are cycling through the LRU so quickly that the + * pages marked for immediate reclaim are making it to the + * end of the LRU a second time. + */ + mapping = page_mapping(page); + if (((dirty || writeback) && mapping && + inode_write_congested(mapping->host)) || + (writeback && PageReclaim(page))) + nr_congested++; + + /* + * If a page at the tail of the LRU is under writeback, there + * are three cases to consider. + * + * 1) If reclaim is encountering an excessive number of pages + * under writeback and this page is both under writeback and + * PageReclaim then it indicates that pages are being queued + * for IO but are being recycled through the LRU before the + * IO can complete. Waiting on the page itself risks an + * indefinite stall if it is impossible to writeback the + * page due to IO error or disconnected storage so instead + * note that the LRU is being scanned too quickly and the + * caller can stall after page list has been processed. + * + * 2) Global or new memcg reclaim encounters a page that is + * not marked for immediate reclaim, or the caller does not + * have __GFP_FS (or __GFP_IO if it's simply going to swap, + * not to fs). In this case mark the page for immediate + * reclaim and continue scanning. + * + * Require may_enter_fs because we would wait on fs, which + * may not have submitted IO yet. And the loop driver might + * enter reclaim, and deadlock if it waits on a page for + * which it is needed to do the write (loop masks off + * __GFP_IO|__GFP_FS for this reason); but more thought + * would probably show more reasons. + * + * 3) Legacy memcg encounters a page that is already marked + * PageReclaim. memcg does not have any dirty pages + * throttling so we could easily OOM just because too many + * pages are in writeback and there is nothing else to + * reclaim. Wait for the writeback to complete. + */ if (PageWriteback(page)) { - /* - * memcg doesn't have any dirty pages throttling so we - * could easily OOM just because too many pages are in - * writeback and there is nothing else to reclaim. - * - * Require may_enter_fs to wait on writeback, because - * fs may not have submitted IO yet. And a loop driver - * thread might enter reclaim, and deadlock if it waits - * on a page for which it is needed to do the write - * (loop masks off __GFP_IO|__GFP_FS for this reason); - * but more thought would probably show more reasons. - */ - if (global_reclaim(sc) || + /* Case 1 above */ + if (current_is_kswapd() && + PageReclaim(page) && + test_bit(ZONE_WRITEBACK, &zone->flags)) { + nr_immediate++; + goto keep_locked; + + /* Case 2 above */ + } else if (sane_reclaim(sc) || !PageReclaim(page) || !may_enter_fs) { /* * This is slightly racy - end_page_writeback() @@ -753,8 +1024,15 @@ SetPageReclaim(page); nr_writeback++; goto keep_locked; + + /* Case 3 above */ + } else { + unlock_page(page); + wait_on_page_writeback(page); + /* then go back and try same page again */ + list_add_tail(&page->lru, page_list); + continue; } - wait_on_page_writeback(page); } if (!force_reclaim) @@ -780,16 +1058,18 @@ if (!add_to_swap(page, page_list)) goto activate_locked; may_enter_fs = 1; - } - mapping = page_mapping(page); + /* Adding to swap updated mapping */ + mapping = page_mapping(page); + } /* * The page is mapped into the page tables of one or more * processes. Try to unmap it here. */ if (page_mapped(page) && mapping) { - switch (try_to_unmap(page, ttu_flags)) { + switch (try_to_unmap(page, + ttu_flags|TTU_BATCH_FLUSH)) { case SWAP_FAIL: goto activate_locked; case SWAP_AGAIN: @@ -802,16 +1082,14 @@ } if (PageDirty(page)) { - nr_dirty++; - /* * Only kswapd can writeback filesystem pages to - * avoid risk of stack overflow but do not writeback - * unless under significant pressure. + * avoid risk of stack overflow but only writeback + * if many dirty pages have been encountered. */ if (page_is_file_cache(page) && (!current_is_kswapd() || - sc->priority >= DEF_PRIORITY - 2)) { + !test_bit(ZONE_DIRTY, &zone->flags))) { /* * Immediately reclaim when written back. * Similar in principal to deactivate_page() @@ -831,10 +1109,14 @@ if (!sc->may_writepage) goto keep_locked; - /* Page is dirty, try to write it out here */ + /* + * Page is dirty. Flush the TLB if a writable entry + * potentially exists to avoid CPU writes after IO + * starts and then write it out here. + */ + try_to_unmap_flush_dirty(); switch (pageout(page, mapping, sc)) { case PAGE_KEEP: - nr_congested++; goto keep_locked; case PAGE_ACTIVATE: goto activate_locked; @@ -900,7 +1182,7 @@ } } - if (!mapping || !__remove_mapping(mapping, page)) + if (!mapping || !__remove_mapping(mapping, page, true)) goto keep_locked; /* @@ -932,32 +1214,28 @@ /* Not a candidate for swapping, so reclaim swap space. */ if (PageSwapCache(page) && vm_swap_full()) try_to_free_swap(page); - VM_BUG_ON(PageActive(page)); + VM_BUG_ON_PAGE(PageActive(page), page); SetPageActive(page); pgactivate++; keep_locked: unlock_page(page); keep: list_add(&page->lru, &ret_pages); - VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); + VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); } - /* - * Tag a zone as congested if all the dirty pages encountered were - * backed by a congested BDI. In this case, reclaimers should just - * back off and wait for congestion to clear because further reclaim - * will encounter the same problem - */ - if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc)) - zone_set_flag(zone, ZONE_CONGESTED); - - free_hot_cold_page_list(&free_pages, 1); + mem_cgroup_uncharge_list(&free_pages); + try_to_unmap_flush(); + free_hot_cold_page_list(&free_pages, true); list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); - mem_cgroup_uncharge_end(); + *ret_nr_dirty += nr_dirty; + *ret_nr_congested += nr_congested; + *ret_nr_unqueued_dirty += nr_unqueued_dirty; *ret_nr_writeback += nr_writeback; + *ret_nr_immediate += nr_immediate; return nr_reclaimed; } @@ -969,7 +1247,7 @@ .priority = DEF_PRIORITY, .may_unmap = 1, }; - unsigned long ret, dummy1, dummy2; + unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5; struct page *page, *next; LIST_HEAD(clean_pages); @@ -982,10 +1260,10 @@ } ret = shrink_page_list(&clean_pages, zone, &sc, - TTU_UNMAP|TTU_IGNORE_ACCESS, - &dummy1, &dummy2, true); + TTU_UNMAP|TTU_IGNORE_ACCESS, + &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); list_splice(&clean_pages, page_list); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); + mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); return ret; } @@ -1013,6 +1291,40 @@ ret = -EBUSY; +#if defined(CONFIG_AVM_LOW_MEMORY_STRATEGY) + /* + * ACHTUNG: + * + * Dynamischer Speicherfreigabe Hack zum Verhindern der Freigabe mehrfach + * belegter Blöcke + */ + { + static unsigned int busy_level = 1 << 2; + const unsigned int MAX_BUSY_LEVEL = 10 << 2; + const unsigned int MIN_BUSY_LEVEL = 1 << 2; + + if (unlikely(atomic_read(&(page->_count)) > (busy_level >> 2))) { + /* + * wir haben einen Block nicht freigegeben. Der countwert dieses Blockes war + * größer als busylevel/4. Der Busylevel soll etwas gesenkt werden um die + * Wahrscheinlichkeit start belegt Blöcke freizugeben zu erhöhen + */ + busy_level--; + + if (busy_level < MIN_BUSY_LEVEL) { + busy_level = MIN_BUSY_LEVEL; + } + + return ret; + } + busy_level++; + + if (busy_level > MAX_BUSY_LEVEL) { + busy_level = MAX_BUSY_LEVEL; + } + } +#endif /*--- #if defined(CONFIG_AVM_LOW_MEMORY_STRATEGY) ---*/ + /* * To minimise LRU disruption, the caller can indicate that it only * wants to isolate pages it will be able to operate on without @@ -1092,14 +1404,15 @@ unsigned long nr_taken = 0; unsigned long scan; - for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { + for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan && + !list_empty(src); scan++) { struct page *page; int nr_pages; page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); - VM_BUG_ON(!PageLRU(page)); + VM_BUG_ON_PAGE(!PageLRU(page), page); switch (__isolate_lru_page(page, mode)) { case 0: @@ -1154,7 +1467,7 @@ { int ret = -EBUSY; - VM_BUG_ON(!page_count(page)); + VM_BUG_ON_PAGE(!page_count(page), page); if (PageLRU(page)) { struct zone *zone = page_zone(page); @@ -1189,7 +1502,7 @@ if (current_is_kswapd()) return 0; - if (!global_reclaim(sc)) + if (!sane_reclaim(sc)) return 0; if (file) { @@ -1205,7 +1518,7 @@ * won't get blocked by normal direct-reclaimers, forming a circular * deadlock. */ - if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS) + if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) inactive >>= 3; return isolated > inactive; @@ -1225,7 +1538,7 @@ struct page *page = lru_to_page(page_list); int lru; - VM_BUG_ON(PageLRU(page)); + VM_BUG_ON_PAGE(PageLRU(page), page); list_del(&page->lru); if (unlikely(!page_evictable(page))) { spin_unlock_irq(&zone->lru_lock); @@ -1252,6 +1565,7 @@ if (unlikely(PageCompound(page))) { spin_unlock_irq(&zone->lru_lock); + mem_cgroup_uncharge(page); (*get_compound_page_dtor(page))(page); spin_lock_irq(&zone->lru_lock); } else @@ -1266,6 +1580,19 @@ } /* + * If a kernel thread (such as nfsd for loop-back mounts) services + * a backing device by writing to the page cache it sets PF_LESS_THROTTLE. + * In that case we should only throttle if the backing device it is + * writing to is congested. In other cases it is safe to throttle. + */ +static int current_may_throttle(void) +{ + return !(current->flags & PF_LESS_THROTTLE) || + current->backing_dev_info == NULL || + bdi_write_congested(current->backing_dev_info); +} + +/* * shrink_inactive_list() is a helper for shrink_zone(). It returns the number * of reclaimed pages */ @@ -1278,7 +1605,10 @@ unsigned long nr_reclaimed = 0; unsigned long nr_taken; unsigned long nr_dirty = 0; + unsigned long nr_congested = 0; + unsigned long nr_unqueued_dirty = 0; unsigned long nr_writeback = 0; + unsigned long nr_immediate = 0; isolate_mode_t isolate_mode = 0; int file = is_file_lru(lru); struct zone *zone = lruvec_zone(lruvec); @@ -1308,7 +1638,7 @@ __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); if (global_reclaim(sc)) { - zone->pages_scanned += nr_scanned; + __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); if (current_is_kswapd()) __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scanned); else @@ -1320,7 +1650,9 @@ return 0; nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, - &nr_dirty, &nr_writeback, false); + &nr_dirty, &nr_unqueued_dirty, &nr_congested, + &nr_writeback, &nr_immediate, + false); spin_lock_irq(&zone->lru_lock); @@ -1341,7 +1673,8 @@ spin_unlock_irq(&zone->lru_lock); - free_hot_cold_page_list(&page_list, 1); + mem_cgroup_uncharge_list(&page_list); + free_hot_cold_page_list(&page_list, true); /* * If reclaim is isolating dirty pages under writeback, it implies @@ -1353,21 +1686,51 @@ * as there is no guarantee the dirtying process is throttled in the * same way balance_dirty_pages() manages. * - * This scales the number of dirty pages that must be under writeback - * before throttling depending on priority. It is a simple backoff - * function that has the most effect in the range DEF_PRIORITY to - * DEF_PRIORITY-2 which is the priority reclaim is considered to be - * in trouble and reclaim is considered to be in trouble. - * - * DEF_PRIORITY 100% isolated pages must be PageWriteback to throttle - * DEF_PRIORITY-1 50% must be PageWriteback - * DEF_PRIORITY-2 25% must be PageWriteback, kswapd in trouble - * ... - * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any - * isolated page is PageWriteback + * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number + * of pages under pages flagged for immediate reclaim and stall if any + * are encountered in the nr_immediate check below. */ - if (nr_writeback && nr_writeback >= - (nr_taken >> (DEF_PRIORITY - sc->priority))) + if (nr_writeback && nr_writeback == nr_taken) + set_bit(ZONE_WRITEBACK, &zone->flags); + + /* + * Legacy memcg will stall in page writeback so avoid forcibly + * stalling here. + */ + if (sane_reclaim(sc)) { + /* + * Tag a zone as congested if all the dirty pages scanned were + * backed by a congested BDI and wait_iff_congested will stall. + */ + if (nr_dirty && nr_dirty == nr_congested) + set_bit(ZONE_CONGESTED, &zone->flags); + + /* + * If dirty pages are scanned that are not queued for IO, it + * implies that flushers are not keeping up. In this case, flag + * the zone ZONE_DIRTY and kswapd will start writing pages from + * reclaim context. + */ + if (nr_unqueued_dirty == nr_taken) + set_bit(ZONE_DIRTY, &zone->flags); + + /* + * If kswapd scans pages marked marked for immediate + * reclaim and under writeback (nr_immediate), it implies + * that pages are cycling through the LRU faster than + * they are written so also forcibly stall. + */ + if (nr_immediate && current_may_throttle()) + congestion_wait(BLK_RW_ASYNC, HZ/10); + } + + /* + * Stall direct reclaim for IO completions if underlying BDIs or zone + * is congested. Allow kswapd to continue until it starts encountering + * unqueued dirty pages or cycling through the LRU too quickly. + */ + if (!sc->hibernation_mode && !current_is_kswapd() && + current_may_throttle()) wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, @@ -1410,7 +1773,7 @@ page = lru_to_page(list); lruvec = mem_cgroup_page_lruvec(page, zone); - VM_BUG_ON(PageLRU(page)); + VM_BUG_ON_PAGE(PageLRU(page), page); SetPageLRU(page); nr_pages = hpage_nr_pages(page); @@ -1425,6 +1788,7 @@ if (unlikely(PageCompound(page))) { spin_unlock_irq(&zone->lru_lock); + mem_cgroup_uncharge(page); (*get_compound_page_dtor(page))(page); spin_lock_irq(&zone->lru_lock); } else @@ -1466,7 +1830,7 @@ nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, isolate_mode, lru); if (global_reclaim(sc)) - zone->pages_scanned += nr_scanned; + __mod_zone_page_state(zone, NR_PAGES_SCANNED, nr_scanned); reclaim_stat->recent_scanned[file] += nr_taken; @@ -1523,7 +1887,7 @@ * Count referenced pages from currently used mappings as rotated, * even though only some of them are actually re-activated. This * helps balance scan pressure between file and anonymous pages in - * get_scan_ratio. + * get_scan_count. */ reclaim_stat->recent_rotated[file] += nr_rotated; @@ -1532,21 +1896,19 @@ __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&zone->lru_lock); - free_hot_cold_page_list(&l_hold, 1); + mem_cgroup_uncharge_list(&l_hold); + free_hot_cold_page_list(&l_hold, true); } #ifdef CONFIG_SWAP -static int inactive_anon_is_low_global(struct zone *zone) +static bool inactive_anon_is_low_global(struct zone *zone) { unsigned long active, inactive; active = zone_page_state(zone, NR_ACTIVE_ANON); inactive = zone_page_state(zone, NR_INACTIVE_ANON); - if (inactive * zone->inactive_ratio < active) - return 1; - - return 0; + return inactive * zone->inactive_ratio < active; } /** @@ -1556,14 +1918,14 @@ * Returns true if the zone does not have enough inactive anon pages, * meaning some active anon pages need to be deactivated. */ -static int inactive_anon_is_low(struct lruvec *lruvec) +static bool inactive_anon_is_low(struct lruvec *lruvec) { /* * If we don't have swap space, anonymous page deactivation * is pointless. */ if (!total_swap_pages) - return 0; + return false; if (!mem_cgroup_disabled()) return mem_cgroup_inactive_anon_is_low(lruvec); @@ -1571,9 +1933,9 @@ return inactive_anon_is_low_global(lruvec_zone(lruvec)); } #else -static inline int inactive_anon_is_low(struct lruvec *lruvec) +static inline bool inactive_anon_is_low(struct lruvec *lruvec) { - return 0; + return false; } #endif @@ -1591,7 +1953,7 @@ * This uses a different ratio than the anonymous pages, because * the page cache uses a use-once replacement algorithm. */ -static int inactive_file_is_low(struct lruvec *lruvec) +static bool inactive_file_is_low(struct lruvec *lruvec) { unsigned long inactive; unsigned long active; @@ -1599,10 +1961,19 @@ inactive = get_lru_size(lruvec, LRU_INACTIVE_FILE); active = get_lru_size(lruvec, LRU_ACTIVE_FILE); - return active > inactive; +#if defined(CONFIG_AVM_LOW_MEMORY_STRATEGY) + /*--------------------------------------------------------------------------------------*\ + * Hier wird das Verhältniss zwischen activen und inactiven Blöcken geregelt + * Da es sehr teuer ist einen aktiven Block wieder zu rekonstruieren wird das Verhältnis + * nicht 1:1 sondern 1:2 gehalten. + \*--------------------------------------------------------------------------------------*/ + return ((active >> 1) > inactive); +#else + return (active > inactive); +#endif } -static int inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) +static bool inactive_list_is_low(struct lruvec *lruvec, enum lru_list lru) { if (is_file_lru(lru)) return inactive_file_is_low(lruvec); @@ -1622,13 +1993,6 @@ return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); } -static int vmscan_swappiness(struct scan_control *sc) -{ - if (global_reclaim(sc)) - return vm_swappiness; - return mem_cgroup_swappiness(sc->target_mem_cgroup); -} - enum scan_balance { SCAN_EQUAL, SCAN_FRACT, @@ -1645,8 +2009,9 @@ * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan */ -static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, - unsigned long *nr) +static void get_scan_count(struct lruvec *lruvec, int swappiness, + struct scan_control *sc, unsigned long *nr, + unsigned long *lru_pages) { struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat; u64 fraction[2]; @@ -1654,10 +2019,12 @@ struct zone *zone = lruvec_zone(lruvec); unsigned long anon_prio, file_prio; enum scan_balance scan_balance; - unsigned long anon, file, free; + unsigned long anon, file; bool force_scan = false; unsigned long ap, fp; enum lru_list lru; + bool some_scanned; + int pass; /* * If the zone or memcg is small, nr[l] can be 0. This @@ -1669,8 +2036,12 @@ * latencies, so it's better to scan a minimum amount there as * well. */ - if (current_is_kswapd() && zone->all_unreclaimable) - force_scan = true; + if (current_is_kswapd()) { + if (!zone_reclaimable(zone)) + force_scan = true; + if (!mem_cgroup_lruvec_online(lruvec)) + force_scan = true; + } if (!global_reclaim(sc)) force_scan = true; @@ -1687,7 +2058,7 @@ * using the memory controller's swap limit feature would be * too expensive. */ - if (!global_reclaim(sc) && !vmscan_swappiness(sc)) { + if (!global_reclaim(sc) && !swappiness) { scan_balance = SCAN_FILE; goto out; } @@ -1697,25 +2068,29 @@ * system is close to OOM, scan both anon and file equally * (unless the swappiness setting disagrees with swapping). */ - if (!sc->priority && vmscan_swappiness(sc)) { + if (!sc->priority && swappiness) { scan_balance = SCAN_EQUAL; goto out; } - anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + - get_lru_size(lruvec, LRU_INACTIVE_ANON); - file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + - get_lru_size(lruvec, LRU_INACTIVE_FILE); - /* - * If it's foreseeable that reclaiming the file cache won't be - * enough to get the zone back into a desirable shape, we have - * to swap. Better start now and leave the - probably heavily - * thrashing - remaining file pages alone. + * Prevent the reclaimer from falling into the cache trap: as + * cache pages start out inactive, every cache fault will tip + * the scan balance towards the file LRU. And as the file LRU + * shrinks, so does the window for rotation from references. + * This means we have a runaway feedback loop where a tiny + * thrashing file LRU becomes infinitely more attractive than + * anon pages. Try to detect this based on file LRU size. */ if (global_reclaim(sc)) { - free = zone_page_state(zone, NR_FREE_PAGES); - if (unlikely(file + free <= high_wmark_pages(zone))) { + unsigned long zonefile; + unsigned long zonefree; + + zonefree = zone_page_state(zone, NR_FREE_PAGES); + zonefile = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_FILE); + + if (unlikely(zonefile + zonefree <= high_wmark_pages(zone))) { scan_balance = SCAN_ANON; goto out; } @@ -1736,7 +2111,7 @@ * With swappiness at 100, anonymous and file have the same priority. * This scanning priority is essentially the inverse of IO cost. */ - anon_prio = vmscan_swappiness(sc); + anon_prio = swappiness; file_prio = 200 - anon_prio; /* @@ -1750,6 +2125,12 @@ * * anon in [0], file in [1] */ + + anon = get_lru_size(lruvec, LRU_ACTIVE_ANON) + + get_lru_size(lruvec, LRU_INACTIVE_ANON); + file = get_lru_size(lruvec, LRU_ACTIVE_FILE) + + get_lru_size(lruvec, LRU_INACTIVE_FILE); + spin_lock_irq(&zone->lru_lock); if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { reclaim_stat->recent_scanned[0] /= 2; @@ -1777,59 +2158,98 @@ fraction[1] = fp; denominator = ap + fp + 1; out: - for_each_evictable_lru(lru) { - int file = is_file_lru(lru); - unsigned long size; - unsigned long scan; - - size = get_lru_size(lruvec, lru); - scan = size >> sc->priority; - - if (!scan && force_scan) - scan = min(size, SWAP_CLUSTER_MAX); - - switch (scan_balance) { - case SCAN_EQUAL: - /* Scan lists relative to size */ - break; - case SCAN_FRACT: + some_scanned = false; + /* Only use force_scan on second pass. */ + for (pass = 0; !some_scanned && pass < 2; pass++) { + *lru_pages = 0; + for_each_evictable_lru(lru) { + int file = is_file_lru(lru); + unsigned long size; + unsigned long scan; + + size = get_lru_size(lruvec, lru); + scan = size >> sc->priority; + + if (!scan && pass && force_scan) + scan = min(size, SWAP_CLUSTER_MAX); + + switch (scan_balance) { + case SCAN_EQUAL: + /* Scan lists relative to size */ + break; + case SCAN_FRACT: + /* + * Scan types proportional to swappiness and + * their relative recent reclaim efficiency. + */ + scan = div64_u64(scan * fraction[file], + denominator); + break; + case SCAN_FILE: + case SCAN_ANON: + /* Scan one type exclusively */ + if ((scan_balance == SCAN_FILE) != file) { + size = 0; + scan = 0; + } + break; + default: + /* Look ma, no brain */ + BUG(); + } + + *lru_pages += size; + nr[lru] = scan; + /* - * Scan types proportional to swappiness and - * their relative recent reclaim efficiency. + * Skip the second pass and don't force_scan, + * if we found something to scan. */ - scan = div64_u64(scan * fraction[file], denominator); - break; - case SCAN_FILE: - case SCAN_ANON: - /* Scan one type exclusively */ - if ((scan_balance == SCAN_FILE) != file) - scan = 0; - break; - default: - /* Look ma, no brain */ - BUG(); + some_scanned |= !!scan; } - nr[lru] = scan; } } /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ -static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +static void shrink_lruvec(struct lruvec *lruvec, int swappiness, + struct scan_control *sc, unsigned long *lru_pages) { unsigned long nr[NR_LRU_LISTS]; + unsigned long targets[NR_LRU_LISTS]; unsigned long nr_to_scan; enum lru_list lru; unsigned long nr_reclaimed = 0; unsigned long nr_to_reclaim = sc->nr_to_reclaim; struct blk_plug plug; + bool scan_adjusted; - get_scan_count(lruvec, sc, nr); + get_scan_count(lruvec, swappiness, sc, nr, lru_pages); + + /* Record the original scan target for proportional adjustments later */ + memcpy(targets, nr, sizeof(nr)); + + /* + * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal + * event that can occur when there is little memory pressure e.g. + * multiple streaming readers/writers. Hence, we do not abort scanning + * when the requested number of pages are reclaimed when scanning at + * DEF_PRIORITY on the assumption that the fact we are direct + * reclaiming implies that kswapd is not keeping up and it is best to + * do a batch of work at once. For memcg reclaim one check is made to + * abort proportional reclaim if either the file or anon lru has already + * dropped to zero at the first pass. + */ + scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && + sc->priority == DEF_PRIORITY); blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { + unsigned long nr_anon, nr_file, percentage; + unsigned long nr_scanned; + for_each_evictable_lru(lru) { if (nr[lru]) { nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); @@ -1839,17 +2259,60 @@ lruvec, sc); } } + + if (nr_reclaimed < nr_to_reclaim || scan_adjusted) + continue; + /* - * On large memory systems, scan >> priority can become - * really large. This is fine for the starting priority; - * we want to put equal scanning pressure on each zone. - * However, if the VM has a harder time of freeing pages, - * with multiple processes reclaiming pages, the total - * freeing target can get unreasonably large. + * For kswapd and memcg, reclaim at least the number of pages + * requested. Ensure that the anon and file LRUs are scanned + * proportionally what was requested by get_scan_count(). We + * stop reclaiming one LRU and reduce the amount scanning + * proportional to the original scan target. + */ + nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; + nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; + + /* + * It's just vindictive to attack the larger once the smaller + * has gone to zero. And given the way we stop scanning the + * smaller below, this makes sure that we only make one nudge + * towards proportionality once we've got nr_to_reclaim. */ - if (nr_reclaimed >= nr_to_reclaim && - sc->priority < DEF_PRIORITY) + if (!nr_file || !nr_anon) break; + + if (nr_file > nr_anon) { + unsigned long scan_target = targets[LRU_INACTIVE_ANON] + + targets[LRU_ACTIVE_ANON] + 1; + lru = LRU_BASE; + percentage = nr_anon * 100 / scan_target; + } else { + unsigned long scan_target = targets[LRU_INACTIVE_FILE] + + targets[LRU_ACTIVE_FILE] + 1; + lru = LRU_FILE; + percentage = nr_file * 100 / scan_target; + } + + /* Stop scanning the smaller of the LRU */ + nr[lru] = 0; + nr[lru + LRU_ACTIVE] = 0; + + /* + * Recalculate the other LRU scan count based on its original + * scan target and the percentage scanning already complete + */ + lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE; + nr_scanned = targets[lru] - nr[lru]; + nr[lru] = targets[lru] * (100 - percentage) / 100; + nr[lru] -= min(nr[lru], nr_scanned); + + lru += LRU_ACTIVE; + nr_scanned = targets[lru] - nr[lru]; + nr[lru] = targets[lru] * (100 - percentage) / 100; + nr[lru] -= min(nr[lru], nr_scanned); + + scan_adjusted = true; } blk_finish_plug(&plug); sc->nr_reclaimed += nr_reclaimed; @@ -1931,7 +2394,7 @@ return true; /* If compaction would go ahead or the allocation would succeed, stop */ - switch (compaction_suitable(zone, sc->order)) { + switch (compaction_suitable(zone, sc->order, 0, 0)) { case COMPACT_PARTIAL: case COMPACT_CONTINUE: return false; @@ -1940,9 +2403,12 @@ } } -static void shrink_zone(struct zone *zone, struct scan_control *sc) +static bool shrink_zone(struct zone *zone, struct scan_control *sc, + bool is_classzone) { + struct reclaim_state *reclaim_state = current->reclaim_state; unsigned long nr_reclaimed, nr_scanned; + bool reclaimable = false; do { struct mem_cgroup *root = sc->target_mem_cgroup; @@ -1950,6 +2416,7 @@ .zone = zone, .priority = sc->priority, }; + unsigned long zone_lru_pages = 0; struct mem_cgroup *memcg; nr_reclaimed = sc->nr_reclaimed; @@ -1957,11 +2424,28 @@ memcg = mem_cgroup_iter(root, NULL, &reclaim); do { + unsigned long lru_pages; + unsigned long scanned; struct lruvec *lruvec; + int swappiness; + + if (mem_cgroup_low(root, memcg)) { + if (!sc->may_thrash) + continue; + mem_cgroup_events(memcg, MEMCG_LOW, 1); + } lruvec = mem_cgroup_zone_lruvec(zone, memcg); + swappiness = mem_cgroup_swappiness(memcg); + scanned = sc->nr_scanned; + + shrink_lruvec(lruvec, swappiness, sc, &lru_pages); + zone_lru_pages += lru_pages; - shrink_lruvec(lruvec, sc); + if (memcg && is_classzone) + shrink_slab(sc->gfp_mask, zone_to_nid(zone), + memcg, sc->nr_scanned - scanned, + lru_pages); /* * Direct reclaim and kswapd have to scan all memory @@ -1978,48 +2462,67 @@ mem_cgroup_iter_break(root, memcg); break; } - memcg = mem_cgroup_iter(root, memcg, &reclaim); - } while (memcg); + } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim))); + + /* + * Shrink the slab caches in the same proportion that + * the eligible LRU pages were scanned. + */ + if (global_reclaim(sc) && is_classzone) + shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL, + sc->nr_scanned - nr_scanned, + zone_lru_pages); + + if (reclaim_state) { + sc->nr_reclaimed += reclaim_state->reclaimed_slab; + reclaim_state->reclaimed_slab = 0; + } vmpressure(sc->gfp_mask, sc->target_mem_cgroup, sc->nr_scanned - nr_scanned, sc->nr_reclaimed - nr_reclaimed); + if (sc->nr_reclaimed - nr_reclaimed) + reclaimable = true; + } while (should_continue_reclaim(zone, sc->nr_reclaimed - nr_reclaimed, sc->nr_scanned - nr_scanned, sc)); + + return reclaimable; } -/* Returns true if compaction should go ahead for a high-order request */ -static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) +/* + * Returns true if compaction should go ahead for a high-order request, or + * the high-order allocation would succeed without compaction. + */ +static inline bool compaction_ready(struct zone *zone, int order) { unsigned long balance_gap, watermark; bool watermark_ok; - /* Do not consider compaction for orders reclaim is meant to satisfy */ - if (sc->order <= PAGE_ALLOC_COSTLY_ORDER) - return false; - /* * Compaction takes time to run and there are potentially other * callers using the pages just freed. Continue reclaiming until * there is a buffer of free pages available to give compaction * a reasonable chance of completing and allocating the page */ - balance_gap = min(low_wmark_pages(zone), - (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / - KSWAPD_ZONE_BALANCE_GAP_RATIO); - watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); - watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); + balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( + zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); + watermark = high_wmark_pages(zone) + balance_gap + (2UL << order); + watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0); /* * If compaction is deferred, reclaim up to a point where * compaction will have a chance of success when re-enabled */ - if (compaction_deferred(zone, sc->order)) + if (compaction_deferred(zone, order)) return watermark_ok; - /* If compaction is not ready to start, keep reclaiming */ - if (!compaction_suitable(zone, sc->order)) + /* + * If compaction is not ready to start and allocation is not likely + * to succeed without it, then keep reclaiming. + */ + if (compaction_suitable(zone, order, 0, 0) == COMPACT_SKIPPED) return false; return watermark_ok; @@ -2041,10 +2544,7 @@ * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. * - * This function returns true if a zone is being reclaimed for a costly - * high-order allocation and compaction is ready to begin. This indicates to - * the caller that it should consider retrying the allocation instead of - * further reclaim. + * Returns true if a zone was reclaimable. */ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) { @@ -2052,45 +2552,61 @@ struct zone *zone; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; - bool aborted_reclaim = false; + gfp_t orig_mask; + enum zone_type requested_highidx = gfp_zone(sc->gfp_mask); + bool reclaimable = false; /* * If the number of buffer_heads in the machine exceeds the maximum * allowed level, force direct reclaim to scan the highmem zone as * highmem pages could be pinning lowmem pages storing buffer_heads */ + orig_mask = sc->gfp_mask; if (buffer_heads_over_limit) sc->gfp_mask |= __GFP_HIGHMEM; for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(sc->gfp_mask), sc->nodemask) { + enum zone_type classzone_idx; + if (!populated_zone(zone)) continue; + + classzone_idx = requested_highidx; + while (!populated_zone(zone->zone_pgdat->node_zones + + classzone_idx)) + classzone_idx--; + /* * Take care memory controller reclaiming has small influence * to global LRU. */ if (global_reclaim(sc)) { - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) + if (!cpuset_zone_allowed(zone, + GFP_KERNEL | __GFP_HARDWALL)) continue; - if (zone->all_unreclaimable && - sc->priority != DEF_PRIORITY) + + if (sc->priority != DEF_PRIORITY && + !zone_reclaimable(zone)) continue; /* Let kswapd poll it */ - if (IS_ENABLED(CONFIG_COMPACTION)) { - /* - * If we already have plenty of memory free for - * compaction in this zone, don't free any more. - * Even though compaction is invoked for any - * non-zero order, only frequent costly order - * reclamation is disruptive enough to become a - * noticeable problem, like transparent huge - * page allocations. - */ - if (compaction_ready(zone, sc)) { - aborted_reclaim = true; - continue; - } + + /* + * If we already have plenty of memory free for + * compaction in this zone, don't free any more. + * Even though compaction is invoked for any + * non-zero order, only frequent costly order + * reclamation is disruptive enough to become a + * noticeable problem, like transparent huge + * page allocations. + */ + if (IS_ENABLED(CONFIG_COMPACTION) && + sc->order > PAGE_ALLOC_COSTLY_ORDER && + zonelist_zone_idx(z) <= requested_highidx && + compaction_ready(zone, sc->order)) { + sc->compaction_ready = true; + continue; } + /* * This steals pages from memory cgroups over softlimit * and returns the number of reclaimed pages and @@ -2103,52 +2619,26 @@ &nr_soft_scanned); sc->nr_reclaimed += nr_soft_reclaimed; sc->nr_scanned += nr_soft_scanned; + if (nr_soft_reclaimed) + reclaimable = true; /* need some check for avoid more shrink_zone() */ } - shrink_zone(zone, sc); - } + if (shrink_zone(zone, sc, zone_idx(zone) == classzone_idx)) + reclaimable = true; - return aborted_reclaim; -} - -static unsigned long zone_reclaimable_pages(struct zone *zone) -{ - int nr; - - nr = zone_page_state(zone, NR_ACTIVE_FILE) + - zone_page_state(zone, NR_INACTIVE_FILE); - - if (get_nr_swap_pages() > 0) - nr += zone_page_state(zone, NR_ACTIVE_ANON) + - zone_page_state(zone, NR_INACTIVE_ANON); - - return nr; -} - -static bool zone_reclaimable(struct zone *zone) -{ - return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; -} - -/* All zones in zonelist are unreclaimable? */ -static bool all_unreclaimable(struct zonelist *zonelist, - struct scan_control *sc) -{ - struct zoneref *z; - struct zone *zone; - - for_each_zone_zonelist_nodemask(zone, z, zonelist, - gfp_zone(sc->gfp_mask), sc->nodemask) { - if (!populated_zone(zone)) - continue; - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - continue; - if (!zone->all_unreclaimable) - return false; + if (global_reclaim(sc) && + !reclaimable && zone_reclaimable(zone)) + reclaimable = true; } - return true; + /* + * Restore to original mask to avoid the impact on the caller if we + * promoted it to __GFP_HIGHMEM. + */ + sc->gfp_mask = orig_mask; + + return reclaimable; } /* @@ -2168,16 +2658,13 @@ * else, the number of pages reclaimed */ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, - struct scan_control *sc, - struct shrink_control *shrink) + struct scan_control *sc) { + int initial_priority = sc->priority; unsigned long total_scanned = 0; - struct reclaim_state *reclaim_state = current->reclaim_state; - struct zoneref *z; - struct zone *zone; unsigned long writeback_threshold; - bool aborted_reclaim; - + bool zones_reclaimable; +retry: delayacct_freepages_start(); if (global_reclaim(sc)) @@ -2187,31 +2674,14 @@ vmpressure_prio(sc->gfp_mask, sc->target_mem_cgroup, sc->priority); sc->nr_scanned = 0; - aborted_reclaim = shrink_zones(zonelist, sc); + zones_reclaimable = shrink_zones(zonelist, sc); - /* - * Don't shrink slabs when reclaiming memory from - * over limit cgroups - */ - if (global_reclaim(sc)) { - unsigned long lru_pages = 0; - for_each_zone_zonelist(zone, z, zonelist, - gfp_zone(sc->gfp_mask)) { - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - continue; - - lru_pages += zone_reclaimable_pages(zone); - } - - shrink_slab(shrink, sc->nr_scanned, lru_pages); - if (reclaim_state) { - sc->nr_reclaimed += reclaim_state->reclaimed_slab; - reclaim_state->reclaimed_slab = 0; - } - } total_scanned += sc->nr_scanned; if (sc->nr_reclaimed >= sc->nr_to_reclaim) - goto out; + break; + + if (sc->compaction_ready) + break; /* * If we're getting trouble reclaiming, start doing @@ -2233,39 +2703,26 @@ WB_REASON_TRY_TO_FREE_PAGES); sc->may_writepage = 1; } - - /* Take a nap, wait for some writeback to complete */ - if (!sc->hibernation_mode && sc->nr_scanned && - sc->priority < DEF_PRIORITY - 2) { - struct zone *preferred_zone; - - first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), - &cpuset_current_mems_allowed, - &preferred_zone); - wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); - } } while (--sc->priority >= 0); -out: delayacct_freepages_end(); if (sc->nr_reclaimed) return sc->nr_reclaimed; - /* - * As hibernation is going on, kswapd is freezed so that it can't mark - * the zone into all_unreclaimable. Thus bypassing all_unreclaimable - * check. - */ - if (oom_killer_disabled) - return 0; - /* Aborted reclaim to try compaction? don't OOM, then */ - if (aborted_reclaim) + if (sc->compaction_ready) return 1; - /* top priority shrink_zones still had more to do? don't OOM, then */ - if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc)) + /* Untapped cgroup reserves? Don't OOM, retry. */ + if (!sc->may_thrash) { + sc->priority = initial_priority; + sc->may_thrash = 1; + goto retry; + } + + /* Any of the zones still reclaimable? Don't OOM. */ + if (zones_reclaimable) return 1; return 0; @@ -2281,7 +2738,8 @@ for (i = 0; i <= ZONE_NORMAL; i++) { zone = &pgdat->node_zones[i]; - if (!populated_zone(zone)) + if (!populated_zone(zone) || + zone_reclaimable_pages(zone) == 0) continue; pfmemalloc_reserve += min_wmark_pages(zone); @@ -2352,7 +2810,7 @@ * should make reasonable progress. */ for_each_zone_zonelist_nodemask(zone, z, zonelist, - gfp_mask, nodemask) { + gfp_zone(gfp_mask), nodemask) { if (zone_idx(zone) > ZONE_NORMAL) continue; @@ -2402,18 +2860,14 @@ { unsigned long nr_reclaimed; struct scan_control sc = { + .nr_to_reclaim = SWAP_CLUSTER_MAX, .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), + .order = order, + .nodemask = nodemask, + .priority = DEF_PRIORITY, .may_writepage = !laptop_mode, - .nr_to_reclaim = SWAP_CLUSTER_MAX, .may_unmap = 1, .may_swap = 1, - .order = order, - .priority = DEF_PRIORITY, - .target_mem_cgroup = NULL, - .nodemask = nodemask, - }; - struct shrink_control shrink = { - .gfp_mask = sc.gfp_mask, }; /* @@ -2428,7 +2882,7 @@ sc.may_writepage, gfp_mask); - nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); @@ -2443,16 +2897,15 @@ unsigned long *nr_scanned) { struct scan_control sc = { - .nr_scanned = 0, .nr_to_reclaim = SWAP_CLUSTER_MAX, + .target_mem_cgroup = memcg, .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = !noswap, - .order = 0, - .priority = 0, - .target_mem_cgroup = memcg, }; struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg); + int swappiness = mem_cgroup_swappiness(memcg); + unsigned long lru_pages; sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); @@ -2468,7 +2921,7 @@ * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. */ - shrink_lruvec(lruvec, &sc); + shrink_lruvec(lruvec, swappiness, &sc, &lru_pages); trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); @@ -2477,26 +2930,22 @@ } unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, + unsigned long nr_pages, gfp_t gfp_mask, - bool noswap) + bool may_swap) { struct zonelist *zonelist; unsigned long nr_reclaimed; int nid; struct scan_control sc = { - .may_writepage = !laptop_mode, - .may_unmap = 1, - .may_swap = !noswap, - .nr_to_reclaim = SWAP_CLUSTER_MAX, - .order = 0, - .priority = DEF_PRIORITY, - .target_mem_cgroup = memcg, - .nodemask = NULL, /* we don't care the placement */ + .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), - }; - struct shrink_control shrink = { - .gfp_mask = sc.gfp_mask, + .target_mem_cgroup = memcg, + .priority = DEF_PRIORITY, + .may_writepage = !laptop_mode, + .may_unmap = 1, + .may_swap = may_swap, }; /* @@ -2512,7 +2961,9 @@ sc.may_writepage, sc.gfp_mask); - nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); + current->flags |= PF_MEMALLOC; + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + current->flags &= ~PF_MEMALLOC; trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); @@ -2543,11 +2994,11 @@ unsigned long balance_gap, int classzone_idx) { if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone) + - balance_gap, classzone_idx, 0)) + balance_gap, classzone_idx)) return false; - if (IS_ENABLED(CONFIG_COMPACTION) && order && - !compaction_suitable(zone, order)) + if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone, + order, 0, classzone_idx) == COMPACT_SKIPPED) return false; return true; @@ -2595,7 +3046,7 @@ * DEF_PRIORITY. Effectively, it considers them balanced so * they must be considered balanced here as well! */ - if (zone->all_unreclaimable) { + if (!zone_reclaimable(zone)) { balanced_pages += zone->managed_pages; continue; } @@ -2645,6 +3096,77 @@ } /* + * kswapd shrinks the zone by the number of pages required to reach + * the high watermark. + * + * Returns true if kswapd scanned at least the requested number of pages to + * reclaim or if the lack of progress was due to pages under writeback. + * This is used to determine if the scanning priority needs to be raised. + */ +static bool kswapd_shrink_zone(struct zone *zone, + int classzone_idx, + struct scan_control *sc, + unsigned long *nr_attempted) +{ + int testorder = sc->order; + unsigned long balance_gap; + bool lowmem_pressure; + + /* Reclaim above the high watermark. */ + sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone)); + + /* + * Kswapd reclaims only single pages with compaction enabled. Trying + * too hard to reclaim until contiguous free pages have become + * available can hurt performance by evicting too much useful data + * from memory. Do not reclaim more than needed for compaction. + */ + if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && + compaction_suitable(zone, sc->order, 0, classzone_idx) + != COMPACT_SKIPPED) + testorder = 0; + + /* + * We put equal pressure on every zone, unless one zone has way too + * many pages free already. The "too many pages" is defined as the + * high wmark plus a "gap" where the gap is either the low + * watermark or 1% of the zone, whichever is smaller. + */ + balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( + zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); + + /* + * If there is no low memory pressure or the zone is balanced then no + * reclaim is necessary + */ + lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone)); + if (!lowmem_pressure && zone_balanced(zone, testorder, + balance_gap, classzone_idx)) + return true; + + shrink_zone(zone, sc, zone_idx(zone) == classzone_idx); + + /* Account for the number of pages attempted to reclaim */ + *nr_attempted += sc->nr_to_reclaim; + + clear_bit(ZONE_WRITEBACK, &zone->flags); + + /* + * If a zone reaches its high watermark, consider it to be no longer + * congested. It's possible there are dirty pages backed by congested + * BDIs but as pressure is relieved, speculatively avoid congestion + * waits. + */ + if (zone_reclaimable(zone) && + zone_balanced(zone, testorder, 0, classzone_idx)) { + clear_bit(ZONE_CONGESTED, &zone->flags); + clear_bit(ZONE_DIRTY, &zone->flags); + } + + return sc->nr_scanned >= sc->nr_to_reclaim; +} + +/* * For kswapd, balance_pgdat() will work across all this node's zones until * they are all at high_wmark_pages(zone). * @@ -2668,35 +3190,26 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, int *classzone_idx) { - bool pgdat_is_balanced = false; int i; int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ - struct reclaim_state *reclaim_state = current->reclaim_state; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; struct scan_control sc = { .gfp_mask = GFP_KERNEL, + .order = order, + .priority = DEF_PRIORITY, + .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = 1, - /* - * kswapd doesn't want to be bailed out while reclaim. because - * we want to put equal scanning pressure on each zone. - */ - .nr_to_reclaim = ULONG_MAX, - .order = order, - .target_mem_cgroup = NULL, - }; - struct shrink_control shrink = { - .gfp_mask = sc.gfp_mask, }; -loop_again: - sc.priority = DEF_PRIORITY; - sc.nr_reclaimed = 0; - sc.may_writepage = !laptop_mode; count_vm_event(PAGEOUTRUN); do { - unsigned long lru_pages = 0; + unsigned long nr_attempted = 0; + bool raise_priority = true; + bool pgdat_needs_compaction = (order > 0); + + sc.nr_reclaimed = 0; /* * Scan in the highmem->dma direction for the highest @@ -2708,8 +3221,8 @@ if (!populated_zone(zone)) continue; - if (zone->all_unreclaimable && - sc.priority != DEF_PRIORITY) + if (sc.priority != DEF_PRIORITY && + !zone_reclaimable(zone)) continue; /* @@ -2733,23 +3246,44 @@ end_zone = i; break; } else { - /* If balanced, clear the congested flag */ - zone_clear_flag(zone, ZONE_CONGESTED); + /* + * If balanced, clear the dirty and congested + * flags + */ + clear_bit(ZONE_CONGESTED, &zone->flags); + clear_bit(ZONE_DIRTY, &zone->flags); } } - if (i < 0) { - pgdat_is_balanced = true; + if (i < 0) goto out; - } for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; - lru_pages += zone_reclaimable_pages(zone); + if (!populated_zone(zone)) + continue; + + /* + * If any zone is currently balanced then kswapd will + * not call compaction as it is expected that the + * necessary pages are already available. + */ + if (pgdat_needs_compaction && + zone_watermark_ok(zone, order, + low_wmark_pages(zone), + *classzone_idx, 0)) + pgdat_needs_compaction = false; } /* + * If we're getting trouble reclaiming, start doing writepage + * even in laptop mode. + */ + if (sc.priority < DEF_PRIORITY - 2) + sc.may_writepage = 1; + + /* * Now scan the zone in the dma->highmem direction, stopping * at the last zone which needs scanning. * @@ -2760,14 +3294,12 @@ */ for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; - int nr_slab, testorder; - unsigned long balance_gap; if (!populated_zone(zone)) continue; - if (zone->all_unreclaimable && - sc.priority != DEF_PRIORITY) + if (sc.priority != DEF_PRIORITY && + !zone_reclaimable(zone)) continue; sc.nr_scanned = 0; @@ -2782,65 +3314,14 @@ sc.nr_reclaimed += nr_soft_reclaimed; /* - * We put equal pressure on every zone, unless - * one zone has way too many pages free - * already. The "too many pages" is defined - * as the high wmark plus a "gap" where the - * gap is either the low watermark or 1% - * of the zone, whichever is smaller. - */ - balance_gap = min(low_wmark_pages(zone), - (zone->managed_pages + - KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / - KSWAPD_ZONE_BALANCE_GAP_RATIO); - /* - * Kswapd reclaims only single pages with compaction - * enabled. Trying too hard to reclaim until contiguous - * free pages have become available can hurt performance - * by evicting too much useful data from memory. - * Do not reclaim more than needed for compaction. - */ - testorder = order; - if (IS_ENABLED(CONFIG_COMPACTION) && order && - compaction_suitable(zone, order) != - COMPACT_SKIPPED) - testorder = 0; - - if ((buffer_heads_over_limit && is_highmem_idx(i)) || - !zone_balanced(zone, testorder, - balance_gap, end_zone)) { - shrink_zone(zone, &sc); - - reclaim_state->reclaimed_slab = 0; - nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); - sc.nr_reclaimed += reclaim_state->reclaimed_slab; - - if (nr_slab == 0 && !zone_reclaimable(zone)) - zone->all_unreclaimable = 1; - } - - /* - * If we're getting trouble reclaiming, start doing - * writepage even in laptop mode. + * There should be no need to raise the scanning + * priority if enough pages are already being scanned + * that that high watermark would be met at 100% + * efficiency. */ - if (sc.priority < DEF_PRIORITY - 2) - sc.may_writepage = 1; - - if (zone->all_unreclaimable) { - if (end_zone && end_zone == i) - end_zone--; - continue; - } - - if (zone_balanced(zone, testorder, 0, end_zone)) - /* - * If a zone reaches its high watermark, - * consider it to be no longer congested. It's - * possible there are dirty pages backed by - * congested BDIs but as pressure is relieved, - * speculatively avoid congestion waits - */ - zone_clear_flag(zone, ZONE_CONGESTED); + if (kswapd_shrink_zone(zone, end_zone, + &sc, &nr_attempted)) + raise_priority = false; } /* @@ -2850,76 +3331,40 @@ */ if (waitqueue_active(&pgdat->pfmemalloc_wait) && pfmemalloc_watermark_ok(pgdat)) - wake_up(&pgdat->pfmemalloc_wait); - - if (pgdat_balanced(pgdat, order, *classzone_idx)) { - pgdat_is_balanced = true; - break; /* kswapd: all done */ - } + wake_up_all(&pgdat->pfmemalloc_wait); /* - * We do this so kswapd doesn't build up large priorities for - * example when it is freeing in parallel with allocators. It - * matches the direct reclaim path behaviour in terms of impact - * on zone->*_priority. + * Fragmentation may mean that the system cannot be rebalanced + * for high-order allocations in all zones. If twice the + * allocation size has been reclaimed and the zones are still + * not balanced then recheck the watermarks at order-0 to + * prevent kswapd reclaiming excessively. Assume that a + * process requested a high-order can direct reclaim/compact. */ - if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) - break; - } while (--sc.priority >= 0); - -out: - if (!pgdat_is_balanced) { - cond_resched(); + if (order && sc.nr_reclaimed >= 2UL << order) + order = sc.order = 0; - try_to_freeze(); + /* Check if kswapd should be suspending */ + if (try_to_freeze() || kthread_should_stop()) + break; /* - * Fragmentation may mean that the system cannot be - * rebalanced for high-order allocations in all zones. - * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX, - * it means the zones have been fully scanned and are still - * not balanced. For high-order allocations, there is - * little point trying all over again as kswapd may - * infinite loop. - * - * Instead, recheck all watermarks at order-0 as they - * are the most important. If watermarks are ok, kswapd will go - * back to sleep. High-order users can still perform direct - * reclaim if they wish. + * Compact if necessary and kswapd is reclaiming at least the + * high watermark number of pages as requsted */ - if (sc.nr_reclaimed < SWAP_CLUSTER_MAX) - order = sc.order = 0; - - goto loop_again; - } - - /* - * If kswapd was reclaiming at a higher order, it has the option of - * sleeping without all zones being balanced. Before it does, it must - * ensure that the watermarks for order-0 on *all* zones are met and - * that the congestion flags are cleared. The congestion flag must - * be cleared as kswapd is the only mechanism that clears the flag - * and it is potentially going to sleep here. - */ - if (order) { - int zones_need_compaction = 1; - - for (i = 0; i <= end_zone; i++) { - struct zone *zone = pgdat->node_zones + i; - - if (!populated_zone(zone)) - continue; - - /* Check if the memory needs to be defragmented. */ - if (zone_watermark_ok(zone, order, - low_wmark_pages(zone), *classzone_idx, 0)) - zones_need_compaction = 0; - } - - if (zones_need_compaction) + if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted) compact_pgdat(pgdat, order); - } + /* + * Raise priority if scanning rate is too low or there was no + * progress in reclaiming pages + */ + if (raise_priority || !sc.nr_reclaimed) + sc.priority--; + } while (sc.priority >= 1 && + !pgdat_balanced(pgdat, order, *classzone_idx)); + +out: /* * Return the order we were reclaiming at so prepare_kswapd_sleep() * makes a decision on the order we were last reclaiming at. However, @@ -3104,7 +3549,7 @@ if (!populated_zone(zone)) return; - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) + if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL)) return; pgdat = zone->zone_pgdat; if (pgdat->kswapd_max_order < order) { @@ -3113,7 +3558,7 @@ } if (!waitqueue_active(&pgdat->kswapd_wait)) return; - if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) + if (zone_balanced(zone, order, 0, 0)) return; trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); @@ -3133,17 +3578,13 @@ { struct reclaim_state reclaim_state; struct scan_control sc = { + .nr_to_reclaim = nr_to_reclaim, .gfp_mask = GFP_HIGHUSER_MOVABLE, - .may_swap = 1, - .may_unmap = 1, + .priority = DEF_PRIORITY, .may_writepage = 1, - .nr_to_reclaim = nr_to_reclaim, + .may_unmap = 1, + .may_swap = 1, .hibernation_mode = 1, - .order = 0, - .priority = DEF_PRIORITY, - }; - struct shrink_control shrink = { - .gfp_mask = sc.gfp_mask, }; struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); struct task_struct *p = current; @@ -3154,7 +3595,7 @@ reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); p->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); @@ -3213,7 +3654,7 @@ /* * Called by memory hotplug when all memory in a node is offlined. Caller must - * hold lock_memory_hotplug(). + * hold mem_hotplug_begin/end(). */ void kswapd_stop(int nid) { @@ -3250,7 +3691,7 @@ #define RECLAIM_OFF 0 #define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */ #define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */ -#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */ +#define RECLAIM_UNMAP (1<<2) /* Unmap pages during reclaim */ /* * Priority for ZONE_RECLAIM. This determines the fraction of pages @@ -3286,18 +3727,18 @@ } /* Work out how many page cache pages we can reclaim in this reclaim_mode */ -static long zone_pagecache_reclaimable(struct zone *zone) +static unsigned long zone_pagecache_reclaimable(struct zone *zone) { - long nr_pagecache_reclaimable; - long delta = 0; + unsigned long nr_pagecache_reclaimable; + unsigned long delta = 0; /* - * If RECLAIM_SWAP is set, then all file pages are considered + * If RECLAIM_UNMAP is set, then all file pages are considered * potentially reclaimable. Otherwise, we have to worry about * pages like swapcache and zone_unmapped_file_pages() provides * a better estimate */ - if (zone_reclaim_mode & RECLAIM_SWAP) + if (zone_reclaim_mode & RECLAIM_UNMAP) nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); else nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); @@ -3323,24 +3764,20 @@ struct task_struct *p = current; struct reclaim_state reclaim_state; struct scan_control sc = { - .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), - .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), - .may_swap = 1, .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), .order = order, .priority = ZONE_RECLAIM_PRIORITY, + .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), + .may_unmap = !!(zone_reclaim_mode & RECLAIM_UNMAP), + .may_swap = 1, }; - struct shrink_control shrink = { - .gfp_mask = sc.gfp_mask, - }; - unsigned long nr_slab_pages0, nr_slab_pages1; cond_resched(); /* - * We need to be able to allocate from the reserves for RECLAIM_SWAP + * We need to be able to allocate from the reserves for RECLAIM_UNMAP * and we also need to be able to write out pages for RECLAIM_WRITE - * and RECLAIM_SWAP. + * and RECLAIM_UNMAP. */ p->flags |= PF_MEMALLOC | PF_SWAPWRITE; lockdep_set_current_reclaim_state(gfp_mask); @@ -3353,45 +3790,10 @@ * priorities until we have enough memory freed. */ do { - shrink_zone(zone, &sc); + shrink_zone(zone, &sc, true); } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); } - nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); - if (nr_slab_pages0 > zone->min_slab_pages) { - /* - * shrink_slab() does not currently allow us to determine how - * many pages were freed in this zone. So we take the current - * number of slab pages and shake the slab until it is reduced - * by the same nr_pages that we used for reclaiming unmapped - * pages. - * - * Note that shrink_slab will free memory on all zones and may - * take a long time. - */ - for (;;) { - unsigned long lru_pages = zone_reclaimable_pages(zone); - - /* No reclaimable slab or very low memory pressure */ - if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages)) - break; - - /* Freed enough memory */ - nr_slab_pages1 = zone_page_state(zone, - NR_SLAB_RECLAIMABLE); - if (nr_slab_pages1 + nr_pages <= nr_slab_pages0) - break; - } - - /* - * Update nr_reclaimed by the number of slab pages we - * reclaimed from this zone. - */ - nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); - if (nr_slab_pages1 < nr_slab_pages0) - sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1; - } - p->reclaim_state = NULL; current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); lockdep_clear_current_reclaim_state(); @@ -3417,13 +3819,13 @@ zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) return ZONE_RECLAIM_FULL; - if (zone->all_unreclaimable) + if (!zone_reclaimable(zone)) return ZONE_RECLAIM_FULL; /* * Do not scan if the allocation should not be delayed. */ - if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) + if (!gfpflags_allow_blocking(gfp_mask) || (current->flags & PF_MEMALLOC)) return ZONE_RECLAIM_NOSCAN; /* @@ -3436,11 +3838,11 @@ if (node_state(node_id, N_CPU) && node_id != numa_node_id()) return ZONE_RECLAIM_NOSCAN; - if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) + if (test_and_set_bit(ZONE_RECLAIM_LOCKED, &zone->flags)) return ZONE_RECLAIM_NOSCAN; ret = __zone_reclaim(zone, gfp_mask, order); - zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); + clear_bit(ZONE_RECLAIM_LOCKED, &zone->flags); if (!ret) count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); @@ -3504,7 +3906,7 @@ if (page_evictable(page)) { enum lru_list lru = page_lru_base_type(page); - VM_BUG_ON(PageActive(page)); + VM_BUG_ON_PAGE(PageActive(page), page); ClearPageUnevictable(page); del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE); add_page_to_lru_list(page, lruvec, lru); @@ -3519,66 +3921,3 @@ } } #endif /* CONFIG_SHMEM */ - -static void warn_scan_unevictable_pages(void) -{ - printk_once(KERN_WARNING - "%s: The scan_unevictable_pages sysctl/node-interface has been " - "disabled for lack of a legitimate use case. If you have " - "one, please send an email to linux-mm@kvack.org.\n", - current->comm); -} - -/* - * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of - * all nodes' unevictable lists for evictable pages - */ -unsigned long scan_unevictable_pages; - -int scan_unevictable_handler(struct ctl_table *table, int write, - void __user *buffer, - size_t *length, loff_t *ppos) -{ - warn_scan_unevictable_pages(); - proc_doulongvec_minmax(table, write, buffer, length, ppos); - scan_unevictable_pages = 0; - return 0; -} - -#ifdef CONFIG_NUMA -/* - * per node 'scan_unevictable_pages' attribute. On demand re-scan of - * a specified node's per zone unevictable lists for evictable pages. - */ - -static ssize_t read_scan_unevictable_node(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - warn_scan_unevictable_pages(); - return sprintf(buf, "0\n"); /* always zero; should fit... */ -} - -static ssize_t write_scan_unevictable_node(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - warn_scan_unevictable_pages(); - return 1; -} - - -static DEVICE_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR, - read_scan_unevictable_node, - write_scan_unevictable_node); - -int scan_unevictable_register_node(struct node *node) -{ - return device_create_file(&node->dev, &dev_attr_scan_unevictable_pages); -} - -void scan_unevictable_unregister_node(struct node *node) -{ - device_remove_file(&node->dev, &dev_attr_scan_unevictable_pages); -} -#endif