--- zzzz-none-000/linux-3.10.107/mm/slub.c 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/mm/slub.c 2021-02-04 17:41:59.000000000 +0000 @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -114,6 +115,15 @@ * the fast path and disables lockless freelists. */ +static inline int kmem_cache_fastdebug(struct kmem_cache *s) +{ +#ifdef CONFIG_SLUB_DEBUG + return (s->flags & SLAB_STORE_USER_LITE) && !(s->flags & (SLAB_RED_ZONE | SLAB_POISON | SLAB_TRACE)); +#else + return 0; +#endif +} + static inline int kmem_cache_debug(struct kmem_cache *s) { #ifdef CONFIG_SLUB_DEBUG @@ -123,6 +133,16 @@ #endif } +static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) +{ +#ifdef CONFIG_SLUB_CPU_PARTIAL + /* we want cpu-pools also in lite-debug-Mode! */ + return kmem_cache_fastdebug(s) || !kmem_cache_debug(s); +#else + return false; +#endif +} + /* * Issues still to be resolved: * @@ -146,29 +166,19 @@ /* * Maximum number of desirable partial slabs. * The existence of more partial slabs makes kmem_cache_shrink - * sort the partial list by the number of objects in the. + * sort the partial list by the number of objects in use. */ #define MAX_PARTIAL 10 -#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \ - SLAB_POISON | SLAB_STORE_USER) +#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_STORE_USER_LITE) /* * Debugging flags that require metadata to be stored in the slab. These get * disabled when slub_debug=O is used and a cache's min order increases with * metadata. */ -#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) - -/* - * Set of flags that will prevent slab merging - */ -#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ - SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ - SLAB_FAILSLAB) - -#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ - SLAB_CACHE_DMA | SLAB_NOTRACK) +#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | \ + SLAB_STORE_USER | SLAB_STORE_USER_LITE) #define OO_SHIFT 16 #define OO_MASK ((1 << OO_SHIFT) - 1) @@ -185,37 +195,56 @@ /* * Tracking user of a slab. 
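 *
 * Two "struct track" records, one for TRACK_ALLOC and one for TRACK_FREE,
 * are kept behind every object when user tracking is enabled.  A rough
 * layout sketch, assuming the free pointer sits behind the object:
 *
 *	object | free pointer | track[TRACK_ALLOC] | track[TRACK_FREE] | pad
 *
 * With SLAB_STORE_USER_LITE only addr/cpu/pid/when are recorded, so
 * track_size() shrinks each record to offsetof(struct track, addrs[0])
 * and the stack trace array is left out of the slab layout entirely.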
*/ +#if defined(CONFIG_SLUB_AVM_ALLOC_LIST) +#define TRACK_ADDRS_COUNT 4 +#else #define TRACK_ADDRS_COUNT 16 +#endif struct track { + int cpu; /* Was running on cpu */ + int pid; /* Pid context */ + unsigned long when; /* When did the operation occur */ unsigned long addr; /* Called from address */ #ifdef CONFIG_STACKTRACE unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */ #endif - int cpu; /* Was running on cpu */ - int pid; /* Pid context */ - unsigned long when; /* When did the operation occur */ }; -enum track_item { TRACK_ALLOC, TRACK_FREE }; +static inline size_t track_size(unsigned int slab_flag) +{ + size_t len = sizeof(struct track); + +#ifdef CONFIG_STACKTRACE + if (slab_flag & SLAB_STORE_USER) { + return len; + } + /* unused stack-unwind-data for SLAB_STORE_USER_LITE */ + len = offsetof(struct track, addrs[0]); +#endif + return len; +} + +enum track_item { TRACK_ALLOC = 0, TRACK_FREE = 1}; -#ifdef CONFIG_SYSFS +#ifdef SLAB_SUPPORTS_SYSFS static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); -static void sysfs_slab_remove(struct kmem_cache *); static void memcg_propagate_slab_attrs(struct kmem_cache *s); #else static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; } static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } -static inline void sysfs_slab_remove(struct kmem_cache *s) { } - static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { } #endif static inline void stat(const struct kmem_cache *s, enum stat_item si) { #ifdef CONFIG_SLUB_STATS - __this_cpu_inc(s->cpu_slab->stat[si]); + /* + * The rmw is racy on a preemptible kernel but this is acceptable, so + * avoid this_cpu_add()'s irq-disable overhead. + */ + raw_cpu_inc(s->cpu_slab->stat[si]); #endif } @@ -223,11 +252,6 @@ * Core slab cache functions *******************************************************************/ -static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) -{ - return s->node[node]; -} - /* Verify that a pointer has an address that is valid within a slab page */ static inline int check_valid_pointer(struct kmem_cache *s, struct page *page, const void *object) @@ -278,6 +302,10 @@ for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\ __p += (__s)->size) +#define for_each_object_idx(__p, __idx, __s, __addr, __objects) \ + for (__p = (__addr), __idx = 1; __idx <= __objects;\ + __p += (__s)->size, __idx++) + /* Determine object index from a given position */ static inline int slab_index(void *p, struct kmem_cache *s, void *addr) { @@ -300,7 +328,7 @@ * back there or track user information then we can * only use the space before that information. */ - if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) + if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER | SLAB_STORE_USER_LITE)) return s->inuse; /* * Else we can use all the padding etc for the allocation @@ -346,6 +374,21 @@ __bit_spin_unlock(PG_locked, &page->flags); } +static inline void set_page_slub_counters(struct page *page, unsigned long counters_new) +{ + struct page tmp; + tmp.counters = counters_new; + /* + * page->counters can cover frozen/inuse/objects as well + * as page->_count. If we assign to ->counters directly + * we run the risk of losing updates to page->_count, so + * be careful and only assign to the fields we need. 
+ */ + page->frozen = tmp.frozen; + page->inuse = tmp.inuse; + page->objects = tmp.objects; +} + /* Interrupts must be disabled (for the fallback code to work right) */ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, void *freelist_old, unsigned long counters_old, @@ -357,18 +400,19 @@ defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (s->flags & __CMPXCHG_DOUBLE) { if (cmpxchg_double(&page->freelist, &page->counters, - freelist_old, counters_old, - freelist_new, counters_new)) - return 1; + freelist_old, counters_old, + freelist_new, counters_new)) + return true; } else #endif { slab_lock(page); - if (page->freelist == freelist_old && page->counters == counters_old) { + if (page->freelist == freelist_old && + page->counters == counters_old) { page->freelist = freelist_new; - page->counters = counters_new; + set_page_slub_counters(page, counters_new); slab_unlock(page); - return 1; + return true; } slab_unlock(page); } @@ -377,10 +421,10 @@ stat(s, CMPXCHG_DOUBLE_FAIL); #ifdef SLUB_DEBUG_CMPXCHG - printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); + pr_info("%s %s: cmpxchg double redo ", n, s->name); #endif - return 0; + return false; } static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, @@ -392,9 +436,9 @@ defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) if (s->flags & __CMPXCHG_DOUBLE) { if (cmpxchg_double(&page->freelist, &page->counters, - freelist_old, counters_old, - freelist_new, counters_new)) - return 1; + freelist_old, counters_old, + freelist_new, counters_new)) + return true; } else #endif { @@ -402,12 +446,13 @@ local_irq_save(flags); slab_lock(page); - if (page->freelist == freelist_old && page->counters == counters_old) { + if (page->freelist == freelist_old && + page->counters == counters_old) { page->freelist = freelist_new; - page->counters = counters_new; + set_page_slub_counters(page, counters_new); slab_unlock(page); local_irq_restore(flags); - return 1; + return true; } slab_unlock(page); local_irq_restore(flags); @@ -417,13 +462,23 @@ stat(s, CMPXCHG_DOUBLE_FAIL); #ifdef SLUB_DEBUG_CMPXCHG - printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); + pr_info("%s %s: cmpxchg double redo ", n, s->name); #endif - return 0; + return false; } #ifdef CONFIG_SLUB_DEBUG + +static void delayed_slub_corruption_handler(struct work_struct *work); +static DECLARE_DELAYED_WORK(delayed_bug, delayed_slub_corruption_handler); + +static void delayed_slub_corruption_handler(struct work_struct *work) +{ + pr_err("slub corruption: intentional crash in workqueue\n"); + BUG(); +} + /* * Determine a map of object in use on a page. * @@ -439,25 +494,50 @@ set_bit(slab_index(p, s, addr), map); } +#if defined(CONFIG_SLUB_AVM_ALLOC_LIST) +#define DEBUG_MIN_FLAGS (SLAB_STORE_USER_LITE | SLAB_DEBUG_FREE) +#else +#define DEBUG_MIN_FLAGS 0x0 +#endif /* * Debug settings: */ -#ifdef CONFIG_SLUB_DEBUG_ON +#if defined(CONFIG_SLUB_DEBUG_ON) static int slub_debug = DEBUG_DEFAULT_FLAGS; +#elif defined(CONFIG_KASAN) +static int slub_debug = SLAB_STORE_USER | DEBUG_MIN_FLAGS; #else -static int slub_debug; +static int slub_debug = DEBUG_MIN_FLAGS; #endif static char *slub_debug_slabs; static int disable_higher_order_debug; /* + * slub is about to manipulate internal object metadata. This memory lies + * outside the range of the allocated object, so accessing it would normally + * be reported by kasan as a bounds error. metadata_access_enable() is used + * to tell kasan that these accesses are OK. 
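 *
 * The helpers below keep that window as small as possible by wrapping only
 * the raw metadata access, e.g. the check_bytes_and_report() sketch:
 *
 *	metadata_access_enable();
 *	fault = memchr_inv(start, value, bytes);
 *	metadata_access_disable();
 *
 * so KASAN stays armed for everything else around it.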
+ */ +static inline void metadata_access_enable(void) +{ + kasan_disable_current(); +} + +static inline void metadata_access_disable(void) +{ + kasan_enable_current(); +} + +/* * Object debugging */ static void print_section(char *text, u8 *addr, unsigned int length) { + metadata_access_enable(); print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr, length, 1); + metadata_access_disable(); } static struct track *get_track(struct kmem_cache *s, void *object, @@ -470,63 +550,111 @@ else p = object + s->inuse; - return p + alloc; + return (void *)p + (alloc ? track_size(s->flags) : 0); +} + +/** + * ret: old_addr-value + */ +static inline unsigned long reset_track(struct kmem_cache *s, void *object, + enum track_item alloc) +{ + unsigned long old_addr; + struct track *p = get_track(s, object, alloc); + + /*--- todo use: cmpxchg() ---*/ + old_addr = xchg(&p->addr, 0L); +#ifdef CONFIG_STACKTRACE + if (unlikely(s->flags & SLAB_STORE_USER)) + p->addrs[0] = 0L; +#endif + return old_addr; } -static void set_track(struct kmem_cache *s, void *object, +/** + * special case or TRACK_FREE! + * + * if alloc == TRACK_FREE: + * (a) clear alloc-addr + * (b) set free-addr ... - but only if old-alloc-addr exist + * (c) deliver old-alloc-addr + * + */ +static unsigned long set_track(struct kmem_cache *s, void *object, enum track_item alloc, unsigned long addr) { + unsigned long alloc_addr = 0; struct track *p = get_track(s, object, alloc); - if (addr) { + if (alloc == TRACK_FREE) { + /* use alloc-addr as indicator for an alloced object */ + alloc_addr = reset_track(s, object, TRACK_ALLOC); + if (alloc_addr == 0) + /* it should be double-freed */ + return alloc_addr; + } + p->addr = addr; + p->cpu = smp_processor_id(); + p->pid = current->pid; + +#if defined(CONFIG_SLUB_AVM_ALLOC_LIST) + if (likely(!slab_track_time)) + /* do not track initial for slab_allocator (too much entries) */ + p->when = 0; + else +#endif + p->when = jiffies; + #ifdef CONFIG_STACKTRACE + if (unlikely(s->flags & SLAB_STORE_USER)) { struct stack_trace trace; int i; trace.nr_entries = 0; + /*--- complete stacktrace is to slow ! 
---*/ trace.max_entries = TRACK_ADDRS_COUNT; trace.entries = p->addrs; trace.skip = 3; + metadata_access_enable(); save_stack_trace(&trace); + metadata_access_disable(); /* See rant in lockdep.c */ if (trace.nr_entries != 0 && trace.entries[trace.nr_entries - 1] == ULONG_MAX) trace.nr_entries--; - for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++) p->addrs[i] = 0; + } #endif - p->addr = addr; - p->cpu = smp_processor_id(); - p->pid = current->pid; - p->when = jiffies; - } else - memset(p, 0, sizeof(struct track)); + return alloc_addr; } static void init_tracking(struct kmem_cache *s, void *object) { - if (!(s->flags & SLAB_STORE_USER)) + if (!(s->flags & (SLAB_STORE_USER | SLAB_STORE_USER_LITE))) return; - set_track(s, object, TRACK_FREE, 0UL); - set_track(s, object, TRACK_ALLOC, 0UL); + reset_track(s, object, TRACK_FREE); + reset_track(s, object, TRACK_ALLOC); } -static void print_track(const char *s, struct track *t) +static void print_track(const char *s, struct track *t, unsigned int flags) { if (!t->addr) return; - printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", - s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); + pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n", + s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); #ifdef CONFIG_STACKTRACE { int i; + + if (!(flags & SLAB_STORE_USER)) + return; for (i = 0; i < TRACK_ADDRS_COUNT; i++) if (t->addrs[i]) - printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]); + pr_err("\t%pS\n", (void *)t->addrs[i]); else break; } @@ -535,46 +663,46 @@ static void print_tracking(struct kmem_cache *s, void *object) { - if (!(s->flags & SLAB_STORE_USER)) + if (!(s->flags & (SLAB_STORE_USER | SLAB_STORE_USER_LITE))) return; - print_track("Allocated", get_track(s, object, TRACK_ALLOC)); - print_track("Freed", get_track(s, object, TRACK_FREE)); + print_track("Allocated", get_track(s, object, TRACK_ALLOC), s->flags); + print_track("Freed", get_track(s, object, TRACK_FREE), s->flags); } static void print_page_info(struct page *page) { - printk(KERN_ERR "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", - page, page->objects, page->inuse, page->freelist, page->flags); + pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", + page, page->objects, page->inuse, page->freelist, page->flags); } static void slab_bug(struct kmem_cache *s, char *fmt, ...) { + struct va_format vaf; va_list args; - char buf[100]; va_start(args, fmt); - vsnprintf(buf, sizeof(buf), fmt, args); - va_end(args); - printk(KERN_ERR "========================================" - "=====================================\n"); - printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf); - printk(KERN_ERR "----------------------------------------" - "-------------------------------------\n\n"); + vaf.fmt = fmt; + vaf.va = &args; + pr_err("=============================================================================\n"); + pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf); + pr_err("-----------------------------------------------------------------------------\n\n"); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + va_end(args); } static void slab_fix(struct kmem_cache *s, char *fmt, ...) 
{ + struct va_format vaf; va_list args; - char buf[100]; va_start(args, fmt); - vsnprintf(buf, sizeof(buf), fmt, args); + vaf.fmt = fmt; + vaf.va = &args; + pr_err("FIX %s: %pV\n", s->name, &vaf); va_end(args); - printk(KERN_ERR "FIX %s: %s\n", s->name, buf); } static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) @@ -586,8 +714,8 @@ print_page_info(page); - printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", - p, p - addr, get_freepointer(s, p)); + pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", + p, p - addr, get_freepointer(s, p)); if (p > addr + 16) print_section("Bytes b4 ", p - 16, 16); @@ -603,8 +731,8 @@ else off = s->inuse; - if (s->flags & SLAB_STORE_USER) - off += 2 * sizeof(struct track); + if (s->flags & (SLAB_STORE_USER | SLAB_STORE_USER_LITE)) + off += 2 * track_size(s->flags); if (off != s->size) /* Beginning of the filler is the free pointer */ @@ -613,14 +741,15 @@ dump_stack(); } -static void object_err(struct kmem_cache *s, struct page *page, +void object_err(struct kmem_cache *s, struct page *page, u8 *object, char *reason) { slab_bug(s, "%s", reason); print_trailer(s, page, object); } -static void slab_err(struct kmem_cache *s, struct page *page, const char *fmt, ...) +static void slab_err(struct kmem_cache *s, struct page *page, + const char *fmt, ...) { va_list args; char buf[100]; @@ -660,7 +789,9 @@ u8 *fault; u8 *end; + metadata_access_enable(); fault = memchr_inv(start, value, bytes); + metadata_access_disable(); if (!fault) return 1; @@ -669,11 +800,16 @@ end--; slab_bug(s, "%s overwritten", what); - printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n", + pr_err("INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n", fault, end - 1, fault[0], value); print_trailer(s, page, object); restore_bytes(s, what, value, fault, end); + + if (s->flags & SLAB_PANIC_CORRUPTION) { + pr_err("slub corruption: schedule delayed BUG()\n"); + schedule_delayed_work(&delayed_bug, 1000); + } return 0; } @@ -723,9 +859,9 @@ /* Freepointer is placed after the object. 
*/ off += sizeof(void *); - if (s->flags & SLAB_STORE_USER) + if (s->flags & (SLAB_STORE_USER | SLAB_STORE_USER_LITE)) /* We also have user information there */ - off += 2 * sizeof(struct track); + off += 2 * track_size(s->flags); if (s->size == off) return 1; @@ -753,7 +889,9 @@ if (!remainder) return 1; + metadata_access_enable(); fault = memchr_inv(end - remainder, POISON_INUSE, remainder); + metadata_access_disable(); if (!fault) return 1; while (end > fault && end[-1] == POISON_INUSE) @@ -779,7 +917,8 @@ } else { if ((s->flags & SLAB_POISON) && s->object_size < s->inuse) { check_bytes_and_report(s, page, p, "Alignment padding", - endobject, POISON_INUSE, s->inuse - s->object_size); + endobject, POISON_INUSE, + s->inuse - s->object_size); } } @@ -831,12 +970,12 @@ maxobj = order_objects(compound_order(page), s->size, s->reserved); if (page->objects > maxobj) { slab_err(s, page, "objects %u > max %u", - s->name, page->objects, maxobj); + page->objects, maxobj); return 0; } if (page->inuse > page->objects) { slab_err(s, page, "inuse %u > max %u", - s->name, page->inuse, page->objects); + page->inuse, page->objects); return 0; } /* Slab_pad_check fixes things up after itself */ @@ -853,7 +992,7 @@ int nr = 0; void *fp; void *object = NULL; - unsigned long max_objects; + int max_objects; fp = page->freelist; while (fp && nr <= page->objects) { @@ -864,7 +1003,6 @@ object_err(s, page, object, "Freechain corrupt"); set_freepointer(s, object, NULL); - break; } else { slab_err(s, page, "Freepointer corrupt"); page->freelist = NULL; @@ -898,70 +1036,24 @@ return search == NULL; } -static void trace(struct kmem_cache *s, struct page *page, void *object, +static noinline void trace(struct kmem_cache *s, struct page *page, void *object, int alloc) { - if (s->flags & SLAB_TRACE) { - printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n", + pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n", s->name, alloc ? "alloc" : "free", object, page->inuse, page->freelist); if (!alloc) - print_section("Object ", (void *)object, s->object_size); + print_section("Object ", (void *)object, + s->object_size); dump_stack(); - } -} - -/* - * Hooks for other subsystems that check memory allocations. In a typical - * production configuration these hooks all should produce no code at all. - */ -static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) -{ - flags &= gfp_allowed_mask; - lockdep_trace_alloc(flags); - might_sleep_if(flags & __GFP_WAIT); - - return should_failslab(s->object_size, flags, s->flags); -} - -static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object) -{ - flags &= gfp_allowed_mask; - kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); - kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); -} - -static inline void slab_free_hook(struct kmem_cache *s, void *x) -{ - kmemleak_free_recursive(x, s->flags); - - /* - * Trouble is that we may no longer disable interupts in the fast path - * So in order to make the debug calls that expect irqs to be - * disabled we need to disable interrupts temporarily. - */ -#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) - { - unsigned long flags; - - local_irq_save(flags); - kmemcheck_slab_free(s, x, s->object_size); - debug_check_no_locks_freed(x, s->object_size); - local_irq_restore(flags); - } -#endif - if (!(s->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(x, s->object_size); } /* * Tracking of fully allocated slabs for debugging purposes. - * - * list_lock must be held. 
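 *
 * Slabs with no free objects are linked on n->full only when
 * SLAB_STORE_USER is set; that list is what lets the sysfs validation and
 * alloc_calls code reach objects sitting on completely allocated slabs.
 * The node's list_lock protects the list, roughly as __slab_free() does it:
 *
 *	spin_lock_irqsave(&n->list_lock, flags);
 *	remove_full(s, n, page);
 *	spin_unlock_irqrestore(&n->list_lock, flags);
 *
 * and add_full()/remove_full() verify that with lockdep_assert_held().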
*/ static void add_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) @@ -969,17 +1061,16 @@ if (!(s->flags & SLAB_STORE_USER)) return; + lockdep_assert_held(&n->list_lock); list_add(&page->lru, &n->full); } -/* - * list_lock must be held. - */ -static void remove_full(struct kmem_cache *s, struct page *page) +static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) { if (!(s->flags & SLAB_STORE_USER)) return; + lockdep_assert_held(&n->list_lock); list_del(&page->lru); } @@ -1023,14 +1114,15 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page, void *object) { - if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))) + if (!(s->flags & (SLAB_STORE_USER|SLAB_STORE_USER_LITE|SLAB_RED_ZONE|__OBJECT_POISON))) return; init_object(s, object, SLUB_RED_INACTIVE); init_tracking(s, object); } -static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *page, +static noinline int alloc_debug_processing(struct kmem_cache *s, + struct page *page, void *object, unsigned long addr) { if (!check_slab(s, page)) @@ -1044,10 +1136,8 @@ if (!check_object(s, page, object, SLUB_RED_INACTIVE)) goto bad; - /* Success perform special debug activities for allocs */ - if (s->flags & SLAB_STORE_USER) - set_track(s, object, TRACK_ALLOC, addr); - trace(s, page, object, 1); + if (unlikely(s->flags & SLAB_TRACE)) + trace(s, page, object, 1); init_object(s, object, SLUB_RED_ACTIVE); return 1; @@ -1065,11 +1155,15 @@ return 0; } +/* Supports checking bulk free of a constructed freelist */ static noinline struct kmem_cache_node *free_debug_processing( - struct kmem_cache *s, struct page *page, void *object, + struct kmem_cache *s, struct page *page, + void *head, void *tail, int bulk_cnt, unsigned long addr, unsigned long *flags) { struct kmem_cache_node *n = get_node(s, page_to_nid(page)); + void *object = head; + int cnt = 0; spin_lock_irqsave(&n->list_lock, *flags); slab_lock(page); @@ -1077,6 +1171,9 @@ if (!check_slab(s, page)) goto fail; +next_object: + cnt++; + if (!check_valid_pointer(s, page, object)) { slab_err(s, page, "Invalid object pointer 0x%p", object); goto fail; @@ -1095,9 +1192,8 @@ slab_err(s, page, "Attempt to free object(0x%p) " "outside of slab", object); } else if (!page->slab_cache) { - printk(KERN_ERR - "SLUB : no slab for object 0x%p.\n", - object); + pr_err("SLUB : no slab for object 0x%p.\n", + object); dump_stack(); } else object_err(s, page, object, @@ -1105,11 +1201,24 @@ goto fail; } - if (s->flags & SLAB_STORE_USER) + if (s->flags & (SLAB_STORE_USER | SLAB_STORE_USER_LITE)) set_track(s, object, TRACK_FREE, addr); - trace(s, page, object, 0); + + if (unlikely(s->flags & SLAB_TRACE)) + trace(s, page, object, 0); + /* Freepointer not overwritten by init_object(), SLAB_POISON moved it */ init_object(s, object, SLUB_RED_INACTIVE); + + /* Reached end of constructed freelist yet? 
*/ + if (object != tail) { + object = get_freepointer(s, object); + goto next_object; + } out: + if (cnt != bulk_cnt) + slab_err(s, page, "Bulk freelist count(%d) invalid(%d)\n", + bulk_cnt, cnt); + slab_unlock(page); /* * Keep node_lock to preserve integrity @@ -1124,6 +1233,32 @@ return NULL; } +static int free_debug_processing_fast(struct kmem_cache *s, struct page *page, + void *object, + unsigned long addr) +{ + struct kmem_cache_node *n; + unsigned long flags; + + if (s->flags & SLAB_DEBUG_FREE) { + + if (unlikely((s != page->slab_cache) || !check_valid_pointer(s, page, object))) { + /* force complete check with locks */ + n = free_debug_processing(s, page, object, object, addr, 1, &flags); + if (n) + spin_unlock_irqrestore(&n->list_lock, flags); + else + return 0; + } + } + if (s->flags & (SLAB_STORE_USER | SLAB_STORE_USER_LITE)) { + if (unlikely(set_track(s, object, TRACK_FREE, addr) == 0)) { + object_err(s, page, object, "Object already free"); + return 0; + } + } + return 1; +} static int __init setup_slub_debug(char *str) { slub_debug = DEBUG_DEFAULT_FLAGS; @@ -1140,15 +1275,6 @@ */ goto check_slabs; - if (tolower(*str) == 'o') { - /* - * Avoid enabling debugging on caches if its minimum order - * would increase as a result. - */ - disable_higher_order_debug = 1; - goto out; - } - slub_debug = 0; if (*str == '-') /* @@ -1179,22 +1305,42 @@ case 'a': slub_debug |= SLAB_FAILSLAB; break; + case 'l': + slub_debug |= SLAB_STORE_USER_LITE; + break; + case 'o': + /* + * Avoid enabling debugging on caches if its minimum + * order would increase as a result. + */ + disable_higher_order_debug = 1; + break; + case 'c': + slub_debug |= SLAB_PANIC_CORRUPTION; + break; default: - printk(KERN_ERR "slub_debug option '%c' " - "unknown. skipped\n", *str); + pr_err("slub_debug option '%c' unknown. skipped\n", + *str); } } - check_slabs: if (*str == ',') slub_debug_slabs = str + 1; out: + pr_err("%s: slub_debug =%s%s%s%s%s%s%s\n", __func__, + (slub_debug & SLAB_DEBUG_FREE) ? " DEBUG_FREE" : "", + (slub_debug & SLAB_RED_ZONE) ? " RED_ZONE" : "", + (slub_debug & SLAB_POISON) ? " POISON" : "", + (slub_debug & SLAB_STORE_USER) ? " STORE_USER" : "", + (slub_debug & SLAB_TRACE) ? " TRACE" : "", + (slub_debug & SLAB_FAILSLAB) ? " FAILSLAB" : "", + (slub_debug & SLAB_STORE_USER_LITE) ? " STORE_USER_LITE" : ""); return 1; } __setup("slub_debug", setup_slub_debug); -static unsigned long kmem_cache_flags(unsigned long object_size, +unsigned long kmem_cache_flags(unsigned long object_size, unsigned long flags, const char *name, void (*ctor)(void *)) { @@ -1205,9 +1351,17 @@ !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs))))) flags |= slub_debug; + pr_debug("%s: flags =%s%s%s%s%s%s%s\n", __func__, + (flags & SLAB_DEBUG_FREE) ? " DEBUG_FREE" : "", + (flags & SLAB_RED_ZONE) ? " RED_ZONE" : "", + (flags & SLAB_POISON) ? " POISON" : "", + (flags & SLAB_STORE_USER) ? " STORE_USER" : "", + (flags & SLAB_TRACE) ? " TRACE" : "", + (flags & SLAB_FAILSLAB) ? " FAILSLAB" : "", + (flags & SLAB_STORE_USER_LITE) ? 
" STORE_USER_LITE" : ""); return flags; } -#else +#else /* !CONFIG_SLUB_DEBUG */ static inline void setup_object_debug(struct kmem_cache *s, struct page *page, void *object) {} @@ -1215,17 +1369,22 @@ struct page *page, void *object, unsigned long addr) { return 0; } static inline struct kmem_cache_node *free_debug_processing( - struct kmem_cache *s, struct page *page, void *object, + struct kmem_cache *s, struct page *page, + void *head, void *tail, int bulk_cnt, unsigned long addr, unsigned long *flags) { return NULL; } - +static int free_debug_processing_fast(struct kmem_cache *s, struct page *page, + void *object, + unsigned long addr) + { return 1; } static inline int slab_pad_check(struct kmem_cache *s, struct page *page) { return 1; } static inline int check_object(struct kmem_cache *s, struct page *page, void *object, u8 val) { return 1; } static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) {} -static inline void remove_full(struct kmem_cache *s, struct page *page) {} -static inline unsigned long kmem_cache_flags(unsigned long object_size, +static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, + struct page *page) {} +unsigned long kmem_cache_flags(unsigned long object_size, unsigned long flags, const char *name, void (*ctor)(void *)) { @@ -1244,30 +1403,135 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} -static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) - { return 0; } +#endif /* CONFIG_SLUB_DEBUG */ + +/* + * Hooks for other subsystems that check memory allocations. In a typical + * production configuration these hooks all should produce no code at all. + */ +static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) +{ + kmemleak_alloc(ptr, size, 1, flags); + kasan_kmalloc_large(ptr, size); +} + +static inline void kfree_hook(const void *x) +{ + kmemleak_free(x); + kasan_kfree_large(x); +} + +static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s, + gfp_t flags) +{ + flags &= gfp_allowed_mask; + lockdep_trace_alloc(flags); + might_sleep_if(gfpflags_allow_blocking(flags)); + + if (should_failslab(s->object_size, flags, s->flags)) + return NULL; + + return memcg_kmem_get_cache(s, flags); +} static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, - void *object) {} + size_t size, void **p) +{ + size_t i; -static inline void slab_free_hook(struct kmem_cache *s, void *x) {} + flags &= gfp_allowed_mask; + for (i = 0; i < size; i++) { + void *object = p[i]; -#endif /* CONFIG_SLUB_DEBUG */ + kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); + kmemleak_alloc_recursive(object, s->object_size, 1, + s->flags, flags); + kasan_slab_alloc(s, object); + } + memcg_kmem_put_cache(s); +} + +static inline void slab_free_hook(struct kmem_cache *s, void *x) +{ + kmemleak_free_recursive(x, s->flags); + + /* + * Trouble is that we may no longer disable interrupts in the fast path + * So in order to make the debug calls that expect irqs to be + * disabled we need to disable interrupts temporarily. 
+ */ +#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) + { + unsigned long flags; + + local_irq_save(flags); + kmemcheck_slab_free(s, x, s->object_size); + debug_check_no_locks_freed(x, s->object_size); + local_irq_restore(flags); + } +#endif + if (!(s->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(x, s->object_size); + + kasan_slab_free(s, x); +} + +static inline void slab_free_freelist_hook(struct kmem_cache *s, + void *head, void *tail) +{ +/* + * Compiler cannot detect this function can be removed if slab_free_hook() + * evaluates to nothing. Thus, catch all relevant config debug options here. + */ +#if defined(CONFIG_KMEMCHECK) || \ + defined(CONFIG_LOCKDEP) || \ + defined(CONFIG_DEBUG_KMEMLEAK) || \ + defined(CONFIG_DEBUG_OBJECTS_FREE) || \ + defined(CONFIG_KASAN) + + void *object = head; + void *tail_obj = tail ? : head; + + do { + slab_free_hook(s, object); + } while ((object != tail_obj) && + (object = get_freepointer(s, object))); +#endif +} + +static void setup_object(struct kmem_cache *s, struct page *page, + void *object) +{ + setup_object_debug(s, page, object); + if (unlikely(s->ctor)) { + kasan_unpoison_object_data(s, object); + s->ctor(object); + kasan_poison_object_data(s, object); + } +} /* * Slab allocation and freeing */ -static inline struct page *alloc_slab_page(gfp_t flags, int node, - struct kmem_cache_order_objects oo) +static inline struct page *alloc_slab_page(struct kmem_cache *s, + gfp_t flags, int node, struct kmem_cache_order_objects oo) { + struct page *page; int order = oo_order(oo); flags |= __GFP_NOTRACK; if (node == NUMA_NO_NODE) - return alloc_pages(flags, order); + page = alloc_pages(flags, order); else - return alloc_pages_exact_node(node, flags, order); + page = __alloc_pages_node(node, flags, order); + + if (page && memcg_charge_slab(page, flags, order, s)) { + __free_pages(page, order); + page = NULL; + } + + return page; } static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) @@ -1275,10 +1539,12 @@ struct page *page; struct kmem_cache_order_objects oo = s->oo; gfp_t alloc_gfp; + void *start, *p; + int idx, order; flags &= gfp_allowed_mask; - if (flags & __GFP_WAIT) + if (gfpflags_allow_blocking(flags)) local_irq_enable(); flags |= s->allocflags; @@ -1288,25 +1554,28 @@ * so we fall-back to the minimum order allocation. */ alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; + if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min)) + alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_DIRECT_RECLAIM; - page = alloc_slab_page(alloc_gfp, node, oo); + page = alloc_slab_page(s, alloc_gfp, node, oo); if (unlikely(!page)) { oo = s->min; + alloc_gfp = flags; /* * Allocation may have failed due to fragmentation. 
* Try a lower order alloc if possible */ - page = alloc_slab_page(flags, node, oo); - - if (page) - stat(s, ORDER_FALLBACK); + page = alloc_slab_page(s, alloc_gfp, node, oo); + if (unlikely(!page)) + goto out; + stat(s, ORDER_FALLBACK); } - if (kmemcheck_enabled && page - && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { + if (kmemcheck_enabled && + !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { int pages = 1 << oo_order(oo); - kmemcheck_alloc_shadow(page, oo_order(oo), flags, node); + kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node); /* * Objects from caches that have a constructor don't get @@ -1318,49 +1587,12 @@ kmemcheck_mark_unallocated_pages(page, pages); } - if (flags & __GFP_WAIT) - local_irq_disable(); - if (!page) - return NULL; - page->objects = oo_objects(oo); - mod_zone_page_state(page_zone(page), - (s->flags & SLAB_RECLAIM_ACCOUNT) ? - NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, - 1 << oo_order(oo)); - - return page; -} - -static void setup_object(struct kmem_cache *s, struct page *page, - void *object) -{ - setup_object_debug(s, page, object); - if (unlikely(s->ctor)) - s->ctor(object); -} - -static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) -{ - struct page *page; - void *start; - void *last; - void *p; - int order; - - BUG_ON(flags & GFP_SLAB_BUG_MASK); - - page = allocate_slab(s, - flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); - if (!page) - goto out; order = compound_order(page); - inc_slabs_node(s, page_to_nid(page), page->objects); - memcg_bind_pages(s, order); page->slab_cache = s; __SetPageSlab(page); - if (page->pfmemalloc) + if (page_is_pfmemalloc(page)) SetPageSlabPfmemalloc(page); start = page_address(page); @@ -1368,22 +1600,47 @@ if (unlikely(s->flags & SLAB_POISON)) memset(start, POISON_INUSE, PAGE_SIZE << order); - last = start; - for_each_object(p, s, start, page->objects) { - setup_object(s, page, last); - set_freepointer(s, last, p); - last = p; + kasan_poison_slab(page); + + for_each_object_idx(p, idx, s, start, page->objects) { + setup_object(s, page, p); + if (likely(idx < page->objects)) + set_freepointer(s, p, p + s->size); + else + set_freepointer(s, p, NULL); } - setup_object(s, page, last); - set_freepointer(s, last, NULL); page->freelist = start; page->inuse = page->objects; page->frozen = 1; + out: + if (gfpflags_allow_blocking(flags)) + local_irq_disable(); + if (!page) + return NULL; + + mod_zone_page_state(page_zone(page), + (s->flags & SLAB_RECLAIM_ACCOUNT) ? 
+ NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, + 1 << oo_order(oo)); + + inc_slabs_node(s, page_to_nid(page), page->objects); + return page; } +static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) +{ + if (unlikely(flags & GFP_SLAB_BUG_MASK)) { + pr_emerg("gfp: %u\n", flags & GFP_SLAB_BUG_MASK); + BUG(); + } + + return allocate_slab(s, + flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node); +} + static void __free_slab(struct kmem_cache *s, struct page *page) { int order = compound_order(page); @@ -1408,11 +1665,10 @@ __ClearPageSlabPfmemalloc(page); __ClearPageSlab(page); - memcg_release_pages(s, order); page_mapcount_reset(page); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; - __free_memcg_kmem_pages(page, order); + __free_kmem_pages(page, order); } #define need_reserve_slab_rcu \ @@ -1442,10 +1698,7 @@ VM_BUG_ON(s->reserved != sizeof(*head)); head = page_address(page) + offset; } else { - /* - * RCU free overloads the RCU head over the LRU - */ - head = (void *)&page->lru; + head = &page->rcu_head; } call_rcu(head, rcu_free_slab); @@ -1461,11 +1714,9 @@ /* * Management of partially allocated slabs. - * - * list_lock must be held. */ -static inline void add_partial(struct kmem_cache_node *n, - struct page *page, int tail) +static inline void +__add_partial(struct kmem_cache_node *n, struct page *page, int tail) { n->nr_partial++; if (tail == DEACTIVATE_TO_TAIL) @@ -1474,23 +1725,32 @@ list_add(&page->lru, &n->partial); } -/* - * list_lock must be held. - */ -static inline void remove_partial(struct kmem_cache_node *n, - struct page *page) +static inline void add_partial(struct kmem_cache_node *n, + struct page *page, int tail) +{ + lockdep_assert_held(&n->list_lock); + __add_partial(n, page, tail); +} + +static inline void +__remove_partial(struct kmem_cache_node *n, struct page *page) { list_del(&page->lru); n->nr_partial--; } +static inline void remove_partial(struct kmem_cache_node *n, + struct page *page) +{ + lockdep_assert_held(&n->list_lock); + __remove_partial(n, page); +} + /* * Remove slab from the partial list, freeze it and * return the pointer to the freelist. * * Returns a list of objects or NULL if it fails. - * - * Must hold list_lock since we modify the partial list. */ static inline void *acquire_slab(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page, @@ -1500,6 +1760,8 @@ unsigned long counters; struct page new; + lockdep_assert_held(&n->list_lock); + /* * Zap the freelist and set the frozen bit. * The old freelist is the list of objects for the @@ -1573,7 +1835,8 @@ put_cpu_partial(s, page, 0); stat(s, CPU_PARTIAL_NODE); } - if (kmem_cache_debug(s) || available > s->cpu_partial / 2) + if (!kmem_cache_has_cpu_partial(s) + || available > s->cpu_partial / 2) break; } @@ -1618,31 +1881,29 @@ return NULL; do { - cpuset_mems_cookie = get_mems_allowed(); - zonelist = node_zonelist(slab_node(), flags); + cpuset_mems_cookie = read_mems_allowed_begin(); + zonelist = node_zonelist(mempolicy_slab_node(), flags); for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { struct kmem_cache_node *n; n = get_node(s, zone_to_nid(zone)); - if (n && cpuset_zone_allowed_hardwall(zone, flags) && + if (n && cpuset_zone_allowed(zone, flags) && n->nr_partial > s->min_partial) { object = get_partial_node(s, n, c, flags); if (object) { /* - * Return the object even if - * put_mems_allowed indicated that - * the cpuset mems_allowed was - * updated in parallel. 
It's a - * harmless race between the alloc - * and the cpuset update. + * Don't check read_mems_allowed_retry() + * here - if mems_allowed was updated in + * parallel, that was a harmless race + * between allocation and the cpuset + * update */ - put_mems_allowed(cpuset_mems_cookie); return object; } } } - } while (!put_mems_allowed(cpuset_mems_cookie)); + } while (read_mems_allowed_retry(cpuset_mems_cookie)); #endif return NULL; } @@ -1654,7 +1915,12 @@ struct kmem_cache_cpu *c) { void *object; - int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; + int searchnode = node; + + if (node == NUMA_NO_NODE) + searchnode = numa_mem_id(); + else if (!node_present_pages(node)) + searchnode = node_to_mem_node(node); object = get_partial_node(s, get_node(s, searchnode), c, flags); if (object || node != NUMA_NO_NODE) @@ -1704,19 +1970,19 @@ #ifdef SLUB_DEBUG_CMPXCHG unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid); - printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name); + pr_info("%s %s: cmpxchg redo ", n, s->name); #ifdef CONFIG_PREEMPT if (tid_to_cpu(tid) != tid_to_cpu(actual_tid)) - printk("due to cpu change %d -> %d\n", + pr_warn("due to cpu change %d -> %d\n", tid_to_cpu(tid), tid_to_cpu(actual_tid)); else #endif if (tid_to_event(tid) != tid_to_event(actual_tid)) - printk("due to cpu running other code. Event %ld->%ld\n", + pr_warn("due to cpu running other code. Event %ld->%ld\n", tid_to_event(tid), tid_to_event(actual_tid)); else - printk("for unknown reason: actual=%lx was=%lx target=%lx\n", + pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n", actual_tid, tid, next_tid(tid)); #endif stat(s, CMPXCHG_DOUBLE_CPU_FAIL); @@ -1733,7 +1999,8 @@ /* * Remove the cpu slab */ -static void deactivate_slab(struct kmem_cache *s, struct page *page, void *freelist) +static void deactivate_slab(struct kmem_cache *s, struct page *page, + void *freelist) { enum slab_modes { M_NONE, M_PARTIAL, M_FULL, M_FREE }; struct kmem_cache_node *n = get_node(s, page_to_nid(page)); @@ -1808,7 +2075,7 @@ new.frozen = 0; - if (!new.inuse && n->nr_partial > s->min_partial) + if (!new.inuse && n->nr_partial >= s->min_partial) m = M_FREE; else if (new.freelist) { m = M_PARTIAL; @@ -1842,7 +2109,7 @@ else if (l == M_FULL) - remove_full(s, page); + remove_full(s, n, page); if (m == M_PARTIAL) { @@ -1884,6 +2151,7 @@ static void unfreeze_partials(struct kmem_cache *s, struct kmem_cache_cpu *c) { +#ifdef CONFIG_SLUB_CPU_PARTIAL struct kmem_cache_node *n = NULL, *n2 = NULL; struct page *page, *discard_page = NULL; @@ -1918,7 +2186,7 @@ new.freelist, new.counters, "unfreezing slab")); - if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) { + if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) { page->next = discard_page; discard_page = page; } else { @@ -1938,6 +2206,7 @@ discard_slab(s, page); stat(s, FREE_SLAB); } +#endif } /* @@ -1951,10 +2220,12 @@ */ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) { +#ifdef CONFIG_SLUB_CPU_PARTIAL struct page *oldpage; int pages; int pobjects; + preempt_disable(); do { pages = 0; pobjects = 0; @@ -1986,7 +2257,17 @@ page->pobjects = pobjects; page->next = oldpage; - } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); + } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) + != oldpage); + if (unlikely(!s->cpu_partial)) { + unsigned long flags; + + local_irq_save(flags); + unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); + local_irq_restore(flags); + } + preempt_enable(); +#endif 
} static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) @@ -2049,11 +2330,19 @@ return 1; } +#ifdef CONFIG_SLUB_DEBUG static int count_free(struct page *page) { return page->objects - page->inuse; } +static inline unsigned long node_nr_objs(struct kmem_cache_node *n) +{ + return atomic_long_read(&n->total_objects); +} +#endif /* CONFIG_SLUB_DEBUG */ + +#if defined(CONFIG_SLUB_DEBUG) || defined(SLAB_SUPPORTS_SYSFS) static unsigned long count_partial(struct kmem_cache_node *n, int (*get_count)(struct page *)) { @@ -2067,49 +2356,43 @@ spin_unlock_irqrestore(&n->list_lock, flags); return x; } - -static inline unsigned long node_nr_objs(struct kmem_cache_node *n) -{ -#ifdef CONFIG_SLUB_DEBUG - return atomic_long_read(&n->total_objects); -#else - return 0; -#endif -} +#endif /* CONFIG_SLUB_DEBUG || SLAB_SUPPORTS_SYSFS */ static noinline void slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { +#ifdef CONFIG_SLUB_DEBUG + static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); int node; + struct kmem_cache_node *n; + + if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) + return; - printk(KERN_WARNING - "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", + pr_warn("SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", nid, gfpflags); - printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, " - "default order: %d, min order: %d\n", s->name, s->object_size, - s->size, oo_order(s->oo), oo_order(s->min)); + pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n", + s->name, s->object_size, s->size, oo_order(s->oo), + oo_order(s->min)); if (oo_order(s->min) > get_order(s->object_size)) - printk(KERN_WARNING " %s debugging increased min order, use " - "slub_debug=O to disable.\n", s->name); + pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n", + s->name); - for_each_online_node(node) { - struct kmem_cache_node *n = get_node(s, node); + for_each_kmem_cache_node(s, node, n) { unsigned long nr_slabs; unsigned long nr_objs; unsigned long nr_free; - if (!n) - continue; - nr_free = count_partial(n, count_free); nr_slabs = node_nr_slabs(n); nr_objs = node_nr_objs(n); - printk(KERN_WARNING - " node %d: slabs: %ld, objs: %ld, free: %ld\n", + pr_warn(" node %d: slabs: %ld, objs: %ld, free: %ld\n", node, nr_slabs, nr_objs, nr_free); } +#endif } static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, @@ -2126,7 +2409,7 @@ page = new_slab(s, flags, node); if (page) { - c = __this_cpu_ptr(s->cpu_slab); + c = raw_cpu_ptr(s->cpu_slab); if (c->page) flush_slab(s, c); @@ -2155,8 +2438,8 @@ } /* - * Check the page->freelist of a page and either transfer the freelist to the per cpu freelist - * or deactivate the page. + * Check the page->freelist of a page and either transfer the freelist to the + * per cpu freelist or deactivate the page. * * The page is still frozen if the return value is not NULL. * @@ -2203,23 +2486,15 @@ * And if we were unable to get a new slab from the partial slab lists then * we need to allocate a new slab. This is the slowest path since it involves * a call to the page allocator and the setup of a new slab. + * + * Version of __slab_alloc to use when we know that interrupts are + * already disabled (which is the case for bulk allocation). 
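 *
 * __slab_alloc() below stays a thin wrapper for the normal allocation
 * path: it merely disables interrupts, refetches the per cpu slab pointer
 * (we may have been preempted and migrated in the meantime) and calls in
 * here, roughly:
 *
 *	local_irq_save(flags);
 *	c = this_cpu_ptr(s->cpu_slab);
 *	p = ___slab_alloc(s, gfpflags, node, addr, c);
 *	local_irq_restore(flags);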
*/ -static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, +static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, struct kmem_cache_cpu *c) { void *freelist; struct page *page; - unsigned long flags; - - local_irq_save(flags); -#ifdef CONFIG_PREEMPT - /* - * We may have been preempted and rescheduled on a different - * cpu before disabling interrupts. Need to reload cpu area - * pointer. - */ - c = this_cpu_ptr(s->cpu_slab); -#endif page = c->page; if (!page) @@ -2227,11 +2502,18 @@ redo: if (unlikely(!node_match(page, node))) { - stat(s, ALLOC_NODE_MISMATCH); - deactivate_slab(s, page, c->freelist); - c->page = NULL; - c->freelist = NULL; - goto new_slab; + int searchnode = node; + + if (node != NUMA_NO_NODE && !node_present_pages(node)) + searchnode = node_to_mem_node(node); + + if (unlikely(!node_match(page, searchnode))) { + stat(s, ALLOC_NODE_MISMATCH); + deactivate_slab(s, page, c->freelist); + c->page = NULL; + c->freelist = NULL; + goto new_slab; + } } /* @@ -2251,8 +2533,6 @@ if (freelist) goto load_freelist; - stat(s, ALLOC_SLOWPATH); - freelist = get_freelist(s, page); if (!freelist) { @@ -2272,7 +2552,6 @@ VM_BUG_ON(!c->page->frozen); c->freelist = get_freepointer(s, freelist); c->tid = next_tid(c->tid); - local_irq_restore(flags); return freelist; new_slab: @@ -2288,29 +2567,53 @@ freelist = new_slab_objects(s, gfpflags, node, &c); if (unlikely(!freelist)) { - if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) - slab_out_of_memory(s, gfpflags, node); - - local_irq_restore(flags); + slab_out_of_memory(s, gfpflags, node); return NULL; } page = c->page; - if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags))) - goto load_freelist; - - /* Only entered in the debug case */ - if (kmem_cache_debug(s) && !alloc_debug_processing(s, page, freelist, addr)) - goto new_slab; /* Slab failed checks. Next slab needed */ - + if (pfmemalloc_match(page, gfpflags)) { + if (likely(!kmem_cache_debug(s))) + goto load_freelist; + /* Only entered in the debug case */ + if (!alloc_debug_processing(s, page, freelist, addr)) + goto new_slab; /* Slab failed checks. Next slab needed */ + if (kmem_cache_fastdebug(s)) + /* deactivate_slab() is very expensive, switch off fastpath ! */ + goto load_freelist; + } deactivate_slab(s, page, get_freepointer(s, freelist)); c->page = NULL; c->freelist = NULL; - local_irq_restore(flags); return freelist; } /* + * Another one that disabled interrupt and compensates for possible + * cpu changes by refetching the per cpu area pointer. + */ +static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, + unsigned long addr, struct kmem_cache_cpu *c) +{ + void *p; + unsigned long flags; + + local_irq_save(flags); +#ifdef CONFIG_PREEMPT + /* + * We may have been preempted and rescheduled on a different + * cpu before disabling interrupts. Need to reload cpu area + * pointer. + */ + c = this_cpu_ptr(s->cpu_slab); +#endif + + p = ___slab_alloc(s, gfpflags, node, addr, c); + local_irq_restore(flags); + return p; +} + +/* * Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc) * have the fastpath folded into their functions. So no function call * overhead for requests that can be satisfied on the fastpath. 
@@ -2323,15 +2626,14 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr) { - void **object; + void *object; struct kmem_cache_cpu *c; struct page *page; unsigned long tid; - if (slab_pre_alloc_hook(s, gfpflags)) + s = slab_pre_alloc_hook(s, gfpflags); + if (!s) return NULL; - - s = memcg_kmem_get_cache(s, gfpflags); redo: /* * Must read kmem_cache cpu data via this cpu ptr. Preemption is @@ -2339,13 +2641,25 @@ * reading from one cpu area. That does not matter as long * as we end up on the original cpu again when doing the cmpxchg. * - * Preemption is disabled for the retrieval of the tid because that - * must occur from the current processor. We cannot allow rescheduling - * on a different processor between the determination of the pointer - * and the retrieval of the tid. + * We should guarantee that tid and kmem_cache are retrieved on + * the same cpu. It could be different if CONFIG_PREEMPT so we need + * to check if it is matched or not. */ - preempt_disable(); - c = __this_cpu_ptr(s->cpu_slab); + do { + tid = this_cpu_read(s->cpu_slab->tid); + c = raw_cpu_ptr(s->cpu_slab); + } while (IS_ENABLED(CONFIG_PREEMPT) && + unlikely(tid != READ_ONCE(c->tid))); + + /* + * Irqless object alloc/free algorithm used here depends on sequence + * of fetching cpu_slab's data. tid should be fetched before anything + * on c to guarantee that object and page associated with previous tid + * won't be used with current tid. If we fetch tid first, object and + * page could be one associated with next tid and our alloc/free + * request will be failed. In this case, we will retry. So, no problem. + */ + barrier(); /* * The transaction ids are globally unique per cpu and per operation on @@ -2353,28 +2667,28 @@ * occurs on the right processor and that there was no operation on the * linked list in between. */ - tid = c->tid; - preempt_enable(); object = c->freelist; page = c->page; - if (unlikely(!object || !node_match(page, node))) + if (unlikely(!object || !node_match(page, node))) { object = __slab_alloc(s, gfpflags, node, addr, c); - - else { + stat(s, ALLOC_SLOWPATH); + } else { void *next_object = get_freepointer_safe(s, object); /* * The cmpxchg will only match if there was no additional * operation and if we are on the right processor. * - * The cmpxchg does the following atomically (without lock semantics!) + * The cmpxchg does the following atomically (without lock + * semantics!) * 1. Relocate first pointer to the current per cpu area. * 2. Verify that tid and freelist have not been changed * 3. If they were not changed replace tid and freelist * - * Since this is without lock semantics the protection is only against - * code executing on this cpu *not* from access by other cpus. + * Since this is without lock semantics the protection is only + * against code executing on this cpu *not* from access by + * other cpus. 
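 *
 * The tid is what makes this safe: it is unique per cpu and advanced by
 * next_tid() on every allocation and free, so being migrated to another
 * cpu, or racing with any other alloc/free on this cpu, leaves a tid that
 * no longer matches and the cmpxchg fails, taking us back to redo.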
*/ if (unlikely(!this_cpu_cmpxchg_double( s->cpu_slab->freelist, s->cpu_slab->tid, @@ -2387,11 +2701,14 @@ prefetch_freepointer(s, next_object); stat(s, ALLOC_FASTPATH); } - +#ifdef CONFIG_SLUB_DEBUG + if ((s->flags & (SLAB_STORE_USER | SLAB_STORE_USER_LITE)) && object) + set_track(s, object, TRACK_ALLOC, addr); +#endif if (unlikely(gfpflags & __GFP_ZERO) && object) memset(object, 0, s->object_size); - slab_post_alloc_hook(s, gfpflags, object); + slab_post_alloc_hook(s, gfpflags, 1, &object); return object; } @@ -2406,7 +2723,8 @@ { void *ret = slab_alloc(s, gfpflags, _RET_IP_); - trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, s->size, gfpflags); + trace_kmem_cache_alloc(_RET_IP_, ret, s->object_size, + s->size, gfpflags); return ret; } @@ -2416,18 +2734,12 @@ void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) { void *ret = slab_alloc(s, gfpflags, _RET_IP_); + trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); + kasan_kmalloc(s, ret, size); return ret; } EXPORT_SYMBOL(kmem_cache_alloc_trace); - -void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) -{ - void *ret = kmalloc_order(size, flags, order); - trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags); - return ret; -} -EXPORT_SYMBOL(kmalloc_order_trace); #endif #ifdef CONFIG_NUMA @@ -2451,6 +2763,8 @@ trace_kmalloc_node(_RET_IP_, ret, size, s->size, gfpflags, node); + + kasan_kmalloc(s, ret, size); return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node_trace); @@ -2458,7 +2772,7 @@ #endif /* - * Slow patch handling. This may still be called frequently since objects + * Slow path handling. This may still be called frequently since objects * have a longer lifetime than the cpu slabs in most processing loads. * * So we still attempt to reduce cache line usage. Just take the slab @@ -2466,10 +2780,11 @@ * handling required then we can return immediately. */ static void __slab_free(struct kmem_cache *s, struct page *page, - void *x, unsigned long addr) + void *head, void *tail, int cnt, + unsigned long addr) + { void *prior; - void **object = (void *)x; int was_frozen; struct page new; unsigned long counters; @@ -2478,10 +2793,13 @@ stat(s, FREE_SLOWPATH); - if (kmem_cache_debug(s) && - !(n = free_debug_processing(s, page, x, addr, &flags))) - return; - + if (kmem_cache_debug(s)) { + if (!kmem_cache_fastdebug(s) || (cnt > 1)) { + n = free_debug_processing(s, page, head, tail, cnt, addr, &flags); + if (!n) + return; + } + } do { if (unlikely(n)) { spin_unlock_irqrestore(&n->list_lock, flags); @@ -2489,23 +2807,25 @@ } prior = page->freelist; counters = page->counters; - set_freepointer(s, object, prior); + set_freepointer(s, tail, prior); new.counters = counters; was_frozen = new.frozen; - new.inuse--; + new.inuse -= cnt; if ((!new.inuse || !prior) && !was_frozen) { - if (!kmem_cache_debug(s) && !prior) + if (kmem_cache_has_cpu_partial(s) && !prior) { /* - * Slab was on no list before and will be partially empty - * We can defer the list move and instead freeze it. + * Slab was on no list before and will be + * partially empty + * We can defer the list move and instead + * freeze it. */ new.frozen = 1; - else { /* Needs to be taken off a list */ + } else { /* Needs to be taken off a list */ - n = get_node(s, page_to_nid(page)); + n = get_node(s, page_to_nid(page)); /* * Speculatively acquire the list_lock. 
* If the cmpxchg does not succeed then we may @@ -2521,7 +2841,7 @@ } while (!cmpxchg_double_slab(s, page, prior, counters, - object, new.counters, + head, new.counters, "__slab_free")); if (likely(!n)) { @@ -2538,20 +2858,21 @@ * The list lock was not taken therefore no list * activity can be necessary. */ - if (was_frozen) - stat(s, FREE_FROZEN); - return; - } + if (was_frozen) + stat(s, FREE_FROZEN); + return; + } - if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) + if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) goto slab_empty; /* * Objects left in the slab. If it was not on the partial list before * then add it. */ - if (kmem_cache_debug(s) && unlikely(!prior)) { - remove_full(s, page); + if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) { + if (kmem_cache_debug(s)) + remove_full(s, n, page); add_partial(n, page, DEACTIVATE_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } @@ -2565,9 +2886,10 @@ */ remove_partial(n, page); stat(s, FREE_REMOVE_PARTIAL); - } else + } else { /* Slab must be on the full list */ - remove_full(s, page); + remove_full(s, n, page); + } spin_unlock_irqrestore(&n->list_lock, flags); stat(s, FREE_SLAB); @@ -2584,43 +2906,54 @@ * * If fastpath is not possible then fall back to __slab_free where we deal * with all sorts of special processing. - */ -static __always_inline void slab_free(struct kmem_cache *s, - struct page *page, void *x, unsigned long addr) + * + * Bulk free of a freelist with several objects (all pointing to the + * same page) possible by specifying head and tail ptr, plus objects + * count (cnt). Bulk free indicated by tail pointer being set. + */ +static __always_inline void slab_free(struct kmem_cache *s, struct page *page, + void *head, void *tail, int cnt, + unsigned long addr) { - void **object = (void *)x; + void *tail_obj = tail ? : head; struct kmem_cache_cpu *c; unsigned long tid; - slab_free_hook(s, x); - + slab_free_freelist_hook(s, head, tail); + if ((kmem_cache_fastdebug(s)) && (cnt == 1)) { + if (!free_debug_processing_fast(s, page, tail_obj, addr)) + return; + } redo: /* * Determine the currently cpus per cpu slab. * The cpu may change afterward. However that does not matter since * data is retrieved via this pointer. If we are on the same cpu - * during the cmpxchg then the free will succedd. + * during the cmpxchg then the free will succeed. 
*/ - preempt_disable(); - c = __this_cpu_ptr(s->cpu_slab); + do { + tid = this_cpu_read(s->cpu_slab->tid); + c = raw_cpu_ptr(s->cpu_slab); + } while (IS_ENABLED(CONFIG_PREEMPT) && + unlikely(tid != READ_ONCE(c->tid))); - tid = c->tid; - preempt_enable(); + /* Same with comment on barrier() in slab_alloc_node() */ + barrier(); if (likely(page == c->page)) { - set_freepointer(s, object, c->freelist); + set_freepointer(s, tail_obj, c->freelist); if (unlikely(!this_cpu_cmpxchg_double( s->cpu_slab->freelist, s->cpu_slab->tid, c->freelist, tid, - object, next_tid(tid)))) { + head, next_tid(tid)))) { note_cmpxchg_failure("slab_free", s, tid); goto redo; } stat(s, FREE_FASTPATH); } else - __slab_free(s, page, x, addr); + __slab_free(s, page, head, tail_obj, cnt, addr); } @@ -2629,11 +2962,166 @@ s = cache_from_obj(s, x); if (!s) return; - slab_free(s, virt_to_head_page(x), x, _RET_IP_); + slab_free(s, virt_to_head_page(x), x, NULL, 1, _RET_IP_); trace_kmem_cache_free(_RET_IP_, x); } EXPORT_SYMBOL(kmem_cache_free); +struct detached_freelist { + struct page *page; + void *tail; + void *freelist; + int cnt; + struct kmem_cache *s; +}; + +/* + * This function progressively scans the array with free objects (with + * a limited look ahead) and extract objects belonging to the same + * page. It builds a detached freelist directly within the given + * page/objects. This can happen without any need for + * synchronization, because the objects are owned by running process. + * The freelist is build up as a single linked list in the objects. + * The idea is, that this detached freelist can then be bulk + * transferred to the real freelist(s), but only requiring a single + * synchronization primitive. Look ahead in the array is limited due + * to performance reasons. + */ +static inline +int build_detached_freelist(struct kmem_cache *s, size_t size, + void **p, struct detached_freelist *df) +{ + size_t first_skipped_index = 0; + int lookahead = 3; + void *object; + + /* Always re-init detached_freelist */ + df->page = NULL; + + do { + object = p[--size]; + } while (!object && size); + + if (!object) + return 0; + + /* Support for memcg, compiler can optimize this out */ + df->s = cache_from_obj(s, object); + + /* Start new detached freelist */ + set_freepointer(df->s, object, NULL); + df->page = virt_to_head_page(object); + df->tail = object; + df->freelist = object; + p[size] = NULL; /* mark object processed */ + df->cnt = 1; + + while (size) { + object = p[--size]; + if (!object) + continue; /* Skip processed objects */ + + /* df->page is always set at this point */ + if (df->page == virt_to_head_page(object)) { + /* Opportunity build freelist */ + set_freepointer(df->s, object, df->freelist); + df->freelist = object; + df->cnt++; + p[size] = NULL; /* mark object processed */ + + continue; + } + + /* Limit look ahead search */ + if (!--lookahead) + break; + + if (!first_skipped_index) + first_skipped_index = size + 1; + } + + return first_skipped_index; +} + +/* Note that interrupts must be enabled when calling this function. */ +void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) +{ + if (WARN_ON(!size)) + return; + + do { + struct detached_freelist df; + + size = build_detached_freelist(s, size, p, &df); + if (unlikely(!df.page)) + continue; + + slab_free(df.s, df.page, df.freelist, df.tail, df.cnt,_RET_IP_); + } while (likely(size)); +} +EXPORT_SYMBOL(kmem_cache_free_bulk); + +/* Note that interrupts must be enabled when calling this function. 
*/ +int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) +{ + struct kmem_cache_cpu *c; + int i; + + /* memcg and kmem_cache debug support */ + s = slab_pre_alloc_hook(s, flags); + if (unlikely(!s)) + return false; + /* + * Drain objects in the per cpu slab, while disabling local + * IRQs, which protects against PREEMPT and interrupts + * handlers invoking normal fastpath. + */ + local_irq_disable(); + c = this_cpu_ptr(s->cpu_slab); + + for (i = 0; i < size; i++) { + void *object = c->freelist; + + if (unlikely(!object)) { + /* + * Invoking slow path likely have side-effect + * of re-populating per CPU c->freelist + */ + p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, + _RET_IP_, c); + if (unlikely(!p[i])) + goto error; + + c = this_cpu_ptr(s->cpu_slab); + continue; /* goto for-loop */ + } + c->freelist = get_freepointer(s, object); + p[i] = object; + } + c->tid = next_tid(c->tid); + local_irq_enable(); + + /* Clear memory outside IRQ disabled fastpath loop */ + if (unlikely(flags & __GFP_ZERO)) { + int j; + + for (j = 0; j < i; j++) + memset(p[j], 0, s->object_size); + } + + /* memcg and kmem_cache debug support */ + slab_post_alloc_hook(s, flags, size, p); + return i; +error: + local_irq_enable(); + slab_post_alloc_hook(s, flags, i, p); + __kmem_cache_free_bulk(s, i, p); + return 0; +} +EXPORT_SYMBOL(kmem_cache_alloc_bulk); + + /* * Object placement in a slab is made very easy because we always start at * offset 0. If we tune the size of the object to the alignment then we can @@ -2658,12 +3146,6 @@ static int slub_min_objects; /* - * Merge control. If this is set then no merging of slab caches will occur. - * (Could be removed. This was introduced to pacify the merge skeptics.) - */ -static int slub_nomerge; - -/* * Calculate the order of allocation given an slab object size. * * The order of allocation has significant impact on performance and other @@ -2698,20 +3180,15 @@ if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE) return get_order(size * MAX_OBJS_PER_PAGE) - 1; - for (order = max(min_order, - fls(min_objects * size - 1) - PAGE_SHIFT); + for (order = max(min_order, get_order(min_objects * size + reserved)); order <= max_order; order++) { unsigned long slab_size = PAGE_SIZE << order; - if (slab_size < min_objects * size + reserved) - continue; - rem = (slab_size - reserved) % size; if (rem <= slab_size / fract_leftover) break; - } return order; @@ -2729,7 +3206,7 @@ * works by first attempting to generate a layout with * the best configuration and backing off gradually. * - * First we reduce the acceptable waste in a slab. Then + * First we increase the acceptable waste in a slab. Then * we reduce the minimum objects required in a slab. */ min_objects = slub_min_objects; @@ -2807,8 +3284,8 @@ * slab on the node for this slabcache. There are no concurrent accesses * possible. * - * Note that this function only works on the kmalloc_node_cache - * when allocating for the kmalloc_node_cache. This is used for bootstrapping + * Note that this function only works on the kmem_cache_node + * when allocating for the kmem_cache_node. This is used for bootstrapping * memory on a fresh node that has no slab structures yet. 
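 * (The circular dependency broken here: allocating the struct through
 * the normal slab paths would itself require this node's
 * kmem_cache_node to already exist, so the very first one is taken
 * straight off a freshly allocated slab page by hand below.)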
*/ static void early_kmem_cache_node_alloc(int node) @@ -2822,10 +3299,8 @@ BUG_ON(!page); if (page_to_nid(page) != node) { - printk(KERN_ERR "SLUB: Unable to allocate memory from " - "node %d\n", node); - printk(KERN_ERR "SLUB: Allocating a useless per node structure " - "in order to be able to continue\n"); + pr_err("SLUB: Unable to allocate memory from node %d\n", node); + pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n"); } n = page->freelist; @@ -2838,22 +3313,24 @@ init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); init_tracking(kmem_cache_node, n); #endif + kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node)); init_kmem_cache_node(n); inc_slabs_node(kmem_cache_node, node, page->objects); - add_partial(n, page, DEACTIVATE_TO_HEAD); + /* + * No locks need to be taken here as it has just been + * initialized and there is no concurrent access. + */ + __add_partial(n, page, DEACTIVATE_TO_HEAD); } static void free_kmem_cache_nodes(struct kmem_cache *s) { int node; + struct kmem_cache_node *n; - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = s->node[node]; - - if (n) - kmem_cache_free(kmem_cache_node, n); - + for_each_kmem_cache_node(s, node, n) { + kmem_cache_free(kmem_cache_node, n); s->node[node] = NULL; } } @@ -2891,7 +3368,9 @@ min = MAX_PARTIAL; s->min_partial = min; } - +#ifndef CONFIG_KASAN +#define KASAN_SHADOW_SCALE_SHIFT 0 +#endif /* * calculate_sizes() determines the order and the distribution of data within * a slab object. @@ -2908,6 +3387,7 @@ * the possible location of the free pointer. */ size = ALIGN(size, sizeof(void *)); + size = ALIGN(size, 1UL << KASAN_SHADOW_SCALE_SHIFT); #ifdef CONFIG_SLUB_DEBUG /* @@ -2952,12 +3432,12 @@ } #ifdef CONFIG_SLUB_DEBUG - if (flags & SLAB_STORE_USER) + if (flags & (SLAB_STORE_USER | SLAB_STORE_USER_LITE)) /* * Need to store information about allocs and frees after * the object. */ - size += 2 * sizeof(struct track); + size += 2 * track_size(flags); if (flags & SLAB_RED_ZONE) /* @@ -3056,10 +3536,10 @@ * A) The number of objects from per cpu partial slabs dumped to the * per node list when we reach the limit. * B) The number of objects in cpu partial slabs to extract from the - * per node list when we run out of per cpu objects. We only fetch 50% - * to keep some capacity around for frees. + * per node list when we run out of per cpu objects. We only fetch + * 50% to keep some capacity around for frees. 
*/ - if (kmem_cache_debug(s)) + if (!kmem_cache_has_cpu_partial(s)) s->cpu_partial = 0; else if (s->size >= PAGE_SIZE) s->cpu_partial = 2; @@ -3084,8 +3564,8 @@ if (flags & SLAB_PANIC) panic("Cannot create slab %s size=%lu realsize=%u " "order=%u offset=%u flags=%lx\n", - s->name, (unsigned long)s->size, s->size, oo_order(s->oo), - s->offset, flags); + s->name, (unsigned long)s->size, s->size, + oo_order(s->oo), s->offset, flags); return -EINVAL; } @@ -3106,8 +3586,7 @@ for_each_object(p, s, addr, page->objects) { if (!test_bit(slab_index(p, s, addr), map)) { - printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n", - p, p - addr); + pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr); print_tracking(s, p); } } @@ -3127,7 +3606,7 @@ list_for_each_entry_safe(page, h, &n->partial, lru) { if (!page->inuse) { - remove_partial(n, page); + __remove_partial(n, page); discard_slab(s, page); } else { list_slab_objects(s, page, @@ -3142,12 +3621,11 @@ static inline int kmem_cache_close(struct kmem_cache *s) { int node; + struct kmem_cache_node *n; flush_all(s); /* Attempt to free all objects */ - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); - + for_each_kmem_cache_node(s, node, n) { free_partial(s, n); if (n->nr_partial || slabs_node(s, node)) return 1; @@ -3159,23 +3637,7 @@ int __kmem_cache_shutdown(struct kmem_cache *s) { - int rc = kmem_cache_close(s); - - if (!rc) { - /* - * We do the same lock strategy around sysfs_slab_add, see - * __kmem_cache_create. Because this is pretty much the last - * operation we do and the lock will be released shortly after - * that in slab_common.c, we could just move sysfs_slab_remove - * to a later point in common code. We should do that when we - * have a common sysfs framework for all allocators. 
- */ - mutex_unlock(&slab_mutex); - sysfs_slab_remove(s); - mutex_lock(&slab_mutex); - } - - return rc; + return kmem_cache_close(s); } /******************************************************************** @@ -3210,14 +3672,6 @@ __setup("slub_min_objects=", setup_slub_min_objects); -static int __init setup_slub_nomerge(char *str) -{ - slub_nomerge = 1; - return 1; -} - -__setup("slub_nomerge", setup_slub_nomerge); - void *__kmalloc(size_t size, gfp_t flags) { struct kmem_cache *s; @@ -3235,6 +3689,8 @@ trace_kmalloc(_RET_IP_, ret, size, s->size, flags); + kasan_kmalloc(s, ret, size); + return ret; } EXPORT_SYMBOL(__kmalloc); @@ -3245,12 +3701,12 @@ struct page *page; void *ptr = NULL; - flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG; - page = alloc_pages_node(node, flags, get_order(size)); + flags |= __GFP_COMP | __GFP_NOTRACK; + page = alloc_kmem_pages_node(node, flags, get_order(size)); if (page) ptr = page_address(page); - kmemleak_alloc(ptr, size, 1, flags); + kmalloc_large_node_hook(ptr, size, flags); return ptr; } @@ -3278,12 +3734,14 @@ trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node); + kasan_kmalloc(s, ret, size); + return ret; } EXPORT_SYMBOL(__kmalloc_node); #endif -size_t ksize(const void *object) +static size_t __ksize(const void *object) { struct page *page; @@ -3299,43 +3757,16 @@ return slab_ksize(page->slab_cache); } -EXPORT_SYMBOL(ksize); -#ifdef CONFIG_SLUB_DEBUG -bool verify_mem_not_deleted(const void *x) +size_t ksize(const void *object) { - struct page *page; - void *object = (void *)x; - unsigned long flags; - bool rv; - - if (unlikely(ZERO_OR_NULL_PTR(x))) - return false; - - local_irq_save(flags); - - page = virt_to_head_page(x); - if (unlikely(!PageSlab(page))) { - /* maybe it was from stack? */ - rv = true; - goto out_unlock; - } - - slab_lock(page); - if (on_freelist(page->slab_cache, page, object)) { - object_err(page->slab_cache, page, object, "Object is on free-list"); - rv = false; - } else { - rv = true; - } - slab_unlock(page); - -out_unlock: - local_irq_restore(flags); - return rv; + size_t size = __ksize(object); + /* We assume that ksize callers could use whole allocated area, + so we need unpoison this area. */ + kasan_krealloc(object, size); + return size; } -EXPORT_SYMBOL(verify_mem_not_deleted); -#endif +EXPORT_SYMBOL(ksize); void kfree(const void *x) { @@ -3350,81 +3781,101 @@ page = virt_to_head_page(x); if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); - kmemleak_free(x); - __free_memcg_kmem_pages(page, compound_order(page)); + kfree_hook(x); + __free_kmem_pages(page, compound_order(page)); return; } - slab_free(page->slab_cache, page, object, _RET_IP_); + slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_); } EXPORT_SYMBOL(kfree); +#define SHRINK_PROMOTE_MAX 32 + /* - * kmem_cache_shrink removes empty slabs from the partial lists and sorts - * the remaining slabs by the number of items in use. The slabs with the - * most items in use come first. New allocations will then fill those up - * and thus they can be removed from the partial lists. + * kmem_cache_shrink discards empty slabs and promotes the slabs filled + * up most to the head of the partial lists. New allocations will then + * fill those up and thus they can be removed from the partial lists. * * The slabs with the least items are placed last. This results in them * being allocated from last increasing the chance that the last objects * are freed in them. 
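 *
 * Worked example of the bucketing done below: with SHRINK_PROMOTE_MAX
 * at 32, a partial slab with exactly one free object is moved onto
 * promote[0]; since the splice loop walks promote[] from the highest
 * index down to 0, that slab ends up right at the head of the partial
 * list. A slab whose objects are all free is moved to the local
 * discard list instead and released once the list_lock is dropped.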
*/ -int kmem_cache_shrink(struct kmem_cache *s) +int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate) { int node; int i; struct kmem_cache_node *n; struct page *page; struct page *t; - int objects = oo_objects(s->max); - struct list_head *slabs_by_inuse = - kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL); + struct list_head discard; + struct list_head promote[SHRINK_PROMOTE_MAX]; unsigned long flags; + int ret = 0; - if (!slabs_by_inuse) - return -ENOMEM; - - flush_all(s); - for_each_node_state(node, N_NORMAL_MEMORY) { - n = get_node(s, node); + if (deactivate) { + /* + * Disable empty slabs caching. Used to avoid pinning offline + * memory cgroups by kmem pages that can be freed. + */ + s->cpu_partial = 0; + s->min_partial = 0; - if (!n->nr_partial) - continue; + /* + * s->cpu_partial is checked locklessly (see put_cpu_partial), + * so we have to make sure the change is visible. + */ + kick_all_cpus_sync(); + } - for (i = 0; i < objects; i++) - INIT_LIST_HEAD(slabs_by_inuse + i); + flush_all(s); + for_each_kmem_cache_node(s, node, n) { + INIT_LIST_HEAD(&discard); + for (i = 0; i < SHRINK_PROMOTE_MAX; i++) + INIT_LIST_HEAD(promote + i); spin_lock_irqsave(&n->list_lock, flags); /* - * Build lists indexed by the items in use in each slab. + * Build lists of slabs to discard or promote. * * Note that concurrent frees may occur while we hold the * list_lock. page->inuse here is the upper limit. */ list_for_each_entry_safe(page, t, &n->partial, lru) { - list_move(&page->lru, slabs_by_inuse + page->inuse); - if (!page->inuse) + int free = page->objects - page->inuse; + + /* Do not reread page->inuse */ + barrier(); + + /* We do not keep full slabs on the list */ + BUG_ON(free <= 0); + + if (free == page->objects) { + list_move(&page->lru, &discard); n->nr_partial--; + } else if (free <= SHRINK_PROMOTE_MAX) + list_move(&page->lru, promote + free - 1); } /* - * Rebuild the partial list with the slabs filled up most - * first and the least used slabs at the end. + * Promote the slabs filled up most to the head of the + * partial list. */ - for (i = objects - 1; i > 0; i--) - list_splice(slabs_by_inuse + i, n->partial.prev); + for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--) + list_splice(promote + i, &n->partial); spin_unlock_irqrestore(&n->list_lock, flags); /* Release empty slabs */ - list_for_each_entry_safe(page, t, slabs_by_inuse, lru) + list_for_each_entry_safe(page, t, &discard, lru) discard_slab(s, page); + + if (slabs_node(s, node)) + ret = 1; } - kfree(slabs_by_inuse); - return 0; + return ret; } -EXPORT_SYMBOL(kmem_cache_shrink); static int slab_mem_going_offline_callback(void *arg) { @@ -3432,7 +3883,7 @@ mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) - kmem_cache_shrink(s); + __kmem_cache_shrink(s, false); mutex_unlock(&slab_mutex); return 0; @@ -3559,6 +4010,7 @@ { int node; struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); + struct kmem_cache_node *n; memcpy(s, static_cache, kmem_cache->object_size); @@ -3568,20 +4020,18 @@ * IPIs around. 
*/ __flush_cpu_slab(s, smp_processor_id()); - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + for_each_kmem_cache_node(s, node, n) { struct page *p; - if (n) { - list_for_each_entry(p, &n->partial, lru) - p->slab_cache = s; + list_for_each_entry(p, &n->partial, lru) + p->slab_cache = s; #ifdef CONFIG_SLUB_DEBUG - list_for_each_entry(p, &n->full, lru) - p->slab_cache = s; + list_for_each_entry(p, &n->full, lru) + p->slab_cache = s; #endif - } } + slab_init_memcg_params(s); list_add(&s->list, &slab_caches); return s; } @@ -3620,15 +4070,14 @@ kmem_cache_node = bootstrap(&boot_kmem_cache_node); /* Now we can use the kmem_cache to allocate kmalloc slabs */ + setup_kmalloc_cache_index_table(); create_kmalloc_caches(0); #ifdef CONFIG_SMP register_cpu_notifier(&slab_notifier); #endif - printk(KERN_INFO - "SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d," - " CPUs=%d, Nodes=%d\n", + pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%d, Nodes=%d\n", cache_line_size(), slub_min_order, slub_max_order, slub_min_objects, nr_cpu_ids, nr_node_ids); @@ -3638,79 +4087,16 @@ { } -/* - * Find a mergeable slab cache - */ -static int slab_unmergeable(struct kmem_cache *s) -{ - if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE)) - return 1; - - if (s->ctor) - return 1; - - /* - * We may have set a slab to be unmergeable during bootstrap. - */ - if (s->refcount < 0) - return 1; - - return 0; -} - -static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size, - size_t align, unsigned long flags, const char *name, - void (*ctor)(void *)) -{ - struct kmem_cache *s; - - if (slub_nomerge || (flags & SLUB_NEVER_MERGE)) - return NULL; - - if (ctor) - return NULL; - - size = ALIGN(size, sizeof(void *)); - align = calculate_alignment(flags, align, size); - size = ALIGN(size, align); - flags = kmem_cache_flags(size, flags, name, NULL); - - list_for_each_entry(s, &slab_caches, list) { - if (slab_unmergeable(s)) - continue; - - if (size > s->size) - continue; - - if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME)) - continue; - /* - * Check if alignment is compatible. - * Courtesy of Adrian Drzewiecki - */ - if ((s->size & ~(align - 1)) != s->size) - continue; - - if (s->size - size >= sizeof(void *)) - continue; - - if (!cache_match_memcg(s, memcg)) - continue; - - return s; - } - return NULL; -} - struct kmem_cache * -__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size, - size_t align, unsigned long flags, void (*ctor)(void *)) +__kmem_cache_alias(const char *name, size_t size, size_t align, + unsigned long flags, void (*ctor)(void *)) { - struct kmem_cache *s; + struct kmem_cache *s, *c; - s = find_mergeable(memcg, size, align, flags, name, ctor); + s = find_mergeable(size, align, flags, name, ctor); if (s) { s->refcount++; + /* * Adjust the object sizes so that we clear * the complete object on kzalloc. @@ -3718,6 +4104,12 @@ s->object_size = max(s->object_size, (int)size); s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); + for_each_memcg_cache(c, s) { + c->object_size = s->object_size; + c->inuse = max_t(int, c->inuse, + ALIGN(size, sizeof(void *))); + } + if (sysfs_slab_alias(s, name)) { s->refcount--; s = NULL; @@ -3740,10 +4132,7 @@ return 0; memcg_propagate_slab_attrs(s); - mutex_unlock(&slab_mutex); err = sysfs_slab_add(s); - mutex_lock(&slab_mutex); - if (err) kmem_cache_close(s); @@ -3755,7 +4144,7 @@ * Use the cpu notifier to insure that the cpu slabs are flushed when * necessary. 
*/ -static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, +static int slab_cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -3781,7 +4170,7 @@ return NOTIFY_OK; } -static struct notifier_block __cpuinitdata slab_notifier = { +static struct notifier_block slab_notifier = { .notifier_call = slab_cpuup_callback }; @@ -3839,7 +4228,7 @@ } #endif -#ifdef CONFIG_SYSFS +#ifdef SLAB_SUPPORTS_SYSFS static int count_inuse(struct page *page) { return page->inuse; @@ -3901,10 +4290,10 @@ count++; } if (count != n->nr_partial) - printk(KERN_ERR "SLUB %s: %ld partial slabs counted but " - "counter=%ld\n", s->name, count, n->nr_partial); + pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n", + s->name, count, n->nr_partial); - if (!(s->flags & SLAB_STORE_USER)) + if (!(s->flags & (SLAB_STORE_USER | SLAB_STORE_USER_LITE))) goto out; list_for_each_entry(page, &n->full, lru) { @@ -3912,9 +4301,8 @@ count++; } if (count != atomic_long_read(&n->nr_slabs)) - printk(KERN_ERR "SLUB: %s %ld slabs counted but " - "counter=%ld\n", s->name, count, - atomic_long_read(&n->nr_slabs)); + pr_err("SLUB: %s %ld slabs counted but counter=%ld\n", + s->name, count, atomic_long_read(&n->nr_slabs)); out: spin_unlock_irqrestore(&n->list_lock, flags); @@ -3927,16 +4315,14 @@ unsigned long count = 0; unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long), GFP_KERNEL); + struct kmem_cache_node *n; if (!map) return -ENOMEM; flush_all(s); - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); - + for_each_kmem_cache_node(s, node, n) count += validate_slab_node(s, n, map); - } kfree(map); return count; } @@ -4090,6 +4476,7 @@ int node; unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * sizeof(unsigned long), GFP_KERNEL); + struct kmem_cache_node *n; if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), GFP_TEMPORARY)) { @@ -4099,8 +4486,7 @@ /* Push back cpu slabs */ flush_all(s); - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + for_each_kmem_cache_node(s, node, n) { unsigned long flags; struct page *page; @@ -4145,18 +4531,16 @@ if (num_online_cpus() > 1 && !cpumask_empty(to_cpumask(l->cpus)) && - len < PAGE_SIZE - 60) { - len += sprintf(buf + len, " cpus="); - len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50, - to_cpumask(l->cpus)); - } + len < PAGE_SIZE - 60) + len += scnprintf(buf + len, PAGE_SIZE - len - 50, + " cpus=%*pbl", + cpumask_pr_args(to_cpumask(l->cpus))); if (nr_online_nodes > 1 && !nodes_empty(l->nodes) && - len < PAGE_SIZE - 60) { - len += sprintf(buf + len, " nodes="); - len += nodelist_scnprintf(buf + len, PAGE_SIZE - len - 50, - l->nodes); - } + len < PAGE_SIZE - 60) + len += scnprintf(buf + len, PAGE_SIZE - len - 50, + " nodes=%*pbl", + nodemask_pr_args(&l->nodes)); len += sprintf(buf + len, "\n"); } @@ -4170,68 +4554,65 @@ #endif #ifdef SLUB_RESILIENCY_TEST -static void resiliency_test(void) +static void __init resiliency_test(void) { u8 *p; BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10); - printk(KERN_ERR "SLUB resiliency testing\n"); - printk(KERN_ERR "-----------------------\n"); - printk(KERN_ERR "A. Corruption after allocation\n"); + pr_err("SLUB resiliency testing\n"); + pr_err("-----------------------\n"); + pr_err("A. Corruption after allocation\n"); p = kzalloc(16, GFP_KERNEL); p[16] = 0x12; - printk(KERN_ERR "\n1. 
kmalloc-16: Clobber Redzone/next pointer" - " 0x12->0x%p\n\n", p + 16); + pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n", + p + 16); validate_slab_cache(kmalloc_caches[4]); /* Hmmm... The next two are dangerous */ p = kzalloc(32, GFP_KERNEL); p[32 + sizeof(void *)] = 0x34; - printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" - " 0x34 -> -0x%p\n", p); - printk(KERN_ERR - "If allocated object is overwritten then not detectable\n\n"); + pr_err("\n2. kmalloc-32: Clobber next pointer/next slab 0x34 -> -0x%p\n", + p); + pr_err("If allocated object is overwritten then not detectable\n\n"); validate_slab_cache(kmalloc_caches[5]); p = kzalloc(64, GFP_KERNEL); p += 64 + (get_cycles() & 0xff) * sizeof(void *); *p = 0x56; - printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", - p); - printk(KERN_ERR - "If allocated object is overwritten then not detectable\n\n"); + pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", + p); + pr_err("If allocated object is overwritten then not detectable\n\n"); validate_slab_cache(kmalloc_caches[6]); - printk(KERN_ERR "\nB. Corruption after free\n"); + pr_err("\nB. Corruption after free\n"); p = kzalloc(128, GFP_KERNEL); kfree(p); *p = 0x78; - printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); + pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); validate_slab_cache(kmalloc_caches[7]); p = kzalloc(256, GFP_KERNEL); kfree(p); p[50] = 0x9a; - printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", - p); + pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); validate_slab_cache(kmalloc_caches[8]); p = kzalloc(512, GFP_KERNEL); kfree(p); p[512] = 0xab; - printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); + pr_err("\n3. 
kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); validate_slab_cache(kmalloc_caches[9]); } #else -#ifdef CONFIG_SYSFS +#ifdef SLAB_SUPPORTS_SYSFS static void resiliency_test(void) {}; #endif #endif -#ifdef CONFIG_SYSFS +#ifdef SLAB_SUPPORTS_SYSFS enum slab_stat_type { SL_ALL, /* All slabs */ SL_PARTIAL, /* Only partially allocated slabs */ @@ -4253,22 +4634,21 @@ int node; int x; unsigned long *nodes; - unsigned long *per_cpu; - nodes = kzalloc(2 * sizeof(unsigned long) * nr_node_ids, GFP_KERNEL); + nodes = kzalloc(sizeof(unsigned long) * nr_node_ids, GFP_KERNEL); if (!nodes) return -ENOMEM; - per_cpu = nodes + nr_node_ids; if (flags & SO_CPU) { int cpu; for_each_possible_cpu(cpu) { - struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, + cpu); int node; struct page *page; - page = ACCESS_ONCE(c->page); + page = READ_ONCE(c->page); if (!page) continue; @@ -4283,7 +4663,7 @@ total += x; nodes[node] += x; - page = ACCESS_ONCE(c->partial); + page = READ_ONCE(c->partial); if (page) { node = page_to_nid(page); if (flags & SO_TOTAL) @@ -4295,23 +4675,21 @@ total += x; nodes[node] += x; } - - per_cpu[node]++; } } - lock_memory_hotplug(); + get_online_mems(); #ifdef CONFIG_SLUB_DEBUG if (flags & SO_ALL) { - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + struct kmem_cache_node *n; - if (flags & SO_TOTAL) - x = atomic_long_read(&n->total_objects); - else if (flags & SO_OBJECTS) - x = atomic_long_read(&n->total_objects) - - count_partial(n, count_free); + for_each_kmem_cache_node(s, node, n) { + if (flags & SO_TOTAL) + x = atomic_long_read(&n->total_objects); + else if (flags & SO_OBJECTS) + x = atomic_long_read(&n->total_objects) - + count_partial(n, count_free); else x = atomic_long_read(&n->nr_slabs); total += x; @@ -4321,9 +4699,9 @@ } else #endif if (flags & SO_PARTIAL) { - for_each_node_state(node, N_NORMAL_MEMORY) { - struct kmem_cache_node *n = get_node(s, node); + struct kmem_cache_node *n; + for_each_kmem_cache_node(s, node, n) { if (flags & SO_TOTAL) x = count_partial(n, count_total); else if (flags & SO_OBJECTS) @@ -4336,12 +4714,12 @@ } x = sprintf(buf, "%lu", total); #ifdef CONFIG_NUMA - for_each_node_state(node, N_NORMAL_MEMORY) + for (node = 0; node < nr_node_ids; node++) if (nodes[node]) x += sprintf(buf + x, " N%d=%lu", node, nodes[node]); #endif - unlock_memory_hotplug(); + put_online_mems(); kfree(nodes); return x + sprintf(buf + x, "\n"); } @@ -4350,16 +4728,12 @@ static int any_slab_objects(struct kmem_cache *s) { int node; + struct kmem_cache_node *n; - for_each_online_node(node) { - struct kmem_cache_node *n = get_node(s, node); - - if (!n) - continue; - + for_each_kmem_cache_node(s, node, n) if (atomic_long_read(&n->total_objects)) return 1; - } + return 0; } #endif @@ -4411,7 +4785,7 @@ unsigned long order; int err; - err = strict_strtoul(buf, 10, &order); + err = kstrtoul(buf, 10, &order); if (err) return err; @@ -4439,7 +4813,7 @@ unsigned long min; int err; - err = strict_strtoul(buf, 10, &min); + err = kstrtoul(buf, 10, &min); if (err) return err; @@ -4459,10 +4833,10 @@ unsigned long objects; int err; - err = strict_strtoul(buf, 10, &objects); + err = kstrtoul(buf, 10, &objects); if (err) return err; - if (objects && kmem_cache_debug(s)) + if (objects && !kmem_cache_has_cpu_partial(s)) return -EINVAL; s->cpu_partial = objects; @@ -4481,7 +4855,7 @@ static ssize_t aliases_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", s->refcount - 1); + 
return sprintf(buf, "%d\n", s->refcount < 0 ? 0 : s->refcount - 1); } SLAB_ATTR_RO(aliases); @@ -4619,6 +4993,14 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf, size_t length) { + /* + * Tracing a merged cache is going to give confusing results + * as well as cause other issues like converting a mergeable + * cache into an umergeable one. + */ + if (s->refcount > 1) + return -EINVAL; + s->flags &= ~SLAB_TRACE; if (buf[0] == '1') { s->flags &= ~__CMPXCHG_DOUBLE; @@ -4691,6 +5073,29 @@ } SLAB_ATTR(store_user); +#if defined(CONFIG_SLUB_AVM_ALLOC_LIST) +static ssize_t store_user_lite_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER_LITE)); +} + +static ssize_t store_user_lite_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + if (any_slab_objects(s)) + return -EBUSY; + + s->flags &= ~SLAB_STORE_USER_LITE; + if (buf[0] == '1') { + s->flags &= ~__CMPXCHG_DOUBLE; + s->flags |= SLAB_STORE_USER_LITE; + } + calculate_sizes(s, -1); + return length; +} +SLAB_ATTR(store_user_lite); +#endif/*--- #if defined(CONFIG_SLUB_AVM_ALLOC_LIST) ---*/ + static ssize_t validate_show(struct kmem_cache *s, char *buf) { return 0; @@ -4736,6 +5141,9 @@ static ssize_t failslab_store(struct kmem_cache *s, const char *buf, size_t length) { + if (s->refcount > 1) + return -EINVAL; + s->flags &= ~SLAB_FAILSLAB; if (buf[0] == '1') s->flags |= SLAB_FAILSLAB; @@ -4752,12 +5160,9 @@ static ssize_t shrink_store(struct kmem_cache *s, const char *buf, size_t length) { - if (buf[0] == '1') { - int rc = kmem_cache_shrink(s); - - if (rc) - return rc; - } else + if (buf[0] == '1') + kmem_cache_shrink(s); + else return -EINVAL; return length; } @@ -4775,7 +5180,7 @@ unsigned long ratio; int err; - err = strict_strtoul(buf, 10, &ratio); + err = kstrtoul(buf, 10, &ratio); if (err) return err; @@ -4896,6 +5301,9 @@ &red_zone_attr.attr, &poison_attr.attr, &store_user_attr.attr, +#ifdef CONFIG_SLUB_AVM_ALLOC_LIST + &store_user_lite_attr.attr, +#endif &validate_attr.attr, &alloc_calls_attr.attr, &free_calls_attr.attr, @@ -4981,7 +5389,7 @@ err = attribute->store(s, buf, len); #ifdef CONFIG_MEMCG_KMEM if (slab_state >= FULL && err >= 0 && is_root_cache(s)) { - int i; + struct kmem_cache *c; mutex_lock(&slab_mutex); if (s->max_attr_size < len) @@ -5004,11 +5412,8 @@ * directly either failed or succeeded, in which case we loop * through the descendants with best-effort propagation. */ - for_each_memcg_cache_index(i) { - struct kmem_cache *c = cache_from_memcg(s, i); - if (c) - attribute->store(c, buf, len); - } + for_each_memcg_cache(c, s) + attribute->store(c, buf, len); mutex_unlock(&slab_mutex); } #endif @@ -5020,15 +5425,18 @@ #ifdef CONFIG_MEMCG_KMEM int i; char *buffer = NULL; + struct kmem_cache *root_cache; - if (!is_root_cache(s)) + if (is_root_cache(s)) return; + root_cache = s->memcg_params.root_cache; + /* * This mean this cache had no attribute written. 
Therefore, no point * in copying default values around */ - if (!s->max_attr_size) + if (!root_cache->max_attr_size) return; for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) { @@ -5050,7 +5458,7 @@ */ if (buffer) buf = buffer; - else if (s->max_attr_size < ARRAY_SIZE(mbuf)) + else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf)) buf = mbuf; else { buffer = (char *) get_zeroed_page(GFP_KERNEL); @@ -5059,7 +5467,7 @@ buf = buffer; } - attr->show(s->memcg_params->root_cache, buf); + attr->show(root_cache, buf); attr->store(s, buf, strlen(buf)); } @@ -5068,6 +5476,11 @@ #endif } +static void kmem_cache_release(struct kobject *k) +{ + slab_kmem_cache_release(to_slab(k)); +} + static const struct sysfs_ops slab_sysfs_ops = { .show = slab_attr_show, .store = slab_attr_store, @@ -5075,6 +5488,7 @@ static struct kobj_type slab_ktype = { .sysfs_ops = &slab_sysfs_ops, + .release = kmem_cache_release, }; static int uevent_filter(struct kset *kset, struct kobject *kobj) @@ -5092,6 +5506,15 @@ static struct kset *slab_kset; +static inline struct kset *cache_kset(struct kmem_cache *s) +{ +#ifdef CONFIG_MEMCG_KMEM + if (!is_root_cache(s)) + return s->memcg_params.root_cache->memcg_kset; +#endif + return slab_kset; +} + #define ID_STR_LENGTH 64 /* Create a unique string id for a slab cache: @@ -5125,11 +5548,6 @@ *p++ = '-'; p += sprintf(p, "%07d", s->size); -#ifdef CONFIG_MEMCG_KMEM - if (!is_root_cache(s)) - p += sprintf(p, "-%08d", memcg_cache_id(s->memcg_params->memcg)); -#endif - BUG_ON(p > name + ID_STR_LENGTH - 1); return name; } @@ -5156,29 +5574,40 @@ name = create_unique_id(s); } - s->kobj.kset = slab_kset; - err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, name); - if (err) { - kobject_put(&s->kobj); - return err; - } + s->kobj.kset = cache_kset(s); + err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name); + if (err) + goto out; err = sysfs_create_group(&s->kobj, &slab_attr_group); - if (err) { - kobject_del(&s->kobj); - kobject_put(&s->kobj); - return err; + if (err) + goto out_del_kobj; + +#ifdef CONFIG_MEMCG_KMEM + if (is_root_cache(s)) { + s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj); + if (!s->memcg_kset) { + err = -ENOMEM; + goto out_del_kobj; + } } +#endif + kobject_uevent(&s->kobj, KOBJ_ADD); if (!unmergeable) { /* Setup first alias */ sysfs_slab_alias(s, s->name); - kfree(name); } - return 0; +out: + if (!unmergeable) + kfree(name); + return err; +out_del_kobj: + kobject_del(&s->kobj); + goto out; } -static void sysfs_slab_remove(struct kmem_cache *s) +void sysfs_slab_remove(struct kmem_cache *s) { if (slab_state < FULL) /* @@ -5187,6 +5616,9 @@ */ return; +#ifdef CONFIG_MEMCG_KMEM + kset_unregister(s->memcg_kset); +#endif kobject_uevent(&s->kobj, KOBJ_REMOVE); kobject_del(&s->kobj); kobject_put(&s->kobj); @@ -5237,7 +5669,7 @@ slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); if (!slab_kset) { mutex_unlock(&slab_mutex); - printk(KERN_ERR "Cannot register slab subsystem.\n"); + pr_err("Cannot register slab subsystem.\n"); return -ENOSYS; } @@ -5246,8 +5678,8 @@ list_for_each_entry(s, &slab_caches, list) { err = sysfs_slab_add(s); if (err) - printk(KERN_ERR "SLUB: Unable to add boot slab %s" - " to sysfs\n", s->name); + pr_err("SLUB: Unable to add boot slab %s to sysfs\n", + s->name); } while (alias_list) { @@ -5256,8 +5688,8 @@ alias_list = alias_list->next; err = sysfs_slab_alias(al->s, al->name); if (err) - printk(KERN_ERR "SLUB: Unable to add boot slab alias" - " %s to sysfs\n", al->name); + pr_err("SLUB: Unable to add boot slab 
alias %s to sysfs\n", + al->name); kfree(al); } @@ -5267,7 +5699,7 @@ } __initcall(slab_sysfs_init); -#endif /* CONFIG_SYSFS */ +#endif /* SLAB_SUPPORTS_SYSFS */ /* * The /proc/slabinfo ABI @@ -5275,21 +5707,15 @@ #ifdef CONFIG_SLABINFO void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) { - unsigned long nr_partials = 0; unsigned long nr_slabs = 0; unsigned long nr_objs = 0; unsigned long nr_free = 0; int node; + struct kmem_cache_node *n; - for_each_online_node(node) { - struct kmem_cache_node *n = get_node(s, node); - - if (!n) - continue; - - nr_partials += n->nr_partial; - nr_slabs += atomic_long_read(&n->nr_slabs); - nr_objs += atomic_long_read(&n->total_objects); + for_each_kmem_cache_node(s, node, n) { + nr_slabs += node_nr_slabs(n); + nr_objs += node_nr_objs(n); nr_free += count_partial(n, count_free); } @@ -5311,3 +5737,231 @@ return -EIO; } #endif /* CONFIG_SLABINFO */ + +#if defined(CONFIG_AVM_ENHANCED) +/** + * @brief get kmemalloc-area if addr in range + * attention! function unsaved for cachep - zone-page-spinlock necessary + * @return start (zero if not exist) + */ +unsigned long get_kmemalloc_area(unsigned long addr, + unsigned long *caller, + const char **cache_name, + unsigned long *size, int *freed) +{ + void *object; + unsigned long alloc_caller = 0, free_caller = 0; + unsigned int obj_idx; + void *base; + void *kstart; + struct page *page; + struct kmem_cache *s; + + object = (void *)addr; + page = virt_to_head_page(object); + if (!virt_addr_valid(page)) + return 0; + if (unlikely(!PageSlab(page))) + return 0; + + s = page->slab_cache; + if (!virt_addr_valid(s)) + return 0; + + base = page_address(page); + if ((object < base) || (object >= base + page->objects * s->size)) + return 0; + + obj_idx = slab_index(object, s, base); + kstart = base + (s->size * obj_idx); + if (cache_name) + *cache_name = s->name; + if (size) + *size = s->size; +#ifdef CONFIG_SLUB_DEBUG + if (s->flags & (SLAB_STORE_USER | SLAB_STORE_USER_LITE)) { + alloc_caller = get_track(s, kstart, TRACK_ALLOC)->addr; + free_caller = get_track(s, kstart, TRACK_FREE)->addr; + } +#endif + if (freed) + *freed = alloc_caller ? 0 : 1; + if (caller) + *caller = alloc_caller ? 
: free_caller; + return (unsigned long)kstart; +} +#endif /*--- #if defined(CONFIG_AVM_ENHANCED) ---*/ + +#if defined(CONFIG_SLUB_AVM_ALLOC_LIST) +/** + * @brief sorted on caller + * divide et impera to make caller-history (like add_caller()-function but this struct ;-)* + * @param ptoplist pointer for toplist to fill + * @param caller caller + * @return != 0 if no place in toplist + */ +#define TOP_TOIDX(p) ((p) - (&ptoplist->entry[0])) +static int mark_in_toplist(struct _slab_avm_topalloclist *ptoplist, unsigned long caller, unsigned long act_diff) +{ + unsigned int i, elements, idx; + struct _slab_avm_alloc_entry *q, *p; + + elements = ptoplist->entries; + p = &ptoplist->entry[0]; + while (elements) { + i = elements / 2; + q = &p[i]; + if (q->caller == caller) { + q->count++; + q->sum_time += (unsigned long long)act_diff; + return 0; + } + if (q->caller > caller) { + elements = i; + } else { + p = q + 1; + elements -= i + 1; + } + } + if (ptoplist->entries >= ARRAY_SIZE(ptoplist->entry)) { + ptoplist->ignored++; + return 1; + } + idx = TOP_TOIDX(p); + memmove(&p[1], p, (ptoplist->entries - idx) * sizeof(ptoplist->entry[0])); + ptoplist->entries++; + ptoplist->entry[idx].caller = caller; + ptoplist->entry[idx].sum_time = act_diff; + ptoplist->entry[idx].count = 1; + return 0; + } + +/** + * @brief sum caller-toplist entries + * @param ptoplistpointer + * @return void + */ +static unsigned long sum_toplist_entries(struct _slab_avm_topalloclist *ptoplist) +{ + unsigned long sum_count = 0; + unsigned int i; + + for (i = 0; i < ptoplist->entries; i++) { + sum_count += ptoplist->entry[i].count; +} + return sum_count; +} +/** + * @brief sort caller-toplist (greater first) + * @param ptoplistpointer for toplist to fill + * @return void + */ +static void sort_topalloclist(struct _slab_avm_topalloclist *ptoplist) +{ + unsigned int i, max_count, max_idx, idx = 0; + + for (;;) { + struct _slab_avm_alloc_entry tmp; + + max_count = 0; + for (i = idx; i < ptoplist->entries; i++) { + if (ptoplist->entry[i].count > max_count) { + max_count = ptoplist->entry[i].count; + max_idx = i; + } + } + if (max_count == 0) { + break; + } + /*--- swap ---*/ + memcpy(&tmp, &ptoplist->entry[idx], sizeof(tmp)); + memcpy(&ptoplist->entry[idx], &ptoplist->entry[max_idx], sizeof(tmp)); + memcpy(&ptoplist->entry[max_idx], &tmp, sizeof(tmp)); + idx++; + } +} + +/** + * @brief fill allocator-toplist for cachep + * @param ptoplist pointer for toplist to fill + * @param cachep cachepool + * @param tresh_jiffiesdiff only if caller older than ... 
+ * @return void + */ +void cache_avm_topalloc_list(struct _slab_avm_topalloclist *ptoplist, + struct kmem_cache *s, + unsigned long tresh_jiffiesdiff) +{ + void *addr; + unsigned long act_jiffies = jiffies; + unsigned long act_diff; + struct track *pt; + int node; + void *p; + struct kmem_cache_node *n; + + memset(ptoplist, 0, sizeof(*ptoplist)); + + for_each_kmem_cache_node(s, node, n) { + unsigned long flags; + struct page *page; + + flush_all(s); /* be sure that all objects in lists (flush freelist's) */ + + if (!atomic_long_read(&n->nr_slabs)) + continue; + spin_lock_irqsave(&n->list_lock, flags); + list_for_each_entry(page, &n->partial, lru) { + + slab_lock(page); + addr = page_address(page); + for_each_object(p, s, addr, page->objects) { + if (!(s->flags & (SLAB_STORE_USER | SLAB_STORE_USER_LITE))) { + continue; + } + if (on_freelist(s, page, p)) { + continue; + } + pt = get_track(s, p, TRACK_ALLOC); + if ((pt->addr == 0) || (pt->when == 0)) { + /*--- all pt->when with zero: do not trace initial alloc's (see set_track)---*/ + continue; + } + act_diff = act_jiffies - pt->when; + if (act_diff < tresh_jiffiesdiff) { + /*--- too young ---*/ + continue; + } + mark_in_toplist(ptoplist, pt->addr, act_diff); + } + slab_unlock(page); + } + list_for_each_entry(page, &n->full, lru) { + + slab_lock(page); + addr = page_address(page); + for_each_object(p, s, addr, page->objects) { + if (!(s->flags & (SLAB_STORE_USER | SLAB_STORE_USER_LITE))) { + continue; + } + pt = get_track(s, p, TRACK_ALLOC); + if ((pt->addr == 0) || (pt->when == 0)) { + /*--- all pt->when with zero: do not trace initial alloc's (see set_track) ---*/ + continue; + } + act_diff = act_jiffies - pt->when; + if (act_diff < tresh_jiffiesdiff) { + /*--- too young ---*/ + continue; + } + mark_in_toplist(ptoplist, pt->addr, act_diff); + } + slab_unlock(page); + } + spin_unlock_irqrestore(&n->list_lock, flags); + } + sort_topalloclist(ptoplist); + ptoplist->sum_count = sum_toplist_entries(ptoplist) + ptoplist->ignored; +} + +#endif/*--- #if defined(CONFIG_DEBUG_SLAB_AVM_LITE) ---*/
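
A minimal consumer sketch for cache_avm_topalloc_list() above. This is
illustrative only: "dump_topalloc" is a made-up helper name, the integer
field types of struct _slab_avm_topalloclist are assumed from the
assignments shown above, the snippet presumes the usual kernel headers
(slab, printk, div64) are already included, and the cache passed in must
have SLAB_STORE_USER or SLAB_STORE_USER_LITE set for the scan to find
any allocation tracks.

	static void dump_topalloc(struct kmem_cache *s)
	{
		struct _slab_avm_topalloclist *tl;
		unsigned int i;

		tl = kmalloc(sizeof(*tl), GFP_KERNEL);	/* entry[] may be large */
		if (!tl)
			return;

		/* collect callers of objects older than roughly one second */
		cache_avm_topalloc_list(tl, s, HZ);

		for (i = 0; i < tl->entries; i++) {
			u64 avg = tl->entry[i].sum_time;

			/* 64-bit division must go through do_div() on 32-bit */
			do_div(avg, tl->entry[i].count);
			pr_info("%pS: %lu allocations, avg age %llu jiffies\n",
				(void *)tl->entry[i].caller,
				(unsigned long)tl->entry[i].count,
				(unsigned long long)avg);
		}
		if (tl->ignored)
			pr_info("%lu further callers did not fit into the table\n",
				(unsigned long)tl->ignored);

		kfree(tl);
	}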