--- zzzz-none-000/linux-3.10.107/drivers/md/dm-bufio.c	2017-06-27 09:49:32.000000000 +0000
+++ scorpion-7490-727/linux-3.10.107/drivers/md/dm-bufio.c	2021-02-04 17:41:59.000000000 +0000
@@ -11,9 +11,11 @@
 #include <linux/device-mapper.h>
 #include <linux/dm-io.h>
 #include <linux/slab.h>
+#include <linux/jiffies.h>
 #include <linux/vmalloc.h>
 #include <linux/shrinker.h>
 #include <linux/module.h>
+#include <linux/rbtree.h>
 
 #define DM_MSG_PREFIX "bufio"
 
@@ -34,26 +36,23 @@
 /*
  * Check buffer ages in this interval (seconds)
  */
-#define DM_BUFIO_WORK_TIMER_SECS	10
+#define DM_BUFIO_WORK_TIMER_SECS	30
 
 /*
  * Free buffers when they are older than this (seconds)
  */
-#define DM_BUFIO_DEFAULT_AGE_SECS	60
+#define DM_BUFIO_DEFAULT_AGE_SECS	300
 
 /*
- * The number of bvec entries that are embedded directly in the buffer.
- * If the chunk size is larger, dm-io is used to do the io.
+ * The nr of bytes of cached data to keep around.
  */
-#define DM_BUFIO_INLINE_VECS		16
+#define DM_BUFIO_DEFAULT_RETAIN_BYTES	(256 * 1024)
 
 /*
- * Buffer hash
+ * The number of bvec entries that are embedded directly in the buffer.
+ * If the chunk size is larger, dm-io is used to do the io.
  */
-#define DM_BUFIO_HASH_BITS	20
-#define DM_BUFIO_HASH(block) \
-	((((block) >> DM_BUFIO_HASH_BITS) ^ (block)) & \
-	 ((1 << DM_BUFIO_HASH_BITS) - 1))
+#define DM_BUFIO_INLINE_VECS		16
 
 /*
  * Don't try to use kmem_cache_alloc for blocks larger than this.
@@ -104,7 +103,9 @@
 	struct list_head reserved_buffers;
 	unsigned need_reserved_buffers;
 
-	struct hlist_head *cache_hash;
+	unsigned minimum_buffers;
+
+	struct rb_root buffer_tree;
 	wait_queue_head_t free_buffer_wait;
 
 	int async_write_error;
@@ -133,7 +134,7 @@
 };
 
 struct dm_buffer {
-	struct hlist_node hash_list;
+	struct rb_node node;
 	struct list_head lru_list;
 	sector_t block;
 	void *data;
@@ -145,6 +146,7 @@
 	unsigned long state;
 	unsigned long last_accessed;
 	struct dm_bufio_client *c;
+	struct list_head write_list;
 	struct bio bio;
 	struct bio_vec bio_vec[DM_BUFIO_INLINE_VECS];
 };
@@ -220,6 +222,7 @@
  * Buffers are freed after this timeout
  */
 static unsigned dm_bufio_max_age = DM_BUFIO_DEFAULT_AGE_SECS;
+static unsigned dm_bufio_retain_bytes = DM_BUFIO_DEFAULT_RETAIN_BYTES;
 
 static unsigned long dm_bufio_peak_allocated;
 static unsigned long dm_bufio_allocated_kmem_cache;
@@ -250,6 +253,53 @@
  */
 static DEFINE_MUTEX(dm_bufio_clients_lock);
 
+/*----------------------------------------------------------------
+ * A red/black tree acts as an index for all the buffers.
+ *--------------------------------------------------------------*/
+static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
+{
+	struct rb_node *n = c->buffer_tree.rb_node;
+	struct dm_buffer *b;
+
+	while (n) {
+		b = container_of(n, struct dm_buffer, node);
+
+		if (b->block == block)
+			return b;
+
+		n = (b->block < block) ? n->rb_left : n->rb_right;
+	}
+
+	return NULL;
+}
+
+static void __insert(struct dm_bufio_client *c, struct dm_buffer *b)
+{
+	struct rb_node **new = &c->buffer_tree.rb_node, *parent = NULL;
+	struct dm_buffer *found;
+
+	while (*new) {
+		found = container_of(*new, struct dm_buffer, node);
+
+		if (found->block == b->block) {
+			BUG_ON(found != b);
+			return;
+		}
+
+		parent = *new;
+		new = (found->block < b->block) ?
+			&((*new)->rb_left) : &((*new)->rb_right);
+	}
+
+	rb_link_node(&b->node, parent, new);
+	rb_insert_color(&b->node, &c->buffer_tree);
+}
+
+static void __remove(struct dm_bufio_client *c, struct dm_buffer *b)
+{
+	rb_erase(&b->node, &c->buffer_tree);
+}
+
 /*----------------------------------------------------------------*/
 
 static void adjust_total_allocated(enum data_mode data_mode, long diff)
@@ -349,7 +399,7 @@
 	if (gfp_mask & __GFP_NORETRY)
 		noio_flag = memalloc_noio_save();
 
-	ptr = __vmalloc(c->block_size, gfp_mask, PAGE_KERNEL);
+	ptr = __vmalloc(c->block_size, gfp_mask | __GFP_HIGHMEM, PAGE_KERNEL);
 
 	if (gfp_mask & __GFP_NORETRY)
 		memalloc_noio_restore(noio_flag);
@@ -431,7 +481,7 @@
 	b->block = block;
 	b->list_mode = dirty;
 	list_add(&b->lru_list, &c->lru[dirty]);
-	hlist_add_head(&b->hash_list, &c->cache_hash[DM_BUFIO_HASH(block)]);
+	__insert(b->c, b);
 	b->last_accessed = jiffies;
 }
 
@@ -445,7 +495,7 @@
 	BUG_ON(!c->n_buffers[b->list_mode]);
 
 	c->n_buffers[b->list_mode]--;
-	hlist_del(&b->hash_list);
+	__remove(b->c, b);
 	list_del(&b->lru_list);
 }
 
@@ -495,7 +545,8 @@
 {
 	struct dm_buffer *b = context;
 
-	b->bio.bi_end_io(&b->bio, error ? -EIO : 0);
+	b->bio.bi_error = error ? -EIO : 0;
+	b->bio.bi_end_io(&b->bio);
 }
 
 static void use_dmio(struct dm_buffer *b, int rw, sector_t block,
@@ -525,13 +576,16 @@
 	b->bio.bi_end_io = end_io;
 
 	r = dm_io(&io_req, 1, &region, NULL);
-	if (r)
-		end_io(&b->bio, r);
+	if (r) {
+		b->bio.bi_error = r;
+		end_io(&b->bio);
+	}
 }
 
-static void inline_endio(struct bio *bio, int error)
+static void inline_endio(struct bio *bio)
 {
 	bio_end_io_t *end_fn = bio->bi_private;
+	int error = bio->bi_error;
 
 	/*
 	 * Reset the bio to free any attached resources
@@ -539,7 +593,8 @@
 	 */
 	bio_reset(bio);
 
-	end_fn(bio, error);
+	bio->bi_error = error;
+	end_fn(bio);
 }
 
 static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block,
@@ -551,7 +606,7 @@
 	bio_init(&b->bio);
 	b->bio.bi_io_vec = b->bio_vec;
 	b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS;
-	b->bio.bi_sector = block << b->c->sectors_per_block_bits;
+	b->bio.bi_iter.bi_sector = block << b->c->sectors_per_block_bits;
 	b->bio.bi_bdev = b->c->bdev;
 	b->bio.bi_end_io = inline_endio;
 	/*
@@ -611,36 +666,27 @@
  * Set the error, clear B_WRITING bit and wake anyone who was waiting on
  * it.
  */
-static void write_endio(struct bio *bio, int error)
+static void write_endio(struct bio *bio)
 {
 	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
 
-	b->write_error = error;
-	if (unlikely(error)) {
+	b->write_error = bio->bi_error;
+	if (unlikely(bio->bi_error)) {
 		struct dm_bufio_client *c = b->c;
+		int error = bio->bi_error;
 		(void)cmpxchg(&c->async_write_error, 0, error);
 	}
 
 	BUG_ON(!test_bit(B_WRITING, &b->state));
 
-	smp_mb__before_clear_bit();
+	smp_mb__before_atomic();
 	clear_bit(B_WRITING, &b->state);
-	smp_mb__after_clear_bit();
+	smp_mb__after_atomic();
 
 	wake_up_bit(&b->state, B_WRITING);
 }
 
 /*
- * This function is called when wait_on_bit is actually waiting.
- */
-static int do_io_schedule(void *word)
-{
-	io_schedule();
-
-	return 0;
-}
-
-/*
  * Initiate a write on a dirty buffer, but don't wait for it.
  *
  * - If the buffer is not dirty, exit.
@@ -649,16 +695,33 @@
  * - Submit our write and don't wait on it. We set B_WRITING indicating
  *   that there is a write in progress.
  */
-static void __write_dirty_buffer(struct dm_buffer *b)
+static void __write_dirty_buffer(struct dm_buffer *b,
+				 struct list_head *write_list)
 {
 	if (!test_bit(B_DIRTY, &b->state))
 		return;
 
 	clear_bit(B_DIRTY, &b->state);
-	wait_on_bit_lock(&b->state, B_WRITING,
-			 do_io_schedule, TASK_UNINTERRUPTIBLE);
+	wait_on_bit_lock_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
 
-	submit_io(b, WRITE, b->block, write_endio);
+	if (!write_list)
+		submit_io(b, WRITE, b->block, write_endio);
+	else
+		list_add_tail(&b->write_list, write_list);
+}
+
+static void __flush_write_list(struct list_head *write_list)
+{
+	struct blk_plug plug;
+	blk_start_plug(&plug);
+	while (!list_empty(write_list)) {
+		struct dm_buffer *b =
+			list_entry(write_list->next, struct dm_buffer, write_list);
+		list_del(&b->write_list);
+		submit_io(b, WRITE, b->block, write_endio);
+		dm_bufio_cond_resched();
+	}
+	blk_finish_plug(&plug);
 }
 
 /*
@@ -673,9 +736,9 @@
 	if (!b->state)	/* fast case */
 		return;
 
-	wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE);
-	__write_dirty_buffer(b);
-	wait_on_bit(&b->state, B_WRITING, do_io_schedule, TASK_UNINTERRUPTIBLE);
+	wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
+	__write_dirty_buffer(b, NULL);
+	wait_on_bit_io(&b->state, B_WRITING, TASK_UNINTERRUPTIBLE);
 }
 
 /*
@@ -729,7 +792,6 @@
 
 	io_schedule();
 
-	set_task_state(current, TASK_RUNNING);
 	remove_wait_queue(&c->free_buffer_wait, &wait);
 
 	dm_bufio_lock(c);
@@ -821,7 +883,8 @@
 	wake_up(&c->free_buffer_wait);
 }
 
-static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait)
+static void __write_dirty_buffers_async(struct dm_bufio_client *c, int no_wait,
+					struct list_head *write_list)
 {
 	struct dm_buffer *b, *tmp;
 
@@ -837,7 +900,7 @@
 		if (no_wait && test_bit(B_WRITING, &b->state))
 			return;
 
-		__write_dirty_buffer(b);
+		__write_dirty_buffer(b, write_list);
 		dm_bufio_cond_resched();
 	}
 }
@@ -860,8 +923,8 @@
 	buffers = dm_bufio_cache_size_per_client >>
 		  (c->sectors_per_block_bits + SECTOR_SHIFT);
 
-	if (buffers < DM_BUFIO_MIN_BUFFERS)
-		buffers = DM_BUFIO_MIN_BUFFERS;
+	if (buffers < c->minimum_buffers)
+		buffers = c->minimum_buffers;
 
 	*limit_buffers = buffers;
 	*threshold_buffers = buffers * DM_BUFIO_WRITEBACK_PERCENT / 100;
@@ -872,7 +935,8 @@
  * If we are over threshold_buffers, start freeing buffers.
  * If we're over "limit_buffers", block until we get under the limit.
  */
-static void __check_watermark(struct dm_bufio_client *c)
+static void __check_watermark(struct dm_bufio_client *c,
+			      struct list_head *write_list)
 {
 	unsigned long threshold_buffers, limit_buffers;
 
@@ -891,24 +955,7 @@
 	}
 
 	if (c->n_buffers[LIST_DIRTY] > threshold_buffers)
-		__write_dirty_buffers_async(c, 1);
-}
-
-/*
- * Find a buffer in the hash.
- */
-static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
-{
-	struct dm_buffer *b;
-
-	hlist_for_each_entry(b, &c->cache_hash[DM_BUFIO_HASH(block)],
-			     hash_list) {
-		dm_bufio_cond_resched();
-		if (b->block == block)
-			return b;
-	}
-
-	return NULL;
+		__write_dirty_buffers_async(c, 1, write_list);
 }
 
 /*----------------------------------------------------------------
@@ -916,7 +963,8 @@
  *--------------------------------------------------------------*/
 static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
-				     enum new_flag nf, int *need_submit)
+				     enum new_flag nf, int *need_submit,
+				     struct list_head *write_list)
 {
 	struct dm_buffer *b, *new_b = NULL;
 
@@ -943,7 +991,7 @@
 		goto found_buffer;
 	}
 
-	__check_watermark(c);
+	__check_watermark(c, write_list);
 
 	b = new_b;
 	b->hold_count = 1;
@@ -984,17 +1032,17 @@
  * The endio routine for reading: set the error, clear the bit and wake up
  * anyone waiting on the buffer.
  */
-static void read_endio(struct bio *bio, int error)
+static void read_endio(struct bio *bio)
 {
 	struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
 
-	b->read_error = error;
+	b->read_error = bio->bi_error;
 
 	BUG_ON(!test_bit(B_READING, &b->state));
 
-	smp_mb__before_clear_bit();
+	smp_mb__before_atomic();
 	clear_bit(B_READING, &b->state);
-	smp_mb__after_clear_bit();
+	smp_mb__after_atomic();
 
 	wake_up_bit(&b->state, B_READING);
 }
@@ -1011,17 +1059,21 @@
 	int need_submit;
 	struct dm_buffer *b;
 
+	LIST_HEAD(write_list);
+
 	dm_bufio_lock(c);
-	b = __bufio_new(c, block, nf, &need_submit);
+	b = __bufio_new(c, block, nf, &need_submit, &write_list);
 	dm_bufio_unlock(c);
 
+	__flush_write_list(&write_list);
+
 	if (!b)
 		return b;
 
 	if (need_submit)
 		submit_io(b, READ, b->block, read_endio);
 
-	wait_on_bit(&b->state, B_READING, do_io_schedule, TASK_UNINTERRUPTIBLE);
+	wait_on_bit_io(&b->state, B_READING, TASK_UNINTERRUPTIBLE);
 
 	if (b->read_error) {
 		int error = b->read_error;
@@ -1066,6 +1118,8 @@
 {
 	struct blk_plug plug;
 
+	LIST_HEAD(write_list);
+
 	BUG_ON(dm_bufio_in_request());
 
 	blk_start_plug(&plug);
@@ -1074,7 +1128,15 @@
 	for (; n_blocks--; block++) {
 		int need_submit;
 		struct dm_buffer *b;
-		b = __bufio_new(c, block, NF_PREFETCH, &need_submit);
+		b = __bufio_new(c, block, NF_PREFETCH, &need_submit,
+				&write_list);
+		if (unlikely(!list_empty(&write_list))) {
+			dm_bufio_unlock(c);
+			blk_finish_plug(&plug);
+			__flush_write_list(&write_list);
+			blk_start_plug(&plug);
+			dm_bufio_lock(c);
+		}
 		if (unlikely(b != NULL)) {
 			dm_bufio_unlock(c);
 
@@ -1088,7 +1150,6 @@
 				goto flush_plug;
 			dm_bufio_lock(c);
 		}
-
 	}
 	dm_bufio_unlock(c);
 
@@ -1145,11 +1206,14 @@
 
 void dm_bufio_write_dirty_buffers_async(struct dm_bufio_client *c)
 {
+	LIST_HEAD(write_list);
+
 	BUG_ON(dm_bufio_in_request());
 
 	dm_bufio_lock(c);
-	__write_dirty_buffers_async(c, 0);
+	__write_dirty_buffers_async(c, 0, &write_list);
 	dm_bufio_unlock(c);
+	__flush_write_list(&write_list);
 }
 EXPORT_SYMBOL_GPL(dm_bufio_write_dirty_buffers_async);
 
@@ -1166,8 +1230,13 @@
 	unsigned long buffers_processed = 0;
 	struct dm_buffer *b, *tmp;
 
+	LIST_HEAD(write_list);
+
+	dm_bufio_lock(c);
+	__write_dirty_buffers_async(c, 0, &write_list);
+	dm_bufio_unlock(c);
+	__flush_write_list(&write_list);
 	dm_bufio_lock(c);
-	__write_dirty_buffers_async(c, 0);
 
 again:
 	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_DIRTY], lru_list) {
@@ -1183,15 +1252,13 @@
 				dropped_lock = 1;
 				b->hold_count++;
 				dm_bufio_unlock(c);
-				wait_on_bit(&b->state, B_WRITING,
-					    do_io_schedule,
-					    TASK_UNINTERRUPTIBLE);
+				wait_on_bit_io(&b->state, B_WRITING,
+					       TASK_UNINTERRUPTIBLE);
 				dm_bufio_lock(c);
 				b->hold_count--;
 			} else
-				wait_on_bit(&b->state, B_WRITING,
-					    do_io_schedule,
-					    TASK_UNINTERRUPTIBLE);
+				wait_on_bit_io(&b->state, B_WRITING,
+					       TASK_UNINTERRUPTIBLE);
 		}
 
 		if (!test_bit(B_DIRTY, &b->state) &&
@@ -1293,17 +1360,17 @@
 	BUG_ON(!b->hold_count);
 	BUG_ON(test_bit(B_READING, &b->state));
 
-	__write_dirty_buffer(b);
+	__write_dirty_buffer(b, NULL);
 	if (b->hold_count == 1) {
-		wait_on_bit(&b->state, B_WRITING,
-			    do_io_schedule, TASK_UNINTERRUPTIBLE);
+		wait_on_bit_io(&b->state, B_WRITING,
+			       TASK_UNINTERRUPTIBLE);
 		set_bit(B_DIRTY, &b->state);
 		__unlink_buffer(b);
 		__link_buffer(b, new_block, LIST_DIRTY);
 	} else {
 		sector_t old_block;
-		wait_on_bit_lock(&b->state, B_WRITING,
-				 do_io_schedule, TASK_UNINTERRUPTIBLE);
+		wait_on_bit_lock_io(&b->state, B_WRITING,
+				    TASK_UNINTERRUPTIBLE);
 		/*
 		 * Relink buffer to "new_block" so that write_callback
 		 * sees "new_block" as a block number.
@@ -1315,8 +1382,8 @@
 		__unlink_buffer(b);
 		__link_buffer(b, new_block, b->list_mode);
 		submit_io(b, WRITE, new_block, write_endio);
-		wait_on_bit(&b->state, B_WRITING,
-			    do_io_schedule, TASK_UNINTERRUPTIBLE);
+		wait_on_bit_io(&b->state, B_WRITING,
+			       TASK_UNINTERRUPTIBLE);
 		__unlink_buffer(b);
 		__link_buffer(b, old_block, b->list_mode);
 	}
@@ -1326,6 +1393,34 @@
 }
 EXPORT_SYMBOL_GPL(dm_bufio_release_move);
 
+/*
+ * Free the given buffer.
+ *
+ * This is just a hint, if the buffer is in use or dirty, this function
+ * does nothing.
+ */
+void dm_bufio_forget(struct dm_bufio_client *c, sector_t block)
+{
+	struct dm_buffer *b;
+
+	dm_bufio_lock(c);
+
+	b = __find(c, block);
+	if (b && likely(!b->hold_count) && likely(!b->state)) {
+		__unlink_buffer(b);
+		__free_buffer_wake(b);
+	}
+
+	dm_bufio_unlock(c);
+}
+EXPORT_SYMBOL(dm_bufio_forget);
+
+void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n)
+{
+	c->minimum_buffers = n;
+}
+EXPORT_SYMBOL(dm_bufio_set_minimum_buffers);
+
 unsigned dm_bufio_get_block_size(struct dm_bufio_client *c)
 {
 	return c->block_size;
@@ -1392,71 +1487,91 @@
 }
 
 /*
- * Test if the buffer is unused and too old, and commit it.
- * At if noio is set, we must not do any I/O because we hold
- * dm_bufio_clients_lock and we would risk deadlock if the I/O gets rerouted to
- * different bufio client.
+ * We may not be able to evict this buffer if IO pending or the client
+ * is still using it. Caller is expected to know buffer is too old.
+ *
+ * And if GFP_NOFS is used, we must not do any I/O because we hold
+ * dm_bufio_clients_lock and we would risk deadlock if the I/O gets
+ * rerouted to different bufio client.
  */
-static int __cleanup_old_buffer(struct dm_buffer *b, gfp_t gfp,
-				unsigned long max_jiffies)
+static bool __try_evict_buffer(struct dm_buffer *b, gfp_t gfp)
 {
-	if (jiffies - b->last_accessed < max_jiffies)
-		return 1;
-
-	if (!(gfp & __GFP_IO)) {
+	if (!(gfp & __GFP_FS)) {
 		if (test_bit(B_READING, &b->state) ||
 		    test_bit(B_WRITING, &b->state) ||
 		    test_bit(B_DIRTY, &b->state))
-			return 1;
+			return false;
 	}
 
 	if (b->hold_count)
-		return 1;
+		return false;
 
 	__make_buffer_clean(b);
 	__unlink_buffer(b);
 	__free_buffer_wake(b);
 
-	return 0;
+	return true;
 }
 
-static void __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
-		   struct shrink_control *sc)
+static unsigned get_retain_buffers(struct dm_bufio_client *c)
+{
+	unsigned retain_bytes = ACCESS_ONCE(dm_bufio_retain_bytes);
+	return retain_bytes / c->block_size;
+}
+
+static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
+			    gfp_t gfp_mask)
 {
 	int l;
 	struct dm_buffer *b, *tmp;
+	unsigned long freed = 0;
+	unsigned long count = nr_to_scan;
+	unsigned retain_target = get_retain_buffers(c);
 
 	for (l = 0; l < LIST_SIZE; l++) {
-		list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list)
-			if (!__cleanup_old_buffer(b, sc->gfp_mask, 0) &&
-			    !--nr_to_scan)
-				return;
-		dm_bufio_cond_resched();
+		list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
+			if (__try_evict_buffer(b, gfp_mask))
+				freed++;
+			if (!--nr_to_scan || ((count - freed) <= retain_target))
+				return freed;
+			dm_bufio_cond_resched();
+		}
 	}
+	return freed;
 }
 
-static int shrink(struct shrinker *shrinker, struct shrink_control *sc)
+static unsigned long
+dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 {
-	struct dm_bufio_client *c =
-	    container_of(shrinker, struct dm_bufio_client, shrinker);
-	unsigned long r;
-	unsigned long nr_to_scan = sc->nr_to_scan;
+	struct dm_bufio_client *c;
+	unsigned long freed;
 
-	if (sc->gfp_mask & __GFP_IO)
+	c = container_of(shrink, struct dm_bufio_client, shrinker);
+	if (sc->gfp_mask & __GFP_FS)
 		dm_bufio_lock(c);
 	else if (!dm_bufio_trylock(c))
-		return !nr_to_scan ? 0 : -1;
+		return SHRINK_STOP;
 
-	if (nr_to_scan)
-		__scan(c, nr_to_scan, sc);
+	freed = __scan(c, sc->nr_to_scan, sc->gfp_mask);
+	dm_bufio_unlock(c);
+	return freed;
+}
 
-	r = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
-	if (r > INT_MAX)
-		r = INT_MAX;
+static unsigned long
+dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+	struct dm_bufio_client *c;
+	unsigned long count;
 
-	dm_bufio_unlock(c);
+	c = container_of(shrink, struct dm_bufio_client, shrinker);
+	if (sc->gfp_mask & __GFP_FS)
+		dm_bufio_lock(c);
+	else if (!dm_bufio_trylock(c))
+		return 0;
 
-	return r;
+	count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
+	dm_bufio_unlock(c);
+	return count;
 }
 
 /*
@@ -1474,24 +1589,20 @@
 	BUG_ON(block_size < 1 << SECTOR_SHIFT ||
 	       (block_size & (block_size - 1)));
 
-	c = kmalloc(sizeof(*c), GFP_KERNEL);
+	c = kzalloc(sizeof(*c), GFP_KERNEL);
 	if (!c) {
 		r = -ENOMEM;
 		goto bad_client;
 	}
-	c->cache_hash = vmalloc(sizeof(struct hlist_head) << DM_BUFIO_HASH_BITS);
-	if (!c->cache_hash) {
-		r = -ENOMEM;
-		goto bad_hash;
-	}
+	c->buffer_tree = RB_ROOT;
 
 	c->bdev = bdev;
 	c->block_size = block_size;
-	c->sectors_per_block_bits = ffs(block_size) - 1 - SECTOR_SHIFT;
-	c->pages_per_block_bits = (ffs(block_size) - 1 >= PAGE_SHIFT) ?
-				  ffs(block_size) - 1 - PAGE_SHIFT : 0;
-	c->blocks_per_page_bits = (ffs(block_size) - 1 < PAGE_SHIFT ?
-				  PAGE_SHIFT - (ffs(block_size) - 1) : 0);
+	c->sectors_per_block_bits = __ffs(block_size) - SECTOR_SHIFT;
+	c->pages_per_block_bits = (__ffs(block_size) >= PAGE_SHIFT) ?
+				  __ffs(block_size) - PAGE_SHIFT : 0;
+	c->blocks_per_page_bits = (__ffs(block_size) < PAGE_SHIFT ?
+				  PAGE_SHIFT - __ffs(block_size) : 0);
 
 	c->aux_size = aux_size;
 	c->alloc_callback = alloc_callback;
@@ -1502,13 +1613,12 @@
 		c->n_buffers[i] = 0;
 	}
 
-	for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
-		INIT_HLIST_HEAD(&c->cache_hash[i]);
-
 	mutex_init(&c->lock);
 	INIT_LIST_HEAD(&c->reserved_buffers);
 	c->need_reserved_buffers = reserved_buffers;
 
+	c->minimum_buffers = DM_BUFIO_MIN_BUFFERS;
+
 	init_waitqueue_head(&c->free_buffer_wait);
 	c->async_write_error = 0;
 
@@ -1558,7 +1668,8 @@
 		__cache_size_refresh();
 	mutex_unlock(&dm_bufio_clients_lock);
 
-	c->shrinker.shrink = shrink;
+	c->shrinker.count_objects = dm_bufio_shrink_count;
+	c->shrinker.scan_objects = dm_bufio_shrink_scan;
 	c->shrinker.seeks = 1;
 	c->shrinker.batch = 0;
 	register_shrinker(&c->shrinker);
@@ -1575,8 +1686,6 @@
 	}
 	dm_io_client_destroy(c->dm_io);
 bad_dm_io:
-	vfree(c->cache_hash);
-bad_hash:
 	kfree(c);
 bad_client:
 	return ERR_PTR(r);
@@ -1603,9 +1712,7 @@
 
 	mutex_unlock(&dm_bufio_clients_lock);
 
-	for (i = 0; i < 1 << DM_BUFIO_HASH_BITS; i++)
-		BUG_ON(!hlist_empty(&c->cache_hash[i]));
-
+	BUG_ON(!RB_EMPTY_ROOT(&c->buffer_tree));
 	BUG_ON(c->need_reserved_buffers);
 
 	while (!list_empty(&c->reserved_buffers)) {
@@ -1623,36 +1730,60 @@
 		BUG_ON(c->n_buffers[i]);
 
 	dm_io_client_destroy(c->dm_io);
-	vfree(c->cache_hash);
 	kfree(c);
 }
 EXPORT_SYMBOL_GPL(dm_bufio_client_destroy);
 
-static void cleanup_old_buffers(void)
+static unsigned get_max_age_hz(void)
 {
-	unsigned long max_age = ACCESS_ONCE(dm_bufio_max_age);
-	struct dm_bufio_client *c;
+	unsigned max_age = ACCESS_ONCE(dm_bufio_max_age);
 
-	if (max_age > ULONG_MAX / HZ)
-		max_age = ULONG_MAX / HZ;
+	if (max_age > UINT_MAX / HZ)
+		max_age = UINT_MAX / HZ;
 
-	mutex_lock(&dm_bufio_clients_lock);
-	list_for_each_entry(c, &dm_bufio_all_clients, client_list) {
-		if (!dm_bufio_trylock(c))
-			continue;
+	return max_age * HZ;
+}
 
-		while (!list_empty(&c->lru[LIST_CLEAN])) {
-			struct dm_buffer *b;
-			b = list_entry(c->lru[LIST_CLEAN].prev,
-				       struct dm_buffer, lru_list);
-			if (__cleanup_old_buffer(b, 0, max_age * HZ))
-				break;
-			dm_bufio_cond_resched();
-		}
+static bool older_than(struct dm_buffer *b, unsigned long age_hz)
+{
+	return time_after_eq(jiffies, b->last_accessed + age_hz);
+}
+
+static void __evict_old_buffers(struct dm_bufio_client *c, unsigned long age_hz)
+{
+	struct dm_buffer *b, *tmp;
+	unsigned retain_target = get_retain_buffers(c);
+	unsigned count;
+
+	dm_bufio_lock(c);
+
+	count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
+	list_for_each_entry_safe_reverse(b, tmp, &c->lru[LIST_CLEAN], lru_list) {
+		if (count <= retain_target)
+			break;
+
+		if (!older_than(b, age_hz))
+			break;
+
+		if (__try_evict_buffer(b, 0))
+			count--;
 
-		dm_bufio_unlock(c);
 		dm_bufio_cond_resched();
 	}
+
+	dm_bufio_unlock(c);
+}
+
+static void cleanup_old_buffers(void)
+{
+	unsigned long max_age_hz = get_max_age_hz();
+	struct dm_bufio_client *c;
+
+	mutex_lock(&dm_bufio_clients_lock);
+
+	list_for_each_entry(c, &dm_bufio_all_clients, client_list)
+		__evict_old_buffers(c, max_age_hz);
+
 	mutex_unlock(&dm_bufio_clients_lock);
 }
 
@@ -1730,12 +1861,8 @@
 	cancel_delayed_work_sync(&dm_bufio_work);
 	destroy_workqueue(dm_bufio_wq);
 
-	for (i = 0; i < ARRAY_SIZE(dm_bufio_caches); i++) {
-		struct kmem_cache *kc = dm_bufio_caches[i];
-
-		if (kc)
-			kmem_cache_destroy(kc);
-	}
+	for (i = 0; i < ARRAY_SIZE(dm_bufio_caches); i++)
+		kmem_cache_destroy(dm_bufio_caches[i]);
 
 	for (i = 0; i < ARRAY_SIZE(dm_bufio_cache_names); i++)
 		kfree(dm_bufio_cache_names[i]);
 
@@ -1777,6 +1904,9 @@
 module_param_named(max_age_seconds, dm_bufio_max_age, uint, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(max_age_seconds, "Max age of a buffer in seconds");
 
+module_param_named(retain_bytes, dm_bufio_retain_bytes, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(retain_bytes, "Try to keep at least this many bytes cached in memory");
+
 module_param_named(peak_allocated_bytes, dm_bufio_peak_allocated, ulong, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(peak_allocated_bytes, "Tracks the maximum allocated memory");
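
Usage sketch (not part of the patch above): the hunks export two new client-facing calls, dm_bufio_forget() and dm_bufio_set_minimum_buffers(), next to the existing dm-bufio API. The fragment below is a hypothetical illustration of how a device-mapper target built against this tree might call them; the function name example_read_superblock(), the 4096-byte block size and the block number are invented for the example, and it assumes the matching declarations were added to drivers/md/dm-bufio.h by the same backport.

#include "dm-bufio.h"	/* in-tree header in 3.10-based kernels */

static int example_read_superblock(struct block_device *bdev)
{
	struct dm_bufio_client *c;
	struct dm_buffer *b;
	void *data;

	/* One buffer per 4096-byte block, one reserved buffer, no aux data. */
	c = dm_bufio_client_create(bdev, 4096, 1, 0, NULL, NULL);
	if (IS_ERR(c))
		return PTR_ERR(c);

	/* Raise the per-client floor used by __get_memory_limit() to 8 buffers. */
	dm_bufio_set_minimum_buffers(c, 8);

	data = dm_bufio_read(c, 0, &b);	/* read block 0 and pin it via *b */
	if (IS_ERR(data)) {
		dm_bufio_client_destroy(c);
		return PTR_ERR(data);
	}

	/* ... inspect data ... */

	dm_bufio_release(b);

	/* Hint that block 0 is no longer needed; a no-op if it is held or dirty. */
	dm_bufio_forget(c, 0);

	dm_bufio_client_destroy(c);
	return 0;
}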