--- zzzz-none-000/linux-3.10.107/drivers/md/dm-cache-policy-mq.c 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/drivers/md/dm-cache-policy-mq.c 2021-02-04 17:41:59.000000000 +0000 @@ -8,6 +8,7 @@ #include "dm.h" #include +#include #include #include #include @@ -26,19 +27,6 @@ /*----------------------------------------------------------------*/ -static unsigned long *alloc_bitset(unsigned nr_entries) -{ - size_t s = sizeof(unsigned long) * dm_div_up(nr_entries, BITS_PER_LONG); - return vzalloc(s); -} - -static void free_bitset(unsigned long *bits) -{ - vfree(bits); -} - -/*----------------------------------------------------------------*/ - /* * Large, sequential ios are probably better left on the origin device since * spindles tend to have good bandwidth. @@ -85,7 +73,7 @@ static void iot_update_stats(struct io_tracker *t, struct bio *bio) { - if (bio->bi_sector == from_oblock(t->last_end_oblock) + 1) + if (bio->bi_iter.bi_sector == from_oblock(t->last_end_oblock) + 1) t->nr_seq_samples++; else { /* @@ -100,7 +88,7 @@ t->nr_rand_samples++; } - t->last_end_oblock = to_oblock(bio->bi_sector + bio_sectors(bio) - 1); + t->last_end_oblock = to_oblock(bio_end_sector(bio) - 1); } static void iot_check_for_pattern_switch(struct io_tracker *t) @@ -137,17 +125,41 @@ * sorted queue. */ #define NR_QUEUE_LEVELS 16u +#define NR_SENTINELS NR_QUEUE_LEVELS * 3 + +#define WRITEBACK_PERIOD HZ struct queue { + unsigned nr_elts; + bool current_writeback_sentinels; + unsigned long next_writeback; struct list_head qs[NR_QUEUE_LEVELS]; + struct list_head sentinels[NR_SENTINELS]; }; static void queue_init(struct queue *q) { unsigned i; - for (i = 0; i < NR_QUEUE_LEVELS; i++) + q->nr_elts = 0; + q->current_writeback_sentinels = false; + q->next_writeback = 0; + for (i = 0; i < NR_QUEUE_LEVELS; i++) { INIT_LIST_HEAD(q->qs + i); + INIT_LIST_HEAD(q->sentinels + i); + INIT_LIST_HEAD(q->sentinels + NR_QUEUE_LEVELS + i); + INIT_LIST_HEAD(q->sentinels + (2 * NR_QUEUE_LEVELS) + i); + } +} + +static unsigned queue_size(struct queue *q) +{ + return q->nr_elts; +} + +static bool queue_empty(struct queue *q) +{ + return q->nr_elts == 0; } /* @@ -155,45 +167,66 @@ */ static void queue_push(struct queue *q, unsigned level, struct list_head *elt) { + q->nr_elts++; list_add_tail(elt, q->qs + level); } -static void queue_remove(struct list_head *elt) +static void queue_remove(struct queue *q, struct list_head *elt) { + q->nr_elts--; list_del(elt); } +static bool is_sentinel(struct queue *q, struct list_head *h) +{ + return (h >= q->sentinels) && (h < (q->sentinels + NR_SENTINELS)); +} + /* - * Shifts all regions down one level. This has no effect on the order of - * the queue. + * Gives us the oldest entry of the lowest popoulated level. If the first + * level is emptied then we shift down one level. */ -static void queue_shift_down(struct queue *q) +static struct list_head *queue_peek(struct queue *q) { unsigned level; + struct list_head *h; + + for (level = 0; level < NR_QUEUE_LEVELS; level++) + list_for_each(h, q->qs + level) + if (!is_sentinel(q, h)) + return h; - for (level = 1; level < NR_QUEUE_LEVELS; level++) - list_splice_init(q->qs + level, q->qs + level - 1); + return NULL; +} + +static struct list_head *queue_pop(struct queue *q) +{ + struct list_head *r = queue_peek(q); + + if (r) { + q->nr_elts--; + list_del(r); + } + + return r; } /* - * Gives us the oldest entry of the lowest popoulated level. If the first - * level is emptied then we shift down one level. + * Pops an entry from a level that is not past a sentinel. */ -static struct list_head *queue_pop(struct queue *q) +static struct list_head *queue_pop_old(struct queue *q) { unsigned level; - struct list_head *r; + struct list_head *h; for (level = 0; level < NR_QUEUE_LEVELS; level++) - if (!list_empty(q->qs + level)) { - r = q->qs[level].next; - list_del(r); - - /* have we just emptied the bottom level? */ - if (level == 0 && list_empty(q->qs)) - queue_shift_down(q); - - return r; + list_for_each(h, q->qs + level) { + if (is_sentinel(q, h)) + break; + + q->nr_elts--; + list_del(h); + return h; } return NULL; @@ -209,6 +242,62 @@ return r; } +static struct list_head *writeback_sentinel(struct queue *q, unsigned level) +{ + if (q->current_writeback_sentinels) + return q->sentinels + NR_QUEUE_LEVELS + level; + else + return q->sentinels + 2 * NR_QUEUE_LEVELS + level; +} + +static void queue_update_writeback_sentinels(struct queue *q) +{ + unsigned i; + struct list_head *h; + + if (time_after(jiffies, q->next_writeback)) { + for (i = 0; i < NR_QUEUE_LEVELS; i++) { + h = writeback_sentinel(q, i); + list_del(h); + list_add_tail(h, q->qs + i); + } + + q->next_writeback = jiffies + WRITEBACK_PERIOD; + q->current_writeback_sentinels = !q->current_writeback_sentinels; + } +} + +/* + * Sometimes we want to iterate through entries that have been pushed since + * a certain event. We use sentinel entries on the queues to delimit these + * 'tick' events. + */ +static void queue_tick(struct queue *q) +{ + unsigned i; + + for (i = 0; i < NR_QUEUE_LEVELS; i++) { + list_del(q->sentinels + i); + list_add_tail(q->sentinels + i, q->qs + i); + } +} + +typedef void (*iter_fn)(struct list_head *, void *); +static void queue_iterate_tick(struct queue *q, iter_fn fn, void *context) +{ + unsigned i; + struct list_head *h; + + for (i = 0; i < NR_QUEUE_LEVELS; i++) { + list_for_each_prev(h, q->qs + i) { + if (is_sentinel(q, h)) + break; + + fn(h, context); + } + } +} + /*----------------------------------------------------------------*/ /* @@ -218,17 +307,113 @@ struct hlist_node hlist; struct list_head list; dm_oblock_t oblock; - dm_cblock_t cblock; /* valid iff in_cache */ /* * FIXME: pack these better */ - bool in_cache:1; + bool dirty:1; unsigned hit_count; - unsigned generation; - unsigned tick; }; +/* + * Rather than storing the cblock in an entry, we allocate all entries in + * an array, and infer the cblock from the entry position. + * + * Free entries are linked together into a list. + */ +struct entry_pool { + struct entry *entries, *entries_end; + struct list_head free; + unsigned nr_allocated; +}; + +static int epool_init(struct entry_pool *ep, unsigned nr_entries) +{ + unsigned i; + + ep->entries = vzalloc(sizeof(struct entry) * nr_entries); + if (!ep->entries) + return -ENOMEM; + + ep->entries_end = ep->entries + nr_entries; + + INIT_LIST_HEAD(&ep->free); + for (i = 0; i < nr_entries; i++) + list_add(&ep->entries[i].list, &ep->free); + + ep->nr_allocated = 0; + + return 0; +} + +static void epool_exit(struct entry_pool *ep) +{ + vfree(ep->entries); +} + +static struct entry *alloc_entry(struct entry_pool *ep) +{ + struct entry *e; + + if (list_empty(&ep->free)) + return NULL; + + e = list_entry(list_pop(&ep->free), struct entry, list); + INIT_LIST_HEAD(&e->list); + INIT_HLIST_NODE(&e->hlist); + ep->nr_allocated++; + + return e; +} + +/* + * This assumes the cblock hasn't already been allocated. + */ +static struct entry *alloc_particular_entry(struct entry_pool *ep, dm_cblock_t cblock) +{ + struct entry *e = ep->entries + from_cblock(cblock); + + list_del_init(&e->list); + INIT_HLIST_NODE(&e->hlist); + ep->nr_allocated++; + + return e; +} + +static void free_entry(struct entry_pool *ep, struct entry *e) +{ + BUG_ON(!ep->nr_allocated); + ep->nr_allocated--; + INIT_HLIST_NODE(&e->hlist); + list_add(&e->list, &ep->free); +} + +/* + * Returns NULL if the entry is free. + */ +static struct entry *epool_find(struct entry_pool *ep, dm_cblock_t cblock) +{ + struct entry *e = ep->entries + from_cblock(cblock); + return !hlist_unhashed(&e->hlist) ? e : NULL; +} + +static bool epool_empty(struct entry_pool *ep) +{ + return list_empty(&ep->free); +} + +static bool in_pool(struct entry_pool *ep, struct entry *e) +{ + return e >= ep->entries && e < ep->entries_end; +} + +static dm_cblock_t infer_cblock(struct entry_pool *ep, struct entry *e) +{ + return to_cblock(e - ep->entries); +} + +/*----------------------------------------------------------------*/ + struct mq_policy { struct dm_cache_policy policy; @@ -238,13 +423,22 @@ struct io_tracker tracker; /* - * We maintain two queues of entries. The cache proper contains - * the currently active mappings. Whereas the pre_cache tracks - * blocks that are being hit frequently and potential candidates - * for promotion to the cache. + * Entries come from two pools, one of pre-cache entries, and one + * for the cache proper. + */ + struct entry_pool pre_cache_pool; + struct entry_pool cache_pool; + + /* + * We maintain three queues of entries. The cache proper, + * consisting of a clean and dirty queue, contains the currently + * active mappings. Whereas the pre_cache tracks blocks that + * are being hit frequently and potential candidates for promotion + * to the cache. */ struct queue pre_cache; - struct queue cache; + struct queue cache_clean; + struct queue cache_dirty; /* * Keeps track of time, incremented by the core. We use this to @@ -274,31 +468,9 @@ unsigned generation; unsigned generation_period; /* in lookups (will probably change) */ - /* - * Entries in the pre_cache whose hit count passes the promotion - * threshold move to the cache proper. Working out the correct - * value for the promotion_threshold is crucial to this policy. - */ - unsigned promote_threshold; - - /* - * We need cache_size entries for the cache, and choose to have - * cache_size entries for the pre_cache too. One motivation for - * using the same size is to make the hit counts directly - * comparable between pre_cache and cache. - */ - unsigned nr_entries; - unsigned nr_entries_allocated; - struct list_head free; - - /* - * Cache blocks may be unallocated. We store this info in a - * bitset. - */ - unsigned long *allocation_bitset; - unsigned nr_cblocks_allocated; - unsigned find_free_nr_words; - unsigned find_free_last_word; + unsigned discard_promote_adjustment; + unsigned read_promote_adjustment; + unsigned write_promote_adjustment; /* * The hash table allows us to quickly find an entry by origin @@ -309,48 +481,10 @@ struct hlist_head *table; }; -/*----------------------------------------------------------------*/ -/* Free/alloc mq cache entry structures. */ -static void takeout_queue(struct list_head *lh, struct queue *q) -{ - unsigned level; - - for (level = 0; level < NR_QUEUE_LEVELS; level++) - list_splice(q->qs + level, lh); -} - -static void free_entries(struct mq_policy *mq) -{ - struct entry *e, *tmp; - - takeout_queue(&mq->free, &mq->pre_cache); - takeout_queue(&mq->free, &mq->cache); - - list_for_each_entry_safe(e, tmp, &mq->free, list) - kmem_cache_free(mq_entry_cache, e); -} - -static int alloc_entries(struct mq_policy *mq, unsigned elts) -{ - unsigned u = mq->nr_entries; - - INIT_LIST_HEAD(&mq->free); - mq->nr_entries_allocated = 0; - - while (u--) { - struct entry *e = kmem_cache_zalloc(mq_entry_cache, GFP_KERNEL); - - if (!e) { - free_entries(mq); - return -ENOMEM; - } - - - list_add(&e->list, &mq->free); - } - - return 0; -} +#define DEFAULT_DISCARD_PROMOTE_ADJUSTMENT 1 +#define DEFAULT_READ_PROMOTE_ADJUSTMENT 4 +#define DEFAULT_WRITE_PROMOTE_ADJUSTMENT 8 +#define DISCOURAGE_DEMOTING_DIRTY_THRESHOLD 128 /*----------------------------------------------------------------*/ @@ -388,96 +522,14 @@ /*----------------------------------------------------------------*/ -/* - * Allocates a new entry structure. The memory is allocated in one lump, - * so we just handing it out here. Returns NULL if all entries have - * already been allocated. Cannot fail otherwise. - */ -static struct entry *alloc_entry(struct mq_policy *mq) -{ - struct entry *e; - - if (mq->nr_entries_allocated >= mq->nr_entries) { - BUG_ON(!list_empty(&mq->free)); - return NULL; - } - - e = list_entry(list_pop(&mq->free), struct entry, list); - INIT_LIST_HEAD(&e->list); - INIT_HLIST_NODE(&e->hlist); - - mq->nr_entries_allocated++; - return e; -} - -/*----------------------------------------------------------------*/ - -/* - * Mark cache blocks allocated or not in the bitset. - */ -static void alloc_cblock(struct mq_policy *mq, dm_cblock_t cblock) -{ - BUG_ON(from_cblock(cblock) > from_cblock(mq->cache_size)); - BUG_ON(test_bit(from_cblock(cblock), mq->allocation_bitset)); - - set_bit(from_cblock(cblock), mq->allocation_bitset); - mq->nr_cblocks_allocated++; -} - -static void free_cblock(struct mq_policy *mq, dm_cblock_t cblock) -{ - BUG_ON(from_cblock(cblock) > from_cblock(mq->cache_size)); - BUG_ON(!test_bit(from_cblock(cblock), mq->allocation_bitset)); - - clear_bit(from_cblock(cblock), mq->allocation_bitset); - mq->nr_cblocks_allocated--; -} - static bool any_free_cblocks(struct mq_policy *mq) { - return mq->nr_cblocks_allocated < from_cblock(mq->cache_size); -} - -/* - * Fills result out with a cache block that isn't in use, or return - * -ENOSPC. This does _not_ mark the cblock as allocated, the caller is - * reponsible for that. - */ -static int __find_free_cblock(struct mq_policy *mq, unsigned begin, unsigned end, - dm_cblock_t *result, unsigned *last_word) -{ - int r = -ENOSPC; - unsigned w; - - for (w = begin; w < end; w++) { - /* - * ffz is undefined if no zero exists - */ - if (mq->allocation_bitset[w] != ~0UL) { - *last_word = w; - *result = to_cblock((w * BITS_PER_LONG) + ffz(mq->allocation_bitset[w])); - if (from_cblock(*result) < from_cblock(mq->cache_size)) - r = 0; - - break; - } - } - - return r; + return !epool_empty(&mq->cache_pool); } -static int find_free_cblock(struct mq_policy *mq, dm_cblock_t *result) +static bool any_clean_cblocks(struct mq_policy *mq) { - int r; - - if (!any_free_cblocks(mq)) - return -ENOSPC; - - r = __find_free_cblock(mq, mq->find_free_last_word, mq->find_free_nr_words, result, &mq->find_free_last_word); - if (r == -ENOSPC && mq->find_free_last_word) - r = __find_free_cblock(mq, 0, mq->find_free_last_word, result, &mq->find_free_last_word); - - return r; + return !queue_empty(&mq->cache_clean); } /*----------------------------------------------------------------*/ @@ -496,33 +548,38 @@ return min((unsigned) ilog2(e->hit_count), NR_QUEUE_LEVELS - 1u); } +static bool in_cache(struct mq_policy *mq, struct entry *e) +{ + return in_pool(&mq->cache_pool, e); +} + /* * Inserts the entry into the pre_cache or the cache. Ensures the cache - * block is marked as allocated if necc. Inserts into the hash table. Sets the - * tick which records when the entry was last moved about. + * block is marked as allocated if necc. Inserts into the hash table. + * Sets the tick which records when the entry was last moved about. */ static void push(struct mq_policy *mq, struct entry *e) { - e->tick = mq->tick; hash_insert(mq, e); - if (e->in_cache) { - alloc_cblock(mq, e->cblock); - queue_push(&mq->cache, queue_level(e), &e->list); - } else + if (in_cache(mq, e)) + queue_push(e->dirty ? &mq->cache_dirty : &mq->cache_clean, + queue_level(e), &e->list); + else queue_push(&mq->pre_cache, queue_level(e), &e->list); } /* * Removes an entry from pre_cache or cache. Removes from the hash table. - * Frees off the cache block if necc. */ static void del(struct mq_policy *mq, struct entry *e) { - queue_remove(&e->list); + if (in_cache(mq, e)) + queue_remove(e->dirty ? &mq->cache_dirty : &mq->cache_clean, &e->list); + else + queue_remove(&mq->pre_cache, &e->list); + hash_remove(e); - if (e->in_cache) - free_cblock(mq, e->cblock); } /* @@ -531,24 +588,36 @@ */ static struct entry *pop(struct mq_policy *mq, struct queue *q) { - struct entry *e = container_of(queue_pop(q), struct entry, list); + struct entry *e; + struct list_head *h = queue_pop(q); - if (e) { - hash_remove(e); + if (!h) + return NULL; - if (e->in_cache) - free_cblock(mq, e->cblock); - } + e = container_of(h, struct entry, list); + hash_remove(e); return e; } -/* - * Has this entry already been updated? - */ -static bool updated_this_tick(struct mq_policy *mq, struct entry *e) +static struct entry *pop_old(struct mq_policy *mq, struct queue *q) +{ + struct entry *e; + struct list_head *h = queue_pop_old(q); + + if (!h) + return NULL; + + e = container_of(h, struct entry, list); + hash_remove(e); + + return e; +} + +static struct entry *peek(struct queue *q) { - return mq->tick == e->tick; + struct list_head *h = queue_peek(q); + return h ? container_of(h, struct entry, list) : NULL; } /* @@ -556,7 +625,8 @@ * of the entries. * * At the moment the threshold is taken by averaging the hit counts of some - * of the entries in the cache (the first 20 entries of the first level). + * of the entries in the cache (the first 20 entries across all levels in + * ascending order, giving preference to the clean entries at each level). * * We can be much cleverer than this though. For example, each promotion * could bump up the threshold helping to prevent churn. Much more to do @@ -571,14 +641,12 @@ struct list_head *head; struct entry *e; - if ((mq->hit_count >= mq->generation_period) && - (mq->nr_cblocks_allocated == from_cblock(mq->cache_size))) { - + if ((mq->hit_count >= mq->generation_period) && (epool_empty(&mq->cache_pool))) { mq->hit_count = 0; mq->generation++; for (level = 0; level < NR_QUEUE_LEVELS && count < MAX_TO_AVERAGE; level++) { - head = mq->cache.qs + level; + head = mq->cache_clean.qs + level; list_for_each_entry(e, head, list) { nr++; total += e->hit_count; @@ -586,11 +654,16 @@ if (++count >= MAX_TO_AVERAGE) break; } - } - mq->promote_threshold = nr ? total / nr : 1; - if (mq->promote_threshold * nr < total) - mq->promote_threshold++; + head = mq->cache_dirty.qs + level; + list_for_each_entry(e, head, list) { + nr++; + total += e->hit_count; + + if (++count >= MAX_TO_AVERAGE) + break; + } + } } } @@ -598,20 +671,9 @@ * Whenever we use an entry we bump up it's hit counter, and push it to the * back to it's current level. */ -static void requeue_and_update_tick(struct mq_policy *mq, struct entry *e) +static void requeue(struct mq_policy *mq, struct entry *e) { - if (updated_this_tick(mq, e)) - return; - - e->hit_count++; - mq->hit_count++; check_generation(mq); - - /* generation adjustment, to stop the counts increasing forever. */ - /* FIXME: divide? */ - /* e->hit_count -= min(e->hit_count - 1, mq->generation - e->generation); */ - e->generation = mq->generation; - del(mq, e); push(mq, e); } @@ -631,19 +693,62 @@ * - set the hit count to a hard coded value other than 1, eg, is it better * if it goes in at level 2? */ -static dm_cblock_t demote_cblock(struct mq_policy *mq, dm_oblock_t *oblock) +static int demote_cblock(struct mq_policy *mq, + struct policy_locker *locker, dm_oblock_t *oblock) { - dm_cblock_t result; - struct entry *demoted = pop(mq, &mq->cache); + struct entry *demoted = peek(&mq->cache_clean); + + if (!demoted) + /* + * We could get a block from mq->cache_dirty, but that + * would add extra latency to the triggering bio as it + * waits for the writeback. Better to not promote this + * time and hope there's a clean block next time this block + * is hit. + */ + return -ENOSPC; - BUG_ON(!demoted); - result = demoted->cblock; + if (locker->fn(locker, demoted->oblock)) + /* + * We couldn't lock the demoted block. + */ + return -EBUSY; + + del(mq, demoted); *oblock = demoted->oblock; - demoted->in_cache = false; - demoted->hit_count = 1; - push(mq, demoted); + free_entry(&mq->cache_pool, demoted); - return result; + /* + * We used to put the demoted block into the pre-cache, but I think + * it's simpler to just let it work it's way up from zero again. + * Stops blocks flickering in and out of the cache. + */ + + return 0; +} + +/* + * Entries in the pre_cache whose hit count passes the promotion + * threshold move to the cache proper. Working out the correct + * value for the promotion_threshold is crucial to this policy. + */ +static unsigned promote_threshold(struct mq_policy *mq) +{ + struct entry *e; + + if (any_free_cblocks(mq)) + return 0; + + e = peek(&mq->cache_clean); + if (e) + return e->hit_count; + + e = peek(&mq->cache_dirty); + if (e) + return e->hit_count + DISCOURAGE_DEMOTING_DIRTY_THRESHOLD; + + /* This should never happen */ + return 0; } /* @@ -655,24 +760,21 @@ * We bias towards reads, since they can be demoted at no cost if they * haven't been dirtied. */ -#define DISCARDED_PROMOTE_THRESHOLD 1 -#define READ_PROMOTE_THRESHOLD 4 -#define WRITE_PROMOTE_THRESHOLD 8 - static unsigned adjusted_promote_threshold(struct mq_policy *mq, bool discarded_oblock, int data_dir) { - if (discarded_oblock && any_free_cblocks(mq) && data_dir == WRITE) + if (data_dir == READ) + return promote_threshold(mq) + mq->read_promote_adjustment; + + if (discarded_oblock && (any_free_cblocks(mq) || any_clean_cblocks(mq))) { /* * We don't need to do any copying at all, so give this a - * very low threshold. In practice this only triggers - * during initial population after a format. + * very low threshold. */ - return DISCARDED_PROMOTE_THRESHOLD; + return mq->discard_promote_adjustment; + } - return data_dir == READ ? - (mq->promote_threshold + READ_PROMOTE_THRESHOLD) : - (mq->promote_threshold + WRITE_PROMOTE_THRESHOLD); + return promote_threshold(mq) + mq->write_promote_adjustment; } static bool should_promote(struct mq_policy *mq, struct entry *e, @@ -686,56 +788,73 @@ struct entry *e, struct policy_result *result) { - requeue_and_update_tick(mq, e); + requeue(mq, e); - if (e->in_cache) { + if (in_cache(mq, e)) { result->op = POLICY_HIT; - result->cblock = e->cblock; + result->cblock = infer_cblock(&mq->cache_pool, e); } return 0; } /* - * Moves and entry from the pre_cache to the cache. The main work is + * Moves an entry from the pre_cache to the cache. The main work is * finding which cache block to use. */ static int pre_cache_to_cache(struct mq_policy *mq, struct entry *e, + struct policy_locker *locker, struct policy_result *result) { - dm_cblock_t cblock; + int r; + struct entry *new_e; - if (find_free_cblock(mq, &cblock) == -ENOSPC) { + /* Ensure there's a free cblock in the cache */ + if (epool_empty(&mq->cache_pool)) { result->op = POLICY_REPLACE; - cblock = demote_cblock(mq, &result->old_oblock); + r = demote_cblock(mq, locker, &result->old_oblock); + if (r) { + result->op = POLICY_MISS; + return 0; + } + } else result->op = POLICY_NEW; - result->cblock = e->cblock = cblock; + new_e = alloc_entry(&mq->cache_pool); + BUG_ON(!new_e); + + new_e->oblock = e->oblock; + new_e->dirty = false; + new_e->hit_count = e->hit_count; del(mq, e); - e->in_cache = true; - push(mq, e); + free_entry(&mq->pre_cache_pool, e); + push(mq, new_e); + + result->cblock = infer_cblock(&mq->cache_pool, new_e); return 0; } static int pre_cache_entry_found(struct mq_policy *mq, struct entry *e, bool can_migrate, bool discarded_oblock, - int data_dir, struct policy_result *result) + int data_dir, struct policy_locker *locker, + struct policy_result *result) { int r = 0; - bool updated = updated_this_tick(mq, e); - requeue_and_update_tick(mq, e); - - if ((!discarded_oblock && updated) || - !should_promote(mq, e, discarded_oblock, data_dir)) + if (!should_promote(mq, e, discarded_oblock, data_dir)) { + requeue(mq, e); result->op = POLICY_MISS; - else if (!can_migrate) + + } else if (!can_migrate) r = -EWOULDBLOCK; - else - r = pre_cache_to_cache(mq, e, result); + + else { + requeue(mq, e); + r = pre_cache_to_cache(mq, e, locker, result); + } return r; } @@ -743,7 +862,7 @@ static void insert_in_pre_cache(struct mq_policy *mq, dm_oblock_t oblock) { - struct entry *e = alloc_entry(mq); + struct entry *e = alloc_entry(&mq->pre_cache_pool); if (!e) /* @@ -757,49 +876,55 @@ return; } - e->in_cache = false; + e->dirty = false; e->oblock = oblock; e->hit_count = 1; - e->generation = mq->generation; push(mq, e); } static void insert_in_cache(struct mq_policy *mq, dm_oblock_t oblock, + struct policy_locker *locker, struct policy_result *result) { + int r; struct entry *e; - dm_cblock_t cblock; - if (find_free_cblock(mq, &cblock) == -ENOSPC) { - result->op = POLICY_MISS; - insert_in_pre_cache(mq, oblock); - return; - } + if (epool_empty(&mq->cache_pool)) { + result->op = POLICY_REPLACE; + r = demote_cblock(mq, locker, &result->old_oblock); + if (unlikely(r)) { + result->op = POLICY_MISS; + insert_in_pre_cache(mq, oblock); + return; + } - e = alloc_entry(mq); - if (unlikely(!e)) { - result->op = POLICY_MISS; - return; + /* + * This will always succeed, since we've just demoted. + */ + e = alloc_entry(&mq->cache_pool); + BUG_ON(!e); + + } else { + e = alloc_entry(&mq->cache_pool); + result->op = POLICY_NEW; } e->oblock = oblock; - e->cblock = cblock; - e->in_cache = true; + e->dirty = false; e->hit_count = 1; - e->generation = mq->generation; push(mq, e); - result->op = POLICY_NEW; - result->cblock = e->cblock; + result->cblock = infer_cblock(&mq->cache_pool, e); } static int no_entry_found(struct mq_policy *mq, dm_oblock_t oblock, bool can_migrate, bool discarded_oblock, - int data_dir, struct policy_result *result) + int data_dir, struct policy_locker *locker, + struct policy_result *result) { - if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) == 1) { + if (adjusted_promote_threshold(mq, discarded_oblock, data_dir) <= 1) { if (can_migrate) - insert_in_cache(mq, oblock, result); + insert_in_cache(mq, oblock, locker, result); else return -EWOULDBLOCK; } else { @@ -816,21 +941,26 @@ */ static int map(struct mq_policy *mq, dm_oblock_t oblock, bool can_migrate, bool discarded_oblock, - int data_dir, struct policy_result *result) + int data_dir, struct policy_locker *locker, + struct policy_result *result) { int r = 0; struct entry *e = hash_lookup(mq, oblock); - if (e && e->in_cache) + if (e && in_cache(mq, e)) r = cache_entry_found(mq, e, result); - else if (iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL) + + else if (mq->tracker.thresholds[PATTERN_SEQUENTIAL] && + iot_pattern(&mq->tracker) == PATTERN_SEQUENTIAL) result->op = POLICY_MISS; + else if (e) r = pre_cache_entry_found(mq, e, can_migrate, discarded_oblock, - data_dir, result); + data_dir, locker, result); + else r = no_entry_found(mq, oblock, can_migrate, discarded_oblock, - data_dir, result); + data_dir, locker, result); if (r == -EWOULDBLOCK) result->op = POLICY_MISS; @@ -854,24 +984,50 @@ { struct mq_policy *mq = to_mq_policy(p); - free_bitset(mq->allocation_bitset); - kfree(mq->table); - free_entries(mq); + vfree(mq->table); + epool_exit(&mq->cache_pool); + epool_exit(&mq->pre_cache_pool); kfree(mq); } +static void update_pre_cache_hits(struct list_head *h, void *context) +{ + struct entry *e = container_of(h, struct entry, list); + e->hit_count++; +} + +static void update_cache_hits(struct list_head *h, void *context) +{ + struct mq_policy *mq = context; + struct entry *e = container_of(h, struct entry, list); + e->hit_count++; + mq->hit_count++; +} + static void copy_tick(struct mq_policy *mq) { - unsigned long flags; + unsigned long flags, tick; spin_lock_irqsave(&mq->tick_lock, flags); - mq->tick = mq->tick_protected; + tick = mq->tick_protected; + if (tick != mq->tick) { + queue_iterate_tick(&mq->pre_cache, update_pre_cache_hits, mq); + queue_iterate_tick(&mq->cache_dirty, update_cache_hits, mq); + queue_iterate_tick(&mq->cache_clean, update_cache_hits, mq); + mq->tick = tick; + } + + queue_tick(&mq->pre_cache); + queue_tick(&mq->cache_dirty); + queue_tick(&mq->cache_clean); + queue_update_writeback_sentinels(&mq->cache_dirty); spin_unlock_irqrestore(&mq->tick_lock, flags); } static int mq_map(struct dm_cache_policy *p, dm_oblock_t oblock, bool can_block, bool can_migrate, bool discarded_oblock, - struct bio *bio, struct policy_result *result) + struct bio *bio, struct policy_locker *locker, + struct policy_result *result) { int r; struct mq_policy *mq = to_mq_policy(p); @@ -887,7 +1043,7 @@ iot_examine_bio(&mq->tracker, bio); r = map(mq, oblock, can_migrate, discarded_oblock, - bio_data_dir(bio), result); + bio_data_dir(bio), locker, result); mutex_unlock(&mq->lock); @@ -904,8 +1060,8 @@ return -EWOULDBLOCK; e = hash_lookup(mq, oblock); - if (e && e->in_cache) { - *cblock = e->cblock; + if (e && in_cache(mq, e)) { + *cblock = infer_cblock(&mq->cache_pool, e); r = 0; } else r = -ENOENT; @@ -915,6 +1071,36 @@ return r; } +static void __mq_set_clear_dirty(struct mq_policy *mq, dm_oblock_t oblock, bool set) +{ + struct entry *e; + + e = hash_lookup(mq, oblock); + BUG_ON(!e || !in_cache(mq, e)); + + del(mq, e); + e->dirty = set; + push(mq, e); +} + +static void mq_set_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) +{ + struct mq_policy *mq = to_mq_policy(p); + + mutex_lock(&mq->lock); + __mq_set_clear_dirty(mq, oblock, true); + mutex_unlock(&mq->lock); +} + +static void mq_clear_dirty(struct dm_cache_policy *p, dm_oblock_t oblock) +{ + struct mq_policy *mq = to_mq_policy(p); + + mutex_lock(&mq->lock); + __mq_set_clear_dirty(mq, oblock, false); + mutex_unlock(&mq->lock); +} + static int mq_load_mapping(struct dm_cache_policy *p, dm_oblock_t oblock, dm_cblock_t cblock, uint32_t hint, bool hint_valid) @@ -922,52 +1108,64 @@ struct mq_policy *mq = to_mq_policy(p); struct entry *e; - e = alloc_entry(mq); - if (!e) - return -ENOMEM; - - e->cblock = cblock; + e = alloc_particular_entry(&mq->cache_pool, cblock); e->oblock = oblock; - e->in_cache = true; + e->dirty = false; /* this gets corrected in a minute */ e->hit_count = hint_valid ? hint : 1; - e->generation = mq->generation; push(mq, e); return 0; } +static int mq_save_hints(struct mq_policy *mq, struct queue *q, + policy_walk_fn fn, void *context) +{ + int r; + unsigned level; + struct list_head *h; + struct entry *e; + + for (level = 0; level < NR_QUEUE_LEVELS; level++) + list_for_each(h, q->qs + level) { + if (is_sentinel(q, h)) + continue; + + e = container_of(h, struct entry, list); + r = fn(context, infer_cblock(&mq->cache_pool, e), + e->oblock, e->hit_count); + if (r) + return r; + } + + return 0; +} + static int mq_walk_mappings(struct dm_cache_policy *p, policy_walk_fn fn, void *context) { struct mq_policy *mq = to_mq_policy(p); int r = 0; - struct entry *e; - unsigned level; mutex_lock(&mq->lock); - for (level = 0; level < NR_QUEUE_LEVELS; level++) - list_for_each_entry(e, &mq->cache.qs[level], list) { - r = fn(context, e->cblock, e->oblock, e->hit_count); - if (r) - goto out; - } + r = mq_save_hints(mq, &mq->cache_clean, fn, context); + if (!r) + r = mq_save_hints(mq, &mq->cache_dirty, fn, context); -out: mutex_unlock(&mq->lock); return r; } -static void remove_mapping(struct mq_policy *mq, dm_oblock_t oblock) +static void __remove_mapping(struct mq_policy *mq, dm_oblock_t oblock) { - struct entry *e = hash_lookup(mq, oblock); + struct entry *e; - BUG_ON(!e || !e->in_cache); + e = hash_lookup(mq, oblock); + BUG_ON(!e || !in_cache(mq, e)); del(mq, e); - e->in_cache = false; - push(mq, e); + free_entry(&mq->cache_pool, e); } static void mq_remove_mapping(struct dm_cache_policy *p, dm_oblock_t oblock) @@ -975,20 +1173,92 @@ struct mq_policy *mq = to_mq_policy(p); mutex_lock(&mq->lock); - remove_mapping(mq, oblock); + __remove_mapping(mq, oblock); mutex_unlock(&mq->lock); } -static void force_mapping(struct mq_policy *mq, - dm_oblock_t current_oblock, dm_oblock_t new_oblock) +static int __remove_cblock(struct mq_policy *mq, dm_cblock_t cblock) { - struct entry *e = hash_lookup(mq, current_oblock); + struct entry *e = epool_find(&mq->cache_pool, cblock); - BUG_ON(!e || !e->in_cache); + if (!e) + return -ENODATA; del(mq, e); - e->oblock = new_oblock; + free_entry(&mq->cache_pool, e); + + return 0; +} + +static int mq_remove_cblock(struct dm_cache_policy *p, dm_cblock_t cblock) +{ + int r; + struct mq_policy *mq = to_mq_policy(p); + + mutex_lock(&mq->lock); + r = __remove_cblock(mq, cblock); + mutex_unlock(&mq->lock); + + return r; +} + +#define CLEAN_TARGET_PERCENTAGE 25 + +static bool clean_target_met(struct mq_policy *mq) +{ + /* + * Cache entries may not be populated. So we're cannot rely on the + * size of the clean queue. + */ + unsigned nr_clean = from_cblock(mq->cache_size) - queue_size(&mq->cache_dirty); + unsigned target = from_cblock(mq->cache_size) * CLEAN_TARGET_PERCENTAGE / 100; + + return nr_clean >= target; +} + +static int __mq_writeback_work(struct mq_policy *mq, dm_oblock_t *oblock, + dm_cblock_t *cblock) +{ + struct entry *e = pop_old(mq, &mq->cache_dirty); + + if (!e && !clean_target_met(mq)) + e = pop(mq, &mq->cache_dirty); + + if (!e) + return -ENODATA; + + *oblock = e->oblock; + *cblock = infer_cblock(&mq->cache_pool, e); + e->dirty = false; push(mq, e); + + return 0; +} + +static int mq_writeback_work(struct dm_cache_policy *p, dm_oblock_t *oblock, + dm_cblock_t *cblock, bool critical_only) +{ + int r; + struct mq_policy *mq = to_mq_policy(p); + + mutex_lock(&mq->lock); + r = __mq_writeback_work(mq, oblock, cblock); + mutex_unlock(&mq->lock); + + return r; +} + +static void __force_mapping(struct mq_policy *mq, + dm_oblock_t current_oblock, dm_oblock_t new_oblock) +{ + struct entry *e = hash_lookup(mq, current_oblock); + + if (e && in_cache(mq, e)) { + del(mq, e); + e->oblock = new_oblock; + e->dirty = true; + push(mq, e); + } } static void mq_force_mapping(struct dm_cache_policy *p, @@ -997,19 +1267,23 @@ struct mq_policy *mq = to_mq_policy(p); mutex_lock(&mq->lock); - force_mapping(mq, current_oblock, new_oblock); + __force_mapping(mq, current_oblock, new_oblock); mutex_unlock(&mq->lock); } static dm_cblock_t mq_residency(struct dm_cache_policy *p) { + dm_cblock_t r; struct mq_policy *mq = to_mq_policy(p); - /* FIXME: lock mutex, not sure we can block here */ - return to_cblock(mq->nr_cblocks_allocated); + mutex_lock(&mq->lock); + r = to_cblock(mq->cache_pool.nr_allocated); + mutex_unlock(&mq->lock); + + return r; } -static void mq_tick(struct dm_cache_policy *p) +static void mq_tick(struct dm_cache_policy *p, bool can_block) { struct mq_policy *mq = to_mq_policy(p); unsigned long flags; @@ -1017,39 +1291,62 @@ spin_lock_irqsave(&mq->tick_lock, flags); mq->tick_protected++; spin_unlock_irqrestore(&mq->tick_lock, flags); + + if (can_block) { + mutex_lock(&mq->lock); + copy_tick(mq); + mutex_unlock(&mq->lock); + } } static int mq_set_config_value(struct dm_cache_policy *p, const char *key, const char *value) { struct mq_policy *mq = to_mq_policy(p); - enum io_pattern pattern; unsigned long tmp; - if (!strcasecmp(key, "random_threshold")) - pattern = PATTERN_RANDOM; - else if (!strcasecmp(key, "sequential_threshold")) - pattern = PATTERN_SEQUENTIAL; - else - return -EINVAL; - if (kstrtoul(value, 10, &tmp)) return -EINVAL; - mq->tracker.thresholds[pattern] = tmp; + if (!strcasecmp(key, "random_threshold")) { + mq->tracker.thresholds[PATTERN_RANDOM] = tmp; + + } else if (!strcasecmp(key, "sequential_threshold")) { + mq->tracker.thresholds[PATTERN_SEQUENTIAL] = tmp; + + } else if (!strcasecmp(key, "discard_promote_adjustment")) + mq->discard_promote_adjustment = tmp; + + else if (!strcasecmp(key, "read_promote_adjustment")) + mq->read_promote_adjustment = tmp; + + else if (!strcasecmp(key, "write_promote_adjustment")) + mq->write_promote_adjustment = tmp; + + else + return -EINVAL; return 0; } -static int mq_emit_config_values(struct dm_cache_policy *p, char *result, unsigned maxlen) +static int mq_emit_config_values(struct dm_cache_policy *p, char *result, + unsigned maxlen, ssize_t *sz_ptr) { - ssize_t sz = 0; + ssize_t sz = *sz_ptr; struct mq_policy *mq = to_mq_policy(p); - DMEMIT("4 random_threshold %u sequential_threshold %u", + DMEMIT("10 random_threshold %u " + "sequential_threshold %u " + "discard_promote_adjustment %u " + "read_promote_adjustment %u " + "write_promote_adjustment %u ", mq->tracker.thresholds[PATTERN_RANDOM], - mq->tracker.thresholds[PATTERN_SEQUENTIAL]); + mq->tracker.thresholds[PATTERN_SEQUENTIAL], + mq->discard_promote_adjustment, + mq->read_promote_adjustment, + mq->write_promote_adjustment); + *sz_ptr = sz; return 0; } @@ -1059,10 +1356,13 @@ mq->policy.destroy = mq_destroy; mq->policy.map = mq_map; mq->policy.lookup = mq_lookup; + mq->policy.set_dirty = mq_set_dirty; + mq->policy.clear_dirty = mq_clear_dirty; mq->policy.load_mapping = mq_load_mapping; mq->policy.walk_mappings = mq_walk_mappings; mq->policy.remove_mapping = mq_remove_mapping; - mq->policy.writeback_work = NULL; + mq->policy.remove_cblock = mq_remove_cblock; + mq->policy.writeback_work = mq_writeback_work; mq->policy.force_mapping = mq_force_mapping; mq->policy.residency = mq_residency; mq->policy.tick = mq_tick; @@ -1074,7 +1374,6 @@ sector_t origin_size, sector_t cache_block_size) { - int r; struct mq_policy *mq = kzalloc(sizeof(*mq), GFP_KERNEL); if (!mq) @@ -1082,47 +1381,47 @@ init_policy_functions(mq); iot_init(&mq->tracker, SEQUENTIAL_THRESHOLD_DEFAULT, RANDOM_THRESHOLD_DEFAULT); - mq->cache_size = cache_size; + + if (epool_init(&mq->pre_cache_pool, from_cblock(cache_size))) { + DMERR("couldn't initialize pool of pre-cache entries"); + goto bad_pre_cache_init; + } + + if (epool_init(&mq->cache_pool, from_cblock(cache_size))) { + DMERR("couldn't initialize pool of cache entries"); + goto bad_cache_init; + } + mq->tick_protected = 0; mq->tick = 0; mq->hit_count = 0; mq->generation = 0; - mq->promote_threshold = 0; + mq->discard_promote_adjustment = DEFAULT_DISCARD_PROMOTE_ADJUSTMENT; + mq->read_promote_adjustment = DEFAULT_READ_PROMOTE_ADJUSTMENT; + mq->write_promote_adjustment = DEFAULT_WRITE_PROMOTE_ADJUSTMENT; mutex_init(&mq->lock); spin_lock_init(&mq->tick_lock); - mq->find_free_nr_words = dm_div_up(from_cblock(mq->cache_size), BITS_PER_LONG); - mq->find_free_last_word = 0; queue_init(&mq->pre_cache); - queue_init(&mq->cache); - mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U); - - mq->nr_entries = 2 * from_cblock(cache_size); - r = alloc_entries(mq, mq->nr_entries); - if (r) - goto bad_cache_alloc; + queue_init(&mq->cache_clean); + queue_init(&mq->cache_dirty); - mq->nr_entries_allocated = 0; - mq->nr_cblocks_allocated = 0; + mq->generation_period = max((unsigned) from_cblock(cache_size), 1024U); mq->nr_buckets = next_power(from_cblock(cache_size) / 2, 16); - mq->hash_bits = ffs(mq->nr_buckets) - 1; - mq->table = kzalloc(sizeof(*mq->table) * mq->nr_buckets, GFP_KERNEL); + mq->hash_bits = __ffs(mq->nr_buckets); + mq->table = vzalloc(sizeof(*mq->table) * mq->nr_buckets); if (!mq->table) goto bad_alloc_table; - mq->allocation_bitset = alloc_bitset(from_cblock(cache_size)); - if (!mq->allocation_bitset) - goto bad_alloc_bitset; - return &mq->policy; -bad_alloc_bitset: - kfree(mq->table); bad_alloc_table: - free_entries(mq); -bad_cache_alloc: + epool_exit(&mq->cache_pool); +bad_cache_init: + epool_exit(&mq->pre_cache_pool); +bad_pre_cache_init: kfree(mq); return NULL; @@ -1132,15 +1431,7 @@ static struct dm_cache_policy_type mq_policy_type = { .name = "mq", - .version = {1, 0, 0}, - .hint_size = 4, - .owner = THIS_MODULE, - .create = mq_create -}; - -static struct dm_cache_policy_type default_policy_type = { - .name = "default", - .version = {1, 0, 0}, + .version = {1, 4, 0}, .hint_size = 4, .owner = THIS_MODULE, .create = mq_create @@ -1155,36 +1446,21 @@ __alignof__(struct entry), 0, NULL); if (!mq_entry_cache) - goto bad; + return -ENOMEM; r = dm_cache_policy_register(&mq_policy_type); if (r) { DMERR("register failed %d", r); - goto bad_register_mq; - } - - r = dm_cache_policy_register(&default_policy_type); - if (!r) { - DMINFO("version %u.%u.%u loaded", - mq_policy_type.version[0], - mq_policy_type.version[1], - mq_policy_type.version[2]); - return 0; + kmem_cache_destroy(mq_entry_cache); + return -ENOMEM; } - DMERR("register failed (as default) %d", r); - - dm_cache_policy_unregister(&mq_policy_type); -bad_register_mq: - kmem_cache_destroy(mq_entry_cache); -bad: - return -ENOMEM; + return 0; } static void __exit mq_exit(void) { dm_cache_policy_unregister(&mq_policy_type); - dm_cache_policy_unregister(&default_policy_type); kmem_cache_destroy(mq_entry_cache); } @@ -1195,5 +1471,3 @@ MODULE_AUTHOR("Joe Thornber "); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("mq cache policy"); - -MODULE_ALIAS("dm-cache-default");