// SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Kent Overstreet * * Uses a block device as cache for other block devices; optimized for SSDs. * All allocation is done in buckets, which should match the erase block size * of the device. * * Buckets containing cached data are kept on a heap sorted by priority; * bucket priority is increased on cache hit, and periodically all the buckets * on the heap have their priority scaled down. This currently is just used as * an LRU but in the future should allow for more intelligent heuristics. * * Buckets have an 8 bit counter; freeing is accomplished by incrementing the * counter. Garbage collection is used to remove stale pointers. * * Indexing is done via a btree; nodes are not necessarily fully sorted, rather * as keys are inserted we only sort the pages that have not yet been written. * When garbage collection is run, we resort the entire node. * * All configuration is done via sysfs; see Documentation/admin-guide/bcache.rst. */ #include "bcache.h" #include "btree.h" #include "debug.h" #include "extents.h" #include #include #include #include #include #include #include #include #include #include #include /* * Todo: * register_bcache: Return errors out to userspace correctly * * Writeback: don't undirty key until after a cache flush * * Create an iterator for key pointers * * On btree write error, mark bucket such that it won't be freed from the cache * * Journalling: * Check for bad keys in replay * Propagate barriers * Refcount journal entries in journal_replay * * Garbage collection: * Finish incremental gc * Gc should free old UUIDs, data for invalid UUIDs * * Provide a way to list backing device UUIDs we have data cached for, and * probably how long it's been since we've seen them, and a way to invalidate * dirty data for devices that will never be attached again * * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so * that based on that and how much dirty data we have we can keep writeback * from being starved * * Add a tracepoint or somesuch to watch for writeback starvation * * When btree depth > 1 and splitting an interior node, we have to make sure * alloc_bucket() cannot fail. This should be true but is not completely * obvious. * * Plugging? * * If data write is less than hard sector size of ssd, round up offset in open * bucket to the next whole sector * * Superblock needs to be fleshed out for multiple cache devices * * Add a sysfs tunable for the number of writeback IOs in flight * * Add a sysfs tunable for the number of open data buckets * * IO tracking: Can we track when one process is doing io on behalf of another? * IO tracking: Don't use just an average, weigh more recent stuff higher * * Test module load/unload */ #define MAX_NEED_GC 64 #define MAX_SAVE_PRIO 72 #define MAX_GC_TIMES 100 #define MIN_GC_NODES 100 #define GC_SLEEP_MS 100 #define PTR_DIRTY_BIT (((uint64_t) 1 << 36)) #define PTR_HASH(c, k) \ (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) static struct workqueue_struct *btree_io_wq; #define insert_lock(s, b) ((b)->level <= (s)->lock) static inline struct bset *write_block(struct btree *b) { return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c->cache); } static void bch_btree_init_next(struct btree *b) { /* If not a leaf node, always sort */ if (b->level && b->keys.nsets) bch_btree_sort(&b->keys, &b->c->sort); else bch_btree_sort_lazy(&b->keys, &b->c->sort); if (b->written < btree_blocks(b)) bch_bset_init_next(&b->keys, write_block(b), bset_magic(&b->c->cache->sb)); } /* Btree key manipulation */ void bkey_put(struct cache_set *c, struct bkey *k) { unsigned int i; for (i = 0; i < KEY_PTRS(k); i++) if (ptr_available(c, k, i)) atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); } /* Btree IO */ static uint64_t btree_csum_set(struct btree *b, struct bset *i) { uint64_t crc = b->key.ptr[0]; void *data = (void *) i + 8, *end = bset_bkey_last(i); crc = bch_crc64_update(crc, data, end - data); return crc ^ 0xffffffffffffffffULL; } void bch_btree_node_read_done(struct btree *b) { const char *err = "bad btree header"; struct bset *i = btree_bset_first(b); struct btree_iter *iter; /* * c->fill_iter can allocate an iterator with more memory space * than static MAX_BSETS. * See the comment arount cache_set->fill_iter. */ iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO); iter->size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size; iter->used = 0; #ifdef CONFIG_BCACHE_DEBUG iter->b = &b->keys; #endif if (!i->seq) goto err; for (; b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq; i = write_block(b)) { err = "unsupported bset version"; if (i->version > BCACHE_BSET_VERSION) goto err; err = "bad btree header"; if (b->written + set_blocks(i, block_bytes(b->c->cache)) > btree_blocks(b)) goto err; err = "bad magic"; if (i->magic != bset_magic(&b->c->cache->sb)) goto err; err = "bad checksum"; switch (i->version) { case 0: if (i->csum != csum_set(i)) goto err; break; case BCACHE_BSET_VERSION: if (i->csum != btree_csum_set(b, i)) goto err; break; } err = "empty set"; if (i != b->keys.set[0].data && !i->keys) goto err; bch_btree_iter_push(iter, i->start, bset_bkey_last(i)); b->written += set_blocks(i, block_bytes(b->c->cache)); } err = "corrupted btree"; for (i = write_block(b); bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key); i = ((void *) i) + block_bytes(b->c->cache)) if (i->seq == b->keys.set[0].data->seq) goto err; bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort); i = b->keys.set[0].data; err = "short btree key"; if (b->keys.set[0].size && bkey_cmp(&b->key, &b->keys.set[0].end) < 0) goto err; if (b->written < btree_blocks(b)) bch_bset_init_next(&b->keys, write_block(b), bset_magic(&b->c->cache->sb)); out: mempool_free(iter, &b->c->fill_iter); return; err: set_btree_node_io_error(b); bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys", err, PTR_BUCKET_NR(b->c, &b->key, 0), bset_block_offset(b, i), i->keys); goto out; } static void btree_node_read_endio(struct bio *bio) { struct closure *cl = bio->bi_private; closure_put(cl); } static void bch_btree_node_read(struct btree *b) { uint64_t start_time = local_clock(); struct closure cl; struct bio *bio; trace_bcache_btree_read(b); closure_init_stack(&cl); bio = bch_bbio_alloc(b->c); bio->bi_iter.bi_size = KEY_SIZE(&b->key) << 9; bio->bi_end_io = btree_node_read_endio; bio->bi_private = &cl; bio->bi_opf = REQ_OP_READ | REQ_META; bch_bio_map(bio, b->keys.set[0].data); bch_submit_bbio(bio, b->c, &b->key, 0); closure_sync(&cl); if (bio->bi_status) set_btree_node_io_error(b); bch_bbio_free(bio, b->c); if (btree_node_io_error(b)) goto err; bch_btree_node_read_done(b); bch_time_stats_update(&b->c->btree_read_time, start_time); return; err: bch_cache_set_error(b->c, "io error reading bucket %zu", PTR_BUCKET_NR(b->c, &b->key, 0)); } static void btree_complete_write(struct btree *b, struct btree_write *w) { if (w->prio_blocked && !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked)) wake_up_allocators(b->c); if (w->journal) { atomic_dec_bug(w->journal); __closure_wake_up(&b->c->journal.wait); } w->prio_blocked = 0; w->journal = NULL; } static void btree_node_write_unlock(struct closure *cl) { struct btree *b = container_of(cl, struct btree, io); up(&b->io_mutex); } static void __btree_node_write_done(struct closure *cl) { struct btree *b = container_of(cl, struct btree, io); struct btree_write *w = btree_prev_write(b); bch_bbio_free(b->bio, b->c); b->bio = NULL; btree_complete_write(b, w); if (btree_node_dirty(b)) queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); closure_return_with_destructor(cl, btree_node_write_unlock); } static void btree_node_write_done(struct closure *cl) { struct btree *b = container_of(cl, struct btree, io); bio_free_pages(b->bio); __btree_node_write_done(cl); } static void btree_node_write_endio(struct bio *bio) { struct closure *cl = bio->bi_private; struct btree *b = container_of(cl, struct btree, io); if (bio->bi_status) set_btree_node_io_error(b); bch_bbio_count_io_errors(b->c, bio, bio->bi_status, "writing btree"); closure_put(cl); } static void do_btree_node_write(struct btree *b) { struct closure *cl = &b->io; struct bset *i = btree_bset_last(b); BKEY_PADDED(key) k; i->version = BCACHE_BSET_VERSION; i->csum = btree_csum_set(b, i); BUG_ON(b->bio); b->bio = bch_bbio_alloc(b->c); b->bio->bi_end_io = btree_node_write_endio; b->bio->bi_private = cl; b->bio->bi_iter.bi_size = roundup(set_bytes(i), block_bytes(b->c->cache)); b->bio->bi_opf = REQ_OP_WRITE | REQ_META | REQ_FUA; bch_bio_map(b->bio, i); /* * If we're appending to a leaf node, we don't technically need FUA - * this write just needs to be persisted before the next journal write, * which will be marked FLUSH|FUA. * * Similarly if we're writing a new btree root - the pointer is going to * be in the next journal entry. * * But if we're writing a new btree node (that isn't a root) or * appending to a non leaf btree node, we need either FUA or a flush * when we write the parent with the new pointer. FUA is cheaper than a * flush, and writes appending to leaf nodes aren't blocking anything so * just make all btree node writes FUA to keep things sane. */ bkey_copy(&k.key, &b->key); SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_sector_offset(&b->keys, i)); if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) { struct bio_vec *bv; void *addr = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); struct bvec_iter_all iter_all; bio_for_each_segment_all(bv, b->bio, iter_all) { memcpy(page_address(bv->bv_page), addr, PAGE_SIZE); addr += PAGE_SIZE; } bch_submit_bbio(b->bio, b->c, &k.key, 0); continue_at(cl, btree_node_write_done, NULL); } else { /* * No problem for multipage bvec since the bio is * just allocated */ b->bio->bi_vcnt = 0; bch_bio_map(b->bio, i); bch_submit_bbio(b->bio, b->c, &k.key, 0); closure_sync(cl); continue_at_nobarrier(cl, __btree_node_write_done, NULL); } } void __bch_btree_node_write(struct btree *b, struct closure *parent) { struct bset *i = btree_bset_last(b); lockdep_assert_held(&b->write_lock); trace_bcache_btree_write(b); BUG_ON(current->bio_list); BUG_ON(b->written >= btree_blocks(b)); BUG_ON(b->written && !i->keys); BUG_ON(btree_bset_first(b)->seq != i->seq); bch_check_keys(&b->keys, "writing"); cancel_delayed_work(&b->work); /* If caller isn't waiting for write, parent refcount is cache set */ down(&b->io_mutex); closure_init(&b->io, parent ?: &b->c->cl); clear_bit(BTREE_NODE_dirty, &b->flags); change_bit(BTREE_NODE_write_idx, &b->flags); do_btree_node_write(b); atomic_long_add(set_blocks(i, block_bytes(b->c->cache)) * b->c->cache->sb.block_size, &b->c->cache->btree_sectors_written); b->written += set_blocks(i, block_bytes(b->c->cache)); } void bch_btree_node_write(struct btree *b, struct closure *parent) { unsigned int nsets = b->keys.nsets; lockdep_assert_held(&b->lock); __bch_btree_node_write(b, parent); /* * do verify if there was more than one set initially (i.e. we did a * sort) and we sorted down to a single set: */ if (nsets && !b->keys.nsets) bch_btree_verify(b); bch_btree_init_next(b); } static void bch_btree_node_write_sync(struct btree *b) { struct closure cl; closure_init_stack(&cl); mutex_lock(&b->write_lock); bch_btree_node_write(b, &cl); mutex_unlock(&b->write_lock); closure_sync(&cl); } static void btree_node_write_work(struct work_struct *w) { struct btree *b = container_of(to_delayed_work(w), struct btree, work); mutex_lock(&b->write_lock); if (btree_node_dirty(b)) __bch_btree_node_write(b, NULL); mutex_unlock(&b->write_lock); } static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) { struct bset *i = btree_bset_last(b); struct btree_write *w = btree_current_write(b); lockdep_assert_held(&b->write_lock); BUG_ON(!b->written); BUG_ON(!i->keys); if (!btree_node_dirty(b)) queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); set_btree_node_dirty(b); /* * w->journal is always the oldest journal pin of all bkeys * in the leaf node, to make sure the oldest jset seq won't * be increased before this btree node is flushed. */ if (journal_ref) { if (w->journal && journal_pin_cmp(b->c, w->journal, journal_ref)) { atomic_dec_bug(w->journal); w->journal = NULL; } if (!w->journal) { w->journal = journal_ref; atomic_inc(w->journal); } } /* Force write if set is too big */ if (set_bytes(i) > PAGE_SIZE - 48 && !current->bio_list) bch_btree_node_write(b, NULL); } /* * Btree in memory cache - allocation/freeing * mca -> memory cache */ #define mca_reserve(c) (((!IS_ERR_OR_NULL(c->root) && c->root->level) \ ? c->root->level : 1) * 8 + 16) #define mca_can_free(c) \ max_t(int, 0, c->btree_cache_used - mca_reserve(c)) static void mca_data_free(struct btree *b) { BUG_ON(b->io_mutex.count != 1); bch_btree_keys_free(&b->keys); b->c->btree_cache_used--; list_move(&b->list, &b->c->btree_cache_freed); } static void mca_bucket_free(struct btree *b) { BUG_ON(btree_node_dirty(b)); b->key.ptr[0] = 0; hlist_del_init_rcu(&b->hash); list_move(&b->list, &b->c->btree_cache_freeable); } static unsigned int btree_order(struct bkey *k) { return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1); } static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) { if (!bch_btree_keys_alloc(&b->keys, max_t(unsigned int, ilog2(b->c->btree_pages), btree_order(k)), gfp)) { b->c->btree_cache_used++; list_move(&b->list, &b->c->btree_cache); } else { list_move(&b->list, &b->c->btree_cache_freed); } } static struct btree *mca_bucket_alloc(struct cache_set *c, struct bkey *k, gfp_t gfp) { /* * kzalloc() is necessary here for initialization, * see code comments in bch_btree_keys_init(). */ struct btree *b = kzalloc(sizeof(struct btree), gfp); if (!b) return NULL; init_rwsem(&b->lock); lockdep_set_novalidate_class(&b->lock); mutex_init(&b->write_lock); lockdep_set_novalidate_class(&b->write_lock); INIT_LIST_HEAD(&b->list); INIT_DELAYED_WORK(&b->work, btree_node_write_work); b->c = c; sema_init(&b->io_mutex, 1); mca_data_alloc(b, k, gfp); return b; } static int mca_reap(struct btree *b, unsigned int min_order, bool flush) { struct closure cl; closure_init_stack(&cl); lockdep_assert_held(&b->c->bucket_lock); if (!down_write_trylock(&b->lock)) return -ENOMEM; BUG_ON(btree_node_dirty(b) && !b->keys.set[0].data); if (b->keys.page_order < min_order) goto out_unlock; if (!flush) { if (btree_node_dirty(b)) goto out_unlock; if (down_trylock(&b->io_mutex)) goto out_unlock; up(&b->io_mutex); } retry: /* * BTREE_NODE_dirty might be cleared in btree_flush_btree() by * __bch_btree_node_write(). To avoid an extra flush, acquire * b->write_lock before checking BTREE_NODE_dirty bit. */ mutex_lock(&b->write_lock); /* * If this btree node is selected in btree_flush_write() by journal * code, delay and retry until the node is flushed by journal code * and BTREE_NODE_journal_flush bit cleared by btree_flush_write(). */ if (btree_node_journal_flush(b)) { pr_debug("bnode %p is flushing by journal, retry\n", b); mutex_unlock(&b->write_lock); udelay(1); goto retry; } if (btree_node_dirty(b)) __bch_btree_node_write(b, &cl); mutex_unlock(&b->write_lock); closure_sync(&cl); /* wait for any in flight btree write */ down(&b->io_mutex); up(&b->io_mutex); return 0; out_unlock: rw_unlock(true, b); return -ENOMEM; } static unsigned long bch_mca_scan(struct shrinker *shrink, struct shrink_control *sc) { struct cache_set *c = container_of(shrink, struct cache_set, shrink); struct btree *b, *t; unsigned long i, nr = sc->nr_to_scan; unsigned long freed = 0; unsigned int btree_cache_used; if (c->shrinker_disabled) return SHRINK_STOP; if (c->btree_cache_alloc_lock) return SHRINK_STOP; /* Return -1 if we can't do anything right now */ if (sc->gfp_mask & __GFP_IO) mutex_lock(&c->bucket_lock); else if (!mutex_trylock(&c->bucket_lock)) return -1; /* * It's _really_ critical that we don't free too many btree nodes - we * have to always leave ourselves a reserve. The reserve is how we * guarantee that allocating memory for a new btree node can always * succeed, so that inserting keys into the btree can always succeed and * IO can always make forward progress: */ nr /= c->btree_pages; if (nr == 0) nr = 1; nr = min_t(unsigned long, nr, mca_can_free(c)); i = 0; btree_cache_used = c->btree_cache_used; list_for_each_entry_safe_reverse(b, t, &c->btree_cache_freeable, list) { if (nr <= 0) goto out; if (!mca_reap(b, 0, false)) { mca_data_free(b); rw_unlock(true, b); freed++; } nr--; i++; } list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) { if (nr <= 0 || i >= btree_cache_used) goto out; if (!mca_reap(b, 0, false)) { mca_bucket_free(b); mca_data_free(b); rw_unlock(true, b); freed++; } nr--; i++; } out: mutex_unlock(&c->bucket_lock); return freed * c->btree_pages; } static unsigned long bch_mca_count(struct shrinker *shrink, struct shrink_control *sc) { struct cache_set *c = container_of(shrink, struct cache_set, shrink); if (c->shrinker_disabled) return 0; if (c->btree_cache_alloc_lock) return 0; return mca_can_free(c) * c->btree_pages; } void bch_btree_cache_free(struct cache_set *c) { struct btree *b; struct closure cl; closure_init_stack(&cl); if (c->shrink.list.next) unregister_shrinker(&c->shrink); mutex_lock(&c->bucket_lock); #ifdef CONFIG_BCACHE_DEBUG if (c->verify_data) list_move(&c->verify_data->list, &c->btree_cache); free_pages((unsigned long) c->verify_ondisk, ilog2(meta_bucket_pages(&c->cache->sb))); #endif list_splice(&c->btree_cache_freeable, &c->btree_cache); while (!list_empty(&c->btree_cache)) { b = list_first_entry(&c->btree_cache, struct btree, list); /* * This function is called by cache_set_free(), no I/O * request on cache now, it is unnecessary to acquire * b->write_lock before clearing BTREE_NODE_dirty anymore. */ if (btree_node_dirty(b)) { btree_complete_write(b, btree_current_write(b)); clear_bit(BTREE_NODE_dirty, &b->flags); } mca_data_free(b); } while (!list_empty(&c->btree_cache_freed)) { b = list_first_entry(&c->btree_cache_freed, struct btree, list); list_del(&b->list); cancel_delayed_work_sync(&b->work); kfree(b); } mutex_unlock(&c->bucket_lock); } int bch_btree_cache_alloc(struct cache_set *c) { unsigned int i; for (i = 0; i < mca_reserve(c); i++) if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL)) return -ENOMEM; list_splice_init(&c->btree_cache, &c->btree_cache_freeable); #ifdef CONFIG_BCACHE_DEBUG mutex_init(&c->verify_lock); c->verify_ondisk = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP, ilog2(meta_bucket_pages(&c->cache->sb))); if (!c->verify_ondisk) { /* * Don't worry about the mca_rereserve buckets * allocated in previous for-loop, they will be * handled properly in bch_cache_set_unregister(). */ return -ENOMEM; } c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); if (c->verify_data && c->verify_data->keys.set->data) list_del_init(&c->verify_data->list); else c->verify_data = NULL; #endif c->shrink.count_objects = bch_mca_count; c->shrink.scan_objects = bch_mca_scan; c->shrink.seeks = 4; c->shrink.batch = c->btree_pages * 2; if (register_shrinker(&c->shrink)) pr_warn("bcache: %s: could not register shrinker\n", __func__); return 0; } /* Btree in memory cache - hash table */ static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k) { return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)]; } static struct btree *mca_find(struct cache_set *c, struct bkey *k) { struct btree *b; rcu_read_lock(); hlist_for_each_entry_rcu(b, mca_hash(c, k), hash) if (PTR_HASH(c, &b->key) == PTR_HASH(c, k)) goto out; b = NULL; out: rcu_read_unlock(); return b; } static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op) { spin_lock(&c->btree_cannibalize_lock); if (likely(c->btree_cache_alloc_lock == NULL)) { c->btree_cache_alloc_lock = current; } else if (c->btree_cache_alloc_lock != current) { if (op) prepare_to_wait(&c->btree_cache_wait, &op->wait, TASK_UNINTERRUPTIBLE); spin_unlock(&c->btree_cannibalize_lock); return -EINTR; } spin_unlock(&c->btree_cannibalize_lock); return 0; } static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op, struct bkey *k) { struct btree *b; trace_bcache_btree_cache_cannibalize(c); if (mca_cannibalize_lock(c, op)) return ERR_PTR(-EINTR); list_for_each_entry_reverse(b, &c->btree_cache, list) if (!mca_reap(b, btree_order(k), false)) return b; list_for_each_entry_reverse(b, &c->btree_cache, list) if (!mca_reap(b, btree_order(k), true)) return b; WARN(1, "btree cache cannibalize failed\n"); return ERR_PTR(-ENOMEM); } /* * We can only have one thread cannibalizing other cached btree nodes at a time, * or we'll deadlock. We use an open coded mutex to ensure that, which a * cannibalize_bucket() will take. This means every time we unlock the root of * the btree, we need to release this lock if we have it held. */ void bch_cannibalize_unlock(struct cache_set *c) { spin_lock(&c->btree_cannibalize_lock); if (c->btree_cache_alloc_lock == current) { c->btree_cache_alloc_lock = NULL; wake_up(&c->btree_cache_wait); } spin_unlock(&c->btree_cannibalize_lock); } static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op, struct bkey *k, int level) { struct btree *b; BUG_ON(current->bio_list); lockdep_assert_held(&c->bucket_lock); if (mca_find(c, k)) return NULL; /* btree_free() doesn't free memory; it sticks the node on the end of * the list. Check if there's any freed nodes there: */ list_for_each_entry(b, &c->btree_cache_freeable, list) if (!mca_reap(b, btree_order(k), false)) goto out; /* We never free struct btree itself, just the memory that holds the on * disk node. Check the freed list before allocating a new one: */ list_for_each_entry(b, &c->btree_cache_freed, list) if (!mca_reap(b, 0, false)) { mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); if (!b->keys.set[0].data) goto err; else goto out; } b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO); if (!b) goto err; BUG_ON(!down_write_trylock(&b->lock)); if (!b->keys.set->data) goto err; out: BUG_ON(b->io_mutex.count != 1); bkey_copy(&b->key, k); list_move(&b->list, &c->btree_cache); hlist_del_init_rcu(&b->hash); hlist_add_head_rcu(&b->hash, mca_hash(c, k)); lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); b->parent = (void *) ~0UL; b->flags = 0; b->written = 0; b->level = level; if (!b->level) bch_btree_keys_init(&b->keys, &bch_extent_keys_ops, &b->c->expensive_debug_checks); else bch_btree_keys_init(&b->keys, &bch_btree_keys_ops, &b->c->expensive_debug_checks); return b; err: if (b) rw_unlock(true, b); b = mca_cannibalize(c, op, k); if (!IS_ERR(b)) goto out; return b; } /* * bch_btree_node_get - find a btree node in the cache and lock it, reading it * in from disk if necessary. * * If IO is necessary and running under submit_bio_noacct, returns -EAGAIN. * * The btree node will have either a read or a write lock held, depending on * level and op->lock. * * Note: Only error code or btree pointer will be returned, it is unncessary * for callers to check NULL pointer. */ struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op, struct bkey *k, int level, bool write, struct btree *parent) { int i = 0; struct btree *b; BUG_ON(level < 0); retry: b = mca_find(c, k); if (!b) { if (current->bio_list) return ERR_PTR(-EAGAIN); mutex_lock(&c->bucket_lock); b = mca_alloc(c, op, k, level); mutex_unlock(&c->bucket_lock); if (!b) goto retry; if (IS_ERR(b)) return b; bch_btree_node_read(b); if (!write) downgrade_write(&b->lock); } else { rw_lock(write, b, level); if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) { rw_unlock(write, b); goto retry; } BUG_ON(b->level != level); } if (btree_node_io_error(b)) { rw_unlock(write, b); return ERR_PTR(-EIO); } BUG_ON(!b->written); b->parent = parent; for (; i <= b->keys.nsets && b->keys.set[i].size; i++) { prefetch(b->keys.set[i].tree); prefetch(b->keys.set[i].data); } for (; i <= b->keys.nsets; i++) prefetch(b->keys.set[i].data); return b; } static void btree_node_prefetch(struct btree *parent, struct bkey *k) { struct btree *b; mutex_lock(&parent->c->bucket_lock); b = mca_alloc(parent->c, NULL, k, parent->level - 1); mutex_unlock(&parent->c->bucket_lock); if (!IS_ERR_OR_NULL(b)) { b->parent = parent; bch_btree_node_read(b); rw_unlock(true, b); } } /* Btree alloc */ static void btree_node_free(struct btree *b) { trace_bcache_btree_node_free(b); BUG_ON(b == b->c->root); retry: mutex_lock(&b->write_lock); /* * If the btree node is selected and flushing in btree_flush_write(), * delay and retry until the BTREE_NODE_journal_flush bit cleared, * then it is safe to free the btree node here. Otherwise this btree * node will be in race condition. */ if (btree_node_journal_flush(b)) { mutex_unlock(&b->write_lock); pr_debug("bnode %p journal_flush set, retry\n", b); udelay(1); goto retry; } if (btree_node_dirty(b)) { btree_complete_write(b, btree_current_write(b)); clear_bit(BTREE_NODE_dirty, &b->flags); } mutex_unlock(&b->write_lock); cancel_delayed_work(&b->work); mutex_lock(&b->c->bucket_lock); bch_bucket_free(b->c, &b->key); mca_bucket_free(b); mutex_unlock(&b->c->bucket_lock); } /* * Only error code or btree pointer will be returned, it is unncessary for * callers to check NULL pointer. */ struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, int level, bool wait, struct btree *parent) { BKEY_PADDED(key) k; struct btree *b; mutex_lock(&c->bucket_lock); retry: /* return ERR_PTR(-EAGAIN) when it fails */ b = ERR_PTR(-EAGAIN); if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, wait)) goto err; bkey_put(c, &k.key); SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); b = mca_alloc(c, op, &k.key, level); if (IS_ERR(b)) goto err_free; if (!b) { cache_bug(c, "Tried to allocate bucket that was in btree cache"); goto retry; } b->parent = parent; bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->cache->sb)); mutex_unlock(&c->bucket_lock); trace_bcache_btree_node_alloc(b); return b; err_free: bch_bucket_free(c, &k.key); err: mutex_unlock(&c->bucket_lock); trace_bcache_btree_node_alloc_fail(c); return b; } static struct btree *bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, int level, struct btree *parent) { return __bch_btree_node_alloc(c, op, level, op != NULL, parent); } static struct btree *btree_node_alloc_replacement(struct btree *b, struct btree_op *op) { struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent); if (!IS_ERR(n)) { mutex_lock(&n->write_lock); bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort); bkey_copy_key(&n->key, &b->key); mutex_unlock(&n->write_lock); } return n; } static void make_btree_freeing_key(struct btree *b, struct bkey *k) { unsigned int i; mutex_lock(&b->c->bucket_lock); atomic_inc(&b->c->prio_blocked); bkey_copy(k, &b->key); bkey_copy_key(k, &ZERO_KEY); for (i = 0; i < KEY_PTRS(k); i++) SET_PTR_GEN(k, i, bch_inc_gen(b->c->cache, PTR_BUCKET(b->c, &b->key, i))); mutex_unlock(&b->c->bucket_lock); } static int btree_check_reserve(struct btree *b, struct btree_op *op) { struct cache_set *c = b->c; struct cache *ca = c->cache; unsigned int reserve = (c->root->level - b->level) * 2 + 1; mutex_lock(&c->bucket_lock); if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) { if (op) prepare_to_wait(&c->btree_cache_wait, &op->wait, TASK_UNINTERRUPTIBLE); mutex_unlock(&c->bucket_lock); return -EINTR; } mutex_unlock(&c->bucket_lock); return mca_cannibalize_lock(b->c, op); } /* Garbage collection */ static uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) { uint8_t stale = 0; unsigned int i; struct bucket *g; /* * ptr_invalid() can't return true for the keys that mark btree nodes as * freed, but since ptr_bad() returns true we'll never actually use them * for anything and thus we don't want mark their pointers here */ if (!bkey_cmp(k, &ZERO_KEY)) return stale; for (i = 0; i < KEY_PTRS(k); i++) { if (!ptr_available(c, k, i)) continue; g = PTR_BUCKET(c, k, i); if (gen_after(g->last_gc, PTR_GEN(k, i))) g->last_gc = PTR_GEN(k, i); if (ptr_stale(c, k, i)) { stale = max(stale, ptr_stale(c, k, i)); continue; } cache_bug_on(GC_MARK(g) && (GC_MARK(g) == GC_MARK_METADATA) != (level != 0), c, "inconsistent ptrs: mark = %llu, level = %i", GC_MARK(g), level); if (level) SET_GC_MARK(g, GC_MARK_METADATA); else if (KEY_DIRTY(k)) SET_GC_MARK(g, GC_MARK_DIRTY); else if (!GC_MARK(g)) SET_GC_MARK(g, GC_MARK_RECLAIMABLE); /* guard against overflow */ SET_GC_SECTORS_USED(g, min_t(unsigned int, GC_SECTORS_USED(g) + KEY_SIZE(k), MAX_GC_SECTORS_USED)); BUG_ON(!GC_SECTORS_USED(g)); } return stale; } #define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k) void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k) { unsigned int i; for (i = 0; i < KEY_PTRS(k); i++) if (ptr_available(c, k, i) && !ptr_stale(c, k, i)) { struct bucket *b = PTR_BUCKET(c, k, i); b->gen = PTR_GEN(k, i); if (level && bkey_cmp(k, &ZERO_KEY)) b->prio = BTREE_PRIO; else if (!level && b->prio == BTREE_PRIO) b->prio = INITIAL_PRIO; } __bch_btree_mark_key(c, level, k); } void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats) { stats->in_use = (c->nbuckets - c->avail_nbuckets) * 100 / c->nbuckets; } static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) { uint8_t stale = 0; unsigned int keys = 0, good_keys = 0; struct bkey *k; struct btree_iter iter; struct bset_tree *t; gc->nodes++; for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) { stale = max(stale, btree_mark_key(b, k)); keys++; if (bch_ptr_bad(&b->keys, k)) continue; gc->key_bytes += bkey_u64s(k); gc->nkeys++; good_keys++; gc->data += KEY_SIZE(k); } for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++) btree_bug_on(t->size && bset_written(&b->keys, t) && bkey_cmp(&b->key, &t->end) < 0, b, "found short btree key in gc"); if (b->c->gc_always_rewrite) return true; if (stale > 10) return true; if ((keys - good_keys) * 2 > keys) return true; return false; } #define GC_MERGE_NODES 4U struct gc_merge_info { struct btree *b; unsigned int keys; }; static int bch_btree_insert_node(struct btree *b, struct btree_op *op, struct keylist *insert_keys, atomic_t *journal_ref, struct bkey *replace_key); static int btree_gc_coalesce(struct btree *b, struct btree_op *op, struct gc_stat *gc, struct gc_merge_info *r) { unsigned int i, nodes = 0, keys = 0, blocks; struct btree *new_nodes[GC_MERGE_NODES]; struct keylist keylist; struct closure cl; struct bkey *k; bch_keylist_init(&keylist); if (btree_check_reserve(b, NULL)) return 0; memset(new_nodes, 0, sizeof(new_nodes)); closure_init_stack(&cl); while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b)) keys += r[nodes++].keys; blocks = btree_default_blocks(b->c) * 2 / 3; if (nodes < 2 || __set_blocks(b->keys.set[0].data, keys, block_bytes(b->c->cache)) > blocks * (nodes - 1)) return 0; for (i = 0; i < nodes; i++) { new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL); if (IS_ERR(new_nodes[i])) goto out_nocoalesce; } /* * We have to check the reserve here, after we've allocated our new * nodes, to make sure the insert below will succeed - we also check * before as an optimization to potentially avoid a bunch of expensive * allocs/sorts */ if (btree_check_reserve(b, NULL)) goto out_nocoalesce; for (i = 0; i < nodes; i++) mutex_lock(&new_nodes[i]->write_lock); for (i = nodes - 1; i > 0; --i) { struct bset *n1 = btree_bset_first(new_nodes[i]); struct bset *n2 = btree_bset_first(new_nodes[i - 1]); struct bkey *k, *last = NULL; keys = 0; if (i > 1) { for (k = n2->start; k < bset_bkey_last(n2); k = bkey_next(k)) { if (__set_blocks(n1, n1->keys + keys + bkey_u64s(k), block_bytes(b->c->cache)) > blocks) break; last = k; keys += bkey_u64s(k); } } else { /* * Last node we're not getting rid of - we're getting * rid of the node at r[0]. Have to try and fit all of * the remaining keys into this node; we can't ensure * they will always fit due to rounding and variable * length keys (shouldn't be possible in practice, * though) */ if (__set_blocks(n1, n1->keys + n2->keys, block_bytes(b->c->cache)) > btree_blocks(new_nodes[i])) goto out_unlock_nocoalesce; keys = n2->keys; /* Take the key of the node we're getting rid of */ last = &r->b->key; } BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c->cache)) > btree_blocks(new_nodes[i])); if (last) bkey_copy_key(&new_nodes[i]->key, last); memcpy(bset_bkey_last(n1), n2->start, (void *) bset_bkey_idx(n2, keys) - (void *) n2->start); n1->keys += keys; r[i].keys = n1->keys; memmove(n2->start, bset_bkey_idx(n2, keys), (void *) bset_bkey_last(n2) - (void *) bset_bkey_idx(n2, keys)); n2->keys -= keys; if (__bch_keylist_realloc(&keylist, bkey_u64s(&new_nodes[i]->key))) goto out_unlock_nocoalesce; bch_btree_node_write(new_nodes[i], &cl); bch_keylist_add(&keylist, &new_nodes[i]->key); } for (i = 0; i < nodes; i++) mutex_unlock(&new_nodes[i]->write_lock); closure_sync(&cl); /* We emptied out this node */ BUG_ON(btree_bset_first(new_nodes[0])->keys); btree_node_free(new_nodes[0]); rw_unlock(true, new_nodes[0]); new_nodes[0] = NULL; for (i = 0; i < nodes; i++) { if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key))) goto out_nocoalesce; make_btree_freeing_key(r[i].b, keylist.top); bch_keylist_push(&keylist); } bch_btree_insert_node(b, op, &keylist, NULL, NULL); BUG_ON(!bch_keylist_empty(&keylist)); for (i = 0; i < nodes; i++) { btree_node_free(r[i].b); rw_unlock(true, r[i].b); r[i].b = new_nodes[i]; } memmove(r, r + 1, sizeof(r[0]) * (nodes - 1)); r[nodes - 1].b = ERR_PTR(-EINTR); trace_bcache_btree_gc_coalesce(nodes); gc->nodes--; bch_keylist_free(&keylist); /* Invalidated our iterator */ return -EINTR; out_unlock_nocoalesce: for (i = 0; i < nodes; i++) mutex_unlock(&new_nodes[i]->write_lock); out_nocoalesce: closure_sync(&cl); while ((k = bch_keylist_pop(&keylist))) if (!bkey_cmp(k, &ZERO_KEY)) atomic_dec(&b->c->prio_blocked); bch_keylist_free(&keylist); for (i = 0; i < nodes; i++) if (!IS_ERR_OR_NULL(new_nodes[i])) { btree_node_free(new_nodes[i]); rw_unlock(true, new_nodes[i]); } return 0; } static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op, struct btree *replace) { struct keylist keys; struct btree *n; if (btree_check_reserve(b, NULL)) return 0; n = btree_node_alloc_replacement(replace, NULL); if (IS_ERR(n)) return 0; /* recheck reserve after allocating replacement node */ if (btree_check_reserve(b, NULL)) { btree_node_free(n); rw_unlock(true, n); return 0; } bch_btree_node_write_sync(n); bch_keylist_init(&keys); bch_keylist_add(&keys, &n->key); make_btree_freeing_key(replace, keys.top); bch_keylist_push(&keys); bch_btree_insert_node(b, op, &keys, NULL, NULL); BUG_ON(!bch_keylist_empty(&keys)); btree_node_free(replace); rw_unlock(true, n); /* Invalidated our iterator */ return -EINTR; } static unsigned int btree_gc_count_keys(struct btree *b) { struct bkey *k; struct btree_iter iter; unsigned int ret = 0; for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) ret += bkey_u64s(k); return ret; } static size_t btree_gc_min_nodes(struct cache_set *c) { size_t min_nodes; /* * Since incremental GC would stop 100ms when front * side I/O comes, so when there are many btree nodes, * if GC only processes constant (100) nodes each time, * GC would last a long time, and the front side I/Os * would run out of the buckets (since no new bucket * can be allocated during GC), and be blocked again. * So GC should not process constant nodes, but varied * nodes according to the number of btree nodes, which * realized by dividing GC into constant(100) times, * so when there are many btree nodes, GC can process * more nodes each time, otherwise, GC will process less * nodes each time (but no less than MIN_GC_NODES) */ min_nodes = c->gc_stats.nodes / MAX_GC_TIMES; if (min_nodes < MIN_GC_NODES) min_nodes = MIN_GC_NODES; return min_nodes; } static int btree_gc_recurse(struct btree *b, struct btree_op *op, struct closure *writes, struct gc_stat *gc) { int ret = 0; bool should_rewrite; struct bkey *k; struct btree_iter iter; struct gc_merge_info r[GC_MERGE_NODES]; struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1; bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done); for (i = r; i < r + ARRAY_SIZE(r); i++) i->b = ERR_PTR(-EINTR); while (1) { k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); if (k) { r->b = bch_btree_node_get(b->c, op, k, b->level - 1, true, b); if (IS_ERR(r->b)) { ret = PTR_ERR(r->b); break; } r->keys = btree_gc_count_keys(r->b); ret = btree_gc_coalesce(b, op, gc, r); if (ret) break; } if (!last->b) break; if (!IS_ERR(last->b)) { should_rewrite = btree_gc_mark_node(last->b, gc); if (should_rewrite) { ret = btree_gc_rewrite_node(b, op, last->b); if (ret) break; } if (last->b->level) { ret = btree_gc_recurse(last->b, op, writes, gc); if (ret) break; } bkey_copy_key(&b->c->gc_done, &last->b->key); /* * Must flush leaf nodes before gc ends, since replace * operations aren't journalled */ mutex_lock(&last->b->write_lock); if (btree_node_dirty(last->b)) bch_btree_node_write(last->b, writes); mutex_unlock(&last->b->write_lock); rw_unlock(true, last->b); } memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1)); r->b = NULL; if (atomic_read(&b->c->search_inflight) && gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) { gc->nodes_pre = gc->nodes; ret = -EAGAIN; break; } if (need_resched()) { ret = -EAGAIN; break; } } for (i = r; i < r + ARRAY_SIZE(r); i++) if (!IS_ERR_OR_NULL(i->b)) { mutex_lock(&i->b->write_lock); if (btree_node_dirty(i->b)) bch_btree_node_write(i->b, writes); mutex_unlock(&i->b->write_lock); rw_unlock(true, i->b); } return ret; } static int bch_btree_gc_root(struct btree *b, struct btree_op *op, struct closure *writes, struct gc_stat *gc) { struct btree *n = NULL; int ret = 0; bool should_rewrite; should_rewrite = btree_gc_mark_node(b, gc); if (should_rewrite) { n = btree_node_alloc_replacement(b, NULL); if (!IS_ERR(n)) { bch_btree_node_write_sync(n); bch_btree_set_root(n); btree_node_free(b); rw_unlock(true, n); return -EINTR; } } __bch_btree_mark_key(b->c, b->level + 1, &b->key); if (b->level) { ret = btree_gc_recurse(b, op, writes, gc); if (ret) return ret; } bkey_copy_key(&b->c->gc_done, &b->key); return ret; } static void btree_gc_start(struct cache_set *c) { struct cache *ca; struct bucket *b; if (!c->gc_mark_valid) return; mutex_lock(&c->bucket_lock); c->gc_mark_valid = 0; c->gc_done = ZERO_KEY; ca = c->cache; for_each_bucket(b, ca) { b->last_gc = b->gen; if (!atomic_read(&b->pin)) { SET_GC_MARK(b, 0); SET_GC_SECTORS_USED(b, 0); } } mutex_unlock(&c->bucket_lock); } static void bch_btree_gc_finish(struct cache_set *c) { struct bucket *b; struct cache *ca; unsigned int i, j; uint64_t *k; mutex_lock(&c->bucket_lock); set_gc_sectors(c); c->gc_mark_valid = 1; c->need_gc = 0; for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++) SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i), GC_MARK_METADATA); /* don't reclaim buckets to which writeback keys point */ rcu_read_lock(); for (i = 0; i < c->devices_max_used; i++) { struct bcache_device *d = c->devices[i]; struct cached_dev *dc; struct keybuf_key *w, *n; if (!d || UUID_FLASH_ONLY(&c->uuids[i])) continue; dc = container_of(d, struct cached_dev, disk); spin_lock(&dc->writeback_keys.lock); rbtree_postorder_for_each_entry_safe(w, n, &dc->writeback_keys.keys, node) for (j = 0; j < KEY_PTRS(&w->key); j++) SET_GC_MARK(PTR_BUCKET(c, &w->key, j), GC_MARK_DIRTY); spin_unlock(&dc->writeback_keys.lock); } rcu_read_unlock(); c->avail_nbuckets = 0; ca = c->cache; ca->invalidate_needs_gc = 0; for (k = ca->sb.d; k < ca->sb.d + ca->sb.keys; k++) SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA); for (k = ca->prio_buckets; k < ca->prio_buckets + prio_buckets(ca) * 2; k++) SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA); for_each_bucket(b, ca) { c->need_gc = max(c->need_gc, bucket_gc_gen(b)); if (atomic_read(&b->pin)) continue; BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b)); if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) c->avail_nbuckets++; } mutex_unlock(&c->bucket_lock); } static void bch_btree_gc(struct cache_set *c) { int ret; struct gc_stat stats; struct closure writes; struct btree_op op; uint64_t start_time = local_clock(); trace_bcache_gc_start(c); memset(&stats, 0, sizeof(struct gc_stat)); closure_init_stack(&writes); bch_btree_op_init(&op, SHRT_MAX); btree_gc_start(c); /* if CACHE_SET_IO_DISABLE set, gc thread should stop too */ do { ret = bcache_btree_root(gc_root, c, &op, &writes, &stats); closure_sync(&writes); cond_resched(); if (ret == -EAGAIN) schedule_timeout_interruptible(msecs_to_jiffies (GC_SLEEP_MS)); else if (ret) pr_warn("gc failed!\n"); } while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags)); bch_btree_gc_finish(c); wake_up_allocators(c); bch_time_stats_update(&c->btree_gc_time, start_time); stats.key_bytes *= sizeof(uint64_t); stats.data <<= 9; bch_update_bucket_in_use(c, &stats); memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); trace_bcache_gc_end(c); bch_moving_gc(c); } static bool gc_should_run(struct cache_set *c) { struct cache *ca = c->cache; if (ca->invalidate_needs_gc) return true; if (atomic_read(&c->sectors_to_gc) < 0) return true; return false; } static int bch_gc_thread(void *arg) { struct cache_set *c = arg; while (1) { wait_event_interruptible(c->gc_wait, kthread_should_stop() || test_bit(CACHE_SET_IO_DISABLE, &c->flags) || gc_should_run(c)); if (kthread_should_stop() || test_bit(CACHE_SET_IO_DISABLE, &c->flags)) break; set_gc_sectors(c); bch_btree_gc(c); } wait_for_kthread_stop(); return 0; } int bch_gc_thread_start(struct cache_set *c) { c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc"); return PTR_ERR_OR_ZERO(c->gc_thread); } /* Initial partial gc */ static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) { int ret = 0; struct bkey *k, *p = NULL; struct btree_iter iter; for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) bch_initial_mark_key(b->c, b->level, k); bch_initial_mark_key(b->c, b->level + 1, &b->key); if (b->level) { bch_btree_iter_init(&b->keys, &iter, NULL); do { k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); if (k) { btree_node_prefetch(b, k); /* * initiallize c->gc_stats.nodes * for incremental GC */ b->c->gc_stats.nodes++; } if (p) ret = bcache_btree(check_recurse, p, b, op); p = k; } while (p && !ret); } return ret; } static int bch_btree_check_thread(void *arg) { int ret; struct btree_check_info *info = arg; struct btree_check_state *check_state = info->state; struct cache_set *c = check_state->c; struct btree_iter iter; struct bkey *k, *p; int cur_idx, prev_idx, skip_nr; k = p = NULL; cur_idx = prev_idx = 0; ret = 0; /* root node keys are checked before thread created */ bch_btree_iter_init(&c->root->keys, &iter, NULL); k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); BUG_ON(!k); p = k; while (k) { /* * Fetch a root node key index, skip the keys which * should be fetched by other threads, then check the * sub-tree indexed by the fetched key. */ spin_lock(&check_state->idx_lock); cur_idx = check_state->key_idx; check_state->key_idx++; spin_unlock(&check_state->idx_lock); skip_nr = cur_idx - prev_idx; while (skip_nr) { k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); if (k) p = k; else { /* * No more keys to check in root node, * current checking threads are enough, * stop creating more. */ atomic_set(&check_state->enough, 1); /* Update check_state->enough earlier */ smp_mb__after_atomic(); goto out; } skip_nr--; cond_resched(); } if (p) { struct btree_op op; btree_node_prefetch(c->root, p); c->gc_stats.nodes++; bch_btree_op_init(&op, 0); ret = bcache_btree(check_recurse, p, c->root, &op); /* * The op may be added to cache_set's btree_cache_wait * in mca_cannibalize(), must ensure it is removed from * the list and release btree_cache_alloc_lock before * free op memory. * Otherwise, the btree_cache_wait will be damaged. */ bch_cannibalize_unlock(c); finish_wait(&c->btree_cache_wait, &(&op)->wait); if (ret) goto out; } p = NULL; prev_idx = cur_idx; cond_resched(); } out: info->result = ret; /* update check_state->started among all CPUs */ smp_mb__before_atomic(); if (atomic_dec_and_test(&check_state->started)) wake_up(&check_state->wait); return ret; } static int bch_btree_chkthread_nr(void) { int n = num_online_cpus()/2; if (n == 0) n = 1; else if (n > BCH_BTR_CHKTHREAD_MAX) n = BCH_BTR_CHKTHREAD_MAX; return n; } int bch_btree_check(struct cache_set *c) { int ret = 0; int i; struct bkey *k = NULL; struct btree_iter iter; struct btree_check_state check_state; /* check and mark root node keys */ for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid) bch_initial_mark_key(c, c->root->level, k); bch_initial_mark_key(c, c->root->level + 1, &c->root->key); if (c->root->level == 0) return 0; memset(&check_state, 0, sizeof(struct btree_check_state)); check_state.c = c; check_state.total_threads = bch_btree_chkthread_nr(); check_state.key_idx = 0; spin_lock_init(&check_state.idx_lock); atomic_set(&check_state.started, 0); atomic_set(&check_state.enough, 0); init_waitqueue_head(&check_state.wait); rw_lock(0, c->root, c->root->level); /* * Run multiple threads to check btree nodes in parallel, * if check_state.enough is non-zero, it means current * running check threads are enough, unncessary to create * more. */ for (i = 0; i < check_state.total_threads; i++) { /* fetch latest check_state.enough earlier */ smp_mb__before_atomic(); if (atomic_read(&check_state.enough)) break; check_state.infos[i].result = 0; check_state.infos[i].state = &check_state; check_state.infos[i].thread = kthread_run(bch_btree_check_thread, &check_state.infos[i], "bch_btrchk[%d]", i); if (IS_ERR(check_state.infos[i].thread)) { pr_err("fails to run thread bch_btrchk[%d]\n", i); for (--i; i >= 0; i--) kthread_stop(check_state.infos[i].thread); ret = -ENOMEM; goto out; } atomic_inc(&check_state.started); } /* * Must wait for all threads to stop. */ wait_event(check_state.wait, atomic_read(&check_state.started) == 0); for (i = 0; i < check_state.total_threads; i++) { if (check_state.infos[i].result) { ret = check_state.infos[i].result; goto out; } } out: rw_unlock(0, c->root); return ret; } void bch_initial_gc_finish(struct cache_set *c) { struct cache *ca = c->cache; struct bucket *b; bch_btree_gc_finish(c); mutex_lock(&c->bucket_lock); /* * We need to put some unused buckets directly on the prio freelist in * order to get the allocator thread started - it needs freed buckets in * order to rewrite the prios and gens, and it needs to rewrite prios * and gens in order to free buckets. * * This is only safe for buckets that have no live data in them, which * there should always be some of. */ for_each_bucket(b, ca) { if (fifo_full(&ca->free[RESERVE_PRIO]) && fifo_full(&ca->free[RESERVE_BTREE])) break; if (bch_can_invalidate_bucket(ca, b) && !GC_MARK(b)) { __bch_invalidate_one_bucket(ca, b); if (!fifo_push(&ca->free[RESERVE_PRIO], b - ca->buckets)) fifo_push(&ca->free[RESERVE_BTREE], b - ca->buckets); } } mutex_unlock(&c->bucket_lock); } /* Btree insertion */ static bool btree_insert_key(struct btree *b, struct bkey *k, struct bkey *replace_key) { unsigned int status; BUG_ON(bkey_cmp(k, &b->key) > 0); status = bch_btree_insert_key(&b->keys, k, replace_key); if (status != BTREE_INSERT_STATUS_NO_INSERT) { bch_check_keys(&b->keys, "%u for %s", status, replace_key ? "replace" : "insert"); trace_bcache_btree_insert_key(b, k, replace_key != NULL, status); return true; } else return false; } static size_t insert_u64s_remaining(struct btree *b) { long ret = bch_btree_keys_u64s_remaining(&b->keys); /* * Might land in the middle of an existing extent and have to split it */ if (b->keys.ops->is_extents) ret -= KEY_MAX_U64S; return max(ret, 0L); } static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, struct keylist *insert_keys, struct bkey *replace_key) { bool ret = false; int oldsize = bch_count_data(&b->keys); while (!bch_keylist_empty(insert_keys)) { struct bkey *k = insert_keys->keys; if (bkey_u64s(k) > insert_u64s_remaining(b)) break; if (bkey_cmp(k, &b->key) <= 0) { if (!b->level) bkey_put(b->c, k); ret |= btree_insert_key(b, k, replace_key); bch_keylist_pop_front(insert_keys); } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) { BKEY_PADDED(key) temp; bkey_copy(&temp.key, insert_keys->keys); bch_cut_back(&b->key, &temp.key); bch_cut_front(&b->key, insert_keys->keys); ret |= btree_insert_key(b, &temp.key, replace_key); break; } else { break; } } if (!ret) op->insert_collision = true; BUG_ON(!bch_keylist_empty(insert_keys) && b->level); BUG_ON(bch_count_data(&b->keys) < oldsize); return ret; } static int btree_split(struct btree *b, struct btree_op *op, struct keylist *insert_keys, struct bkey *replace_key) { bool split; struct btree *n1, *n2 = NULL, *n3 = NULL; uint64_t start_time = local_clock(); struct closure cl; struct keylist parent_keys; closure_init_stack(&cl); bch_keylist_init(&parent_keys); if (btree_check_reserve(b, op)) { if (!b->level) return -EINTR; else WARN(1, "insufficient reserve for split\n"); } n1 = btree_node_alloc_replacement(b, op); if (IS_ERR(n1)) goto err; split = set_blocks(btree_bset_first(n1), block_bytes(n1->c->cache)) > (btree_blocks(b) * 4) / 5; if (split) { unsigned int keys = 0; trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys); n2 = bch_btree_node_alloc(b->c, op, b->level, b->parent); if (IS_ERR(n2)) goto err_free1; if (!b->parent) { n3 = bch_btree_node_alloc(b->c, op, b->level + 1, NULL); if (IS_ERR(n3)) goto err_free2; } mutex_lock(&n1->write_lock); mutex_lock(&n2->write_lock); bch_btree_insert_keys(n1, op, insert_keys, replace_key); /* * Has to be a linear search because we don't have an auxiliary * search tree yet */ while (keys < (btree_bset_first(n1)->keys * 3) / 5) keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys)); bkey_copy_key(&n1->key, bset_bkey_idx(btree_bset_first(n1), keys)); keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys)); btree_bset_first(n2)->keys = btree_bset_first(n1)->keys - keys; btree_bset_first(n1)->keys = keys; memcpy(btree_bset_first(n2)->start, bset_bkey_last(btree_bset_first(n1)), btree_bset_first(n2)->keys * sizeof(uint64_t)); bkey_copy_key(&n2->key, &b->key); bch_keylist_add(&parent_keys, &n2->key); bch_btree_node_write(n2, &cl); mutex_unlock(&n2->write_lock); rw_unlock(true, n2); } else { trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys); mutex_lock(&n1->write_lock); bch_btree_insert_keys(n1, op, insert_keys, replace_key); } bch_keylist_add(&parent_keys, &n1->key); bch_btree_node_write(n1, &cl); mutex_unlock(&n1->write_lock); if (n3) { /* Depth increases, make a new root */ mutex_lock(&n3->write_lock); bkey_copy_key(&n3->key, &MAX_KEY); bch_btree_insert_keys(n3, op, &parent_keys, NULL); bch_btree_node_write(n3, &cl); mutex_unlock(&n3->write_lock); closure_sync(&cl); bch_btree_set_root(n3); rw_unlock(true, n3); } else if (!b->parent) { /* Root filled up but didn't need to be split */ closure_sync(&cl); bch_btree_set_root(n1); } else { /* Split a non root node */ closure_sync(&cl); make_btree_freeing_key(b, parent_keys.top); bch_keylist_push(&parent_keys); bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL); BUG_ON(!bch_keylist_empty(&parent_keys)); } btree_node_free(b); rw_unlock(true, n1); bch_time_stats_update(&b->c->btree_split_time, start_time); return 0; err_free2: bkey_put(b->c, &n2->key); btree_node_free(n2); rw_unlock(true, n2); err_free1: bkey_put(b->c, &n1->key); btree_node_free(n1); rw_unlock(true, n1); err: WARN(1, "bcache: btree split failed (level %u)", b->level); if (n3 == ERR_PTR(-EAGAIN) || n2 == ERR_PTR(-EAGAIN) || n1 == ERR_PTR(-EAGAIN)) return -EAGAIN; return -ENOMEM; } static int bch_btree_insert_node(struct btree *b, struct btree_op *op, struct keylist *insert_keys, atomic_t *journal_ref, struct bkey *replace_key) { struct closure cl; BUG_ON(b->level && replace_key); closure_init_stack(&cl); mutex_lock(&b->write_lock); if (write_block(b) != btree_bset_last(b) && b->keys.last_set_unwritten) bch_btree_init_next(b); /* just wrote a set */ if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) { mutex_unlock(&b->write_lock); goto split; } BUG_ON(write_block(b) != btree_bset_last(b)); if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) { if (!b->level) bch_btree_leaf_dirty(b, journal_ref); else bch_btree_node_write(b, &cl); } mutex_unlock(&b->write_lock); /* wait for btree node write if necessary, after unlock */ closure_sync(&cl); return 0; split: if (current->bio_list) { op->lock = b->c->root->level + 1; return -EAGAIN; } else if (op->lock <= b->c->root->level) { op->lock = b->c->root->level + 1; return -EINTR; } else { /* Invalidated all iterators */ int ret = btree_split(b, op, insert_keys, replace_key); if (bch_keylist_empty(insert_keys)) return 0; else if (!ret) return -EINTR; return ret; } } int bch_btree_insert_check_key(struct btree *b, struct btree_op *op, struct bkey *check_key) { int ret = -EINTR; uint64_t btree_ptr = b->key.ptr[0]; unsigned long seq = b->seq; struct keylist insert; bool upgrade = op->lock == -1; bch_keylist_init(&insert); if (upgrade) { rw_unlock(false, b); rw_lock(true, b, b->level); if (b->key.ptr[0] != btree_ptr || b->seq != seq + 1) { op->lock = b->level; goto out; } } SET_KEY_PTRS(check_key, 1); get_random_bytes(&check_key->ptr[0], sizeof(uint64_t)); SET_PTR_DEV(check_key, 0, PTR_CHECK_DEV); bch_keylist_add(&insert, check_key); ret = bch_btree_insert_node(b, op, &insert, NULL, NULL); BUG_ON(!ret && !bch_keylist_empty(&insert)); out: if (upgrade) downgrade_write(&b->lock); return ret; } struct btree_insert_op { struct btree_op op; struct keylist *keys; atomic_t *journal_ref; struct bkey *replace_key; }; static int btree_insert_fn(struct btree_op *b_op, struct btree *b) { struct btree_insert_op *op = container_of(b_op, struct btree_insert_op, op); int ret = bch_btree_insert_node(b, &op->op, op->keys, op->journal_ref, op->replace_key); if (ret && !bch_keylist_empty(op->keys)) return ret; else return MAP_DONE; } int bch_btree_insert(struct cache_set *c, struct keylist *keys, atomic_t *journal_ref, struct bkey *replace_key) { struct btree_insert_op op; int ret = 0; BUG_ON(current->bio_list); BUG_ON(bch_keylist_empty(keys)); bch_btree_op_init(&op.op, 0); op.keys = keys; op.journal_ref = journal_ref; op.replace_key = replace_key; while (!ret && !bch_keylist_empty(keys)) { op.op.lock = 0; ret = bch_btree_map_leaf_nodes(&op.op, c, &START_KEY(keys->keys), btree_insert_fn); } if (ret) { struct bkey *k; pr_err("error %i\n", ret); while ((k = bch_keylist_pop(keys))) bkey_put(c, k); } else if (op.op.insert_collision) ret = -ESRCH; return ret; } void bch_btree_set_root(struct btree *b) { unsigned int i; struct closure cl; closure_init_stack(&cl); trace_bcache_btree_set_root(b); BUG_ON(!b->written); for (i = 0; i < KEY_PTRS(&b->key); i++) BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO); mutex_lock(&b->c->bucket_lock); list_del_init(&b->list); mutex_unlock(&b->c->bucket_lock); b->c->root = b; bch_journal_meta(b->c, &cl); closure_sync(&cl); } /* Map across nodes or keys */ static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, struct bkey *from, btree_map_nodes_fn *fn, int flags) { int ret = MAP_CONTINUE; if (b->level) { struct bkey *k; struct btree_iter iter; bch_btree_iter_init(&b->keys, &iter, from); while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { ret = bcache_btree(map_nodes_recurse, k, b, op, from, fn, flags); from = NULL; if (ret != MAP_CONTINUE) return ret; } } if (!b->level || flags == MAP_ALL_NODES) ret = fn(op, b); return ret; } int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, struct bkey *from, btree_map_nodes_fn *fn, int flags) { return bcache_btree_root(map_nodes_recurse, c, op, from, fn, flags); } int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, struct bkey *from, btree_map_keys_fn *fn, int flags) { int ret = MAP_CONTINUE; struct bkey *k; struct btree_iter iter; bch_btree_iter_init(&b->keys, &iter, from); while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { ret = !b->level ? fn(op, b, k) : bcache_btree(map_keys_recurse, k, b, op, from, fn, flags); from = NULL; if (ret != MAP_CONTINUE) return ret; } if (!b->level && (flags & MAP_END_KEY)) ret = fn(op, b, &KEY(KEY_INODE(&b->key), KEY_OFFSET(&b->key), 0)); return ret; } int bch_btree_map_keys(struct btree_op *op, struct cache_set *c, struct bkey *from, btree_map_keys_fn *fn, int flags) { return bcache_btree_root(map_keys_recurse, c, op, from, fn, flags); } /* Keybuf code */ static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r) { /* Overlapping keys compare equal */ if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0) return -1; if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0) return 1; return 0; } static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, struct keybuf_key *r) { return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1); } struct refill { struct btree_op op; unsigned int nr_found; struct keybuf *buf; struct bkey *end; keybuf_pred_fn *pred; }; static int refill_keybuf_fn(struct btree_op *op, struct btree *b, struct bkey *k) { struct refill *refill = container_of(op, struct refill, op); struct keybuf *buf = refill->buf; int ret = MAP_CONTINUE; if (bkey_cmp(k, refill->end) > 0) { ret = MAP_DONE; goto out; } if (!KEY_SIZE(k)) /* end key */ goto out; if (refill->pred(buf, k)) { struct keybuf_key *w; spin_lock(&buf->lock); w = array_alloc(&buf->freelist); if (!w) { spin_unlock(&buf->lock); return MAP_DONE; } w->private = NULL; bkey_copy(&w->key, k); if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) array_free(&buf->freelist, w); else refill->nr_found++; if (array_freelist_empty(&buf->freelist)) ret = MAP_DONE; spin_unlock(&buf->lock); } out: buf->last_scanned = *k; return ret; } void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, struct bkey *end, keybuf_pred_fn *pred) { struct bkey start = buf->last_scanned; struct refill refill; cond_resched(); bch_btree_op_init(&refill.op, -1); refill.nr_found = 0; refill.buf = buf; refill.end = end; refill.pred = pred; bch_btree_map_keys(&refill.op, c, &buf->last_scanned, refill_keybuf_fn, MAP_END_KEY); trace_bcache_keyscan(refill.nr_found, KEY_INODE(&start), KEY_OFFSET(&start), KEY_INODE(&buf->last_scanned), KEY_OFFSET(&buf->last_scanned)); spin_lock(&buf->lock); if (!RB_EMPTY_ROOT(&buf->keys)) { struct keybuf_key *w; w = RB_FIRST(&buf->keys, struct keybuf_key, node); buf->start = START_KEY(&w->key); w = RB_LAST(&buf->keys, struct keybuf_key, node); buf->end = w->key; } else { buf->start = MAX_KEY; buf->end = MAX_KEY; } spin_unlock(&buf->lock); } static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) { rb_erase(&w->node, &buf->keys); array_free(&buf->freelist, w); } void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) { spin_lock(&buf->lock); __bch_keybuf_del(buf, w); spin_unlock(&buf->lock); } bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start, struct bkey *end) { bool ret = false; struct keybuf_key *p, *w, s; s.key = *start; if (bkey_cmp(end, &buf->start) <= 0 || bkey_cmp(start, &buf->end) >= 0) return false; spin_lock(&buf->lock); w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp); while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) { p = w; w = RB_NEXT(w, node); if (p->private) ret = true; else __bch_keybuf_del(buf, p); } spin_unlock(&buf->lock); return ret; } struct keybuf_key *bch_keybuf_next(struct keybuf *buf) { struct keybuf_key *w; spin_lock(&buf->lock); w = RB_FIRST(&buf->keys, struct keybuf_key, node); while (w && w->private) w = RB_NEXT(w, node); if (w) w->private = ERR_PTR(-EINTR); spin_unlock(&buf->lock); return w; } struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, struct keybuf *buf, struct bkey *end, keybuf_pred_fn *pred) { struct keybuf_key *ret; while (1) { ret = bch_keybuf_next(buf); if (ret) break; if (bkey_cmp(&buf->last_scanned, end) >= 0) { pr_debug("scan finished\n"); break; } bch_refill_keybuf(c, buf, end, pred); } return ret; } void bch_keybuf_init(struct keybuf *buf) { buf->last_scanned = MAX_KEY; buf->keys = RB_ROOT; spin_lock_init(&buf->lock); array_allocator_init(&buf->freelist); } void bch_btree_exit(void) { if (btree_io_wq) destroy_workqueue(btree_io_wq); } int __init bch_btree_init(void) { btree_io_wq = alloc_workqueue("bch_btree_io", WQ_MEM_RECLAIM, 0); if (!btree_io_wq) return -ENOMEM; return 0; }