--- zzzz-none-000/linux-3.10.107/drivers/md/bcache/journal.c 2017-06-27 09:49:32.000000000 +0000
+++ scorpion-7490-727/linux-3.10.107/drivers/md/bcache/journal.c 2021-02-04 17:41:59.000000000 +0000
@@ -7,7 +7,9 @@
 #include "bcache.h"
 #include "btree.h"
 #include "debug.h"
-#include "request.h"
+#include "extents.h"
+
+#include <trace/events/bcache.h>
 
 /*
  * Journal replay/recovery:
@@ -22,42 +24,45 @@
  * bit.
  */
 
-static void journal_read_endio(struct bio *bio, int error)
+static void journal_read_endio(struct bio *bio)
 {
 	struct closure *cl = bio->bi_private;
 	closure_put(cl);
 }
 
 static int journal_read_bucket(struct cache *ca, struct list_head *list,
-			       struct btree_op *op, unsigned bucket_index)
+			       unsigned bucket_index)
 {
 	struct journal_device *ja = &ca->journal;
 	struct bio *bio = &ja->bio;
 
 	struct journal_replay *i;
 	struct jset *j, *data = ca->set->journal.w[0].data;
+	struct closure cl;
 	unsigned len, left, offset = 0;
 	int ret = 0;
 	sector_t bucket = bucket_to_sector(ca->set, ca->sb.d[bucket_index]);
 
-	pr_debug("reading %llu", (uint64_t) bucket);
+	closure_init_stack(&cl);
+
+	pr_debug("reading %u", bucket_index);
 
 	while (offset < ca->sb.bucket_size) {
 reread:		left = ca->sb.bucket_size - offset;
-		len = min_t(unsigned, left, PAGE_SECTORS * 8);
+		len = min_t(unsigned, left, PAGE_SECTORS << JSET_BITS);
 
 		bio_reset(bio);
-		bio->bi_sector	= bucket + offset;
+		bio->bi_iter.bi_sector	= bucket + offset;
 		bio->bi_bdev	= ca->bdev;
 		bio->bi_rw	= READ;
-		bio->bi_size	= len << 9;
+		bio->bi_iter.bi_size	= len << 9;
 
 		bio->bi_end_io	= journal_read_endio;
-		bio->bi_private = &op->cl;
+		bio->bi_private = &cl;
 		bch_bio_map(bio, data);
 
-		closure_bio_submit(bio, &op->cl, ca);
-		closure_sync(&op->cl);
+		closure_bio_submit(bio, &cl);
+		closure_sync(&cl);
 
 		/* This function could be simpler now since we no longer write
 		 * journal entries that overlap bucket boundaries; this means
@@ -70,19 +75,28 @@
 			struct list_head *where;
 			size_t blocks, bytes = set_bytes(j);
 
-			if (j->magic != jset_magic(ca->set))
+			if (j->magic != jset_magic(&ca->sb)) {
+				pr_debug("%u: bad magic", bucket_index);
 				return ret;
+			}
 
-			if (bytes > left << 9)
+			if (bytes > left << 9 ||
+			    bytes > PAGE_SIZE << JSET_BITS) {
+				pr_info("%u: too big, %zu bytes, offset %u",
+					bucket_index, bytes, offset);
 				return ret;
+			}
 
 			if (bytes > len << 9)
 				goto reread;
 
-			if (j->csum != csum_set(j))
+			if (j->csum != csum_set(j)) {
+				pr_info("%u: bad csum, %zu bytes, offset %u",
+					bucket_index, bytes, offset);
 				return ret;
+			}
 
-			blocks = set_blocks(j, ca->set);
+			blocks = set_blocks(j, block_bytes(ca->set));
 
 			while (!list_empty(list)) {
 				i = list_first_entry(list,
@@ -127,12 +141,11 @@
 	return ret;
 }
 
-int bch_journal_read(struct cache_set *c, struct list_head *list,
-		     struct btree_op *op)
+int bch_journal_read(struct cache_set *c, struct list_head *list)
 {
 #define read_bucket(b)						\
 	({							\
-		int ret = journal_read_bucket(ca, list, op, b);	\
+		int ret = journal_read_bucket(ca, list, b);	\
 		__set_bit(b, bitmap);				\
 		if (ret < 0)					\
 			return ret;				\
@@ -144,7 +157,7 @@
 
 	for_each_cache(ca, c, iter) {
 		struct journal_device *ja = &ca->journal;
-		unsigned long bitmap[SB_JOURNAL_BUCKETS / BITS_PER_LONG];
+		DECLARE_BITMAP(bitmap, SB_JOURNAL_BUCKETS);
 		unsigned i, l, r, m;
 		uint64_t seq;
 
@@ -177,11 +190,15 @@
 			if (read_bucket(l))
 				goto bsearch;
 
-		if (list_empty(list))
+		/* no journal entries on this device? */
+		if (l == ca->sb.njournal_buckets)
 			continue;
 bsearch:
+		BUG_ON(list_empty(list));
+
 		/* Binary search */
-		m = r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
+		m = l;
+		r = find_next_bit(bitmap, ca->sb.njournal_buckets, l + 1);
 		pr_debug("starting binary search, l %u r %u", l, r);
 
 		while (l + 1 < r) {
@@ -225,8 +242,14 @@
 		for (i = 0; i < ca->sb.njournal_buckets; i++)
 			if (ja->seq[i] > seq) {
 				seq = ja->seq[i];
-				ja->cur_idx = ja->discard_idx =
-					ja->last_idx = i;
+				/*
+				 * When journal_reclaim() goes to allocate for
+				 * the first time, it'll use the bucket after
+				 * ja->cur_idx
+				 */
+				ja->cur_idx = i;
+				ja->last_idx = ja->discard_idx = (i + 1) %
+					ca->sb.njournal_buckets;
 			}
 	}
 
@@ -272,26 +295,21 @@
 		}
 
 		for (k = i->j.start;
-		     k < end(&i->j);
-		     k = bkey_next(k)) {
-			unsigned j;
+		     k < bset_bkey_last(&i->j);
+		     k = bkey_next(k))
+			if (!__bch_extent_invalid(c, k)) {
+				unsigned j;
+
+				for (j = 0; j < KEY_PTRS(k); j++)
+					if (ptr_available(c, k, j))
+						atomic_inc(&PTR_BUCKET(c, k, j)->pin);
 
-			for (j = 0; j < KEY_PTRS(k); j++) {
-				struct bucket *g = PTR_BUCKET(c, k, j);
-				atomic_inc(&g->pin);
-
-				if (g->prio == BTREE_PRIO &&
-				    !ptr_stale(c, k, j))
-					g->prio = INITIAL_PRIO;
+				bch_initial_mark_key(c, 0, k);
 			}
-
-			__bch_btree_mark_key(c, 0, k);
-		}
 	}
 }
 
-int bch_journal_replay(struct cache_set *s, struct list_head *list,
-		       struct btree_op *op)
+int bch_journal_replay(struct cache_set *s, struct list_head *list)
 {
 	int ret = 0, keys = 0, entries = 0;
 	struct bkey *k;
@@ -299,30 +317,27 @@
 		list_entry(list->prev, struct journal_replay, list);
 
 	uint64_t start = i->j.last_seq, end = i->j.seq, n = start;
+	struct keylist keylist;
 
 	list_for_each_entry(i, list, list) {
 		BUG_ON(i->pin && atomic_read(i->pin) != 1);
 
-		if (n != i->j.seq)
-			pr_err(
-		"journal entries %llu-%llu missing! (replaying %llu-%llu)\n",
-		n, i->j.seq - 1, start, end);
+		cache_set_err_on(n != i->j.seq, s,
+"bcache: journal entries %llu-%llu missing! (replaying %llu-%llu)",
+				 n, i->j.seq - 1, start, end);
 
 		for (k = i->j.start;
-		     k < end(&i->j);
+		     k < bset_bkey_last(&i->j);
 		     k = bkey_next(k)) {
-			pr_debug("%s", pkey(k));
-			bkey_copy(op->keys.top, k);
-			bch_keylist_push(&op->keys);
+			trace_bcache_journal_replay_key(k);
 
-			op->journal = i->pin;
-			atomic_inc(op->journal);
+			bch_keylist_init_single(&keylist, k);
 
-			ret = bch_btree_insert(op, s);
+			ret = bch_btree_insert(s, &keylist, i->pin, NULL);
 			if (ret)
 				goto err;
 
-			BUG_ON(!bch_keylist_empty(&op->keys));
+			BUG_ON(!bch_keylist_empty(&keylist));
 			keys++;
 
 			cond_resched();
@@ -336,14 +351,13 @@
 	pr_info("journal replay done, %i keys in %i entries, seq %llu",
 		keys, entries, end);
-
+err:
 	while (!list_empty(list)) {
 		i = list_first_entry(list,
 			struct journal_replay, list);
 		list_del(&i->list);
 		kfree(i);
 	}
-err:
-	closure_sync(&op->cl);
+
 	return ret;
 }
 
@@ -355,53 +369,39 @@
 	 * Try to find the btree node with that references the oldest journal
 	 * entry, best is our current candidate and is locked if non NULL:
 	 */
-	struct btree *b, *best = NULL;
-	unsigned iter;
-
-	for_each_cached_btree(b, c, iter) {
-		if (!down_write_trylock(&b->lock))
-			continue;
-
-		if (!btree_node_dirty(b) ||
-		    !btree_current_write(b)->journal) {
-			rw_unlock(true, b);
-			continue;
+	struct btree *b, *best;
+	unsigned i;
+retry:
+	best = NULL;
+
+	for_each_cached_btree(b, c, i)
+		if (btree_current_write(b)->journal) {
+			if (!best)
+				best = b;
+			else if (journal_pin_cmp(c,
+					btree_current_write(best)->journal,
+					btree_current_write(b)->journal)) {
+				best = b;
+			}
 		}
 
-		if (!best)
-			best = b;
-		else if (journal_pin_cmp(c,
-					 btree_current_write(best),
-					 btree_current_write(b))) {
-			rw_unlock(true, best);
-			best = b;
-		} else
-			rw_unlock(true, b);
-	}
-
-	if (best)
-		goto out;
-
-	/* We can't find the best btree node, just pick the first */
-	list_for_each_entry(b, &c->btree_cache, list)
-		if (!b->level && btree_node_dirty(b)) {
-			best = b;
-			rw_lock(true, best, best->level);
-			goto found;
+	b = best;
+	if (b) {
+		mutex_lock(&b->write_lock);
+		if (!btree_current_write(b)->journal) {
+			mutex_unlock(&b->write_lock);
+			/* We raced */
+			goto retry;
 		}
 
-out:
-	if (!best)
-		return;
-found:
-	if (btree_node_dirty(best))
-		bch_btree_write(best, true, NULL);
-	rw_unlock(true, best);
+		__bch_btree_node_write(b, NULL);
+		mutex_unlock(&b->write_lock);
+	}
 }
 
 #define last_seq(j)	((j)->seq - fifo_used(&(j)->pin) + 1)
 
-static void journal_discard_endio(struct bio *bio, int error)
+static void journal_discard_endio(struct bio *bio)
 {
 	struct journal_device *ja =
 		container_of(bio, struct journal_device, discard_bio);
@@ -449,13 +449,13 @@
 		atomic_set(&ja->discard_in_flight, DISCARD_IN_FLIGHT);
 
 		bio_init(bio);
-		bio->bi_sector		= bucket_to_sector(ca->set,
+		bio->bi_iter.bi_sector	= bucket_to_sector(ca->set,
 						ca->sb.d[ja->discard_idx]);
 		bio->bi_bdev		= ca->bdev;
 		bio->bi_rw		= REQ_WRITE|REQ_DISCARD;
 		bio->bi_max_vecs	= 1;
 		bio->bi_io_vec		= bio->bi_inline_vecs;
-		bio->bi_size		= bucket_bytes(ca);
+		bio->bi_iter.bi_size	= bucket_bytes(ca);
 		bio->bi_end_io		= journal_discard_endio;
 
 		closure_get(&ca->set->cl);
@@ -492,7 +492,7 @@
 		do_journal_discard(ca);
 
 	if (c->journal.blocks_free)
-		return;
+		goto out;
 
 	/*
 	 * Allocate:
@@ -518,7 +518,7 @@
 
 	if (n)
 		c->journal.blocks_free = c->sb.bucket_size >> c->block_bits;
-
+out:
 	if (!journal_full(&c->journal))
 		__closure_wake_up(&c->journal.wait);
 }
@@ -539,6 +539,7 @@
 	atomic_set(&fifo_back(&j->pin), 1);
 
 	j->cur->data->seq	= ++j->seq;
+	j->cur->dirty		= false;
 	j->cur->need_write	= false;
 	j->cur->data->keys	= 0;
 
@@ -546,55 +547,51 @@
 		pr_debug("journal_pin full (%zu)", fifo_used(&j->pin));
 }
 
-static void journal_write_endio(struct bio *bio, int error)
+static void journal_write_endio(struct bio *bio)
 {
 	struct journal_write *w = bio->bi_private;
 
-	cache_set_err_on(error, w->c, "journal io error");
-	closure_put(&w->c->journal.io.cl);
+	cache_set_err_on(bio->bi_error, w->c, "journal io error");
+	closure_put(&w->c->journal.io);
 }
 
 static void journal_write(struct closure *);
 
 static void journal_write_done(struct closure *cl)
 {
-	struct journal *j = container_of(cl, struct journal, io.cl);
-	struct cache_set *c = container_of(j, struct cache_set, journal);
-
+	struct journal *j = container_of(cl, struct journal, io);
 	struct journal_write *w = (j->cur == j->w)
 		? &j->w[1]
 		: &j->w[0];
 
 	__closure_wake_up(&w->wait);
+	continue_at_nobarrier(cl, journal_write, system_wq);
+}
 
-	if (c->journal_delay_ms)
-		closure_delay(&j->io, msecs_to_jiffies(c->journal_delay_ms));
+static void journal_write_unlock(struct closure *cl)
+{
+	struct cache_set *c = container_of(cl, struct cache_set, journal.io);
 
-	continue_at(cl, journal_write, system_wq);
+	c->journal.io_in_flight = 0;
+	spin_unlock(&c->journal.lock);
 }
 
 static void journal_write_unlocked(struct closure *cl)
 	__releases(c->journal.lock)
 {
-	struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl);
+	struct cache_set *c = container_of(cl, struct cache_set, journal.io);
 	struct cache *ca;
 	struct journal_write *w = c->journal.cur;
 	struct bkey *k = &c->journal.key;
-	unsigned i, sectors = set_blocks(w->data, c) * c->sb.block_size;
+	unsigned i, sectors = set_blocks(w->data, block_bytes(c)) *
+		c->sb.block_size;
 	struct bio *bio;
 	struct bio_list list;
 	bio_list_init(&list);
 
 	if (!w->need_write) {
-		/*
-		 * XXX: have to unlock closure before we unlock journal lock,
-		 * else we race with bch_journal(). But this way we race
-		 * against cache set unregister. Doh.
-		 */
-		set_closure_fn(cl, NULL, NULL);
-		closure_sub(cl, CLOSURE_RUNNING + 1);
-		spin_unlock(&c->journal.lock);
+		closure_return_with_destructor(cl, journal_write_unlock);
 		return;
 	} else if (journal_full(&c->journal)) {
 		journal_reclaim(c);
@@ -602,9 +599,10 @@
 
 		btree_flush_write(c);
 		continue_at(cl, journal_write, system_wq);
+		return;
 	}
 
-	c->journal.blocks_free -= set_blocks(w->data, c);
+	c->journal.blocks_free -= set_blocks(w->data, block_bytes(c));
 
 	w->data->btree_level = c->root->level;
 
@@ -614,7 +612,7 @@
 	for_each_cache(ca, c, i)
 		w->data->prio_bucket[ca->sb.nr_this_dev] = ca->prio_buckets[0];
 
-	w->data->magic		= jset_magic(c);
+	w->data->magic		= jset_magic(&c->sb);
 	w->data->version	= BCACHE_JSET_VERSION;
 	w->data->last_seq	= last_seq(&c->journal);
 	w->data->csum		= csum_set(w->data);
@@ -626,10 +624,10 @@
 		atomic_long_add(sectors, &ca->meta_sectors_written);
 
 		bio_reset(bio);
-		bio->bi_sector	= PTR_OFFSET(k, i);
+		bio->bi_iter.bi_sector	= PTR_OFFSET(k, i);
 		bio->bi_bdev	= ca->bdev;
 		bio->bi_rw	= REQ_WRITE|REQ_SYNC|REQ_META|REQ_FLUSH|REQ_FUA;
-		bio->bi_size	= sectors << 9;
+		bio->bi_iter.bi_size	= sectors << 9;
 
 		bio->bi_end_io	= journal_write_endio;
 		bio->bi_private = w;
@@ -650,125 +648,151 @@
 	spin_unlock(&c->journal.lock);
 
 	while ((bio = bio_list_pop(&list)))
-		closure_bio_submit(bio, cl, c->cache[0]);
+		closure_bio_submit(bio, cl);
 
 	continue_at(cl, journal_write_done, NULL);
 }
 
 static void journal_write(struct closure *cl)
 {
-	struct cache_set *c = container_of(cl, struct cache_set, journal.io.cl);
+	struct cache_set *c = container_of(cl, struct cache_set, journal.io);
 
 	spin_lock(&c->journal.lock);
 	journal_write_unlocked(cl);
 }
 
-static void __journal_try_write(struct cache_set *c, bool noflush)
+static void journal_try_write(struct cache_set *c)
 	__releases(c->journal.lock)
 {
-	struct closure *cl = &c->journal.io.cl;
+	struct closure *cl = &c->journal.io;
+	struct journal_write *w = c->journal.cur;
 
-	if (!closure_trylock(cl, &c->cl))
-		spin_unlock(&c->journal.lock);
-	else if (noflush && journal_full(&c->journal)) {
+	w->need_write = true;
+
+	if (!c->journal.io_in_flight) {
+		c->journal.io_in_flight = 1;
+		closure_call(cl, journal_write_unlocked, NULL, &c->cl);
+	} else {
 		spin_unlock(&c->journal.lock);
-		continue_at(cl, journal_write, system_wq);
-	} else
-		journal_write_unlocked(cl);
+	}
 }
 
-#define journal_try_write(c)	__journal_try_write(c, false)
-
-void bch_journal_meta(struct cache_set *c, struct closure *cl)
+static struct journal_write *journal_wait_for_write(struct cache_set *c,
+						    unsigned nkeys)
 {
-	struct journal_write *w;
+	size_t sectors;
+	struct closure cl;
+	bool wait = false;
 
-	if (CACHE_SYNC(&c->sb)) {
-		spin_lock(&c->journal.lock);
+	closure_init_stack(&cl);
 
-		w = c->journal.cur;
-		w->need_write = true;
+	spin_lock(&c->journal.lock);
+
+	while (1) {
+		struct journal_write *w = c->journal.cur;
+
+		sectors = __set_blocks(w->data, w->data->keys + nkeys,
+				       block_bytes(c)) * c->sb.block_size;
 
-		if (cl)
-			BUG_ON(!closure_wait(&w->wait, cl));
+		if (sectors <= min_t(size_t,
+				     c->journal.blocks_free * c->sb.block_size,
+				     PAGE_SECTORS << JSET_BITS))
+			return w;
+
+		if (wait)
+			closure_wait(&c->journal.wait, &cl);
+
+		if (!journal_full(&c->journal)) {
+			if (wait)
+				trace_bcache_journal_entry_full(c);
+
+			/*
+			 * XXX: If we were inserting so many keys that they
+			 * won't fit in an _empty_ journal write, we'll
+			 * deadlock. For now, handle this in
+			 * bch_keylist_realloc() - but something to think about.
+			 */
+			BUG_ON(!w->data->keys);
+
+			journal_try_write(c); /* unlocks */
+		} else {
+			if (wait)
+				trace_bcache_journal_full(c);
 
-		closure_flush(&c->journal.io);
-		__journal_try_write(c, true);
+			journal_reclaim(c);
+			spin_unlock(&c->journal.lock);
+
+			btree_flush_write(c);
+		}
+
+		closure_sync(&cl);
+		spin_lock(&c->journal.lock);
+		wait = true;
 	}
 }
 
+static void journal_write_work(struct work_struct *work)
+{
+	struct cache_set *c = container_of(to_delayed_work(work),
+					   struct cache_set,
+					   journal.work);
+	spin_lock(&c->journal.lock);
+	if (c->journal.cur->dirty)
+		journal_try_write(c);
+	else
+		spin_unlock(&c->journal.lock);
+}
+
 /*
  * Entry point to the journalling code - bio_insert() and btree_invalidate()
  * pass bch_journal() a list of keys to be journalled, and then
  * bch_journal() hands those same keys off to btree_insert_async()
  */
 
-void bch_journal(struct closure *cl)
+atomic_t *bch_journal(struct cache_set *c,
+		      struct keylist *keys,
+		      struct closure *parent)
 {
-	struct btree_op *op = container_of(cl, struct btree_op, cl);
-	struct cache_set *c = op->c;
 	struct journal_write *w;
-	size_t b, n = ((uint64_t *) op->keys.top) - op->keys.list;
+	atomic_t *ret;
 
-	if (op->type != BTREE_INSERT ||
-	    !CACHE_SYNC(&c->sb))
-		goto out;
+	if (!CACHE_SYNC(&c->sb))
+		return NULL;
 
-	/*
-	 * If we're looping because we errored, might already be waiting on
-	 * another journal write:
-	 */
-	while (atomic_read(&cl->parent->remaining) & CLOSURE_WAITING)
-		closure_sync(cl->parent);
+	w = journal_wait_for_write(c, bch_keylist_nkeys(keys));
 
-	spin_lock(&c->journal.lock);
+	memcpy(bset_bkey_last(w->data), keys->keys, bch_keylist_bytes(keys));
+	w->data->keys += bch_keylist_nkeys(keys);
 
-	if (journal_full(&c->journal)) {
-		/* XXX: tracepoint */
-		closure_wait(&c->journal.wait, cl);
+	ret = &fifo_back(&c->journal.pin);
+	atomic_inc(ret);
 
-		journal_reclaim(c);
+	if (parent) {
+		closure_wait(&w->wait, parent);
+		journal_try_write(c);
+	} else if (!w->dirty) {
+		w->dirty = true;
+		schedule_delayed_work(&c->journal.work,
+				      msecs_to_jiffies(c->journal_delay_ms));
+		spin_unlock(&c->journal.lock);
+	} else {
 		spin_unlock(&c->journal.lock);
-
-		btree_flush_write(c);
-		continue_at(cl, bch_journal, bcache_wq);
 	}
 
-	w = c->journal.cur;
-	w->need_write = true;
-	b = __set_blocks(w->data, w->data->keys + n, c);
-
-	if (b * c->sb.block_size > PAGE_SECTORS << JSET_BITS ||
-	    b > c->journal.blocks_free) {
-		/* XXX: If we were inserting so many keys that they won't fit in
-		 * an _empty_ journal write, we'll deadlock. For now, handle
-		 * this in bch_keylist_realloc() - but something to think about.
-		 */
-		BUG_ON(!w->data->keys);
-
-		/* XXX: tracepoint */
-		BUG_ON(!closure_wait(&w->wait, cl));
-		closure_flush(&c->journal.io);
-
-		journal_try_write(c);
-		continue_at(cl, bch_journal, bcache_wq);
-	}
-
-	memcpy(end(w->data), op->keys.list, n * sizeof(uint64_t));
-	w->data->keys += n;
+	return ret;
+}
 
-	op->journal = &fifo_back(&c->journal.pin);
-	atomic_inc(op->journal);
+void bch_journal_meta(struct cache_set *c, struct closure *cl)
+{
+	struct keylist keys;
+	atomic_t *ref;
 
-	if (op->flush_journal) {
-		closure_flush(&c->journal.io);
-		closure_wait(&w->wait, cl->parent);
-	}
+	bch_keylist_init(&keys);
 
-	journal_try_write(c);
-out:
-	bch_btree_insert_async(cl);
+	ref = bch_journal(c, &keys, cl);
+	if (ref)
+		atomic_dec_bug(ref);
 }
 
 void bch_journal_free(struct cache_set *c)
@@ -782,8 +806,8 @@
 {
 	struct journal *j = &c->journal;
 
-	closure_init_unlocked(&j->io);
 	spin_lock_init(&j->lock);
+	INIT_DELAYED_WORK(&j->work, journal_write_work);
 
 	c->journal_delay_ms = 100;