--- zzzz-none-000/linux-3.10.107/drivers/block/aoe/aoecmd.c	2017-06-27 09:49:32.000000000 +0000
+++ scorpion-7490-727/linux-3.10.107/drivers/block/aoe/aoecmd.c	2021-02-04 17:41:59.000000000 +0000
@@ -1,4 +1,4 @@
-/* Copyright (c) 2012 Coraid, Inc. See COPYING for GPL terms. */
+/* Copyright (c) 2013 Coraid, Inc. See COPYING for GPL terms. */
 /*
  * aoecmd.c
  * Filesystem request handling methods
@@ -35,14 +35,27 @@
 MODULE_PARM_DESC(aoe_maxout,
 	"Only aoe_maxout outstanding packets for every MAC on eX.Y.");
 
-static wait_queue_head_t ktiowq;
-static struct ktstate kts;
+/* The number of online cpus during module initialization gives us a
+ * convenient heuristic cap on the parallelism used for ktio threads
+ * doing I/O completion. It is not important that the cap equal the
+ * actual number of running CPUs at any given time, but because of CPU
+ * hotplug, we take care to use ncpus instead of using
+ * num_online_cpus() after module initialization.
+ */
+static int ncpus;
+
+/* mutex lock used for synchronization while thread spawning */
+static DEFINE_MUTEX(ktio_spawn_lock);
+
+static wait_queue_head_t *ktiowq;
+static struct ktstate *kts;
 
 /* io completion queue */
-static struct {
+struct iocq_ktio {
 	struct list_head head;
 	spinlock_t lock;
-} iocq;
+};
+static struct iocq_ktio *iocq;
 
 static struct page *empty_page;
 
@@ -183,8 +196,7 @@
 
 	t = f->t;
 	f->buf = NULL;
-	f->lba = 0;
-	f->bv = NULL;
+	memset(&f->iter, 0, sizeof(f->iter));
 	f->r_skb = NULL;
 	f->flags = 0;
 	list_add(&f->head, &t->ffree);
@@ -282,21 +294,14 @@
 }
 
 static void
-skb_fillup(struct sk_buff *skb, struct bio_vec *bv, ulong off, ulong cnt)
+skb_fillup(struct sk_buff *skb, struct bio *bio, struct bvec_iter iter)
 {
 	int frag = 0;
-	ulong fcnt;
-loop:
-	fcnt = bv->bv_len - (off - bv->bv_offset);
-	if (fcnt > cnt)
-		fcnt = cnt;
-	skb_fill_page_desc(skb, frag++, bv->bv_page, off, fcnt);
-	cnt -= fcnt;
-	if (cnt <= 0)
-		return;
-	bv++;
-	off = bv->bv_offset;
-	goto loop;
+	struct bio_vec bv;
+
+	__bio_for_each_segment(bv, bio, iter, iter)
+		skb_fill_page_desc(skb, frag++, bv.bv_page,
+				   bv.bv_offset, bv.bv_len);
 }
 
 static void
@@ -333,12 +338,10 @@
 	t->nout++;
 	f->waited = 0;
 	f->waited_total = 0;
-	if (f->buf)
-		f->lba = f->buf->sector;
 
 	/* set up ata header */
-	ah->scnt = f->bcnt >> 9;
-	put_lba(ah, f->lba);
+	ah->scnt = f->iter.bi_size >> 9;
+	put_lba(ah, f->iter.bi_sector);
 	if (t->d->flags & DEVFL_EXT) {
 		ah->aflags |= AOEAFL_EXT;
 	} else {
@@ -347,11 +350,11 @@
 		ah->lba3 |= 0xe0;	/* LBA bit + obsolete 0xa0 */
 	}
 	if (f->buf && bio_data_dir(f->buf->bio) == WRITE) {
-		skb_fillup(skb, f->bv, f->bv_off, f->bcnt);
+		skb_fillup(skb, f->buf->bio, f->iter);
 		ah->aflags |= AOEAFL_WRITE;
-		skb->len += f->bcnt;
-		skb->data_len = f->bcnt;
-		skb->truesize += f->bcnt;
+		skb->len += f->iter.bi_size;
+		skb->data_len = f->iter.bi_size;
+		skb->truesize += f->iter.bi_size;
 		t->wpkts++;
 	} else {
 		t->rpkts++;
@@ -367,10 +370,8 @@
 {
 	struct frame *f;
 	struct buf *buf;
-	struct aoetgt *t;
 	struct sk_buff *skb;
 	struct sk_buff_head queue;
-	ulong bcnt, fbcnt;
 
 	buf = nextbuf(d);
 	if (buf == NULL)
@@ -378,40 +379,22 @@
 	f = newframe(d);
 	if (f == NULL)
 		return 0;
-	t = *d->tgt;
-	bcnt = d->maxbcnt;
-	if (bcnt == 0)
-		bcnt = DEFAULTBCNT;
-	if (bcnt > buf->resid)
-		bcnt = buf->resid;
-	fbcnt = bcnt;
-	f->bv = buf->bv;
-	f->bv_off = f->bv->bv_offset + (f->bv->bv_len - buf->bv_resid);
-	do {
-		if (fbcnt < buf->bv_resid) {
-			buf->bv_resid -= fbcnt;
-			buf->resid -= fbcnt;
-			break;
-		}
-		fbcnt -= buf->bv_resid;
-		buf->resid -= buf->bv_resid;
-		if (buf->resid == 0) {
-			d->ip.buf = NULL;
-			break;
-		}
-		buf->bv++;
-		buf->bv_resid = buf->bv->bv_len;
-		WARN_ON(buf->bv_resid == 0);
-	} while (fbcnt);
 
 	/* initialize the headers & frame */
 	f->buf = buf;
-	f->bcnt = bcnt;
-	ata_rw_frameinit(f);
+	f->iter = buf->iter;
+	f->iter.bi_size = min_t(unsigned long,
+				d->maxbcnt ?: DEFAULTBCNT,
+				f->iter.bi_size);
+	bio_advance_iter(buf->bio, &buf->iter, f->iter.bi_size);
+
+	if (!buf->iter.bi_size)
+		d->ip.buf = NULL;
 
 	/* mark all tracking fields and load out */
 	buf->nframesout += 1;
-	buf->sector += bcnt >> 9;
+
+	ata_rw_frameinit(f);
 
 	skb = skb_clone(f->skb, GFP_ATOMIC);
 	if (skb) {
@@ -472,7 +455,6 @@
 	struct sk_buff *skb;
 	struct sk_buff_head queue;
 	struct aoe_hdr *h;
-	struct aoe_atahdr *ah;
 	struct aoetgt *t;
 	char buf[128];
 	u32 n;
@@ -487,7 +469,6 @@
 		return;
 	}
 	h = (struct aoe_hdr *) skb_mac_header(skb);
-	ah = (struct aoe_atahdr *) (h+1);
 
 	if (!(f->flags & FFL_PROBE)) {
 		snprintf(buf, sizeof(buf),
@@ -604,10 +585,7 @@
 	skb = nf->skb;
 	nf->skb = f->skb;
 	nf->buf = f->buf;
-	nf->bcnt = f->bcnt;
-	nf->lba = f->lba;
-	nf->bv = f->bv;
-	nf->bv_off = f->bv_off;
+	nf->iter = f->iter;
 	nf->waited = 0;
 	nf->waited_total = f->waited_total;
 	nf->sent = f->sent;
@@ -639,19 +617,19 @@
 	}
 	f->flags |= FFL_PROBE;
 	ifrotate(t);
-	f->bcnt = t->d->maxbcnt ? t->d->maxbcnt : DEFAULTBCNT;
+	f->iter.bi_size = t->d->maxbcnt ? t->d->maxbcnt : DEFAULTBCNT;
 	ata_rw_frameinit(f);
 	skb = f->skb;
-	for (frag = 0, n = f->bcnt; n > 0; ++frag, n -= m) {
+	for (frag = 0, n = f->iter.bi_size; n > 0; ++frag, n -= m) {
 		if (n < PAGE_SIZE)
 			m = n;
 		else
 			m = PAGE_SIZE;
 		skb_fill_page_desc(skb, frag, empty_page, 0, m);
 	}
-	skb->len += f->bcnt;
-	skb->data_len = f->bcnt;
-	skb->truesize += f->bcnt;
+	skb->len += f->iter.bi_size;
+	skb->data_len = f->iter.bi_size;
+	skb->truesize += f->iter.bi_size;
 
 	skb = skb_clone(f->skb, GFP_ATOMIC);
 	if (skb) {
@@ -888,21 +866,15 @@
 static void
 bio_pageinc(struct bio *bio)
 {
-	struct bio_vec *bv;
+	struct bio_vec bv;
 	struct page *page;
-	int i;
+	struct bvec_iter iter;
 
-	bio_for_each_segment(bv, bio, i) {
-		page = bv->bv_page;
+	bio_for_each_segment(bv, bio, iter) {
 		/* Non-zero page count for non-head members of
-		 * compound pages is no longer allowed by the kernel,
-		 * but this has never been seen here.
+		 * compound pages is no longer allowed by the kernel.
 		 */
-		if (unlikely(PageCompound(page)))
-			if (compound_head(page) != page) {
-				pr_crit("page tail used for block I/O\n");
-				BUG();
-			}
+		page = compound_head(bv.bv_page);
 		atomic_inc(&page->_count);
 	}
 }
@@ -910,11 +882,14 @@
 static void
 bio_pagedec(struct bio *bio)
 {
-	struct bio_vec *bv;
-	int i;
+	struct page *page;
+	struct bio_vec bv;
+	struct bvec_iter iter;
 
-	bio_for_each_segment(bv, bio, i)
-		atomic_dec(&bv->bv_page->_count);
+	bio_for_each_segment(bv, bio, iter) {
+		page = compound_head(bv.bv_page);
+		atomic_dec(&page->_count);
+	}
 }
 
 static void
@@ -923,12 +898,8 @@
 	memset(buf, 0, sizeof(*buf));
 	buf->rq = rq;
 	buf->bio = bio;
-	buf->resid = bio->bi_size;
-	buf->sector = bio->bi_sector;
+	buf->iter = bio->bi_iter;
 	bio_pageinc(bio);
-	buf->bv = bio_iovec(bio);
-	buf->bv_resid = buf->bv->bv_len;
-	WARN_ON(buf->bv_resid == 0);
 }
 
 static struct buf *
@@ -1113,24 +1084,18 @@
 }
 
 static void
-bvcpy(struct bio_vec *bv, ulong off, struct sk_buff *skb, long cnt)
+bvcpy(struct sk_buff *skb, struct bio *bio, struct bvec_iter iter, long cnt)
 {
-	ulong fcnt;
-	char *p;
 	int soff = 0;
-loop:
-	fcnt = bv->bv_len - (off - bv->bv_offset);
-	if (fcnt > cnt)
-		fcnt = cnt;
-	p = page_address(bv->bv_page) + off;
-	skb_copy_bits(skb, soff, p, fcnt);
-	soff += fcnt;
-	cnt -= fcnt;
-	if (cnt <= 0)
-		return;
-	bv++;
-	off = bv->bv_offset;
-	goto loop;
+	struct bio_vec bv;
+
+	iter.bi_size = cnt;
+
+	__bio_for_each_segment(bv, bio, iter, iter) {
+		char *p = page_address(bv.bv_page) + bv.bv_offset;
+		skb_copy_bits(skb, soff, p, bv.bv_len);
+		soff += bv.bv_len;
+	}
 }
 
 void
@@ -1145,8 +1110,8 @@
 	d->ip.rq = NULL;
 	do {
 		bio = rq->bio;
-		bok = !fastfail && test_bit(BIO_UPTODATE, &bio->bi_flags);
-	} while (__blk_end_request(rq, bok ? 0 : -EIO, bio->bi_size));
+		bok = !fastfail && !bio->bi_error;
+	} while (__blk_end_request(rq, bok ? 0 : -EIO, bio->bi_iter.bi_size));
 
 	/* cf. http://lkml.org/lkml/2006/10/31/28 */
 	if (!fastfail)
@@ -1207,7 +1172,7 @@
 			ahout->cmdstat, ahin->cmdstat,
 			d->aoemajor, d->aoeminor);
 noskb:	if (buf)
-			clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
+			buf->bio->bi_error = -EIO;
 		goto out;
 	}
 
@@ -1220,10 +1185,18 @@
 				"aoe: runt data size in read from",
 				(long) d->aoemajor, d->aoeminor,
 				skb->len, n);
-			clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
+			buf->bio->bi_error = -EIO;
 			break;
 		}
-		bvcpy(f->bv, f->bv_off, skb, n);
+		if (n > f->iter.bi_size) {
+			pr_err_ratelimited("%s e%ld.%d. bytes=%ld need=%u\n",
+				"aoe: too-large data size in read from",
+				(long) d->aoemajor, d->aoeminor,
+				n, f->iter.bi_size);
+			buf->bio->bi_error = -EIO;
+			break;
+		}
+		bvcpy(skb, f->buf->bio, f->iter, n);
 	case ATA_CMD_PIO_WRITE:
 	case ATA_CMD_PIO_WRITE_EXT:
 		spin_lock_irq(&d->lock);
@@ -1266,7 +1239,7 @@
 
 	aoe_freetframe(f);
 
-	if (buf && --buf->nframesout == 0 && buf->resid == 0)
+	if (buf && --buf->nframesout == 0 && buf->iter.bi_size == 0)
 		aoe_end_buf(d, buf);
 
 	spin_unlock_irq(&d->lock);
@@ -1278,23 +1251,36 @@
  * Returns true iff responses needing processing remain.
  */
 static int
-ktio(void)
+ktio(int id)
 {
 	struct frame *f;
 	struct list_head *pos;
 	int i;
+	int actual_id;
 
 	for (i = 0; ; ++i) {
 		if (i == MAXIOC)
 			return 1;
-		if (list_empty(&iocq.head))
+		if (list_empty(&iocq[id].head))
 			return 0;
-		pos = iocq.head.next;
+		pos = iocq[id].head.next;
 		list_del(pos);
-		spin_unlock_irq(&iocq.lock);
 		f = list_entry(pos, struct frame, head);
+		spin_unlock_irq(&iocq[id].lock);
 		ktiocomplete(f);
-		spin_lock_irq(&iocq.lock);
+
+		/* Figure out if extra threads are required. */
+		actual_id = f->t->d->aoeminor % ncpus;
+
+		if (!kts[actual_id].active) {
+			BUG_ON(id != 0);
+			mutex_lock(&ktio_spawn_lock);
+			if (!kts[actual_id].active
+				&& aoe_ktstart(&kts[actual_id]) == 0)
+				kts[actual_id].active = 1;
+			mutex_unlock(&ktio_spawn_lock);
+		}
+		spin_lock_irq(&iocq[id].lock);
 	}
 }
 
@@ -1311,7 +1297,7 @@
 	complete(&k->rendez);	/* tell spawner we're running */
 	do {
 		spin_lock_irq(k->lock);
-		more = k->fn();
+		more = k->fn(k->id);
 		if (!more) {
 			add_wait_queue(k->waitq, &wait);
 			__set_current_state(TASK_INTERRUPTIBLE);
@@ -1340,7 +1326,7 @@
 	struct task_struct *task;
 
 	init_completion(&k->rendez);
-	task = kthread_run(kthread, k, k->name);
+	task = kthread_run(kthread, k, "%s", k->name);
 	if (task == NULL || IS_ERR(task))
 		return -ENOMEM;
 	k->task = task;
@@ -1353,13 +1339,24 @@
 static void
 ktcomplete(struct frame *f, struct sk_buff *skb)
 {
+	int id;
 	ulong flags;
 
 	f->r_skb = skb;
-	spin_lock_irqsave(&iocq.lock, flags);
-	list_add_tail(&f->head, &iocq.head);
-	spin_unlock_irqrestore(&iocq.lock, flags);
-	wake_up(&ktiowq);
+	id = f->t->d->aoeminor % ncpus;
+	spin_lock_irqsave(&iocq[id].lock, flags);
+	if (!kts[id].active) {
+		spin_unlock_irqrestore(&iocq[id].lock, flags);
+		/* The thread with id has not been spawned yet,
+		 * so delegate the work to the main thread and
+		 * try spawning a new thread.
+		 */
+		id = 0;
+		spin_lock_irqsave(&iocq[id].lock, flags);
+	}
+	list_add_tail(&f->head, &iocq[id].head);
+	spin_unlock_irqrestore(&iocq[id].lock, flags);
+	wake_up(&ktiowq[id]);
 }
 
 struct sk_buff *
@@ -1697,8 +1694,8 @@
 {
 	if (buf == NULL)
 		return;
-	buf->resid = 0;
-	clear_bit(BIO_UPTODATE, &buf->bio->bi_flags);
+	buf->iter.bi_size = 0;
+	buf->bio->bi_error = -EIO;
 	if (buf->nframesout == 0)
 		aoe_end_buf(d, buf);
 }
@@ -1706,6 +1703,17 @@
 void
 aoe_flush_iocq(void)
 {
+	int i;
+
+	for (i = 0; i < ncpus; i++) {
+		if (kts[i].active)
+			aoe_flush_iocq_by_index(i);
+	}
+}
+
+void
+aoe_flush_iocq_by_index(int id)
+{
 	struct frame *f;
 	struct aoedev *d;
 	LIST_HEAD(flist);
@@ -1713,9 +1721,9 @@
 	struct sk_buff *skb;
 	ulong flags;
 
-	spin_lock_irqsave(&iocq.lock, flags);
-	list_splice_init(&iocq.head, &flist);
-	spin_unlock_irqrestore(&iocq.lock, flags);
+	spin_lock_irqsave(&iocq[id].lock, flags);
+	list_splice_init(&iocq[id].head, &flist);
+	spin_unlock_irqrestore(&iocq[id].lock, flags);
 	while (!list_empty(&flist)) {
 		pos = flist.next;
 		list_del(pos);
@@ -1738,6 +1746,8 @@
 aoecmd_init(void)
 {
 	void *p;
+	int i;
+	int ret;
 
 	/* get_zeroed_page returns page with ref count 1 */
 	p = (void *) get_zeroed_page(GFP_KERNEL | __GFP_REPEAT);
@@ -1745,22 +1755,72 @@
 		return -ENOMEM;
 	empty_page = virt_to_page(p);
 
-	INIT_LIST_HEAD(&iocq.head);
-	spin_lock_init(&iocq.lock);
-	init_waitqueue_head(&ktiowq);
-	kts.name = "aoe_ktio";
-	kts.fn = ktio;
-	kts.waitq = &ktiowq;
-	kts.lock = &iocq.lock;
-	return aoe_ktstart(&kts);
+	ncpus = num_online_cpus();
+
+	iocq = kcalloc(ncpus, sizeof(struct iocq_ktio), GFP_KERNEL);
+	if (!iocq)
+		return -ENOMEM;
+
+	kts = kcalloc(ncpus, sizeof(struct ktstate), GFP_KERNEL);
+	if (!kts) {
+		ret = -ENOMEM;
+		goto kts_fail;
+	}
+
+	ktiowq = kcalloc(ncpus, sizeof(wait_queue_head_t), GFP_KERNEL);
+	if (!ktiowq) {
+		ret = -ENOMEM;
+		goto ktiowq_fail;
+	}
+
+	mutex_init(&ktio_spawn_lock);
+
+	for (i = 0; i < ncpus; i++) {
+		INIT_LIST_HEAD(&iocq[i].head);
+		spin_lock_init(&iocq[i].lock);
+		init_waitqueue_head(&ktiowq[i]);
+		snprintf(kts[i].name, sizeof(kts[i].name), "aoe_ktio%d", i);
+		kts[i].fn = ktio;
+		kts[i].waitq = &ktiowq[i];
+		kts[i].lock = &iocq[i].lock;
+		kts[i].id = i;
+		kts[i].active = 0;
+	}
+	kts[0].active = 1;
+	if (aoe_ktstart(&kts[0])) {
+		ret = -ENOMEM;
+		goto ktstart_fail;
+	}
+	return 0;
+
+ktstart_fail:
+	kfree(ktiowq);
+ktiowq_fail:
+	kfree(kts);
+kts_fail:
+	kfree(iocq);
+
+	return ret;
 }
 
 void
 aoecmd_exit(void)
 {
-	aoe_ktstop(&kts);
+	int i;
+
+	for (i = 0; i < ncpus; i++)
+		if (kts[i].active)
+			aoe_ktstop(&kts[i]);
+
 	aoe_flush_iocq();
 
+	/* Free up the iocq and thread speicific configuration
+	 * allocated during startup.
+	 */
+	kfree(iocq);
+	kfree(kts);
+	kfree(ktiowq);
+
 	free_page((unsigned long) page_address(empty_page));
 	empty_page = NULL;
 }