--- zzzz-none-000/linux-3.10.107/drivers/md/raid1.c 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/drivers/md/raid1.c 2021-02-04 17:41:59.000000000 +0000 @@ -66,7 +66,8 @@ */ static int max_queued_requests = 1024; -static void allow_barrier(struct r1conf *conf); +static void allow_barrier(struct r1conf *conf, sector_t start_next_window, + sector_t bi_sector); static void lower_barrier(struct r1conf *conf); static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) @@ -84,10 +85,14 @@ } #define RESYNC_BLOCK_SIZE (64*1024) -//#define RESYNC_BLOCK_SIZE PAGE_SIZE +#define RESYNC_DEPTH 32 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) -#define RESYNC_WINDOW (2048*1024) +#define RESYNC_WINDOW (RESYNC_BLOCK_SIZE * RESYNC_DEPTH) +#define RESYNC_WINDOW_SECTORS (RESYNC_WINDOW >> 9) +#define CLUSTER_RESYNC_WINDOW (16 * RESYNC_WINDOW) +#define CLUSTER_RESYNC_WINDOW_SECTORS (CLUSTER_RESYNC_WINDOW >> 9) +#define NEXT_NORMALIO_DISTANCE (3 * RESYNC_WINDOW_SECTORS) static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) { @@ -234,6 +239,8 @@ struct bio *bio = r1_bio->master_bio; int done; struct r1conf *conf = r1_bio->mddev->private; + sector_t start_next_window = r1_bio->start_next_window; + sector_t bi_sector = bio->bi_iter.bi_sector; if (bio->bi_phys_segments) { unsigned long flags; @@ -241,18 +248,24 @@ bio->bi_phys_segments--; done = (bio->bi_phys_segments == 0); spin_unlock_irqrestore(&conf->device_lock, flags); + /* + * make_request() might be waiting for + * bi_phys_segments to decrease + */ + wake_up(&conf->wait_barrier); } else done = 1; if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) - clear_bit(BIO_UPTODATE, &bio->bi_flags); + bio->bi_error = -EIO; + if (done) { - bio_endio(bio, 0); + bio_endio(bio); /* * Wake up any possible resync thread that waits for the device * to go idle. */ - allow_barrier(conf); + allow_barrier(conf, start_next_window, bi_sector); } } @@ -264,9 +277,8 @@ if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { pr_debug("raid1: sync end %s on sectors %llu-%llu\n", (bio_data_dir(bio) == WRITE) ? "write" : "read", - (unsigned long long) bio->bi_sector, - (unsigned long long) bio->bi_sector + - bio_sectors(bio) - 1); + (unsigned long long) bio->bi_iter.bi_sector, + (unsigned long long) bio_end_sector(bio) - 1); call_bio_endio(r1_bio); } @@ -303,9 +315,9 @@ return mirror; } -static void raid1_end_read_request(struct bio *bio, int error) +static void raid1_end_read_request(struct bio *bio) { - int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + int uptodate = !bio->bi_error; struct r1bio *r1_bio = bio->bi_private; int mirror; struct r1conf *conf = r1_bio->mddev->private; @@ -388,9 +400,8 @@ } } -static void raid1_end_write_request(struct bio *bio, int error) +static void raid1_end_write_request(struct bio *bio) { - int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct r1bio *r1_bio = bio->bi_private; int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state); struct r1conf *conf = r1_bio->mddev->private; @@ -401,7 +412,7 @@ /* * 'one mirror IO has finished' event handler: */ - if (!uptodate) { + if (bio->bi_error) { set_bit(WriteErrorSeen, &conf->mirrors[mirror].rdev->flags); if (!test_and_set_bit(WantReplacement, @@ -465,9 +476,8 @@ struct bio *mbio = r1_bio->master_bio; pr_debug("raid1: behind end write sectors" " %llu-%llu\n", - (unsigned long long) mbio->bi_sector, - (unsigned long long) mbio->bi_sector + - bio_sectors(mbio) - 1); + (unsigned long long) mbio->bi_iter.bi_sector, + (unsigned long long) bio_end_sector(mbio) - 1); call_bio_endio(r1_bio); } } @@ -486,7 +496,6 @@ bio_put(to_put); } - /* * This routine returns the disk from which the requested read should * be done. There is a per-array 'next expected sequential IO' sector @@ -532,8 +541,10 @@ has_nonrot_disk = 0; choose_next_idle = 0; - if (conf->mddev->recovery_cp < MaxSector && - (this_sector + sectors >= conf->next_resync)) + if ((conf->mddev->recovery_cp < this_sector + sectors) || + (mddev_is_clustered(conf->mddev) && + md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector, + this_sector + sectors))) choose_first = 1; else choose_first = 0; @@ -548,7 +559,6 @@ rdev = rcu_dereference(conf->mirrors[disk].rdev); if (r1_bio->bios[disk] == IO_BLOCKED || rdev == NULL - || test_bit(Unmerged, &rdev->flags) || test_bit(Faulty, &rdev->flags)) continue; if (!test_bit(In_sync, &rdev->flags) && @@ -560,7 +570,7 @@ if (best_dist_disk < 0) { if (is_badblock(rdev, this_sector, sectors, &first_bad, &bad_sectors)) { - if (first_bad <= this_sector) + if (first_bad < this_sector) /* Cannot use this */ continue; best_good_sectors = first_bad - this_sector; @@ -699,45 +709,12 @@ return best_disk; } -static int raid1_mergeable_bvec(struct request_queue *q, - struct bvec_merge_data *bvm, - struct bio_vec *biovec) -{ - struct mddev *mddev = q->queuedata; - struct r1conf *conf = mddev->private; - sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); - int max = biovec->bv_len; - - if (mddev->merge_check_needed) { - int disk; - rcu_read_lock(); - for (disk = 0; disk < conf->raid_disks * 2; disk++) { - struct md_rdev *rdev = rcu_dereference( - conf->mirrors[disk].rdev); - if (rdev && !test_bit(Faulty, &rdev->flags)) { - struct request_queue *q = - bdev_get_queue(rdev->bdev); - if (q->merge_bvec_fn) { - bvm->bi_sector = sector + - rdev->data_offset; - bvm->bi_bdev = rdev->bdev; - max = min(max, q->merge_bvec_fn( - q, bvm, biovec)); - } - } - } - rcu_read_unlock(); - } - return max; - -} - -int md_raid1_congested(struct mddev *mddev, int bits) +static int raid1_congested(struct mddev *mddev, int bits) { struct r1conf *conf = mddev->private; int i, ret = 0; - if ((bits & (1 << BDI_async_congested)) && + if ((bits & (1 << WB_async_congested)) && conf->pending_count >= max_queued_requests) return 1; @@ -752,7 +729,7 @@ /* Note the '|| 1' - when read_balance prefers * non-congested targets, it can be removed */ - if ((bits & (1<backing_dev_info, bits); else ret &= bdi_congested(&q->backing_dev_info, bits); @@ -761,15 +738,6 @@ rcu_read_unlock(); return ret; } -EXPORT_SYMBOL_GPL(md_raid1_congested); - -static int raid1_congested(void *data, int bits) -{ - struct mddev *mddev = data; - - return mddev_congested(mddev, bits) || - md_raid1_congested(mddev, bits); -} static void flush_pending_writes(struct r1conf *conf) { @@ -794,7 +762,7 @@ if (unlikely((bio->bi_rw & REQ_DISCARD) && !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) /* Just ignore it */ - bio_endio(bio, 0); + bio_endio(bio); else generic_make_request(bio); bio = next; @@ -824,9 +792,7 @@ * there is no normal IO happeing. It must arrange to call * lower_barrier when the particular background IO completes. */ -#define RESYNC_DEPTH 32 - -static void raise_barrier(struct r1conf *conf) +static void raise_barrier(struct r1conf *conf, sector_t sector_nr) { spin_lock_irq(&conf->resync_lock); @@ -836,12 +802,26 @@ /* block any new IO from starting */ conf->barrier++; + conf->next_resync = sector_nr; - /* Now wait for all pending IO to complete */ + /* For these conditions we must wait: + * A: while the array is in frozen state + * B: while barrier >= RESYNC_DEPTH, meaning resync reach + * the max count which allowed. + * C: next_resync + RESYNC_SECTORS > start_next_window, meaning + * next resync will reach to the window which normal bios are + * handling. + * D: while there are any active requests in the current window. + */ wait_event_lock_irq(conf->wait_barrier, - !conf->nr_pending && conf->barrier < RESYNC_DEPTH, + !conf->array_frozen && + conf->barrier < RESYNC_DEPTH && + conf->current_window_requests == 0 && + (conf->start_next_window >= + conf->next_resync + RESYNC_SECTORS), conf->resync_lock); + conf->nr_pending++; spin_unlock_irq(&conf->resync_lock); } @@ -851,41 +831,107 @@ BUG_ON(conf->barrier <= 0); spin_lock_irqsave(&conf->resync_lock, flags); conf->barrier--; + conf->nr_pending--; spin_unlock_irqrestore(&conf->resync_lock, flags); wake_up(&conf->wait_barrier); } -static void wait_barrier(struct r1conf *conf) +static bool need_to_wait_for_sync(struct r1conf *conf, struct bio *bio) { + bool wait = false; + + if (conf->array_frozen || !bio) + wait = true; + else if (conf->barrier && bio_data_dir(bio) == WRITE) { + if ((conf->mddev->curr_resync_completed + >= bio_end_sector(bio)) || + (conf->next_resync + NEXT_NORMALIO_DISTANCE + <= bio->bi_iter.bi_sector)) + wait = false; + else + wait = true; + } + + return wait; +} + +static sector_t wait_barrier(struct r1conf *conf, struct bio *bio) +{ + sector_t sector = 0; + spin_lock_irq(&conf->resync_lock); - if (conf->barrier) { + if (need_to_wait_for_sync(conf, bio)) { conf->nr_waiting++; /* Wait for the barrier to drop. * However if there are already pending * requests (preventing the barrier from * rising completely), and the - * pre-process bio queue isn't empty, + * per-process bio queue isn't empty, * then don't wait, as we need to empty - * that queue to get the nr_pending - * count down. + * that queue to allow conf->start_next_window + * to increase. */ wait_event_lock_irq(conf->wait_barrier, - !conf->barrier || - (conf->nr_pending && - current->bio_list && - !bio_list_empty(current->bio_list)), + !conf->array_frozen && + (!conf->barrier || + ((conf->start_next_window < + conf->next_resync + RESYNC_SECTORS) && + current->bio_list && + (!bio_list_empty(¤t->bio_list[0]) || + !bio_list_empty(¤t->bio_list[1])))), conf->resync_lock); conf->nr_waiting--; } + + if (bio && bio_data_dir(bio) == WRITE) { + if (bio->bi_iter.bi_sector >= conf->next_resync) { + if (conf->start_next_window == MaxSector) + conf->start_next_window = + conf->next_resync + + NEXT_NORMALIO_DISTANCE; + + if ((conf->start_next_window + NEXT_NORMALIO_DISTANCE) + <= bio->bi_iter.bi_sector) + conf->next_window_requests++; + else + conf->current_window_requests++; + sector = conf->start_next_window; + } + } + conf->nr_pending++; spin_unlock_irq(&conf->resync_lock); + return sector; } -static void allow_barrier(struct r1conf *conf) +static void allow_barrier(struct r1conf *conf, sector_t start_next_window, + sector_t bi_sector) { unsigned long flags; + spin_lock_irqsave(&conf->resync_lock, flags); conf->nr_pending--; + if (start_next_window) { + if (start_next_window == conf->start_next_window) { + if (conf->start_next_window + NEXT_NORMALIO_DISTANCE + <= bi_sector) + conf->next_window_requests--; + else + conf->current_window_requests--; + } else + conf->current_window_requests--; + + if (!conf->current_window_requests) { + if (conf->next_window_requests) { + conf->current_window_requests = + conf->next_window_requests; + conf->next_window_requests = 0; + conf->start_next_window += + NEXT_NORMALIO_DISTANCE; + } else + conf->start_next_window = MaxSector; + } + } spin_unlock_irqrestore(&conf->resync_lock, flags); wake_up(&conf->wait_barrier); } @@ -894,8 +940,7 @@ { /* stop syncio and normal IO and wait for everything to * go quite. - * We increment barrier and nr_waiting, and then - * wait until nr_pending match nr_queued+extra + * We wait until nr_pending match nr_queued+extra * This is called in the context of one normal IO request * that has failed. Thus any sync request that might be pending * will be blocked by nr_pending, and we need to wait for @@ -905,8 +950,7 @@ * we continue. */ spin_lock_irq(&conf->resync_lock); - conf->barrier++; - conf->nr_waiting++; + conf->array_frozen = 1; wait_event_lock_irq_cmd(conf->wait_barrier, conf->nr_pending == conf->nr_queued+extra, conf->resync_lock, @@ -917,14 +961,12 @@ { /* reverse the effect of the freeze */ spin_lock_irq(&conf->resync_lock); - conf->barrier--; - conf->nr_waiting--; + conf->array_frozen = 0; wake_up(&conf->wait_barrier); spin_unlock_irq(&conf->resync_lock); } - -/* duplicate the data pages for behind I/O +/* duplicate the data pages for behind I/O */ static void alloc_behind_pages(struct bio *bio, struct r1bio *r1_bio) { @@ -955,7 +997,8 @@ if (bvecs[i].bv_page) put_page(bvecs[i].bv_page); kfree(bvecs); - pr_debug("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); + pr_debug("%dB behind alloc failed, doing sync I/O\n", + bio->bi_iter.bi_size); } struct raid1_plug_cb { @@ -994,7 +1037,7 @@ if (unlikely((bio->bi_rw & REQ_DISCARD) && !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) /* Just ignore it */ - bio_endio(bio, 0); + bio_endio(bio); else generic_make_request(bio); bio = next; @@ -1023,6 +1066,7 @@ int first_clone; int sectors_handled; int max_sectors; + sector_t start_next_window; /* * Register the new request and wait if the reconstruction @@ -1033,8 +1077,11 @@ md_write_start(mddev, bio); /* wait on superblock update early */ if (bio_data_dir(bio) == WRITE && - bio_end_sector(bio) > mddev->suspend_lo && - bio->bi_sector < mddev->suspend_hi) { + ((bio_end_sector(bio) > mddev->suspend_lo && + bio->bi_iter.bi_sector < mddev->suspend_hi) || + (mddev_is_clustered(mddev) && + md_cluster_ops->area_resyncing(mddev, WRITE, + bio->bi_iter.bi_sector, bio_end_sector(bio))))) { /* As the suspend_* range is controlled by * userspace, we want an interruptible * wait. @@ -1045,14 +1092,17 @@ prepare_to_wait(&conf->wait_barrier, &w, TASK_INTERRUPTIBLE); if (bio_end_sector(bio) <= mddev->suspend_lo || - bio->bi_sector >= mddev->suspend_hi) + bio->bi_iter.bi_sector >= mddev->suspend_hi || + (mddev_is_clustered(mddev) && + !md_cluster_ops->area_resyncing(mddev, WRITE, + bio->bi_iter.bi_sector, bio_end_sector(bio)))) break; schedule(); } finish_wait(&conf->wait_barrier, &w); } - wait_barrier(conf); + start_next_window = wait_barrier(conf, bio); bitmap = mddev->bitmap; @@ -1067,7 +1117,7 @@ r1_bio->sectors = bio_sectors(bio); r1_bio->state = 0; r1_bio->mddev = mddev; - r1_bio->sector = bio->bi_sector; + r1_bio->sector = bio->bi_iter.bi_sector; /* We might need to issue multiple reads to different * devices if there are bad blocks around, so we keep @@ -1077,7 +1127,7 @@ * non-zero, then it is the number of not-completed requests. */ bio->bi_phys_segments = 0; - clear_bit(BIO_SEG_VALID, &bio->bi_flags); + bio_clear_flag(bio, BIO_SEG_VALID); if (rw == READ) { /* @@ -1105,14 +1155,16 @@ atomic_read(&bitmap->behind_writes) == 0); } r1_bio->read_disk = rdisk; + r1_bio->start_next_window = 0; read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); - md_trim_bio(read_bio, r1_bio->sector - bio->bi_sector, - max_sectors); + bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector, + max_sectors); r1_bio->bios[rdisk] = read_bio; - read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset; + read_bio->bi_iter.bi_sector = r1_bio->sector + + mirror->rdev->data_offset; read_bio->bi_bdev = mirror->rdev->bdev; read_bio->bi_end_io = raid1_end_read_request; read_bio->bi_rw = READ | do_sync; @@ -1124,7 +1176,7 @@ */ sectors_handled = (r1_bio->sector + max_sectors - - bio->bi_sector); + - bio->bi_iter.bi_sector); r1_bio->sectors = max_sectors; spin_lock_irq(&conf->device_lock); if (bio->bi_phys_segments == 0) @@ -1145,7 +1197,8 @@ r1_bio->sectors = bio_sectors(bio) - sectors_handled; r1_bio->state = 0; r1_bio->mddev = mddev; - r1_bio->sector = bio->bi_sector + sectors_handled; + r1_bio->sector = bio->bi_iter.bi_sector + + sectors_handled; goto read_again; } else generic_make_request(read_bio); @@ -1173,6 +1226,7 @@ disks = conf->raid_disks * 2; retry_write: + r1_bio->start_next_window = start_next_window; blocked_rdev = NULL; rcu_read_lock(); max_sectors = r1_bio->sectors; @@ -1184,8 +1238,7 @@ break; } r1_bio->bios[i] = NULL; - if (!rdev || test_bit(Faulty, &rdev->flags) - || test_bit(Unmerged, &rdev->flags)) { + if (!rdev || test_bit(Faulty, &rdev->flags)) { if (i < conf->raid_disks) set_bit(R1BIO_Degraded, &r1_bio->state); continue; @@ -1241,14 +1294,24 @@ if (unlikely(blocked_rdev)) { /* Wait for this device to become unblocked */ int j; + sector_t old = start_next_window; for (j = 0; j < i; j++) if (r1_bio->bios[j]) rdev_dec_pending(conf->mirrors[j].rdev, mddev); r1_bio->state = 0; - allow_barrier(conf); + allow_barrier(conf, start_next_window, bio->bi_iter.bi_sector); md_wait_for_blocked_rdev(blocked_rdev, mddev); - wait_barrier(conf); + start_next_window = wait_barrier(conf, bio); + /* + * We must make sure the multi r1bios of bio have + * the same value of bi_phys_segments + */ + if (bio->bi_phys_segments && old && + old != start_next_window) + /* Wait for the former r1bio(s) to complete */ + wait_event(conf->wait_barrier, + bio->bi_phys_segments == 1); goto retry_write; } @@ -1264,7 +1327,7 @@ bio->bi_phys_segments++; spin_unlock_irq(&conf->device_lock); } - sectors_handled = r1_bio->sector + max_sectors - bio->bi_sector; + sectors_handled = r1_bio->sector + max_sectors - bio->bi_iter.bi_sector; atomic_set(&r1_bio->remaining, 1); atomic_set(&r1_bio->behind_remaining, 0); @@ -1276,7 +1339,7 @@ continue; mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); - md_trim_bio(mbio, r1_bio->sector - bio->bi_sector, max_sectors); + bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector, max_sectors); if (first_clone) { /* do behind I/O ? @@ -1310,7 +1373,7 @@ r1_bio->bios[i] = mbio; - mbio->bi_sector = (r1_bio->sector + + mbio->bi_iter.bi_sector = (r1_bio->sector + conf->mirrors[i].rdev->data_offset); mbio->bi_bdev = conf->mirrors[i].rdev->bdev; mbio->bi_end_io = raid1_end_write_request; @@ -1350,7 +1413,7 @@ r1_bio->sectors = bio_sectors(bio) - sectors_handled; r1_bio->state = 0; r1_bio->mddev = mddev; - r1_bio->sector = bio->bi_sector + sectors_handled; + r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled; goto retry_write; } @@ -1377,7 +1440,6 @@ seq_printf(seq, "]"); } - static void error(struct mddev *mddev, struct md_rdev *rdev) { char b[BDEVNAME_SIZE]; @@ -1414,6 +1476,7 @@ */ set_bit(MD_RECOVERY_INTR, &mddev->recovery); set_bit(MD_CHANGE_DEVS, &mddev->flags); + set_bit(MD_CHANGE_PENDING, &mddev->flags); printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n" "md/raid1:%s: Operation continuing on %d devices.\n", @@ -1448,11 +1511,19 @@ static void close_sync(struct r1conf *conf) { - wait_barrier(conf); - allow_barrier(conf); + wait_barrier(conf, NULL); + allow_barrier(conf, 0, 0); mempool_destroy(conf->r1buf_pool); conf->r1buf_pool = NULL; + + spin_lock_irq(&conf->resync_lock); + conf->next_resync = MaxSector - 2 * NEXT_NORMALIO_DISTANCE; + conf->start_next_window = MaxSector; + conf->current_window_requests += + conf->next_window_requests; + conf->next_window_requests = 0; + spin_unlock_irq(&conf->resync_lock); } static int raid1_spare_active(struct mddev *mddev) @@ -1463,7 +1534,7 @@ unsigned long flags; /* - * Find all failed disks within the RAID1 configuration + * Find all failed disks within the RAID1 configuration * and mark them readable. * Called under mddev lock, so rcu protection not needed. * device_lock used to avoid races with raid1_end_read_request @@ -1474,6 +1545,7 @@ struct md_rdev *rdev = conf->mirrors[i].rdev; struct md_rdev *repl = conf->mirrors[conf->raid_disks + i].rdev; if (repl + && !test_bit(Candidate, &repl->flags) && repl->recovery_offset == MaxSector && !test_bit(Faulty, &repl->flags) && !test_and_set_bit(In_sync, &repl->flags)) { @@ -1506,7 +1578,6 @@ return count; } - static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) { struct r1conf *conf = mddev->private; @@ -1515,25 +1586,32 @@ struct raid1_info *p; int first = 0; int last = conf->raid_disks - 1; - struct request_queue *q = bdev_get_queue(rdev->bdev); if (mddev->recovery_disabled == conf->recovery_disabled) return -EBUSY; + if (md_integrity_add_rdev(rdev, mddev)) + return -ENXIO; + if (rdev->raid_disk >= 0) first = last = rdev->raid_disk; - if (q->merge_bvec_fn) { - set_bit(Unmerged, &rdev->flags); - mddev->merge_check_needed = 1; - } + /* + * find the disk ... but prefer rdev->saved_raid_disk + * if possible. + */ + if (rdev->saved_raid_disk >= 0 && + rdev->saved_raid_disk >= first && + conf->mirrors[rdev->saved_raid_disk].rdev == NULL) + first = last = rdev->saved_raid_disk; for (mirror = first; mirror <= last; mirror++) { p = conf->mirrors+mirror; if (!p->rdev) { - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); + if (mddev->gendisk) + disk_stack_limits(mddev->gendisk, rdev->bdev, + rdev->data_offset << 9); p->head_position = 0; rdev->raid_disk = mirror; @@ -1558,21 +1636,7 @@ break; } } - if (err == 0 && test_bit(Unmerged, &rdev->flags)) { - /* Some requests might not have seen this new - * merge_bvec_fn. We must wait for them to complete - * before merging the device fully. - * First we make sure any code which has tested - * our function has submitted the request, then - * we wait for all outstanding requests to complete. - */ - synchronize_sched(); - freeze_array(conf, 0); - unfreeze_array(conf); - clear_bit(Unmerged, &rdev->flags); - } - md_integrity_add_rdev(rdev, mddev); - if (blk_queue_discard(bdev_get_queue(rdev->bdev))) + if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev))) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); print_conf(conf); return err; @@ -1634,8 +1698,7 @@ return err; } - -static void end_sync_read(struct bio *bio, int error) +static void end_sync_read(struct bio *bio) { struct r1bio *r1_bio = bio->bi_private; @@ -1646,16 +1709,16 @@ * or re-read if the read failed. * We don't do much here, just schedule handling by raid1d */ - if (test_bit(BIO_UPTODATE, &bio->bi_flags)) + if (!bio->bi_error) set_bit(R1BIO_Uptodate, &r1_bio->state); if (atomic_dec_and_test(&r1_bio->remaining)) reschedule_retry(r1_bio); } -static void end_sync_write(struct bio *bio, int error) +static void end_sync_write(struct bio *bio) { - int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + int uptodate = !bio->bi_error; struct r1bio *r1_bio = bio->bi_private; struct mddev *mddev = r1_bio->mddev; struct r1conf *conf = mddev->private; @@ -1842,11 +1905,11 @@ idx ++; } set_bit(R1BIO_Uptodate, &r1_bio->state); - set_bit(BIO_UPTODATE, &bio->bi_flags); + bio->bi_error = 0; return 1; } -static int process_checks(struct r1bio *r1_bio) +static void process_checks(struct r1bio *r1_bio) { /* We have read all readable devices. If we haven't * got the block, then there is no hope left. @@ -1866,24 +1929,23 @@ for (i = 0; i < conf->raid_disks * 2; i++) { int j; int size; - int uptodate; + int error; struct bio *b = r1_bio->bios[i]; if (b->bi_end_io != end_sync_read) continue; - /* fixup the bio for reuse, but preserve BIO_UPTODATE */ - uptodate = test_bit(BIO_UPTODATE, &b->bi_flags); + /* fixup the bio for reuse, but preserve errno */ + error = b->bi_error; bio_reset(b); - if (!uptodate) - clear_bit(BIO_UPTODATE, &b->bi_flags); + b->bi_error = error; b->bi_vcnt = vcnt; - b->bi_size = r1_bio->sectors << 9; - b->bi_sector = r1_bio->sector + + b->bi_iter.bi_size = r1_bio->sectors << 9; + b->bi_iter.bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; b->bi_bdev = conf->mirrors[i].rdev->bdev; b->bi_end_io = end_sync_read; b->bi_private = r1_bio; - size = b->bi_size; + size = b->bi_iter.bi_size; for (j = 0; j < vcnt ; j++) { struct bio_vec *bi; bi = &b->bi_io_vec[j]; @@ -1897,7 +1959,7 @@ } for (primary = 0; primary < conf->raid_disks * 2; primary++) if (r1_bio->bios[primary]->bi_end_io == end_sync_read && - test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) { + !r1_bio->bios[primary]->bi_error) { r1_bio->bios[primary]->bi_end_io = NULL; rdev_dec_pending(conf->mirrors[primary].rdev, mddev); break; @@ -1907,14 +1969,14 @@ int j; struct bio *pbio = r1_bio->bios[primary]; struct bio *sbio = r1_bio->bios[i]; - int uptodate = test_bit(BIO_UPTODATE, &sbio->bi_flags); + int error = sbio->bi_error; if (sbio->bi_end_io != end_sync_read) continue; - /* Now we can 'fixup' the BIO_UPTODATE flag */ - set_bit(BIO_UPTODATE, &sbio->bi_flags); + /* Now we can 'fixup' the error value */ + sbio->bi_error = 0; - if (uptodate) { + if (!error) { for (j = vcnt; j-- ; ) { struct page *p, *s; p = pbio->bi_io_vec[j].bv_page; @@ -1929,7 +1991,7 @@ if (j >= 0) atomic64_add(r1_bio->sectors, &mddev->resync_mismatches); if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) - && uptodate)) { + && !error)) { /* No need to write to this device. */ sbio->bi_end_io = NULL; rdev_dec_pending(conf->mirrors[i].rdev, mddev); @@ -1938,7 +2000,6 @@ bio_copy_data(sbio, pbio); } - return 0; } static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) @@ -1956,8 +2017,8 @@ return; if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) - if (process_checks(r1_bio) < 0) - return; + process_checks(r1_bio); + /* * schedule writes */ @@ -2111,7 +2172,8 @@ if (rdev->badblocks.shift < 0) return 0; - block_sectors = 1 << rdev->badblocks.shift; + block_sectors = roundup(1 << rdev->badblocks.shift, + bdev_logical_block_size(rdev->bdev) >> 9); sector = r1_bio->sector; sectors = ((sector + block_sectors) & ~(sector_t)(block_sectors - 1)) @@ -2141,11 +2203,11 @@ } wbio->bi_rw = WRITE; - wbio->bi_sector = r1_bio->sector; - wbio->bi_size = r1_bio->sectors << 9; + wbio->bi_iter.bi_sector = r1_bio->sector; + wbio->bi_iter.bi_size = r1_bio->sectors << 9; - md_trim_bio(wbio, sector - r1_bio->sector, sectors); - wbio->bi_sector += rdev->data_offset; + bio_trim(wbio, sector - r1_bio->sector, sectors); + wbio->bi_iter.bi_sector += rdev->data_offset; wbio->bi_bdev = rdev->bdev; if (submit_bio_wait(WRITE, wbio) < 0) /* failure! */ @@ -2170,11 +2232,11 @@ struct bio *bio = r1_bio->bios[m]; if (bio->bi_end_io == NULL) continue; - if (test_bit(BIO_UPTODATE, &bio->bi_flags) && + if (!bio->bi_error && test_bit(R1BIO_MadeGood, &r1_bio->state)) { rdev_clear_badblocks(rdev, r1_bio->sector, s, 0); } - if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && + if (bio->bi_error && test_bit(R1BIO_WriteError, &r1_bio->state)) { if (!rdev_set_badblocks(rdev, r1_bio->sector, s, 0)) md_error(conf->mddev, rdev); @@ -2187,6 +2249,7 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio) { int m; + bool fail = false; for (m = 0; m < conf->raid_disks * 2 ; m++) if (r1_bio->bios[m] == IO_MADE_GOOD) { struct md_rdev *rdev = conf->mirrors[m].rdev; @@ -2199,6 +2262,7 @@ * narrow down and record precise write * errors. */ + fail = true; if (!narrow_write_error(r1_bio, m)) { md_error(conf->mddev, conf->mirrors[m].rdev); @@ -2208,9 +2272,17 @@ rdev_dec_pending(conf->mirrors[m].rdev, conf->mddev); } - if (test_bit(R1BIO_WriteError, &r1_bio->state)) - close_write(r1_bio); - raid_end_bio_io(r1_bio); + if (fail) { + spin_lock_irq(&conf->device_lock); + list_add(&r1_bio->retry_list, &conf->bio_end_io_list); + conf->nr_queued++; + spin_unlock_irq(&conf->device_lock); + md_wakeup_thread(conf->mddev->thread); + } else { + if (test_bit(R1BIO_WriteError, &r1_bio->state)) + close_write(r1_bio); + raid_end_bio_io(r1_bio); + } } static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio) @@ -2259,7 +2331,8 @@ } r1_bio->read_disk = disk; bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); - md_trim_bio(bio, r1_bio->sector - bio->bi_sector, max_sectors); + bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector, + max_sectors); r1_bio->bios[r1_bio->read_disk] = bio; rdev = conf->mirrors[disk].rdev; printk_ratelimited(KERN_ERR @@ -2268,7 +2341,7 @@ mdname(mddev), (unsigned long long)r1_bio->sector, bdevname(rdev->bdev, b)); - bio->bi_sector = r1_bio->sector + rdev->data_offset; + bio->bi_iter.bi_sector = r1_bio->sector + rdev->data_offset; bio->bi_bdev = rdev->bdev; bio->bi_end_io = raid1_end_read_request; bio->bi_rw = READ | do_sync; @@ -2277,7 +2350,7 @@ /* Drat - have to split this up more */ struct bio *mbio = r1_bio->master_bio; int sectors_handled = (r1_bio->sector + max_sectors - - mbio->bi_sector); + - mbio->bi_iter.bi_sector); r1_bio->sectors = max_sectors; spin_lock_irq(&conf->device_lock); if (mbio->bi_phys_segments == 0) @@ -2295,7 +2368,8 @@ r1_bio->state = 0; set_bit(R1BIO_ReadError, &r1_bio->state); r1_bio->mddev = mddev; - r1_bio->sector = mbio->bi_sector + sectors_handled; + r1_bio->sector = mbio->bi_iter.bi_sector + + sectors_handled; goto read_more; } else @@ -2314,6 +2388,29 @@ md_check_recovery(mddev); + if (!list_empty_careful(&conf->bio_end_io_list) && + !test_bit(MD_CHANGE_PENDING, &mddev->flags)) { + LIST_HEAD(tmp); + spin_lock_irqsave(&conf->device_lock, flags); + if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) { + while (!list_empty(&conf->bio_end_io_list)) { + list_move(conf->bio_end_io_list.prev, &tmp); + conf->nr_queued--; + } + } + spin_unlock_irqrestore(&conf->device_lock, flags); + while (!list_empty(&tmp)) { + r1_bio = list_first_entry(&tmp, struct r1bio, + retry_list); + list_del(&r1_bio->retry_list); + if (mddev->degraded) + set_bit(R1BIO_Degraded, &r1_bio->state); + if (test_bit(R1BIO_WriteError, &r1_bio->state)) + close_write(r1_bio); + raid_end_bio_io(r1_bio); + } + } + blk_start_plug(&plug); for (;;) { @@ -2355,7 +2452,6 @@ blk_finish_plug(&plug); } - static int init_resync(struct r1conf *conf) { int buffs; @@ -2380,7 +2476,7 @@ * that can be installed to exclude normal IO requests. */ -static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster) +static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped) { struct r1conf *conf = mddev->private; struct r1bio *r1_bio; @@ -2414,6 +2510,11 @@ bitmap_close_sync(mddev->bitmap); close_sync(conf); + + if (mddev_is_clustered(mddev)) { + conf->cluster_sync_low = 0; + conf->cluster_sync_high = 0; + } return 0; } @@ -2433,19 +2534,16 @@ *skipped = 1; return sync_blocks; } - /* - * If there is non-resync activity waiting for a turn, - * and resync is going fast enough, - * then let it though before starting on this new sync request. + + /* we are incrementing sector_nr below. To be safe, we check against + * sector_nr + two times RESYNC_SECTORS */ - if (!go_faster && conf->nr_waiting) - msleep_interruptible(1000); - bitmap_cond_end_sync(mddev->bitmap, sector_nr); + bitmap_cond_end_sync(mddev->bitmap, sector_nr, + mddev_is_clustered(mddev) && (sector_nr + 2 * RESYNC_SECTORS > conf->cluster_sync_high)); r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); - raise_barrier(conf); - conf->next_resync = sector_nr; + raise_barrier(conf, sector_nr); rcu_read_lock(); /* @@ -2519,7 +2617,7 @@ } if (bio->bi_end_io) { atomic_inc(&rdev->nr_pending); - bio->bi_sector = sector_nr + rdev->data_offset; + bio->bi_iter.bi_sector = sector_nr + rdev->data_offset; bio->bi_bdev = rdev->bdev; bio->bi_private = r1_bio; } @@ -2619,8 +2717,8 @@ continue; /* remove last page from this bio */ bio->bi_vcnt--; - bio->bi_size -= len; - bio->bi_flags &= ~(1<< BIO_SEG_VALID); + bio->bi_iter.bi_size -= len; + bio_clear_flag(bio, BIO_SEG_VALID); } goto bio_full; } @@ -2633,6 +2731,16 @@ bio_full: r1_bio->sectors = nr_sectors; + if (mddev_is_clustered(mddev) && + conf->cluster_sync_high < sector_nr + nr_sectors) { + conf->cluster_sync_low = mddev->curr_resync_completed; + conf->cluster_sync_high = conf->cluster_sync_low + CLUSTER_RESYNC_WINDOW_SECTORS; + /* Send resync message */ + md_cluster_ops->resync_info_update(mddev, + conf->cluster_sync_low, + conf->cluster_sync_high); + } + /* For a user-requested sync, we read all readable devices and do a * compare */ @@ -2715,8 +2823,6 @@ goto abort; disk->rdev = rdev; q = bdev_get_queue(rdev->bdev); - if (q->merge_bvec_fn) - mddev->merge_check_needed = 1; disk->head_position = 0; disk->seq_start = MaxSector; @@ -2724,6 +2830,7 @@ conf->raid_disks = mddev->raid_disks; conf->mddev = mddev; INIT_LIST_HEAD(&conf->retry_list); + INIT_LIST_HEAD(&conf->bio_end_io_list); spin_lock_init(&conf->resync_lock); init_waitqueue_head(&conf->wait_barrier); @@ -2732,6 +2839,9 @@ conf->pending_count = 0; conf->recovery_disabled = mddev->recovery_disabled - 1; + conf->start_next_window = MaxSector; + conf->current_window_requests = conf->next_window_requests = 0; + err = -EIO; for (i = 0; i < conf->raid_disks * 2; i++) { @@ -2774,8 +2884,7 @@ abort: if (conf) { - if (conf->r1bio_pool) - mempool_destroy(conf->r1bio_pool); + mempool_destroy(conf->r1bio_pool); kfree(conf->mirrors); safe_put_page(conf->tmppage); kfree(conf->poolinfo); @@ -2784,7 +2893,7 @@ return ERR_PTR(err); } -static int stop(struct mddev *mddev); +static void raid1_free(struct mddev *mddev, void *priv); static int run(struct mddev *mddev) { struct r1conf *conf; @@ -2806,7 +2915,7 @@ /* * copy the already verified devices into our private RAID1 * bookkeeping area. [whatever we allocate in run(), - * should be freed in stop()] + * should be freed in raid1_free()] */ if (mddev->private == NULL) conf = setup_conf(mddev); @@ -2842,9 +2951,9 @@ printk(KERN_NOTICE "md/raid1:%s: not clean" " -- starting background reconstruction\n", mdname(mddev)); - printk(KERN_INFO + printk(KERN_INFO "md/raid1:%s: active with %d out of %d mirrors\n", - mdname(mddev), mddev->raid_disks - mddev->degraded, + mdname(mddev), mddev->raid_disks - mddev->degraded, mddev->raid_disks); /* @@ -2857,10 +2966,6 @@ md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); if (mddev->queue) { - mddev->queue->backing_dev_info.congested_fn = raid1_congested; - mddev->queue->backing_dev_info.congested_data = mddev; - blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec); - if (discard_supported) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); @@ -2870,37 +2975,22 @@ } ret = md_integrity_register(mddev); - if (ret) - stop(mddev); + if (ret) { + md_unregister_thread(&mddev->thread); + raid1_free(mddev, conf); + } return ret; } -static int stop(struct mddev *mddev) +static void raid1_free(struct mddev *mddev, void *priv) { - struct r1conf *conf = mddev->private; - struct bitmap *bitmap = mddev->bitmap; + struct r1conf *conf = priv; - /* wait for behind writes to complete */ - if (bitmap && atomic_read(&bitmap->behind_writes) > 0) { - printk(KERN_INFO "md/raid1:%s: behind writes in progress - waiting to stop.\n", - mdname(mddev)); - /* need to kick something here to make sure I/O goes? */ - wait_event(bitmap->behind_wait, - atomic_read(&bitmap->behind_writes) == 0); - } - - raise_barrier(conf); - lower_barrier(conf); - - md_unregister_thread(&mddev->thread); - if (conf->r1bio_pool) - mempool_destroy(conf->r1bio_pool); + mempool_destroy(conf->r1bio_pool); kfree(conf->mirrors); safe_put_page(conf->tmppage); kfree(conf->poolinfo); kfree(conf); - mddev->private = NULL; - return 0; } static int raid1_resize(struct mddev *mddev, sector_t sectors) @@ -2965,9 +3055,11 @@ return -EINVAL; } - err = md_allow_write(mddev); - if (err) - return err; + if (!mddev_is_clustered(mddev)) { + err = md_allow_write(mddev); + if (err) + return err; + } raid_disks = mddev->raid_disks + mddev->delta_disks; @@ -3033,6 +3125,7 @@ unfreeze_array(conf); + set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_wakeup_thread(mddev->thread); @@ -3049,10 +3142,10 @@ wake_up(&conf->wait_barrier); break; case 1: - raise_barrier(conf); + freeze_array(conf, 0); break; case 0: - lower_barrier(conf); + unfreeze_array(conf); break; } } @@ -3069,7 +3162,8 @@ mddev->new_chunk_sectors = 0; conf = setup_conf(mddev); if (!IS_ERR(conf)) - conf->barrier = 1; + /* Array must appear to be quiesced */ + conf->array_frozen = 1; return conf; } return ERR_PTR(-EINVAL); @@ -3082,7 +3176,7 @@ .owner = THIS_MODULE, .make_request = make_request, .run = run, - .stop = stop, + .free = raid1_free, .status = status, .error_handler = error, .hot_add_disk = raid1_add_disk, @@ -3094,6 +3188,7 @@ .check_reshape = raid1_reshape, .quiesce = raid1_quiesce, .takeover = raid1_takeover, + .congested = raid1_congested, }; static int __init raid_init(void)