--- zzzz-none-000/linux-3.10.107/drivers/md/dm.c 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/drivers/md/dm.c 2021-02-04 17:41:59.000000000 +0000 @@ -19,6 +19,12 @@ #include #include #include +#include +#include +#include +#include /* for rq_end_sector() */ +#include +#include #include @@ -49,6 +55,13 @@ static DEFINE_IDR(_minor_idr); static DEFINE_SPINLOCK(_minor_lock); + +static void do_deferred_remove(struct work_struct *w); + +static DECLARE_WORK(deferred_remove_work, do_deferred_remove); + +static struct workqueue_struct *deferred_remove_workqueue; + /* * For bio-based dm. * One of these is allocated per bio. @@ -60,6 +73,7 @@ struct bio *bio; unsigned long start_time; spinlock_t endio_lock; + struct dm_stats_aux stats_aux; }; /* @@ -69,9 +83,13 @@ struct dm_rq_target_io { struct mapped_device *md; struct dm_target *ti; - struct request *orig, clone; + struct request *orig, *clone; + struct kthread_work work; int error; union map_info info; + struct dm_stats_aux stats_aux; + unsigned long duration_jiffies; + unsigned n_sectors; }; /* @@ -88,13 +106,6 @@ struct bio clone; }; -union map_info *dm_get_mapinfo(struct bio *bio) -{ - if (bio && bio->bi_private) - return &((struct dm_target_io *)bio->bi_private)->info; - return NULL; -} - union map_info *dm_get_rq_mapinfo(struct request *rq) { if (rq && rq->end_io_data) @@ -114,18 +125,36 @@ #define DMF_FREEING 3 #define DMF_DELETING 4 #define DMF_NOFLUSH_SUSPENDING 5 -#define DMF_MERGE_IS_OPTIONAL 6 +#define DMF_DEFERRED_REMOVE 6 +#define DMF_SUSPENDED_INTERNALLY 7 + +/* + * A dummy definition to make RCU happy. + * struct dm_table should never be dereferenced in this file. + */ +struct dm_table { + int undefined__; +}; /* * Work processed by per-device workqueue. */ struct mapped_device { - struct rw_semaphore io_lock; + struct srcu_struct io_barrier; struct mutex suspend_lock; - rwlock_t map_lock; atomic_t holders; atomic_t open_count; + /* + * The current mapping. 
+ * Use dm_get_live_table{_fast} or take suspend_lock for + * dereference. + */ + struct dm_table __rcu *map; + + struct list_head table_devices; + struct mutex table_devices_lock; + unsigned long flags; struct request_queue *queue; @@ -155,14 +184,10 @@ struct workqueue_struct *wq; /* - * The current mapping. - */ - struct dm_table *map; - - /* * io objects are allocated from here. */ mempool_t *io_pool; + mempool_t *rq_pool; struct bio_set *bs; @@ -189,19 +214,101 @@ /* zero-length flush that will be cloned and submitted to targets */ struct bio flush_bio; + + /* the number of internal suspends */ + unsigned internal_suspend_count; + + struct dm_stats stats; + + struct kthread_worker kworker; + struct task_struct *kworker_task; + + /* for request-based merge heuristic in dm_request_fn() */ + unsigned seq_rq_merge_deadline_usecs; + int last_rq_rw; + sector_t last_rq_pos; + ktime_t last_rq_start_time; + + /* for blk-mq request-based DM support */ + struct blk_mq_tag_set tag_set; + bool use_blk_mq; }; +#ifdef CONFIG_DM_MQ_DEFAULT +static bool use_blk_mq = true; +#else +static bool use_blk_mq = false; +#endif + +bool dm_use_blk_mq(struct mapped_device *md) +{ + return md->use_blk_mq; +} + /* * For mempools pre-allocation at the table loading time. */ struct dm_md_mempools { mempool_t *io_pool; + mempool_t *rq_pool; struct bio_set *bs; }; -#define MIN_IOS 256 +struct table_device { + struct list_head list; + atomic_t count; + struct dm_dev dm_dev; +}; + +#define RESERVED_BIO_BASED_IOS 16 +#define RESERVED_REQUEST_BASED_IOS 256 +#define RESERVED_MAX_IOS 1024 static struct kmem_cache *_io_cache; static struct kmem_cache *_rq_tio_cache; +static struct kmem_cache *_rq_cache; + +/* + * Bio-based DM's mempools' reserved IOs set by the user. + */ +static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS; + +/* + * Request-based DM's mempools' reserved IOs set by the user. 
+ */ +static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS; + +static unsigned __dm_get_module_param(unsigned *module_param, + unsigned def, unsigned max) +{ + unsigned param = ACCESS_ONCE(*module_param); + unsigned modified_param = 0; + + if (!param) + modified_param = def; + else if (param > max) + modified_param = max; + + if (modified_param) { + (void)cmpxchg(module_param, param, modified_param); + param = modified_param; + } + + return param; +} + +unsigned dm_get_reserved_bio_based_ios(void) +{ + return __dm_get_module_param(&reserved_bio_based_ios, + RESERVED_BIO_BASED_IOS, RESERVED_MAX_IOS); +} +EXPORT_SYMBOL_GPL(dm_get_reserved_bio_based_ios); + +unsigned dm_get_reserved_rq_based_ios(void) +{ + return __dm_get_module_param(&reserved_rq_based_ios, + RESERVED_REQUEST_BASED_IOS, RESERVED_MAX_IOS); +} +EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios); static int __init local_init(void) { @@ -216,22 +323,37 @@ if (!_rq_tio_cache) goto out_free_io_cache; + _rq_cache = kmem_cache_create("dm_clone_request", sizeof(struct request), + __alignof__(struct request), 0, NULL); + if (!_rq_cache) + goto out_free_rq_tio_cache; + r = dm_uevent_init(); if (r) - goto out_free_rq_tio_cache; + goto out_free_rq_cache; + + deferred_remove_workqueue = alloc_workqueue("kdmremove", WQ_UNBOUND, 1); + if (!deferred_remove_workqueue) { + r = -ENOMEM; + goto out_uevent_exit; + } _major = major; r = register_blkdev(_major, _name); if (r < 0) - goto out_uevent_exit; + goto out_free_workqueue; if (!_major) _major = r; return 0; +out_free_workqueue: + destroy_workqueue(deferred_remove_workqueue); out_uevent_exit: dm_uevent_exit(); +out_free_rq_cache: + kmem_cache_destroy(_rq_cache); out_free_rq_tio_cache: kmem_cache_destroy(_rq_tio_cache); out_free_io_cache: @@ -242,6 +364,10 @@ static void local_exit(void) { + flush_scheduled_work(); + destroy_workqueue(deferred_remove_workqueue); + + kmem_cache_destroy(_rq_cache); kmem_cache_destroy(_rq_tio_cache); 
kmem_cache_destroy(_io_cache); unregister_blkdev(_major, _name); @@ -260,6 +386,7 @@ dm_io_init, dm_kcopyd_init, dm_interface_init, + dm_statistics_init, }; static void (*_exits[])(void) = { @@ -270,6 +397,7 @@ dm_io_exit, dm_kcopyd_exit, dm_interface_exit, + dm_statistics_exit, }; static int __init dm_init(void) @@ -332,7 +460,6 @@ dm_get(md); atomic_inc(&md->open_count); - out: spin_unlock(&_minor_lock); @@ -341,13 +468,20 @@ static void dm_blk_close(struct gendisk *disk, fmode_t mode) { - struct mapped_device *md = disk->private_data; + struct mapped_device *md; spin_lock(&_minor_lock); - atomic_dec(&md->open_count); - dm_put(md); + md = disk->private_data; + if (WARN_ON(!md)) + goto out; + if (atomic_dec_and_test(&md->open_count) && + (test_bit(DMF_DEFERRED_REMOVE, &md->flags))) + queue_work(deferred_remove_workqueue, &deferred_remove_work); + + dm_put(md); +out: spin_unlock(&_minor_lock); } @@ -359,14 +493,18 @@ /* * Guarantees nothing is using the device before it's deleted. */ -int dm_lock_for_deletion(struct mapped_device *md) +int dm_lock_for_deletion(struct mapped_device *md, bool mark_deferred, bool only_deferred) { int r = 0; spin_lock(&_minor_lock); - if (dm_open_count(md)) + if (dm_open_count(md)) { r = -EBUSY; + if (mark_deferred) + set_bit(DMF_DEFERRED_REMOVE, &md->flags); + } else if (only_deferred && !test_bit(DMF_DEFERRED_REMOVE, &md->flags)) + r = -EEXIST; else set_bit(DMF_DELETING, &md->flags); @@ -375,6 +513,42 @@ return r; } +int dm_cancel_deferred_remove(struct mapped_device *md) +{ + int r = 0; + + spin_lock(&_minor_lock); + + if (test_bit(DMF_DELETING, &md->flags)) + r = -EBUSY; + else + clear_bit(DMF_DEFERRED_REMOVE, &md->flags); + + spin_unlock(&_minor_lock); + + return r; +} + +static void do_deferred_remove(struct work_struct *w) +{ + dm_deferred_remove(); +} + +sector_t dm_get_size(struct mapped_device *md) +{ + return get_capacity(md->disk); +} + +struct request_queue *dm_get_md_queue(struct mapped_device *md) +{ + return md->queue; 
+} + +struct dm_stats *dm_get_stats(struct mapped_device *md) +{ + return &md->stats; +} + static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) { struct mapped_device *md = bdev->bd_disk->private_data; @@ -382,16 +556,16 @@ return dm_get_geometry(md, geo); } -static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) +static int dm_get_live_table_for_ioctl(struct mapped_device *md, + struct dm_target **tgt, struct block_device **bdev, + fmode_t *mode, int *srcu_idx) { - struct mapped_device *md = bdev->bd_disk->private_data; struct dm_table *map; - struct dm_target *tgt; - int r = -ENOTTY; + int r; retry: - map = dm_get_live_table(md); + r = -ENOTTY; + map = dm_get_live_table(md, srcu_idx); if (!map || !dm_table_get_size(map)) goto out; @@ -399,24 +573,57 @@ if (dm_table_get_num_targets(map) != 1) goto out; - tgt = dm_table_get_target(map, 0); + *tgt = dm_table_get_target(map, 0); + + if (!(*tgt)->type->prepare_ioctl) + goto out; if (dm_suspended_md(md)) { r = -EAGAIN; goto out; } - if (tgt->type->ioctl) - r = tgt->type->ioctl(tgt, cmd, arg); + r = (*tgt)->type->prepare_ioctl(*tgt, bdev, mode); + if (r < 0) + goto out; -out: - dm_table_put(map); + return r; - if (r == -ENOTCONN) { +out: + dm_put_live_table(md, *srcu_idx); + if (r == -ENOTCONN && !fatal_signal_pending(current)) { msleep(10); goto retry; } + return r; +} + +static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct mapped_device *md = bdev->bd_disk->private_data; + struct dm_target *tgt; + struct block_device *tgt_bdev = NULL; + int srcu_idx, r; + + r = dm_get_live_table_for_ioctl(md, &tgt, &tgt_bdev, &mode, &srcu_idx); + if (r < 0) + return r; + + if (r > 0) { + /* + * Target determined this ioctl is being issued against + * a logical partition of the parent bdev; so extra + * validation is needed. 
+ */ + r = scsi_verify_blk_ioctl(NULL, cmd); + if (r) + goto out; + } + r = __blkdev_driver_ioctl(tgt_bdev, mode, cmd, arg); +out: + dm_put_live_table(md, srcu_idx); return r; } @@ -446,6 +653,17 @@ mempool_free(tio, tio->md->io_pool); } +static struct request *alloc_clone_request(struct mapped_device *md, + gfp_t gfp_mask) +{ + return mempool_alloc(md->rq_pool, gfp_mask); +} + +static void free_clone_request(struct mapped_device *md, struct request *rq) +{ + mempool_free(rq, md->rq_pool); +} + static int md_in_flight(struct mapped_device *md) { return atomic_read(&md->pending[READ]) + @@ -455,8 +673,9 @@ static void start_io_acct(struct dm_io *io) { struct mapped_device *md = io->md; + struct bio *bio = io->bio; int cpu; - int rw = bio_data_dir(io->bio); + int rw = bio_data_dir(bio); io->start_time = jiffies; @@ -465,6 +684,10 @@ part_stat_unlock(); atomic_set(&dm_disk(md)->part0.in_flight[rw], atomic_inc_return(&md->pending[rw])); + + if (unlikely(dm_stats_used(&md->stats))) + dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector, + bio_sectors(bio), false, 0, &io->stats_aux); } static void end_io_acct(struct dm_io *io) @@ -472,13 +695,14 @@ struct mapped_device *md = io->md; struct bio *bio = io->bio; unsigned long duration = jiffies - io->start_time; - int pending, cpu; + int pending; int rw = bio_data_dir(bio); - cpu = part_stat_lock(); - part_round_stats(cpu, &dm_disk(md)->part0); - part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration); - part_stat_unlock(); + generic_end_io_acct(rw, &dm_disk(md)->part0, io->start_time); + + if (unlikely(dm_stats_used(&md->stats))) + dm_stats_account_io(&md->stats, bio->bi_rw, bio->bi_iter.bi_sector, + bio_sectors(bio), true, duration, &io->stats_aux); /* * After this is decremented the bio must not be touched if it is @@ -509,20 +733,153 @@ /* * Everyone (including functions in this file), should use this * function to access the md->map field, and make sure they call - * dm_table_put() when finished. 
+ * dm_put_live_table() when finished. */ -struct dm_table *dm_get_live_table(struct mapped_device *md) +struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx) __acquires(md->io_barrier) { - struct dm_table *t; - unsigned long flags; + *srcu_idx = srcu_read_lock(&md->io_barrier); - read_lock_irqsave(&md->map_lock, flags); - t = md->map; - if (t) - dm_table_get(t); - read_unlock_irqrestore(&md->map_lock, flags); + return srcu_dereference(md->map, &md->io_barrier); +} - return t; +void dm_put_live_table(struct mapped_device *md, int srcu_idx) __releases(md->io_barrier) +{ + srcu_read_unlock(&md->io_barrier, srcu_idx); +} + +void dm_sync_table(struct mapped_device *md) +{ + synchronize_srcu(&md->io_barrier); + synchronize_rcu_expedited(); +} + +/* + * A fast alternative to dm_get_live_table/dm_put_live_table. + * The caller must not block between these two functions. + */ +static struct dm_table *dm_get_live_table_fast(struct mapped_device *md) __acquires(RCU) +{ + rcu_read_lock(); + return rcu_dereference(md->map); +} + +static void dm_put_live_table_fast(struct mapped_device *md) __releases(RCU) +{ + rcu_read_unlock(); +} + +/* + * Open a table device so we can use it as a map destination. + */ +static int open_table_device(struct table_device *td, dev_t dev, + struct mapped_device *md) +{ + static char *_claim_ptr = "I belong to device-mapper"; + struct block_device *bdev; + + int r; + + BUG_ON(td->dm_dev.bdev); + + bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _claim_ptr); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + + r = bd_link_disk_holder(bdev, dm_disk(md)); + if (r) { + blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL); + return r; + } + + td->dm_dev.bdev = bdev; + return 0; +} + +/* + * Close a table device that we've been using. 
+ */ +static void close_table_device(struct table_device *td, struct mapped_device *md) +{ + if (!td->dm_dev.bdev) + return; + + bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md)); + blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL); + td->dm_dev.bdev = NULL; +} + +static struct table_device *find_table_device(struct list_head *l, dev_t dev, + fmode_t mode) { + struct table_device *td; + + list_for_each_entry(td, l, list) + if (td->dm_dev.bdev->bd_dev == dev && td->dm_dev.mode == mode) + return td; + + return NULL; +} + +int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, + struct dm_dev **result) { + int r; + struct table_device *td; + + mutex_lock(&md->table_devices_lock); + td = find_table_device(&md->table_devices, dev, mode); + if (!td) { + td = kmalloc(sizeof(*td), GFP_KERNEL); + if (!td) { + mutex_unlock(&md->table_devices_lock); + return -ENOMEM; + } + + td->dm_dev.mode = mode; + td->dm_dev.bdev = NULL; + + if ((r = open_table_device(td, dev, md))) { + mutex_unlock(&md->table_devices_lock); + kfree(td); + return r; + } + + format_dev_t(td->dm_dev.name, dev); + + atomic_set(&td->count, 0); + list_add(&td->list, &md->table_devices); + } + atomic_inc(&td->count); + mutex_unlock(&md->table_devices_lock); + + *result = &td->dm_dev; + return 0; +} +EXPORT_SYMBOL_GPL(dm_get_table_device); + +void dm_put_table_device(struct mapped_device *md, struct dm_dev *d) +{ + struct table_device *td = container_of(d, struct table_device, dm_dev); + + mutex_lock(&md->table_devices_lock); + if (atomic_dec_and_test(&td->count)) { + close_table_device(td, md); + list_del(&td->list); + kfree(td); + } + mutex_unlock(&md->table_devices_lock); +} +EXPORT_SYMBOL(dm_put_table_device); + +static void free_table_devices(struct list_head *devices) +{ + struct list_head *tmp, *next; + + list_for_each_safe(tmp, next, devices) { + struct table_device *td = list_entry(tmp, struct table_device, list); + + DMWARN("dm_destroy: %s still exists with %d references", 
+ td->dm_dev.name, atomic_read(&td->count)); + kfree(td); + } } /* @@ -607,7 +964,7 @@ if (io_error == DM_ENDIO_REQUEUE) return; - if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) { + if ((bio->bi_rw & REQ_FLUSH) && bio->bi_iter.bi_size) { /* * Preflush done for flush with data, reissue * without REQ_FLUSH. @@ -617,22 +974,29 @@ } else { /* done with normal IO or empty flush */ trace_block_bio_complete(md->queue, bio, io_error); - bio_endio(bio, io_error); + bio->bi_error = io_error; + bio_endio(bio); } } } -static void clone_endio(struct bio *bio, int error) +static void disable_write_same(struct mapped_device *md) { - int r = 0; - struct dm_target_io *tio = bio->bi_private; + struct queue_limits *limits = dm_get_queue_limits(md); + + /* device doesn't really support WRITE SAME, disable it */ + limits->max_write_same_sectors = 0; +} + +static void clone_endio(struct bio *bio) +{ + int error = bio->bi_error; + int r = error; + struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); struct dm_io *io = tio->io; struct mapped_device *md = tio->io->md; dm_endio_fn endio = tio->ti->type->end_io; - if (!bio_flagged(bio, BIO_UPTODATE) && !error) - error = -EIO; - if (endio) { r = endio(tio->ti, bio, error); if (r < 0 || r == DM_ENDIO_REQUEUE) @@ -650,6 +1014,10 @@ } } + if (unlikely(r == -EREMOTEIO && (bio->bi_rw & REQ_WRITE_SAME) && + !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors)) + disable_write_same(md); + free_tio(md, tio); dec_pending(io, error); } @@ -657,12 +1025,14 @@ /* * Partial completion handling for request-based dm */ -static void end_clone_bio(struct bio *clone, int error) +static void end_clone_bio(struct bio *clone) { - struct dm_rq_clone_bio_info *info = clone->bi_private; + struct dm_rq_clone_bio_info *info = + container_of(clone, struct dm_rq_clone_bio_info, clone); struct dm_rq_target_io *tio = info->tio; struct bio *bio = info->orig; - unsigned int nr_bytes = info->orig->bi_size; + unsigned int nr_bytes = 
info->orig->bi_iter.bi_size; + int error = clone->bi_error; bio_put(clone); @@ -704,12 +1074,28 @@ blk_update_request(tio->orig, 0, nr_bytes); } +static struct dm_rq_target_io *tio_from_request(struct request *rq) +{ + return (rq->q->mq_ops ? blk_mq_rq_to_pdu(rq) : rq->special); +} + +static void rq_end_stats(struct mapped_device *md, struct request *orig) +{ + if (unlikely(dm_stats_used(&md->stats))) { + struct dm_rq_target_io *tio = tio_from_request(orig); + tio->duration_jiffies = jiffies - tio->duration_jiffies; + dm_stats_account_io(&md->stats, orig->cmd_flags, blk_rq_pos(orig), + tio->n_sectors, true, tio->duration_jiffies, + &tio->stats_aux); + } +} + /* * Don't touch any member of the md after calling this function because * the md may be freed in dm_put() at the end of this function. * Or do dm_get() before calling this function and dm_put() later. */ -static void rq_completed(struct mapped_device *md, int rw, int run_queue) +static void rq_completed(struct mapped_device *md, int rw, bool run_queue) { atomic_dec(&md->pending[rw]); @@ -723,7 +1109,7 @@ * back into ->request_fn() could deadlock attempting to grab the * queue lock again. 
*/ - if (run_queue) + if (!md->queue->mq_ops && run_queue) blk_run_queue_async(md->queue); /* @@ -735,16 +1121,32 @@ static void free_rq_clone(struct request *clone) { struct dm_rq_target_io *tio = clone->end_io_data; + struct mapped_device *md = tio->md; blk_rq_unprep_clone(clone); - free_rq_tio(tio); + + if (md->type == DM_TYPE_MQ_REQUEST_BASED) + /* stacked on blk-mq queue(s) */ + tio->ti->type->release_clone_rq(clone); + else if (!md->queue->mq_ops) + /* request_fn queue stacked on request_fn queue(s) */ + free_clone_request(md, clone); + /* + * NOTE: for the blk-mq queue stacked on request_fn queue(s) case: + * no need to call free_clone_request() because we leverage blk-mq by + * allocating the clone at the end of the blk-mq pdu (see: clone_rq) + */ + + if (!md->queue->mq_ops) + free_rq_tio(tio); } /* * Complete the clone and the original request. - * Must be called without queue lock. + * Must be called without clone's queue lock held, + * see end_clone_request() for more details. */ -static void dm_end_request(struct request *clone, int error) +void dm_end_request(struct request *clone, int error) { int rw = rq_data_dir(clone); struct dm_rq_target_io *tio = clone->end_io_data; @@ -765,71 +1167,100 @@ } free_rq_clone(clone); - blk_end_request_all(rq, error); + rq_end_stats(md, rq); + if (!rq->q->mq_ops) + blk_end_request_all(rq, error); + else + blk_mq_end_request(rq, error); rq_completed(md, rw, true); } static void dm_unprep_request(struct request *rq) { - struct request *clone = rq->special; + struct dm_rq_target_io *tio = tio_from_request(rq); + struct request *clone = tio->clone; - rq->special = NULL; - rq->cmd_flags &= ~REQ_DONTPREP; + if (!rq->q->mq_ops) { + rq->special = NULL; + rq->cmd_flags &= ~REQ_DONTPREP; + } - free_rq_clone(clone); + if (clone) + free_rq_clone(clone); + else if (!tio->md->queue->mq_ops) + free_rq_tio(tio); } /* * Requeue the original request of a clone. 
*/ -void dm_requeue_unmapped_request(struct request *clone) +static void old_requeue_request(struct request *rq) { - int rw = rq_data_dir(clone); - struct dm_rq_target_io *tio = clone->end_io_data; - struct mapped_device *md = tio->md; - struct request *rq = tio->orig; struct request_queue *q = rq->q; unsigned long flags; - dm_unprep_request(rq); - spin_lock_irqsave(q->queue_lock, flags); blk_requeue_request(q, rq); + blk_run_queue_async(q); spin_unlock_irqrestore(q->queue_lock, flags); - - rq_completed(md, rw, 0); } -EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); -static void __stop_queue(struct request_queue *q) +static void dm_requeue_original_request(struct mapped_device *md, + struct request *rq) { - blk_stop_queue(q); + int rw = rq_data_dir(rq); + + rq_end_stats(md, rq); + dm_unprep_request(rq); + + if (!rq->q->mq_ops) + old_requeue_request(rq); + else { + blk_mq_requeue_request(rq); + blk_mq_kick_requeue_list(rq->q); + } + + rq_completed(md, rw, false); } -static void stop_queue(struct request_queue *q) +static void old_stop_queue(struct request_queue *q) { unsigned long flags; + if (blk_queue_stopped(q)) + return; + spin_lock_irqsave(q->queue_lock, flags); - __stop_queue(q); + blk_stop_queue(q); spin_unlock_irqrestore(q->queue_lock, flags); } -static void __start_queue(struct request_queue *q) +static void stop_queue(struct request_queue *q) { - if (blk_queue_stopped(q)) - blk_start_queue(q); + if (!q->mq_ops) + old_stop_queue(q); + else + blk_mq_stop_hw_queues(q); } -static void start_queue(struct request_queue *q) +static void old_start_queue(struct request_queue *q) { unsigned long flags; spin_lock_irqsave(q->queue_lock, flags); - __start_queue(q); + if (blk_queue_stopped(q)) + blk_start_queue(q); spin_unlock_irqrestore(q->queue_lock, flags); } +static void start_queue(struct request_queue *q) +{ + if (!q->mq_ops) + old_start_queue(q); + else + blk_mq_start_stopped_hw_queues(q, true); +} + static void dm_done(struct request *clone, int error, bool 
mapped) { int r = error; @@ -843,6 +1274,10 @@ r = rq_end_io(tio->ti, clone, error, &tio->info); } + if (unlikely(r == -EREMOTEIO && (clone->cmd_flags & REQ_WRITE_SAME) && + !clone->q->limits.max_write_same_sectors)) + disable_write_same(tio->md); + if (r <= 0) /* The target wants to complete the I/O */ dm_end_request(clone, r); @@ -851,7 +1286,7 @@ return; else if (r == DM_ENDIO_REQUEUE) /* The target wants to requeue the I/O */ - dm_requeue_unmapped_request(clone); + dm_requeue_original_request(tio->md, tio->orig); else { DMWARN("unimplemented target endio return value: %d", r); BUG(); @@ -864,8 +1299,23 @@ static void dm_softirq_done(struct request *rq) { bool mapped = true; - struct request *clone = rq->completion_data; - struct dm_rq_target_io *tio = clone->end_io_data; + struct dm_rq_target_io *tio = tio_from_request(rq); + struct request *clone = tio->clone; + int rw; + + if (!clone) { + rq_end_stats(tio->md, rq); + rw = rq_data_dir(rq); + if (!rq->q->mq_ops) { + blk_end_request_all(rq, tio->error); + rq_completed(tio->md, rw, false); + free_rq_tio(tio); + } else { + blk_mq_end_request(rq, tio->error); + rq_completed(tio->md, rw, false); + } + return; + } if (rq->cmd_flags & REQ_FAILED) mapped = false; @@ -877,54 +1327,55 @@ * Complete the clone and the original request with the error status * through softirq context. */ -static void dm_complete_request(struct request *clone, int error) +static void dm_complete_request(struct request *rq, int error) { - struct dm_rq_target_io *tio = clone->end_io_data; - struct request *rq = tio->orig; + struct dm_rq_target_io *tio = tio_from_request(rq); tio->error = error; - rq->completion_data = clone; - blk_complete_request(rq); + if (!rq->q->mq_ops) + blk_complete_request(rq); + else + blk_mq_complete_request(rq, error); } /* * Complete the not-mapped clone and the original request with the error status * through softirq context. * Target's rq_end_io() function isn't called. 
- * This may be used when the target's map_rq() function fails. + * This may be used when the target's map_rq() or clone_and_map_rq() functions fail. */ -void dm_kill_unmapped_request(struct request *clone, int error) +static void dm_kill_unmapped_request(struct request *rq, int error) { - struct dm_rq_target_io *tio = clone->end_io_data; - struct request *rq = tio->orig; - rq->cmd_flags |= REQ_FAILED; - dm_complete_request(clone, error); + dm_complete_request(rq, error); } -EXPORT_SYMBOL_GPL(dm_kill_unmapped_request); /* - * Called with the queue lock held + * Called with the clone's queue lock held (for non-blk-mq) */ static void end_clone_request(struct request *clone, int error) { - /* - * For just cleaning up the information of the queue in which - * the clone was dispatched. - * The clone is *NOT* freed actually here because it is alloced from - * dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags. - */ - __blk_put_request(clone->q, clone); + struct dm_rq_target_io *tio = clone->end_io_data; + + if (!clone->q->mq_ops) { + /* + * For just cleaning up the information of the queue in which + * the clone was dispatched. + * The clone is *NOT* freed actually here because it is alloced + * from dm own mempool (REQ_ALLOCED isn't set). + */ + __blk_put_request(clone->q, clone); + } /* * Actual request completion is done in a softirq context which doesn't - * hold the queue lock. Otherwise, deadlock could occur because: + * hold the clone's queue lock. Otherwise, deadlock could occur because: * - another request may be submitted by the upper level driver * of the stacking during the completion * - the submission which requires queue lock may be done - * against this queue + * against this clone's queue */ - dm_complete_request(clone, error); + dm_complete_request(tio->orig, error); } /* @@ -977,6 +1428,46 @@ EXPORT_SYMBOL_GPL(dm_set_target_max_io_len); /* + * A target may call dm_accept_partial_bio only from the map routine. 
It is + * allowed for all bio types except REQ_FLUSH. + * + * dm_accept_partial_bio informs the dm that the target only wants to process + * additional n_sectors sectors of the bio and the rest of the data should be + * sent in a next bio. + * + * A diagram that explains the arithmetics: + * +--------------------+---------------+-------+ + * | 1 | 2 | 3 | + * +--------------------+---------------+-------+ + * + * <-------------- *tio->len_ptr ---------------> + * <------- bi_size -------> + * <-- n_sectors --> + * + * Region 1 was already iterated over with bio_advance or similar function. + * (it may be empty if the target doesn't use bio_advance) + * Region 2 is the remaining bio size that the target wants to process. + * (it may be empty if region 1 is non-empty, although there is no reason + * to make it empty) + * The target requires that region 3 is to be sent in the next bio. + * + * If the target wants to receive multiple copies of the bio (via num_*bios, etc), + * the partially processed part (the sum of regions 1+2) must be the same for all + * copies of the bio. + */ +void dm_accept_partial_bio(struct bio *bio, unsigned n_sectors) +{ + struct dm_target_io *tio = container_of(bio, struct dm_target_io, clone); + unsigned bi_size = bio->bi_iter.bi_size >> SECTOR_SHIFT; + BUG_ON(bio->bi_rw & REQ_FLUSH); + BUG_ON(bi_size > *tio->len_ptr); + BUG_ON(n_sectors > bi_size); + *tio->len_ptr -= bi_size - n_sectors; + bio->bi_iter.bi_size = n_sectors << SECTOR_SHIFT; +} +EXPORT_SYMBOL_GPL(dm_accept_partial_bio); + +/* * Flush current->bio_list when the target map method blocks. * This fixes deadlocks in snapshot and possibly in other targets. 
*/ @@ -990,26 +1481,29 @@ struct dm_offload *o = container_of(cb, struct dm_offload, cb); struct bio_list list; struct bio *bio; + int i; INIT_LIST_HEAD(&o->cb.list); if (unlikely(!current->bio_list)) return; - list = *current->bio_list; - bio_list_init(current->bio_list); - - while ((bio = bio_list_pop(&list))) { - struct bio_set *bs = bio->bi_pool; - if (unlikely(!bs) || bs == fs_bio_set) { - bio_list_add(current->bio_list, bio); - continue; + for (i = 0; i < 2; i++) { + list = current->bio_list[i]; + bio_list_init(&current->bio_list[i]); + + while ((bio = bio_list_pop(&list))) { + struct bio_set *bs = bio->bi_pool; + if (unlikely(!bs) || bs == fs_bio_set) { + bio_list_add(&current->bio_list[i], bio); + continue; + } + + spin_lock(&bs->rescue_lock); + bio_list_add(&bs->rescue_list, bio); + queue_work(bs->rescue_workqueue, &bs->rescue_work); + spin_unlock(&bs->rescue_lock); } - - spin_lock(&bs->rescue_lock); - bio_list_add(&bs->rescue_list, bio); - queue_work(bs->rescue_workqueue, &bs->rescue_work); - spin_unlock(&bs->rescue_lock); } } @@ -1036,7 +1530,6 @@ struct dm_target *ti = tio->ti; clone->bi_end_io = clone_endio; - clone->bi_private = tio; /* * Map the clone. If r == 0 we don't need to do @@ -1044,7 +1537,7 @@ * this io. 
*/ atomic_inc(&tio->io->io_count); - sector = clone->bi_sector; + sector = clone->bi_iter.bi_sector; dm_offload_start(&o); r = ti->type->map(ti, clone); @@ -1062,7 +1555,7 @@ md = tio->io->md; dec_pending(tio->io, r); free_tio(md, tio); - } else if (r) { + } else if (r != DM_MAPIO_SUBMITTED) { DMWARN("unimplemented target map return value: %d", r); BUG(); } @@ -1074,92 +1567,47 @@ struct bio *bio; struct dm_io *io; sector_t sector; - sector_t sector_count; - unsigned short idx; + unsigned sector_count; }; -static void bio_setup_sector(struct bio *bio, sector_t sector, sector_t len) -{ - bio->bi_sector = sector; - bio->bi_size = to_bytes(len); -} - -static void bio_setup_bv(struct bio *bio, unsigned short idx, unsigned short bv_count) +static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len) { - bio->bi_idx = idx; - bio->bi_vcnt = idx + bv_count; - bio->bi_flags &= ~(1 << BIO_SEG_VALID); -} - -static void clone_bio_integrity(struct bio *bio, struct bio *clone, - unsigned short idx, unsigned len, unsigned offset, - unsigned trim) -{ - if (!bio_integrity(bio)) - return; - - bio_integrity_clone(clone, bio, GFP_NOIO); - - if (trim) - bio_integrity_trim(clone, bio_sector_offset(bio, idx, offset), len); + bio->bi_iter.bi_sector = sector; + bio->bi_iter.bi_size = to_bytes(len); } /* - * Creates a little bio that just does part of a bvec. + * Creates a bio that consists of range of complete bvecs. 
*/ -static void clone_split_bio(struct dm_target_io *tio, struct bio *bio, - sector_t sector, unsigned short idx, - unsigned offset, unsigned len) +static void clone_bio(struct dm_target_io *tio, struct bio *bio, + sector_t sector, unsigned len) { struct bio *clone = &tio->clone; - struct bio_vec *bv = bio->bi_io_vec + idx; - - *clone->bi_io_vec = *bv; - - bio_setup_sector(clone, sector, len); - clone->bi_bdev = bio->bi_bdev; - clone->bi_rw = bio->bi_rw; - clone->bi_vcnt = 1; - clone->bi_io_vec->bv_offset = offset; - clone->bi_io_vec->bv_len = clone->bi_size; - clone->bi_flags |= 1 << BIO_CLONED; + __bio_clone_fast(clone, bio); - clone_bio_integrity(bio, clone, idx, len, offset, 1); -} + if (bio_integrity(bio)) + bio_integrity_clone(clone, bio, GFP_NOIO); -/* - * Creates a bio that consists of range of complete bvecs. - */ -static void clone_bio(struct dm_target_io *tio, struct bio *bio, - sector_t sector, unsigned short idx, - unsigned short bv_count, unsigned len) -{ - struct bio *clone = &tio->clone; - unsigned trim = 0; + bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector)); + clone->bi_iter.bi_size = to_bytes(len); - __bio_clone(clone, bio); - bio_setup_sector(clone, sector, len); - bio_setup_bv(clone, idx, bv_count); - - if (idx != bio->bi_idx || clone->bi_size < bio->bi_size) - trim = 1; - clone_bio_integrity(bio, clone, idx, len, 0, trim); + if (bio_integrity(bio)) + bio_integrity_trim(clone, 0, len); } static struct dm_target_io *alloc_tio(struct clone_info *ci, - struct dm_target *ti, int nr_iovecs, + struct dm_target *ti, unsigned target_bio_nr) { struct dm_target_io *tio; struct bio *clone; - clone = bio_alloc_bioset(GFP_NOIO, nr_iovecs, ci->md->bs); + clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs); tio = container_of(clone, struct dm_target_io, clone); tio->io = ci->io; tio->ti = ti; - memset(&tio->info, 0, sizeof(tio->info)); tio->target_bio_nr = target_bio_nr; return tio; @@ -1167,25 +1615,22 @@ static void 
__clone_and_map_simple_bio(struct clone_info *ci, struct dm_target *ti, - unsigned target_bio_nr, sector_t len) + unsigned target_bio_nr, unsigned *len) { - struct dm_target_io *tio = alloc_tio(ci, ti, ci->bio->bi_max_vecs, target_bio_nr); + struct dm_target_io *tio = alloc_tio(ci, ti, target_bio_nr); struct bio *clone = &tio->clone; - /* - * Discard requests require the bio's inline iovecs be initialized. - * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush - * and discard, so no need for concern about wasted bvec allocations. - */ - __bio_clone(clone, ci->bio); + tio->len_ptr = len; + + __bio_clone_fast(clone, ci->bio); if (len) - bio_setup_sector(clone, ci->sector, len); + bio_setup_sector(clone, ci->sector, *len); __map_bio(tio); } static void __send_duplicate_bios(struct clone_info *ci, struct dm_target *ti, - unsigned num_bios, sector_t len) + unsigned num_bios, unsigned *len) { unsigned target_bio_nr; @@ -1200,16 +1645,13 @@ BUG_ON(bio_has_data(ci->bio)); while ((ti = dm_table_get_target(ci->map, target_nr++))) - __send_duplicate_bios(ci, ti, ti->num_flush_bios, 0); + __send_duplicate_bios(ci, ti, ti->num_flush_bios, NULL); return 0; } static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, - sector_t sector, int nr_iovecs, - unsigned short idx, unsigned short bv_count, - unsigned offset, unsigned len, - unsigned split_bvec) + sector_t sector, unsigned *len) { struct bio *bio = ci->bio; struct dm_target_io *tio; @@ -1223,11 +1665,9 @@ num_target_bios = ti->num_write_bios(ti, bio); for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) { - tio = alloc_tio(ci, ti, nr_iovecs, target_bio_nr); - if (split_bvec) - clone_split_bio(tio, bio, sector, idx, offset, len); - else - clone_bio(tio, bio, sector, idx, bv_count, len); + tio = alloc_tio(ci, ti, target_bio_nr); + tio->len_ptr = len; + clone_bio(tio, bio, sector, *len); __map_bio(tio); } } @@ -1256,7 +1696,7 @@ is_split_required_fn is_split_required) { 
struct dm_target *ti; - sector_t len; + unsigned len; unsigned num_bios; do { @@ -1275,11 +1715,11 @@ return -EOPNOTSUPP; if (is_split_required && !is_split_required(ti)) - len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); + len = min((sector_t)ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); else - len = min(ci->sector_count, max_io_len(ci->sector, ti)); + len = min((sector_t)ci->sector_count, max_io_len(ci->sector, ti)); - __send_duplicate_bios(ci, ti, num_bios, len); + __send_duplicate_bios(ci, ti, num_bios, &len); ci->sector += len; } while (ci->sector_count -= len); @@ -1299,68 +1739,13 @@ } /* - * Find maximum number of sectors / bvecs we can process with a single bio. - */ -static sector_t __len_within_target(struct clone_info *ci, sector_t max, int *idx) -{ - struct bio *bio = ci->bio; - sector_t bv_len, total_len = 0; - - for (*idx = ci->idx; max && (*idx < bio->bi_vcnt); (*idx)++) { - bv_len = to_sector(bio->bi_io_vec[*idx].bv_len); - - if (bv_len > max) - break; - - max -= bv_len; - total_len += bv_len; - } - - return total_len; -} - -static int __split_bvec_across_targets(struct clone_info *ci, - struct dm_target *ti, sector_t max) -{ - struct bio *bio = ci->bio; - struct bio_vec *bv = bio->bi_io_vec + ci->idx; - sector_t remaining = to_sector(bv->bv_len); - unsigned offset = 0; - sector_t len; - - do { - if (offset) { - ti = dm_table_find_target(ci->map, ci->sector); - if (!dm_target_is_valid(ti)) - return -EIO; - - max = max_io_len(ci->sector, ti); - } - - len = min(remaining, max); - - __clone_and_map_data_bio(ci, ti, ci->sector, 1, ci->idx, 0, - bv->bv_offset + offset, len, 1); - - ci->sector += len; - ci->sector_count -= len; - offset += to_bytes(len); - } while (remaining -= len); - - ci->idx++; - - return 0; -} - -/* * Select the correct strategy for processing a non-flush bio. 
*/ static int __split_and_process_non_flush(struct clone_info *ci) { struct bio *bio = ci->bio; struct dm_target *ti; - sector_t len, max; - int idx; + unsigned len; if (unlikely(bio->bi_rw & REQ_DISCARD)) return __send_discard(ci); @@ -1371,57 +1756,31 @@ if (!dm_target_is_valid(ti)) return -EIO; - max = max_io_len(ci->sector, ti); + len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count); - /* - * Optimise for the simple case where we can do all of - * the remaining io with a single clone. - */ - if (ci->sector_count <= max) { - __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs, - ci->idx, bio->bi_vcnt - ci->idx, 0, - ci->sector_count, 0); - ci->sector_count = 0; - return 0; - } + __clone_and_map_data_bio(ci, ti, ci->sector, &len); - /* - * There are some bvecs that don't span targets. - * Do as many of these as possible. - */ - if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) { - len = __len_within_target(ci, max, &idx); + ci->sector += len; + ci->sector_count -= len; - __clone_and_map_data_bio(ci, ti, ci->sector, bio->bi_max_vecs, - ci->idx, idx - ci->idx, 0, len, 0); - - ci->sector += len; - ci->sector_count -= len; - ci->idx = idx; - - return 0; - } - - /* - * Handle a bvec that must be split between two or more targets. - */ - return __split_bvec_across_targets(ci, ti, max); + return 0; } /* * Entry point to split a bio into clones and submit them to the targets. 
*/ -static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) +static void __split_and_process_bio(struct mapped_device *md, + struct dm_table *map, struct bio *bio) { struct clone_info ci; int error = 0; - ci.map = dm_get_live_table(md); - if (unlikely(!ci.map)) { + if (unlikely(!map)) { bio_io_error(bio); return; } + ci.map = map; ci.md = md; ci.io = alloc_io(md); ci.io->error = 0; @@ -1429,8 +1788,7 @@ ci.io->bio = bio; ci.io->md = md; spin_lock_init(&ci.io->endio_lock); - ci.sector = bio->bi_sector; - ci.idx = bio->bi_idx; + ci.sector = bio->bi_iter.bi_sector; start_io_acct(ci.io); @@ -1448,130 +1806,60 @@ /* drop the extra reference count */ dec_pending(ci.io, error); - dm_table_put(ci.map); } /*----------------------------------------------------------------- * CRUD END *---------------------------------------------------------------*/ -static int dm_merge_bvec(struct request_queue *q, - struct bvec_merge_data *bvm, - struct bio_vec *biovec) -{ - struct mapped_device *md = q->queuedata; - struct dm_table *map = dm_get_live_table(md); - struct dm_target *ti; - sector_t max_sectors; - int max_size = 0; - - if (unlikely(!map)) - goto out; - - ti = dm_table_find_target(map, bvm->bi_sector); - if (!dm_target_is_valid(ti)) - goto out_table; - - /* - * Find maximum amount of I/O that won't need splitting - */ - max_sectors = min(max_io_len(bvm->bi_sector, ti), - (sector_t) BIO_MAX_SECTORS); - max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size; - if (max_size < 0) - max_size = 0; - - /* - * merge_bvec_fn() returns number of bytes - * it can accept at this offset - * max is precomputed maximal io size - */ - if (max_size && ti->type->merge) - max_size = ti->type->merge(ti, bvm, biovec, max_size); - /* - * If the target doesn't support merge method and some of the devices - * provided their merge_bvec method (we know this by looking at - * queue_max_hw_sectors), then we can't allow bios with multiple vector - * entries. 
So always set max_size to 0, and the code below allows - * just one page. - */ - else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) - - max_size = 0; - -out_table: - dm_table_put(map); - -out: - /* - * Always allow an entire first page - */ - if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT)) - max_size = biovec->bv_len; - - return max_size; -} - /* * The request function that just remaps the bio built up by * dm_merge_bvec. */ -static void _dm_request(struct request_queue *q, struct bio *bio) +static blk_qc_t dm_make_request(struct request_queue *q, struct bio *bio) { int rw = bio_data_dir(bio); struct mapped_device *md = q->queuedata; - int cpu; + int srcu_idx; + struct dm_table *map; - down_read(&md->io_lock); + map = dm_get_live_table(md, &srcu_idx); - cpu = part_stat_lock(); - part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]); - part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio)); - part_stat_unlock(); + generic_start_io_acct(rw, bio_sectors(bio), &dm_disk(md)->part0); /* if we're suspended, we have to queue this io for later */ if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { - up_read(&md->io_lock); + dm_put_live_table(md, srcu_idx); if (bio_rw(bio) != READA) queue_io(md, bio); else bio_io_error(bio); - return; + return BLK_QC_T_NONE; } - __split_and_process_bio(md, bio); - up_read(&md->io_lock); - return; + __split_and_process_bio(md, map, bio); + dm_put_live_table(md, srcu_idx); + return BLK_QC_T_NONE; } -static int dm_request_based(struct mapped_device *md) +int dm_request_based(struct mapped_device *md) { return blk_queue_stackable(md->queue); } -static void dm_request(struct request_queue *q, struct bio *bio) -{ - struct mapped_device *md = q->queuedata; - - if (dm_request_based(md)) - blk_queue_bio(q, bio); - else - _dm_request(q, bio); -} - -void dm_dispatch_request(struct request *rq) +static void dm_dispatch_clone_request(struct request *clone, struct request *rq) { int r; - if 
(blk_queue_io_stat(rq->q)) - rq->cmd_flags |= REQ_IO_STAT; + if (blk_queue_io_stat(clone->q)) + clone->cmd_flags |= REQ_IO_STAT; - rq->start_time = jiffies; - r = blk_insert_cloned_request(rq->q, rq); + clone->start_time = jiffies; + r = blk_insert_cloned_request(clone->q, clone); if (r) + /* must complete clone in terms of original request */ dm_complete_request(rq, r); } -EXPORT_SYMBOL_GPL(dm_dispatch_request); static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, void *data) @@ -1583,17 +1871,16 @@ info->orig = bio_orig; info->tio = tio; bio->bi_end_io = end_clone_bio; - bio->bi_private = info; return 0; } static int setup_clone(struct request *clone, struct request *rq, - struct dm_rq_target_io *tio) + struct dm_rq_target_io *tio, gfp_t gfp_mask) { int r; - r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, + r = blk_rq_prep_clone(clone, rq, tio->md->bs, gfp_mask, dm_rq_bio_constructor, tio); if (r) return r; @@ -1601,37 +1888,81 @@ clone->cmd = rq->cmd; clone->cmd_len = rq->cmd_len; clone->sense = rq->sense; - clone->buffer = rq->buffer; clone->end_io = end_clone_request; clone->end_io_data = tio; + tio->clone = clone; + return 0; } static struct request *clone_rq(struct request *rq, struct mapped_device *md, - gfp_t gfp_mask) + struct dm_rq_target_io *tio, gfp_t gfp_mask) { + /* + * Do not allocate a clone if tio->clone was already set + * (see: dm_mq_queue_rq). 
+ */ + bool alloc_clone = !tio->clone; struct request *clone; - struct dm_rq_target_io *tio; - tio = alloc_rq_tio(md, gfp_mask); - if (!tio) + if (alloc_clone) { + clone = alloc_clone_request(md, gfp_mask); + if (!clone) + return NULL; + } else + clone = tio->clone; + + blk_rq_init(NULL, clone); + if (setup_clone(clone, rq, tio, gfp_mask)) { + /* -ENOMEM */ + if (alloc_clone) + free_clone_request(md, clone); return NULL; + } + return clone; +} + +static void map_tio_request(struct kthread_work *work); + +static void init_tio(struct dm_rq_target_io *tio, struct request *rq, + struct mapped_device *md) +{ tio->md = md; tio->ti = NULL; + tio->clone = NULL; tio->orig = rq; tio->error = 0; memset(&tio->info, 0, sizeof(tio->info)); + if (md->kworker_task) + init_kthread_work(&tio->work, map_tio_request); +} - clone = &tio->clone; - if (setup_clone(clone, rq, tio)) { - /* -ENOMEM */ - free_rq_tio(tio); +static struct dm_rq_target_io *prep_tio(struct request *rq, + struct mapped_device *md, gfp_t gfp_mask) +{ + struct dm_rq_target_io *tio; + int srcu_idx; + struct dm_table *table; + + tio = alloc_rq_tio(md, gfp_mask); + if (!tio) return NULL; + + init_tio(tio, rq, md); + + table = dm_get_live_table(md, &srcu_idx); + if (!dm_table_mq_request_based(table)) { + if (!clone_rq(rq, md, tio, gfp_mask)) { + dm_put_live_table(md, srcu_idx); + free_rq_tio(tio); + return NULL; + } } + dm_put_live_table(md, srcu_idx); - return clone; + return tio; } /* @@ -1640,18 +1971,18 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq) { struct mapped_device *md = q->queuedata; - struct request *clone; + struct dm_rq_target_io *tio; if (unlikely(rq->special)) { DMWARN("Already has something in rq->special."); return BLKPREP_KILL; } - clone = clone_rq(rq, md, GFP_ATOMIC); - if (!clone) + tio = prep_tio(rq, md, GFP_ATOMIC); + if (!tio) return BLKPREP_DEFER; - rq->special = clone; + rq->special = tio; rq->cmd_flags |= REQ_DONTPREP; return BLKPREP_OK; @@ -1659,17 +1990,36 @@ /* * 
Returns: - * 0 : the request has been processed (not requeued) - * !0 : the request has been requeued + * 0 : the request has been processed + * DM_MAPIO_REQUEUE : the original request needs to be requeued + * < 0 : the request was completed due to failure */ -static int map_request(struct dm_target *ti, struct request *clone, +static int map_request(struct dm_rq_target_io *tio, struct request *rq, struct mapped_device *md) { - int r, requeued = 0; - struct dm_rq_target_io *tio = clone->end_io_data; + int r; + struct dm_target *ti = tio->ti; + struct request *clone = NULL; + + if (tio->clone) { + clone = tio->clone; + r = ti->type->map_rq(ti, clone, &tio->info); + } else { + r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone); + if (r < 0) { + /* The target wants to complete the I/O */ + dm_kill_unmapped_request(rq, r); + return r; + } + if (r != DM_MAPIO_REMAPPED) + return r; + if (setup_clone(clone, rq, tio, GFP_ATOMIC)) { + /* -ENOMEM */ + ti->type->release_clone_rq(clone); + return DM_MAPIO_REQUEUE; + } + } - tio->ti = ti; - r = ti->type->map_rq(ti, clone, &tio->info); switch (r) { case DM_MAPIO_SUBMITTED: /* The target has taken the I/O to submit by itself later */ @@ -1677,13 +2027,12 @@ case DM_MAPIO_REMAPPED: /* The target has remapped the I/O so dispatch it */ trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), - blk_rq_pos(tio->orig)); - dm_dispatch_request(clone); + blk_rq_pos(rq)); + dm_dispatch_clone_request(clone, rq); break; case DM_MAPIO_REQUEUE: /* The target wants to requeue the I/O */ - dm_requeue_unmapped_request(clone); - requeued = 1; + dm_requeue_original_request(md, tio->orig); break; default: if (r > 0) { @@ -1692,20 +2041,44 @@ } /* The target wants to complete the I/O */ - dm_kill_unmapped_request(clone, r); - break; + dm_kill_unmapped_request(rq, r); + return r; } - return requeued; + return 0; } -static struct request *dm_start_request(struct mapped_device *md, struct request *orig) +static void map_tio_request(struct 
kthread_work *work) { - struct request *clone; + struct dm_rq_target_io *tio = container_of(work, struct dm_rq_target_io, work); + struct request *rq = tio->orig; + struct mapped_device *md = tio->md; + + if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) + dm_requeue_original_request(md, rq); +} + +static void dm_start_request(struct mapped_device *md, struct request *orig) +{ + if (!orig->q->mq_ops) + blk_start_request(orig); + else + blk_mq_start_request(orig); + atomic_inc(&md->pending[rq_data_dir(orig)]); + + if (md->seq_rq_merge_deadline_usecs) { + md->last_rq_pos = rq_end_sector(orig); + md->last_rq_rw = rq_data_dir(orig); + md->last_rq_start_time = ktime_get(); + } - blk_start_request(orig); - clone = orig->special; - atomic_inc(&md->pending[rq_data_dir(clone)]); + if (unlikely(dm_stats_used(&md->stats))) { + struct dm_rq_target_io *tio = tio_from_request(orig); + tio->duration_jiffies = jiffies; + tio->n_sectors = blk_rq_sectors(orig); + dm_stats_account_io(&md->stats, orig->cmd_flags, blk_rq_pos(orig), + tio->n_sectors, false, 0, &tio->stats_aux); + } /* * Hold the md reference here for the in-flight I/O. @@ -1715,8 +2088,45 @@ * See the comment in rq_completed() too. 
*/ dm_get(md); +} - return clone; +#define MAX_SEQ_RQ_MERGE_DEADLINE_USECS 100000 + +ssize_t dm_attr_rq_based_seq_io_merge_deadline_show(struct mapped_device *md, char *buf) +{ + return sprintf(buf, "%u\n", md->seq_rq_merge_deadline_usecs); +} + +ssize_t dm_attr_rq_based_seq_io_merge_deadline_store(struct mapped_device *md, + const char *buf, size_t count) +{ + unsigned deadline; + + if (!dm_request_based(md) || md->use_blk_mq) + return count; + + if (kstrtouint(buf, 10, &deadline)) + return -EINVAL; + + if (deadline > MAX_SEQ_RQ_MERGE_DEADLINE_USECS) + deadline = MAX_SEQ_RQ_MERGE_DEADLINE_USECS; + + md->seq_rq_merge_deadline_usecs = deadline; + + return count; +} + +static bool dm_request_peeked_before_merge_deadline(struct mapped_device *md) +{ + ktime_t kt_deadline; + + if (!md->seq_rq_merge_deadline_usecs) + return false; + + kt_deadline = ns_to_ktime((u64)md->seq_rq_merge_deadline_usecs * NSEC_PER_USEC); + kt_deadline = ktime_add_safe(md->last_rq_start_time, kt_deadline); + + return !ktime_after(ktime_get(), kt_deadline); } /* @@ -1726,9 +2136,11 @@ static void dm_request_fn(struct request_queue *q) { struct mapped_device *md = q->queuedata; - struct dm_table *map = dm_get_live_table(md); + int srcu_idx; + struct dm_table *map = dm_get_live_table(md, &srcu_idx); struct dm_target *ti; - struct request *rq, *clone; + struct request *rq; + struct dm_rq_target_io *tio; sector_t pos; /* @@ -1740,7 +2152,7 @@ while (!blk_queue_stopped(q)) { rq = blk_peek_request(q); if (!rq) - goto delay_and_out; + goto out; /* always use block 0 to find the target for flushes for now */ pos = 0; @@ -1750,60 +2162,38 @@ ti = dm_table_find_target(map, pos); if (!dm_target_is_valid(ti)) { /* - * Must perform setup, that dm_done() requires, + * Must perform setup, that rq_completed() requires, * before calling dm_kill_unmapped_request */ DMERR_LIMIT("request attempted access beyond the end of device"); - clone = dm_start_request(md, rq); - dm_kill_unmapped_request(clone, -EIO); + 
dm_start_request(md, rq); + dm_kill_unmapped_request(rq, -EIO); continue; } - if (ti->type->busy && ti->type->busy(ti)) + if (dm_request_peeked_before_merge_deadline(md) && + md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 && + md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq)) goto delay_and_out; - clone = dm_start_request(md, rq); + if (ti->type->busy && ti->type->busy(ti)) + goto delay_and_out; - spin_unlock(q->queue_lock); - if (map_request(ti, clone, md)) - goto requeued; + dm_start_request(md, rq); + tio = tio_from_request(rq); + /* Establish tio->ti before queuing work (map_tio_request) */ + tio->ti = ti; + queue_kthread_work(&md->kworker, &tio->work); BUG_ON(!irqs_disabled()); - spin_lock(q->queue_lock); } goto out; -requeued: - BUG_ON(!irqs_disabled()); - spin_lock(q->queue_lock); - delay_and_out: - blk_delay_queue(q, HZ / 10); + blk_delay_queue(q, HZ / 100); out: - dm_table_put(map); -} - -int dm_underlying_device_busy(struct request_queue *q) -{ - return blk_lld_busy(q); -} -EXPORT_SYMBOL_GPL(dm_underlying_device_busy); - -static int dm_lld_busy(struct request_queue *q) -{ - int r; - struct mapped_device *md = q->queuedata; - struct dm_table *map = dm_get_live_table(md); - - if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) - r = 1; - else - r = dm_table_any_busy_target(map); - - dm_table_put(map); - - return r; + dm_put_live_table(md, srcu_idx); } static int dm_any_congested(void *congested_data, int bdi_bits) @@ -1813,20 +2203,19 @@ struct dm_table *map; if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { - map = dm_get_live_table(md); + map = dm_get_live_table_fast(md); if (map) { /* * Request-based dm cares about only own queue for * the query about congestion status of request_queue */ if (dm_request_based(md)) - r = md->queue->backing_dev_info.state & + r = md->queue->backing_dev_info.wb.state & bdi_bits; else r = dm_table_any_congested(map, bdi_bits); - - dm_table_put(map); } + dm_put_live_table_fast(md); } return r; @@ 
-1889,7 +2278,7 @@ { /* * Request-based dm devices cannot be stacked on top of bio-based dm - * devices. The type of this dm device has not been decided yet. + * devices. The type of this dm device may not have been decided yet. * The type is decided at the first table loading time. * To prevent problematic device stacking, clear the queue flag * for request stacking support until then. @@ -1898,12 +2287,54 @@ */ queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); + /* + * Initialize data that will only be used by a non-blk-mq DM queue + * - must do so here (in alloc_dev callchain) before queue is used + */ md->queue->queuedata = md; - md->queue->backing_dev_info.congested_fn = dm_any_congested; md->queue->backing_dev_info.congested_data = md; - blk_queue_make_request(md->queue, dm_request); +} + +static void dm_init_old_md_queue(struct mapped_device *md) +{ + md->use_blk_mq = false; + dm_init_md_queue(md); + + /* + * Initialize aspects of queue that aren't relevant for blk-mq + */ + md->queue->backing_dev_info.congested_fn = dm_any_congested; blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); - blk_queue_merge_bvec(md->queue, dm_merge_bvec); +} + +static void cleanup_mapped_device(struct mapped_device *md) +{ + if (md->wq) + destroy_workqueue(md->wq); + if (md->kworker_task) + kthread_stop(md->kworker_task); + mempool_destroy(md->io_pool); + mempool_destroy(md->rq_pool); + if (md->bs) + bioset_free(md->bs); + + if (md->disk) { + spin_lock(&_minor_lock); + md->disk->private_data = NULL; + spin_unlock(&_minor_lock); + del_gendisk(md->disk); + put_disk(md->disk); + } + + if (md->queue) + blk_cleanup_queue(md->queue); + + cleanup_srcu_struct(&md->io_barrier); + + if (md->bdev) { + bdput(md->bdev); + md->bdev = NULL; + } } /* @@ -1931,28 +2362,33 @@ if (r < 0) goto bad_minor; + r = init_srcu_struct(&md->io_barrier); + if (r < 0) + goto bad_io_barrier; + + md->use_blk_mq = use_blk_mq; md->type = DM_TYPE_NONE; - init_rwsem(&md->io_lock); 
mutex_init(&md->suspend_lock); mutex_init(&md->type_lock); + mutex_init(&md->table_devices_lock); spin_lock_init(&md->deferred_lock); - rwlock_init(&md->map_lock); atomic_set(&md->holders, 1); atomic_set(&md->open_count, 0); atomic_set(&md->event_nr, 0); atomic_set(&md->uevent_seq, 0); INIT_LIST_HEAD(&md->uevent_list); + INIT_LIST_HEAD(&md->table_devices); spin_lock_init(&md->uevent_lock); md->queue = blk_alloc_queue(GFP_KERNEL); if (!md->queue) - goto bad_queue; + goto bad; dm_init_md_queue(md); md->disk = alloc_disk(1); if (!md->disk) - goto bad_disk; + goto bad; atomic_set(&md->pending[0], 0); atomic_set(&md->pending[1], 0); @@ -1960,6 +2396,7 @@ INIT_WORK(&md->work, dm_wq_work); init_waitqueue_head(&md->eventq); init_completion(&md->kobj_holder.completion); + md->kworker_task = NULL; md->disk->major = _major; md->disk->first_minor = minor; @@ -1970,19 +2407,20 @@ add_disk(md->disk); format_dev_t(md->name, MKDEV(_major, minor)); - md->wq = alloc_workqueue("kdmflush", - WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); + md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0); if (!md->wq) - goto bad_thread; + goto bad; md->bdev = bdget_disk(md->disk, 0); if (!md->bdev) - goto bad_bdev; + goto bad; bio_init(&md->flush_bio); md->flush_bio.bi_bdev = md->bdev; md->flush_bio.bi_rw = WRITE_FLUSH; + dm_stats_init(&md->stats); + /* Populate the mapping, nobody knows we exist yet */ spin_lock(&_minor_lock); old_md = idr_replace(&_minor_idr, md, minor); @@ -1992,14 +2430,9 @@ return md; -bad_bdev: - destroy_workqueue(md->wq); -bad_thread: - del_gendisk(md->disk); - put_disk(md->disk); -bad_disk: - blk_cleanup_queue(md->queue); -bad_queue: +bad: + cleanup_mapped_device(md); +bad_io_barrier: free_minor(minor); bad_minor: module_put(THIS_MODULE); @@ -2015,22 +2448,15 @@ int minor = MINOR(disk_devt(md->disk)); unlock_fs(md); - bdput(md->bdev); - destroy_workqueue(md->wq); - if (md->io_pool) - mempool_destroy(md->io_pool); - if (md->bs) - bioset_free(md->bs); - 
blk_integrity_unregister(md->disk); - del_gendisk(md->disk); - free_minor(minor); - spin_lock(&_minor_lock); - md->disk->private_data = NULL; - spin_unlock(&_minor_lock); + cleanup_mapped_device(md); + if (md->use_blk_mq) + blk_mq_free_tag_set(&md->tag_set); + + free_table_devices(&md->table_devices); + dm_stats_cleanup(&md->stats); + free_minor(minor); - put_disk(md->disk); - blk_cleanup_queue(md->queue); module_put(THIS_MODULE); kfree(md); } @@ -2039,7 +2465,7 @@ { struct dm_md_mempools *p = dm_table_get_md_mempools(t); - if (md->io_pool && md->bs) { + if (md->bs) { /* The md already has necessary mempools. */ if (dm_table_get_type(t) == DM_TYPE_BIO_BASED) { /* @@ -2049,28 +2475,29 @@ bioset_free(md->bs); md->bs = p->bs; p->bs = NULL; - } else if (dm_table_get_type(t) == DM_TYPE_REQUEST_BASED) { - /* - * There's no need to reload with request-based dm - * because the size of front_pad doesn't change. - * Note for future: If you are to reload bioset, - * prep-ed requests in the queue may refer - * to bio from the old bioset, so you must walk - * through the queue to unprep. - */ } + /* + * There's no need to reload with request-based dm + * because the size of front_pad doesn't change. + * Note for future: If you are to reload bioset, + * prep-ed requests in the queue may refer + * to bio from the old bioset, so you must walk + * through the queue to unprep. + */ goto out; } - BUG_ON(!p || md->io_pool || md->bs); + BUG_ON(!p || md->io_pool || md->rq_pool || md->bs); md->io_pool = p->io_pool; p->io_pool = NULL; + md->rq_pool = p->rq_pool; + p->rq_pool = NULL; md->bs = p->bs; p->bs = NULL; out: - /* mempool bind completed, now no need any mempools in the table */ + /* mempool bind completed, no longer need any mempools in the table */ dm_table_free_md_mempools(t); } @@ -2104,59 +2531,6 @@ } /* - * Return 1 if the queue has a compulsory merge_bvec_fn function. 
- * - * If this function returns 0, then the device is either a non-dm - * device without a merge_bvec_fn, or it is a dm device that is - * able to split any bios it receives that are too big. - */ -int dm_queue_merge_is_compulsory(struct request_queue *q) -{ - struct mapped_device *dev_md; - - if (!q->merge_bvec_fn) - return 0; - - if (q->make_request_fn == dm_request) { - dev_md = q->queuedata; - if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags)) - return 0; - } - - return 1; -} - -static int dm_device_merge_is_compulsory(struct dm_target *ti, - struct dm_dev *dev, sector_t start, - sector_t len, void *data) -{ - struct block_device *bdev = dev->bdev; - struct request_queue *q = bdev_get_queue(bdev); - - return dm_queue_merge_is_compulsory(q); -} - -/* - * Return 1 if it is acceptable to ignore merge_bvec_fn based - * on the properties of the underlying devices. - */ -static int dm_table_merge_is_optional(struct dm_table *table) -{ - unsigned i = 0; - struct dm_target *ti; - - while (i < dm_table_get_num_targets(table)) { - ti = dm_table_get_target(table, i++); - - if (ti->type->iterate_devices && - ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL)) - return 0; - } - - return 1; -} - -/* * Returns old map, which caller must destroy. */ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, @@ -2165,15 +2539,13 @@ struct dm_table *old_map; struct request_queue *q = md->queue; sector_t size; - unsigned long flags; - int merge_is_optional; size = dm_table_get_size(t); /* * Wipe any geometry if the size of the table changed. */ - if (size != get_capacity(md->disk)) + if (size != dm_get_size(md)) memset(&md->geometry, 0, sizeof(md->geometry)); __set_size(md, size); @@ -2187,24 +2559,18 @@ * This must be done before setting the queue restrictions, * because request-based dm may be run just after the setting. 
*/ - if (dm_table_request_based(t) && !blk_queue_stopped(q)) + if (dm_table_request_based(t)) stop_queue(q); __bind_mempools(md, t); - merge_is_optional = dm_table_merge_is_optional(t); - - write_lock_irqsave(&md->map_lock, flags); - old_map = md->map; - md->map = t; + old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); + rcu_assign_pointer(md->map, t); md->immutable_target_type = dm_table_get_immutable_target_type(t); dm_table_set_restrictions(t, q, limits); - if (merge_is_optional) - set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); - else - clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); - write_unlock_irqrestore(&md->map_lock, flags); + if (old_map) + dm_sync_table(md); return old_map; } @@ -2214,16 +2580,14 @@ */ static struct dm_table *__unbind(struct mapped_device *md) { - struct dm_table *map = md->map; - unsigned long flags; + struct dm_table *map = rcu_dereference_protected(md->map, 1); if (!map) return NULL; dm_table_event_callback(map, NULL, NULL); - write_lock_irqsave(&md->map_lock, flags); - md->map = NULL; - write_unlock_irqrestore(&md->map_lock, flags); + RCU_INIT_POINTER(md->map, NULL); + dm_sync_table(md); return map; } @@ -2261,11 +2625,13 @@ void dm_set_md_type(struct mapped_device *md, unsigned type) { + BUG_ON(!mutex_is_locked(&md->type_lock)); md->type = type; } unsigned dm_get_md_type(struct mapped_device *md) { + BUG_ON(!mutex_is_locked(&md->type_lock)); return md->type; } @@ -2285,6 +2651,14 @@ } EXPORT_SYMBOL_GPL(dm_get_queue_limits); +static void init_rq_based_worker_thread(struct mapped_device *md) +{ + /* Initialize the request-based DM worker thread */ + init_kthread_worker(&md->kworker); + md->kworker_task = kthread_run(kthread_worker_fn, &md->kworker, + "kdmwork-%s", dm_device_name(md)); +} + /* * Fully initialize a request-based queue (->elevator, ->request_fn, etc). 
*/ @@ -2292,23 +2666,161 @@ { struct request_queue *q = NULL; - if (md->queue->elevator) - return 1; - /* Fully initialize the queue */ q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL); if (!q) - return 0; + return -EINVAL; + + /* disable dm_request_fn's merge heuristic by default */ + md->seq_rq_merge_deadline_usecs = 0; md->queue = q; - dm_init_md_queue(md); + dm_init_old_md_queue(md); blk_queue_softirq_done(md->queue, dm_softirq_done); blk_queue_prep_rq(md->queue, dm_prep_fn); - blk_queue_lld_busy(md->queue, dm_lld_busy); + + init_rq_based_worker_thread(md); elv_register_queue(md->queue); - return 1; + return 0; +} + +static int dm_mq_init_request(void *data, struct request *rq, + unsigned int hctx_idx, unsigned int request_idx, + unsigned int numa_node) +{ + struct mapped_device *md = data; + struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq); + + /* + * Must initialize md member of tio, otherwise it won't + * be available in dm_mq_queue_rq. + */ + tio->md = md; + + return 0; +} + +static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct request *rq = bd->rq; + struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq); + struct mapped_device *md = tio->md; + int srcu_idx; + struct dm_table *map = dm_get_live_table(md, &srcu_idx); + struct dm_target *ti; + sector_t pos; + + /* always use block 0 to find the target for flushes for now */ + pos = 0; + if (!(rq->cmd_flags & REQ_FLUSH)) + pos = blk_rq_pos(rq); + + ti = dm_table_find_target(map, pos); + if (!dm_target_is_valid(ti)) { + dm_put_live_table(md, srcu_idx); + DMERR_LIMIT("request attempted access beyond the end of device"); + /* + * Must perform setup, that rq_completed() requires, + * before returning BLK_MQ_RQ_QUEUE_ERROR + */ + dm_start_request(md, rq); + return BLK_MQ_RQ_QUEUE_ERROR; + } + dm_put_live_table(md, srcu_idx); + + if (ti->type->busy && ti->type->busy(ti)) + return BLK_MQ_RQ_QUEUE_BUSY; + + dm_start_request(md, rq); + + /* Init tio using md 
established in .init_request */ + init_tio(tio, rq, md); + + /* + * Establish tio->ti before queuing work (map_tio_request) + * or making direct call to map_request(). + */ + tio->ti = ti; + + /* Clone the request if underlying devices aren't blk-mq */ + if (dm_table_get_type(map) == DM_TYPE_REQUEST_BASED) { + /* clone request is allocated at the end of the pdu */ + tio->clone = (void *)blk_mq_rq_to_pdu(rq) + sizeof(struct dm_rq_target_io); + (void) clone_rq(rq, md, tio, GFP_ATOMIC); + queue_kthread_work(&md->kworker, &tio->work); + } else { + /* Direct call is fine since .queue_rq allows allocations */ + if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) { + /* Undo dm_start_request() before requeuing */ + rq_end_stats(md, rq); + rq_completed(md, rq_data_dir(rq), false); + return BLK_MQ_RQ_QUEUE_BUSY; + } + } + + return BLK_MQ_RQ_QUEUE_OK; +} + +static struct blk_mq_ops dm_mq_ops = { + .queue_rq = dm_mq_queue_rq, + .map_queue = blk_mq_map_queue, + .complete = dm_softirq_done, + .init_request = dm_mq_init_request, +}; + +static int dm_init_request_based_blk_mq_queue(struct mapped_device *md) +{ + unsigned md_type = dm_get_md_type(md); + struct request_queue *q; + int err; + + memset(&md->tag_set, 0, sizeof(md->tag_set)); + md->tag_set.ops = &dm_mq_ops; + md->tag_set.queue_depth = BLKDEV_MAX_RQ; + md->tag_set.numa_node = NUMA_NO_NODE; + md->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; + md->tag_set.nr_hw_queues = 1; + if (md_type == DM_TYPE_REQUEST_BASED) { + /* make the memory for non-blk-mq clone part of the pdu */ + md->tag_set.cmd_size = sizeof(struct dm_rq_target_io) + sizeof(struct request); + } else + md->tag_set.cmd_size = sizeof(struct dm_rq_target_io); + md->tag_set.driver_data = md; + + err = blk_mq_alloc_tag_set(&md->tag_set); + if (err) + return err; + + q = blk_mq_init_allocated_queue(&md->tag_set, md->queue); + if (IS_ERR(q)) { + err = PTR_ERR(q); + goto out_tag_set; + } + md->queue = q; + dm_init_md_queue(md); + + /* backfill 'mq' sysfs 
registration normally done in blk_register_queue */ + blk_mq_register_disk(md->disk); + + if (md_type == DM_TYPE_REQUEST_BASED) + init_rq_based_worker_thread(md); + + return 0; + +out_tag_set: + blk_mq_free_tag_set(&md->tag_set); + return err; +} + +static unsigned filter_md_type(unsigned type, struct mapped_device *md) +{ + if (type == DM_TYPE_BIO_BASED) + return type; + + return !md->use_blk_mq ? DM_TYPE_REQUEST_BASED : DM_TYPE_MQ_REQUEST_BASED; } /* @@ -2316,10 +2828,34 @@ */ int dm_setup_md_queue(struct mapped_device *md) { - if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) && - !dm_init_request_based_queue(md)) { - DMWARN("Cannot initialize queue for request-based mapped device"); - return -EINVAL; + int r; + unsigned md_type = filter_md_type(dm_get_md_type(md), md); + + switch (md_type) { + case DM_TYPE_REQUEST_BASED: + r = dm_init_request_based_queue(md); + if (r) { + DMWARN("Cannot initialize queue for request-based mapped device"); + return r; + } + break; + case DM_TYPE_MQ_REQUEST_BASED: + r = dm_init_request_based_blk_mq_queue(md); + if (r) { + DMWARN("Cannot initialize queue for request-based blk-mq mapped device"); + return r; + } + break; + case DM_TYPE_BIO_BASED: + dm_init_old_md_queue(md); + blk_queue_make_request(md->queue, dm_make_request); + /* + * DM handles splitting bios as needed. Free the bio_split bioset + * since it won't be used (saves 1 process per bio-based DM device). 
+ */ + bioset_free(md->queue->bio_split); + md->queue->bio_split = NULL; + break; } return 0; @@ -2370,6 +2906,19 @@ BUG_ON(test_bit(DMF_FREEING, &md->flags)); } +int dm_hold(struct mapped_device *md) +{ + spin_lock(&_minor_lock); + if (test_bit(DMF_FREEING, &md->flags)) { + spin_unlock(&_minor_lock); + return -EBUSY; + } + dm_get(md); + spin_unlock(&_minor_lock); + return 0; +} +EXPORT_SYMBOL_GPL(dm_hold); + const char *dm_device_name(struct mapped_device *md) { return md->name; @@ -2378,13 +2927,13 @@ static void __dm_destroy(struct mapped_device *md, bool wait) { - struct request_queue *q = md->queue; + struct request_queue *q = dm_get_md_queue(md); struct dm_table *map; + int srcu_idx; might_sleep(); spin_lock(&_minor_lock); - map = dm_get_live_table(md); idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md)))); set_bit(DMF_FREEING, &md->flags); spin_unlock(&_minor_lock); @@ -2393,15 +2942,21 @@ queue_flag_set(QUEUE_FLAG_DYING, q); spin_unlock_irq(q->queue_lock); + if (dm_request_based(md) && md->kworker_task) + flush_kthread_worker(&md->kworker); + /* * Take suspend_lock so that presuspend and postsuspend methods * do not race with internal suspend. 
*/ mutex_lock(&md->suspend_lock); + map = dm_get_live_table(md, &srcu_idx); if (!dm_suspended_md(md)) { dm_table_presuspend_targets(map); dm_table_postsuspend_targets(map); } + /* dm_put_live_table must be before msleep, otherwise deadlock is possible */ + dm_put_live_table(md, srcu_idx); mutex_unlock(&md->suspend_lock); /* @@ -2418,7 +2973,6 @@ dm_device_name(md), atomic_read(&md->holders)); dm_sysfs_exit(md); - dm_table_put(map); dm_table_destroy(__unbind(md)); free_dev(md); } @@ -2475,8 +3029,10 @@ struct mapped_device *md = container_of(work, struct mapped_device, work); struct bio *c; + int srcu_idx; + struct dm_table *map; - down_read(&md->io_lock); + map = dm_get_live_table(md, &srcu_idx); while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { spin_lock_irq(&md->deferred_lock); @@ -2486,23 +3042,19 @@ if (!c) break; - up_read(&md->io_lock); - if (dm_request_based(md)) generic_make_request(c); else - __split_and_process_bio(md, c); - - down_read(&md->io_lock); + __split_and_process_bio(md, map, c); } - up_read(&md->io_lock); + dm_put_live_table(md, srcu_idx); } static void dm_queue_flush(struct mapped_device *md) { clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); - smp_mb__after_clear_bit(); + smp_mb__after_atomic(); queue_work(md->wq, &md->work); } @@ -2528,10 +3080,10 @@ * reappear. */ if (dm_table_has_no_data_devices(table)) { - live_map = dm_get_live_table(md); + live_map = dm_get_live_table_fast(md); if (live_map) limits = md->queue->limits; - dm_table_put(live_map); + dm_put_live_table_fast(md); } if (!live_map) { @@ -2582,36 +3134,19 @@ } /* - * We need to be able to change a mapping table under a mounted - * filesystem. For example we might want to move some data in - * the background. Before the table can be swapped with - * dm_bind_table, dm_suspend must be called to flush any in - * flight bios and ensure that any further io gets deferred. - */ -/* - * Suspend mechanism in request-based dm. - * - * 1. Flush all I/Os by lock_fs() if needed. 
- * 2. Stop dispatching any I/O by stopping the request_queue. - * 3. Wait for all in-flight I/Os to be completed or requeued. + * If __dm_suspend returns 0, the device is completely quiescent + * now. There is no request-processing activity. All new requests + * are being added to md->deferred list. * - * To abort suspend, start the request_queue. + * Caller must hold md->suspend_lock */ -int dm_suspend(struct mapped_device *md, unsigned suspend_flags) +static int __dm_suspend(struct mapped_device *md, struct dm_table *map, + unsigned suspend_flags, int interruptible, + int dmf_suspended_flag) { - struct dm_table *map = NULL; - int r = 0; - int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0; - int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0; - - mutex_lock(&md->suspend_lock); - - if (dm_suspended_md(md)) { - r = -EINVAL; - goto out_unlock; - } - - map = dm_get_live_table(md); + bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG; + bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG; + int r; /* * DMF_NOFLUSH_SUSPENDING must be set before presuspend. @@ -2620,7 +3155,10 @@ if (noflush) set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); - /* This does not get reverted if there's an error later. */ + /* + * This gets reverted if there's an error later and the targets + * provide the .presuspend_undo hook. + */ dm_table_presuspend_targets(map); /* @@ -2631,8 +3169,10 @@ */ if (!noflush && do_lockfs) { r = lock_fs(md); - if (r) - goto out; + if (r) { + dm_table_presuspend_undo_targets(map); + return r; + } } /* @@ -2647,16 +3187,19 @@ * (dm_wq_work), we set BMF_BLOCK_IO_FOR_SUSPEND and call * flush_workqueue(md->wq). */ - down_write(&md->io_lock); set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); - up_write(&md->io_lock); + if (map) + synchronize_srcu(&md->io_barrier); /* * Stop md->queue before flushing md->wq in case request-based * dm defers requests to md->wq from md->queue. 
*/ - if (dm_request_based(md)) + if (dm_request_based(md)) { stop_queue(md->queue); + if (md->kworker_task) + flush_kthread_worker(&md->kworker); + } flush_workqueue(md->wq); @@ -2665,12 +3208,14 @@ * We call dm_wait_for_completion to wait for all existing requests * to finish. */ - r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE); + r = dm_wait_for_completion(md, interruptible); + if (!r) + set_bit(dmf_suspended_flag, &md->flags); - down_write(&md->io_lock); if (noflush) clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); - up_write(&md->io_lock); + if (map) + synchronize_srcu(&md->io_barrier); /* were we interrupted ? */ if (r < 0) { @@ -2680,65 +3225,222 @@ start_queue(md->queue); unlock_fs(md); - goto out; /* pushback list is already flushed, so skip flush */ + dm_table_presuspend_undo_targets(map); + /* pushback list is already flushed, so skip flush */ } - /* - * If dm_wait_for_completion returned 0, the device is completely - * quiescent now. There is no request-processing activity. All new - * requests are being added to md->deferred list. - */ + return r; +} - set_bit(DMF_SUSPENDED, &md->flags); +/* + * We need to be able to change a mapping table under a mounted + * filesystem. For example we might want to move some data in + * the background. Before the table can be swapped with + * dm_bind_table, dm_suspend must be called to flush any in + * flight bios and ensure that any further io gets deferred. + */ +/* + * Suspend mechanism in request-based dm. + * + * 1. Flush all I/Os by lock_fs() if needed. + * 2. Stop dispatching any I/O by stopping the request_queue. + * 3. Wait for all in-flight I/Os to be completed or requeued. + * + * To abort suspend, start the request_queue. 
+ */ +int dm_suspend(struct mapped_device *md, unsigned suspend_flags) +{ + struct dm_table *map = NULL; + int r = 0; - dm_table_postsuspend_targets(map); +retry: + mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); -out: - dm_table_put(map); + if (dm_suspended_md(md)) { + r = -EINVAL; + goto out_unlock; + } + + if (dm_suspended_internally_md(md)) { + /* already internally suspended, wait for internal resume */ + mutex_unlock(&md->suspend_lock); + r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); + if (r) + return r; + goto retry; + } + + map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); + + r = __dm_suspend(md, map, suspend_flags, TASK_INTERRUPTIBLE, DMF_SUSPENDED); + if (r) + goto out_unlock; + + dm_table_postsuspend_targets(map); out_unlock: mutex_unlock(&md->suspend_lock); return r; } +static int __dm_resume(struct mapped_device *md, struct dm_table *map) +{ + if (map) { + int r = dm_table_resume_targets(map); + if (r) + return r; + } + + dm_queue_flush(md); + + /* + * Flushing deferred I/Os must be done after targets are resumed + * so that mapping of targets can work correctly. + * Request-based dm is queueing the deferred I/Os in its request_queue. 
+ */ + if (dm_request_based(md)) + start_queue(md->queue); + + unlock_fs(md); + + return 0; +} + int dm_resume(struct mapped_device *md) { - int r = -EINVAL; + int r; struct dm_table *map = NULL; - mutex_lock(&md->suspend_lock); +retry: + r = -EINVAL; + mutex_lock_nested(&md->suspend_lock, SINGLE_DEPTH_NESTING); + if (!dm_suspended_md(md)) goto out; - map = dm_get_live_table(md); + if (dm_suspended_internally_md(md)) { + /* already internally suspended, wait for internal resume */ + mutex_unlock(&md->suspend_lock); + r = wait_on_bit(&md->flags, DMF_SUSPENDED_INTERNALLY, TASK_INTERRUPTIBLE); + if (r) + return r; + goto retry; + } + + map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); if (!map || !dm_table_get_size(map)) goto out; - r = dm_table_resume_targets(map); + r = __dm_resume(md, map); if (r) goto out; - dm_queue_flush(md); + clear_bit(DMF_SUSPENDED, &md->flags); +out: + mutex_unlock(&md->suspend_lock); + + return r; +} + +/* + * Internal suspend/resume works like userspace-driven suspend. It waits + * until all bios finish and prevents issuing new bios to the target drivers. + * It may be used only from the kernel. + */ + +static void __dm_internal_suspend(struct mapped_device *md, unsigned suspend_flags) +{ + struct dm_table *map = NULL; + + if (md->internal_suspend_count++) + return; /* nested internal suspend */ + + if (dm_suspended_md(md)) { + set_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); + return; /* nest suspend */ + } + + map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); /* - * Flushing deferred I/Os must be done after targets are resumed - * so that mapping of targets can work correctly. - * Request-based dm is queueing the deferred I/Os in its request_queue. + * Using TASK_UNINTERRUPTIBLE because only NOFLUSH internal suspend is + * supported. 
Properly supporting a TASK_INTERRUPTIBLE internal suspend + * would require changing .presuspend to return an error -- avoid this + * until there is a need for more elaborate variants of internal suspend. */ - if (dm_request_based(md)) - start_queue(md->queue); + (void) __dm_suspend(md, map, suspend_flags, TASK_UNINTERRUPTIBLE, + DMF_SUSPENDED_INTERNALLY); - unlock_fs(md); + dm_table_postsuspend_targets(map); +} - clear_bit(DMF_SUSPENDED, &md->flags); +static void __dm_internal_resume(struct mapped_device *md) +{ + BUG_ON(!md->internal_suspend_count); - r = 0; -out: - dm_table_put(map); + if (--md->internal_suspend_count) + return; /* resume from nested internal suspend */ + + if (dm_suspended_md(md)) + goto done; /* resume from nested suspend */ + + /* + * NOTE: existing callers don't need to call dm_table_resume_targets + * (which may fail -- so best to avoid it for now by passing NULL map) + */ + (void) __dm_resume(md, NULL); + +done: + clear_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); + smp_mb__after_atomic(); + wake_up_bit(&md->flags, DMF_SUSPENDED_INTERNALLY); +} + +void dm_internal_suspend_noflush(struct mapped_device *md) +{ + mutex_lock(&md->suspend_lock); + __dm_internal_suspend(md, DM_SUSPEND_NOFLUSH_FLAG); mutex_unlock(&md->suspend_lock); +} +EXPORT_SYMBOL_GPL(dm_internal_suspend_noflush); - return r; +void dm_internal_resume(struct mapped_device *md) +{ + mutex_lock(&md->suspend_lock); + __dm_internal_resume(md); + mutex_unlock(&md->suspend_lock); } +EXPORT_SYMBOL_GPL(dm_internal_resume); + +/* + * Fast variants of internal suspend/resume hold md->suspend_lock, + * which prevents interaction with userspace-driven suspend. 
+ */ + +void dm_internal_suspend_fast(struct mapped_device *md) +{ + mutex_lock(&md->suspend_lock); + if (dm_suspended_md(md) || dm_suspended_internally_md(md)) + return; + + set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); + synchronize_srcu(&md->io_barrier); + flush_workqueue(md->wq); + dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL_GPL(dm_internal_suspend_fast); + +void dm_internal_resume_fast(struct mapped_device *md) +{ + if (dm_suspended_md(md) || dm_suspended_internally_md(md)) + goto done; + + dm_queue_flush(md); + +done: + mutex_unlock(&md->suspend_lock); +} +EXPORT_SYMBOL_GPL(dm_internal_resume_fast); /*----------------------------------------------------------------- * Event notification. @@ -2792,6 +3494,7 @@ { return md->disk; } +EXPORT_SYMBOL_GPL(dm_disk); struct kobject *dm_kobject(struct mapped_device *md) { @@ -2817,6 +3520,16 @@ return test_bit(DMF_SUSPENDED, &md->flags); } +int dm_suspended_internally_md(struct mapped_device *md) +{ + return test_bit(DMF_SUSPENDED_INTERNALLY, &md->flags); +} + +int dm_test_deferred_remove_flag(struct mapped_device *md) +{ + return test_bit(DMF_DEFERRED_REMOVE, &md->flags); +} + int dm_suspended(struct dm_target *ti) { return dm_suspended_md(dm_table_get_md(ti->table)); @@ -2829,34 +3542,50 @@ } EXPORT_SYMBOL_GPL(dm_noflush_suspending); -struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity, unsigned per_bio_data_size) +struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type, + unsigned integrity, unsigned per_bio_data_size) { struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL); - struct kmem_cache *cachep; - unsigned int pool_size; + struct kmem_cache *cachep = NULL; + unsigned int pool_size = 0; unsigned int front_pad; if (!pools) return NULL; - if (type == DM_TYPE_BIO_BASED) { + type = filter_md_type(type, md); + + switch (type) { + case DM_TYPE_BIO_BASED: cachep = _io_cache; - pool_size = 16; + pool_size = 
dm_get_reserved_bio_based_ios(); front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone); - } else if (type == DM_TYPE_REQUEST_BASED) { + break; + case DM_TYPE_REQUEST_BASED: cachep = _rq_tio_cache; - pool_size = MIN_IOS; + pool_size = dm_get_reserved_rq_based_ios(); + pools->rq_pool = mempool_create_slab_pool(pool_size, _rq_cache); + if (!pools->rq_pool) + goto out; + /* fall through to setup remaining rq-based pools */ + case DM_TYPE_MQ_REQUEST_BASED: + if (!pool_size) + pool_size = dm_get_reserved_rq_based_ios(); front_pad = offsetof(struct dm_rq_clone_bio_info, clone); /* per_bio_data_size is not used. See __bind_mempools(). */ WARN_ON(per_bio_data_size != 0); - } else - goto out; + break; + default: + BUG(); + } - pools->io_pool = mempool_create_slab_pool(MIN_IOS, cachep); - if (!pools->io_pool) - goto out; + if (cachep) { + pools->io_pool = mempool_create_slab_pool(pool_size, cachep); + if (!pools->io_pool) + goto out; + } - pools->bs = bioset_create(pool_size, front_pad); + pools->bs = bioset_create_nobvec(pool_size, front_pad); if (!pools->bs) goto out; @@ -2876,8 +3605,8 @@ if (!pools) return; - if (pools->io_pool) - mempool_destroy(pools->io_pool); + mempool_destroy(pools->io_pool); + mempool_destroy(pools->rq_pool); if (pools->bs) bioset_free(pools->bs); @@ -2885,16 +3614,136 @@ kfree(pools); } +static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key, + u32 flags) +{ + struct mapped_device *md = bdev->bd_disk->private_data; + const struct pr_ops *ops; + struct dm_target *tgt; + fmode_t mode; + int srcu_idx, r; + + r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx); + if (r < 0) + return r; + + ops = bdev->bd_disk->fops->pr_ops; + if (ops && ops->pr_register) + r = ops->pr_register(bdev, old_key, new_key, flags); + else + r = -EOPNOTSUPP; + + dm_put_live_table(md, srcu_idx); + return r; +} + +static int dm_pr_reserve(struct block_device *bdev, u64 key, enum 
pr_type type, + u32 flags) +{ + struct mapped_device *md = bdev->bd_disk->private_data; + const struct pr_ops *ops; + struct dm_target *tgt; + fmode_t mode; + int srcu_idx, r; + + r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx); + if (r < 0) + return r; + + ops = bdev->bd_disk->fops->pr_ops; + if (ops && ops->pr_reserve) + r = ops->pr_reserve(bdev, key, type, flags); + else + r = -EOPNOTSUPP; + + dm_put_live_table(md, srcu_idx); + return r; +} + +static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type) +{ + struct mapped_device *md = bdev->bd_disk->private_data; + const struct pr_ops *ops; + struct dm_target *tgt; + fmode_t mode; + int srcu_idx, r; + + r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx); + if (r < 0) + return r; + + ops = bdev->bd_disk->fops->pr_ops; + if (ops && ops->pr_release) + r = ops->pr_release(bdev, key, type); + else + r = -EOPNOTSUPP; + + dm_put_live_table(md, srcu_idx); + return r; +} + +static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key, + enum pr_type type, bool abort) +{ + struct mapped_device *md = bdev->bd_disk->private_data; + const struct pr_ops *ops; + struct dm_target *tgt; + fmode_t mode; + int srcu_idx, r; + + r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx); + if (r < 0) + return r; + + ops = bdev->bd_disk->fops->pr_ops; + if (ops && ops->pr_preempt) + r = ops->pr_preempt(bdev, old_key, new_key, type, abort); + else + r = -EOPNOTSUPP; + + dm_put_live_table(md, srcu_idx); + return r; +} + +static int dm_pr_clear(struct block_device *bdev, u64 key) +{ + struct mapped_device *md = bdev->bd_disk->private_data; + const struct pr_ops *ops; + struct dm_target *tgt; + fmode_t mode; + int srcu_idx, r; + + r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx); + if (r < 0) + return r; + + ops = bdev->bd_disk->fops->pr_ops; + if (ops && ops->pr_clear) + r = ops->pr_clear(bdev, key); + else + r = -EOPNOTSUPP; + + 
dm_put_live_table(md, srcu_idx); + return r; +} + +static const struct pr_ops dm_pr_ops = { + .pr_register = dm_pr_register, + .pr_reserve = dm_pr_reserve, + .pr_release = dm_pr_release, + .pr_preempt = dm_pr_preempt, + .pr_clear = dm_pr_clear, +}; + static const struct block_device_operations dm_blk_dops = { .open = dm_blk_open, .release = dm_blk_close, .ioctl = dm_blk_ioctl, .getgeo = dm_blk_getgeo, + .pr_ops = &dm_pr_ops, .owner = THIS_MODULE }; -EXPORT_SYMBOL(dm_get_mapinfo); - /* * module hooks */ @@ -2903,6 +3752,16 @@ module_param(major, uint, 0); MODULE_PARM_DESC(major, "The major number of the device mapper"); + +module_param(reserved_bio_based_ios, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(reserved_bio_based_ios, "Reserved IOs in bio-based mempools"); + +module_param(reserved_rq_based_ios, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools"); + +module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices"); + MODULE_DESCRIPTION(DM_NAME " driver"); MODULE_AUTHOR("Joe Thornber "); MODULE_LICENSE("GPL");