--- zzzz-none-000/linux-3.10.107/drivers/block/loop.c 2017-06-27 09:49:32.000000000 +0000 +++ scorpion-7490-727/linux-3.10.107/drivers/block/loop.c 2021-02-04 17:41:59.000000000 +0000 @@ -63,7 +63,6 @@ #include #include #include -#include #include #include #include @@ -76,6 +75,8 @@ #include #include #include +#include +#include "loop.h" #include @@ -85,28 +86,6 @@ static int max_part; static int part_shift; -/* - * Transfer functions - */ -static int transfer_none(struct loop_device *lo, int cmd, - struct page *raw_page, unsigned raw_off, - struct page *loop_page, unsigned loop_off, - int size, sector_t real_block) -{ - char *raw_buf = kmap_atomic(raw_page) + raw_off; - char *loop_buf = kmap_atomic(loop_page) + loop_off; - - if (cmd == READ) - memcpy(loop_buf, raw_buf, size); - else - memcpy(raw_buf, loop_buf, size); - - kunmap_atomic(loop_buf); - kunmap_atomic(raw_buf); - cond_resched(); - return 0; -} - static int transfer_xor(struct loop_device *lo, int cmd, struct page *raw_page, unsigned raw_off, struct page *loop_page, unsigned loop_off, @@ -145,14 +124,13 @@ static struct loop_func_table none_funcs = { .number = LO_CRYPT_NONE, - .transfer = transfer_none, -}; +}; static struct loop_func_table xor_funcs = { .number = LO_CRYPT_XOR, .transfer = transfer_xor, .init = xor_init -}; +}; /* xfer_funcs[0] is special - its release function is never called */ static struct loop_func_table *xfer_funcs[MAX_LO_CRYPT] = { @@ -186,6 +164,62 @@ return get_size(lo->lo_offset, lo->lo_sizelimit, file); } +static void __loop_update_dio(struct loop_device *lo, bool dio) +{ + struct file *file = lo->lo_backing_file; + struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; + unsigned short sb_bsize = 0; + unsigned dio_align = 0; + bool use_dio; + + if (inode->i_sb->s_bdev) { + sb_bsize = bdev_logical_block_size(inode->i_sb->s_bdev); + dio_align = sb_bsize - 1; + } + + /* + * We support direct I/O only if lo_offset is aligned with the + * logical I/O size of backing device, and the logical block + * size of loop is bigger than the backing device's and the loop + * needn't transform transfer. + * + * TODO: the above condition may be loosed in the future, and + * direct I/O may be switched runtime at that time because most + * of requests in sane appplications should be PAGE_SIZE algined + */ + if (dio) { + if (queue_logical_block_size(lo->lo_queue) >= sb_bsize && + !(lo->lo_offset & dio_align) && + mapping->a_ops->direct_IO && + !lo->transfer) + use_dio = true; + else + use_dio = false; + } else { + use_dio = false; + } + + if (lo->use_dio == use_dio) + return; + + /* flush dirty pages before changing direct IO */ + vfs_fsync(file, 0); + + /* + * The flag of LO_FLAGS_DIRECT_IO is handled similarly with + * LO_FLAGS_READ_ONLY, both are set from kernel, and losetup + * will get updated by ioctl(LOOP_GET_STATUS) + */ + blk_mq_freeze_queue(lo->lo_queue); + lo->use_dio = use_dio; + if (use_dio) + lo->lo_flags |= LO_FLAGS_DIRECT_IO; + else + lo->lo_flags &= ~LO_FLAGS_DIRECT_IO; + blk_mq_unfreeze_queue(lo->lo_queue); +} + static int figure_loop_size(struct loop_device *lo, loff_t offset, loff_t sizelimit) { @@ -212,304 +246,319 @@ struct page *lpage, unsigned loffs, int size, sector_t rblock) { - if (unlikely(!lo->transfer)) + int ret; + + ret = lo->transfer(lo, cmd, rpage, roffs, lpage, loffs, size, rblock); + if (likely(!ret)) return 0; - return lo->transfer(lo, cmd, rpage, roffs, lpage, loffs, size, rblock); + printk_ratelimited(KERN_ERR + "loop: Transfer error at byte offset %llu, length %i.\n", + (unsigned long long)rblock << 9, size); + return ret; } -/** - * __do_lo_send_write - helper for writing data to a loop device - * - * This helper just factors out common code between do_lo_send_direct_write() - * and do_lo_send_write(). - */ -static int __do_lo_send_write(struct file *file, - u8 *buf, const int len, loff_t pos) +static int lo_write_bvec(struct file *file, struct bio_vec *bvec, loff_t *ppos) { + struct iov_iter i; ssize_t bw; - mm_segment_t old_fs = get_fs(); + + iov_iter_bvec(&i, ITER_BVEC, bvec, 1, bvec->bv_len); file_start_write(file); - set_fs(get_ds()); - bw = file->f_op->write(file, buf, len, &pos); - set_fs(old_fs); + bw = vfs_iter_write(file, &i, ppos); file_end_write(file); - if (likely(bw == len)) + + if (likely(bw == bvec->bv_len)) return 0; - printk(KERN_ERR "loop: Write error at byte offset %llu, length %i.\n", - (unsigned long long)pos, len); + + printk_ratelimited(KERN_ERR + "loop: Write error at byte offset %llu, length %i.\n", + (unsigned long long)*ppos, bvec->bv_len); if (bw >= 0) bw = -EIO; return bw; } -/** - * do_lo_send_direct_write - helper for writing data to a loop device - * - * This is the fast, non-transforming version that does not need double - * buffering. - */ -static int do_lo_send_direct_write(struct loop_device *lo, - struct bio_vec *bvec, loff_t pos, struct page *page) +static int lo_write_simple(struct loop_device *lo, struct request *rq, + loff_t pos) { - ssize_t bw = __do_lo_send_write(lo->lo_backing_file, - kmap(bvec->bv_page) + bvec->bv_offset, - bvec->bv_len, pos); - kunmap(bvec->bv_page); - cond_resched(); - return bw; + struct bio_vec bvec; + struct req_iterator iter; + int ret = 0; + + rq_for_each_segment(bvec, rq, iter) { + ret = lo_write_bvec(lo->lo_backing_file, &bvec, &pos); + if (ret < 0) + break; + cond_resched(); + } + + return ret; } -/** - * do_lo_send_write - helper for writing data to a loop device - * +/* * This is the slow, transforming version that needs to double buffer the * data as it cannot do the transformations in place without having direct * access to the destination pages of the backing file. */ -static int do_lo_send_write(struct loop_device *lo, struct bio_vec *bvec, - loff_t pos, struct page *page) +static int lo_write_transfer(struct loop_device *lo, struct request *rq, + loff_t pos) { - int ret = lo_do_transfer(lo, WRITE, page, 0, bvec->bv_page, - bvec->bv_offset, bvec->bv_len, pos >> 9); - if (likely(!ret)) - return __do_lo_send_write(lo->lo_backing_file, - page_address(page), bvec->bv_len, - pos); - printk(KERN_ERR "loop: Transfer error at byte offset %llu, " - "length %i.\n", (unsigned long long)pos, bvec->bv_len); - if (ret > 0) - ret = -EIO; - return ret; -} + struct bio_vec bvec, b; + struct req_iterator iter; + struct page *page; + int ret = 0; -static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos) -{ - int (*do_lo_send)(struct loop_device *, struct bio_vec *, loff_t, - struct page *page); - struct bio_vec *bvec; - struct page *page = NULL; - int i, ret = 0; + page = alloc_page(GFP_NOIO); + if (unlikely(!page)) + return -ENOMEM; - if (lo->transfer != transfer_none) { - page = alloc_page(GFP_NOIO | __GFP_HIGHMEM); - if (unlikely(!page)) - goto fail; - kmap(page); - do_lo_send = do_lo_send_write; - } else { - do_lo_send = do_lo_send_direct_write; - } + rq_for_each_segment(bvec, rq, iter) { + ret = lo_do_transfer(lo, WRITE, page, 0, bvec.bv_page, + bvec.bv_offset, bvec.bv_len, pos >> 9); + if (unlikely(ret)) + break; - bio_for_each_segment(bvec, bio, i) { - ret = do_lo_send(lo, bvec, pos, page); + b.bv_page = page; + b.bv_offset = 0; + b.bv_len = bvec.bv_len; + ret = lo_write_bvec(lo->lo_backing_file, &b, &pos); if (ret < 0) break; - pos += bvec->bv_len; } - if (page) { - kunmap(page); - __free_page(page); - } -out: + + __free_page(page); return ret; -fail: - printk(KERN_ERR "loop: Failed to allocate temporary page for write.\n"); - ret = -ENOMEM; - goto out; } -struct lo_read_data { - struct loop_device *lo; - struct page *page; - unsigned offset; - int bsize; -}; - -static int -lo_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, - struct splice_desc *sd) +static int lo_read_simple(struct loop_device *lo, struct request *rq, + loff_t pos) { - struct lo_read_data *p = sd->u.data; - struct loop_device *lo = p->lo; - struct page *page = buf->page; - sector_t IV; - int size; + struct bio_vec bvec; + struct req_iterator iter; + struct iov_iter i; + ssize_t len; + + rq_for_each_segment(bvec, rq, iter) { + iov_iter_bvec(&i, ITER_BVEC, &bvec, 1, bvec.bv_len); + len = vfs_iter_read(lo->lo_backing_file, &i, &pos); + if (len < 0) + return len; - IV = ((sector_t) page->index << (PAGE_CACHE_SHIFT - 9)) + - (buf->offset >> 9); - size = sd->len; - if (size > p->bsize) - size = p->bsize; + flush_dcache_page(bvec.bv_page); - if (lo_do_transfer(lo, READ, page, buf->offset, p->page, p->offset, size, IV)) { - printk(KERN_ERR "loop: transfer error block %ld\n", - page->index); - size = -EINVAL; - } + if (len != bvec.bv_len) { + struct bio *bio; - flush_dcache_page(p->page); - - if (size > 0) - p->offset += size; + __rq_for_each_bio(bio, rq) + zero_fill_bio(bio); + break; + } + cond_resched(); + } - return size; + return 0; } -static int -lo_direct_splice_actor(struct pipe_inode_info *pipe, struct splice_desc *sd) +static int lo_read_transfer(struct loop_device *lo, struct request *rq, + loff_t pos) { - return __splice_from_pipe(pipe, sd, lo_splice_actor); -} + struct bio_vec bvec, b; + struct req_iterator iter; + struct iov_iter i; + struct page *page; + ssize_t len; + int ret = 0; -static ssize_t -do_lo_receive(struct loop_device *lo, - struct bio_vec *bvec, int bsize, loff_t pos) -{ - struct lo_read_data cookie; - struct splice_desc sd; - struct file *file; - ssize_t retval; + page = alloc_page(GFP_NOIO); + if (unlikely(!page)) + return -ENOMEM; - cookie.lo = lo; - cookie.page = bvec->bv_page; - cookie.offset = bvec->bv_offset; - cookie.bsize = bsize; - - sd.len = 0; - sd.total_len = bvec->bv_len; - sd.flags = 0; - sd.pos = pos; - sd.u.data = &cookie; + rq_for_each_segment(bvec, rq, iter) { + loff_t offset = pos; - file = lo->lo_backing_file; - retval = splice_direct_to_actor(file, &sd, lo_direct_splice_actor); + b.bv_page = page; + b.bv_offset = 0; + b.bv_len = bvec.bv_len; + + iov_iter_bvec(&i, ITER_BVEC, &b, 1, b.bv_len); + len = vfs_iter_read(lo->lo_backing_file, &i, &pos); + if (len < 0) { + ret = len; + goto out_free_page; + } - return retval; -} + ret = lo_do_transfer(lo, READ, page, 0, bvec.bv_page, + bvec.bv_offset, len, offset >> 9); + if (ret) + goto out_free_page; -static int -lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos) -{ - struct bio_vec *bvec; - ssize_t s; - int i; + flush_dcache_page(bvec.bv_page); - bio_for_each_segment(bvec, bio, i) { - s = do_lo_receive(lo, bvec, bsize, pos); - if (s < 0) - return s; + if (len != bvec.bv_len) { + struct bio *bio; - if (s != bvec->bv_len) { - zero_fill_bio(bio); + __rq_for_each_bio(bio, rq) + zero_fill_bio(bio); break; } - pos += bvec->bv_len; } - return 0; + + ret = 0; +out_free_page: + __free_page(page); + return ret; } -static int do_bio_filebacked(struct loop_device *lo, struct bio *bio) +static int lo_discard(struct loop_device *lo, struct request *rq, loff_t pos) { - loff_t pos; + /* + * We use punch hole to reclaim the free space used by the + * image a.k.a. discard. However we do not support discard if + * encryption is enabled, because it may give an attacker + * useful information. + */ + struct file *file = lo->lo_backing_file; + int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; int ret; - pos = ((loff_t) bio->bi_sector << 9) + lo->lo_offset; + if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) { + ret = -EOPNOTSUPP; + goto out; + } - if (bio_rw(bio) == WRITE) { - struct file *file = lo->lo_backing_file; + ret = file->f_op->fallocate(file, mode, pos, blk_rq_bytes(rq)); + if (unlikely(ret && ret != -EINVAL && ret != -EOPNOTSUPP)) + ret = -EIO; + out: + return ret; +} - if (bio->bi_rw & REQ_FLUSH) { - ret = vfs_fsync(file, 0); - if (unlikely(ret && ret != -EINVAL)) { - ret = -EIO; - goto out; - } - } +static int lo_req_flush(struct loop_device *lo, struct request *rq) +{ + struct file *file = lo->lo_backing_file; + int ret = vfs_fsync(file, 0); + if (unlikely(ret && ret != -EINVAL)) + ret = -EIO; - /* - * We use punch hole to reclaim the free space used by the - * image a.k.a. discard. However we do not support discard if - * encryption is enabled, because it may give an attacker - * useful information. - */ - if (bio->bi_rw & REQ_DISCARD) { - struct file *file = lo->lo_backing_file; - int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; - - if ((!file->f_op->fallocate) || - lo->lo_encrypt_key_size) { - ret = -EOPNOTSUPP; - goto out; - } - ret = file->f_op->fallocate(file, mode, pos, - bio->bi_size); - if (unlikely(ret && ret != -EINVAL && - ret != -EOPNOTSUPP)) - ret = -EIO; - goto out; - } + return ret; +} - ret = lo_send(lo, bio, pos); +static inline void handle_partial_read(struct loop_cmd *cmd, long bytes) +{ + if (bytes < 0 || (cmd->rq->cmd_flags & REQ_WRITE)) + return; - if ((bio->bi_rw & REQ_FUA) && !ret) { - ret = vfs_fsync(file, 0); - if (unlikely(ret && ret != -EINVAL)) - ret = -EIO; - } - } else - ret = lo_receive(lo, bio, lo->lo_blocksize, pos); + if (unlikely(bytes < blk_rq_bytes(cmd->rq))) { + struct bio *bio = cmd->rq->bio; -out: - return ret; + bio_advance(bio, bytes); + zero_fill_bio(bio); + } } -/* - * Add bio to back of pending list - */ -static void loop_add_bio(struct loop_device *lo, struct bio *bio) +static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2) { - lo->lo_bio_count++; - bio_list_add(&lo->lo_bio_list, bio); + struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb); + struct request *rq = cmd->rq; + + handle_partial_read(cmd, ret); + + if (ret > 0) + ret = 0; + else if (ret < 0) + ret = -EIO; + + blk_mq_complete_request(rq, ret); } -/* - * Grab first pending buffer - */ -static struct bio *loop_get_bio(struct loop_device *lo) +static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd, + loff_t pos, bool rw) { - lo->lo_bio_count--; - return bio_list_pop(&lo->lo_bio_list); + struct iov_iter iter; + struct bio_vec *bvec; + struct bio *bio = cmd->rq->bio; + struct file *file = lo->lo_backing_file; + int ret; + + /* nomerge for loop request queue */ + WARN_ON(cmd->rq->bio != cmd->rq->biotail); + + bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); + iov_iter_bvec(&iter, ITER_BVEC | rw, bvec, + bio_segments(bio), blk_rq_bytes(cmd->rq)); + /* + * This bio may be started from the middle of the 'bvec' + * because of bio splitting, so offset from the bvec must + * be passed to iov iterator + */ + iter.iov_offset = bio->bi_iter.bi_bvec_done; + + cmd->iocb.ki_pos = pos; + cmd->iocb.ki_filp = file; + cmd->iocb.ki_complete = lo_rw_aio_complete; + cmd->iocb.ki_flags = IOCB_DIRECT; + + if (rw == WRITE) + ret = file->f_op->write_iter(&cmd->iocb, &iter); + else + ret = file->f_op->read_iter(&cmd->iocb, &iter); + + if (ret != -EIOCBQUEUED) + cmd->iocb.ki_complete(&cmd->iocb, ret, 0); + return 0; } -static void loop_make_request(struct request_queue *q, struct bio *old_bio) + +static inline int lo_rw_simple(struct loop_device *lo, + struct request *rq, loff_t pos, bool rw) { - struct loop_device *lo = q->queuedata; - int rw = bio_rw(old_bio); + struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); - if (rw == READA) - rw = READ; + if (cmd->use_aio) + return lo_rw_aio(lo, cmd, pos, rw); - BUG_ON(!lo || (rw != READ && rw != WRITE)); + /* + * lo_write_simple and lo_read_simple should have been covered + * by io submit style function like lo_rw_aio(), one blocker + * is that lo_read_simple() need to call flush_dcache_page after + * the page is written from kernel, and it isn't easy to handle + * this in io submit style function which submits all segments + * of the req at one time. And direct read IO doesn't need to + * run flush_dcache_page(). + */ + if (rw == WRITE) + return lo_write_simple(lo, rq, pos); + else + return lo_read_simple(lo, rq, pos); +} - spin_lock_irq(&lo->lo_lock); - if (lo->lo_state != Lo_bound) - goto out; - if (unlikely(rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY))) - goto out; - if (lo->lo_bio_count >= q->nr_congestion_on) - wait_event_lock_irq(lo->lo_req_wait, - lo->lo_bio_count < q->nr_congestion_off, - lo->lo_lock); - loop_add_bio(lo, old_bio); - wake_up(&lo->lo_event); - spin_unlock_irq(&lo->lo_lock); - return; +static int do_req_filebacked(struct loop_device *lo, struct request *rq) +{ + loff_t pos; + int ret; -out: - spin_unlock_irq(&lo->lo_lock); - bio_io_error(old_bio); + pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset; + + if (rq->cmd_flags & REQ_WRITE) { + if (rq->cmd_flags & REQ_FLUSH) + ret = lo_req_flush(lo, rq); + else if (rq->cmd_flags & REQ_DISCARD) + ret = lo_discard(lo, rq, pos); + else if (lo->transfer) + ret = lo_write_transfer(lo, rq, pos); + else + ret = lo_rw_simple(lo, rq, pos, WRITE); + + } else { + if (lo->transfer) + ret = lo_read_transfer(lo, rq, pos); + else + ret = lo_rw_simple(lo, rq, pos, READ); + } + + return ret; } struct switch_request { @@ -517,57 +566,33 @@ struct completion wait; }; -static void do_loop_switch(struct loop_device *, struct switch_request *); - -static inline void loop_handle_bio(struct loop_device *lo, struct bio *bio) +static inline void loop_update_dio(struct loop_device *lo) { - if (unlikely(!bio->bi_bdev)) { - do_loop_switch(lo, bio->bi_private); - bio_put(bio); - } else { - int ret = do_bio_filebacked(lo, bio); - bio_endio(bio, ret); - } + __loop_update_dio(lo, io_is_direct(lo->lo_backing_file) | + lo->use_dio); } /* - * worker thread that handles reads/writes to file backed loop devices, - * to avoid blocking in our make_request_fn. it also does loop decrypting - * on reads for block backed loop, as that is too heavy to do from - * b_end_io context where irqs may be disabled. - * - * Loop explanation: loop_clr_fd() sets lo_state to Lo_rundown before - * calling kthread_stop(). Therefore once kthread_should_stop() is - * true, make_request will not place any more requests. Therefore - * once kthread_should_stop() is true and lo_bio is NULL, we are - * done with the loop. + * Do the actual switch; called from the BIO completion routine */ -static int loop_thread(void *data) +static void do_loop_switch(struct loop_device *lo, struct switch_request *p) { - struct loop_device *lo = data; - struct bio *bio; - - set_user_nice(current, -20); - - while (!kthread_should_stop() || !bio_list_empty(&lo->lo_bio_list)) { - - wait_event_interruptible(lo->lo_event, - !bio_list_empty(&lo->lo_bio_list) || - kthread_should_stop()); - - if (bio_list_empty(&lo->lo_bio_list)) - continue; - spin_lock_irq(&lo->lo_lock); - bio = loop_get_bio(lo); - if (lo->lo_bio_count < lo->lo_queue->nr_congestion_off) - wake_up(&lo->lo_req_wait); - spin_unlock_irq(&lo->lo_lock); + struct file *file = p->file; + struct file *old_file = lo->lo_backing_file; + struct address_space *mapping; - BUG_ON(!bio); - loop_handle_bio(lo, bio); - } + /* if no new file, only flush of queued bios requested */ + if (!file) + return; - return 0; + mapping = file->f_mapping; + mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask); + lo->lo_backing_file = file; + lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ? + mapping->host->i_bdev->bd_block_size : PAGE_SIZE; + lo->old_gfp_mask = mapping_gfp_mask(mapping); + mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); + loop_update_dio(lo); } /* @@ -578,15 +603,18 @@ static int loop_switch(struct loop_device *lo, struct file *file) { struct switch_request w; - struct bio *bio = bio_alloc(GFP_KERNEL, 0); - if (!bio) - return -ENOMEM; - init_completion(&w.wait); + w.file = file; - bio->bi_private = &w; - bio->bi_bdev = NULL; - loop_make_request(lo->lo_queue, bio); - wait_for_completion(&w.wait); + + /* freeze queue and wait for completion of scheduled requests */ + blk_mq_freeze_queue(lo->lo_queue); + + /* do the switch action */ + do_loop_switch(lo, &w); + + /* unfreeze */ + blk_mq_unfreeze_queue(lo->lo_queue); + return 0; } @@ -595,38 +623,31 @@ */ static int loop_flush(struct loop_device *lo) { - /* loop not yet configured, no running thread, nothing to flush */ - if (!lo->lo_thread) - return 0; - return loop_switch(lo, NULL); } -/* - * Do the actual switch; called from the BIO completion routine - */ -static void do_loop_switch(struct loop_device *lo, struct switch_request *p) +static void loop_reread_partitions(struct loop_device *lo, + struct block_device *bdev) { - struct file *file = p->file; - struct file *old_file = lo->lo_backing_file; - struct address_space *mapping; - - /* if no new file, only flush of queued bios requested */ - if (!file) - goto out; + int rc; - mapping = file->f_mapping; - mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask); - lo->lo_backing_file = file; - lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ? - mapping->host->i_bdev->bd_block_size : PAGE_SIZE; - lo->old_gfp_mask = mapping_gfp_mask(mapping); - mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); -out: - complete(&p->wait); + /* + * bd_mutex has been held already in release path, so don't + * acquire it if this function is called in such case. + * + * If the reread partition isn't from release path, lo_refcnt + * must be at least one and it can only become zero when the + * current holder is released. + */ + if (!atomic_read(&lo->lo_refcnt)) + rc = __blkdev_reread_part(bdev); + else + rc = blkdev_reread_part(bdev); + if (rc) + pr_warn("%s: partition scan of loop%d (%s) failed (rc=%d)\n", + __func__, lo->lo_number, lo->lo_file_name, rc); } - /* * loop_change_fd switched the backing store of a loopback device to * a new file. This is useful for operating system installers to free up @@ -675,7 +696,7 @@ fput(old_file); if (lo->lo_flags & LO_FLAGS_PARTSCAN) - ioctl_by_bdev(bdev, BLKRRPART, 0); + loop_reread_partitions(lo, bdev); return 0; out_putf: @@ -719,7 +740,7 @@ spin_lock_irq(&lo->lo_lock); if (lo->lo_backing_file) - p = d_path(&lo->lo_backing_file->f_path, buf, PAGE_SIZE - 1); + p = file_path(lo->lo_backing_file, buf, PAGE_SIZE - 1); spin_unlock_irq(&lo->lo_lock); if (IS_ERR_OR_NULL(p)) @@ -758,11 +779,19 @@ return sprintf(buf, "%s\n", partscan ? "1" : "0"); } +static ssize_t loop_attr_dio_show(struct loop_device *lo, char *buf) +{ + int dio = (lo->lo_flags & LO_FLAGS_DIRECT_IO); + + return sprintf(buf, "%s\n", dio ? "1" : "0"); +} + LOOP_ATTR_RO(backing_file); LOOP_ATTR_RO(offset); LOOP_ATTR_RO(sizelimit); LOOP_ATTR_RO(autoclear); LOOP_ATTR_RO(partscan); +LOOP_ATTR_RO(dio); static struct attribute *loop_attrs[] = { &loop_attr_backing_file.attr, @@ -770,6 +799,7 @@ &loop_attr_sizelimit.attr, &loop_attr_autoclear.attr, &loop_attr_partscan.attr, + &loop_attr_dio.attr, NULL, }; @@ -798,7 +828,7 @@ /* * We use punch hole to reclaim the free space used by the - * image a.k.a. discard. However we do support discard if + * image a.k.a. discard. However we do not support discard if * encryption is enabled, because it may give an attacker * useful information. */ @@ -806,7 +836,7 @@ lo->lo_encrypt_key_size) { q->limits.discard_granularity = 0; q->limits.discard_alignment = 0; - q->limits.max_discard_sectors = 0; + blk_queue_max_discard_sectors(q, 0); q->limits.discard_zeroes_data = 0; queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); return; @@ -814,11 +844,28 @@ q->limits.discard_granularity = inode->i_sb->s_blocksize; q->limits.discard_alignment = 0; - q->limits.max_discard_sectors = UINT_MAX >> 9; + blk_queue_max_discard_sectors(q, UINT_MAX >> 9); q->limits.discard_zeroes_data = 1; queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); } +static void loop_unprepare_queue(struct loop_device *lo) +{ + flush_kthread_worker(&lo->worker); + kthread_stop(lo->worker_task); +} + +static int loop_prepare_queue(struct loop_device *lo) +{ + init_kthread_worker(&lo->worker); + lo->worker_task = kthread_run(kthread_worker_fn, + &lo->worker, "loop%d", lo->lo_number); + if (IS_ERR(lo->worker_task)) + return -ENOMEM; + set_user_nice(lo->worker_task, MIN_NICE); + return 0; +} + static int loop_set_fd(struct loop_device *lo, fmode_t mode, struct block_device *bdev, unsigned int arg) { @@ -866,7 +913,7 @@ goto out_putf; if (!(file->f_mode & FMODE_WRITE) || !(mode & FMODE_WRITE) || - !file->f_op->write) + !file->f_op->write_iter) lo_flags |= LO_FLAGS_READ_ONLY; lo_blocksize = S_ISBLK(inode->i_mode) ? @@ -876,27 +923,29 @@ size = get_loop_size(lo, file); if ((loff_t)(sector_t)size != size) goto out_putf; + error = loop_prepare_queue(lo); + if (error) + goto out_putf; error = 0; set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0); + lo->use_dio = false; lo->lo_blocksize = lo_blocksize; lo->lo_device = bdev; lo->lo_flags = lo_flags; lo->lo_backing_file = file; - lo->transfer = transfer_none; + lo->transfer = NULL; lo->ioctl = NULL; lo->lo_sizelimit = 0; - lo->lo_bio_count = 0; lo->old_gfp_mask = mapping_gfp_mask(mapping); mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); - bio_list_init(&lo->lo_bio_list); - if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) blk_queue_flush(lo->lo_queue, REQ_FLUSH); + loop_update_dio(lo); set_capacity(lo->lo_disk, size); bd_set_size(bdev, size << 9); loop_sysfs_init(lo); @@ -905,18 +954,11 @@ set_blocksize(bdev, lo_blocksize); - lo->lo_thread = kthread_create(loop_thread, lo, "loop%d", - lo->lo_number); - if (IS_ERR(lo->lo_thread)) { - error = PTR_ERR(lo->lo_thread); - goto out_clr; - } lo->lo_state = Lo_bound; - wake_up_process(lo->lo_thread); if (part_shift) lo->lo_flags |= LO_FLAGS_PARTSCAN; if (lo->lo_flags & LO_FLAGS_PARTSCAN) - ioctl_by_bdev(bdev, BLKRRPART, 0); + loop_reread_partitions(lo, bdev); /* Grab the block_device to prevent its destruction after we * put /dev/loopXX inode. Later in loop_clr_fd() we bdput(bdev). @@ -924,18 +966,6 @@ bdgrab(bdev); return 0; -out_clr: - loop_sysfs_exit(lo); - lo->lo_thread = NULL; - lo->lo_device = NULL; - lo->lo_backing_file = NULL; - lo->lo_flags = 0; - set_capacity(lo->lo_disk, 0); - invalidate_bdev(bdev); - bd_set_size(bdev, 0); - kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); - mapping_set_gfp_mask(mapping, lo->old_gfp_mask); - lo->lo_state = Lo_unbound; out_putf: fput(file); out: @@ -1000,7 +1030,7 @@ * /do something like mkfs/losetup -d causing the losetup -d * command to fail with EBUSY. */ - if (lo->lo_refcnt > 1) { + if (atomic_read(&lo->lo_refcnt) > 1) { lo->lo_flags |= LO_FLAGS_AUTOCLEAR; mutex_unlock(&lo->lo_ctl_mutex); return 0; @@ -1009,13 +1039,11 @@ if (filp == NULL) return -EINVAL; - spin_lock_irq(&lo->lo_lock); - lo->lo_state = Lo_rundown; - spin_unlock_irq(&lo->lo_lock); - - kthread_stop(lo->lo_thread); + /* freeze request queue during the transition */ + blk_mq_freeze_queue(lo->lo_queue); spin_lock_irq(&lo->lo_lock); + lo->lo_state = Lo_rundown; lo->lo_backing_file = NULL; spin_unlock_irq(&lo->lo_lock); @@ -1027,7 +1055,6 @@ lo->lo_offset = 0; lo->lo_sizelimit = 0; lo->lo_encrypt_key_size = 0; - lo->lo_thread = NULL; memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE); memset(lo->lo_crypt_name, 0, LO_NAME_SIZE); memset(lo->lo_file_name, 0, LO_NAME_SIZE); @@ -1046,11 +1073,14 @@ lo->lo_state = Lo_unbound; /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); + blk_mq_unfreeze_queue(lo->lo_queue); + if (lo->lo_flags & LO_FLAGS_PARTSCAN && bdev) - ioctl_by_bdev(bdev, BLKRRPART, 0); + loop_reread_partitions(lo, bdev); lo->lo_flags = 0; if (!part_shift) lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN; + loop_unprepare_queue(lo); mutex_unlock(&lo->lo_ctl_mutex); /* * Need not hold lo_ctl_mutex to fput backing file. @@ -1078,9 +1108,12 @@ if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE) return -EINVAL; + /* I/O need to be drained during transfer transition */ + blk_mq_freeze_queue(lo->lo_queue); + err = loop_release_xfer(lo); if (err) - return err; + goto exit; if (info->lo_encrypt_type) { unsigned int type = info->lo_encrypt_type; @@ -1095,12 +1128,14 @@ err = loop_init_xfer(lo, xfer, info); if (err) - return err; + goto exit; if (lo->lo_offset != info->lo_offset || lo->lo_sizelimit != info->lo_sizelimit) - if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) - return -EFBIG; + if (figure_loop_size(lo, info->lo_offset, info->lo_sizelimit)) { + err = -EFBIG; + goto exit; + } loop_config_discard(lo); @@ -1118,13 +1153,6 @@ (info->lo_flags & LO_FLAGS_AUTOCLEAR)) lo->lo_flags ^= LO_FLAGS_AUTOCLEAR; - if ((info->lo_flags & LO_FLAGS_PARTSCAN) && - !(lo->lo_flags & LO_FLAGS_PARTSCAN)) { - lo->lo_flags |= LO_FLAGS_PARTSCAN; - lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; - ioctl_by_bdev(lo->lo_device, BLKRRPART, 0); - } - lo->lo_encrypt_key_size = info->lo_encrypt_key_size; lo->lo_init[0] = info->lo_init[0]; lo->lo_init[1] = info->lo_init[1]; @@ -1132,9 +1160,22 @@ memcpy(lo->lo_encrypt_key, info->lo_encrypt_key, info->lo_encrypt_key_size); lo->lo_key_owner = uid; - } + } - return 0; + /* update dio if lo_offset or transfer is changed */ + __loop_update_dio(lo, lo->use_dio); + + exit: + blk_mq_unfreeze_queue(lo->lo_queue); + + if (!err && (info->lo_flags & LO_FLAGS_PARTSCAN) && + !(lo->lo_flags & LO_FLAGS_PARTSCAN)) { + lo->lo_flags |= LO_FLAGS_PARTSCAN; + lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; + loop_reread_partitions(lo, lo->lo_device); + } + + return err; } static int @@ -1284,6 +1325,20 @@ return figure_loop_size(lo, lo->lo_offset, lo->lo_sizelimit); } +static int loop_set_dio(struct loop_device *lo, unsigned long arg) +{ + int error = -ENXIO; + if (lo->lo_state != Lo_bound) + goto out; + + __loop_update_dio(lo, !!arg); + if (lo->use_dio == !!arg) + return 0; + error = -EINVAL; + out: + return error; +} + static int lo_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { @@ -1327,6 +1382,11 @@ if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) err = loop_set_capacity(lo, bdev); break; + case LOOP_SET_DIRECT_IO: + err = -EPERM; + if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) + err = loop_set_dio(lo, arg); + break; default: err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL; } @@ -1503,24 +1563,20 @@ goto out; } - mutex_lock(&lo->lo_ctl_mutex); - lo->lo_refcnt++; - mutex_unlock(&lo->lo_ctl_mutex); + atomic_inc(&lo->lo_refcnt); out: mutex_unlock(&loop_index_mutex); return err; } -static void lo_release(struct gendisk *disk, fmode_t mode) +static void __lo_release(struct loop_device *lo) { - struct loop_device *lo = disk->private_data; int err; - mutex_lock(&lo->lo_ctl_mutex); - - if (--lo->lo_refcnt) - goto out; + if (atomic_dec_return(&lo->lo_refcnt)) + return; + mutex_lock(&lo->lo_ctl_mutex); if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) { /* * In autoclear mode, stop the loop thread @@ -1537,10 +1593,16 @@ loop_flush(lo); } -out: mutex_unlock(&lo->lo_ctl_mutex); } +static void lo_release(struct gendisk *disk, fmode_t mode) +{ + mutex_lock(&loop_index_mutex); + __lo_release(disk->private_data); + mutex_unlock(&loop_index_mutex); +} + static const struct block_device_operations lo_fops = { .owner = THIS_MODULE, .open = lo_open, @@ -1600,6 +1662,72 @@ EXPORT_SYMBOL(loop_register_transfer); EXPORT_SYMBOL(loop_unregister_transfer); +static int loop_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct loop_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); + struct loop_device *lo = cmd->rq->q->queuedata; + + blk_mq_start_request(bd->rq); + + if (lo->lo_state != Lo_bound) + return BLK_MQ_RQ_QUEUE_ERROR; + + if (lo->use_dio && !(cmd->rq->cmd_flags & (REQ_FLUSH | + REQ_DISCARD))) + cmd->use_aio = true; + else + cmd->use_aio = false; + + queue_kthread_work(&lo->worker, &cmd->work); + + return BLK_MQ_RQ_QUEUE_OK; +} + +static void loop_handle_cmd(struct loop_cmd *cmd) +{ + const bool write = cmd->rq->cmd_flags & REQ_WRITE; + struct loop_device *lo = cmd->rq->q->queuedata; + int ret = 0; + + if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY)) { + ret = -EIO; + goto failed; + } + + ret = do_req_filebacked(lo, cmd->rq); + failed: + /* complete non-aio request */ + if (!cmd->use_aio || ret) + blk_mq_complete_request(cmd->rq, ret ? -EIO : 0); +} + +static void loop_queue_work(struct kthread_work *work) +{ + struct loop_cmd *cmd = + container_of(work, struct loop_cmd, work); + + loop_handle_cmd(cmd); +} + +static int loop_init_request(void *data, struct request *rq, + unsigned int hctx_idx, unsigned int request_idx, + unsigned int numa_node) +{ + struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq); + + cmd->rq = rq; + init_kthread_work(&cmd->work, loop_queue_work); + + return 0; +} + +static struct blk_mq_ops loop_mq_ops = { + .queue_rq = loop_queue_rq, + .map_queue = blk_mq_map_queue, + .init_request = loop_init_request, +}; + static int loop_add(struct loop_device **l, int i) { struct loop_device *lo; @@ -1626,15 +1754,30 @@ i = err; err = -ENOMEM; - lo->lo_queue = blk_alloc_queue(GFP_KERNEL); - if (!lo->lo_queue) + lo->tag_set.ops = &loop_mq_ops; + lo->tag_set.nr_hw_queues = 1; + lo->tag_set.queue_depth = 128; + lo->tag_set.numa_node = NUMA_NO_NODE; + lo->tag_set.cmd_size = sizeof(struct loop_cmd); + lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; + lo->tag_set.driver_data = lo; + + err = blk_mq_alloc_tag_set(&lo->tag_set); + if (err) goto out_free_idr; + lo->lo_queue = blk_mq_init_queue(&lo->tag_set); + if (IS_ERR_OR_NULL(lo->lo_queue)) { + err = PTR_ERR(lo->lo_queue); + goto out_cleanup_tags; + } + lo->lo_queue->queuedata = lo; + /* - * set queue make_request_fn + * It doesn't make sense to enable merge because the I/O + * submitted to backing file is handled page by page. */ - blk_queue_make_request(lo->lo_queue, loop_make_request); - lo->lo_queue->queuedata = lo; + queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue); disk = lo->lo_disk = alloc_disk(1 << part_shift); if (!disk) @@ -1662,10 +1805,8 @@ disk->flags |= GENHD_FL_NO_PART_SCAN; disk->flags |= GENHD_FL_EXT_DEVT; mutex_init(&lo->lo_ctl_mutex); + atomic_set(&lo->lo_refcnt, 0); lo->lo_number = i; - lo->lo_thread = NULL; - init_waitqueue_head(&lo->lo_event); - init_waitqueue_head(&lo->lo_req_wait); spin_lock_init(&lo->lo_lock); disk->major = LOOP_MAJOR; disk->first_minor = i << part_shift; @@ -1679,6 +1820,8 @@ out_free_queue: blk_cleanup_queue(lo->lo_queue); +out_cleanup_tags: + blk_mq_free_tag_set(&lo->tag_set); out_free_idr: idr_remove(&loop_index_idr, i); out_free_dev: @@ -1689,8 +1832,9 @@ static void loop_remove(struct loop_device *lo) { - del_gendisk(lo->lo_disk); blk_cleanup_queue(lo->lo_queue); + del_gendisk(lo->lo_disk); + blk_mq_free_tag_set(&lo->tag_set); put_disk(lo->lo_disk); kfree(lo); } @@ -1779,7 +1923,7 @@ mutex_unlock(&lo->lo_ctl_mutex); break; } - if (lo->lo_refcnt > 0) { + if (atomic_read(&lo->lo_refcnt) > 0) { ret = -EBUSY; mutex_unlock(&lo->lo_ctl_mutex); break;