--- zzzz-none-000/linux-3.10.107/drivers/block/xen-blkfront.c	2017-06-27 09:49:32.000000000 +0000
+++ scorpion-7490-727/linux-3.10.107/drivers/block/xen-blkfront.c	2021-02-04 17:41:59.000000000 +0000
@@ -37,6 +37,7 @@
 
 #include <linux/interrupt.h>
 #include <linux/blkdev.h>
+#include <linux/blk-mq.h>
 #include <linux/hdreg.h>
 #include <linux/cdrom.h>
 #include <linux/module.h>
@@ -67,21 +68,56 @@
 
 struct grant {
 	grant_ref_t gref;
-	unsigned long pfn;
+	struct page *page;
 	struct list_head node;
 };
 
 struct blk_shadow {
 	struct blkif_request req;
 	struct request *request;
-	struct grant *grants_used[BLKIF_MAX_SEGMENTS_PER_REQUEST];
-	struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct grant **grants_used;
+	struct grant **indirect_grants;
+	struct scatterlist *sg;
+	unsigned int num_sg;
+};
+
+struct split_bio {
+	struct bio *bio;
+	atomic_t pending;
 };
 
 static DEFINE_MUTEX(blkfront_mutex);
 static const struct block_device_operations xlvbd_block_fops;
 
-#define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
+/*
+ * Maximum number of segments in indirect requests, the actual value used by
+ * the frontend driver is the minimum of this value and the value provided
+ * by the backend driver.
+ */
+
+static unsigned int xen_blkif_max_segments = 32;
+module_param_named(max, xen_blkif_max_segments, int, S_IRUGO);
+MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)");
+
+/*
+ * Maximum order of pages to be used for the shared ring between front and
+ * backend, 4KB page granularity is used.
+ */
+static unsigned int xen_blkif_max_ring_order;
+module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, S_IRUGO);
+MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring");
+
+#define BLK_RING_SIZE(info)	\
+	__CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * (info)->nr_ring_pages)
+
+#define BLK_MAX_RING_SIZE	\
+	__CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * XENBUS_MAX_RING_GRANTS)
+
+/*
+ * ring-ref%i i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19
+ * characters are enough. Define to 20 to keep consist with backend.
+ */
+#define RINGREF_NAME_LEN (20)
 
 /*
  * We have one of these per vbd, whether ide, scsi or 'other'.  They
@@ -97,32 +133,34 @@
 	int vdevice;
 	blkif_vdev_t handle;
 	enum blkif_state connected;
-	int ring_ref;
+	int ring_ref[XENBUS_MAX_RING_GRANTS];
+	unsigned int nr_ring_pages;
 	struct blkif_front_ring ring;
 	unsigned int evtchn, irq;
 	struct request_queue *rq;
 	struct work_struct work;
 	struct gnttab_free_callback callback;
-	struct blk_shadow shadow[BLK_RING_SIZE];
+	struct blk_shadow shadow[BLK_MAX_RING_SIZE];
 	struct list_head grants;
+	struct list_head indirect_pages;
 	unsigned int persistent_gnts_c;
 	unsigned long shadow_free;
 	unsigned int feature_flush;
-	unsigned int flush_op;
 	unsigned int feature_discard:1;
 	unsigned int feature_secdiscard:1;
 	unsigned int discard_granularity;
 	unsigned int discard_alignment;
 	unsigned int feature_persistent:1;
+	/* Number of 4KB segments handled */
+	unsigned int max_indirect_segments;
 	int is_ready;
+	struct blk_mq_tag_set tag_set;
 };
 
 static unsigned int nr_minors;
 static unsigned long *minors;
 static DEFINE_SPINLOCK(minor_lock);
 
-#define MAXIMUM_OUTSTANDING_BLOCK_REQS \
-	(BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
 #define GRANT_INVALID_REF	0
 
 #define PARTS_PER_DISK		16
@@ -142,10 +180,31 @@
 
 #define DEV_NAME	"xvd"	/* name in /dev */
 
+/*
+ * Grants are always the same size as a Xen page (i.e 4KB).
+ * A physical segment is always the same size as a Linux page.
+ * Number of grants per physical segment
+ */
+#define GRANTS_PER_PSEG	(PAGE_SIZE / XEN_PAGE_SIZE)
+
+#define GRANTS_PER_INDIRECT_FRAME \
+	(XEN_PAGE_SIZE / sizeof(struct blkif_request_segment))
+
+#define PSEGS_PER_INDIRECT_FRAME	\
+	(GRANTS_INDIRECT_FRAME / GRANTS_PSEGS)
+
+#define INDIRECT_GREFS(_grants)		\
+	DIV_ROUND_UP(_grants, GRANTS_PER_INDIRECT_FRAME)
+
+#define GREFS(_psegs)	((_psegs) * GRANTS_PER_PSEG)
+
+static int blkfront_setup_indirect(struct blkfront_info *info);
+static int blkfront_gather_backend_features(struct blkfront_info *info);
+
 static int get_id_from_freelist(struct blkfront_info *info)
 {
 	unsigned long free = info->shadow_free;
-	BUG_ON(free >= BLK_RING_SIZE);
+	BUG_ON(free >= BLK_RING_SIZE(info));
 	info->shadow_free = info->shadow[free].req.u.rw.id;
 	info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */
 	return free;
@@ -181,7 +240,7 @@
 				kfree(gnt_list_entry);
 				goto out_of_memory;
 			}
-			gnt_list_entry->pfn = page_to_pfn(granted_page);
+			gnt_list_entry->page = granted_page;
 		}
 
 		gnt_list_entry->gref = GRANT_INVALID_REF;
@@ -196,7 +255,7 @@
 	                         &info->grants, node) {
 		list_del(&gnt_list_entry->node);
 		if (info->feature_persistent)
-			__free_page(pfn_to_page(gnt_list_entry->pfn));
+			__free_page(gnt_list_entry->page);
 		kfree(gnt_list_entry);
 		i--;
 	}
@@ -204,33 +263,77 @@
 	return -ENOMEM;
 }
 
-static struct grant *get_grant(grant_ref_t *gref_head,
-                               unsigned long pfn,
-                               struct blkfront_info *info)
+static struct grant *get_free_grant(struct blkfront_info *info)
 {
 	struct grant *gnt_list_entry;
-	unsigned long buffer_mfn;
 
 	BUG_ON(list_empty(&info->grants));
-	gnt_list_entry = list_first_entry(&info->grants, struct grant, node);
+	gnt_list_entry = list_first_entry(&info->grants, struct grant,
+					  node);
 	list_del(&gnt_list_entry->node);
 
-	if (gnt_list_entry->gref != GRANT_INVALID_REF) {
+	if (gnt_list_entry->gref != GRANT_INVALID_REF)
 		info->persistent_gnts_c--;
+
+	return gnt_list_entry;
+}
+
+static inline void grant_foreign_access(const struct grant *gnt_list_entry,
+					const struct blkfront_info *info)
+{
+	gnttab_page_grant_foreign_access_ref_one(gnt_list_entry->gref,
+						 info->xbdev->otherend_id,
+						 gnt_list_entry->page,
+						 0);
+}
+
+static struct grant *get_grant(grant_ref_t *gref_head,
+			       unsigned long gfn,
+			       struct blkfront_info *info)
+{
+	struct grant *gnt_list_entry = get_free_grant(info);
+
+	if (gnt_list_entry->gref != GRANT_INVALID_REF)
 		return gnt_list_entry;
+
+	/* Assign a gref to this page */
+	gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head);
+	BUG_ON(gnt_list_entry->gref == -ENOSPC);
+	if (info->feature_persistent)
+		grant_foreign_access(gnt_list_entry, info);
+	else {
+		/* Grant access to the GFN passed by the caller */
+		gnttab_grant_foreign_access_ref(gnt_list_entry->gref,
+						info->xbdev->otherend_id,
+						gfn, 0);
 	}
 
+	return gnt_list_entry;
+}
+
+static struct grant *get_indirect_grant(grant_ref_t *gref_head,
+					struct blkfront_info *info)
+{
+	struct grant *gnt_list_entry = get_free_grant(info);
+
+	if (gnt_list_entry->gref != GRANT_INVALID_REF)
+		return gnt_list_entry;
+
 	/* Assign a gref to this page */
 	gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head);
 	BUG_ON(gnt_list_entry->gref == -ENOSPC);
 	if (!info->feature_persistent) {
-		BUG_ON(!pfn);
-		gnt_list_entry->pfn = pfn;
+		struct page *indirect_page;
+
+		/* Fetch a pre-allocated page to use for indirect grefs */
+		BUG_ON(list_empty(&info->indirect_pages));
+		indirect_page = list_first_entry(&info->indirect_pages,
+						 struct page, lru);
+		list_del(&indirect_page->lru);
+		gnt_list_entry->page = indirect_page;
 	}
-	buffer_mfn = pfn_to_mfn(gnt_list_entry->pfn);
-	gnttab_grant_foreign_access_ref(gnt_list_entry->gref,
-	                                info->xbdev->otherend_id,
-	                                buffer_mfn, 0);
+	grant_foreign_access(gnt_list_entry, info);
+
 	return gnt_list_entry;
 }
 
@@ -353,19 +456,128 @@
 	return 0;
 }
 
-/*
- * Generate a Xen blkfront IO request from a blk layer request.  Reads
- * and writes are handled as expected.
- *
- * @req: a request struct
- */
-static int blkif_queue_request(struct request *req)
+static int blkif_queue_discard_req(struct request *req)
 {
 	struct blkfront_info *info = req->rq_disk->private_data;
 	struct blkif_request *ring_req;
 	unsigned long id;
+
+	/* Fill out a communications ring structure. */
+	ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
+	id = get_id_from_freelist(info);
+	info->shadow[id].request = req;
+
+	ring_req->operation = BLKIF_OP_DISCARD;
+	ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
+	ring_req->u.discard.id = id;
+	ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req);
+	if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard)
+		ring_req->u.discard.flag = BLKIF_DISCARD_SECURE;
+	else
+		ring_req->u.discard.flag = 0;
+
+	info->ring.req_prod_pvt++;
+
+	/* Keep a private copy so we can reissue requests when recovering. */
+	info->shadow[id].req = *ring_req;
+
+	return 0;
+}
+
+struct setup_rw_req {
+	unsigned int grant_idx;
+	struct blkif_request_segment *segments;
+	struct blkfront_info *info;
+	struct blkif_request *ring_req;
+	grant_ref_t gref_head;
+	unsigned int id;
+	/* Only used when persistent grant is used and it's a read request */
+	bool need_copy;
+	unsigned int bvec_off;
+	char *bvec_data;
+};
+
+static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
+				     unsigned int len, void *data)
+{
+	struct setup_rw_req *setup = data;
+	int n, ref;
+	struct grant *gnt_list_entry;
 	unsigned int fsect, lsect;
-	int i, ref;
+	/* Convenient aliases */
+	unsigned int grant_idx = setup->grant_idx;
+	struct blkif_request *ring_req = setup->ring_req;
+	struct blkfront_info *info = setup->info;
+	struct blk_shadow *shadow = &info->shadow[setup->id];
+
+	if ((ring_req->operation == BLKIF_OP_INDIRECT) &&
+	    (grant_idx % GRANTS_PER_INDIRECT_FRAME == 0)) {
+		if (setup->segments)
+			kunmap_atomic(setup->segments);
+
+		n = grant_idx / GRANTS_PER_INDIRECT_FRAME;
+		gnt_list_entry = get_indirect_grant(&setup->gref_head, info);
+		shadow->indirect_grants[n] = gnt_list_entry;
+		setup->segments = kmap_atomic(gnt_list_entry->page);
+		ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref;
+	}
+
+	gnt_list_entry = get_grant(&setup->gref_head, gfn, info);
+	ref = gnt_list_entry->gref;
+	shadow->grants_used[grant_idx] = gnt_list_entry;
+
+	if (setup->need_copy) {
+		void *shared_data;
+
+		shared_data = kmap_atomic(gnt_list_entry->page);
+		/*
+		 * this does not wipe data stored outside the
+		 * range sg->offset..sg->offset+sg->length.
+		 * Therefore, blkback *could* see data from
+		 * previous requests. This is OK as long as
+		 * persistent grants are shared with just one
+		 * domain. It may need refactoring if this
+		 * changes
+		 */
+		memcpy(shared_data + offset,
+		       setup->bvec_data + setup->bvec_off,
+		       len);
+
+		kunmap_atomic(shared_data);
+		setup->bvec_off += len;
+	}
+
+	fsect = offset >> 9;
+	lsect = fsect + (len >> 9) - 1;
+	if (ring_req->operation != BLKIF_OP_INDIRECT) {
+		ring_req->u.rw.seg[grant_idx] =
+			(struct blkif_request_segment) {
+				.gref       = ref,
+				.first_sect = fsect,
+				.last_sect  = lsect };
+	} else {
+		setup->segments[grant_idx % GRANTS_PER_INDIRECT_FRAME] =
+			(struct blkif_request_segment) {
+				.gref       = ref,
+				.first_sect = fsect,
+				.last_sect  = lsect };
+	}
+
+	(setup->grant_idx)++;
+}
+
+static int blkif_queue_rw_req(struct request *req)
+{
+	struct blkfront_info *info = req->rq_disk->private_data;
+	struct blkif_request *ring_req;
+	unsigned long id;
+	int i;
+	struct setup_rw_req setup = {
+		.grant_idx = 0,
+		.segments = NULL,
+		.info = info,
+		.need_copy = rq_data_dir(req) && info->feature_persistent,
+	};
 
 	/*
 	 * Used to store if we are able to queue the request by just using
@@ -373,24 +585,28 @@
 	 * as there are not sufficiently many free.
 	 */
 	bool new_persistent_gnts;
-	grant_ref_t gref_head;
-	struct grant *gnt_list_entry = NULL;
 	struct scatterlist *sg;
+	int num_sg, max_grefs, num_grant;
 
-	if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
-		return 1;
+	max_grefs = req->nr_phys_segments * GRANTS_PER_PSEG;
+	if (max_grefs > BLKIF_MAX_SEGMENTS_PER_REQUEST)
+		/*
+		 * If we are using indirect segments we need to account
+		 * for the indirect grefs used in the request.
+		 */
+		max_grefs += INDIRECT_GREFS(max_grefs);
 
-	/* Check if we have enought grants to allocate a requests */
-	if (info->persistent_gnts_c < BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+	/* Check if we have enough grants to allocate a requests */
+	if (info->persistent_gnts_c < max_grefs) {
 		new_persistent_gnts = 1;
 		if (gnttab_alloc_grant_references(
-		    BLKIF_MAX_SEGMENTS_PER_REQUEST - info->persistent_gnts_c,
-		    &gref_head) < 0) {
+		    max_grefs - info->persistent_gnts_c,
+		    &setup.gref_head) < 0) {
 			gnttab_request_free_callback(
 				&info->callback,
 				blkif_restart_queue_callback,
 				info,
-				BLKIF_MAX_SEGMENTS_PER_REQUEST);
+				max_grefs);
 			return 1;
 		}
 	} else
@@ -401,81 +617,82 @@
 	id = get_id_from_freelist(info);
 	info->shadow[id].request = req;
 
-	ring_req->u.rw.id = id;
-	ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
-	ring_req->u.rw.handle = info->handle;
-
-	ring_req->operation = rq_data_dir(req) ?
-		BLKIF_OP_WRITE : BLKIF_OP_READ;
+	BUG_ON(info->max_indirect_segments == 0 &&
+	       GREFS(req->nr_phys_segments) > BLKIF_MAX_SEGMENTS_PER_REQUEST);
+	BUG_ON(info->max_indirect_segments &&
+	       GREFS(req->nr_phys_segments) > info->max_indirect_segments);
+
+	num_sg = blk_rq_map_sg(req->q, req, info->shadow[id].sg);
+	num_grant = 0;
+	/* Calculate the number of grant used */
+	for_each_sg(info->shadow[id].sg, sg, num_sg, i)
+	       num_grant += gnttab_count_grant(sg->offset, sg->length);
 
-	if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
+	ring_req->u.rw.id = id;
+	info->shadow[id].num_sg = num_sg;
+	if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
 		/*
-		 * Ideally we can do an unordered flush-to-disk. In case the
-		 * backend onlysupports barriers, use that. A barrier request
-		 * a superset of FUA, so we can implement it the same
-		 * way.  (It's also a FLUSH+FUA, since it is
-		 * guaranteed ordered WRT previous writes.)
+		 * The indirect operation can only be a BLKIF_OP_READ or
+		 * BLKIF_OP_WRITE
 		 */
-		ring_req->operation = info->flush_op;
-	}
-
-	if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) {
-		/* id, sector_number and handle are set above. */
-		ring_req->operation = BLKIF_OP_DISCARD;
-		ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
-		if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard)
-			ring_req->u.discard.flag = BLKIF_DISCARD_SECURE;
-		else
-			ring_req->u.discard.flag = 0;
+		BUG_ON(req->cmd_flags & (REQ_FLUSH | REQ_FUA));
+		ring_req->operation = BLKIF_OP_INDIRECT;
+		ring_req->u.indirect.indirect_op = rq_data_dir(req) ?
+			BLKIF_OP_WRITE : BLKIF_OP_READ;
+		ring_req->u.indirect.sector_number = (blkif_sector_t)blk_rq_pos(req);
+		ring_req->u.indirect.handle = info->handle;
+		ring_req->u.indirect.nr_segments = num_grant;
 	} else {
-		ring_req->u.rw.nr_segments = blk_rq_map_sg(req->q, req,
-							   info->shadow[id].sg);
-		BUG_ON(ring_req->u.rw.nr_segments >
-		       BLKIF_MAX_SEGMENTS_PER_REQUEST);
-
-		for_each_sg(info->shadow[id].sg, sg, ring_req->u.rw.nr_segments, i) {
-			fsect = sg->offset >> 9;
-			lsect = fsect + (sg->length >> 9) - 1;
-
-			gnt_list_entry = get_grant(&gref_head, page_to_pfn(sg_page(sg)), info);
-			ref = gnt_list_entry->gref;
-
-			info->shadow[id].grants_used[i] = gnt_list_entry;
-
-			if (rq_data_dir(req) && info->feature_persistent) {
-				char *bvec_data;
-				void *shared_data;
-
-				BUG_ON(sg->offset + sg->length > PAGE_SIZE);
+		ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
+		ring_req->u.rw.handle = info->handle;
+		ring_req->operation = rq_data_dir(req) ?
+			BLKIF_OP_WRITE : BLKIF_OP_READ;
+		if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
+			/*
+			 * Ideally we can do an unordered flush-to-disk.
+			 * In case the backend onlysupports barriers, use that.
+			 * A barrier request a superset of FUA, so we can
+			 * implement it the same way.  (It's also a FLUSH+FUA,
+			 * since it is guaranteed ordered WRT previous writes.)
+			 */
+			switch (info->feature_flush &
+				((REQ_FLUSH|REQ_FUA))) {
+			case REQ_FLUSH|REQ_FUA:
+				ring_req->operation =
+					BLKIF_OP_WRITE_BARRIER;
+				break;
+			case REQ_FLUSH:
+				ring_req->operation =
+					BLKIF_OP_FLUSH_DISKCACHE;
+				break;
+			default:
+				ring_req->operation = 0;
+			}
+		}
+		ring_req->u.rw.nr_segments = num_grant;
+	}
 
-				shared_data = kmap_atomic(
-					pfn_to_page(gnt_list_entry->pfn));
-				bvec_data = kmap_atomic(sg_page(sg));
+	setup.ring_req = ring_req;
+	setup.id = id;
+	for_each_sg(info->shadow[id].sg, sg, num_sg, i) {
+		BUG_ON(sg->offset + sg->length > PAGE_SIZE);
 
-				/*
-				 * this does not wipe data stored outside the
-				 * range sg->offset..sg->offset+sg->length.
-				 * Therefore, blkback *could* see data from
-				 * previous requests. This is OK as long as
-				 * persistent grants are shared with just one
-				 * domain. It may need refactoring if this
-				 * changes
-				 */
-				memcpy(shared_data + sg->offset,
-				       bvec_data   + sg->offset,
-				       sg->length);
+		if (setup.need_copy) {
+			setup.bvec_off = sg->offset;
+			setup.bvec_data = kmap_atomic(sg_page(sg));
+		}
 
-				kunmap_atomic(bvec_data);
-				kunmap_atomic(shared_data);
-			}
+		gnttab_foreach_grant_in_range(sg_page(sg),
+					      sg->offset,
+					      sg->length,
+					      blkif_setup_rw_req_grant,
+					      &setup);
 
-			ring_req->u.rw.seg[i] =
-					(struct blkif_request_segment) {
-						.gref       = ref,
-						.first_sect = fsect,
-						.last_sect  = lsect };
-		}
+		if (setup.need_copy)
+			kunmap_atomic(setup.bvec_data);
 	}
+	if (setup.segments)
+		kunmap_atomic(setup.segments);
 
 	info->ring.req_prod_pvt++;
 
@@ -483,11 +700,29 @@
 	info->shadow[id].req = *ring_req;
 
 	if (new_persistent_gnts)
-		gnttab_free_grant_references(gref_head);
+		gnttab_free_grant_references(setup.gref_head);
 
 	return 0;
 }
 
+/*
+ * Generate a Xen blkfront IO request from a blk layer request.  Reads
+ * and writes are handled as expected.
+ *
+ * @req: a request struct
+ */
+static int blkif_queue_request(struct request *req)
+{
+	struct blkfront_info *info = req->rq_disk->private_data;
+
+	if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
+		return 1;
+
+	if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE)))
+		return blkif_queue_discard_req(req);
+	else
+		return blkif_queue_rw_req(req);
+}
 
 static inline void flush_requests(struct blkfront_info *info)
 {
@@ -499,64 +734,74 @@
 		notify_remote_via_irq(info->irq);
 }
 
-/*
- * do_blkif_request
- *  read a block; request is in a request queue
- */
-static void do_blkif_request(struct request_queue *rq)
+static inline bool blkif_request_flush_invalid(struct request *req,
+					       struct blkfront_info *info)
 {
-	struct blkfront_info *info = NULL;
-	struct request *req;
-	int queued;
-
-	pr_debug("Entered do_blkif_request\n");
-
-	queued = 0;
+	return ((req->cmd_type != REQ_TYPE_FS) ||
+		((req->cmd_flags & REQ_FLUSH) &&
+		 !(info->feature_flush & REQ_FLUSH)) ||
+		((req->cmd_flags & REQ_FUA) &&
+		 !(info->feature_flush & REQ_FUA)));
+}
 
-	while ((req = blk_peek_request(rq)) != NULL) {
-		info = req->rq_disk->private_data;
+static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx,
+			   const struct blk_mq_queue_data *qd)
+{
+	struct blkfront_info *info = qd->rq->rq_disk->private_data;
 
-		if (RING_FULL(&info->ring))
-			goto wait;
+	blk_mq_start_request(qd->rq);
+	spin_lock_irq(&info->io_lock);
+	if (RING_FULL(&info->ring))
+		goto out_busy;
 
-		blk_start_request(req);
+	if (blkif_request_flush_invalid(qd->rq, info))
+		goto out_err;
 
-		if ((req->cmd_type != REQ_TYPE_FS) ||
-		    ((req->cmd_flags & (REQ_FLUSH | REQ_FUA)) &&
-		    !info->flush_op)) {
-			__blk_end_request_all(req, -EIO);
-			continue;
-		}
+	if (blkif_queue_request(qd->rq))
+		goto out_busy;
 
-		pr_debug("do_blk_req %p: cmd %p, sec %lx, "
-			 "(%u/%u) buffer:%p [%s]\n",
-			 req, req->cmd, (unsigned long)blk_rq_pos(req),
-			 blk_rq_cur_sectors(req), blk_rq_sectors(req),
-			 req->buffer, rq_data_dir(req) ? "write" : "read");
-
-		if (blkif_queue_request(req)) {
-			blk_requeue_request(rq, req);
-wait:
-			/* Avoid pointless unplugs. */
-			blk_stop_queue(rq);
-			break;
-		}
+	flush_requests(info);
+	spin_unlock_irq(&info->io_lock);
+	return BLK_MQ_RQ_QUEUE_OK;
 
-		queued++;
-	}
+out_err:
+	spin_unlock_irq(&info->io_lock);
+	return BLK_MQ_RQ_QUEUE_ERROR;
 
-	if (queued != 0)
-		flush_requests(info);
+out_busy:
+	spin_unlock_irq(&info->io_lock);
+	blk_mq_stop_hw_queue(hctx);
+	return BLK_MQ_RQ_QUEUE_BUSY;
 }
 
-static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
+static struct blk_mq_ops blkfront_mq_ops = {
+	.queue_rq = blkif_queue_rq,
+	.map_queue = blk_mq_map_queue,
+};
+
+static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
+				unsigned int physical_sector_size,
+				unsigned int segments)
 {
 	struct request_queue *rq;
 	struct blkfront_info *info = gd->private_data;
 
-	rq = blk_init_queue(do_blkif_request, &info->io_lock);
-	if (rq == NULL)
+	memset(&info->tag_set, 0, sizeof(info->tag_set));
+	info->tag_set.ops = &blkfront_mq_ops;
+	info->tag_set.nr_hw_queues = 1;
+	info->tag_set.queue_depth =  BLK_RING_SIZE(info);
+	info->tag_set.numa_node = NUMA_NO_NODE;
+	info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+	info->tag_set.cmd_size = 0;
+	info->tag_set.driver_data = info;
+
+	if (blk_mq_alloc_tag_set(&info->tag_set))
+		return -1;
+	rq = blk_mq_init_queue(&info->tag_set);
+	if (IS_ERR(rq)) {
+		blk_mq_free_tag_set(&info->tag_set);
 		return -1;
+	}
 
 	queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);
 
@@ -571,14 +816,15 @@
 
 	/* Hard sector size and max sectors impersonate the equiv. hardware. */
 	blk_queue_logical_block_size(rq, sector_size);
-	blk_queue_max_hw_sectors(rq, 512);
+	blk_queue_physical_block_size(rq, physical_sector_size);
+	blk_queue_max_hw_sectors(rq, (segments * XEN_PAGE_SIZE) / 512);
 
 	/* Each segment in a request is up to an aligned page in size. */
 	blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
 	blk_queue_max_segment_size(rq, PAGE_SIZE);
 
 	/* Ensure a merged request will fit in a single I/O ring slot. */
-	blk_queue_max_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+	blk_queue_max_segments(rq, segments / GRANTS_PER_PSEG);
 
 	/* Make sure buffer addresses are sector-aligned. */
 	blk_queue_dma_alignment(rq, 511);
@@ -591,17 +837,26 @@
 	return 0;
 }
 
+static const char *flush_info(unsigned int feature_flush)
+{
+	switch (feature_flush & ((REQ_FLUSH | REQ_FUA))) {
+	case REQ_FLUSH|REQ_FUA:
+		return "barrier: enabled;";
+	case REQ_FLUSH:
+		return "flush diskcache: enabled;";
+	default:
+		return "barrier or flush: disabled;";
+	}
+}
 
 static void xlvbd_flush(struct blkfront_info *info)
 {
 	blk_queue_flush(info->rq, info->feature_flush);
-	printk(KERN_INFO "blkfront: %s: %s: %s %s\n",
-	       info->gd->disk_name,
-	       info->flush_op == BLKIF_OP_WRITE_BARRIER ?
-		"barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ?
-		"flush diskcache" : "barrier or flush"),
-	       info->feature_flush ? "enabled" : "disabled",
-	       info->feature_persistent ? "using persistent grants" : "");
+	pr_info("blkfront: %s: %s %s %s %s %s\n",
+		info->gd->disk_name, flush_info(info->feature_flush),
+		"persistent grants:", info->feature_persistent ?
+		"enabled;" : "disabled;", "indirect descriptors:",
+		info->max_indirect_segments ? "enabled;" : "disabled;");
 }
 
 static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
@@ -674,7 +929,8 @@
 
 static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 			       struct blkfront_info *info,
-			       u16 vdisk_info, u16 sector_size)
+			       u16 vdisk_info, u16 sector_size,
+			       unsigned int physical_sector_size)
 {
 	struct gendisk *gd;
 	int nr_minors = 1;
@@ -741,7 +997,9 @@
 	gd->driverfs_dev = &(info->xbdev->dev);
 	set_capacity(gd, capacity);
 
-	if (xlvbd_init_blk_queue(gd, sector_size)) {
+	if (xlvbd_init_blk_queue(gd, sector_size, physical_sector_size,
+				 info->max_indirect_segments ? :
+				 BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
 		del_gendisk(gd);
 		goto release;
 	}
@@ -771,19 +1029,15 @@
 static void xlvbd_release_gendisk(struct blkfront_info *info)
 {
 	unsigned int minor, nr_minors;
-	unsigned long flags;
 
 	if (info->rq == NULL)
 		return;
 
-	spin_lock_irqsave(&info->io_lock, flags);
-
 	/* No more blkif_request(). */
-	blk_stop_queue(info->rq);
+	blk_mq_stop_hw_queues(info->rq);
 
 	/* No more gnttab callback work. */
 	gnttab_cancel_free_callback(&info->callback);
-	spin_unlock_irqrestore(&info->io_lock, flags);
 
 	/* Flush gnttab callback work. Must be done with no locks held. */
 	flush_work(&info->work);
@@ -795,20 +1049,18 @@
 	xlbd_release_minors(minor, nr_minors);
 
 	blk_cleanup_queue(info->rq);
+	blk_mq_free_tag_set(&info->tag_set);
 	info->rq = NULL;
 
 	put_disk(info->gd);
 	info->gd = NULL;
 }
 
+/* Must be called with io_lock holded */
 static void kick_pending_request_queues(struct blkfront_info *info)
 {
-	if (!RING_FULL(&info->ring)) {
-		/* Re-enable calldowns. */
-		blk_start_queue(info->rq);
-		/* Kick things off immediately. */
-		do_blkif_request(info->rq);
-	}
+	if (!RING_FULL(&info->ring))
+		blk_mq_start_stopped_hw_queues(info->rq, true);
 }
 
 static void blkif_restart_queue(struct work_struct *work)
@@ -825,6 +1077,7 @@
 {
 	struct grant *persistent_gnt;
 	struct grant *n;
+	int i, j, segs;
 
 	/* Prevent new requests being issued until we fix things up. */
 	spin_lock_irq(&info->io_lock);
@@ -832,7 +1085,7 @@
 		BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
 	/* No more blkif_request(). */
 	if (info->rq)
-		blk_stop_queue(info->rq);
+		blk_mq_stop_hw_queues(info->rq);
 
 	/* Remove all persistent grants */
 	if (!list_empty(&info->grants)) {
@@ -845,12 +1098,68 @@
 				info->persistent_gnts_c--;
 			}
 			if (info->feature_persistent)
-				__free_page(pfn_to_page(persistent_gnt->pfn));
+				__free_page(persistent_gnt->page);
 			kfree(persistent_gnt);
 		}
 	}
 	BUG_ON(info->persistent_gnts_c != 0);
 
+	/*
+	 * Remove indirect pages, this only happens when using indirect
+	 * descriptors but not persistent grants
+	 */
+	if (!list_empty(&info->indirect_pages)) {
+		struct page *indirect_page, *n;
+
+		BUG_ON(info->feature_persistent);
+		list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) {
+			list_del(&indirect_page->lru);
+			__free_page(indirect_page);
+		}
+	}
+
+	for (i = 0; i < BLK_RING_SIZE(info); i++) {
+		/*
+		 * Clear persistent grants present in requests already
+		 * on the shared ring
+		 */
+		if (!info->shadow[i].request)
+			goto free_shadow;
+
+		segs = info->shadow[i].req.operation == BLKIF_OP_INDIRECT ?
+		       info->shadow[i].req.u.indirect.nr_segments :
+		       info->shadow[i].req.u.rw.nr_segments;
+		for (j = 0; j < segs; j++) {
+			persistent_gnt = info->shadow[i].grants_used[j];
+			gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
+			if (info->feature_persistent)
+				__free_page(persistent_gnt->page);
+			kfree(persistent_gnt);
+		}
+
+		if (info->shadow[i].req.operation != BLKIF_OP_INDIRECT)
+			/*
+			 * If this is not an indirect operation don't try to
+			 * free indirect segments
+			 */
+			goto free_shadow;
+
+		for (j = 0; j < INDIRECT_GREFS(segs); j++) {
+			persistent_gnt = info->shadow[i].indirect_grants[j];
+			gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
+			__free_page(persistent_gnt->page);
+			kfree(persistent_gnt);
+		}
+
+free_shadow:
+		kfree(info->shadow[i].grants_used);
+		info->shadow[i].grants_used = NULL;
+		kfree(info->shadow[i].indirect_grants);
+		info->shadow[i].indirect_grants = NULL;
+		kfree(info->shadow[i].sg);
+		info->shadow[i].sg = NULL;
+	}
+
 	/* No more gnttab callback work. */
 	gnttab_cancel_free_callback(&info->callback);
 	spin_unlock_irq(&info->io_lock);
@@ -859,50 +1168,80 @@
 	flush_work(&info->work);
 
 	/* Free resources associated with old device channel. */
-	if (info->ring_ref != GRANT_INVALID_REF) {
-		gnttab_end_foreign_access(info->ring_ref, 0,
-					  (unsigned long)info->ring.sring);
-		info->ring_ref = GRANT_INVALID_REF;
-		info->ring.sring = NULL;
+	for (i = 0; i < info->nr_ring_pages; i++) {
+		if (info->ring_ref[i] != GRANT_INVALID_REF) {
+			gnttab_end_foreign_access(info->ring_ref[i], 0, 0);
+			info->ring_ref[i] = GRANT_INVALID_REF;
+		}
 	}
+	free_pages((unsigned long)info->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE));
+	info->ring.sring = NULL;
+
 	if (info->irq)
 		unbind_from_irqhandler(info->irq, info);
 	info->evtchn = info->irq = 0;
 
 }
 
+struct copy_from_grant {
+	const struct blk_shadow *s;
+	unsigned int grant_idx;
+	unsigned int bvec_offset;
+	char *bvec_data;
+};
+
+static void blkif_copy_from_grant(unsigned long gfn, unsigned int offset,
+				  unsigned int len, void *data)
+{
+	struct copy_from_grant *info = data;
+	char *shared_data;
+	/* Convenient aliases */
+	const struct blk_shadow *s = info->s;
+
+	shared_data = kmap_atomic(s->grants_used[info->grant_idx]->page);
+
+	memcpy(info->bvec_data + info->bvec_offset,
+	       shared_data + offset, len);
+
+	info->bvec_offset += len;
+	info->grant_idx++;
+
+	kunmap_atomic(shared_data);
+}
+
 static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
 			     struct blkif_response *bret)
 {
 	int i = 0;
 	struct scatterlist *sg;
-	char *bvec_data;
-	void *shared_data;
-	int nseg;
-
-	nseg = s->req.u.rw.nr_segments;
+	int num_sg, num_grant;
+	struct copy_from_grant data = {
+		.s = s,
+		.grant_idx = 0,
+	};
+
+	num_grant = s->req.operation == BLKIF_OP_INDIRECT ?
+		s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments;
+	num_sg = s->num_sg;
 
 	if (bret->operation == BLKIF_OP_READ && info->feature_persistent) {
-		/*
-		 * Copy the data received from the backend into the bvec.
-		 * Since bv_offset can be different than 0, and bv_len different
-		 * than PAGE_SIZE, we have to keep track of the current offset,
-		 * to be sure we are copying the data from the right shared page.
-		 */
-		for_each_sg(s->sg, sg, nseg, i) {
+		for_each_sg(s->sg, sg, num_sg, i) {
 			BUG_ON(sg->offset + sg->length > PAGE_SIZE);
-			shared_data = kmap_atomic(
-				pfn_to_page(s->grants_used[i]->pfn));
-			bvec_data = kmap_atomic(sg_page(sg));
-			memcpy(bvec_data   + sg->offset,
-			       shared_data + sg->offset,
-			       sg->length);
-			kunmap_atomic(bvec_data);
-			kunmap_atomic(shared_data);
+
+			data.bvec_offset = sg->offset;
+			data.bvec_data = kmap_atomic(sg_page(sg));
+
+			gnttab_foreach_grant_in_range(sg_page(sg),
+						      sg->offset,
+						      sg->length,
+						      blkif_copy_from_grant,
+						      &data);
+
+			kunmap_atomic(data.bvec_data);
 		}
 	}
 	/* Add the persistent grant into the list of free grants */
-	for (i = 0; i < nseg; i++) {
+	for (i = 0; i < num_grant; i++) {
 		if (gnttab_query_foreign_access(s->grants_used[i]->gref)) {
 			/*
 			 * If the grant is still mapped by the backend (the
@@ -927,6 +1266,31 @@
 			list_add_tail(&s->grants_used[i]->node, &info->grants);
 		}
 	}
+	if (s->req.operation == BLKIF_OP_INDIRECT) {
+		for (i = 0; i < INDIRECT_GREFS(num_grant); i++) {
+			if (gnttab_query_foreign_access(s->indirect_grants[i]->gref)) {
+				if (!info->feature_persistent)
+					pr_alert_ratelimited("backed has not unmapped grant: %u\n",
+							     s->indirect_grants[i]->gref);
+				list_add(&s->indirect_grants[i]->node, &info->grants);
+				info->persistent_gnts_c++;
+			} else {
+				struct page *indirect_page;
+
+				gnttab_end_foreign_access(s->indirect_grants[i]->gref, 0, 0UL);
+				/*
+				 * Add the used indirect page back to the list of
+				 * available pages for indirect grefs.
+				 */
+				if (!info->feature_persistent) {
+					indirect_page = s->indirect_grants[i]->page;
+					list_add(&indirect_page->lru, &info->indirect_pages);
+				}
+				s->indirect_grants[i]->gref = GRANT_INVALID_REF;
+				list_add_tail(&s->indirect_grants[i]->node, &info->grants);
+			}
+		}
+	}
 }
 
 static irqreturn_t blkif_interrupt(int irq, void *dev_id)
@@ -959,7 +1323,7 @@
 		 * never have given to it (we stamp it up to BLK_RING_SIZE -
 		 * look in get_id_from_freelist.
 		 */
-		if (id >= BLK_RING_SIZE) {
+		if (id >= BLK_RING_SIZE(info)) {
 			WARN(1, "%s: response to %s has incorrect id (%ld)\n",
 			     info->gd->disk_name, op_name(bret->operation), id);
 			/* We can't safely get the 'struct request' as
@@ -990,7 +1354,7 @@
 				queue_flag_clear(QUEUE_FLAG_DISCARD, rq);
 				queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq);
 			}
-			__blk_end_request_all(req, error);
+			blk_mq_complete_request(req, error);
 			break;
 		case BLKIF_OP_FLUSH_DISKCACHE:
 		case BLKIF_OP_WRITE_BARRIER:
@@ -1009,7 +1373,6 @@
 				if (error == -EOPNOTSUPP)
 					error = 0;
 				info->feature_flush = 0;
-				info->flush_op = 0;
 				xlvbd_flush(info);
 			}
 			/* fall through */
@@ -1019,7 +1382,7 @@
 				dev_dbg(&info->xbdev->dev, "Bad return from blkdev data "
 					"request: %x\n", bret->status);
 
-			__blk_end_request_all(req, error);
+			blk_mq_complete_request(req, error);
 			break;
 		default:
 			BUG();
@@ -1049,27 +1412,29 @@
 {
 	struct blkif_sring *sring;
 	int err, i;
+	unsigned long ring_size = info->nr_ring_pages * XEN_PAGE_SIZE;
+	grant_ref_t gref[XENBUS_MAX_RING_GRANTS];
 
-	info->ring_ref = GRANT_INVALID_REF;
+	for (i = 0; i < info->nr_ring_pages; i++)
+		info->ring_ref[i] = GRANT_INVALID_REF;
 
-	sring = (struct blkif_sring *)__get_free_page(GFP_NOIO | __GFP_HIGH);
+	sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH,
+						       get_order(ring_size));
 	if (!sring) {
 		xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
 		return -ENOMEM;
 	}
 	SHARED_RING_INIT(sring);
-	FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
-
-	for (i = 0; i < BLK_RING_SIZE; i++)
-		sg_init_table(info->shadow[i].sg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+	FRONT_RING_INIT(&info->ring, sring, ring_size);
 
-	err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
+	err = xenbus_grant_ring(dev, info->ring.sring, info->nr_ring_pages, gref);
 	if (err < 0) {
-		free_page((unsigned long)sring);
+		free_pages((unsigned long)sring, get_order(ring_size));
 		info->ring.sring = NULL;
 		goto fail;
 	}
-	info->ring_ref = err;
+	for (i = 0; i < info->nr_ring_pages; i++)
+		info->ring_ref[i] = gref[i];
 
 	err = xenbus_alloc_evtchn(dev, &info->evtchn);
 	if (err)
@@ -1097,7 +1462,18 @@
 {
 	const char *message = NULL;
 	struct xenbus_transaction xbt;
-	int err;
+	int err, i;
+	unsigned int max_page_order = 0;
+	unsigned int ring_page_order = 0;
+
+	err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+			   "max-ring-page-order", "%u", &max_page_order);
+	if (err != 1)
+		info->nr_ring_pages = 1;
+	else {
+		ring_page_order = min(xen_blkif_max_ring_order, max_page_order);
+		info->nr_ring_pages = 1 << ring_page_order;
+	}
 
 	/* Create shared ring, alloc event channel. */
 	err = setup_blkring(dev, info);
@@ -1111,11 +1487,32 @@
 		goto destroy_blkring;
 	}
 
-	err = xenbus_printf(xbt, dev->nodename,
-			    "ring-ref", "%u", info->ring_ref);
-	if (err) {
-		message = "writing ring-ref";
-		goto abort_transaction;
+	if (info->nr_ring_pages == 1) {
+		err = xenbus_printf(xbt, dev->nodename,
+				    "ring-ref", "%u", info->ring_ref[0]);
+		if (err) {
+			message = "writing ring-ref";
+			goto abort_transaction;
+		}
+	} else {
+		err = xenbus_printf(xbt, dev->nodename,
+				    "ring-page-order", "%u", ring_page_order);
+		if (err) {
+			message = "writing ring-page-order";
+			goto abort_transaction;
+		}
+
+		for (i = 0; i < info->nr_ring_pages; i++) {
+			char ring_ref_name[RINGREF_NAME_LEN];
+
+			snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
+			err = xenbus_printf(xbt, dev->nodename, ring_ref_name,
+					    "%u", info->ring_ref[i]);
+			if (err) {
+				message = "writing ring-ref";
+				goto abort_transaction;
+			}
+		}
 	}
 	err = xenbus_printf(xbt, dev->nodename,
 			    "event-channel", "%u", info->evtchn);
@@ -1143,6 +1540,9 @@
 		goto destroy_blkring;
 	}
 
+	for (i = 0; i < BLK_RING_SIZE(info); i++)
+		info->shadow[i].req.u.rw.id = i+1;
+	info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
 	xenbus_switch_state(dev, XenbusStateInitialised);
 
 	return 0;
@@ -1166,7 +1566,7 @@
 static int blkfront_probe(struct xenbus_device *dev,
 			  const struct xenbus_device_id *id)
 {
-	int err, vdevice, i;
+	int err, vdevice;
 	struct blkfront_info *info;
 
 	/* FIXME: Use dynamic device id if this is not set. */
@@ -1186,7 +1586,7 @@
 		char *type;
 		int len;
 		/* no unplug has been done: do not hook devices != xen vbds */
-		if (xen_platform_pci_unplug & XEN_UNPLUG_UNNECESSARY) {
+		if (xen_has_pv_and_legacy_disk_devices()) {
 			int major;
 
 			if (!VDEV_IS_EXTENDED(vdevice))
@@ -1197,7 +1597,7 @@
 			if (major != XENVBD_MAJOR) {
 				printk(KERN_INFO
 						"%s: HVM does not support vbd %d as xen block device\n",
-						__FUNCTION__, vdevice);
+						__func__, vdevice);
 				return -ENODEV;
 			}
 		}
@@ -1222,36 +1622,43 @@
 	info->xbdev = dev;
 	info->vdevice = vdevice;
 	INIT_LIST_HEAD(&info->grants);
+	INIT_LIST_HEAD(&info->indirect_pages);
 	info->persistent_gnts_c = 0;
 	info->connected = BLKIF_STATE_DISCONNECTED;
 	INIT_WORK(&info->work, blkif_restart_queue);
 
-	for (i = 0; i < BLK_RING_SIZE; i++)
-		info->shadow[i].req.u.rw.id = i+1;
-	info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
-
 	/* Front end dir is a number, which is used as the id. */
 	info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
 	dev_set_drvdata(&dev->dev, info);
 
-	err = talk_to_blkback(dev, info);
-	if (err) {
-		kfree(info);
-		dev_set_drvdata(&dev->dev, NULL);
-		return err;
-	}
-
 	return 0;
 }
 
+static void split_bio_end(struct bio *bio)
+{
+	struct split_bio *split_bio = bio->bi_private;
+
+	if (atomic_dec_and_test(&split_bio->pending)) {
+		split_bio->bio->bi_phys_segments = 0;
+		split_bio->bio->bi_error = bio->bi_error;
+		bio_endio(split_bio->bio);
+		kfree(split_bio);
+	}
+	bio_put(bio);
+}
 
 static int blkif_recover(struct blkfront_info *info)
 {
 	int i;
-	struct blkif_request *req;
+	struct request *req, *n;
 	struct blk_shadow *copy;
-	unsigned int persistent;
-	int j, rc;
+	int rc;
+	struct bio *bio, *cloned_bio;
+	struct bio_list bio_list, merge_bio;
+	unsigned int segs, offset;
+	int pending, size;
+	struct split_bio *split_bio;
+	struct list_head requests;
 
 	/* Stage 1: Make a safe copy of the shadow state. */
 	copy = kmemdup(info->shadow, sizeof(info->shadow),
@@ -1261,55 +1668,43 @@
 
 	/* Stage 2: Set up free list. */
 	memset(&info->shadow, 0, sizeof(info->shadow));
-	for (i = 0; i < BLK_RING_SIZE; i++)
+	for (i = 0; i < BLK_RING_SIZE(info); i++)
 		info->shadow[i].req.u.rw.id = i+1;
 	info->shadow_free = info->ring.req_prod_pvt;
-	info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
-
-	/* Check if the backend supports persistent grants */
-	rc = xenbus_gather(XBT_NIL, info->xbdev->otherend,
-			   "feature-persistent", "%u", &persistent,
-			   NULL);
-	if (rc)
-		info->feature_persistent = 0;
-	else
-		info->feature_persistent = persistent;
+	info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
 
-	/* Allocate memory for grants */
-	rc = fill_grant_buffer(info, BLK_RING_SIZE *
-	                             BLKIF_MAX_SEGMENTS_PER_REQUEST);
+	rc = blkfront_gather_backend_features(info);
 	if (rc) {
-		xenbus_dev_fatal(info->xbdev, rc, "setting grant buffer failed");
 		kfree(copy);
 		return rc;
 	}
 
-	/* Stage 3: Find pending requests and requeue them. */
-	for (i = 0; i < BLK_RING_SIZE; i++) {
+	segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	blk_queue_max_segments(info->rq, segs);
+	bio_list_init(&bio_list);
+	INIT_LIST_HEAD(&requests);
+	for (i = 0; i < BLK_RING_SIZE(info); i++) {
 		/* Not in use? */
 		if (!copy[i].request)
 			continue;
 
-		/* Grab a request slot and copy shadow state into it. */
-		req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
-		*req = copy[i].req;
-
-		/* We get a new request id, and must reset the shadow state. */
-		req->u.rw.id = get_id_from_freelist(info);
-		memcpy(&info->shadow[req->u.rw.id], &copy[i], sizeof(copy[i]));
-
-		if (req->operation != BLKIF_OP_DISCARD) {
-		/* Rewrite any grant references invalidated by susp/resume. */
-			for (j = 0; j < req->u.rw.nr_segments; j++)
-				gnttab_grant_foreign_access_ref(
-					req->u.rw.seg[j].gref,
-					info->xbdev->otherend_id,
-					pfn_to_mfn(copy[i].grants_used[j]->pfn),
-					0);
+		/*
+		 * Get the bios in the request so we can re-queue them.
+		 */
+		if (copy[i].request->cmd_flags &
+		    (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
+			/*
+			 * Flush operations don't contain bios, so
+			 * we need to requeue the whole request
+			 */
+			list_add(&copy[i].request->queuelist, &requests);
+			continue;
 		}
-		info->shadow[req->u.rw.id].req = *req;
-
-		info->ring.req_prod_pvt++;
+		merge_bio.head = copy[i].request->bio;
+		merge_bio.tail = copy[i].request->biotail;
+		bio_list_merge(&bio_list, &merge_bio);
+		copy[i].request->bio = NULL;
+		blk_end_request_all(copy[i].request, 0);
 	}
 
 	kfree(copy);
@@ -1321,13 +1716,50 @@
 	/* Now safe for us to use the shared ring */
 	info->connected = BLKIF_STATE_CONNECTED;
 
-	/* Send off requeued requests */
-	flush_requests(info);
-
 	/* Kick any other new requests queued since we resumed */
 	kick_pending_request_queues(info);
 
+	list_for_each_entry_safe(req, n, &requests, queuelist) {
+		/* Requeue pending requests (flush or discard) */
+		list_del_init(&req->queuelist);
+		BUG_ON(req->nr_phys_segments > segs);
+		blk_mq_requeue_request(req);
+	}
 	spin_unlock_irq(&info->io_lock);
+	blk_mq_kick_requeue_list(info->rq);
+
+	while ((bio = bio_list_pop(&bio_list)) != NULL) {
+		/* Traverse the list of pending bios and re-queue them */
+		if (bio_segments(bio) > segs) {
+			/*
+			 * This bio has more segments than what we can
+			 * handle, we have to split it.
+			 */
+			pending = (bio_segments(bio) + segs - 1) / segs;
+			split_bio = kzalloc(sizeof(*split_bio), GFP_NOIO);
+			BUG_ON(split_bio == NULL);
+			atomic_set(&split_bio->pending, pending);
+			split_bio->bio = bio;
+			for (i = 0; i < pending; i++) {
+				offset = (i * segs * XEN_PAGE_SIZE) >> 9;
+				size = min((unsigned int)(segs * XEN_PAGE_SIZE) >> 9,
+					   (unsigned int)bio_sectors(bio) - offset);
+				cloned_bio = bio_clone(bio, GFP_NOIO);
+				BUG_ON(cloned_bio == NULL);
+				bio_trim(cloned_bio, offset, size);
+				cloned_bio->bi_private = split_bio;
+				cloned_bio->bi_end_io = split_bio_end;
+				submit_bio(cloned_bio->bi_rw, cloned_bio);
+			}
+			/*
+			 * Now we have to wait for all those smaller bios to
+			 * end, so we can also end the "parent" bio.
+			 */
+			continue;
+		}
+		/* We don't need to split this bio */
+		submit_bio(bio->bi_rw, bio);
+	}
 
 	return 0;
 }
@@ -1399,36 +1831,159 @@
 static void blkfront_setup_discard(struct blkfront_info *info)
 {
 	int err;
-	char *type;
 	unsigned int discard_granularity;
 	unsigned int discard_alignment;
 	unsigned int discard_secure;
 
-	type = xenbus_read(XBT_NIL, info->xbdev->otherend, "type", NULL);
-	if (IS_ERR(type))
-		return;
+	info->feature_discard = 1;
+	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+		"discard-granularity", "%u", &discard_granularity,
+		"discard-alignment", "%u", &discard_alignment,
+		NULL);
+	if (!err) {
+		info->discard_granularity = discard_granularity;
+		info->discard_alignment = discard_alignment;
+	}
+	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+		    "discard-secure", "%d", &discard_secure,
+		    NULL);
+	if (!err)
+		info->feature_secdiscard = !!discard_secure;
+}
 
-	info->feature_secdiscard = 0;
-	if (strncmp(type, "phy", 3) == 0) {
-		err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
-			"discard-granularity", "%u", &discard_granularity,
-			"discard-alignment", "%u", &discard_alignment,
-			NULL);
-		if (!err) {
-			info->feature_discard = 1;
-			info->discard_granularity = discard_granularity;
-			info->discard_alignment = discard_alignment;
+static int blkfront_setup_indirect(struct blkfront_info *info)
+{
+	unsigned int psegs, grants;
+	int err, i;
+
+	if (info->max_indirect_segments == 0)
+		grants = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	else
+		grants = info->max_indirect_segments;
+	psegs = grants / GRANTS_PER_PSEG;
+
+	err = fill_grant_buffer(info,
+				(grants + INDIRECT_GREFS(grants)) * BLK_RING_SIZE(info));
+	if (err)
+		goto out_of_memory;
+
+	if (!info->feature_persistent && info->max_indirect_segments) {
+		/*
+		 * We are using indirect descriptors but not persistent
+		 * grants, we need to allocate a set of pages that can be
+		 * used for mapping indirect grefs
+		 */
+		int num = INDIRECT_GREFS(grants) * BLK_RING_SIZE(info);
+
+		BUG_ON(!list_empty(&info->indirect_pages));
+		for (i = 0; i < num; i++) {
+			struct page *indirect_page = alloc_page(GFP_NOIO);
+			if (!indirect_page)
+				goto out_of_memory;
+			list_add(&indirect_page->lru, &info->indirect_pages);
 		}
-		err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
-			    "discard-secure", "%d", &discard_secure,
-			    NULL);
-		if (!err)
-			info->feature_secdiscard = discard_secure;
+	}
+
+	for (i = 0; i < BLK_RING_SIZE(info); i++) {
+		info->shadow[i].grants_used = kzalloc(
+			sizeof(info->shadow[i].grants_used[0]) * grants,
+			GFP_NOIO);
+		info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * psegs, GFP_NOIO);
+		if (info->max_indirect_segments)
+			info->shadow[i].indirect_grants = kzalloc(
+				sizeof(info->shadow[i].indirect_grants[0]) *
+				INDIRECT_GREFS(grants),
+				GFP_NOIO);
+		if ((info->shadow[i].grants_used == NULL) ||
+			(info->shadow[i].sg == NULL) ||
+		     (info->max_indirect_segments &&
+		     (info->shadow[i].indirect_grants == NULL)))
+			goto out_of_memory;
+		sg_init_table(info->shadow[i].sg, psegs);
+	}
+
 
-	} else if (strncmp(type, "file", 4) == 0)
-		info->feature_discard = 1;
+	return 0;
 
-	kfree(type);
+out_of_memory:
+	for (i = 0; i < BLK_RING_SIZE(info); i++) {
+		kfree(info->shadow[i].grants_used);
+		info->shadow[i].grants_used = NULL;
+		kfree(info->shadow[i].sg);
+		info->shadow[i].sg = NULL;
+		kfree(info->shadow[i].indirect_grants);
+		info->shadow[i].indirect_grants = NULL;
+	}
+	if (!list_empty(&info->indirect_pages)) {
+		struct page *indirect_page, *n;
+		list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) {
+			list_del(&indirect_page->lru);
+			__free_page(indirect_page);
+		}
+	}
+	return -ENOMEM;
+}
+
+/*
+ * Gather all backend feature-*
+ */
+static int blkfront_gather_backend_features(struct blkfront_info *info)
+{
+	int err;
+	int barrier, flush, discard, persistent;
+	unsigned int indirect_segments;
+
+	info->feature_flush = 0;
+
+	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+			"feature-barrier", "%d", &barrier,
+			NULL);
+
+	/*
+	 * If there's no "feature-barrier" defined, then it means
+	 * we're dealing with a very old backend which writes
+	 * synchronously; nothing to do.
+	 *
+	 * If there are barriers, then we use flush.
+	 */
+	if (!err && barrier)
+		info->feature_flush = REQ_FLUSH | REQ_FUA;
+	/*
+	 * And if there is "feature-flush-cache" use that above
+	 * barriers.
+	 */
+	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+			"feature-flush-cache", "%d", &flush,
+			NULL);
+
+	if (!err && flush)
+		info->feature_flush = REQ_FLUSH;
+
+	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+			"feature-discard", "%d", &discard,
+			NULL);
+
+	if (!err && discard)
+		blkfront_setup_discard(info);
+
+	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+			"feature-persistent", "%u", &persistent,
+			NULL);
+	if (err)
+		info->feature_persistent = 0;
+	else
+		info->feature_persistent = persistent;
+
+	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+			    "feature-max-indirect-segments", "%u", &indirect_segments,
+			    NULL);
+	if (err)
+		info->max_indirect_segments = 0;
+	else
+		info->max_indirect_segments = min(indirect_segments,
+						  xen_blkif_max_segments);
+
+	return blkfront_setup_indirect(info);
 }
 
 /*
@@ -1439,9 +1994,9 @@
 {
 	unsigned long long sectors;
 	unsigned long sector_size;
+	unsigned int physical_sector_size;
 	unsigned int binfo;
 	int err;
-	int barrier, flush, discard, persistent;
 
 	switch (info->connected) {
 	case BLKIF_STATE_CONNECTED:
@@ -1457,14 +2012,14 @@
 		       sectors);
 		set_capacity(info->gd, sectors);
 		revalidate_disk(info->gd);
-		return;
 
+		return;
 	case BLKIF_STATE_SUSPENDED:
 		/*
 		 * If we are recovering from suspension, we need to wait
 		 * for the backend to announce it's features before
-		 * reconnecting, we need to know if the backend supports
-		 * persistent grants.
+		 * reconnecting, at least we need to know if the backend
+		 * supports indirect descriptors, and how many.
 		 */
 		blkif_recover(info);
 		return;
@@ -1488,61 +2043,25 @@
 		return;
 	}
 
-	info->feature_flush = 0;
-	info->flush_op = 0;
-
-	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
-			    "feature-barrier", "%d", &barrier,
-			    NULL);
-
-	/*
-	 * If there's no "feature-barrier" defined, then it means
-	 * we're dealing with a very old backend which writes
-	 * synchronously; nothing to do.
-	 *
-	 * If there are barriers, then we use flush.
-	 */
-	if (!err && barrier) {
-		info->feature_flush = REQ_FLUSH | REQ_FUA;
-		info->flush_op = BLKIF_OP_WRITE_BARRIER;
-	}
 	/*
-	 * And if there is "feature-flush-cache" use that above
-	 * barriers.
+	 * physcial-sector-size is a newer field, so old backends may not
+	 * provide this. Assume physical sector size to be the same as
+	 * sector_size in that case.
 	 */
-	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
-			    "feature-flush-cache", "%d", &flush,
-			    NULL);
-
-	if (!err && flush) {
-		info->feature_flush = REQ_FLUSH;
-		info->flush_op = BLKIF_OP_FLUSH_DISKCACHE;
-	}
+	err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+			   "physical-sector-size", "%u", &physical_sector_size);
+	if (err != 1)
+		physical_sector_size = sector_size;
 
-	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
-			    "feature-discard", "%d", &discard,
-			    NULL);
-
-	if (!err && discard)
-		blkfront_setup_discard(info);
-
-	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
-			    "feature-persistent", "%u", &persistent,
-			    NULL);
-	if (err)
-		info->feature_persistent = 0;
-	else
-		info->feature_persistent = persistent;
-
-	/* Allocate memory for grants */
-	err = fill_grant_buffer(info, BLK_RING_SIZE *
-	                              BLKIF_MAX_SEGMENTS_PER_REQUEST);
+	err = blkfront_gather_backend_features(info);
 	if (err) {
-		xenbus_dev_fatal(info->xbdev, err, "setting grant buffer failed");
+		xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s",
+				 info->xbdev->otherend);
 		return;
 	}
 
-	err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
+	err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size,
+				  physical_sector_size);
 	if (err) {
 		xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
 				 info->xbdev->otherend);
@@ -1573,8 +2092,15 @@
 	dev_dbg(&dev->dev, "blkfront:blkback_changed to state %d.\n", backend_state);
 
 	switch (backend_state) {
-	case XenbusStateInitialising:
 	case XenbusStateInitWait:
+		if (dev->state != XenbusStateInitialising)
+			break;
+		if (talk_to_blkback(dev, info)) {
+			kfree(info);
+			dev_set_drvdata(&dev->dev, NULL);
+			break;
+		}
+	case XenbusStateInitialising:
 	case XenbusStateInitialised:
 	case XenbusStateReconfiguring:
 	case XenbusStateReconfigured:
@@ -1690,6 +2216,10 @@
 
 	bdev = bdget_disk(disk, 0);
 
+	if (!bdev) {
+		WARN(1, "Block device %s yanked out from us!\n", disk->disk_name);
+		goto out_mutex;
+	}
 	if (bdev->bd_openers)
 		goto out;
 
@@ -1720,6 +2250,7 @@
 
 out:
 	bdput(bdev);
+out_mutex:
 	mutex_unlock(&blkfront_mutex);
 }
 
@@ -1738,13 +2269,14 @@
 	{ "" }
 };
 
-static DEFINE_XENBUS_DRIVER(blkfront, ,
+static struct xenbus_driver blkfront_driver = {
+	.ids  = blkfront_ids,
 	.probe = blkfront_probe,
 	.remove = blkfront_remove,
 	.resume = blkfront_resume,
 	.otherend_changed = blkback_changed,
 	.is_ready = blkfront_is_ready,
-);
+};
 
 static int __init xlblk_init(void)
 {
@@ -1753,7 +2285,13 @@
 	if (!xen_domain())
 		return -ENODEV;
 
-	if (xen_hvm_domain() && !xen_platform_pci_unplug)
+	if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) {
+		pr_info("Invalid max_ring_order (%d), will use default max: %d.\n",
+			xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER);
+		xen_blkif_max_ring_order = 0;
+	}
+
+	if (!xen_has_pv_disk_devices())
 		return -ENODEV;
 
 	if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) {