Mirror of https://github.com/Qortal/Brooklyn.git
Raziel K. Crowe 2022-04-02 17:29:52 +05:00
parent fb209289b8
commit 07d9c3128d
47 changed files with 3310 additions and 2782 deletions

block/Kconfig

@@ -35,6 +35,9 @@ config BLK_CGROUP_RWSTAT
 config BLK_DEV_BSG_COMMON
 	tristate
 
+config BLK_ICQ
+	bool
+
 config BLK_DEV_BSGLIB
 	bool "Block layer SG support v4 helper lib"
 	select BLK_DEV_BSG_COMMON
@@ -73,7 +76,7 @@ config BLK_DEV_ZONED
 
 config BLK_DEV_THROTTLING
 	bool "Block layer bio throttling support"
-	depends on BLK_CGROUP=y
+	depends on BLK_CGROUP
 	select BLK_CGROUP_RWSTAT
 	help
 	  Block layer bio throttling support. It can be used to limit
@@ -112,7 +115,7 @@ config BLK_WBT_MQ
 
 config BLK_CGROUP_IOLATENCY
 	bool "Enable support for latency based cgroup IO protection"
-	depends on BLK_CGROUP=y
+	depends on BLK_CGROUP
 	help
 	  Enabling this option enables the .latency interface for IO throttling.
 	  The IO controller will attempt to maintain average IO latencies below
@@ -132,7 +135,7 @@ config BLK_CGROUP_FC_APPID
 
 config BLK_CGROUP_IOCOST
 	bool "Enable support for cost model based cgroup IO controller"
-	depends on BLK_CGROUP=y
+	depends on BLK_CGROUP
 	select BLK_RQ_IO_DATA_LEN
 	select BLK_RQ_ALLOC_TIME
 	help
@@ -190,39 +193,31 @@ config BLK_INLINE_ENCRYPTION_FALLBACK
 	  by falling back to the kernel crypto API when inline
 	  encryption hardware is not present.
 
-menu "Partition Types"
-
 source "block/partitions/Kconfig"
 
-endmenu
-
-endif # BLOCK
-
 config BLOCK_COMPAT
-	bool
-	depends on BLOCK && COMPAT
-	default y
+	def_bool COMPAT
 
 config BLK_MQ_PCI
-	bool
-	depends on BLOCK && PCI
-	default y
+	def_bool PCI
 
 config BLK_MQ_VIRTIO
 	bool
-	depends on BLOCK && VIRTIO
+	depends on VIRTIO
 	default y
 
 config BLK_MQ_RDMA
 	bool
-	depends on BLOCK && INFINIBAND
+	depends on INFINIBAND
 	default y
 
 config BLK_PM
-	def_bool BLOCK && PM
+	def_bool PM
 
 # do not use in new code
 config BLOCK_HOLDER_DEPRECATED
 	bool
 
 source "block/Kconfig.iosched"
+
+endif # BLOCK

block/Kconfig.iosched

@@ -1,6 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
-if BLOCK
-
 menu "IO Schedulers"
 
 config MQ_IOSCHED_DEADLINE
@@ -20,6 +18,7 @@ config MQ_IOSCHED_KYBER
 
 config IOSCHED_BFQ
 	tristate "BFQ I/O scheduler"
+	select BLK_ICQ
 	help
 	  BFQ I/O scheduler for BLK-MQ. BFQ distributes the bandwidth of
 	  of the device among all processes according to their weights,
@@ -45,5 +44,3 @@ config BFQ_CGROUP_DEBUG
 	  files in a cgroup which can be useful for debugging.
 
 endmenu
-
-endif

block/Makefile

@@ -3,13 +3,13 @@
 # Makefile for the kernel block layer
 #
 
-obj-$(CONFIG_BLOCK) := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
+obj-y		:= bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
 			blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
-			blk-exec.o blk-merge.o blk-timeout.o \
+			blk-merge.o blk-timeout.o \
 			blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
 			blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
 			genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \
-			disk-events.o
+			disk-events.o blk-ia-ranges.o
 
 obj-$(CONFIG_BOUNCE)	+= bounce.o
 obj-$(CONFIG_BLK_DEV_BSG_COMMON) += bsg.o
@@ -36,6 +36,6 @@ obj-$(CONFIG_BLK_DEBUG_FS)	+= blk-mq-debugfs.o
 obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o
 obj-$(CONFIG_BLK_SED_OPAL)	+= sed-opal.o
 obj-$(CONFIG_BLK_PM)		+= blk-pm.o
-obj-$(CONFIG_BLK_INLINE_ENCRYPTION)	+= keyslot-manager.o blk-crypto.o
+obj-$(CONFIG_BLK_INLINE_ENCRYPTION)	+= blk-crypto.o blk-crypto-profile.o
 obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK)	+= blk-crypto-fallback.o
 obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED)	+= holder.o

block/bdev.c

@ -12,6 +12,7 @@
#include <linux/major.h> #include <linux/major.h>
#include <linux/device_cgroup.h> #include <linux/device_cgroup.h>
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/backing-dev.h> #include <linux/backing-dev.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/blkpg.h> #include <linux/blkpg.h>
@ -23,7 +24,6 @@
#include <linux/pseudo_fs.h> #include <linux/pseudo_fs.h>
#include <linux/uio.h> #include <linux/uio.h>
#include <linux/namei.h> #include <linux/namei.h>
#include <linux/cleancache.h>
#include <linux/part_stat.h> #include <linux/part_stat.h>
#include <linux/uaccess.h> #include <linux/uaccess.h>
#include "../fs/internal.h" #include "../fs/internal.h"
@ -87,10 +87,6 @@ void invalidate_bdev(struct block_device *bdev)
lru_add_drain_all(); /* make sure all lru add caches are flushed */ lru_add_drain_all(); /* make sure all lru add caches are flushed */
invalidate_mapping_pages(mapping, 0, -1); invalidate_mapping_pages(mapping, 0, -1);
} }
/* 99% of the time, we don't need to flush the cleancache on the bdev.
* But, for the strange corners, lets be cautious
*/
cleancache_invalidate_inode(mapping);
} }
EXPORT_SYMBOL(invalidate_bdev); EXPORT_SYMBOL(invalidate_bdev);
@ -184,14 +180,13 @@ int sb_min_blocksize(struct super_block *sb, int size)
EXPORT_SYMBOL(sb_min_blocksize); EXPORT_SYMBOL(sb_min_blocksize);
int __sync_blockdev(struct block_device *bdev, int wait) int sync_blockdev_nowait(struct block_device *bdev)
{ {
if (!bdev) if (!bdev)
return 0; return 0;
if (!wait) return filemap_flush(bdev->bd_inode->i_mapping);
return filemap_flush(bdev->bd_inode->i_mapping);
return filemap_write_and_wait(bdev->bd_inode->i_mapping);
} }
EXPORT_SYMBOL_GPL(sync_blockdev_nowait);
/* /*
* Write out and wait upon all the dirty data associated with a block * Write out and wait upon all the dirty data associated with a block
@ -199,7 +194,9 @@ int __sync_blockdev(struct block_device *bdev, int wait)
*/ */
int sync_blockdev(struct block_device *bdev) int sync_blockdev(struct block_device *bdev)
{ {
return __sync_blockdev(bdev, 1); if (!bdev)
return 0;
return filemap_write_and_wait(bdev->bd_inode->i_mapping);
} }
EXPORT_SYMBOL(sync_blockdev); EXPORT_SYMBOL(sync_blockdev);
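The hunk above splits the old __sync_blockdev(bdev, wait) into two explicitly named helpers: sync_blockdev_nowait() only kicks off writeback (filemap_flush()), while sync_blockdev() writes and waits. A rough userspace analogue of the same start-versus-wait split, using sync_file_range(2) and fsync(2) on an ordinary file; the helper names are illustrative, not kernel API:

/* Minimal sketch: start writeback without waiting vs. write-and-wait,
 * mirroring the sync_blockdev_nowait()/sync_blockdev() split above.
 * Userspace analogue only; sync_file_range() is Linux-specific. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Kick off writeback of dirty pages, do not wait (like filemap_flush()). */
static int file_sync_nowait(int fd)
{
	return sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE);
}

/* Write out and wait for completion (like filemap_write_and_wait()). */
static int file_sync_wait(int fd)
{
	return fsync(fd);
}

int main(void)
{
	int fd = open("testfile.tmp", O_CREAT | O_WRONLY, 0644);

	if (fd < 0)
		return 1;
	if (write(fd, "data\n", 5) != 5)
		perror("write");
	if (file_sync_nowait(fd))
		perror("sync_file_range");
	if (file_sync_wait(fd))
		perror("fsync");
	close(fd);
	return 0;
}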
@ -326,12 +323,12 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
if (!ops->rw_page || bdev_get_integrity(bdev)) if (!ops->rw_page || bdev_get_integrity(bdev))
return result; return result;
result = blk_queue_enter(bdev->bd_disk->queue, 0); result = blk_queue_enter(bdev_get_queue(bdev), 0);
if (result) if (result)
return result; return result;
result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
REQ_OP_READ); REQ_OP_READ);
blk_queue_exit(bdev->bd_disk->queue); blk_queue_exit(bdev_get_queue(bdev));
return result; return result;
} }
@ -362,7 +359,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
if (!ops->rw_page || bdev_get_integrity(bdev)) if (!ops->rw_page || bdev_get_integrity(bdev))
return -EOPNOTSUPP; return -EOPNOTSUPP;
result = blk_queue_enter(bdev->bd_disk->queue, 0); result = blk_queue_enter(bdev_get_queue(bdev), 0);
if (result) if (result)
return result; return result;
@ -375,7 +372,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
clean_page_buffers(page); clean_page_buffers(page);
unlock_page(page); unlock_page(page);
} }
blk_queue_exit(bdev->bd_disk->queue); blk_queue_exit(bdev_get_queue(bdev));
return result; return result;
} }
@ -492,6 +489,7 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
spin_lock_init(&bdev->bd_size_lock); spin_lock_init(&bdev->bd_size_lock);
bdev->bd_partno = partno; bdev->bd_partno = partno;
bdev->bd_inode = inode; bdev->bd_inode = inode;
bdev->bd_queue = disk->queue;
bdev->bd_stats = alloc_percpu(struct disk_stats); bdev->bd_stats = alloc_percpu(struct disk_stats);
if (!bdev->bd_stats) { if (!bdev->bd_stats) {
iput(inode); iput(inode);
@ -662,7 +660,7 @@ static void blkdev_flush_mapping(struct block_device *bdev)
static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) static int blkdev_get_whole(struct block_device *bdev, fmode_t mode)
{ {
struct gendisk *disk = bdev->bd_disk; struct gendisk *disk = bdev->bd_disk;
int ret = 0; int ret;
if (disk->fops->open) { if (disk->fops->open) {
ret = disk->fops->open(bdev, mode); ret = disk->fops->open(bdev, mode);
@ -747,21 +745,11 @@ struct block_device *blkdev_get_no_open(dev_t dev)
if (!kobject_get_unless_zero(&bdev->bd_device.kobj)) if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
bdev = NULL; bdev = NULL;
iput(inode); iput(inode);
if (!bdev)
return NULL;
if ((bdev->bd_disk->flags & GENHD_FL_HIDDEN) ||
!try_module_get(bdev->bd_disk->fops->owner)) {
put_device(&bdev->bd_device);
return NULL;
}
return bdev; return bdev;
} }
void blkdev_put_no_open(struct block_device *bdev) void blkdev_put_no_open(struct block_device *bdev)
{ {
module_put(bdev->bd_disk->fops->owner);
put_device(&bdev->bd_device); put_device(&bdev->bd_device);
} }
@ -817,12 +805,14 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
ret = -ENXIO; ret = -ENXIO;
if (!disk_live(disk)) if (!disk_live(disk))
goto abort_claiming; goto abort_claiming;
if (!try_module_get(disk->fops->owner))
goto abort_claiming;
if (bdev_is_partition(bdev)) if (bdev_is_partition(bdev))
ret = blkdev_get_part(bdev, mode); ret = blkdev_get_part(bdev, mode);
else else
ret = blkdev_get_whole(bdev, mode); ret = blkdev_get_whole(bdev, mode);
if (ret) if (ret)
goto abort_claiming; goto put_module;
if (mode & FMODE_EXCL) { if (mode & FMODE_EXCL) {
bd_finish_claiming(bdev, holder); bd_finish_claiming(bdev, holder);
@ -834,7 +824,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
* used in blkdev_get/put(). * used in blkdev_get/put().
*/ */
if ((mode & FMODE_WRITE) && !bdev->bd_write_holder && if ((mode & FMODE_WRITE) && !bdev->bd_write_holder &&
(disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) { (disk->event_flags & DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE)) {
bdev->bd_write_holder = true; bdev->bd_write_holder = true;
unblock_events = false; unblock_events = false;
} }
@ -844,7 +834,8 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
if (unblock_events) if (unblock_events)
disk_unblock_events(disk); disk_unblock_events(disk);
return bdev; return bdev;
put_module:
module_put(disk->fops->owner);
abort_claiming: abort_claiming:
if (mode & FMODE_EXCL) if (mode & FMODE_EXCL)
bd_abort_claiming(bdev, holder); bd_abort_claiming(bdev, holder);
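The error-path change above moves the module reference out of blkdev_get_no_open() and into blkdev_get_by_dev(): the reference is taken only once the open is really going ahead, and a new put_module: label (plus a matching module_put() in blkdev_put()) unwinds it on failure. A small sketch of that acquire-late, unwind-by-label ordering, with stand-in functions rather than real kernel APIs:

/* Sketch of the ordering used above: take the driver/module reference
 * only when the open really proceeds, and release it on the error path
 * via a dedicated label. All functions here are stand-ins. */
#include <stdbool.h>
#include <stdio.h>

static bool module_ref_held;

static bool try_module_get_stub(void) { module_ref_held = true; return true; }
static void module_put_stub(void)     { module_ref_held = false; }
static int  open_device_stub(bool ok) { return ok ? 0 : -1; }

static int device_open(bool open_succeeds)
{
	int ret = -1;

	if (!try_module_get_stub())
		goto out;			/* nothing to undo yet */

	ret = open_device_stub(open_succeeds);
	if (ret)
		goto put_module;		/* undo only what we took */

	return 0;				/* success: ref stays held */

put_module:
	module_put_stub();
out:
	return ret;
}

int main(void)
{
	int ret = device_open(true);

	printf("open ok:   %d (ref held: %d)\n", ret, module_ref_held);
	module_put_stub();			/* paired put on the close path */
	ret = device_open(false);
	printf("open fail: %d (ref held: %d)\n", ret, module_ref_held);
	return 0;
}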
@ -953,18 +944,21 @@ void blkdev_put(struct block_device *bdev, fmode_t mode)
blkdev_put_whole(bdev, mode); blkdev_put_whole(bdev, mode);
mutex_unlock(&disk->open_mutex); mutex_unlock(&disk->open_mutex);
module_put(disk->fops->owner);
blkdev_put_no_open(bdev); blkdev_put_no_open(bdev);
} }
EXPORT_SYMBOL(blkdev_put); EXPORT_SYMBOL(blkdev_put);
/** /**
* lookup_bdev - lookup a struct block_device by name * lookup_bdev() - Look up a struct block_device by name.
* @pathname: special file representing the block device * @pathname: Name of the block device in the filesystem.
* @dev: return value of the block device's dev_t * @dev: Pointer to the block device's dev_t, if found.
* *
* Get a reference to the blockdevice at @pathname in the current * Lookup the block device's dev_t at @pathname in the current
* namespace if possible and return it. Return ERR_PTR(error) * namespace if possible and return it in @dev.
* otherwise. *
* Context: May sleep.
* Return: 0 if succeeded, negative errno otherwise.
*/ */
int lookup_bdev(const char *pathname, dev_t *dev) int lookup_bdev(const char *pathname, dev_t *dev)
{ {
@ -1016,7 +1010,7 @@ int __invalidate_device(struct block_device *bdev, bool kill_dirty)
} }
EXPORT_SYMBOL(__invalidate_device); EXPORT_SYMBOL(__invalidate_device);
void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) void sync_bdevs(bool wait)
{ {
struct inode *inode, *old_inode = NULL; struct inode *inode, *old_inode = NULL;
@ -1047,8 +1041,19 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
bdev = I_BDEV(inode); bdev = I_BDEV(inode);
mutex_lock(&bdev->bd_disk->open_mutex); mutex_lock(&bdev->bd_disk->open_mutex);
if (bdev->bd_openers) if (!bdev->bd_openers) {
func(bdev, arg); ; /* skip */
} else if (wait) {
/*
* We keep the error status of individual mapping so
* that applications can catch the writeback error using
* fsync(2). See filemap_fdatawait_keep_errors() for
* details.
*/
filemap_fdatawait_keep_errors(inode->i_mapping);
} else {
filemap_fdatawrite(inode->i_mapping);
}
mutex_unlock(&bdev->bd_disk->open_mutex); mutex_unlock(&bdev->bd_disk->open_mutex);
spin_lock(&blockdev_superblock->s_inode_list_lock); spin_lock(&blockdev_superblock->s_inode_list_lock);
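iterate_bdevs() with a callback becomes sync_bdevs(bool wait): block devices with no openers are skipped, the !wait case only starts writeback, and the wait case uses filemap_fdatawait_keep_errors() so each mapping keeps its own writeback error for a later fsync(2). A loose userspace sketch of the same control flow over an array of file descriptors (purely illustrative):

/* Sketch of the sync_bdevs(wait) control flow above: skip entries that
 * are not open, start writeback in the !wait case, and in the wait case
 * collect each file's own error (userspace: fsync reports it per fd). */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static void sync_fds(const int *fds, int nr, bool wait)
{
	for (int i = 0; i < nr; i++) {
		if (fds[i] < 0) {
			;	/* skip: not open, nothing to write back */
		} else if (wait) {
			if (fsync(fds[i]))	/* per-file error, not global */
				perror("fsync");
		} else {
			sync_file_range(fds[i], 0, 0, SYNC_FILE_RANGE_WRITE);
		}
	}
}

int main(void)
{
	int fds[2] = { open("a.tmp", O_CREAT | O_WRONLY, 0644), -1 };

	sync_fds(fds, 2, false);	/* start writeback */
	sync_fds(fds, 2, true);		/* wait and report errors */
	if (fds[0] >= 0)
		close(fds[0]);
	return 0;
}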

block/bfq-cgroup.c

@ -6,13 +6,13 @@
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/cgroup.h> #include <linux/cgroup.h>
#include <linux/elevator.h>
#include <linux/ktime.h> #include <linux/ktime.h>
#include <linux/rbtree.h> #include <linux/rbtree.h>
#include <linux/ioprio.h> #include <linux/ioprio.h>
#include <linux/sbitmap.h> #include <linux/sbitmap.h>
#include <linux/delay.h> #include <linux/delay.h>
#include "elevator.h"
#include "bfq-iosched.h" #include "bfq-iosched.h"
#ifdef CONFIG_BFQ_CGROUP_DEBUG #ifdef CONFIG_BFQ_CGROUP_DEBUG
@ -463,7 +463,7 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
{ {
if (blkg_rwstat_init(&stats->bytes, gfp) || if (blkg_rwstat_init(&stats->bytes, gfp) ||
blkg_rwstat_init(&stats->ios, gfp)) blkg_rwstat_init(&stats->ios, gfp))
return -ENOMEM; goto error;
#ifdef CONFIG_BFQ_CGROUP_DEBUG #ifdef CONFIG_BFQ_CGROUP_DEBUG
if (blkg_rwstat_init(&stats->merged, gfp) || if (blkg_rwstat_init(&stats->merged, gfp) ||
@ -476,13 +476,15 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
bfq_stat_init(&stats->dequeue, gfp) || bfq_stat_init(&stats->dequeue, gfp) ||
bfq_stat_init(&stats->group_wait_time, gfp) || bfq_stat_init(&stats->group_wait_time, gfp) ||
bfq_stat_init(&stats->idle_time, gfp) || bfq_stat_init(&stats->idle_time, gfp) ||
bfq_stat_init(&stats->empty_time, gfp)) { bfq_stat_init(&stats->empty_time, gfp))
bfqg_stats_exit(stats); goto error;
return -ENOMEM;
}
#endif #endif
return 0; return 0;
error:
bfqg_stats_exit(stats);
return -ENOMEM;
} }
static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd) static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
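The two bfq-cgroup.c hunks above collapse the duplicated failure handling in bfqg_stats_init() into a single error: label that calls bfqg_stats_exit() on whatever was already initialised. A minimal sketch of that idiom with stand-in allocations:

/* Sketch of the single-error-label idiom adopted in bfqg_stats_init():
 * every failed allocation jumps to one label that frees whatever was
 * already set up. Types and helpers here are illustrative stand-ins. */
#include <stdlib.h>

struct stats {
	int *bytes;
	int *ios;
	int *merged;
};

static void stats_exit(struct stats *s)
{
	/* free() tolerates NULL, so partial initialisation is fine */
	free(s->bytes);
	free(s->ios);
	free(s->merged);
}

static int stats_init(struct stats *s)
{
	s->bytes = calloc(1, sizeof(*s->bytes));
	s->ios   = calloc(1, sizeof(*s->ios));
	if (!s->bytes || !s->ios)
		goto error;

	s->merged = calloc(1, sizeof(*s->merged));
	if (!s->merged)
		goto error;

	return 0;

error:
	stats_exit(s);
	return -1;			/* -ENOMEM in the kernel version */
}

int main(void)
{
	struct stats s = { 0 };

	if (stats_init(&s))
		return 1;
	stats_exit(&s);
	return 0;
}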

block/bfq-iosched.c

@ -117,7 +117,6 @@
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/cgroup.h> #include <linux/cgroup.h>
#include <linux/elevator.h>
#include <linux/ktime.h> #include <linux/ktime.h>
#include <linux/rbtree.h> #include <linux/rbtree.h>
#include <linux/ioprio.h> #include <linux/ioprio.h>
@ -127,6 +126,7 @@
#include <trace/events/block.h> #include <trace/events/block.h>
#include "elevator.h"
#include "blk.h" #include "blk.h"
#include "blk-mq.h" #include "blk-mq.h"
#include "blk-mq-tag.h" #include "blk-mq-tag.h"
@ -433,26 +433,21 @@ static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
/** /**
* bfq_bic_lookup - search into @ioc a bic associated to @bfqd. * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
* @bfqd: the lookup key.
* @ioc: the io_context of the process doing I/O.
* @q: the request queue. * @q: the request queue.
*/ */
static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q)
struct io_context *ioc,
struct request_queue *q)
{ {
if (ioc) { struct bfq_io_cq *icq;
unsigned long flags; unsigned long flags;
struct bfq_io_cq *icq;
spin_lock_irqsave(&q->queue_lock, flags); if (!current->io_context)
icq = icq_to_bic(ioc_lookup_icq(ioc, q)); return NULL;
spin_unlock_irqrestore(&q->queue_lock, flags);
return icq; spin_lock_irqsave(&q->queue_lock, flags);
} icq = icq_to_bic(ioc_lookup_icq(q));
spin_unlock_irqrestore(&q->queue_lock, flags);
return NULL; return icq;
} }
/* /*
@ -565,26 +560,134 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd,
} }
} }
#define BFQ_LIMIT_INLINE_DEPTH 16
#ifdef CONFIG_BFQ_GROUP_IOSCHED
static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit)
{
struct bfq_data *bfqd = bfqq->bfqd;
struct bfq_entity *entity = &bfqq->entity;
struct bfq_entity *inline_entities[BFQ_LIMIT_INLINE_DEPTH];
struct bfq_entity **entities = inline_entities;
int depth, level;
int class_idx = bfqq->ioprio_class - 1;
struct bfq_sched_data *sched_data;
unsigned long wsum;
bool ret = false;
if (!entity->on_st_or_in_serv)
return false;
/* +1 for bfqq entity, root cgroup not included */
depth = bfqg_to_blkg(bfqq_group(bfqq))->blkcg->css.cgroup->level + 1;
if (depth > BFQ_LIMIT_INLINE_DEPTH) {
entities = kmalloc_array(depth, sizeof(*entities), GFP_NOIO);
if (!entities)
return false;
}
spin_lock_irq(&bfqd->lock);
sched_data = entity->sched_data;
/* Gather our ancestors as we need to traverse them in reverse order */
level = 0;
for_each_entity(entity) {
/*
* If at some level entity is not even active, allow request
* queueing so that BFQ knows there's work to do and activate
* entities.
*/
if (!entity->on_st_or_in_serv)
goto out;
/* Uh, more parents than cgroup subsystem thinks? */
if (WARN_ON_ONCE(level >= depth))
break;
entities[level++] = entity;
}
WARN_ON_ONCE(level != depth);
for (level--; level >= 0; level--) {
entity = entities[level];
if (level > 0) {
wsum = bfq_entity_service_tree(entity)->wsum;
} else {
int i;
/*
* For bfqq itself we take into account service trees
* of all higher priority classes and multiply their
* weights so that low prio queue from higher class
* gets more requests than high prio queue from lower
* class.
*/
wsum = 0;
for (i = 0; i <= class_idx; i++) {
wsum = wsum * IOPRIO_BE_NR +
sched_data->service_tree[i].wsum;
}
}
limit = DIV_ROUND_CLOSEST(limit * entity->weight, wsum);
if (entity->allocated >= limit) {
bfq_log_bfqq(bfqq->bfqd, bfqq,
"too many requests: allocated %d limit %d level %d",
entity->allocated, limit, level);
ret = true;
break;
}
}
out:
spin_unlock_irq(&bfqd->lock);
if (entities != inline_entities)
kfree(entities);
return ret;
}
#else
static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit)
{
return false;
}
#endif
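bfqq_request_over_limit(), added above, walks from the topmost gathered ancestor down to the queue itself, rescaling the request limit at each level by weight/wsum (rounded to nearest) and reporting the queue as over its share once entity->allocated reaches that level's limit. A stand-alone sketch of just the scaling arithmetic, with made-up weights and without the extra class weighting the real code applies at the leaf level:

/* Arithmetic sketch of the per-level limit scaling in
 * bfqq_request_over_limit(): each level gets the parent's limit times
 * weight/wsum, rounded to nearest. Weights and counts are made up. */
#include <stdbool.h>
#include <stdio.h>

#define DIV_ROUND_CLOSEST(x, d) (((x) + (d) / 2) / (d))

struct level {
	unsigned int weight;	/* this entity's weight */
	unsigned int wsum;	/* sum of weights at this level */
	unsigned int allocated;	/* requests already allocated in subtree */
};

static bool over_limit(const struct level *lv, int depth, unsigned int limit)
{
	for (int i = 0; i < depth; i++) {
		limit = DIV_ROUND_CLOSEST(limit * lv[i].weight, lv[i].wsum);
		if (lv[i].allocated >= limit) {
			printf("level %d over limit: allocated %u limit %u\n",
			       i, lv[i].allocated, limit);
			return true;
		}
	}
	return false;
}

int main(void)
{
	/* a cgroup holding 1/4 of the weight, a queue holding 1/2 inside it */
	struct level lv[] = {
		{ .weight = 100, .wsum = 400, .allocated = 40 },
		{ .weight = 100, .wsum = 200, .allocated = 40 },
	};

	/* 256 scheduler tags overall: shares are 64 then 32 */
	printf("over: %d\n", over_limit(lv, 2, 256));
	return 0;
}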
/* /*
* Async I/O can easily starve sync I/O (both sync reads and sync * Async I/O can easily starve sync I/O (both sync reads and sync
* writes), by consuming all tags. Similarly, storms of sync writes, * writes), by consuming all tags. Similarly, storms of sync writes,
* such as those that sync(2) may trigger, can starve sync reads. * such as those that sync(2) may trigger, can starve sync reads.
* Limit depths of async I/O and sync writes so as to counter both * Limit depths of async I/O and sync writes so as to counter both
* problems. * problems.
*
* Also if a bfq queue or its parent cgroup consume more tags than would be
* appropriate for their weight, we trim the available tag depth to 1. This
* avoids a situation where one cgroup can starve another cgroup from tags and
* thus block service differentiation among cgroups. Note that because the
* queue / cgroup already has many requests allocated and queued, this does not
* significantly affect service guarantees coming from the BFQ scheduling
* algorithm.
*/ */
static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
{ {
struct bfq_data *bfqd = data->q->elevator->elevator_data; struct bfq_data *bfqd = data->q->elevator->elevator_data;
struct bfq_io_cq *bic = bfq_bic_lookup(data->q);
struct bfq_queue *bfqq = bic ? bic_to_bfqq(bic, op_is_sync(op)) : NULL;
int depth;
unsigned limit = data->q->nr_requests;
if (op_is_sync(op) && !op_is_write(op)) /* Sync reads have full depth available */
return; if (op_is_sync(op) && !op_is_write(op)) {
depth = 0;
} else {
depth = bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)];
limit = (limit * depth) >> bfqd->full_depth_shift;
}
data->shallow_depth = /*
bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; * Does queue (or any parent entity) exceed number of requests that
* should be available to it? Heavily limit depth so that it cannot
* consume more available requests and thus starve other entities.
*/
if (bfqq && bfqq_request_over_limit(bfqq, limit))
depth = 1;
bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u",
__func__, bfqd->wr_busy_queues, op_is_sync(op), __func__, bfqd->wr_busy_queues, op_is_sync(op), depth);
data->shallow_depth); if (depth)
data->shallow_depth = depth;
} }
static struct bfq_queue * static struct bfq_queue *
@ -1113,7 +1216,8 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
static int bfqq_process_refs(struct bfq_queue *bfqq) static int bfqq_process_refs(struct bfq_queue *bfqq)
{ {
return bfqq->ref - bfqq->allocated - bfqq->entity.on_st_or_in_serv - return bfqq->ref - bfqq->entity.allocated -
bfqq->entity.on_st_or_in_serv -
(bfqq->weight_counter != NULL) - bfqq->stable_ref; (bfqq->weight_counter != NULL) - bfqq->stable_ref;
} }
@ -1982,20 +2086,19 @@ static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns)
* aspect, see the comments on the choice of the queue for injection * aspect, see the comments on the choice of the queue for injection
* in bfq_select_queue(). * in bfq_select_queue().
* *
* Turning back to the detection of a waker queue, a queue Q is deemed * Turning back to the detection of a waker queue, a queue Q is deemed as a
* as a waker queue for bfqq if, for three consecutive times, bfqq * waker queue for bfqq if, for three consecutive times, bfqq happens to become
* happens to become non empty right after a request of Q has been * non empty right after a request of Q has been completed within given
* completed. In this respect, even if bfqq is empty, we do not check * timeout. In this respect, even if bfqq is empty, we do not check for a waker
* for a waker if it still has some in-flight I/O. In fact, in this * if it still has some in-flight I/O. In fact, in this case bfqq is actually
* case bfqq is actually still being served by the drive, and may * still being served by the drive, and may receive new I/O on the completion
* receive new I/O on the completion of some of the in-flight * of some of the in-flight requests. In particular, on the first time, Q is
* requests. In particular, on the first time, Q is tentatively set as * tentatively set as a candidate waker queue, while on the third consecutive
* a candidate waker queue, while on the third consecutive time that Q * time that Q is detected, the field waker_bfqq is set to Q, to confirm that Q
* is detected, the field waker_bfqq is set to Q, to confirm that Q is * is a waker queue for bfqq. These detection steps are performed only if bfqq
* a waker queue for bfqq. These detection steps are performed only if * has a long think time, so as to make it more likely that bfqq's I/O is
* bfqq has a long think time, so as to make it more likely that * actually being blocked by a synchronization. This last filter, plus the
* bfqq's I/O is actually being blocked by a synchronization. This * above three-times requirement and time limit for detection, make false
* last filter, plus the above three-times requirement, make false
* positives less likely. * positives less likely.
* *
* NOTE * NOTE
@ -2019,6 +2122,8 @@ static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns)
static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq,
u64 now_ns) u64 now_ns)
{ {
char waker_name[MAX_BFQQ_NAME_LENGTH];
if (!bfqd->last_completed_rq_bfqq || if (!bfqd->last_completed_rq_bfqq ||
bfqd->last_completed_rq_bfqq == bfqq || bfqd->last_completed_rq_bfqq == bfqq ||
bfq_bfqq_has_short_ttime(bfqq) || bfq_bfqq_has_short_ttime(bfqq) ||
@ -2027,8 +2132,16 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfqd->last_completed_rq_bfqq == bfqq->waker_bfqq) bfqd->last_completed_rq_bfqq == bfqq->waker_bfqq)
return; return;
/*
* We reset waker detection logic also if too much time has passed
* since the first detection. If wakeups are rare, pointless idling
* doesn't hurt throughput that much. The condition below makes sure
* we do not uselessly idle blocking waker in more than 1/64 cases.
*/
if (bfqd->last_completed_rq_bfqq != if (bfqd->last_completed_rq_bfqq !=
bfqq->tentative_waker_bfqq) { bfqq->tentative_waker_bfqq ||
now_ns > bfqq->waker_detection_started +
128 * (u64)bfqd->bfq_slice_idle) {
/* /*
* First synchronization detected with a * First synchronization detected with a
* candidate waker queue, or with a different * candidate waker queue, or with a different
@ -2037,12 +2150,19 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfqq->tentative_waker_bfqq = bfqq->tentative_waker_bfqq =
bfqd->last_completed_rq_bfqq; bfqd->last_completed_rq_bfqq;
bfqq->num_waker_detections = 1; bfqq->num_waker_detections = 1;
bfqq->waker_detection_started = now_ns;
bfq_bfqq_name(bfqq->tentative_waker_bfqq, waker_name,
MAX_BFQQ_NAME_LENGTH);
bfq_log_bfqq(bfqd, bfqq, "set tenative waker %s", waker_name);
} else /* Same tentative waker queue detected again */ } else /* Same tentative waker queue detected again */
bfqq->num_waker_detections++; bfqq->num_waker_detections++;
if (bfqq->num_waker_detections == 3) { if (bfqq->num_waker_detections == 3) {
bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq; bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq;
bfqq->tentative_waker_bfqq = NULL; bfqq->tentative_waker_bfqq = NULL;
bfq_bfqq_name(bfqq->waker_bfqq, waker_name,
MAX_BFQQ_NAME_LENGTH);
bfq_log_bfqq(bfqd, bfqq, "set waker %s", waker_name);
/* /*
* If the waker queue disappears, then * If the waker queue disappears, then
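The hunks above add a time bound to waker detection: the tentative waker is restarted not only when a different queue completes a request, but also when more than 128 * bfq_slice_idle has passed since waker_detection_started. A compact sketch of that reset rule; the values are made up and the integer "ids" stand in for the bfq_queue pointers the real code compares:

/* Sketch of the waker-detection reset rule added above: start over when
 * the completing queue changes or when the first detection is older
 * than 128 * slice_idle; confirm the waker on the third hit. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct waker_state {
	int tentative_waker;		/* id of the candidate waker queue */
	unsigned int detections;
	uint64_t detection_started;	/* ns */
};

static bool note_completion(struct waker_state *w, int completing_queue,
			    uint64_t now_ns, uint64_t slice_idle_ns)
{
	if (completing_queue != w->tentative_waker ||
	    now_ns > w->detection_started + 128 * slice_idle_ns) {
		/* new candidate (or stale detection): start over */
		w->tentative_waker = completing_queue;
		w->detections = 1;
		w->detection_started = now_ns;
	} else {
		w->detections++;
	}
	return w->detections == 3;	/* confirmed waker */
}

int main(void)
{
	struct waker_state w = { .tentative_waker = -1 };
	uint64_t slice_idle = 8ULL * 1000 * 1000;	/* 8 ms, the BFQ default */

	printf("%d\n", note_completion(&w, 7, 1 * slice_idle, slice_idle));
	printf("%d\n", note_completion(&w, 7, 2 * slice_idle, slice_idle));
	printf("%d\n", note_completion(&w, 7, 3 * slice_idle, slice_idle)); /* 1 */
	return 0;
}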
@ -2332,7 +2452,7 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio,
* returned by bfq_bic_lookup does not go away before * returned by bfq_bic_lookup does not go away before
* bfqd->lock is taken. * bfqd->lock is taken.
*/ */
struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q); struct bfq_io_cq *bic = bfq_bic_lookup(q);
bool ret; bool ret;
spin_lock_irq(&bfqd->lock); spin_lock_irq(&bfqd->lock);
@ -5878,6 +5998,22 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
} }
} }
static void bfqq_request_allocated(struct bfq_queue *bfqq)
{
struct bfq_entity *entity = &bfqq->entity;
for_each_entity(entity)
entity->allocated++;
}
static void bfqq_request_freed(struct bfq_queue *bfqq)
{
struct bfq_entity *entity = &bfqq->entity;
for_each_entity(entity)
entity->allocated--;
}
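With the allocated counter moving from bfq_queue into bfq_entity (see the bfq-iosched.h hunks below), the two helpers above bump and drop the count on the queue and on every parent entity, which is what lets bfqq_request_over_limit() read per-level totals. A sketch of the same bookkeeping over a parent-linked stand-in structure:

/* Sketch of the hierarchical request accounting introduced above: the
 * allocated counter lives on the queue entity and on every parent, and
 * is adjusted along the whole chain. Stand-in types only. */
#include <stdio.h>

struct entity {
	int allocated;
	struct entity *parent;
};

/* analogue of for_each_entity() walking toward the root */
#define for_each_entity(e) for (; (e); (e) = (e)->parent)

static void request_allocated(struct entity *e)
{
	for_each_entity(e)
		e->allocated++;
}

static void request_freed(struct entity *e)
{
	for_each_entity(e)
		e->allocated--;
}

int main(void)
{
	struct entity root = { 0 }, group = { .parent = &root },
		      queue = { .parent = &group };

	request_allocated(&queue);
	request_allocated(&queue);
	request_freed(&queue);
	printf("queue %d group %d root %d\n",
	       queue.allocated, group.allocated, root.allocated);	/* 1 1 1 */
	return 0;
}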
/* returns true if it causes the idle timer to be disabled */ /* returns true if it causes the idle timer to be disabled */
static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
{ {
@ -5891,8 +6027,8 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
* Release the request's reference to the old bfqq * Release the request's reference to the old bfqq
* and make sure one is taken to the shared queue. * and make sure one is taken to the shared queue.
*/ */
new_bfqq->allocated++; bfqq_request_allocated(new_bfqq);
bfqq->allocated--; bfqq_request_freed(bfqq);
new_bfqq->ref++; new_bfqq->ref++;
/* /*
* If the bic associated with the process * If the bic associated with the process
@ -6209,8 +6345,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq) static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq)
{ {
bfqq->allocated--; bfqq_request_freed(bfqq);
bfq_put_queue(bfqq); bfq_put_queue(bfqq);
} }
@ -6434,6 +6569,16 @@ static void bfq_finish_requeue_request(struct request *rq)
rq->elv.priv[1] = NULL; rq->elv.priv[1] = NULL;
} }
static void bfq_finish_request(struct request *rq)
{
bfq_finish_requeue_request(rq);
if (rq->elv.icq) {
put_io_context(rq->elv.icq->ioc);
rq->elv.icq = NULL;
}
}
/* /*
* Removes the association between the current task and bfqq, assuming * Removes the association between the current task and bfqq, assuming
* that bic points to the bfq iocontext of the task. * that bic points to the bfq iocontext of the task.
@ -6531,6 +6676,8 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
*/ */
static void bfq_prepare_request(struct request *rq) static void bfq_prepare_request(struct request *rq)
{ {
rq->elv.icq = ioc_find_get_icq(rq->q);
/* /*
* Regardless of whether we have an icq attached, we have to * Regardless of whether we have an icq attached, we have to
* clear the scheduler pointers, as they might point to * clear the scheduler pointers, as they might point to
@ -6630,7 +6777,7 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
} }
} }
bfqq->allocated++; bfqq_request_allocated(bfqq);
bfqq->ref++; bfqq->ref++;
bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d", bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d",
rq, bfqq, bfqq->ref); rq, bfqq, bfqq->ref);
@ -6793,11 +6940,11 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
* See the comments on bfq_limit_depth for the purpose of * See the comments on bfq_limit_depth for the purpose of
* the depths set in the function. Return minimum shallow depth we'll use. * the depths set in the function. Return minimum shallow depth we'll use.
*/ */
static unsigned int bfq_update_depths(struct bfq_data *bfqd, static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
struct sbitmap_queue *bt)
{ {
unsigned int i, j, min_shallow = UINT_MAX; unsigned int depth = 1U << bt->sb.shift;
bfqd->full_depth_shift = bt->sb.shift;
/* /*
* In-word depths if no bfq_queue is being weight-raised: * In-word depths if no bfq_queue is being weight-raised:
* leaving 25% of tags only for sync reads. * leaving 25% of tags only for sync reads.
@ -6809,13 +6956,13 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd,
* limit 'something'. * limit 'something'.
*/ */
/* no more than 50% of tags for async I/O */ /* no more than 50% of tags for async I/O */
bfqd->word_depths[0][0] = max((1U << bt->sb.shift) >> 1, 1U); bfqd->word_depths[0][0] = max(depth >> 1, 1U);
/* /*
* no more than 75% of tags for sync writes (25% extra tags * no more than 75% of tags for sync writes (25% extra tags
* w.r.t. async I/O, to prevent async I/O from starving sync * w.r.t. async I/O, to prevent async I/O from starving sync
* writes) * writes)
*/ */
bfqd->word_depths[0][1] = max(((1U << bt->sb.shift) * 3) >> 2, 1U); bfqd->word_depths[0][1] = max((depth * 3) >> 2, 1U);
/* /*
* In-word depths in case some bfq_queue is being weight- * In-word depths in case some bfq_queue is being weight-
@ -6825,25 +6972,18 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd,
* shortage. * shortage.
*/ */
/* no more than ~18% of tags for async I/O */ /* no more than ~18% of tags for async I/O */
bfqd->word_depths[1][0] = max(((1U << bt->sb.shift) * 3) >> 4, 1U); bfqd->word_depths[1][0] = max((depth * 3) >> 4, 1U);
/* no more than ~37% of tags for sync writes (~20% extra tags) */ /* no more than ~37% of tags for sync writes (~20% extra tags) */
bfqd->word_depths[1][1] = max(((1U << bt->sb.shift) * 6) >> 4, 1U); bfqd->word_depths[1][1] = max((depth * 6) >> 4, 1U);
for (i = 0; i < 2; i++)
for (j = 0; j < 2; j++)
min_shallow = min(min_shallow, bfqd->word_depths[i][j]);
return min_shallow;
} }
static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx) static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx)
{ {
struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
struct blk_mq_tags *tags = hctx->sched_tags; struct blk_mq_tags *tags = hctx->sched_tags;
unsigned int min_shallow;
min_shallow = bfq_update_depths(bfqd, tags->bitmap_tags); bfq_update_depths(bfqd, &tags->bitmap_tags);
sbitmap_queue_min_shallow_depth(tags->bitmap_tags, min_shallow); sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1);
} }
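bfq_update_depths() now derives its four shallow-depth caps directly from the scheduler tag depth (1U << bt->sb.shift) and records full_depth_shift so bfq_limit_depth() can rescale nr_requests: 50% and 75% of the tags when no queue is weight-raised, roughly 18% and 37% when one is. The same arithmetic as a stand-alone program (the shift value is just an example):

/* The four shallow-depth caps computed in bfq_update_depths(), as plain
 * arithmetic on a tag depth of 1 << shift. */
#include <stdio.h>

#define max_u(a, b) ((a) > (b) ? (a) : (b))

static void update_depths(unsigned int shift, unsigned int wd[2][2])
{
	unsigned int depth = 1U << shift;

	/* no queue weight-raised: 50% async, 75% sync writes */
	wd[0][0] = max_u(depth >> 1, 1U);
	wd[0][1] = max_u((depth * 3) >> 2, 1U);
	/* some queue weight-raised: ~18% async, ~37% sync writes */
	wd[1][0] = max_u((depth * 3) >> 4, 1U);
	wd[1][1] = max_u((depth * 6) >> 4, 1U);
}

int main(void)
{
	unsigned int wd[2][2];

	update_depths(6, wd);	/* 64 tags */
	printf("async %u/%u  sync-write %u/%u (normal/weight-raised)\n",
	       wd[0][0], wd[1][0], wd[0][1], wd[1][1]);
	/* prints: async 32/12  sync-write 48/24 */
	return 0;
}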
static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index) static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index)
@ -7260,7 +7400,7 @@ static struct elevator_type iosched_bfq_mq = {
.limit_depth = bfq_limit_depth, .limit_depth = bfq_limit_depth,
.prepare_request = bfq_prepare_request, .prepare_request = bfq_prepare_request,
.requeue_request = bfq_finish_requeue_request, .requeue_request = bfq_finish_requeue_request,
.finish_request = bfq_finish_requeue_request, .finish_request = bfq_finish_request,
.exit_icq = bfq_exit_icq, .exit_icq = bfq_exit_icq,
.insert_requests = bfq_insert_requests, .insert_requests = bfq_insert_requests,
.dispatch_request = bfq_dispatch_request, .dispatch_request = bfq_dispatch_request,

block/bfq-iosched.h

@ -25,7 +25,7 @@
#define BFQ_DEFAULT_GRP_IOPRIO 0 #define BFQ_DEFAULT_GRP_IOPRIO 0
#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE #define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
#define MAX_PID_STR_LENGTH 12 #define MAX_BFQQ_NAME_LENGTH 16
/* /*
* Soft real-time applications are extremely more latency sensitive * Soft real-time applications are extremely more latency sensitive
@ -170,6 +170,9 @@ struct bfq_entity {
/* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
int budget; int budget;
/* Number of requests allocated in the subtree of this entity */
int allocated;
/* device weight, if non-zero, it overrides the default weight of /* device weight, if non-zero, it overrides the default weight of
* bfq_group_data */ * bfq_group_data */
int dev_weight; int dev_weight;
@ -266,8 +269,6 @@ struct bfq_queue {
struct request *next_rq; struct request *next_rq;
/* number of sync and async requests queued */ /* number of sync and async requests queued */
int queued[2]; int queued[2];
/* number of requests currently allocated */
int allocated;
/* number of pending metadata requests */ /* number of pending metadata requests */
int meta_pending; int meta_pending;
/* fifo list of requests in sort_list */ /* fifo list of requests in sort_list */
@ -387,6 +388,8 @@ struct bfq_queue {
struct bfq_queue *tentative_waker_bfqq; struct bfq_queue *tentative_waker_bfqq;
/* number of times the same tentative waker has been detected */ /* number of times the same tentative waker has been detected */
unsigned int num_waker_detections; unsigned int num_waker_detections;
/* time when we started considering this waker */
u64 waker_detection_started;
/* node for woken_list, see below */ /* node for woken_list, see below */
struct hlist_node woken_list_node; struct hlist_node woken_list_node;
@ -768,6 +771,7 @@ struct bfq_data {
* function) * function)
*/ */
unsigned int word_depths[2][2]; unsigned int word_depths[2][2];
unsigned int full_depth_shift;
}; };
enum bfqq_state_flags { enum bfqq_state_flags {
@ -1079,26 +1083,27 @@ void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq);
/* --------------- end of interface of B-WF2Q+ ---------------- */ /* --------------- end of interface of B-WF2Q+ ---------------- */
/* Logging facilities. */ /* Logging facilities. */
static inline void bfq_pid_to_str(int pid, char *str, int len) static inline void bfq_bfqq_name(struct bfq_queue *bfqq, char *str, int len)
{ {
if (pid != -1) char type = bfq_bfqq_sync(bfqq) ? 'S' : 'A';
snprintf(str, len, "%d", pid);
if (bfqq->pid != -1)
snprintf(str, len, "bfq%d%c", bfqq->pid, type);
else else
snprintf(str, len, "SHARED-"); snprintf(str, len, "bfqSHARED-%c", type);
} }
#ifdef CONFIG_BFQ_GROUP_IOSCHED #ifdef CONFIG_BFQ_GROUP_IOSCHED
struct bfq_group *bfqq_group(struct bfq_queue *bfqq); struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
char pid_str[MAX_PID_STR_LENGTH]; \ char pid_str[MAX_BFQQ_NAME_LENGTH]; \
if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \ if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \
break; \ break; \
bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \ bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH); \
blk_add_cgroup_trace_msg((bfqd)->queue, \ blk_add_cgroup_trace_msg((bfqd)->queue, \
bfqg_to_blkg(bfqq_group(bfqq))->blkcg, \ bfqg_to_blkg(bfqq_group(bfqq))->blkcg, \
"bfq%s%c " fmt, pid_str, \ "%s " fmt, pid_str, ##args); \
bfq_bfqq_sync((bfqq)) ? 'S' : 'A', ##args); \
} while (0) } while (0)
#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \
@ -1109,13 +1114,11 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
#else /* CONFIG_BFQ_GROUP_IOSCHED */ #else /* CONFIG_BFQ_GROUP_IOSCHED */
#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
char pid_str[MAX_PID_STR_LENGTH]; \ char pid_str[MAX_BFQQ_NAME_LENGTH]; \
if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \ if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \
break; \ break; \
bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \ bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH); \
blk_add_trace_msg((bfqd)->queue, "bfq%s%c " fmt, pid_str, \ blk_add_trace_msg((bfqd)->queue, "%s " fmt, pid_str, ##args); \
bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
##args); \
} while (0) } while (0)
#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0)
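bfq-iosched.h replaces MAX_PID_STR_LENGTH (12) with MAX_BFQQ_NAME_LENGTH (16) because bfq_bfqq_name() now folds the "bfq" prefix and the sync/async suffix into the name itself ("bfq%d%c" or "bfqSHARED-%c"). A quick check that 16 bytes still fits the widest case, assuming the usual 64-bit pid_max ceiling of 4194304:

/* Sanity check of MAX_BFQQ_NAME_LENGTH = 16: the widest names are
 * "bfq<pid><S|A>" and "bfqSHARED-<S|A>". */
#include <stdio.h>

#define MAX_BFQQ_NAME_LENGTH 16

int main(void)
{
	char buf[MAX_BFQQ_NAME_LENGTH];
	int n1 = snprintf(buf, sizeof(buf), "bfq%d%c", 4194304, 'S');
	int n2 = snprintf(buf, sizeof(buf), "bfqSHARED-%c", 'A');

	/* both fit with room to spare: 11 characters plus the NUL */
	printf("%d %d (limit %d)\n", n1, n2, MAX_BFQQ_NAME_LENGTH - 1);
	return 0;
}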

block/bio-integrity.c

@ -6,7 +6,7 @@
* Written by: Martin K. Petersen <martin.petersen@oracle.com> * Written by: Martin K. Petersen <martin.petersen@oracle.com>
*/ */
#include <linux/blkdev.h> #include <linux/blk-integrity.h>
#include <linux/mempool.h> #include <linux/mempool.h>
#include <linux/export.h> #include <linux/export.h>
#include <linux/bio.h> #include <linux/bio.h>
@ -134,7 +134,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
iv = bip->bip_vec + bip->bip_vcnt; iv = bip->bip_vec + bip->bip_vcnt;
if (bip->bip_vcnt && if (bip->bip_vcnt &&
bvec_gap_to_prev(bio->bi_bdev->bd_disk->queue, bvec_gap_to_prev(bdev_get_queue(bio->bi_bdev),
&bip->bip_vec[bip->bip_vcnt - 1], offset)) &bip->bip_vec[bip->bip_vcnt - 1], offset))
return 0; return 0;

block/bio.c

@ -26,7 +26,7 @@
#include "blk-rq-qos.h" #include "blk-rq-qos.h"
struct bio_alloc_cache { struct bio_alloc_cache {
struct bio_list free_list; struct bio *free_list;
unsigned int nr; unsigned int nr;
}; };
@ -87,7 +87,8 @@ static struct bio_slab *create_bio_slab(unsigned int size)
snprintf(bslab->name, sizeof(bslab->name), "bio-%d", size); snprintf(bslab->name, sizeof(bslab->name), "bio-%d", size);
bslab->slab = kmem_cache_create(bslab->name, size, bslab->slab = kmem_cache_create(bslab->name, size,
ARCH_KMALLOC_MINALIGN, SLAB_HWCACHE_ALIGN, NULL); ARCH_KMALLOC_MINALIGN,
SLAB_HWCACHE_ALIGN | SLAB_TYPESAFE_BY_RCU, NULL);
if (!bslab->slab) if (!bslab->slab)
goto fail_alloc_slab; goto fail_alloc_slab;
@ -156,7 +157,7 @@ static void bio_put_slab(struct bio_set *bs)
void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs) void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs)
{ {
BIO_BUG_ON(nr_vecs > BIO_MAX_VECS); BUG_ON(nr_vecs > BIO_MAX_VECS);
if (nr_vecs == BIO_MAX_VECS) if (nr_vecs == BIO_MAX_VECS)
mempool_free(bv, pool); mempool_free(bv, pool);
@ -281,6 +282,7 @@ void bio_init(struct bio *bio, struct bio_vec *table,
atomic_set(&bio->__bi_remaining, 1); atomic_set(&bio->__bi_remaining, 1);
atomic_set(&bio->__bi_cnt, 1); atomic_set(&bio->__bi_cnt, 1);
bio->bi_cookie = BLK_QC_T_NONE;
bio->bi_max_vecs = max_vecs; bio->bi_max_vecs = max_vecs;
bio->bi_io_vec = table; bio->bi_io_vec = table;
@ -546,7 +548,7 @@ EXPORT_SYMBOL(zero_fill_bio);
* REQ_OP_READ, zero the truncated part. This function should only * REQ_OP_READ, zero the truncated part. This function should only
* be used for handling corner cases, such as bio eod. * be used for handling corner cases, such as bio eod.
*/ */
void bio_truncate(struct bio *bio, unsigned new_size) static void bio_truncate(struct bio *bio, unsigned new_size)
{ {
struct bio_vec bv; struct bio_vec bv;
struct bvec_iter iter; struct bvec_iter iter;
@ -629,7 +631,8 @@ static void bio_alloc_cache_prune(struct bio_alloc_cache *cache,
unsigned int i = 0; unsigned int i = 0;
struct bio *bio; struct bio *bio;
while ((bio = bio_list_pop(&cache->free_list)) != NULL) { while ((bio = cache->free_list) != NULL) {
cache->free_list = bio->bi_next;
cache->nr--; cache->nr--;
bio_free(bio); bio_free(bio);
if (++i == nr) if (++i == nr)
@ -678,7 +681,7 @@ static void bio_alloc_cache_destroy(struct bio_set *bs)
void bio_put(struct bio *bio) void bio_put(struct bio *bio)
{ {
if (unlikely(bio_flagged(bio, BIO_REFFED))) { if (unlikely(bio_flagged(bio, BIO_REFFED))) {
BIO_BUG_ON(!atomic_read(&bio->__bi_cnt)); BUG_ON(!atomic_read(&bio->__bi_cnt));
if (!atomic_dec_and_test(&bio->__bi_cnt)) if (!atomic_dec_and_test(&bio->__bi_cnt))
return; return;
} }
@ -688,7 +691,8 @@ void bio_put(struct bio *bio)
bio_uninit(bio); bio_uninit(bio);
cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu()); cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu());
bio_list_add_head(&cache->free_list, bio); bio->bi_next = cache->free_list;
cache->free_list = bio;
if (++cache->nr > ALLOC_CACHE_MAX + ALLOC_CACHE_SLACK) if (++cache->nr > ALLOC_CACHE_MAX + ALLOC_CACHE_SLACK)
bio_alloc_cache_prune(cache, ALLOC_CACHE_SLACK); bio_alloc_cache_prune(cache, ALLOC_CACHE_SLACK);
put_cpu(); put_cpu();
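The allocation-cache hunks above replace struct bio_list with a bare struct bio *free_list chained through bi_next: bio_put() pushes onto it, bio_alloc_kiocb() pops from it, and bio_alloc_cache_prune() drains it. A minimal userspace sketch of that push/pop discipline with a stand-in node type:

/* Sketch of the singly-linked free list the bio cache switches to
 * above: push on free, pop on allocate, chained through the object's
 * own next pointer. Stand-in node type, not the kernel struct bio. */
#include <stdio.h>
#include <stdlib.h>

struct node {
	struct node *next;
	int id;
};

struct cache {
	struct node *free_list;
	unsigned int nr;
};

static void cache_push(struct cache *c, struct node *n)
{
	n->next = c->free_list;
	c->free_list = n;
	c->nr++;
}

static struct node *cache_pop(struct cache *c)
{
	struct node *n = c->free_list;

	if (n) {
		c->free_list = n->next;
		c->nr--;
	}
	return n;
}

int main(void)
{
	struct cache c = { 0 };
	struct node *a = calloc(1, sizeof(*a)), *b = calloc(1, sizeof(*b));

	if (!a || !b)
		return 1;
	a->id = 1;
	b->id = 2;
	cache_push(&c, a);
	cache_push(&c, b);
	struct node *x = cache_pop(&c);
	struct node *y = cache_pop(&c);

	printf("pop %d, pop %d, nr %u\n", x->id, y->id, c.nr);	/* 2, 1, 0 */
	free(a);
	free(b);
	return 0;
}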
@ -773,6 +777,23 @@ const char *bio_devname(struct bio *bio, char *buf)
} }
EXPORT_SYMBOL(bio_devname); EXPORT_SYMBOL(bio_devname);
/**
* bio_full - check if the bio is full
* @bio: bio to check
* @len: length of one segment to be added
*
* Return true if @bio is full and one segment with @len bytes can't be
* added to the bio, otherwise return false
*/
static inline bool bio_full(struct bio *bio, unsigned len)
{
if (bio->bi_vcnt >= bio->bi_max_vecs)
return true;
if (bio->bi_iter.bi_size > UINT_MAX - len)
return true;
return false;
}
static inline bool page_is_mergeable(const struct bio_vec *bv, static inline bool page_is_mergeable(const struct bio_vec *bv,
struct page *page, unsigned int len, unsigned int off, struct page *page, unsigned int len, unsigned int off,
bool *same_page) bool *same_page)
@ -792,6 +813,44 @@ static inline bool page_is_mergeable(const struct bio_vec *bv,
return (bv->bv_page + bv_end / PAGE_SIZE) == (page + off / PAGE_SIZE); return (bv->bv_page + bv_end / PAGE_SIZE) == (page + off / PAGE_SIZE);
} }
/**
* __bio_try_merge_page - try appending data to an existing bvec.
* @bio: destination bio
* @page: start page to add
* @len: length of the data to add
* @off: offset of the data relative to @page
* @same_page: return if the segment has been merged inside the same page
*
* Try to add the data at @page + @off to the last bvec of @bio. This is a
* useful optimisation for file systems with a block size smaller than the
* page size.
*
* Warn if (@len, @off) crosses pages in case that @same_page is true.
*
* Return %true on success or %false on failure.
*/
static bool __bio_try_merge_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int off, bool *same_page)
{
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
return false;
if (bio->bi_vcnt > 0) {
struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
if (page_is_mergeable(bv, page, len, off, same_page)) {
if (bio->bi_iter.bi_size > UINT_MAX - len) {
*same_page = false;
return false;
}
bv->bv_len += len;
bio->bi_iter.bi_size += len;
return true;
}
}
return false;
}
/* /*
* Try to merge a page into a segment, while obeying the hardware segment * Try to merge a page into a segment, while obeying the hardware segment
* size limit. This is not for normal read/write bios, but for passthrough * size limit. This is not for normal read/write bios, but for passthrough
@ -909,7 +968,7 @@ EXPORT_SYMBOL(bio_add_pc_page);
int bio_add_zone_append_page(struct bio *bio, struct page *page, int bio_add_zone_append_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int offset) unsigned int len, unsigned int offset)
{ {
struct request_queue *q = bio->bi_bdev->bd_disk->queue; struct request_queue *q = bdev_get_queue(bio->bi_bdev);
bool same_page = false; bool same_page = false;
if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_ZONE_APPEND)) if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_ZONE_APPEND))
@ -923,45 +982,6 @@ int bio_add_zone_append_page(struct bio *bio, struct page *page,
} }
EXPORT_SYMBOL_GPL(bio_add_zone_append_page); EXPORT_SYMBOL_GPL(bio_add_zone_append_page);
/**
* __bio_try_merge_page - try appending data to an existing bvec.
* @bio: destination bio
* @page: start page to add
* @len: length of the data to add
* @off: offset of the data relative to @page
* @same_page: return if the segment has been merged inside the same page
*
* Try to add the data at @page + @off to the last bvec of @bio. This is a
* useful optimisation for file systems with a block size smaller than the
* page size.
*
* Warn if (@len, @off) crosses pages in case that @same_page is true.
*
* Return %true on success or %false on failure.
*/
bool __bio_try_merge_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int off, bool *same_page)
{
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
return false;
if (bio->bi_vcnt > 0) {
struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
if (page_is_mergeable(bv, page, len, off, same_page)) {
if (bio->bi_iter.bi_size > UINT_MAX - len) {
*same_page = false;
return false;
}
bv->bv_len += len;
bio->bi_iter.bi_size += len;
return true;
}
}
return false;
}
EXPORT_SYMBOL_GPL(__bio_try_merge_page);
/** /**
* __bio_add_page - add page(s) to a bio in a new segment * __bio_add_page - add page(s) to a bio in a new segment
* @bio: destination bio * @bio: destination bio
@ -1016,52 +1036,62 @@ int bio_add_page(struct bio *bio, struct page *page,
} }
EXPORT_SYMBOL(bio_add_page); EXPORT_SYMBOL(bio_add_page);
void bio_release_pages(struct bio *bio, bool mark_dirty) /**
* bio_add_folio - Attempt to add part of a folio to a bio.
* @bio: BIO to add to.
* @folio: Folio to add.
* @len: How many bytes from the folio to add.
* @off: First byte in this folio to add.
*
* Filesystems that use folios can call this function instead of calling
* bio_add_page() for each page in the folio. If @off is bigger than
* PAGE_SIZE, this function can create a bio_vec that starts in a page
* after the bv_page. BIOs do not support folios that are 4GiB or larger.
*
* Return: Whether the addition was successful.
*/
bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len,
size_t off)
{
if (len > UINT_MAX || off > UINT_MAX)
return 0;
return bio_add_page(bio, &folio->page, len, off) > 0;
}
void __bio_release_pages(struct bio *bio, bool mark_dirty)
{ {
struct bvec_iter_all iter_all; struct bvec_iter_all iter_all;
struct bio_vec *bvec; struct bio_vec *bvec;
if (bio_flagged(bio, BIO_NO_PAGE_REF))
return;
bio_for_each_segment_all(bvec, bio, iter_all) { bio_for_each_segment_all(bvec, bio, iter_all) {
if (mark_dirty && !PageCompound(bvec->bv_page)) if (mark_dirty && !PageCompound(bvec->bv_page))
set_page_dirty_lock(bvec->bv_page); set_page_dirty_lock(bvec->bv_page);
put_page(bvec->bv_page); put_page(bvec->bv_page);
} }
} }
EXPORT_SYMBOL_GPL(bio_release_pages); EXPORT_SYMBOL_GPL(__bio_release_pages);
static void __bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
{ {
size_t size = iov_iter_count(iter);
WARN_ON_ONCE(bio->bi_max_vecs); WARN_ON_ONCE(bio->bi_max_vecs);
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
size_t max_sectors = queue_max_zone_append_sectors(q);
size = min(size, max_sectors << SECTOR_SHIFT);
}
bio->bi_vcnt = iter->nr_segs; bio->bi_vcnt = iter->nr_segs;
bio->bi_io_vec = (struct bio_vec *)iter->bvec; bio->bi_io_vec = (struct bio_vec *)iter->bvec;
bio->bi_iter.bi_bvec_done = iter->iov_offset; bio->bi_iter.bi_bvec_done = iter->iov_offset;
bio->bi_iter.bi_size = iter->count; bio->bi_iter.bi_size = size;
bio_set_flag(bio, BIO_NO_PAGE_REF); bio_set_flag(bio, BIO_NO_PAGE_REF);
bio_set_flag(bio, BIO_CLONED); bio_set_flag(bio, BIO_CLONED);
} }
static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
{
__bio_iov_bvec_set(bio, iter);
iov_iter_advance(iter, iter->count);
return 0;
}
static int bio_iov_bvec_set_append(struct bio *bio, struct iov_iter *iter)
{
struct request_queue *q = bio->bi_bdev->bd_disk->queue;
struct iov_iter i = *iter;
iov_iter_truncate(&i, queue_max_zone_append_sectors(q) << 9);
__bio_iov_bvec_set(bio, &i);
iov_iter_advance(iter, i.count);
return 0;
}
static void bio_put_pages(struct page **pages, size_t size, size_t off) static void bio_put_pages(struct page **pages, size_t size, size_t off)
{ {
size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE); size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE);
@ -1131,7 +1161,7 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
{ {
unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
struct request_queue *q = bio->bi_bdev->bd_disk->queue; struct request_queue *q = bdev_get_queue(bio->bi_bdev);
unsigned int max_append_sectors = queue_max_zone_append_sectors(q); unsigned int max_append_sectors = queue_max_zone_append_sectors(q);
struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
struct page **pages = (struct page **)bv; struct page **pages = (struct page **)bv;
@ -1203,9 +1233,9 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
int ret = 0; int ret = 0;
if (iov_iter_is_bvec(iter)) { if (iov_iter_is_bvec(iter)) {
if (bio_op(bio) == REQ_OP_ZONE_APPEND) bio_iov_bvec_set(bio, iter);
return bio_iov_bvec_set_append(bio, iter); iov_iter_advance(iter, bio->bi_iter.bi_size);
return bio_iov_bvec_set(bio, iter); return 0;
} }
do { do {
@ -1261,18 +1291,7 @@ int submit_bio_wait(struct bio *bio)
} }
EXPORT_SYMBOL(submit_bio_wait); EXPORT_SYMBOL(submit_bio_wait);
/** void __bio_advance(struct bio *bio, unsigned bytes)
* bio_advance - increment/complete a bio by some number of bytes
* @bio: bio to advance
* @bytes: number of bytes to complete
*
* This updates bi_sector, bi_size and bi_idx; if the number of bytes to
* complete doesn't align with a bvec boundary, then bv_len and bv_offset will
* be updated on the last bvec as well.
*
* @bio will then represent the remaining, uncompleted portion of the io.
*/
void bio_advance(struct bio *bio, unsigned bytes)
{ {
if (bio_integrity(bio)) if (bio_integrity(bio))
bio_integrity_advance(bio, bytes); bio_integrity_advance(bio, bytes);
@ -1280,7 +1299,7 @@ void bio_advance(struct bio *bio, unsigned bytes)
bio_crypt_advance(bio, bytes); bio_crypt_advance(bio, bytes);
bio_advance_iter(bio, &bio->bi_iter, bytes); bio_advance_iter(bio, &bio->bi_iter, bytes);
} }
EXPORT_SYMBOL(bio_advance); EXPORT_SYMBOL(__bio_advance);
void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
struct bio *src, struct bvec_iter *src_iter) struct bio *src, struct bvec_iter *src_iter)
@ -1468,10 +1487,10 @@ void bio_endio(struct bio *bio)
return; return;
if (bio->bi_bdev && bio_flagged(bio, BIO_TRACKED)) if (bio->bi_bdev && bio_flagged(bio, BIO_TRACKED))
rq_qos_done_bio(bio->bi_bdev->bd_disk->queue, bio); rq_qos_done_bio(bdev_get_queue(bio->bi_bdev), bio);
if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
trace_block_bio_complete(bio->bi_bdev->bd_disk->queue, bio); trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio);
bio_clear_flag(bio, BIO_TRACE_COMPLETION); bio_clear_flag(bio, BIO_TRACE_COMPLETION);
} }
@ -1710,8 +1729,9 @@ struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs,
return bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs); return bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs);
cache = per_cpu_ptr(bs->cache, get_cpu()); cache = per_cpu_ptr(bs->cache, get_cpu());
bio = bio_list_pop(&cache->free_list); if (cache->free_list) {
if (bio) { bio = cache->free_list;
cache->free_list = bio->bi_next;
cache->nr--; cache->nr--;
put_cpu(); put_cpu();
bio_init(bio, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs); bio_init(bio, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs);

block/blk-cgroup.c

@ -30,8 +30,10 @@
#include <linux/blk-cgroup.h> #include <linux/blk-cgroup.h>
#include <linux/tracehook.h> #include <linux/tracehook.h>
#include <linux/psi.h> #include <linux/psi.h>
#include <linux/part_stat.h>
#include "blk.h" #include "blk.h"
#include "blk-ioprio.h" #include "blk-ioprio.h"
#include "blk-throttle.h"
/* /*
* blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation. * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
@ -620,7 +622,7 @@ struct block_device *blkcg_conf_open_bdev(char **inputp)
*/ */
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
char *input, struct blkg_conf_ctx *ctx) char *input, struct blkg_conf_ctx *ctx)
__acquires(rcu) __acquires(&bdev->bd_disk->queue->queue_lock) __acquires(rcu) __acquires(&bdev->bd_queue->queue_lock)
{ {
struct block_device *bdev; struct block_device *bdev;
struct request_queue *q; struct request_queue *q;
@ -631,7 +633,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
if (IS_ERR(bdev)) if (IS_ERR(bdev))
return PTR_ERR(bdev); return PTR_ERR(bdev);
q = bdev->bd_disk->queue; q = bdev_get_queue(bdev);
/* /*
* blkcg_deactivate_policy() requires queue to be frozen, we can grab * blkcg_deactivate_policy() requires queue to be frozen, we can grab
@ -747,9 +749,9 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep);
* with blkg_conf_prep(). * with blkg_conf_prep().
*/ */
void blkg_conf_finish(struct blkg_conf_ctx *ctx) void blkg_conf_finish(struct blkg_conf_ctx *ctx)
__releases(&ctx->bdev->bd_disk->queue->queue_lock) __releases(rcu) __releases(&ctx->bdev->bd_queue->queue_lock) __releases(rcu)
{ {
spin_unlock_irq(&ctx->bdev->bd_disk->queue->queue_lock); spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock);
rcu_read_unlock(); rcu_read_unlock();
blkdev_put_no_open(ctx->bdev); blkdev_put_no_open(ctx->bdev);
} }
@ -852,7 +854,7 @@ static void blkcg_fill_root_iostats(void)
while ((dev = class_dev_iter_next(&iter))) { while ((dev = class_dev_iter_next(&iter))) {
struct block_device *bdev = dev_to_bdev(dev); struct block_device *bdev = dev_to_bdev(dev);
struct blkcg_gq *blkg = struct blkcg_gq *blkg =
blk_queue_root_blkg(bdev->bd_disk->queue); blk_queue_root_blkg(bdev_get_queue(bdev));
struct blkg_iostat tmp; struct blkg_iostat tmp;
int cpu; int cpu;
@ -1811,7 +1813,7 @@ static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio,
rcu_read_lock(); rcu_read_lock();
blkg = blkg_lookup_create(css_to_blkcg(css), blkg = blkg_lookup_create(css_to_blkcg(css),
bio->bi_bdev->bd_disk->queue); bdev_get_queue(bio->bi_bdev));
while (blkg) { while (blkg) {
if (blkg_tryget(blkg)) { if (blkg_tryget(blkg)) {
ret_blkg = blkg; ret_blkg = blkg;
@ -1847,8 +1849,8 @@ void bio_associate_blkg_from_css(struct bio *bio,
if (css && css->parent) { if (css && css->parent) {
bio->bi_blkg = blkg_tryget_closest(bio, css); bio->bi_blkg = blkg_tryget_closest(bio, css);
} else { } else {
blkg_get(bio->bi_bdev->bd_disk->queue->root_blkg); blkg_get(bdev_get_queue(bio->bi_bdev)->root_blkg);
bio->bi_blkg = bio->bi_bdev->bd_disk->queue->root_blkg; bio->bi_blkg = bdev_get_queue(bio->bi_bdev)->root_blkg;
} }
} }
EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
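Most of the blk-cgroup churn above is a mechanical conversion from open-coded bio->bi_bdev->bd_disk->queue chains to the bdev_get_queue() helper. As a hedged illustration of the pattern being replaced (the real helper lives in the blkdev headers and may read a cached queue pointer instead), the old accessor boils down to:

/* Sketch only: what the open-coded call sites above were doing before the
 * conversion; not the actual bdev_get_queue() implementation. */
static inline struct request_queue *example_queue_of_bdev(struct block_device *bdev)
{
        return bdev->bd_disk->queue;
}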

File diff suppressed because it is too large.


@ -12,12 +12,13 @@
#include <crypto/skcipher.h> #include <crypto/skcipher.h>
#include <linux/blk-cgroup.h> #include <linux/blk-cgroup.h>
#include <linux/blk-crypto.h> #include <linux/blk-crypto.h>
#include <linux/blk-crypto-profile.h>
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/crypto.h> #include <linux/crypto.h>
#include <linux/keyslot-manager.h>
#include <linux/mempool.h> #include <linux/mempool.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/random.h> #include <linux/random.h>
#include <linux/scatterlist.h>
#include "blk-crypto-internal.h" #include "blk-crypto-internal.h"
@ -72,12 +73,12 @@ static mempool_t *bio_fallback_crypt_ctx_pool;
static DEFINE_MUTEX(tfms_init_lock); static DEFINE_MUTEX(tfms_init_lock);
static bool tfms_inited[BLK_ENCRYPTION_MODE_MAX]; static bool tfms_inited[BLK_ENCRYPTION_MODE_MAX];
static struct blk_crypto_keyslot { static struct blk_crypto_fallback_keyslot {
enum blk_crypto_mode_num crypto_mode; enum blk_crypto_mode_num crypto_mode;
struct crypto_skcipher *tfms[BLK_ENCRYPTION_MODE_MAX]; struct crypto_skcipher *tfms[BLK_ENCRYPTION_MODE_MAX];
} *blk_crypto_keyslots; } *blk_crypto_keyslots;
static struct blk_keyslot_manager blk_crypto_ksm; static struct blk_crypto_profile blk_crypto_fallback_profile;
static struct workqueue_struct *blk_crypto_wq; static struct workqueue_struct *blk_crypto_wq;
static mempool_t *blk_crypto_bounce_page_pool; static mempool_t *blk_crypto_bounce_page_pool;
static struct bio_set crypto_bio_split; static struct bio_set crypto_bio_split;
@ -88,9 +89,9 @@ static struct bio_set crypto_bio_split;
*/ */
static u8 blank_key[BLK_CRYPTO_MAX_KEY_SIZE]; static u8 blank_key[BLK_CRYPTO_MAX_KEY_SIZE];
static void blk_crypto_evict_keyslot(unsigned int slot) static void blk_crypto_fallback_evict_keyslot(unsigned int slot)
{ {
struct blk_crypto_keyslot *slotp = &blk_crypto_keyslots[slot]; struct blk_crypto_fallback_keyslot *slotp = &blk_crypto_keyslots[slot];
enum blk_crypto_mode_num crypto_mode = slotp->crypto_mode; enum blk_crypto_mode_num crypto_mode = slotp->crypto_mode;
int err; int err;
@ -103,45 +104,41 @@ static void blk_crypto_evict_keyslot(unsigned int slot)
slotp->crypto_mode = BLK_ENCRYPTION_MODE_INVALID; slotp->crypto_mode = BLK_ENCRYPTION_MODE_INVALID;
} }
static int blk_crypto_keyslot_program(struct blk_keyslot_manager *ksm, static int
const struct blk_crypto_key *key, blk_crypto_fallback_keyslot_program(struct blk_crypto_profile *profile,
unsigned int slot) const struct blk_crypto_key *key,
unsigned int slot)
{ {
struct blk_crypto_keyslot *slotp = &blk_crypto_keyslots[slot]; struct blk_crypto_fallback_keyslot *slotp = &blk_crypto_keyslots[slot];
const enum blk_crypto_mode_num crypto_mode = const enum blk_crypto_mode_num crypto_mode =
key->crypto_cfg.crypto_mode; key->crypto_cfg.crypto_mode;
int err; int err;
if (crypto_mode != slotp->crypto_mode && if (crypto_mode != slotp->crypto_mode &&
slotp->crypto_mode != BLK_ENCRYPTION_MODE_INVALID) slotp->crypto_mode != BLK_ENCRYPTION_MODE_INVALID)
blk_crypto_evict_keyslot(slot); blk_crypto_fallback_evict_keyslot(slot);
slotp->crypto_mode = crypto_mode; slotp->crypto_mode = crypto_mode;
err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], key->raw, err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], key->raw,
key->size); key->size);
if (err) { if (err) {
blk_crypto_evict_keyslot(slot); blk_crypto_fallback_evict_keyslot(slot);
return err; return err;
} }
return 0; return 0;
} }
static int blk_crypto_keyslot_evict(struct blk_keyslot_manager *ksm, static int blk_crypto_fallback_keyslot_evict(struct blk_crypto_profile *profile,
const struct blk_crypto_key *key, const struct blk_crypto_key *key,
unsigned int slot) unsigned int slot)
{ {
blk_crypto_evict_keyslot(slot); blk_crypto_fallback_evict_keyslot(slot);
return 0; return 0;
} }
/* static const struct blk_crypto_ll_ops blk_crypto_fallback_ll_ops = {
* The crypto API fallback KSM ops - only used for a bio when it specifies a .keyslot_program = blk_crypto_fallback_keyslot_program,
* blk_crypto_key that was not supported by the device's inline encryption .keyslot_evict = blk_crypto_fallback_keyslot_evict,
* hardware.
*/
static const struct blk_ksm_ll_ops blk_crypto_ksm_ll_ops = {
.keyslot_program = blk_crypto_keyslot_program,
.keyslot_evict = blk_crypto_keyslot_evict,
}; };
static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio) static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio)
@ -159,7 +156,7 @@ static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio)
bio_endio(src_bio); bio_endio(src_bio);
} }
static struct bio *blk_crypto_clone_bio(struct bio *bio_src) static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src)
{ {
struct bvec_iter iter; struct bvec_iter iter;
struct bio_vec bv; struct bio_vec bv;
@ -186,13 +183,14 @@ static struct bio *blk_crypto_clone_bio(struct bio *bio_src)
return bio; return bio;
} }
static bool blk_crypto_alloc_cipher_req(struct blk_ksm_keyslot *slot, static bool
struct skcipher_request **ciph_req_ret, blk_crypto_fallback_alloc_cipher_req(struct blk_crypto_keyslot *slot,
struct crypto_wait *wait) struct skcipher_request **ciph_req_ret,
struct crypto_wait *wait)
{ {
struct skcipher_request *ciph_req; struct skcipher_request *ciph_req;
const struct blk_crypto_keyslot *slotp; const struct blk_crypto_fallback_keyslot *slotp;
int keyslot_idx = blk_ksm_get_slot_idx(slot); int keyslot_idx = blk_crypto_keyslot_index(slot);
slotp = &blk_crypto_keyslots[keyslot_idx]; slotp = &blk_crypto_keyslots[keyslot_idx];
ciph_req = skcipher_request_alloc(slotp->tfms[slotp->crypto_mode], ciph_req = skcipher_request_alloc(slotp->tfms[slotp->crypto_mode],
@ -209,7 +207,7 @@ static bool blk_crypto_alloc_cipher_req(struct blk_ksm_keyslot *slot,
return true; return true;
} }
static bool blk_crypto_split_bio_if_needed(struct bio **bio_ptr) static bool blk_crypto_fallback_split_bio_if_needed(struct bio **bio_ptr)
{ {
struct bio *bio = *bio_ptr; struct bio *bio = *bio_ptr;
unsigned int i = 0; unsigned int i = 0;
@ -264,7 +262,7 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr)
{ {
struct bio *src_bio, *enc_bio; struct bio *src_bio, *enc_bio;
struct bio_crypt_ctx *bc; struct bio_crypt_ctx *bc;
struct blk_ksm_keyslot *slot; struct blk_crypto_keyslot *slot;
int data_unit_size; int data_unit_size;
struct skcipher_request *ciph_req = NULL; struct skcipher_request *ciph_req = NULL;
DECLARE_CRYPTO_WAIT(wait); DECLARE_CRYPTO_WAIT(wait);
@ -276,7 +274,7 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr)
blk_status_t blk_st; blk_status_t blk_st;
/* Split the bio if it's too big for single page bvec */ /* Split the bio if it's too big for single page bvec */
if (!blk_crypto_split_bio_if_needed(bio_ptr)) if (!blk_crypto_fallback_split_bio_if_needed(bio_ptr))
return false; return false;
src_bio = *bio_ptr; src_bio = *bio_ptr;
@ -284,24 +282,25 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr)
data_unit_size = bc->bc_key->crypto_cfg.data_unit_size; data_unit_size = bc->bc_key->crypto_cfg.data_unit_size;
/* Allocate bounce bio for encryption */ /* Allocate bounce bio for encryption */
enc_bio = blk_crypto_clone_bio(src_bio); enc_bio = blk_crypto_fallback_clone_bio(src_bio);
if (!enc_bio) { if (!enc_bio) {
src_bio->bi_status = BLK_STS_RESOURCE; src_bio->bi_status = BLK_STS_RESOURCE;
return false; return false;
} }
/* /*
* Use the crypto API fallback keyslot manager to get a crypto_skcipher * Get a blk-crypto-fallback keyslot that contains a crypto_skcipher for
* for the algorithm and key specified for this bio. * this bio's algorithm and key.
*/ */
blk_st = blk_ksm_get_slot_for_key(&blk_crypto_ksm, bc->bc_key, &slot); blk_st = blk_crypto_get_keyslot(&blk_crypto_fallback_profile,
bc->bc_key, &slot);
if (blk_st != BLK_STS_OK) { if (blk_st != BLK_STS_OK) {
src_bio->bi_status = blk_st; src_bio->bi_status = blk_st;
goto out_put_enc_bio; goto out_put_enc_bio;
} }
/* and then allocate an skcipher_request for it */ /* and then allocate an skcipher_request for it */
if (!blk_crypto_alloc_cipher_req(slot, &ciph_req, &wait)) { if (!blk_crypto_fallback_alloc_cipher_req(slot, &ciph_req, &wait)) {
src_bio->bi_status = BLK_STS_RESOURCE; src_bio->bi_status = BLK_STS_RESOURCE;
goto out_release_keyslot; goto out_release_keyslot;
} }
@ -362,7 +361,7 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr)
out_free_ciph_req: out_free_ciph_req:
skcipher_request_free(ciph_req); skcipher_request_free(ciph_req);
out_release_keyslot: out_release_keyslot:
blk_ksm_put_slot(slot); blk_crypto_put_keyslot(slot);
out_put_enc_bio: out_put_enc_bio:
if (enc_bio) if (enc_bio)
bio_put(enc_bio); bio_put(enc_bio);
@ -380,7 +379,7 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work)
container_of(work, struct bio_fallback_crypt_ctx, work); container_of(work, struct bio_fallback_crypt_ctx, work);
struct bio *bio = f_ctx->bio; struct bio *bio = f_ctx->bio;
struct bio_crypt_ctx *bc = &f_ctx->crypt_ctx; struct bio_crypt_ctx *bc = &f_ctx->crypt_ctx;
struct blk_ksm_keyslot *slot; struct blk_crypto_keyslot *slot;
struct skcipher_request *ciph_req = NULL; struct skcipher_request *ciph_req = NULL;
DECLARE_CRYPTO_WAIT(wait); DECLARE_CRYPTO_WAIT(wait);
u64 curr_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; u64 curr_dun[BLK_CRYPTO_DUN_ARRAY_SIZE];
@ -393,17 +392,18 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work)
blk_status_t blk_st; blk_status_t blk_st;
/* /*
* Use the crypto API fallback keyslot manager to get a crypto_skcipher * Get a blk-crypto-fallback keyslot that contains a crypto_skcipher for
* for the algorithm and key specified for this bio. * this bio's algorithm and key.
*/ */
blk_st = blk_ksm_get_slot_for_key(&blk_crypto_ksm, bc->bc_key, &slot); blk_st = blk_crypto_get_keyslot(&blk_crypto_fallback_profile,
bc->bc_key, &slot);
if (blk_st != BLK_STS_OK) { if (blk_st != BLK_STS_OK) {
bio->bi_status = blk_st; bio->bi_status = blk_st;
goto out_no_keyslot; goto out_no_keyslot;
} }
/* and then allocate an skcipher_request for it */ /* and then allocate an skcipher_request for it */
if (!blk_crypto_alloc_cipher_req(slot, &ciph_req, &wait)) { if (!blk_crypto_fallback_alloc_cipher_req(slot, &ciph_req, &wait)) {
bio->bi_status = BLK_STS_RESOURCE; bio->bi_status = BLK_STS_RESOURCE;
goto out; goto out;
} }
@ -434,7 +434,7 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work)
out: out:
skcipher_request_free(ciph_req); skcipher_request_free(ciph_req);
blk_ksm_put_slot(slot); blk_crypto_put_keyslot(slot);
out_no_keyslot: out_no_keyslot:
mempool_free(f_ctx, bio_fallback_crypt_ctx_pool); mempool_free(f_ctx, bio_fallback_crypt_ctx_pool);
bio_endio(bio); bio_endio(bio);
@ -473,9 +473,9 @@ static void blk_crypto_fallback_decrypt_endio(struct bio *bio)
* @bio_ptr: pointer to the bio to prepare * @bio_ptr: pointer to the bio to prepare
* *
* If bio is doing a WRITE operation, this splits the bio into two parts if it's * If bio is doing a WRITE operation, this splits the bio into two parts if it's
* too big (see blk_crypto_split_bio_if_needed). It then allocates a bounce bio * too big (see blk_crypto_fallback_split_bio_if_needed()). It then allocates a
* for the first part, encrypts it, and update bio_ptr to point to the bounce * bounce bio for the first part, encrypts it, and updates bio_ptr to point to
* bio. * the bounce bio.
* *
* For a READ operation, we mark the bio for decryption by using bi_private and * For a READ operation, we mark the bio for decryption by using bi_private and
* bi_end_io. * bi_end_io.
@ -499,8 +499,8 @@ bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr)
return false; return false;
} }
if (!blk_ksm_crypto_cfg_supported(&blk_crypto_ksm, if (!__blk_crypto_cfg_supported(&blk_crypto_fallback_profile,
&bc->bc_key->crypto_cfg)) { &bc->bc_key->crypto_cfg)) {
bio->bi_status = BLK_STS_NOTSUPP; bio->bi_status = BLK_STS_NOTSUPP;
return false; return false;
} }
@ -526,7 +526,7 @@ bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr)
int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key) int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key)
{ {
return blk_ksm_evict_key(&blk_crypto_ksm, key); return __blk_crypto_evict_key(&blk_crypto_fallback_profile, key);
} }
static bool blk_crypto_fallback_inited; static bool blk_crypto_fallback_inited;
@ -534,6 +534,7 @@ static int blk_crypto_fallback_init(void)
{ {
int i; int i;
int err; int err;
struct blk_crypto_profile *profile = &blk_crypto_fallback_profile;
if (blk_crypto_fallback_inited) if (blk_crypto_fallback_inited)
return 0; return 0;
@ -544,24 +545,24 @@ static int blk_crypto_fallback_init(void)
if (err) if (err)
goto out; goto out;
err = blk_ksm_init(&blk_crypto_ksm, blk_crypto_num_keyslots); err = blk_crypto_profile_init(profile, blk_crypto_num_keyslots);
if (err) if (err)
goto fail_free_bioset; goto fail_free_bioset;
err = -ENOMEM; err = -ENOMEM;
blk_crypto_ksm.ksm_ll_ops = blk_crypto_ksm_ll_ops; profile->ll_ops = blk_crypto_fallback_ll_ops;
blk_crypto_ksm.max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE; profile->max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE;
/* All blk-crypto modes have a crypto API fallback. */ /* All blk-crypto modes have a crypto API fallback. */
for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++) for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++)
blk_crypto_ksm.crypto_modes_supported[i] = 0xFFFFFFFF; profile->modes_supported[i] = 0xFFFFFFFF;
blk_crypto_ksm.crypto_modes_supported[BLK_ENCRYPTION_MODE_INVALID] = 0; profile->modes_supported[BLK_ENCRYPTION_MODE_INVALID] = 0;
blk_crypto_wq = alloc_workqueue("blk_crypto_wq", blk_crypto_wq = alloc_workqueue("blk_crypto_wq",
WQ_UNBOUND | WQ_HIGHPRI | WQ_UNBOUND | WQ_HIGHPRI |
WQ_MEM_RECLAIM, num_online_cpus()); WQ_MEM_RECLAIM, num_online_cpus());
if (!blk_crypto_wq) if (!blk_crypto_wq)
goto fail_free_ksm; goto fail_destroy_profile;
blk_crypto_keyslots = kcalloc(blk_crypto_num_keyslots, blk_crypto_keyslots = kcalloc(blk_crypto_num_keyslots,
sizeof(blk_crypto_keyslots[0]), sizeof(blk_crypto_keyslots[0]),
@ -595,8 +596,8 @@ static int blk_crypto_fallback_init(void)
kfree(blk_crypto_keyslots); kfree(blk_crypto_keyslots);
fail_free_wq: fail_free_wq:
destroy_workqueue(blk_crypto_wq); destroy_workqueue(blk_crypto_wq);
fail_free_ksm: fail_destroy_profile:
blk_ksm_destroy(&blk_crypto_ksm); blk_crypto_profile_destroy(profile);
fail_free_bioset: fail_free_bioset:
bioset_exit(&crypto_bio_split); bioset_exit(&crypto_bio_split);
out: out:
@ -610,7 +611,7 @@ static int blk_crypto_fallback_init(void)
int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num) int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num)
{ {
const char *cipher_str = blk_crypto_modes[mode_num].cipher_str; const char *cipher_str = blk_crypto_modes[mode_num].cipher_str;
struct blk_crypto_keyslot *slotp; struct blk_crypto_fallback_keyslot *slotp;
unsigned int i; unsigned int i;
int err = 0; int err = 0;
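The rename from the keyslot-manager API to blk_crypto_profile in this file is largely mechanical: the fallback still supplies program/evict callbacks, just through profile->ll_ops. A hedged sketch of the setup pattern follows; the field names and constants are taken from the hunks above, while the wrapper function itself is illustrative only.

/* Illustrative only: wiring a crypto profile the way the fallback init path
 * above does. Cleanup on failure and unrelated setup are omitted. */
static int example_fallback_profile_setup(struct blk_crypto_profile *profile,
                                          unsigned int num_slots)
{
        int i, err;

        err = blk_crypto_profile_init(profile, num_slots);
        if (err)
                return err;

        profile->ll_ops = blk_crypto_fallback_ll_ops;
        profile->max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE;
        /* every mode is handled in software by the fallback */
        for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++)
                profile->modes_supported[i] = 0xFFFFFFFF;
        profile->modes_supported[BLK_ENCRYPTION_MODE_INVALID] = 0;
        return 0;
}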


@ -11,7 +11,7 @@
#include <linux/bio.h> #include <linux/bio.h>
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/keyslot-manager.h> #include <linux/blk-crypto-profile.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/slab.h> #include <linux/slab.h>
@ -218,8 +218,9 @@ static bool bio_crypt_check_alignment(struct bio *bio)
blk_status_t __blk_crypto_init_request(struct request *rq) blk_status_t __blk_crypto_init_request(struct request *rq)
{ {
return blk_ksm_get_slot_for_key(rq->q->ksm, rq->crypt_ctx->bc_key, return blk_crypto_get_keyslot(rq->q->crypto_profile,
&rq->crypt_keyslot); rq->crypt_ctx->bc_key,
&rq->crypt_keyslot);
} }
/** /**
@ -233,7 +234,7 @@ blk_status_t __blk_crypto_init_request(struct request *rq)
*/ */
void __blk_crypto_free_request(struct request *rq) void __blk_crypto_free_request(struct request *rq)
{ {
blk_ksm_put_slot(rq->crypt_keyslot); blk_crypto_put_keyslot(rq->crypt_keyslot);
mempool_free(rq->crypt_ctx, bio_crypt_ctx_pool); mempool_free(rq->crypt_ctx, bio_crypt_ctx_pool);
blk_crypto_rq_set_defaults(rq); blk_crypto_rq_set_defaults(rq);
} }
@ -264,6 +265,7 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr)
{ {
struct bio *bio = *bio_ptr; struct bio *bio = *bio_ptr;
const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key; const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key;
struct blk_crypto_profile *profile;
/* Error if bio has no data. */ /* Error if bio has no data. */
if (WARN_ON_ONCE(!bio_has_data(bio))) { if (WARN_ON_ONCE(!bio_has_data(bio))) {
@ -280,8 +282,8 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr)
* Success if device supports the encryption context, or if we succeeded * Success if device supports the encryption context, or if we succeeded
* in falling back to the crypto API. * in falling back to the crypto API.
*/ */
if (blk_ksm_crypto_cfg_supported(bio->bi_bdev->bd_disk->queue->ksm, profile = bdev_get_queue(bio->bi_bdev)->crypto_profile;
&bc_key->crypto_cfg)) if (__blk_crypto_cfg_supported(profile, &bc_key->crypto_cfg))
return true; return true;
if (blk_crypto_fallback_bio_prep(bio_ptr)) if (blk_crypto_fallback_bio_prep(bio_ptr))
@ -357,7 +359,7 @@ bool blk_crypto_config_supported(struct request_queue *q,
const struct blk_crypto_config *cfg) const struct blk_crypto_config *cfg)
{ {
return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) || return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) ||
blk_ksm_crypto_cfg_supported(q->ksm, cfg); __blk_crypto_cfg_supported(q->crypto_profile, cfg);
} }
/** /**
@ -378,7 +380,7 @@ bool blk_crypto_config_supported(struct request_queue *q,
int blk_crypto_start_using_key(const struct blk_crypto_key *key, int blk_crypto_start_using_key(const struct blk_crypto_key *key,
struct request_queue *q) struct request_queue *q)
{ {
if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg)) if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg))
return 0; return 0;
return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode); return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode);
} }
@ -394,18 +396,17 @@ int blk_crypto_start_using_key(const struct blk_crypto_key *key,
* evicted from any hardware that it might have been programmed into. The key * evicted from any hardware that it might have been programmed into. The key
* must not be in use by any in-flight IO when this function is called. * must not be in use by any in-flight IO when this function is called.
* *
* Return: 0 on success or if key is not present in the q's ksm, -err on error. * Return: 0 on success or if the key wasn't in any keyslot; -errno on error.
*/ */
int blk_crypto_evict_key(struct request_queue *q, int blk_crypto_evict_key(struct request_queue *q,
const struct blk_crypto_key *key) const struct blk_crypto_key *key)
{ {
if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg)) if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg))
return blk_ksm_evict_key(q->ksm, key); return __blk_crypto_evict_key(q->crypto_profile, key);
/* /*
* If the request queue's associated inline encryption hardware didn't * If the request_queue didn't support the key, then blk-crypto-fallback
* have support for the key, then the key might have been programmed * may have been used, so try to evict the key from blk-crypto-fallback.
* into the fallback keyslot manager, so try to evict from there.
*/ */
return blk_crypto_fallback_evict_key(key); return blk_crypto_fallback_evict_key(key);
} }
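As a usage note for the updated blk_crypto_evict_key() semantics: per the reworded comment, "the key was not programmed anywhere" counts as success, so a caller tearing down a key only needs to react to real errors. A hedged sketch (the helper name and logging are illustrative, not part of the patch):

/* Illustrative teardown helper, not from the patch above. */
static void example_evict_and_wipe(struct request_queue *q,
                                   struct blk_crypto_key *key)
{
        int err = blk_crypto_evict_key(q, key);

        if (err)
                pr_warn("blk-crypto: key eviction failed: %d\n", err);
        memzero_explicit(key, sizeof(*key));    /* don't leave raw key bytes around */
}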


@ -69,6 +69,7 @@
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/gfp.h> #include <linux/gfp.h>
#include <linux/blk-mq.h> #include <linux/blk-mq.h>
#include <linux/part_stat.h>
#include "blk.h" #include "blk.h"
#include "blk-mq.h" #include "blk-mq.h"
@ -95,6 +96,12 @@ enum {
static void blk_kick_flush(struct request_queue *q, static void blk_kick_flush(struct request_queue *q,
struct blk_flush_queue *fq, unsigned int flags); struct blk_flush_queue *fq, unsigned int flags);
static inline struct blk_flush_queue *
blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx)
{
return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq;
}
static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq) static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq)
{ {
unsigned int policy = 0; unsigned int policy = 0;
@ -138,7 +145,7 @@ static void blk_flush_queue_rq(struct request *rq, bool add_front)
static void blk_account_io_flush(struct request *rq) static void blk_account_io_flush(struct request *rq)
{ {
struct block_device *part = rq->rq_disk->part0; struct block_device *part = rq->q->disk->part0;
part_stat_lock(); part_stat_lock();
part_stat_inc(part, ios[STAT_FLUSH]); part_stat_inc(part, ios[STAT_FLUSH]);
@ -222,7 +229,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
/* release the tag's ownership to the req cloned from */ /* release the tag's ownership to the req cloned from */
spin_lock_irqsave(&fq->mq_flush_lock, flags); spin_lock_irqsave(&fq->mq_flush_lock, flags);
if (!refcount_dec_and_test(&flush_rq->ref)) { if (!req_ref_put_and_test(flush_rq)) {
fq->rq_status = error; fq->rq_status = error;
spin_unlock_irqrestore(&fq->mq_flush_lock, flags); spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
return; return;
@ -334,7 +341,6 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK); flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK);
flush_rq->rq_flags |= RQF_FLUSH_SEQ; flush_rq->rq_flags |= RQF_FLUSH_SEQ;
flush_rq->rq_disk = first_rq->rq_disk;
flush_rq->end_io = flush_end_io; flush_rq->end_io = flush_end_io;
/* /*
* Order WRITE ->end_io and WRITE rq->ref, and its pair is the one * Order WRITE ->end_io and WRITE rq->ref, and its pair is the one
@ -343,7 +349,7 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
* and READ flush_rq->end_io * and READ flush_rq->end_io
*/ */
smp_wmb(); smp_wmb();
refcount_set(&flush_rq->ref, 1); req_ref_set(flush_rq, 1);
blk_flush_queue_rq(flush_rq, false); blk_flush_queue_rq(flush_rq, false);
} }
@ -423,7 +429,7 @@ void blk_insert_flush(struct request *rq)
*/ */
if ((policy & REQ_FSEQ_DATA) && if ((policy & REQ_FSEQ_DATA) &&
!(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
blk_mq_request_bypass_insert(rq, false, false); blk_mq_request_bypass_insert(rq, false, true);
return; return;
} }
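The bypass-insert hunk just above hinges on the flush policy bits: a request that carries data but needs neither a preflush nor a postflush can skip the flush state machine entirely. A hedged restatement of that test, using the REQ_FSEQ_* names from the diff:

/* Sketch only: true when the request cannot take the bypass path above and
 * must go through the flush sequencing machinery. */
static bool example_needs_flush_sequence(unsigned int policy)
{
        if (!(policy & REQ_FSEQ_DATA))
                return true;    /* no data payload: flush handling only */
        return policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH);
}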


@ -6,7 +6,7 @@
* Written by: Martin K. Petersen <martin.petersen@oracle.com> * Written by: Martin K. Petersen <martin.petersen@oracle.com>
*/ */
#include <linux/blkdev.h> #include <linux/blk-integrity.h>
#include <linux/backing-dev.h> #include <linux/backing-dev.h>
#include <linux/mempool.h> #include <linux/mempool.h>
#include <linux/bio.h> #include <linux/bio.h>
@ -409,9 +409,9 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue); blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue);
#ifdef CONFIG_BLK_INLINE_ENCRYPTION #ifdef CONFIG_BLK_INLINE_ENCRYPTION
if (disk->queue->ksm) { if (disk->queue->crypto_profile) {
pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n"); pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n");
blk_ksm_unregister(disk->queue); disk->queue->crypto_profile = NULL;
} }
#endif #endif
} }


@ -8,22 +8,25 @@
#include <linux/bio.h> #include <linux/bio.h>
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/security.h>
#include <linux/sched/task.h> #include <linux/sched/task.h>
#include "blk.h" #include "blk.h"
#include "blk-mq-sched.h"
/* /*
* For io context allocations * For io context allocations
*/ */
static struct kmem_cache *iocontext_cachep; static struct kmem_cache *iocontext_cachep;
#ifdef CONFIG_BLK_ICQ
/** /**
* get_io_context - increment reference count to io_context * get_io_context - increment reference count to io_context
* @ioc: io_context to get * @ioc: io_context to get
* *
* Increment reference count to @ioc. * Increment reference count to @ioc.
*/ */
void get_io_context(struct io_context *ioc) static void get_io_context(struct io_context *ioc)
{ {
BUG_ON(atomic_long_read(&ioc->refcount) <= 0); BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
atomic_long_inc(&ioc->refcount); atomic_long_inc(&ioc->refcount);
@ -53,6 +56,16 @@ static void ioc_exit_icq(struct io_cq *icq)
icq->flags |= ICQ_EXITED; icq->flags |= ICQ_EXITED;
} }
static void ioc_exit_icqs(struct io_context *ioc)
{
struct io_cq *icq;
spin_lock_irq(&ioc->lock);
hlist_for_each_entry(icq, &ioc->icq_list, ioc_node)
ioc_exit_icq(icq);
spin_unlock_irq(&ioc->lock);
}
/* /*
* Release an icq. Called with ioc locked for blk-mq, and with both ioc * Release an icq. Called with ioc locked for blk-mq, and with both ioc
* and queue locked for legacy. * and queue locked for legacy.
@ -132,102 +145,22 @@ static void ioc_release_fn(struct work_struct *work)
kmem_cache_free(iocontext_cachep, ioc); kmem_cache_free(iocontext_cachep, ioc);
} }
/** /*
* put_io_context - put a reference of io_context * Releasing icqs requires reverse order double locking and we may already be
* @ioc: io_context to put * holding a queue_lock. Do it asynchronously from a workqueue.
*
* Decrement reference count of @ioc and release it if the count reaches
* zero.
*/ */
void put_io_context(struct io_context *ioc) static bool ioc_delay_free(struct io_context *ioc)
{
unsigned long flags;
bool free_ioc = false;
if (ioc == NULL)
return;
BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
/*
* Releasing ioc requires reverse order double locking and we may
* already be holding a queue_lock. Do it asynchronously from wq.
*/
if (atomic_long_dec_and_test(&ioc->refcount)) {
spin_lock_irqsave(&ioc->lock, flags);
if (!hlist_empty(&ioc->icq_list))
queue_work(system_power_efficient_wq,
&ioc->release_work);
else
free_ioc = true;
spin_unlock_irqrestore(&ioc->lock, flags);
}
if (free_ioc)
kmem_cache_free(iocontext_cachep, ioc);
}
/**
* put_io_context_active - put active reference on ioc
* @ioc: ioc of interest
*
* Undo get_io_context_active(). If active reference reaches zero after
* put, @ioc can never issue further IOs and ioscheds are notified.
*/
void put_io_context_active(struct io_context *ioc)
{
struct io_cq *icq;
if (!atomic_dec_and_test(&ioc->active_ref)) {
put_io_context(ioc);
return;
}
spin_lock_irq(&ioc->lock);
hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) {
if (icq->flags & ICQ_EXITED)
continue;
ioc_exit_icq(icq);
}
spin_unlock_irq(&ioc->lock);
put_io_context(ioc);
}
/* Called by the exiting task */
void exit_io_context(struct task_struct *task)
{
struct io_context *ioc;
task_lock(task);
ioc = task->io_context;
task->io_context = NULL;
task_unlock(task);
atomic_dec(&ioc->nr_tasks);
put_io_context_active(ioc);
}
static void __ioc_clear_queue(struct list_head *icq_list)
{ {
unsigned long flags; unsigned long flags;
rcu_read_lock(); spin_lock_irqsave(&ioc->lock, flags);
while (!list_empty(icq_list)) { if (!hlist_empty(&ioc->icq_list)) {
struct io_cq *icq = list_entry(icq_list->next, queue_work(system_power_efficient_wq, &ioc->release_work);
struct io_cq, q_node);
struct io_context *ioc = icq->ioc;
spin_lock_irqsave(&ioc->lock, flags);
if (icq->flags & ICQ_DESTROYED) {
spin_unlock_irqrestore(&ioc->lock, flags);
continue;
}
ioc_destroy_icq(icq);
spin_unlock_irqrestore(&ioc->lock, flags); spin_unlock_irqrestore(&ioc->lock, flags);
return true;
} }
rcu_read_unlock(); spin_unlock_irqrestore(&ioc->lock, flags);
return false;
} }
/** /**
@ -244,93 +177,156 @@ void ioc_clear_queue(struct request_queue *q)
list_splice_init(&q->icq_list, &icq_list); list_splice_init(&q->icq_list, &icq_list);
spin_unlock_irq(&q->queue_lock); spin_unlock_irq(&q->queue_lock);
__ioc_clear_queue(&icq_list); rcu_read_lock();
} while (!list_empty(&icq_list)) {
struct io_cq *icq =
list_entry(icq_list.next, struct io_cq, q_node);
int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node) spin_lock_irq(&icq->ioc->lock);
if (!(icq->flags & ICQ_DESTROYED))
ioc_destroy_icq(icq);
spin_unlock_irq(&icq->ioc->lock);
}
rcu_read_unlock();
}
#else /* CONFIG_BLK_ICQ */
static inline void ioc_exit_icqs(struct io_context *ioc)
{
}
static inline bool ioc_delay_free(struct io_context *ioc)
{
return false;
}
#endif /* CONFIG_BLK_ICQ */
/**
* put_io_context - put a reference of io_context
* @ioc: io_context to put
*
* Decrement reference count of @ioc and release it if the count reaches
* zero.
*/
void put_io_context(struct io_context *ioc)
{
BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
if (atomic_long_dec_and_test(&ioc->refcount) && !ioc_delay_free(ioc))
kmem_cache_free(iocontext_cachep, ioc);
}
EXPORT_SYMBOL_GPL(put_io_context);
/* Called by the exiting task */
void exit_io_context(struct task_struct *task)
{
struct io_context *ioc;
task_lock(task);
ioc = task->io_context;
task->io_context = NULL;
task_unlock(task);
if (atomic_dec_and_test(&ioc->active_ref)) {
ioc_exit_icqs(ioc);
put_io_context(ioc);
}
}
static struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
{ {
struct io_context *ioc; struct io_context *ioc;
int ret;
ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO, ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO,
node); node);
if (unlikely(!ioc)) if (unlikely(!ioc))
return -ENOMEM; return NULL;
/* initialize */
atomic_long_set(&ioc->refcount, 1); atomic_long_set(&ioc->refcount, 1);
atomic_set(&ioc->nr_tasks, 1);
atomic_set(&ioc->active_ref, 1); atomic_set(&ioc->active_ref, 1);
#ifdef CONFIG_BLK_ICQ
spin_lock_init(&ioc->lock); spin_lock_init(&ioc->lock);
INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC); INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC);
INIT_HLIST_HEAD(&ioc->icq_list); INIT_HLIST_HEAD(&ioc->icq_list);
INIT_WORK(&ioc->release_work, ioc_release_fn); INIT_WORK(&ioc->release_work, ioc_release_fn);
#endif
return ioc;
}
int set_task_ioprio(struct task_struct *task, int ioprio)
{
int err;
const struct cred *cred = current_cred(), *tcred;
rcu_read_lock();
tcred = __task_cred(task);
if (!uid_eq(tcred->uid, cred->euid) &&
!uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) {
rcu_read_unlock();
return -EPERM;
}
rcu_read_unlock();
err = security_task_setioprio(task, ioprio);
if (err)
return err;
task_lock(task);
if (unlikely(!task->io_context)) {
struct io_context *ioc;
task_unlock(task);
ioc = alloc_io_context(GFP_ATOMIC, NUMA_NO_NODE);
if (!ioc)
return -ENOMEM;
task_lock(task);
if (task->flags & PF_EXITING) {
err = -ESRCH;
kmem_cache_free(iocontext_cachep, ioc);
goto out;
}
if (task->io_context)
kmem_cache_free(iocontext_cachep, ioc);
else
task->io_context = ioc;
}
task->io_context->ioprio = ioprio;
out:
task_unlock(task);
return err;
}
EXPORT_SYMBOL_GPL(set_task_ioprio);
int __copy_io(unsigned long clone_flags, struct task_struct *tsk)
{
struct io_context *ioc = current->io_context;
/* /*
* Try to install. ioc shouldn't be installed if someone else * Share io context with parent, if CLONE_IO is set
* already did or @task, which isn't %current, is exiting. Note
* that we need to allow ioc creation on exiting %current as exit
* path may issue IOs from e.g. exit_files(). The exit path is
* responsible for not issuing IO after exit_io_context().
*/ */
task_lock(task); if (clone_flags & CLONE_IO) {
if (!task->io_context && atomic_inc(&ioc->active_ref);
(task == current || !(task->flags & PF_EXITING))) tsk->io_context = ioc;
task->io_context = ioc; } else if (ioprio_valid(ioc->ioprio)) {
else tsk->io_context = alloc_io_context(GFP_KERNEL, NUMA_NO_NODE);
kmem_cache_free(iocontext_cachep, ioc); if (!tsk->io_context)
return -ENOMEM;
tsk->io_context->ioprio = ioc->ioprio;
}
ret = task->io_context ? 0 : -EBUSY; return 0;
task_unlock(task);
return ret;
}
/**
* get_task_io_context - get io_context of a task
* @task: task of interest
* @gfp_flags: allocation flags, used if allocation is necessary
* @node: allocation node, used if allocation is necessary
*
* Return io_context of @task. If it doesn't exist, it is created with
* @gfp_flags and @node. The returned io_context has its reference count
* incremented.
*
* This function always goes through task_lock() and it's better to use
* %current->io_context + get_io_context() for %current.
*/
struct io_context *get_task_io_context(struct task_struct *task,
gfp_t gfp_flags, int node)
{
struct io_context *ioc;
might_sleep_if(gfpflags_allow_blocking(gfp_flags));
do {
task_lock(task);
ioc = task->io_context;
if (likely(ioc)) {
get_io_context(ioc);
task_unlock(task);
return ioc;
}
task_unlock(task);
} while (!create_task_io_context(task, gfp_flags, node));
return NULL;
} }
#ifdef CONFIG_BLK_ICQ
/** /**
* ioc_lookup_icq - lookup io_cq from ioc * ioc_lookup_icq - lookup io_cq from ioc
* @ioc: the associated io_context
* @q: the associated request_queue * @q: the associated request_queue
* *
* Look up io_cq associated with @ioc - @q pair from @ioc. Must be called * Look up io_cq associated with @ioc - @q pair from @ioc. Must be called
* with @q->queue_lock held. * with @q->queue_lock held.
*/ */
struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q) struct io_cq *ioc_lookup_icq(struct request_queue *q)
{ {
struct io_context *ioc = current->io_context;
struct io_cq *icq; struct io_cq *icq;
lockdep_assert_held(&q->queue_lock); lockdep_assert_held(&q->queue_lock);
@ -359,9 +355,7 @@ EXPORT_SYMBOL(ioc_lookup_icq);
/** /**
* ioc_create_icq - create and link io_cq * ioc_create_icq - create and link io_cq
* @ioc: io_context of interest
* @q: request_queue of interest * @q: request_queue of interest
* @gfp_mask: allocation mask
* *
* Make sure io_cq linking @ioc and @q exists. If icq doesn't exist, they * Make sure io_cq linking @ioc and @q exists. If icq doesn't exist, they
* will be created using @gfp_mask. * will be created using @gfp_mask.
@ -369,19 +363,19 @@ EXPORT_SYMBOL(ioc_lookup_icq);
* The caller is responsible for ensuring @ioc won't go away and @q is * The caller is responsible for ensuring @ioc won't go away and @q is
* alive and will stay alive until this function returns. * alive and will stay alive until this function returns.
*/ */
struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, static struct io_cq *ioc_create_icq(struct request_queue *q)
gfp_t gfp_mask)
{ {
struct io_context *ioc = current->io_context;
struct elevator_type *et = q->elevator->type; struct elevator_type *et = q->elevator->type;
struct io_cq *icq; struct io_cq *icq;
/* allocate stuff */ /* allocate stuff */
icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO, icq = kmem_cache_alloc_node(et->icq_cache, GFP_ATOMIC | __GFP_ZERO,
q->node); q->node);
if (!icq) if (!icq)
return NULL; return NULL;
if (radix_tree_maybe_preload(gfp_mask) < 0) { if (radix_tree_maybe_preload(GFP_ATOMIC) < 0) {
kmem_cache_free(et->icq_cache, icq); kmem_cache_free(et->icq_cache, icq);
return NULL; return NULL;
} }
@ -402,7 +396,7 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
et->ops.init_icq(icq); et->ops.init_icq(icq);
} else { } else {
kmem_cache_free(et->icq_cache, icq); kmem_cache_free(et->icq_cache, icq);
icq = ioc_lookup_icq(ioc, q); icq = ioc_lookup_icq(q);
if (!icq) if (!icq)
printk(KERN_ERR "cfq: icq link failed!\n"); printk(KERN_ERR "cfq: icq link failed!\n");
} }
@ -413,6 +407,46 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
return icq; return icq;
} }
struct io_cq *ioc_find_get_icq(struct request_queue *q)
{
struct io_context *ioc = current->io_context;
struct io_cq *icq = NULL;
if (unlikely(!ioc)) {
ioc = alloc_io_context(GFP_ATOMIC, q->node);
if (!ioc)
return NULL;
task_lock(current);
if (current->io_context) {
kmem_cache_free(iocontext_cachep, ioc);
ioc = current->io_context;
} else {
current->io_context = ioc;
}
get_io_context(ioc);
task_unlock(current);
} else {
get_io_context(ioc);
spin_lock_irq(&q->queue_lock);
icq = ioc_lookup_icq(q);
spin_unlock_irq(&q->queue_lock);
}
if (!icq) {
icq = ioc_create_icq(q);
if (!icq) {
put_io_context(ioc);
return NULL;
}
}
return icq;
}
EXPORT_SYMBOL_GPL(ioc_find_get_icq);
#endif /* CONFIG_BLK_ICQ */
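With the icq lookup and creation folded into ioc_find_get_icq() above, an elevator that needs a per-task context can obtain one with a single call from its prepare-request path. A hedged sketch follows; the hook name is illustrative, and only the helper, blk_rq_is_passthrough(), and the rq->elv.icq field come from the diff.

/* Sketch of a scheduler prepare hook using the new helper; on success the
 * icq reference taken by ioc_find_get_icq() is handed over to the request. */
static void example_prepare_request(struct request *rq)
{
        struct io_cq *icq;

        if (blk_rq_is_passthrough(rq))
                return;         /* passthrough requests carry no io_context */
        icq = ioc_find_get_icq(rq->q);
        if (icq)
                rq->elv.icq = icq;
}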
static int __init blk_ioc_init(void) static int __init blk_ioc_init(void)
{ {
iocontext_cachep = kmem_cache_create("blkdev_ioc", iocontext_cachep = kmem_cache_create("blkdev_ioc",


@ -74,6 +74,7 @@
#include <linux/sched/signal.h> #include <linux/sched/signal.h>
#include <trace/events/block.h> #include <trace/events/block.h>
#include <linux/blk-mq.h> #include <linux/blk-mq.h>
#include <linux/blk-cgroup.h>
#include "blk-rq-qos.h" #include "blk-rq-qos.h"
#include "blk-stat.h" #include "blk-stat.h"
#include "blk.h" #include "blk.h"


@ -62,6 +62,7 @@ struct ioprio_blkg {
struct ioprio_blkcg { struct ioprio_blkcg {
struct blkcg_policy_data cpd; struct blkcg_policy_data cpd;
enum prio_policy prio_policy; enum prio_policy prio_policy;
bool prio_set;
}; };
static inline struct ioprio_blkg *pd_to_ioprio(struct blkg_policy_data *pd) static inline struct ioprio_blkg *pd_to_ioprio(struct blkg_policy_data *pd)
@ -112,7 +113,7 @@ static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf,
if (ret < 0) if (ret < 0)
return ret; return ret;
blkcg->prio_policy = ret; blkcg->prio_policy = ret;
blkcg->prio_set = true;
return nbytes; return nbytes;
} }
@ -190,6 +191,10 @@ static void blkcg_ioprio_track(struct rq_qos *rqos, struct request *rq,
struct bio *bio) struct bio *bio)
{ {
struct ioprio_blkcg *blkcg = ioprio_blkcg_from_bio(bio); struct ioprio_blkcg *blkcg = ioprio_blkcg_from_bio(bio);
u16 prio;
if (!blkcg->prio_set)
return;
/* /*
* Except for IOPRIO_CLASS_NONE, higher I/O priority numbers * Except for IOPRIO_CLASS_NONE, higher I/O priority numbers
@ -199,8 +204,10 @@ static void blkcg_ioprio_track(struct rq_qos *rqos, struct request *rq,
* bio I/O priority is not modified. If the bio I/O priority equals * bio I/O priority is not modified. If the bio I/O priority equals
* IOPRIO_CLASS_NONE, the cgroup I/O priority is assigned to the bio. * IOPRIO_CLASS_NONE, the cgroup I/O priority is assigned to the bio.
*/ */
bio->bi_ioprio = max_t(u16, bio->bi_ioprio, prio = max_t(u16, bio->bi_ioprio,
IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0)); IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0));
if (prio > bio->bi_ioprio)
bio->bi_ioprio = prio;
} }
static void blkcg_ioprio_exit(struct rq_qos *rqos) static void blkcg_ioprio_exit(struct rq_qos *rqos)
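The tracking hook above now skips bios entirely when the cgroup never configured a policy, and only ever raises the numeric priority value. Restated as a hedged standalone helper (everything other than the fields and macros shown above is illustrative):

/* Sketch: apply the cgroup policy only if one was set, and never lower the
 * bio's existing numeric priority value. */
static void example_apply_policy(struct bio *bio, struct ioprio_blkcg *blkcg)
{
        u16 prio;

        if (!blkcg->prio_set)
                return;
        prio = max_t(u16, bio->bi_ioprio,
                     IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0));
        if (prio > bio->bi_ioprio)
                bio->bi_ioprio = prio;
}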


@ -6,12 +6,47 @@
#include <linux/module.h> #include <linux/module.h>
#include <linux/bio.h> #include <linux/bio.h>
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/scatterlist.h> #include <linux/scatterlist.h>
#include <linux/part_stat.h>
#include <trace/events/block.h> #include <trace/events/block.h>
#include "blk.h" #include "blk.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h" #include "blk-rq-qos.h"
#include "blk-throttle.h"
static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv)
{
*bv = mp_bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
}
static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv)
{
struct bvec_iter iter = bio->bi_iter;
int idx;
bio_get_first_bvec(bio, bv);
if (bv->bv_len == bio->bi_iter.bi_size)
return; /* this bio only has a single bvec */
bio_advance_iter(bio, &iter, iter.bi_size);
if (!iter.bi_bvec_done)
idx = iter.bi_idx - 1;
else /* in the middle of bvec */
idx = iter.bi_idx;
*bv = bio->bi_io_vec[idx];
/*
* iter.bi_bvec_done records actual length of the last bvec
* if this bio ends in the middle of one io vector
*/
if (iter.bi_bvec_done)
bv->bv_len = iter.bi_bvec_done;
}
static inline bool bio_will_gap(struct request_queue *q, static inline bool bio_will_gap(struct request_queue *q,
struct request *prev_rq, struct bio *prev, struct bio *next) struct request *prev_rq, struct bio *prev, struct bio *next)
@ -285,13 +320,13 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
* iopoll in direct IO routine. Given performance gain of iopoll for * iopoll in direct IO routine. Given performance gain of iopoll for
* big IO can be trival, disable iopoll when split needed. * big IO can be trival, disable iopoll when split needed.
*/ */
bio_clear_hipri(bio); bio_clear_polled(bio);
return bio_split(bio, sectors, GFP_NOIO, bs); return bio_split(bio, sectors, GFP_NOIO, bs);
} }
/** /**
* __blk_queue_split - split a bio and submit the second half * __blk_queue_split - split a bio and submit the second half
* @q: [in] request_queue new bio is being queued at
* @bio: [in, out] bio to be split * @bio: [in, out] bio to be split
* @nr_segs: [out] number of segments in the first bio * @nr_segs: [out] number of segments in the first bio
* *
@ -302,9 +337,9 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
* of the caller to ensure that q->bio_split is only released after processing * of the caller to ensure that q->bio_split is only released after processing
* of the split bio has finished. * of the split bio has finished.
*/ */
void __blk_queue_split(struct bio **bio, unsigned int *nr_segs) void __blk_queue_split(struct request_queue *q, struct bio **bio,
unsigned int *nr_segs)
{ {
struct request_queue *q = (*bio)->bi_bdev->bd_disk->queue;
struct bio *split = NULL; struct bio *split = NULL;
switch (bio_op(*bio)) { switch (bio_op(*bio)) {
@ -321,21 +356,6 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
nr_segs); nr_segs);
break; break;
default: default:
/*
* All drivers must accept single-segments bios that are <=
* PAGE_SIZE. This is a quick and dirty check that relies on
* the fact that bi_io_vec[0] is always valid if a bio has data.
* The check might lead to occasional false negatives when bios
* are cloned, but compared to the performance impact of cloned
* bios themselves the loop below doesn't matter anyway.
*/
if (!q->limits.chunk_sectors &&
(*bio)->bi_vcnt == 1 &&
((*bio)->bi_io_vec[0].bv_len +
(*bio)->bi_io_vec[0].bv_offset) <= PAGE_SIZE) {
*nr_segs = 1;
break;
}
split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs); split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs);
break; break;
} }
@ -365,9 +385,11 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
*/ */
void blk_queue_split(struct bio **bio) void blk_queue_split(struct bio **bio)
{ {
struct request_queue *q = bdev_get_queue((*bio)->bi_bdev);
unsigned int nr_segs; unsigned int nr_segs;
__blk_queue_split(bio, &nr_segs); if (blk_may_split(q, *bio))
__blk_queue_split(q, bio, &nr_segs);
} }
EXPORT_SYMBOL(blk_queue_split); EXPORT_SYMBOL(blk_queue_split);
@ -558,6 +580,23 @@ static inline unsigned int blk_rq_get_max_segments(struct request *rq)
return queue_max_segments(rq->q); return queue_max_segments(rq->q);
} }
static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
sector_t offset)
{
struct request_queue *q = rq->q;
if (blk_rq_is_passthrough(rq))
return q->limits.max_hw_sectors;
if (!q->limits.chunk_sectors ||
req_op(rq) == REQ_OP_DISCARD ||
req_op(rq) == REQ_OP_SECURE_ERASE)
return blk_queue_get_max_sectors(q, req_op(rq));
return min(blk_max_size_offset(q, offset, 0),
blk_queue_get_max_sectors(q, req_op(rq)));
}
static inline int ll_new_hw_segment(struct request *req, struct bio *bio, static inline int ll_new_hw_segment(struct request *req, struct bio *bio,
unsigned int nr_phys_segs) unsigned int nr_phys_segs)
{ {
@ -718,6 +757,13 @@ static enum elv_merge blk_try_req_merge(struct request *req,
return ELEVATOR_NO_MERGE; return ELEVATOR_NO_MERGE;
} }
static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b)
{
if (bio_page(a) == bio_page(b) && bio_offset(a) == bio_offset(b))
return true;
return false;
}
/* /*
* For non-mq, this has to be called with the request spinlock acquired. * For non-mq, this has to be called with the request spinlock acquired.
* For mq with scheduling, the appropriate queue wide lock should be held. * For mq with scheduling, the appropriate queue wide lock should be held.
@ -731,8 +777,7 @@ static struct request *attempt_merge(struct request_queue *q,
if (req_op(req) != req_op(next)) if (req_op(req) != req_op(next))
return NULL; return NULL;
if (rq_data_dir(req) != rq_data_dir(next) if (rq_data_dir(req) != rq_data_dir(next))
|| req->rq_disk != next->rq_disk)
return NULL; return NULL;
if (req_op(req) == REQ_OP_WRITE_SAME && if (req_op(req) == REQ_OP_WRITE_SAME &&
@ -859,10 +904,6 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
if (bio_data_dir(bio) != rq_data_dir(rq)) if (bio_data_dir(bio) != rq_data_dir(rq))
return false; return false;
/* must be same device */
if (rq->rq_disk != bio->bi_bdev->bd_disk)
return false;
/* only merge integrity protected bio into ditto rq */ /* only merge integrity protected bio into ditto rq */
if (blk_integrity_merge_bio(rq->q, rq, bio) == false) if (blk_integrity_merge_bio(rq->q, rq, bio) == false)
return false; return false;
@ -1023,12 +1064,10 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
* @q: request_queue new bio is being queued at * @q: request_queue new bio is being queued at
* @bio: new bio being queued * @bio: new bio being queued
* @nr_segs: number of segments in @bio * @nr_segs: number of segments in @bio
* @same_queue_rq: pointer to &struct request that gets filled in when * from the passed in @q already in the plug list
* another request associated with @q is found on the plug list
* (optional, may be %NULL)
* *
* Determine whether @bio being queued on @q can be merged with a request * Determine whether @bio being queued on @q can be merged with the previous
* on %current's plugged list. Returns %true if merge was successful, * request on %current's plugged list. Returns %true if merge was successful,
* otherwise %false. * otherwise %false.
* *
* Plugging coalesces IOs from the same issuer for the same purpose without * Plugging coalesces IOs from the same issuer for the same purpose without
@ -1041,36 +1080,22 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
* Caller must ensure !blk_queue_nomerges(q) beforehand. * Caller must ensure !blk_queue_nomerges(q) beforehand.
*/ */
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs, struct request **same_queue_rq) unsigned int nr_segs)
{ {
struct blk_plug *plug; struct blk_plug *plug;
struct request *rq; struct request *rq;
struct list_head *plug_list;
plug = blk_mq_plug(q, bio); plug = blk_mq_plug(q, bio);
if (!plug) if (!plug || rq_list_empty(plug->mq_list))
return false; return false;
plug_list = &plug->mq_list; /* check the previously added entry for a quick merge attempt */
rq = rq_list_peek(&plug->mq_list);
list_for_each_entry_reverse(rq, plug_list, queuelist) { if (rq->q == q) {
if (rq->q == q && same_queue_rq) {
/*
* Only blk-mq multiple hardware queues case checks the
* rq in the same queue, there should be only one such
* rq in a queue
**/
*same_queue_rq = rq;
}
if (rq->q != q)
continue;
if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
BIO_MERGE_OK) BIO_MERGE_OK)
return true; return true;
} }
return false; return false;
} }
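The simplification above relies on the plug list now being kept as a singly linked rq_list, so only the most recently added request is worth checking as a merge candidate. A hedged sketch of the resulting control flow (helper names and types are taken from the diff; the wrapper itself is illustrative):

/* Sketch only: attempt a merge against the last request added to the plug. */
static bool example_plug_merge(struct blk_plug *plug, struct request_queue *q,
                               struct bio *bio, unsigned int nr_segs)
{
        struct request *rq;

        if (!plug || rq_list_empty(plug->mq_list))
                return false;
        rq = rq_list_peek(&plug->mq_list);
        if (rq->q != q)
                return false;
        return blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == BIO_MERGE_OK;
}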


@ -11,6 +11,7 @@
#include "blk.h" #include "blk.h"
#include "blk-mq.h" #include "blk-mq.h"
#include "blk-mq-debugfs.h" #include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h" #include "blk-mq-tag.h"
#include "blk-rq-qos.h" #include "blk-rq-qos.h"
@ -29,6 +30,9 @@ static int queue_poll_stat_show(void *data, struct seq_file *m)
struct request_queue *q = data; struct request_queue *q = data;
int bucket; int bucket;
if (!q->poll_stat)
return 0;
for (bucket = 0; bucket < (BLK_MQ_POLL_STATS_BKTS / 2); bucket++) { for (bucket = 0; bucket < (BLK_MQ_POLL_STATS_BKTS / 2); bucket++) {
seq_printf(m, "read (%d Bytes): ", 1 << (9 + bucket)); seq_printf(m, "read (%d Bytes): ", 1 << (9 + bucket));
print_stat(m, &q->poll_stat[2 * bucket]); print_stat(m, &q->poll_stat[2 * bucket]);
@ -122,9 +126,7 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(FUA), QUEUE_FLAG_NAME(FUA),
QUEUE_FLAG_NAME(DAX), QUEUE_FLAG_NAME(DAX),
QUEUE_FLAG_NAME(STATS), QUEUE_FLAG_NAME(STATS),
QUEUE_FLAG_NAME(POLL_STATS),
QUEUE_FLAG_NAME(REGISTERED), QUEUE_FLAG_NAME(REGISTERED),
QUEUE_FLAG_NAME(SCSI_PASSTHROUGH),
QUEUE_FLAG_NAME(QUIESCED), QUEUE_FLAG_NAME(QUIESCED),
QUEUE_FLAG_NAME(PCI_P2PDMA), QUEUE_FLAG_NAME(PCI_P2PDMA),
QUEUE_FLAG_NAME(ZONE_RESETALL), QUEUE_FLAG_NAME(ZONE_RESETALL),
@ -287,7 +289,7 @@ static const char *const cmd_flag_name[] = {
CMD_FLAG_NAME(BACKGROUND), CMD_FLAG_NAME(BACKGROUND),
CMD_FLAG_NAME(NOWAIT), CMD_FLAG_NAME(NOWAIT),
CMD_FLAG_NAME(NOUNMAP), CMD_FLAG_NAME(NOUNMAP),
CMD_FLAG_NAME(HIPRI), CMD_FLAG_NAME(POLLED),
}; };
#undef CMD_FLAG_NAME #undef CMD_FLAG_NAME
@ -309,6 +311,7 @@ static const char *const rqf_name[] = {
RQF_NAME(SPECIAL_PAYLOAD), RQF_NAME(SPECIAL_PAYLOAD),
RQF_NAME(ZONE_WRITE_LOCKED), RQF_NAME(ZONE_WRITE_LOCKED),
RQF_NAME(MQ_POLL_SLEPT), RQF_NAME(MQ_POLL_SLEPT),
RQF_NAME(ELV),
}; };
#undef RQF_NAME #undef RQF_NAME
@ -453,11 +456,11 @@ static void blk_mq_debugfs_tags_show(struct seq_file *m,
atomic_read(&tags->active_queues)); atomic_read(&tags->active_queues));
seq_puts(m, "\nbitmap_tags:\n"); seq_puts(m, "\nbitmap_tags:\n");
sbitmap_queue_show(tags->bitmap_tags, m); sbitmap_queue_show(&tags->bitmap_tags, m);
if (tags->nr_reserved_tags) { if (tags->nr_reserved_tags) {
seq_puts(m, "\nbreserved_tags:\n"); seq_puts(m, "\nbreserved_tags:\n");
sbitmap_queue_show(tags->breserved_tags, m); sbitmap_queue_show(&tags->breserved_tags, m);
} }
} }
@ -488,7 +491,7 @@ static int hctx_tags_bitmap_show(void *data, struct seq_file *m)
if (res) if (res)
goto out; goto out;
if (hctx->tags) if (hctx->tags)
sbitmap_bitmap_show(&hctx->tags->bitmap_tags->sb, m); sbitmap_bitmap_show(&hctx->tags->bitmap_tags.sb, m);
mutex_unlock(&q->sysfs_lock); mutex_unlock(&q->sysfs_lock);
out: out:
@ -522,77 +525,13 @@ static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m)
if (res) if (res)
goto out; goto out;
if (hctx->sched_tags) if (hctx->sched_tags)
sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags->sb, m); sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags.sb, m);
mutex_unlock(&q->sysfs_lock); mutex_unlock(&q->sysfs_lock);
out: out:
return res; return res;
} }
static int hctx_io_poll_show(void *data, struct seq_file *m)
{
struct blk_mq_hw_ctx *hctx = data;
seq_printf(m, "considered=%lu\n", hctx->poll_considered);
seq_printf(m, "invoked=%lu\n", hctx->poll_invoked);
seq_printf(m, "success=%lu\n", hctx->poll_success);
return 0;
}
static ssize_t hctx_io_poll_write(void *data, const char __user *buf,
size_t count, loff_t *ppos)
{
struct blk_mq_hw_ctx *hctx = data;
hctx->poll_considered = hctx->poll_invoked = hctx->poll_success = 0;
return count;
}
static int hctx_dispatched_show(void *data, struct seq_file *m)
{
struct blk_mq_hw_ctx *hctx = data;
int i;
seq_printf(m, "%8u\t%lu\n", 0U, hctx->dispatched[0]);
for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER - 1; i++) {
unsigned int d = 1U << (i - 1);
seq_printf(m, "%8u\t%lu\n", d, hctx->dispatched[i]);
}
seq_printf(m, "%8u+\t%lu\n", 1U << (i - 1), hctx->dispatched[i]);
return 0;
}
static ssize_t hctx_dispatched_write(void *data, const char __user *buf,
size_t count, loff_t *ppos)
{
struct blk_mq_hw_ctx *hctx = data;
int i;
for (i = 0; i < BLK_MQ_MAX_DISPATCH_ORDER; i++)
hctx->dispatched[i] = 0;
return count;
}
static int hctx_queued_show(void *data, struct seq_file *m)
{
struct blk_mq_hw_ctx *hctx = data;
seq_printf(m, "%lu\n", hctx->queued);
return 0;
}
static ssize_t hctx_queued_write(void *data, const char __user *buf,
size_t count, loff_t *ppos)
{
struct blk_mq_hw_ctx *hctx = data;
hctx->queued = 0;
return count;
}
static int hctx_run_show(void *data, struct seq_file *m) static int hctx_run_show(void *data, struct seq_file *m)
{ {
struct blk_mq_hw_ctx *hctx = data; struct blk_mq_hw_ctx *hctx = data;
@ -614,7 +553,7 @@ static int hctx_active_show(void *data, struct seq_file *m)
{ {
struct blk_mq_hw_ctx *hctx = data; struct blk_mq_hw_ctx *hctx = data;
seq_printf(m, "%d\n", atomic_read(&hctx->nr_active)); seq_printf(m, "%d\n", __blk_mq_active_requests(hctx));
return 0; return 0;
} }
@ -663,57 +602,6 @@ CTX_RQ_SEQ_OPS(default, HCTX_TYPE_DEFAULT);
CTX_RQ_SEQ_OPS(read, HCTX_TYPE_READ); CTX_RQ_SEQ_OPS(read, HCTX_TYPE_READ);
CTX_RQ_SEQ_OPS(poll, HCTX_TYPE_POLL); CTX_RQ_SEQ_OPS(poll, HCTX_TYPE_POLL);
static int ctx_dispatched_show(void *data, struct seq_file *m)
{
struct blk_mq_ctx *ctx = data;
seq_printf(m, "%lu %lu\n", ctx->rq_dispatched[1], ctx->rq_dispatched[0]);
return 0;
}
static ssize_t ctx_dispatched_write(void *data, const char __user *buf,
size_t count, loff_t *ppos)
{
struct blk_mq_ctx *ctx = data;
ctx->rq_dispatched[0] = ctx->rq_dispatched[1] = 0;
return count;
}
static int ctx_merged_show(void *data, struct seq_file *m)
{
struct blk_mq_ctx *ctx = data;
seq_printf(m, "%lu\n", ctx->rq_merged);
return 0;
}
static ssize_t ctx_merged_write(void *data, const char __user *buf,
size_t count, loff_t *ppos)
{
struct blk_mq_ctx *ctx = data;
ctx->rq_merged = 0;
return count;
}
static int ctx_completed_show(void *data, struct seq_file *m)
{
struct blk_mq_ctx *ctx = data;
seq_printf(m, "%lu %lu\n", ctx->rq_completed[1], ctx->rq_completed[0]);
return 0;
}
static ssize_t ctx_completed_write(void *data, const char __user *buf,
size_t count, loff_t *ppos)
{
struct blk_mq_ctx *ctx = data;
ctx->rq_completed[0] = ctx->rq_completed[1] = 0;
return count;
}
static int blk_mq_debugfs_show(struct seq_file *m, void *v) static int blk_mq_debugfs_show(struct seq_file *m, void *v)
{ {
const struct blk_mq_debugfs_attr *attr = m->private; const struct blk_mq_debugfs_attr *attr = m->private;
@ -789,9 +677,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
{"tags_bitmap", 0400, hctx_tags_bitmap_show}, {"tags_bitmap", 0400, hctx_tags_bitmap_show},
{"sched_tags", 0400, hctx_sched_tags_show}, {"sched_tags", 0400, hctx_sched_tags_show},
{"sched_tags_bitmap", 0400, hctx_sched_tags_bitmap_show}, {"sched_tags_bitmap", 0400, hctx_sched_tags_bitmap_show},
{"io_poll", 0600, hctx_io_poll_show, hctx_io_poll_write},
{"dispatched", 0600, hctx_dispatched_show, hctx_dispatched_write},
{"queued", 0600, hctx_queued_show, hctx_queued_write},
{"run", 0600, hctx_run_show, hctx_run_write}, {"run", 0600, hctx_run_show, hctx_run_write},
{"active", 0400, hctx_active_show}, {"active", 0400, hctx_active_show},
{"dispatch_busy", 0400, hctx_dispatch_busy_show}, {"dispatch_busy", 0400, hctx_dispatch_busy_show},
@ -803,9 +688,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = {
{"default_rq_list", 0400, .seq_ops = &ctx_default_rq_list_seq_ops}, {"default_rq_list", 0400, .seq_ops = &ctx_default_rq_list_seq_ops},
{"read_rq_list", 0400, .seq_ops = &ctx_read_rq_list_seq_ops}, {"read_rq_list", 0400, .seq_ops = &ctx_read_rq_list_seq_ops},
{"poll_rq_list", 0400, .seq_ops = &ctx_poll_rq_list_seq_ops}, {"poll_rq_list", 0400, .seq_ops = &ctx_poll_rq_list_seq_ops},
{"dispatched", 0600, ctx_dispatched_show, ctx_dispatched_write},
{"merged", 0600, ctx_merged_show, ctx_merged_write},
{"completed", 0600, ctx_completed_show, ctx_completed_write},
{}, {},
}; };
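Several of the debugfs changes above fall out of bitmap_tags and breserved_tags becoming embedded struct members rather than pointers, so call sites now pass their address. A hedged illustration of the adjusted call pattern (the wrapper is not part of the patch):

/* Illustration only: dumping the now-embedded sbitmap_queue members. */
static void example_dump_tag_maps(struct blk_mq_tags *tags, struct seq_file *m)
{
        sbitmap_queue_show(&tags->bitmap_tags, m);      /* was tags->bitmap_tags */
        if (tags->nr_reserved_tags)
                sbitmap_queue_show(&tags->breserved_tags, m);
}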


@ -18,32 +18,6 @@
#include "blk-mq-tag.h" #include "blk-mq-tag.h"
#include "blk-wbt.h" #include "blk-wbt.h"
void blk_mq_sched_assign_ioc(struct request *rq)
{
struct request_queue *q = rq->q;
struct io_context *ioc;
struct io_cq *icq;
/*
* May not have an IO context if it's a passthrough request
*/
ioc = current->io_context;
if (!ioc)
return;
spin_lock_irq(&q->queue_lock);
icq = ioc_lookup_icq(ioc, q);
spin_unlock_irq(&q->queue_lock);
if (!icq) {
icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
if (!icq)
return;
}
get_io_context(icq->ioc);
rq->elv.icq = icq;
}
/* /*
* Mark a hardware queue as needing a restart. For shared queues, maintain * Mark a hardware queue as needing a restart. For shared queues, maintain
* a count of how many hardware queues are marked for restart. * a count of how many hardware queues are marked for restart.
@ -57,10 +31,8 @@ void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
} }
EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx); EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx);
void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
{ {
if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
return;
clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
/* /*
@ -363,7 +335,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
} }
} }
bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs) unsigned int nr_segs)
{ {
struct elevator_queue *e = q->elevator; struct elevator_queue *e = q->elevator;
@ -372,15 +344,17 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
bool ret = false; bool ret = false;
enum hctx_type type; enum hctx_type type;
if (e && e->type->ops.bio_merge) if (e && e->type->ops.bio_merge) {
return e->type->ops.bio_merge(q, bio, nr_segs); ret = e->type->ops.bio_merge(q, bio, nr_segs);
goto out_put;
}
ctx = blk_mq_get_ctx(q); ctx = blk_mq_get_ctx(q);
hctx = blk_mq_map_queue(q, bio->bi_opf, ctx); hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
type = hctx->type; type = hctx->type;
if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) || if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) ||
list_empty_careful(&ctx->rq_lists[type])) list_empty_careful(&ctx->rq_lists[type]))
return false; goto out_put;
/* default per sw-queue merge */ /* default per sw-queue merge */
spin_lock(&ctx->lock); spin_lock(&ctx->lock);
@ -389,13 +363,11 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
* potentially merge with. Currently includes a hand-wavy stop * potentially merge with. Currently includes a hand-wavy stop
* count of 8, to not spend too much time checking for merges. * count of 8, to not spend too much time checking for merges.
*/ */
if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) { if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs))
ctx->rq_merged++;
ret = true; ret = true;
}
spin_unlock(&ctx->lock); spin_unlock(&ctx->lock);
out_put:
return ret; return ret;
} }
@ -502,8 +474,9 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
* busy in case of 'none' scheduler, and this way may save * busy in case of 'none' scheduler, and this way may save
* us one extra enqueue & dequeue to sw queue. * us one extra enqueue & dequeue to sw queue.
*/ */
if (!hctx->dispatch_busy && !e && !run_queue_async) { if (!hctx->dispatch_busy && !run_queue_async) {
blk_mq_try_issue_list_directly(hctx, list); blk_mq_run_dispatch_ops(hctx->queue,
blk_mq_try_issue_list_directly(hctx, list));
if (list_empty(list)) if (list_empty(list))
goto out; goto out;
} }
@ -515,83 +488,71 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
percpu_ref_put(&q->q_usage_counter); percpu_ref_put(&q->q_usage_counter);
} }
static int blk_mq_sched_alloc_tags(struct request_queue *q, static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q,
struct blk_mq_hw_ctx *hctx, struct blk_mq_hw_ctx *hctx,
unsigned int hctx_idx) unsigned int hctx_idx)
{ {
struct blk_mq_tag_set *set = q->tag_set; if (blk_mq_is_shared_tags(q->tag_set->flags)) {
int ret; hctx->sched_tags = q->sched_shared_tags;
return 0;
hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
set->reserved_tags, set->flags);
if (!hctx->sched_tags)
return -ENOMEM;
ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
if (ret) {
blk_mq_free_rq_map(hctx->sched_tags, set->flags);
hctx->sched_tags = NULL;
} }
return ret; hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx,
q->nr_requests);
if (!hctx->sched_tags)
return -ENOMEM;
return 0;
}
static void blk_mq_exit_sched_shared_tags(struct request_queue *queue)
{
blk_mq_free_rq_map(queue->sched_shared_tags);
queue->sched_shared_tags = NULL;
} }
/* called in queue's release handler, tagset has gone away */ /* called in queue's release handler, tagset has gone away */
static void blk_mq_sched_tags_teardown(struct request_queue *q) static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags)
{ {
struct blk_mq_hw_ctx *hctx; struct blk_mq_hw_ctx *hctx;
int i; int i;
queue_for_each_hw_ctx(q, hctx, i) { queue_for_each_hw_ctx(q, hctx, i) {
if (hctx->sched_tags) { if (hctx->sched_tags) {
blk_mq_free_rq_map(hctx->sched_tags, hctx->flags); if (!blk_mq_is_shared_tags(flags))
blk_mq_free_rq_map(hctx->sched_tags);
hctx->sched_tags = NULL; hctx->sched_tags = NULL;
} }
} }
if (blk_mq_is_shared_tags(flags))
blk_mq_exit_sched_shared_tags(q);
} }
static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue) static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
{ {
struct blk_mq_tag_set *set = queue->tag_set; struct blk_mq_tag_set *set = queue->tag_set;
int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
struct blk_mq_hw_ctx *hctx;
int ret, i;
/* /*
* Set initial depth at max so that we don't need to reallocate for * Set initial depth at max so that we don't need to reallocate for
* updating nr_requests. * updating nr_requests.
*/ */
ret = blk_mq_init_bitmaps(&queue->sched_bitmap_tags, queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set,
&queue->sched_breserved_tags, BLK_MQ_NO_HCTX_IDX,
MAX_SCHED_RQ, set->reserved_tags, MAX_SCHED_RQ);
set->numa_node, alloc_policy); if (!queue->sched_shared_tags)
if (ret) return -ENOMEM;
return ret;
queue_for_each_hw_ctx(queue, hctx, i) { blk_mq_tag_update_sched_shared_tags(queue);
hctx->sched_tags->bitmap_tags =
&queue->sched_bitmap_tags;
hctx->sched_tags->breserved_tags =
&queue->sched_breserved_tags;
}
sbitmap_queue_resize(&queue->sched_bitmap_tags,
queue->nr_requests - set->reserved_tags);
return 0; return 0;
} }
static void blk_mq_exit_sched_shared_sbitmap(struct request_queue *queue)
{
sbitmap_queue_free(&queue->sched_bitmap_tags);
sbitmap_queue_free(&queue->sched_breserved_tags);
}
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{ {
unsigned int i, flags = q->tag_set->flags;
struct blk_mq_hw_ctx *hctx; struct blk_mq_hw_ctx *hctx;
struct elevator_queue *eq; struct elevator_queue *eq;
unsigned int i;
int ret; int ret;
if (!e) { if (!e) {
@ -606,23 +567,23 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
* Additionally, this is a per-hw queue depth. * Additionally, this is a per-hw queue depth.
*/ */
q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth, q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
BLKDEV_MAX_RQ); BLKDEV_DEFAULT_RQ);
queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_is_shared_tags(flags)) {
ret = blk_mq_sched_alloc_tags(q, hctx, i); ret = blk_mq_init_sched_shared_tags(q);
if (ret) if (ret)
goto err_free_tags; return ret;
} }
if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) { queue_for_each_hw_ctx(q, hctx, i) {
ret = blk_mq_init_sched_shared_sbitmap(q); ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i);
if (ret) if (ret)
goto err_free_tags; goto err_free_map_and_rqs;
} }
ret = e->ops.init_sched(q, e); ret = e->ops.init_sched(q, e);
if (ret) if (ret)
goto err_free_sbitmap; goto err_free_map_and_rqs;
blk_mq_debugfs_register_sched(q); blk_mq_debugfs_register_sched(q);
@ -631,7 +592,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
ret = e->ops.init_hctx(hctx, i); ret = e->ops.init_hctx(hctx, i);
if (ret) { if (ret) {
eq = q->elevator; eq = q->elevator;
blk_mq_sched_free_requests(q); blk_mq_sched_free_rqs(q);
blk_mq_exit_sched(q, eq); blk_mq_exit_sched(q, eq);
kobject_put(&eq->kobj); kobject_put(&eq->kobj);
return ret; return ret;
@ -642,12 +603,10 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
return 0; return 0;
err_free_sbitmap: err_free_map_and_rqs:
if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) blk_mq_sched_free_rqs(q);
blk_mq_exit_sched_shared_sbitmap(q); blk_mq_sched_tags_teardown(q, flags);
err_free_tags:
blk_mq_sched_free_requests(q);
blk_mq_sched_tags_teardown(q);
q->elevator = NULL; q->elevator = NULL;
return ret; return ret;
} }
@ -656,14 +615,20 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
* called in either blk_queue_cleanup or elevator_switch, tagset * called in either blk_queue_cleanup or elevator_switch, tagset
* is required for freeing requests * is required for freeing requests
*/ */
void blk_mq_sched_free_requests(struct request_queue *q) void blk_mq_sched_free_rqs(struct request_queue *q)
{ {
struct blk_mq_hw_ctx *hctx; struct blk_mq_hw_ctx *hctx;
int i; int i;
queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_is_shared_tags(q->tag_set->flags)) {
if (hctx->sched_tags) blk_mq_free_rqs(q->tag_set, q->sched_shared_tags,
blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i); BLK_MQ_NO_HCTX_IDX);
} else {
queue_for_each_hw_ctx(q, hctx, i) {
if (hctx->sched_tags)
blk_mq_free_rqs(q->tag_set,
hctx->sched_tags, i);
}
} }
} }
@ -684,8 +649,6 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
blk_mq_debugfs_unregister_sched(q); blk_mq_debugfs_unregister_sched(q);
if (e->type->ops.exit_sched) if (e->type->ops.exit_sched)
e->type->ops.exit_sched(e); e->type->ops.exit_sched(e);
blk_mq_sched_tags_teardown(q); blk_mq_sched_tags_teardown(q, flags);
if (blk_mq_is_sbitmap_shared(flags))
blk_mq_exit_sched_shared_sbitmap(q);
q->elevator = NULL; q->elevator = NULL;
} }


@ -2,21 +2,20 @@
#ifndef BLK_MQ_SCHED_H #ifndef BLK_MQ_SCHED_H
#define BLK_MQ_SCHED_H #define BLK_MQ_SCHED_H
#include "elevator.h"
#include "blk-mq.h" #include "blk-mq.h"
#include "blk-mq-tag.h" #include "blk-mq-tag.h"
#define MAX_SCHED_RQ (16 * BLKDEV_MAX_RQ) #define MAX_SCHED_RQ (16 * BLKDEV_DEFAULT_RQ)
void blk_mq_sched_assign_ioc(struct request *rq);
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs, struct request **merged_request); unsigned int nr_segs, struct request **merged_request);
bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs); unsigned int nr_segs);
bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
struct list_head *free); struct list_head *free);
void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx);
void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
void blk_mq_sched_insert_request(struct request *rq, bool at_head, void blk_mq_sched_insert_request(struct request *rq, bool at_head,
bool run_queue, bool async); bool run_queue, bool async);
@ -28,45 +27,51 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e); int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e);
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
void blk_mq_sched_free_requests(struct request_queue *q); void blk_mq_sched_free_rqs(struct request_queue *q);
static inline bool static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs)
{ {
if (blk_queue_nomerges(q) || !bio_mergeable(bio)) if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
return false; __blk_mq_sched_restart(hctx);
}
return __blk_mq_sched_bio_merge(q, bio, nr_segs); static inline bool bio_mergeable(struct bio *bio)
{
return !(bio->bi_opf & REQ_NOMERGE_FLAGS);
} }
static inline bool static inline bool
blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq, blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
struct bio *bio) struct bio *bio)
{ {
struct elevator_queue *e = q->elevator; if (rq->rq_flags & RQF_ELV) {
struct elevator_queue *e = q->elevator;
if (e && e->type->ops.allow_merge)
return e->type->ops.allow_merge(q, rq, bio);
if (e->type->ops.allow_merge)
return e->type->ops.allow_merge(q, rq, bio);
}
return true; return true;
} }
static inline void blk_mq_sched_completed_request(struct request *rq, u64 now) static inline void blk_mq_sched_completed_request(struct request *rq, u64 now)
{ {
struct elevator_queue *e = rq->q->elevator; if (rq->rq_flags & RQF_ELV) {
struct elevator_queue *e = rq->q->elevator;
if (e && e->type->ops.completed_request) if (e->type->ops.completed_request)
e->type->ops.completed_request(rq, now); e->type->ops.completed_request(rq, now);
}
} }
static inline void blk_mq_sched_requeue_request(struct request *rq) static inline void blk_mq_sched_requeue_request(struct request *rq)
{ {
struct request_queue *q = rq->q; if (rq->rq_flags & RQF_ELV) {
struct elevator_queue *e = q->elevator; struct request_queue *q = rq->q;
struct elevator_queue *e = q->elevator;
if ((rq->rq_flags & RQF_ELVPRIV) && e && e->type->ops.requeue_request) if ((rq->rq_flags & RQF_ELVPRIV) && e->type->ops.requeue_request)
e->type->ops.requeue_request(rq); e->type->ops.requeue_request(rq);
}
} }
static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx) static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx)
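Note on the inline helpers above: the rework gates every elevator callback on a per-request flag (RQF_ELV) set at allocation time, instead of testing q->elevator for NULL on each call. A minimal standalone sketch of that pattern follows; every name below is invented purely for illustration and is not kernel code.

#include <stdio.h>
#include <stdbool.h>

#define RQF_ELV (1u << 0)   /* request was allocated with a scheduler attached */

struct elevator_ops { void (*completed)(int tag); };
struct elevator { struct elevator_ops ops; };
struct queue    { struct elevator *elevator; };
struct request  { struct queue *q; unsigned int rq_flags; int tag; };

static void sched_completed(int tag)
{
    printf("scheduler saw completion of tag %d\n", tag);
}

/* Only touch q->elevator when the flag guarantees it was set at allocation time. */
static void completed_request(struct request *rq)
{
    if (rq->rq_flags & RQF_ELV) {
        struct elevator *e = rq->q->elevator;

        if (e->ops.completed)
            e->ops.completed(rq->tag);
    }
}

int main(void)
{
    struct elevator e = { .ops = { .completed = sched_completed } };
    struct queue q = { .elevator = &e };
    struct request with_elv = { .q = &q, .rq_flags = RQF_ELV, .tag = 7 };
    struct request no_elv   = { .q = &q, .rq_flags = 0,       .tag = 8 };

    completed_request(&with_elv);  /* invokes the callback */
    completed_request(&no_elv);    /* skipped: flag not set */
    return 0;
}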


@ -36,8 +36,6 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj)
struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx, struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx,
kobj); kobj);
if (hctx->flags & BLK_MQ_F_BLOCKING)
cleanup_srcu_struct(hctx->srcu);
blk_free_flush_queue(hctx->fq); blk_free_flush_queue(hctx->fq);
sbitmap_free(&hctx->ctx_map); sbitmap_free(&hctx->ctx_map);
free_cpumask_var(hctx->cpumask); free_cpumask_var(hctx->cpumask);


@ -16,6 +16,21 @@
#include "blk-mq-sched.h" #include "blk-mq-sched.h"
#include "blk-mq-tag.h" #include "blk-mq-tag.h"
/*
* Recalculate wakeup batch when tag is shared by hctx.
*/
static void blk_mq_update_wake_batch(struct blk_mq_tags *tags,
unsigned int users)
{
if (!users)
return;
sbitmap_queue_recalculate_wake_batch(&tags->bitmap_tags,
users);
sbitmap_queue_recalculate_wake_batch(&tags->breserved_tags,
users);
}
/* /*
* If a previously inactive queue goes active, bump the active user count. * If a previously inactive queue goes active, bump the active user count.
* We need to do this before try to allocate driver tag, then even if fail * We need to do this before try to allocate driver tag, then even if fail
@ -24,19 +39,26 @@
*/ */
bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{ {
if (blk_mq_is_sbitmap_shared(hctx->flags)) { unsigned int users;
struct request_queue *q = hctx->queue;
struct blk_mq_tag_set *set = q->tag_set;
if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) && if (blk_mq_is_shared_tags(hctx->flags)) {
!test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) struct request_queue *q = hctx->queue;
atomic_inc(&set->active_queues_shared_sbitmap);
if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) ||
test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) {
return true;
}
} else { } else {
if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) && if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) ||
!test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) {
atomic_inc(&hctx->tags->active_queues); return true;
}
} }
users = atomic_inc_return(&hctx->tags->active_queues);
blk_mq_update_wake_batch(hctx->tags, users);
return true; return true;
} }
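The hunk above ties the sbitmap wake batch to the number of queues actively sharing the tags, so each waiter is woken for smaller batches as sharing grows. A rough standalone model of that relationship is sketched below; the divisor and clamp are illustrative guesses, not sbitmap's actual formula.

#include <stdio.h>

/* Model: as more queues go active, each one is woken for smaller batches of freed tags. */
static unsigned int wake_batch(unsigned int depth, unsigned int active_users)
{
    unsigned int batch;

    if (!active_users)
        active_users = 1;
    batch = depth / (4 * active_users);  /* illustrative split only */
    return batch ? batch : 1;
}

int main(void)
{
    unsigned int users;

    for (users = 1; users <= 8; users *= 2)
        printf("%u active queue(s) -> wake batch %u\n", users, wake_batch(256, users));
    return 0;
}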
@ -45,9 +67,9 @@ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
*/ */
void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
{ {
sbitmap_queue_wake_all(tags->bitmap_tags); sbitmap_queue_wake_all(&tags->bitmap_tags);
if (include_reserve) if (include_reserve)
sbitmap_queue_wake_all(tags->breserved_tags); sbitmap_queue_wake_all(&tags->breserved_tags);
} }
/* /*
@ -57,20 +79,23 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{ {
struct blk_mq_tags *tags = hctx->tags; struct blk_mq_tags *tags = hctx->tags;
struct request_queue *q = hctx->queue; unsigned int users;
struct blk_mq_tag_set *set = q->tag_set;
if (blk_mq_is_shared_tags(hctx->flags)) {
struct request_queue *q = hctx->queue;
if (blk_mq_is_sbitmap_shared(hctx->flags)) {
if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE, if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE,
&q->queue_flags)) &q->queue_flags))
return; return;
atomic_dec(&set->active_queues_shared_sbitmap);
} else { } else {
if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
return; return;
atomic_dec(&tags->active_queues);
} }
users = atomic_dec_return(&tags->active_queues);
blk_mq_update_wake_batch(tags, users);
blk_mq_tag_wakeup_all(tags, false); blk_mq_tag_wakeup_all(tags, false);
} }
@ -87,6 +112,21 @@ static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
return __sbitmap_queue_get(bt); return __sbitmap_queue_get(bt);
} }
unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
unsigned int *offset)
{
struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
struct sbitmap_queue *bt = &tags->bitmap_tags;
unsigned long ret;
	if (data->shallow_depth || data->flags & BLK_MQ_REQ_RESERVED ||
data->hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
return 0;
ret = __sbitmap_queue_get_batch(bt, nr_tags, offset);
*offset += tags->nr_reserved_tags;
return ret;
}
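blk_mq_get_tags() above is the batched allocation path: it falls back to single-tag allocation whenever reserved tags, a shallow depth or shared tags are in play, and otherwise claims up to nr_tags bits from the bitmap in one pass. A simplified standalone model of claiming a batch of free slots from a plain bitmap (not the sbitmap implementation) looks like this:

#include <stdio.h>

/*
 * Claim up to @want free slots from @*bitmap in one pass, returning a mask
 * of the slots actually claimed and how many were taken through @got.
 */
static unsigned long get_batch(unsigned long *bitmap, unsigned int want,
                               unsigned int *got)
{
    unsigned long claimed = 0;
    unsigned int bit, taken = 0;

    for (bit = 0; bit < sizeof(*bitmap) * 8 && taken < want; bit++) {
        unsigned long mask = 1UL << bit;

        if (*bitmap & mask)
            continue;            /* already in use */
        *bitmap |= mask;
        claimed |= mask;
        taken++;
    }
    *got = taken;
    return claimed;
}

int main(void)
{
    unsigned long used = 0x0fUL;  /* slots 0-3 busy */
    unsigned int got;
    unsigned long batch = get_batch(&used, 3, &got);

    printf("claimed %u slots, mask 0x%lx\n", got, batch);
    return 0;
}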
unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
{ {
struct blk_mq_tags *tags = blk_mq_tags_from_data(data); struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
@ -101,10 +141,10 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
WARN_ON_ONCE(1); WARN_ON_ONCE(1);
return BLK_MQ_NO_TAG; return BLK_MQ_NO_TAG;
} }
bt = tags->breserved_tags; bt = &tags->breserved_tags;
tag_offset = 0; tag_offset = 0;
} else { } else {
bt = tags->bitmap_tags; bt = &tags->bitmap_tags;
tag_offset = tags->nr_reserved_tags; tag_offset = tags->nr_reserved_tags;
} }
@ -150,9 +190,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
data->ctx); data->ctx);
tags = blk_mq_tags_from_data(data); tags = blk_mq_tags_from_data(data);
if (data->flags & BLK_MQ_REQ_RESERVED) if (data->flags & BLK_MQ_REQ_RESERVED)
bt = tags->breserved_tags; bt = &tags->breserved_tags;
else else
bt = tags->bitmap_tags; bt = &tags->bitmap_tags;
/* /*
* If destination hw queue is changed, fake wake up on * If destination hw queue is changed, fake wake up on
@ -186,16 +226,23 @@ void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
const int real_tag = tag - tags->nr_reserved_tags; const int real_tag = tag - tags->nr_reserved_tags;
BUG_ON(real_tag >= tags->nr_tags); BUG_ON(real_tag >= tags->nr_tags);
sbitmap_queue_clear(tags->bitmap_tags, real_tag, ctx->cpu); sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu);
} else { } else {
BUG_ON(tag >= tags->nr_reserved_tags); BUG_ON(tag >= tags->nr_reserved_tags);
sbitmap_queue_clear(tags->breserved_tags, tag, ctx->cpu); sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu);
} }
} }
void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags)
{
sbitmap_queue_clear_batch(&tags->bitmap_tags, tags->nr_reserved_tags,
tag_array, nr_tags);
}
struct bt_iter_data { struct bt_iter_data {
struct blk_mq_hw_ctx *hctx; struct blk_mq_hw_ctx *hctx;
busy_iter_fn *fn; struct request_queue *q;
busy_tag_iter_fn *fn;
void *data; void *data;
bool reserved; bool reserved;
}; };
@ -208,7 +255,7 @@ static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags,
spin_lock_irqsave(&tags->lock, flags); spin_lock_irqsave(&tags->lock, flags);
rq = tags->rqs[bitnr]; rq = tags->rqs[bitnr];
if (!rq || rq->tag != bitnr || !refcount_inc_not_zero(&rq->ref)) if (!rq || rq->tag != bitnr || !req_ref_inc_not_zero(rq))
rq = NULL; rq = NULL;
spin_unlock_irqrestore(&tags->lock, flags); spin_unlock_irqrestore(&tags->lock, flags);
return rq; return rq;
@ -218,11 +265,18 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{ {
struct bt_iter_data *iter_data = data; struct bt_iter_data *iter_data = data;
struct blk_mq_hw_ctx *hctx = iter_data->hctx; struct blk_mq_hw_ctx *hctx = iter_data->hctx;
struct blk_mq_tags *tags = hctx->tags; struct request_queue *q = iter_data->q;
struct blk_mq_tag_set *set = q->tag_set;
bool reserved = iter_data->reserved; bool reserved = iter_data->reserved;
struct blk_mq_tags *tags;
struct request *rq; struct request *rq;
bool ret = true; bool ret = true;
if (blk_mq_is_shared_tags(set->flags))
tags = set->shared_tags;
else
tags = hctx->tags;
if (!reserved) if (!reserved)
bitnr += tags->nr_reserved_tags; bitnr += tags->nr_reserved_tags;
/* /*
@ -233,8 +287,8 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
if (!rq) if (!rq)
return true; return true;
if (rq->q == hctx->queue && rq->mq_hctx == hctx) if (rq->q == q && (!hctx || rq->mq_hctx == hctx))
ret = iter_data->fn(hctx, rq, iter_data->data, reserved); ret = iter_data->fn(rq, iter_data->data, reserved);
blk_mq_put_rq_ref(rq); blk_mq_put_rq_ref(rq);
return ret; return ret;
} }
@ -242,6 +296,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
/** /**
* bt_for_each - iterate over the requests associated with a hardware queue * bt_for_each - iterate over the requests associated with a hardware queue
* @hctx: Hardware queue to examine. * @hctx: Hardware queue to examine.
* @q: Request queue to examine.
* @bt: sbitmap to examine. This is either the breserved_tags member * @bt: sbitmap to examine. This is either the breserved_tags member
* or the bitmap_tags member of struct blk_mq_tags. * or the bitmap_tags member of struct blk_mq_tags.
* @fn: Pointer to the function that will be called for each request * @fn: Pointer to the function that will be called for each request
@ -253,14 +308,16 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
* @reserved: Indicates whether @bt is the breserved_tags member or the * @reserved: Indicates whether @bt is the breserved_tags member or the
* bitmap_tags member of struct blk_mq_tags. * bitmap_tags member of struct blk_mq_tags.
*/ */
static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt, static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct request_queue *q,
busy_iter_fn *fn, void *data, bool reserved) struct sbitmap_queue *bt, busy_tag_iter_fn *fn,
void *data, bool reserved)
{ {
struct bt_iter_data iter_data = { struct bt_iter_data iter_data = {
.hctx = hctx, .hctx = hctx,
.fn = fn, .fn = fn,
.data = data, .data = data,
.reserved = reserved, .reserved = reserved,
.q = q,
}; };
sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data); sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data);
@ -340,9 +397,9 @@ static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags,
WARN_ON_ONCE(flags & BT_TAG_ITER_RESERVED); WARN_ON_ONCE(flags & BT_TAG_ITER_RESERVED);
if (tags->nr_reserved_tags) if (tags->nr_reserved_tags)
bt_tags_for_each(tags, tags->breserved_tags, fn, priv, bt_tags_for_each(tags, &tags->breserved_tags, fn, priv,
flags | BT_TAG_ITER_RESERVED); flags | BT_TAG_ITER_RESERVED);
bt_tags_for_each(tags, tags->bitmap_tags, fn, priv, flags); bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, flags);
} }
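The tag-iteration helpers above (bt_iter()/bt_for_each() and __blk_mq_all_tag_iter()) walk the set bits of a tag bitmap, resolve each bit back to a request and hand it to a caller-supplied callback that can stop the walk by returning false. Stripped of the tag and request details, the pattern is roughly the following self-contained sketch (all names invented for the sketch):

#include <stdbool.h>
#include <stdio.h>

struct item { int id; };

/* Caller-supplied visitor: returning false stops the walk early. */
typedef bool (*visit_fn)(struct item *it, void *data);

/* Walk every set bit in @bitmap and map it to the item stored at that index. */
static void for_each_set(unsigned long bitmap, struct item *items,
                         visit_fn fn, void *data)
{
    unsigned int bit;

    for (bit = 0; bit < sizeof(bitmap) * 8; bit++) {
        if (!(bitmap & (1UL << bit)))
            continue;
        if (!fn(&items[bit], data))
            break;
    }
}

static bool print_item(struct item *it, void *data)
{
    int *budget = data;

    printf("visiting item %d\n", it->id);
    return --(*budget) > 0;   /* stop after the budget is spent */
}

int main(void)
{
    struct item items[8] = { {0}, {1}, {2}, {3}, {4}, {5}, {6}, {7} };
    int budget = 2;

    for_each_set(0xb5UL, items, print_item, &budget);  /* bits 0,2,4,5,7 set */
    return 0;
}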
/** /**
@ -379,9 +436,12 @@ void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
busy_tag_iter_fn *fn, void *priv) busy_tag_iter_fn *fn, void *priv)
{ {
int i; unsigned int flags = tagset->flags;
int i, nr_tags;
for (i = 0; i < tagset->nr_hw_queues; i++) { nr_tags = blk_mq_is_shared_tags(flags) ? 1 : tagset->nr_hw_queues;
for (i = 0; i < nr_tags; i++) {
if (tagset->tags && tagset->tags[i]) if (tagset->tags && tagset->tags[i])
__blk_mq_all_tag_iter(tagset->tags[i], fn, priv, __blk_mq_all_tag_iter(tagset->tags[i], fn, priv,
BT_TAG_ITER_STARTED); BT_TAG_ITER_STARTED);
@ -434,12 +494,9 @@ EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request);
* called for all requests on all queues that share that tag set and not only * called for all requests on all queues that share that tag set and not only
* for requests associated with @q. * for requests associated with @q.
*/ */
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn,
void *priv) void *priv)
{ {
struct blk_mq_hw_ctx *hctx;
int i;
/* /*
* __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx
* while the queue is frozen. So we can use q_usage_counter to avoid * while the queue is frozen. So we can use q_usage_counter to avoid
@ -448,19 +505,34 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
if (!percpu_ref_tryget(&q->q_usage_counter)) if (!percpu_ref_tryget(&q->q_usage_counter))
return; return;
queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_is_shared_tags(q->tag_set->flags)) {
struct blk_mq_tags *tags = hctx->tags; struct blk_mq_tags *tags = q->tag_set->shared_tags;
struct sbitmap_queue *bresv = &tags->breserved_tags;
/* struct sbitmap_queue *btags = &tags->bitmap_tags;
* If no software queues are currently mapped to this
* hardware queue, there's nothing to check
*/
if (!blk_mq_hw_queue_mapped(hctx))
continue;
if (tags->nr_reserved_tags) if (tags->nr_reserved_tags)
bt_for_each(hctx, tags->breserved_tags, fn, priv, true); bt_for_each(NULL, q, bresv, fn, priv, true);
bt_for_each(hctx, tags->bitmap_tags, fn, priv, false); bt_for_each(NULL, q, btags, fn, priv, false);
} else {
struct blk_mq_hw_ctx *hctx;
int i;
queue_for_each_hw_ctx(q, hctx, i) {
struct blk_mq_tags *tags = hctx->tags;
struct sbitmap_queue *bresv = &tags->breserved_tags;
struct sbitmap_queue *btags = &tags->bitmap_tags;
/*
* If no software queues are currently mapped to this
* hardware queue, there's nothing to check
*/
if (!blk_mq_hw_queue_mapped(hctx))
continue;
if (tags->nr_reserved_tags)
bt_for_each(hctx, q, bresv, fn, priv, true);
bt_for_each(hctx, q, btags, fn, priv, false);
}
} }
blk_queue_exit(q); blk_queue_exit(q);
} }
@ -492,56 +564,10 @@ int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags,
return -ENOMEM; return -ENOMEM;
} }
static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
int node, int alloc_policy)
{
int ret;
ret = blk_mq_init_bitmaps(&tags->__bitmap_tags,
&tags->__breserved_tags,
tags->nr_tags, tags->nr_reserved_tags,
node, alloc_policy);
if (ret)
return ret;
tags->bitmap_tags = &tags->__bitmap_tags;
tags->breserved_tags = &tags->__breserved_tags;
return 0;
}
int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set)
{
int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
int i, ret;
ret = blk_mq_init_bitmaps(&set->__bitmap_tags, &set->__breserved_tags,
set->queue_depth, set->reserved_tags,
set->numa_node, alloc_policy);
if (ret)
return ret;
for (i = 0; i < set->nr_hw_queues; i++) {
struct blk_mq_tags *tags = set->tags[i];
tags->bitmap_tags = &set->__bitmap_tags;
tags->breserved_tags = &set->__breserved_tags;
}
return 0;
}
void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set)
{
sbitmap_queue_free(&set->__bitmap_tags);
sbitmap_queue_free(&set->__breserved_tags);
}
struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
unsigned int reserved_tags, unsigned int reserved_tags,
int node, unsigned int flags) int node, int alloc_policy)
{ {
int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(flags);
struct blk_mq_tags *tags; struct blk_mq_tags *tags;
if (total_tags > BLK_MQ_TAG_MAX) { if (total_tags > BLK_MQ_TAG_MAX) {
@ -557,22 +583,19 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
tags->nr_reserved_tags = reserved_tags; tags->nr_reserved_tags = reserved_tags;
spin_lock_init(&tags->lock); spin_lock_init(&tags->lock);
if (blk_mq_is_sbitmap_shared(flags)) if (blk_mq_init_bitmaps(&tags->bitmap_tags, &tags->breserved_tags,
return tags; total_tags, reserved_tags, node,
alloc_policy) < 0) {
if (blk_mq_init_bitmap_tags(tags, node, alloc_policy) < 0) {
kfree(tags); kfree(tags);
return NULL; return NULL;
} }
return tags; return tags;
} }
void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags) void blk_mq_free_tags(struct blk_mq_tags *tags)
{ {
if (!blk_mq_is_sbitmap_shared(flags)) { sbitmap_queue_free(&tags->bitmap_tags);
sbitmap_queue_free(tags->bitmap_tags); sbitmap_queue_free(&tags->breserved_tags);
sbitmap_queue_free(tags->breserved_tags);
}
kfree(tags); kfree(tags);
} }
@ -592,7 +615,6 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
if (tdepth > tags->nr_tags) { if (tdepth > tags->nr_tags) {
struct blk_mq_tag_set *set = hctx->queue->tag_set; struct blk_mq_tag_set *set = hctx->queue->tag_set;
struct blk_mq_tags *new; struct blk_mq_tags *new;
bool ret;
if (!can_grow) if (!can_grow)
return -EINVAL; return -EINVAL;
@ -604,34 +626,42 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
if (tdepth > MAX_SCHED_RQ) if (tdepth > MAX_SCHED_RQ)
return -EINVAL; return -EINVAL;
new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, /*
tags->nr_reserved_tags, set->flags); * Only the sbitmap needs resizing since we allocated the max
* initially.
*/
if (blk_mq_is_shared_tags(set->flags))
return 0;
new = blk_mq_alloc_map_and_rqs(set, hctx->queue_num, tdepth);
if (!new) if (!new)
return -ENOMEM; return -ENOMEM;
ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);
if (ret) {
blk_mq_free_rq_map(new, set->flags);
return -ENOMEM;
}
blk_mq_free_rqs(set, *tagsptr, hctx->queue_num); blk_mq_free_map_and_rqs(set, *tagsptr, hctx->queue_num);
blk_mq_free_rq_map(*tagsptr, set->flags);
*tagsptr = new; *tagsptr = new;
} else { } else {
/* /*
* Don't need (or can't) update reserved tags here, they * Don't need (or can't) update reserved tags here, they
* remain static and should never need resizing. * remain static and should never need resizing.
*/ */
sbitmap_queue_resize(tags->bitmap_tags, sbitmap_queue_resize(&tags->bitmap_tags,
tdepth - tags->nr_reserved_tags); tdepth - tags->nr_reserved_tags);
} }
return 0; return 0;
} }
void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int size) void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size)
{ {
sbitmap_queue_resize(&set->__bitmap_tags, size - set->reserved_tags); struct blk_mq_tags *tags = set->shared_tags;
sbitmap_queue_resize(&tags->bitmap_tags, size - set->reserved_tags);
}
void blk_mq_tag_update_sched_shared_tags(struct request_queue *q)
{
sbitmap_queue_resize(&q->sched_shared_tags->bitmap_tags,
q->nr_requests - q->tag_set->reserved_tags);
} }
/** /**


@ -2,55 +2,33 @@
#ifndef INT_BLK_MQ_TAG_H #ifndef INT_BLK_MQ_TAG_H
#define INT_BLK_MQ_TAG_H #define INT_BLK_MQ_TAG_H
/* struct blk_mq_alloc_data;
* Tag address space map.
*/
struct blk_mq_tags {
unsigned int nr_tags;
unsigned int nr_reserved_tags;
atomic_t active_queues;
struct sbitmap_queue *bitmap_tags;
struct sbitmap_queue *breserved_tags;
struct sbitmap_queue __bitmap_tags;
struct sbitmap_queue __breserved_tags;
struct request **rqs;
struct request **static_rqs;
struct list_head page_list;
/*
* used to clear request reference in rqs[] before freeing one
* request pool
*/
spinlock_t lock;
};
extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,
unsigned int reserved_tags, unsigned int reserved_tags,
int node, unsigned int flags); int node, int alloc_policy);
extern void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags); extern void blk_mq_free_tags(struct blk_mq_tags *tags);
extern int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags, extern int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags,
struct sbitmap_queue *breserved_tags, struct sbitmap_queue *breserved_tags,
unsigned int queue_depth, unsigned int queue_depth,
unsigned int reserved, unsigned int reserved,
int node, int alloc_policy); int node, int alloc_policy);
extern int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set);
extern void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set);
extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
unsigned int *offset);
extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
unsigned int tag); unsigned int tag);
void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags);
extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
struct blk_mq_tags **tags, struct blk_mq_tags **tags,
unsigned int depth, bool can_grow); unsigned int depth, bool can_grow);
extern void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set, extern void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set,
unsigned int size); unsigned int size);
extern void blk_mq_tag_update_sched_shared_tags(struct request_queue *q);
extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn,
void *priv); void *priv);
void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
void *priv); void *priv);

File diff suppressed because it is too large


@ -25,18 +25,14 @@ struct blk_mq_ctx {
unsigned short index_hw[HCTX_MAX_TYPES]; unsigned short index_hw[HCTX_MAX_TYPES];
struct blk_mq_hw_ctx *hctxs[HCTX_MAX_TYPES]; struct blk_mq_hw_ctx *hctxs[HCTX_MAX_TYPES];
/* incremented at dispatch time */
unsigned long rq_dispatched[2];
unsigned long rq_merged;
/* incremented at completion time */
unsigned long ____cacheline_aligned_in_smp rq_completed[2];
struct request_queue *queue; struct request_queue *queue;
struct blk_mq_ctxs *ctxs; struct blk_mq_ctxs *ctxs;
struct kobject kobj; struct kobject kobj;
} ____cacheline_aligned_in_smp; } ____cacheline_aligned_in_smp;
void blk_mq_submit_bio(struct bio *bio);
int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
unsigned int flags);
void blk_mq_exit_queue(struct request_queue *q); void blk_mq_exit_queue(struct request_queue *q);
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
void blk_mq_wake_waiters(struct request_queue *q); void blk_mq_wake_waiters(struct request_queue *q);
@ -54,15 +50,12 @@ void blk_mq_put_rq_ref(struct request *rq);
*/ */
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx); unsigned int hctx_idx);
void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags); void blk_mq_free_rq_map(struct blk_mq_tags *tags);
struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
unsigned int hctx_idx, unsigned int hctx_idx, unsigned int depth);
unsigned int nr_tags, void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
unsigned int reserved_tags, struct blk_mq_tags *tags,
unsigned int flags); unsigned int hctx_idx);
int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx, unsigned int depth);
/* /*
* Internal helpers for request insertion into sw queues * Internal helpers for request insertion into sw queues
*/ */
@ -72,9 +65,6 @@ void blk_mq_request_bypass_insert(struct request *rq, bool at_head,
bool run_queue); bool run_queue);
void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
struct list_head *list); struct list_head *list);
/* Used by blk_insert_cloned_request() to issue request directly */
blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last);
void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
struct list_head *list); struct list_head *list);
@ -96,6 +86,20 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *
return q->queue_hw_ctx[q->tag_set->map[type].mq_map[cpu]]; return q->queue_hw_ctx[q->tag_set->map[type].mq_map[cpu]];
} }
static inline enum hctx_type blk_mq_get_hctx_type(unsigned int flags)
{
enum hctx_type type = HCTX_TYPE_DEFAULT;
/*
* The caller ensure that if REQ_POLLED, poll must be enabled.
*/
if (flags & REQ_POLLED)
type = HCTX_TYPE_POLL;
else if ((flags & REQ_OP_MASK) == REQ_OP_READ)
type = HCTX_TYPE_READ;
return type;
}
/* /*
* blk_mq_map_queue() - map (cmd_flags,type) to hardware queue * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue
* @q: request queue * @q: request queue
@ -106,17 +110,7 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
unsigned int flags, unsigned int flags,
struct blk_mq_ctx *ctx) struct blk_mq_ctx *ctx)
{ {
enum hctx_type type = HCTX_TYPE_DEFAULT; return ctx->hctxs[blk_mq_get_hctx_type(flags)];
/*
* The caller ensure that if REQ_HIPRI, poll must be enabled.
*/
if (flags & REQ_HIPRI)
type = HCTX_TYPE_POLL;
else if ((flags & REQ_OP_MASK) == REQ_OP_READ)
type = HCTX_TYPE_READ;
return ctx->hctxs[type];
} }
/* /*
@ -128,6 +122,8 @@ extern int __blk_mq_register_dev(struct device *dev, struct request_queue *q);
extern int blk_mq_sysfs_register(struct request_queue *q); extern int blk_mq_sysfs_register(struct request_queue *q);
extern void blk_mq_sysfs_unregister(struct request_queue *q); extern void blk_mq_sysfs_unregister(struct request_queue *q);
extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
void blk_mq_free_plug_rqs(struct blk_plug *plug);
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
void blk_mq_cancel_work_sync(struct request_queue *q); void blk_mq_cancel_work_sync(struct request_queue *q);
@ -156,23 +152,27 @@ struct blk_mq_alloc_data {
blk_mq_req_flags_t flags; blk_mq_req_flags_t flags;
unsigned int shallow_depth; unsigned int shallow_depth;
unsigned int cmd_flags; unsigned int cmd_flags;
req_flags_t rq_flags;
/* allocate multiple requests/tags in one go */
unsigned int nr_tags;
struct request **cached_rq;
/* input & output parameter */ /* input & output parameter */
struct blk_mq_ctx *ctx; struct blk_mq_ctx *ctx;
struct blk_mq_hw_ctx *hctx; struct blk_mq_hw_ctx *hctx;
}; };
static inline bool blk_mq_is_sbitmap_shared(unsigned int flags) static inline bool blk_mq_is_shared_tags(unsigned int flags)
{ {
return flags & BLK_MQ_F_TAG_HCTX_SHARED; return flags & BLK_MQ_F_TAG_HCTX_SHARED;
} }
static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data) static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data)
{ {
if (data->q->elevator) if (!(data->rq_flags & RQF_ELV))
return data->hctx->sched_tags; return data->hctx->tags;
return data->hctx->sched_tags;
return data->hctx->tags;
} }
static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx) static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx)
@ -222,24 +222,30 @@ static inline int blk_mq_get_rq_budget_token(struct request *rq)
static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx) static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx)
{ {
if (blk_mq_is_sbitmap_shared(hctx->flags)) if (blk_mq_is_shared_tags(hctx->flags))
atomic_inc(&hctx->queue->nr_active_requests_shared_sbitmap); atomic_inc(&hctx->queue->nr_active_requests_shared_tags);
else else
atomic_inc(&hctx->nr_active); atomic_inc(&hctx->nr_active);
} }
static inline void __blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx,
int val)
{
if (blk_mq_is_shared_tags(hctx->flags))
atomic_sub(val, &hctx->queue->nr_active_requests_shared_tags);
else
atomic_sub(val, &hctx->nr_active);
}
static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx) static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx)
{ {
if (blk_mq_is_sbitmap_shared(hctx->flags)) __blk_mq_sub_active_requests(hctx, 1);
atomic_dec(&hctx->queue->nr_active_requests_shared_sbitmap);
else
atomic_dec(&hctx->nr_active);
} }
static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx) static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx)
{ {
if (blk_mq_is_sbitmap_shared(hctx->flags)) if (blk_mq_is_shared_tags(hctx->flags))
return atomic_read(&hctx->queue->nr_active_requests_shared_sbitmap); return atomic_read(&hctx->queue->nr_active_requests_shared_tags);
return atomic_read(&hctx->nr_active); return atomic_read(&hctx->nr_active);
} }
static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
@ -262,7 +268,20 @@ static inline void blk_mq_put_driver_tag(struct request *rq)
__blk_mq_put_driver_tag(rq->mq_hctx, rq); __blk_mq_put_driver_tag(rq->mq_hctx, rq);
} }
bool blk_mq_get_driver_tag(struct request *rq); bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq);
static inline bool blk_mq_get_driver_tag(struct request *rq)
{
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
if (rq->tag != BLK_MQ_NO_TAG &&
!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
hctx->tags->rqs[rq->tag] = rq;
return true;
}
return __blk_mq_get_driver_tag(hctx, rq);
}
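blk_mq_get_driver_tag() above keeps the common case inline, reusing a tag that is already assigned, and only calls the out-of-line __blk_mq_get_driver_tag() slow path when a tag actually has to be allocated. A toy standalone sketch of that inline-fast-path/out-of-line-slow-path split, with invented names:

#include <stdio.h>

struct req { int tag; };

/* Out-of-line slow path: pretend to allocate a new tag. */
static int slow_get_tag(struct req *rq)
{
    rq->tag = 42;               /* illustrative allocation */
    return 1;
}

/* Inline fast path: reuse an already-assigned tag, else fall back. */
static inline int get_tag(struct req *rq)
{
    if (rq->tag >= 0)
        return 1;               /* fast path, no function call */
    return slow_get_tag(rq);
}

int main(void)
{
    struct req cached = { .tag = 7 }, fresh = { .tag = -1 };

    printf("cached: ok=%d tag=%d\n", get_tag(&cached), cached.tag);
    printf("fresh:  ok=%d tag=%d\n", get_tag(&fresh), fresh.tag);
    return 0;
}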
static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
{ {
@ -333,19 +352,18 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
if (bt->sb.depth == 1) if (bt->sb.depth == 1)
return true; return true;
if (blk_mq_is_sbitmap_shared(hctx->flags)) { if (blk_mq_is_shared_tags(hctx->flags)) {
struct request_queue *q = hctx->queue; struct request_queue *q = hctx->queue;
struct blk_mq_tag_set *set = q->tag_set;
if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
return true; return true;
users = atomic_read(&set->active_queues_shared_sbitmap);
} else { } else {
if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
return true; return true;
users = atomic_read(&hctx->tags->active_queues);
} }
users = atomic_read(&hctx->tags->active_queues);
if (!users) if (!users)
return true; return true;
@ -356,5 +374,24 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
return __blk_mq_active_requests(hctx) < depth; return __blk_mq_active_requests(hctx) < depth;
} }
/* run the code block in @dispatch_ops with rcu/srcu read lock held */
#define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops) \
do { \
if (!blk_queue_has_srcu(q)) { \
rcu_read_lock(); \
(dispatch_ops); \
rcu_read_unlock(); \
} else { \
int srcu_idx; \
\
might_sleep_if(check_sleep); \
srcu_idx = srcu_read_lock((q)->srcu); \
(dispatch_ops); \
srcu_read_unlock((q)->srcu, srcu_idx); \
} \
} while (0)
#define blk_mq_run_dispatch_ops(q, dispatch_ops) \
__blk_mq_run_dispatch_ops(q, true, dispatch_ops) \
#endif #endif
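__blk_mq_run_dispatch_ops() above takes the statement to execute as a macro argument and brackets it with the appropriate read-side protection (plain RCU or SRCU). A self-contained model of that do { lock; (ops); unlock; } while (0) shape, using a pthread mutex purely as a stand-in for the RCU/SRCU read lock:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t guard = PTHREAD_MUTEX_INITIALIZER;

/*
 * Run the statement passed as @ops with the guard held; the do/while (0)
 * keeps the macro usable as a single statement, e.g. after an if.
 */
#define run_guarded(lock, ops)          \
do {                                    \
    pthread_mutex_lock(lock);           \
    (ops);                              \
    pthread_mutex_unlock(lock);         \
} while (0)

static int shared_counter;

int main(void)
{
    int snapshot;

    run_guarded(&guard, shared_counter += 3);
    run_guarded(&guard, snapshot = shared_counter);
    printf("counter is %d\n", snapshot);
    return 0;
}

Passing the statement itself into the macro is what lets blk_mq_run_dispatch_ops() pick the locking flavour in one place while callers keep writing the dispatch call inline.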


@ -189,9 +189,10 @@ static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
* BIO_TRACKED lets controllers know that a bio went through the * BIO_TRACKED lets controllers know that a bio went through the
* normal rq_qos path. * normal rq_qos path.
*/ */
bio_set_flag(bio, BIO_TRACKED); if (q->rq_qos) {
if (q->rq_qos) bio_set_flag(bio, BIO_TRACKED);
__rq_qos_throttle(q->rq_qos, bio); __rq_qos_throttle(q->rq_qos, bio);
}
} }
static inline void rq_qos_track(struct request_queue *q, struct request *rq, static inline void rq_qos_track(struct request_queue *q, struct request *rq,


@ -15,7 +15,7 @@
struct blk_queue_stats { struct blk_queue_stats {
struct list_head callbacks; struct list_head callbacks;
spinlock_t lock; spinlock_t lock;
bool enable_accounting; int accounting;
}; };
void blk_rq_stat_init(struct blk_rq_stat *stat) void blk_rq_stat_init(struct blk_rq_stat *stat)
@ -161,7 +161,7 @@ void blk_stat_remove_callback(struct request_queue *q,
spin_lock_irqsave(&q->stats->lock, flags); spin_lock_irqsave(&q->stats->lock, flags);
list_del_rcu(&cb->list); list_del_rcu(&cb->list);
if (list_empty(&q->stats->callbacks) && !q->stats->enable_accounting) if (list_empty(&q->stats->callbacks) && !q->stats->accounting)
blk_queue_flag_clear(QUEUE_FLAG_STATS, q); blk_queue_flag_clear(QUEUE_FLAG_STATS, q);
spin_unlock_irqrestore(&q->stats->lock, flags); spin_unlock_irqrestore(&q->stats->lock, flags);
@ -184,13 +184,24 @@ void blk_stat_free_callback(struct blk_stat_callback *cb)
call_rcu(&cb->rcu, blk_stat_free_callback_rcu); call_rcu(&cb->rcu, blk_stat_free_callback_rcu);
} }
void blk_stat_disable_accounting(struct request_queue *q)
{
unsigned long flags;
spin_lock_irqsave(&q->stats->lock, flags);
if (!--q->stats->accounting)
blk_queue_flag_clear(QUEUE_FLAG_STATS, q);
spin_unlock_irqrestore(&q->stats->lock, flags);
}
EXPORT_SYMBOL_GPL(blk_stat_disable_accounting);
void blk_stat_enable_accounting(struct request_queue *q) void blk_stat_enable_accounting(struct request_queue *q)
{ {
unsigned long flags; unsigned long flags;
spin_lock_irqsave(&q->stats->lock, flags); spin_lock_irqsave(&q->stats->lock, flags);
q->stats->enable_accounting = true; if (!q->stats->accounting++)
blk_queue_flag_set(QUEUE_FLAG_STATS, q); blk_queue_flag_set(QUEUE_FLAG_STATS, q);
spin_unlock_irqrestore(&q->stats->lock, flags); spin_unlock_irqrestore(&q->stats->lock, flags);
} }
EXPORT_SYMBOL_GPL(blk_stat_enable_accounting); EXPORT_SYMBOL_GPL(blk_stat_enable_accounting);
@ -205,7 +216,7 @@ struct blk_queue_stats *blk_alloc_queue_stats(void)
INIT_LIST_HEAD(&stats->callbacks); INIT_LIST_HEAD(&stats->callbacks);
spin_lock_init(&stats->lock); spin_lock_init(&stats->lock);
stats->enable_accounting = false; stats->accounting = 0;
return stats; return stats;
} }
@ -219,3 +230,21 @@ void blk_free_queue_stats(struct blk_queue_stats *stats)
kfree(stats); kfree(stats);
} }
bool blk_stats_alloc_enable(struct request_queue *q)
{
struct blk_rq_stat *poll_stat;
poll_stat = kcalloc(BLK_MQ_POLL_STATS_BKTS, sizeof(*poll_stat),
GFP_ATOMIC);
if (!poll_stat)
return false;
if (cmpxchg(&q->poll_stat, NULL, poll_stat) != NULL) {
kfree(poll_stat);
return true;
}
blk_stat_add_callback(q, q->poll_cb);
return false;
}


@ -64,11 +64,13 @@ struct blk_stat_callback {
struct blk_queue_stats *blk_alloc_queue_stats(void); struct blk_queue_stats *blk_alloc_queue_stats(void);
void blk_free_queue_stats(struct blk_queue_stats *); void blk_free_queue_stats(struct blk_queue_stats *);
bool blk_stats_alloc_enable(struct request_queue *q);
void blk_stat_add(struct request *rq, u64 now); void blk_stat_add(struct request *rq, u64 now);
/* record time/size info in request but not add a callback */ /* record time/size info in request but not add a callback */
void blk_stat_enable_accounting(struct request_queue *q); void blk_stat_enable_accounting(struct request_queue *q);
void blk_stat_disable_accounting(struct request_queue *q);
/** /**
* blk_stat_alloc_callback() - Allocate a block statistics callback. * blk_stat_alloc_callback() - Allocate a block statistics callback.


@ -16,7 +16,9 @@
#include "blk.h" #include "blk.h"
#include "blk-mq.h" #include "blk-mq.h"
#include "blk-mq-debugfs.h" #include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-wbt.h" #include "blk-wbt.h"
#include "blk-throttle.h"
struct queue_sysfs_entry { struct queue_sysfs_entry {
struct attribute attr; struct attribute attr;
@ -432,26 +434,11 @@ static ssize_t queue_poll_show(struct request_queue *q, char *page)
static ssize_t queue_poll_store(struct request_queue *q, const char *page, static ssize_t queue_poll_store(struct request_queue *q, const char *page,
size_t count) size_t count)
{ {
unsigned long poll_on; if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
ssize_t ret;
if (!q->tag_set || q->tag_set->nr_maps <= HCTX_TYPE_POLL ||
!q->tag_set->map[HCTX_TYPE_POLL].nr_queues)
return -EINVAL; return -EINVAL;
pr_info_ratelimited("writes to the poll attribute are ignored.\n");
ret = queue_var_store(&poll_on, page, count); pr_info_ratelimited("please use driver specific parameters instead.\n");
if (ret < 0) return count;
return ret;
if (poll_on) {
blk_queue_flag_set(QUEUE_FLAG_POLL, q);
} else {
blk_mq_freeze_queue(q);
blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
blk_mq_unfreeze_queue(q);
}
return ret;
} }
static ssize_t queue_io_timeout_show(struct request_queue *q, char *page) static ssize_t queue_io_timeout_show(struct request_queue *q, char *page)
@ -748,7 +735,8 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head)
{ {
struct request_queue *q = container_of(rcu_head, struct request_queue, struct request_queue *q = container_of(rcu_head, struct request_queue,
rcu_head); rcu_head);
kmem_cache_free(blk_requestq_cachep, q);
kmem_cache_free(blk_get_queue_kmem_cache(blk_queue_has_srcu(q)), q);
} }
/* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */ /* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */
@ -761,7 +749,7 @@ static void blk_exit_queue(struct request_queue *q)
*/ */
if (q->elevator) { if (q->elevator) {
ioc_clear_queue(q); ioc_clear_queue(q);
__elevator_exit(q, q->elevator); elevator_exit(q);
} }
/* /*
@ -799,14 +787,15 @@ static void blk_release_queue(struct kobject *kobj)
might_sleep(); might_sleep();
if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags)) if (q->poll_stat)
blk_stat_remove_callback(q, q->poll_cb); blk_stat_remove_callback(q, q->poll_cb);
blk_stat_free_callback(q->poll_cb); blk_stat_free_callback(q->poll_cb);
blk_free_queue_stats(q->stats);
blk_exit_queue(q); blk_exit_queue(q);
blk_free_queue_stats(q->stats);
kfree(q->poll_stat);
blk_queue_free_zone_bitmaps(q); blk_queue_free_zone_bitmaps(q);
if (queue_is_mq(q)) if (queue_is_mq(q))
@ -822,6 +811,9 @@ static void blk_release_queue(struct kobject *kobj)
bioset_exit(&q->bio_split); bioset_exit(&q->bio_split);
if (blk_queue_has_srcu(q))
cleanup_srcu_struct(q->srcu);
ida_simple_remove(&blk_queue_ida, q->id); ida_simple_remove(&blk_queue_ida, q->id);
call_rcu(&q->rcu_head, blk_free_queue_rcu); call_rcu(&q->rcu_head, blk_free_queue_rcu);
} }
@ -877,16 +869,15 @@ int blk_register_queue(struct gendisk *disk)
} }
mutex_lock(&q->sysfs_lock); mutex_lock(&q->sysfs_lock);
ret = disk_register_independent_access_ranges(disk, NULL);
if (ret)
goto put_dev;
if (q->elevator) { if (q->elevator) {
ret = elv_register_queue(q, false); ret = elv_register_queue(q, false);
if (ret) { if (ret)
mutex_unlock(&q->sysfs_lock); goto put_dev;
mutex_unlock(&q->sysfs_dir_lock);
kobject_del(&q->kobj);
blk_trace_remove_sysfs(dev);
kobject_put(&dev->kobj);
return ret;
}
} }
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
@ -899,7 +890,6 @@ int blk_register_queue(struct gendisk *disk)
kobject_uevent(&q->elevator->kobj, KOBJ_ADD); kobject_uevent(&q->elevator->kobj, KOBJ_ADD);
mutex_unlock(&q->sysfs_lock); mutex_unlock(&q->sysfs_lock);
ret = 0;
unlock: unlock:
mutex_unlock(&q->sysfs_dir_lock); mutex_unlock(&q->sysfs_dir_lock);
@ -917,6 +907,16 @@ int blk_register_queue(struct gendisk *disk)
percpu_ref_switch_to_percpu(&q->q_usage_counter); percpu_ref_switch_to_percpu(&q->q_usage_counter);
} }
return ret;
put_dev:
disk_unregister_independent_access_ranges(disk);
mutex_unlock(&q->sysfs_lock);
mutex_unlock(&q->sysfs_dir_lock);
kobject_del(&q->kobj);
blk_trace_remove_sysfs(dev);
kobject_put(&dev->kobj);
return ret; return ret;
} }
@ -962,6 +962,7 @@ void blk_unregister_queue(struct gendisk *disk)
mutex_lock(&q->sysfs_lock); mutex_lock(&q->sysfs_lock);
if (q->elevator) if (q->elevator)
elv_unregister_queue(q); elv_unregister_queue(q);
disk_unregister_independent_access_ranges(disk);
mutex_unlock(&q->sysfs_lock); mutex_unlock(&q->sysfs_lock);
mutex_unlock(&q->sysfs_dir_lock); mutex_unlock(&q->sysfs_dir_lock);


@ -13,6 +13,8 @@
#include <linux/blk-cgroup.h> #include <linux/blk-cgroup.h>
#include "blk.h" #include "blk.h"
#include "blk-cgroup-rwstat.h" #include "blk-cgroup-rwstat.h"
#include "blk-stat.h"
#include "blk-throttle.h"
/* Max dispatch from a group in 1 round */ /* Max dispatch from a group in 1 round */
#define THROTL_GRP_QUANTUM 8 #define THROTL_GRP_QUANTUM 8
@ -37,60 +39,9 @@
*/ */
#define LATENCY_FILTERED_HD (1000L) /* 1ms */ #define LATENCY_FILTERED_HD (1000L) /* 1ms */
static struct blkcg_policy blkcg_policy_throtl;
/* A workqueue to queue throttle related work */ /* A workqueue to queue throttle related work */
static struct workqueue_struct *kthrotld_workqueue; static struct workqueue_struct *kthrotld_workqueue;
/*
* To implement hierarchical throttling, throtl_grps form a tree and bios
* are dispatched upwards level by level until they reach the top and get
* issued. When dispatching bios from the children and local group at each
* level, if the bios are dispatched into a single bio_list, there's a risk
* of a local or child group which can queue many bios at once filling up
* the list starving others.
*
* To avoid such starvation, dispatched bios are queued separately
* according to where they came from. When they are again dispatched to
* the parent, they're popped in round-robin order so that no single source
* hogs the dispatch window.
*
* throtl_qnode is used to keep the queued bios separated by their sources.
* Bios are queued to throtl_qnode which in turn is queued to
* throtl_service_queue and then dispatched in round-robin order.
*
* It's also used to track the reference counts on blkg's. A qnode always
* belongs to a throtl_grp and gets queued on itself or the parent, so
* incrementing the reference of the associated throtl_grp when a qnode is
* queued and decrementing when dequeued is enough to keep the whole blkg
* tree pinned while bios are in flight.
*/
struct throtl_qnode {
struct list_head node; /* service_queue->queued[] */
struct bio_list bios; /* queued bios */
struct throtl_grp *tg; /* tg this qnode belongs to */
};
struct throtl_service_queue {
struct throtl_service_queue *parent_sq; /* the parent service_queue */
/*
* Bios queued directly to this service_queue or dispatched from
* children throtl_grp's.
*/
struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */
unsigned int nr_queued[2]; /* number of queued bios */
/*
* RB tree of active children throtl_grp's, which are sorted by
* their ->disptime.
*/
struct rb_root_cached pending_tree; /* RB tree of active tgs */
unsigned int nr_pending; /* # queued in the tree */
unsigned long first_pending_disptime; /* disptime of the first tg */
struct timer_list pending_timer; /* fires on first_pending_disptime */
};
enum tg_state_flags { enum tg_state_flags {
THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */ THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */ THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */
@ -98,93 +49,6 @@ enum tg_state_flags {
#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
enum {
LIMIT_LOW,
LIMIT_MAX,
LIMIT_CNT,
};
struct throtl_grp {
/* must be the first member */
struct blkg_policy_data pd;
/* active throtl group service_queue member */
struct rb_node rb_node;
/* throtl_data this group belongs to */
struct throtl_data *td;
/* this group's service queue */
struct throtl_service_queue service_queue;
/*
* qnode_on_self is used when bios are directly queued to this
* throtl_grp so that local bios compete fairly with bios
* dispatched from children. qnode_on_parent is used when bios are
* dispatched from this throtl_grp into its parent and will compete
* with the sibling qnode_on_parents and the parent's
* qnode_on_self.
*/
struct throtl_qnode qnode_on_self[2];
struct throtl_qnode qnode_on_parent[2];
/*
* Dispatch time in jiffies. This is the estimated time when group
* will unthrottle and is ready to dispatch more bio. It is used as
* key to sort active groups in service tree.
*/
unsigned long disptime;
unsigned int flags;
/* are there any throtl rules between this group and td? */
bool has_rules[2];
/* internally used bytes per second rate limits */
uint64_t bps[2][LIMIT_CNT];
/* user configured bps limits */
uint64_t bps_conf[2][LIMIT_CNT];
/* internally used IOPS limits */
unsigned int iops[2][LIMIT_CNT];
/* user configured IOPS limits */
unsigned int iops_conf[2][LIMIT_CNT];
/* Number of bytes dispatched in current slice */
uint64_t bytes_disp[2];
/* Number of bio's dispatched in current slice */
unsigned int io_disp[2];
unsigned long last_low_overflow_time[2];
uint64_t last_bytes_disp[2];
unsigned int last_io_disp[2];
unsigned long last_check_time;
unsigned long latency_target; /* us */
unsigned long latency_target_conf; /* us */
/* When did we start a new slice */
unsigned long slice_start[2];
unsigned long slice_end[2];
unsigned long last_finish_time; /* ns / 1024 */
unsigned long checked_last_finish_time; /* ns / 1024 */
unsigned long avg_idletime; /* ns / 1024 */
unsigned long idletime_threshold; /* us */
unsigned long idletime_threshold_conf; /* us */
unsigned int bio_cnt; /* total bios */
unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
unsigned long bio_cnt_reset_time;
atomic_t io_split_cnt[2];
atomic_t last_io_split_cnt[2];
struct blkg_rwstat stat_bytes;
struct blkg_rwstat stat_ios;
};
/* We measure latency for request size from <= 4k to >= 1M */ /* We measure latency for request size from <= 4k to >= 1M */
#define LATENCY_BUCKET_SIZE 9 #define LATENCY_BUCKET_SIZE 9
@ -231,16 +95,6 @@ struct throtl_data
static void throtl_pending_timer_fn(struct timer_list *t); static void throtl_pending_timer_fn(struct timer_list *t);
static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
{
return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
}
static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
{
return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
}
static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg) static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
{ {
return pd_to_blkg(&tg->pd); return pd_to_blkg(&tg->pd);
@ -1794,7 +1648,7 @@ static void throtl_shutdown_wq(struct request_queue *q)
cancel_work_sync(&td->dispatch_work); cancel_work_sync(&td->dispatch_work);
} }
static struct blkcg_policy blkcg_policy_throtl = { struct blkcg_policy blkcg_policy_throtl = {
.dfl_cftypes = throtl_files, .dfl_cftypes = throtl_files,
.legacy_cftypes = throtl_legacy_files, .legacy_cftypes = throtl_legacy_files,
@ -2208,9 +2062,9 @@ void blk_throtl_charge_bio_split(struct bio *bio)
} while (parent); } while (parent);
} }
bool blk_throtl_bio(struct bio *bio) bool __blk_throtl_bio(struct bio *bio)
{ {
struct request_queue *q = bio->bi_bdev->bd_disk->queue; struct request_queue *q = bdev_get_queue(bio->bi_bdev);
struct blkcg_gq *blkg = bio->bi_blkg; struct blkcg_gq *blkg = bio->bi_blkg;
struct throtl_qnode *qn = NULL; struct throtl_qnode *qn = NULL;
struct throtl_grp *tg = blkg_to_tg(blkg); struct throtl_grp *tg = blkg_to_tg(blkg);
@ -2221,19 +2075,12 @@ bool blk_throtl_bio(struct bio *bio)
rcu_read_lock(); rcu_read_lock();
/* see throtl_charge_bio() */
if (bio_flagged(bio, BIO_THROTTLED))
goto out;
if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) { if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) {
blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf, blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf,
bio->bi_iter.bi_size); bio->bi_iter.bi_size);
blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1); blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1);
} }
if (!tg->has_rules[rw])
goto out;
spin_lock_irq(&q->queue_lock); spin_lock_irq(&q->queue_lock);
throtl_update_latency_buckets(td); throtl_update_latency_buckets(td);
@ -2317,7 +2164,6 @@ bool blk_throtl_bio(struct bio *bio)
out_unlock: out_unlock:
spin_unlock_irq(&q->queue_lock); spin_unlock_irq(&q->queue_lock);
out:
bio_set_flag(bio, BIO_THROTTLED); bio_set_flag(bio, BIO_THROTTLED);
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW #ifdef CONFIG_BLK_DEV_THROTTLING_LOW


@ -2,15 +2,12 @@
#ifndef BLK_INTERNAL_H #ifndef BLK_INTERNAL_H
#define BLK_INTERNAL_H #define BLK_INTERNAL_H
#include <linux/idr.h>
#include <linux/blk-mq.h>
#include <linux/part_stat.h>
#include <linux/blk-crypto.h> #include <linux/blk-crypto.h>
#include <linux/memblock.h> /* for max_pfn/max_low_pfn */ #include <linux/memblock.h> /* for max_pfn/max_low_pfn */
#include <xen/xen.h> #include <xen/xen.h>
#include "blk-crypto-internal.h" #include "blk-crypto-internal.h"
#include "blk-mq.h"
#include "blk-mq-sched.h" struct elevator_type;
/* Max future timer expiry for timeouts */ /* Max future timer expiry for timeouts */
#define BLK_MAX_TIMEOUT (5 * HZ) #define BLK_MAX_TIMEOUT (5 * HZ)
@ -30,15 +27,10 @@ struct blk_flush_queue {
}; };
extern struct kmem_cache *blk_requestq_cachep; extern struct kmem_cache *blk_requestq_cachep;
extern struct kmem_cache *blk_requestq_srcu_cachep;
extern struct kobj_type blk_queue_ktype; extern struct kobj_type blk_queue_ktype;
extern struct ida blk_queue_ida; extern struct ida blk_queue_ida;
static inline struct blk_flush_queue *
blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx)
{
return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq;
}
static inline void __blk_get_queue(struct request_queue *q) static inline void __blk_get_queue(struct request_queue *q)
{ {
kobject_get(&q->kobj); kobject_get(&q->kobj);
@ -53,6 +45,41 @@ void blk_free_flush_queue(struct blk_flush_queue *q);
void blk_freeze_queue(struct request_queue *q); void blk_freeze_queue(struct request_queue *q);
void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic); void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic);
void blk_queue_start_drain(struct request_queue *q); void blk_queue_start_drain(struct request_queue *q);
int __bio_queue_enter(struct request_queue *q, struct bio *bio);
bool submit_bio_checks(struct bio *bio);
static inline bool blk_try_enter_queue(struct request_queue *q, bool pm)
{
rcu_read_lock();
if (!percpu_ref_tryget_live_rcu(&q->q_usage_counter))
goto fail;
/*
* The code that increments the pm_only counter must ensure that the
* counter is globally visible before the queue is unfrozen.
*/
if (blk_queue_pm_only(q) &&
(!pm || queue_rpm_status(q) == RPM_SUSPENDED))
goto fail_put;
rcu_read_unlock();
return true;
fail_put:
blk_queue_exit(q);
fail:
rcu_read_unlock();
return false;
}
static inline int bio_queue_enter(struct bio *bio)
{
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
if (blk_try_enter_queue(q, false))
return 0;
return __bio_queue_enter(q, bio);
}
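As a hedged illustration of how these helpers pair up (not taken from this patch; example_submit is a made-up name), a caller that successfully enters the queue is expected to balance it with blk_queue_exit() once it is done with the bio:

static void example_submit(struct bio *bio)
{
	struct request_queue *q = bdev_get_queue(bio->bi_bdev);

	if (bio_queue_enter(bio))
		return;	/* enter failed; __bio_queue_enter() completed the bio */

	/* ... hand the bio to the driver / blk-mq here ... */

	blk_queue_exit(q);	/* drop the q_usage_counter reference */
}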
#define BIO_INLINE_VECS 4 #define BIO_INLINE_VECS 4
struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
@ -94,6 +121,44 @@ static inline bool bvec_gap_to_prev(struct request_queue *q,
return __bvec_gap_to_prev(q, bprv, offset); return __bvec_gap_to_prev(q, bprv, offset);
} }
static inline bool rq_mergeable(struct request *rq)
{
if (blk_rq_is_passthrough(rq))
return false;
if (req_op(rq) == REQ_OP_FLUSH)
return false;
if (req_op(rq) == REQ_OP_WRITE_ZEROES)
return false;
if (req_op(rq) == REQ_OP_ZONE_APPEND)
return false;
if (rq->cmd_flags & REQ_NOMERGE_FLAGS)
return false;
if (rq->rq_flags & RQF_NOMERGE_FLAGS)
return false;
return true;
}
/*
* There are two different ways to handle DISCARD merges:
* 1) If max_discard_segments > 1, the driver treats every bio as a range and
sends the bios to the controller together. The ranges don't need to be
* contiguous.
* 2) Otherwise, the request will be a normal read/write request. The ranges
* need to be contiguous.
*/
static inline bool blk_discard_mergable(struct request *req)
{
if (req_op(req) == REQ_OP_DISCARD &&
queue_max_discard_segments(req->q) > 1)
return true;
return false;
}
#ifdef CONFIG_BLK_DEV_INTEGRITY #ifdef CONFIG_BLK_DEV_INTEGRITY
void blk_flush_integrity(void); void blk_flush_integrity(void);
bool __bio_integrity_endio(struct bio *); bool __bio_integrity_endio(struct bio *);
@ -175,15 +240,13 @@ static inline void blk_integrity_del(struct gendisk *disk)
unsigned long blk_rq_timeout(unsigned long timeout); unsigned long blk_rq_timeout(unsigned long timeout);
void blk_add_timer(struct request *req); void blk_add_timer(struct request *req);
const char *blk_status_to_str(blk_status_t status);
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs, struct request **same_queue_rq); unsigned int nr_segs);
bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
struct bio *bio, unsigned int nr_segs); struct bio *bio, unsigned int nr_segs);
void blk_account_io_start(struct request *req);
void blk_account_io_done(struct request *req, u64 now);
/* /*
* Plug flush limits * Plug flush limits
*/ */
@ -199,19 +262,10 @@ void blk_insert_flush(struct request *rq);
int elevator_switch_mq(struct request_queue *q, int elevator_switch_mq(struct request_queue *q,
struct elevator_type *new_e); struct elevator_type *new_e);
void __elevator_exit(struct request_queue *, struct elevator_queue *); void elevator_exit(struct request_queue *q);
int elv_register_queue(struct request_queue *q, bool uevent); int elv_register_queue(struct request_queue *q, bool uevent);
void elv_unregister_queue(struct request_queue *q); void elv_unregister_queue(struct request_queue *q);
static inline void elevator_exit(struct request_queue *q,
struct elevator_queue *e)
{
lockdep_assert_held(&q->sysfs_lock);
blk_mq_sched_free_requests(q);
__elevator_exit(q, e);
}
ssize_t part_size_show(struct device *dev, struct device_attribute *attr, ssize_t part_size_show(struct device *dev, struct device_attribute *attr,
char *buf); char *buf);
ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, ssize_t part_stat_show(struct device *dev, struct device_attribute *attr,
@ -226,7 +280,32 @@ ssize_t part_timeout_show(struct device *, struct device_attribute *, char *);
ssize_t part_timeout_store(struct device *, struct device_attribute *, ssize_t part_timeout_store(struct device *, struct device_attribute *,
const char *, size_t); const char *, size_t);
void __blk_queue_split(struct bio **bio, unsigned int *nr_segs); static inline bool blk_may_split(struct request_queue *q, struct bio *bio)
{
switch (bio_op(bio)) {
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
case REQ_OP_WRITE_ZEROES:
case REQ_OP_WRITE_SAME:
return true; /* non-trivial splitting decisions */
default:
break;
}
/*
* All drivers must accept single-segment bios that are <= PAGE_SIZE.
* This is a quick and dirty check that relies on the fact that
* bi_io_vec[0] is always valid if a bio has data. The check might
* lead to occasional false negatives when bios are cloned, but compared
* to the performance impact of cloned bios themselves the loop below
* doesn't matter anyway.
*/
return q->limits.chunk_sectors || bio->bi_vcnt != 1 ||
bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE;
}
void __blk_queue_split(struct request_queue *q, struct bio **bio,
unsigned int *nr_segs);
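A minimal sketch of how a submission path is expected to combine the two (assuming nothing beyond the declarations above; the wrapper name is illustrative): only take the slow path when blk_may_split() says the bio might actually need splitting.

static inline void example_queue_split(struct request_queue *q, struct bio **bio)
{
	unsigned int nr_segs;

	if (blk_may_split(q, *bio))
		__blk_queue_split(q, bio, &nr_segs);
}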
int ll_back_merge_fn(struct request *req, struct bio *bio, int ll_back_merge_fn(struct request *req, struct bio *bio,
unsigned int nr_segs); unsigned int nr_segs);
bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
@ -246,9 +325,11 @@ int blk_dev_init(void);
*/ */
static inline bool blk_do_io_stat(struct request *rq) static inline bool blk_do_io_stat(struct request *rq)
{ {
return rq->rq_disk && (rq->rq_flags & RQF_IO_STAT); return (rq->rq_flags & RQF_IO_STAT) && rq->q->disk;
} }
void update_io_ticks(struct block_device *part, unsigned long now, bool end);
static inline void req_set_nomerge(struct request_queue *q, struct request *req) static inline void req_set_nomerge(struct request_queue *q, struct request *req)
{ {
req->cmd_flags |= REQ_NOMERGE; req->cmd_flags |= REQ_NOMERGE;
@ -283,30 +364,16 @@ static inline unsigned int bio_aligned_discard_max_sectors(
/* /*
* Internal io_context interface * Internal io_context interface
*/ */
void get_io_context(struct io_context *ioc); struct io_cq *ioc_find_get_icq(struct request_queue *q);
struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q); struct io_cq *ioc_lookup_icq(struct request_queue *q);
struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, #ifdef CONFIG_BLK_ICQ
gfp_t gfp_mask);
void ioc_clear_queue(struct request_queue *q); void ioc_clear_queue(struct request_queue *q);
#else
static inline void ioc_clear_queue(struct request_queue *q)
{
}
#endif /* CONFIG_BLK_ICQ */
int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node);
/*
* Internal throttling interface
*/
#ifdef CONFIG_BLK_DEV_THROTTLING
extern int blk_throtl_init(struct request_queue *q);
extern void blk_throtl_exit(struct request_queue *q);
extern void blk_throtl_register_queue(struct request_queue *q);
extern void blk_throtl_charge_bio_split(struct bio *bio);
bool blk_throtl_bio(struct bio *bio);
#else /* CONFIG_BLK_DEV_THROTTLING */
static inline int blk_throtl_init(struct request_queue *q) { return 0; }
static inline void blk_throtl_exit(struct request_queue *q) { }
static inline void blk_throtl_register_queue(struct request_queue *q) { }
static inline void blk_throtl_charge_bio_split(struct bio *bio) { }
static inline bool blk_throtl_bio(struct bio *bio) { return false; }
#endif /* CONFIG_BLK_DEV_THROTTLING */
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page); extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page);
extern ssize_t blk_throtl_sample_time_store(struct request_queue *q, extern ssize_t blk_throtl_sample_time_store(struct request_queue *q,
@ -364,7 +431,15 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
struct page *page, unsigned int len, unsigned int offset, struct page *page, unsigned int len, unsigned int offset,
unsigned int max_sectors, bool *same_page); unsigned int max_sectors, bool *same_page);
struct request_queue *blk_alloc_queue(int node_id); static inline struct kmem_cache *blk_get_queue_kmem_cache(bool srcu)
{
if (srcu)
return blk_requestq_srcu_cachep;
return blk_requestq_cachep;
}
struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu);
int disk_scan_partitions(struct gendisk *disk, fmode_t mode);
int disk_alloc_events(struct gendisk *disk); int disk_alloc_events(struct gendisk *disk);
void disk_add_events(struct gendisk *disk); void disk_add_events(struct gendisk *disk);
@ -374,13 +449,61 @@ extern struct device_attribute dev_attr_events;
extern struct device_attribute dev_attr_events_async; extern struct device_attribute dev_attr_events_async;
extern struct device_attribute dev_attr_events_poll_msecs; extern struct device_attribute dev_attr_events_poll_msecs;
static inline void bio_clear_hipri(struct bio *bio) static inline void bio_clear_polled(struct bio *bio)
{ {
/* can't support alloc cache if we turn off polling */ /* can't support alloc cache if we turn off polling */
bio_clear_flag(bio, BIO_PERCPU_CACHE); bio_clear_flag(bio, BIO_PERCPU_CACHE);
bio->bi_opf &= ~REQ_HIPRI; bio->bi_opf &= ~REQ_POLLED;
} }
long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
extern const struct address_space_operations def_blk_aops; extern const struct address_space_operations def_blk_aops;
int disk_register_independent_access_ranges(struct gendisk *disk,
struct blk_independent_access_ranges *new_iars);
void disk_unregister_independent_access_ranges(struct gendisk *disk);
#ifdef CONFIG_FAIL_MAKE_REQUEST
bool should_fail_request(struct block_device *part, unsigned int bytes);
#else /* CONFIG_FAIL_MAKE_REQUEST */
static inline bool should_fail_request(struct block_device *part,
unsigned int bytes)
{
return false;
}
#endif /* CONFIG_FAIL_MAKE_REQUEST */
/*
* Optimized request reference counting. Ideally we'd make timeouts be more
* clever, as that's the only reason we need references at all... But until
* this happens, this is faster than using refcount_t. Also see:
*
* abc54d634334 ("io_uring: switch to atomic_t for io_kiocb reference count")
*/
#define req_ref_zero_or_close_to_overflow(req) \
((unsigned int) atomic_read(&(req->ref)) + 127u <= 127u)
static inline bool req_ref_inc_not_zero(struct request *req)
{
return atomic_inc_not_zero(&req->ref);
}
static inline bool req_ref_put_and_test(struct request *req)
{
WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
return atomic_dec_and_test(&req->ref);
}
static inline void req_ref_set(struct request *req, int value)
{
atomic_set(&req->ref, value);
}
static inline int req_ref_read(struct request *req)
{
return atomic_read(&req->ref);
}
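A sketch of the intended usage pattern, loosely modelled on the timeout path (illustrative only; example_free_request() is a hypothetical final-free hook, not a real API): only touch a request whose reference count could be raised from non-zero, and free it when the last reference is dropped.

static void example_free_request(struct request *req);	/* hypothetical */

static void example_inspect_request(struct request *req)
{
	if (!req_ref_inc_not_zero(req))
		return;	/* request already completed; do not touch it */

	/* ... safely inspect or time out @req here ... */

	if (req_ref_put_and_test(req))
		example_free_request(req);	/* we dropped the last reference */
}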
#endif /* BLK_INTERNAL_H */ #endif /* BLK_INTERNAL_H */


@ -14,6 +14,7 @@
#include <linux/pagemap.h> #include <linux/pagemap.h>
#include <linux/mempool.h> #include <linux/mempool.h>
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/blk-cgroup.h>
#include <linux/backing-dev.h> #include <linux/backing-dev.h>
#include <linux/init.h> #include <linux/init.h>
#include <linux/hash.h> #include <linux/hash.h>


@ -31,6 +31,7 @@ static int bsg_transport_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr,
struct bsg_job *job; struct bsg_job *job;
struct request *rq; struct request *rq;
struct bio *bio; struct bio *bio;
void *reply;
int ret; int ret;
if (hdr->protocol != BSG_PROTOCOL_SCSI || if (hdr->protocol != BSG_PROTOCOL_SCSI ||
@ -39,22 +40,28 @@ static int bsg_transport_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr,
if (!capable(CAP_SYS_RAWIO)) if (!capable(CAP_SYS_RAWIO))
return -EPERM; return -EPERM;
rq = blk_get_request(q, hdr->dout_xfer_len ? rq = blk_mq_alloc_request(q, hdr->dout_xfer_len ?
REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0); REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
if (IS_ERR(rq)) if (IS_ERR(rq))
return PTR_ERR(rq); return PTR_ERR(rq);
rq->timeout = timeout; rq->timeout = timeout;
job = blk_mq_rq_to_pdu(rq); job = blk_mq_rq_to_pdu(rq);
reply = job->reply;
memset(job, 0, sizeof(*job));
job->reply = reply;
job->reply_len = SCSI_SENSE_BUFFERSIZE;
job->dd_data = job + 1;
job->request_len = hdr->request_len; job->request_len = hdr->request_len;
job->request = memdup_user(uptr64(hdr->request), hdr->request_len); job->request = memdup_user(uptr64(hdr->request), hdr->request_len);
if (IS_ERR(job->request)) { if (IS_ERR(job->request)) {
ret = PTR_ERR(job->request); ret = PTR_ERR(job->request);
goto out_put_request; goto out_free_rq;
} }
if (hdr->dout_xfer_len && hdr->din_xfer_len) { if (hdr->dout_xfer_len && hdr->din_xfer_len) {
job->bidi_rq = blk_get_request(rq->q, REQ_OP_DRV_IN, 0); job->bidi_rq = blk_mq_alloc_request(rq->q, REQ_OP_DRV_IN, 0);
if (IS_ERR(job->bidi_rq)) { if (IS_ERR(job->bidi_rq)) {
ret = PTR_ERR(job->bidi_rq); ret = PTR_ERR(job->bidi_rq);
goto out_free_job_request; goto out_free_job_request;
@ -85,7 +92,7 @@ static int bsg_transport_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr,
goto out_unmap_bidi_rq; goto out_unmap_bidi_rq;
bio = rq->bio; bio = rq->bio;
blk_execute_rq(NULL, rq, !(hdr->flags & BSG_FLAG_Q_AT_TAIL)); blk_execute_rq(rq, !(hdr->flags & BSG_FLAG_Q_AT_TAIL));
/* /*
* The assignments below don't make much sense, but are kept for * The assignments below don't make much sense, but are kept for
@ -134,11 +141,11 @@ static int bsg_transport_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr,
blk_rq_unmap_user(job->bidi_bio); blk_rq_unmap_user(job->bidi_bio);
out_free_bidi_rq: out_free_bidi_rq:
if (job->bidi_rq) if (job->bidi_rq)
blk_put_request(job->bidi_rq); blk_mq_free_request(job->bidi_rq);
out_free_job_request: out_free_job_request:
kfree(job->request); kfree(job->request);
out_put_request: out_free_rq:
blk_put_request(rq); blk_mq_free_request(rq);
return ret; return ret;
} }
@ -302,18 +309,6 @@ static int bsg_init_rq(struct blk_mq_tag_set *set, struct request *req,
return 0; return 0;
} }
/* called right before the request is given to the request_queue user */
static void bsg_initialize_rq(struct request *req)
{
struct bsg_job *job = blk_mq_rq_to_pdu(req);
void *reply = job->reply;
memset(job, 0, sizeof(*job));
job->reply = reply;
job->reply_len = SCSI_SENSE_BUFFERSIZE;
job->dd_data = job + 1;
}
static void bsg_exit_rq(struct blk_mq_tag_set *set, struct request *req, static void bsg_exit_rq(struct blk_mq_tag_set *set, struct request *req,
unsigned int hctx_idx) unsigned int hctx_idx)
{ {
@ -350,7 +345,6 @@ static const struct blk_mq_ops bsg_mq_ops = {
.queue_rq = bsg_queue_rq, .queue_rq = bsg_queue_rq,
.init_request = bsg_init_rq, .init_request = bsg_init_rq,
.exit_request = bsg_exit_rq, .exit_request = bsg_exit_rq,
.initialize_rq_fn = bsg_initialize_rq,
.complete = bsg_complete, .complete = bsg_complete,
.timeout = bsg_timeout, .timeout = bsg_timeout,
}; };


@ -26,7 +26,6 @@
#include <linux/kernel.h> #include <linux/kernel.h>
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/bio.h> #include <linux/bio.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/slab.h> #include <linux/slab.h>
@ -40,6 +39,7 @@
#include <trace/events/block.h> #include <trace/events/block.h>
#include "elevator.h"
#include "blk.h" #include "blk.h"
#include "blk-mq-sched.h" #include "blk-mq-sched.h"
#include "blk-pm.h" #include "blk-pm.h"
@ -188,8 +188,10 @@ static void elevator_release(struct kobject *kobj)
kfree(e); kfree(e);
} }
void __elevator_exit(struct request_queue *q, struct elevator_queue *e) void elevator_exit(struct request_queue *q)
{ {
struct elevator_queue *e = q->elevator;
mutex_lock(&e->sysfs_lock); mutex_lock(&e->sysfs_lock);
blk_mq_exit_sched(q, e); blk_mq_exit_sched(q, e);
mutex_unlock(&e->sysfs_lock); mutex_unlock(&e->sysfs_lock);
@ -593,7 +595,8 @@ int elevator_switch_mq(struct request_queue *q,
elv_unregister_queue(q); elv_unregister_queue(q);
ioc_clear_queue(q); ioc_clear_queue(q);
elevator_exit(q, q->elevator); blk_mq_sched_free_rqs(q);
elevator_exit(q);
} }
ret = blk_mq_init_sched(q, new_e); ret = blk_mq_init_sched(q, new_e);
@ -603,7 +606,8 @@ int elevator_switch_mq(struct request_queue *q,
if (new_e) { if (new_e) {
ret = elv_register_queue(q, true); ret = elv_register_queue(q, true);
if (ret) { if (ret) {
elevator_exit(q, q->elevator); blk_mq_sched_free_rqs(q);
elevator_exit(q);
goto out; goto out;
} }
} }
@ -635,7 +639,7 @@ static struct elevator_type *elevator_get_default(struct request_queue *q)
return NULL; return NULL;
if (q->nr_hw_queues != 1 && if (q->nr_hw_queues != 1 &&
!blk_mq_is_sbitmap_shared(q->tag_set->flags)) !blk_mq_is_shared_tags(q->tag_set->flags))
return NULL; return NULL;
return elevator_get(q, "mq-deadline", false); return elevator_get(q, "mq-deadline", false);


@ -15,9 +15,10 @@
#include <linux/falloc.h> #include <linux/falloc.h>
#include <linux/suspend.h> #include <linux/suspend.h>
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/module.h>
#include "blk.h" #include "blk.h"
static struct inode *bdev_file_inode(struct file *file) static inline struct inode *bdev_file_inode(struct file *file)
{ {
return file->f_mapping->host; return file->f_mapping->host;
} }
@ -54,14 +55,12 @@ static void blkdev_bio_end_io_simple(struct bio *bio)
static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
struct iov_iter *iter, unsigned int nr_pages) struct iov_iter *iter, unsigned int nr_pages)
{ {
struct file *file = iocb->ki_filp; struct block_device *bdev = iocb->ki_filp->private_data;
struct block_device *bdev = I_BDEV(bdev_file_inode(file));
struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs; struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
loff_t pos = iocb->ki_pos; loff_t pos = iocb->ki_pos;
bool should_dirty = false; bool should_dirty = false;
struct bio bio; struct bio bio;
ssize_t ret; ssize_t ret;
blk_qc_t qc;
if ((pos | iov_iter_alignment(iter)) & if ((pos | iov_iter_alignment(iter)) &
(bdev_logical_block_size(bdev) - 1)) (bdev_logical_block_size(bdev) - 1))
@ -78,7 +77,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
bio_init(&bio, vecs, nr_pages); bio_init(&bio, vecs, nr_pages);
bio_set_dev(&bio, bdev); bio_set_dev(&bio, bdev);
bio.bi_iter.bi_sector = pos >> 9; bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio.bi_write_hint = iocb->ki_hint; bio.bi_write_hint = iocb->ki_hint;
bio.bi_private = current; bio.bi_private = current;
bio.bi_end_io = blkdev_bio_end_io_simple; bio.bi_end_io = blkdev_bio_end_io_simple;
@ -102,13 +101,12 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
if (iocb->ki_flags & IOCB_HIPRI) if (iocb->ki_flags & IOCB_HIPRI)
bio_set_polled(&bio, iocb); bio_set_polled(&bio, iocb);
qc = submit_bio(&bio); submit_bio(&bio);
for (;;) { for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE); set_current_state(TASK_UNINTERRUPTIBLE);
if (!READ_ONCE(bio.bi_private)) if (!READ_ONCE(bio.bi_private))
break; break;
if (!(iocb->ki_flags & IOCB_HIPRI) || if (!(iocb->ki_flags & IOCB_HIPRI) || !bio_poll(&bio, NULL, 0))
!blk_poll(bdev_get_queue(bdev), qc, true))
blk_io_schedule(); blk_io_schedule();
} }
__set_current_state(TASK_RUNNING); __set_current_state(TASK_RUNNING);
@ -126,6 +124,11 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
return ret; return ret;
} }
enum {
DIO_SHOULD_DIRTY = 1,
DIO_IS_SYNC = 2,
};
struct blkdev_dio { struct blkdev_dio {
union { union {
struct kiocb *iocb; struct kiocb *iocb;
@ -133,35 +136,27 @@ struct blkdev_dio {
}; };
size_t size; size_t size;
atomic_t ref; atomic_t ref;
bool multi_bio : 1; unsigned int flags;
bool should_dirty : 1; struct bio bio ____cacheline_aligned_in_smp;
bool is_sync : 1;
struct bio bio;
}; };
static struct bio_set blkdev_dio_pool; static struct bio_set blkdev_dio_pool;
static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
{
struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
struct request_queue *q = bdev_get_queue(bdev);
return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
}
static void blkdev_bio_end_io(struct bio *bio) static void blkdev_bio_end_io(struct bio *bio)
{ {
struct blkdev_dio *dio = bio->bi_private; struct blkdev_dio *dio = bio->bi_private;
bool should_dirty = dio->should_dirty; bool should_dirty = dio->flags & DIO_SHOULD_DIRTY;
if (bio->bi_status && !dio->bio.bi_status) if (bio->bi_status && !dio->bio.bi_status)
dio->bio.bi_status = bio->bi_status; dio->bio.bi_status = bio->bi_status;
if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) { if (atomic_dec_and_test(&dio->ref)) {
if (!dio->is_sync) { if (!(dio->flags & DIO_IS_SYNC)) {
struct kiocb *iocb = dio->iocb; struct kiocb *iocb = dio->iocb;
ssize_t ret; ssize_t ret;
WRITE_ONCE(iocb->private, NULL);
if (likely(!dio->bio.bi_status)) { if (likely(!dio->bio.bi_status)) {
ret = dio->size; ret = dio->size;
iocb->ki_pos += ret; iocb->ki_pos += ret;
@ -169,9 +164,8 @@ static void blkdev_bio_end_io(struct bio *bio)
ret = blk_status_to_errno(dio->bio.bi_status); ret = blk_status_to_errno(dio->bio.bi_status);
} }
dio->iocb->ki_complete(iocb, ret, 0); dio->iocb->ki_complete(iocb, ret);
if (dio->multi_bio) bio_put(&dio->bio);
bio_put(&dio->bio);
} else { } else {
struct task_struct *waiter = dio->waiter; struct task_struct *waiter = dio->waiter;
@ -191,16 +185,12 @@ static void blkdev_bio_end_io(struct bio *bio)
static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
unsigned int nr_pages) unsigned int nr_pages)
{ {
struct file *file = iocb->ki_filp; struct block_device *bdev = iocb->ki_filp->private_data;
struct inode *inode = bdev_file_inode(file);
struct block_device *bdev = I_BDEV(inode);
struct blk_plug plug; struct blk_plug plug;
struct blkdev_dio *dio; struct blkdev_dio *dio;
struct bio *bio; struct bio *bio;
bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0;
bool is_read = (iov_iter_rw(iter) == READ), is_sync; bool is_read = (iov_iter_rw(iter) == READ), is_sync;
loff_t pos = iocb->ki_pos; loff_t pos = iocb->ki_pos;
blk_qc_t qc = BLK_QC_T_NONE;
int ret = 0; int ret = 0;
if ((pos | iov_iter_alignment(iter)) & if ((pos | iov_iter_alignment(iter)) &
@ -210,28 +200,31 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool); bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool);
dio = container_of(bio, struct blkdev_dio, bio); dio = container_of(bio, struct blkdev_dio, bio);
dio->is_sync = is_sync = is_sync_kiocb(iocb); atomic_set(&dio->ref, 1);
if (dio->is_sync) { /*
* Grab an extra reference to ensure the dio structure which is embedded
* into the first bio stays around.
*/
bio_get(bio);
is_sync = is_sync_kiocb(iocb);
if (is_sync) {
dio->flags = DIO_IS_SYNC;
dio->waiter = current; dio->waiter = current;
bio_get(bio);
} else { } else {
dio->flags = 0;
dio->iocb = iocb; dio->iocb = iocb;
} }
dio->size = 0; dio->size = 0;
dio->multi_bio = false; if (is_read && iter_is_iovec(iter))
dio->should_dirty = is_read && iter_is_iovec(iter); dio->flags |= DIO_SHOULD_DIRTY;
/* blk_start_plug(&plug);
* Don't plug for HIPRI/polled IO, as those should go straight
* to issue
*/
if (!is_poll)
blk_start_plug(&plug);
for (;;) { for (;;) {
bio_set_dev(bio, bdev); bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = pos >> 9; bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio->bi_write_hint = iocb->ki_hint; bio->bi_write_hint = iocb->ki_hint;
bio->bi_private = dio; bio->bi_private = dio;
bio->bi_end_io = blkdev_bio_end_io; bio->bi_end_io = blkdev_bio_end_io;
@ -246,7 +239,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
if (is_read) { if (is_read) {
bio->bi_opf = REQ_OP_READ; bio->bi_opf = REQ_OP_READ;
if (dio->should_dirty) if (dio->flags & DIO_SHOULD_DIRTY)
bio_set_pages_dirty(bio); bio_set_pages_dirty(bio);
} else { } else {
bio->bi_opf = dio_bio_write_op(iocb); bio->bi_opf = dio_bio_write_op(iocb);
@ -260,40 +253,15 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS); nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS);
if (!nr_pages) { if (!nr_pages) {
bool polled = false; submit_bio(bio);
if (iocb->ki_flags & IOCB_HIPRI) {
bio_set_polled(bio, iocb);
polled = true;
}
qc = submit_bio(bio);
if (polled)
WRITE_ONCE(iocb->ki_cookie, qc);
break; break;
} }
atomic_inc(&dio->ref);
if (!dio->multi_bio) {
/*
* AIO needs an extra reference to ensure the dio
* structure which is embedded into the first bio
* stays around.
*/
if (!is_sync)
bio_get(bio);
dio->multi_bio = true;
atomic_set(&dio->ref, 2);
} else {
atomic_inc(&dio->ref);
}
submit_bio(bio); submit_bio(bio);
bio = bio_alloc(GFP_KERNEL, nr_pages); bio = bio_alloc(GFP_KERNEL, nr_pages);
} }
if (!is_poll) blk_finish_plug(&plug);
blk_finish_plug(&plug);
if (!is_sync) if (!is_sync)
return -EIOCBQUEUED; return -EIOCBQUEUED;
@ -302,10 +270,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
set_current_state(TASK_UNINTERRUPTIBLE); set_current_state(TASK_UNINTERRUPTIBLE);
if (!READ_ONCE(dio->waiter)) if (!READ_ONCE(dio->waiter))
break; break;
blk_io_schedule();
if (!(iocb->ki_flags & IOCB_HIPRI) ||
!blk_poll(bdev_get_queue(bdev), qc, true))
blk_io_schedule();
} }
__set_current_state(TASK_RUNNING); __set_current_state(TASK_RUNNING);
@ -318,6 +283,95 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
return ret; return ret;
} }
static void blkdev_bio_end_io_async(struct bio *bio)
{
struct blkdev_dio *dio = container_of(bio, struct blkdev_dio, bio);
struct kiocb *iocb = dio->iocb;
ssize_t ret;
WRITE_ONCE(iocb->private, NULL);
if (likely(!bio->bi_status)) {
ret = dio->size;
iocb->ki_pos += ret;
} else {
ret = blk_status_to_errno(bio->bi_status);
}
iocb->ki_complete(iocb, ret);
if (dio->flags & DIO_SHOULD_DIRTY) {
bio_check_pages_dirty(bio);
} else {
bio_release_pages(bio, false);
bio_put(bio);
}
}
static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
struct iov_iter *iter,
unsigned int nr_pages)
{
struct block_device *bdev = iocb->ki_filp->private_data;
struct blkdev_dio *dio;
struct bio *bio;
loff_t pos = iocb->ki_pos;
int ret = 0;
if ((pos | iov_iter_alignment(iter)) &
(bdev_logical_block_size(bdev) - 1))
return -EINVAL;
bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool);
dio = container_of(bio, struct blkdev_dio, bio);
dio->flags = 0;
dio->iocb = iocb;
bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio->bi_write_hint = iocb->ki_hint;
bio->bi_end_io = blkdev_bio_end_io_async;
bio->bi_ioprio = iocb->ki_ioprio;
if (iov_iter_is_bvec(iter)) {
/*
* Users don't rely on the iterator being in any particular
* state for async I/O returning -EIOCBQUEUED, hence we can
* avoid expensive iov_iter_advance(). Bypass
* bio_iov_iter_get_pages() and set the bvec directly.
*/
bio_iov_bvec_set(bio, iter);
} else {
ret = bio_iov_iter_get_pages(bio, iter);
if (unlikely(ret)) {
bio_put(bio);
return ret;
}
}
dio->size = bio->bi_iter.bi_size;
if (iov_iter_rw(iter) == READ) {
bio->bi_opf = REQ_OP_READ;
if (iter_is_iovec(iter)) {
dio->flags |= DIO_SHOULD_DIRTY;
bio_set_pages_dirty(bio);
}
} else {
bio->bi_opf = dio_bio_write_op(iocb);
task_io_account_write(bio->bi_iter.bi_size);
}
if (iocb->ki_flags & IOCB_HIPRI) {
bio->bi_opf |= REQ_POLLED | REQ_NOWAIT;
submit_bio(bio);
WRITE_ONCE(iocb->private, bio);
} else {
if (iocb->ki_flags & IOCB_NOWAIT)
bio->bi_opf |= REQ_NOWAIT;
submit_bio(bio);
}
return -EIOCBQUEUED;
}
static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{ {
unsigned int nr_pages; unsigned int nr_pages;
@ -326,9 +380,11 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
return 0; return 0;
nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_VECS) if (likely(nr_pages <= BIO_MAX_VECS)) {
return __blkdev_direct_IO_simple(iocb, iter, nr_pages); if (is_sync_kiocb(iocb))
return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
return __blkdev_direct_IO_async(iocb, iter, nr_pages);
}
return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages)); return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
} }
@ -405,8 +461,7 @@ static loff_t blkdev_llseek(struct file *file, loff_t offset, int whence)
static int blkdev_fsync(struct file *filp, loff_t start, loff_t end, static int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
int datasync) int datasync)
{ {
struct inode *bd_inode = bdev_file_inode(filp); struct block_device *bdev = filp->private_data;
struct block_device *bdev = I_BDEV(bd_inode);
int error; int error;
error = file_write_and_wait_range(filp, start, end); error = file_write_and_wait_range(filp, start, end);
@ -448,6 +503,8 @@ static int blkdev_open(struct inode *inode, struct file *filp)
bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, filp); bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, filp);
if (IS_ERR(bdev)) if (IS_ERR(bdev))
return PTR_ERR(bdev); return PTR_ERR(bdev);
filp->private_data = bdev;
filp->f_mapping = bdev->bd_inode->i_mapping; filp->f_mapping = bdev->bd_inode->i_mapping;
filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
return 0; return 0;
@ -455,29 +512,12 @@ static int blkdev_open(struct inode *inode, struct file *filp)
static int blkdev_close(struct inode *inode, struct file *filp) static int blkdev_close(struct inode *inode, struct file *filp)
{ {
struct block_device *bdev = I_BDEV(bdev_file_inode(filp)); struct block_device *bdev = filp->private_data;
blkdev_put(bdev, filp->f_mode); blkdev_put(bdev, filp->f_mode);
return 0; return 0;
} }
static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
struct block_device *bdev = I_BDEV(bdev_file_inode(file));
fmode_t mode = file->f_mode;
/*
* O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
* to update it before every ioctl.
*/
if (file->f_flags & O_NDELAY)
mode |= FMODE_NDELAY;
else
mode &= ~FMODE_NDELAY;
return blkdev_ioctl(bdev, mode, cmd, arg);
}
/* /*
* Write data to the block device. Only intended for the block device itself * Write data to the block device. Only intended for the block device itself
* and the raw driver which basically is a fake block device. * and the raw driver which basically is a fake block device.
@ -487,14 +527,14 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
*/ */
static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
{ {
struct file *file = iocb->ki_filp; struct block_device *bdev = iocb->ki_filp->private_data;
struct inode *bd_inode = bdev_file_inode(file); struct inode *bd_inode = bdev->bd_inode;
loff_t size = i_size_read(bd_inode); loff_t size = bdev_nr_bytes(bdev);
struct blk_plug plug; struct blk_plug plug;
size_t shorted = 0; size_t shorted = 0;
ssize_t ret; ssize_t ret;
if (bdev_read_only(I_BDEV(bd_inode))) if (bdev_read_only(bdev))
return -EPERM; return -EPERM;
if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev)) if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev))
@ -526,24 +566,58 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{ {
struct file *file = iocb->ki_filp; struct block_device *bdev = iocb->ki_filp->private_data;
struct inode *bd_inode = bdev_file_inode(file); loff_t size = bdev_nr_bytes(bdev);
loff_t size = i_size_read(bd_inode);
loff_t pos = iocb->ki_pos; loff_t pos = iocb->ki_pos;
size_t shorted = 0; size_t shorted = 0;
ssize_t ret; ssize_t ret = 0;
size_t count;
if (pos >= size) if (unlikely(pos + iov_iter_count(to) > size)) {
return 0; if (pos >= size)
return 0;
size -= pos; size -= pos;
if (iov_iter_count(to) > size) {
shorted = iov_iter_count(to) - size; shorted = iov_iter_count(to) - size;
iov_iter_truncate(to, size); iov_iter_truncate(to, size);
} }
ret = generic_file_read_iter(iocb, to); count = iov_iter_count(to);
iov_iter_reexpand(to, iov_iter_count(to) + shorted); if (!count)
goto reexpand; /* skip atime */
if (iocb->ki_flags & IOCB_DIRECT) {
struct address_space *mapping = iocb->ki_filp->f_mapping;
if (iocb->ki_flags & IOCB_NOWAIT) {
if (filemap_range_needs_writeback(mapping, pos,
pos + count - 1)) {
ret = -EAGAIN;
goto reexpand;
}
} else {
ret = filemap_write_and_wait_range(mapping, pos,
pos + count - 1);
if (ret < 0)
goto reexpand;
}
file_accessed(iocb->ki_filp);
ret = blkdev_direct_IO(iocb, to);
if (ret >= 0) {
iocb->ki_pos += ret;
count -= ret;
}
iov_iter_revert(to, count - iov_iter_count(to));
if (ret < 0 || !count)
goto reexpand;
}
ret = filemap_read(iocb, to, ret);
reexpand:
if (unlikely(shorted))
iov_iter_reexpand(to, iov_iter_count(to) + shorted);
return ret; return ret;
} }
@ -565,7 +639,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
return -EOPNOTSUPP; return -EOPNOTSUPP;
/* Don't go off the end of the device. */ /* Don't go off the end of the device. */
isize = i_size_read(bdev->bd_inode); isize = bdev_nr_bytes(bdev);
if (start >= isize) if (start >= isize)
return -EINVAL; return -EINVAL;
if (end >= isize) { if (end >= isize) {
@ -592,16 +666,18 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
switch (mode) { switch (mode) {
case FALLOC_FL_ZERO_RANGE: case FALLOC_FL_ZERO_RANGE:
case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE: case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
GFP_KERNEL, BLKDEV_ZERO_NOUNMAP); len >> SECTOR_SHIFT, GFP_KERNEL,
BLKDEV_ZERO_NOUNMAP);
break; break;
case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK); len >> SECTOR_SHIFT, GFP_KERNEL,
BLKDEV_ZERO_NOFALLBACK);
break; break;
case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE: case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE:
error = blkdev_issue_discard(bdev, start >> 9, len >> 9, error = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
GFP_KERNEL, 0); len >> SECTOR_SHIFT, GFP_KERNEL, 0);
break; break;
default: default:
error = -EOPNOTSUPP; error = -EOPNOTSUPP;
@ -618,10 +694,10 @@ const struct file_operations def_blk_fops = {
.llseek = blkdev_llseek, .llseek = blkdev_llseek,
.read_iter = blkdev_read_iter, .read_iter = blkdev_read_iter,
.write_iter = blkdev_write_iter, .write_iter = blkdev_write_iter,
.iopoll = blkdev_iopoll, .iopoll = iocb_bio_iopoll,
.mmap = generic_file_mmap, .mmap = generic_file_mmap,
.fsync = blkdev_fsync, .fsync = blkdev_fsync,
.unlocked_ioctl = block_ioctl, .unlocked_ioctl = blkdev_ioctl,
#ifdef CONFIG_COMPAT #ifdef CONFIG_COMPAT
.compat_ioctl = compat_blkdev_ioctl, .compat_ioctl = compat_blkdev_ioctl,
#endif #endif


@ -25,8 +25,10 @@
#include <linux/log2.h> #include <linux/log2.h>
#include <linux/pm_runtime.h> #include <linux/pm_runtime.h>
#include <linux/badblocks.h> #include <linux/badblocks.h>
#include <linux/part_stat.h>
#include "blk.h" #include "blk.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h" #include "blk-rq-qos.h"
static struct kobject *block_depr; static struct kobject *block_depr;
@ -58,6 +60,7 @@ void set_capacity(struct gendisk *disk, sector_t sectors)
spin_lock(&bdev->bd_size_lock); spin_lock(&bdev->bd_size_lock);
i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT);
bdev->bd_nr_sectors = sectors;
spin_unlock(&bdev->bd_size_lock); spin_unlock(&bdev->bd_size_lock);
} }
EXPORT_SYMBOL(set_capacity); EXPORT_SYMBOL(set_capacity);
@ -212,7 +215,10 @@ void blkdev_show(struct seq_file *seqf, off_t offset)
* @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If * @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If
* @major = 0, try to allocate any unused major number. * @major = 0, try to allocate any unused major number.
* @name: the name of the new block device as a zero terminated string * @name: the name of the new block device as a zero terminated string
* @probe: callback that is called on access to any minor number of @major * @probe: pre-devtmpfs / pre-udev callback used to create disks when their
* pre-created device node is accessed. When a probe call uses
* add_disk() and it fails, the driver must clean up resources. This
* interface may soon be removed.
* *
* The @name must be unique within the system. * The @name must be unique within the system.
* *
@ -368,17 +374,21 @@ void disk_uevent(struct gendisk *disk, enum kobject_action action)
} }
EXPORT_SYMBOL_GPL(disk_uevent); EXPORT_SYMBOL_GPL(disk_uevent);
static void disk_scan_partitions(struct gendisk *disk) int disk_scan_partitions(struct gendisk *disk, fmode_t mode)
{ {
struct block_device *bdev; struct block_device *bdev;
if (!get_capacity(disk) || !disk_part_scan_enabled(disk)) if (disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN))
return; return -EINVAL;
if (disk->open_partitions)
return -EBUSY;
set_bit(GD_NEED_PART_SCAN, &disk->state); set_bit(GD_NEED_PART_SCAN, &disk->state);
bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL); bdev = blkdev_get_by_dev(disk_devt(disk), mode, NULL);
if (!IS_ERR(bdev)) if (IS_ERR(bdev))
blkdev_put(bdev, FMODE_READ); return PTR_ERR(bdev);
blkdev_put(bdev, mode);
return 0;
} }
/** /**
@ -390,8 +400,8 @@ static void disk_scan_partitions(struct gendisk *disk)
* This function registers the partitioning information in @disk * This function registers the partitioning information in @disk
* with the kernel. * with the kernel.
*/ */
int device_add_disk(struct device *parent, struct gendisk *disk, int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
const struct attribute_group **groups) const struct attribute_group **groups)
{ {
struct device *ddev = disk_to_dev(disk); struct device *ddev = disk_to_dev(disk);
@ -432,7 +442,6 @@ int device_add_disk(struct device *parent, struct gendisk *disk,
return ret; return ret;
disk->major = BLOCK_EXT_MAJOR; disk->major = BLOCK_EXT_MAJOR;
disk->first_minor = ret; disk->first_minor = ret;
disk->flags |= GENHD_FL_EXT_DEVT;
} }
/* delay uevents, until we scanned partition table */ /* delay uevents, until we scanned partition table */
@ -489,14 +498,7 @@ int device_add_disk(struct device *parent, struct gendisk *disk,
if (ret) if (ret)
goto out_put_slave_dir; goto out_put_slave_dir;
if (disk->flags & GENHD_FL_HIDDEN) { if (!(disk->flags & GENHD_FL_HIDDEN)) {
/*
* Don't let hidden disks show up in /proc/partitions,
* and don't bother scanning for partitions either.
*/
disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
disk->flags |= GENHD_FL_NO_PART_SCAN;
} else {
ret = bdi_register(disk->bdi, "%u:%u", ret = bdi_register(disk->bdi, "%u:%u",
disk->major, disk->first_minor); disk->major, disk->first_minor);
if (ret) if (ret)
@ -508,7 +510,8 @@ int device_add_disk(struct device *parent, struct gendisk *disk,
goto out_unregister_bdi; goto out_unregister_bdi;
bdev_add(disk->part0, ddev->devt); bdev_add(disk->part0, ddev->devt);
disk_scan_partitions(disk); if (get_capacity(disk))
disk_scan_partitions(disk, FMODE_READ);
/* /*
* Announce the disk and partitions after all partitions are * Announce the disk and partitions after all partitions are
@ -541,7 +544,7 @@ int device_add_disk(struct device *parent, struct gendisk *disk,
out_free_ext_minor: out_free_ext_minor:
if (disk->major == BLOCK_EXT_MAJOR) if (disk->major == BLOCK_EXT_MAJOR)
blk_free_ext_minor(disk->first_minor); blk_free_ext_minor(disk->first_minor);
return WARN_ON_ONCE(ret); /* keep until all callers handle errors */ return ret;
} }
EXPORT_SYMBOL(device_add_disk); EXPORT_SYMBOL(device_add_disk);
@ -645,6 +648,26 @@ void del_gendisk(struct gendisk *disk)
} }
EXPORT_SYMBOL(del_gendisk); EXPORT_SYMBOL(del_gendisk);
/**
* invalidate_disk - invalidate the disk
* @disk: the struct gendisk to invalidate
*
* A helper to invalidate the disk. It will clean the disk's associated
* buffer/page caches and reset its internal state so that the disk
* can be reused by the drivers.
*
* Context: can sleep
*/
void invalidate_disk(struct gendisk *disk)
{
struct block_device *bdev = disk->part0;
invalidate_bdev(bdev);
bdev->bd_inode->i_mapping->wb_err = 0;
set_capacity(disk, 0);
}
EXPORT_SYMBOL(invalidate_disk);
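A hedged usage sketch (hypothetical driver code, not part of this patch): a removable-media driver would call the new helper from its media-removal path so that the same gendisk can be reused when media comes back.

static void example_media_gone(struct gendisk *disk)
{
	/* drop cached pages, clear stale writeback errors, zero the capacity */
	invalidate_disk(disk);

	/* driver-private state reset would follow here */
}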
/* sysfs access to bad-blocks list. */ /* sysfs access to bad-blocks list. */
static ssize_t disk_badblocks_show(struct device *dev, static ssize_t disk_badblocks_show(struct device *dev,
struct device_attribute *attr, struct device_attribute *attr,
@ -711,8 +734,7 @@ void __init printk_all_partitions(void)
* Don't show empty devices or things that have been * Don't show empty devices or things that have been
* suppressed * suppressed
*/ */
if (get_capacity(disk) == 0 || if (get_capacity(disk) == 0 || (disk->flags & GENHD_FL_HIDDEN))
(disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO))
continue; continue;
/* /*
@ -805,11 +827,7 @@ static int show_partition(struct seq_file *seqf, void *v)
struct block_device *part; struct block_device *part;
unsigned long idx; unsigned long idx;
/* Don't show non-partitionable removable devices or empty devices */ if (!get_capacity(sgp) || (sgp->flags & GENHD_FL_HIDDEN))
if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
(sgp->flags & GENHD_FL_REMOVABLE)))
return 0;
if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
return 0; return 0;
rcu_read_lock(); rcu_read_lock();
@ -865,7 +883,8 @@ static ssize_t disk_ext_range_show(struct device *dev,
{ {
struct gendisk *disk = dev_to_disk(dev); struct gendisk *disk = dev_to_disk(dev);
return sprintf(buf, "%d\n", disk_max_parts(disk)); return sprintf(buf, "%d\n",
(disk->flags & GENHD_FL_NO_PART) ? 1 : DISK_MAX_PARTS);
} }
static ssize_t disk_removable_show(struct device *dev, static ssize_t disk_removable_show(struct device *dev,
@ -904,7 +923,7 @@ ssize_t part_stat_show(struct device *dev,
struct device_attribute *attr, char *buf) struct device_attribute *attr, char *buf)
{ {
struct block_device *bdev = dev_to_bdev(dev); struct block_device *bdev = dev_to_bdev(dev);
struct request_queue *q = bdev->bd_disk->queue; struct request_queue *q = bdev_get_queue(bdev);
struct disk_stats stat; struct disk_stats stat;
unsigned int inflight; unsigned int inflight;
@ -948,7 +967,7 @@ ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
char *buf) char *buf)
{ {
struct block_device *bdev = dev_to_bdev(dev); struct block_device *bdev = dev_to_bdev(dev);
struct request_queue *q = bdev->bd_disk->queue; struct request_queue *q = bdev_get_queue(bdev);
unsigned int inflight[2]; unsigned int inflight[2];
if (queue_is_mq(q)) if (queue_is_mq(q))
@ -1290,6 +1309,9 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
if (!disk->bdi) if (!disk->bdi)
goto out_free_disk; goto out_free_disk;
/* bdev_alloc() might need the queue, set before the first call */
disk->queue = q;
disk->part0 = bdev_alloc(disk, 0); disk->part0 = bdev_alloc(disk, 0);
if (!disk->part0) if (!disk->part0)
goto out_free_bdi; goto out_free_bdi;
@ -1305,7 +1327,6 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
disk_to_dev(disk)->type = &disk_type; disk_to_dev(disk)->type = &disk_type;
device_initialize(disk_to_dev(disk)); device_initialize(disk_to_dev(disk));
inc_diskseq(disk); inc_diskseq(disk);
disk->queue = q;
q->disk = disk; q->disk = disk;
lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0); lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0);
#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
@ -1332,7 +1353,7 @@ struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass)
struct request_queue *q; struct request_queue *q;
struct gendisk *disk; struct gendisk *disk;
q = blk_alloc_queue(node); q = blk_alloc_queue(node, false);
if (!q) if (!q)
return NULL; return NULL;
@ -1410,12 +1431,6 @@ void set_disk_ro(struct gendisk *disk, bool read_only)
} }
EXPORT_SYMBOL(set_disk_ro); EXPORT_SYMBOL(set_disk_ro);
int bdev_read_only(struct block_device *bdev)
{
return bdev->bd_read_only || get_disk_ro(bdev->bd_disk);
}
EXPORT_SYMBOL(bdev_read_only);
void inc_diskseq(struct gendisk *disk) void inc_diskseq(struct gendisk *disk)
{ {
disk->diskseq = atomic64_inc_return(&diskseq); disk->diskseq = atomic64_inc_return(&diskseq);


@ -82,31 +82,6 @@ static int compat_blkpg_ioctl(struct block_device *bdev,
} }
#endif #endif
static int blkdev_reread_part(struct block_device *bdev, fmode_t mode)
{
struct block_device *tmp;
if (!disk_part_scan_enabled(bdev->bd_disk) || bdev_is_partition(bdev))
return -EINVAL;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
if (bdev->bd_disk->open_partitions)
return -EBUSY;
/*
* Reopen the device to revalidate the driver state and force a
* partition rescan.
*/
mode &= ~FMODE_EXCL;
set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
tmp = blkdev_get_by_dev(bdev->bd_dev, mode, NULL);
if (IS_ERR(tmp))
return PTR_ERR(tmp);
blkdev_put(tmp, mode);
return 0;
}
static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode,
unsigned long arg, unsigned long flags) unsigned long arg, unsigned long flags)
{ {
@ -133,7 +108,7 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode,
if (len & 511) if (len & 511)
return -EINVAL; return -EINVAL;
if (start + len > i_size_read(bdev->bd_inode)) if (start + len > bdev_nr_bytes(bdev))
return -EINVAL; return -EINVAL;
filemap_invalidate_lock(inode->i_mapping); filemap_invalidate_lock(inode->i_mapping);
@ -171,7 +146,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode,
return -EINVAL; return -EINVAL;
if (len & 511) if (len & 511)
return -EINVAL; return -EINVAL;
if (end >= (uint64_t)i_size_read(bdev->bd_inode)) if (end >= (uint64_t)bdev_nr_bytes(bdev))
return -EINVAL; return -EINVAL;
if (end < start) if (end < start)
return -EINVAL; return -EINVAL;
@ -522,7 +497,11 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
bdev->bd_disk->bdi->ra_pages = (arg * 512) / PAGE_SIZE; bdev->bd_disk->bdi->ra_pages = (arg * 512) / PAGE_SIZE;
return 0; return 0;
case BLKRRPART: case BLKRRPART:
return blkdev_reread_part(bdev, mode); if (!capable(CAP_SYS_ADMIN))
return -EACCES;
if (bdev_is_partition(bdev))
return -EINVAL;
return disk_scan_partitions(bdev->bd_disk, mode & ~FMODE_EXCL);
case BLKTRACESTART: case BLKTRACESTART:
case BLKTRACESTOP: case BLKTRACESTOP:
case BLKTRACETEARDOWN: case BLKTRACETEARDOWN:
@ -550,12 +529,21 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
* *
* New commands must be compatible and go into blkdev_common_ioctl * New commands must be compatible and go into blkdev_common_ioctl
*/ */
int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
unsigned long arg)
{ {
int ret; struct block_device *bdev = I_BDEV(file->f_mapping->host);
loff_t size;
void __user *argp = (void __user *)arg; void __user *argp = (void __user *)arg;
fmode_t mode = file->f_mode;
int ret;
/*
* O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
* to update it before every ioctl.
*/
if (file->f_flags & O_NDELAY)
mode |= FMODE_NDELAY;
else
mode &= ~FMODE_NDELAY;
switch (cmd) { switch (cmd) {
/* These need separate implementations for the data structure */ /* These need separate implementations for the data structure */
@ -572,10 +560,9 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
return put_long(argp, return put_long(argp,
(bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512); (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512);
case BLKGETSIZE: case BLKGETSIZE:
size = i_size_read(bdev->bd_inode); if (bdev_nr_sectors(bdev) > ~0UL)
if ((size >> 9) > ~0UL)
return -EFBIG; return -EFBIG;
return put_ulong(argp, size >> 9); return put_ulong(argp, bdev_nr_sectors(bdev));
/* The data is compatible, but the command number is different */ /* The data is compatible, but the command number is different */
case BLKBSZGET: /* get block device soft block size (cf. BLKSSZGET) */ case BLKBSZGET: /* get block device soft block size (cf. BLKSSZGET) */
@ -583,7 +570,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
case BLKBSZSET: case BLKBSZSET:
return blkdev_bszset(bdev, mode, argp); return blkdev_bszset(bdev, mode, argp);
case BLKGETSIZE64: case BLKGETSIZE64:
return put_u64(argp, i_size_read(bdev->bd_inode)); return put_u64(argp, bdev_nr_bytes(bdev));
/* Incompatible alignment on i386 */ /* Incompatible alignment on i386 */
case BLKTRACESETUP: case BLKTRACESETUP:
@ -600,7 +587,6 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
return -ENOTTY; return -ENOTTY;
return bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg); return bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg);
} }
EXPORT_SYMBOL_GPL(blkdev_ioctl); /* for /dev/raw */
#ifdef CONFIG_COMPAT #ifdef CONFIG_COMPAT
@ -618,7 +604,6 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
struct block_device *bdev = I_BDEV(file->f_mapping->host); struct block_device *bdev = I_BDEV(file->f_mapping->host);
struct gendisk *disk = bdev->bd_disk; struct gendisk *disk = bdev->bd_disk;
fmode_t mode = file->f_mode; fmode_t mode = file->f_mode;
loff_t size;
/* /*
* O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
@ -644,10 +629,9 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
return compat_put_long(argp, return compat_put_long(argp,
(bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512); (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512);
case BLKGETSIZE: case BLKGETSIZE:
size = i_size_read(bdev->bd_inode); if (bdev_nr_sectors(bdev) > ~0UL)
if ((size >> 9) > ~0UL)
return -EFBIG; return -EFBIG;
return compat_put_ulong(argp, size >> 9); return compat_put_ulong(argp, bdev_nr_sectors(bdev));
/* The data is compatible, but the command number is different */ /* The data is compatible, but the command number is different */
case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */ case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */
@ -655,7 +639,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
case BLKBSZSET_32: case BLKBSZSET_32:
return blkdev_bszset(bdev, mode, argp); return blkdev_bszset(bdev, mode, argp);
case BLKGETSIZE64_32: case BLKGETSIZE64_32:
return put_u64(argp, i_size_read(bdev->bd_inode)); return put_u64(argp, bdev_nr_bytes(bdev));
/* Incompatible alignment on i386 */ /* Incompatible alignment on i386 */
case BLKTRACESETUP32: case BLKTRACESETUP32:
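The ioctl hunks above (blkdev_ioctl() and the compat path in block/ioctl.c) replace i_size_read(bdev->bd_inode) with the bdev_nr_bytes()/bdev_nr_sectors() helpers, open-code the BLKRRPART rescan behind a CAP_SYS_ADMIN check, and switch blkdev_ioctl() to take the struct file so O_NDELAY can be folded into the mode. From userspace the commands behave the same; the following is a minimal sketch of exercising them, with /dev/sda as a purely illustrative device path:

#include <errno.h>
#include <fcntl.h>
#include <linux/fs.h>           /* BLKGETSIZE, BLKGETSIZE64, BLKRRPART */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
        const char *dev = "/dev/sda";   /* illustrative; pick a real whole disk */
        unsigned long long bytes = 0;
        unsigned long sectors = 0;
        int fd = open(dev, O_RDONLY);

        if (fd < 0) {
                perror(dev);
                return 1;
        }
        if (ioctl(fd, BLKGETSIZE64, &bytes) == 0)       /* size in bytes */
                printf("%s: %llu bytes\n", dev, bytes);
        if (ioctl(fd, BLKGETSIZE, &sectors) == 0)       /* size in 512-byte sectors;
                                                         * EFBIG if it does not fit in an unsigned long */
                printf("%s: %lu sectors\n", dev, sectors);
        /* Partition rescan: with the reworked code this needs CAP_SYS_ADMIN and
         * returns EINVAL when issued on a partition instead of the whole disk. */
        if (ioctl(fd, BLKRRPART) != 0)
                fprintf(stderr, "BLKRRPART: %s\n", strerror(errno));
        close(fd);
        return 0;
}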

View File

@ -22,46 +22,14 @@
*/ */
#include <linux/gfp.h> #include <linux/gfp.h>
#include <linux/kernel.h> #include <linux/kernel.h>
#include <linux/export.h>
#include <linux/ioprio.h> #include <linux/ioprio.h>
#include <linux/cred.h> #include <linux/cred.h>
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/capability.h> #include <linux/capability.h>
#include <linux/sched/user.h>
#include <linux/sched/task.h>
#include <linux/syscalls.h> #include <linux/syscalls.h>
#include <linux/security.h> #include <linux/security.h>
#include <linux/pid_namespace.h> #include <linux/pid_namespace.h>
int set_task_ioprio(struct task_struct *task, int ioprio)
{
int err;
struct io_context *ioc;
const struct cred *cred = current_cred(), *tcred;
rcu_read_lock();
tcred = __task_cred(task);
if (!uid_eq(tcred->uid, cred->euid) &&
!uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) {
rcu_read_unlock();
return -EPERM;
}
rcu_read_unlock();
err = security_task_setioprio(task, ioprio);
if (err)
return err;
ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
if (ioc) {
ioc->ioprio = ioprio;
put_io_context(ioc);
}
return err;
}
EXPORT_SYMBOL_GPL(set_task_ioprio);
int ioprio_check_cap(int ioprio) int ioprio_check_cap(int ioprio)
{ {
int class = IOPRIO_PRIO_CLASS(ioprio); int class = IOPRIO_PRIO_CLASS(ioprio);
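The hunk above drops the open-coded set_task_ioprio() from block/ioprio.c together with the includes it needed; what stays in this file is ioprio_check_cap(), which begins by extracting the class from the upper bits of the priority value. A hedged userspace sketch of building such a value and applying it through the ioprio_set() syscall follows; the IOPRIO_* constants are redefined locally for illustration and are assumed to match the uapi encoding:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Local copies for illustration; assumed to match <linux/ioprio.h>. */
#define IOPRIO_CLASS_SHIFT      13
#define IOPRIO_PRIO_VALUE(class, data)  (((class) << IOPRIO_CLASS_SHIFT) | (data))
#define IOPRIO_CLASS_BE         2       /* best-effort: no special capability needed */
#define IOPRIO_WHO_PROCESS      1

int main(void)
{
        int ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);

        /* who == 0 means the calling process. */
        if (syscall(SYS_ioprio_set, IOPRIO_WHO_PROCESS, 0, ioprio) != 0) {
                perror("ioprio_set");
                return 1;
        }
        printf("class %d, level %d (raw %#x)\n",
               ioprio >> IOPRIO_CLASS_SHIFT,
               ioprio & ((1 << IOPRIO_CLASS_SHIFT) - 1), ioprio);
        return 0;
}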

View File

@ -9,12 +9,12 @@
#include <linux/kernel.h> #include <linux/kernel.h>
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/blk-mq.h> #include <linux/blk-mq.h>
#include <linux/elevator.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/sbitmap.h> #include <linux/sbitmap.h>
#include <trace/events/block.h> #include <trace/events/block.h>
#include "elevator.h"
#include "blk.h" #include "blk.h"
#include "blk-mq.h" #include "blk-mq.h"
#include "blk-mq-debugfs.h" #include "blk-mq-debugfs.h"
@ -433,6 +433,7 @@ static void kyber_exit_sched(struct elevator_queue *e)
int i; int i;
del_timer_sync(&kqd->timer); del_timer_sync(&kqd->timer);
blk_stat_disable_accounting(kqd->q);
for (i = 0; i < KYBER_NUM_DOMAINS; i++) for (i = 0; i < KYBER_NUM_DOMAINS; i++)
sbitmap_queue_free(&kqd->domain_tokens[i]); sbitmap_queue_free(&kqd->domain_tokens[i]);
@ -453,11 +454,11 @@ static void kyber_depth_updated(struct blk_mq_hw_ctx *hctx)
{ {
struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data; struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
struct blk_mq_tags *tags = hctx->sched_tags; struct blk_mq_tags *tags = hctx->sched_tags;
unsigned int shift = tags->bitmap_tags->sb.shift; unsigned int shift = tags->bitmap_tags.sb.shift;
kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U; kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
sbitmap_queue_min_shallow_depth(tags->bitmap_tags, kqd->async_depth); sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, kqd->async_depth);
} }
static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
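The Kyber hunks switch from <linux/elevator.h> to the local "elevator.h", disable block stats accounting on teardown, and follow the sched_tags layout change (bitmap_tags is now embedded rather than a pointer). The async depth formula itself is untouched; as a small standalone illustration of what it computes, assuming the in-tree KYBER_ASYNC_PERCENT of 75 (an assumption worth checking against kyber-iosched.c):

#include <stdio.h>

#define KYBER_ASYNC_PERCENT 75U         /* assumed default; see kyber-iosched.c */

int main(void)
{
        /* shift is sbitmap's per-word shift; the shallow depth reserved for
         * async I/O ends up as 75% of one word's worth of tags. */
        for (unsigned int shift = 4; shift <= 6; shift++)
                printf("shift %u -> async_depth %u of %u\n", shift,
                       (1U << shift) * KYBER_ASYNC_PERCENT / 100U, 1U << shift);
        return 0;
}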

View File

@ -9,7 +9,6 @@
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/blk-mq.h> #include <linux/blk-mq.h>
#include <linux/elevator.h>
#include <linux/bio.h> #include <linux/bio.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/slab.h> #include <linux/slab.h>
@ -20,6 +19,7 @@
#include <trace/events/block.h> #include <trace/events/block.h>
#include "elevator.h"
#include "blk.h" #include "blk.h"
#include "blk-mq.h" #include "blk-mq.h"
#include "blk-mq-debugfs.h" #include "blk-mq-debugfs.h"
@ -31,6 +31,11 @@
*/ */
static const int read_expire = HZ / 2; /* max time before a read is submitted. */ static const int read_expire = HZ / 2; /* max time before a read is submitted. */
static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */ static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
/*
* Time after which to dispatch lower priority requests even if higher
* priority requests are pending.
*/
static const int prio_aging_expire = 10 * HZ;
static const int writes_starved = 2; /* max times reads can starve a write */ static const int writes_starved = 2; /* max times reads can starve a write */
static const int fifo_batch = 16; /* # of sequential requests treated as one static const int fifo_batch = 16; /* # of sequential requests treated as one
by the above parameters. For throughput. */ by the above parameters. For throughput. */
@ -51,17 +56,16 @@ enum dd_prio {
enum { DD_PRIO_COUNT = 3 }; enum { DD_PRIO_COUNT = 3 };
/* I/O statistics per I/O priority. */ /*
* I/O statistics per I/O priority. It is fine if these counters overflow.
* What matters is that these counters are at least as wide as
* log2(max_outstanding_requests).
*/
struct io_stats_per_prio { struct io_stats_per_prio {
local_t inserted; uint32_t inserted;
local_t merged; uint32_t merged;
local_t dispatched; uint32_t dispatched;
local_t completed; atomic_t completed;
};
/* I/O statistics for all I/O priorities (enum dd_prio). */
struct io_stats {
struct io_stats_per_prio stats[DD_PRIO_COUNT];
}; };
/* /*
@ -74,6 +78,7 @@ struct dd_per_prio {
struct list_head fifo_list[DD_DIR_COUNT]; struct list_head fifo_list[DD_DIR_COUNT];
/* Next request in FIFO order. Read, write or both are NULL. */ /* Next request in FIFO order. Read, write or both are NULL. */
struct request *next_rq[DD_DIR_COUNT]; struct request *next_rq[DD_DIR_COUNT];
struct io_stats_per_prio stats;
}; };
struct deadline_data { struct deadline_data {
@ -88,8 +93,6 @@ struct deadline_data {
unsigned int batching; /* number of sequential requests made */ unsigned int batching; /* number of sequential requests made */
unsigned int starved; /* times reads have starved writes */ unsigned int starved; /* times reads have starved writes */
struct io_stats __percpu *stats;
/* /*
* settings that change how the i/o scheduler behaves * settings that change how the i/o scheduler behaves
*/ */
@ -98,38 +101,12 @@ struct deadline_data {
int writes_starved; int writes_starved;
int front_merges; int front_merges;
u32 async_depth; u32 async_depth;
int prio_aging_expire;
spinlock_t lock; spinlock_t lock;
spinlock_t zone_lock; spinlock_t zone_lock;
}; };
/* Count one event of type 'event_type' and with I/O priority 'prio' */
#define dd_count(dd, event_type, prio) do { \
struct io_stats *io_stats = get_cpu_ptr((dd)->stats); \
\
BUILD_BUG_ON(!__same_type((dd), struct deadline_data *)); \
BUILD_BUG_ON(!__same_type((prio), enum dd_prio)); \
local_inc(&io_stats->stats[(prio)].event_type); \
put_cpu_ptr(io_stats); \
} while (0)
/*
* Returns the total number of dd_count(dd, event_type, prio) calls across all
* CPUs. No locking or barriers since it is fine if the returned sum is slightly
* outdated.
*/
#define dd_sum(dd, event_type, prio) ({ \
unsigned int cpu; \
u32 sum = 0; \
\
BUILD_BUG_ON(!__same_type((dd), struct deadline_data *)); \
BUILD_BUG_ON(!__same_type((prio), enum dd_prio)); \
for_each_present_cpu(cpu) \
sum += local_read(&per_cpu_ptr((dd)->stats, cpu)-> \
stats[(prio)].event_type); \
sum; \
})
/* Maps an I/O priority class to a deadline scheduler priority. */ /* Maps an I/O priority class to a deadline scheduler priority. */
static const enum dd_prio ioprio_class_to_prio[] = { static const enum dd_prio ioprio_class_to_prio[] = {
[IOPRIO_CLASS_NONE] = DD_BE_PRIO, [IOPRIO_CLASS_NONE] = DD_BE_PRIO,
@ -233,7 +210,9 @@ static void dd_merged_requests(struct request_queue *q, struct request *req,
const u8 ioprio_class = dd_rq_ioclass(next); const u8 ioprio_class = dd_rq_ioclass(next);
const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
dd_count(dd, merged, prio); lockdep_assert_held(&dd->lock);
dd->per_prio[prio].stats.merged++;
/* /*
* if next expires before rq, assign its expire time to rq * if next expires before rq, assign its expire time to rq
@ -270,6 +249,16 @@ deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
deadline_remove_request(rq->q, per_prio, rq); deadline_remove_request(rq->q, per_prio, rq);
} }
/* Number of requests queued for a given priority level. */
static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio)
{
const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats;
lockdep_assert_held(&dd->lock);
return stats->inserted - atomic_read(&stats->completed);
}
/* /*
* deadline_check_fifo returns 0 if there are no expired requests on the fifo, * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
* 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir]) * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
@ -355,12 +344,27 @@ deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
return rq; return rq;
} }
/*
* Returns true if and only if @rq started after @latest_start where
* @latest_start is in jiffies.
*/
static bool started_after(struct deadline_data *dd, struct request *rq,
unsigned long latest_start)
{
unsigned long start_time = (unsigned long)rq->fifo_time;
start_time -= dd->fifo_expire[rq_data_dir(rq)];
return time_after(start_time, latest_start);
}
/* /*
* deadline_dispatch_requests selects the best request according to * deadline_dispatch_requests selects the best request according to
* read/write expire, fifo_batch, etc * read/write expire, fifo_batch, etc and with a start time <= @latest_start.
*/ */
static struct request *__dd_dispatch_request(struct deadline_data *dd, static struct request *__dd_dispatch_request(struct deadline_data *dd,
struct dd_per_prio *per_prio) struct dd_per_prio *per_prio,
unsigned long latest_start)
{ {
struct request *rq, *next_rq; struct request *rq, *next_rq;
enum dd_data_dir data_dir; enum dd_data_dir data_dir;
@ -372,6 +376,8 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
if (!list_empty(&per_prio->dispatch)) { if (!list_empty(&per_prio->dispatch)) {
rq = list_first_entry(&per_prio->dispatch, struct request, rq = list_first_entry(&per_prio->dispatch, struct request,
queuelist); queuelist);
if (started_after(dd, rq, latest_start))
return NULL;
list_del_init(&rq->queuelist); list_del_init(&rq->queuelist);
goto done; goto done;
} }
@ -449,6 +455,9 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
dd->batching = 0; dd->batching = 0;
dispatch_request: dispatch_request:
if (started_after(dd, rq, latest_start))
return NULL;
/* /*
* rq is the selected appropriate request. * rq is the selected appropriate request.
*/ */
@ -457,7 +466,7 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
done: done:
ioprio_class = dd_rq_ioclass(rq); ioprio_class = dd_rq_ioclass(rq);
prio = ioprio_class_to_prio[ioprio_class]; prio = ioprio_class_to_prio[ioprio_class];
dd_count(dd, dispatched, prio); dd->per_prio[prio].stats.dispatched++;
/* /*
* If the request needs its target zone locked, do it. * If the request needs its target zone locked, do it.
*/ */
@ -466,6 +475,34 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
return rq; return rq;
} }
/*
* Check whether there are any requests with priority other than DD_RT_PRIO
* that were inserted more than prio_aging_expire jiffies ago.
*/
static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd,
unsigned long now)
{
struct request *rq;
enum dd_prio prio;
int prio_cnt;
lockdep_assert_held(&dd->lock);
prio_cnt = !!dd_queued(dd, DD_RT_PRIO) + !!dd_queued(dd, DD_BE_PRIO) +
!!dd_queued(dd, DD_IDLE_PRIO);
if (prio_cnt < 2)
return NULL;
for (prio = DD_BE_PRIO; prio <= DD_PRIO_MAX; prio++) {
rq = __dd_dispatch_request(dd, &dd->per_prio[prio],
now - dd->prio_aging_expire);
if (rq)
return rq;
}
return NULL;
}
/* /*
* Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests(). * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests().
* *
@ -477,15 +514,26 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
{ {
struct deadline_data *dd = hctx->queue->elevator->elevator_data; struct deadline_data *dd = hctx->queue->elevator->elevator_data;
const unsigned long now = jiffies;
struct request *rq; struct request *rq;
enum dd_prio prio; enum dd_prio prio;
spin_lock(&dd->lock); spin_lock(&dd->lock);
rq = dd_dispatch_prio_aged_requests(dd, now);
if (rq)
goto unlock;
/*
* Next, dispatch requests in priority order. Ignore lower priority
* requests if any higher priority requests are pending.
*/
for (prio = 0; prio <= DD_PRIO_MAX; prio++) { for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
rq = __dd_dispatch_request(dd, &dd->per_prio[prio]); rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now);
if (rq) if (rq || dd_queued(dd, prio))
break; break;
} }
unlock:
spin_unlock(&dd->lock); spin_unlock(&dd->lock);
return rq; return rq;
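With the aging logic added above, dd_dispatch_request() first lets dd_dispatch_prio_aged_requests() hand out best-effort or idle requests that have waited longer than prio_aging_expire, and only when at least two priority levels have work queued; it then falls back to strict priority order, stopping at the first level with anything pending. A toy userspace model of that decision, with jiffies replaced by plain integers and all names illustrative rather than kernel API:

#include <stdbool.h>
#include <stdio.h>

enum { RT, BE, IDLE, PRIO_COUNT };

struct prio_state {
        unsigned int queued;    /* inserted - completed */
        long oldest_start;      /* start time of the oldest queued request */
};

/* Mirrors the shape of the decision, not the kernel implementation. */
static bool aging_applies(const struct prio_state s[PRIO_COUNT],
                          long now, long prio_aging_expire)
{
        int levels_with_work = 0;
        int p;

        for (p = RT; p < PRIO_COUNT; p++)
                levels_with_work += s[p].queued != 0;
        if (levels_with_work < 2)       /* nothing can be starved */
                return false;
        /* Only BE and idle are aged; RT is always served first anyway. */
        for (p = BE; p < PRIO_COUNT; p++)
                if (s[p].queued && now - s[p].oldest_start > prio_aging_expire)
                        return true;
        return false;
}

int main(void)
{
        struct prio_state s[PRIO_COUNT] = {
                [RT]   = { .queued = 8, .oldest_start = 90 },
                [IDLE] = { .queued = 1, .oldest_start = 0  },
        };

        printf("dispatch aged low-priority request first: %s\n",
               aging_applies(s, 100, 50) ? "yes" : "no (strict priority)");
        return 0;
}

In the kernel the comparison runs the other way round: started_after() reconstructs the request's start time as fifo_time minus the per-direction fifo_expire and checks it against now - prio_aging_expire with time_after(), which keeps the arithmetic safe across jiffies wraparound.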
@ -519,7 +567,7 @@ static void dd_depth_updated(struct blk_mq_hw_ctx *hctx)
dd->async_depth = max(1UL, 3 * q->nr_requests / 4); dd->async_depth = max(1UL, 3 * q->nr_requests / 4);
sbitmap_queue_min_shallow_depth(tags->bitmap_tags, dd->async_depth); sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, dd->async_depth);
} }
/* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */ /* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */
@ -536,12 +584,21 @@ static void dd_exit_sched(struct elevator_queue *e)
for (prio = 0; prio <= DD_PRIO_MAX; prio++) { for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
struct dd_per_prio *per_prio = &dd->per_prio[prio]; struct dd_per_prio *per_prio = &dd->per_prio[prio];
const struct io_stats_per_prio *stats = &per_prio->stats;
uint32_t queued;
WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_READ])); WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_READ]));
WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE])); WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE]));
}
free_percpu(dd->stats); spin_lock(&dd->lock);
queued = dd_queued(dd, prio);
spin_unlock(&dd->lock);
WARN_ONCE(queued != 0,
"statistics for priority %d: i %u m %u d %u c %u\n",
prio, stats->inserted, stats->merged,
stats->dispatched, atomic_read(&stats->completed));
}
kfree(dd); kfree(dd);
} }
@ -566,11 +623,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
eq->elevator_data = dd; eq->elevator_data = dd;
dd->stats = alloc_percpu_gfp(typeof(*dd->stats),
GFP_KERNEL | __GFP_ZERO);
if (!dd->stats)
goto free_dd;
for (prio = 0; prio <= DD_PRIO_MAX; prio++) { for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
struct dd_per_prio *per_prio = &dd->per_prio[prio]; struct dd_per_prio *per_prio = &dd->per_prio[prio];
@ -586,15 +638,13 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
dd->front_merges = 1; dd->front_merges = 1;
dd->last_dir = DD_WRITE; dd->last_dir = DD_WRITE;
dd->fifo_batch = fifo_batch; dd->fifo_batch = fifo_batch;
dd->prio_aging_expire = prio_aging_expire;
spin_lock_init(&dd->lock); spin_lock_init(&dd->lock);
spin_lock_init(&dd->zone_lock); spin_lock_init(&dd->zone_lock);
q->elevator = eq; q->elevator = eq;
return 0; return 0;
free_dd:
kfree(dd);
put_eq: put_eq:
kobject_put(&eq->kobj); kobject_put(&eq->kobj);
return ret; return ret;
@ -677,8 +727,11 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
blk_req_zone_write_unlock(rq); blk_req_zone_write_unlock(rq);
prio = ioprio_class_to_prio[ioprio_class]; prio = ioprio_class_to_prio[ioprio_class];
dd_count(dd, inserted, prio); per_prio = &dd->per_prio[prio];
rq->elv.priv[0] = (void *)(uintptr_t)1; if (!rq->elv.priv[0]) {
per_prio->stats.inserted++;
rq->elv.priv[0] = (void *)(uintptr_t)1;
}
if (blk_mq_sched_try_insert_merge(q, rq, &free)) { if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
blk_mq_free_requests(&free); blk_mq_free_requests(&free);
@ -687,7 +740,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
trace_block_rq_insert(rq); trace_block_rq_insert(rq);
per_prio = &dd->per_prio[prio];
if (at_head) { if (at_head) {
list_add(&rq->queuelist, &per_prio->dispatch); list_add(&rq->queuelist, &per_prio->dispatch);
} else { } else {
@ -759,12 +811,13 @@ static void dd_finish_request(struct request *rq)
/* /*
* The block layer core may call dd_finish_request() without having * The block layer core may call dd_finish_request() without having
* called dd_insert_requests(). Hence only update statistics for * called dd_insert_requests(). Skip requests that bypassed I/O
* requests for which dd_insert_requests() has been called. See also * scheduling. See also blk_mq_request_bypass_insert().
* blk_mq_request_bypass_insert().
*/ */
if (rq->elv.priv[0]) if (!rq->elv.priv[0])
dd_count(dd, completed, prio); return;
atomic_inc(&per_prio->stats.completed);
if (blk_queue_is_zoned(q)) { if (blk_queue_is_zoned(q)) {
unsigned long flags; unsigned long flags;
@ -809,6 +862,7 @@ static ssize_t __FUNC(struct elevator_queue *e, char *page) \
#define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR)) #define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR))
SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]); SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]);
SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]); SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]);
SHOW_JIFFIES(deadline_prio_aging_expire_show, dd->prio_aging_expire);
SHOW_INT(deadline_writes_starved_show, dd->writes_starved); SHOW_INT(deadline_writes_starved_show, dd->writes_starved);
SHOW_INT(deadline_front_merges_show, dd->front_merges); SHOW_INT(deadline_front_merges_show, dd->front_merges);
SHOW_INT(deadline_async_depth_show, dd->async_depth); SHOW_INT(deadline_async_depth_show, dd->async_depth);
@ -838,6 +892,7 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)
STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies) STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies)
STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX); STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX);
STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX); STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX);
STORE_JIFFIES(deadline_prio_aging_expire_store, &dd->prio_aging_expire, 0, INT_MAX);
STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX); STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX);
STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1); STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1);
STORE_INT(deadline_async_depth_store, &dd->async_depth, 1, INT_MAX); STORE_INT(deadline_async_depth_store, &dd->async_depth, 1, INT_MAX);
@ -856,6 +911,7 @@ static struct elv_fs_entry deadline_attrs[] = {
DD_ATTR(front_merges), DD_ATTR(front_merges),
DD_ATTR(async_depth), DD_ATTR(async_depth),
DD_ATTR(fifo_batch), DD_ATTR(fifo_batch),
DD_ATTR(prio_aging_expire),
__ATTR_NULL __ATTR_NULL
}; };
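prio_aging_expire is also exported as a writable scheduler attribute; like the other expiry knobs it is stored in jiffies but shown and set in milliseconds (SHOW_JIFFIES/STORE_JIFFIES), so the 10 * HZ default reads back as 10000. A small sketch of reading it from userspace, assuming mq-deadline is the active scheduler on the disk and using an illustrative sysfs path:

#include <stdio.h>

int main(void)
{
        /* Illustrative path; the attribute sits in the disk's iosched
         * directory while mq-deadline is the active scheduler. */
        const char *path = "/sys/block/sda/queue/iosched/prio_aging_expire";
        FILE *f = fopen(path, "r");
        long msecs;

        if (!f) {
                perror(path);
                return 1;
        }
        if (fscanf(f, "%ld", &msecs) == 1)
                printf("prio_aging_expire = %ld ms\n", msecs);
        fclose(f);
        return 0;
}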
@ -947,38 +1003,48 @@ static int dd_async_depth_show(void *data, struct seq_file *m)
return 0; return 0;
} }
/* Number of requests queued for a given priority level. */
static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio)
{
return dd_sum(dd, inserted, prio) - dd_sum(dd, completed, prio);
}
static int dd_queued_show(void *data, struct seq_file *m) static int dd_queued_show(void *data, struct seq_file *m)
{ {
struct request_queue *q = data; struct request_queue *q = data;
struct deadline_data *dd = q->elevator->elevator_data; struct deadline_data *dd = q->elevator->elevator_data;
u32 rt, be, idle;
spin_lock(&dd->lock);
rt = dd_queued(dd, DD_RT_PRIO);
be = dd_queued(dd, DD_BE_PRIO);
idle = dd_queued(dd, DD_IDLE_PRIO);
spin_unlock(&dd->lock);
seq_printf(m, "%u %u %u\n", rt, be, idle);
seq_printf(m, "%u %u %u\n", dd_queued(dd, DD_RT_PRIO),
dd_queued(dd, DD_BE_PRIO),
dd_queued(dd, DD_IDLE_PRIO));
return 0; return 0;
} }
/* Number of requests owned by the block driver for a given priority. */ /* Number of requests owned by the block driver for a given priority. */
static u32 dd_owned_by_driver(struct deadline_data *dd, enum dd_prio prio) static u32 dd_owned_by_driver(struct deadline_data *dd, enum dd_prio prio)
{ {
return dd_sum(dd, dispatched, prio) + dd_sum(dd, merged, prio) const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats;
- dd_sum(dd, completed, prio);
lockdep_assert_held(&dd->lock);
return stats->dispatched + stats->merged -
atomic_read(&stats->completed);
} }
static int dd_owned_by_driver_show(void *data, struct seq_file *m) static int dd_owned_by_driver_show(void *data, struct seq_file *m)
{ {
struct request_queue *q = data; struct request_queue *q = data;
struct deadline_data *dd = q->elevator->elevator_data; struct deadline_data *dd = q->elevator->elevator_data;
u32 rt, be, idle;
spin_lock(&dd->lock);
rt = dd_owned_by_driver(dd, DD_RT_PRIO);
be = dd_owned_by_driver(dd, DD_BE_PRIO);
idle = dd_owned_by_driver(dd, DD_IDLE_PRIO);
spin_unlock(&dd->lock);
seq_printf(m, "%u %u %u\n", rt, be, idle);
seq_printf(m, "%u %u %u\n", dd_owned_by_driver(dd, DD_RT_PRIO),
dd_owned_by_driver(dd, DD_BE_PRIO),
dd_owned_by_driver(dd, DD_IDLE_PRIO));
return 0; return 0;
} }
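Taken together, the mq-deadline statistics rework replaces the per-CPU dd_count()/dd_sum() machinery with plain 32-bit counters updated under dd->lock, plus an atomic completion count (completions may happen without the lock). As the new comment notes, overflow is harmless because only differences such as inserted - completed are ever consumed, and unsigned subtraction stays exact as long as the true outstanding count fits in 32 bits. A standalone demonstration of that property (pure arithmetic, no kernel API):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t inserted  = UINT32_MAX - 2;    /* about to wrap */
        uint32_t completed = UINT32_MAX - 5;    /* 3 requests still queued */

        /* Insert four more requests; the counter wraps past zero. */
        inserted += 4;                          /* now 1 */
        printf("inserted=%" PRIu32 " completed=%" PRIu32 " queued=%" PRIu32 "\n",
               inserted, completed, inserted - completed);      /* queued == 7 */
        return 0;
}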

View File

@ -2,6 +2,8 @@
# #
# Partition configuration # Partition configuration
# #
menu "Partition Types"
config PARTITION_ADVANCED config PARTITION_ADVANCED
bool "Advanced partition selection" bool "Advanced partition selection"
help help
@ -267,3 +269,5 @@ config CMDLINE_PARTITION
help help
Say Y here if you want to read the partition table from bootargs. Say Y here if you want to read the partition table from bootargs.
The format for the command line is just like mtdparts. The format for the command line is just like mtdparts.
endmenu

View File

@ -91,19 +91,19 @@ static void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors)
{ {
spin_lock(&bdev->bd_size_lock); spin_lock(&bdev->bd_size_lock);
i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT);
bdev->bd_nr_sectors = sectors;
spin_unlock(&bdev->bd_size_lock); spin_unlock(&bdev->bd_size_lock);
} }
static struct parsed_partitions *allocate_partitions(struct gendisk *hd) static struct parsed_partitions *allocate_partitions(struct gendisk *hd)
{ {
struct parsed_partitions *state; struct parsed_partitions *state;
int nr; int nr = DISK_MAX_PARTS;
state = kzalloc(sizeof(*state), GFP_KERNEL); state = kzalloc(sizeof(*state), GFP_KERNEL);
if (!state) if (!state)
return NULL; return NULL;
nr = disk_max_parts(hd);
state->parts = vzalloc(array_size(nr, sizeof(state->parts[0]))); state->parts = vzalloc(array_size(nr, sizeof(state->parts[0])));
if (!state->parts) { if (!state->parts) {
kfree(state); kfree(state);
@ -204,7 +204,7 @@ static ssize_t part_alignment_offset_show(struct device *dev,
struct block_device *bdev = dev_to_bdev(dev); struct block_device *bdev = dev_to_bdev(dev);
return sprintf(buf, "%u\n", return sprintf(buf, "%u\n",
queue_limit_alignment_offset(&bdev->bd_disk->queue->limits, queue_limit_alignment_offset(&bdev_get_queue(bdev)->limits,
bdev->bd_start_sect)); bdev->bd_start_sect));
} }
@ -214,7 +214,7 @@ static ssize_t part_discard_alignment_show(struct device *dev,
struct block_device *bdev = dev_to_bdev(dev); struct block_device *bdev = dev_to_bdev(dev);
return sprintf(buf, "%u\n", return sprintf(buf, "%u\n",
queue_limit_discard_alignment(&bdev->bd_disk->queue->limits, queue_limit_discard_alignment(&bdev_get_queue(bdev)->limits,
bdev->bd_start_sect)); bdev->bd_start_sect));
} }
@ -325,7 +325,7 @@ static struct block_device *add_partition(struct gendisk *disk, int partno,
lockdep_assert_held(&disk->open_mutex); lockdep_assert_held(&disk->open_mutex);
if (partno >= disk_max_parts(disk)) if (partno >= DISK_MAX_PARTS)
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
/* /*
@ -526,18 +526,15 @@ int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start,
static bool disk_unlock_native_capacity(struct gendisk *disk) static bool disk_unlock_native_capacity(struct gendisk *disk)
{ {
const struct block_device_operations *bdops = disk->fops; if (!disk->fops->unlock_native_capacity ||
test_and_set_bit(GD_NATIVE_CAPACITY, &disk->state)) {
if (bdops->unlock_native_capacity &&
!(disk->flags & GENHD_FL_NATIVE_CAPACITY)) {
printk(KERN_CONT "enabling native capacity\n");
bdops->unlock_native_capacity(disk);
disk->flags |= GENHD_FL_NATIVE_CAPACITY;
return true;
} else {
printk(KERN_CONT "truncated\n"); printk(KERN_CONT "truncated\n");
return false; return false;
} }
printk(KERN_CONT "enabling native capacity\n");
disk->fops->unlock_native_capacity(disk);
return true;
} }
void blk_drop_partitions(struct gendisk *disk) void blk_drop_partitions(struct gendisk *disk)
@ -606,7 +603,7 @@ static int blk_add_partitions(struct gendisk *disk)
struct parsed_partitions *state; struct parsed_partitions *state;
int ret = -EAGAIN, p; int ret = -EAGAIN, p;
if (!disk_part_scan_enabled(disk)) if (disk->flags & GENHD_FL_NO_PART)
return 0; return 0;
state = check_partition(disk); state = check_partition(disk);
@ -689,7 +686,7 @@ int bdev_disk_changed(struct gendisk *disk, bool invalidate)
* userspace for this particular setup. * userspace for this particular setup.
*/ */
if (invalidate) { if (invalidate) {
if (disk_part_scan_enabled(disk) || if (!(disk->flags & GENHD_FL_NO_PART) ||
!(disk->flags & GENHD_FL_REMOVABLE)) !(disk->flags & GENHD_FL_REMOVABLE))
set_capacity(disk, 0); set_capacity(disk, 0);
} }
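In the partition rescan path above, disk_unlock_native_capacity() now records the "already tried" state with test_and_set_bit(GD_NATIVE_CAPACITY, &disk->state), so the check and the claim happen in one atomic step and the driver callback can fire at most once per disk. A userspace sketch of the same run-once idiom using a compiler atomic builtin (the kernel bitops themselves are not available outside the kernel):

#include <stdbool.h>
#include <stdio.h>

static unsigned long state;             /* stand-in for disk->state */
#define NATIVE_CAPACITY_BIT 0

static bool try_unlock_native_capacity(void)
{
        unsigned long mask = 1UL << NATIVE_CAPACITY_BIT;

        /* Atomically set the bit and learn whether it was already set,
         * analogous to test_and_set_bit(GD_NATIVE_CAPACITY, &disk->state). */
        if (__atomic_fetch_or(&state, mask, __ATOMIC_SEQ_CST) & mask) {
                printf("truncated (already tried)\n");
                return false;
        }
        printf("enabling native capacity\n");
        return true;
}

int main(void)
{
        try_unlock_native_capacity();   /* first call enables */
        try_unlock_native_capacity();   /* second call reports truncated */
        return 0;
}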

View File

@ -133,7 +133,7 @@ efi_crc32(const void *buf, unsigned long len)
*/ */
static u64 last_lba(struct gendisk *disk) static u64 last_lba(struct gendisk *disk)
{ {
return div_u64(disk->part0->bd_inode->i_size, return div_u64(bdev_nr_bytes(disk->part0),
queue_logical_block_size(disk->queue)) - 1ULL; queue_logical_block_size(disk->queue)) - 1ULL;
} }
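last_lba() above now derives the last addressable block from bdev_nr_bytes() instead of reading the inode size directly; the arithmetic is unchanged: total bytes divided by the logical block size, minus one. A worked example with illustrative numbers:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t nr_bytes = 500107862016ULL;    /* a nominal "500 GB" disk */
        uint32_t lbs = 4096;                    /* 4 KiB logical blocks */

        /* 500107862016 / 4096 - 1 = 122096646 - 1 = 122096645 */
        printf("last_lba = %" PRIu64 "\n", nr_bytes / lbs - 1);
        return 0;
}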

View File

@ -198,7 +198,7 @@ static int find_lnx1_partitions(struct parsed_partitions *state,
char name[], char name[],
union label_t *label, union label_t *label,
sector_t labelsect, sector_t labelsect,
loff_t i_size, sector_t nr_sectors,
dasd_information2_t *info) dasd_information2_t *info)
{ {
loff_t offset, geo_size, size; loff_t offset, geo_size, size;
@ -213,14 +213,14 @@ static int find_lnx1_partitions(struct parsed_partitions *state,
} else { } else {
/* /*
* Formatted w/o large volume support. If the sanity check * Formatted w/o large volume support. If the sanity check
* 'size based on geo == size based on i_size' is true, then * 'size based on geo == size based on nr_sectors' is true, then
* we can safely assume that we know the formatted size of * we can safely assume that we know the formatted size of
* the disk, otherwise we need additional information * the disk, otherwise we need additional information
* that we can only get from a real DASD device. * that we can only get from a real DASD device.
*/ */
geo_size = geo->cylinders * geo->heads geo_size = geo->cylinders * geo->heads
* geo->sectors * secperblk; * geo->sectors * secperblk;
size = i_size >> 9; size = nr_sectors;
if (size != geo_size) { if (size != geo_size) {
if (!info) { if (!info) {
strlcat(state->pp_buf, "\n", PAGE_SIZE); strlcat(state->pp_buf, "\n", PAGE_SIZE);
@ -229,7 +229,7 @@ static int find_lnx1_partitions(struct parsed_partitions *state,
if (!strcmp(info->type, "ECKD")) if (!strcmp(info->type, "ECKD"))
if (geo_size < size) if (geo_size < size)
size = geo_size; size = geo_size;
/* else keep size based on i_size */ /* else keep size based on nr_sectors */
} }
} }
/* first and only partition starts in the first block after the label */ /* first and only partition starts in the first block after the label */
@ -293,7 +293,8 @@ int ibm_partition(struct parsed_partitions *state)
struct gendisk *disk = state->disk; struct gendisk *disk = state->disk;
struct block_device *bdev = disk->part0; struct block_device *bdev = disk->part0;
int blocksize, res; int blocksize, res;
loff_t i_size, offset, size; loff_t offset, size;
sector_t nr_sectors;
dasd_information2_t *info; dasd_information2_t *info;
struct hd_geometry *geo; struct hd_geometry *geo;
char type[5] = {0,}; char type[5] = {0,};
@ -308,8 +309,8 @@ int ibm_partition(struct parsed_partitions *state)
blocksize = bdev_logical_block_size(bdev); blocksize = bdev_logical_block_size(bdev);
if (blocksize <= 0) if (blocksize <= 0)
goto out_symbol; goto out_symbol;
i_size = i_size_read(bdev->bd_inode); nr_sectors = bdev_nr_sectors(bdev);
if (i_size == 0) if (nr_sectors == 0)
goto out_symbol; goto out_symbol;
info = kmalloc(sizeof(dasd_information2_t), GFP_KERNEL); info = kmalloc(sizeof(dasd_information2_t), GFP_KERNEL);
if (info == NULL) if (info == NULL)
@ -336,7 +337,7 @@ int ibm_partition(struct parsed_partitions *state)
label); label);
} else if (!strncmp(type, "LNX1", 4)) { } else if (!strncmp(type, "LNX1", 4)) {
res = find_lnx1_partitions(state, geo, blocksize, name, res = find_lnx1_partitions(state, geo, blocksize, name,
label, labelsect, i_size, label, labelsect, nr_sectors,
info); info);
} else if (!strncmp(type, "CMS1", 4)) { } else if (!strncmp(type, "CMS1", 4)) {
res = find_cms1_partitions(state, geo, blocksize, name, res = find_cms1_partitions(state, geo, blocksize, name,
@ -353,7 +354,7 @@ int ibm_partition(struct parsed_partitions *state)
res = 1; res = 1;
if (info->format == DASD_FORMAT_LDL) { if (info->format == DASD_FORMAT_LDL) {
strlcat(state->pp_buf, "(nonl)", PAGE_SIZE); strlcat(state->pp_buf, "(nonl)", PAGE_SIZE);
size = i_size >> 9; size = nr_sectors;
offset = (info->label_block + 1) * (blocksize >> 9); offset = (info->label_block + 1) * (blocksize >> 9);
put_partition(state, 1, offset, size-offset); put_partition(state, 1, offset, size-offset);
strlcat(state->pp_buf, "\n", PAGE_SIZE); strlcat(state->pp_buf, "\n", PAGE_SIZE);

View File

@ -5,7 +5,7 @@
*/ */
#include <linux/t10-pi.h> #include <linux/t10-pi.h>
#include <linux/blkdev.h> #include <linux/blk-integrity.h>
#include <linux/crc-t10dif.h> #include <linux/crc-t10dif.h>
#include <linux/module.h> #include <linux/module.h>
#include <net/checksum.h> #include <net/checksum.h>