Mirror of https://github.com/Qortal/Brooklyn.git
Raziel K. Crowe 2022-04-02 17:29:52 +05:00
parent fb209289b8
commit 07d9c3128d
47 changed files with 3310 additions and 2782 deletions

block/Kconfig

@@ -35,6 +35,9 @@ config BLK_CGROUP_RWSTAT
 config BLK_DEV_BSG_COMMON
 	tristate
 
+config BLK_ICQ
+	bool
+
 config BLK_DEV_BSGLIB
 	bool "Block layer SG support v4 helper lib"
 	select BLK_DEV_BSG_COMMON
@@ -73,7 +76,7 @@ config BLK_DEV_ZONED
 
 config BLK_DEV_THROTTLING
 	bool "Block layer bio throttling support"
-	depends on BLK_CGROUP=y
+	depends on BLK_CGROUP
 	select BLK_CGROUP_RWSTAT
 	help
 	  Block layer bio throttling support. It can be used to limit
@@ -112,7 +115,7 @@ config BLK_WBT_MQ
 
 config BLK_CGROUP_IOLATENCY
 	bool "Enable support for latency based cgroup IO protection"
-	depends on BLK_CGROUP=y
+	depends on BLK_CGROUP
 	help
 	  Enabling this option enables the .latency interface for IO throttling.
 	  The IO controller will attempt to maintain average IO latencies below
@@ -132,7 +135,7 @@ config BLK_CGROUP_FC_APPID
 
 config BLK_CGROUP_IOCOST
 	bool "Enable support for cost model based cgroup IO controller"
-	depends on BLK_CGROUP=y
+	depends on BLK_CGROUP
 	select BLK_RQ_IO_DATA_LEN
 	select BLK_RQ_ALLOC_TIME
 	help
@@ -190,39 +193,31 @@ config BLK_INLINE_ENCRYPTION_FALLBACK
 	  by falling back to the kernel crypto API when inline
 	  encryption hardware is not present.
 
-menu "Partition Types"
-
 source "block/partitions/Kconfig"
 
-endmenu
-
-endif # BLOCK
-
 config BLOCK_COMPAT
-	bool
-	depends on BLOCK && COMPAT
-	default y
+	def_bool COMPAT
 
 config BLK_MQ_PCI
-	bool
-	depends on BLOCK && PCI
-	default y
+	def_bool PCI
 
 config BLK_MQ_VIRTIO
 	bool
-	depends on BLOCK && VIRTIO
+	depends on VIRTIO
 	default y
 
 config BLK_MQ_RDMA
 	bool
-	depends on BLOCK && INFINIBAND
+	depends on INFINIBAND
 	default y
 
 config BLK_PM
-	def_bool BLOCK && PM
+	def_bool PM
 
 # do not use in new code
 config BLOCK_HOLDER_DEPRECATED
 	bool
 
 source "block/Kconfig.iosched"
+
+endif # BLOCK

block/Kconfig.iosched

@@ -1,6 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0
-if BLOCK
-
 menu "IO Schedulers"
 
 config MQ_IOSCHED_DEADLINE
@@ -20,6 +18,7 @@ config MQ_IOSCHED_KYBER
 
 config IOSCHED_BFQ
 	tristate "BFQ I/O scheduler"
+	select BLK_ICQ
 	help
 	  BFQ I/O scheduler for BLK-MQ. BFQ distributes the bandwidth of
 	  of the device among all processes according to their weights,
@@ -45,5 +44,3 @@ config BFQ_CGROUP_DEBUG
 	  files in a cgroup which can be useful for debugging.
 
 endmenu
-
-endif

block/Makefile

@@ -3,13 +3,13 @@
 # Makefile for the kernel block layer
 #
 
-obj-$(CONFIG_BLOCK) := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
+obj-y		:= bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
 			blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
-			blk-exec.o blk-merge.o blk-timeout.o \
+			blk-merge.o blk-timeout.o \
 			blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
 			blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
 			genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \
-			disk-events.o
+			disk-events.o blk-ia-ranges.o
 
 obj-$(CONFIG_BOUNCE)	+= bounce.o
 obj-$(CONFIG_BLK_DEV_BSG_COMMON) += bsg.o
@@ -36,6 +36,6 @@ obj-$(CONFIG_BLK_DEBUG_FS)	+= blk-mq-debugfs.o
 obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o
 obj-$(CONFIG_BLK_SED_OPAL)	+= sed-opal.o
 obj-$(CONFIG_BLK_PM)		+= blk-pm.o
-obj-$(CONFIG_BLK_INLINE_ENCRYPTION)	+= keyslot-manager.o blk-crypto.o
+obj-$(CONFIG_BLK_INLINE_ENCRYPTION)	+= blk-crypto.o blk-crypto-profile.o
 obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK)	+= blk-crypto-fallback.o
 obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED)	+= holder.o

block/bdev.c

@ -12,6 +12,7 @@
#include <linux/major.h> #include <linux/major.h>
#include <linux/device_cgroup.h> #include <linux/device_cgroup.h>
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/backing-dev.h> #include <linux/backing-dev.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/blkpg.h> #include <linux/blkpg.h>
@ -23,7 +24,6 @@
#include <linux/pseudo_fs.h> #include <linux/pseudo_fs.h>
#include <linux/uio.h> #include <linux/uio.h>
#include <linux/namei.h> #include <linux/namei.h>
#include <linux/cleancache.h>
#include <linux/part_stat.h> #include <linux/part_stat.h>
#include <linux/uaccess.h> #include <linux/uaccess.h>
#include "../fs/internal.h" #include "../fs/internal.h"
@ -87,10 +87,6 @@ void invalidate_bdev(struct block_device *bdev)
lru_add_drain_all(); /* make sure all lru add caches are flushed */ lru_add_drain_all(); /* make sure all lru add caches are flushed */
invalidate_mapping_pages(mapping, 0, -1); invalidate_mapping_pages(mapping, 0, -1);
} }
/* 99% of the time, we don't need to flush the cleancache on the bdev.
* But, for the strange corners, lets be cautious
*/
cleancache_invalidate_inode(mapping);
} }
EXPORT_SYMBOL(invalidate_bdev); EXPORT_SYMBOL(invalidate_bdev);
@ -184,14 +180,13 @@ int sb_min_blocksize(struct super_block *sb, int size)
EXPORT_SYMBOL(sb_min_blocksize); EXPORT_SYMBOL(sb_min_blocksize);
int __sync_blockdev(struct block_device *bdev, int wait) int sync_blockdev_nowait(struct block_device *bdev)
{ {
if (!bdev) if (!bdev)
return 0; return 0;
if (!wait) return filemap_flush(bdev->bd_inode->i_mapping);
return filemap_flush(bdev->bd_inode->i_mapping);
return filemap_write_and_wait(bdev->bd_inode->i_mapping);
} }
EXPORT_SYMBOL_GPL(sync_blockdev_nowait);
/* /*
* Write out and wait upon all the dirty data associated with a block * Write out and wait upon all the dirty data associated with a block
@ -199,7 +194,9 @@ int __sync_blockdev(struct block_device *bdev, int wait)
*/ */
int sync_blockdev(struct block_device *bdev) int sync_blockdev(struct block_device *bdev)
{ {
return __sync_blockdev(bdev, 1); if (!bdev)
return 0;
return filemap_write_and_wait(bdev->bd_inode->i_mapping);
} }
EXPORT_SYMBOL(sync_blockdev); EXPORT_SYMBOL(sync_blockdev);
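The hunk above splits the old __sync_blockdev(bdev, wait) into two explicitly named helpers: sync_blockdev_nowait() only kicks off writeback (filemap_flush()), while sync_blockdev() writes and waits. A rough userspace analogue of the same start-versus-wait split, using sync_file_range(2) and fsync(2) on an ordinary file; the helper names are illustrative, not kernel API:

/* Minimal sketch: start writeback without waiting vs. write-and-wait,
 * mirroring the sync_blockdev_nowait()/sync_blockdev() split above.
 * Userspace analogue only; sync_file_range() is Linux-specific. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Kick off writeback of dirty pages, do not wait (like filemap_flush()). */
static int file_sync_nowait(int fd)
{
	return sync_file_range(fd, 0, 0, SYNC_FILE_RANGE_WRITE);
}

/* Write out and wait for completion (like filemap_write_and_wait()). */
static int file_sync_wait(int fd)
{
	return fsync(fd);
}

int main(void)
{
	int fd = open("testfile.tmp", O_CREAT | O_WRONLY, 0644);

	if (fd < 0)
		return 1;
	if (write(fd, "data\n", 5) != 5)
		perror("write");
	if (file_sync_nowait(fd))
		perror("sync_file_range");
	if (file_sync_wait(fd))
		perror("fsync");
	close(fd);
	return 0;
}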
@ -326,12 +323,12 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
if (!ops->rw_page || bdev_get_integrity(bdev)) if (!ops->rw_page || bdev_get_integrity(bdev))
return result; return result;
result = blk_queue_enter(bdev->bd_disk->queue, 0); result = blk_queue_enter(bdev_get_queue(bdev), 0);
if (result) if (result)
return result; return result;
result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
REQ_OP_READ); REQ_OP_READ);
blk_queue_exit(bdev->bd_disk->queue); blk_queue_exit(bdev_get_queue(bdev));
return result; return result;
} }
@ -362,7 +359,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
if (!ops->rw_page || bdev_get_integrity(bdev)) if (!ops->rw_page || bdev_get_integrity(bdev))
return -EOPNOTSUPP; return -EOPNOTSUPP;
result = blk_queue_enter(bdev->bd_disk->queue, 0); result = blk_queue_enter(bdev_get_queue(bdev), 0);
if (result) if (result)
return result; return result;
@ -375,7 +372,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
clean_page_buffers(page); clean_page_buffers(page);
unlock_page(page); unlock_page(page);
} }
blk_queue_exit(bdev->bd_disk->queue); blk_queue_exit(bdev_get_queue(bdev));
return result; return result;
} }
@ -492,6 +489,7 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
spin_lock_init(&bdev->bd_size_lock); spin_lock_init(&bdev->bd_size_lock);
bdev->bd_partno = partno; bdev->bd_partno = partno;
bdev->bd_inode = inode; bdev->bd_inode = inode;
bdev->bd_queue = disk->queue;
bdev->bd_stats = alloc_percpu(struct disk_stats); bdev->bd_stats = alloc_percpu(struct disk_stats);
if (!bdev->bd_stats) { if (!bdev->bd_stats) {
iput(inode); iput(inode);
@ -662,7 +660,7 @@ static void blkdev_flush_mapping(struct block_device *bdev)
static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) static int blkdev_get_whole(struct block_device *bdev, fmode_t mode)
{ {
struct gendisk *disk = bdev->bd_disk; struct gendisk *disk = bdev->bd_disk;
int ret = 0; int ret;
if (disk->fops->open) { if (disk->fops->open) {
ret = disk->fops->open(bdev, mode); ret = disk->fops->open(bdev, mode);
@ -747,21 +745,11 @@ struct block_device *blkdev_get_no_open(dev_t dev)
if (!kobject_get_unless_zero(&bdev->bd_device.kobj)) if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
bdev = NULL; bdev = NULL;
iput(inode); iput(inode);
if (!bdev)
return NULL;
if ((bdev->bd_disk->flags & GENHD_FL_HIDDEN) ||
!try_module_get(bdev->bd_disk->fops->owner)) {
put_device(&bdev->bd_device);
return NULL;
}
return bdev; return bdev;
} }
void blkdev_put_no_open(struct block_device *bdev) void blkdev_put_no_open(struct block_device *bdev)
{ {
module_put(bdev->bd_disk->fops->owner);
put_device(&bdev->bd_device); put_device(&bdev->bd_device);
} }
@ -817,12 +805,14 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
ret = -ENXIO; ret = -ENXIO;
if (!disk_live(disk)) if (!disk_live(disk))
goto abort_claiming; goto abort_claiming;
if (!try_module_get(disk->fops->owner))
goto abort_claiming;
if (bdev_is_partition(bdev)) if (bdev_is_partition(bdev))
ret = blkdev_get_part(bdev, mode); ret = blkdev_get_part(bdev, mode);
else else
ret = blkdev_get_whole(bdev, mode); ret = blkdev_get_whole(bdev, mode);
if (ret) if (ret)
goto abort_claiming; goto put_module;
if (mode & FMODE_EXCL) { if (mode & FMODE_EXCL) {
bd_finish_claiming(bdev, holder); bd_finish_claiming(bdev, holder);
@ -834,7 +824,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
* used in blkdev_get/put(). * used in blkdev_get/put().
*/ */
if ((mode & FMODE_WRITE) && !bdev->bd_write_holder && if ((mode & FMODE_WRITE) && !bdev->bd_write_holder &&
(disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) { (disk->event_flags & DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE)) {
bdev->bd_write_holder = true; bdev->bd_write_holder = true;
unblock_events = false; unblock_events = false;
} }
@ -844,7 +834,8 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
if (unblock_events) if (unblock_events)
disk_unblock_events(disk); disk_unblock_events(disk);
return bdev; return bdev;
put_module:
module_put(disk->fops->owner);
abort_claiming: abort_claiming:
if (mode & FMODE_EXCL) if (mode & FMODE_EXCL)
bd_abort_claiming(bdev, holder); bd_abort_claiming(bdev, holder);
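The error-path change above moves the module reference out of blkdev_get_no_open() and into blkdev_get_by_dev(): the reference is taken only once the open is really going ahead, and a new put_module: label (plus a matching module_put() in blkdev_put()) unwinds it on failure. A small sketch of that acquire-late, unwind-by-label ordering, with stand-in functions rather than real kernel APIs:

/* Sketch of the ordering used above: take the driver/module reference
 * only when the open really proceeds, and release it on the error path
 * via a dedicated label. All functions here are stand-ins. */
#include <stdbool.h>
#include <stdio.h>

static bool module_ref_held;

static bool try_module_get_stub(void) { module_ref_held = true; return true; }
static void module_put_stub(void)     { module_ref_held = false; }
static int  open_device_stub(bool ok) { return ok ? 0 : -1; }

static int device_open(bool open_succeeds)
{
	int ret = -1;

	if (!try_module_get_stub())
		goto out;			/* nothing to undo yet */

	ret = open_device_stub(open_succeeds);
	if (ret)
		goto put_module;		/* undo only what we took */

	return 0;				/* success: ref stays held */

put_module:
	module_put_stub();
out:
	return ret;
}

int main(void)
{
	int ret = device_open(true);

	printf("open ok:   %d (ref held: %d)\n", ret, module_ref_held);
	module_put_stub();			/* paired put on the close path */
	ret = device_open(false);
	printf("open fail: %d (ref held: %d)\n", ret, module_ref_held);
	return 0;
}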
@ -953,18 +944,21 @@ void blkdev_put(struct block_device *bdev, fmode_t mode)
blkdev_put_whole(bdev, mode); blkdev_put_whole(bdev, mode);
mutex_unlock(&disk->open_mutex); mutex_unlock(&disk->open_mutex);
module_put(disk->fops->owner);
blkdev_put_no_open(bdev); blkdev_put_no_open(bdev);
} }
EXPORT_SYMBOL(blkdev_put); EXPORT_SYMBOL(blkdev_put);
/** /**
* lookup_bdev - lookup a struct block_device by name * lookup_bdev() - Look up a struct block_device by name.
* @pathname: special file representing the block device * @pathname: Name of the block device in the filesystem.
* @dev: return value of the block device's dev_t * @dev: Pointer to the block device's dev_t, if found.
* *
* Get a reference to the blockdevice at @pathname in the current * Lookup the block device's dev_t at @pathname in the current
* namespace if possible and return it. Return ERR_PTR(error) * namespace if possible and return it in @dev.
* otherwise. *
* Context: May sleep.
* Return: 0 if succeeded, negative errno otherwise.
*/ */
int lookup_bdev(const char *pathname, dev_t *dev) int lookup_bdev(const char *pathname, dev_t *dev)
{ {
@ -1016,7 +1010,7 @@ int __invalidate_device(struct block_device *bdev, bool kill_dirty)
} }
EXPORT_SYMBOL(__invalidate_device); EXPORT_SYMBOL(__invalidate_device);
void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) void sync_bdevs(bool wait)
{ {
struct inode *inode, *old_inode = NULL; struct inode *inode, *old_inode = NULL;
@ -1047,8 +1041,19 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
bdev = I_BDEV(inode); bdev = I_BDEV(inode);
mutex_lock(&bdev->bd_disk->open_mutex); mutex_lock(&bdev->bd_disk->open_mutex);
if (bdev->bd_openers) if (!bdev->bd_openers) {
func(bdev, arg); ; /* skip */
} else if (wait) {
/*
* We keep the error status of individual mapping so
* that applications can catch the writeback error using
* fsync(2). See filemap_fdatawait_keep_errors() for
* details.
*/
filemap_fdatawait_keep_errors(inode->i_mapping);
} else {
filemap_fdatawrite(inode->i_mapping);
}
mutex_unlock(&bdev->bd_disk->open_mutex); mutex_unlock(&bdev->bd_disk->open_mutex);
spin_lock(&blockdev_superblock->s_inode_list_lock); spin_lock(&blockdev_superblock->s_inode_list_lock);
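iterate_bdevs() with a callback becomes sync_bdevs(bool wait): block devices with no openers are skipped, the !wait case only starts writeback, and the wait case uses filemap_fdatawait_keep_errors() so each mapping keeps its own writeback error for a later fsync(2). A loose userspace sketch of the same control flow over an array of file descriptors (purely illustrative):

/* Sketch of the sync_bdevs(wait) control flow above: skip entries that
 * are not open, start writeback in the !wait case, and in the wait case
 * collect each file's own error (userspace: fsync reports it per fd). */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static void sync_fds(const int *fds, int nr, bool wait)
{
	for (int i = 0; i < nr; i++) {
		if (fds[i] < 0) {
			;	/* skip: not open, nothing to write back */
		} else if (wait) {
			if (fsync(fds[i]))	/* per-file error, not global */
				perror("fsync");
		} else {
			sync_file_range(fds[i], 0, 0, SYNC_FILE_RANGE_WRITE);
		}
	}
}

int main(void)
{
	int fds[2] = { open("a.tmp", O_CREAT | O_WRONLY, 0644), -1 };

	sync_fds(fds, 2, false);	/* start writeback */
	sync_fds(fds, 2, true);		/* wait and report errors */
	if (fds[0] >= 0)
		close(fds[0]);
	return 0;
}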

block/bfq-cgroup.c

@ -6,13 +6,13 @@
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/cgroup.h> #include <linux/cgroup.h>
#include <linux/elevator.h>
#include <linux/ktime.h> #include <linux/ktime.h>
#include <linux/rbtree.h> #include <linux/rbtree.h>
#include <linux/ioprio.h> #include <linux/ioprio.h>
#include <linux/sbitmap.h> #include <linux/sbitmap.h>
#include <linux/delay.h> #include <linux/delay.h>
#include "elevator.h"
#include "bfq-iosched.h" #include "bfq-iosched.h"
#ifdef CONFIG_BFQ_CGROUP_DEBUG #ifdef CONFIG_BFQ_CGROUP_DEBUG
@ -463,7 +463,7 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
{ {
if (blkg_rwstat_init(&stats->bytes, gfp) || if (blkg_rwstat_init(&stats->bytes, gfp) ||
blkg_rwstat_init(&stats->ios, gfp)) blkg_rwstat_init(&stats->ios, gfp))
return -ENOMEM; goto error;
#ifdef CONFIG_BFQ_CGROUP_DEBUG #ifdef CONFIG_BFQ_CGROUP_DEBUG
if (blkg_rwstat_init(&stats->merged, gfp) || if (blkg_rwstat_init(&stats->merged, gfp) ||
@ -476,13 +476,15 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
bfq_stat_init(&stats->dequeue, gfp) || bfq_stat_init(&stats->dequeue, gfp) ||
bfq_stat_init(&stats->group_wait_time, gfp) || bfq_stat_init(&stats->group_wait_time, gfp) ||
bfq_stat_init(&stats->idle_time, gfp) || bfq_stat_init(&stats->idle_time, gfp) ||
bfq_stat_init(&stats->empty_time, gfp)) { bfq_stat_init(&stats->empty_time, gfp))
bfqg_stats_exit(stats); goto error;
return -ENOMEM;
}
#endif #endif
return 0; return 0;
error:
bfqg_stats_exit(stats);
return -ENOMEM;
} }
static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd) static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
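The two bfq-cgroup.c hunks above collapse the duplicated failure handling in bfqg_stats_init() into a single error: label that calls bfqg_stats_exit() on whatever was already initialised. A minimal sketch of that idiom with stand-in allocations:

/* Sketch of the single-error-label idiom adopted in bfqg_stats_init():
 * every failed allocation jumps to one label that frees whatever was
 * already set up. Types and helpers here are illustrative stand-ins. */
#include <stdlib.h>

struct stats {
	int *bytes;
	int *ios;
	int *merged;
};

static void stats_exit(struct stats *s)
{
	/* free() tolerates NULL, so partial initialisation is fine */
	free(s->bytes);
	free(s->ios);
	free(s->merged);
}

static int stats_init(struct stats *s)
{
	s->bytes = calloc(1, sizeof(*s->bytes));
	s->ios   = calloc(1, sizeof(*s->ios));
	if (!s->bytes || !s->ios)
		goto error;

	s->merged = calloc(1, sizeof(*s->merged));
	if (!s->merged)
		goto error;

	return 0;

error:
	stats_exit(s);
	return -1;			/* -ENOMEM in the kernel version */
}

int main(void)
{
	struct stats s = { 0 };

	if (stats_init(&s))
		return 1;
	stats_exit(&s);
	return 0;
}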

block/bfq-iosched.c

@ -117,7 +117,6 @@
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/cgroup.h> #include <linux/cgroup.h>
#include <linux/elevator.h>
#include <linux/ktime.h> #include <linux/ktime.h>
#include <linux/rbtree.h> #include <linux/rbtree.h>
#include <linux/ioprio.h> #include <linux/ioprio.h>
@ -127,6 +126,7 @@
#include <trace/events/block.h> #include <trace/events/block.h>
#include "elevator.h"
#include "blk.h" #include "blk.h"
#include "blk-mq.h" #include "blk-mq.h"
#include "blk-mq-tag.h" #include "blk-mq-tag.h"
@ -433,26 +433,21 @@ static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
/** /**
* bfq_bic_lookup - search into @ioc a bic associated to @bfqd. * bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
* @bfqd: the lookup key.
* @ioc: the io_context of the process doing I/O.
* @q: the request queue. * @q: the request queue.
*/ */
static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q)
struct io_context *ioc,
struct request_queue *q)
{ {
if (ioc) { struct bfq_io_cq *icq;
unsigned long flags; unsigned long flags;
struct bfq_io_cq *icq;
spin_lock_irqsave(&q->queue_lock, flags); if (!current->io_context)
icq = icq_to_bic(ioc_lookup_icq(ioc, q)); return NULL;
spin_unlock_irqrestore(&q->queue_lock, flags);
return icq; spin_lock_irqsave(&q->queue_lock, flags);
} icq = icq_to_bic(ioc_lookup_icq(q));
spin_unlock_irqrestore(&q->queue_lock, flags);
return NULL; return icq;
} }
/* /*
@ -565,26 +560,134 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd,
} }
} }
#define BFQ_LIMIT_INLINE_DEPTH 16
#ifdef CONFIG_BFQ_GROUP_IOSCHED
static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit)
{
struct bfq_data *bfqd = bfqq->bfqd;
struct bfq_entity *entity = &bfqq->entity;
struct bfq_entity *inline_entities[BFQ_LIMIT_INLINE_DEPTH];
struct bfq_entity **entities = inline_entities;
int depth, level;
int class_idx = bfqq->ioprio_class - 1;
struct bfq_sched_data *sched_data;
unsigned long wsum;
bool ret = false;
if (!entity->on_st_or_in_serv)
return false;
/* +1 for bfqq entity, root cgroup not included */
depth = bfqg_to_blkg(bfqq_group(bfqq))->blkcg->css.cgroup->level + 1;
if (depth > BFQ_LIMIT_INLINE_DEPTH) {
entities = kmalloc_array(depth, sizeof(*entities), GFP_NOIO);
if (!entities)
return false;
}
spin_lock_irq(&bfqd->lock);
sched_data = entity->sched_data;
/* Gather our ancestors as we need to traverse them in reverse order */
level = 0;
for_each_entity(entity) {
/*
* If at some level entity is not even active, allow request
* queueing so that BFQ knows there's work to do and activate
* entities.
*/
if (!entity->on_st_or_in_serv)
goto out;
/* Uh, more parents than cgroup subsystem thinks? */
if (WARN_ON_ONCE(level >= depth))
break;
entities[level++] = entity;
}
WARN_ON_ONCE(level != depth);
for (level--; level >= 0; level--) {
entity = entities[level];
if (level > 0) {
wsum = bfq_entity_service_tree(entity)->wsum;
} else {
int i;
/*
* For bfqq itself we take into account service trees
* of all higher priority classes and multiply their
* weights so that low prio queue from higher class
* gets more requests than high prio queue from lower
* class.
*/
wsum = 0;
for (i = 0; i <= class_idx; i++) {
wsum = wsum * IOPRIO_BE_NR +
sched_data->service_tree[i].wsum;
}
}
limit = DIV_ROUND_CLOSEST(limit * entity->weight, wsum);
if (entity->allocated >= limit) {
bfq_log_bfqq(bfqq->bfqd, bfqq,
"too many requests: allocated %d limit %d level %d",
entity->allocated, limit, level);
ret = true;
break;
}
}
out:
spin_unlock_irq(&bfqd->lock);
if (entities != inline_entities)
kfree(entities);
return ret;
}
#else
static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit)
{
return false;
}
#endif
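bfqq_request_over_limit(), added above, walks from the topmost gathered ancestor down to the queue itself, rescaling the request limit at each level by weight/wsum (rounded to nearest) and reporting the queue as over its share once entity->allocated reaches that level's limit. A stand-alone sketch of just the scaling arithmetic, with made-up weights and without the extra class weighting the real code applies at the leaf level:

/* Arithmetic sketch of the per-level limit scaling in
 * bfqq_request_over_limit(): each level gets the parent's limit times
 * weight/wsum, rounded to nearest. Weights and counts are made up. */
#include <stdbool.h>
#include <stdio.h>

#define DIV_ROUND_CLOSEST(x, d) (((x) + (d) / 2) / (d))

struct level {
	unsigned int weight;	/* this entity's weight */
	unsigned int wsum;	/* sum of weights at this level */
	unsigned int allocated;	/* requests already allocated in subtree */
};

static bool over_limit(const struct level *lv, int depth, unsigned int limit)
{
	for (int i = 0; i < depth; i++) {
		limit = DIV_ROUND_CLOSEST(limit * lv[i].weight, lv[i].wsum);
		if (lv[i].allocated >= limit) {
			printf("level %d over limit: allocated %u limit %u\n",
			       i, lv[i].allocated, limit);
			return true;
		}
	}
	return false;
}

int main(void)
{
	/* a cgroup holding 1/4 of the weight, a queue holding 1/2 inside it */
	struct level lv[] = {
		{ .weight = 100, .wsum = 400, .allocated = 40 },
		{ .weight = 100, .wsum = 200, .allocated = 40 },
	};

	/* 256 scheduler tags overall: shares are 64 then 32 */
	printf("over: %d\n", over_limit(lv, 2, 256));
	return 0;
}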
/* /*
* Async I/O can easily starve sync I/O (both sync reads and sync * Async I/O can easily starve sync I/O (both sync reads and sync
* writes), by consuming all tags. Similarly, storms of sync writes, * writes), by consuming all tags. Similarly, storms of sync writes,
* such as those that sync(2) may trigger, can starve sync reads. * such as those that sync(2) may trigger, can starve sync reads.
* Limit depths of async I/O and sync writes so as to counter both * Limit depths of async I/O and sync writes so as to counter both
* problems. * problems.
*
* Also if a bfq queue or its parent cgroup consume more tags than would be
* appropriate for their weight, we trim the available tag depth to 1. This
* avoids a situation where one cgroup can starve another cgroup from tags and
* thus block service differentiation among cgroups. Note that because the
* queue / cgroup already has many requests allocated and queued, this does not
* significantly affect service guarantees coming from the BFQ scheduling
* algorithm.
*/ */
static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
{ {
struct bfq_data *bfqd = data->q->elevator->elevator_data; struct bfq_data *bfqd = data->q->elevator->elevator_data;
struct bfq_io_cq *bic = bfq_bic_lookup(data->q);
struct bfq_queue *bfqq = bic ? bic_to_bfqq(bic, op_is_sync(op)) : NULL;
int depth;
unsigned limit = data->q->nr_requests;
if (op_is_sync(op) && !op_is_write(op)) /* Sync reads have full depth available */
return; if (op_is_sync(op) && !op_is_write(op)) {
depth = 0;
} else {
depth = bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)];
limit = (limit * depth) >> bfqd->full_depth_shift;
}
data->shallow_depth = /*
bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; * Does queue (or any parent entity) exceed number of requests that
* should be available to it? Heavily limit depth so that it cannot
* consume more available requests and thus starve other entities.
*/
if (bfqq && bfqq_request_over_limit(bfqq, limit))
depth = 1;
bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u",
__func__, bfqd->wr_busy_queues, op_is_sync(op), __func__, bfqd->wr_busy_queues, op_is_sync(op), depth);
data->shallow_depth); if (depth)
data->shallow_depth = depth;
} }
static struct bfq_queue * static struct bfq_queue *
@ -1113,7 +1216,8 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
static int bfqq_process_refs(struct bfq_queue *bfqq) static int bfqq_process_refs(struct bfq_queue *bfqq)
{ {
return bfqq->ref - bfqq->allocated - bfqq->entity.on_st_or_in_serv - return bfqq->ref - bfqq->entity.allocated -
bfqq->entity.on_st_or_in_serv -
(bfqq->weight_counter != NULL) - bfqq->stable_ref; (bfqq->weight_counter != NULL) - bfqq->stable_ref;
} }
@ -1982,20 +2086,19 @@ static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns)
* aspect, see the comments on the choice of the queue for injection * aspect, see the comments on the choice of the queue for injection
* in bfq_select_queue(). * in bfq_select_queue().
* *
* Turning back to the detection of a waker queue, a queue Q is deemed * Turning back to the detection of a waker queue, a queue Q is deemed as a
* as a waker queue for bfqq if, for three consecutive times, bfqq * waker queue for bfqq if, for three consecutive times, bfqq happens to become
* happens to become non empty right after a request of Q has been * non empty right after a request of Q has been completed within given
* completed. In this respect, even if bfqq is empty, we do not check * timeout. In this respect, even if bfqq is empty, we do not check for a waker
* for a waker if it still has some in-flight I/O. In fact, in this * if it still has some in-flight I/O. In fact, in this case bfqq is actually
* case bfqq is actually still being served by the drive, and may * still being served by the drive, and may receive new I/O on the completion
* receive new I/O on the completion of some of the in-flight * of some of the in-flight requests. In particular, on the first time, Q is
* requests. In particular, on the first time, Q is tentatively set as * tentatively set as a candidate waker queue, while on the third consecutive
* a candidate waker queue, while on the third consecutive time that Q * time that Q is detected, the field waker_bfqq is set to Q, to confirm that Q
* is detected, the field waker_bfqq is set to Q, to confirm that Q is * is a waker queue for bfqq. These detection steps are performed only if bfqq
* a waker queue for bfqq. These detection steps are performed only if * has a long think time, so as to make it more likely that bfqq's I/O is
* bfqq has a long think time, so as to make it more likely that * actually being blocked by a synchronization. This last filter, plus the
* bfqq's I/O is actually being blocked by a synchronization. This * above three-times requirement and time limit for detection, make false
* last filter, plus the above three-times requirement, make false
* positives less likely. * positives less likely.
* *
* NOTE * NOTE
@ -2019,6 +2122,8 @@ static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns)
static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq,
u64 now_ns) u64 now_ns)
{ {
char waker_name[MAX_BFQQ_NAME_LENGTH];
if (!bfqd->last_completed_rq_bfqq || if (!bfqd->last_completed_rq_bfqq ||
bfqd->last_completed_rq_bfqq == bfqq || bfqd->last_completed_rq_bfqq == bfqq ||
bfq_bfqq_has_short_ttime(bfqq) || bfq_bfqq_has_short_ttime(bfqq) ||
@ -2027,8 +2132,16 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfqd->last_completed_rq_bfqq == bfqq->waker_bfqq) bfqd->last_completed_rq_bfqq == bfqq->waker_bfqq)
return; return;
/*
* We reset waker detection logic also if too much time has passed
* since the first detection. If wakeups are rare, pointless idling
* doesn't hurt throughput that much. The condition below makes sure
* we do not uselessly idle blocking waker in more than 1/64 cases.
*/
if (bfqd->last_completed_rq_bfqq != if (bfqd->last_completed_rq_bfqq !=
bfqq->tentative_waker_bfqq) { bfqq->tentative_waker_bfqq ||
now_ns > bfqq->waker_detection_started +
128 * (u64)bfqd->bfq_slice_idle) {
/* /*
* First synchronization detected with a * First synchronization detected with a
* candidate waker queue, or with a different * candidate waker queue, or with a different
@ -2037,12 +2150,19 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfqq->tentative_waker_bfqq = bfqq->tentative_waker_bfqq =
bfqd->last_completed_rq_bfqq; bfqd->last_completed_rq_bfqq;
bfqq->num_waker_detections = 1; bfqq->num_waker_detections = 1;
bfqq->waker_detection_started = now_ns;
bfq_bfqq_name(bfqq->tentative_waker_bfqq, waker_name,
MAX_BFQQ_NAME_LENGTH);
bfq_log_bfqq(bfqd, bfqq, "set tenative waker %s", waker_name);
} else /* Same tentative waker queue detected again */ } else /* Same tentative waker queue detected again */
bfqq->num_waker_detections++; bfqq->num_waker_detections++;
if (bfqq->num_waker_detections == 3) { if (bfqq->num_waker_detections == 3) {
bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq; bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq;
bfqq->tentative_waker_bfqq = NULL; bfqq->tentative_waker_bfqq = NULL;
bfq_bfqq_name(bfqq->waker_bfqq, waker_name,
MAX_BFQQ_NAME_LENGTH);
bfq_log_bfqq(bfqd, bfqq, "set waker %s", waker_name);
/* /*
* If the waker queue disappears, then * If the waker queue disappears, then
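The hunks above add a time bound to waker detection: the tentative waker is restarted not only when a different queue completes a request, but also when more than 128 * bfq_slice_idle has passed since waker_detection_started. A compact sketch of that reset rule; the values are made up and the integer "ids" stand in for the bfq_queue pointers the real code compares:

/* Sketch of the waker-detection reset rule added above: start over when
 * the completing queue changes or when the first detection is older
 * than 128 * slice_idle; confirm the waker on the third hit. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct waker_state {
	int tentative_waker;		/* id of the candidate waker queue */
	unsigned int detections;
	uint64_t detection_started;	/* ns */
};

static bool note_completion(struct waker_state *w, int completing_queue,
			    uint64_t now_ns, uint64_t slice_idle_ns)
{
	if (completing_queue != w->tentative_waker ||
	    now_ns > w->detection_started + 128 * slice_idle_ns) {
		/* new candidate (or stale detection): start over */
		w->tentative_waker = completing_queue;
		w->detections = 1;
		w->detection_started = now_ns;
	} else {
		w->detections++;
	}
	return w->detections == 3;	/* confirmed waker */
}

int main(void)
{
	struct waker_state w = { .tentative_waker = -1 };
	uint64_t slice_idle = 8ULL * 1000 * 1000;	/* 8 ms, the BFQ default */

	printf("%d\n", note_completion(&w, 7, 1 * slice_idle, slice_idle));
	printf("%d\n", note_completion(&w, 7, 2 * slice_idle, slice_idle));
	printf("%d\n", note_completion(&w, 7, 3 * slice_idle, slice_idle)); /* 1 */
	return 0;
}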
@ -2332,7 +2452,7 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio,
* returned by bfq_bic_lookup does not go away before * returned by bfq_bic_lookup does not go away before
* bfqd->lock is taken. * bfqd->lock is taken.
*/ */
struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q); struct bfq_io_cq *bic = bfq_bic_lookup(q);
bool ret; bool ret;
spin_lock_irq(&bfqd->lock); spin_lock_irq(&bfqd->lock);
@ -5878,6 +5998,22 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
} }
} }
static void bfqq_request_allocated(struct bfq_queue *bfqq)
{
struct bfq_entity *entity = &bfqq->entity;
for_each_entity(entity)
entity->allocated++;
}
static void bfqq_request_freed(struct bfq_queue *bfqq)
{
struct bfq_entity *entity = &bfqq->entity;
for_each_entity(entity)
entity->allocated--;
}
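With the allocated counter moving from bfq_queue into bfq_entity (see the bfq-iosched.h hunks below), the two helpers above bump and drop the count on the queue and on every parent entity, which is what lets bfqq_request_over_limit() read per-level totals. A sketch of the same bookkeeping over a parent-linked stand-in structure:

/* Sketch of the hierarchical request accounting introduced above: the
 * allocated counter lives on the queue entity and on every parent, and
 * is adjusted along the whole chain. Stand-in types only. */
#include <stdio.h>

struct entity {
	int allocated;
	struct entity *parent;
};

/* analogue of for_each_entity() walking toward the root */
#define for_each_entity(e) for (; (e); (e) = (e)->parent)

static void request_allocated(struct entity *e)
{
	for_each_entity(e)
		e->allocated++;
}

static void request_freed(struct entity *e)
{
	for_each_entity(e)
		e->allocated--;
}

int main(void)
{
	struct entity root = { 0 }, group = { .parent = &root },
		      queue = { .parent = &group };

	request_allocated(&queue);
	request_allocated(&queue);
	request_freed(&queue);
	printf("queue %d group %d root %d\n",
	       queue.allocated, group.allocated, root.allocated);	/* 1 1 1 */
	return 0;
}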
/* returns true if it causes the idle timer to be disabled */ /* returns true if it causes the idle timer to be disabled */
static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
{ {
@ -5891,8 +6027,8 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
* Release the request's reference to the old bfqq * Release the request's reference to the old bfqq
* and make sure one is taken to the shared queue. * and make sure one is taken to the shared queue.
*/ */
new_bfqq->allocated++; bfqq_request_allocated(new_bfqq);
bfqq->allocated--; bfqq_request_freed(bfqq);
new_bfqq->ref++; new_bfqq->ref++;
/* /*
* If the bic associated with the process * If the bic associated with the process
@ -6209,8 +6345,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq) static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq)
{ {
bfqq->allocated--; bfqq_request_freed(bfqq);
bfq_put_queue(bfqq); bfq_put_queue(bfqq);
} }
@ -6434,6 +6569,16 @@ static void bfq_finish_requeue_request(struct request *rq)
rq->elv.priv[1] = NULL; rq->elv.priv[1] = NULL;
} }
static void bfq_finish_request(struct request *rq)
{
bfq_finish_requeue_request(rq);
if (rq->elv.icq) {
put_io_context(rq->elv.icq->ioc);
rq->elv.icq = NULL;
}
}
/* /*
* Removes the association between the current task and bfqq, assuming * Removes the association between the current task and bfqq, assuming
* that bic points to the bfq iocontext of the task. * that bic points to the bfq iocontext of the task.
@ -6531,6 +6676,8 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
*/ */
static void bfq_prepare_request(struct request *rq) static void bfq_prepare_request(struct request *rq)
{ {
rq->elv.icq = ioc_find_get_icq(rq->q);
/* /*
* Regardless of whether we have an icq attached, we have to * Regardless of whether we have an icq attached, we have to
* clear the scheduler pointers, as they might point to * clear the scheduler pointers, as they might point to
@ -6630,7 +6777,7 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
} }
} }
bfqq->allocated++; bfqq_request_allocated(bfqq);
bfqq->ref++; bfqq->ref++;
bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d", bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d",
rq, bfqq, bfqq->ref); rq, bfqq, bfqq->ref);
@ -6793,11 +6940,11 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
* See the comments on bfq_limit_depth for the purpose of * See the comments on bfq_limit_depth for the purpose of
* the depths set in the function. Return minimum shallow depth we'll use. * the depths set in the function. Return minimum shallow depth we'll use.
*/ */
static unsigned int bfq_update_depths(struct bfq_data *bfqd, static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
struct sbitmap_queue *bt)
{ {
unsigned int i, j, min_shallow = UINT_MAX; unsigned int depth = 1U << bt->sb.shift;
bfqd->full_depth_shift = bt->sb.shift;
/* /*
* In-word depths if no bfq_queue is being weight-raised: * In-word depths if no bfq_queue is being weight-raised:
* leaving 25% of tags only for sync reads. * leaving 25% of tags only for sync reads.
@ -6809,13 +6956,13 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd,
* limit 'something'. * limit 'something'.
*/ */
/* no more than 50% of tags for async I/O */ /* no more than 50% of tags for async I/O */
bfqd->word_depths[0][0] = max((1U << bt->sb.shift) >> 1, 1U); bfqd->word_depths[0][0] = max(depth >> 1, 1U);
/* /*
* no more than 75% of tags for sync writes (25% extra tags * no more than 75% of tags for sync writes (25% extra tags
* w.r.t. async I/O, to prevent async I/O from starving sync * w.r.t. async I/O, to prevent async I/O from starving sync
* writes) * writes)
*/ */
bfqd->word_depths[0][1] = max(((1U << bt->sb.shift) * 3) >> 2, 1U); bfqd->word_depths[0][1] = max((depth * 3) >> 2, 1U);
/* /*
* In-word depths in case some bfq_queue is being weight- * In-word depths in case some bfq_queue is being weight-
@ -6825,25 +6972,18 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd,
* shortage. * shortage.
*/ */
/* no more than ~18% of tags for async I/O */ /* no more than ~18% of tags for async I/O */
bfqd->word_depths[1][0] = max(((1U << bt->sb.shift) * 3) >> 4, 1U); bfqd->word_depths[1][0] = max((depth * 3) >> 4, 1U);
/* no more than ~37% of tags for sync writes (~20% extra tags) */ /* no more than ~37% of tags for sync writes (~20% extra tags) */
bfqd->word_depths[1][1] = max(((1U << bt->sb.shift) * 6) >> 4, 1U); bfqd->word_depths[1][1] = max((depth * 6) >> 4, 1U);
for (i = 0; i < 2; i++)
for (j = 0; j < 2; j++)
min_shallow = min(min_shallow, bfqd->word_depths[i][j]);
return min_shallow;
} }
static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx) static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx)
{ {
struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
struct blk_mq_tags *tags = hctx->sched_tags; struct blk_mq_tags *tags = hctx->sched_tags;
unsigned int min_shallow;
min_shallow = bfq_update_depths(bfqd, tags->bitmap_tags); bfq_update_depths(bfqd, &tags->bitmap_tags);
sbitmap_queue_min_shallow_depth(tags->bitmap_tags, min_shallow); sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1);
} }
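bfq_update_depths() now derives its four shallow-depth caps directly from the scheduler tag depth (1U << bt->sb.shift) and records full_depth_shift so bfq_limit_depth() can rescale nr_requests: 50% and 75% of the tags when no queue is weight-raised, roughly 18% and 37% when one is. The same arithmetic as a stand-alone program (the shift value is just an example):

/* The four shallow-depth caps computed in bfq_update_depths(), as plain
 * arithmetic on a tag depth of 1 << shift. */
#include <stdio.h>

#define max_u(a, b) ((a) > (b) ? (a) : (b))

static void update_depths(unsigned int shift, unsigned int wd[2][2])
{
	unsigned int depth = 1U << shift;

	/* no queue weight-raised: 50% async, 75% sync writes */
	wd[0][0] = max_u(depth >> 1, 1U);
	wd[0][1] = max_u((depth * 3) >> 2, 1U);
	/* some queue weight-raised: ~18% async, ~37% sync writes */
	wd[1][0] = max_u((depth * 3) >> 4, 1U);
	wd[1][1] = max_u((depth * 6) >> 4, 1U);
}

int main(void)
{
	unsigned int wd[2][2];

	update_depths(6, wd);	/* 64 tags */
	printf("async %u/%u  sync-write %u/%u (normal/weight-raised)\n",
	       wd[0][0], wd[1][0], wd[0][1], wd[1][1]);
	/* prints: async 32/12  sync-write 48/24 */
	return 0;
}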
static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index) static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index)
@ -7260,7 +7400,7 @@ static struct elevator_type iosched_bfq_mq = {
.limit_depth = bfq_limit_depth, .limit_depth = bfq_limit_depth,
.prepare_request = bfq_prepare_request, .prepare_request = bfq_prepare_request,
.requeue_request = bfq_finish_requeue_request, .requeue_request = bfq_finish_requeue_request,
.finish_request = bfq_finish_requeue_request, .finish_request = bfq_finish_request,
.exit_icq = bfq_exit_icq, .exit_icq = bfq_exit_icq,
.insert_requests = bfq_insert_requests, .insert_requests = bfq_insert_requests,
.dispatch_request = bfq_dispatch_request, .dispatch_request = bfq_dispatch_request,

block/bfq-iosched.h

@ -25,7 +25,7 @@
#define BFQ_DEFAULT_GRP_IOPRIO 0 #define BFQ_DEFAULT_GRP_IOPRIO 0
#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE #define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
#define MAX_PID_STR_LENGTH 12 #define MAX_BFQQ_NAME_LENGTH 16
/* /*
* Soft real-time applications are extremely more latency sensitive * Soft real-time applications are extremely more latency sensitive
@ -170,6 +170,9 @@ struct bfq_entity {
/* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
int budget; int budget;
/* Number of requests allocated in the subtree of this entity */
int allocated;
/* device weight, if non-zero, it overrides the default weight of /* device weight, if non-zero, it overrides the default weight of
* bfq_group_data */ * bfq_group_data */
int dev_weight; int dev_weight;
@ -266,8 +269,6 @@ struct bfq_queue {
struct request *next_rq; struct request *next_rq;
/* number of sync and async requests queued */ /* number of sync and async requests queued */
int queued[2]; int queued[2];
/* number of requests currently allocated */
int allocated;
/* number of pending metadata requests */ /* number of pending metadata requests */
int meta_pending; int meta_pending;
/* fifo list of requests in sort_list */ /* fifo list of requests in sort_list */
@ -387,6 +388,8 @@ struct bfq_queue {
struct bfq_queue *tentative_waker_bfqq; struct bfq_queue *tentative_waker_bfqq;
/* number of times the same tentative waker has been detected */ /* number of times the same tentative waker has been detected */
unsigned int num_waker_detections; unsigned int num_waker_detections;
/* time when we started considering this waker */
u64 waker_detection_started;
/* node for woken_list, see below */ /* node for woken_list, see below */
struct hlist_node woken_list_node; struct hlist_node woken_list_node;
@ -768,6 +771,7 @@ struct bfq_data {
* function) * function)
*/ */
unsigned int word_depths[2][2]; unsigned int word_depths[2][2];
unsigned int full_depth_shift;
}; };
enum bfqq_state_flags { enum bfqq_state_flags {
@ -1079,26 +1083,27 @@ void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq);
/* --------------- end of interface of B-WF2Q+ ---------------- */ /* --------------- end of interface of B-WF2Q+ ---------------- */
/* Logging facilities. */ /* Logging facilities. */
static inline void bfq_pid_to_str(int pid, char *str, int len) static inline void bfq_bfqq_name(struct bfq_queue *bfqq, char *str, int len)
{ {
if (pid != -1) char type = bfq_bfqq_sync(bfqq) ? 'S' : 'A';
snprintf(str, len, "%d", pid);
if (bfqq->pid != -1)
snprintf(str, len, "bfq%d%c", bfqq->pid, type);
else else
snprintf(str, len, "SHARED-"); snprintf(str, len, "bfqSHARED-%c", type);
} }
#ifdef CONFIG_BFQ_GROUP_IOSCHED #ifdef CONFIG_BFQ_GROUP_IOSCHED
struct bfq_group *bfqq_group(struct bfq_queue *bfqq); struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
char pid_str[MAX_PID_STR_LENGTH]; \ char pid_str[MAX_BFQQ_NAME_LENGTH]; \
if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \ if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \
break; \ break; \
bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \ bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH); \
blk_add_cgroup_trace_msg((bfqd)->queue, \ blk_add_cgroup_trace_msg((bfqd)->queue, \
bfqg_to_blkg(bfqq_group(bfqq))->blkcg, \ bfqg_to_blkg(bfqq_group(bfqq))->blkcg, \
"bfq%s%c " fmt, pid_str, \ "%s " fmt, pid_str, ##args); \
bfq_bfqq_sync((bfqq)) ? 'S' : 'A', ##args); \
} while (0) } while (0)
#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \
@ -1109,13 +1114,11 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
#else /* CONFIG_BFQ_GROUP_IOSCHED */ #else /* CONFIG_BFQ_GROUP_IOSCHED */
#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
char pid_str[MAX_PID_STR_LENGTH]; \ char pid_str[MAX_BFQQ_NAME_LENGTH]; \
if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \ if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \
break; \ break; \
bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \ bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH); \
blk_add_trace_msg((bfqd)->queue, "bfq%s%c " fmt, pid_str, \ blk_add_trace_msg((bfqd)->queue, "%s " fmt, pid_str, ##args); \
bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
##args); \
} while (0) } while (0)
#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0)
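bfq-iosched.h replaces MAX_PID_STR_LENGTH (12) with MAX_BFQQ_NAME_LENGTH (16) because bfq_bfqq_name() now folds the "bfq" prefix and the sync/async suffix into the name itself ("bfq%d%c" or "bfqSHARED-%c"). A quick check that 16 bytes still fits the widest case, assuming the usual 64-bit pid_max ceiling of 4194304:

/* Sanity check of MAX_BFQQ_NAME_LENGTH = 16: the widest names are
 * "bfq<pid><S|A>" and "bfqSHARED-<S|A>". */
#include <stdio.h>

#define MAX_BFQQ_NAME_LENGTH 16

int main(void)
{
	char buf[MAX_BFQQ_NAME_LENGTH];
	int n1 = snprintf(buf, sizeof(buf), "bfq%d%c", 4194304, 'S');
	int n2 = snprintf(buf, sizeof(buf), "bfqSHARED-%c", 'A');

	/* both fit with room to spare: 11 characters plus the NUL */
	printf("%d %d (limit %d)\n", n1, n2, MAX_BFQQ_NAME_LENGTH - 1);
	return 0;
}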

block/bio-integrity.c

@ -6,7 +6,7 @@
* Written by: Martin K. Petersen <martin.petersen@oracle.com> * Written by: Martin K. Petersen <martin.petersen@oracle.com>
*/ */
#include <linux/blkdev.h> #include <linux/blk-integrity.h>
#include <linux/mempool.h> #include <linux/mempool.h>
#include <linux/export.h> #include <linux/export.h>
#include <linux/bio.h> #include <linux/bio.h>
@ -134,7 +134,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
iv = bip->bip_vec + bip->bip_vcnt; iv = bip->bip_vec + bip->bip_vcnt;
if (bip->bip_vcnt && if (bip->bip_vcnt &&
bvec_gap_to_prev(bio->bi_bdev->bd_disk->queue, bvec_gap_to_prev(bdev_get_queue(bio->bi_bdev),
&bip->bip_vec[bip->bip_vcnt - 1], offset)) &bip->bip_vec[bip->bip_vcnt - 1], offset))
return 0; return 0;

block/bio.c

@ -26,7 +26,7 @@
#include "blk-rq-qos.h" #include "blk-rq-qos.h"
struct bio_alloc_cache { struct bio_alloc_cache {
struct bio_list free_list; struct bio *free_list;
unsigned int nr; unsigned int nr;
}; };
@ -87,7 +87,8 @@ static struct bio_slab *create_bio_slab(unsigned int size)
snprintf(bslab->name, sizeof(bslab->name), "bio-%d", size); snprintf(bslab->name, sizeof(bslab->name), "bio-%d", size);
bslab->slab = kmem_cache_create(bslab->name, size, bslab->slab = kmem_cache_create(bslab->name, size,
ARCH_KMALLOC_MINALIGN, SLAB_HWCACHE_ALIGN, NULL); ARCH_KMALLOC_MINALIGN,
SLAB_HWCACHE_ALIGN | SLAB_TYPESAFE_BY_RCU, NULL);
if (!bslab->slab) if (!bslab->slab)
goto fail_alloc_slab; goto fail_alloc_slab;
@ -156,7 +157,7 @@ static void bio_put_slab(struct bio_set *bs)
void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs) void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs)
{ {
BIO_BUG_ON(nr_vecs > BIO_MAX_VECS); BUG_ON(nr_vecs > BIO_MAX_VECS);
if (nr_vecs == BIO_MAX_VECS) if (nr_vecs == BIO_MAX_VECS)
mempool_free(bv, pool); mempool_free(bv, pool);
@ -281,6 +282,7 @@ void bio_init(struct bio *bio, struct bio_vec *table,
atomic_set(&bio->__bi_remaining, 1); atomic_set(&bio->__bi_remaining, 1);
atomic_set(&bio->__bi_cnt, 1); atomic_set(&bio->__bi_cnt, 1);
bio->bi_cookie = BLK_QC_T_NONE;
bio->bi_max_vecs = max_vecs; bio->bi_max_vecs = max_vecs;
bio->bi_io_vec = table; bio->bi_io_vec = table;
@ -546,7 +548,7 @@ EXPORT_SYMBOL(zero_fill_bio);
* REQ_OP_READ, zero the truncated part. This function should only * REQ_OP_READ, zero the truncated part. This function should only
* be used for handling corner cases, such as bio eod. * be used for handling corner cases, such as bio eod.
*/ */
void bio_truncate(struct bio *bio, unsigned new_size) static void bio_truncate(struct bio *bio, unsigned new_size)
{ {
struct bio_vec bv; struct bio_vec bv;
struct bvec_iter iter; struct bvec_iter iter;
@ -629,7 +631,8 @@ static void bio_alloc_cache_prune(struct bio_alloc_cache *cache,
unsigned int i = 0; unsigned int i = 0;
struct bio *bio; struct bio *bio;
while ((bio = bio_list_pop(&cache->free_list)) != NULL) { while ((bio = cache->free_list) != NULL) {
cache->free_list = bio->bi_next;
cache->nr--; cache->nr--;
bio_free(bio); bio_free(bio);
if (++i == nr) if (++i == nr)
@ -678,7 +681,7 @@ static void bio_alloc_cache_destroy(struct bio_set *bs)
void bio_put(struct bio *bio) void bio_put(struct bio *bio)
{ {
if (unlikely(bio_flagged(bio, BIO_REFFED))) { if (unlikely(bio_flagged(bio, BIO_REFFED))) {
BIO_BUG_ON(!atomic_read(&bio->__bi_cnt)); BUG_ON(!atomic_read(&bio->__bi_cnt));
if (!atomic_dec_and_test(&bio->__bi_cnt)) if (!atomic_dec_and_test(&bio->__bi_cnt))
return; return;
} }
@ -688,7 +691,8 @@ void bio_put(struct bio *bio)
bio_uninit(bio); bio_uninit(bio);
cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu()); cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu());
bio_list_add_head(&cache->free_list, bio); bio->bi_next = cache->free_list;
cache->free_list = bio;
if (++cache->nr > ALLOC_CACHE_MAX + ALLOC_CACHE_SLACK) if (++cache->nr > ALLOC_CACHE_MAX + ALLOC_CACHE_SLACK)
bio_alloc_cache_prune(cache, ALLOC_CACHE_SLACK); bio_alloc_cache_prune(cache, ALLOC_CACHE_SLACK);
put_cpu(); put_cpu();
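The allocation-cache hunks above replace struct bio_list with a bare struct bio *free_list chained through bi_next: bio_put() pushes onto it, bio_alloc_kiocb() pops from it, and bio_alloc_cache_prune() drains it. A minimal userspace sketch of that push/pop discipline with a stand-in node type:

/* Sketch of the singly-linked free list the bio cache switches to
 * above: push on free, pop on allocate, chained through the object's
 * own next pointer. Stand-in node type, not the kernel struct bio. */
#include <stdio.h>
#include <stdlib.h>

struct node {
	struct node *next;
	int id;
};

struct cache {
	struct node *free_list;
	unsigned int nr;
};

static void cache_push(struct cache *c, struct node *n)
{
	n->next = c->free_list;
	c->free_list = n;
	c->nr++;
}

static struct node *cache_pop(struct cache *c)
{
	struct node *n = c->free_list;

	if (n) {
		c->free_list = n->next;
		c->nr--;
	}
	return n;
}

int main(void)
{
	struct cache c = { 0 };
	struct node *a = calloc(1, sizeof(*a)), *b = calloc(1, sizeof(*b));

	if (!a || !b)
		return 1;
	a->id = 1;
	b->id = 2;
	cache_push(&c, a);
	cache_push(&c, b);
	struct node *x = cache_pop(&c);
	struct node *y = cache_pop(&c);

	printf("pop %d, pop %d, nr %u\n", x->id, y->id, c.nr);	/* 2, 1, 0 */
	free(a);
	free(b);
	return 0;
}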
@ -773,6 +777,23 @@ const char *bio_devname(struct bio *bio, char *buf)
} }
EXPORT_SYMBOL(bio_devname); EXPORT_SYMBOL(bio_devname);
/**
* bio_full - check if the bio is full
* @bio: bio to check
* @len: length of one segment to be added
*
* Return true if @bio is full and one segment with @len bytes can't be
* added to the bio, otherwise return false
*/
static inline bool bio_full(struct bio *bio, unsigned len)
{
if (bio->bi_vcnt >= bio->bi_max_vecs)
return true;
if (bio->bi_iter.bi_size > UINT_MAX - len)
return true;
return false;
}
static inline bool page_is_mergeable(const struct bio_vec *bv, static inline bool page_is_mergeable(const struct bio_vec *bv,
struct page *page, unsigned int len, unsigned int off, struct page *page, unsigned int len, unsigned int off,
bool *same_page) bool *same_page)
@ -792,6 +813,44 @@ static inline bool page_is_mergeable(const struct bio_vec *bv,
return (bv->bv_page + bv_end / PAGE_SIZE) == (page + off / PAGE_SIZE); return (bv->bv_page + bv_end / PAGE_SIZE) == (page + off / PAGE_SIZE);
} }
/**
* __bio_try_merge_page - try appending data to an existing bvec.
* @bio: destination bio
* @page: start page to add
* @len: length of the data to add
* @off: offset of the data relative to @page
* @same_page: return if the segment has been merged inside the same page
*
* Try to add the data at @page + @off to the last bvec of @bio. This is a
* useful optimisation for file systems with a block size smaller than the
* page size.
*
* Warn if (@len, @off) crosses pages in case that @same_page is true.
*
* Return %true on success or %false on failure.
*/
static bool __bio_try_merge_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int off, bool *same_page)
{
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
return false;
if (bio->bi_vcnt > 0) {
struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
if (page_is_mergeable(bv, page, len, off, same_page)) {
if (bio->bi_iter.bi_size > UINT_MAX - len) {
*same_page = false;
return false;
}
bv->bv_len += len;
bio->bi_iter.bi_size += len;
return true;
}
}
return false;
}
/* /*
* Try to merge a page into a segment, while obeying the hardware segment * Try to merge a page into a segment, while obeying the hardware segment
* size limit. This is not for normal read/write bios, but for passthrough * size limit. This is not for normal read/write bios, but for passthrough
@ -909,7 +968,7 @@ EXPORT_SYMBOL(bio_add_pc_page);
int bio_add_zone_append_page(struct bio *bio, struct page *page, int bio_add_zone_append_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int offset) unsigned int len, unsigned int offset)
{ {
struct request_queue *q = bio->bi_bdev->bd_disk->queue; struct request_queue *q = bdev_get_queue(bio->bi_bdev);
bool same_page = false; bool same_page = false;
if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_ZONE_APPEND)) if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_ZONE_APPEND))
@ -923,45 +982,6 @@ int bio_add_zone_append_page(struct bio *bio, struct page *page,
} }
EXPORT_SYMBOL_GPL(bio_add_zone_append_page); EXPORT_SYMBOL_GPL(bio_add_zone_append_page);
/**
* __bio_try_merge_page - try appending data to an existing bvec.
* @bio: destination bio
* @page: start page to add
* @len: length of the data to add
* @off: offset of the data relative to @page
* @same_page: return if the segment has been merged inside the same page
*
* Try to add the data at @page + @off to the last bvec of @bio. This is a
* useful optimisation for file systems with a block size smaller than the
* page size.
*
* Warn if (@len, @off) crosses pages in case that @same_page is true.
*
* Return %true on success or %false on failure.
*/
bool __bio_try_merge_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int off, bool *same_page)
{
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
return false;
if (bio->bi_vcnt > 0) {
struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
if (page_is_mergeable(bv, page, len, off, same_page)) {
if (bio->bi_iter.bi_size > UINT_MAX - len) {
*same_page = false;
return false;
}
bv->bv_len += len;
bio->bi_iter.bi_size += len;
return true;
}
}
return false;
}
EXPORT_SYMBOL_GPL(__bio_try_merge_page);
/** /**
* __bio_add_page - add page(s) to a bio in a new segment * __bio_add_page - add page(s) to a bio in a new segment
* @bio: destination bio * @bio: destination bio
@ -1016,52 +1036,62 @@ int bio_add_page(struct bio *bio, struct page *page,
} }
EXPORT_SYMBOL(bio_add_page); EXPORT_SYMBOL(bio_add_page);
void bio_release_pages(struct bio *bio, bool mark_dirty) /**
* bio_add_folio - Attempt to add part of a folio to a bio.
* @bio: BIO to add to.
* @folio: Folio to add.
* @len: How many bytes from the folio to add.
* @off: First byte in this folio to add.
*
* Filesystems that use folios can call this function instead of calling
* bio_add_page() for each page in the folio. If @off is bigger than
* PAGE_SIZE, this function can create a bio_vec that starts in a page
* after the bv_page. BIOs do not support folios that are 4GiB or larger.
*
* Return: Whether the addition was successful.
*/
bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len,
size_t off)
{
if (len > UINT_MAX || off > UINT_MAX)
return 0;
return bio_add_page(bio, &folio->page, len, off) > 0;
}
void __bio_release_pages(struct bio *bio, bool mark_dirty)
{ {
struct bvec_iter_all iter_all; struct bvec_iter_all iter_all;
struct bio_vec *bvec; struct bio_vec *bvec;
if (bio_flagged(bio, BIO_NO_PAGE_REF))
return;
bio_for_each_segment_all(bvec, bio, iter_all) { bio_for_each_segment_all(bvec, bio, iter_all) {
if (mark_dirty && !PageCompound(bvec->bv_page)) if (mark_dirty && !PageCompound(bvec->bv_page))
set_page_dirty_lock(bvec->bv_page); set_page_dirty_lock(bvec->bv_page);
put_page(bvec->bv_page); put_page(bvec->bv_page);
} }
} }
EXPORT_SYMBOL_GPL(bio_release_pages); EXPORT_SYMBOL_GPL(__bio_release_pages);
static void __bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
{ {
size_t size = iov_iter_count(iter);
WARN_ON_ONCE(bio->bi_max_vecs); WARN_ON_ONCE(bio->bi_max_vecs);
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
size_t max_sectors = queue_max_zone_append_sectors(q);
size = min(size, max_sectors << SECTOR_SHIFT);
}
bio->bi_vcnt = iter->nr_segs; bio->bi_vcnt = iter->nr_segs;
bio->bi_io_vec = (struct bio_vec *)iter->bvec; bio->bi_io_vec = (struct bio_vec *)iter->bvec;
bio->bi_iter.bi_bvec_done = iter->iov_offset; bio->bi_iter.bi_bvec_done = iter->iov_offset;
bio->bi_iter.bi_size = iter->count; bio->bi_iter.bi_size = size;
bio_set_flag(bio, BIO_NO_PAGE_REF); bio_set_flag(bio, BIO_NO_PAGE_REF);
bio_set_flag(bio, BIO_CLONED); bio_set_flag(bio, BIO_CLONED);
} }
static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
{
__bio_iov_bvec_set(bio, iter);
iov_iter_advance(iter, iter->count);
return 0;
}
static int bio_iov_bvec_set_append(struct bio *bio, struct iov_iter *iter)
{
struct request_queue *q = bio->bi_bdev->bd_disk->queue;
struct iov_iter i = *iter;
iov_iter_truncate(&i, queue_max_zone_append_sectors(q) << 9);
__bio_iov_bvec_set(bio, &i);
iov_iter_advance(iter, i.count);
return 0;
}
static void bio_put_pages(struct page **pages, size_t size, size_t off) static void bio_put_pages(struct page **pages, size_t size, size_t off)
{ {
size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE); size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE);
@ -1131,7 +1161,7 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
{ {
unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
struct request_queue *q = bio->bi_bdev->bd_disk->queue; struct request_queue *q = bdev_get_queue(bio->bi_bdev);
unsigned int max_append_sectors = queue_max_zone_append_sectors(q); unsigned int max_append_sectors = queue_max_zone_append_sectors(q);
struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
struct page **pages = (struct page **)bv; struct page **pages = (struct page **)bv;
@ -1203,9 +1233,9 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
int ret = 0; int ret = 0;
if (iov_iter_is_bvec(iter)) { if (iov_iter_is_bvec(iter)) {
if (bio_op(bio) == REQ_OP_ZONE_APPEND) bio_iov_bvec_set(bio, iter);
return bio_iov_bvec_set_append(bio, iter); iov_iter_advance(iter, bio->bi_iter.bi_size);
return bio_iov_bvec_set(bio, iter); return 0;
} }
do { do {
@ -1261,18 +1291,7 @@ int submit_bio_wait(struct bio *bio)
} }
EXPORT_SYMBOL(submit_bio_wait); EXPORT_SYMBOL(submit_bio_wait);
/** void __bio_advance(struct bio *bio, unsigned bytes)
* bio_advance - increment/complete a bio by some number of bytes
* @bio: bio to advance
* @bytes: number of bytes to complete
*
* This updates bi_sector, bi_size and bi_idx; if the number of bytes to
* complete doesn't align with a bvec boundary, then bv_len and bv_offset will
* be updated on the last bvec as well.
*
* @bio will then represent the remaining, uncompleted portion of the io.
*/
void bio_advance(struct bio *bio, unsigned bytes)
{ {
if (bio_integrity(bio)) if (bio_integrity(bio))
bio_integrity_advance(bio, bytes); bio_integrity_advance(bio, bytes);
@ -1280,7 +1299,7 @@ void bio_advance(struct bio *bio, unsigned bytes)
bio_crypt_advance(bio, bytes); bio_crypt_advance(bio, bytes);
bio_advance_iter(bio, &bio->bi_iter, bytes); bio_advance_iter(bio, &bio->bi_iter, bytes);
} }
EXPORT_SYMBOL(bio_advance); EXPORT_SYMBOL(__bio_advance);
void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
struct bio *src, struct bvec_iter *src_iter) struct bio *src, struct bvec_iter *src_iter)
@ -1468,10 +1487,10 @@ void bio_endio(struct bio *bio)
return; return;
if (bio->bi_bdev && bio_flagged(bio, BIO_TRACKED)) if (bio->bi_bdev && bio_flagged(bio, BIO_TRACKED))
rq_qos_done_bio(bio->bi_bdev->bd_disk->queue, bio); rq_qos_done_bio(bdev_get_queue(bio->bi_bdev), bio);
if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
trace_block_bio_complete(bio->bi_bdev->bd_disk->queue, bio); trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio);
bio_clear_flag(bio, BIO_TRACE_COMPLETION); bio_clear_flag(bio, BIO_TRACE_COMPLETION);
} }
@ -1710,8 +1729,9 @@ struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs,
return bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs); return bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs);
cache = per_cpu_ptr(bs->cache, get_cpu()); cache = per_cpu_ptr(bs->cache, get_cpu());
bio = bio_list_pop(&cache->free_list); if (cache->free_list) {
if (bio) { bio = cache->free_list;
cache->free_list = bio->bi_next;
cache->nr--; cache->nr--;
put_cpu(); put_cpu();
bio_init(bio, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs); bio_init(bio, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs);

block/blk-cgroup.c

@ -30,8 +30,10 @@
#include <linux/blk-cgroup.h> #include <linux/blk-cgroup.h>
#include <linux/tracehook.h> #include <linux/tracehook.h>
#include <linux/psi.h> #include <linux/psi.h>
#include <linux/part_stat.h>
#include "blk.h" #include "blk.h"
#include "blk-ioprio.h" #include "blk-ioprio.h"
#include "blk-throttle.h"
/* /*
* blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation. * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
@ -620,7 +622,7 @@ struct block_device *blkcg_conf_open_bdev(char **inputp)
*/ */
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
char *input, struct blkg_conf_ctx *ctx) char *input, struct blkg_conf_ctx *ctx)
__acquires(rcu) __acquires(&bdev->bd_disk->queue->queue_lock) __acquires(rcu) __acquires(&bdev->bd_queue->queue_lock)
{ {
struct block_device *bdev; struct block_device *bdev;
struct request_queue *q; struct request_queue *q;
@ -631,7 +633,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
if (IS_ERR(bdev)) if (IS_ERR(bdev))
return PTR_ERR(bdev); return PTR_ERR(bdev);
q = bdev->bd_disk->queue; q = bdev_get_queue(bdev);
/* /*
* blkcg_deactivate_policy() requires queue to be frozen, we can grab * blkcg_deactivate_policy() requires queue to be frozen, we can grab
@ -747,9 +749,9 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep);
* with blkg_conf_prep(). * with blkg_conf_prep().
*/ */
void blkg_conf_finish(struct blkg_conf_ctx *ctx) void blkg_conf_finish(struct blkg_conf_ctx *ctx)
__releases(&ctx->bdev->bd_disk->queue->queue_lock) __releases(rcu) __releases(&ctx->bdev->bd_queue->queue_lock) __releases(rcu)
{ {
spin_unlock_irq(&ctx->bdev->bd_disk->queue->queue_lock); spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock);
rcu_read_unlock(); rcu_read_unlock();
blkdev_put_no_open(ctx->bdev); blkdev_put_no_open(ctx->bdev);
} }
@ -852,7 +854,7 @@ static void blkcg_fill_root_iostats(void)
while ((dev = class_dev_iter_next(&iter))) { while ((dev = class_dev_iter_next(&iter))) {
struct block_device *bdev = dev_to_bdev(dev); struct block_device *bdev = dev_to_bdev(dev);
struct blkcg_gq *blkg = struct blkcg_gq *blkg =
blk_queue_root_blkg(bdev->bd_disk->queue); blk_queue_root_blkg(bdev_get_queue(bdev));
struct blkg_iostat tmp; struct blkg_iostat tmp;
int cpu; int cpu;
@ -1811,7 +1813,7 @@ static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio,
rcu_read_lock(); rcu_read_lock();
blkg = blkg_lookup_create(css_to_blkcg(css), blkg = blkg_lookup_create(css_to_blkcg(css),
bio->bi_bdev->bd_disk->queue); bdev_get_queue(bio->bi_bdev));
while (blkg) { while (blkg) {
if (blkg_tryget(blkg)) { if (blkg_tryget(blkg)) {
ret_blkg = blkg; ret_blkg = blkg;
@ -1847,8 +1849,8 @@ void bio_associate_blkg_from_css(struct bio *bio,
if (css && css->parent) { if (css && css->parent) {
bio->bi_blkg = blkg_tryget_closest(bio, css); bio->bi_blkg = blkg_tryget_closest(bio, css);
} else { } else {
blkg_get(bio->bi_bdev->bd_disk->queue->root_blkg); blkg_get(bdev_get_queue(bio->bi_bdev)->root_blkg);
bio->bi_blkg = bio->bi_bdev->bd_disk->queue->root_blkg; bio->bi_blkg = bdev_get_queue(bio->bi_bdev)->root_blkg;
} }
} }
EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
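Most of the blk-cgroup churn above is a mechanical conversion from open-coded bio->bi_bdev->bd_disk->queue chains to the bdev_get_queue() helper. As a hedged illustration of the pattern being replaced (the real helper lives in the blkdev headers and may read a cached queue pointer instead), the old accessor boils down to:

/* Sketch only: what the open-coded call sites above were doing before the
 * conversion; not the actual bdev_get_queue() implementation. */
static inline struct request_queue *example_queue_of_bdev(struct block_device *bdev)
{
        return bdev->bd_disk->queue;
}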

File diff suppressed because it is too large.


@ -12,12 +12,13 @@
#include <crypto/skcipher.h> #include <crypto/skcipher.h>
#include <linux/blk-cgroup.h> #include <linux/blk-cgroup.h>
#include <linux/blk-crypto.h> #include <linux/blk-crypto.h>
#include <linux/blk-crypto-profile.h>
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/crypto.h> #include <linux/crypto.h>
#include <linux/keyslot-manager.h>
#include <linux/mempool.h> #include <linux/mempool.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/random.h> #include <linux/random.h>
#include <linux/scatterlist.h>
#include "blk-crypto-internal.h" #include "blk-crypto-internal.h"
@ -72,12 +73,12 @@ static mempool_t *bio_fallback_crypt_ctx_pool;
static DEFINE_MUTEX(tfms_init_lock); static DEFINE_MUTEX(tfms_init_lock);
static bool tfms_inited[BLK_ENCRYPTION_MODE_MAX]; static bool tfms_inited[BLK_ENCRYPTION_MODE_MAX];
static struct blk_crypto_keyslot { static struct blk_crypto_fallback_keyslot {
enum blk_crypto_mode_num crypto_mode; enum blk_crypto_mode_num crypto_mode;
struct crypto_skcipher *tfms[BLK_ENCRYPTION_MODE_MAX]; struct crypto_skcipher *tfms[BLK_ENCRYPTION_MODE_MAX];
} *blk_crypto_keyslots; } *blk_crypto_keyslots;
static struct blk_keyslot_manager blk_crypto_ksm; static struct blk_crypto_profile blk_crypto_fallback_profile;
static struct workqueue_struct *blk_crypto_wq; static struct workqueue_struct *blk_crypto_wq;
static mempool_t *blk_crypto_bounce_page_pool; static mempool_t *blk_crypto_bounce_page_pool;
static struct bio_set crypto_bio_split; static struct bio_set crypto_bio_split;
@ -88,9 +89,9 @@ static struct bio_set crypto_bio_split;
*/ */
static u8 blank_key[BLK_CRYPTO_MAX_KEY_SIZE]; static u8 blank_key[BLK_CRYPTO_MAX_KEY_SIZE];
static void blk_crypto_evict_keyslot(unsigned int slot) static void blk_crypto_fallback_evict_keyslot(unsigned int slot)
{ {
struct blk_crypto_keyslot *slotp = &blk_crypto_keyslots[slot]; struct blk_crypto_fallback_keyslot *slotp = &blk_crypto_keyslots[slot];
enum blk_crypto_mode_num crypto_mode = slotp->crypto_mode; enum blk_crypto_mode_num crypto_mode = slotp->crypto_mode;
int err; int err;
@ -103,45 +104,41 @@ static void blk_crypto_evict_keyslot(unsigned int slot)
slotp->crypto_mode = BLK_ENCRYPTION_MODE_INVALID; slotp->crypto_mode = BLK_ENCRYPTION_MODE_INVALID;
} }
static int blk_crypto_keyslot_program(struct blk_keyslot_manager *ksm, static int
const struct blk_crypto_key *key, blk_crypto_fallback_keyslot_program(struct blk_crypto_profile *profile,
unsigned int slot) const struct blk_crypto_key *key,
unsigned int slot)
{ {
struct blk_crypto_keyslot *slotp = &blk_crypto_keyslots[slot]; struct blk_crypto_fallback_keyslot *slotp = &blk_crypto_keyslots[slot];
const enum blk_crypto_mode_num crypto_mode = const enum blk_crypto_mode_num crypto_mode =
key->crypto_cfg.crypto_mode; key->crypto_cfg.crypto_mode;
int err; int err;
if (crypto_mode != slotp->crypto_mode && if (crypto_mode != slotp->crypto_mode &&
slotp->crypto_mode != BLK_ENCRYPTION_MODE_INVALID) slotp->crypto_mode != BLK_ENCRYPTION_MODE_INVALID)
blk_crypto_evict_keyslot(slot); blk_crypto_fallback_evict_keyslot(slot);
slotp->crypto_mode = crypto_mode; slotp->crypto_mode = crypto_mode;
err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], key->raw, err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], key->raw,
key->size); key->size);
if (err) { if (err) {
blk_crypto_evict_keyslot(slot); blk_crypto_fallback_evict_keyslot(slot);
return err; return err;
} }
return 0; return 0;
} }
static int blk_crypto_keyslot_evict(struct blk_keyslot_manager *ksm, static int blk_crypto_fallback_keyslot_evict(struct blk_crypto_profile *profile,
const struct blk_crypto_key *key, const struct blk_crypto_key *key,
unsigned int slot) unsigned int slot)
{ {
blk_crypto_evict_keyslot(slot); blk_crypto_fallback_evict_keyslot(slot);
return 0; return 0;
} }
/* static const struct blk_crypto_ll_ops blk_crypto_fallback_ll_ops = {
* The crypto API fallback KSM ops - only used for a bio when it specifies a .keyslot_program = blk_crypto_fallback_keyslot_program,
* blk_crypto_key that was not supported by the device's inline encryption .keyslot_evict = blk_crypto_fallback_keyslot_evict,
* hardware.
*/
static const struct blk_ksm_ll_ops blk_crypto_ksm_ll_ops = {
.keyslot_program = blk_crypto_keyslot_program,
.keyslot_evict = blk_crypto_keyslot_evict,
}; };
static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio) static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio)
@ -159,7 +156,7 @@ static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio)
bio_endio(src_bio); bio_endio(src_bio);
} }
static struct bio *blk_crypto_clone_bio(struct bio *bio_src) static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src)
{ {
struct bvec_iter iter; struct bvec_iter iter;
struct bio_vec bv; struct bio_vec bv;
@ -186,13 +183,14 @@ static struct bio *blk_crypto_clone_bio(struct bio *bio_src)
return bio; return bio;
} }
static bool blk_crypto_alloc_cipher_req(struct blk_ksm_keyslot *slot, static bool
struct skcipher_request **ciph_req_ret, blk_crypto_fallback_alloc_cipher_req(struct blk_crypto_keyslot *slot,
struct crypto_wait *wait) struct skcipher_request **ciph_req_ret,
struct crypto_wait *wait)
{ {
struct skcipher_request *ciph_req; struct skcipher_request *ciph_req;
const struct blk_crypto_keyslot *slotp; const struct blk_crypto_fallback_keyslot *slotp;
int keyslot_idx = blk_ksm_get_slot_idx(slot); int keyslot_idx = blk_crypto_keyslot_index(slot);
slotp = &blk_crypto_keyslots[keyslot_idx]; slotp = &blk_crypto_keyslots[keyslot_idx];
ciph_req = skcipher_request_alloc(slotp->tfms[slotp->crypto_mode], ciph_req = skcipher_request_alloc(slotp->tfms[slotp->crypto_mode],
@ -209,7 +207,7 @@ static bool blk_crypto_alloc_cipher_req(struct blk_ksm_keyslot *slot,
return true; return true;
} }
static bool blk_crypto_split_bio_if_needed(struct bio **bio_ptr) static bool blk_crypto_fallback_split_bio_if_needed(struct bio **bio_ptr)
{ {
struct bio *bio = *bio_ptr; struct bio *bio = *bio_ptr;
unsigned int i = 0; unsigned int i = 0;
@ -264,7 +262,7 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr)
{ {
struct bio *src_bio, *enc_bio; struct bio *src_bio, *enc_bio;
struct bio_crypt_ctx *bc; struct bio_crypt_ctx *bc;
struct blk_ksm_keyslot *slot; struct blk_crypto_keyslot *slot;
int data_unit_size; int data_unit_size;
struct skcipher_request *ciph_req = NULL; struct skcipher_request *ciph_req = NULL;
DECLARE_CRYPTO_WAIT(wait); DECLARE_CRYPTO_WAIT(wait);
@ -276,7 +274,7 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr)
blk_status_t blk_st; blk_status_t blk_st;
/* Split the bio if it's too big for single page bvec */ /* Split the bio if it's too big for single page bvec */
if (!blk_crypto_split_bio_if_needed(bio_ptr)) if (!blk_crypto_fallback_split_bio_if_needed(bio_ptr))
return false; return false;
src_bio = *bio_ptr; src_bio = *bio_ptr;
@ -284,24 +282,25 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr)
data_unit_size = bc->bc_key->crypto_cfg.data_unit_size; data_unit_size = bc->bc_key->crypto_cfg.data_unit_size;
/* Allocate bounce bio for encryption */ /* Allocate bounce bio for encryption */
enc_bio = blk_crypto_clone_bio(src_bio); enc_bio = blk_crypto_fallback_clone_bio(src_bio);
if (!enc_bio) { if (!enc_bio) {
src_bio->bi_status = BLK_STS_RESOURCE; src_bio->bi_status = BLK_STS_RESOURCE;
return false; return false;
} }
/* /*
* Use the crypto API fallback keyslot manager to get a crypto_skcipher * Get a blk-crypto-fallback keyslot that contains a crypto_skcipher for
* for the algorithm and key specified for this bio. * this bio's algorithm and key.
*/ */
blk_st = blk_ksm_get_slot_for_key(&blk_crypto_ksm, bc->bc_key, &slot); blk_st = blk_crypto_get_keyslot(&blk_crypto_fallback_profile,
bc->bc_key, &slot);
if (blk_st != BLK_STS_OK) { if (blk_st != BLK_STS_OK) {
src_bio->bi_status = blk_st; src_bio->bi_status = blk_st;
goto out_put_enc_bio; goto out_put_enc_bio;
} }
/* and then allocate an skcipher_request for it */ /* and then allocate an skcipher_request for it */
if (!blk_crypto_alloc_cipher_req(slot, &ciph_req, &wait)) { if (!blk_crypto_fallback_alloc_cipher_req(slot, &ciph_req, &wait)) {
src_bio->bi_status = BLK_STS_RESOURCE; src_bio->bi_status = BLK_STS_RESOURCE;
goto out_release_keyslot; goto out_release_keyslot;
} }
@ -362,7 +361,7 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr)
out_free_ciph_req: out_free_ciph_req:
skcipher_request_free(ciph_req); skcipher_request_free(ciph_req);
out_release_keyslot: out_release_keyslot:
blk_ksm_put_slot(slot); blk_crypto_put_keyslot(slot);
out_put_enc_bio: out_put_enc_bio:
if (enc_bio) if (enc_bio)
bio_put(enc_bio); bio_put(enc_bio);
@ -380,7 +379,7 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work)
container_of(work, struct bio_fallback_crypt_ctx, work); container_of(work, struct bio_fallback_crypt_ctx, work);
struct bio *bio = f_ctx->bio; struct bio *bio = f_ctx->bio;
struct bio_crypt_ctx *bc = &f_ctx->crypt_ctx; struct bio_crypt_ctx *bc = &f_ctx->crypt_ctx;
struct blk_ksm_keyslot *slot; struct blk_crypto_keyslot *slot;
struct skcipher_request *ciph_req = NULL; struct skcipher_request *ciph_req = NULL;
DECLARE_CRYPTO_WAIT(wait); DECLARE_CRYPTO_WAIT(wait);
u64 curr_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; u64 curr_dun[BLK_CRYPTO_DUN_ARRAY_SIZE];
@ -393,17 +392,18 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work)
blk_status_t blk_st; blk_status_t blk_st;
/* /*
* Use the crypto API fallback keyslot manager to get a crypto_skcipher * Get a blk-crypto-fallback keyslot that contains a crypto_skcipher for
* for the algorithm and key specified for this bio. * this bio's algorithm and key.
*/ */
blk_st = blk_ksm_get_slot_for_key(&blk_crypto_ksm, bc->bc_key, &slot); blk_st = blk_crypto_get_keyslot(&blk_crypto_fallback_profile,
bc->bc_key, &slot);
if (blk_st != BLK_STS_OK) { if (blk_st != BLK_STS_OK) {
bio->bi_status = blk_st; bio->bi_status = blk_st;
goto out_no_keyslot; goto out_no_keyslot;
} }
/* and then allocate an skcipher_request for it */ /* and then allocate an skcipher_request for it */
if (!blk_crypto_alloc_cipher_req(slot, &ciph_req, &wait)) { if (!blk_crypto_fallback_alloc_cipher_req(slot, &ciph_req, &wait)) {
bio->bi_status = BLK_STS_RESOURCE; bio->bi_status = BLK_STS_RESOURCE;
goto out; goto out;
} }
@ -434,7 +434,7 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work)
out: out:
skcipher_request_free(ciph_req); skcipher_request_free(ciph_req);
blk_ksm_put_slot(slot); blk_crypto_put_keyslot(slot);
out_no_keyslot: out_no_keyslot:
mempool_free(f_ctx, bio_fallback_crypt_ctx_pool); mempool_free(f_ctx, bio_fallback_crypt_ctx_pool);
bio_endio(bio); bio_endio(bio);
@ -473,9 +473,9 @@ static void blk_crypto_fallback_decrypt_endio(struct bio *bio)
* @bio_ptr: pointer to the bio to prepare * @bio_ptr: pointer to the bio to prepare
* *
* If bio is doing a WRITE operation, this splits the bio into two parts if it's * If bio is doing a WRITE operation, this splits the bio into two parts if it's
* too big (see blk_crypto_split_bio_if_needed). It then allocates a bounce bio * too big (see blk_crypto_fallback_split_bio_if_needed()). It then allocates a
* for the first part, encrypts it, and update bio_ptr to point to the bounce * bounce bio for the first part, encrypts it, and updates bio_ptr to point to
* bio. * the bounce bio.
* *
* For a READ operation, we mark the bio for decryption by using bi_private and * For a READ operation, we mark the bio for decryption by using bi_private and
* bi_end_io. * bi_end_io.
@ -499,8 +499,8 @@ bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr)
return false; return false;
} }
if (!blk_ksm_crypto_cfg_supported(&blk_crypto_ksm, if (!__blk_crypto_cfg_supported(&blk_crypto_fallback_profile,
&bc->bc_key->crypto_cfg)) { &bc->bc_key->crypto_cfg)) {
bio->bi_status = BLK_STS_NOTSUPP; bio->bi_status = BLK_STS_NOTSUPP;
return false; return false;
} }
@ -526,7 +526,7 @@ bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr)
int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key) int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key)
{ {
return blk_ksm_evict_key(&blk_crypto_ksm, key); return __blk_crypto_evict_key(&blk_crypto_fallback_profile, key);
} }
static bool blk_crypto_fallback_inited; static bool blk_crypto_fallback_inited;
@ -534,6 +534,7 @@ static int blk_crypto_fallback_init(void)
{ {
int i; int i;
int err; int err;
struct blk_crypto_profile *profile = &blk_crypto_fallback_profile;
if (blk_crypto_fallback_inited) if (blk_crypto_fallback_inited)
return 0; return 0;
@ -544,24 +545,24 @@ static int blk_crypto_fallback_init(void)
if (err) if (err)
goto out; goto out;
err = blk_ksm_init(&blk_crypto_ksm, blk_crypto_num_keyslots); err = blk_crypto_profile_init(profile, blk_crypto_num_keyslots);
if (err) if (err)
goto fail_free_bioset; goto fail_free_bioset;
err = -ENOMEM; err = -ENOMEM;
blk_crypto_ksm.ksm_ll_ops = blk_crypto_ksm_ll_ops; profile->ll_ops = blk_crypto_fallback_ll_ops;
blk_crypto_ksm.max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE; profile->max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE;
/* All blk-crypto modes have a crypto API fallback. */ /* All blk-crypto modes have a crypto API fallback. */
for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++) for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++)
blk_crypto_ksm.crypto_modes_supported[i] = 0xFFFFFFFF; profile->modes_supported[i] = 0xFFFFFFFF;
blk_crypto_ksm.crypto_modes_supported[BLK_ENCRYPTION_MODE_INVALID] = 0; profile->modes_supported[BLK_ENCRYPTION_MODE_INVALID] = 0;
blk_crypto_wq = alloc_workqueue("blk_crypto_wq", blk_crypto_wq = alloc_workqueue("blk_crypto_wq",
WQ_UNBOUND | WQ_HIGHPRI | WQ_UNBOUND | WQ_HIGHPRI |
WQ_MEM_RECLAIM, num_online_cpus()); WQ_MEM_RECLAIM, num_online_cpus());
if (!blk_crypto_wq) if (!blk_crypto_wq)
goto fail_free_ksm; goto fail_destroy_profile;
blk_crypto_keyslots = kcalloc(blk_crypto_num_keyslots, blk_crypto_keyslots = kcalloc(blk_crypto_num_keyslots,
sizeof(blk_crypto_keyslots[0]), sizeof(blk_crypto_keyslots[0]),
@ -595,8 +596,8 @@ static int blk_crypto_fallback_init(void)
kfree(blk_crypto_keyslots); kfree(blk_crypto_keyslots);
fail_free_wq: fail_free_wq:
destroy_workqueue(blk_crypto_wq); destroy_workqueue(blk_crypto_wq);
fail_free_ksm: fail_destroy_profile:
blk_ksm_destroy(&blk_crypto_ksm); blk_crypto_profile_destroy(profile);
fail_free_bioset: fail_free_bioset:
bioset_exit(&crypto_bio_split); bioset_exit(&crypto_bio_split);
out: out:
@ -610,7 +611,7 @@ static int blk_crypto_fallback_init(void)
int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num) int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num)
{ {
const char *cipher_str = blk_crypto_modes[mode_num].cipher_str; const char *cipher_str = blk_crypto_modes[mode_num].cipher_str;
struct blk_crypto_keyslot *slotp; struct blk_crypto_fallback_keyslot *slotp;
unsigned int i; unsigned int i;
int err = 0; int err = 0;
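The rename from the keyslot-manager API to blk_crypto_profile in this file is largely mechanical: the fallback still supplies program/evict callbacks, just through profile->ll_ops. A hedged sketch of the setup pattern follows; the field names and constants are taken from the hunks above, while the wrapper function itself is illustrative only.

/* Illustrative only: wiring a crypto profile the way the fallback init path
 * above does. Cleanup on failure and unrelated setup are omitted. */
static int example_fallback_profile_setup(struct blk_crypto_profile *profile,
                                          unsigned int num_slots)
{
        int i, err;

        err = blk_crypto_profile_init(profile, num_slots);
        if (err)
                return err;

        profile->ll_ops = blk_crypto_fallback_ll_ops;
        profile->max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE;
        /* every mode is handled in software by the fallback */
        for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++)
                profile->modes_supported[i] = 0xFFFFFFFF;
        profile->modes_supported[BLK_ENCRYPTION_MODE_INVALID] = 0;
        return 0;
}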


@ -11,7 +11,7 @@
#include <linux/bio.h> #include <linux/bio.h>
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/keyslot-manager.h> #include <linux/blk-crypto-profile.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/slab.h> #include <linux/slab.h>
@ -218,8 +218,9 @@ static bool bio_crypt_check_alignment(struct bio *bio)
blk_status_t __blk_crypto_init_request(struct request *rq) blk_status_t __blk_crypto_init_request(struct request *rq)
{ {
return blk_ksm_get_slot_for_key(rq->q->ksm, rq->crypt_ctx->bc_key, return blk_crypto_get_keyslot(rq->q->crypto_profile,
&rq->crypt_keyslot); rq->crypt_ctx->bc_key,
&rq->crypt_keyslot);
} }
/** /**
@ -233,7 +234,7 @@ blk_status_t __blk_crypto_init_request(struct request *rq)
*/ */
void __blk_crypto_free_request(struct request *rq) void __blk_crypto_free_request(struct request *rq)
{ {
blk_ksm_put_slot(rq->crypt_keyslot); blk_crypto_put_keyslot(rq->crypt_keyslot);
mempool_free(rq->crypt_ctx, bio_crypt_ctx_pool); mempool_free(rq->crypt_ctx, bio_crypt_ctx_pool);
blk_crypto_rq_set_defaults(rq); blk_crypto_rq_set_defaults(rq);
} }
@ -264,6 +265,7 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr)
{ {
struct bio *bio = *bio_ptr; struct bio *bio = *bio_ptr;
const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key; const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key;
struct blk_crypto_profile *profile;
/* Error if bio has no data. */ /* Error if bio has no data. */
if (WARN_ON_ONCE(!bio_has_data(bio))) { if (WARN_ON_ONCE(!bio_has_data(bio))) {
@ -280,8 +282,8 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr)
* Success if device supports the encryption context, or if we succeeded * Success if device supports the encryption context, or if we succeeded
* in falling back to the crypto API. * in falling back to the crypto API.
*/ */
if (blk_ksm_crypto_cfg_supported(bio->bi_bdev->bd_disk->queue->ksm, profile = bdev_get_queue(bio->bi_bdev)->crypto_profile;
&bc_key->crypto_cfg)) if (__blk_crypto_cfg_supported(profile, &bc_key->crypto_cfg))
return true; return true;
if (blk_crypto_fallback_bio_prep(bio_ptr)) if (blk_crypto_fallback_bio_prep(bio_ptr))
@ -357,7 +359,7 @@ bool blk_crypto_config_supported(struct request_queue *q,
const struct blk_crypto_config *cfg) const struct blk_crypto_config *cfg)
{ {
return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) || return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) ||
blk_ksm_crypto_cfg_supported(q->ksm, cfg); __blk_crypto_cfg_supported(q->crypto_profile, cfg);
} }
/** /**
@ -378,7 +380,7 @@ bool blk_crypto_config_supported(struct request_queue *q,
int blk_crypto_start_using_key(const struct blk_crypto_key *key, int blk_crypto_start_using_key(const struct blk_crypto_key *key,
struct request_queue *q) struct request_queue *q)
{ {
if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg)) if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg))
return 0; return 0;
return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode); return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode);
} }
@ -394,18 +396,17 @@ int blk_crypto_start_using_key(const struct blk_crypto_key *key,
* evicted from any hardware that it might have been programmed into. The key * evicted from any hardware that it might have been programmed into. The key
* must not be in use by any in-flight IO when this function is called. * must not be in use by any in-flight IO when this function is called.
* *
* Return: 0 on success or if key is not present in the q's ksm, -err on error. * Return: 0 on success or if the key wasn't in any keyslot; -errno on error.
*/ */
int blk_crypto_evict_key(struct request_queue *q, int blk_crypto_evict_key(struct request_queue *q,
const struct blk_crypto_key *key) const struct blk_crypto_key *key)
{ {
if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg)) if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg))
return blk_ksm_evict_key(q->ksm, key); return __blk_crypto_evict_key(q->crypto_profile, key);
/* /*
* If the request queue's associated inline encryption hardware didn't * If the request_queue didn't support the key, then blk-crypto-fallback
* have support for the key, then the key might have been programmed * may have been used, so try to evict the key from blk-crypto-fallback.
* into the fallback keyslot manager, so try to evict from there.
*/ */
return blk_crypto_fallback_evict_key(key); return blk_crypto_fallback_evict_key(key);
} }
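As a usage note for the updated blk_crypto_evict_key() semantics: per the reworded comment, "the key was not programmed anywhere" counts as success, so a caller tearing down a key only needs to react to real errors. A hedged sketch (the helper name and logging are illustrative, not part of the patch):

/* Illustrative teardown helper, not from the patch above. */
static void example_evict_and_wipe(struct request_queue *q,
                                   struct blk_crypto_key *key)
{
        int err = blk_crypto_evict_key(q, key);

        if (err)
                pr_warn("blk-crypto: key eviction failed: %d\n", err);
        memzero_explicit(key, sizeof(*key));    /* don't leave raw key bytes around */
}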


@ -69,6 +69,7 @@
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/gfp.h> #include <linux/gfp.h>
#include <linux/blk-mq.h> #include <linux/blk-mq.h>
#include <linux/part_stat.h>
#include "blk.h" #include "blk.h"
#include "blk-mq.h" #include "blk-mq.h"
@ -95,6 +96,12 @@ enum {
static void blk_kick_flush(struct request_queue *q, static void blk_kick_flush(struct request_queue *q,
struct blk_flush_queue *fq, unsigned int flags); struct blk_flush_queue *fq, unsigned int flags);
static inline struct blk_flush_queue *
blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx)
{
return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq;
}
static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq) static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq)
{ {
unsigned int policy = 0; unsigned int policy = 0;
@ -138,7 +145,7 @@ static void blk_flush_queue_rq(struct request *rq, bool add_front)
static void blk_account_io_flush(struct request *rq) static void blk_account_io_flush(struct request *rq)
{ {
struct block_device *part = rq->rq_disk->part0; struct block_device *part = rq->q->disk->part0;
part_stat_lock(); part_stat_lock();
part_stat_inc(part, ios[STAT_FLUSH]); part_stat_inc(part, ios[STAT_FLUSH]);
@ -222,7 +229,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
/* release the tag's ownership to the req cloned from */ /* release the tag's ownership to the req cloned from */
spin_lock_irqsave(&fq->mq_flush_lock, flags); spin_lock_irqsave(&fq->mq_flush_lock, flags);
if (!refcount_dec_and_test(&flush_rq->ref)) { if (!req_ref_put_and_test(flush_rq)) {
fq->rq_status = error; fq->rq_status = error;
spin_unlock_irqrestore(&fq->mq_flush_lock, flags); spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
return; return;
@ -334,7 +341,6 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK); flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK);
flush_rq->rq_flags |= RQF_FLUSH_SEQ; flush_rq->rq_flags |= RQF_FLUSH_SEQ;
flush_rq->rq_disk = first_rq->rq_disk;
flush_rq->end_io = flush_end_io; flush_rq->end_io = flush_end_io;
/* /*
* Order WRITE ->end_io and WRITE rq->ref, and its pair is the one * Order WRITE ->end_io and WRITE rq->ref, and its pair is the one
@ -343,7 +349,7 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
* and READ flush_rq->end_io * and READ flush_rq->end_io
*/ */
smp_wmb(); smp_wmb();
refcount_set(&flush_rq->ref, 1); req_ref_set(flush_rq, 1);
blk_flush_queue_rq(flush_rq, false); blk_flush_queue_rq(flush_rq, false);
} }
@ -423,7 +429,7 @@ void blk_insert_flush(struct request *rq)
*/ */
if ((policy & REQ_FSEQ_DATA) && if ((policy & REQ_FSEQ_DATA) &&
!(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
blk_mq_request_bypass_insert(rq, false, false); blk_mq_request_bypass_insert(rq, false, true);
return; return;
} }
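The bypass-insert hunk just above hinges on the flush policy bits: a request that carries data but needs neither a preflush nor a postflush can skip the flush state machine entirely. A hedged restatement of that test, using the REQ_FSEQ_* names from the diff:

/* Sketch only: true when the request cannot take the bypass path above and
 * must go through the flush sequencing machinery. */
static bool example_needs_flush_sequence(unsigned int policy)
{
        if (!(policy & REQ_FSEQ_DATA))
                return true;    /* no data payload: flush handling only */
        return policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH);
}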


@ -6,7 +6,7 @@
* Written by: Martin K. Petersen <martin.petersen@oracle.com> * Written by: Martin K. Petersen <martin.petersen@oracle.com>
*/ */
#include <linux/blkdev.h> #include <linux/blk-integrity.h>
#include <linux/backing-dev.h> #include <linux/backing-dev.h>
#include <linux/mempool.h> #include <linux/mempool.h>
#include <linux/bio.h> #include <linux/bio.h>
@ -409,9 +409,9 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue); blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue);
#ifdef CONFIG_BLK_INLINE_ENCRYPTION #ifdef CONFIG_BLK_INLINE_ENCRYPTION
if (disk->queue->ksm) { if (disk->queue->crypto_profile) {
pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n"); pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n");
blk_ksm_unregister(disk->queue); disk->queue->crypto_profile = NULL;
} }
#endif #endif
} }


@ -8,22 +8,25 @@
#include <linux/bio.h> #include <linux/bio.h>
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/security.h>
#include <linux/sched/task.h> #include <linux/sched/task.h>
#include "blk.h" #include "blk.h"
#include "blk-mq-sched.h"
/* /*
* For io context allocations * For io context allocations
*/ */
static struct kmem_cache *iocontext_cachep; static struct kmem_cache *iocontext_cachep;
#ifdef CONFIG_BLK_ICQ
/** /**
* get_io_context - increment reference count to io_context * get_io_context - increment reference count to io_context
* @ioc: io_context to get * @ioc: io_context to get
* *
* Increment reference count to @ioc. * Increment reference count to @ioc.
*/ */
void get_io_context(struct io_context *ioc) static void get_io_context(struct io_context *ioc)
{ {
BUG_ON(atomic_long_read(&ioc->refcount) <= 0); BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
atomic_long_inc(&ioc->refcount); atomic_long_inc(&ioc->refcount);
@ -53,6 +56,16 @@ static void ioc_exit_icq(struct io_cq *icq)
icq->flags |= ICQ_EXITED; icq->flags |= ICQ_EXITED;
} }
static void ioc_exit_icqs(struct io_context *ioc)
{
struct io_cq *icq;
spin_lock_irq(&ioc->lock);
hlist_for_each_entry(icq, &ioc->icq_list, ioc_node)
ioc_exit_icq(icq);
spin_unlock_irq(&ioc->lock);
}
/* /*
* Release an icq. Called with ioc locked for blk-mq, and with both ioc * Release an icq. Called with ioc locked for blk-mq, and with both ioc
* and queue locked for legacy. * and queue locked for legacy.
@ -132,102 +145,22 @@ static void ioc_release_fn(struct work_struct *work)
kmem_cache_free(iocontext_cachep, ioc); kmem_cache_free(iocontext_cachep, ioc);
} }
/** /*
* put_io_context - put a reference of io_context * Releasing icqs requires reverse order double locking and we may already be
* @ioc: io_context to put * holding a queue_lock. Do it asynchronously from a workqueue.
*
* Decrement reference count of @ioc and release it if the count reaches
* zero.
*/ */
void put_io_context(struct io_context *ioc) static bool ioc_delay_free(struct io_context *ioc)
{
unsigned long flags;
bool free_ioc = false;
if (ioc == NULL)
return;
BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
/*
* Releasing ioc requires reverse order double locking and we may
* already be holding a queue_lock. Do it asynchronously from wq.
*/
if (atomic_long_dec_and_test(&ioc->refcount)) {
spin_lock_irqsave(&ioc->lock, flags);
if (!hlist_empty(&ioc->icq_list))
queue_work(system_power_efficient_wq,
&ioc->release_work);
else
free_ioc = true;
spin_unlock_irqrestore(&ioc->lock, flags);
}
if (free_ioc)
kmem_cache_free(iocontext_cachep, ioc);
}
/**
* put_io_context_active - put active reference on ioc
* @ioc: ioc of interest
*
* Undo get_io_context_active(). If active reference reaches zero after
* put, @ioc can never issue further IOs and ioscheds are notified.
*/
void put_io_context_active(struct io_context *ioc)
{
struct io_cq *icq;
if (!atomic_dec_and_test(&ioc->active_ref)) {
put_io_context(ioc);
return;
}
spin_lock_irq(&ioc->lock);
hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) {
if (icq->flags & ICQ_EXITED)
continue;
ioc_exit_icq(icq);
}
spin_unlock_irq(&ioc->lock);
put_io_context(ioc);
}
/* Called by the exiting task */
void exit_io_context(struct task_struct *task)
{
struct io_context *ioc;
task_lock(task);
ioc = task->io_context;
task->io_context = NULL;
task_unlock(task);
atomic_dec(&ioc->nr_tasks);
put_io_context_active(ioc);
}
static void __ioc_clear_queue(struct list_head *icq_list)
{ {
unsigned long flags; unsigned long flags;
rcu_read_lock(); spin_lock_irqsave(&ioc->lock, flags);
while (!list_empty(icq_list)) { if (!hlist_empty(&ioc->icq_list)) {
struct io_cq *icq = list_entry(icq_list->next, queue_work(system_power_efficient_wq, &ioc->release_work);
struct io_cq, q_node);
struct io_context *ioc = icq->ioc;
spin_lock_irqsave(&ioc->lock, flags);
if (icq->flags & ICQ_DESTROYED) {
spin_unlock_irqrestore(&ioc->lock, flags);
continue;
}
ioc_destroy_icq(icq);
spin_unlock_irqrestore(&ioc->lock, flags); spin_unlock_irqrestore(&ioc->lock, flags);
return true;
} }
rcu_read_unlock(); spin_unlock_irqrestore(&ioc->lock, flags);
return false;
} }
/** /**
@ -244,93 +177,156 @@ void ioc_clear_queue(struct request_queue *q)
list_splice_init(&q->icq_list, &icq_list); list_splice_init(&q->icq_list, &icq_list);
spin_unlock_irq(&q->queue_lock); spin_unlock_irq(&q->queue_lock);
__ioc_clear_queue(&icq_list); rcu_read_lock();
} while (!list_empty(&icq_list)) {
struct io_cq *icq =
list_entry(icq_list.next, struct io_cq, q_node);
int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node) spin_lock_irq(&icq->ioc->lock);
if (!(icq->flags & ICQ_DESTROYED))
ioc_destroy_icq(icq);
spin_unlock_irq(&icq->ioc->lock);
}
rcu_read_unlock();
}
#else /* CONFIG_BLK_ICQ */
static inline void ioc_exit_icqs(struct io_context *ioc)
{
}
static inline bool ioc_delay_free(struct io_context *ioc)
{
return false;
}
#endif /* CONFIG_BLK_ICQ */
/**
* put_io_context - put a reference of io_context
* @ioc: io_context to put
*
* Decrement reference count of @ioc and release it if the count reaches
* zero.
*/
void put_io_context(struct io_context *ioc)
{
BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
if (atomic_long_dec_and_test(&ioc->refcount) && !ioc_delay_free(ioc))
kmem_cache_free(iocontext_cachep, ioc);
}
EXPORT_SYMBOL_GPL(put_io_context);
/* Called by the exiting task */
void exit_io_context(struct task_struct *task)
{
struct io_context *ioc;
task_lock(task);
ioc = task->io_context;
task->io_context = NULL;
task_unlock(task);
if (atomic_dec_and_test(&ioc->active_ref)) {
ioc_exit_icqs(ioc);
put_io_context(ioc);
}
}
static struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
{ {
struct io_context *ioc; struct io_context *ioc;
int ret;
ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO, ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO,
node); node);
if (unlikely(!ioc)) if (unlikely(!ioc))
return -ENOMEM; return NULL;
/* initialize */
atomic_long_set(&ioc->refcount, 1); atomic_long_set(&ioc->refcount, 1);
atomic_set(&ioc->nr_tasks, 1);
atomic_set(&ioc->active_ref, 1); atomic_set(&ioc->active_ref, 1);
#ifdef CONFIG_BLK_ICQ
spin_lock_init(&ioc->lock); spin_lock_init(&ioc->lock);
INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC); INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC);
INIT_HLIST_HEAD(&ioc->icq_list); INIT_HLIST_HEAD(&ioc->icq_list);
INIT_WORK(&ioc->release_work, ioc_release_fn); INIT_WORK(&ioc->release_work, ioc_release_fn);
#endif
return ioc;
}
int set_task_ioprio(struct task_struct *task, int ioprio)
{
int err;
const struct cred *cred = current_cred(), *tcred;
rcu_read_lock();
tcred = __task_cred(task);
if (!uid_eq(tcred->uid, cred->euid) &&
!uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) {
rcu_read_unlock();
return -EPERM;
}
rcu_read_unlock();
err = security_task_setioprio(task, ioprio);
if (err)
return err;
task_lock(task);
if (unlikely(!task->io_context)) {
struct io_context *ioc;
task_unlock(task);
ioc = alloc_io_context(GFP_ATOMIC, NUMA_NO_NODE);
if (!ioc)
return -ENOMEM;
task_lock(task);
if (task->flags & PF_EXITING) {
err = -ESRCH;
kmem_cache_free(iocontext_cachep, ioc);
goto out;
}
if (task->io_context)
kmem_cache_free(iocontext_cachep, ioc);
else
task->io_context = ioc;
}
task->io_context->ioprio = ioprio;
out:
task_unlock(task);
return err;
}
EXPORT_SYMBOL_GPL(set_task_ioprio);
int __copy_io(unsigned long clone_flags, struct task_struct *tsk)
{
struct io_context *ioc = current->io_context;
/* /*
* Try to install. ioc shouldn't be installed if someone else * Share io context with parent, if CLONE_IO is set
* already did or @task, which isn't %current, is exiting. Note
* that we need to allow ioc creation on exiting %current as exit
* path may issue IOs from e.g. exit_files(). The exit path is
* responsible for not issuing IO after exit_io_context().
*/ */
task_lock(task); if (clone_flags & CLONE_IO) {
if (!task->io_context && atomic_inc(&ioc->active_ref);
(task == current || !(task->flags & PF_EXITING))) tsk->io_context = ioc;
task->io_context = ioc; } else if (ioprio_valid(ioc->ioprio)) {
else tsk->io_context = alloc_io_context(GFP_KERNEL, NUMA_NO_NODE);
kmem_cache_free(iocontext_cachep, ioc); if (!tsk->io_context)
return -ENOMEM;
tsk->io_context->ioprio = ioc->ioprio;
}
ret = task->io_context ? 0 : -EBUSY; return 0;
task_unlock(task);
return ret;
}
/**
* get_task_io_context - get io_context of a task
* @task: task of interest
* @gfp_flags: allocation flags, used if allocation is necessary
* @node: allocation node, used if allocation is necessary
*
* Return io_context of @task. If it doesn't exist, it is created with
* @gfp_flags and @node. The returned io_context has its reference count
* incremented.
*
* This function always goes through task_lock() and it's better to use
* %current->io_context + get_io_context() for %current.
*/
struct io_context *get_task_io_context(struct task_struct *task,
gfp_t gfp_flags, int node)
{
struct io_context *ioc;
might_sleep_if(gfpflags_allow_blocking(gfp_flags));
do {
task_lock(task);
ioc = task->io_context;
if (likely(ioc)) {
get_io_context(ioc);
task_unlock(task);
return ioc;
}
task_unlock(task);
} while (!create_task_io_context(task, gfp_flags, node));
return NULL;
} }
#ifdef CONFIG_BLK_ICQ
/** /**
* ioc_lookup_icq - lookup io_cq from ioc * ioc_lookup_icq - lookup io_cq from ioc
* @ioc: the associated io_context
* @q: the associated request_queue * @q: the associated request_queue
* *
* Look up io_cq associated with @ioc - @q pair from @ioc. Must be called * Look up io_cq associated with @ioc - @q pair from @ioc. Must be called
* with @q->queue_lock held. * with @q->queue_lock held.
*/ */
struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q) struct io_cq *ioc_lookup_icq(struct request_queue *q)
{ {
struct io_context *ioc = current->io_context;
struct io_cq *icq; struct io_cq *icq;
lockdep_assert_held(&q->queue_lock); lockdep_assert_held(&q->queue_lock);
@ -359,9 +355,7 @@ EXPORT_SYMBOL(ioc_lookup_icq);
/** /**
* ioc_create_icq - create and link io_cq * ioc_create_icq - create and link io_cq
* @ioc: io_context of interest
* @q: request_queue of interest * @q: request_queue of interest
* @gfp_mask: allocation mask
* *
* Make sure io_cq linking @ioc and @q exists. If icq doesn't exist, they * Make sure io_cq linking @ioc and @q exists. If icq doesn't exist, they
* will be created using @gfp_mask. * will be created using @gfp_mask.
@ -369,19 +363,19 @@ EXPORT_SYMBOL(ioc_lookup_icq);
* The caller is responsible for ensuring @ioc won't go away and @q is * The caller is responsible for ensuring @ioc won't go away and @q is
* alive and will stay alive until this function returns. * alive and will stay alive until this function returns.
*/ */
struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, static struct io_cq *ioc_create_icq(struct request_queue *q)
gfp_t gfp_mask)
{ {
struct io_context *ioc = current->io_context;
struct elevator_type *et = q->elevator->type; struct elevator_type *et = q->elevator->type;
struct io_cq *icq; struct io_cq *icq;
/* allocate stuff */ /* allocate stuff */
icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO, icq = kmem_cache_alloc_node(et->icq_cache, GFP_ATOMIC | __GFP_ZERO,
q->node); q->node);
if (!icq) if (!icq)
return NULL; return NULL;
if (radix_tree_maybe_preload(gfp_mask) < 0) { if (radix_tree_maybe_preload(GFP_ATOMIC) < 0) {
kmem_cache_free(et->icq_cache, icq); kmem_cache_free(et->icq_cache, icq);
return NULL; return NULL;
} }
@ -402,7 +396,7 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
et->ops.init_icq(icq); et->ops.init_icq(icq);
} else { } else {
kmem_cache_free(et->icq_cache, icq); kmem_cache_free(et->icq_cache, icq);
icq = ioc_lookup_icq(ioc, q); icq = ioc_lookup_icq(q);
if (!icq) if (!icq)
printk(KERN_ERR "cfq: icq link failed!\n"); printk(KERN_ERR "cfq: icq link failed!\n");
} }
@ -413,6 +407,46 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
return icq; return icq;
} }
struct io_cq *ioc_find_get_icq(struct request_queue *q)
{
struct io_context *ioc = current->io_context;
struct io_cq *icq = NULL;
if (unlikely(!ioc)) {
ioc = alloc_io_context(GFP_ATOMIC, q->node);
if (!ioc)
return NULL;
task_lock(current);
if (current->io_context) {
kmem_cache_free(iocontext_cachep, ioc);
ioc = current->io_context;
} else {
current->io_context = ioc;
}
get_io_context(ioc);
task_unlock(current);
} else {
get_io_context(ioc);
spin_lock_irq(&q->queue_lock);
icq = ioc_lookup_icq(q);
spin_unlock_irq(&q->queue_lock);
}
if (!icq) {
icq = ioc_create_icq(q);
if (!icq) {
put_io_context(ioc);
return NULL;
}
}
return icq;
}
EXPORT_SYMBOL_GPL(ioc_find_get_icq);
#endif /* CONFIG_BLK_ICQ */
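With the icq lookup and creation folded into ioc_find_get_icq() above, an elevator that needs a per-task context can obtain one with a single call from its prepare-request path. A hedged sketch follows; the hook name is illustrative, and only the helper, blk_rq_is_passthrough(), and the rq->elv.icq field come from the diff.

/* Sketch of a scheduler prepare hook using the new helper; on success the
 * icq reference taken by ioc_find_get_icq() is handed over to the request. */
static void example_prepare_request(struct request *rq)
{
        struct io_cq *icq;

        if (blk_rq_is_passthrough(rq))
                return;         /* passthrough requests carry no io_context */
        icq = ioc_find_get_icq(rq->q);
        if (icq)
                rq->elv.icq = icq;
}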
static int __init blk_ioc_init(void) static int __init blk_ioc_init(void)
{ {
iocontext_cachep = kmem_cache_create("blkdev_ioc", iocontext_cachep = kmem_cache_create("blkdev_ioc",


@ -74,6 +74,7 @@
#include <linux/sched/signal.h> #include <linux/sched/signal.h>
#include <trace/events/block.h> #include <trace/events/block.h>
#include <linux/blk-mq.h> #include <linux/blk-mq.h>
#include <linux/blk-cgroup.h>
#include "blk-rq-qos.h" #include "blk-rq-qos.h"
#include "blk-stat.h" #include "blk-stat.h"
#include "blk.h" #include "blk.h"


@ -62,6 +62,7 @@ struct ioprio_blkg {
struct ioprio_blkcg { struct ioprio_blkcg {
struct blkcg_policy_data cpd; struct blkcg_policy_data cpd;
enum prio_policy prio_policy; enum prio_policy prio_policy;
bool prio_set;
}; };
static inline struct ioprio_blkg *pd_to_ioprio(struct blkg_policy_data *pd) static inline struct ioprio_blkg *pd_to_ioprio(struct blkg_policy_data *pd)
@ -112,7 +113,7 @@ static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf,
if (ret < 0) if (ret < 0)
return ret; return ret;
blkcg->prio_policy = ret; blkcg->prio_policy = ret;
blkcg->prio_set = true;
return nbytes; return nbytes;
} }
@ -190,6 +191,10 @@ static void blkcg_ioprio_track(struct rq_qos *rqos, struct request *rq,
struct bio *bio) struct bio *bio)
{ {
struct ioprio_blkcg *blkcg = ioprio_blkcg_from_bio(bio); struct ioprio_blkcg *blkcg = ioprio_blkcg_from_bio(bio);
u16 prio;
if (!blkcg->prio_set)
return;
/* /*
* Except for IOPRIO_CLASS_NONE, higher I/O priority numbers * Except for IOPRIO_CLASS_NONE, higher I/O priority numbers
@ -199,8 +204,10 @@ static void blkcg_ioprio_track(struct rq_qos *rqos, struct request *rq,
* bio I/O priority is not modified. If the bio I/O priority equals * bio I/O priority is not modified. If the bio I/O priority equals
* IOPRIO_CLASS_NONE, the cgroup I/O priority is assigned to the bio. * IOPRIO_CLASS_NONE, the cgroup I/O priority is assigned to the bio.
*/ */
bio->bi_ioprio = max_t(u16, bio->bi_ioprio, prio = max_t(u16, bio->bi_ioprio,
IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0)); IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0));
if (prio > bio->bi_ioprio)
bio->bi_ioprio = prio;
} }
static void blkcg_ioprio_exit(struct rq_qos *rqos) static void blkcg_ioprio_exit(struct rq_qos *rqos)
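The tracking hook above now skips bios entirely when the cgroup never configured a policy, and only ever raises the numeric priority value. Restated as a hedged standalone helper (everything other than the fields and macros shown above is illustrative):

/* Sketch: apply the cgroup policy only if one was set, and never lower the
 * bio's existing numeric priority value. */
static void example_apply_policy(struct bio *bio, struct ioprio_blkcg *blkcg)
{
        u16 prio;

        if (!blkcg->prio_set)
                return;
        prio = max_t(u16, bio->bi_ioprio,
                     IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0));
        if (prio > bio->bi_ioprio)
                bio->bi_ioprio = prio;
}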


@ -6,12 +6,47 @@
#include <linux/module.h> #include <linux/module.h>
#include <linux/bio.h> #include <linux/bio.h>
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/scatterlist.h> #include <linux/scatterlist.h>
#include <linux/part_stat.h>
#include <trace/events/block.h> #include <trace/events/block.h>
#include "blk.h" #include "blk.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h" #include "blk-rq-qos.h"
#include "blk-throttle.h"
static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv)
{
*bv = mp_bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
}
static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv)
{
struct bvec_iter iter = bio->bi_iter;
int idx;
bio_get_first_bvec(bio, bv);
if (bv->bv_len == bio->bi_iter.bi_size)
return; /* this bio only has a single bvec */
bio_advance_iter(bio, &iter, iter.bi_size);
if (!iter.bi_bvec_done)
idx = iter.bi_idx - 1;
else /* in the middle of bvec */
idx = iter.bi_idx;
*bv = bio->bi_io_vec[idx];
/*
* iter.bi_bvec_done records actual length of the last bvec
* if this bio ends in the middle of one io vector
*/
if (iter.bi_bvec_done)
bv->bv_len = iter.bi_bvec_done;
}
static inline bool bio_will_gap(struct request_queue *q, static inline bool bio_will_gap(struct request_queue *q,
struct request *prev_rq, struct bio *prev, struct bio *next) struct request *prev_rq, struct bio *prev, struct bio *next)
@ -285,13 +320,13 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
* iopoll in direct IO routine. Given performance gain of iopoll for * iopoll in direct IO routine. Given performance gain of iopoll for
* big IO can be trival, disable iopoll when split needed. * big IO can be trival, disable iopoll when split needed.
*/ */
bio_clear_hipri(bio); bio_clear_polled(bio);
return bio_split(bio, sectors, GFP_NOIO, bs); return bio_split(bio, sectors, GFP_NOIO, bs);
} }
/** /**
* __blk_queue_split - split a bio and submit the second half * __blk_queue_split - split a bio and submit the second half
* @q: [in] request_queue new bio is being queued at
* @bio: [in, out] bio to be split * @bio: [in, out] bio to be split
* @nr_segs: [out] number of segments in the first bio * @nr_segs: [out] number of segments in the first bio
* *
@ -302,9 +337,9 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
* of the caller to ensure that q->bio_split is only released after processing * of the caller to ensure that q->bio_split is only released after processing
* of the split bio has finished. * of the split bio has finished.
*/ */
void __blk_queue_split(struct bio **bio, unsigned int *nr_segs) void __blk_queue_split(struct request_queue *q, struct bio **bio,
unsigned int *nr_segs)
{ {
struct request_queue *q = (*bio)->bi_bdev->bd_disk->queue;
struct bio *split = NULL; struct bio *split = NULL;
switch (bio_op(*bio)) { switch (bio_op(*bio)) {
@ -321,21 +356,6 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
nr_segs); nr_segs);
break; break;
default: default:
/*
* All drivers must accept single-segments bios that are <=
* PAGE_SIZE. This is a quick and dirty check that relies on
* the fact that bi_io_vec[0] is always valid if a bio has data.
* The check might lead to occasional false negatives when bios
* are cloned, but compared to the performance impact of cloned
* bios themselves the loop below doesn't matter anyway.
*/
if (!q->limits.chunk_sectors &&
(*bio)->bi_vcnt == 1 &&
((*bio)->bi_io_vec[0].bv_len +
(*bio)->bi_io_vec[0].bv_offset) <= PAGE_SIZE) {
*nr_segs = 1;
break;
}
split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs); split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs);
break; break;
} }
@ -365,9 +385,11 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
*/ */
void blk_queue_split(struct bio **bio) void blk_queue_split(struct bio **bio)
{ {
struct request_queue *q = bdev_get_queue((*bio)->bi_bdev);
unsigned int nr_segs; unsigned int nr_segs;
__blk_queue_split(bio, &nr_segs); if (blk_may_split(q, *bio))
__blk_queue_split(q, bio, &nr_segs);
} }
EXPORT_SYMBOL(blk_queue_split); EXPORT_SYMBOL(blk_queue_split);
@ -558,6 +580,23 @@ static inline unsigned int blk_rq_get_max_segments(struct request *rq)
return queue_max_segments(rq->q); return queue_max_segments(rq->q);
} }
static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
sector_t offset)
{
struct request_queue *q = rq->q;
if (blk_rq_is_passthrough(rq))
return q->limits.max_hw_sectors;
if (!q->limits.chunk_sectors ||
req_op(rq) == REQ_OP_DISCARD ||
req_op(rq) == REQ_OP_SECURE_ERASE)
return blk_queue_get_max_sectors(q, req_op(rq));
return min(blk_max_size_offset(q, offset, 0),
blk_queue_get_max_sectors(q, req_op(rq)));
}
static inline int ll_new_hw_segment(struct request *req, struct bio *bio, static inline int ll_new_hw_segment(struct request *req, struct bio *bio,
unsigned int nr_phys_segs) unsigned int nr_phys_segs)
{ {
@ -718,6 +757,13 @@ static enum elv_merge blk_try_req_merge(struct request *req,
return ELEVATOR_NO_MERGE; return ELEVATOR_NO_MERGE;
} }
static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b)
{
if (bio_page(a) == bio_page(b) && bio_offset(a) == bio_offset(b))
return true;
return false;
}
/* /*
* For non-mq, this has to be called with the request spinlock acquired. * For non-mq, this has to be called with the request spinlock acquired.
* For mq with scheduling, the appropriate queue wide lock should be held. * For mq with scheduling, the appropriate queue wide lock should be held.
@ -731,8 +777,7 @@ static struct request *attempt_merge(struct request_queue *q,
if (req_op(req) != req_op(next)) if (req_op(req) != req_op(next))
return NULL; return NULL;
if (rq_data_dir(req) != rq_data_dir(next) if (rq_data_dir(req) != rq_data_dir(next))
|| req->rq_disk != next->rq_disk)
return NULL; return NULL;
if (req_op(req) == REQ_OP_WRITE_SAME && if (req_op(req) == REQ_OP_WRITE_SAME &&
@ -859,10 +904,6 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
if (bio_data_dir(bio) != rq_data_dir(rq)) if (bio_data_dir(bio) != rq_data_dir(rq))
return false; return false;
/* must be same device */
if (rq->rq_disk != bio->bi_bdev->bd_disk)
return false;
/* only merge integrity protected bio into ditto rq */ /* only merge integrity protected bio into ditto rq */
if (blk_integrity_merge_bio(rq->q, rq, bio) == false) if (blk_integrity_merge_bio(rq->q, rq, bio) == false)
return false; return false;
@ -1023,12 +1064,10 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
* @q: request_queue new bio is being queued at * @q: request_queue new bio is being queued at
* @bio: new bio being queued * @bio: new bio being queued
* @nr_segs: number of segments in @bio * @nr_segs: number of segments in @bio
* @same_queue_rq: pointer to &struct request that gets filled in when * from the passed in @q already in the plug list
* another request associated with @q is found on the plug list
* (optional, may be %NULL)
* *
* Determine whether @bio being queued on @q can be merged with a request * Determine whether @bio being queued on @q can be merged with the previous
* on %current's plugged list. Returns %true if merge was successful, * request on %current's plugged list. Returns %true if merge was successful,
* otherwise %false. * otherwise %false.
* *
* Plugging coalesces IOs from the same issuer for the same purpose without * Plugging coalesces IOs from the same issuer for the same purpose without
@ -1041,36 +1080,22 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
* Caller must ensure !blk_queue_nomerges(q) beforehand. * Caller must ensure !blk_queue_nomerges(q) beforehand.
*/ */
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs, struct request **same_queue_rq) unsigned int nr_segs)
{ {
struct blk_plug *plug; struct blk_plug *plug;
struct request *rq; struct request *rq;
struct list_head *plug_list;
plug = blk_mq_plug(q, bio); plug = blk_mq_plug(q, bio);
if (!plug) if (!plug || rq_list_empty(plug->mq_list))
return false; return false;
plug_list = &plug->mq_list; /* check the previously added entry for a quick merge attempt */
rq = rq_list_peek(&plug->mq_list);
list_for_each_entry_reverse(rq, plug_list, queuelist) { if (rq->q == q) {
if (rq->q == q && same_queue_rq) {
/*
* Only blk-mq multiple hardware queues case checks the
* rq in the same queue, there should be only one such
* rq in a queue
**/
*same_queue_rq = rq;
}
if (rq->q != q)
continue;
if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
BIO_MERGE_OK) BIO_MERGE_OK)
return true; return true;
} }
return false; return false;
} }
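The simplification above relies on the plug list now being kept as a singly linked rq_list, so only the most recently added request is worth checking as a merge candidate. A hedged sketch of the resulting control flow (helper names and types are taken from the diff; the wrapper itself is illustrative):

/* Sketch only: attempt a merge against the last request added to the plug. */
static bool example_plug_merge(struct blk_plug *plug, struct request_queue *q,
                               struct bio *bio, unsigned int nr_segs)
{
        struct request *rq;

        if (!plug || rq_list_empty(plug->mq_list))
                return false;
        rq = rq_list_peek(&plug->mq_list);
        if (rq->q != q)
                return false;
        return blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == BIO_MERGE_OK;
}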


@ -11,6 +11,7 @@
#include "blk.h" #include "blk.h"
#include "blk-mq.h" #include "blk-mq.h"
#include "blk-mq-debugfs.h" #include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h" #include "blk-mq-tag.h"
#include "blk-rq-qos.h" #include "blk-rq-qos.h"
@ -29,6 +30,9 @@ static int queue_poll_stat_show(void *data, struct seq_file *m)
struct request_queue *q = data; struct request_queue *q = data;
int bucket; int bucket;
if (!q->poll_stat)
return 0;
for (bucket = 0; bucket < (BLK_MQ_POLL_STATS_BKTS / 2); bucket++) { for (bucket = 0; bucket < (BLK_MQ_POLL_STATS_BKTS / 2); bucket++) {
seq_printf(m, "read (%d Bytes): ", 1 << (9 + bucket)); seq_printf(m, "read (%d Bytes): ", 1 << (9 + bucket));
print_stat(m, &q->poll_stat[2 * bucket]); print_stat(m, &q->poll_stat[2 * bucket]);
@ -122,9 +126,7 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(FUA), QUEUE_FLAG_NAME(FUA),
QUEUE_FLAG_NAME(DAX), QUEUE_FLAG_NAME(DAX),
QUEUE_FLAG_NAME(STATS), QUEUE_FLAG_NAME(STATS),
QUEUE_FLAG_NAME(POLL_STATS),
QUEUE_FLAG_NAME(REGISTERED), QUEUE_FLAG_NAME(REGISTERED),
QUEUE_FLAG_NAME(SCSI_PASSTHROUGH),
QUEUE_FLAG_NAME(QUIESCED), QUEUE_FLAG_NAME(QUIESCED),
QUEUE_FLAG_NAME(PCI_P2PDMA), QUEUE_FLAG_NAME(PCI_P2PDMA),
QUEUE_FLAG_NAME(ZONE_RESETALL), QUEUE_FLAG_NAME(ZONE_RESETALL),
@ -287,7 +289,7 @@ static const char *const cmd_flag_name[] = {
CMD_FLAG_NAME(BACKGROUND), CMD_FLAG_NAME(BACKGROUND),
CMD_FLAG_NAME(NOWAIT), CMD_FLAG_NAME(NOWAIT),
CMD_FLAG_NAME(NOUNMAP), CMD_FLAG_NAME(NOUNMAP),
CMD_FLAG_NAME(HIPRI), CMD_FLAG_NAME(POLLED),
}; };
#undef CMD_FLAG_NAME #undef CMD_FLAG_NAME
@ -309,6 +311,7 @@ static const char *const rqf_name[] = {
RQF_NAME(SPECIAL_PAYLOAD), RQF_NAME(SPECIAL_PAYLOAD),
RQF_NAME(ZONE_WRITE_LOCKED), RQF_NAME(ZONE_WRITE_LOCKED),
RQF_NAME(MQ_POLL_SLEPT), RQF_NAME(MQ_POLL_SLEPT),
RQF_NAME(ELV),
}; };
#undef RQF_NAME #undef RQF_NAME
@ -453,11 +456,11 @@ static void blk_mq_debugfs_tags_show(struct seq_file *m,
atomic_read(&tags->active_queues)); atomic_read(&tags->active_queues));
seq_puts(m, "\nbitmap_tags:\n"); seq_puts(m, "\nbitmap_tags:\n");
sbitmap_queue_show(tags->bitmap_tags, m); sbitmap_queue_show(&tags->bitmap_tags, m);
if (tags->nr_reserved_tags) { if (tags->nr_reserved_tags) {
seq_puts(m, "\nbreserved_tags:\n"); seq_puts(m, "\nbreserved_tags:\n");
sbitmap_queue_show(tags->breserved_tags, m); sbitmap_queue_show(&tags->breserved_tags, m);
} }
} }
@ -488,7 +491,7 @@ static int hctx_tags_bitmap_show(void *data, struct seq_file *m)
if (res) if (res)
goto out; goto out;
if (hctx->tags) if (hctx->tags)
sbitmap_bitmap_show(&hctx->tags->bitmap_tags->sb, m); sbitmap_bitmap_show(&hctx->tags->bitmap_tags.sb, m);
mutex_unlock(&q->sysfs_lock); mutex_unlock(&q->sysfs_lock);
out: out:
@ -522,77 +525,13 @@ static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m)
if (res) if (res)
goto out; goto out;
if (hctx->sched_tags) if (hctx->sched_tags)
sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags->sb, m); sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags.sb, m);
mutex_unlock(&q->sysfs_lock); mutex_unlock(&q->sysfs_lock);
out: out:
return res; return res;
} }
static int hctx_io_poll_show(void *data, struct seq_file *m)
{
struct blk_mq_hw_ctx *hctx = data;
seq_printf(m, "considered=%lu\n", hctx->poll_considered);
seq_printf(m, "invoked=%lu\n", hctx->poll_invoked);
seq_printf(m, "success=%lu\n", hctx->poll_success);
return 0;
}
static ssize_t hctx_io_poll_write(void *data, const char __user *buf,
size_t count, loff_t *ppos)
{
struct blk_mq_hw_ctx *hctx = data;
hctx->poll_considered = hctx->poll_invoked = hctx->poll_success = 0;
return count;
}
static int hctx_dispatched_show(void *data, struct seq_file *m)
{
struct blk_mq_hw_ctx *hctx = data;
int i;
seq_printf(m, "%8u\t%lu\n", 0U, hctx->dispatched[0]);
for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER - 1; i++) {
unsigned int d = 1U << (i - 1);
seq_printf(m, "%8u\t%lu\n", d, hctx->dispatched[i]);
}
seq_printf(m, "%8u+\t%lu\n", 1U << (i - 1), hctx->dispatched[i]);
return 0;
}
static ssize_t hctx_dispatched_write(void *data, const char __user *buf,
size_t count, loff_t *ppos)
{
struct blk_mq_hw_ctx *hctx = data;
int i;
for (i = 0; i < BLK_MQ_MAX_DISPATCH_ORDER; i++)
hctx->dispatched[i] = 0;
return count;
}
static int hctx_queued_show(void *data, struct seq_file *m)
{
struct blk_mq_hw_ctx *hctx = data;
seq_printf(m, "%lu\n", hctx->queued);
return 0;
}
static ssize_t hctx_queued_write(void *data, const char __user *buf,
size_t count, loff_t *ppos)
{
struct blk_mq_hw_ctx *hctx = data;
hctx->queued = 0;
return count;
}
static int hctx_run_show(void *data, struct seq_file *m) static int hctx_run_show(void *data, struct seq_file *m)
{ {
struct blk_mq_hw_ctx *hctx = data; struct blk_mq_hw_ctx *hctx = data;
@ -614,7 +553,7 @@ static int hctx_active_show(void *data, struct seq_file *m)
{ {
struct blk_mq_hw_ctx *hctx = data; struct blk_mq_hw_ctx *hctx = data;
seq_printf(m, "%d\n", atomic_read(&hctx->nr_active)); seq_printf(m, "%d\n", __blk_mq_active_requests(hctx));
return 0; return 0;
} }
@ -663,57 +602,6 @@ CTX_RQ_SEQ_OPS(default, HCTX_TYPE_DEFAULT);
CTX_RQ_SEQ_OPS(read, HCTX_TYPE_READ); CTX_RQ_SEQ_OPS(read, HCTX_TYPE_READ);
CTX_RQ_SEQ_OPS(poll, HCTX_TYPE_POLL); CTX_RQ_SEQ_OPS(poll, HCTX_TYPE_POLL);
static int ctx_dispatched_show(void *data, struct seq_file *m)
{
struct blk_mq_ctx *ctx = data;
seq_printf(m, "%lu %lu\n", ctx->rq_dispatched[1], ctx->rq_dispatched[0]);
return 0;
}
static ssize_t ctx_dispatched_write(void *data, const char __user *buf,
size_t count, loff_t *ppos)
{
struct blk_mq_ctx *ctx = data;
ctx->rq_dispatched[0] = ctx->rq_dispatched[1] = 0;
return count;
}
static int ctx_merged_show(void *data, struct seq_file *m)
{
struct blk_mq_ctx *ctx = data;
seq_printf(m, "%lu\n", ctx->rq_merged);
return 0;
}
static ssize_t ctx_merged_write(void *data, const char __user *buf,
size_t count, loff_t *ppos)
{
struct blk_mq_ctx *ctx = data;
ctx->rq_merged = 0;
return count;
}
static int ctx_completed_show(void *data, struct seq_file *m)
{
struct blk_mq_ctx *ctx = data;
seq_printf(m, "%lu %lu\n", ctx->rq_completed[1], ctx->rq_completed[0]);
return 0;
}
static ssize_t ctx_completed_write(void *data, const char __user *buf,
size_t count, loff_t *ppos)
{
struct blk_mq_ctx *ctx = data;
ctx->rq_completed[0] = ctx->rq_completed[1] = 0;
return count;
}
static int blk_mq_debugfs_show(struct seq_file *m, void *v) static int blk_mq_debugfs_show(struct seq_file *m, void *v)
{ {
const struct blk_mq_debugfs_attr *attr = m->private; const struct blk_mq_debugfs_attr *attr = m->private;
@ -789,9 +677,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
{"tags_bitmap", 0400, hctx_tags_bitmap_show}, {"tags_bitmap", 0400, hctx_tags_bitmap_show},
{"sched_tags", 0400, hctx_sched_tags_show}, {"sched_tags", 0400, hctx_sched_tags_show},
{"sched_tags_bitmap", 0400, hctx_sched_tags_bitmap_show}, {"sched_tags_bitmap", 0400, hctx_sched_tags_bitmap_show},
{"io_poll", 0600, hctx_io_poll_show, hctx_io_poll_write},
{"dispatched", 0600, hctx_dispatched_show, hctx_dispatched_write},
{"queued", 0600, hctx_queued_show, hctx_queued_write},
{"run", 0600, hctx_run_show, hctx_run_write}, {"run", 0600, hctx_run_show, hctx_run_write},
{"active", 0400, hctx_active_show}, {"active", 0400, hctx_active_show},
{"dispatch_busy", 0400, hctx_dispatch_busy_show}, {"dispatch_busy", 0400, hctx_dispatch_busy_show},
@ -803,9 +688,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = {
{"default_rq_list", 0400, .seq_ops = &ctx_default_rq_list_seq_ops}, {"default_rq_list", 0400, .seq_ops = &ctx_default_rq_list_seq_ops},
{"read_rq_list", 0400, .seq_ops = &ctx_read_rq_list_seq_ops}, {"read_rq_list", 0400, .seq_ops = &ctx_read_rq_list_seq_ops},
{"poll_rq_list", 0400, .seq_ops = &ctx_poll_rq_list_seq_ops}, {"poll_rq_list", 0400, .seq_ops = &ctx_poll_rq_list_seq_ops},
{"dispatched", 0600, ctx_dispatched_show, ctx_dispatched_write},
{"merged", 0600, ctx_merged_show, ctx_merged_write},
{"completed", 0600, ctx_completed_show, ctx_completed_write},
{}, {},
}; };
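Several of the debugfs changes above fall out of bitmap_tags and breserved_tags becoming embedded struct members rather than pointers, so call sites now pass their address. A hedged illustration of the adjusted call pattern (the wrapper is not part of the patch):

/* Illustration only: dumping the now-embedded sbitmap_queue members. */
static void example_dump_tag_maps(struct blk_mq_tags *tags, struct seq_file *m)
{
        sbitmap_queue_show(&tags->bitmap_tags, m);      /* was tags->bitmap_tags */
        if (tags->nr_reserved_tags)
                sbitmap_queue_show(&tags->breserved_tags, m);
}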


@ -18,32 +18,6 @@
#include "blk-mq-tag.h" #include "blk-mq-tag.h"
#include "blk-wbt.h" #include "blk-wbt.h"
void blk_mq_sched_assign_ioc(struct request *rq)
{
struct request_queue *q = rq->q;
struct io_context *ioc;
struct io_cq *icq;
/*
* May not have an IO context if it's a passthrough request
*/
ioc = current->io_context;
if (!ioc)
return;
spin_lock_irq(&q->queue_lock);
icq = ioc_lookup_icq(ioc, q);
spin_unlock_irq(&q->queue_lock);
if (!icq) {
icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
if (!icq)
return;
}
get_io_context(icq->ioc);
rq->elv.icq = icq;
}
/* /*
* Mark a hardware queue as needing a restart. For shared queues, maintain * Mark a hardware queue as needing a restart. For shared queues, maintain
* a count of how many hardware queues are marked for restart. * a count of how many hardware queues are marked for restart.
@ -57,10 +31,8 @@ void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
} }
EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx); EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx);
void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
{ {
if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
return;
clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
/* /*
@ -363,7 +335,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
} }
} }
bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs) unsigned int nr_segs)
{ {
struct elevator_queue *e = q->elevator; struct elevator_queue *e = q->elevator;
@ -372,15 +344,17 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
bool ret = false; bool ret = false;
enum hctx_type type; enum hctx_type type;
if (e && e->type->ops.bio_merge) if (e && e->type->ops.bio_merge) {
return e->type->ops.bio_merge(q, bio, nr_segs); ret = e->type->ops.bio_merge(q, bio, nr_segs);
goto out_put;
}
ctx = blk_mq_get_ctx(q); ctx = blk_mq_get_ctx(q);
hctx = blk_mq_map_queue(q, bio->bi_opf, ctx); hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
type = hctx->type; type = hctx->type;
if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) || if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) ||
list_empty_careful(&ctx->rq_lists[type])) list_empty_careful(&ctx->rq_lists[type]))
return false; goto out_put;
/* default per sw-queue merge */ /* default per sw-queue merge */
spin_lock(&ctx->lock); spin_lock(&ctx->lock);
@ -389,13 +363,11 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
* potentially merge with. Currently includes a hand-wavy stop * potentially merge with. Currently includes a hand-wavy stop
* count of 8, to not spend too much time checking for merges. * count of 8, to not spend too much time checking for merges.
*/ */
if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) { if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs))
ctx->rq_merged++;
ret = true; ret = true;
}
spin_unlock(&ctx->lock); spin_unlock(&ctx->lock);
out_put:
return ret; return ret;
} }
@ -502,8 +474,9 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
* busy in case of 'none' scheduler, and this way may save * busy in case of 'none' scheduler, and this way may save
* us one extra enqueue & dequeue to sw queue. * us one extra enqueue & dequeue to sw queue.
*/ */
if (!hctx->dispatch_busy && !e && !run_queue_async) { if (!hctx->dispatch_busy && !run_queue_async) {
blk_mq_try_issue_list_directly(hctx, list); blk_mq_run_dispatch_ops(hctx->queue,
blk_mq_try_issue_list_directly(hctx, list));
if (list_empty(list)) if (list_empty(list))
goto out; goto out;
} }
@ -515,83 +488,71 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
percpu_ref_put(&q->q_usage_counter); percpu_ref_put(&q->q_usage_counter);
} }
static int blk_mq_sched_alloc_tags(struct request_queue *q, static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q,
struct blk_mq_hw_ctx *hctx, struct blk_mq_hw_ctx *hctx,
unsigned int hctx_idx) unsigned int hctx_idx)
{ {
struct blk_mq_tag_set *set = q->tag_set; if (blk_mq_is_shared_tags(q->tag_set->flags)) {
int ret; hctx->sched_tags = q->sched_shared_tags;
return 0;
hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
set->reserved_tags, set->flags);
if (!hctx->sched_tags)
return -ENOMEM;
ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
if (ret) {
blk_mq_free_rq_map(hctx->sched_tags, set->flags);
hctx->sched_tags = NULL;
} }
return ret; hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx,
q->nr_requests);
if (!hctx->sched_tags)
return -ENOMEM;
return 0;
}
static void blk_mq_exit_sched_shared_tags(struct request_queue *queue)
{
blk_mq_free_rq_map(queue->sched_shared_tags);
queue->sched_shared_tags = NULL;
} }
/* called in queue's release handler, tagset has gone away */ /* called in queue's release handler, tagset has gone away */
static void blk_mq_sched_tags_teardown(struct request_queue *q) static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags)
{ {
struct blk_mq_hw_ctx *hctx; struct blk_mq_hw_ctx *hctx;
int i; int i;
queue_for_each_hw_ctx(q, hctx, i) { queue_for_each_hw_ctx(q, hctx, i) {
if (hctx->sched_tags) { if (hctx->sched_tags) {
blk_mq_free_rq_map(hctx->sched_tags, hctx->flags); if (!blk_mq_is_shared_tags(flags))
blk_mq_free_rq_map(hctx->sched_tags);
hctx->sched_tags = NULL; hctx->sched_tags = NULL;
} }
} }
if (blk_mq_is_shared_tags(flags))
blk_mq_exit_sched_shared_tags(q);
} }
static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue) static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
{ {
struct blk_mq_tag_set *set = queue->tag_set; struct blk_mq_tag_set *set = queue->tag_set;
int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
struct blk_mq_hw_ctx *hctx;
int ret, i;
/* /*
* Set initial depth at max so that we don't need to reallocate for * Set initial depth at max so that we don't need to reallocate for
* updating nr_requests. * updating nr_requests.
*/ */
ret = blk_mq_init_bitmaps(&queue->sched_bitmap_tags, queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set,
&queue->sched_breserved_tags, BLK_MQ_NO_HCTX_IDX,
MAX_SCHED_RQ, set->reserved_tags, MAX_SCHED_RQ);
set->numa_node, alloc_policy); if (!queue->sched_shared_tags)
if (ret) return -ENOMEM;
return ret;
queue_for_each_hw_ctx(queue, hctx, i) { blk_mq_tag_update_sched_shared_tags(queue);
hctx->sched_tags->bitmap_tags =
&queue->sched_bitmap_tags;
hctx->sched_tags->breserved_tags =
&queue->sched_breserved_tags;
}
sbitmap_queue_resize(&queue->sched_bitmap_tags,
queue->nr_requests - set->reserved_tags);
return 0; return 0;
} }
static void blk_mq_exit_sched_shared_sbitmap(struct request_queue *queue)
{
sbitmap_queue_free(&queue->sched_bitmap_tags);
sbitmap_queue_free(&queue->sched_breserved_tags);
}
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{ {
unsigned int i, flags = q->tag_set->flags;
struct blk_mq_hw_ctx *hctx; struct blk_mq_hw_ctx *hctx;
struct elevator_queue *eq; struct elevator_queue *eq;
unsigned int i;
int ret; int ret;
if (!e) { if (!e) {
@ -606,23 +567,23 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
* Additionally, this is a per-hw queue depth. * Additionally, this is a per-hw queue depth.
*/ */
q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth, q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
BLKDEV_MAX_RQ); BLKDEV_DEFAULT_RQ);
queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_is_shared_tags(flags)) {
ret = blk_mq_sched_alloc_tags(q, hctx, i); ret = blk_mq_init_sched_shared_tags(q);
if (ret) if (ret)
goto err_free_tags; return ret;
} }
if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) { queue_for_each_hw_ctx(q, hctx, i) {
ret = blk_mq_init_sched_shared_sbitmap(q); ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i);
if (ret) if (ret)
goto err_free_tags; goto err_free_map_and_rqs;
} }
ret = e->ops.init_sched(q, e); ret = e->ops.init_sched(q, e);
if (ret) if (ret)
goto err_free_sbitmap; goto err_free_map_and_rqs;
blk_mq_debugfs_register_sched(q); blk_mq_debugfs_register_sched(q);
@ -631,7 +592,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
ret = e->ops.init_hctx(hctx, i); ret = e->ops.init_hctx(hctx, i);
if (ret) { if (ret) {
eq = q->elevator; eq = q->elevator;
blk_mq_sched_free_requests(q); blk_mq_sched_free_rqs(q);
blk_mq_exit_sched(q, eq); blk_mq_exit_sched(q, eq);
kobject_put(&eq->kobj); kobject_put(&eq->kobj);
return ret; return ret;
@ -642,12 +603,10 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
return 0; return 0;
err_free_sbitmap: err_free_map_and_rqs:
if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) blk_mq_sched_free_rqs(q);
blk_mq_exit_sched_shared_sbitmap(q); blk_mq_sched_tags_teardown(q, flags);
err_free_tags:
blk_mq_sched_free_requests(q);
blk_mq_sched_tags_teardown(q);
q->elevator = NULL; q->elevator = NULL;
return ret; return ret;
} }
@ -656,14 +615,20 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
* called in either blk_queue_cleanup or elevator_switch, tagset * called in either blk_queue_cleanup or elevator_switch, tagset
* is required for freeing requests * is required for freeing requests
*/ */
void blk_mq_sched_free_requests(struct request_queue *q) void blk_mq_sched_free_rqs(struct request_queue *q)
{ {
struct blk_mq_hw_ctx *hctx; struct blk_mq_hw_ctx *hctx;
int i; int i;
queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_is_shared_tags(q->tag_set->flags)) {
if (hctx->sched_tags) blk_mq_free_rqs(q->tag_set, q->sched_shared_tags,
blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i); BLK_MQ_NO_HCTX_IDX);
} else {
queue_for_each_hw_ctx(q, hctx, i) {
if (hctx->sched_tags)
blk_mq_free_rqs(q->tag_set,
hctx->sched_tags, i);
}
} }
} }
@ -684,8 +649,6 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
blk_mq_debugfs_unregister_sched(q); blk_mq_debugfs_unregister_sched(q);
if (e->type->ops.exit_sched) if (e->type->ops.exit_sched)
e->type->ops.exit_sched(e); e->type->ops.exit_sched(e);
blk_mq_sched_tags_teardown(q); blk_mq_sched_tags_teardown(q, flags);
if (blk_mq_is_sbitmap_shared(flags))
blk_mq_exit_sched_shared_sbitmap(q);
q->elevator = NULL; q->elevator = NULL;
} }


@ -2,21 +2,20 @@
#ifndef BLK_MQ_SCHED_H #ifndef BLK_MQ_SCHED_H
#define BLK_MQ_SCHED_H #define BLK_MQ_SCHED_H
#include "elevator.h"
#include "blk-mq.h" #include "blk-mq.h"
#include "blk-mq-tag.h" #include "blk-mq-tag.h"
#define MAX_SCHED_RQ (16 * BLKDEV_MAX_RQ) #define MAX_SCHED_RQ (16 * BLKDEV_DEFAULT_RQ)
void blk_mq_sched_assign_ioc(struct request *rq);
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs, struct request **merged_request); unsigned int nr_segs, struct request **merged_request);
bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs); unsigned int nr_segs);
bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
struct list_head *free); struct list_head *free);
void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx);
void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
void blk_mq_sched_insert_request(struct request *rq, bool at_head, void blk_mq_sched_insert_request(struct request *rq, bool at_head,
bool run_queue, bool async); bool run_queue, bool async);
@ -28,45 +27,51 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e); int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e);
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
void blk_mq_sched_free_requests(struct request_queue *q); void blk_mq_sched_free_rqs(struct request_queue *q);
static inline bool static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs)
{ {
if (blk_queue_nomerges(q) || !bio_mergeable(bio)) if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
return false; __blk_mq_sched_restart(hctx);
}
return __blk_mq_sched_bio_merge(q, bio, nr_segs); static inline bool bio_mergeable(struct bio *bio)
{
return !(bio->bi_opf & REQ_NOMERGE_FLAGS);
} }
static inline bool static inline bool
blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq, blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
struct bio *bio) struct bio *bio)
{ {
struct elevator_queue *e = q->elevator; if (rq->rq_flags & RQF_ELV) {
struct elevator_queue *e = q->elevator;
if (e && e->type->ops.allow_merge)
return e->type->ops.allow_merge(q, rq, bio);
if (e->type->ops.allow_merge)
return e->type->ops.allow_merge(q, rq, bio);
}
return true; return true;
} }
static inline void blk_mq_sched_completed_request(struct request *rq, u64 now) static inline void blk_mq_sched_completed_request(struct request *rq, u64 now)
{ {
struct elevator_queue *e = rq->q->elevator; if (rq->rq_flags & RQF_ELV) {
struct elevator_queue *e = rq->q->elevator;
if (e && e->type->ops.completed_request) if (e->type->ops.completed_request)
e->type->ops.completed_request(rq, now); e->type->ops.completed_request(rq, now);
}
} }
static inline void blk_mq_sched_requeue_request(struct request *rq) static inline void blk_mq_sched_requeue_request(struct request *rq)
{ {
struct request_queue *q = rq->q; if (rq->rq_flags & RQF_ELV) {
struct elevator_queue *e = q->elevator; struct request_queue *q = rq->q;
struct elevator_queue *e = q->elevator;
if ((rq->rq_flags & RQF_ELVPRIV) && e && e->type->ops.requeue_request) if ((rq->rq_flags & RQF_ELVPRIV) && e->type->ops.requeue_request)
e->type->ops.requeue_request(rq); e->type->ops.requeue_request(rq);
}
} }
static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx) static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx)
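Note on the inline helpers above: the rework gates every elevator callback on a per-request flag (RQF_ELV) set at allocation time, instead of testing q->elevator for NULL on each call. A minimal standalone sketch of that pattern follows; every name below is invented purely for illustration and is not kernel code.

#include <stdio.h>
#include <stdbool.h>

#define RQF_ELV (1u << 0)   /* request was allocated with a scheduler attached */

struct elevator_ops { void (*completed)(int tag); };
struct elevator { struct elevator_ops ops; };
struct queue    { struct elevator *elevator; };
struct request  { struct queue *q; unsigned int rq_flags; int tag; };

static void sched_completed(int tag)
{
    printf("scheduler saw completion of tag %d\n", tag);
}

/* Only touch q->elevator when the flag guarantees it was set at allocation time. */
static void completed_request(struct request *rq)
{
    if (rq->rq_flags & RQF_ELV) {
        struct elevator *e = rq->q->elevator;

        if (e->ops.completed)
            e->ops.completed(rq->tag);
    }
}

int main(void)
{
    struct elevator e = { .ops = { .completed = sched_completed } };
    struct queue q = { .elevator = &e };
    struct request with_elv = { .q = &q, .rq_flags = RQF_ELV, .tag = 7 };
    struct request no_elv   = { .q = &q, .rq_flags = 0,       .tag = 8 };

    completed_request(&with_elv);  /* invokes the callback */
    completed_request(&no_elv);    /* skipped: flag not set */
    return 0;
}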


@ -36,8 +36,6 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj)
struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx, struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx,
kobj); kobj);
if (hctx->flags & BLK_MQ_F_BLOCKING)
cleanup_srcu_struct(hctx->srcu);
blk_free_flush_queue(hctx->fq); blk_free_flush_queue(hctx->fq);
sbitmap_free(&hctx->ctx_map); sbitmap_free(&hctx->ctx_map);
free_cpumask_var(hctx->cpumask); free_cpumask_var(hctx->cpumask);


@ -16,6 +16,21 @@
#include "blk-mq-sched.h" #include "blk-mq-sched.h"
#include "blk-mq-tag.h" #include "blk-mq-tag.h"
/*
* Recalculate wakeup batch when tag is shared by hctx.
*/
static void blk_mq_update_wake_batch(struct blk_mq_tags *tags,
unsigned int users)
{
if (!users)
return;
sbitmap_queue_recalculate_wake_batch(&tags->bitmap_tags,
users);
sbitmap_queue_recalculate_wake_batch(&tags->breserved_tags,
users);
}
/* /*
* If a previously inactive queue goes active, bump the active user count. * If a previously inactive queue goes active, bump the active user count.
* We need to do this before try to allocate driver tag, then even if fail * We need to do this before try to allocate driver tag, then even if fail
@ -24,19 +39,26 @@
*/ */
bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{ {
if (blk_mq_is_sbitmap_shared(hctx->flags)) { unsigned int users;
struct request_queue *q = hctx->queue;
struct blk_mq_tag_set *set = q->tag_set;
if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) && if (blk_mq_is_shared_tags(hctx->flags)) {
!test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) struct request_queue *q = hctx->queue;
atomic_inc(&set->active_queues_shared_sbitmap);
if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) ||
test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) {
return true;
}
} else { } else {
if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) && if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) ||
!test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) {
atomic_inc(&hctx->tags->active_queues); return true;
}
} }
users = atomic_inc_return(&hctx->tags->active_queues);
blk_mq_update_wake_batch(hctx->tags, users);
return true; return true;
} }
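The hunk above ties the sbitmap wake batch to the number of queues actively sharing the tags, so each waiter is woken for smaller batches as sharing grows. A rough standalone model of that relationship is sketched below; the divisor and clamp are illustrative guesses, not sbitmap's actual formula.

#include <stdio.h>

/* Model: as more queues go active, each one is woken for smaller batches of freed tags. */
static unsigned int wake_batch(unsigned int depth, unsigned int active_users)
{
    unsigned int batch;

    if (!active_users)
        active_users = 1;
    batch = depth / (4 * active_users);  /* illustrative split only */
    return batch ? batch : 1;
}

int main(void)
{
    unsigned int users;

    for (users = 1; users <= 8; users *= 2)
        printf("%u active queue(s) -> wake batch %u\n", users, wake_batch(256, users));
    return 0;
}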
@ -45,9 +67,9 @@ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
*/ */
void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
{ {
sbitmap_queue_wake_all(tags->bitmap_tags); sbitmap_queue_wake_all(&tags->bitmap_tags);
if (include_reserve) if (include_reserve)
sbitmap_queue_wake_all(tags->breserved_tags); sbitmap_queue_wake_all(&tags->breserved_tags);
} }
/* /*
@ -57,20 +79,23 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{ {
struct blk_mq_tags *tags = hctx->tags; struct blk_mq_tags *tags = hctx->tags;
struct request_queue *q = hctx->queue; unsigned int users;
struct blk_mq_tag_set *set = q->tag_set;
if (blk_mq_is_shared_tags(hctx->flags)) {
struct request_queue *q = hctx->queue;
if (blk_mq_is_sbitmap_shared(hctx->flags)) {
if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE, if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE,
&q->queue_flags)) &q->queue_flags))
return; return;
atomic_dec(&set->active_queues_shared_sbitmap);
} else { } else {
if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
return; return;
atomic_dec(&tags->active_queues);
} }
users = atomic_dec_return(&tags->active_queues);
blk_mq_update_wake_batch(tags, users);
blk_mq_tag_wakeup_all(tags, false); blk_mq_tag_wakeup_all(tags, false);
} }
@ -87,6 +112,21 @@ static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
return __sbitmap_queue_get(bt); return __sbitmap_queue_get(bt);
} }
unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
unsigned int *offset)
{
struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
struct sbitmap_queue *bt = &tags->bitmap_tags;
unsigned long ret;
	if (data->shallow_depth || data->flags & BLK_MQ_REQ_RESERVED ||
data->hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
return 0;
ret = __sbitmap_queue_get_batch(bt, nr_tags, offset);
*offset += tags->nr_reserved_tags;
return ret;
}
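blk_mq_get_tags() above is the batched allocation path: it falls back to single-tag allocation whenever reserved tags, a shallow depth or shared tags are in play, and otherwise claims up to nr_tags bits from the bitmap in one pass. A simplified standalone model of claiming a batch of free slots from a plain bitmap (not the sbitmap implementation) looks like this:

#include <stdio.h>

/*
 * Claim up to @want free slots from @*bitmap in one pass, returning a mask
 * of the slots actually claimed and how many were taken through @got.
 */
static unsigned long get_batch(unsigned long *bitmap, unsigned int want,
                               unsigned int *got)
{
    unsigned long claimed = 0;
    unsigned int bit, taken = 0;

    for (bit = 0; bit < sizeof(*bitmap) * 8 && taken < want; bit++) {
        unsigned long mask = 1UL << bit;

        if (*bitmap & mask)
            continue;            /* already in use */
        *bitmap |= mask;
        claimed |= mask;
        taken++;
    }
    *got = taken;
    return claimed;
}

int main(void)
{
    unsigned long used = 0x0fUL;  /* slots 0-3 busy */
    unsigned int got;
    unsigned long batch = get_batch(&used, 3, &got);

    printf("claimed %u slots, mask 0x%lx\n", got, batch);
    return 0;
}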
unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
{ {
struct blk_mq_tags *tags = blk_mq_tags_from_data(data); struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
@ -101,10 +141,10 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
WARN_ON_ONCE(1); WARN_ON_ONCE(1);
return BLK_MQ_NO_TAG; return BLK_MQ_NO_TAG;
} }
bt = tags->breserved_tags; bt = &tags->breserved_tags;
tag_offset = 0; tag_offset = 0;
} else { } else {
bt = tags->bitmap_tags; bt = &tags->bitmap_tags;
tag_offset = tags->nr_reserved_tags; tag_offset = tags->nr_reserved_tags;
} }
@ -150,9 +190,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
data->ctx); data->ctx);
tags = blk_mq_tags_from_data(data); tags = blk_mq_tags_from_data(data);
if (data->flags & BLK_MQ_REQ_RESERVED) if (data->flags & BLK_MQ_REQ_RESERVED)
bt = tags->breserved_tags; bt = &tags->breserved_tags;
else else
bt = tags->bitmap_tags; bt = &tags->bitmap_tags;
/* /*
* If destination hw queue is changed, fake wake up on * If destination hw queue is changed, fake wake up on
@ -186,16 +226,23 @@ void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
const int real_tag = tag - tags->nr_reserved_tags; const int real_tag = tag - tags->nr_reserved_tags;
BUG_ON(real_tag >= tags->nr_tags); BUG_ON(real_tag >= tags->nr_tags);
sbitmap_queue_clear(tags->bitmap_tags, real_tag, ctx->cpu); sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu);
} else { } else {
BUG_ON(tag >= tags->nr_reserved_tags); BUG_ON(tag >= tags->nr_reserved_tags);
sbitmap_queue_clear(tags->breserved_tags, tag, ctx->cpu); sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu);
} }
} }
void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags)
{
sbitmap_queue_clear_batch(&tags->bitmap_tags, tags->nr_reserved_tags,
tag_array, nr_tags);
}
struct bt_iter_data { struct bt_iter_data {
struct blk_mq_hw_ctx *hctx; struct blk_mq_hw_ctx *hctx;
busy_iter_fn *fn; struct request_queue *q;
busy_tag_iter_fn *fn;
void *data; void *data;
bool reserved; bool reserved;
}; };
@ -208,7 +255,7 @@ static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags,
spin_lock_irqsave(&tags->lock, flags); spin_lock_irqsave(&tags->lock, flags);
rq = tags->rqs[bitnr]; rq = tags->rqs[bitnr];
if (!rq || rq->tag != bitnr || !refcount_inc_not_zero(&rq->ref)) if (!rq || rq->tag != bitnr || !req_ref_inc_not_zero(rq))
rq = NULL; rq = NULL;
spin_unlock_irqrestore(&tags->lock, flags); spin_unlock_irqrestore(&tags->lock, flags);
return rq; return rq;
@ -218,11 +265,18 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{ {
struct bt_iter_data *iter_data = data; struct bt_iter_data *iter_data = data;
struct blk_mq_hw_ctx *hctx = iter_data->hctx; struct blk_mq_hw_ctx *hctx = iter_data->hctx;
struct blk_mq_tags *tags = hctx->tags; struct request_queue *q = iter_data->q;
struct blk_mq_tag_set *set = q->tag_set;
bool reserved = iter_data->reserved; bool reserved = iter_data->reserved;
struct blk_mq_tags *tags;
struct request *rq; struct request *rq;
bool ret = true; bool ret = true;
if (blk_mq_is_shared_tags(set->flags))
tags = set->shared_tags;
else
tags = hctx->tags;
if (!reserved) if (!reserved)
bitnr += tags->nr_reserved_tags; bitnr += tags->nr_reserved_tags;
/* /*
@ -233,8 +287,8 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
if (!rq) if (!rq)
return true; return true;
if (rq->q == hctx->queue && rq->mq_hctx == hctx) if (rq->q == q && (!hctx || rq->mq_hctx == hctx))
ret = iter_data->fn(hctx, rq, iter_data->data, reserved); ret = iter_data->fn(rq, iter_data->data, reserved);
blk_mq_put_rq_ref(rq); blk_mq_put_rq_ref(rq);
return ret; return ret;
} }
@ -242,6 +296,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
/** /**
* bt_for_each - iterate over the requests associated with a hardware queue * bt_for_each - iterate over the requests associated with a hardware queue
* @hctx: Hardware queue to examine. * @hctx: Hardware queue to examine.
* @q: Request queue to examine.
* @bt: sbitmap to examine. This is either the breserved_tags member * @bt: sbitmap to examine. This is either the breserved_tags member
* or the bitmap_tags member of struct blk_mq_tags. * or the bitmap_tags member of struct blk_mq_tags.
* @fn: Pointer to the function that will be called for each request * @fn: Pointer to the function that will be called for each request
@ -253,14 +308,16 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
* @reserved: Indicates whether @bt is the breserved_tags member or the * @reserved: Indicates whether @bt is the breserved_tags member or the
* bitmap_tags member of struct blk_mq_tags. * bitmap_tags member of struct blk_mq_tags.
*/ */
static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt, static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct request_queue *q,
busy_iter_fn *fn, void *data, bool reserved) struct sbitmap_queue *bt, busy_tag_iter_fn *fn,
void *data, bool reserved)
{ {
struct bt_iter_data iter_data = { struct bt_iter_data iter_data = {
.hctx = hctx, .hctx = hctx,
.fn = fn, .fn = fn,
.data = data, .data = data,
.reserved = reserved, .reserved = reserved,
.q = q,
}; };
sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data); sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data);
@ -340,9 +397,9 @@ static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags,
WARN_ON_ONCE(flags & BT_TAG_ITER_RESERVED); WARN_ON_ONCE(flags & BT_TAG_ITER_RESERVED);
if (tags->nr_reserved_tags) if (tags->nr_reserved_tags)
bt_tags_for_each(tags, tags->breserved_tags, fn, priv, bt_tags_for_each(tags, &tags->breserved_tags, fn, priv,
flags | BT_TAG_ITER_RESERVED); flags | BT_TAG_ITER_RESERVED);
bt_tags_for_each(tags, tags->bitmap_tags, fn, priv, flags); bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, flags);
} }
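The tag-iteration helpers above (bt_iter()/bt_for_each() and __blk_mq_all_tag_iter()) walk the set bits of a tag bitmap, resolve each bit back to a request and hand it to a caller-supplied callback that can stop the walk by returning false. Stripped of the tag and request details, the pattern is roughly the following self-contained sketch (all names invented for the sketch):

#include <stdbool.h>
#include <stdio.h>

struct item { int id; };

/* Caller-supplied visitor: returning false stops the walk early. */
typedef bool (*visit_fn)(struct item *it, void *data);

/* Walk every set bit in @bitmap and map it to the item stored at that index. */
static void for_each_set(unsigned long bitmap, struct item *items,
                         visit_fn fn, void *data)
{
    unsigned int bit;

    for (bit = 0; bit < sizeof(bitmap) * 8; bit++) {
        if (!(bitmap & (1UL << bit)))
            continue;
        if (!fn(&items[bit], data))
            break;
    }
}

static bool print_item(struct item *it, void *data)
{
    int *budget = data;

    printf("visiting item %d\n", it->id);
    return --(*budget) > 0;   /* stop after the budget is spent */
}

int main(void)
{
    struct item items[8] = { {0}, {1}, {2}, {3}, {4}, {5}, {6}, {7} };
    int budget = 2;

    for_each_set(0xb5UL, items, print_item, &budget);  /* bits 0,2,4,5,7 set */
    return 0;
}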
/** /**
@ -379,9 +436,12 @@ void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
busy_tag_iter_fn *fn, void *priv) busy_tag_iter_fn *fn, void *priv)
{ {
int i; unsigned int flags = tagset->flags;
int i, nr_tags;
for (i = 0; i < tagset->nr_hw_queues; i++) { nr_tags = blk_mq_is_shared_tags(flags) ? 1 : tagset->nr_hw_queues;
for (i = 0; i < nr_tags; i++) {
if (tagset->tags && tagset->tags[i]) if (tagset->tags && tagset->tags[i])
__blk_mq_all_tag_iter(tagset->tags[i], fn, priv, __blk_mq_all_tag_iter(tagset->tags[i], fn, priv,
BT_TAG_ITER_STARTED); BT_TAG_ITER_STARTED);
@ -434,12 +494,9 @@ EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request);
* called for all requests on all queues that share that tag set and not only * called for all requests on all queues that share that tag set and not only
* for requests associated with @q. * for requests associated with @q.
*/ */
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn,
void *priv) void *priv)
{ {
struct blk_mq_hw_ctx *hctx;
int i;
/* /*
* __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx
* while the queue is frozen. So we can use q_usage_counter to avoid * while the queue is frozen. So we can use q_usage_counter to avoid
@ -448,19 +505,34 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
if (!percpu_ref_tryget(&q->q_usage_counter)) if (!percpu_ref_tryget(&q->q_usage_counter))
return; return;
queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_is_shared_tags(q->tag_set->flags)) {
struct blk_mq_tags *tags = hctx->tags; struct blk_mq_tags *tags = q->tag_set->shared_tags;
struct sbitmap_queue *bresv = &tags->breserved_tags;
/* struct sbitmap_queue *btags = &tags->bitmap_tags;
* If no software queues are currently mapped to this
* hardware queue, there's nothing to check
*/
if (!blk_mq_hw_queue_mapped(hctx))
continue;
if (tags->nr_reserved_tags) if (tags->nr_reserved_tags)
bt_for_each(hctx, tags->breserved_tags, fn, priv, true); bt_for_each(NULL, q, bresv, fn, priv, true);
bt_for_each(hctx, tags->bitmap_tags, fn, priv, false); bt_for_each(NULL, q, btags, fn, priv, false);
} else {
struct blk_mq_hw_ctx *hctx;
int i;
queue_for_each_hw_ctx(q, hctx, i) {
struct blk_mq_tags *tags = hctx->tags;
struct sbitmap_queue *bresv = &tags->breserved_tags;
struct sbitmap_queue *btags = &tags->bitmap_tags;
/*
* If no software queues are currently mapped to this
* hardware queue, there's nothing to check
*/
if (!blk_mq_hw_queue_mapped(hctx))
continue;
if (tags->nr_reserved_tags)
bt_for_each(hctx, q, bresv, fn, priv, true);
bt_for_each(hctx, q, btags, fn, priv, false);
}
} }
blk_queue_exit(q); blk_queue_exit(q);
} }
@ -492,56 +564,10 @@ int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags,
return -ENOMEM; return -ENOMEM;
} }
static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
int node, int alloc_policy)
{
int ret;
ret = blk_mq_init_bitmaps(&tags->__bitmap_tags,
&tags->__breserved_tags,
tags->nr_tags, tags->nr_reserved_tags,
node, alloc_policy);
if (ret)
return ret;
tags->bitmap_tags = &tags->__bitmap_tags;
tags->breserved_tags = &tags->__breserved_tags;
return 0;
}
int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set)
{
int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
int i, ret;
ret = blk_mq_init_bitmaps(&set->__bitmap_tags, &set->__breserved_tags,
set->queue_depth, set->reserved_tags,
set->numa_node, alloc_policy);
if (ret)
return ret;
for (i = 0; i < set->nr_hw_queues; i++) {
struct blk_mq_tags *tags = set->tags[i];
tags->bitmap_tags = &set->__bitmap_tags;
tags->breserved_tags = &set->__breserved_tags;
}
return 0;
}
void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set)
{
sbitmap_queue_free(&set->__bitmap_tags);
sbitmap_queue_free(&set->__breserved_tags);
}
struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
unsigned int reserved_tags, unsigned int reserved_tags,
int node, unsigned int flags) int node, int alloc_policy)
{ {
int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(flags);
struct blk_mq_tags *tags; struct blk_mq_tags *tags;
if (total_tags > BLK_MQ_TAG_MAX) { if (total_tags > BLK_MQ_TAG_MAX) {
@ -557,22 +583,19 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
tags->nr_reserved_tags = reserved_tags; tags->nr_reserved_tags = reserved_tags;
spin_lock_init(&tags->lock); spin_lock_init(&tags->lock);
if (blk_mq_is_sbitmap_shared(flags)) if (blk_mq_init_bitmaps(&tags->bitmap_tags, &tags->breserved_tags,
return tags; total_tags, reserved_tags, node,
alloc_policy) < 0) {
if (blk_mq_init_bitmap_tags(tags, node, alloc_policy) < 0) {
kfree(tags); kfree(tags);
return NULL; return NULL;
} }
return tags; return tags;
} }
void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags) void blk_mq_free_tags(struct blk_mq_tags *tags)
{ {
if (!blk_mq_is_sbitmap_shared(flags)) { sbitmap_queue_free(&tags->bitmap_tags);
sbitmap_queue_free(tags->bitmap_tags); sbitmap_queue_free(&tags->breserved_tags);
sbitmap_queue_free(tags->breserved_tags);
}
kfree(tags); kfree(tags);
} }
@ -592,7 +615,6 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
if (tdepth > tags->nr_tags) { if (tdepth > tags->nr_tags) {
struct blk_mq_tag_set *set = hctx->queue->tag_set; struct blk_mq_tag_set *set = hctx->queue->tag_set;
struct blk_mq_tags *new; struct blk_mq_tags *new;
bool ret;
if (!can_grow) if (!can_grow)
return -EINVAL; return -EINVAL;
@ -604,34 +626,42 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
if (tdepth > MAX_SCHED_RQ) if (tdepth > MAX_SCHED_RQ)
return -EINVAL; return -EINVAL;
new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, /*
tags->nr_reserved_tags, set->flags); * Only the sbitmap needs resizing since we allocated the max
* initially.
*/
if (blk_mq_is_shared_tags(set->flags))
return 0;
new = blk_mq_alloc_map_and_rqs(set, hctx->queue_num, tdepth);
if (!new) if (!new)
return -ENOMEM; return -ENOMEM;
ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);
if (ret) {
blk_mq_free_rq_map(new, set->flags);
return -ENOMEM;
}
blk_mq_free_rqs(set, *tagsptr, hctx->queue_num); blk_mq_free_map_and_rqs(set, *tagsptr, hctx->queue_num);
blk_mq_free_rq_map(*tagsptr, set->flags);
*tagsptr = new; *tagsptr = new;
} else { } else {
/* /*
* Don't need (or can't) update reserved tags here, they * Don't need (or can't) update reserved tags here, they
* remain static and should never need resizing. * remain static and should never need resizing.
*/ */
sbitmap_queue_resize(tags->bitmap_tags, sbitmap_queue_resize(&tags->bitmap_tags,
tdepth - tags->nr_reserved_tags); tdepth - tags->nr_reserved_tags);
} }
return 0; return 0;
} }
void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int size) void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size)
{ {
sbitmap_queue_resize(&set->__bitmap_tags, size - set->reserved_tags); struct blk_mq_tags *tags = set->shared_tags;
sbitmap_queue_resize(&tags->bitmap_tags, size - set->reserved_tags);
}
void blk_mq_tag_update_sched_shared_tags(struct request_queue *q)
{
sbitmap_queue_resize(&q->sched_shared_tags->bitmap_tags,
q->nr_requests - q->tag_set->reserved_tags);
} }
/** /**


@ -2,55 +2,33 @@
#ifndef INT_BLK_MQ_TAG_H #ifndef INT_BLK_MQ_TAG_H
#define INT_BLK_MQ_TAG_H #define INT_BLK_MQ_TAG_H
/* struct blk_mq_alloc_data;
* Tag address space map.
*/
struct blk_mq_tags {
unsigned int nr_tags;
unsigned int nr_reserved_tags;
atomic_t active_queues;
struct sbitmap_queue *bitmap_tags;
struct sbitmap_queue *breserved_tags;
struct sbitmap_queue __bitmap_tags;
struct sbitmap_queue __breserved_tags;
struct request **rqs;
struct request **static_rqs;
struct list_head page_list;
/*
* used to clear request reference in rqs[] before freeing one
* request pool
*/
spinlock_t lock;
};
extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,
unsigned int reserved_tags, unsigned int reserved_tags,
int node, unsigned int flags); int node, int alloc_policy);
extern void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags); extern void blk_mq_free_tags(struct blk_mq_tags *tags);
extern int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags, extern int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags,
struct sbitmap_queue *breserved_tags, struct sbitmap_queue *breserved_tags,
unsigned int queue_depth, unsigned int queue_depth,
unsigned int reserved, unsigned int reserved,
int node, int alloc_policy); int node, int alloc_policy);
extern int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set);
extern void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set);
extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
unsigned int *offset);
extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
unsigned int tag); unsigned int tag);
void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags);
extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
struct blk_mq_tags **tags, struct blk_mq_tags **tags,
unsigned int depth, bool can_grow); unsigned int depth, bool can_grow);
extern void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set, extern void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set,
unsigned int size); unsigned int size);
extern void blk_mq_tag_update_sched_shared_tags(struct request_queue *q);
extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn,
void *priv); void *priv);
void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
void *priv); void *priv);

File diff suppressed because it is too large


@ -25,18 +25,14 @@ struct blk_mq_ctx {
unsigned short index_hw[HCTX_MAX_TYPES]; unsigned short index_hw[HCTX_MAX_TYPES];
struct blk_mq_hw_ctx *hctxs[HCTX_MAX_TYPES]; struct blk_mq_hw_ctx *hctxs[HCTX_MAX_TYPES];
/* incremented at dispatch time */
unsigned long rq_dispatched[2];
unsigned long rq_merged;
/* incremented at completion time */
unsigned long ____cacheline_aligned_in_smp rq_completed[2];
struct request_queue *queue; struct request_queue *queue;
struct blk_mq_ctxs *ctxs; struct blk_mq_ctxs *ctxs;
struct kobject kobj; struct kobject kobj;
} ____cacheline_aligned_in_smp; } ____cacheline_aligned_in_smp;
void blk_mq_submit_bio(struct bio *bio);
int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
unsigned int flags);
void blk_mq_exit_queue(struct request_queue *q); void blk_mq_exit_queue(struct request_queue *q);
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
void blk_mq_wake_waiters(struct request_queue *q); void blk_mq_wake_waiters(struct request_queue *q);
@ -54,15 +50,12 @@ void blk_mq_put_rq_ref(struct request *rq);
*/ */
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx); unsigned int hctx_idx);
void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags); void blk_mq_free_rq_map(struct blk_mq_tags *tags);
struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
unsigned int hctx_idx, unsigned int hctx_idx, unsigned int depth);
unsigned int nr_tags, void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
unsigned int reserved_tags, struct blk_mq_tags *tags,
unsigned int flags); unsigned int hctx_idx);
int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx, unsigned int depth);
/* /*
* Internal helpers for request insertion into sw queues * Internal helpers for request insertion into sw queues
*/ */
@ -72,9 +65,6 @@ void blk_mq_request_bypass_insert(struct request *rq, bool at_head,
bool run_queue); bool run_queue);
void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
struct list_head *list); struct list_head *list);
/* Used by blk_insert_cloned_request() to issue request directly */
blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last);
void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
struct list_head *list); struct list_head *list);
@ -96,6 +86,20 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *
return q->queue_hw_ctx[q->tag_set->map[type].mq_map[cpu]]; return q->queue_hw_ctx[q->tag_set->map[type].mq_map[cpu]];
} }
static inline enum hctx_type blk_mq_get_hctx_type(unsigned int flags)
{
enum hctx_type type = HCTX_TYPE_DEFAULT;
/*
* The caller ensure that if REQ_POLLED, poll must be enabled.
*/
if (flags & REQ_POLLED)
type = HCTX_TYPE_POLL;
else if ((flags & REQ_OP_MASK) == REQ_OP_READ)
type = HCTX_TYPE_READ;
return type;
}
/* /*
* blk_mq_map_queue() - map (cmd_flags,type) to hardware queue * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue
* @q: request queue * @q: request queue
@ -106,17 +110,7 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
unsigned int flags, unsigned int flags,
struct blk_mq_ctx *ctx) struct blk_mq_ctx *ctx)
{ {
enum hctx_type type = HCTX_TYPE_DEFAULT; return ctx->hctxs[blk_mq_get_hctx_type(flags)];
/*
* The caller ensure that if REQ_HIPRI, poll must be enabled.
*/
if (flags & REQ_HIPRI)
type = HCTX_TYPE_POLL;
else if ((flags & REQ_OP_MASK) == REQ_OP_READ)
type = HCTX_TYPE_READ;
return ctx->hctxs[type];
} }
/* /*
@ -128,6 +122,8 @@ extern int __blk_mq_register_dev(struct device *dev, struct request_queue *q);
extern int blk_mq_sysfs_register(struct request_queue *q); extern int blk_mq_sysfs_register(struct request_queue *q);
extern void blk_mq_sysfs_unregister(struct request_queue *q); extern void blk_mq_sysfs_unregister(struct request_queue *q);
extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
void blk_mq_free_plug_rqs(struct blk_plug *plug);
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
void blk_mq_cancel_work_sync(struct request_queue *q); void blk_mq_cancel_work_sync(struct request_queue *q);
@ -156,23 +152,27 @@ struct blk_mq_alloc_data {
blk_mq_req_flags_t flags; blk_mq_req_flags_t flags;
unsigned int shallow_depth; unsigned int shallow_depth;
unsigned int cmd_flags; unsigned int cmd_flags;
req_flags_t rq_flags;
/* allocate multiple requests/tags in one go */
unsigned int nr_tags;
struct request **cached_rq;
/* input & output parameter */ /* input & output parameter */
struct blk_mq_ctx *ctx; struct blk_mq_ctx *ctx;
struct blk_mq_hw_ctx *hctx; struct blk_mq_hw_ctx *hctx;
}; };
static inline bool blk_mq_is_sbitmap_shared(unsigned int flags) static inline bool blk_mq_is_shared_tags(unsigned int flags)
{ {
return flags & BLK_MQ_F_TAG_HCTX_SHARED; return flags & BLK_MQ_F_TAG_HCTX_SHARED;
} }
static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data) static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data)
{ {
if (data->q->elevator) if (!(data->rq_flags & RQF_ELV))
return data->hctx->sched_tags; return data->hctx->tags;
return data->hctx->sched_tags;
return data->hctx->tags;
} }
static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx) static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx)
@ -222,24 +222,30 @@ static inline int blk_mq_get_rq_budget_token(struct request *rq)
static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx) static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx)
{ {
if (blk_mq_is_sbitmap_shared(hctx->flags)) if (blk_mq_is_shared_tags(hctx->flags))
atomic_inc(&hctx->queue->nr_active_requests_shared_sbitmap); atomic_inc(&hctx->queue->nr_active_requests_shared_tags);
else else
atomic_inc(&hctx->nr_active); atomic_inc(&hctx->nr_active);
} }
static inline void __blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx,
int val)
{
if (blk_mq_is_shared_tags(hctx->flags))
atomic_sub(val, &hctx->queue->nr_active_requests_shared_tags);
else
atomic_sub(val, &hctx->nr_active);
}
static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx) static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx)
{ {
if (blk_mq_is_sbitmap_shared(hctx->flags)) __blk_mq_sub_active_requests(hctx, 1);
atomic_dec(&hctx->queue->nr_active_requests_shared_sbitmap);
else
atomic_dec(&hctx->nr_active);
} }
static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx) static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx)
{ {
if (blk_mq_is_sbitmap_shared(hctx->flags)) if (blk_mq_is_shared_tags(hctx->flags))
return atomic_read(&hctx->queue->nr_active_requests_shared_sbitmap); return atomic_read(&hctx->queue->nr_active_requests_shared_tags);
return atomic_read(&hctx->nr_active); return atomic_read(&hctx->nr_active);
} }
static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
@ -262,7 +268,20 @@ static inline void blk_mq_put_driver_tag(struct request *rq)
__blk_mq_put_driver_tag(rq->mq_hctx, rq); __blk_mq_put_driver_tag(rq->mq_hctx, rq);
} }
bool blk_mq_get_driver_tag(struct request *rq); bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq);
static inline bool blk_mq_get_driver_tag(struct request *rq)
{
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
if (rq->tag != BLK_MQ_NO_TAG &&
!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
hctx->tags->rqs[rq->tag] = rq;
return true;
}
return __blk_mq_get_driver_tag(hctx, rq);
}
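blk_mq_get_driver_tag() above keeps the common case inline, reusing a tag that is already assigned, and only calls the out-of-line __blk_mq_get_driver_tag() slow path when a tag actually has to be allocated. A toy standalone sketch of that inline-fast-path/out-of-line-slow-path split, with invented names:

#include <stdio.h>

struct req { int tag; };

/* Out-of-line slow path: pretend to allocate a new tag. */
static int slow_get_tag(struct req *rq)
{
    rq->tag = 42;               /* illustrative allocation */
    return 1;
}

/* Inline fast path: reuse an already-assigned tag, else fall back. */
static inline int get_tag(struct req *rq)
{
    if (rq->tag >= 0)
        return 1;               /* fast path, no function call */
    return slow_get_tag(rq);
}

int main(void)
{
    struct req cached = { .tag = 7 }, fresh = { .tag = -1 };

    printf("cached: ok=%d tag=%d\n", get_tag(&cached), cached.tag);
    printf("fresh:  ok=%d tag=%d\n", get_tag(&fresh), fresh.tag);
    return 0;
}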
static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
{ {
@ -333,19 +352,18 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
if (bt->sb.depth == 1) if (bt->sb.depth == 1)
return true; return true;
if (blk_mq_is_sbitmap_shared(hctx->flags)) { if (blk_mq_is_shared_tags(hctx->flags)) {
struct request_queue *q = hctx->queue; struct request_queue *q = hctx->queue;
struct blk_mq_tag_set *set = q->tag_set;
if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
return true; return true;
users = atomic_read(&set->active_queues_shared_sbitmap);
} else { } else {
if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
return true; return true;
users = atomic_read(&hctx->tags->active_queues);
} }
users = atomic_read(&hctx->tags->active_queues);
if (!users) if (!users)
return true; return true;
@ -356,5 +374,24 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
return __blk_mq_active_requests(hctx) < depth; return __blk_mq_active_requests(hctx) < depth;
} }
/* run the code block in @dispatch_ops with rcu/srcu read lock held */
#define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops) \
do { \
if (!blk_queue_has_srcu(q)) { \
rcu_read_lock(); \
(dispatch_ops); \
rcu_read_unlock(); \
} else { \
int srcu_idx; \
\
might_sleep_if(check_sleep); \
srcu_idx = srcu_read_lock((q)->srcu); \
(dispatch_ops); \
srcu_read_unlock((q)->srcu, srcu_idx); \
} \
} while (0)
#define blk_mq_run_dispatch_ops(q, dispatch_ops) \
__blk_mq_run_dispatch_ops(q, true, dispatch_ops) \
#endif #endif
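__blk_mq_run_dispatch_ops() above takes the statement to execute as a macro argument and brackets it with the appropriate read-side protection (plain RCU or SRCU). A self-contained model of that do { lock; (ops); unlock; } while (0) shape, using a pthread mutex purely as a stand-in for the RCU/SRCU read lock:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t guard = PTHREAD_MUTEX_INITIALIZER;

/*
 * Run the statement passed as @ops with the guard held; the do/while (0)
 * keeps the macro usable as a single statement, e.g. after an if.
 */
#define run_guarded(lock, ops)          \
do {                                    \
    pthread_mutex_lock(lock);           \
    (ops);                              \
    pthread_mutex_unlock(lock);         \
} while (0)

static int shared_counter;

int main(void)
{
    int snapshot;

    run_guarded(&guard, shared_counter += 3);
    run_guarded(&guard, snapshot = shared_counter);
    printf("counter is %d\n", snapshot);
    return 0;
}

Passing the statement itself into the macro is what lets blk_mq_run_dispatch_ops() pick the locking flavour in one place while callers keep writing the dispatch call inline.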


@ -189,9 +189,10 @@ static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
* BIO_TRACKED lets controllers know that a bio went through the * BIO_TRACKED lets controllers know that a bio went through the
* normal rq_qos path. * normal rq_qos path.
*/ */
bio_set_flag(bio, BIO_TRACKED); if (q->rq_qos) {
if (q->rq_qos) bio_set_flag(bio, BIO_TRACKED);
__rq_qos_throttle(q->rq_qos, bio); __rq_qos_throttle(q->rq_qos, bio);
}
} }
static inline void rq_qos_track(struct request_queue *q, struct request *rq, static inline void rq_qos_track(struct request_queue *q, struct request *rq,


@ -15,7 +15,7 @@
struct blk_queue_stats { struct blk_queue_stats {
struct list_head callbacks; struct list_head callbacks;
spinlock_t lock; spinlock_t lock;
bool enable_accounting; int accounting;
}; };
void blk_rq_stat_init(struct blk_rq_stat *stat) void blk_rq_stat_init(struct blk_rq_stat *stat)
@ -161,7 +161,7 @@ void blk_stat_remove_callback(struct request_queue *q,
spin_lock_irqsave(&q->stats->lock, flags); spin_lock_irqsave(&q->stats->lock, flags);
list_del_rcu(&cb->list); list_del_rcu(&cb->list);
if (list_empty(&q->stats->callbacks) && !q->stats->enable_accounting) if (list_empty(&q->stats->callbacks) && !q->stats->accounting)
blk_queue_flag_clear(QUEUE_FLAG_STATS, q); blk_queue_flag_clear(QUEUE_FLAG_STATS, q);
spin_unlock_irqrestore(&q->stats->lock, flags); spin_unlock_irqrestore(&q->stats->lock, flags);
@ -184,13 +184,24 @@ void blk_stat_free_callback(struct blk_stat_callback *cb)
call_rcu(&cb->rcu, blk_stat_free_callback_rcu); call_rcu(&cb->rcu, blk_stat_free_callback_rcu);
} }
void blk_stat_disable_accounting(struct request_queue *q)
{
unsigned long flags;
spin_lock_irqsave(&q->stats->lock, flags);
if (!--q->stats->accounting)
blk_queue_flag_clear(QUEUE_FLAG_STATS, q);
spin_unlock_irqrestore(&q->stats->lock, flags);
}
EXPORT_SYMBOL_GPL(blk_stat_disable_accounting);
void blk_stat_enable_accounting(struct request_queue *q) void blk_stat_enable_accounting(struct request_queue *q)
{ {
unsigned long flags; unsigned long flags;
spin_lock_irqsave(&q->stats->lock, flags); spin_lock_irqsave(&q->stats->lock, flags);
q->stats->enable_accounting = true; if (!q->stats->accounting++)
blk_queue_flag_set(QUEUE_FLAG_STATS, q); blk_queue_flag_set(QUEUE_FLAG_STATS, q);
spin_unlock_irqrestore(&q->stats->lock, flags); spin_unlock_irqrestore(&q->stats->lock, flags);
} }
EXPORT_SYMBOL_GPL(blk_stat_enable_accounting); EXPORT_SYMBOL_GPL(blk_stat_enable_accounting);
@ -205,7 +216,7 @@ struct blk_queue_stats *blk_alloc_queue_stats(void)
INIT_LIST_HEAD(&stats->callbacks); INIT_LIST_HEAD(&stats->callbacks);
spin_lock_init(&stats->lock); spin_lock_init(&stats->lock);
stats->enable_accounting = false; stats->accounting = 0;
return stats; return stats;
} }
@ -219,3 +230,21 @@ void blk_free_queue_stats(struct blk_queue_stats *stats)
kfree(stats); kfree(stats);
} }
bool blk_stats_alloc_enable(struct request_queue *q)
{
struct blk_rq_stat *poll_stat;
poll_stat = kcalloc(BLK_MQ_POLL_STATS_BKTS, sizeof(*poll_stat),
GFP_ATOMIC);
if (!poll_stat)
return false;
if (cmpxchg(&q->poll_stat, NULL, poll_stat) != NULL) {
kfree(poll_stat);
return true;
}
blk_stat_add_callback(q, q->poll_cb);
return false;
}


@ -64,11 +64,13 @@ struct blk_stat_callback {
struct blk_queue_stats *blk_alloc_queue_stats(void); struct blk_queue_stats *blk_alloc_queue_stats(void);
void blk_free_queue_stats(struct blk_queue_stats *); void blk_free_queue_stats(struct blk_queue_stats *);
bool blk_stats_alloc_enable(struct request_queue *q);
void blk_stat_add(struct request *rq, u64 now); void blk_stat_add(struct request *rq, u64 now);
/* record time/size info in request but not add a callback */ /* record time/size info in request but not add a callback */
void blk_stat_enable_accounting(struct request_queue *q); void blk_stat_enable_accounting(struct request_queue *q);
void blk_stat_disable_accounting(struct request_queue *q);
/** /**
* blk_stat_alloc_callback() - Allocate a block statistics callback. * blk_stat_alloc_callback() - Allocate a block statistics callback.


@ -16,7 +16,9 @@
#include "blk.h" #include "blk.h"
#include "blk-mq.h" #include "blk-mq.h"
#include "blk-mq-debugfs.h" #include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-wbt.h" #include "blk-wbt.h"
#include "blk-throttle.h"
struct queue_sysfs_entry { struct queue_sysfs_entry {
struct attribute attr; struct attribute attr;
@ -432,26 +434,11 @@ static ssize_t queue_poll_show(struct request_queue *q, char *page)
static ssize_t queue_poll_store(struct request_queue *q, const char *page, static ssize_t queue_poll_store(struct request_queue *q, const char *page,
size_t count) size_t count)
{ {
unsigned long poll_on; if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
ssize_t ret;
if (!q->tag_set || q->tag_set->nr_maps <= HCTX_TYPE_POLL ||
!q->tag_set->map[HCTX_TYPE_POLL].nr_queues)
return -EINVAL; return -EINVAL;
pr_info_ratelimited("writes to the poll attribute are ignored.\n");
ret = queue_var_store(&poll_on, page, count); pr_info_ratelimited("please use driver specific parameters instead.\n");
if (ret < 0) return count;
return ret;
if (poll_on) {
blk_queue_flag_set(QUEUE_FLAG_POLL, q);
} else {
blk_mq_freeze_queue(q);
blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
blk_mq_unfreeze_queue(q);
}
return ret;
} }
static ssize_t queue_io_timeout_show(struct request_queue *q, char *page) static ssize_t queue_io_timeout_show(struct request_queue *q, char *page)
@ -748,7 +735,8 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head)
{ {
struct request_queue *q = container_of(rcu_head, struct request_queue, struct request_queue *q = container_of(rcu_head, struct request_queue,
rcu_head); rcu_head);
kmem_cache_free(blk_requestq_cachep, q);
kmem_cache_free(blk_get_queue_kmem_cache(blk_queue_has_srcu(q)), q);
} }
/* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */ /* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */
@ -761,7 +749,7 @@ static void blk_exit_queue(struct request_queue *q)
*/ */
if (q->elevator) { if (q->elevator) {
ioc_clear_queue(q); ioc_clear_queue(q);
__elevator_exit(q, q->elevator); elevator_exit(q);
} }
/* /*
@ -799,14 +787,15 @@ static void blk_release_queue(struct kobject *kobj)
might_sleep(); might_sleep();
if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags)) if (q->poll_stat)
blk_stat_remove_callback(q, q->poll_cb); blk_stat_remove_callback(q, q->poll_cb);
blk_stat_free_callback(q->poll_cb); blk_stat_free_callback(q->poll_cb);
blk_free_queue_stats(q->stats);
blk_exit_queue(q); blk_exit_queue(q);
blk_free_queue_stats(q->stats);
kfree(q->poll_stat);
blk_queue_free_zone_bitmaps(q); blk_queue_free_zone_bitmaps(q);
if (queue_is_mq(q)) if (queue_is_mq(q))
@ -822,6 +811,9 @@ static void blk_release_queue(struct kobject *kobj)
bioset_exit(&q->bio_split); bioset_exit(&q->bio_split);
if (blk_queue_has_srcu(q))
cleanup_srcu_struct(q->srcu);
ida_simple_remove(&blk_queue_ida, q->id); ida_simple_remove(&blk_queue_ida, q->id);
call_rcu(&q->rcu_head, blk_free_queue_rcu); call_rcu(&q->rcu_head, blk_free_queue_rcu);
} }
@ -877,16 +869,15 @@ int blk_register_queue(struct gendisk *disk)
} }
mutex_lock(&q->sysfs_lock); mutex_lock(&q->sysfs_lock);
ret = disk_register_independent_access_ranges(disk, NULL);
if (ret)
goto put_dev;
if (q->elevator) { if (q->elevator) {
ret = elv_register_queue(q, false); ret = elv_register_queue(q, false);
if (ret) { if (ret)
mutex_unlock(&q->sysfs_lock); goto put_dev;
mutex_unlock(&q->sysfs_dir_lock);
kobject_del(&q->kobj);
blk_trace_remove_sysfs(dev);
kobject_put(&dev->kobj);
return ret;
}
} }
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
@ -899,7 +890,6 @@ int blk_register_queue(struct gendisk *disk)
kobject_uevent(&q->elevator->kobj, KOBJ_ADD); kobject_uevent(&q->elevator->kobj, KOBJ_ADD);
mutex_unlock(&q->sysfs_lock); mutex_unlock(&q->sysfs_lock);
ret = 0;
unlock: unlock:
mutex_unlock(&q->sysfs_dir_lock); mutex_unlock(&q->sysfs_dir_lock);
@ -917,6 +907,16 @@ int blk_register_queue(struct gendisk *disk)
percpu_ref_switch_to_percpu(&q->q_usage_counter); percpu_ref_switch_to_percpu(&q->q_usage_counter);
} }
return ret;
put_dev:
disk_unregister_independent_access_ranges(disk);
mutex_unlock(&q->sysfs_lock);
mutex_unlock(&q->sysfs_dir_lock);
kobject_del(&q->kobj);
blk_trace_remove_sysfs(dev);
kobject_put(&dev->kobj);
return ret; return ret;
} }
@ -962,6 +962,7 @@ void blk_unregister_queue(struct gendisk *disk)
mutex_lock(&q->sysfs_lock); mutex_lock(&q->sysfs_lock);
if (q->elevator) if (q->elevator)
elv_unregister_queue(q); elv_unregister_queue(q);
disk_unregister_independent_access_ranges(disk);
mutex_unlock(&q->sysfs_lock); mutex_unlock(&q->sysfs_lock);
mutex_unlock(&q->sysfs_dir_lock); mutex_unlock(&q->sysfs_dir_lock);


@ -13,6 +13,8 @@
#include <linux/blk-cgroup.h> #include <linux/blk-cgroup.h>
#include "blk.h" #include "blk.h"
#include "blk-cgroup-rwstat.h" #include "blk-cgroup-rwstat.h"
#include "blk-stat.h"
#include "blk-throttle.h"
/* Max dispatch from a group in 1 round */ /* Max dispatch from a group in 1 round */
#define THROTL_GRP_QUANTUM 8 #define THROTL_GRP_QUANTUM 8
@ -37,60 +39,9 @@
*/ */
#define LATENCY_FILTERED_HD (1000L) /* 1ms */ #define LATENCY_FILTERED_HD (1000L) /* 1ms */
static struct blkcg_policy blkcg_policy_throtl;
/* A workqueue to queue throttle related work */ /* A workqueue to queue throttle related work */
static struct workqueue_struct *kthrotld_workqueue; static struct workqueue_struct *kthrotld_workqueue;
/*
* To implement hierarchical throttling, throtl_grps form a tree and bios
* are dispatched upwards level by level until they reach the top and get
* issued. When dispatching bios from the children and local group at each
* level, if the bios are dispatched into a single bio_list, there's a risk
* of a local or child group which can queue many bios at once filling up
* the list starving others.
*
* To avoid such starvation, dispatched bios are queued separately
* according to where they came from. When they are again dispatched to
* the parent, they're popped in round-robin order so that no single source
* hogs the dispatch window.
*
* throtl_qnode is used to keep the queued bios separated by their sources.
* Bios are queued to throtl_qnode which in turn is queued to
* throtl_service_queue and then dispatched in round-robin order.
*
* It's also used to track the reference counts on blkg's. A qnode always
* belongs to a throtl_grp and gets queued on itself or the parent, so
* incrementing the reference of the associated throtl_grp when a qnode is
* queued and decrementing when dequeued is enough to keep the whole blkg
* tree pinned while bios are in flight.
*/
struct throtl_qnode {
struct list_head node; /* service_queue->queued[] */
struct bio_list bios; /* queued bios */
struct throtl_grp *tg; /* tg this qnode belongs to */
};
struct throtl_service_queue {
struct throtl_service_queue *parent_sq; /* the parent service_queue */
/*
* Bios queued directly to this service_queue or dispatched from
* children throtl_grp's.
*/
struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */
unsigned int nr_queued[2]; /* number of queued bios */
/*
* RB tree of active children throtl_grp's, which are sorted by
* their ->disptime.
*/
struct rb_root_cached pending_tree; /* RB tree of active tgs */
unsigned int nr_pending; /* # queued in the tree */
unsigned long first_pending_disptime; /* disptime of the first tg */
struct timer_list pending_timer; /* fires on first_pending_disptime */
};
enum tg_state_flags { enum tg_state_flags {
THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */ THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */ THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */
@ -98,93 +49,6 @@ enum tg_state_flags {
#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
enum {
LIMIT_LOW,
LIMIT_MAX,
LIMIT_CNT,
};
struct throtl_grp {
/* must be the first member */
struct blkg_policy_data pd;
/* active throtl group service_queue member */
struct rb_node rb_node;
/* throtl_data this group belongs to */
struct throtl_data *td;
/* this group's service queue */
struct throtl_service_queue service_queue;
/*
* qnode_on_self is used when bios are directly queued to this
* throtl_grp so that local bios compete fairly with bios
* dispatched from children. qnode_on_parent is used when bios are
* dispatched from this throtl_grp into its parent and will compete
* with the sibling qnode_on_parents and the parent's
* qnode_on_self.
*/
struct throtl_qnode qnode_on_self[2];
struct throtl_qnode qnode_on_parent[2];
/*
* Dispatch time in jiffies. This is the estimated time when group
* will unthrottle and is ready to dispatch more bio. It is used as
* key to sort active groups in service tree.
*/
unsigned long disptime;
unsigned int flags;
/* are there any throtl rules between this group and td? */
bool has_rules[2];
/* internally used bytes per second rate limits */
uint64_t bps[2][LIMIT_CNT];
/* user configured bps limits */
uint64_t bps_conf[2][LIMIT_CNT];
/* internally used IOPS limits */
unsigned int iops[2][LIMIT_CNT];
/* user configured IOPS limits */
unsigned int iops_conf[2][LIMIT_CNT];
/* Number of bytes dispatched in current slice */
uint64_t bytes_disp[2];
/* Number of bio's dispatched in current slice */
unsigned int io_disp[2];
unsigned long last_low_overflow_time[2];
uint64_t last_bytes_disp[2];
unsigned int last_io_disp[2];
unsigned long last_check_time;
unsigned long latency_target; /* us */
unsigned long latency_target_conf; /* us */
/* When did we start a new slice */
unsigned long slice_start[2];
unsigned long slice_end[2];
unsigned long last_finish_time; /* ns / 1024 */
unsigned long checked_last_finish_time; /* ns / 1024 */
unsigned long avg_idletime; /* ns / 1024 */
unsigned long idletime_threshold; /* us */
unsigned long idletime_threshold_conf; /* us */
unsigned int bio_cnt; /* total bios */
unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
unsigned long bio_cnt_reset_time;
atomic_t io_split_cnt[2];
atomic_t last_io_split_cnt[2];
struct blkg_rwstat stat_bytes;
struct blkg_rwstat stat_ios;
};
/* We measure latency for request size from <= 4k to >= 1M */ /* We measure latency for request size from <= 4k to >= 1M */
#define LATENCY_BUCKET_SIZE 9 #define LATENCY_BUCKET_SIZE 9
@ -231,16 +95,6 @@ struct throtl_data
static void throtl_pending_timer_fn(struct timer_list *t); static void throtl_pending_timer_fn(struct timer_list *t);
static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
{
return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
}
static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
{
return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
}
static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg) static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
{ {
return pd_to_blkg(&tg->pd); return pd_to_blkg(&tg->pd);
@ -1794,7 +1648,7 @@ static void throtl_shutdown_wq(struct request_queue *q)
cancel_work_sync(&td->dispatch_work); cancel_work_sync(&td->dispatch_work);
} }
static struct blkcg_policy blkcg_policy_throtl = { struct blkcg_policy blkcg_policy_throtl = {
.dfl_cftypes = throtl_files, .dfl_cftypes = throtl_files,
.legacy_cftypes = throtl_legacy_files, .legacy_cftypes = throtl_legacy_files,
@ -2208,9 +2062,9 @@ void blk_throtl_charge_bio_split(struct bio *bio)
} while (parent); } while (parent);
} }
bool blk_throtl_bio(struct bio *bio) bool __blk_throtl_bio(struct bio *bio)
{ {
struct request_queue *q = bio->bi_bdev->bd_disk->queue; struct request_queue *q = bdev_get_queue(bio->bi_bdev);
struct blkcg_gq *blkg = bio->bi_blkg; struct blkcg_gq *blkg = bio->bi_blkg;
struct throtl_qnode *qn = NULL; struct throtl_qnode *qn = NULL;
struct throtl_grp *tg = blkg_to_tg(blkg); struct throtl_grp *tg = blkg_to_tg(blkg);
@ -2221,19 +2075,12 @@ bool blk_throtl_bio(struct bio *bio)
rcu_read_lock(); rcu_read_lock();
/* see throtl_charge_bio() */
if (bio_flagged(bio, BIO_THROTTLED))
goto out;
if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) { if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) {
blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf, blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf,
bio->bi_iter.bi_size); bio->bi_iter.bi_size);
blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1); blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1);
} }
if (!tg->has_rules[rw])
goto out;
spin_lock_irq(&q->queue_lock); spin_lock_irq(&q->queue_lock);
throtl_update_latency_buckets(td); throtl_update_latency_buckets(td);
@ -2317,7 +2164,6 @@ bool blk_throtl_bio(struct bio *bio)
out_unlock: out_unlock:
spin_unlock_irq(&q->queue_lock); spin_unlock_irq(&q->queue_lock);
out:
bio_set_flag(bio, BIO_THROTTLED); bio_set_flag(bio, BIO_THROTTLED);
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW #ifdef CONFIG_BLK_DEV_THROTTLING_LOW


@ -2,15 +2,12 @@
#ifndef BLK_INTERNAL_H #ifndef BLK_INTERNAL_H
#define BLK_INTERNAL_H #define BLK_INTERNAL_H
#include <linux/idr.h>
#include <linux/blk-mq.h>
#include <linux/part_stat.h>
#include <linux/blk-crypto.h> #include <linux/blk-crypto.h>
#include <linux/memblock.h> /* for max_pfn/max_low_pfn */ #include <linux/memblock.h> /* for max_pfn/max_low_pfn */
#include <xen/xen.h> #include <xen/xen.h>
#include "blk-crypto-internal.h" #include "blk-crypto-internal.h"
#include "blk-mq.h"
#include "blk-mq-sched.h" struct elevator_type;
/* Max future timer expiry for timeouts */ /* Max future timer expiry for timeouts */
#define BLK_MAX_TIMEOUT (5 * HZ) #define BLK_MAX_TIMEOUT (5 * HZ)
@ -30,15 +27,10 @@ struct blk_flush_queue {
}; };
extern struct kmem_cache *blk_requestq_cachep; extern struct kmem_cache *blk_requestq_cachep;
extern struct kmem_cache *blk_requestq_srcu_cachep;
extern struct kobj_type blk_queue_ktype; extern struct kobj_type blk_queue_ktype;
extern struct ida blk_queue_ida; extern struct ida blk_queue_ida;
static inline struct blk_flush_queue *
blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx)
{
return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq;
}
static inline void __blk_get_queue(struct request_queue *q) static inline void __blk_get_queue(struct request_queue *q)
{ {
kobject_get(&q->kobj); kobject_get(&q->kobj);
@ -53,6 +45,41 @@ void blk_free_flush_queue(struct blk_flush_queue *q);
void blk_freeze_queue(struct request_queue *q); void blk_freeze_queue(struct request_queue *q);
void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic); void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic);
void blk_queue_start_drain(struct request_queue *q); void blk_queue_start_drain(struct request_queue *q);
int __bio_queue_enter(struct request_queue *q, struct bio *bio);
bool submit_bio_checks(struct bio *bio);
static inline bool blk_try_enter_queue(struct request_queue *q, bool pm)
{
rcu_read_lock();
if (!percpu_ref_tryget_live_rcu(&q->q_usage_counter))
goto fail;
/*
* The code that increments the pm_only counter must ensure that the
* counter is globally visible before the queue is unfrozen.
*/
if (blk_queue_pm_only(q) &&
(!pm || queue_rpm_status(q) == RPM_SUSPENDED))
goto fail_put;
rcu_read_unlock();
return true;
fail_put:
blk_queue_exit(q);
fail:
rcu_read_unlock();
return false;
}
static inline int bio_queue_enter(struct bio *bio)
{
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
if (blk_try_enter_queue(q, false))
return 0;
return __bio_queue_enter(q, bio);
}
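As a hedged illustration of how these helpers pair up (not taken from this patch; example_submit is a made-up name), a caller that successfully enters the queue is expected to balance it with blk_queue_exit() once it is done with the bio:

static void example_submit(struct bio *bio)
{
	struct request_queue *q = bdev_get_queue(bio->bi_bdev);

	if (bio_queue_enter(bio))
		return;	/* enter failed; __bio_queue_enter() completed the bio */

	/* ... hand the bio to the driver / blk-mq here ... */

	blk_queue_exit(q);	/* drop the q_usage_counter reference */
}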
#define BIO_INLINE_VECS 4 #define BIO_INLINE_VECS 4
struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
@ -94,6 +121,44 @@ static inline bool bvec_gap_to_prev(struct request_queue *q,
return __bvec_gap_to_prev(q, bprv, offset); return __bvec_gap_to_prev(q, bprv, offset);
} }
static inline bool rq_mergeable(struct request *rq)
{
if (blk_rq_is_passthrough(rq))
return false;
if (req_op(rq) == REQ_OP_FLUSH)
return false;
if (req_op(rq) == REQ_OP_WRITE_ZEROES)
return false;
if (req_op(rq) == REQ_OP_ZONE_APPEND)
return false;
if (rq->cmd_flags & REQ_NOMERGE_FLAGS)
return false;
if (rq->rq_flags & RQF_NOMERGE_FLAGS)
return false;
return true;
}
/*
* There are two different ways to handle DISCARD merges:
* 1) If max_discard_segments > 1, the driver treats every bio as a range and
sends the bios to the controller together. The ranges don't need to be
* contiguous.
* 2) Otherwise, the request will be a normal read/write request. The ranges
* need to be contiguous.
*/
static inline bool blk_discard_mergable(struct request *req)
{
if (req_op(req) == REQ_OP_DISCARD &&
queue_max_discard_segments(req->q) > 1)
return true;
return false;
}
#ifdef CONFIG_BLK_DEV_INTEGRITY #ifdef CONFIG_BLK_DEV_INTEGRITY
void blk_flush_integrity(void); void blk_flush_integrity(void);
bool __bio_integrity_endio(struct bio *); bool __bio_integrity_endio(struct bio *);
@ -175,15 +240,13 @@ static inline void blk_integrity_del(struct gendisk *disk)
unsigned long blk_rq_timeout(unsigned long timeout); unsigned long blk_rq_timeout(unsigned long timeout);
void blk_add_timer(struct request *req); void blk_add_timer(struct request *req);
const char *blk_status_to_str(blk_status_t status);
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs, struct request **same_queue_rq); unsigned int nr_segs);
bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
struct bio *bio, unsigned int nr_segs); struct bio *bio, unsigned int nr_segs);
void blk_account_io_start(struct request *req);
void blk_account_io_done(struct request *req, u64 now);
/* /*
* Plug flush limits * Plug flush limits
*/ */
@ -199,19 +262,10 @@ void blk_insert_flush(struct request *rq);
int elevator_switch_mq(struct request_queue *q, int elevator_switch_mq(struct request_queue *q,
struct elevator_type *new_e); struct elevator_type *new_e);
void __elevator_exit(struct request_queue *, struct elevator_queue *); void elevator_exit(struct request_queue *q);
int elv_register_queue(struct request_queue *q, bool uevent); int elv_register_queue(struct request_queue *q, bool uevent);
void elv_unregister_queue(struct request_queue *q); void elv_unregister_queue(struct request_queue *q);
static inline void elevator_exit(struct request_queue *q,
struct elevator_queue *e)
{
lockdep_assert_held(&q->sysfs_lock);
blk_mq_sched_free_requests(q);
__elevator_exit(q, e);
}
ssize_t part_size_show(struct device *dev, struct device_attribute *attr, ssize_t part_size_show(struct device *dev, struct device_attribute *attr,
char *buf); char *buf);
ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, ssize_t part_stat_show(struct device *dev, struct device_attribute *attr,
@ -226,7 +280,32 @@ ssize_t part_timeout_show(struct device *, struct device_attribute *, char *);
ssize_t part_timeout_store(struct device *, struct device_attribute *, ssize_t part_timeout_store(struct device *, struct device_attribute *,
const char *, size_t); const char *, size_t);
void __blk_queue_split(struct bio **bio, unsigned int *nr_segs); static inline bool blk_may_split(struct request_queue *q, struct bio *bio)
{
switch (bio_op(bio)) {
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
case REQ_OP_WRITE_ZEROES:
case REQ_OP_WRITE_SAME:
return true; /* non-trivial splitting decisions */
default:
break;
}
/*
* All drivers must accept single-segment bios that are <= PAGE_SIZE.
* This is a quick and dirty check that relies on the fact that
* bi_io_vec[0] is always valid if a bio has data. The check might
* lead to occasional false negatives when bios are cloned, but compared
* to the performance impact of cloned bios themselves the loop below
* doesn't matter anyway.
*/
return q->limits.chunk_sectors || bio->bi_vcnt != 1 ||
bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE;
}
void __blk_queue_split(struct request_queue *q, struct bio **bio,
unsigned int *nr_segs);
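A minimal sketch of how a submission path is expected to combine the two (assuming nothing beyond the declarations above; the wrapper name is illustrative): only take the slow path when blk_may_split() says the bio might actually need splitting.

static inline void example_queue_split(struct request_queue *q, struct bio **bio)
{
	unsigned int nr_segs;

	if (blk_may_split(q, *bio))
		__blk_queue_split(q, bio, &nr_segs);
}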
int ll_back_merge_fn(struct request *req, struct bio *bio, int ll_back_merge_fn(struct request *req, struct bio *bio,
unsigned int nr_segs); unsigned int nr_segs);
bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
@ -246,9 +325,11 @@ int blk_dev_init(void);
*/ */
static inline bool blk_do_io_stat(struct request *rq) static inline bool blk_do_io_stat(struct request *rq)
{ {
return rq->rq_disk && (rq->rq_flags & RQF_IO_STAT); return (rq->rq_flags & RQF_IO_STAT) && rq->q->disk;
} }
void update_io_ticks(struct block_device *part, unsigned long now, bool end);
static inline void req_set_nomerge(struct request_queue *q, struct request *req) static inline void req_set_nomerge(struct request_queue *q, struct request *req)
{ {
req->cmd_flags |= REQ_NOMERGE; req->cmd_flags |= REQ_NOMERGE;
@ -283,30 +364,16 @@ static inline unsigned int bio_aligned_discard_max_sectors(
/* /*
* Internal io_context interface * Internal io_context interface
*/ */
void get_io_context(struct io_context *ioc); struct io_cq *ioc_find_get_icq(struct request_queue *q);
struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q); struct io_cq *ioc_lookup_icq(struct request_queue *q);
struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, #ifdef CONFIG_BLK_ICQ
gfp_t gfp_mask);
void ioc_clear_queue(struct request_queue *q); void ioc_clear_queue(struct request_queue *q);
#else
static inline void ioc_clear_queue(struct request_queue *q)
{
}
#endif /* CONFIG_BLK_ICQ */
int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node);
/*
* Internal throttling interface
*/
#ifdef CONFIG_BLK_DEV_THROTTLING
extern int blk_throtl_init(struct request_queue *q);
extern void blk_throtl_exit(struct request_queue *q);
extern void blk_throtl_register_queue(struct request_queue *q);
extern void blk_throtl_charge_bio_split(struct bio *bio);
bool blk_throtl_bio(struct bio *bio);
#else /* CONFIG_BLK_DEV_THROTTLING */
static inline int blk_throtl_init(struct request_queue *q) { return 0; }
static inline void blk_throtl_exit(struct request_queue *q) { }
static inline void blk_throtl_register_queue(struct request_queue *q) { }
static inline void blk_throtl_charge_bio_split(struct bio *bio) { }
static inline bool blk_throtl_bio(struct bio *bio) { return false; }
#endif /* CONFIG_BLK_DEV_THROTTLING */
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page); extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page);
extern ssize_t blk_throtl_sample_time_store(struct request_queue *q, extern ssize_t blk_throtl_sample_time_store(struct request_queue *q,
@ -364,7 +431,15 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
struct page *page, unsigned int len, unsigned int offset, struct page *page, unsigned int len, unsigned int offset,
unsigned int max_sectors, bool *same_page); unsigned int max_sectors, bool *same_page);
struct request_queue *blk_alloc_queue(int node_id); static inline struct kmem_cache *blk_get_queue_kmem_cache(bool srcu)
{
if (srcu)
return blk_requestq_srcu_cachep;
return blk_requestq_cachep;
}
struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu);
int disk_scan_partitions(struct gendisk *disk, fmode_t mode);
int disk_alloc_events(struct gendisk *disk); int disk_alloc_events(struct gendisk *disk);
void disk_add_events(struct gendisk *disk); void disk_add_events(struct gendisk *disk);
@ -374,13 +449,61 @@ extern struct device_attribute dev_attr_events;
extern struct device_attribute dev_attr_events_async; extern struct device_attribute dev_attr_events_async;
extern struct device_attribute dev_attr_events_poll_msecs; extern struct device_attribute dev_attr_events_poll_msecs;
static inline void bio_clear_hipri(struct bio *bio) static inline void bio_clear_polled(struct bio *bio)
{ {
/* can't support alloc cache if we turn off polling */ /* can't support alloc cache if we turn off polling */
bio_clear_flag(bio, BIO_PERCPU_CACHE); bio_clear_flag(bio, BIO_PERCPU_CACHE);
bio->bi_opf &= ~REQ_HIPRI; bio->bi_opf &= ~REQ_POLLED;
} }
long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
extern const struct address_space_operations def_blk_aops; extern const struct address_space_operations def_blk_aops;
int disk_register_independent_access_ranges(struct gendisk *disk,
struct blk_independent_access_ranges *new_iars);
void disk_unregister_independent_access_ranges(struct gendisk *disk);
#ifdef CONFIG_FAIL_MAKE_REQUEST
bool should_fail_request(struct block_device *part, unsigned int bytes);
#else /* CONFIG_FAIL_MAKE_REQUEST */
static inline bool should_fail_request(struct block_device *part,
unsigned int bytes)
{
return false;
}
#endif /* CONFIG_FAIL_MAKE_REQUEST */
/*
* Optimized request reference counting. Ideally we'd make timeouts be more
* clever, as that's the only reason we need references at all... But until
* this happens, this is faster than using refcount_t. Also see:
*
* abc54d634334 ("io_uring: switch to atomic_t for io_kiocb reference count")
*/
#define req_ref_zero_or_close_to_overflow(req) \
((unsigned int) atomic_read(&(req->ref)) + 127u <= 127u)
static inline bool req_ref_inc_not_zero(struct request *req)
{
return atomic_inc_not_zero(&req->ref);
}
static inline bool req_ref_put_and_test(struct request *req)
{
WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
return atomic_dec_and_test(&req->ref);
}
static inline void req_ref_set(struct request *req, int value)
{
atomic_set(&req->ref, value);
}
static inline int req_ref_read(struct request *req)
{
return atomic_read(&req->ref);
}
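A sketch of the intended usage pattern, loosely modelled on the timeout path (illustrative only; example_free_request() is a hypothetical final-free hook, not a real API): only touch a request whose reference count could be raised from non-zero, and free it when the last reference is dropped.

static void example_free_request(struct request *req);	/* hypothetical */

static void example_inspect_request(struct request *req)
{
	if (!req_ref_inc_not_zero(req))
		return;	/* request already completed; do not touch it */

	/* ... safely inspect or time out @req here ... */

	if (req_ref_put_and_test(req))
		example_free_request(req);	/* we dropped the last reference */
}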
#endif /* BLK_INTERNAL_H */ #endif /* BLK_INTERNAL_H */


@ -14,6 +14,7 @@
#include <linux/pagemap.h> #include <linux/pagemap.h>
#include <linux/mempool.h> #include <linux/mempool.h>
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/blk-cgroup.h>
#include <linux/backing-dev.h> #include <linux/backing-dev.h>
#include <linux/init.h> #include <linux/init.h>
#include <linux/hash.h> #include <linux/hash.h>


@ -31,6 +31,7 @@ static int bsg_transport_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr,
struct bsg_job *job; struct bsg_job *job;
struct request *rq; struct request *rq;
struct bio *bio; struct bio *bio;
void *reply;
int ret; int ret;
if (hdr->protocol != BSG_PROTOCOL_SCSI || if (hdr->protocol != BSG_PROTOCOL_SCSI ||
@ -39,22 +40,28 @@ static int bsg_transport_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr,
if (!capable(CAP_SYS_RAWIO)) if (!capable(CAP_SYS_RAWIO))
return -EPERM; return -EPERM;
rq = blk_get_request(q, hdr->dout_xfer_len ? rq = blk_mq_alloc_request(q, hdr->dout_xfer_len ?
REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0); REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
if (IS_ERR(rq)) if (IS_ERR(rq))
return PTR_ERR(rq); return PTR_ERR(rq);
rq->timeout = timeout; rq->timeout = timeout;
job = blk_mq_rq_to_pdu(rq); job = blk_mq_rq_to_pdu(rq);
reply = job->reply;
memset(job, 0, sizeof(*job));
job->reply = reply;
job->reply_len = SCSI_SENSE_BUFFERSIZE;
job->dd_data = job + 1;
job->request_len = hdr->request_len; job->request_len = hdr->request_len;
job->request = memdup_user(uptr64(hdr->request), hdr->request_len); job->request = memdup_user(uptr64(hdr->request), hdr->request_len);
if (IS_ERR(job->request)) { if (IS_ERR(job->request)) {
ret = PTR_ERR(job->request); ret = PTR_ERR(job->request);
goto out_put_request; goto out_free_rq;
} }
if (hdr->dout_xfer_len && hdr->din_xfer_len) { if (hdr->dout_xfer_len && hdr->din_xfer_len) {
job->bidi_rq = blk_get_request(rq->q, REQ_OP_DRV_IN, 0); job->bidi_rq = blk_mq_alloc_request(rq->q, REQ_OP_DRV_IN, 0);
if (IS_ERR(job->bidi_rq)) { if (IS_ERR(job->bidi_rq)) {
ret = PTR_ERR(job->bidi_rq); ret = PTR_ERR(job->bidi_rq);
goto out_free_job_request; goto out_free_job_request;
@ -85,7 +92,7 @@ static int bsg_transport_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr,
goto out_unmap_bidi_rq; goto out_unmap_bidi_rq;
bio = rq->bio; bio = rq->bio;
blk_execute_rq(NULL, rq, !(hdr->flags & BSG_FLAG_Q_AT_TAIL)); blk_execute_rq(rq, !(hdr->flags & BSG_FLAG_Q_AT_TAIL));
/* /*
* The assignments below don't make much sense, but are kept for * The assignments below don't make much sense, but are kept for
@ -134,11 +141,11 @@ static int bsg_transport_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr,
blk_rq_unmap_user(job->bidi_bio); blk_rq_unmap_user(job->bidi_bio);
out_free_bidi_rq: out_free_bidi_rq:
if (job->bidi_rq) if (job->bidi_rq)
blk_put_request(job->bidi_rq); blk_mq_free_request(job->bidi_rq);
out_free_job_request: out_free_job_request:
kfree(job->request); kfree(job->request);
out_put_request: out_free_rq:
blk_put_request(rq); blk_mq_free_request(rq);
return ret; return ret;
} }
@ -302,18 +309,6 @@ static int bsg_init_rq(struct blk_mq_tag_set *set, struct request *req,
return 0; return 0;
} }
/* called right before the request is given to the request_queue user */
static void bsg_initialize_rq(struct request *req)
{
struct bsg_job *job = blk_mq_rq_to_pdu(req);
void *reply = job->reply;
memset(job, 0, sizeof(*job));
job->reply = reply;
job->reply_len = SCSI_SENSE_BUFFERSIZE;
job->dd_data = job + 1;
}
static void bsg_exit_rq(struct blk_mq_tag_set *set, struct request *req, static void bsg_exit_rq(struct blk_mq_tag_set *set, struct request *req,
unsigned int hctx_idx) unsigned int hctx_idx)
{ {
@ -350,7 +345,6 @@ static const struct blk_mq_ops bsg_mq_ops = {
.queue_rq = bsg_queue_rq, .queue_rq = bsg_queue_rq,
.init_request = bsg_init_rq, .init_request = bsg_init_rq,
.exit_request = bsg_exit_rq, .exit_request = bsg_exit_rq,
.initialize_rq_fn = bsg_initialize_rq,
.complete = bsg_complete, .complete = bsg_complete,
.timeout = bsg_timeout, .timeout = bsg_timeout,
}; };


@ -26,7 +26,6 @@
#include <linux/kernel.h> #include <linux/kernel.h>
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/bio.h> #include <linux/bio.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/slab.h> #include <linux/slab.h>
@ -40,6 +39,7 @@
#include <trace/events/block.h> #include <trace/events/block.h>
#include "elevator.h"
#include "blk.h" #include "blk.h"
#include "blk-mq-sched.h" #include "blk-mq-sched.h"
#include "blk-pm.h" #include "blk-pm.h"
@ -188,8 +188,10 @@ static void elevator_release(struct kobject *kobj)
kfree(e); kfree(e);
} }
void __elevator_exit(struct request_queue *q, struct elevator_queue *e) void elevator_exit(struct request_queue *q)
{ {
struct elevator_queue *e = q->elevator;
mutex_lock(&e->sysfs_lock); mutex_lock(&e->sysfs_lock);
blk_mq_exit_sched(q, e); blk_mq_exit_sched(q, e);
mutex_unlock(&e->sysfs_lock); mutex_unlock(&e->sysfs_lock);
@ -593,7 +595,8 @@ int elevator_switch_mq(struct request_queue *q,
elv_unregister_queue(q); elv_unregister_queue(q);
ioc_clear_queue(q); ioc_clear_queue(q);
elevator_exit(q, q->elevator); blk_mq_sched_free_rqs(q);
elevator_exit(q);
} }
ret = blk_mq_init_sched(q, new_e); ret = blk_mq_init_sched(q, new_e);
@ -603,7 +606,8 @@ int elevator_switch_mq(struct request_queue *q,
if (new_e) { if (new_e) {
ret = elv_register_queue(q, true); ret = elv_register_queue(q, true);
if (ret) { if (ret) {
elevator_exit(q, q->elevator); blk_mq_sched_free_rqs(q);
elevator_exit(q);
goto out; goto out;
} }
} }
@ -635,7 +639,7 @@ static struct elevator_type *elevator_get_default(struct request_queue *q)
return NULL; return NULL;
if (q->nr_hw_queues != 1 && if (q->nr_hw_queues != 1 &&
!blk_mq_is_sbitmap_shared(q->tag_set->flags)) !blk_mq_is_shared_tags(q->tag_set->flags))
return NULL; return NULL;
return elevator_get(q, "mq-deadline", false); return elevator_get(q, "mq-deadline", false);


@ -15,9 +15,10 @@
#include <linux/falloc.h> #include <linux/falloc.h>
#include <linux/suspend.h> #include <linux/suspend.h>
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/module.h>
#include "blk.h" #include "blk.h"
static struct inode *bdev_file_inode(struct file *file) static inline struct inode *bdev_file_inode(struct file *file)
{ {
return file->f_mapping->host; return file->f_mapping->host;
} }
@ -54,14 +55,12 @@ static void blkdev_bio_end_io_simple(struct bio *bio)
static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
struct iov_iter *iter, unsigned int nr_pages) struct iov_iter *iter, unsigned int nr_pages)
{ {
struct file *file = iocb->ki_filp; struct block_device *bdev = iocb->ki_filp->private_data;
struct block_device *bdev = I_BDEV(bdev_file_inode(file));
struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs; struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
loff_t pos = iocb->ki_pos; loff_t pos = iocb->ki_pos;
bool should_dirty = false; bool should_dirty = false;
struct bio bio; struct bio bio;
ssize_t ret; ssize_t ret;
blk_qc_t qc;
if ((pos | iov_iter_alignment(iter)) & if ((pos | iov_iter_alignment(iter)) &
(bdev_logical_block_size(bdev) - 1)) (bdev_logical_block_size(bdev) - 1))
@ -78,7 +77,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
bio_init(&bio, vecs, nr_pages); bio_init(&bio, vecs, nr_pages);
bio_set_dev(&bio, bdev); bio_set_dev(&bio, bdev);
bio.bi_iter.bi_sector = pos >> 9; bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio.bi_write_hint = iocb->ki_hint; bio.bi_write_hint = iocb->ki_hint;
bio.bi_private = current; bio.bi_private = current;
bio.bi_end_io = blkdev_bio_end_io_simple; bio.bi_end_io = blkdev_bio_end_io_simple;
@ -102,13 +101,12 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
if (iocb->ki_flags & IOCB_HIPRI) if (iocb->ki_flags & IOCB_HIPRI)
bio_set_polled(&bio, iocb); bio_set_polled(&bio, iocb);
qc = submit_bio(&bio); submit_bio(&bio);
for (;;) { for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE); set_current_state(TASK_UNINTERRUPTIBLE);
if (!READ_ONCE(bio.bi_private)) if (!READ_ONCE(bio.bi_private))
break; break;
if (!(iocb->ki_flags & IOCB_HIPRI) || if (!(iocb->ki_flags & IOCB_HIPRI) || !bio_poll(&bio, NULL, 0))
!blk_poll(bdev_get_queue(bdev), qc, true))
blk_io_schedule(); blk_io_schedule();
} }
__set_current_state(TASK_RUNNING); __set_current_state(TASK_RUNNING);
@ -126,6 +124,11 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
return ret; return ret;
} }
enum {
DIO_SHOULD_DIRTY = 1,
DIO_IS_SYNC = 2,
};
struct blkdev_dio { struct blkdev_dio {
union { union {
struct kiocb *iocb; struct kiocb *iocb;
@ -133,35 +136,27 @@ struct blkdev_dio {
}; };
size_t size; size_t size;
atomic_t ref; atomic_t ref;
bool multi_bio : 1; unsigned int flags;
bool should_dirty : 1; struct bio bio ____cacheline_aligned_in_smp;
bool is_sync : 1;
struct bio bio;
}; };
static struct bio_set blkdev_dio_pool; static struct bio_set blkdev_dio_pool;
static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
{
struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
struct request_queue *q = bdev_get_queue(bdev);
return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
}
static void blkdev_bio_end_io(struct bio *bio) static void blkdev_bio_end_io(struct bio *bio)
{ {
struct blkdev_dio *dio = bio->bi_private; struct blkdev_dio *dio = bio->bi_private;
bool should_dirty = dio->should_dirty; bool should_dirty = dio->flags & DIO_SHOULD_DIRTY;
if (bio->bi_status && !dio->bio.bi_status) if (bio->bi_status && !dio->bio.bi_status)
dio->bio.bi_status = bio->bi_status; dio->bio.bi_status = bio->bi_status;
if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) { if (atomic_dec_and_test(&dio->ref)) {
if (!dio->is_sync) { if (!(dio->flags & DIO_IS_SYNC)) {
struct kiocb *iocb = dio->iocb; struct kiocb *iocb = dio->iocb;
ssize_t ret; ssize_t ret;
WRITE_ONCE(iocb->private, NULL);
if (likely(!dio->bio.bi_status)) { if (likely(!dio->bio.bi_status)) {
ret = dio->size; ret = dio->size;
iocb->ki_pos += ret; iocb->ki_pos += ret;
@ -169,9 +164,8 @@ static void blkdev_bio_end_io(struct bio *bio)
ret = blk_status_to_errno(dio->bio.bi_status); ret = blk_status_to_errno(dio->bio.bi_status);
} }
dio->iocb->ki_complete(iocb, ret, 0); dio->iocb->ki_complete(iocb, ret);
if (dio->multi_bio) bio_put(&dio->bio);
bio_put(&dio->bio);
} else { } else {
struct task_struct *waiter = dio->waiter; struct task_struct *waiter = dio->waiter;
@ -191,16 +185,12 @@ static void blkdev_bio_end_io(struct bio *bio)
static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
unsigned int nr_pages) unsigned int nr_pages)
{ {
struct file *file = iocb->ki_filp; struct block_device *bdev = iocb->ki_filp->private_data;
struct inode *inode = bdev_file_inode(file);
struct block_device *bdev = I_BDEV(inode);
struct blk_plug plug; struct blk_plug plug;
struct blkdev_dio *dio; struct blkdev_dio *dio;
struct bio *bio; struct bio *bio;
bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0;
bool is_read = (iov_iter_rw(iter) == READ), is_sync; bool is_read = (iov_iter_rw(iter) == READ), is_sync;
loff_t pos = iocb->ki_pos; loff_t pos = iocb->ki_pos;
blk_qc_t qc = BLK_QC_T_NONE;
int ret = 0; int ret = 0;
if ((pos | iov_iter_alignment(iter)) & if ((pos | iov_iter_alignment(iter)) &
@ -210,28 +200,31 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool); bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool);
dio = container_of(bio, struct blkdev_dio, bio); dio = container_of(bio, struct blkdev_dio, bio);
dio->is_sync = is_sync = is_sync_kiocb(iocb); atomic_set(&dio->ref, 1);
if (dio->is_sync) { /*
* Grab an extra reference to ensure the dio structure which is embedded
* into the first bio stays around.
*/
bio_get(bio);
is_sync = is_sync_kiocb(iocb);
if (is_sync) {
dio->flags = DIO_IS_SYNC;
dio->waiter = current; dio->waiter = current;
bio_get(bio);
} else { } else {
dio->flags = 0;
dio->iocb = iocb; dio->iocb = iocb;
} }
dio->size = 0; dio->size = 0;
dio->multi_bio = false; if (is_read && iter_is_iovec(iter))
dio->should_dirty = is_read && iter_is_iovec(iter); dio->flags |= DIO_SHOULD_DIRTY;
/* blk_start_plug(&plug);
* Don't plug for HIPRI/polled IO, as those should go straight
* to issue
*/
if (!is_poll)
blk_start_plug(&plug);
for (;;) { for (;;) {
bio_set_dev(bio, bdev); bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = pos >> 9; bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio->bi_write_hint = iocb->ki_hint; bio->bi_write_hint = iocb->ki_hint;
bio->bi_private = dio; bio->bi_private = dio;
bio->bi_end_io = blkdev_bio_end_io; bio->bi_end_io = blkdev_bio_end_io;
@ -246,7 +239,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
if (is_read) { if (is_read) {
bio->bi_opf = REQ_OP_READ; bio->bi_opf = REQ_OP_READ;
if (dio->should_dirty) if (dio->flags & DIO_SHOULD_DIRTY)
bio_set_pages_dirty(bio); bio_set_pages_dirty(bio);
} else { } else {
bio->bi_opf = dio_bio_write_op(iocb); bio->bi_opf = dio_bio_write_op(iocb);
@ -260,40 +253,15 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS); nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS);
if (!nr_pages) { if (!nr_pages) {
bool polled = false; submit_bio(bio);
if (iocb->ki_flags & IOCB_HIPRI) {
bio_set_polled(bio, iocb);
polled = true;
}
qc = submit_bio(bio);
if (polled)
WRITE_ONCE(iocb->ki_cookie, qc);
break; break;
} }
atomic_inc(&dio->ref);
if (!dio->multi_bio) {
/*
* AIO needs an extra reference to ensure the dio
* structure which is embedded into the first bio
* stays around.
*/
if (!is_sync)
bio_get(bio);
dio->multi_bio = true;
atomic_set(&dio->ref, 2);
} else {
atomic_inc(&dio->ref);
}
submit_bio(bio); submit_bio(bio);
bio = bio_alloc(GFP_KERNEL, nr_pages); bio = bio_alloc(GFP_KERNEL, nr_pages);
} }
if (!is_poll) blk_finish_plug(&plug);
blk_finish_plug(&plug);
if (!is_sync) if (!is_sync)
return -EIOCBQUEUED; return -EIOCBQUEUED;
@ -302,10 +270,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
set_current_state(TASK_UNINTERRUPTIBLE); set_current_state(TASK_UNINTERRUPTIBLE);
if (!READ_ONCE(dio->waiter)) if (!READ_ONCE(dio->waiter))
break; break;
blk_io_schedule();
if (!(iocb->ki_flags & IOCB_HIPRI) ||
!blk_poll(bdev_get_queue(bdev), qc, true))
blk_io_schedule();
} }
__set_current_state(TASK_RUNNING); __set_current_state(TASK_RUNNING);
@ -318,6 +283,95 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
return ret; return ret;
} }
static void blkdev_bio_end_io_async(struct bio *bio)
{
struct blkdev_dio *dio = container_of(bio, struct blkdev_dio, bio);
struct kiocb *iocb = dio->iocb;
ssize_t ret;
WRITE_ONCE(iocb->private, NULL);
if (likely(!bio->bi_status)) {
ret = dio->size;
iocb->ki_pos += ret;
} else {
ret = blk_status_to_errno(bio->bi_status);
}
iocb->ki_complete(iocb, ret);
if (dio->flags & DIO_SHOULD_DIRTY) {
bio_check_pages_dirty(bio);
} else {
bio_release_pages(bio, false);
bio_put(bio);
}
}
static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
struct iov_iter *iter,
unsigned int nr_pages)
{
struct block_device *bdev = iocb->ki_filp->private_data;
struct blkdev_dio *dio;
struct bio *bio;
loff_t pos = iocb->ki_pos;
int ret = 0;
if ((pos | iov_iter_alignment(iter)) &
(bdev_logical_block_size(bdev) - 1))
return -EINVAL;
bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool);
dio = container_of(bio, struct blkdev_dio, bio);
dio->flags = 0;
dio->iocb = iocb;
bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio->bi_write_hint = iocb->ki_hint;
bio->bi_end_io = blkdev_bio_end_io_async;
bio->bi_ioprio = iocb->ki_ioprio;
if (iov_iter_is_bvec(iter)) {
/*
* Users don't rely on the iterator being in any particular
* state for async I/O returning -EIOCBQUEUED, hence we can
* avoid expensive iov_iter_advance(). Bypass
* bio_iov_iter_get_pages() and set the bvec directly.
*/
bio_iov_bvec_set(bio, iter);
} else {
ret = bio_iov_iter_get_pages(bio, iter);
if (unlikely(ret)) {
bio_put(bio);
return ret;
}
}
dio->size = bio->bi_iter.bi_size;
if (iov_iter_rw(iter) == READ) {
bio->bi_opf = REQ_OP_READ;
if (iter_is_iovec(iter)) {
dio->flags |= DIO_SHOULD_DIRTY;
bio_set_pages_dirty(bio);
}
} else {
bio->bi_opf = dio_bio_write_op(iocb);
task_io_account_write(bio->bi_iter.bi_size);
}
if (iocb->ki_flags & IOCB_HIPRI) {
bio->bi_opf |= REQ_POLLED | REQ_NOWAIT;
submit_bio(bio);
WRITE_ONCE(iocb->private, bio);
} else {
if (iocb->ki_flags & IOCB_NOWAIT)
bio->bi_opf |= REQ_NOWAIT;
submit_bio(bio);
}
return -EIOCBQUEUED;
}
static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{ {
unsigned int nr_pages; unsigned int nr_pages;
@ -326,9 +380,11 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
return 0; return 0;
nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_VECS) if (likely(nr_pages <= BIO_MAX_VECS)) {
return __blkdev_direct_IO_simple(iocb, iter, nr_pages); if (is_sync_kiocb(iocb))
return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
return __blkdev_direct_IO_async(iocb, iter, nr_pages);
}
return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages)); return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
} }
@ -405,8 +461,7 @@ static loff_t blkdev_llseek(struct file *file, loff_t offset, int whence)
static int blkdev_fsync(struct file *filp, loff_t start, loff_t end, static int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
int datasync) int datasync)
{ {
struct inode *bd_inode = bdev_file_inode(filp); struct block_device *bdev = filp->private_data;
struct block_device *bdev = I_BDEV(bd_inode);
int error; int error;
error = file_write_and_wait_range(filp, start, end); error = file_write_and_wait_range(filp, start, end);
@ -448,6 +503,8 @@ static int blkdev_open(struct inode *inode, struct file *filp)
bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, filp); bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, filp);
if (IS_ERR(bdev)) if (IS_ERR(bdev))
return PTR_ERR(bdev); return PTR_ERR(bdev);
filp->private_data = bdev;
filp->f_mapping = bdev->bd_inode->i_mapping; filp->f_mapping = bdev->bd_inode->i_mapping;
filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
return 0; return 0;
@ -455,29 +512,12 @@ static int blkdev_open(struct inode *inode, struct file *filp)
static int blkdev_close(struct inode *inode, struct file *filp) static int blkdev_close(struct inode *inode, struct file *filp)
{ {
struct block_device *bdev = I_BDEV(bdev_file_inode(filp)); struct block_device *bdev = filp->private_data;
blkdev_put(bdev, filp->f_mode); blkdev_put(bdev, filp->f_mode);
return 0; return 0;
} }
static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
struct block_device *bdev = I_BDEV(bdev_file_inode(file));
fmode_t mode = file->f_mode;
/*
* O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
* to update it before every ioctl.
*/
if (file->f_flags & O_NDELAY)
mode |= FMODE_NDELAY;
else
mode &= ~FMODE_NDELAY;
return blkdev_ioctl(bdev, mode, cmd, arg);
}
/* /*
* Write data to the block device. Only intended for the block device itself * Write data to the block device. Only intended for the block device itself
* and the raw driver which basically is a fake block device. * and the raw driver which basically is a fake block device.
@ -487,14 +527,14 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
*/ */
static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
{ {
struct file *file = iocb->ki_filp; struct block_device *bdev = iocb->ki_filp->private_data;
struct inode *bd_inode = bdev_file_inode(file); struct inode *bd_inode = bdev->bd_inode;
loff_t size = i_size_read(bd_inode); loff_t size = bdev_nr_bytes(bdev);
struct blk_plug plug; struct blk_plug plug;
size_t shorted = 0; size_t shorted = 0;
ssize_t ret; ssize_t ret;
if (bdev_read_only(I_BDEV(bd_inode))) if (bdev_read_only(bdev))
return -EPERM; return -EPERM;
if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev)) if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev))
@ -526,24 +566,58 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{ {
struct file *file = iocb->ki_filp; struct block_device *bdev = iocb->ki_filp->private_data;
struct inode *bd_inode = bdev_file_inode(file); loff_t size = bdev_nr_bytes(bdev);
loff_t size = i_size_read(bd_inode);
loff_t pos = iocb->ki_pos; loff_t pos = iocb->ki_pos;
size_t shorted = 0; size_t shorted = 0;
ssize_t ret; ssize_t ret = 0;
size_t count;
if (pos >= size) if (unlikely(pos + iov_iter_count(to) > size)) {
return 0; if (pos >= size)
return 0;
size -= pos; size -= pos;
if (iov_iter_count(to) > size) {
shorted = iov_iter_count(to) - size; shorted = iov_iter_count(to) - size;
iov_iter_truncate(to, size); iov_iter_truncate(to, size);
} }
ret = generic_file_read_iter(iocb, to); count = iov_iter_count(to);
iov_iter_reexpand(to, iov_iter_count(to) + shorted); if (!count)
goto reexpand; /* skip atime */
if (iocb->ki_flags & IOCB_DIRECT) {
struct address_space *mapping = iocb->ki_filp->f_mapping;
if (iocb->ki_flags & IOCB_NOWAIT) {
if (filemap_range_needs_writeback(mapping, pos,
pos + count - 1)) {
ret = -EAGAIN;
goto reexpand;
}
} else {
ret = filemap_write_and_wait_range(mapping, pos,
pos + count - 1);
if (ret < 0)
goto reexpand;
}
file_accessed(iocb->ki_filp);
ret = blkdev_direct_IO(iocb, to);
if (ret >= 0) {
iocb->ki_pos += ret;
count -= ret;
}
iov_iter_revert(to, count - iov_iter_count(to));
if (ret < 0 || !count)
goto reexpand;
}
ret = filemap_read(iocb, to, ret);
reexpand:
if (unlikely(shorted))
iov_iter_reexpand(to, iov_iter_count(to) + shorted);
return ret; return ret;
} }
@ -565,7 +639,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
return -EOPNOTSUPP; return -EOPNOTSUPP;
/* Don't go off the end of the device. */ /* Don't go off the end of the device. */
isize = i_size_read(bdev->bd_inode); isize = bdev_nr_bytes(bdev);
if (start >= isize) if (start >= isize)
return -EINVAL; return -EINVAL;
if (end >= isize) { if (end >= isize) {
@ -592,16 +666,18 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
switch (mode) { switch (mode) {
case FALLOC_FL_ZERO_RANGE: case FALLOC_FL_ZERO_RANGE:
case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE: case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
GFP_KERNEL, BLKDEV_ZERO_NOUNMAP); len >> SECTOR_SHIFT, GFP_KERNEL,
BLKDEV_ZERO_NOUNMAP);
break; break;
case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK); len >> SECTOR_SHIFT, GFP_KERNEL,
BLKDEV_ZERO_NOFALLBACK);
break; break;
case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE: case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE:
error = blkdev_issue_discard(bdev, start >> 9, len >> 9, error = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
GFP_KERNEL, 0); len >> SECTOR_SHIFT, GFP_KERNEL, 0);
break; break;
default: default:
error = -EOPNOTSUPP; error = -EOPNOTSUPP;
@ -618,10 +694,10 @@ const struct file_operations def_blk_fops = {
.llseek = blkdev_llseek, .llseek = blkdev_llseek,
.read_iter = blkdev_read_iter, .read_iter = blkdev_read_iter,
.write_iter = blkdev_write_iter, .write_iter = blkdev_write_iter,
.iopoll = blkdev_iopoll, .iopoll = iocb_bio_iopoll,
.mmap = generic_file_mmap, .mmap = generic_file_mmap,
.fsync = blkdev_fsync, .fsync = blkdev_fsync,
.unlocked_ioctl = block_ioctl, .unlocked_ioctl = blkdev_ioctl,
#ifdef CONFIG_COMPAT #ifdef CONFIG_COMPAT
.compat_ioctl = compat_blkdev_ioctl, .compat_ioctl = compat_blkdev_ioctl,
#endif #endif


@ -25,8 +25,10 @@
#include <linux/log2.h> #include <linux/log2.h>
#include <linux/pm_runtime.h> #include <linux/pm_runtime.h>
#include <linux/badblocks.h> #include <linux/badblocks.h>
#include <linux/part_stat.h>
#include "blk.h" #include "blk.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h" #include "blk-rq-qos.h"
static struct kobject *block_depr; static struct kobject *block_depr;
@ -58,6 +60,7 @@ void set_capacity(struct gendisk *disk, sector_t sectors)
spin_lock(&bdev->bd_size_lock); spin_lock(&bdev->bd_size_lock);
i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT);
bdev->bd_nr_sectors = sectors;
spin_unlock(&bdev->bd_size_lock); spin_unlock(&bdev->bd_size_lock);
} }
EXPORT_SYMBOL(set_capacity); EXPORT_SYMBOL(set_capacity);
@ -212,7 +215,10 @@ void blkdev_show(struct seq_file *seqf, off_t offset)
* @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If * @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If
* @major = 0, try to allocate any unused major number. * @major = 0, try to allocate any unused major number.
* @name: the name of the new block device as a zero terminated string * @name: the name of the new block device as a zero terminated string
* @probe: callback that is called on access to any minor number of @major * @probe: pre-devtmpfs / pre-udev callback used to create disks when their
* pre-created device node is accessed. When a probe call uses
* add_disk() and it fails, the driver must clean up resources. This
* interface may soon be removed.
* *
* The @name must be unique within the system. * The @name must be unique within the system.
* *
@ -368,17 +374,21 @@ void disk_uevent(struct gendisk *disk, enum kobject_action action)
} }
EXPORT_SYMBOL_GPL(disk_uevent); EXPORT_SYMBOL_GPL(disk_uevent);
static void disk_scan_partitions(struct gendisk *disk) int disk_scan_partitions(struct gendisk *disk, fmode_t mode)
{ {
struct block_device *bdev; struct block_device *bdev;
if (!get_capacity(disk) || !disk_part_scan_enabled(disk)) if (disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN))
return; return -EINVAL;
if (disk->open_partitions)
return -EBUSY;
set_bit(GD_NEED_PART_SCAN, &disk->state); set_bit(GD_NEED_PART_SCAN, &disk->state);
bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL); bdev = blkdev_get_by_dev(disk_devt(disk), mode, NULL);
if (!IS_ERR(bdev)) if (IS_ERR(bdev))
blkdev_put(bdev, FMODE_READ); return PTR_ERR(bdev);
blkdev_put(bdev, mode);
return 0;
} }
/** /**
@ -390,8 +400,8 @@ static void disk_scan_partitions(struct gendisk *disk)
* This function registers the partitioning information in @disk * This function registers the partitioning information in @disk
* with the kernel. * with the kernel.
*/ */
int device_add_disk(struct device *parent, struct gendisk *disk, int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
const struct attribute_group **groups) const struct attribute_group **groups)
{ {
struct device *ddev = disk_to_dev(disk); struct device *ddev = disk_to_dev(disk);
@ -432,7 +442,6 @@ int device_add_disk(struct device *parent, struct gendisk *disk,
return ret; return ret;
disk->major = BLOCK_EXT_MAJOR; disk->major = BLOCK_EXT_MAJOR;
disk->first_minor = ret; disk->first_minor = ret;
disk->flags |= GENHD_FL_EXT_DEVT;
} }
/* delay uevents, until we scanned partition table */ /* delay uevents, until we scanned partition table */
@ -489,14 +498,7 @@ int device_add_disk(struct device *parent, struct gendisk *disk,
if (ret) if (ret)
goto out_put_slave_dir; goto out_put_slave_dir;
if (disk->flags & GENHD_FL_HIDDEN) { if (!(disk->flags & GENHD_FL_HIDDEN)) {
/*
* Don't let hidden disks show up in /proc/partitions,
* and don't bother scanning for partitions either.
*/
disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
disk->flags |= GENHD_FL_NO_PART_SCAN;
} else {
ret = bdi_register(disk->bdi, "%u:%u", ret = bdi_register(disk->bdi, "%u:%u",
disk->major, disk->first_minor); disk->major, disk->first_minor);
if (ret) if (ret)
@ -508,7 +510,8 @@ int device_add_disk(struct device *parent, struct gendisk *disk,
goto out_unregister_bdi; goto out_unregister_bdi;
bdev_add(disk->part0, ddev->devt); bdev_add(disk->part0, ddev->devt);
disk_scan_partitions(disk); if (get_capacity(disk))
disk_scan_partitions(disk, FMODE_READ);
/* /*
* Announce the disk and partitions after all partitions are * Announce the disk and partitions after all partitions are
@ -541,7 +544,7 @@ int device_add_disk(struct device *parent, struct gendisk *disk,
out_free_ext_minor: out_free_ext_minor:
if (disk->major == BLOCK_EXT_MAJOR) if (disk->major == BLOCK_EXT_MAJOR)
blk_free_ext_minor(disk->first_minor); blk_free_ext_minor(disk->first_minor);
return WARN_ON_ONCE(ret); /* keep until all callers handle errors */ return ret;
} }
EXPORT_SYMBOL(device_add_disk); EXPORT_SYMBOL(device_add_disk);
@ -645,6 +648,26 @@ void del_gendisk(struct gendisk *disk)
} }
EXPORT_SYMBOL(del_gendisk); EXPORT_SYMBOL(del_gendisk);
/**
* invalidate_disk - invalidate the disk
* @disk: the struct gendisk to invalidate
*
* A helper to invalidate the disk. It will clean the disk's associated
* buffer/page caches and reset its internal state so that the disk
* can be reused by the drivers.
*
* Context: can sleep
*/
void invalidate_disk(struct gendisk *disk)
{
struct block_device *bdev = disk->part0;
invalidate_bdev(bdev);
bdev->bd_inode->i_mapping->wb_err = 0;
set_capacity(disk, 0);
}
EXPORT_SYMBOL(invalidate_disk);
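A hedged usage sketch (hypothetical driver code, not part of this patch): a removable-media driver would call the new helper from its media-removal path so that the same gendisk can be reused when media comes back.

static void example_media_gone(struct gendisk *disk)
{
	/* drop cached pages, clear stale writeback errors, zero the capacity */
	invalidate_disk(disk);

	/* driver-private state reset would follow here */
}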
/* sysfs access to bad-blocks list. */ /* sysfs access to bad-blocks list. */
static ssize_t disk_badblocks_show(struct device *dev, static ssize_t disk_badblocks_show(struct device *dev,
struct device_attribute *attr, struct device_attribute *attr,
@ -711,8 +734,7 @@ void __init printk_all_partitions(void)
* Don't show empty devices or things that have been * Don't show empty devices or things that have been
* suppressed * suppressed
*/ */
if (get_capacity(disk) == 0 || if (get_capacity(disk) == 0 || (disk->flags & GENHD_FL_HIDDEN))
(disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO))
continue; continue;
/* /*
@ -805,11 +827,7 @@ static int show_partition(struct seq_file *seqf, void *v)
struct block_device *part; struct block_device *part;
unsigned long idx; unsigned long idx;
/* Don't show non-partitionable removable devices or empty devices */ if (!get_capacity(sgp) || (sgp->flags & GENHD_FL_HIDDEN))
if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
(sgp->flags & GENHD_FL_REMOVABLE)))
return 0;
if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
return 0; return 0;
rcu_read_lock(); rcu_read_lock();
@ -865,7 +883,8 @@ static ssize_t disk_ext_range_show(struct device *dev,
{ {
struct gendisk *disk = dev_to_disk(dev); struct gendisk *disk = dev_to_disk(dev);
return sprintf(buf, "%d\n", disk_max_parts(disk)); return sprintf(buf, "%d\n",
(disk->flags & GENHD_FL_NO_PART) ? 1 : DISK_MAX_PARTS);
} }
static ssize_t disk_removable_show(struct device *dev, static ssize_t disk_removable_show(struct device *dev,
@ -904,7 +923,7 @@ ssize_t part_stat_show(struct device *dev,
struct device_attribute *attr, char *buf) struct device_attribute *attr, char *buf)
{ {
struct block_device *bdev = dev_to_bdev(dev); struct block_device *bdev = dev_to_bdev(dev);
struct request_queue *q = bdev->bd_disk->queue; struct request_queue *q = bdev_get_queue(bdev);
struct disk_stats stat; struct disk_stats stat;
unsigned int inflight; unsigned int inflight;
@ -948,7 +967,7 @@ ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
char *buf) char *buf)
{ {
struct block_device *bdev = dev_to_bdev(dev); struct block_device *bdev = dev_to_bdev(dev);
struct request_queue *q = bdev->bd_disk->queue; struct request_queue *q = bdev_get_queue(bdev);
unsigned int inflight[2]; unsigned int inflight[2];
if (queue_is_mq(q)) if (queue_is_mq(q))
@ -1290,6 +1309,9 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
if (!disk->bdi) if (!disk->bdi)
goto out_free_disk; goto out_free_disk;
/* bdev_alloc() might need the queue, set before the first call */
disk->queue = q;
disk->part0 = bdev_alloc(disk, 0); disk->part0 = bdev_alloc(disk, 0);
if (!disk->part0) if (!disk->part0)
goto out_free_bdi; goto out_free_bdi;
@ -1305,7 +1327,6 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
disk_to_dev(disk)->type = &disk_type; disk_to_dev(disk)->type = &disk_type;
device_initialize(disk_to_dev(disk)); device_initialize(disk_to_dev(disk));
inc_diskseq(disk); inc_diskseq(disk);
disk->queue = q;
q->disk = disk; q->disk = disk;
lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0); lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0);
#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
@ -1332,7 +1353,7 @@ struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass)
struct request_queue *q; struct request_queue *q;
struct gendisk *disk; struct gendisk *disk;
q = blk_alloc_queue(node); q = blk_alloc_queue(node, false);
if (!q) if (!q)
return NULL; return NULL;
@ -1410,12 +1431,6 @@ void set_disk_ro(struct gendisk *disk, bool read_only)
} }
EXPORT_SYMBOL(set_disk_ro); EXPORT_SYMBOL(set_disk_ro);
int bdev_read_only(struct block_device *bdev)
{
return bdev->bd_read_only || get_disk_ro(bdev->bd_disk);
}
EXPORT_SYMBOL(bdev_read_only);
void inc_diskseq(struct gendisk *disk) void inc_diskseq(struct gendisk *disk)
{ {
disk->diskseq = atomic64_inc_return(&diskseq); disk->diskseq = atomic64_inc_return(&diskseq);


@ -82,31 +82,6 @@ static int compat_blkpg_ioctl(struct block_device *bdev,
} }
#endif #endif
static int blkdev_reread_part(struct block_device *bdev, fmode_t mode)
{
struct block_device *tmp;
if (!disk_part_scan_enabled(bdev->bd_disk) || bdev_is_partition(bdev))
return -EINVAL;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
if (bdev->bd_disk->open_partitions)
return -EBUSY;
/*
* Reopen the device to revalidate the driver state and force a
* partition rescan.
*/
mode &= ~FMODE_EXCL;
set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
tmp = blkdev_get_by_dev(bdev->bd_dev, mode, NULL);
if (IS_ERR(tmp))
return PTR_ERR(tmp);
blkdev_put(tmp, mode);
return 0;
}
static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode,
unsigned long arg, unsigned long flags) unsigned long arg, unsigned long flags)
{ {
@ -133,7 +108,7 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode,
if (len & 511) if (len & 511)
return -EINVAL; return -EINVAL;
if (start + len > i_size_read(bdev->bd_inode)) if (start + len > bdev_nr_bytes(bdev))
return -EINVAL; return -EINVAL;
filemap_invalidate_lock(inode->i_mapping); filemap_invalidate_lock(inode->i_mapping);
@ -171,7 +146,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode,
return -EINVAL; return -EINVAL;
if (len & 511) if (len & 511)
return -EINVAL; return -EINVAL;
if (end >= (uint64_t)i_size_read(bdev->bd_inode)) if (end >= (uint64_t)bdev_nr_bytes(bdev))
return -EINVAL; return -EINVAL;
if (end < start) if (end < start)
return -EINVAL; return -EINVAL;
@ -522,7 +497,11 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
bdev->bd_disk->bdi->ra_pages = (arg * 512) / PAGE_SIZE; bdev->bd_disk->bdi->ra_pages = (arg * 512) / PAGE_SIZE;
return 0; return 0;
case BLKRRPART: case BLKRRPART:
return blkdev_reread_part(bdev, mode); if (!capable(CAP_SYS_ADMIN))
return -EACCES;
if (bdev_is_partition(bdev))
return -EINVAL;
return disk_scan_partitions(bdev->bd_disk, mode & ~FMODE_EXCL);
case BLKTRACESTART: case BLKTRACESTART:
case BLKTRACESTOP: case BLKTRACESTOP:
case BLKTRACETEARDOWN: case BLKTRACETEARDOWN:
@ -550,12 +529,21 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
* *
* New commands must be compatible and go into blkdev_common_ioctl * New commands must be compatible and go into blkdev_common_ioctl
*/ */
int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
unsigned long arg)
{ {
int ret; struct block_device *bdev = I_BDEV(file->f_mapping->host);
loff_t size;
void __user *argp = (void __user *)arg; void __user *argp = (void __user *)arg;
fmode_t mode = file->f_mode;
int ret;
/*
* O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
* to update it before every ioctl.
*/
if (file->f_flags & O_NDELAY)
mode |= FMODE_NDELAY;
else
mode &= ~FMODE_NDELAY;
switch (cmd) { switch (cmd) {
/* These need separate implementations for the data structure */ /* These need separate implementations for the data structure */
@ -572,10 +560,9 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
return put_long(argp, return put_long(argp,
(bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512); (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512);
case BLKGETSIZE: case BLKGETSIZE:
size = i_size_read(bdev->bd_inode); if (bdev_nr_sectors(bdev) > ~0UL)
if ((size >> 9) > ~0UL)
return -EFBIG; return -EFBIG;
return put_ulong(argp, size >> 9); return put_ulong(argp, bdev_nr_sectors(bdev));
/* The data is compatible, but the command number is different */ /* The data is compatible, but the command number is different */
case BLKBSZGET: /* get block device soft block size (cf. BLKSSZGET) */ case BLKBSZGET: /* get block device soft block size (cf. BLKSSZGET) */
@ -583,7 +570,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
case BLKBSZSET: case BLKBSZSET:
return blkdev_bszset(bdev, mode, argp); return blkdev_bszset(bdev, mode, argp);
case BLKGETSIZE64: case BLKGETSIZE64:
return put_u64(argp, i_size_read(bdev->bd_inode)); return put_u64(argp, bdev_nr_bytes(bdev));
/* Incompatible alignment on i386 */ /* Incompatible alignment on i386 */
case BLKTRACESETUP: case BLKTRACESETUP:
@ -600,7 +587,6 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
return -ENOTTY; return -ENOTTY;
return bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg); return bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg);
} }
EXPORT_SYMBOL_GPL(blkdev_ioctl); /* for /dev/raw */
#ifdef CONFIG_COMPAT #ifdef CONFIG_COMPAT
@ -618,7 +604,6 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
struct block_device *bdev = I_BDEV(file->f_mapping->host); struct block_device *bdev = I_BDEV(file->f_mapping->host);
struct gendisk *disk = bdev->bd_disk; struct gendisk *disk = bdev->bd_disk;
fmode_t mode = file->f_mode; fmode_t mode = file->f_mode;
loff_t size;
/* /*
* O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
@ -644,10 +629,9 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
return compat_put_long(argp, return compat_put_long(argp,
(bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512); (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512);
case BLKGETSIZE: case BLKGETSIZE:
size = i_size_read(bdev->bd_inode); if (bdev_nr_sectors(bdev) > ~0UL)
if ((size >> 9) > ~0UL)
return -EFBIG; return -EFBIG;
return compat_put_ulong(argp, size >> 9); return compat_put_ulong(argp, bdev_nr_sectors(bdev));
/* The data is compatible, but the command number is different */ /* The data is compatible, but the command number is different */
case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */ case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */
@ -655,7 +639,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
case BLKBSZSET_32: case BLKBSZSET_32:
return blkdev_bszset(bdev, mode, argp); return blkdev_bszset(bdev, mode, argp);
case BLKGETSIZE64_32: case BLKGETSIZE64_32:
return put_u64(argp, i_size_read(bdev->bd_inode)); return put_u64(argp, bdev_nr_bytes(bdev));
/* Incompatible alignment on i386 */ /* Incompatible alignment on i386 */
case BLKTRACESETUP32: case BLKTRACESETUP32:
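The ioctl hunks above (blkdev_ioctl() and the compat path in block/ioctl.c) replace i_size_read(bdev->bd_inode) with the bdev_nr_bytes()/bdev_nr_sectors() helpers, open-code the BLKRRPART rescan behind a CAP_SYS_ADMIN check, and switch blkdev_ioctl() to take the struct file so O_NDELAY can be folded into the mode. From userspace the commands behave the same; the following is a minimal sketch of exercising them, with /dev/sda as a purely illustrative device path:

#include <errno.h>
#include <fcntl.h>
#include <linux/fs.h>           /* BLKGETSIZE, BLKGETSIZE64, BLKRRPART */
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
        const char *dev = "/dev/sda";   /* illustrative; pick a real whole disk */
        unsigned long long bytes = 0;
        unsigned long sectors = 0;
        int fd = open(dev, O_RDONLY);

        if (fd < 0) {
                perror(dev);
                return 1;
        }
        if (ioctl(fd, BLKGETSIZE64, &bytes) == 0)       /* size in bytes */
                printf("%s: %llu bytes\n", dev, bytes);
        if (ioctl(fd, BLKGETSIZE, &sectors) == 0)       /* size in 512-byte sectors;
                                                         * EFBIG if it does not fit in an unsigned long */
                printf("%s: %lu sectors\n", dev, sectors);
        /* Partition rescan: with the reworked code this needs CAP_SYS_ADMIN and
         * returns EINVAL when issued on a partition instead of the whole disk. */
        if (ioctl(fd, BLKRRPART) != 0)
                fprintf(stderr, "BLKRRPART: %s\n", strerror(errno));
        close(fd);
        return 0;
}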

View File

@ -22,46 +22,14 @@
*/ */
#include <linux/gfp.h> #include <linux/gfp.h>
#include <linux/kernel.h> #include <linux/kernel.h>
#include <linux/export.h>
#include <linux/ioprio.h> #include <linux/ioprio.h>
#include <linux/cred.h> #include <linux/cred.h>
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/capability.h> #include <linux/capability.h>
#include <linux/sched/user.h>
#include <linux/sched/task.h>
#include <linux/syscalls.h> #include <linux/syscalls.h>
#include <linux/security.h> #include <linux/security.h>
#include <linux/pid_namespace.h> #include <linux/pid_namespace.h>
int set_task_ioprio(struct task_struct *task, int ioprio)
{
int err;
struct io_context *ioc;
const struct cred *cred = current_cred(), *tcred;
rcu_read_lock();
tcred = __task_cred(task);
if (!uid_eq(tcred->uid, cred->euid) &&
!uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) {
rcu_read_unlock();
return -EPERM;
}
rcu_read_unlock();
err = security_task_setioprio(task, ioprio);
if (err)
return err;
ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
if (ioc) {
ioc->ioprio = ioprio;
put_io_context(ioc);
}
return err;
}
EXPORT_SYMBOL_GPL(set_task_ioprio);
int ioprio_check_cap(int ioprio) int ioprio_check_cap(int ioprio)
{ {
int class = IOPRIO_PRIO_CLASS(ioprio); int class = IOPRIO_PRIO_CLASS(ioprio);
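The hunk above drops the open-coded set_task_ioprio() from block/ioprio.c together with the includes it needed; what stays in this file is ioprio_check_cap(), which begins by extracting the class from the upper bits of the priority value. A hedged userspace sketch of building such a value and applying it through the ioprio_set() syscall follows; the IOPRIO_* constants are redefined locally for illustration and are assumed to match the uapi encoding:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Local copies for illustration; assumed to match <linux/ioprio.h>. */
#define IOPRIO_CLASS_SHIFT      13
#define IOPRIO_PRIO_VALUE(class, data)  (((class) << IOPRIO_CLASS_SHIFT) | (data))
#define IOPRIO_CLASS_BE         2       /* best-effort: no special capability needed */
#define IOPRIO_WHO_PROCESS      1

int main(void)
{
        int ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);

        /* who == 0 means the calling process. */
        if (syscall(SYS_ioprio_set, IOPRIO_WHO_PROCESS, 0, ioprio) != 0) {
                perror("ioprio_set");
                return 1;
        }
        printf("class %d, level %d (raw %#x)\n",
               ioprio >> IOPRIO_CLASS_SHIFT,
               ioprio & ((1 << IOPRIO_CLASS_SHIFT) - 1), ioprio);
        return 0;
}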

View File

@ -9,12 +9,12 @@
#include <linux/kernel.h> #include <linux/kernel.h>
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/blk-mq.h> #include <linux/blk-mq.h>
#include <linux/elevator.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/sbitmap.h> #include <linux/sbitmap.h>
#include <trace/events/block.h> #include <trace/events/block.h>
#include "elevator.h"
#include "blk.h" #include "blk.h"
#include "blk-mq.h" #include "blk-mq.h"
#include "blk-mq-debugfs.h" #include "blk-mq-debugfs.h"
@ -433,6 +433,7 @@ static void kyber_exit_sched(struct elevator_queue *e)
int i; int i;
del_timer_sync(&kqd->timer); del_timer_sync(&kqd->timer);
blk_stat_disable_accounting(kqd->q);
for (i = 0; i < KYBER_NUM_DOMAINS; i++) for (i = 0; i < KYBER_NUM_DOMAINS; i++)
sbitmap_queue_free(&kqd->domain_tokens[i]); sbitmap_queue_free(&kqd->domain_tokens[i]);
@ -453,11 +454,11 @@ static void kyber_depth_updated(struct blk_mq_hw_ctx *hctx)
{ {
struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data; struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
struct blk_mq_tags *tags = hctx->sched_tags; struct blk_mq_tags *tags = hctx->sched_tags;
unsigned int shift = tags->bitmap_tags->sb.shift; unsigned int shift = tags->bitmap_tags.sb.shift;
kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U; kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
sbitmap_queue_min_shallow_depth(tags->bitmap_tags, kqd->async_depth); sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, kqd->async_depth);
} }
static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
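The Kyber hunks switch from <linux/elevator.h> to the local "elevator.h", disable block stats accounting on teardown, and follow the sched_tags layout change (bitmap_tags is now embedded rather than a pointer). The async depth formula itself is untouched; as a small standalone illustration of what it computes, assuming the in-tree KYBER_ASYNC_PERCENT of 75 (an assumption worth checking against kyber-iosched.c):

#include <stdio.h>

#define KYBER_ASYNC_PERCENT 75U         /* assumed default; see kyber-iosched.c */

int main(void)
{
        /* shift is sbitmap's per-word shift; the shallow depth reserved for
         * async I/O ends up as 75% of one word's worth of tags. */
        for (unsigned int shift = 4; shift <= 6; shift++)
                printf("shift %u -> async_depth %u of %u\n", shift,
                       (1U << shift) * KYBER_ASYNC_PERCENT / 100U, 1U << shift);
        return 0;
}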

View File

@ -9,7 +9,6 @@
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/blkdev.h> #include <linux/blkdev.h>
#include <linux/blk-mq.h> #include <linux/blk-mq.h>
#include <linux/elevator.h>
#include <linux/bio.h> #include <linux/bio.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/slab.h> #include <linux/slab.h>
@ -20,6 +19,7 @@
#include <trace/events/block.h> #include <trace/events/block.h>
#include "elevator.h"
#include "blk.h" #include "blk.h"
#include "blk-mq.h" #include "blk-mq.h"
#include "blk-mq-debugfs.h" #include "blk-mq-debugfs.h"
@ -31,6 +31,11 @@
*/ */
static const int read_expire = HZ / 2; /* max time before a read is submitted. */ static const int read_expire = HZ / 2; /* max time before a read is submitted. */
static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */ static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
/*
* Time after which to dispatch lower priority requests even if higher
* priority requests are pending.
*/
static const int prio_aging_expire = 10 * HZ;
static const int writes_starved = 2; /* max times reads can starve a write */ static const int writes_starved = 2; /* max times reads can starve a write */
static const int fifo_batch = 16; /* # of sequential requests treated as one static const int fifo_batch = 16; /* # of sequential requests treated as one
by the above parameters. For throughput. */ by the above parameters. For throughput. */
@ -51,17 +56,16 @@ enum dd_prio {
enum { DD_PRIO_COUNT = 3 }; enum { DD_PRIO_COUNT = 3 };
/* I/O statistics per I/O priority. */ /*
* I/O statistics per I/O priority. It is fine if these counters overflow.
* What matters is that these counters are at least as wide as
* log2(max_outstanding_requests).
*/
struct io_stats_per_prio { struct io_stats_per_prio {
local_t inserted; uint32_t inserted;
local_t merged; uint32_t merged;
local_t dispatched; uint32_t dispatched;
local_t completed; atomic_t completed;
};
/* I/O statistics for all I/O priorities (enum dd_prio). */
struct io_stats {
struct io_stats_per_prio stats[DD_PRIO_COUNT];
}; };
/* /*
@ -74,6 +78,7 @@ struct dd_per_prio {
struct list_head fifo_list[DD_DIR_COUNT]; struct list_head fifo_list[DD_DIR_COUNT];
/* Next request in FIFO order. Read, write or both are NULL. */ /* Next request in FIFO order. Read, write or both are NULL. */
struct request *next_rq[DD_DIR_COUNT]; struct request *next_rq[DD_DIR_COUNT];
struct io_stats_per_prio stats;
}; };
struct deadline_data { struct deadline_data {
@ -88,8 +93,6 @@ struct deadline_data {
unsigned int batching; /* number of sequential requests made */ unsigned int batching; /* number of sequential requests made */
unsigned int starved; /* times reads have starved writes */ unsigned int starved; /* times reads have starved writes */
struct io_stats __percpu *stats;
/* /*
* settings that change how the i/o scheduler behaves * settings that change how the i/o scheduler behaves
*/ */
@ -98,38 +101,12 @@ struct deadline_data {
int writes_starved; int writes_starved;
int front_merges; int front_merges;
u32 async_depth; u32 async_depth;
int prio_aging_expire;
spinlock_t lock; spinlock_t lock;
spinlock_t zone_lock; spinlock_t zone_lock;
}; };
/* Count one event of type 'event_type' and with I/O priority 'prio' */
#define dd_count(dd, event_type, prio) do { \
struct io_stats *io_stats = get_cpu_ptr((dd)->stats); \
\
BUILD_BUG_ON(!__same_type((dd), struct deadline_data *)); \
BUILD_BUG_ON(!__same_type((prio), enum dd_prio)); \
local_inc(&io_stats->stats[(prio)].event_type); \
put_cpu_ptr(io_stats); \
} while (0)
/*
* Returns the total number of dd_count(dd, event_type, prio) calls across all
* CPUs. No locking or barriers since it is fine if the returned sum is slightly
* outdated.
*/
#define dd_sum(dd, event_type, prio) ({ \
unsigned int cpu; \
u32 sum = 0; \
\
BUILD_BUG_ON(!__same_type((dd), struct deadline_data *)); \
BUILD_BUG_ON(!__same_type((prio), enum dd_prio)); \
for_each_present_cpu(cpu) \
sum += local_read(&per_cpu_ptr((dd)->stats, cpu)-> \
stats[(prio)].event_type); \
sum; \
})
/* Maps an I/O priority class to a deadline scheduler priority. */ /* Maps an I/O priority class to a deadline scheduler priority. */
static const enum dd_prio ioprio_class_to_prio[] = { static const enum dd_prio ioprio_class_to_prio[] = {
[IOPRIO_CLASS_NONE] = DD_BE_PRIO, [IOPRIO_CLASS_NONE] = DD_BE_PRIO,
@ -233,7 +210,9 @@ static void dd_merged_requests(struct request_queue *q, struct request *req,
const u8 ioprio_class = dd_rq_ioclass(next); const u8 ioprio_class = dd_rq_ioclass(next);
const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
dd_count(dd, merged, prio); lockdep_assert_held(&dd->lock);
dd->per_prio[prio].stats.merged++;
/* /*
* if next expires before rq, assign its expire time to rq * if next expires before rq, assign its expire time to rq
@ -270,6 +249,16 @@ deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
deadline_remove_request(rq->q, per_prio, rq); deadline_remove_request(rq->q, per_prio, rq);
} }
/* Number of requests queued for a given priority level. */
static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio)
{
const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats;
lockdep_assert_held(&dd->lock);
return stats->inserted - atomic_read(&stats->completed);
}
/* /*
* deadline_check_fifo returns 0 if there are no expired requests on the fifo, * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
* 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir]) * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
@ -355,12 +344,27 @@ deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
return rq; return rq;
} }
/*
* Returns true if and only if @rq started after @latest_start where
* @latest_start is in jiffies.
*/
static bool started_after(struct deadline_data *dd, struct request *rq,
unsigned long latest_start)
{
unsigned long start_time = (unsigned long)rq->fifo_time;
start_time -= dd->fifo_expire[rq_data_dir(rq)];
return time_after(start_time, latest_start);
}
/* /*
* deadline_dispatch_requests selects the best request according to * deadline_dispatch_requests selects the best request according to
* read/write expire, fifo_batch, etc * read/write expire, fifo_batch, etc and with a start time <= @latest_start.
*/ */
static struct request *__dd_dispatch_request(struct deadline_data *dd, static struct request *__dd_dispatch_request(struct deadline_data *dd,
struct dd_per_prio *per_prio) struct dd_per_prio *per_prio,
unsigned long latest_start)
{ {
struct request *rq, *next_rq; struct request *rq, *next_rq;
enum dd_data_dir data_dir; enum dd_data_dir data_dir;
@ -372,6 +376,8 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
if (!list_empty(&per_prio->dispatch)) { if (!list_empty(&per_prio->dispatch)) {
rq = list_first_entry(&per_prio->dispatch, struct request, rq = list_first_entry(&per_prio->dispatch, struct request,
queuelist); queuelist);
if (started_after(dd, rq, latest_start))
return NULL;
list_del_init(&rq->queuelist); list_del_init(&rq->queuelist);
goto done; goto done;
} }
@ -449,6 +455,9 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
dd->batching = 0; dd->batching = 0;
dispatch_request: dispatch_request:
if (started_after(dd, rq, latest_start))
return NULL;
/* /*
* rq is the selected appropriate request. * rq is the selected appropriate request.
*/ */
@ -457,7 +466,7 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
done: done:
ioprio_class = dd_rq_ioclass(rq); ioprio_class = dd_rq_ioclass(rq);
prio = ioprio_class_to_prio[ioprio_class]; prio = ioprio_class_to_prio[ioprio_class];
dd_count(dd, dispatched, prio); dd->per_prio[prio].stats.dispatched++;
/* /*
* If the request needs its target zone locked, do it. * If the request needs its target zone locked, do it.
*/ */
@ -466,6 +475,34 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
return rq; return rq;
} }
/*
* Check whether there are any requests with priority other than DD_RT_PRIO
* that were inserted more than prio_aging_expire jiffies ago.
*/
static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd,
unsigned long now)
{
struct request *rq;
enum dd_prio prio;
int prio_cnt;
lockdep_assert_held(&dd->lock);
prio_cnt = !!dd_queued(dd, DD_RT_PRIO) + !!dd_queued(dd, DD_BE_PRIO) +
!!dd_queued(dd, DD_IDLE_PRIO);
if (prio_cnt < 2)
return NULL;
for (prio = DD_BE_PRIO; prio <= DD_PRIO_MAX; prio++) {
rq = __dd_dispatch_request(dd, &dd->per_prio[prio],
now - dd->prio_aging_expire);
if (rq)
return rq;
}
return NULL;
}
/* /*
* Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests(). * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests().
* *
@ -477,15 +514,26 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
{ {
struct deadline_data *dd = hctx->queue->elevator->elevator_data; struct deadline_data *dd = hctx->queue->elevator->elevator_data;
const unsigned long now = jiffies;
struct request *rq; struct request *rq;
enum dd_prio prio; enum dd_prio prio;
spin_lock(&dd->lock); spin_lock(&dd->lock);
rq = dd_dispatch_prio_aged_requests(dd, now);
if (rq)
goto unlock;
/*
* Next, dispatch requests in priority order. Ignore lower priority
* requests if any higher priority requests are pending.
*/
for (prio = 0; prio <= DD_PRIO_MAX; prio++) { for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
rq = __dd_dispatch_request(dd, &dd->per_prio[prio]); rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now);
if (rq) if (rq || dd_queued(dd, prio))
break; break;
} }
unlock:
spin_unlock(&dd->lock); spin_unlock(&dd->lock);
return rq; return rq;
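With the aging logic added above, dd_dispatch_request() first lets dd_dispatch_prio_aged_requests() hand out best-effort or idle requests that have waited longer than prio_aging_expire, and only when at least two priority levels have work queued; it then falls back to strict priority order, stopping at the first level with anything pending. A toy userspace model of that decision, with jiffies replaced by plain integers and all names illustrative rather than kernel API:

#include <stdbool.h>
#include <stdio.h>

enum { RT, BE, IDLE, PRIO_COUNT };

struct prio_state {
        unsigned int queued;    /* inserted - completed */
        long oldest_start;      /* start time of the oldest queued request */
};

/* Mirrors the shape of the decision, not the kernel implementation. */
static bool aging_applies(const struct prio_state s[PRIO_COUNT],
                          long now, long prio_aging_expire)
{
        int levels_with_work = 0;
        int p;

        for (p = RT; p < PRIO_COUNT; p++)
                levels_with_work += s[p].queued != 0;
        if (levels_with_work < 2)       /* nothing can be starved */
                return false;
        /* Only BE and idle are aged; RT is always served first anyway. */
        for (p = BE; p < PRIO_COUNT; p++)
                if (s[p].queued && now - s[p].oldest_start > prio_aging_expire)
                        return true;
        return false;
}

int main(void)
{
        struct prio_state s[PRIO_COUNT] = {
                [RT]   = { .queued = 8, .oldest_start = 90 },
                [IDLE] = { .queued = 1, .oldest_start = 0  },
        };

        printf("dispatch aged low-priority request first: %s\n",
               aging_applies(s, 100, 50) ? "yes" : "no (strict priority)");
        return 0;
}

In the kernel the comparison runs the other way round: started_after() reconstructs the request's start time as fifo_time minus the per-direction fifo_expire and checks it against now - prio_aging_expire with time_after(), which keeps the arithmetic safe across jiffies wraparound.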
@ -519,7 +567,7 @@ static void dd_depth_updated(struct blk_mq_hw_ctx *hctx)
dd->async_depth = max(1UL, 3 * q->nr_requests / 4); dd->async_depth = max(1UL, 3 * q->nr_requests / 4);
sbitmap_queue_min_shallow_depth(tags->bitmap_tags, dd->async_depth); sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, dd->async_depth);
} }
/* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */ /* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */
@ -536,12 +584,21 @@ static void dd_exit_sched(struct elevator_queue *e)
for (prio = 0; prio <= DD_PRIO_MAX; prio++) { for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
struct dd_per_prio *per_prio = &dd->per_prio[prio]; struct dd_per_prio *per_prio = &dd->per_prio[prio];
const struct io_stats_per_prio *stats = &per_prio->stats;
uint32_t queued;
WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_READ])); WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_READ]));
WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE])); WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE]));
}
free_percpu(dd->stats); spin_lock(&dd->lock);
queued = dd_queued(dd, prio);
spin_unlock(&dd->lock);
WARN_ONCE(queued != 0,
"statistics for priority %d: i %u m %u d %u c %u\n",
prio, stats->inserted, stats->merged,
stats->dispatched, atomic_read(&stats->completed));
}
kfree(dd); kfree(dd);
} }
@ -566,11 +623,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
eq->elevator_data = dd; eq->elevator_data = dd;
dd->stats = alloc_percpu_gfp(typeof(*dd->stats),
GFP_KERNEL | __GFP_ZERO);
if (!dd->stats)
goto free_dd;
for (prio = 0; prio <= DD_PRIO_MAX; prio++) { for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
struct dd_per_prio *per_prio = &dd->per_prio[prio]; struct dd_per_prio *per_prio = &dd->per_prio[prio];
@ -586,15 +638,13 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
dd->front_merges = 1; dd->front_merges = 1;
dd->last_dir = DD_WRITE; dd->last_dir = DD_WRITE;
dd->fifo_batch = fifo_batch; dd->fifo_batch = fifo_batch;
dd->prio_aging_expire = prio_aging_expire;
spin_lock_init(&dd->lock); spin_lock_init(&dd->lock);
spin_lock_init(&dd->zone_lock); spin_lock_init(&dd->zone_lock);
q->elevator = eq; q->elevator = eq;
return 0; return 0;
free_dd:
kfree(dd);
put_eq: put_eq:
kobject_put(&eq->kobj); kobject_put(&eq->kobj);
return ret; return ret;
@ -677,8 +727,11 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
blk_req_zone_write_unlock(rq); blk_req_zone_write_unlock(rq);
prio = ioprio_class_to_prio[ioprio_class]; prio = ioprio_class_to_prio[ioprio_class];
dd_count(dd, inserted, prio); per_prio = &dd->per_prio[prio];
rq->elv.priv[0] = (void *)(uintptr_t)1; if (!rq->elv.priv[0]) {
per_prio->stats.inserted++;
rq->elv.priv[0] = (void *)(uintptr_t)1;
}
if (blk_mq_sched_try_insert_merge(q, rq, &free)) { if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
blk_mq_free_requests(&free); blk_mq_free_requests(&free);
@ -687,7 +740,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
trace_block_rq_insert(rq); trace_block_rq_insert(rq);
per_prio = &dd->per_prio[prio];
if (at_head) { if (at_head) {
list_add(&rq->queuelist, &per_prio->dispatch); list_add(&rq->queuelist, &per_prio->dispatch);
} else { } else {
@ -759,12 +811,13 @@ static void dd_finish_request(struct request *rq)
/* /*
* The block layer core may call dd_finish_request() without having * The block layer core may call dd_finish_request() without having
* called dd_insert_requests(). Hence only update statistics for * called dd_insert_requests(). Skip requests that bypassed I/O
* requests for which dd_insert_requests() has been called. See also * scheduling. See also blk_mq_request_bypass_insert().
* blk_mq_request_bypass_insert().
*/ */
if (rq->elv.priv[0]) if (!rq->elv.priv[0])
dd_count(dd, completed, prio); return;
atomic_inc(&per_prio->stats.completed);
if (blk_queue_is_zoned(q)) { if (blk_queue_is_zoned(q)) {
unsigned long flags; unsigned long flags;
@ -809,6 +862,7 @@ static ssize_t __FUNC(struct elevator_queue *e, char *page) \
#define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR)) #define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR))
SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]); SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]);
SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]); SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]);
SHOW_JIFFIES(deadline_prio_aging_expire_show, dd->prio_aging_expire);
SHOW_INT(deadline_writes_starved_show, dd->writes_starved); SHOW_INT(deadline_writes_starved_show, dd->writes_starved);
SHOW_INT(deadline_front_merges_show, dd->front_merges); SHOW_INT(deadline_front_merges_show, dd->front_merges);
SHOW_INT(deadline_async_depth_show, dd->async_depth); SHOW_INT(deadline_async_depth_show, dd->async_depth);
@ -838,6 +892,7 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)
STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies) STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies)
STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX); STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX);
STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX); STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX);
STORE_JIFFIES(deadline_prio_aging_expire_store, &dd->prio_aging_expire, 0, INT_MAX);
STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX); STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX);
STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1); STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1);
STORE_INT(deadline_async_depth_store, &dd->async_depth, 1, INT_MAX); STORE_INT(deadline_async_depth_store, &dd->async_depth, 1, INT_MAX);
@ -856,6 +911,7 @@ static struct elv_fs_entry deadline_attrs[] = {
DD_ATTR(front_merges), DD_ATTR(front_merges),
DD_ATTR(async_depth), DD_ATTR(async_depth),
DD_ATTR(fifo_batch), DD_ATTR(fifo_batch),
DD_ATTR(prio_aging_expire),
__ATTR_NULL __ATTR_NULL
}; };
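prio_aging_expire is also exported as a writable scheduler attribute; like the other expiry knobs it is stored in jiffies but shown and set in milliseconds (SHOW_JIFFIES/STORE_JIFFIES), so the 10 * HZ default reads back as 10000. A small sketch of reading it from userspace, assuming mq-deadline is the active scheduler on the disk and using an illustrative sysfs path:

#include <stdio.h>

int main(void)
{
        /* Illustrative path; the attribute sits in the disk's iosched
         * directory while mq-deadline is the active scheduler. */
        const char *path = "/sys/block/sda/queue/iosched/prio_aging_expire";
        FILE *f = fopen(path, "r");
        long msecs;

        if (!f) {
                perror(path);
                return 1;
        }
        if (fscanf(f, "%ld", &msecs) == 1)
                printf("prio_aging_expire = %ld ms\n", msecs);
        fclose(f);
        return 0;
}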
@ -947,38 +1003,48 @@ static int dd_async_depth_show(void *data, struct seq_file *m)
return 0; return 0;
} }
/* Number of requests queued for a given priority level. */
static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio)
{
return dd_sum(dd, inserted, prio) - dd_sum(dd, completed, prio);
}
static int dd_queued_show(void *data, struct seq_file *m) static int dd_queued_show(void *data, struct seq_file *m)
{ {
struct request_queue *q = data; struct request_queue *q = data;
struct deadline_data *dd = q->elevator->elevator_data; struct deadline_data *dd = q->elevator->elevator_data;
u32 rt, be, idle;
spin_lock(&dd->lock);
rt = dd_queued(dd, DD_RT_PRIO);
be = dd_queued(dd, DD_BE_PRIO);
idle = dd_queued(dd, DD_IDLE_PRIO);
spin_unlock(&dd->lock);
seq_printf(m, "%u %u %u\n", rt, be, idle);
seq_printf(m, "%u %u %u\n", dd_queued(dd, DD_RT_PRIO),
dd_queued(dd, DD_BE_PRIO),
dd_queued(dd, DD_IDLE_PRIO));
return 0; return 0;
} }
/* Number of requests owned by the block driver for a given priority. */ /* Number of requests owned by the block driver for a given priority. */
static u32 dd_owned_by_driver(struct deadline_data *dd, enum dd_prio prio) static u32 dd_owned_by_driver(struct deadline_data *dd, enum dd_prio prio)
{ {
return dd_sum(dd, dispatched, prio) + dd_sum(dd, merged, prio) const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats;
- dd_sum(dd, completed, prio);
lockdep_assert_held(&dd->lock);
return stats->dispatched + stats->merged -
atomic_read(&stats->completed);
} }
static int dd_owned_by_driver_show(void *data, struct seq_file *m) static int dd_owned_by_driver_show(void *data, struct seq_file *m)
{ {
struct request_queue *q = data; struct request_queue *q = data;
struct deadline_data *dd = q->elevator->elevator_data; struct deadline_data *dd = q->elevator->elevator_data;
u32 rt, be, idle;
spin_lock(&dd->lock);
rt = dd_owned_by_driver(dd, DD_RT_PRIO);
be = dd_owned_by_driver(dd, DD_BE_PRIO);
idle = dd_owned_by_driver(dd, DD_IDLE_PRIO);
spin_unlock(&dd->lock);
seq_printf(m, "%u %u %u\n", rt, be, idle);
seq_printf(m, "%u %u %u\n", dd_owned_by_driver(dd, DD_RT_PRIO),
dd_owned_by_driver(dd, DD_BE_PRIO),
dd_owned_by_driver(dd, DD_IDLE_PRIO));
return 0; return 0;
} }
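Taken together, the mq-deadline statistics rework replaces the per-CPU dd_count()/dd_sum() machinery with plain 32-bit counters updated under dd->lock, plus an atomic completion count (completions may happen without the lock). As the new comment notes, overflow is harmless because only differences such as inserted - completed are ever consumed, and unsigned subtraction stays exact as long as the true outstanding count fits in 32 bits. A standalone demonstration of that property (pure arithmetic, no kernel API):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t inserted  = UINT32_MAX - 2;    /* about to wrap */
        uint32_t completed = UINT32_MAX - 5;    /* 3 requests still queued */

        /* Insert four more requests; the counter wraps past zero. */
        inserted += 4;                          /* now 1 */
        printf("inserted=%" PRIu32 " completed=%" PRIu32 " queued=%" PRIu32 "\n",
               inserted, completed, inserted - completed);      /* queued == 7 */
        return 0;
}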

View File

@ -2,6 +2,8 @@
# #
# Partition configuration # Partition configuration
# #
menu "Partition Types"
config PARTITION_ADVANCED config PARTITION_ADVANCED
bool "Advanced partition selection" bool "Advanced partition selection"
help help
@ -267,3 +269,5 @@ config CMDLINE_PARTITION
help help
Say Y here if you want to read the partition table from bootargs. Say Y here if you want to read the partition table from bootargs.
The format for the command line is just like mtdparts. The format for the command line is just like mtdparts.
endmenu

View File

@ -91,19 +91,19 @@ static void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors)
{ {
spin_lock(&bdev->bd_size_lock); spin_lock(&bdev->bd_size_lock);
i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT);
bdev->bd_nr_sectors = sectors;
spin_unlock(&bdev->bd_size_lock); spin_unlock(&bdev->bd_size_lock);
} }
static struct parsed_partitions *allocate_partitions(struct gendisk *hd) static struct parsed_partitions *allocate_partitions(struct gendisk *hd)
{ {
struct parsed_partitions *state; struct parsed_partitions *state;
int nr; int nr = DISK_MAX_PARTS;
state = kzalloc(sizeof(*state), GFP_KERNEL); state = kzalloc(sizeof(*state), GFP_KERNEL);
if (!state) if (!state)
return NULL; return NULL;
nr = disk_max_parts(hd);
state->parts = vzalloc(array_size(nr, sizeof(state->parts[0]))); state->parts = vzalloc(array_size(nr, sizeof(state->parts[0])));
if (!state->parts) { if (!state->parts) {
kfree(state); kfree(state);
@ -204,7 +204,7 @@ static ssize_t part_alignment_offset_show(struct device *dev,
struct block_device *bdev = dev_to_bdev(dev); struct block_device *bdev = dev_to_bdev(dev);
return sprintf(buf, "%u\n", return sprintf(buf, "%u\n",
queue_limit_alignment_offset(&bdev->bd_disk->queue->limits, queue_limit_alignment_offset(&bdev_get_queue(bdev)->limits,
bdev->bd_start_sect)); bdev->bd_start_sect));
} }
@ -214,7 +214,7 @@ static ssize_t part_discard_alignment_show(struct device *dev,
struct block_device *bdev = dev_to_bdev(dev); struct block_device *bdev = dev_to_bdev(dev);
return sprintf(buf, "%u\n", return sprintf(buf, "%u\n",
queue_limit_discard_alignment(&bdev->bd_disk->queue->limits, queue_limit_discard_alignment(&bdev_get_queue(bdev)->limits,
bdev->bd_start_sect)); bdev->bd_start_sect));
} }
@ -325,7 +325,7 @@ static struct block_device *add_partition(struct gendisk *disk, int partno,
lockdep_assert_held(&disk->open_mutex); lockdep_assert_held(&disk->open_mutex);
if (partno >= disk_max_parts(disk)) if (partno >= DISK_MAX_PARTS)
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
/* /*
@ -526,18 +526,15 @@ int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start,
static bool disk_unlock_native_capacity(struct gendisk *disk) static bool disk_unlock_native_capacity(struct gendisk *disk)
{ {
const struct block_device_operations *bdops = disk->fops; if (!disk->fops->unlock_native_capacity ||
test_and_set_bit(GD_NATIVE_CAPACITY, &disk->state)) {
if (bdops->unlock_native_capacity &&
!(disk->flags & GENHD_FL_NATIVE_CAPACITY)) {
printk(KERN_CONT "enabling native capacity\n");
bdops->unlock_native_capacity(disk);
disk->flags |= GENHD_FL_NATIVE_CAPACITY;
return true;
} else {
printk(KERN_CONT "truncated\n"); printk(KERN_CONT "truncated\n");
return false; return false;
} }
printk(KERN_CONT "enabling native capacity\n");
disk->fops->unlock_native_capacity(disk);
return true;
} }
void blk_drop_partitions(struct gendisk *disk) void blk_drop_partitions(struct gendisk *disk)
@ -606,7 +603,7 @@ static int blk_add_partitions(struct gendisk *disk)
struct parsed_partitions *state; struct parsed_partitions *state;
int ret = -EAGAIN, p; int ret = -EAGAIN, p;
if (!disk_part_scan_enabled(disk)) if (disk->flags & GENHD_FL_NO_PART)
return 0; return 0;
state = check_partition(disk); state = check_partition(disk);
@ -689,7 +686,7 @@ int bdev_disk_changed(struct gendisk *disk, bool invalidate)
* userspace for this particular setup. * userspace for this particular setup.
*/ */
if (invalidate) { if (invalidate) {
if (disk_part_scan_enabled(disk) || if (!(disk->flags & GENHD_FL_NO_PART) ||
!(disk->flags & GENHD_FL_REMOVABLE)) !(disk->flags & GENHD_FL_REMOVABLE))
set_capacity(disk, 0); set_capacity(disk, 0);
} }
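In the partition rescan path above, disk_unlock_native_capacity() now records the "already tried" state with test_and_set_bit(GD_NATIVE_CAPACITY, &disk->state), so the check and the claim happen in one atomic step and the driver callback can fire at most once per disk. A userspace sketch of the same run-once idiom using a compiler atomic builtin (the kernel bitops themselves are not available outside the kernel):

#include <stdbool.h>
#include <stdio.h>

static unsigned long state;             /* stand-in for disk->state */
#define NATIVE_CAPACITY_BIT 0

static bool try_unlock_native_capacity(void)
{
        unsigned long mask = 1UL << NATIVE_CAPACITY_BIT;

        /* Atomically set the bit and learn whether it was already set,
         * analogous to test_and_set_bit(GD_NATIVE_CAPACITY, &disk->state). */
        if (__atomic_fetch_or(&state, mask, __ATOMIC_SEQ_CST) & mask) {
                printf("truncated (already tried)\n");
                return false;
        }
        printf("enabling native capacity\n");
        return true;
}

int main(void)
{
        try_unlock_native_capacity();   /* first call enables */
        try_unlock_native_capacity();   /* second call reports truncated */
        return 0;
}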

View File

@ -133,7 +133,7 @@ efi_crc32(const void *buf, unsigned long len)
*/ */
static u64 last_lba(struct gendisk *disk) static u64 last_lba(struct gendisk *disk)
{ {
return div_u64(disk->part0->bd_inode->i_size, return div_u64(bdev_nr_bytes(disk->part0),
queue_logical_block_size(disk->queue)) - 1ULL; queue_logical_block_size(disk->queue)) - 1ULL;
} }
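last_lba() above now derives the last addressable block from bdev_nr_bytes() instead of reading the inode size directly; the arithmetic is unchanged: total bytes divided by the logical block size, minus one. A worked example with illustrative numbers:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t nr_bytes = 500107862016ULL;    /* a nominal "500 GB" disk */
        uint32_t lbs = 4096;                    /* 4 KiB logical blocks */

        /* 500107862016 / 4096 - 1 = 122096646 - 1 = 122096645 */
        printf("last_lba = %" PRIu64 "\n", nr_bytes / lbs - 1);
        return 0;
}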

View File

@ -198,7 +198,7 @@ static int find_lnx1_partitions(struct parsed_partitions *state,
char name[], char name[],
union label_t *label, union label_t *label,
sector_t labelsect, sector_t labelsect,
loff_t i_size, sector_t nr_sectors,
dasd_information2_t *info) dasd_information2_t *info)
{ {
loff_t offset, geo_size, size; loff_t offset, geo_size, size;
@ -213,14 +213,14 @@ static int find_lnx1_partitions(struct parsed_partitions *state,
} else { } else {
/* /*
* Formatted w/o large volume support. If the sanity check * Formatted w/o large volume support. If the sanity check
* 'size based on geo == size based on i_size' is true, then * 'size based on geo == size based on nr_sectors' is true, then
* we can safely assume that we know the formatted size of * we can safely assume that we know the formatted size of
* the disk, otherwise we need additional information * the disk, otherwise we need additional information
* that we can only get from a real DASD device. * that we can only get from a real DASD device.
*/ */
geo_size = geo->cylinders * geo->heads geo_size = geo->cylinders * geo->heads
* geo->sectors * secperblk; * geo->sectors * secperblk;
size = i_size >> 9; size = nr_sectors;
if (size != geo_size) { if (size != geo_size) {
if (!info) { if (!info) {
strlcat(state->pp_buf, "\n", PAGE_SIZE); strlcat(state->pp_buf, "\n", PAGE_SIZE);
@ -229,7 +229,7 @@ static int find_lnx1_partitions(struct parsed_partitions *state,
if (!strcmp(info->type, "ECKD")) if (!strcmp(info->type, "ECKD"))
if (geo_size < size) if (geo_size < size)
size = geo_size; size = geo_size;
/* else keep size based on i_size */ /* else keep size based on nr_sectors */
} }
} }
/* first and only partition starts in the first block after the label */ /* first and only partition starts in the first block after the label */
@ -293,7 +293,8 @@ int ibm_partition(struct parsed_partitions *state)
struct gendisk *disk = state->disk; struct gendisk *disk = state->disk;
struct block_device *bdev = disk->part0; struct block_device *bdev = disk->part0;
int blocksize, res; int blocksize, res;
loff_t i_size, offset, size; loff_t offset, size;
sector_t nr_sectors;
dasd_information2_t *info; dasd_information2_t *info;
struct hd_geometry *geo; struct hd_geometry *geo;
char type[5] = {0,}; char type[5] = {0,};
@ -308,8 +309,8 @@ int ibm_partition(struct parsed_partitions *state)
blocksize = bdev_logical_block_size(bdev); blocksize = bdev_logical_block_size(bdev);
if (blocksize <= 0) if (blocksize <= 0)
goto out_symbol; goto out_symbol;
i_size = i_size_read(bdev->bd_inode); nr_sectors = bdev_nr_sectors(bdev);
if (i_size == 0) if (nr_sectors == 0)
goto out_symbol; goto out_symbol;
info = kmalloc(sizeof(dasd_information2_t), GFP_KERNEL); info = kmalloc(sizeof(dasd_information2_t), GFP_KERNEL);
if (info == NULL) if (info == NULL)
@ -336,7 +337,7 @@ int ibm_partition(struct parsed_partitions *state)
label); label);
} else if (!strncmp(type, "LNX1", 4)) { } else if (!strncmp(type, "LNX1", 4)) {
res = find_lnx1_partitions(state, geo, blocksize, name, res = find_lnx1_partitions(state, geo, blocksize, name,
label, labelsect, i_size, label, labelsect, nr_sectors,
info); info);
} else if (!strncmp(type, "CMS1", 4)) { } else if (!strncmp(type, "CMS1", 4)) {
res = find_cms1_partitions(state, geo, blocksize, name, res = find_cms1_partitions(state, geo, blocksize, name,
@ -353,7 +354,7 @@ int ibm_partition(struct parsed_partitions *state)
res = 1; res = 1;
if (info->format == DASD_FORMAT_LDL) { if (info->format == DASD_FORMAT_LDL) {
strlcat(state->pp_buf, "(nonl)", PAGE_SIZE); strlcat(state->pp_buf, "(nonl)", PAGE_SIZE);
size = i_size >> 9; size = nr_sectors;
offset = (info->label_block + 1) * (blocksize >> 9); offset = (info->label_block + 1) * (blocksize >> 9);
put_partition(state, 1, offset, size-offset); put_partition(state, 1, offset, size-offset);
strlcat(state->pp_buf, "\n", PAGE_SIZE); strlcat(state->pp_buf, "\n", PAGE_SIZE);

View File

@ -5,7 +5,7 @@
*/ */
#include <linux/t10-pi.h> #include <linux/t10-pi.h>
#include <linux/blkdev.h> #include <linux/blk-integrity.h>
#include <linux/crc-t10dif.h> #include <linux/crc-t10dif.h>
#include <linux/module.h> #include <linux/module.h>
#include <net/checksum.h> #include <net/checksum.h>