Mirror of https://github.com/Qortal/Brooklyn.git (synced 2025-01-30 14:52:17 +00:00)

Commit 07d9c3128d ("phase 11")
Parent: fb209289b8
block/Kconfig
@@ -35,6 +35,9 @@ config BLK_CGROUP_RWSTAT
config BLK_DEV_BSG_COMMON
	tristate

config BLK_ICQ
	bool

config BLK_DEV_BSGLIB
	bool "Block layer SG support v4 helper lib"
	select BLK_DEV_BSG_COMMON
@@ -73,7 +76,7 @@ config BLK_DEV_ZONED

config BLK_DEV_THROTTLING
	bool "Block layer bio throttling support"
	depends on BLK_CGROUP=y
	depends on BLK_CGROUP
	select BLK_CGROUP_RWSTAT
	help
	  Block layer bio throttling support. It can be used to limit
@@ -112,7 +115,7 @@ config BLK_WBT_MQ

config BLK_CGROUP_IOLATENCY
	bool "Enable support for latency based cgroup IO protection"
	depends on BLK_CGROUP=y
	depends on BLK_CGROUP
	help
	  Enabling this option enables the .latency interface for IO throttling.
	  The IO controller will attempt to maintain average IO latencies below
@@ -132,7 +135,7 @@ config BLK_CGROUP_FC_APPID

config BLK_CGROUP_IOCOST
	bool "Enable support for cost model based cgroup IO controller"
	depends on BLK_CGROUP=y
	depends on BLK_CGROUP
	select BLK_RQ_IO_DATA_LEN
	select BLK_RQ_ALLOC_TIME
	help
@@ -190,39 +193,31 @@ config BLK_INLINE_ENCRYPTION_FALLBACK
	  by falling back to the kernel crypto API when inline
	  encryption hardware is not present.

menu "Partition Types"

source "block/partitions/Kconfig"

endmenu

endif # BLOCK

config BLOCK_COMPAT
	bool
	depends on BLOCK && COMPAT
	default y
	def_bool COMPAT

config BLK_MQ_PCI
	bool
	depends on BLOCK && PCI
	default y
	def_bool PCI

config BLK_MQ_VIRTIO
	bool
	depends on BLOCK && VIRTIO
	depends on VIRTIO
	default y

config BLK_MQ_RDMA
	bool
	depends on BLOCK && INFINIBAND
	depends on INFINIBAND
	default y

config BLK_PM
	def_bool BLOCK && PM
	def_bool PM

# do not use in new code
config BLOCK_HOLDER_DEPRECATED
	bool

source "block/Kconfig.iosched"

endif # BLOCK

block/Kconfig.iosched
@@ -1,6 +1,4 @@
# SPDX-License-Identifier: GPL-2.0
if BLOCK

menu "IO Schedulers"

config MQ_IOSCHED_DEADLINE
@@ -20,6 +18,7 @@ config MQ_IOSCHED_KYBER

config IOSCHED_BFQ
	tristate "BFQ I/O scheduler"
	select BLK_ICQ
	help
	  BFQ I/O scheduler for BLK-MQ. BFQ distributes the bandwidth of
	  of the device among all processes according to their weights,
@@ -45,5 +44,3 @@ config BFQ_CGROUP_DEBUG
	  files in a cgroup which can be useful for debugging.

endmenu

endif

block/Makefile
@@ -3,13 +3,13 @@
# Makefile for the kernel block layer
#

obj-$(CONFIG_BLOCK) := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
obj-y		:= bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
			blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
			blk-exec.o blk-merge.o blk-timeout.o \
			blk-merge.o blk-timeout.o \
			blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
			blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
			genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \
			disk-events.o
			disk-events.o blk-ia-ranges.o

obj-$(CONFIG_BOUNCE)		+= bounce.o
obj-$(CONFIG_BLK_DEV_BSG_COMMON) += bsg.o
@@ -36,6 +36,6 @@ obj-$(CONFIG_BLK_DEBUG_FS)	+= blk-mq-debugfs.o
obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o
obj-$(CONFIG_BLK_SED_OPAL)	+= sed-opal.o
obj-$(CONFIG_BLK_PM)		+= blk-pm.o
obj-$(CONFIG_BLK_INLINE_ENCRYPTION)	+= keyslot-manager.o blk-crypto.o
obj-$(CONFIG_BLK_INLINE_ENCRYPTION)	+= blk-crypto.o blk-crypto-profile.o
obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK)	+= blk-crypto-fallback.o
obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED)	+= holder.o
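Note on the Kconfig hunks above: BLK_ICQ is a new hidden bool that IOSCHED_BFQ selects, so the io_context/icq tracking code only has to be built when a scheduler actually uses it. A minimal, hypothetical sketch of how code is typically gated on such a symbol (the function name is illustrative, not part of this patch):

```c
/* Hypothetical sketch: compiling code out when CONFIG_BLK_ICQ is not set. */
#ifdef CONFIG_BLK_ICQ
void example_icq_cleanup(struct io_context *ioc);   /* real icq teardown lives here */
#else
static inline void example_icq_cleanup(struct io_context *ioc) { }  /* no icq users configured */
#endif
```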
block/bdev.c: 79 lines changed
@ -12,6 +12,7 @@
|
||||
#include <linux/major.h>
|
||||
#include <linux/device_cgroup.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/blk-integrity.h>
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/blkpg.h>
|
||||
@ -23,7 +24,6 @@
|
||||
#include <linux/pseudo_fs.h>
|
||||
#include <linux/uio.h>
|
||||
#include <linux/namei.h>
|
||||
#include <linux/cleancache.h>
|
||||
#include <linux/part_stat.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include "../fs/internal.h"
|
||||
@ -87,10 +87,6 @@ void invalidate_bdev(struct block_device *bdev)
|
||||
lru_add_drain_all(); /* make sure all lru add caches are flushed */
|
||||
invalidate_mapping_pages(mapping, 0, -1);
|
||||
}
|
||||
/* 99% of the time, we don't need to flush the cleancache on the bdev.
|
||||
* But, for the strange corners, lets be cautious
|
||||
*/
|
||||
cleancache_invalidate_inode(mapping);
|
||||
}
|
||||
EXPORT_SYMBOL(invalidate_bdev);
|
||||
|
||||
@ -184,14 +180,13 @@ int sb_min_blocksize(struct super_block *sb, int size)
|
||||
|
||||
EXPORT_SYMBOL(sb_min_blocksize);
|
||||
|
||||
int __sync_blockdev(struct block_device *bdev, int wait)
|
||||
int sync_blockdev_nowait(struct block_device *bdev)
|
||||
{
|
||||
if (!bdev)
|
||||
return 0;
|
||||
if (!wait)
|
||||
return filemap_flush(bdev->bd_inode->i_mapping);
|
||||
return filemap_write_and_wait(bdev->bd_inode->i_mapping);
|
||||
return filemap_flush(bdev->bd_inode->i_mapping);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(sync_blockdev_nowait);
|
||||
|
||||
/*
|
||||
* Write out and wait upon all the dirty data associated with a block
|
||||
@ -199,7 +194,9 @@ int __sync_blockdev(struct block_device *bdev, int wait)
|
||||
*/
|
||||
int sync_blockdev(struct block_device *bdev)
|
||||
{
|
||||
return __sync_blockdev(bdev, 1);
|
||||
if (!bdev)
|
||||
return 0;
|
||||
return filemap_write_and_wait(bdev->bd_inode->i_mapping);
|
||||
}
|
||||
EXPORT_SYMBOL(sync_blockdev);
|
||||
|
||||
@ -326,12 +323,12 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
|
||||
if (!ops->rw_page || bdev_get_integrity(bdev))
|
||||
return result;
|
||||
|
||||
result = blk_queue_enter(bdev->bd_disk->queue, 0);
|
||||
result = blk_queue_enter(bdev_get_queue(bdev), 0);
|
||||
if (result)
|
||||
return result;
|
||||
result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
|
||||
REQ_OP_READ);
|
||||
blk_queue_exit(bdev->bd_disk->queue);
|
||||
blk_queue_exit(bdev_get_queue(bdev));
|
||||
return result;
|
||||
}
|
||||
|
||||
@ -362,7 +359,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
|
||||
|
||||
if (!ops->rw_page || bdev_get_integrity(bdev))
|
||||
return -EOPNOTSUPP;
|
||||
result = blk_queue_enter(bdev->bd_disk->queue, 0);
|
||||
result = blk_queue_enter(bdev_get_queue(bdev), 0);
|
||||
if (result)
|
||||
return result;
|
||||
|
||||
@ -375,7 +372,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
|
||||
clean_page_buffers(page);
|
||||
unlock_page(page);
|
||||
}
|
||||
blk_queue_exit(bdev->bd_disk->queue);
|
||||
blk_queue_exit(bdev_get_queue(bdev));
|
||||
return result;
|
||||
}
|
||||
|
||||
@ -492,6 +489,7 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
|
||||
spin_lock_init(&bdev->bd_size_lock);
|
||||
bdev->bd_partno = partno;
|
||||
bdev->bd_inode = inode;
|
||||
bdev->bd_queue = disk->queue;
|
||||
bdev->bd_stats = alloc_percpu(struct disk_stats);
|
||||
if (!bdev->bd_stats) {
|
||||
iput(inode);
|
||||
@ -662,7 +660,7 @@ static void blkdev_flush_mapping(struct block_device *bdev)
|
||||
static int blkdev_get_whole(struct block_device *bdev, fmode_t mode)
|
||||
{
|
||||
struct gendisk *disk = bdev->bd_disk;
|
||||
int ret = 0;
|
||||
int ret;
|
||||
|
||||
if (disk->fops->open) {
|
||||
ret = disk->fops->open(bdev, mode);
|
||||
@ -747,21 +745,11 @@ struct block_device *blkdev_get_no_open(dev_t dev)
|
||||
if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
|
||||
bdev = NULL;
|
||||
iput(inode);
|
||||
|
||||
if (!bdev)
|
||||
return NULL;
|
||||
if ((bdev->bd_disk->flags & GENHD_FL_HIDDEN) ||
|
||||
!try_module_get(bdev->bd_disk->fops->owner)) {
|
||||
put_device(&bdev->bd_device);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return bdev;
|
||||
}
|
||||
|
||||
void blkdev_put_no_open(struct block_device *bdev)
|
||||
{
|
||||
module_put(bdev->bd_disk->fops->owner);
|
||||
put_device(&bdev->bd_device);
|
||||
}
|
||||
|
||||
@ -817,12 +805,14 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
|
||||
ret = -ENXIO;
|
||||
if (!disk_live(disk))
|
||||
goto abort_claiming;
|
||||
if (!try_module_get(disk->fops->owner))
|
||||
goto abort_claiming;
|
||||
if (bdev_is_partition(bdev))
|
||||
ret = blkdev_get_part(bdev, mode);
|
||||
else
|
||||
ret = blkdev_get_whole(bdev, mode);
|
||||
if (ret)
|
||||
goto abort_claiming;
|
||||
goto put_module;
|
||||
if (mode & FMODE_EXCL) {
|
||||
bd_finish_claiming(bdev, holder);
|
||||
|
||||
@ -834,7 +824,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
|
||||
* used in blkdev_get/put().
|
||||
*/
|
||||
if ((mode & FMODE_WRITE) && !bdev->bd_write_holder &&
|
||||
(disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
|
||||
(disk->event_flags & DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE)) {
|
||||
bdev->bd_write_holder = true;
|
||||
unblock_events = false;
|
||||
}
|
||||
@ -844,7 +834,8 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
|
||||
if (unblock_events)
|
||||
disk_unblock_events(disk);
|
||||
return bdev;
|
||||
|
||||
put_module:
|
||||
module_put(disk->fops->owner);
|
||||
abort_claiming:
|
||||
if (mode & FMODE_EXCL)
|
||||
bd_abort_claiming(bdev, holder);
|
||||
@ -953,18 +944,21 @@ void blkdev_put(struct block_device *bdev, fmode_t mode)
|
||||
blkdev_put_whole(bdev, mode);
|
||||
mutex_unlock(&disk->open_mutex);
|
||||
|
||||
module_put(disk->fops->owner);
|
||||
blkdev_put_no_open(bdev);
|
||||
}
|
||||
EXPORT_SYMBOL(blkdev_put);
|
||||
|
||||
/**
|
||||
* lookup_bdev - lookup a struct block_device by name
|
||||
* @pathname: special file representing the block device
|
||||
* @dev: return value of the block device's dev_t
|
||||
* lookup_bdev() - Look up a struct block_device by name.
|
||||
* @pathname: Name of the block device in the filesystem.
|
||||
* @dev: Pointer to the block device's dev_t, if found.
|
||||
*
|
||||
* Get a reference to the blockdevice at @pathname in the current
|
||||
* namespace if possible and return it. Return ERR_PTR(error)
|
||||
* otherwise.
|
||||
* Lookup the block device's dev_t at @pathname in the current
|
||||
* namespace if possible and return it in @dev.
|
||||
*
|
||||
* Context: May sleep.
|
||||
* Return: 0 if succeeded, negative errno otherwise.
|
||||
*/
|
||||
int lookup_bdev(const char *pathname, dev_t *dev)
|
||||
{
|
||||
@ -1016,7 +1010,7 @@ int __invalidate_device(struct block_device *bdev, bool kill_dirty)
|
||||
}
|
||||
EXPORT_SYMBOL(__invalidate_device);
|
||||
|
||||
void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
|
||||
void sync_bdevs(bool wait)
|
||||
{
|
||||
struct inode *inode, *old_inode = NULL;
|
||||
|
||||
@ -1047,8 +1041,19 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
|
||||
bdev = I_BDEV(inode);
|
||||
|
||||
mutex_lock(&bdev->bd_disk->open_mutex);
|
||||
if (bdev->bd_openers)
|
||||
func(bdev, arg);
|
||||
if (!bdev->bd_openers) {
|
||||
; /* skip */
|
||||
} else if (wait) {
|
||||
/*
|
||||
* We keep the error status of individual mapping so
|
||||
* that applications can catch the writeback error using
|
||||
* fsync(2). See filemap_fdatawait_keep_errors() for
|
||||
* details.
|
||||
*/
|
||||
filemap_fdatawait_keep_errors(inode->i_mapping);
|
||||
} else {
|
||||
filemap_fdatawrite(inode->i_mapping);
|
||||
}
|
||||
mutex_unlock(&bdev->bd_disk->open_mutex);
|
||||
|
||||
spin_lock(&blockdev_superblock->s_inode_list_lock);
|
||||
|
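The bdev.c hunks above split the old __sync_blockdev(bdev, wait) helper into two explicit entry points: sync_blockdev_nowait() starts writeback without waiting, sync_blockdev() writes out and waits. A short caller-side sketch, assuming only what the diff shows (the wrapper function itself is hypothetical):

```c
/* Sketch: choosing between the two sync variants introduced above. */
static int example_flush_bdev(struct block_device *bdev, bool wait)
{
	if (!wait)
		return sync_blockdev_nowait(bdev);	/* kick off writeback, return immediately */
	return sync_blockdev(bdev);			/* write out and wait for completion */
}
```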
@ -6,13 +6,13 @@
|
||||
#include <linux/slab.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/cgroup.h>
|
||||
#include <linux/elevator.h>
|
||||
#include <linux/ktime.h>
|
||||
#include <linux/rbtree.h>
|
||||
#include <linux/ioprio.h>
|
||||
#include <linux/sbitmap.h>
|
||||
#include <linux/delay.h>
|
||||
|
||||
#include "elevator.h"
|
||||
#include "bfq-iosched.h"
|
||||
|
||||
#ifdef CONFIG_BFQ_CGROUP_DEBUG
|
||||
@ -463,7 +463,7 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
|
||||
{
|
||||
if (blkg_rwstat_init(&stats->bytes, gfp) ||
|
||||
blkg_rwstat_init(&stats->ios, gfp))
|
||||
return -ENOMEM;
|
||||
goto error;
|
||||
|
||||
#ifdef CONFIG_BFQ_CGROUP_DEBUG
|
||||
if (blkg_rwstat_init(&stats->merged, gfp) ||
|
||||
@ -476,13 +476,15 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
|
||||
bfq_stat_init(&stats->dequeue, gfp) ||
|
||||
bfq_stat_init(&stats->group_wait_time, gfp) ||
|
||||
bfq_stat_init(&stats->idle_time, gfp) ||
|
||||
bfq_stat_init(&stats->empty_time, gfp)) {
|
||||
bfqg_stats_exit(stats);
|
||||
return -ENOMEM;
|
||||
}
|
||||
bfq_stat_init(&stats->empty_time, gfp))
|
||||
goto error;
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
|
||||
error:
|
||||
bfqg_stats_exit(stats);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
|
||||
|
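The bfqg_stats_init() hunk above replaces the duplicated cleanup-and-return sequences with a single error label. A minimal sketch of that pattern with made-up names (the struct and helpers here are illustrative, not from the patch):

```c
/* Sketch of the single-error-label cleanup pattern adopted above. */
static int example_stats_init(struct example_stats *stats, gfp_t gfp)
{
	if (alloc_part_a(stats, gfp) || alloc_part_b(stats, gfp))
		goto error;
	return 0;

error:
	example_stats_exit(stats);	/* one cleanup path for every failure */
	return -ENOMEM;
}
```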
@ -117,7 +117,6 @@
|
||||
#include <linux/slab.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/cgroup.h>
|
||||
#include <linux/elevator.h>
|
||||
#include <linux/ktime.h>
|
||||
#include <linux/rbtree.h>
|
||||
#include <linux/ioprio.h>
|
||||
@ -127,6 +126,7 @@
|
||||
|
||||
#include <trace/events/block.h>
|
||||
|
||||
#include "elevator.h"
|
||||
#include "blk.h"
|
||||
#include "blk-mq.h"
|
||||
#include "blk-mq-tag.h"
|
||||
@ -433,26 +433,21 @@ static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
|
||||
|
||||
/**
|
||||
* bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
|
||||
* @bfqd: the lookup key.
|
||||
* @ioc: the io_context of the process doing I/O.
|
||||
* @q: the request queue.
|
||||
*/
|
||||
static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
|
||||
struct io_context *ioc,
|
||||
struct request_queue *q)
|
||||
static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q)
|
||||
{
|
||||
if (ioc) {
|
||||
unsigned long flags;
|
||||
struct bfq_io_cq *icq;
|
||||
struct bfq_io_cq *icq;
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&q->queue_lock, flags);
|
||||
icq = icq_to_bic(ioc_lookup_icq(ioc, q));
|
||||
spin_unlock_irqrestore(&q->queue_lock, flags);
|
||||
if (!current->io_context)
|
||||
return NULL;
|
||||
|
||||
return icq;
|
||||
}
|
||||
spin_lock_irqsave(&q->queue_lock, flags);
|
||||
icq = icq_to_bic(ioc_lookup_icq(q));
|
||||
spin_unlock_irqrestore(&q->queue_lock, flags);
|
||||
|
||||
return NULL;
|
||||
return icq;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -565,26 +560,134 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd,
|
||||
}
|
||||
}
|
||||
|
||||
#define BFQ_LIMIT_INLINE_DEPTH 16
|
||||
|
||||
#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
||||
static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit)
|
||||
{
|
||||
struct bfq_data *bfqd = bfqq->bfqd;
|
||||
struct bfq_entity *entity = &bfqq->entity;
|
||||
struct bfq_entity *inline_entities[BFQ_LIMIT_INLINE_DEPTH];
|
||||
struct bfq_entity **entities = inline_entities;
|
||||
int depth, level;
|
||||
int class_idx = bfqq->ioprio_class - 1;
|
||||
struct bfq_sched_data *sched_data;
|
||||
unsigned long wsum;
|
||||
bool ret = false;
|
||||
|
||||
if (!entity->on_st_or_in_serv)
|
||||
return false;
|
||||
|
||||
/* +1 for bfqq entity, root cgroup not included */
|
||||
depth = bfqg_to_blkg(bfqq_group(bfqq))->blkcg->css.cgroup->level + 1;
|
||||
if (depth > BFQ_LIMIT_INLINE_DEPTH) {
|
||||
entities = kmalloc_array(depth, sizeof(*entities), GFP_NOIO);
|
||||
if (!entities)
|
||||
return false;
|
||||
}
|
||||
|
||||
spin_lock_irq(&bfqd->lock);
|
||||
sched_data = entity->sched_data;
|
||||
/* Gather our ancestors as we need to traverse them in reverse order */
|
||||
level = 0;
|
||||
for_each_entity(entity) {
|
||||
/*
|
||||
* If at some level entity is not even active, allow request
|
||||
* queueing so that BFQ knows there's work to do and activate
|
||||
* entities.
|
||||
*/
|
||||
if (!entity->on_st_or_in_serv)
|
||||
goto out;
|
||||
/* Uh, more parents than cgroup subsystem thinks? */
|
||||
if (WARN_ON_ONCE(level >= depth))
|
||||
break;
|
||||
entities[level++] = entity;
|
||||
}
|
||||
WARN_ON_ONCE(level != depth);
|
||||
for (level--; level >= 0; level--) {
|
||||
entity = entities[level];
|
||||
if (level > 0) {
|
||||
wsum = bfq_entity_service_tree(entity)->wsum;
|
||||
} else {
|
||||
int i;
|
||||
/*
|
||||
* For bfqq itself we take into account service trees
|
||||
* of all higher priority classes and multiply their
|
||||
* weights so that low prio queue from higher class
|
||||
* gets more requests than high prio queue from lower
|
||||
* class.
|
||||
*/
|
||||
wsum = 0;
|
||||
for (i = 0; i <= class_idx; i++) {
|
||||
wsum = wsum * IOPRIO_BE_NR +
|
||||
sched_data->service_tree[i].wsum;
|
||||
}
|
||||
}
|
||||
limit = DIV_ROUND_CLOSEST(limit * entity->weight, wsum);
|
||||
if (entity->allocated >= limit) {
|
||||
bfq_log_bfqq(bfqq->bfqd, bfqq,
|
||||
"too many requests: allocated %d limit %d level %d",
|
||||
entity->allocated, limit, level);
|
||||
ret = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
out:
|
||||
spin_unlock_irq(&bfqd->lock);
|
||||
if (entities != inline_entities)
|
||||
kfree(entities);
|
||||
return ret;
|
||||
}
|
||||
#else
|
||||
static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Async I/O can easily starve sync I/O (both sync reads and sync
|
||||
* writes), by consuming all tags. Similarly, storms of sync writes,
|
||||
* such as those that sync(2) may trigger, can starve sync reads.
|
||||
* Limit depths of async I/O and sync writes so as to counter both
|
||||
* problems.
|
||||
*
|
||||
* Also if a bfq queue or its parent cgroup consume more tags than would be
|
||||
* appropriate for their weight, we trim the available tag depth to 1. This
|
||||
* avoids a situation where one cgroup can starve another cgroup from tags and
|
||||
* thus block service differentiation among cgroups. Note that because the
|
||||
* queue / cgroup already has many requests allocated and queued, this does not
|
||||
* significantly affect service guarantees coming from the BFQ scheduling
|
||||
* algorithm.
|
||||
*/
|
||||
static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
|
||||
{
|
||||
struct bfq_data *bfqd = data->q->elevator->elevator_data;
|
||||
struct bfq_io_cq *bic = bfq_bic_lookup(data->q);
|
||||
struct bfq_queue *bfqq = bic ? bic_to_bfqq(bic, op_is_sync(op)) : NULL;
|
||||
int depth;
|
||||
unsigned limit = data->q->nr_requests;
|
||||
|
||||
if (op_is_sync(op) && !op_is_write(op))
|
||||
return;
|
||||
/* Sync reads have full depth available */
|
||||
if (op_is_sync(op) && !op_is_write(op)) {
|
||||
depth = 0;
|
||||
} else {
|
||||
depth = bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)];
|
||||
limit = (limit * depth) >> bfqd->full_depth_shift;
|
||||
}
|
||||
|
||||
data->shallow_depth =
|
||||
bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)];
|
||||
/*
|
||||
* Does queue (or any parent entity) exceed number of requests that
|
||||
* should be available to it? Heavily limit depth so that it cannot
|
||||
* consume more available requests and thus starve other entities.
|
||||
*/
|
||||
if (bfqq && bfqq_request_over_limit(bfqq, limit))
|
||||
depth = 1;
|
||||
|
||||
bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u",
|
||||
__func__, bfqd->wr_busy_queues, op_is_sync(op),
|
||||
data->shallow_depth);
|
||||
__func__, bfqd->wr_busy_queues, op_is_sync(op), depth);
|
||||
if (depth)
|
||||
data->shallow_depth = depth;
|
||||
}
|
||||
|
||||
static struct bfq_queue *
|
||||
@ -1113,7 +1216,8 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
|
||||
|
||||
static int bfqq_process_refs(struct bfq_queue *bfqq)
|
||||
{
|
||||
return bfqq->ref - bfqq->allocated - bfqq->entity.on_st_or_in_serv -
|
||||
return bfqq->ref - bfqq->entity.allocated -
|
||||
bfqq->entity.on_st_or_in_serv -
|
||||
(bfqq->weight_counter != NULL) - bfqq->stable_ref;
|
||||
}
|
||||
|
||||
@ -1982,20 +2086,19 @@ static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns)
|
||||
* aspect, see the comments on the choice of the queue for injection
|
||||
* in bfq_select_queue().
|
||||
*
|
||||
* Turning back to the detection of a waker queue, a queue Q is deemed
|
||||
* as a waker queue for bfqq if, for three consecutive times, bfqq
|
||||
* happens to become non empty right after a request of Q has been
|
||||
* completed. In this respect, even if bfqq is empty, we do not check
|
||||
* for a waker if it still has some in-flight I/O. In fact, in this
|
||||
* case bfqq is actually still being served by the drive, and may
|
||||
* receive new I/O on the completion of some of the in-flight
|
||||
* requests. In particular, on the first time, Q is tentatively set as
|
||||
* a candidate waker queue, while on the third consecutive time that Q
|
||||
* is detected, the field waker_bfqq is set to Q, to confirm that Q is
|
||||
* a waker queue for bfqq. These detection steps are performed only if
|
||||
* bfqq has a long think time, so as to make it more likely that
|
||||
* bfqq's I/O is actually being blocked by a synchronization. This
|
||||
* last filter, plus the above three-times requirement, make false
|
||||
* Turning back to the detection of a waker queue, a queue Q is deemed as a
|
||||
* waker queue for bfqq if, for three consecutive times, bfqq happens to become
|
||||
* non empty right after a request of Q has been completed within given
|
||||
* timeout. In this respect, even if bfqq is empty, we do not check for a waker
|
||||
* if it still has some in-flight I/O. In fact, in this case bfqq is actually
|
||||
* still being served by the drive, and may receive new I/O on the completion
|
||||
* of some of the in-flight requests. In particular, on the first time, Q is
|
||||
* tentatively set as a candidate waker queue, while on the third consecutive
|
||||
* time that Q is detected, the field waker_bfqq is set to Q, to confirm that Q
|
||||
* is a waker queue for bfqq. These detection steps are performed only if bfqq
|
||||
* has a long think time, so as to make it more likely that bfqq's I/O is
|
||||
* actually being blocked by a synchronization. This last filter, plus the
|
||||
* above three-times requirement and time limit for detection, make false
|
||||
* positives less likely.
|
||||
*
|
||||
* NOTE
|
||||
@ -2019,6 +2122,8 @@ static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns)
|
||||
static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq,
|
||||
u64 now_ns)
|
||||
{
|
||||
char waker_name[MAX_BFQQ_NAME_LENGTH];
|
||||
|
||||
if (!bfqd->last_completed_rq_bfqq ||
|
||||
bfqd->last_completed_rq_bfqq == bfqq ||
|
||||
bfq_bfqq_has_short_ttime(bfqq) ||
|
||||
@ -2027,8 +2132,16 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq,
|
||||
bfqd->last_completed_rq_bfqq == bfqq->waker_bfqq)
|
||||
return;
|
||||
|
||||
/*
|
||||
* We reset waker detection logic also if too much time has passed
|
||||
* since the first detection. If wakeups are rare, pointless idling
|
||||
* doesn't hurt throughput that much. The condition below makes sure
|
||||
* we do not uselessly idle blocking waker in more than 1/64 cases.
|
||||
*/
|
||||
if (bfqd->last_completed_rq_bfqq !=
|
||||
bfqq->tentative_waker_bfqq) {
|
||||
bfqq->tentative_waker_bfqq ||
|
||||
now_ns > bfqq->waker_detection_started +
|
||||
128 * (u64)bfqd->bfq_slice_idle) {
|
||||
/*
|
||||
* First synchronization detected with a
|
||||
* candidate waker queue, or with a different
|
||||
@ -2037,12 +2150,19 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq,
|
||||
bfqq->tentative_waker_bfqq =
|
||||
bfqd->last_completed_rq_bfqq;
|
||||
bfqq->num_waker_detections = 1;
|
||||
bfqq->waker_detection_started = now_ns;
|
||||
bfq_bfqq_name(bfqq->tentative_waker_bfqq, waker_name,
|
||||
MAX_BFQQ_NAME_LENGTH);
|
||||
bfq_log_bfqq(bfqd, bfqq, "set tenative waker %s", waker_name);
|
||||
} else /* Same tentative waker queue detected again */
|
||||
bfqq->num_waker_detections++;
|
||||
|
||||
if (bfqq->num_waker_detections == 3) {
|
||||
bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq;
|
||||
bfqq->tentative_waker_bfqq = NULL;
|
||||
bfq_bfqq_name(bfqq->waker_bfqq, waker_name,
|
||||
MAX_BFQQ_NAME_LENGTH);
|
||||
bfq_log_bfqq(bfqd, bfqq, "set waker %s", waker_name);
|
||||
|
||||
/*
|
||||
* If the waker queue disappears, then
|
||||
@ -2332,7 +2452,7 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio,
|
||||
* returned by bfq_bic_lookup does not go away before
|
||||
* bfqd->lock is taken.
|
||||
*/
|
||||
struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q);
|
||||
struct bfq_io_cq *bic = bfq_bic_lookup(q);
|
||||
bool ret;
|
||||
|
||||
spin_lock_irq(&bfqd->lock);
|
||||
@ -5878,6 +5998,22 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
|
||||
}
|
||||
}
|
||||
|
||||
static void bfqq_request_allocated(struct bfq_queue *bfqq)
|
||||
{
|
||||
struct bfq_entity *entity = &bfqq->entity;
|
||||
|
||||
for_each_entity(entity)
|
||||
entity->allocated++;
|
||||
}
|
||||
|
||||
static void bfqq_request_freed(struct bfq_queue *bfqq)
|
||||
{
|
||||
struct bfq_entity *entity = &bfqq->entity;
|
||||
|
||||
for_each_entity(entity)
|
||||
entity->allocated--;
|
||||
}
|
||||
|
||||
/* returns true if it causes the idle timer to be disabled */
|
||||
static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
|
||||
{
|
||||
@ -5891,8 +6027,8 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
|
||||
* Release the request's reference to the old bfqq
|
||||
* and make sure one is taken to the shared queue.
|
||||
*/
|
||||
new_bfqq->allocated++;
|
||||
bfqq->allocated--;
|
||||
bfqq_request_allocated(new_bfqq);
|
||||
bfqq_request_freed(bfqq);
|
||||
new_bfqq->ref++;
|
||||
/*
|
||||
* If the bic associated with the process
|
||||
@ -6209,8 +6345,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
|
||||
|
||||
static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq)
|
||||
{
|
||||
bfqq->allocated--;
|
||||
|
||||
bfqq_request_freed(bfqq);
|
||||
bfq_put_queue(bfqq);
|
||||
}
|
||||
|
||||
@ -6434,6 +6569,16 @@ static void bfq_finish_requeue_request(struct request *rq)
|
||||
rq->elv.priv[1] = NULL;
|
||||
}
|
||||
|
||||
static void bfq_finish_request(struct request *rq)
|
||||
{
|
||||
bfq_finish_requeue_request(rq);
|
||||
|
||||
if (rq->elv.icq) {
|
||||
put_io_context(rq->elv.icq->ioc);
|
||||
rq->elv.icq = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Removes the association between the current task and bfqq, assuming
|
||||
* that bic points to the bfq iocontext of the task.
|
||||
@ -6531,6 +6676,8 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
|
||||
*/
|
||||
static void bfq_prepare_request(struct request *rq)
|
||||
{
|
||||
rq->elv.icq = ioc_find_get_icq(rq->q);
|
||||
|
||||
/*
|
||||
* Regardless of whether we have an icq attached, we have to
|
||||
* clear the scheduler pointers, as they might point to
|
||||
@ -6630,7 +6777,7 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
|
||||
}
|
||||
}
|
||||
|
||||
bfqq->allocated++;
|
||||
bfqq_request_allocated(bfqq);
|
||||
bfqq->ref++;
|
||||
bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d",
|
||||
rq, bfqq, bfqq->ref);
|
||||
@ -6793,11 +6940,11 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
|
||||
* See the comments on bfq_limit_depth for the purpose of
|
||||
* the depths set in the function. Return minimum shallow depth we'll use.
|
||||
*/
|
||||
static unsigned int bfq_update_depths(struct bfq_data *bfqd,
|
||||
struct sbitmap_queue *bt)
|
||||
static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
|
||||
{
|
||||
unsigned int i, j, min_shallow = UINT_MAX;
|
||||
unsigned int depth = 1U << bt->sb.shift;
|
||||
|
||||
bfqd->full_depth_shift = bt->sb.shift;
|
||||
/*
|
||||
* In-word depths if no bfq_queue is being weight-raised:
|
||||
* leaving 25% of tags only for sync reads.
|
||||
@ -6809,13 +6956,13 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd,
|
||||
* limit 'something'.
|
||||
*/
|
||||
/* no more than 50% of tags for async I/O */
|
||||
bfqd->word_depths[0][0] = max((1U << bt->sb.shift) >> 1, 1U);
|
||||
bfqd->word_depths[0][0] = max(depth >> 1, 1U);
|
||||
/*
|
||||
* no more than 75% of tags for sync writes (25% extra tags
|
||||
* w.r.t. async I/O, to prevent async I/O from starving sync
|
||||
* writes)
|
||||
*/
|
||||
bfqd->word_depths[0][1] = max(((1U << bt->sb.shift) * 3) >> 2, 1U);
|
||||
bfqd->word_depths[0][1] = max((depth * 3) >> 2, 1U);
|
||||
|
||||
/*
|
||||
* In-word depths in case some bfq_queue is being weight-
|
||||
@ -6825,25 +6972,18 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd,
|
||||
* shortage.
|
||||
*/
|
||||
/* no more than ~18% of tags for async I/O */
|
||||
bfqd->word_depths[1][0] = max(((1U << bt->sb.shift) * 3) >> 4, 1U);
|
||||
bfqd->word_depths[1][0] = max((depth * 3) >> 4, 1U);
|
||||
/* no more than ~37% of tags for sync writes (~20% extra tags) */
|
||||
bfqd->word_depths[1][1] = max(((1U << bt->sb.shift) * 6) >> 4, 1U);
|
||||
|
||||
for (i = 0; i < 2; i++)
|
||||
for (j = 0; j < 2; j++)
|
||||
min_shallow = min(min_shallow, bfqd->word_depths[i][j]);
|
||||
|
||||
return min_shallow;
|
||||
bfqd->word_depths[1][1] = max((depth * 6) >> 4, 1U);
|
||||
}
|
||||
|
||||
static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx)
|
||||
{
|
||||
struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
|
||||
struct blk_mq_tags *tags = hctx->sched_tags;
|
||||
unsigned int min_shallow;
|
||||
|
||||
min_shallow = bfq_update_depths(bfqd, tags->bitmap_tags);
|
||||
sbitmap_queue_min_shallow_depth(tags->bitmap_tags, min_shallow);
|
||||
bfq_update_depths(bfqd, &tags->bitmap_tags);
|
||||
sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1);
|
||||
}
|
||||
|
||||
static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index)
|
||||
@ -7260,7 +7400,7 @@ static struct elevator_type iosched_bfq_mq = {
|
||||
.limit_depth = bfq_limit_depth,
|
||||
.prepare_request = bfq_prepare_request,
|
||||
.requeue_request = bfq_finish_requeue_request,
|
||||
.finish_request = bfq_finish_requeue_request,
|
||||
.finish_request = bfq_finish_request,
|
||||
.exit_icq = bfq_exit_icq,
|
||||
.insert_requests = bfq_insert_requests,
|
||||
.dispatch_request = bfq_dispatch_request,
|
||||
|
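In the bfq-iosched.c hunks above, bfqq_request_over_limit() scales the per-level request limit by the entity's share of the total weight at that level, via limit = DIV_ROUND_CLOSEST(limit * entity->weight, wsum). A small illustrative helper and worked example, not taken from the patch:

```c
/*
 * Illustrative only: with a queue depth of 64, an entity of weight 100
 * in a level whose weights sum to 400 is allowed roughly
 * 64 * 100 / 400 = 16 allocated requests before being throttled.
 */
static inline int example_scaled_limit(int limit, int weight, unsigned long wsum)
{
	return DIV_ROUND_CLOSEST(limit * weight, wsum);
}
```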
@ -25,7 +25,7 @@
|
||||
#define BFQ_DEFAULT_GRP_IOPRIO 0
|
||||
#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
|
||||
|
||||
#define MAX_PID_STR_LENGTH 12
|
||||
#define MAX_BFQQ_NAME_LENGTH 16
|
||||
|
||||
/*
|
||||
* Soft real-time applications are extremely more latency sensitive
|
||||
@ -170,6 +170,9 @@ struct bfq_entity {
|
||||
/* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
|
||||
int budget;
|
||||
|
||||
/* Number of requests allocated in the subtree of this entity */
|
||||
int allocated;
|
||||
|
||||
/* device weight, if non-zero, it overrides the default weight of
|
||||
* bfq_group_data */
|
||||
int dev_weight;
|
||||
@ -266,8 +269,6 @@ struct bfq_queue {
|
||||
struct request *next_rq;
|
||||
/* number of sync and async requests queued */
|
||||
int queued[2];
|
||||
/* number of requests currently allocated */
|
||||
int allocated;
|
||||
/* number of pending metadata requests */
|
||||
int meta_pending;
|
||||
/* fifo list of requests in sort_list */
|
||||
@ -387,6 +388,8 @@ struct bfq_queue {
|
||||
struct bfq_queue *tentative_waker_bfqq;
|
||||
/* number of times the same tentative waker has been detected */
|
||||
unsigned int num_waker_detections;
|
||||
/* time when we started considering this waker */
|
||||
u64 waker_detection_started;
|
||||
|
||||
/* node for woken_list, see below */
|
||||
struct hlist_node woken_list_node;
|
||||
@ -768,6 +771,7 @@ struct bfq_data {
|
||||
* function)
|
||||
*/
|
||||
unsigned int word_depths[2][2];
|
||||
unsigned int full_depth_shift;
|
||||
};
|
||||
|
||||
enum bfqq_state_flags {
|
||||
@ -1079,26 +1083,27 @@ void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq);
|
||||
/* --------------- end of interface of B-WF2Q+ ---------------- */
|
||||
|
||||
/* Logging facilities. */
|
||||
static inline void bfq_pid_to_str(int pid, char *str, int len)
|
||||
static inline void bfq_bfqq_name(struct bfq_queue *bfqq, char *str, int len)
|
||||
{
|
||||
if (pid != -1)
|
||||
snprintf(str, len, "%d", pid);
|
||||
char type = bfq_bfqq_sync(bfqq) ? 'S' : 'A';
|
||||
|
||||
if (bfqq->pid != -1)
|
||||
snprintf(str, len, "bfq%d%c", bfqq->pid, type);
|
||||
else
|
||||
snprintf(str, len, "SHARED-");
|
||||
snprintf(str, len, "bfqSHARED-%c", type);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_BFQ_GROUP_IOSCHED
|
||||
struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
|
||||
|
||||
#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
|
||||
char pid_str[MAX_PID_STR_LENGTH]; \
|
||||
char pid_str[MAX_BFQQ_NAME_LENGTH]; \
|
||||
if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \
|
||||
break; \
|
||||
bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \
|
||||
bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH); \
|
||||
blk_add_cgroup_trace_msg((bfqd)->queue, \
|
||||
bfqg_to_blkg(bfqq_group(bfqq))->blkcg, \
|
||||
"bfq%s%c " fmt, pid_str, \
|
||||
bfq_bfqq_sync((bfqq)) ? 'S' : 'A', ##args); \
|
||||
"%s " fmt, pid_str, ##args); \
|
||||
} while (0)
|
||||
|
||||
#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \
|
||||
@ -1109,13 +1114,11 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
|
||||
#else /* CONFIG_BFQ_GROUP_IOSCHED */
|
||||
|
||||
#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
|
||||
char pid_str[MAX_PID_STR_LENGTH]; \
|
||||
char pid_str[MAX_BFQQ_NAME_LENGTH]; \
|
||||
if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \
|
||||
break; \
|
||||
bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \
|
||||
blk_add_trace_msg((bfqd)->queue, "bfq%s%c " fmt, pid_str, \
|
||||
bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
|
||||
##args); \
|
||||
bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH); \
|
||||
blk_add_trace_msg((bfqd)->queue, "%s " fmt, pid_str, ##args); \
|
||||
} while (0)
|
||||
#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0)
|
||||
|
||||
|
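The bfq-iosched.h hunks above move queue-name formatting into a single bfq_bfqq_name() helper instead of assembling the "bfq..." prefix inside each logging macro. A short usage sketch; the example values are made up:

```c
/* Sketch of what the new helper produces for trace messages. */
char name[MAX_BFQQ_NAME_LENGTH];

bfq_bfqq_name(bfqq, name, MAX_BFQQ_NAME_LENGTH);
/* e.g. "bfq1234S" for the sync queue of pid 1234,
 * or "bfqSHARED-A" for a shared async queue; 'S'/'A' marks sync vs async. */
```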
@ -6,7 +6,7 @@
|
||||
* Written by: Martin K. Petersen <martin.petersen@oracle.com>
|
||||
*/
|
||||
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/blk-integrity.h>
|
||||
#include <linux/mempool.h>
|
||||
#include <linux/export.h>
|
||||
#include <linux/bio.h>
|
||||
@ -134,7 +134,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
|
||||
iv = bip->bip_vec + bip->bip_vcnt;
|
||||
|
||||
if (bip->bip_vcnt &&
|
||||
bvec_gap_to_prev(bio->bi_bdev->bd_disk->queue,
|
||||
bvec_gap_to_prev(bdev_get_queue(bio->bi_bdev),
|
||||
&bip->bip_vec[bip->bip_vcnt - 1], offset))
|
||||
return 0;
|
||||
|
||||
|
block/bio.c: 206 lines changed
@ -26,7 +26,7 @@
|
||||
#include "blk-rq-qos.h"
|
||||
|
||||
struct bio_alloc_cache {
|
||||
struct bio_list free_list;
|
||||
struct bio *free_list;
|
||||
unsigned int nr;
|
||||
};
|
||||
|
||||
@ -87,7 +87,8 @@ static struct bio_slab *create_bio_slab(unsigned int size)
|
||||
|
||||
snprintf(bslab->name, sizeof(bslab->name), "bio-%d", size);
|
||||
bslab->slab = kmem_cache_create(bslab->name, size,
|
||||
ARCH_KMALLOC_MINALIGN, SLAB_HWCACHE_ALIGN, NULL);
|
||||
ARCH_KMALLOC_MINALIGN,
|
||||
SLAB_HWCACHE_ALIGN | SLAB_TYPESAFE_BY_RCU, NULL);
|
||||
if (!bslab->slab)
|
||||
goto fail_alloc_slab;
|
||||
|
||||
@ -156,7 +157,7 @@ static void bio_put_slab(struct bio_set *bs)
|
||||
|
||||
void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs)
|
||||
{
|
||||
BIO_BUG_ON(nr_vecs > BIO_MAX_VECS);
|
||||
BUG_ON(nr_vecs > BIO_MAX_VECS);
|
||||
|
||||
if (nr_vecs == BIO_MAX_VECS)
|
||||
mempool_free(bv, pool);
|
||||
@ -281,6 +282,7 @@ void bio_init(struct bio *bio, struct bio_vec *table,
|
||||
|
||||
atomic_set(&bio->__bi_remaining, 1);
|
||||
atomic_set(&bio->__bi_cnt, 1);
|
||||
bio->bi_cookie = BLK_QC_T_NONE;
|
||||
|
||||
bio->bi_max_vecs = max_vecs;
|
||||
bio->bi_io_vec = table;
|
||||
@ -546,7 +548,7 @@ EXPORT_SYMBOL(zero_fill_bio);
|
||||
* REQ_OP_READ, zero the truncated part. This function should only
|
||||
* be used for handling corner cases, such as bio eod.
|
||||
*/
|
||||
void bio_truncate(struct bio *bio, unsigned new_size)
|
||||
static void bio_truncate(struct bio *bio, unsigned new_size)
|
||||
{
|
||||
struct bio_vec bv;
|
||||
struct bvec_iter iter;
|
||||
@ -629,7 +631,8 @@ static void bio_alloc_cache_prune(struct bio_alloc_cache *cache,
|
||||
unsigned int i = 0;
|
||||
struct bio *bio;
|
||||
|
||||
while ((bio = bio_list_pop(&cache->free_list)) != NULL) {
|
||||
while ((bio = cache->free_list) != NULL) {
|
||||
cache->free_list = bio->bi_next;
|
||||
cache->nr--;
|
||||
bio_free(bio);
|
||||
if (++i == nr)
|
||||
@ -678,7 +681,7 @@ static void bio_alloc_cache_destroy(struct bio_set *bs)
|
||||
void bio_put(struct bio *bio)
|
||||
{
|
||||
if (unlikely(bio_flagged(bio, BIO_REFFED))) {
|
||||
BIO_BUG_ON(!atomic_read(&bio->__bi_cnt));
|
||||
BUG_ON(!atomic_read(&bio->__bi_cnt));
|
||||
if (!atomic_dec_and_test(&bio->__bi_cnt))
|
||||
return;
|
||||
}
|
||||
@ -688,7 +691,8 @@ void bio_put(struct bio *bio)
|
||||
|
||||
bio_uninit(bio);
|
||||
cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu());
|
||||
bio_list_add_head(&cache->free_list, bio);
|
||||
bio->bi_next = cache->free_list;
|
||||
cache->free_list = bio;
|
||||
if (++cache->nr > ALLOC_CACHE_MAX + ALLOC_CACHE_SLACK)
|
||||
bio_alloc_cache_prune(cache, ALLOC_CACHE_SLACK);
|
||||
put_cpu();
|
||||
@ -773,6 +777,23 @@ const char *bio_devname(struct bio *bio, char *buf)
|
||||
}
|
||||
EXPORT_SYMBOL(bio_devname);
|
||||
|
||||
/**
|
||||
* bio_full - check if the bio is full
|
||||
* @bio: bio to check
|
||||
* @len: length of one segment to be added
|
||||
*
|
||||
* Return true if @bio is full and one segment with @len bytes can't be
|
||||
* added to the bio, otherwise return false
|
||||
*/
|
||||
static inline bool bio_full(struct bio *bio, unsigned len)
|
||||
{
|
||||
if (bio->bi_vcnt >= bio->bi_max_vecs)
|
||||
return true;
|
||||
if (bio->bi_iter.bi_size > UINT_MAX - len)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline bool page_is_mergeable(const struct bio_vec *bv,
|
||||
struct page *page, unsigned int len, unsigned int off,
|
||||
bool *same_page)
|
||||
@ -792,6 +813,44 @@ static inline bool page_is_mergeable(const struct bio_vec *bv,
|
||||
return (bv->bv_page + bv_end / PAGE_SIZE) == (page + off / PAGE_SIZE);
|
||||
}
|
||||
|
||||
/**
|
||||
* __bio_try_merge_page - try appending data to an existing bvec.
|
||||
* @bio: destination bio
|
||||
* @page: start page to add
|
||||
* @len: length of the data to add
|
||||
* @off: offset of the data relative to @page
|
||||
* @same_page: return if the segment has been merged inside the same page
|
||||
*
|
||||
* Try to add the data at @page + @off to the last bvec of @bio. This is a
|
||||
* useful optimisation for file systems with a block size smaller than the
|
||||
* page size.
|
||||
*
|
||||
* Warn if (@len, @off) crosses pages in case that @same_page is true.
|
||||
*
|
||||
* Return %true on success or %false on failure.
|
||||
*/
|
||||
static bool __bio_try_merge_page(struct bio *bio, struct page *page,
|
||||
unsigned int len, unsigned int off, bool *same_page)
|
||||
{
|
||||
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
|
||||
return false;
|
||||
|
||||
if (bio->bi_vcnt > 0) {
|
||||
struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
|
||||
|
||||
if (page_is_mergeable(bv, page, len, off, same_page)) {
|
||||
if (bio->bi_iter.bi_size > UINT_MAX - len) {
|
||||
*same_page = false;
|
||||
return false;
|
||||
}
|
||||
bv->bv_len += len;
|
||||
bio->bi_iter.bi_size += len;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* Try to merge a page into a segment, while obeying the hardware segment
|
||||
* size limit. This is not for normal read/write bios, but for passthrough
|
||||
@ -909,7 +968,7 @@ EXPORT_SYMBOL(bio_add_pc_page);
|
||||
int bio_add_zone_append_page(struct bio *bio, struct page *page,
|
||||
unsigned int len, unsigned int offset)
|
||||
{
|
||||
struct request_queue *q = bio->bi_bdev->bd_disk->queue;
|
||||
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
|
||||
bool same_page = false;
|
||||
|
||||
if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_ZONE_APPEND))
|
||||
@ -923,45 +982,6 @@ int bio_add_zone_append_page(struct bio *bio, struct page *page,
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(bio_add_zone_append_page);
|
||||
|
||||
/**
|
||||
* __bio_try_merge_page - try appending data to an existing bvec.
|
||||
* @bio: destination bio
|
||||
* @page: start page to add
|
||||
* @len: length of the data to add
|
||||
* @off: offset of the data relative to @page
|
||||
* @same_page: return if the segment has been merged inside the same page
|
||||
*
|
||||
* Try to add the data at @page + @off to the last bvec of @bio. This is a
|
||||
* useful optimisation for file systems with a block size smaller than the
|
||||
* page size.
|
||||
*
|
||||
* Warn if (@len, @off) crosses pages in case that @same_page is true.
|
||||
*
|
||||
* Return %true on success or %false on failure.
|
||||
*/
|
||||
bool __bio_try_merge_page(struct bio *bio, struct page *page,
|
||||
unsigned int len, unsigned int off, bool *same_page)
|
||||
{
|
||||
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
|
||||
return false;
|
||||
|
||||
if (bio->bi_vcnt > 0) {
|
||||
struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
|
||||
|
||||
if (page_is_mergeable(bv, page, len, off, same_page)) {
|
||||
if (bio->bi_iter.bi_size > UINT_MAX - len) {
|
||||
*same_page = false;
|
||||
return false;
|
||||
}
|
||||
bv->bv_len += len;
|
||||
bio->bi_iter.bi_size += len;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__bio_try_merge_page);
|
||||
|
||||
/**
|
||||
* __bio_add_page - add page(s) to a bio in a new segment
|
||||
* @bio: destination bio
|
||||
@ -1016,52 +1036,62 @@ int bio_add_page(struct bio *bio, struct page *page,
|
||||
}
|
||||
EXPORT_SYMBOL(bio_add_page);
|
||||
|
||||
void bio_release_pages(struct bio *bio, bool mark_dirty)
|
||||
/**
|
||||
* bio_add_folio - Attempt to add part of a folio to a bio.
|
||||
* @bio: BIO to add to.
|
||||
* @folio: Folio to add.
|
||||
* @len: How many bytes from the folio to add.
|
||||
* @off: First byte in this folio to add.
|
||||
*
|
||||
* Filesystems that use folios can call this function instead of calling
|
||||
* bio_add_page() for each page in the folio. If @off is bigger than
|
||||
* PAGE_SIZE, this function can create a bio_vec that starts in a page
|
||||
* after the bv_page. BIOs do not support folios that are 4GiB or larger.
|
||||
*
|
||||
* Return: Whether the addition was successful.
|
||||
*/
|
||||
bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len,
|
||||
size_t off)
|
||||
{
|
||||
if (len > UINT_MAX || off > UINT_MAX)
|
||||
return 0;
|
||||
return bio_add_page(bio, &folio->page, len, off) > 0;
|
||||
}
|
||||
|
||||
void __bio_release_pages(struct bio *bio, bool mark_dirty)
|
||||
{
|
||||
struct bvec_iter_all iter_all;
|
||||
struct bio_vec *bvec;
|
||||
|
||||
if (bio_flagged(bio, BIO_NO_PAGE_REF))
|
||||
return;
|
||||
|
||||
bio_for_each_segment_all(bvec, bio, iter_all) {
|
||||
if (mark_dirty && !PageCompound(bvec->bv_page))
|
||||
set_page_dirty_lock(bvec->bv_page);
|
||||
put_page(bvec->bv_page);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(bio_release_pages);
|
||||
EXPORT_SYMBOL_GPL(__bio_release_pages);
|
||||
|
||||
static void __bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
|
||||
void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
|
||||
{
|
||||
size_t size = iov_iter_count(iter);
|
||||
|
||||
WARN_ON_ONCE(bio->bi_max_vecs);
|
||||
|
||||
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
|
||||
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
|
||||
size_t max_sectors = queue_max_zone_append_sectors(q);
|
||||
|
||||
size = min(size, max_sectors << SECTOR_SHIFT);
|
||||
}
|
||||
|
||||
bio->bi_vcnt = iter->nr_segs;
|
||||
bio->bi_io_vec = (struct bio_vec *)iter->bvec;
|
||||
bio->bi_iter.bi_bvec_done = iter->iov_offset;
|
||||
bio->bi_iter.bi_size = iter->count;
|
||||
bio->bi_iter.bi_size = size;
|
||||
bio_set_flag(bio, BIO_NO_PAGE_REF);
|
||||
bio_set_flag(bio, BIO_CLONED);
|
||||
}
|
||||
|
||||
static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
|
||||
{
|
||||
__bio_iov_bvec_set(bio, iter);
|
||||
iov_iter_advance(iter, iter->count);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int bio_iov_bvec_set_append(struct bio *bio, struct iov_iter *iter)
|
||||
{
|
||||
struct request_queue *q = bio->bi_bdev->bd_disk->queue;
|
||||
struct iov_iter i = *iter;
|
||||
|
||||
iov_iter_truncate(&i, queue_max_zone_append_sectors(q) << 9);
|
||||
__bio_iov_bvec_set(bio, &i);
|
||||
iov_iter_advance(iter, i.count);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void bio_put_pages(struct page **pages, size_t size, size_t off)
|
||||
{
|
||||
size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE);
|
||||
@ -1131,7 +1161,7 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
|
||||
{
|
||||
unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
|
||||
unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
|
||||
struct request_queue *q = bio->bi_bdev->bd_disk->queue;
|
||||
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
|
||||
unsigned int max_append_sectors = queue_max_zone_append_sectors(q);
|
||||
struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
|
||||
struct page **pages = (struct page **)bv;
|
||||
@ -1203,9 +1233,9 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
|
||||
int ret = 0;
|
||||
|
||||
if (iov_iter_is_bvec(iter)) {
|
||||
if (bio_op(bio) == REQ_OP_ZONE_APPEND)
|
||||
return bio_iov_bvec_set_append(bio, iter);
|
||||
return bio_iov_bvec_set(bio, iter);
|
||||
bio_iov_bvec_set(bio, iter);
|
||||
iov_iter_advance(iter, bio->bi_iter.bi_size);
|
||||
return 0;
|
||||
}
|
||||
|
||||
do {
|
||||
@ -1261,18 +1291,7 @@ int submit_bio_wait(struct bio *bio)
|
||||
}
|
||||
EXPORT_SYMBOL(submit_bio_wait);
|
||||
|
||||
/**
|
||||
* bio_advance - increment/complete a bio by some number of bytes
|
||||
* @bio: bio to advance
|
||||
* @bytes: number of bytes to complete
|
||||
*
|
||||
* This updates bi_sector, bi_size and bi_idx; if the number of bytes to
|
||||
* complete doesn't align with a bvec boundary, then bv_len and bv_offset will
|
||||
* be updated on the last bvec as well.
|
||||
*
|
||||
* @bio will then represent the remaining, uncompleted portion of the io.
|
||||
*/
|
||||
void bio_advance(struct bio *bio, unsigned bytes)
|
||||
void __bio_advance(struct bio *bio, unsigned bytes)
|
||||
{
|
||||
if (bio_integrity(bio))
|
||||
bio_integrity_advance(bio, bytes);
|
||||
@ -1280,7 +1299,7 @@ void bio_advance(struct bio *bio, unsigned bytes)
|
||||
bio_crypt_advance(bio, bytes);
|
||||
bio_advance_iter(bio, &bio->bi_iter, bytes);
|
||||
}
|
||||
EXPORT_SYMBOL(bio_advance);
|
||||
EXPORT_SYMBOL(__bio_advance);
|
||||
|
||||
void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
|
||||
struct bio *src, struct bvec_iter *src_iter)
|
||||
@ -1468,10 +1487,10 @@ void bio_endio(struct bio *bio)
|
||||
return;
|
||||
|
||||
if (bio->bi_bdev && bio_flagged(bio, BIO_TRACKED))
|
||||
rq_qos_done_bio(bio->bi_bdev->bd_disk->queue, bio);
|
||||
rq_qos_done_bio(bdev_get_queue(bio->bi_bdev), bio);
|
||||
|
||||
if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
|
||||
trace_block_bio_complete(bio->bi_bdev->bd_disk->queue, bio);
|
||||
trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio);
|
||||
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
|
||||
}
|
||||
|
||||
@ -1710,8 +1729,9 @@ struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs,
|
||||
return bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs);
|
||||
|
||||
cache = per_cpu_ptr(bs->cache, get_cpu());
|
||||
bio = bio_list_pop(&cache->free_list);
|
||||
if (bio) {
|
||||
if (cache->free_list) {
|
||||
bio = cache->free_list;
|
||||
cache->free_list = bio->bi_next;
|
||||
cache->nr--;
|
||||
put_cpu();
|
||||
bio_init(bio, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs);
|
||||
|
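The bio.c hunks above turn the per-cpu bio allocation cache's free list from a struct bio_list into a plain singly linked list chained through bi_next. A rough push/pop sketch reconstructed from the diff (per-cpu access and flag handling elided; function names are illustrative):

```c
/* Sketch of the simplified free list used by the bio alloc cache above. */
static void example_cache_push(struct bio_alloc_cache *cache, struct bio *bio)
{
	bio->bi_next = cache->free_list;	/* link in at the head */
	cache->free_list = bio;
	cache->nr++;
}

static struct bio *example_cache_pop(struct bio_alloc_cache *cache)
{
	struct bio *bio = cache->free_list;

	if (bio) {
		cache->free_list = bio->bi_next;
		cache->nr--;
	}
	return bio;
}
```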
@ -30,8 +30,10 @@
|
||||
#include <linux/blk-cgroup.h>
|
||||
#include <linux/tracehook.h>
|
||||
#include <linux/psi.h>
|
||||
#include <linux/part_stat.h>
|
||||
#include "blk.h"
|
||||
#include "blk-ioprio.h"
|
||||
#include "blk-throttle.h"
|
||||
|
||||
/*
|
||||
* blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
|
||||
@ -620,7 +622,7 @@ struct block_device *blkcg_conf_open_bdev(char **inputp)
|
||||
*/
|
||||
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
|
||||
char *input, struct blkg_conf_ctx *ctx)
|
||||
__acquires(rcu) __acquires(&bdev->bd_disk->queue->queue_lock)
|
||||
__acquires(rcu) __acquires(&bdev->bd_queue->queue_lock)
|
||||
{
|
||||
struct block_device *bdev;
|
||||
struct request_queue *q;
|
||||
@ -631,7 +633,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
|
||||
if (IS_ERR(bdev))
|
||||
return PTR_ERR(bdev);
|
||||
|
||||
q = bdev->bd_disk->queue;
|
||||
q = bdev_get_queue(bdev);
|
||||
|
||||
/*
|
||||
* blkcg_deactivate_policy() requires queue to be frozen, we can grab
|
||||
@ -747,9 +749,9 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep);
|
||||
* with blkg_conf_prep().
|
||||
*/
|
||||
void blkg_conf_finish(struct blkg_conf_ctx *ctx)
|
||||
__releases(&ctx->bdev->bd_disk->queue->queue_lock) __releases(rcu)
|
||||
__releases(&ctx->bdev->bd_queue->queue_lock) __releases(rcu)
|
||||
{
|
||||
spin_unlock_irq(&ctx->bdev->bd_disk->queue->queue_lock);
|
||||
spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock);
|
||||
rcu_read_unlock();
|
||||
blkdev_put_no_open(ctx->bdev);
|
||||
}
|
||||
@ -852,7 +854,7 @@ static void blkcg_fill_root_iostats(void)
|
||||
while ((dev = class_dev_iter_next(&iter))) {
|
||||
struct block_device *bdev = dev_to_bdev(dev);
|
||||
struct blkcg_gq *blkg =
|
||||
blk_queue_root_blkg(bdev->bd_disk->queue);
|
||||
blk_queue_root_blkg(bdev_get_queue(bdev));
|
||||
struct blkg_iostat tmp;
|
||||
int cpu;
|
||||
|
||||
@ -1811,7 +1813,7 @@ static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio,
|
||||
|
||||
rcu_read_lock();
|
||||
blkg = blkg_lookup_create(css_to_blkcg(css),
|
||||
bio->bi_bdev->bd_disk->queue);
|
||||
bdev_get_queue(bio->bi_bdev));
|
||||
while (blkg) {
|
||||
if (blkg_tryget(blkg)) {
|
||||
ret_blkg = blkg;
|
||||
@ -1847,8 +1849,8 @@ void bio_associate_blkg_from_css(struct bio *bio,
|
||||
if (css && css->parent) {
|
||||
bio->bi_blkg = blkg_tryget_closest(bio, css);
|
||||
} else {
|
||||
blkg_get(bio->bi_bdev->bd_disk->queue->root_blkg);
|
||||
bio->bi_blkg = bio->bi_bdev->bd_disk->queue->root_blkg;
|
||||
blkg_get(bdev_get_queue(bio->bi_bdev)->root_blkg);
|
||||
bio->bi_blkg = bdev_get_queue(bio->bi_bdev)->root_blkg;
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
|
||||
|
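The blk-cgroup.c hunks above (like several earlier ones) replace open-coded bdev->bd_disk->queue dereferences with the bdev_get_queue() helper, which can return the bd_queue pointer that bdev_alloc() now caches in struct block_device. A tiny illustrative wrapper, not part of the patch:

```c
/* Illustrative only: the accessor swap made throughout these hunks. */
static struct request_queue *example_queue_of(struct block_device *bdev)
{
	/* old style: bdev->bd_disk->queue (two dereferences via the gendisk) */
	return bdev_get_queue(bdev);	/* new style: helper using the cached pointer */
}
```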
block/blk-core.c: 799 lines changed (diff suppressed because it is too large)
@ -12,12 +12,13 @@
|
||||
#include <crypto/skcipher.h>
|
||||
#include <linux/blk-cgroup.h>
|
||||
#include <linux/blk-crypto.h>
|
||||
#include <linux/blk-crypto-profile.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/crypto.h>
|
||||
#include <linux/keyslot-manager.h>
|
||||
#include <linux/mempool.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/scatterlist.h>
|
||||
|
||||
#include "blk-crypto-internal.h"
|
||||
|
||||
@ -72,12 +73,12 @@ static mempool_t *bio_fallback_crypt_ctx_pool;
|
||||
static DEFINE_MUTEX(tfms_init_lock);
|
||||
static bool tfms_inited[BLK_ENCRYPTION_MODE_MAX];
|
||||
|
||||
static struct blk_crypto_keyslot {
|
||||
static struct blk_crypto_fallback_keyslot {
|
||||
enum blk_crypto_mode_num crypto_mode;
|
||||
struct crypto_skcipher *tfms[BLK_ENCRYPTION_MODE_MAX];
|
||||
} *blk_crypto_keyslots;
|
||||
|
||||
static struct blk_keyslot_manager blk_crypto_ksm;
|
||||
static struct blk_crypto_profile blk_crypto_fallback_profile;
|
||||
static struct workqueue_struct *blk_crypto_wq;
|
||||
static mempool_t *blk_crypto_bounce_page_pool;
|
||||
static struct bio_set crypto_bio_split;
|
||||
@ -88,9 +89,9 @@ static struct bio_set crypto_bio_split;
|
||||
*/
|
||||
static u8 blank_key[BLK_CRYPTO_MAX_KEY_SIZE];
|
||||
|
||||
static void blk_crypto_evict_keyslot(unsigned int slot)
|
||||
static void blk_crypto_fallback_evict_keyslot(unsigned int slot)
|
||||
{
|
||||
struct blk_crypto_keyslot *slotp = &blk_crypto_keyslots[slot];
|
||||
struct blk_crypto_fallback_keyslot *slotp = &blk_crypto_keyslots[slot];
|
||||
enum blk_crypto_mode_num crypto_mode = slotp->crypto_mode;
|
||||
int err;
|
||||
|
||||
@ -103,45 +104,41 @@ static void blk_crypto_evict_keyslot(unsigned int slot)
|
||||
slotp->crypto_mode = BLK_ENCRYPTION_MODE_INVALID;
|
||||
}
|
||||
|
||||
static int blk_crypto_keyslot_program(struct blk_keyslot_manager *ksm,
|
||||
const struct blk_crypto_key *key,
|
||||
unsigned int slot)
|
||||
static int
|
||||
blk_crypto_fallback_keyslot_program(struct blk_crypto_profile *profile,
|
||||
const struct blk_crypto_key *key,
|
||||
unsigned int slot)
|
||||
{
|
||||
struct blk_crypto_keyslot *slotp = &blk_crypto_keyslots[slot];
|
||||
struct blk_crypto_fallback_keyslot *slotp = &blk_crypto_keyslots[slot];
|
||||
const enum blk_crypto_mode_num crypto_mode =
|
||||
key->crypto_cfg.crypto_mode;
|
||||
int err;
|
||||
|
||||
if (crypto_mode != slotp->crypto_mode &&
|
||||
slotp->crypto_mode != BLK_ENCRYPTION_MODE_INVALID)
|
||||
blk_crypto_evict_keyslot(slot);
|
||||
blk_crypto_fallback_evict_keyslot(slot);
|
||||
|
||||
slotp->crypto_mode = crypto_mode;
|
||||
err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], key->raw,
|
||||
key->size);
|
||||
if (err) {
|
||||
blk_crypto_evict_keyslot(slot);
|
||||
blk_crypto_fallback_evict_keyslot(slot);
|
||||
return err;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int blk_crypto_keyslot_evict(struct blk_keyslot_manager *ksm,
|
||||
const struct blk_crypto_key *key,
|
||||
unsigned int slot)
|
||||
static int blk_crypto_fallback_keyslot_evict(struct blk_crypto_profile *profile,
|
||||
const struct blk_crypto_key *key,
|
||||
unsigned int slot)
|
||||
{
|
||||
blk_crypto_evict_keyslot(slot);
|
||||
blk_crypto_fallback_evict_keyslot(slot);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* The crypto API fallback KSM ops - only used for a bio when it specifies a
|
||||
* blk_crypto_key that was not supported by the device's inline encryption
|
||||
* hardware.
|
||||
*/
|
||||
static const struct blk_ksm_ll_ops blk_crypto_ksm_ll_ops = {
|
||||
.keyslot_program = blk_crypto_keyslot_program,
|
||||
.keyslot_evict = blk_crypto_keyslot_evict,
|
||||
static const struct blk_crypto_ll_ops blk_crypto_fallback_ll_ops = {
|
||||
.keyslot_program = blk_crypto_fallback_keyslot_program,
|
||||
.keyslot_evict = blk_crypto_fallback_keyslot_evict,
|
||||
};
|
||||
|
||||
static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio)
@@ -159,7 +156,7 @@ static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio)
bio_endio(src_bio);
}

static struct bio *blk_crypto_clone_bio(struct bio *bio_src)
static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src)
{
struct bvec_iter iter;
struct bio_vec bv;
@@ -186,13 +183,14 @@ static struct bio *blk_crypto_clone_bio(struct bio *bio_src)
return bio;
}

static bool blk_crypto_alloc_cipher_req(struct blk_ksm_keyslot *slot,
struct skcipher_request **ciph_req_ret,
struct crypto_wait *wait)
static bool
blk_crypto_fallback_alloc_cipher_req(struct blk_crypto_keyslot *slot,
struct skcipher_request **ciph_req_ret,
struct crypto_wait *wait)
{
struct skcipher_request *ciph_req;
const struct blk_crypto_keyslot *slotp;
int keyslot_idx = blk_ksm_get_slot_idx(slot);
const struct blk_crypto_fallback_keyslot *slotp;
int keyslot_idx = blk_crypto_keyslot_index(slot);

slotp = &blk_crypto_keyslots[keyslot_idx];
ciph_req = skcipher_request_alloc(slotp->tfms[slotp->crypto_mode],
@@ -209,7 +207,7 @@ static bool blk_crypto_alloc_cipher_req(struct blk_ksm_keyslot *slot,
return true;
}

static bool blk_crypto_split_bio_if_needed(struct bio **bio_ptr)
static bool blk_crypto_fallback_split_bio_if_needed(struct bio **bio_ptr)
{
struct bio *bio = *bio_ptr;
unsigned int i = 0;
@@ -264,7 +262,7 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr)
{
struct bio *src_bio, *enc_bio;
struct bio_crypt_ctx *bc;
struct blk_ksm_keyslot *slot;
struct blk_crypto_keyslot *slot;
int data_unit_size;
struct skcipher_request *ciph_req = NULL;
DECLARE_CRYPTO_WAIT(wait);
@@ -276,7 +274,7 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr)
blk_status_t blk_st;

/* Split the bio if it's too big for single page bvec */
if (!blk_crypto_split_bio_if_needed(bio_ptr))
if (!blk_crypto_fallback_split_bio_if_needed(bio_ptr))
return false;

src_bio = *bio_ptr;
@@ -284,24 +282,25 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr)
data_unit_size = bc->bc_key->crypto_cfg.data_unit_size;

/* Allocate bounce bio for encryption */
enc_bio = blk_crypto_clone_bio(src_bio);
enc_bio = blk_crypto_fallback_clone_bio(src_bio);
if (!enc_bio) {
src_bio->bi_status = BLK_STS_RESOURCE;
return false;
}

/*
* Use the crypto API fallback keyslot manager to get a crypto_skcipher
* for the algorithm and key specified for this bio.
* Get a blk-crypto-fallback keyslot that contains a crypto_skcipher for
* this bio's algorithm and key.
*/
blk_st = blk_ksm_get_slot_for_key(&blk_crypto_ksm, bc->bc_key, &slot);
blk_st = blk_crypto_get_keyslot(&blk_crypto_fallback_profile,
bc->bc_key, &slot);
if (blk_st != BLK_STS_OK) {
src_bio->bi_status = blk_st;
goto out_put_enc_bio;
}

/* and then allocate an skcipher_request for it */
if (!blk_crypto_alloc_cipher_req(slot, &ciph_req, &wait)) {
if (!blk_crypto_fallback_alloc_cipher_req(slot, &ciph_req, &wait)) {
src_bio->bi_status = BLK_STS_RESOURCE;
goto out_release_keyslot;
}
@@ -362,7 +361,7 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr)
out_free_ciph_req:
skcipher_request_free(ciph_req);
out_release_keyslot:
blk_ksm_put_slot(slot);
blk_crypto_put_keyslot(slot);
out_put_enc_bio:
if (enc_bio)
bio_put(enc_bio);
@@ -380,7 +379,7 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work)
container_of(work, struct bio_fallback_crypt_ctx, work);
struct bio *bio = f_ctx->bio;
struct bio_crypt_ctx *bc = &f_ctx->crypt_ctx;
struct blk_ksm_keyslot *slot;
struct blk_crypto_keyslot *slot;
struct skcipher_request *ciph_req = NULL;
DECLARE_CRYPTO_WAIT(wait);
u64 curr_dun[BLK_CRYPTO_DUN_ARRAY_SIZE];
@@ -393,17 +392,18 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work)
blk_status_t blk_st;

/*
* Use the crypto API fallback keyslot manager to get a crypto_skcipher
* for the algorithm and key specified for this bio.
* Get a blk-crypto-fallback keyslot that contains a crypto_skcipher for
* this bio's algorithm and key.
*/
blk_st = blk_ksm_get_slot_for_key(&blk_crypto_ksm, bc->bc_key, &slot);
blk_st = blk_crypto_get_keyslot(&blk_crypto_fallback_profile,
bc->bc_key, &slot);
if (blk_st != BLK_STS_OK) {
bio->bi_status = blk_st;
goto out_no_keyslot;
}

/* and then allocate an skcipher_request for it */
if (!blk_crypto_alloc_cipher_req(slot, &ciph_req, &wait)) {
if (!blk_crypto_fallback_alloc_cipher_req(slot, &ciph_req, &wait)) {
bio->bi_status = BLK_STS_RESOURCE;
goto out;
}
@@ -434,7 +434,7 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work)

out:
skcipher_request_free(ciph_req);
blk_ksm_put_slot(slot);
blk_crypto_put_keyslot(slot);
out_no_keyslot:
mempool_free(f_ctx, bio_fallback_crypt_ctx_pool);
bio_endio(bio);
@@ -473,9 +473,9 @@ static void blk_crypto_fallback_decrypt_endio(struct bio *bio)
* @bio_ptr: pointer to the bio to prepare
*
* If bio is doing a WRITE operation, this splits the bio into two parts if it's
* too big (see blk_crypto_split_bio_if_needed). It then allocates a bounce bio
* for the first part, encrypts it, and update bio_ptr to point to the bounce
* bio.
* too big (see blk_crypto_fallback_split_bio_if_needed()). It then allocates a
* bounce bio for the first part, encrypts it, and updates bio_ptr to point to
* the bounce bio.
*
* For a READ operation, we mark the bio for decryption by using bi_private and
* bi_end_io.
@@ -499,8 +499,8 @@ bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr)
return false;
}

if (!blk_ksm_crypto_cfg_supported(&blk_crypto_ksm,
&bc->bc_key->crypto_cfg)) {
if (!__blk_crypto_cfg_supported(&blk_crypto_fallback_profile,
&bc->bc_key->crypto_cfg)) {
bio->bi_status = BLK_STS_NOTSUPP;
return false;
}
@@ -526,7 +526,7 @@ bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr)

int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key)
{
return blk_ksm_evict_key(&blk_crypto_ksm, key);
return __blk_crypto_evict_key(&blk_crypto_fallback_profile, key);
}

static bool blk_crypto_fallback_inited;
@@ -534,6 +534,7 @@ static int blk_crypto_fallback_init(void)
{
int i;
int err;
struct blk_crypto_profile *profile = &blk_crypto_fallback_profile;

if (blk_crypto_fallback_inited)
return 0;
@@ -544,24 +545,24 @@ static int blk_crypto_fallback_init(void)
if (err)
goto out;

err = blk_ksm_init(&blk_crypto_ksm, blk_crypto_num_keyslots);
err = blk_crypto_profile_init(profile, blk_crypto_num_keyslots);
if (err)
goto fail_free_bioset;
err = -ENOMEM;

blk_crypto_ksm.ksm_ll_ops = blk_crypto_ksm_ll_ops;
blk_crypto_ksm.max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE;
profile->ll_ops = blk_crypto_fallback_ll_ops;
profile->max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE;

/* All blk-crypto modes have a crypto API fallback. */
for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++)
blk_crypto_ksm.crypto_modes_supported[i] = 0xFFFFFFFF;
blk_crypto_ksm.crypto_modes_supported[BLK_ENCRYPTION_MODE_INVALID] = 0;
profile->modes_supported[i] = 0xFFFFFFFF;
profile->modes_supported[BLK_ENCRYPTION_MODE_INVALID] = 0;

blk_crypto_wq = alloc_workqueue("blk_crypto_wq",
WQ_UNBOUND | WQ_HIGHPRI |
WQ_MEM_RECLAIM, num_online_cpus());
if (!blk_crypto_wq)
goto fail_free_ksm;
goto fail_destroy_profile;

blk_crypto_keyslots = kcalloc(blk_crypto_num_keyslots,
sizeof(blk_crypto_keyslots[0]),
@@ -595,8 +596,8 @@ static int blk_crypto_fallback_init(void)
kfree(blk_crypto_keyslots);
fail_free_wq:
destroy_workqueue(blk_crypto_wq);
fail_free_ksm:
blk_ksm_destroy(&blk_crypto_ksm);
fail_destroy_profile:
blk_crypto_profile_destroy(profile);
fail_free_bioset:
bioset_exit(&crypto_bio_split);
out:
@@ -610,7 +611,7 @@ static int blk_crypto_fallback_init(void)

int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num)
{
const char *cipher_str = blk_crypto_modes[mode_num].cipher_str;
struct blk_crypto_keyslot *slotp;
struct blk_crypto_fallback_keyslot *slotp;
unsigned int i;
int err = 0;

@@ -11,7 +11,7 @@

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/keyslot-manager.h>
#include <linux/blk-crypto-profile.h>
#include <linux/module.h>
#include <linux/slab.h>

@@ -218,8 +218,9 @@ static bool bio_crypt_check_alignment(struct bio *bio)

blk_status_t __blk_crypto_init_request(struct request *rq)
{
return blk_ksm_get_slot_for_key(rq->q->ksm, rq->crypt_ctx->bc_key,
&rq->crypt_keyslot);
return blk_crypto_get_keyslot(rq->q->crypto_profile,
rq->crypt_ctx->bc_key,
&rq->crypt_keyslot);
}

/**
@@ -233,7 +234,7 @@ blk_status_t __blk_crypto_init_request(struct request *rq)
*/
void __blk_crypto_free_request(struct request *rq)
{
blk_ksm_put_slot(rq->crypt_keyslot);
blk_crypto_put_keyslot(rq->crypt_keyslot);
mempool_free(rq->crypt_ctx, bio_crypt_ctx_pool);
blk_crypto_rq_set_defaults(rq);
}
@@ -264,6 +265,7 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr)
{
struct bio *bio = *bio_ptr;
const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key;
struct blk_crypto_profile *profile;

/* Error if bio has no data. */
if (WARN_ON_ONCE(!bio_has_data(bio))) {
@@ -280,8 +282,8 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr)
* Success if device supports the encryption context, or if we succeeded
* in falling back to the crypto API.
*/
if (blk_ksm_crypto_cfg_supported(bio->bi_bdev->bd_disk->queue->ksm,
&bc_key->crypto_cfg))
profile = bdev_get_queue(bio->bi_bdev)->crypto_profile;
if (__blk_crypto_cfg_supported(profile, &bc_key->crypto_cfg))
return true;

if (blk_crypto_fallback_bio_prep(bio_ptr))
@@ -357,7 +359,7 @@ bool blk_crypto_config_supported(struct request_queue *q,
const struct blk_crypto_config *cfg)
{
return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) ||
blk_ksm_crypto_cfg_supported(q->ksm, cfg);
__blk_crypto_cfg_supported(q->crypto_profile, cfg);
}

/**
@@ -378,7 +380,7 @@ bool blk_crypto_config_supported(struct request_queue *q,
int blk_crypto_start_using_key(const struct blk_crypto_key *key,
struct request_queue *q)
{
if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg))
if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg))
return 0;
return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode);
}
@@ -394,18 +396,17 @@ int blk_crypto_start_using_key(const struct blk_crypto_key *key,
* evicted from any hardware that it might have been programmed into. The key
* must not be in use by any in-flight IO when this function is called.
*
* Return: 0 on success or if key is not present in the q's ksm, -err on error.
* Return: 0 on success or if the key wasn't in any keyslot; -errno on error.
*/
int blk_crypto_evict_key(struct request_queue *q,
const struct blk_crypto_key *key)
{
if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg))
return blk_ksm_evict_key(q->ksm, key);
if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg))
return __blk_crypto_evict_key(q->crypto_profile, key);

/*
* If the request queue's associated inline encryption hardware didn't
* have support for the key, then the key might have been programmed
* into the fallback keyslot manager, so try to evict from there.
* If the request_queue didn't support the key, then blk-crypto-fallback
* may have been used, so try to evict the key from blk-crypto-fallback.
*/
return blk_crypto_fallback_evict_key(key);
}
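For orientation, a hedged sketch (not part of the patch) of how a filesystem-level caller is expected to drive these entry points after the rename: the configuration check and key eviction now route through q->crypto_profile internally, while the caller-facing signatures stay the same. The example_* wrappers below are made up; blk_crypto_start_using_key() and blk_crypto_evict_key() are the functions shown above.

/* Sketch only: error handling trimmed; "key" is assumed to be an
 * already-initialized struct blk_crypto_key usable on this queue. */
static int example_setup_key(struct request_queue *q,
			     const struct blk_crypto_key *key)
{
	/* Falls back to the crypto API if the hardware lacks support. */
	return blk_crypto_start_using_key(key, q);
}

static void example_teardown_key(struct request_queue *q,
				 const struct blk_crypto_key *key)
{
	/* Evicts from the hardware profile or from blk-crypto-fallback. */
	blk_crypto_evict_key(q, key);
}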

@@ -69,6 +69,7 @@
#include <linux/blkdev.h>
#include <linux/gfp.h>
#include <linux/blk-mq.h>
#include <linux/part_stat.h>

#include "blk.h"
#include "blk-mq.h"
@@ -95,6 +96,12 @@ enum {
static void blk_kick_flush(struct request_queue *q,
struct blk_flush_queue *fq, unsigned int flags);

static inline struct blk_flush_queue *
blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx)
{
return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq;
}

static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq)
{
unsigned int policy = 0;
@@ -138,7 +145,7 @@ static void blk_flush_queue_rq(struct request *rq, bool add_front)

static void blk_account_io_flush(struct request *rq)
{
struct block_device *part = rq->rq_disk->part0;
struct block_device *part = rq->q->disk->part0;

part_stat_lock();
part_stat_inc(part, ios[STAT_FLUSH]);
@@ -222,7 +229,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
/* release the tag's ownership to the req cloned from */
spin_lock_irqsave(&fq->mq_flush_lock, flags);

if (!refcount_dec_and_test(&flush_rq->ref)) {
if (!req_ref_put_and_test(flush_rq)) {
fq->rq_status = error;
spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
return;
@@ -334,7 +341,6 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK);
flush_rq->rq_flags |= RQF_FLUSH_SEQ;
flush_rq->rq_disk = first_rq->rq_disk;
flush_rq->end_io = flush_end_io;
/*
* Order WRITE ->end_io and WRITE rq->ref, and its pair is the one
@@ -343,7 +349,7 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
* and READ flush_rq->end_io
*/
smp_wmb();
refcount_set(&flush_rq->ref, 1);
req_ref_set(flush_rq, 1);

blk_flush_queue_rq(flush_rq, false);
}
@@ -423,7 +429,7 @@ void blk_insert_flush(struct request *rq)
*/
if ((policy & REQ_FSEQ_DATA) &&
!(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
blk_mq_request_bypass_insert(rq, false, false);
blk_mq_request_bypass_insert(rq, false, true);
return;
}

@@ -6,7 +6,7 @@
* Written by: Martin K. Petersen <martin.petersen@oracle.com>
*/

#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/backing-dev.h>
#include <linux/mempool.h>
#include <linux/bio.h>
@@ -409,9 +409,9 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue);

#ifdef CONFIG_BLK_INLINE_ENCRYPTION
if (disk->queue->ksm) {
if (disk->queue->crypto_profile) {
pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n");
blk_ksm_unregister(disk->queue);
disk->queue->crypto_profile = NULL;
}
#endif
}

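The blk-integrity hunk above keeps the existing rule that a gendisk cannot use both an integrity profile and hardware inline encryption; registering integrity now simply clears q->crypto_profile instead of calling the old blk_ksm_unregister(). A hedged illustration of the resulting driver-side behaviour (the template variable name is made up):

/* Sketch only: after blk_integrity_register(), any previously advertised
 * inline-encryption capability is gone, so encrypted I/O on this disk
 * goes through blk-crypto-fallback when that option is enabled. */
blk_integrity_register(disk, &my_integrity_template);	/* hypothetical template */
WARN_ON(disk->queue->crypto_profile != NULL);		/* cleared by the hunk above */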
block/blk-ioc.c (348 lines changed)
@@ -8,22 +8,25 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/security.h>
#include <linux/sched/task.h>

#include "blk.h"
#include "blk-mq-sched.h"

/*
* For io context allocations
*/
static struct kmem_cache *iocontext_cachep;

#ifdef CONFIG_BLK_ICQ
/**
* get_io_context - increment reference count to io_context
* @ioc: io_context to get
*
* Increment reference count to @ioc.
*/
void get_io_context(struct io_context *ioc)
static void get_io_context(struct io_context *ioc)
{
BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
atomic_long_inc(&ioc->refcount);
@@ -53,6 +56,16 @@ static void ioc_exit_icq(struct io_cq *icq)
icq->flags |= ICQ_EXITED;
}

static void ioc_exit_icqs(struct io_context *ioc)
{
struct io_cq *icq;

spin_lock_irq(&ioc->lock);
hlist_for_each_entry(icq, &ioc->icq_list, ioc_node)
ioc_exit_icq(icq);
spin_unlock_irq(&ioc->lock);
}

/*
* Release an icq. Called with ioc locked for blk-mq, and with both ioc
* and queue locked for legacy.
@@ -132,102 +145,22 @@ static void ioc_release_fn(struct work_struct *work)
kmem_cache_free(iocontext_cachep, ioc);
}

/**
* put_io_context - put a reference of io_context
* @ioc: io_context to put
*
* Decrement reference count of @ioc and release it if the count reaches
* zero.
/*
* Releasing icqs requires reverse order double locking and we may already be
* holding a queue_lock. Do it asynchronously from a workqueue.
*/
void put_io_context(struct io_context *ioc)
{
unsigned long flags;
bool free_ioc = false;

if (ioc == NULL)
return;

BUG_ON(atomic_long_read(&ioc->refcount) <= 0);

/*
* Releasing ioc requires reverse order double locking and we may
* already be holding a queue_lock. Do it asynchronously from wq.
*/
if (atomic_long_dec_and_test(&ioc->refcount)) {
spin_lock_irqsave(&ioc->lock, flags);
if (!hlist_empty(&ioc->icq_list))
queue_work(system_power_efficient_wq,
&ioc->release_work);
else
free_ioc = true;
spin_unlock_irqrestore(&ioc->lock, flags);
}

if (free_ioc)
kmem_cache_free(iocontext_cachep, ioc);
}

/**
* put_io_context_active - put active reference on ioc
* @ioc: ioc of interest
*
* Undo get_io_context_active(). If active reference reaches zero after
* put, @ioc can never issue further IOs and ioscheds are notified.
*/
void put_io_context_active(struct io_context *ioc)
{
struct io_cq *icq;

if (!atomic_dec_and_test(&ioc->active_ref)) {
put_io_context(ioc);
return;
}

spin_lock_irq(&ioc->lock);
hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) {
if (icq->flags & ICQ_EXITED)
continue;

ioc_exit_icq(icq);
}
spin_unlock_irq(&ioc->lock);

put_io_context(ioc);
}

/* Called by the exiting task */
void exit_io_context(struct task_struct *task)
{
struct io_context *ioc;

task_lock(task);
ioc = task->io_context;
task->io_context = NULL;
task_unlock(task);

atomic_dec(&ioc->nr_tasks);
put_io_context_active(ioc);
}

static void __ioc_clear_queue(struct list_head *icq_list)
static bool ioc_delay_free(struct io_context *ioc)
{
unsigned long flags;

rcu_read_lock();
while (!list_empty(icq_list)) {
struct io_cq *icq = list_entry(icq_list->next,
struct io_cq, q_node);
struct io_context *ioc = icq->ioc;

spin_lock_irqsave(&ioc->lock, flags);
if (icq->flags & ICQ_DESTROYED) {
spin_unlock_irqrestore(&ioc->lock, flags);
continue;
}
ioc_destroy_icq(icq);
spin_lock_irqsave(&ioc->lock, flags);
if (!hlist_empty(&ioc->icq_list)) {
queue_work(system_power_efficient_wq, &ioc->release_work);
spin_unlock_irqrestore(&ioc->lock, flags);
return true;
}
rcu_read_unlock();
spin_unlock_irqrestore(&ioc->lock, flags);
return false;
}

/**
@@ -244,93 +177,156 @@ void ioc_clear_queue(struct request_queue *q)
list_splice_init(&q->icq_list, &icq_list);
spin_unlock_irq(&q->queue_lock);

__ioc_clear_queue(&icq_list);
}
rcu_read_lock();
while (!list_empty(&icq_list)) {
struct io_cq *icq =
list_entry(icq_list.next, struct io_cq, q_node);

int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node)
spin_lock_irq(&icq->ioc->lock);
if (!(icq->flags & ICQ_DESTROYED))
ioc_destroy_icq(icq);
spin_unlock_irq(&icq->ioc->lock);
}
rcu_read_unlock();
}

#else /* CONFIG_BLK_ICQ */
static inline void ioc_exit_icqs(struct io_context *ioc)
{
}
static inline bool ioc_delay_free(struct io_context *ioc)
{
return false;
}
#endif /* CONFIG_BLK_ICQ */

/**
* put_io_context - put a reference of io_context
* @ioc: io_context to put
*
* Decrement reference count of @ioc and release it if the count reaches
* zero.
*/
void put_io_context(struct io_context *ioc)
{
BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
if (atomic_long_dec_and_test(&ioc->refcount) && !ioc_delay_free(ioc))
kmem_cache_free(iocontext_cachep, ioc);
}
EXPORT_SYMBOL_GPL(put_io_context);

/* Called by the exiting task */
void exit_io_context(struct task_struct *task)
{
struct io_context *ioc;

task_lock(task);
ioc = task->io_context;
task->io_context = NULL;
task_unlock(task);

if (atomic_dec_and_test(&ioc->active_ref)) {
ioc_exit_icqs(ioc);
put_io_context(ioc);
}
}

static struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
{
struct io_context *ioc;
int ret;

ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO,
node);
if (unlikely(!ioc))
return -ENOMEM;
return NULL;

/* initialize */
atomic_long_set(&ioc->refcount, 1);
atomic_set(&ioc->nr_tasks, 1);
atomic_set(&ioc->active_ref, 1);
#ifdef CONFIG_BLK_ICQ
spin_lock_init(&ioc->lock);
INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC);
INIT_HLIST_HEAD(&ioc->icq_list);
INIT_WORK(&ioc->release_work, ioc_release_fn);
#endif
return ioc;
}

int set_task_ioprio(struct task_struct *task, int ioprio)
{
int err;
const struct cred *cred = current_cred(), *tcred;

rcu_read_lock();
tcred = __task_cred(task);
if (!uid_eq(tcred->uid, cred->euid) &&
!uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) {
rcu_read_unlock();
return -EPERM;
}
rcu_read_unlock();

err = security_task_setioprio(task, ioprio);
if (err)
return err;

task_lock(task);
if (unlikely(!task->io_context)) {
struct io_context *ioc;

task_unlock(task);

ioc = alloc_io_context(GFP_ATOMIC, NUMA_NO_NODE);
if (!ioc)
return -ENOMEM;

task_lock(task);
if (task->flags & PF_EXITING) {
err = -ESRCH;
kmem_cache_free(iocontext_cachep, ioc);
goto out;
}
if (task->io_context)
kmem_cache_free(iocontext_cachep, ioc);
else
task->io_context = ioc;
}
task->io_context->ioprio = ioprio;
out:
task_unlock(task);
return err;
}
EXPORT_SYMBOL_GPL(set_task_ioprio);

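set_task_ioprio() above is now built directly into blk-ioc.c and exported, allocating the task's io_context on demand. As a hedged usage sketch (not from the patch), a caller that wants to lower its own I/O priority could do something like the following; IOPRIO_PRIO_VALUE() and IOPRIO_CLASS_BE are assumed to come from <linux/ioprio.h>:

/* Sketch only: give the current task best-effort class, level 4. */
int err = set_task_ioprio(current, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4));
if (err)
	pr_warn("could not set io priority: %d\n", err);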
int __copy_io(unsigned long clone_flags, struct task_struct *tsk)
{
struct io_context *ioc = current->io_context;

/*
* Try to install. ioc shouldn't be installed if someone else
* already did or @task, which isn't %current, is exiting. Note
* that we need to allow ioc creation on exiting %current as exit
* path may issue IOs from e.g. exit_files(). The exit path is
* responsible for not issuing IO after exit_io_context().
* Share io context with parent, if CLONE_IO is set
*/
task_lock(task);
if (!task->io_context &&
(task == current || !(task->flags & PF_EXITING)))
task->io_context = ioc;
else
kmem_cache_free(iocontext_cachep, ioc);
if (clone_flags & CLONE_IO) {
atomic_inc(&ioc->active_ref);
tsk->io_context = ioc;
} else if (ioprio_valid(ioc->ioprio)) {
tsk->io_context = alloc_io_context(GFP_KERNEL, NUMA_NO_NODE);
if (!tsk->io_context)
return -ENOMEM;
tsk->io_context->ioprio = ioc->ioprio;
}

ret = task->io_context ? 0 : -EBUSY;

task_unlock(task);

return ret;
}

/**
* get_task_io_context - get io_context of a task
* @task: task of interest
* @gfp_flags: allocation flags, used if allocation is necessary
* @node: allocation node, used if allocation is necessary
*
* Return io_context of @task. If it doesn't exist, it is created with
* @gfp_flags and @node. The returned io_context has its reference count
* incremented.
*
* This function always goes through task_lock() and it's better to use
* %current->io_context + get_io_context() for %current.
*/
struct io_context *get_task_io_context(struct task_struct *task,
gfp_t gfp_flags, int node)
{
struct io_context *ioc;

might_sleep_if(gfpflags_allow_blocking(gfp_flags));

do {
task_lock(task);
ioc = task->io_context;
if (likely(ioc)) {
get_io_context(ioc);
task_unlock(task);
return ioc;
}
task_unlock(task);
} while (!create_task_io_context(task, gfp_flags, node));

return NULL;
return 0;
}

#ifdef CONFIG_BLK_ICQ
/**
* ioc_lookup_icq - lookup io_cq from ioc
* @ioc: the associated io_context
* @q: the associated request_queue
*
* Look up io_cq associated with @ioc - @q pair from @ioc. Must be called
* with @q->queue_lock held.
*/
struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q)
struct io_cq *ioc_lookup_icq(struct request_queue *q)
{
struct io_context *ioc = current->io_context;
struct io_cq *icq;

lockdep_assert_held(&q->queue_lock);
@@ -359,9 +355,7 @@ EXPORT_SYMBOL(ioc_lookup_icq);

/**
* ioc_create_icq - create and link io_cq
* @ioc: io_context of interest
* @q: request_queue of interest
* @gfp_mask: allocation mask
*
* Make sure io_cq linking @ioc and @q exists. If icq doesn't exist, they
* will be created using @gfp_mask.
@@ -369,19 +363,19 @@ EXPORT_SYMBOL(ioc_lookup_icq);
* The caller is responsible for ensuring @ioc won't go away and @q is
* alive and will stay alive until this function returns.
*/
struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
gfp_t gfp_mask)
static struct io_cq *ioc_create_icq(struct request_queue *q)
{
struct io_context *ioc = current->io_context;
struct elevator_type *et = q->elevator->type;
struct io_cq *icq;

/* allocate stuff */
icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO,
icq = kmem_cache_alloc_node(et->icq_cache, GFP_ATOMIC | __GFP_ZERO,
q->node);
if (!icq)
return NULL;

if (radix_tree_maybe_preload(gfp_mask) < 0) {
if (radix_tree_maybe_preload(GFP_ATOMIC) < 0) {
kmem_cache_free(et->icq_cache, icq);
return NULL;
}
@@ -402,7 +396,7 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
et->ops.init_icq(icq);
} else {
kmem_cache_free(et->icq_cache, icq);
icq = ioc_lookup_icq(ioc, q);
icq = ioc_lookup_icq(q);
if (!icq)
printk(KERN_ERR "cfq: icq link failed!\n");
}
@@ -413,6 +407,46 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
return icq;
}

struct io_cq *ioc_find_get_icq(struct request_queue *q)
{
struct io_context *ioc = current->io_context;
struct io_cq *icq = NULL;

if (unlikely(!ioc)) {
ioc = alloc_io_context(GFP_ATOMIC, q->node);
if (!ioc)
return NULL;

task_lock(current);
if (current->io_context) {
kmem_cache_free(iocontext_cachep, ioc);
ioc = current->io_context;
} else {
current->io_context = ioc;
}

get_io_context(ioc);
task_unlock(current);
} else {
get_io_context(ioc);

spin_lock_irq(&q->queue_lock);
icq = ioc_lookup_icq(q);
spin_unlock_irq(&q->queue_lock);
}

if (!icq) {
icq = ioc_create_icq(q);
if (!icq) {
put_io_context(ioc);
return NULL;
}
}
return icq;
}
EXPORT_SYMBOL_GPL(ioc_find_get_icq);
#endif /* CONFIG_BLK_ICQ */

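For orientation, a hedged sketch (not from the patch) of how an icq-using I/O scheduler is expected to call the new helper when it prepares a request; the example_* name is a placeholder, while ioc_find_get_icq() and rq->elv.icq appear in the hunks above and below:

/* Sketch only: BFQ-style prepare hook.  ioc_find_get_icq() allocates the
 * io_context and io_cq for "current" on demand and returns it with a
 * reference held, so the scheduler no longer calls get_io_context() or
 * ioc_create_icq() itself. */
static void example_prepare_request(struct request *rq)
{
	struct io_cq *icq = ioc_find_get_icq(rq->q);

	if (icq)
		rq->elv.icq = icq;	/* reference dropped when the request completes */
}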
static int __init blk_ioc_init(void)
{
iocontext_cachep = kmem_cache_create("blkdev_ioc",

@ -74,6 +74,7 @@
|
||||
#include <linux/sched/signal.h>
|
||||
#include <trace/events/block.h>
|
||||
#include <linux/blk-mq.h>
|
||||
#include <linux/blk-cgroup.h>
|
||||
#include "blk-rq-qos.h"
|
||||
#include "blk-stat.h"
|
||||
#include "blk.h"
|
||||
|
@ -62,6 +62,7 @@ struct ioprio_blkg {
|
||||
struct ioprio_blkcg {
|
||||
struct blkcg_policy_data cpd;
|
||||
enum prio_policy prio_policy;
|
||||
bool prio_set;
|
||||
};
|
||||
|
||||
static inline struct ioprio_blkg *pd_to_ioprio(struct blkg_policy_data *pd)
|
||||
@ -112,7 +113,7 @@ static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf,
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
blkcg->prio_policy = ret;
|
||||
|
||||
blkcg->prio_set = true;
|
||||
return nbytes;
|
||||
}
|
||||
|
||||
@ -190,6 +191,10 @@ static void blkcg_ioprio_track(struct rq_qos *rqos, struct request *rq,
|
||||
struct bio *bio)
|
||||
{
|
||||
struct ioprio_blkcg *blkcg = ioprio_blkcg_from_bio(bio);
|
||||
u16 prio;
|
||||
|
||||
if (!blkcg->prio_set)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Except for IOPRIO_CLASS_NONE, higher I/O priority numbers
|
||||
@ -199,8 +204,10 @@ static void blkcg_ioprio_track(struct rq_qos *rqos, struct request *rq,
|
||||
* bio I/O priority is not modified. If the bio I/O priority equals
|
||||
* IOPRIO_CLASS_NONE, the cgroup I/O priority is assigned to the bio.
|
||||
*/
|
||||
bio->bi_ioprio = max_t(u16, bio->bi_ioprio,
|
||||
IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0));
|
||||
prio = max_t(u16, bio->bi_ioprio,
|
||||
IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0));
|
||||
if (prio > bio->bi_ioprio)
|
||||
bio->bi_ioprio = prio;
|
||||
}
|
||||
|
||||
static void blkcg_ioprio_exit(struct rq_qos *rqos)
|
||||
|
@ -6,12 +6,47 @@
|
||||
#include <linux/module.h>
|
||||
#include <linux/bio.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/blk-integrity.h>
|
||||
#include <linux/scatterlist.h>
|
||||
#include <linux/part_stat.h>
|
||||
|
||||
#include <trace/events/block.h>
|
||||
|
||||
#include "blk.h"
|
||||
#include "blk-mq-sched.h"
|
||||
#include "blk-rq-qos.h"
|
||||
#include "blk-throttle.h"
|
||||
|
||||
static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv)
|
||||
{
|
||||
*bv = mp_bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
|
||||
}
|
||||
|
||||
static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv)
|
||||
{
|
||||
struct bvec_iter iter = bio->bi_iter;
|
||||
int idx;
|
||||
|
||||
bio_get_first_bvec(bio, bv);
|
||||
if (bv->bv_len == bio->bi_iter.bi_size)
|
||||
return; /* this bio only has a single bvec */
|
||||
|
||||
bio_advance_iter(bio, &iter, iter.bi_size);
|
||||
|
||||
if (!iter.bi_bvec_done)
|
||||
idx = iter.bi_idx - 1;
|
||||
else /* in the middle of bvec */
|
||||
idx = iter.bi_idx;
|
||||
|
||||
*bv = bio->bi_io_vec[idx];
|
||||
|
||||
/*
|
||||
* iter.bi_bvec_done records actual length of the last bvec
|
||||
* if this bio ends in the middle of one io vector
|
||||
*/
|
||||
if (iter.bi_bvec_done)
|
||||
bv->bv_len = iter.bi_bvec_done;
|
||||
}
|
||||
|
||||
static inline bool bio_will_gap(struct request_queue *q,
|
||||
struct request *prev_rq, struct bio *prev, struct bio *next)
|
||||
@ -285,13 +320,13 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
|
||||
* iopoll in direct IO routine. Given performance gain of iopoll for
|
||||
* big IO can be trival, disable iopoll when split needed.
|
||||
*/
|
||||
bio_clear_hipri(bio);
|
||||
|
||||
bio_clear_polled(bio);
|
||||
return bio_split(bio, sectors, GFP_NOIO, bs);
|
||||
}
|
||||
|
||||
/**
|
||||
* __blk_queue_split - split a bio and submit the second half
|
||||
* @q: [in] request_queue new bio is being queued at
|
||||
* @bio: [in, out] bio to be split
|
||||
* @nr_segs: [out] number of segments in the first bio
|
||||
*
|
||||
@ -302,9 +337,9 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
|
||||
* of the caller to ensure that q->bio_split is only released after processing
|
||||
* of the split bio has finished.
|
||||
*/
|
||||
void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
|
||||
void __blk_queue_split(struct request_queue *q, struct bio **bio,
|
||||
unsigned int *nr_segs)
|
||||
{
|
||||
struct request_queue *q = (*bio)->bi_bdev->bd_disk->queue;
|
||||
struct bio *split = NULL;
|
||||
|
||||
switch (bio_op(*bio)) {
|
||||
@ -321,21 +356,6 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
|
||||
nr_segs);
|
||||
break;
|
||||
default:
|
||||
/*
|
||||
* All drivers must accept single-segments bios that are <=
|
||||
* PAGE_SIZE. This is a quick and dirty check that relies on
|
||||
* the fact that bi_io_vec[0] is always valid if a bio has data.
|
||||
* The check might lead to occasional false negatives when bios
|
||||
* are cloned, but compared to the performance impact of cloned
|
||||
* bios themselves the loop below doesn't matter anyway.
|
||||
*/
|
||||
if (!q->limits.chunk_sectors &&
|
||||
(*bio)->bi_vcnt == 1 &&
|
||||
((*bio)->bi_io_vec[0].bv_len +
|
||||
(*bio)->bi_io_vec[0].bv_offset) <= PAGE_SIZE) {
|
||||
*nr_segs = 1;
|
||||
break;
|
||||
}
|
||||
split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs);
|
||||
break;
|
||||
}
|
||||
@ -365,9 +385,11 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
|
||||
*/
|
||||
void blk_queue_split(struct bio **bio)
|
||||
{
|
||||
struct request_queue *q = bdev_get_queue((*bio)->bi_bdev);
|
||||
unsigned int nr_segs;
|
||||
|
||||
__blk_queue_split(bio, &nr_segs);
|
||||
if (blk_may_split(q, *bio))
|
||||
__blk_queue_split(q, bio, &nr_segs);
|
||||
}
|
||||
EXPORT_SYMBOL(blk_queue_split);
|
||||
|
||||
@ -558,6 +580,23 @@ static inline unsigned int blk_rq_get_max_segments(struct request *rq)
|
||||
return queue_max_segments(rq->q);
|
||||
}
|
||||
|
||||
static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
|
||||
sector_t offset)
|
||||
{
|
||||
struct request_queue *q = rq->q;
|
||||
|
||||
if (blk_rq_is_passthrough(rq))
|
||||
return q->limits.max_hw_sectors;
|
||||
|
||||
if (!q->limits.chunk_sectors ||
|
||||
req_op(rq) == REQ_OP_DISCARD ||
|
||||
req_op(rq) == REQ_OP_SECURE_ERASE)
|
||||
return blk_queue_get_max_sectors(q, req_op(rq));
|
||||
|
||||
return min(blk_max_size_offset(q, offset, 0),
|
||||
blk_queue_get_max_sectors(q, req_op(rq)));
|
||||
}
|
||||
|
||||
static inline int ll_new_hw_segment(struct request *req, struct bio *bio,
|
||||
unsigned int nr_phys_segs)
|
||||
{
|
||||
@ -718,6 +757,13 @@ static enum elv_merge blk_try_req_merge(struct request *req,
|
||||
return ELEVATOR_NO_MERGE;
|
||||
}
|
||||
|
||||
static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b)
|
||||
{
|
||||
if (bio_page(a) == bio_page(b) && bio_offset(a) == bio_offset(b))
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* For non-mq, this has to be called with the request spinlock acquired.
|
||||
* For mq with scheduling, the appropriate queue wide lock should be held.
|
||||
@ -731,8 +777,7 @@ static struct request *attempt_merge(struct request_queue *q,
|
||||
if (req_op(req) != req_op(next))
|
||||
return NULL;
|
||||
|
||||
if (rq_data_dir(req) != rq_data_dir(next)
|
||||
|| req->rq_disk != next->rq_disk)
|
||||
if (rq_data_dir(req) != rq_data_dir(next))
|
||||
return NULL;
|
||||
|
||||
if (req_op(req) == REQ_OP_WRITE_SAME &&
|
||||
@ -859,10 +904,6 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
|
||||
if (bio_data_dir(bio) != rq_data_dir(rq))
|
||||
return false;
|
||||
|
||||
/* must be same device */
|
||||
if (rq->rq_disk != bio->bi_bdev->bd_disk)
|
||||
return false;
|
||||
|
||||
/* only merge integrity protected bio into ditto rq */
|
||||
if (blk_integrity_merge_bio(rq->q, rq, bio) == false)
|
||||
return false;
|
||||
@ -1023,12 +1064,10 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
|
||||
* @q: request_queue new bio is being queued at
|
||||
* @bio: new bio being queued
|
||||
* @nr_segs: number of segments in @bio
|
||||
* @same_queue_rq: pointer to &struct request that gets filled in when
|
||||
* another request associated with @q is found on the plug list
|
||||
* (optional, may be %NULL)
|
||||
* from the passed in @q already in the plug list
|
||||
*
|
||||
* Determine whether @bio being queued on @q can be merged with a request
|
||||
* on %current's plugged list. Returns %true if merge was successful,
|
||||
* Determine whether @bio being queued on @q can be merged with the previous
|
||||
* request on %current's plugged list. Returns %true if merge was successful,
|
||||
* otherwise %false.
|
||||
*
|
||||
* Plugging coalesces IOs from the same issuer for the same purpose without
|
||||
@ -1041,36 +1080,22 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
|
||||
* Caller must ensure !blk_queue_nomerges(q) beforehand.
|
||||
*/
|
||||
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
|
||||
unsigned int nr_segs, struct request **same_queue_rq)
|
||||
unsigned int nr_segs)
|
||||
{
|
||||
struct blk_plug *plug;
|
||||
struct request *rq;
|
||||
struct list_head *plug_list;
|
||||
|
||||
plug = blk_mq_plug(q, bio);
|
||||
if (!plug)
|
||||
if (!plug || rq_list_empty(plug->mq_list))
|
||||
return false;
|
||||
|
||||
plug_list = &plug->mq_list;
|
||||
|
||||
list_for_each_entry_reverse(rq, plug_list, queuelist) {
|
||||
if (rq->q == q && same_queue_rq) {
|
||||
/*
|
||||
* Only blk-mq multiple hardware queues case checks the
|
||||
* rq in the same queue, there should be only one such
|
||||
* rq in a queue
|
||||
**/
|
||||
*same_queue_rq = rq;
|
||||
}
|
||||
|
||||
if (rq->q != q)
|
||||
continue;
|
||||
|
||||
/* check the previously added entry for a quick merge attempt */
|
||||
rq = rq_list_peek(&plug->mq_list);
|
||||
if (rq->q == q) {
|
||||
if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
|
||||
BIO_MERGE_OK)
|
||||
BIO_MERGE_OK)
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -11,6 +11,7 @@
|
||||
#include "blk.h"
|
||||
#include "blk-mq.h"
|
||||
#include "blk-mq-debugfs.h"
|
||||
#include "blk-mq-sched.h"
|
||||
#include "blk-mq-tag.h"
|
||||
#include "blk-rq-qos.h"
|
||||
|
||||
@ -29,6 +30,9 @@ static int queue_poll_stat_show(void *data, struct seq_file *m)
|
||||
struct request_queue *q = data;
|
||||
int bucket;
|
||||
|
||||
if (!q->poll_stat)
|
||||
return 0;
|
||||
|
||||
for (bucket = 0; bucket < (BLK_MQ_POLL_STATS_BKTS / 2); bucket++) {
|
||||
seq_printf(m, "read (%d Bytes): ", 1 << (9 + bucket));
|
||||
print_stat(m, &q->poll_stat[2 * bucket]);
|
||||
@ -122,9 +126,7 @@ static const char *const blk_queue_flag_name[] = {
|
||||
QUEUE_FLAG_NAME(FUA),
|
||||
QUEUE_FLAG_NAME(DAX),
|
||||
QUEUE_FLAG_NAME(STATS),
|
||||
QUEUE_FLAG_NAME(POLL_STATS),
|
||||
QUEUE_FLAG_NAME(REGISTERED),
|
||||
QUEUE_FLAG_NAME(SCSI_PASSTHROUGH),
|
||||
QUEUE_FLAG_NAME(QUIESCED),
|
||||
QUEUE_FLAG_NAME(PCI_P2PDMA),
|
||||
QUEUE_FLAG_NAME(ZONE_RESETALL),
|
||||
@ -287,7 +289,7 @@ static const char *const cmd_flag_name[] = {
|
||||
CMD_FLAG_NAME(BACKGROUND),
|
||||
CMD_FLAG_NAME(NOWAIT),
|
||||
CMD_FLAG_NAME(NOUNMAP),
|
||||
CMD_FLAG_NAME(HIPRI),
|
||||
CMD_FLAG_NAME(POLLED),
|
||||
};
|
||||
#undef CMD_FLAG_NAME
|
||||
|
||||
@ -309,6 +311,7 @@ static const char *const rqf_name[] = {
|
||||
RQF_NAME(SPECIAL_PAYLOAD),
|
||||
RQF_NAME(ZONE_WRITE_LOCKED),
|
||||
RQF_NAME(MQ_POLL_SLEPT),
|
||||
RQF_NAME(ELV),
|
||||
};
|
||||
#undef RQF_NAME
|
||||
|
||||
@ -453,11 +456,11 @@ static void blk_mq_debugfs_tags_show(struct seq_file *m,
|
||||
atomic_read(&tags->active_queues));
|
||||
|
||||
seq_puts(m, "\nbitmap_tags:\n");
|
||||
sbitmap_queue_show(tags->bitmap_tags, m);
|
||||
sbitmap_queue_show(&tags->bitmap_tags, m);
|
||||
|
||||
if (tags->nr_reserved_tags) {
|
||||
seq_puts(m, "\nbreserved_tags:\n");
|
||||
sbitmap_queue_show(tags->breserved_tags, m);
|
||||
sbitmap_queue_show(&tags->breserved_tags, m);
|
||||
}
|
||||
}
|
||||
|
||||
@ -488,7 +491,7 @@ static int hctx_tags_bitmap_show(void *data, struct seq_file *m)
|
||||
if (res)
|
||||
goto out;
|
||||
if (hctx->tags)
|
||||
sbitmap_bitmap_show(&hctx->tags->bitmap_tags->sb, m);
|
||||
sbitmap_bitmap_show(&hctx->tags->bitmap_tags.sb, m);
|
||||
mutex_unlock(&q->sysfs_lock);
|
||||
|
||||
out:
|
||||
@ -522,77 +525,13 @@ static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m)
|
||||
if (res)
|
||||
goto out;
|
||||
if (hctx->sched_tags)
|
||||
sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags->sb, m);
|
||||
sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags.sb, m);
|
||||
mutex_unlock(&q->sysfs_lock);
|
||||
|
||||
out:
|
||||
return res;
|
||||
}
|
||||
|
||||
static int hctx_io_poll_show(void *data, struct seq_file *m)
|
||||
{
|
||||
struct blk_mq_hw_ctx *hctx = data;
|
||||
|
||||
seq_printf(m, "considered=%lu\n", hctx->poll_considered);
|
||||
seq_printf(m, "invoked=%lu\n", hctx->poll_invoked);
|
||||
seq_printf(m, "success=%lu\n", hctx->poll_success);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static ssize_t hctx_io_poll_write(void *data, const char __user *buf,
|
||||
size_t count, loff_t *ppos)
|
||||
{
|
||||
struct blk_mq_hw_ctx *hctx = data;
|
||||
|
||||
hctx->poll_considered = hctx->poll_invoked = hctx->poll_success = 0;
|
||||
return count;
|
||||
}
|
||||
|
||||
static int hctx_dispatched_show(void *data, struct seq_file *m)
|
||||
{
|
||||
struct blk_mq_hw_ctx *hctx = data;
|
||||
int i;
|
||||
|
||||
seq_printf(m, "%8u\t%lu\n", 0U, hctx->dispatched[0]);
|
||||
|
||||
for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER - 1; i++) {
|
||||
unsigned int d = 1U << (i - 1);
|
||||
|
||||
seq_printf(m, "%8u\t%lu\n", d, hctx->dispatched[i]);
|
||||
}
|
||||
|
||||
seq_printf(m, "%8u+\t%lu\n", 1U << (i - 1), hctx->dispatched[i]);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static ssize_t hctx_dispatched_write(void *data, const char __user *buf,
|
||||
size_t count, loff_t *ppos)
|
||||
{
|
||||
struct blk_mq_hw_ctx *hctx = data;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < BLK_MQ_MAX_DISPATCH_ORDER; i++)
|
||||
hctx->dispatched[i] = 0;
|
||||
return count;
|
||||
}
|
||||
|
||||
static int hctx_queued_show(void *data, struct seq_file *m)
|
||||
{
|
||||
struct blk_mq_hw_ctx *hctx = data;
|
||||
|
||||
seq_printf(m, "%lu\n", hctx->queued);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static ssize_t hctx_queued_write(void *data, const char __user *buf,
|
||||
size_t count, loff_t *ppos)
|
||||
{
|
||||
struct blk_mq_hw_ctx *hctx = data;
|
||||
|
||||
hctx->queued = 0;
|
||||
return count;
|
||||
}
|
||||
|
||||
static int hctx_run_show(void *data, struct seq_file *m)
|
||||
{
|
||||
struct blk_mq_hw_ctx *hctx = data;
|
||||
@ -614,7 +553,7 @@ static int hctx_active_show(void *data, struct seq_file *m)
|
||||
{
|
||||
struct blk_mq_hw_ctx *hctx = data;
|
||||
|
||||
seq_printf(m, "%d\n", atomic_read(&hctx->nr_active));
|
||||
seq_printf(m, "%d\n", __blk_mq_active_requests(hctx));
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -663,57 +602,6 @@ CTX_RQ_SEQ_OPS(default, HCTX_TYPE_DEFAULT);
|
||||
CTX_RQ_SEQ_OPS(read, HCTX_TYPE_READ);
|
||||
CTX_RQ_SEQ_OPS(poll, HCTX_TYPE_POLL);
|
||||
|
||||
static int ctx_dispatched_show(void *data, struct seq_file *m)
|
||||
{
|
||||
struct blk_mq_ctx *ctx = data;
|
||||
|
||||
seq_printf(m, "%lu %lu\n", ctx->rq_dispatched[1], ctx->rq_dispatched[0]);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static ssize_t ctx_dispatched_write(void *data, const char __user *buf,
|
||||
size_t count, loff_t *ppos)
|
||||
{
|
||||
struct blk_mq_ctx *ctx = data;
|
||||
|
||||
ctx->rq_dispatched[0] = ctx->rq_dispatched[1] = 0;
|
||||
return count;
|
||||
}
|
||||
|
||||
static int ctx_merged_show(void *data, struct seq_file *m)
|
||||
{
|
||||
struct blk_mq_ctx *ctx = data;
|
||||
|
||||
seq_printf(m, "%lu\n", ctx->rq_merged);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static ssize_t ctx_merged_write(void *data, const char __user *buf,
|
||||
size_t count, loff_t *ppos)
|
||||
{
|
||||
struct blk_mq_ctx *ctx = data;
|
||||
|
||||
ctx->rq_merged = 0;
|
||||
return count;
|
||||
}
|
||||
|
||||
static int ctx_completed_show(void *data, struct seq_file *m)
|
||||
{
|
||||
struct blk_mq_ctx *ctx = data;
|
||||
|
||||
seq_printf(m, "%lu %lu\n", ctx->rq_completed[1], ctx->rq_completed[0]);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static ssize_t ctx_completed_write(void *data, const char __user *buf,
|
||||
size_t count, loff_t *ppos)
|
||||
{
|
||||
struct blk_mq_ctx *ctx = data;
|
||||
|
||||
ctx->rq_completed[0] = ctx->rq_completed[1] = 0;
|
||||
return count;
|
||||
}
|
||||
|
||||
static int blk_mq_debugfs_show(struct seq_file *m, void *v)
|
||||
{
|
||||
const struct blk_mq_debugfs_attr *attr = m->private;
|
||||
@ -789,9 +677,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
|
||||
{"tags_bitmap", 0400, hctx_tags_bitmap_show},
|
||||
{"sched_tags", 0400, hctx_sched_tags_show},
|
||||
{"sched_tags_bitmap", 0400, hctx_sched_tags_bitmap_show},
|
||||
{"io_poll", 0600, hctx_io_poll_show, hctx_io_poll_write},
|
||||
{"dispatched", 0600, hctx_dispatched_show, hctx_dispatched_write},
|
||||
{"queued", 0600, hctx_queued_show, hctx_queued_write},
|
||||
{"run", 0600, hctx_run_show, hctx_run_write},
|
||||
{"active", 0400, hctx_active_show},
|
||||
{"dispatch_busy", 0400, hctx_dispatch_busy_show},
|
||||
@ -803,9 +688,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = {
|
||||
{"default_rq_list", 0400, .seq_ops = &ctx_default_rq_list_seq_ops},
|
||||
{"read_rq_list", 0400, .seq_ops = &ctx_read_rq_list_seq_ops},
|
||||
{"poll_rq_list", 0400, .seq_ops = &ctx_poll_rq_list_seq_ops},
|
||||
{"dispatched", 0600, ctx_dispatched_show, ctx_dispatched_write},
|
||||
{"merged", 0600, ctx_merged_show, ctx_merged_write},
|
||||
{"completed", 0600, ctx_completed_show, ctx_completed_write},
|
||||
{},
|
||||
};
|
||||
|
||||
|
@ -18,32 +18,6 @@
|
||||
#include "blk-mq-tag.h"
|
||||
#include "blk-wbt.h"
|
||||
|
||||
void blk_mq_sched_assign_ioc(struct request *rq)
|
||||
{
|
||||
struct request_queue *q = rq->q;
|
||||
struct io_context *ioc;
|
||||
struct io_cq *icq;
|
||||
|
||||
/*
|
||||
* May not have an IO context if it's a passthrough request
|
||||
*/
|
||||
ioc = current->io_context;
|
||||
if (!ioc)
|
||||
return;
|
||||
|
||||
spin_lock_irq(&q->queue_lock);
|
||||
icq = ioc_lookup_icq(ioc, q);
|
||||
spin_unlock_irq(&q->queue_lock);
|
||||
|
||||
if (!icq) {
|
||||
icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
|
||||
if (!icq)
|
||||
return;
|
||||
}
|
||||
get_io_context(icq->ioc);
|
||||
rq->elv.icq = icq;
|
||||
}
|
||||
|
||||
/*
|
||||
* Mark a hardware queue as needing a restart. For shared queues, maintain
|
||||
* a count of how many hardware queues are marked for restart.
|
||||
@ -57,10 +31,8 @@ void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx);
|
||||
|
||||
void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
|
||||
void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
|
||||
{
|
||||
if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
|
||||
return;
|
||||
clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
|
||||
|
||||
/*
|
||||
@ -363,7 +335,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
|
||||
}
|
||||
}
|
||||
|
||||
bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
|
||||
bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
|
||||
unsigned int nr_segs)
|
||||
{
|
||||
struct elevator_queue *e = q->elevator;
|
||||
@ -372,15 +344,17 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
|
||||
bool ret = false;
|
||||
enum hctx_type type;
|
||||
|
||||
if (e && e->type->ops.bio_merge)
|
||||
return e->type->ops.bio_merge(q, bio, nr_segs);
|
||||
if (e && e->type->ops.bio_merge) {
|
||||
ret = e->type->ops.bio_merge(q, bio, nr_segs);
|
||||
goto out_put;
|
||||
}
|
||||
|
||||
ctx = blk_mq_get_ctx(q);
|
||||
hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
|
||||
type = hctx->type;
|
||||
if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) ||
|
||||
list_empty_careful(&ctx->rq_lists[type]))
|
||||
return false;
|
||||
goto out_put;
|
||||
|
||||
/* default per sw-queue merge */
|
||||
spin_lock(&ctx->lock);
|
||||
@ -389,13 +363,11 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
|
||||
* potentially merge with. Currently includes a hand-wavy stop
|
||||
* count of 8, to not spend too much time checking for merges.
|
||||
*/
|
||||
if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) {
|
||||
ctx->rq_merged++;
|
||||
if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs))
|
||||
ret = true;
|
||||
}
|
||||
|
||||
spin_unlock(&ctx->lock);
|
||||
|
||||
out_put:
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -502,8 +474,9 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
|
||||
* busy in case of 'none' scheduler, and this way may save
|
||||
* us one extra enqueue & dequeue to sw queue.
|
||||
*/
|
||||
if (!hctx->dispatch_busy && !e && !run_queue_async) {
|
||||
blk_mq_try_issue_list_directly(hctx, list);
|
||||
if (!hctx->dispatch_busy && !run_queue_async) {
|
||||
blk_mq_run_dispatch_ops(hctx->queue,
|
||||
blk_mq_try_issue_list_directly(hctx, list));
|
||||
if (list_empty(list))
|
||||
goto out;
|
||||
}
|
||||
@ -515,83 +488,71 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
|
||||
percpu_ref_put(&q->q_usage_counter);
|
||||
}
|
||||
|
||||
static int blk_mq_sched_alloc_tags(struct request_queue *q,
|
||||
struct blk_mq_hw_ctx *hctx,
|
||||
unsigned int hctx_idx)
|
||||
static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q,
|
||||
struct blk_mq_hw_ctx *hctx,
|
||||
unsigned int hctx_idx)
|
||||
{
|
||||
struct blk_mq_tag_set *set = q->tag_set;
|
||||
int ret;
|
||||
|
||||
hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
|
||||
set->reserved_tags, set->flags);
|
||||
if (!hctx->sched_tags)
|
||||
return -ENOMEM;
|
||||
|
||||
ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
|
||||
if (ret) {
|
||||
blk_mq_free_rq_map(hctx->sched_tags, set->flags);
|
||||
hctx->sched_tags = NULL;
|
||||
if (blk_mq_is_shared_tags(q->tag_set->flags)) {
|
||||
hctx->sched_tags = q->sched_shared_tags;
|
||||
return 0;
|
||||
}
|
||||
|
||||
return ret;
|
||||
hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx,
|
||||
q->nr_requests);
|
||||
|
||||
if (!hctx->sched_tags)
|
||||
return -ENOMEM;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void blk_mq_exit_sched_shared_tags(struct request_queue *queue)
|
||||
{
|
||||
blk_mq_free_rq_map(queue->sched_shared_tags);
|
||||
queue->sched_shared_tags = NULL;
|
||||
}
|
||||
|
||||
/* called in queue's release handler, tagset has gone away */
|
||||
static void blk_mq_sched_tags_teardown(struct request_queue *q)
|
||||
static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags)
|
||||
{
|
||||
struct blk_mq_hw_ctx *hctx;
|
||||
int i;
|
||||
|
||||
queue_for_each_hw_ctx(q, hctx, i) {
|
||||
if (hctx->sched_tags) {
|
||||
blk_mq_free_rq_map(hctx->sched_tags, hctx->flags);
|
||||
if (!blk_mq_is_shared_tags(flags))
|
||||
blk_mq_free_rq_map(hctx->sched_tags);
|
||||
hctx->sched_tags = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
if (blk_mq_is_shared_tags(flags))
|
||||
blk_mq_exit_sched_shared_tags(q);
|
||||
}
|
||||
|
||||
static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue)
|
||||
static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
|
||||
{
|
||||
struct blk_mq_tag_set *set = queue->tag_set;
|
||||
int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
|
||||
struct blk_mq_hw_ctx *hctx;
|
||||
int ret, i;
|
||||
|
||||
/*
|
||||
* Set initial depth at max so that we don't need to reallocate for
|
||||
* updating nr_requests.
|
||||
*/
|
||||
ret = blk_mq_init_bitmaps(&queue->sched_bitmap_tags,
|
||||
&queue->sched_breserved_tags,
|
||||
MAX_SCHED_RQ, set->reserved_tags,
|
||||
set->numa_node, alloc_policy);
|
||||
if (ret)
|
||||
return ret;
|
||||
queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set,
|
||||
BLK_MQ_NO_HCTX_IDX,
|
||||
MAX_SCHED_RQ);
|
||||
if (!queue->sched_shared_tags)
|
||||
return -ENOMEM;
|
||||
|
||||
queue_for_each_hw_ctx(queue, hctx, i) {
|
||||
hctx->sched_tags->bitmap_tags =
|
||||
&queue->sched_bitmap_tags;
|
||||
hctx->sched_tags->breserved_tags =
|
||||
&queue->sched_breserved_tags;
|
||||
}
|
||||
|
||||
sbitmap_queue_resize(&queue->sched_bitmap_tags,
|
||||
queue->nr_requests - set->reserved_tags);
|
||||
blk_mq_tag_update_sched_shared_tags(queue);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void blk_mq_exit_sched_shared_sbitmap(struct request_queue *queue)
|
||||
{
|
||||
sbitmap_queue_free(&queue->sched_bitmap_tags);
|
||||
sbitmap_queue_free(&queue->sched_breserved_tags);
|
||||
}
|
||||
|
||||
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
|
||||
{
|
||||
unsigned int i, flags = q->tag_set->flags;
|
||||
struct blk_mq_hw_ctx *hctx;
|
||||
struct elevator_queue *eq;
|
||||
unsigned int i;
|
||||
int ret;
|
||||
|
||||
if (!e) {
|
||||
@ -606,23 +567,23 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
|
||||
* Additionally, this is a per-hw queue depth.
|
||||
*/
|
||||
q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
|
||||
BLKDEV_MAX_RQ);
|
||||
BLKDEV_DEFAULT_RQ);
|
||||
|
||||
queue_for_each_hw_ctx(q, hctx, i) {
|
||||
ret = blk_mq_sched_alloc_tags(q, hctx, i);
|
||||
if (blk_mq_is_shared_tags(flags)) {
|
||||
ret = blk_mq_init_sched_shared_tags(q);
|
||||
if (ret)
|
||||
goto err_free_tags;
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) {
|
||||
ret = blk_mq_init_sched_shared_sbitmap(q);
|
||||
queue_for_each_hw_ctx(q, hctx, i) {
|
||||
ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i);
|
||||
if (ret)
|
||||
goto err_free_tags;
|
||||
goto err_free_map_and_rqs;
|
||||
}
|
||||
|
||||
ret = e->ops.init_sched(q, e);
|
||||
if (ret)
|
||||
goto err_free_sbitmap;
|
||||
goto err_free_map_and_rqs;
|
||||
|
||||
blk_mq_debugfs_register_sched(q);
|
||||
|
||||
@ -631,7 +592,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
|
||||
ret = e->ops.init_hctx(hctx, i);
|
||||
if (ret) {
|
||||
eq = q->elevator;
|
||||
blk_mq_sched_free_requests(q);
|
||||
blk_mq_sched_free_rqs(q);
|
||||
blk_mq_exit_sched(q, eq);
|
||||
kobject_put(&eq->kobj);
|
||||
return ret;
|
||||
@ -642,12 +603,10 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
|
||||
|
||||
return 0;
|
||||
|
||||
err_free_sbitmap:
|
||||
if (blk_mq_is_sbitmap_shared(q->tag_set->flags))
|
||||
blk_mq_exit_sched_shared_sbitmap(q);
|
||||
err_free_tags:
|
||||
blk_mq_sched_free_requests(q);
|
||||
blk_mq_sched_tags_teardown(q);
|
||||
err_free_map_and_rqs:
|
||||
blk_mq_sched_free_rqs(q);
|
||||
blk_mq_sched_tags_teardown(q, flags);
|
||||
|
||||
q->elevator = NULL;
|
||||
return ret;
|
||||
}
|
||||
@ -656,14 +615,20 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
|
||||
* called in either blk_queue_cleanup or elevator_switch, tagset
|
||||
* is required for freeing requests
|
||||
*/
|
||||
void blk_mq_sched_free_requests(struct request_queue *q)
|
||||
void blk_mq_sched_free_rqs(struct request_queue *q)
|
||||
{
|
||||
struct blk_mq_hw_ctx *hctx;
|
||||
int i;
|
||||
|
||||
queue_for_each_hw_ctx(q, hctx, i) {
|
||||
if (hctx->sched_tags)
|
||||
blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i);
|
||||
if (blk_mq_is_shared_tags(q->tag_set->flags)) {
|
||||
blk_mq_free_rqs(q->tag_set, q->sched_shared_tags,
|
||||
BLK_MQ_NO_HCTX_IDX);
|
||||
} else {
|
||||
queue_for_each_hw_ctx(q, hctx, i) {
|
||||
if (hctx->sched_tags)
|
||||
blk_mq_free_rqs(q->tag_set,
|
||||
hctx->sched_tags, i);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -684,8 +649,6 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
|
||||
blk_mq_debugfs_unregister_sched(q);
|
||||
if (e->type->ops.exit_sched)
|
||||
e->type->ops.exit_sched(e);
|
||||
blk_mq_sched_tags_teardown(q);
|
||||
if (blk_mq_is_sbitmap_shared(flags))
|
||||
blk_mq_exit_sched_shared_sbitmap(q);
|
||||
blk_mq_sched_tags_teardown(q, flags);
|
||||
q->elevator = NULL;
|
||||
}
|
||||
|
@ -2,21 +2,20 @@
|
||||
#ifndef BLK_MQ_SCHED_H
|
||||
#define BLK_MQ_SCHED_H
|
||||
|
||||
#include "elevator.h"
|
||||
#include "blk-mq.h"
|
||||
#include "blk-mq-tag.h"
|
||||
|
||||
#define MAX_SCHED_RQ (16 * BLKDEV_MAX_RQ)
|
||||
|
||||
void blk_mq_sched_assign_ioc(struct request *rq);
|
||||
#define MAX_SCHED_RQ (16 * BLKDEV_DEFAULT_RQ)
|
||||
|
||||
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
|
||||
unsigned int nr_segs, struct request **merged_request);
|
||||
bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
|
||||
bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
|
||||
unsigned int nr_segs);
|
||||
bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
|
||||
struct list_head *free);
|
||||
void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx);
|
||||
void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
|
||||
void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
|
||||
|
||||
void blk_mq_sched_insert_request(struct request *rq, bool at_head,
|
||||
bool run_queue, bool async);
|
||||
@ -28,45 +27,51 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
|
||||
|
||||
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e);
|
||||
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
|
||||
void blk_mq_sched_free_requests(struct request_queue *q);
|
||||
void blk_mq_sched_free_rqs(struct request_queue *q);
|
||||
|
||||
static inline bool
|
||||
blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
|
||||
unsigned int nr_segs)
|
||||
static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
|
||||
{
|
||||
if (blk_queue_nomerges(q) || !bio_mergeable(bio))
|
||||
return false;
|
||||
if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
|
||||
__blk_mq_sched_restart(hctx);
|
||||
}
|
||||
|
||||
return __blk_mq_sched_bio_merge(q, bio, nr_segs);
|
||||
static inline bool bio_mergeable(struct bio *bio)
|
||||
{
|
||||
return !(bio->bi_opf & REQ_NOMERGE_FLAGS);
|
||||
}
|
||||
|
||||
static inline bool
|
||||
blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
|
||||
struct bio *bio)
|
||||
{
|
||||
struct elevator_queue *e = q->elevator;
|
||||
|
||||
if (e && e->type->ops.allow_merge)
|
||||
return e->type->ops.allow_merge(q, rq, bio);
|
||||
if (rq->rq_flags & RQF_ELV) {
|
||||
struct elevator_queue *e = q->elevator;
|
||||
|
||||
if (e->type->ops.allow_merge)
|
||||
return e->type->ops.allow_merge(q, rq, bio);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline void blk_mq_sched_completed_request(struct request *rq, u64 now)
|
||||
{
|
||||
struct elevator_queue *e = rq->q->elevator;
|
||||
if (rq->rq_flags & RQF_ELV) {
|
||||
struct elevator_queue *e = rq->q->elevator;
|
||||
|
||||
if (e && e->type->ops.completed_request)
|
||||
e->type->ops.completed_request(rq, now);
|
||||
if (e->type->ops.completed_request)
|
||||
e->type->ops.completed_request(rq, now);
|
||||
}
|
||||
}
|
||||
|
||||
static inline void blk_mq_sched_requeue_request(struct request *rq)
|
||||
{
|
||||
struct request_queue *q = rq->q;
|
||||
struct elevator_queue *e = q->elevator;
|
||||
if (rq->rq_flags & RQF_ELV) {
|
||||
struct request_queue *q = rq->q;
|
||||
struct elevator_queue *e = q->elevator;
|
||||
|
||||
if ((rq->rq_flags & RQF_ELVPRIV) && e && e->type->ops.requeue_request)
|
||||
e->type->ops.requeue_request(rq);
|
||||
if ((rq->rq_flags & RQF_ELVPRIV) && e->type->ops.requeue_request)
|
||||
e->type->ops.requeue_request(rq);
|
||||
}
|
||||
}
|
||||
|
||||
static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx)
|
||||
|
@ -36,8 +36,6 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj)
|
||||
struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx,
|
||||
kobj);
|
||||
|
||||
if (hctx->flags & BLK_MQ_F_BLOCKING)
|
||||
cleanup_srcu_struct(hctx->srcu);
|
||||
blk_free_flush_queue(hctx->fq);
|
||||
sbitmap_free(&hctx->ctx_map);
|
||||
free_cpumask_var(hctx->cpumask);
|
||||
|
@ -16,6 +16,21 @@
|
||||
#include "blk-mq-sched.h"
|
||||
#include "blk-mq-tag.h"
|
||||
|
||||
/*
|
||||
* Recalculate wakeup batch when tag is shared by hctx.
|
||||
*/
|
||||
static void blk_mq_update_wake_batch(struct blk_mq_tags *tags,
|
||||
unsigned int users)
|
||||
{
|
||||
if (!users)
|
||||
return;
|
||||
|
||||
sbitmap_queue_recalculate_wake_batch(&tags->bitmap_tags,
|
||||
users);
|
||||
sbitmap_queue_recalculate_wake_batch(&tags->breserved_tags,
|
||||
users);
|
||||
}
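The recalculation above divides the shared tag space among its active users and derives a smaller wake-up batch as more queues share it. A minimal stand-alone sketch of that idea (the divisor and the 1..8 clamp here are illustrative, not the sbitmap formula):

#include <stdio.h>

/* Toy wake-batch calculation: give each of 'users' sharers an equal slice of
 * 'depth' and wake waiters in proportionally smaller batches. */
static unsigned int wake_batch(unsigned int depth, unsigned int users)
{
	unsigned int per_user = users ? depth / users : depth;
	unsigned int batch = per_user / 8;

	if (batch < 1)
		batch = 1;
	if (batch > 8)
		batch = 8;
	return batch;
}

int main(void)
{
	unsigned int users;

	for (users = 1; users <= 8; users *= 2)
		printf("depth=256 users=%u -> wake_batch=%u\n",
		       users, wake_batch(256, users));
	return 0;
}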
|
||||
|
||||
/*
|
||||
* If a previously inactive queue goes active, bump the active user count.
|
||||
* We need to do this before try to allocate driver tag, then even if fail
|
||||
@ -24,19 +39,26 @@
|
||||
*/
|
||||
bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
|
||||
{
|
||||
if (blk_mq_is_sbitmap_shared(hctx->flags)) {
|
||||
struct request_queue *q = hctx->queue;
|
||||
struct blk_mq_tag_set *set = q->tag_set;
|
||||
unsigned int users;
|
||||
|
||||
if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) &&
|
||||
!test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
|
||||
atomic_inc(&set->active_queues_shared_sbitmap);
|
||||
if (blk_mq_is_shared_tags(hctx->flags)) {
|
||||
struct request_queue *q = hctx->queue;
|
||||
|
||||
if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) ||
|
||||
test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) {
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
|
||||
!test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
|
||||
atomic_inc(&hctx->tags->active_queues);
|
||||
if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) ||
|
||||
test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
users = atomic_inc_return(&hctx->tags->active_queues);
|
||||
|
||||
blk_mq_update_wake_batch(hctx->tags, users);
|
||||
|
||||
return true;
|
||||
}
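The hunk above collapses the two separate counters into a single active_queues count and recalculates the wake batch on every activation. The once-only accounting itself is a test-then-test-and-set pattern; a user-space sketch with C11 atomics standing in for the kernel bit helpers (all names invented for the example):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct queue {
	atomic_bool active;		/* stands in for the per-queue active bit */
};

static atomic_uint active_users;	/* stands in for tags->active_queues */

/* First activation of a queue bumps the shared user count; repeat calls see
 * the flag already set and leave the count alone. */
static unsigned int mark_queue_active(struct queue *q)
{
	if (atomic_load(&q->active) || atomic_exchange(&q->active, true))
		return atomic_load(&active_users);

	return atomic_fetch_add(&active_users, 1) + 1;
}

int main(void)
{
	struct queue a = { false }, b = { false };

	printf("users=%u\n", mark_queue_active(&a));	/* 1 */
	printf("users=%u\n", mark_queue_active(&a));	/* still 1 */
	printf("users=%u\n", mark_queue_active(&b));	/* 2 */
	return 0;
}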
|
||||
|
||||
@ -45,9 +67,9 @@ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
|
||||
*/
|
||||
void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
|
||||
{
|
||||
sbitmap_queue_wake_all(tags->bitmap_tags);
|
||||
sbitmap_queue_wake_all(&tags->bitmap_tags);
|
||||
if (include_reserve)
|
||||
sbitmap_queue_wake_all(tags->breserved_tags);
|
||||
sbitmap_queue_wake_all(&tags->breserved_tags);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -57,20 +79,23 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
|
||||
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
|
||||
{
|
||||
struct blk_mq_tags *tags = hctx->tags;
|
||||
struct request_queue *q = hctx->queue;
|
||||
struct blk_mq_tag_set *set = q->tag_set;
|
||||
unsigned int users;
|
||||
|
||||
if (blk_mq_is_shared_tags(hctx->flags)) {
|
||||
struct request_queue *q = hctx->queue;
|
||||
|
||||
if (blk_mq_is_sbitmap_shared(hctx->flags)) {
|
||||
if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE,
|
||||
&q->queue_flags))
|
||||
return;
|
||||
atomic_dec(&set->active_queues_shared_sbitmap);
|
||||
} else {
|
||||
if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
|
||||
return;
|
||||
atomic_dec(&tags->active_queues);
|
||||
}
|
||||
|
||||
users = atomic_dec_return(&tags->active_queues);
|
||||
|
||||
blk_mq_update_wake_batch(tags, users);
|
||||
|
||||
blk_mq_tag_wakeup_all(tags, false);
|
||||
}
|
||||
|
||||
@ -87,6 +112,21 @@ static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
|
||||
return __sbitmap_queue_get(bt);
|
||||
}
|
||||
|
||||
unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
|
||||
unsigned int *offset)
|
||||
{
|
||||
struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
|
||||
struct sbitmap_queue *bt = &tags->bitmap_tags;
|
||||
unsigned long ret;
|
||||
|
||||
if (data->shallow_depth || data->flags & BLK_MQ_REQ_RESERVED ||
|
||||
data->hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
|
||||
return 0;
|
||||
ret = __sbitmap_queue_get_batch(bt, nr_tags, offset);
|
||||
*offset += tags->nr_reserved_tags;
|
||||
return ret;
|
||||
}
|
||||
|
||||
unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
|
||||
{
|
||||
struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
|
||||
@ -101,10 +141,10 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
|
||||
WARN_ON_ONCE(1);
|
||||
return BLK_MQ_NO_TAG;
|
||||
}
|
||||
bt = tags->breserved_tags;
|
||||
bt = &tags->breserved_tags;
|
||||
tag_offset = 0;
|
||||
} else {
|
||||
bt = tags->bitmap_tags;
|
||||
bt = &tags->bitmap_tags;
|
||||
tag_offset = tags->nr_reserved_tags;
|
||||
}
|
||||
|
||||
@ -150,9 +190,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
|
||||
data->ctx);
|
||||
tags = blk_mq_tags_from_data(data);
|
||||
if (data->flags & BLK_MQ_REQ_RESERVED)
|
||||
bt = tags->breserved_tags;
|
||||
bt = &tags->breserved_tags;
|
||||
else
|
||||
bt = tags->bitmap_tags;
|
||||
bt = &tags->bitmap_tags;
|
||||
|
||||
/*
|
||||
* If destination hw queue is changed, fake wake up on
|
||||
@ -186,16 +226,23 @@ void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
|
||||
const int real_tag = tag - tags->nr_reserved_tags;
|
||||
|
||||
BUG_ON(real_tag >= tags->nr_tags);
|
||||
sbitmap_queue_clear(tags->bitmap_tags, real_tag, ctx->cpu);
|
||||
sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu);
|
||||
} else {
|
||||
BUG_ON(tag >= tags->nr_reserved_tags);
|
||||
sbitmap_queue_clear(tags->breserved_tags, tag, ctx->cpu);
|
||||
sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu);
|
||||
}
|
||||
}
|
||||
|
||||
void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags)
|
||||
{
|
||||
sbitmap_queue_clear_batch(&tags->bitmap_tags, tags->nr_reserved_tags,
|
||||
tag_array, nr_tags);
|
||||
}
|
||||
|
||||
struct bt_iter_data {
|
||||
struct blk_mq_hw_ctx *hctx;
|
||||
busy_iter_fn *fn;
|
||||
struct request_queue *q;
|
||||
busy_tag_iter_fn *fn;
|
||||
void *data;
|
||||
bool reserved;
|
||||
};
|
||||
@ -208,7 +255,7 @@ static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags,
|
||||
|
||||
spin_lock_irqsave(&tags->lock, flags);
|
||||
rq = tags->rqs[bitnr];
|
||||
if (!rq || rq->tag != bitnr || !refcount_inc_not_zero(&rq->ref))
|
||||
if (!rq || rq->tag != bitnr || !req_ref_inc_not_zero(rq))
|
||||
rq = NULL;
|
||||
spin_unlock_irqrestore(&tags->lock, flags);
|
||||
return rq;
|
||||
@ -218,11 +265,18 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
|
||||
{
|
||||
struct bt_iter_data *iter_data = data;
|
||||
struct blk_mq_hw_ctx *hctx = iter_data->hctx;
|
||||
struct blk_mq_tags *tags = hctx->tags;
|
||||
struct request_queue *q = iter_data->q;
|
||||
struct blk_mq_tag_set *set = q->tag_set;
|
||||
bool reserved = iter_data->reserved;
|
||||
struct blk_mq_tags *tags;
|
||||
struct request *rq;
|
||||
bool ret = true;
|
||||
|
||||
if (blk_mq_is_shared_tags(set->flags))
|
||||
tags = set->shared_tags;
|
||||
else
|
||||
tags = hctx->tags;
|
||||
|
||||
if (!reserved)
|
||||
bitnr += tags->nr_reserved_tags;
|
||||
/*
|
||||
@ -233,8 +287,8 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
|
||||
if (!rq)
|
||||
return true;
|
||||
|
||||
if (rq->q == hctx->queue && rq->mq_hctx == hctx)
|
||||
ret = iter_data->fn(hctx, rq, iter_data->data, reserved);
|
||||
if (rq->q == q && (!hctx || rq->mq_hctx == hctx))
|
||||
ret = iter_data->fn(rq, iter_data->data, reserved);
|
||||
blk_mq_put_rq_ref(rq);
|
||||
return ret;
|
||||
}
|
||||
@ -242,6 +296,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
|
||||
/**
|
||||
* bt_for_each - iterate over the requests associated with a hardware queue
|
||||
* @hctx: Hardware queue to examine.
|
||||
* @q: Request queue to examine.
|
||||
* @bt: sbitmap to examine. This is either the breserved_tags member
|
||||
* or the bitmap_tags member of struct blk_mq_tags.
|
||||
* @fn: Pointer to the function that will be called for each request
|
||||
@ -253,14 +308,16 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
|
||||
* @reserved: Indicates whether @bt is the breserved_tags member or the
|
||||
* bitmap_tags member of struct blk_mq_tags.
|
||||
*/
|
||||
static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt,
|
||||
busy_iter_fn *fn, void *data, bool reserved)
|
||||
static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct request_queue *q,
|
||||
struct sbitmap_queue *bt, busy_tag_iter_fn *fn,
|
||||
void *data, bool reserved)
|
||||
{
|
||||
struct bt_iter_data iter_data = {
|
||||
.hctx = hctx,
|
||||
.fn = fn,
|
||||
.data = data,
|
||||
.reserved = reserved,
|
||||
.q = q,
|
||||
};
|
||||
|
||||
sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data);
|
||||
@ -340,9 +397,9 @@ static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags,
|
||||
WARN_ON_ONCE(flags & BT_TAG_ITER_RESERVED);
|
||||
|
||||
if (tags->nr_reserved_tags)
|
||||
bt_tags_for_each(tags, tags->breserved_tags, fn, priv,
|
||||
bt_tags_for_each(tags, &tags->breserved_tags, fn, priv,
|
||||
flags | BT_TAG_ITER_RESERVED);
|
||||
bt_tags_for_each(tags, tags->bitmap_tags, fn, priv, flags);
|
||||
bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, flags);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -379,9 +436,12 @@ void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
|
||||
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
|
||||
busy_tag_iter_fn *fn, void *priv)
|
||||
{
|
||||
int i;
|
||||
unsigned int flags = tagset->flags;
|
||||
int i, nr_tags;
|
||||
|
||||
for (i = 0; i < tagset->nr_hw_queues; i++) {
|
||||
nr_tags = blk_mq_is_shared_tags(flags) ? 1 : tagset->nr_hw_queues;
|
||||
|
||||
for (i = 0; i < nr_tags; i++) {
|
||||
if (tagset->tags && tagset->tags[i])
|
||||
__blk_mq_all_tag_iter(tagset->tags[i], fn, priv,
|
||||
BT_TAG_ITER_STARTED);
|
||||
@ -434,12 +494,9 @@ EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request);
|
||||
* called for all requests on all queues that share that tag set and not only
|
||||
* for requests associated with @q.
|
||||
*/
|
||||
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
|
||||
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn,
|
||||
void *priv)
|
||||
{
|
||||
struct blk_mq_hw_ctx *hctx;
|
||||
int i;
|
||||
|
||||
/*
|
||||
* __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx
|
||||
* while the queue is frozen. So we can use q_usage_counter to avoid
|
||||
@ -448,19 +505,34 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
|
||||
if (!percpu_ref_tryget(&q->q_usage_counter))
|
||||
return;
|
||||
|
||||
queue_for_each_hw_ctx(q, hctx, i) {
|
||||
struct blk_mq_tags *tags = hctx->tags;
|
||||
|
||||
/*
|
||||
* If no software queues are currently mapped to this
|
||||
* hardware queue, there's nothing to check
|
||||
*/
|
||||
if (!blk_mq_hw_queue_mapped(hctx))
|
||||
continue;
|
||||
if (blk_mq_is_shared_tags(q->tag_set->flags)) {
|
||||
struct blk_mq_tags *tags = q->tag_set->shared_tags;
|
||||
struct sbitmap_queue *bresv = &tags->breserved_tags;
|
||||
struct sbitmap_queue *btags = &tags->bitmap_tags;
|
||||
|
||||
if (tags->nr_reserved_tags)
|
||||
bt_for_each(hctx, tags->breserved_tags, fn, priv, true);
|
||||
bt_for_each(hctx, tags->bitmap_tags, fn, priv, false);
|
||||
bt_for_each(NULL, q, bresv, fn, priv, true);
|
||||
bt_for_each(NULL, q, btags, fn, priv, false);
|
||||
} else {
|
||||
struct blk_mq_hw_ctx *hctx;
|
||||
int i;
|
||||
|
||||
queue_for_each_hw_ctx(q, hctx, i) {
|
||||
struct blk_mq_tags *tags = hctx->tags;
|
||||
struct sbitmap_queue *bresv = &tags->breserved_tags;
|
||||
struct sbitmap_queue *btags = &tags->bitmap_tags;
|
||||
|
||||
/*
|
||||
* If no software queues are currently mapped to this
|
||||
* hardware queue, there's nothing to check
|
||||
*/
|
||||
if (!blk_mq_hw_queue_mapped(hctx))
|
||||
continue;
|
||||
|
||||
if (tags->nr_reserved_tags)
|
||||
bt_for_each(hctx, q, bresv, fn, priv, true);
|
||||
bt_for_each(hctx, q, btags, fn, priv, false);
|
||||
}
|
||||
}
|
||||
blk_queue_exit(q);
|
||||
}
|
||||
@ -492,56 +564,10 @@ int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags,
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
|
||||
int node, int alloc_policy)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = blk_mq_init_bitmaps(&tags->__bitmap_tags,
|
||||
&tags->__breserved_tags,
|
||||
tags->nr_tags, tags->nr_reserved_tags,
|
||||
node, alloc_policy);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
tags->bitmap_tags = &tags->__bitmap_tags;
|
||||
tags->breserved_tags = &tags->__breserved_tags;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set)
|
||||
{
|
||||
int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
|
||||
int i, ret;
|
||||
|
||||
ret = blk_mq_init_bitmaps(&set->__bitmap_tags, &set->__breserved_tags,
|
||||
set->queue_depth, set->reserved_tags,
|
||||
set->numa_node, alloc_policy);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
for (i = 0; i < set->nr_hw_queues; i++) {
|
||||
struct blk_mq_tags *tags = set->tags[i];
|
||||
|
||||
tags->bitmap_tags = &set->__bitmap_tags;
|
||||
tags->breserved_tags = &set->__breserved_tags;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set)
|
||||
{
|
||||
sbitmap_queue_free(&set->__bitmap_tags);
|
||||
sbitmap_queue_free(&set->__breserved_tags);
|
||||
}
|
||||
|
||||
struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
|
||||
unsigned int reserved_tags,
|
||||
int node, unsigned int flags)
|
||||
int node, int alloc_policy)
|
||||
{
|
||||
int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(flags);
|
||||
struct blk_mq_tags *tags;
|
||||
|
||||
if (total_tags > BLK_MQ_TAG_MAX) {
|
||||
@ -557,22 +583,19 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
|
||||
tags->nr_reserved_tags = reserved_tags;
|
||||
spin_lock_init(&tags->lock);
|
||||
|
||||
if (blk_mq_is_sbitmap_shared(flags))
|
||||
return tags;
|
||||
|
||||
if (blk_mq_init_bitmap_tags(tags, node, alloc_policy) < 0) {
|
||||
if (blk_mq_init_bitmaps(&tags->bitmap_tags, &tags->breserved_tags,
|
||||
total_tags, reserved_tags, node,
|
||||
alloc_policy) < 0) {
|
||||
kfree(tags);
|
||||
return NULL;
|
||||
}
|
||||
return tags;
|
||||
}
|
||||
|
||||
void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags)
|
||||
void blk_mq_free_tags(struct blk_mq_tags *tags)
|
||||
{
|
||||
if (!blk_mq_is_sbitmap_shared(flags)) {
|
||||
sbitmap_queue_free(tags->bitmap_tags);
|
||||
sbitmap_queue_free(tags->breserved_tags);
|
||||
}
|
||||
sbitmap_queue_free(&tags->bitmap_tags);
|
||||
sbitmap_queue_free(&tags->breserved_tags);
|
||||
kfree(tags);
|
||||
}
|
||||
|
||||
@ -592,7 +615,6 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
|
||||
if (tdepth > tags->nr_tags) {
|
||||
struct blk_mq_tag_set *set = hctx->queue->tag_set;
|
||||
struct blk_mq_tags *new;
|
||||
bool ret;
|
||||
|
||||
if (!can_grow)
|
||||
return -EINVAL;
|
||||
@ -604,34 +626,42 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
|
||||
if (tdepth > MAX_SCHED_RQ)
|
||||
return -EINVAL;
|
||||
|
||||
new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth,
|
||||
tags->nr_reserved_tags, set->flags);
|
||||
/*
|
||||
* Only the sbitmap needs resizing since we allocated the max
|
||||
* initially.
|
||||
*/
|
||||
if (blk_mq_is_shared_tags(set->flags))
|
||||
return 0;
|
||||
|
||||
new = blk_mq_alloc_map_and_rqs(set, hctx->queue_num, tdepth);
|
||||
if (!new)
|
||||
return -ENOMEM;
|
||||
ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);
|
||||
if (ret) {
|
||||
blk_mq_free_rq_map(new, set->flags);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
blk_mq_free_rqs(set, *tagsptr, hctx->queue_num);
|
||||
blk_mq_free_rq_map(*tagsptr, set->flags);
|
||||
blk_mq_free_map_and_rqs(set, *tagsptr, hctx->queue_num);
|
||||
*tagsptr = new;
|
||||
} else {
|
||||
/*
|
||||
* Don't need (or can't) update reserved tags here, they
|
||||
* remain static and should never need resizing.
|
||||
*/
|
||||
sbitmap_queue_resize(tags->bitmap_tags,
|
||||
sbitmap_queue_resize(&tags->bitmap_tags,
|
||||
tdepth - tags->nr_reserved_tags);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int size)
|
||||
void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size)
|
||||
{
|
||||
sbitmap_queue_resize(&set->__bitmap_tags, size - set->reserved_tags);
|
||||
struct blk_mq_tags *tags = set->shared_tags;
|
||||
|
||||
sbitmap_queue_resize(&tags->bitmap_tags, size - set->reserved_tags);
|
||||
}
|
||||
|
||||
void blk_mq_tag_update_sched_shared_tags(struct request_queue *q)
|
||||
{
|
||||
sbitmap_queue_resize(&q->sched_shared_tags->bitmap_tags,
|
||||
q->nr_requests - q->tag_set->reserved_tags);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -2,55 +2,33 @@
|
||||
#ifndef INT_BLK_MQ_TAG_H
|
||||
#define INT_BLK_MQ_TAG_H
|
||||
|
||||
/*
|
||||
* Tag address space map.
|
||||
*/
|
||||
struct blk_mq_tags {
|
||||
unsigned int nr_tags;
|
||||
unsigned int nr_reserved_tags;
|
||||
|
||||
atomic_t active_queues;
|
||||
|
||||
struct sbitmap_queue *bitmap_tags;
|
||||
struct sbitmap_queue *breserved_tags;
|
||||
|
||||
struct sbitmap_queue __bitmap_tags;
|
||||
struct sbitmap_queue __breserved_tags;
|
||||
|
||||
struct request **rqs;
|
||||
struct request **static_rqs;
|
||||
struct list_head page_list;
|
||||
|
||||
/*
|
||||
* used to clear request reference in rqs[] before freeing one
|
||||
* request pool
|
||||
*/
|
||||
spinlock_t lock;
|
||||
};
|
||||
struct blk_mq_alloc_data;
|
||||
|
||||
extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,
|
||||
unsigned int reserved_tags,
|
||||
int node, unsigned int flags);
|
||||
extern void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags);
|
||||
int node, int alloc_policy);
|
||||
extern void blk_mq_free_tags(struct blk_mq_tags *tags);
|
||||
extern int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags,
|
||||
struct sbitmap_queue *breserved_tags,
|
||||
unsigned int queue_depth,
|
||||
unsigned int reserved,
|
||||
int node, int alloc_policy);
|
||||
|
||||
extern int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set);
|
||||
extern void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set);
|
||||
extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
|
||||
unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
|
||||
unsigned int *offset);
|
||||
extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
|
||||
unsigned int tag);
|
||||
void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags);
|
||||
extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
|
||||
struct blk_mq_tags **tags,
|
||||
unsigned int depth, bool can_grow);
|
||||
extern void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set,
|
||||
extern void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set,
|
||||
unsigned int size);
|
||||
extern void blk_mq_tag_update_sched_shared_tags(struct request_queue *q);
|
||||
|
||||
extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
|
||||
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
|
||||
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn,
|
||||
void *priv);
|
||||
void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
|
||||
void *priv);
|
||||
|
block/blk-mq.c (1818 changed lines): file diff suppressed because it is too large
block/blk-mq.h (133 changed lines):
@ -25,18 +25,14 @@ struct blk_mq_ctx {
|
||||
unsigned short index_hw[HCTX_MAX_TYPES];
|
||||
struct blk_mq_hw_ctx *hctxs[HCTX_MAX_TYPES];
|
||||
|
||||
/* incremented at dispatch time */
|
||||
unsigned long rq_dispatched[2];
|
||||
unsigned long rq_merged;
|
||||
|
||||
/* incremented at completion time */
|
||||
unsigned long ____cacheline_aligned_in_smp rq_completed[2];
|
||||
|
||||
struct request_queue *queue;
|
||||
struct blk_mq_ctxs *ctxs;
|
||||
struct kobject kobj;
|
||||
} ____cacheline_aligned_in_smp;
|
||||
|
||||
void blk_mq_submit_bio(struct bio *bio);
|
||||
int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
|
||||
unsigned int flags);
|
||||
void blk_mq_exit_queue(struct request_queue *q);
|
||||
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
|
||||
void blk_mq_wake_waiters(struct request_queue *q);
|
||||
@ -54,15 +50,12 @@ void blk_mq_put_rq_ref(struct request *rq);
|
||||
*/
|
||||
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
|
||||
unsigned int hctx_idx);
|
||||
void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags);
|
||||
struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
|
||||
unsigned int hctx_idx,
|
||||
unsigned int nr_tags,
|
||||
unsigned int reserved_tags,
|
||||
unsigned int flags);
|
||||
int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
|
||||
unsigned int hctx_idx, unsigned int depth);
|
||||
|
||||
void blk_mq_free_rq_map(struct blk_mq_tags *tags);
|
||||
struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
|
||||
unsigned int hctx_idx, unsigned int depth);
|
||||
void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
|
||||
struct blk_mq_tags *tags,
|
||||
unsigned int hctx_idx);
|
||||
/*
|
||||
* Internal helpers for request insertion into sw queues
|
||||
*/
|
||||
@ -72,9 +65,6 @@ void blk_mq_request_bypass_insert(struct request *rq, bool at_head,
|
||||
bool run_queue);
|
||||
void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
|
||||
struct list_head *list);
|
||||
|
||||
/* Used by blk_insert_cloned_request() to issue request directly */
|
||||
blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last);
|
||||
void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
|
||||
struct list_head *list);
|
||||
|
||||
@ -96,6 +86,20 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *
|
||||
return q->queue_hw_ctx[q->tag_set->map[type].mq_map[cpu]];
|
||||
}
|
||||
|
||||
static inline enum hctx_type blk_mq_get_hctx_type(unsigned int flags)
|
||||
{
|
||||
enum hctx_type type = HCTX_TYPE_DEFAULT;
|
||||
|
||||
/*
|
||||
* The caller ensures that if REQ_POLLED, poll must be enabled.
|
||||
*/
|
||||
if (flags & REQ_POLLED)
|
||||
type = HCTX_TYPE_POLL;
|
||||
else if ((flags & REQ_OP_MASK) == REQ_OP_READ)
|
||||
type = HCTX_TYPE_READ;
|
||||
return type;
|
||||
}
|
||||
|
||||
/*
|
||||
* blk_mq_map_queue() - map (cmd_flags,type) to hardware queue
|
||||
* @q: request queue
|
||||
@ -106,17 +110,7 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
|
||||
unsigned int flags,
|
||||
struct blk_mq_ctx *ctx)
|
||||
{
|
||||
enum hctx_type type = HCTX_TYPE_DEFAULT;
|
||||
|
||||
/*
|
||||
* The caller ensure that if REQ_HIPRI, poll must be enabled.
|
||||
*/
|
||||
if (flags & REQ_HIPRI)
|
||||
type = HCTX_TYPE_POLL;
|
||||
else if ((flags & REQ_OP_MASK) == REQ_OP_READ)
|
||||
type = HCTX_TYPE_READ;
|
||||
|
||||
return ctx->hctxs[type];
|
||||
return ctx->hctxs[blk_mq_get_hctx_type(flags)];
|
||||
}
|
||||
|
||||
/*
|
||||
@ -128,6 +122,8 @@ extern int __blk_mq_register_dev(struct device *dev, struct request_queue *q);
|
||||
extern int blk_mq_sysfs_register(struct request_queue *q);
|
||||
extern void blk_mq_sysfs_unregister(struct request_queue *q);
|
||||
extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
|
||||
void blk_mq_free_plug_rqs(struct blk_plug *plug);
|
||||
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
|
||||
|
||||
void blk_mq_cancel_work_sync(struct request_queue *q);
|
||||
|
||||
@ -156,23 +152,27 @@ struct blk_mq_alloc_data {
|
||||
blk_mq_req_flags_t flags;
|
||||
unsigned int shallow_depth;
|
||||
unsigned int cmd_flags;
|
||||
req_flags_t rq_flags;
|
||||
|
||||
/* allocate multiple requests/tags in one go */
|
||||
unsigned int nr_tags;
|
||||
struct request **cached_rq;
|
||||
|
||||
/* input & output parameter */
|
||||
struct blk_mq_ctx *ctx;
|
||||
struct blk_mq_hw_ctx *hctx;
|
||||
};
|
||||
|
||||
static inline bool blk_mq_is_sbitmap_shared(unsigned int flags)
|
||||
static inline bool blk_mq_is_shared_tags(unsigned int flags)
|
||||
{
|
||||
return flags & BLK_MQ_F_TAG_HCTX_SHARED;
|
||||
}
|
||||
|
||||
static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data)
|
||||
{
|
||||
if (data->q->elevator)
|
||||
return data->hctx->sched_tags;
|
||||
|
||||
return data->hctx->tags;
|
||||
if (!(data->rq_flags & RQF_ELV))
|
||||
return data->hctx->tags;
|
||||
return data->hctx->sched_tags;
|
||||
}
|
||||
|
||||
static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx)
|
||||
@ -222,24 +222,30 @@ static inline int blk_mq_get_rq_budget_token(struct request *rq)
|
||||
|
||||
static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx)
|
||||
{
|
||||
if (blk_mq_is_sbitmap_shared(hctx->flags))
|
||||
atomic_inc(&hctx->queue->nr_active_requests_shared_sbitmap);
|
||||
if (blk_mq_is_shared_tags(hctx->flags))
|
||||
atomic_inc(&hctx->queue->nr_active_requests_shared_tags);
|
||||
else
|
||||
atomic_inc(&hctx->nr_active);
|
||||
}
|
||||
|
||||
static inline void __blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx,
|
||||
int val)
|
||||
{
|
||||
if (blk_mq_is_shared_tags(hctx->flags))
|
||||
atomic_sub(val, &hctx->queue->nr_active_requests_shared_tags);
|
||||
else
|
||||
atomic_sub(val, &hctx->nr_active);
|
||||
}
|
||||
|
||||
static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx)
|
||||
{
|
||||
if (blk_mq_is_sbitmap_shared(hctx->flags))
|
||||
atomic_dec(&hctx->queue->nr_active_requests_shared_sbitmap);
|
||||
else
|
||||
atomic_dec(&hctx->nr_active);
|
||||
__blk_mq_sub_active_requests(hctx, 1);
|
||||
}
|
||||
|
||||
static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx)
|
||||
{
|
||||
if (blk_mq_is_sbitmap_shared(hctx->flags))
|
||||
return atomic_read(&hctx->queue->nr_active_requests_shared_sbitmap);
|
||||
if (blk_mq_is_shared_tags(hctx->flags))
|
||||
return atomic_read(&hctx->queue->nr_active_requests_shared_tags);
|
||||
return atomic_read(&hctx->nr_active);
|
||||
}
|
||||
static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
|
||||
@ -262,7 +268,20 @@ static inline void blk_mq_put_driver_tag(struct request *rq)
|
||||
__blk_mq_put_driver_tag(rq->mq_hctx, rq);
|
||||
}
|
||||
|
||||
bool blk_mq_get_driver_tag(struct request *rq);
|
||||
bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq);
|
||||
|
||||
static inline bool blk_mq_get_driver_tag(struct request *rq)
|
||||
{
|
||||
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
|
||||
|
||||
if (rq->tag != BLK_MQ_NO_TAG &&
|
||||
!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
|
||||
hctx->tags->rqs[rq->tag] = rq;
|
||||
return true;
|
||||
}
|
||||
|
||||
return __blk_mq_get_driver_tag(hctx, rq);
|
||||
}
|
||||
|
||||
static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
|
||||
{
|
||||
@ -333,19 +352,18 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
|
||||
if (bt->sb.depth == 1)
|
||||
return true;
|
||||
|
||||
if (blk_mq_is_sbitmap_shared(hctx->flags)) {
|
||||
if (blk_mq_is_shared_tags(hctx->flags)) {
|
||||
struct request_queue *q = hctx->queue;
|
||||
struct blk_mq_tag_set *set = q->tag_set;
|
||||
|
||||
if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
|
||||
return true;
|
||||
users = atomic_read(&set->active_queues_shared_sbitmap);
|
||||
} else {
|
||||
if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
|
||||
return true;
|
||||
users = atomic_read(&hctx->tags->active_queues);
|
||||
}
|
||||
|
||||
users = atomic_read(&hctx->tags->active_queues);
|
||||
|
||||
if (!users)
|
||||
return true;
|
||||
|
||||
@ -356,5 +374,24 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
|
||||
return __blk_mq_active_requests(hctx) < depth;
|
||||
}
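hctx_may_queue() above now reads a single active_queues counter whether or not the tags are shared. The fairness check boils down to "allow each queue roughly total_depth / active_users in-flight tags"; a hedged sketch of that arithmetic (the kernel's exact rounding and lower bound differ):

#include <stdbool.h>
#include <stdio.h>

/* Each of 'active_users' queues sharing 'total_depth' tags may keep roughly
 * an equal share in flight (at least one). */
static bool may_queue(unsigned int total_depth, unsigned int active_users,
		      unsigned int my_inflight)
{
	unsigned int share;

	if (!active_users)
		return true;

	share = total_depth / active_users;
	if (share < 1)
		share = 1;

	return my_inflight < share;
}

int main(void)
{
	printf("%d\n", may_queue(128, 4, 31));	/* 1: under the 32-tag share */
	printf("%d\n", may_queue(128, 4, 32));	/* 0: at the share limit */
	return 0;
}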
|
||||
|
||||
/* run the code block in @dispatch_ops with rcu/srcu read lock held */
|
||||
#define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops) \
|
||||
do { \
|
||||
if (!blk_queue_has_srcu(q)) { \
|
||||
rcu_read_lock(); \
|
||||
(dispatch_ops); \
|
||||
rcu_read_unlock(); \
|
||||
} else { \
|
||||
int srcu_idx; \
|
||||
\
|
||||
might_sleep_if(check_sleep); \
|
||||
srcu_idx = srcu_read_lock((q)->srcu); \
|
||||
(dispatch_ops); \
|
||||
srcu_read_unlock((q)->srcu, srcu_idx); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define blk_mq_run_dispatch_ops(q, dispatch_ops) \
|
||||
__blk_mq_run_dispatch_ops(q, true, dispatch_ops) \
|
||||
|
||||
#endif
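__blk_mq_run_dispatch_ops() above wraps a caller-supplied statement in either an RCU or an SRCU read-side section depending on the queue. The construct itself, a do/while(0) macro that takes the protected block as an argument, can be sketched in plain C with pthread mutexes standing in for RCU/SRCU (illustrative only, not the kernel implementation):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t fast_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t sleepable_lock = PTHREAD_MUTEX_INITIALIZER;

/* Run 'dispatch_ops' under whichever protection 'blocking' selects. The
 * do/while(0) wrapper keeps the macro usable as a single statement. */
#define run_dispatch_ops(blocking, dispatch_ops)			\
do {									\
	pthread_mutex_t *__lock = (blocking) ? &sleepable_lock		\
					     : &fast_lock;		\
	pthread_mutex_lock(__lock);					\
	dispatch_ops;							\
	pthread_mutex_unlock(__lock);					\
} while (0)

int main(void)
{
	int dispatched = 0;

	run_dispatch_ops(false, printf("fast path, n=%d\n", ++dispatched));
	run_dispatch_ops(true, printf("blocking path, n=%d\n", ++dispatched));
	return 0;
}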
|
||||
|
@ -189,9 +189,10 @@ static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
|
||||
* BIO_TRACKED lets controllers know that a bio went through the
|
||||
* normal rq_qos path.
|
||||
*/
|
||||
bio_set_flag(bio, BIO_TRACKED);
|
||||
if (q->rq_qos)
|
||||
if (q->rq_qos) {
|
||||
bio_set_flag(bio, BIO_TRACKED);
|
||||
__rq_qos_throttle(q->rq_qos, bio);
|
||||
}
|
||||
}
|
||||
|
||||
static inline void rq_qos_track(struct request_queue *q, struct request *rq,
|
||||
|
@ -15,7 +15,7 @@
|
||||
struct blk_queue_stats {
|
||||
struct list_head callbacks;
|
||||
spinlock_t lock;
|
||||
bool enable_accounting;
|
||||
int accounting;
|
||||
};
|
||||
|
||||
void blk_rq_stat_init(struct blk_rq_stat *stat)
|
||||
@ -161,7 +161,7 @@ void blk_stat_remove_callback(struct request_queue *q,
|
||||
|
||||
spin_lock_irqsave(&q->stats->lock, flags);
|
||||
list_del_rcu(&cb->list);
|
||||
if (list_empty(&q->stats->callbacks) && !q->stats->enable_accounting)
|
||||
if (list_empty(&q->stats->callbacks) && !q->stats->accounting)
|
||||
blk_queue_flag_clear(QUEUE_FLAG_STATS, q);
|
||||
spin_unlock_irqrestore(&q->stats->lock, flags);
|
||||
|
||||
@ -184,13 +184,24 @@ void blk_stat_free_callback(struct blk_stat_callback *cb)
|
||||
call_rcu(&cb->rcu, blk_stat_free_callback_rcu);
|
||||
}
|
||||
|
||||
void blk_stat_disable_accounting(struct request_queue *q)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&q->stats->lock, flags);
|
||||
if (!--q->stats->accounting)
|
||||
blk_queue_flag_clear(QUEUE_FLAG_STATS, q);
|
||||
spin_unlock_irqrestore(&q->stats->lock, flags);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blk_stat_disable_accounting);
|
||||
|
||||
void blk_stat_enable_accounting(struct request_queue *q)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
spin_lock_irqsave(&q->stats->lock, flags);
|
||||
q->stats->enable_accounting = true;
|
||||
blk_queue_flag_set(QUEUE_FLAG_STATS, q);
|
||||
if (!q->stats->accounting++)
|
||||
blk_queue_flag_set(QUEUE_FLAG_STATS, q);
|
||||
spin_unlock_irqrestore(&q->stats->lock, flags);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blk_stat_enable_accounting);
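The stats code above replaces the boolean enable_accounting with a counted accounting field, so QUEUE_FLAG_STATS is set by the first user and cleared only by the last. A small stand-alone sketch of that counted enable/disable idiom (names and locking are invented for the example):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int accounting;
static bool stats_flag;

static void enable_accounting(void)
{
	pthread_mutex_lock(&lock);
	if (!accounting++)
		stats_flag = true;	/* first user turns stats on */
	pthread_mutex_unlock(&lock);
}

static void disable_accounting(void)
{
	pthread_mutex_lock(&lock);
	if (!--accounting)
		stats_flag = false;	/* last user turns stats off */
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	enable_accounting();
	enable_accounting();
	disable_accounting();
	printf("flag=%d count=%d\n", stats_flag, accounting);	/* 1 1 */
	disable_accounting();
	printf("flag=%d count=%d\n", stats_flag, accounting);	/* 0 0 */
	return 0;
}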
|
||||
@ -205,7 +216,7 @@ struct blk_queue_stats *blk_alloc_queue_stats(void)
|
||||
|
||||
INIT_LIST_HEAD(&stats->callbacks);
|
||||
spin_lock_init(&stats->lock);
|
||||
stats->enable_accounting = false;
|
||||
stats->accounting = 0;
|
||||
|
||||
return stats;
|
||||
}
|
||||
@ -219,3 +230,21 @@ void blk_free_queue_stats(struct blk_queue_stats *stats)
|
||||
|
||||
kfree(stats);
|
||||
}
|
||||
|
||||
bool blk_stats_alloc_enable(struct request_queue *q)
|
||||
{
|
||||
struct blk_rq_stat *poll_stat;
|
||||
|
||||
poll_stat = kcalloc(BLK_MQ_POLL_STATS_BKTS, sizeof(*poll_stat),
|
||||
GFP_ATOMIC);
|
||||
if (!poll_stat)
|
||||
return false;
|
||||
|
||||
if (cmpxchg(&q->poll_stat, NULL, poll_stat) != NULL) {
|
||||
kfree(poll_stat);
|
||||
return true;
|
||||
}
|
||||
|
||||
blk_stat_add_callback(q, q->poll_cb);
|
||||
return false;
|
||||
}
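blk_stats_alloc_enable() above publishes the lazily allocated poll_stat array with cmpxchg(), so a racing caller simply frees its duplicate. The same allocate-then-compare-and-swap publication, sketched with C11 atomics (illustrative names, not kernel API):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

static _Atomic(int *) shared_stat;

/* Allocate a private copy, then try to install it; whoever installs first
 * wins and the loser frees its copy and uses the published one. */
static int *install_stat(void)
{
	int *mine = calloc(16, sizeof(*mine));
	int *expected = NULL;

	if (!mine)
		return NULL;

	if (!atomic_compare_exchange_strong(&shared_stat, &expected, mine)) {
		free(mine);		/* lost the race */
		return expected;	/* someone else's copy is live */
	}
	return mine;			/* won the race: our copy is published */
}

int main(void)
{
	int *a = install_stat();
	int *b = install_stat();

	printf("same pointer: %d\n", a == b);	/* 1 */
	return 0;
}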
|
||||
|
@ -64,11 +64,13 @@ struct blk_stat_callback {
|
||||
|
||||
struct blk_queue_stats *blk_alloc_queue_stats(void);
|
||||
void blk_free_queue_stats(struct blk_queue_stats *);
|
||||
bool blk_stats_alloc_enable(struct request_queue *q);
|
||||
|
||||
void blk_stat_add(struct request *rq, u64 now);
|
||||
|
||||
/* record time/size info in request but not add a callback */
|
||||
void blk_stat_enable_accounting(struct request_queue *q);
|
||||
void blk_stat_disable_accounting(struct request_queue *q);
|
||||
|
||||
/**
|
||||
* blk_stat_alloc_callback() - Allocate a block statistics callback.
|
||||
|
@ -16,7 +16,9 @@
|
||||
#include "blk.h"
|
||||
#include "blk-mq.h"
|
||||
#include "blk-mq-debugfs.h"
|
||||
#include "blk-mq-sched.h"
|
||||
#include "blk-wbt.h"
|
||||
#include "blk-throttle.h"
|
||||
|
||||
struct queue_sysfs_entry {
|
||||
struct attribute attr;
|
||||
@ -432,26 +434,11 @@ static ssize_t queue_poll_show(struct request_queue *q, char *page)
|
||||
static ssize_t queue_poll_store(struct request_queue *q, const char *page,
|
||||
size_t count)
|
||||
{
|
||||
unsigned long poll_on;
|
||||
ssize_t ret;
|
||||
|
||||
if (!q->tag_set || q->tag_set->nr_maps <= HCTX_TYPE_POLL ||
|
||||
!q->tag_set->map[HCTX_TYPE_POLL].nr_queues)
|
||||
if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
|
||||
return -EINVAL;
|
||||
|
||||
ret = queue_var_store(&poll_on, page, count);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
|
||||
if (poll_on) {
|
||||
blk_queue_flag_set(QUEUE_FLAG_POLL, q);
|
||||
} else {
|
||||
blk_mq_freeze_queue(q);
|
||||
blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
|
||||
blk_mq_unfreeze_queue(q);
|
||||
}
|
||||
|
||||
return ret;
|
||||
pr_info_ratelimited("writes to the poll attribute are ignored.\n");
|
||||
pr_info_ratelimited("please use driver specific parameters instead.\n");
|
||||
return count;
|
||||
}
|
||||
|
||||
static ssize_t queue_io_timeout_show(struct request_queue *q, char *page)
|
||||
@ -748,7 +735,8 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head)
|
||||
{
|
||||
struct request_queue *q = container_of(rcu_head, struct request_queue,
|
||||
rcu_head);
|
||||
kmem_cache_free(blk_requestq_cachep, q);
|
||||
|
||||
kmem_cache_free(blk_get_queue_kmem_cache(blk_queue_has_srcu(q)), q);
|
||||
}
|
||||
|
||||
/* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */
|
||||
@ -761,7 +749,7 @@ static void blk_exit_queue(struct request_queue *q)
|
||||
*/
|
||||
if (q->elevator) {
|
||||
ioc_clear_queue(q);
|
||||
__elevator_exit(q, q->elevator);
|
||||
elevator_exit(q);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -799,14 +787,15 @@ static void blk_release_queue(struct kobject *kobj)
|
||||
|
||||
might_sleep();
|
||||
|
||||
if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags))
|
||||
if (q->poll_stat)
|
||||
blk_stat_remove_callback(q, q->poll_cb);
|
||||
blk_stat_free_callback(q->poll_cb);
|
||||
|
||||
blk_free_queue_stats(q->stats);
|
||||
|
||||
blk_exit_queue(q);
|
||||
|
||||
blk_free_queue_stats(q->stats);
|
||||
kfree(q->poll_stat);
|
||||
|
||||
blk_queue_free_zone_bitmaps(q);
|
||||
|
||||
if (queue_is_mq(q))
|
||||
@ -822,6 +811,9 @@ static void blk_release_queue(struct kobject *kobj)
|
||||
|
||||
bioset_exit(&q->bio_split);
|
||||
|
||||
if (blk_queue_has_srcu(q))
|
||||
cleanup_srcu_struct(q->srcu);
|
||||
|
||||
ida_simple_remove(&blk_queue_ida, q->id);
|
||||
call_rcu(&q->rcu_head, blk_free_queue_rcu);
|
||||
}
|
||||
@ -877,16 +869,15 @@ int blk_register_queue(struct gendisk *disk)
|
||||
}
|
||||
|
||||
mutex_lock(&q->sysfs_lock);
|
||||
|
||||
ret = disk_register_independent_access_ranges(disk, NULL);
|
||||
if (ret)
|
||||
goto put_dev;
|
||||
|
||||
if (q->elevator) {
|
||||
ret = elv_register_queue(q, false);
|
||||
if (ret) {
|
||||
mutex_unlock(&q->sysfs_lock);
|
||||
mutex_unlock(&q->sysfs_dir_lock);
|
||||
kobject_del(&q->kobj);
|
||||
blk_trace_remove_sysfs(dev);
|
||||
kobject_put(&dev->kobj);
|
||||
return ret;
|
||||
}
|
||||
if (ret)
|
||||
goto put_dev;
|
||||
}
|
||||
|
||||
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
|
||||
@ -899,7 +890,6 @@ int blk_register_queue(struct gendisk *disk)
|
||||
kobject_uevent(&q->elevator->kobj, KOBJ_ADD);
|
||||
mutex_unlock(&q->sysfs_lock);
|
||||
|
||||
ret = 0;
|
||||
unlock:
|
||||
mutex_unlock(&q->sysfs_dir_lock);
|
||||
|
||||
@ -917,6 +907,16 @@ int blk_register_queue(struct gendisk *disk)
|
||||
percpu_ref_switch_to_percpu(&q->q_usage_counter);
|
||||
}
|
||||
|
||||
return ret;
|
||||
|
||||
put_dev:
|
||||
disk_unregister_independent_access_ranges(disk);
|
||||
mutex_unlock(&q->sysfs_lock);
|
||||
mutex_unlock(&q->sysfs_dir_lock);
|
||||
kobject_del(&q->kobj);
|
||||
blk_trace_remove_sysfs(dev);
|
||||
kobject_put(&dev->kobj);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -962,6 +962,7 @@ void blk_unregister_queue(struct gendisk *disk)
|
||||
mutex_lock(&q->sysfs_lock);
|
||||
if (q->elevator)
|
||||
elv_unregister_queue(q);
|
||||
disk_unregister_independent_access_ranges(disk);
|
||||
mutex_unlock(&q->sysfs_lock);
|
||||
mutex_unlock(&q->sysfs_dir_lock);
|
||||
|
||||
|
@ -13,6 +13,8 @@
|
||||
#include <linux/blk-cgroup.h>
|
||||
#include "blk.h"
|
||||
#include "blk-cgroup-rwstat.h"
|
||||
#include "blk-stat.h"
|
||||
#include "blk-throttle.h"
|
||||
|
||||
/* Max dispatch from a group in 1 round */
|
||||
#define THROTL_GRP_QUANTUM 8
|
||||
@ -37,60 +39,9 @@
|
||||
*/
|
||||
#define LATENCY_FILTERED_HD (1000L) /* 1ms */
|
||||
|
||||
static struct blkcg_policy blkcg_policy_throtl;
|
||||
|
||||
/* A workqueue to queue throttle related work */
|
||||
static struct workqueue_struct *kthrotld_workqueue;
|
||||
|
||||
/*
|
||||
* To implement hierarchical throttling, throtl_grps form a tree and bios
|
||||
* are dispatched upwards level by level until they reach the top and get
|
||||
* issued. When dispatching bios from the children and local group at each
|
||||
* level, if the bios are dispatched into a single bio_list, there's a risk
|
||||
* of a local or child group which can queue many bios at once filling up
|
||||
* the list starving others.
|
||||
*
|
||||
* To avoid such starvation, dispatched bios are queued separately
|
||||
* according to where they came from. When they are again dispatched to
|
||||
* the parent, they're popped in round-robin order so that no single source
|
||||
* hogs the dispatch window.
|
||||
*
|
||||
* throtl_qnode is used to keep the queued bios separated by their sources.
|
||||
* Bios are queued to throtl_qnode which in turn is queued to
|
||||
* throtl_service_queue and then dispatched in round-robin order.
|
||||
*
|
||||
* It's also used to track the reference counts on blkg's. A qnode always
|
||||
* belongs to a throtl_grp and gets queued on itself or the parent, so
|
||||
* incrementing the reference of the associated throtl_grp when a qnode is
|
||||
* queued and decrementing when dequeued is enough to keep the whole blkg
|
||||
* tree pinned while bios are in flight.
|
||||
*/
|
||||
struct throtl_qnode {
|
||||
struct list_head node; /* service_queue->queued[] */
|
||||
struct bio_list bios; /* queued bios */
|
||||
struct throtl_grp *tg; /* tg this qnode belongs to */
|
||||
};
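The comment block above explains why queued bios are kept in per-source qnodes and popped round-robin when handed to the parent, so one bursty source cannot starve the others. A toy model of that round-robin dispatch, with plain arrays standing in for throtl_qnode/bio_list (illustrative only):

#include <stdio.h>

#define NR_SRC	3
#define DEPTH	8

struct src_queue {
	int items[DEPTH];
	int head, tail;		/* simple non-wrapping FIFO for the demo */
};

static void push(struct src_queue *q, int item)
{
	if (q->tail < DEPTH)
		q->items[q->tail++] = item;
}

static int pop(struct src_queue *q, int *item)
{
	if (q->head == q->tail)
		return 0;
	*item = q->items[q->head++];
	return 1;
}

int main(void)
{
	struct src_queue src[NR_SRC] = { 0 };
	int i, item, progress;

	/* source 0 queues a burst, the other sources only one item each */
	for (i = 0; i < 6; i++)
		push(&src[0], 100 + i);
	push(&src[1], 200);
	push(&src[2], 300);

	/* one item per source per pass: 200 and 300 are dispatched in the
	 * first pass instead of waiting behind source 0's burst */
	do {
		progress = 0;
		for (i = 0; i < NR_SRC; i++) {
			if (pop(&src[i], &item)) {
				printf("dispatch %d from src %d\n", item, i);
				progress = 1;
			}
		}
	} while (progress);

	return 0;
}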
|
||||
|
||||
struct throtl_service_queue {
|
||||
struct throtl_service_queue *parent_sq; /* the parent service_queue */
|
||||
|
||||
/*
|
||||
* Bios queued directly to this service_queue or dispatched from
|
||||
* children throtl_grp's.
|
||||
*/
|
||||
struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */
|
||||
unsigned int nr_queued[2]; /* number of queued bios */
|
||||
|
||||
/*
|
||||
* RB tree of active children throtl_grp's, which are sorted by
|
||||
* their ->disptime.
|
||||
*/
|
||||
struct rb_root_cached pending_tree; /* RB tree of active tgs */
|
||||
unsigned int nr_pending; /* # queued in the tree */
|
||||
unsigned long first_pending_disptime; /* disptime of the first tg */
|
||||
struct timer_list pending_timer; /* fires on first_pending_disptime */
|
||||
};
|
||||
|
||||
enum tg_state_flags {
|
||||
THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
|
||||
THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */
|
||||
@ -98,93 +49,6 @@ enum tg_state_flags {
|
||||
|
||||
#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
|
||||
|
||||
enum {
|
||||
LIMIT_LOW,
|
||||
LIMIT_MAX,
|
||||
LIMIT_CNT,
|
||||
};
|
||||
|
||||
struct throtl_grp {
|
||||
/* must be the first member */
|
||||
struct blkg_policy_data pd;
|
||||
|
||||
/* active throtl group service_queue member */
|
||||
struct rb_node rb_node;
|
||||
|
||||
/* throtl_data this group belongs to */
|
||||
struct throtl_data *td;
|
||||
|
||||
/* this group's service queue */
|
||||
struct throtl_service_queue service_queue;
|
||||
|
||||
/*
|
||||
* qnode_on_self is used when bios are directly queued to this
|
||||
* throtl_grp so that local bios compete fairly with bios
|
||||
* dispatched from children. qnode_on_parent is used when bios are
|
||||
* dispatched from this throtl_grp into its parent and will compete
|
||||
* with the sibling qnode_on_parents and the parent's
|
||||
* qnode_on_self.
|
||||
*/
|
||||
struct throtl_qnode qnode_on_self[2];
|
||||
struct throtl_qnode qnode_on_parent[2];
|
||||
|
||||
/*
|
||||
* Dispatch time in jiffies. This is the estimated time when group
|
||||
* will unthrottle and is ready to dispatch more bio. It is used as
|
||||
* key to sort active groups in service tree.
|
||||
*/
|
||||
unsigned long disptime;
|
||||
|
||||
unsigned int flags;
|
||||
|
||||
/* are there any throtl rules between this group and td? */
|
||||
bool has_rules[2];
|
||||
|
||||
/* internally used bytes per second rate limits */
|
||||
uint64_t bps[2][LIMIT_CNT];
|
||||
/* user configured bps limits */
|
||||
uint64_t bps_conf[2][LIMIT_CNT];
|
||||
|
||||
/* internally used IOPS limits */
|
||||
unsigned int iops[2][LIMIT_CNT];
|
||||
/* user configured IOPS limits */
|
||||
unsigned int iops_conf[2][LIMIT_CNT];
|
||||
|
||||
/* Number of bytes dispatched in current slice */
|
||||
uint64_t bytes_disp[2];
|
||||
/* Number of bio's dispatched in current slice */
|
||||
unsigned int io_disp[2];
|
||||
|
||||
unsigned long last_low_overflow_time[2];
|
||||
|
||||
uint64_t last_bytes_disp[2];
|
||||
unsigned int last_io_disp[2];
|
||||
|
||||
unsigned long last_check_time;
|
||||
|
||||
unsigned long latency_target; /* us */
|
||||
unsigned long latency_target_conf; /* us */
|
||||
/* When did we start a new slice */
|
||||
unsigned long slice_start[2];
|
||||
unsigned long slice_end[2];
|
||||
|
||||
unsigned long last_finish_time; /* ns / 1024 */
|
||||
unsigned long checked_last_finish_time; /* ns / 1024 */
|
||||
unsigned long avg_idletime; /* ns / 1024 */
|
||||
unsigned long idletime_threshold; /* us */
|
||||
unsigned long idletime_threshold_conf; /* us */
|
||||
|
||||
unsigned int bio_cnt; /* total bios */
|
||||
unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
|
||||
unsigned long bio_cnt_reset_time;
|
||||
|
||||
atomic_t io_split_cnt[2];
|
||||
atomic_t last_io_split_cnt[2];
|
||||
|
||||
struct blkg_rwstat stat_bytes;
|
||||
struct blkg_rwstat stat_ios;
|
||||
};
|
||||
|
||||
/* We measure latency for request size from <= 4k to >= 1M */
|
||||
#define LATENCY_BUCKET_SIZE 9
|
||||
|
||||
@ -231,16 +95,6 @@ struct throtl_data
|
||||
|
||||
static void throtl_pending_timer_fn(struct timer_list *t);
|
||||
|
||||
static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
|
||||
{
|
||||
return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
|
||||
}
|
||||
|
||||
static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
|
||||
{
|
||||
return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
|
||||
}
|
||||
|
||||
static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
|
||||
{
|
||||
return pd_to_blkg(&tg->pd);
|
||||
@ -1794,7 +1648,7 @@ static void throtl_shutdown_wq(struct request_queue *q)
|
||||
cancel_work_sync(&td->dispatch_work);
|
||||
}
|
||||
|
||||
static struct blkcg_policy blkcg_policy_throtl = {
|
||||
struct blkcg_policy blkcg_policy_throtl = {
|
||||
.dfl_cftypes = throtl_files,
|
||||
.legacy_cftypes = throtl_legacy_files,
|
||||
|
||||
@ -2208,9 +2062,9 @@ void blk_throtl_charge_bio_split(struct bio *bio)
|
||||
} while (parent);
|
||||
}
|
||||
|
||||
bool blk_throtl_bio(struct bio *bio)
|
||||
bool __blk_throtl_bio(struct bio *bio)
|
||||
{
|
||||
struct request_queue *q = bio->bi_bdev->bd_disk->queue;
|
||||
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
|
||||
struct blkcg_gq *blkg = bio->bi_blkg;
|
||||
struct throtl_qnode *qn = NULL;
|
||||
struct throtl_grp *tg = blkg_to_tg(blkg);
|
||||
@ -2221,19 +2075,12 @@ bool blk_throtl_bio(struct bio *bio)
|
||||
|
||||
rcu_read_lock();
|
||||
|
||||
/* see throtl_charge_bio() */
|
||||
if (bio_flagged(bio, BIO_THROTTLED))
|
||||
goto out;
|
||||
|
||||
if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) {
|
||||
blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf,
|
||||
bio->bi_iter.bi_size);
|
||||
blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1);
|
||||
}
|
||||
|
||||
if (!tg->has_rules[rw])
|
||||
goto out;
|
||||
|
||||
spin_lock_irq(&q->queue_lock);
|
||||
|
||||
throtl_update_latency_buckets(td);
|
||||
@ -2317,7 +2164,6 @@ bool blk_throtl_bio(struct bio *bio)
|
||||
|
||||
out_unlock:
|
||||
spin_unlock_irq(&q->queue_lock);
|
||||
out:
|
||||
bio_set_flag(bio, BIO_THROTTLED);
|
||||
|
||||
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
|
||||
|
block/blk.h (227 changed lines):
@ -2,15 +2,12 @@
|
||||
#ifndef BLK_INTERNAL_H
|
||||
#define BLK_INTERNAL_H
|
||||
|
||||
#include <linux/idr.h>
|
||||
#include <linux/blk-mq.h>
|
||||
#include <linux/part_stat.h>
|
||||
#include <linux/blk-crypto.h>
|
||||
#include <linux/memblock.h> /* for max_pfn/max_low_pfn */
|
||||
#include <xen/xen.h>
|
||||
#include "blk-crypto-internal.h"
|
||||
#include "blk-mq.h"
|
||||
#include "blk-mq-sched.h"
|
||||
|
||||
struct elevator_type;
|
||||
|
||||
/* Max future timer expiry for timeouts */
|
||||
#define BLK_MAX_TIMEOUT (5 * HZ)
|
||||
@ -30,15 +27,10 @@ struct blk_flush_queue {
|
||||
};
|
||||
|
||||
extern struct kmem_cache *blk_requestq_cachep;
|
||||
extern struct kmem_cache *blk_requestq_srcu_cachep;
|
||||
extern struct kobj_type blk_queue_ktype;
|
||||
extern struct ida blk_queue_ida;
|
||||
|
||||
static inline struct blk_flush_queue *
|
||||
blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx)
|
||||
{
|
||||
return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq;
|
||||
}
|
||||
|
||||
static inline void __blk_get_queue(struct request_queue *q)
|
||||
{
|
||||
kobject_get(&q->kobj);
|
||||
@ -53,6 +45,41 @@ void blk_free_flush_queue(struct blk_flush_queue *q);
|
||||
void blk_freeze_queue(struct request_queue *q);
|
||||
void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic);
|
||||
void blk_queue_start_drain(struct request_queue *q);
|
||||
int __bio_queue_enter(struct request_queue *q, struct bio *bio);
|
||||
bool submit_bio_checks(struct bio *bio);
|
||||
|
||||
static inline bool blk_try_enter_queue(struct request_queue *q, bool pm)
|
||||
{
|
||||
rcu_read_lock();
|
||||
if (!percpu_ref_tryget_live_rcu(&q->q_usage_counter))
|
||||
goto fail;
|
||||
|
||||
/*
|
||||
* The code that increments the pm_only counter must ensure that the
|
||||
* counter is globally visible before the queue is unfrozen.
|
||||
*/
|
||||
if (blk_queue_pm_only(q) &&
|
||||
(!pm || queue_rpm_status(q) == RPM_SUSPENDED))
|
||||
goto fail_put;
|
||||
|
||||
rcu_read_unlock();
|
||||
return true;
|
||||
|
||||
fail_put:
|
||||
blk_queue_exit(q);
|
||||
fail:
|
||||
rcu_read_unlock();
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline int bio_queue_enter(struct bio *bio)
|
||||
{
|
||||
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
|
||||
|
||||
if (blk_try_enter_queue(q, false))
|
||||
return 0;
|
||||
return __bio_queue_enter(q, bio);
|
||||
}
|
||||
|
||||
#define BIO_INLINE_VECS 4
|
||||
struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
|
||||
@ -94,6 +121,44 @@ static inline bool bvec_gap_to_prev(struct request_queue *q,
|
||||
return __bvec_gap_to_prev(q, bprv, offset);
|
||||
}
|
||||
|
||||
static inline bool rq_mergeable(struct request *rq)
|
||||
{
|
||||
if (blk_rq_is_passthrough(rq))
|
||||
return false;
|
||||
|
||||
if (req_op(rq) == REQ_OP_FLUSH)
|
||||
return false;
|
||||
|
||||
if (req_op(rq) == REQ_OP_WRITE_ZEROES)
|
||||
return false;
|
||||
|
||||
if (req_op(rq) == REQ_OP_ZONE_APPEND)
|
||||
return false;
|
||||
|
||||
if (rq->cmd_flags & REQ_NOMERGE_FLAGS)
|
||||
return false;
|
||||
if (rq->rq_flags & RQF_NOMERGE_FLAGS)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* There are two different ways to handle DISCARD merges:
|
||||
* 1) If max_discard_segments > 1, the driver treats every bio as a range and
|
||||
* send the bios to controller together. The ranges don't need to be
|
||||
* contiguous.
|
||||
* 2) Otherwise, the request will be normal read/write requests. The ranges
|
||||
* need to be contiguous.
|
||||
*/
|
||||
static inline bool blk_discard_mergable(struct request *req)
|
||||
{
|
||||
if (req_op(req) == REQ_OP_DISCARD &&
|
||||
queue_max_discard_segments(req->q) > 1)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_BLK_DEV_INTEGRITY
|
||||
void blk_flush_integrity(void);
|
||||
bool __bio_integrity_endio(struct bio *);
|
||||
@ -175,15 +240,13 @@ static inline void blk_integrity_del(struct gendisk *disk)
|
||||
|
||||
unsigned long blk_rq_timeout(unsigned long timeout);
|
||||
void blk_add_timer(struct request *req);
|
||||
const char *blk_status_to_str(blk_status_t status);
|
||||
|
||||
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
|
||||
unsigned int nr_segs, struct request **same_queue_rq);
|
||||
unsigned int nr_segs);
|
||||
bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
|
||||
struct bio *bio, unsigned int nr_segs);
|
||||
|
||||
void blk_account_io_start(struct request *req);
|
||||
void blk_account_io_done(struct request *req, u64 now);
|
||||
|
||||
/*
|
||||
* Plug flush limits
|
||||
*/
|
||||
@ -199,19 +262,10 @@ void blk_insert_flush(struct request *rq);
|
||||
|
||||
int elevator_switch_mq(struct request_queue *q,
|
||||
struct elevator_type *new_e);
|
||||
void __elevator_exit(struct request_queue *, struct elevator_queue *);
|
||||
void elevator_exit(struct request_queue *q);
|
||||
int elv_register_queue(struct request_queue *q, bool uevent);
|
||||
void elv_unregister_queue(struct request_queue *q);
|
||||
|
||||
static inline void elevator_exit(struct request_queue *q,
|
||||
struct elevator_queue *e)
|
||||
{
|
||||
lockdep_assert_held(&q->sysfs_lock);
|
||||
|
||||
blk_mq_sched_free_requests(q);
|
||||
__elevator_exit(q, e);
|
||||
}
|
||||
|
||||
ssize_t part_size_show(struct device *dev, struct device_attribute *attr,
|
||||
char *buf);
|
||||
ssize_t part_stat_show(struct device *dev, struct device_attribute *attr,
|
||||
@ -226,7 +280,32 @@ ssize_t part_timeout_show(struct device *, struct device_attribute *, char *);
|
||||
ssize_t part_timeout_store(struct device *, struct device_attribute *,
|
||||
const char *, size_t);
|
||||
|
||||
void __blk_queue_split(struct bio **bio, unsigned int *nr_segs);
static inline bool blk_may_split(struct request_queue *q, struct bio *bio)
{
switch (bio_op(bio)) {
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
case REQ_OP_WRITE_ZEROES:
case REQ_OP_WRITE_SAME:
return true; /* non-trivial splitting decisions */
default:
break;
}

/*
* All drivers must accept single-segment bios that are <= PAGE_SIZE.
* This is a quick and dirty check that relies on the fact that
* bi_io_vec[0] is always valid if a bio has data. The check might
* lead to occasional false negatives when bios are cloned, but compared
* to the performance impact of cloned bios themselves the loop below
* doesn't matter anyway.
*/
return q->limits.chunk_sectors || bio->bi_vcnt != 1 ||
bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE;
}
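
As an illustration of the fast-path test above, a hedged userspace sketch; the mini_bio type and may_need_split() helper are invented stand-ins, not kernel structures.

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 4096u

struct mini_bio {
	unsigned int vcnt;	/* number of segments */
	unsigned int bv_len;	/* length of the first segment */
	unsigned int bv_offset;	/* offset of the first segment in its page */
};

static bool may_need_split(const struct mini_bio *bio, unsigned int chunk_sectors)
{
	/* A single segment fully inside one page never needs splitting
	 * unless the queue enforces chunk boundaries. */
	return chunk_sectors || bio->vcnt != 1 ||
	       bio->bv_len + bio->bv_offset > PAGE_SIZE;
}

int main(void)
{
	struct mini_bio small = { .vcnt = 1, .bv_len = 512, .bv_offset = 0 };
	struct mini_bio spans = { .vcnt = 1, .bv_len = 4096, .bv_offset = 512 };

	printf("small: %d, page-crossing: %d\n",
	       may_need_split(&small, 0), may_need_split(&spans, 0));
	return 0;
}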
|
||||
|
||||
void __blk_queue_split(struct request_queue *q, struct bio **bio,
|
||||
unsigned int *nr_segs);
|
||||
int ll_back_merge_fn(struct request *req, struct bio *bio,
|
||||
unsigned int nr_segs);
|
||||
bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
|
||||
@ -246,9 +325,11 @@ int blk_dev_init(void);
|
||||
*/
|
||||
static inline bool blk_do_io_stat(struct request *rq)
|
||||
{
|
||||
return rq->rq_disk && (rq->rq_flags & RQF_IO_STAT);
|
||||
return (rq->rq_flags & RQF_IO_STAT) && rq->q->disk;
|
||||
}
|
||||
|
||||
void update_io_ticks(struct block_device *part, unsigned long now, bool end);
|
||||
|
||||
static inline void req_set_nomerge(struct request_queue *q, struct request *req)
|
||||
{
|
||||
req->cmd_flags |= REQ_NOMERGE;
|
||||
@ -283,30 +364,16 @@ static inline unsigned int bio_aligned_discard_max_sectors(
|
||||
/*
|
||||
* Internal io_context interface
|
||||
*/
|
||||
void get_io_context(struct io_context *ioc);
|
||||
struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q);
|
||||
struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
|
||||
gfp_t gfp_mask);
|
||||
struct io_cq *ioc_find_get_icq(struct request_queue *q);
|
||||
struct io_cq *ioc_lookup_icq(struct request_queue *q);
|
||||
#ifdef CONFIG_BLK_ICQ
|
||||
void ioc_clear_queue(struct request_queue *q);
|
||||
#else
|
||||
static inline void ioc_clear_queue(struct request_queue *q)
|
||||
{
|
||||
}
|
||||
#endif /* CONFIG_BLK_ICQ */
|
||||
|
||||
int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node);
|
||||
|
||||
/*
|
||||
* Internal throttling interface
|
||||
*/
|
||||
#ifdef CONFIG_BLK_DEV_THROTTLING
|
||||
extern int blk_throtl_init(struct request_queue *q);
|
||||
extern void blk_throtl_exit(struct request_queue *q);
|
||||
extern void blk_throtl_register_queue(struct request_queue *q);
|
||||
extern void blk_throtl_charge_bio_split(struct bio *bio);
|
||||
bool blk_throtl_bio(struct bio *bio);
|
||||
#else /* CONFIG_BLK_DEV_THROTTLING */
|
||||
static inline int blk_throtl_init(struct request_queue *q) { return 0; }
|
||||
static inline void blk_throtl_exit(struct request_queue *q) { }
|
||||
static inline void blk_throtl_register_queue(struct request_queue *q) { }
|
||||
static inline void blk_throtl_charge_bio_split(struct bio *bio) { }
|
||||
static inline bool blk_throtl_bio(struct bio *bio) { return false; }
|
||||
#endif /* CONFIG_BLK_DEV_THROTTLING */
|
||||
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
|
||||
extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page);
|
||||
extern ssize_t blk_throtl_sample_time_store(struct request_queue *q,
|
||||
@ -364,7 +431,15 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
|
||||
struct page *page, unsigned int len, unsigned int offset,
|
||||
unsigned int max_sectors, bool *same_page);
|
||||
|
||||
struct request_queue *blk_alloc_queue(int node_id);
|
||||
static inline struct kmem_cache *blk_get_queue_kmem_cache(bool srcu)
|
||||
{
|
||||
if (srcu)
|
||||
return blk_requestq_srcu_cachep;
|
||||
return blk_requestq_cachep;
|
||||
}
|
||||
struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu);
|
||||
|
||||
int disk_scan_partitions(struct gendisk *disk, fmode_t mode);
|
||||
|
||||
int disk_alloc_events(struct gendisk *disk);
|
||||
void disk_add_events(struct gendisk *disk);
|
||||
@ -374,13 +449,61 @@ extern struct device_attribute dev_attr_events;
|
||||
extern struct device_attribute dev_attr_events_async;
|
||||
extern struct device_attribute dev_attr_events_poll_msecs;
|
||||
|
||||
static inline void bio_clear_hipri(struct bio *bio)
|
||||
static inline void bio_clear_polled(struct bio *bio)
|
||||
{
|
||||
/* can't support alloc cache if we turn off polling */
|
||||
bio_clear_flag(bio, BIO_PERCPU_CACHE);
|
||||
bio->bi_opf &= ~REQ_HIPRI;
|
||||
bio->bi_opf &= ~REQ_POLLED;
|
||||
}
|
||||
|
||||
long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
|
||||
long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
|
||||
|
||||
extern const struct address_space_operations def_blk_aops;
|
||||
|
||||
int disk_register_independent_access_ranges(struct gendisk *disk,
|
||||
struct blk_independent_access_ranges *new_iars);
|
||||
void disk_unregister_independent_access_ranges(struct gendisk *disk);
|
||||
|
||||
#ifdef CONFIG_FAIL_MAKE_REQUEST
|
||||
bool should_fail_request(struct block_device *part, unsigned int bytes);
|
||||
#else /* CONFIG_FAIL_MAKE_REQUEST */
|
||||
static inline bool should_fail_request(struct block_device *part,
|
||||
unsigned int bytes)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
#endif /* CONFIG_FAIL_MAKE_REQUEST */
|
||||
|
||||
/*
* Optimized request reference counting. Ideally we'd make timeouts be more
* clever, as that's the only reason we need references at all... But until
* this happens, this is faster than using refcount_t. Also see:
*
* abc54d634334 ("io_uring: switch to atomic_t for io_kiocb reference count")
*/
#define req_ref_zero_or_close_to_overflow(req) \
((unsigned int) atomic_read(&(req->ref)) + 127u <= 127u)

static inline bool req_ref_inc_not_zero(struct request *req)
{
return atomic_inc_not_zero(&req->ref);
}

static inline bool req_ref_put_and_test(struct request *req)
{
WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
return atomic_dec_and_test(&req->ref);
}

static inline void req_ref_set(struct request *req, int value)
{
atomic_set(&req->ref, value);
}

static inline int req_ref_read(struct request *req)
{
return atomic_read(&req->ref);
}

#endif /* BLK_INTERNAL_H */
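
A minimal sketch of the reference pattern the helpers above express, written with C11 atomics for illustration; obj, obj_get_not_zero() and obj_put() are invented names, not block-layer APIs.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdlib.h>

struct obj { atomic_int ref; };

static bool obj_get_not_zero(struct obj *o)
{
	int v = atomic_load(&o->ref);

	/* Same idea as atomic_inc_not_zero(): never resurrect an object
	 * whose last reference has already been dropped. */
	while (v != 0)
		if (atomic_compare_exchange_weak(&o->ref, &v, v + 1))
			return true;
	return false;
}

static void obj_put(struct obj *o)
{
	/* Same idea as req_ref_put_and_test(): the last put frees. */
	if (atomic_fetch_sub(&o->ref, 1) == 1)
		free(o);
}

int main(void)
{
	struct obj *o = malloc(sizeof(*o));

	atomic_init(&o->ref, 1);	/* owner reference */
	if (obj_get_not_zero(o))	/* e.g. a timeout handler lookup */
		obj_put(o);
	obj_put(o);			/* drop the owner reference */
	return 0;
}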
|
||||
|
@ -14,6 +14,7 @@
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/mempool.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/blk-cgroup.h>
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/hash.h>
|
||||
|
@ -31,6 +31,7 @@ static int bsg_transport_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr,
|
||||
struct bsg_job *job;
|
||||
struct request *rq;
|
||||
struct bio *bio;
|
||||
void *reply;
|
||||
int ret;
|
||||
|
||||
if (hdr->protocol != BSG_PROTOCOL_SCSI ||
|
||||
@ -39,22 +40,28 @@ static int bsg_transport_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr,
|
||||
if (!capable(CAP_SYS_RAWIO))
|
||||
return -EPERM;
|
||||
|
||||
rq = blk_get_request(q, hdr->dout_xfer_len ?
|
||||
rq = blk_mq_alloc_request(q, hdr->dout_xfer_len ?
|
||||
REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
|
||||
if (IS_ERR(rq))
|
||||
return PTR_ERR(rq);
|
||||
rq->timeout = timeout;
|
||||
|
||||
job = blk_mq_rq_to_pdu(rq);
|
||||
reply = job->reply;
|
||||
memset(job, 0, sizeof(*job));
|
||||
job->reply = reply;
|
||||
job->reply_len = SCSI_SENSE_BUFFERSIZE;
|
||||
job->dd_data = job + 1;
|
||||
|
||||
job->request_len = hdr->request_len;
|
||||
job->request = memdup_user(uptr64(hdr->request), hdr->request_len);
|
||||
if (IS_ERR(job->request)) {
|
||||
ret = PTR_ERR(job->request);
|
||||
goto out_put_request;
|
||||
goto out_free_rq;
|
||||
}
|
||||
|
||||
if (hdr->dout_xfer_len && hdr->din_xfer_len) {
|
||||
job->bidi_rq = blk_get_request(rq->q, REQ_OP_DRV_IN, 0);
|
||||
job->bidi_rq = blk_mq_alloc_request(rq->q, REQ_OP_DRV_IN, 0);
|
||||
if (IS_ERR(job->bidi_rq)) {
|
||||
ret = PTR_ERR(job->bidi_rq);
|
||||
goto out_free_job_request;
|
||||
@ -85,7 +92,7 @@ static int bsg_transport_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr,
|
||||
goto out_unmap_bidi_rq;
|
||||
|
||||
bio = rq->bio;
|
||||
blk_execute_rq(NULL, rq, !(hdr->flags & BSG_FLAG_Q_AT_TAIL));
|
||||
blk_execute_rq(rq, !(hdr->flags & BSG_FLAG_Q_AT_TAIL));
|
||||
|
||||
/*
|
||||
* The assignments below don't make much sense, but are kept for
|
||||
@ -134,11 +141,11 @@ static int bsg_transport_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr,
|
||||
blk_rq_unmap_user(job->bidi_bio);
|
||||
out_free_bidi_rq:
|
||||
if (job->bidi_rq)
|
||||
blk_put_request(job->bidi_rq);
|
||||
blk_mq_free_request(job->bidi_rq);
|
||||
out_free_job_request:
|
||||
kfree(job->request);
|
||||
out_put_request:
|
||||
blk_put_request(rq);
|
||||
out_free_rq:
|
||||
blk_mq_free_request(rq);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -302,18 +309,6 @@ static int bsg_init_rq(struct blk_mq_tag_set *set, struct request *req,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* called right before the request is given to the request_queue user */
|
||||
static void bsg_initialize_rq(struct request *req)
|
||||
{
|
||||
struct bsg_job *job = blk_mq_rq_to_pdu(req);
|
||||
void *reply = job->reply;
|
||||
|
||||
memset(job, 0, sizeof(*job));
|
||||
job->reply = reply;
|
||||
job->reply_len = SCSI_SENSE_BUFFERSIZE;
|
||||
job->dd_data = job + 1;
|
||||
}
|
||||
|
||||
static void bsg_exit_rq(struct blk_mq_tag_set *set, struct request *req,
|
||||
unsigned int hctx_idx)
|
||||
{
|
||||
@ -350,7 +345,6 @@ static const struct blk_mq_ops bsg_mq_ops = {
|
||||
.queue_rq = bsg_queue_rq,
|
||||
.init_request = bsg_init_rq,
|
||||
.exit_request = bsg_exit_rq,
|
||||
.initialize_rq_fn = bsg_initialize_rq,
|
||||
.complete = bsg_complete,
|
||||
.timeout = bsg_timeout,
|
||||
};
|
||||
|
@ -26,7 +26,6 @@
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/elevator.h>
|
||||
#include <linux/bio.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/slab.h>
|
||||
@ -40,6 +39,7 @@
|
||||
|
||||
#include <trace/events/block.h>
|
||||
|
||||
#include "elevator.h"
|
||||
#include "blk.h"
|
||||
#include "blk-mq-sched.h"
|
||||
#include "blk-pm.h"
|
||||
@ -188,8 +188,10 @@ static void elevator_release(struct kobject *kobj)
|
||||
kfree(e);
|
||||
}
|
||||
|
||||
void __elevator_exit(struct request_queue *q, struct elevator_queue *e)
|
||||
void elevator_exit(struct request_queue *q)
|
||||
{
|
||||
struct elevator_queue *e = q->elevator;
|
||||
|
||||
mutex_lock(&e->sysfs_lock);
|
||||
blk_mq_exit_sched(q, e);
|
||||
mutex_unlock(&e->sysfs_lock);
|
||||
@ -593,7 +595,8 @@ int elevator_switch_mq(struct request_queue *q,
|
||||
elv_unregister_queue(q);
|
||||
|
||||
ioc_clear_queue(q);
|
||||
elevator_exit(q, q->elevator);
|
||||
blk_mq_sched_free_rqs(q);
|
||||
elevator_exit(q);
|
||||
}
|
||||
|
||||
ret = blk_mq_init_sched(q, new_e);
|
||||
@ -603,7 +606,8 @@ int elevator_switch_mq(struct request_queue *q,
|
||||
if (new_e) {
|
||||
ret = elv_register_queue(q, true);
|
||||
if (ret) {
|
||||
elevator_exit(q, q->elevator);
|
||||
blk_mq_sched_free_rqs(q);
|
||||
elevator_exit(q);
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
@ -635,7 +639,7 @@ static struct elevator_type *elevator_get_default(struct request_queue *q)
|
||||
return NULL;
|
||||
|
||||
if (q->nr_hw_queues != 1 &&
|
||||
!blk_mq_is_sbitmap_shared(q->tag_set->flags))
|
||||
!blk_mq_is_shared_tags(q->tag_set->flags))
|
||||
return NULL;
|
||||
|
||||
return elevator_get(q, "mq-deadline", false);
|
||||
|
block/fops.c
@ -15,9 +15,10 @@
|
||||
#include <linux/falloc.h>
|
||||
#include <linux/suspend.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/module.h>
|
||||
#include "blk.h"
|
||||
|
||||
static struct inode *bdev_file_inode(struct file *file)
|
||||
static inline struct inode *bdev_file_inode(struct file *file)
|
||||
{
|
||||
return file->f_mapping->host;
|
||||
}
|
||||
@ -54,14 +55,12 @@ static void blkdev_bio_end_io_simple(struct bio *bio)
|
||||
static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
|
||||
struct iov_iter *iter, unsigned int nr_pages)
|
||||
{
|
||||
struct file *file = iocb->ki_filp;
|
||||
struct block_device *bdev = I_BDEV(bdev_file_inode(file));
|
||||
struct block_device *bdev = iocb->ki_filp->private_data;
|
||||
struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
|
||||
loff_t pos = iocb->ki_pos;
|
||||
bool should_dirty = false;
|
||||
struct bio bio;
|
||||
ssize_t ret;
|
||||
blk_qc_t qc;
|
||||
|
||||
if ((pos | iov_iter_alignment(iter)) &
|
||||
(bdev_logical_block_size(bdev) - 1))
|
||||
@ -78,7 +77,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
|
||||
|
||||
bio_init(&bio, vecs, nr_pages);
|
||||
bio_set_dev(&bio, bdev);
|
||||
bio.bi_iter.bi_sector = pos >> 9;
|
||||
bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT;
|
||||
bio.bi_write_hint = iocb->ki_hint;
|
||||
bio.bi_private = current;
|
||||
bio.bi_end_io = blkdev_bio_end_io_simple;
|
||||
@ -102,13 +101,12 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
|
||||
if (iocb->ki_flags & IOCB_HIPRI)
|
||||
bio_set_polled(&bio, iocb);
|
||||
|
||||
qc = submit_bio(&bio);
|
||||
submit_bio(&bio);
|
||||
for (;;) {
|
||||
set_current_state(TASK_UNINTERRUPTIBLE);
|
||||
if (!READ_ONCE(bio.bi_private))
|
||||
break;
|
||||
if (!(iocb->ki_flags & IOCB_HIPRI) ||
|
||||
!blk_poll(bdev_get_queue(bdev), qc, true))
|
||||
if (!(iocb->ki_flags & IOCB_HIPRI) || !bio_poll(&bio, NULL, 0))
|
||||
blk_io_schedule();
|
||||
}
|
||||
__set_current_state(TASK_RUNNING);
|
||||
@ -126,6 +124,11 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
|
||||
return ret;
|
||||
}
|
||||
|
||||
enum {
|
||||
DIO_SHOULD_DIRTY = 1,
|
||||
DIO_IS_SYNC = 2,
|
||||
};
|
||||
|
||||
struct blkdev_dio {
|
||||
union {
|
||||
struct kiocb *iocb;
|
||||
@ -133,35 +136,27 @@ struct blkdev_dio {
|
||||
};
|
||||
size_t size;
|
||||
atomic_t ref;
|
||||
bool multi_bio : 1;
|
||||
bool should_dirty : 1;
|
||||
bool is_sync : 1;
|
||||
struct bio bio;
|
||||
unsigned int flags;
|
||||
struct bio bio ____cacheline_aligned_in_smp;
|
||||
};
|
||||
|
||||
static struct bio_set blkdev_dio_pool;
|
||||
|
||||
static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
|
||||
{
|
||||
struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
|
||||
struct request_queue *q = bdev_get_queue(bdev);
|
||||
|
||||
return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
|
||||
}
|
||||
|
||||
static void blkdev_bio_end_io(struct bio *bio)
|
||||
{
|
||||
struct blkdev_dio *dio = bio->bi_private;
|
||||
bool should_dirty = dio->should_dirty;
|
||||
bool should_dirty = dio->flags & DIO_SHOULD_DIRTY;
|
||||
|
||||
if (bio->bi_status && !dio->bio.bi_status)
|
||||
dio->bio.bi_status = bio->bi_status;
|
||||
|
||||
if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) {
|
||||
if (!dio->is_sync) {
|
||||
if (atomic_dec_and_test(&dio->ref)) {
|
||||
if (!(dio->flags & DIO_IS_SYNC)) {
|
||||
struct kiocb *iocb = dio->iocb;
|
||||
ssize_t ret;
|
||||
|
||||
WRITE_ONCE(iocb->private, NULL);
|
||||
|
||||
if (likely(!dio->bio.bi_status)) {
|
||||
ret = dio->size;
|
||||
iocb->ki_pos += ret;
|
||||
@ -169,9 +164,8 @@ static void blkdev_bio_end_io(struct bio *bio)
|
||||
ret = blk_status_to_errno(dio->bio.bi_status);
|
||||
}
|
||||
|
||||
dio->iocb->ki_complete(iocb, ret, 0);
|
||||
if (dio->multi_bio)
|
||||
bio_put(&dio->bio);
|
||||
dio->iocb->ki_complete(iocb, ret);
|
||||
bio_put(&dio->bio);
|
||||
} else {
|
||||
struct task_struct *waiter = dio->waiter;
|
||||
|
||||
@ -191,16 +185,12 @@ static void blkdev_bio_end_io(struct bio *bio)
|
||||
static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
|
||||
unsigned int nr_pages)
|
||||
{
|
||||
struct file *file = iocb->ki_filp;
|
||||
struct inode *inode = bdev_file_inode(file);
|
||||
struct block_device *bdev = I_BDEV(inode);
|
||||
struct block_device *bdev = iocb->ki_filp->private_data;
|
||||
struct blk_plug plug;
|
||||
struct blkdev_dio *dio;
|
||||
struct bio *bio;
|
||||
bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0;
|
||||
bool is_read = (iov_iter_rw(iter) == READ), is_sync;
|
||||
loff_t pos = iocb->ki_pos;
|
||||
blk_qc_t qc = BLK_QC_T_NONE;
|
||||
int ret = 0;
|
||||
|
||||
if ((pos | iov_iter_alignment(iter)) &
|
||||
@ -210,28 +200,31 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
|
||||
bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool);
|
||||
|
||||
dio = container_of(bio, struct blkdev_dio, bio);
|
||||
dio->is_sync = is_sync = is_sync_kiocb(iocb);
|
||||
if (dio->is_sync) {
|
||||
atomic_set(&dio->ref, 1);
|
||||
/*
|
||||
* Grab an extra reference to ensure the dio structure which is embedded
|
||||
* into the first bio stays around.
|
||||
*/
|
||||
bio_get(bio);
|
||||
|
||||
is_sync = is_sync_kiocb(iocb);
|
||||
if (is_sync) {
|
||||
dio->flags = DIO_IS_SYNC;
|
||||
dio->waiter = current;
|
||||
bio_get(bio);
|
||||
} else {
|
||||
dio->flags = 0;
|
||||
dio->iocb = iocb;
|
||||
}
|
||||
|
||||
dio->size = 0;
|
||||
dio->multi_bio = false;
|
||||
dio->should_dirty = is_read && iter_is_iovec(iter);
|
||||
if (is_read && iter_is_iovec(iter))
|
||||
dio->flags |= DIO_SHOULD_DIRTY;
|
||||
|
||||
/*
|
||||
* Don't plug for HIPRI/polled IO, as those should go straight
|
||||
* to issue
|
||||
*/
|
||||
if (!is_poll)
|
||||
blk_start_plug(&plug);
|
||||
blk_start_plug(&plug);
|
||||
|
||||
for (;;) {
|
||||
bio_set_dev(bio, bdev);
|
||||
bio->bi_iter.bi_sector = pos >> 9;
|
||||
bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
|
||||
bio->bi_write_hint = iocb->ki_hint;
|
||||
bio->bi_private = dio;
|
||||
bio->bi_end_io = blkdev_bio_end_io;
|
||||
@ -246,7 +239,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
|
||||
|
||||
if (is_read) {
|
||||
bio->bi_opf = REQ_OP_READ;
|
||||
if (dio->should_dirty)
|
||||
if (dio->flags & DIO_SHOULD_DIRTY)
|
||||
bio_set_pages_dirty(bio);
|
||||
} else {
|
||||
bio->bi_opf = dio_bio_write_op(iocb);
|
||||
@ -260,40 +253,15 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
|
||||
|
||||
nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS);
|
||||
if (!nr_pages) {
|
||||
bool polled = false;
|
||||
|
||||
if (iocb->ki_flags & IOCB_HIPRI) {
|
||||
bio_set_polled(bio, iocb);
|
||||
polled = true;
|
||||
}
|
||||
|
||||
qc = submit_bio(bio);
|
||||
|
||||
if (polled)
|
||||
WRITE_ONCE(iocb->ki_cookie, qc);
|
||||
submit_bio(bio);
|
||||
break;
|
||||
}
|
||||
|
||||
if (!dio->multi_bio) {
|
||||
/*
|
||||
* AIO needs an extra reference to ensure the dio
|
||||
* structure which is embedded into the first bio
|
||||
* stays around.
|
||||
*/
|
||||
if (!is_sync)
|
||||
bio_get(bio);
|
||||
dio->multi_bio = true;
|
||||
atomic_set(&dio->ref, 2);
|
||||
} else {
|
||||
atomic_inc(&dio->ref);
|
||||
}
|
||||
|
||||
atomic_inc(&dio->ref);
|
||||
submit_bio(bio);
|
||||
bio = bio_alloc(GFP_KERNEL, nr_pages);
|
||||
}
|
||||
|
||||
if (!is_poll)
|
||||
blk_finish_plug(&plug);
|
||||
blk_finish_plug(&plug);
|
||||
|
||||
if (!is_sync)
|
||||
return -EIOCBQUEUED;
|
||||
@ -302,10 +270,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
|
||||
set_current_state(TASK_UNINTERRUPTIBLE);
|
||||
if (!READ_ONCE(dio->waiter))
|
||||
break;
|
||||
|
||||
if (!(iocb->ki_flags & IOCB_HIPRI) ||
|
||||
!blk_poll(bdev_get_queue(bdev), qc, true))
|
||||
blk_io_schedule();
|
||||
blk_io_schedule();
|
||||
}
|
||||
__set_current_state(TASK_RUNNING);
|
||||
|
||||
@ -318,6 +283,95 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void blkdev_bio_end_io_async(struct bio *bio)
|
||||
{
|
||||
struct blkdev_dio *dio = container_of(bio, struct blkdev_dio, bio);
|
||||
struct kiocb *iocb = dio->iocb;
|
||||
ssize_t ret;
|
||||
|
||||
WRITE_ONCE(iocb->private, NULL);
|
||||
|
||||
if (likely(!bio->bi_status)) {
|
||||
ret = dio->size;
|
||||
iocb->ki_pos += ret;
|
||||
} else {
|
||||
ret = blk_status_to_errno(bio->bi_status);
|
||||
}
|
||||
|
||||
iocb->ki_complete(iocb, ret);
|
||||
|
||||
if (dio->flags & DIO_SHOULD_DIRTY) {
|
||||
bio_check_pages_dirty(bio);
|
||||
} else {
|
||||
bio_release_pages(bio, false);
|
||||
bio_put(bio);
|
||||
}
|
||||
}
|
||||
|
||||
static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
|
||||
struct iov_iter *iter,
|
||||
unsigned int nr_pages)
|
||||
{
|
||||
struct block_device *bdev = iocb->ki_filp->private_data;
|
||||
struct blkdev_dio *dio;
|
||||
struct bio *bio;
|
||||
loff_t pos = iocb->ki_pos;
|
||||
int ret = 0;
|
||||
|
||||
if ((pos | iov_iter_alignment(iter)) &
|
||||
(bdev_logical_block_size(bdev) - 1))
|
||||
return -EINVAL;
|
||||
|
||||
bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool);
|
||||
dio = container_of(bio, struct blkdev_dio, bio);
|
||||
dio->flags = 0;
|
||||
dio->iocb = iocb;
|
||||
bio_set_dev(bio, bdev);
|
||||
bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
|
||||
bio->bi_write_hint = iocb->ki_hint;
|
||||
bio->bi_end_io = blkdev_bio_end_io_async;
|
||||
bio->bi_ioprio = iocb->ki_ioprio;
|
||||
|
||||
if (iov_iter_is_bvec(iter)) {
|
||||
/*
|
||||
* Users don't rely on the iterator being in any particular
|
||||
* state for async I/O returning -EIOCBQUEUED, hence we can
|
||||
* avoid expensive iov_iter_advance(). Bypass
|
||||
* bio_iov_iter_get_pages() and set the bvec directly.
|
||||
*/
|
||||
bio_iov_bvec_set(bio, iter);
|
||||
} else {
|
||||
ret = bio_iov_iter_get_pages(bio, iter);
|
||||
if (unlikely(ret)) {
|
||||
bio_put(bio);
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
dio->size = bio->bi_iter.bi_size;
|
||||
|
||||
if (iov_iter_rw(iter) == READ) {
|
||||
bio->bi_opf = REQ_OP_READ;
|
||||
if (iter_is_iovec(iter)) {
|
||||
dio->flags |= DIO_SHOULD_DIRTY;
|
||||
bio_set_pages_dirty(bio);
|
||||
}
|
||||
} else {
|
||||
bio->bi_opf = dio_bio_write_op(iocb);
|
||||
task_io_account_write(bio->bi_iter.bi_size);
|
||||
}
|
||||
|
||||
if (iocb->ki_flags & IOCB_HIPRI) {
|
||||
bio->bi_opf |= REQ_POLLED | REQ_NOWAIT;
|
||||
submit_bio(bio);
|
||||
WRITE_ONCE(iocb->private, bio);
|
||||
} else {
|
||||
if (iocb->ki_flags & IOCB_NOWAIT)
|
||||
bio->bi_opf |= REQ_NOWAIT;
|
||||
submit_bio(bio);
|
||||
}
|
||||
return -EIOCBQUEUED;
|
||||
}
|
||||
|
||||
static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
unsigned int nr_pages;
@ -326,9 +380,11 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
return 0;

nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_VECS)
return __blkdev_direct_IO_simple(iocb, iter, nr_pages);

if (likely(nr_pages <= BIO_MAX_VECS)) {
if (is_sync_kiocb(iocb))
return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
return __blkdev_direct_IO_async(iocb, iter, nr_pages);
}
return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
}
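
A hedged sketch of the three-way dispatch decision above; pick_dio_path(), the enum and MAX_VECS are invented for illustration (MAX_VECS stands in for BIO_MAX_VECS) and are not kernel interfaces.

#include <stdbool.h>
#include <stdio.h>

#define MAX_VECS 256	/* stand-in for BIO_MAX_VECS */

enum dio_path { DIO_SIMPLE_SYNC, DIO_SINGLE_BIO_ASYNC, DIO_MULTI_BIO };

static enum dio_path pick_dio_path(unsigned int nr_pages, bool is_sync)
{
	if (nr_pages <= MAX_VECS)
		return is_sync ? DIO_SIMPLE_SYNC : DIO_SINGLE_BIO_ASYNC;
	return DIO_MULTI_BIO;	/* too large for one inline bio */
}

int main(void)
{
	printf("%d %d %d\n",
	       pick_dio_path(16, true),		/* small synchronous I/O */
	       pick_dio_path(16, false),	/* small aio: one bio, async end_io */
	       pick_dio_path(4096, false));	/* large: split across bios */
	return 0;
}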
|
||||
|
||||
@ -405,8 +461,7 @@ static loff_t blkdev_llseek(struct file *file, loff_t offset, int whence)
|
||||
static int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
|
||||
int datasync)
|
||||
{
|
||||
struct inode *bd_inode = bdev_file_inode(filp);
|
||||
struct block_device *bdev = I_BDEV(bd_inode);
|
||||
struct block_device *bdev = filp->private_data;
|
||||
int error;
|
||||
|
||||
error = file_write_and_wait_range(filp, start, end);
|
||||
@ -448,6 +503,8 @@ static int blkdev_open(struct inode *inode, struct file *filp)
|
||||
bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, filp);
|
||||
if (IS_ERR(bdev))
|
||||
return PTR_ERR(bdev);
|
||||
|
||||
filp->private_data = bdev;
|
||||
filp->f_mapping = bdev->bd_inode->i_mapping;
|
||||
filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
|
||||
return 0;
|
||||
@ -455,29 +512,12 @@ static int blkdev_open(struct inode *inode, struct file *filp)
|
||||
|
||||
static int blkdev_close(struct inode *inode, struct file *filp)
|
||||
{
|
||||
struct block_device *bdev = I_BDEV(bdev_file_inode(filp));
|
||||
struct block_device *bdev = filp->private_data;
|
||||
|
||||
blkdev_put(bdev, filp->f_mode);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
|
||||
{
|
||||
struct block_device *bdev = I_BDEV(bdev_file_inode(file));
|
||||
fmode_t mode = file->f_mode;
|
||||
|
||||
/*
|
||||
* O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
|
||||
* to updated it before every ioctl.
|
||||
*/
|
||||
if (file->f_flags & O_NDELAY)
|
||||
mode |= FMODE_NDELAY;
|
||||
else
|
||||
mode &= ~FMODE_NDELAY;
|
||||
|
||||
return blkdev_ioctl(bdev, mode, cmd, arg);
|
||||
}
|
||||
|
||||
/*
|
||||
* Write data to the block device. Only intended for the block device itself
|
||||
* and the raw driver which basically is a fake block device.
|
||||
@ -487,14 +527,14 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
|
||||
*/
|
||||
static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
|
||||
{
|
||||
struct file *file = iocb->ki_filp;
|
||||
struct inode *bd_inode = bdev_file_inode(file);
|
||||
loff_t size = i_size_read(bd_inode);
|
||||
struct block_device *bdev = iocb->ki_filp->private_data;
|
||||
struct inode *bd_inode = bdev->bd_inode;
|
||||
loff_t size = bdev_nr_bytes(bdev);
|
||||
struct blk_plug plug;
|
||||
size_t shorted = 0;
|
||||
ssize_t ret;
|
||||
|
||||
if (bdev_read_only(I_BDEV(bd_inode)))
|
||||
if (bdev_read_only(bdev))
|
||||
return -EPERM;
|
||||
|
||||
if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev))
|
||||
@ -526,24 +566,58 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
|
||||
|
||||
static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
|
||||
{
|
||||
struct file *file = iocb->ki_filp;
|
||||
struct inode *bd_inode = bdev_file_inode(file);
|
||||
loff_t size = i_size_read(bd_inode);
|
||||
struct block_device *bdev = iocb->ki_filp->private_data;
|
||||
loff_t size = bdev_nr_bytes(bdev);
|
||||
loff_t pos = iocb->ki_pos;
|
||||
size_t shorted = 0;
|
||||
ssize_t ret;
|
||||
ssize_t ret = 0;
|
||||
size_t count;
|
||||
|
||||
if (pos >= size)
|
||||
return 0;
|
||||
|
||||
size -= pos;
|
||||
if (iov_iter_count(to) > size) {
|
||||
if (unlikely(pos + iov_iter_count(to) > size)) {
|
||||
if (pos >= size)
|
||||
return 0;
|
||||
size -= pos;
|
||||
shorted = iov_iter_count(to) - size;
|
||||
iov_iter_truncate(to, size);
|
||||
}
|
||||
|
||||
ret = generic_file_read_iter(iocb, to);
|
||||
iov_iter_reexpand(to, iov_iter_count(to) + shorted);
|
||||
count = iov_iter_count(to);
|
||||
if (!count)
|
||||
goto reexpand; /* skip atime */
|
||||
|
||||
if (iocb->ki_flags & IOCB_DIRECT) {
|
||||
struct address_space *mapping = iocb->ki_filp->f_mapping;
|
||||
|
||||
if (iocb->ki_flags & IOCB_NOWAIT) {
|
||||
if (filemap_range_needs_writeback(mapping, pos,
|
||||
pos + count - 1)) {
|
||||
ret = -EAGAIN;
|
||||
goto reexpand;
|
||||
}
|
||||
} else {
|
||||
ret = filemap_write_and_wait_range(mapping, pos,
|
||||
pos + count - 1);
|
||||
if (ret < 0)
|
||||
goto reexpand;
|
||||
}
|
||||
|
||||
file_accessed(iocb->ki_filp);
|
||||
|
||||
ret = blkdev_direct_IO(iocb, to);
|
||||
if (ret >= 0) {
|
||||
iocb->ki_pos += ret;
|
||||
count -= ret;
|
||||
}
|
||||
iov_iter_revert(to, count - iov_iter_count(to));
|
||||
if (ret < 0 || !count)
|
||||
goto reexpand;
|
||||
}
|
||||
|
||||
ret = filemap_read(iocb, to, ret);
|
||||
|
||||
reexpand:
|
||||
if (unlikely(shorted))
|
||||
iov_iter_reexpand(to, iov_iter_count(to) + shorted);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -565,7 +639,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
/* Don't go off the end of the device. */
|
||||
isize = i_size_read(bdev->bd_inode);
|
||||
isize = bdev_nr_bytes(bdev);
|
||||
if (start >= isize)
|
||||
return -EINVAL;
|
||||
if (end >= isize) {
|
||||
@ -592,16 +666,18 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
|
||||
switch (mode) {
|
||||
case FALLOC_FL_ZERO_RANGE:
|
||||
case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
|
||||
error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
|
||||
GFP_KERNEL, BLKDEV_ZERO_NOUNMAP);
|
||||
error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
|
||||
len >> SECTOR_SHIFT, GFP_KERNEL,
|
||||
BLKDEV_ZERO_NOUNMAP);
|
||||
break;
|
||||
case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
|
||||
error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
|
||||
GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK);
|
||||
error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
|
||||
len >> SECTOR_SHIFT, GFP_KERNEL,
|
||||
BLKDEV_ZERO_NOFALLBACK);
|
||||
break;
|
||||
case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE:
|
||||
error = blkdev_issue_discard(bdev, start >> 9, len >> 9,
|
||||
GFP_KERNEL, 0);
|
||||
error = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
|
||||
len >> SECTOR_SHIFT, GFP_KERNEL, 0);
|
||||
break;
|
||||
default:
|
||||
error = -EOPNOTSUPP;
|
||||
@ -618,10 +694,10 @@ const struct file_operations def_blk_fops = {
|
||||
.llseek = blkdev_llseek,
|
||||
.read_iter = blkdev_read_iter,
|
||||
.write_iter = blkdev_write_iter,
|
||||
.iopoll = blkdev_iopoll,
|
||||
.iopoll = iocb_bio_iopoll,
|
||||
.mmap = generic_file_mmap,
|
||||
.fsync = blkdev_fsync,
|
||||
.unlocked_ioctl = block_ioctl,
|
||||
.unlocked_ioctl = blkdev_ioctl,
|
||||
#ifdef CONFIG_COMPAT
|
||||
.compat_ioctl = compat_blkdev_ioctl,
|
||||
#endif
|
||||
|
@ -25,8 +25,10 @@
|
||||
#include <linux/log2.h>
|
||||
#include <linux/pm_runtime.h>
|
||||
#include <linux/badblocks.h>
|
||||
#include <linux/part_stat.h>
|
||||
|
||||
#include "blk.h"
|
||||
#include "blk-mq-sched.h"
|
||||
#include "blk-rq-qos.h"
|
||||
|
||||
static struct kobject *block_depr;
|
||||
@ -58,6 +60,7 @@ void set_capacity(struct gendisk *disk, sector_t sectors)
|
||||
|
||||
spin_lock(&bdev->bd_size_lock);
|
||||
i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT);
|
||||
bdev->bd_nr_sectors = sectors;
|
||||
spin_unlock(&bdev->bd_size_lock);
|
||||
}
|
||||
EXPORT_SYMBOL(set_capacity);
|
||||
@ -212,7 +215,10 @@ void blkdev_show(struct seq_file *seqf, off_t offset)
|
||||
* @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If
|
||||
* @major = 0, try to allocate any unused major number.
|
||||
* @name: the name of the new block device as a zero terminated string
|
||||
* @probe: allback that is called on access to any minor number of @major
|
||||
* @probe: pre-devtmpfs / pre-udev callback used to create disks when their
|
||||
* pre-created device node is accessed. When a probe call uses
|
||||
* add_disk() and it fails the driver must cleanup resources. This
|
||||
* interface may soon be removed.
|
||||
*
|
||||
* The @name must be unique within the system.
|
||||
*
|
||||
@ -368,17 +374,21 @@ void disk_uevent(struct gendisk *disk, enum kobject_action action)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(disk_uevent);
|
||||
|
||||
static void disk_scan_partitions(struct gendisk *disk)
|
||||
int disk_scan_partitions(struct gendisk *disk, fmode_t mode)
|
||||
{
|
||||
struct block_device *bdev;
|
||||
|
||||
if (!get_capacity(disk) || !disk_part_scan_enabled(disk))
|
||||
return;
|
||||
if (disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN))
|
||||
return -EINVAL;
|
||||
if (disk->open_partitions)
|
||||
return -EBUSY;
|
||||
|
||||
set_bit(GD_NEED_PART_SCAN, &disk->state);
|
||||
bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL);
|
||||
if (!IS_ERR(bdev))
|
||||
blkdev_put(bdev, FMODE_READ);
|
||||
bdev = blkdev_get_by_dev(disk_devt(disk), mode, NULL);
|
||||
if (IS_ERR(bdev))
|
||||
return PTR_ERR(bdev);
|
||||
blkdev_put(bdev, mode);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -390,8 +400,8 @@ static void disk_scan_partitions(struct gendisk *disk)
|
||||
* This function registers the partitioning information in @disk
|
||||
* with the kernel.
|
||||
*/
|
||||
int device_add_disk(struct device *parent, struct gendisk *disk,
|
||||
const struct attribute_group **groups)
|
||||
int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
|
||||
const struct attribute_group **groups)
|
||||
|
||||
{
|
||||
struct device *ddev = disk_to_dev(disk);
|
||||
@ -432,7 +442,6 @@ int device_add_disk(struct device *parent, struct gendisk *disk,
|
||||
return ret;
|
||||
disk->major = BLOCK_EXT_MAJOR;
|
||||
disk->first_minor = ret;
|
||||
disk->flags |= GENHD_FL_EXT_DEVT;
|
||||
}
|
||||
|
||||
/* delay uevents, until we scanned partition table */
|
||||
@ -489,14 +498,7 @@ int device_add_disk(struct device *parent, struct gendisk *disk,
|
||||
if (ret)
|
||||
goto out_put_slave_dir;
|
||||
|
||||
if (disk->flags & GENHD_FL_HIDDEN) {
|
||||
/*
|
||||
* Don't let hidden disks show up in /proc/partitions,
|
||||
* and don't bother scanning for partitions either.
|
||||
*/
|
||||
disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
|
||||
disk->flags |= GENHD_FL_NO_PART_SCAN;
|
||||
} else {
|
||||
if (!(disk->flags & GENHD_FL_HIDDEN)) {
|
||||
ret = bdi_register(disk->bdi, "%u:%u",
|
||||
disk->major, disk->first_minor);
|
||||
if (ret)
|
||||
@ -508,7 +510,8 @@ int device_add_disk(struct device *parent, struct gendisk *disk,
|
||||
goto out_unregister_bdi;
|
||||
|
||||
bdev_add(disk->part0, ddev->devt);
|
||||
disk_scan_partitions(disk);
|
||||
if (get_capacity(disk))
|
||||
disk_scan_partitions(disk, FMODE_READ);
|
||||
|
||||
/*
|
||||
* Announce the disk and partitions after all partitions are
|
||||
@ -541,7 +544,7 @@ int device_add_disk(struct device *parent, struct gendisk *disk,
|
||||
out_free_ext_minor:
|
||||
if (disk->major == BLOCK_EXT_MAJOR)
|
||||
blk_free_ext_minor(disk->first_minor);
|
||||
return WARN_ON_ONCE(ret); /* keep until all callers handle errors */
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(device_add_disk);
|
||||
|
||||
@ -645,6 +648,26 @@ void del_gendisk(struct gendisk *disk)
|
||||
}
|
||||
EXPORT_SYMBOL(del_gendisk);
|
||||
|
||||
/**
* invalidate_disk - invalidate the disk
* @disk: the struct gendisk to invalidate
*
* A helper to invalidate the disk. It will clean the disk's associated
* buffer/page caches and reset its internal states so that the disk
* can be reused by the drivers.
*
* Context: can sleep
*/
void invalidate_disk(struct gendisk *disk)
{
struct block_device *bdev = disk->part0;

invalidate_bdev(bdev);
bdev->bd_inode->i_mapping->wb_err = 0;
set_capacity(disk, 0);
}
EXPORT_SYMBOL(invalidate_disk);
|
||||
|
||||
/* sysfs access to bad-blocks list. */
|
||||
static ssize_t disk_badblocks_show(struct device *dev,
|
||||
struct device_attribute *attr,
|
||||
@ -711,8 +734,7 @@ void __init printk_all_partitions(void)
|
||||
* Don't show empty devices or things that have been
|
||||
* suppressed
|
||||
*/
|
||||
if (get_capacity(disk) == 0 ||
|
||||
(disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO))
|
||||
if (get_capacity(disk) == 0 || (disk->flags & GENHD_FL_HIDDEN))
|
||||
continue;
|
||||
|
||||
/*
|
||||
@ -805,11 +827,7 @@ static int show_partition(struct seq_file *seqf, void *v)
|
||||
struct block_device *part;
|
||||
unsigned long idx;
|
||||
|
||||
/* Don't show non-partitionable removeable devices or empty devices */
|
||||
if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
|
||||
(sgp->flags & GENHD_FL_REMOVABLE)))
|
||||
return 0;
|
||||
if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
|
||||
if (!get_capacity(sgp) || (sgp->flags & GENHD_FL_HIDDEN))
|
||||
return 0;
|
||||
|
||||
rcu_read_lock();
|
||||
@ -865,7 +883,8 @@ static ssize_t disk_ext_range_show(struct device *dev,
|
||||
{
|
||||
struct gendisk *disk = dev_to_disk(dev);
|
||||
|
||||
return sprintf(buf, "%d\n", disk_max_parts(disk));
|
||||
return sprintf(buf, "%d\n",
|
||||
(disk->flags & GENHD_FL_NO_PART) ? 1 : DISK_MAX_PARTS);
|
||||
}
|
||||
|
||||
static ssize_t disk_removable_show(struct device *dev,
|
||||
@ -904,7 +923,7 @@ ssize_t part_stat_show(struct device *dev,
|
||||
struct device_attribute *attr, char *buf)
|
||||
{
|
||||
struct block_device *bdev = dev_to_bdev(dev);
|
||||
struct request_queue *q = bdev->bd_disk->queue;
|
||||
struct request_queue *q = bdev_get_queue(bdev);
|
||||
struct disk_stats stat;
|
||||
unsigned int inflight;
|
||||
|
||||
@ -948,7 +967,7 @@ ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
|
||||
char *buf)
|
||||
{
|
||||
struct block_device *bdev = dev_to_bdev(dev);
|
||||
struct request_queue *q = bdev->bd_disk->queue;
|
||||
struct request_queue *q = bdev_get_queue(bdev);
|
||||
unsigned int inflight[2];
|
||||
|
||||
if (queue_is_mq(q))
|
||||
@ -1290,6 +1309,9 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
|
||||
if (!disk->bdi)
|
||||
goto out_free_disk;
|
||||
|
||||
/* bdev_alloc() might need the queue, set before the first call */
|
||||
disk->queue = q;
|
||||
|
||||
disk->part0 = bdev_alloc(disk, 0);
|
||||
if (!disk->part0)
|
||||
goto out_free_bdi;
|
||||
@ -1305,7 +1327,6 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
|
||||
disk_to_dev(disk)->type = &disk_type;
|
||||
device_initialize(disk_to_dev(disk));
|
||||
inc_diskseq(disk);
|
||||
disk->queue = q;
|
||||
q->disk = disk;
|
||||
lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0);
|
||||
#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
|
||||
@ -1332,7 +1353,7 @@ struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass)
|
||||
struct request_queue *q;
|
||||
struct gendisk *disk;
|
||||
|
||||
q = blk_alloc_queue(node);
|
||||
q = blk_alloc_queue(node, false);
|
||||
if (!q)
|
||||
return NULL;
|
||||
|
||||
@ -1410,12 +1431,6 @@ void set_disk_ro(struct gendisk *disk, bool read_only)
|
||||
}
|
||||
EXPORT_SYMBOL(set_disk_ro);
|
||||
|
||||
int bdev_read_only(struct block_device *bdev)
|
||||
{
|
||||
return bdev->bd_read_only || get_disk_ro(bdev->bd_disk);
|
||||
}
|
||||
EXPORT_SYMBOL(bdev_read_only);
|
||||
|
||||
void inc_diskseq(struct gendisk *disk)
|
||||
{
|
||||
disk->diskseq = atomic64_inc_return(&diskseq);
|
||||
|
@ -82,31 +82,6 @@ static int compat_blkpg_ioctl(struct block_device *bdev,
|
||||
}
|
||||
#endif
|
||||
|
||||
static int blkdev_reread_part(struct block_device *bdev, fmode_t mode)
|
||||
{
|
||||
struct block_device *tmp;
|
||||
|
||||
if (!disk_part_scan_enabled(bdev->bd_disk) || bdev_is_partition(bdev))
|
||||
return -EINVAL;
|
||||
if (!capable(CAP_SYS_ADMIN))
|
||||
return -EACCES;
|
||||
if (bdev->bd_disk->open_partitions)
|
||||
return -EBUSY;
|
||||
|
||||
/*
|
||||
* Reopen the device to revalidate the driver state and force a
|
||||
* partition rescan.
|
||||
*/
|
||||
mode &= ~FMODE_EXCL;
|
||||
set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
|
||||
|
||||
tmp = blkdev_get_by_dev(bdev->bd_dev, mode, NULL);
|
||||
if (IS_ERR(tmp))
|
||||
return PTR_ERR(tmp);
|
||||
blkdev_put(tmp, mode);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode,
|
||||
unsigned long arg, unsigned long flags)
|
||||
{
|
||||
@ -133,7 +108,7 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode,
|
||||
if (len & 511)
|
||||
return -EINVAL;
|
||||
|
||||
if (start + len > i_size_read(bdev->bd_inode))
|
||||
if (start + len > bdev_nr_bytes(bdev))
|
||||
return -EINVAL;
|
||||
|
||||
filemap_invalidate_lock(inode->i_mapping);
|
||||
@ -171,7 +146,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode,
|
||||
return -EINVAL;
|
||||
if (len & 511)
|
||||
return -EINVAL;
|
||||
if (end >= (uint64_t)i_size_read(bdev->bd_inode))
|
||||
if (end >= (uint64_t)bdev_nr_bytes(bdev))
|
||||
return -EINVAL;
|
||||
if (end < start)
|
||||
return -EINVAL;
|
||||
@ -522,7 +497,11 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
|
||||
bdev->bd_disk->bdi->ra_pages = (arg * 512) / PAGE_SIZE;
|
||||
return 0;
|
||||
case BLKRRPART:
|
||||
return blkdev_reread_part(bdev, mode);
|
||||
if (!capable(CAP_SYS_ADMIN))
|
||||
return -EACCES;
|
||||
if (bdev_is_partition(bdev))
|
||||
return -EINVAL;
|
||||
return disk_scan_partitions(bdev->bd_disk, mode & ~FMODE_EXCL);
|
||||
case BLKTRACESTART:
|
||||
case BLKTRACESTOP:
|
||||
case BLKTRACETEARDOWN:
|
||||
@ -550,12 +529,21 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
|
||||
*
|
||||
* New commands must be compatible and go into blkdev_common_ioctl
|
||||
*/
|
||||
int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
|
||||
unsigned long arg)
|
||||
long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
|
||||
{
|
||||
int ret;
|
||||
loff_t size;
|
||||
struct block_device *bdev = I_BDEV(file->f_mapping->host);
|
||||
void __user *argp = (void __user *)arg;
|
||||
fmode_t mode = file->f_mode;
|
||||
int ret;
|
||||
|
||||
/*
|
||||
* O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
|
||||
* to update it before every ioctl.
|
||||
*/
|
||||
if (file->f_flags & O_NDELAY)
|
||||
mode |= FMODE_NDELAY;
|
||||
else
|
||||
mode &= ~FMODE_NDELAY;
|
||||
|
||||
switch (cmd) {
|
||||
/* These need separate implementations for the data structure */
|
||||
@ -572,10 +560,9 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
|
||||
return put_long(argp,
|
||||
(bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512);
|
||||
case BLKGETSIZE:
|
||||
size = i_size_read(bdev->bd_inode);
|
||||
if ((size >> 9) > ~0UL)
|
||||
if (bdev_nr_sectors(bdev) > ~0UL)
|
||||
return -EFBIG;
|
||||
return put_ulong(argp, size >> 9);
|
||||
return put_ulong(argp, bdev_nr_sectors(bdev));
|
||||
|
||||
/* The data is compatible, but the command number is different */
|
||||
case BLKBSZGET: /* get block device soft block size (cf. BLKSSZGET) */
|
||||
@ -583,7 +570,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
|
||||
case BLKBSZSET:
|
||||
return blkdev_bszset(bdev, mode, argp);
|
||||
case BLKGETSIZE64:
|
||||
return put_u64(argp, i_size_read(bdev->bd_inode));
|
||||
return put_u64(argp, bdev_nr_bytes(bdev));
|
||||
|
||||
/* Incompatible alignment on i386 */
|
||||
case BLKTRACESETUP:
|
||||
@ -600,7 +587,6 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
|
||||
return -ENOTTY;
|
||||
return bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(blkdev_ioctl); /* for /dev/raw */
|
||||
|
||||
#ifdef CONFIG_COMPAT
|
||||
|
||||
@ -618,7 +604,6 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
|
||||
struct block_device *bdev = I_BDEV(file->f_mapping->host);
|
||||
struct gendisk *disk = bdev->bd_disk;
|
||||
fmode_t mode = file->f_mode;
|
||||
loff_t size;
|
||||
|
||||
/*
|
||||
* O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
|
||||
@ -644,10 +629,9 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
|
||||
return compat_put_long(argp,
|
||||
(bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512);
|
||||
case BLKGETSIZE:
|
||||
size = i_size_read(bdev->bd_inode);
|
||||
if ((size >> 9) > ~0UL)
|
||||
if (bdev_nr_sectors(bdev) > ~0UL)
|
||||
return -EFBIG;
|
||||
return compat_put_ulong(argp, size >> 9);
|
||||
return compat_put_ulong(argp, bdev_nr_sectors(bdev));
|
||||
|
||||
/* The data is compatible, but the command number is different */
|
||||
case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */
|
||||
@ -655,7 +639,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
|
||||
case BLKBSZSET_32:
|
||||
return blkdev_bszset(bdev, mode, argp);
|
||||
case BLKGETSIZE64_32:
|
||||
return put_u64(argp, i_size_read(bdev->bd_inode));
|
||||
return put_u64(argp, bdev_nr_bytes(bdev));
|
||||
|
||||
/* Incompatible alignment on i386 */
|
||||
case BLKTRACESETUP32:
|
||||
|
@ -22,46 +22,14 @@
|
||||
*/
|
||||
#include <linux/gfp.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/export.h>
|
||||
#include <linux/ioprio.h>
|
||||
#include <linux/cred.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/capability.h>
|
||||
#include <linux/sched/user.h>
|
||||
#include <linux/sched/task.h>
|
||||
#include <linux/syscalls.h>
|
||||
#include <linux/security.h>
|
||||
#include <linux/pid_namespace.h>
|
||||
|
||||
int set_task_ioprio(struct task_struct *task, int ioprio)
|
||||
{
|
||||
int err;
|
||||
struct io_context *ioc;
|
||||
const struct cred *cred = current_cred(), *tcred;
|
||||
|
||||
rcu_read_lock();
|
||||
tcred = __task_cred(task);
|
||||
if (!uid_eq(tcred->uid, cred->euid) &&
|
||||
!uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) {
|
||||
rcu_read_unlock();
|
||||
return -EPERM;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
err = security_task_setioprio(task, ioprio);
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
|
||||
if (ioc) {
|
||||
ioc->ioprio = ioprio;
|
||||
put_io_context(ioc);
|
||||
}
|
||||
|
||||
return err;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(set_task_ioprio);
|
||||
|
||||
int ioprio_check_cap(int ioprio)
|
||||
{
|
||||
int class = IOPRIO_PRIO_CLASS(ioprio);
|
||||
|
@ -9,12 +9,12 @@
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/blk-mq.h>
|
||||
#include <linux/elevator.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/sbitmap.h>
|
||||
|
||||
#include <trace/events/block.h>
|
||||
|
||||
#include "elevator.h"
|
||||
#include "blk.h"
|
||||
#include "blk-mq.h"
|
||||
#include "blk-mq-debugfs.h"
|
||||
@ -433,6 +433,7 @@ static void kyber_exit_sched(struct elevator_queue *e)
|
||||
int i;
|
||||
|
||||
del_timer_sync(&kqd->timer);
|
||||
blk_stat_disable_accounting(kqd->q);
|
||||
|
||||
for (i = 0; i < KYBER_NUM_DOMAINS; i++)
|
||||
sbitmap_queue_free(&kqd->domain_tokens[i]);
|
||||
@ -453,11 +454,11 @@ static void kyber_depth_updated(struct blk_mq_hw_ctx *hctx)
|
||||
{
|
||||
struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
|
||||
struct blk_mq_tags *tags = hctx->sched_tags;
|
||||
unsigned int shift = tags->bitmap_tags->sb.shift;
|
||||
unsigned int shift = tags->bitmap_tags.sb.shift;
|
||||
|
||||
kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
|
||||
|
||||
sbitmap_queue_min_shallow_depth(tags->bitmap_tags, kqd->async_depth);
|
||||
sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, kqd->async_depth);
|
||||
}
|
||||
|
||||
static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
|
||||
|
@ -9,7 +9,6 @@
|
||||
#include <linux/fs.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/blk-mq.h>
|
||||
#include <linux/elevator.h>
|
||||
#include <linux/bio.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/slab.h>
|
||||
@ -20,6 +19,7 @@
|
||||
|
||||
#include <trace/events/block.h>
|
||||
|
||||
#include "elevator.h"
|
||||
#include "blk.h"
|
||||
#include "blk-mq.h"
|
||||
#include "blk-mq-debugfs.h"
|
||||
@ -31,6 +31,11 @@
|
||||
*/
|
||||
static const int read_expire = HZ / 2; /* max time before a read is submitted. */
|
||||
static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
|
||||
/*
|
||||
* Time after which to dispatch lower priority requests even if higher
|
||||
* priority requests are pending.
|
||||
*/
|
||||
static const int prio_aging_expire = 10 * HZ;
|
||||
static const int writes_starved = 2; /* max times reads can starve a write */
|
||||
static const int fifo_batch = 16; /* # of sequential requests treated as one
|
||||
by the above parameters. For throughput. */
|
||||
@ -51,17 +56,16 @@ enum dd_prio {
|
||||
|
||||
enum { DD_PRIO_COUNT = 3 };
|
||||
|
||||
/* I/O statistics per I/O priority. */
|
||||
/*
|
||||
* I/O statistics per I/O priority. It is fine if these counters overflow.
|
||||
* What matters is that these counters are at least as wide as
|
||||
* log2(max_outstanding_requests).
|
||||
*/
|
||||
struct io_stats_per_prio {
|
||||
local_t inserted;
|
||||
local_t merged;
|
||||
local_t dispatched;
|
||||
local_t completed;
|
||||
};
|
||||
|
||||
/* I/O statistics for all I/O priorities (enum dd_prio). */
|
||||
struct io_stats {
|
||||
struct io_stats_per_prio stats[DD_PRIO_COUNT];
|
||||
uint32_t inserted;
|
||||
uint32_t merged;
|
||||
uint32_t dispatched;
|
||||
atomic_t completed;
|
||||
};
|
||||
|
||||
/*
|
||||
@ -74,6 +78,7 @@ struct dd_per_prio {
|
||||
struct list_head fifo_list[DD_DIR_COUNT];
|
||||
/* Next request in FIFO order. Read, write or both are NULL. */
|
||||
struct request *next_rq[DD_DIR_COUNT];
|
||||
struct io_stats_per_prio stats;
|
||||
};
|
||||
|
||||
struct deadline_data {
|
||||
@ -88,8 +93,6 @@ struct deadline_data {
|
||||
unsigned int batching; /* number of sequential requests made */
|
||||
unsigned int starved; /* times reads have starved writes */
|
||||
|
||||
struct io_stats __percpu *stats;
|
||||
|
||||
/*
|
||||
* settings that change how the i/o scheduler behaves
|
||||
*/
|
||||
@ -98,38 +101,12 @@ struct deadline_data {
|
||||
int writes_starved;
|
||||
int front_merges;
|
||||
u32 async_depth;
|
||||
int prio_aging_expire;
|
||||
|
||||
spinlock_t lock;
|
||||
spinlock_t zone_lock;
|
||||
};
|
||||
|
||||
/* Count one event of type 'event_type' and with I/O priority 'prio' */
|
||||
#define dd_count(dd, event_type, prio) do { \
|
||||
struct io_stats *io_stats = get_cpu_ptr((dd)->stats); \
|
||||
\
|
||||
BUILD_BUG_ON(!__same_type((dd), struct deadline_data *)); \
|
||||
BUILD_BUG_ON(!__same_type((prio), enum dd_prio)); \
|
||||
local_inc(&io_stats->stats[(prio)].event_type); \
|
||||
put_cpu_ptr(io_stats); \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
* Returns the total number of dd_count(dd, event_type, prio) calls across all
|
||||
* CPUs. No locking or barriers since it is fine if the returned sum is slightly
|
||||
* outdated.
|
||||
*/
|
||||
#define dd_sum(dd, event_type, prio) ({ \
|
||||
unsigned int cpu; \
|
||||
u32 sum = 0; \
|
||||
\
|
||||
BUILD_BUG_ON(!__same_type((dd), struct deadline_data *)); \
|
||||
BUILD_BUG_ON(!__same_type((prio), enum dd_prio)); \
|
||||
for_each_present_cpu(cpu) \
|
||||
sum += local_read(&per_cpu_ptr((dd)->stats, cpu)-> \
|
||||
stats[(prio)].event_type); \
|
||||
sum; \
|
||||
})
|
||||
|
||||
/* Maps an I/O priority class to a deadline scheduler priority. */
|
||||
static const enum dd_prio ioprio_class_to_prio[] = {
|
||||
[IOPRIO_CLASS_NONE] = DD_BE_PRIO,
|
||||
@ -233,7 +210,9 @@ static void dd_merged_requests(struct request_queue *q, struct request *req,
|
||||
const u8 ioprio_class = dd_rq_ioclass(next);
|
||||
const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
|
||||
|
||||
dd_count(dd, merged, prio);
|
||||
lockdep_assert_held(&dd->lock);
|
||||
|
||||
dd->per_prio[prio].stats.merged++;
|
||||
|
||||
/*
|
||||
* if next expires before rq, assign its expire time to rq
|
||||
@ -270,6 +249,16 @@ deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
|
||||
deadline_remove_request(rq->q, per_prio, rq);
|
||||
}
|
||||
|
||||
/* Number of requests queued for a given priority level. */
static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio)
{
const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats;

lockdep_assert_held(&dd->lock);

return stats->inserted - atomic_read(&stats->completed);
}
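
A hedged demonstration of the wrap-around property this counter scheme relies on (see the io_stats_per_prio comment earlier in this hunk): the difference of two free-running 32-bit counters stays exact even after either one overflows, as long as the in-flight count fits in the counter width.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t inserted = UINT32_MAX - 2;	/* about to wrap */
	uint32_t completed = UINT32_MAX - 5;	/* three requests still queued */

	printf("queued before wrap: %" PRIu32 "\n", inserted - completed);

	inserted += 10;				/* wraps past zero */
	completed += 9;

	/* Unsigned subtraction is modular, so the result is still exact. */
	printf("queued after wrap:  %" PRIu32 "\n", inserted - completed);
	return 0;
}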
|
||||
|
||||
/*
|
||||
* deadline_check_fifo returns 0 if there are no expired requests on the fifo,
|
||||
* 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
|
||||
@ -355,12 +344,27 @@ deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
|
||||
return rq;
|
||||
}
|
||||
|
||||
/*
* Returns true if and only if @rq started after @latest_start where
* @latest_start is in jiffies.
*/
static bool started_after(struct deadline_data *dd, struct request *rq,
unsigned long latest_start)
{
unsigned long start_time = (unsigned long)rq->fifo_time;

start_time -= dd->fifo_expire[rq_data_dir(rq)];

return time_after(start_time, latest_start);
}
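
For illustration, a hedged sketch of the recovery performed above: fifo_time is the insertion time plus the per-direction expiry, so subtracting the expiry yields the original start time; after() below models the kernel's time_after() with a signed delta and is not a kernel API.

#include <stdio.h>

static int after(unsigned long a, unsigned long b)
{
	return (long)(b - a) < 0;	/* same idea as time_after(a, b) */
}

int main(void)
{
	unsigned long start = 1000, expire = 250;
	unsigned long fifo_time = start + expire;	/* what the scheduler stores */

	printf("recovered start: %lu\n", fifo_time - expire);
	printf("started after 900: %d, after 1100: %d\n",
	       after(fifo_time - expire, 900), after(fifo_time - expire, 1100));
	return 0;
}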
|
||||
|
||||
/*
|
||||
* deadline_dispatch_requests selects the best request according to
|
||||
* read/write expire, fifo_batch, etc
|
||||
* read/write expire, fifo_batch, etc and with a start time <= @latest_start.
|
||||
*/
|
||||
static struct request *__dd_dispatch_request(struct deadline_data *dd,
|
||||
struct dd_per_prio *per_prio)
|
||||
struct dd_per_prio *per_prio,
|
||||
unsigned long latest_start)
|
||||
{
|
||||
struct request *rq, *next_rq;
|
||||
enum dd_data_dir data_dir;
|
||||
@ -372,6 +376,8 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
|
||||
if (!list_empty(&per_prio->dispatch)) {
|
||||
rq = list_first_entry(&per_prio->dispatch, struct request,
|
||||
queuelist);
|
||||
if (started_after(dd, rq, latest_start))
|
||||
return NULL;
|
||||
list_del_init(&rq->queuelist);
|
||||
goto done;
|
||||
}
|
||||
@ -449,6 +455,9 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
|
||||
dd->batching = 0;
|
||||
|
||||
dispatch_request:
|
||||
if (started_after(dd, rq, latest_start))
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
* rq is the selected appropriate request.
|
||||
*/
|
||||
@ -457,7 +466,7 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
|
||||
done:
|
||||
ioprio_class = dd_rq_ioclass(rq);
|
||||
prio = ioprio_class_to_prio[ioprio_class];
|
||||
dd_count(dd, dispatched, prio);
|
||||
dd->per_prio[prio].stats.dispatched++;
|
||||
/*
|
||||
* If the request needs its target zone locked, do it.
|
||||
*/
|
||||
@ -466,6 +475,34 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
|
||||
return rq;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check whether there are any requests with priority other than DD_RT_PRIO
|
||||
* that were inserted more than prio_aging_expire jiffies ago.
|
||||
*/
|
||||
static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd,
|
||||
unsigned long now)
|
||||
{
|
||||
struct request *rq;
|
||||
enum dd_prio prio;
|
||||
int prio_cnt;
|
||||
|
||||
lockdep_assert_held(&dd->lock);
|
||||
|
||||
prio_cnt = !!dd_queued(dd, DD_RT_PRIO) + !!dd_queued(dd, DD_BE_PRIO) +
|
||||
!!dd_queued(dd, DD_IDLE_PRIO);
|
||||
if (prio_cnt < 2)
|
||||
return NULL;
|
||||
|
||||
for (prio = DD_BE_PRIO; prio <= DD_PRIO_MAX; prio++) {
|
||||
rq = __dd_dispatch_request(dd, &dd->per_prio[prio],
|
||||
now - dd->prio_aging_expire);
|
||||
if (rq)
|
||||
return rq;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
 * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests().
 *
@ -477,15 +514,26 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
        struct deadline_data *dd = hctx->queue->elevator->elevator_data;
        const unsigned long now = jiffies;
        struct request *rq;
        enum dd_prio prio;

        spin_lock(&dd->lock);
        rq = dd_dispatch_prio_aged_requests(dd, now);
        if (rq)
                goto unlock;

        /*
         * Next, dispatch requests in priority order. Ignore lower priority
         * requests if any higher priority requests are pending.
         */
        for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
                rq = __dd_dispatch_request(dd, &dd->per_prio[prio]);
                if (rq)
                rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now);
                if (rq || dd_queued(dd, prio))
                        break;
        }

unlock:
        spin_unlock(&dd->lock);

        return rq;
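Annotation (not part of the commit): the resulting dispatch order is aged lower-priority requests first, then strict RT → BE → IDLE order, stopping at the first level that still has work queued even if nothing could be dispatched from it right now. A condensed control-flow sketch with stand-in types and stub helpers:

/* Simplified control-flow sketch; all types and helpers are stand-ins. */
#include <stddef.h>
#include <stdio.h>

enum prio { RT, BE, IDLE, PRIO_MAX = IDLE };

/* Stubs standing in for the scheduler's real helpers. */
static const char *dispatch_aged(void)        { return NULL; }                 /* no aged work */
static const char *dispatch_prio(enum prio p) { return p == BE ? "be-rq" : NULL; }
static int         queued(enum prio p)        { return p == RT ? 0 : 1; }

int main(void)
{
        const char *rq = dispatch_aged();            /* step 1: aged requests first */

        if (!rq) {
                for (enum prio p = RT; p <= PRIO_MAX; p++) {
                        rq = dispatch_prio(p);       /* step 2: strict priority order */
                        if (rq || queued(p))         /* stop at the first busy level */
                                break;
                }
        }
        printf("dispatched: %s\n", rq ? rq : "(none)");
        return 0;
}

The dd_queued() term in the break condition is what keeps lower priority levels from being scanned while a higher level still has pending requests.
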
@ -519,7 +567,7 @@ static void dd_depth_updated(struct blk_mq_hw_ctx *hctx)

        dd->async_depth = max(1UL, 3 * q->nr_requests / 4);

        sbitmap_queue_min_shallow_depth(tags->bitmap_tags, dd->async_depth);
        sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, dd->async_depth);
}

/* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */
@ -536,12 +584,21 @@ static void dd_exit_sched(struct elevator_queue *e)

        for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
                struct dd_per_prio *per_prio = &dd->per_prio[prio];
                const struct io_stats_per_prio *stats = &per_prio->stats;
                uint32_t queued;

                WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_READ]));
                WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE]));
        }

        free_percpu(dd->stats);
                spin_lock(&dd->lock);
                queued = dd_queued(dd, prio);
                spin_unlock(&dd->lock);

                WARN_ONCE(queued != 0,
                          "statistics for priority %d: i %u m %u d %u c %u\n",
                          prio, stats->inserted, stats->merged,
                          stats->dispatched, atomic_read(&stats->completed));
        }

        kfree(dd);
}
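Annotation (assumption): the per-CPU dd->stats counters are replaced by statistics embedded in each dd_per_prio. The struct definition is not part of this excerpt; the sketch below infers a plausible layout from how the fields are used here — inserted/merged/dispatched updated under dd->lock, completed kept atomic because dd_finish_request() runs without that lock:

/* Hedged sketch of the per-priority statistics; the exact layout is an assumption. */
#include <stdint.h>
#include <stdatomic.h>

struct io_stats_per_prio {
        uint32_t inserted;      /* bumped under dd->lock in dd_insert_request()        */
        uint32_t merged;        /* bumped under dd->lock on request merges             */
        uint32_t dispatched;    /* bumped under dd->lock in __dd_dispatch_request()    */
        atomic_uint completed;  /* incremented locklessly from dd_finish_request()     */
};
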
@ -566,11 +623,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)

        eq->elevator_data = dd;

        dd->stats = alloc_percpu_gfp(typeof(*dd->stats),
                                     GFP_KERNEL | __GFP_ZERO);
        if (!dd->stats)
                goto free_dd;

        for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
                struct dd_per_prio *per_prio = &dd->per_prio[prio];

@ -586,15 +638,13 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
        dd->front_merges = 1;
        dd->last_dir = DD_WRITE;
        dd->fifo_batch = fifo_batch;
        dd->prio_aging_expire = prio_aging_expire;
        spin_lock_init(&dd->lock);
        spin_lock_init(&dd->zone_lock);

        q->elevator = eq;
        return 0;

free_dd:
        kfree(dd);

put_eq:
        kobject_put(&eq->kobj);
        return ret;
@ -677,8 +727,11 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
                blk_req_zone_write_unlock(rq);

        prio = ioprio_class_to_prio[ioprio_class];
        dd_count(dd, inserted, prio);
        rq->elv.priv[0] = (void *)(uintptr_t)1;
        per_prio = &dd->per_prio[prio];
        if (!rq->elv.priv[0]) {
                per_prio->stats.inserted++;
                rq->elv.priv[0] = (void *)(uintptr_t)1;
        }

        if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
                blk_mq_free_requests(&free);
@ -687,7 +740,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,

        trace_block_rq_insert(rq);

        per_prio = &dd->per_prio[prio];
        if (at_head) {
                list_add(&rq->queuelist, &per_prio->dispatch);
        } else {
@ -759,12 +811,13 @@ static void dd_finish_request(struct request *rq)

        /*
         * The block layer core may call dd_finish_request() without having
         * called dd_insert_requests(). Hence only update statistics for
         * requests for which dd_insert_requests() has been called. See also
         * blk_mq_request_bypass_insert().
         * called dd_insert_requests(). Skip requests that bypassed I/O
         * scheduling. See also blk_mq_request_bypass_insert().
         */
        if (rq->elv.priv[0])
                dd_count(dd, completed, prio);
        if (!rq->elv.priv[0])
                return;

        atomic_inc(&per_prio->stats.completed);

        if (blk_queue_is_zoned(q)) {
                unsigned long flags;
@ -809,6 +862,7 @@ static ssize_t __FUNC(struct elevator_queue *e, char *page) \
#define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR))
SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]);
SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]);
SHOW_JIFFIES(deadline_prio_aging_expire_show, dd->prio_aging_expire);
SHOW_INT(deadline_writes_starved_show, dd->writes_starved);
SHOW_INT(deadline_front_merges_show, dd->front_merges);
SHOW_INT(deadline_async_depth_show, dd->async_depth);
@ -838,6 +892,7 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)
        STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies)
STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX);
STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX);
STORE_JIFFIES(deadline_prio_aging_expire_store, &dd->prio_aging_expire, 0, INT_MAX);
STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX);
STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1);
STORE_INT(deadline_async_depth_store, &dd->async_depth, 1, INT_MAX);
@ -856,6 +911,7 @@ static struct elv_fs_entry deadline_attrs[] = {
        DD_ATTR(front_merges),
        DD_ATTR(async_depth),
        DD_ATTR(fifo_batch),
        DD_ATTR(prio_aging_expire),
        __ATTR_NULL
};
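Annotation (simplified, not the literal macro expansion): the SHOW_JIFFIES/STORE_JIFFIES pairs expose prio_aging_expire in milliseconds through sysfs while the scheduler stores it in jiffies internally. A standalone sketch of that round trip, assuming HZ=250 purely for illustration:

/* Millisecond <-> jiffies round trip as the sysfs attributes perform it.
 * HZ is assumed to be 250 here only to make the numbers concrete. */
#include <stdio.h>

#define HZ 250UL

static unsigned long msecs_to_jiffies_sk(unsigned long ms) { return ms * HZ / 1000; }
static unsigned long jiffies_to_msecs_sk(unsigned long j)  { return j * 1000 / HZ; }

int main(void)
{
        unsigned long stored = msecs_to_jiffies_sk(10000); /* echo 10000 > prio_aging_expire */

        printf("internal jiffies: %lu\n", stored);                       /* 2500 at HZ=250 */
        printf("shown msecs:      %lu\n", jiffies_to_msecs_sk(stored));  /* 10000 */
        return 0;
}
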
@ -947,38 +1003,48 @@ static int dd_async_depth_show(void *data, struct seq_file *m)
        return 0;
}

/* Number of requests queued for a given priority level. */
static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio)
{
        return dd_sum(dd, inserted, prio) - dd_sum(dd, completed, prio);
}

static int dd_queued_show(void *data, struct seq_file *m)
{
        struct request_queue *q = data;
        struct deadline_data *dd = q->elevator->elevator_data;
        u32 rt, be, idle;

        spin_lock(&dd->lock);
        rt = dd_queued(dd, DD_RT_PRIO);
        be = dd_queued(dd, DD_BE_PRIO);
        idle = dd_queued(dd, DD_IDLE_PRIO);
        spin_unlock(&dd->lock);

        seq_printf(m, "%u %u %u\n", rt, be, idle);

        seq_printf(m, "%u %u %u\n", dd_queued(dd, DD_RT_PRIO),
                   dd_queued(dd, DD_BE_PRIO),
                   dd_queued(dd, DD_IDLE_PRIO));
        return 0;
}

/* Number of requests owned by the block driver for a given priority. */
static u32 dd_owned_by_driver(struct deadline_data *dd, enum dd_prio prio)
{
        return dd_sum(dd, dispatched, prio) + dd_sum(dd, merged, prio)
                - dd_sum(dd, completed, prio);
        const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats;

        lockdep_assert_held(&dd->lock);

        return stats->dispatched + stats->merged -
               atomic_read(&stats->completed);
}

static int dd_owned_by_driver_show(void *data, struct seq_file *m)
{
        struct request_queue *q = data;
        struct deadline_data *dd = q->elevator->elevator_data;
        u32 rt, be, idle;

        spin_lock(&dd->lock);
        rt = dd_owned_by_driver(dd, DD_RT_PRIO);
        be = dd_owned_by_driver(dd, DD_BE_PRIO);
        idle = dd_owned_by_driver(dd, DD_IDLE_PRIO);
        spin_unlock(&dd->lock);

        seq_printf(m, "%u %u %u\n", rt, be, idle);

        seq_printf(m, "%u %u %u\n", dd_owned_by_driver(dd, DD_RT_PRIO),
                   dd_owned_by_driver(dd, DD_BE_PRIO),
                   dd_owned_by_driver(dd, DD_IDLE_PRIO));
        return 0;
}

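Annotation (derived from the counters above, not stated in the patch): the debugfs views are plain differences over the per-priority statistics. A self-contained sketch of the two identities with invented sample values:

/* Accounting identities behind the debugfs output; the numbers are made up. */
#include <stdio.h>

int main(void)
{
        unsigned inserted = 120, merged = 5, dispatched = 110, completed = 100;

        /* Requests still inside the scheduler or in flight. */
        unsigned queued = inserted - completed;
        /* Requests currently owned by the block driver. */
        unsigned owned_by_driver = dispatched + merged - completed;

        printf("queued=%u owned_by_driver=%u\n", queued, owned_by_driver);  /* 20 15 */
        return 0;
}
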
@ -2,6 +2,8 @@
#
# Partition configuration
#
menu "Partition Types"

config PARTITION_ADVANCED
        bool "Advanced partition selection"
        help
@ -267,3 +269,5 @@ config CMDLINE_PARTITION
        help
          Say Y here if you want to read the partition table from bootargs.
          The format for the command line is just like mtdparts.

endmenu

@ -91,19 +91,19 @@ static void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors)
{
        spin_lock(&bdev->bd_size_lock);
        i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT);
        bdev->bd_nr_sectors = sectors;
        spin_unlock(&bdev->bd_size_lock);
}

static struct parsed_partitions *allocate_partitions(struct gendisk *hd)
{
        struct parsed_partitions *state;
        int nr;
        int nr = DISK_MAX_PARTS;

        state = kzalloc(sizeof(*state), GFP_KERNEL);
        if (!state)
                return NULL;

        nr = disk_max_parts(hd);
        state->parts = vzalloc(array_size(nr, sizeof(state->parts[0])));
        if (!state->parts) {
                kfree(state);
@ -204,7 +204,7 @@ static ssize_t part_alignment_offset_show(struct device *dev,
        struct block_device *bdev = dev_to_bdev(dev);

        return sprintf(buf, "%u\n",
                queue_limit_alignment_offset(&bdev->bd_disk->queue->limits,
                queue_limit_alignment_offset(&bdev_get_queue(bdev)->limits,
                                bdev->bd_start_sect));
}

@ -214,7 +214,7 @@ static ssize_t part_discard_alignment_show(struct device *dev,
        struct block_device *bdev = dev_to_bdev(dev);

        return sprintf(buf, "%u\n",
                queue_limit_discard_alignment(&bdev->bd_disk->queue->limits,
                queue_limit_discard_alignment(&bdev_get_queue(bdev)->limits,
                                bdev->bd_start_sect));
}

@ -325,7 +325,7 @@ static struct block_device *add_partition(struct gendisk *disk, int partno,

        lockdep_assert_held(&disk->open_mutex);

        if (partno >= disk_max_parts(disk))
        if (partno >= DISK_MAX_PARTS)
                return ERR_PTR(-EINVAL);

        /*
@ -526,18 +526,15 @@ int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start,

static bool disk_unlock_native_capacity(struct gendisk *disk)
{
        const struct block_device_operations *bdops = disk->fops;

        if (bdops->unlock_native_capacity &&
            !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) {
                printk(KERN_CONT "enabling native capacity\n");
                bdops->unlock_native_capacity(disk);
                disk->flags |= GENHD_FL_NATIVE_CAPACITY;
                return true;
        } else {
        if (!disk->fops->unlock_native_capacity ||
            test_and_set_bit(GD_NATIVE_CAPACITY, &disk->state)) {
                printk(KERN_CONT "truncated\n");
                return false;
        }

        printk(KERN_CONT "enabling native capacity\n");
        disk->fops->unlock_native_capacity(disk);
        return true;
}

void blk_drop_partitions(struct gendisk *disk)
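Annotation (behavioral sketch, not kernel code): switching from the GENHD_FL_NATIVE_CAPACITY flag to test_and_set_bit() on disk->state makes the check-and-mark step atomic, so only the first caller performs the unlock. A userspace approximation of that semantics using C11 atomics:

/* Userspace approximation of test_and_set_bit(); the kernel helper is atomic
 * and returns the bit's previous value, which is the property relied on here. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static bool test_and_set_bit_sk(int nr, atomic_ulong *addr)
{
        unsigned long mask = 1UL << nr;

        return atomic_fetch_or(addr, mask) & mask;  /* previous value of the bit */
}

int main(void)
{
        atomic_ulong state = 0;
        int native_capacity_bit = 0;  /* stand-in for GD_NATIVE_CAPACITY */

        /* First caller: bit was clear, so it proceeds to unlock native capacity. */
        printf("first:  %s\n", test_and_set_bit_sk(native_capacity_bit, &state)
               ? "already done" : "unlocking");
        /* Second caller: bit already set, so it reports truncation instead. */
        printf("second: %s\n", test_and_set_bit_sk(native_capacity_bit, &state)
               ? "already done" : "unlocking");
        return 0;
}
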
@ -606,7 +603,7 @@ static int blk_add_partitions(struct gendisk *disk)
        struct parsed_partitions *state;
        int ret = -EAGAIN, p;

        if (!disk_part_scan_enabled(disk))
        if (disk->flags & GENHD_FL_NO_PART)
                return 0;

        state = check_partition(disk);
@ -689,7 +686,7 @@ int bdev_disk_changed(struct gendisk *disk, bool invalidate)
         * userspace for this particular setup.
         */
        if (invalidate) {
                if (disk_part_scan_enabled(disk) ||
                if (!(disk->flags & GENHD_FL_NO_PART) ||
                    !(disk->flags & GENHD_FL_REMOVABLE))
                        set_capacity(disk, 0);
        }

@ -133,7 +133,7 @@ efi_crc32(const void *buf, unsigned long len)
 */
static u64 last_lba(struct gendisk *disk)
{
        return div_u64(disk->part0->bd_inode->i_size,
        return div_u64(bdev_nr_bytes(disk->part0),
                        queue_logical_block_size(disk->queue)) - 1ULL;
}

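Annotation (arithmetic only, not from the patch): bdev_nr_bytes() replaces the open-coded inode size read, and last_lba() still computes total_bytes / logical_block_size - 1. A small worked example with made-up disk parameters:

/* Worked example of the last_lba() arithmetic; the values are illustrative. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t nr_bytes = 64ULL * 1024 * 1024 * 1024;  /* 64 GiB block device */
        uint32_t logical_block_size = 4096;              /* 4 KiB logical blocks */

        uint64_t last_lba = nr_bytes / logical_block_size - 1;

        printf("last LBA: %llu\n", (unsigned long long)last_lba);  /* 16777215 */
        return 0;
}
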
@ -198,7 +198,7 @@ static int find_lnx1_partitions(struct parsed_partitions *state,
                                char name[],
                                union label_t *label,
                                sector_t labelsect,
                                loff_t i_size,
                                sector_t nr_sectors,
                                dasd_information2_t *info)
{
        loff_t offset, geo_size, size;
@ -213,14 +213,14 @@ static int find_lnx1_partitions(struct parsed_partitions *state,
        } else {
                /*
                 * Formated w/o large volume support. If the sanity check
                 * 'size based on geo == size based on i_size' is true, then
                 * 'size based on geo == size based on nr_sectors' is true, then
                 * we can safely assume that we know the formatted size of
                 * the disk, otherwise we need additional information
                 * that we can only get from a real DASD device.
                 */
                geo_size = geo->cylinders * geo->heads
                        * geo->sectors * secperblk;
                size = i_size >> 9;
                size = nr_sectors;
                if (size != geo_size) {
                        if (!info) {
                                strlcat(state->pp_buf, "\n", PAGE_SIZE);
@ -229,7 +229,7 @@ static int find_lnx1_partitions(struct parsed_partitions *state,
                        if (!strcmp(info->type, "ECKD"))
                                if (geo_size < size)
                                        size = geo_size;
                        /* else keep size based on i_size */
                        /* else keep size based on nr_sectors */
                }
        }
        /* first and only partition starts in the first block after the label */
@ -293,7 +293,8 @@ int ibm_partition(struct parsed_partitions *state)
        struct gendisk *disk = state->disk;
        struct block_device *bdev = disk->part0;
        int blocksize, res;
        loff_t i_size, offset, size;
        loff_t offset, size;
        sector_t nr_sectors;
        dasd_information2_t *info;
        struct hd_geometry *geo;
        char type[5] = {0,};
@ -308,8 +309,8 @@ int ibm_partition(struct parsed_partitions *state)
        blocksize = bdev_logical_block_size(bdev);
        if (blocksize <= 0)
                goto out_symbol;
        i_size = i_size_read(bdev->bd_inode);
        if (i_size == 0)
        nr_sectors = bdev_nr_sectors(bdev);
        if (nr_sectors == 0)
                goto out_symbol;
        info = kmalloc(sizeof(dasd_information2_t), GFP_KERNEL);
        if (info == NULL)
@ -336,7 +337,7 @@ int ibm_partition(struct parsed_partitions *state)
                                label);
        } else if (!strncmp(type, "LNX1", 4)) {
                res = find_lnx1_partitions(state, geo, blocksize, name,
                                           label, labelsect, i_size,
                                           label, labelsect, nr_sectors,
                                           info);
        } else if (!strncmp(type, "CMS1", 4)) {
                res = find_cms1_partitions(state, geo, blocksize, name,
@ -353,7 +354,7 @@ int ibm_partition(struct parsed_partitions *state)
                res = 1;
                if (info->format == DASD_FORMAT_LDL) {
                        strlcat(state->pp_buf, "(nonl)", PAGE_SIZE);
                        size = i_size >> 9;
                        size = nr_sectors;
                        offset = (info->label_block + 1) * (blocksize >> 9);
                        put_partition(state, 1, offset, size-offset);
                        strlcat(state->pp_buf, "\n", PAGE_SIZE);

@ -5,7 +5,7 @@
 */

#include <linux/t10-pi.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/crc-t10dif.h>
#include <linux/module.h>
#include <net/checksum.h>