Raziel K. Crowe 2022-04-02 17:29:52 +05:00
parent fb209289b8
commit 07d9c3128d
47 changed files with 3310 additions and 2782 deletions

block/Kconfig

@ -35,6 +35,9 @@ config BLK_CGROUP_RWSTAT
config BLK_DEV_BSG_COMMON
tristate
config BLK_ICQ
bool
config BLK_DEV_BSGLIB
bool "Block layer SG support v4 helper lib"
select BLK_DEV_BSG_COMMON
@ -73,7 +76,7 @@ config BLK_DEV_ZONED
config BLK_DEV_THROTTLING
bool "Block layer bio throttling support"
depends on BLK_CGROUP=y
depends on BLK_CGROUP
select BLK_CGROUP_RWSTAT
help
Block layer bio throttling support. It can be used to limit
@ -112,7 +115,7 @@ config BLK_WBT_MQ
config BLK_CGROUP_IOLATENCY
bool "Enable support for latency based cgroup IO protection"
depends on BLK_CGROUP=y
depends on BLK_CGROUP
help
Enabling this option enables the .latency interface for IO throttling.
The IO controller will attempt to maintain average IO latencies below
@ -132,7 +135,7 @@ config BLK_CGROUP_FC_APPID
config BLK_CGROUP_IOCOST
bool "Enable support for cost model based cgroup IO controller"
depends on BLK_CGROUP=y
depends on BLK_CGROUP
select BLK_RQ_IO_DATA_LEN
select BLK_RQ_ALLOC_TIME
help
@ -190,39 +193,31 @@ config BLK_INLINE_ENCRYPTION_FALLBACK
by falling back to the kernel crypto API when inline
encryption hardware is not present.
menu "Partition Types"
source "block/partitions/Kconfig"
endmenu
endif # BLOCK
config BLOCK_COMPAT
bool
depends on BLOCK && COMPAT
default y
def_bool COMPAT
config BLK_MQ_PCI
bool
depends on BLOCK && PCI
default y
def_bool PCI
config BLK_MQ_VIRTIO
bool
depends on BLOCK && VIRTIO
depends on VIRTIO
default y
config BLK_MQ_RDMA
bool
depends on BLOCK && INFINIBAND
depends on INFINIBAND
default y
config BLK_PM
def_bool BLOCK && PM
def_bool PM
# do not use in new code
config BLOCK_HOLDER_DEPRECATED
bool
source "block/Kconfig.iosched"
endif # BLOCK

block/Kconfig.iosched

@ -1,6 +1,4 @@
# SPDX-License-Identifier: GPL-2.0
if BLOCK
menu "IO Schedulers"
config MQ_IOSCHED_DEADLINE
@ -20,6 +18,7 @@ config MQ_IOSCHED_KYBER
config IOSCHED_BFQ
tristate "BFQ I/O scheduler"
select BLK_ICQ
help
BFQ I/O scheduler for BLK-MQ. BFQ distributes the bandwidth of
the device among all processes according to their weights,
@ -45,5 +44,3 @@ config BFQ_CGROUP_DEBUG
files in a cgroup which can be useful for debugging.
endmenu
endif

block/Makefile

@ -3,13 +3,13 @@
# Makefile for the kernel block layer
#
obj-$(CONFIG_BLOCK) := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
obj-y := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
blk-exec.o blk-merge.o blk-timeout.o \
blk-merge.o blk-timeout.o \
blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \
disk-events.o
disk-events.o blk-ia-ranges.o
obj-$(CONFIG_BOUNCE) += bounce.o
obj-$(CONFIG_BLK_DEV_BSG_COMMON) += bsg.o
@ -36,6 +36,6 @@ obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o
obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o
obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o
obj-$(CONFIG_BLK_PM) += blk-pm.o
obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += keyslot-manager.o blk-crypto.o
obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o
obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o
obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED) += holder.o

block/bdev.c

@ -12,6 +12,7 @@
#include <linux/major.h>
#include <linux/device_cgroup.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/backing-dev.h>
#include <linux/module.h>
#include <linux/blkpg.h>
@ -23,7 +24,6 @@
#include <linux/pseudo_fs.h>
#include <linux/uio.h>
#include <linux/namei.h>
#include <linux/cleancache.h>
#include <linux/part_stat.h>
#include <linux/uaccess.h>
#include "../fs/internal.h"
@ -87,10 +87,6 @@ void invalidate_bdev(struct block_device *bdev)
lru_add_drain_all(); /* make sure all lru add caches are flushed */
invalidate_mapping_pages(mapping, 0, -1);
}
/* 99% of the time, we don't need to flush the cleancache on the bdev.
* But, for the strange corners, lets be cautious
*/
cleancache_invalidate_inode(mapping);
}
EXPORT_SYMBOL(invalidate_bdev);
@ -184,14 +180,13 @@ int sb_min_blocksize(struct super_block *sb, int size)
EXPORT_SYMBOL(sb_min_blocksize);
int __sync_blockdev(struct block_device *bdev, int wait)
int sync_blockdev_nowait(struct block_device *bdev)
{
if (!bdev)
return 0;
if (!wait)
return filemap_flush(bdev->bd_inode->i_mapping);
return filemap_write_and_wait(bdev->bd_inode->i_mapping);
return filemap_flush(bdev->bd_inode->i_mapping);
}
EXPORT_SYMBOL_GPL(sync_blockdev_nowait);
/*
* Write out and wait upon all the dirty data associated with a block
@ -199,7 +194,9 @@ int __sync_blockdev(struct block_device *bdev, int wait)
*/
int sync_blockdev(struct block_device *bdev)
{
return __sync_blockdev(bdev, 1);
if (!bdev)
return 0;
return filemap_write_and_wait(bdev->bd_inode->i_mapping);
}
EXPORT_SYMBOL(sync_blockdev);
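Side note on the split above: __sync_blockdev(bdev, wait) becomes two exported helpers, one that only starts writeback and one that also waits for it. A hedged sketch of how a caller might choose between them (examplefs_sync_bdev is a hypothetical caller; only sync_blockdev() and sync_blockdev_nowait() come from this diff):

#include <linux/blkdev.h>

/* Illustration only: background writeback kicks the device without
 * blocking, while a data-integrity sync waits for completion. */
static int examplefs_sync_bdev(struct block_device *bdev, bool wait)
{
	if (!wait)
		return sync_blockdev_nowait(bdev);	/* filemap_flush() only */
	return sync_blockdev(bdev);			/* write out and wait   */
}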
@ -326,12 +323,12 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
if (!ops->rw_page || bdev_get_integrity(bdev))
return result;
result = blk_queue_enter(bdev->bd_disk->queue, 0);
result = blk_queue_enter(bdev_get_queue(bdev), 0);
if (result)
return result;
result = ops->rw_page(bdev, sector + get_start_sect(bdev), page,
REQ_OP_READ);
blk_queue_exit(bdev->bd_disk->queue);
blk_queue_exit(bdev_get_queue(bdev));
return result;
}
@ -362,7 +359,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
if (!ops->rw_page || bdev_get_integrity(bdev))
return -EOPNOTSUPP;
result = blk_queue_enter(bdev->bd_disk->queue, 0);
result = blk_queue_enter(bdev_get_queue(bdev), 0);
if (result)
return result;
@ -375,7 +372,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
clean_page_buffers(page);
unlock_page(page);
}
blk_queue_exit(bdev->bd_disk->queue);
blk_queue_exit(bdev_get_queue(bdev));
return result;
}
@ -492,6 +489,7 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
spin_lock_init(&bdev->bd_size_lock);
bdev->bd_partno = partno;
bdev->bd_inode = inode;
bdev->bd_queue = disk->queue;
bdev->bd_stats = alloc_percpu(struct disk_stats);
if (!bdev->bd_stats) {
iput(inode);
@ -662,7 +660,7 @@ static void blkdev_flush_mapping(struct block_device *bdev)
static int blkdev_get_whole(struct block_device *bdev, fmode_t mode)
{
struct gendisk *disk = bdev->bd_disk;
int ret = 0;
int ret;
if (disk->fops->open) {
ret = disk->fops->open(bdev, mode);
@ -747,21 +745,11 @@ struct block_device *blkdev_get_no_open(dev_t dev)
if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
bdev = NULL;
iput(inode);
if (!bdev)
return NULL;
if ((bdev->bd_disk->flags & GENHD_FL_HIDDEN) ||
!try_module_get(bdev->bd_disk->fops->owner)) {
put_device(&bdev->bd_device);
return NULL;
}
return bdev;
}
void blkdev_put_no_open(struct block_device *bdev)
{
module_put(bdev->bd_disk->fops->owner);
put_device(&bdev->bd_device);
}
@ -817,12 +805,14 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
ret = -ENXIO;
if (!disk_live(disk))
goto abort_claiming;
if (!try_module_get(disk->fops->owner))
goto abort_claiming;
if (bdev_is_partition(bdev))
ret = blkdev_get_part(bdev, mode);
else
ret = blkdev_get_whole(bdev, mode);
if (ret)
goto abort_claiming;
goto put_module;
if (mode & FMODE_EXCL) {
bd_finish_claiming(bdev, holder);
@ -834,7 +824,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
* used in blkdev_get/put().
*/
if ((mode & FMODE_WRITE) && !bdev->bd_write_holder &&
(disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) {
(disk->event_flags & DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE)) {
bdev->bd_write_holder = true;
unblock_events = false;
}
@ -844,7 +834,8 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
if (unblock_events)
disk_unblock_events(disk);
return bdev;
put_module:
module_put(disk->fops->owner);
abort_claiming:
if (mode & FMODE_EXCL)
bd_abort_claiming(bdev, holder);
@ -953,18 +944,21 @@ void blkdev_put(struct block_device *bdev, fmode_t mode)
blkdev_put_whole(bdev, mode);
mutex_unlock(&disk->open_mutex);
module_put(disk->fops->owner);
blkdev_put_no_open(bdev);
}
EXPORT_SYMBOL(blkdev_put);
/**
* lookup_bdev - lookup a struct block_device by name
* @pathname: special file representing the block device
* @dev: return value of the block device's dev_t
* lookup_bdev() - Look up a struct block_device by name.
* @pathname: Name of the block device in the filesystem.
* @dev: Pointer to the block device's dev_t, if found.
*
* Get a reference to the blockdevice at @pathname in the current
* namespace if possible and return it. Return ERR_PTR(error)
* otherwise.
* Lookup the block device's dev_t at @pathname in the current
* namespace if possible and return it in @dev.
*
* Context: May sleep.
* Return: 0 if succeeded, negative errno otherwise.
*/
int lookup_bdev(const char *pathname, dev_t *dev)
{
@ -1016,7 +1010,7 @@ int __invalidate_device(struct block_device *bdev, bool kill_dirty)
}
EXPORT_SYMBOL(__invalidate_device);
void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
void sync_bdevs(bool wait)
{
struct inode *inode, *old_inode = NULL;
@ -1047,8 +1041,19 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg)
bdev = I_BDEV(inode);
mutex_lock(&bdev->bd_disk->open_mutex);
if (bdev->bd_openers)
func(bdev, arg);
if (!bdev->bd_openers) {
; /* skip */
} else if (wait) {
/*
* We keep the error status of individual mapping so
* that applications can catch the writeback error using
* fsync(2). See filemap_fdatawait_keep_errors() for
* details.
*/
filemap_fdatawait_keep_errors(inode->i_mapping);
} else {
filemap_fdatawrite(inode->i_mapping);
}
mutex_unlock(&bdev->bd_disk->open_mutex);
spin_lock(&blockdev_superblock->s_inode_list_lock);
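iterate_bdevs() and its callbacks are folded into sync_bdevs(bool wait) in the hunk above. A hedged sketch of the expected two-pass calling pattern on a sync(2)-style path (the wrapper function is hypothetical; only sync_bdevs() comes from this diff):

#include <linux/blkdev.h>

/* Illustration only: first start writeback on every open block device,
 * then make a second pass that waits and preserves per-mapping errors
 * for later fsync(2)-style reporting. */
static void example_sync_all_bdevs(void)
{
	sync_bdevs(false);	/* filemap_fdatawrite() on each open bdev */
	sync_bdevs(true);	/* filemap_fdatawait_keep_errors() pass   */
}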

block/bfq-cgroup.c

@ -6,13 +6,13 @@
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/cgroup.h>
#include <linux/elevator.h>
#include <linux/ktime.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/sbitmap.h>
#include <linux/delay.h>
#include "elevator.h"
#include "bfq-iosched.h"
#ifdef CONFIG_BFQ_CGROUP_DEBUG
@ -463,7 +463,7 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
{
if (blkg_rwstat_init(&stats->bytes, gfp) ||
blkg_rwstat_init(&stats->ios, gfp))
return -ENOMEM;
goto error;
#ifdef CONFIG_BFQ_CGROUP_DEBUG
if (blkg_rwstat_init(&stats->merged, gfp) ||
@ -476,13 +476,15 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
bfq_stat_init(&stats->dequeue, gfp) ||
bfq_stat_init(&stats->group_wait_time, gfp) ||
bfq_stat_init(&stats->idle_time, gfp) ||
bfq_stat_init(&stats->empty_time, gfp)) {
bfqg_stats_exit(stats);
return -ENOMEM;
}
bfq_stat_init(&stats->empty_time, gfp))
goto error;
#endif
return 0;
error:
bfqg_stats_exit(stats);
return -ENOMEM;
}
static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
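The bfqg_stats_init() change above replaces an early return plus a duplicated cleanup branch with a single error label. A small, self-contained sketch of that unwind idiom (the struct, sizes and names are made up for illustration):

#include <linux/slab.h>
#include <linux/string.h>
#include <linux/errno.h>

struct example_stats { void *a, *b; };

/* Illustration only: every failure path funnels through one label that
 * frees whatever has been allocated so far; kfree(NULL) is a no-op,
 * just as bfqg_stats_exit() copes with partially set up stats. */
static int example_stats_init(struct example_stats *s, gfp_t gfp)
{
	memset(s, 0, sizeof(*s));

	s->a = kzalloc(64, gfp);
	if (!s->a)
		goto error;
	s->b = kzalloc(64, gfp);
	if (!s->b)
		goto error;
	return 0;

error:
	kfree(s->a);
	kfree(s->b);
	return -ENOMEM;
}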

block/bfq-iosched.c

@ -117,7 +117,6 @@
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/cgroup.h>
#include <linux/elevator.h>
#include <linux/ktime.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
@ -127,6 +126,7 @@
#include <trace/events/block.h>
#include "elevator.h"
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
@ -433,26 +433,21 @@ static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
/**
* bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
* @bfqd: the lookup key.
* @ioc: the io_context of the process doing I/O.
* @q: the request queue.
*/
static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
struct io_context *ioc,
struct request_queue *q)
static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q)
{
if (ioc) {
unsigned long flags;
struct bfq_io_cq *icq;
struct bfq_io_cq *icq;
unsigned long flags;
spin_lock_irqsave(&q->queue_lock, flags);
icq = icq_to_bic(ioc_lookup_icq(ioc, q));
spin_unlock_irqrestore(&q->queue_lock, flags);
if (!current->io_context)
return NULL;
return icq;
}
spin_lock_irqsave(&q->queue_lock, flags);
icq = icq_to_bic(ioc_lookup_icq(q));
spin_unlock_irqrestore(&q->queue_lock, flags);
return NULL;
return icq;
}
/*
@ -565,26 +560,134 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd,
}
}
#define BFQ_LIMIT_INLINE_DEPTH 16
#ifdef CONFIG_BFQ_GROUP_IOSCHED
static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit)
{
struct bfq_data *bfqd = bfqq->bfqd;
struct bfq_entity *entity = &bfqq->entity;
struct bfq_entity *inline_entities[BFQ_LIMIT_INLINE_DEPTH];
struct bfq_entity **entities = inline_entities;
int depth, level;
int class_idx = bfqq->ioprio_class - 1;
struct bfq_sched_data *sched_data;
unsigned long wsum;
bool ret = false;
if (!entity->on_st_or_in_serv)
return false;
/* +1 for bfqq entity, root cgroup not included */
depth = bfqg_to_blkg(bfqq_group(bfqq))->blkcg->css.cgroup->level + 1;
if (depth > BFQ_LIMIT_INLINE_DEPTH) {
entities = kmalloc_array(depth, sizeof(*entities), GFP_NOIO);
if (!entities)
return false;
}
spin_lock_irq(&bfqd->lock);
sched_data = entity->sched_data;
/* Gather our ancestors as we need to traverse them in reverse order */
level = 0;
for_each_entity(entity) {
/*
* If at some level entity is not even active, allow request
* queueing so that BFQ knows there's work to do and activate
* entities.
*/
if (!entity->on_st_or_in_serv)
goto out;
/* Uh, more parents than cgroup subsystem thinks? */
if (WARN_ON_ONCE(level >= depth))
break;
entities[level++] = entity;
}
WARN_ON_ONCE(level != depth);
for (level--; level >= 0; level--) {
entity = entities[level];
if (level > 0) {
wsum = bfq_entity_service_tree(entity)->wsum;
} else {
int i;
/*
* For bfqq itself we take into account service trees
* of all higher priority classes and multiply their
* weights so that low prio queue from higher class
* gets more requests than high prio queue from lower
* class.
*/
wsum = 0;
for (i = 0; i <= class_idx; i++) {
wsum = wsum * IOPRIO_BE_NR +
sched_data->service_tree[i].wsum;
}
}
limit = DIV_ROUND_CLOSEST(limit * entity->weight, wsum);
if (entity->allocated >= limit) {
bfq_log_bfqq(bfqq->bfqd, bfqq,
"too many requests: allocated %d limit %d level %d",
entity->allocated, limit, level);
ret = true;
break;
}
}
out:
spin_unlock_irq(&bfqd->lock);
if (entities != inline_entities)
kfree(entities);
return ret;
}
#else
static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit)
{
return false;
}
#endif
/*
* Async I/O can easily starve sync I/O (both sync reads and sync
* writes), by consuming all tags. Similarly, storms of sync writes,
* such as those that sync(2) may trigger, can starve sync reads.
* Limit depths of async I/O and sync writes so as to counter both
* problems.
*
* Also if a bfq queue or its parent cgroup consumes more tags than would be
* appropriate for their weight, we trim the available tag depth to 1. This
* avoids a situation where one cgroup can starve another cgroup from tags and
* thus block service differentiation among cgroups. Note that because the
* queue / cgroup already has many requests allocated and queued, this does not
* significantly affect service guarantees coming from the BFQ scheduling
* algorithm.
*/
static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
{
struct bfq_data *bfqd = data->q->elevator->elevator_data;
struct bfq_io_cq *bic = bfq_bic_lookup(data->q);
struct bfq_queue *bfqq = bic ? bic_to_bfqq(bic, op_is_sync(op)) : NULL;
int depth;
unsigned limit = data->q->nr_requests;
if (op_is_sync(op) && !op_is_write(op))
return;
/* Sync reads have full depth available */
if (op_is_sync(op) && !op_is_write(op)) {
depth = 0;
} else {
depth = bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)];
limit = (limit * depth) >> bfqd->full_depth_shift;
}
data->shallow_depth =
bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)];
/*
* Does queue (or any parent entity) exceed number of requests that
* should be available to it? Heavily limit depth so that it cannot
* consume more available requests and thus starve other entities.
*/
if (bfqq && bfqq_request_over_limit(bfqq, limit))
depth = 1;
bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u",
__func__, bfqd->wr_busy_queues, op_is_sync(op),
data->shallow_depth);
__func__, bfqd->wr_busy_queues, op_is_sync(op), depth);
if (depth)
data->shallow_depth = depth;
}
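To make the depth arithmetic in bfq_limit_depth() above concrete, here is a stand-alone calculation with assumed numbers (nr_requests = 64, full_depth_shift = 6, an async word depth of 32, and an entity holding half the total weight; none of these values come from the diff, only the formulas do):

#include <stdio.h>

#define DIV_ROUND_CLOSEST(x, d) (((x) + (d) / 2) / (d))

int main(void)
{
	unsigned int nr_requests = 64;		/* assumed queue depth                  */
	unsigned int full_depth_shift = 6;	/* sbitmap word of 1 << 6 = 64 bits     */
	unsigned int word_depth = 32;		/* async depth, no weight-raised queues */
	unsigned int weight = 100, wsum = 200;	/* entity holds half the total weight   */

	unsigned int limit = (nr_requests * word_depth) >> full_depth_shift;	/* 64*32/64 = 32  */
	limit = DIV_ROUND_CLOSEST(limit * weight, wsum);			/* 32*100/200 = 16 */

	printf("per-entity request limit: %u\n", limit);
	/* Once entity->allocated reaches 16 here, bfq_limit_depth() would
	 * clamp the shallow depth to 1 for further allocations. */
	return 0;
}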
static struct bfq_queue *
@ -1113,7 +1216,8 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
static int bfqq_process_refs(struct bfq_queue *bfqq)
{
return bfqq->ref - bfqq->allocated - bfqq->entity.on_st_or_in_serv -
return bfqq->ref - bfqq->entity.allocated -
bfqq->entity.on_st_or_in_serv -
(bfqq->weight_counter != NULL) - bfqq->stable_ref;
}
@ -1982,20 +2086,19 @@ static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns)
* aspect, see the comments on the choice of the queue for injection
* in bfq_select_queue().
*
* Turning back to the detection of a waker queue, a queue Q is deemed
* as a waker queue for bfqq if, for three consecutive times, bfqq
* happens to become non empty right after a request of Q has been
* completed. In this respect, even if bfqq is empty, we do not check
* for a waker if it still has some in-flight I/O. In fact, in this
* case bfqq is actually still being served by the drive, and may
* receive new I/O on the completion of some of the in-flight
* requests. In particular, on the first time, Q is tentatively set as
* a candidate waker queue, while on the third consecutive time that Q
* is detected, the field waker_bfqq is set to Q, to confirm that Q is
* a waker queue for bfqq. These detection steps are performed only if
* bfqq has a long think time, so as to make it more likely that
* bfqq's I/O is actually being blocked by a synchronization. This
* last filter, plus the above three-times requirement, make false
* Turning back to the detection of a waker queue, a queue Q is deemed as a
* waker queue for bfqq if, for three consecutive times, bfqq happens to become
* non empty right after a request of Q has been completed within given
* timeout. In this respect, even if bfqq is empty, we do not check for a waker
* if it still has some in-flight I/O. In fact, in this case bfqq is actually
* still being served by the drive, and may receive new I/O on the completion
* of some of the in-flight requests. In particular, on the first time, Q is
* tentatively set as a candidate waker queue, while on the third consecutive
* time that Q is detected, the field waker_bfqq is set to Q, to confirm that Q
* is a waker queue for bfqq. These detection steps are performed only if bfqq
* has a long think time, so as to make it more likely that bfqq's I/O is
* actually being blocked by a synchronization. This last filter, plus the
* above three-times requirement and time limit for detection, make false
* positives less likely.
*
* NOTE
@ -2019,6 +2122,8 @@ static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns)
static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq,
u64 now_ns)
{
char waker_name[MAX_BFQQ_NAME_LENGTH];
if (!bfqd->last_completed_rq_bfqq ||
bfqd->last_completed_rq_bfqq == bfqq ||
bfq_bfqq_has_short_ttime(bfqq) ||
@ -2027,8 +2132,16 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfqd->last_completed_rq_bfqq == bfqq->waker_bfqq)
return;
/*
* We reset waker detection logic also if too much time has passed
* since the first detection. If wakeups are rare, pointless idling
* doesn't hurt throughput that much. The condition below makes sure
* we do not uselessly idle waiting for a blocking waker in more than 1/64 of the cases.
*/
if (bfqd->last_completed_rq_bfqq !=
bfqq->tentative_waker_bfqq) {
bfqq->tentative_waker_bfqq ||
now_ns > bfqq->waker_detection_started +
128 * (u64)bfqd->bfq_slice_idle) {
/*
* First synchronization detected with a
* candidate waker queue, or with a different
@ -2037,12 +2150,19 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfqq->tentative_waker_bfqq =
bfqd->last_completed_rq_bfqq;
bfqq->num_waker_detections = 1;
bfqq->waker_detection_started = now_ns;
bfq_bfqq_name(bfqq->tentative_waker_bfqq, waker_name,
MAX_BFQQ_NAME_LENGTH);
bfq_log_bfqq(bfqd, bfqq, "set tentative waker %s", waker_name);
} else /* Same tentative waker queue detected again */
bfqq->num_waker_detections++;
if (bfqq->num_waker_detections == 3) {
bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq;
bfqq->tentative_waker_bfqq = NULL;
bfq_bfqq_name(bfqq->waker_bfqq, waker_name,
MAX_BFQQ_NAME_LENGTH);
bfq_log_bfqq(bfqd, bfqq, "set waker %s", waker_name);
/*
* If the waker queue disappears, then
@ -2332,7 +2452,7 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio,
* returned by bfq_bic_lookup does not go away before
* bfqd->lock is taken.
*/
struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q);
struct bfq_io_cq *bic = bfq_bic_lookup(q);
bool ret;
spin_lock_irq(&bfqd->lock);
@ -5878,6 +5998,22 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
}
}
static void bfqq_request_allocated(struct bfq_queue *bfqq)
{
struct bfq_entity *entity = &bfqq->entity;
for_each_entity(entity)
entity->allocated++;
}
static void bfqq_request_freed(struct bfq_queue *bfqq)
{
struct bfq_entity *entity = &bfqq->entity;
for_each_entity(entity)
entity->allocated--;
}
/* returns true if it causes the idle timer to be disabled */
static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
{
@ -5891,8 +6027,8 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
* Release the request's reference to the old bfqq
* and make sure one is taken to the shared queue.
*/
new_bfqq->allocated++;
bfqq->allocated--;
bfqq_request_allocated(new_bfqq);
bfqq_request_freed(bfqq);
new_bfqq->ref++;
/*
* If the bic associated with the process
@ -6209,8 +6345,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq)
{
bfqq->allocated--;
bfqq_request_freed(bfqq);
bfq_put_queue(bfqq);
}
@ -6434,6 +6569,16 @@ static void bfq_finish_requeue_request(struct request *rq)
rq->elv.priv[1] = NULL;
}
static void bfq_finish_request(struct request *rq)
{
bfq_finish_requeue_request(rq);
if (rq->elv.icq) {
put_io_context(rq->elv.icq->ioc);
rq->elv.icq = NULL;
}
}
/*
* Removes the association between the current task and bfqq, assuming
* that bic points to the bfq iocontext of the task.
@ -6531,6 +6676,8 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
*/
static void bfq_prepare_request(struct request *rq)
{
rq->elv.icq = ioc_find_get_icq(rq->q);
/*
* Regardless of whether we have an icq attached, we have to
* clear the scheduler pointers, as they might point to
@ -6630,7 +6777,7 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
}
}
bfqq->allocated++;
bfqq_request_allocated(bfqq);
bfqq->ref++;
bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d",
rq, bfqq, bfqq->ref);
@ -6793,11 +6940,11 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
* See the comments on bfq_limit_depth for the purpose of
* the depths set in the function. Return minimum shallow depth we'll use.
*/
static unsigned int bfq_update_depths(struct bfq_data *bfqd,
struct sbitmap_queue *bt)
static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
{
unsigned int i, j, min_shallow = UINT_MAX;
unsigned int depth = 1U << bt->sb.shift;
bfqd->full_depth_shift = bt->sb.shift;
/*
* In-word depths if no bfq_queue is being weight-raised:
* leaving 25% of tags only for sync reads.
@ -6809,13 +6956,13 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd,
* limit 'something'.
*/
/* no more than 50% of tags for async I/O */
bfqd->word_depths[0][0] = max((1U << bt->sb.shift) >> 1, 1U);
bfqd->word_depths[0][0] = max(depth >> 1, 1U);
/*
* no more than 75% of tags for sync writes (25% extra tags
* w.r.t. async I/O, to prevent async I/O from starving sync
* writes)
*/
bfqd->word_depths[0][1] = max(((1U << bt->sb.shift) * 3) >> 2, 1U);
bfqd->word_depths[0][1] = max((depth * 3) >> 2, 1U);
/*
* In-word depths in case some bfq_queue is being weight-
@ -6825,25 +6972,18 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd,
* shortage.
*/
/* no more than ~18% of tags for async I/O */
bfqd->word_depths[1][0] = max(((1U << bt->sb.shift) * 3) >> 4, 1U);
bfqd->word_depths[1][0] = max((depth * 3) >> 4, 1U);
/* no more than ~37% of tags for sync writes (~20% extra tags) */
bfqd->word_depths[1][1] = max(((1U << bt->sb.shift) * 6) >> 4, 1U);
for (i = 0; i < 2; i++)
for (j = 0; j < 2; j++)
min_shallow = min(min_shallow, bfqd->word_depths[i][j]);
return min_shallow;
bfqd->word_depths[1][1] = max((depth * 6) >> 4, 1U);
}
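For a quick sanity check of the rewritten bfq_update_depths(), this stand-alone snippet evaluates the four word depths for an assumed sbitmap shift of 6, i.e. a 64-tag word (the shift value is an assumption; the formulas are copied from the function above):

#include <stdio.h>

static unsigned int max_u(unsigned int a, unsigned int b) { return a > b ? a : b; }

int main(void)
{
	unsigned int shift = 6;			/* assumed bt->sb.shift    */
	unsigned int depth = 1U << shift;	/* 64 tags per word        */
	unsigned int d[2][2];

	d[0][0] = max_u(depth >> 1, 1U);	/* 32: <=50% of tags for async I/O          */
	d[0][1] = max_u((depth * 3) >> 2, 1U);	/* 48: <=75% for sync writes                */
	d[1][0] = max_u((depth * 3) >> 4, 1U);	/* 12: ~18% for async when weight-raised    */
	d[1][1] = max_u((depth * 6) >> 4, 1U);	/* 24: ~37% for sync writes, weight-raised  */

	printf("%u %u %u %u\n", d[0][0], d[0][1], d[1][0], d[1][1]);
	return 0;
}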
static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx)
{
struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
struct blk_mq_tags *tags = hctx->sched_tags;
unsigned int min_shallow;
min_shallow = bfq_update_depths(bfqd, tags->bitmap_tags);
sbitmap_queue_min_shallow_depth(tags->bitmap_tags, min_shallow);
bfq_update_depths(bfqd, &tags->bitmap_tags);
sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1);
}
static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index)
@ -7260,7 +7400,7 @@ static struct elevator_type iosched_bfq_mq = {
.limit_depth = bfq_limit_depth,
.prepare_request = bfq_prepare_request,
.requeue_request = bfq_finish_requeue_request,
.finish_request = bfq_finish_requeue_request,
.finish_request = bfq_finish_request,
.exit_icq = bfq_exit_icq,
.insert_requests = bfq_insert_requests,
.dispatch_request = bfq_dispatch_request,

block/bfq-iosched.h

@ -25,7 +25,7 @@
#define BFQ_DEFAULT_GRP_IOPRIO 0
#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
#define MAX_PID_STR_LENGTH 12
#define MAX_BFQQ_NAME_LENGTH 16
/*
* Soft real-time applications are extremely more latency sensitive
@ -170,6 +170,9 @@ struct bfq_entity {
/* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
int budget;
/* Number of requests allocated in the subtree of this entity */
int allocated;
/* device weight, if non-zero, it overrides the default weight of
* bfq_group_data */
int dev_weight;
@ -266,8 +269,6 @@ struct bfq_queue {
struct request *next_rq;
/* number of sync and async requests queued */
int queued[2];
/* number of requests currently allocated */
int allocated;
/* number of pending metadata requests */
int meta_pending;
/* fifo list of requests in sort_list */
@ -387,6 +388,8 @@ struct bfq_queue {
struct bfq_queue *tentative_waker_bfqq;
/* number of times the same tentative waker has been detected */
unsigned int num_waker_detections;
/* time when we started considering this waker */
u64 waker_detection_started;
/* node for woken_list, see below */
struct hlist_node woken_list_node;
@ -768,6 +771,7 @@ struct bfq_data {
* function)
*/
unsigned int word_depths[2][2];
unsigned int full_depth_shift;
};
enum bfqq_state_flags {
@ -1079,26 +1083,27 @@ void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq);
/* --------------- end of interface of B-WF2Q+ ---------------- */
/* Logging facilities. */
static inline void bfq_pid_to_str(int pid, char *str, int len)
static inline void bfq_bfqq_name(struct bfq_queue *bfqq, char *str, int len)
{
if (pid != -1)
snprintf(str, len, "%d", pid);
char type = bfq_bfqq_sync(bfqq) ? 'S' : 'A';
if (bfqq->pid != -1)
snprintf(str, len, "bfq%d%c", bfqq->pid, type);
else
snprintf(str, len, "SHARED-");
snprintf(str, len, "bfqSHARED-%c", type);
}
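bfq_bfqq_name() above replaces bfq_pid_to_str() and folds the "bfq" prefix and the sync/async suffix into the buffer itself, so trace lines now carry names such as "bfq1234S" or "bfqSHARED-A". A trivial stand-alone illustration of the resulting strings (pid and flags are made up):

#include <stdio.h>

int main(void)
{
	char name[16];	/* MAX_BFQQ_NAME_LENGTH from the hunk above */

	snprintf(name, sizeof(name), "bfq%d%c", 1234, 'S');	/* "bfq1234S"    */
	puts(name);
	snprintf(name, sizeof(name), "bfqSHARED-%c", 'A');	/* "bfqSHARED-A" */
	puts(name);
	return 0;
}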
#ifdef CONFIG_BFQ_GROUP_IOSCHED
struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
char pid_str[MAX_PID_STR_LENGTH]; \
char pid_str[MAX_BFQQ_NAME_LENGTH]; \
if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \
break; \
bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \
bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH); \
blk_add_cgroup_trace_msg((bfqd)->queue, \
bfqg_to_blkg(bfqq_group(bfqq))->blkcg, \
"bfq%s%c " fmt, pid_str, \
bfq_bfqq_sync((bfqq)) ? 'S' : 'A', ##args); \
"%s " fmt, pid_str, ##args); \
} while (0)
#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \
@ -1109,13 +1114,11 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
#else /* CONFIG_BFQ_GROUP_IOSCHED */
#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
char pid_str[MAX_PID_STR_LENGTH]; \
char pid_str[MAX_BFQQ_NAME_LENGTH]; \
if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \
break; \
bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \
blk_add_trace_msg((bfqd)->queue, "bfq%s%c " fmt, pid_str, \
bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
##args); \
bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH); \
blk_add_trace_msg((bfqd)->queue, "%s " fmt, pid_str, ##args); \
} while (0)
#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0)

block/bio-integrity.c

@ -6,7 +6,7 @@
* Written by: Martin K. Petersen <martin.petersen@oracle.com>
*/
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/mempool.h>
#include <linux/export.h>
#include <linux/bio.h>
@ -134,7 +134,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
iv = bip->bip_vec + bip->bip_vcnt;
if (bip->bip_vcnt &&
bvec_gap_to_prev(bio->bi_bdev->bd_disk->queue,
bvec_gap_to_prev(bdev_get_queue(bio->bi_bdev),
&bip->bip_vec[bip->bip_vcnt - 1], offset))
return 0;

block/bio.c

@ -26,7 +26,7 @@
#include "blk-rq-qos.h"
struct bio_alloc_cache {
struct bio_list free_list;
struct bio *free_list;
unsigned int nr;
};
@ -87,7 +87,8 @@ static struct bio_slab *create_bio_slab(unsigned int size)
snprintf(bslab->name, sizeof(bslab->name), "bio-%d", size);
bslab->slab = kmem_cache_create(bslab->name, size,
ARCH_KMALLOC_MINALIGN, SLAB_HWCACHE_ALIGN, NULL);
ARCH_KMALLOC_MINALIGN,
SLAB_HWCACHE_ALIGN | SLAB_TYPESAFE_BY_RCU, NULL);
if (!bslab->slab)
goto fail_alloc_slab;
@ -156,7 +157,7 @@ static void bio_put_slab(struct bio_set *bs)
void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs)
{
BIO_BUG_ON(nr_vecs > BIO_MAX_VECS);
BUG_ON(nr_vecs > BIO_MAX_VECS);
if (nr_vecs == BIO_MAX_VECS)
mempool_free(bv, pool);
@ -281,6 +282,7 @@ void bio_init(struct bio *bio, struct bio_vec *table,
atomic_set(&bio->__bi_remaining, 1);
atomic_set(&bio->__bi_cnt, 1);
bio->bi_cookie = BLK_QC_T_NONE;
bio->bi_max_vecs = max_vecs;
bio->bi_io_vec = table;
@ -546,7 +548,7 @@ EXPORT_SYMBOL(zero_fill_bio);
* REQ_OP_READ, zero the truncated part. This function should only
* be used for handling corner cases, such as bio eod.
*/
void bio_truncate(struct bio *bio, unsigned new_size)
static void bio_truncate(struct bio *bio, unsigned new_size)
{
struct bio_vec bv;
struct bvec_iter iter;
@ -629,7 +631,8 @@ static void bio_alloc_cache_prune(struct bio_alloc_cache *cache,
unsigned int i = 0;
struct bio *bio;
while ((bio = bio_list_pop(&cache->free_list)) != NULL) {
while ((bio = cache->free_list) != NULL) {
cache->free_list = bio->bi_next;
cache->nr--;
bio_free(bio);
if (++i == nr)
@ -678,7 +681,7 @@ static void bio_alloc_cache_destroy(struct bio_set *bs)
void bio_put(struct bio *bio)
{
if (unlikely(bio_flagged(bio, BIO_REFFED))) {
BIO_BUG_ON(!atomic_read(&bio->__bi_cnt));
BUG_ON(!atomic_read(&bio->__bi_cnt));
if (!atomic_dec_and_test(&bio->__bi_cnt))
return;
}
@ -688,7 +691,8 @@ void bio_put(struct bio *bio)
bio_uninit(bio);
cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu());
bio_list_add_head(&cache->free_list, bio);
bio->bi_next = cache->free_list;
cache->free_list = bio;
if (++cache->nr > ALLOC_CACHE_MAX + ALLOC_CACHE_SLACK)
bio_alloc_cache_prune(cache, ALLOC_CACHE_SLACK);
put_cpu();
@ -773,6 +777,23 @@ const char *bio_devname(struct bio *bio, char *buf)
}
EXPORT_SYMBOL(bio_devname);
/**
* bio_full - check if the bio is full
* @bio: bio to check
* @len: length of one segment to be added
*
* Return true if @bio is full and one segment with @len bytes can't be
* added to the bio, otherwise return false
*/
static inline bool bio_full(struct bio *bio, unsigned len)
{
if (bio->bi_vcnt >= bio->bi_max_vecs)
return true;
if (bio->bi_iter.bi_size > UINT_MAX - len)
return true;
return false;
}
static inline bool page_is_mergeable(const struct bio_vec *bv,
struct page *page, unsigned int len, unsigned int off,
bool *same_page)
@ -792,6 +813,44 @@ static inline bool page_is_mergeable(const struct bio_vec *bv,
return (bv->bv_page + bv_end / PAGE_SIZE) == (page + off / PAGE_SIZE);
}
/**
* __bio_try_merge_page - try appending data to an existing bvec.
* @bio: destination bio
* @page: start page to add
* @len: length of the data to add
* @off: offset of the data relative to @page
* @same_page: return if the segment has been merged inside the same page
*
* Try to add the data at @page + @off to the last bvec of @bio. This is a
* useful optimisation for file systems with a block size smaller than the
* page size.
*
* Warn if (@len, @off) crosses pages in case that @same_page is true.
*
* Return %true on success or %false on failure.
*/
static bool __bio_try_merge_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int off, bool *same_page)
{
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
return false;
if (bio->bi_vcnt > 0) {
struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
if (page_is_mergeable(bv, page, len, off, same_page)) {
if (bio->bi_iter.bi_size > UINT_MAX - len) {
*same_page = false;
return false;
}
bv->bv_len += len;
bio->bi_iter.bi_size += len;
return true;
}
}
return false;
}
/*
* Try to merge a page into a segment, while obeying the hardware segment
* size limit. This is not for normal read/write bios, but for passthrough
@ -909,7 +968,7 @@ EXPORT_SYMBOL(bio_add_pc_page);
int bio_add_zone_append_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int offset)
{
struct request_queue *q = bio->bi_bdev->bd_disk->queue;
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
bool same_page = false;
if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_ZONE_APPEND))
@ -923,45 +982,6 @@ int bio_add_zone_append_page(struct bio *bio, struct page *page,
}
EXPORT_SYMBOL_GPL(bio_add_zone_append_page);
/**
* __bio_try_merge_page - try appending data to an existing bvec.
* @bio: destination bio
* @page: start page to add
* @len: length of the data to add
* @off: offset of the data relative to @page
* @same_page: return if the segment has been merged inside the same page
*
* Try to add the data at @page + @off to the last bvec of @bio. This is a
* useful optimisation for file systems with a block size smaller than the
* page size.
*
* Warn if (@len, @off) crosses pages in case that @same_page is true.
*
* Return %true on success or %false on failure.
*/
bool __bio_try_merge_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int off, bool *same_page)
{
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
return false;
if (bio->bi_vcnt > 0) {
struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
if (page_is_mergeable(bv, page, len, off, same_page)) {
if (bio->bi_iter.bi_size > UINT_MAX - len) {
*same_page = false;
return false;
}
bv->bv_len += len;
bio->bi_iter.bi_size += len;
return true;
}
}
return false;
}
EXPORT_SYMBOL_GPL(__bio_try_merge_page);
/**
* __bio_add_page - add page(s) to a bio in a new segment
* @bio: destination bio
@ -1016,52 +1036,62 @@ int bio_add_page(struct bio *bio, struct page *page,
}
EXPORT_SYMBOL(bio_add_page);
void bio_release_pages(struct bio *bio, bool mark_dirty)
/**
* bio_add_folio - Attempt to add part of a folio to a bio.
* @bio: BIO to add to.
* @folio: Folio to add.
* @len: How many bytes from the folio to add.
* @off: First byte in this folio to add.
*
* Filesystems that use folios can call this function instead of calling
* bio_add_page() for each page in the folio. If @off is bigger than
* PAGE_SIZE, this function can create a bio_vec that starts in a page
* after the bv_page. BIOs do not support folios that are 4GiB or larger.
*
* Return: Whether the addition was successful.
*/
bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len,
size_t off)
{
if (len > UINT_MAX || off > UINT_MAX)
return 0;
return bio_add_page(bio, &folio->page, len, off) > 0;
}
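bio_add_folio() above is the folio-native counterpart to bio_add_page(). A hedged sketch of how a filesystem I/O path might use it (the surrounding function and its name are hypothetical; only bio_add_folio() comes from this diff):

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/mm.h>

/* Illustration only: try to queue one folio into the current bio and
 * submit the bio when it is already full; a real caller would then
 * allocate a fresh bio and retry this folio. */
static void examplefs_add_folio_or_submit(struct bio *bio, struct folio *folio)
{
	if (!bio_add_folio(bio, folio, folio_size(folio), 0))
		submit_bio(bio);
}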
void __bio_release_pages(struct bio *bio, bool mark_dirty)
{
struct bvec_iter_all iter_all;
struct bio_vec *bvec;
if (bio_flagged(bio, BIO_NO_PAGE_REF))
return;
bio_for_each_segment_all(bvec, bio, iter_all) {
if (mark_dirty && !PageCompound(bvec->bv_page))
set_page_dirty_lock(bvec->bv_page);
put_page(bvec->bv_page);
}
}
EXPORT_SYMBOL_GPL(bio_release_pages);
EXPORT_SYMBOL_GPL(__bio_release_pages);
static void __bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
{
size_t size = iov_iter_count(iter);
WARN_ON_ONCE(bio->bi_max_vecs);
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
size_t max_sectors = queue_max_zone_append_sectors(q);
size = min(size, max_sectors << SECTOR_SHIFT);
}
bio->bi_vcnt = iter->nr_segs;
bio->bi_io_vec = (struct bio_vec *)iter->bvec;
bio->bi_iter.bi_bvec_done = iter->iov_offset;
bio->bi_iter.bi_size = iter->count;
bio->bi_iter.bi_size = size;
bio_set_flag(bio, BIO_NO_PAGE_REF);
bio_set_flag(bio, BIO_CLONED);
}
static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
{
__bio_iov_bvec_set(bio, iter);
iov_iter_advance(iter, iter->count);
return 0;
}
static int bio_iov_bvec_set_append(struct bio *bio, struct iov_iter *iter)
{
struct request_queue *q = bio->bi_bdev->bd_disk->queue;
struct iov_iter i = *iter;
iov_iter_truncate(&i, queue_max_zone_append_sectors(q) << 9);
__bio_iov_bvec_set(bio, &i);
iov_iter_advance(iter, i.count);
return 0;
}
static void bio_put_pages(struct page **pages, size_t size, size_t off)
{
size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE);
@ -1131,7 +1161,7 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
{
unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
struct request_queue *q = bio->bi_bdev->bd_disk->queue;
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
unsigned int max_append_sectors = queue_max_zone_append_sectors(q);
struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
struct page **pages = (struct page **)bv;
@ -1203,9 +1233,9 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
int ret = 0;
if (iov_iter_is_bvec(iter)) {
if (bio_op(bio) == REQ_OP_ZONE_APPEND)
return bio_iov_bvec_set_append(bio, iter);
return bio_iov_bvec_set(bio, iter);
bio_iov_bvec_set(bio, iter);
iov_iter_advance(iter, bio->bi_iter.bi_size);
return 0;
}
do {
@ -1261,18 +1291,7 @@ int submit_bio_wait(struct bio *bio)
}
EXPORT_SYMBOL(submit_bio_wait);
/**
* bio_advance - increment/complete a bio by some number of bytes
* @bio: bio to advance
* @bytes: number of bytes to complete
*
* This updates bi_sector, bi_size and bi_idx; if the number of bytes to
* complete doesn't align with a bvec boundary, then bv_len and bv_offset will
* be updated on the last bvec as well.
*
* @bio will then represent the remaining, uncompleted portion of the io.
*/
void bio_advance(struct bio *bio, unsigned bytes)
void __bio_advance(struct bio *bio, unsigned bytes)
{
if (bio_integrity(bio))
bio_integrity_advance(bio, bytes);
@ -1280,7 +1299,7 @@ void bio_advance(struct bio *bio, unsigned bytes)
bio_crypt_advance(bio, bytes);
bio_advance_iter(bio, &bio->bi_iter, bytes);
}
EXPORT_SYMBOL(bio_advance);
EXPORT_SYMBOL(__bio_advance);
void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
struct bio *src, struct bvec_iter *src_iter)
@ -1468,10 +1487,10 @@ void bio_endio(struct bio *bio)
return;
if (bio->bi_bdev && bio_flagged(bio, BIO_TRACKED))
rq_qos_done_bio(bio->bi_bdev->bd_disk->queue, bio);
rq_qos_done_bio(bdev_get_queue(bio->bi_bdev), bio);
if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
trace_block_bio_complete(bio->bi_bdev->bd_disk->queue, bio);
trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio);
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
}
@ -1710,8 +1729,9 @@ struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs,
return bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs);
cache = per_cpu_ptr(bs->cache, get_cpu());
bio = bio_list_pop(&cache->free_list);
if (bio) {
if (cache->free_list) {
bio = cache->free_list;
cache->free_list = bio->bi_next;
cache->nr--;
put_cpu();
bio_init(bio, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs);
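The alloc-cache rework in this file threads free bios through bi_next instead of a struct bio_list, as the pop path above and the put path earlier show. A stand-alone sketch of that push/pop idiom (generic types, not the kernel structures):

#include <stddef.h>

struct node { struct node *next; };
struct cache { struct node *free_list; unsigned int nr; };

/* LIFO push: the most recently freed object sits at the head. */
static void cache_push(struct cache *c, struct node *n)
{
	n->next = c->free_list;
	c->free_list = n;
	c->nr++;
}

/* Pop the head, or NULL when the cache is empty. */
static struct node *cache_pop(struct cache *c)
{
	struct node *n = c->free_list;

	if (n) {
		c->free_list = n->next;
		c->nr--;
	}
	return n;
}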

block/blk-cgroup.c

@ -30,8 +30,10 @@
#include <linux/blk-cgroup.h>
#include <linux/tracehook.h>
#include <linux/psi.h>
#include <linux/part_stat.h>
#include "blk.h"
#include "blk-ioprio.h"
#include "blk-throttle.h"
/*
* blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
@ -620,7 +622,7 @@ struct block_device *blkcg_conf_open_bdev(char **inputp)
*/
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
char *input, struct blkg_conf_ctx *ctx)
__acquires(rcu) __acquires(&bdev->bd_disk->queue->queue_lock)
__acquires(rcu) __acquires(&bdev->bd_queue->queue_lock)
{
struct block_device *bdev;
struct request_queue *q;
@ -631,7 +633,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
if (IS_ERR(bdev))
return PTR_ERR(bdev);
q = bdev->bd_disk->queue;
q = bdev_get_queue(bdev);
/*
* blkcg_deactivate_policy() requires queue to be frozen, we can grab
@ -747,9 +749,9 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep);
* with blkg_conf_prep().
*/
void blkg_conf_finish(struct blkg_conf_ctx *ctx)
__releases(&ctx->bdev->bd_disk->queue->queue_lock) __releases(rcu)
__releases(&ctx->bdev->bd_queue->queue_lock) __releases(rcu)
{
spin_unlock_irq(&ctx->bdev->bd_disk->queue->queue_lock);
spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock);
rcu_read_unlock();
blkdev_put_no_open(ctx->bdev);
}
@ -852,7 +854,7 @@ static void blkcg_fill_root_iostats(void)
while ((dev = class_dev_iter_next(&iter))) {
struct block_device *bdev = dev_to_bdev(dev);
struct blkcg_gq *blkg =
blk_queue_root_blkg(bdev->bd_disk->queue);
blk_queue_root_blkg(bdev_get_queue(bdev));
struct blkg_iostat tmp;
int cpu;
@ -1811,7 +1813,7 @@ static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio,
rcu_read_lock();
blkg = blkg_lookup_create(css_to_blkcg(css),
bio->bi_bdev->bd_disk->queue);
bdev_get_queue(bio->bi_bdev));
while (blkg) {
if (blkg_tryget(blkg)) {
ret_blkg = blkg;
@ -1847,8 +1849,8 @@ void bio_associate_blkg_from_css(struct bio *bio,
if (css && css->parent) {
bio->bi_blkg = blkg_tryget_closest(bio, css);
} else {
blkg_get(bio->bi_bdev->bd_disk->queue->root_blkg);
bio->bi_blkg = bio->bi_bdev->bd_disk->queue->root_blkg;
blkg_get(bdev_get_queue(bio->bi_bdev)->root_blkg);
bio->bi_blkg = bdev_get_queue(bio->bi_bdev)->root_blkg;
}
}
EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
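The repeated bdev->bd_disk->queue dereferences in this file are replaced with bdev_get_queue(), which can now be a plain field read because bdev_alloc() caches disk->queue in bd_queue (see the bdev.c hunk earlier in this commit). A minimal sketch of what such an accessor looks like (the real helper lives in the block headers; this copy is only for illustration):

#include <linux/blk_types.h>

/* Assumed shape of the accessor: a single pointer load instead of
 * chasing bdev->bd_disk->queue at every call site. */
static inline struct request_queue *example_bdev_get_queue(struct block_device *bdev)
{
	return bdev->bd_queue;
}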

File diff suppressed because it is too large

block/blk-crypto-fallback.c

@ -12,12 +12,13 @@
#include <crypto/skcipher.h>
#include <linux/blk-cgroup.h>
#include <linux/blk-crypto.h>
#include <linux/blk-crypto-profile.h>
#include <linux/blkdev.h>
#include <linux/crypto.h>
#include <linux/keyslot-manager.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/scatterlist.h>
#include "blk-crypto-internal.h"
@ -72,12 +73,12 @@ static mempool_t *bio_fallback_crypt_ctx_pool;
static DEFINE_MUTEX(tfms_init_lock);
static bool tfms_inited[BLK_ENCRYPTION_MODE_MAX];
static struct blk_crypto_keyslot {
static struct blk_crypto_fallback_keyslot {
enum blk_crypto_mode_num crypto_mode;
struct crypto_skcipher *tfms[BLK_ENCRYPTION_MODE_MAX];
} *blk_crypto_keyslots;
static struct blk_keyslot_manager blk_crypto_ksm;
static struct blk_crypto_profile blk_crypto_fallback_profile;
static struct workqueue_struct *blk_crypto_wq;
static mempool_t *blk_crypto_bounce_page_pool;
static struct bio_set crypto_bio_split;
@ -88,9 +89,9 @@ static struct bio_set crypto_bio_split;
*/
static u8 blank_key[BLK_CRYPTO_MAX_KEY_SIZE];
static void blk_crypto_evict_keyslot(unsigned int slot)
static void blk_crypto_fallback_evict_keyslot(unsigned int slot)
{
struct blk_crypto_keyslot *slotp = &blk_crypto_keyslots[slot];
struct blk_crypto_fallback_keyslot *slotp = &blk_crypto_keyslots[slot];
enum blk_crypto_mode_num crypto_mode = slotp->crypto_mode;
int err;
@ -103,45 +104,41 @@ static void blk_crypto_evict_keyslot(unsigned int slot)
slotp->crypto_mode = BLK_ENCRYPTION_MODE_INVALID;
}
static int blk_crypto_keyslot_program(struct blk_keyslot_manager *ksm,
const struct blk_crypto_key *key,
unsigned int slot)
static int
blk_crypto_fallback_keyslot_program(struct blk_crypto_profile *profile,
const struct blk_crypto_key *key,
unsigned int slot)
{
struct blk_crypto_keyslot *slotp = &blk_crypto_keyslots[slot];
struct blk_crypto_fallback_keyslot *slotp = &blk_crypto_keyslots[slot];
const enum blk_crypto_mode_num crypto_mode =
key->crypto_cfg.crypto_mode;
int err;
if (crypto_mode != slotp->crypto_mode &&
slotp->crypto_mode != BLK_ENCRYPTION_MODE_INVALID)
blk_crypto_evict_keyslot(slot);
blk_crypto_fallback_evict_keyslot(slot);
slotp->crypto_mode = crypto_mode;
err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], key->raw,
key->size);
if (err) {
blk_crypto_evict_keyslot(slot);
blk_crypto_fallback_evict_keyslot(slot);
return err;
}
return 0;
}
static int blk_crypto_keyslot_evict(struct blk_keyslot_manager *ksm,
const struct blk_crypto_key *key,
unsigned int slot)
static int blk_crypto_fallback_keyslot_evict(struct blk_crypto_profile *profile,
const struct blk_crypto_key *key,
unsigned int slot)
{
blk_crypto_evict_keyslot(slot);
blk_crypto_fallback_evict_keyslot(slot);
return 0;
}
/*
* The crypto API fallback KSM ops - only used for a bio when it specifies a
* blk_crypto_key that was not supported by the device's inline encryption
* hardware.
*/
static const struct blk_ksm_ll_ops blk_crypto_ksm_ll_ops = {
.keyslot_program = blk_crypto_keyslot_program,
.keyslot_evict = blk_crypto_keyslot_evict,
static const struct blk_crypto_ll_ops blk_crypto_fallback_ll_ops = {
.keyslot_program = blk_crypto_fallback_keyslot_program,
.keyslot_evict = blk_crypto_fallback_keyslot_evict,
};
static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio)
@ -159,7 +156,7 @@ static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio)
bio_endio(src_bio);
}
static struct bio *blk_crypto_clone_bio(struct bio *bio_src)
static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src)
{
struct bvec_iter iter;
struct bio_vec bv;
@ -186,13 +183,14 @@ static struct bio *blk_crypto_clone_bio(struct bio *bio_src)
return bio;
}
static bool blk_crypto_alloc_cipher_req(struct blk_ksm_keyslot *slot,
struct skcipher_request **ciph_req_ret,
struct crypto_wait *wait)
static bool
blk_crypto_fallback_alloc_cipher_req(struct blk_crypto_keyslot *slot,
struct skcipher_request **ciph_req_ret,
struct crypto_wait *wait)
{
struct skcipher_request *ciph_req;
const struct blk_crypto_keyslot *slotp;
int keyslot_idx = blk_ksm_get_slot_idx(slot);
const struct blk_crypto_fallback_keyslot *slotp;
int keyslot_idx = blk_crypto_keyslot_index(slot);
slotp = &blk_crypto_keyslots[keyslot_idx];
ciph_req = skcipher_request_alloc(slotp->tfms[slotp->crypto_mode],
@ -209,7 +207,7 @@ static bool blk_crypto_alloc_cipher_req(struct blk_ksm_keyslot *slot,
return true;
}
static bool blk_crypto_split_bio_if_needed(struct bio **bio_ptr)
static bool blk_crypto_fallback_split_bio_if_needed(struct bio **bio_ptr)
{
struct bio *bio = *bio_ptr;
unsigned int i = 0;
@ -264,7 +262,7 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr)
{
struct bio *src_bio, *enc_bio;
struct bio_crypt_ctx *bc;
struct blk_ksm_keyslot *slot;
struct blk_crypto_keyslot *slot;
int data_unit_size;
struct skcipher_request *ciph_req = NULL;
DECLARE_CRYPTO_WAIT(wait);
@ -276,7 +274,7 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr)
blk_status_t blk_st;
/* Split the bio if it's too big for single page bvec */
if (!blk_crypto_split_bio_if_needed(bio_ptr))
if (!blk_crypto_fallback_split_bio_if_needed(bio_ptr))
return false;
src_bio = *bio_ptr;
@ -284,24 +282,25 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr)
data_unit_size = bc->bc_key->crypto_cfg.data_unit_size;
/* Allocate bounce bio for encryption */
enc_bio = blk_crypto_clone_bio(src_bio);
enc_bio = blk_crypto_fallback_clone_bio(src_bio);
if (!enc_bio) {
src_bio->bi_status = BLK_STS_RESOURCE;
return false;
}
/*
* Use the crypto API fallback keyslot manager to get a crypto_skcipher
* for the algorithm and key specified for this bio.
* Get a blk-crypto-fallback keyslot that contains a crypto_skcipher for
* this bio's algorithm and key.
*/
blk_st = blk_ksm_get_slot_for_key(&blk_crypto_ksm, bc->bc_key, &slot);
blk_st = blk_crypto_get_keyslot(&blk_crypto_fallback_profile,
bc->bc_key, &slot);
if (blk_st != BLK_STS_OK) {
src_bio->bi_status = blk_st;
goto out_put_enc_bio;
}
/* and then allocate an skcipher_request for it */
if (!blk_crypto_alloc_cipher_req(slot, &ciph_req, &wait)) {
if (!blk_crypto_fallback_alloc_cipher_req(slot, &ciph_req, &wait)) {
src_bio->bi_status = BLK_STS_RESOURCE;
goto out_release_keyslot;
}
@ -362,7 +361,7 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr)
out_free_ciph_req:
skcipher_request_free(ciph_req);
out_release_keyslot:
blk_ksm_put_slot(slot);
blk_crypto_put_keyslot(slot);
out_put_enc_bio:
if (enc_bio)
bio_put(enc_bio);
@ -380,7 +379,7 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work)
container_of(work, struct bio_fallback_crypt_ctx, work);
struct bio *bio = f_ctx->bio;
struct bio_crypt_ctx *bc = &f_ctx->crypt_ctx;
struct blk_ksm_keyslot *slot;
struct blk_crypto_keyslot *slot;
struct skcipher_request *ciph_req = NULL;
DECLARE_CRYPTO_WAIT(wait);
u64 curr_dun[BLK_CRYPTO_DUN_ARRAY_SIZE];
@ -393,17 +392,18 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work)
blk_status_t blk_st;
/*
* Use the crypto API fallback keyslot manager to get a crypto_skcipher
* for the algorithm and key specified for this bio.
* Get a blk-crypto-fallback keyslot that contains a crypto_skcipher for
* this bio's algorithm and key.
*/
blk_st = blk_ksm_get_slot_for_key(&blk_crypto_ksm, bc->bc_key, &slot);
blk_st = blk_crypto_get_keyslot(&blk_crypto_fallback_profile,
bc->bc_key, &slot);
if (blk_st != BLK_STS_OK) {
bio->bi_status = blk_st;
goto out_no_keyslot;
}
/* and then allocate an skcipher_request for it */
if (!blk_crypto_alloc_cipher_req(slot, &ciph_req, &wait)) {
if (!blk_crypto_fallback_alloc_cipher_req(slot, &ciph_req, &wait)) {
bio->bi_status = BLK_STS_RESOURCE;
goto out;
}
@ -434,7 +434,7 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work)
out:
skcipher_request_free(ciph_req);
blk_ksm_put_slot(slot);
blk_crypto_put_keyslot(slot);
out_no_keyslot:
mempool_free(f_ctx, bio_fallback_crypt_ctx_pool);
bio_endio(bio);
@ -473,9 +473,9 @@ static void blk_crypto_fallback_decrypt_endio(struct bio *bio)
* @bio_ptr: pointer to the bio to prepare
*
* If bio is doing a WRITE operation, this splits the bio into two parts if it's
* too big (see blk_crypto_split_bio_if_needed). It then allocates a bounce bio
* for the first part, encrypts it, and update bio_ptr to point to the bounce
* bio.
* too big (see blk_crypto_fallback_split_bio_if_needed()). It then allocates a
* bounce bio for the first part, encrypts it, and updates bio_ptr to point to
* the bounce bio.
*
* For a READ operation, we mark the bio for decryption by using bi_private and
* bi_end_io.
@ -499,8 +499,8 @@ bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr)
return false;
}
if (!blk_ksm_crypto_cfg_supported(&blk_crypto_ksm,
&bc->bc_key->crypto_cfg)) {
if (!__blk_crypto_cfg_supported(&blk_crypto_fallback_profile,
&bc->bc_key->crypto_cfg)) {
bio->bi_status = BLK_STS_NOTSUPP;
return false;
}
@ -526,7 +526,7 @@ bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr)
int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key)
{
return blk_ksm_evict_key(&blk_crypto_ksm, key);
return __blk_crypto_evict_key(&blk_crypto_fallback_profile, key);
}
static bool blk_crypto_fallback_inited;
@ -534,6 +534,7 @@ static int blk_crypto_fallback_init(void)
{
int i;
int err;
struct blk_crypto_profile *profile = &blk_crypto_fallback_profile;
if (blk_crypto_fallback_inited)
return 0;
@ -544,24 +545,24 @@ static int blk_crypto_fallback_init(void)
if (err)
goto out;
err = blk_ksm_init(&blk_crypto_ksm, blk_crypto_num_keyslots);
err = blk_crypto_profile_init(profile, blk_crypto_num_keyslots);
if (err)
goto fail_free_bioset;
err = -ENOMEM;
blk_crypto_ksm.ksm_ll_ops = blk_crypto_ksm_ll_ops;
blk_crypto_ksm.max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE;
profile->ll_ops = blk_crypto_fallback_ll_ops;
profile->max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE;
/* All blk-crypto modes have a crypto API fallback. */
for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++)
blk_crypto_ksm.crypto_modes_supported[i] = 0xFFFFFFFF;
blk_crypto_ksm.crypto_modes_supported[BLK_ENCRYPTION_MODE_INVALID] = 0;
profile->modes_supported[i] = 0xFFFFFFFF;
profile->modes_supported[BLK_ENCRYPTION_MODE_INVALID] = 0;
blk_crypto_wq = alloc_workqueue("blk_crypto_wq",
WQ_UNBOUND | WQ_HIGHPRI |
WQ_MEM_RECLAIM, num_online_cpus());
if (!blk_crypto_wq)
goto fail_free_ksm;
goto fail_destroy_profile;
blk_crypto_keyslots = kcalloc(blk_crypto_num_keyslots,
sizeof(blk_crypto_keyslots[0]),
@ -595,8 +596,8 @@ static int blk_crypto_fallback_init(void)
kfree(blk_crypto_keyslots);
fail_free_wq:
destroy_workqueue(blk_crypto_wq);
fail_free_ksm:
blk_ksm_destroy(&blk_crypto_ksm);
fail_destroy_profile:
blk_crypto_profile_destroy(profile);
fail_free_bioset:
bioset_exit(&crypto_bio_split);
out:
@ -610,7 +611,7 @@ static int blk_crypto_fallback_init(void)
int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num)
{
const char *cipher_str = blk_crypto_modes[mode_num].cipher_str;
struct blk_crypto_keyslot *slotp;
struct blk_crypto_fallback_keyslot *slotp;
unsigned int i;
int err = 0;

block/blk-crypto.c

@ -11,7 +11,7 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/keyslot-manager.h>
#include <linux/blk-crypto-profile.h>
#include <linux/module.h>
#include <linux/slab.h>
@ -218,8 +218,9 @@ static bool bio_crypt_check_alignment(struct bio *bio)
blk_status_t __blk_crypto_init_request(struct request *rq)
{
return blk_ksm_get_slot_for_key(rq->q->ksm, rq->crypt_ctx->bc_key,
&rq->crypt_keyslot);
return blk_crypto_get_keyslot(rq->q->crypto_profile,
rq->crypt_ctx->bc_key,
&rq->crypt_keyslot);
}
/**
@ -233,7 +234,7 @@ blk_status_t __blk_crypto_init_request(struct request *rq)
*/
void __blk_crypto_free_request(struct request *rq)
{
blk_ksm_put_slot(rq->crypt_keyslot);
blk_crypto_put_keyslot(rq->crypt_keyslot);
mempool_free(rq->crypt_ctx, bio_crypt_ctx_pool);
blk_crypto_rq_set_defaults(rq);
}
@ -264,6 +265,7 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr)
{
struct bio *bio = *bio_ptr;
const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key;
struct blk_crypto_profile *profile;
/* Error if bio has no data. */
if (WARN_ON_ONCE(!bio_has_data(bio))) {
@ -280,8 +282,8 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr)
* Success if device supports the encryption context, or if we succeeded
* in falling back to the crypto API.
*/
if (blk_ksm_crypto_cfg_supported(bio->bi_bdev->bd_disk->queue->ksm,
&bc_key->crypto_cfg))
profile = bdev_get_queue(bio->bi_bdev)->crypto_profile;
if (__blk_crypto_cfg_supported(profile, &bc_key->crypto_cfg))
return true;
if (blk_crypto_fallback_bio_prep(bio_ptr))
@ -357,7 +359,7 @@ bool blk_crypto_config_supported(struct request_queue *q,
const struct blk_crypto_config *cfg)
{
return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) ||
blk_ksm_crypto_cfg_supported(q->ksm, cfg);
__blk_crypto_cfg_supported(q->crypto_profile, cfg);
}
/**
@ -378,7 +380,7 @@ bool blk_crypto_config_supported(struct request_queue *q,
int blk_crypto_start_using_key(const struct blk_crypto_key *key,
struct request_queue *q)
{
if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg))
if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg))
return 0;
return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode);
}
@ -394,18 +396,17 @@ int blk_crypto_start_using_key(const struct blk_crypto_key *key,
* evicted from any hardware that it might have been programmed into. The key
* must not be in use by any in-flight IO when this function is called.
*
* Return: 0 on success or if key is not present in the q's ksm, -err on error.
* Return: 0 on success or if the key wasn't in any keyslot; -errno on error.
*/
int blk_crypto_evict_key(struct request_queue *q,
const struct blk_crypto_key *key)
{
if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg))
return blk_ksm_evict_key(q->ksm, key);
if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg))
return __blk_crypto_evict_key(q->crypto_profile, key);
/*
* If the request queue's associated inline encryption hardware didn't
* have support for the key, then the key might have been programmed
* into the fallback keyslot manager, so try to evict from there.
* If the request_queue didn't support the key, then blk-crypto-fallback
* may have been used, so try to evict the key from blk-crypto-fallback.
*/
return blk_crypto_fallback_evict_key(key);
}
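
For context, a rough sketch of how an upper layer might drive this eviction path when it stops using a key. The caller below is hypothetical; only blk_crypto_evict_key(), blk_crypto_start_using_key() and bdev_get_queue() are the real interfaces touched by this change.

#include <linux/blk-crypto.h>
#include <linux/blkdev.h>
#include <linux/string.h>

/* Hypothetical caller: stop using an inline encryption key and make sure it
 * is gone from both the hardware keyslots and blk-crypto-fallback. */
static void example_forget_key(struct block_device *bdev,
			       struct blk_crypto_key *key)
{
	struct request_queue *q = bdev_get_queue(bdev);

	/* Per the comment above, this also succeeds if the key was never
	 * programmed into any keyslot. */
	if (blk_crypto_evict_key(q, key))
		pr_warn("example: inline encryption key eviction failed\n");

	/* Clear the raw key material once no in-flight I/O references it. */
	memzero_explicit(key, sizeof(*key));
}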

View File

@ -69,6 +69,7 @@
#include <linux/blkdev.h>
#include <linux/gfp.h>
#include <linux/blk-mq.h>
#include <linux/part_stat.h>
#include "blk.h"
#include "blk-mq.h"
@ -95,6 +96,12 @@ enum {
static void blk_kick_flush(struct request_queue *q,
struct blk_flush_queue *fq, unsigned int flags);
static inline struct blk_flush_queue *
blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx)
{
return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq;
}
static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq)
{
unsigned int policy = 0;
@ -138,7 +145,7 @@ static void blk_flush_queue_rq(struct request *rq, bool add_front)
static void blk_account_io_flush(struct request *rq)
{
struct block_device *part = rq->rq_disk->part0;
struct block_device *part = rq->q->disk->part0;
part_stat_lock();
part_stat_inc(part, ios[STAT_FLUSH]);
@ -222,7 +229,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
/* release the tag's ownership to the req cloned from */
spin_lock_irqsave(&fq->mq_flush_lock, flags);
if (!refcount_dec_and_test(&flush_rq->ref)) {
if (!req_ref_put_and_test(flush_rq)) {
fq->rq_status = error;
spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
return;
@ -334,7 +341,6 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK);
flush_rq->rq_flags |= RQF_FLUSH_SEQ;
flush_rq->rq_disk = first_rq->rq_disk;
flush_rq->end_io = flush_end_io;
/*
* Order WRITE ->end_io and WRITE rq->ref, and its pair is the one
@ -343,7 +349,7 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
* and READ flush_rq->end_io
*/
smp_wmb();
refcount_set(&flush_rq->ref, 1);
req_ref_set(flush_rq, 1);
blk_flush_queue_rq(flush_rq, false);
}
@ -423,7 +429,7 @@ void blk_insert_flush(struct request *rq)
*/
if ((policy & REQ_FSEQ_DATA) &&
!(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
blk_mq_request_bypass_insert(rq, false, false);
blk_mq_request_bypass_insert(rq, false, true);
return;
}
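
As a point of reference for the policy check above: a filesystem that needs "flush the cache, then write this block durably" tags its write with REQ_PREFLUSH and REQ_FUA, and blk_flush_policy() turns that into whichever PREFLUSH/DATA/POSTFLUSH steps the device actually requires. A minimal, hypothetical illustration follows; bio_alloc() is shown with the two-argument form used by this kernel series.

#include <linux/bio.h>
#include <linux/blkdev.h>

/* Hypothetical journal-commit write that relies on the flush machinery. */
static void example_submit_commit_block(struct block_device *bdev,
					struct page *page, sector_t sector)
{
	struct bio *bio = bio_alloc(GFP_NOIO, 1);

	bio_set_dev(bio, bdev);
	bio->bi_iter.bi_sector = sector;
	bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH | REQ_FUA;
	bio_add_page(bio, page, PAGE_SIZE, 0);
	submit_bio(bio);		/* completion handling omitted */
}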

View File

@ -6,7 +6,7 @@
* Written by: Martin K. Petersen <martin.petersen@oracle.com>
*/
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/backing-dev.h>
#include <linux/mempool.h>
#include <linux/bio.h>
@ -409,9 +409,9 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue);
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
if (disk->queue->ksm) {
if (disk->queue->crypto_profile) {
pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n");
blk_ksm_unregister(disk->queue);
disk->queue->crypto_profile = NULL;
}
#endif
}

View File

@ -8,22 +8,25 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/security.h>
#include <linux/sched/task.h>
#include "blk.h"
#include "blk-mq-sched.h"
/*
* For io context allocations
*/
static struct kmem_cache *iocontext_cachep;
#ifdef CONFIG_BLK_ICQ
/**
* get_io_context - increment reference count to io_context
* @ioc: io_context to get
*
* Increment reference count to @ioc.
*/
void get_io_context(struct io_context *ioc)
static void get_io_context(struct io_context *ioc)
{
BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
atomic_long_inc(&ioc->refcount);
@ -53,6 +56,16 @@ static void ioc_exit_icq(struct io_cq *icq)
icq->flags |= ICQ_EXITED;
}
static void ioc_exit_icqs(struct io_context *ioc)
{
struct io_cq *icq;
spin_lock_irq(&ioc->lock);
hlist_for_each_entry(icq, &ioc->icq_list, ioc_node)
ioc_exit_icq(icq);
spin_unlock_irq(&ioc->lock);
}
/*
* Release an icq. Called with ioc locked for blk-mq, and with both ioc
* and queue locked for legacy.
@ -132,102 +145,22 @@ static void ioc_release_fn(struct work_struct *work)
kmem_cache_free(iocontext_cachep, ioc);
}
/**
* put_io_context - put a reference of io_context
* @ioc: io_context to put
*
* Decrement reference count of @ioc and release it if the count reaches
* zero.
/*
* Releasing icqs requires reverse order double locking and we may already be
* holding a queue_lock. Do it asynchronously from a workqueue.
*/
void put_io_context(struct io_context *ioc)
{
unsigned long flags;
bool free_ioc = false;
if (ioc == NULL)
return;
BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
/*
* Releasing ioc requires reverse order double locking and we may
* already be holding a queue_lock. Do it asynchronously from wq.
*/
if (atomic_long_dec_and_test(&ioc->refcount)) {
spin_lock_irqsave(&ioc->lock, flags);
if (!hlist_empty(&ioc->icq_list))
queue_work(system_power_efficient_wq,
&ioc->release_work);
else
free_ioc = true;
spin_unlock_irqrestore(&ioc->lock, flags);
}
if (free_ioc)
kmem_cache_free(iocontext_cachep, ioc);
}
/**
* put_io_context_active - put active reference on ioc
* @ioc: ioc of interest
*
* Undo get_io_context_active(). If active reference reaches zero after
* put, @ioc can never issue further IOs and ioscheds are notified.
*/
void put_io_context_active(struct io_context *ioc)
{
struct io_cq *icq;
if (!atomic_dec_and_test(&ioc->active_ref)) {
put_io_context(ioc);
return;
}
spin_lock_irq(&ioc->lock);
hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) {
if (icq->flags & ICQ_EXITED)
continue;
ioc_exit_icq(icq);
}
spin_unlock_irq(&ioc->lock);
put_io_context(ioc);
}
/* Called by the exiting task */
void exit_io_context(struct task_struct *task)
{
struct io_context *ioc;
task_lock(task);
ioc = task->io_context;
task->io_context = NULL;
task_unlock(task);
atomic_dec(&ioc->nr_tasks);
put_io_context_active(ioc);
}
static void __ioc_clear_queue(struct list_head *icq_list)
static bool ioc_delay_free(struct io_context *ioc)
{
unsigned long flags;
rcu_read_lock();
while (!list_empty(icq_list)) {
struct io_cq *icq = list_entry(icq_list->next,
struct io_cq, q_node);
struct io_context *ioc = icq->ioc;
spin_lock_irqsave(&ioc->lock, flags);
if (icq->flags & ICQ_DESTROYED) {
spin_unlock_irqrestore(&ioc->lock, flags);
continue;
}
ioc_destroy_icq(icq);
spin_lock_irqsave(&ioc->lock, flags);
if (!hlist_empty(&ioc->icq_list)) {
queue_work(system_power_efficient_wq, &ioc->release_work);
spin_unlock_irqrestore(&ioc->lock, flags);
return true;
}
rcu_read_unlock();
spin_unlock_irqrestore(&ioc->lock, flags);
return false;
}
/**
@ -244,93 +177,156 @@ void ioc_clear_queue(struct request_queue *q)
list_splice_init(&q->icq_list, &icq_list);
spin_unlock_irq(&q->queue_lock);
__ioc_clear_queue(&icq_list);
}
rcu_read_lock();
while (!list_empty(&icq_list)) {
struct io_cq *icq =
list_entry(icq_list.next, struct io_cq, q_node);
int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node)
spin_lock_irq(&icq->ioc->lock);
if (!(icq->flags & ICQ_DESTROYED))
ioc_destroy_icq(icq);
spin_unlock_irq(&icq->ioc->lock);
}
rcu_read_unlock();
}
#else /* CONFIG_BLK_ICQ */
static inline void ioc_exit_icqs(struct io_context *ioc)
{
}
static inline bool ioc_delay_free(struct io_context *ioc)
{
return false;
}
#endif /* CONFIG_BLK_ICQ */
/**
* put_io_context - put a reference of io_context
* @ioc: io_context to put
*
* Decrement reference count of @ioc and release it if the count reaches
* zero.
*/
void put_io_context(struct io_context *ioc)
{
BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
if (atomic_long_dec_and_test(&ioc->refcount) && !ioc_delay_free(ioc))
kmem_cache_free(iocontext_cachep, ioc);
}
EXPORT_SYMBOL_GPL(put_io_context);
/* Called by the exiting task */
void exit_io_context(struct task_struct *task)
{
struct io_context *ioc;
task_lock(task);
ioc = task->io_context;
task->io_context = NULL;
task_unlock(task);
if (atomic_dec_and_test(&ioc->active_ref)) {
ioc_exit_icqs(ioc);
put_io_context(ioc);
}
}
static struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
{
struct io_context *ioc;
int ret;
ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO,
node);
if (unlikely(!ioc))
return -ENOMEM;
return NULL;
/* initialize */
atomic_long_set(&ioc->refcount, 1);
atomic_set(&ioc->nr_tasks, 1);
atomic_set(&ioc->active_ref, 1);
#ifdef CONFIG_BLK_ICQ
spin_lock_init(&ioc->lock);
INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC);
INIT_HLIST_HEAD(&ioc->icq_list);
INIT_WORK(&ioc->release_work, ioc_release_fn);
#endif
return ioc;
}
int set_task_ioprio(struct task_struct *task, int ioprio)
{
int err;
const struct cred *cred = current_cred(), *tcred;
rcu_read_lock();
tcred = __task_cred(task);
if (!uid_eq(tcred->uid, cred->euid) &&
!uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) {
rcu_read_unlock();
return -EPERM;
}
rcu_read_unlock();
err = security_task_setioprio(task, ioprio);
if (err)
return err;
task_lock(task);
if (unlikely(!task->io_context)) {
struct io_context *ioc;
task_unlock(task);
ioc = alloc_io_context(GFP_ATOMIC, NUMA_NO_NODE);
if (!ioc)
return -ENOMEM;
task_lock(task);
if (task->flags & PF_EXITING) {
err = -ESRCH;
kmem_cache_free(iocontext_cachep, ioc);
goto out;
}
if (task->io_context)
kmem_cache_free(iocontext_cachep, ioc);
else
task->io_context = ioc;
}
task->io_context->ioprio = ioprio;
out:
task_unlock(task);
return err;
}
EXPORT_SYMBOL_GPL(set_task_ioprio);
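
set_task_ioprio() is what the ioprio_set(2) path ends up calling; from userspace the same encoding looks like the sketch below. The EX_* macros are spelled out by hand to mirror the kernel's ABI constants rather than taken from a uapi header, so treat them as assumptions of the example.

#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

/* Mirrors the kernel's ioprio encoding: class in the top 3 bits. */
#define EX_IOPRIO_CLASS_SHIFT	13
#define EX_IOPRIO_CLASS_BE	2
#define EX_IOPRIO_WHO_PROCESS	1
#define EX_IOPRIO_VALUE(class, data) (((class) << EX_IOPRIO_CLASS_SHIFT) | (data))

int main(void)
{
	/* Best-effort class, priority level 4, for the calling process. */
	if (syscall(SYS_ioprio_set, EX_IOPRIO_WHO_PROCESS, 0,
		    EX_IOPRIO_VALUE(EX_IOPRIO_CLASS_BE, 4)) < 0) {
		perror("ioprio_set");
		return 1;
	}
	return 0;
}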
int __copy_io(unsigned long clone_flags, struct task_struct *tsk)
{
struct io_context *ioc = current->io_context;
/*
* Try to install. ioc shouldn't be installed if someone else
* already did or @task, which isn't %current, is exiting. Note
* that we need to allow ioc creation on exiting %current as exit
* path may issue IOs from e.g. exit_files(). The exit path is
* responsible for not issuing IO after exit_io_context().
* Share io context with parent, if CLONE_IO is set
*/
task_lock(task);
if (!task->io_context &&
(task == current || !(task->flags & PF_EXITING)))
task->io_context = ioc;
else
kmem_cache_free(iocontext_cachep, ioc);
if (clone_flags & CLONE_IO) {
atomic_inc(&ioc->active_ref);
tsk->io_context = ioc;
} else if (ioprio_valid(ioc->ioprio)) {
tsk->io_context = alloc_io_context(GFP_KERNEL, NUMA_NO_NODE);
if (!tsk->io_context)
return -ENOMEM;
tsk->io_context->ioprio = ioc->ioprio;
}
ret = task->io_context ? 0 : -EBUSY;
task_unlock(task);
return ret;
}
/**
* get_task_io_context - get io_context of a task
* @task: task of interest
* @gfp_flags: allocation flags, used if allocation is necessary
* @node: allocation node, used if allocation is necessary
*
* Return io_context of @task. If it doesn't exist, it is created with
* @gfp_flags and @node. The returned io_context has its reference count
* incremented.
*
* This function always goes through task_lock() and it's better to use
* %current->io_context + get_io_context() for %current.
*/
struct io_context *get_task_io_context(struct task_struct *task,
gfp_t gfp_flags, int node)
{
struct io_context *ioc;
might_sleep_if(gfpflags_allow_blocking(gfp_flags));
do {
task_lock(task);
ioc = task->io_context;
if (likely(ioc)) {
get_io_context(ioc);
task_unlock(task);
return ioc;
}
task_unlock(task);
} while (!create_task_io_context(task, gfp_flags, node));
return NULL;
return 0;
}
EXPORT_SYMBOL_GPL(put_io_context);
#ifdef CONFIG_BLK_ICQ
/**
* ioc_lookup_icq - lookup io_cq from ioc
* @ioc: the associated io_context
* @q: the associated request_queue
*
* Look up io_cq associated with @ioc - @q pair from @ioc. Must be called
* with @q->queue_lock held.
*/
struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q)
struct io_cq *ioc_lookup_icq(struct request_queue *q)
{
struct io_context *ioc = current->io_context;
struct io_cq *icq;
lockdep_assert_held(&q->queue_lock);
@ -359,9 +355,7 @@ EXPORT_SYMBOL(ioc_lookup_icq);
/**
* ioc_create_icq - create and link io_cq
* @ioc: io_context of interest
* @q: request_queue of interest
* @gfp_mask: allocation mask
*
* Make sure io_cq linking @ioc and @q exists. If icq doesn't exist, they
* will be created using @gfp_mask.
@ -369,19 +363,19 @@ EXPORT_SYMBOL(ioc_lookup_icq);
* The caller is responsible for ensuring @ioc won't go away and @q is
* alive and will stay alive until this function returns.
*/
struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
gfp_t gfp_mask)
static struct io_cq *ioc_create_icq(struct request_queue *q)
{
struct io_context *ioc = current->io_context;
struct elevator_type *et = q->elevator->type;
struct io_cq *icq;
/* allocate stuff */
icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO,
icq = kmem_cache_alloc_node(et->icq_cache, GFP_ATOMIC | __GFP_ZERO,
q->node);
if (!icq)
return NULL;
if (radix_tree_maybe_preload(gfp_mask) < 0) {
if (radix_tree_maybe_preload(GFP_ATOMIC) < 0) {
kmem_cache_free(et->icq_cache, icq);
return NULL;
}
@ -402,7 +396,7 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
et->ops.init_icq(icq);
} else {
kmem_cache_free(et->icq_cache, icq);
icq = ioc_lookup_icq(ioc, q);
icq = ioc_lookup_icq(q);
if (!icq)
printk(KERN_ERR "cfq: icq link failed!\n");
}
@ -413,6 +407,46 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
return icq;
}
struct io_cq *ioc_find_get_icq(struct request_queue *q)
{
struct io_context *ioc = current->io_context;
struct io_cq *icq = NULL;
if (unlikely(!ioc)) {
ioc = alloc_io_context(GFP_ATOMIC, q->node);
if (!ioc)
return NULL;
task_lock(current);
if (current->io_context) {
kmem_cache_free(iocontext_cachep, ioc);
ioc = current->io_context;
} else {
current->io_context = ioc;
}
get_io_context(ioc);
task_unlock(current);
} else {
get_io_context(ioc);
spin_lock_irq(&q->queue_lock);
icq = ioc_lookup_icq(q);
spin_unlock_irq(&q->queue_lock);
}
if (!icq) {
icq = ioc_create_icq(q);
if (!icq) {
put_io_context(ioc);
return NULL;
}
}
return icq;
}
EXPORT_SYMBOL_GPL(ioc_find_get_icq);
#endif /* CONFIG_BLK_ICQ */
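
A sketch of how an icq-based I/O scheduler is expected to consume this helper from its prepare_request hook; the hook below is hypothetical, but ioc_find_get_icq() and the rq->elv.icq field are the real pieces, and this mirrors the pattern the BFQ conversion in this series relies on. The sketch assumes it lives inside block/, where "blk.h" declares the helper.

#include "blk.h"

/* Hypothetical elevator hook: attach the current task's io_cq to the
 * request, creating the io_context/io_cq on first use. */
static void example_prepare_request(struct request *rq)
{
	struct io_cq *icq;

	if (!rq->bio)
		return;

	/* Takes a reference on current->io_context on success. */
	icq = ioc_find_get_icq(rq->q);
	if (!icq)
		return;

	rq->elv.icq = icq;
	/* Scheduler-private, per-icq state would be initialised here. */
}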
static int __init blk_ioc_init(void)
{
iocontext_cachep = kmem_cache_create("blkdev_ioc",

View File

@ -74,6 +74,7 @@
#include <linux/sched/signal.h>
#include <trace/events/block.h>
#include <linux/blk-mq.h>
#include <linux/blk-cgroup.h>
#include "blk-rq-qos.h"
#include "blk-stat.h"
#include "blk.h"

View File

@ -62,6 +62,7 @@ struct ioprio_blkg {
struct ioprio_blkcg {
struct blkcg_policy_data cpd;
enum prio_policy prio_policy;
bool prio_set;
};
static inline struct ioprio_blkg *pd_to_ioprio(struct blkg_policy_data *pd)
@ -112,7 +113,7 @@ static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf,
if (ret < 0)
return ret;
blkcg->prio_policy = ret;
blkcg->prio_set = true;
return nbytes;
}
@ -190,6 +191,10 @@ static void blkcg_ioprio_track(struct rq_qos *rqos, struct request *rq,
struct bio *bio)
{
struct ioprio_blkcg *blkcg = ioprio_blkcg_from_bio(bio);
u16 prio;
if (!blkcg->prio_set)
return;
/*
* Except for IOPRIO_CLASS_NONE, higher I/O priority numbers
@ -199,8 +204,10 @@ static void blkcg_ioprio_track(struct rq_qos *rqos, struct request *rq,
* bio I/O priority is not modified. If the bio I/O priority equals
* IOPRIO_CLASS_NONE, the cgroup I/O priority is assigned to the bio.
*/
bio->bi_ioprio = max_t(u16, bio->bi_ioprio,
IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0));
prio = max_t(u16, bio->bi_ioprio,
IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0));
if (prio > bio->bi_ioprio)
bio->bi_ioprio = prio;
}
static void blkcg_ioprio_exit(struct rq_qos *rqos)

View File

@ -6,12 +6,47 @@
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/scatterlist.h>
#include <linux/part_stat.h>
#include <trace/events/block.h>
#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"
#include "blk-throttle.h"
static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv)
{
*bv = mp_bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
}
static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv)
{
struct bvec_iter iter = bio->bi_iter;
int idx;
bio_get_first_bvec(bio, bv);
if (bv->bv_len == bio->bi_iter.bi_size)
return; /* this bio only has a single bvec */
bio_advance_iter(bio, &iter, iter.bi_size);
if (!iter.bi_bvec_done)
idx = iter.bi_idx - 1;
else /* in the middle of bvec */
idx = iter.bi_idx;
*bv = bio->bi_io_vec[idx];
/*
* iter.bi_bvec_done records actual length of the last bvec
* if this bio ends in the middle of one io vector
*/
if (iter.bi_bvec_done)
bv->bv_len = iter.bi_bvec_done;
}
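
For readers unfamiliar with bvec iterators: bi_bvec_done is the number of bytes already consumed from the current bvec, which is exactly what bio_get_last_bvec() uses above to trim the final vector. A throwaway sketch of the normal iteration API, purely for illustration:

#include <linux/bio.h>

/* Illustrative only: walk a bio one multipage bvec at a time. */
static void example_dump_bvecs(struct bio *bio)
{
	struct bio_vec bv;
	struct bvec_iter iter;

	bio_for_each_bvec(bv, bio, iter)
		pr_info("bvec: offset %u, len %u\n", bv.bv_offset, bv.bv_len);
}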
static inline bool bio_will_gap(struct request_queue *q,
struct request *prev_rq, struct bio *prev, struct bio *next)
@ -285,13 +320,13 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
* iopoll in direct IO routine. Given performance gain of iopoll for
* big IO can be trivial, disable iopoll when split needed.
*/
bio_clear_hipri(bio);
bio_clear_polled(bio);
return bio_split(bio, sectors, GFP_NOIO, bs);
}
/**
* __blk_queue_split - split a bio and submit the second half
* @q: [in] request_queue new bio is being queued at
* @bio: [in, out] bio to be split
* @nr_segs: [out] number of segments in the first bio
*
@ -302,9 +337,9 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
* of the caller to ensure that q->bio_split is only released after processing
* of the split bio has finished.
*/
void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
void __blk_queue_split(struct request_queue *q, struct bio **bio,
unsigned int *nr_segs)
{
struct request_queue *q = (*bio)->bi_bdev->bd_disk->queue;
struct bio *split = NULL;
switch (bio_op(*bio)) {
@ -321,21 +356,6 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
nr_segs);
break;
default:
/*
* All drivers must accept single-segment bios that are <=
* PAGE_SIZE. This is a quick and dirty check that relies on
* the fact that bi_io_vec[0] is always valid if a bio has data.
* The check might lead to occasional false negatives when bios
* are cloned, but compared to the performance impact of cloned
* bios themselves the loop below doesn't matter anyway.
*/
if (!q->limits.chunk_sectors &&
(*bio)->bi_vcnt == 1 &&
((*bio)->bi_io_vec[0].bv_len +
(*bio)->bi_io_vec[0].bv_offset) <= PAGE_SIZE) {
*nr_segs = 1;
break;
}
split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs);
break;
}
@ -365,9 +385,11 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
*/
void blk_queue_split(struct bio **bio)
{
struct request_queue *q = bdev_get_queue((*bio)->bi_bdev);
unsigned int nr_segs;
__blk_queue_split(bio, &nr_segs);
if (blk_may_split(q, *bio))
__blk_queue_split(q, bio, &nr_segs);
}
EXPORT_SYMBOL(blk_queue_split);
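
After this change blk_queue_split() derives the queue from the bio itself, so a bio-based driver simply hands it the bio pointer at the top of its ->submit_bio handler. A hypothetical driver hook, assuming the void-returning ->submit_bio prototype of this kernel series:

#include <linux/bio.h>
#include <linux/blkdev.h>

/* Hypothetical bio-based driver entry point. */
static void example_submit_bio(struct bio *bio)
{
	/* May split 'bio' and submit the tail half; we keep the head. */
	blk_queue_split(&bio);

	/* ... translate and issue 'bio', which now fits the queue limits ... */
}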
@ -558,6 +580,23 @@ static inline unsigned int blk_rq_get_max_segments(struct request *rq)
return queue_max_segments(rq->q);
}
static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
sector_t offset)
{
struct request_queue *q = rq->q;
if (blk_rq_is_passthrough(rq))
return q->limits.max_hw_sectors;
if (!q->limits.chunk_sectors ||
req_op(rq) == REQ_OP_DISCARD ||
req_op(rq) == REQ_OP_SECURE_ERASE)
return blk_queue_get_max_sectors(q, req_op(rq));
return min(blk_max_size_offset(q, offset, 0),
blk_queue_get_max_sectors(q, req_op(rq)));
}
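
The chunk_sectors clamp above keeps a request from crossing a chunk boundary (a zone or stripe, for example). A worked example, under the assumption that chunk_sectors is a power of two, which is what the mask form of the calculation relies on:

#include <linux/types.h>

/* With chunk_sectors = 256 and a request starting at sector 1000:
 * 1000 & 255 = 232 sectors already used in this chunk, so at most
 * 256 - 232 = 24 sectors fit before the boundary at sector 1024. */
static inline unsigned int example_sectors_to_chunk_boundary(unsigned int chunk_sectors,
							     sector_t offset)
{
	return chunk_sectors - (offset & (chunk_sectors - 1));
}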
static inline int ll_new_hw_segment(struct request *req, struct bio *bio,
unsigned int nr_phys_segs)
{
@ -718,6 +757,13 @@ static enum elv_merge blk_try_req_merge(struct request *req,
return ELEVATOR_NO_MERGE;
}
static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b)
{
if (bio_page(a) == bio_page(b) && bio_offset(a) == bio_offset(b))
return true;
return false;
}
/*
* For non-mq, this has to be called with the request spinlock acquired.
* For mq with scheduling, the appropriate queue wide lock should be held.
@ -731,8 +777,7 @@ static struct request *attempt_merge(struct request_queue *q,
if (req_op(req) != req_op(next))
return NULL;
if (rq_data_dir(req) != rq_data_dir(next)
|| req->rq_disk != next->rq_disk)
if (rq_data_dir(req) != rq_data_dir(next))
return NULL;
if (req_op(req) == REQ_OP_WRITE_SAME &&
@ -859,10 +904,6 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
if (bio_data_dir(bio) != rq_data_dir(rq))
return false;
/* must be same device */
if (rq->rq_disk != bio->bi_bdev->bd_disk)
return false;
/* only merge integrity protected bio into ditto rq */
if (blk_integrity_merge_bio(rq->q, rq, bio) == false)
return false;
@ -1023,12 +1064,10 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
* @q: request_queue new bio is being queued at
* @bio: new bio being queued
* @nr_segs: number of segments in @bio
* @same_queue_rq: pointer to &struct request that gets filled in when
* another request associated with @q is found on the plug list
* (optional, may be %NULL)
* from the passed in @q already in the plug list
*
* Determine whether @bio being queued on @q can be merged with a request
* on %current's plugged list. Returns %true if merge was successful,
* Determine whether @bio being queued on @q can be merged with the previous
* request on %current's plugged list. Returns %true if merge was successful,
* otherwise %false.
*
* Plugging coalesces IOs from the same issuer for the same purpose without
@ -1041,36 +1080,22 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
* Caller must ensure !blk_queue_nomerges(q) beforehand.
*/
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs, struct request **same_queue_rq)
unsigned int nr_segs)
{
struct blk_plug *plug;
struct request *rq;
struct list_head *plug_list;
plug = blk_mq_plug(q, bio);
if (!plug)
if (!plug || rq_list_empty(plug->mq_list))
return false;
plug_list = &plug->mq_list;
list_for_each_entry_reverse(rq, plug_list, queuelist) {
if (rq->q == q && same_queue_rq) {
/*
* Only blk-mq multiple hardware queues case checks the
* rq in the same queue, there should be only one such
* rq in a queue
**/
*same_queue_rq = rq;
}
if (rq->q != q)
continue;
/* check the previously added entry for a quick merge attempt */
rq = rq_list_peek(&plug->mq_list);
if (rq->q == q) {
if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
BIO_MERGE_OK)
BIO_MERGE_OK)
return true;
}
return false;
}
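
The merge above only looks at the request most recently added to the caller's plug. For reference, plugging itself is driven by the submitter; a minimal, hypothetical sketch of the pattern that makes this fast path useful:

#include <linux/bio.h>
#include <linux/blkdev.h>

/* Hypothetical submitter batching several bios under one plug so that
 * consecutive bios can be merged before the queue sees them. */
static void example_submit_batch(struct bio **bios, int nr)
{
	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);
	for (i = 0; i < nr; i++)
		submit_bio(bios[i]);
	blk_finish_plug(&plug);	/* flushes the plugged requests */
}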

View File

@ -11,6 +11,7 @@
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
#include "blk-rq-qos.h"
@ -29,6 +30,9 @@ static int queue_poll_stat_show(void *data, struct seq_file *m)
struct request_queue *q = data;
int bucket;
if (!q->poll_stat)
return 0;
for (bucket = 0; bucket < (BLK_MQ_POLL_STATS_BKTS / 2); bucket++) {
seq_printf(m, "read (%d Bytes): ", 1 << (9 + bucket));
print_stat(m, &q->poll_stat[2 * bucket]);
@ -122,9 +126,7 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(FUA),
QUEUE_FLAG_NAME(DAX),
QUEUE_FLAG_NAME(STATS),
QUEUE_FLAG_NAME(POLL_STATS),
QUEUE_FLAG_NAME(REGISTERED),
QUEUE_FLAG_NAME(SCSI_PASSTHROUGH),
QUEUE_FLAG_NAME(QUIESCED),
QUEUE_FLAG_NAME(PCI_P2PDMA),
QUEUE_FLAG_NAME(ZONE_RESETALL),
@ -287,7 +289,7 @@ static const char *const cmd_flag_name[] = {
CMD_FLAG_NAME(BACKGROUND),
CMD_FLAG_NAME(NOWAIT),
CMD_FLAG_NAME(NOUNMAP),
CMD_FLAG_NAME(HIPRI),
CMD_FLAG_NAME(POLLED),
};
#undef CMD_FLAG_NAME
@ -309,6 +311,7 @@ static const char *const rqf_name[] = {
RQF_NAME(SPECIAL_PAYLOAD),
RQF_NAME(ZONE_WRITE_LOCKED),
RQF_NAME(MQ_POLL_SLEPT),
RQF_NAME(ELV),
};
#undef RQF_NAME
@ -453,11 +456,11 @@ static void blk_mq_debugfs_tags_show(struct seq_file *m,
atomic_read(&tags->active_queues));
seq_puts(m, "\nbitmap_tags:\n");
sbitmap_queue_show(tags->bitmap_tags, m);
sbitmap_queue_show(&tags->bitmap_tags, m);
if (tags->nr_reserved_tags) {
seq_puts(m, "\nbreserved_tags:\n");
sbitmap_queue_show(tags->breserved_tags, m);
sbitmap_queue_show(&tags->breserved_tags, m);
}
}
@ -488,7 +491,7 @@ static int hctx_tags_bitmap_show(void *data, struct seq_file *m)
if (res)
goto out;
if (hctx->tags)
sbitmap_bitmap_show(&hctx->tags->bitmap_tags->sb, m);
sbitmap_bitmap_show(&hctx->tags->bitmap_tags.sb, m);
mutex_unlock(&q->sysfs_lock);
out:
@ -522,77 +525,13 @@ static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m)
if (res)
goto out;
if (hctx->sched_tags)
sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags->sb, m);
sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags.sb, m);
mutex_unlock(&q->sysfs_lock);
out:
return res;
}
static int hctx_io_poll_show(void *data, struct seq_file *m)
{
struct blk_mq_hw_ctx *hctx = data;
seq_printf(m, "considered=%lu\n", hctx->poll_considered);
seq_printf(m, "invoked=%lu\n", hctx->poll_invoked);
seq_printf(m, "success=%lu\n", hctx->poll_success);
return 0;
}
static ssize_t hctx_io_poll_write(void *data, const char __user *buf,
size_t count, loff_t *ppos)
{
struct blk_mq_hw_ctx *hctx = data;
hctx->poll_considered = hctx->poll_invoked = hctx->poll_success = 0;
return count;
}
static int hctx_dispatched_show(void *data, struct seq_file *m)
{
struct blk_mq_hw_ctx *hctx = data;
int i;
seq_printf(m, "%8u\t%lu\n", 0U, hctx->dispatched[0]);
for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER - 1; i++) {
unsigned int d = 1U << (i - 1);
seq_printf(m, "%8u\t%lu\n", d, hctx->dispatched[i]);
}
seq_printf(m, "%8u+\t%lu\n", 1U << (i - 1), hctx->dispatched[i]);
return 0;
}
static ssize_t hctx_dispatched_write(void *data, const char __user *buf,
size_t count, loff_t *ppos)
{
struct blk_mq_hw_ctx *hctx = data;
int i;
for (i = 0; i < BLK_MQ_MAX_DISPATCH_ORDER; i++)
hctx->dispatched[i] = 0;
return count;
}
static int hctx_queued_show(void *data, struct seq_file *m)
{
struct blk_mq_hw_ctx *hctx = data;
seq_printf(m, "%lu\n", hctx->queued);
return 0;
}
static ssize_t hctx_queued_write(void *data, const char __user *buf,
size_t count, loff_t *ppos)
{
struct blk_mq_hw_ctx *hctx = data;
hctx->queued = 0;
return count;
}
static int hctx_run_show(void *data, struct seq_file *m)
{
struct blk_mq_hw_ctx *hctx = data;
@ -614,7 +553,7 @@ static int hctx_active_show(void *data, struct seq_file *m)
{
struct blk_mq_hw_ctx *hctx = data;
seq_printf(m, "%d\n", atomic_read(&hctx->nr_active));
seq_printf(m, "%d\n", __blk_mq_active_requests(hctx));
return 0;
}
@ -663,57 +602,6 @@ CTX_RQ_SEQ_OPS(default, HCTX_TYPE_DEFAULT);
CTX_RQ_SEQ_OPS(read, HCTX_TYPE_READ);
CTX_RQ_SEQ_OPS(poll, HCTX_TYPE_POLL);
static int ctx_dispatched_show(void *data, struct seq_file *m)
{
struct blk_mq_ctx *ctx = data;
seq_printf(m, "%lu %lu\n", ctx->rq_dispatched[1], ctx->rq_dispatched[0]);
return 0;
}
static ssize_t ctx_dispatched_write(void *data, const char __user *buf,
size_t count, loff_t *ppos)
{
struct blk_mq_ctx *ctx = data;
ctx->rq_dispatched[0] = ctx->rq_dispatched[1] = 0;
return count;
}
static int ctx_merged_show(void *data, struct seq_file *m)
{
struct blk_mq_ctx *ctx = data;
seq_printf(m, "%lu\n", ctx->rq_merged);
return 0;
}
static ssize_t ctx_merged_write(void *data, const char __user *buf,
size_t count, loff_t *ppos)
{
struct blk_mq_ctx *ctx = data;
ctx->rq_merged = 0;
return count;
}
static int ctx_completed_show(void *data, struct seq_file *m)
{
struct blk_mq_ctx *ctx = data;
seq_printf(m, "%lu %lu\n", ctx->rq_completed[1], ctx->rq_completed[0]);
return 0;
}
static ssize_t ctx_completed_write(void *data, const char __user *buf,
size_t count, loff_t *ppos)
{
struct blk_mq_ctx *ctx = data;
ctx->rq_completed[0] = ctx->rq_completed[1] = 0;
return count;
}
static int blk_mq_debugfs_show(struct seq_file *m, void *v)
{
const struct blk_mq_debugfs_attr *attr = m->private;
@ -789,9 +677,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
{"tags_bitmap", 0400, hctx_tags_bitmap_show},
{"sched_tags", 0400, hctx_sched_tags_show},
{"sched_tags_bitmap", 0400, hctx_sched_tags_bitmap_show},
{"io_poll", 0600, hctx_io_poll_show, hctx_io_poll_write},
{"dispatched", 0600, hctx_dispatched_show, hctx_dispatched_write},
{"queued", 0600, hctx_queued_show, hctx_queued_write},
{"run", 0600, hctx_run_show, hctx_run_write},
{"active", 0400, hctx_active_show},
{"dispatch_busy", 0400, hctx_dispatch_busy_show},
@ -803,9 +688,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = {
{"default_rq_list", 0400, .seq_ops = &ctx_default_rq_list_seq_ops},
{"read_rq_list", 0400, .seq_ops = &ctx_read_rq_list_seq_ops},
{"poll_rq_list", 0400, .seq_ops = &ctx_poll_rq_list_seq_ops},
{"dispatched", 0600, ctx_dispatched_show, ctx_dispatched_write},
{"merged", 0600, ctx_merged_show, ctx_merged_write},
{"completed", 0600, ctx_completed_show, ctx_completed_write},
{},
};

View File

@ -18,32 +18,6 @@
#include "blk-mq-tag.h"
#include "blk-wbt.h"
void blk_mq_sched_assign_ioc(struct request *rq)
{
struct request_queue *q = rq->q;
struct io_context *ioc;
struct io_cq *icq;
/*
* May not have an IO context if it's a passthrough request
*/
ioc = current->io_context;
if (!ioc)
return;
spin_lock_irq(&q->queue_lock);
icq = ioc_lookup_icq(ioc, q);
spin_unlock_irq(&q->queue_lock);
if (!icq) {
icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
if (!icq)
return;
}
get_io_context(icq->ioc);
rq->elv.icq = icq;
}
/*
* Mark a hardware queue as needing a restart. For shared queues, maintain
* a count of how many hardware queues are marked for restart.
@ -57,10 +31,8 @@ void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
}
EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx);
void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
{
if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
return;
clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
/*
@ -363,7 +335,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
}
}
bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs)
{
struct elevator_queue *e = q->elevator;
@ -372,15 +344,17 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
bool ret = false;
enum hctx_type type;
if (e && e->type->ops.bio_merge)
return e->type->ops.bio_merge(q, bio, nr_segs);
if (e && e->type->ops.bio_merge) {
ret = e->type->ops.bio_merge(q, bio, nr_segs);
goto out_put;
}
ctx = blk_mq_get_ctx(q);
hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
type = hctx->type;
if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) ||
list_empty_careful(&ctx->rq_lists[type]))
return false;
goto out_put;
/* default per sw-queue merge */
spin_lock(&ctx->lock);
@ -389,13 +363,11 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
* potentially merge with. Currently includes a hand-wavy stop
* count of 8, to not spend too much time checking for merges.
*/
if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) {
ctx->rq_merged++;
if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs))
ret = true;
}
spin_unlock(&ctx->lock);
out_put:
return ret;
}
@ -502,8 +474,9 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
* busy in case of 'none' scheduler, and this way may save
* us one extra enqueue & dequeue to sw queue.
*/
if (!hctx->dispatch_busy && !e && !run_queue_async) {
blk_mq_try_issue_list_directly(hctx, list);
if (!hctx->dispatch_busy && !run_queue_async) {
blk_mq_run_dispatch_ops(hctx->queue,
blk_mq_try_issue_list_directly(hctx, list));
if (list_empty(list))
goto out;
}
@ -515,83 +488,71 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
percpu_ref_put(&q->q_usage_counter);
}
static int blk_mq_sched_alloc_tags(struct request_queue *q,
struct blk_mq_hw_ctx *hctx,
unsigned int hctx_idx)
static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q,
struct blk_mq_hw_ctx *hctx,
unsigned int hctx_idx)
{
struct blk_mq_tag_set *set = q->tag_set;
int ret;
hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
set->reserved_tags, set->flags);
if (!hctx->sched_tags)
return -ENOMEM;
ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
if (ret) {
blk_mq_free_rq_map(hctx->sched_tags, set->flags);
hctx->sched_tags = NULL;
if (blk_mq_is_shared_tags(q->tag_set->flags)) {
hctx->sched_tags = q->sched_shared_tags;
return 0;
}
return ret;
hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx,
q->nr_requests);
if (!hctx->sched_tags)
return -ENOMEM;
return 0;
}
static void blk_mq_exit_sched_shared_tags(struct request_queue *queue)
{
blk_mq_free_rq_map(queue->sched_shared_tags);
queue->sched_shared_tags = NULL;
}
/* called in queue's release handler, tagset has gone away */
static void blk_mq_sched_tags_teardown(struct request_queue *q)
static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags)
{
struct blk_mq_hw_ctx *hctx;
int i;
queue_for_each_hw_ctx(q, hctx, i) {
if (hctx->sched_tags) {
blk_mq_free_rq_map(hctx->sched_tags, hctx->flags);
if (!blk_mq_is_shared_tags(flags))
blk_mq_free_rq_map(hctx->sched_tags);
hctx->sched_tags = NULL;
}
}
if (blk_mq_is_shared_tags(flags))
blk_mq_exit_sched_shared_tags(q);
}
static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue)
static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
{
struct blk_mq_tag_set *set = queue->tag_set;
int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
struct blk_mq_hw_ctx *hctx;
int ret, i;
/*
* Set initial depth at max so that we don't need to reallocate for
* updating nr_requests.
*/
ret = blk_mq_init_bitmaps(&queue->sched_bitmap_tags,
&queue->sched_breserved_tags,
MAX_SCHED_RQ, set->reserved_tags,
set->numa_node, alloc_policy);
if (ret)
return ret;
queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set,
BLK_MQ_NO_HCTX_IDX,
MAX_SCHED_RQ);
if (!queue->sched_shared_tags)
return -ENOMEM;
queue_for_each_hw_ctx(queue, hctx, i) {
hctx->sched_tags->bitmap_tags =
&queue->sched_bitmap_tags;
hctx->sched_tags->breserved_tags =
&queue->sched_breserved_tags;
}
sbitmap_queue_resize(&queue->sched_bitmap_tags,
queue->nr_requests - set->reserved_tags);
blk_mq_tag_update_sched_shared_tags(queue);
return 0;
}
static void blk_mq_exit_sched_shared_sbitmap(struct request_queue *queue)
{
sbitmap_queue_free(&queue->sched_bitmap_tags);
sbitmap_queue_free(&queue->sched_breserved_tags);
}
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
unsigned int i, flags = q->tag_set->flags;
struct blk_mq_hw_ctx *hctx;
struct elevator_queue *eq;
unsigned int i;
int ret;
if (!e) {
@ -606,23 +567,23 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
* Additionally, this is a per-hw queue depth.
*/
q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
BLKDEV_MAX_RQ);
BLKDEV_DEFAULT_RQ);
queue_for_each_hw_ctx(q, hctx, i) {
ret = blk_mq_sched_alloc_tags(q, hctx, i);
if (blk_mq_is_shared_tags(flags)) {
ret = blk_mq_init_sched_shared_tags(q);
if (ret)
goto err_free_tags;
return ret;
}
if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) {
ret = blk_mq_init_sched_shared_sbitmap(q);
queue_for_each_hw_ctx(q, hctx, i) {
ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i);
if (ret)
goto err_free_tags;
goto err_free_map_and_rqs;
}
ret = e->ops.init_sched(q, e);
if (ret)
goto err_free_sbitmap;
goto err_free_map_and_rqs;
blk_mq_debugfs_register_sched(q);
@ -631,7 +592,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
ret = e->ops.init_hctx(hctx, i);
if (ret) {
eq = q->elevator;
blk_mq_sched_free_requests(q);
blk_mq_sched_free_rqs(q);
blk_mq_exit_sched(q, eq);
kobject_put(&eq->kobj);
return ret;
@ -642,12 +603,10 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
return 0;
err_free_sbitmap:
if (blk_mq_is_sbitmap_shared(q->tag_set->flags))
blk_mq_exit_sched_shared_sbitmap(q);
err_free_tags:
blk_mq_sched_free_requests(q);
blk_mq_sched_tags_teardown(q);
err_free_map_and_rqs:
blk_mq_sched_free_rqs(q);
blk_mq_sched_tags_teardown(q, flags);
q->elevator = NULL;
return ret;
}
@ -656,14 +615,20 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
* called in either blk_queue_cleanup or elevator_switch, tagset
* is required for freeing requests
*/
void blk_mq_sched_free_requests(struct request_queue *q)
void blk_mq_sched_free_rqs(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
int i;
queue_for_each_hw_ctx(q, hctx, i) {
if (hctx->sched_tags)
blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i);
if (blk_mq_is_shared_tags(q->tag_set->flags)) {
blk_mq_free_rqs(q->tag_set, q->sched_shared_tags,
BLK_MQ_NO_HCTX_IDX);
} else {
queue_for_each_hw_ctx(q, hctx, i) {
if (hctx->sched_tags)
blk_mq_free_rqs(q->tag_set,
hctx->sched_tags, i);
}
}
}
@ -684,8 +649,6 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
blk_mq_debugfs_unregister_sched(q);
if (e->type->ops.exit_sched)
e->type->ops.exit_sched(e);
blk_mq_sched_tags_teardown(q);
if (blk_mq_is_sbitmap_shared(flags))
blk_mq_exit_sched_shared_sbitmap(q);
blk_mq_sched_tags_teardown(q, flags);
q->elevator = NULL;
}

View File

@ -2,21 +2,20 @@
#ifndef BLK_MQ_SCHED_H
#define BLK_MQ_SCHED_H
#include "elevator.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
#define MAX_SCHED_RQ (16 * BLKDEV_MAX_RQ)
void blk_mq_sched_assign_ioc(struct request *rq);
#define MAX_SCHED_RQ (16 * BLKDEV_DEFAULT_RQ)
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs, struct request **merged_request);
bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs);
bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
struct list_head *free);
void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx);
void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
void blk_mq_sched_insert_request(struct request *rq, bool at_head,
bool run_queue, bool async);
@ -28,45 +27,51 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e);
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
void blk_mq_sched_free_requests(struct request_queue *q);
void blk_mq_sched_free_rqs(struct request_queue *q);
static inline bool
blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs)
static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
{
if (blk_queue_nomerges(q) || !bio_mergeable(bio))
return false;
if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
__blk_mq_sched_restart(hctx);
}
return __blk_mq_sched_bio_merge(q, bio, nr_segs);
static inline bool bio_mergeable(struct bio *bio)
{
return !(bio->bi_opf & REQ_NOMERGE_FLAGS);
}
static inline bool
blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
struct bio *bio)
{
struct elevator_queue *e = q->elevator;
if (e && e->type->ops.allow_merge)
return e->type->ops.allow_merge(q, rq, bio);
if (rq->rq_flags & RQF_ELV) {
struct elevator_queue *e = q->elevator;
if (e->type->ops.allow_merge)
return e->type->ops.allow_merge(q, rq, bio);
}
return true;
}
static inline void blk_mq_sched_completed_request(struct request *rq, u64 now)
{
struct elevator_queue *e = rq->q->elevator;
if (rq->rq_flags & RQF_ELV) {
struct elevator_queue *e = rq->q->elevator;
if (e && e->type->ops.completed_request)
e->type->ops.completed_request(rq, now);
if (e->type->ops.completed_request)
e->type->ops.completed_request(rq, now);
}
}
static inline void blk_mq_sched_requeue_request(struct request *rq)
{
struct request_queue *q = rq->q;
struct elevator_queue *e = q->elevator;
if (rq->rq_flags & RQF_ELV) {
struct request_queue *q = rq->q;
struct elevator_queue *e = q->elevator;
if ((rq->rq_flags & RQF_ELVPRIV) && e && e->type->ops.requeue_request)
e->type->ops.requeue_request(rq);
if ((rq->rq_flags & RQF_ELVPRIV) && e->type->ops.requeue_request)
e->type->ops.requeue_request(rq);
}
}
static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx)

View File

@ -36,8 +36,6 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj)
struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx,
kobj);
if (hctx->flags & BLK_MQ_F_BLOCKING)
cleanup_srcu_struct(hctx->srcu);
blk_free_flush_queue(hctx->fq);
sbitmap_free(&hctx->ctx_map);
free_cpumask_var(hctx->cpumask);

View File

@ -16,6 +16,21 @@
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
/*
* Recalculate the wakeup batch when tags are shared between hctxs.
*/
static void blk_mq_update_wake_batch(struct blk_mq_tags *tags,
unsigned int users)
{
if (!users)
return;
sbitmap_queue_recalculate_wake_batch(&tags->bitmap_tags,
users);
sbitmap_queue_recalculate_wake_batch(&tags->breserved_tags,
users);
}
/*
* If a previously inactive queue goes active, bump the active user count.
* We need to do this before try to allocate driver tag, then even if fail
@ -24,19 +39,26 @@
*/
bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
if (blk_mq_is_sbitmap_shared(hctx->flags)) {
struct request_queue *q = hctx->queue;
struct blk_mq_tag_set *set = q->tag_set;
unsigned int users;
if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) &&
!test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
atomic_inc(&set->active_queues_shared_sbitmap);
if (blk_mq_is_shared_tags(hctx->flags)) {
struct request_queue *q = hctx->queue;
if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) ||
test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) {
return true;
}
} else {
if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
!test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
atomic_inc(&hctx->tags->active_queues);
if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) ||
test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) {
return true;
}
}
users = atomic_inc_return(&hctx->tags->active_queues);
blk_mq_update_wake_batch(hctx->tags, users);
return true;
}
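
The active_queues count maintained here feeds the fair-share check applied when tags are shared between queues (hctx_may_queue()), and the wake batch is recalculated so waiters are still woken as each queue's slice shrinks. Roughly, the per-queue allowance works out as below; the helper is only an illustration of that arithmetic, not kernel code.

#include <linux/kernel.h>

/* E.g. a 64-tag shared set with 4 active queues allows about
 * DIV_ROUND_UP(64, 4) = 16 in-flight tags per queue (never below 4). */
static inline unsigned int example_fair_tag_share(unsigned int depth,
						  unsigned int active_queues)
{
	if (!active_queues)
		return depth;
	return max(DIV_ROUND_UP(depth, active_queues), 4U);
}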
@ -45,9 +67,9 @@ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
*/
void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
{
sbitmap_queue_wake_all(tags->bitmap_tags);
sbitmap_queue_wake_all(&tags->bitmap_tags);
if (include_reserve)
sbitmap_queue_wake_all(tags->breserved_tags);
sbitmap_queue_wake_all(&tags->breserved_tags);
}
/*
@ -57,20 +79,23 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
struct blk_mq_tags *tags = hctx->tags;
struct request_queue *q = hctx->queue;
struct blk_mq_tag_set *set = q->tag_set;
unsigned int users;
if (blk_mq_is_shared_tags(hctx->flags)) {
struct request_queue *q = hctx->queue;
if (blk_mq_is_sbitmap_shared(hctx->flags)) {
if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE,
&q->queue_flags))
return;
atomic_dec(&set->active_queues_shared_sbitmap);
} else {
if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
return;
atomic_dec(&tags->active_queues);
}
users = atomic_dec_return(&tags->active_queues);
blk_mq_update_wake_batch(tags, users);
blk_mq_tag_wakeup_all(tags, false);
}
@ -87,6 +112,21 @@ static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
return __sbitmap_queue_get(bt);
}
unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
unsigned int *offset)
{
struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
struct sbitmap_queue *bt = &tags->bitmap_tags;
unsigned long ret;
if (data->shallow_depth || data->flags & BLK_MQ_REQ_RESERVED ||
data->hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
return 0;
ret = __sbitmap_queue_get_batch(bt, nr_tags, offset);
*offset += tags->nr_reserved_tags;
return ret;
}
unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
{
struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
@ -101,10 +141,10 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
WARN_ON_ONCE(1);
return BLK_MQ_NO_TAG;
}
bt = tags->breserved_tags;
bt = &tags->breserved_tags;
tag_offset = 0;
} else {
bt = tags->bitmap_tags;
bt = &tags->bitmap_tags;
tag_offset = tags->nr_reserved_tags;
}
@ -150,9 +190,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
data->ctx);
tags = blk_mq_tags_from_data(data);
if (data->flags & BLK_MQ_REQ_RESERVED)
bt = tags->breserved_tags;
bt = &tags->breserved_tags;
else
bt = tags->bitmap_tags;
bt = &tags->bitmap_tags;
/*
* If destination hw queue is changed, fake wake up on
@ -186,16 +226,23 @@ void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
const int real_tag = tag - tags->nr_reserved_tags;
BUG_ON(real_tag >= tags->nr_tags);
sbitmap_queue_clear(tags->bitmap_tags, real_tag, ctx->cpu);
sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu);
} else {
BUG_ON(tag >= tags->nr_reserved_tags);
sbitmap_queue_clear(tags->breserved_tags, tag, ctx->cpu);
sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu);
}
}
void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags)
{
sbitmap_queue_clear_batch(&tags->bitmap_tags, tags->nr_reserved_tags,
tag_array, nr_tags);
}
struct bt_iter_data {
struct blk_mq_hw_ctx *hctx;
busy_iter_fn *fn;
struct request_queue *q;
busy_tag_iter_fn *fn;
void *data;
bool reserved;
};
@ -208,7 +255,7 @@ static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags,
spin_lock_irqsave(&tags->lock, flags);
rq = tags->rqs[bitnr];
if (!rq || rq->tag != bitnr || !refcount_inc_not_zero(&rq->ref))
if (!rq || rq->tag != bitnr || !req_ref_inc_not_zero(rq))
rq = NULL;
spin_unlock_irqrestore(&tags->lock, flags);
return rq;
@ -218,11 +265,18 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
struct bt_iter_data *iter_data = data;
struct blk_mq_hw_ctx *hctx = iter_data->hctx;
struct blk_mq_tags *tags = hctx->tags;
struct request_queue *q = iter_data->q;
struct blk_mq_tag_set *set = q->tag_set;
bool reserved = iter_data->reserved;
struct blk_mq_tags *tags;
struct request *rq;
bool ret = true;
if (blk_mq_is_shared_tags(set->flags))
tags = set->shared_tags;
else
tags = hctx->tags;
if (!reserved)
bitnr += tags->nr_reserved_tags;
/*
@ -233,8 +287,8 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
if (!rq)
return true;
if (rq->q == hctx->queue && rq->mq_hctx == hctx)
ret = iter_data->fn(hctx, rq, iter_data->data, reserved);
if (rq->q == q && (!hctx || rq->mq_hctx == hctx))
ret = iter_data->fn(rq, iter_data->data, reserved);
blk_mq_put_rq_ref(rq);
return ret;
}
@ -242,6 +296,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
/**
* bt_for_each - iterate over the requests associated with a hardware queue
* @hctx: Hardware queue to examine.
* @q: Request queue to examine.
* @bt: sbitmap to examine. This is either the breserved_tags member
* or the bitmap_tags member of struct blk_mq_tags.
* @fn: Pointer to the function that will be called for each request
@ -253,14 +308,16 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
* @reserved: Indicates whether @bt is the breserved_tags member or the
* bitmap_tags member of struct blk_mq_tags.
*/
static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt,
busy_iter_fn *fn, void *data, bool reserved)
static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct request_queue *q,
struct sbitmap_queue *bt, busy_tag_iter_fn *fn,
void *data, bool reserved)
{
struct bt_iter_data iter_data = {
.hctx = hctx,
.fn = fn,
.data = data,
.reserved = reserved,
.q = q,
};
sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data);
@ -340,9 +397,9 @@ static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags,
WARN_ON_ONCE(flags & BT_TAG_ITER_RESERVED);
if (tags->nr_reserved_tags)
bt_tags_for_each(tags, tags->breserved_tags, fn, priv,
bt_tags_for_each(tags, &tags->breserved_tags, fn, priv,
flags | BT_TAG_ITER_RESERVED);
bt_tags_for_each(tags, tags->bitmap_tags, fn, priv, flags);
bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, flags);
}
/**
@ -379,9 +436,12 @@ void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
busy_tag_iter_fn *fn, void *priv)
{
int i;
unsigned int flags = tagset->flags;
int i, nr_tags;
for (i = 0; i < tagset->nr_hw_queues; i++) {
nr_tags = blk_mq_is_shared_tags(flags) ? 1 : tagset->nr_hw_queues;
for (i = 0; i < nr_tags; i++) {
if (tagset->tags && tagset->tags[i])
__blk_mq_all_tag_iter(tagset->tags[i], fn, priv,
BT_TAG_ITER_STARTED);
@ -434,12 +494,9 @@ EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request);
* called for all requests on all queues that share that tag set and not only
* for requests associated with @q.
*/
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn,
void *priv)
{
struct blk_mq_hw_ctx *hctx;
int i;
/*
* __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx
* while the queue is frozen. So we can use q_usage_counter to avoid
@ -448,19 +505,34 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
if (!percpu_ref_tryget(&q->q_usage_counter))
return;
queue_for_each_hw_ctx(q, hctx, i) {
struct blk_mq_tags *tags = hctx->tags;
/*
* If no software queues are currently mapped to this
* hardware queue, there's nothing to check
*/
if (!blk_mq_hw_queue_mapped(hctx))
continue;
if (blk_mq_is_shared_tags(q->tag_set->flags)) {
struct blk_mq_tags *tags = q->tag_set->shared_tags;
struct sbitmap_queue *bresv = &tags->breserved_tags;
struct sbitmap_queue *btags = &tags->bitmap_tags;
if (tags->nr_reserved_tags)
bt_for_each(hctx, tags->breserved_tags, fn, priv, true);
bt_for_each(hctx, tags->bitmap_tags, fn, priv, false);
bt_for_each(NULL, q, bresv, fn, priv, true);
bt_for_each(NULL, q, btags, fn, priv, false);
} else {
struct blk_mq_hw_ctx *hctx;
int i;
queue_for_each_hw_ctx(q, hctx, i) {
struct blk_mq_tags *tags = hctx->tags;
struct sbitmap_queue *bresv = &tags->breserved_tags;
struct sbitmap_queue *btags = &tags->bitmap_tags;
/*
* If no software queues are currently mapped to this
* hardware queue, there's nothing to check
*/
if (!blk_mq_hw_queue_mapped(hctx))
continue;
if (tags->nr_reserved_tags)
bt_for_each(hctx, q, bresv, fn, priv, true);
bt_for_each(hctx, q, btags, fn, priv, false);
}
}
blk_queue_exit(q);
}
@ -492,56 +564,10 @@ int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags,
return -ENOMEM;
}
static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
int node, int alloc_policy)
{
int ret;
ret = blk_mq_init_bitmaps(&tags->__bitmap_tags,
&tags->__breserved_tags,
tags->nr_tags, tags->nr_reserved_tags,
node, alloc_policy);
if (ret)
return ret;
tags->bitmap_tags = &tags->__bitmap_tags;
tags->breserved_tags = &tags->__breserved_tags;
return 0;
}
int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set)
{
int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags);
int i, ret;
ret = blk_mq_init_bitmaps(&set->__bitmap_tags, &set->__breserved_tags,
set->queue_depth, set->reserved_tags,
set->numa_node, alloc_policy);
if (ret)
return ret;
for (i = 0; i < set->nr_hw_queues; i++) {
struct blk_mq_tags *tags = set->tags[i];
tags->bitmap_tags = &set->__bitmap_tags;
tags->breserved_tags = &set->__breserved_tags;
}
return 0;
}
void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set)
{
sbitmap_queue_free(&set->__bitmap_tags);
sbitmap_queue_free(&set->__breserved_tags);
}
struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
unsigned int reserved_tags,
int node, unsigned int flags)
int node, int alloc_policy)
{
int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(flags);
struct blk_mq_tags *tags;
if (total_tags > BLK_MQ_TAG_MAX) {
@ -557,22 +583,19 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
tags->nr_reserved_tags = reserved_tags;
spin_lock_init(&tags->lock);
if (blk_mq_is_sbitmap_shared(flags))
return tags;
if (blk_mq_init_bitmap_tags(tags, node, alloc_policy) < 0) {
if (blk_mq_init_bitmaps(&tags->bitmap_tags, &tags->breserved_tags,
total_tags, reserved_tags, node,
alloc_policy) < 0) {
kfree(tags);
return NULL;
}
return tags;
}
void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags)
void blk_mq_free_tags(struct blk_mq_tags *tags)
{
if (!blk_mq_is_sbitmap_shared(flags)) {
sbitmap_queue_free(tags->bitmap_tags);
sbitmap_queue_free(tags->breserved_tags);
}
sbitmap_queue_free(&tags->bitmap_tags);
sbitmap_queue_free(&tags->breserved_tags);
kfree(tags);
}
@ -592,7 +615,6 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
if (tdepth > tags->nr_tags) {
struct blk_mq_tag_set *set = hctx->queue->tag_set;
struct blk_mq_tags *new;
bool ret;
if (!can_grow)
return -EINVAL;
@ -604,34 +626,42 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
if (tdepth > MAX_SCHED_RQ)
return -EINVAL;
new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth,
tags->nr_reserved_tags, set->flags);
/*
* Only the sbitmap needs resizing since we allocated the max
* initially.
*/
if (blk_mq_is_shared_tags(set->flags))
return 0;
new = blk_mq_alloc_map_and_rqs(set, hctx->queue_num, tdepth);
if (!new)
return -ENOMEM;
ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);
if (ret) {
blk_mq_free_rq_map(new, set->flags);
return -ENOMEM;
}
blk_mq_free_rqs(set, *tagsptr, hctx->queue_num);
blk_mq_free_rq_map(*tagsptr, set->flags);
blk_mq_free_map_and_rqs(set, *tagsptr, hctx->queue_num);
*tagsptr = new;
} else {
/*
* Don't need (or can't) update reserved tags here, they
* remain static and should never need resizing.
*/
sbitmap_queue_resize(tags->bitmap_tags,
sbitmap_queue_resize(&tags->bitmap_tags,
tdepth - tags->nr_reserved_tags);
}
return 0;
}
void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int size)
void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size)
{
sbitmap_queue_resize(&set->__bitmap_tags, size - set->reserved_tags);
struct blk_mq_tags *tags = set->shared_tags;
sbitmap_queue_resize(&tags->bitmap_tags, size - set->reserved_tags);
}
void blk_mq_tag_update_sched_shared_tags(struct request_queue *q)
{
sbitmap_queue_resize(&q->sched_shared_tags->bitmap_tags,
q->nr_requests - q->tag_set->reserved_tags);
}
/**

View File

@ -2,55 +2,33 @@
#ifndef INT_BLK_MQ_TAG_H
#define INT_BLK_MQ_TAG_H
/*
* Tag address space map.
*/
struct blk_mq_tags {
unsigned int nr_tags;
unsigned int nr_reserved_tags;
atomic_t active_queues;
struct sbitmap_queue *bitmap_tags;
struct sbitmap_queue *breserved_tags;
struct sbitmap_queue __bitmap_tags;
struct sbitmap_queue __breserved_tags;
struct request **rqs;
struct request **static_rqs;
struct list_head page_list;
/*
* used to clear request reference in rqs[] before freeing one
* request pool
*/
spinlock_t lock;
};
struct blk_mq_alloc_data;
extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,
unsigned int reserved_tags,
int node, unsigned int flags);
extern void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags);
int node, int alloc_policy);
extern void blk_mq_free_tags(struct blk_mq_tags *tags);
extern int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags,
struct sbitmap_queue *breserved_tags,
unsigned int queue_depth,
unsigned int reserved,
int node, int alloc_policy);
extern int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set);
extern void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set);
extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
unsigned int *offset);
extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
unsigned int tag);
void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags);
extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
struct blk_mq_tags **tags,
unsigned int depth, bool can_grow);
extern void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set,
extern void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set,
unsigned int size);
extern void blk_mq_tag_update_sched_shared_tags(struct request_queue *q);
extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn,
void *priv);
void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
void *priv);

File diff suppressed because it is too large

View File

@ -25,18 +25,14 @@ struct blk_mq_ctx {
unsigned short index_hw[HCTX_MAX_TYPES];
struct blk_mq_hw_ctx *hctxs[HCTX_MAX_TYPES];
/* incremented at dispatch time */
unsigned long rq_dispatched[2];
unsigned long rq_merged;
/* incremented at completion time */
unsigned long ____cacheline_aligned_in_smp rq_completed[2];
struct request_queue *queue;
struct blk_mq_ctxs *ctxs;
struct kobject kobj;
} ____cacheline_aligned_in_smp;
void blk_mq_submit_bio(struct bio *bio);
int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
unsigned int flags);
void blk_mq_exit_queue(struct request_queue *q);
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
void blk_mq_wake_waiters(struct request_queue *q);
@ -54,15 +50,12 @@ void blk_mq_put_rq_ref(struct request *rq);
*/
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx);
void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags);
struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
unsigned int hctx_idx,
unsigned int nr_tags,
unsigned int reserved_tags,
unsigned int flags);
int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx, unsigned int depth);
void blk_mq_free_rq_map(struct blk_mq_tags *tags);
struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
unsigned int hctx_idx, unsigned int depth);
void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
struct blk_mq_tags *tags,
unsigned int hctx_idx);
/*
* Internal helpers for request insertion into sw queues
*/
@ -72,9 +65,6 @@ void blk_mq_request_bypass_insert(struct request *rq, bool at_head,
bool run_queue);
void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
struct list_head *list);
/* Used by blk_insert_cloned_request() to issue request directly */
blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last);
void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
struct list_head *list);
@ -96,6 +86,20 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *
return q->queue_hw_ctx[q->tag_set->map[type].mq_map[cpu]];
}
static inline enum hctx_type blk_mq_get_hctx_type(unsigned int flags)
{
enum hctx_type type = HCTX_TYPE_DEFAULT;
/*
* The caller ensures that if REQ_POLLED is set, polling must be enabled.
*/
if (flags & REQ_POLLED)
type = HCTX_TYPE_POLL;
else if ((flags & REQ_OP_MASK) == REQ_OP_READ)
type = HCTX_TYPE_READ;
return type;
}
/*
* blk_mq_map_queue() - map (cmd_flags,type) to hardware queue
* @q: request queue
@ -106,17 +110,7 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
unsigned int flags,
struct blk_mq_ctx *ctx)
{
enum hctx_type type = HCTX_TYPE_DEFAULT;
/*
* The caller ensures that if REQ_HIPRI is set, polling must be enabled.
*/
if (flags & REQ_HIPRI)
type = HCTX_TYPE_POLL;
else if ((flags & REQ_OP_MASK) == REQ_OP_READ)
type = HCTX_TYPE_READ;
return ctx->hctxs[type];
return ctx->hctxs[blk_mq_get_hctx_type(flags)];
}
/*
@ -128,6 +122,8 @@ extern int __blk_mq_register_dev(struct device *dev, struct request_queue *q);
extern int blk_mq_sysfs_register(struct request_queue *q);
extern void blk_mq_sysfs_unregister(struct request_queue *q);
extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
void blk_mq_free_plug_rqs(struct blk_plug *plug);
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
void blk_mq_cancel_work_sync(struct request_queue *q);
@ -156,23 +152,27 @@ struct blk_mq_alloc_data {
blk_mq_req_flags_t flags;
unsigned int shallow_depth;
unsigned int cmd_flags;
req_flags_t rq_flags;
/* allocate multiple requests/tags in one go */
unsigned int nr_tags;
struct request **cached_rq;
/* input & output parameter */
struct blk_mq_ctx *ctx;
struct blk_mq_hw_ctx *hctx;
};
static inline bool blk_mq_is_sbitmap_shared(unsigned int flags)
static inline bool blk_mq_is_shared_tags(unsigned int flags)
{
return flags & BLK_MQ_F_TAG_HCTX_SHARED;
}
static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data)
{
if (data->q->elevator)
return data->hctx->sched_tags;
return data->hctx->tags;
if (!(data->rq_flags & RQF_ELV))
return data->hctx->tags;
return data->hctx->sched_tags;
}
static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx)
@ -222,24 +222,30 @@ static inline int blk_mq_get_rq_budget_token(struct request *rq)
static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx)
{
if (blk_mq_is_sbitmap_shared(hctx->flags))
atomic_inc(&hctx->queue->nr_active_requests_shared_sbitmap);
if (blk_mq_is_shared_tags(hctx->flags))
atomic_inc(&hctx->queue->nr_active_requests_shared_tags);
else
atomic_inc(&hctx->nr_active);
}
static inline void __blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx,
int val)
{
if (blk_mq_is_shared_tags(hctx->flags))
atomic_sub(val, &hctx->queue->nr_active_requests_shared_tags);
else
atomic_sub(val, &hctx->nr_active);
}
static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx)
{
if (blk_mq_is_sbitmap_shared(hctx->flags))
atomic_dec(&hctx->queue->nr_active_requests_shared_sbitmap);
else
atomic_dec(&hctx->nr_active);
__blk_mq_sub_active_requests(hctx, 1);
}
static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx)
{
if (blk_mq_is_sbitmap_shared(hctx->flags))
return atomic_read(&hctx->queue->nr_active_requests_shared_sbitmap);
if (blk_mq_is_shared_tags(hctx->flags))
return atomic_read(&hctx->queue->nr_active_requests_shared_tags);
return atomic_read(&hctx->nr_active);
}
static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
@ -262,7 +268,20 @@ static inline void blk_mq_put_driver_tag(struct request *rq)
__blk_mq_put_driver_tag(rq->mq_hctx, rq);
}
bool blk_mq_get_driver_tag(struct request *rq);
bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq);
static inline bool blk_mq_get_driver_tag(struct request *rq)
{
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
if (rq->tag != BLK_MQ_NO_TAG &&
!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
hctx->tags->rqs[rq->tag] = rq;
return true;
}
return __blk_mq_get_driver_tag(hctx, rq);
}
static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
{
@ -333,19 +352,18 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
if (bt->sb.depth == 1)
return true;
if (blk_mq_is_sbitmap_shared(hctx->flags)) {
if (blk_mq_is_shared_tags(hctx->flags)) {
struct request_queue *q = hctx->queue;
struct blk_mq_tag_set *set = q->tag_set;
if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
return true;
users = atomic_read(&set->active_queues_shared_sbitmap);
} else {
if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
return true;
users = atomic_read(&hctx->tags->active_queues);
}
users = atomic_read(&hctx->tags->active_queues);
if (!users)
return true;
@ -356,5 +374,24 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
return __blk_mq_active_requests(hctx) < depth;
}
/* run the code block in @dispatch_ops with rcu/srcu read lock held */
#define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops) \
do { \
if (!blk_queue_has_srcu(q)) { \
rcu_read_lock(); \
(dispatch_ops); \
rcu_read_unlock(); \
} else { \
int srcu_idx; \
\
might_sleep_if(check_sleep); \
srcu_idx = srcu_read_lock((q)->srcu); \
(dispatch_ops); \
srcu_read_unlock((q)->srcu, srcu_idx); \
} \
} while (0)
#define blk_mq_run_dispatch_ops(q, dispatch_ops) \
__blk_mq_run_dispatch_ops(q, true, dispatch_ops) \
#endif
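For orientation, a minimal sketch of how blk_mq_run_dispatch_ops() is meant to be used: the body passed as dispatch_ops runs under rcu_read_lock() on ordinary queues and under srcu_read_lock() on queues that need SRCU for a blocking ->queue_rq. The call below mirrors the dispatch path (blk_mq_sched_dispatch_requests() is assumed to be in scope via blk-mq-sched.h); treat it as an illustration, not a definitive call site.

/* Sketch: run the scheduler dispatch for one hctx under the protection
 * picked by blk_mq_run_dispatch_ops() (RCU or SRCU). */
static void example_dispatch_one_hctx(struct blk_mq_hw_ctx *hctx)
{
	blk_mq_run_dispatch_ops(hctx->queue,
			blk_mq_sched_dispatch_requests(hctx));
}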

View File

@ -189,9 +189,10 @@ static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
* BIO_TRACKED lets controllers know that a bio went through the
* normal rq_qos path.
*/
bio_set_flag(bio, BIO_TRACKED);
if (q->rq_qos)
if (q->rq_qos) {
bio_set_flag(bio, BIO_TRACKED);
__rq_qos_throttle(q->rq_qos, bio);
}
}
static inline void rq_qos_track(struct request_queue *q, struct request *rq,

View File

@ -15,7 +15,7 @@
struct blk_queue_stats {
struct list_head callbacks;
spinlock_t lock;
bool enable_accounting;
int accounting;
};
void blk_rq_stat_init(struct blk_rq_stat *stat)
@ -161,7 +161,7 @@ void blk_stat_remove_callback(struct request_queue *q,
spin_lock_irqsave(&q->stats->lock, flags);
list_del_rcu(&cb->list);
if (list_empty(&q->stats->callbacks) && !q->stats->enable_accounting)
if (list_empty(&q->stats->callbacks) && !q->stats->accounting)
blk_queue_flag_clear(QUEUE_FLAG_STATS, q);
spin_unlock_irqrestore(&q->stats->lock, flags);
@ -184,13 +184,24 @@ void blk_stat_free_callback(struct blk_stat_callback *cb)
call_rcu(&cb->rcu, blk_stat_free_callback_rcu);
}
void blk_stat_disable_accounting(struct request_queue *q)
{
unsigned long flags;
spin_lock_irqsave(&q->stats->lock, flags);
if (!--q->stats->accounting)
blk_queue_flag_clear(QUEUE_FLAG_STATS, q);
spin_unlock_irqrestore(&q->stats->lock, flags);
}
EXPORT_SYMBOL_GPL(blk_stat_disable_accounting);
void blk_stat_enable_accounting(struct request_queue *q)
{
unsigned long flags;
spin_lock_irqsave(&q->stats->lock, flags);
q->stats->enable_accounting = true;
blk_queue_flag_set(QUEUE_FLAG_STATS, q);
if (!q->stats->accounting++)
blk_queue_flag_set(QUEUE_FLAG_STATS, q);
spin_unlock_irqrestore(&q->stats->lock, flags);
}
EXPORT_SYMBOL_GPL(blk_stat_enable_accounting);
@ -205,7 +216,7 @@ struct blk_queue_stats *blk_alloc_queue_stats(void)
INIT_LIST_HEAD(&stats->callbacks);
spin_lock_init(&stats->lock);
stats->enable_accounting = false;
stats->accounting = 0;
return stats;
}
@ -219,3 +230,21 @@ void blk_free_queue_stats(struct blk_queue_stats *stats)
kfree(stats);
}
bool blk_stats_alloc_enable(struct request_queue *q)
{
struct blk_rq_stat *poll_stat;
poll_stat = kcalloc(BLK_MQ_POLL_STATS_BKTS, sizeof(*poll_stat),
GFP_ATOMIC);
if (!poll_stat)
return false;
if (cmpxchg(&q->poll_stat, NULL, poll_stat) != NULL) {
kfree(poll_stat);
return true;
}
blk_stat_add_callback(q, q->poll_cb);
return false;
}

View File

@ -64,11 +64,13 @@ struct blk_stat_callback {
struct blk_queue_stats *blk_alloc_queue_stats(void);
void blk_free_queue_stats(struct blk_queue_stats *);
bool blk_stats_alloc_enable(struct request_queue *q);
void blk_stat_add(struct request *rq, u64 now);
/* record time/size info in request but not add a callback */
void blk_stat_enable_accounting(struct request_queue *q);
void blk_stat_disable_accounting(struct request_queue *q);
/**
* blk_stat_alloc_callback() - Allocate a block statistics callback.

View File

@ -16,7 +16,9 @@
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-wbt.h"
#include "blk-throttle.h"
struct queue_sysfs_entry {
struct attribute attr;
@ -432,26 +434,11 @@ static ssize_t queue_poll_show(struct request_queue *q, char *page)
static ssize_t queue_poll_store(struct request_queue *q, const char *page,
size_t count)
{
unsigned long poll_on;
ssize_t ret;
if (!q->tag_set || q->tag_set->nr_maps <= HCTX_TYPE_POLL ||
!q->tag_set->map[HCTX_TYPE_POLL].nr_queues)
if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
return -EINVAL;
ret = queue_var_store(&poll_on, page, count);
if (ret < 0)
return ret;
if (poll_on) {
blk_queue_flag_set(QUEUE_FLAG_POLL, q);
} else {
blk_mq_freeze_queue(q);
blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
blk_mq_unfreeze_queue(q);
}
return ret;
pr_info_ratelimited("writes to the poll attribute are ignored.\n");
pr_info_ratelimited("please use driver specific parameters instead.\n");
return count;
}
static ssize_t queue_io_timeout_show(struct request_queue *q, char *page)
@ -748,7 +735,8 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head)
{
struct request_queue *q = container_of(rcu_head, struct request_queue,
rcu_head);
kmem_cache_free(blk_requestq_cachep, q);
kmem_cache_free(blk_get_queue_kmem_cache(blk_queue_has_srcu(q)), q);
}
/* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */
@ -761,7 +749,7 @@ static void blk_exit_queue(struct request_queue *q)
*/
if (q->elevator) {
ioc_clear_queue(q);
__elevator_exit(q, q->elevator);
elevator_exit(q);
}
/*
@ -799,14 +787,15 @@ static void blk_release_queue(struct kobject *kobj)
might_sleep();
if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags))
if (q->poll_stat)
blk_stat_remove_callback(q, q->poll_cb);
blk_stat_free_callback(q->poll_cb);
blk_free_queue_stats(q->stats);
blk_exit_queue(q);
blk_free_queue_stats(q->stats);
kfree(q->poll_stat);
blk_queue_free_zone_bitmaps(q);
if (queue_is_mq(q))
@ -822,6 +811,9 @@ static void blk_release_queue(struct kobject *kobj)
bioset_exit(&q->bio_split);
if (blk_queue_has_srcu(q))
cleanup_srcu_struct(q->srcu);
ida_simple_remove(&blk_queue_ida, q->id);
call_rcu(&q->rcu_head, blk_free_queue_rcu);
}
@ -877,16 +869,15 @@ int blk_register_queue(struct gendisk *disk)
}
mutex_lock(&q->sysfs_lock);
ret = disk_register_independent_access_ranges(disk, NULL);
if (ret)
goto put_dev;
if (q->elevator) {
ret = elv_register_queue(q, false);
if (ret) {
mutex_unlock(&q->sysfs_lock);
mutex_unlock(&q->sysfs_dir_lock);
kobject_del(&q->kobj);
blk_trace_remove_sysfs(dev);
kobject_put(&dev->kobj);
return ret;
}
if (ret)
goto put_dev;
}
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
@ -899,7 +890,6 @@ int blk_register_queue(struct gendisk *disk)
kobject_uevent(&q->elevator->kobj, KOBJ_ADD);
mutex_unlock(&q->sysfs_lock);
ret = 0;
unlock:
mutex_unlock(&q->sysfs_dir_lock);
@ -917,6 +907,16 @@ int blk_register_queue(struct gendisk *disk)
percpu_ref_switch_to_percpu(&q->q_usage_counter);
}
return ret;
put_dev:
disk_unregister_independent_access_ranges(disk);
mutex_unlock(&q->sysfs_lock);
mutex_unlock(&q->sysfs_dir_lock);
kobject_del(&q->kobj);
blk_trace_remove_sysfs(dev);
kobject_put(&dev->kobj);
return ret;
}
@ -962,6 +962,7 @@ void blk_unregister_queue(struct gendisk *disk)
mutex_lock(&q->sysfs_lock);
if (q->elevator)
elv_unregister_queue(q);
disk_unregister_independent_access_ranges(disk);
mutex_unlock(&q->sysfs_lock);
mutex_unlock(&q->sysfs_dir_lock);

View File

@ -13,6 +13,8 @@
#include <linux/blk-cgroup.h>
#include "blk.h"
#include "blk-cgroup-rwstat.h"
#include "blk-stat.h"
#include "blk-throttle.h"
/* Max dispatch from a group in 1 round */
#define THROTL_GRP_QUANTUM 8
@ -37,60 +39,9 @@
*/
#define LATENCY_FILTERED_HD (1000L) /* 1ms */
static struct blkcg_policy blkcg_policy_throtl;
/* A workqueue to queue throttle related work */
static struct workqueue_struct *kthrotld_workqueue;
/*
* To implement hierarchical throttling, throtl_grps form a tree and bios
* are dispatched upwards level by level until they reach the top and get
* issued. When dispatching bios from the children and local group at each
* level, if the bios are dispatched into a single bio_list, there's a risk
* of a local or child group which can queue many bios at once, filling up
* the list and starving others.
*
* To avoid such starvation, dispatched bios are queued separately
* according to where they came from. When they are again dispatched to
* the parent, they're popped in round-robin order so that no single source
* hogs the dispatch window.
*
* throtl_qnode is used to keep the queued bios separated by their sources.
* Bios are queued to throtl_qnode which in turn is queued to
* throtl_service_queue and then dispatched in round-robin order.
*
* It's also used to track the reference counts on blkg's. A qnode always
* belongs to a throtl_grp and gets queued on itself or the parent, so
* incrementing the reference of the associated throtl_grp when a qnode is
* queued and decrementing when dequeued is enough to keep the whole blkg
* tree pinned while bios are in flight.
*/
struct throtl_qnode {
struct list_head node; /* service_queue->queued[] */
struct bio_list bios; /* queued bios */
struct throtl_grp *tg; /* tg this qnode belongs to */
};
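As a simplified sketch of the round-robin dispatch described in the comment above (illustrative only; the real helper also handles qnodes that become empty and drops the associated blkg references): the dispatcher takes the first qnode on the queued list, pops one bio from it, and rotates that qnode to the tail so no single source monopolizes the dispatch window.

/* Sketch only: pop one bio from the queued qnodes in round-robin order. */
static struct bio *example_qnode_pop(struct list_head *queued)
{
	struct throtl_qnode *qn;
	struct bio *bio;

	if (list_empty(queued))
		return NULL;
	qn = list_first_entry(queued, struct throtl_qnode, node);
	bio = bio_list_pop(&qn->bios);
	list_move_tail(&qn->node, queued);	/* let the other sources take a turn */
	return bio;
}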
struct throtl_service_queue {
struct throtl_service_queue *parent_sq; /* the parent service_queue */
/*
* Bios queued directly to this service_queue or dispatched from
* children throtl_grp's.
*/
struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */
unsigned int nr_queued[2]; /* number of queued bios */
/*
* RB tree of active children throtl_grp's, which are sorted by
* their ->disptime.
*/
struct rb_root_cached pending_tree; /* RB tree of active tgs */
unsigned int nr_pending; /* # queued in the tree */
unsigned long first_pending_disptime; /* disptime of the first tg */
struct timer_list pending_timer; /* fires on first_pending_disptime */
};
enum tg_state_flags {
THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */
@ -98,93 +49,6 @@ enum tg_state_flags {
#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
enum {
LIMIT_LOW,
LIMIT_MAX,
LIMIT_CNT,
};
struct throtl_grp {
/* must be the first member */
struct blkg_policy_data pd;
/* active throtl group service_queue member */
struct rb_node rb_node;
/* throtl_data this group belongs to */
struct throtl_data *td;
/* this group's service queue */
struct throtl_service_queue service_queue;
/*
* qnode_on_self is used when bios are directly queued to this
* throtl_grp so that local bios compete fairly with bios
* dispatched from children. qnode_on_parent is used when bios are
* dispatched from this throtl_grp into its parent and will compete
* with the sibling qnode_on_parents and the parent's
* qnode_on_self.
*/
struct throtl_qnode qnode_on_self[2];
struct throtl_qnode qnode_on_parent[2];
/*
* Dispatch time in jiffies. This is the estimated time when group
* will unthrottle and is ready to dispatch more bio. It is used as
* key to sort active groups in service tree.
*/
unsigned long disptime;
unsigned int flags;
/* are there any throtl rules between this group and td? */
bool has_rules[2];
/* internally used bytes per second rate limits */
uint64_t bps[2][LIMIT_CNT];
/* user configured bps limits */
uint64_t bps_conf[2][LIMIT_CNT];
/* internally used IOPS limits */
unsigned int iops[2][LIMIT_CNT];
/* user configured IOPS limits */
unsigned int iops_conf[2][LIMIT_CNT];
/* Number of bytes dispatched in current slice */
uint64_t bytes_disp[2];
/* Number of bio's dispatched in current slice */
unsigned int io_disp[2];
unsigned long last_low_overflow_time[2];
uint64_t last_bytes_disp[2];
unsigned int last_io_disp[2];
unsigned long last_check_time;
unsigned long latency_target; /* us */
unsigned long latency_target_conf; /* us */
/* When did we start a new slice */
unsigned long slice_start[2];
unsigned long slice_end[2];
unsigned long last_finish_time; /* ns / 1024 */
unsigned long checked_last_finish_time; /* ns / 1024 */
unsigned long avg_idletime; /* ns / 1024 */
unsigned long idletime_threshold; /* us */
unsigned long idletime_threshold_conf; /* us */
unsigned int bio_cnt; /* total bios */
unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
unsigned long bio_cnt_reset_time;
atomic_t io_split_cnt[2];
atomic_t last_io_split_cnt[2];
struct blkg_rwstat stat_bytes;
struct blkg_rwstat stat_ios;
};
/* We measure latency for request size from <= 4k to >= 1M */
#define LATENCY_BUCKET_SIZE 9
@ -231,16 +95,6 @@ struct throtl_data
static void throtl_pending_timer_fn(struct timer_list *t);
static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
{
return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
}
static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
{
return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
}
static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
{
return pd_to_blkg(&tg->pd);
@ -1794,7 +1648,7 @@ static void throtl_shutdown_wq(struct request_queue *q)
cancel_work_sync(&td->dispatch_work);
}
static struct blkcg_policy blkcg_policy_throtl = {
struct blkcg_policy blkcg_policy_throtl = {
.dfl_cftypes = throtl_files,
.legacy_cftypes = throtl_legacy_files,
@ -2208,9 +2062,9 @@ void blk_throtl_charge_bio_split(struct bio *bio)
} while (parent);
}
bool blk_throtl_bio(struct bio *bio)
bool __blk_throtl_bio(struct bio *bio)
{
struct request_queue *q = bio->bi_bdev->bd_disk->queue;
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
struct blkcg_gq *blkg = bio->bi_blkg;
struct throtl_qnode *qn = NULL;
struct throtl_grp *tg = blkg_to_tg(blkg);
@ -2221,19 +2075,12 @@ bool blk_throtl_bio(struct bio *bio)
rcu_read_lock();
/* see throtl_charge_bio() */
if (bio_flagged(bio, BIO_THROTTLED))
goto out;
if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) {
blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf,
bio->bi_iter.bi_size);
blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1);
}
if (!tg->has_rules[rw])
goto out;
spin_lock_irq(&q->queue_lock);
throtl_update_latency_buckets(td);
@ -2317,7 +2164,6 @@ bool blk_throtl_bio(struct bio *bio)
out_unlock:
spin_unlock_irq(&q->queue_lock);
out:
bio_set_flag(bio, BIO_THROTTLED);
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW

View File

@ -2,15 +2,12 @@
#ifndef BLK_INTERNAL_H
#define BLK_INTERNAL_H
#include <linux/idr.h>
#include <linux/blk-mq.h>
#include <linux/part_stat.h>
#include <linux/blk-crypto.h>
#include <linux/memblock.h> /* for max_pfn/max_low_pfn */
#include <xen/xen.h>
#include "blk-crypto-internal.h"
#include "blk-mq.h"
#include "blk-mq-sched.h"
struct elevator_type;
/* Max future timer expiry for timeouts */
#define BLK_MAX_TIMEOUT (5 * HZ)
@ -30,15 +27,10 @@ struct blk_flush_queue {
};
extern struct kmem_cache *blk_requestq_cachep;
extern struct kmem_cache *blk_requestq_srcu_cachep;
extern struct kobj_type blk_queue_ktype;
extern struct ida blk_queue_ida;
static inline struct blk_flush_queue *
blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx)
{
return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq;
}
static inline void __blk_get_queue(struct request_queue *q)
{
kobject_get(&q->kobj);
@ -53,6 +45,41 @@ void blk_free_flush_queue(struct blk_flush_queue *q);
void blk_freeze_queue(struct request_queue *q);
void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic);
void blk_queue_start_drain(struct request_queue *q);
int __bio_queue_enter(struct request_queue *q, struct bio *bio);
bool submit_bio_checks(struct bio *bio);
static inline bool blk_try_enter_queue(struct request_queue *q, bool pm)
{
rcu_read_lock();
if (!percpu_ref_tryget_live_rcu(&q->q_usage_counter))
goto fail;
/*
* The code that increments the pm_only counter must ensure that the
* counter is globally visible before the queue is unfrozen.
*/
if (blk_queue_pm_only(q) &&
(!pm || queue_rpm_status(q) == RPM_SUSPENDED))
goto fail_put;
rcu_read_unlock();
return true;
fail_put:
blk_queue_exit(q);
fail:
rcu_read_unlock();
return false;
}
static inline int bio_queue_enter(struct bio *bio)
{
struct request_queue *q = bdev_get_queue(bio->bi_bdev);
if (blk_try_enter_queue(q, false))
return 0;
return __bio_queue_enter(q, bio);
}
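As a rough illustration (not a definitive call site), a submission path pairs a successful bio_queue_enter() with blk_queue_exit() once it is done with the queue; when bio_queue_enter() returns non-zero, __bio_queue_enter() has already failed the bio, so the caller simply returns.

/* Sketch: enter/exit pairing around queue usage; error handling simplified. */
static void example_submit_one(struct bio *bio)
{
	struct request_queue *q = bdev_get_queue(bio->bi_bdev);

	if (bio_queue_enter(bio))
		return;		/* queue frozen or pm_only: bio already handled */
	/* ... allocate a request / hand the bio to the driver here ... */
	blk_queue_exit(q);	/* drop the q_usage_counter reference */
}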
#define BIO_INLINE_VECS 4
struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
@ -94,6 +121,44 @@ static inline bool bvec_gap_to_prev(struct request_queue *q,
return __bvec_gap_to_prev(q, bprv, offset);
}
static inline bool rq_mergeable(struct request *rq)
{
if (blk_rq_is_passthrough(rq))
return false;
if (req_op(rq) == REQ_OP_FLUSH)
return false;
if (req_op(rq) == REQ_OP_WRITE_ZEROES)
return false;
if (req_op(rq) == REQ_OP_ZONE_APPEND)
return false;
if (rq->cmd_flags & REQ_NOMERGE_FLAGS)
return false;
if (rq->rq_flags & RQF_NOMERGE_FLAGS)
return false;
return true;
}
/*
* There are two different ways to handle DISCARD merges:
* 1) If max_discard_segments > 1, the driver treats every bio as a range and
* sends the bios to the controller together. The ranges don't need to be
* contiguous.
* 2) Otherwise, the request will be normal read/write requests. The ranges
* need to be contiguous.
*/
static inline bool blk_discard_mergable(struct request *req)
{
if (req_op(req) == REQ_OP_DISCARD &&
queue_max_discard_segments(req->q) > 1)
return true;
return false;
}
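A hedged sketch of how a merge path can branch on this helper when sizing a request: multi-range capable devices use the discard segment limit, everything else falls back to the normal segment limit (the function name is illustrative, not an existing helper).

/* Sketch: pick the applicable segment limit for merging against @rq. */
static inline unsigned short example_rq_max_segments(struct request *rq)
{
	if (blk_discard_mergable(rq))
		return queue_max_discard_segments(rq->q);
	return queue_max_segments(rq->q);
}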
#ifdef CONFIG_BLK_DEV_INTEGRITY
void blk_flush_integrity(void);
bool __bio_integrity_endio(struct bio *);
@ -175,15 +240,13 @@ static inline void blk_integrity_del(struct gendisk *disk)
unsigned long blk_rq_timeout(unsigned long timeout);
void blk_add_timer(struct request *req);
const char *blk_status_to_str(blk_status_t status);
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs, struct request **same_queue_rq);
unsigned int nr_segs);
bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
struct bio *bio, unsigned int nr_segs);
void blk_account_io_start(struct request *req);
void blk_account_io_done(struct request *req, u64 now);
/*
* Plug flush limits
*/
@ -199,19 +262,10 @@ void blk_insert_flush(struct request *rq);
int elevator_switch_mq(struct request_queue *q,
struct elevator_type *new_e);
void __elevator_exit(struct request_queue *, struct elevator_queue *);
void elevator_exit(struct request_queue *q);
int elv_register_queue(struct request_queue *q, bool uevent);
void elv_unregister_queue(struct request_queue *q);
static inline void elevator_exit(struct request_queue *q,
struct elevator_queue *e)
{
lockdep_assert_held(&q->sysfs_lock);
blk_mq_sched_free_requests(q);
__elevator_exit(q, e);
}
ssize_t part_size_show(struct device *dev, struct device_attribute *attr,
char *buf);
ssize_t part_stat_show(struct device *dev, struct device_attribute *attr,
@ -226,7 +280,32 @@ ssize_t part_timeout_show(struct device *, struct device_attribute *, char *);
ssize_t part_timeout_store(struct device *, struct device_attribute *,
const char *, size_t);
void __blk_queue_split(struct bio **bio, unsigned int *nr_segs);
static inline bool blk_may_split(struct request_queue *q, struct bio *bio)
{
switch (bio_op(bio)) {
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
case REQ_OP_WRITE_ZEROES:
case REQ_OP_WRITE_SAME:
return true; /* non-trivial splitting decisions */
default:
break;
}
/*
* All drivers must accept single-segment bios that are <= PAGE_SIZE.
* This is a quick and dirty check that relies on the fact that
* bi_io_vec[0] is always valid if a bio has data. The check might
* lead to occasional false negatives when bios are cloned, but compared
* to the performance impact of cloned bios themselves, the check below
* doesn't matter anyway.
*/
return q->limits.chunk_sectors || bio->bi_vcnt != 1 ||
bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE;
}
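To make the intended call pattern concrete, a sketch of how the submission path uses this check together with __blk_queue_split() declared below (nr_segs is an illustrative local that receives the segment count):

/* Sketch: split the bio only when the quick check says it may be needed. */
static void example_maybe_split(struct request_queue *q, struct bio **bio_ptr)
{
	unsigned int nr_segs = 1;

	if (blk_may_split(q, *bio_ptr))
		__blk_queue_split(q, bio_ptr, &nr_segs);
}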
void __blk_queue_split(struct request_queue *q, struct bio **bio,
unsigned int *nr_segs);
int ll_back_merge_fn(struct request *req, struct bio *bio,
unsigned int nr_segs);
bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
@ -246,9 +325,11 @@ int blk_dev_init(void);
*/
static inline bool blk_do_io_stat(struct request *rq)
{
return rq->rq_disk && (rq->rq_flags & RQF_IO_STAT);
return (rq->rq_flags & RQF_IO_STAT) && rq->q->disk;
}
void update_io_ticks(struct block_device *part, unsigned long now, bool end);
static inline void req_set_nomerge(struct request_queue *q, struct request *req)
{
req->cmd_flags |= REQ_NOMERGE;
@ -283,30 +364,16 @@ static inline unsigned int bio_aligned_discard_max_sectors(
/*
* Internal io_context interface
*/
void get_io_context(struct io_context *ioc);
struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q);
struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
gfp_t gfp_mask);
struct io_cq *ioc_find_get_icq(struct request_queue *q);
struct io_cq *ioc_lookup_icq(struct request_queue *q);
#ifdef CONFIG_BLK_ICQ
void ioc_clear_queue(struct request_queue *q);
#else
static inline void ioc_clear_queue(struct request_queue *q)
{
}
#endif /* CONFIG_BLK_ICQ */
int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node);
/*
* Internal throttling interface
*/
#ifdef CONFIG_BLK_DEV_THROTTLING
extern int blk_throtl_init(struct request_queue *q);
extern void blk_throtl_exit(struct request_queue *q);
extern void blk_throtl_register_queue(struct request_queue *q);
extern void blk_throtl_charge_bio_split(struct bio *bio);
bool blk_throtl_bio(struct bio *bio);
#else /* CONFIG_BLK_DEV_THROTTLING */
static inline int blk_throtl_init(struct request_queue *q) { return 0; }
static inline void blk_throtl_exit(struct request_queue *q) { }
static inline void blk_throtl_register_queue(struct request_queue *q) { }
static inline void blk_throtl_charge_bio_split(struct bio *bio) { }
static inline bool blk_throtl_bio(struct bio *bio) { return false; }
#endif /* CONFIG_BLK_DEV_THROTTLING */
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page);
extern ssize_t blk_throtl_sample_time_store(struct request_queue *q,
@ -364,7 +431,15 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
struct page *page, unsigned int len, unsigned int offset,
unsigned int max_sectors, bool *same_page);
struct request_queue *blk_alloc_queue(int node_id);
static inline struct kmem_cache *blk_get_queue_kmem_cache(bool srcu)
{
if (srcu)
return blk_requestq_srcu_cachep;
return blk_requestq_cachep;
}
struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu);
int disk_scan_partitions(struct gendisk *disk, fmode_t mode);
int disk_alloc_events(struct gendisk *disk);
void disk_add_events(struct gendisk *disk);
@ -374,13 +449,61 @@ extern struct device_attribute dev_attr_events;
extern struct device_attribute dev_attr_events_async;
extern struct device_attribute dev_attr_events_poll_msecs;
static inline void bio_clear_hipri(struct bio *bio)
static inline void bio_clear_polled(struct bio *bio)
{
/* can't support alloc cache if we turn off polling */
bio_clear_flag(bio, BIO_PERCPU_CACHE);
bio->bi_opf &= ~REQ_HIPRI;
bio->bi_opf &= ~REQ_POLLED;
}
long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
extern const struct address_space_operations def_blk_aops;
int disk_register_independent_access_ranges(struct gendisk *disk,
struct blk_independent_access_ranges *new_iars);
void disk_unregister_independent_access_ranges(struct gendisk *disk);
#ifdef CONFIG_FAIL_MAKE_REQUEST
bool should_fail_request(struct block_device *part, unsigned int bytes);
#else /* CONFIG_FAIL_MAKE_REQUEST */
static inline bool should_fail_request(struct block_device *part,
unsigned int bytes)
{
return false;
}
#endif /* CONFIG_FAIL_MAKE_REQUEST */
/*
* Optimized request reference counting. Ideally we'd make timeouts be more
* clever, as that's the only reason we need references at all... But until
* this happens, this is faster than using refcount_t. Also see:
*
* abc54d634334 ("io_uring: switch to atomic_t for io_kiocb reference count")
*/
#define req_ref_zero_or_close_to_overflow(req) \
((unsigned int) atomic_read(&(req->ref)) + 127u <= 127u)
static inline bool req_ref_inc_not_zero(struct request *req)
{
return atomic_inc_not_zero(&req->ref);
}
static inline bool req_ref_put_and_test(struct request *req)
{
WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
return atomic_dec_and_test(&req->ref);
}
static inline void req_ref_set(struct request *req, int value)
{
atomic_set(&req->ref, value);
}
static inline int req_ref_read(struct request *req)
{
return atomic_read(&req->ref);
}
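For illustration only, the pattern these helpers support looks roughly like the following; the tag iterators and the timeout path take such a temporary reference before inspecting a request that may complete concurrently, and blk_mq_put_rq_ref() (declared in blk-mq.h) is the matching put side.

/* Sketch: take and drop a temporary reference on a request. */
static void example_peek_request(struct request *rq)
{
	if (!req_ref_inc_not_zero(rq))
		return;			/* raced with completion; rq is being freed */
	/* ... rq can be inspected safely here ... */
	blk_mq_put_rq_ref(rq);		/* drops the ref; frees rq on the final put */
}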
#endif /* BLK_INTERNAL_H */

View File

@ -14,6 +14,7 @@
#include <linux/pagemap.h>
#include <linux/mempool.h>
#include <linux/blkdev.h>
#include <linux/blk-cgroup.h>
#include <linux/backing-dev.h>
#include <linux/init.h>
#include <linux/hash.h>

View File

@ -31,6 +31,7 @@ static int bsg_transport_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr,
struct bsg_job *job;
struct request *rq;
struct bio *bio;
void *reply;
int ret;
if (hdr->protocol != BSG_PROTOCOL_SCSI ||
@ -39,22 +40,28 @@ static int bsg_transport_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr,
if (!capable(CAP_SYS_RAWIO))
return -EPERM;
rq = blk_get_request(q, hdr->dout_xfer_len ?
rq = blk_mq_alloc_request(q, hdr->dout_xfer_len ?
REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
if (IS_ERR(rq))
return PTR_ERR(rq);
rq->timeout = timeout;
job = blk_mq_rq_to_pdu(rq);
reply = job->reply;
memset(job, 0, sizeof(*job));
job->reply = reply;
job->reply_len = SCSI_SENSE_BUFFERSIZE;
job->dd_data = job + 1;
job->request_len = hdr->request_len;
job->request = memdup_user(uptr64(hdr->request), hdr->request_len);
if (IS_ERR(job->request)) {
ret = PTR_ERR(job->request);
goto out_put_request;
goto out_free_rq;
}
if (hdr->dout_xfer_len && hdr->din_xfer_len) {
job->bidi_rq = blk_get_request(rq->q, REQ_OP_DRV_IN, 0);
job->bidi_rq = blk_mq_alloc_request(rq->q, REQ_OP_DRV_IN, 0);
if (IS_ERR(job->bidi_rq)) {
ret = PTR_ERR(job->bidi_rq);
goto out_free_job_request;
@ -85,7 +92,7 @@ static int bsg_transport_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr,
goto out_unmap_bidi_rq;
bio = rq->bio;
blk_execute_rq(NULL, rq, !(hdr->flags & BSG_FLAG_Q_AT_TAIL));
blk_execute_rq(rq, !(hdr->flags & BSG_FLAG_Q_AT_TAIL));
/*
* The assignments below don't make much sense, but are kept for
@ -134,11 +141,11 @@ static int bsg_transport_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr,
blk_rq_unmap_user(job->bidi_bio);
out_free_bidi_rq:
if (job->bidi_rq)
blk_put_request(job->bidi_rq);
blk_mq_free_request(job->bidi_rq);
out_free_job_request:
kfree(job->request);
out_put_request:
blk_put_request(rq);
out_free_rq:
blk_mq_free_request(rq);
return ret;
}
@ -302,18 +309,6 @@ static int bsg_init_rq(struct blk_mq_tag_set *set, struct request *req,
return 0;
}
/* called right before the request is given to the request_queue user */
static void bsg_initialize_rq(struct request *req)
{
struct bsg_job *job = blk_mq_rq_to_pdu(req);
void *reply = job->reply;
memset(job, 0, sizeof(*job));
job->reply = reply;
job->reply_len = SCSI_SENSE_BUFFERSIZE;
job->dd_data = job + 1;
}
static void bsg_exit_rq(struct blk_mq_tag_set *set, struct request *req,
unsigned int hctx_idx)
{
@ -350,7 +345,6 @@ static const struct blk_mq_ops bsg_mq_ops = {
.queue_rq = bsg_queue_rq,
.init_request = bsg_init_rq,
.exit_request = bsg_exit_rq,
.initialize_rq_fn = bsg_initialize_rq,
.complete = bsg_complete,
.timeout = bsg_timeout,
};

View File

@ -26,7 +26,6 @@
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/slab.h>
@ -40,6 +39,7 @@
#include <trace/events/block.h>
#include "elevator.h"
#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-pm.h"
@ -188,8 +188,10 @@ static void elevator_release(struct kobject *kobj)
kfree(e);
}
void __elevator_exit(struct request_queue *q, struct elevator_queue *e)
void elevator_exit(struct request_queue *q)
{
struct elevator_queue *e = q->elevator;
mutex_lock(&e->sysfs_lock);
blk_mq_exit_sched(q, e);
mutex_unlock(&e->sysfs_lock);
@ -593,7 +595,8 @@ int elevator_switch_mq(struct request_queue *q,
elv_unregister_queue(q);
ioc_clear_queue(q);
elevator_exit(q, q->elevator);
blk_mq_sched_free_rqs(q);
elevator_exit(q);
}
ret = blk_mq_init_sched(q, new_e);
@ -603,7 +606,8 @@ int elevator_switch_mq(struct request_queue *q,
if (new_e) {
ret = elv_register_queue(q, true);
if (ret) {
elevator_exit(q, q->elevator);
blk_mq_sched_free_rqs(q);
elevator_exit(q);
goto out;
}
}
@ -635,7 +639,7 @@ static struct elevator_type *elevator_get_default(struct request_queue *q)
return NULL;
if (q->nr_hw_queues != 1 &&
!blk_mq_is_sbitmap_shared(q->tag_set->flags))
!blk_mq_is_shared_tags(q->tag_set->flags))
return NULL;
return elevator_get(q, "mq-deadline", false);

View File

@ -15,9 +15,10 @@
#include <linux/falloc.h>
#include <linux/suspend.h>
#include <linux/fs.h>
#include <linux/module.h>
#include "blk.h"
static struct inode *bdev_file_inode(struct file *file)
static inline struct inode *bdev_file_inode(struct file *file)
{
return file->f_mapping->host;
}
@ -54,14 +55,12 @@ static void blkdev_bio_end_io_simple(struct bio *bio)
static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
struct iov_iter *iter, unsigned int nr_pages)
{
struct file *file = iocb->ki_filp;
struct block_device *bdev = I_BDEV(bdev_file_inode(file));
struct block_device *bdev = iocb->ki_filp->private_data;
struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
loff_t pos = iocb->ki_pos;
bool should_dirty = false;
struct bio bio;
ssize_t ret;
blk_qc_t qc;
if ((pos | iov_iter_alignment(iter)) &
(bdev_logical_block_size(bdev) - 1))
@ -78,7 +77,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
bio_init(&bio, vecs, nr_pages);
bio_set_dev(&bio, bdev);
bio.bi_iter.bi_sector = pos >> 9;
bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio.bi_write_hint = iocb->ki_hint;
bio.bi_private = current;
bio.bi_end_io = blkdev_bio_end_io_simple;
@ -102,13 +101,12 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
if (iocb->ki_flags & IOCB_HIPRI)
bio_set_polled(&bio, iocb);
qc = submit_bio(&bio);
submit_bio(&bio);
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (!READ_ONCE(bio.bi_private))
break;
if (!(iocb->ki_flags & IOCB_HIPRI) ||
!blk_poll(bdev_get_queue(bdev), qc, true))
if (!(iocb->ki_flags & IOCB_HIPRI) || !bio_poll(&bio, NULL, 0))
blk_io_schedule();
}
__set_current_state(TASK_RUNNING);
@ -126,6 +124,11 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
return ret;
}
enum {
DIO_SHOULD_DIRTY = 1,
DIO_IS_SYNC = 2,
};
struct blkdev_dio {
union {
struct kiocb *iocb;
@ -133,35 +136,27 @@ struct blkdev_dio {
};
size_t size;
atomic_t ref;
bool multi_bio : 1;
bool should_dirty : 1;
bool is_sync : 1;
struct bio bio;
unsigned int flags;
struct bio bio ____cacheline_aligned_in_smp;
};
static struct bio_set blkdev_dio_pool;
static int blkdev_iopoll(struct kiocb *kiocb, bool wait)
{
struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host);
struct request_queue *q = bdev_get_queue(bdev);
return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait);
}
static void blkdev_bio_end_io(struct bio *bio)
{
struct blkdev_dio *dio = bio->bi_private;
bool should_dirty = dio->should_dirty;
bool should_dirty = dio->flags & DIO_SHOULD_DIRTY;
if (bio->bi_status && !dio->bio.bi_status)
dio->bio.bi_status = bio->bi_status;
if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) {
if (!dio->is_sync) {
if (atomic_dec_and_test(&dio->ref)) {
if (!(dio->flags & DIO_IS_SYNC)) {
struct kiocb *iocb = dio->iocb;
ssize_t ret;
WRITE_ONCE(iocb->private, NULL);
if (likely(!dio->bio.bi_status)) {
ret = dio->size;
iocb->ki_pos += ret;
@ -169,9 +164,8 @@ static void blkdev_bio_end_io(struct bio *bio)
ret = blk_status_to_errno(dio->bio.bi_status);
}
dio->iocb->ki_complete(iocb, ret, 0);
if (dio->multi_bio)
bio_put(&dio->bio);
dio->iocb->ki_complete(iocb, ret);
bio_put(&dio->bio);
} else {
struct task_struct *waiter = dio->waiter;
@ -191,16 +185,12 @@ static void blkdev_bio_end_io(struct bio *bio)
static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
unsigned int nr_pages)
{
struct file *file = iocb->ki_filp;
struct inode *inode = bdev_file_inode(file);
struct block_device *bdev = I_BDEV(inode);
struct block_device *bdev = iocb->ki_filp->private_data;
struct blk_plug plug;
struct blkdev_dio *dio;
struct bio *bio;
bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0;
bool is_read = (iov_iter_rw(iter) == READ), is_sync;
loff_t pos = iocb->ki_pos;
blk_qc_t qc = BLK_QC_T_NONE;
int ret = 0;
if ((pos | iov_iter_alignment(iter)) &
@ -210,28 +200,31 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool);
dio = container_of(bio, struct blkdev_dio, bio);
dio->is_sync = is_sync = is_sync_kiocb(iocb);
if (dio->is_sync) {
atomic_set(&dio->ref, 1);
/*
* Grab an extra reference to ensure the dio structure which is embedded
* into the first bio stays around.
*/
bio_get(bio);
is_sync = is_sync_kiocb(iocb);
if (is_sync) {
dio->flags = DIO_IS_SYNC;
dio->waiter = current;
bio_get(bio);
} else {
dio->flags = 0;
dio->iocb = iocb;
}
dio->size = 0;
dio->multi_bio = false;
dio->should_dirty = is_read && iter_is_iovec(iter);
if (is_read && iter_is_iovec(iter))
dio->flags |= DIO_SHOULD_DIRTY;
/*
* Don't plug for HIPRI/polled IO, as those should go straight
* to issue
*/
if (!is_poll)
blk_start_plug(&plug);
blk_start_plug(&plug);
for (;;) {
bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = pos >> 9;
bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio->bi_write_hint = iocb->ki_hint;
bio->bi_private = dio;
bio->bi_end_io = blkdev_bio_end_io;
@ -246,7 +239,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
if (is_read) {
bio->bi_opf = REQ_OP_READ;
if (dio->should_dirty)
if (dio->flags & DIO_SHOULD_DIRTY)
bio_set_pages_dirty(bio);
} else {
bio->bi_opf = dio_bio_write_op(iocb);
@ -260,40 +253,15 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS);
if (!nr_pages) {
bool polled = false;
if (iocb->ki_flags & IOCB_HIPRI) {
bio_set_polled(bio, iocb);
polled = true;
}
qc = submit_bio(bio);
if (polled)
WRITE_ONCE(iocb->ki_cookie, qc);
submit_bio(bio);
break;
}
if (!dio->multi_bio) {
/*
* AIO needs an extra reference to ensure the dio
* structure which is embedded into the first bio
* stays around.
*/
if (!is_sync)
bio_get(bio);
dio->multi_bio = true;
atomic_set(&dio->ref, 2);
} else {
atomic_inc(&dio->ref);
}
atomic_inc(&dio->ref);
submit_bio(bio);
bio = bio_alloc(GFP_KERNEL, nr_pages);
}
if (!is_poll)
blk_finish_plug(&plug);
blk_finish_plug(&plug);
if (!is_sync)
return -EIOCBQUEUED;
@ -302,10 +270,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
set_current_state(TASK_UNINTERRUPTIBLE);
if (!READ_ONCE(dio->waiter))
break;
if (!(iocb->ki_flags & IOCB_HIPRI) ||
!blk_poll(bdev_get_queue(bdev), qc, true))
blk_io_schedule();
blk_io_schedule();
}
__set_current_state(TASK_RUNNING);
@ -318,6 +283,95 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
return ret;
}
static void blkdev_bio_end_io_async(struct bio *bio)
{
struct blkdev_dio *dio = container_of(bio, struct blkdev_dio, bio);
struct kiocb *iocb = dio->iocb;
ssize_t ret;
WRITE_ONCE(iocb->private, NULL);
if (likely(!bio->bi_status)) {
ret = dio->size;
iocb->ki_pos += ret;
} else {
ret = blk_status_to_errno(bio->bi_status);
}
iocb->ki_complete(iocb, ret);
if (dio->flags & DIO_SHOULD_DIRTY) {
bio_check_pages_dirty(bio);
} else {
bio_release_pages(bio, false);
bio_put(bio);
}
}
static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
struct iov_iter *iter,
unsigned int nr_pages)
{
struct block_device *bdev = iocb->ki_filp->private_data;
struct blkdev_dio *dio;
struct bio *bio;
loff_t pos = iocb->ki_pos;
int ret = 0;
if ((pos | iov_iter_alignment(iter)) &
(bdev_logical_block_size(bdev) - 1))
return -EINVAL;
bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool);
dio = container_of(bio, struct blkdev_dio, bio);
dio->flags = 0;
dio->iocb = iocb;
bio_set_dev(bio, bdev);
bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
bio->bi_write_hint = iocb->ki_hint;
bio->bi_end_io = blkdev_bio_end_io_async;
bio->bi_ioprio = iocb->ki_ioprio;
if (iov_iter_is_bvec(iter)) {
/*
* Users don't rely on the iterator being in any particular
* state for async I/O returning -EIOCBQUEUED, hence we can
* avoid expensive iov_iter_advance(). Bypass
* bio_iov_iter_get_pages() and set the bvec directly.
*/
bio_iov_bvec_set(bio, iter);
} else {
ret = bio_iov_iter_get_pages(bio, iter);
if (unlikely(ret)) {
bio_put(bio);
return ret;
}
}
dio->size = bio->bi_iter.bi_size;
if (iov_iter_rw(iter) == READ) {
bio->bi_opf = REQ_OP_READ;
if (iter_is_iovec(iter)) {
dio->flags |= DIO_SHOULD_DIRTY;
bio_set_pages_dirty(bio);
}
} else {
bio->bi_opf = dio_bio_write_op(iocb);
task_io_account_write(bio->bi_iter.bi_size);
}
if (iocb->ki_flags & IOCB_HIPRI) {
bio->bi_opf |= REQ_POLLED | REQ_NOWAIT;
submit_bio(bio);
WRITE_ONCE(iocb->private, bio);
} else {
if (iocb->ki_flags & IOCB_NOWAIT)
bio->bi_opf |= REQ_NOWAIT;
submit_bio(bio);
}
return -EIOCBQUEUED;
}
static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
unsigned int nr_pages;
@ -326,9 +380,11 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
return 0;
nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_VECS)
return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
if (likely(nr_pages <= BIO_MAX_VECS)) {
if (is_sync_kiocb(iocb))
return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
return __blkdev_direct_IO_async(iocb, iter, nr_pages);
}
return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
}
@ -405,8 +461,7 @@ static loff_t blkdev_llseek(struct file *file, loff_t offset, int whence)
static int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
int datasync)
{
struct inode *bd_inode = bdev_file_inode(filp);
struct block_device *bdev = I_BDEV(bd_inode);
struct block_device *bdev = filp->private_data;
int error;
error = file_write_and_wait_range(filp, start, end);
@ -448,6 +503,8 @@ static int blkdev_open(struct inode *inode, struct file *filp)
bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, filp);
if (IS_ERR(bdev))
return PTR_ERR(bdev);
filp->private_data = bdev;
filp->f_mapping = bdev->bd_inode->i_mapping;
filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
return 0;
@ -455,29 +512,12 @@ static int blkdev_open(struct inode *inode, struct file *filp)
static int blkdev_close(struct inode *inode, struct file *filp)
{
struct block_device *bdev = I_BDEV(bdev_file_inode(filp));
struct block_device *bdev = filp->private_data;
blkdev_put(bdev, filp->f_mode);
return 0;
}
static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
struct block_device *bdev = I_BDEV(bdev_file_inode(file));
fmode_t mode = file->f_mode;
/*
* O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
* to update it before every ioctl.
*/
if (file->f_flags & O_NDELAY)
mode |= FMODE_NDELAY;
else
mode &= ~FMODE_NDELAY;
return blkdev_ioctl(bdev, mode, cmd, arg);
}
/*
* Write data to the block device. Only intended for the block device itself
* and the raw driver which basically is a fake block device.
@ -487,14 +527,14 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
*/
static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
struct file *file = iocb->ki_filp;
struct inode *bd_inode = bdev_file_inode(file);
loff_t size = i_size_read(bd_inode);
struct block_device *bdev = iocb->ki_filp->private_data;
struct inode *bd_inode = bdev->bd_inode;
loff_t size = bdev_nr_bytes(bdev);
struct blk_plug plug;
size_t shorted = 0;
ssize_t ret;
if (bdev_read_only(I_BDEV(bd_inode)))
if (bdev_read_only(bdev))
return -EPERM;
if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev))
@ -526,24 +566,58 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
struct file *file = iocb->ki_filp;
struct inode *bd_inode = bdev_file_inode(file);
loff_t size = i_size_read(bd_inode);
struct block_device *bdev = iocb->ki_filp->private_data;
loff_t size = bdev_nr_bytes(bdev);
loff_t pos = iocb->ki_pos;
size_t shorted = 0;
ssize_t ret;
ssize_t ret = 0;
size_t count;
if (pos >= size)
return 0;
size -= pos;
if (iov_iter_count(to) > size) {
if (unlikely(pos + iov_iter_count(to) > size)) {
if (pos >= size)
return 0;
size -= pos;
shorted = iov_iter_count(to) - size;
iov_iter_truncate(to, size);
}
ret = generic_file_read_iter(iocb, to);
iov_iter_reexpand(to, iov_iter_count(to) + shorted);
count = iov_iter_count(to);
if (!count)
goto reexpand; /* skip atime */
if (iocb->ki_flags & IOCB_DIRECT) {
struct address_space *mapping = iocb->ki_filp->f_mapping;
if (iocb->ki_flags & IOCB_NOWAIT) {
if (filemap_range_needs_writeback(mapping, pos,
pos + count - 1)) {
ret = -EAGAIN;
goto reexpand;
}
} else {
ret = filemap_write_and_wait_range(mapping, pos,
pos + count - 1);
if (ret < 0)
goto reexpand;
}
file_accessed(iocb->ki_filp);
ret = blkdev_direct_IO(iocb, to);
if (ret >= 0) {
iocb->ki_pos += ret;
count -= ret;
}
iov_iter_revert(to, count - iov_iter_count(to));
if (ret < 0 || !count)
goto reexpand;
}
ret = filemap_read(iocb, to, ret);
reexpand:
if (unlikely(shorted))
iov_iter_reexpand(to, iov_iter_count(to) + shorted);
return ret;
}
@ -565,7 +639,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
return -EOPNOTSUPP;
/* Don't go off the end of the device. */
isize = i_size_read(bdev->bd_inode);
isize = bdev_nr_bytes(bdev);
if (start >= isize)
return -EINVAL;
if (end >= isize) {
@ -592,16 +666,18 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start,
switch (mode) {
case FALLOC_FL_ZERO_RANGE:
case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
GFP_KERNEL, BLKDEV_ZERO_NOUNMAP);
error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
len >> SECTOR_SHIFT, GFP_KERNEL,
BLKDEV_ZERO_NOUNMAP);
break;
case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9,
GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK);
error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
len >> SECTOR_SHIFT, GFP_KERNEL,
BLKDEV_ZERO_NOFALLBACK);
break;
case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE:
error = blkdev_issue_discard(bdev, start >> 9, len >> 9,
GFP_KERNEL, 0);
error = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
len >> SECTOR_SHIFT, GFP_KERNEL, 0);
break;
default:
error = -EOPNOTSUPP;
@ -618,10 +694,10 @@ const struct file_operations def_blk_fops = {
.llseek = blkdev_llseek,
.read_iter = blkdev_read_iter,
.write_iter = blkdev_write_iter,
.iopoll = blkdev_iopoll,
.iopoll = iocb_bio_iopoll,
.mmap = generic_file_mmap,
.fsync = blkdev_fsync,
.unlocked_ioctl = block_ioctl,
.unlocked_ioctl = blkdev_ioctl,
#ifdef CONFIG_COMPAT
.compat_ioctl = compat_blkdev_ioctl,
#endif

View File

@ -25,8 +25,10 @@
#include <linux/log2.h>
#include <linux/pm_runtime.h>
#include <linux/badblocks.h>
#include <linux/part_stat.h>
#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"
static struct kobject *block_depr;
@ -58,6 +60,7 @@ void set_capacity(struct gendisk *disk, sector_t sectors)
spin_lock(&bdev->bd_size_lock);
i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT);
bdev->bd_nr_sectors = sectors;
spin_unlock(&bdev->bd_size_lock);
}
EXPORT_SYMBOL(set_capacity);
@ -212,7 +215,10 @@ void blkdev_show(struct seq_file *seqf, off_t offset)
* @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If
* @major = 0, try to allocate any unused major number.
* @name: the name of the new block device as a zero terminated string
* @probe: callback that is called on access to any minor number of @major
* @probe: pre-devtmpfs / pre-udev callback used to create disks when their
* pre-created device node is accessed. When a probe call uses
* add_disk() and it fails, the driver must clean up resources. This
* interface may soon be removed.
*
* The @name must be unique within the system.
*
@ -368,17 +374,21 @@ void disk_uevent(struct gendisk *disk, enum kobject_action action)
}
EXPORT_SYMBOL_GPL(disk_uevent);
static void disk_scan_partitions(struct gendisk *disk)
int disk_scan_partitions(struct gendisk *disk, fmode_t mode)
{
struct block_device *bdev;
if (!get_capacity(disk) || !disk_part_scan_enabled(disk))
return;
if (disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN))
return -EINVAL;
if (disk->open_partitions)
return -EBUSY;
set_bit(GD_NEED_PART_SCAN, &disk->state);
bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL);
if (!IS_ERR(bdev))
blkdev_put(bdev, FMODE_READ);
bdev = blkdev_get_by_dev(disk_devt(disk), mode, NULL);
if (IS_ERR(bdev))
return PTR_ERR(bdev);
blkdev_put(bdev, mode);
return 0;
}
/**
@ -390,8 +400,8 @@ static void disk_scan_partitions(struct gendisk *disk)
* This function registers the partitioning information in @disk
* with the kernel.
*/
int device_add_disk(struct device *parent, struct gendisk *disk,
const struct attribute_group **groups)
int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
const struct attribute_group **groups)
{
struct device *ddev = disk_to_dev(disk);
@ -432,7 +442,6 @@ int device_add_disk(struct device *parent, struct gendisk *disk,
return ret;
disk->major = BLOCK_EXT_MAJOR;
disk->first_minor = ret;
disk->flags |= GENHD_FL_EXT_DEVT;
}
/* delay uevents, until we scanned partition table */
@ -489,14 +498,7 @@ int device_add_disk(struct device *parent, struct gendisk *disk,
if (ret)
goto out_put_slave_dir;
if (disk->flags & GENHD_FL_HIDDEN) {
/*
* Don't let hidden disks show up in /proc/partitions,
* and don't bother scanning for partitions either.
*/
disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
disk->flags |= GENHD_FL_NO_PART_SCAN;
} else {
if (!(disk->flags & GENHD_FL_HIDDEN)) {
ret = bdi_register(disk->bdi, "%u:%u",
disk->major, disk->first_minor);
if (ret)
@ -508,7 +510,8 @@ int device_add_disk(struct device *parent, struct gendisk *disk,
goto out_unregister_bdi;
bdev_add(disk->part0, ddev->devt);
disk_scan_partitions(disk);
if (get_capacity(disk))
disk_scan_partitions(disk, FMODE_READ);
/*
* Announce the disk and partitions after all partitions are
@ -541,7 +544,7 @@ int device_add_disk(struct device *parent, struct gendisk *disk,
out_free_ext_minor:
if (disk->major == BLOCK_EXT_MAJOR)
blk_free_ext_minor(disk->first_minor);
return WARN_ON_ONCE(ret); /* keep until all callers handle errors */
return ret;
}
EXPORT_SYMBOL(device_add_disk);
@ -645,6 +648,26 @@ void del_gendisk(struct gendisk *disk)
}
EXPORT_SYMBOL(del_gendisk);
/**
* invalidate_disk - invalidate the disk
* @disk: the struct gendisk to invalidate
*
* A helper to invalidate the disk. It cleans the disk's associated
* buffer/page caches and resets its internal state so that the disk
* can be reused by drivers.
*
* Context: can sleep
*/
void invalidate_disk(struct gendisk *disk)
{
struct block_device *bdev = disk->part0;
invalidate_bdev(bdev);
bdev->bd_inode->i_mapping->wb_err = 0;
set_capacity(disk, 0);
}
EXPORT_SYMBOL(invalidate_disk);
/* sysfs access to bad-blocks list. */
static ssize_t disk_badblocks_show(struct device *dev,
struct device_attribute *attr,
@ -711,8 +734,7 @@ void __init printk_all_partitions(void)
* Don't show empty devices or things that have been
* suppressed
*/
if (get_capacity(disk) == 0 ||
(disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO))
if (get_capacity(disk) == 0 || (disk->flags & GENHD_FL_HIDDEN))
continue;
/*
@ -805,11 +827,7 @@ static int show_partition(struct seq_file *seqf, void *v)
struct block_device *part;
unsigned long idx;
/* Don't show non-partitionable removable devices or empty devices */
if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
(sgp->flags & GENHD_FL_REMOVABLE)))
return 0;
if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
if (!get_capacity(sgp) || (sgp->flags & GENHD_FL_HIDDEN))
return 0;
rcu_read_lock();
@ -865,7 +883,8 @@ static ssize_t disk_ext_range_show(struct device *dev,
{
struct gendisk *disk = dev_to_disk(dev);
return sprintf(buf, "%d\n", disk_max_parts(disk));
return sprintf(buf, "%d\n",
(disk->flags & GENHD_FL_NO_PART) ? 1 : DISK_MAX_PARTS);
}
static ssize_t disk_removable_show(struct device *dev,
@ -904,7 +923,7 @@ ssize_t part_stat_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct block_device *bdev = dev_to_bdev(dev);
struct request_queue *q = bdev->bd_disk->queue;
struct request_queue *q = bdev_get_queue(bdev);
struct disk_stats stat;
unsigned int inflight;
@ -948,7 +967,7 @@ ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct block_device *bdev = dev_to_bdev(dev);
struct request_queue *q = bdev->bd_disk->queue;
struct request_queue *q = bdev_get_queue(bdev);
unsigned int inflight[2];
if (queue_is_mq(q))
@ -1290,6 +1309,9 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
if (!disk->bdi)
goto out_free_disk;
/* bdev_alloc() might need the queue, set before the first call */
disk->queue = q;
disk->part0 = bdev_alloc(disk, 0);
if (!disk->part0)
goto out_free_bdi;
@ -1305,7 +1327,6 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
disk_to_dev(disk)->type = &disk_type;
device_initialize(disk_to_dev(disk));
inc_diskseq(disk);
disk->queue = q;
q->disk = disk;
lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0);
#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
@ -1332,7 +1353,7 @@ struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass)
struct request_queue *q;
struct gendisk *disk;
q = blk_alloc_queue(node);
q = blk_alloc_queue(node, false);
if (!q)
return NULL;
@ -1410,12 +1431,6 @@ void set_disk_ro(struct gendisk *disk, bool read_only)
}
EXPORT_SYMBOL(set_disk_ro);
int bdev_read_only(struct block_device *bdev)
{
return bdev->bd_read_only || get_disk_ro(bdev->bd_disk);
}
EXPORT_SYMBOL(bdev_read_only);
void inc_diskseq(struct gendisk *disk)
{
disk->diskseq = atomic64_inc_return(&diskseq);

View File

@ -82,31 +82,6 @@ static int compat_blkpg_ioctl(struct block_device *bdev,
}
#endif
static int blkdev_reread_part(struct block_device *bdev, fmode_t mode)
{
struct block_device *tmp;
if (!disk_part_scan_enabled(bdev->bd_disk) || bdev_is_partition(bdev))
return -EINVAL;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
if (bdev->bd_disk->open_partitions)
return -EBUSY;
/*
* Reopen the device to revalidate the driver state and force a
* partition rescan.
*/
mode &= ~FMODE_EXCL;
set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
tmp = blkdev_get_by_dev(bdev->bd_dev, mode, NULL);
if (IS_ERR(tmp))
return PTR_ERR(tmp);
blkdev_put(tmp, mode);
return 0;
}
static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode,
unsigned long arg, unsigned long flags)
{
@ -133,7 +108,7 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode,
if (len & 511)
return -EINVAL;
if (start + len > i_size_read(bdev->bd_inode))
if (start + len > bdev_nr_bytes(bdev))
return -EINVAL;
filemap_invalidate_lock(inode->i_mapping);
@ -171,7 +146,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode,
return -EINVAL;
if (len & 511)
return -EINVAL;
if (end >= (uint64_t)i_size_read(bdev->bd_inode))
if (end >= (uint64_t)bdev_nr_bytes(bdev))
return -EINVAL;
if (end < start)
return -EINVAL;
@ -522,7 +497,11 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
bdev->bd_disk->bdi->ra_pages = (arg * 512) / PAGE_SIZE;
return 0;
case BLKRRPART:
return blkdev_reread_part(bdev, mode);
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
if (bdev_is_partition(bdev))
return -EINVAL;
return disk_scan_partitions(bdev->bd_disk, mode & ~FMODE_EXCL);
case BLKTRACESTART:
case BLKTRACESTOP:
case BLKTRACETEARDOWN:
@ -550,12 +529,21 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
*
* New commands must be compatible and go into blkdev_common_ioctl
*/
int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
unsigned long arg)
long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
int ret;
loff_t size;
struct block_device *bdev = I_BDEV(file->f_mapping->host);
void __user *argp = (void __user *)arg;
fmode_t mode = file->f_mode;
int ret;
/*
* O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
* to update it before every ioctl.
*/
if (file->f_flags & O_NDELAY)
mode |= FMODE_NDELAY;
else
mode &= ~FMODE_NDELAY;
switch (cmd) {
/* These need separate implementations for the data structure */
@ -572,10 +560,9 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
return put_long(argp,
(bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512);
case BLKGETSIZE:
size = i_size_read(bdev->bd_inode);
if ((size >> 9) > ~0UL)
if (bdev_nr_sectors(bdev) > ~0UL)
return -EFBIG;
return put_ulong(argp, size >> 9);
return put_ulong(argp, bdev_nr_sectors(bdev));
/* The data is compatible, but the command number is different */
case BLKBSZGET: /* get block device soft block size (cf. BLKSSZGET) */
@ -583,7 +570,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
case BLKBSZSET:
return blkdev_bszset(bdev, mode, argp);
case BLKGETSIZE64:
return put_u64(argp, i_size_read(bdev->bd_inode));
return put_u64(argp, bdev_nr_bytes(bdev));
/* Incompatible alignment on i386 */
case BLKTRACESETUP:
@ -600,7 +587,6 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
return -ENOTTY;
return bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg);
}
EXPORT_SYMBOL_GPL(blkdev_ioctl); /* for /dev/raw */
#ifdef CONFIG_COMPAT
@ -618,7 +604,6 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
struct block_device *bdev = I_BDEV(file->f_mapping->host);
struct gendisk *disk = bdev->bd_disk;
fmode_t mode = file->f_mode;
loff_t size;
/*
* O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
@ -644,10 +629,9 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
return compat_put_long(argp,
(bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512);
case BLKGETSIZE:
size = i_size_read(bdev->bd_inode);
if ((size >> 9) > ~0UL)
if (bdev_nr_sectors(bdev) > ~0UL)
return -EFBIG;
return compat_put_ulong(argp, size >> 9);
return compat_put_ulong(argp, bdev_nr_sectors(bdev));
/* The data is compatible, but the command number is different */
case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */
@ -655,7 +639,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
case BLKBSZSET_32:
return blkdev_bszset(bdev, mode, argp);
case BLKGETSIZE64_32:
return put_u64(argp, i_size_read(bdev->bd_inode));
return put_u64(argp, bdev_nr_bytes(bdev));
/* Incompatible alignment on i386 */
case BLKTRACESETUP32:

View File

@ -22,46 +22,14 @@
*/
#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/export.h>
#include <linux/ioprio.h>
#include <linux/cred.h>
#include <linux/blkdev.h>
#include <linux/capability.h>
#include <linux/sched/user.h>
#include <linux/sched/task.h>
#include <linux/syscalls.h>
#include <linux/security.h>
#include <linux/pid_namespace.h>
int set_task_ioprio(struct task_struct *task, int ioprio)
{
int err;
struct io_context *ioc;
const struct cred *cred = current_cred(), *tcred;
rcu_read_lock();
tcred = __task_cred(task);
if (!uid_eq(tcred->uid, cred->euid) &&
!uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) {
rcu_read_unlock();
return -EPERM;
}
rcu_read_unlock();
err = security_task_setioprio(task, ioprio);
if (err)
return err;
ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
if (ioc) {
ioc->ioprio = ioprio;
put_io_context(ioc);
}
return err;
}
EXPORT_SYMBOL_GPL(set_task_ioprio);
int ioprio_check_cap(int ioprio)
{
int class = IOPRIO_PRIO_CLASS(ioprio);

View File

@ -9,12 +9,12 @@
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/elevator.h>
#include <linux/module.h>
#include <linux/sbitmap.h>
#include <trace/events/block.h>
#include "elevator.h"
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
@ -433,6 +433,7 @@ static void kyber_exit_sched(struct elevator_queue *e)
int i;
del_timer_sync(&kqd->timer);
blk_stat_disable_accounting(kqd->q);
for (i = 0; i < KYBER_NUM_DOMAINS; i++)
sbitmap_queue_free(&kqd->domain_tokens[i]);
@ -453,11 +454,11 @@ static void kyber_depth_updated(struct blk_mq_hw_ctx *hctx)
{
struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
struct blk_mq_tags *tags = hctx->sched_tags;
unsigned int shift = tags->bitmap_tags->sb.shift;
unsigned int shift = tags->bitmap_tags.sb.shift;
kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
sbitmap_queue_min_shallow_depth(tags->bitmap_tags, kqd->async_depth);
sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, kqd->async_depth);
}
static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)

View File

@ -9,7 +9,6 @@
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/elevator.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/slab.h>
@ -20,6 +19,7 @@
#include <trace/events/block.h>
#include "elevator.h"
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
@ -31,6 +31,11 @@
*/
static const int read_expire = HZ / 2; /* max time before a read is submitted. */
static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
/*
* Time after which to dispatch lower priority requests even if higher
* priority requests are pending.
*/
static const int prio_aging_expire = 10 * HZ;
static const int writes_starved = 2; /* max times reads can starve a write */
static const int fifo_batch = 16; /* # of sequential requests treated as one
by the above parameters. For throughput. */
@ -51,17 +56,16 @@ enum dd_prio {
enum { DD_PRIO_COUNT = 3 };
/* I/O statistics per I/O priority. */
/*
* I/O statistics per I/O priority. It is fine if these counters overflow.
* What matters is that these counters are at least as wide as
* log2(max_outstanding_requests).
*/
struct io_stats_per_prio {
local_t inserted;
local_t merged;
local_t dispatched;
local_t completed;
};
/* I/O statistics for all I/O priorities (enum dd_prio). */
struct io_stats {
struct io_stats_per_prio stats[DD_PRIO_COUNT];
uint32_t inserted;
uint32_t merged;
uint32_t dispatched;
atomic_t completed;
};
/*
@ -74,6 +78,7 @@ struct dd_per_prio {
struct list_head fifo_list[DD_DIR_COUNT];
/* Next request in FIFO order. Read, write or both are NULL. */
struct request *next_rq[DD_DIR_COUNT];
struct io_stats_per_prio stats;
};
struct deadline_data {
@ -88,8 +93,6 @@ struct deadline_data {
unsigned int batching; /* number of sequential requests made */
unsigned int starved; /* times reads have starved writes */
struct io_stats __percpu *stats;
/*
* settings that change how the i/o scheduler behaves
*/
@ -98,38 +101,12 @@ struct deadline_data {
int writes_starved;
int front_merges;
u32 async_depth;
int prio_aging_expire;
spinlock_t lock;
spinlock_t zone_lock;
};
/* Count one event of type 'event_type' and with I/O priority 'prio' */
#define dd_count(dd, event_type, prio) do { \
struct io_stats *io_stats = get_cpu_ptr((dd)->stats); \
\
BUILD_BUG_ON(!__same_type((dd), struct deadline_data *)); \
BUILD_BUG_ON(!__same_type((prio), enum dd_prio)); \
local_inc(&io_stats->stats[(prio)].event_type); \
put_cpu_ptr(io_stats); \
} while (0)
/*
* Returns the total number of dd_count(dd, event_type, prio) calls across all
* CPUs. No locking or barriers since it is fine if the returned sum is slightly
* outdated.
*/
#define dd_sum(dd, event_type, prio) ({ \
unsigned int cpu; \
u32 sum = 0; \
\
BUILD_BUG_ON(!__same_type((dd), struct deadline_data *)); \
BUILD_BUG_ON(!__same_type((prio), enum dd_prio)); \
for_each_present_cpu(cpu) \
sum += local_read(&per_cpu_ptr((dd)->stats, cpu)-> \
stats[(prio)].event_type); \
sum; \
})
/* Maps an I/O priority class to a deadline scheduler priority. */
static const enum dd_prio ioprio_class_to_prio[] = {
[IOPRIO_CLASS_NONE] = DD_BE_PRIO,
@ -233,7 +210,9 @@ static void dd_merged_requests(struct request_queue *q, struct request *req,
const u8 ioprio_class = dd_rq_ioclass(next);
const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
dd_count(dd, merged, prio);
lockdep_assert_held(&dd->lock);
dd->per_prio[prio].stats.merged++;
/*
* if next expires before rq, assign its expire time to rq
@ -270,6 +249,16 @@ deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
deadline_remove_request(rq->q, per_prio, rq);
}
/* Number of requests queued for a given priority level. */
static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio)
{
const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats;
lockdep_assert_held(&dd->lock);
return stats->inserted - atomic_read(&stats->completed);
}
/*
* deadline_check_fifo returns 0 if there are no expired requests on the fifo,
* 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
@ -355,12 +344,27 @@ deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
return rq;
}
/*
* Returns true if and only if @rq started after @latest_start where
* @latest_start is in jiffies.
*/
static bool started_after(struct deadline_data *dd, struct request *rq,
unsigned long latest_start)
{
unsigned long start_time = (unsigned long)rq->fifo_time;
start_time -= dd->fifo_expire[rq_data_dir(rq)];
return time_after(start_time, latest_start);
}
/*
* deadline_dispatch_requests selects the best request according to
* read/write expire, fifo_batch, etc
* read/write expire, fifo_batch, etc and with a start time <= @latest_start.
*/
static struct request *__dd_dispatch_request(struct deadline_data *dd,
struct dd_per_prio *per_prio)
struct dd_per_prio *per_prio,
unsigned long latest_start)
{
struct request *rq, *next_rq;
enum dd_data_dir data_dir;
@ -372,6 +376,8 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
if (!list_empty(&per_prio->dispatch)) {
rq = list_first_entry(&per_prio->dispatch, struct request,
queuelist);
if (started_after(dd, rq, latest_start))
return NULL;
list_del_init(&rq->queuelist);
goto done;
}
@ -449,6 +455,9 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
dd->batching = 0;
dispatch_request:
if (started_after(dd, rq, latest_start))
return NULL;
/*
* rq is the selected appropriate request.
*/
@ -457,7 +466,7 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
done:
ioprio_class = dd_rq_ioclass(rq);
prio = ioprio_class_to_prio[ioprio_class];
dd_count(dd, dispatched, prio);
dd->per_prio[prio].stats.dispatched++;
/*
* If the request needs its target zone locked, do it.
*/
@ -466,6 +475,34 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
return rq;
}
/*
* Check whether there are any requests with priority other than DD_RT_PRIO
* that were inserted more than prio_aging_expire jiffies ago.
*/
static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd,
unsigned long now)
{
struct request *rq;
enum dd_prio prio;
int prio_cnt;
lockdep_assert_held(&dd->lock);
prio_cnt = !!dd_queued(dd, DD_RT_PRIO) + !!dd_queued(dd, DD_BE_PRIO) +
!!dd_queued(dd, DD_IDLE_PRIO);
if (prio_cnt < 2)
return NULL;
for (prio = DD_BE_PRIO; prio <= DD_PRIO_MAX; prio++) {
rq = __dd_dispatch_request(dd, &dd->per_prio[prio],
now - dd->prio_aging_expire);
if (rq)
return rq;
}
return NULL;
}
/*
* Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests().
*
@ -477,15 +514,26 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd,
static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
struct deadline_data *dd = hctx->queue->elevator->elevator_data;
const unsigned long now = jiffies;
struct request *rq;
enum dd_prio prio;
spin_lock(&dd->lock);
rq = dd_dispatch_prio_aged_requests(dd, now);
if (rq)
goto unlock;
/*
* Next, dispatch requests in priority order. Ignore lower priority
* requests if any higher priority requests are pending.
*/
for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
rq = __dd_dispatch_request(dd, &dd->per_prio[prio]);
if (rq)
rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now);
if (rq || dd_queued(dd, prio))
break;
}
unlock:
spin_unlock(&dd->lock);
return rq;
@ -519,7 +567,7 @@ static void dd_depth_updated(struct blk_mq_hw_ctx *hctx)
dd->async_depth = max(1UL, 3 * q->nr_requests / 4);
sbitmap_queue_min_shallow_depth(tags->bitmap_tags, dd->async_depth);
sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, dd->async_depth);
}
/* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */
@ -536,12 +584,21 @@ static void dd_exit_sched(struct elevator_queue *e)
for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
struct dd_per_prio *per_prio = &dd->per_prio[prio];
const struct io_stats_per_prio *stats = &per_prio->stats;
uint32_t queued;
WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_READ]));
WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE]));
}
free_percpu(dd->stats);
spin_lock(&dd->lock);
queued = dd_queued(dd, prio);
spin_unlock(&dd->lock);
WARN_ONCE(queued != 0,
"statistics for priority %d: i %u m %u d %u c %u\n",
prio, stats->inserted, stats->merged,
stats->dispatched, atomic_read(&stats->completed));
}
kfree(dd);
}
@ -566,11 +623,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
eq->elevator_data = dd;
dd->stats = alloc_percpu_gfp(typeof(*dd->stats),
GFP_KERNEL | __GFP_ZERO);
if (!dd->stats)
goto free_dd;
for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
struct dd_per_prio *per_prio = &dd->per_prio[prio];
@ -586,15 +638,13 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
dd->front_merges = 1;
dd->last_dir = DD_WRITE;
dd->fifo_batch = fifo_batch;
dd->prio_aging_expire = prio_aging_expire;
spin_lock_init(&dd->lock);
spin_lock_init(&dd->zone_lock);
q->elevator = eq;
return 0;
free_dd:
kfree(dd);
put_eq:
kobject_put(&eq->kobj);
return ret;
@ -677,8 +727,11 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
blk_req_zone_write_unlock(rq);
prio = ioprio_class_to_prio[ioprio_class];
dd_count(dd, inserted, prio);
rq->elv.priv[0] = (void *)(uintptr_t)1;
per_prio = &dd->per_prio[prio];
if (!rq->elv.priv[0]) {
per_prio->stats.inserted++;
rq->elv.priv[0] = (void *)(uintptr_t)1;
}
if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
blk_mq_free_requests(&free);
@ -687,7 +740,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
trace_block_rq_insert(rq);
per_prio = &dd->per_prio[prio];
if (at_head) {
list_add(&rq->queuelist, &per_prio->dispatch);
} else {
@ -759,12 +811,13 @@ static void dd_finish_request(struct request *rq)
/*
* The block layer core may call dd_finish_request() without having
* called dd_insert_requests(). Hence only update statistics for
* requests for which dd_insert_requests() has been called. See also
* blk_mq_request_bypass_insert().
* called dd_insert_requests(). Skip requests that bypassed I/O
* scheduling. See also blk_mq_request_bypass_insert().
*/
if (rq->elv.priv[0])
dd_count(dd, completed, prio);
if (!rq->elv.priv[0])
return;
atomic_inc(&per_prio->stats.completed);
if (blk_queue_is_zoned(q)) {
unsigned long flags;
@ -809,6 +862,7 @@ static ssize_t __FUNC(struct elevator_queue *e, char *page) \
#define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR))
SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]);
SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]);
SHOW_JIFFIES(deadline_prio_aging_expire_show, dd->prio_aging_expire);
SHOW_INT(deadline_writes_starved_show, dd->writes_starved);
SHOW_INT(deadline_front_merges_show, dd->front_merges);
SHOW_INT(deadline_async_depth_show, dd->async_depth);
@ -838,6 +892,7 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)
STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies)
STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX);
STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX);
STORE_JIFFIES(deadline_prio_aging_expire_store, &dd->prio_aging_expire, 0, INT_MAX);
STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX);
STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1);
STORE_INT(deadline_async_depth_store, &dd->async_depth, 1, INT_MAX);
@ -856,6 +911,7 @@ static struct elv_fs_entry deadline_attrs[] = {
DD_ATTR(front_merges),
DD_ATTR(async_depth),
DD_ATTR(fifo_batch),
DD_ATTR(prio_aging_expire),
__ATTR_NULL
};
@ -947,38 +1003,48 @@ static int dd_async_depth_show(void *data, struct seq_file *m)
return 0;
}
/* Number of requests queued for a given priority level. */
static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio)
{
return dd_sum(dd, inserted, prio) - dd_sum(dd, completed, prio);
}
static int dd_queued_show(void *data, struct seq_file *m)
{
struct request_queue *q = data;
struct deadline_data *dd = q->elevator->elevator_data;
u32 rt, be, idle;
spin_lock(&dd->lock);
rt = dd_queued(dd, DD_RT_PRIO);
be = dd_queued(dd, DD_BE_PRIO);
idle = dd_queued(dd, DD_IDLE_PRIO);
spin_unlock(&dd->lock);
seq_printf(m, "%u %u %u\n", rt, be, idle);
seq_printf(m, "%u %u %u\n", dd_queued(dd, DD_RT_PRIO),
dd_queued(dd, DD_BE_PRIO),
dd_queued(dd, DD_IDLE_PRIO));
return 0;
}
/* Number of requests owned by the block driver for a given priority. */
static u32 dd_owned_by_driver(struct deadline_data *dd, enum dd_prio prio)
{
return dd_sum(dd, dispatched, prio) + dd_sum(dd, merged, prio)
- dd_sum(dd, completed, prio);
const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats;
lockdep_assert_held(&dd->lock);
return stats->dispatched + stats->merged -
atomic_read(&stats->completed);
}
static int dd_owned_by_driver_show(void *data, struct seq_file *m)
{
struct request_queue *q = data;
struct deadline_data *dd = q->elevator->elevator_data;
u32 rt, be, idle;
spin_lock(&dd->lock);
rt = dd_owned_by_driver(dd, DD_RT_PRIO);
be = dd_owned_by_driver(dd, DD_BE_PRIO);
idle = dd_owned_by_driver(dd, DD_IDLE_PRIO);
spin_unlock(&dd->lock);
seq_printf(m, "%u %u %u\n", rt, be, idle);
seq_printf(m, "%u %u %u\n", dd_owned_by_driver(dd, DD_RT_PRIO),
dd_owned_by_driver(dd, DD_BE_PRIO),
dd_owned_by_driver(dd, DD_IDLE_PRIO));
return 0;
}
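
The statistics rework above drops the per-CPU dd_count()/dd_sum() machinery in favour of plain per-priority counters: inserted, merged and dispatched are only updated under dd->lock, while completed stays atomic because completions can run without that lock. A minimal sketch of the resulting invariants, mirroring the fields shown in the hunks above (example_stats, example_queued() and example_in_driver() are made-up names for illustration, not part of this commit):

#include <linux/atomic.h>
#include <linux/types.h>

/* Illustrative sketch of the counter invariants behind dd_queued() and
 * dd_owned_by_driver() above; not additional code from this commit. */
struct example_stats {
	uint32_t inserted;	/* updated under dd->lock */
	uint32_t merged;	/* updated under dd->lock */
	uint32_t dispatched;	/* updated under dd->lock */
	atomic_t completed;	/* may be incremented without dd->lock */
};

/* requests inserted into the scheduler and not yet completed */
static u32 example_queued(const struct example_stats *s)
{
	return s->inserted - atomic_read(&s->completed);
}

/* requests handed to the driver (dispatched or merged away) but not completed */
static u32 example_in_driver(const struct example_stats *s)
{
	return s->dispatched + s->merged - atomic_read(&s->completed);
}

It is fine if these u32 counters wrap, since only the differences above are reported; that is why the comment in the hunk only requires them to be at least as wide as log2 of the maximum number of outstanding requests.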

View File

@ -2,6 +2,8 @@
#
# Partition configuration
#
menu "Partition Types"
config PARTITION_ADVANCED
bool "Advanced partition selection"
help
@ -267,3 +269,5 @@ config CMDLINE_PARTITION
help
Say Y here if you want to read the partition table from bootargs.
The format for the command line is just like mtdparts.
endmenu

View File

@ -91,19 +91,19 @@ static void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors)
{
spin_lock(&bdev->bd_size_lock);
i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT);
bdev->bd_nr_sectors = sectors;
spin_unlock(&bdev->bd_size_lock);
}
static struct parsed_partitions *allocate_partitions(struct gendisk *hd)
{
struct parsed_partitions *state;
int nr;
int nr = DISK_MAX_PARTS;
state = kzalloc(sizeof(*state), GFP_KERNEL);
if (!state)
return NULL;
nr = disk_max_parts(hd);
state->parts = vzalloc(array_size(nr, sizeof(state->parts[0])));
if (!state->parts) {
kfree(state);
@ -204,7 +204,7 @@ static ssize_t part_alignment_offset_show(struct device *dev,
struct block_device *bdev = dev_to_bdev(dev);
return sprintf(buf, "%u\n",
queue_limit_alignment_offset(&bdev->bd_disk->queue->limits,
queue_limit_alignment_offset(&bdev_get_queue(bdev)->limits,
bdev->bd_start_sect));
}
@ -214,7 +214,7 @@ static ssize_t part_discard_alignment_show(struct device *dev,
struct block_device *bdev = dev_to_bdev(dev);
return sprintf(buf, "%u\n",
queue_limit_discard_alignment(&bdev->bd_disk->queue->limits,
queue_limit_discard_alignment(&bdev_get_queue(bdev)->limits,
bdev->bd_start_sect));
}
@ -325,7 +325,7 @@ static struct block_device *add_partition(struct gendisk *disk, int partno,
lockdep_assert_held(&disk->open_mutex);
if (partno >= disk_max_parts(disk))
if (partno >= DISK_MAX_PARTS)
return ERR_PTR(-EINVAL);
/*
@ -526,18 +526,15 @@ int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start,
static bool disk_unlock_native_capacity(struct gendisk *disk)
{
const struct block_device_operations *bdops = disk->fops;
if (bdops->unlock_native_capacity &&
!(disk->flags & GENHD_FL_NATIVE_CAPACITY)) {
printk(KERN_CONT "enabling native capacity\n");
bdops->unlock_native_capacity(disk);
disk->flags |= GENHD_FL_NATIVE_CAPACITY;
return true;
} else {
if (!disk->fops->unlock_native_capacity ||
test_and_set_bit(GD_NATIVE_CAPACITY, &disk->state)) {
printk(KERN_CONT "truncated\n");
return false;
}
printk(KERN_CONT "enabling native capacity\n");
disk->fops->unlock_native_capacity(disk);
return true;
}
void blk_drop_partitions(struct gendisk *disk)
@ -606,7 +603,7 @@ static int blk_add_partitions(struct gendisk *disk)
struct parsed_partitions *state;
int ret = -EAGAIN, p;
if (!disk_part_scan_enabled(disk))
if (disk->flags & GENHD_FL_NO_PART)
return 0;
state = check_partition(disk);
@ -689,7 +686,7 @@ int bdev_disk_changed(struct gendisk *disk, bool invalidate)
* userspace for this particular setup.
*/
if (invalidate) {
if (disk_part_scan_enabled(disk) ||
if (!(disk->flags & GENHD_FL_NO_PART) ||
!(disk->flags & GENHD_FL_REMOVABLE))
set_capacity(disk, 0);
}

View File

@ -133,7 +133,7 @@ efi_crc32(const void *buf, unsigned long len)
*/
static u64 last_lba(struct gendisk *disk)
{
return div_u64(disk->part0->bd_inode->i_size,
return div_u64(bdev_nr_bytes(disk->part0),
queue_logical_block_size(disk->queue)) - 1ULL;
}
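
Several hunks in this commit follow the same conversion: open-coded reads of the backing inode size, i_size_read(bdev->bd_inode), are replaced with the bdev_nr_bytes()/bdev_nr_sectors() helpers, as in last_lba() above. A minimal before/after sketch of the pattern (example_check_range() is a hypothetical caller, and the header location of bdev_nr_bytes() in this tree is assumed):

#include <linux/errno.h>
#include <linux/types.h>
#include <linux/genhd.h>	/* bdev_nr_bytes(); header location assumed for this tree */

/* Illustrative only: example_check_range() is a made-up helper showing the
 * i_size_read() -> bdev_nr_bytes() conversion used throughout this commit. */
static int example_check_range(struct block_device *bdev, u64 start, u64 len)
{
	/* old style: size read from the backing inode
	 *	if (start + len > i_size_read(bdev->bd_inode))
	 *		return -EINVAL;
	 */

	/* new style: ask the block layer for the device size in bytes */
	if (start + len > bdev_nr_bytes(bdev))
		return -EINVAL;
	return 0;
}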

View File

@ -198,7 +198,7 @@ static int find_lnx1_partitions(struct parsed_partitions *state,
char name[],
union label_t *label,
sector_t labelsect,
loff_t i_size,
sector_t nr_sectors,
dasd_information2_t *info)
{
loff_t offset, geo_size, size;
@ -213,14 +213,14 @@ static int find_lnx1_partitions(struct parsed_partitions *state,
} else {
/*
* Formatted w/o large volume support. If the sanity check
* 'size based on geo == size based on i_size' is true, then
* 'size based on geo == size based on nr_sectors' is true, then
* we can safely assume that we know the formatted size of
* the disk, otherwise we need additional information
* that we can only get from a real DASD device.
*/
geo_size = geo->cylinders * geo->heads
* geo->sectors * secperblk;
size = i_size >> 9;
size = nr_sectors;
if (size != geo_size) {
if (!info) {
strlcat(state->pp_buf, "\n", PAGE_SIZE);
@ -229,7 +229,7 @@ static int find_lnx1_partitions(struct parsed_partitions *state,
if (!strcmp(info->type, "ECKD"))
if (geo_size < size)
size = geo_size;
/* else keep size based on i_size */
/* else keep size based on nr_sectors */
}
}
/* first and only partition starts in the first block after the label */
@ -293,7 +293,8 @@ int ibm_partition(struct parsed_partitions *state)
struct gendisk *disk = state->disk;
struct block_device *bdev = disk->part0;
int blocksize, res;
loff_t i_size, offset, size;
loff_t offset, size;
sector_t nr_sectors;
dasd_information2_t *info;
struct hd_geometry *geo;
char type[5] = {0,};
@ -308,8 +309,8 @@ int ibm_partition(struct parsed_partitions *state)
blocksize = bdev_logical_block_size(bdev);
if (blocksize <= 0)
goto out_symbol;
i_size = i_size_read(bdev->bd_inode);
if (i_size == 0)
nr_sectors = bdev_nr_sectors(bdev);
if (nr_sectors == 0)
goto out_symbol;
info = kmalloc(sizeof(dasd_information2_t), GFP_KERNEL);
if (info == NULL)
@ -336,7 +337,7 @@ int ibm_partition(struct parsed_partitions *state)
label);
} else if (!strncmp(type, "LNX1", 4)) {
res = find_lnx1_partitions(state, geo, blocksize, name,
label, labelsect, i_size,
label, labelsect, nr_sectors,
info);
} else if (!strncmp(type, "CMS1", 4)) {
res = find_cms1_partitions(state, geo, blocksize, name,
@ -353,7 +354,7 @@ int ibm_partition(struct parsed_partitions *state)
res = 1;
if (info->format == DASD_FORMAT_LDL) {
strlcat(state->pp_buf, "(nonl)", PAGE_SIZE);
size = i_size >> 9;
size = nr_sectors;
offset = (info->label_block + 1) * (blocksize >> 9);
put_partition(state, 1, offset, size-offset);
strlcat(state->pp_buf, "\n", PAGE_SIZE);

View File

@ -5,7 +5,7 @@
*/
#include <linux/t10-pi.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
#include <linux/crc-t10dif.h>
#include <linux/module.h>
#include <net/checksum.h>