diff --git a/block/Kconfig b/block/Kconfig index 8e28ae7718..d5d4197b7e 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -35,6 +35,9 @@ config BLK_CGROUP_RWSTAT config BLK_DEV_BSG_COMMON tristate +config BLK_ICQ + bool + config BLK_DEV_BSGLIB bool "Block layer SG support v4 helper lib" select BLK_DEV_BSG_COMMON @@ -73,7 +76,7 @@ config BLK_DEV_ZONED config BLK_DEV_THROTTLING bool "Block layer bio throttling support" - depends on BLK_CGROUP=y + depends on BLK_CGROUP select BLK_CGROUP_RWSTAT help Block layer bio throttling support. It can be used to limit @@ -112,7 +115,7 @@ config BLK_WBT_MQ config BLK_CGROUP_IOLATENCY bool "Enable support for latency based cgroup IO protection" - depends on BLK_CGROUP=y + depends on BLK_CGROUP help Enabling this option enables the .latency interface for IO throttling. The IO controller will attempt to maintain average IO latencies below @@ -132,7 +135,7 @@ config BLK_CGROUP_FC_APPID config BLK_CGROUP_IOCOST bool "Enable support for cost model based cgroup IO controller" - depends on BLK_CGROUP=y + depends on BLK_CGROUP select BLK_RQ_IO_DATA_LEN select BLK_RQ_ALLOC_TIME help @@ -190,39 +193,31 @@ config BLK_INLINE_ENCRYPTION_FALLBACK by falling back to the kernel crypto API when inline encryption hardware is not present. -menu "Partition Types" - source "block/partitions/Kconfig" -endmenu - -endif # BLOCK - config BLOCK_COMPAT - bool - depends on BLOCK && COMPAT - default y + def_bool COMPAT config BLK_MQ_PCI - bool - depends on BLOCK && PCI - default y + def_bool PCI config BLK_MQ_VIRTIO bool - depends on BLOCK && VIRTIO + depends on VIRTIO default y config BLK_MQ_RDMA bool - depends on BLOCK && INFINIBAND + depends on INFINIBAND default y config BLK_PM - def_bool BLOCK && PM + def_bool PM # do not use in new code config BLOCK_HOLDER_DEPRECATED bool source "block/Kconfig.iosched" + +endif # BLOCK diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 2f2158e05a..6155161460 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -1,6 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -if BLOCK - menu "IO Schedulers" config MQ_IOSCHED_DEADLINE @@ -20,6 +18,7 @@ config MQ_IOSCHED_KYBER config IOSCHED_BFQ tristate "BFQ I/O scheduler" + select BLK_ICQ help BFQ I/O scheduler for BLK-MQ. BFQ distributes the bandwidth of of the device among all processes according to their weights, @@ -45,5 +44,3 @@ config BFQ_CGROUP_DEBUG files in a cgroup which can be useful for debugging. 
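Note on the new BLK_ICQ symbol above: it lets the io_cq (per-process, per-queue I/O context) machinery be compiled out when no enabled scheduler needs it, and after this series only BFQ selects it. A minimal sketch of the conditional-compilation pattern this enables; the helper name below is entirely hypothetical and not part of the patch:

#include <linux/iocontext.h>

#ifdef CONFIG_BLK_ICQ
/* real icq handling, only built when some scheduler selects BLK_ICQ */
static void example_exit_icqs(struct io_context *ioc)
{
        /* walk ioc->icq_list and tear the entries down ... */
}
#else
/* compiled away entirely when CONFIG_BLK_ICQ is not set */
static inline void example_exit_icqs(struct io_context *ioc)
{
}
#endif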
endmenu - -endif diff --git a/block/Makefile b/block/Makefile index 41aa1ba69c..f38eaa6129 100644 --- a/block/Makefile +++ b/block/Makefile @@ -3,13 +3,13 @@ # Makefile for the kernel block layer # -obj-$(CONFIG_BLOCK) := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \ +obj-y := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ - blk-exec.o blk-merge.o blk-timeout.o \ + blk-merge.o blk-timeout.o \ blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \ genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \ - disk-events.o + disk-events.o blk-ia-ranges.o obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_BLK_DEV_BSG_COMMON) += bsg.o @@ -36,6 +36,6 @@ obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o obj-$(CONFIG_BLK_PM) += blk-pm.o -obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += keyslot-manager.o blk-crypto.o +obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED) += holder.o diff --git a/block/bdev.c b/block/bdev.c index 485a258b0a..102837a370 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -23,7 +24,6 @@ #include #include #include -#include #include #include #include "../fs/internal.h" @@ -87,10 +87,6 @@ void invalidate_bdev(struct block_device *bdev) lru_add_drain_all(); /* make sure all lru add caches are flushed */ invalidate_mapping_pages(mapping, 0, -1); } - /* 99% of the time, we don't need to flush the cleancache on the bdev. - * But, for the strange corners, lets be cautious - */ - cleancache_invalidate_inode(mapping); } EXPORT_SYMBOL(invalidate_bdev); @@ -184,14 +180,13 @@ int sb_min_blocksize(struct super_block *sb, int size) EXPORT_SYMBOL(sb_min_blocksize); -int __sync_blockdev(struct block_device *bdev, int wait) +int sync_blockdev_nowait(struct block_device *bdev) { if (!bdev) return 0; - if (!wait) - return filemap_flush(bdev->bd_inode->i_mapping); - return filemap_write_and_wait(bdev->bd_inode->i_mapping); + return filemap_flush(bdev->bd_inode->i_mapping); } +EXPORT_SYMBOL_GPL(sync_blockdev_nowait); /* * Write out and wait upon all the dirty data associated with a block @@ -199,7 +194,9 @@ int __sync_blockdev(struct block_device *bdev, int wait) */ int sync_blockdev(struct block_device *bdev) { - return __sync_blockdev(bdev, 1); + if (!bdev) + return 0; + return filemap_write_and_wait(bdev->bd_inode->i_mapping); } EXPORT_SYMBOL(sync_blockdev); @@ -326,12 +323,12 @@ int bdev_read_page(struct block_device *bdev, sector_t sector, if (!ops->rw_page || bdev_get_integrity(bdev)) return result; - result = blk_queue_enter(bdev->bd_disk->queue, 0); + result = blk_queue_enter(bdev_get_queue(bdev), 0); if (result) return result; result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, REQ_OP_READ); - blk_queue_exit(bdev->bd_disk->queue); + blk_queue_exit(bdev_get_queue(bdev)); return result; } @@ -362,7 +359,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, if (!ops->rw_page || bdev_get_integrity(bdev)) return -EOPNOTSUPP; - result = blk_queue_enter(bdev->bd_disk->queue, 0); + result = blk_queue_enter(bdev_get_queue(bdev), 0); if (result) return result; @@ -375,7 +372,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, clean_page_buffers(page); 
unlock_page(page); } - blk_queue_exit(bdev->bd_disk->queue); + blk_queue_exit(bdev_get_queue(bdev)); return result; } @@ -492,6 +489,7 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) spin_lock_init(&bdev->bd_size_lock); bdev->bd_partno = partno; bdev->bd_inode = inode; + bdev->bd_queue = disk->queue; bdev->bd_stats = alloc_percpu(struct disk_stats); if (!bdev->bd_stats) { iput(inode); @@ -662,7 +660,7 @@ static void blkdev_flush_mapping(struct block_device *bdev) static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) { struct gendisk *disk = bdev->bd_disk; - int ret = 0; + int ret; if (disk->fops->open) { ret = disk->fops->open(bdev, mode); @@ -747,21 +745,11 @@ struct block_device *blkdev_get_no_open(dev_t dev) if (!kobject_get_unless_zero(&bdev->bd_device.kobj)) bdev = NULL; iput(inode); - - if (!bdev) - return NULL; - if ((bdev->bd_disk->flags & GENHD_FL_HIDDEN) || - !try_module_get(bdev->bd_disk->fops->owner)) { - put_device(&bdev->bd_device); - return NULL; - } - return bdev; } void blkdev_put_no_open(struct block_device *bdev) { - module_put(bdev->bd_disk->fops->owner); put_device(&bdev->bd_device); } @@ -817,12 +805,14 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) ret = -ENXIO; if (!disk_live(disk)) goto abort_claiming; + if (!try_module_get(disk->fops->owner)) + goto abort_claiming; if (bdev_is_partition(bdev)) ret = blkdev_get_part(bdev, mode); else ret = blkdev_get_whole(bdev, mode); if (ret) - goto abort_claiming; + goto put_module; if (mode & FMODE_EXCL) { bd_finish_claiming(bdev, holder); @@ -834,7 +824,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) * used in blkdev_get/put(). */ if ((mode & FMODE_WRITE) && !bdev->bd_write_holder && - (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) { + (disk->event_flags & DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE)) { bdev->bd_write_holder = true; unblock_events = false; } @@ -844,7 +834,8 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) if (unblock_events) disk_unblock_events(disk); return bdev; - +put_module: + module_put(disk->fops->owner); abort_claiming: if (mode & FMODE_EXCL) bd_abort_claiming(bdev, holder); @@ -953,18 +944,21 @@ void blkdev_put(struct block_device *bdev, fmode_t mode) blkdev_put_whole(bdev, mode); mutex_unlock(&disk->open_mutex); + module_put(disk->fops->owner); blkdev_put_no_open(bdev); } EXPORT_SYMBOL(blkdev_put); /** - * lookup_bdev - lookup a struct block_device by name - * @pathname: special file representing the block device - * @dev: return value of the block device's dev_t + * lookup_bdev() - Look up a struct block_device by name. + * @pathname: Name of the block device in the filesystem. + * @dev: Pointer to the block device's dev_t, if found. * - * Get a reference to the blockdevice at @pathname in the current - * namespace if possible and return it. Return ERR_PTR(error) - * otherwise. + * Lookup the block device's dev_t at @pathname in the current + * namespace if possible and return it in @dev. + * + * Context: May sleep. + * Return: 0 if succeeded, negative errno otherwise. 
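The open path above now takes the fops->owner module reference inside blkdev_get_by_dev() and drops it in blkdev_put(), so the calling convention for users is unchanged. For reference, a minimal hypothetical caller that opens a device exclusively by dev_t (the helper name and error handling are illustrative only):

#include <linux/blkdev.h>

static int example_probe_size(dev_t dev, void *holder, sector_t *sectors)
{
        struct block_device *bdev;

        /* FMODE_EXCL requires a non-NULL holder cookie */
        bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_EXCL, holder);
        if (IS_ERR(bdev))
                return PTR_ERR(bdev);

        *sectors = bdev_nr_sectors(bdev);

        /* must pass the same mode that was used for blkdev_get_by_dev() */
        blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
        return 0;
}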
*/ int lookup_bdev(const char *pathname, dev_t *dev) { @@ -1016,7 +1010,7 @@ int __invalidate_device(struct block_device *bdev, bool kill_dirty) } EXPORT_SYMBOL(__invalidate_device); -void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) +void sync_bdevs(bool wait) { struct inode *inode, *old_inode = NULL; @@ -1047,8 +1041,19 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) bdev = I_BDEV(inode); mutex_lock(&bdev->bd_disk->open_mutex); - if (bdev->bd_openers) - func(bdev, arg); + if (!bdev->bd_openers) { + ; /* skip */ + } else if (wait) { + /* + * We keep the error status of individual mapping so + * that applications can catch the writeback error using + * fsync(2). See filemap_fdatawait_keep_errors() for + * details. + */ + filemap_fdatawait_keep_errors(inode->i_mapping); + } else { + filemap_fdatawrite(inode->i_mapping); + } mutex_unlock(&bdev->bd_disk->open_mutex); spin_lock(&blockdev_superblock->s_inode_list_lock); diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 85b8e1c3a7..24a5c5329b 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -6,13 +6,13 @@ #include #include #include -#include #include #include #include #include #include +#include "elevator.h" #include "bfq-iosched.h" #ifdef CONFIG_BFQ_CGROUP_DEBUG @@ -463,7 +463,7 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) { if (blkg_rwstat_init(&stats->bytes, gfp) || blkg_rwstat_init(&stats->ios, gfp)) - return -ENOMEM; + goto error; #ifdef CONFIG_BFQ_CGROUP_DEBUG if (blkg_rwstat_init(&stats->merged, gfp) || @@ -476,13 +476,15 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) bfq_stat_init(&stats->dequeue, gfp) || bfq_stat_init(&stats->group_wait_time, gfp) || bfq_stat_init(&stats->idle_time, gfp) || - bfq_stat_init(&stats->empty_time, gfp)) { - bfqg_stats_exit(stats); - return -ENOMEM; - } + bfq_stat_init(&stats->empty_time, gfp)) + goto error; #endif return 0; + +error: + bfqg_stats_exit(stats); + return -ENOMEM; } static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index e66970bf27..36a66e97e3 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -117,7 +117,6 @@ #include #include #include -#include #include #include #include @@ -127,6 +126,7 @@ #include +#include "elevator.h" #include "blk.h" #include "blk-mq.h" #include "blk-mq-tag.h" @@ -433,26 +433,21 @@ static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) /** * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. - * @bfqd: the lookup key. - * @ioc: the io_context of the process doing I/O. * @q: the request queue. 
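Together with sync_blockdev_nowait() added earlier in bdev.c, the sync_bdevs(bool wait) helper above gives the sync(2) path a flush-then-wait sequence without the old iterate_bdevs() callback. A sketch of the expected call sequence; the fs/sync.c side is not part of this diff, so treat the caller below as an assumption:

#include <linux/blkdev.h>

static void example_sync_all_bdevs(void)
{
        sync_bdevs(false);      /* start writeback on every opened bdev */
        sync_bdevs(true);       /* wait, keeping per-mapping errors for fsync(2) */
}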
*/ -static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, - struct io_context *ioc, - struct request_queue *q) +static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q) { - if (ioc) { - unsigned long flags; - struct bfq_io_cq *icq; + struct bfq_io_cq *icq; + unsigned long flags; - spin_lock_irqsave(&q->queue_lock, flags); - icq = icq_to_bic(ioc_lookup_icq(ioc, q)); - spin_unlock_irqrestore(&q->queue_lock, flags); + if (!current->io_context) + return NULL; - return icq; - } + spin_lock_irqsave(&q->queue_lock, flags); + icq = icq_to_bic(ioc_lookup_icq(q)); + spin_unlock_irqrestore(&q->queue_lock, flags); - return NULL; + return icq; } /* @@ -565,26 +560,134 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd, } } +#define BFQ_LIMIT_INLINE_DEPTH 16 + +#ifdef CONFIG_BFQ_GROUP_IOSCHED +static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit) +{ + struct bfq_data *bfqd = bfqq->bfqd; + struct bfq_entity *entity = &bfqq->entity; + struct bfq_entity *inline_entities[BFQ_LIMIT_INLINE_DEPTH]; + struct bfq_entity **entities = inline_entities; + int depth, level; + int class_idx = bfqq->ioprio_class - 1; + struct bfq_sched_data *sched_data; + unsigned long wsum; + bool ret = false; + + if (!entity->on_st_or_in_serv) + return false; + + /* +1 for bfqq entity, root cgroup not included */ + depth = bfqg_to_blkg(bfqq_group(bfqq))->blkcg->css.cgroup->level + 1; + if (depth > BFQ_LIMIT_INLINE_DEPTH) { + entities = kmalloc_array(depth, sizeof(*entities), GFP_NOIO); + if (!entities) + return false; + } + + spin_lock_irq(&bfqd->lock); + sched_data = entity->sched_data; + /* Gather our ancestors as we need to traverse them in reverse order */ + level = 0; + for_each_entity(entity) { + /* + * If at some level entity is not even active, allow request + * queueing so that BFQ knows there's work to do and activate + * entities. + */ + if (!entity->on_st_or_in_serv) + goto out; + /* Uh, more parents than cgroup subsystem thinks? */ + if (WARN_ON_ONCE(level >= depth)) + break; + entities[level++] = entity; + } + WARN_ON_ONCE(level != depth); + for (level--; level >= 0; level--) { + entity = entities[level]; + if (level > 0) { + wsum = bfq_entity_service_tree(entity)->wsum; + } else { + int i; + /* + * For bfqq itself we take into account service trees + * of all higher priority classes and multiply their + * weights so that low prio queue from higher class + * gets more requests than high prio queue from lower + * class. + */ + wsum = 0; + for (i = 0; i <= class_idx; i++) { + wsum = wsum * IOPRIO_BE_NR + + sched_data->service_tree[i].wsum; + } + } + limit = DIV_ROUND_CLOSEST(limit * entity->weight, wsum); + if (entity->allocated >= limit) { + bfq_log_bfqq(bfqq->bfqd, bfqq, + "too many requests: allocated %d limit %d level %d", + entity->allocated, limit, level); + ret = true; + break; + } + } +out: + spin_unlock_irq(&bfqd->lock); + if (entities != inline_entities) + kfree(entities); + return ret; +} +#else +static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit) +{ + return false; +} +#endif + /* * Async I/O can easily starve sync I/O (both sync reads and sync * writes), by consuming all tags. Similarly, storms of sync writes, * such as those that sync(2) may trigger, can starve sync reads. * Limit depths of async I/O and sync writes so as to counter both * problems. + * + * Also if a bfq queue or its parent cgroup consume more tags than would be + * appropriate for their weight, we trim the available tag depth to 1. 
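The per-level check above scales the queue's request budget by entity->weight against the service tree's weight sum. A standalone userspace illustration of that arithmetic with made-up numbers: 64 tags, the 75% sync-write fraction configured later in bfq_update_depths(), and two sibling cgroups of weight 100 and 300.

#include <stdio.h>

#define DIV_ROUND_CLOSEST(x, d) (((x) + (d) / 2) / (d))

int main(void)
{
        unsigned int limit = 64;                /* q->nr_requests */
        unsigned int weight = 100, wsum = 100 + 300;

        limit = (limit * 3) >> 2;               /* sync writes: 75% -> 48 */
        limit = DIV_ROUND_CLOSEST(limit * weight, wsum);        /* -> 12 */

        /* a bfqq in this cgroup trips the check once it has >= 12 requests */
        printf("limit = %u\n", limit);
        return 0;
}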
This + * avoids a situation where one cgroup can starve another cgroup from tags and + * thus block service differentiation among cgroups. Note that because the + * queue / cgroup already has many requests allocated and queued, this does not + * significantly affect service guarantees coming from the BFQ scheduling + * algorithm. */ static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) { struct bfq_data *bfqd = data->q->elevator->elevator_data; + struct bfq_io_cq *bic = bfq_bic_lookup(data->q); + struct bfq_queue *bfqq = bic ? bic_to_bfqq(bic, op_is_sync(op)) : NULL; + int depth; + unsigned limit = data->q->nr_requests; - if (op_is_sync(op) && !op_is_write(op)) - return; + /* Sync reads have full depth available */ + if (op_is_sync(op) && !op_is_write(op)) { + depth = 0; + } else { + depth = bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; + limit = (limit * depth) >> bfqd->full_depth_shift; + } - data->shallow_depth = - bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; + /* + * Does queue (or any parent entity) exceed number of requests that + * should be available to it? Heavily limit depth so that it cannot + * consume more available requests and thus starve other entities. + */ + if (bfqq && bfqq_request_over_limit(bfqq, limit)) + depth = 1; bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", - __func__, bfqd->wr_busy_queues, op_is_sync(op), - data->shallow_depth); + __func__, bfqd->wr_busy_queues, op_is_sync(op), depth); + if (depth) + data->shallow_depth = depth; } static struct bfq_queue * @@ -1113,7 +1216,8 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, static int bfqq_process_refs(struct bfq_queue *bfqq) { - return bfqq->ref - bfqq->allocated - bfqq->entity.on_st_or_in_serv - + return bfqq->ref - bfqq->entity.allocated - + bfqq->entity.on_st_or_in_serv - (bfqq->weight_counter != NULL) - bfqq->stable_ref; } @@ -1982,20 +2086,19 @@ static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns) * aspect, see the comments on the choice of the queue for injection * in bfq_select_queue(). * - * Turning back to the detection of a waker queue, a queue Q is deemed - * as a waker queue for bfqq if, for three consecutive times, bfqq - * happens to become non empty right after a request of Q has been - * completed. In this respect, even if bfqq is empty, we do not check - * for a waker if it still has some in-flight I/O. In fact, in this - * case bfqq is actually still being served by the drive, and may - * receive new I/O on the completion of some of the in-flight - * requests. In particular, on the first time, Q is tentatively set as - * a candidate waker queue, while on the third consecutive time that Q - * is detected, the field waker_bfqq is set to Q, to confirm that Q is - * a waker queue for bfqq. These detection steps are performed only if - * bfqq has a long think time, so as to make it more likely that - * bfqq's I/O is actually being blocked by a synchronization. This - * last filter, plus the above three-times requirement, make false + * Turning back to the detection of a waker queue, a queue Q is deemed as a + * waker queue for bfqq if, for three consecutive times, bfqq happens to become + * non empty right after a request of Q has been completed within given + * timeout. In this respect, even if bfqq is empty, we do not check for a waker + * if it still has some in-flight I/O. 
In fact, in this case bfqq is actually + * still being served by the drive, and may receive new I/O on the completion + * of some of the in-flight requests. In particular, on the first time, Q is + * tentatively set as a candidate waker queue, while on the third consecutive + * time that Q is detected, the field waker_bfqq is set to Q, to confirm that Q + * is a waker queue for bfqq. These detection steps are performed only if bfqq + * has a long think time, so as to make it more likely that bfqq's I/O is + * actually being blocked by a synchronization. This last filter, plus the + * above three-times requirement and time limit for detection, make false * positives less likely. * * NOTE @@ -2019,6 +2122,8 @@ static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns) static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, u64 now_ns) { + char waker_name[MAX_BFQQ_NAME_LENGTH]; + if (!bfqd->last_completed_rq_bfqq || bfqd->last_completed_rq_bfqq == bfqq || bfq_bfqq_has_short_ttime(bfqq) || @@ -2027,8 +2132,16 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqd->last_completed_rq_bfqq == bfqq->waker_bfqq) return; + /* + * We reset waker detection logic also if too much time has passed + * since the first detection. If wakeups are rare, pointless idling + * doesn't hurt throughput that much. The condition below makes sure + * we do not uselessly idle blocking waker in more than 1/64 cases. + */ if (bfqd->last_completed_rq_bfqq != - bfqq->tentative_waker_bfqq) { + bfqq->tentative_waker_bfqq || + now_ns > bfqq->waker_detection_started + + 128 * (u64)bfqd->bfq_slice_idle) { /* * First synchronization detected with a * candidate waker queue, or with a different @@ -2037,12 +2150,19 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqq->tentative_waker_bfqq = bfqd->last_completed_rq_bfqq; bfqq->num_waker_detections = 1; + bfqq->waker_detection_started = now_ns; + bfq_bfqq_name(bfqq->tentative_waker_bfqq, waker_name, + MAX_BFQQ_NAME_LENGTH); + bfq_log_bfqq(bfqd, bfqq, "set tenative waker %s", waker_name); } else /* Same tentative waker queue detected again */ bfqq->num_waker_detections++; if (bfqq->num_waker_detections == 3) { bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq; bfqq->tentative_waker_bfqq = NULL; + bfq_bfqq_name(bfqq->waker_bfqq, waker_name, + MAX_BFQQ_NAME_LENGTH); + bfq_log_bfqq(bfqd, bfqq, "set waker %s", waker_name); /* * If the waker queue disappears, then @@ -2332,7 +2452,7 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, * returned by bfq_bic_lookup does not go away before * bfqd->lock is taken. 
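For scale: assuming the default bfq_slice_idle of 8 ms (NSEC_PER_SEC / 125, a value not visible in this hunk), the detection window added above is 128 * 8 ms, so all three consecutive detections must land within roughly one second:

#include <stdio.h>

int main(void)
{
        unsigned long long slice_idle_ns = 1000000000ULL / 125; /* 8 ms */
        unsigned long long window_ns = 128 * slice_idle_ns;

        printf("waker detection window: %llu ms\n", window_ns / 1000000ULL);
        return 0;
}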
*/ - struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q); + struct bfq_io_cq *bic = bfq_bic_lookup(q); bool ret; spin_lock_irq(&bfqd->lock); @@ -5878,6 +5998,22 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, } } +static void bfqq_request_allocated(struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + for_each_entity(entity) + entity->allocated++; +} + +static void bfqq_request_freed(struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + for_each_entity(entity) + entity->allocated--; +} + /* returns true if it causes the idle timer to be disabled */ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) { @@ -5891,8 +6027,8 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) * Release the request's reference to the old bfqq * and make sure one is taken to the shared queue. */ - new_bfqq->allocated++; - bfqq->allocated--; + bfqq_request_allocated(new_bfqq); + bfqq_request_freed(bfqq); new_bfqq->ref++; /* * If the bic associated with the process @@ -6209,8 +6345,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq) { - bfqq->allocated--; - + bfqq_request_freed(bfqq); bfq_put_queue(bfqq); } @@ -6434,6 +6569,16 @@ static void bfq_finish_requeue_request(struct request *rq) rq->elv.priv[1] = NULL; } +static void bfq_finish_request(struct request *rq) +{ + bfq_finish_requeue_request(rq); + + if (rq->elv.icq) { + put_io_context(rq->elv.icq->ioc); + rq->elv.icq = NULL; + } +} + /* * Removes the association between the current task and bfqq, assuming * that bic points to the bfq iocontext of the task. @@ -6531,6 +6676,8 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, */ static void bfq_prepare_request(struct request *rq) { + rq->elv.icq = ioc_find_get_icq(rq->q); + /* * Regardless of whether we have an icq attached, we have to * clear the scheduler pointers, as they might point to @@ -6630,7 +6777,7 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) } } - bfqq->allocated++; + bfqq_request_allocated(bfqq); bfqq->ref++; bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d", rq, bfqq, bfqq->ref); @@ -6793,11 +6940,11 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) * See the comments on bfq_limit_depth for the purpose of * the depths set in the function. Return minimum shallow depth we'll use. */ -static unsigned int bfq_update_depths(struct bfq_data *bfqd, - struct sbitmap_queue *bt) +static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) { - unsigned int i, j, min_shallow = UINT_MAX; + unsigned int depth = 1U << bt->sb.shift; + bfqd->full_depth_shift = bt->sb.shift; /* * In-word depths if no bfq_queue is being weight-raised: * leaving 25% of tags only for sync reads. @@ -6809,13 +6956,13 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd, * limit 'something'. */ /* no more than 50% of tags for async I/O */ - bfqd->word_depths[0][0] = max((1U << bt->sb.shift) >> 1, 1U); + bfqd->word_depths[0][0] = max(depth >> 1, 1U); /* * no more than 75% of tags for sync writes (25% extra tags * w.r.t. 
async I/O, to prevent async I/O from starving sync * writes) */ - bfqd->word_depths[0][1] = max(((1U << bt->sb.shift) * 3) >> 2, 1U); + bfqd->word_depths[0][1] = max((depth * 3) >> 2, 1U); /* * In-word depths in case some bfq_queue is being weight- @@ -6825,25 +6972,18 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd, * shortage. */ /* no more than ~18% of tags for async I/O */ - bfqd->word_depths[1][0] = max(((1U << bt->sb.shift) * 3) >> 4, 1U); + bfqd->word_depths[1][0] = max((depth * 3) >> 4, 1U); /* no more than ~37% of tags for sync writes (~20% extra tags) */ - bfqd->word_depths[1][1] = max(((1U << bt->sb.shift) * 6) >> 4, 1U); - - for (i = 0; i < 2; i++) - for (j = 0; j < 2; j++) - min_shallow = min(min_shallow, bfqd->word_depths[i][j]); - - return min_shallow; + bfqd->word_depths[1][1] = max((depth * 6) >> 4, 1U); } static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx) { struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; struct blk_mq_tags *tags = hctx->sched_tags; - unsigned int min_shallow; - min_shallow = bfq_update_depths(bfqd, tags->bitmap_tags); - sbitmap_queue_min_shallow_depth(tags->bitmap_tags, min_shallow); + bfq_update_depths(bfqd, &tags->bitmap_tags); + sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1); } static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index) @@ -7260,7 +7400,7 @@ static struct elevator_type iosched_bfq_mq = { .limit_depth = bfq_limit_depth, .prepare_request = bfq_prepare_request, .requeue_request = bfq_finish_requeue_request, - .finish_request = bfq_finish_requeue_request, + .finish_request = bfq_finish_request, .exit_icq = bfq_exit_icq, .insert_requests = bfq_insert_requests, .dispatch_request = bfq_dispatch_request, diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index a73488eec8..07288b9da3 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -25,7 +25,7 @@ #define BFQ_DEFAULT_GRP_IOPRIO 0 #define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE -#define MAX_PID_STR_LENGTH 12 +#define MAX_BFQQ_NAME_LENGTH 16 /* * Soft real-time applications are extremely more latency sensitive @@ -170,6 +170,9 @@ struct bfq_entity { /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ int budget; + /* Number of requests allocated in the subtree of this entity */ + int allocated; + /* device weight, if non-zero, it overrides the default weight of * bfq_group_data */ int dev_weight; @@ -266,8 +269,6 @@ struct bfq_queue { struct request *next_rq; /* number of sync and async requests queued */ int queued[2]; - /* number of requests currently allocated */ - int allocated; /* number of pending metadata requests */ int meta_pending; /* fifo list of requests in sort_list */ @@ -387,6 +388,8 @@ struct bfq_queue { struct bfq_queue *tentative_waker_bfqq; /* number of times the same tentative waker has been detected */ unsigned int num_waker_detections; + /* time when we started considering this waker */ + u64 waker_detection_started; /* node for woken_list, see below */ struct hlist_node woken_list_node; @@ -768,6 +771,7 @@ struct bfq_data { * function) */ unsigned int word_depths[2][2]; + unsigned int full_depth_shift; }; enum bfqq_state_flags { @@ -1079,26 +1083,27 @@ void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq); /* --------------- end of interface of B-WF2Q+ ---------------- */ /* Logging facilities. 
*/ -static inline void bfq_pid_to_str(int pid, char *str, int len) +static inline void bfq_bfqq_name(struct bfq_queue *bfqq, char *str, int len) { - if (pid != -1) - snprintf(str, len, "%d", pid); + char type = bfq_bfqq_sync(bfqq) ? 'S' : 'A'; + + if (bfqq->pid != -1) + snprintf(str, len, "bfq%d%c", bfqq->pid, type); else - snprintf(str, len, "SHARED-"); + snprintf(str, len, "bfqSHARED-%c", type); } #ifdef CONFIG_BFQ_GROUP_IOSCHED struct bfq_group *bfqq_group(struct bfq_queue *bfqq); #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ - char pid_str[MAX_PID_STR_LENGTH]; \ + char pid_str[MAX_BFQQ_NAME_LENGTH]; \ if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \ break; \ - bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \ + bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH); \ blk_add_cgroup_trace_msg((bfqd)->queue, \ bfqg_to_blkg(bfqq_group(bfqq))->blkcg, \ - "bfq%s%c " fmt, pid_str, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', ##args); \ + "%s " fmt, pid_str, ##args); \ } while (0) #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ @@ -1109,13 +1114,11 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq); #else /* CONFIG_BFQ_GROUP_IOSCHED */ #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ - char pid_str[MAX_PID_STR_LENGTH]; \ + char pid_str[MAX_BFQQ_NAME_LENGTH]; \ if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \ break; \ - bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \ - blk_add_trace_msg((bfqd)->queue, "bfq%s%c " fmt, pid_str, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ - ##args); \ + bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH); \ + blk_add_trace_msg((bfqd)->queue, "%s " fmt, pid_str, ##args); \ } while (0) #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 4a7c33ed9a..0827b19820 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -6,7 +6,7 @@ * Written by: Martin K. Petersen */ -#include +#include #include #include #include @@ -134,7 +134,7 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, iv = bip->bip_vec + bip->bip_vcnt; if (bip->bip_vcnt && - bvec_gap_to_prev(bio->bi_bdev->bd_disk->queue, + bvec_gap_to_prev(bdev_get_queue(bio->bi_bdev), &bip->bip_vec[bip->bip_vcnt - 1], offset)) return 0; diff --git a/block/bio.c b/block/bio.c index 25f1ed2611..4312a80853 100644 --- a/block/bio.c +++ b/block/bio.c @@ -26,7 +26,7 @@ #include "blk-rq-qos.h" struct bio_alloc_cache { - struct bio_list free_list; + struct bio *free_list; unsigned int nr; }; @@ -87,7 +87,8 @@ static struct bio_slab *create_bio_slab(unsigned int size) snprintf(bslab->name, sizeof(bslab->name), "bio-%d", size); bslab->slab = kmem_cache_create(bslab->name, size, - ARCH_KMALLOC_MINALIGN, SLAB_HWCACHE_ALIGN, NULL); + ARCH_KMALLOC_MINALIGN, + SLAB_HWCACHE_ALIGN | SLAB_TYPESAFE_BY_RCU, NULL); if (!bslab->slab) goto fail_alloc_slab; @@ -156,7 +157,7 @@ static void bio_put_slab(struct bio_set *bs) void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs) { - BIO_BUG_ON(nr_vecs > BIO_MAX_VECS); + BUG_ON(nr_vecs > BIO_MAX_VECS); if (nr_vecs == BIO_MAX_VECS) mempool_free(bv, pool); @@ -281,6 +282,7 @@ void bio_init(struct bio *bio, struct bio_vec *table, atomic_set(&bio->__bi_remaining, 1); atomic_set(&bio->__bi_cnt, 1); + bio->bi_cookie = BLK_QC_T_NONE; bio->bi_max_vecs = max_vecs; bio->bi_io_vec = table; @@ -546,7 +548,7 @@ EXPORT_SYMBOL(zero_fill_bio); * REQ_OP_READ, zero the truncated part. 
This function should only * be used for handling corner cases, such as bio eod. */ -void bio_truncate(struct bio *bio, unsigned new_size) +static void bio_truncate(struct bio *bio, unsigned new_size) { struct bio_vec bv; struct bvec_iter iter; @@ -629,7 +631,8 @@ static void bio_alloc_cache_prune(struct bio_alloc_cache *cache, unsigned int i = 0; struct bio *bio; - while ((bio = bio_list_pop(&cache->free_list)) != NULL) { + while ((bio = cache->free_list) != NULL) { + cache->free_list = bio->bi_next; cache->nr--; bio_free(bio); if (++i == nr) @@ -678,7 +681,7 @@ static void bio_alloc_cache_destroy(struct bio_set *bs) void bio_put(struct bio *bio) { if (unlikely(bio_flagged(bio, BIO_REFFED))) { - BIO_BUG_ON(!atomic_read(&bio->__bi_cnt)); + BUG_ON(!atomic_read(&bio->__bi_cnt)); if (!atomic_dec_and_test(&bio->__bi_cnt)) return; } @@ -688,7 +691,8 @@ void bio_put(struct bio *bio) bio_uninit(bio); cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu()); - bio_list_add_head(&cache->free_list, bio); + bio->bi_next = cache->free_list; + cache->free_list = bio; if (++cache->nr > ALLOC_CACHE_MAX + ALLOC_CACHE_SLACK) bio_alloc_cache_prune(cache, ALLOC_CACHE_SLACK); put_cpu(); @@ -773,6 +777,23 @@ const char *bio_devname(struct bio *bio, char *buf) } EXPORT_SYMBOL(bio_devname); +/** + * bio_full - check if the bio is full + * @bio: bio to check + * @len: length of one segment to be added + * + * Return true if @bio is full and one segment with @len bytes can't be + * added to the bio, otherwise return false + */ +static inline bool bio_full(struct bio *bio, unsigned len) +{ + if (bio->bi_vcnt >= bio->bi_max_vecs) + return true; + if (bio->bi_iter.bi_size > UINT_MAX - len) + return true; + return false; +} + static inline bool page_is_mergeable(const struct bio_vec *bv, struct page *page, unsigned int len, unsigned int off, bool *same_page) @@ -792,6 +813,44 @@ static inline bool page_is_mergeable(const struct bio_vec *bv, return (bv->bv_page + bv_end / PAGE_SIZE) == (page + off / PAGE_SIZE); } +/** + * __bio_try_merge_page - try appending data to an existing bvec. + * @bio: destination bio + * @page: start page to add + * @len: length of the data to add + * @off: offset of the data relative to @page + * @same_page: return if the segment has been merged inside the same page + * + * Try to add the data at @page + @off to the last bvec of @bio. This is a + * useful optimisation for file systems with a block size smaller than the + * page size. + * + * Warn if (@len, @off) crosses pages in case that @same_page is true. + * + * Return %true on success or %false on failure. + */ +static bool __bio_try_merge_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int off, bool *same_page) +{ + if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) + return false; + + if (bio->bi_vcnt > 0) { + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; + + if (page_is_mergeable(bv, page, len, off, same_page)) { + if (bio->bi_iter.bi_size > UINT_MAX - len) { + *same_page = false; + return false; + } + bv->bv_len += len; + bio->bi_iter.bi_size += len; + return true; + } + } + return false; +} + /* * Try to merge a page into a segment, while obeying the hardware segment * size limit. 
This is not for normal read/write bios, but for passthrough @@ -909,7 +968,7 @@ EXPORT_SYMBOL(bio_add_pc_page); int bio_add_zone_append_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { - struct request_queue *q = bio->bi_bdev->bd_disk->queue; + struct request_queue *q = bdev_get_queue(bio->bi_bdev); bool same_page = false; if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_ZONE_APPEND)) @@ -923,45 +982,6 @@ int bio_add_zone_append_page(struct bio *bio, struct page *page, } EXPORT_SYMBOL_GPL(bio_add_zone_append_page); -/** - * __bio_try_merge_page - try appending data to an existing bvec. - * @bio: destination bio - * @page: start page to add - * @len: length of the data to add - * @off: offset of the data relative to @page - * @same_page: return if the segment has been merged inside the same page - * - * Try to add the data at @page + @off to the last bvec of @bio. This is a - * useful optimisation for file systems with a block size smaller than the - * page size. - * - * Warn if (@len, @off) crosses pages in case that @same_page is true. - * - * Return %true on success or %false on failure. - */ -bool __bio_try_merge_page(struct bio *bio, struct page *page, - unsigned int len, unsigned int off, bool *same_page) -{ - if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) - return false; - - if (bio->bi_vcnt > 0) { - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; - - if (page_is_mergeable(bv, page, len, off, same_page)) { - if (bio->bi_iter.bi_size > UINT_MAX - len) { - *same_page = false; - return false; - } - bv->bv_len += len; - bio->bi_iter.bi_size += len; - return true; - } - } - return false; -} -EXPORT_SYMBOL_GPL(__bio_try_merge_page); - /** * __bio_add_page - add page(s) to a bio in a new segment * @bio: destination bio @@ -1016,52 +1036,62 @@ int bio_add_page(struct bio *bio, struct page *page, } EXPORT_SYMBOL(bio_add_page); -void bio_release_pages(struct bio *bio, bool mark_dirty) +/** + * bio_add_folio - Attempt to add part of a folio to a bio. + * @bio: BIO to add to. + * @folio: Folio to add. + * @len: How many bytes from the folio to add. + * @off: First byte in this folio to add. + * + * Filesystems that use folios can call this function instead of calling + * bio_add_page() for each page in the folio. If @off is bigger than + * PAGE_SIZE, this function can create a bio_vec that starts in a page + * after the bv_page. BIOs do not support folios that are 4GiB or larger. + * + * Return: Whether the addition was successful. 
+ */ +bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len, + size_t off) +{ + if (len > UINT_MAX || off > UINT_MAX) + return 0; + return bio_add_page(bio, &folio->page, len, off) > 0; +} + +void __bio_release_pages(struct bio *bio, bool mark_dirty) { struct bvec_iter_all iter_all; struct bio_vec *bvec; - if (bio_flagged(bio, BIO_NO_PAGE_REF)) - return; - bio_for_each_segment_all(bvec, bio, iter_all) { if (mark_dirty && !PageCompound(bvec->bv_page)) set_page_dirty_lock(bvec->bv_page); put_page(bvec->bv_page); } } -EXPORT_SYMBOL_GPL(bio_release_pages); +EXPORT_SYMBOL_GPL(__bio_release_pages); -static void __bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) +void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) { + size_t size = iov_iter_count(iter); + WARN_ON_ONCE(bio->bi_max_vecs); + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + size_t max_sectors = queue_max_zone_append_sectors(q); + + size = min(size, max_sectors << SECTOR_SHIFT); + } + bio->bi_vcnt = iter->nr_segs; bio->bi_io_vec = (struct bio_vec *)iter->bvec; bio->bi_iter.bi_bvec_done = iter->iov_offset; - bio->bi_iter.bi_size = iter->count; + bio->bi_iter.bi_size = size; bio_set_flag(bio, BIO_NO_PAGE_REF); bio_set_flag(bio, BIO_CLONED); } -static int bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) -{ - __bio_iov_bvec_set(bio, iter); - iov_iter_advance(iter, iter->count); - return 0; -} - -static int bio_iov_bvec_set_append(struct bio *bio, struct iov_iter *iter) -{ - struct request_queue *q = bio->bi_bdev->bd_disk->queue; - struct iov_iter i = *iter; - - iov_iter_truncate(&i, queue_max_zone_append_sectors(q) << 9); - __bio_iov_bvec_set(bio, &i); - iov_iter_advance(iter, i.count); - return 0; -} - static void bio_put_pages(struct page **pages, size_t size, size_t off) { size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE); @@ -1131,7 +1161,7 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) { unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; - struct request_queue *q = bio->bi_bdev->bd_disk->queue; + struct request_queue *q = bdev_get_queue(bio->bi_bdev); unsigned int max_append_sectors = queue_max_zone_append_sectors(q); struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; struct page **pages = (struct page **)bv; @@ -1203,9 +1233,9 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) int ret = 0; if (iov_iter_is_bvec(iter)) { - if (bio_op(bio) == REQ_OP_ZONE_APPEND) - return bio_iov_bvec_set_append(bio, iter); - return bio_iov_bvec_set(bio, iter); + bio_iov_bvec_set(bio, iter); + iov_iter_advance(iter, bio->bi_iter.bi_size); + return 0; } do { @@ -1261,18 +1291,7 @@ int submit_bio_wait(struct bio *bio) } EXPORT_SYMBOL(submit_bio_wait); -/** - * bio_advance - increment/complete a bio by some number of bytes - * @bio: bio to advance - * @bytes: number of bytes to complete - * - * This updates bi_sector, bi_size and bi_idx; if the number of bytes to - * complete doesn't align with a bvec boundary, then bv_len and bv_offset will - * be updated on the last bvec as well. - * - * @bio will then represent the remaining, uncompleted portion of the io. 
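A sketch of how a filesystem write path might use the new bio_add_folio() helper. The functions below are hypothetical, and the two-argument bio_alloc() form is an assumption about the interface at this point in the tree, not something shown in this diff:

#include <linux/bio.h>
#include <linux/pagemap.h>

static void example_write_end_io(struct bio *bio)
{
        /* hypothetical completion: release the bio, folio handling omitted */
        bio_put(bio);
}

static void example_write_folio(struct block_device *bdev, struct folio *folio,
                                sector_t sector)
{
        struct bio *bio = bio_alloc(GFP_NOFS, 1);

        bio_set_dev(bio, bdev);
        bio->bi_iter.bi_sector = sector;
        bio->bi_opf = REQ_OP_WRITE;
        bio->bi_end_io = example_write_end_io;

        /* returns false if the folio data cannot be appended to this bio */
        if (!bio_add_folio(bio, folio, folio_size(folio), 0)) {
                bio_put(bio);
                return;
        }
        submit_bio(bio);
}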
- */ -void bio_advance(struct bio *bio, unsigned bytes) +void __bio_advance(struct bio *bio, unsigned bytes) { if (bio_integrity(bio)) bio_integrity_advance(bio, bytes); @@ -1280,7 +1299,7 @@ void bio_advance(struct bio *bio, unsigned bytes) bio_crypt_advance(bio, bytes); bio_advance_iter(bio, &bio->bi_iter, bytes); } -EXPORT_SYMBOL(bio_advance); +EXPORT_SYMBOL(__bio_advance); void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, struct bio *src, struct bvec_iter *src_iter) @@ -1468,10 +1487,10 @@ void bio_endio(struct bio *bio) return; if (bio->bi_bdev && bio_flagged(bio, BIO_TRACKED)) - rq_qos_done_bio(bio->bi_bdev->bd_disk->queue, bio); + rq_qos_done_bio(bdev_get_queue(bio->bi_bdev), bio); if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { - trace_block_bio_complete(bio->bi_bdev->bd_disk->queue, bio); + trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio); bio_clear_flag(bio, BIO_TRACE_COMPLETION); } @@ -1710,8 +1729,9 @@ struct bio *bio_alloc_kiocb(struct kiocb *kiocb, unsigned short nr_vecs, return bio_alloc_bioset(GFP_KERNEL, nr_vecs, bs); cache = per_cpu_ptr(bs->cache, get_cpu()); - bio = bio_list_pop(&cache->free_list); - if (bio) { + if (cache->free_list) { + bio = cache->free_list; + cache->free_list = bio->bi_next; cache->nr--; put_cpu(); bio_init(bio, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 0eec59e4df..650f7e2798 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -30,8 +30,10 @@ #include #include #include +#include #include "blk.h" #include "blk-ioprio.h" +#include "blk-throttle.h" /* * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation. @@ -620,7 +622,7 @@ struct block_device *blkcg_conf_open_bdev(char **inputp) */ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, char *input, struct blkg_conf_ctx *ctx) - __acquires(rcu) __acquires(&bdev->bd_disk->queue->queue_lock) + __acquires(rcu) __acquires(&bdev->bd_queue->queue_lock) { struct block_device *bdev; struct request_queue *q; @@ -631,7 +633,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, if (IS_ERR(bdev)) return PTR_ERR(bdev); - q = bdev->bd_disk->queue; + q = bdev_get_queue(bdev); /* * blkcg_deactivate_policy() requires queue to be frozen, we can grab @@ -747,9 +749,9 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep); * with blkg_conf_prep(). 
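The blkg_conf_prep()/blkg_conf_finish() pair touched here is consumed by blkcg policies (blk-iolatency, blk-iocost and friends) in their per-device configuration writers. A sketch of that calling pattern; blkcg_policy_foo and the parsing of ctx.body are hypothetical:

#include <linux/cgroup.h>
#include <linux/blk-cgroup.h>

static struct blkcg_policy blkcg_policy_foo;    /* hypothetical policy */

static ssize_t foo_set_limit(struct kernfs_open_file *of, char *buf,
                             size_t nbytes, loff_t off)
{
        struct blkcg *blkcg = css_to_blkcg(of_css(of));
        struct blkg_conf_ctx ctx;
        int ret;

        /* parses "MAJ:MIN ...", opens the bdev and locks its queue */
        ret = blkg_conf_prep(blkcg, &blkcg_policy_foo, buf, &ctx);
        if (ret)
                return ret;

        /*
         * ctx.blkg is the blkcg_gq for this cgroup/device pair and
         * ctx.body the remainder of the input line.
         * ... update per-device state from ctx.body here ...
         */

        /* drops the queue lock and the bdev reference taken by prep */
        blkg_conf_finish(&ctx);
        return nbytes;
}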
*/ void blkg_conf_finish(struct blkg_conf_ctx *ctx) - __releases(&ctx->bdev->bd_disk->queue->queue_lock) __releases(rcu) + __releases(&ctx->bdev->bd_queue->queue_lock) __releases(rcu) { - spin_unlock_irq(&ctx->bdev->bd_disk->queue->queue_lock); + spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock); rcu_read_unlock(); blkdev_put_no_open(ctx->bdev); } @@ -852,7 +854,7 @@ static void blkcg_fill_root_iostats(void) while ((dev = class_dev_iter_next(&iter))) { struct block_device *bdev = dev_to_bdev(dev); struct blkcg_gq *blkg = - blk_queue_root_blkg(bdev->bd_disk->queue); + blk_queue_root_blkg(bdev_get_queue(bdev)); struct blkg_iostat tmp; int cpu; @@ -1811,7 +1813,7 @@ static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio, rcu_read_lock(); blkg = blkg_lookup_create(css_to_blkcg(css), - bio->bi_bdev->bd_disk->queue); + bdev_get_queue(bio->bi_bdev)); while (blkg) { if (blkg_tryget(blkg)) { ret_blkg = blkg; @@ -1847,8 +1849,8 @@ void bio_associate_blkg_from_css(struct bio *bio, if (css && css->parent) { bio->bi_blkg = blkg_tryget_closest(bio, css); } else { - blkg_get(bio->bi_bdev->bd_disk->queue->root_blkg); - bio->bi_blkg = bio->bi_bdev->bd_disk->queue->root_blkg; + blkg_get(bdev_get_queue(bio->bi_bdev)->root_blkg); + bio->bi_blkg = bdev_get_queue(bio->bi_bdev)->root_blkg; } } EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); diff --git a/block/blk-core.c b/block/blk-core.c index 42ac3a985c..779b4a1f66 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -16,8 +16,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -46,9 +47,10 @@ #include #include "blk.h" -#include "blk-mq.h" #include "blk-mq-sched.h" #include "blk-pm.h" +#include "blk-throttle.h" +#include "blk-rq-qos.h" struct dentry *blk_debugfs_root; @@ -65,6 +67,7 @@ DEFINE_IDA(blk_queue_ida); * For queue allocation */ struct kmem_cache *blk_requestq_cachep; +struct kmem_cache *blk_requestq_srcu_cachep; /* * Controlling structure to kblockd @@ -107,23 +110,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set); -void blk_rq_init(struct request_queue *q, struct request *rq) -{ - memset(rq, 0, sizeof(*rq)); - - INIT_LIST_HEAD(&rq->queuelist); - rq->q = q; - rq->__sector = (sector_t) -1; - INIT_HLIST_NODE(&rq->hash); - RB_CLEAR_NODE(&rq->rb_node); - rq->tag = BLK_MQ_NO_TAG; - rq->internal_tag = BLK_MQ_NO_TAG; - rq->start_time_ns = ktime_get_ns(); - rq->part = NULL; - blk_crypto_rq_set_defaults(rq); -} -EXPORT_SYMBOL(blk_rq_init); - #define REQ_OP_NAME(name) [REQ_OP_##name] = #name static const char *const blk_op_name[] = { REQ_OP_NAME(READ), @@ -214,66 +200,15 @@ int blk_status_to_errno(blk_status_t status) } EXPORT_SYMBOL_GPL(blk_status_to_errno); -static void print_req_error(struct request *req, blk_status_t status, - const char *caller) +const char *blk_status_to_str(blk_status_t status) { int idx = (__force int)status; if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) - return; - - printk_ratelimited(KERN_ERR - "%s: %s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x " - "phys_seg %u prio class %u\n", - caller, blk_errors[idx].name, - req->rq_disk ? 
req->rq_disk->disk_name : "?", - blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)), - req->cmd_flags & ~REQ_OP_MASK, - req->nr_phys_segments, - IOPRIO_PRIO_CLASS(req->ioprio)); + return ""; + return blk_errors[idx].name; } -static void req_bio_endio(struct request *rq, struct bio *bio, - unsigned int nbytes, blk_status_t error) -{ - if (error) - bio->bi_status = error; - - if (unlikely(rq->rq_flags & RQF_QUIET)) - bio_set_flag(bio, BIO_QUIET); - - bio_advance(bio, nbytes); - - if (req_op(rq) == REQ_OP_ZONE_APPEND && error == BLK_STS_OK) { - /* - * Partial zone append completions cannot be supported as the - * BIO fragments may end up not being written sequentially. - */ - if (bio->bi_iter.bi_size) - bio->bi_status = BLK_STS_IOERR; - else - bio->bi_iter.bi_sector = rq->__sector; - } - - /* don't actually finish bio if it's part of flush sequence */ - if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ)) - bio_endio(bio); -} - -void blk_dump_rq_flags(struct request *rq, char *msg) -{ - printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg, - rq->rq_disk ? rq->rq_disk->disk_name : "?", - (unsigned long long) rq->cmd_flags); - - printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", - (unsigned long long)blk_rq_pos(rq), - blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); - printk(KERN_INFO " bio %p, biotail %p, len %u\n", - rq->bio, rq->biotail, blk_rq_bytes(rq)); -} -EXPORT_SYMBOL(blk_dump_rq_flags); - /** * blk_sync_queue - cancel any pending callbacks on a queue * @q: the queue @@ -380,6 +315,9 @@ void blk_cleanup_queue(struct request_queue *q) */ blk_freeze_queue(q); + /* cleanup rq qos structures for queue without disk */ + rq_qos_exit(q); + blk_queue_flag_set(QUEUE_FLAG_DEAD, q); blk_sync_queue(q); @@ -398,7 +336,7 @@ void blk_cleanup_queue(struct request_queue *q) */ mutex_lock(&q->sysfs_lock); if (q->elevator) - blk_mq_sched_free_requests(q); + blk_mq_sched_free_rqs(q); mutex_unlock(&q->sysfs_lock); percpu_ref_exit(&q->q_usage_counter); @@ -408,30 +346,6 @@ void blk_cleanup_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_cleanup_queue); -static bool blk_try_enter_queue(struct request_queue *q, bool pm) -{ - rcu_read_lock(); - if (!percpu_ref_tryget_live(&q->q_usage_counter)) - goto fail; - - /* - * The code that increments the pm_only counter must ensure that the - * counter is globally visible before the queue is unfrozen. 
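blk_status_to_str() above gives code inside the block layer a printable name for a blk_status_t, while drivers and filesystems keep translating through the exported blk_status_to_errno(). A hypothetical synchronous caller's completion handler, for illustration only:

#include <linux/bio.h>
#include <linux/completion.h>

struct example_io {                     /* hypothetical caller context */
        struct completion done;
        int error;
};

static void example_end_io(struct bio *bio)
{
        struct example_io *io = bio->bi_private;

        io->error = blk_status_to_errno(bio->bi_status);
        complete(&io->done);
        bio_put(bio);
}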
- */ - if (blk_queue_pm_only(q) && - (!pm || queue_rpm_status(q) == RPM_SUSPENDED)) - goto fail_put; - - rcu_read_unlock(); - return true; - -fail_put: - percpu_ref_put(&q->q_usage_counter); -fail: - rcu_read_unlock(); - return false; -} - /** * blk_queue_enter() - try to increase q->q_usage_counter * @q: request queue pointer @@ -464,12 +378,11 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) return 0; } -static inline int bio_queue_enter(struct bio *bio) +int __bio_queue_enter(struct request_queue *q, struct bio *bio) { - struct gendisk *disk = bio->bi_bdev->bd_disk; - struct request_queue *q = disk->queue; - while (!blk_try_enter_queue(q, false)) { + struct gendisk *disk = bio->bi_bdev->bd_disk; + if (bio->bi_opf & REQ_NOWAIT) { if (test_bit(GD_DEAD, &disk->state)) goto dead; @@ -523,21 +436,27 @@ static void blk_timeout_work(struct work_struct *work) { } -struct request_queue *blk_alloc_queue(int node_id) +struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu) { struct request_queue *q; int ret; - q = kmem_cache_alloc_node(blk_requestq_cachep, - GFP_KERNEL | __GFP_ZERO, node_id); + q = kmem_cache_alloc_node(blk_get_queue_kmem_cache(alloc_srcu), + GFP_KERNEL | __GFP_ZERO, node_id); if (!q) return NULL; + if (alloc_srcu) { + blk_queue_flag_set(QUEUE_FLAG_HAS_SRCU, q); + if (init_srcu_struct(q->srcu) != 0) + goto fail_q; + } + q->last_merge = NULL; q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL); if (q->id < 0) - goto fail_q; + goto fail_srcu; ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, 0); if (ret) @@ -549,7 +468,7 @@ struct request_queue *blk_alloc_queue(int node_id) q->node = node_id; - atomic_set(&q->nr_active_requests_shared_sbitmap, 0); + atomic_set(&q->nr_active_requests_shared_tags, 0); timer_setup(&q->timeout, blk_rq_timed_out_timer, 0); INIT_WORK(&q->timeout_work, blk_timeout_work); @@ -582,7 +501,7 @@ struct request_queue *blk_alloc_queue(int node_id) blk_queue_dma_alignment(q, 511); blk_set_default_limits(&q->limits); - q->nr_requests = BLKDEV_MAX_RQ; + q->nr_requests = BLKDEV_DEFAULT_RQ; return q; @@ -594,8 +513,11 @@ struct request_queue *blk_alloc_queue(int node_id) bioset_exit(&q->bio_split); fail_id: ida_simple_remove(&blk_queue_ida, q->id); +fail_srcu: + if (alloc_srcu) + cleanup_srcu_struct(q->srcu); fail_q: - kmem_cache_free(blk_requestq_cachep, q); + kmem_cache_free(blk_get_queue_kmem_cache(alloc_srcu), q); return NULL; } @@ -618,40 +540,13 @@ bool blk_get_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_get_queue); -/** - * blk_get_request - allocate a request - * @q: request queue to allocate a request for - * @op: operation (REQ_OP_*) and REQ_* flags, e.g. REQ_SYNC. - * @flags: BLK_MQ_REQ_* flags, e.g. BLK_MQ_REQ_NOWAIT. 
- */ -struct request *blk_get_request(struct request_queue *q, unsigned int op, - blk_mq_req_flags_t flags) -{ - struct request *req; - - WARN_ON_ONCE(op & REQ_NOWAIT); - WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PM)); - - req = blk_mq_alloc_request(q, op, flags); - if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn) - q->mq_ops->initialize_rq_fn(req); - - return req; -} -EXPORT_SYMBOL(blk_get_request); - -void blk_put_request(struct request *req) -{ - blk_mq_free_request(req); -} -EXPORT_SYMBOL(blk_put_request); - static void handle_bad_sector(struct bio *bio, sector_t maxsector) { char b[BDEVNAME_SIZE]; - pr_info_ratelimited("attempt to access beyond end of device\n" + pr_info_ratelimited("%s: attempt to access beyond end of device\n" "%s: rw=%d, want=%llu, limit=%llu\n", + current->comm, bio_devname(bio, b), bio->bi_opf, bio_end_sector(bio), maxsector); } @@ -666,7 +561,7 @@ static int __init setup_fail_make_request(char *str) } __setup("fail_make_request=", setup_fail_make_request); -static bool should_fail_request(struct block_device *part, unsigned int bytes) +bool should_fail_request(struct block_device *part, unsigned int bytes) { return part->bd_make_it_fail && should_fail(&fail_make_request, bytes); } @@ -680,15 +575,6 @@ static int __init fail_make_request_debugfs(void) } late_initcall(fail_make_request_debugfs); - -#else /* CONFIG_FAIL_MAKE_REQUEST */ - -static inline bool should_fail_request(struct block_device *part, - unsigned int bytes) -{ - return false; -} - #endif /* CONFIG_FAIL_MAKE_REQUEST */ static inline bool bio_check_ro(struct bio *bio) @@ -790,10 +676,10 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, return BLK_STS_OK; } -static noinline_for_stack bool submit_bio_checks(struct bio *bio) +noinline_for_stack bool submit_bio_checks(struct bio *bio) { struct block_device *bdev = bio->bi_bdev; - struct request_queue *q = bdev->bd_disk->queue; + struct request_queue *q = bdev_get_queue(bdev); blk_status_t status = BLK_STS_IOERR; struct blk_plug *plug; @@ -835,7 +721,7 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) } if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) - bio_clear_hipri(bio); + bio_clear_polled(bio); switch (bio_op(bio)) { case REQ_OP_DISCARD: @@ -874,15 +760,6 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) break; } - /* - * Various block parts want %current->io_context, so allocate it up - * front rather than dealing with lots of pain to allocate it only - * where needed. This may fail and the block layer knows how to live - * with it. 
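With blk_get_request()/blk_put_request() removed above, passthrough users allocate from blk-mq directly. A trimmed-down sketch; payload setup and execution are omitted, and REQ_OP_DRV_OUT is just an example op:

#include <linux/blk-mq.h>

static int example_alloc_passthrough(struct request_queue *q)
{
        struct request *rq;

        rq = blk_mq_alloc_request(q, REQ_OP_DRV_OUT, BLK_MQ_REQ_NOWAIT);
        if (IS_ERR(rq))
                return PTR_ERR(rq);

        /* ... fill in the driver-specific payload and execute it here ... */

        blk_mq_free_request(rq);
        return 0;
}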
- */ - if (unlikely(!current->io_context)) - create_task_io_context(current, GFP_ATOMIC, q->node); - if (blk_throtl_bio(bio)) return false; @@ -906,25 +783,27 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio) return false; } -static blk_qc_t __submit_bio(struct bio *bio) +static void __submit_bio_fops(struct gendisk *disk, struct bio *bio) +{ + if (blk_crypto_bio_prep(&bio)) { + if (likely(bio_queue_enter(bio) == 0)) { + disk->fops->submit_bio(bio); + blk_queue_exit(disk->queue); + } + } +} + +static void __submit_bio(struct bio *bio) { struct gendisk *disk = bio->bi_bdev->bd_disk; - blk_qc_t ret = BLK_QC_T_NONE; - if (unlikely(bio_queue_enter(bio) != 0)) - return BLK_QC_T_NONE; + if (unlikely(!submit_bio_checks(bio))) + return; - if (!submit_bio_checks(bio) || !blk_crypto_bio_prep(&bio)) - goto queue_exit; - if (disk->fops->submit_bio) { - ret = disk->fops->submit_bio(bio); - goto queue_exit; - } - return blk_mq_submit_bio(bio); - -queue_exit: - blk_queue_exit(disk->queue); - return ret; + if (!disk->fops->submit_bio) + blk_mq_submit_bio(bio); + else + __submit_bio_fops(disk, bio); } /* @@ -946,10 +825,9 @@ static blk_qc_t __submit_bio(struct bio *bio) * bio_list_on_stack[1] contains bios that were submitted before the current * ->submit_bio_bio, but that haven't been processed yet. */ -static blk_qc_t __submit_bio_noacct(struct bio *bio) +static void __submit_bio_noacct(struct bio *bio) { struct bio_list bio_list_on_stack[2]; - blk_qc_t ret = BLK_QC_T_NONE; BUG_ON(bio->bi_next); @@ -957,7 +835,7 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) current->bio_list = bio_list_on_stack; do { - struct request_queue *q = bio->bi_bdev->bd_disk->queue; + struct request_queue *q = bdev_get_queue(bio->bi_bdev); struct bio_list lower, same; /* @@ -966,7 +844,7 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) bio_list_on_stack[1] = bio_list_on_stack[0]; bio_list_init(&bio_list_on_stack[0]); - ret = __submit_bio(bio); + __submit_bio(bio); /* * Sort new bios into those for a lower level and those for the @@ -975,7 +853,7 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) bio_list_init(&lower); bio_list_init(&same); while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL) - if (q == bio->bi_bdev->bd_disk->queue) + if (q == bdev_get_queue(bio->bi_bdev)) bio_list_add(&same, bio); else bio_list_add(&lower, bio); @@ -989,22 +867,19 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio) } while ((bio = bio_list_pop(&bio_list_on_stack[0]))); current->bio_list = NULL; - return ret; } -static blk_qc_t __submit_bio_noacct_mq(struct bio *bio) +static void __submit_bio_noacct_mq(struct bio *bio) { struct bio_list bio_list[2] = { }; - blk_qc_t ret; current->bio_list = bio_list; do { - ret = __submit_bio(bio); + __submit_bio(bio); } while ((bio = bio_list_pop(&bio_list[0]))); current->bio_list = NULL; - return ret; } /** @@ -1016,7 +891,7 @@ static blk_qc_t __submit_bio_noacct_mq(struct bio *bio) * systems and other upper level users of the block layer should use * submit_bio() instead. */ -blk_qc_t submit_bio_noacct(struct bio *bio) +void submit_bio_noacct(struct bio *bio) { /* * We only want one ->submit_bio to be active at a time, else stack @@ -1024,14 +899,12 @@ blk_qc_t submit_bio_noacct(struct bio *bio) * to collect a list of requests submited by a ->submit_bio method while * it is active, and then process them after it returned. 
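The on-stack bio list described above exists mainly for stacking drivers, whose submit_bio handler remaps a bio and feeds it back into submit_bio_noacct(); the list turns what would otherwise be recursion per stacking layer into iteration. A purely illustrative remapping driver, all names hypothetical:

#include <linux/blkdev.h>

struct example_stack_dev {
        struct block_device *lower_bdev;
        sector_t data_offset;
};

static void example_stack_submit_bio(struct bio *bio)
{
        struct example_stack_dev *dev = bio->bi_bdev->bd_disk->private_data;

        bio_set_dev(bio, dev->lower_bdev);
        bio->bi_iter.bi_sector += dev->data_offset;

        /* queued on current->bio_list and processed iteratively */
        submit_bio_noacct(bio);
}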
*/ - if (current->bio_list) { + if (current->bio_list) bio_list_add(&current->bio_list[0], bio); - return BLK_QC_T_NONE; - } - - if (!bio->bi_bdev->bd_disk->fops->submit_bio) - return __submit_bio_noacct_mq(bio); - return __submit_bio_noacct(bio); + else if (!bio->bi_bdev->bd_disk->fops->submit_bio) + __submit_bio_noacct_mq(bio); + else + __submit_bio_noacct(bio); } EXPORT_SYMBOL(submit_bio_noacct); @@ -1048,10 +921,10 @@ EXPORT_SYMBOL(submit_bio_noacct); * in @bio. The bio must NOT be touched by the caller until ->bi_end_io() has * been called. */ -blk_qc_t submit_bio(struct bio *bio) +void submit_bio(struct bio *bio) { if (blkcg_punt_bio_submit(bio)) - return BLK_QC_T_NONE; + return; /* * If it's a regular read/write or a barrier with data attached, @@ -1062,7 +935,7 @@ if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) count = queue_logical_block_size( - bio->bi_bdev->bd_disk->queue) >> 9; + bdev_get_queue(bio->bi_bdev)) >> 9; else count = bio_sectors(bio); @@ -1083,149 +956,94 @@ if (unlikely(bio_op(bio) == REQ_OP_READ && bio_flagged(bio, BIO_WORKINGSET))) { unsigned long pflags; - blk_qc_t ret; psi_memstall_enter(&pflags); - ret = submit_bio_noacct(bio); + submit_bio_noacct(bio); psi_memstall_leave(&pflags); - - return ret; + return; } - return submit_bio_noacct(bio); + submit_bio_noacct(bio); } EXPORT_SYMBOL(submit_bio); /** - * blk_cloned_rq_check_limits - Helper function to check a cloned request - * for the new queue limits - * @q: the queue - * @rq: the request being checked + * bio_poll - poll for BIO completions + * @bio: bio to poll for + * @iob: batches of IO + * @flags: BLK_POLL_* flags that control the behavior * - * Description: - * @rq may have been made based on weaker limitations of upper-level queues - * in request stacking drivers, and it may violate the limitation of @q. - * Since the block layer and the underlying device driver trust @rq - * after it is inserted to @q, it should be checked against @q before - * the insertion using this generic function. + * Poll for completions on the queue associated with the bio. Returns number of + * completed entries found. * - * Request stacking drivers like request-based dm may change the queue - * limits when retrying requests on other queues. Those requests need - * to be checked against the new queue limits again during dispatch. + * Note: the caller must either be the context that submitted @bio, or + * be in an RCU critical section to prevent freeing of @bio. */ -static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q, - struct request *rq) +int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) { - unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + blk_qc_t cookie = READ_ONCE(bio->bi_cookie); + int ret; - if (blk_rq_sectors(rq) > max_sectors) { - /* - * SCSI device does not have a good way to return if - * Write Same/Zero is actually supported. If a device rejects - * a non-read/write command (discard, write same,etc.) the - * low-level device driver will set the relevant queue limit to - * 0 to prevent blk-lib from issuing more of the offending - * operations. Commands queued prior to the queue limit being - * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O - * errors being propagated to upper layers.
- */ - if (max_sectors == 0) - return BLK_STS_NOTSUPP; + if (cookie == BLK_QC_T_NONE || + !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + return 0; - printk(KERN_ERR "%s: over max size limit. (%u > %u)\n", - __func__, blk_rq_sectors(rq), max_sectors); - return BLK_STS_IOERR; - } + if (current->plug) + blk_flush_plug(current->plug, false); - /* - * The queue settings related to segment counting may differ from the - * original queue. - */ - rq->nr_phys_segments = blk_recalc_rq_segments(rq); - if (rq->nr_phys_segments > queue_max_segments(q)) { - printk(KERN_ERR "%s: over max segments limit. (%hu > %hu)\n", - __func__, rq->nr_phys_segments, queue_max_segments(q)); - return BLK_STS_IOERR; - } - - return BLK_STS_OK; + if (blk_queue_enter(q, BLK_MQ_REQ_NOWAIT)) + return 0; + if (WARN_ON_ONCE(!queue_is_mq(q))) + ret = 0; /* not yet implemented, should not happen */ + else + ret = blk_mq_poll(q, cookie, iob, flags); + blk_queue_exit(q); + return ret; } +EXPORT_SYMBOL_GPL(bio_poll); -/** - * blk_insert_cloned_request - Helper for stacking drivers to submit a request - * @q: the queue to submit the request - * @rq: the request being queued +/* + * Helper to implement file_operations.iopoll. Requires the bio to be stored + * in iocb->private, and cleared before freeing the bio. */ -blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) +int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob, + unsigned int flags) { - blk_status_t ret; - - ret = blk_cloned_rq_check_limits(q, rq); - if (ret != BLK_STS_OK) - return ret; - - if (rq->rq_disk && - should_fail_request(rq->rq_disk->part0, blk_rq_bytes(rq))) - return BLK_STS_IOERR; - - if (blk_crypto_insert_cloned_request(rq)) - return BLK_STS_IOERR; - - if (blk_queue_io_stat(q)) - blk_account_io_start(rq); - - /* - * Since we have a scheduler attached on the top device, - * bypass a potential scheduler on the bottom device for - * insert. - */ - return blk_mq_request_issue_directly(rq, true); -} -EXPORT_SYMBOL_GPL(blk_insert_cloned_request); - -/** - * blk_rq_err_bytes - determine number of bytes till the next failure boundary - * @rq: request to examine - * - * Description: - * A request could be merge of IOs which require different failure - * handling. This function determines the number of bytes which - * can be failed from the beginning of the request without - * crossing into area which need to be retried further. - * - * Return: - * The number of bytes to fail. - */ -unsigned int blk_rq_err_bytes(const struct request *rq) -{ - unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK; - unsigned int bytes = 0; struct bio *bio; - - if (!(rq->rq_flags & RQF_MIXED_MERGE)) - return blk_rq_bytes(rq); + int ret = 0; /* - * Currently the only 'mixing' which can happen is between - * different fastfail types. We can safely fail portions - * which have all the failfast bits that the first one has - - * the ones which are at least as eager to fail as the first - * one. + * Note: the bio cache only uses SLAB_TYPESAFE_BY_RCU, so bio can + * point to a freshly allocated bio at this point. If that happens + * we have a few cases to consider: + * + * 1) the bio is being initialized and bi_bdev is NULL. We can + * simply do nothing in this case + * 2) the bio points to a not poll enabled device. bio_poll will catch + * this and return 0 + * 3) the bio points to a poll capable device, including but not + * limited to the one that the original bio pointed to.
In this + * case we will call into the actual poll method and poll for I/O, + * even if we don't need to, but it won't cause harm either. + * + * For cases 2) and 3) above the RCU grace period ensures that bi_bdev + * is still allocated. Because partitions hold a reference to the whole + * device bdev and thus disk, the disk is also still valid. Grabbing + * a reference to the queue in bio_poll() ensures the hctxs and requests + * are still valid as well. */ - for (bio = rq->bio; bio; bio = bio->bi_next) { - if ((bio->bi_opf & ff) != ff) - break; - bytes += bio->bi_iter.bi_size; - } + rcu_read_lock(); + bio = READ_ONCE(kiocb->private); + if (bio && bio->bi_bdev) + ret = bio_poll(bio, iob, flags); + rcu_read_unlock(); - /* this could lead to infinite loop */ - BUG_ON(blk_rq_bytes(rq) && !bytes); - return bytes; + return ret; } -EXPORT_SYMBOL_GPL(blk_rq_err_bytes); +EXPORT_SYMBOL_GPL(iocb_bio_iopoll); -static void update_io_ticks(struct block_device *part, unsigned long now, - bool end) +void update_io_ticks(struct block_device *part, unsigned long now, bool end) { unsigned long stamp; again: @@ -1240,52 +1058,6 @@ static void update_io_ticks(struct block_device *part, unsigned long now, } } -static void blk_account_io_completion(struct request *req, unsigned int bytes) -{ - if (req->part && blk_do_io_stat(req)) { - const int sgrp = op_stat_group(req_op(req)); - - part_stat_lock(); - part_stat_add(req->part, sectors[sgrp], bytes >> 9); - part_stat_unlock(); - } -} - -void blk_account_io_done(struct request *req, u64 now) -{ - /* - * Account IO completion. flush_rq isn't accounted as a - * normal IO on queueing nor completion. Accounting the - * containing request is enough. - */ - if (req->part && blk_do_io_stat(req) && - !(req->rq_flags & RQF_FLUSH_SEQ)) { - const int sgrp = op_stat_group(req_op(req)); - - part_stat_lock(); - update_io_ticks(req->part, jiffies, true); - part_stat_inc(req->part, ios[sgrp]); - part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns); - part_stat_unlock(); - } -} - -void blk_account_io_start(struct request *rq) -{ - if (!blk_do_io_stat(rq)) - return; - - /* passthrough requests can hold bios that do not have ->bi_bdev set */ - if (rq->bio && rq->bio->bi_bdev) - rq->part = rq->bio->bi_bdev; - else - rq->part = rq->rq_disk->part0; - - part_stat_lock(); - update_io_ticks(rq->part, jiffies, false); - part_stat_unlock(); -} - static unsigned long __part_start_io_acct(struct block_device *part, unsigned int sectors, unsigned int op, unsigned long start_time) @@ -1362,152 +1134,6 @@ void disk_end_io_acct(struct gendisk *disk, unsigned int op, } EXPORT_SYMBOL(disk_end_io_acct); -/* - * Steal bios from a request and add them to a bio list. - * The request must not have been partially completed before. - */ -void blk_steal_bios(struct bio_list *list, struct request *rq) -{ - if (rq->bio) { - if (list->tail) - list->tail->bi_next = rq->bio; - else - list->head = rq->bio; - list->tail = rq->biotail; - - rq->bio = NULL; - rq->biotail = NULL; - } - - rq->__data_len = 0; -} -EXPORT_SYMBOL_GPL(blk_steal_bios); - -/** - * blk_update_request - Complete multiple bytes without completing the request - * @req: the request being processed - * @error: block status code - * @nr_bytes: number of bytes to complete for @req - * - * Description: - * Ends I/O on a number of bytes attached to @req, but doesn't complete - * the request structure even if @req doesn't have leftover. - * If @req has leftover, sets it up for the next range of segments. 
- * - * Passing the result of blk_rq_bytes() as @nr_bytes guarantees - * %false return from this function. - * - * Note: - * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function - * except in the consistency check at the end of this function. - * - * Return: - * %false - this request doesn't have any more data - * %true - this request has more data - **/ -bool blk_update_request(struct request *req, blk_status_t error, - unsigned int nr_bytes) -{ - int total_bytes; - - trace_block_rq_complete(req, blk_status_to_errno(error), nr_bytes); - - if (!req->bio) - return false; - -#ifdef CONFIG_BLK_DEV_INTEGRITY - if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && - error == BLK_STS_OK) - req->q->integrity.profile->complete_fn(req, nr_bytes); -#endif - - if (unlikely(error && !blk_rq_is_passthrough(req) && - !(req->rq_flags & RQF_QUIET))) - print_req_error(req, error, __func__); - - blk_account_io_completion(req, nr_bytes); - - total_bytes = 0; - while (req->bio) { - struct bio *bio = req->bio; - unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes); - - if (bio_bytes == bio->bi_iter.bi_size) - req->bio = bio->bi_next; - - /* Completion has already been traced */ - bio_clear_flag(bio, BIO_TRACE_COMPLETION); - req_bio_endio(req, bio, bio_bytes, error); - - total_bytes += bio_bytes; - nr_bytes -= bio_bytes; - - if (!nr_bytes) - break; - } - - /* - * completely done - */ - if (!req->bio) { - /* - * Reset counters so that the request stacking driver - * can find how many bytes remain in the request - * later. - */ - req->__data_len = 0; - return false; - } - - req->__data_len -= total_bytes; - - /* update sector only for requests with clear definition of sector */ - if (!blk_rq_is_passthrough(req)) - req->__sector += total_bytes >> 9; - - /* mixed attributes always follow the first bio */ - if (req->rq_flags & RQF_MIXED_MERGE) { - req->cmd_flags &= ~REQ_FAILFAST_MASK; - req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK; - } - - if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) { - /* - * If total number of sectors is less than the first segment - * size, something has gone terribly wrong. - */ - if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { - blk_dump_rq_flags(req, "request botched"); - req->__data_len = blk_rq_cur_bytes(req); - } - - /* recalculate the number of segments */ - req->nr_phys_segments = blk_recalc_rq_segments(req); - } - - return true; -} -EXPORT_SYMBOL_GPL(blk_update_request); - -#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE -/** - * rq_flush_dcache_pages - Helper function to flush all pages in a request - * @rq: the request to be flushed - * - * Description: - * Flush all pages in @rq. - */ -void rq_flush_dcache_pages(struct request *rq) -{ - struct req_iterator iter; - struct bio_vec bvec; - - rq_for_each_segment(bvec, rq, iter) - flush_dcache_page(bvec.bv_page); -} -EXPORT_SYMBOL_GPL(rq_flush_dcache_pages); -#endif - /** * blk_lld_busy - Check if underlying low-level drivers of a device are busy * @q : the queue of the device being checked @@ -1536,93 +1162,6 @@ int blk_lld_busy(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_lld_busy); -/** - * blk_rq_unprep_clone - Helper function to free all bios in a cloned request - * @rq: the clone request to be cleaned up - * - * Description: - * Free all bios in @rq for a cloned request. 
- */ -void blk_rq_unprep_clone(struct request *rq) -{ - struct bio *bio; - - while ((bio = rq->bio) != NULL) { - rq->bio = bio->bi_next; - - bio_put(bio); - } -} -EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); - -/** - * blk_rq_prep_clone - Helper function to setup clone request - * @rq: the request to be setup - * @rq_src: original request to be cloned - * @bs: bio_set that bios for clone are allocated from - * @gfp_mask: memory allocation mask for bio - * @bio_ctr: setup function to be called for each clone bio. - * Returns %0 for success, non %0 for failure. - * @data: private data to be passed to @bio_ctr - * - * Description: - * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. - * Also, pages which the original bios are pointing to are not copied - * and the cloned bios just point same pages. - * So cloned bios must be completed before original bios, which means - * the caller must complete @rq before @rq_src. - */ -int blk_rq_prep_clone(struct request *rq, struct request *rq_src, - struct bio_set *bs, gfp_t gfp_mask, - int (*bio_ctr)(struct bio *, struct bio *, void *), - void *data) -{ - struct bio *bio, *bio_src; - - if (!bs) - bs = &fs_bio_set; - - __rq_for_each_bio(bio_src, rq_src) { - bio = bio_clone_fast(bio_src, gfp_mask, bs); - if (!bio) - goto free_and_out; - - if (bio_ctr && bio_ctr(bio, bio_src, data)) - goto free_and_out; - - if (rq->bio) { - rq->biotail->bi_next = bio; - rq->biotail = bio; - } else { - rq->bio = rq->biotail = bio; - } - bio = NULL; - } - - /* Copy attributes of the original request to the clone request. */ - rq->__sector = blk_rq_pos(rq_src); - rq->__data_len = blk_rq_bytes(rq_src); - if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) { - rq->rq_flags |= RQF_SPECIAL_PAYLOAD; - rq->special_vec = rq_src->special_vec; - } - rq->nr_phys_segments = rq_src->nr_phys_segments; - rq->ioprio = rq_src->ioprio; - - if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0) - goto free_and_out; - - return 0; - -free_and_out: - if (bio) - bio_put(bio); - blk_rq_unprep_clone(rq); - - return -ENOMEM; -} -EXPORT_SYMBOL_GPL(blk_rq_prep_clone); - int kblockd_schedule_work(struct work_struct *work) { return queue_work(kblockd_workqueue, work); @@ -1636,6 +1175,32 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, } EXPORT_SYMBOL(kblockd_mod_delayed_work_on); +void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios) +{ + struct task_struct *tsk = current; + + /* + * If this is a nested plug, don't actually assign it. + */ + if (tsk->plug) + return; + + plug->mq_list = NULL; + plug->cached_rq = NULL; + plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT); + plug->rq_count = 0; + plug->multiple_queues = false; + plug->has_elevator = false; + plug->nowait = false; + INIT_LIST_HEAD(&plug->cb_list); + + /* + * Store ordering should not be needed here, since a potential + * preempt will imply a full memory barrier + */ + tsk->plug = plug; +} + /** * blk_start_plug - initialize blk_plug and track it inside the task_struct * @plug: The &struct blk_plug that needs to be initialized @@ -1661,25 +1226,7 @@ EXPORT_SYMBOL(kblockd_mod_delayed_work_on); */ void blk_start_plug(struct blk_plug *plug) { - struct task_struct *tsk = current; - - /* - * If this is a nested plug, don't actually assign it. 
- */ - if (tsk->plug) - return; - - INIT_LIST_HEAD(&plug->mq_list); - INIT_LIST_HEAD(&plug->cb_list); - plug->rq_count = 0; - plug->multiple_queues = false; - plug->nowait = false; - - /* - * Store ordering should not be needed here, since a potential - * preempt will imply a full memory barrier - */ - tsk->plug = plug; + blk_start_plug_nr_ios(plug, 1); } EXPORT_SYMBOL(blk_start_plug); @@ -1725,12 +1272,20 @@ struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data, } EXPORT_SYMBOL(blk_check_plugged); -void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) +void blk_flush_plug(struct blk_plug *plug, bool from_schedule) { - flush_plug_callbacks(plug, from_schedule); - - if (!list_empty(&plug->mq_list)) + if (!list_empty(&plug->cb_list)) + flush_plug_callbacks(plug, from_schedule); + if (!rq_list_empty(plug->mq_list)) blk_mq_flush_plug_list(plug, from_schedule); + /* + * Unconditionally flush out cached requests, even if the unplug + * event came from schedule. Since we now hold references to the + * queue for cached requests, we don't want a blocked task holding + * up a queue freeze/quiesce event. + */ + if (unlikely(!rq_list_empty(plug->cached_rq))) + blk_mq_free_plug_rqs(plug); } /** @@ -1745,11 +1300,10 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) */ void blk_finish_plug(struct blk_plug *plug) { - if (plug != current->plug) - return; - blk_flush_plug_list(plug, false); - - current->plug = NULL; + if (plug == current->plug) { + blk_flush_plug(plug, false); + current->plug = NULL; + } } EXPORT_SYMBOL(blk_finish_plug); @@ -1772,6 +1326,9 @@ int __init blk_dev_init(void) sizeof_field(struct request, cmd_flags)); BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * sizeof_field(struct bio, bi_opf)); + BUILD_BUG_ON(ALIGN(offsetof(struct request_queue, srcu), + __alignof__(struct request_queue)) != + sizeof(struct request_queue)); /* used for unplugging and affects IO latency/throughput - HIGHPRI */ kblockd_workqueue = alloc_workqueue("kblockd", @@ -1782,6 +1339,10 @@ int __init blk_dev_init(void) blk_requestq_cachep = kmem_cache_create("request_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL); + blk_requestq_srcu_cachep = kmem_cache_create("request_queue_srcu", + sizeof(struct request_queue) + + sizeof(struct srcu_struct), 0, SLAB_PANIC, NULL); + blk_debugfs_root = debugfs_create_dir("block", NULL); return 0; diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index c322176a1e..c87aba8584 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -12,12 +12,13 @@ #include #include #include +#include #include #include -#include #include #include #include +#include #include "blk-crypto-internal.h" @@ -72,12 +73,12 @@ static mempool_t *bio_fallback_crypt_ctx_pool; static DEFINE_MUTEX(tfms_init_lock); static bool tfms_inited[BLK_ENCRYPTION_MODE_MAX]; -static struct blk_crypto_keyslot { +static struct blk_crypto_fallback_keyslot { enum blk_crypto_mode_num crypto_mode; struct crypto_skcipher *tfms[BLK_ENCRYPTION_MODE_MAX]; } *blk_crypto_keyslots; -static struct blk_keyslot_manager blk_crypto_ksm; +static struct blk_crypto_profile blk_crypto_fallback_profile; static struct workqueue_struct *blk_crypto_wq; static mempool_t *blk_crypto_bounce_page_pool; static struct bio_set crypto_bio_split; @@ -88,9 +89,9 @@ static struct bio_set crypto_bio_split; */ static u8 blank_key[BLK_CRYPTO_MAX_KEY_SIZE]; -static void blk_crypto_evict_keyslot(unsigned int slot) +static void
blk_crypto_fallback_evict_keyslot(unsigned int slot) { - struct blk_crypto_keyslot *slotp = &blk_crypto_keyslots[slot]; + struct blk_crypto_fallback_keyslot *slotp = &blk_crypto_keyslots[slot]; enum blk_crypto_mode_num crypto_mode = slotp->crypto_mode; int err; @@ -103,45 +104,41 @@ static void blk_crypto_evict_keyslot(unsigned int slot) slotp->crypto_mode = BLK_ENCRYPTION_MODE_INVALID; } -static int blk_crypto_keyslot_program(struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key, - unsigned int slot) +static int +blk_crypto_fallback_keyslot_program(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key, + unsigned int slot) { - struct blk_crypto_keyslot *slotp = &blk_crypto_keyslots[slot]; + struct blk_crypto_fallback_keyslot *slotp = &blk_crypto_keyslots[slot]; const enum blk_crypto_mode_num crypto_mode = key->crypto_cfg.crypto_mode; int err; if (crypto_mode != slotp->crypto_mode && slotp->crypto_mode != BLK_ENCRYPTION_MODE_INVALID) - blk_crypto_evict_keyslot(slot); + blk_crypto_fallback_evict_keyslot(slot); slotp->crypto_mode = crypto_mode; err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], key->raw, key->size); if (err) { - blk_crypto_evict_keyslot(slot); + blk_crypto_fallback_evict_keyslot(slot); return err; } return 0; } -static int blk_crypto_keyslot_evict(struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key, - unsigned int slot) +static int blk_crypto_fallback_keyslot_evict(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key, + unsigned int slot) { - blk_crypto_evict_keyslot(slot); + blk_crypto_fallback_evict_keyslot(slot); return 0; } -/* - * The crypto API fallback KSM ops - only used for a bio when it specifies a - * blk_crypto_key that was not supported by the device's inline encryption - * hardware. 
- */ -static const struct blk_ksm_ll_ops blk_crypto_ksm_ll_ops = { - .keyslot_program = blk_crypto_keyslot_program, - .keyslot_evict = blk_crypto_keyslot_evict, +static const struct blk_crypto_ll_ops blk_crypto_fallback_ll_ops = { + .keyslot_program = blk_crypto_fallback_keyslot_program, + .keyslot_evict = blk_crypto_fallback_keyslot_evict, }; static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio) @@ -159,7 +156,7 @@ static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio) bio_endio(src_bio); } -static struct bio *blk_crypto_clone_bio(struct bio *bio_src) +static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src) { struct bvec_iter iter; struct bio_vec bv; @@ -186,13 +183,14 @@ static struct bio *blk_crypto_clone_bio(struct bio *bio_src) return bio; } -static bool blk_crypto_alloc_cipher_req(struct blk_ksm_keyslot *slot, - struct skcipher_request **ciph_req_ret, - struct crypto_wait *wait) +static bool +blk_crypto_fallback_alloc_cipher_req(struct blk_crypto_keyslot *slot, + struct skcipher_request **ciph_req_ret, + struct crypto_wait *wait) { struct skcipher_request *ciph_req; - const struct blk_crypto_keyslot *slotp; - int keyslot_idx = blk_ksm_get_slot_idx(slot); + const struct blk_crypto_fallback_keyslot *slotp; + int keyslot_idx = blk_crypto_keyslot_index(slot); slotp = &blk_crypto_keyslots[keyslot_idx]; ciph_req = skcipher_request_alloc(slotp->tfms[slotp->crypto_mode], @@ -209,7 +207,7 @@ static bool blk_crypto_alloc_cipher_req(struct blk_ksm_keyslot *slot, return true; } -static bool blk_crypto_split_bio_if_needed(struct bio **bio_ptr) +static bool blk_crypto_fallback_split_bio_if_needed(struct bio **bio_ptr) { struct bio *bio = *bio_ptr; unsigned int i = 0; @@ -264,7 +262,7 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr) { struct bio *src_bio, *enc_bio; struct bio_crypt_ctx *bc; - struct blk_ksm_keyslot *slot; + struct blk_crypto_keyslot *slot; int data_unit_size; struct skcipher_request *ciph_req = NULL; DECLARE_CRYPTO_WAIT(wait); @@ -276,7 +274,7 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr) blk_status_t blk_st; /* Split the bio if it's too big for single page bvec */ - if (!blk_crypto_split_bio_if_needed(bio_ptr)) + if (!blk_crypto_fallback_split_bio_if_needed(bio_ptr)) return false; src_bio = *bio_ptr; @@ -284,24 +282,25 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr) data_unit_size = bc->bc_key->crypto_cfg.data_unit_size; /* Allocate bounce bio for encryption */ - enc_bio = blk_crypto_clone_bio(src_bio); + enc_bio = blk_crypto_fallback_clone_bio(src_bio); if (!enc_bio) { src_bio->bi_status = BLK_STS_RESOURCE; return false; } /* - * Use the crypto API fallback keyslot manager to get a crypto_skcipher - * for the algorithm and key specified for this bio. + * Get a blk-crypto-fallback keyslot that contains a crypto_skcipher for + * this bio's algorithm and key. 
*/ - blk_st = blk_ksm_get_slot_for_key(&blk_crypto_ksm, bc->bc_key, &slot); + blk_st = blk_crypto_get_keyslot(&blk_crypto_fallback_profile, + bc->bc_key, &slot); if (blk_st != BLK_STS_OK) { src_bio->bi_status = blk_st; goto out_put_enc_bio; } /* and then allocate an skcipher_request for it */ - if (!blk_crypto_alloc_cipher_req(slot, &ciph_req, &wait)) { + if (!blk_crypto_fallback_alloc_cipher_req(slot, &ciph_req, &wait)) { src_bio->bi_status = BLK_STS_RESOURCE; goto out_release_keyslot; } @@ -362,7 +361,7 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr) out_free_ciph_req: skcipher_request_free(ciph_req); out_release_keyslot: - blk_ksm_put_slot(slot); + blk_crypto_put_keyslot(slot); out_put_enc_bio: if (enc_bio) bio_put(enc_bio); @@ -380,7 +379,7 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work) container_of(work, struct bio_fallback_crypt_ctx, work); struct bio *bio = f_ctx->bio; struct bio_crypt_ctx *bc = &f_ctx->crypt_ctx; - struct blk_ksm_keyslot *slot; + struct blk_crypto_keyslot *slot; struct skcipher_request *ciph_req = NULL; DECLARE_CRYPTO_WAIT(wait); u64 curr_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; @@ -393,17 +392,18 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work) blk_status_t blk_st; /* - * Use the crypto API fallback keyslot manager to get a crypto_skcipher - * for the algorithm and key specified for this bio. + * Get a blk-crypto-fallback keyslot that contains a crypto_skcipher for + * this bio's algorithm and key. */ - blk_st = blk_ksm_get_slot_for_key(&blk_crypto_ksm, bc->bc_key, &slot); + blk_st = blk_crypto_get_keyslot(&blk_crypto_fallback_profile, + bc->bc_key, &slot); if (blk_st != BLK_STS_OK) { bio->bi_status = blk_st; goto out_no_keyslot; } /* and then allocate an skcipher_request for it */ - if (!blk_crypto_alloc_cipher_req(slot, &ciph_req, &wait)) { + if (!blk_crypto_fallback_alloc_cipher_req(slot, &ciph_req, &wait)) { bio->bi_status = BLK_STS_RESOURCE; goto out; } @@ -434,7 +434,7 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work) out: skcipher_request_free(ciph_req); - blk_ksm_put_slot(slot); + blk_crypto_put_keyslot(slot); out_no_keyslot: mempool_free(f_ctx, bio_fallback_crypt_ctx_pool); bio_endio(bio); @@ -473,9 +473,9 @@ static void blk_crypto_fallback_decrypt_endio(struct bio *bio) * @bio_ptr: pointer to the bio to prepare * * If bio is doing a WRITE operation, this splits the bio into two parts if it's - * too big (see blk_crypto_split_bio_if_needed). It then allocates a bounce bio - * for the first part, encrypts it, and update bio_ptr to point to the bounce - * bio. + * too big (see blk_crypto_fallback_split_bio_if_needed()). It then allocates a + * bounce bio for the first part, encrypts it, and updates bio_ptr to point to + * the bounce bio. * * For a READ operation, we mark the bio for decryption by using bi_private and * bi_end_io. 
@@ -499,8 +499,8 @@ bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr) return false; } - if (!blk_ksm_crypto_cfg_supported(&blk_crypto_ksm, - &bc->bc_key->crypto_cfg)) { + if (!__blk_crypto_cfg_supported(&blk_crypto_fallback_profile, + &bc->bc_key->crypto_cfg)) { bio->bi_status = BLK_STS_NOTSUPP; return false; } @@ -526,7 +526,7 @@ bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr) int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key) { - return blk_ksm_evict_key(&blk_crypto_ksm, key); + return __blk_crypto_evict_key(&blk_crypto_fallback_profile, key); } static bool blk_crypto_fallback_inited; @@ -534,6 +534,7 @@ static int blk_crypto_fallback_init(void) { int i; int err; + struct blk_crypto_profile *profile = &blk_crypto_fallback_profile; if (blk_crypto_fallback_inited) return 0; @@ -544,24 +545,24 @@ static int blk_crypto_fallback_init(void) if (err) goto out; - err = blk_ksm_init(&blk_crypto_ksm, blk_crypto_num_keyslots); + err = blk_crypto_profile_init(profile, blk_crypto_num_keyslots); if (err) goto fail_free_bioset; err = -ENOMEM; - blk_crypto_ksm.ksm_ll_ops = blk_crypto_ksm_ll_ops; - blk_crypto_ksm.max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE; + profile->ll_ops = blk_crypto_fallback_ll_ops; + profile->max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE; /* All blk-crypto modes have a crypto API fallback. */ for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++) - blk_crypto_ksm.crypto_modes_supported[i] = 0xFFFFFFFF; - blk_crypto_ksm.crypto_modes_supported[BLK_ENCRYPTION_MODE_INVALID] = 0; + profile->modes_supported[i] = 0xFFFFFFFF; + profile->modes_supported[BLK_ENCRYPTION_MODE_INVALID] = 0; blk_crypto_wq = alloc_workqueue("blk_crypto_wq", WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM, num_online_cpus()); if (!blk_crypto_wq) - goto fail_free_ksm; + goto fail_destroy_profile; blk_crypto_keyslots = kcalloc(blk_crypto_num_keyslots, sizeof(blk_crypto_keyslots[0]), @@ -595,8 +596,8 @@ static int blk_crypto_fallback_init(void) kfree(blk_crypto_keyslots); fail_free_wq: destroy_workqueue(blk_crypto_wq); -fail_free_ksm: - blk_ksm_destroy(&blk_crypto_ksm); +fail_destroy_profile: + blk_crypto_profile_destroy(profile); fail_free_bioset: bioset_exit(&crypto_bio_split); out: @@ -610,7 +611,7 @@ static int blk_crypto_fallback_init(void) int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num) { const char *cipher_str = blk_crypto_modes[mode_num].cipher_str; - struct blk_crypto_keyslot *slotp; + struct blk_crypto_fallback_keyslot *slotp; unsigned int i; int err = 0; diff --git a/block/blk-crypto.c b/block/blk-crypto.c index 103c2e2d50..ec9efeeeca 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -11,7 +11,7 @@ #include #include -#include +#include #include #include @@ -218,8 +218,9 @@ static bool bio_crypt_check_alignment(struct bio *bio) blk_status_t __blk_crypto_init_request(struct request *rq) { - return blk_ksm_get_slot_for_key(rq->q->ksm, rq->crypt_ctx->bc_key, - &rq->crypt_keyslot); + return blk_crypto_get_keyslot(rq->q->crypto_profile, + rq->crypt_ctx->bc_key, + &rq->crypt_keyslot); } /** @@ -233,7 +234,7 @@ blk_status_t __blk_crypto_init_request(struct request *rq) */ void __blk_crypto_free_request(struct request *rq) { - blk_ksm_put_slot(rq->crypt_keyslot); + blk_crypto_put_keyslot(rq->crypt_keyslot); mempool_free(rq->crypt_ctx, bio_crypt_ctx_pool); blk_crypto_rq_set_defaults(rq); } @@ -264,6 +265,7 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr) { struct bio *bio = *bio_ptr; const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key; + 
struct blk_crypto_profile *profile; /* Error if bio has no data. */ if (WARN_ON_ONCE(!bio_has_data(bio))) { @@ -280,8 +282,8 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr) * Success if device supports the encryption context, or if we succeeded * in falling back to the crypto API. */ - if (blk_ksm_crypto_cfg_supported(bio->bi_bdev->bd_disk->queue->ksm, - &bc_key->crypto_cfg)) + profile = bdev_get_queue(bio->bi_bdev)->crypto_profile; + if (__blk_crypto_cfg_supported(profile, &bc_key->crypto_cfg)) return true; if (blk_crypto_fallback_bio_prep(bio_ptr)) @@ -357,7 +359,7 @@ bool blk_crypto_config_supported(struct request_queue *q, const struct blk_crypto_config *cfg) { return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) || - blk_ksm_crypto_cfg_supported(q->ksm, cfg); + __blk_crypto_cfg_supported(q->crypto_profile, cfg); } /** @@ -378,7 +380,7 @@ bool blk_crypto_config_supported(struct request_queue *q, int blk_crypto_start_using_key(const struct blk_crypto_key *key, struct request_queue *q) { - if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg)) + if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg)) return 0; return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode); } @@ -394,18 +396,17 @@ int blk_crypto_start_using_key(const struct blk_crypto_key *key, * evicted from any hardware that it might have been programmed into. The key * must not be in use by any in-flight IO when this function is called. * - * Return: 0 on success or if key is not present in the q's ksm, -err on error. + * Return: 0 on success or if the key wasn't in any keyslot; -errno on error. */ int blk_crypto_evict_key(struct request_queue *q, const struct blk_crypto_key *key) { - if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg)) - return blk_ksm_evict_key(q->ksm, key); + if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg)) + return __blk_crypto_evict_key(q->crypto_profile, key); /* - * If the request queue's associated inline encryption hardware didn't - * have support for the key, then the key might have been programmed - * into the fallback keyslot manager, so try to evict from there. + * If the request_queue didn't support the key, then blk-crypto-fallback + * may have been used, so try to evict the key from blk-crypto-fallback. 
*/ return blk_crypto_fallback_evict_key(key); } diff --git a/block/blk-flush.c b/block/blk-flush.c index 94a86acbb7..e4df894189 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -69,6 +69,7 @@ #include #include #include +#include #include "blk.h" #include "blk-mq.h" @@ -95,6 +96,12 @@ enum { static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, unsigned int flags); +static inline struct blk_flush_queue * +blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx) +{ + return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq; +} + static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq) { unsigned int policy = 0; @@ -138,7 +145,7 @@ static void blk_flush_queue_rq(struct request *rq, bool add_front) static void blk_account_io_flush(struct request *rq) { - struct block_device *part = rq->rq_disk->part0; + struct block_device *part = rq->q->disk->part0; part_stat_lock(); part_stat_inc(part, ios[STAT_FLUSH]); @@ -222,7 +229,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) /* release the tag's ownership to the req cloned from */ spin_lock_irqsave(&fq->mq_flush_lock, flags); - if (!refcount_dec_and_test(&flush_rq->ref)) { + if (!req_ref_put_and_test(flush_rq)) { fq->rq_status = error; spin_unlock_irqrestore(&fq->mq_flush_lock, flags); return; @@ -334,7 +341,6 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK); flush_rq->rq_flags |= RQF_FLUSH_SEQ; - flush_rq->rq_disk = first_rq->rq_disk; flush_rq->end_io = flush_end_io; /* * Order WRITE ->end_io and WRITE rq->ref, and its pair is the one @@ -343,7 +349,7 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, * and READ flush_rq->end_io */ smp_wmb(); - refcount_set(&flush_rq->ref, 1); + req_ref_set(flush_rq, 1); blk_flush_queue_rq(flush_rq, false); } @@ -423,7 +429,7 @@ void blk_insert_flush(struct request *rq) */ if ((policy & REQ_FSEQ_DATA) && !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { - blk_mq_request_bypass_insert(rq, false, false); + blk_mq_request_bypass_insert(rq, false, true); return; } diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 16d5d53383..69eed260a8 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -6,7 +6,7 @@ * Written by: Martin K. Petersen */ -#include +#include #include #include #include @@ -409,9 +409,9 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue); #ifdef CONFIG_BLK_INLINE_ENCRYPTION - if (disk->queue->ksm) { + if (disk->queue->crypto_profile) { pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n"); - blk_ksm_unregister(disk->queue); + disk->queue->crypto_profile = NULL; } #endif } diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 57299f860d..11f49f78db 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -8,22 +8,25 @@ #include #include #include +#include #include #include "blk.h" +#include "blk-mq-sched.h" /* * For io context allocations */ static struct kmem_cache *iocontext_cachep; +#ifdef CONFIG_BLK_ICQ /** * get_io_context - increment reference count to io_context * @ioc: io_context to get * * Increment reference count to @ioc. 
*/ -void get_io_context(struct io_context *ioc) +static void get_io_context(struct io_context *ioc) { BUG_ON(atomic_long_read(&ioc->refcount) <= 0); atomic_long_inc(&ioc->refcount); @@ -53,6 +56,16 @@ static void ioc_exit_icq(struct io_cq *icq) icq->flags |= ICQ_EXITED; } +static void ioc_exit_icqs(struct io_context *ioc) +{ + struct io_cq *icq; + + spin_lock_irq(&ioc->lock); + hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) + ioc_exit_icq(icq); + spin_unlock_irq(&ioc->lock); +} + /* * Release an icq. Called with ioc locked for blk-mq, and with both ioc * and queue locked for legacy. @@ -132,102 +145,22 @@ static void ioc_release_fn(struct work_struct *work) kmem_cache_free(iocontext_cachep, ioc); } -/** - * put_io_context - put a reference of io_context - * @ioc: io_context to put - * - * Decrement reference count of @ioc and release it if the count reaches - * zero. +/* + * Releasing icqs requires reverse order double locking and we may already be + * holding a queue_lock. Do it asynchronously from a workqueue. */ -void put_io_context(struct io_context *ioc) -{ - unsigned long flags; - bool free_ioc = false; - - if (ioc == NULL) - return; - - BUG_ON(atomic_long_read(&ioc->refcount) <= 0); - - /* - * Releasing ioc requires reverse order double locking and we may - * already be holding a queue_lock. Do it asynchronously from wq. - */ - if (atomic_long_dec_and_test(&ioc->refcount)) { - spin_lock_irqsave(&ioc->lock, flags); - if (!hlist_empty(&ioc->icq_list)) - queue_work(system_power_efficient_wq, - &ioc->release_work); - else - free_ioc = true; - spin_unlock_irqrestore(&ioc->lock, flags); - } - - if (free_ioc) - kmem_cache_free(iocontext_cachep, ioc); -} - -/** - * put_io_context_active - put active reference on ioc - * @ioc: ioc of interest - * - * Undo get_io_context_active(). If active reference reaches zero after - * put, @ioc can never issue further IOs and ioscheds are notified. 
- */ -void put_io_context_active(struct io_context *ioc) -{ - struct io_cq *icq; - - if (!atomic_dec_and_test(&ioc->active_ref)) { - put_io_context(ioc); - return; - } - - spin_lock_irq(&ioc->lock); - hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) { - if (icq->flags & ICQ_EXITED) - continue; - - ioc_exit_icq(icq); - } - spin_unlock_irq(&ioc->lock); - - put_io_context(ioc); -} - -/* Called by the exiting task */ -void exit_io_context(struct task_struct *task) -{ - struct io_context *ioc; - - task_lock(task); - ioc = task->io_context; - task->io_context = NULL; - task_unlock(task); - - atomic_dec(&ioc->nr_tasks); - put_io_context_active(ioc); -} - -static void __ioc_clear_queue(struct list_head *icq_list) +static bool ioc_delay_free(struct io_context *ioc) { unsigned long flags; - rcu_read_lock(); - while (!list_empty(icq_list)) { - struct io_cq *icq = list_entry(icq_list->next, - struct io_cq, q_node); - struct io_context *ioc = icq->ioc; - - spin_lock_irqsave(&ioc->lock, flags); - if (icq->flags & ICQ_DESTROYED) { - spin_unlock_irqrestore(&ioc->lock, flags); - continue; - } - ioc_destroy_icq(icq); + spin_lock_irqsave(&ioc->lock, flags); + if (!hlist_empty(&ioc->icq_list)) { + queue_work(system_power_efficient_wq, &ioc->release_work); spin_unlock_irqrestore(&ioc->lock, flags); + return true; } - rcu_read_unlock(); + spin_unlock_irqrestore(&ioc->lock, flags); + return false; } /** @@ -244,93 +177,156 @@ void ioc_clear_queue(struct request_queue *q) list_splice_init(&q->icq_list, &icq_list); spin_unlock_irq(&q->queue_lock); - __ioc_clear_queue(&icq_list); -} + rcu_read_lock(); + while (!list_empty(&icq_list)) { + struct io_cq *icq = + list_entry(icq_list.next, struct io_cq, q_node); -int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node) + spin_lock_irq(&icq->ioc->lock); + if (!(icq->flags & ICQ_DESTROYED)) + ioc_destroy_icq(icq); + spin_unlock_irq(&icq->ioc->lock); + } + rcu_read_unlock(); +} +#else /* CONFIG_BLK_ICQ */ +static inline void ioc_exit_icqs(struct io_context *ioc) +{ +} +static inline bool ioc_delay_free(struct io_context *ioc) +{ + return false; +} +#endif /* CONFIG_BLK_ICQ */ + +/** + * put_io_context - put a reference of io_context + * @ioc: io_context to put + * + * Decrement reference count of @ioc and release it if the count reaches + * zero. 
+ */ +void put_io_context(struct io_context *ioc) +{ + BUG_ON(atomic_long_read(&ioc->refcount) <= 0); + if (atomic_long_dec_and_test(&ioc->refcount) && !ioc_delay_free(ioc)) + kmem_cache_free(iocontext_cachep, ioc); +} +EXPORT_SYMBOL_GPL(put_io_context); + +/* Called by the exiting task */ +void exit_io_context(struct task_struct *task) +{ + struct io_context *ioc; + + task_lock(task); + ioc = task->io_context; + task->io_context = NULL; + task_unlock(task); + + if (atomic_dec_and_test(&ioc->active_ref)) { + ioc_exit_icqs(ioc); + put_io_context(ioc); + } +} + +static struct io_context *alloc_io_context(gfp_t gfp_flags, int node) { struct io_context *ioc; - int ret; ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO, node); if (unlikely(!ioc)) - return -ENOMEM; + return NULL; - /* initialize */ atomic_long_set(&ioc->refcount, 1); - atomic_set(&ioc->nr_tasks, 1); atomic_set(&ioc->active_ref, 1); +#ifdef CONFIG_BLK_ICQ spin_lock_init(&ioc->lock); INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC); INIT_HLIST_HEAD(&ioc->icq_list); INIT_WORK(&ioc->release_work, ioc_release_fn); +#endif + return ioc; +} + +int set_task_ioprio(struct task_struct *task, int ioprio) +{ + int err; + const struct cred *cred = current_cred(), *tcred; + + rcu_read_lock(); + tcred = __task_cred(task); + if (!uid_eq(tcred->uid, cred->euid) && + !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) { + rcu_read_unlock(); + return -EPERM; + } + rcu_read_unlock(); + + err = security_task_setioprio(task, ioprio); + if (err) + return err; + + task_lock(task); + if (unlikely(!task->io_context)) { + struct io_context *ioc; + + task_unlock(task); + + ioc = alloc_io_context(GFP_ATOMIC, NUMA_NO_NODE); + if (!ioc) + return -ENOMEM; + + task_lock(task); + if (task->flags & PF_EXITING) { + err = -ESRCH; + kmem_cache_free(iocontext_cachep, ioc); + goto out; + } + if (task->io_context) + kmem_cache_free(iocontext_cachep, ioc); + else + task->io_context = ioc; + } + task->io_context->ioprio = ioprio; +out: + task_unlock(task); + return err; +} +EXPORT_SYMBOL_GPL(set_task_ioprio); + +int __copy_io(unsigned long clone_flags, struct task_struct *tsk) +{ + struct io_context *ioc = current->io_context; /* - * Try to install. ioc shouldn't be installed if someone else - * already did or @task, which isn't %current, is exiting. Note - * that we need to allow ioc creation on exiting %current as exit - * path may issue IOs from e.g. exit_files(). The exit path is - * responsible for not issuing IO after exit_io_context(). + * Share io context with parent, if CLONE_IO is set */ - task_lock(task); - if (!task->io_context && - (task == current || !(task->flags & PF_EXITING))) - task->io_context = ioc; - else - kmem_cache_free(iocontext_cachep, ioc); + if (clone_flags & CLONE_IO) { + atomic_inc(&ioc->active_ref); + tsk->io_context = ioc; + } else if (ioprio_valid(ioc->ioprio)) { + tsk->io_context = alloc_io_context(GFP_KERNEL, NUMA_NO_NODE); + if (!tsk->io_context) + return -ENOMEM; + tsk->io_context->ioprio = ioc->ioprio; + } - ret = task->io_context ? 0 : -EBUSY; - - task_unlock(task); - - return ret; -} - -/** - * get_task_io_context - get io_context of a task - * @task: task of interest - * @gfp_flags: allocation flags, used if allocation is necessary - * @node: allocation node, used if allocation is necessary - * - * Return io_context of @task. If it doesn't exist, it is created with - * @gfp_flags and @node. The returned io_context has its reference count - * incremented. 
- * - * This function always goes through task_lock() and it's better to use - * %current->io_context + get_io_context() for %current. - */ -struct io_context *get_task_io_context(struct task_struct *task, - gfp_t gfp_flags, int node) -{ - struct io_context *ioc; - - might_sleep_if(gfpflags_allow_blocking(gfp_flags)); - - do { - task_lock(task); - ioc = task->io_context; - if (likely(ioc)) { - get_io_context(ioc); - task_unlock(task); - return ioc; - } - task_unlock(task); - } while (!create_task_io_context(task, gfp_flags, node)); - - return NULL; + return 0; } +#ifdef CONFIG_BLK_ICQ /** * ioc_lookup_icq - lookup io_cq from ioc - * @ioc: the associated io_context * @q: the associated request_queue * * Look up io_cq associated with @ioc - @q pair from @ioc. Must be called * with @q->queue_lock held. */ -struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q) +struct io_cq *ioc_lookup_icq(struct request_queue *q) { + struct io_context *ioc = current->io_context; struct io_cq *icq; lockdep_assert_held(&q->queue_lock); @@ -359,9 +355,7 @@ EXPORT_SYMBOL(ioc_lookup_icq); /** * ioc_create_icq - create and link io_cq - * @ioc: io_context of interest * @q: request_queue of interest - * @gfp_mask: allocation mask * * Make sure io_cq linking @ioc and @q exists. If icq doesn't exist, they * will be created using @gfp_mask. @@ -369,19 +363,19 @@ EXPORT_SYMBOL(ioc_lookup_icq); * The caller is responsible for ensuring @ioc won't go away and @q is * alive and will stay alive until this function returns. */ -struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, - gfp_t gfp_mask) +static struct io_cq *ioc_create_icq(struct request_queue *q) { + struct io_context *ioc = current->io_context; struct elevator_type *et = q->elevator->type; struct io_cq *icq; /* allocate stuff */ - icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO, + icq = kmem_cache_alloc_node(et->icq_cache, GFP_ATOMIC | __GFP_ZERO, q->node); if (!icq) return NULL; - if (radix_tree_maybe_preload(gfp_mask) < 0) { + if (radix_tree_maybe_preload(GFP_ATOMIC) < 0) { kmem_cache_free(et->icq_cache, icq); return NULL; } @@ -402,7 +396,7 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, et->ops.init_icq(icq); } else { kmem_cache_free(et->icq_cache, icq); - icq = ioc_lookup_icq(ioc, q); + icq = ioc_lookup_icq(q); if (!icq) printk(KERN_ERR "cfq: icq link failed!\n"); } @@ -413,6 +407,46 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, return icq; } +struct io_cq *ioc_find_get_icq(struct request_queue *q) +{ + struct io_context *ioc = current->io_context; + struct io_cq *icq = NULL; + + if (unlikely(!ioc)) { + ioc = alloc_io_context(GFP_ATOMIC, q->node); + if (!ioc) + return NULL; + + task_lock(current); + if (current->io_context) { + kmem_cache_free(iocontext_cachep, ioc); + ioc = current->io_context; + } else { + current->io_context = ioc; + } + + get_io_context(ioc); + task_unlock(current); + } else { + get_io_context(ioc); + + spin_lock_irq(&q->queue_lock); + icq = ioc_lookup_icq(q); + spin_unlock_irq(&q->queue_lock); + } + + if (!icq) { + icq = ioc_create_icq(q); + if (!icq) { + put_io_context(ioc); + return NULL; + } + } + return icq; +} +EXPORT_SYMBOL_GPL(ioc_find_get_icq); +#endif /* CONFIG_BLK_ICQ */ + static int __init blk_ioc_init(void) { iocontext_cachep = kmem_cache_create("blkdev_ioc", diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index c0545f9da5..6593c7123b 100644 --- a/block/blk-iolatency.c +++ 
b/block/blk-iolatency.c @@ -74,6 +74,7 @@ #include #include #include +#include #include "blk-rq-qos.h" #include "blk-stat.h" #include "blk.h" diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c index 332a07761b..2e7f10e1c0 100644 --- a/block/blk-ioprio.c +++ b/block/blk-ioprio.c @@ -62,6 +62,7 @@ struct ioprio_blkg { struct ioprio_blkcg { struct blkcg_policy_data cpd; enum prio_policy prio_policy; + bool prio_set; }; static inline struct ioprio_blkg *pd_to_ioprio(struct blkg_policy_data *pd) @@ -112,7 +113,7 @@ static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf, if (ret < 0) return ret; blkcg->prio_policy = ret; - + blkcg->prio_set = true; return nbytes; } @@ -190,6 +191,10 @@ static void blkcg_ioprio_track(struct rq_qos *rqos, struct request *rq, struct bio *bio) { struct ioprio_blkcg *blkcg = ioprio_blkcg_from_bio(bio); + u16 prio; + + if (!blkcg->prio_set) + return; /* * Except for IOPRIO_CLASS_NONE, higher I/O priority numbers @@ -199,8 +204,10 @@ static void blkcg_ioprio_track(struct rq_qos *rqos, struct request *rq, * bio I/O priority is not modified. If the bio I/O priority equals * IOPRIO_CLASS_NONE, the cgroup I/O priority is assigned to the bio. */ - bio->bi_ioprio = max_t(u16, bio->bi_ioprio, - IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0)); + prio = max_t(u16, bio->bi_ioprio, + IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0)); + if (prio > bio->bi_ioprio) + bio->bi_ioprio = prio; } static void blkcg_ioprio_exit(struct rq_qos *rqos) diff --git a/block/blk-merge.c b/block/blk-merge.c index 7a5c81c02c..4de34a332c 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -6,12 +6,47 @@ #include #include #include +#include #include +#include #include #include "blk.h" +#include "blk-mq-sched.h" #include "blk-rq-qos.h" +#include "blk-throttle.h" + +static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv) +{ + *bv = mp_bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); +} + +static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv) +{ + struct bvec_iter iter = bio->bi_iter; + int idx; + + bio_get_first_bvec(bio, bv); + if (bv->bv_len == bio->bi_iter.bi_size) + return; /* this bio only has a single bvec */ + + bio_advance_iter(bio, &iter, iter.bi_size); + + if (!iter.bi_bvec_done) + idx = iter.bi_idx - 1; + else /* in the middle of bvec */ + idx = iter.bi_idx; + + *bv = bio->bi_io_vec[idx]; + + /* + * iter.bi_bvec_done records actual length of the last bvec + * if this bio ends in the middle of one io vector + */ + if (iter.bi_bvec_done) + bv->bv_len = iter.bi_bvec_done; +} static inline bool bio_will_gap(struct request_queue *q, struct request *prev_rq, struct bio *prev, struct bio *next) @@ -285,13 +320,13 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, * iopoll in direct IO routine. Given performance gain of iopoll for * big IO can be trival, disable iopoll when split needed. */ - bio_clear_hipri(bio); - + bio_clear_polled(bio); return bio_split(bio, sectors, GFP_NOIO, bs); } /** * __blk_queue_split - split a bio and submit the second half + * @q: [in] request_queue new bio is being queued at * @bio: [in, out] bio to be split * @nr_segs: [out] number of segments in the first bio * @@ -302,9 +337,9 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, * of the caller to ensure that q->bio_split is only released after processing * of the split bio has finished. 
*/ -void __blk_queue_split(struct bio **bio, unsigned int *nr_segs) +void __blk_queue_split(struct request_queue *q, struct bio **bio, + unsigned int *nr_segs) { - struct request_queue *q = (*bio)->bi_bdev->bd_disk->queue; struct bio *split = NULL; switch (bio_op(*bio)) { @@ -321,21 +356,6 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs) nr_segs); break; default: - /* - * All drivers must accept single-segments bios that are <= - * PAGE_SIZE. This is a quick and dirty check that relies on - * the fact that bi_io_vec[0] is always valid if a bio has data. - * The check might lead to occasional false negatives when bios - * are cloned, but compared to the performance impact of cloned - * bios themselves the loop below doesn't matter anyway. - */ - if (!q->limits.chunk_sectors && - (*bio)->bi_vcnt == 1 && - ((*bio)->bi_io_vec[0].bv_len + - (*bio)->bi_io_vec[0].bv_offset) <= PAGE_SIZE) { - *nr_segs = 1; - break; - } split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs); break; } @@ -365,9 +385,11 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs) */ void blk_queue_split(struct bio **bio) { + struct request_queue *q = bdev_get_queue((*bio)->bi_bdev); unsigned int nr_segs; - __blk_queue_split(bio, &nr_segs); + if (blk_may_split(q, *bio)) + __blk_queue_split(q, bio, &nr_segs); } EXPORT_SYMBOL(blk_queue_split); @@ -558,6 +580,23 @@ static inline unsigned int blk_rq_get_max_segments(struct request *rq) return queue_max_segments(rq->q); } +static inline unsigned int blk_rq_get_max_sectors(struct request *rq, + sector_t offset) +{ + struct request_queue *q = rq->q; + + if (blk_rq_is_passthrough(rq)) + return q->limits.max_hw_sectors; + + if (!q->limits.chunk_sectors || + req_op(rq) == REQ_OP_DISCARD || + req_op(rq) == REQ_OP_SECURE_ERASE) + return blk_queue_get_max_sectors(q, req_op(rq)); + + return min(blk_max_size_offset(q, offset, 0), + blk_queue_get_max_sectors(q, req_op(rq))); +} + static inline int ll_new_hw_segment(struct request *req, struct bio *bio, unsigned int nr_phys_segs) { @@ -718,6 +757,13 @@ static enum elv_merge blk_try_req_merge(struct request *req, return ELEVATOR_NO_MERGE; } +static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b) +{ + if (bio_page(a) == bio_page(b) && bio_offset(a) == bio_offset(b)) + return true; + return false; +} + /* * For non-mq, this has to be called with the request spinlock acquired. * For mq with scheduling, the appropriate queue wide lock should be held. 
@@ -731,8 +777,7 @@ static struct request *attempt_merge(struct request_queue *q, if (req_op(req) != req_op(next)) return NULL; - if (rq_data_dir(req) != rq_data_dir(next) - || req->rq_disk != next->rq_disk) + if (rq_data_dir(req) != rq_data_dir(next)) return NULL; if (req_op(req) == REQ_OP_WRITE_SAME && @@ -859,10 +904,6 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) if (bio_data_dir(bio) != rq_data_dir(rq)) return false; - /* must be same device */ - if (rq->rq_disk != bio->bi_bdev->bd_disk) - return false; - /* only merge integrity protected bio into ditto rq */ if (blk_integrity_merge_bio(rq->q, rq, bio) == false) return false; @@ -1023,12 +1064,10 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q, * @q: request_queue new bio is being queued at * @bio: new bio being queued * @nr_segs: number of segments in @bio - * @same_queue_rq: pointer to &struct request that gets filled in when - * another request associated with @q is found on the plug list - * (optional, may be %NULL) + * from the passed in @q already in the plug list * - * Determine whether @bio being queued on @q can be merged with a request - * on %current's plugged list. Returns %true if merge was successful, + * Determine whether @bio being queued on @q can be merged with the previous + * request on %current's plugged list. Returns %true if merge was successful, * otherwise %false. * * Plugging coalesces IOs from the same issuer for the same purpose without @@ -1041,36 +1080,22 @@ static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q, * Caller must ensure !blk_queue_nomerges(q) beforehand. */ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int nr_segs, struct request **same_queue_rq) + unsigned int nr_segs) { struct blk_plug *plug; struct request *rq; - struct list_head *plug_list; plug = blk_mq_plug(q, bio); - if (!plug) + if (!plug || rq_list_empty(plug->mq_list)) return false; - plug_list = &plug->mq_list; - - list_for_each_entry_reverse(rq, plug_list, queuelist) { - if (rq->q == q && same_queue_rq) { - /* - * Only blk-mq multiple hardware queues case checks the - * rq in the same queue, there should be only one such - * rq in a queue - **/ - *same_queue_rq = rq; - } - - if (rq->q != q) - continue; - + /* check the previously added entry for a quick merge attempt */ + rq = rq_list_peek(&plug->mq_list); + if (rq->q == q) { if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == - BIO_MERGE_OK) + BIO_MERGE_OK) return true; } - return false; } diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 3b38d15723..3a790eb499 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -11,6 +11,7 @@ #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" +#include "blk-mq-sched.h" #include "blk-mq-tag.h" #include "blk-rq-qos.h" @@ -29,6 +30,9 @@ static int queue_poll_stat_show(void *data, struct seq_file *m) struct request_queue *q = data; int bucket; + if (!q->poll_stat) + return 0; + for (bucket = 0; bucket < (BLK_MQ_POLL_STATS_BKTS / 2); bucket++) { seq_printf(m, "read (%d Bytes): ", 1 << (9 + bucket)); print_stat(m, &q->poll_stat[2 * bucket]); @@ -122,9 +126,7 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(FUA), QUEUE_FLAG_NAME(DAX), QUEUE_FLAG_NAME(STATS), - QUEUE_FLAG_NAME(POLL_STATS), QUEUE_FLAG_NAME(REGISTERED), - QUEUE_FLAG_NAME(SCSI_PASSTHROUGH), QUEUE_FLAG_NAME(QUIESCED), QUEUE_FLAG_NAME(PCI_P2PDMA), QUEUE_FLAG_NAME(ZONE_RESETALL), @@ -287,7 +289,7 @@ static const char 
*const cmd_flag_name[] = { CMD_FLAG_NAME(BACKGROUND), CMD_FLAG_NAME(NOWAIT), CMD_FLAG_NAME(NOUNMAP), - CMD_FLAG_NAME(HIPRI), + CMD_FLAG_NAME(POLLED), }; #undef CMD_FLAG_NAME @@ -309,6 +311,7 @@ static const char *const rqf_name[] = { RQF_NAME(SPECIAL_PAYLOAD), RQF_NAME(ZONE_WRITE_LOCKED), RQF_NAME(MQ_POLL_SLEPT), + RQF_NAME(ELV), }; #undef RQF_NAME @@ -453,11 +456,11 @@ static void blk_mq_debugfs_tags_show(struct seq_file *m, atomic_read(&tags->active_queues)); seq_puts(m, "\nbitmap_tags:\n"); - sbitmap_queue_show(tags->bitmap_tags, m); + sbitmap_queue_show(&tags->bitmap_tags, m); if (tags->nr_reserved_tags) { seq_puts(m, "\nbreserved_tags:\n"); - sbitmap_queue_show(tags->breserved_tags, m); + sbitmap_queue_show(&tags->breserved_tags, m); } } @@ -488,7 +491,7 @@ static int hctx_tags_bitmap_show(void *data, struct seq_file *m) if (res) goto out; if (hctx->tags) - sbitmap_bitmap_show(&hctx->tags->bitmap_tags->sb, m); + sbitmap_bitmap_show(&hctx->tags->bitmap_tags.sb, m); mutex_unlock(&q->sysfs_lock); out: @@ -522,77 +525,13 @@ static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m) if (res) goto out; if (hctx->sched_tags) - sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags->sb, m); + sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags.sb, m); mutex_unlock(&q->sysfs_lock); out: return res; } -static int hctx_io_poll_show(void *data, struct seq_file *m) -{ - struct blk_mq_hw_ctx *hctx = data; - - seq_printf(m, "considered=%lu\n", hctx->poll_considered); - seq_printf(m, "invoked=%lu\n", hctx->poll_invoked); - seq_printf(m, "success=%lu\n", hctx->poll_success); - return 0; -} - -static ssize_t hctx_io_poll_write(void *data, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct blk_mq_hw_ctx *hctx = data; - - hctx->poll_considered = hctx->poll_invoked = hctx->poll_success = 0; - return count; -} - -static int hctx_dispatched_show(void *data, struct seq_file *m) -{ - struct blk_mq_hw_ctx *hctx = data; - int i; - - seq_printf(m, "%8u\t%lu\n", 0U, hctx->dispatched[0]); - - for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER - 1; i++) { - unsigned int d = 1U << (i - 1); - - seq_printf(m, "%8u\t%lu\n", d, hctx->dispatched[i]); - } - - seq_printf(m, "%8u+\t%lu\n", 1U << (i - 1), hctx->dispatched[i]); - return 0; -} - -static ssize_t hctx_dispatched_write(void *data, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct blk_mq_hw_ctx *hctx = data; - int i; - - for (i = 0; i < BLK_MQ_MAX_DISPATCH_ORDER; i++) - hctx->dispatched[i] = 0; - return count; -} - -static int hctx_queued_show(void *data, struct seq_file *m) -{ - struct blk_mq_hw_ctx *hctx = data; - - seq_printf(m, "%lu\n", hctx->queued); - return 0; -} - -static ssize_t hctx_queued_write(void *data, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct blk_mq_hw_ctx *hctx = data; - - hctx->queued = 0; - return count; -} - static int hctx_run_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; @@ -614,7 +553,7 @@ static int hctx_active_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; - seq_printf(m, "%d\n", atomic_read(&hctx->nr_active)); + seq_printf(m, "%d\n", __blk_mq_active_requests(hctx)); return 0; } @@ -663,57 +602,6 @@ CTX_RQ_SEQ_OPS(default, HCTX_TYPE_DEFAULT); CTX_RQ_SEQ_OPS(read, HCTX_TYPE_READ); CTX_RQ_SEQ_OPS(poll, HCTX_TYPE_POLL); -static int ctx_dispatched_show(void *data, struct seq_file *m) -{ - struct blk_mq_ctx *ctx = data; - - seq_printf(m, "%lu %lu\n", ctx->rq_dispatched[1], ctx->rq_dispatched[0]); - return 0; -} - -static ssize_t 
ctx_dispatched_write(void *data, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct blk_mq_ctx *ctx = data; - - ctx->rq_dispatched[0] = ctx->rq_dispatched[1] = 0; - return count; -} - -static int ctx_merged_show(void *data, struct seq_file *m) -{ - struct blk_mq_ctx *ctx = data; - - seq_printf(m, "%lu\n", ctx->rq_merged); - return 0; -} - -static ssize_t ctx_merged_write(void *data, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct blk_mq_ctx *ctx = data; - - ctx->rq_merged = 0; - return count; -} - -static int ctx_completed_show(void *data, struct seq_file *m) -{ - struct blk_mq_ctx *ctx = data; - - seq_printf(m, "%lu %lu\n", ctx->rq_completed[1], ctx->rq_completed[0]); - return 0; -} - -static ssize_t ctx_completed_write(void *data, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct blk_mq_ctx *ctx = data; - - ctx->rq_completed[0] = ctx->rq_completed[1] = 0; - return count; -} - static int blk_mq_debugfs_show(struct seq_file *m, void *v) { const struct blk_mq_debugfs_attr *attr = m->private; @@ -789,9 +677,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { {"tags_bitmap", 0400, hctx_tags_bitmap_show}, {"sched_tags", 0400, hctx_sched_tags_show}, {"sched_tags_bitmap", 0400, hctx_sched_tags_bitmap_show}, - {"io_poll", 0600, hctx_io_poll_show, hctx_io_poll_write}, - {"dispatched", 0600, hctx_dispatched_show, hctx_dispatched_write}, - {"queued", 0600, hctx_queued_show, hctx_queued_write}, {"run", 0600, hctx_run_show, hctx_run_write}, {"active", 0400, hctx_active_show}, {"dispatch_busy", 0400, hctx_dispatch_busy_show}, @@ -803,9 +688,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = { {"default_rq_list", 0400, .seq_ops = &ctx_default_rq_list_seq_ops}, {"read_rq_list", 0400, .seq_ops = &ctx_read_rq_list_seq_ops}, {"poll_rq_list", 0400, .seq_ops = &ctx_poll_rq_list_seq_ops}, - {"dispatched", 0600, ctx_dispatched_show, ctx_dispatched_write}, - {"merged", 0600, ctx_merged_show, ctx_merged_write}, - {"completed", 0600, ctx_completed_show, ctx_completed_write}, {}, }; diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 0f006cabfd..55488ba978 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -18,32 +18,6 @@ #include "blk-mq-tag.h" #include "blk-wbt.h" -void blk_mq_sched_assign_ioc(struct request *rq) -{ - struct request_queue *q = rq->q; - struct io_context *ioc; - struct io_cq *icq; - - /* - * May not have an IO context if it's a passthrough request - */ - ioc = current->io_context; - if (!ioc) - return; - - spin_lock_irq(&q->queue_lock); - icq = ioc_lookup_icq(ioc, q); - spin_unlock_irq(&q->queue_lock); - - if (!icq) { - icq = ioc_create_icq(ioc, q, GFP_ATOMIC); - if (!icq) - return; - } - get_io_context(icq->ioc); - rq->elv.icq = icq; -} - /* * Mark a hardware queue as needing a restart. For shared queues, maintain * a count of how many hardware queues are marked for restart. 
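Removing blk_mq_sched_assign_ioc() from the scheduler core means the io_context/icq lookup is no longer charged to every request; the assumption behind this series is that only an icq-using elevator (in practice BFQ) performs it from its own ->prepare_request hook. The hook below is a hypothetical sketch that simply mirrors the body of the deleted helper; the function name is illustrative.

/*
 * Hypothetical ->prepare_request hook for an icq-based scheduler,
 * reusing the calls from the deleted blk_mq_sched_assign_ioc().
 */
static void sched_prepare_request(struct request *rq)
{
	struct request_queue *q = rq->q;
	struct io_context *ioc = current->io_context;
	struct io_cq *icq;

	/* passthrough requests may not have an io_context */
	if (!ioc)
		return;

	spin_lock_irq(&q->queue_lock);
	icq = ioc_lookup_icq(ioc, q);
	spin_unlock_irq(&q->queue_lock);

	if (!icq) {
		icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
		if (!icq)
			return;
	}
	get_io_context(icq->ioc);
	rq->elv.icq = icq;
}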
@@ -57,10 +31,8 @@ void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) } EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx); -void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) +void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) { - if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - return; clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); /* @@ -363,7 +335,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) } } -bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, +bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { struct elevator_queue *e = q->elevator; @@ -372,15 +344,17 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, bool ret = false; enum hctx_type type; - if (e && e->type->ops.bio_merge) - return e->type->ops.bio_merge(q, bio, nr_segs); + if (e && e->type->ops.bio_merge) { + ret = e->type->ops.bio_merge(q, bio, nr_segs); + goto out_put; + } ctx = blk_mq_get_ctx(q); hctx = blk_mq_map_queue(q, bio->bi_opf, ctx); type = hctx->type; if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) || list_empty_careful(&ctx->rq_lists[type])) - return false; + goto out_put; /* default per sw-queue merge */ spin_lock(&ctx->lock); @@ -389,13 +363,11 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, * potentially merge with. Currently includes a hand-wavy stop * count of 8, to not spend too much time checking for merges. */ - if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) { - ctx->rq_merged++; + if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) ret = true; - } spin_unlock(&ctx->lock); - +out_put: return ret; } @@ -502,8 +474,9 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx, * busy in case of 'none' scheduler, and this way may save * us one extra enqueue & dequeue to sw queue. 
*/ - if (!hctx->dispatch_busy && !e && !run_queue_async) { - blk_mq_try_issue_list_directly(hctx, list); + if (!hctx->dispatch_busy && !run_queue_async) { + blk_mq_run_dispatch_ops(hctx->queue, + blk_mq_try_issue_list_directly(hctx, list)); if (list_empty(list)) goto out; } @@ -515,83 +488,71 @@ void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx, percpu_ref_put(&q->q_usage_counter); } -static int blk_mq_sched_alloc_tags(struct request_queue *q, - struct blk_mq_hw_ctx *hctx, - unsigned int hctx_idx) +static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q, + struct blk_mq_hw_ctx *hctx, + unsigned int hctx_idx) { - struct blk_mq_tag_set *set = q->tag_set; - int ret; - - hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests, - set->reserved_tags, set->flags); - if (!hctx->sched_tags) - return -ENOMEM; - - ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests); - if (ret) { - blk_mq_free_rq_map(hctx->sched_tags, set->flags); - hctx->sched_tags = NULL; + if (blk_mq_is_shared_tags(q->tag_set->flags)) { + hctx->sched_tags = q->sched_shared_tags; + return 0; } - return ret; + hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx, + q->nr_requests); + + if (!hctx->sched_tags) + return -ENOMEM; + return 0; +} + +static void blk_mq_exit_sched_shared_tags(struct request_queue *queue) +{ + blk_mq_free_rq_map(queue->sched_shared_tags); + queue->sched_shared_tags = NULL; } /* called in queue's release handler, tagset has gone away */ -static void blk_mq_sched_tags_teardown(struct request_queue *q) +static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags) { struct blk_mq_hw_ctx *hctx; int i; queue_for_each_hw_ctx(q, hctx, i) { if (hctx->sched_tags) { - blk_mq_free_rq_map(hctx->sched_tags, hctx->flags); + if (!blk_mq_is_shared_tags(flags)) + blk_mq_free_rq_map(hctx->sched_tags); hctx->sched_tags = NULL; } } + + if (blk_mq_is_shared_tags(flags)) + blk_mq_exit_sched_shared_tags(q); } -static int blk_mq_init_sched_shared_sbitmap(struct request_queue *queue) +static int blk_mq_init_sched_shared_tags(struct request_queue *queue) { struct blk_mq_tag_set *set = queue->tag_set; - int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags); - struct blk_mq_hw_ctx *hctx; - int ret, i; /* * Set initial depth at max so that we don't need to reallocate for * updating nr_requests. 
*/ - ret = blk_mq_init_bitmaps(&queue->sched_bitmap_tags, - &queue->sched_breserved_tags, - MAX_SCHED_RQ, set->reserved_tags, - set->numa_node, alloc_policy); - if (ret) - return ret; + queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set, + BLK_MQ_NO_HCTX_IDX, + MAX_SCHED_RQ); + if (!queue->sched_shared_tags) + return -ENOMEM; - queue_for_each_hw_ctx(queue, hctx, i) { - hctx->sched_tags->bitmap_tags = - &queue->sched_bitmap_tags; - hctx->sched_tags->breserved_tags = - &queue->sched_breserved_tags; - } - - sbitmap_queue_resize(&queue->sched_bitmap_tags, - queue->nr_requests - set->reserved_tags); + blk_mq_tag_update_sched_shared_tags(queue); return 0; } -static void blk_mq_exit_sched_shared_sbitmap(struct request_queue *queue) -{ - sbitmap_queue_free(&queue->sched_bitmap_tags); - sbitmap_queue_free(&queue->sched_breserved_tags); -} - int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) { + unsigned int i, flags = q->tag_set->flags; struct blk_mq_hw_ctx *hctx; struct elevator_queue *eq; - unsigned int i; int ret; if (!e) { @@ -606,23 +567,23 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) * Additionally, this is a per-hw queue depth. */ q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth, - BLKDEV_MAX_RQ); + BLKDEV_DEFAULT_RQ); - queue_for_each_hw_ctx(q, hctx, i) { - ret = blk_mq_sched_alloc_tags(q, hctx, i); + if (blk_mq_is_shared_tags(flags)) { + ret = blk_mq_init_sched_shared_tags(q); if (ret) - goto err_free_tags; + return ret; } - if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) { - ret = blk_mq_init_sched_shared_sbitmap(q); + queue_for_each_hw_ctx(q, hctx, i) { + ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i); if (ret) - goto err_free_tags; + goto err_free_map_and_rqs; } ret = e->ops.init_sched(q, e); if (ret) - goto err_free_sbitmap; + goto err_free_map_and_rqs; blk_mq_debugfs_register_sched(q); @@ -631,7 +592,7 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) ret = e->ops.init_hctx(hctx, i); if (ret) { eq = q->elevator; - blk_mq_sched_free_requests(q); + blk_mq_sched_free_rqs(q); blk_mq_exit_sched(q, eq); kobject_put(&eq->kobj); return ret; @@ -642,12 +603,10 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) return 0; -err_free_sbitmap: - if (blk_mq_is_sbitmap_shared(q->tag_set->flags)) - blk_mq_exit_sched_shared_sbitmap(q); -err_free_tags: - blk_mq_sched_free_requests(q); - blk_mq_sched_tags_teardown(q); +err_free_map_and_rqs: + blk_mq_sched_free_rqs(q); + blk_mq_sched_tags_teardown(q, flags); + q->elevator = NULL; return ret; } @@ -656,14 +615,20 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) * called in either blk_queue_cleanup or elevator_switch, tagset * is required for freeing requests */ -void blk_mq_sched_free_requests(struct request_queue *q) +void blk_mq_sched_free_rqs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; int i; - queue_for_each_hw_ctx(q, hctx, i) { - if (hctx->sched_tags) - blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i); + if (blk_mq_is_shared_tags(q->tag_set->flags)) { + blk_mq_free_rqs(q->tag_set, q->sched_shared_tags, + BLK_MQ_NO_HCTX_IDX); + } else { + queue_for_each_hw_ctx(q, hctx, i) { + if (hctx->sched_tags) + blk_mq_free_rqs(q->tag_set, + hctx->sched_tags, i); + } } } @@ -684,8 +649,6 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) blk_mq_debugfs_unregister_sched(q); if (e->type->ops.exit_sched) e->type->ops.exit_sched(e); - blk_mq_sched_tags_teardown(q); - if 
(blk_mq_is_sbitmap_shared(flags)) - blk_mq_exit_sched_shared_sbitmap(q); + blk_mq_sched_tags_teardown(q, flags); q->elevator = NULL; } diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 5246ae0407..0250139724 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -2,21 +2,20 @@ #ifndef BLK_MQ_SCHED_H #define BLK_MQ_SCHED_H +#include "elevator.h" #include "blk-mq.h" #include "blk-mq-tag.h" -#define MAX_SCHED_RQ (16 * BLKDEV_MAX_RQ) - -void blk_mq_sched_assign_ioc(struct request *rq); +#define MAX_SCHED_RQ (16 * BLKDEV_DEFAULT_RQ) bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs, struct request **merged_request); -bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, +bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs); bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, struct list_head *free); void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx); -void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); +void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_insert_request(struct request *rq, bool at_head, bool run_queue, bool async); @@ -28,45 +27,51 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e); void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); -void blk_mq_sched_free_requests(struct request_queue *q); +void blk_mq_sched_free_rqs(struct request_queue *q); -static inline bool -blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, - unsigned int nr_segs) +static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) { - if (blk_queue_nomerges(q) || !bio_mergeable(bio)) - return false; + if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) + __blk_mq_sched_restart(hctx); +} - return __blk_mq_sched_bio_merge(q, bio, nr_segs); +static inline bool bio_mergeable(struct bio *bio) +{ + return !(bio->bi_opf & REQ_NOMERGE_FLAGS); } static inline bool blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq, struct bio *bio) { - struct elevator_queue *e = q->elevator; - - if (e && e->type->ops.allow_merge) - return e->type->ops.allow_merge(q, rq, bio); + if (rq->rq_flags & RQF_ELV) { + struct elevator_queue *e = q->elevator; + if (e->type->ops.allow_merge) + return e->type->ops.allow_merge(q, rq, bio); + } return true; } static inline void blk_mq_sched_completed_request(struct request *rq, u64 now) { - struct elevator_queue *e = rq->q->elevator; + if (rq->rq_flags & RQF_ELV) { + struct elevator_queue *e = rq->q->elevator; - if (e && e->type->ops.completed_request) - e->type->ops.completed_request(rq, now); + if (e->type->ops.completed_request) + e->type->ops.completed_request(rq, now); + } } static inline void blk_mq_sched_requeue_request(struct request *rq) { - struct request_queue *q = rq->q; - struct elevator_queue *e = q->elevator; + if (rq->rq_flags & RQF_ELV) { + struct request_queue *q = rq->q; + struct elevator_queue *e = q->elevator; - if ((rq->rq_flags & RQF_ELVPRIV) && e && e->type->ops.requeue_request) - e->type->ops.requeue_request(rq); + if ((rq->rq_flags & RQF_ELVPRIV) && e->type->ops.requeue_request) + e->type->ops.requeue_request(rq); + } } static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx) diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 253c857cba..6747865740 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -36,8 
+36,6 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj) struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj); - if (hctx->flags & BLK_MQ_F_BLOCKING) - cleanup_srcu_struct(hctx->srcu); blk_free_flush_queue(hctx->fq); sbitmap_free(&hctx->ctx_map); free_cpumask_var(hctx->cpumask); diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index ff5caeb825..845f74e8dd 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -16,6 +16,21 @@ #include "blk-mq-sched.h" #include "blk-mq-tag.h" +/* + * Recalculate wakeup batch when tag is shared by hctx. + */ +static void blk_mq_update_wake_batch(struct blk_mq_tags *tags, + unsigned int users) +{ + if (!users) + return; + + sbitmap_queue_recalculate_wake_batch(&tags->bitmap_tags, + users); + sbitmap_queue_recalculate_wake_batch(&tags->breserved_tags, + users); +} + /* * If a previously inactive queue goes active, bump the active user count. * We need to do this before try to allocate driver tag, then even if fail @@ -24,19 +39,26 @@ */ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { - if (blk_mq_is_sbitmap_shared(hctx->flags)) { - struct request_queue *q = hctx->queue; - struct blk_mq_tag_set *set = q->tag_set; + unsigned int users; - if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) && - !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) - atomic_inc(&set->active_queues_shared_sbitmap); + if (blk_mq_is_shared_tags(hctx->flags)) { + struct request_queue *q = hctx->queue; + + if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) || + test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) { + return true; + } } else { - if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) && - !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) - atomic_inc(&hctx->tags->active_queues); + if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) || + test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) { + return true; + } } + users = atomic_inc_return(&hctx->tags->active_queues); + + blk_mq_update_wake_batch(hctx->tags, users); + return true; } @@ -45,9 +67,9 @@ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) */ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) { - sbitmap_queue_wake_all(tags->bitmap_tags); + sbitmap_queue_wake_all(&tags->bitmap_tags); if (include_reserve) - sbitmap_queue_wake_all(tags->breserved_tags); + sbitmap_queue_wake_all(&tags->breserved_tags); } /* @@ -57,20 +79,23 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) { struct blk_mq_tags *tags = hctx->tags; - struct request_queue *q = hctx->queue; - struct blk_mq_tag_set *set = q->tag_set; + unsigned int users; + + if (blk_mq_is_shared_tags(hctx->flags)) { + struct request_queue *q = hctx->queue; - if (blk_mq_is_sbitmap_shared(hctx->flags)) { if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) return; - atomic_dec(&set->active_queues_shared_sbitmap); } else { if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) return; - atomic_dec(&tags->active_queues); } + users = atomic_dec_return(&tags->active_queues); + + blk_mq_update_wake_batch(tags, users); + blk_mq_tag_wakeup_all(tags, false); } @@ -87,6 +112,21 @@ static int __blk_mq_get_tag(struct blk_mq_alloc_data *data, return __sbitmap_queue_get(bt); } +unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags, + unsigned int *offset) +{ + struct blk_mq_tags *tags = blk_mq_tags_from_data(data); + struct sbitmap_queue *bt = &tags->bitmap_tags; + unsigned long ret; + + 
if (data->shallow_depth ||data->flags & BLK_MQ_REQ_RESERVED || + data->hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) + return 0; + ret = __sbitmap_queue_get_batch(bt, nr_tags, offset); + *offset += tags->nr_reserved_tags; + return ret; +} + unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) { struct blk_mq_tags *tags = blk_mq_tags_from_data(data); @@ -101,10 +141,10 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) WARN_ON_ONCE(1); return BLK_MQ_NO_TAG; } - bt = tags->breserved_tags; + bt = &tags->breserved_tags; tag_offset = 0; } else { - bt = tags->bitmap_tags; + bt = &tags->bitmap_tags; tag_offset = tags->nr_reserved_tags; } @@ -150,9 +190,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) data->ctx); tags = blk_mq_tags_from_data(data); if (data->flags & BLK_MQ_REQ_RESERVED) - bt = tags->breserved_tags; + bt = &tags->breserved_tags; else - bt = tags->bitmap_tags; + bt = &tags->bitmap_tags; /* * If destination hw queue is changed, fake wake up on @@ -186,16 +226,23 @@ void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, const int real_tag = tag - tags->nr_reserved_tags; BUG_ON(real_tag >= tags->nr_tags); - sbitmap_queue_clear(tags->bitmap_tags, real_tag, ctx->cpu); + sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu); } else { BUG_ON(tag >= tags->nr_reserved_tags); - sbitmap_queue_clear(tags->breserved_tags, tag, ctx->cpu); + sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu); } } +void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags) +{ + sbitmap_queue_clear_batch(&tags->bitmap_tags, tags->nr_reserved_tags, + tag_array, nr_tags); +} + struct bt_iter_data { struct blk_mq_hw_ctx *hctx; - busy_iter_fn *fn; + struct request_queue *q; + busy_tag_iter_fn *fn; void *data; bool reserved; }; @@ -208,7 +255,7 @@ static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags, spin_lock_irqsave(&tags->lock, flags); rq = tags->rqs[bitnr]; - if (!rq || rq->tag != bitnr || !refcount_inc_not_zero(&rq->ref)) + if (!rq || rq->tag != bitnr || !req_ref_inc_not_zero(rq)) rq = NULL; spin_unlock_irqrestore(&tags->lock, flags); return rq; @@ -218,11 +265,18 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) { struct bt_iter_data *iter_data = data; struct blk_mq_hw_ctx *hctx = iter_data->hctx; - struct blk_mq_tags *tags = hctx->tags; + struct request_queue *q = iter_data->q; + struct blk_mq_tag_set *set = q->tag_set; bool reserved = iter_data->reserved; + struct blk_mq_tags *tags; struct request *rq; bool ret = true; + if (blk_mq_is_shared_tags(set->flags)) + tags = set->shared_tags; + else + tags = hctx->tags; + if (!reserved) bitnr += tags->nr_reserved_tags; /* @@ -233,8 +287,8 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) if (!rq) return true; - if (rq->q == hctx->queue && rq->mq_hctx == hctx) - ret = iter_data->fn(hctx, rq, iter_data->data, reserved); + if (rq->q == q && (!hctx || rq->mq_hctx == hctx)) + ret = iter_data->fn(rq, iter_data->data, reserved); blk_mq_put_rq_ref(rq); return ret; } @@ -242,6 +296,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) /** * bt_for_each - iterate over the requests associated with a hardware queue * @hctx: Hardware queue to examine. + * @q: Request queue to examine. * @bt: sbitmap to examine. This is either the breserved_tags member * or the bitmap_tags member of struct blk_mq_tags. 
* @fn: Pointer to the function that will be called for each request @@ -253,14 +308,16 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) * @reserved: Indicates whether @bt is the breserved_tags member or the * bitmap_tags member of struct blk_mq_tags. */ -static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt, - busy_iter_fn *fn, void *data, bool reserved) +static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct request_queue *q, + struct sbitmap_queue *bt, busy_tag_iter_fn *fn, + void *data, bool reserved) { struct bt_iter_data iter_data = { .hctx = hctx, .fn = fn, .data = data, .reserved = reserved, + .q = q, }; sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data); @@ -340,9 +397,9 @@ static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags, WARN_ON_ONCE(flags & BT_TAG_ITER_RESERVED); if (tags->nr_reserved_tags) - bt_tags_for_each(tags, tags->breserved_tags, fn, priv, + bt_tags_for_each(tags, &tags->breserved_tags, fn, priv, flags | BT_TAG_ITER_RESERVED); - bt_tags_for_each(tags, tags->bitmap_tags, fn, priv, flags); + bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, flags); } /** @@ -379,9 +436,12 @@ void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv) { - int i; + unsigned int flags = tagset->flags; + int i, nr_tags; - for (i = 0; i < tagset->nr_hw_queues; i++) { + nr_tags = blk_mq_is_shared_tags(flags) ? 1 : tagset->nr_hw_queues; + + for (i = 0; i < nr_tags; i++) { if (tagset->tags && tagset->tags[i]) __blk_mq_all_tag_iter(tagset->tags[i], fn, priv, BT_TAG_ITER_STARTED); @@ -434,12 +494,9 @@ EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request); * called for all requests on all queues that share that tag set and not only * for requests associated with @q. */ -void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, +void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, void *priv) { - struct blk_mq_hw_ctx *hctx; - int i; - /* * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx * while the queue is frozen. 
So we can use q_usage_counter to avoid @@ -448,19 +505,34 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, if (!percpu_ref_tryget(&q->q_usage_counter)) return; - queue_for_each_hw_ctx(q, hctx, i) { - struct blk_mq_tags *tags = hctx->tags; - - /* - * If no software queues are currently mapped to this - * hardware queue, there's nothing to check - */ - if (!blk_mq_hw_queue_mapped(hctx)) - continue; + if (blk_mq_is_shared_tags(q->tag_set->flags)) { + struct blk_mq_tags *tags = q->tag_set->shared_tags; + struct sbitmap_queue *bresv = &tags->breserved_tags; + struct sbitmap_queue *btags = &tags->bitmap_tags; if (tags->nr_reserved_tags) - bt_for_each(hctx, tags->breserved_tags, fn, priv, true); - bt_for_each(hctx, tags->bitmap_tags, fn, priv, false); + bt_for_each(NULL, q, bresv, fn, priv, true); + bt_for_each(NULL, q, btags, fn, priv, false); + } else { + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) { + struct blk_mq_tags *tags = hctx->tags; + struct sbitmap_queue *bresv = &tags->breserved_tags; + struct sbitmap_queue *btags = &tags->bitmap_tags; + + /* + * If no software queues are currently mapped to this + * hardware queue, there's nothing to check + */ + if (!blk_mq_hw_queue_mapped(hctx)) + continue; + + if (tags->nr_reserved_tags) + bt_for_each(hctx, q, bresv, fn, priv, true); + bt_for_each(hctx, q, btags, fn, priv, false); + } } blk_queue_exit(q); } @@ -492,56 +564,10 @@ int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags, return -ENOMEM; } -static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, - int node, int alloc_policy) -{ - int ret; - - ret = blk_mq_init_bitmaps(&tags->__bitmap_tags, - &tags->__breserved_tags, - tags->nr_tags, tags->nr_reserved_tags, - node, alloc_policy); - if (ret) - return ret; - - tags->bitmap_tags = &tags->__bitmap_tags; - tags->breserved_tags = &tags->__breserved_tags; - - return 0; -} - -int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set) -{ - int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags); - int i, ret; - - ret = blk_mq_init_bitmaps(&set->__bitmap_tags, &set->__breserved_tags, - set->queue_depth, set->reserved_tags, - set->numa_node, alloc_policy); - if (ret) - return ret; - - for (i = 0; i < set->nr_hw_queues; i++) { - struct blk_mq_tags *tags = set->tags[i]; - - tags->bitmap_tags = &set->__bitmap_tags; - tags->breserved_tags = &set->__breserved_tags; - } - - return 0; -} - -void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set) -{ - sbitmap_queue_free(&set->__bitmap_tags); - sbitmap_queue_free(&set->__breserved_tags); -} - struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, unsigned int reserved_tags, - int node, unsigned int flags) + int node, int alloc_policy) { - int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(flags); struct blk_mq_tags *tags; if (total_tags > BLK_MQ_TAG_MAX) { @@ -557,22 +583,19 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, tags->nr_reserved_tags = reserved_tags; spin_lock_init(&tags->lock); - if (blk_mq_is_sbitmap_shared(flags)) - return tags; - - if (blk_mq_init_bitmap_tags(tags, node, alloc_policy) < 0) { + if (blk_mq_init_bitmaps(&tags->bitmap_tags, &tags->breserved_tags, + total_tags, reserved_tags, node, + alloc_policy) < 0) { kfree(tags); return NULL; } return tags; } -void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags) +void blk_mq_free_tags(struct blk_mq_tags *tags) { - if (!blk_mq_is_sbitmap_shared(flags)) { - sbitmap_queue_free(tags->bitmap_tags); - 
sbitmap_queue_free(tags->breserved_tags); - } + sbitmap_queue_free(&tags->bitmap_tags); + sbitmap_queue_free(&tags->breserved_tags); kfree(tags); } @@ -592,7 +615,6 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, if (tdepth > tags->nr_tags) { struct blk_mq_tag_set *set = hctx->queue->tag_set; struct blk_mq_tags *new; - bool ret; if (!can_grow) return -EINVAL; @@ -604,34 +626,42 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, if (tdepth > MAX_SCHED_RQ) return -EINVAL; - new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, - tags->nr_reserved_tags, set->flags); + /* + * Only the sbitmap needs resizing since we allocated the max + * initially. + */ + if (blk_mq_is_shared_tags(set->flags)) + return 0; + + new = blk_mq_alloc_map_and_rqs(set, hctx->queue_num, tdepth); if (!new) return -ENOMEM; - ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth); - if (ret) { - blk_mq_free_rq_map(new, set->flags); - return -ENOMEM; - } - blk_mq_free_rqs(set, *tagsptr, hctx->queue_num); - blk_mq_free_rq_map(*tagsptr, set->flags); + blk_mq_free_map_and_rqs(set, *tagsptr, hctx->queue_num); *tagsptr = new; } else { /* * Don't need (or can't) update reserved tags here, they * remain static and should never need resizing. */ - sbitmap_queue_resize(tags->bitmap_tags, + sbitmap_queue_resize(&tags->bitmap_tags, tdepth - tags->nr_reserved_tags); } return 0; } -void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int size) +void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size) { - sbitmap_queue_resize(&set->__bitmap_tags, size - set->reserved_tags); + struct blk_mq_tags *tags = set->shared_tags; + + sbitmap_queue_resize(&tags->bitmap_tags, size - set->reserved_tags); +} + +void blk_mq_tag_update_sched_shared_tags(struct request_queue *q) +{ + sbitmap_queue_resize(&q->sched_shared_tags->bitmap_tags, + q->nr_requests - q->tag_set->reserved_tags); } /** diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 8ed55af084..5668e28be0 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -2,55 +2,33 @@ #ifndef INT_BLK_MQ_TAG_H #define INT_BLK_MQ_TAG_H -/* - * Tag address space map. 
- */ -struct blk_mq_tags { - unsigned int nr_tags; - unsigned int nr_reserved_tags; - - atomic_t active_queues; - - struct sbitmap_queue *bitmap_tags; - struct sbitmap_queue *breserved_tags; - - struct sbitmap_queue __bitmap_tags; - struct sbitmap_queue __breserved_tags; - - struct request **rqs; - struct request **static_rqs; - struct list_head page_list; - - /* - * used to clear request reference in rqs[] before freeing one - * request pool - */ - spinlock_t lock; -}; +struct blk_mq_alloc_data; extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, - int node, unsigned int flags); -extern void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags); + int node, int alloc_policy); +extern void blk_mq_free_tags(struct blk_mq_tags *tags); extern int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags, struct sbitmap_queue *breserved_tags, unsigned int queue_depth, unsigned int reserved, int node, int alloc_policy); -extern int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set); -extern void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set); extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); +unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags, + unsigned int *offset); extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag); +void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags); extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags **tags, unsigned int depth, bool can_grow); -extern void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set, +extern void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size); +extern void blk_mq_tag_update_sched_shared_tags(struct request_queue *q); extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); -void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, +void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, void *priv); void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void *priv); diff --git a/block/blk-mq.c b/block/blk-mq.c index 82de39926a..9a9185a0a2 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -10,14 +10,15 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include -#include #include #include #include @@ -27,6 +28,7 @@ #include #include #include +#include #include @@ -63,6 +65,32 @@ static int blk_mq_poll_stats_bkt(const struct request *rq) return bucket; } +#define BLK_QC_T_SHIFT 16 +#define BLK_QC_T_INTERNAL (1U << 31) + +static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q, + blk_qc_t qc) +{ + return q->queue_hw_ctx[(qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT]; +} + +static inline struct request *blk_qc_to_rq(struct blk_mq_hw_ctx *hctx, + blk_qc_t qc) +{ + unsigned int tag = qc & ((1U << BLK_QC_T_SHIFT) - 1); + + if (qc & BLK_QC_T_INTERNAL) + return blk_mq_tag_to_rq(hctx->sched_tags, tag); + return blk_mq_tag_to_rq(hctx->tags, tag); +} + +static inline blk_qc_t blk_rq_to_qc(struct request *rq) +{ + return (rq->mq_hctx->queue_num << BLK_QC_T_SHIFT) | + (rq->tag != -1 ? + rq->tag : (rq->internal_tag | BLK_QC_T_INTERNAL)); +} + /* * Check if any of the ctx, dispatch list or elevator * have pending work in this hardware queue. 
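The new blk_qc_to_hctx()/blk_qc_to_rq()/blk_rq_to_qc() helpers pack everything bio_poll() needs into one 32-bit cookie: the hardware queue index sits above BLK_QC_T_SHIFT, the tag occupies the low 16 bits, and bit 31 marks a scheduler (internal) tag. A worked round trip under the definitions shown in this hunk; the function below is only an illustrative sketch, not part of the patch.

/*
 * Worked example of the cookie layout (values copied from the hunk
 * above): hardware queue 3, scheduler tag 42.
 */
#define BLK_QC_T_SHIFT		16
#define BLK_QC_T_INTERNAL	(1U << 31)

static unsigned int qc_roundtrip_example(void)
{
	unsigned int qc  = (3U << BLK_QC_T_SHIFT) | 42U | BLK_QC_T_INTERNAL;
	unsigned int hwq = (qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT; /* 3 */
	unsigned int tag = qc & ((1U << BLK_QC_T_SHIFT) - 1);           /* 42 */

	/*
	 * qc == 0x8003002a; because the INTERNAL bit is set, blk_qc_to_rq()
	 * would resolve the tag through hctx->sched_tags rather than
	 * hctx->tags.
	 */
	return hwq + tag;	/* 45, keeps the sketch warning-free */
}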
@@ -99,8 +127,7 @@ struct mq_inflight { unsigned int inflight[2]; }; -static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, - struct request *rq, void *priv, +static bool blk_mq_check_inflight(struct request *rq, void *priv, bool reserved) { struct mq_inflight *mi = priv; @@ -214,10 +241,31 @@ EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); */ void blk_mq_quiesce_queue_nowait(struct request_queue *q) { - blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q); + unsigned long flags; + + spin_lock_irqsave(&q->queue_lock, flags); + if (!q->quiesce_depth++) + blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q); + spin_unlock_irqrestore(&q->queue_lock, flags); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait); +/** + * blk_mq_wait_quiesce_done() - wait until in-progress quiesce is done + * @q: request queue. + * + * Note: it is driver's responsibility for making sure that quiesce has + * been started. + */ +void blk_mq_wait_quiesce_done(struct request_queue *q) +{ + if (blk_queue_has_srcu(q)) + synchronize_srcu(q->srcu); + else + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done); + /** * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished * @q: request queue. @@ -229,20 +277,8 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait); */ void blk_mq_quiesce_queue(struct request_queue *q) { - struct blk_mq_hw_ctx *hctx; - unsigned int i; - bool rcu = false; - blk_mq_quiesce_queue_nowait(q); - - queue_for_each_hw_ctx(q, hctx, i) { - if (hctx->flags & BLK_MQ_F_BLOCKING) - synchronize_srcu(hctx->srcu); - else - rcu = true; - } - if (rcu) - synchronize_rcu(); + blk_mq_wait_quiesce_done(q); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue); @@ -255,10 +291,21 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue); */ void blk_mq_unquiesce_queue(struct request_queue *q) { - blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q); + unsigned long flags; + bool run_queue = false; + + spin_lock_irqsave(&q->queue_lock, flags); + if (WARN_ON_ONCE(q->quiesce_depth <= 0)) { + ; + } else if (!--q->quiesce_depth) { + blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q); + run_queue = true; + } + spin_unlock_irqrestore(&q->queue_lock, flags); /* dispatch requests which are inserted during quiescing */ - blk_mq_run_hw_queues(q, true); + if (run_queue) + blk_mq_run_hw_queues(q, true); } EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue); @@ -272,91 +319,127 @@ void blk_mq_wake_waiters(struct request_queue *q) blk_mq_tag_wakeup_all(hctx->tags, true); } -/* - * Only need start/end time stamping if we have iostat or - * blk stats enabled, or using an IO scheduler. 
- */ -static inline bool blk_mq_need_time_stamp(struct request *rq) +void blk_rq_init(struct request_queue *q, struct request *rq) { - return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS)) || rq->q->elevator; -} + memset(rq, 0, sizeof(*rq)); -static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, - unsigned int tag, u64 alloc_time_ns) -{ - struct blk_mq_tags *tags = blk_mq_tags_from_data(data); - struct request *rq = tags->static_rqs[tag]; - - if (data->q->elevator) { - rq->tag = BLK_MQ_NO_TAG; - rq->internal_tag = tag; - } else { - rq->tag = tag; - rq->internal_tag = BLK_MQ_NO_TAG; - } - - /* csd/requeue_work/fifo_time is initialized before use */ - rq->q = data->q; - rq->mq_ctx = data->ctx; - rq->mq_hctx = data->hctx; - rq->rq_flags = 0; - rq->cmd_flags = data->cmd_flags; - if (data->flags & BLK_MQ_REQ_PM) - rq->rq_flags |= RQF_PM; - if (blk_queue_io_stat(data->q)) - rq->rq_flags |= RQF_IO_STAT; INIT_LIST_HEAD(&rq->queuelist); + rq->q = q; + rq->__sector = (sector_t) -1; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); - rq->rq_disk = NULL; + rq->tag = BLK_MQ_NO_TAG; + rq->internal_tag = BLK_MQ_NO_TAG; + rq->start_time_ns = ktime_get_ns(); rq->part = NULL; -#ifdef CONFIG_BLK_RQ_ALLOC_TIME - rq->alloc_time_ns = alloc_time_ns; -#endif + blk_crypto_rq_set_defaults(rq); +} +EXPORT_SYMBOL(blk_rq_init); + +static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, + struct blk_mq_tags *tags, unsigned int tag, u64 alloc_time_ns) +{ + struct blk_mq_ctx *ctx = data->ctx; + struct blk_mq_hw_ctx *hctx = data->hctx; + struct request_queue *q = data->q; + struct request *rq = tags->static_rqs[tag]; + + rq->q = q; + rq->mq_ctx = ctx; + rq->mq_hctx = hctx; + rq->cmd_flags = data->cmd_flags; + + if (data->flags & BLK_MQ_REQ_PM) + data->rq_flags |= RQF_PM; + if (blk_queue_io_stat(q)) + data->rq_flags |= RQF_IO_STAT; + rq->rq_flags = data->rq_flags; + + if (!(data->rq_flags & RQF_ELV)) { + rq->tag = tag; + rq->internal_tag = BLK_MQ_NO_TAG; + } else { + rq->tag = BLK_MQ_NO_TAG; + rq->internal_tag = tag; + } + rq->timeout = 0; + if (blk_mq_need_time_stamp(rq)) rq->start_time_ns = ktime_get_ns(); else rq->start_time_ns = 0; + rq->part = NULL; +#ifdef CONFIG_BLK_RQ_ALLOC_TIME + rq->alloc_time_ns = alloc_time_ns; +#endif rq->io_start_time_ns = 0; rq->stats_sectors = 0; rq->nr_phys_segments = 0; #if defined(CONFIG_BLK_DEV_INTEGRITY) rq->nr_integrity_segments = 0; #endif - blk_crypto_rq_set_defaults(rq); - /* tag was already set */ - WRITE_ONCE(rq->deadline, 0); - - rq->timeout = 0; - rq->end_io = NULL; rq->end_io_data = NULL; - data->ctx->rq_dispatched[op_is_sync(data->cmd_flags)]++; - refcount_set(&rq->ref, 1); + blk_crypto_rq_set_defaults(rq); + INIT_LIST_HEAD(&rq->queuelist); + /* tag was already set */ + WRITE_ONCE(rq->deadline, 0); + req_ref_set(rq, 1); - if (!op_is_flush(data->cmd_flags)) { + if (rq->rq_flags & RQF_ELV) { struct elevator_queue *e = data->q->elevator; - rq->elv.icq = NULL; - if (e && e->type->ops.prepare_request) { - if (e->type->icq_cache) - blk_mq_sched_assign_ioc(rq); + INIT_HLIST_NODE(&rq->hash); + RB_CLEAR_NODE(&rq->rb_node); + if (!op_is_flush(data->cmd_flags) && + e->type->ops.prepare_request) { e->type->ops.prepare_request(rq); rq->rq_flags |= RQF_ELVPRIV; } } - data->hctx->queued++; return rq; } -static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data) +static inline struct request * +__blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data, + u64 alloc_time_ns) +{ + unsigned int tag, tag_offset; + struct blk_mq_tags 
*tags; + struct request *rq; + unsigned long tag_mask; + int i, nr = 0; + + tag_mask = blk_mq_get_tags(data, data->nr_tags, &tag_offset); + if (unlikely(!tag_mask)) + return NULL; + + tags = blk_mq_tags_from_data(data); + for (i = 0; tag_mask; i++) { + if (!(tag_mask & (1UL << i))) + continue; + tag = tag_offset + i; + prefetch(tags->static_rqs[tag]); + tag_mask &= ~(1UL << i); + rq = blk_mq_rq_ctx_init(data, tags, tag, alloc_time_ns); + rq_list_add(data->cached_rq, rq); + nr++; + } + /* caller already holds a reference, add for remainder */ + percpu_ref_get_many(&data->q->q_usage_counter, nr - 1); + data->nr_tags -= nr; + + return rq_list_pop(data->cached_rq); +} + +static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) { struct request_queue *q = data->q; - struct elevator_queue *e = q->elevator; u64 alloc_time_ns = 0; + struct request *rq; unsigned int tag; /* alloc_time includes depth and tag waits */ @@ -366,7 +449,11 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data) if (data->cmd_flags & REQ_NOWAIT) data->flags |= BLK_MQ_REQ_NOWAIT; - if (e) { + if (q->elevator) { + struct elevator_queue *e = q->elevator; + + data->rq_flags |= RQF_ELV; + /* * Flush/passthrough requests are special and go directly to the * dispatch list. Don't include reserved tags in the @@ -382,9 +469,19 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data) retry: data->ctx = blk_mq_get_ctx(q); data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx); - if (!e) + if (!(data->rq_flags & RQF_ELV)) blk_mq_tag_busy(data->hctx); + /* + * Try batched alloc if we want more than 1 tag. + */ + if (data->nr_tags > 1) { + rq = __blk_mq_alloc_requests_batch(data, alloc_time_ns); + if (rq) + return rq; + data->nr_tags = 1; + } + /* * Waiting allocations only fail because of an inactive hctx. In that * case just retry the hctx assignment and tag allocation as CPU hotplug @@ -394,16 +491,18 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data) if (tag == BLK_MQ_NO_TAG) { if (data->flags & BLK_MQ_REQ_NOWAIT) return NULL; - /* - * Give up the CPU and sleep for a random short time to ensure - * that thread using a realtime scheduling class are migrated - * off the CPU, and thus off the hctx that is going away. + * Give up the CPU and sleep for a random short time to + * ensure that thread using a realtime scheduling class + * are migrated off the CPU, and thus off the hctx that + * is going away. 
*/ msleep(3); goto retry; } - return blk_mq_rq_ctx_init(data, tag, alloc_time_ns); + + return blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag, + alloc_time_ns); } struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, @@ -413,6 +512,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, .q = q, .flags = flags, .cmd_flags = op, + .nr_tags = 1, }; struct request *rq; int ret; @@ -421,7 +521,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, if (ret) return ERR_PTR(ret); - rq = __blk_mq_alloc_request(&data); + rq = __blk_mq_alloc_requests(&data); if (!rq) goto out_queue_exit; rq->__data_len = 0; @@ -441,6 +541,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, .q = q, .flags = flags, .cmd_flags = op, + .nr_tags = 1, }; u64 alloc_time_ns = 0; unsigned int cpu; @@ -480,12 +581,15 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, if (!q->elevator) blk_mq_tag_busy(data.hctx); + else + data.rq_flags |= RQF_ELV; ret = -EWOULDBLOCK; tag = blk_mq_get_tag(&data); if (tag == BLK_MQ_NO_TAG) goto out_queue_exit; - return blk_mq_rq_ctx_init(&data, tag, alloc_time_ns); + return blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag, + alloc_time_ns); out_queue_exit: blk_queue_exit(q); @@ -514,20 +618,12 @@ static void __blk_mq_free_request(struct request *rq) void blk_mq_free_request(struct request *rq) { struct request_queue *q = rq->q; - struct elevator_queue *e = q->elevator; - struct blk_mq_ctx *ctx = rq->mq_ctx; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; - if (rq->rq_flags & RQF_ELVPRIV) { - if (e && e->type->ops.finish_request) - e->type->ops.finish_request(rq); - if (rq->elv.icq) { - put_io_context(rq->elv.icq->ioc); - rq->elv.icq = NULL; - } - } + if ((rq->rq_flags & RQF_ELVPRIV) && + q->elevator->type->ops.finish_request) + q->elevator->type->ops.finish_request(rq); - ctx->rq_completed[rq_is_sync(rq)]++; if (rq->rq_flags & RQF_MQ_INFLIGHT) __blk_mq_dec_active_requests(hctx); @@ -537,26 +633,290 @@ void blk_mq_free_request(struct request *rq) rq_qos_done(q, rq); WRITE_ONCE(rq->state, MQ_RQ_IDLE); - if (refcount_dec_and_test(&rq->ref)) + if (req_ref_put_and_test(rq)) __blk_mq_free_request(rq); } EXPORT_SYMBOL_GPL(blk_mq_free_request); -inline void __blk_mq_end_request(struct request *rq, blk_status_t error) +void blk_mq_free_plug_rqs(struct blk_plug *plug) { - u64 now = 0; + struct request *rq; - if (blk_mq_need_time_stamp(rq)) - now = ktime_get_ns(); + while ((rq = rq_list_pop(&plug->cached_rq)) != NULL) + blk_mq_free_request(rq); +} +void blk_dump_rq_flags(struct request *rq, char *msg) +{ + printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg, + rq->q->disk ? rq->q->disk->disk_name : "?", + (unsigned long long) rq->cmd_flags); + + printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", + (unsigned long long)blk_rq_pos(rq), + blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); + printk(KERN_INFO " bio %p, biotail %p, len %u\n", + rq->bio, rq->biotail, blk_rq_bytes(rq)); +} +EXPORT_SYMBOL(blk_dump_rq_flags); + +static void req_bio_endio(struct request *rq, struct bio *bio, + unsigned int nbytes, blk_status_t error) +{ + if (unlikely(error)) { + bio->bi_status = error; + } else if (req_op(rq) == REQ_OP_ZONE_APPEND) { + /* + * Partial zone append completions cannot be supported as the + * BIO fragments may end up not being written sequentially. 
+ */ + if (bio->bi_iter.bi_size != nbytes) + bio->bi_status = BLK_STS_IOERR; + else + bio->bi_iter.bi_sector = rq->__sector; + } + + bio_advance(bio, nbytes); + + if (unlikely(rq->rq_flags & RQF_QUIET)) + bio_set_flag(bio, BIO_QUIET); + /* don't actually finish bio if it's part of flush sequence */ + if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ)) + bio_endio(bio); +} + +static void blk_account_io_completion(struct request *req, unsigned int bytes) +{ + if (req->part && blk_do_io_stat(req)) { + const int sgrp = op_stat_group(req_op(req)); + + part_stat_lock(); + part_stat_add(req->part, sectors[sgrp], bytes >> 9); + part_stat_unlock(); + } +} + +static void blk_print_req_error(struct request *req, blk_status_t status) +{ + printk_ratelimited(KERN_ERR + "%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x " + "phys_seg %u prio class %u\n", + blk_status_to_str(status), + req->q->disk ? req->q->disk->disk_name : "?", + blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)), + req->cmd_flags & ~REQ_OP_MASK, + req->nr_phys_segments, + IOPRIO_PRIO_CLASS(req->ioprio)); +} + +/* + * Fully end IO on a request. Does not support partial completions, or + * errors. + */ +static void blk_complete_request(struct request *req) +{ + const bool is_flush = (req->rq_flags & RQF_FLUSH_SEQ) != 0; + int total_bytes = blk_rq_bytes(req); + struct bio *bio = req->bio; + + trace_block_rq_complete(req, BLK_STS_OK, total_bytes); + + if (!bio) + return; + +#ifdef CONFIG_BLK_DEV_INTEGRITY + if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ) + req->q->integrity.profile->complete_fn(req, total_bytes); +#endif + + blk_account_io_completion(req, total_bytes); + + do { + struct bio *next = bio->bi_next; + + /* Completion has already been traced */ + bio_clear_flag(bio, BIO_TRACE_COMPLETION); + + if (req_op(req) == REQ_OP_ZONE_APPEND) + bio->bi_iter.bi_sector = req->__sector; + + if (!is_flush) + bio_endio(bio); + bio = next; + } while (bio); + + /* + * Reset counters so that the request stacking driver + * can find how many bytes remain in the request + * later. + */ + req->bio = NULL; + req->__data_len = 0; +} + +/** + * blk_update_request - Complete multiple bytes without completing the request + * @req: the request being processed + * @error: block status code + * @nr_bytes: number of bytes to complete for @req + * + * Description: + * Ends I/O on a number of bytes attached to @req, but doesn't complete + * the request structure even if @req doesn't have leftover. + * If @req has leftover, sets it up for the next range of segments. + * + * Passing the result of blk_rq_bytes() as @nr_bytes guarantees + * %false return from this function. + * + * Note: + * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function + * except in the consistency check at the end of this function. 
+ * + * Return: + * %false - this request doesn't have any more data + * %true - this request has more data + **/ +bool blk_update_request(struct request *req, blk_status_t error, + unsigned int nr_bytes) +{ + int total_bytes; + + trace_block_rq_complete(req, error, nr_bytes); + + if (!req->bio) + return false; + +#ifdef CONFIG_BLK_DEV_INTEGRITY + if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && + error == BLK_STS_OK) + req->q->integrity.profile->complete_fn(req, nr_bytes); +#endif + + if (unlikely(error && !blk_rq_is_passthrough(req) && + !(req->rq_flags & RQF_QUIET))) + blk_print_req_error(req, error); + + blk_account_io_completion(req, nr_bytes); + + total_bytes = 0; + while (req->bio) { + struct bio *bio = req->bio; + unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes); + + if (bio_bytes == bio->bi_iter.bi_size) + req->bio = bio->bi_next; + + /* Completion has already been traced */ + bio_clear_flag(bio, BIO_TRACE_COMPLETION); + req_bio_endio(req, bio, bio_bytes, error); + + total_bytes += bio_bytes; + nr_bytes -= bio_bytes; + + if (!nr_bytes) + break; + } + + /* + * completely done + */ + if (!req->bio) { + /* + * Reset counters so that the request stacking driver + * can find how many bytes remain in the request + * later. + */ + req->__data_len = 0; + return false; + } + + req->__data_len -= total_bytes; + + /* update sector only for requests with clear definition of sector */ + if (!blk_rq_is_passthrough(req)) + req->__sector += total_bytes >> 9; + + /* mixed attributes always follow the first bio */ + if (req->rq_flags & RQF_MIXED_MERGE) { + req->cmd_flags &= ~REQ_FAILFAST_MASK; + req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK; + } + + if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) { + /* + * If total number of sectors is less than the first segment + * size, something has gone terribly wrong. + */ + if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { + blk_dump_rq_flags(req, "request botched"); + req->__data_len = blk_rq_cur_bytes(req); + } + + /* recalculate the number of segments */ + req->nr_phys_segments = blk_recalc_rq_segments(req); + } + + return true; +} +EXPORT_SYMBOL_GPL(blk_update_request); + +static void __blk_account_io_done(struct request *req, u64 now) +{ + const int sgrp = op_stat_group(req_op(req)); + + part_stat_lock(); + update_io_ticks(req->part, jiffies, true); + part_stat_inc(req->part, ios[sgrp]); + part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns); + part_stat_unlock(); +} + +static inline void blk_account_io_done(struct request *req, u64 now) +{ + /* + * Account IO completion. flush_rq isn't accounted as a + * normal IO on queueing nor completion. Accounting the + * containing request is enough. 
+ */ + if (blk_do_io_stat(req) && req->part && + !(req->rq_flags & RQF_FLUSH_SEQ)) + __blk_account_io_done(req, now); +} + +static void __blk_account_io_start(struct request *rq) +{ + /* passthrough requests can hold bios that do not have ->bi_bdev set */ + if (rq->bio && rq->bio->bi_bdev) + rq->part = rq->bio->bi_bdev; + else if (rq->q->disk) + rq->part = rq->q->disk->part0; + + part_stat_lock(); + update_io_ticks(rq->part, jiffies, false); + part_stat_unlock(); +} + +static inline void blk_account_io_start(struct request *req) +{ + if (blk_do_io_stat(req)) + __blk_account_io_start(req); +} + +static inline void __blk_mq_end_request_acct(struct request *rq, u64 now) +{ if (rq->rq_flags & RQF_STATS) { blk_mq_poll_stats_start(rq->q); blk_stat_add(rq, now); } blk_mq_sched_completed_request(rq, now); - blk_account_io_done(rq, now); +} + +inline void __blk_mq_end_request(struct request *rq, blk_status_t error) +{ + if (blk_mq_need_time_stamp(rq)) + __blk_mq_end_request_acct(rq, ktime_get_ns()); if (rq->end_io) { rq_qos_done(rq->q, rq); @@ -575,6 +935,65 @@ void blk_mq_end_request(struct request *rq, blk_status_t error) } EXPORT_SYMBOL(blk_mq_end_request); +#define TAG_COMP_BATCH 32 + +static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx, + int *tag_array, int nr_tags) +{ + struct request_queue *q = hctx->queue; + + /* + * All requests should have been marked as RQF_MQ_INFLIGHT, so + * update hctx->nr_active in batch + */ + if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) + __blk_mq_sub_active_requests(hctx, nr_tags); + + blk_mq_put_tags(hctx->tags, tag_array, nr_tags); + percpu_ref_put_many(&q->q_usage_counter, nr_tags); +} + +void blk_mq_end_request_batch(struct io_comp_batch *iob) +{ + int tags[TAG_COMP_BATCH], nr_tags = 0; + struct blk_mq_hw_ctx *cur_hctx = NULL; + struct request *rq; + u64 now = 0; + + if (iob->need_ts) + now = ktime_get_ns(); + + while ((rq = rq_list_pop(&iob->req_list)) != NULL) { + prefetch(rq->bio); + prefetch(rq->rq_next); + + blk_complete_request(rq); + if (iob->need_ts) + __blk_mq_end_request_acct(rq, now); + + rq_qos_done(rq->q, rq); + + WRITE_ONCE(rq->state, MQ_RQ_IDLE); + if (!req_ref_put_and_test(rq)) + continue; + + blk_crypto_free_request(rq); + blk_pm_mark_last_busy(rq); + + if (nr_tags == TAG_COMP_BATCH || cur_hctx != rq->mq_hctx) { + if (cur_hctx) + blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags); + nr_tags = 0; + cur_hctx = rq->mq_hctx; + } + tags[nr_tags++] = rq->tag; + } + + if (nr_tags) + blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags); +} +EXPORT_SYMBOL_GPL(blk_mq_end_request_batch); + static void blk_complete_reqs(struct llist_head *list) { struct llist_node *entry = llist_reverse_order(llist_del_all(list)); @@ -658,7 +1077,7 @@ bool blk_mq_complete_request_remote(struct request *rq) * For a polled request, always complete locallly, it's pointless * to redirect the completion. 
*/ - if (rq->cmd_flags & REQ_HIPRI) + if (rq->cmd_flags & REQ_POLLED) return false; if (blk_mq_complete_need_ipi(rq)) { @@ -688,26 +1107,6 @@ void blk_mq_complete_request(struct request *rq) } EXPORT_SYMBOL(blk_mq_complete_request); -static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx) - __releases(hctx->srcu) -{ - if (!(hctx->flags & BLK_MQ_F_BLOCKING)) - rcu_read_unlock(); - else - srcu_read_unlock(hctx->srcu, srcu_idx); -} - -static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx) - __acquires(hctx->srcu) -{ - if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { - /* shut up gcc false positive */ - *srcu_idx = 0; - rcu_read_lock(); - } else - *srcu_idx = srcu_read_lock(hctx->srcu); -} - /** * blk_mq_start_request - Start processing a request * @rq: Pointer to request to be started @@ -723,7 +1122,14 @@ void blk_mq_start_request(struct request *rq) trace_block_rq_issue(rq); if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { - rq->io_start_time_ns = ktime_get_ns(); + u64 start_time; +#ifdef CONFIG_BLK_CGROUP + if (rq->bio) + start_time = bio_issue_time(&rq->bio->bi_issue); + else +#endif + start_time = ktime_get_ns(); + rq->io_start_time_ns = start_time; rq->stats_sectors = blk_rq_sectors(rq); rq->rq_flags |= RQF_STATS; rq_qos_issue(q, rq); @@ -738,9 +1144,112 @@ void blk_mq_start_request(struct request *rq) if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE) q->integrity.profile->prepare_fn(rq); #endif + if (rq->bio && rq->bio->bi_opf & REQ_POLLED) + WRITE_ONCE(rq->bio->bi_cookie, blk_rq_to_qc(rq)); } EXPORT_SYMBOL(blk_mq_start_request); +/** + * blk_end_sync_rq - executes a completion event on a request + * @rq: request to complete + * @error: end I/O status of the request + */ +static void blk_end_sync_rq(struct request *rq, blk_status_t error) +{ + struct completion *waiting = rq->end_io_data; + + rq->end_io_data = (void *)(uintptr_t)error; + + /* + * complete last, if this is a stack request the process (and thus + * the rq pointer) could be invalid right after this complete() + */ + complete(waiting); +} + +/** + * blk_execute_rq_nowait - insert a request to I/O scheduler for execution + * @rq: request to insert + * @at_head: insert request at head or tail of queue + * @done: I/O completion handler + * + * Description: + * Insert a fully prepared request at the back of the I/O scheduler queue + * for execution. Don't wait for completion. + * + * Note: + * This function will invoke @done directly if the queue is dead. 
+ */ +void blk_execute_rq_nowait(struct request *rq, bool at_head, rq_end_io_fn *done) +{ + WARN_ON(irqs_disabled()); + WARN_ON(!blk_rq_is_passthrough(rq)); + + rq->end_io = done; + + blk_account_io_start(rq); + + /* + * don't check dying flag for MQ because the request won't + * be reused after dying flag is set + */ + blk_mq_sched_insert_request(rq, at_head, true, false); +} +EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); + +static bool blk_rq_is_poll(struct request *rq) +{ + if (!rq->mq_hctx) + return false; + if (rq->mq_hctx->type != HCTX_TYPE_POLL) + return false; + if (WARN_ON_ONCE(!rq->bio)) + return false; + return true; +} + +static void blk_rq_poll_completion(struct request *rq, struct completion *wait) +{ + do { + bio_poll(rq->bio, NULL, 0); + cond_resched(); + } while (!completion_done(wait)); +} + +/** + * blk_execute_rq - insert a request into queue for execution + * @rq: request to insert + * @at_head: insert request at head or tail of queue + * + * Description: + * Insert a fully prepared request at the back of the I/O scheduler queue + * for execution and wait for completion. + * Return: The blk_status_t result provided to blk_mq_end_request(). + */ +blk_status_t blk_execute_rq(struct request *rq, bool at_head) +{ + DECLARE_COMPLETION_ONSTACK(wait); + unsigned long hang_check; + + rq->end_io_data = &wait; + blk_execute_rq_nowait(rq, at_head, blk_end_sync_rq); + + /* Prevent hang_check timer from firing at us during very long I/O */ + hang_check = sysctl_hung_task_timeout_secs; + + if (blk_rq_is_poll(rq)) + blk_rq_poll_completion(rq, &wait); + else if (hang_check) + while (!wait_for_completion_io_timeout(&wait, + hang_check * (HZ/2))) + ; + else + wait_for_completion_io(&wait); + + return (blk_status_t)(uintptr_t)rq->end_io_data; +} +EXPORT_SYMBOL(blk_execute_rq); + static void __blk_mq_requeue_request(struct request *rq) { struct request_queue *q = rq->q; @@ -843,25 +1352,15 @@ void blk_mq_delay_kick_requeue_list(struct request_queue *q, } EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); -struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) -{ - if (tag < tags->nr_tags) { - prefetch(tags->rqs[tag]); - return tags->rqs[tag]; - } - - return NULL; -} -EXPORT_SYMBOL(blk_mq_tag_to_rq); - -static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq, - void *priv, bool reserved) +static bool blk_mq_rq_inflight(struct request *rq, void *priv, + bool reserved) { /* - * If we find a request that isn't idle and the queue matches, - * we know the queue is busy. Return false to stop the iteration. + * If we find a request that isn't idle we know the queue is busy + * as it's checked in the iter. + * Return false to stop the iteration. 
*/ - if (blk_mq_request_started(rq) && rq->q == hctx->queue) { + if (blk_mq_request_started(rq)) { bool *busy = priv; *busy = true; @@ -919,12 +1418,11 @@ void blk_mq_put_rq_ref(struct request *rq) { if (is_flush_rq(rq)) rq->end_io(rq, 0); - else if (refcount_dec_and_test(&rq->ref)) + else if (req_ref_put_and_test(rq)) __blk_mq_free_request(rq); } -static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, - struct request *rq, void *priv, bool reserved) +static bool blk_mq_check_expired(struct request *rq, void *priv, bool reserved) { unsigned long *next = priv; @@ -1058,24 +1556,16 @@ struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, return data.rq; } -static inline unsigned int queued_to_index(unsigned int queued) +static bool __blk_mq_alloc_driver_tag(struct request *rq) { - if (!queued) - return 0; - - return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1); -} - -static bool __blk_mq_get_driver_tag(struct request *rq) -{ - struct sbitmap_queue *bt = rq->mq_hctx->tags->bitmap_tags; + struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags; unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags; int tag; blk_mq_tag_busy(rq->mq_hctx); if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) { - bt = rq->mq_hctx->tags->breserved_tags; + bt = &rq->mq_hctx->tags->breserved_tags; tag_offset = 0; } else { if (!hctx_may_queue(rq->mq_hctx, bt)) @@ -1090,11 +1580,9 @@ static bool __blk_mq_get_driver_tag(struct request *rq) return true; } -bool blk_mq_get_driver_tag(struct request *rq) +bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq) { - struct blk_mq_hw_ctx *hctx = rq->mq_hctx; - - if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq)) + if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq)) return false; if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) && @@ -1118,7 +1606,7 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, struct sbitmap_queue *sbq; list_del_init(&wait->entry); - sbq = hctx->tags->bitmap_tags; + sbq = &hctx->tags->bitmap_tags; atomic_dec(&sbq->ws_active); } spin_unlock(&hctx->dispatch_wait_lock); @@ -1136,7 +1624,7 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, struct request *rq) { - struct sbitmap_queue *sbq = hctx->tags->bitmap_tags; + struct sbitmap_queue *sbq = &hctx->tags->bitmap_tags; struct wait_queue_head *wq; wait_queue_entry_t *wait; bool ret; @@ -1393,8 +1881,6 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, if (!list_empty(&zone_list)) list_splice_tail_init(&zone_list, list); - hctx->dispatched[queued_to_index(queued)]++; - /* If we didn't flush the entire list, we could have told the driver * there was more coming, but that turned out to be a lie. */ @@ -1477,19 +1963,14 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, */ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) { - int srcu_idx; - /* * We can't run the queue inline with ints disabled. Ensure that * we catch bad users of this early. 
*/ WARN_ON_ONCE(in_interrupt()); - might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); - - hctx_lock(hctx, &srcu_idx); - blk_mq_sched_dispatch_requests(hctx); - hctx_unlock(hctx, srcu_idx); + blk_mq_run_dispatch_ops(hctx->queue, + blk_mq_sched_dispatch_requests(hctx)); } static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx) @@ -1601,7 +2082,6 @@ EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); */ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { - int srcu_idx; bool need_run; /* @@ -1612,10 +2092,9 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) * And queue will be rerun in blk_mq_unquiesce_queue() if it is * quiesced. */ - hctx_lock(hctx, &srcu_idx); - need_run = !blk_queue_quiesced(hctx->queue) && - blk_mq_hctx_has_pending(hctx); - hctx_unlock(hctx, srcu_idx); + __blk_mq_run_dispatch_ops(hctx->queue, false, + need_run = !blk_queue_quiesced(hctx->queue) && + blk_mq_hctx_has_pending(hctx)); if (need_run) __blk_mq_delay_run_hw_queue(hctx, async, 0); @@ -1898,54 +2377,14 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, spin_unlock(&ctx->lock); } -static int plug_rq_cmp(void *priv, const struct list_head *a, - const struct list_head *b) +static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int *queued, + bool from_schedule) { - struct request *rqa = container_of(a, struct request, queuelist); - struct request *rqb = container_of(b, struct request, queuelist); - - if (rqa->mq_ctx != rqb->mq_ctx) - return rqa->mq_ctx > rqb->mq_ctx; - if (rqa->mq_hctx != rqb->mq_hctx) - return rqa->mq_hctx > rqb->mq_hctx; - - return blk_rq_pos(rqa) > blk_rq_pos(rqb); -} - -void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) -{ - LIST_HEAD(list); - - if (list_empty(&plug->mq_list)) - return; - list_splice_init(&plug->mq_list, &list); - - if (plug->rq_count > 2 && plug->multiple_queues) - list_sort(NULL, &list, plug_rq_cmp); - - plug->rq_count = 0; - - do { - struct list_head rq_list; - struct request *rq, *head_rq = list_entry_rq(list.next); - struct list_head *pos = &head_rq->queuelist; /* skip first */ - struct blk_mq_hw_ctx *this_hctx = head_rq->mq_hctx; - struct blk_mq_ctx *this_ctx = head_rq->mq_ctx; - unsigned int depth = 1; - - list_for_each_continue(pos, &list) { - rq = list_entry_rq(pos); - BUG_ON(!rq->q); - if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx) - break; - depth++; - } - - list_cut_before(&rq_list, &list, pos); - trace_block_unplug(head_rq->q, depth, !from_schedule); - blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list, - from_schedule); - } while(!list_empty(&list)); + if (hctx->queue->mq_ops->commit_rqs) { + trace_block_unplug(hctx->queue, *queued, !from_schedule); + hctx->queue->mq_ops->commit_rqs(hctx); + } + *queued = 0; } static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, @@ -1968,19 +2407,15 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, } static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, - struct request *rq, - blk_qc_t *cookie, bool last) + struct request *rq, bool last) { struct request_queue *q = rq->q; struct blk_mq_queue_data bd = { .rq = rq, .last = last, }; - blk_qc_t new_cookie; blk_status_t ret; - new_cookie = request_to_qc_t(hctx, rq); - /* * For OK queue, we are done. For error, caller may kill it. 
* Any other error (busy), just add it to our list as we @@ -1990,7 +2425,6 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, switch (ret) { case BLK_STS_OK: blk_mq_update_dispatch_busy(hctx, false); - *cookie = new_cookie; break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: @@ -1999,7 +2433,6 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, break; default: blk_mq_update_dispatch_busy(hctx, false); - *cookie = BLK_QC_T_NONE; break; } @@ -2008,7 +2441,6 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq, - blk_qc_t *cookie, bool bypass_insert, bool last) { struct request_queue *q = rq->q; @@ -2028,7 +2460,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, goto insert; } - if (q->elevator && !bypass_insert) + if ((rq->rq_flags & RQF_ELV) && !bypass_insert) goto insert; budget_token = blk_mq_get_dispatch_budget(q); @@ -2042,7 +2474,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, goto insert; } - return __blk_mq_issue_directly(hctx, rq, cookie, last); + return __blk_mq_issue_directly(hctx, rq, last); insert: if (bypass_insert) return BLK_STS_RESOURCE; @@ -2056,7 +2488,6 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, * blk_mq_try_issue_directly - Try to send a request directly to device driver. * @hctx: Pointer of the associated hardware queue. * @rq: Pointer to request to be sent. - * @cookie: Request queue cookie. * * If the device has enough resources to accept a new request now, send the * request directly to device driver. Else, insert at hctx->dispatch queue, so @@ -2064,36 +2495,143 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, * queue have higher priority. 
*/ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, - struct request *rq, blk_qc_t *cookie) + struct request *rq) { - blk_status_t ret; - int srcu_idx; + blk_status_t ret = + __blk_mq_try_issue_directly(hctx, rq, false, true); - might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); - - hctx_lock(hctx, &srcu_idx); - - ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true); if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) blk_mq_request_bypass_insert(rq, false, true); else if (ret != BLK_STS_OK) blk_mq_end_request(rq, ret); - - hctx_unlock(hctx, srcu_idx); } -blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) +static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) { - blk_status_t ret; - int srcu_idx; - blk_qc_t unused_cookie; - struct blk_mq_hw_ctx *hctx = rq->mq_hctx; + return __blk_mq_try_issue_directly(rq->mq_hctx, rq, true, last); +} - hctx_lock(hctx, &srcu_idx); - ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true, last); - hctx_unlock(hctx, srcu_idx); +static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule) +{ + struct blk_mq_hw_ctx *hctx = NULL; + struct request *rq; + int queued = 0; + int errors = 0; - return ret; + while ((rq = rq_list_pop(&plug->mq_list))) { + bool last = rq_list_empty(plug->mq_list); + blk_status_t ret; + + if (hctx != rq->mq_hctx) { + if (hctx) + blk_mq_commit_rqs(hctx, &queued, from_schedule); + hctx = rq->mq_hctx; + } + + ret = blk_mq_request_issue_directly(rq, last); + switch (ret) { + case BLK_STS_OK: + queued++; + break; + case BLK_STS_RESOURCE: + case BLK_STS_DEV_RESOURCE: + blk_mq_request_bypass_insert(rq, false, last); + blk_mq_commit_rqs(hctx, &queued, from_schedule); + return; + default: + blk_mq_end_request(rq, ret); + errors++; + break; + } + } + + /* + * If we didn't flush the entire list, we could have told the driver + * there was more coming, but that turned out to be a lie. + */ + if (errors) + blk_mq_commit_rqs(hctx, &queued, from_schedule); +} + +static void __blk_mq_flush_plug_list(struct request_queue *q, + struct blk_plug *plug) +{ + if (blk_queue_quiesced(q)) + return; + q->mq_ops->queue_rqs(&plug->mq_list); +} + +void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) +{ + struct blk_mq_hw_ctx *this_hctx; + struct blk_mq_ctx *this_ctx; + struct request *rq; + unsigned int depth; + LIST_HEAD(list); + + if (rq_list_empty(plug->mq_list)) + return; + plug->rq_count = 0; + + if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) { + struct request_queue *q; + + rq = rq_list_peek(&plug->mq_list); + q = rq->q; + + /* + * Peek first request and see if we have a ->queue_rqs() hook. + * If we do, we can dispatch the whole plug list in one go. We + * already know at this point that all requests belong to the + * same queue, caller must ensure that's the case. + * + * Since we pass off the full list to the driver at this point, + * we do not increment the active request count for the queue. + * Bypass shared tags for now because of that. 
+ */ + if (q->mq_ops->queue_rqs && + !(rq->mq_hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { + blk_mq_run_dispatch_ops(q, + __blk_mq_flush_plug_list(q, plug)); + if (rq_list_empty(plug->mq_list)) + return; + } + + blk_mq_run_dispatch_ops(q, + blk_mq_plug_issue_direct(plug, false)); + if (rq_list_empty(plug->mq_list)) + return; + } + + this_hctx = NULL; + this_ctx = NULL; + depth = 0; + do { + rq = rq_list_pop(&plug->mq_list); + + if (!this_hctx) { + this_hctx = rq->mq_hctx; + this_ctx = rq->mq_ctx; + } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) { + trace_block_unplug(this_hctx->queue, depth, + !from_schedule); + blk_mq_sched_insert_requests(this_hctx, this_ctx, + &list, from_schedule); + depth = 0; + this_hctx = rq->mq_hctx; + this_ctx = rq->mq_ctx; + + } + + list_add(&rq->queuelist, &list); + depth++; + } while (!rq_list_empty(plug->mq_list)); + + if (!list_empty(&list)) { + trace_block_unplug(this_hctx->queue, depth, !from_schedule); + blk_mq_sched_insert_requests(this_hctx, this_ctx, &list, + from_schedule); + } } void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, @@ -2132,20 +2670,6 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, hctx->queue->mq_ops->commit_rqs(hctx); } -static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) -{ - list_add_tail(&rq->queuelist, &plug->mq_list); - plug->rq_count++; - if (!plug->multiple_queues && !list_is_singular(&plug->mq_list)) { - struct request *tmp; - - tmp = list_first_entry(&plug->mq_list, struct request, - queuelist); - if (tmp->q != rq->q) - plug->multiple_queues = true; - } -} - /* * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple * queues. This is important for md arrays to benefit from merging @@ -2158,6 +2682,106 @@ static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug) return BLK_MAX_REQUEST_COUNT; } +static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) +{ + struct request *last = rq_list_peek(&plug->mq_list); + + if (!plug->rq_count) { + trace_block_plug(rq->q); + } else if (plug->rq_count >= blk_plug_max_rq_count(plug) || + (!blk_queue_nomerges(rq->q) && + blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { + blk_mq_flush_plug_list(plug, false); + trace_block_plug(rq->q); + } + + if (!plug->multiple_queues && last && last->q != rq->q) + plug->multiple_queues = true; + if (!plug->has_elevator && (rq->rq_flags & RQF_ELV)) + plug->has_elevator = true; + rq->rq_next = NULL; + rq_list_add(&plug->mq_list, rq); + plug->rq_count++; +} + +static bool blk_mq_attempt_bio_merge(struct request_queue *q, + struct bio *bio, unsigned int nr_segs) +{ + if (!blk_queue_nomerges(q) && bio_mergeable(bio)) { + if (blk_attempt_plug_merge(q, bio, nr_segs)) + return true; + if (blk_mq_sched_bio_merge(q, bio, nr_segs)) + return true; + } + return false; +} + +static struct request *blk_mq_get_new_requests(struct request_queue *q, + struct blk_plug *plug, + struct bio *bio, + unsigned int nsegs) +{ + struct blk_mq_alloc_data data = { + .q = q, + .nr_tags = 1, + .cmd_flags = bio->bi_opf, + }; + struct request *rq; + + if (unlikely(bio_queue_enter(bio))) + return NULL; + + if (blk_mq_attempt_bio_merge(q, bio, nsegs)) + goto queue_exit; + + rq_qos_throttle(q, bio); + + if (plug) { + data.nr_tags = plug->nr_ios; + plug->nr_ios = 1; + data.cached_rq = &plug->cached_rq; + } + + rq = __blk_mq_alloc_requests(&data); + if (rq) + return rq; + rq_qos_cleanup(q, bio); + if (bio->bi_opf & REQ_NOWAIT) + bio_wouldblock_error(bio); +queue_exit: + 
blk_queue_exit(q); + return NULL; +} + +static inline struct request *blk_mq_get_cached_request(struct request_queue *q, + struct blk_plug *plug, struct bio **bio, unsigned int nsegs) +{ + struct request *rq; + + if (!plug) + return NULL; + rq = rq_list_peek(&plug->cached_rq); + if (!rq || rq->q != q) + return NULL; + + if (blk_mq_attempt_bio_merge(q, *bio, nsegs)) { + *bio = NULL; + return NULL; + } + + rq_qos_throttle(q, *bio); + + if (blk_mq_get_hctx_type((*bio)->bi_opf) != rq->mq_hctx->type) + return NULL; + if (op_is_flush(rq->cmd_flags) != op_is_flush((*bio)->bi_opf)) + return NULL; + + rq->cmd_flags = (*bio)->bi_opf; + plug->cached_rq = rq_list_next(rq); + INIT_LIST_HEAD(&rq->queuelist); + return rq; +} + /** * blk_mq_submit_bio - Create and send a request to block device. * @bio: Bio pointer. @@ -2170,57 +2794,39 @@ static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug) * * It will not queue the request if there is an error with the bio, or at the * request creation. - * - * Returns: Request queue cookie. */ -blk_qc_t blk_mq_submit_bio(struct bio *bio) +void blk_mq_submit_bio(struct bio *bio) { - struct request_queue *q = bio->bi_bdev->bd_disk->queue; + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + struct blk_plug *plug = blk_mq_plug(q, bio); const int is_sync = op_is_sync(bio->bi_opf); - const int is_flush_fua = op_is_flush(bio->bi_opf); - struct blk_mq_alloc_data data = { - .q = q, - }; struct request *rq; - struct blk_plug *plug; - struct request *same_queue_rq = NULL; - unsigned int nr_segs; - blk_qc_t cookie; + unsigned int nr_segs = 1; blk_status_t ret; - bool hipri; + + if (unlikely(!blk_crypto_bio_prep(&bio))) + return; blk_queue_bounce(q, &bio); - __blk_queue_split(&bio, &nr_segs); + if (blk_may_split(q, bio)) + __blk_queue_split(q, &bio, &nr_segs); if (!bio_integrity_prep(bio)) - goto queue_exit; + return; - if (!is_flush_fua && !blk_queue_nomerges(q) && - blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq)) - goto queue_exit; - - if (blk_mq_sched_bio_merge(q, bio, nr_segs)) - goto queue_exit; - - rq_qos_throttle(q, bio); - - hipri = bio->bi_opf & REQ_HIPRI; - - data.cmd_flags = bio->bi_opf; - rq = __blk_mq_alloc_request(&data); - if (unlikely(!rq)) { - rq_qos_cleanup(q, bio); - if (bio->bi_opf & REQ_NOWAIT) - bio_wouldblock_error(bio); - goto queue_exit; + rq = blk_mq_get_cached_request(q, plug, &bio, nr_segs); + if (!rq) { + if (!bio) + return; + rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); + if (unlikely(!rq)) + return; } trace_block_getrq(bio); rq_qos_track(q, rq, bio); - cookie = request_to_qc_t(data.hctx, rq); - blk_mq_bio_to_request(rq, bio, nr_segs); ret = blk_crypto_init_request(rq); @@ -2228,109 +2834,251 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio) bio->bi_status = ret; bio_endio(bio); blk_mq_free_request(rq); - return BLK_QC_T_NONE; + return; } - plug = blk_mq_plug(q, bio); - if (unlikely(is_flush_fua)) { - /* Bypass scheduler for flush requests */ + if (op_is_flush(bio->bi_opf)) { blk_insert_flush(rq); - blk_mq_run_hw_queue(data.hctx, true); - } else if (plug && (q->nr_hw_queues == 1 || - blk_mq_is_sbitmap_shared(rq->mq_hctx->flags) || - q->mq_ops->commit_rqs || !blk_queue_nonrot(q))) { - /* - * Use plugging if we have a ->commit_rqs() hook as well, as - * we know the driver uses bd->last in a smart fashion. - * - * Use normal plugging if this disk is slow HDD, as sequential - * IO may benefit a lot from plug merging. 
- */ - unsigned int request_count = plug->rq_count; - struct request *last = NULL; - - if (!request_count) - trace_block_plug(q); - else - last = list_entry_rq(plug->mq_list.prev); - - if (request_count >= blk_plug_max_rq_count(plug) || (last && - blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { - blk_flush_plug_list(plug, false); - trace_block_plug(q); - } - - blk_add_rq_to_plug(plug, rq); - } else if (q->elevator) { - /* Insert the request at the IO scheduler queue */ - blk_mq_sched_insert_request(rq, false, true, true); - } else if (plug && !blk_queue_nomerges(q)) { - /* - * We do limited plugging. If the bio can be merged, do that. - * Otherwise the existing request in the plug list will be - * issued. So the plug list will have one request at most - * The plug list might get flushed before this. If that happens, - * the plug list is empty, and same_queue_rq is invalid. - */ - if (list_empty(&plug->mq_list)) - same_queue_rq = NULL; - if (same_queue_rq) { - list_del_init(&same_queue_rq->queuelist); - plug->rq_count--; - } - blk_add_rq_to_plug(plug, rq); - trace_block_plug(q); - - if (same_queue_rq) { - data.hctx = same_queue_rq->mq_hctx; - trace_block_unplug(q, 1, true); - blk_mq_try_issue_directly(data.hctx, same_queue_rq, - &cookie); - } - } else if ((q->nr_hw_queues > 1 && is_sync) || - !data.hctx->dispatch_busy) { - /* - * There is no scheduler and we can try to send directly - * to the hardware. - */ - blk_mq_try_issue_directly(data.hctx, rq, &cookie); - } else { - /* Default case. */ - blk_mq_sched_insert_request(rq, false, true, true); + return; } - if (!hipri) - return BLK_QC_T_NONE; - return cookie; -queue_exit: - blk_queue_exit(q); - return BLK_QC_T_NONE; + if (plug) + blk_add_rq_to_plug(plug, rq); + else if ((rq->rq_flags & RQF_ELV) || + (rq->mq_hctx->dispatch_busy && + (q->nr_hw_queues == 1 || !is_sync))) + blk_mq_sched_insert_request(rq, false, true, true); + else + blk_mq_run_dispatch_ops(rq->q, + blk_mq_try_issue_directly(rq->mq_hctx, rq)); } +/** + * blk_cloned_rq_check_limits - Helper function to check a cloned request + * for the new queue limits + * @q: the queue + * @rq: the request being checked + * + * Description: + * @rq may have been made based on weaker limitations of upper-level queues + * in request stacking drivers, and it may violate the limitation of @q. + * Since the block layer and the underlying device driver trust @rq + * after it is inserted to @q, it should be checked against @q before + * the insertion using this generic function. + * + * Request stacking drivers like request-based dm may change the queue + * limits when retrying requests on other queues. Those requests need + * to be checked against the new queue limits again during dispatch. + */ +static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q, + struct request *rq) +{ + unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); + + if (blk_rq_sectors(rq) > max_sectors) { + /* + * SCSI device does not have a good way to return if + * Write Same/Zero is actually supported. If a device rejects + * a non-read/write command (discard, write same,etc.) the + * low-level device driver will set the relevant queue limit to + * 0 to prevent blk-lib from issuing more of the offending + * operations. Commands queued prior to the queue limit being + * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O + * errors being propagated to upper layers. + */ + if (max_sectors == 0) + return BLK_STS_NOTSUPP; + + printk(KERN_ERR "%s: over max size limit. 
(%u > %u)\n", + __func__, blk_rq_sectors(rq), max_sectors); + return BLK_STS_IOERR; + } + + /* + * The queue settings related to segment counting may differ from the + * original queue. + */ + rq->nr_phys_segments = blk_recalc_rq_segments(rq); + if (rq->nr_phys_segments > queue_max_segments(q)) { + printk(KERN_ERR "%s: over max segments limit. (%hu > %hu)\n", + __func__, rq->nr_phys_segments, queue_max_segments(q)); + return BLK_STS_IOERR; + } + + return BLK_STS_OK; +} + +/** + * blk_insert_cloned_request - Helper for stacking drivers to submit a request + * @q: the queue to submit the request + * @rq: the request being queued + */ +blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) +{ + blk_status_t ret; + + ret = blk_cloned_rq_check_limits(q, rq); + if (ret != BLK_STS_OK) + return ret; + + if (rq->q->disk && + should_fail_request(rq->q->disk->part0, blk_rq_bytes(rq))) + return BLK_STS_IOERR; + + if (blk_crypto_insert_cloned_request(rq)) + return BLK_STS_IOERR; + + blk_account_io_start(rq); + + /* + * Since we have a scheduler attached on the top device, + * bypass a potential scheduler on the bottom device for + * insert. + */ + blk_mq_run_dispatch_ops(rq->q, + ret = blk_mq_request_issue_directly(rq, true)); + if (ret) + blk_account_io_done(rq, ktime_get_ns()); + return ret; +} +EXPORT_SYMBOL_GPL(blk_insert_cloned_request); + +/** + * blk_rq_unprep_clone - Helper function to free all bios in a cloned request + * @rq: the clone request to be cleaned up + * + * Description: + * Free all bios in @rq for a cloned request. + */ +void blk_rq_unprep_clone(struct request *rq) +{ + struct bio *bio; + + while ((bio = rq->bio) != NULL) { + rq->bio = bio->bi_next; + + bio_put(bio); + } +} +EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); + +/** + * blk_rq_prep_clone - Helper function to setup clone request + * @rq: the request to be setup + * @rq_src: original request to be cloned + * @bs: bio_set that bios for clone are allocated from + * @gfp_mask: memory allocation mask for bio + * @bio_ctr: setup function to be called for each clone bio. + * Returns %0 for success, non %0 for failure. + * @data: private data to be passed to @bio_ctr + * + * Description: + * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. + * Also, pages which the original bios are pointing to are not copied + * and the cloned bios just point same pages. + * So cloned bios must be completed before original bios, which means + * the caller must complete @rq before @rq_src. + */ +int blk_rq_prep_clone(struct request *rq, struct request *rq_src, + struct bio_set *bs, gfp_t gfp_mask, + int (*bio_ctr)(struct bio *, struct bio *, void *), + void *data) +{ + struct bio *bio, *bio_src; + + if (!bs) + bs = &fs_bio_set; + + __rq_for_each_bio(bio_src, rq_src) { + bio = bio_clone_fast(bio_src, gfp_mask, bs); + if (!bio) + goto free_and_out; + bio->bi_bdev = rq->q->disk->part0; + + if (bio_ctr && bio_ctr(bio, bio_src, data)) + goto free_and_out; + + if (rq->bio) { + rq->biotail->bi_next = bio; + rq->biotail = bio; + } else { + rq->bio = rq->biotail = bio; + } + bio = NULL; + } + + /* Copy attributes of the original request to the clone request. 
*/ + rq->__sector = blk_rq_pos(rq_src); + rq->__data_len = blk_rq_bytes(rq_src); + if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) { + rq->rq_flags |= RQF_SPECIAL_PAYLOAD; + rq->special_vec = rq_src->special_vec; + } + rq->nr_phys_segments = rq_src->nr_phys_segments; + rq->ioprio = rq_src->ioprio; + + if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0) + goto free_and_out; + + return 0; + +free_and_out: + if (bio) + bio_put(bio); + blk_rq_unprep_clone(rq); + + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(blk_rq_prep_clone); + +/* + * Steal bios from a request and add them to a bio list. + * The request must not have been partially completed before. + */ +void blk_steal_bios(struct bio_list *list, struct request *rq) +{ + if (rq->bio) { + if (list->tail) + list->tail->bi_next = rq->bio; + else + list->head = rq->bio; + list->tail = rq->biotail; + + rq->bio = NULL; + rq->biotail = NULL; + } + + rq->__data_len = 0; +} +EXPORT_SYMBOL_GPL(blk_steal_bios); + static size_t order_to_size(unsigned int order) { return (size_t)PAGE_SIZE << order; } /* called before freeing request pool in @tags */ -static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set, - struct blk_mq_tags *tags, unsigned int hctx_idx) +static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags, + struct blk_mq_tags *tags) { - struct blk_mq_tags *drv_tags = set->tags[hctx_idx]; struct page *page; unsigned long flags; + /* There is no need to clear a driver tags own mapping */ + if (drv_tags == tags) + return; + list_for_each_entry(page, &tags->page_list, lru) { unsigned long start = (unsigned long)page_address(page); unsigned long end = start + order_to_size(page->private); int i; - for (i = 0; i < set->queue_depth; i++) { + for (i = 0; i < drv_tags->nr_tags; i++) { struct request *rq = drv_tags->rqs[i]; unsigned long rq_addr = (unsigned long)rq; if (rq_addr >= start && rq_addr < end) { - WARN_ON_ONCE(refcount_read(&rq->ref) != 0); + WARN_ON_ONCE(req_ref_read(rq) != 0); cmpxchg(&drv_tags->rqs[i], rq, NULL); } } @@ -2349,9 +3097,15 @@ static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set, void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) { + struct blk_mq_tags *drv_tags; struct page *page; - if (tags->rqs && set->ops->exit_request) { + if (blk_mq_is_shared_tags(set->flags)) + drv_tags = set->shared_tags; + else + drv_tags = set->tags[hctx_idx]; + + if (tags->static_rqs && set->ops->exit_request) { int i; for (i = 0; i < tags->nr_tags; i++) { @@ -2364,7 +3118,7 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, } } - blk_mq_clear_rq_mapping(set, tags, hctx_idx); + blk_mq_clear_rq_mapping(drv_tags, tags); while (!list_empty(&tags->page_list)) { page = list_first_entry(&tags->page_list, struct page, lru); @@ -2378,21 +3132,20 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, } } -void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags) +void blk_mq_free_rq_map(struct blk_mq_tags *tags) { kfree(tags->rqs); tags->rqs = NULL; kfree(tags->static_rqs); tags->static_rqs = NULL; - blk_mq_free_tags(tags, flags); + blk_mq_free_tags(tags); } -struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, - unsigned int hctx_idx, - unsigned int nr_tags, - unsigned int reserved_tags, - unsigned int flags) +static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, + unsigned int hctx_idx, + unsigned int nr_tags, + unsigned int reserved_tags) { struct blk_mq_tags *tags; int node; @@ 
-2401,7 +3154,8 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, if (node == NUMA_NO_NODE) node = set->numa_node; - tags = blk_mq_init_tags(nr_tags, reserved_tags, node, flags); + tags = blk_mq_init_tags(nr_tags, reserved_tags, node, + BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags)); if (!tags) return NULL; @@ -2409,7 +3163,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node); if (!tags->rqs) { - blk_mq_free_tags(tags, flags); + blk_mq_free_tags(tags); return NULL; } @@ -2418,7 +3172,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, node); if (!tags->static_rqs) { kfree(tags->rqs); - blk_mq_free_tags(tags, flags); + blk_mq_free_tags(tags); return NULL; } @@ -2440,8 +3194,9 @@ static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, return 0; } -int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, - unsigned int hctx_idx, unsigned int depth) +static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, + struct blk_mq_tags *tags, + unsigned int hctx_idx, unsigned int depth) { unsigned int i, j, entries_per_page, max_order = 4; size_t rq_size, left; @@ -2547,7 +3302,7 @@ static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx) static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu, struct blk_mq_hw_ctx *hctx) { - if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu) + if (cpumask_first_and(hctx->cpumask, cpu_online_mask) != cpu) return false; if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids) return false; @@ -2657,7 +3412,7 @@ static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags, if (!tags) return; - WARN_ON_ONCE(refcount_read(&flush_rq->ref) != 0); + WARN_ON_ONCE(req_ref_read(flush_rq) != 0); for (i = 0; i < queue_depth; i++) cmpxchg(&tags->rqs[i], flush_rq, NULL); @@ -2711,20 +3466,6 @@ static void blk_mq_exit_hw_queues(struct request_queue *q, } } -static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set) -{ - int hw_ctx_size = sizeof(struct blk_mq_hw_ctx); - - BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu), - __alignof__(struct blk_mq_hw_ctx)) != - sizeof(struct blk_mq_hw_ctx)); - - if (tag_set->flags & BLK_MQ_F_BLOCKING) - hw_ctx_size += sizeof(struct srcu_struct); - - return hw_ctx_size; -} - static int blk_mq_init_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) @@ -2762,7 +3503,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx; gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY; - hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node); + hctx = kzalloc_node(sizeof(struct blk_mq_hw_ctx), gfp, node); if (!hctx) goto fail_alloc_hctx; @@ -2804,8 +3545,6 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, if (!hctx->fq) goto free_bitmap; - if (hctx->flags & BLK_MQ_F_BLOCKING) - init_srcu_struct(hctx->srcu); blk_mq_hctx_kobj_init(hctx); return hctx; @@ -2852,37 +3591,58 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, } } -static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set, - int hctx_idx) +struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, + unsigned int hctx_idx, + unsigned int depth) { - unsigned int flags = set->flags; - int ret = 0; + struct blk_mq_tags *tags; + int ret; - set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx, - set->queue_depth, set->reserved_tags, flags); - if 
(!set->tags[hctx_idx]) - return false; + tags = blk_mq_alloc_rq_map(set, hctx_idx, depth, set->reserved_tags); + if (!tags) + return NULL; - ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx, - set->queue_depth); - if (!ret) - return true; + ret = blk_mq_alloc_rqs(set, tags, hctx_idx, depth); + if (ret) { + blk_mq_free_rq_map(tags); + return NULL; + } - blk_mq_free_rq_map(set->tags[hctx_idx], flags); - set->tags[hctx_idx] = NULL; - return false; + return tags; } -static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set, - unsigned int hctx_idx) +static bool __blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, + int hctx_idx) { - unsigned int flags = set->flags; + if (blk_mq_is_shared_tags(set->flags)) { + set->tags[hctx_idx] = set->shared_tags; - if (set->tags && set->tags[hctx_idx]) { - blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx); - blk_mq_free_rq_map(set->tags[hctx_idx], flags); - set->tags[hctx_idx] = NULL; + return true; } + + set->tags[hctx_idx] = blk_mq_alloc_map_and_rqs(set, hctx_idx, + set->queue_depth); + + return set->tags[hctx_idx]; +} + +void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, + struct blk_mq_tags *tags, + unsigned int hctx_idx) +{ + if (tags) { + blk_mq_free_rqs(set, tags, hctx_idx); + blk_mq_free_rq_map(tags); + } +} + +static void __blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, + unsigned int hctx_idx) +{ + if (!blk_mq_is_shared_tags(set->flags)) + blk_mq_free_map_and_rqs(set, set->tags[hctx_idx], hctx_idx); + + set->tags[hctx_idx] = NULL; } static void blk_mq_map_swqueue(struct request_queue *q) @@ -2915,7 +3675,7 @@ static void blk_mq_map_swqueue(struct request_queue *q) hctx_idx = set->map[j].mq_map[i]; /* unmapped hw queue can be remapped after CPU topo changed */ if (!set->tags[hctx_idx] && - !__blk_mq_alloc_map_and_request(set, hctx_idx)) { + !__blk_mq_alloc_map_and_rqs(set, hctx_idx)) { /* * If tags initialization fail for some hctx, * that hctx won't be brought online. 
In this @@ -2962,8 +3722,8 @@ static void blk_mq_map_swqueue(struct request_queue *q) * fallback in case of a new remap fails * allocation */ - if (i && set->tags[i]) - blk_mq_free_map_and_requests(set, i); + if (i) + __blk_mq_free_map_and_rqs(set, i); hctx->tags = NULL; continue; @@ -3120,7 +3880,7 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, struct request_queue *q; int ret; - q = blk_alloc_queue(set->numa_node); + q = blk_alloc_queue(set->numa_node, set->flags & BLK_MQ_F_BLOCKING); if (!q) return ERR_PTR(-ENOMEM); q->queuedata = queuedata; @@ -3259,8 +4019,6 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx = hctxs[j]; if (hctx) { - if (hctx->tags) - blk_mq_free_map_and_requests(set, j); blk_mq_exit_hctx(q, set, hctx, j); hctxs[j] = NULL; } @@ -3271,6 +4029,9 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q) { + WARN_ON_ONCE(blk_queue_has_srcu(q) != + !!(set->flags & BLK_MQ_F_BLOCKING)); + /* mark the queue as mq asap */ q->mq_ops = set->ops; @@ -3347,8 +4108,16 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) { int i; + if (blk_mq_is_shared_tags(set->flags)) { + set->shared_tags = blk_mq_alloc_map_and_rqs(set, + BLK_MQ_NO_HCTX_IDX, + set->queue_depth); + if (!set->shared_tags) + return -ENOMEM; + } + for (i = 0; i < set->nr_hw_queues; i++) { - if (!__blk_mq_alloc_map_and_request(set, i)) + if (!__blk_mq_alloc_map_and_rqs(set, i)) goto out_unwind; cond_resched(); } @@ -3357,7 +4126,12 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) out_unwind: while (--i >= 0) - blk_mq_free_map_and_requests(set, i); + __blk_mq_free_map_and_rqs(set, i); + + if (blk_mq_is_shared_tags(set->flags)) { + blk_mq_free_map_and_rqs(set, set->shared_tags, + BLK_MQ_NO_HCTX_IDX); + } return -ENOMEM; } @@ -3367,7 +4141,7 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) * may reduce the depth asked for, if memory is tight. set->queue_depth * will be updated to reflect the allocated depth. 
*/ -static int blk_mq_alloc_map_and_requests(struct blk_mq_tag_set *set) +static int blk_mq_alloc_set_map_and_rqs(struct blk_mq_tag_set *set) { unsigned int depth; int err; @@ -3533,27 +4307,15 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (ret) goto out_free_mq_map; - ret = blk_mq_alloc_map_and_requests(set); + ret = blk_mq_alloc_set_map_and_rqs(set); if (ret) goto out_free_mq_map; - if (blk_mq_is_sbitmap_shared(set->flags)) { - atomic_set(&set->active_queues_shared_sbitmap, 0); - - if (blk_mq_init_shared_sbitmap(set)) { - ret = -ENOMEM; - goto out_free_mq_rq_maps; - } - } - mutex_init(&set->tag_list_lock); INIT_LIST_HEAD(&set->tag_list); return 0; -out_free_mq_rq_maps: - for (i = 0; i < set->nr_hw_queues; i++) - blk_mq_free_map_and_requests(set, i); out_free_mq_map: for (i = 0; i < set->nr_maps; i++) { kfree(set->map[i].mq_map); @@ -3586,10 +4348,12 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) int i, j; for (i = 0; i < set->nr_hw_queues; i++) - blk_mq_free_map_and_requests(set, i); + __blk_mq_free_map_and_rqs(set, i); - if (blk_mq_is_sbitmap_shared(set->flags)) - blk_mq_exit_shared_sbitmap(set); + if (blk_mq_is_shared_tags(set->flags)) { + blk_mq_free_map_and_rqs(set, set->shared_tags, + BLK_MQ_NO_HCTX_IDX); + } for (j = 0; j < set->nr_maps; j++) { kfree(set->map[j].mq_map); @@ -3624,20 +4388,12 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) * If we're using an MQ scheduler, just update the scheduler * queue depth. This is similar to what the old code would do. */ - if (!hctx->sched_tags) { - ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr, - false); - if (!ret && blk_mq_is_sbitmap_shared(set->flags)) - blk_mq_tag_resize_shared_sbitmap(set, nr); - } else { + if (hctx->sched_tags) { ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, - nr, true); - if (blk_mq_is_sbitmap_shared(set->flags)) { - hctx->sched_tags->bitmap_tags = - &q->sched_bitmap_tags; - hctx->sched_tags->breserved_tags = - &q->sched_breserved_tags; - } + nr, true); + } else { + ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr, + false); } if (ret) break; @@ -3646,9 +4402,12 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) } if (!ret) { q->nr_requests = nr; - if (q->elevator && blk_mq_is_sbitmap_shared(set->flags)) - sbitmap_queue_resize(&q->sched_bitmap_tags, - nr - set->reserved_tags); + if (blk_mq_is_shared_tags(set->flags)) { + if (q->elevator) + blk_mq_tag_update_sched_shared_tags(q); + else + blk_mq_tag_resize_shared_tags(set, nr); + } } blk_mq_unquiesce_queue(q); @@ -3770,8 +4529,13 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_realloc_hw_ctxs(set, q); if (q->nr_hw_queues != set->nr_hw_queues) { + int i = prev_nr_hw_queues; + pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n", nr_hw_queues, prev_nr_hw_queues); + for (; i < set->nr_hw_queues; i++) + __blk_mq_free_map_and_rqs(set, i); + set->nr_hw_queues = prev_nr_hw_queues; blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); goto fallback; @@ -3804,11 +4568,10 @@ EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); /* Enable polling stats and return whether they were already enabled. 
*/ static bool blk_poll_stats_enable(struct request_queue *q) { - if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) || - blk_queue_flag_test_and_set(QUEUE_FLAG_POLL_STATS, q)) + if (q->poll_stat) return true; - blk_stat_add_callback(q, q->poll_cb); - return false; + + return blk_stats_alloc_enable(q); } static void blk_mq_poll_stats_start(struct request_queue *q) @@ -3817,8 +4580,7 @@ static void blk_mq_poll_stats_start(struct request_queue *q) * We don't arm the callback if polling stats are not enabled or the * callback is already active. */ - if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) || - blk_stat_is_active(q->poll_cb)) + if (!q->poll_stat || blk_stat_is_active(q->poll_cb)) return; blk_stat_activate_msecs(q->poll_cb, 100); @@ -3867,15 +4629,20 @@ static unsigned long blk_mq_poll_nsecs(struct request_queue *q, return ret; } -static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, - struct request *rq) +static bool blk_mq_poll_hybrid(struct request_queue *q, blk_qc_t qc) { + struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, qc); + struct request *rq = blk_qc_to_rq(hctx, qc); struct hrtimer_sleeper hs; enum hrtimer_mode mode; unsigned int nsecs; ktime_t kt; - if (rq->rq_flags & RQF_MQ_POLL_SLEPT) + /* + * If a request has completed on queue that uses an I/O scheduler, we + * won't get back a request from blk_qc_to_rq. + */ + if (!rq || (rq->rq_flags & RQF_MQ_POLL_SLEPT)) return false; /* @@ -3917,92 +4684,37 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, __set_current_state(TASK_RUNNING); destroy_hrtimer_on_stack(&hs.timer); + + /* + * If we sleep, have the caller restart the poll loop to reset the + * state. Like for the other success return cases, the caller is + * responsible for checking if the IO completed. If the IO isn't + * complete, we'll get called again and will go straight to the busy + * poll loop. + */ return true; } -static bool blk_mq_poll_hybrid(struct request_queue *q, - struct blk_mq_hw_ctx *hctx, blk_qc_t cookie) +static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie, + struct io_comp_batch *iob, unsigned int flags) { - struct request *rq; + struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, cookie); + long state = get_current_state(); + int ret; - if (q->poll_nsec == BLK_MQ_POLL_CLASSIC) - return false; - - if (!blk_qc_t_is_internal(cookie)) - rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); - else { - rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie)); - /* - * With scheduling, if the request has completed, we'll - * get a NULL return here, as we clear the sched tag when - * that happens. The request still remains valid, like always, - * so we should be safe with just the NULL check. - */ - if (!rq) - return false; - } - - return blk_mq_poll_hybrid_sleep(q, rq); -} - -/** - * blk_poll - poll for IO completions - * @q: the queue - * @cookie: cookie passed back at IO submission time - * @spin: whether to spin for completions - * - * Description: - * Poll for completions on the passed in queue. Returns number of - * completed entries found. If @spin is true, then blk_poll will continue - * looping until at least one completion is found, unless the task is - * otherwise marked running (or we need to reschedule). 
- */ -int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin) -{ - struct blk_mq_hw_ctx *hctx; - unsigned int state; - - if (!blk_qc_t_valid(cookie) || - !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) - return 0; - - if (current->plug) - blk_flush_plug_list(current->plug, false); - - hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; - - /* - * If we sleep, have the caller restart the poll loop to reset - * the state. Like for the other success return cases, the - * caller is responsible for checking if the IO completed. If - * the IO isn't complete, we'll get called again and will go - * straight to the busy poll loop. If specified not to spin, - * we also should not sleep. - */ - if (spin && blk_mq_poll_hybrid(q, hctx, cookie)) - return 1; - - hctx->poll_considered++; - - state = get_current_state(); do { - int ret; - - hctx->poll_invoked++; - - ret = q->mq_ops->poll(hctx); + ret = q->mq_ops->poll(hctx, iob); if (ret > 0) { - hctx->poll_success++; __set_current_state(TASK_RUNNING); return ret; } if (signal_pending_state(state, current)) __set_current_state(TASK_RUNNING); - if (task_is_running(current)) return 1; - if (ret < 0 || !spin) + + if (ret < 0 || (flags & BLK_POLL_ONESHOT)) break; cpu_relax(); } while (!need_resched()); @@ -4010,7 +4722,17 @@ int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin) __set_current_state(TASK_RUNNING); return 0; } -EXPORT_SYMBOL_GPL(blk_poll); + +int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, + unsigned int flags) +{ + if (!(flags & BLK_POLL_NOSLEEP) && + q->poll_nsec != BLK_MQ_POLL_CLASSIC) { + if (blk_mq_poll_hybrid(q, cookie)) + return 1; + } + return blk_mq_poll_classic(q, cookie, iob, flags); +} unsigned int blk_mq_rq_cpu(struct request *rq) { diff --git a/block/blk-mq.h b/block/blk-mq.h index 7cdca23b62..948791ea2a 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -25,18 +25,14 @@ struct blk_mq_ctx { unsigned short index_hw[HCTX_MAX_TYPES]; struct blk_mq_hw_ctx *hctxs[HCTX_MAX_TYPES]; - /* incremented at dispatch time */ - unsigned long rq_dispatched[2]; - unsigned long rq_merged; - - /* incremented at completion time */ - unsigned long ____cacheline_aligned_in_smp rq_completed[2]; - struct request_queue *queue; struct blk_mq_ctxs *ctxs; struct kobject kobj; } ____cacheline_aligned_in_smp; +void blk_mq_submit_bio(struct bio *bio); +int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, + unsigned int flags); void blk_mq_exit_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); @@ -54,15 +50,12 @@ void blk_mq_put_rq_ref(struct request *rq); */ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx); -void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags); -struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, - unsigned int hctx_idx, - unsigned int nr_tags, - unsigned int reserved_tags, - unsigned int flags); -int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, - unsigned int hctx_idx, unsigned int depth); - +void blk_mq_free_rq_map(struct blk_mq_tags *tags); +struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, + unsigned int hctx_idx, unsigned int depth); +void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, + struct blk_mq_tags *tags, + unsigned int hctx_idx); /* * Internal helpers for request insertion into sw queues */ @@ -72,9 +65,6 
@@ void blk_mq_request_bypass_insert(struct request *rq, bool at_head, bool run_queue); void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list); - -/* Used by blk_insert_cloned_request() to issue request directly */ -blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last); void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list); @@ -96,6 +86,20 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue * return q->queue_hw_ctx[q->tag_set->map[type].mq_map[cpu]]; } +static inline enum hctx_type blk_mq_get_hctx_type(unsigned int flags) +{ + enum hctx_type type = HCTX_TYPE_DEFAULT; + + /* + * The caller ensure that if REQ_POLLED, poll must be enabled. + */ + if (flags & REQ_POLLED) + type = HCTX_TYPE_POLL; + else if ((flags & REQ_OP_MASK) == REQ_OP_READ) + type = HCTX_TYPE_READ; + return type; +} + /* * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue * @q: request queue @@ -106,17 +110,7 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, unsigned int flags, struct blk_mq_ctx *ctx) { - enum hctx_type type = HCTX_TYPE_DEFAULT; - - /* - * The caller ensure that if REQ_HIPRI, poll must be enabled. - */ - if (flags & REQ_HIPRI) - type = HCTX_TYPE_POLL; - else if ((flags & REQ_OP_MASK) == REQ_OP_READ) - type = HCTX_TYPE_READ; - - return ctx->hctxs[type]; + return ctx->hctxs[blk_mq_get_hctx_type(flags)]; } /* @@ -128,6 +122,8 @@ extern int __blk_mq_register_dev(struct device *dev, struct request_queue *q); extern int blk_mq_sysfs_register(struct request_queue *q); extern void blk_mq_sysfs_unregister(struct request_queue *q); extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); +void blk_mq_free_plug_rqs(struct blk_plug *plug); +void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); void blk_mq_cancel_work_sync(struct request_queue *q); @@ -156,23 +152,27 @@ struct blk_mq_alloc_data { blk_mq_req_flags_t flags; unsigned int shallow_depth; unsigned int cmd_flags; + req_flags_t rq_flags; + + /* allocate multiple requests/tags in one go */ + unsigned int nr_tags; + struct request **cached_rq; /* input & output parameter */ struct blk_mq_ctx *ctx; struct blk_mq_hw_ctx *hctx; }; -static inline bool blk_mq_is_sbitmap_shared(unsigned int flags) +static inline bool blk_mq_is_shared_tags(unsigned int flags) { return flags & BLK_MQ_F_TAG_HCTX_SHARED; } static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data) { - if (data->q->elevator) - return data->hctx->sched_tags; - - return data->hctx->tags; + if (!(data->rq_flags & RQF_ELV)) + return data->hctx->tags; + return data->hctx->sched_tags; } static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx) @@ -222,24 +222,30 @@ static inline int blk_mq_get_rq_budget_token(struct request *rq) static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx) { - if (blk_mq_is_sbitmap_shared(hctx->flags)) - atomic_inc(&hctx->queue->nr_active_requests_shared_sbitmap); + if (blk_mq_is_shared_tags(hctx->flags)) + atomic_inc(&hctx->queue->nr_active_requests_shared_tags); else atomic_inc(&hctx->nr_active); } +static inline void __blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx, + int val) +{ + if (blk_mq_is_shared_tags(hctx->flags)) + atomic_sub(val, &hctx->queue->nr_active_requests_shared_tags); + else + atomic_sub(val, &hctx->nr_active); +} + static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx) { - if 
(blk_mq_is_sbitmap_shared(hctx->flags)) - atomic_dec(&hctx->queue->nr_active_requests_shared_sbitmap); - else - atomic_dec(&hctx->nr_active); + __blk_mq_sub_active_requests(hctx, 1); } static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx) { - if (blk_mq_is_sbitmap_shared(hctx->flags)) - return atomic_read(&hctx->queue->nr_active_requests_shared_sbitmap); + if (blk_mq_is_shared_tags(hctx->flags)) + return atomic_read(&hctx->queue->nr_active_requests_shared_tags); return atomic_read(&hctx->nr_active); } static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, @@ -262,7 +268,20 @@ static inline void blk_mq_put_driver_tag(struct request *rq) __blk_mq_put_driver_tag(rq->mq_hctx, rq); } -bool blk_mq_get_driver_tag(struct request *rq); +bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq); + +static inline bool blk_mq_get_driver_tag(struct request *rq) +{ + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; + + if (rq->tag != BLK_MQ_NO_TAG && + !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { + hctx->tags->rqs[rq->tag] = rq; + return true; + } + + return __blk_mq_get_driver_tag(hctx, rq); +} static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) { @@ -333,19 +352,18 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, if (bt->sb.depth == 1) return true; - if (blk_mq_is_sbitmap_shared(hctx->flags)) { + if (blk_mq_is_shared_tags(hctx->flags)) { struct request_queue *q = hctx->queue; - struct blk_mq_tag_set *set = q->tag_set; if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) return true; - users = atomic_read(&set->active_queues_shared_sbitmap); } else { if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) return true; - users = atomic_read(&hctx->tags->active_queues); } + users = atomic_read(&hctx->tags->active_queues); + if (!users) return true; @@ -356,5 +374,24 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, return __blk_mq_active_requests(hctx) < depth; } +/* run the code block in @dispatch_ops with rcu/srcu read lock held */ +#define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops) \ +do { \ + if (!blk_queue_has_srcu(q)) { \ + rcu_read_lock(); \ + (dispatch_ops); \ + rcu_read_unlock(); \ + } else { \ + int srcu_idx; \ + \ + might_sleep_if(check_sleep); \ + srcu_idx = srcu_read_lock((q)->srcu); \ + (dispatch_ops); \ + srcu_read_unlock((q)->srcu, srcu_idx); \ + } \ +} while (0) + +#define blk_mq_run_dispatch_ops(q, dispatch_ops) \ + __blk_mq_run_dispatch_ops(q, true, dispatch_ops) \ #endif diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index f000f83e06..3cfbc8668c 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -189,9 +189,10 @@ static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio) * BIO_TRACKED lets controllers know that a bio went through the * normal rq_qos path. 
*/ - bio_set_flag(bio, BIO_TRACKED); - if (q->rq_qos) + if (q->rq_qos) { + bio_set_flag(bio, BIO_TRACKED); __rq_qos_throttle(q->rq_qos, bio); + } } static inline void rq_qos_track(struct request_queue *q, struct request *rq, diff --git a/block/blk-stat.c b/block/blk-stat.c index ae3dd1fb8e..2ea01b5c1a 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -15,7 +15,7 @@ struct blk_queue_stats { struct list_head callbacks; spinlock_t lock; - bool enable_accounting; + int accounting; }; void blk_rq_stat_init(struct blk_rq_stat *stat) @@ -161,7 +161,7 @@ void blk_stat_remove_callback(struct request_queue *q, spin_lock_irqsave(&q->stats->lock, flags); list_del_rcu(&cb->list); - if (list_empty(&q->stats->callbacks) && !q->stats->enable_accounting) + if (list_empty(&q->stats->callbacks) && !q->stats->accounting) blk_queue_flag_clear(QUEUE_FLAG_STATS, q); spin_unlock_irqrestore(&q->stats->lock, flags); @@ -184,13 +184,24 @@ void blk_stat_free_callback(struct blk_stat_callback *cb) call_rcu(&cb->rcu, blk_stat_free_callback_rcu); } +void blk_stat_disable_accounting(struct request_queue *q) +{ + unsigned long flags; + + spin_lock_irqsave(&q->stats->lock, flags); + if (!--q->stats->accounting) + blk_queue_flag_clear(QUEUE_FLAG_STATS, q); + spin_unlock_irqrestore(&q->stats->lock, flags); +} +EXPORT_SYMBOL_GPL(blk_stat_disable_accounting); + void blk_stat_enable_accounting(struct request_queue *q) { unsigned long flags; spin_lock_irqsave(&q->stats->lock, flags); - q->stats->enable_accounting = true; - blk_queue_flag_set(QUEUE_FLAG_STATS, q); + if (!q->stats->accounting++) + blk_queue_flag_set(QUEUE_FLAG_STATS, q); spin_unlock_irqrestore(&q->stats->lock, flags); } EXPORT_SYMBOL_GPL(blk_stat_enable_accounting); @@ -205,7 +216,7 @@ struct blk_queue_stats *blk_alloc_queue_stats(void) INIT_LIST_HEAD(&stats->callbacks); spin_lock_init(&stats->lock); - stats->enable_accounting = false; + stats->accounting = 0; return stats; } @@ -219,3 +230,21 @@ void blk_free_queue_stats(struct blk_queue_stats *stats) kfree(stats); } + +bool blk_stats_alloc_enable(struct request_queue *q) +{ + struct blk_rq_stat *poll_stat; + + poll_stat = kcalloc(BLK_MQ_POLL_STATS_BKTS, sizeof(*poll_stat), + GFP_ATOMIC); + if (!poll_stat) + return false; + + if (cmpxchg(&q->poll_stat, NULL, poll_stat) != NULL) { + kfree(poll_stat); + return true; + } + + blk_stat_add_callback(q, q->poll_cb); + return false; +} diff --git a/block/blk-stat.h b/block/blk-stat.h index 17b47a86ee..17e1eb4ec7 100644 --- a/block/blk-stat.h +++ b/block/blk-stat.h @@ -64,11 +64,13 @@ struct blk_stat_callback { struct blk_queue_stats *blk_alloc_queue_stats(void); void blk_free_queue_stats(struct blk_queue_stats *); +bool blk_stats_alloc_enable(struct request_queue *q); void blk_stat_add(struct request *rq, u64 now); /* record time/size info in request but not add a callback */ void blk_stat_enable_accounting(struct request_queue *q); +void blk_stat_disable_accounting(struct request_queue *q); /** * blk_stat_alloc_callback() - Allocate a block statistics callback. 
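The blk-stat.c and blk-stat.h hunks above replace the single enable_accounting flag with a nesting counter: blk_stat_enable_accounting() sets QUEUE_FLAG_STATS only on the first enable, the new blk_stat_disable_accounting() clears it only when the last user goes away, and blk_stats_alloc_enable() lazily allocates q->poll_stat with cmpxchg() so exactly one caller installs the buffer and registers q->poll_cb. A minimal sketch of how a kernel-internal user would pair the two calls; the example_* functions are hypothetical, only the helpers declared above are real:

	/* Sketch only -- example_* is a hypothetical in-kernel user of the
	 * helpers declared in block/blk-stat.h above. */
	#include "blk-stat.h"

	static void example_stats_user_attach(struct request_queue *q)
	{
		/* the 0 -> 1 transition sets QUEUE_FLAG_STATS */
		blk_stat_enable_accounting(q);
	}

	static void example_stats_user_detach(struct request_queue *q)
	{
		/* the flag is cleared only when the count drops back to zero */
		blk_stat_disable_accounting(q);
	}

With the counter, independent users (an I/O scheduler and the polling stats, say) can enable and disable accounting without clobbering each other, which the old boolean could not guarantee.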
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 4737ec024e..9f32882ceb 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -16,7 +16,9 @@ #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" +#include "blk-mq-sched.h" #include "blk-wbt.h" +#include "blk-throttle.h" struct queue_sysfs_entry { struct attribute attr; @@ -432,26 +434,11 @@ static ssize_t queue_poll_show(struct request_queue *q, char *page) static ssize_t queue_poll_store(struct request_queue *q, const char *page, size_t count) { - unsigned long poll_on; - ssize_t ret; - - if (!q->tag_set || q->tag_set->nr_maps <= HCTX_TYPE_POLL || - !q->tag_set->map[HCTX_TYPE_POLL].nr_queues) + if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) return -EINVAL; - - ret = queue_var_store(&poll_on, page, count); - if (ret < 0) - return ret; - - if (poll_on) { - blk_queue_flag_set(QUEUE_FLAG_POLL, q); - } else { - blk_mq_freeze_queue(q); - blk_queue_flag_clear(QUEUE_FLAG_POLL, q); - blk_mq_unfreeze_queue(q); - } - - return ret; + pr_info_ratelimited("writes to the poll attribute are ignored.\n"); + pr_info_ratelimited("please use driver specific parameters instead.\n"); + return count; } static ssize_t queue_io_timeout_show(struct request_queue *q, char *page) @@ -748,7 +735,8 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head) { struct request_queue *q = container_of(rcu_head, struct request_queue, rcu_head); - kmem_cache_free(blk_requestq_cachep, q); + + kmem_cache_free(blk_get_queue_kmem_cache(blk_queue_has_srcu(q)), q); } /* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */ @@ -761,7 +749,7 @@ static void blk_exit_queue(struct request_queue *q) */ if (q->elevator) { ioc_clear_queue(q); - __elevator_exit(q, q->elevator); + elevator_exit(q); } /* @@ -799,14 +787,15 @@ static void blk_release_queue(struct kobject *kobj) might_sleep(); - if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags)) + if (q->poll_stat) blk_stat_remove_callback(q, q->poll_cb); blk_stat_free_callback(q->poll_cb); - blk_free_queue_stats(q->stats); - blk_exit_queue(q); + blk_free_queue_stats(q->stats); + kfree(q->poll_stat); + blk_queue_free_zone_bitmaps(q); if (queue_is_mq(q)) @@ -822,6 +811,9 @@ static void blk_release_queue(struct kobject *kobj) bioset_exit(&q->bio_split); + if (blk_queue_has_srcu(q)) + cleanup_srcu_struct(q->srcu); + ida_simple_remove(&blk_queue_ida, q->id); call_rcu(&q->rcu_head, blk_free_queue_rcu); } @@ -877,16 +869,15 @@ int blk_register_queue(struct gendisk *disk) } mutex_lock(&q->sysfs_lock); + + ret = disk_register_independent_access_ranges(disk, NULL); + if (ret) + goto put_dev; + if (q->elevator) { ret = elv_register_queue(q, false); - if (ret) { - mutex_unlock(&q->sysfs_lock); - mutex_unlock(&q->sysfs_dir_lock); - kobject_del(&q->kobj); - blk_trace_remove_sysfs(dev); - kobject_put(&dev->kobj); - return ret; - } + if (ret) + goto put_dev; } blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); @@ -899,7 +890,6 @@ int blk_register_queue(struct gendisk *disk) kobject_uevent(&q->elevator->kobj, KOBJ_ADD); mutex_unlock(&q->sysfs_lock); - ret = 0; unlock: mutex_unlock(&q->sysfs_dir_lock); @@ -917,6 +907,16 @@ int blk_register_queue(struct gendisk *disk) percpu_ref_switch_to_percpu(&q->q_usage_counter); } + return ret; + +put_dev: + disk_unregister_independent_access_ranges(disk); + mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->sysfs_dir_lock); + kobject_del(&q->kobj); + blk_trace_remove_sysfs(dev); + kobject_put(&dev->kobj); + return ret; } @@ -962,6 +962,7 @@ void blk_unregister_queue(struct 
gendisk *disk) mutex_lock(&q->sysfs_lock); if (q->elevator) elv_unregister_queue(q); + disk_unregister_independent_access_ranges(disk); mutex_unlock(&q->sysfs_lock); mutex_unlock(&q->sysfs_dir_lock); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 7c4e7993ba..7c462c006b 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -13,6 +13,8 @@ #include #include "blk.h" #include "blk-cgroup-rwstat.h" +#include "blk-stat.h" +#include "blk-throttle.h" /* Max dispatch from a group in 1 round */ #define THROTL_GRP_QUANTUM 8 @@ -37,60 +39,9 @@ */ #define LATENCY_FILTERED_HD (1000L) /* 1ms */ -static struct blkcg_policy blkcg_policy_throtl; - /* A workqueue to queue throttle related work */ static struct workqueue_struct *kthrotld_workqueue; -/* - * To implement hierarchical throttling, throtl_grps form a tree and bios - * are dispatched upwards level by level until they reach the top and get - * issued. When dispatching bios from the children and local group at each - * level, if the bios are dispatched into a single bio_list, there's a risk - * of a local or child group which can queue many bios at once filling up - * the list starving others. - * - * To avoid such starvation, dispatched bios are queued separately - * according to where they came from. When they are again dispatched to - * the parent, they're popped in round-robin order so that no single source - * hogs the dispatch window. - * - * throtl_qnode is used to keep the queued bios separated by their sources. - * Bios are queued to throtl_qnode which in turn is queued to - * throtl_service_queue and then dispatched in round-robin order. - * - * It's also used to track the reference counts on blkg's. A qnode always - * belongs to a throtl_grp and gets queued on itself or the parent, so - * incrementing the reference of the associated throtl_grp when a qnode is - * queued and decrementing when dequeued is enough to keep the whole blkg - * tree pinned while bios are in flight. - */ -struct throtl_qnode { - struct list_head node; /* service_queue->queued[] */ - struct bio_list bios; /* queued bios */ - struct throtl_grp *tg; /* tg this qnode belongs to */ -}; - -struct throtl_service_queue { - struct throtl_service_queue *parent_sq; /* the parent service_queue */ - - /* - * Bios queued directly to this service_queue or dispatched from - * children throtl_grp's. - */ - struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */ - unsigned int nr_queued[2]; /* number of queued bios */ - - /* - * RB tree of active children throtl_grp's, which are sorted by - * their ->disptime. 
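The throtl_qnode, throtl_service_queue and throtl_grp definitions removed above (they move out of blk-throttle.c together with this comment) describe queuing bios per source and popping them in round-robin order so that no single child group can hog the dispatch window. A toy, userspace-only model of that round-robin draining, with made-up source counts and names:

#include <stdio.h>

#define NR_SRC 3

int main(void)
{
    /* three sources with different amounts of queued work */
    int pending[NR_SRC] = { 5, 1, 2 };
    int left = 5 + 1 + 2;
    int src = 0;

    while (left > 0) {
        if (pending[src]) {         /* pop at most one item per source */
            pending[src]--;
            left--;
            printf("dispatch one bio from source %d\n", src);
        }
        src = (src + 1) % NR_SRC;   /* then move to the next source */
    }
    return 0;
}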
- */ - struct rb_root_cached pending_tree; /* RB tree of active tgs */ - unsigned int nr_pending; /* # queued in the tree */ - unsigned long first_pending_disptime; /* disptime of the first tg */ - struct timer_list pending_timer; /* fires on first_pending_disptime */ -}; - enum tg_state_flags { THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */ THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */ @@ -98,93 +49,6 @@ enum tg_state_flags { #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) -enum { - LIMIT_LOW, - LIMIT_MAX, - LIMIT_CNT, -}; - -struct throtl_grp { - /* must be the first member */ - struct blkg_policy_data pd; - - /* active throtl group service_queue member */ - struct rb_node rb_node; - - /* throtl_data this group belongs to */ - struct throtl_data *td; - - /* this group's service queue */ - struct throtl_service_queue service_queue; - - /* - * qnode_on_self is used when bios are directly queued to this - * throtl_grp so that local bios compete fairly with bios - * dispatched from children. qnode_on_parent is used when bios are - * dispatched from this throtl_grp into its parent and will compete - * with the sibling qnode_on_parents and the parent's - * qnode_on_self. - */ - struct throtl_qnode qnode_on_self[2]; - struct throtl_qnode qnode_on_parent[2]; - - /* - * Dispatch time in jiffies. This is the estimated time when group - * will unthrottle and is ready to dispatch more bio. It is used as - * key to sort active groups in service tree. - */ - unsigned long disptime; - - unsigned int flags; - - /* are there any throtl rules between this group and td? */ - bool has_rules[2]; - - /* internally used bytes per second rate limits */ - uint64_t bps[2][LIMIT_CNT]; - /* user configured bps limits */ - uint64_t bps_conf[2][LIMIT_CNT]; - - /* internally used IOPS limits */ - unsigned int iops[2][LIMIT_CNT]; - /* user configured IOPS limits */ - unsigned int iops_conf[2][LIMIT_CNT]; - - /* Number of bytes dispatched in current slice */ - uint64_t bytes_disp[2]; - /* Number of bio's dispatched in current slice */ - unsigned int io_disp[2]; - - unsigned long last_low_overflow_time[2]; - - uint64_t last_bytes_disp[2]; - unsigned int last_io_disp[2]; - - unsigned long last_check_time; - - unsigned long latency_target; /* us */ - unsigned long latency_target_conf; /* us */ - /* When did we start a new slice */ - unsigned long slice_start[2]; - unsigned long slice_end[2]; - - unsigned long last_finish_time; /* ns / 1024 */ - unsigned long checked_last_finish_time; /* ns / 1024 */ - unsigned long avg_idletime; /* ns / 1024 */ - unsigned long idletime_threshold; /* us */ - unsigned long idletime_threshold_conf; /* us */ - - unsigned int bio_cnt; /* total bios */ - unsigned int bad_bio_cnt; /* bios exceeding latency threshold */ - unsigned long bio_cnt_reset_time; - - atomic_t io_split_cnt[2]; - atomic_t last_io_split_cnt[2]; - - struct blkg_rwstat stat_bytes; - struct blkg_rwstat stat_ios; -}; - /* We measure latency for request size from <= 4k to >= 1M */ #define LATENCY_BUCKET_SIZE 9 @@ -231,16 +95,6 @@ struct throtl_data static void throtl_pending_timer_fn(struct timer_list *t); -static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) -{ - return pd ? 
container_of(pd, struct throtl_grp, pd) : NULL; -} - -static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg) -{ - return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl)); -} - static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg) { return pd_to_blkg(&tg->pd); @@ -1794,7 +1648,7 @@ static void throtl_shutdown_wq(struct request_queue *q) cancel_work_sync(&td->dispatch_work); } -static struct blkcg_policy blkcg_policy_throtl = { +struct blkcg_policy blkcg_policy_throtl = { .dfl_cftypes = throtl_files, .legacy_cftypes = throtl_legacy_files, @@ -2208,9 +2062,9 @@ void blk_throtl_charge_bio_split(struct bio *bio) } while (parent); } -bool blk_throtl_bio(struct bio *bio) +bool __blk_throtl_bio(struct bio *bio) { - struct request_queue *q = bio->bi_bdev->bd_disk->queue; + struct request_queue *q = bdev_get_queue(bio->bi_bdev); struct blkcg_gq *blkg = bio->bi_blkg; struct throtl_qnode *qn = NULL; struct throtl_grp *tg = blkg_to_tg(blkg); @@ -2221,19 +2075,12 @@ bool blk_throtl_bio(struct bio *bio) rcu_read_lock(); - /* see throtl_charge_bio() */ - if (bio_flagged(bio, BIO_THROTTLED)) - goto out; - if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) { blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf, bio->bi_iter.bi_size); blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1); } - if (!tg->has_rules[rw]) - goto out; - spin_lock_irq(&q->queue_lock); throtl_update_latency_buckets(td); @@ -2317,7 +2164,6 @@ bool blk_throtl_bio(struct bio *bio) out_unlock: spin_unlock_irq(&q->queue_lock); -out: bio_set_flag(bio, BIO_THROTTLED); #ifdef CONFIG_BLK_DEV_THROTTLING_LOW diff --git a/block/blk.h b/block/blk.h index aab72194d2..8bd43b3ad3 100644 --- a/block/blk.h +++ b/block/blk.h @@ -2,15 +2,12 @@ #ifndef BLK_INTERNAL_H #define BLK_INTERNAL_H -#include -#include -#include #include #include /* for max_pfn/max_low_pfn */ #include #include "blk-crypto-internal.h" -#include "blk-mq.h" -#include "blk-mq-sched.h" + +struct elevator_type; /* Max future timer expiry for timeouts */ #define BLK_MAX_TIMEOUT (5 * HZ) @@ -30,15 +27,10 @@ struct blk_flush_queue { }; extern struct kmem_cache *blk_requestq_cachep; +extern struct kmem_cache *blk_requestq_srcu_cachep; extern struct kobj_type blk_queue_ktype; extern struct ida blk_queue_ida; -static inline struct blk_flush_queue * -blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx) -{ - return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq; -} - static inline void __blk_get_queue(struct request_queue *q) { kobject_get(&q->kobj); @@ -53,6 +45,41 @@ void blk_free_flush_queue(struct blk_flush_queue *q); void blk_freeze_queue(struct request_queue *q); void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic); void blk_queue_start_drain(struct request_queue *q); +int __bio_queue_enter(struct request_queue *q, struct bio *bio); +bool submit_bio_checks(struct bio *bio); + +static inline bool blk_try_enter_queue(struct request_queue *q, bool pm) +{ + rcu_read_lock(); + if (!percpu_ref_tryget_live_rcu(&q->q_usage_counter)) + goto fail; + + /* + * The code that increments the pm_only counter must ensure that the + * counter is globally visible before the queue is unfrozen. 
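blk_try_enter_queue() above is the new inline fast path of bio_queue_enter(): take an optimistic reference on the queue, re-check the blocking conditions, and drop the reference again if the check fails, leaving everything else to the out-of-line __bio_queue_enter(). A simplified userspace model of that take/check/undo shape follows; plain atomics stand in for the percpu ref, and unlike the real tryget this one cannot itself fail on a dying queue:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct queue {
    atomic_int usage;       /* stands in for q_usage_counter */
    atomic_bool blocked;    /* stands in for pm_only / frozen state */
};

static bool try_enter_queue(struct queue *q)
{
    atomic_fetch_add(&q->usage, 1);         /* optimistic reference */

    if (atomic_load(&q->blocked)) {
        atomic_fetch_sub(&q->usage, 1);     /* undo; caller falls back
                                               to the slow path */
        return false;
    }
    return true;
}

int main(void)
{
    struct queue q = { 0, false };

    printf("enter while open: %d\n", try_enter_queue(&q));
    atomic_store(&q.blocked, true);
    printf("enter while blocked: %d\n", try_enter_queue(&q));
    printf("references still held: %d\n", atomic_load(&q.usage));
    return 0;
}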
+ */ + if (blk_queue_pm_only(q) && + (!pm || queue_rpm_status(q) == RPM_SUSPENDED)) + goto fail_put; + + rcu_read_unlock(); + return true; + +fail_put: + blk_queue_exit(q); +fail: + rcu_read_unlock(); + return false; +} + +static inline int bio_queue_enter(struct bio *bio) +{ + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + + if (blk_try_enter_queue(q, false)) + return 0; + return __bio_queue_enter(q, bio); +} #define BIO_INLINE_VECS 4 struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, @@ -94,6 +121,44 @@ static inline bool bvec_gap_to_prev(struct request_queue *q, return __bvec_gap_to_prev(q, bprv, offset); } +static inline bool rq_mergeable(struct request *rq) +{ + if (blk_rq_is_passthrough(rq)) + return false; + + if (req_op(rq) == REQ_OP_FLUSH) + return false; + + if (req_op(rq) == REQ_OP_WRITE_ZEROES) + return false; + + if (req_op(rq) == REQ_OP_ZONE_APPEND) + return false; + + if (rq->cmd_flags & REQ_NOMERGE_FLAGS) + return false; + if (rq->rq_flags & RQF_NOMERGE_FLAGS) + return false; + + return true; +} + +/* + * There are two different ways to handle DISCARD merges: + * 1) If max_discard_segments > 1, the driver treats every bio as a range and + * send the bios to controller together. The ranges don't need to be + * contiguous. + * 2) Otherwise, the request will be normal read/write requests. The ranges + * need to be contiguous. + */ +static inline bool blk_discard_mergable(struct request *req) +{ + if (req_op(req) == REQ_OP_DISCARD && + queue_max_discard_segments(req->q) > 1) + return true; + return false; +} + #ifdef CONFIG_BLK_DEV_INTEGRITY void blk_flush_integrity(void); bool __bio_integrity_endio(struct bio *); @@ -175,15 +240,13 @@ static inline void blk_integrity_del(struct gendisk *disk) unsigned long blk_rq_timeout(unsigned long timeout); void blk_add_timer(struct request *req); +const char *blk_status_to_str(blk_status_t status); bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int nr_segs, struct request **same_queue_rq); + unsigned int nr_segs); bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, struct bio *bio, unsigned int nr_segs); -void blk_account_io_start(struct request *req); -void blk_account_io_done(struct request *req, u64 now); - /* * Plug flush limits */ @@ -199,19 +262,10 @@ void blk_insert_flush(struct request *rq); int elevator_switch_mq(struct request_queue *q, struct elevator_type *new_e); -void __elevator_exit(struct request_queue *, struct elevator_queue *); +void elevator_exit(struct request_queue *q); int elv_register_queue(struct request_queue *q, bool uevent); void elv_unregister_queue(struct request_queue *q); -static inline void elevator_exit(struct request_queue *q, - struct elevator_queue *e) -{ - lockdep_assert_held(&q->sysfs_lock); - - blk_mq_sched_free_requests(q); - __elevator_exit(q, e); -} - ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf); ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, @@ -226,7 +280,32 @@ ssize_t part_timeout_show(struct device *, struct device_attribute *, char *); ssize_t part_timeout_store(struct device *, struct device_attribute *, const char *, size_t); -void __blk_queue_split(struct bio **bio, unsigned int *nr_segs); +static inline bool blk_may_split(struct request_queue *q, struct bio *bio) +{ + switch (bio_op(bio)) { + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: + case REQ_OP_WRITE_ZEROES: + case REQ_OP_WRITE_SAME: + return true; /* non-trivial splitting 
decisions */ + default: + break; + } + + /* + * All drivers must accept single-segments bios that are <= PAGE_SIZE. + * This is a quick and dirty check that relies on the fact that + * bi_io_vec[0] is always valid if a bio has data. The check might + * lead to occasional false negatives when bios are cloned, but compared + * to the performance impact of cloned bios themselves the loop below + * doesn't matter anyway. + */ + return q->limits.chunk_sectors || bio->bi_vcnt != 1 || + bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE; +} + +void __blk_queue_split(struct request_queue *q, struct bio **bio, + unsigned int *nr_segs); int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs); bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, @@ -246,9 +325,11 @@ int blk_dev_init(void); */ static inline bool blk_do_io_stat(struct request *rq) { - return rq->rq_disk && (rq->rq_flags & RQF_IO_STAT); + return (rq->rq_flags & RQF_IO_STAT) && rq->q->disk; } +void update_io_ticks(struct block_device *part, unsigned long now, bool end); + static inline void req_set_nomerge(struct request_queue *q, struct request *req) { req->cmd_flags |= REQ_NOMERGE; @@ -283,30 +364,16 @@ static inline unsigned int bio_aligned_discard_max_sectors( /* * Internal io_context interface */ -void get_io_context(struct io_context *ioc); -struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q); -struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, - gfp_t gfp_mask); +struct io_cq *ioc_find_get_icq(struct request_queue *q); +struct io_cq *ioc_lookup_icq(struct request_queue *q); +#ifdef CONFIG_BLK_ICQ void ioc_clear_queue(struct request_queue *q); +#else +static inline void ioc_clear_queue(struct request_queue *q) +{ +} +#endif /* CONFIG_BLK_ICQ */ -int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node); - -/* - * Internal throttling interface - */ -#ifdef CONFIG_BLK_DEV_THROTTLING -extern int blk_throtl_init(struct request_queue *q); -extern void blk_throtl_exit(struct request_queue *q); -extern void blk_throtl_register_queue(struct request_queue *q); -extern void blk_throtl_charge_bio_split(struct bio *bio); -bool blk_throtl_bio(struct bio *bio); -#else /* CONFIG_BLK_DEV_THROTTLING */ -static inline int blk_throtl_init(struct request_queue *q) { return 0; } -static inline void blk_throtl_exit(struct request_queue *q) { } -static inline void blk_throtl_register_queue(struct request_queue *q) { } -static inline void blk_throtl_charge_bio_split(struct bio *bio) { } -static inline bool blk_throtl_bio(struct bio *bio) { return false; } -#endif /* CONFIG_BLK_DEV_THROTTLING */ #ifdef CONFIG_BLK_DEV_THROTTLING_LOW extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page); extern ssize_t blk_throtl_sample_time_store(struct request_queue *q, @@ -364,7 +431,15 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, unsigned int max_sectors, bool *same_page); -struct request_queue *blk_alloc_queue(int node_id); +static inline struct kmem_cache *blk_get_queue_kmem_cache(bool srcu) +{ + if (srcu) + return blk_requestq_srcu_cachep; + return blk_requestq_cachep; +} +struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu); + +int disk_scan_partitions(struct gendisk *disk, fmode_t mode); int disk_alloc_events(struct gendisk *disk); void disk_add_events(struct gendisk *disk); @@ -374,13 +449,61 @@ extern struct 
device_attribute dev_attr_events; extern struct device_attribute dev_attr_events_async; extern struct device_attribute dev_attr_events_poll_msecs; -static inline void bio_clear_hipri(struct bio *bio) +static inline void bio_clear_polled(struct bio *bio) { /* can't support alloc cache if we turn off polling */ bio_clear_flag(bio, BIO_PERCPU_CACHE); - bio->bi_opf &= ~REQ_HIPRI; + bio->bi_opf &= ~REQ_POLLED; } +long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); +long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); + extern const struct address_space_operations def_blk_aops; +int disk_register_independent_access_ranges(struct gendisk *disk, + struct blk_independent_access_ranges *new_iars); +void disk_unregister_independent_access_ranges(struct gendisk *disk); + +#ifdef CONFIG_FAIL_MAKE_REQUEST +bool should_fail_request(struct block_device *part, unsigned int bytes); +#else /* CONFIG_FAIL_MAKE_REQUEST */ +static inline bool should_fail_request(struct block_device *part, + unsigned int bytes) +{ + return false; +} +#endif /* CONFIG_FAIL_MAKE_REQUEST */ + +/* + * Optimized request reference counting. Ideally we'd make timeouts be more + * clever, as that's the only reason we need references at all... But until + * this happens, this is faster than using refcount_t. Also see: + * + * abc54d634334 ("io_uring: switch to atomic_t for io_kiocb reference count") + */ +#define req_ref_zero_or_close_to_overflow(req) \ + ((unsigned int) atomic_read(&(req->ref)) + 127u <= 127u) + +static inline bool req_ref_inc_not_zero(struct request *req) +{ + return atomic_inc_not_zero(&req->ref); +} + +static inline bool req_ref_put_and_test(struct request *req) +{ + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); + return atomic_dec_and_test(&req->ref); +} + +static inline void req_ref_set(struct request *req, int value) +{ + atomic_set(&req->ref, value); +} + +static inline int req_ref_read(struct request *req) +{ + return atomic_read(&req->ref); +} + #endif /* BLK_INTERNAL_H */ diff --git a/block/bounce.c b/block/bounce.c index 05fc714848..7af1a72835 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include diff --git a/block/bsg-lib.c b/block/bsg-lib.c index ccb98276c9..acfe1357bf 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -31,6 +31,7 @@ static int bsg_transport_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr, struct bsg_job *job; struct request *rq; struct bio *bio; + void *reply; int ret; if (hdr->protocol != BSG_PROTOCOL_SCSI || @@ -39,22 +40,28 @@ static int bsg_transport_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr, if (!capable(CAP_SYS_RAWIO)) return -EPERM; - rq = blk_get_request(q, hdr->dout_xfer_len ? + rq = blk_mq_alloc_request(q, hdr->dout_xfer_len ? 
REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0); if (IS_ERR(rq)) return PTR_ERR(rq); rq->timeout = timeout; job = blk_mq_rq_to_pdu(rq); + reply = job->reply; + memset(job, 0, sizeof(*job)); + job->reply = reply; + job->reply_len = SCSI_SENSE_BUFFERSIZE; + job->dd_data = job + 1; + job->request_len = hdr->request_len; job->request = memdup_user(uptr64(hdr->request), hdr->request_len); if (IS_ERR(job->request)) { ret = PTR_ERR(job->request); - goto out_put_request; + goto out_free_rq; } if (hdr->dout_xfer_len && hdr->din_xfer_len) { - job->bidi_rq = blk_get_request(rq->q, REQ_OP_DRV_IN, 0); + job->bidi_rq = blk_mq_alloc_request(rq->q, REQ_OP_DRV_IN, 0); if (IS_ERR(job->bidi_rq)) { ret = PTR_ERR(job->bidi_rq); goto out_free_job_request; @@ -85,7 +92,7 @@ static int bsg_transport_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr, goto out_unmap_bidi_rq; bio = rq->bio; - blk_execute_rq(NULL, rq, !(hdr->flags & BSG_FLAG_Q_AT_TAIL)); + blk_execute_rq(rq, !(hdr->flags & BSG_FLAG_Q_AT_TAIL)); /* * The assignments below don't make much sense, but are kept for @@ -134,11 +141,11 @@ static int bsg_transport_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr, blk_rq_unmap_user(job->bidi_bio); out_free_bidi_rq: if (job->bidi_rq) - blk_put_request(job->bidi_rq); + blk_mq_free_request(job->bidi_rq); out_free_job_request: kfree(job->request); -out_put_request: - blk_put_request(rq); +out_free_rq: + blk_mq_free_request(rq); return ret; } @@ -302,18 +309,6 @@ static int bsg_init_rq(struct blk_mq_tag_set *set, struct request *req, return 0; } -/* called right before the request is given to the request_queue user */ -static void bsg_initialize_rq(struct request *req) -{ - struct bsg_job *job = blk_mq_rq_to_pdu(req); - void *reply = job->reply; - - memset(job, 0, sizeof(*job)); - job->reply = reply; - job->reply_len = SCSI_SENSE_BUFFERSIZE; - job->dd_data = job + 1; -} - static void bsg_exit_rq(struct blk_mq_tag_set *set, struct request *req, unsigned int hctx_idx) { @@ -350,7 +345,6 @@ static const struct blk_mq_ops bsg_mq_ops = { .queue_rq = bsg_queue_rq, .init_request = bsg_init_rq, .exit_request = bsg_exit_rq, - .initialize_rq_fn = bsg_initialize_rq, .complete = bsg_complete, .timeout = bsg_timeout, }; diff --git a/block/elevator.c b/block/elevator.c index 1b5e57f611..482df2a350 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include @@ -40,6 +39,7 @@ #include +#include "elevator.h" #include "blk.h" #include "blk-mq-sched.h" #include "blk-pm.h" @@ -188,8 +188,10 @@ static void elevator_release(struct kobject *kobj) kfree(e); } -void __elevator_exit(struct request_queue *q, struct elevator_queue *e) +void elevator_exit(struct request_queue *q) { + struct elevator_queue *e = q->elevator; + mutex_lock(&e->sysfs_lock); blk_mq_exit_sched(q, e); mutex_unlock(&e->sysfs_lock); @@ -593,7 +595,8 @@ int elevator_switch_mq(struct request_queue *q, elv_unregister_queue(q); ioc_clear_queue(q); - elevator_exit(q, q->elevator); + blk_mq_sched_free_rqs(q); + elevator_exit(q); } ret = blk_mq_init_sched(q, new_e); @@ -603,7 +606,8 @@ int elevator_switch_mq(struct request_queue *q, if (new_e) { ret = elv_register_queue(q, true); if (ret) { - elevator_exit(q, q->elevator); + blk_mq_sched_free_rqs(q); + elevator_exit(q); goto out; } } @@ -635,7 +639,7 @@ static struct elevator_type *elevator_get_default(struct request_queue *q) return NULL; if (q->nr_hw_queues != 1 && - !blk_mq_is_sbitmap_shared(q->tag_set->flags)) + 
!blk_mq_is_shared_tags(q->tag_set->flags)) return NULL; return elevator_get(q, "mq-deadline", false); diff --git a/block/fops.c b/block/fops.c index 1e970c247e..a18e7fbd97 100644 --- a/block/fops.c +++ b/block/fops.c @@ -15,9 +15,10 @@ #include #include #include +#include #include "blk.h" -static struct inode *bdev_file_inode(struct file *file) +static inline struct inode *bdev_file_inode(struct file *file) { return file->f_mapping->host; } @@ -54,14 +55,12 @@ static void blkdev_bio_end_io_simple(struct bio *bio) static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, unsigned int nr_pages) { - struct file *file = iocb->ki_filp; - struct block_device *bdev = I_BDEV(bdev_file_inode(file)); + struct block_device *bdev = iocb->ki_filp->private_data; struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs; loff_t pos = iocb->ki_pos; bool should_dirty = false; struct bio bio; ssize_t ret; - blk_qc_t qc; if ((pos | iov_iter_alignment(iter)) & (bdev_logical_block_size(bdev) - 1)) @@ -78,7 +77,7 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, bio_init(&bio, vecs, nr_pages); bio_set_dev(&bio, bdev); - bio.bi_iter.bi_sector = pos >> 9; + bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio.bi_write_hint = iocb->ki_hint; bio.bi_private = current; bio.bi_end_io = blkdev_bio_end_io_simple; @@ -102,13 +101,12 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, if (iocb->ki_flags & IOCB_HIPRI) bio_set_polled(&bio, iocb); - qc = submit_bio(&bio); + submit_bio(&bio); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (!READ_ONCE(bio.bi_private)) break; - if (!(iocb->ki_flags & IOCB_HIPRI) || - !blk_poll(bdev_get_queue(bdev), qc, true)) + if (!(iocb->ki_flags & IOCB_HIPRI) || !bio_poll(&bio, NULL, 0)) blk_io_schedule(); } __set_current_state(TASK_RUNNING); @@ -126,6 +124,11 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, return ret; } +enum { + DIO_SHOULD_DIRTY = 1, + DIO_IS_SYNC = 2, +}; + struct blkdev_dio { union { struct kiocb *iocb; @@ -133,35 +136,27 @@ struct blkdev_dio { }; size_t size; atomic_t ref; - bool multi_bio : 1; - bool should_dirty : 1; - bool is_sync : 1; - struct bio bio; + unsigned int flags; + struct bio bio ____cacheline_aligned_in_smp; }; static struct bio_set blkdev_dio_pool; -static int blkdev_iopoll(struct kiocb *kiocb, bool wait) -{ - struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host); - struct request_queue *q = bdev_get_queue(bdev); - - return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait); -} - static void blkdev_bio_end_io(struct bio *bio) { struct blkdev_dio *dio = bio->bi_private; - bool should_dirty = dio->should_dirty; + bool should_dirty = dio->flags & DIO_SHOULD_DIRTY; if (bio->bi_status && !dio->bio.bi_status) dio->bio.bi_status = bio->bi_status; - if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) { - if (!dio->is_sync) { + if (atomic_dec_and_test(&dio->ref)) { + if (!(dio->flags & DIO_IS_SYNC)) { struct kiocb *iocb = dio->iocb; ssize_t ret; + WRITE_ONCE(iocb->private, NULL); + if (likely(!dio->bio.bi_status)) { ret = dio->size; iocb->ki_pos += ret; @@ -169,9 +164,8 @@ static void blkdev_bio_end_io(struct bio *bio) ret = blk_status_to_errno(dio->bio.bi_status); } - dio->iocb->ki_complete(iocb, ret, 0); - if (dio->multi_bio) - bio_put(&dio->bio); + dio->iocb->ki_complete(iocb, ret); + bio_put(&dio->bio); } else { struct task_struct *waiter = dio->waiter; @@ -191,16 +185,12 @@ static void blkdev_bio_end_io(struct bio *bio) static ssize_t __blkdev_direct_IO(struct kiocb *iocb, 
struct iov_iter *iter, unsigned int nr_pages) { - struct file *file = iocb->ki_filp; - struct inode *inode = bdev_file_inode(file); - struct block_device *bdev = I_BDEV(inode); + struct block_device *bdev = iocb->ki_filp->private_data; struct blk_plug plug; struct blkdev_dio *dio; struct bio *bio; - bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0; bool is_read = (iov_iter_rw(iter) == READ), is_sync; loff_t pos = iocb->ki_pos; - blk_qc_t qc = BLK_QC_T_NONE; int ret = 0; if ((pos | iov_iter_alignment(iter)) & @@ -210,28 +200,31 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool); dio = container_of(bio, struct blkdev_dio, bio); - dio->is_sync = is_sync = is_sync_kiocb(iocb); - if (dio->is_sync) { + atomic_set(&dio->ref, 1); + /* + * Grab an extra reference to ensure the dio structure which is embedded + * into the first bio stays around. + */ + bio_get(bio); + + is_sync = is_sync_kiocb(iocb); + if (is_sync) { + dio->flags = DIO_IS_SYNC; dio->waiter = current; - bio_get(bio); } else { + dio->flags = 0; dio->iocb = iocb; } dio->size = 0; - dio->multi_bio = false; - dio->should_dirty = is_read && iter_is_iovec(iter); + if (is_read && iter_is_iovec(iter)) + dio->flags |= DIO_SHOULD_DIRTY; - /* - * Don't plug for HIPRI/polled IO, as those should go straight - * to issue - */ - if (!is_poll) - blk_start_plug(&plug); + blk_start_plug(&plug); for (;;) { bio_set_dev(bio, bdev); - bio->bi_iter.bi_sector = pos >> 9; + bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; bio->bi_write_hint = iocb->ki_hint; bio->bi_private = dio; bio->bi_end_io = blkdev_bio_end_io; @@ -246,7 +239,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (is_read) { bio->bi_opf = REQ_OP_READ; - if (dio->should_dirty) + if (dio->flags & DIO_SHOULD_DIRTY) bio_set_pages_dirty(bio); } else { bio->bi_opf = dio_bio_write_op(iocb); @@ -260,40 +253,15 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS); if (!nr_pages) { - bool polled = false; - - if (iocb->ki_flags & IOCB_HIPRI) { - bio_set_polled(bio, iocb); - polled = true; - } - - qc = submit_bio(bio); - - if (polled) - WRITE_ONCE(iocb->ki_cookie, qc); + submit_bio(bio); break; } - - if (!dio->multi_bio) { - /* - * AIO needs an extra reference to ensure the dio - * structure which is embedded into the first bio - * stays around. 
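In the reworked __blkdev_direct_IO() above, dio->ref now always starts at 1 and each additional bio adds one more reference, replacing the multi_bio special case described by the comment being removed; the final completion runs only when the last sub-bio drops the count to zero. A compact, single-threaded model of that pattern (names and sizes invented):

#include <stdatomic.h>
#include <stdio.h>

struct dio {
    atomic_int ref;     /* one reference per in-flight sub-bio */
    long done_bytes;
};

static void sub_bio_complete(struct dio *dio)
{
    /* the final completion runs only when the last reference is dropped */
    if (atomic_fetch_sub(&dio->ref, 1) == 1)
        printf("whole I/O complete: %ld bytes\n", dio->done_bytes);
}

int main(void)
{
    struct dio dio = { 1, 0 };  /* initial reference covers the first bio */
    int nr_extra_bios = 3;

    for (int i = 0; i < nr_extra_bios; i++)
        atomic_fetch_add(&dio.ref, 1);  /* one more per additional bio */

    dio.done_bytes = 4096L * (nr_extra_bios + 1);

    /* sub-bios complete one by one; only the last finishes the dio */
    for (int i = 0; i < nr_extra_bios + 1; i++)
        sub_bio_complete(&dio);
    return 0;
}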
- */ - if (!is_sync) - bio_get(bio); - dio->multi_bio = true; - atomic_set(&dio->ref, 2); - } else { - atomic_inc(&dio->ref); - } - + atomic_inc(&dio->ref); submit_bio(bio); bio = bio_alloc(GFP_KERNEL, nr_pages); } - if (!is_poll) - blk_finish_plug(&plug); + blk_finish_plug(&plug); if (!is_sync) return -EIOCBQUEUED; @@ -302,10 +270,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, set_current_state(TASK_UNINTERRUPTIBLE); if (!READ_ONCE(dio->waiter)) break; - - if (!(iocb->ki_flags & IOCB_HIPRI) || - !blk_poll(bdev_get_queue(bdev), qc, true)) - blk_io_schedule(); + blk_io_schedule(); } __set_current_state(TASK_RUNNING); @@ -318,6 +283,95 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, return ret; } +static void blkdev_bio_end_io_async(struct bio *bio) +{ + struct blkdev_dio *dio = container_of(bio, struct blkdev_dio, bio); + struct kiocb *iocb = dio->iocb; + ssize_t ret; + + WRITE_ONCE(iocb->private, NULL); + + if (likely(!bio->bi_status)) { + ret = dio->size; + iocb->ki_pos += ret; + } else { + ret = blk_status_to_errno(bio->bi_status); + } + + iocb->ki_complete(iocb, ret); + + if (dio->flags & DIO_SHOULD_DIRTY) { + bio_check_pages_dirty(bio); + } else { + bio_release_pages(bio, false); + bio_put(bio); + } +} + +static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, + struct iov_iter *iter, + unsigned int nr_pages) +{ + struct block_device *bdev = iocb->ki_filp->private_data; + struct blkdev_dio *dio; + struct bio *bio; + loff_t pos = iocb->ki_pos; + int ret = 0; + + if ((pos | iov_iter_alignment(iter)) & + (bdev_logical_block_size(bdev) - 1)) + return -EINVAL; + + bio = bio_alloc_kiocb(iocb, nr_pages, &blkdev_dio_pool); + dio = container_of(bio, struct blkdev_dio, bio); + dio->flags = 0; + dio->iocb = iocb; + bio_set_dev(bio, bdev); + bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; + bio->bi_write_hint = iocb->ki_hint; + bio->bi_end_io = blkdev_bio_end_io_async; + bio->bi_ioprio = iocb->ki_ioprio; + + if (iov_iter_is_bvec(iter)) { + /* + * Users don't rely on the iterator being in any particular + * state for async I/O returning -EIOCBQUEUED, hence we can + * avoid expensive iov_iter_advance(). Bypass + * bio_iov_iter_get_pages() and set the bvec directly. 
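The DIO_SHOULD_DIRTY and DIO_IS_SYNC values introduced above replace the one-bit bool fields of struct blkdev_dio with a single flags word that is cheap to test and update in the completion paths. Taken in isolation, the packing looks like this (the enum values mirror the patch, the rest is illustrative):

#include <stdio.h>

enum {
    DIO_SHOULD_DIRTY = 1,   /* dirty the pages when a read completes */
    DIO_IS_SYNC      = 2,   /* the issuer waits synchronously */
};

int main(void)
{
    unsigned int flags = 0;

    flags |= DIO_IS_SYNC;           /* set one property */
    flags |= DIO_SHOULD_DIRTY;      /* and another */

    if (flags & DIO_SHOULD_DIRTY)
        puts("will dirty pages on completion");

    flags &= ~DIO_SHOULD_DIRTY;     /* clear it again */
    printf("remaining flags: %#x\n", flags);
    return 0;
}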
+ */ + bio_iov_bvec_set(bio, iter); + } else { + ret = bio_iov_iter_get_pages(bio, iter); + if (unlikely(ret)) { + bio_put(bio); + return ret; + } + } + dio->size = bio->bi_iter.bi_size; + + if (iov_iter_rw(iter) == READ) { + bio->bi_opf = REQ_OP_READ; + if (iter_is_iovec(iter)) { + dio->flags |= DIO_SHOULD_DIRTY; + bio_set_pages_dirty(bio); + } + } else { + bio->bi_opf = dio_bio_write_op(iocb); + task_io_account_write(bio->bi_iter.bi_size); + } + + if (iocb->ki_flags & IOCB_HIPRI) { + bio->bi_opf |= REQ_POLLED | REQ_NOWAIT; + submit_bio(bio); + WRITE_ONCE(iocb->private, bio); + } else { + if (iocb->ki_flags & IOCB_NOWAIT) + bio->bi_opf |= REQ_NOWAIT; + submit_bio(bio); + } + return -EIOCBQUEUED; +} + static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { unsigned int nr_pages; @@ -326,9 +380,11 @@ static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) return 0; nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); - if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_VECS) - return __blkdev_direct_IO_simple(iocb, iter, nr_pages); - + if (likely(nr_pages <= BIO_MAX_VECS)) { + if (is_sync_kiocb(iocb)) + return __blkdev_direct_IO_simple(iocb, iter, nr_pages); + return __blkdev_direct_IO_async(iocb, iter, nr_pages); + } return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages)); } @@ -405,8 +461,7 @@ static loff_t blkdev_llseek(struct file *file, loff_t offset, int whence) static int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { - struct inode *bd_inode = bdev_file_inode(filp); - struct block_device *bdev = I_BDEV(bd_inode); + struct block_device *bdev = filp->private_data; int error; error = file_write_and_wait_range(filp, start, end); @@ -448,6 +503,8 @@ static int blkdev_open(struct inode *inode, struct file *filp) bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, filp); if (IS_ERR(bdev)) return PTR_ERR(bdev); + + filp->private_data = bdev; filp->f_mapping = bdev->bd_inode->i_mapping; filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); return 0; @@ -455,29 +512,12 @@ static int blkdev_open(struct inode *inode, struct file *filp) static int blkdev_close(struct inode *inode, struct file *filp) { - struct block_device *bdev = I_BDEV(bdev_file_inode(filp)); + struct block_device *bdev = filp->private_data; blkdev_put(bdev, filp->f_mode); return 0; } -static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) -{ - struct block_device *bdev = I_BDEV(bdev_file_inode(file)); - fmode_t mode = file->f_mode; - - /* - * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have - * to updated it before every ioctl. - */ - if (file->f_flags & O_NDELAY) - mode |= FMODE_NDELAY; - else - mode &= ~FMODE_NDELAY; - - return blkdev_ioctl(bdev, mode, cmd, arg); -} - /* * Write data to the block device. Only intended for the block device itself * and the raw driver which basically is a fake block device. 
@@ -487,14 +527,14 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) */ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) { - struct file *file = iocb->ki_filp; - struct inode *bd_inode = bdev_file_inode(file); - loff_t size = i_size_read(bd_inode); + struct block_device *bdev = iocb->ki_filp->private_data; + struct inode *bd_inode = bdev->bd_inode; + loff_t size = bdev_nr_bytes(bdev); struct blk_plug plug; size_t shorted = 0; ssize_t ret; - if (bdev_read_only(I_BDEV(bd_inode))) + if (bdev_read_only(bdev)) return -EPERM; if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev)) @@ -526,24 +566,58 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) { - struct file *file = iocb->ki_filp; - struct inode *bd_inode = bdev_file_inode(file); - loff_t size = i_size_read(bd_inode); + struct block_device *bdev = iocb->ki_filp->private_data; + loff_t size = bdev_nr_bytes(bdev); loff_t pos = iocb->ki_pos; size_t shorted = 0; - ssize_t ret; + ssize_t ret = 0; + size_t count; - if (pos >= size) - return 0; - - size -= pos; - if (iov_iter_count(to) > size) { + if (unlikely(pos + iov_iter_count(to) > size)) { + if (pos >= size) + return 0; + size -= pos; shorted = iov_iter_count(to) - size; iov_iter_truncate(to, size); } - ret = generic_file_read_iter(iocb, to); - iov_iter_reexpand(to, iov_iter_count(to) + shorted); + count = iov_iter_count(to); + if (!count) + goto reexpand; /* skip atime */ + + if (iocb->ki_flags & IOCB_DIRECT) { + struct address_space *mapping = iocb->ki_filp->f_mapping; + + if (iocb->ki_flags & IOCB_NOWAIT) { + if (filemap_range_needs_writeback(mapping, pos, + pos + count - 1)) { + ret = -EAGAIN; + goto reexpand; + } + } else { + ret = filemap_write_and_wait_range(mapping, pos, + pos + count - 1); + if (ret < 0) + goto reexpand; + } + + file_accessed(iocb->ki_filp); + + ret = blkdev_direct_IO(iocb, to); + if (ret >= 0) { + iocb->ki_pos += ret; + count -= ret; + } + iov_iter_revert(to, count - iov_iter_count(to)); + if (ret < 0 || !count) + goto reexpand; + } + + ret = filemap_read(iocb, to, ret); + +reexpand: + if (unlikely(shorted)) + iov_iter_reexpand(to, iov_iter_count(to) + shorted); return ret; } @@ -565,7 +639,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, return -EOPNOTSUPP; /* Don't go off the end of the device. 
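blkdev_read_iter() above clamps a read that would run past the device to bdev_nr_bytes(), remembers how much it shaved off in "shorted", and reexpands the iterator once the read finishes. The arithmetic on its own, with made-up sizes:

#include <stdio.h>

int main(void)
{
    unsigned long long size = 1000; /* device size in bytes */
    unsigned long long pos = 900;   /* read offset */
    unsigned long long count = 300; /* requested read length */
    unsigned long long shorted = 0; /* how much we shaved off */

    if (pos + count > size) {
        if (pos >= size) {
            puts("read starts past the end: return 0");
            return 0;
        }
        shorted = count - (size - pos);
        count = size - pos;         /* truncate the iterator */
    }
    printf("read %llu bytes, reexpand by %llu afterwards\n", count, shorted);
    return 0;
}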
*/ - isize = i_size_read(bdev->bd_inode); + isize = bdev_nr_bytes(bdev); if (start >= isize) return -EINVAL; if (end >= isize) { @@ -592,16 +666,18 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, switch (mode) { case FALLOC_FL_ZERO_RANGE: case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE: - error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, - GFP_KERNEL, BLKDEV_ZERO_NOUNMAP); + error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT, + len >> SECTOR_SHIFT, GFP_KERNEL, + BLKDEV_ZERO_NOUNMAP); break; case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: - error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, - GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK); + error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT, + len >> SECTOR_SHIFT, GFP_KERNEL, + BLKDEV_ZERO_NOFALLBACK); break; case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE: - error = blkdev_issue_discard(bdev, start >> 9, len >> 9, - GFP_KERNEL, 0); + error = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT, + len >> SECTOR_SHIFT, GFP_KERNEL, 0); break; default: error = -EOPNOTSUPP; @@ -618,10 +694,10 @@ const struct file_operations def_blk_fops = { .llseek = blkdev_llseek, .read_iter = blkdev_read_iter, .write_iter = blkdev_write_iter, - .iopoll = blkdev_iopoll, + .iopoll = iocb_bio_iopoll, .mmap = generic_file_mmap, .fsync = blkdev_fsync, - .unlocked_ioctl = block_ioctl, + .unlocked_ioctl = blkdev_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = compat_blkdev_ioctl, #endif diff --git a/block/genhd.c b/block/genhd.c index 0276a2846a..9eca1f7d35 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -25,8 +25,10 @@ #include #include #include +#include #include "blk.h" +#include "blk-mq-sched.h" #include "blk-rq-qos.h" static struct kobject *block_depr; @@ -58,6 +60,7 @@ void set_capacity(struct gendisk *disk, sector_t sectors) spin_lock(&bdev->bd_size_lock); i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); + bdev->bd_nr_sectors = sectors; spin_unlock(&bdev->bd_size_lock); } EXPORT_SYMBOL(set_capacity); @@ -212,7 +215,10 @@ void blkdev_show(struct seq_file *seqf, off_t offset) * @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If * @major = 0, try to allocate any unused major number. * @name: the name of the new block device as a zero terminated string - * @probe: allback that is called on access to any minor number of @major + * @probe: pre-devtmpfs / pre-udev callback used to create disks when their + * pre-created device node is accessed. When a probe call uses + * add_disk() and it fails the driver must cleanup resources. This + * interface may soon be removed. * * The @name must be unique within the system. 
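Several hunks above (blkdev_fallocate and the direct-I/O paths) replace bare ">> 9" shifts with SECTOR_SHIFT when converting byte offsets and lengths into 512-byte sectors. A trivial standalone version of that conversion, with invented values:

#include <stdio.h>

#define SECTOR_SHIFT 9
#define SECTOR_SIZE  (1 << SECTOR_SHIFT)

int main(void)
{
    unsigned long long start_bytes = 1024 * 1024;   /* 1 MiB into the device */
    unsigned long long len_bytes = 64 * 1024;       /* 64 KiB range */

    /*
     * Ranges handed to the block layer are expressed in 512-byte sectors;
     * using the named shift instead of a bare ">> 9" keeps the conversion
     * self-documenting.
     */
    unsigned long long start_sector = start_bytes >> SECTOR_SHIFT;
    unsigned long long nr_sectors = len_bytes >> SECTOR_SHIFT;

    printf("zero out from sector %llu, %llu sectors of %d bytes\n",
           start_sector, nr_sectors, SECTOR_SIZE);
    return 0;
}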
* @@ -368,17 +374,21 @@ void disk_uevent(struct gendisk *disk, enum kobject_action action) } EXPORT_SYMBOL_GPL(disk_uevent); -static void disk_scan_partitions(struct gendisk *disk) +int disk_scan_partitions(struct gendisk *disk, fmode_t mode) { struct block_device *bdev; - if (!get_capacity(disk) || !disk_part_scan_enabled(disk)) - return; + if (disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN)) + return -EINVAL; + if (disk->open_partitions) + return -EBUSY; set_bit(GD_NEED_PART_SCAN, &disk->state); - bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL); - if (!IS_ERR(bdev)) - blkdev_put(bdev, FMODE_READ); + bdev = blkdev_get_by_dev(disk_devt(disk), mode, NULL); + if (IS_ERR(bdev)) + return PTR_ERR(bdev); + blkdev_put(bdev, mode); + return 0; } /** @@ -390,8 +400,8 @@ static void disk_scan_partitions(struct gendisk *disk) * This function registers the partitioning information in @disk * with the kernel. */ -int device_add_disk(struct device *parent, struct gendisk *disk, - const struct attribute_group **groups) +int __must_check device_add_disk(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups) { struct device *ddev = disk_to_dev(disk); @@ -432,7 +442,6 @@ int device_add_disk(struct device *parent, struct gendisk *disk, return ret; disk->major = BLOCK_EXT_MAJOR; disk->first_minor = ret; - disk->flags |= GENHD_FL_EXT_DEVT; } /* delay uevents, until we scanned partition table */ @@ -489,14 +498,7 @@ int device_add_disk(struct device *parent, struct gendisk *disk, if (ret) goto out_put_slave_dir; - if (disk->flags & GENHD_FL_HIDDEN) { - /* - * Don't let hidden disks show up in /proc/partitions, - * and don't bother scanning for partitions either. - */ - disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; - disk->flags |= GENHD_FL_NO_PART_SCAN; - } else { + if (!(disk->flags & GENHD_FL_HIDDEN)) { ret = bdi_register(disk->bdi, "%u:%u", disk->major, disk->first_minor); if (ret) @@ -508,7 +510,8 @@ int device_add_disk(struct device *parent, struct gendisk *disk, goto out_unregister_bdi; bdev_add(disk->part0, ddev->devt); - disk_scan_partitions(disk); + if (get_capacity(disk)) + disk_scan_partitions(disk, FMODE_READ); /* * Announce the disk and partitions after all partitions are @@ -541,7 +544,7 @@ int device_add_disk(struct device *parent, struct gendisk *disk, out_free_ext_minor: if (disk->major == BLOCK_EXT_MAJOR) blk_free_ext_minor(disk->first_minor); - return WARN_ON_ONCE(ret); /* keep until all callers handle errors */ + return ret; } EXPORT_SYMBOL(device_add_disk); @@ -645,6 +648,26 @@ void del_gendisk(struct gendisk *disk) } EXPORT_SYMBOL(del_gendisk); +/** + * invalidate_disk - invalidate the disk + * @disk: the struct gendisk to invalidate + * + * A helper to invalidates the disk. It will clean the disk's associated + * buffer/page caches and reset its internal states so that the disk + * can be reused by the drivers. + * + * Context: can sleep + */ +void invalidate_disk(struct gendisk *disk) +{ + struct block_device *bdev = disk->part0; + + invalidate_bdev(bdev); + bdev->bd_inode->i_mapping->wb_err = 0; + set_capacity(disk, 0); +} +EXPORT_SYMBOL(invalidate_disk); + /* sysfs access to bad-blocks list. 
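disk_scan_partitions() above now reports failure to its callers instead of returning silently: unsupported or hidden disks yield -EINVAL and disks with open partitions yield -EBUSY, before the function reopens the device to trigger the rescan. A minimal userspace sketch of that guard-clause ordering (struct fields and names invented; the reopen step is only hinted at):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct disk {
    bool no_part;           /* partition scanning not supported */
    bool hidden;            /* hidden device, never scanned */
    int open_partitions;    /* partitions currently held open */
};

/* Validation order mirrors the hunk above: reject unsupported disks first,
 * then disks whose partitions are still in use, then do the rescan. */
static int scan_partitions(struct disk *d)
{
    if (d->no_part || d->hidden)
        return -EINVAL;
    if (d->open_partitions)
        return -EBUSY;
    /* ...reopen the whole device here to trigger the actual rescan... */
    return 0;
}

int main(void)
{
    struct disk busy = { false, false, 2 };
    struct disk idle = { false, false, 0 };

    printf("busy disk: %d (-EBUSY is %d)\n", scan_partitions(&busy), -EBUSY);
    printf("idle disk: %d\n", scan_partitions(&idle));
    return 0;
}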
*/ static ssize_t disk_badblocks_show(struct device *dev, struct device_attribute *attr, @@ -711,8 +734,7 @@ void __init printk_all_partitions(void) * Don't show empty devices or things that have been * suppressed */ - if (get_capacity(disk) == 0 || - (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)) + if (get_capacity(disk) == 0 || (disk->flags & GENHD_FL_HIDDEN)) continue; /* @@ -805,11 +827,7 @@ static int show_partition(struct seq_file *seqf, void *v) struct block_device *part; unsigned long idx; - /* Don't show non-partitionable removeable devices or empty devices */ - if (!get_capacity(sgp) || (!disk_max_parts(sgp) && - (sgp->flags & GENHD_FL_REMOVABLE))) - return 0; - if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) + if (!get_capacity(sgp) || (sgp->flags & GENHD_FL_HIDDEN)) return 0; rcu_read_lock(); @@ -865,7 +883,8 @@ static ssize_t disk_ext_range_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%d\n", disk_max_parts(disk)); + return sprintf(buf, "%d\n", + (disk->flags & GENHD_FL_NO_PART) ? 1 : DISK_MAX_PARTS); } static ssize_t disk_removable_show(struct device *dev, @@ -904,7 +923,7 @@ ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { struct block_device *bdev = dev_to_bdev(dev); - struct request_queue *q = bdev->bd_disk->queue; + struct request_queue *q = bdev_get_queue(bdev); struct disk_stats stat; unsigned int inflight; @@ -948,7 +967,7 @@ ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, char *buf) { struct block_device *bdev = dev_to_bdev(dev); - struct request_queue *q = bdev->bd_disk->queue; + struct request_queue *q = bdev_get_queue(bdev); unsigned int inflight[2]; if (queue_is_mq(q)) @@ -1290,6 +1309,9 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, if (!disk->bdi) goto out_free_disk; + /* bdev_alloc() might need the queue, set before the first call */ + disk->queue = q; + disk->part0 = bdev_alloc(disk, 0); if (!disk->part0) goto out_free_bdi; @@ -1305,7 +1327,6 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, disk_to_dev(disk)->type = &disk_type; device_initialize(disk_to_dev(disk)); inc_diskseq(disk); - disk->queue = q; q->disk = disk; lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0); #ifdef CONFIG_BLOCK_HOLDER_DEPRECATED @@ -1332,7 +1353,7 @@ struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass) struct request_queue *q; struct gendisk *disk; - q = blk_alloc_queue(node); + q = blk_alloc_queue(node, false); if (!q) return NULL; @@ -1410,12 +1431,6 @@ void set_disk_ro(struct gendisk *disk, bool read_only) } EXPORT_SYMBOL(set_disk_ro); -int bdev_read_only(struct block_device *bdev) -{ - return bdev->bd_read_only || get_disk_ro(bdev->bd_disk); -} -EXPORT_SYMBOL(bdev_read_only); - void inc_diskseq(struct gendisk *disk) { disk->diskseq = atomic64_inc_return(&diskseq); diff --git a/block/ioctl.c b/block/ioctl.c index a31be7fa31..4a86340133 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -82,31 +82,6 @@ static int compat_blkpg_ioctl(struct block_device *bdev, } #endif -static int blkdev_reread_part(struct block_device *bdev, fmode_t mode) -{ - struct block_device *tmp; - - if (!disk_part_scan_enabled(bdev->bd_disk) || bdev_is_partition(bdev)) - return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (bdev->bd_disk->open_partitions) - return -EBUSY; - - /* - * Reopen the device to revalidate the driver state and force a - * partition rescan. 
- */ - mode &= ~FMODE_EXCL; - set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); - - tmp = blkdev_get_by_dev(bdev->bd_dev, mode, NULL); - if (IS_ERR(tmp)) - return PTR_ERR(tmp); - blkdev_put(tmp, mode); - return 0; -} - static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, unsigned long arg, unsigned long flags) { @@ -133,7 +108,7 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, if (len & 511) return -EINVAL; - if (start + len > i_size_read(bdev->bd_inode)) + if (start + len > bdev_nr_bytes(bdev)) return -EINVAL; filemap_invalidate_lock(inode->i_mapping); @@ -171,7 +146,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, return -EINVAL; if (len & 511) return -EINVAL; - if (end >= (uint64_t)i_size_read(bdev->bd_inode)) + if (end >= (uint64_t)bdev_nr_bytes(bdev)) return -EINVAL; if (end < start) return -EINVAL; @@ -522,7 +497,11 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, bdev->bd_disk->bdi->ra_pages = (arg * 512) / PAGE_SIZE; return 0; case BLKRRPART: - return blkdev_reread_part(bdev, mode); + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (bdev_is_partition(bdev)) + return -EINVAL; + return disk_scan_partitions(bdev->bd_disk, mode & ~FMODE_EXCL); case BLKTRACESTART: case BLKTRACESTOP: case BLKTRACETEARDOWN: @@ -550,12 +529,21 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, * * New commands must be compatible and go into blkdev_common_ioctl */ -int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, - unsigned long arg) +long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) { - int ret; - loff_t size; + struct block_device *bdev = I_BDEV(file->f_mapping->host); void __user *argp = (void __user *)arg; + fmode_t mode = file->f_mode; + int ret; + + /* + * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have + * to updated it before every ioctl. + */ + if (file->f_flags & O_NDELAY) + mode |= FMODE_NDELAY; + else + mode &= ~FMODE_NDELAY; switch (cmd) { /* These need separate implementations for the data structure */ @@ -572,10 +560,9 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, return put_long(argp, (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512); case BLKGETSIZE: - size = i_size_read(bdev->bd_inode); - if ((size >> 9) > ~0UL) + if (bdev_nr_sectors(bdev) > ~0UL) return -EFBIG; - return put_ulong(argp, size >> 9); + return put_ulong(argp, bdev_nr_sectors(bdev)); /* The data is compatible, but the command number is different */ case BLKBSZGET: /* get block device soft block size (cf. 
BLKSSZGET) */ @@ -583,7 +570,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKBSZSET: return blkdev_bszset(bdev, mode, argp); case BLKGETSIZE64: - return put_u64(argp, i_size_read(bdev->bd_inode)); + return put_u64(argp, bdev_nr_bytes(bdev)); /* Incompatible alignment on i386 */ case BLKTRACESETUP: @@ -600,7 +587,6 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, return -ENOTTY; return bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg); } -EXPORT_SYMBOL_GPL(blkdev_ioctl); /* for /dev/raw */ #ifdef CONFIG_COMPAT @@ -618,7 +604,6 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) struct block_device *bdev = I_BDEV(file->f_mapping->host); struct gendisk *disk = bdev->bd_disk; fmode_t mode = file->f_mode; - loff_t size; /* * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have @@ -644,10 +629,9 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) return compat_put_long(argp, (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512); case BLKGETSIZE: - size = i_size_read(bdev->bd_inode); - if ((size >> 9) > ~0UL) + if (bdev_nr_sectors(bdev) > ~0UL) return -EFBIG; - return compat_put_ulong(argp, size >> 9); + return compat_put_ulong(argp, bdev_nr_sectors(bdev)); /* The data is compatible, but the command number is different */ case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */ @@ -655,7 +639,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) case BLKBSZSET_32: return blkdev_bszset(bdev, mode, argp); case BLKGETSIZE64_32: - return put_u64(argp, i_size_read(bdev->bd_inode)); + return put_u64(argp, bdev_nr_bytes(bdev)); /* Incompatible alignment on i386 */ case BLKTRACESETUP32: diff --git a/block/ioprio.c b/block/ioprio.c index 6f01d35a51..2fe068fcaa 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -22,46 +22,14 @@ */ #include #include -#include #include #include #include #include -#include -#include #include #include #include -int set_task_ioprio(struct task_struct *task, int ioprio) -{ - int err; - struct io_context *ioc; - const struct cred *cred = current_cred(), *tcred; - - rcu_read_lock(); - tcred = __task_cred(task); - if (!uid_eq(tcred->uid, cred->euid) && - !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) { - rcu_read_unlock(); - return -EPERM; - } - rcu_read_unlock(); - - err = security_task_setioprio(task, ioprio); - if (err) - return err; - - ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); - if (ioc) { - ioc->ioprio = ioprio; - put_io_context(ioc); - } - - return err; -} -EXPORT_SYMBOL_GPL(set_task_ioprio); - int ioprio_check_cap(int ioprio) { int class = IOPRIO_PRIO_CLASS(ioprio); diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index a0ffbabfac..70ff2a599e 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -9,12 +9,12 @@ #include #include #include -#include #include #include #include +#include "elevator.h" #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" @@ -433,6 +433,7 @@ static void kyber_exit_sched(struct elevator_queue *e) int i; del_timer_sync(&kqd->timer); + blk_stat_disable_accounting(kqd->q); for (i = 0; i < KYBER_NUM_DOMAINS; i++) sbitmap_queue_free(&kqd->domain_tokens[i]); @@ -453,11 +454,11 @@ static void kyber_depth_updated(struct blk_mq_hw_ctx *hctx) { struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data; struct blk_mq_tags *tags = hctx->sched_tags; - unsigned int shift = tags->bitmap_tags->sb.shift; + unsigned int shift = 
tags->bitmap_tags.sb.shift; kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U; - sbitmap_queue_min_shallow_depth(tags->bitmap_tags, kqd->async_depth); + sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, kqd->async_depth); } static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index cd2342d297..3ed5eaf344 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -20,6 +19,7 @@ #include +#include "elevator.h" #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" @@ -31,6 +31,11 @@ */ static const int read_expire = HZ / 2; /* max time before a read is submitted. */ static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */ +/* + * Time after which to dispatch lower priority requests even if higher + * priority requests are pending. + */ +static const int prio_aging_expire = 10 * HZ; static const int writes_starved = 2; /* max times reads can starve a write */ static const int fifo_batch = 16; /* # of sequential requests treated as one by the above parameters. For throughput. */ @@ -51,17 +56,16 @@ enum dd_prio { enum { DD_PRIO_COUNT = 3 }; -/* I/O statistics per I/O priority. */ +/* + * I/O statistics per I/O priority. It is fine if these counters overflow. + * What matters is that these counters are at least as wide as + * log2(max_outstanding_requests). + */ struct io_stats_per_prio { - local_t inserted; - local_t merged; - local_t dispatched; - local_t completed; -}; - -/* I/O statistics for all I/O priorities (enum dd_prio). */ -struct io_stats { - struct io_stats_per_prio stats[DD_PRIO_COUNT]; + uint32_t inserted; + uint32_t merged; + uint32_t dispatched; + atomic_t completed; }; /* @@ -74,6 +78,7 @@ struct dd_per_prio { struct list_head fifo_list[DD_DIR_COUNT]; /* Next request in FIFO order. Read, write or both are NULL. */ struct request *next_rq[DD_DIR_COUNT]; + struct io_stats_per_prio stats; }; struct deadline_data { @@ -88,8 +93,6 @@ struct deadline_data { unsigned int batching; /* number of sequential requests made */ unsigned int starved; /* times reads have starved writes */ - struct io_stats __percpu *stats; - /* * settings that change how the i/o scheduler behaves */ @@ -98,38 +101,12 @@ struct deadline_data { int writes_starved; int front_merges; u32 async_depth; + int prio_aging_expire; spinlock_t lock; spinlock_t zone_lock; }; -/* Count one event of type 'event_type' and with I/O priority 'prio' */ -#define dd_count(dd, event_type, prio) do { \ - struct io_stats *io_stats = get_cpu_ptr((dd)->stats); \ - \ - BUILD_BUG_ON(!__same_type((dd), struct deadline_data *)); \ - BUILD_BUG_ON(!__same_type((prio), enum dd_prio)); \ - local_inc(&io_stats->stats[(prio)].event_type); \ - put_cpu_ptr(io_stats); \ -} while (0) - -/* - * Returns the total number of dd_count(dd, event_type, prio) calls across all - * CPUs. No locking or barriers since it is fine if the returned sum is slightly - * outdated. - */ -#define dd_sum(dd, event_type, prio) ({ \ - unsigned int cpu; \ - u32 sum = 0; \ - \ - BUILD_BUG_ON(!__same_type((dd), struct deadline_data *)); \ - BUILD_BUG_ON(!__same_type((prio), enum dd_prio)); \ - for_each_present_cpu(cpu) \ - sum += local_read(&per_cpu_ptr((dd)->stats, cpu)-> \ - stats[(prio)].event_type); \ - sum; \ -}) - /* Maps an I/O priority class to a deadline scheduler priority. 
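The mq-deadline statistics above drop the per-CPU counters in favour of plain fixed-width counters per priority level, relying on dd_queued() computing "inserted - completed", which stays correct under unsigned wraparound. A small demonstration of why that subtraction is safe across a wrap (assuming 32-bit unsigned int, like the uint32_t fields above):

#include <limits.h>
#include <stdio.h>

int main(void)
{
    /* assumes 32-bit unsigned int, matching the uint32_t counters */
    unsigned int inserted = UINT_MAX - 2;   /* about to wrap */
    unsigned int completed = UINT_MAX - 5;

    printf("outstanding before wrap: %u\n", inserted - completed); /* 3 */

    inserted += 10;     /* wraps past zero */
    completed += 8;
    printf("outstanding after wrap:  %u\n", inserted - completed); /* 5 */
    return 0;
}

This is the property the new comment in the patch calls out: overflow is fine as long as the counters are wide enough to hold the number of outstanding requests.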
*/ static const enum dd_prio ioprio_class_to_prio[] = { [IOPRIO_CLASS_NONE] = DD_BE_PRIO, @@ -233,7 +210,9 @@ static void dd_merged_requests(struct request_queue *q, struct request *req, const u8 ioprio_class = dd_rq_ioclass(next); const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; - dd_count(dd, merged, prio); + lockdep_assert_held(&dd->lock); + + dd->per_prio[prio].stats.merged++; /* * if next expires before rq, assign its expire time to rq @@ -270,6 +249,16 @@ deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio, deadline_remove_request(rq->q, per_prio, rq); } +/* Number of requests queued for a given priority level. */ +static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio) +{ + const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats; + + lockdep_assert_held(&dd->lock); + + return stats->inserted - atomic_read(&stats->completed); +} + /* * deadline_check_fifo returns 0 if there are no expired requests on the fifo, * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir]) @@ -355,12 +344,27 @@ deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio, return rq; } +/* + * Returns true if and only if @rq started after @latest_start where + * @latest_start is in jiffies. + */ +static bool started_after(struct deadline_data *dd, struct request *rq, + unsigned long latest_start) +{ + unsigned long start_time = (unsigned long)rq->fifo_time; + + start_time -= dd->fifo_expire[rq_data_dir(rq)]; + + return time_after(start_time, latest_start); +} + /* * deadline_dispatch_requests selects the best request according to - * read/write expire, fifo_batch, etc + * read/write expire, fifo_batch, etc and with a start time <= @latest_start. */ static struct request *__dd_dispatch_request(struct deadline_data *dd, - struct dd_per_prio *per_prio) + struct dd_per_prio *per_prio, + unsigned long latest_start) { struct request *rq, *next_rq; enum dd_data_dir data_dir; @@ -372,6 +376,8 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd, if (!list_empty(&per_prio->dispatch)) { rq = list_first_entry(&per_prio->dispatch, struct request, queuelist); + if (started_after(dd, rq, latest_start)) + return NULL; list_del_init(&rq->queuelist); goto done; } @@ -449,6 +455,9 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd, dd->batching = 0; dispatch_request: + if (started_after(dd, rq, latest_start)) + return NULL; + /* * rq is the selected appropriate request. */ @@ -457,7 +466,7 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd, done: ioprio_class = dd_rq_ioclass(rq); prio = ioprio_class_to_prio[ioprio_class]; - dd_count(dd, dispatched, prio); + dd->per_prio[prio].stats.dispatched++; /* * If the request needs its target zone locked, do it. */ @@ -466,6 +475,34 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd, return rq; } +/* + * Check whether there are any requests with priority other than DD_RT_PRIO + * that were inserted more than prio_aging_expire jiffies ago. 
+ */ +static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd, + unsigned long now) +{ + struct request *rq; + enum dd_prio prio; + int prio_cnt; + + lockdep_assert_held(&dd->lock); + + prio_cnt = !!dd_queued(dd, DD_RT_PRIO) + !!dd_queued(dd, DD_BE_PRIO) + + !!dd_queued(dd, DD_IDLE_PRIO); + if (prio_cnt < 2) + return NULL; + + for (prio = DD_BE_PRIO; prio <= DD_PRIO_MAX; prio++) { + rq = __dd_dispatch_request(dd, &dd->per_prio[prio], + now - dd->prio_aging_expire); + if (rq) + return rq; + } + + return NULL; +} + /* * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests(). * @@ -477,15 +514,26 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd, static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; + const unsigned long now = jiffies; struct request *rq; enum dd_prio prio; spin_lock(&dd->lock); + rq = dd_dispatch_prio_aged_requests(dd, now); + if (rq) + goto unlock; + + /* + * Next, dispatch requests in priority order. Ignore lower priority + * requests if any higher priority requests are pending. + */ for (prio = 0; prio <= DD_PRIO_MAX; prio++) { - rq = __dd_dispatch_request(dd, &dd->per_prio[prio]); - if (rq) + rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now); + if (rq || dd_queued(dd, prio)) break; } + +unlock: spin_unlock(&dd->lock); return rq; @@ -519,7 +567,7 @@ static void dd_depth_updated(struct blk_mq_hw_ctx *hctx) dd->async_depth = max(1UL, 3 * q->nr_requests / 4); - sbitmap_queue_min_shallow_depth(tags->bitmap_tags, dd->async_depth); + sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, dd->async_depth); } /* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */ @@ -536,12 +584,21 @@ static void dd_exit_sched(struct elevator_queue *e) for (prio = 0; prio <= DD_PRIO_MAX; prio++) { struct dd_per_prio *per_prio = &dd->per_prio[prio]; + const struct io_stats_per_prio *stats = &per_prio->stats; + uint32_t queued; WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_READ])); WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE])); - } - free_percpu(dd->stats); + spin_lock(&dd->lock); + queued = dd_queued(dd, prio); + spin_unlock(&dd->lock); + + WARN_ONCE(queued != 0, + "statistics for priority %d: i %u m %u d %u c %u\n", + prio, stats->inserted, stats->merged, + stats->dispatched, atomic_read(&stats->completed)); + } kfree(dd); } @@ -566,11 +623,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) eq->elevator_data = dd; - dd->stats = alloc_percpu_gfp(typeof(*dd->stats), - GFP_KERNEL | __GFP_ZERO); - if (!dd->stats) - goto free_dd; - for (prio = 0; prio <= DD_PRIO_MAX; prio++) { struct dd_per_prio *per_prio = &dd->per_prio[prio]; @@ -586,15 +638,13 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) dd->front_merges = 1; dd->last_dir = DD_WRITE; dd->fifo_batch = fifo_batch; + dd->prio_aging_expire = prio_aging_expire; spin_lock_init(&dd->lock); spin_lock_init(&dd->zone_lock); q->elevator = eq; return 0; -free_dd: - kfree(dd); - put_eq: kobject_put(&eq->kobj); return ret; @@ -677,8 +727,11 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, blk_req_zone_write_unlock(rq); prio = ioprio_class_to_prio[ioprio_class]; - dd_count(dd, inserted, prio); - rq->elv.priv[0] = (void *)(uintptr_t)1; + per_prio = &dd->per_prio[prio]; + if (!rq->elv.priv[0]) { + per_prio->stats.inserted++; + rq->elv.priv[0] = (void *)(uintptr_t)1; + } if 
(blk_mq_sched_try_insert_merge(q, rq, &free)) { blk_mq_free_requests(&free); @@ -687,7 +740,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, trace_block_rq_insert(rq); - per_prio = &dd->per_prio[prio]; if (at_head) { list_add(&rq->queuelist, &per_prio->dispatch); } else { @@ -759,12 +811,13 @@ static void dd_finish_request(struct request *rq) /* * The block layer core may call dd_finish_request() without having - * called dd_insert_requests(). Hence only update statistics for - * requests for which dd_insert_requests() has been called. See also - * blk_mq_request_bypass_insert(). + * called dd_insert_requests(). Skip requests that bypassed I/O + * scheduling. See also blk_mq_request_bypass_insert(). */ - if (rq->elv.priv[0]) - dd_count(dd, completed, prio); + if (!rq->elv.priv[0]) + return; + + atomic_inc(&per_prio->stats.completed); if (blk_queue_is_zoned(q)) { unsigned long flags; @@ -809,6 +862,7 @@ static ssize_t __FUNC(struct elevator_queue *e, char *page) \ #define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR)) SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]); SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]); +SHOW_JIFFIES(deadline_prio_aging_expire_show, dd->prio_aging_expire); SHOW_INT(deadline_writes_starved_show, dd->writes_starved); SHOW_INT(deadline_front_merges_show, dd->front_merges); SHOW_INT(deadline_async_depth_show, dd->async_depth); @@ -838,6 +892,7 @@ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies) STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX); STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX); +STORE_JIFFIES(deadline_prio_aging_expire_store, &dd->prio_aging_expire, 0, INT_MAX); STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX); STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1); STORE_INT(deadline_async_depth_store, &dd->async_depth, 1, INT_MAX); @@ -856,6 +911,7 @@ static struct elv_fs_entry deadline_attrs[] = { DD_ATTR(front_merges), DD_ATTR(async_depth), DD_ATTR(fifo_batch), + DD_ATTR(prio_aging_expire), __ATTR_NULL }; @@ -947,38 +1003,48 @@ static int dd_async_depth_show(void *data, struct seq_file *m) return 0; } -/* Number of requests queued for a given priority level. */ -static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio) -{ - return dd_sum(dd, inserted, prio) - dd_sum(dd, completed, prio); -} - static int dd_queued_show(void *data, struct seq_file *m) { struct request_queue *q = data; struct deadline_data *dd = q->elevator->elevator_data; + u32 rt, be, idle; + + spin_lock(&dd->lock); + rt = dd_queued(dd, DD_RT_PRIO); + be = dd_queued(dd, DD_BE_PRIO); + idle = dd_queued(dd, DD_IDLE_PRIO); + spin_unlock(&dd->lock); + + seq_printf(m, "%u %u %u\n", rt, be, idle); - seq_printf(m, "%u %u %u\n", dd_queued(dd, DD_RT_PRIO), - dd_queued(dd, DD_BE_PRIO), - dd_queued(dd, DD_IDLE_PRIO)); return 0; } /* Number of requests owned by the block driver for a given priority. 
*/ static u32 dd_owned_by_driver(struct deadline_data *dd, enum dd_prio prio) { - return dd_sum(dd, dispatched, prio) + dd_sum(dd, merged, prio) - - dd_sum(dd, completed, prio); + const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats; + + lockdep_assert_held(&dd->lock); + + return stats->dispatched + stats->merged - + atomic_read(&stats->completed); } static int dd_owned_by_driver_show(void *data, struct seq_file *m) { struct request_queue *q = data; struct deadline_data *dd = q->elevator->elevator_data; + u32 rt, be, idle; + + spin_lock(&dd->lock); + rt = dd_owned_by_driver(dd, DD_RT_PRIO); + be = dd_owned_by_driver(dd, DD_BE_PRIO); + idle = dd_owned_by_driver(dd, DD_IDLE_PRIO); + spin_unlock(&dd->lock); + + seq_printf(m, "%u %u %u\n", rt, be, idle); - seq_printf(m, "%u %u %u\n", dd_owned_by_driver(dd, DD_RT_PRIO), - dd_owned_by_driver(dd, DD_BE_PRIO), - dd_owned_by_driver(dd, DD_IDLE_PRIO)); return 0; } diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig index 278593b8e4..7aff4eb81c 100644 --- a/block/partitions/Kconfig +++ b/block/partitions/Kconfig @@ -2,6 +2,8 @@ # # Partition configuration # +menu "Partition Types" + config PARTITION_ADVANCED bool "Advanced partition selection" help @@ -267,3 +269,5 @@ config CMDLINE_PARTITION help Say Y here if you want to read the partition table from bootargs. The format for the command line is just like mtdparts. + +endmenu diff --git a/block/partitions/core.c b/block/partitions/core.c index b9e9af84f5..c2a1635922 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -91,19 +91,19 @@ static void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors) { spin_lock(&bdev->bd_size_lock); i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); + bdev->bd_nr_sectors = sectors; spin_unlock(&bdev->bd_size_lock); } static struct parsed_partitions *allocate_partitions(struct gendisk *hd) { struct parsed_partitions *state; - int nr; + int nr = DISK_MAX_PARTS; state = kzalloc(sizeof(*state), GFP_KERNEL); if (!state) return NULL; - nr = disk_max_parts(hd); state->parts = vzalloc(array_size(nr, sizeof(state->parts[0]))); if (!state->parts) { kfree(state); @@ -204,7 +204,7 @@ static ssize_t part_alignment_offset_show(struct device *dev, struct block_device *bdev = dev_to_bdev(dev); return sprintf(buf, "%u\n", - queue_limit_alignment_offset(&bdev->bd_disk->queue->limits, + queue_limit_alignment_offset(&bdev_get_queue(bdev)->limits, bdev->bd_start_sect)); } @@ -214,7 +214,7 @@ static ssize_t part_discard_alignment_show(struct device *dev, struct block_device *bdev = dev_to_bdev(dev); return sprintf(buf, "%u\n", - queue_limit_discard_alignment(&bdev->bd_disk->queue->limits, + queue_limit_discard_alignment(&bdev_get_queue(bdev)->limits, bdev->bd_start_sect)); } @@ -325,7 +325,7 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, lockdep_assert_held(&disk->open_mutex); - if (partno >= disk_max_parts(disk)) + if (partno >= DISK_MAX_PARTS) return ERR_PTR(-EINVAL); /* @@ -526,18 +526,15 @@ int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start, static bool disk_unlock_native_capacity(struct gendisk *disk) { - const struct block_device_operations *bdops = disk->fops; - - if (bdops->unlock_native_capacity && - !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) { - printk(KERN_CONT "enabling native capacity\n"); - bdops->unlock_native_capacity(disk); - disk->flags |= GENHD_FL_NATIVE_CAPACITY; - return true; - } else { + if (!disk->fops->unlock_native_capacity || + 
test_and_set_bit(GD_NATIVE_CAPACITY, &disk->state)) { printk(KERN_CONT "truncated\n"); return false; } + + printk(KERN_CONT "enabling native capacity\n"); + disk->fops->unlock_native_capacity(disk); + return true; } void blk_drop_partitions(struct gendisk *disk) @@ -606,7 +603,7 @@ static int blk_add_partitions(struct gendisk *disk) struct parsed_partitions *state; int ret = -EAGAIN, p; - if (!disk_part_scan_enabled(disk)) + if (disk->flags & GENHD_FL_NO_PART) return 0; state = check_partition(disk); @@ -689,7 +686,7 @@ int bdev_disk_changed(struct gendisk *disk, bool invalidate) * userspace for this particular setup. */ if (invalidate) { - if (disk_part_scan_enabled(disk) || + if (!(disk->flags & GENHD_FL_NO_PART) || !(disk->flags & GENHD_FL_REMOVABLE)) set_capacity(disk, 0); } diff --git a/block/partitions/efi.c b/block/partitions/efi.c index 7ca5c4c374..5e9be13a56 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -133,7 +133,7 @@ efi_crc32(const void *buf, unsigned long len) */ static u64 last_lba(struct gendisk *disk) { - return div_u64(disk->part0->bd_inode->i_size, + return div_u64(bdev_nr_bytes(disk->part0), queue_logical_block_size(disk->queue)) - 1ULL; } diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c index 9bca396aef..403756dbd5 100644 --- a/block/partitions/ibm.c +++ b/block/partitions/ibm.c @@ -198,7 +198,7 @@ static int find_lnx1_partitions(struct parsed_partitions *state, char name[], union label_t *label, sector_t labelsect, - loff_t i_size, + sector_t nr_sectors, dasd_information2_t *info) { loff_t offset, geo_size, size; @@ -213,14 +213,14 @@ static int find_lnx1_partitions(struct parsed_partitions *state, } else { /* * Formated w/o large volume support. If the sanity check - * 'size based on geo == size based on i_size' is true, then + * 'size based on geo == size based on nr_sectors' is true, then * we can safely assume that we know the formatted size of * the disk, otherwise we need additional information * that we can only get from a real DASD device. 
*/ geo_size = geo->cylinders * geo->heads * geo->sectors * secperblk; - size = i_size >> 9; + size = nr_sectors; if (size != geo_size) { if (!info) { strlcat(state->pp_buf, "\n", PAGE_SIZE); @@ -229,7 +229,7 @@ static int find_lnx1_partitions(struct parsed_partitions *state, if (!strcmp(info->type, "ECKD")) if (geo_size < size) size = geo_size; - /* else keep size based on i_size */ + /* else keep size based on nr_sectors */ } } /* first and only partition starts in the first block after the label */ @@ -293,7 +293,8 @@ int ibm_partition(struct parsed_partitions *state) struct gendisk *disk = state->disk; struct block_device *bdev = disk->part0; int blocksize, res; - loff_t i_size, offset, size; + loff_t offset, size; + sector_t nr_sectors; dasd_information2_t *info; struct hd_geometry *geo; char type[5] = {0,}; @@ -308,8 +309,8 @@ int ibm_partition(struct parsed_partitions *state) blocksize = bdev_logical_block_size(bdev); if (blocksize <= 0) goto out_symbol; - i_size = i_size_read(bdev->bd_inode); - if (i_size == 0) + nr_sectors = bdev_nr_sectors(bdev); + if (nr_sectors == 0) goto out_symbol; info = kmalloc(sizeof(dasd_information2_t), GFP_KERNEL); if (info == NULL) @@ -336,7 +337,7 @@ int ibm_partition(struct parsed_partitions *state) label); } else if (!strncmp(type, "LNX1", 4)) { res = find_lnx1_partitions(state, geo, blocksize, name, - label, labelsect, i_size, + label, labelsect, nr_sectors, info); } else if (!strncmp(type, "CMS1", 4)) { res = find_cms1_partitions(state, geo, blocksize, name, @@ -353,7 +354,7 @@ int ibm_partition(struct parsed_partitions *state) res = 1; if (info->format == DASD_FORMAT_LDL) { strlcat(state->pp_buf, "(nonl)", PAGE_SIZE); - size = i_size >> 9; + size = nr_sectors; offset = (info->label_block + 1) * (blocksize >> 9); put_partition(state, 1, offset, size-offset); strlcat(state->pp_buf, "\n", PAGE_SIZE); diff --git a/block/t10-pi.c b/block/t10-pi.c index 00c203b2a9..25a52a2a09 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -5,7 +5,7 @@ */ #include -#include +#include #include #include #include
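The ioctl, EFI and IBM partition hunks above all replace open-coded i_size_read(bdev->bd_inode) arithmetic with bdev_nr_bytes()/bdev_nr_sectors(). A minimal sketch of what those accessors are assumed to boil down to, given that bdev_set_nr_sectors() now also caches the size in bd_nr_sectors; the types, SECTOR_SHIFT and struct block_device below are local stand-ins, not the real kernel headers:

	#include <stdint.h>
	#include <stdio.h>

	#define SECTOR_SHIFT 9

	typedef uint64_t sector_t;
	typedef int64_t loff_t;

	struct block_device {
		sector_t bd_nr_sectors;	/* cached by bdev_set_nr_sectors() */
	};

	static inline sector_t bdev_nr_sectors(struct block_device *bdev)
	{
		return bdev->bd_nr_sectors;
	}

	static inline loff_t bdev_nr_bytes(struct block_device *bdev)
	{
		return (loff_t)bdev_nr_sectors(bdev) << SECTOR_SHIFT;
	}

	int main(void)
	{
		struct block_device bdev = { .bd_nr_sectors = 7814037168ULL };

		printf("%llu sectors = %lld bytes\n",
		       (unsigned long long)bdev_nr_sectors(&bdev),
		       (long long)bdev_nr_bytes(&bdev));
		return 0;
	}

Whatever the exact in-kernel definitions are, byte size equals sector count shifted by SECTOR_SHIFT, which is why conversions such as the ibm.c hunk can drop the explicit ">> 9" and pass nr_sectors around directly.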
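The new io_stats_per_prio comment in mq-deadline states that counter overflow is harmless as long as the counters are at least as wide as log2() of the maximum number of outstanding requests. A standalone demonstration (plain userspace C, not kernel code) of why the unsigned subtraction used by dd_queued() (inserted - completed) and dd_owned_by_driver() (dispatched + merged - completed) stays exact across a 32-bit wrap:

	#include <assert.h>
	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint32_t inserted  = UINT32_MAX - 1;	/* about to wrap around */
		uint32_t completed = UINT32_MAX - 4;	/* three requests in flight */

		assert(inserted - completed == 3);

		inserted += 5;				/* wraps past zero */
		assert(inserted - completed == 8);	/* difference is still exact */

		printf("queued = %u\n", inserted - completed);
		return 0;
	}

This is what lets the patch drop the per-CPU local_t counters: inserted, merged and dispatched are updated under dd->lock (note the added lockdep_assert_held() calls), completed is an atomic_t bumped from dd_finish_request(), and plain 32-bit fields are wide enough.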
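For the priority-aging change, dd_dispatch_prio_aged_requests() dispatches from the lower priority levels with latest_start = now - prio_aging_expire, and started_after() recovers a request's insertion time by subtracting fifo_expire from fifo_time (fifo_time holds the expiry deadline, i.e. insertion time plus fifo_expire). A standalone walk-through of that arithmetic; HZ, the local time_after() macro and the concrete times are illustrative stand-ins only:

	#include <stdio.h>

	#define HZ 250UL				/* illustrative tick rate */
	#define time_after(a, b)	((long)((b) - (a)) < 0)

	int main(void)
	{
		unsigned long prio_aging_expire = 10 * HZ;
		unsigned long fifo_expire_read = HZ / 2;
		unsigned long insert_time = 100000;
		/* fifo_time holds the request's deadline, not its insertion time */
		unsigned long fifo_time = insert_time + fifo_expire_read;

		for (unsigned long now = insert_time;
		     now <= insert_time + 10 * HZ; now += 5 * HZ) {
			unsigned long latest_start = now - prio_aging_expire;
			unsigned long start_time = fifo_time - fifo_expire_read;

			printf("t=+%lus: aged pass %s this request\n",
			       (now - insert_time) / HZ,
			       time_after(start_time, latest_start) ?
			       "skips" : "dispatches");
		}
		return 0;
	}

So a non-realtime request becomes eligible for the aged-dispatch pass once prio_aging_expire jiffies have elapsed since insertion; the value is exposed through the scheduler's sysfs attributes in milliseconds like the other expiry knobs.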
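dd_insert_request() now only bumps stats.inserted when rq->elv.priv[0] is not yet set, presumably so that a request which is requeued and inserted a second time is counted once, while dd_finish_request() skips requests that bypassed the scheduler entirely. A standalone illustration (not kernel code) of what would go wrong without such a guard:

	#include <stdbool.h>
	#include <stdio.h>

	struct stats { unsigned int inserted, completed; };

	static void insert(struct stats *s, bool *counted, bool use_guard)
	{
		if (use_guard && *counted)
			return;		/* already accounted for this request */
		s->inserted++;
		*counted = true;
	}

	int main(void)
	{
		struct stats unguarded = {0}, guarded = {0};
		bool seen_u = false, seen_g = false;

		/* one request: inserted, requeued and re-inserted, then completed */
		insert(&unguarded, &seen_u, false);
		insert(&unguarded, &seen_u, false);
		unguarded.completed++;

		insert(&guarded, &seen_g, true);
		insert(&guarded, &seen_g, true);
		guarded.completed++;

		printf("queued: unguarded=%u guarded=%u\n",
		       unguarded.inserted - unguarded.completed,
		       guarded.inserted - guarded.completed);
		return 0;
	}

Without the guard the queued count would never return to zero for requeued requests, which is exactly the condition the WARN_ONCE() in dd_exit_sched() now checks for.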
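In partitions/core.c, disk_unlock_native_capacity() now keys off test_and_set_bit(GD_NATIVE_CAPACITY, &disk->state) instead of a GENHD_FL_NATIVE_CAPACITY flag, which makes the unlock a one-shot operation: test_and_set_bit() returns the previous bit value, so only the first caller that finds the bit clear invokes ->unlock_native_capacity() and asks for a rescan. A standalone illustration of the pattern, with C11 atomic_flag standing in for the kernel bit operations:

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	static atomic_flag native_capacity = ATOMIC_FLAG_INIT;

	static bool unlock_native_capacity(void)
	{
		if (atomic_flag_test_and_set(&native_capacity))
			return false;	/* already tried once: give up, truncate */

		puts("enabling native capacity");
		return true;		/* caller retries the partition scan */
	}

	int main(void)
	{
		printf("first call:  %s\n",
		       unlock_native_capacity() ? "retry" : "truncate");
		printf("second call: %s\n",
		       unlock_native_capacity() ? "retry" : "truncate");
		return 0;
	}

Because the state bit is set atomically, concurrent callers cannot both trigger the unlock, and the "truncated" fallback is taken on every subsequent attempt.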