diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 8b17140214..1ed557cb5e 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -61,6 +61,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index aab48b292a..82faaa4581 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -68,6 +68,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 4a6a74177b..0f58594c5a 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -184,6 +184,7 @@ static int print_unex = 1; #include #include #include +#include #include #include #include diff --git a/drivers/block/swim.c b/drivers/block/swim.c index 7ccc8d2a41..3911d0833e 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 303caf2d17..f538bc9dce 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -859,9 +859,15 @@ static int virtblk_probe(struct virtio_device *vdev) virtio_cread(vdev, struct virtio_blk_config, max_discard_seg, &v); + + /* + * max_discard_seg == 0 is out of spec but we always + * handled it. + */ + if (!v) + v = sg_elems - 2; blk_queue_max_discard_segments(q, - min_not_zero(v, - MAX_DISCARD_SEGMENTS)); + min(v, MAX_DISCARD_SEGMENTS)); blk_queue_flag_set(QUEUE_FLAG_DISCARD, q); } diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 1becbbb3be..390817cf12 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/pinctrl/intel/pinctrl-tigerlake.c b/drivers/pinctrl/intel/pinctrl-tigerlake.c index 0bcd19597e..3ddaeffc04 100644 --- a/drivers/pinctrl/intel/pinctrl-tigerlake.c +++ b/drivers/pinctrl/intel/pinctrl-tigerlake.c @@ -749,7 +749,6 @@ static const struct acpi_device_id tgl_pinctrl_acpi_match[] = { { "INT34C5", (kernel_ulong_t)&tgllp_soc_data }, { "INT34C6", (kernel_ulong_t)&tglh_soc_data }, { "INTC1055", (kernel_ulong_t)&tgllp_soc_data }, - { "INTC1057", (kernel_ulong_t)&tgllp_soc_data }, { } }; MODULE_DEVICE_TABLE(acpi, tgl_pinctrl_acpi_match); diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index fa966e0db6..3a6f3af240 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -14,6 +14,7 @@ #define KMSG_COMPONENT "dasd" #include +#include #include #include diff --git a/drivers/spi/spi-rockchip.c b/drivers/spi/spi-rockchip.c index 553b6b9d02..c6a1bb09be 100644 --- a/drivers/spi/spi-rockchip.c +++ b/drivers/spi/spi-rockchip.c @@ -585,6 +585,12 @@ static int rockchip_spi_slave_abort(struct spi_controller *ctlr) { struct rockchip_spi *rs = spi_controller_get_devdata(ctlr); + if (atomic_read(&rs->state) & RXDMA) + dmaengine_terminate_sync(ctlr->dma_rx); + if (atomic_read(&rs->state) & TXDMA) + dmaengine_terminate_sync(ctlr->dma_tx); + atomic_set(&rs->state, 0); + spi_enable_chip(rs, false); rs->slave_abort = true; spi_finalize_current_transfer(ctlr); @@ -654,7 +660,7 @@ static int rockchip_spi_probe(struct platform_device *pdev) struct spi_controller *ctlr; struct resource *mem; struct device_node *np = pdev->dev.of_node; - u32 rsd_nsecs; + u32 rsd_nsecs, num_cs; bool slave_mode; slave_mode = of_property_read_bool(np, "spi-slave"); @@ -764,8 +770,9 @@ static int rockchip_spi_probe(struct platform_device *pdev) * rk spi0 has two native cs, spi1..5 one cs only * if num-cs is missing in the dts, default to 1 */ - if (of_property_read_u16(np, "num-cs", &ctlr->num_chipselect)) - ctlr->num_chipselect = 1; + if (of_property_read_u32(np, "num-cs", &num_cs)) + num_cs = 1; + ctlr->num_chipselect = num_cs; ctlr->use_gpio_descriptors = true; } ctlr->dev.of_node = pdev->dev.of_node; diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index 236081afe9..c2b733ef95 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -166,14 +166,13 @@ void virtio_add_status(struct virtio_device *dev, unsigned int status) } EXPORT_SYMBOL_GPL(virtio_add_status); -int virtio_finalize_features(struct virtio_device *dev) +/* Do some validation, then set FEATURES_OK */ +static int virtio_features_ok(struct virtio_device *dev) { - int ret = dev->config->finalize_features(dev); unsigned status; + int ret; might_sleep(); - if (ret) - return ret; ret = arch_has_restricted_virtio_memory_access(); if (ret) { @@ -202,7 +201,6 @@ int virtio_finalize_features(struct virtio_device *dev) } return 0; } -EXPORT_SYMBOL_GPL(virtio_finalize_features); static int virtio_dev_probe(struct device *_d) { @@ -239,17 +237,6 @@ static int virtio_dev_probe(struct device *_d) driver_features_legacy = driver_features; } - /* - * Some devices detect legacy solely via F_VERSION_1. Write - * F_VERSION_1 to force LE config space accesses before FEATURES_OK for - * these when needed. - */ - if (drv->validate && !virtio_legacy_is_little_endian() - && device_features & BIT_ULL(VIRTIO_F_VERSION_1)) { - dev->features = BIT_ULL(VIRTIO_F_VERSION_1); - dev->config->finalize_features(dev); - } - if (device_features & (1ULL << VIRTIO_F_VERSION_1)) dev->features = driver_features & device_features; else @@ -260,13 +247,26 @@ static int virtio_dev_probe(struct device *_d) if (device_features & (1ULL << i)) __virtio_set_bit(dev, i); + err = dev->config->finalize_features(dev); + if (err) + goto err; + if (drv->validate) { + u64 features = dev->features; + err = drv->validate(dev); if (err) goto err; + + /* Did validation change any features? Then write them again. */ + if (features != dev->features) { + err = dev->config->finalize_features(dev); + if (err) + goto err; + } } - err = virtio_finalize_features(dev); + err = virtio_features_ok(dev); if (err) goto err; @@ -490,7 +490,11 @@ int virtio_device_restore(struct virtio_device *dev) /* We have a driver! */ virtio_add_status(dev, VIRTIO_CONFIG_S_DRIVER); - ret = virtio_finalize_features(dev); + ret = dev->config->finalize_features(dev); + if (ret) + goto err; + + ret = virtio_features_ok(dev); if (ret) goto err; diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index d721c66d0b..5edd07e023 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1491,7 +1491,6 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) container_of(work, struct btrfs_fs_info, reclaim_bgs_work); struct btrfs_block_group *bg; struct btrfs_space_info *space_info; - LIST_HEAD(again_list); if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) return; @@ -1562,18 +1561,14 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) div64_u64(zone_unusable * 100, bg->length)); trace_btrfs_reclaim_block_group(bg); ret = btrfs_relocate_chunk(fs_info, bg->start); - if (ret && ret != -EAGAIN) + if (ret) btrfs_err(fs_info, "error relocating chunk %llu", bg->start); next: + btrfs_put_block_group(bg); spin_lock(&fs_info->unused_bgs_lock); - if (ret == -EAGAIN && list_empty(&bg->bg_list)) - list_add_tail(&bg->bg_list, &again_list); - else - btrfs_put_block_group(bg); } - list_splice_tail(&again_list, &fs_info->reclaim_bgs); spin_unlock(&fs_info->unused_bgs_lock); mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_exclop_finish(fs_info); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 95a6a63caf..899f854459 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1566,32 +1566,13 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, struct btrfs_path *p, int write_lock_level) { - struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *b; int root_lock = 0; int level = 0; if (p->search_commit_root) { - /* - * The commit roots are read only so we always do read locks, - * and we always must hold the commit_root_sem when doing - * searches on them, the only exception is send where we don't - * want to block transaction commits for a long time, so - * we need to clone the commit root in order to avoid races - * with transaction commits that create a snapshot of one of - * the roots used by a send operation. - */ - if (p->need_commit_sem) { - down_read(&fs_info->commit_root_sem); - b = btrfs_clone_extent_buffer(root->commit_root); - up_read(&fs_info->commit_root_sem); - if (!b) - return ERR_PTR(-ENOMEM); - - } else { - b = root->commit_root; - atomic_inc(&b->refs); - } + b = root->commit_root; + atomic_inc(&b->refs); level = btrfs_header_level(b); /* * Ensure that all callers have set skip_locking when @@ -1657,6 +1638,42 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, return b; } +/* + * Replace the extent buffer at the lowest level of the path with a cloned + * version. The purpose is to be able to use it safely, after releasing the + * commit root semaphore, even if relocation is happening in parallel, the + * transaction used for relocation is committed and the extent buffer is + * reallocated in the next transaction. + * + * This is used in a context where the caller does not prevent transaction + * commits from happening, either by holding a transaction handle or holding + * some lock, while it's doing searches through a commit root. + * At the moment it's only used for send operations. + */ +static int finish_need_commit_sem_search(struct btrfs_path *path) +{ + const int i = path->lowest_level; + const int slot = path->slots[i]; + struct extent_buffer *lowest = path->nodes[i]; + struct extent_buffer *clone; + + ASSERT(path->need_commit_sem); + + if (!lowest) + return 0; + + lockdep_assert_held_read(&lowest->fs_info->commit_root_sem); + + clone = btrfs_clone_extent_buffer(lowest); + if (!clone) + return -ENOMEM; + + btrfs_release_path(path); + path->nodes[i] = clone; + path->slots[i] = slot; + + return 0; +} /* * btrfs_search_slot - look for a key in a tree and perform necessary @@ -1693,6 +1710,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, struct btrfs_path *p, int ins_len, int cow) { + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *b; int slot; int ret; @@ -1734,6 +1752,11 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, min_write_lock_level = write_lock_level; + if (p->need_commit_sem) { + ASSERT(p->search_commit_root); + down_read(&fs_info->commit_root_sem); + } + again: prev_cmp = -1; b = btrfs_search_slot_get_root(root, p, write_lock_level); @@ -1928,6 +1951,16 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, done: if (ret < 0 && !p->skip_release_on_error) btrfs_release_path(p); + + if (p->need_commit_sem) { + int ret2; + + ret2 = finish_need_commit_sem_search(p); + up_read(&fs_info->commit_root_sem); + if (ret2) + ret = ret2; + } + return ret; } ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO); @@ -4396,7 +4429,9 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, int level; struct extent_buffer *c; struct extent_buffer *next; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key key; + bool need_commit_sem = false; u32 nritems; int ret; int i; @@ -4413,14 +4448,20 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, path->keep_locks = 1; - if (time_seq) + if (time_seq) { ret = btrfs_search_old_slot(root, &key, path, time_seq); - else + } else { + if (path->need_commit_sem) { + path->need_commit_sem = 0; + need_commit_sem = true; + down_read(&fs_info->commit_root_sem); + } ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + } path->keep_locks = 0; if (ret < 0) - return ret; + goto done; nritems = btrfs_header_nritems(path->nodes[0]); /* @@ -4543,6 +4584,15 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, ret = 0; done: unlock_up(path, 0, 1, 0, NULL); + if (need_commit_sem) { + int ret2; + + path->need_commit_sem = 1; + ret2 = finish_need_commit_sem_search(path); + up_read(&fs_info->commit_root_sem); + if (ret2) + ret = ret2; + } return ret; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b464098016..e89f814cc8 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -568,7 +568,6 @@ enum { /* * Indicate that relocation of a chunk has started, it's set per chunk * and is toggled between chunks. - * Set, tested and cleared while holding fs_info::send_reloc_lock. */ BTRFS_FS_RELOC_RUNNING, @@ -668,6 +667,12 @@ struct btrfs_fs_info { u64 generation; u64 last_trans_committed; + /* + * Generation of the last transaction used for block group relocation + * since the filesystem was last mounted (or 0 if none happened yet). + * Must be written and read while holding btrfs_fs_info::commit_root_sem. + */ + u64 last_reloc_trans; u64 avg_delayed_ref_runtime; /* @@ -997,13 +1002,6 @@ struct btrfs_fs_info { struct crypto_shash *csum_shash; - spinlock_t send_reloc_lock; - /* - * Number of send operations in progress. - * Updated while holding fs_info::send_reloc_lock. - */ - int send_in_progress; - /* Type of exclusive operation running, protected by super_lock */ enum btrfs_exclusive_operation exclusive_operation; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2180fcef56..d5a590b11b 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2859,6 +2859,7 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) /* All successful */ fs_info->generation = generation; fs_info->last_trans_committed = generation; + fs_info->last_reloc_trans = 0; /* Always begin writing backup roots after the one being used */ if (backup_index < 0) { @@ -2992,9 +2993,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) spin_lock_init(&fs_info->swapfile_pins_lock); fs_info->swapfile_pins = RB_ROOT; - spin_lock_init(&fs_info->send_reloc_lock); - fs_info->send_in_progress = 0; - fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH; INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work); } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index a050f9748f..a6661f2ad2 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3854,25 +3854,14 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, * 0 success * -EINPROGRESS operation is already in progress, that's probably a bug * -ECANCELED cancellation request was set before the operation started - * -EAGAIN can not start because there are ongoing send operations */ static int reloc_chunk_start(struct btrfs_fs_info *fs_info) { - spin_lock(&fs_info->send_reloc_lock); - if (fs_info->send_in_progress) { - btrfs_warn_rl(fs_info, -"cannot run relocation while send operations are in progress (%d in progress)", - fs_info->send_in_progress); - spin_unlock(&fs_info->send_reloc_lock); - return -EAGAIN; - } if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) { /* This should not happen */ - spin_unlock(&fs_info->send_reloc_lock); btrfs_err(fs_info, "reloc already running, cannot start"); return -EINPROGRESS; } - spin_unlock(&fs_info->send_reloc_lock); if (atomic_read(&fs_info->reloc_cancel_req) > 0) { btrfs_info(fs_info, "chunk relocation canceled on start"); @@ -3894,9 +3883,7 @@ static void reloc_chunk_end(struct btrfs_fs_info *fs_info) /* Requested after start, clear bit first so any waiters can continue */ if (atomic_read(&fs_info->reloc_cancel_req) > 0) btrfs_info(fs_info, "chunk relocation canceled during operation"); - spin_lock(&fs_info->send_reloc_lock); clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags); - spin_unlock(&fs_info->send_reloc_lock); atomic_set(&fs_info->reloc_cancel_req, 0); } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 5612e8bf2a..4d2c6ce29f 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -24,6 +24,7 @@ #include "transaction.h" #include "compression.h" #include "xattr.h" +#include "print-tree.h" /* * Maximum number of references an extent can have in order for us to attempt to @@ -95,6 +96,15 @@ struct send_ctx { struct btrfs_path *right_path; struct btrfs_key *cmp_key; + /* + * Keep track of the generation of the last transaction that was used + * for relocating a block group. This is periodically checked in order + * to detect if a relocation happened since the last check, so that we + * don't operate on stale extent buffers for nodes (level >= 1) or on + * stale disk_bytenr values of file extent items. + */ + u64 last_reloc_trans; + /* * infos of the currently processed inode. In case of deleted inodes, * these are the values from the deleted inode. @@ -1415,6 +1425,26 @@ static int find_extent_clone(struct send_ctx *sctx, if (ret < 0) goto out; + down_read(&fs_info->commit_root_sem); + if (fs_info->last_reloc_trans > sctx->last_reloc_trans) { + /* + * A transaction commit for a transaction in which block group + * relocation was done just happened. + * The disk_bytenr of the file extent item we processed is + * possibly stale, referring to the extent's location before + * relocation. So act as if we haven't found any clone sources + * and fallback to write commands, which will read the correct + * data from the new extent location. Otherwise we will fail + * below because we haven't found our own back reference or we + * could be getting incorrect sources in case the old extent + * was already reallocated after the relocation. + */ + up_read(&fs_info->commit_root_sem); + ret = -ENOENT; + goto out; + } + up_read(&fs_info->commit_root_sem); + if (!backref_ctx.found_itself) { /* found a bug in backref code? */ ret = -EIO; @@ -6596,6 +6626,50 @@ static int changed_cb(struct btrfs_path *left_path, { int ret = 0; + /* + * We can not hold the commit root semaphore here. This is because in + * the case of sending and receiving to the same filesystem, using a + * pipe, could result in a deadlock: + * + * 1) The task running send blocks on the pipe because it's full; + * + * 2) The task running receive, which is the only consumer of the pipe, + * is waiting for a transaction commit (for example due to a space + * reservation when doing a write or triggering a transaction commit + * when creating a subvolume); + * + * 3) The transaction is waiting to write lock the commit root semaphore, + * but can not acquire it since it's being held at 1). + * + * Down this call chain we write to the pipe through kernel_write(). + * The same type of problem can also happen when sending to a file that + * is stored in the same filesystem - when reserving space for a write + * into the file, we can trigger a transaction commit. + * + * Our caller has supplied us with clones of leaves from the send and + * parent roots, so we're safe here from a concurrent relocation and + * further reallocation of metadata extents while we are here. Below we + * also assert that the leaves are clones. + */ + lockdep_assert_not_held(&sctx->send_root->fs_info->commit_root_sem); + + /* + * We always have a send root, so left_path is never NULL. We will not + * have a leaf when we have reached the end of the send root but have + * not yet reached the end of the parent root. + */ + if (left_path->nodes[0]) + ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, + &left_path->nodes[0]->bflags)); + /* + * When doing a full send we don't have a parent root, so right_path is + * NULL. When doing an incremental send, we may have reached the end of + * the parent root already, so we don't have a leaf at right_path. + */ + if (right_path && right_path->nodes[0]) + ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED, + &right_path->nodes[0]->bflags)); + if (result == BTRFS_COMPARE_TREE_SAME) { if (key->type == BTRFS_INODE_REF_KEY || key->type == BTRFS_INODE_EXTREF_KEY) { @@ -6642,14 +6716,46 @@ static int changed_cb(struct btrfs_path *left_path, return ret; } +static int search_key_again(const struct send_ctx *sctx, + struct btrfs_root *root, + struct btrfs_path *path, + const struct btrfs_key *key) +{ + int ret; + + if (!path->need_commit_sem) + lockdep_assert_held_read(&root->fs_info->commit_root_sem); + + /* + * Roots used for send operations are readonly and no one can add, + * update or remove keys from them, so we should be able to find our + * key again. The only exception is deduplication, which can operate on + * readonly roots and add, update or remove keys to/from them - but at + * the moment we don't allow it to run in parallel with send. + */ + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + ASSERT(ret <= 0); + if (ret > 0) { + btrfs_print_tree(path->nodes[path->lowest_level], false); + btrfs_err(root->fs_info, +"send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d", + key->objectid, key->type, key->offset, + (root == sctx->parent_root ? "parent" : "send"), + root->root_key.objectid, path->lowest_level, + path->slots[path->lowest_level]); + return -EUCLEAN; + } + + return ret; +} + static int full_send_tree(struct send_ctx *sctx) { int ret; struct btrfs_root *send_root = sctx->send_root; struct btrfs_key key; + struct btrfs_fs_info *fs_info = send_root->fs_info; struct btrfs_path *path; - struct extent_buffer *eb; - int slot; path = alloc_path_for_send(); if (!path) @@ -6660,6 +6766,10 @@ static int full_send_tree(struct send_ctx *sctx) key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; + down_read(&fs_info->commit_root_sem); + sctx->last_reloc_trans = fs_info->last_reloc_trans; + up_read(&fs_info->commit_root_sem); + ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0); if (ret < 0) goto out; @@ -6667,15 +6777,35 @@ static int full_send_tree(struct send_ctx *sctx) goto out_finish; while (1) { - eb = path->nodes[0]; - slot = path->slots[0]; - btrfs_item_key_to_cpu(eb, &key, slot); + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); ret = changed_cb(path, NULL, &key, BTRFS_COMPARE_TREE_NEW, sctx); if (ret < 0) goto out; + down_read(&fs_info->commit_root_sem); + if (fs_info->last_reloc_trans > sctx->last_reloc_trans) { + sctx->last_reloc_trans = fs_info->last_reloc_trans; + up_read(&fs_info->commit_root_sem); + /* + * A transaction used for relocating a block group was + * committed or is about to finish its commit. Release + * our path (leaf) and restart the search, so that we + * avoid operating on any file extent items that are + * stale, with a disk_bytenr that reflects a pre + * relocation value. This way we avoid as much as + * possible to fallback to regular writes when checking + * if we can clone file ranges. + */ + btrfs_release_path(path); + ret = search_key_again(sctx, send_root, path, &key); + if (ret < 0) + goto out; + } else { + up_read(&fs_info->commit_root_sem); + } + ret = btrfs_next_item(send_root, path); if (ret < 0) goto out; @@ -6693,6 +6823,20 @@ static int full_send_tree(struct send_ctx *sctx) return ret; } +static int replace_node_with_clone(struct btrfs_path *path, int level) +{ + struct extent_buffer *clone; + + clone = btrfs_clone_extent_buffer(path->nodes[level]); + if (!clone) + return -ENOMEM; + + free_extent_buffer(path->nodes[level]); + path->nodes[level] = clone; + + return 0; +} + static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen) { struct extent_buffer *eb; @@ -6702,6 +6846,8 @@ static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen u64 reada_max; u64 reada_done = 0; + lockdep_assert_held_read(&parent->fs_info->commit_root_sem); + BUG_ON(*level == 0); eb = btrfs_read_node_slot(parent, slot); if (IS_ERR(eb)) @@ -6725,6 +6871,10 @@ static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen path->nodes[*level - 1] = eb; path->slots[*level - 1] = 0; (*level)--; + + if (*level == 0) + return replace_node_with_clone(path, 0); + return 0; } @@ -6738,8 +6888,10 @@ static int tree_move_next_or_upnext(struct btrfs_path *path, path->slots[*level]++; while (path->slots[*level] >= nritems) { - if (*level == root_level) + if (*level == root_level) { + path->slots[*level] = nritems - 1; return -1; + } /* move upnext */ path->slots[*level] = 0; @@ -6771,14 +6923,20 @@ static int tree_advance(struct btrfs_path *path, } else { ret = tree_move_down(path, level, reada_min_gen); } - if (ret >= 0) { - if (*level == 0) - btrfs_item_key_to_cpu(path->nodes[*level], key, - path->slots[*level]); - else - btrfs_node_key_to_cpu(path->nodes[*level], key, - path->slots[*level]); - } + + /* + * Even if we have reached the end of a tree, ret is -1, update the key + * anyway, so that in case we need to restart due to a block group + * relocation, we can assert that the last key of the root node still + * exists in the tree. + */ + if (*level == 0) + btrfs_item_key_to_cpu(path->nodes[*level], key, + path->slots[*level]); + else + btrfs_node_key_to_cpu(path->nodes[*level], key, + path->slots[*level]); + return ret; } @@ -6807,6 +6965,97 @@ static int tree_compare_item(struct btrfs_path *left_path, return 0; } +/* + * A transaction used for relocating a block group was committed or is about to + * finish its commit. Release our paths and restart the search, so that we are + * not using stale extent buffers: + * + * 1) For levels > 0, we are only holding references of extent buffers, without + * any locks on them, which does not prevent them from having been relocated + * and reallocated after the last time we released the commit root semaphore. + * The exception are the root nodes, for which we always have a clone, see + * the comment at btrfs_compare_trees(); + * + * 2) For leaves, level 0, we are holding copies (clones) of extent buffers, so + * we are safe from the concurrent relocation and reallocation. However they + * can have file extent items with a pre relocation disk_bytenr value, so we + * restart the start from the current commit roots and clone the new leaves so + * that we get the post relocation disk_bytenr values. Not doing so, could + * make us clone the wrong data in case there are new extents using the old + * disk_bytenr that happen to be shared. + */ +static int restart_after_relocation(struct btrfs_path *left_path, + struct btrfs_path *right_path, + const struct btrfs_key *left_key, + const struct btrfs_key *right_key, + int left_level, + int right_level, + const struct send_ctx *sctx) +{ + int root_level; + int ret; + + lockdep_assert_held_read(&sctx->send_root->fs_info->commit_root_sem); + + btrfs_release_path(left_path); + btrfs_release_path(right_path); + + /* + * Since keys can not be added or removed to/from our roots because they + * are readonly and we do not allow deduplication to run in parallel + * (which can add, remove or change keys), the layout of the trees should + * not change. + */ + left_path->lowest_level = left_level; + ret = search_key_again(sctx, sctx->send_root, left_path, left_key); + if (ret < 0) + return ret; + + right_path->lowest_level = right_level; + ret = search_key_again(sctx, sctx->parent_root, right_path, right_key); + if (ret < 0) + return ret; + + /* + * If the lowest level nodes are leaves, clone them so that they can be + * safely used by changed_cb() while not under the protection of the + * commit root semaphore, even if relocation and reallocation happens in + * parallel. + */ + if (left_level == 0) { + ret = replace_node_with_clone(left_path, 0); + if (ret < 0) + return ret; + } + + if (right_level == 0) { + ret = replace_node_with_clone(right_path, 0); + if (ret < 0) + return ret; + } + + /* + * Now clone the root nodes (unless they happen to be the leaves we have + * already cloned). This is to protect against concurrent snapshotting of + * the send and parent roots (see the comment at btrfs_compare_trees()). + */ + root_level = btrfs_header_level(sctx->send_root->commit_root); + if (root_level > 0) { + ret = replace_node_with_clone(left_path, root_level); + if (ret < 0) + return ret; + } + + root_level = btrfs_header_level(sctx->parent_root->commit_root); + if (root_level > 0) { + ret = replace_node_with_clone(right_path, root_level); + if (ret < 0) + return ret; + } + + return 0; +} + /* * This function compares two trees and calls the provided callback for * every changed/new/deleted item it finds. @@ -6835,10 +7084,10 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, int right_root_level; int left_level; int right_level; - int left_end_reached; - int right_end_reached; - int advance_left; - int advance_right; + int left_end_reached = 0; + int right_end_reached = 0; + int advance_left = 0; + int advance_right = 0; u64 left_blockptr; u64 right_blockptr; u64 left_gen; @@ -6906,12 +7155,18 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, down_read(&fs_info->commit_root_sem); left_level = btrfs_header_level(left_root->commit_root); left_root_level = left_level; + /* + * We clone the root node of the send and parent roots to prevent races + * with snapshot creation of these roots. Snapshot creation COWs the + * root node of a tree, so after the transaction is committed the old + * extent can be reallocated while this send operation is still ongoing. + * So we clone them, under the commit root semaphore, to be race free. + */ left_path->nodes[left_level] = btrfs_clone_extent_buffer(left_root->commit_root); if (!left_path->nodes[left_level]) { - up_read(&fs_info->commit_root_sem); ret = -ENOMEM; - goto out; + goto out_unlock; } right_level = btrfs_header_level(right_root->commit_root); @@ -6919,9 +7174,8 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, right_path->nodes[right_level] = btrfs_clone_extent_buffer(right_root->commit_root); if (!right_path->nodes[right_level]) { - up_read(&fs_info->commit_root_sem); ret = -ENOMEM; - goto out; + goto out_unlock; } /* * Our right root is the parent root, while the left root is the "send" @@ -6931,7 +7185,6 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, * will need to read them at some point. */ reada_min_gen = btrfs_header_generation(right_root->commit_root); - up_read(&fs_info->commit_root_sem); if (left_level == 0) btrfs_item_key_to_cpu(left_path->nodes[left_level], @@ -6946,11 +7199,26 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, btrfs_node_key_to_cpu(right_path->nodes[right_level], &right_key, right_path->slots[right_level]); - left_end_reached = right_end_reached = 0; - advance_left = advance_right = 0; + sctx->last_reloc_trans = fs_info->last_reloc_trans; while (1) { - cond_resched(); + if (need_resched() || + rwsem_is_contended(&fs_info->commit_root_sem)) { + up_read(&fs_info->commit_root_sem); + cond_resched(); + down_read(&fs_info->commit_root_sem); + } + + if (fs_info->last_reloc_trans > sctx->last_reloc_trans) { + ret = restart_after_relocation(left_path, right_path, + &left_key, &right_key, + left_level, right_level, + sctx); + if (ret < 0) + goto out_unlock; + sctx->last_reloc_trans = fs_info->last_reloc_trans; + } + if (advance_left && !left_end_reached) { ret = tree_advance(left_path, &left_level, left_root_level, @@ -6959,7 +7227,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, if (ret == -1) left_end_reached = ADVANCE; else if (ret < 0) - goto out; + goto out_unlock; advance_left = 0; } if (advance_right && !right_end_reached) { @@ -6970,54 +7238,55 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, if (ret == -1) right_end_reached = ADVANCE; else if (ret < 0) - goto out; + goto out_unlock; advance_right = 0; } if (left_end_reached && right_end_reached) { ret = 0; - goto out; + goto out_unlock; } else if (left_end_reached) { if (right_level == 0) { + up_read(&fs_info->commit_root_sem); ret = changed_cb(left_path, right_path, &right_key, BTRFS_COMPARE_TREE_DELETED, sctx); if (ret < 0) goto out; + down_read(&fs_info->commit_root_sem); } advance_right = ADVANCE; continue; } else if (right_end_reached) { if (left_level == 0) { + up_read(&fs_info->commit_root_sem); ret = changed_cb(left_path, right_path, &left_key, BTRFS_COMPARE_TREE_NEW, sctx); if (ret < 0) goto out; + down_read(&fs_info->commit_root_sem); } advance_left = ADVANCE; continue; } if (left_level == 0 && right_level == 0) { + up_read(&fs_info->commit_root_sem); cmp = btrfs_comp_cpu_keys(&left_key, &right_key); if (cmp < 0) { ret = changed_cb(left_path, right_path, &left_key, BTRFS_COMPARE_TREE_NEW, sctx); - if (ret < 0) - goto out; advance_left = ADVANCE; } else if (cmp > 0) { ret = changed_cb(left_path, right_path, &right_key, BTRFS_COMPARE_TREE_DELETED, sctx); - if (ret < 0) - goto out; advance_right = ADVANCE; } else { enum btrfs_compare_tree_result result; @@ -7031,11 +7300,13 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, result = BTRFS_COMPARE_TREE_SAME; ret = changed_cb(left_path, right_path, &left_key, result, sctx); - if (ret < 0) - goto out; advance_left = ADVANCE; advance_right = ADVANCE; } + + if (ret < 0) + goto out; + down_read(&fs_info->commit_root_sem); } else if (left_level == right_level) { cmp = btrfs_comp_cpu_keys(&left_key, &right_key); if (cmp < 0) { @@ -7075,6 +7346,8 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, } } +out_unlock: + up_read(&fs_info->commit_root_sem); out: btrfs_free_path(left_path); btrfs_free_path(right_path); @@ -7413,21 +7686,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) if (ret) goto out; - spin_lock(&fs_info->send_reloc_lock); - if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) { - spin_unlock(&fs_info->send_reloc_lock); - btrfs_warn_rl(fs_info, - "cannot run send because a relocation operation is in progress"); - ret = -EAGAIN; - goto out; - } - fs_info->send_in_progress++; - spin_unlock(&fs_info->send_reloc_lock); - ret = send_subvol(sctx); - spin_lock(&fs_info->send_reloc_lock); - fs_info->send_in_progress--; - spin_unlock(&fs_info->send_reloc_lock); if (ret < 0) goto out; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 9a6009108e..642cd2b55f 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -163,6 +163,10 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans) struct btrfs_caching_control *caching_ctl, *next; down_write(&fs_info->commit_root_sem); + + if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) + fs_info->last_reloc_trans = trans->transid; + list_for_each_entry_safe(root, tmp, &cur_trans->switch_commits, dirty_list) { list_del_init(&root->dirty_list); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index a9d21b33da..d6b5339c56 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -941,7 +941,17 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, while (count) { if (cs->write && cs->pipebufs && page) { - return fuse_ref_page(cs, page, offset, count); + /* + * Can't control lifetime of pipe buffers, so always + * copy user pages. + */ + if (cs->req->args->user_pages) { + err = fuse_copy_fill(cs); + if (err) + return err; + } else { + return fuse_ref_page(cs, page, offset, count); + } } else if (!cs->len) { if (cs->move_pages && page && offset == 0 && count == PAGE_SIZE) { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 2004d36236..bc50a9fa84 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1417,6 +1417,7 @@ static int fuse_get_user_pages(struct fuse_args_pages *ap, struct iov_iter *ii, (PAGE_SIZE - ret) & (PAGE_SIZE - 1); } + ap->args.user_pages = true; if (write) ap->args.in_pages = true; else diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index a59e36c7de..c3a87586a1 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -256,6 +256,7 @@ struct fuse_args { bool nocreds:1; bool in_pages:1; bool out_pages:1; + bool user_pages:1; bool out_argvar:1; bool page_zeroing:1; bool page_replace:1; diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index 546ea3d58f..fc69e1797a 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -394,9 +394,12 @@ static int fuse_priv_ioctl(struct inode *inode, struct fuse_file *ff, args.out_args[1].value = ptr; err = fuse_simple_request(fm, &args); - if (!err && outarg.flags & FUSE_IOCTL_RETRY) - err = -EIO; - + if (!err) { + if (outarg.result < 0) + err = outarg.result; + else if (outarg.flags & FUSE_IOCTL_RETRY) + err = -EIO; + } return err; } diff --git a/fs/pipe.c b/fs/pipe.c index 6d4342bad9..751d5b36c8 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -252,7 +252,8 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) */ was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage); for (;;) { - unsigned int head = pipe->head; + /* Read ->head with a barrier vs post_one_notification() */ + unsigned int head = smp_load_acquire(&pipe->head); unsigned int tail = pipe->tail; unsigned int mask = pipe->ring_size - 1; @@ -830,10 +831,8 @@ void free_pipe_info(struct pipe_inode_info *pipe) int i; #ifdef CONFIG_WATCH_QUEUE - if (pipe->watch_queue) { + if (pipe->watch_queue) watch_queue_clear(pipe->watch_queue); - put_watch_queue(pipe->watch_queue); - } #endif (void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0); @@ -843,6 +842,10 @@ void free_pipe_info(struct pipe_inode_info *pipe) if (buf->ops) pipe_buf_release(pipe, buf); } +#ifdef CONFIG_WATCH_QUEUE + if (pipe->watch_queue) + put_watch_queue(pipe->watch_queue); +#endif if (pipe->tmp_page) __free_page(pipe->tmp_page); kfree(pipe->bufs); diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 8519b3ae5d..b341dd62aa 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -62,8 +62,9 @@ struct virtio_shm_region { * Returns the first 64 feature bits (all we currently need). * @finalize_features: confirm what device features we'll be using. * vdev: the virtio_device - * This gives the final feature bits for the device: it can change + * This sends the driver feature bits to the device: it can change * the dev->feature bits if it wants. + * Note: despite the name this can be called any number of times. * Returns 0 on success or error status * @bus_name: return the bus name associated with the device (optional) * vdev: the virtio_device diff --git a/include/linux/watch_queue.h b/include/linux/watch_queue.h index c994d1b2cd..3b9a40ae8b 100644 --- a/include/linux/watch_queue.h +++ b/include/linux/watch_queue.h @@ -28,7 +28,8 @@ struct watch_type_filter { struct watch_filter { union { struct rcu_head rcu; - unsigned long type_filter[2]; /* Bitmask of accepted types */ + /* Bitmask of accepted types */ + DECLARE_BITMAP(type_filter, WATCH_TYPE__NR); }; u32 nr_filters; /* Number of filters */ struct watch_type_filter filters[]; diff --git a/include/net/dsa.h b/include/net/dsa.h index 49e5ece936..d784e76113 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -1056,7 +1056,6 @@ void dsa_unregister_switch(struct dsa_switch *ds); int dsa_register_switch(struct dsa_switch *ds); void dsa_switch_shutdown(struct dsa_switch *ds); struct dsa_switch *dsa_switch_find(int tree_index, int sw_index); -void dsa_flush_workqueue(void); #ifdef CONFIG_PM_SLEEP int dsa_switch_suspend(struct dsa_switch *ds); int dsa_switch_resume(struct dsa_switch *ds); diff --git a/include/net/esp.h b/include/net/esp.h index 9c5637d41d..90cd02ff77 100644 --- a/include/net/esp.h +++ b/include/net/esp.h @@ -4,6 +4,8 @@ #include +#define ESP_SKB_FRAG_MAXSIZE (PAGE_SIZE << SKB_FRAG_PAGE_ORDER) + struct ip_esp_hdr; static inline struct ip_esp_hdr *ip_esp_hdr(const struct sk_buff *skb) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 301a164f17..358dfe6fef 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -1679,14 +1679,15 @@ int km_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, const struct xfrm_migrate *m, int num_bundles, const struct xfrm_kmaddress *k, const struct xfrm_encap_tmpl *encap); -struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net); +struct xfrm_state *xfrm_migrate_state_find(struct xfrm_migrate *m, struct net *net, + u32 if_id); struct xfrm_state *xfrm_state_migrate(struct xfrm_state *x, struct xfrm_migrate *m, struct xfrm_encap_tmpl *encap); int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *m, int num_bundles, struct xfrm_kmaddress *k, struct net *net, - struct xfrm_encap_tmpl *encap); + struct xfrm_encap_tmpl *encap, u32 if_id); #endif int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport);