Mirror of https://github.com/Qortal/Brooklyn.git (synced 2025-02-01 07:42:18 +00:00)

Commit 4bdaa608f6 ("phase 5")
Parent 6ca71e00d3
init/Kconfig (39 changed lines)
@@ -550,7 +550,7 @@ config SCHED_THERMAL_PRESSURE
i.e. put less load on throttled CPUs than on non/less throttled ones.

This requires the architecture to implement
arch_set_thermal_pressure() and arch_scale_thermal_pressure().
arch_update_thermal_pressure() and arch_scale_thermal_pressure().

config BSD_PROCESS_ACCT
bool "BSD Process Accounting"
@@ -885,6 +885,11 @@ config ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
config CC_HAS_INT128
def_bool !$(cc-option,$(m64-flag) -D__SIZEOF_INT128__=0) && 64BIT

config CC_IMPLICIT_FALLTHROUGH
string
default "-Wimplicit-fallthrough=5" if CC_IS_GCC && $(cc-option,-Wimplicit-fallthrough=5)
default "-Wimplicit-fallthrough" if CC_IS_CLANG && $(cc-option,-Wunreachable-code-fallthrough)

#
# For architectures that know their GCC __int128 support is sound
#
@@ -901,7 +906,7 @@ config NUMA_BALANCING
bool "Memory placement aware NUMA scheduler"
depends on ARCH_SUPPORTS_NUMA_BALANCING
depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY
depends on SMP && NUMA && MIGRATION
depends on SMP && NUMA && MIGRATION && !PREEMPT_RT
help
This option adds support for automatic NUMA aware memory/task placement.
The mechanism is quite primitive and is based on migrating memory when
@@ -1409,7 +1414,6 @@ config LD_DEAD_CODE_DATA_ELIMINATION
config LD_ORPHAN_WARN
def_bool y
depends on ARCH_WANT_LD_ORPHAN_WARN
depends on !LD_IS_LLD || LLD_VERSION >= 110000
depends on $(ld-option,--orphan-handling=warn)

config SYSCTL
@@ -1574,6 +1578,7 @@ config BASE_FULL

config FUTEX
bool "Enable futex support" if EXPERT
depends on !(SPARC32 && SMP)
default y
imply RT_MUTEXES
help
@@ -1586,14 +1591,6 @@ config FUTEX_PI
depends on FUTEX && RT_MUTEXES
default y

config HAVE_FUTEX_CMPXCHG
bool
depends on FUTEX
help
Architectures should select this if futex_atomic_cmpxchg_inatomic()
is implemented and always working. This removes a couple of runtime
checks.

config EPOLL
bool "Enable eventpoll support" if EXPERT
default y
@@ -1799,6 +1796,10 @@ config HAVE_PERF_EVENTS
help
See tools/perf/design.txt for details.

config GUEST_PERF_EVENTS
bool
depends on HAVE_PERF_EVENTS

config PERF_USE_VMALLOC
bool
help
@@ -1896,6 +1897,7 @@ choice

config SLAB
bool "SLAB"
depends on !PREEMPT_RT
select HAVE_HARDENED_USERCOPY_ALLOCATOR
help
The regular slab allocator that is established and known to work
@@ -1916,6 +1918,7 @@ config SLUB
config SLOB
depends on EXPERT
bool "SLOB (Simple Allocator)"
depends on !PREEMPT_RT
help
SLOB replaces the stock allocator with a drastically simpler
allocator. SLOB is generally more space efficient but
@@ -1926,6 +1929,7 @@ endchoice
config SLAB_MERGE_DEFAULT
bool "Allow slab caches to be merged"
default y
depends on SLAB || SLUB
help
For reduced kernel memory fragmentation, slab caches can be
merged when they share the same size and other characteristics.
@@ -2273,6 +2277,19 @@ config MODULE_COMPRESS_ZSTD

endchoice

config MODULE_DECOMPRESS
bool "Support in-kernel module decompression"
depends on MODULE_COMPRESS_GZIP || MODULE_COMPRESS_XZ
select ZLIB_INFLATE if MODULE_COMPRESS_GZIP
select XZ_DEC if MODULE_COMPRESS_XZ
help

Support for decompressing kernel modules by the kernel itself
instead of relying on userspace to perform this task. Useful when
load pinning security policy is enabled.

If unsure, say N.

config MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
bool "Allow loading of modules with missing namespace imports"
help
init/Makefile
@@ -30,8 +30,8 @@ $(obj)/version.o: include/generated/compile.h
quiet_cmd_compile.h = CHK $@
cmd_compile.h = \
$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \
"$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" \
"$(CONFIG_PREEMPT_RT)" $(CONFIG_CC_VERSION_TEXT) "$(LD)"
"$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT_BUILD)" \
"$(CONFIG_PREEMPT_RT)" "$(CONFIG_CC_VERSION_TEXT)" "$(LD)"

include/generated/compile.h: FORCE
$(call cmd,compile.h)
init/init_task.c
@@ -182,11 +182,6 @@ struct task_struct init_task
#endif
#ifdef CONFIG_KCSAN
.kcsan_ctx = {
.disable_count = 0,
.atomic_next = 0,
.atomic_nest_count = 0,
.in_flat_atomic = false,
.access_mask = 0,
.scoped_accesses = {LIST_POISON1, NULL},
},
#endif
init/initramfs.c
@@ -607,7 +607,7 @@ void __weak __init free_initrd_mem(unsigned long start, unsigned long end)
unsigned long aligned_start = ALIGN_DOWN(start, PAGE_SIZE);
unsigned long aligned_end = ALIGN(end, PAGE_SIZE);

memblock_free(__pa(aligned_start), aligned_end - aligned_start);
memblock_free((void *)aligned_start, aligned_end - aligned_start);
#endif

free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM,
init/main.c (32 changed lines)
@@ -83,7 +83,6 @@
#include <linux/ptrace.h>
#include <linux/pti.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/sched/clock.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
@@ -382,7 +381,7 @@ static char * __init xbc_make_cmdline(const char *key)
ret = xbc_snprint_cmdline(new_cmdline, len + 1, root);
if (ret < 0 || ret > len) {
pr_err("Failed to print extra kernel cmdline.\n");
memblock_free_ptr(new_cmdline, len + 1);
memblock_free(new_cmdline, len + 1);
return NULL;
}

@@ -410,7 +409,7 @@ static void __init setup_boot_config(void)
const char *msg;
int pos;
u32 size, csum;
char *data, *copy, *err;
char *data, *err;
int ret;

/* Cut out the bootconfig data even if we have no bootconfig option */
@@ -443,16 +442,7 @@ static void __init setup_boot_config(void)
return;
}

copy = memblock_alloc(size + 1, SMP_CACHE_BYTES);
if (!copy) {
pr_err("Failed to allocate memory for bootconfig\n");
return;
}

memcpy(copy, data, size);
copy[size] = '\0';

ret = xbc_init(copy, &msg, &pos);
ret = xbc_init(data, size, &msg, &pos);
if (ret < 0) {
if (pos < 0)
pr_err("Failed to init bootconfig: %s.\n", msg);
@@ -460,6 +450,7 @@ static void __init setup_boot_config(void)
pr_err("Failed to parse bootconfig: %s at %d.\n",
msg, pos);
} else {
xbc_get_info(&ret, NULL);
pr_info("Load bootconfig: %d bytes %d nodes\n", size, ret);
/* keys starting with "kernel." are passed via cmdline */
extra_command_line = xbc_make_cmdline("kernel");
@@ -471,7 +462,7 @@ static void __init setup_boot_config(void)

static void __init exit_boot_config(void)
{
xbc_destroy_all();
xbc_exit();
}

#else /* !CONFIG_BOOT_CONFIG */
@@ -843,12 +834,15 @@ static void __init mm_init(void)
init_mem_debugging_and_hardening();
kfence_alloc_pool();
report_meminit();
stack_depot_init();
stack_depot_early_init();
mem_init();
mem_init_print_info();
/* page_owner must be initialized after buddy is ready */
page_ext_init_flatmem_late();
kmem_cache_init();
/*
* page_owner must be initialized after buddy is ready, and also after
* slab is ready so that stack_depot_init() works properly
*/
page_ext_init_flatmem_late();
kmemleak_init();
pgtable_init();
debug_objects_mem_init();
@@ -927,7 +921,7 @@ static void __init print_unknown_bootoptions(void)
/* Start at unknown_options[1] to skip the initial space */
pr_notice("Unknown kernel command line parameters \"%s\", will be passed to user space.\n",
&unknown_options[1]);
memblock_free_ptr(unknown_options, len);
memblock_free(unknown_options, len);
}

asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
@@ -1508,6 +1502,8 @@ static int __ref kernel_init(void *unused)
kernel_init_freeable();
/* need to finish all async __init code before freeing the memory */
async_synchronize_full();

system_state = SYSTEM_FREEING_INITMEM;
kprobe_free_init_mem();
ftrace_free_init_mem();
kgdb_free_init_mem();
ipc/ipc_sysctl.c
@@ -10,6 +10,7 @@
#include <linux/nsproxy.h>
#include <linux/sysctl.h>
#include <linux/uaccess.h>
#include <linux/capability.h>
#include <linux/ipc_namespace.h>
#include <linux/msg.h>
#include "util.h"
@@ -22,7 +23,6 @@ static void *get_ipc(struct ctl_table *table)
return which;
}

#ifdef CONFIG_PROC_SYSCTL
static int proc_ipc_dointvec(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
@@ -104,13 +104,17 @@ static int proc_ipc_sem_dointvec(struct ctl_table *table, int write,
return ret;
}

#else
#define proc_ipc_doulongvec_minmax NULL
#define proc_ipc_dointvec NULL
#define proc_ipc_dointvec_minmax NULL
#define proc_ipc_dointvec_minmax_orphans NULL
#define proc_ipc_auto_msgmni NULL
#define proc_ipc_sem_dointvec NULL
#ifdef CONFIG_CHECKPOINT_RESTORE
static int proc_ipc_dointvec_minmax_checkpoint_restore(struct ctl_table *table,
int write, void *buffer, size_t *lenp, loff_t *ppos)
{
struct user_namespace *user_ns = current->nsproxy->ipc_ns->user_ns;

if (write && !checkpoint_restore_ns_capable(user_ns))
return -EPERM;

return proc_ipc_dointvec_minmax(table, write, buffer, lenp, ppos);
}
#endif

int ipc_mni = IPCMNI;
@@ -198,8 +202,8 @@ static struct ctl_table ipc_kern_table[] = {
.procname = "sem_next_id",
.data = &init_ipc_ns.ids[IPC_SEM_IDS].next_id,
.maxlen = sizeof(init_ipc_ns.ids[IPC_SEM_IDS].next_id),
.mode = 0644,
.proc_handler = proc_ipc_dointvec_minmax,
.mode = 0666,
.proc_handler = proc_ipc_dointvec_minmax_checkpoint_restore,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_INT_MAX,
},
@@ -207,8 +211,8 @@ static struct ctl_table ipc_kern_table[] = {
.procname = "msg_next_id",
.data = &init_ipc_ns.ids[IPC_MSG_IDS].next_id,
.maxlen = sizeof(init_ipc_ns.ids[IPC_MSG_IDS].next_id),
.mode = 0644,
.proc_handler = proc_ipc_dointvec_minmax,
.mode = 0666,
.proc_handler = proc_ipc_dointvec_minmax_checkpoint_restore,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_INT_MAX,
},
@@ -216,8 +220,8 @@ static struct ctl_table ipc_kern_table[] = {
.procname = "shm_next_id",
.data = &init_ipc_ns.ids[IPC_SHM_IDS].next_id,
.maxlen = sizeof(init_ipc_ns.ids[IPC_SHM_IDS].next_id),
.mode = 0644,
.proc_handler = proc_ipc_dointvec_minmax,
.mode = 0666,
.proc_handler = proc_ipc_dointvec_minmax_checkpoint_restore,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_INT_MAX,
},
ipc/shm.c
@@ -330,9 +330,6 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
shm_unlock(shp);
if (!is_file_hugepages(shm_file))
shmem_lock(shm_file, 0, shp->mlock_ucounts);
else if (shp->mlock_ucounts)
user_shm_unlock(i_size_read(file_inode(shm_file)),
shp->mlock_ucounts);
fput(shm_file);
ipc_update_pid(&shp->shm_cprid, NULL);
ipc_update_pid(&shp->shm_lprid, NULL);
@@ -742,8 +739,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
if (shmflg & SHM_NORESERVE)
acctflag = VM_NORESERVE;
file = hugetlb_file_setup(name, hugesize, acctflag,
&shp->mlock_ucounts, HUGETLB_SHMFS_INODE,
(shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
HUGETLB_SHMFS_INODE, (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
} else {
/*
* Do not allow no accounting for OVERCOMMIT_NEVER, even
@@ -794,8 +790,6 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
no_id:
ipc_update_pid(&shp->shm_cprid, NULL);
ipc_update_pid(&shp->shm_lprid, NULL);
if (is_file_hugepages(file) && shp->mlock_ucounts)
user_shm_unlock(size, shp->mlock_ucounts);
fput(file);
ipc_rcu_putref(&shp->shm_perm, shm_rcu_free);
return error;
kernel/Kconfig.preempt
@@ -1,11 +1,23 @@
# SPDX-License-Identifier: GPL-2.0-only

config PREEMPT_NONE_BUILD
bool

config PREEMPT_VOLUNTARY_BUILD
bool

config PREEMPT_BUILD
bool
select PREEMPTION
select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK

choice
prompt "Preemption Model"
default PREEMPT_NONE

config PREEMPT_NONE
bool "No Forced Preemption (Server)"
select PREEMPT_NONE_BUILD if !PREEMPT_DYNAMIC
help
This is the traditional Linux preemption model, geared towards
throughput. It will still provide good latencies most of the
@@ -20,6 +32,7 @@ config PREEMPT_NONE
config PREEMPT_VOLUNTARY
bool "Voluntary Kernel Preemption (Desktop)"
depends on !ARCH_NO_PREEMPT
select PREEMPT_VOLUNTARY_BUILD if !PREEMPT_DYNAMIC
help
This option reduces the latency of the kernel by adding more
"explicit preemption points" to the kernel code. These new
@@ -38,9 +51,7 @@ config PREEMPT_VOLUNTARY
config PREEMPT
bool "Preemptible Kernel (Low-Latency Desktop)"
depends on !ARCH_NO_PREEMPT
select PREEMPTION
select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
select PREEMPT_DYNAMIC if HAVE_PREEMPT_DYNAMIC
select PREEMPT_BUILD
help
This option reduces the latency of the kernel by making
all kernel code (that is not executing in a critical section)
@@ -83,7 +94,10 @@ config PREEMPTION
select PREEMPT_COUNT

config PREEMPT_DYNAMIC
bool
bool "Preemption behaviour defined on boot"
depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT
select PREEMPT_BUILD
default y
help
This option allows to define the preemption model on the kernel
command line parameter and thus override the default preemption
kernel/Makefile
@@ -59,7 +59,7 @@ obj-$(CONFIG_FREEZER) += freezer.o
obj-$(CONFIG_PROFILING) += profile.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += time/
obj-$(CONFIG_FUTEX) += futex.o
obj-$(CONFIG_FUTEX) += futex/
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
obj-$(CONFIG_SMP) += smp.o
ifneq ($(CONFIG_SMP),y)
@@ -67,6 +67,7 @@ obj-y += up.o
endif
obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_MODULE_DECOMPRESS) += module_decompress.o
obj-$(CONFIG_MODULE_SIG) += module_signing.o
obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
@@ -85,7 +86,6 @@ obj-$(CONFIG_PID_NS) += pid_namespace.o
obj-$(CONFIG_IKCONFIG) += configs.o
obj-$(CONFIG_IKHEADERS) += kheaders.o
obj-$(CONFIG_SMP) += stop_machine.o
obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
obj-$(CONFIG_AUDITSYSCALL) += auditsc.o audit_watch.o audit_fsnotify.o audit_tree.o
obj-$(CONFIG_GCOV_KERNEL) += gcov/
kernel/acct.c
@@ -60,7 +60,6 @@
#include <linux/sched/cputime.h>

#include <asm/div64.h>
#include <linux/blkdev.h> /* sector_div */
#include <linux/pid_namespace.h>
#include <linux/fs_pin.h>

kernel/audit.c
@@ -1468,7 +1468,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (err)
return err;
}
sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
sig_data = kmalloc(struct_size(sig_data, ctx, len), GFP_KERNEL);
if (!sig_data) {
if (audit_sig_sid)
security_release_secctx(ctx, len);
@@ -1481,7 +1481,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
security_release_secctx(ctx, len);
}
audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0,
sig_data, sizeof(*sig_data) + len);
sig_data, struct_size(sig_data, ctx, len));
kfree(sig_data);
break;
case AUDIT_TTY_GET: {
@@ -2171,7 +2171,7 @@ int audit_log_task_context(struct audit_buffer *ab)
int error;
u32 sid;

security_task_getsecid_subj(current, &sid);
security_current_getsecid_subj(&sid);
if (!sid)
return 0;

@@ -2392,7 +2392,7 @@ int audit_signal_info(int sig, struct task_struct *t)
audit_sig_uid = auid;
else
audit_sig_uid = uid;
security_task_getsecid_subj(current, &audit_sig_sid);
security_current_getsecid_subj(&audit_sig_sid);
}

return audit_signal_info_syscall(t);
kernel/audit.h
@@ -14,6 +14,7 @@
#include <linux/skbuff.h>
#include <uapi/linux/mqueue.h>
#include <linux/tty.h>
#include <uapi/linux/openat2.h> // struct open_how

/* AUDIT_NAMES is the number of slots we reserve in the audit_context
* for saving names from getname(). If we get more names we will allocate
@@ -100,10 +101,15 @@ struct audit_proctitle {
/* The per-task audit context. */
struct audit_context {
int dummy; /* must be the first element */
int in_syscall; /* 1 if task is in a syscall */
enum {
AUDIT_CTX_UNUSED, /* audit_context is currently unused */
AUDIT_CTX_SYSCALL, /* in use by syscall */
AUDIT_CTX_URING, /* in use by io_uring */
} context;
enum audit_state state, current_state;
unsigned int serial; /* serial number for record */
int major; /* syscall number */
int uring_op; /* uring operation */
struct timespec64 ctime; /* time of syscall entry */
unsigned long argv[4]; /* syscall arguments */
long return_code;/* syscall return code */
@@ -188,6 +194,7 @@ struct audit_context {
int fd;
int flags;
} mmap;
struct open_how openat2;
struct {
int argc;
} execve;
kernel/audit_fsnotify.c
@@ -160,8 +160,7 @@ static int audit_mark_handle_event(struct fsnotify_mark *inode_mark, u32 mask,

audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark);

if (WARN_ON_ONCE(inode_mark->group != audit_fsnotify_group) ||
WARN_ON_ONCE(!inode))
if (WARN_ON_ONCE(inode_mark->group != audit_fsnotify_group))
return 0;

if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) {
kernel/audit_tree.c
@@ -30,7 +30,7 @@ struct audit_chunk {
int count;
atomic_long_t refs;
struct rcu_head head;
struct node {
struct audit_node {
struct list_head list;
struct audit_tree *owner;
unsigned index; /* index; upper bit indicates 'will prune' */
@@ -94,7 +94,7 @@ static struct audit_tree *alloc_tree(const char *s)
{
struct audit_tree *tree;

tree = kmalloc(sizeof(struct audit_tree) + strlen(s) + 1, GFP_KERNEL);
tree = kmalloc(struct_size(tree, pathname, strlen(s) + 1), GFP_KERNEL);
if (tree) {
refcount_set(&tree->count, 1);
tree->goner = 0;
@@ -269,7 +269,7 @@ bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree)

/* tagging and untagging inodes with trees */

static struct audit_chunk *find_chunk(struct node *p)
static struct audit_chunk *find_chunk(struct audit_node *p)
{
int index = p->index & ~(1U<<31);
p -= index;
@@ -322,7 +322,7 @@ static void replace_chunk(struct audit_chunk *new, struct audit_chunk *old)
list_replace_rcu(&old->hash, &new->hash);
}

static void remove_chunk_node(struct audit_chunk *chunk, struct node *p)
static void remove_chunk_node(struct audit_chunk *chunk, struct audit_node *p)
{
struct audit_tree *owner = p->owner;

@@ -459,7 +459,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
{
struct fsnotify_mark *mark;
struct audit_chunk *chunk, *old;
struct node *p;
struct audit_node *p;
int n;

mutex_lock(&audit_tree_group->mark_mutex);
@@ -570,11 +570,11 @@ static void prune_tree_chunks(struct audit_tree *victim, bool tagged)
{
spin_lock(&hash_lock);
while (!list_empty(&victim->chunks)) {
struct node *p;
struct audit_node *p;
struct audit_chunk *chunk;
struct fsnotify_mark *mark;

p = list_first_entry(&victim->chunks, struct node, list);
p = list_first_entry(&victim->chunks, struct audit_node, list);
/* have we run out of marked? */
if (tagged && !(p->index & (1U<<31)))
break;
@@ -616,7 +616,7 @@ static void trim_marked(struct audit_tree *tree)
}
/* reorder */
for (p = tree->chunks.next; p != &tree->chunks; p = q) {
struct node *node = list_entry(p, struct node, list);
struct audit_node *node = list_entry(p, struct audit_node, list);
q = p->next;
if (node->index & (1U<<31)) {
list_del_init(p);
@@ -684,7 +684,7 @@ void audit_trim_trees(void)
struct audit_tree *tree;
struct path path;
struct vfsmount *root_mnt;
struct node *node;
struct audit_node *node;
int err;

tree = container_of(cursor.next, struct audit_tree, list);
@@ -726,7 +726,8 @@ int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
{

if (pathname[0] != '/' ||
rule->listnr != AUDIT_FILTER_EXIT ||
(rule->listnr != AUDIT_FILTER_EXIT &&
rule->listnr != AUDIT_FILTER_URING_EXIT) ||
op != Audit_equal ||
rule->inode_f || rule->watch || rule->tree)
return -EINVAL;
@@ -839,7 +840,7 @@ int audit_add_tree_rule(struct audit_krule *rule)
drop_collected_mounts(mnt);

if (!err) {
struct node *node;
struct audit_node *node;
spin_lock(&hash_lock);
list_for_each_entry(node, &tree->chunks, list)
node->index &= ~(1U<<31);
@@ -938,7 +939,7 @@ int audit_tag_tree(char *old, char *new)
mutex_unlock(&audit_filter_mutex);

if (!failed) {
struct node *node;
struct audit_node *node;
spin_lock(&hash_lock);
list_for_each_entry(node, &tree->chunks, list)
node->index &= ~(1U<<31);
kernel/audit_watch.c
@@ -183,7 +183,8 @@ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
return -EOPNOTSUPP;

if (path[0] != '/' || path[len-1] == '/' ||
krule->listnr != AUDIT_FILTER_EXIT ||
(krule->listnr != AUDIT_FILTER_EXIT &&
krule->listnr != AUDIT_FILTER_URING_EXIT) ||
op != Audit_equal ||
krule->inode_f || krule->watch || krule->tree)
return -EINVAL;
@@ -472,8 +473,7 @@ static int audit_watch_handle_event(struct fsnotify_mark *inode_mark, u32 mask,

parent = container_of(inode_mark, struct audit_parent, mark);

if (WARN_ON_ONCE(inode_mark->group != audit_watch_group) ||
WARN_ON_ONCE(!inode))
if (WARN_ON_ONCE(inode_mark->group != audit_watch_group))
return 0;

if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
kernel/auditfilter.c
@@ -44,7 +44,8 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
LIST_HEAD_INIT(audit_filter_list[4]),
LIST_HEAD_INIT(audit_filter_list[5]),
LIST_HEAD_INIT(audit_filter_list[6]),
#if AUDIT_NR_FILTERS != 7
LIST_HEAD_INIT(audit_filter_list[7]),
#if AUDIT_NR_FILTERS != 8
#error Fix audit_filter_list initialiser
#endif
};
@@ -56,6 +57,7 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
LIST_HEAD_INIT(audit_rules_list[4]),
LIST_HEAD_INIT(audit_rules_list[5]),
LIST_HEAD_INIT(audit_rules_list[6]),
LIST_HEAD_INIT(audit_rules_list[7]),
};

DEFINE_MUTEX(audit_filter_mutex);
@@ -151,7 +153,8 @@ char *audit_unpack_string(void **bufp, size_t *remain, size_t len)
static inline int audit_to_inode(struct audit_krule *krule,
struct audit_field *f)
{
if (krule->listnr != AUDIT_FILTER_EXIT ||
if ((krule->listnr != AUDIT_FILTER_EXIT &&
krule->listnr != AUDIT_FILTER_URING_EXIT) ||
krule->inode_f || krule->watch || krule->tree ||
(f->op != Audit_equal && f->op != Audit_not_equal))
return -EINVAL;
@@ -248,6 +251,7 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *
pr_err("AUDIT_FILTER_ENTRY is deprecated\n");
goto exit_err;
case AUDIT_FILTER_EXIT:
case AUDIT_FILTER_URING_EXIT:
case AUDIT_FILTER_TASK:
#endif
case AUDIT_FILTER_USER:
@@ -332,6 +336,10 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
if (entry->rule.listnr != AUDIT_FILTER_FS)
return -EINVAL;
break;
case AUDIT_PERM:
if (entry->rule.listnr == AUDIT_FILTER_URING_EXIT)
return -EINVAL;
break;
}

switch (entry->rule.listnr) {
@@ -629,7 +637,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
void *bufp;
int i;

data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL);
data = kmalloc(struct_size(data, buf, krule->buflen), GFP_KERNEL);
if (unlikely(!data))
return NULL;
memset(data, 0, sizeof(*data));
@@ -980,7 +988,8 @@ static inline int audit_add_rule(struct audit_entry *entry)
}

entry->rule.prio = ~0ULL;
if (entry->rule.listnr == AUDIT_FILTER_EXIT) {
if (entry->rule.listnr == AUDIT_FILTER_EXIT ||
entry->rule.listnr == AUDIT_FILTER_URING_EXIT) {
if (entry->rule.flags & AUDIT_FILTER_PREPEND)
entry->rule.prio = ++prio_high;
else
@@ -1083,7 +1092,7 @@ static void audit_list_rules(int seq, struct sk_buff_head *q)
break;
skb = audit_make_reply(seq, AUDIT_LIST_RULES, 0, 1,
data,
sizeof(*data) + data->buflen);
struct_size(data, buf, data->buflen));
if (skb)
skb_queue_tail(q, skb);
kfree(data);
@@ -1359,8 +1368,7 @@ int audit_filter(int msgtype, unsigned int listtype)
case AUDIT_SUBJ_SEN:
case AUDIT_SUBJ_CLR:
if (f->lsm_rule) {
security_task_getsecid_subj(current,
&sid);
security_current_getsecid_subj(&sid);
result = security_audit_rule_match(sid,
f->type, f->op, f->lsm_rule);
}
kernel/auditsc.c (476 changed lines)
@ -1,3 +1,4 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
/* auditsc.c -- System-call auditing support
|
||||
* Handles all system-call specific auditing features.
|
||||
*
|
||||
@ -6,20 +7,6 @@
|
||||
* Copyright (C) 2005, 2006 IBM Corporation
|
||||
* All Rights Reserved.
|
||||
*
|
||||
* This program is free software; you can redistribute it and/or modify
|
||||
* it under the terms of the GNU General Public License as published by
|
||||
* the Free Software Foundation; either version 2 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License
|
||||
* along with this program; if not, write to the Free Software
|
||||
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
*
|
||||
* Written by Rickard E. (Rik) Faith <faith@redhat.com>
|
||||
*
|
||||
* Many of the ideas implemented here are from Stephen C. Tweedie,
|
||||
@ -76,6 +63,7 @@
|
||||
#include <linux/fsnotify_backend.h>
|
||||
#include <uapi/linux/limits.h>
|
||||
#include <uapi/linux/netfilter/nf_tables.h>
|
||||
#include <uapi/linux/openat2.h> // struct open_how
|
||||
|
||||
#include "audit.h"
|
||||
|
||||
@ -166,7 +154,7 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
|
||||
n = ctx->major;
|
||||
|
||||
switch (audit_classify_syscall(ctx->arch, n)) {
|
||||
case 0: /* native */
|
||||
case AUDITSC_NATIVE:
|
||||
if ((mask & AUDIT_PERM_WRITE) &&
|
||||
audit_match_class(AUDIT_CLASS_WRITE, n))
|
||||
return 1;
|
||||
@ -177,7 +165,7 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
|
||||
audit_match_class(AUDIT_CLASS_CHATTR, n))
|
||||
return 1;
|
||||
return 0;
|
||||
case 1: /* 32bit on biarch */
|
||||
case AUDITSC_COMPAT: /* 32bit on biarch */
|
||||
if ((mask & AUDIT_PERM_WRITE) &&
|
||||
audit_match_class(AUDIT_CLASS_WRITE_32, n))
|
||||
return 1;
|
||||
@ -188,14 +176,16 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
|
||||
audit_match_class(AUDIT_CLASS_CHATTR_32, n))
|
||||
return 1;
|
||||
return 0;
|
||||
case 2: /* open */
|
||||
case AUDITSC_OPEN:
|
||||
return mask & ACC_MODE(ctx->argv[1]);
|
||||
case 3: /* openat */
|
||||
case AUDITSC_OPENAT:
|
||||
return mask & ACC_MODE(ctx->argv[2]);
|
||||
case 4: /* socketcall */
|
||||
case AUDITSC_SOCKETCALL:
|
||||
return ((mask & AUDIT_PERM_WRITE) && ctx->argv[0] == SYS_BIND);
|
||||
case 5: /* execve */
|
||||
case AUDITSC_EXECVE:
|
||||
return mask & AUDIT_PERM_EXEC;
|
||||
case AUDITSC_OPENAT2:
|
||||
return mask & ACC_MODE((u32)ctx->openat2.flags);
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
@ -480,6 +470,9 @@ static int audit_filter_rules(struct task_struct *tsk,
|
||||
u32 sid;
|
||||
unsigned int sessionid;
|
||||
|
||||
if (ctx && rule->prio <= ctx->prio)
|
||||
return 0;
|
||||
|
||||
cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation);
|
||||
|
||||
for (i = 0; i < rule->field_count; i++) {
|
||||
@ -673,7 +666,16 @@ static int audit_filter_rules(struct task_struct *tsk,
|
||||
logged upon error */
|
||||
if (f->lsm_rule) {
|
||||
if (need_sid) {
|
||||
security_task_getsecid_subj(tsk, &sid);
|
||||
/* @tsk should always be equal to
|
||||
* @current with the exception of
|
||||
* fork()/copy_process() in which case
|
||||
* the new @tsk creds are still a dup
|
||||
* of @current's creds so we can still
|
||||
* use security_current_getsecid_subj()
|
||||
* here even though it always refs
|
||||
* @current's creds
|
||||
*/
|
||||
security_current_getsecid_subj(&sid);
|
||||
need_sid = 0;
|
||||
}
|
||||
result = security_audit_rule_match(sid, f->type,
|
||||
@ -747,8 +749,6 @@ static int audit_filter_rules(struct task_struct *tsk,
|
||||
}
|
||||
|
||||
if (ctx) {
|
||||
if (rule->prio <= ctx->prio)
|
||||
return 0;
|
||||
if (rule->filterkey) {
|
||||
kfree(ctx->filterkey);
|
||||
ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC);
|
||||
@ -805,6 +805,34 @@ static int audit_in_mask(const struct audit_krule *rule, unsigned long val)
|
||||
return rule->mask[word] & bit;
|
||||
}
|
||||
|
||||
/**
|
||||
* audit_filter_uring - apply filters to an io_uring operation
|
||||
* @tsk: associated task
|
||||
* @ctx: audit context
|
||||
*/
|
||||
static void audit_filter_uring(struct task_struct *tsk,
|
||||
struct audit_context *ctx)
|
||||
{
|
||||
struct audit_entry *e;
|
||||
enum audit_state state;
|
||||
|
||||
if (auditd_test_task(tsk))
|
||||
return;
|
||||
|
||||
rcu_read_lock();
|
||||
list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_URING_EXIT],
|
||||
list) {
|
||||
if (audit_in_mask(&e->rule, ctx->uring_op) &&
|
||||
audit_filter_rules(tsk, &e->rule, ctx, NULL, &state,
|
||||
false)) {
|
||||
rcu_read_unlock();
|
||||
ctx->current_state = state;
|
||||
return;
|
||||
}
|
||||
}
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
/* At syscall exit time, this filter is called if the audit_state is
|
||||
* not low enough that auditing cannot take place, but is also not
|
||||
* high enough that we already know we have to write an audit record
|
||||
@ -915,10 +943,81 @@ static inline void audit_free_aux(struct audit_context *context)
|
||||
context->aux = aux->next;
|
||||
kfree(aux);
|
||||
}
|
||||
context->aux = NULL;
|
||||
while ((aux = context->aux_pids)) {
|
||||
context->aux_pids = aux->next;
|
||||
kfree(aux);
|
||||
}
|
||||
context->aux_pids = NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* audit_reset_context - reset a audit_context structure
|
||||
* @ctx: the audit_context to reset
|
||||
*
|
||||
* All fields in the audit_context will be reset to an initial state, all
|
||||
* references held by fields will be dropped, and private memory will be
|
||||
* released. When this function returns the audit_context will be suitable
|
||||
* for reuse, so long as the passed context is not NULL or a dummy context.
|
||||
*/
|
||||
static void audit_reset_context(struct audit_context *ctx)
|
||||
{
|
||||
if (!ctx)
|
||||
return;
|
||||
|
||||
/* if ctx is non-null, reset the "ctx->state" regardless */
|
||||
ctx->context = AUDIT_CTX_UNUSED;
|
||||
if (ctx->dummy)
|
||||
return;
|
||||
|
||||
/*
|
||||
* NOTE: It shouldn't matter in what order we release the fields, so
|
||||
* release them in the order in which they appear in the struct;
|
||||
* this gives us some hope of quickly making sure we are
|
||||
* resetting the audit_context properly.
|
||||
*
|
||||
* Other things worth mentioning:
|
||||
* - we don't reset "dummy"
|
||||
* - we don't reset "state", we do reset "current_state"
|
||||
* - we preserve "filterkey" if "state" is AUDIT_STATE_RECORD
|
||||
* - much of this is likely overkill, but play it safe for now
|
||||
* - we really need to work on improving the audit_context struct
|
||||
*/
|
||||
|
||||
ctx->current_state = ctx->state;
|
||||
ctx->serial = 0;
|
||||
ctx->major = 0;
|
||||
ctx->uring_op = 0;
|
||||
ctx->ctime = (struct timespec64){ .tv_sec = 0, .tv_nsec = 0 };
|
||||
memset(ctx->argv, 0, sizeof(ctx->argv));
|
||||
ctx->return_code = 0;
|
||||
ctx->prio = (ctx->state == AUDIT_STATE_RECORD ? ~0ULL : 0);
|
||||
ctx->return_valid = AUDITSC_INVALID;
|
||||
audit_free_names(ctx);
|
||||
if (ctx->state != AUDIT_STATE_RECORD) {
|
||||
kfree(ctx->filterkey);
|
||||
ctx->filterkey = NULL;
|
||||
}
|
||||
audit_free_aux(ctx);
|
||||
kfree(ctx->sockaddr);
|
||||
ctx->sockaddr = NULL;
|
||||
ctx->sockaddr_len = 0;
|
||||
ctx->pid = ctx->ppid = 0;
|
||||
ctx->uid = ctx->euid = ctx->suid = ctx->fsuid = KUIDT_INIT(0);
|
||||
ctx->gid = ctx->egid = ctx->sgid = ctx->fsgid = KGIDT_INIT(0);
|
||||
ctx->personality = 0;
|
||||
ctx->arch = 0;
|
||||
ctx->target_pid = 0;
|
||||
ctx->target_auid = ctx->target_uid = KUIDT_INIT(0);
|
||||
ctx->target_sessionid = 0;
|
||||
ctx->target_sid = 0;
|
||||
ctx->target_comm[0] = '\0';
|
||||
unroll_tree_refs(ctx, NULL, 0);
|
||||
WARN_ON(!list_empty(&ctx->killed_trees));
|
||||
ctx->type = 0;
|
||||
audit_free_module(ctx);
|
||||
ctx->fds[0] = -1;
|
||||
audit_proctitle_free(ctx);
|
||||
}
|
||||
|
||||
static inline struct audit_context *audit_alloc_context(enum audit_state state)
|
||||
@ -928,6 +1027,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
|
||||
context = kzalloc(sizeof(*context), GFP_KERNEL);
|
||||
if (!context)
|
||||
return NULL;
|
||||
context->context = AUDIT_CTX_UNUSED;
|
||||
context->state = state;
|
||||
context->prio = state == AUDIT_STATE_RECORD ? ~0ULL : 0;
|
||||
INIT_LIST_HEAD(&context->killed_trees);
|
||||
@ -953,7 +1053,7 @@ int audit_alloc(struct task_struct *tsk)
|
||||
char *key = NULL;
|
||||
|
||||
if (likely(!audit_ever_enabled))
|
||||
return 0; /* Return if not auditing. */
|
||||
return 0;
|
||||
|
||||
state = audit_filter_task(tsk, &key);
|
||||
if (state == AUDIT_STATE_DISABLED) {
|
||||
@ -973,16 +1073,37 @@ int audit_alloc(struct task_struct *tsk)
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* audit_alloc_kernel - allocate an audit_context for a kernel task
|
||||
* @tsk: the kernel task
|
||||
*
|
||||
* Similar to the audit_alloc() function, but intended for kernel private
|
||||
* threads. Returns zero on success, negative values on failure.
|
||||
*/
|
||||
int audit_alloc_kernel(struct task_struct *tsk)
|
||||
{
|
||||
/*
|
||||
* At the moment we are just going to call into audit_alloc() to
|
||||
* simplify the code, but there two things to keep in mind with this
|
||||
* approach:
|
||||
*
|
||||
* 1. Filtering internal kernel tasks is a bit laughable in almost all
|
||||
* cases, but there is at least one case where there is a benefit:
|
||||
* the '-a task,never' case allows the admin to effectively disable
|
||||
* task auditing at runtime.
|
||||
*
|
||||
* 2. The {set,clear}_task_syscall_work() ops likely have zero effect
|
||||
* on these internal kernel tasks, but they probably don't hurt either.
|
||||
*/
|
||||
return audit_alloc(tsk);
|
||||
}
|
||||
|
||||
static inline void audit_free_context(struct audit_context *context)
|
||||
{
|
||||
audit_free_module(context);
|
||||
audit_free_names(context);
|
||||
unroll_tree_refs(context, NULL, 0);
|
||||
/* resetting is extra work, but it is likely just noise */
|
||||
audit_reset_context(context);
|
||||
free_tree_refs(context);
|
||||
audit_free_aux(context);
|
||||
kfree(context->filterkey);
|
||||
kfree(context->sockaddr);
|
||||
audit_proctitle_free(context);
|
||||
kfree(context);
|
||||
}
|
||||
|
||||
@ -1316,6 +1437,12 @@ static void show_special(struct audit_context *context, int *call_panic)
|
||||
audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd,
|
||||
context->mmap.flags);
|
||||
break;
|
||||
case AUDIT_OPENAT2:
|
||||
audit_log_format(ab, "oflag=0%llo mode=0%llo resolve=0x%llx",
|
||||
context->openat2.flags,
|
||||
context->openat2.mode,
|
||||
context->openat2.resolve);
|
||||
break;
|
||||
case AUDIT_EXECVE:
|
||||
audit_log_execve_info(context, &ab);
|
||||
break;
|
||||
@ -1479,6 +1606,44 @@ static void audit_log_proctitle(void)
|
||||
audit_log_end(ab);
|
||||
}
|
||||
|
||||
/**
|
||||
* audit_log_uring - generate a AUDIT_URINGOP record
|
||||
* @ctx: the audit context
|
||||
*/
|
||||
static void audit_log_uring(struct audit_context *ctx)
|
||||
{
|
||||
struct audit_buffer *ab;
|
||||
const struct cred *cred;
|
||||
|
||||
ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_URINGOP);
|
||||
if (!ab)
|
||||
return;
|
||||
cred = current_cred();
|
||||
audit_log_format(ab, "uring_op=%d", ctx->uring_op);
|
||||
if (ctx->return_valid != AUDITSC_INVALID)
|
||||
audit_log_format(ab, " success=%s exit=%ld",
|
||||
(ctx->return_valid == AUDITSC_SUCCESS ?
|
||||
"yes" : "no"),
|
||||
ctx->return_code);
|
||||
audit_log_format(ab,
|
||||
" items=%d"
|
||||
" ppid=%d pid=%d uid=%u gid=%u euid=%u suid=%u"
|
||||
" fsuid=%u egid=%u sgid=%u fsgid=%u",
|
||||
ctx->name_count,
|
||||
task_ppid_nr(current), task_tgid_nr(current),
|
||||
from_kuid(&init_user_ns, cred->uid),
|
||||
from_kgid(&init_user_ns, cred->gid),
|
||||
from_kuid(&init_user_ns, cred->euid),
|
||||
from_kuid(&init_user_ns, cred->suid),
|
||||
from_kuid(&init_user_ns, cred->fsuid),
|
||||
from_kgid(&init_user_ns, cred->egid),
|
||||
from_kgid(&init_user_ns, cred->sgid),
|
||||
from_kgid(&init_user_ns, cred->fsgid));
|
||||
audit_log_task_context(ab);
|
||||
audit_log_key(ab, ctx->filterkey);
|
||||
audit_log_end(ab);
|
||||
}
|
||||
|
||||
static void audit_log_exit(void)
|
||||
{
|
||||
int i, call_panic = 0;
|
||||
@ -1489,18 +1654,20 @@ static void audit_log_exit(void)
|
||||
|
||||
context->personality = current->personality;
|
||||
|
||||
switch (context->context) {
|
||||
case AUDIT_CTX_SYSCALL:
|
||||
ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
|
||||
if (!ab)
|
||||
return; /* audit_panic has been called */
|
||||
return;
|
||||
audit_log_format(ab, "arch=%x syscall=%d",
|
||||
context->arch, context->major);
|
||||
if (context->personality != PER_LINUX)
|
||||
audit_log_format(ab, " per=%lx", context->personality);
|
||||
if (context->return_valid != AUDITSC_INVALID)
|
||||
audit_log_format(ab, " success=%s exit=%ld",
|
||||
(context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
|
||||
(context->return_valid == AUDITSC_SUCCESS ?
|
||||
"yes" : "no"),
|
||||
context->return_code);
|
||||
|
||||
audit_log_format(ab,
|
||||
" a0=%lx a1=%lx a2=%lx a3=%lx items=%d",
|
||||
context->argv[0],
|
||||
@ -1508,10 +1675,17 @@ static void audit_log_exit(void)
|
||||
context->argv[2],
|
||||
context->argv[3],
|
||||
context->name_count);
|
||||
|
||||
audit_log_task_info(ab);
|
||||
audit_log_key(ab, context->filterkey);
|
||||
audit_log_end(ab);
|
||||
break;
|
||||
case AUDIT_CTX_URING:
|
||||
audit_log_uring(context);
|
||||
break;
|
||||
default:
|
||||
BUG();
|
||||
break;
|
||||
}
|
||||
|
||||
for (aux = context->aux; aux; aux = aux->next) {
|
||||
|
||||
@ -1602,6 +1776,7 @@ static void audit_log_exit(void)
|
||||
audit_log_name(context, n, NULL, i++, &call_panic);
|
||||
}
|
||||
|
||||
if (context->context == AUDIT_CTX_SYSCALL)
|
||||
audit_log_proctitle();
|
||||
|
||||
/* Send end of event record to help user space know we are finished */
|
||||
@ -1609,14 +1784,14 @@ static void audit_log_exit(void)
|
||||
if (ab)
|
||||
audit_log_end(ab);
|
||||
if (call_panic)
|
||||
audit_panic("error converting sid to string");
|
||||
audit_panic("error in audit_log_exit()");
|
||||
}
|
||||
|
||||
/**
|
||||
* __audit_free - free a per-task audit context
|
||||
* @tsk: task whose audit context block to free
|
||||
*
|
||||
* Called from copy_process and do_exit
|
||||
* Called from copy_process, do_exit, and the io_uring code
|
||||
*/
|
||||
void __audit_free(struct task_struct *tsk)
|
||||
{
|
||||
@ -1625,6 +1800,7 @@ void __audit_free(struct task_struct *tsk)
|
||||
if (!context)
|
||||
return;
|
||||
|
||||
/* this may generate CONFIG_CHANGE records */
|
||||
if (!list_empty(&context->killed_trees))
|
||||
audit_kill_trees(context);
|
||||
|
||||
@ -1633,20 +1809,152 @@ void __audit_free(struct task_struct *tsk)
|
||||
* random task_struct that doesn't doesn't have any meaningful data we
|
||||
* need to log via audit_log_exit().
|
||||
*/
|
||||
if (tsk == current && !context->dummy && context->in_syscall) {
|
||||
if (tsk == current && !context->dummy) {
|
||||
context->return_valid = AUDITSC_INVALID;
|
||||
context->return_code = 0;
|
||||
|
||||
if (context->context == AUDIT_CTX_SYSCALL) {
|
||||
audit_filter_syscall(tsk, context);
|
||||
audit_filter_inodes(tsk, context);
|
||||
if (context->current_state == AUDIT_STATE_RECORD)
|
||||
audit_log_exit();
|
||||
} else if (context->context == AUDIT_CTX_URING) {
|
||||
/* TODO: verify this case is real and valid */
|
||||
audit_filter_uring(tsk, context);
|
||||
audit_filter_inodes(tsk, context);
|
||||
if (context->current_state == AUDIT_STATE_RECORD)
|
||||
audit_log_uring(context);
|
||||
}
|
||||
}
|
||||
|
||||
audit_set_context(tsk, NULL);
|
||||
audit_free_context(context);
|
||||
}
|
||||
|
||||
/**
|
||||
* audit_return_fixup - fixup the return codes in the audit_context
|
||||
* @ctx: the audit_context
|
||||
* @success: true/false value to indicate if the operation succeeded or not
|
||||
* @code: operation return code
|
||||
*
|
||||
* We need to fixup the return code in the audit logs if the actual return
|
||||
* codes are later going to be fixed by the arch specific signal handlers.
|
||||
*/
|
||||
static void audit_return_fixup(struct audit_context *ctx,
|
||||
int success, long code)
|
||||
{
|
||||
/*
|
||||
* This is actually a test for:
|
||||
* (rc == ERESTARTSYS ) || (rc == ERESTARTNOINTR) ||
|
||||
* (rc == ERESTARTNOHAND) || (rc == ERESTART_RESTARTBLOCK)
|
||||
*
|
||||
* but is faster than a bunch of ||
|
||||
*/
|
||||
if (unlikely(code <= -ERESTARTSYS) &&
|
||||
(code >= -ERESTART_RESTARTBLOCK) &&
|
||||
(code != -ENOIOCTLCMD))
|
||||
ctx->return_code = -EINTR;
|
||||
else
|
||||
ctx->return_code = code;
|
||||
ctx->return_valid = (success ? AUDITSC_SUCCESS : AUDITSC_FAILURE);
|
||||
}
|
||||
|
||||
/**
|
||||
* __audit_uring_entry - prepare the kernel task's audit context for io_uring
|
||||
* @op: the io_uring opcode
|
||||
*
|
||||
* This is similar to audit_syscall_entry() but is intended for use by io_uring
|
||||
* operations. This function should only ever be called from
|
||||
* audit_uring_entry() as we rely on the audit context checking present in that
|
||||
* function.
|
||||
*/
|
||||
void __audit_uring_entry(u8 op)
|
||||
{
|
||||
struct audit_context *ctx = audit_context();
|
||||
|
||||
if (ctx->state == AUDIT_STATE_DISABLED)
|
||||
return;
|
||||
|
||||
/*
|
||||
* NOTE: It's possible that we can be called from the process' context
|
||||
* before it returns to userspace, and before audit_syscall_exit()
|
||||
* is called. In this case there is not much to do, just record
|
||||
* the io_uring details and return.
|
||||
*/
|
||||
ctx->uring_op = op;
|
||||
if (ctx->context == AUDIT_CTX_SYSCALL)
|
||||
return;
|
||||
|
||||
ctx->dummy = !audit_n_rules;
|
||||
if (!ctx->dummy && ctx->state == AUDIT_STATE_BUILD)
|
||||
ctx->prio = 0;
|
||||
|
||||
ctx->context = AUDIT_CTX_URING;
|
||||
ctx->current_state = ctx->state;
|
||||
ktime_get_coarse_real_ts64(&ctx->ctime);
|
||||
}
|
||||
|
||||
/**
|
||||
* __audit_uring_exit - wrap up the kernel task's audit context after io_uring
|
||||
* @success: true/false value to indicate if the operation succeeded or not
|
||||
* @code: operation return code
|
||||
*
|
||||
* This is similar to audit_syscall_exit() but is intended for use by io_uring
|
||||
* operations. This function should only ever be called from
|
||||
* audit_uring_exit() as we rely on the audit context checking present in that
|
||||
* function.
|
||||
*/
|
||||
void __audit_uring_exit(int success, long code)
|
||||
{
|
||||
struct audit_context *ctx = audit_context();
|
||||
|
||||
if (ctx->context == AUDIT_CTX_SYSCALL) {
|
||||
/*
|
||||
* NOTE: See the note in __audit_uring_entry() about the case
|
||||
* where we may be called from process context before we
|
||||
* return to userspace via audit_syscall_exit(). In this
|
||||
* case we simply emit a URINGOP record and bail, the
|
||||
* normal syscall exit handling will take care of
|
||||
* everything else.
|
||||
* It is also worth mentioning that when we are called,
|
||||
* the current process creds may differ from the creds
|
||||
* used during the normal syscall processing; keep that
|
||||
* in mind if/when we move the record generation code.
|
||||
*/
|
||||
|
||||
/*
|
||||
* We need to filter on the syscall info here to decide if we
|
||||
* should emit a URINGOP record. I know it seems odd but this
|
||||
* solves the problem where users have a filter to block *all*
|
||||
* syscall records in the "exit" filter; we want to preserve
|
||||
* the behavior here.
|
||||
*/
|
||||
audit_filter_syscall(current, ctx);
|
||||
if (ctx->current_state != AUDIT_STATE_RECORD)
|
||||
audit_filter_uring(current, ctx);
|
||||
audit_filter_inodes(current, ctx);
|
||||
if (ctx->current_state != AUDIT_STATE_RECORD)
|
||||
return;
|
||||
|
||||
audit_log_uring(ctx);
|
||||
return;
|
||||
}
|
||||
|
||||
/* this may generate CONFIG_CHANGE records */
|
||||
if (!list_empty(&ctx->killed_trees))
|
||||
audit_kill_trees(ctx);
|
||||
|
||||
/* run through both filters to ensure we set the filterkey properly */
|
||||
audit_filter_uring(current, ctx);
|
||||
audit_filter_inodes(current, ctx);
|
||||
if (ctx->current_state != AUDIT_STATE_RECORD)
|
||||
goto out;
|
||||
audit_return_fixup(ctx, success, code);
|
||||
audit_log_exit();
|
||||
|
||||
out:
|
||||
audit_reset_context(ctx);
|
||||
}
|
||||
|
||||
/**
|
||||
* __audit_syscall_entry - fill in an audit record at syscall entry
|
||||
* @major: major syscall type (function)
|
||||
@ -1672,7 +1980,12 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
|
||||
if (!audit_enabled || !context)
|
||||
return;
|
||||
|
||||
BUG_ON(context->in_syscall || context->name_count);
|
||||
WARN_ON(context->context != AUDIT_CTX_UNUSED);
|
||||
WARN_ON(context->name_count);
|
||||
if (context->context != AUDIT_CTX_UNUSED || context->name_count) {
|
||||
audit_panic("unrecoverable error in audit_syscall_entry()");
|
||||
return;
|
||||
}
|
||||
|
||||
state = context->state;
|
||||
if (state == AUDIT_STATE_DISABLED)
|
||||
@ -1691,10 +2004,8 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
|
||||
context->argv[1] = a2;
|
||||
context->argv[2] = a3;
|
||||
context->argv[3] = a4;
|
||||
context->serial = 0;
|
||||
context->in_syscall = 1;
|
||||
context->context = AUDIT_CTX_SYSCALL;
|
||||
context->current_state = state;
|
||||
context->ppid = 0;
|
||||
ktime_get_coarse_real_ts64(&context->ctime);
|
||||
}
|
||||
|
||||
@ -1711,63 +2022,27 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
|
||||
*/
|
||||
void __audit_syscall_exit(int success, long return_code)
|
||||
{
|
||||
struct audit_context *context;
|
||||
struct audit_context *context = audit_context();
|
||||
|
||||
context = audit_context();
|
||||
if (!context)
|
||||
return;
|
||||
if (!context || context->dummy ||
|
||||
context->context != AUDIT_CTX_SYSCALL)
|
||||
goto out;
|
||||
|
||||
/* this may generate CONFIG_CHANGE records */
|
||||
if (!list_empty(&context->killed_trees))
|
||||
audit_kill_trees(context);
|
||||
|
||||
if (!context->dummy && context->in_syscall) {
|
||||
if (success)
|
||||
context->return_valid = AUDITSC_SUCCESS;
|
||||
else
|
||||
context->return_valid = AUDITSC_FAILURE;
|
||||
|
||||
/*
|
||||
* we need to fix up the return code in the audit logs if the
|
||||
* actual return codes are later going to be fixed up by the
|
||||
* arch specific signal handlers
|
||||
*
|
||||
* This is actually a test for:
|
||||
* (rc == ERESTARTSYS ) || (rc == ERESTARTNOINTR) ||
|
||||
* (rc == ERESTARTNOHAND) || (rc == ERESTART_RESTARTBLOCK)
|
||||
*
|
||||
* but is faster than a bunch of ||
|
||||
*/
|
||||
if (unlikely(return_code <= -ERESTARTSYS) &&
|
||||
(return_code >= -ERESTART_RESTARTBLOCK) &&
|
||||
(return_code != -ENOIOCTLCMD))
|
||||
context->return_code = -EINTR;
|
||||
else
|
||||
context->return_code = return_code;
|
||||
|
||||
/* run through both filters to ensure we set the filterkey properly */
|
||||
audit_filter_syscall(current, context);
|
||||
audit_filter_inodes(current, context);
|
||||
if (context->current_state == AUDIT_STATE_RECORD)
|
||||
if (context->current_state < AUDIT_STATE_RECORD)
|
||||
goto out;
|
||||
|
||||
audit_return_fixup(context, success, return_code);
|
||||
audit_log_exit();
|
||||
}
|
||||
|
||||
context->in_syscall = 0;
|
||||
context->prio = context->state == AUDIT_STATE_RECORD ? ~0ULL : 0;
|
||||
|
||||
audit_free_module(context);
|
||||
audit_free_names(context);
|
||||
unroll_tree_refs(context, NULL, 0);
|
||||
audit_free_aux(context);
|
||||
context->aux = NULL;
|
||||
context->aux_pids = NULL;
|
||||
context->target_pid = 0;
|
||||
context->target_sid = 0;
|
||||
context->sockaddr_len = 0;
|
||||
context->type = 0;
|
||||
context->fds[0] = -1;
|
||||
if (context->state != AUDIT_STATE_RECORD) {
|
||||
kfree(context->filterkey);
|
||||
context->filterkey = NULL;
|
||||
}
|
||||
out:
|
||||
audit_reset_context(context);
|
||||
}
|
||||
|
||||
static inline void handle_one(const struct inode *inode)
|
||||
@ -1919,7 +2194,7 @@ void __audit_getname(struct filename *name)
|
||||
struct audit_context *context = audit_context();
|
||||
struct audit_names *n;
|
||||
|
||||
if (!context->in_syscall)
|
||||
if (context->context == AUDIT_CTX_UNUSED)
|
||||
return;
|
||||
|
||||
n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
|
||||
@ -1991,7 +2266,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
|
||||
struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS];
|
||||
int i;
|
||||
|
||||
if (!context->in_syscall)
|
||||
if (context->context == AUDIT_CTX_UNUSED)
|
||||
return;
|
||||
|
||||
rcu_read_lock();
|
||||
@ -2109,7 +2384,7 @@ void __audit_inode_child(struct inode *parent,
|
||||
struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS];
|
||||
int i;
|
||||
|
||||
if (!context->in_syscall)
|
||||
if (context->context == AUDIT_CTX_UNUSED)
|
||||
return;
|
||||
|
||||
rcu_read_lock();
|
||||
@ -2208,7 +2483,7 @@ EXPORT_SYMBOL_GPL(__audit_inode_child);
|
||||
int auditsc_get_stamp(struct audit_context *ctx,
|
||||
struct timespec64 *t, unsigned int *serial)
|
||||
{
|
||||
if (!ctx->in_syscall)
|
||||
if (ctx->context == AUDIT_CTX_UNUSED)
|
||||
return 0;
|
||||
if (!ctx->serial)
|
||||
ctx->serial = audit_serial();
|
||||
@ -2546,6 +2821,16 @@ void __audit_mmap_fd(int fd, int flags)
|
||||
context->type = AUDIT_MMAP;
|
||||
}
|
||||
|
||||
void __audit_openat2_how(struct open_how *how)
|
||||
{
|
||||
struct audit_context *context = audit_context();
|
||||
|
||||
context->openat2.flags = how->flags;
|
||||
context->openat2.mode = how->mode;
|
||||
context->openat2.resolve = how->resolve;
|
||||
context->type = AUDIT_OPENAT2;
|
||||
}
|
||||
|
||||
void __audit_log_kern_module(char *name)
|
||||
{
|
||||
struct audit_context *context = audit_context();
|
||||
@ -2706,8 +2991,7 @@ void audit_seccomp_actions_logged(const char *names, const char *old_names,
|
||||
struct list_head *audit_killed_trees(void)
|
||||
{
|
||||
struct audit_context *ctx = audit_context();
|
||||
|
||||
if (likely(!ctx || !ctx->in_syscall))
|
||||
if (likely(!ctx || ctx->context == AUDIT_CTX_UNUSED))
|
||||
return NULL;
|
||||
return &ctx->killed_trees;
|
||||
}
|
||||
|
@ -64,6 +64,7 @@ config BPF_JIT_DEFAULT_ON
|
||||
|
||||
config BPF_UNPRIV_DEFAULT_OFF
|
||||
bool "Disable unprivileged BPF by default"
|
||||
default y
|
||||
depends on BPF_SYSCALL
|
||||
help
|
||||
Disables unprivileged BPF by default by setting the corresponding
|
||||
@ -72,6 +73,12 @@ config BPF_UNPRIV_DEFAULT_OFF
|
||||
disable it by setting it to 1 (from which no other transition to
|
||||
0 is possible anymore).
|
||||
|
||||
Unprivileged BPF could be used to exploit certain potential
|
||||
speculative execution side-channel vulnerabilities on unmitigated
|
||||
affected hardware.
|
||||
|
||||
If you are unsure how to answer this question, answer Y.
|
||||
|
||||
source "kernel/bpf/preload/Kconfig"
|
||||
|
||||
config BPF_LSM
|
||||
|
@ -7,7 +7,7 @@ endif
|
||||
CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
|
||||
|
||||
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o
|
||||
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
|
||||
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
|
||||
obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
|
||||
obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
|
||||
obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o
|
||||
@ -36,3 +36,7 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o
|
||||
obj-${CONFIG_BPF_LSM} += bpf_lsm.o
|
||||
endif
|
||||
obj-$(CONFIG_BPF_PRELOAD) += preload/
|
||||
|
||||
obj-$(CONFIG_BPF_SYSCALL) += relo_core.o
|
||||
$(obj)/relo_core.o: $(srctree)/tools/lib/bpf/relo_core.c FORCE
|
||||
$(call if_changed_rule,cc_o_c)
|
||||
|
@ -645,7 +645,7 @@ static const struct bpf_iter_seq_info iter_seq_info = {
.seq_priv_size = sizeof(struct bpf_iter_seq_array_map_info),
};

static int bpf_for_each_array_elem(struct bpf_map *map, void *callback_fn,
static int bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_fn,
void *callback_ctx, u64 flags)
{
u32 i, key, num_elems = 0;
@ -668,9 +668,8 @@ static int bpf_for_each_array_elem(struct bpf_map *map, void *callback_fn,
val = array->value + array->elem_size * i;
num_elems++;
key = i;
ret = BPF_CAST_CALL(callback_fn)((u64)(long)map,
(u64)(long)&key, (u64)(long)val,
(u64)(long)callback_ctx, 0);
ret = callback_fn((u64)(long)map, (u64)(long)&key,
(u64)(long)val, (u64)(long)callback_ctx, 0);
/* return value: 0 - continue, 1 - stop and return */
if (ret)
break;
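For reference, the callback type this hunk switches to is a plain five-u64 function pointer, which is what lets callers drop the BPF_CAST_CALL() cast. A sketch of the typedef as it is defined in include/linux/bpf.h around this series (shown here only for context):

typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64);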
@ -17,6 +17,7 @@
#include <linux/bpf_lsm.h>
#include <linux/btf_ids.h>
#include <linux/fdtable.h>
#include <linux/rcupdate_trace.h>

DEFINE_BPF_STORAGE_CACHE(inode_cache);

@ -44,7 +45,8 @@ static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,
if (!bsb)
return NULL;

inode_storage = rcu_dereference(bsb->storage);
inode_storage =
rcu_dereference_check(bsb->storage, bpf_rcu_lock_held());
if (!inode_storage)
return NULL;

@ -172,6 +174,7 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
{
struct bpf_local_storage_data *sdata;

WARN_ON_ONCE(!bpf_rcu_lock_held());
if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
return (unsigned long)NULL;

@ -204,6 +207,7 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
BPF_CALL_2(bpf_inode_storage_delete,
struct bpf_map *, map, struct inode *, inode)
{
WARN_ON_ONCE(!bpf_rcu_lock_held());
if (!inode)
return -EINVAL;
@ -714,3 +714,38 @@ const struct bpf_func_proto bpf_for_each_map_elem_proto = {
.arg3_type = ARG_PTR_TO_STACK_OR_NULL,
.arg4_type = ARG_ANYTHING,
};

/* maximum number of loops */
#define MAX_LOOPS BIT(23)

BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
u64, flags)
{
bpf_callback_t callback = (bpf_callback_t)callback_fn;
u64 ret;
u32 i;

if (flags)
return -EINVAL;
if (nr_loops > MAX_LOOPS)
return -E2BIG;

for (i = 0; i < nr_loops; i++) {
ret = callback((u64)i, (u64)(long)callback_ctx, 0, 0, 0);
/* return value: 0 - continue, 1 - stop and return */
if (ret)
return i + 1;
}

return i;
}

const struct bpf_func_proto bpf_loop_proto = {
.func = bpf_loop,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_ANYTHING,
.arg2_type = ARG_PTR_TO_FUNC,
.arg3_type = ARG_PTR_TO_STACK_OR_NULL,
.arg4_type = ARG_ANYTHING,
};
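A minimal BPF-side sketch of the new bpf_loop() helper added above. This assumes libbpf headers recent enough to declare bpf_loop; the section name, program name and callback name are illustrative. The callback receives the iteration index plus the caller-supplied context, and a non-zero return stops the loop early, matching the "0 - continue, 1 - stop" convention in the kernel code.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

static long sum_cb(__u32 index, void *ctx)
{
	*(__u64 *)ctx += index;		/* accumulate into the context */
	return 0;			/* keep iterating */
}

SEC("xdp")
int sum_indices(struct xdp_md *xdp)
{
	__u64 sum = 0;

	bpf_loop(100, sum_cb, &sum, 0);	/* flags must currently be zero */
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";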
@ -11,6 +11,9 @@
#include <net/sock.h>
#include <uapi/linux/sock_diag.h>
#include <uapi/linux/btf.h>
#include <linux/rcupdate.h>
#include <linux/rcupdate_trace.h>
#include <linux/rcupdate_wait.h>

#define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)

@ -81,6 +84,22 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
return NULL;
}

void bpf_local_storage_free_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage *local_storage;

local_storage = container_of(rcu, struct bpf_local_storage, rcu);
kfree_rcu(local_storage, rcu);
}

static void bpf_selem_free_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage_elem *selem;

selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
kfree_rcu(selem, rcu);
}

/* local_storage->lock must be held and selem->local_storage == local_storage.
* The caller must ensure selem->smap is still valid to be
* dereferenced for its smap->elem_size and smap->cache_idx.
@ -93,7 +112,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
bool free_local_storage;
void *owner;

smap = rcu_dereference(SDATA(selem)->smap);
smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
owner = local_storage->owner;

/* All uncharging on the owner must be done first.
@ -118,12 +137,12 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
*
* Although the unlock will be done under
* rcu_read_lock(), it is more intutivie to
* read if kfree_rcu(local_storage, rcu) is done
* read if the freeing of the storage is done
* after the raw_spin_unlock_bh(&local_storage->lock).
*
* Hence, a "bool free_local_storage" is returned
* to the caller which then calls the kfree_rcu()
* after unlock.
* to the caller which then calls then frees the storage after
* all the RCU grace periods have expired.
*/
}
hlist_del_init_rcu(&selem->snode);
@ -131,8 +150,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
SDATA(selem))
RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);

kfree_rcu(selem, rcu);

call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
return free_local_storage;
}

@ -146,7 +164,8 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
/* selem has already been unlinked from sk */
return;

local_storage = rcu_dereference(selem->local_storage);
local_storage = rcu_dereference_check(selem->local_storage,
bpf_rcu_lock_held());
raw_spin_lock_irqsave(&local_storage->lock, flags);
if (likely(selem_linked_to_storage(selem)))
free_local_storage = bpf_selem_unlink_storage_nolock(
@ -154,7 +173,8 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
raw_spin_unlock_irqrestore(&local_storage->lock, flags);

if (free_local_storage)
kfree_rcu(local_storage, rcu);
call_rcu_tasks_trace(&local_storage->rcu,
bpf_local_storage_free_rcu);
}

void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
@ -174,7 +194,7 @@ void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
/* selem has already be unlinked from smap */
return;

smap = rcu_dereference(SDATA(selem)->smap);
smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
b = select_bucket(smap, selem);
raw_spin_lock_irqsave(&b->lock, flags);
if (likely(selem_linked_to_map(selem)))
@ -213,12 +233,14 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
struct bpf_local_storage_elem *selem;

/* Fast path (cache hit) */
sdata = rcu_dereference(local_storage->cache[smap->cache_idx]);
sdata = rcu_dereference_check(local_storage->cache[smap->cache_idx],
bpf_rcu_lock_held());
if (sdata && rcu_access_pointer(sdata->smap) == smap)
return sdata;

/* Slow path (cache miss) */
hlist_for_each_entry_rcu(selem, &local_storage->list, snode)
hlist_for_each_entry_rcu(selem, &local_storage->list, snode,
rcu_read_lock_trace_held())
if (rcu_access_pointer(SDATA(selem)->smap) == smap)
break;

@ -306,7 +328,8 @@ int bpf_local_storage_alloc(void *owner,
* bucket->list, first_selem can be freed immediately
* (instead of kfree_rcu) because
* bpf_local_storage_map_free() does a
* synchronize_rcu() before walking the bucket->list.
* synchronize_rcu_mult (waiting for both sleepable and
* normal programs) before walking the bucket->list.
* Hence, no one is accessing selem from the
* bucket->list under rcu_read_lock().
*/
@ -342,7 +365,8 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
!map_value_has_spin_lock(&smap->map)))
return ERR_PTR(-EINVAL);

local_storage = rcu_dereference(*owner_storage(smap, owner));
local_storage = rcu_dereference_check(*owner_storage(smap, owner),
bpf_rcu_lock_held());
if (!local_storage || hlist_empty(&local_storage->list)) {
/* Very first elem for the owner */
err = check_flags(NULL, map_flags);
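A minimal sketch of the two-stage reclaim pattern these hunks switch to (struct and function names here are illustrative, not the kernel's): the object is first queued behind an RCU-tasks-trace grace period, which covers sleepable BPF programs, and that callback then defers the final free behind a regular RCU grace period as well.

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/rcupdate_trace.h>
#include <linux/slab.h>

struct obj {
	struct rcu_head rcu;
	/* payload ... */
};

static void obj_free_rcu(struct rcu_head *rcu)
{
	struct obj *o = container_of(rcu, struct obj, rcu);

	kfree_rcu(o, rcu);	/* second stage: wait for a normal RCU grace period */
}

static void obj_release(struct obj *o)
{
	call_rcu_tasks_trace(&o->rcu, obj_free_rcu);	/* first stage: tasks-trace RCU */
}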
@ -207,7 +207,7 @@ BTF_ID(func, bpf_lsm_socket_socketpair)

BTF_ID(func, bpf_lsm_syslog)
BTF_ID(func, bpf_lsm_task_alloc)
BTF_ID(func, bpf_lsm_task_getsecid_subj)
BTF_ID(func, bpf_lsm_current_getsecid_subj)
BTF_ID(func, bpf_lsm_task_getsecid_obj)
BTF_ID(func, bpf_lsm_task_prctl)
BTF_ID(func, bpf_lsm_task_setscheduler)
@ -93,6 +93,9 @@ const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = {
};

const struct bpf_prog_ops bpf_struct_ops_prog_ops = {
#ifdef CONFIG_NET
.test_run = bpf_struct_ops_test_run,
#endif
};

static const struct btf_type *module_type;
@ -162,7 +165,7 @@ void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log)
break;
}

if (btf_member_bitfield_size(t, member)) {
if (__btf_member_bitfield_size(t, member)) {
pr_warn("bit field member %s in struct %s is not supported\n",
mname, st_ops->name);
break;
@ -293,7 +296,7 @@ static int check_zero_holes(const struct btf_type *t, void *data)
const struct btf_type *mtype;

for_each_member(i, t, member) {
moff = btf_member_bit_offset(t, member) / 8;
moff = __btf_member_bit_offset(t, member) / 8;
if (moff > prev_mend &&
memchr_inv(data + prev_mend, 0, moff - prev_mend))
return -EINVAL;
@ -312,6 +315,20 @@ static int check_zero_holes(const struct btf_type *t, void *data)
return 0;
}

int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_progs *tprogs,
struct bpf_prog *prog,
const struct btf_func_model *model,
void *image, void *image_end)
{
u32 flags;

tprogs[BPF_TRAMP_FENTRY].progs[0] = prog;
tprogs[BPF_TRAMP_FENTRY].nr_progs = 1;
flags = model->ret_size > 0 ? BPF_TRAMP_F_RET_FENTRY_RET : 0;
return arch_prepare_bpf_trampoline(NULL, image, image_end,
model, flags, tprogs, NULL);
}

static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
void *value, u64 flags)
{
@ -323,7 +340,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
struct bpf_tramp_progs *tprogs = NULL;
void *udata, *kdata;
int prog_fd, err = 0;
void *image;
void *image, *image_end;
u32 i;

if (flags)
@ -363,14 +380,14 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
udata = &uvalue->data;
kdata = &kvalue->data;
image = st_map->image;
image_end = st_map->image + PAGE_SIZE;

for_each_member(i, t, member) {
const struct btf_type *mtype, *ptype;
struct bpf_prog *prog;
u32 moff;
u32 flags;

moff = btf_member_bit_offset(t, member) / 8;
moff = __btf_member_bit_offset(t, member) / 8;
ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL);
if (ptype == module_type) {
if (*(void **)(udata + moff))
@ -430,14 +447,9 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
goto reset_unlock;
}

tprogs[BPF_TRAMP_FENTRY].progs[0] = prog;
tprogs[BPF_TRAMP_FENTRY].nr_progs = 1;
flags = st_ops->func_models[i].ret_size > 0 ?
BPF_TRAMP_F_RET_FENTRY_RET : 0;
err = arch_prepare_bpf_trampoline(NULL, image,
st_map->image + PAGE_SIZE,
err = bpf_struct_ops_prepare_trampoline(tprogs, prog,
&st_ops->func_models[i],
flags, tprogs, NULL);
image, image_end);
if (err < 0)
goto reset_unlock;
@ -2,6 +2,9 @@
/* internal file - do not include directly */

#ifdef CONFIG_BPF_JIT
#ifdef CONFIG_NET
BPF_STRUCT_OPS_TYPE(bpf_dummy_ops)
#endif
#ifdef CONFIG_INET
#include <net/tcp.h>
BPF_STRUCT_OPS_TYPE(tcp_congestion_ops)
@ -17,6 +17,7 @@
#include <uapi/linux/btf.h>
#include <linux/btf_ids.h>
#include <linux/fdtable.h>
#include <linux/rcupdate_trace.h>

DEFINE_BPF_STORAGE_CACHE(task_cache);

@ -59,7 +60,8 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map,
struct bpf_local_storage *task_storage;
struct bpf_local_storage_map *smap;

task_storage = rcu_dereference(task->bpf_storage);
task_storage =
rcu_dereference_check(task->bpf_storage, bpf_rcu_lock_held());
if (!task_storage)
return NULL;

@ -229,6 +231,7 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
{
struct bpf_local_storage_data *sdata;

WARN_ON_ONCE(!bpf_rcu_lock_held());
if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
return (unsigned long)NULL;

@ -260,6 +263,7 @@ BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *,
{
int ret;

WARN_ON_ONCE(!bpf_rcu_lock_held());
if (!task)
return -EINVAL;

@ -323,7 +327,7 @@ const struct bpf_func_proto bpf_task_storage_get_proto = {
.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_PTR_TO_BTF_ID,
.arg2_btf_id = &btf_task_struct_ids[0],
.arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
.arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
.arg4_type = ARG_ANYTHING,
};
@ -334,5 +338,5 @@ const struct bpf_func_proto bpf_task_storage_delete_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_PTR_TO_BTF_ID,
.arg2_btf_id = &btf_task_struct_ids[0],
.arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
};
717
kernel/bpf/btf.c
@ -25,6 +25,7 @@
#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <net/sock.h>
#include "../tools/lib/bpf/relo_core.h"

/* BTF (BPF Type Format) is the meta data format which describes
* the data types of BPF program/map. Hence, it basically focus
@ -281,6 +282,8 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
[BTF_KIND_VAR] = "VAR",
[BTF_KIND_DATASEC] = "DATASEC",
[BTF_KIND_FLOAT] = "FLOAT",
[BTF_KIND_DECL_TAG] = "DECL_TAG",
[BTF_KIND_TYPE_TAG] = "TYPE_TAG",
};

const char *btf_type_str(const struct btf_type *t)
@ -417,6 +420,7 @@ static bool btf_type_is_modifier(const struct btf_type *t)
case BTF_KIND_VOLATILE:
case BTF_KIND_CONST:
case BTF_KIND_RESTRICT:
case BTF_KIND_TYPE_TAG:
return true;
}

@ -459,6 +463,17 @@ static bool btf_type_is_datasec(const struct btf_type *t)
return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
}

static bool btf_type_is_decl_tag(const struct btf_type *t)
{
return BTF_INFO_KIND(t->info) == BTF_KIND_DECL_TAG;
}

static bool btf_type_is_decl_tag_target(const struct btf_type *t)
{
return btf_type_is_func(t) || btf_type_is_struct(t) ||
btf_type_is_var(t) || btf_type_is_typedef(t);
}

u32 btf_nr_types(const struct btf *btf)
{
u32 total = 0;
@ -537,6 +552,7 @@ const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf,
static bool btf_type_is_resolve_source_only(const struct btf_type *t)
{
return btf_type_is_var(t) ||
btf_type_is_decl_tag(t) ||
btf_type_is_datasec(t);
}

@ -563,6 +579,7 @@ static bool btf_type_needs_resolve(const struct btf_type *t)
btf_type_is_struct(t) ||
btf_type_is_array(t) ||
btf_type_is_var(t) ||
btf_type_is_decl_tag(t) ||
btf_type_is_datasec(t);
}

@ -616,6 +633,11 @@ static const struct btf_var *btf_type_var(const struct btf_type *t)
return (const struct btf_var *)(t + 1);
}

static const struct btf_decl_tag *btf_type_decl_tag(const struct btf_type *t)
{
return (const struct btf_decl_tag *)(t + 1);
}

static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
{
return kind_ops[BTF_INFO_KIND(t->info)];
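A hedged sketch of where these two new BTF kinds come from on the C side (assumes a clang recent enough to support the btf_type_tag/btf_decl_tag attributes; the macro and struct names are illustrative). A type tag roughly becomes a TYPE_TAG modifier node between a pointer and its pointee, while a decl tag becomes a DECL_TAG node referencing the tagged declaration, with component_idx selecting a member or argument, or -1 for the declaration as a whole.

#define __rcu_tag __attribute__((btf_type_tag("rcu")))
#define __decl(x) __attribute__((btf_decl_tag(x)))

struct item {
	struct item __rcu_tag *next;	/* roughly: PTR -> TYPE_TAG("rcu") -> struct item */
	int refcnt __decl("counter");	/* DECL_TAG("counter"), component_idx = 1 */
} __decl("kept");			/* DECL_TAG("kept"), component_idx = -1 */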
@ -815,7 +837,7 @@ static const char *btf_show_name(struct btf_show *show)
const char *ptr_suffix = &ptr_suffixes[strlen(ptr_suffixes)];
const char *name = NULL, *prefix = "", *parens = "";
const struct btf_member *m = show->state.member;
const struct btf_type *t = show->state.type;
const struct btf_type *t;
const struct btf_array *array;
u32 id = show->state.type_id;
const char *member = NULL;
@ -1718,6 +1740,7 @@ __btf_resolve_size(const struct btf *btf, const struct btf_type *type,
case BTF_KIND_VOLATILE:
case BTF_KIND_CONST:
case BTF_KIND_RESTRICT:
case BTF_KIND_TYPE_TAG:
id = type->type;
type = btf_type_by_id(btf, type->type);
break;
@ -2326,6 +2349,8 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
const struct btf_type *t,
u32 meta_left)
{
const char *value;

if (btf_type_vlen(t)) {
btf_verifier_log_type(env, t, "vlen != 0");
return -EINVAL;
@ -2341,7 +2366,7 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}

/* typedef type must have a valid name, and other ref types,
/* typedef/type_tag type must have a valid name, and other ref types,
* volatile, const, restrict, should have a null name.
*/
if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF) {
@ -2350,6 +2375,12 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
btf_verifier_log_type(env, t, "Invalid name");
return -EINVAL;
}
} else if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPE_TAG) {
value = btf_name_by_offset(env->btf, t->name_off);
if (!value || !value[0]) {
btf_verifier_log_type(env, t, "Invalid name");
return -EINVAL;
}
} else {
if (t->name_off) {
btf_verifier_log_type(env, t, "Invalid name");
@ -2939,7 +2970,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}

offset = btf_member_bit_offset(t, member);
offset = __btf_member_bit_offset(t, member);
if (is_union && offset) {
btf_verifier_log_member(env, t, member,
"Invalid member bits_offset");
@ -3064,7 +3095,7 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t
if (off != -ENOENT)
/* only one such field is allowed */
return -E2BIG;
off = btf_member_bit_offset(t, member);
off = __btf_member_bit_offset(t, member);
if (off % 8)
/* valid C code cannot generate such BTF */
return -EINVAL;
@ -3154,8 +3185,8 @@ static void __btf_struct_show(const struct btf *btf, const struct btf_type *t,

btf_show_start_member(show, member);

member_offset = btf_member_bit_offset(t, member);
bitfield_size = btf_member_bitfield_size(t, member);
member_offset = __btf_member_bit_offset(t, member);
bitfield_size = __btf_member_bitfield_size(t, member);
bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset);
bits8_offset = BITS_PER_BYTE_MASKED(member_offset);
if (bitfield_size) {
@ -3801,6 +3832,110 @@ static const struct btf_kind_operations float_ops = {
|
||||
.show = btf_df_show,
|
||||
};
|
||||
|
||||
static s32 btf_decl_tag_check_meta(struct btf_verifier_env *env,
|
||||
const struct btf_type *t,
|
||||
u32 meta_left)
|
||||
{
|
||||
const struct btf_decl_tag *tag;
|
||||
u32 meta_needed = sizeof(*tag);
|
||||
s32 component_idx;
|
||||
const char *value;
|
||||
|
||||
if (meta_left < meta_needed) {
|
||||
btf_verifier_log_basic(env, t,
|
||||
"meta_left:%u meta_needed:%u",
|
||||
meta_left, meta_needed);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
value = btf_name_by_offset(env->btf, t->name_off);
|
||||
if (!value || !value[0]) {
|
||||
btf_verifier_log_type(env, t, "Invalid value");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (btf_type_vlen(t)) {
|
||||
btf_verifier_log_type(env, t, "vlen != 0");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (btf_type_kflag(t)) {
|
||||
btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
component_idx = btf_type_decl_tag(t)->component_idx;
|
||||
if (component_idx < -1) {
|
||||
btf_verifier_log_type(env, t, "Invalid component_idx");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
btf_verifier_log_type(env, t, NULL);
|
||||
|
||||
return meta_needed;
|
||||
}
|
||||
|
||||
static int btf_decl_tag_resolve(struct btf_verifier_env *env,
|
||||
const struct resolve_vertex *v)
|
||||
{
|
||||
const struct btf_type *next_type;
|
||||
const struct btf_type *t = v->t;
|
||||
u32 next_type_id = t->type;
|
||||
struct btf *btf = env->btf;
|
||||
s32 component_idx;
|
||||
u32 vlen;
|
||||
|
||||
next_type = btf_type_by_id(btf, next_type_id);
|
||||
if (!next_type || !btf_type_is_decl_tag_target(next_type)) {
|
||||
btf_verifier_log_type(env, v->t, "Invalid type_id");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (!env_type_is_resolve_sink(env, next_type) &&
|
||||
!env_type_is_resolved(env, next_type_id))
|
||||
return env_stack_push(env, next_type, next_type_id);
|
||||
|
||||
component_idx = btf_type_decl_tag(t)->component_idx;
|
||||
if (component_idx != -1) {
|
||||
if (btf_type_is_var(next_type) || btf_type_is_typedef(next_type)) {
|
||||
btf_verifier_log_type(env, v->t, "Invalid component_idx");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (btf_type_is_struct(next_type)) {
|
||||
vlen = btf_type_vlen(next_type);
|
||||
} else {
|
||||
/* next_type should be a function */
|
||||
next_type = btf_type_by_id(btf, next_type->type);
|
||||
vlen = btf_type_vlen(next_type);
|
||||
}
|
||||
|
||||
if ((u32)component_idx >= vlen) {
|
||||
btf_verifier_log_type(env, v->t, "Invalid component_idx");
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
|
||||
env_stack_pop_resolved(env, next_type_id, 0);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void btf_decl_tag_log(struct btf_verifier_env *env, const struct btf_type *t)
|
||||
{
|
||||
btf_verifier_log(env, "type=%u component_idx=%d", t->type,
|
||||
btf_type_decl_tag(t)->component_idx);
|
||||
}
|
||||
|
||||
static const struct btf_kind_operations decl_tag_ops = {
|
||||
.check_meta = btf_decl_tag_check_meta,
|
||||
.resolve = btf_decl_tag_resolve,
|
||||
.check_member = btf_df_check_member,
|
||||
.check_kflag_member = btf_df_check_kflag_member,
|
||||
.log_details = btf_decl_tag_log,
|
||||
.show = btf_df_show,
|
||||
};
|
||||
|
||||
static int btf_func_proto_check(struct btf_verifier_env *env,
|
||||
const struct btf_type *t)
|
||||
{
|
||||
@ -3935,6 +4070,8 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
|
||||
[BTF_KIND_VAR] = &var_ops,
|
||||
[BTF_KIND_DATASEC] = &datasec_ops,
|
||||
[BTF_KIND_FLOAT] = &float_ops,
|
||||
[BTF_KIND_DECL_TAG] = &decl_tag_ops,
|
||||
[BTF_KIND_TYPE_TAG] = &modifier_ops,
|
||||
};
|
||||
|
||||
static s32 btf_check_meta(struct btf_verifier_env *env,
|
||||
@ -4019,6 +4156,10 @@ static bool btf_resolve_valid(struct btf_verifier_env *env,
|
||||
return !btf_resolved_type_id(btf, type_id) &&
|
||||
!btf_resolved_type_size(btf, type_id);
|
||||
|
||||
if (btf_type_is_decl_tag(t))
|
||||
return btf_resolved_type_id(btf, type_id) &&
|
||||
!btf_resolved_type_size(btf, type_id);
|
||||
|
||||
if (btf_type_is_modifier(t) || btf_type_is_ptr(t) ||
|
||||
btf_type_is_var(t)) {
|
||||
t = btf_type_id_resolve(btf, &type_id);
|
||||
@ -4685,7 +4826,7 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog)
|
||||
return prog->aux->attach_btf;
|
||||
}
|
||||
|
||||
static bool is_string_ptr(struct btf *btf, const struct btf_type *t)
|
||||
static bool is_int_ptr(struct btf *btf, const struct btf_type *t)
|
||||
{
|
||||
/* t comes in already as a pointer */
|
||||
t = btf_type_by_id(btf, t->type);
|
||||
@ -4694,8 +4835,7 @@ static bool is_string_ptr(struct btf *btf, const struct btf_type *t)
|
||||
if (BTF_INFO_KIND(t->info) == BTF_KIND_CONST)
|
||||
t = btf_type_by_id(btf, t->type);
|
||||
|
||||
/* char, signed char, unsigned char */
|
||||
return btf_type_is_int(t) && t->size == 1;
|
||||
return btf_type_is_int(t);
|
||||
}
|
||||
|
||||
bool btf_ctx_access(int off, int size, enum bpf_access_type type,
|
||||
@ -4800,10 +4940,12 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
|
||||
/* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */
|
||||
for (i = 0; i < prog->aux->ctx_arg_info_size; i++) {
|
||||
const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i];
|
||||
u32 type, flag;
|
||||
|
||||
if (ctx_arg_info->offset == off &&
|
||||
(ctx_arg_info->reg_type == PTR_TO_RDONLY_BUF_OR_NULL ||
|
||||
ctx_arg_info->reg_type == PTR_TO_RDWR_BUF_OR_NULL)) {
|
||||
type = base_type(ctx_arg_info->reg_type);
|
||||
flag = type_flag(ctx_arg_info->reg_type);
|
||||
if (ctx_arg_info->offset == off && type == PTR_TO_BUF &&
|
||||
(flag & PTR_MAYBE_NULL)) {
|
||||
info->reg_type = ctx_arg_info->reg_type;
|
||||
return true;
|
||||
}
|
||||
@ -4816,7 +4958,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
|
||||
*/
|
||||
return true;
|
||||
|
||||
if (is_string_ptr(btf, t))
|
||||
if (is_int_ptr(btf, t))
|
||||
return true;
|
||||
|
||||
/* this is a pointer to another type */
|
||||
@ -4919,7 +5061,7 @@ static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,
|
||||
if (array_elem->nelems != 0)
|
||||
goto error;
|
||||
|
||||
moff = btf_member_bit_offset(t, member) / 8;
|
||||
moff = __btf_member_bit_offset(t, member) / 8;
|
||||
if (off < moff)
|
||||
goto error;
|
||||
|
||||
@ -4942,14 +5084,14 @@ static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,
|
||||
|
||||
for_each_member(i, t, member) {
|
||||
/* offset of the field in bytes */
|
||||
moff = btf_member_bit_offset(t, member) / 8;
|
||||
moff = __btf_member_bit_offset(t, member) / 8;
|
||||
if (off + size <= moff)
|
||||
/* won't find anything, field is already too far */
|
||||
break;
|
||||
|
||||
if (btf_member_bitfield_size(t, member)) {
|
||||
u32 end_bit = btf_member_bit_offset(t, member) +
|
||||
btf_member_bitfield_size(t, member);
|
||||
if (__btf_member_bitfield_size(t, member)) {
|
||||
u32 end_bit = __btf_member_bit_offset(t, member) +
|
||||
__btf_member_bitfield_size(t, member);
|
||||
|
||||
/* off <= moff instead of off == moff because clang
|
||||
* does not generate a BTF member for anonymous
|
||||
@ -5434,12 +5576,53 @@ static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
|
||||
#endif
|
||||
};
|
||||
|
||||
/* Returns true if struct is composed of scalars, 4 levels of nesting allowed */
|
||||
static bool __btf_type_is_scalar_struct(struct bpf_verifier_log *log,
|
||||
const struct btf *btf,
|
||||
const struct btf_type *t, int rec)
|
||||
{
|
||||
const struct btf_type *member_type;
|
||||
const struct btf_member *member;
|
||||
u32 i;
|
||||
|
||||
if (!btf_type_is_struct(t))
|
||||
return false;
|
||||
|
||||
for_each_member(i, t, member) {
|
||||
const struct btf_array *array;
|
||||
|
||||
member_type = btf_type_skip_modifiers(btf, member->type, NULL);
|
||||
if (btf_type_is_struct(member_type)) {
|
||||
if (rec >= 3) {
|
||||
bpf_log(log, "max struct nesting depth exceeded\n");
|
||||
return false;
|
||||
}
|
||||
if (!__btf_type_is_scalar_struct(log, btf, member_type, rec + 1))
|
||||
return false;
|
||||
continue;
|
||||
}
|
||||
if (btf_type_is_array(member_type)) {
|
||||
array = btf_type_array(member_type);
|
||||
if (!array->nelems)
|
||||
return false;
|
||||
member_type = btf_type_skip_modifiers(btf, array->type, NULL);
|
||||
if (!btf_type_is_scalar(member_type))
|
||||
return false;
|
||||
continue;
|
||||
}
|
||||
if (!btf_type_is_scalar(member_type))
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
static int btf_check_func_arg_match(struct bpf_verifier_env *env,
|
||||
const struct btf *btf, u32 func_id,
|
||||
struct bpf_reg_state *regs,
|
||||
bool ptr_to_mem_ok)
|
||||
{
|
||||
struct bpf_verifier_log *log = &env->log;
|
||||
bool is_kfunc = btf_is_kernel(btf);
|
||||
const char *func_name, *ref_tname;
|
||||
const struct btf_type *t, *ref_t;
|
||||
const struct btf_param *args;
|
||||
@ -5492,7 +5675,21 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
|
||||
|
||||
ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
|
||||
ref_tname = btf_name_by_offset(btf, ref_t->name_off);
|
||||
if (btf_is_kernel(btf)) {
|
||||
if (btf_get_prog_ctx_type(log, btf, t,
|
||||
env->prog->type, i)) {
|
||||
/* If function expects ctx type in BTF check that caller
|
||||
* is passing PTR_TO_CTX.
|
||||
*/
|
||||
if (reg->type != PTR_TO_CTX) {
|
||||
bpf_log(log,
|
||||
"arg#%d expected pointer to ctx, but got %s\n",
|
||||
i, btf_type_str(t));
|
||||
return -EINVAL;
|
||||
}
|
||||
if (check_ptr_off_reg(env, reg, regno))
|
||||
return -EINVAL;
|
||||
} else if (is_kfunc && (reg->type == PTR_TO_BTF_ID ||
|
||||
(reg2btf_ids[base_type(reg->type)] && !type_flag(reg->type)))) {
|
||||
const struct btf_type *reg_ref_t;
|
||||
const struct btf *reg_btf;
|
||||
const char *reg_ref_tname;
|
||||
@ -5508,14 +5705,9 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
|
||||
if (reg->type == PTR_TO_BTF_ID) {
|
||||
reg_btf = reg->btf;
|
||||
reg_ref_id = reg->btf_id;
|
||||
} else if (reg2btf_ids[reg->type]) {
|
||||
reg_btf = btf_vmlinux;
|
||||
reg_ref_id = *reg2btf_ids[reg->type];
|
||||
} else {
|
||||
bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d is not a pointer to btf_id\n",
|
||||
func_name, i,
|
||||
btf_type_str(ref_t), ref_tname, regno);
|
||||
return -EINVAL;
|
||||
reg_btf = btf_vmlinux;
|
||||
reg_ref_id = *reg2btf_ids[base_type(reg->type)];
|
||||
}
|
||||
|
||||
reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id,
|
||||
@ -5531,23 +5723,24 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
|
||||
reg_ref_tname);
|
||||
return -EINVAL;
|
||||
}
|
||||
} else if (btf_get_prog_ctx_type(log, btf, t,
|
||||
env->prog->type, i)) {
|
||||
/* If function expects ctx type in BTF check that caller
|
||||
* is passing PTR_TO_CTX.
|
||||
*/
|
||||
if (reg->type != PTR_TO_CTX) {
|
||||
bpf_log(log,
|
||||
"arg#%d expected pointer to ctx, but got %s\n",
|
||||
i, btf_type_str(t));
|
||||
return -EINVAL;
|
||||
}
|
||||
if (check_ctx_reg(env, reg, regno))
|
||||
return -EINVAL;
|
||||
} else if (ptr_to_mem_ok) {
|
||||
const struct btf_type *resolve_ret;
|
||||
u32 type_size;
|
||||
|
||||
if (is_kfunc) {
|
||||
/* Permit pointer to mem, but only when argument
|
||||
* type is pointer to scalar, or struct composed
|
||||
* (recursively) of scalars.
|
||||
*/
|
||||
if (!btf_type_is_scalar(ref_t) &&
|
||||
!__btf_type_is_scalar_struct(log, btf, ref_t, 0)) {
|
||||
bpf_log(log,
|
||||
"arg#%d pointer type %s %s must point to scalar or struct with scalar\n",
|
||||
i, btf_type_str(ref_t), ref_tname);
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
|
||||
resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
|
||||
if (IS_ERR(resolve_ret)) {
|
||||
bpf_log(log,
|
||||
@ -5560,6 +5753,8 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
|
||||
if (check_mem_reg(env, reg, regno, type_size))
|
||||
return -EINVAL;
|
||||
} else {
|
||||
bpf_log(log, "reg type unsupported for arg#%d %sfunction %s#%d\n", i,
|
||||
is_kfunc ? "kernel " : "", func_name, func_id);
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
@ -5609,7 +5804,7 @@ int btf_check_kfunc_arg_match(struct bpf_verifier_env *env,
|
||||
const struct btf *btf, u32 func_id,
|
||||
struct bpf_reg_state *regs)
|
||||
{
|
||||
return btf_check_func_arg_match(env, btf, func_id, regs, false);
|
||||
return btf_check_func_arg_match(env, btf, func_id, regs, true);
|
||||
}
|
||||
|
||||
/* Convert BTF of a function into bpf_reg_state if possible
|
||||
@ -5717,7 +5912,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
reg->type = PTR_TO_MEM_OR_NULL;
|
||||
reg->type = PTR_TO_MEM | PTR_MAYBE_NULL;
|
||||
reg->id = ++env->id_gen;
|
||||
|
||||
continue;
|
||||
@ -6028,6 +6223,8 @@ btf_module_read(struct file *file, struct kobject *kobj,
|
||||
return len;
|
||||
}
|
||||
|
||||
static void purge_cand_cache(struct btf *btf);
|
||||
|
||||
static int btf_module_notify(struct notifier_block *nb, unsigned long op,
|
||||
void *module)
|
||||
{
|
||||
@ -6062,6 +6259,7 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
|
||||
goto out;
|
||||
}
|
||||
|
||||
purge_cand_cache(NULL);
|
||||
mutex_lock(&btf_module_mutex);
|
||||
btf_mod->module = module;
|
||||
btf_mod->btf = btf;
|
||||
@ -6104,6 +6302,7 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
|
||||
list_del(&btf_mod->list);
|
||||
if (btf_mod->sysfs_attr)
|
||||
sysfs_remove_bin_file(btf_kobj, btf_mod->sysfs_attr);
|
||||
purge_cand_cache(btf_mod->btf);
|
||||
btf_put(btf_mod->btf);
|
||||
kfree(btf_mod->sysfs_attr);
|
||||
kfree(btf_mod);
|
||||
@ -6207,10 +6406,442 @@ const struct bpf_func_proto bpf_btf_find_by_name_kind_proto = {
|
||||
.func = bpf_btf_find_by_name_kind,
|
||||
.gpl_only = false,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_MEM,
|
||||
.arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg2_type = ARG_CONST_SIZE,
|
||||
.arg3_type = ARG_ANYTHING,
|
||||
.arg4_type = ARG_ANYTHING,
|
||||
};
|
||||
|
||||
BTF_ID_LIST_GLOBAL_SINGLE(btf_task_struct_ids, struct, task_struct)
|
||||
BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE)
|
||||
#define BTF_TRACING_TYPE(name, type) BTF_ID(struct, type)
|
||||
BTF_TRACING_TYPE_xxx
|
||||
#undef BTF_TRACING_TYPE
|
||||
|
||||
/* BTF ID set registration API for modules */
|
||||
|
||||
#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
|
||||
|
||||
void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
|
||||
struct kfunc_btf_id_set *s)
|
||||
{
|
||||
mutex_lock(&l->mutex);
|
||||
list_add(&s->list, &l->list);
|
||||
mutex_unlock(&l->mutex);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(register_kfunc_btf_id_set);
|
||||
|
||||
void unregister_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
|
||||
struct kfunc_btf_id_set *s)
|
||||
{
|
||||
mutex_lock(&l->mutex);
|
||||
list_del_init(&s->list);
|
||||
mutex_unlock(&l->mutex);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(unregister_kfunc_btf_id_set);
|
||||
|
||||
bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id,
|
||||
struct module *owner)
|
||||
{
|
||||
struct kfunc_btf_id_set *s;
|
||||
|
||||
mutex_lock(&klist->mutex);
|
||||
list_for_each_entry(s, &klist->list, list) {
|
||||
if (s->owner == owner && btf_id_set_contains(s->set, kfunc_id)) {
|
||||
mutex_unlock(&klist->mutex);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
mutex_unlock(&klist->mutex);
|
||||
return false;
|
||||
}
|
||||
|
||||
#define DEFINE_KFUNC_BTF_ID_LIST(name) \
|
||||
struct kfunc_btf_id_list name = { LIST_HEAD_INIT(name.list), \
|
||||
__MUTEX_INITIALIZER(name.mutex) }; \
|
||||
EXPORT_SYMBOL_GPL(name)
|
||||
|
||||
DEFINE_KFUNC_BTF_ID_LIST(bpf_tcp_ca_kfunc_list);
|
||||
DEFINE_KFUNC_BTF_ID_LIST(prog_test_kfunc_list);
|
||||
|
||||
#endif
|
||||
|
||||
int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id,
|
||||
const struct btf *targ_btf, __u32 targ_id)
|
||||
{
|
||||
return -EOPNOTSUPP;
|
||||
}
|
||||
|
||||
static bool bpf_core_is_flavor_sep(const char *s)
|
||||
{
|
||||
/* check X___Y name pattern, where X and Y are not underscores */
|
||||
return s[0] != '_' && /* X */
|
||||
s[1] == '_' && s[2] == '_' && s[3] == '_' && /* ___ */
|
||||
s[4] != '_'; /* Y */
|
||||
}
|
||||
|
||||
size_t bpf_core_essential_name_len(const char *name)
|
||||
{
|
||||
size_t n = strlen(name);
|
||||
int i;
|
||||
|
||||
for (i = n - 5; i >= 0; i--) {
|
||||
if (bpf_core_is_flavor_sep(name + i))
|
||||
return i + 1;
|
||||
}
|
||||
return n;
|
||||
}
|
||||
|
||||
struct bpf_cand_cache {
|
||||
const char *name;
|
||||
u32 name_len;
|
||||
u16 kind;
|
||||
u16 cnt;
|
||||
struct {
|
||||
const struct btf *btf;
|
||||
u32 id;
|
||||
} cands[];
|
||||
};
|
||||
|
||||
static void bpf_free_cands(struct bpf_cand_cache *cands)
|
||||
{
|
||||
if (!cands->cnt)
|
||||
/* empty candidate array was allocated on stack */
|
||||
return;
|
||||
kfree(cands);
|
||||
}
|
||||
|
||||
static void bpf_free_cands_from_cache(struct bpf_cand_cache *cands)
|
||||
{
|
||||
kfree(cands->name);
|
||||
kfree(cands);
|
||||
}
|
||||
|
||||
#define VMLINUX_CAND_CACHE_SIZE 31
|
||||
static struct bpf_cand_cache *vmlinux_cand_cache[VMLINUX_CAND_CACHE_SIZE];
|
||||
|
||||
#define MODULE_CAND_CACHE_SIZE 31
|
||||
static struct bpf_cand_cache *module_cand_cache[MODULE_CAND_CACHE_SIZE];
|
||||
|
||||
static DEFINE_MUTEX(cand_cache_mutex);
|
||||
|
||||
static void __print_cand_cache(struct bpf_verifier_log *log,
|
||||
struct bpf_cand_cache **cache,
|
||||
int cache_size)
|
||||
{
|
||||
struct bpf_cand_cache *cc;
|
||||
int i, j;
|
||||
|
||||
for (i = 0; i < cache_size; i++) {
|
||||
cc = cache[i];
|
||||
if (!cc)
|
||||
continue;
|
||||
bpf_log(log, "[%d]%s(", i, cc->name);
|
||||
for (j = 0; j < cc->cnt; j++) {
|
||||
bpf_log(log, "%d", cc->cands[j].id);
|
||||
if (j < cc->cnt - 1)
|
||||
bpf_log(log, " ");
|
||||
}
|
||||
bpf_log(log, "), ");
|
||||
}
|
||||
}
|
||||
|
||||
static void print_cand_cache(struct bpf_verifier_log *log)
|
||||
{
|
||||
mutex_lock(&cand_cache_mutex);
|
||||
bpf_log(log, "vmlinux_cand_cache:");
|
||||
__print_cand_cache(log, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
|
||||
bpf_log(log, "\nmodule_cand_cache:");
|
||||
__print_cand_cache(log, module_cand_cache, MODULE_CAND_CACHE_SIZE);
|
||||
bpf_log(log, "\n");
|
||||
mutex_unlock(&cand_cache_mutex);
|
||||
}
|
||||
|
||||
static u32 hash_cands(struct bpf_cand_cache *cands)
|
||||
{
|
||||
return jhash(cands->name, cands->name_len, 0);
|
||||
}
|
||||
|
||||
static struct bpf_cand_cache *check_cand_cache(struct bpf_cand_cache *cands,
|
||||
struct bpf_cand_cache **cache,
|
||||
int cache_size)
|
||||
{
|
||||
struct bpf_cand_cache *cc = cache[hash_cands(cands) % cache_size];
|
||||
|
||||
if (cc && cc->name_len == cands->name_len &&
|
||||
!strncmp(cc->name, cands->name, cands->name_len))
|
||||
return cc;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static size_t sizeof_cands(int cnt)
|
||||
{
|
||||
return offsetof(struct bpf_cand_cache, cands[cnt]);
|
||||
}
|
||||
|
||||
static struct bpf_cand_cache *populate_cand_cache(struct bpf_cand_cache *cands,
|
||||
struct bpf_cand_cache **cache,
|
||||
int cache_size)
|
||||
{
|
||||
struct bpf_cand_cache **cc = &cache[hash_cands(cands) % cache_size], *new_cands;
|
||||
|
||||
if (*cc) {
|
||||
bpf_free_cands_from_cache(*cc);
|
||||
*cc = NULL;
|
||||
}
|
||||
new_cands = kmemdup(cands, sizeof_cands(cands->cnt), GFP_KERNEL);
|
||||
if (!new_cands) {
|
||||
bpf_free_cands(cands);
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
/* strdup the name, since it will stay in cache.
|
||||
* the cands->name points to strings in prog's BTF and the prog can be unloaded.
|
||||
*/
|
||||
new_cands->name = kmemdup_nul(cands->name, cands->name_len, GFP_KERNEL);
|
||||
bpf_free_cands(cands);
|
||||
if (!new_cands->name) {
|
||||
kfree(new_cands);
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
*cc = new_cands;
|
||||
return new_cands;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
|
||||
static void __purge_cand_cache(struct btf *btf, struct bpf_cand_cache **cache,
|
||||
int cache_size)
|
||||
{
|
||||
struct bpf_cand_cache *cc;
|
||||
int i, j;
|
||||
|
||||
for (i = 0; i < cache_size; i++) {
|
||||
cc = cache[i];
|
||||
if (!cc)
|
||||
continue;
|
||||
if (!btf) {
|
||||
/* when new module is loaded purge all of module_cand_cache,
|
||||
* since new module might have candidates with the name
|
||||
* that matches cached cands.
|
||||
*/
|
||||
bpf_free_cands_from_cache(cc);
|
||||
cache[i] = NULL;
|
||||
continue;
|
||||
}
|
||||
/* when module is unloaded purge cache entries
|
||||
* that match module's btf
|
||||
*/
|
||||
for (j = 0; j < cc->cnt; j++)
|
||||
if (cc->cands[j].btf == btf) {
|
||||
bpf_free_cands_from_cache(cc);
|
||||
cache[i] = NULL;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
static void purge_cand_cache(struct btf *btf)
|
||||
{
|
||||
mutex_lock(&cand_cache_mutex);
|
||||
__purge_cand_cache(btf, module_cand_cache, MODULE_CAND_CACHE_SIZE);
|
||||
mutex_unlock(&cand_cache_mutex);
|
||||
}
|
||||
#endif
|
||||
|
||||
static struct bpf_cand_cache *
|
||||
bpf_core_add_cands(struct bpf_cand_cache *cands, const struct btf *targ_btf,
|
||||
int targ_start_id)
|
||||
{
|
||||
struct bpf_cand_cache *new_cands;
|
||||
const struct btf_type *t;
|
||||
const char *targ_name;
|
||||
size_t targ_essent_len;
|
||||
int n, i;
|
||||
|
||||
n = btf_nr_types(targ_btf);
|
||||
for (i = targ_start_id; i < n; i++) {
|
||||
t = btf_type_by_id(targ_btf, i);
|
||||
if (btf_kind(t) != cands->kind)
|
||||
continue;
|
||||
|
||||
targ_name = btf_name_by_offset(targ_btf, t->name_off);
|
||||
if (!targ_name)
|
||||
continue;
|
||||
|
||||
/* the resched point is before strncmp to make sure that search
|
||||
* for non-existing name will have a chance to schedule().
|
||||
*/
|
||||
cond_resched();
|
||||
|
||||
if (strncmp(cands->name, targ_name, cands->name_len) != 0)
|
||||
continue;
|
||||
|
||||
targ_essent_len = bpf_core_essential_name_len(targ_name);
|
||||
if (targ_essent_len != cands->name_len)
|
||||
continue;
|
||||
|
||||
/* most of the time there is only one candidate for a given kind+name pair */
|
||||
new_cands = kmalloc(sizeof_cands(cands->cnt + 1), GFP_KERNEL);
|
||||
if (!new_cands) {
|
||||
bpf_free_cands(cands);
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
|
||||
memcpy(new_cands, cands, sizeof_cands(cands->cnt));
|
||||
bpf_free_cands(cands);
|
||||
cands = new_cands;
|
||||
cands->cands[cands->cnt].btf = targ_btf;
|
||||
cands->cands[cands->cnt].id = i;
|
||||
cands->cnt++;
|
||||
}
|
||||
return cands;
|
||||
}
|
||||
|
||||
static struct bpf_cand_cache *
|
||||
bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id)
|
||||
{
|
||||
struct bpf_cand_cache *cands, *cc, local_cand = {};
|
||||
const struct btf *local_btf = ctx->btf;
|
||||
const struct btf_type *local_type;
|
||||
const struct btf *main_btf;
|
||||
size_t local_essent_len;
|
||||
struct btf *mod_btf;
|
||||
const char *name;
|
||||
int id;
|
||||
|
||||
main_btf = bpf_get_btf_vmlinux();
|
||||
if (IS_ERR(main_btf))
|
||||
return ERR_CAST(main_btf);
|
||||
|
||||
local_type = btf_type_by_id(local_btf, local_type_id);
|
||||
if (!local_type)
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
name = btf_name_by_offset(local_btf, local_type->name_off);
|
||||
if (str_is_empty(name))
|
||||
return ERR_PTR(-EINVAL);
|
||||
local_essent_len = bpf_core_essential_name_len(name);
|
||||
|
||||
cands = &local_cand;
|
||||
cands->name = name;
|
||||
cands->kind = btf_kind(local_type);
|
||||
cands->name_len = local_essent_len;
|
||||
|
||||
cc = check_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
|
||||
/* cands is a pointer to stack here */
|
||||
if (cc) {
|
||||
if (cc->cnt)
|
||||
return cc;
|
||||
goto check_modules;
|
||||
}
|
||||
|
||||
/* Attempt to find target candidates in vmlinux BTF first */
|
||||
cands = bpf_core_add_cands(cands, main_btf, 1);
|
||||
if (IS_ERR(cands))
|
||||
return ERR_CAST(cands);
|
||||
|
||||
/* cands is a pointer to kmalloced memory here if cands->cnt > 0 */
|
||||
|
||||
/* populate cache even when cands->cnt == 0 */
|
||||
cc = populate_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
|
||||
if (IS_ERR(cc))
|
||||
return ERR_CAST(cc);
|
||||
|
||||
/* if vmlinux BTF has any candidate, don't go for module BTFs */
|
||||
if (cc->cnt)
|
||||
return cc;
|
||||
|
||||
check_modules:
|
||||
/* cands is a pointer to stack here and cands->cnt == 0 */
|
||||
cc = check_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE);
|
||||
if (cc)
|
||||
/* if cache has it return it even if cc->cnt == 0 */
|
||||
return cc;
|
||||
|
||||
/* If candidate is not found in vmlinux's BTF then search in module's BTFs */
|
||||
spin_lock_bh(&btf_idr_lock);
|
||||
idr_for_each_entry(&btf_idr, mod_btf, id) {
|
||||
if (!btf_is_module(mod_btf))
|
||||
continue;
|
||||
/* linear search could be slow hence unlock/lock
|
||||
* the IDR to avoiding holding it for too long
|
||||
*/
|
||||
btf_get(mod_btf);
|
||||
spin_unlock_bh(&btf_idr_lock);
|
||||
cands = bpf_core_add_cands(cands, mod_btf, btf_nr_types(main_btf));
|
||||
if (IS_ERR(cands)) {
|
||||
btf_put(mod_btf);
|
||||
return ERR_CAST(cands);
|
||||
}
|
||||
spin_lock_bh(&btf_idr_lock);
|
||||
btf_put(mod_btf);
|
||||
}
|
||||
spin_unlock_bh(&btf_idr_lock);
|
||||
/* cands is a pointer to kmalloced memory here if cands->cnt > 0
|
||||
* or pointer to stack if cands->cnd == 0.
|
||||
* Copy it into the cache even when cands->cnt == 0 and
|
||||
* return the result.
|
||||
*/
|
||||
return populate_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE);
|
||||
}
|
||||
|
||||
int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
|
||||
int relo_idx, void *insn)
|
||||
{
|
||||
bool need_cands = relo->kind != BPF_CORE_TYPE_ID_LOCAL;
|
||||
struct bpf_core_cand_list cands = {};
|
||||
struct bpf_core_spec *specs;
|
||||
int err;
|
||||
|
||||
/* ~4k of temp memory necessary to convert LLVM spec like "0:1:0:5"
|
||||
* into arrays of btf_ids of struct fields and array indices.
|
||||
*/
|
||||
specs = kcalloc(3, sizeof(*specs), GFP_KERNEL);
|
||||
if (!specs)
|
||||
return -ENOMEM;
|
||||
|
||||
if (need_cands) {
|
||||
struct bpf_cand_cache *cc;
|
||||
int i;
|
||||
|
||||
mutex_lock(&cand_cache_mutex);
|
||||
cc = bpf_core_find_cands(ctx, relo->type_id);
|
||||
if (IS_ERR(cc)) {
|
||||
bpf_log(ctx->log, "target candidate search failed for %d\n",
|
||||
relo->type_id);
|
||||
err = PTR_ERR(cc);
|
||||
goto out;
|
||||
}
|
||||
if (cc->cnt) {
|
||||
cands.cands = kcalloc(cc->cnt, sizeof(*cands.cands), GFP_KERNEL);
|
||||
if (!cands.cands) {
|
||||
err = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
for (i = 0; i < cc->cnt; i++) {
|
||||
bpf_log(ctx->log,
|
||||
"CO-RE relocating %s %s: found target candidate [%d]\n",
|
||||
btf_kind_str[cc->kind], cc->name, cc->cands[i].id);
|
||||
cands.cands[i].btf = cc->cands[i].btf;
|
||||
cands.cands[i].id = cc->cands[i].id;
|
||||
}
|
||||
cands.len = cc->cnt;
|
||||
/* cand_cache_mutex needs to span the cache lookup and
|
||||
* copy of btf pointer into bpf_core_cand_list,
|
||||
* since module can be unloaded while bpf_core_apply_relo_insn
|
||||
* is working with module's btf.
|
||||
*/
|
||||
}
|
||||
|
||||
err = bpf_core_apply_relo_insn((void *)ctx->log, insn, relo->insn_off / 8,
|
||||
relo, relo_idx, ctx->btf, &cands, specs);
|
||||
out:
|
||||
kfree(specs);
|
||||
if (need_cands) {
|
||||
kfree(cands.cands);
|
||||
mutex_unlock(&cand_cache_mutex);
|
||||
if (ctx->log->level & BPF_LOG_LEVEL2)
|
||||
print_cand_cache(ctx->log);
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
@ -430,7 +430,7 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
* Exactly one of @prog or @link can be non-null.
* Must be called with cgroup_mutex held.
*/
int __cgroup_bpf_attach(struct cgroup *cgrp,
static int __cgroup_bpf_attach(struct cgroup *cgrp,
struct bpf_prog *prog, struct bpf_prog *replace_prog,
struct bpf_cgroup_link *link,
enum bpf_attach_type type, u32 flags)
@ -523,6 +523,20 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
return err;
}

static int cgroup_bpf_attach(struct cgroup *cgrp,
struct bpf_prog *prog, struct bpf_prog *replace_prog,
struct bpf_cgroup_link *link,
enum bpf_attach_type type,
u32 flags)
{
int ret;

mutex_lock(&cgroup_mutex);
ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
mutex_unlock(&cgroup_mutex);
return ret;
}

/* Swap updated BPF program for given link in effective program arrays across
* all descendant cgroups. This function is guaranteed to succeed.
*/
@ -672,13 +686,13 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
* propagate the change to descendants
* @cgrp: The cgroup which descendants to traverse
* @prog: A program to detach or NULL
* @prog: A link to detach or NULL
* @link: A link to detach or NULL
* @type: Type of detach operation
*
* At most one of @prog or @link can be non-NULL.
* Must be called with cgroup_mutex held.
*/
int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
struct bpf_cgroup_link *link, enum bpf_attach_type type)
{
enum cgroup_bpf_attach_type atype;
@ -730,8 +744,19 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
return err;
}

static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
enum bpf_attach_type type)
{
int ret;

mutex_lock(&cgroup_mutex);
ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
mutex_unlock(&cgroup_mutex);
return ret;
}

/* Must be called with cgroup_mutex held to avoid races. */
int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
@ -789,6 +814,17 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
return ret;
}

static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
int ret;

mutex_lock(&cgroup_mutex);
ret = __cgroup_bpf_query(cgrp, attr, uattr);
mutex_unlock(&cgroup_mutex);
return ret;
}

int cgroup_bpf_prog_attach(const union bpf_attr *attr,
enum bpf_prog_type ptype, struct bpf_prog *prog)
{
@ -1753,7 +1789,7 @@ static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_PTR_TO_MEM,
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
};
@ -32,6 +32,7 @@
#include <linux/perf_event.h>
#include <linux/extable.h>
#include <linux/log2.h>
#include <linux/bpf_verifier.h>

#include <asm/barrier.h>
#include <asm/unaligned.h>
@ -389,6 +390,13 @@ static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, s32 end_old,
i = end_new;
insn = prog->insnsi + end_old;
}
if (bpf_pseudo_func(insn)) {
ret = bpf_adj_delta_to_imm(insn, pos, end_old,
end_new, i, probe_pass);
if (ret)
return ret;
continue;
}
code = insn->code;
if ((BPF_CLASS(code) != BPF_JMP &&
BPF_CLASS(code) != BPF_JMP32) ||
@ -1566,7 +1574,8 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)

if (unlikely(index >= array->map.max_entries))
goto out;
if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))

if (unlikely(tail_call_cnt >= MAX_TAIL_CALL_CNT))
goto out;

tail_call_cnt++;
@ -1883,7 +1892,7 @@ static void bpf_prog_select_func(struct bpf_prog *fp)

/**
* bpf_prog_select_runtime - select exec runtime for BPF program
* @fp: bpf_prog populated with internal BPF program
* @fp: bpf_prog populated with BPF program
* @err: pointer to error variable
*
* Try to JIT eBPF program, if JIT is not available, use interpreter.
@ -2263,6 +2272,9 @@ static void bpf_prog_free_deferred(struct work_struct *work)
int i;

aux = container_of(work, struct bpf_prog_aux, work);
#ifdef CONFIG_BPF_SYSCALL
bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab);
#endif
bpf_free_used_maps(aux);
bpf_free_used_btfs(aux);
if (bpf_prog_is_dev_bound(aux))
@ -2289,7 +2301,6 @@ static void bpf_prog_free_deferred(struct work_struct *work)
}
}

/* Free internal BPF program */
void bpf_prog_free(struct bpf_prog *fp)
{
struct bpf_prog_aux *aux = fp->aux;
@ -2365,6 +2376,11 @@ const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
return NULL;
}

const struct bpf_func_proto * __weak bpf_get_trace_vprintk_proto(void)
{
return NULL;
}

u64 __weak
bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)
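Context for the bpf_pseudo_func() branch added to bpf_adj_branches() above: a "pseudo func" is the ld_imm64 instruction form whose src_reg marks it as carrying a subprogram reference in insn->imm, so inserting or deleting instructions has to patch that immediate rather than a jump offset. A rough sketch of the check (not necessarily the kernel's exact helper):

#include <linux/bpf.h>

static bool is_pseudo_func_insn(const struct bpf_insn *insn)
{
	/* ld_imm64 whose source register field selects the subprog-address form */
	return insn->code == (BPF_LD | BPF_IMM | BPF_DW) &&
	       insn->src_reg == BPF_PSEUDO_FUNC;
}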
@ -195,7 +195,7 @@ static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
}
return;
default:
bpf_warn_invalid_xdp_action(act);
bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act);
fallthrough;
case XDP_ABORTED:
trace_xdp_exception(skb->dev, rcpu->prog, act);
@ -254,7 +254,7 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
}
break;
default:
bpf_warn_invalid_xdp_action(act);
bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act);
fallthrough;
case XDP_DROP:
xdp_return_frame(xdpf);
@ -746,15 +746,9 @@ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
list_add(&bq->flush_node, flush_list);
}

int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf,
struct net_device *dev_rx)
{
struct xdp_frame *xdpf;

xdpf = xdp_convert_buff_to_frame(xdp);
if (unlikely(!xdpf))
return -EOVERFLOW;

/* Info needed when constructing SKB on remote CPU */
xdpf->dev_rx = dev_rx;
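A hedged sketch of what a caller looks like after this signature change (the function name here is illustrative; the conversion presumably moves out to the generic XDP redirect path): the xdp_buff is turned into an xdp_frame exactly once before being handed to cpu_map_enqueue().

#include <linux/bpf.h>
#include <net/xdp.h>

static int enqueue_to_cpumap(struct bpf_cpu_map_entry *rcpu,
			     struct xdp_buff *xdp, struct net_device *dev_rx)
{
	struct xdp_frame *xdpf = xdp_convert_buff_to_frame(xdp);	/* done by the caller now */

	if (unlikely(!xdpf))
		return -EOVERFLOW;

	return cpu_map_enqueue(rcpu, xdpf, dev_rx);
}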
@ -348,7 +348,7 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
|
||||
frames[nframes++] = xdpf;
|
||||
break;
|
||||
default:
|
||||
bpf_warn_invalid_xdp_action(act);
|
||||
bpf_warn_invalid_xdp_action(NULL, xdp_prog, act);
|
||||
fallthrough;
|
||||
case XDP_ABORTED:
|
||||
trace_xdp_exception(dev, xdp_prog, act);
|
||||
@ -467,24 +467,19 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
bq->q[bq->count++] = xdpf;
}

static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
struct net_device *dev_rx,
struct bpf_prog *xdp_prog)
{
struct xdp_frame *xdpf;
int err;

if (!dev->netdev_ops->ndo_xdp_xmit)
return -EOPNOTSUPP;

err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data);
err = xdp_ok_fwd_dev(dev, xdpf->len);
if (unlikely(err))
return err;

xdpf = xdp_convert_buff_to_frame(xdp);
if (unlikely(!xdpf))
return -EOVERFLOW;

bq_enqueue(dev, xdpf, dev_rx, xdp_prog);
return 0;
}
@ -507,7 +502,7 @@ static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev
|
||||
__skb_push(skb, skb->mac_len);
|
||||
break;
|
||||
default:
|
||||
bpf_warn_invalid_xdp_action(act);
|
||||
bpf_warn_invalid_xdp_action(NULL, dst->xdp_prog, act);
|
||||
fallthrough;
|
||||
case XDP_ABORTED:
|
||||
trace_xdp_exception(dst->dev, dst->xdp_prog, act);
|
||||
@ -520,27 +515,27 @@ static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev
|
||||
return act;
|
||||
}
|
||||
|
||||
int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
|
||||
int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
|
||||
struct net_device *dev_rx)
|
||||
{
|
||||
return __xdp_enqueue(dev, xdp, dev_rx, NULL);
|
||||
return __xdp_enqueue(dev, xdpf, dev_rx, NULL);
|
||||
}
|
||||
|
||||
int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
|
||||
int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf,
|
||||
struct net_device *dev_rx)
|
||||
{
|
||||
struct net_device *dev = dst->dev;
|
||||
|
||||
return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog);
|
||||
return __xdp_enqueue(dev, xdpf, dev_rx, dst->xdp_prog);
|
||||
}
|
||||
|
||||
static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_buff *xdp)
|
||||
static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
|
||||
{
|
||||
if (!obj ||
|
||||
!obj->dev->netdev_ops->ndo_xdp_xmit)
|
||||
return false;
|
||||
|
||||
if (xdp_ok_fwd_dev(obj->dev, xdp->data_end - xdp->data))
|
||||
if (xdp_ok_fwd_dev(obj->dev, xdpf->len))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
@ -586,14 +581,13 @@ static int get_upper_ifindexes(struct net_device *dev, int *indexes)
|
||||
return n;
|
||||
}
|
||||
|
||||
int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
|
||||
int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx,
|
||||
struct bpf_map *map, bool exclude_ingress)
|
||||
{
|
||||
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
|
||||
struct bpf_dtab_netdev *dst, *last_dst = NULL;
|
||||
int excluded_devices[1+MAX_NEST_DEV];
|
||||
struct hlist_head *head;
|
||||
struct xdp_frame *xdpf;
|
||||
int num_excluded = 0;
|
||||
unsigned int i;
|
||||
int err;
|
||||
@ -603,15 +597,11 @@ int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
|
||||
excluded_devices[num_excluded++] = dev_rx->ifindex;
|
||||
}
|
||||
|
||||
xdpf = xdp_convert_buff_to_frame(xdp);
|
||||
if (unlikely(!xdpf))
|
||||
return -EOVERFLOW;
|
||||
|
||||
if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
|
||||
for (i = 0; i < map->max_entries; i++) {
|
||||
dst = rcu_dereference_check(dtab->netdev_map[i],
|
||||
rcu_read_lock_bh_held());
|
||||
if (!is_valid_dst(dst, xdp))
|
||||
if (!is_valid_dst(dst, xdpf))
|
||||
continue;
|
||||
|
||||
if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex))
|
||||
@ -634,7 +624,7 @@ int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
|
||||
head = dev_map_index_hash(dtab, i);
|
||||
hlist_for_each_entry_rcu(dst, head, index_hlist,
|
||||
lockdep_is_held(&dtab->index_lock)) {
|
||||
if (!is_valid_dst(dst, xdp))
|
||||
if (!is_valid_dst(dst, xdpf))
|
||||
continue;
|
||||
|
||||
if (is_ifindex_excluded(excluded_devices, num_excluded,
|
||||
|
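The cpumap and devmap hunks above drop the xdp_buff-to-xdp_frame conversion from the enqueue helpers; callers are now expected to convert once and hand the frame down. A rough caller-side sketch under that assumption (the enclosing function name is illustrative, error handling trimmed):

static int example_redirect_to_dev(struct xdp_buff *xdp,
				   struct net_device *dev_rx,
				   struct bpf_dtab_netdev *dst)
{
	struct xdp_frame *xdpf;

	/* Convert exactly once, before handing off to the map helpers. */
	xdpf = xdp_convert_buff_to_frame(xdp);
	if (unlikely(!xdpf))
		return -EOVERFLOW;

	/* dev_map_enqueue() now takes the already-converted frame. */
	return dev_map_enqueue(dst, xdpf, dev_rx);
}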
@ -668,7 +668,7 @@ static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)

BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
(void *(*)(struct bpf_map *map, void *key))NULL));
*insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
offsetof(struct htab_elem, key) +
@ -709,7 +709,7 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map,
|
||||
|
||||
BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
|
||||
(void *(*)(struct bpf_map *map, void *key))NULL));
|
||||
*insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
|
||||
*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
|
||||
*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
|
||||
*insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
|
||||
offsetof(struct htab_elem, lru_node) +
|
||||
@ -2049,7 +2049,7 @@ static const struct bpf_iter_seq_info iter_seq_info = {
.seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info),
};

static int bpf_for_each_hash_elem(struct bpf_map *map, void *callback_fn,
static int bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
void *callback_ctx, u64 flags)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
@ -2089,9 +2089,8 @@ static int bpf_for_each_hash_elem(struct bpf_map *map, void *callback_fn,
val = elem->key + roundup_key_size;
}
num_elems++;
ret = BPF_CAST_CALL(callback_fn)((u64)(long)map,
(u64)(long)key, (u64)(long)val,
(u64)(long)callback_ctx, 0);
ret = callback_fn((u64)(long)map, (u64)(long)key,
(u64)(long)val, (u64)(long)callback_ctx, 0);
/* return value: 0 - continue, 1 - stop and return */
if (ret) {
rcu_read_unlock();
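These hunks replace the BPF_CAST_CALL() casts with a typed callback parameter. For reference, the callback type appears to be a plain five-u64 function pointer (typedef u64 (*bpf_callback_t)(u64, u64, u64, u64, u64) upstream), so a call site reduces to an ordinary indirect call; a minimal sketch with illustrative names:

static u64 example_invoke_callback(bpf_callback_t cb, struct bpf_map *map,
				   void *key, void *val, void *ctx)
{
	/* All five arguments are passed as u64, unused slots as 0. */
	return cb((u64)(long)map, (u64)(long)key, (u64)(long)val,
		  (u64)(long)ctx, 0);
}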
@ -2397,7 +2396,7 @@ static int htab_of_map_gen_lookup(struct bpf_map *map,
|
||||
|
||||
BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
|
||||
(void *(*)(struct bpf_map *map, void *key))NULL));
|
||||
*insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
|
||||
*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
|
||||
*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
|
||||
*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
|
||||
offsetof(struct htab_elem, key) +
|
||||
|
@ -2,6 +2,8 @@
|
||||
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
|
||||
*/
|
||||
#include <linux/bpf.h>
|
||||
#include <linux/btf.h>
|
||||
#include <linux/bpf-cgroup.h>
|
||||
#include <linux/rcupdate.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/smp.h>
|
||||
@ -530,7 +532,7 @@ const struct bpf_func_proto bpf_strtol_proto = {
|
||||
.func = bpf_strtol,
|
||||
.gpl_only = false,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_MEM,
|
||||
.arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg2_type = ARG_CONST_SIZE,
|
||||
.arg3_type = ARG_ANYTHING,
|
||||
.arg4_type = ARG_PTR_TO_LONG,
|
||||
@ -558,13 +560,27 @@ const struct bpf_func_proto bpf_strtoul_proto = {
.func = bpf_strtoul,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_MEM,
.arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_LONG,
};
#endif

BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2)
{
return strncmp(s1, s2, s1_sz);
}

const struct bpf_func_proto bpf_strncmp_proto = {
.func = bpf_strncmp,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_MEM,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_PTR_TO_CONST_STR,
};

BPF_CALL_4(bpf_get_ns_current_pid_tgid, u64, dev, u64, ino,
struct bpf_pidns_info *, nsdata, u32, size)
{
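The bpf_strncmp() helper added above is exposed to BPF programs; per its proto, the third argument must be a read-only constant string (ARG_PTR_TO_CONST_STR) and the second is the size of the first buffer. A rough program-side sketch, assuming the usual libbpf helper declarations are available; the buffer and literal are only examples:

char comm[16];	/* filled elsewhere, e.g. by bpf_get_current_comm() */

static __always_inline int example_comm_is_systemd(void)
{
	/* s1 buffer and its size first, then the constant string. */
	return bpf_strncmp(comm, sizeof(comm), "systemd") == 0;
}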
@ -630,7 +646,7 @@ const struct bpf_func_proto bpf_event_output_data_proto = {
|
||||
.arg1_type = ARG_PTR_TO_CTX,
|
||||
.arg2_type = ARG_CONST_MAP_PTR,
|
||||
.arg3_type = ARG_ANYTHING,
|
||||
.arg4_type = ARG_PTR_TO_MEM,
|
||||
.arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
|
||||
};
|
||||
|
||||
@ -667,7 +683,7 @@ BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu)
|
||||
const struct bpf_func_proto bpf_per_cpu_ptr_proto = {
|
||||
.func = bpf_per_cpu_ptr,
|
||||
.gpl_only = false,
|
||||
.ret_type = RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL,
|
||||
.ret_type = RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL | MEM_RDONLY,
|
||||
.arg1_type = ARG_PTR_TO_PERCPU_BTF_ID,
|
||||
.arg2_type = ARG_ANYTHING,
|
||||
};
|
||||
@ -680,7 +696,7 @@ BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr)
|
||||
const struct bpf_func_proto bpf_this_cpu_ptr_proto = {
|
||||
.func = bpf_this_cpu_ptr,
|
||||
.gpl_only = false,
|
||||
.ret_type = RET_PTR_TO_MEM_OR_BTF_ID,
|
||||
.ret_type = RET_PTR_TO_MEM_OR_BTF_ID | MEM_RDONLY,
|
||||
.arg1_type = ARG_PTR_TO_PERCPU_BTF_ID,
|
||||
};
|
||||
|
||||
@ -979,15 +995,13 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
|
||||
return err;
|
||||
}
|
||||
|
||||
#define MAX_SNPRINTF_VARARGS 12
|
||||
|
||||
BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt,
|
||||
const void *, data, u32, data_len)
|
||||
{
|
||||
int err, num_args;
|
||||
u32 *bin_args;
|
||||
|
||||
if (data_len % 8 || data_len > MAX_SNPRINTF_VARARGS * 8 ||
|
||||
if (data_len % 8 || data_len > MAX_BPRINTF_VARARGS * 8 ||
|
||||
(data_len && !data))
|
||||
return -EINVAL;
|
||||
num_args = data_len / 8;
|
||||
@ -1013,7 +1027,7 @@ const struct bpf_func_proto bpf_snprintf_proto = {
|
||||
.arg1_type = ARG_PTR_TO_MEM_OR_NULL,
|
||||
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
|
||||
.arg3_type = ARG_PTR_TO_CONST_STR,
|
||||
.arg4_type = ARG_PTR_TO_MEM_OR_NULL,
|
||||
.arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
|
||||
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
|
||||
};
|
||||
|
||||
@ -1058,10 +1072,11 @@ static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
|
||||
struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer);
|
||||
struct bpf_map *map = t->map;
|
||||
void *value = t->value;
|
||||
void *callback_fn;
|
||||
bpf_callback_t callback_fn;
|
||||
void *key;
|
||||
u32 idx;
|
||||
|
||||
BTF_TYPE_EMIT(struct bpf_timer);
|
||||
callback_fn = rcu_dereference_check(t->callback_fn, rcu_read_lock_bh_held());
|
||||
if (!callback_fn)
|
||||
goto out;
|
||||
@ -1083,8 +1098,7 @@ static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
|
||||
key = value - round_up(map->key_size, 8);
|
||||
}
|
||||
|
||||
BPF_CAST_CALL(callback_fn)((u64)(long)map, (u64)(long)key,
|
||||
(u64)(long)value, 0, 0);
|
||||
callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0);
|
||||
/* The verifier checked that return value is zero. */
|
||||
|
||||
this_cpu_write(hrtimer_running, NULL);
|
||||
@ -1379,6 +1393,10 @@ bpf_base_func_proto(enum bpf_func_id func_id)
|
||||
return &bpf_ringbuf_query_proto;
|
||||
case BPF_FUNC_for_each_map_elem:
|
||||
return &bpf_for_each_map_elem_proto;
|
||||
case BPF_FUNC_loop:
|
||||
return &bpf_loop_proto;
|
||||
case BPF_FUNC_strncmp:
|
||||
return &bpf_strncmp_proto;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
@ -1435,6 +1453,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
|
||||
return &bpf_snprintf_proto;
|
||||
case BPF_FUNC_task_pt_regs:
|
||||
return &bpf_task_pt_regs_proto;
|
||||
case BPF_FUNC_trace_vprintk:
|
||||
return bpf_get_trace_vprintk_proto();
|
||||
default:
|
||||
return NULL;
|
||||
}
|
||||
|
@ -163,8 +163,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key,
|
||||
return 0;
|
||||
}
|
||||
|
||||
new = bpf_map_kmalloc_node(map, sizeof(struct bpf_storage_buffer) +
|
||||
map->value_size,
|
||||
new = bpf_map_kmalloc_node(map, struct_size(new, data, map->value_size),
|
||||
__GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN,
|
||||
map->numa_node);
|
||||
if (!new)
|
||||
|
@ -174,9 +174,9 @@ static const struct bpf_iter_reg bpf_map_elem_reg_info = {
|
||||
.ctx_arg_info_size = 2,
|
||||
.ctx_arg_info = {
|
||||
{ offsetof(struct bpf_iter__bpf_map_elem, key),
|
||||
PTR_TO_RDONLY_BUF_OR_NULL },
|
||||
PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY },
|
||||
{ offsetof(struct bpf_iter__bpf_map_elem, value),
|
||||
PTR_TO_RDWR_BUF_OR_NULL },
|
||||
PTR_TO_BUF | PTR_MAYBE_NULL },
|
||||
},
|
||||
};
|
||||
|
||||
|
@ -1,6 +1,7 @@
|
||||
// SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
#include <linux/bpf.h>
|
||||
#include <linux/bpf-netns.h>
|
||||
#include <linux/filter.h>
|
||||
#include <net/net_namespace.h>
|
||||
|
||||
|
kernel/bpf/preload/.gitignore
@ -1,4 +1,2 @@
|
||||
/FEATURE-DUMP.libbpf
|
||||
/bpf_helper_defs.h
|
||||
/feature
|
||||
/libbpf
|
||||
/bpf_preload_umd
|
||||
|
@ -1,21 +1,35 @@
|
||||
# SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
LIBBPF_SRCS = $(srctree)/tools/lib/bpf/
|
||||
LIBBPF_A = $(obj)/libbpf.a
|
||||
LIBBPF_OUT = $(abspath $(obj))
|
||||
LIBBPF_OUT = $(abspath $(obj))/libbpf
|
||||
LIBBPF_A = $(LIBBPF_OUT)/libbpf.a
|
||||
LIBBPF_DESTDIR = $(LIBBPF_OUT)
|
||||
LIBBPF_INCLUDE = $(LIBBPF_DESTDIR)/include
|
||||
|
||||
# Although not in use by libbpf's Makefile, set $(O) so that the "dummy" test
|
||||
# in tools/scripts/Makefile.include always succeeds when building the kernel
|
||||
# with $(O) pointing to a relative path, as in "make O=build bindeb-pkg".
|
||||
$(LIBBPF_A):
|
||||
$(Q)$(MAKE) -C $(LIBBPF_SRCS) O=$(LIBBPF_OUT)/ OUTPUT=$(LIBBPF_OUT)/ $(LIBBPF_OUT)/libbpf.a
|
||||
$(LIBBPF_A): | $(LIBBPF_OUT)
|
||||
$(Q)$(MAKE) -C $(LIBBPF_SRCS) O=$(LIBBPF_OUT)/ OUTPUT=$(LIBBPF_OUT)/ \
|
||||
DESTDIR=$(LIBBPF_DESTDIR) prefix= \
|
||||
$(LIBBPF_OUT)/libbpf.a install_headers
|
||||
|
||||
libbpf_hdrs: $(LIBBPF_A)
|
||||
|
||||
.PHONY: libbpf_hdrs
|
||||
|
||||
$(LIBBPF_OUT):
|
||||
$(call msg,MKDIR,$@)
|
||||
$(Q)mkdir -p $@
|
||||
|
||||
userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi \
|
||||
-I $(srctree)/tools/lib/ -Wno-unused-result
|
||||
-I $(LIBBPF_INCLUDE) -Wno-unused-result
|
||||
|
||||
userprogs := bpf_preload_umd
|
||||
|
||||
clean-files := $(userprogs) bpf_helper_defs.h FEATURE-DUMP.libbpf staticobjs/ feature/
|
||||
clean-files := libbpf/
|
||||
|
||||
$(obj)/iterators/iterators.o: | libbpf_hdrs
|
||||
|
||||
bpf_preload_umd-objs := iterators/iterators.o
|
||||
bpf_preload_umd-userldlibs := $(LIBBPF_A) -lelf -lz
|
||||
|
@ -1,18 +1,26 @@
|
||||
# SPDX-License-Identifier: GPL-2.0
|
||||
OUTPUT := .output
|
||||
abs_out := $(abspath $(OUTPUT))
|
||||
|
||||
CLANG ?= clang
|
||||
LLC ?= llc
|
||||
LLVM_STRIP ?= llvm-strip
|
||||
|
||||
TOOLS_PATH := $(abspath ../../../../tools)
|
||||
BPFTOOL_SRC := $(TOOLS_PATH)/bpf/bpftool
|
||||
BPFTOOL_OUTPUT := $(abs_out)/bpftool
|
||||
DEFAULT_BPFTOOL := $(OUTPUT)/sbin/bpftool
|
||||
BPFTOOL ?= $(DEFAULT_BPFTOOL)
|
||||
LIBBPF_SRC := $(abspath ../../../../tools/lib/bpf)
|
||||
BPFOBJ := $(OUTPUT)/libbpf.a
|
||||
BPF_INCLUDE := $(OUTPUT)
|
||||
INCLUDES := -I$(OUTPUT) -I$(BPF_INCLUDE) -I$(abspath ../../../../tools/lib) \
|
||||
-I$(abspath ../../../../tools/include/uapi)
|
||||
|
||||
LIBBPF_SRC := $(TOOLS_PATH)/lib/bpf
|
||||
LIBBPF_OUTPUT := $(abs_out)/libbpf
|
||||
LIBBPF_DESTDIR := $(LIBBPF_OUTPUT)
|
||||
LIBBPF_INCLUDE := $(LIBBPF_DESTDIR)/include
|
||||
BPFOBJ := $(LIBBPF_OUTPUT)/libbpf.a
|
||||
|
||||
INCLUDES := -I$(OUTPUT) -I$(LIBBPF_INCLUDE) -I$(TOOLS_PATH)/include/uapi
|
||||
CFLAGS := -g -Wall
|
||||
|
||||
abs_out := $(abspath $(OUTPUT))
|
||||
ifeq ($(V),1)
|
||||
Q =
|
||||
msg =
|
||||
@ -44,14 +52,18 @@ $(OUTPUT)/iterators.bpf.o: iterators.bpf.c $(BPFOBJ) | $(OUTPUT)
|
||||
-c $(filter %.c,$^) -o $@ && \
|
||||
$(LLVM_STRIP) -g $@
|
||||
|
||||
$(OUTPUT):
|
||||
$(OUTPUT) $(LIBBPF_OUTPUT) $(BPFTOOL_OUTPUT):
|
||||
$(call msg,MKDIR,$@)
|
||||
$(Q)mkdir -p $(OUTPUT)
|
||||
$(Q)mkdir -p $@
|
||||
|
||||
$(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT)
|
||||
$(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(LIBBPF_OUTPUT)
|
||||
$(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) \
|
||||
OUTPUT=$(abspath $(dir $@))/ $(abspath $@)
|
||||
OUTPUT=$(abspath $(dir $@))/ prefix= \
|
||||
DESTDIR=$(LIBBPF_DESTDIR) $(abspath $@) install_headers
|
||||
|
||||
$(DEFAULT_BPFTOOL):
|
||||
$(Q)$(MAKE) $(submake_extras) -C ../../../../tools/bpf/bpftool \
|
||||
prefix= OUTPUT=$(abs_out)/ DESTDIR=$(abs_out) install
|
||||
$(DEFAULT_BPFTOOL): $(BPFOBJ) | $(BPFTOOL_OUTPUT)
|
||||
$(Q)$(MAKE) $(submake_extras) -C $(BPFTOOL_SRC) \
|
||||
OUTPUT=$(BPFTOOL_OUTPUT)/ \
|
||||
LIBBPF_OUTPUT=$(LIBBPF_OUTPUT)/ \
|
||||
LIBBPF_DESTDIR=$(LIBBPF_DESTDIR)/ \
|
||||
prefix= DESTDIR=$(abs_out)/ install-bin
|
||||
|
@ -152,16 +152,12 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
{
int numa_node = bpf_map_attr_numa_node(attr);
struct reuseport_array *array;
u64 array_size;

if (!bpf_capable())
return ERR_PTR(-EPERM);

array_size = sizeof(*array);
array_size += (u64)attr->max_entries * sizeof(struct sock *);

/* allocate all map elements and zero-initialize them */
array = bpf_map_area_alloc(array_size, numa_node);
array = bpf_map_area_alloc(struct_size(array, ptrs, attr->max_entries), numa_node);
if (!array)
return ERR_PTR(-ENOMEM);
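As in the cgroup local-storage hunk earlier, the open-coded size arithmetic above is replaced with struct_size(), which computes sizeof(*array) plus max_entries trailing elements with overflow checking. A small sketch of the idiom with an illustrative struct that is not from this diff:

#include <linux/overflow.h>
#include <linux/slab.h>

struct example_array {
	unsigned int nr;
	struct sock *ptrs[];		/* flexible array member */
};

static struct example_array *example_alloc(unsigned int max_entries)
{
	struct example_array *a;

	/* struct_size() only uses "a" inside sizeof(), so this is safe. */
	a = kzalloc(struct_size(a, ptrs, max_entries), GFP_KERNEL);
	if (a)
		a->nr = max_entries;
	return a;
}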
@ -444,7 +444,7 @@ const struct bpf_func_proto bpf_ringbuf_output_proto = {
|
||||
.func = bpf_ringbuf_output,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_CONST_MAP_PTR,
|
||||
.arg2_type = ARG_PTR_TO_MEM,
|
||||
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
|
||||
.arg4_type = ARG_ANYTHING,
|
||||
};
|
||||
|
@ -7,10 +7,10 @@
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/stacktrace.h>
|
||||
#include <linux/perf_event.h>
|
||||
#include <linux/irq_work.h>
|
||||
#include <linux/btf_ids.h>
|
||||
#include <linux/buildid.h>
|
||||
#include "percpu_freelist.h"
|
||||
#include "mmap_unlock_work.h"
|
||||
|
||||
#define STACK_CREATE_FLAG_MASK \
|
||||
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \
|
||||
@ -31,25 +31,6 @@ struct bpf_stack_map {
|
||||
struct stack_map_bucket *buckets[];
|
||||
};
|
||||
|
||||
/* irq_work to run up_read() for build_id lookup in nmi context */
|
||||
struct stack_map_irq_work {
|
||||
struct irq_work irq_work;
|
||||
struct mm_struct *mm;
|
||||
};
|
||||
|
||||
static void do_up_read(struct irq_work *entry)
|
||||
{
|
||||
struct stack_map_irq_work *work;
|
||||
|
||||
if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
|
||||
return;
|
||||
|
||||
work = container_of(entry, struct stack_map_irq_work, irq_work);
|
||||
mmap_read_unlock_non_owner(work->mm);
|
||||
}
|
||||
|
||||
static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work);
|
||||
|
||||
static inline bool stack_map_use_build_id(struct bpf_map *map)
|
||||
{
|
||||
return (map->map_flags & BPF_F_STACK_BUILD_ID);
|
||||
@ -149,35 +130,13 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
|
||||
u64 *ips, u32 trace_nr, bool user)
|
||||
{
|
||||
int i;
|
||||
struct mmap_unlock_irq_work *work = NULL;
|
||||
bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);
|
||||
struct vm_area_struct *vma;
|
||||
bool irq_work_busy = false;
|
||||
struct stack_map_irq_work *work = NULL;
|
||||
|
||||
if (irqs_disabled()) {
|
||||
if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
|
||||
work = this_cpu_ptr(&up_read_work);
|
||||
if (irq_work_is_busy(&work->irq_work)) {
|
||||
/* cannot queue more up_read, fallback */
|
||||
irq_work_busy = true;
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
* PREEMPT_RT does not allow to trylock mmap sem in
|
||||
* interrupt disabled context. Force the fallback code.
|
||||
*/
|
||||
irq_work_busy = true;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* We cannot do up_read() when the irq is disabled, because of
|
||||
* risk to deadlock with rq_lock. To do build_id lookup when the
|
||||
* irqs are disabled, we need to run up_read() in irq_work. We use
|
||||
* a percpu variable to do the irq_work. If the irq_work is
|
||||
* already used by another lookup, we fall back to report ips.
|
||||
*
|
||||
* Same fallback is used for kernel stack (!user) on a stackmap
|
||||
* with build_id.
|
||||
/* If the irq_work is in use, fall back to report ips. Same
|
||||
* fallback is used for kernel stack (!user) on a stackmap with
|
||||
* build_id.
|
||||
*/
|
||||
if (!user || !current || !current->mm || irq_work_busy ||
|
||||
!mmap_read_trylock(current->mm)) {
|
||||
@ -203,19 +162,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
- vma->vm_start;
id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
}

if (!work) {
mmap_read_unlock(current->mm);
} else {
work->mm = current->mm;

/* The lock will be released once we're out of interrupt
* context. Tell lockdep that we've released it now so
* it doesn't complain that we forgot to release it.
*/
rwsem_release(&current->mm->mmap_lock.dep_map, _RET_IP_);
irq_work_queue(&work->irq_work);
}
bpf_mmap_unlock_mm(work, current->mm);
}

static struct perf_callchain_entry *
@ -543,7 +490,7 @@ const struct bpf_func_proto bpf_get_task_stack_proto = {
|
||||
.gpl_only = false,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_BTF_ID,
|
||||
.arg1_btf_id = &btf_task_struct_ids[0],
|
||||
.arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
|
||||
.arg2_type = ARG_PTR_TO_UNINIT_MEM,
|
||||
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
|
||||
.arg4_type = ARG_ANYTHING,
|
||||
@ -720,16 +667,3 @@ const struct bpf_map_ops stack_trace_map_ops = {
|
||||
.map_btf_name = "bpf_stack_map",
|
||||
.map_btf_id = &stack_trace_map_btf_id,
|
||||
};
|
||||
|
||||
static int __init stack_map_init(void)
|
||||
{
|
||||
int cpu;
|
||||
struct stack_map_irq_work *work;
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
work = per_cpu_ptr(&up_read_work, cpu);
|
||||
init_irq_work(&work->irq_work, do_up_read);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
subsys_initcall(stack_map_init);
|
||||
|
@ -2,6 +2,7 @@
|
||||
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
|
||||
*/
|
||||
#include <linux/bpf.h>
|
||||
#include <linux/bpf-cgroup.h>
|
||||
#include <linux/bpf_trace.h>
|
||||
#include <linux/bpf_lirc.h>
|
||||
#include <linux/bpf_verifier.h>
|
||||
@ -214,7 +215,8 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
|
||||
err = bpf_fd_reuseport_array_update_elem(map, key, value,
|
||||
flags);
|
||||
} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
|
||||
map->map_type == BPF_MAP_TYPE_STACK) {
|
||||
map->map_type == BPF_MAP_TYPE_STACK ||
|
||||
map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
|
||||
err = map->ops->map_push_elem(map, value, flags);
|
||||
} else {
|
||||
rcu_read_lock();
|
||||
@ -253,7 +255,8 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
|
||||
} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
|
||||
err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
|
||||
} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
|
||||
map->map_type == BPF_MAP_TYPE_STACK) {
|
||||
map->map_type == BPF_MAP_TYPE_STACK ||
|
||||
map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
|
||||
err = map->ops->map_peek_elem(map, value);
|
||||
} else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
|
||||
/* struct_ops map requires directly updating "value" */
|
||||
@ -363,6 +366,7 @@ void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
|
||||
map->max_entries = attr->max_entries;
|
||||
map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags);
|
||||
map->numa_node = bpf_map_attr_numa_node(attr);
|
||||
map->map_extra = attr->map_extra;
|
||||
}
|
||||
|
||||
static int bpf_map_alloc_id(struct bpf_map *map)
|
||||
@ -570,6 +574,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
|
||||
"value_size:\t%u\n"
|
||||
"max_entries:\t%u\n"
|
||||
"map_flags:\t%#x\n"
|
||||
"map_extra:\t%#llx\n"
|
||||
"memlock:\t%lu\n"
|
||||
"map_id:\t%u\n"
|
||||
"frozen:\t%u\n",
|
||||
@ -578,6 +583,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
|
||||
map->value_size,
|
||||
map->max_entries,
|
||||
map->map_flags,
|
||||
(unsigned long long)map->map_extra,
|
||||
bpf_map_memory_footprint(map),
|
||||
map->id,
|
||||
READ_ONCE(map->frozen));
|
||||
@ -821,7 +827,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
|
||||
return ret;
|
||||
}
|
||||
|
||||
#define BPF_MAP_CREATE_LAST_FIELD btf_vmlinux_value_type_id
|
||||
#define BPF_MAP_CREATE_LAST_FIELD map_extra
|
||||
/* called via syscall */
|
||||
static int map_create(union bpf_attr *attr)
|
||||
{
|
||||
@ -842,6 +848,10 @@ static int map_create(union bpf_attr *attr)
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER &&
|
||||
attr->map_extra != 0)
|
||||
return -EINVAL;
|
||||
|
||||
f_flags = bpf_get_file_flag(attr->map_flags);
|
||||
if (f_flags < 0)
|
||||
return f_flags;
|
||||
@ -1091,6 +1101,14 @@ static int map_lookup_elem(union bpf_attr *attr)
|
||||
if (!value)
|
||||
goto free_key;
|
||||
|
||||
if (map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
|
||||
if (copy_from_user(value, uvalue, value_size))
|
||||
err = -EFAULT;
|
||||
else
|
||||
err = bpf_map_copy_value(map, key, value, attr->flags);
|
||||
goto free_value;
|
||||
}
|
||||
|
||||
err = bpf_map_copy_value(map, key, value, attr->flags);
|
||||
if (err)
|
||||
goto free_value;
|
||||
@ -1874,7 +1892,8 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
|
||||
"prog_id:\t%u\n"
|
||||
"run_time_ns:\t%llu\n"
|
||||
"run_cnt:\t%llu\n"
|
||||
"recursion_misses:\t%llu\n",
|
||||
"recursion_misses:\t%llu\n"
|
||||
"verified_insns:\t%u\n",
|
||||
prog->type,
|
||||
prog->jited,
|
||||
prog_tag,
|
||||
@ -1882,7 +1901,8 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
|
||||
prog->aux->id,
|
||||
stats.nsecs,
|
||||
stats.cnt,
|
||||
stats.misses);
|
||||
stats.misses,
|
||||
prog->aux->verified_insns);
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -2182,7 +2202,7 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
|
||||
}
|
||||
|
||||
/* last field in 'union bpf_attr' used by this command */
|
||||
#define BPF_PROG_LOAD_LAST_FIELD fd_array
|
||||
#define BPF_PROG_LOAD_LAST_FIELD core_relo_rec_size
|
||||
|
||||
static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
|
||||
{
|
||||
@ -3651,6 +3671,8 @@ static int bpf_prog_get_info_by_fd(struct file *file,
|
||||
info.run_cnt = stats.cnt;
|
||||
info.recursion_misses = stats.misses;
|
||||
|
||||
info.verified_insns = prog->aux->verified_insns;
|
||||
|
||||
if (!bpf_capable()) {
|
||||
info.jited_prog_len = 0;
|
||||
info.xlated_prog_len = 0;
|
||||
@ -3897,6 +3919,7 @@ static int bpf_map_get_info_by_fd(struct file *file,
|
||||
info.value_size = map->value_size;
|
||||
info.max_entries = map->max_entries;
|
||||
info.map_flags = map->map_flags;
|
||||
info.map_extra = map->map_extra;
|
||||
memcpy(info.name, map->name, sizeof(map->name));
|
||||
|
||||
if (map->btf) {
|
||||
@ -4753,7 +4776,7 @@ static const struct bpf_func_proto bpf_sys_bpf_proto = {
|
||||
.gpl_only = false,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_ANYTHING,
|
||||
.arg2_type = ARG_PTR_TO_MEM,
|
||||
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
|
||||
.arg3_type = ARG_CONST_SIZE,
|
||||
};
|
||||
|
||||
@ -4780,6 +4803,31 @@ static const struct bpf_func_proto bpf_sys_close_proto = {
|
||||
.arg1_type = ARG_ANYTHING,
|
||||
};
|
||||
|
||||
BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flags, u64 *, res)
|
||||
{
|
||||
if (flags)
|
||||
return -EINVAL;
|
||||
|
||||
if (name_sz <= 1 || name[name_sz - 1])
|
||||
return -EINVAL;
|
||||
|
||||
if (!bpf_dump_raw_ok(current_cred()))
|
||||
return -EPERM;
|
||||
|
||||
*res = kallsyms_lookup_name(name);
|
||||
return *res ? 0 : -ENOENT;
|
||||
}
|
||||
|
||||
const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = {
|
||||
.func = bpf_kallsyms_lookup_name,
|
||||
.gpl_only = false,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_MEM,
|
||||
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
|
||||
.arg3_type = ARG_ANYTHING,
|
||||
.arg4_type = ARG_PTR_TO_LONG,
|
||||
};
|
||||
|
||||
static const struct bpf_func_proto *
|
||||
syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
|
||||
{
|
||||
@ -4790,6 +4838,8 @@ syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
|
||||
return &bpf_btf_find_by_name_kind_proto;
|
||||
case BPF_FUNC_sys_close:
|
||||
return &bpf_sys_close_proto;
|
||||
case BPF_FUNC_kallsyms_lookup_name:
|
||||
return &bpf_kallsyms_lookup_name_proto;
|
||||
default:
|
||||
return tracing_prog_func_proto(func_id, prog);
|
||||
}
|
||||
|
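The bpf_kallsyms_lookup_name() helper added above is wired into syscall programs via syscall_prog_func_proto(). A rough program-side sketch of a call, assuming libbpf's generated helper declarations are in scope; the symbol name is only an example and non-zero flags are rejected:

static long example_lookup_symbol(__u64 *addr)
{
	const char name[] = "bpf_prog_put";	/* illustrative symbol */

	/* name_sz must count the trailing NUL byte. */
	return bpf_kallsyms_lookup_name(name, sizeof(name), 0, addr);
}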
@ -8,6 +8,7 @@
|
||||
#include <linux/fdtable.h>
|
||||
#include <linux/filter.h>
|
||||
#include <linux/btf_ids.h>
|
||||
#include "mmap_unlock_work.h"
|
||||
|
||||
struct bpf_iter_seq_task_common {
|
||||
struct pid_namespace *ns;
|
||||
@ -524,10 +525,6 @@ static const struct seq_operations task_vma_seq_ops = {
|
||||
.show = task_vma_seq_show,
|
||||
};
|
||||
|
||||
BTF_ID_LIST(btf_task_file_ids)
|
||||
BTF_ID(struct, file)
|
||||
BTF_ID(struct, vm_area_struct)
|
||||
|
||||
static const struct bpf_iter_seq_info task_seq_info = {
|
||||
.seq_ops = &task_seq_ops,
|
||||
.init_seq_private = init_seq_pidns,
|
||||
@ -586,23 +583,88 @@ static struct bpf_iter_reg task_vma_reg_info = {
|
||||
.seq_info = &task_vma_seq_info,
|
||||
};
|
||||
|
||||
BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start,
|
||||
bpf_callback_t, callback_fn, void *, callback_ctx, u64, flags)
|
||||
{
|
||||
struct mmap_unlock_irq_work *work = NULL;
|
||||
struct vm_area_struct *vma;
|
||||
bool irq_work_busy = false;
|
||||
struct mm_struct *mm;
|
||||
int ret = -ENOENT;
|
||||
|
||||
if (flags)
|
||||
return -EINVAL;
|
||||
|
||||
if (!task)
|
||||
return -ENOENT;
|
||||
|
||||
mm = task->mm;
|
||||
if (!mm)
|
||||
return -ENOENT;
|
||||
|
||||
irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);
|
||||
|
||||
if (irq_work_busy || !mmap_read_trylock(mm))
|
||||
return -EBUSY;
|
||||
|
||||
vma = find_vma(mm, start);
|
||||
|
||||
if (vma && vma->vm_start <= start && vma->vm_end > start) {
|
||||
callback_fn((u64)(long)task, (u64)(long)vma,
|
||||
(u64)(long)callback_ctx, 0, 0);
|
||||
ret = 0;
|
||||
}
|
||||
bpf_mmap_unlock_mm(work, mm);
|
||||
return ret;
|
||||
}
|
||||
|
||||
const struct bpf_func_proto bpf_find_vma_proto = {
|
||||
.func = bpf_find_vma,
|
||||
.ret_type = RET_INTEGER,
|
||||
.arg1_type = ARG_PTR_TO_BTF_ID,
|
||||
.arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
|
||||
.arg2_type = ARG_ANYTHING,
|
||||
.arg3_type = ARG_PTR_TO_FUNC,
|
||||
.arg4_type = ARG_PTR_TO_STACK_OR_NULL,
|
||||
.arg5_type = ARG_ANYTHING,
|
||||
};
|
||||
|
||||
DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
|
||||
|
||||
static void do_mmap_read_unlock(struct irq_work *entry)
|
||||
{
|
||||
struct mmap_unlock_irq_work *work;
|
||||
|
||||
if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
|
||||
return;
|
||||
|
||||
work = container_of(entry, struct mmap_unlock_irq_work, irq_work);
|
||||
mmap_read_unlock_non_owner(work->mm);
|
||||
}
|
||||
|
||||
static int __init task_iter_init(void)
|
||||
{
|
||||
int ret;
|
||||
struct mmap_unlock_irq_work *work;
|
||||
int ret, cpu;
|
||||
|
||||
task_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0];
|
||||
for_each_possible_cpu(cpu) {
|
||||
work = per_cpu_ptr(&mmap_unlock_work, cpu);
|
||||
init_irq_work(&work->irq_work, do_mmap_read_unlock);
|
||||
}
|
||||
|
||||
task_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
|
||||
ret = bpf_iter_reg_target(&task_reg_info);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0];
|
||||
task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[0];
|
||||
task_file_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
|
||||
task_file_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_FILE];
|
||||
ret = bpf_iter_reg_target(&task_file_reg_info);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
task_vma_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0];
|
||||
task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1];
|
||||
task_vma_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
|
||||
task_vma_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA];
|
||||
return bpf_iter_reg_target(&task_vma_reg_info);
|
||||
}
|
||||
late_initcall(task_iter_init);
|
||||
|
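bpf_find_vma(), added above, invokes its callback with (task, vma, callback_ctx) and the verifier requires a zero return. A rough program-side sketch of a matching callback, assuming vmlinux.h-style type definitions; all names are illustrative:

struct example_vma_info {
	__u64 vm_start;
	__u64 vm_end;
};

static long example_vma_callback(struct task_struct *task,
				 struct vm_area_struct *vma, void *data)
{
	struct example_vma_info *info = data;

	info->vm_start = vma->vm_start;
	info->vm_end = vma->vm_end;
	return 0;
}

/* Typical call from a tracing program:
 *	bpf_find_vma(task, addr, example_vma_callback, &info, 0);
 */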
@ -10,6 +10,7 @@
|
||||
#include <linux/rcupdate_trace.h>
|
||||
#include <linux/rcupdate_wait.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/static_call.h>
|
||||
|
||||
/* dummy _ops. The verifier will operate on target program's ops. */
|
||||
const struct bpf_verifier_ops bpf_extension_verifier_ops = {
|
||||
@ -26,6 +27,14 @@ static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
|
||||
/* serializes access to trampoline_table */
|
||||
static DEFINE_MUTEX(trampoline_mutex);
|
||||
|
||||
bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
|
||||
{
|
||||
enum bpf_attach_type eatype = prog->expected_attach_type;
|
||||
|
||||
return eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT ||
|
||||
eatype == BPF_MODIFY_RETURN;
|
||||
}
|
||||
|
||||
void *bpf_jit_alloc_exec_page(void)
|
||||
{
|
||||
void *image;
|
||||
@ -526,7 +535,7 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
|
||||
}
|
||||
|
||||
#define NO_START_TIME 1
|
||||
static u64 notrace bpf_prog_start_time(void)
|
||||
static __always_inline u64 notrace bpf_prog_start_time(void)
|
||||
{
|
||||
u64 start = NO_START_TIME;
|
||||
|
||||
|
File diff suppressed because it is too large
@ -63,9 +63,6 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
|
||||
for_each_root(root) {
|
||||
struct cgroup *from_cgrp;
|
||||
|
||||
if (root == &cgrp_dfl_root)
|
||||
continue;
|
||||
|
||||
spin_lock_irq(&css_set_lock);
|
||||
from_cgrp = task_cgroup_from_root(from, root);
|
||||
spin_unlock_irq(&css_set_lock);
|
||||
@ -675,11 +672,9 @@ int proc_cgroupstats_show(struct seq_file *m, void *v)
|
||||
|
||||
seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
|
||||
/*
|
||||
* ideally we don't want subsystems moving around while we do this.
|
||||
* cgroup_mutex is also necessary to guarantee an atomic snapshot of
|
||||
* subsys/hierarchy state.
|
||||
* Grab the subsystems state racily. No need to add avenue to
|
||||
* cgroup_mutex contention.
|
||||
*/
|
||||
mutex_lock(&cgroup_mutex);
|
||||
|
||||
for_each_subsys(ss, i)
|
||||
seq_printf(m, "%s\t%d\t%d\t%d\n",
|
||||
@ -687,7 +682,6 @@ int proc_cgroupstats_show(struct seq_file *m, void *v)
|
||||
atomic_read(&ss->root->nr_cgrps),
|
||||
cgroup_ssid_enabled(i));
|
||||
|
||||
mutex_unlock(&cgroup_mutex);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -714,8 +708,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
|
||||
kernfs_type(kn) != KERNFS_DIR)
|
||||
return -EINVAL;
|
||||
|
||||
mutex_lock(&cgroup_mutex);
|
||||
|
||||
/*
|
||||
* We aren't being called from kernfs and there's no guarantee on
|
||||
* @kn->priv's validity. For this and css_tryget_online_from_dir(),
|
||||
@ -723,9 +715,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
|
||||
*/
|
||||
rcu_read_lock();
|
||||
cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
|
||||
if (!cgrp || cgroup_is_dead(cgrp)) {
|
||||
if (!cgrp || !cgroup_tryget(cgrp)) {
|
||||
rcu_read_unlock();
|
||||
mutex_unlock(&cgroup_mutex);
|
||||
return -ENOENT;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
@ -753,7 +744,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
|
||||
}
|
||||
css_task_iter_end(&it);
|
||||
|
||||
mutex_unlock(&cgroup_mutex);
|
||||
cgroup_put(cgrp);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -30,6 +30,7 @@
|
||||
|
||||
#include "cgroup-internal.h"
|
||||
|
||||
#include <linux/bpf-cgroup.h>
|
||||
#include <linux/cred.h>
|
||||
#include <linux/errno.h>
|
||||
#include <linux/init_task.h>
|
||||
@ -2650,11 +2651,11 @@ void cgroup_migrate_add_src(struct css_set *src_cset,
|
||||
if (src_cset->dead)
|
||||
return;
|
||||
|
||||
src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
|
||||
|
||||
if (!list_empty(&src_cset->mg_preload_node))
|
||||
return;
|
||||
|
||||
src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
|
||||
|
||||
WARN_ON(src_cset->mg_src_cgrp);
|
||||
WARN_ON(src_cset->mg_dst_cgrp);
|
||||
WARN_ON(!list_empty(&src_cset->mg_tasks));
|
||||
@ -5747,7 +5748,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
|
||||
|
||||
/* Create the root cgroup state for this subsystem */
|
||||
ss->root = &cgrp_dfl_root;
|
||||
css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
|
||||
css = ss->css_alloc(NULL);
|
||||
/* We don't handle early failures gracefully */
|
||||
BUG_ON(IS_ERR(css));
|
||||
init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
|
||||
@ -5979,17 +5980,20 @@ struct cgroup *cgroup_get_from_id(u64 id)
|
||||
struct kernfs_node *kn;
|
||||
struct cgroup *cgrp = NULL;
|
||||
|
||||
mutex_lock(&cgroup_mutex);
|
||||
kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
|
||||
if (!kn)
|
||||
goto out_unlock;
|
||||
goto out;
|
||||
|
||||
cgrp = kn->priv;
|
||||
if (cgroup_is_dead(cgrp) || !cgroup_tryget(cgrp))
|
||||
rcu_read_lock();
|
||||
|
||||
cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
|
||||
if (cgrp && !cgroup_tryget(cgrp))
|
||||
cgrp = NULL;
|
||||
|
||||
rcu_read_unlock();
|
||||
|
||||
kernfs_put(kn);
|
||||
out_unlock:
|
||||
mutex_unlock(&cgroup_mutex);
|
||||
out:
|
||||
return cgrp;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(cgroup_get_from_id);
|
||||
@ -6171,6 +6175,20 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
/*
|
||||
* Spawning a task directly into a cgroup works by passing a file
|
||||
* descriptor to the target cgroup directory. This can even be an O_PATH
|
||||
* file descriptor. But it can never be a cgroup.procs file descriptor.
|
||||
* This was done on purpose so spawning into a cgroup could be
|
||||
* conceptualized as an atomic
|
||||
*
|
||||
* fd = openat(dfd_cgroup, "cgroup.procs", ...);
|
||||
* write(fd, <child-pid>, ...);
|
||||
*
|
||||
* sequence, i.e. it's a shorthand for the caller opening and writing
|
||||
* cgroup.procs of the cgroup indicated by @dfd_cgroup. This allows us
|
||||
* to always use the caller's credentials.
|
||||
*/
|
||||
ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
|
||||
!(kargs->flags & CLONE_THREAD),
|
||||
current->nsproxy->cgroup_ns);
|
||||
@ -6572,30 +6590,34 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
|
||||
*
|
||||
* Find the cgroup at @path on the default hierarchy, increment its
|
||||
* reference count and return it. Returns pointer to the found cgroup on
|
||||
* success, ERR_PTR(-ENOENT) if @path doesn't exist and ERR_PTR(-ENOTDIR)
|
||||
* if @path points to a non-directory.
|
||||
* success, ERR_PTR(-ENOENT) if @path doesn't exist or if the cgroup has already
|
||||
* been released and ERR_PTR(-ENOTDIR) if @path points to a non-directory.
|
||||
*/
|
||||
struct cgroup *cgroup_get_from_path(const char *path)
|
||||
{
|
||||
struct kernfs_node *kn;
|
||||
struct cgroup *cgrp;
|
||||
|
||||
mutex_lock(&cgroup_mutex);
|
||||
struct cgroup *cgrp = ERR_PTR(-ENOENT);
|
||||
|
||||
kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
|
||||
if (kn) {
|
||||
if (kernfs_type(kn) == KERNFS_DIR) {
|
||||
cgrp = kn->priv;
|
||||
cgroup_get_live(cgrp);
|
||||
} else {
|
||||
if (!kn)
|
||||
goto out;
|
||||
|
||||
if (kernfs_type(kn) != KERNFS_DIR) {
|
||||
cgrp = ERR_PTR(-ENOTDIR);
|
||||
}
|
||||
kernfs_put(kn);
|
||||
} else {
|
||||
cgrp = ERR_PTR(-ENOENT);
|
||||
goto out_kernfs;
|
||||
}
|
||||
|
||||
mutex_unlock(&cgroup_mutex);
|
||||
rcu_read_lock();
|
||||
|
||||
cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
|
||||
if (!cgrp || !cgroup_tryget(cgrp))
|
||||
cgrp = ERR_PTR(-ENOENT);
|
||||
|
||||
rcu_read_unlock();
|
||||
|
||||
out_kernfs:
|
||||
kernfs_put(kn);
|
||||
out:
|
||||
return cgrp;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(cgroup_get_from_path);
|
||||
@ -6723,44 +6745,6 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
|
||||
|
||||
#endif /* CONFIG_SOCK_CGROUP_DATA */
|
||||
|
||||
#ifdef CONFIG_CGROUP_BPF
|
||||
int cgroup_bpf_attach(struct cgroup *cgrp,
|
||||
struct bpf_prog *prog, struct bpf_prog *replace_prog,
|
||||
struct bpf_cgroup_link *link,
|
||||
enum bpf_attach_type type,
|
||||
u32 flags)
|
||||
{
|
||||
int ret;
|
||||
|
||||
mutex_lock(&cgroup_mutex);
|
||||
ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
|
||||
mutex_unlock(&cgroup_mutex);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
|
||||
enum bpf_attach_type type)
|
||||
{
|
||||
int ret;
|
||||
|
||||
mutex_lock(&cgroup_mutex);
|
||||
ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
|
||||
mutex_unlock(&cgroup_mutex);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
|
||||
union bpf_attr __user *uattr)
|
||||
{
|
||||
int ret;
|
||||
|
||||
mutex_lock(&cgroup_mutex);
|
||||
ret = __cgroup_bpf_query(cgrp, attr, uattr);
|
||||
mutex_unlock(&cgroup_mutex);
|
||||
return ret;
|
||||
}
|
||||
#endif /* CONFIG_CGROUP_BPF */
|
||||
|
||||
#ifdef CONFIG_SYSFS
|
||||
static ssize_t show_delegatable_files(struct cftype *files, char *buf,
|
||||
ssize_t size, const char *prefix)
|
||||
|
@ -69,6 +69,13 @@
|
||||
DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
|
||||
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
|
||||
|
||||
/*
|
||||
* There could be abnormal cpuset configurations for cpu or memory
|
||||
* node binding, add this key to provide a quick low-cost judgement
|
||||
* of the situation.
|
||||
*/
|
||||
DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);
|
||||
|
||||
/* See "Frequency meter" comments, below. */
|
||||
|
||||
struct fmeter {
|
||||
@ -372,6 +379,17 @@ static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
|
||||
|
||||
static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
|
||||
|
||||
static inline void check_insane_mems_config(nodemask_t *nodes)
|
||||
{
|
||||
if (!cpusets_insane_config() &&
|
||||
movable_only_nodes(nodes)) {
|
||||
static_branch_enable(&cpusets_insane_config_key);
|
||||
pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n"
|
||||
"Cpuset allocations might fail even with a lot of memory available.\n",
|
||||
nodemask_pr_args(nodes));
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Cgroup v2 behavior is used on the "cpus" and "mems" control files when
|
||||
* on default hierarchy or when the cpuset_v2_mode flag is set by mounting
|
||||
@ -572,6 +590,35 @@ static inline void free_cpuset(struct cpuset *cs)
|
||||
kfree(cs);
|
||||
}
|
||||
|
||||
/*
|
||||
* validate_change_legacy() - Validate conditions specific to legacy (v1)
|
||||
* behavior.
|
||||
*/
|
||||
static int validate_change_legacy(struct cpuset *cur, struct cpuset *trial)
|
||||
{
|
||||
struct cgroup_subsys_state *css;
|
||||
struct cpuset *c, *par;
|
||||
int ret;
|
||||
|
||||
WARN_ON_ONCE(!rcu_read_lock_held());
|
||||
|
||||
/* Each of our child cpusets must be a subset of us */
|
||||
ret = -EBUSY;
|
||||
cpuset_for_each_child(c, css, cur)
|
||||
if (!is_cpuset_subset(c, trial))
|
||||
goto out;
|
||||
|
||||
/* On legacy hierarchy, we must be a subset of our parent cpuset. */
|
||||
ret = -EACCES;
|
||||
par = parent_cs(cur);
|
||||
if (par && !is_cpuset_subset(trial, par))
|
||||
goto out;
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* validate_change() - Used to validate that any proposed cpuset change
|
||||
* follows the structural rules for cpusets.
|
||||
@ -596,28 +643,21 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
|
||||
{
|
||||
struct cgroup_subsys_state *css;
|
||||
struct cpuset *c, *par;
|
||||
int ret;
|
||||
int ret = 0;
|
||||
|
||||
rcu_read_lock();
|
||||
|
||||
/* Each of our child cpusets must be a subset of us */
|
||||
ret = -EBUSY;
|
||||
cpuset_for_each_child(c, css, cur)
|
||||
if (!is_cpuset_subset(c, trial))
|
||||
if (!is_in_v2_mode())
|
||||
ret = validate_change_legacy(cur, trial);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
/* Remaining checks don't apply to root cpuset */
|
||||
ret = 0;
|
||||
if (cur == &top_cpuset)
|
||||
goto out;
|
||||
|
||||
par = parent_cs(cur);
|
||||
|
||||
/* On legacy hierarchy, we must be a subset of our parent cpuset. */
|
||||
ret = -EACCES;
|
||||
if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* If either I or some sibling (!= me) is exclusive, we can't
|
||||
* overlap
|
||||
@ -1165,9 +1205,7 @@ enum subparts_cmd {
|
||||
*
|
||||
* Because of the implicit cpu exclusive nature of a partition root,
|
||||
* cpumask changes that violates the cpu exclusivity rule will not be
|
||||
* permitted when checked by validate_change(). The validate_change()
|
||||
* function will also prevent any changes to the cpu list if it is not
|
||||
* a superset of children's cpu lists.
|
||||
* permitted when checked by validate_change().
|
||||
*/
|
||||
static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
|
||||
struct cpumask *newmask,
|
||||
@ -1879,6 +1917,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
|
||||
if (retval < 0)
|
||||
goto done;
|
||||
|
||||
check_insane_mems_config(&trialcs->mems_allowed);
|
||||
|
||||
spin_lock_irq(&callback_lock);
|
||||
cs->mems_allowed = trialcs->mems_allowed;
|
||||
spin_unlock_irq(&callback_lock);
|
||||
@ -3184,6 +3224,9 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
|
||||
cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
|
||||
mems_updated = !nodes_equal(new_mems, cs->effective_mems);
|
||||
|
||||
if (mems_updated)
|
||||
check_insane_mems_config(&new_mems);
|
||||
|
||||
if (is_in_v2_mode())
|
||||
hotplug_update_tasks(cs, &new_cpus, &new_mems,
|
||||
cpus_updated, mems_updated);
|
||||
@ -3481,8 +3524,8 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
|
||||
return cs;
|
||||
}
|
||||
|
||||
/**
|
||||
* cpuset_node_allowed - Can we allocate on a memory node?
|
||||
/*
|
||||
* __cpuset_node_allowed - Can we allocate on a memory node?
|
||||
* @node: is this an allowed node?
|
||||
* @gfp_mask: memory allocation flags
|
||||
*
|
||||
@ -3524,7 +3567,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
|
||||
bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
|
||||
{
|
||||
struct cpuset *cs; /* current cpuset ancestors */
|
||||
int allowed; /* is allocation in zone z allowed? */
|
||||
bool allowed; /* is allocation in zone z allowed? */
|
||||
unsigned long flags;
|
||||
|
||||
if (in_interrupt())
|
||||
@ -3653,8 +3696,8 @@ void cpuset_print_current_mems_allowed(void)
|
||||
|
||||
int cpuset_memory_pressure_enabled __read_mostly;
|
||||
|
||||
/**
|
||||
* cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
|
||||
/*
|
||||
* __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
|
||||
*
|
||||
* Keep a running average of the rate of synchronous (direct)
|
||||
* page reclaim efforts initiated by tasks in each cpuset.
|
||||
@ -3669,7 +3712,7 @@ int cpuset_memory_pressure_enabled __read_mostly;
|
||||
* "memory_pressure". Value displayed is an integer
|
||||
* representing the recent rate of entry into the synchronous
|
||||
* (direct) page reclaim by any task attached to the cpuset.
|
||||
**/
|
||||
*/
|
||||
|
||||
void __cpuset_memory_pressure_bump(void)
|
||||
{
|
||||
|
@ -157,13 +157,6 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
|
||||
new_usage = atomic_long_add_return(amount, &res->usage);
|
||||
if (new_usage > READ_ONCE(res->max) ||
|
||||
new_usage > READ_ONCE(misc_res_capacity[type])) {
|
||||
if (!res->failed) {
|
||||
pr_info("cgroup: charge rejected by the misc controller for %s resource in ",
|
||||
misc_res_name[type]);
|
||||
pr_cont_cgroup_path(i->css.cgroup);
|
||||
pr_cont("\n");
|
||||
res->failed = true;
|
||||
}
|
||||
ret = -EBUSY;
|
||||
goto err_charge;
|
||||
}
|
||||
@ -171,6 +164,11 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
|
||||
return 0;
|
||||
|
||||
err_charge:
|
||||
for (j = i; j; j = parent_misc(j)) {
|
||||
atomic_long_inc(&j->res[type].events);
|
||||
cgroup_file_notify(&j->events_file);
|
||||
}
|
||||
|
||||
for (j = cg; j != i; j = parent_misc(j))
|
||||
misc_cg_cancel_charge(type, j, amount);
|
||||
misc_cg_cancel_charge(type, i, amount);
|
||||
@ -335,6 +333,19 @@ static int misc_cg_capacity_show(struct seq_file *sf, void *v)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int misc_events_show(struct seq_file *sf, void *v)
|
||||
{
|
||||
struct misc_cg *cg = css_misc(seq_css(sf));
|
||||
unsigned long events, i;
|
||||
|
||||
for (i = 0; i < MISC_CG_RES_TYPES; i++) {
|
||||
events = atomic_long_read(&cg->res[i].events);
|
||||
if (READ_ONCE(misc_res_capacity[i]) || events)
|
||||
seq_printf(sf, "%s.max %lu\n", misc_res_name[i], events);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Misc cgroup interface files */
|
||||
static struct cftype misc_cg_files[] = {
|
||||
{
|
||||
@ -353,6 +364,12 @@ static struct cftype misc_cg_files[] = {
|
||||
.seq_show = misc_cg_capacity_show,
|
||||
.flags = CFTYPE_ONLY_ON_ROOT,
|
||||
},
|
||||
{
|
||||
.name = "events",
|
||||
.flags = CFTYPE_NOT_ON_ROOT,
|
||||
.file_offset = offsetof(struct misc_cg, events_file),
|
||||
.seq_show = misc_events_show,
|
||||
},
|
||||
{}
|
||||
};
|
||||
|
||||
|
@ -35,7 +35,7 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
|
||||
* instead of NULL, we can tell whether @cgrp is on the list by
|
||||
* testing the next pointer for NULL.
|
||||
*/
|
||||
if (cgroup_rstat_cpu(cgrp, cpu)->updated_next)
|
||||
if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
|
||||
return;
|
||||
|
||||
raw_spin_lock_irqsave(cpu_lock, flags);
|
||||
@ -88,6 +88,7 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
|
||||
struct cgroup *root, int cpu)
|
||||
{
|
||||
struct cgroup_rstat_cpu *rstatc;
|
||||
struct cgroup *parent;
|
||||
|
||||
if (pos == root)
|
||||
return NULL;
|
||||
@ -96,10 +97,14 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
|
||||
* We're gonna walk down to the first leaf and visit/remove it. We
|
||||
* can pick whatever unvisited node as the starting point.
|
||||
*/
|
||||
if (!pos)
|
||||
if (!pos) {
|
||||
pos = root;
|
||||
else
|
||||
/* return NULL if this subtree is not on-list */
|
||||
if (!cgroup_rstat_cpu(pos, cpu)->updated_next)
|
||||
return NULL;
|
||||
} else {
|
||||
pos = cgroup_parent(pos);
|
||||
}
|
||||
|
||||
/* walk down to the first leaf */
|
||||
while (true) {
|
||||
@ -115,21 +120,17 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
|
||||
* However, due to the way we traverse, @pos will be the first
|
||||
* child in most cases. The only exception is @root.
|
||||
*/
|
||||
if (rstatc->updated_next) {
|
||||
struct cgroup *parent = cgroup_parent(pos);
|
||||
|
||||
parent = cgroup_parent(pos);
|
||||
if (parent) {
|
||||
struct cgroup_rstat_cpu *prstatc;
|
||||
struct cgroup **nextp;
|
||||
|
||||
prstatc = cgroup_rstat_cpu(parent, cpu);
|
||||
nextp = &prstatc->updated_children;
|
||||
while (true) {
|
||||
while (*nextp != pos) {
|
||||
struct cgroup_rstat_cpu *nrstatc;
|
||||
|
||||
nrstatc = cgroup_rstat_cpu(*nextp, cpu);
|
||||
if (*nextp == pos)
|
||||
break;
|
||||
WARN_ON_ONCE(*nextp == parent);
|
||||
nextp = &nrstatc->updated_next;
|
||||
}
|
||||
@ -140,10 +141,6 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
|
||||
return pos;
|
||||
}
|
||||
|
||||
/* only happens for @root */
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/* see cgroup_rstat_flush() */
|
||||
static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
|
||||
__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)
|
||||
|
@ -17,6 +17,7 @@ CONFIG_SYMBOLIC_ERRNAME=y
|
||||
# Compile-time checks and compiler options
|
||||
#
|
||||
CONFIG_DEBUG_INFO=y
|
||||
CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
|
||||
CONFIG_DEBUG_SECTION_MISMATCH=y
|
||||
CONFIG_FRAME_WARN=2048
|
||||
CONFIG_SECTION_MISMATCH_WARN_ONLY=y
|
||||
|
@ -100,19 +100,10 @@ void __delayacct_blkio_start(void)
|
||||
*/
|
||||
void __delayacct_blkio_end(struct task_struct *p)
|
||||
{
|
||||
struct task_delay_info *delays = p->delays;
|
||||
u64 *total;
|
||||
u32 *count;
|
||||
|
||||
if (p->delays->flags & DELAYACCT_PF_SWAPIN) {
|
||||
total = &delays->swapin_delay;
|
||||
count = &delays->swapin_count;
|
||||
} else {
|
||||
total = &delays->blkio_delay;
|
||||
count = &delays->blkio_count;
|
||||
}
|
||||
|
||||
delayacct_end(&delays->lock, &delays->blkio_start, total, count);
|
||||
delayacct_end(&p->delays->lock,
|
||||
&p->delays->blkio_start,
|
||||
&p->delays->blkio_delay,
|
||||
&p->delays->blkio_count);
|
||||
}
|
||||
|
||||
int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
|
||||
@ -164,10 +155,13 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
|
||||
d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp;
|
||||
tmp = d->thrashing_delay_total + tsk->delays->thrashing_delay;
|
||||
d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp;
|
||||
tmp = d->compact_delay_total + tsk->delays->compact_delay;
|
||||
d->compact_delay_total = (tmp < d->compact_delay_total) ? 0 : tmp;
|
||||
d->blkio_count += tsk->delays->blkio_count;
|
||||
d->swapin_count += tsk->delays->swapin_count;
|
||||
d->freepages_count += tsk->delays->freepages_count;
|
||||
d->thrashing_count += tsk->delays->thrashing_count;
|
||||
d->compact_count += tsk->delays->compact_count;
|
||||
raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
|
||||
|
||||
return 0;
|
||||
@ -179,8 +173,7 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk)
|
||||
unsigned long flags;
|
||||
|
||||
raw_spin_lock_irqsave(&tsk->delays->lock, flags);
|
||||
ret = nsec_to_clock_t(tsk->delays->blkio_delay +
|
||||
tsk->delays->swapin_delay);
|
||||
ret = nsec_to_clock_t(tsk->delays->blkio_delay);
|
||||
raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
|
||||
return ret;
|
||||
}
|
||||
@ -210,3 +203,29 @@ void __delayacct_thrashing_end(void)
&current->delays->thrashing_delay,
&current->delays->thrashing_count);
}

void __delayacct_swapin_start(void)
{
current->delays->swapin_start = local_clock();
}

void __delayacct_swapin_end(void)
{
delayacct_end(&current->delays->lock,
&current->delays->swapin_start,
&current->delays->swapin_delay,
&current->delays->swapin_count);
}

void __delayacct_compact_start(void)
{
current->delays->compact_start = local_clock();
}

void __delayacct_compact_end(void)
{
delayacct_end(&current->delays->lock,
&current->delays->compact_start,
&current->delays->compact_delay,
&current->delays->compact_count);
}
@ -40,7 +40,6 @@ static struct dma_coherent_mem *dma_init_coherent_memory(phys_addr_t phys_addr,
{
struct dma_coherent_mem *dma_mem;
int pages = size >> PAGE_SHIFT;
int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
void *mem_base;

if (!size)
@ -53,7 +52,7 @@ static struct dma_coherent_mem *dma_init_coherent_memory(phys_addr_t phys_addr,
dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
if (!dma_mem)
goto out_unmap_membase;
dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
dma_mem->bitmap = bitmap_zalloc(pages, GFP_KERNEL);
if (!dma_mem->bitmap)
goto out_free_dma_mem;

@ -81,7 +80,7 @@ static void dma_release_coherent_memory(struct dma_coherent_mem *mem)
return;

memunmap(mem->virt_base);
kfree(mem->bitmap);
bitmap_free(mem->bitmap);
kfree(mem);
}
@ -75,15 +75,45 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
|
||||
min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit);
|
||||
}
|
||||
|
||||
static int dma_set_decrypted(struct device *dev, void *vaddr, size_t size)
|
||||
{
|
||||
if (!force_dma_unencrypted(dev))
|
||||
return 0;
|
||||
return set_memory_decrypted((unsigned long)vaddr, 1 << get_order(size));
|
||||
}
|
||||
|
||||
static int dma_set_encrypted(struct device *dev, void *vaddr, size_t size)
|
||||
{
|
||||
int ret;
|
||||
|
||||
if (!force_dma_unencrypted(dev))
|
||||
return 0;
|
||||
ret = set_memory_encrypted((unsigned long)vaddr, 1 << get_order(size));
|
||||
if (ret)
|
||||
pr_warn_ratelimited("leaking DMA memory that can't be re-encrypted\n");
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void __dma_direct_free_pages(struct device *dev, struct page *page,
|
||||
size_t size)
|
||||
{
|
||||
if (IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL) &&
|
||||
swiotlb_free(dev, page, size))
|
||||
if (swiotlb_free(dev, page, size))
|
||||
return;
|
||||
dma_free_contiguous(dev, page, size);
|
||||
}
|
||||
|
||||
static struct page *dma_direct_alloc_swiotlb(struct device *dev, size_t size)
|
||||
{
|
||||
struct page *page = swiotlb_alloc(dev, size);
|
||||
|
||||
if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
|
||||
swiotlb_free(dev, page, size);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return page;
|
||||
}
|
||||
|
||||
static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
|
||||
gfp_t gfp)
|
||||
{
|
||||
@ -93,18 +123,11 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
|
||||
|
||||
WARN_ON_ONCE(!PAGE_ALIGNED(size));
|
||||
|
||||
if (is_swiotlb_for_alloc(dev))
|
||||
return dma_direct_alloc_swiotlb(dev, size);
|
||||
|
||||
gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
|
||||
&phys_limit);
|
||||
if (IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL) &&
|
||||
is_swiotlb_for_alloc(dev)) {
|
||||
page = swiotlb_alloc(dev, size);
|
||||
if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
|
||||
__dma_direct_free_pages(dev, page, size);
|
||||
return NULL;
|
||||
}
|
||||
return page;
|
||||
}
|
||||
|
||||
page = dma_alloc_contiguous(dev, size, gfp);
|
||||
if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
|
||||
dma_free_contiguous(dev, page, size);
|
||||
@ -133,6 +156,15 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
|
||||
return page;
|
||||
}
|
||||
|
||||
/*
|
||||
* Check if a potentially blocking operation needs to dip into the atomic
|
||||
* pools for the given device/gfp.
|
||||
*/
|
||||
static bool dma_direct_use_pool(struct device *dev, gfp_t gfp)
|
||||
{
|
||||
return !gfpflags_allow_blocking(gfp) && !is_swiotlb_for_alloc(dev);
|
||||
}
|
||||
|
||||
static void *dma_direct_alloc_from_pool(struct device *dev, size_t size,
|
||||
dma_addr_t *dma_handle, gfp_t gfp)
|
||||
{
|
||||
@ -140,6 +172,9 @@ static void *dma_direct_alloc_from_pool(struct device *dev, size_t size,
|
||||
u64 phys_mask;
|
||||
void *ret;
|
||||
|
||||
if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_DMA_COHERENT_POOL)))
|
||||
return NULL;
|
||||
|
||||
gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
|
||||
&phys_mask);
|
||||
page = dma_alloc_from_pool(dev, size, &ret, gfp, dma_coherent_ok);
|
||||
@ -149,64 +184,103 @@ static void *dma_direct_alloc_from_pool(struct device *dev, size_t size,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void *dma_direct_alloc_no_mapping(struct device *dev, size_t size,
|
||||
dma_addr_t *dma_handle, gfp_t gfp)
|
||||
{
|
||||
struct page *page;
|
||||
|
||||
page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
|
||||
if (!page)
|
||||
return NULL;
|
||||
|
||||
/* remove any dirty cache lines on the kernel alias */
|
||||
if (!PageHighMem(page))
|
||||
arch_dma_prep_coherent(page, size);
|
||||
|
||||
/* return the page pointer as the opaque cookie */
|
||||
*dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
|
||||
return page;
|
||||
}
|
||||
|
||||
void *dma_direct_alloc(struct device *dev, size_t size,
|
||||
dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
|
||||
{
|
||||
bool remap = false, set_uncached = false;
|
||||
struct page *page;
|
||||
void *ret;
|
||||
int err;
|
||||
|
||||
size = PAGE_ALIGN(size);
|
||||
if (attrs & DMA_ATTR_NO_WARN)
|
||||
gfp |= __GFP_NOWARN;
|
||||
|
||||
if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
|
||||
!force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev)) {
|
||||
page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
|
||||
if (!page)
|
||||
return NULL;
|
||||
/* remove any dirty cache lines on the kernel alias */
|
||||
if (!PageHighMem(page))
|
||||
arch_dma_prep_coherent(page, size);
|
||||
*dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
|
||||
/* return the page pointer as the opaque cookie */
|
||||
return page;
|
||||
}
|
||||
!force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev))
|
||||
return dma_direct_alloc_no_mapping(dev, size, dma_handle, gfp);
|
||||
|
||||
if (!dev_is_dma_coherent(dev)) {
|
||||
/*
|
||||
* Fallback to the arch handler if it exists. This should
|
||||
* eventually go away.
|
||||
*/
|
||||
if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
|
||||
!IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
|
||||
!IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) &&
|
||||
!dev_is_dma_coherent(dev) &&
|
||||
!is_swiotlb_for_alloc(dev))
|
||||
return arch_dma_alloc(dev, size, dma_handle, gfp, attrs);
|
||||
|
||||
if (IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) &&
|
||||
!dev_is_dma_coherent(dev))
|
||||
return dma_alloc_from_global_coherent(dev, size, dma_handle);
|
||||
return arch_dma_alloc(dev, size, dma_handle, gfp,
|
||||
attrs);
|
||||
|
||||
/*
|
||||
* Remapping or decrypting memory may block. If either is required and
|
||||
* we can't block, allocate the memory from the atomic pools.
|
||||
* If restricted DMA (i.e., is_swiotlb_for_alloc) is required, one must
|
||||
* set up another device coherent pool by shared-dma-pool and use
|
||||
* dma_alloc_from_dev_coherent instead.
|
||||
* If there is a global pool, always allocate from it for
|
||||
* non-coherent devices.
|
||||
*/
|
||||
if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
|
||||
!gfpflags_allow_blocking(gfp) &&
|
||||
(force_dma_unencrypted(dev) ||
|
||||
(IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
|
||||
!dev_is_dma_coherent(dev))) &&
|
||||
!is_swiotlb_for_alloc(dev))
|
||||
if (IS_ENABLED(CONFIG_DMA_GLOBAL_POOL))
|
||||
return dma_alloc_from_global_coherent(dev, size,
|
||||
dma_handle);
|
||||
|
||||
/*
|
||||
* Otherwise remap if the architecture is asking for it. But
|
||||
* given that remapping memory is a blocking operation we'll
|
||||
* instead have to dip into the atomic pools.
|
||||
*/
|
||||
remap = IS_ENABLED(CONFIG_DMA_DIRECT_REMAP);
|
||||
if (remap) {
|
||||
if (dma_direct_use_pool(dev, gfp))
|
||||
return dma_direct_alloc_from_pool(dev, size,
|
||||
dma_handle, gfp);
|
||||
} else {
|
||||
if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED))
|
||||
return NULL;
|
||||
set_uncached = true;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Decrypting memory may block, so allocate the memory from the atomic
|
||||
* pools if we can't block.
|
||||
*/
|
||||
if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp))
|
||||
return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
|
||||
|
||||
/* we always manually zero the memory once we are done */
|
||||
page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
|
||||
if (!page)
|
||||
return NULL;
|
||||
if (PageHighMem(page)) {
|
||||
/*
|
||||
* Depending on the cma= arguments and per-arch setup,
|
||||
* dma_alloc_contiguous could return highmem pages.
|
||||
* Without remapping there is no way to return them here, so
|
||||
* log an error and fail.
|
||||
*/
|
||||
if (!IS_ENABLED(CONFIG_DMA_REMAP)) {
|
||||
dev_info(dev, "Rejecting highmem page from CMA.\n");
|
||||
goto out_free_pages;
|
||||
}
|
||||
remap = true;
|
||||
set_uncached = false;
|
||||
}
|
||||
|
||||
if ((IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
|
||||
!dev_is_dma_coherent(dev)) ||
|
||||
(IS_ENABLED(CONFIG_DMA_REMAP) && PageHighMem(page))) {
|
||||
if (remap) {
|
||||
/* remove any dirty cache lines on the kernel alias */
|
||||
arch_dma_prep_coherent(page, size);
|
||||
|
||||
@ -216,56 +290,27 @@ void *dma_direct_alloc(struct device *dev, size_t size,
|
||||
__builtin_return_address(0));
|
||||
if (!ret)
|
||||
goto out_free_pages;
|
||||
if (force_dma_unencrypted(dev)) {
|
||||
err = set_memory_decrypted((unsigned long)ret,
|
||||
1 << get_order(size));
|
||||
if (err)
|
||||
goto out_free_pages;
|
||||
}
|
||||
memset(ret, 0, size);
|
||||
goto done;
|
||||
}
|
||||
|
||||
if (PageHighMem(page)) {
|
||||
/*
|
||||
* Depending on the cma= arguments and per-arch setup
|
||||
* dma_alloc_contiguous could return highmem pages.
|
||||
* Without remapping there is no way to return them here,
|
||||
* so log an error and fail.
|
||||
*/
|
||||
dev_info(dev, "Rejecting highmem page from CMA.\n");
|
||||
goto out_free_pages;
|
||||
}
|
||||
|
||||
} else {
|
||||
ret = page_address(page);
|
||||
if (force_dma_unencrypted(dev)) {
|
||||
err = set_memory_decrypted((unsigned long)ret,
|
||||
1 << get_order(size));
|
||||
if (err)
|
||||
if (dma_set_decrypted(dev, ret, size))
|
||||
goto out_free_pages;
|
||||
}
|
||||
|
||||
memset(ret, 0, size);
|
||||
|
||||
if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
|
||||
!dev_is_dma_coherent(dev)) {
|
||||
if (set_uncached) {
|
||||
arch_dma_prep_coherent(page, size);
|
||||
ret = arch_dma_set_uncached(ret, size);
|
||||
if (IS_ERR(ret))
|
||||
goto out_encrypt_pages;
|
||||
}
|
||||
done:
|
||||
|
||||
*dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
|
||||
return ret;
|
||||
|
||||
out_encrypt_pages:
|
||||
if (force_dma_unencrypted(dev)) {
|
||||
err = set_memory_encrypted((unsigned long)page_address(page),
|
||||
1 << get_order(size));
|
||||
/* If memory cannot be re-encrypted, it must be leaked */
|
||||
if (err)
|
||||
if (dma_set_encrypted(dev, page_address(page), size))
|
||||
return NULL;
|
||||
}
|
||||
out_free_pages:
|
||||
__dma_direct_free_pages(dev, page, size);
|
||||
return NULL;
|
||||
@ -304,13 +349,14 @@ void dma_direct_free(struct device *dev, size_t size,
|
||||
dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size)))
|
||||
return;
|
||||
|
||||
if (force_dma_unencrypted(dev))
|
||||
set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order);
|
||||
|
||||
if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr))
|
||||
if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) {
|
||||
vunmap(cpu_addr);
|
||||
else if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED))
|
||||
} else {
|
||||
if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED))
|
||||
arch_dma_clear_uncached(cpu_addr, size);
|
||||
if (dma_set_encrypted(dev, cpu_addr, 1 << page_order))
|
||||
return;
|
||||
}
|
||||
|
||||
__dma_direct_free_pages(dev, dma_direct_to_page(dev, dma_addr), size);
|
||||
}
|
||||
@ -321,9 +367,7 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
|
||||
struct page *page;
|
||||
void *ret;
|
||||
|
||||
if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
|
||||
force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp) &&
|
||||
!is_swiotlb_for_alloc(dev))
|
||||
if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp))
|
||||
return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
|
||||
|
||||
page = __dma_direct_alloc_pages(dev, size, gfp);
|
||||
@ -341,11 +385,8 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
|
||||
}
|
||||
|
||||
ret = page_address(page);
|
||||
if (force_dma_unencrypted(dev)) {
|
||||
if (set_memory_decrypted((unsigned long)ret,
|
||||
1 << get_order(size)))
|
||||
if (dma_set_decrypted(dev, ret, size))
|
||||
goto out_free_pages;
|
||||
}
|
||||
memset(ret, 0, size);
|
||||
*dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
|
||||
return page;
|
||||
@ -366,9 +407,8 @@ void dma_direct_free_pages(struct device *dev, size_t size,
|
||||
dma_free_from_pool(dev, vaddr, size))
|
||||
return;
|
||||
|
||||
if (force_dma_unencrypted(dev))
|
||||
set_memory_encrypted((unsigned long)vaddr, 1 << page_order);
|
||||
|
||||
if (dma_set_encrypted(dev, vaddr, 1 << page_order))
|
||||
return;
|
||||
__dma_direct_free_pages(dev, page, size);
|
||||
}
|
||||
|
||||
|
@ -296,10 +296,6 @@ dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr,
|
||||
if (WARN_ON_ONCE(!dev->dma_mask))
|
||||
return DMA_MAPPING_ERROR;
|
||||
|
||||
/* Don't allow RAM to be mapped */
|
||||
if (WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr))))
|
||||
return DMA_MAPPING_ERROR;
|
||||
|
||||
if (dma_map_direct(dev, ops))
|
||||
addr = dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
|
||||
else if (ops->map_resource)
|
||||
|
@ -34,7 +34,7 @@
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/gfp.h>
|
||||
#include <linux/scatterlist.h>
|
||||
#include <linux/mem_encrypt.h>
|
||||
#include <linux/cc_platform.h>
|
||||
#include <linux/set_memory.h>
|
||||
#ifdef CONFIG_DEBUG_FS
|
||||
#include <linux/debugfs.h>
|
||||
@ -50,6 +50,7 @@
|
||||
#include <asm/io.h>
|
||||
#include <asm/dma.h>
|
||||
|
||||
#include <linux/io.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/memblock.h>
|
||||
#include <linux/iommu-helper.h>
|
||||
@ -72,6 +73,8 @@ enum swiotlb_force swiotlb_force;
|
||||
|
||||
struct io_tlb_mem io_tlb_default_mem;
|
||||
|
||||
phys_addr_t swiotlb_unencrypted_base;
|
||||
|
||||
/*
|
||||
* Max segment that we can provide which (if pages are contiguous) will
|
||||
* not be bounced (unless SWIOTLB_FORCE is set).
|
||||
@ -155,6 +158,34 @@ static inline unsigned long nr_slots(u64 val)
|
||||
return DIV_ROUND_UP(val, IO_TLB_SIZE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Remap swiotlb memory in the unencrypted physical address space
|
||||
* when swiotlb_unencrypted_base is set. (e.g. for Hyper-V AMD SEV-SNP
|
||||
* Isolation VMs).
|
||||
*/
|
||||
#ifdef CONFIG_HAS_IOMEM
|
||||
static void *swiotlb_mem_remap(struct io_tlb_mem *mem, unsigned long bytes)
|
||||
{
|
||||
void *vaddr = NULL;
|
||||
|
||||
if (swiotlb_unencrypted_base) {
|
||||
phys_addr_t paddr = mem->start + swiotlb_unencrypted_base;
|
||||
|
||||
vaddr = memremap(paddr, bytes, MEMREMAP_WB);
|
||||
if (!vaddr)
|
||||
pr_err("Failed to map the unencrypted memory %pa size %lx.\n",
|
||||
&paddr, bytes);
|
||||
}
|
||||
|
||||
return vaddr;
|
||||
}
|
||||
#else
|
||||
static void *swiotlb_mem_remap(struct io_tlb_mem *mem, unsigned long bytes)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Early SWIOTLB allocation may be too early to allow an architecture to
|
||||
* perform the desired operations. This function allows the architecture to
|
||||
@ -172,7 +203,12 @@ void __init swiotlb_update_mem_attributes(void)
|
||||
vaddr = phys_to_virt(mem->start);
|
||||
bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT);
|
||||
set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT);
|
||||
memset(vaddr, 0, bytes);
|
||||
|
||||
mem->vaddr = swiotlb_mem_remap(mem, bytes);
|
||||
if (!mem->vaddr)
|
||||
mem->vaddr = vaddr;
|
||||
|
||||
memset(mem->vaddr, 0, bytes);
|
||||
}
|
||||
|
||||
static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
|
||||
@ -196,7 +232,17 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
|
||||
mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
|
||||
mem->slots[i].alloc_size = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* If swiotlb_unencrypted_base is set, the bounce buffer memory will
|
||||
* be remapped and cleared in swiotlb_update_mem_attributes.
|
||||
*/
|
||||
if (swiotlb_unencrypted_base)
|
||||
return;
|
||||
|
||||
memset(vaddr, 0, bytes);
|
||||
mem->vaddr = vaddr;
|
||||
return;
|
||||
}
|
||||
|
||||
int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
|
||||
@ -247,7 +293,7 @@ swiotlb_init(int verbose)
|
||||
return;
|
||||
|
||||
fail_free_mem:
|
||||
memblock_free_early(__pa(tlb), bytes);
|
||||
memblock_free(tlb, bytes);
|
||||
fail:
|
||||
pr_warn("Cannot allocate buffer");
|
||||
}
|
||||
@ -371,7 +417,7 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size
|
||||
phys_addr_t orig_addr = mem->slots[index].orig_addr;
|
||||
size_t alloc_size = mem->slots[index].alloc_size;
|
||||
unsigned long pfn = PFN_DOWN(orig_addr);
|
||||
unsigned char *vaddr = phys_to_virt(tlb_addr);
|
||||
unsigned char *vaddr = mem->vaddr + tlb_addr - mem->start;
|
||||
unsigned int tlb_offset, orig_addr_offset;
|
||||
|
||||
if (orig_addr == INVALID_PHYS_ADDR)
|
||||
@ -459,7 +505,7 @@ static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index)
|
||||
* allocate a buffer from that IO TLB pool.
|
||||
*/
|
||||
static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
|
||||
size_t alloc_size)
|
||||
size_t alloc_size, unsigned int alloc_align_mask)
|
||||
{
|
||||
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
|
||||
unsigned long boundary_mask = dma_get_seg_boundary(dev);
|
||||
@ -483,6 +529,7 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
|
||||
stride = (iotlb_align_mask >> IO_TLB_SHIFT) + 1;
|
||||
if (alloc_size >= PAGE_SIZE)
|
||||
stride = max(stride, stride << (PAGE_SHIFT - IO_TLB_SHIFT));
|
||||
stride = max(stride, (alloc_align_mask >> IO_TLB_SHIFT) + 1);
|
||||
|
||||
spin_lock_irqsave(&mem->lock, flags);
|
||||
if (unlikely(nslots > mem->nslabs - mem->used))
|
||||
@ -541,7 +588,8 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
|
||||
|
||||
phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
|
||||
size_t mapping_size, size_t alloc_size,
|
||||
enum dma_data_direction dir, unsigned long attrs)
|
||||
unsigned int alloc_align_mask, enum dma_data_direction dir,
|
||||
unsigned long attrs)
|
||||
{
|
||||
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
|
||||
unsigned int offset = swiotlb_align_offset(dev, orig_addr);
|
||||
@ -552,7 +600,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
|
||||
if (!mem)
|
||||
panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
|
||||
|
||||
if (mem_encrypt_active())
|
||||
if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
|
||||
pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n");
|
||||
|
||||
if (mapping_size > alloc_size) {
|
||||
@ -561,7 +609,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
|
||||
return (phys_addr_t)DMA_MAPPING_ERROR;
|
||||
}
|
||||
|
||||
index = swiotlb_find_slots(dev, orig_addr, alloc_size + offset);
|
||||
index = swiotlb_find_slots(dev, orig_addr,
|
||||
alloc_size + offset, alloc_align_mask);
|
||||
if (index == -1) {
|
||||
if (!(attrs & DMA_ATTR_NO_WARN))
|
||||
dev_warn_ratelimited(dev,
|
||||
@ -683,7 +732,7 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
|
||||
trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size,
|
||||
swiotlb_force);
|
||||
|
||||
swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, size, dir,
|
||||
swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, size, 0, dir,
|
||||
attrs);
|
||||
if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR)
|
||||
return DMA_MAPPING_ERROR;
|
||||
@ -767,7 +816,7 @@ struct page *swiotlb_alloc(struct device *dev, size_t size)
|
||||
if (!mem)
|
||||
return NULL;
|
||||
|
||||
index = swiotlb_find_slots(dev, 0, size);
|
||||
index = swiotlb_find_slots(dev, 0, size, 0);
|
||||
if (index == -1)
|
||||
return NULL;
|
||||
|
||||
|
@ -187,7 +187,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
|
||||
/* Check if any of the above work has queued a deferred wakeup */
|
||||
tick_nohz_user_enter_prepare();
|
||||
|
||||
ti_work = READ_ONCE(current_thread_info()->flags);
|
||||
ti_work = read_thread_flags();
|
||||
}
|
||||
|
||||
/* Return the latest work state for arch_exit_to_user_mode() */
|
||||
@ -196,7 +196,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
|
||||
|
||||
static void exit_to_user_mode_prepare(struct pt_regs *regs)
|
||||
{
|
||||
unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
|
||||
unsigned long ti_work = read_thread_flags();
|
||||
|
||||
lockdep_assert_irqs_disabled();
|
||||
|
||||
|
@ -26,7 +26,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ti_work = READ_ONCE(current_thread_info()->flags);
|
||||
ti_work = read_thread_flags();
|
||||
} while (ti_work & XFER_TO_GUEST_MODE_WORK || need_resched());
|
||||
return 0;
|
||||
}
|
||||
@ -43,7 +43,7 @@ int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu)
|
||||
* disabled in the inner loop before going into guest mode. No need
|
||||
* to disable interrupts here.
|
||||
*/
|
||||
ti_work = READ_ONCE(current_thread_info()->flags);
|
||||
ti_work = read_thread_flags();
|
||||
if (!(ti_work & XFER_TO_GUEST_MODE_WORK))
|
||||
return 0;
|
||||
|
||||
|
@ -1,10 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE)
endif

obj-y := core.o ring_buffer.o callchain.o

obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
obj-$(CONFIG_UPROBES) += uprobes.o
|
||||
|
||||
|
@ -1875,6 +1875,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
|
||||
|
||||
list_add_rcu(&event->event_entry, &ctx->event_list);
|
||||
ctx->nr_events++;
|
||||
if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
|
||||
ctx->nr_user++;
|
||||
if (event->attr.inherit_stat)
|
||||
ctx->nr_stat++;
|
||||
|
||||
@ -2066,6 +2068,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
|
||||
event->attach_state &= ~PERF_ATTACH_CONTEXT;
|
||||
|
||||
ctx->nr_events--;
|
||||
if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
|
||||
ctx->nr_user--;
|
||||
if (event->attr.inherit_stat)
|
||||
ctx->nr_stat--;
|
||||
|
||||
@ -6598,33 +6602,43 @@ static void perf_pending_event(struct irq_work *entry)
|
||||
perf_swevent_put_recursion_context(rctx);
|
||||
}
|
||||
|
||||
/*
|
||||
* We assume there is only KVM supporting the callbacks.
|
||||
* Later on, we might change it to a list if there is
|
||||
* another virtualization implementation supporting the callbacks.
|
||||
*/
|
||||
#ifdef CONFIG_GUEST_PERF_EVENTS
|
||||
struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
|
||||
|
||||
int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
|
||||
DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state);
|
||||
DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip);
|
||||
DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr);
|
||||
|
||||
void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
|
||||
{
|
||||
if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs)))
|
||||
return -EBUSY;
|
||||
return;
|
||||
|
||||
rcu_assign_pointer(perf_guest_cbs, cbs);
|
||||
return 0;
|
||||
static_call_update(__perf_guest_state, cbs->state);
|
||||
static_call_update(__perf_guest_get_ip, cbs->get_ip);
|
||||
|
||||
/* Implementing ->handle_intel_pt_intr is optional. */
|
||||
if (cbs->handle_intel_pt_intr)
|
||||
static_call_update(__perf_guest_handle_intel_pt_intr,
|
||||
cbs->handle_intel_pt_intr);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
|
||||
|
||||
int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
|
||||
void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
|
||||
{
|
||||
if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs))
|
||||
return -EINVAL;
|
||||
return;
|
||||
|
||||
rcu_assign_pointer(perf_guest_cbs, NULL);
|
||||
static_call_update(__perf_guest_state, (void *)&__static_call_return0);
|
||||
static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0);
|
||||
static_call_update(__perf_guest_handle_intel_pt_intr,
|
||||
(void *)&__static_call_return0);
|
||||
synchronize_rcu();
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
|
||||
#endif
|
||||
|
||||
static void
|
||||
perf_output_sample_regs(struct perf_output_handle *handle,
|
||||
@ -9183,6 +9197,36 @@ static void perf_log_itrace_start(struct perf_event *event)
|
||||
perf_output_end(&handle);
|
||||
}
|
||||
|
||||
void perf_report_aux_output_id(struct perf_event *event, u64 hw_id)
|
||||
{
|
||||
struct perf_output_handle handle;
|
||||
struct perf_sample_data sample;
|
||||
struct perf_aux_event {
|
||||
struct perf_event_header header;
|
||||
u64 hw_id;
|
||||
} rec;
|
||||
int ret;
|
||||
|
||||
if (event->parent)
|
||||
event = event->parent;
|
||||
|
||||
rec.header.type = PERF_RECORD_AUX_OUTPUT_HW_ID;
|
||||
rec.header.misc = 0;
|
||||
rec.header.size = sizeof(rec);
|
||||
rec.hw_id = hw_id;
|
||||
|
||||
perf_event_header__init_id(&rec.header, &sample, event);
|
||||
ret = perf_output_begin(&handle, &sample, event, rec.header.size);
|
||||
|
||||
if (ret)
|
||||
return;
|
||||
|
||||
perf_output_put(&handle, rec);
|
||||
perf_event__output_id_sample(event, &handle, &sample);
|
||||
|
||||
perf_output_end(&handle);
|
||||
}
|
||||
|
||||
static int
|
||||
__perf_event_account_interrupt(struct perf_event *event, int throttle)
|
||||
{
|
||||
@ -13548,3 +13592,5 @@ struct cgroup_subsys perf_event_cgrp_subsys = {
|
||||
.threaded = true,
|
||||
};
|
||||
#endif /* CONFIG_CGROUP_PERF */
|
||||
|
||||
DEFINE_STATIC_CALL_RET0(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t);
|
||||
|
@ -205,12 +205,7 @@ DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)

static inline int get_recursion_context(int *recursion)
{
	unsigned int pc = preempt_count();
	unsigned char rctx = 0;

	rctx += !!(pc & (NMI_MASK));
	rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK));
	rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));
	unsigned char rctx = interrupt_context_level();

	if (recursion[rctx])
		return -1;
|
||||
|
@ -167,7 +167,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
				addr + PAGE_SIZE);

	if (new_page) {
		err = mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL);
		err = mem_cgroup_charge(page_folio(new_page), vma->vm_mm,
					GFP_KERNEL);
		if (err)
			return err;
	}
|
||||
|
kernel/exit.c
@ -48,7 +48,6 @@
|
||||
#include <linux/pipe_fs_i.h>
|
||||
#include <linux/audit.h> /* for audit_free() */
|
||||
#include <linux/resource.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/task_io_accounting_ops.h>
|
||||
#include <linux/tracehook.h>
|
||||
#include <linux/fs_struct.h>
|
||||
@ -64,6 +63,7 @@
|
||||
#include <linux/rcuwait.h>
|
||||
#include <linux/compat.h>
|
||||
#include <linux/io_uring.h>
|
||||
#include <linux/kprobes.h>
|
||||
|
||||
#include <linux/uaccess.h>
|
||||
#include <asm/unistd.h>
|
||||
@ -116,7 +116,7 @@ static void __exit_signal(struct task_struct *tsk)
|
||||
* then notify it:
|
||||
*/
|
||||
if (sig->notify_count > 0 && !--sig->notify_count)
|
||||
wake_up_process(sig->group_exit_task);
|
||||
wake_up_process(sig->group_exec_task);
|
||||
|
||||
if (tsk == sig->curr_target)
|
||||
sig->curr_target = next_thread(tsk);
|
||||
@ -168,6 +168,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
|
||||
{
|
||||
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
|
||||
|
||||
kprobe_flush_task(tsk);
|
||||
perf_event_delayed_put(tsk);
|
||||
trace_sched_process_free(tsk);
|
||||
put_task_struct(tsk);
|
||||
@ -339,6 +340,46 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
|
||||
}
|
||||
}
|
||||
|
||||
static void coredump_task_exit(struct task_struct *tsk)
|
||||
{
|
||||
struct core_state *core_state;
|
||||
|
||||
/*
|
||||
* Serialize with any possible pending coredump.
|
||||
* We must hold siglock around checking core_state
|
||||
* and setting PF_POSTCOREDUMP. The core-inducing thread
|
||||
* will increment ->nr_threads for each thread in the
|
||||
* group without PF_POSTCOREDUMP set.
|
||||
*/
|
||||
spin_lock_irq(&tsk->sighand->siglock);
|
||||
tsk->flags |= PF_POSTCOREDUMP;
|
||||
core_state = tsk->signal->core_state;
|
||||
spin_unlock_irq(&tsk->sighand->siglock);
|
||||
if (core_state) {
|
||||
struct core_thread self;
|
||||
|
||||
self.task = current;
|
||||
if (self.task->flags & PF_SIGNALED)
|
||||
self.next = xchg(&core_state->dumper.next, &self);
|
||||
else
|
||||
self.task = NULL;
|
||||
/*
|
||||
* Implies mb(), the result of xchg() must be visible
|
||||
* to core_state->dumper.
|
||||
*/
|
||||
if (atomic_dec_and_test(&core_state->nr_threads))
|
||||
complete(&core_state->startup);
|
||||
|
||||
for (;;) {
|
||||
set_current_state(TASK_UNINTERRUPTIBLE);
|
||||
if (!self.task) /* see coredump_finish() */
|
||||
break;
|
||||
freezable_schedule();
|
||||
}
|
||||
__set_current_state(TASK_RUNNING);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMCG
|
||||
/*
|
||||
* A task is exiting. If it owned this mm, find a new owner for the mm.
|
||||
@ -434,47 +475,12 @@ void mm_update_next_owner(struct mm_struct *mm)
|
||||
static void exit_mm(void)
|
||||
{
|
||||
struct mm_struct *mm = current->mm;
|
||||
struct core_state *core_state;
|
||||
|
||||
exit_mm_release(current, mm);
|
||||
if (!mm)
|
||||
return;
|
||||
sync_mm_rss(mm);
|
||||
/*
|
||||
* Serialize with any possible pending coredump.
|
||||
* We must hold mmap_lock around checking core_state
|
||||
* and clearing tsk->mm. The core-inducing thread
|
||||
* will increment ->nr_threads for each thread in the
|
||||
* group with ->mm != NULL.
|
||||
*/
|
||||
mmap_read_lock(mm);
|
||||
core_state = mm->core_state;
|
||||
if (core_state) {
|
||||
struct core_thread self;
|
||||
|
||||
mmap_read_unlock(mm);
|
||||
|
||||
self.task = current;
|
||||
if (self.task->flags & PF_SIGNALED)
|
||||
self.next = xchg(&core_state->dumper.next, &self);
|
||||
else
|
||||
self.task = NULL;
|
||||
/*
|
||||
* Implies mb(), the result of xchg() must be visible
|
||||
* to core_state->dumper.
|
||||
*/
|
||||
if (atomic_dec_and_test(&core_state->nr_threads))
|
||||
complete(&core_state->startup);
|
||||
|
||||
for (;;) {
|
||||
set_current_state(TASK_UNINTERRUPTIBLE);
|
||||
if (!self.task) /* see coredump_finish() */
|
||||
break;
|
||||
freezable_schedule();
|
||||
}
|
||||
__set_current_state(TASK_RUNNING);
|
||||
mmap_read_lock(mm);
|
||||
}
|
||||
mmgrab(mm);
|
||||
BUG_ON(mm != current->active_mm);
|
||||
/* more a memory barrier than a real lock */
|
||||
@ -691,7 +697,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
|
||||
|
||||
/* mt-exec, de_thread() is waiting for group leader */
|
||||
if (unlikely(tsk->signal->notify_count < 0))
|
||||
wake_up_process(tsk->signal->group_exit_task);
|
||||
wake_up_process(tsk->signal->group_exec_task);
|
||||
write_unlock_irq(&tasklist_lock);
|
||||
|
||||
list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
|
||||
@ -729,54 +735,29 @@ void __noreturn do_exit(long code)
|
||||
struct task_struct *tsk = current;
|
||||
int group_dead;
|
||||
|
||||
/*
|
||||
* We can get here from a kernel oops, sometimes with preemption off.
|
||||
* Start by checking for critical errors.
|
||||
* Then fix up important state like USER_DS and preemption.
|
||||
* Then do everything else.
|
||||
*/
|
||||
|
||||
WARN_ON(blk_needs_flush_plug(tsk));
|
||||
|
||||
if (unlikely(in_interrupt()))
|
||||
panic("Aiee, killing interrupt handler!");
|
||||
if (unlikely(!tsk->pid))
|
||||
panic("Attempted to kill the idle task!");
|
||||
|
||||
/*
|
||||
* If do_exit is called because this processes oopsed, it's possible
|
||||
* If do_dead is called because this processes oopsed, it's possible
|
||||
* that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
|
||||
* continuing. Amongst other possible reasons, this is to prevent
|
||||
* mm_release()->clear_child_tid() from writing to a user-controlled
|
||||
* kernel address.
|
||||
*
|
||||
* On uptodate architectures force_uaccess_begin is a noop. On
|
||||
* architectures that still have set_fs/get_fs in addition to handling
|
||||
* oopses handles kernel threads that run as set_fs(KERNEL_DS) by
|
||||
* default.
|
||||
*/
|
||||
force_uaccess_begin();
|
||||
|
||||
if (unlikely(in_atomic())) {
|
||||
pr_info("note: %s[%d] exited with preempt_count %d\n",
|
||||
current->comm, task_pid_nr(current),
|
||||
preempt_count());
|
||||
preempt_count_set(PREEMPT_ENABLED);
|
||||
}
|
||||
|
||||
profile_task_exit(tsk);
|
||||
kcov_task_exit(tsk);
|
||||
|
||||
coredump_task_exit(tsk);
|
||||
ptrace_event(PTRACE_EVENT_EXIT, code);
|
||||
|
||||
validate_creds_for_do_exit(tsk);
|
||||
|
||||
/*
|
||||
* We're taking recursive faults here in do_exit. Safest is to just
|
||||
* leave this task alone and wait for reboot.
|
||||
*/
|
||||
if (unlikely(tsk->flags & PF_EXITING)) {
|
||||
pr_alert("Fixing recursive fault but reboot is needed!\n");
|
||||
futex_exit_recursive(tsk);
|
||||
set_current_state(TASK_UNINTERRUPTIBLE);
|
||||
schedule();
|
||||
}
|
||||
|
||||
io_uring_files_cancel();
|
||||
exit_signals(tsk); /* sets PF_EXITING */
|
||||
|
||||
@ -875,16 +856,46 @@ void __noreturn do_exit(long code)
|
||||
lockdep_free_task(tsk);
|
||||
do_task_dead();
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(do_exit);
|
||||
|
||||
void complete_and_exit(struct completion *comp, long code)
|
||||
void __noreturn make_task_dead(int signr)
|
||||
{
|
||||
if (comp)
|
||||
complete(comp);
|
||||
/*
|
||||
* Take the task off the cpu after something catastrophic has
|
||||
* happened.
|
||||
*
|
||||
* We can get here from a kernel oops, sometimes with preemption off.
|
||||
* Start by checking for critical errors.
|
||||
* Then fix up important state like USER_DS and preemption.
|
||||
* Then do everything else.
|
||||
*/
|
||||
struct task_struct *tsk = current;
|
||||
|
||||
do_exit(code);
|
||||
if (unlikely(in_interrupt()))
|
||||
panic("Aiee, killing interrupt handler!");
|
||||
if (unlikely(!tsk->pid))
|
||||
panic("Attempted to kill the idle task!");
|
||||
|
||||
if (unlikely(in_atomic())) {
|
||||
pr_info("note: %s[%d] exited with preempt_count %d\n",
|
||||
current->comm, task_pid_nr(current),
|
||||
preempt_count());
|
||||
preempt_count_set(PREEMPT_ENABLED);
|
||||
}
|
||||
|
||||
/*
|
||||
* We're taking recursive faults here in make_task_dead. Safest is to just
|
||||
* leave this task alone and wait for reboot.
|
||||
*/
|
||||
if (unlikely(tsk->flags & PF_EXITING)) {
|
||||
pr_alert("Fixing recursive fault but reboot is needed!\n");
|
||||
futex_exit_recursive(tsk);
|
||||
tsk->exit_state = EXIT_DEAD;
|
||||
refcount_inc(&tsk->rcu_users);
|
||||
do_task_dead();
|
||||
}
|
||||
|
||||
do_exit(signr);
|
||||
}
|
||||
EXPORT_SYMBOL(complete_and_exit);
|
||||
|
||||
SYSCALL_DEFINE1(exit, int, error_code)
|
||||
{
|
||||
@ -900,17 +911,19 @@ do_group_exit(int exit_code)
|
||||
{
|
||||
struct signal_struct *sig = current->signal;
|
||||
|
||||
BUG_ON(exit_code & 0x80); /* core dumps don't get here */
|
||||
|
||||
if (signal_group_exit(sig))
|
||||
if (sig->flags & SIGNAL_GROUP_EXIT)
|
||||
exit_code = sig->group_exit_code;
|
||||
else if (sig->group_exec_task)
|
||||
exit_code = 0;
|
||||
else if (!thread_group_empty(current)) {
|
||||
struct sighand_struct *const sighand = current->sighand;
|
||||
|
||||
spin_lock_irq(&sighand->siglock);
|
||||
if (signal_group_exit(sig))
|
||||
if (sig->flags & SIGNAL_GROUP_EXIT)
|
||||
/* Another thread got here before we took the lock. */
|
||||
exit_code = sig->group_exit_code;
|
||||
else if (sig->group_exec_task)
|
||||
exit_code = 0;
|
||||
else {
|
||||
sig->group_exit_code = exit_code;
|
||||
sig->flags = SIGNAL_GROUP_EXIT;
|
||||
@ -1005,7 +1018,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
|
||||
return 0;
|
||||
|
||||
if (unlikely(wo->wo_flags & WNOWAIT)) {
|
||||
status = p->exit_code;
|
||||
status = (p->signal->flags & SIGNAL_GROUP_EXIT)
|
||||
? p->signal->group_exit_code : p->exit_code;
|
||||
get_task_struct(p);
|
||||
read_unlock(&tasklist_lock);
|
||||
sched_annotate_sleep();
|
||||
|
@ -62,40 +62,13 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
|
||||
return e;
|
||||
}
|
||||
|
||||
int init_kernel_text(unsigned long addr)
|
||||
{
|
||||
if (addr >= (unsigned long)_sinittext &&
|
||||
addr < (unsigned long)_einittext)
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
int notrace core_kernel_text(unsigned long addr)
|
||||
{
|
||||
if (addr >= (unsigned long)_stext &&
|
||||
addr < (unsigned long)_etext)
|
||||
if (is_kernel_text(addr))
|
||||
return 1;
|
||||
|
||||
if (system_state < SYSTEM_RUNNING &&
|
||||
init_kernel_text(addr))
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* core_kernel_data - tell if addr points to kernel data
|
||||
* @addr: address to test
|
||||
*
|
||||
* Returns true if @addr passed in is from the core kernel data
|
||||
* section.
|
||||
*
|
||||
* Note: On some archs it may return true for core RODATA, and false
|
||||
* for others. But will always be true for core RW data.
|
||||
*/
|
||||
int core_kernel_data(unsigned long addr)
|
||||
{
|
||||
if (addr >= (unsigned long)_sdata &&
|
||||
addr < (unsigned long)_edata)
|
||||
if (system_state < SYSTEM_FREEING_INITMEM &&
|
||||
is_kernel_inittext(addr))
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
@ -112,7 +85,7 @@ int __kernel_text_address(unsigned long addr)
|
||||
* Since we are after the module-symbols check, there's
|
||||
* no danger of address overlap:
|
||||
*/
|
||||
if (init_kernel_text(addr))
|
||||
if (is_kernel_inittext(addr))
|
||||
return 1;
|
||||
return 0;
|
||||
}
|
||||
|
@ -42,6 +42,7 @@
|
||||
#include <linux/mmu_notifier.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/mm_inline.h>
|
||||
#include <linux/vmacache.h>
|
||||
#include <linux/nsproxy.h>
|
||||
#include <linux/capability.h>
|
||||
@ -76,7 +77,6 @@
|
||||
#include <linux/taskstats_kern.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/tty.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/fs_struct.h>
|
||||
#include <linux/magic.h>
|
||||
#include <linux/perf_event.h>
|
||||
@ -366,12 +366,14 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
|
||||
*new = data_race(*orig);
|
||||
INIT_LIST_HEAD(&new->anon_vma_chain);
|
||||
new->vm_next = new->vm_prev = NULL;
|
||||
dup_anon_vma_name(orig, new);
|
||||
}
|
||||
return new;
|
||||
}
|
||||
|
||||
void vm_area_free(struct vm_area_struct *vma)
|
||||
{
|
||||
free_anon_vma_name(vma);
|
||||
kmem_cache_free(vm_area_cachep, vma);
|
||||
}
|
||||
|
||||
@ -755,8 +757,6 @@ void __put_task_struct(struct task_struct *tsk)
|
||||
delayacct_tsk_free(tsk);
|
||||
put_signal_struct(tsk->signal);
|
||||
sched_core_free(tsk);
|
||||
|
||||
if (!profile_handoff_task(tsk))
|
||||
free_task(tsk);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__put_task_struct);
|
||||
@ -951,7 +951,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
|
||||
tsk->splice_pipe = NULL;
|
||||
tsk->task_frag.page = NULL;
|
||||
tsk->wake_q.next = NULL;
|
||||
tsk->pf_io_worker = NULL;
|
||||
tsk->worker_private = NULL;
|
||||
|
||||
account_kernel_stack(tsk, 1);
|
||||
|
||||
@ -1044,7 +1044,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
|
||||
seqcount_init(&mm->write_protect_seq);
|
||||
mmap_init_lock(mm);
|
||||
INIT_LIST_HEAD(&mm->mmlist);
|
||||
mm->core_state = NULL;
|
||||
mm_pgtables_bytes_init(mm);
|
||||
mm->map_count = 0;
|
||||
mm->locked_vm = 0;
|
||||
@ -1392,8 +1391,7 @@ static void mm_release(struct task_struct *tsk, struct mm_struct *mm)
|
||||
* purposes.
|
||||
*/
|
||||
if (tsk->clear_child_tid) {
|
||||
if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) &&
|
||||
atomic_read(&mm->mm_users) > 1) {
|
||||
if (atomic_read(&mm->mm_users) > 1) {
|
||||
/*
|
||||
* We don't check the error code - if userspace has
|
||||
* not set up a proper pointer then tough luck.
|
||||
@ -1559,32 +1557,6 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
|
||||
return error;
|
||||
}
|
||||
|
||||
static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
|
||||
{
|
||||
#ifdef CONFIG_BLOCK
|
||||
struct io_context *ioc = current->io_context;
|
||||
struct io_context *new_ioc;
|
||||
|
||||
if (!ioc)
|
||||
return 0;
|
||||
/*
|
||||
* Share io context with parent, if CLONE_IO is set
|
||||
*/
|
||||
if (clone_flags & CLONE_IO) {
|
||||
ioc_task_link(ioc);
|
||||
tsk->io_context = ioc;
|
||||
} else if (ioprio_valid(ioc->ioprio)) {
|
||||
new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
|
||||
if (unlikely(!new_ioc))
|
||||
return -ENOMEM;
|
||||
|
||||
new_ioc->ioprio = ioc->ioprio;
|
||||
put_io_context(new_ioc);
|
||||
}
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
|
||||
{
|
||||
struct sighand_struct *sig;
|
||||
@ -2035,12 +2007,6 @@ static __latent_entropy struct task_struct *copy_process(
|
||||
siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
|
||||
}
|
||||
|
||||
/*
|
||||
* This _must_ happen before we call free_task(), i.e. before we jump
|
||||
* to any of the bad_fork_* labels. This is to avoid freeing
|
||||
* p->set_child_tid which is (ab)used as a kthread's data pointer for
|
||||
* kernel threads (PF_KTHREAD).
|
||||
*/
|
||||
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
|
||||
/*
|
||||
* Clear TID on mm_release()?
|
||||
@ -2121,12 +2087,16 @@ static __latent_entropy struct task_struct *copy_process(
|
||||
p->io_context = NULL;
|
||||
audit_set_context(p, NULL);
|
||||
cgroup_fork(p);
|
||||
if (p->flags & PF_KTHREAD) {
|
||||
if (!set_kthread_struct(p))
|
||||
goto bad_fork_cleanup_delayacct;
|
||||
}
|
||||
#ifdef CONFIG_NUMA
|
||||
p->mempolicy = mpol_dup(p->mempolicy);
|
||||
if (IS_ERR(p->mempolicy)) {
|
||||
retval = PTR_ERR(p->mempolicy);
|
||||
p->mempolicy = NULL;
|
||||
goto bad_fork_cleanup_threadgroup_lock;
|
||||
goto bad_fork_cleanup_delayacct;
|
||||
}
|
||||
#endif
|
||||
#ifdef CONFIG_CPUSETS
|
||||
@ -2473,8 +2443,8 @@ static __latent_entropy struct task_struct *copy_process(
|
||||
lockdep_free_task(p);
|
||||
#ifdef CONFIG_NUMA
|
||||
mpol_put(p->mempolicy);
|
||||
bad_fork_cleanup_threadgroup_lock:
|
||||
#endif
|
||||
bad_fork_cleanup_delayacct:
|
||||
delayacct_tsk_free(p);
|
||||
bad_fork_cleanup_count:
|
||||
dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
|
||||
@ -3038,7 +3008,7 @@ int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
|
||||
int ksys_unshare(unsigned long unshare_flags)
|
||||
{
|
||||
struct fs_struct *fs, *new_fs = NULL;
|
||||
struct files_struct *fd, *new_fd = NULL;
|
||||
struct files_struct *new_fd = NULL;
|
||||
struct cred *new_cred = NULL;
|
||||
struct nsproxy *new_nsproxy = NULL;
|
||||
int do_sysvsem = 0;
|
||||
@ -3125,11 +3095,8 @@ int ksys_unshare(unsigned long unshare_flags)
|
||||
spin_unlock(&fs->lock);
|
||||
}
|
||||
|
||||
if (new_fd) {
|
||||
fd = current->files;
|
||||
current->files = new_fd;
|
||||
new_fd = fd;
|
||||
}
|
||||
if (new_fd)
|
||||
swap(current->files, new_fd);
|
||||
|
||||
task_unlock(current);
|
||||
|
||||
|
@ -4,7 +4,6 @@ menu "GCOV-based kernel profiling"
config GCOV_KERNEL
	bool "Enable gcov-based kernel profiling"
	depends on DEBUG_FS
	depends on !CC_IS_CLANG || CLANG_VERSION >= 110000
	depends on !ARCH_WANTS_NO_INSTR || CC_HAS_NO_PROFILE_FN_ATTR
	select CONSTRUCTORS
	default n
|
||||
|
@ -63,7 +63,9 @@ static struct task_struct *watchdog_task;
|
||||
* Should we dump all CPUs backtraces in a hung task event?
|
||||
* Defaults to 0, can be changed via sysctl.
|
||||
*/
|
||||
unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace;
|
||||
static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace;
|
||||
#else
|
||||
#define sysctl_hung_task_all_cpu_backtrace 0
|
||||
#endif /* CONFIG_SMP */
|
||||
|
||||
/*
|
||||
@ -222,11 +224,13 @@ static long hung_timeout_jiffies(unsigned long last_checked,
|
||||
MAX_SCHEDULE_TIMEOUT;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SYSCTL
|
||||
/*
|
||||
* Process updating of timeout sysctl
|
||||
*/
|
||||
int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
|
||||
void *buffer, size_t *lenp, loff_t *ppos)
|
||||
static int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
|
||||
void __user *buffer,
|
||||
size_t *lenp, loff_t *ppos)
|
||||
{
|
||||
int ret;
|
||||
|
||||
@ -241,6 +245,76 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs
|
||||
* and hung_task_check_interval_secs
|
||||
*/
|
||||
static const unsigned long hung_task_timeout_max = (LONG_MAX / HZ);
|
||||
static struct ctl_table hung_task_sysctls[] = {
|
||||
#ifdef CONFIG_SMP
|
||||
{
|
||||
.procname = "hung_task_all_cpu_backtrace",
|
||||
.data = &sysctl_hung_task_all_cpu_backtrace,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = SYSCTL_ONE,
|
||||
},
|
||||
#endif /* CONFIG_SMP */
|
||||
{
|
||||
.procname = "hung_task_panic",
|
||||
.data = &sysctl_hung_task_panic,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
.extra2 = SYSCTL_ONE,
|
||||
},
|
||||
{
|
||||
.procname = "hung_task_check_count",
|
||||
.data = &sysctl_hung_task_check_count,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = SYSCTL_ZERO,
|
||||
},
|
||||
{
|
||||
.procname = "hung_task_timeout_secs",
|
||||
.data = &sysctl_hung_task_timeout_secs,
|
||||
.maxlen = sizeof(unsigned long),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dohung_task_timeout_secs,
|
||||
.extra2 = (void *)&hung_task_timeout_max,
|
||||
},
|
||||
{
|
||||
.procname = "hung_task_check_interval_secs",
|
||||
.data = &sysctl_hung_task_check_interval_secs,
|
||||
.maxlen = sizeof(unsigned long),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dohung_task_timeout_secs,
|
||||
.extra2 = (void *)&hung_task_timeout_max,
|
||||
},
|
||||
{
|
||||
.procname = "hung_task_warnings",
|
||||
.data = &sysctl_hung_task_warnings,
|
||||
.maxlen = sizeof(int),
|
||||
.mode = 0644,
|
||||
.proc_handler = proc_dointvec_minmax,
|
||||
.extra1 = SYSCTL_NEG_ONE,
|
||||
},
|
||||
{}
|
||||
};
|
||||
|
||||
static void __init hung_task_sysctl_init(void)
|
||||
{
|
||||
register_sysctl_init("kernel", hung_task_sysctls);
|
||||
}
|
||||
#else
|
||||
#define hung_task_sysctl_init() do { } while (0)
|
||||
#endif /* CONFIG_SYSCTL */
|
||||
|
||||
|
||||
static atomic_t reset_hung_task = ATOMIC_INIT(0);
|
||||
|
||||
void reset_hung_task_detector(void)
|
||||
@ -310,6 +384,7 @@ static int __init hung_task_init(void)
|
||||
pm_notifier(hungtask_pm_notify, 0);
|
||||
|
||||
watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");
|
||||
hung_task_sysctl_init();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
@ -97,9 +97,6 @@ config GENERIC_MSI_IRQ_DOMAIN
config IRQ_MSI_IOMMU
	bool

config HANDLE_DOMAIN_IRQ
	bool

config IRQ_TIMINGS
	bool

@ -144,3 +141,10 @@ config GENERIC_IRQ_MULTI_HANDLER
	bool
	help
	  Allow to specify the low level IRQ handler at run time.

# Cavium Octeon is the last system to use this deprecated option
# Do not even think of enabling this on any new platform
config DEPRECATED_IRQ_CPU_ONOFFLINE
	bool
	depends on CAVIUM_OCTEON_SOC
	default CAVIUM_OCTEON_SOC
|
||||
|
@ -575,8 +575,6 @@ EXPORT_SYMBOL_GPL(handle_simple_irq);
|
||||
*/
|
||||
void handle_untracked_irq(struct irq_desc *desc)
|
||||
{
|
||||
unsigned int flags = 0;
|
||||
|
||||
raw_spin_lock(&desc->lock);
|
||||
|
||||
if (!irq_may_run(desc))
|
||||
@ -593,7 +591,7 @@ void handle_untracked_irq(struct irq_desc *desc)
|
||||
irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
|
||||
raw_spin_unlock(&desc->lock);
|
||||
|
||||
__handle_irq_event_percpu(desc, &flags);
|
||||
__handle_irq_event_percpu(desc);
|
||||
|
||||
raw_spin_lock(&desc->lock);
|
||||
irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
|
||||
@ -1122,6 +1120,7 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(irq_modify_status);
|
||||
|
||||
#ifdef CONFIG_DEPRECATED_IRQ_CPU_ONOFFLINE
|
||||
/**
|
||||
* irq_cpu_online - Invoke all irq_cpu_online functions.
|
||||
*
|
||||
@ -1181,6 +1180,7 @@ void irq_cpu_offline(void)
|
||||
raw_spin_unlock_irqrestore(&desc->lock, flags);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY
|
||||
|
||||
|
@ -25,6 +25,7 @@ static DEFINE_RAW_SPINLOCK(gc_lock);
|
||||
void irq_gc_noop(struct irq_data *d)
|
||||
{
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(irq_gc_noop);
|
||||
|
||||
/**
|
||||
* irq_gc_mask_disable_reg - Mask chip via disable register
|
||||
@ -44,6 +45,7 @@ void irq_gc_mask_disable_reg(struct irq_data *d)
|
||||
*ct->mask_cache &= ~mask;
|
||||
irq_gc_unlock(gc);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(irq_gc_mask_disable_reg);
|
||||
|
||||
/**
|
||||
* irq_gc_mask_set_bit - Mask chip via setting bit in mask register
|
||||
@ -103,6 +105,7 @@ void irq_gc_unmask_enable_reg(struct irq_data *d)
|
||||
*ct->mask_cache |= mask;
|
||||
irq_gc_unlock(gc);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(irq_gc_unmask_enable_reg);
|
||||
|
||||
/**
|
||||
* irq_gc_ack_set_bit - Ack pending interrupt via setting bit
|
||||
@ -448,7 +451,7 @@ static void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq)
|
||||
|
||||
}
|
||||
|
||||
struct irq_domain_ops irq_generic_chip_ops = {
|
||||
const struct irq_domain_ops irq_generic_chip_ops = {
|
||||
.map = irq_map_generic_chip,
|
||||
.unmap = irq_unmap_generic_chip,
|
||||
.xlate = irq_domain_xlate_onetwocell,
|
||||
|
@ -14,6 +14,8 @@
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/kernel_stat.h>
|
||||
|
||||
#include <asm/irq_regs.h>
|
||||
|
||||
#include <trace/events/irq.h>
|
||||
|
||||
#include "internals.h"
|
||||
@ -134,7 +136,7 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
|
||||
wake_up_process(action->thread);
|
||||
}
|
||||
|
||||
irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags)
|
||||
irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc)
|
||||
{
|
||||
irqreturn_t retval = IRQ_NONE;
|
||||
unsigned int irq = desc->irq_data.irq;
|
||||
@ -172,10 +174,6 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags
|
||||
}
|
||||
|
||||
__irq_wake_thread(desc, action);
|
||||
|
||||
fallthrough; /* to add to randomness */
|
||||
case IRQ_HANDLED:
|
||||
*flags |= action->flags;
|
||||
break;
|
||||
|
||||
default:
|
||||
@ -191,11 +189,10 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags
|
||||
irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
|
||||
{
|
||||
irqreturn_t retval;
|
||||
unsigned int flags = 0;
|
||||
|
||||
retval = __handle_irq_event_percpu(desc, &flags);
|
||||
retval = __handle_irq_event_percpu(desc);
|
||||
|
||||
add_interrupt_randomness(desc->irq_data.irq, flags);
|
||||
add_interrupt_randomness(desc->irq_data.irq);
|
||||
|
||||
if (!irq_settings_no_debug(desc))
|
||||
note_interrupt(desc, retval);
|
||||
@ -226,4 +223,20 @@ int __init set_handle_irq(void (*handle_irq)(struct pt_regs *))
|
||||
handle_arch_irq = handle_irq;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* generic_handle_arch_irq - root irq handler for architectures which do no
|
||||
* entry accounting themselves
|
||||
* @regs: Register file coming from the low-level handling code
|
||||
*/
|
||||
asmlinkage void noinstr generic_handle_arch_irq(struct pt_regs *regs)
|
||||
{
|
||||
struct pt_regs *old_regs;
|
||||
|
||||
irq_enter();
|
||||
old_regs = set_irq_regs(regs);
|
||||
handle_arch_irq(regs);
|
||||
set_irq_regs(old_regs);
|
||||
irq_exit();
|
||||
}
|
||||
#endif
|
||||
|
@ -103,7 +103,7 @@ extern int __irq_get_irqchip_state(struct irq_data *data,
|
||||
|
||||
extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
|
||||
|
||||
irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags);
|
||||
irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc);
|
||||
irqreturn_t handle_irq_event_percpu(struct irq_desc *desc);
|
||||
irqreturn_t handle_irq_event(struct irq_desc *desc);
|
||||
|
||||
|
@ -646,12 +646,15 @@ int handle_irq_desc(struct irq_desc *desc)
|
||||
generic_handle_irq_desc(desc);
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(handle_irq_desc);
|
||||
|
||||
/**
|
||||
* generic_handle_irq - Invoke the handler for a particular irq
|
||||
* @irq: The irq number to handle
|
||||
*
|
||||
* Returns: 0 on success, or -EINVAL if conversion has failed
|
||||
*
|
||||
* This function must be called from an IRQ context with irq regs
|
||||
* initialized.
|
||||
*/
|
||||
int generic_handle_irq(unsigned int irq)
|
||||
{
|
||||
@ -662,89 +665,39 @@ EXPORT_SYMBOL_GPL(generic_handle_irq);
|
||||
#ifdef CONFIG_IRQ_DOMAIN
|
||||
/**
|
||||
* generic_handle_domain_irq - Invoke the handler for a HW irq belonging
|
||||
* to a domain, usually for a non-root interrupt
|
||||
* controller
|
||||
* to a domain.
|
||||
* @domain: The domain where to perform the lookup
|
||||
* @hwirq: The HW irq number to convert to a logical one
|
||||
*
|
||||
* Returns: 0 on success, or -EINVAL if conversion has failed
|
||||
*
|
||||
* This function must be called from an IRQ context with irq regs
|
||||
* initialized.
|
||||
*/
|
||||
int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq)
|
||||
{
|
||||
WARN_ON_ONCE(!in_irq());
|
||||
return handle_irq_desc(irq_resolve_mapping(domain, hwirq));
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(generic_handle_domain_irq);
|
||||
|
||||
#ifdef CONFIG_HANDLE_DOMAIN_IRQ
/**
 * handle_domain_irq - Invoke the handler for a HW irq belonging to a domain,
 * usually for a root interrupt controller
 * generic_handle_domain_nmi - Invoke the handler for a HW nmi belonging
 * to a domain.
 * @domain: The domain where to perform the lookup
 * @hwirq: The HW irq number to convert to a logical one
 * @regs: Register file coming from the low-level handling code
 *
 * Returns: 0 on success, or -EINVAL if conversion has failed
 */
int handle_domain_irq(struct irq_domain *domain,
unsigned int hwirq, struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);
struct irq_desc *desc;
int ret = 0;

irq_enter();

/* The irqdomain code provides boundary checks */
desc = irq_resolve_mapping(domain, hwirq);
if (likely(desc))
handle_irq_desc(desc);
else
ret = -EINVAL;

irq_exit();
set_irq_regs(old_regs);
return ret;
}

/**
 * handle_domain_nmi - Invoke the handler for a HW irq belonging to a domain
 * @domain: The domain where to perform the lookup
 * @hwirq: The HW irq number to convert to a logical one
 * @regs: Register file coming from the low-level handling code
 *
 * This function must be called from an NMI context.
 *
 * Returns: 0 on success, or -EINVAL if conversion has failed
 */
int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq,
struct pt_regs *regs)
 * This function must be called from an NMI context with irq regs
 * initialized.
 **/
int generic_handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq)
{
struct pt_regs *old_regs = set_irq_regs(regs);
struct irq_desc *desc;
int ret = 0;

/*
 * NMI context needs to be setup earlier in order to deal with tracing.
 */
WARN_ON(!in_nmi());

desc = irq_resolve_mapping(domain, hwirq);

/*
 * ack_bad_irq is not NMI-safe, just report
 * an invalid interrupt.
 */
if (likely(desc))
handle_irq_desc(desc);
else
ret = -EINVAL;

set_irq_regs(old_regs);
return ret;
WARN_ON_ONCE(!in_nmi());
return handle_irq_desc(irq_resolve_mapping(domain, hwirq));
}
#endif
#endif

/* Dynamic interrupt handling */
@ -744,9 +744,8 @@ static int irq_domain_translate(struct irq_domain *d,
return 0;
}

static void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
unsigned int count,
struct irq_fwspec *fwspec)
void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
unsigned int count, struct irq_fwspec *fwspec)
{
int i;

@ -756,6 +755,7 @@ static void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
for (i = 0; i < count; i++)
fwspec->param[i] = args[i];
}
EXPORT_SYMBOL_GPL(of_phandle_args_to_fwspec);

unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
{
@ -1502,6 +1502,7 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
irq_free_descs(virq, nr_irqs);
return ret;
}
EXPORT_SYMBOL_GPL(__irq_domain_alloc_irqs);

/* The irq_data was moved, fix the revmap to refer to the new location */
static void irq_domain_fix_revmap(struct irq_data *d)
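/*
 * Usage sketch for the of_phandle_args_to_fwspec() export above: an
 * irqchip that already parsed a struct of_phandle_args can convert it to
 * an irq_fwspec and create the mapping directly. "my_map_of_irq" is a
 * hypothetical helper, not part of this change.
 */
static unsigned int my_map_of_irq(struct of_phandle_args *oirq)
{
	struct irq_fwspec fwspec;

	of_phandle_args_to_fwspec(oirq->np, oirq->args, oirq->args_count,
				  &fwspec);
	return irq_create_fwspec_mapping(&fwspec);
}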
@ -486,7 +486,8 @@ int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask)
}
EXPORT_SYMBOL_GPL(irq_force_affinity);

int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m,
bool setaffinity)
{
unsigned long flags;
struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
@ -495,12 +496,11 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
return -EINVAL;
desc->affinity_hint = m;
irq_put_desc_unlock(desc, flags);
/* set the initial affinity to prevent every interrupt being on CPU0 */
if (m)
if (m && setaffinity)
__irq_set_affinity(irq, m, false);
return 0;
}
EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
EXPORT_SYMBOL_GPL(__irq_apply_affinity_hint);

static void irq_affinity_notify(struct work_struct *work)
{
@ -1259,6 +1259,8 @@ static int irq_thread(void *data)
irqreturn_t (*handler_fn)(struct irq_desc *desc,
struct irqaction *action);

sched_set_fifo(current);

if (force_irqthreads() && test_bit(IRQTF_FORCED_THREAD,
&action->thread_flags))
handler_fn = irq_forced_thread_fn;
@ -1424,8 +1426,6 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
if (IS_ERR(t))
return PTR_ERR(t);

sched_set_fifo(t);

/*
 * We keep the reference to the task struct even if
 * the thread dies to avoid that the interrupt code
@ -2827,7 +2827,7 @@ EXPORT_SYMBOL_GPL(irq_get_irqchip_state);
 * This call sets the internal irqchip state of an interrupt,
 * depending on the value of @which.
 *
 * This function should be called with preemption disabled if the
 * This function should be called with migration disabled if the
 * interrupt controller has per-cpu registers.
 */
int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,
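/*
 * Sketch of how the __irq_apply_affinity_hint() change above can back the
 * affinity-hint interfaces. Whether these exact inline wrappers land in
 * <linux/interrupt.h> is not shown by this hunk; they only illustrate the
 * intended split between "store the hint and also apply it" and "store
 * the hint only".
 */
static inline int irq_set_affinity_and_hint(unsigned int irq,
					    const struct cpumask *m)
{
	return __irq_apply_affinity_hint(irq, m, true);
}

static inline int irq_update_affinity_hint(unsigned int irq,
					   const struct cpumask *m)
{
	return __irq_apply_affinity_hint(irq, m, false);
}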
794
kernel/irq/msi.c
@ -14,12 +14,15 @@
|
||||
#include <linux/irqdomain.h>
|
||||
#include <linux/msi.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/sysfs.h>
|
||||
#include <linux/pci.h>
|
||||
|
||||
#include "internals.h"
|
||||
|
||||
static inline int msi_sysfs_create_group(struct device *dev);
|
||||
|
||||
/**
|
||||
* alloc_msi_entry - Allocate an initialized msi_desc
|
||||
* msi_alloc_desc - Allocate an initialized msi_desc
|
||||
* @dev: Pointer to the device for which this is allocated
|
||||
* @nvec: The number of vectors used in this entry
|
||||
* @affinity: Optional pointer to an affinity mask array size of @nvec
|
||||
@ -29,34 +32,134 @@
|
||||
*
|
||||
* Return: pointer to allocated &msi_desc on success or %NULL on failure
|
||||
*/
|
||||
struct msi_desc *alloc_msi_entry(struct device *dev, int nvec,
|
||||
static struct msi_desc *msi_alloc_desc(struct device *dev, int nvec,
|
||||
const struct irq_affinity_desc *affinity)
|
||||
{
|
||||
struct msi_desc *desc;
|
||||
struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL);
|
||||
|
||||
desc = kzalloc(sizeof(*desc), GFP_KERNEL);
|
||||
if (!desc)
|
||||
return NULL;
|
||||
|
||||
INIT_LIST_HEAD(&desc->list);
|
||||
desc->dev = dev;
|
||||
desc->nvec_used = nvec;
|
||||
if (affinity) {
|
||||
desc->affinity = kmemdup(affinity,
|
||||
nvec * sizeof(*desc->affinity), GFP_KERNEL);
|
||||
desc->affinity = kmemdup(affinity, nvec * sizeof(*desc->affinity), GFP_KERNEL);
|
||||
if (!desc->affinity) {
|
||||
kfree(desc);
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
return desc;
|
||||
}
|
||||
|
||||
void free_msi_entry(struct msi_desc *entry)
|
||||
static void msi_free_desc(struct msi_desc *desc)
|
||||
{
|
||||
kfree(entry->affinity);
|
||||
kfree(entry);
|
||||
kfree(desc->affinity);
|
||||
kfree(desc);
|
||||
}
|
||||
|
||||
static int msi_insert_desc(struct msi_device_data *md, struct msi_desc *desc, unsigned int index)
|
||||
{
|
||||
int ret;
|
||||
|
||||
desc->msi_index = index;
|
||||
ret = xa_insert(&md->__store, index, desc, GFP_KERNEL);
|
||||
if (ret)
|
||||
msi_free_desc(desc);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* msi_add_msi_desc - Allocate and initialize a MSI descriptor
|
||||
* @dev: Pointer to the device for which the descriptor is allocated
|
||||
* @init_desc: Pointer to an MSI descriptor to initialize the new descriptor
|
||||
*
|
||||
* Return: 0 on success or an appropriate failure code.
|
||||
*/
|
||||
int msi_add_msi_desc(struct device *dev, struct msi_desc *init_desc)
|
||||
{
|
||||
struct msi_desc *desc;
|
||||
|
||||
lockdep_assert_held(&dev->msi.data->mutex);
|
||||
|
||||
desc = msi_alloc_desc(dev, init_desc->nvec_used, init_desc->affinity);
|
||||
if (!desc)
|
||||
return -ENOMEM;
|
||||
|
||||
/* Copy type specific data to the new descriptor. */
|
||||
desc->pci = init_desc->pci;
|
||||
return msi_insert_desc(dev->msi.data, desc, init_desc->msi_index);
|
||||
}
|
||||
|
||||
/**
|
||||
* msi_add_simple_msi_descs - Allocate and initialize MSI descriptors
|
||||
* @dev: Pointer to the device for which the descriptors are allocated
|
||||
* @index: Index for the first MSI descriptor
|
||||
* @ndesc: Number of descriptors to allocate
|
||||
*
|
||||
* Return: 0 on success or an appropriate failure code.
|
||||
*/
|
||||
static int msi_add_simple_msi_descs(struct device *dev, unsigned int index, unsigned int ndesc)
|
||||
{
|
||||
unsigned int idx, last = index + ndesc - 1;
|
||||
struct msi_desc *desc;
|
||||
int ret;
|
||||
|
||||
lockdep_assert_held(&dev->msi.data->mutex);
|
||||
|
||||
for (idx = index; idx <= last; idx++) {
|
||||
desc = msi_alloc_desc(dev, 1, NULL);
|
||||
if (!desc)
|
||||
goto fail_mem;
|
||||
ret = msi_insert_desc(dev->msi.data, desc, idx);
|
||||
if (ret)
|
||||
goto fail;
|
||||
}
|
||||
return 0;
|
||||
|
||||
fail_mem:
|
||||
ret = -ENOMEM;
|
||||
fail:
|
||||
msi_free_msi_descs_range(dev, MSI_DESC_NOTASSOCIATED, index, last);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static bool msi_desc_match(struct msi_desc *desc, enum msi_desc_filter filter)
|
||||
{
|
||||
switch (filter) {
|
||||
case MSI_DESC_ALL:
|
||||
return true;
|
||||
case MSI_DESC_NOTASSOCIATED:
|
||||
return !desc->irq;
|
||||
case MSI_DESC_ASSOCIATED:
|
||||
return !!desc->irq;
|
||||
}
|
||||
WARN_ON_ONCE(1);
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* msi_free_msi_descs_range - Free MSI descriptors of a device
|
||||
* @dev: Device to free the descriptors
|
||||
* @filter: Descriptor state filter
|
||||
* @first_index: Index to start freeing from
|
||||
* @last_index: Last index to be freed
|
||||
*/
|
||||
void msi_free_msi_descs_range(struct device *dev, enum msi_desc_filter filter,
|
||||
unsigned int first_index, unsigned int last_index)
|
||||
{
|
||||
struct xarray *xa = &dev->msi.data->__store;
|
||||
struct msi_desc *desc;
|
||||
unsigned long idx;
|
||||
|
||||
lockdep_assert_held(&dev->msi.data->mutex);
|
||||
|
||||
xa_for_each_range(xa, idx, desc, first_index, last_index) {
|
||||
if (msi_desc_match(desc, filter)) {
|
||||
xa_erase(xa, idx);
|
||||
msi_free_desc(desc);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
|
||||
@ -72,138 +175,289 @@ void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(get_cached_msi_msg);
|
||||
|
||||
static void msi_device_data_release(struct device *dev, void *res)
|
||||
{
|
||||
struct msi_device_data *md = res;
|
||||
|
||||
WARN_ON_ONCE(!xa_empty(&md->__store));
|
||||
xa_destroy(&md->__store);
|
||||
dev->msi.data = NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* msi_setup_device_data - Setup MSI device data
|
||||
* @dev: Device for which MSI device data should be set up
|
||||
*
|
||||
* Return: 0 on success, appropriate error code otherwise
|
||||
*
|
||||
* This can be called more than once for @dev. If the MSI device data is
|
||||
* already allocated the call succeeds. The allocated memory is
|
||||
* automatically released when the device is destroyed.
|
||||
*/
|
||||
int msi_setup_device_data(struct device *dev)
|
||||
{
|
||||
struct msi_device_data *md;
|
||||
int ret;
|
||||
|
||||
if (dev->msi.data)
|
||||
return 0;
|
||||
|
||||
md = devres_alloc(msi_device_data_release, sizeof(*md), GFP_KERNEL);
|
||||
if (!md)
|
||||
return -ENOMEM;
|
||||
|
||||
ret = msi_sysfs_create_group(dev);
|
||||
if (ret) {
|
||||
devres_free(md);
|
||||
return ret;
|
||||
}
|
||||
|
||||
xa_init(&md->__store);
|
||||
mutex_init(&md->mutex);
|
||||
dev->msi.data = md;
|
||||
devres_add(dev, md);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* msi_lock_descs - Lock the MSI descriptor storage of a device
|
||||
* @dev: Device to operate on
|
||||
*/
|
||||
void msi_lock_descs(struct device *dev)
|
||||
{
|
||||
mutex_lock(&dev->msi.data->mutex);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(msi_lock_descs);
|
||||
|
||||
/**
|
||||
* msi_unlock_descs - Unlock the MSI descriptor storage of a device
|
||||
* @dev: Device to operate on
|
||||
*/
|
||||
void msi_unlock_descs(struct device *dev)
|
||||
{
|
||||
/* Invalidate the index which was cached by the iterator */
|
||||
dev->msi.data->__iter_idx = MSI_MAX_INDEX;
|
||||
mutex_unlock(&dev->msi.data->mutex);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(msi_unlock_descs);
|
||||
|
||||
static struct msi_desc *msi_find_desc(struct msi_device_data *md, enum msi_desc_filter filter)
|
||||
{
|
||||
struct msi_desc *desc;
|
||||
|
||||
xa_for_each_start(&md->__store, md->__iter_idx, desc, md->__iter_idx) {
|
||||
if (msi_desc_match(desc, filter))
|
||||
return desc;
|
||||
}
|
||||
md->__iter_idx = MSI_MAX_INDEX;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/**
|
||||
* msi_first_desc - Get the first MSI descriptor of a device
|
||||
* @dev: Device to operate on
|
||||
* @filter: Descriptor state filter
|
||||
*
|
||||
* Must be called with the MSI descriptor mutex held, i.e. msi_lock_descs()
|
||||
* must be invoked before the call.
|
||||
*
|
||||
* Return: Pointer to the first MSI descriptor matching the search
|
||||
* criteria, NULL if none found.
|
||||
*/
|
||||
struct msi_desc *msi_first_desc(struct device *dev, enum msi_desc_filter filter)
|
||||
{
|
||||
struct msi_device_data *md = dev->msi.data;
|
||||
|
||||
if (WARN_ON_ONCE(!md))
|
||||
return NULL;
|
||||
|
||||
lockdep_assert_held(&md->mutex);
|
||||
|
||||
md->__iter_idx = 0;
|
||||
return msi_find_desc(md, filter);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(msi_first_desc);
|
||||
|
||||
/**
|
||||
* msi_next_desc - Get the next MSI descriptor of a device
|
||||
* @dev: Device to operate on
|
||||
*
|
||||
 * The first invocation of msi_next_desc() has to be preceded by a
 * successful invocation of msi_first_desc(). Consecutive invocations are
|
||||
* only valid if the previous one was successful. All these operations have
|
||||
* to be done within the same MSI mutex held region.
|
||||
*
|
||||
* Return: Pointer to the next MSI descriptor matching the search
|
||||
* criteria, NULL if none found.
|
||||
*/
|
||||
struct msi_desc *msi_next_desc(struct device *dev, enum msi_desc_filter filter)
|
||||
{
|
||||
struct msi_device_data *md = dev->msi.data;
|
||||
|
||||
if (WARN_ON_ONCE(!md))
|
||||
return NULL;
|
||||
|
||||
lockdep_assert_held(&md->mutex);
|
||||
|
||||
if (md->__iter_idx >= (unsigned long)MSI_MAX_INDEX)
|
||||
return NULL;
|
||||
|
||||
md->__iter_idx++;
|
||||
return msi_find_desc(md, filter);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(msi_next_desc);
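/*
 * Illustration of the iterator pair above. msi_for_each_desc() is assumed
 * to be the <linux/msi.h> convenience macro wrapping msi_first_desc() and
 * msi_next_desc(); "my_dump_msi_descs" itself is a hypothetical debugging
 * helper, not part of this change.
 */
static void my_dump_msi_descs(struct device *dev)
{
	struct msi_desc *desc;

	msi_lock_descs(dev);
	msi_for_each_desc(desc, dev, MSI_DESC_ASSOCIATED)
		dev_info(dev, "MSI index %u -> Linux irq %u\n",
			 desc->msi_index, desc->irq);
	msi_unlock_descs(dev);
}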
|
||||
|
||||
/**
|
||||
* msi_get_virq - Return Linux interrupt number of a MSI interrupt
|
||||
* @dev: Device to operate on
|
||||
* @index: MSI interrupt index to look for (0-based)
|
||||
*
|
||||
* Return: The Linux interrupt number on success (> 0), 0 if not found
|
||||
*/
|
||||
unsigned int msi_get_virq(struct device *dev, unsigned int index)
|
||||
{
|
||||
struct msi_desc *desc;
|
||||
unsigned int ret = 0;
|
||||
bool pcimsi;
|
||||
|
||||
if (!dev->msi.data)
|
||||
return 0;
|
||||
|
||||
pcimsi = dev_is_pci(dev) ? to_pci_dev(dev)->msi_enabled : false;
|
||||
|
||||
msi_lock_descs(dev);
|
||||
desc = xa_load(&dev->msi.data->__store, pcimsi ? 0 : index);
|
||||
if (desc && desc->irq) {
|
||||
/*
|
||||
* PCI-MSI has only one descriptor for multiple interrupts.
|
||||
* PCI-MSIX and platform MSI use a descriptor per
|
||||
* interrupt.
|
||||
*/
|
||||
if (pcimsi) {
|
||||
if (index < desc->nvec_used)
|
||||
ret = desc->irq + index;
|
||||
} else {
|
||||
ret = desc->irq;
|
||||
}
|
||||
}
|
||||
msi_unlock_descs(dev);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(msi_get_virq);
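/*
 * Sketch of a consumer using msi_get_virq() instead of poking at MSI
 * descriptors directly. "my_irq_handler", "my_request_first_msi" and the
 * fixed index 0 are assumptions for the example only.
 */
static irqreturn_t my_irq_handler(int irq, void *cookie)
{
	return IRQ_HANDLED;
}

static int my_request_first_msi(struct device *dev, void *cookie)
{
	unsigned int virq = msi_get_virq(dev, 0);

	if (!virq)
		return -ENXIO;

	return request_irq(virq, my_irq_handler, 0, dev_name(dev), cookie);
}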
|
||||
|
||||
#ifdef CONFIG_SYSFS
|
||||
static struct attribute *msi_dev_attrs[] = {
|
||||
NULL
|
||||
};
|
||||
|
||||
static const struct attribute_group msi_irqs_group = {
|
||||
.name = "msi_irqs",
|
||||
.attrs = msi_dev_attrs,
|
||||
};
|
||||
|
||||
static inline int msi_sysfs_create_group(struct device *dev)
|
||||
{
|
||||
return devm_device_add_group(dev, &msi_irqs_group);
|
||||
}
|
||||
|
||||
static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr,
|
||||
char *buf)
|
||||
{
|
||||
struct msi_desc *entry;
|
||||
bool is_msix = false;
|
||||
unsigned long irq;
|
||||
int retval;
|
||||
|
||||
retval = kstrtoul(attr->attr.name, 10, &irq);
|
||||
if (retval)
|
||||
return retval;
|
||||
|
||||
entry = irq_get_msi_desc(irq);
|
||||
if (!entry)
|
||||
return -ENODEV;
|
||||
|
||||
if (dev_is_pci(dev))
|
||||
is_msix = entry->msi_attrib.is_msix;
|
||||
/* MSI vs. MSIX is per device not per interrupt */
|
||||
bool is_msix = dev_is_pci(dev) ? to_pci_dev(dev)->msix_enabled : false;
|
||||
|
||||
return sysfs_emit(buf, "%s\n", is_msix ? "msix" : "msi");
|
||||
}
|
||||
|
||||
/**
|
||||
* msi_populate_sysfs - Populate msi_irqs sysfs entries for devices
|
||||
* @dev: The device(PCI, platform etc) who will get sysfs entries
|
||||
*
|
||||
* Return attribute_group ** so that specific bus MSI can save it to
|
||||
* somewhere during initilizing msi irqs. If devices has no MSI irq,
|
||||
* return NULL; if it fails to populate sysfs, return ERR_PTR
|
||||
*/
|
||||
const struct attribute_group **msi_populate_sysfs(struct device *dev)
|
||||
static void msi_sysfs_remove_desc(struct device *dev, struct msi_desc *desc)
|
||||
{
|
||||
const struct attribute_group **msi_irq_groups;
|
||||
struct attribute **msi_attrs, *msi_attr;
|
||||
struct device_attribute *msi_dev_attr;
|
||||
struct attribute_group *msi_irq_group;
|
||||
struct msi_desc *entry;
|
||||
int ret = -ENOMEM;
|
||||
int num_msi = 0;
|
||||
int count = 0;
|
||||
struct device_attribute *attrs = desc->sysfs_attrs;
|
||||
int i;
|
||||
|
||||
/* Determine how many msi entries we have */
|
||||
for_each_msi_entry(entry, dev)
|
||||
num_msi += entry->nvec_used;
|
||||
if (!num_msi)
|
||||
return NULL;
|
||||
if (!attrs)
|
||||
return;
|
||||
|
||||
/* Dynamically create the MSI attributes for the device */
|
||||
msi_attrs = kcalloc(num_msi + 1, sizeof(void *), GFP_KERNEL);
|
||||
if (!msi_attrs)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
for_each_msi_entry(entry, dev) {
|
||||
for (i = 0; i < entry->nvec_used; i++) {
|
||||
msi_dev_attr = kzalloc(sizeof(*msi_dev_attr), GFP_KERNEL);
|
||||
if (!msi_dev_attr)
|
||||
goto error_attrs;
|
||||
msi_attrs[count] = &msi_dev_attr->attr;
|
||||
|
||||
sysfs_attr_init(&msi_dev_attr->attr);
|
||||
msi_dev_attr->attr.name = kasprintf(GFP_KERNEL, "%d",
|
||||
entry->irq + i);
|
||||
if (!msi_dev_attr->attr.name)
|
||||
goto error_attrs;
|
||||
msi_dev_attr->attr.mode = 0444;
|
||||
msi_dev_attr->show = msi_mode_show;
|
||||
++count;
|
||||
desc->sysfs_attrs = NULL;
|
||||
for (i = 0; i < desc->nvec_used; i++) {
|
||||
if (attrs[i].show)
|
||||
sysfs_remove_file_from_group(&dev->kobj, &attrs[i].attr, msi_irqs_group.name);
|
||||
kfree(attrs[i].attr.name);
|
||||
}
|
||||
kfree(attrs);
|
||||
}
|
||||
|
||||
msi_irq_group = kzalloc(sizeof(*msi_irq_group), GFP_KERNEL);
|
||||
if (!msi_irq_group)
|
||||
goto error_attrs;
|
||||
msi_irq_group->name = "msi_irqs";
|
||||
msi_irq_group->attrs = msi_attrs;
|
||||
static int msi_sysfs_populate_desc(struct device *dev, struct msi_desc *desc)
|
||||
{
|
||||
struct device_attribute *attrs;
|
||||
int ret, i;
|
||||
|
||||
msi_irq_groups = kcalloc(2, sizeof(void *), GFP_KERNEL);
|
||||
if (!msi_irq_groups)
|
||||
goto error_irq_group;
|
||||
msi_irq_groups[0] = msi_irq_group;
|
||||
attrs = kcalloc(desc->nvec_used, sizeof(*attrs), GFP_KERNEL);
|
||||
if (!attrs)
|
||||
return -ENOMEM;
|
||||
|
||||
ret = sysfs_create_groups(&dev->kobj, msi_irq_groups);
|
||||
desc->sysfs_attrs = attrs;
|
||||
for (i = 0; i < desc->nvec_used; i++) {
|
||||
sysfs_attr_init(&attrs[i].attr);
|
||||
attrs[i].attr.name = kasprintf(GFP_KERNEL, "%d", desc->irq + i);
|
||||
if (!attrs[i].attr.name) {
|
||||
ret = -ENOMEM;
|
||||
goto fail;
|
||||
}
|
||||
|
||||
attrs[i].attr.mode = 0444;
|
||||
attrs[i].show = msi_mode_show;
|
||||
|
||||
ret = sysfs_add_file_to_group(&dev->kobj, &attrs[i].attr, msi_irqs_group.name);
|
||||
if (ret) {
|
||||
attrs[i].show = NULL;
|
||||
goto fail;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
|
||||
fail:
|
||||
msi_sysfs_remove_desc(dev, desc);
|
||||
return ret;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_PCI_MSI_ARCH_FALLBACKS
|
||||
/**
|
||||
* msi_device_populate_sysfs - Populate msi_irqs sysfs entries for a device
|
||||
* @dev: The device (PCI, platform etc) which will get sysfs entries
|
||||
*/
|
||||
int msi_device_populate_sysfs(struct device *dev)
|
||||
{
|
||||
struct msi_desc *desc;
|
||||
int ret;
|
||||
|
||||
msi_for_each_desc(desc, dev, MSI_DESC_ASSOCIATED) {
|
||||
if (desc->sysfs_attrs)
|
||||
continue;
|
||||
ret = msi_sysfs_populate_desc(dev, desc);
|
||||
if (ret)
|
||||
goto error_irq_groups;
|
||||
|
||||
return msi_irq_groups;
|
||||
|
||||
error_irq_groups:
|
||||
kfree(msi_irq_groups);
|
||||
error_irq_group:
|
||||
kfree(msi_irq_group);
|
||||
error_attrs:
|
||||
count = 0;
|
||||
msi_attr = msi_attrs[count];
|
||||
while (msi_attr) {
|
||||
msi_dev_attr = container_of(msi_attr, struct device_attribute, attr);
|
||||
kfree(msi_attr->name);
|
||||
kfree(msi_dev_attr);
|
||||
++count;
|
||||
msi_attr = msi_attrs[count];
|
||||
return ret;
|
||||
}
|
||||
kfree(msi_attrs);
|
||||
return ERR_PTR(ret);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* msi_destroy_sysfs - Destroy msi_irqs sysfs entries for devices
|
||||
* @dev: The device(PCI, platform etc) who will remove sysfs entries
|
||||
* @msi_irq_groups: attribute_group for device msi_irqs entries
|
||||
* msi_device_destroy_sysfs - Destroy msi_irqs sysfs entries for a device
|
||||
* @dev: The device (PCI, platform etc) for which to remove
|
||||
* sysfs entries
|
||||
*/
|
||||
void msi_destroy_sysfs(struct device *dev, const struct attribute_group **msi_irq_groups)
|
||||
void msi_device_destroy_sysfs(struct device *dev)
|
||||
{
|
||||
struct device_attribute *dev_attr;
|
||||
struct attribute **msi_attrs;
|
||||
int count = 0;
|
||||
struct msi_desc *desc;
|
||||
|
||||
if (msi_irq_groups) {
|
||||
sysfs_remove_groups(&dev->kobj, msi_irq_groups);
|
||||
msi_attrs = msi_irq_groups[0]->attrs;
|
||||
while (msi_attrs[count]) {
|
||||
dev_attr = container_of(msi_attrs[count],
|
||||
struct device_attribute, attr);
|
||||
kfree(dev_attr->attr.name);
|
||||
kfree(dev_attr);
|
||||
++count;
|
||||
}
|
||||
kfree(msi_attrs);
|
||||
kfree(msi_irq_groups[0]);
|
||||
kfree(msi_irq_groups);
|
||||
}
|
||||
msi_for_each_desc(desc, dev, MSI_DESC_ALL)
|
||||
msi_sysfs_remove_desc(dev, desc);
|
||||
}
|
||||
#endif /* CONFIG_PCI_MSI_ARCH_FALLBACKS */
|
||||
#else /* CONFIG_SYSFS */
|
||||
static inline int msi_sysfs_create_group(struct device *dev) { return 0; }
|
||||
static inline int msi_sysfs_populate_desc(struct device *dev, struct msi_desc *desc) { return 0; }
|
||||
static inline void msi_sysfs_remove_desc(struct device *dev, struct msi_desc *desc) { }
|
||||
#endif /* !CONFIG_SYSFS */
|
||||
|
||||
#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
|
||||
static inline void irq_chip_write_msi_msg(struct irq_data *data,
|
||||
@ -456,43 +710,38 @@ int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
|
||||
}
|
||||
|
||||
int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
|
||||
int virq, int nvec, msi_alloc_info_t *arg)
|
||||
int virq_base, int nvec, msi_alloc_info_t *arg)
|
||||
{
|
||||
struct msi_domain_info *info = domain->host_data;
|
||||
struct msi_domain_ops *ops = info->ops;
|
||||
struct msi_desc *desc;
|
||||
int ret = 0;
|
||||
int ret, virq;
|
||||
|
||||
for_each_msi_entry(desc, dev) {
|
||||
/* Don't even try the multi-MSI brain damage. */
|
||||
if (WARN_ON(!desc->irq || desc->nvec_used != 1)) {
|
||||
ret = -EINVAL;
|
||||
break;
|
||||
}
|
||||
msi_lock_descs(dev);
|
||||
ret = msi_add_simple_msi_descs(dev, virq_base, nvec);
|
||||
if (ret)
|
||||
goto unlock;
|
||||
|
||||
if (!(desc->irq >= virq && desc->irq < (virq + nvec)))
|
||||
continue;
|
||||
for (virq = virq_base; virq < virq_base + nvec; virq++) {
|
||||
desc = xa_load(&dev->msi.data->__store, virq);
|
||||
desc->irq = virq;
|
||||
|
||||
ops->set_desc(arg, desc);
|
||||
/* Assumes the domain mutex is held! */
|
||||
ret = irq_domain_alloc_irqs_hierarchy(domain, desc->irq, 1,
|
||||
arg);
|
||||
ret = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg);
|
||||
if (ret)
|
||||
break;
|
||||
goto fail;
|
||||
|
||||
irq_set_msi_desc_off(desc->irq, 0, desc);
|
||||
}
|
||||
|
||||
if (ret) {
|
||||
/* Mop up the damage */
|
||||
for_each_msi_entry(desc, dev) {
|
||||
if (!(desc->irq >= virq && desc->irq < (virq + nvec)))
|
||||
continue;
|
||||
|
||||
irq_domain_free_irqs_common(domain, desc->irq, 1);
|
||||
}
|
||||
irq_set_msi_desc(virq, desc);
|
||||
}
|
||||
msi_unlock_descs(dev);
|
||||
return 0;
|
||||
|
||||
fail:
|
||||
for (--virq; virq >= virq_base; virq--)
|
||||
irq_domain_free_irqs_common(domain, virq, 1);
|
||||
msi_free_msi_descs_range(dev, MSI_DESC_ALL, virq_base, virq_base + nvec - 1);
|
||||
unlock:
|
||||
msi_unlock_descs(dev);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -531,8 +780,59 @@ static bool msi_check_reservation_mode(struct irq_domain *domain,
|
||||
* Checking the first MSI descriptor is sufficient. MSIX supports
|
||||
* masking and MSI does so when the can_mask attribute is set.
|
||||
*/
|
||||
desc = first_msi_entry(dev);
|
||||
return desc->msi_attrib.is_msix || desc->msi_attrib.can_mask;
|
||||
desc = msi_first_desc(dev, MSI_DESC_ALL);
|
||||
return desc->pci.msi_attrib.is_msix || desc->pci.msi_attrib.can_mask;
|
||||
}
|
||||
|
||||
static int msi_handle_pci_fail(struct irq_domain *domain, struct msi_desc *desc,
|
||||
int allocated)
|
||||
{
|
||||
switch(domain->bus_token) {
|
||||
case DOMAIN_BUS_PCI_MSI:
|
||||
case DOMAIN_BUS_VMD_MSI:
|
||||
if (IS_ENABLED(CONFIG_PCI_MSI))
|
||||
break;
|
||||
fallthrough;
|
||||
default:
|
||||
return -ENOSPC;
|
||||
}
|
||||
|
||||
/* Let a failed PCI multi MSI allocation retry */
|
||||
if (desc->nvec_used > 1)
|
||||
return 1;
|
||||
|
||||
/* If there was a successful allocation let the caller know */
|
||||
return allocated ? allocated : -ENOSPC;
|
||||
}
|
||||
|
||||
#define VIRQ_CAN_RESERVE 0x01
|
||||
#define VIRQ_ACTIVATE 0x02
|
||||
#define VIRQ_NOMASK_QUIRK 0x04
|
||||
|
||||
static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflags)
|
||||
{
|
||||
struct irq_data *irqd = irq_domain_get_irq_data(domain, virq);
|
||||
int ret;
|
||||
|
||||
if (!(vflags & VIRQ_CAN_RESERVE)) {
|
||||
irqd_clr_can_reserve(irqd);
|
||||
if (vflags & VIRQ_NOMASK_QUIRK)
|
||||
irqd_set_msi_nomask_quirk(irqd);
|
||||
}
|
||||
|
||||
if (!(vflags & VIRQ_ACTIVATE))
|
||||
return 0;
|
||||
|
||||
ret = irq_domain_activate_irq(irqd, vflags & VIRQ_CAN_RESERVE);
|
||||
if (ret)
|
||||
return ret;
|
||||
/*
|
||||
* If the interrupt uses reservation mode, clear the activated bit
|
||||
* so request_irq() will assign the final vector.
|
||||
*/
|
||||
if (vflags & VIRQ_CAN_RESERVE)
|
||||
irqd_clr_activated(irqd);
|
||||
return 0;
|
||||
}
|
||||
|
||||
int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
|
||||
@ -540,83 +840,103 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
|
||||
{
|
||||
struct msi_domain_info *info = domain->host_data;
|
||||
struct msi_domain_ops *ops = info->ops;
|
||||
struct irq_data *irq_data;
|
||||
struct msi_desc *desc;
|
||||
msi_alloc_info_t arg = { };
|
||||
unsigned int vflags = 0;
|
||||
struct msi_desc *desc;
|
||||
int allocated = 0;
|
||||
int i, ret, virq;
|
||||
bool can_reserve;
|
||||
|
||||
ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
for_each_msi_entry(desc, dev) {
|
||||
ops->set_desc(&arg, desc);
|
||||
|
||||
virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used,
|
||||
dev_to_node(dev), &arg, false,
|
||||
desc->affinity);
|
||||
if (virq < 0) {
|
||||
ret = -ENOSPC;
|
||||
if (ops->handle_error)
|
||||
ret = ops->handle_error(domain, desc, ret);
|
||||
if (ops->msi_finish)
|
||||
ops->msi_finish(&arg, ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
for (i = 0; i < desc->nvec_used; i++) {
|
||||
irq_set_msi_desc_off(virq, i, desc);
|
||||
irq_debugfs_copy_devname(virq + i, dev);
|
||||
}
|
||||
}
|
||||
|
||||
if (ops->msi_finish)
|
||||
ops->msi_finish(&arg, 0);
|
||||
|
||||
can_reserve = msi_check_reservation_mode(domain, info, dev);
|
||||
|
||||
/*
|
||||
* This flag is set by the PCI layer as we need to activate
|
||||
* the MSI entries before the PCI layer enables MSI in the
|
||||
* card. Otherwise the card latches a random msi message.
|
||||
*/
|
||||
if (!(info->flags & MSI_FLAG_ACTIVATE_EARLY))
|
||||
goto skip_activate;
|
||||
if (info->flags & MSI_FLAG_ACTIVATE_EARLY)
|
||||
vflags |= VIRQ_ACTIVATE;
|
||||
|
||||
for_each_msi_vector(desc, i, dev) {
|
||||
if (desc->irq == i) {
|
||||
virq = desc->irq;
|
||||
dev_dbg(dev, "irq [%d-%d] for MSI\n",
|
||||
virq, virq + desc->nvec_used - 1);
|
||||
}
|
||||
|
||||
irq_data = irq_domain_get_irq_data(domain, i);
|
||||
if (!can_reserve) {
|
||||
irqd_clr_can_reserve(irq_data);
|
||||
if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK)
|
||||
irqd_set_msi_nomask_quirk(irq_data);
|
||||
}
|
||||
ret = irq_domain_activate_irq(irq_data, can_reserve);
|
||||
if (ret)
|
||||
goto cleanup;
|
||||
}
|
||||
|
||||
skip_activate:
|
||||
/*
|
||||
* If these interrupts use reservation mode, clear the activated bit
|
||||
* so request_irq() will assign the final vector.
|
||||
* Interrupt can use a reserved vector and will not occupy
|
||||
* a real device vector until the interrupt is requested.
|
||||
*/
|
||||
if (can_reserve) {
|
||||
for_each_msi_vector(desc, i, dev) {
|
||||
irq_data = irq_domain_get_irq_data(domain, i);
|
||||
irqd_clr_activated(irq_data);
|
||||
if (msi_check_reservation_mode(domain, info, dev)) {
|
||||
vflags |= VIRQ_CAN_RESERVE;
|
||||
/*
|
||||
* MSI affinity setting requires a special quirk (X86) when
|
||||
* reservation mode is active.
|
||||
*/
|
||||
if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK)
|
||||
vflags |= VIRQ_NOMASK_QUIRK;
|
||||
}
|
||||
|
||||
msi_for_each_desc(desc, dev, MSI_DESC_NOTASSOCIATED) {
|
||||
ops->set_desc(&arg, desc);
|
||||
|
||||
virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used,
|
||||
dev_to_node(dev), &arg, false,
|
||||
desc->affinity);
|
||||
if (virq < 0)
|
||||
return msi_handle_pci_fail(domain, desc, allocated);
|
||||
|
||||
for (i = 0; i < desc->nvec_used; i++) {
|
||||
irq_set_msi_desc_off(virq, i, desc);
|
||||
irq_debugfs_copy_devname(virq + i, dev);
|
||||
ret = msi_init_virq(domain, virq + i, vflags);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
if (info->flags & MSI_FLAG_DEV_SYSFS) {
|
||||
ret = msi_sysfs_populate_desc(dev, desc);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
allocated++;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
cleanup:
|
||||
msi_domain_free_irqs(domain, dev);
|
||||
static int msi_domain_add_simple_msi_descs(struct msi_domain_info *info,
|
||||
struct device *dev,
|
||||
unsigned int num_descs)
|
||||
{
|
||||
if (!(info->flags & MSI_FLAG_ALLOC_SIMPLE_MSI_DESCS))
|
||||
return 0;
|
||||
|
||||
return msi_add_simple_msi_descs(dev, 0, num_descs);
|
||||
}
|
||||
|
||||
/**
|
||||
* msi_domain_alloc_irqs_descs_locked - Allocate interrupts from a MSI interrupt domain
|
||||
* @domain: The domain to allocate from
|
||||
* @dev: Pointer to device struct of the device for which the interrupts
|
||||
* are allocated
|
||||
* @nvec: The number of interrupts to allocate
|
||||
*
|
||||
* Must be invoked from within a msi_lock_descs() / msi_unlock_descs()
|
||||
* pair. Use this for MSI irqdomains which implement their own vector
|
||||
* allocation/free.
|
||||
*
|
||||
* Return: %0 on success or an error code.
|
||||
*/
|
||||
int msi_domain_alloc_irqs_descs_locked(struct irq_domain *domain, struct device *dev,
|
||||
int nvec)
|
||||
{
|
||||
struct msi_domain_info *info = domain->host_data;
|
||||
struct msi_domain_ops *ops = info->ops;
|
||||
int ret;
|
||||
|
||||
lockdep_assert_held(&dev->msi.data->mutex);
|
||||
|
||||
ret = msi_domain_add_simple_msi_descs(info, dev, nvec);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = ops->domain_alloc_irqs(domain, dev, nvec);
|
||||
if (ret)
|
||||
msi_domain_free_irqs_descs_locked(domain, dev);
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -629,38 +949,65 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
|
||||
*
|
||||
* Return: %0 on success or an error code.
|
||||
*/
|
||||
int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
|
||||
int nvec)
|
||||
int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, int nvec)
|
||||
{
|
||||
struct msi_domain_info *info = domain->host_data;
|
||||
struct msi_domain_ops *ops = info->ops;
|
||||
int ret;
|
||||
|
||||
return ops->domain_alloc_irqs(domain, dev, nvec);
|
||||
msi_lock_descs(dev);
|
||||
ret = msi_domain_alloc_irqs_descs_locked(domain, dev, nvec);
|
||||
msi_unlock_descs(dev);
|
||||
return ret;
|
||||
}
|
||||
|
||||
void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
|
||||
{
|
||||
struct irq_data *irq_data;
|
||||
struct msi_domain_info *info = domain->host_data;
|
||||
struct irq_data *irqd;
|
||||
struct msi_desc *desc;
|
||||
int i;
|
||||
|
||||
for_each_msi_vector(desc, i, dev) {
|
||||
irq_data = irq_domain_get_irq_data(domain, i);
|
||||
if (irqd_is_activated(irq_data))
|
||||
irq_domain_deactivate_irq(irq_data);
|
||||
/* Only handle MSI entries which have an interrupt associated */
|
||||
msi_for_each_desc(desc, dev, MSI_DESC_ASSOCIATED) {
|
||||
/* Make sure all interrupts are deactivated */
|
||||
for (i = 0; i < desc->nvec_used; i++) {
|
||||
irqd = irq_domain_get_irq_data(domain, desc->irq + i);
|
||||
if (irqd && irqd_is_activated(irqd))
|
||||
irq_domain_deactivate_irq(irqd);
|
||||
}
|
||||
|
||||
for_each_msi_entry(desc, dev) {
|
||||
/*
|
||||
* We might have failed to allocate an MSI early
|
||||
* enough that there is no IRQ associated to this
|
||||
* entry. If that's the case, don't do anything.
|
||||
*/
|
||||
if (desc->irq) {
|
||||
irq_domain_free_irqs(desc->irq, desc->nvec_used);
|
||||
if (info->flags & MSI_FLAG_DEV_SYSFS)
|
||||
msi_sysfs_remove_desc(dev, desc);
|
||||
desc->irq = 0;
|
||||
}
|
||||
}
|
||||
|
||||
static void msi_domain_free_msi_descs(struct msi_domain_info *info,
|
||||
struct device *dev)
|
||||
{
|
||||
if (info->flags & MSI_FLAG_FREE_MSI_DESCS)
|
||||
msi_free_msi_descs(dev);
|
||||
}
|
||||
|
||||
/**
|
||||
* msi_domain_free_irqs_descs_locked - Free interrupts from a MSI interrupt @domain associated to @dev
|
||||
* @domain: The domain to managing the interrupts
|
||||
* @dev: Pointer to device struct of the device for which the interrupts
|
||||
* are free
|
||||
*
|
||||
* Must be invoked from within a msi_lock_descs() / msi_unlock_descs()
|
||||
* pair. Use this for MSI irqdomains which implement their own vector
|
||||
* allocation.
|
||||
*/
|
||||
void msi_domain_free_irqs_descs_locked(struct irq_domain *domain, struct device *dev)
|
||||
{
|
||||
struct msi_domain_info *info = domain->host_data;
|
||||
struct msi_domain_ops *ops = info->ops;
|
||||
|
||||
lockdep_assert_held(&dev->msi.data->mutex);
|
||||
|
||||
ops->domain_free_irqs(domain, dev);
|
||||
msi_domain_free_msi_descs(info, dev);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -671,10 +1018,9 @@ void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
|
||||
*/
|
||||
void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
|
||||
{
|
||||
struct msi_domain_info *info = domain->host_data;
|
||||
struct msi_domain_ops *ops = info->ops;
|
||||
|
||||
return ops->domain_free_irqs(domain, dev);
|
||||
msi_lock_descs(dev);
|
||||
msi_domain_free_irqs_descs_locked(domain, dev);
|
||||
msi_unlock_descs(dev);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -447,6 +447,10 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");

static int __init irqfixup_setup(char *str)
{
if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
pr_warn("irqfixup boot option not supported with PREEMPT_RT\n");
return 1;
}
irqfixup = 1;
printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
printk(KERN_WARNING "This may impact system performance.\n");
@ -459,6 +463,10 @@ module_param(irqfixup, int, 0644);

static int __init irqpoll_setup(char *str)
{
if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
pr_warn("irqpoll boot option not supported with PREEMPT_RT\n");
return 1;
}
irqfixup = 2;
printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
"enabled\n");
@ -18,11 +18,36 @@
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/notifier.h>
|
||||
#include <linux/smp.h>
|
||||
#include <linux/smpboot.h>
|
||||
#include <asm/processor.h>
|
||||
#include <linux/kasan.h>
|
||||
|
||||
static DEFINE_PER_CPU(struct llist_head, raised_list);
|
||||
static DEFINE_PER_CPU(struct llist_head, lazy_list);
|
||||
static DEFINE_PER_CPU(struct task_struct *, irq_workd);
|
||||
|
||||
static void wake_irq_workd(void)
|
||||
{
|
||||
struct task_struct *tsk = __this_cpu_read(irq_workd);
|
||||
|
||||
if (!llist_empty(this_cpu_ptr(&lazy_list)) && tsk)
|
||||
wake_up_process(tsk);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
static void irq_work_wake(struct irq_work *entry)
|
||||
{
|
||||
wake_irq_workd();
|
||||
}
|
||||
|
||||
static DEFINE_PER_CPU(struct irq_work, irq_work_wakeup) =
|
||||
IRQ_WORK_INIT_HARD(irq_work_wake);
|
||||
#endif
|
||||
|
||||
static int irq_workd_should_run(unsigned int cpu)
|
||||
{
|
||||
return !llist_empty(this_cpu_ptr(&lazy_list));
|
||||
}
|
||||
|
||||
/*
|
||||
* Claim the entry so that no one else will poke at it.
|
||||
@ -52,15 +77,29 @@ void __weak arch_irq_work_raise(void)
|
||||
/* Enqueue on current CPU, work must already be claimed and preempt disabled */
|
||||
static void __irq_work_queue_local(struct irq_work *work)
|
||||
{
|
||||
struct llist_head *list;
|
||||
bool rt_lazy_work = false;
|
||||
bool lazy_work = false;
|
||||
int work_flags;
|
||||
|
||||
work_flags = atomic_read(&work->node.a_flags);
|
||||
if (work_flags & IRQ_WORK_LAZY)
|
||||
lazy_work = true;
|
||||
else if (IS_ENABLED(CONFIG_PREEMPT_RT) &&
|
||||
!(work_flags & IRQ_WORK_HARD_IRQ))
|
||||
rt_lazy_work = true;
|
||||
|
||||
if (lazy_work || rt_lazy_work)
|
||||
list = this_cpu_ptr(&lazy_list);
|
||||
else
|
||||
list = this_cpu_ptr(&raised_list);
|
||||
|
||||
if (!llist_add(&work->node.llist, list))
|
||||
return;
|
||||
|
||||
/* If the work is "lazy", handle it from next tick if any */
|
||||
if (atomic_read(&work->node.a_flags) & IRQ_WORK_LAZY) {
|
||||
if (llist_add(&work->node.llist, this_cpu_ptr(&lazy_list)) &&
|
||||
tick_nohz_tick_stopped())
|
||||
if (!lazy_work || tick_nohz_tick_stopped())
|
||||
arch_irq_work_raise();
|
||||
} else {
|
||||
if (llist_add(&work->node.llist, this_cpu_ptr(&raised_list)))
|
||||
arch_irq_work_raise();
|
||||
}
|
||||
}
|
||||
|
||||
/* Enqueue the irq work @work on the current CPU */
|
||||
@ -104,17 +143,34 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
|
||||
if (cpu != smp_processor_id()) {
|
||||
/* Arch remote IPI send/receive backend aren't NMI safe */
|
||||
WARN_ON_ONCE(in_nmi());
|
||||
|
||||
/*
|
||||
* On PREEMPT_RT the items which are not marked as
|
||||
* IRQ_WORK_HARD_IRQ are added to the lazy list and a HARD work
|
||||
* item is used on the remote CPU to wake the thread.
|
||||
*/
|
||||
if (IS_ENABLED(CONFIG_PREEMPT_RT) &&
|
||||
!(atomic_read(&work->node.a_flags) & IRQ_WORK_HARD_IRQ)) {
|
||||
|
||||
if (!llist_add(&work->node.llist, &per_cpu(lazy_list, cpu)))
|
||||
goto out;
|
||||
|
||||
work = &per_cpu(irq_work_wakeup, cpu);
|
||||
if (!irq_work_claim(work))
|
||||
goto out;
|
||||
}
|
||||
|
||||
__smp_call_single_queue(cpu, &work->node.llist);
|
||||
} else {
|
||||
__irq_work_queue_local(work);
|
||||
}
|
||||
out:
|
||||
preempt_enable();
|
||||
|
||||
return true;
|
||||
#endif /* CONFIG_SMP */
|
||||
}
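/*
 * Illustration of why IRQ_WORK_HARD_IRQ matters with the PREEMPT_RT
 * changes above: unmarked items are now run from the per-CPU irq_work
 * thread, so callers that genuinely need hardirq context must mark the
 * item. "my_hard_work_fn"/"my_kick_remote" are hypothetical and only
 * sketch the pattern.
 */
static void my_hard_work_fn(struct irq_work *work)
{
	/* Runs in hard interrupt context, even on PREEMPT_RT */
}

static DEFINE_PER_CPU(struct irq_work, my_hard_work) =
	IRQ_WORK_INIT_HARD(my_hard_work_fn);

static void my_kick_remote(int cpu)
{
	irq_work_queue_on(per_cpu_ptr(&my_hard_work, cpu), cpu);
}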
|
||||
|
||||
|
||||
bool irq_work_needs_cpu(void)
|
||||
{
|
||||
struct llist_head *raised, *lazy;
|
||||
@ -160,6 +216,10 @@ void irq_work_single(void *arg)
|
||||
* else claimed it meanwhile.
|
||||
*/
|
||||
(void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY);
|
||||
|
||||
if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) ||
|
||||
!arch_irq_work_has_interrupt())
|
||||
rcuwait_wake_up(&work->irqwait);
|
||||
}
|
||||
|
||||
static void irq_work_run_list(struct llist_head *list)
|
||||
@ -167,7 +227,12 @@ static void irq_work_run_list(struct llist_head *list)
|
||||
struct irq_work *work, *tmp;
|
||||
struct llist_node *llnode;
|
||||
|
||||
BUG_ON(!irqs_disabled());
|
||||
/*
|
||||
* On PREEMPT_RT IRQ-work which is not marked as HARD will be processed
|
||||
* in a per-CPU thread in preemptible context. Only the items which are
|
||||
* marked as IRQ_WORK_HARD_IRQ will be processed in hardirq context.
|
||||
*/
|
||||
BUG_ON(!irqs_disabled() && !IS_ENABLED(CONFIG_PREEMPT_RT));
|
||||
|
||||
if (llist_empty(list))
|
||||
return;
|
||||
@ -184,7 +249,10 @@ static void irq_work_run_list(struct llist_head *list)
|
||||
void irq_work_run(void)
|
||||
{
|
||||
irq_work_run_list(this_cpu_ptr(&raised_list));
|
||||
if (!IS_ENABLED(CONFIG_PREEMPT_RT))
|
||||
irq_work_run_list(this_cpu_ptr(&lazy_list));
|
||||
else
|
||||
wake_irq_workd();
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(irq_work_run);
|
||||
|
||||
@ -194,7 +262,11 @@ void irq_work_tick(void)
|
||||
|
||||
if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
|
||||
irq_work_run_list(raised);
|
||||
|
||||
if (!IS_ENABLED(CONFIG_PREEMPT_RT))
|
||||
irq_work_run_list(this_cpu_ptr(&lazy_list));
|
||||
else
|
||||
wake_irq_workd();
|
||||
}
|
||||
|
||||
/*
|
||||
@ -204,8 +276,42 @@ void irq_work_tick(void)
|
||||
void irq_work_sync(struct irq_work *work)
|
||||
{
|
||||
lockdep_assert_irqs_enabled();
|
||||
might_sleep();
|
||||
|
||||
if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) ||
|
||||
!arch_irq_work_has_interrupt()) {
|
||||
rcuwait_wait_event(&work->irqwait, !irq_work_is_busy(work),
|
||||
TASK_UNINTERRUPTIBLE);
|
||||
return;
|
||||
}
|
||||
|
||||
while (irq_work_is_busy(work))
|
||||
cpu_relax();
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(irq_work_sync);
|
||||
|
||||
static void run_irq_workd(unsigned int cpu)
|
||||
{
|
||||
irq_work_run_list(this_cpu_ptr(&lazy_list));
|
||||
}
|
||||
|
||||
static void irq_workd_setup(unsigned int cpu)
|
||||
{
|
||||
sched_set_fifo_low(current);
|
||||
}
|
||||
|
||||
static struct smp_hotplug_thread irqwork_threads = {
|
||||
.store = &irq_workd,
|
||||
.setup = irq_workd_setup,
|
||||
.thread_should_run = irq_workd_should_run,
|
||||
.thread_fn = run_irq_workd,
|
||||
.thread_comm = "irq_work/%u",
|
||||
};
|
||||
|
||||
static __init int irq_work_init_threads(void)
|
||||
{
|
||||
if (IS_ENABLED(CONFIG_PREEMPT_RT))
|
||||
BUG_ON(smpboot_register_percpu_thread(&irqwork_threads));
|
||||
return 0;
|
||||
}
|
||||
early_initcall(irq_work_init_threads);
|
||||
|
@ -164,26 +164,46 @@ static unsigned long kallsyms_sym_address(int idx)
|
||||
return kallsyms_relative_base - 1 - kallsyms_offsets[idx];
|
||||
}
|
||||
|
||||
#if defined(CONFIG_CFI_CLANG) && defined(CONFIG_LTO_CLANG_THIN)
|
||||
/*
|
||||
* LLVM appends a hash to static function names when ThinLTO and CFI are
|
||||
* both enabled, i.e. foo() becomes foo$707af9a22804d33c81801f27dcfe489b.
|
||||
* This causes confusion and potentially breaks user space tools, so we
|
||||
* strip the suffix from expanded symbol names.
|
||||
*/
|
||||
static inline bool cleanup_symbol_name(char *s)
|
||||
static bool cleanup_symbol_name(char *s)
|
||||
{
|
||||
char *res;
|
||||
|
||||
res = strrchr(s, '$');
|
||||
if (res)
|
||||
*res = '\0';
|
||||
if (!IS_ENABLED(CONFIG_LTO_CLANG))
|
||||
return false;
|
||||
|
||||
return res != NULL;
|
||||
/*
|
||||
* LLVM appends various suffixes for local functions and variables that
|
||||
* must be promoted to global scope as part of LTO. This can break
|
||||
* hooking of static functions with kprobes. '.' is not a valid
|
||||
* character in an identifier in C. Suffixes observed:
|
||||
* - foo.llvm.[0-9a-f]+
|
||||
* - foo.[0-9a-f]+
|
||||
* - foo.[0-9a-f]+.cfi_jt
|
||||
*/
|
||||
res = strchr(s, '.');
|
||||
if (res) {
|
||||
*res = '\0';
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!IS_ENABLED(CONFIG_CFI_CLANG) ||
|
||||
!IS_ENABLED(CONFIG_LTO_CLANG_THIN) ||
|
||||
CONFIG_CLANG_VERSION >= 130000)
|
||||
return false;
|
||||
|
||||
/*
|
||||
* Prior to LLVM 13, the following suffixes were observed when thinLTO
|
||||
* and CFI are both enabled:
|
||||
* - foo$[0-9]+
|
||||
*/
|
||||
res = strrchr(s, '$');
|
||||
if (res) {
|
||||
*res = '\0';
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
#else
|
||||
static inline bool cleanup_symbol_name(char *s) { return false; }
|
||||
#endif
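/*
 * Worked examples of the suffix stripping above (illustrative only):
 *
 *   "foo.llvm.123456"  -> "foo"   (LTO_CLANG promoted-static suffix)
 *   "foo.cfi_jt"       -> "foo"   (first '.' and everything after it)
 *   "foo$123"          -> "foo"   (pre-LLVM-13 ThinLTO+CFI case only)
 *   "foo"              -> unchanged, cleanup_symbol_name() returns false
 */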
|
||||
|
||||
/* Lookup the address for this symbol. Returns 0 if not found. */
|
||||
unsigned long kallsyms_lookup_name(const char *name)
|
||||
@ -223,6 +243,7 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
|
||||
ret = fn(data, namebuf, NULL, kallsyms_sym_address(i));
|
||||
if (ret != 0)
|
||||
return ret;
|
||||
cond_resched();
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
@ -88,6 +88,7 @@ static struct list_head kcov_remote_areas = LIST_HEAD_INIT(kcov_remote_areas);
|
||||
|
||||
struct kcov_percpu_data {
|
||||
void *irq_area;
|
||||
local_lock_t lock;
|
||||
|
||||
unsigned int saved_mode;
|
||||
unsigned int saved_size;
|
||||
@ -96,7 +97,9 @@ struct kcov_percpu_data {
|
||||
int saved_sequence;
|
||||
};
|
||||
|
||||
static DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data);
|
||||
static DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data) = {
|
||||
.lock = INIT_LOCAL_LOCK(lock),
|
||||
};
|
||||
|
||||
/* Must be called with kcov_remote_lock locked. */
|
||||
static struct kcov_remote *kcov_remote_find(u64 handle)
|
||||
@ -824,7 +827,7 @@ void kcov_remote_start(u64 handle)
|
||||
if (!in_task() && !in_serving_softirq())
|
||||
return;
|
||||
|
||||
local_irq_save(flags);
|
||||
local_lock_irqsave(&kcov_percpu_data.lock, flags);
|
||||
|
||||
/*
|
||||
* Check that kcov_remote_start() is not called twice in background
|
||||
@ -832,7 +835,7 @@ void kcov_remote_start(u64 handle)
|
||||
*/
|
||||
mode = READ_ONCE(t->kcov_mode);
|
||||
if (WARN_ON(in_task() && kcov_mode_enabled(mode))) {
|
||||
local_irq_restore(flags);
|
||||
local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
|
||||
return;
|
||||
}
|
||||
/*
|
||||
@ -841,14 +844,15 @@ void kcov_remote_start(u64 handle)
|
||||
* happened while collecting coverage from a background thread.
|
||||
*/
|
||||
if (WARN_ON(in_serving_softirq() && t->kcov_softirq)) {
|
||||
local_irq_restore(flags);
|
||||
local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
|
||||
return;
|
||||
}
|
||||
|
||||
spin_lock(&kcov_remote_lock);
|
||||
remote = kcov_remote_find(handle);
|
||||
if (!remote) {
|
||||
spin_unlock_irqrestore(&kcov_remote_lock, flags);
|
||||
spin_unlock(&kcov_remote_lock);
|
||||
local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
|
||||
return;
|
||||
}
|
||||
kcov_debug("handle = %llx, context: %s\n", handle,
|
||||
@ -869,19 +873,19 @@ void kcov_remote_start(u64 handle)
|
||||
size = CONFIG_KCOV_IRQ_AREA_SIZE;
|
||||
area = this_cpu_ptr(&kcov_percpu_data)->irq_area;
|
||||
}
|
||||
spin_unlock_irqrestore(&kcov_remote_lock, flags);
|
||||
spin_unlock(&kcov_remote_lock);
|
||||
|
||||
/* Can only happen when in_task(). */
|
||||
if (!area) {
|
||||
local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
|
||||
area = vmalloc(size * sizeof(unsigned long));
|
||||
if (!area) {
|
||||
kcov_put(kcov);
|
||||
return;
|
||||
}
|
||||
local_lock_irqsave(&kcov_percpu_data.lock, flags);
|
||||
}
|
||||
|
||||
local_irq_save(flags);
|
||||
|
||||
/* Reset coverage size. */
|
||||
*(u64 *)area = 0;
|
||||
|
||||
@ -891,7 +895,7 @@ void kcov_remote_start(u64 handle)
|
||||
}
|
||||
kcov_start(t, kcov, size, area, mode, sequence);
|
||||
|
||||
local_irq_restore(flags);
|
||||
local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
|
||||
|
||||
}
|
||||
EXPORT_SYMBOL(kcov_remote_start);
|
||||
@ -965,12 +969,12 @@ void kcov_remote_stop(void)
|
||||
if (!in_task() && !in_serving_softirq())
|
||||
return;
|
||||
|
||||
local_irq_save(flags);
|
||||
local_lock_irqsave(&kcov_percpu_data.lock, flags);
|
||||
|
||||
mode = READ_ONCE(t->kcov_mode);
|
||||
barrier();
|
||||
if (!kcov_mode_enabled(mode)) {
|
||||
local_irq_restore(flags);
|
||||
local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
|
||||
return;
|
||||
}
|
||||
/*
|
||||
@ -978,12 +982,12 @@ void kcov_remote_stop(void)
|
||||
* actually found the remote handle and started collecting coverage.
|
||||
*/
|
||||
if (in_serving_softirq() && !t->kcov_softirq) {
|
||||
local_irq_restore(flags);
|
||||
local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
|
||||
return;
|
||||
}
|
||||
/* Make sure that kcov_softirq is only set when in softirq. */
|
||||
if (WARN_ON(!in_serving_softirq() && t->kcov_softirq)) {
|
||||
local_irq_restore(flags);
|
||||
local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -1013,7 +1017,7 @@ void kcov_remote_stop(void)
|
||||
spin_unlock(&kcov_remote_lock);
|
||||
}
|
||||
|
||||
local_irq_restore(flags);
|
||||
local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
|
||||
|
||||
/* Get in kcov_remote_start(). */
|
||||
kcov_put(kcov);
|
||||
@ -1034,8 +1038,8 @@ static int __init kcov_init(void)
|
||||
int cpu;
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
void *area = vmalloc(CONFIG_KCOV_IRQ_AREA_SIZE *
|
||||
sizeof(unsigned long));
|
||||
void *area = vmalloc_node(CONFIG_KCOV_IRQ_AREA_SIZE *
|
||||
sizeof(unsigned long), cpu_to_node(cpu));
|
||||
if (!area)
|
||||
return -ENOMEM;
|
||||
per_cpu_ptr(&kcov_percpu_data, cpu)->irq_area = area;
|
||||
|
@ -8,9 +8,12 @@ CFLAGS_REMOVE_debugfs.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE)

CFLAGS_core.o := $(call cc-option,-fno-conserve-stack) \
$(call cc-option,-mno-outline-atomics) \
-fno-stack-protector -DDISABLE_BRANCH_PROFILING

obj-y := core.o debugfs.o report.o

KCSAN_INSTRUMENT_BARRIERS_selftest.o := y
obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o

CFLAGS_kcsan_test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer
@ -40,15 +40,17 @@ module_param_named(udelay_interrupt, kcsan_udelay_interrupt, uint, 0644);
|
||||
module_param_named(skip_watch, kcsan_skip_watch, long, 0644);
|
||||
module_param_named(interrupt_watcher, kcsan_interrupt_watcher, bool, 0444);
|
||||
|
||||
#ifdef CONFIG_KCSAN_WEAK_MEMORY
|
||||
static bool kcsan_weak_memory = true;
|
||||
module_param_named(weak_memory, kcsan_weak_memory, bool, 0644);
|
||||
#else
|
||||
#define kcsan_weak_memory false
|
||||
#endif
|
||||
|
||||
bool kcsan_enabled;
|
||||
|
||||
/* Per-CPU kcsan_ctx for interrupts */
|
||||
static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = {
|
||||
.disable_count = 0,
|
||||
.atomic_next = 0,
|
||||
.atomic_nest_count = 0,
|
||||
.in_flat_atomic = false,
|
||||
.access_mask = 0,
|
||||
.scoped_accesses = {LIST_POISON1, NULL},
|
||||
};
|
||||
|
||||
@ -202,22 +204,29 @@ static __always_inline struct kcsan_ctx *get_ctx(void)
|
||||
return in_task() ? ¤t->kcsan_ctx : raw_cpu_ptr(&kcsan_cpu_ctx);
|
||||
}
|
||||
|
||||
static __always_inline void
|
||||
check_access(const volatile void *ptr, size_t size, int type, unsigned long ip);
|
||||
|
||||
/* Check scoped accesses; never inline because this is a slow-path! */
|
||||
static noinline void kcsan_check_scoped_accesses(void)
|
||||
{
|
||||
struct kcsan_ctx *ctx = get_ctx();
|
||||
struct list_head *prev_save = ctx->scoped_accesses.prev;
|
||||
struct kcsan_scoped_access *scoped_access;
|
||||
|
||||
ctx->scoped_accesses.prev = NULL; /* Avoid recursion. */
|
||||
list_for_each_entry(scoped_access, &ctx->scoped_accesses, list)
|
||||
__kcsan_check_access(scoped_access->ptr, scoped_access->size, scoped_access->type);
|
||||
ctx->scoped_accesses.prev = prev_save;
|
||||
if (ctx->disable_scoped)
|
||||
return;
|
||||
|
||||
ctx->disable_scoped++;
|
||||
list_for_each_entry(scoped_access, &ctx->scoped_accesses, list) {
|
||||
check_access(scoped_access->ptr, scoped_access->size,
|
||||
scoped_access->type, scoped_access->ip);
|
||||
}
|
||||
ctx->disable_scoped--;
|
||||
}
|
||||
|
||||
/* Rules for generic atomic accesses. Called from fast-path. */
|
||||
static __always_inline bool
|
||||
is_atomic(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *ctx)
|
||||
is_atomic(struct kcsan_ctx *ctx, const volatile void *ptr, size_t size, int type)
|
||||
{
|
||||
if (type & KCSAN_ACCESS_ATOMIC)
|
||||
return true;
|
||||
@ -254,7 +263,7 @@ is_atomic(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *ctx
|
||||
}
|
||||
|
||||
static __always_inline bool
|
||||
should_watch(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *ctx)
|
||||
should_watch(struct kcsan_ctx *ctx, const volatile void *ptr, size_t size, int type)
|
||||
{
|
||||
/*
|
||||
* Never set up watchpoints when memory operations are atomic.
|
||||
@ -263,7 +272,7 @@ should_watch(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *
|
||||
* should not count towards skipped instructions, and (2) to actually
|
||||
* decrement kcsan_atomic_next for consecutive instruction stream.
|
||||
*/
|
||||
if (is_atomic(ptr, size, type, ctx))
|
||||
if (is_atomic(ctx, ptr, size, type))
|
||||
return false;
|
||||
|
||||
if (this_cpu_dec_return(kcsan_skip) >= 0)
|
||||
@ -320,6 +329,21 @@ static void delay_access(int type)
|
||||
udelay(delay);
|
||||
}
|
||||
|
||||
/*
|
||||
* Reads the instrumented memory for value change detection; value change
|
||||
* detection is currently done for accesses up to a size of 8 bytes.
|
||||
*/
|
||||
static __always_inline u64 read_instrumented_memory(const volatile void *ptr, size_t size)
|
||||
{
|
||||
switch (size) {
|
||||
case 1: return READ_ONCE(*(const u8 *)ptr);
|
||||
case 2: return READ_ONCE(*(const u16 *)ptr);
|
||||
case 4: return READ_ONCE(*(const u32 *)ptr);
|
||||
case 8: return READ_ONCE(*(const u64 *)ptr);
|
||||
default: return 0; /* Ignore; we do not diff the values. */
|
||||
}
|
||||
}
|
||||
|
||||
void kcsan_save_irqtrace(struct task_struct *task)
|
||||
{
|
||||
#ifdef CONFIG_TRACE_IRQFLAGS
|
||||
@ -334,6 +358,76 @@ void kcsan_restore_irqtrace(struct task_struct *task)
|
||||
#endif
|
||||
}
|
||||
|
||||
static __always_inline int get_kcsan_stack_depth(void)
|
||||
{
|
||||
#ifdef CONFIG_KCSAN_WEAK_MEMORY
|
||||
return current->kcsan_stack_depth;
|
||||
#else
|
||||
BUILD_BUG();
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
static __always_inline void add_kcsan_stack_depth(int val)
|
||||
{
|
||||
#ifdef CONFIG_KCSAN_WEAK_MEMORY
|
||||
current->kcsan_stack_depth += val;
|
||||
#else
|
||||
BUILD_BUG();
|
||||
#endif
|
||||
}
|
||||
|
||||
static __always_inline struct kcsan_scoped_access *get_reorder_access(struct kcsan_ctx *ctx)
|
||||
{
|
||||
#ifdef CONFIG_KCSAN_WEAK_MEMORY
|
||||
return ctx->disable_scoped ? NULL : &ctx->reorder_access;
|
||||
#else
|
||||
return NULL;
|
||||
#endif
|
||||
}
|
||||
|
||||
static __always_inline bool
|
||||
find_reorder_access(struct kcsan_ctx *ctx, const volatile void *ptr, size_t size,
|
||||
int type, unsigned long ip)
|
||||
{
|
||||
struct kcsan_scoped_access *reorder_access = get_reorder_access(ctx);
|
||||
|
||||
if (!reorder_access)
|
||||
return false;
|
||||
|
||||
/*
|
||||
* Note: If accesses are repeated while reorder_access is identical,
|
||||
* never matches the new access, because !(type & KCSAN_ACCESS_SCOPED).
|
||||
*/
|
||||
return reorder_access->ptr == ptr && reorder_access->size == size &&
|
||||
reorder_access->type == type && reorder_access->ip == ip;
|
||||
}
|
||||
|
||||
static inline void
|
||||
set_reorder_access(struct kcsan_ctx *ctx, const volatile void *ptr, size_t size,
|
||||
int type, unsigned long ip)
|
||||
{
|
||||
struct kcsan_scoped_access *reorder_access = get_reorder_access(ctx);
|
||||
|
||||
if (!reorder_access || !kcsan_weak_memory)
|
||||
return;
|
||||
|
||||
/*
|
||||
* To avoid nested interrupts or scheduler (which share kcsan_ctx)
|
||||
* reading an inconsistent reorder_access, ensure that the below has
|
||||
* exclusive access to reorder_access by disallowing concurrent use.
|
||||
*/
|
||||
ctx->disable_scoped++;
|
||||
barrier();
|
||||
reorder_access->ptr = ptr;
|
||||
reorder_access->size = size;
|
||||
reorder_access->type = type | KCSAN_ACCESS_SCOPED;
|
||||
reorder_access->ip = ip;
|
||||
reorder_access->stack_depth = get_kcsan_stack_depth();
|
||||
barrier();
|
||||
ctx->disable_scoped--;
|
||||
}
|
||||
|
||||
/*
 * Pull everything together: check_access() below contains the performance
 * critical operations; the fast-path (including check_access) functions should
@ -350,6 +444,7 @@ void kcsan_restore_irqtrace(struct task_struct *task)
static noinline void kcsan_found_watchpoint(const volatile void *ptr,
                                            size_t size,
                                            int type,
                                            unsigned long ip,
                                            atomic_long_t *watchpoint,
                                            long encoded_watchpoint)
{
@ -371,8 +466,10 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr,
     * The access_mask check relies on value-change comparison. To avoid
     * reporting a race where e.g. the writer set up the watchpoint, but the
     * reader has access_mask!=0, we have to ignore the found watchpoint.
     *
     * reorder_access is never created from an access with access_mask set.
     */
    if (ctx->access_mask)
    if (ctx->access_mask && !find_reorder_access(ctx, ptr, size, type, ip))
        return;

    /*
@ -396,7 +493,7 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr,

    if (consumed) {
        kcsan_save_irqtrace(current);
        kcsan_report_set_info(ptr, size, type, watchpoint - watchpoints);
        kcsan_report_set_info(ptr, size, type, ip, watchpoint - watchpoints);
        kcsan_restore_irqtrace(current);
    } else {
        /*
@ -416,17 +513,19 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr,
}

static noinline void
kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type, unsigned long ip)
{
    const bool is_write = (type & KCSAN_ACCESS_WRITE) != 0;
    const bool is_assert = (type & KCSAN_ACCESS_ASSERT) != 0;
    atomic_long_t *watchpoint;
    u64 old, new, diff;
    unsigned long access_mask;
    enum kcsan_value_change value_change = KCSAN_VALUE_CHANGE_MAYBE;
    bool interrupt_watcher = kcsan_interrupt_watcher;
    unsigned long ua_flags = user_access_save();
    struct kcsan_ctx *ctx = get_ctx();
    unsigned long access_mask = ctx->access_mask;
    unsigned long irq_flags = 0;
    bool is_reorder_access;

    /*
     * Always reset kcsan_skip counter in slow-path to avoid underflow; see
@ -449,13 +548,33 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
        goto out;
    }

    /*
     * The local CPU cannot observe reordering of its own accesses, and
     * therefore we need to take care of 2 cases to avoid false positives:
     *
     *   1. Races of the reordered access with interrupts. To avoid, if
     *      the current access is reorder_access, disable interrupts.
     *   2. Avoid races of scoped accesses from nested interrupts (below).
     */
    is_reorder_access = find_reorder_access(ctx, ptr, size, type, ip);
    if (is_reorder_access)
        interrupt_watcher = false;
    /*
     * Avoid races of scoped accesses from nested interrupts (or scheduler).
     * Assume setting up a watchpoint for a non-scoped (normal) access that
     * also conflicts with a current scoped access. In a nested interrupt,
     * which shares the context, it would check a conflicting scoped access.
     * To avoid, disable scoped access checking.
     */
    ctx->disable_scoped++;

    /*
     * Save and restore the IRQ state trace touched by KCSAN, since KCSAN's
     * runtime is entered for every memory access, and potentially useful
     * information is lost if dirtied by KCSAN.
     */
    kcsan_save_irqtrace(current);
    if (!kcsan_interrupt_watcher)
    if (!interrupt_watcher)
        local_irq_save(irq_flags);

    watchpoint = insert_watchpoint((unsigned long)ptr, size, is_write);
@ -476,23 +595,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
     * Read the current value, to later check and infer a race if the data
     * was modified via a non-instrumented access, e.g. from a device.
     */
    old = 0;
    switch (size) {
    case 1:
        old = READ_ONCE(*(const u8 *)ptr);
        break;
    case 2:
        old = READ_ONCE(*(const u16 *)ptr);
        break;
    case 4:
        old = READ_ONCE(*(const u32 *)ptr);
        break;
    case 8:
        old = READ_ONCE(*(const u64 *)ptr);
        break;
    default:
        break; /* ignore; we do not diff the values */
    }
    old = is_reorder_access ? 0 : read_instrumented_memory(ptr, size);

    /*
     * Delay this thread, to increase probability of observing a racy
@ -504,23 +607,16 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
     * Re-read value, and check if it is as expected; if not, we infer a
     * racy access.
     */
    access_mask = ctx->access_mask;
    if (!is_reorder_access) {
        new = read_instrumented_memory(ptr, size);
    } else {
        /*
         * Reordered accesses cannot be used for value change detection,
         * because the memory location may no longer be accessible and
         * could result in a fault.
         */
        new = 0;
    switch (size) {
    case 1:
        new = READ_ONCE(*(const u8 *)ptr);
        break;
    case 2:
        new = READ_ONCE(*(const u16 *)ptr);
        break;
    case 4:
        new = READ_ONCE(*(const u32 *)ptr);
        break;
    case 8:
        new = READ_ONCE(*(const u64 *)ptr);
        break;
    default:
        break; /* ignore; we do not diff the values */
        access_mask = 0;
    }

    diff = old ^ new;
@ -568,8 +664,8 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
        if (is_assert && value_change == KCSAN_VALUE_CHANGE_TRUE)
            atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]);

        kcsan_report_known_origin(ptr, size, type, value_change,
                                  watchpoint - watchpoints,
        kcsan_report_known_origin(ptr, size, type, ip,
                                  value_change, watchpoint - watchpoints,
                                  old, new, access_mask);
    } else if (value_change == KCSAN_VALUE_CHANGE_TRUE) {
        /* Inferring a race, since the value should not have changed. */
@ -578,8 +674,10 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
        if (is_assert)
            atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]);

        if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN) || is_assert)
            kcsan_report_unknown_origin(ptr, size, type, old, new, access_mask);
        if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN) || is_assert) {
            kcsan_report_unknown_origin(ptr, size, type, ip,
                                        old, new, access_mask);
        }
    }

    /*
@ -588,18 +686,27 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
     */
    remove_watchpoint(watchpoint);
    atomic_long_dec(&kcsan_counters[KCSAN_COUNTER_USED_WATCHPOINTS]);

out_unlock:
    if (!kcsan_interrupt_watcher)
    if (!interrupt_watcher)
        local_irq_restore(irq_flags);
    kcsan_restore_irqtrace(current);
    ctx->disable_scoped--;

    /*
     * Reordered accesses cannot be used for value change detection,
     * therefore never consider for reordering if access_mask is set.
     * ASSERT_EXCLUSIVE are not real accesses, ignore them as well.
     */
    if (!access_mask && !is_assert)
        set_reorder_access(ctx, ptr, size, type, ip);
out:
    user_access_restore(ua_flags);
}

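The value-change logic above, reduced to its core as a simplified sketch (not the exact code): with a non-zero access_mask, only changes in the masked bits count; otherwise any difference between the two reads is treated as a change.

static bool value_changed(u64 old, u64 new, unsigned long access_mask)
{
    u64 diff = old ^ new;       /* bits that differ across the delay */

    if (access_mask)
        diff &= access_mask;    /* only the watched bits are significant */
    return diff != 0;
}
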
static __always_inline void check_access(const volatile void *ptr, size_t size,
                                         int type)
static __always_inline void
check_access(const volatile void *ptr, size_t size, int type, unsigned long ip)
{
    const bool is_write = (type & KCSAN_ACCESS_WRITE) != 0;
    atomic_long_t *watchpoint;
    long encoded_watchpoint;

@ -610,12 +717,14 @@ static __always_inline void check_access(const volatile void *ptr, size_t size,
    if (unlikely(size == 0))
        return;

again:
    /*
     * Avoid user_access_save in fast-path: find_watchpoint is safe without
     * user_access_save, as the address that ptr points to is only used to
     * check if a watchpoint exists; ptr is never dereferenced.
     */
    watchpoint = find_watchpoint((unsigned long)ptr, size, !is_write,
    watchpoint = find_watchpoint((unsigned long)ptr, size,
                                 !(type & KCSAN_ACCESS_WRITE),
                                 &encoded_watchpoint);
    /*
     * It is safe to check kcsan_is_enabled() after find_watchpoint in the
@ -625,14 +734,46 @@ static __always_inline void check_access(const volatile void *ptr, size_t size,
     */

    if (unlikely(watchpoint != NULL))
        kcsan_found_watchpoint(ptr, size, type, watchpoint,
                               encoded_watchpoint);
        kcsan_found_watchpoint(ptr, size, type, ip, watchpoint, encoded_watchpoint);
    else {
        struct kcsan_ctx *ctx = get_ctx(); /* Call only once in fast-path. */

        if (unlikely(should_watch(ptr, size, type, ctx)))
            kcsan_setup_watchpoint(ptr, size, type);
        else if (unlikely(ctx->scoped_accesses.prev))
        if (unlikely(should_watch(ctx, ptr, size, type))) {
            kcsan_setup_watchpoint(ptr, size, type, ip);
            return;
        }

        if (!(type & KCSAN_ACCESS_SCOPED)) {
            struct kcsan_scoped_access *reorder_access = get_reorder_access(ctx);

            if (reorder_access) {
                /*
                 * reorder_access check: simulates reordering of
                 * the access after subsequent operations.
                 */
                ptr = reorder_access->ptr;
                type = reorder_access->type;
                ip = reorder_access->ip;
                /*
                 * Upon a nested interrupt, this context's
                 * reorder_access can be modified (shared ctx).
                 * We know that upon return, reorder_access is
                 * always invalidated by setting size to 0 via
                 * __tsan_func_exit(). Therefore we must read
                 * and check size after the other fields.
                 */
                barrier();
                size = READ_ONCE(reorder_access->size);
                if (size)
                    goto again;
            }
        }

        /*
         * Always checked last, right before returning from runtime;
         * if reorder_access is valid, checked after it was checked.
         */
        if (unlikely(ctx->scoped_accesses.prev))
            kcsan_check_scoped_accesses();
    }
}
@ -757,7 +898,7 @@ kcsan_begin_scoped_access(const volatile void *ptr, size_t size, int type,
{
    struct kcsan_ctx *ctx = get_ctx();

    __kcsan_check_access(ptr, size, type);
    check_access(ptr, size, type, _RET_IP_);

    ctx->disable_count++; /* Disable KCSAN, in case list debugging is on. */

@ -765,6 +906,7 @@ kcsan_begin_scoped_access(const volatile void *ptr, size_t size, int type,
    sa->ptr = ptr;
    sa->size = size;
    sa->type = type;
    sa->ip = _RET_IP_;

    if (!ctx->scoped_accesses.prev) /* Lazy initialize list head. */
        INIT_LIST_HEAD(&ctx->scoped_accesses);
@ -796,16 +938,32 @@ void kcsan_end_scoped_access(struct kcsan_scoped_access *sa)

    ctx->disable_count--;

    __kcsan_check_access(sa->ptr, sa->size, sa->type);
    check_access(sa->ptr, sa->size, sa->type, sa->ip);
}
EXPORT_SYMBOL(kcsan_end_scoped_access);

void __kcsan_check_access(const volatile void *ptr, size_t size, int type)
{
    check_access(ptr, size, type);
    check_access(ptr, size, type, _RET_IP_);
}
EXPORT_SYMBOL(__kcsan_check_access);

#define DEFINE_MEMORY_BARRIER(name, order_before_cond) \
    void __kcsan_##name(void) \
    { \
        struct kcsan_scoped_access *sa = get_reorder_access(get_ctx()); \
        if (!sa) \
            return; \
        if (order_before_cond) \
            sa->size = 0; \
    } \
    EXPORT_SYMBOL(__kcsan_##name)

DEFINE_MEMORY_BARRIER(mb, true);
DEFINE_MEMORY_BARRIER(wmb, sa->type & (KCSAN_ACCESS_WRITE | KCSAN_ACCESS_COMPOUND));
DEFINE_MEMORY_BARRIER(rmb, !(sa->type & KCSAN_ACCESS_WRITE) || (sa->type & KCSAN_ACCESS_COMPOUND));
DEFINE_MEMORY_BARRIER(release, true);

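For reference, DEFINE_MEMORY_BARRIER(wmb, ...) above expands to roughly the following (manual expansion shown only for readability):

void __kcsan_wmb(void)
{
    struct kcsan_scoped_access *sa = get_reorder_access(get_ctx());

    if (!sa)
        return;
    if (sa->type & (KCSAN_ACCESS_WRITE | KCSAN_ACCESS_COMPOUND))
        sa->size = 0;   /* the barrier orders the pending access: stop reordering it */
}
EXPORT_SYMBOL(__kcsan_wmb);
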
/*
 * KCSAN uses the same instrumentation that is emitted by supported compilers
 * for ThreadSanitizer (TSAN).
@ -823,7 +981,7 @@ EXPORT_SYMBOL(__kcsan_check_access);
    void __tsan_read##size(void *ptr); \
    void __tsan_read##size(void *ptr) \
    { \
        check_access(ptr, size, 0); \
        check_access(ptr, size, 0, _RET_IP_); \
    } \
    EXPORT_SYMBOL(__tsan_read##size); \
    void __tsan_unaligned_read##size(void *ptr) \
@ -832,7 +990,7 @@ EXPORT_SYMBOL(__kcsan_check_access);
    void __tsan_write##size(void *ptr); \
    void __tsan_write##size(void *ptr) \
    { \
        check_access(ptr, size, KCSAN_ACCESS_WRITE); \
        check_access(ptr, size, KCSAN_ACCESS_WRITE, _RET_IP_); \
    } \
    EXPORT_SYMBOL(__tsan_write##size); \
    void __tsan_unaligned_write##size(void *ptr) \
@ -842,7 +1000,8 @@ EXPORT_SYMBOL(__kcsan_check_access);
    void __tsan_read_write##size(void *ptr) \
    { \
        check_access(ptr, size, \
                     KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE); \
                     KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE, \
                     _RET_IP_); \
    } \
    EXPORT_SYMBOL(__tsan_read_write##size); \
    void __tsan_unaligned_read_write##size(void *ptr) \
@ -858,14 +1017,14 @@ DEFINE_TSAN_READ_WRITE(16);
void __tsan_read_range(void *ptr, size_t size);
void __tsan_read_range(void *ptr, size_t size)
{
    check_access(ptr, size, 0);
    check_access(ptr, size, 0, _RET_IP_);
}
EXPORT_SYMBOL(__tsan_read_range);

void __tsan_write_range(void *ptr, size_t size);
void __tsan_write_range(void *ptr, size_t size)
{
    check_access(ptr, size, KCSAN_ACCESS_WRITE);
    check_access(ptr, size, KCSAN_ACCESS_WRITE, _RET_IP_);
}
EXPORT_SYMBOL(__tsan_write_range);

@ -886,7 +1045,8 @@ EXPORT_SYMBOL(__tsan_write_range);
            IS_ALIGNED((unsigned long)ptr, size); \
        if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS) && is_atomic) \
            return; \
        check_access(ptr, size, is_atomic ? KCSAN_ACCESS_ATOMIC : 0); \
        check_access(ptr, size, is_atomic ? KCSAN_ACCESS_ATOMIC : 0, \
                     _RET_IP_); \
    } \
    EXPORT_SYMBOL(__tsan_volatile_read##size); \
    void __tsan_unaligned_volatile_read##size(void *ptr) \
@ -901,7 +1061,8 @@ EXPORT_SYMBOL(__tsan_write_range);
            return; \
        check_access(ptr, size, \
                     KCSAN_ACCESS_WRITE | \
                     (is_atomic ? KCSAN_ACCESS_ATOMIC : 0)); \
                     (is_atomic ? KCSAN_ACCESS_ATOMIC : 0), \
                     _RET_IP_); \
    } \
    EXPORT_SYMBOL(__tsan_volatile_write##size); \
    void __tsan_unaligned_volatile_write##size(void *ptr) \
@ -915,19 +1076,56 @@ DEFINE_TSAN_VOLATILE_READ_WRITE(8);
DEFINE_TSAN_VOLATILE_READ_WRITE(16);

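A simplified illustration of how these hooks end up being called: with -fsanitize=thread the compiler inserts calls such as __tsan_read8()/__tsan_write8() before plain accesses. Exact codegen varies by compiler and is only sketched in the comment below.

static long src, dst;

static void copy_once(void)
{
    /* compiler emits roughly: __tsan_read8(&src); __tsan_write8(&dst); */
    dst = src;
}
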
/*
 * The below are not required by KCSAN, but can still be emitted by the
 * compiler.
 * Function entry and exit are used to determine the validity of reorder_access.
 * Reordering of the access ends at the end of the function scope where the
 * access happened. This is done for two reasons:
 *
 *   1. Artificially limits the scope where missing barriers are detected.
 *      This minimizes false positives due to uninstrumented functions that
 *      contain the required barriers but were missed.
 *
 *   2. Simplifies generating the stack trace of the access.
 */
void __tsan_func_entry(void *call_pc);
void __tsan_func_entry(void *call_pc)
noinline void __tsan_func_entry(void *call_pc)
{
    if (!IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY))
        return;

    add_kcsan_stack_depth(1);
}
EXPORT_SYMBOL(__tsan_func_entry);

void __tsan_func_exit(void);
void __tsan_func_exit(void)
noinline void __tsan_func_exit(void)
{
    struct kcsan_scoped_access *reorder_access;

    if (!IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY))
        return;

    reorder_access = get_reorder_access(get_ctx());
    if (!reorder_access)
        goto out;

    if (get_kcsan_stack_depth() <= reorder_access->stack_depth) {
        /*
         * Access check to catch cases where a write without a barrier
         * (supposed release) was the last access in the function: because
         * instrumentation is inserted before the real access, a data
         * race due to the write giving up a critical section would only be
         * caught if we do the conflicting access after.
         */
        check_access(reorder_access->ptr, reorder_access->size,
                     reorder_access->type, reorder_access->ip);
        reorder_access->size = 0;
        reorder_access->stack_depth = INT_MIN;
    }
out:
    add_kcsan_stack_depth(-1);
}
EXPORT_SYMBOL(__tsan_func_exit);

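Putting the two hooks together, an instrumented function is conceptually bracketed as below, so any reorder_access created inside it is re-checked and invalidated no later than its return. Illustrative only: the calls are emitted by the compiler, not written by hand, and the example names are made up.

static long flag;

static void instrumented_fn(void)
{
    /* __tsan_func_entry(__builtin_return_address(0));  -- stack depth + 1          */
    WRITE_ONCE(flag, 0);    /* may become this context's reorder_access */
    /* ...                                                                           */
    /* __tsan_func_exit();  -- re-checks reorder_access, then clears it (size = 0)  */
}
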
void __tsan_init(void);
void __tsan_init(void)
{
@ -950,12 +1148,21 @@ EXPORT_SYMBOL(__tsan_init);
 * functions, whose job is to also execute the operation itself.
 */

static __always_inline void kcsan_atomic_builtin_memorder(int memorder)
{
    if (memorder == __ATOMIC_RELEASE ||
        memorder == __ATOMIC_SEQ_CST ||
        memorder == __ATOMIC_ACQ_REL)
        __kcsan_release();
}

#define DEFINE_TSAN_ATOMIC_LOAD_STORE(bits) \
    u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder); \
    u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder) \
    { \
        kcsan_atomic_builtin_memorder(memorder); \
        if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
            check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_ATOMIC); \
            check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_ATOMIC, _RET_IP_); \
        } \
        return __atomic_load_n(ptr, memorder); \
    } \
@ -963,9 +1170,10 @@ EXPORT_SYMBOL(__tsan_init);
    void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder); \
    void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder) \
    { \
        kcsan_atomic_builtin_memorder(memorder); \
        if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
            check_access(ptr, bits / BITS_PER_BYTE, \
                         KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \
                         KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC, _RET_IP_); \
        } \
        __atomic_store_n(ptr, v, memorder); \
    } \
@ -975,10 +1183,11 @@ EXPORT_SYMBOL(__tsan_init);
    u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder); \
    u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder) \
    { \
        kcsan_atomic_builtin_memorder(memorder); \
        if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
            check_access(ptr, bits / BITS_PER_BYTE, \
                         KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \
                         KCSAN_ACCESS_ATOMIC); \
                         KCSAN_ACCESS_ATOMIC, _RET_IP_); \
        } \
        return __atomic_##op##suffix(ptr, v, memorder); \
    } \
@ -1007,10 +1216,11 @@ EXPORT_SYMBOL(__tsan_init);
    int __tsan_atomic##bits##_compare_exchange_##strength(u##bits *ptr, u##bits *exp, \
                                                          u##bits val, int mo, int fail_mo) \
    { \
        kcsan_atomic_builtin_memorder(mo); \
        if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
            check_access(ptr, bits / BITS_PER_BYTE, \
                         KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \
                         KCSAN_ACCESS_ATOMIC); \
                         KCSAN_ACCESS_ATOMIC, _RET_IP_); \
        } \
        return __atomic_compare_exchange_n(ptr, exp, val, weak, mo, fail_mo); \
    } \
@ -1022,10 +1232,11 @@ EXPORT_SYMBOL(__tsan_init);
    u##bits __tsan_atomic##bits##_compare_exchange_val(u##bits *ptr, u##bits exp, u##bits val, \
                                                       int mo, int fail_mo) \
    { \
        kcsan_atomic_builtin_memorder(mo); \
        if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
            check_access(ptr, bits / BITS_PER_BYTE, \
                         KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \
                         KCSAN_ACCESS_ATOMIC); \
                         KCSAN_ACCESS_ATOMIC, _RET_IP_); \
        } \
        __atomic_compare_exchange_n(ptr, &exp, val, 0, mo, fail_mo); \
        return exp; \
@ -1053,10 +1264,47 @@ DEFINE_TSAN_ATOMIC_OPS(64);
void __tsan_atomic_thread_fence(int memorder);
void __tsan_atomic_thread_fence(int memorder)
{
    kcsan_atomic_builtin_memorder(memorder);
    __atomic_thread_fence(memorder);
}
EXPORT_SYMBOL(__tsan_atomic_thread_fence);

/*
 * In instrumented files, we emit instrumentation for barriers by mapping the
 * kernel barriers to an __atomic_signal_fence(), which is interpreted specially
 * and otherwise has no relation to a real __atomic_signal_fence(). No known
 * kernel code uses __atomic_signal_fence().
 *
 * Since fsanitize=thread instrumentation handles __atomic_signal_fence(), which
 * is turned into a call to __tsan_atomic_signal_fence(), such instrumentation
 * can be disabled via the __no_kcsan function attribute (vs. an explicit call
 * which could not). When __no_kcsan is requested, __atomic_signal_fence()
 * generates no code.
 *
 * Note: The result of using __atomic_signal_fence() with KCSAN enabled is
 * potentially limiting the compiler's ability to reorder operations; however,
 * if barriers were instrumented with explicit calls (without LTO), the compiler
 * couldn't optimize much anyway. The result of a hypothetical architecture
 * using __atomic_signal_fence() in normal code would be KCSAN false negatives.
 */
void __tsan_atomic_signal_fence(int memorder);
void __tsan_atomic_signal_fence(int memorder) { }
noinline void __tsan_atomic_signal_fence(int memorder)
{
    switch (memorder) {
    case __KCSAN_BARRIER_TO_SIGNAL_FENCE_mb:
        __kcsan_mb();
        break;
    case __KCSAN_BARRIER_TO_SIGNAL_FENCE_wmb:
        __kcsan_wmb();
        break;
    case __KCSAN_BARRIER_TO_SIGNAL_FENCE_rmb:
        __kcsan_rmb();
        break;
    case __KCSAN_BARRIER_TO_SIGNAL_FENCE_release:
        __kcsan_release();
        break;
    default:
        break;
    }
}
EXPORT_SYMBOL(__tsan_atomic_signal_fence);

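A hedged sketch of the mapping described above, reusing the constant names from the switch statement; the actual header definitions, macro names, and constant values may differ from this assumption.

/* In instrumented files, a kernel barrier could be redirected like this: */
#define __KCSAN_BARRIER_TO_SIGNAL_FENCE_release __ATOMIC_RELEASE   /* assumed value */
#define kcsan_release() __atomic_signal_fence(__KCSAN_BARRIER_TO_SIGNAL_FENCE_release)
/* The compiler turns the fence into a call to __tsan_atomic_signal_fence() above. */
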
@ -121,7 +121,7 @@ enum kcsan_value_change {
 * to be consumed by the reporting thread. No report is printed yet.
 */
void kcsan_report_set_info(const volatile void *ptr, size_t size, int access_type,
                           int watchpoint_idx);
                           unsigned long ip, int watchpoint_idx);

/*
 * The calling thread observed that the watchpoint it set up was hit and
@ -129,14 +129,14 @@ void kcsan_report_set_info(const volatile void *ptr, size_t size, int access_typ
 * thread.
 */
void kcsan_report_known_origin(const volatile void *ptr, size_t size, int access_type,
                               enum kcsan_value_change value_change, int watchpoint_idx,
                               u64 old, u64 new, u64 mask);
                               unsigned long ip, enum kcsan_value_change value_change,
                               int watchpoint_idx, u64 old, u64 new, u64 mask);

/*
 * No other thread was observed to race with the access, but the data value
 * before and after the stall differs. Reports a race of "unknown origin".
 */
void kcsan_report_unknown_origin(const volatile void *ptr, size_t size, int access_type,
                                 u64 old, u64 new, u64 mask);
                                 unsigned long ip, u64 old, u64 new, u64 mask);

#endif /* _KERNEL_KCSAN_KCSAN_H */

@ -16,9 +16,12 @@
#define pr_fmt(fmt) "kcsan_test: " fmt

#include <kunit/test.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/jiffies.h>
#include <linux/kcsan-checks.h>
#include <linux/kernel.h>
#include <linux/mutex.h>
#include <linux/sched.h>
#include <linux/seqlock.h>
#include <linux/spinlock.h>
@ -29,6 +32,11 @@
#include <linux/types.h>
#include <trace/events/printk.h>

#define KCSAN_TEST_REQUIRES(test, cond) do { \
    if (!(cond)) \
        kunit_skip((test), "Test requires: " #cond); \
} while (0)

#ifdef CONFIG_CC_HAS_TSAN_COMPOUND_READ_BEFORE_WRITE
#define __KCSAN_ACCESS_RW(alt) (KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE)
#else
@ -146,7 +154,7 @@ struct expect_report {

/* Check observed report matches information in @r. */
__no_kcsan
static bool report_matches(const struct expect_report *r)
static bool __report_matches(const struct expect_report *r)
{
    const bool is_assert = (r->access[0].type | r->access[1].type) & KCSAN_ACCESS_ASSERT;
    bool ret = false;
@ -205,10 +213,12 @@ static bool report_matches(const struct expect_report *r)
                    "read-write" :
                    "write") :
                    "read");
        const bool is_atomic = (ty & KCSAN_ACCESS_ATOMIC);
        const bool is_scoped = (ty & KCSAN_ACCESS_SCOPED);
        const char *const access_type_aux =
                (ty & KCSAN_ACCESS_ATOMIC) ?
                    " (marked)" :
                    ((ty & KCSAN_ACCESS_SCOPED) ? " (scoped)" : "");
                (is_atomic && is_scoped) ? " (marked, reordered)"
                : (is_atomic ? " (marked)"
                   : (is_scoped ? " (reordered)" : ""));

        if (i == 1) {
            /* Access 2 */
@ -246,6 +256,40 @@ static bool report_matches(const struct expect_report *r)
    return ret;
}

static __always_inline const struct expect_report *
__report_set_scoped(struct expect_report *r, int accesses)
{
    BUILD_BUG_ON(accesses > 3);

    if (accesses & 1)
        r->access[0].type |= KCSAN_ACCESS_SCOPED;
    else
        r->access[0].type &= ~KCSAN_ACCESS_SCOPED;

    if (accesses & 2)
        r->access[1].type |= KCSAN_ACCESS_SCOPED;
    else
        r->access[1].type &= ~KCSAN_ACCESS_SCOPED;

    return r;
}

__no_kcsan
static bool report_matches_any_reordered(struct expect_report *r)
{
    return __report_matches(__report_set_scoped(r, 0)) ||
           __report_matches(__report_set_scoped(r, 1)) ||
           __report_matches(__report_set_scoped(r, 2)) ||
           __report_matches(__report_set_scoped(r, 3));
}

#ifdef CONFIG_KCSAN_WEAK_MEMORY
/* Due to reordering accesses, any access may appear as "(reordered)". */
#define report_matches report_matches_any_reordered
#else
#define report_matches __report_matches
#endif

/* ===== Test kernels ===== */

static long test_sink;
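Typical use of the KCSAN_TEST_REQUIRES() helper defined above; the test function shown is hypothetical and serves only as a usage example.

static void example_case(struct kunit *test)
{
    /* Skip instead of fail when the prerequisite is not configured. */
    KCSAN_TEST_REQUIRES(test, IS_ENABLED(CONFIG_SMP));
    /* ... body of the test ... */
}
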
@ -256,6 +300,8 @@ static struct {
|
||||
long val[8];
|
||||
} test_struct;
|
||||
static DEFINE_SEQLOCK(test_seqlock);
|
||||
static DEFINE_SPINLOCK(test_spinlock);
|
||||
static DEFINE_MUTEX(test_mutex);
|
||||
|
||||
/*
|
||||
* Helper to avoid compiler optimizing out reads, and to generate source values
|
||||
@ -264,6 +310,16 @@ static DEFINE_SEQLOCK(test_seqlock);
|
||||
__no_kcsan
|
||||
static noinline void sink_value(long v) { WRITE_ONCE(test_sink, v); }
|
||||
|
||||
/*
|
||||
* Generates a delay and some accesses that enter the runtime but do not produce
|
||||
* data races.
|
||||
*/
|
||||
static noinline void test_delay(int iter)
|
||||
{
|
||||
while (iter--)
|
||||
sink_value(READ_ONCE(test_sink));
|
||||
}
|
||||
|
||||
static noinline void test_kernel_read(void) { sink_value(test_var); }
|
||||
|
||||
static noinline void test_kernel_write(void)
|
||||
@ -333,7 +389,10 @@ static noinline void test_kernel_assert_bits_nochange(void)
|
||||
ASSERT_EXCLUSIVE_BITS(test_var, ~TEST_CHANGE_BITS);
|
||||
}
|
||||
|
||||
/* To check that scoped assertions do trigger anywhere in scope. */
|
||||
/*
|
||||
* Scoped assertions do trigger anywhere in scope. However, the report should
|
||||
* still only point at the start of the scope.
|
||||
*/
|
||||
static noinline void test_enter_scope(void)
|
||||
{
|
||||
int x = 0;
|
||||
@ -422,19 +481,239 @@ static noinline void test_kernel_xor_1bit(void)
|
||||
kcsan_nestable_atomic_end();
|
||||
}
|
||||
|
||||
#define TEST_KERNEL_LOCKED(name, acquire, release) \
    static noinline void test_kernel_##name(void) \
    { \
        long *flag = &test_struct.val[0]; \
        long v = 0; \
        if (!(acquire)) \
            return; \
        while (v++ < 100) { \
            test_var++; \
            barrier(); \
        } \
        release; \
        test_delay(10); \
    }

TEST_KERNEL_LOCKED(with_memorder,
                   cmpxchg_acquire(flag, 0, 1) == 0,
                   smp_store_release(flag, 0));
TEST_KERNEL_LOCKED(wrong_memorder,
                   cmpxchg_relaxed(flag, 0, 1) == 0,
                   WRITE_ONCE(*flag, 0));
TEST_KERNEL_LOCKED(atomic_builtin_with_memorder,
                   __atomic_compare_exchange_n(flag, &v, 1, 0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED),
                   __atomic_store_n(flag, 0, __ATOMIC_RELEASE));
TEST_KERNEL_LOCKED(atomic_builtin_wrong_memorder,
                   __atomic_compare_exchange_n(flag, &v, 1, 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED),
                   __atomic_store_n(flag, 0, __ATOMIC_RELAXED));

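For readability, the wrong_memorder instantiation above expands to roughly:

static noinline void test_kernel_wrong_memorder(void)
{
    long *flag = &test_struct.val[0];
    long v = 0;
    if (!(cmpxchg_relaxed(flag, 0, 1) == 0))
        return;
    while (v++ < 100) {
        test_var++;
        barrier();
    }
    WRITE_ONCE(*flag, 0);   /* "unlock" without release ordering */
    test_delay(10);
}
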
/* ===== Test cases ===== */

/*
|
||||
* Tests that various barriers have the expected effect on internal state. Not
|
||||
* exhaustive on atomic_t operations. Unlike the selftest, also checks for
|
||||
* too-strict barrier instrumentation; these can be tolerated, because it does
|
||||
* not cause false positives, but at least we should be aware of such cases.
|
||||
*/
|
||||
static void test_barrier_nothreads(struct kunit *test)
|
||||
{
|
||||
#ifdef CONFIG_KCSAN_WEAK_MEMORY
|
||||
struct kcsan_scoped_access *reorder_access = ¤t->kcsan_ctx.reorder_access;
|
||||
#else
|
||||
struct kcsan_scoped_access *reorder_access = NULL;
|
||||
#endif
|
||||
arch_spinlock_t arch_spinlock = __ARCH_SPIN_LOCK_UNLOCKED;
|
||||
atomic_t dummy;
|
||||
|
||||
KCSAN_TEST_REQUIRES(test, reorder_access != NULL);
|
||||
KCSAN_TEST_REQUIRES(test, IS_ENABLED(CONFIG_SMP));
|
||||
|
||||
#define __KCSAN_EXPECT_BARRIER(access_type, barrier, order_before, name) \
|
||||
do { \
|
||||
reorder_access->type = (access_type) | KCSAN_ACCESS_SCOPED; \
|
||||
reorder_access->size = sizeof(test_var); \
|
||||
barrier; \
|
||||
KUNIT_EXPECT_EQ_MSG(test, reorder_access->size, \
|
||||
order_before ? 0 : sizeof(test_var), \
|
||||
"improperly instrumented type=(" #access_type "): " name); \
|
||||
} while (0)
|
||||
#define KCSAN_EXPECT_READ_BARRIER(b, o) __KCSAN_EXPECT_BARRIER(0, b, o, #b)
|
||||
#define KCSAN_EXPECT_WRITE_BARRIER(b, o) __KCSAN_EXPECT_BARRIER(KCSAN_ACCESS_WRITE, b, o, #b)
|
||||
#define KCSAN_EXPECT_RW_BARRIER(b, o) __KCSAN_EXPECT_BARRIER(KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE, b, o, #b)
|
||||
|
||||
/*
|
||||
* Lockdep initialization can strengthen certain locking operations due
|
||||
* to calling into instrumented files; "warm up" our locks.
|
||||
*/
|
||||
spin_lock(&test_spinlock);
|
||||
spin_unlock(&test_spinlock);
|
||||
mutex_lock(&test_mutex);
|
||||
mutex_unlock(&test_mutex);
|
||||
|
||||
/* Force creating a valid entry in reorder_access first. */
|
||||
test_var = 0;
|
||||
while (test_var++ < 1000000 && reorder_access->size != sizeof(test_var))
|
||||
__kcsan_check_read(&test_var, sizeof(test_var));
|
||||
KUNIT_ASSERT_EQ(test, reorder_access->size, sizeof(test_var));
|
||||
|
||||
kcsan_nestable_atomic_begin(); /* No watchpoints in called functions. */
|
||||
|
||||
KCSAN_EXPECT_READ_BARRIER(mb(), true);
|
||||
KCSAN_EXPECT_READ_BARRIER(wmb(), false);
|
||||
KCSAN_EXPECT_READ_BARRIER(rmb(), true);
|
||||
KCSAN_EXPECT_READ_BARRIER(smp_mb(), true);
|
||||
KCSAN_EXPECT_READ_BARRIER(smp_wmb(), false);
|
||||
KCSAN_EXPECT_READ_BARRIER(smp_rmb(), true);
|
||||
KCSAN_EXPECT_READ_BARRIER(dma_wmb(), false);
|
||||
KCSAN_EXPECT_READ_BARRIER(dma_rmb(), true);
|
||||
KCSAN_EXPECT_READ_BARRIER(smp_mb__before_atomic(), true);
|
||||
KCSAN_EXPECT_READ_BARRIER(smp_mb__after_atomic(), true);
|
||||
KCSAN_EXPECT_READ_BARRIER(smp_mb__after_spinlock(), true);
|
||||
KCSAN_EXPECT_READ_BARRIER(smp_store_mb(test_var, 0), true);
|
||||
KCSAN_EXPECT_READ_BARRIER(smp_load_acquire(&test_var), false);
|
||||
KCSAN_EXPECT_READ_BARRIER(smp_store_release(&test_var, 0), true);
|
||||
KCSAN_EXPECT_READ_BARRIER(xchg(&test_var, 0), true);
|
||||
KCSAN_EXPECT_READ_BARRIER(xchg_release(&test_var, 0), true);
|
||||
KCSAN_EXPECT_READ_BARRIER(xchg_relaxed(&test_var, 0), false);
|
||||
KCSAN_EXPECT_READ_BARRIER(cmpxchg(&test_var, 0, 0), true);
|
||||
KCSAN_EXPECT_READ_BARRIER(cmpxchg_release(&test_var, 0, 0), true);
|
||||
KCSAN_EXPECT_READ_BARRIER(cmpxchg_relaxed(&test_var, 0, 0), false);
|
||||
KCSAN_EXPECT_READ_BARRIER(atomic_read(&dummy), false);
|
||||
KCSAN_EXPECT_READ_BARRIER(atomic_read_acquire(&dummy), false);
|
||||
KCSAN_EXPECT_READ_BARRIER(atomic_set(&dummy, 0), false);
|
||||
KCSAN_EXPECT_READ_BARRIER(atomic_set_release(&dummy, 0), true);
|
||||
KCSAN_EXPECT_READ_BARRIER(atomic_add(1, &dummy), false);
|
||||
KCSAN_EXPECT_READ_BARRIER(atomic_add_return(1, &dummy), true);
|
||||
KCSAN_EXPECT_READ_BARRIER(atomic_add_return_acquire(1, &dummy), false);
|
||||
KCSAN_EXPECT_READ_BARRIER(atomic_add_return_release(1, &dummy), true);
|
||||
KCSAN_EXPECT_READ_BARRIER(atomic_add_return_relaxed(1, &dummy), false);
|
||||
KCSAN_EXPECT_READ_BARRIER(atomic_fetch_add(1, &dummy), true);
|
||||
KCSAN_EXPECT_READ_BARRIER(atomic_fetch_add_acquire(1, &dummy), false);
|
||||
KCSAN_EXPECT_READ_BARRIER(atomic_fetch_add_release(1, &dummy), true);
|
||||
KCSAN_EXPECT_READ_BARRIER(atomic_fetch_add_relaxed(1, &dummy), false);
|
||||
KCSAN_EXPECT_READ_BARRIER(test_and_set_bit(0, &test_var), true);
|
||||
KCSAN_EXPECT_READ_BARRIER(test_and_clear_bit(0, &test_var), true);
|
||||
KCSAN_EXPECT_READ_BARRIER(test_and_change_bit(0, &test_var), true);
|
||||
KCSAN_EXPECT_READ_BARRIER(clear_bit_unlock(0, &test_var), true);
|
||||
KCSAN_EXPECT_READ_BARRIER(__clear_bit_unlock(0, &test_var), true);
|
||||
KCSAN_EXPECT_READ_BARRIER(arch_spin_lock(&arch_spinlock), false);
|
||||
KCSAN_EXPECT_READ_BARRIER(arch_spin_unlock(&arch_spinlock), true);
|
||||
KCSAN_EXPECT_READ_BARRIER(spin_lock(&test_spinlock), false);
|
||||
KCSAN_EXPECT_READ_BARRIER(spin_unlock(&test_spinlock), true);
|
||||
KCSAN_EXPECT_READ_BARRIER(mutex_lock(&test_mutex), false);
|
||||
KCSAN_EXPECT_READ_BARRIER(mutex_unlock(&test_mutex), true);
|
||||
|
||||
KCSAN_EXPECT_WRITE_BARRIER(mb(), true);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(wmb(), true);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(rmb(), false);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(smp_mb(), true);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(smp_wmb(), true);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(smp_rmb(), false);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(dma_wmb(), true);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(dma_rmb(), false);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(smp_mb__before_atomic(), true);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(smp_mb__after_atomic(), true);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(smp_mb__after_spinlock(), true);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(smp_store_mb(test_var, 0), true);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(smp_load_acquire(&test_var), false);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(smp_store_release(&test_var, 0), true);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(xchg(&test_var, 0), true);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(xchg_release(&test_var, 0), true);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(xchg_relaxed(&test_var, 0), false);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(cmpxchg(&test_var, 0, 0), true);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(cmpxchg_release(&test_var, 0, 0), true);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(cmpxchg_relaxed(&test_var, 0, 0), false);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(atomic_read(&dummy), false);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(atomic_read_acquire(&dummy), false);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(atomic_set(&dummy, 0), false);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(atomic_set_release(&dummy, 0), true);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(atomic_add(1, &dummy), false);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(atomic_add_return(1, &dummy), true);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(atomic_add_return_acquire(1, &dummy), false);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(atomic_add_return_release(1, &dummy), true);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(atomic_add_return_relaxed(1, &dummy), false);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(atomic_fetch_add(1, &dummy), true);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(atomic_fetch_add_acquire(1, &dummy), false);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(atomic_fetch_add_release(1, &dummy), true);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(atomic_fetch_add_relaxed(1, &dummy), false);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(test_and_set_bit(0, &test_var), true);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(test_and_clear_bit(0, &test_var), true);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(test_and_change_bit(0, &test_var), true);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(clear_bit_unlock(0, &test_var), true);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(__clear_bit_unlock(0, &test_var), true);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(arch_spin_lock(&arch_spinlock), false);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(arch_spin_unlock(&arch_spinlock), true);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(spin_lock(&test_spinlock), false);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(spin_unlock(&test_spinlock), true);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(mutex_lock(&test_mutex), false);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(mutex_unlock(&test_mutex), true);
|
||||
|
||||
KCSAN_EXPECT_RW_BARRIER(mb(), true);
|
||||
KCSAN_EXPECT_RW_BARRIER(wmb(), true);
|
||||
KCSAN_EXPECT_RW_BARRIER(rmb(), true);
|
||||
KCSAN_EXPECT_RW_BARRIER(smp_mb(), true);
|
||||
KCSAN_EXPECT_RW_BARRIER(smp_wmb(), true);
|
||||
KCSAN_EXPECT_RW_BARRIER(smp_rmb(), true);
|
||||
KCSAN_EXPECT_RW_BARRIER(dma_wmb(), true);
|
||||
KCSAN_EXPECT_RW_BARRIER(dma_rmb(), true);
|
||||
KCSAN_EXPECT_RW_BARRIER(smp_mb__before_atomic(), true);
|
||||
KCSAN_EXPECT_RW_BARRIER(smp_mb__after_atomic(), true);
|
||||
KCSAN_EXPECT_RW_BARRIER(smp_mb__after_spinlock(), true);
|
||||
KCSAN_EXPECT_RW_BARRIER(smp_store_mb(test_var, 0), true);
|
||||
KCSAN_EXPECT_RW_BARRIER(smp_load_acquire(&test_var), false);
|
||||
KCSAN_EXPECT_RW_BARRIER(smp_store_release(&test_var, 0), true);
|
||||
KCSAN_EXPECT_RW_BARRIER(xchg(&test_var, 0), true);
|
||||
KCSAN_EXPECT_RW_BARRIER(xchg_release(&test_var, 0), true);
|
||||
KCSAN_EXPECT_RW_BARRIER(xchg_relaxed(&test_var, 0), false);
|
||||
KCSAN_EXPECT_RW_BARRIER(cmpxchg(&test_var, 0, 0), true);
|
||||
KCSAN_EXPECT_RW_BARRIER(cmpxchg_release(&test_var, 0, 0), true);
|
||||
KCSAN_EXPECT_RW_BARRIER(cmpxchg_relaxed(&test_var, 0, 0), false);
|
||||
KCSAN_EXPECT_RW_BARRIER(atomic_read(&dummy), false);
|
||||
KCSAN_EXPECT_RW_BARRIER(atomic_read_acquire(&dummy), false);
|
||||
KCSAN_EXPECT_RW_BARRIER(atomic_set(&dummy, 0), false);
|
||||
KCSAN_EXPECT_RW_BARRIER(atomic_set_release(&dummy, 0), true);
|
||||
KCSAN_EXPECT_RW_BARRIER(atomic_add(1, &dummy), false);
|
||||
KCSAN_EXPECT_RW_BARRIER(atomic_add_return(1, &dummy), true);
|
||||
KCSAN_EXPECT_RW_BARRIER(atomic_add_return_acquire(1, &dummy), false);
|
||||
KCSAN_EXPECT_RW_BARRIER(atomic_add_return_release(1, &dummy), true);
|
||||
KCSAN_EXPECT_RW_BARRIER(atomic_add_return_relaxed(1, &dummy), false);
|
||||
KCSAN_EXPECT_RW_BARRIER(atomic_fetch_add(1, &dummy), true);
|
||||
KCSAN_EXPECT_RW_BARRIER(atomic_fetch_add_acquire(1, &dummy), false);
|
||||
KCSAN_EXPECT_RW_BARRIER(atomic_fetch_add_release(1, &dummy), true);
|
||||
KCSAN_EXPECT_RW_BARRIER(atomic_fetch_add_relaxed(1, &dummy), false);
|
||||
KCSAN_EXPECT_RW_BARRIER(test_and_set_bit(0, &test_var), true);
|
||||
KCSAN_EXPECT_RW_BARRIER(test_and_clear_bit(0, &test_var), true);
|
||||
KCSAN_EXPECT_RW_BARRIER(test_and_change_bit(0, &test_var), true);
|
||||
KCSAN_EXPECT_RW_BARRIER(clear_bit_unlock(0, &test_var), true);
|
||||
KCSAN_EXPECT_RW_BARRIER(__clear_bit_unlock(0, &test_var), true);
|
||||
KCSAN_EXPECT_RW_BARRIER(arch_spin_lock(&arch_spinlock), false);
|
||||
KCSAN_EXPECT_RW_BARRIER(arch_spin_unlock(&arch_spinlock), true);
|
||||
KCSAN_EXPECT_RW_BARRIER(spin_lock(&test_spinlock), false);
|
||||
KCSAN_EXPECT_RW_BARRIER(spin_unlock(&test_spinlock), true);
|
||||
KCSAN_EXPECT_RW_BARRIER(mutex_lock(&test_mutex), false);
|
||||
KCSAN_EXPECT_RW_BARRIER(mutex_unlock(&test_mutex), true);
|
||||
|
||||
#ifdef clear_bit_unlock_is_negative_byte
|
||||
KCSAN_EXPECT_READ_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true);
|
||||
KCSAN_EXPECT_WRITE_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true);
|
||||
KCSAN_EXPECT_RW_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true);
|
||||
#endif
|
||||
kcsan_nestable_atomic_end();
|
||||
}
|
||||
|
||||
/* Simple test with normal data race. */
|
||||
__no_kcsan
|
||||
static void test_basic(struct kunit *test)
|
||||
{
|
||||
const struct expect_report expect = {
|
||||
struct expect_report expect = {
|
||||
.access = {
|
||||
{ test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
|
||||
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
|
||||
},
|
||||
};
|
||||
static const struct expect_report never = {
|
||||
struct expect_report never = {
|
||||
.access = {
|
||||
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
|
||||
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
|
||||
@ -459,14 +738,14 @@ static void test_basic(struct kunit *test)
|
||||
__no_kcsan
|
||||
static void test_concurrent_races(struct kunit *test)
|
||||
{
|
||||
const struct expect_report expect = {
|
||||
struct expect_report expect = {
|
||||
.access = {
|
||||
/* NULL will match any address. */
|
||||
{ test_kernel_rmw_array, NULL, 0, __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
|
||||
{ test_kernel_rmw_array, NULL, 0, __KCSAN_ACCESS_RW(0) },
|
||||
},
|
||||
};
|
||||
static const struct expect_report never = {
|
||||
struct expect_report never = {
|
||||
.access = {
|
||||
{ test_kernel_rmw_array, NULL, 0, 0 },
|
||||
{ test_kernel_rmw_array, NULL, 0, 0 },
|
||||
@ -488,17 +767,24 @@ static void test_concurrent_races(struct kunit *test)
|
||||
__no_kcsan
|
||||
static void test_novalue_change(struct kunit *test)
|
||||
{
|
||||
const struct expect_report expect = {
|
||||
struct expect_report expect_rw = {
|
||||
.access = {
|
||||
{ test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
|
||||
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
|
||||
},
|
||||
};
|
||||
struct expect_report expect_ww = {
|
||||
.access = {
|
||||
{ test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
|
||||
{ test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
|
||||
},
|
||||
};
|
||||
bool match_expect = false;
|
||||
|
||||
test_kernel_write_nochange(); /* Reset value. */
|
||||
begin_test_checks(test_kernel_write_nochange, test_kernel_read);
|
||||
do {
|
||||
match_expect = report_matches(&expect);
|
||||
match_expect = report_matches(&expect_rw) || report_matches(&expect_ww);
|
||||
} while (!end_test_checks(match_expect));
|
||||
if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY))
|
||||
KUNIT_EXPECT_FALSE(test, match_expect);
|
||||
@ -513,17 +799,24 @@ static void test_novalue_change(struct kunit *test)
|
||||
__no_kcsan
|
||||
static void test_novalue_change_exception(struct kunit *test)
|
||||
{
|
||||
const struct expect_report expect = {
|
||||
struct expect_report expect_rw = {
|
||||
.access = {
|
||||
{ test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
|
||||
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
|
||||
},
|
||||
};
|
||||
struct expect_report expect_ww = {
|
||||
.access = {
|
||||
{ test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
|
||||
{ test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
|
||||
},
|
||||
};
|
||||
bool match_expect = false;
|
||||
|
||||
test_kernel_write_nochange_rcu(); /* Reset value. */
|
||||
begin_test_checks(test_kernel_write_nochange_rcu, test_kernel_read);
|
||||
do {
|
||||
match_expect = report_matches(&expect);
|
||||
match_expect = report_matches(&expect_rw) || report_matches(&expect_ww);
|
||||
} while (!end_test_checks(match_expect));
|
||||
KUNIT_EXPECT_TRUE(test, match_expect);
|
||||
}
|
||||
@ -532,7 +825,7 @@ static void test_novalue_change_exception(struct kunit *test)
|
||||
__no_kcsan
|
||||
static void test_unknown_origin(struct kunit *test)
|
||||
{
|
||||
const struct expect_report expect = {
|
||||
struct expect_report expect = {
|
||||
.access = {
|
||||
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
|
||||
{ NULL },
|
||||
@ -554,7 +847,7 @@ static void test_unknown_origin(struct kunit *test)
|
||||
__no_kcsan
|
||||
static void test_write_write_assume_atomic(struct kunit *test)
|
||||
{
|
||||
const struct expect_report expect = {
|
||||
struct expect_report expect = {
|
||||
.access = {
|
||||
{ test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
|
||||
{ test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
|
||||
@ -580,7 +873,7 @@ static void test_write_write_assume_atomic(struct kunit *test)
|
||||
__no_kcsan
|
||||
static void test_write_write_struct(struct kunit *test)
|
||||
{
|
||||
const struct expect_report expect = {
|
||||
struct expect_report expect = {
|
||||
.access = {
|
||||
{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
|
||||
{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
|
||||
@ -602,7 +895,7 @@ static void test_write_write_struct(struct kunit *test)
|
||||
__no_kcsan
|
||||
static void test_write_write_struct_part(struct kunit *test)
|
||||
{
|
||||
const struct expect_report expect = {
|
||||
struct expect_report expect = {
|
||||
.access = {
|
||||
{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
|
||||
{ test_kernel_write_struct_part, &test_struct.val[3], sizeof(test_struct.val[3]), KCSAN_ACCESS_WRITE },
|
||||
@ -634,7 +927,7 @@ static void test_read_atomic_write_atomic(struct kunit *test)
|
||||
__no_kcsan
|
||||
static void test_read_plain_atomic_write(struct kunit *test)
|
||||
{
|
||||
const struct expect_report expect = {
|
||||
struct expect_report expect = {
|
||||
.access = {
|
||||
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
|
||||
{ test_kernel_write_atomic, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC },
|
||||
@ -642,8 +935,7 @@ static void test_read_plain_atomic_write(struct kunit *test)
|
||||
};
|
||||
bool match_expect = false;
|
||||
|
||||
if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS))
|
||||
return;
|
||||
KCSAN_TEST_REQUIRES(test, !IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS));
|
||||
|
||||
begin_test_checks(test_kernel_read, test_kernel_write_atomic);
|
||||
do {
|
||||
@ -656,7 +948,7 @@ static void test_read_plain_atomic_write(struct kunit *test)
|
||||
__no_kcsan
|
||||
static void test_read_plain_atomic_rmw(struct kunit *test)
|
||||
{
|
||||
const struct expect_report expect = {
|
||||
struct expect_report expect = {
|
||||
.access = {
|
||||
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
|
||||
{ test_kernel_atomic_rmw, &test_var, sizeof(test_var),
|
||||
@ -665,8 +957,7 @@ static void test_read_plain_atomic_rmw(struct kunit *test)
|
||||
};
|
||||
bool match_expect = false;
|
||||
|
||||
if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS))
|
||||
return;
|
||||
KCSAN_TEST_REQUIRES(test, !IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS));
|
||||
|
||||
begin_test_checks(test_kernel_read, test_kernel_atomic_rmw);
|
||||
do {
|
||||
@ -679,13 +970,13 @@ static void test_read_plain_atomic_rmw(struct kunit *test)
|
||||
__no_kcsan
|
||||
static void test_zero_size_access(struct kunit *test)
|
||||
{
|
||||
const struct expect_report expect = {
|
||||
struct expect_report expect = {
|
||||
.access = {
|
||||
{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
|
||||
{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
|
||||
},
|
||||
};
|
||||
const struct expect_report never = {
|
||||
struct expect_report never = {
|
||||
.access = {
|
||||
{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
|
||||
{ test_kernel_read_struct_zero_size, &test_struct.val[3], 0, 0 },
|
||||
@ -719,7 +1010,7 @@ static void test_data_race(struct kunit *test)
|
||||
__no_kcsan
|
||||
static void test_assert_exclusive_writer(struct kunit *test)
|
||||
{
|
||||
const struct expect_report expect = {
|
||||
struct expect_report expect = {
|
||||
.access = {
|
||||
{ test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
|
||||
{ test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
|
||||
@ -737,7 +1028,7 @@ static void test_assert_exclusive_writer(struct kunit *test)
|
||||
__no_kcsan
|
||||
static void test_assert_exclusive_access(struct kunit *test)
|
||||
{
|
||||
const struct expect_report expect = {
|
||||
struct expect_report expect = {
|
||||
.access = {
|
||||
{ test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
|
||||
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
|
||||
@ -755,19 +1046,19 @@ static void test_assert_exclusive_access(struct kunit *test)
|
||||
__no_kcsan
|
||||
static void test_assert_exclusive_access_writer(struct kunit *test)
|
||||
{
|
||||
const struct expect_report expect_access_writer = {
|
||||
struct expect_report expect_access_writer = {
|
||||
.access = {
|
||||
{ test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
|
||||
{ test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
|
||||
},
|
||||
};
|
||||
const struct expect_report expect_access_access = {
|
||||
struct expect_report expect_access_access = {
|
||||
.access = {
|
||||
{ test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
|
||||
{ test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
|
||||
},
|
||||
};
|
||||
const struct expect_report never = {
|
||||
struct expect_report never = {
|
||||
.access = {
|
||||
{ test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
|
||||
{ test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
|
||||
@ -791,7 +1082,7 @@ static void test_assert_exclusive_access_writer(struct kunit *test)
|
||||
__no_kcsan
|
||||
static void test_assert_exclusive_bits_change(struct kunit *test)
|
||||
{
|
||||
const struct expect_report expect = {
|
||||
struct expect_report expect = {
|
||||
.access = {
|
||||
{ test_kernel_assert_bits_change, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
|
||||
{ test_kernel_change_bits, &test_var, sizeof(test_var),
|
||||
@ -822,43 +1113,43 @@ static void test_assert_exclusive_bits_nochange(struct kunit *test)
|
||||
__no_kcsan
|
||||
static void test_assert_exclusive_writer_scoped(struct kunit *test)
|
||||
{
|
||||
const struct expect_report expect_start = {
|
||||
struct expect_report expect_start = {
|
||||
.access = {
|
||||
{ test_kernel_assert_writer_scoped, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_SCOPED },
|
||||
{ test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
|
||||
},
|
||||
};
|
||||
const struct expect_report expect_anywhere = {
|
||||
struct expect_report expect_inscope = {
|
||||
.access = {
|
||||
{ test_enter_scope, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_SCOPED },
|
||||
{ test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
|
||||
},
|
||||
};
|
||||
bool match_expect_start = false;
|
||||
bool match_expect_anywhere = false;
|
||||
bool match_expect_inscope = false;
|
||||
|
||||
begin_test_checks(test_kernel_assert_writer_scoped, test_kernel_write_nochange);
|
||||
do {
|
||||
match_expect_start |= report_matches(&expect_start);
|
||||
match_expect_anywhere |= report_matches(&expect_anywhere);
|
||||
} while (!end_test_checks(match_expect_start && match_expect_anywhere));
|
||||
match_expect_inscope |= report_matches(&expect_inscope);
|
||||
} while (!end_test_checks(match_expect_inscope));
|
||||
KUNIT_EXPECT_TRUE(test, match_expect_start);
|
||||
KUNIT_EXPECT_TRUE(test, match_expect_anywhere);
|
||||
KUNIT_EXPECT_FALSE(test, match_expect_inscope);
|
||||
}
|
||||
|
||||
__no_kcsan
|
||||
static void test_assert_exclusive_access_scoped(struct kunit *test)
|
||||
{
|
||||
const struct expect_report expect_start1 = {
|
||||
struct expect_report expect_start1 = {
|
||||
.access = {
|
||||
{ test_kernel_assert_access_scoped, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_SCOPED },
|
||||
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
|
||||
},
|
||||
};
|
||||
const struct expect_report expect_start2 = {
|
||||
struct expect_report expect_start2 = {
|
||||
.access = { expect_start1.access[0], expect_start1.access[0] },
|
||||
};
|
||||
const struct expect_report expect_inscope = {
|
||||
struct expect_report expect_inscope = {
|
||||
.access = {
|
||||
{ test_enter_scope, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_SCOPED },
|
||||
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
|
||||
@ -872,9 +1163,9 @@ static void test_assert_exclusive_access_scoped(struct kunit *test)
|
||||
do {
|
||||
match_expect_start |= report_matches(&expect_start1) || report_matches(&expect_start2);
|
||||
match_expect_inscope |= report_matches(&expect_inscope);
|
||||
} while (!end_test_checks(match_expect_start && match_expect_inscope));
|
||||
} while (!end_test_checks(match_expect_inscope));
|
||||
KUNIT_EXPECT_TRUE(test, match_expect_start);
|
||||
KUNIT_EXPECT_TRUE(test, match_expect_inscope);
|
||||
KUNIT_EXPECT_FALSE(test, match_expect_inscope);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -963,7 +1254,7 @@ static void test_atomic_builtins(struct kunit *test)
|
||||
__no_kcsan
|
||||
static void test_1bit_value_change(struct kunit *test)
|
||||
{
|
||||
const struct expect_report expect = {
|
||||
struct expect_report expect = {
|
||||
.access = {
|
||||
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
|
||||
{ test_kernel_xor_1bit, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
|
||||
@ -983,6 +1274,90 @@ static void test_1bit_value_change(struct kunit *test)
|
||||
KUNIT_EXPECT_TRUE(test, match);
|
||||
}
|
||||
|
||||
__no_kcsan
|
||||
static void test_correct_barrier(struct kunit *test)
|
||||
{
|
||||
struct expect_report expect = {
|
||||
.access = {
|
||||
{ test_kernel_with_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
|
||||
{ test_kernel_with_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(0) },
|
||||
},
|
||||
};
|
||||
bool match_expect = false;
|
||||
|
||||
test_struct.val[0] = 0; /* init unlocked */
|
||||
begin_test_checks(test_kernel_with_memorder, test_kernel_with_memorder);
|
||||
do {
|
||||
match_expect = report_matches_any_reordered(&expect);
|
||||
} while (!end_test_checks(match_expect));
|
||||
KUNIT_EXPECT_FALSE(test, match_expect);
|
||||
}
|
||||
|
||||
__no_kcsan
|
||||
static void test_missing_barrier(struct kunit *test)
|
||||
{
|
||||
struct expect_report expect = {
|
||||
.access = {
|
||||
{ test_kernel_wrong_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
|
||||
{ test_kernel_wrong_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(0) },
|
||||
},
|
||||
};
|
||||
bool match_expect = false;
|
||||
|
||||
test_struct.val[0] = 0; /* init unlocked */
|
||||
begin_test_checks(test_kernel_wrong_memorder, test_kernel_wrong_memorder);
|
||||
do {
|
||||
match_expect = report_matches_any_reordered(&expect);
|
||||
} while (!end_test_checks(match_expect));
|
||||
if (IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY))
|
||||
KUNIT_EXPECT_TRUE(test, match_expect);
|
||||
else
|
||||
KUNIT_EXPECT_FALSE(test, match_expect);
|
||||
}
|
||||
|
||||
__no_kcsan
|
||||
static void test_atomic_builtins_correct_barrier(struct kunit *test)
|
||||
{
|
||||
struct expect_report expect = {
|
||||
.access = {
|
||||
{ test_kernel_atomic_builtin_with_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
|
||||
{ test_kernel_atomic_builtin_with_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(0) },
|
||||
},
|
||||
};
|
||||
bool match_expect = false;
|
||||
|
||||
test_struct.val[0] = 0; /* init unlocked */
|
||||
begin_test_checks(test_kernel_atomic_builtin_with_memorder,
|
||||
test_kernel_atomic_builtin_with_memorder);
|
||||
do {
|
||||
match_expect = report_matches_any_reordered(&expect);
|
||||
} while (!end_test_checks(match_expect));
|
||||
KUNIT_EXPECT_FALSE(test, match_expect);
|
||||
}
|
||||
|
||||
__no_kcsan
|
||||
static void test_atomic_builtins_missing_barrier(struct kunit *test)
|
||||
{
|
||||
struct expect_report expect = {
|
||||
.access = {
|
||||
{ test_kernel_atomic_builtin_wrong_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
|
||||
{ test_kernel_atomic_builtin_wrong_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(0) },
|
||||
},
|
||||
};
|
||||
bool match_expect = false;
|
||||
|
||||
test_struct.val[0] = 0; /* init unlocked */
|
||||
begin_test_checks(test_kernel_atomic_builtin_wrong_memorder,
|
||||
test_kernel_atomic_builtin_wrong_memorder);
|
||||
do {
|
||||
match_expect = report_matches_any_reordered(&expect);
|
||||
} while (!end_test_checks(match_expect));
|
||||
if (IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY))
|
||||
KUNIT_EXPECT_TRUE(test, match_expect);
|
||||
else
|
||||
KUNIT_EXPECT_FALSE(test, match_expect);
|
||||
}
|
||||
|
||||
/*
|
||||
* Generate thread counts for all test cases. Values generated are in interval
|
||||
* [2, 5] followed by exponentially increasing thread counts from 8 to 32.
|
||||
@ -1032,6 +1407,7 @@ static const void *nthreads_gen_params(const void *prev, char *desc)
|
||||
|
||||
#define KCSAN_KUNIT_CASE(test_name) KUNIT_CASE_PARAM(test_name, nthreads_gen_params)
|
||||
static struct kunit_case kcsan_test_cases[] = {
|
||||
KUNIT_CASE(test_barrier_nothreads),
|
||||
KCSAN_KUNIT_CASE(test_basic),
|
||||
KCSAN_KUNIT_CASE(test_concurrent_races),
|
||||
KCSAN_KUNIT_CASE(test_novalue_change),
|
||||
@ -1056,6 +1432,10 @@ static struct kunit_case kcsan_test_cases[] = {
|
||||
KCSAN_KUNIT_CASE(test_seqlock_noreport),
|
||||
KCSAN_KUNIT_CASE(test_atomic_builtins),
|
||||
KCSAN_KUNIT_CASE(test_1bit_value_change),
|
||||
KCSAN_KUNIT_CASE(test_correct_barrier),
|
||||
KCSAN_KUNIT_CASE(test_missing_barrier),
|
||||
KCSAN_KUNIT_CASE(test_atomic_builtins_correct_barrier),
|
||||
KCSAN_KUNIT_CASE(test_atomic_builtins_missing_barrier),
|
||||
{},
|
||||
};
|
||||
|
||||
@ -1120,6 +1500,9 @@ static int test_init(struct kunit *test)
|
||||
observed.nlines = 0;
|
||||
spin_unlock_irqrestore(&observed.lock, flags);
|
||||
|
||||
if (strstr(test->name, "nothreads"))
|
||||
return 0;
|
||||
|
||||
if (!torture_init_begin((char *)test->name, 1))
|
||||
return -EBUSY;
|
||||
|
||||
@ -1162,6 +1545,9 @@ static void test_exit(struct kunit *test)
|
||||
struct task_struct **stop_thread;
|
||||
int i;
|
||||
|
||||
if (strstr(test->name, "nothreads"))
|
||||
return;
|
||||
|
||||
if (torture_cleanup_begin())
|
||||
return;
|
||||
|
||||
@ -1224,7 +1610,7 @@ static void kcsan_test_exit(void)
|
||||
tracepoint_synchronize_unregister();
|
||||
}
|
||||
|
||||
late_initcall(kcsan_test_init);
|
||||
late_initcall_sync(kcsan_test_init);
|
||||
module_exit(kcsan_test_exit);
|
||||
|
||||
MODULE_LICENSE("GPL v2");
|
||||
|
@ -8,6 +8,7 @@
|
||||
#include <linux/debug_locks.h>
|
||||
#include <linux/delay.h>
|
||||
#include <linux/jiffies.h>
|
||||
#include <linux/kallsyms.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/lockdep.h>
|
||||
#include <linux/preempt.h>
|
||||
@ -31,6 +32,7 @@ struct access_info {
|
||||
int access_type;
|
||||
int task_pid;
|
||||
int cpu_id;
|
||||
unsigned long ip;
|
||||
};
|
||||
|
||||
/*
|
||||
@ -213,9 +215,9 @@ static const char *get_access_type(int type)
|
||||
if (type & KCSAN_ACCESS_ASSERT) {
|
||||
if (type & KCSAN_ACCESS_SCOPED) {
|
||||
if (type & KCSAN_ACCESS_WRITE)
|
||||
return "assert no accesses (scoped)";
|
||||
return "assert no accesses (reordered)";
|
||||
else
|
||||
return "assert no writes (scoped)";
|
||||
return "assert no writes (reordered)";
|
||||
} else {
|
||||
if (type & KCSAN_ACCESS_WRITE)
|
||||
return "assert no accesses";
|
||||
@ -238,13 +240,17 @@ static const char *get_access_type(int type)
|
||||
case KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC:
|
||||
return "read-write (marked)";
|
||||
case KCSAN_ACCESS_SCOPED:
|
||||
return "read (scoped)";
|
||||
return "read (reordered)";
|
||||
case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_ATOMIC:
|
||||
return "read (marked, scoped)";
|
||||
return "read (marked, reordered)";
|
||||
case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE:
|
||||
return "write (scoped)";
|
||||
return "write (reordered)";
|
||||
case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC:
|
||||
return "write (marked, scoped)";
|
||||
return "write (marked, reordered)";
|
||||
case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE:
|
||||
return "read-write (reordered)";
|
||||
case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC:
|
||||
return "read-write (marked, reordered)";
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
@ -300,6 +306,52 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries
|
||||
return skip;
|
||||
}
|
||||
|
||||
/*
|
||||
* Skips to the first entry that matches the function of @ip, and then replaces
|
||||
* that entry with @ip, returning the entries to skip with @replaced containing
|
||||
* the replaced entry.
|
||||
*/
|
||||
static int
|
||||
replace_stack_entry(unsigned long stack_entries[], int num_entries, unsigned long ip,
|
||||
unsigned long *replaced)
|
||||
{
|
||||
unsigned long symbolsize, offset;
|
||||
unsigned long target_func;
|
||||
int skip;
|
||||
|
||||
if (kallsyms_lookup_size_offset(ip, &symbolsize, &offset))
|
||||
target_func = ip - offset;
|
||||
else
|
||||
goto fallback;
|
||||
|
||||
for (skip = 0; skip < num_entries; ++skip) {
|
||||
unsigned long func = stack_entries[skip];
|
||||
|
||||
if (!kallsyms_lookup_size_offset(func, &symbolsize, &offset))
|
||||
goto fallback;
|
||||
func -= offset;
|
||||
|
||||
if (func == target_func) {
|
||||
*replaced = stack_entries[skip];
|
||||
stack_entries[skip] = ip;
|
||||
return skip;
|
||||
}
|
||||
}
|
||||
|
||||
fallback:
|
||||
/* Should not happen; the resulting stack trace is likely misleading. */
|
||||
WARN_ONCE(1, "Cannot find frame for %pS in stack trace", (void *)ip);
|
||||
return get_stack_skipnr(stack_entries, num_entries);
|
||||
}
|
||||
|
||||
static int
|
||||
sanitize_stack_entries(unsigned long stack_entries[], int num_entries, unsigned long ip,
|
||||
unsigned long *replaced)
|
||||
{
|
||||
return ip ? replace_stack_entry(stack_entries, num_entries, ip, replaced) :
|
||||
get_stack_skipnr(stack_entries, num_entries);
|
||||
}
|
||||
|
||||
/* Compares symbolized strings of addr1 and addr2. */
|
||||
static int sym_strcmp(void *addr1, void *addr2)
|
||||
{
|
||||
@ -312,6 +364,14 @@ static int sym_strcmp(void *addr1, void *addr2)
|
||||
return strncmp(buf1, buf2, sizeof(buf1));
|
||||
}
|
||||
|
||||
static void
|
||||
print_stack_trace(unsigned long stack_entries[], int num_entries, unsigned long reordered_to)
|
||||
{
|
||||
stack_trace_print(stack_entries, num_entries, 0);
|
||||
if (reordered_to)
|
||||
pr_err(" |\n +-> reordered to: %pS\n", (void *)reordered_to);
|
||||
}
|
||||
|
||||
static void print_verbose_info(struct task_struct *task)
|
||||
{
|
||||
if (!task)
|
||||
@ -327,13 +387,15 @@ static void print_verbose_info(struct task_struct *task)
|
||||
|
||||
static void print_report(enum kcsan_value_change value_change,
|
||||
const struct access_info *ai,
|
||||
const struct other_info *other_info,
|
||||
struct other_info *other_info,
|
||||
u64 old, u64 new, u64 mask)
|
||||
{
|
||||
unsigned long reordered_to = 0;
|
||||
unsigned long stack_entries[NUM_STACK_ENTRIES] = { 0 };
|
||||
int num_stack_entries = stack_trace_save(stack_entries, NUM_STACK_ENTRIES, 1);
|
||||
int skipnr = get_stack_skipnr(stack_entries, num_stack_entries);
|
||||
int skipnr = sanitize_stack_entries(stack_entries, num_stack_entries, ai->ip, &reordered_to);
|
||||
unsigned long this_frame = stack_entries[skipnr];
|
||||
unsigned long other_reordered_to = 0;
|
||||
unsigned long other_frame = 0;
|
||||
int other_skipnr = 0; /* silence uninit warnings */
|
||||
|
||||
@ -344,8 +406,9 @@ static void print_report(enum kcsan_value_change value_change,
|
||||
return;
|
||||
|
||||
if (other_info) {
|
||||
other_skipnr = get_stack_skipnr(other_info->stack_entries,
|
||||
other_info->num_stack_entries);
|
||||
other_skipnr = sanitize_stack_entries(other_info->stack_entries,
|
||||
other_info->num_stack_entries,
|
||||
other_info->ai.ip, &other_reordered_to);
|
||||
other_frame = other_info->stack_entries[other_skipnr];
|
||||
|
||||
/* @value_change is only known for the other thread */
|
||||
@ -385,10 +448,9 @@ static void print_report(enum kcsan_value_change value_change,
|
||||
other_info->ai.cpu_id);
|
||||
|
||||
/* Print the other thread's stack trace. */
|
||||
stack_trace_print(other_info->stack_entries + other_skipnr,
|
||||
print_stack_trace(other_info->stack_entries + other_skipnr,
|
||||
other_info->num_stack_entries - other_skipnr,
|
||||
0);
|
||||
|
||||
other_reordered_to);
|
||||
if (IS_ENABLED(CONFIG_KCSAN_VERBOSE))
|
||||
print_verbose_info(other_info->task);
|
||||
|
||||
@ -402,9 +464,7 @@ static void print_report(enum kcsan_value_change value_change,
|
||||
get_thread_desc(ai->task_pid), ai->cpu_id);
|
||||
}
|
||||
/* Print stack trace of this thread. */
|
||||
stack_trace_print(stack_entries + skipnr, num_stack_entries - skipnr,
|
||||
0);
|
||||
|
||||
print_stack_trace(stack_entries + skipnr, num_stack_entries - skipnr, reordered_to);
|
||||
if (IS_ENABLED(CONFIG_KCSAN_VERBOSE))
|
||||
print_verbose_info(current);
|
||||
|
||||
@ -576,21 +636,23 @@ static bool prepare_report_consumer(unsigned long *flags,
|
||||
}
|
||||
|
||||
static struct access_info prepare_access_info(const volatile void *ptr, size_t size,
|
||||
int access_type)
|
||||
int access_type, unsigned long ip)
|
||||
{
|
||||
return (struct access_info) {
|
||||
.ptr = ptr,
|
||||
.size = size,
|
||||
.access_type = access_type,
|
||||
.task_pid = in_task() ? task_pid_nr(current) : -1,
|
||||
.cpu_id = raw_smp_processor_id()
|
||||
.cpu_id = raw_smp_processor_id(),
|
||||
/* Only replace stack entry with @ip if scoped access. */
|
||||
.ip = (access_type & KCSAN_ACCESS_SCOPED) ? ip : 0,
|
||||
};
|
||||
}
|
||||
|
||||
void kcsan_report_set_info(const volatile void *ptr, size_t size, int access_type,
|
||||
int watchpoint_idx)
|
||||
unsigned long ip, int watchpoint_idx)
|
||||
{
|
||||
const struct access_info ai = prepare_access_info(ptr, size, access_type);
|
||||
const struct access_info ai = prepare_access_info(ptr, size, access_type, ip);
|
||||
unsigned long flags;
|
||||
|
||||
kcsan_disable_current();
|
||||
@ -603,10 +665,10 @@ void kcsan_report_set_info(const volatile void *ptr, size_t size, int access_typ
|
||||
}
|
||||
|
||||
void kcsan_report_known_origin(const volatile void *ptr, size_t size, int access_type,
|
||||
enum kcsan_value_change value_change, int watchpoint_idx,
|
||||
u64 old, u64 new, u64 mask)
|
||||
unsigned long ip, enum kcsan_value_change value_change,
|
||||
int watchpoint_idx, u64 old, u64 new, u64 mask)
|
||||
{
|
||||
const struct access_info ai = prepare_access_info(ptr, size, access_type);
|
||||
const struct access_info ai = prepare_access_info(ptr, size, access_type, ip);
|
||||
struct other_info *other_info = &other_infos[watchpoint_idx];
|
||||
unsigned long flags = 0;
|
||||
|
||||
@ -637,9 +699,9 @@ void kcsan_report_known_origin(const volatile void *ptr, size_t size, int access
|
||||
}
|
||||
|
||||
void kcsan_report_unknown_origin(const volatile void *ptr, size_t size, int access_type,
|
||||
u64 old, u64 new, u64 mask)
|
||||
unsigned long ip, u64 old, u64 new, u64 mask)
|
||||
{
|
||||
const struct access_info ai = prepare_access_info(ptr, size, access_type);
|
||||
const struct access_info ai = prepare_access_info(ptr, size, access_type, ip);
|
||||
unsigned long flags;
|
||||
|
||||
kcsan_disable_current();
|
||||
|
@ -7,10 +7,15 @@
|
||||
|
||||
#define pr_fmt(fmt) "kcsan: " fmt
|
||||
|
||||
#include <linux/atomic.h>
|
||||
#include <linux/bitops.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/kcsan-checks.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/printk.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/spinlock.h>
|
||||
#include <linux/types.h>
|
||||
|
||||
#include "encoding.h"
|
||||
@ -18,7 +23,7 @@
|
||||
#define ITERS_PER_TEST 2000
|
||||
|
||||
/* Test requirements. */
|
||||
static bool test_requires(void)
|
||||
static bool __init test_requires(void)
|
||||
{
|
||||
/* random should be initialized for the below tests */
|
||||
return prandom_u32() + prandom_u32() != 0;
|
||||
@ -28,14 +33,18 @@ static bool test_requires(void)
|
||||
* Test watchpoint encode and decode: check that encoding some access's info,
|
||||
* and then subsequent decode preserves the access's info.
|
||||
*/
|
||||
static bool test_encode_decode(void)
|
||||
static bool __init test_encode_decode(void)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < ITERS_PER_TEST; ++i) {
|
||||
size_t size = prandom_u32_max(MAX_ENCODABLE_SIZE) + 1;
|
||||
bool is_write = !!prandom_u32_max(2);
|
||||
unsigned long verif_masked_addr;
|
||||
long encoded_watchpoint;
|
||||
bool verif_is_write;
|
||||
unsigned long addr;
|
||||
size_t verif_size;
|
||||
|
||||
prandom_bytes(&addr, sizeof(addr));
|
||||
if (addr < PAGE_SIZE)
|
||||
@ -44,31 +53,18 @@ static bool test_encode_decode(void)
|
||||
if (WARN_ON(!check_encodable(addr, size)))
|
||||
return false;
|
||||
|
||||
/* Encode and decode */
|
||||
{
|
||||
const long encoded_watchpoint =
|
||||
encode_watchpoint(addr, size, is_write);
|
||||
unsigned long verif_masked_addr;
|
||||
size_t verif_size;
|
||||
bool verif_is_write;
|
||||
encoded_watchpoint = encode_watchpoint(addr, size, is_write);
|
||||
|
||||
/* Check special watchpoints */
|
||||
if (WARN_ON(decode_watchpoint(
|
||||
INVALID_WATCHPOINT, &verif_masked_addr,
|
||||
&verif_size, &verif_is_write)))
|
||||
if (WARN_ON(decode_watchpoint(INVALID_WATCHPOINT, &verif_masked_addr, &verif_size, &verif_is_write)))
|
||||
return false;
|
||||
if (WARN_ON(decode_watchpoint(
|
||||
CONSUMED_WATCHPOINT, &verif_masked_addr,
|
||||
&verif_size, &verif_is_write)))
|
||||
if (WARN_ON(decode_watchpoint(CONSUMED_WATCHPOINT, &verif_masked_addr, &verif_size, &verif_is_write)))
|
||||
return false;
|
||||
|
||||
/* Check decoding watchpoint returns same data */
|
||||
if (WARN_ON(!decode_watchpoint(
|
||||
encoded_watchpoint, &verif_masked_addr,
|
||||
&verif_size, &verif_is_write)))
|
||||
if (WARN_ON(!decode_watchpoint(encoded_watchpoint, &verif_masked_addr, &verif_size, &verif_is_write)))
|
||||
return false;
|
||||
if (WARN_ON(verif_masked_addr !=
|
||||
(addr & WATCHPOINT_ADDR_MASK)))
|
||||
if (WARN_ON(verif_masked_addr != (addr & WATCHPOINT_ADDR_MASK)))
|
||||
goto fail;
|
||||
if (WARN_ON(verif_size != size))
|
||||
goto fail;
|
||||
@ -78,19 +74,16 @@ static bool test_encode_decode(void)
|
||||
continue;
|
||||
fail:
|
||||
pr_err("%s fail: %s %zu bytes @ %lx -> encoded: %lx -> %s %zu bytes @ %lx\n",
|
||||
__func__, is_write ? "write" : "read", size,
|
||||
addr, encoded_watchpoint,
|
||||
verif_is_write ? "write" : "read", verif_size,
|
||||
verif_masked_addr);
|
||||
__func__, is_write ? "write" : "read", size, addr, encoded_watchpoint,
|
||||
verif_is_write ? "write" : "read", verif_size, verif_masked_addr);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Test access matching function. */
|
||||
static bool test_matching_access(void)
|
||||
static bool __init test_matching_access(void)
|
||||
{
|
||||
if (WARN_ON(!matching_access(10, 1, 10, 1)))
|
||||
return false;
|
||||
@ -115,6 +108,143 @@ static bool test_matching_access(void)
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Correct memory barrier instrumentation is critical to avoiding false
|
||||
* positives: simple test to check at boot certain barriers are always properly
|
||||
* instrumented. See kcsan_test for a more complete test.
|
||||
*/
|
||||
static DEFINE_SPINLOCK(test_spinlock);
|
||||
static bool __init test_barrier(void)
|
||||
{
|
||||
#ifdef CONFIG_KCSAN_WEAK_MEMORY
|
||||
struct kcsan_scoped_access *reorder_access = ¤t->kcsan_ctx.reorder_access;
|
||||
#else
|
||||
struct kcsan_scoped_access *reorder_access = NULL;
|
||||
#endif
|
||||
bool ret = true;
|
||||
arch_spinlock_t arch_spinlock = __ARCH_SPIN_LOCK_UNLOCKED;
|
||||
atomic_t dummy;
|
||||
long test_var;
|
||||
|
||||
if (!reorder_access || !IS_ENABLED(CONFIG_SMP))
|
||||
return true;
|
||||
|
||||
#define __KCSAN_CHECK_BARRIER(access_type, barrier, name) \
|
||||
do { \
|
||||
reorder_access->type = (access_type) | KCSAN_ACCESS_SCOPED; \
|
||||
reorder_access->size = 1; \
|
||||
barrier; \
|
||||
if (reorder_access->size != 0) { \
|
||||
pr_err("improperly instrumented type=(" #access_type "): " name "\n"); \
|
||||
ret = false; \
|
||||
} \
|
||||
} while (0)
|
||||
#define KCSAN_CHECK_READ_BARRIER(b) __KCSAN_CHECK_BARRIER(0, b, #b)
|
||||
#define KCSAN_CHECK_WRITE_BARRIER(b) __KCSAN_CHECK_BARRIER(KCSAN_ACCESS_WRITE, b, #b)
|
||||
#define KCSAN_CHECK_RW_BARRIER(b) __KCSAN_CHECK_BARRIER(KCSAN_ACCESS_WRITE | KCSAN_ACCESS_COMPOUND, b, #b)
|
||||
|
||||
kcsan_nestable_atomic_begin(); /* No watchpoints in called functions. */
|
||||
|
||||
KCSAN_CHECK_READ_BARRIER(mb());
|
||||
KCSAN_CHECK_READ_BARRIER(rmb());
|
||||
KCSAN_CHECK_READ_BARRIER(smp_mb());
|
||||
KCSAN_CHECK_READ_BARRIER(smp_rmb());
|
||||
KCSAN_CHECK_READ_BARRIER(dma_rmb());
|
||||
KCSAN_CHECK_READ_BARRIER(smp_mb__before_atomic());
|
||||
KCSAN_CHECK_READ_BARRIER(smp_mb__after_atomic());
|
||||
KCSAN_CHECK_READ_BARRIER(smp_mb__after_spinlock());
|
||||
KCSAN_CHECK_READ_BARRIER(smp_store_mb(test_var, 0));
|
||||
KCSAN_CHECK_READ_BARRIER(smp_store_release(&test_var, 0));
|
||||
KCSAN_CHECK_READ_BARRIER(xchg(&test_var, 0));
|
||||
KCSAN_CHECK_READ_BARRIER(xchg_release(&test_var, 0));
|
||||
KCSAN_CHECK_READ_BARRIER(cmpxchg(&test_var, 0, 0));
|
||||
KCSAN_CHECK_READ_BARRIER(cmpxchg_release(&test_var, 0, 0));
|
||||
KCSAN_CHECK_READ_BARRIER(atomic_set_release(&dummy, 0));
|
||||
KCSAN_CHECK_READ_BARRIER(atomic_add_return(1, &dummy));
|
||||
KCSAN_CHECK_READ_BARRIER(atomic_add_return_release(1, &dummy));
|
||||
KCSAN_CHECK_READ_BARRIER(atomic_fetch_add(1, &dummy));
|
||||
KCSAN_CHECK_READ_BARRIER(atomic_fetch_add_release(1, &dummy));
|
||||
KCSAN_CHECK_READ_BARRIER(test_and_set_bit(0, &test_var));
|
||||
KCSAN_CHECK_READ_BARRIER(test_and_clear_bit(0, &test_var));
|
||||
KCSAN_CHECK_READ_BARRIER(test_and_change_bit(0, &test_var));
|
||||
KCSAN_CHECK_READ_BARRIER(clear_bit_unlock(0, &test_var));
|
||||
KCSAN_CHECK_READ_BARRIER(__clear_bit_unlock(0, &test_var));
|
||||
arch_spin_lock(&arch_spinlock);
|
||||
KCSAN_CHECK_READ_BARRIER(arch_spin_unlock(&arch_spinlock));
|
||||
spin_lock(&test_spinlock);
|
||||
KCSAN_CHECK_READ_BARRIER(spin_unlock(&test_spinlock));
|
||||
|
||||
KCSAN_CHECK_WRITE_BARRIER(mb());
|
||||
KCSAN_CHECK_WRITE_BARRIER(wmb());
|
||||
KCSAN_CHECK_WRITE_BARRIER(smp_mb());
|
||||
KCSAN_CHECK_WRITE_BARRIER(smp_wmb());
|
||||
KCSAN_CHECK_WRITE_BARRIER(dma_wmb());
|
||||
KCSAN_CHECK_WRITE_BARRIER(smp_mb__before_atomic());
|
||||
KCSAN_CHECK_WRITE_BARRIER(smp_mb__after_atomic());
|
||||
KCSAN_CHECK_WRITE_BARRIER(smp_mb__after_spinlock());
|
||||
KCSAN_CHECK_WRITE_BARRIER(smp_store_mb(test_var, 0));
|
||||
KCSAN_CHECK_WRITE_BARRIER(smp_store_release(&test_var, 0));
|
||||
KCSAN_CHECK_WRITE_BARRIER(xchg(&test_var, 0));
|
||||
KCSAN_CHECK_WRITE_BARRIER(xchg_release(&test_var, 0));
|
||||
KCSAN_CHECK_WRITE_BARRIER(cmpxchg(&test_var, 0, 0));
|
||||
KCSAN_CHECK_WRITE_BARRIER(cmpxchg_release(&test_var, 0, 0));
|
||||
KCSAN_CHECK_WRITE_BARRIER(atomic_set_release(&dummy, 0));
|
||||
KCSAN_CHECK_WRITE_BARRIER(atomic_add_return(1, &dummy));
|
||||
KCSAN_CHECK_WRITE_BARRIER(atomic_add_return_release(1, &dummy));
|
||||
KCSAN_CHECK_WRITE_BARRIER(atomic_fetch_add(1, &dummy));
|
||||
KCSAN_CHECK_WRITE_BARRIER(atomic_fetch_add_release(1, &dummy));
|
||||
KCSAN_CHECK_WRITE_BARRIER(test_and_set_bit(0, &test_var));
|
||||
KCSAN_CHECK_WRITE_BARRIER(test_and_clear_bit(0, &test_var));
|
||||
KCSAN_CHECK_WRITE_BARRIER(test_and_change_bit(0, &test_var));
|
||||
KCSAN_CHECK_WRITE_BARRIER(clear_bit_unlock(0, &test_var));
|
||||
KCSAN_CHECK_WRITE_BARRIER(__clear_bit_unlock(0, &test_var));
|
||||
arch_spin_lock(&arch_spinlock);
|
||||
KCSAN_CHECK_WRITE_BARRIER(arch_spin_unlock(&arch_spinlock));
|
||||
spin_lock(&test_spinlock);
|
||||
KCSAN_CHECK_WRITE_BARRIER(spin_unlock(&test_spinlock));
|
||||
|
||||
KCSAN_CHECK_RW_BARRIER(mb());
|
||||
KCSAN_CHECK_RW_BARRIER(wmb());
|
||||
KCSAN_CHECK_RW_BARRIER(rmb());
|
||||
KCSAN_CHECK_RW_BARRIER(smp_mb());
|
||||
KCSAN_CHECK_RW_BARRIER(smp_wmb());
|
||||
KCSAN_CHECK_RW_BARRIER(smp_rmb());
|
||||
KCSAN_CHECK_RW_BARRIER(dma_wmb());
|
||||
KCSAN_CHECK_RW_BARRIER(dma_rmb());
|
||||
KCSAN_CHECK_RW_BARRIER(smp_mb__before_atomic());
|
||||
KCSAN_CHECK_RW_BARRIER(smp_mb__after_atomic());
|
||||
KCSAN_CHECK_RW_BARRIER(smp_mb__after_spinlock());
|
||||
KCSAN_CHECK_RW_BARRIER(smp_store_mb(test_var, 0));
|
||||
KCSAN_CHECK_RW_BARRIER(smp_store_release(&test_var, 0));
|
||||
KCSAN_CHECK_RW_BARRIER(xchg(&test_var, 0));
|
||||
KCSAN_CHECK_RW_BARRIER(xchg_release(&test_var, 0));
|
||||
KCSAN_CHECK_RW_BARRIER(cmpxchg(&test_var, 0, 0));
|
||||
KCSAN_CHECK_RW_BARRIER(cmpxchg_release(&test_var, 0, 0));
|
||||
KCSAN_CHECK_RW_BARRIER(atomic_set_release(&dummy, 0));
|
||||
KCSAN_CHECK_RW_BARRIER(atomic_add_return(1, &dummy));
|
||||
KCSAN_CHECK_RW_BARRIER(atomic_add_return_release(1, &dummy));
|
||||
KCSAN_CHECK_RW_BARRIER(atomic_fetch_add(1, &dummy));
|
||||
KCSAN_CHECK_RW_BARRIER(atomic_fetch_add_release(1, &dummy));
|
||||
KCSAN_CHECK_RW_BARRIER(test_and_set_bit(0, &test_var));
|
||||
KCSAN_CHECK_RW_BARRIER(test_and_clear_bit(0, &test_var));
|
||||
KCSAN_CHECK_RW_BARRIER(test_and_change_bit(0, &test_var));
|
||||
KCSAN_CHECK_RW_BARRIER(clear_bit_unlock(0, &test_var));
|
||||
KCSAN_CHECK_RW_BARRIER(__clear_bit_unlock(0, &test_var));
|
||||
arch_spin_lock(&arch_spinlock);
|
||||
KCSAN_CHECK_RW_BARRIER(arch_spin_unlock(&arch_spinlock));
|
||||
spin_lock(&test_spinlock);
|
||||
KCSAN_CHECK_RW_BARRIER(spin_unlock(&test_spinlock));
|
||||
|
||||
#ifdef clear_bit_unlock_is_negative_byte
|
||||
KCSAN_CHECK_RW_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var));
|
||||
KCSAN_CHECK_READ_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var));
|
||||
KCSAN_CHECK_WRITE_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var));
|
||||
#endif
|
||||
kcsan_nestable_atomic_end();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int __init kcsan_selftest(void)
|
||||
{
|
||||
int passed = 0;
|
||||
@ -132,6 +262,7 @@ static int __init kcsan_selftest(void)
|
||||
RUN_TEST(test_requires);
|
||||
RUN_TEST(test_encode_decode);
|
||||
RUN_TEST(test_matching_access);
|
||||
RUN_TEST(test_barrier);
|
||||
|
||||
pr_info("selftest: %d/%d tests passed\n", passed, total);
|
||||
if (passed != total)
|
||||
|
@ -81,7 +81,7 @@ int kexec_should_crash(struct task_struct *p)
|
||||
if (crash_kexec_post_notifiers)
|
||||
return 0;
|
||||
/*
|
||||
* There are 4 panic() calls in do_exit() path, each of which
|
||||
* There are 4 panic() calls in make_task_dead() path, each of which
|
||||
* corresponds to each of these 4 conditions.
|
||||
*/
|
||||
if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
|
||||
|
@ -556,6 +556,11 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf,
|
||||
if (kbuf->image->type == KEXEC_TYPE_CRASH)
|
||||
return func(&crashk_res, kbuf);
|
||||
|
||||
/*
|
||||
* Using MEMBLOCK_NONE will properly skip MEMBLOCK_DRIVER_MANAGED. See
|
||||
* IORESOURCE_SYSRAM_DRIVER_MANAGED handling in
|
||||
* locate_mem_hole_callback().
|
||||
*/
|
||||
if (kbuf->top_down) {
|
||||
for_each_free_mem_range_reverse(i, NUMA_NO_NODE, MEMBLOCK_NONE,
|
||||
&mstart, &mend, NULL) {
|
||||
|
538
kernel/kprobes.c
538
kernel/kprobes.c
File diff suppressed because it is too large
Load Diff
139
kernel/kthread.c
139
kernel/kthread.c
@ -52,6 +52,7 @@ struct kthread_create_info
|
||||
struct kthread {
|
||||
unsigned long flags;
|
||||
unsigned int cpu;
|
||||
int result;
|
||||
int (*threadfn)(void *);
|
||||
void *data;
|
||||
mm_segment_t oldfs;
|
||||
@ -60,6 +61,8 @@ struct kthread {
|
||||
#ifdef CONFIG_BLK_CGROUP
|
||||
struct cgroup_subsys_state *blkcg_css;
|
||||
#endif
|
||||
/* To store the full name if task comm is truncated. */
|
||||
char *full_name;
|
||||
};
|
||||
|
||||
enum KTHREAD_BITS {
|
||||
@ -71,7 +74,7 @@ enum KTHREAD_BITS {
|
||||
static inline struct kthread *to_kthread(struct task_struct *k)
|
||||
{
|
||||
WARN_ON(!(k->flags & PF_KTHREAD));
|
||||
return (__force void *)k->set_child_tid;
|
||||
return k->worker_private;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -79,7 +82,7 @@ static inline struct kthread *to_kthread(struct task_struct *k)
|
||||
*
|
||||
* Per construction; when:
|
||||
*
|
||||
* (p->flags & PF_KTHREAD) && p->set_child_tid
|
||||
* (p->flags & PF_KTHREAD) && p->worker_private
|
||||
*
|
||||
* the task is both a kthread and struct kthread is persistent. However
|
||||
* PF_KTHREAD on it's own is not, kernel_thread() can exec() (See umh.c and
|
||||
@ -87,26 +90,41 @@ static inline struct kthread *to_kthread(struct task_struct *k)
|
||||
*/
|
||||
static inline struct kthread *__to_kthread(struct task_struct *p)
|
||||
{
|
||||
void *kthread = (__force void *)p->set_child_tid;
|
||||
void *kthread = p->worker_private;
|
||||
if (kthread && !(p->flags & PF_KTHREAD))
|
||||
kthread = NULL;
|
||||
return kthread;
|
||||
}
|
||||
|
||||
void set_kthread_struct(struct task_struct *p)
|
||||
void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk)
|
||||
{
|
||||
struct kthread *kthread = to_kthread(tsk);
|
||||
|
||||
if (!kthread || !kthread->full_name) {
|
||||
__get_task_comm(buf, buf_size, tsk);
|
||||
return;
|
||||
}
|
||||
|
||||
strscpy_pad(buf, kthread->full_name, buf_size);
|
||||
}
|
||||
|
||||
bool set_kthread_struct(struct task_struct *p)
|
||||
{
|
||||
struct kthread *kthread;
|
||||
|
||||
if (__to_kthread(p))
|
||||
return;
|
||||
if (WARN_ON_ONCE(to_kthread(p)))
|
||||
return false;
|
||||
|
||||
kthread = kzalloc(sizeof(*kthread), GFP_KERNEL);
|
||||
/*
|
||||
* We abuse ->set_child_tid to avoid the new member and because it
|
||||
* can't be wrongly copied by copy_process(). We also rely on fact
|
||||
* that the caller can't exec, so PF_KTHREAD can't be cleared.
|
||||
*/
|
||||
p->set_child_tid = (__force void __user *)kthread;
|
||||
if (!kthread)
|
||||
return false;
|
||||
|
||||
init_completion(&kthread->exited);
|
||||
init_completion(&kthread->parked);
|
||||
p->vfork_done = &kthread->exited;
|
||||
|
||||
p->worker_private = kthread;
|
||||
return true;
|
||||
}
|
||||
|
||||
void free_kthread_struct(struct task_struct *k)
|
||||
@ -114,13 +132,17 @@ void free_kthread_struct(struct task_struct *k)
|
||||
struct kthread *kthread;
|
||||
|
||||
/*
|
||||
* Can be NULL if this kthread was created by kernel_thread()
|
||||
* or if kmalloc() in kthread() failed.
|
||||
* Can be NULL if kmalloc() in set_kthread_struct() failed.
|
||||
*/
|
||||
kthread = to_kthread(k);
|
||||
if (!kthread)
|
||||
return;
|
||||
|
||||
#ifdef CONFIG_BLK_CGROUP
|
||||
WARN_ON_ONCE(kthread && kthread->blkcg_css);
|
||||
WARN_ON_ONCE(kthread->blkcg_css);
|
||||
#endif
|
||||
k->worker_private = NULL;
|
||||
kfree(kthread->full_name);
|
||||
kfree(kthread);
|
||||
}
|
||||
|
||||
@ -268,8 +290,47 @@ void kthread_parkme(void)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kthread_parkme);
|
||||
|
||||
/**
|
||||
* kthread_exit - Cause the current kthread return @result to kthread_stop().
|
||||
* @result: The integer value to return to kthread_stop().
|
||||
*
|
||||
* While kthread_exit can be called directly, it exists so that
|
||||
* functions which do some additional work in non-modular code such as
|
||||
* module_put_and_kthread_exit can be implemented.
|
||||
*
|
||||
* Does not return.
|
||||
*/
|
||||
void __noreturn kthread_exit(long result)
|
||||
{
|
||||
struct kthread *kthread = to_kthread(current);
|
||||
kthread->result = result;
|
||||
do_exit(0);
|
||||
}
|
||||
|
||||
/**
|
||||
* kthread_complete_and_exit - Exit the current kthread.
|
||||
* @comp: Completion to complete
|
||||
* @code: The integer value to return to kthread_stop().
|
||||
*
|
||||
* If present complete @comp and the reuturn code to kthread_stop().
|
||||
*
|
||||
* A kernel thread whose module may be removed after the completion of
|
||||
* @comp can use this function exit safely.
|
||||
*
|
||||
* Does not return.
|
||||
*/
|
||||
void __noreturn kthread_complete_and_exit(struct completion *comp, long code)
|
||||
{
|
||||
if (comp)
|
||||
complete(comp);
|
||||
|
||||
kthread_exit(code);
|
||||
}
|
||||
EXPORT_SYMBOL(kthread_complete_and_exit);
|
||||
|
||||
static int kthread(void *_create)
|
||||
{
|
||||
static const struct sched_param param = { .sched_priority = 0 };
|
||||
/* Copy data: it's on kthread's stack */
|
||||
struct kthread_create_info *create = _create;
|
||||
int (*threadfn)(void *data) = create->threadfn;
|
||||
@ -278,27 +339,24 @@ static int kthread(void *_create)
|
||||
struct kthread *self;
|
||||
int ret;
|
||||
|
||||
set_kthread_struct(current);
|
||||
self = to_kthread(current);
|
||||
|
||||
/* If user was SIGKILLed, I release the structure. */
|
||||
done = xchg(&create->done, NULL);
|
||||
if (!done) {
|
||||
kfree(create);
|
||||
do_exit(-EINTR);
|
||||
}
|
||||
|
||||
if (!self) {
|
||||
create->result = ERR_PTR(-ENOMEM);
|
||||
complete(done);
|
||||
do_exit(-ENOMEM);
|
||||
kthread_exit(-EINTR);
|
||||
}
|
||||
|
||||
self->threadfn = threadfn;
|
||||
self->data = data;
|
||||
init_completion(&self->exited);
|
||||
init_completion(&self->parked);
|
||||
current->vfork_done = &self->exited;
|
||||
|
||||
/*
|
||||
* The new thread inherited kthreadd's priority and CPU mask. Reset
|
||||
* back to default in case they have been changed.
|
||||
*/
|
||||
sched_setscheduler_nocheck(current, SCHED_NORMAL, ¶m);
|
||||
set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_KTHREAD));
|
||||
|
||||
/* OK, tell user we're spawned, wait for stop or wakeup */
|
||||
__set_current_state(TASK_UNINTERRUPTIBLE);
|
||||
@ -318,7 +376,7 @@ static int kthread(void *_create)
|
||||
__kthread_parkme(self);
|
||||
ret = threadfn(data);
|
||||
}
|
||||
do_exit(ret);
|
||||
kthread_exit(ret);
|
||||
}
|
||||
|
||||
/* called from kernel_clone() to get node information for about to be created task */
|
||||
@ -397,22 +455,24 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
|
||||
}
|
||||
task = create->result;
|
||||
if (!IS_ERR(task)) {
|
||||
static const struct sched_param param = { .sched_priority = 0 };
|
||||
char name[TASK_COMM_LEN];
|
||||
va_list aq;
|
||||
int len;
|
||||
|
||||
/*
|
||||
* task is already visible to other tasks, so updating
|
||||
* COMM must be protected.
|
||||
*/
|
||||
vsnprintf(name, sizeof(name), namefmt, args);
|
||||
va_copy(aq, args);
|
||||
len = vsnprintf(name, sizeof(name), namefmt, aq);
|
||||
va_end(aq);
|
||||
if (len >= TASK_COMM_LEN) {
|
||||
struct kthread *kthread = to_kthread(task);
|
||||
|
||||
/* leave it truncated when out of memory. */
|
||||
kthread->full_name = kvasprintf(GFP_KERNEL, namefmt, args);
|
||||
}
|
||||
set_task_comm(task, name);
|
||||
/*
|
||||
* root may have changed our (kthreadd's) priority or CPU mask.
|
||||
* The kernel thread should not inherit these properties.
|
||||
*/
|
||||
sched_setscheduler_nocheck(task, SCHED_NORMAL, ¶m);
|
||||
set_cpus_allowed_ptr(task,
|
||||
housekeeping_cpumask(HK_FLAG_KTHREAD));
|
||||
}
|
||||
kfree(create);
|
||||
return task;
|
||||
@ -433,7 +493,7 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
|
||||
* If thread is going to be bound on a particular cpu, give its node
|
||||
* in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE.
|
||||
* When woken, the thread will run @threadfn() with @data as its
|
||||
* argument. @threadfn() can either call do_exit() directly if it is a
|
||||
* argument. @threadfn() can either return directly if it is a
|
||||
* standalone thread for which no one will call kthread_stop(), or
|
||||
* return when 'kthread_should_stop()' is true (which means
|
||||
* kthread_stop() has been called). The return value should be zero
|
||||
@ -523,6 +583,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
|
||||
to_kthread(p)->cpu = cpu;
|
||||
return p;
|
||||
}
|
||||
EXPORT_SYMBOL(kthread_create_on_cpu);
|
||||
|
||||
void kthread_set_per_cpu(struct task_struct *k, int cpu)
|
||||
{
|
||||
@ -627,7 +688,7 @@ EXPORT_SYMBOL_GPL(kthread_park);
|
||||
* instead of calling wake_up_process(): the thread will exit without
|
||||
* calling threadfn().
|
||||
*
|
||||
* If threadfn() may call do_exit() itself, the caller must ensure
|
||||
* If threadfn() may call kthread_exit() itself, the caller must ensure
|
||||
* task_struct can't go away.
|
||||
*
|
||||
* Returns the result of threadfn(), or %-EINTR if wake_up_process()
|
||||
@ -646,7 +707,7 @@ int kthread_stop(struct task_struct *k)
|
||||
kthread_unpark(k);
|
||||
wake_up_process(k);
|
||||
wait_for_completion(&kthread->exited);
|
||||
ret = k->exit_code;
|
||||
ret = kthread->result;
|
||||
put_task_struct(k);
|
||||
|
||||
trace_sched_kthread_stop_ret(ret);
|
||||
|
@ -862,14 +862,11 @@ static void klp_init_object_early(struct klp_patch *patch,
|
||||
list_add_tail(&obj->node, &patch->obj_list);
|
||||
}
|
||||
|
||||
static int klp_init_patch_early(struct klp_patch *patch)
|
||||
static void klp_init_patch_early(struct klp_patch *patch)
|
||||
{
|
||||
struct klp_object *obj;
|
||||
struct klp_func *func;
|
||||
|
||||
if (!patch->objs)
|
||||
return -EINVAL;
|
||||
|
||||
INIT_LIST_HEAD(&patch->list);
|
||||
INIT_LIST_HEAD(&patch->obj_list);
|
||||
kobject_init(&patch->kobj, &klp_ktype_patch);
|
||||
@ -879,20 +876,12 @@ static int klp_init_patch_early(struct klp_patch *patch)
|
||||
init_completion(&patch->finish);
|
||||
|
||||
klp_for_each_object_static(patch, obj) {
|
||||
if (!obj->funcs)
|
||||
return -EINVAL;
|
||||
|
||||
klp_init_object_early(patch, obj);
|
||||
|
||||
klp_for_each_func_static(obj, func) {
|
||||
klp_init_func_early(obj, func);
|
||||
}
|
||||
}
|
||||
|
||||
if (!try_module_get(patch->mod))
|
||||
return -ENODEV;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int klp_init_patch(struct klp_patch *patch)
|
||||
@ -1024,10 +1013,17 @@ static int __klp_enable_patch(struct klp_patch *patch)
|
||||
int klp_enable_patch(struct klp_patch *patch)
|
||||
{
|
||||
int ret;
|
||||
struct klp_object *obj;
|
||||
|
||||
if (!patch || !patch->mod)
|
||||
if (!patch || !patch->mod || !patch->objs)
|
||||
return -EINVAL;
|
||||
|
||||
klp_for_each_object_static(patch, obj) {
|
||||
if (!obj->funcs)
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
|
||||
if (!is_livepatch_module(patch->mod)) {
|
||||
pr_err("module %s is not marked as a livepatch module\n",
|
||||
patch->mod->name);
|
||||
@ -1051,12 +1047,13 @@ int klp_enable_patch(struct klp_patch *patch)
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
ret = klp_init_patch_early(patch);
|
||||
if (ret) {
|
||||
if (!try_module_get(patch->mod)) {
|
||||
mutex_unlock(&klp_mutex);
|
||||
return ret;
|
||||
return -ENODEV;
|
||||
}
|
||||
|
||||
klp_init_patch_early(patch);
|
||||
|
||||
ret = klp_init_patch(patch);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
@ -49,14 +49,15 @@ static void notrace klp_ftrace_handler(unsigned long ip,
|
||||
|
||||
ops = container_of(fops, struct klp_ops, fops);
|
||||
|
||||
/*
|
||||
* The ftrace_test_recursion_trylock() will disable preemption,
|
||||
* which is required for the variant of synchronize_rcu() that is
|
||||
* used to allow patching functions where RCU is not watching.
|
||||
* See klp_synchronize_transition() for more details.
|
||||
*/
|
||||
bit = ftrace_test_recursion_trylock(ip, parent_ip);
|
||||
if (WARN_ON_ONCE(bit < 0))
|
||||
return;
|
||||
/*
|
||||
* A variant of synchronize_rcu() is used to allow patching functions
|
||||
* where RCU is not watching, see klp_synchronize_transition().
|
||||
*/
|
||||
preempt_disable_notrace();
|
||||
|
||||
func = list_first_or_null_rcu(&ops->func_stack, struct klp_func,
|
||||
stack_node);
|
||||
@ -120,7 +121,6 @@ static void notrace klp_ftrace_handler(unsigned long ip,
|
||||
klp_arch_set_pc(fregs, (unsigned long)func->new_func);
|
||||
|
||||
unlock:
|
||||
preempt_enable_notrace();
|
||||
ftrace_test_recursion_unlock(bit);
|
||||
}
|
||||
|
||||
|
@ -13,7 +13,6 @@
|
||||
#include "core.h"
|
||||
#include "patch.h"
|
||||
#include "transition.h"
|
||||
#include "../sched/sched.h"
|
||||
|
||||
#define MAX_STACK_ENTRIES 100
|
||||
#define STACK_ERR_BUF_SIZE 128
|
||||
@ -240,7 +239,7 @@ static int klp_check_stack_func(struct klp_func *func, unsigned long *entries,
|
||||
* Determine whether it's safe to transition the task to the target patch state
|
||||
* by looking for any to-be-patched or to-be-unpatched functions on its stack.
|
||||
*/
|
||||
static int klp_check_stack(struct task_struct *task, char *err_buf)
|
||||
static int klp_check_stack(struct task_struct *task, const char **oldname)
|
||||
{
|
||||
static unsigned long entries[MAX_STACK_ENTRIES];
|
||||
struct klp_object *obj;
|
||||
@ -248,12 +247,8 @@ static int klp_check_stack(struct task_struct *task, char *err_buf)
|
||||
int ret, nr_entries;
|
||||
|
||||
ret = stack_trace_save_tsk_reliable(task, entries, ARRAY_SIZE(entries));
|
||||
if (ret < 0) {
|
||||
snprintf(err_buf, STACK_ERR_BUF_SIZE,
|
||||
"%s: %s:%d has an unreliable stack\n",
|
||||
__func__, task->comm, task->pid);
|
||||
return ret;
|
||||
}
|
||||
if (ret < 0)
|
||||
return -EINVAL;
|
||||
nr_entries = ret;
|
||||
|
||||
klp_for_each_object(klp_transition_patch, obj) {
|
||||
@ -262,11 +257,8 @@ static int klp_check_stack(struct task_struct *task, char *err_buf)
|
||||
klp_for_each_func(obj, func) {
|
||||
ret = klp_check_stack_func(func, entries, nr_entries);
|
||||
if (ret) {
|
||||
snprintf(err_buf, STACK_ERR_BUF_SIZE,
|
||||
"%s: %s:%d is sleeping on function %s\n",
|
||||
__func__, task->comm, task->pid,
|
||||
func->old_name);
|
||||
return ret;
|
||||
*oldname = func->old_name;
|
||||
return -EADDRINUSE;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -274,6 +266,22 @@ static int klp_check_stack(struct task_struct *task, char *err_buf)
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int klp_check_and_switch_task(struct task_struct *task, void *arg)
|
||||
{
|
||||
int ret;
|
||||
|
||||
if (task_curr(task) && task != current)
|
||||
return -EBUSY;
|
||||
|
||||
ret = klp_check_stack(task, arg);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
clear_tsk_thread_flag(task, TIF_PATCH_PENDING);
|
||||
task->patch_state = klp_target_state;
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Try to safely switch a task to the target patch state. If it's currently
|
||||
* running, or it's sleeping on a to-be-patched or to-be-unpatched function, or
|
||||
@ -281,13 +289,8 @@ static int klp_check_stack(struct task_struct *task, char *err_buf)
|
||||
*/
|
||||
static bool klp_try_switch_task(struct task_struct *task)
|
||||
{
|
||||
static char err_buf[STACK_ERR_BUF_SIZE];
|
||||
struct rq *rq;
|
||||
struct rq_flags flags;
|
||||
const char *old_name;
|
||||
int ret;
|
||||
bool success = false;
|
||||
|
||||
err_buf[0] = '\0';
|
||||
|
||||
/* check if this task has already switched over */
|
||||
if (task->patch_state == klp_target_state)
|
||||
@ -305,36 +308,31 @@ static bool klp_try_switch_task(struct task_struct *task)
|
||||
* functions. If all goes well, switch the task to the target patch
|
||||
* state.
|
||||
*/
|
||||
rq = task_rq_lock(task, &flags);
|
||||
ret = task_call_func(task, klp_check_and_switch_task, &old_name);
|
||||
switch (ret) {
|
||||
case 0: /* success */
|
||||
break;
|
||||
|
||||
if (task_running(rq, task) && task != current) {
|
||||
snprintf(err_buf, STACK_ERR_BUF_SIZE,
|
||||
"%s: %s:%d is running\n", __func__, task->comm,
|
||||
task->pid);
|
||||
goto done;
|
||||
case -EBUSY: /* klp_check_and_switch_task() */
|
||||
pr_debug("%s: %s:%d is running\n",
|
||||
__func__, task->comm, task->pid);
|
||||
break;
|
||||
case -EINVAL: /* klp_check_and_switch_task() */
|
||||
pr_debug("%s: %s:%d has an unreliable stack\n",
|
||||
__func__, task->comm, task->pid);
|
||||
break;
|
||||
case -EADDRINUSE: /* klp_check_and_switch_task() */
|
||||
pr_debug("%s: %s:%d is sleeping on function %s\n",
|
||||
__func__, task->comm, task->pid, old_name);
|
||||
break;
|
||||
|
||||
default:
|
||||
pr_debug("%s: Unknown error code (%d) when trying to switch %s:%d\n",
|
||||
__func__, ret, task->comm, task->pid);
|
||||
break;
|
||||
}
|
||||
|
||||
ret = klp_check_stack(task, err_buf);
|
||||
if (ret)
|
||||
goto done;
|
||||
|
||||
success = true;
|
||||
|
||||
clear_tsk_thread_flag(task, TIF_PATCH_PENDING);
|
||||
task->patch_state = klp_target_state;
|
||||
|
||||
done:
|
||||
task_rq_unlock(rq, task, &flags);
|
||||
|
||||
/*
|
||||
* Due to console deadlock issues, pr_debug() can't be used while
|
||||
* holding the task rq lock. Instead we have to use a temporary buffer
|
||||
* and print the debug message after releasing the lock.
|
||||
*/
|
||||
if (err_buf[0] != '\0')
|
||||
pr_debug("%s", err_buf);
|
||||
|
||||
return success;
|
||||
return !ret;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -415,8 +413,11 @@ void klp_try_complete_transition(void)
|
||||
for_each_possible_cpu(cpu) {
|
||||
task = idle_task(cpu);
|
||||
if (cpu_online(cpu)) {
|
||||
if (!klp_try_switch_task(task))
|
||||
if (!klp_try_switch_task(task)) {
|
||||
complete = false;
|
||||
/* Make idle task go through the main loop. */
|
||||
wake_up_if_idle(cpu);
|
||||
}
|
||||
} else if (task->patch_state != klp_target_state) {
|
||||
/* offline idle tasks can be switched immediately */
|
||||
clear_tsk_thread_flag(task, TIF_PATCH_PENDING);
|
||||
|
@ -788,6 +788,21 @@ static int very_verbose(struct lock_class *class)
|
||||
* Is this the address of a static object:
|
||||
*/
|
||||
#ifdef __KERNEL__
|
||||
/*
|
||||
* Check if an address is part of freed initmem. After initmem is freed,
|
||||
* memory can be allocated from it, and such allocations would then have
|
||||
* addresses within the range [_stext, _end].
|
||||
*/
|
||||
#ifndef arch_is_kernel_initmem_freed
|
||||
static int arch_is_kernel_initmem_freed(unsigned long addr)
|
||||
{
|
||||
if (system_state < SYSTEM_FREEING_INITMEM)
|
||||
return 0;
|
||||
|
||||
return init_section_contains((void *)addr, 1);
|
||||
}
|
||||
#endif
|
||||
|
||||
static int static_obj(const void *obj)
|
||||
{
|
||||
unsigned long start = (unsigned long) &_stext,
|
||||
@ -803,9 +818,6 @@ static int static_obj(const void *obj)
|
||||
if ((addr >= start) && (addr < end))
|
||||
return 1;
|
||||
|
||||
if (arch_is_kernel_data(addr))
|
||||
return 1;
|
||||
|
||||
/*
|
||||
* in-kernel percpu var?
|
||||
*/
|
||||
@ -4671,7 +4683,7 @@ print_lock_invalid_wait_context(struct task_struct *curr,
|
||||
/*
|
||||
* Verify the wait_type context.
|
||||
*
|
||||
* This check validates we takes locks in the right wait-type order; that is it
|
||||
* This check validates we take locks in the right wait-type order; that is it
|
||||
* ensures that we do not take mutexes inside spinlocks and do not attempt to
|
||||
* acquire spinlocks inside raw_spinlocks and the sort.
|
||||
*
|
||||
@ -5473,6 +5485,7 @@ static noinstr void check_flags(unsigned long flags)
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef CONFIG_PREEMPT_RT
|
||||
/*
|
||||
* We dont accurately track softirq state in e.g.
|
||||
* hardirq contexts (such as on 4KSTACKS), so only
|
||||
@ -5487,6 +5500,7 @@ static noinstr void check_flags(unsigned long flags)
|
||||
DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
if (!debug_locks)
|
||||
print_irqtrace_events(current);
|
||||
|
@ -1022,23 +1022,23 @@ static int __init lock_torture_init(void)
|
||||
if (onoff_interval > 0) {
|
||||
firsterr = torture_onoff_init(onoff_holdoff * HZ,
|
||||
onoff_interval * HZ, NULL);
|
||||
if (firsterr)
|
||||
if (torture_init_error(firsterr))
|
||||
goto unwind;
|
||||
}
|
||||
if (shuffle_interval > 0) {
|
||||
firsterr = torture_shuffle_init(shuffle_interval);
|
||||
if (firsterr)
|
||||
if (torture_init_error(firsterr))
|
||||
goto unwind;
|
||||
}
|
||||
if (shutdown_secs > 0) {
|
||||
firsterr = torture_shutdown_init(shutdown_secs,
|
||||
lock_torture_cleanup);
|
||||
if (firsterr)
|
||||
if (torture_init_error(firsterr))
|
||||
goto unwind;
|
||||
}
|
||||
if (stutter > 0) {
|
||||
firsterr = torture_stutter_init(stutter, stutter);
|
||||
if (firsterr)
|
||||
if (torture_init_error(firsterr))
|
||||
goto unwind;
|
||||
}
|
||||
|
||||
@ -1047,7 +1047,7 @@ static int __init lock_torture_init(void)
|
||||
sizeof(writer_tasks[0]),
|
||||
GFP_KERNEL);
|
||||
if (writer_tasks == NULL) {
|
||||
VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory");
|
||||
TOROUT_ERRSTRING("writer_tasks: Out of memory");
|
||||
firsterr = -ENOMEM;
|
||||
goto unwind;
|
||||
}
|
||||
@ -1058,7 +1058,7 @@ static int __init lock_torture_init(void)
|
||||
sizeof(reader_tasks[0]),
|
||||
GFP_KERNEL);
|
||||
if (reader_tasks == NULL) {
|
||||
VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory");
|
||||
TOROUT_ERRSTRING("reader_tasks: Out of memory");
|
||||
kfree(writer_tasks);
|
||||
writer_tasks = NULL;
|
||||
firsterr = -ENOMEM;
|
||||
@ -1082,7 +1082,7 @@ static int __init lock_torture_init(void)
|
||||
/* Create writer. */
|
||||
firsterr = torture_create_kthread(lock_torture_writer, &cxt.lwsa[i],
|
||||
writer_tasks[i]);
|
||||
if (firsterr)
|
||||
if (torture_init_error(firsterr))
|
||||
goto unwind;
|
||||
|
||||
create_reader:
|
||||
@ -1091,13 +1091,13 @@ static int __init lock_torture_init(void)
|
||||
/* Create reader. */
|
||||
firsterr = torture_create_kthread(lock_torture_reader, &cxt.lrsa[j],
|
||||
reader_tasks[j]);
|
||||
if (firsterr)
|
||||
if (torture_init_error(firsterr))
|
||||
goto unwind;
|
||||
}
|
||||
if (stat_interval > 0) {
|
||||
firsterr = torture_create_kthread(lock_torture_stats, NULL,
|
||||
stats_task);
|
||||
if (firsterr)
|
||||
if (torture_init_error(firsterr))
|
||||
goto unwind;
|
||||
}
|
||||
torture_init_end();
|
||||
|
@ -94,6 +94,9 @@ static inline unsigned long __owner_flags(unsigned long owner)
|
||||
return owner & MUTEX_FLAGS;
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns: __mutex_owner(lock) on failure or NULL on success.
|
||||
*/
|
||||
static inline struct task_struct *__mutex_trylock_common(struct mutex *lock, bool handoff)
|
||||
{
|
||||
unsigned long owner, curr = (unsigned long)current;
|
||||
@ -348,21 +351,23 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner,
|
||||
{
|
||||
bool ret = true;
|
||||
|
||||
rcu_read_lock();
|
||||
lockdep_assert_preemption_disabled();
|
||||
|
||||
while (__mutex_owner(lock) == owner) {
|
||||
/*
|
||||
* Ensure we emit the owner->on_cpu, dereference _after_
|
||||
* checking lock->owner still matches owner. If that fails,
|
||||
* owner might point to freed memory. If it still matches,
|
||||
* the rcu_read_lock() ensures the memory stays valid.
|
||||
* checking lock->owner still matches owner. And we already
|
||||
* disabled preemption which is equal to the RCU read-side
|
||||
* crital section in optimistic spinning code. Thus the
|
||||
* task_strcut structure won't go away during the spinning
|
||||
* period
|
||||
*/
|
||||
barrier();
|
||||
|
||||
/*
|
||||
* Use vcpu_is_preempted to detect lock holder preemption issue.
|
||||
*/
|
||||
if (!owner->on_cpu || need_resched() ||
|
||||
vcpu_is_preempted(task_cpu(owner))) {
|
||||
if (!owner_on_cpu(owner) || need_resched()) {
|
||||
ret = false;
|
||||
break;
|
||||
}
|
||||
@ -374,7 +379,6 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner,
|
||||
|
||||
cpu_relax();
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
return ret;
|
||||
}
|
||||
@ -387,19 +391,19 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
|
||||
struct task_struct *owner;
|
||||
int retval = 1;
|
||||
|
||||
lockdep_assert_preemption_disabled();
|
||||
|
||||
if (need_resched())
|
||||
return 0;
|
||||
|
||||
rcu_read_lock();
|
||||
owner = __mutex_owner(lock);
|
||||
|
||||
/*
|
||||
* As lock holder preemption issue, we both skip spinning if task is not
|
||||
* on cpu or its cpu is preempted
|
||||
* We already disabled preemption which is equal to the RCU read-side
|
||||
* crital section in optimistic spinning code. Thus the task_strcut
|
||||
* structure won't go away during the spinning period.
|
||||
*/
|
||||
owner = __mutex_owner(lock);
|
||||
if (owner)
|
||||
retval = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
|
||||
rcu_read_unlock();
|
||||
retval = owner_on_cpu(owner);
|
||||
|
||||
/*
|
||||
* If lock->owner is not set, the mutex has been released. Return true
|
||||
@ -736,6 +740,44 @@ __ww_mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass,
|
||||
return __mutex_lock_common(lock, state, subclass, NULL, ip, ww_ctx, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* ww_mutex_trylock - tries to acquire the w/w mutex with optional acquire context
|
||||
* @ww: mutex to lock
|
||||
* @ww_ctx: optional w/w acquire context
|
||||
*
|
||||
* Trylocks a mutex with the optional acquire context; no deadlock detection is
|
||||
* possible. Returns 1 if the mutex has been acquired successfully, 0 otherwise.
|
||||
*
|
||||
* Unlike ww_mutex_lock, no deadlock handling is performed. However, if a @ctx is
|
||||
* specified, -EALREADY handling may happen in calls to ww_mutex_trylock.
|
||||
*
|
||||
* A mutex acquired with this function must be released with ww_mutex_unlock.
|
||||
*/
|
||||
int ww_mutex_trylock(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx)
|
||||
{
|
||||
if (!ww_ctx)
|
||||
return mutex_trylock(&ww->base);
|
||||
|
||||
MUTEX_WARN_ON(ww->base.magic != &ww->base);
|
||||
|
||||
/*
|
||||
* Reset the wounded flag after a kill. No other process can
|
||||
* race and wound us here, since they can't have a valid owner
|
||||
* pointer if we don't have any locks held.
|
||||
*/
|
||||
if (ww_ctx->acquired == 0)
|
||||
ww_ctx->wounded = 0;
|
||||
|
||||
if (__mutex_trylock(&ww->base)) {
|
||||
ww_mutex_set_context_fastpath(ww, ww_ctx);
|
||||
mutex_acquire_nest(&ww->base.dep_map, 0, 1, &ww_ctx->dep_map, _RET_IP_);
|
||||
return 1;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(ww_mutex_trylock);
|
||||
|
||||
#ifdef CONFIG_DEBUG_LOCK_ALLOC
|
||||
void __sched
|
||||
mutex_lock_nested(struct mutex *lock, unsigned int subclass)
|
||||
|
@ -446,17 +446,24 @@ static __always_inline void rt_mutex_adjust_prio(struct task_struct *p)
|
||||
}
|
||||
|
||||
/* RT mutex specific wake_q wrappers */
|
||||
static __always_inline void rt_mutex_wake_q_add_task(struct rt_wake_q_head *wqh,
|
||||
struct task_struct *task,
|
||||
unsigned int wake_state)
|
||||
{
|
||||
if (IS_ENABLED(CONFIG_PREEMPT_RT) && wake_state == TASK_RTLOCK_WAIT) {
|
||||
if (IS_ENABLED(CONFIG_PROVE_LOCKING))
|
||||
WARN_ON_ONCE(wqh->rtlock_task);
|
||||
get_task_struct(task);
|
||||
wqh->rtlock_task = task;
|
||||
} else {
|
||||
wake_q_add(&wqh->head, task);
|
||||
}
|
||||
}
|
||||
|
||||
static __always_inline void rt_mutex_wake_q_add(struct rt_wake_q_head *wqh,
|
||||
struct rt_mutex_waiter *w)
|
||||
{
|
||||
if (IS_ENABLED(CONFIG_PREEMPT_RT) && w->wake_state != TASK_NORMAL) {
|
||||
if (IS_ENABLED(CONFIG_PROVE_LOCKING))
|
||||
WARN_ON_ONCE(wqh->rtlock_task);
|
||||
get_task_struct(w->task);
|
||||
wqh->rtlock_task = w->task;
|
||||
} else {
|
||||
wake_q_add(&wqh->head, w->task);
|
||||
}
|
||||
rt_mutex_wake_q_add_task(wqh, w->task, w->wake_state);
|
||||
}
|
||||
|
||||
static __always_inline void rt_mutex_wake_up_q(struct rt_wake_q_head *wqh)
|
||||
@ -1096,8 +1103,11 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
|
||||
* the other will detect the deadlock and return -EDEADLOCK,
|
||||
* which is wrong, as the other waiter is not in a deadlock
|
||||
* situation.
|
||||
*
|
||||
* Except for ww_mutex, in that case the chain walk must already deal
|
||||
* with spurious cycles, see the comments at [3] and [6].
|
||||
*/
|
||||
if (owner == task)
|
||||
if (owner == task && !(build_ww_mutex() && ww_ctx))
|
||||
return -EDEADLK;
|
||||
|
||||
raw_spin_lock(&task->pi_lock);
|
||||
@ -1372,9 +1382,8 @@ static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock,
|
||||
* for CONFIG_PREEMPT_RCU=y)
|
||||
* - the VCPU on which owner runs is preempted
|
||||
*/
|
||||
if (!owner->on_cpu || need_resched() ||
|
||||
!rt_mutex_waiter_is_top_waiter(lock, waiter) ||
|
||||
vcpu_is_preempted(task_cpu(owner))) {
|
||||
if (!owner_on_cpu(owner) || need_resched() ||
|
||||
!rt_mutex_waiter_is_top_waiter(lock, waiter)) {
|
||||
res = false;
|
||||
break;
|
||||
}
|
||||
|
@ -21,12 +21,13 @@ int max_lock_depth = 1024;
|
||||
*/
|
||||
static __always_inline int __rt_mutex_lock_common(struct rt_mutex *lock,
|
||||
unsigned int state,
|
||||
struct lockdep_map *nest_lock,
|
||||
unsigned int subclass)
|
||||
{
|
||||
int ret;
|
||||
|
||||
might_sleep();
|
||||
mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
|
||||
mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, _RET_IP_);
|
||||
ret = __rt_mutex_lock(&lock->rtmutex, state);
|
||||
if (ret)
|
||||
mutex_release(&lock->dep_map, _RET_IP_);
|
||||
@ -48,10 +49,16 @@ EXPORT_SYMBOL(rt_mutex_base_init);
|
||||
*/
|
||||
void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass)
|
||||
{
|
||||
__rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass);
|
||||
__rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, subclass);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(rt_mutex_lock_nested);
|
||||
|
||||
void __sched _rt_mutex_lock_nest_lock(struct rt_mutex *lock, struct lockdep_map *nest_lock)
|
||||
{
|
||||
__rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, nest_lock, 0);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(_rt_mutex_lock_nest_lock);
|
||||
|
||||
#else /* !CONFIG_DEBUG_LOCK_ALLOC */
|
||||
|
||||
/**
|
||||
@ -61,7 +68,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock_nested);
|
||||
*/
|
||||
void __sched rt_mutex_lock(struct rt_mutex *lock)
|
||||
{
|
||||
__rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0);
|
||||
__rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, 0);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(rt_mutex_lock);
|
||||
#endif
|
||||
@ -77,10 +84,25 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock);
|
||||
*/
|
||||
int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
|
||||
{
|
||||
return __rt_mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0);
|
||||
return __rt_mutex_lock_common(lock, TASK_INTERRUPTIBLE, NULL, 0);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
|
||||
|
||||
/**
|
||||
* rt_mutex_lock_killable - lock a rt_mutex killable
|
||||
*
|
||||
* @lock: the rt_mutex to be locked
|
||||
*
|
||||
* Returns:
|
||||
* 0 on success
|
||||
* -EINTR when interrupted by a signal
|
||||
*/
|
||||
int __sched rt_mutex_lock_killable(struct rt_mutex *lock)
|
||||
{
|
||||
return __rt_mutex_lock_common(lock, TASK_KILLABLE, NULL, 0);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
|
||||
|
||||
/**
|
||||
* rt_mutex_trylock - try to lock a rt_mutex
|
||||
*
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user