mirror of https://github.com/Qortal/Brooklyn.git synced 2025-02-01 07:42:18 +00:00
Raziel K. Crowe 2022-04-02 17:12:23 +05:00
parent 6ca71e00d3
commit 4bdaa608f6
202 changed files with 11330 additions and 5631 deletions

View File

@ -550,7 +550,7 @@ config SCHED_THERMAL_PRESSURE
i.e. put less load on throttled CPUs than on non/less throttled ones.
This requires the architecture to implement
arch_set_thermal_pressure() and arch_scale_thermal_pressure().
arch_update_thermal_pressure() and arch_scale_thermal_pressure().
config BSD_PROCESS_ACCT
bool "BSD Process Accounting"
@ -885,6 +885,11 @@ config ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
config CC_HAS_INT128
def_bool !$(cc-option,$(m64-flag) -D__SIZEOF_INT128__=0) && 64BIT
config CC_IMPLICIT_FALLTHROUGH
string
default "-Wimplicit-fallthrough=5" if CC_IS_GCC && $(cc-option,-Wimplicit-fallthrough=5)
default "-Wimplicit-fallthrough" if CC_IS_CLANG && $(cc-option,-Wunreachable-code-fallthrough)
#
# For architectures that know their GCC __int128 support is sound
#
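The new CC_IMPLICIT_FALLTHROUGH option above selects the strictest fall-through diagnostic the compiler offers; at GCC's -Wimplicit-fallthrough=5 only an attribute marks an intentional fall-through, plain comments no longer do. A minimal sketch of the annotation (plain C, the function and values are illustrative; the kernel wraps the attribute in its own "fallthrough" macro):

static int classify(int n)
{
	switch (n) {
	case 0:
		n += 10;
		/* only the attribute below satisfies -Wimplicit-fallthrough=5;
		 * a bare "fall through" comment would still warn */
		__attribute__((__fallthrough__));
	case 1:
		return n + 1;
	default:
		return 0;
	}
}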
@ -901,7 +906,7 @@ config NUMA_BALANCING
bool "Memory placement aware NUMA scheduler"
depends on ARCH_SUPPORTS_NUMA_BALANCING
depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY
depends on SMP && NUMA && MIGRATION
depends on SMP && NUMA && MIGRATION && !PREEMPT_RT
help
This option adds support for automatic NUMA aware memory/task placement.
The mechanism is quite primitive and is based on migrating memory when
@ -1409,7 +1414,6 @@ config LD_DEAD_CODE_DATA_ELIMINATION
config LD_ORPHAN_WARN
def_bool y
depends on ARCH_WANT_LD_ORPHAN_WARN
depends on !LD_IS_LLD || LLD_VERSION >= 110000
depends on $(ld-option,--orphan-handling=warn)
config SYSCTL
@ -1574,6 +1578,7 @@ config BASE_FULL
config FUTEX
bool "Enable futex support" if EXPERT
depends on !(SPARC32 && SMP)
default y
imply RT_MUTEXES
help
@ -1586,14 +1591,6 @@ config FUTEX_PI
depends on FUTEX && RT_MUTEXES
default y
config HAVE_FUTEX_CMPXCHG
bool
depends on FUTEX
help
Architectures should select this if futex_atomic_cmpxchg_inatomic()
is implemented and always working. This removes a couple of runtime
checks.
config EPOLL
bool "Enable eventpoll support" if EXPERT
default y
@ -1799,6 +1796,10 @@ config HAVE_PERF_EVENTS
help
See tools/perf/design.txt for details.
config GUEST_PERF_EVENTS
bool
depends on HAVE_PERF_EVENTS
config PERF_USE_VMALLOC
bool
help
@ -1896,6 +1897,7 @@ choice
config SLAB
bool "SLAB"
depends on !PREEMPT_RT
select HAVE_HARDENED_USERCOPY_ALLOCATOR
help
The regular slab allocator that is established and known to work
@ -1916,6 +1918,7 @@ config SLUB
config SLOB
depends on EXPERT
bool "SLOB (Simple Allocator)"
depends on !PREEMPT_RT
help
SLOB replaces the stock allocator with a drastically simpler
allocator. SLOB is generally more space efficient but
@ -1926,6 +1929,7 @@ endchoice
config SLAB_MERGE_DEFAULT
bool "Allow slab caches to be merged"
default y
depends on SLAB || SLUB
help
For reduced kernel memory fragmentation, slab caches can be
merged when they share the same size and other characteristics.
@ -2273,6 +2277,19 @@ config MODULE_COMPRESS_ZSTD
endchoice
config MODULE_DECOMPRESS
bool "Support in-kernel module decompression"
depends on MODULE_COMPRESS_GZIP || MODULE_COMPRESS_XZ
select ZLIB_INFLATE if MODULE_COMPRESS_GZIP
select XZ_DEC if MODULE_COMPRESS_XZ
help
Support for decompressing kernel modules by the kernel itself
instead of relying on userspace to perform this task. Useful when
load pinning security policy is enabled.
If unsure, say N.
config MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS
bool "Allow loading of modules with missing namespace imports"
help

View File

@ -30,8 +30,8 @@ $(obj)/version.o: include/generated/compile.h
quiet_cmd_compile.h = CHK $@
cmd_compile.h = \
$(CONFIG_SHELL) $(srctree)/scripts/mkcompile_h $@ \
"$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT)" \
"$(CONFIG_PREEMPT_RT)" $(CONFIG_CC_VERSION_TEXT) "$(LD)"
"$(UTS_MACHINE)" "$(CONFIG_SMP)" "$(CONFIG_PREEMPT_BUILD)" \
"$(CONFIG_PREEMPT_RT)" "$(CONFIG_CC_VERSION_TEXT)" "$(LD)"
include/generated/compile.h: FORCE
$(call cmd,compile.h)

View File

@ -182,11 +182,6 @@ struct task_struct init_task
#endif
#ifdef CONFIG_KCSAN
.kcsan_ctx = {
.disable_count = 0,
.atomic_next = 0,
.atomic_nest_count = 0,
.in_flat_atomic = false,
.access_mask = 0,
.scoped_accesses = {LIST_POISON1, NULL},
},
#endif

View File

@ -607,7 +607,7 @@ void __weak __init free_initrd_mem(unsigned long start, unsigned long end)
unsigned long aligned_start = ALIGN_DOWN(start, PAGE_SIZE);
unsigned long aligned_end = ALIGN(end, PAGE_SIZE);
memblock_free(__pa(aligned_start), aligned_end - aligned_start);
memblock_free((void *)aligned_start, aligned_end - aligned_start);
#endif
free_reserved_area((void *)start, (void *)end, POISON_FREE_INITMEM,

View File

@ -83,7 +83,6 @@
#include <linux/ptrace.h>
#include <linux/pti.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/sched/clock.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
@ -382,7 +381,7 @@ static char * __init xbc_make_cmdline(const char *key)
ret = xbc_snprint_cmdline(new_cmdline, len + 1, root);
if (ret < 0 || ret > len) {
pr_err("Failed to print extra kernel cmdline.\n");
memblock_free_ptr(new_cmdline, len + 1);
memblock_free(new_cmdline, len + 1);
return NULL;
}
@ -410,7 +409,7 @@ static void __init setup_boot_config(void)
const char *msg;
int pos;
u32 size, csum;
char *data, *copy, *err;
char *data, *err;
int ret;
/* Cut out the bootconfig data even if we have no bootconfig option */
@ -443,16 +442,7 @@ static void __init setup_boot_config(void)
return;
}
copy = memblock_alloc(size + 1, SMP_CACHE_BYTES);
if (!copy) {
pr_err("Failed to allocate memory for bootconfig\n");
return;
}
memcpy(copy, data, size);
copy[size] = '\0';
ret = xbc_init(copy, &msg, &pos);
ret = xbc_init(data, size, &msg, &pos);
if (ret < 0) {
if (pos < 0)
pr_err("Failed to init bootconfig: %s.\n", msg);
@ -460,6 +450,7 @@ static void __init setup_boot_config(void)
pr_err("Failed to parse bootconfig: %s at %d.\n",
msg, pos);
} else {
xbc_get_info(&ret, NULL);
pr_info("Load bootconfig: %d bytes %d nodes\n", size, ret);
/* keys starting with "kernel." are passed via cmdline */
extra_command_line = xbc_make_cmdline("kernel");
@ -471,7 +462,7 @@ static void __init setup_boot_config(void)
static void __init exit_boot_config(void)
{
xbc_destroy_all();
xbc_exit();
}
#else /* !CONFIG_BOOT_CONFIG */
@ -843,12 +834,15 @@ static void __init mm_init(void)
init_mem_debugging_and_hardening();
kfence_alloc_pool();
report_meminit();
stack_depot_init();
stack_depot_early_init();
mem_init();
mem_init_print_info();
/* page_owner must be initialized after buddy is ready */
page_ext_init_flatmem_late();
kmem_cache_init();
/*
* page_owner must be initialized after buddy is ready, and also after
* slab is ready so that stack_depot_init() works properly
*/
page_ext_init_flatmem_late();
kmemleak_init();
pgtable_init();
debug_objects_mem_init();
@ -927,7 +921,7 @@ static void __init print_unknown_bootoptions(void)
/* Start at unknown_options[1] to skip the initial space */
pr_notice("Unknown kernel command line parameters \"%s\", will be passed to user space.\n",
&unknown_options[1]);
memblock_free_ptr(unknown_options, len);
memblock_free(unknown_options, len);
}
asmlinkage __visible void __init __no_sanitize_address start_kernel(void)
@ -1508,6 +1502,8 @@ static int __ref kernel_init(void *unused)
kernel_init_freeable();
/* need to finish all async __init code before freeing the memory */
async_synchronize_full();
system_state = SYSTEM_FREEING_INITMEM;
kprobe_free_init_mem();
ftrace_free_init_mem();
kgdb_free_init_mem();

View File

@ -10,6 +10,7 @@
#include <linux/nsproxy.h>
#include <linux/sysctl.h>
#include <linux/uaccess.h>
#include <linux/capability.h>
#include <linux/ipc_namespace.h>
#include <linux/msg.h>
#include "util.h"
@ -22,7 +23,6 @@ static void *get_ipc(struct ctl_table *table)
return which;
}
#ifdef CONFIG_PROC_SYSCTL
static int proc_ipc_dointvec(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
{
@ -104,13 +104,17 @@ static int proc_ipc_sem_dointvec(struct ctl_table *table, int write,
return ret;
}
#else
#define proc_ipc_doulongvec_minmax NULL
#define proc_ipc_dointvec NULL
#define proc_ipc_dointvec_minmax NULL
#define proc_ipc_dointvec_minmax_orphans NULL
#define proc_ipc_auto_msgmni NULL
#define proc_ipc_sem_dointvec NULL
#ifdef CONFIG_CHECKPOINT_RESTORE
static int proc_ipc_dointvec_minmax_checkpoint_restore(struct ctl_table *table,
int write, void *buffer, size_t *lenp, loff_t *ppos)
{
struct user_namespace *user_ns = current->nsproxy->ipc_ns->user_ns;
if (write && !checkpoint_restore_ns_capable(user_ns))
return -EPERM;
return proc_ipc_dointvec_minmax(table, write, buffer, lenp, ppos);
}
#endif
int ipc_mni = IPCMNI;
@ -198,8 +202,8 @@ static struct ctl_table ipc_kern_table[] = {
.procname = "sem_next_id",
.data = &init_ipc_ns.ids[IPC_SEM_IDS].next_id,
.maxlen = sizeof(init_ipc_ns.ids[IPC_SEM_IDS].next_id),
.mode = 0644,
.proc_handler = proc_ipc_dointvec_minmax,
.mode = 0666,
.proc_handler = proc_ipc_dointvec_minmax_checkpoint_restore,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_INT_MAX,
},
@ -207,8 +211,8 @@ static struct ctl_table ipc_kern_table[] = {
.procname = "msg_next_id",
.data = &init_ipc_ns.ids[IPC_MSG_IDS].next_id,
.maxlen = sizeof(init_ipc_ns.ids[IPC_MSG_IDS].next_id),
.mode = 0644,
.proc_handler = proc_ipc_dointvec_minmax,
.mode = 0666,
.proc_handler = proc_ipc_dointvec_minmax_checkpoint_restore,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_INT_MAX,
},
@ -216,8 +220,8 @@ static struct ctl_table ipc_kern_table[] = {
.procname = "shm_next_id",
.data = &init_ipc_ns.ids[IPC_SHM_IDS].next_id,
.maxlen = sizeof(init_ipc_ns.ids[IPC_SHM_IDS].next_id),
.mode = 0644,
.proc_handler = proc_ipc_dointvec_minmax,
.mode = 0666,
.proc_handler = proc_ipc_dointvec_minmax_checkpoint_restore,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_INT_MAX,
},
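The three *_next_id entries above are relaxed to mode 0666 because the new handler enforces the restriction itself: writes now require CAP_CHECKPOINT_RESTORE in the IPC namespace's owning user namespace. A small userspace sketch of what a checkpoint/restore tool does with one of these files (the path is the real sysctl, the id value is illustrative):

#include <stdio.h>

int main(void)
{
	/* the 0666 mode lets the open succeed, but without
	 * CAP_CHECKPOINT_RESTORE the handler rejects the write with EPERM */
	FILE *f = fopen("/proc/sys/kernel/msg_next_id", "w");

	if (!f) {
		perror("msg_next_id");
		return 1;
	}
	fprintf(f, "%d\n", 42);	/* ask for id 42 on the next queue creation */
	return fclose(f) ? 1 : 0;
}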

View File

@ -330,9 +330,6 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
shm_unlock(shp);
if (!is_file_hugepages(shm_file))
shmem_lock(shm_file, 0, shp->mlock_ucounts);
else if (shp->mlock_ucounts)
user_shm_unlock(i_size_read(file_inode(shm_file)),
shp->mlock_ucounts);
fput(shm_file);
ipc_update_pid(&shp->shm_cprid, NULL);
ipc_update_pid(&shp->shm_lprid, NULL);
@ -742,8 +739,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
if (shmflg & SHM_NORESERVE)
acctflag = VM_NORESERVE;
file = hugetlb_file_setup(name, hugesize, acctflag,
&shp->mlock_ucounts, HUGETLB_SHMFS_INODE,
(shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
HUGETLB_SHMFS_INODE, (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
} else {
/*
* Do not allow no accounting for OVERCOMMIT_NEVER, even
@ -794,8 +790,6 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
no_id:
ipc_update_pid(&shp->shm_cprid, NULL);
ipc_update_pid(&shp->shm_lprid, NULL);
if (is_file_hugepages(file) && shp->mlock_ucounts)
user_shm_unlock(size, shp->mlock_ucounts);
fput(file);
ipc_rcu_putref(&shp->shm_perm, shm_rcu_free);
return error;

View File

@ -1,11 +1,23 @@
# SPDX-License-Identifier: GPL-2.0-only
config PREEMPT_NONE_BUILD
bool
config PREEMPT_VOLUNTARY_BUILD
bool
config PREEMPT_BUILD
bool
select PREEMPTION
select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
choice
prompt "Preemption Model"
default PREEMPT_NONE
config PREEMPT_NONE
bool "No Forced Preemption (Server)"
select PREEMPT_NONE_BUILD if !PREEMPT_DYNAMIC
help
This is the traditional Linux preemption model, geared towards
throughput. It will still provide good latencies most of the
@ -20,6 +32,7 @@ config PREEMPT_NONE
config PREEMPT_VOLUNTARY
bool "Voluntary Kernel Preemption (Desktop)"
depends on !ARCH_NO_PREEMPT
select PREEMPT_VOLUNTARY_BUILD if !PREEMPT_DYNAMIC
help
This option reduces the latency of the kernel by adding more
"explicit preemption points" to the kernel code. These new
@ -38,9 +51,7 @@ config PREEMPT_VOLUNTARY
config PREEMPT
bool "Preemptible Kernel (Low-Latency Desktop)"
depends on !ARCH_NO_PREEMPT
select PREEMPTION
select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
select PREEMPT_DYNAMIC if HAVE_PREEMPT_DYNAMIC
select PREEMPT_BUILD
help
This option reduces the latency of the kernel by making
all kernel code (that is not executing in a critical section)
@ -83,7 +94,10 @@ config PREEMPTION
select PREEMPT_COUNT
config PREEMPT_DYNAMIC
bool
bool "Preemption behaviour defined on boot"
depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT
select PREEMPT_BUILD
default y
help
This option allows to define the preemption model on the kernel
command line parameter and thus override the default preemption

View File

@ -59,7 +59,7 @@ obj-$(CONFIG_FREEZER) += freezer.o
obj-$(CONFIG_PROFILING) += profile.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += time/
obj-$(CONFIG_FUTEX) += futex.o
obj-$(CONFIG_FUTEX) += futex/
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
obj-$(CONFIG_SMP) += smp.o
ifneq ($(CONFIG_SMP),y)
@ -67,6 +67,7 @@ obj-y += up.o
endif
obj-$(CONFIG_UID16) += uid16.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_MODULE_DECOMPRESS) += module_decompress.o
obj-$(CONFIG_MODULE_SIG) += module_signing.o
obj-$(CONFIG_MODULE_SIG_FORMAT) += module_signature.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
@ -85,7 +86,6 @@ obj-$(CONFIG_PID_NS) += pid_namespace.o
obj-$(CONFIG_IKCONFIG) += configs.o
obj-$(CONFIG_IKHEADERS) += kheaders.o
obj-$(CONFIG_SMP) += stop_machine.o
obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
obj-$(CONFIG_AUDITSYSCALL) += auditsc.o audit_watch.o audit_fsnotify.o audit_tree.o
obj-$(CONFIG_GCOV_KERNEL) += gcov/

View File

@ -60,7 +60,6 @@
#include <linux/sched/cputime.h>
#include <asm/div64.h>
#include <linux/blkdev.h> /* sector_div */
#include <linux/pid_namespace.h>
#include <linux/fs_pin.h>

View File

@ -1468,7 +1468,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
if (err)
return err;
}
sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
sig_data = kmalloc(struct_size(sig_data, ctx, len), GFP_KERNEL);
if (!sig_data) {
if (audit_sig_sid)
security_release_secctx(ctx, len);
@ -1481,7 +1481,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
security_release_secctx(ctx, len);
}
audit_send_reply(skb, seq, AUDIT_SIGNAL_INFO, 0, 0,
sig_data, sizeof(*sig_data) + len);
sig_data, struct_size(sig_data, ctx, len));
kfree(sig_data);
break;
case AUDIT_TTY_GET: {
@ -2171,7 +2171,7 @@ int audit_log_task_context(struct audit_buffer *ab)
int error;
u32 sid;
security_task_getsecid_subj(current, &sid);
security_current_getsecid_subj(&sid);
if (!sid)
return 0;
@ -2392,7 +2392,7 @@ int audit_signal_info(int sig, struct task_struct *t)
audit_sig_uid = auid;
else
audit_sig_uid = uid;
security_task_getsecid_subj(current, &audit_sig_sid);
security_current_getsecid_subj(&audit_sig_sid);
}
return audit_signal_info_syscall(t);

View File

@ -14,6 +14,7 @@
#include <linux/skbuff.h>
#include <uapi/linux/mqueue.h>
#include <linux/tty.h>
#include <uapi/linux/openat2.h> // struct open_how
/* AUDIT_NAMES is the number of slots we reserve in the audit_context
* for saving names from getname(). If we get more names we will allocate
@ -100,10 +101,15 @@ struct audit_proctitle {
/* The per-task audit context. */
struct audit_context {
int dummy; /* must be the first element */
int in_syscall; /* 1 if task is in a syscall */
enum {
AUDIT_CTX_UNUSED, /* audit_context is currently unused */
AUDIT_CTX_SYSCALL, /* in use by syscall */
AUDIT_CTX_URING, /* in use by io_uring */
} context;
enum audit_state state, current_state;
unsigned int serial; /* serial number for record */
int major; /* syscall number */
int uring_op; /* uring operation */
struct timespec64 ctime; /* time of syscall entry */
unsigned long argv[4]; /* syscall arguments */
long return_code;/* syscall return code */
@ -188,6 +194,7 @@ struct audit_context {
int fd;
int flags;
} mmap;
struct open_how openat2;
struct {
int argc;
} execve;

View File

@ -160,8 +160,7 @@ static int audit_mark_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
audit_mark = container_of(inode_mark, struct audit_fsnotify_mark, mark);
if (WARN_ON_ONCE(inode_mark->group != audit_fsnotify_group) ||
WARN_ON_ONCE(!inode))
if (WARN_ON_ONCE(inode_mark->group != audit_fsnotify_group))
return 0;
if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) {

View File

@ -30,7 +30,7 @@ struct audit_chunk {
int count;
atomic_long_t refs;
struct rcu_head head;
struct node {
struct audit_node {
struct list_head list;
struct audit_tree *owner;
unsigned index; /* index; upper bit indicates 'will prune' */
@ -94,7 +94,7 @@ static struct audit_tree *alloc_tree(const char *s)
{
struct audit_tree *tree;
tree = kmalloc(sizeof(struct audit_tree) + strlen(s) + 1, GFP_KERNEL);
tree = kmalloc(struct_size(tree, pathname, strlen(s) + 1), GFP_KERNEL);
if (tree) {
refcount_set(&tree->count, 1);
tree->goner = 0;
@ -269,7 +269,7 @@ bool audit_tree_match(struct audit_chunk *chunk, struct audit_tree *tree)
/* tagging and untagging inodes with trees */
static struct audit_chunk *find_chunk(struct node *p)
static struct audit_chunk *find_chunk(struct audit_node *p)
{
int index = p->index & ~(1U<<31);
p -= index;
@ -322,7 +322,7 @@ static void replace_chunk(struct audit_chunk *new, struct audit_chunk *old)
list_replace_rcu(&old->hash, &new->hash);
}
static void remove_chunk_node(struct audit_chunk *chunk, struct node *p)
static void remove_chunk_node(struct audit_chunk *chunk, struct audit_node *p)
{
struct audit_tree *owner = p->owner;
@ -459,7 +459,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
{
struct fsnotify_mark *mark;
struct audit_chunk *chunk, *old;
struct node *p;
struct audit_node *p;
int n;
mutex_lock(&audit_tree_group->mark_mutex);
@ -570,11 +570,11 @@ static void prune_tree_chunks(struct audit_tree *victim, bool tagged)
{
spin_lock(&hash_lock);
while (!list_empty(&victim->chunks)) {
struct node *p;
struct audit_node *p;
struct audit_chunk *chunk;
struct fsnotify_mark *mark;
p = list_first_entry(&victim->chunks, struct node, list);
p = list_first_entry(&victim->chunks, struct audit_node, list);
/* have we run out of marked? */
if (tagged && !(p->index & (1U<<31)))
break;
@ -616,7 +616,7 @@ static void trim_marked(struct audit_tree *tree)
}
/* reorder */
for (p = tree->chunks.next; p != &tree->chunks; p = q) {
struct node *node = list_entry(p, struct node, list);
struct audit_node *node = list_entry(p, struct audit_node, list);
q = p->next;
if (node->index & (1U<<31)) {
list_del_init(p);
@ -684,7 +684,7 @@ void audit_trim_trees(void)
struct audit_tree *tree;
struct path path;
struct vfsmount *root_mnt;
struct node *node;
struct audit_node *node;
int err;
tree = container_of(cursor.next, struct audit_tree, list);
@ -726,7 +726,8 @@ int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
{
if (pathname[0] != '/' ||
rule->listnr != AUDIT_FILTER_EXIT ||
(rule->listnr != AUDIT_FILTER_EXIT &&
rule->listnr != AUDIT_FILTER_URING_EXIT) ||
op != Audit_equal ||
rule->inode_f || rule->watch || rule->tree)
return -EINVAL;
@ -839,7 +840,7 @@ int audit_add_tree_rule(struct audit_krule *rule)
drop_collected_mounts(mnt);
if (!err) {
struct node *node;
struct audit_node *node;
spin_lock(&hash_lock);
list_for_each_entry(node, &tree->chunks, list)
node->index &= ~(1U<<31);
@ -938,7 +939,7 @@ int audit_tag_tree(char *old, char *new)
mutex_unlock(&audit_filter_mutex);
if (!failed) {
struct node *node;
struct audit_node *node;
spin_lock(&hash_lock);
list_for_each_entry(node, &tree->chunks, list)
node->index &= ~(1U<<31);

View File

@ -183,7 +183,8 @@ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
return -EOPNOTSUPP;
if (path[0] != '/' || path[len-1] == '/' ||
krule->listnr != AUDIT_FILTER_EXIT ||
(krule->listnr != AUDIT_FILTER_EXIT &&
krule->listnr != AUDIT_FILTER_URING_EXIT) ||
op != Audit_equal ||
krule->inode_f || krule->watch || krule->tree)
return -EINVAL;
@ -472,8 +473,7 @@ static int audit_watch_handle_event(struct fsnotify_mark *inode_mark, u32 mask,
parent = container_of(inode_mark, struct audit_parent, mark);
if (WARN_ON_ONCE(inode_mark->group != audit_watch_group) ||
WARN_ON_ONCE(!inode))
if (WARN_ON_ONCE(inode_mark->group != audit_watch_group))
return 0;
if (mask & (FS_CREATE|FS_MOVED_TO) && inode)

View File

@ -44,7 +44,8 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
LIST_HEAD_INIT(audit_filter_list[4]),
LIST_HEAD_INIT(audit_filter_list[5]),
LIST_HEAD_INIT(audit_filter_list[6]),
#if AUDIT_NR_FILTERS != 7
LIST_HEAD_INIT(audit_filter_list[7]),
#if AUDIT_NR_FILTERS != 8
#error Fix audit_filter_list initialiser
#endif
};
@ -56,6 +57,7 @@ static struct list_head audit_rules_list[AUDIT_NR_FILTERS] = {
LIST_HEAD_INIT(audit_rules_list[4]),
LIST_HEAD_INIT(audit_rules_list[5]),
LIST_HEAD_INIT(audit_rules_list[6]),
LIST_HEAD_INIT(audit_rules_list[7]),
};
DEFINE_MUTEX(audit_filter_mutex);
@ -151,7 +153,8 @@ char *audit_unpack_string(void **bufp, size_t *remain, size_t len)
static inline int audit_to_inode(struct audit_krule *krule,
struct audit_field *f)
{
if (krule->listnr != AUDIT_FILTER_EXIT ||
if ((krule->listnr != AUDIT_FILTER_EXIT &&
krule->listnr != AUDIT_FILTER_URING_EXIT) ||
krule->inode_f || krule->watch || krule->tree ||
(f->op != Audit_equal && f->op != Audit_not_equal))
return -EINVAL;
@ -248,6 +251,7 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule_data *
pr_err("AUDIT_FILTER_ENTRY is deprecated\n");
goto exit_err;
case AUDIT_FILTER_EXIT:
case AUDIT_FILTER_URING_EXIT:
case AUDIT_FILTER_TASK:
#endif
case AUDIT_FILTER_USER:
@ -332,6 +336,10 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
if (entry->rule.listnr != AUDIT_FILTER_FS)
return -EINVAL;
break;
case AUDIT_PERM:
if (entry->rule.listnr == AUDIT_FILTER_URING_EXIT)
return -EINVAL;
break;
}
switch (entry->rule.listnr) {
@ -629,7 +637,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
void *bufp;
int i;
data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL);
data = kmalloc(struct_size(data, buf, krule->buflen), GFP_KERNEL);
if (unlikely(!data))
return NULL;
memset(data, 0, sizeof(*data));
@ -980,7 +988,8 @@ static inline int audit_add_rule(struct audit_entry *entry)
}
entry->rule.prio = ~0ULL;
if (entry->rule.listnr == AUDIT_FILTER_EXIT) {
if (entry->rule.listnr == AUDIT_FILTER_EXIT ||
entry->rule.listnr == AUDIT_FILTER_URING_EXIT) {
if (entry->rule.flags & AUDIT_FILTER_PREPEND)
entry->rule.prio = ++prio_high;
else
@ -1083,7 +1092,7 @@ static void audit_list_rules(int seq, struct sk_buff_head *q)
break;
skb = audit_make_reply(seq, AUDIT_LIST_RULES, 0, 1,
data,
sizeof(*data) + data->buflen);
struct_size(data, buf, data->buflen));
if (skb)
skb_queue_tail(q, skb);
kfree(data);
@ -1359,8 +1368,7 @@ int audit_filter(int msgtype, unsigned int listtype)
case AUDIT_SUBJ_SEN:
case AUDIT_SUBJ_CLR:
if (f->lsm_rule) {
security_task_getsecid_subj(current,
&sid);
security_current_getsecid_subj(&sid);
result = security_audit_rule_match(sid,
f->type, f->op, f->lsm_rule);
}

View File

@ -1,3 +1,4 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/* auditsc.c -- System-call auditing support
* Handles all system-call specific auditing features.
*
@ -6,20 +7,6 @@
* Copyright (C) 2005, 2006 IBM Corporation
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
* Written by Rickard E. (Rik) Faith <faith@redhat.com>
*
* Many of the ideas implemented here are from Stephen C. Tweedie,
@ -76,6 +63,7 @@
#include <linux/fsnotify_backend.h>
#include <uapi/linux/limits.h>
#include <uapi/linux/netfilter/nf_tables.h>
#include <uapi/linux/openat2.h> // struct open_how
#include "audit.h"
@ -166,7 +154,7 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
n = ctx->major;
switch (audit_classify_syscall(ctx->arch, n)) {
case 0: /* native */
case AUDITSC_NATIVE:
if ((mask & AUDIT_PERM_WRITE) &&
audit_match_class(AUDIT_CLASS_WRITE, n))
return 1;
@ -177,7 +165,7 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
audit_match_class(AUDIT_CLASS_CHATTR, n))
return 1;
return 0;
case 1: /* 32bit on biarch */
case AUDITSC_COMPAT: /* 32bit on biarch */
if ((mask & AUDIT_PERM_WRITE) &&
audit_match_class(AUDIT_CLASS_WRITE_32, n))
return 1;
@ -188,14 +176,16 @@ static int audit_match_perm(struct audit_context *ctx, int mask)
audit_match_class(AUDIT_CLASS_CHATTR_32, n))
return 1;
return 0;
case 2: /* open */
case AUDITSC_OPEN:
return mask & ACC_MODE(ctx->argv[1]);
case 3: /* openat */
case AUDITSC_OPENAT:
return mask & ACC_MODE(ctx->argv[2]);
case 4: /* socketcall */
case AUDITSC_SOCKETCALL:
return ((mask & AUDIT_PERM_WRITE) && ctx->argv[0] == SYS_BIND);
case 5: /* execve */
case AUDITSC_EXECVE:
return mask & AUDIT_PERM_EXEC;
case AUDITSC_OPENAT2:
return mask & ACC_MODE((u32)ctx->openat2.flags);
default:
return 0;
}
@ -480,6 +470,9 @@ static int audit_filter_rules(struct task_struct *tsk,
u32 sid;
unsigned int sessionid;
if (ctx && rule->prio <= ctx->prio)
return 0;
cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation);
for (i = 0; i < rule->field_count; i++) {
@ -673,7 +666,16 @@ static int audit_filter_rules(struct task_struct *tsk,
logged upon error */
if (f->lsm_rule) {
if (need_sid) {
security_task_getsecid_subj(tsk, &sid);
/* @tsk should always be equal to
* @current with the exception of
* fork()/copy_process() in which case
* the new @tsk creds are still a dup
* of @current's creds so we can still
* use security_current_getsecid_subj()
* here even though it always refs
* @current's creds
*/
security_current_getsecid_subj(&sid);
need_sid = 0;
}
result = security_audit_rule_match(sid, f->type,
@ -747,8 +749,6 @@ static int audit_filter_rules(struct task_struct *tsk,
}
if (ctx) {
if (rule->prio <= ctx->prio)
return 0;
if (rule->filterkey) {
kfree(ctx->filterkey);
ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC);
@ -805,6 +805,34 @@ static int audit_in_mask(const struct audit_krule *rule, unsigned long val)
return rule->mask[word] & bit;
}
/**
* audit_filter_uring - apply filters to an io_uring operation
* @tsk: associated task
* @ctx: audit context
*/
static void audit_filter_uring(struct task_struct *tsk,
struct audit_context *ctx)
{
struct audit_entry *e;
enum audit_state state;
if (auditd_test_task(tsk))
return;
rcu_read_lock();
list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_URING_EXIT],
list) {
if (audit_in_mask(&e->rule, ctx->uring_op) &&
audit_filter_rules(tsk, &e->rule, ctx, NULL, &state,
false)) {
rcu_read_unlock();
ctx->current_state = state;
return;
}
}
rcu_read_unlock();
}
/* At syscall exit time, this filter is called if the audit_state is
* not low enough that auditing cannot take place, but is also not
* high enough that we already know we have to write an audit record
@ -915,10 +943,81 @@ static inline void audit_free_aux(struct audit_context *context)
context->aux = aux->next;
kfree(aux);
}
context->aux = NULL;
while ((aux = context->aux_pids)) {
context->aux_pids = aux->next;
kfree(aux);
}
context->aux_pids = NULL;
}
/**
* audit_reset_context - reset a audit_context structure
* @ctx: the audit_context to reset
*
* All fields in the audit_context will be reset to an initial state, all
* references held by fields will be dropped, and private memory will be
* released. When this function returns the audit_context will be suitable
* for reuse, so long as the passed context is not NULL or a dummy context.
*/
static void audit_reset_context(struct audit_context *ctx)
{
if (!ctx)
return;
/* if ctx is non-null, reset the "ctx->state" regardless */
ctx->context = AUDIT_CTX_UNUSED;
if (ctx->dummy)
return;
/*
* NOTE: It shouldn't matter in what order we release the fields, so
* release them in the order in which they appear in the struct;
* this gives us some hope of quickly making sure we are
* resetting the audit_context properly.
*
* Other things worth mentioning:
* - we don't reset "dummy"
* - we don't reset "state", we do reset "current_state"
* - we preserve "filterkey" if "state" is AUDIT_STATE_RECORD
* - much of this is likely overkill, but play it safe for now
* - we really need to work on improving the audit_context struct
*/
ctx->current_state = ctx->state;
ctx->serial = 0;
ctx->major = 0;
ctx->uring_op = 0;
ctx->ctime = (struct timespec64){ .tv_sec = 0, .tv_nsec = 0 };
memset(ctx->argv, 0, sizeof(ctx->argv));
ctx->return_code = 0;
ctx->prio = (ctx->state == AUDIT_STATE_RECORD ? ~0ULL : 0);
ctx->return_valid = AUDITSC_INVALID;
audit_free_names(ctx);
if (ctx->state != AUDIT_STATE_RECORD) {
kfree(ctx->filterkey);
ctx->filterkey = NULL;
}
audit_free_aux(ctx);
kfree(ctx->sockaddr);
ctx->sockaddr = NULL;
ctx->sockaddr_len = 0;
ctx->pid = ctx->ppid = 0;
ctx->uid = ctx->euid = ctx->suid = ctx->fsuid = KUIDT_INIT(0);
ctx->gid = ctx->egid = ctx->sgid = ctx->fsgid = KGIDT_INIT(0);
ctx->personality = 0;
ctx->arch = 0;
ctx->target_pid = 0;
ctx->target_auid = ctx->target_uid = KUIDT_INIT(0);
ctx->target_sessionid = 0;
ctx->target_sid = 0;
ctx->target_comm[0] = '\0';
unroll_tree_refs(ctx, NULL, 0);
WARN_ON(!list_empty(&ctx->killed_trees));
ctx->type = 0;
audit_free_module(ctx);
ctx->fds[0] = -1;
audit_proctitle_free(ctx);
}
static inline struct audit_context *audit_alloc_context(enum audit_state state)
@ -928,6 +1027,7 @@ static inline struct audit_context *audit_alloc_context(enum audit_state state)
context = kzalloc(sizeof(*context), GFP_KERNEL);
if (!context)
return NULL;
context->context = AUDIT_CTX_UNUSED;
context->state = state;
context->prio = state == AUDIT_STATE_RECORD ? ~0ULL : 0;
INIT_LIST_HEAD(&context->killed_trees);
@ -953,7 +1053,7 @@ int audit_alloc(struct task_struct *tsk)
char *key = NULL;
if (likely(!audit_ever_enabled))
return 0; /* Return if not auditing. */
return 0;
state = audit_filter_task(tsk, &key);
if (state == AUDIT_STATE_DISABLED) {
@ -973,16 +1073,37 @@ int audit_alloc(struct task_struct *tsk)
return 0;
}
/**
* audit_alloc_kernel - allocate an audit_context for a kernel task
* @tsk: the kernel task
*
* Similar to the audit_alloc() function, but intended for kernel private
* threads. Returns zero on success, negative values on failure.
*/
int audit_alloc_kernel(struct task_struct *tsk)
{
/*
* At the moment we are just going to call into audit_alloc() to
* simplify the code, but there are two things to keep in mind with this
* approach:
*
* 1. Filtering internal kernel tasks is a bit laughable in almost all
* cases, but there is at least one case where there is a benefit:
* the '-a task,never' case allows the admin to effectively disable
* task auditing at runtime.
*
* 2. The {set,clear}_task_syscall_work() ops likely have zero effect
* on these internal kernel tasks, but they probably don't hurt either.
*/
return audit_alloc(tsk);
}
static inline void audit_free_context(struct audit_context *context)
{
audit_free_module(context);
audit_free_names(context);
unroll_tree_refs(context, NULL, 0);
/* resetting is extra work, but it is likely just noise */
audit_reset_context(context);
free_tree_refs(context);
audit_free_aux(context);
kfree(context->filterkey);
kfree(context->sockaddr);
audit_proctitle_free(context);
kfree(context);
}
@ -1316,6 +1437,12 @@ static void show_special(struct audit_context *context, int *call_panic)
audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd,
context->mmap.flags);
break;
case AUDIT_OPENAT2:
audit_log_format(ab, "oflag=0%llo mode=0%llo resolve=0x%llx",
context->openat2.flags,
context->openat2.mode,
context->openat2.resolve);
break;
case AUDIT_EXECVE:
audit_log_execve_info(context, &ab);
break;
@ -1479,6 +1606,44 @@ static void audit_log_proctitle(void)
audit_log_end(ab);
}
/**
* audit_log_uring - generate a AUDIT_URINGOP record
* @ctx: the audit context
*/
static void audit_log_uring(struct audit_context *ctx)
{
struct audit_buffer *ab;
const struct cred *cred;
ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_URINGOP);
if (!ab)
return;
cred = current_cred();
audit_log_format(ab, "uring_op=%d", ctx->uring_op);
if (ctx->return_valid != AUDITSC_INVALID)
audit_log_format(ab, " success=%s exit=%ld",
(ctx->return_valid == AUDITSC_SUCCESS ?
"yes" : "no"),
ctx->return_code);
audit_log_format(ab,
" items=%d"
" ppid=%d pid=%d uid=%u gid=%u euid=%u suid=%u"
" fsuid=%u egid=%u sgid=%u fsgid=%u",
ctx->name_count,
task_ppid_nr(current), task_tgid_nr(current),
from_kuid(&init_user_ns, cred->uid),
from_kgid(&init_user_ns, cred->gid),
from_kuid(&init_user_ns, cred->euid),
from_kuid(&init_user_ns, cred->suid),
from_kuid(&init_user_ns, cred->fsuid),
from_kgid(&init_user_ns, cred->egid),
from_kgid(&init_user_ns, cred->sgid),
from_kgid(&init_user_ns, cred->fsgid));
audit_log_task_context(ab);
audit_log_key(ab, ctx->filterkey);
audit_log_end(ab);
}
static void audit_log_exit(void)
{
int i, call_panic = 0;
@ -1489,18 +1654,20 @@ static void audit_log_exit(void)
context->personality = current->personality;
switch (context->context) {
case AUDIT_CTX_SYSCALL:
ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
if (!ab)
return; /* audit_panic has been called */
return;
audit_log_format(ab, "arch=%x syscall=%d",
context->arch, context->major);
if (context->personality != PER_LINUX)
audit_log_format(ab, " per=%lx", context->personality);
if (context->return_valid != AUDITSC_INVALID)
audit_log_format(ab, " success=%s exit=%ld",
(context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
(context->return_valid == AUDITSC_SUCCESS ?
"yes" : "no"),
context->return_code);
audit_log_format(ab,
" a0=%lx a1=%lx a2=%lx a3=%lx items=%d",
context->argv[0],
@ -1508,10 +1675,17 @@ static void audit_log_exit(void)
context->argv[2],
context->argv[3],
context->name_count);
audit_log_task_info(ab);
audit_log_key(ab, context->filterkey);
audit_log_end(ab);
break;
case AUDIT_CTX_URING:
audit_log_uring(context);
break;
default:
BUG();
break;
}
for (aux = context->aux; aux; aux = aux->next) {
@ -1602,6 +1776,7 @@ static void audit_log_exit(void)
audit_log_name(context, n, NULL, i++, &call_panic);
}
if (context->context == AUDIT_CTX_SYSCALL)
audit_log_proctitle();
/* Send end of event record to help user space know we are finished */
@ -1609,14 +1784,14 @@ static void audit_log_exit(void)
if (ab)
audit_log_end(ab);
if (call_panic)
audit_panic("error converting sid to string");
audit_panic("error in audit_log_exit()");
}
/**
* __audit_free - free a per-task audit context
* @tsk: task whose audit context block to free
*
* Called from copy_process and do_exit
* Called from copy_process, do_exit, and the io_uring code
*/
void __audit_free(struct task_struct *tsk)
{
@ -1625,6 +1800,7 @@ void __audit_free(struct task_struct *tsk)
if (!context)
return;
/* this may generate CONFIG_CHANGE records */
if (!list_empty(&context->killed_trees))
audit_kill_trees(context);
@ -1633,20 +1809,152 @@ void __audit_free(struct task_struct *tsk)
* random task_struct that doesn't have any meaningful data we
* need to log via audit_log_exit().
*/
if (tsk == current && !context->dummy && context->in_syscall) {
if (tsk == current && !context->dummy) {
context->return_valid = AUDITSC_INVALID;
context->return_code = 0;
if (context->context == AUDIT_CTX_SYSCALL) {
audit_filter_syscall(tsk, context);
audit_filter_inodes(tsk, context);
if (context->current_state == AUDIT_STATE_RECORD)
audit_log_exit();
} else if (context->context == AUDIT_CTX_URING) {
/* TODO: verify this case is real and valid */
audit_filter_uring(tsk, context);
audit_filter_inodes(tsk, context);
if (context->current_state == AUDIT_STATE_RECORD)
audit_log_uring(context);
}
}
audit_set_context(tsk, NULL);
audit_free_context(context);
}
/**
* audit_return_fixup - fixup the return codes in the audit_context
* @ctx: the audit_context
* @success: true/false value to indicate if the operation succeeded or not
* @code: operation return code
*
* We need to fixup the return code in the audit logs if the actual return
* codes are later going to be fixed by the arch specific signal handlers.
*/
static void audit_return_fixup(struct audit_context *ctx,
int success, long code)
{
/*
* This is actually a test for:
* (rc == ERESTARTSYS ) || (rc == ERESTARTNOINTR) ||
* (rc == ERESTARTNOHAND) || (rc == ERESTART_RESTARTBLOCK)
*
* but is faster than a bunch of ||
*/
if (unlikely(code <= -ERESTARTSYS) &&
(code >= -ERESTART_RESTARTBLOCK) &&
(code != -ENOIOCTLCMD))
ctx->return_code = -EINTR;
else
ctx->return_code = code;
ctx->return_valid = (success ? AUDITSC_SUCCESS : AUDITSC_FAILURE);
}
/**
* __audit_uring_entry - prepare the kernel task's audit context for io_uring
* @op: the io_uring opcode
*
* This is similar to audit_syscall_entry() but is intended for use by io_uring
* operations. This function should only ever be called from
* audit_uring_entry() as we rely on the audit context checking present in that
* function.
*/
void __audit_uring_entry(u8 op)
{
struct audit_context *ctx = audit_context();
if (ctx->state == AUDIT_STATE_DISABLED)
return;
/*
* NOTE: It's possible that we can be called from the process' context
* before it returns to userspace, and before audit_syscall_exit()
* is called. In this case there is not much to do, just record
* the io_uring details and return.
*/
ctx->uring_op = op;
if (ctx->context == AUDIT_CTX_SYSCALL)
return;
ctx->dummy = !audit_n_rules;
if (!ctx->dummy && ctx->state == AUDIT_STATE_BUILD)
ctx->prio = 0;
ctx->context = AUDIT_CTX_URING;
ctx->current_state = ctx->state;
ktime_get_coarse_real_ts64(&ctx->ctime);
}
/**
* __audit_uring_exit - wrap up the kernel task's audit context after io_uring
* @success: true/false value to indicate if the operation succeeded or not
* @code: operation return code
*
* This is similar to audit_syscall_exit() but is intended for use by io_uring
* operations. This function should only ever be called from
* audit_uring_exit() as we rely on the audit context checking present in that
* function.
*/
void __audit_uring_exit(int success, long code)
{
struct audit_context *ctx = audit_context();
if (ctx->context == AUDIT_CTX_SYSCALL) {
/*
* NOTE: See the note in __audit_uring_entry() about the case
* where we may be called from process context before we
* return to userspace via audit_syscall_exit(). In this
* case we simply emit a URINGOP record and bail, the
* normal syscall exit handling will take care of
* everything else.
* It is also worth mentioning that when we are called,
* the current process creds may differ from the creds
* used during the normal syscall processing; keep that
* in mind if/when we move the record generation code.
*/
/*
* We need to filter on the syscall info here to decide if we
* should emit a URINGOP record. I know it seems odd but this
* solves the problem where users have a filter to block *all*
* syscall records in the "exit" filter; we want to preserve
* the behavior here.
*/
audit_filter_syscall(current, ctx);
if (ctx->current_state != AUDIT_STATE_RECORD)
audit_filter_uring(current, ctx);
audit_filter_inodes(current, ctx);
if (ctx->current_state != AUDIT_STATE_RECORD)
return;
audit_log_uring(ctx);
return;
}
/* this may generate CONFIG_CHANGE records */
if (!list_empty(&ctx->killed_trees))
audit_kill_trees(ctx);
/* run through both filters to ensure we set the filterkey properly */
audit_filter_uring(current, ctx);
audit_filter_inodes(current, ctx);
if (ctx->current_state != AUDIT_STATE_RECORD)
goto out;
audit_return_fixup(ctx, success, code);
audit_log_exit();
out:
audit_reset_context(ctx);
}
/**
* __audit_syscall_entry - fill in an audit record at syscall entry
* @major: major syscall type (function)
@ -1672,7 +1980,12 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
if (!audit_enabled || !context)
return;
BUG_ON(context->in_syscall || context->name_count);
WARN_ON(context->context != AUDIT_CTX_UNUSED);
WARN_ON(context->name_count);
if (context->context != AUDIT_CTX_UNUSED || context->name_count) {
audit_panic("unrecoverable error in audit_syscall_entry()");
return;
}
state = context->state;
if (state == AUDIT_STATE_DISABLED)
@ -1691,10 +2004,8 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
context->argv[1] = a2;
context->argv[2] = a3;
context->argv[3] = a4;
context->serial = 0;
context->in_syscall = 1;
context->context = AUDIT_CTX_SYSCALL;
context->current_state = state;
context->ppid = 0;
ktime_get_coarse_real_ts64(&context->ctime);
}
@ -1711,63 +2022,27 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2,
*/
void __audit_syscall_exit(int success, long return_code)
{
struct audit_context *context;
struct audit_context *context = audit_context();
context = audit_context();
if (!context)
return;
if (!context || context->dummy ||
context->context != AUDIT_CTX_SYSCALL)
goto out;
/* this may generate CONFIG_CHANGE records */
if (!list_empty(&context->killed_trees))
audit_kill_trees(context);
if (!context->dummy && context->in_syscall) {
if (success)
context->return_valid = AUDITSC_SUCCESS;
else
context->return_valid = AUDITSC_FAILURE;
/*
* we need to fix up the return code in the audit logs if the
* actual return codes are later going to be fixed up by the
* arch specific signal handlers
*
* This is actually a test for:
* (rc == ERESTARTSYS ) || (rc == ERESTARTNOINTR) ||
* (rc == ERESTARTNOHAND) || (rc == ERESTART_RESTARTBLOCK)
*
* but is faster than a bunch of ||
*/
if (unlikely(return_code <= -ERESTARTSYS) &&
(return_code >= -ERESTART_RESTARTBLOCK) &&
(return_code != -ENOIOCTLCMD))
context->return_code = -EINTR;
else
context->return_code = return_code;
/* run through both filters to ensure we set the filterkey properly */
audit_filter_syscall(current, context);
audit_filter_inodes(current, context);
if (context->current_state == AUDIT_STATE_RECORD)
if (context->current_state < AUDIT_STATE_RECORD)
goto out;
audit_return_fixup(context, success, return_code);
audit_log_exit();
}
context->in_syscall = 0;
context->prio = context->state == AUDIT_STATE_RECORD ? ~0ULL : 0;
audit_free_module(context);
audit_free_names(context);
unroll_tree_refs(context, NULL, 0);
audit_free_aux(context);
context->aux = NULL;
context->aux_pids = NULL;
context->target_pid = 0;
context->target_sid = 0;
context->sockaddr_len = 0;
context->type = 0;
context->fds[0] = -1;
if (context->state != AUDIT_STATE_RECORD) {
kfree(context->filterkey);
context->filterkey = NULL;
}
out:
audit_reset_context(context);
}
static inline void handle_one(const struct inode *inode)
@ -1919,7 +2194,7 @@ void __audit_getname(struct filename *name)
struct audit_context *context = audit_context();
struct audit_names *n;
if (!context->in_syscall)
if (context->context == AUDIT_CTX_UNUSED)
return;
n = audit_alloc_name(context, AUDIT_TYPE_UNKNOWN);
@ -1991,7 +2266,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry,
struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS];
int i;
if (!context->in_syscall)
if (context->context == AUDIT_CTX_UNUSED)
return;
rcu_read_lock();
@ -2109,7 +2384,7 @@ void __audit_inode_child(struct inode *parent,
struct list_head *list = &audit_filter_list[AUDIT_FILTER_FS];
int i;
if (!context->in_syscall)
if (context->context == AUDIT_CTX_UNUSED)
return;
rcu_read_lock();
@ -2208,7 +2483,7 @@ EXPORT_SYMBOL_GPL(__audit_inode_child);
int auditsc_get_stamp(struct audit_context *ctx,
struct timespec64 *t, unsigned int *serial)
{
if (!ctx->in_syscall)
if (ctx->context == AUDIT_CTX_UNUSED)
return 0;
if (!ctx->serial)
ctx->serial = audit_serial();
@ -2546,6 +2821,16 @@ void __audit_mmap_fd(int fd, int flags)
context->type = AUDIT_MMAP;
}
void __audit_openat2_how(struct open_how *how)
{
struct audit_context *context = audit_context();
context->openat2.flags = how->flags;
context->openat2.mode = how->mode;
context->openat2.resolve = how->resolve;
context->type = AUDIT_OPENAT2;
}
void __audit_log_kern_module(char *name)
{
struct audit_context *context = audit_context();
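__audit_openat2_how() above captures the three struct open_how fields that the new AUDIT_OPENAT2 record prints as oflag=, mode= and resolve=. For reference, a userspace openat2() call that would populate those fields might look like the sketch below (raw syscall since glibc has no wrapper; the path and resolve flag are illustrative):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/openat2.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct open_how how = {
		.flags = O_RDONLY | O_CLOEXEC,
		.mode = 0,
		.resolve = RESOLVE_NO_SYMLINKS,
	};
	int fd = syscall(SYS_openat2, AT_FDCWD, "/etc/hostname",
			 &how, sizeof(how));

	if (fd < 0) {
		perror("openat2");
		return 1;
	}
	close(fd);
	return 0;
}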
@ -2706,8 +2991,7 @@ void audit_seccomp_actions_logged(const char *names, const char *old_names,
struct list_head *audit_killed_trees(void)
{
struct audit_context *ctx = audit_context();
if (likely(!ctx || !ctx->in_syscall))
if (likely(!ctx || ctx->context == AUDIT_CTX_UNUSED))
return NULL;
return &ctx->killed_trees;
}

View File

@ -64,6 +64,7 @@ config BPF_JIT_DEFAULT_ON
config BPF_UNPRIV_DEFAULT_OFF
bool "Disable unprivileged BPF by default"
default y
depends on BPF_SYSCALL
help
Disables unprivileged BPF by default by setting the corresponding
@ -72,6 +73,12 @@ config BPF_UNPRIV_DEFAULT_OFF
disable it by setting it to 1 (from which no other transition to
0 is possible anymore).
Unprivileged BPF could be used to exploit certain potential
speculative execution side-channel vulnerabilities on unmitigated
affected hardware.
If you are unsure how to answer this question, answer Y.
source "kernel/bpf/preload/Kconfig"
config BPF_LSM

View File

@ -7,7 +7,7 @@ endif
CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy)
obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o
obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o bloom_filter.o
obj-$(CONFIG_BPF_SYSCALL) += local_storage.o queue_stack_maps.o ringbuf.o
obj-$(CONFIG_BPF_SYSCALL) += bpf_local_storage.o bpf_task_storage.o
obj-${CONFIG_BPF_LSM} += bpf_inode_storage.o
@ -36,3 +36,7 @@ obj-$(CONFIG_BPF_SYSCALL) += bpf_struct_ops.o
obj-${CONFIG_BPF_LSM} += bpf_lsm.o
endif
obj-$(CONFIG_BPF_PRELOAD) += preload/
obj-$(CONFIG_BPF_SYSCALL) += relo_core.o
$(obj)/relo_core.o: $(srctree)/tools/lib/bpf/relo_core.c FORCE
$(call if_changed_rule,cc_o_c)

View File

@ -645,7 +645,7 @@ static const struct bpf_iter_seq_info iter_seq_info = {
.seq_priv_size = sizeof(struct bpf_iter_seq_array_map_info),
};
static int bpf_for_each_array_elem(struct bpf_map *map, void *callback_fn,
static int bpf_for_each_array_elem(struct bpf_map *map, bpf_callback_t callback_fn,
void *callback_ctx, u64 flags)
{
u32 i, key, num_elems = 0;
@ -668,9 +668,8 @@ static int bpf_for_each_array_elem(struct bpf_map *map, void *callback_fn,
val = array->value + array->elem_size * i;
num_elems++;
key = i;
ret = BPF_CAST_CALL(callback_fn)((u64)(long)map,
(u64)(long)&key, (u64)(long)val,
(u64)(long)callback_ctx, 0);
ret = callback_fn((u64)(long)map, (u64)(long)&key,
(u64)(long)val, (u64)(long)callback_ctx, 0);
/* return value: 0 - continue, 1 - stop and return */
if (ret)
break;

View File

@ -17,6 +17,7 @@
#include <linux/bpf_lsm.h>
#include <linux/btf_ids.h>
#include <linux/fdtable.h>
#include <linux/rcupdate_trace.h>
DEFINE_BPF_STORAGE_CACHE(inode_cache);
@ -44,7 +45,8 @@ static struct bpf_local_storage_data *inode_storage_lookup(struct inode *inode,
if (!bsb)
return NULL;
inode_storage = rcu_dereference(bsb->storage);
inode_storage =
rcu_dereference_check(bsb->storage, bpf_rcu_lock_held());
if (!inode_storage)
return NULL;
@ -172,6 +174,7 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
{
struct bpf_local_storage_data *sdata;
WARN_ON_ONCE(!bpf_rcu_lock_held());
if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
return (unsigned long)NULL;
@ -204,6 +207,7 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
BPF_CALL_2(bpf_inode_storage_delete,
struct bpf_map *, map, struct inode *, inode)
{
WARN_ON_ONCE(!bpf_rcu_lock_held());
if (!inode)
return -EINVAL;

View File

@ -714,3 +714,38 @@ const struct bpf_func_proto bpf_for_each_map_elem_proto = {
.arg3_type = ARG_PTR_TO_STACK_OR_NULL,
.arg4_type = ARG_ANYTHING,
};
/* maximum number of loops */
#define MAX_LOOPS BIT(23)
BPF_CALL_4(bpf_loop, u32, nr_loops, void *, callback_fn, void *, callback_ctx,
u64, flags)
{
bpf_callback_t callback = (bpf_callback_t)callback_fn;
u64 ret;
u32 i;
if (flags)
return -EINVAL;
if (nr_loops > MAX_LOOPS)
return -E2BIG;
for (i = 0; i < nr_loops; i++) {
ret = callback((u64)i, (u64)(long)callback_ctx, 0, 0, 0);
/* return value: 0 - continue, 1 - stop and return */
if (ret)
return i + 1;
}
return i;
}
const struct bpf_func_proto bpf_loop_proto = {
.func = bpf_loop,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_ANYTHING,
.arg2_type = ARG_PTR_TO_FUNC,
.arg3_type = ARG_PTR_TO_STACK_OR_NULL,
.arg4_type = ARG_ANYTHING,
};
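The bpf_loop() helper added above runs callback_fn up to nr_loops times (capped at 1 << 23) and stops early when the callback returns 1. A sketch of how a BPF program might use it, assuming a libbpf toolchain new enough to declare the helper; the program section, context struct and names are illustrative:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct loop_ctx {
	__u64 sum;
};

/* callback: return 0 to continue, 1 to stop early */
static long add_index(__u32 index, void *data)
{
	struct loop_ctx *c = data;

	c->sum += index;
	return 0;
}

SEC("tracepoint/syscalls/sys_enter_getpid")
int sum_indices(void *ctx)
{
	struct loop_ctx c = { .sum = 0 };

	bpf_loop(100, add_index, &c, 0);	/* flags must currently be 0 */
	return 0;
}

char LICENSE[] SEC("license") = "GPL";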

View File

@ -11,6 +11,9 @@
#include <net/sock.h>
#include <uapi/linux/sock_diag.h>
#include <uapi/linux/btf.h>
#include <linux/rcupdate.h>
#include <linux/rcupdate_trace.h>
#include <linux/rcupdate_wait.h>
#define BPF_LOCAL_STORAGE_CREATE_FLAG_MASK (BPF_F_NO_PREALLOC | BPF_F_CLONE)
@ -81,6 +84,22 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
return NULL;
}
void bpf_local_storage_free_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage *local_storage;
local_storage = container_of(rcu, struct bpf_local_storage, rcu);
kfree_rcu(local_storage, rcu);
}
static void bpf_selem_free_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage_elem *selem;
selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
kfree_rcu(selem, rcu);
}
/* local_storage->lock must be held and selem->local_storage == local_storage.
* The caller must ensure selem->smap is still valid to be
* dereferenced for its smap->elem_size and smap->cache_idx.
@ -93,7 +112,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
bool free_local_storage;
void *owner;
smap = rcu_dereference(SDATA(selem)->smap);
smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
owner = local_storage->owner;
/* All uncharging on the owner must be done first.
@ -118,12 +137,12 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
*
* Although the unlock will be done under
* rcu_read_lock(), it is more intuitive to
* read if kfree_rcu(local_storage, rcu) is done
* read if the freeing of the storage is done
* after the raw_spin_unlock_bh(&local_storage->lock).
*
* Hence, a "bool free_local_storage" is returned
* to the caller which then calls the kfree_rcu()
* after unlock.
* to the caller which then frees the storage after
* all the RCU grace periods have expired.
*/
}
hlist_del_init_rcu(&selem->snode);
@ -131,8 +150,7 @@ bool bpf_selem_unlink_storage_nolock(struct bpf_local_storage *local_storage,
SDATA(selem))
RCU_INIT_POINTER(local_storage->cache[smap->cache_idx], NULL);
kfree_rcu(selem, rcu);
call_rcu_tasks_trace(&selem->rcu, bpf_selem_free_rcu);
return free_local_storage;
}
@ -146,7 +164,8 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
/* selem has already been unlinked from sk */
return;
local_storage = rcu_dereference(selem->local_storage);
local_storage = rcu_dereference_check(selem->local_storage,
bpf_rcu_lock_held());
raw_spin_lock_irqsave(&local_storage->lock, flags);
if (likely(selem_linked_to_storage(selem)))
free_local_storage = bpf_selem_unlink_storage_nolock(
@ -154,7 +173,8 @@ static void __bpf_selem_unlink_storage(struct bpf_local_storage_elem *selem)
raw_spin_unlock_irqrestore(&local_storage->lock, flags);
if (free_local_storage)
kfree_rcu(local_storage, rcu);
call_rcu_tasks_trace(&local_storage->rcu,
bpf_local_storage_free_rcu);
}
void bpf_selem_link_storage_nolock(struct bpf_local_storage *local_storage,
@ -174,7 +194,7 @@ void bpf_selem_unlink_map(struct bpf_local_storage_elem *selem)
/* selem has already be unlinked from smap */
return;
smap = rcu_dereference(SDATA(selem)->smap);
smap = rcu_dereference_check(SDATA(selem)->smap, bpf_rcu_lock_held());
b = select_bucket(smap, selem);
raw_spin_lock_irqsave(&b->lock, flags);
if (likely(selem_linked_to_map(selem)))
@ -213,12 +233,14 @@ bpf_local_storage_lookup(struct bpf_local_storage *local_storage,
struct bpf_local_storage_elem *selem;
/* Fast path (cache hit) */
sdata = rcu_dereference(local_storage->cache[smap->cache_idx]);
sdata = rcu_dereference_check(local_storage->cache[smap->cache_idx],
bpf_rcu_lock_held());
if (sdata && rcu_access_pointer(sdata->smap) == smap)
return sdata;
/* Slow path (cache miss) */
hlist_for_each_entry_rcu(selem, &local_storage->list, snode)
hlist_for_each_entry_rcu(selem, &local_storage->list, snode,
rcu_read_lock_trace_held())
if (rcu_access_pointer(SDATA(selem)->smap) == smap)
break;
@ -306,7 +328,8 @@ int bpf_local_storage_alloc(void *owner,
* bucket->list, first_selem can be freed immediately
* (instead of kfree_rcu) because
* bpf_local_storage_map_free() does a
* synchronize_rcu() before walking the bucket->list.
* synchronize_rcu_mult (waiting for both sleepable and
* normal programs) before walking the bucket->list.
* Hence, no one is accessing selem from the
* bucket->list under rcu_read_lock().
*/
@ -342,7 +365,8 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
!map_value_has_spin_lock(&smap->map)))
return ERR_PTR(-EINVAL);
local_storage = rcu_dereference(*owner_storage(smap, owner));
local_storage = rcu_dereference_check(*owner_storage(smap, owner),
bpf_rcu_lock_held());
if (!local_storage || hlist_empty(&local_storage->list)) {
/* Very first elem for the owner */
err = check_flags(NULL, map_flags);

View File

@ -207,7 +207,7 @@ BTF_ID(func, bpf_lsm_socket_socketpair)
BTF_ID(func, bpf_lsm_syslog)
BTF_ID(func, bpf_lsm_task_alloc)
BTF_ID(func, bpf_lsm_task_getsecid_subj)
BTF_ID(func, bpf_lsm_current_getsecid_subj)
BTF_ID(func, bpf_lsm_task_getsecid_obj)
BTF_ID(func, bpf_lsm_task_prctl)
BTF_ID(func, bpf_lsm_task_setscheduler)

View File

@ -93,6 +93,9 @@ const struct bpf_verifier_ops bpf_struct_ops_verifier_ops = {
};
const struct bpf_prog_ops bpf_struct_ops_prog_ops = {
#ifdef CONFIG_NET
.test_run = bpf_struct_ops_test_run,
#endif
};
static const struct btf_type *module_type;
@ -162,7 +165,7 @@ void bpf_struct_ops_init(struct btf *btf, struct bpf_verifier_log *log)
break;
}
if (btf_member_bitfield_size(t, member)) {
if (__btf_member_bitfield_size(t, member)) {
pr_warn("bit field member %s in struct %s is not supported\n",
mname, st_ops->name);
break;
@ -293,7 +296,7 @@ static int check_zero_holes(const struct btf_type *t, void *data)
const struct btf_type *mtype;
for_each_member(i, t, member) {
moff = btf_member_bit_offset(t, member) / 8;
moff = __btf_member_bit_offset(t, member) / 8;
if (moff > prev_mend &&
memchr_inv(data + prev_mend, 0, moff - prev_mend))
return -EINVAL;
@ -312,6 +315,20 @@ static int check_zero_holes(const struct btf_type *t, void *data)
return 0;
}
int bpf_struct_ops_prepare_trampoline(struct bpf_tramp_progs *tprogs,
struct bpf_prog *prog,
const struct btf_func_model *model,
void *image, void *image_end)
{
u32 flags;
tprogs[BPF_TRAMP_FENTRY].progs[0] = prog;
tprogs[BPF_TRAMP_FENTRY].nr_progs = 1;
flags = model->ret_size > 0 ? BPF_TRAMP_F_RET_FENTRY_RET : 0;
return arch_prepare_bpf_trampoline(NULL, image, image_end,
model, flags, tprogs, NULL);
}
static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
void *value, u64 flags)
{
@ -323,7 +340,7 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
struct bpf_tramp_progs *tprogs = NULL;
void *udata, *kdata;
int prog_fd, err = 0;
void *image;
void *image, *image_end;
u32 i;
if (flags)
@ -363,14 +380,14 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
udata = &uvalue->data;
kdata = &kvalue->data;
image = st_map->image;
image_end = st_map->image + PAGE_SIZE;
for_each_member(i, t, member) {
const struct btf_type *mtype, *ptype;
struct bpf_prog *prog;
u32 moff;
u32 flags;
moff = btf_member_bit_offset(t, member) / 8;
moff = __btf_member_bit_offset(t, member) / 8;
ptype = btf_type_resolve_ptr(btf_vmlinux, member->type, NULL);
if (ptype == module_type) {
if (*(void **)(udata + moff))
@ -430,14 +447,9 @@ static int bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key,
goto reset_unlock;
}
tprogs[BPF_TRAMP_FENTRY].progs[0] = prog;
tprogs[BPF_TRAMP_FENTRY].nr_progs = 1;
flags = st_ops->func_models[i].ret_size > 0 ?
BPF_TRAMP_F_RET_FENTRY_RET : 0;
err = arch_prepare_bpf_trampoline(NULL, image,
st_map->image + PAGE_SIZE,
err = bpf_struct_ops_prepare_trampoline(tprogs, prog,
&st_ops->func_models[i],
flags, tprogs, NULL);
image, image_end);
if (err < 0)
goto reset_unlock;

View File

@ -2,6 +2,9 @@
/* internal file - do not include directly */
#ifdef CONFIG_BPF_JIT
#ifdef CONFIG_NET
BPF_STRUCT_OPS_TYPE(bpf_dummy_ops)
#endif
#ifdef CONFIG_INET
#include <net/tcp.h>
BPF_STRUCT_OPS_TYPE(tcp_congestion_ops)

View File

@ -17,6 +17,7 @@
#include <uapi/linux/btf.h>
#include <linux/btf_ids.h>
#include <linux/fdtable.h>
#include <linux/rcupdate_trace.h>
DEFINE_BPF_STORAGE_CACHE(task_cache);
@ -59,7 +60,8 @@ task_storage_lookup(struct task_struct *task, struct bpf_map *map,
struct bpf_local_storage *task_storage;
struct bpf_local_storage_map *smap;
task_storage = rcu_dereference(task->bpf_storage);
task_storage =
rcu_dereference_check(task->bpf_storage, bpf_rcu_lock_held());
if (!task_storage)
return NULL;
@ -229,6 +231,7 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
{
struct bpf_local_storage_data *sdata;
WARN_ON_ONCE(!bpf_rcu_lock_held());
if (flags & ~(BPF_LOCAL_STORAGE_GET_F_CREATE))
return (unsigned long)NULL;
@ -260,6 +263,7 @@ BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *,
{
int ret;
WARN_ON_ONCE(!bpf_rcu_lock_held());
if (!task)
return -EINVAL;
@ -323,7 +327,7 @@ const struct bpf_func_proto bpf_task_storage_get_proto = {
.ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_PTR_TO_BTF_ID,
.arg2_btf_id = &btf_task_struct_ids[0],
.arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
.arg3_type = ARG_PTR_TO_MAP_VALUE_OR_NULL,
.arg4_type = ARG_ANYTHING,
};
@ -334,5 +338,5 @@ const struct bpf_func_proto bpf_task_storage_delete_proto = {
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_PTR_TO_BTF_ID,
.arg2_btf_id = &btf_task_struct_ids[0],
.arg2_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
};

View File

@ -25,6 +25,7 @@
#include <linux/kobject.h>
#include <linux/sysfs.h>
#include <net/sock.h>
#include "../tools/lib/bpf/relo_core.h"
/* BTF (BPF Type Format) is the meta data format which describes
* the data types of BPF program/map. Hence, it basically focus
@ -281,6 +282,8 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = {
[BTF_KIND_VAR] = "VAR",
[BTF_KIND_DATASEC] = "DATASEC",
[BTF_KIND_FLOAT] = "FLOAT",
[BTF_KIND_DECL_TAG] = "DECL_TAG",
[BTF_KIND_TYPE_TAG] = "TYPE_TAG",
};
const char *btf_type_str(const struct btf_type *t)
@ -417,6 +420,7 @@ static bool btf_type_is_modifier(const struct btf_type *t)
case BTF_KIND_VOLATILE:
case BTF_KIND_CONST:
case BTF_KIND_RESTRICT:
case BTF_KIND_TYPE_TAG:
return true;
}
@ -459,6 +463,17 @@ static bool btf_type_is_datasec(const struct btf_type *t)
return BTF_INFO_KIND(t->info) == BTF_KIND_DATASEC;
}
static bool btf_type_is_decl_tag(const struct btf_type *t)
{
return BTF_INFO_KIND(t->info) == BTF_KIND_DECL_TAG;
}
static bool btf_type_is_decl_tag_target(const struct btf_type *t)
{
return btf_type_is_func(t) || btf_type_is_struct(t) ||
btf_type_is_var(t) || btf_type_is_typedef(t);
}
u32 btf_nr_types(const struct btf *btf)
{
u32 total = 0;
@ -537,6 +552,7 @@ const struct btf_type *btf_type_resolve_func_ptr(const struct btf *btf,
static bool btf_type_is_resolve_source_only(const struct btf_type *t)
{
return btf_type_is_var(t) ||
btf_type_is_decl_tag(t) ||
btf_type_is_datasec(t);
}
@ -563,6 +579,7 @@ static bool btf_type_needs_resolve(const struct btf_type *t)
btf_type_is_struct(t) ||
btf_type_is_array(t) ||
btf_type_is_var(t) ||
btf_type_is_decl_tag(t) ||
btf_type_is_datasec(t);
}
@ -616,6 +633,11 @@ static const struct btf_var *btf_type_var(const struct btf_type *t)
return (const struct btf_var *)(t + 1);
}
static const struct btf_decl_tag *btf_type_decl_tag(const struct btf_type *t)
{
return (const struct btf_decl_tag *)(t + 1);
}
static const struct btf_kind_operations *btf_type_ops(const struct btf_type *t)
{
return kind_ops[BTF_INFO_KIND(t->info)];
@ -815,7 +837,7 @@ static const char *btf_show_name(struct btf_show *show)
const char *ptr_suffix = &ptr_suffixes[strlen(ptr_suffixes)];
const char *name = NULL, *prefix = "", *parens = "";
const struct btf_member *m = show->state.member;
const struct btf_type *t = show->state.type;
const struct btf_type *t;
const struct btf_array *array;
u32 id = show->state.type_id;
const char *member = NULL;
@ -1718,6 +1740,7 @@ __btf_resolve_size(const struct btf *btf, const struct btf_type *type,
case BTF_KIND_VOLATILE:
case BTF_KIND_CONST:
case BTF_KIND_RESTRICT:
case BTF_KIND_TYPE_TAG:
id = type->type;
type = btf_type_by_id(btf, type->type);
break;
@ -2326,6 +2349,8 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
const struct btf_type *t,
u32 meta_left)
{
const char *value;
if (btf_type_vlen(t)) {
btf_verifier_log_type(env, t, "vlen != 0");
return -EINVAL;
@ -2341,7 +2366,7 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
/* typedef type must have a valid name, and other ref types,
/* typedef/type_tag type must have a valid name, and other ref types,
* volatile, const, restrict, should have a null name.
*/
if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPEDEF) {
@ -2350,6 +2375,12 @@ static int btf_ref_type_check_meta(struct btf_verifier_env *env,
btf_verifier_log_type(env, t, "Invalid name");
return -EINVAL;
}
} else if (BTF_INFO_KIND(t->info) == BTF_KIND_TYPE_TAG) {
value = btf_name_by_offset(env->btf, t->name_off);
if (!value || !value[0]) {
btf_verifier_log_type(env, t, "Invalid name");
return -EINVAL;
}
} else {
if (t->name_off) {
btf_verifier_log_type(env, t, "Invalid name");
@ -2939,7 +2970,7 @@ static s32 btf_struct_check_meta(struct btf_verifier_env *env,
return -EINVAL;
}
offset = btf_member_bit_offset(t, member);
offset = __btf_member_bit_offset(t, member);
if (is_union && offset) {
btf_verifier_log_member(env, t, member,
"Invalid member bits_offset");
@ -3064,7 +3095,7 @@ static int btf_find_struct_field(const struct btf *btf, const struct btf_type *t
if (off != -ENOENT)
/* only one such field is allowed */
return -E2BIG;
off = btf_member_bit_offset(t, member);
off = __btf_member_bit_offset(t, member);
if (off % 8)
/* valid C code cannot generate such BTF */
return -EINVAL;
@ -3154,8 +3185,8 @@ static void __btf_struct_show(const struct btf *btf, const struct btf_type *t,
btf_show_start_member(show, member);
member_offset = btf_member_bit_offset(t, member);
bitfield_size = btf_member_bitfield_size(t, member);
member_offset = __btf_member_bit_offset(t, member);
bitfield_size = __btf_member_bitfield_size(t, member);
bytes_offset = BITS_ROUNDDOWN_BYTES(member_offset);
bits8_offset = BITS_PER_BYTE_MASKED(member_offset);
if (bitfield_size) {
@ -3801,6 +3832,110 @@ static const struct btf_kind_operations float_ops = {
.show = btf_df_show,
};
static s32 btf_decl_tag_check_meta(struct btf_verifier_env *env,
const struct btf_type *t,
u32 meta_left)
{
const struct btf_decl_tag *tag;
u32 meta_needed = sizeof(*tag);
s32 component_idx;
const char *value;
if (meta_left < meta_needed) {
btf_verifier_log_basic(env, t,
"meta_left:%u meta_needed:%u",
meta_left, meta_needed);
return -EINVAL;
}
value = btf_name_by_offset(env->btf, t->name_off);
if (!value || !value[0]) {
btf_verifier_log_type(env, t, "Invalid value");
return -EINVAL;
}
if (btf_type_vlen(t)) {
btf_verifier_log_type(env, t, "vlen != 0");
return -EINVAL;
}
if (btf_type_kflag(t)) {
btf_verifier_log_type(env, t, "Invalid btf_info kind_flag");
return -EINVAL;
}
component_idx = btf_type_decl_tag(t)->component_idx;
if (component_idx < -1) {
btf_verifier_log_type(env, t, "Invalid component_idx");
return -EINVAL;
}
btf_verifier_log_type(env, t, NULL);
return meta_needed;
}
static int btf_decl_tag_resolve(struct btf_verifier_env *env,
const struct resolve_vertex *v)
{
const struct btf_type *next_type;
const struct btf_type *t = v->t;
u32 next_type_id = t->type;
struct btf *btf = env->btf;
s32 component_idx;
u32 vlen;
next_type = btf_type_by_id(btf, next_type_id);
if (!next_type || !btf_type_is_decl_tag_target(next_type)) {
btf_verifier_log_type(env, v->t, "Invalid type_id");
return -EINVAL;
}
if (!env_type_is_resolve_sink(env, next_type) &&
!env_type_is_resolved(env, next_type_id))
return env_stack_push(env, next_type, next_type_id);
component_idx = btf_type_decl_tag(t)->component_idx;
if (component_idx != -1) {
if (btf_type_is_var(next_type) || btf_type_is_typedef(next_type)) {
btf_verifier_log_type(env, v->t, "Invalid component_idx");
return -EINVAL;
}
if (btf_type_is_struct(next_type)) {
vlen = btf_type_vlen(next_type);
} else {
/* next_type should be a function */
next_type = btf_type_by_id(btf, next_type->type);
vlen = btf_type_vlen(next_type);
}
if ((u32)component_idx >= vlen) {
btf_verifier_log_type(env, v->t, "Invalid component_idx");
return -EINVAL;
}
}
env_stack_pop_resolved(env, next_type_id, 0);
return 0;
}
static void btf_decl_tag_log(struct btf_verifier_env *env, const struct btf_type *t)
{
btf_verifier_log(env, "type=%u component_idx=%d", t->type,
btf_type_decl_tag(t)->component_idx);
}
static const struct btf_kind_operations decl_tag_ops = {
.check_meta = btf_decl_tag_check_meta,
.resolve = btf_decl_tag_resolve,
.check_member = btf_df_check_member,
.check_kflag_member = btf_df_check_kflag_member,
.log_details = btf_decl_tag_log,
.show = btf_df_show,
};
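For orientation, BTF_KIND_DECL_TAG and BTF_KIND_TYPE_TAG entries are normally emitted by clang's btf_decl_tag/btf_type_tag attributes; the checks above require a decl tag to point at a struct, func, var or typedef, with component_idx naming a member or parameter (or -1 for the whole target). A small illustrative C fragment, assuming a recent clang, not part of this change:

    #define __tag(x)   __attribute__((btf_decl_tag(x)))
    #define __rdonly   __attribute__((btf_type_tag("rdonly")))

    struct pkt_md {
        unsigned int ifindex __tag("user_visible");   /* DECL_TAG, component_idx 0 */
        unsigned char __rdonly *data;                 /* TYPE_TAG on the pointee type */
    } __tag("uapi");                                  /* DECL_TAG, component_idx -1 */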
static int btf_func_proto_check(struct btf_verifier_env *env,
const struct btf_type *t)
{
@ -3935,6 +4070,8 @@ static const struct btf_kind_operations * const kind_ops[NR_BTF_KINDS] = {
[BTF_KIND_VAR] = &var_ops,
[BTF_KIND_DATASEC] = &datasec_ops,
[BTF_KIND_FLOAT] = &float_ops,
[BTF_KIND_DECL_TAG] = &decl_tag_ops,
[BTF_KIND_TYPE_TAG] = &modifier_ops,
};
static s32 btf_check_meta(struct btf_verifier_env *env,
@ -4019,6 +4156,10 @@ static bool btf_resolve_valid(struct btf_verifier_env *env,
return !btf_resolved_type_id(btf, type_id) &&
!btf_resolved_type_size(btf, type_id);
if (btf_type_is_decl_tag(t))
return btf_resolved_type_id(btf, type_id) &&
!btf_resolved_type_size(btf, type_id);
if (btf_type_is_modifier(t) || btf_type_is_ptr(t) ||
btf_type_is_var(t)) {
t = btf_type_id_resolve(btf, &type_id);
@ -4685,7 +4826,7 @@ struct btf *bpf_prog_get_target_btf(const struct bpf_prog *prog)
return prog->aux->attach_btf;
}
static bool is_string_ptr(struct btf *btf, const struct btf_type *t)
static bool is_int_ptr(struct btf *btf, const struct btf_type *t)
{
/* t comes in already as a pointer */
t = btf_type_by_id(btf, t->type);
@ -4694,8 +4835,7 @@ static bool is_string_ptr(struct btf *btf, const struct btf_type *t)
if (BTF_INFO_KIND(t->info) == BTF_KIND_CONST)
t = btf_type_by_id(btf, t->type);
/* char, signed char, unsigned char */
return btf_type_is_int(t) && t->size == 1;
return btf_type_is_int(t);
}
bool btf_ctx_access(int off, int size, enum bpf_access_type type,
@ -4800,10 +4940,12 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
/* check for PTR_TO_RDONLY_BUF_OR_NULL or PTR_TO_RDWR_BUF_OR_NULL */
for (i = 0; i < prog->aux->ctx_arg_info_size; i++) {
const struct bpf_ctx_arg_aux *ctx_arg_info = &prog->aux->ctx_arg_info[i];
u32 type, flag;
if (ctx_arg_info->offset == off &&
(ctx_arg_info->reg_type == PTR_TO_RDONLY_BUF_OR_NULL ||
ctx_arg_info->reg_type == PTR_TO_RDWR_BUF_OR_NULL)) {
type = base_type(ctx_arg_info->reg_type);
flag = type_flag(ctx_arg_info->reg_type);
if (ctx_arg_info->offset == off && type == PTR_TO_BUF &&
(flag & PTR_MAYBE_NULL)) {
info->reg_type = ctx_arg_info->reg_type;
return true;
}
@ -4816,7 +4958,7 @@ bool btf_ctx_access(int off, int size, enum bpf_access_type type,
*/
return true;
if (is_string_ptr(btf, t))
if (is_int_ptr(btf, t))
return true;
/* this is a pointer to another type */
@ -4919,7 +5061,7 @@ static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,
if (array_elem->nelems != 0)
goto error;
moff = btf_member_bit_offset(t, member) / 8;
moff = __btf_member_bit_offset(t, member) / 8;
if (off < moff)
goto error;
@ -4942,14 +5084,14 @@ static int btf_struct_walk(struct bpf_verifier_log *log, const struct btf *btf,
for_each_member(i, t, member) {
/* offset of the field in bytes */
moff = btf_member_bit_offset(t, member) / 8;
moff = __btf_member_bit_offset(t, member) / 8;
if (off + size <= moff)
/* won't find anything, field is already too far */
break;
if (btf_member_bitfield_size(t, member)) {
u32 end_bit = btf_member_bit_offset(t, member) +
btf_member_bitfield_size(t, member);
if (__btf_member_bitfield_size(t, member)) {
u32 end_bit = __btf_member_bit_offset(t, member) +
__btf_member_bitfield_size(t, member);
/* off <= moff instead of off == moff because clang
* does not generate a BTF member for anonymous
@ -5434,12 +5576,53 @@ static u32 *reg2btf_ids[__BPF_REG_TYPE_MAX] = {
#endif
};
/* Returns true if struct is composed of scalars, 4 levels of nesting allowed */
static bool __btf_type_is_scalar_struct(struct bpf_verifier_log *log,
const struct btf *btf,
const struct btf_type *t, int rec)
{
const struct btf_type *member_type;
const struct btf_member *member;
u32 i;
if (!btf_type_is_struct(t))
return false;
for_each_member(i, t, member) {
const struct btf_array *array;
member_type = btf_type_skip_modifiers(btf, member->type, NULL);
if (btf_type_is_struct(member_type)) {
if (rec >= 3) {
bpf_log(log, "max struct nesting depth exceeded\n");
return false;
}
if (!__btf_type_is_scalar_struct(log, btf, member_type, rec + 1))
return false;
continue;
}
if (btf_type_is_array(member_type)) {
array = btf_type_array(member_type);
if (!array->nelems)
return false;
member_type = btf_type_skip_modifiers(btf, array->type, NULL);
if (!btf_type_is_scalar(member_type))
return false;
continue;
}
if (!btf_type_is_scalar(member_type))
return false;
}
return true;
}
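__btf_type_is_scalar_struct() is what lets a kfunc take a pointer to plain data: every member must be a scalar, a non-empty array of scalars, or another such struct, up to the four levels of nesting noted in the comment. Illustrative only, not part of this change:

    struct scale {              /* accepted: scalars and an array of scalars */
        int a;
        short b[4];
    };

    struct kfunc_args {         /* accepted: one level of struct nesting */
        unsigned long flags;
        struct scale s;
    };

    struct rejected {           /* rejected: a pointer member is not a scalar */
        int *p;
    };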
static int btf_check_func_arg_match(struct bpf_verifier_env *env,
const struct btf *btf, u32 func_id,
struct bpf_reg_state *regs,
bool ptr_to_mem_ok)
{
struct bpf_verifier_log *log = &env->log;
bool is_kfunc = btf_is_kernel(btf);
const char *func_name, *ref_tname;
const struct btf_type *t, *ref_t;
const struct btf_param *args;
@ -5492,7 +5675,21 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
ref_t = btf_type_skip_modifiers(btf, t->type, &ref_id);
ref_tname = btf_name_by_offset(btf, ref_t->name_off);
if (btf_is_kernel(btf)) {
if (btf_get_prog_ctx_type(log, btf, t,
env->prog->type, i)) {
/* If function expects ctx type in BTF check that caller
* is passing PTR_TO_CTX.
*/
if (reg->type != PTR_TO_CTX) {
bpf_log(log,
"arg#%d expected pointer to ctx, but got %s\n",
i, btf_type_str(t));
return -EINVAL;
}
if (check_ptr_off_reg(env, reg, regno))
return -EINVAL;
} else if (is_kfunc && (reg->type == PTR_TO_BTF_ID ||
(reg2btf_ids[base_type(reg->type)] && !type_flag(reg->type)))) {
const struct btf_type *reg_ref_t;
const struct btf *reg_btf;
const char *reg_ref_tname;
@ -5508,14 +5705,9 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
if (reg->type == PTR_TO_BTF_ID) {
reg_btf = reg->btf;
reg_ref_id = reg->btf_id;
} else if (reg2btf_ids[reg->type]) {
reg_btf = btf_vmlinux;
reg_ref_id = *reg2btf_ids[reg->type];
} else {
bpf_log(log, "kernel function %s args#%d expected pointer to %s %s but R%d is not a pointer to btf_id\n",
func_name, i,
btf_type_str(ref_t), ref_tname, regno);
return -EINVAL;
reg_btf = btf_vmlinux;
reg_ref_id = *reg2btf_ids[base_type(reg->type)];
}
reg_ref_t = btf_type_skip_modifiers(reg_btf, reg_ref_id,
@ -5531,23 +5723,24 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
reg_ref_tname);
return -EINVAL;
}
} else if (btf_get_prog_ctx_type(log, btf, t,
env->prog->type, i)) {
/* If function expects ctx type in BTF check that caller
* is passing PTR_TO_CTX.
*/
if (reg->type != PTR_TO_CTX) {
bpf_log(log,
"arg#%d expected pointer to ctx, but got %s\n",
i, btf_type_str(t));
return -EINVAL;
}
if (check_ctx_reg(env, reg, regno))
return -EINVAL;
} else if (ptr_to_mem_ok) {
const struct btf_type *resolve_ret;
u32 type_size;
if (is_kfunc) {
/* Permit pointer to mem, but only when argument
* type is pointer to scalar, or struct composed
* (recursively) of scalars.
*/
if (!btf_type_is_scalar(ref_t) &&
!__btf_type_is_scalar_struct(log, btf, ref_t, 0)) {
bpf_log(log,
"arg#%d pointer type %s %s must point to scalar or struct with scalar\n",
i, btf_type_str(ref_t), ref_tname);
return -EINVAL;
}
}
resolve_ret = btf_resolve_size(btf, ref_t, &type_size);
if (IS_ERR(resolve_ret)) {
bpf_log(log,
@ -5560,6 +5753,8 @@ static int btf_check_func_arg_match(struct bpf_verifier_env *env,
if (check_mem_reg(env, reg, regno, type_size))
return -EINVAL;
} else {
bpf_log(log, "reg type unsupported for arg#%d %sfunction %s#%d\n", i,
is_kfunc ? "kernel " : "", func_name, func_id);
return -EINVAL;
}
}
@ -5609,7 +5804,7 @@ int btf_check_kfunc_arg_match(struct bpf_verifier_env *env,
const struct btf *btf, u32 func_id,
struct bpf_reg_state *regs)
{
return btf_check_func_arg_match(env, btf, func_id, regs, false);
return btf_check_func_arg_match(env, btf, func_id, regs, true);
}
/* Convert BTF of a function into bpf_reg_state if possible
@ -5717,7 +5912,7 @@ int btf_prepare_func_args(struct bpf_verifier_env *env, int subprog,
return -EINVAL;
}
reg->type = PTR_TO_MEM_OR_NULL;
reg->type = PTR_TO_MEM | PTR_MAYBE_NULL;
reg->id = ++env->id_gen;
continue;
@ -6028,6 +6223,8 @@ btf_module_read(struct file *file, struct kobject *kobj,
return len;
}
static void purge_cand_cache(struct btf *btf);
static int btf_module_notify(struct notifier_block *nb, unsigned long op,
void *module)
{
@ -6062,6 +6259,7 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
goto out;
}
purge_cand_cache(NULL);
mutex_lock(&btf_module_mutex);
btf_mod->module = module;
btf_mod->btf = btf;
@ -6104,6 +6302,7 @@ static int btf_module_notify(struct notifier_block *nb, unsigned long op,
list_del(&btf_mod->list);
if (btf_mod->sysfs_attr)
sysfs_remove_bin_file(btf_kobj, btf_mod->sysfs_attr);
purge_cand_cache(btf_mod->btf);
btf_put(btf_mod->btf);
kfree(btf_mod->sysfs_attr);
kfree(btf_mod);
@ -6207,10 +6406,442 @@ const struct bpf_func_proto bpf_btf_find_by_name_kind_proto = {
.func = bpf_btf_find_by_name_kind,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_MEM,
.arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_ANYTHING,
};
BTF_ID_LIST_GLOBAL_SINGLE(btf_task_struct_ids, struct, task_struct)
BTF_ID_LIST_GLOBAL(btf_tracing_ids, MAX_BTF_TRACING_TYPE)
#define BTF_TRACING_TYPE(name, type) BTF_ID(struct, type)
BTF_TRACING_TYPE_xxx
#undef BTF_TRACING_TYPE
/* BTF ID set registration API for modules */
#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
void register_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
struct kfunc_btf_id_set *s)
{
mutex_lock(&l->mutex);
list_add(&s->list, &l->list);
mutex_unlock(&l->mutex);
}
EXPORT_SYMBOL_GPL(register_kfunc_btf_id_set);
void unregister_kfunc_btf_id_set(struct kfunc_btf_id_list *l,
struct kfunc_btf_id_set *s)
{
mutex_lock(&l->mutex);
list_del_init(&s->list);
mutex_unlock(&l->mutex);
}
EXPORT_SYMBOL_GPL(unregister_kfunc_btf_id_set);
bool bpf_check_mod_kfunc_call(struct kfunc_btf_id_list *klist, u32 kfunc_id,
struct module *owner)
{
struct kfunc_btf_id_set *s;
mutex_lock(&klist->mutex);
list_for_each_entry(s, &klist->list, list) {
if (s->owner == owner && btf_id_set_contains(s->set, kfunc_id)) {
mutex_unlock(&klist->mutex);
return true;
}
}
mutex_unlock(&klist->mutex);
return false;
}
#define DEFINE_KFUNC_BTF_ID_LIST(name) \
struct kfunc_btf_id_list name = { LIST_HEAD_INIT(name.list), \
__MUTEX_INITIALIZER(name.mutex) }; \
EXPORT_SYMBOL_GPL(name)
DEFINE_KFUNC_BTF_ID_LIST(bpf_tcp_ca_kfunc_list);
DEFINE_KFUNC_BTF_ID_LIST(prog_test_kfunc_list);
#endif
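These per-hook lists let a module make its own functions callable as kfuncs from BPF: the module builds a BTF ID set for them and adds it to the matching list at init time. A hedged sketch of the module side, assuming the BTF_SET_START/BTF_ID/BTF_SET_END macros from <linux/btf_ids.h> and a hypothetical congestion-control module exposing my_cc_init(); not taken from this change:

    BTF_SET_START(my_tcp_ca_kfunc_ids)
    BTF_ID(func, my_cc_init)                /* hypothetical module kfunc */
    BTF_SET_END(my_tcp_ca_kfunc_ids)

    static struct kfunc_btf_id_set my_kfunc_set = {
        .owner = THIS_MODULE,
        .set   = &my_tcp_ca_kfunc_ids,
    };

    static int __init my_cc_init_module(void)
    {
        register_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &my_kfunc_set);
        return 0;
    }

    static void __exit my_cc_exit_module(void)
    {
        unregister_kfunc_btf_id_set(&bpf_tcp_ca_kfunc_list, &my_kfunc_set);
    }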
int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id,
const struct btf *targ_btf, __u32 targ_id)
{
return -EOPNOTSUPP;
}
static bool bpf_core_is_flavor_sep(const char *s)
{
/* check X___Y name pattern, where X and Y are not underscores */
return s[0] != '_' && /* X */
s[1] == '_' && s[2] == '_' && s[3] == '_' && /* ___ */
s[4] != '_'; /* Y */
}
size_t bpf_core_essential_name_len(const char *name)
{
size_t n = strlen(name);
int i;
for (i = n - 5; i >= 0; i--) {
if (bpf_core_is_flavor_sep(name + i))
return i + 1;
}
return n;
}
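bpf_core_essential_name_len() brings libbpf's "flavor" convention into the kernel: a local type named X___Y (triple underscore) is matched against targets named X, so a program can carry several variants of a type and still relocate them against the single kernel definition. Illustrative only, not part of this change:

    /* Both program-side flavors match the kernel's "struct task_struct": */
    struct task_struct___pre514  { long state; };
    struct task_struct___post514 { unsigned int __state; };

    /* bpf_core_essential_name_len("task_struct___pre514") == strlen("task_struct") */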
struct bpf_cand_cache {
const char *name;
u32 name_len;
u16 kind;
u16 cnt;
struct {
const struct btf *btf;
u32 id;
} cands[];
};
static void bpf_free_cands(struct bpf_cand_cache *cands)
{
if (!cands->cnt)
/* empty candidate array was allocated on stack */
return;
kfree(cands);
}
static void bpf_free_cands_from_cache(struct bpf_cand_cache *cands)
{
kfree(cands->name);
kfree(cands);
}
#define VMLINUX_CAND_CACHE_SIZE 31
static struct bpf_cand_cache *vmlinux_cand_cache[VMLINUX_CAND_CACHE_SIZE];
#define MODULE_CAND_CACHE_SIZE 31
static struct bpf_cand_cache *module_cand_cache[MODULE_CAND_CACHE_SIZE];
static DEFINE_MUTEX(cand_cache_mutex);
static void __print_cand_cache(struct bpf_verifier_log *log,
struct bpf_cand_cache **cache,
int cache_size)
{
struct bpf_cand_cache *cc;
int i, j;
for (i = 0; i < cache_size; i++) {
cc = cache[i];
if (!cc)
continue;
bpf_log(log, "[%d]%s(", i, cc->name);
for (j = 0; j < cc->cnt; j++) {
bpf_log(log, "%d", cc->cands[j].id);
if (j < cc->cnt - 1)
bpf_log(log, " ");
}
bpf_log(log, "), ");
}
}
static void print_cand_cache(struct bpf_verifier_log *log)
{
mutex_lock(&cand_cache_mutex);
bpf_log(log, "vmlinux_cand_cache:");
__print_cand_cache(log, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
bpf_log(log, "\nmodule_cand_cache:");
__print_cand_cache(log, module_cand_cache, MODULE_CAND_CACHE_SIZE);
bpf_log(log, "\n");
mutex_unlock(&cand_cache_mutex);
}
static u32 hash_cands(struct bpf_cand_cache *cands)
{
return jhash(cands->name, cands->name_len, 0);
}
static struct bpf_cand_cache *check_cand_cache(struct bpf_cand_cache *cands,
struct bpf_cand_cache **cache,
int cache_size)
{
struct bpf_cand_cache *cc = cache[hash_cands(cands) % cache_size];
if (cc && cc->name_len == cands->name_len &&
!strncmp(cc->name, cands->name, cands->name_len))
return cc;
return NULL;
}
static size_t sizeof_cands(int cnt)
{
return offsetof(struct bpf_cand_cache, cands[cnt]);
}
static struct bpf_cand_cache *populate_cand_cache(struct bpf_cand_cache *cands,
struct bpf_cand_cache **cache,
int cache_size)
{
struct bpf_cand_cache **cc = &cache[hash_cands(cands) % cache_size], *new_cands;
if (*cc) {
bpf_free_cands_from_cache(*cc);
*cc = NULL;
}
new_cands = kmemdup(cands, sizeof_cands(cands->cnt), GFP_KERNEL);
if (!new_cands) {
bpf_free_cands(cands);
return ERR_PTR(-ENOMEM);
}
/* strdup the name, since it will stay in cache.
* the cands->name points to strings in prog's BTF and the prog can be unloaded.
*/
new_cands->name = kmemdup_nul(cands->name, cands->name_len, GFP_KERNEL);
bpf_free_cands(cands);
if (!new_cands->name) {
kfree(new_cands);
return ERR_PTR(-ENOMEM);
}
*cc = new_cands;
return new_cands;
}
#ifdef CONFIG_DEBUG_INFO_BTF_MODULES
static void __purge_cand_cache(struct btf *btf, struct bpf_cand_cache **cache,
int cache_size)
{
struct bpf_cand_cache *cc;
int i, j;
for (i = 0; i < cache_size; i++) {
cc = cache[i];
if (!cc)
continue;
if (!btf) {
/* when a new module is loaded, purge all of module_cand_cache,
* since the new module might have candidates with names
* that match cached cands.
*/
bpf_free_cands_from_cache(cc);
cache[i] = NULL;
continue;
}
/* when module is unloaded purge cache entries
* that match module's btf
*/
for (j = 0; j < cc->cnt; j++)
if (cc->cands[j].btf == btf) {
bpf_free_cands_from_cache(cc);
cache[i] = NULL;
break;
}
}
}
static void purge_cand_cache(struct btf *btf)
{
mutex_lock(&cand_cache_mutex);
__purge_cand_cache(btf, module_cand_cache, MODULE_CAND_CACHE_SIZE);
mutex_unlock(&cand_cache_mutex);
}
#endif
static struct bpf_cand_cache *
bpf_core_add_cands(struct bpf_cand_cache *cands, const struct btf *targ_btf,
int targ_start_id)
{
struct bpf_cand_cache *new_cands;
const struct btf_type *t;
const char *targ_name;
size_t targ_essent_len;
int n, i;
n = btf_nr_types(targ_btf);
for (i = targ_start_id; i < n; i++) {
t = btf_type_by_id(targ_btf, i);
if (btf_kind(t) != cands->kind)
continue;
targ_name = btf_name_by_offset(targ_btf, t->name_off);
if (!targ_name)
continue;
/* the resched point is before strncmp to make sure that a search
* for a non-existing name will have a chance to schedule().
*/
cond_resched();
if (strncmp(cands->name, targ_name, cands->name_len) != 0)
continue;
targ_essent_len = bpf_core_essential_name_len(targ_name);
if (targ_essent_len != cands->name_len)
continue;
/* most of the time there is only one candidate for a given kind+name pair */
new_cands = kmalloc(sizeof_cands(cands->cnt + 1), GFP_KERNEL);
if (!new_cands) {
bpf_free_cands(cands);
return ERR_PTR(-ENOMEM);
}
memcpy(new_cands, cands, sizeof_cands(cands->cnt));
bpf_free_cands(cands);
cands = new_cands;
cands->cands[cands->cnt].btf = targ_btf;
cands->cands[cands->cnt].id = i;
cands->cnt++;
}
return cands;
}
static struct bpf_cand_cache *
bpf_core_find_cands(struct bpf_core_ctx *ctx, u32 local_type_id)
{
struct bpf_cand_cache *cands, *cc, local_cand = {};
const struct btf *local_btf = ctx->btf;
const struct btf_type *local_type;
const struct btf *main_btf;
size_t local_essent_len;
struct btf *mod_btf;
const char *name;
int id;
main_btf = bpf_get_btf_vmlinux();
if (IS_ERR(main_btf))
return ERR_CAST(main_btf);
local_type = btf_type_by_id(local_btf, local_type_id);
if (!local_type)
return ERR_PTR(-EINVAL);
name = btf_name_by_offset(local_btf, local_type->name_off);
if (str_is_empty(name))
return ERR_PTR(-EINVAL);
local_essent_len = bpf_core_essential_name_len(name);
cands = &local_cand;
cands->name = name;
cands->kind = btf_kind(local_type);
cands->name_len = local_essent_len;
cc = check_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
/* cands is a pointer to stack here */
if (cc) {
if (cc->cnt)
return cc;
goto check_modules;
}
/* Attempt to find target candidates in vmlinux BTF first */
cands = bpf_core_add_cands(cands, main_btf, 1);
if (IS_ERR(cands))
return ERR_CAST(cands);
/* cands is a pointer to kmalloced memory here if cands->cnt > 0 */
/* populate cache even when cands->cnt == 0 */
cc = populate_cand_cache(cands, vmlinux_cand_cache, VMLINUX_CAND_CACHE_SIZE);
if (IS_ERR(cc))
return ERR_CAST(cc);
/* if vmlinux BTF has any candidate, don't go for module BTFs */
if (cc->cnt)
return cc;
check_modules:
/* cands is a pointer to stack here and cands->cnt == 0 */
cc = check_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE);
if (cc)
/* if cache has it return it even if cc->cnt == 0 */
return cc;
/* If candidate is not found in vmlinux's BTF then search in module's BTFs */
spin_lock_bh(&btf_idr_lock);
idr_for_each_entry(&btf_idr, mod_btf, id) {
if (!btf_is_module(mod_btf))
continue;
/* linear search could be slow hence unlock/lock
* the IDR to avoid holding it for too long
*/
btf_get(mod_btf);
spin_unlock_bh(&btf_idr_lock);
cands = bpf_core_add_cands(cands, mod_btf, btf_nr_types(main_btf));
if (IS_ERR(cands)) {
btf_put(mod_btf);
return ERR_CAST(cands);
}
spin_lock_bh(&btf_idr_lock);
btf_put(mod_btf);
}
spin_unlock_bh(&btf_idr_lock);
/* cands is a pointer to kmalloced memory here if cands->cnt > 0
* or pointer to stack if cands->cnt == 0.
* Copy it into the cache even when cands->cnt == 0 and
* return the result.
*/
return populate_cand_cache(cands, module_cand_cache, MODULE_CAND_CACHE_SIZE);
}
int bpf_core_apply(struct bpf_core_ctx *ctx, const struct bpf_core_relo *relo,
int relo_idx, void *insn)
{
bool need_cands = relo->kind != BPF_CORE_TYPE_ID_LOCAL;
struct bpf_core_cand_list cands = {};
struct bpf_core_spec *specs;
int err;
/* ~4k of temp memory necessary to convert LLVM spec like "0:1:0:5"
* into arrays of btf_ids of struct fields and array indices.
*/
specs = kcalloc(3, sizeof(*specs), GFP_KERNEL);
if (!specs)
return -ENOMEM;
if (need_cands) {
struct bpf_cand_cache *cc;
int i;
mutex_lock(&cand_cache_mutex);
cc = bpf_core_find_cands(ctx, relo->type_id);
if (IS_ERR(cc)) {
bpf_log(ctx->log, "target candidate search failed for %d\n",
relo->type_id);
err = PTR_ERR(cc);
goto out;
}
if (cc->cnt) {
cands.cands = kcalloc(cc->cnt, sizeof(*cands.cands), GFP_KERNEL);
if (!cands.cands) {
err = -ENOMEM;
goto out;
}
}
for (i = 0; i < cc->cnt; i++) {
bpf_log(ctx->log,
"CO-RE relocating %s %s: found target candidate [%d]\n",
btf_kind_str[cc->kind], cc->name, cc->cands[i].id);
cands.cands[i].btf = cc->cands[i].btf;
cands.cands[i].id = cc->cands[i].id;
}
cands.len = cc->cnt;
/* cand_cache_mutex needs to span the cache lookup and
* copy of btf pointer into bpf_core_cand_list,
* since module can be unloaded while bpf_core_apply_relo_insn
* is working with module's btf.
*/
}
err = bpf_core_apply_relo_insn((void *)ctx->log, insn, relo->insn_off / 8,
relo, relo_idx, ctx->btf, &cands, specs);
out:
kfree(specs);
if (need_cands) {
kfree(cands.cands);
mutex_unlock(&cand_cache_mutex);
if (ctx->log->level & BPF_LOG_LEVEL2)
print_cand_cache(ctx->log);
}
return err;
}

View File

@ -430,7 +430,7 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
* Exactly one of @prog or @link can be non-null.
* Must be called with cgroup_mutex held.
*/
int __cgroup_bpf_attach(struct cgroup *cgrp,
static int __cgroup_bpf_attach(struct cgroup *cgrp,
struct bpf_prog *prog, struct bpf_prog *replace_prog,
struct bpf_cgroup_link *link,
enum bpf_attach_type type, u32 flags)
@ -523,6 +523,20 @@ int __cgroup_bpf_attach(struct cgroup *cgrp,
return err;
}
static int cgroup_bpf_attach(struct cgroup *cgrp,
struct bpf_prog *prog, struct bpf_prog *replace_prog,
struct bpf_cgroup_link *link,
enum bpf_attach_type type,
u32 flags)
{
int ret;
mutex_lock(&cgroup_mutex);
ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
mutex_unlock(&cgroup_mutex);
return ret;
}
/* Swap updated BPF program for given link in effective program arrays across
* all descendant cgroups. This function is guaranteed to succeed.
*/
@ -672,13 +686,13 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
* propagate the change to descendants
* @cgrp: The cgroup which descendants to traverse
* @prog: A program to detach or NULL
* @prog: A link to detach or NULL
* @link: A link to detach or NULL
* @type: Type of detach operation
*
* At most one of @prog or @link can be non-NULL.
* Must be called with cgroup_mutex held.
*/
int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
struct bpf_cgroup_link *link, enum bpf_attach_type type)
{
enum cgroup_bpf_attach_type atype;
@ -730,8 +744,19 @@ int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
return err;
}
static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
enum bpf_attach_type type)
{
int ret;
mutex_lock(&cgroup_mutex);
ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
mutex_unlock(&cgroup_mutex);
return ret;
}
/* Must be called with cgroup_mutex held to avoid races. */
int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
__u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
@ -789,6 +814,17 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
return ret;
}
static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
int ret;
mutex_lock(&cgroup_mutex);
ret = __cgroup_bpf_query(cgrp, attr, uattr);
mutex_unlock(&cgroup_mutex);
return ret;
}
int cgroup_bpf_prog_attach(const union bpf_attr *attr,
enum bpf_prog_type ptype, struct bpf_prog *prog)
{
@ -1753,7 +1789,7 @@ static const struct bpf_func_proto bpf_sysctl_set_new_value_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_PTR_TO_MEM,
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
};

View File

@ -32,6 +32,7 @@
#include <linux/perf_event.h>
#include <linux/extable.h>
#include <linux/log2.h>
#include <linux/bpf_verifier.h>
#include <asm/barrier.h>
#include <asm/unaligned.h>
@ -389,6 +390,13 @@ static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, s32 end_old,
i = end_new;
insn = prog->insnsi + end_old;
}
if (bpf_pseudo_func(insn)) {
ret = bpf_adj_delta_to_imm(insn, pos, end_old,
end_new, i, probe_pass);
if (ret)
return ret;
continue;
}
code = insn->code;
if ((BPF_CLASS(code) != BPF_JMP &&
BPF_CLASS(code) != BPF_JMP32) ||
@ -1566,7 +1574,8 @@ static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn)
if (unlikely(index >= array->map.max_entries))
goto out;
if (unlikely(tail_call_cnt > MAX_TAIL_CALL_CNT))
if (unlikely(tail_call_cnt >= MAX_TAIL_CALL_CNT))
goto out;
tail_call_cnt++;
@ -1883,7 +1892,7 @@ static void bpf_prog_select_func(struct bpf_prog *fp)
/**
* bpf_prog_select_runtime - select exec runtime for BPF program
* @fp: bpf_prog populated with internal BPF program
* @fp: bpf_prog populated with BPF program
* @err: pointer to error variable
*
* Try to JIT eBPF program, if JIT is not available, use interpreter.
@ -2263,6 +2272,9 @@ static void bpf_prog_free_deferred(struct work_struct *work)
int i;
aux = container_of(work, struct bpf_prog_aux, work);
#ifdef CONFIG_BPF_SYSCALL
bpf_free_kfunc_btf_tab(aux->kfunc_btf_tab);
#endif
bpf_free_used_maps(aux);
bpf_free_used_btfs(aux);
if (bpf_prog_is_dev_bound(aux))
@ -2289,7 +2301,6 @@ static void bpf_prog_free_deferred(struct work_struct *work)
}
}
/* Free internal BPF program */
void bpf_prog_free(struct bpf_prog *fp)
{
struct bpf_prog_aux *aux = fp->aux;
@ -2365,6 +2376,11 @@ const struct bpf_func_proto * __weak bpf_get_trace_printk_proto(void)
return NULL;
}
const struct bpf_func_proto * __weak bpf_get_trace_vprintk_proto(void)
{
return NULL;
}
u64 __weak
bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size,
void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy)

View File

@ -195,7 +195,7 @@ static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
}
return;
default:
bpf_warn_invalid_xdp_action(act);
bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act);
fallthrough;
case XDP_ABORTED:
trace_xdp_exception(skb->dev, rcpu->prog, act);
@ -254,7 +254,7 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
}
break;
default:
bpf_warn_invalid_xdp_action(act);
bpf_warn_invalid_xdp_action(NULL, rcpu->prog, act);
fallthrough;
case XDP_DROP:
xdp_return_frame(xdpf);
@ -746,15 +746,9 @@ static void bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
list_add(&bq->flush_node, flush_list);
}
int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf,
struct net_device *dev_rx)
{
struct xdp_frame *xdpf;
xdpf = xdp_convert_buff_to_frame(xdp);
if (unlikely(!xdpf))
return -EOVERFLOW;
/* Info needed when constructing SKB on remote CPU */
xdpf->dev_rx = dev_rx;

View File

@ -348,7 +348,7 @@ static int dev_map_bpf_prog_run(struct bpf_prog *xdp_prog,
frames[nframes++] = xdpf;
break;
default:
bpf_warn_invalid_xdp_action(act);
bpf_warn_invalid_xdp_action(NULL, xdp_prog, act);
fallthrough;
case XDP_ABORTED:
trace_xdp_exception(dev, xdp_prog, act);
@ -467,24 +467,19 @@ static void bq_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
bq->q[bq->count++] = xdpf;
}
static inline int __xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
static inline int __xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
struct net_device *dev_rx,
struct bpf_prog *xdp_prog)
{
struct xdp_frame *xdpf;
int err;
if (!dev->netdev_ops->ndo_xdp_xmit)
return -EOPNOTSUPP;
err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data);
err = xdp_ok_fwd_dev(dev, xdpf->len);
if (unlikely(err))
return err;
xdpf = xdp_convert_buff_to_frame(xdp);
if (unlikely(!xdpf))
return -EOVERFLOW;
bq_enqueue(dev, xdpf, dev_rx, xdp_prog);
return 0;
}
@ -507,7 +502,7 @@ static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev
__skb_push(skb, skb->mac_len);
break;
default:
bpf_warn_invalid_xdp_action(act);
bpf_warn_invalid_xdp_action(NULL, dst->xdp_prog, act);
fallthrough;
case XDP_ABORTED:
trace_xdp_exception(dst->dev, dst->xdp_prog, act);
@ -520,27 +515,27 @@ static u32 dev_map_bpf_prog_run_skb(struct sk_buff *skb, struct bpf_dtab_netdev
return act;
}
int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
int dev_xdp_enqueue(struct net_device *dev, struct xdp_frame *xdpf,
struct net_device *dev_rx)
{
return __xdp_enqueue(dev, xdp, dev_rx, NULL);
return __xdp_enqueue(dev, xdpf, dev_rx, NULL);
}
int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_frame *xdpf,
struct net_device *dev_rx)
{
struct net_device *dev = dst->dev;
return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog);
return __xdp_enqueue(dev, xdpf, dev_rx, dst->xdp_prog);
}
static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_buff *xdp)
static bool is_valid_dst(struct bpf_dtab_netdev *obj, struct xdp_frame *xdpf)
{
if (!obj ||
!obj->dev->netdev_ops->ndo_xdp_xmit)
return false;
if (xdp_ok_fwd_dev(obj->dev, xdp->data_end - xdp->data))
if (xdp_ok_fwd_dev(obj->dev, xdpf->len))
return false;
return true;
@ -586,14 +581,13 @@ static int get_upper_ifindexes(struct net_device *dev, int *indexes)
return n;
}
int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
int dev_map_enqueue_multi(struct xdp_frame *xdpf, struct net_device *dev_rx,
struct bpf_map *map, bool exclude_ingress)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *dst, *last_dst = NULL;
int excluded_devices[1+MAX_NEST_DEV];
struct hlist_head *head;
struct xdp_frame *xdpf;
int num_excluded = 0;
unsigned int i;
int err;
@ -603,15 +597,11 @@ int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
excluded_devices[num_excluded++] = dev_rx->ifindex;
}
xdpf = xdp_convert_buff_to_frame(xdp);
if (unlikely(!xdpf))
return -EOVERFLOW;
if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
for (i = 0; i < map->max_entries; i++) {
dst = rcu_dereference_check(dtab->netdev_map[i],
rcu_read_lock_bh_held());
if (!is_valid_dst(dst, xdp))
if (!is_valid_dst(dst, xdpf))
continue;
if (is_ifindex_excluded(excluded_devices, num_excluded, dst->dev->ifindex))
@ -634,7 +624,7 @@ int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
head = dev_map_index_hash(dtab, i);
hlist_for_each_entry_rcu(dst, head, index_hlist,
lockdep_is_held(&dtab->index_lock)) {
if (!is_valid_dst(dst, xdp))
if (!is_valid_dst(dst, xdpf))
continue;
if (is_ifindex_excluded(excluded_devices, num_excluded,

View File

@ -668,7 +668,7 @@ static int htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
(void *(*)(struct bpf_map *map, void *key))NULL));
*insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1);
*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
offsetof(struct htab_elem, key) +
@ -709,7 +709,7 @@ static int htab_lru_map_gen_lookup(struct bpf_map *map,
BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
(void *(*)(struct bpf_map *map, void *key))NULL));
*insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 4);
*insn++ = BPF_LDX_MEM(BPF_B, ref_reg, ret,
offsetof(struct htab_elem, lru_node) +
@ -2049,7 +2049,7 @@ static const struct bpf_iter_seq_info iter_seq_info = {
.seq_priv_size = sizeof(struct bpf_iter_seq_hash_map_info),
};
static int bpf_for_each_hash_elem(struct bpf_map *map, void *callback_fn,
static int bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
void *callback_ctx, u64 flags)
{
struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
@ -2089,9 +2089,8 @@ static int bpf_for_each_hash_elem(struct bpf_map *map, void *callback_fn,
val = elem->key + roundup_key_size;
}
num_elems++;
ret = BPF_CAST_CALL(callback_fn)((u64)(long)map,
(u64)(long)key, (u64)(long)val,
(u64)(long)callback_ctx, 0);
ret = callback_fn((u64)(long)map, (u64)(long)key,
(u64)(long)val, (u64)(long)callback_ctx, 0);
/* return value: 0 - continue, 1 - stop and return */
if (ret) {
rcu_read_unlock();
@ -2397,7 +2396,7 @@ static int htab_of_map_gen_lookup(struct bpf_map *map,
BUILD_BUG_ON(!__same_type(&__htab_map_lookup_elem,
(void *(*)(struct bpf_map *map, void *key))NULL));
*insn++ = BPF_EMIT_CALL(BPF_CAST_CALL(__htab_map_lookup_elem));
*insn++ = BPF_EMIT_CALL(__htab_map_lookup_elem);
*insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 2);
*insn++ = BPF_ALU64_IMM(BPF_ADD, ret,
offsetof(struct htab_elem, key) +

View File

@ -2,6 +2,8 @@
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
*/
#include <linux/bpf.h>
#include <linux/btf.h>
#include <linux/bpf-cgroup.h>
#include <linux/rcupdate.h>
#include <linux/random.h>
#include <linux/smp.h>
@ -530,7 +532,7 @@ const struct bpf_func_proto bpf_strtol_proto = {
.func = bpf_strtol,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_MEM,
.arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_LONG,
@ -558,13 +560,27 @@ const struct bpf_func_proto bpf_strtoul_proto = {
.func = bpf_strtoul,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_MEM,
.arg1_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_LONG,
};
#endif
BPF_CALL_3(bpf_strncmp, const char *, s1, u32, s1_sz, const char *, s2)
{
return strncmp(s1, s2, s1_sz);
}
const struct bpf_func_proto bpf_strncmp_proto = {
.func = bpf_strncmp,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_MEM,
.arg2_type = ARG_CONST_SIZE,
.arg3_type = ARG_PTR_TO_CONST_STR,
};
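bpf_strncmp() gives programs a bounded string compare without an open-coded loop. Note the third argument is ARG_PTR_TO_CONST_STR, so it must be a NUL-terminated constant string known to the verifier. A hedged BPF-program-side sketch (helper wrapper names as generated into libbpf's helper definitions are assumed), not part of this change:

    char comm[16];

    bpf_get_current_comm(comm, sizeof(comm));
    /* same sign convention as strncmp(comm, "systemd", sizeof(comm)) */
    if (bpf_strncmp(comm, sizeof(comm), "systemd") == 0)
        return 0;       /* e.g. skip events generated by systemd */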
BPF_CALL_4(bpf_get_ns_current_pid_tgid, u64, dev, u64, ino,
struct bpf_pidns_info *, nsdata, u32, size)
{
@ -630,7 +646,7 @@ const struct bpf_func_proto bpf_event_output_data_proto = {
.arg1_type = ARG_PTR_TO_CTX,
.arg2_type = ARG_CONST_MAP_PTR,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_MEM,
.arg4_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
@ -667,7 +683,7 @@ BPF_CALL_2(bpf_per_cpu_ptr, const void *, ptr, u32, cpu)
const struct bpf_func_proto bpf_per_cpu_ptr_proto = {
.func = bpf_per_cpu_ptr,
.gpl_only = false,
.ret_type = RET_PTR_TO_MEM_OR_BTF_ID_OR_NULL,
.ret_type = RET_PTR_TO_MEM_OR_BTF_ID | PTR_MAYBE_NULL | MEM_RDONLY,
.arg1_type = ARG_PTR_TO_PERCPU_BTF_ID,
.arg2_type = ARG_ANYTHING,
};
@ -680,7 +696,7 @@ BPF_CALL_1(bpf_this_cpu_ptr, const void *, percpu_ptr)
const struct bpf_func_proto bpf_this_cpu_ptr_proto = {
.func = bpf_this_cpu_ptr,
.gpl_only = false,
.ret_type = RET_PTR_TO_MEM_OR_BTF_ID,
.ret_type = RET_PTR_TO_MEM_OR_BTF_ID | MEM_RDONLY,
.arg1_type = ARG_PTR_TO_PERCPU_BTF_ID,
};
@ -979,15 +995,13 @@ int bpf_bprintf_prepare(char *fmt, u32 fmt_size, const u64 *raw_args,
return err;
}
#define MAX_SNPRINTF_VARARGS 12
BPF_CALL_5(bpf_snprintf, char *, str, u32, str_size, char *, fmt,
const void *, data, u32, data_len)
{
int err, num_args;
u32 *bin_args;
if (data_len % 8 || data_len > MAX_SNPRINTF_VARARGS * 8 ||
if (data_len % 8 || data_len > MAX_BPRINTF_VARARGS * 8 ||
(data_len && !data))
return -EINVAL;
num_args = data_len / 8;
@ -1013,7 +1027,7 @@ const struct bpf_func_proto bpf_snprintf_proto = {
.arg1_type = ARG_PTR_TO_MEM_OR_NULL,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_PTR_TO_CONST_STR,
.arg4_type = ARG_PTR_TO_MEM_OR_NULL,
.arg4_type = ARG_PTR_TO_MEM | PTR_MAYBE_NULL | MEM_RDONLY,
.arg5_type = ARG_CONST_SIZE_OR_ZERO,
};
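For reference, bpf_snprintf() takes its variadic data as an array of u64 words whose byte length must be a multiple of 8 and no larger than MAX_BPRINTF_VARARGS * 8 (the removed MAX_SNPRINTF_VARARGS was 12), and the format string is an ARG_PTR_TO_CONST_STR. A hedged BPF-program-side sketch, not part of this change:

    SEC("tp/syscalls/sys_enter_execve")
    int log_exec(void *ctx)
    {
        char out[64];
        char comm[16];
        __u64 args[2];

        bpf_get_current_comm(comm, sizeof(comm));
        args[0] = bpf_get_current_pid_tgid() >> 32;
        args[1] = (__u64)(long)comm;

        /* returns the length the fully formatted string would need */
        bpf_snprintf(out, sizeof(out), "pid=%d comm=%s", args, sizeof(args));
        return 0;
    }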
@ -1058,10 +1072,11 @@ static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
struct bpf_hrtimer *t = container_of(hrtimer, struct bpf_hrtimer, timer);
struct bpf_map *map = t->map;
void *value = t->value;
void *callback_fn;
bpf_callback_t callback_fn;
void *key;
u32 idx;
BTF_TYPE_EMIT(struct bpf_timer);
callback_fn = rcu_dereference_check(t->callback_fn, rcu_read_lock_bh_held());
if (!callback_fn)
goto out;
@ -1083,8 +1098,7 @@ static enum hrtimer_restart bpf_timer_cb(struct hrtimer *hrtimer)
key = value - round_up(map->key_size, 8);
}
BPF_CAST_CALL(callback_fn)((u64)(long)map, (u64)(long)key,
(u64)(long)value, 0, 0);
callback_fn((u64)(long)map, (u64)(long)key, (u64)(long)value, 0, 0);
/* The verifier checked that return value is zero. */
this_cpu_write(hrtimer_running, NULL);
@ -1379,6 +1393,10 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_ringbuf_query_proto;
case BPF_FUNC_for_each_map_elem:
return &bpf_for_each_map_elem_proto;
case BPF_FUNC_loop:
return &bpf_loop_proto;
case BPF_FUNC_strncmp:
return &bpf_strncmp_proto;
default:
break;
}
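bpf_loop(), wired up here next to bpf_strncmp, runs a callback up to nr_loops times so programs no longer have to unroll large loops for the verifier; the callback returns 0 to continue and a non-zero value to stop early. A hedged BPF-program-side sketch (SEC() and the helper wrapper from libbpf are assumed), not part of this change:

    struct loop_ctx { long sum; };

    static long add_idx(__u32 idx, void *data)
    {
        struct loop_ctx *c = data;

        c->sum += idx;
        return 0;                       /* 0 = continue, nonzero = stop early */
    }

    SEC("tp/syscalls/sys_enter_getpid")
    int sum_indices(void *ctx)
    {
        struct loop_ctx c = {};

        /* runs add_idx() up to 100 times; returns iterations performed */
        bpf_loop(100, add_idx, &c, 0);
        return (int)c.sum;
    }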
@ -1435,6 +1453,8 @@ bpf_base_func_proto(enum bpf_func_id func_id)
return &bpf_snprintf_proto;
case BPF_FUNC_task_pt_regs:
return &bpf_task_pt_regs_proto;
case BPF_FUNC_trace_vprintk:
return bpf_get_trace_vprintk_proto();
default:
return NULL;
}

View File

@ -163,8 +163,7 @@ static int cgroup_storage_update_elem(struct bpf_map *map, void *key,
return 0;
}
new = bpf_map_kmalloc_node(map, sizeof(struct bpf_storage_buffer) +
map->value_size,
new = bpf_map_kmalloc_node(map, struct_size(new, data, map->value_size),
__GFP_ZERO | GFP_ATOMIC | __GFP_NOWARN,
map->numa_node);
if (!new)

View File

@ -174,9 +174,9 @@ static const struct bpf_iter_reg bpf_map_elem_reg_info = {
.ctx_arg_info_size = 2,
.ctx_arg_info = {
{ offsetof(struct bpf_iter__bpf_map_elem, key),
PTR_TO_RDONLY_BUF_OR_NULL },
PTR_TO_BUF | PTR_MAYBE_NULL | MEM_RDONLY },
{ offsetof(struct bpf_iter__bpf_map_elem, value),
PTR_TO_RDWR_BUF_OR_NULL },
PTR_TO_BUF | PTR_MAYBE_NULL },
},
};

View File

@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <linux/bpf-netns.h>
#include <linux/filter.h>
#include <net/net_namespace.h>

View File

@ -1,4 +1,2 @@
/FEATURE-DUMP.libbpf
/bpf_helper_defs.h
/feature
/libbpf
/bpf_preload_umd

View File

@ -1,21 +1,35 @@
# SPDX-License-Identifier: GPL-2.0
LIBBPF_SRCS = $(srctree)/tools/lib/bpf/
LIBBPF_A = $(obj)/libbpf.a
LIBBPF_OUT = $(abspath $(obj))
LIBBPF_OUT = $(abspath $(obj))/libbpf
LIBBPF_A = $(LIBBPF_OUT)/libbpf.a
LIBBPF_DESTDIR = $(LIBBPF_OUT)
LIBBPF_INCLUDE = $(LIBBPF_DESTDIR)/include
# Although not in use by libbpf's Makefile, set $(O) so that the "dummy" test
# in tools/scripts/Makefile.include always succeeds when building the kernel
# with $(O) pointing to a relative path, as in "make O=build bindeb-pkg".
$(LIBBPF_A):
$(Q)$(MAKE) -C $(LIBBPF_SRCS) O=$(LIBBPF_OUT)/ OUTPUT=$(LIBBPF_OUT)/ $(LIBBPF_OUT)/libbpf.a
$(LIBBPF_A): | $(LIBBPF_OUT)
$(Q)$(MAKE) -C $(LIBBPF_SRCS) O=$(LIBBPF_OUT)/ OUTPUT=$(LIBBPF_OUT)/ \
DESTDIR=$(LIBBPF_DESTDIR) prefix= \
$(LIBBPF_OUT)/libbpf.a install_headers
libbpf_hdrs: $(LIBBPF_A)
.PHONY: libbpf_hdrs
$(LIBBPF_OUT):
$(call msg,MKDIR,$@)
$(Q)mkdir -p $@
userccflags += -I $(srctree)/tools/include/ -I $(srctree)/tools/include/uapi \
-I $(srctree)/tools/lib/ -Wno-unused-result
-I $(LIBBPF_INCLUDE) -Wno-unused-result
userprogs := bpf_preload_umd
clean-files := $(userprogs) bpf_helper_defs.h FEATURE-DUMP.libbpf staticobjs/ feature/
clean-files := libbpf/
$(obj)/iterators/iterators.o: | libbpf_hdrs
bpf_preload_umd-objs := iterators/iterators.o
bpf_preload_umd-userldlibs := $(LIBBPF_A) -lelf -lz

View File

@ -1,18 +1,26 @@
# SPDX-License-Identifier: GPL-2.0
OUTPUT := .output
abs_out := $(abspath $(OUTPUT))
CLANG ?= clang
LLC ?= llc
LLVM_STRIP ?= llvm-strip
TOOLS_PATH := $(abspath ../../../../tools)
BPFTOOL_SRC := $(TOOLS_PATH)/bpf/bpftool
BPFTOOL_OUTPUT := $(abs_out)/bpftool
DEFAULT_BPFTOOL := $(OUTPUT)/sbin/bpftool
BPFTOOL ?= $(DEFAULT_BPFTOOL)
LIBBPF_SRC := $(abspath ../../../../tools/lib/bpf)
BPFOBJ := $(OUTPUT)/libbpf.a
BPF_INCLUDE := $(OUTPUT)
INCLUDES := -I$(OUTPUT) -I$(BPF_INCLUDE) -I$(abspath ../../../../tools/lib) \
-I$(abspath ../../../../tools/include/uapi)
LIBBPF_SRC := $(TOOLS_PATH)/lib/bpf
LIBBPF_OUTPUT := $(abs_out)/libbpf
LIBBPF_DESTDIR := $(LIBBPF_OUTPUT)
LIBBPF_INCLUDE := $(LIBBPF_DESTDIR)/include
BPFOBJ := $(LIBBPF_OUTPUT)/libbpf.a
INCLUDES := -I$(OUTPUT) -I$(LIBBPF_INCLUDE) -I$(TOOLS_PATH)/include/uapi
CFLAGS := -g -Wall
abs_out := $(abspath $(OUTPUT))
ifeq ($(V),1)
Q =
msg =
@ -44,14 +52,18 @@ $(OUTPUT)/iterators.bpf.o: iterators.bpf.c $(BPFOBJ) | $(OUTPUT)
-c $(filter %.c,$^) -o $@ && \
$(LLVM_STRIP) -g $@
$(OUTPUT):
$(OUTPUT) $(LIBBPF_OUTPUT) $(BPFTOOL_OUTPUT):
$(call msg,MKDIR,$@)
$(Q)mkdir -p $(OUTPUT)
$(Q)mkdir -p $@
$(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(OUTPUT)
$(BPFOBJ): $(wildcard $(LIBBPF_SRC)/*.[ch] $(LIBBPF_SRC)/Makefile) | $(LIBBPF_OUTPUT)
$(Q)$(MAKE) $(submake_extras) -C $(LIBBPF_SRC) \
OUTPUT=$(abspath $(dir $@))/ $(abspath $@)
OUTPUT=$(abspath $(dir $@))/ prefix= \
DESTDIR=$(LIBBPF_DESTDIR) $(abspath $@) install_headers
$(DEFAULT_BPFTOOL):
$(Q)$(MAKE) $(submake_extras) -C ../../../../tools/bpf/bpftool \
prefix= OUTPUT=$(abs_out)/ DESTDIR=$(abs_out) install
$(DEFAULT_BPFTOOL): $(BPFOBJ) | $(BPFTOOL_OUTPUT)
$(Q)$(MAKE) $(submake_extras) -C $(BPFTOOL_SRC) \
OUTPUT=$(BPFTOOL_OUTPUT)/ \
LIBBPF_OUTPUT=$(LIBBPF_OUTPUT)/ \
LIBBPF_DESTDIR=$(LIBBPF_DESTDIR)/ \
prefix= DESTDIR=$(abs_out)/ install-bin

View File

@ -152,16 +152,12 @@ static struct bpf_map *reuseport_array_alloc(union bpf_attr *attr)
{
int numa_node = bpf_map_attr_numa_node(attr);
struct reuseport_array *array;
u64 array_size;
if (!bpf_capable())
return ERR_PTR(-EPERM);
array_size = sizeof(*array);
array_size += (u64)attr->max_entries * sizeof(struct sock *);
/* allocate all map elements and zero-initialize them */
array = bpf_map_area_alloc(array_size, numa_node);
array = bpf_map_area_alloc(struct_size(array, ptrs, attr->max_entries), numa_node);
if (!array)
return ERR_PTR(-ENOMEM);
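This hunk, like the cgroup local-storage one above, replaces open-coded allocation-size math with struct_size(), which computes the size of a structure plus its trailing flexible array and saturates instead of wrapping on overflow. Roughly, as an illustration of the semantics only:

    /* struct reuseport_array ends in: struct sock __rcu *ptrs[]; */
    /* struct_size(array, ptrs, n) behaves like                   */
    size_t sz = sizeof(*array) + (size_t)n * sizeof(array->ptrs[0]);
    /* except that an overflowing multiply/add yields SIZE_MAX, so  */
    /* the following allocation fails rather than being undersized. */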

View File

@ -444,7 +444,7 @@ const struct bpf_func_proto bpf_ringbuf_output_proto = {
.func = bpf_ringbuf_output,
.ret_type = RET_INTEGER,
.arg1_type = ARG_CONST_MAP_PTR,
.arg2_type = ARG_PTR_TO_MEM,
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
};

View File

@ -7,10 +7,10 @@
#include <linux/kernel.h>
#include <linux/stacktrace.h>
#include <linux/perf_event.h>
#include <linux/irq_work.h>
#include <linux/btf_ids.h>
#include <linux/buildid.h>
#include "percpu_freelist.h"
#include "mmap_unlock_work.h"
#define STACK_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY | \
@ -31,25 +31,6 @@ struct bpf_stack_map {
struct stack_map_bucket *buckets[];
};
/* irq_work to run up_read() for build_id lookup in nmi context */
struct stack_map_irq_work {
struct irq_work irq_work;
struct mm_struct *mm;
};
static void do_up_read(struct irq_work *entry)
{
struct stack_map_irq_work *work;
if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
return;
work = container_of(entry, struct stack_map_irq_work, irq_work);
mmap_read_unlock_non_owner(work->mm);
}
static DEFINE_PER_CPU(struct stack_map_irq_work, up_read_work);
static inline bool stack_map_use_build_id(struct bpf_map *map)
{
return (map->map_flags & BPF_F_STACK_BUILD_ID);
@ -149,35 +130,13 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
u64 *ips, u32 trace_nr, bool user)
{
int i;
struct mmap_unlock_irq_work *work = NULL;
bool irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);
struct vm_area_struct *vma;
bool irq_work_busy = false;
struct stack_map_irq_work *work = NULL;
if (irqs_disabled()) {
if (!IS_ENABLED(CONFIG_PREEMPT_RT)) {
work = this_cpu_ptr(&up_read_work);
if (irq_work_is_busy(&work->irq_work)) {
/* cannot queue more up_read, fallback */
irq_work_busy = true;
}
} else {
/*
* PREEMPT_RT does not allow to trylock mmap sem in
* interrupt disabled context. Force the fallback code.
*/
irq_work_busy = true;
}
}
/*
* We cannot do up_read() when the irq is disabled, because of
* risk to deadlock with rq_lock. To do build_id lookup when the
* irqs are disabled, we need to run up_read() in irq_work. We use
* a percpu variable to do the irq_work. If the irq_work is
* already used by another lookup, we fall back to report ips.
*
* Same fallback is used for kernel stack (!user) on a stackmap
* with build_id.
/* If the irq_work is in use, fall back to report ips. Same
* fallback is used for kernel stack (!user) on a stackmap with
* build_id.
*/
if (!user || !current || !current->mm || irq_work_busy ||
!mmap_read_trylock(current->mm)) {
@ -203,19 +162,7 @@ static void stack_map_get_build_id_offset(struct bpf_stack_build_id *id_offs,
- vma->vm_start;
id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
}
if (!work) {
mmap_read_unlock(current->mm);
} else {
work->mm = current->mm;
/* The lock will be released once we're out of interrupt
* context. Tell lockdep that we've released it now so
* it doesn't complain that we forgot to release it.
*/
rwsem_release(&current->mm->mmap_lock.dep_map, _RET_IP_);
irq_work_queue(&work->irq_work);
}
bpf_mmap_unlock_mm(work, current->mm);
}
static struct perf_callchain_entry *
@ -543,7 +490,7 @@ const struct bpf_func_proto bpf_get_task_stack_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
.arg1_btf_id = &btf_task_struct_ids[0],
.arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
.arg2_type = ARG_PTR_TO_UNINIT_MEM,
.arg3_type = ARG_CONST_SIZE_OR_ZERO,
.arg4_type = ARG_ANYTHING,
@ -720,16 +667,3 @@ const struct bpf_map_ops stack_trace_map_ops = {
.map_btf_name = "bpf_stack_map",
.map_btf_id = &stack_trace_map_btf_id,
};
static int __init stack_map_init(void)
{
int cpu;
struct stack_map_irq_work *work;
for_each_possible_cpu(cpu) {
work = per_cpu_ptr(&up_read_work, cpu);
init_irq_work(&work->irq_work, do_up_read);
}
return 0;
}
subsys_initcall(stack_map_init);

View File

@ -2,6 +2,7 @@
/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
*/
#include <linux/bpf.h>
#include <linux/bpf-cgroup.h>
#include <linux/bpf_trace.h>
#include <linux/bpf_lirc.h>
#include <linux/bpf_verifier.h>
@ -214,7 +215,8 @@ static int bpf_map_update_value(struct bpf_map *map, struct fd f, void *key,
err = bpf_fd_reuseport_array_update_elem(map, key, value,
flags);
} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
map->map_type == BPF_MAP_TYPE_STACK) {
map->map_type == BPF_MAP_TYPE_STACK ||
map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
err = map->ops->map_push_elem(map, value, flags);
} else {
rcu_read_lock();
@ -253,7 +255,8 @@ static int bpf_map_copy_value(struct bpf_map *map, void *key, void *value,
} else if (map->map_type == BPF_MAP_TYPE_REUSEPORT_SOCKARRAY) {
err = bpf_fd_reuseport_array_lookup_elem(map, key, value);
} else if (map->map_type == BPF_MAP_TYPE_QUEUE ||
map->map_type == BPF_MAP_TYPE_STACK) {
map->map_type == BPF_MAP_TYPE_STACK ||
map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
err = map->ops->map_peek_elem(map, value);
} else if (map->map_type == BPF_MAP_TYPE_STRUCT_OPS) {
/* struct_ops map requires directly updating "value" */
@ -363,6 +366,7 @@ void bpf_map_init_from_attr(struct bpf_map *map, union bpf_attr *attr)
map->max_entries = attr->max_entries;
map->map_flags = bpf_map_flags_retain_permanent(attr->map_flags);
map->numa_node = bpf_map_attr_numa_node(attr);
map->map_extra = attr->map_extra;
}
static int bpf_map_alloc_id(struct bpf_map *map)
@ -570,6 +574,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
"value_size:\t%u\n"
"max_entries:\t%u\n"
"map_flags:\t%#x\n"
"map_extra:\t%#llx\n"
"memlock:\t%lu\n"
"map_id:\t%u\n"
"frozen:\t%u\n",
@ -578,6 +583,7 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
map->value_size,
map->max_entries,
map->map_flags,
(unsigned long long)map->map_extra,
bpf_map_memory_footprint(map),
map->id,
READ_ONCE(map->frozen));
@ -821,7 +827,7 @@ static int map_check_btf(struct bpf_map *map, const struct btf *btf,
return ret;
}
#define BPF_MAP_CREATE_LAST_FIELD btf_vmlinux_value_type_id
#define BPF_MAP_CREATE_LAST_FIELD map_extra
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
@ -842,6 +848,10 @@ static int map_create(union bpf_attr *attr)
return -EINVAL;
}
if (attr->map_type != BPF_MAP_TYPE_BLOOM_FILTER &&
attr->map_extra != 0)
return -EINVAL;
f_flags = bpf_get_file_flag(attr->map_flags);
if (f_flags < 0)
return f_flags;
@ -1091,6 +1101,14 @@ static int map_lookup_elem(union bpf_attr *attr)
if (!value)
goto free_key;
if (map->map_type == BPF_MAP_TYPE_BLOOM_FILTER) {
if (copy_from_user(value, uvalue, value_size))
err = -EFAULT;
else
err = bpf_map_copy_value(map, key, value, attr->flags);
goto free_value;
}
err = bpf_map_copy_value(map, key, value, attr->flags);
if (err)
goto free_value;
@ -1874,7 +1892,8 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
"prog_id:\t%u\n"
"run_time_ns:\t%llu\n"
"run_cnt:\t%llu\n"
"recursion_misses:\t%llu\n",
"recursion_misses:\t%llu\n"
"verified_insns:\t%u\n",
prog->type,
prog->jited,
prog_tag,
@ -1882,7 +1901,8 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
prog->aux->id,
stats.nsecs,
stats.cnt,
stats.misses);
stats.misses,
prog->aux->verified_insns);
}
#endif
@ -2182,7 +2202,7 @@ static bool is_perfmon_prog_type(enum bpf_prog_type prog_type)
}
/* last field in 'union bpf_attr' used by this command */
#define BPF_PROG_LOAD_LAST_FIELD fd_array
#define BPF_PROG_LOAD_LAST_FIELD core_relo_rec_size
static int bpf_prog_load(union bpf_attr *attr, bpfptr_t uattr)
{
@ -3651,6 +3671,8 @@ static int bpf_prog_get_info_by_fd(struct file *file,
info.run_cnt = stats.cnt;
info.recursion_misses = stats.misses;
info.verified_insns = prog->aux->verified_insns;
if (!bpf_capable()) {
info.jited_prog_len = 0;
info.xlated_prog_len = 0;
@ -3897,6 +3919,7 @@ static int bpf_map_get_info_by_fd(struct file *file,
info.value_size = map->value_size;
info.max_entries = map->max_entries;
info.map_flags = map->map_flags;
info.map_extra = map->map_extra;
memcpy(info.name, map->name, sizeof(map->name));
if (map->btf) {
@ -4753,7 +4776,7 @@ static const struct bpf_func_proto bpf_sys_bpf_proto = {
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_ANYTHING,
.arg2_type = ARG_PTR_TO_MEM,
.arg2_type = ARG_PTR_TO_MEM | MEM_RDONLY,
.arg3_type = ARG_CONST_SIZE,
};
@ -4780,6 +4803,31 @@ static const struct bpf_func_proto bpf_sys_close_proto = {
.arg1_type = ARG_ANYTHING,
};
BPF_CALL_4(bpf_kallsyms_lookup_name, const char *, name, int, name_sz, int, flags, u64 *, res)
{
if (flags)
return -EINVAL;
if (name_sz <= 1 || name[name_sz - 1])
return -EINVAL;
if (!bpf_dump_raw_ok(current_cred()))
return -EPERM;
*res = kallsyms_lookup_name(name);
return *res ? 0 : -ENOENT;
}
const struct bpf_func_proto bpf_kallsyms_lookup_name_proto = {
.func = bpf_kallsyms_lookup_name,
.gpl_only = false,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_MEM,
.arg2_type = ARG_CONST_SIZE_OR_ZERO,
.arg3_type = ARG_ANYTHING,
.arg4_type = ARG_PTR_TO_LONG,
};
static const struct bpf_func_proto *
syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
{
@ -4790,6 +4838,8 @@ syscall_prog_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_btf_find_by_name_kind_proto;
case BPF_FUNC_sys_close:
return &bpf_sys_close_proto;
case BPF_FUNC_kallsyms_lookup_name:
return &bpf_kallsyms_lookup_name_proto;
default:
return tracing_prog_func_proto(func_id, prog);
}
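The hunks above add a bpf_kallsyms_lookup_name() helper and expose it to BPF_PROG_TYPE_SYSCALL programs via syscall_prog_func_proto(). As a rough sketch only (the section name, the global result variable and the libbpf bpf_helpers.h declaration are illustrative assumptions, not part of this commit), a syscall program could resolve a kernel symbol address like this:

/* Hypothetical BPF syscall program using the new helper. */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

__u64 sym_addr;	/* read back by the loader after the program has run */

SEC("syscall")
int resolve_symbol(void *ctx)
{
	__u64 addr = 0;
	long err;

	/* name_sz must include the trailing NUL, per the kernel-side check. */
	err = bpf_kallsyms_lookup_name("jiffies", sizeof("jiffies"), 0, &addr);
	if (err)
		return 1;

	sym_addr = addr;
	return 0;
}

char LICENSE[] SEC("license") = "GPL";

Note that the helper also requires bpf_dump_raw_ok() on the caller's credentials, so the loading process needs sufficient privileges for the lookup to succeed.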

View File

@ -8,6 +8,7 @@
#include <linux/fdtable.h>
#include <linux/filter.h>
#include <linux/btf_ids.h>
#include "mmap_unlock_work.h"
struct bpf_iter_seq_task_common {
struct pid_namespace *ns;
@ -524,10 +525,6 @@ static const struct seq_operations task_vma_seq_ops = {
.show = task_vma_seq_show,
};
BTF_ID_LIST(btf_task_file_ids)
BTF_ID(struct, file)
BTF_ID(struct, vm_area_struct)
static const struct bpf_iter_seq_info task_seq_info = {
.seq_ops = &task_seq_ops,
.init_seq_private = init_seq_pidns,
@ -586,23 +583,88 @@ static struct bpf_iter_reg task_vma_reg_info = {
.seq_info = &task_vma_seq_info,
};
BPF_CALL_5(bpf_find_vma, struct task_struct *, task, u64, start,
bpf_callback_t, callback_fn, void *, callback_ctx, u64, flags)
{
struct mmap_unlock_irq_work *work = NULL;
struct vm_area_struct *vma;
bool irq_work_busy = false;
struct mm_struct *mm;
int ret = -ENOENT;
if (flags)
return -EINVAL;
if (!task)
return -ENOENT;
mm = task->mm;
if (!mm)
return -ENOENT;
irq_work_busy = bpf_mmap_unlock_get_irq_work(&work);
if (irq_work_busy || !mmap_read_trylock(mm))
return -EBUSY;
vma = find_vma(mm, start);
if (vma && vma->vm_start <= start && vma->vm_end > start) {
callback_fn((u64)(long)task, (u64)(long)vma,
(u64)(long)callback_ctx, 0, 0);
ret = 0;
}
bpf_mmap_unlock_mm(work, mm);
return ret;
}
const struct bpf_func_proto bpf_find_vma_proto = {
.func = bpf_find_vma,
.ret_type = RET_INTEGER,
.arg1_type = ARG_PTR_TO_BTF_ID,
.arg1_btf_id = &btf_tracing_ids[BTF_TRACING_TYPE_TASK],
.arg2_type = ARG_ANYTHING,
.arg3_type = ARG_PTR_TO_FUNC,
.arg4_type = ARG_PTR_TO_STACK_OR_NULL,
.arg5_type = ARG_ANYTHING,
};
DEFINE_PER_CPU(struct mmap_unlock_irq_work, mmap_unlock_work);
static void do_mmap_read_unlock(struct irq_work *entry)
{
struct mmap_unlock_irq_work *work;
if (WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_RT)))
return;
work = container_of(entry, struct mmap_unlock_irq_work, irq_work);
mmap_read_unlock_non_owner(work->mm);
}
static int __init task_iter_init(void)
{
int ret;
struct mmap_unlock_irq_work *work;
int ret, cpu;
task_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0];
for_each_possible_cpu(cpu) {
work = per_cpu_ptr(&mmap_unlock_work, cpu);
init_irq_work(&work->irq_work, do_mmap_read_unlock);
}
task_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
ret = bpf_iter_reg_target(&task_reg_info);
if (ret)
return ret;
task_file_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0];
task_file_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[0];
task_file_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
task_file_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_FILE];
ret = bpf_iter_reg_target(&task_file_reg_info);
if (ret)
return ret;
task_vma_reg_info.ctx_arg_info[0].btf_id = btf_task_struct_ids[0];
task_vma_reg_info.ctx_arg_info[1].btf_id = btf_task_file_ids[1];
task_vma_reg_info.ctx_arg_info[0].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK];
task_vma_reg_info.ctx_arg_info[1].btf_id = btf_tracing_ids[BTF_TRACING_TYPE_VMA];
return bpf_iter_reg_target(&task_vma_reg_info);
}
late_initcall(task_iter_init);
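bpf_find_vma() above looks up the VMA covering an address in a task's mm and invokes a callback on it while mmap_read_trylock() is held, using the new per-CPU irq_work for the deferred unlock. A minimal tracing-program sketch follows; the attach point, the callback body and the libbpf declarations are illustrative assumptions rather than part of this commit:

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

struct cb_data {
	__u64 vm_start;
	__u64 vm_end;
};

/* Callback run by bpf_find_vma() while the mmap lock is read-held. */
static long record_vma(struct task_struct *task, struct vm_area_struct *vma,
		       void *data)
{
	struct cb_data *cbd = data;

	cbd->vm_start = vma->vm_start;
	cbd->vm_end = vma->vm_end;
	return 0;
}

SEC("fentry/__x64_sys_getpid")	/* hypothetical attach point */
int BPF_PROG(find_vma_example)
{
	struct task_struct *task = bpf_get_current_task_btf();
	struct cb_data cbd = {};

	/* flags must be 0; returns -ENOENT when no VMA covers the address. */
	bpf_find_vma(task, 0x7fffffffe000ULL, record_vma, &cbd, 0);
	return 0;
}

char LICENSE[] SEC("license") = "GPL";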

View File

@ -10,6 +10,7 @@
#include <linux/rcupdate_trace.h>
#include <linux/rcupdate_wait.h>
#include <linux/module.h>
#include <linux/static_call.h>
/* dummy _ops. The verifier will operate on target program's ops. */
const struct bpf_verifier_ops bpf_extension_verifier_ops = {
@ -26,6 +27,14 @@ static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];
/* serializes access to trampoline_table */
static DEFINE_MUTEX(trampoline_mutex);
bool bpf_prog_has_trampoline(const struct bpf_prog *prog)
{
enum bpf_attach_type eatype = prog->expected_attach_type;
return eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT ||
eatype == BPF_MODIFY_RETURN;
}
void *bpf_jit_alloc_exec_page(void)
{
void *image;
@ -526,7 +535,7 @@ void bpf_trampoline_put(struct bpf_trampoline *tr)
}
#define NO_START_TIME 1
static u64 notrace bpf_prog_start_time(void)
static __always_inline u64 notrace bpf_prog_start_time(void)
{
u64 start = NO_START_TIME;

File diff suppressed because it is too large

View File

@ -63,9 +63,6 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
for_each_root(root) {
struct cgroup *from_cgrp;
if (root == &cgrp_dfl_root)
continue;
spin_lock_irq(&css_set_lock);
from_cgrp = task_cgroup_from_root(from, root);
spin_unlock_irq(&css_set_lock);
@ -675,11 +672,9 @@ int proc_cgroupstats_show(struct seq_file *m, void *v)
seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
/*
* ideally we don't want subsystems moving around while we do this.
* cgroup_mutex is also necessary to guarantee an atomic snapshot of
* subsys/hierarchy state.
* Grab the subsystems state racily. No need to add avenue to
* cgroup_mutex contention.
*/
mutex_lock(&cgroup_mutex);
for_each_subsys(ss, i)
seq_printf(m, "%s\t%d\t%d\t%d\n",
@ -687,7 +682,6 @@ int proc_cgroupstats_show(struct seq_file *m, void *v)
atomic_read(&ss->root->nr_cgrps),
cgroup_ssid_enabled(i));
mutex_unlock(&cgroup_mutex);
return 0;
}
@ -714,8 +708,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
kernfs_type(kn) != KERNFS_DIR)
return -EINVAL;
mutex_lock(&cgroup_mutex);
/*
* We aren't being called from kernfs and there's no guarantee on
* @kn->priv's validity. For this and css_tryget_online_from_dir(),
@ -723,9 +715,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
*/
rcu_read_lock();
cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
if (!cgrp || cgroup_is_dead(cgrp)) {
if (!cgrp || !cgroup_tryget(cgrp)) {
rcu_read_unlock();
mutex_unlock(&cgroup_mutex);
return -ENOENT;
}
rcu_read_unlock();
@ -753,7 +744,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
}
css_task_iter_end(&it);
mutex_unlock(&cgroup_mutex);
cgroup_put(cgrp);
return 0;
}

View File

@ -30,6 +30,7 @@
#include "cgroup-internal.h"
#include <linux/bpf-cgroup.h>
#include <linux/cred.h>
#include <linux/errno.h>
#include <linux/init_task.h>
@ -2650,11 +2651,11 @@ void cgroup_migrate_add_src(struct css_set *src_cset,
if (src_cset->dead)
return;
src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
if (!list_empty(&src_cset->mg_preload_node))
return;
src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root);
WARN_ON(src_cset->mg_src_cgrp);
WARN_ON(src_cset->mg_dst_cgrp);
WARN_ON(!list_empty(&src_cset->mg_tasks));
@ -5747,7 +5748,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
/* Create the root cgroup state for this subsystem */
ss->root = &cgrp_dfl_root;
css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss));
css = ss->css_alloc(NULL);
/* We don't handle early failures gracefully */
BUG_ON(IS_ERR(css));
init_and_link_css(css, ss, &cgrp_dfl_root.cgrp);
@ -5979,17 +5980,20 @@ struct cgroup *cgroup_get_from_id(u64 id)
struct kernfs_node *kn;
struct cgroup *cgrp = NULL;
mutex_lock(&cgroup_mutex);
kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
if (!kn)
goto out_unlock;
goto out;
cgrp = kn->priv;
if (cgroup_is_dead(cgrp) || !cgroup_tryget(cgrp))
rcu_read_lock();
cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
if (cgrp && !cgroup_tryget(cgrp))
cgrp = NULL;
rcu_read_unlock();
kernfs_put(kn);
out_unlock:
mutex_unlock(&cgroup_mutex);
out:
return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_id);
@ -6171,6 +6175,20 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
if (ret)
goto err;
/*
* Spawning a task directly into a cgroup works by passing a file
* descriptor to the target cgroup directory. This can even be an O_PATH
* file descriptor. But it can never be a cgroup.procs file descriptor.
* This was done on purpose so spawning into a cgroup could be
* conceptualized as an atomic
*
* fd = openat(dfd_cgroup, "cgroup.procs", ...);
* write(fd, <child-pid>, ...);
*
* sequence, i.e. it's a shorthand for the caller opening and writing
* cgroup.procs of the cgroup indicated by @dfd_cgroup. This allows us
* to always use the caller's credentials.
*/
ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
!(kargs->flags & CLONE_THREAD),
current->nsproxy->cgroup_ns);
@ -6572,30 +6590,34 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
*
* Find the cgroup at @path on the default hierarchy, increment its
* reference count and return it. Returns pointer to the found cgroup on
* success, ERR_PTR(-ENOENT) if @path doesn't exist and ERR_PTR(-ENOTDIR)
* if @path points to a non-directory.
* success, ERR_PTR(-ENOENT) if @path doesn't exist or if the cgroup has already
* been released and ERR_PTR(-ENOTDIR) if @path points to a non-directory.
*/
struct cgroup *cgroup_get_from_path(const char *path)
{
struct kernfs_node *kn;
struct cgroup *cgrp;
mutex_lock(&cgroup_mutex);
struct cgroup *cgrp = ERR_PTR(-ENOENT);
kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
if (kn) {
if (kernfs_type(kn) == KERNFS_DIR) {
cgrp = kn->priv;
cgroup_get_live(cgrp);
} else {
if (!kn)
goto out;
if (kernfs_type(kn) != KERNFS_DIR) {
cgrp = ERR_PTR(-ENOTDIR);
}
kernfs_put(kn);
} else {
cgrp = ERR_PTR(-ENOENT);
goto out_kernfs;
}
mutex_unlock(&cgroup_mutex);
rcu_read_lock();
cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
if (!cgrp || !cgroup_tryget(cgrp))
cgrp = ERR_PTR(-ENOENT);
rcu_read_unlock();
out_kernfs:
kernfs_put(kn);
out:
return cgrp;
}
EXPORT_SYMBOL_GPL(cgroup_get_from_path);
@ -6723,44 +6745,6 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
#endif /* CONFIG_SOCK_CGROUP_DATA */
#ifdef CONFIG_CGROUP_BPF
int cgroup_bpf_attach(struct cgroup *cgrp,
struct bpf_prog *prog, struct bpf_prog *replace_prog,
struct bpf_cgroup_link *link,
enum bpf_attach_type type,
u32 flags)
{
int ret;
mutex_lock(&cgroup_mutex);
ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
mutex_unlock(&cgroup_mutex);
return ret;
}
int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
enum bpf_attach_type type)
{
int ret;
mutex_lock(&cgroup_mutex);
ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
mutex_unlock(&cgroup_mutex);
return ret;
}
int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
union bpf_attr __user *uattr)
{
int ret;
mutex_lock(&cgroup_mutex);
ret = __cgroup_bpf_query(cgrp, attr, uattr);
mutex_unlock(&cgroup_mutex);
return ret;
}
#endif /* CONFIG_CGROUP_BPF */
#ifdef CONFIG_SYSFS
static ssize_t show_delegatable_files(struct cftype *files, char *buf,
ssize_t size, const char *prefix)
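One of the cgroup.c hunks above documents that CLONE_INTO_CGROUP works by handing clone3() a file descriptor for the target cgroup directory (an O_PATH descriptor is enough), so the child is attached using the caller's credentials as if cgroup.procs had been written. A rough userspace sketch; the cgroup path, the error handling and the availability of clone3()/CLONE_INTO_CGROUP in the installed headers are assumptions for illustration:

#define _GNU_SOURCE
#include <fcntl.h>
#include <signal.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/sched.h>	/* struct clone_args, CLONE_INTO_CGROUP */
#include <linux/types.h>

int main(void)
{
	/* An O_PATH fd to the target cgroup directory is sufficient. */
	int cgfd = open("/sys/fs/cgroup/mygroup", O_PATH | O_DIRECTORY | O_CLOEXEC);
	struct clone_args args = {
		.flags	     = CLONE_INTO_CGROUP,
		.exit_signal = SIGCHLD,
		.cgroup	     = (__u64)cgfd,
	};
	long pid;

	if (cgfd < 0)
		return 1;

	pid = syscall(__NR_clone3, &args, sizeof(args));
	if (pid == 0)
		execlp("sleep", "sleep", "60", (char *)NULL);
	return pid < 0 ? 1 : 0;
}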

View File

@ -69,6 +69,13 @@
DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
/*
* There could be abnormal cpuset configurations for cpu or memory
* node binding; add this key to provide a quick, low-cost judgement
* of the situation.
*/
DEFINE_STATIC_KEY_FALSE(cpusets_insane_config_key);
/* See "Frequency meter" comments, below. */
struct fmeter {
@ -372,6 +379,17 @@ static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
static inline void check_insane_mems_config(nodemask_t *nodes)
{
if (!cpusets_insane_config() &&
movable_only_nodes(nodes)) {
static_branch_enable(&cpusets_insane_config_key);
pr_info("Unsupported (movable nodes only) cpuset configuration detected (nmask=%*pbl)!\n"
"Cpuset allocations might fail even with a lot of memory available.\n",
nodemask_pr_args(nodes));
}
}
/*
* Cgroup v2 behavior is used on the "cpus" and "mems" control files when
* on default hierarchy or when the cpuset_v2_mode flag is set by mounting
@ -572,6 +590,35 @@ static inline void free_cpuset(struct cpuset *cs)
kfree(cs);
}
/*
* validate_change_legacy() - Validate conditions specific to legacy (v1)
* behavior.
*/
static int validate_change_legacy(struct cpuset *cur, struct cpuset *trial)
{
struct cgroup_subsys_state *css;
struct cpuset *c, *par;
int ret;
WARN_ON_ONCE(!rcu_read_lock_held());
/* Each of our child cpusets must be a subset of us */
ret = -EBUSY;
cpuset_for_each_child(c, css, cur)
if (!is_cpuset_subset(c, trial))
goto out;
/* On legacy hierarchy, we must be a subset of our parent cpuset. */
ret = -EACCES;
par = parent_cs(cur);
if (par && !is_cpuset_subset(trial, par))
goto out;
ret = 0;
out:
return ret;
}
/*
* validate_change() - Used to validate that any proposed cpuset change
* follows the structural rules for cpusets.
@ -596,28 +643,21 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
{
struct cgroup_subsys_state *css;
struct cpuset *c, *par;
int ret;
int ret = 0;
rcu_read_lock();
/* Each of our child cpusets must be a subset of us */
ret = -EBUSY;
cpuset_for_each_child(c, css, cur)
if (!is_cpuset_subset(c, trial))
if (!is_in_v2_mode())
ret = validate_change_legacy(cur, trial);
if (ret)
goto out;
/* Remaining checks don't apply to root cpuset */
ret = 0;
if (cur == &top_cpuset)
goto out;
par = parent_cs(cur);
/* On legacy hierarchy, we must be a subset of our parent cpuset. */
ret = -EACCES;
if (!is_in_v2_mode() && !is_cpuset_subset(trial, par))
goto out;
/*
* If either I or some sibling (!= me) is exclusive, we can't
* overlap
@ -1165,9 +1205,7 @@ enum subparts_cmd {
*
* Because of the implicit cpu exclusive nature of a partition root,
* cpumask changes that violates the cpu exclusivity rule will not be
* permitted when checked by validate_change(). The validate_change()
* function will also prevent any changes to the cpu list if it is not
* a superset of children's cpu lists.
* permitted when checked by validate_change().
*/
static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd,
struct cpumask *newmask,
@ -1879,6 +1917,8 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
if (retval < 0)
goto done;
check_insane_mems_config(&trialcs->mems_allowed);
spin_lock_irq(&callback_lock);
cs->mems_allowed = trialcs->mems_allowed;
spin_unlock_irq(&callback_lock);
@ -3184,6 +3224,9 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
mems_updated = !nodes_equal(new_mems, cs->effective_mems);
if (mems_updated)
check_insane_mems_config(&new_mems);
if (is_in_v2_mode())
hotplug_update_tasks(cs, &new_cpus, &new_mems,
cpus_updated, mems_updated);
@ -3481,8 +3524,8 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
return cs;
}
/**
* cpuset_node_allowed - Can we allocate on a memory node?
/*
* __cpuset_node_allowed - Can we allocate on a memory node?
* @node: is this an allowed node?
* @gfp_mask: memory allocation flags
*
@ -3524,7 +3567,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
bool __cpuset_node_allowed(int node, gfp_t gfp_mask)
{
struct cpuset *cs; /* current cpuset ancestors */
int allowed; /* is allocation in zone z allowed? */
bool allowed; /* is allocation in zone z allowed? */
unsigned long flags;
if (in_interrupt())
@ -3653,8 +3696,8 @@ void cpuset_print_current_mems_allowed(void)
int cpuset_memory_pressure_enabled __read_mostly;
/**
* cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
/*
* __cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
*
* Keep a running average of the rate of synchronous (direct)
* page reclaim efforts initiated by tasks in each cpuset.
@ -3669,7 +3712,7 @@ int cpuset_memory_pressure_enabled __read_mostly;
* "memory_pressure". Value displayed is an integer
* representing the recent rate of entry into the synchronous
* (direct) page reclaim by any task attached to the cpuset.
**/
*/
void __cpuset_memory_pressure_bump(void)
{

View File

@ -157,13 +157,6 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
new_usage = atomic_long_add_return(amount, &res->usage);
if (new_usage > READ_ONCE(res->max) ||
new_usage > READ_ONCE(misc_res_capacity[type])) {
if (!res->failed) {
pr_info("cgroup: charge rejected by the misc controller for %s resource in ",
misc_res_name[type]);
pr_cont_cgroup_path(i->css.cgroup);
pr_cont("\n");
res->failed = true;
}
ret = -EBUSY;
goto err_charge;
}
@ -171,6 +164,11 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
return 0;
err_charge:
for (j = i; j; j = parent_misc(j)) {
atomic_long_inc(&j->res[type].events);
cgroup_file_notify(&j->events_file);
}
for (j = cg; j != i; j = parent_misc(j))
misc_cg_cancel_charge(type, j, amount);
misc_cg_cancel_charge(type, i, amount);
@ -335,6 +333,19 @@ static int misc_cg_capacity_show(struct seq_file *sf, void *v)
return 0;
}
static int misc_events_show(struct seq_file *sf, void *v)
{
struct misc_cg *cg = css_misc(seq_css(sf));
unsigned long events, i;
for (i = 0; i < MISC_CG_RES_TYPES; i++) {
events = atomic_long_read(&cg->res[i].events);
if (READ_ONCE(misc_res_capacity[i]) || events)
seq_printf(sf, "%s.max %lu\n", misc_res_name[i], events);
}
return 0;
}
/* Misc cgroup interface files */
static struct cftype misc_cg_files[] = {
{
@ -353,6 +364,12 @@ static struct cftype misc_cg_files[] = {
.seq_show = misc_cg_capacity_show,
.flags = CFTYPE_ONLY_ON_ROOT,
},
{
.name = "events",
.flags = CFTYPE_NOT_ON_ROOT,
.file_offset = offsetof(struct misc_cg, events_file),
.seq_show = misc_events_show,
},
{}
};

View File

@ -35,7 +35,7 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu)
* instead of NULL, we can tell whether @cgrp is on the list by
* testing the next pointer for NULL.
*/
if (cgroup_rstat_cpu(cgrp, cpu)->updated_next)
if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next))
return;
raw_spin_lock_irqsave(cpu_lock, flags);
@ -88,6 +88,7 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
struct cgroup *root, int cpu)
{
struct cgroup_rstat_cpu *rstatc;
struct cgroup *parent;
if (pos == root)
return NULL;
@ -96,10 +97,14 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
* We're gonna walk down to the first leaf and visit/remove it. We
* can pick any unvisited node as the starting point.
*/
if (!pos)
if (!pos) {
pos = root;
else
/* return NULL if this subtree is not on-list */
if (!cgroup_rstat_cpu(pos, cpu)->updated_next)
return NULL;
} else {
pos = cgroup_parent(pos);
}
/* walk down to the first leaf */
while (true) {
@ -115,21 +120,17 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
* However, due to the way we traverse, @pos will be the first
* child in most cases. The only exception is @root.
*/
if (rstatc->updated_next) {
struct cgroup *parent = cgroup_parent(pos);
parent = cgroup_parent(pos);
if (parent) {
struct cgroup_rstat_cpu *prstatc;
struct cgroup **nextp;
prstatc = cgroup_rstat_cpu(parent, cpu);
nextp = &prstatc->updated_children;
while (true) {
while (*nextp != pos) {
struct cgroup_rstat_cpu *nrstatc;
nrstatc = cgroup_rstat_cpu(*nextp, cpu);
if (*nextp == pos)
break;
WARN_ON_ONCE(*nextp == parent);
nextp = &nrstatc->updated_next;
}
@ -140,10 +141,6 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos,
return pos;
}
/* only happens for @root */
return NULL;
}
/* see cgroup_rstat_flush() */
static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep)
__releases(&cgroup_rstat_lock) __acquires(&cgroup_rstat_lock)

View File

@ -17,6 +17,7 @@ CONFIG_SYMBOLIC_ERRNAME=y
# Compile-time checks and compiler options
#
CONFIG_DEBUG_INFO=y
CONFIG_DEBUG_INFO_DWARF_TOOLCHAIN_DEFAULT=y
CONFIG_DEBUG_SECTION_MISMATCH=y
CONFIG_FRAME_WARN=2048
CONFIG_SECTION_MISMATCH_WARN_ONLY=y

View File

@ -100,19 +100,10 @@ void __delayacct_blkio_start(void)
*/
void __delayacct_blkio_end(struct task_struct *p)
{
struct task_delay_info *delays = p->delays;
u64 *total;
u32 *count;
if (p->delays->flags & DELAYACCT_PF_SWAPIN) {
total = &delays->swapin_delay;
count = &delays->swapin_count;
} else {
total = &delays->blkio_delay;
count = &delays->blkio_count;
}
delayacct_end(&delays->lock, &delays->blkio_start, total, count);
delayacct_end(&p->delays->lock,
&p->delays->blkio_start,
&p->delays->blkio_delay,
&p->delays->blkio_count);
}
int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
@ -164,10 +155,13 @@ int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp;
tmp = d->thrashing_delay_total + tsk->delays->thrashing_delay;
d->thrashing_delay_total = (tmp < d->thrashing_delay_total) ? 0 : tmp;
tmp = d->compact_delay_total + tsk->delays->compact_delay;
d->compact_delay_total = (tmp < d->compact_delay_total) ? 0 : tmp;
d->blkio_count += tsk->delays->blkio_count;
d->swapin_count += tsk->delays->swapin_count;
d->freepages_count += tsk->delays->freepages_count;
d->thrashing_count += tsk->delays->thrashing_count;
d->compact_count += tsk->delays->compact_count;
raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
return 0;
@ -179,8 +173,7 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk)
unsigned long flags;
raw_spin_lock_irqsave(&tsk->delays->lock, flags);
ret = nsec_to_clock_t(tsk->delays->blkio_delay +
tsk->delays->swapin_delay);
ret = nsec_to_clock_t(tsk->delays->blkio_delay);
raw_spin_unlock_irqrestore(&tsk->delays->lock, flags);
return ret;
}
@ -210,3 +203,29 @@ void __delayacct_thrashing_end(void)
&current->delays->thrashing_delay,
&current->delays->thrashing_count);
}
void __delayacct_swapin_start(void)
{
current->delays->swapin_start = local_clock();
}
void __delayacct_swapin_end(void)
{
delayacct_end(&current->delays->lock,
&current->delays->swapin_start,
&current->delays->swapin_delay,
&current->delays->swapin_count);
}
void __delayacct_compact_start(void)
{
current->delays->compact_start = local_clock();
}
void __delayacct_compact_end(void)
{
delayacct_end(&current->delays->lock,
&current->delays->compact_start,
&current->delays->compact_delay,
&current->delays->compact_count);
}
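The new __delayacct_swapin_start/end() and __delayacct_compact_start/end() pairs above follow the same start/end bracketing as the existing blkio and thrashing accounting. A call site would presumably wrap the waited-for operation through the non-underscored wrappers, along the lines of this sketch (the wrapper names mirror the existing delayacct pattern and are assumed to come from the matching delayacct.h change):

#include <linux/delayacct.h>

/* Sketch: charge the time a task spends waiting for a swap-in to delayacct. */
static void example_swap_in(void)
{
	delayacct_swapin_start();
	/* ... issue the swap-in I/O and wait for it to complete ... */
	delayacct_swapin_end();
}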

View File

@ -40,7 +40,6 @@ static struct dma_coherent_mem *dma_init_coherent_memory(phys_addr_t phys_addr,
{
struct dma_coherent_mem *dma_mem;
int pages = size >> PAGE_SHIFT;
int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
void *mem_base;
if (!size)
@ -53,7 +52,7 @@ static struct dma_coherent_mem *dma_init_coherent_memory(phys_addr_t phys_addr,
dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
if (!dma_mem)
goto out_unmap_membase;
dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
dma_mem->bitmap = bitmap_zalloc(pages, GFP_KERNEL);
if (!dma_mem->bitmap)
goto out_free_dma_mem;
@ -81,7 +80,7 @@ static void dma_release_coherent_memory(struct dma_coherent_mem *mem)
return;
memunmap(mem->virt_base);
kfree(mem->bitmap);
bitmap_free(mem->bitmap);
kfree(mem);
}
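The hunk above replaces the hand-computed kzalloc()/kfree() of the per-device coherent pool bitmap with bitmap_zalloc()/bitmap_free(), which take the number of bits directly. A minimal sketch of the pattern (function names are illustrative):

#include <linux/bitmap.h>
#include <linux/gfp.h>

/* Sketch: allocate and release a zeroed bitmap tracking `pages` chunks. */
static unsigned long *example_alloc_page_bitmap(int pages)
{
	return bitmap_zalloc(pages, GFP_KERNEL);
}

static void example_free_page_bitmap(unsigned long *bitmap)
{
	bitmap_free(bitmap);
}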

View File

@ -75,15 +75,45 @@ static bool dma_coherent_ok(struct device *dev, phys_addr_t phys, size_t size)
min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit);
}
static int dma_set_decrypted(struct device *dev, void *vaddr, size_t size)
{
if (!force_dma_unencrypted(dev))
return 0;
return set_memory_decrypted((unsigned long)vaddr, 1 << get_order(size));
}
static int dma_set_encrypted(struct device *dev, void *vaddr, size_t size)
{
int ret;
if (!force_dma_unencrypted(dev))
return 0;
ret = set_memory_encrypted((unsigned long)vaddr, 1 << get_order(size));
if (ret)
pr_warn_ratelimited("leaking DMA memory that can't be re-encrypted\n");
return ret;
}
static void __dma_direct_free_pages(struct device *dev, struct page *page,
size_t size)
{
if (IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL) &&
swiotlb_free(dev, page, size))
if (swiotlb_free(dev, page, size))
return;
dma_free_contiguous(dev, page, size);
}
static struct page *dma_direct_alloc_swiotlb(struct device *dev, size_t size)
{
struct page *page = swiotlb_alloc(dev, size);
if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
swiotlb_free(dev, page, size);
return NULL;
}
return page;
}
static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
gfp_t gfp)
{
@ -93,18 +123,11 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
WARN_ON_ONCE(!PAGE_ALIGNED(size));
if (is_swiotlb_for_alloc(dev))
return dma_direct_alloc_swiotlb(dev, size);
gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
&phys_limit);
if (IS_ENABLED(CONFIG_DMA_RESTRICTED_POOL) &&
is_swiotlb_for_alloc(dev)) {
page = swiotlb_alloc(dev, size);
if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
__dma_direct_free_pages(dev, page, size);
return NULL;
}
return page;
}
page = dma_alloc_contiguous(dev, size, gfp);
if (page && !dma_coherent_ok(dev, page_to_phys(page), size)) {
dma_free_contiguous(dev, page, size);
@ -133,6 +156,15 @@ static struct page *__dma_direct_alloc_pages(struct device *dev, size_t size,
return page;
}
/*
* Check if a potentially blocking operation needs to dip into the atomic
* pools for the given device/gfp.
*/
static bool dma_direct_use_pool(struct device *dev, gfp_t gfp)
{
return !gfpflags_allow_blocking(gfp) && !is_swiotlb_for_alloc(dev);
}
static void *dma_direct_alloc_from_pool(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp)
{
@ -140,6 +172,9 @@ static void *dma_direct_alloc_from_pool(struct device *dev, size_t size,
u64 phys_mask;
void *ret;
if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_DMA_COHERENT_POOL)))
return NULL;
gfp |= dma_direct_optimal_gfp_mask(dev, dev->coherent_dma_mask,
&phys_mask);
page = dma_alloc_from_pool(dev, size, &ret, gfp, dma_coherent_ok);
@ -149,64 +184,103 @@ static void *dma_direct_alloc_from_pool(struct device *dev, size_t size,
return ret;
}
static void *dma_direct_alloc_no_mapping(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp)
{
struct page *page;
page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
if (!page)
return NULL;
/* remove any dirty cache lines on the kernel alias */
if (!PageHighMem(page))
arch_dma_prep_coherent(page, size);
/* return the page pointer as the opaque cookie */
*dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
return page;
}
void *dma_direct_alloc(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp, unsigned long attrs)
{
bool remap = false, set_uncached = false;
struct page *page;
void *ret;
int err;
size = PAGE_ALIGN(size);
if (attrs & DMA_ATTR_NO_WARN)
gfp |= __GFP_NOWARN;
if ((attrs & DMA_ATTR_NO_KERNEL_MAPPING) &&
!force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev)) {
page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
if (!page)
return NULL;
/* remove any dirty cache lines on the kernel alias */
if (!PageHighMem(page))
arch_dma_prep_coherent(page, size);
*dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
/* return the page pointer as the opaque cookie */
return page;
}
!force_dma_unencrypted(dev) && !is_swiotlb_for_alloc(dev))
return dma_direct_alloc_no_mapping(dev, size, dma_handle, gfp);
if (!dev_is_dma_coherent(dev)) {
/*
* Fallback to the arch handler if it exists. This should
* eventually go away.
*/
if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
!IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
!IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) &&
!dev_is_dma_coherent(dev) &&
!is_swiotlb_for_alloc(dev))
return arch_dma_alloc(dev, size, dma_handle, gfp, attrs);
if (IS_ENABLED(CONFIG_DMA_GLOBAL_POOL) &&
!dev_is_dma_coherent(dev))
return dma_alloc_from_global_coherent(dev, size, dma_handle);
return arch_dma_alloc(dev, size, dma_handle, gfp,
attrs);
/*
* Remapping or decrypting memory may block. If either is required and
* we can't block, allocate the memory from the atomic pools.
* If restricted DMA (i.e., is_swiotlb_for_alloc) is required, one must
* set up another device coherent pool by shared-dma-pool and use
* dma_alloc_from_dev_coherent instead.
* If there is a global pool, always allocate from it for
* non-coherent devices.
*/
if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
!gfpflags_allow_blocking(gfp) &&
(force_dma_unencrypted(dev) ||
(IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
!dev_is_dma_coherent(dev))) &&
!is_swiotlb_for_alloc(dev))
if (IS_ENABLED(CONFIG_DMA_GLOBAL_POOL))
return dma_alloc_from_global_coherent(dev, size,
dma_handle);
/*
* Otherwise remap if the architecture is asking for it. But
* given that remapping memory is a blocking operation we'll
* instead have to dip into the atomic pools.
*/
remap = IS_ENABLED(CONFIG_DMA_DIRECT_REMAP);
if (remap) {
if (dma_direct_use_pool(dev, gfp))
return dma_direct_alloc_from_pool(dev, size,
dma_handle, gfp);
} else {
if (!IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED))
return NULL;
set_uncached = true;
}
}
/*
* Decrypting memory may block, so allocate the memory from the atomic
* pools if we can't block.
*/
if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp))
return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
/* we always manually zero the memory once we are done */
page = __dma_direct_alloc_pages(dev, size, gfp & ~__GFP_ZERO);
if (!page)
return NULL;
if (PageHighMem(page)) {
/*
* Depending on the cma= arguments and per-arch setup,
* dma_alloc_contiguous could return highmem pages.
* Without remapping there is no way to return them here, so
* log an error and fail.
*/
if (!IS_ENABLED(CONFIG_DMA_REMAP)) {
dev_info(dev, "Rejecting highmem page from CMA.\n");
goto out_free_pages;
}
remap = true;
set_uncached = false;
}
if ((IS_ENABLED(CONFIG_DMA_DIRECT_REMAP) &&
!dev_is_dma_coherent(dev)) ||
(IS_ENABLED(CONFIG_DMA_REMAP) && PageHighMem(page))) {
if (remap) {
/* remove any dirty cache lines on the kernel alias */
arch_dma_prep_coherent(page, size);
@ -216,56 +290,27 @@ void *dma_direct_alloc(struct device *dev, size_t size,
__builtin_return_address(0));
if (!ret)
goto out_free_pages;
if (force_dma_unencrypted(dev)) {
err = set_memory_decrypted((unsigned long)ret,
1 << get_order(size));
if (err)
goto out_free_pages;
}
memset(ret, 0, size);
goto done;
}
if (PageHighMem(page)) {
/*
* Depending on the cma= arguments and per-arch setup
* dma_alloc_contiguous could return highmem pages.
* Without remapping there is no way to return them here,
* so log an error and fail.
*/
dev_info(dev, "Rejecting highmem page from CMA.\n");
goto out_free_pages;
}
} else {
ret = page_address(page);
if (force_dma_unencrypted(dev)) {
err = set_memory_decrypted((unsigned long)ret,
1 << get_order(size));
if (err)
if (dma_set_decrypted(dev, ret, size))
goto out_free_pages;
}
memset(ret, 0, size);
if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_SET_UNCACHED) &&
!dev_is_dma_coherent(dev)) {
if (set_uncached) {
arch_dma_prep_coherent(page, size);
ret = arch_dma_set_uncached(ret, size);
if (IS_ERR(ret))
goto out_encrypt_pages;
}
done:
*dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
return ret;
out_encrypt_pages:
if (force_dma_unencrypted(dev)) {
err = set_memory_encrypted((unsigned long)page_address(page),
1 << get_order(size));
/* If memory cannot be re-encrypted, it must be leaked */
if (err)
if (dma_set_encrypted(dev, page_address(page), size))
return NULL;
}
out_free_pages:
__dma_direct_free_pages(dev, page, size);
return NULL;
@ -304,13 +349,14 @@ void dma_direct_free(struct device *dev, size_t size,
dma_free_from_pool(dev, cpu_addr, PAGE_ALIGN(size)))
return;
if (force_dma_unencrypted(dev))
set_memory_encrypted((unsigned long)cpu_addr, 1 << page_order);
if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr))
if (IS_ENABLED(CONFIG_DMA_REMAP) && is_vmalloc_addr(cpu_addr)) {
vunmap(cpu_addr);
else if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED))
} else {
if (IS_ENABLED(CONFIG_ARCH_HAS_DMA_CLEAR_UNCACHED))
arch_dma_clear_uncached(cpu_addr, size);
if (dma_set_encrypted(dev, cpu_addr, 1 << page_order))
return;
}
__dma_direct_free_pages(dev, dma_direct_to_page(dev, dma_addr), size);
}
@ -321,9 +367,7 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
struct page *page;
void *ret;
if (IS_ENABLED(CONFIG_DMA_COHERENT_POOL) &&
force_dma_unencrypted(dev) && !gfpflags_allow_blocking(gfp) &&
!is_swiotlb_for_alloc(dev))
if (force_dma_unencrypted(dev) && dma_direct_use_pool(dev, gfp))
return dma_direct_alloc_from_pool(dev, size, dma_handle, gfp);
page = __dma_direct_alloc_pages(dev, size, gfp);
@ -341,11 +385,8 @@ struct page *dma_direct_alloc_pages(struct device *dev, size_t size,
}
ret = page_address(page);
if (force_dma_unencrypted(dev)) {
if (set_memory_decrypted((unsigned long)ret,
1 << get_order(size)))
if (dma_set_decrypted(dev, ret, size))
goto out_free_pages;
}
memset(ret, 0, size);
*dma_handle = phys_to_dma_direct(dev, page_to_phys(page));
return page;
@ -366,9 +407,8 @@ void dma_direct_free_pages(struct device *dev, size_t size,
dma_free_from_pool(dev, vaddr, size))
return;
if (force_dma_unencrypted(dev))
set_memory_encrypted((unsigned long)vaddr, 1 << page_order);
if (dma_set_encrypted(dev, vaddr, 1 << page_order))
return;
__dma_direct_free_pages(dev, page, size);
}
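dma_direct_alloc() and dma_direct_free() above sit behind the generic DMA API; drivers reach them through dma_alloc_coherent()/dma_free_coherent() rather than calling them directly. As a reminder of the caller-visible contract (the device pointer and buffer size are placeholders), the consumer side looks roughly like:

#include <linux/dma-mapping.h>

/* Sketch: allocate, use and release a coherent DMA buffer for a device. */
static int example_setup_ring(struct device *dev, size_t size)
{
	dma_addr_t dma_handle;
	void *cpu_addr;

	cpu_addr = dma_alloc_coherent(dev, size, &dma_handle, GFP_KERNEL);
	if (!cpu_addr)
		return -ENOMEM;

	/* ... program dma_handle into the device, access cpu_addr from the CPU ... */

	dma_free_coherent(dev, size, cpu_addr, dma_handle);
	return 0;
}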

View File

@ -296,10 +296,6 @@ dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr,
if (WARN_ON_ONCE(!dev->dma_mask))
return DMA_MAPPING_ERROR;
/* Don't allow RAM to be mapped */
if (WARN_ON_ONCE(pfn_valid(PHYS_PFN(phys_addr))))
return DMA_MAPPING_ERROR;
if (dma_map_direct(dev, ops))
addr = dma_direct_map_resource(dev, phys_addr, size, dir, attrs);
else if (ops->map_resource)

View File

@ -34,7 +34,7 @@
#include <linux/highmem.h>
#include <linux/gfp.h>
#include <linux/scatterlist.h>
#include <linux/mem_encrypt.h>
#include <linux/cc_platform.h>
#include <linux/set_memory.h>
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
@ -50,6 +50,7 @@
#include <asm/io.h>
#include <asm/dma.h>
#include <linux/io.h>
#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/iommu-helper.h>
@ -72,6 +73,8 @@ enum swiotlb_force swiotlb_force;
struct io_tlb_mem io_tlb_default_mem;
phys_addr_t swiotlb_unencrypted_base;
/*
* Max segment that we can provide which (if pages are contiguous) will
* not be bounced (unless SWIOTLB_FORCE is set).
@ -155,6 +158,34 @@ static inline unsigned long nr_slots(u64 val)
return DIV_ROUND_UP(val, IO_TLB_SIZE);
}
/*
* Remap swiotlb memory in the unencrypted physical address space
* when swiotlb_unencrypted_base is set. (e.g. for Hyper-V AMD SEV-SNP
* Isolation VMs).
*/
#ifdef CONFIG_HAS_IOMEM
static void *swiotlb_mem_remap(struct io_tlb_mem *mem, unsigned long bytes)
{
void *vaddr = NULL;
if (swiotlb_unencrypted_base) {
phys_addr_t paddr = mem->start + swiotlb_unencrypted_base;
vaddr = memremap(paddr, bytes, MEMREMAP_WB);
if (!vaddr)
pr_err("Failed to map the unencrypted memory %pa size %lx.\n",
&paddr, bytes);
}
return vaddr;
}
#else
static void *swiotlb_mem_remap(struct io_tlb_mem *mem, unsigned long bytes)
{
return NULL;
}
#endif
/*
* Early SWIOTLB allocation may be too early to allow an architecture to
* perform the desired operations. This function allows the architecture to
@ -172,7 +203,12 @@ void __init swiotlb_update_mem_attributes(void)
vaddr = phys_to_virt(mem->start);
bytes = PAGE_ALIGN(mem->nslabs << IO_TLB_SHIFT);
set_memory_decrypted((unsigned long)vaddr, bytes >> PAGE_SHIFT);
memset(vaddr, 0, bytes);
mem->vaddr = swiotlb_mem_remap(mem, bytes);
if (!mem->vaddr)
mem->vaddr = vaddr;
memset(mem->vaddr, 0, bytes);
}
static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
@ -196,7 +232,17 @@ static void swiotlb_init_io_tlb_mem(struct io_tlb_mem *mem, phys_addr_t start,
mem->slots[i].orig_addr = INVALID_PHYS_ADDR;
mem->slots[i].alloc_size = 0;
}
/*
* If swiotlb_unencrypted_base is set, the bounce buffer memory will
* be remapped and cleared in swiotlb_update_mem_attributes.
*/
if (swiotlb_unencrypted_base)
return;
memset(vaddr, 0, bytes);
mem->vaddr = vaddr;
return;
}
int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose)
@ -247,7 +293,7 @@ swiotlb_init(int verbose)
return;
fail_free_mem:
memblock_free_early(__pa(tlb), bytes);
memblock_free(tlb, bytes);
fail:
pr_warn("Cannot allocate buffer");
}
@ -371,7 +417,7 @@ static void swiotlb_bounce(struct device *dev, phys_addr_t tlb_addr, size_t size
phys_addr_t orig_addr = mem->slots[index].orig_addr;
size_t alloc_size = mem->slots[index].alloc_size;
unsigned long pfn = PFN_DOWN(orig_addr);
unsigned char *vaddr = phys_to_virt(tlb_addr);
unsigned char *vaddr = mem->vaddr + tlb_addr - mem->start;
unsigned int tlb_offset, orig_addr_offset;
if (orig_addr == INVALID_PHYS_ADDR)
@ -459,7 +505,7 @@ static unsigned int wrap_index(struct io_tlb_mem *mem, unsigned int index)
* allocate a buffer from that IO TLB pool.
*/
static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
size_t alloc_size)
size_t alloc_size, unsigned int alloc_align_mask)
{
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
unsigned long boundary_mask = dma_get_seg_boundary(dev);
@ -483,6 +529,7 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
stride = (iotlb_align_mask >> IO_TLB_SHIFT) + 1;
if (alloc_size >= PAGE_SIZE)
stride = max(stride, stride << (PAGE_SHIFT - IO_TLB_SHIFT));
stride = max(stride, (alloc_align_mask >> IO_TLB_SHIFT) + 1);
spin_lock_irqsave(&mem->lock, flags);
if (unlikely(nslots > mem->nslabs - mem->used))
@ -541,7 +588,8 @@ static int swiotlb_find_slots(struct device *dev, phys_addr_t orig_addr,
phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
size_t mapping_size, size_t alloc_size,
enum dma_data_direction dir, unsigned long attrs)
unsigned int alloc_align_mask, enum dma_data_direction dir,
unsigned long attrs)
{
struct io_tlb_mem *mem = dev->dma_io_tlb_mem;
unsigned int offset = swiotlb_align_offset(dev, orig_addr);
@ -552,7 +600,7 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
if (!mem)
panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer");
if (mem_encrypt_active())
if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n");
if (mapping_size > alloc_size) {
@ -561,7 +609,8 @@ phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr,
return (phys_addr_t)DMA_MAPPING_ERROR;
}
index = swiotlb_find_slots(dev, orig_addr, alloc_size + offset);
index = swiotlb_find_slots(dev, orig_addr,
alloc_size + offset, alloc_align_mask);
if (index == -1) {
if (!(attrs & DMA_ATTR_NO_WARN))
dev_warn_ratelimited(dev,
@ -683,7 +732,7 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size,
trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size,
swiotlb_force);
swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, size, dir,
swiotlb_addr = swiotlb_tbl_map_single(dev, paddr, size, size, 0, dir,
attrs);
if (swiotlb_addr == (phys_addr_t)DMA_MAPPING_ERROR)
return DMA_MAPPING_ERROR;
@ -767,7 +816,7 @@ struct page *swiotlb_alloc(struct device *dev, size_t size)
if (!mem)
return NULL;
index = swiotlb_find_slots(dev, 0, size);
index = swiotlb_find_slots(dev, 0, size, 0);
if (index == -1)
return NULL;
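swiotlb_tbl_map_single() above gains an alloc_align_mask parameter so a caller can force the bounce slot to respect an allocation granule; the existing callers shown here pass 0 to keep the old behaviour. A hedged sketch of how a caller with an IOVA-style granule might use it (the wrapper and its arguments are hypothetical):

/* Sketch: bounce a buffer while keeping the slot aligned to a granule. */
static phys_addr_t example_bounce_aligned(struct device *dev, phys_addr_t phys,
					  size_t size, unsigned int granule,
					  enum dma_data_direction dir,
					  unsigned long attrs)
{
	size_t alloc_size = ALIGN(size, granule);

	return swiotlb_tbl_map_single(dev, phys, size, alloc_size,
				      granule - 1, dir, attrs);
}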

View File

@ -187,7 +187,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
/* Check if any of the above work has queued a deferred wakeup */
tick_nohz_user_enter_prepare();
ti_work = READ_ONCE(current_thread_info()->flags);
ti_work = read_thread_flags();
}
/* Return the latest work state for arch_exit_to_user_mode() */
@ -196,7 +196,7 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs,
static void exit_to_user_mode_prepare(struct pt_regs *regs)
{
unsigned long ti_work = READ_ONCE(current_thread_info()->flags);
unsigned long ti_work = read_thread_flags();
lockdep_assert_irqs_disabled();
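The exit-to-user-mode paths above replace open-coded READ_ONCE(current_thread_info()->flags) with read_thread_flags(). Judging from the lines being replaced, the helper is presumably a thin wrapper of this shape (a sketch, not necessarily the exact definition introduced elsewhere in the series):

static __always_inline unsigned long read_thread_flags(void)
{
	return READ_ONCE(current_thread_info()->flags);
}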

View File

@ -26,7 +26,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work)
if (ret)
return ret;
ti_work = READ_ONCE(current_thread_info()->flags);
ti_work = read_thread_flags();
} while (ti_work & XFER_TO_GUEST_MODE_WORK || need_resched());
return 0;
}
@ -43,7 +43,7 @@ int xfer_to_guest_mode_handle_work(struct kvm_vcpu *vcpu)
* disabled in the inner loop before going into guest mode. No need
* to disable interrupts here.
*/
ti_work = READ_ONCE(current_thread_info()->flags);
ti_work = read_thread_flags();
if (!(ti_work & XFER_TO_GUEST_MODE_WORK))
return 0;

View File

@ -1,10 +1,5 @@
# SPDX-License-Identifier: GPL-2.0
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_core.o = $(CC_FLAGS_FTRACE)
endif
obj-y := core.o ring_buffer.o callchain.o
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
obj-$(CONFIG_UPROBES) += uprobes.o

View File

@ -1875,6 +1875,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
list_add_rcu(&event->event_entry, &ctx->event_list);
ctx->nr_events++;
if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
ctx->nr_user++;
if (event->attr.inherit_stat)
ctx->nr_stat++;
@ -2066,6 +2068,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
event->attach_state &= ~PERF_ATTACH_CONTEXT;
ctx->nr_events--;
if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
ctx->nr_user--;
if (event->attr.inherit_stat)
ctx->nr_stat--;
@ -6598,33 +6602,43 @@ static void perf_pending_event(struct irq_work *entry)
perf_swevent_put_recursion_context(rctx);
}
/*
* We assume there is only KVM supporting the callbacks.
* Later on, we might change it to a list if there is
* another virtualization implementation supporting the callbacks.
*/
#ifdef CONFIG_GUEST_PERF_EVENTS
struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
DEFINE_STATIC_CALL_RET0(__perf_guest_state, *perf_guest_cbs->state);
DEFINE_STATIC_CALL_RET0(__perf_guest_get_ip, *perf_guest_cbs->get_ip);
DEFINE_STATIC_CALL_RET0(__perf_guest_handle_intel_pt_intr, *perf_guest_cbs->handle_intel_pt_intr);
void perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs)))
return -EBUSY;
return;
rcu_assign_pointer(perf_guest_cbs, cbs);
return 0;
static_call_update(__perf_guest_state, cbs->state);
static_call_update(__perf_guest_get_ip, cbs->get_ip);
/* Implementing ->handle_intel_pt_intr is optional. */
if (cbs->handle_intel_pt_intr)
static_call_update(__perf_guest_handle_intel_pt_intr,
cbs->handle_intel_pt_intr);
}
EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
void perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs))
return -EINVAL;
return;
rcu_assign_pointer(perf_guest_cbs, NULL);
static_call_update(__perf_guest_state, (void *)&__static_call_return0);
static_call_update(__perf_guest_get_ip, (void *)&__static_call_return0);
static_call_update(__perf_guest_handle_intel_pt_intr,
(void *)&__static_call_return0);
synchronize_rcu();
return 0;
}
EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
#endif
static void
perf_output_sample_regs(struct perf_output_handle *handle,
@ -9183,6 +9197,36 @@ static void perf_log_itrace_start(struct perf_event *event)
perf_output_end(&handle);
}
void perf_report_aux_output_id(struct perf_event *event, u64 hw_id)
{
struct perf_output_handle handle;
struct perf_sample_data sample;
struct perf_aux_event {
struct perf_event_header header;
u64 hw_id;
} rec;
int ret;
if (event->parent)
event = event->parent;
rec.header.type = PERF_RECORD_AUX_OUTPUT_HW_ID;
rec.header.misc = 0;
rec.header.size = sizeof(rec);
rec.hw_id = hw_id;
perf_event_header__init_id(&rec.header, &sample, event);
ret = perf_output_begin(&handle, &sample, event, rec.header.size);
if (ret)
return;
perf_output_put(&handle, rec);
perf_event__output_id_sample(event, &handle, &sample);
perf_output_end(&handle);
}
static int
__perf_event_account_interrupt(struct perf_event *event, int throttle)
{
@ -13548,3 +13592,5 @@ struct cgroup_subsys perf_event_cgrp_subsys = {
.threaded = true,
};
#endif /* CONFIG_CGROUP_PERF */
DEFINE_STATIC_CALL_RET0(perf_snapshot_branch_stack, perf_snapshot_branch_stack_t);
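With the hunks above, perf_register_guest_info_callbacks() and perf_unregister_guest_info_callbacks() become void and feed static calls for ->state, ->get_ip and the optional ->handle_intel_pt_intr. A hypervisor-side registration sketch under the new API (the callback names and bodies are placeholders, not taken from this commit):

static unsigned int example_guest_state(void)
{
	/* Report whether a vCPU is currently running and at what privilege. */
	return 0;
}

static unsigned long example_guest_get_ip(void)
{
	/* Guest instruction pointer for samples taken while in guest mode. */
	return 0;
}

static struct perf_guest_info_callbacks example_guest_cbs = {
	.state	= example_guest_state,
	.get_ip	= example_guest_get_ip,
	/* .handle_intel_pt_intr is optional and may be left NULL. */
};

static void example_register(void)
{
	perf_register_guest_info_callbacks(&example_guest_cbs);
}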

View File

@ -205,12 +205,7 @@ DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user)
static inline int get_recursion_context(int *recursion)
{
unsigned int pc = preempt_count();
unsigned char rctx = 0;
rctx += !!(pc & (NMI_MASK));
rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK));
rctx += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));
unsigned char rctx = interrupt_context_level();
if (recursion[rctx])
return -1;
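get_recursion_context() above now defers to interrupt_context_level(). Based on the open-coded computation it replaces, the shared helper presumably looks something like this sketch:

static __always_inline unsigned char interrupt_context_level(void)
{
	unsigned long pc = preempt_count();
	unsigned char level = 0;

	level += !!(pc & NMI_MASK);
	level += !!(pc & (NMI_MASK | HARDIRQ_MASK));
	level += !!(pc & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET));

	return level;
}

The three cumulative tests yield 0 for task context, 1 for softirq, 2 for hardirq and 3 for NMI, which is exactly the index the per-context recursion array expects.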

View File

@ -167,7 +167,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
addr + PAGE_SIZE);
if (new_page) {
err = mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL);
err = mem_cgroup_charge(page_folio(new_page), vma->vm_mm,
GFP_KERNEL);
if (err)
return err;
}

View File

@ -48,7 +48,6 @@
#include <linux/pipe_fs_i.h>
#include <linux/audit.h> /* for audit_free() */
#include <linux/resource.h>
#include <linux/blkdev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/tracehook.h>
#include <linux/fs_struct.h>
@ -64,6 +63,7 @@
#include <linux/rcuwait.h>
#include <linux/compat.h>
#include <linux/io_uring.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <asm/unistd.h>
@ -116,7 +116,7 @@ static void __exit_signal(struct task_struct *tsk)
* then notify it:
*/
if (sig->notify_count > 0 && !--sig->notify_count)
wake_up_process(sig->group_exit_task);
wake_up_process(sig->group_exec_task);
if (tsk == sig->curr_target)
sig->curr_target = next_thread(tsk);
@ -168,6 +168,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
{
struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
kprobe_flush_task(tsk);
perf_event_delayed_put(tsk);
trace_sched_process_free(tsk);
put_task_struct(tsk);
@ -339,6 +340,46 @@ kill_orphaned_pgrp(struct task_struct *tsk, struct task_struct *parent)
}
}
static void coredump_task_exit(struct task_struct *tsk)
{
struct core_state *core_state;
/*
* Serialize with any possible pending coredump.
* We must hold siglock around checking core_state
* and setting PF_POSTCOREDUMP. The core-inducing thread
* will increment ->nr_threads for each thread in the
* group without PF_POSTCOREDUMP set.
*/
spin_lock_irq(&tsk->sighand->siglock);
tsk->flags |= PF_POSTCOREDUMP;
core_state = tsk->signal->core_state;
spin_unlock_irq(&tsk->sighand->siglock);
if (core_state) {
struct core_thread self;
self.task = current;
if (self.task->flags & PF_SIGNALED)
self.next = xchg(&core_state->dumper.next, &self);
else
self.task = NULL;
/*
* Implies mb(), the result of xchg() must be visible
* to core_state->dumper.
*/
if (atomic_dec_and_test(&core_state->nr_threads))
complete(&core_state->startup);
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (!self.task) /* see coredump_finish() */
break;
freezable_schedule();
}
__set_current_state(TASK_RUNNING);
}
}
#ifdef CONFIG_MEMCG
/*
* A task is exiting. If it owned this mm, find a new owner for the mm.
@ -434,47 +475,12 @@ void mm_update_next_owner(struct mm_struct *mm)
static void exit_mm(void)
{
struct mm_struct *mm = current->mm;
struct core_state *core_state;
exit_mm_release(current, mm);
if (!mm)
return;
sync_mm_rss(mm);
/*
* Serialize with any possible pending coredump.
* We must hold mmap_lock around checking core_state
* and clearing tsk->mm. The core-inducing thread
* will increment ->nr_threads for each thread in the
* group with ->mm != NULL.
*/
mmap_read_lock(mm);
core_state = mm->core_state;
if (core_state) {
struct core_thread self;
mmap_read_unlock(mm);
self.task = current;
if (self.task->flags & PF_SIGNALED)
self.next = xchg(&core_state->dumper.next, &self);
else
self.task = NULL;
/*
* Implies mb(), the result of xchg() must be visible
* to core_state->dumper.
*/
if (atomic_dec_and_test(&core_state->nr_threads))
complete(&core_state->startup);
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (!self.task) /* see coredump_finish() */
break;
freezable_schedule();
}
__set_current_state(TASK_RUNNING);
mmap_read_lock(mm);
}
mmgrab(mm);
BUG_ON(mm != current->active_mm);
/* more a memory barrier than a real lock */
@ -691,7 +697,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
/* mt-exec, de_thread() is waiting for group leader */
if (unlikely(tsk->signal->notify_count < 0))
wake_up_process(tsk->signal->group_exit_task);
wake_up_process(tsk->signal->group_exec_task);
write_unlock_irq(&tasklist_lock);
list_for_each_entry_safe(p, n, &dead, ptrace_entry) {
@ -729,54 +735,29 @@ void __noreturn do_exit(long code)
struct task_struct *tsk = current;
int group_dead;
/*
* We can get here from a kernel oops, sometimes with preemption off.
* Start by checking for critical errors.
* Then fix up important state like USER_DS and preemption.
* Then do everything else.
*/
WARN_ON(blk_needs_flush_plug(tsk));
if (unlikely(in_interrupt()))
panic("Aiee, killing interrupt handler!");
if (unlikely(!tsk->pid))
panic("Attempted to kill the idle task!");
/*
* If do_exit is called because this process oopsed, it's possible
* If do_dead is called because this process oopsed, it's possible
* that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
* continuing. Amongst other possible reasons, this is to prevent
* mm_release()->clear_child_tid() from writing to a user-controlled
* kernel address.
*
* On up-to-date architectures force_uaccess_begin is a noop. On
* architectures that still have set_fs/get_fs, this also covers
* kernel threads that run with set_fs(KERNEL_DS) by default, in
* addition to the oops case.
*/
force_uaccess_begin();
if (unlikely(in_atomic())) {
pr_info("note: %s[%d] exited with preempt_count %d\n",
current->comm, task_pid_nr(current),
preempt_count());
preempt_count_set(PREEMPT_ENABLED);
}
profile_task_exit(tsk);
kcov_task_exit(tsk);
coredump_task_exit(tsk);
ptrace_event(PTRACE_EVENT_EXIT, code);
validate_creds_for_do_exit(tsk);
/*
* We're taking recursive faults here in do_exit. Safest is to just
* leave this task alone and wait for reboot.
*/
if (unlikely(tsk->flags & PF_EXITING)) {
pr_alert("Fixing recursive fault but reboot is needed!\n");
futex_exit_recursive(tsk);
set_current_state(TASK_UNINTERRUPTIBLE);
schedule();
}
io_uring_files_cancel();
exit_signals(tsk); /* sets PF_EXITING */
@ -875,16 +856,46 @@ void __noreturn do_exit(long code)
lockdep_free_task(tsk);
do_task_dead();
}
EXPORT_SYMBOL_GPL(do_exit);
void complete_and_exit(struct completion *comp, long code)
void __noreturn make_task_dead(int signr)
{
if (comp)
complete(comp);
/*
* Take the task off the cpu after something catastrophic has
* happened.
*
* We can get here from a kernel oops, sometimes with preemption off.
* Start by checking for critical errors.
* Then fix up important state like USER_DS and preemption.
* Then do everything else.
*/
struct task_struct *tsk = current;
do_exit(code);
if (unlikely(in_interrupt()))
panic("Aiee, killing interrupt handler!");
if (unlikely(!tsk->pid))
panic("Attempted to kill the idle task!");
if (unlikely(in_atomic())) {
pr_info("note: %s[%d] exited with preempt_count %d\n",
current->comm, task_pid_nr(current),
preempt_count());
preempt_count_set(PREEMPT_ENABLED);
}
/*
* We're taking recursive faults here in make_task_dead. Safest is to just
* leave this task alone and wait for reboot.
*/
if (unlikely(tsk->flags & PF_EXITING)) {
pr_alert("Fixing recursive fault but reboot is needed!\n");
futex_exit_recursive(tsk);
tsk->exit_state = EXIT_DEAD;
refcount_inc(&tsk->rcu_users);
do_task_dead();
}
do_exit(signr);
}
EXPORT_SYMBOL(complete_and_exit);
SYSCALL_DEFINE1(exit, int, error_code)
{
@ -900,17 +911,19 @@ do_group_exit(int exit_code)
{
struct signal_struct *sig = current->signal;
BUG_ON(exit_code & 0x80); /* core dumps don't get here */
if (signal_group_exit(sig))
if (sig->flags & SIGNAL_GROUP_EXIT)
exit_code = sig->group_exit_code;
else if (sig->group_exec_task)
exit_code = 0;
else if (!thread_group_empty(current)) {
struct sighand_struct *const sighand = current->sighand;
spin_lock_irq(&sighand->siglock);
if (signal_group_exit(sig))
if (sig->flags & SIGNAL_GROUP_EXIT)
/* Another thread got here before we took the lock. */
exit_code = sig->group_exit_code;
else if (sig->group_exec_task)
exit_code = 0;
else {
sig->group_exit_code = exit_code;
sig->flags = SIGNAL_GROUP_EXIT;
@ -1005,7 +1018,8 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p)
return 0;
if (unlikely(wo->wo_flags & WNOWAIT)) {
status = p->exit_code;
status = (p->signal->flags & SIGNAL_GROUP_EXIT)
? p->signal->group_exit_code : p->exit_code;
get_task_struct(p);
read_unlock(&tasklist_lock);
sched_annotate_sleep();

View File

@ -62,40 +62,13 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
return e;
}
int init_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_sinittext &&
addr < (unsigned long)_einittext)
return 1;
return 0;
}
int notrace core_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_stext &&
addr < (unsigned long)_etext)
if (is_kernel_text(addr))
return 1;
if (system_state < SYSTEM_RUNNING &&
init_kernel_text(addr))
return 1;
return 0;
}
/**
* core_kernel_data - tell if addr points to kernel data
* @addr: address to test
*
* Returns true if @addr passed in is from the core kernel data
* section.
*
* Note: On some archs it may return true for core RODATA, and false
* for others. But will always be true for core RW data.
*/
int core_kernel_data(unsigned long addr)
{
if (addr >= (unsigned long)_sdata &&
addr < (unsigned long)_edata)
if (system_state < SYSTEM_FREEING_INITMEM &&
is_kernel_inittext(addr))
return 1;
return 0;
}
@ -112,7 +85,7 @@ int __kernel_text_address(unsigned long addr)
* Since we are after the module-symbols check, there's
* no danger of address overlap:
*/
if (init_kernel_text(addr))
if (is_kernel_inittext(addr))
return 1;
return 0;
}

View File

@ -42,6 +42,7 @@
#include <linux/mmu_notifier.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/vmacache.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
@ -76,7 +77,6 @@
#include <linux/taskstats_kern.h>
#include <linux/random.h>
#include <linux/tty.h>
#include <linux/blkdev.h>
#include <linux/fs_struct.h>
#include <linux/magic.h>
#include <linux/perf_event.h>
@ -366,12 +366,14 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
*new = data_race(*orig);
INIT_LIST_HEAD(&new->anon_vma_chain);
new->vm_next = new->vm_prev = NULL;
dup_anon_vma_name(orig, new);
}
return new;
}
void vm_area_free(struct vm_area_struct *vma)
{
free_anon_vma_name(vma);
kmem_cache_free(vm_area_cachep, vma);
}
@ -755,8 +757,6 @@ void __put_task_struct(struct task_struct *tsk)
delayacct_tsk_free(tsk);
put_signal_struct(tsk->signal);
sched_core_free(tsk);
if (!profile_handoff_task(tsk))
free_task(tsk);
}
EXPORT_SYMBOL_GPL(__put_task_struct);
@ -951,7 +951,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
tsk->splice_pipe = NULL;
tsk->task_frag.page = NULL;
tsk->wake_q.next = NULL;
tsk->pf_io_worker = NULL;
tsk->worker_private = NULL;
account_kernel_stack(tsk, 1);
@ -1044,7 +1044,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
seqcount_init(&mm->write_protect_seq);
mmap_init_lock(mm);
INIT_LIST_HEAD(&mm->mmlist);
mm->core_state = NULL;
mm_pgtables_bytes_init(mm);
mm->map_count = 0;
mm->locked_vm = 0;
@ -1392,8 +1391,7 @@ static void mm_release(struct task_struct *tsk, struct mm_struct *mm)
* purposes.
*/
if (tsk->clear_child_tid) {
if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) &&
atomic_read(&mm->mm_users) > 1) {
if (atomic_read(&mm->mm_users) > 1) {
/*
* We don't check the error code - if userspace has
* not set up a proper pointer then tough luck.
@ -1559,32 +1557,6 @@ static int copy_files(unsigned long clone_flags, struct task_struct *tsk)
return error;
}
static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
{
#ifdef CONFIG_BLOCK
struct io_context *ioc = current->io_context;
struct io_context *new_ioc;
if (!ioc)
return 0;
/*
* Share io context with parent, if CLONE_IO is set
*/
if (clone_flags & CLONE_IO) {
ioc_task_link(ioc);
tsk->io_context = ioc;
} else if (ioprio_valid(ioc->ioprio)) {
new_ioc = get_task_io_context(tsk, GFP_KERNEL, NUMA_NO_NODE);
if (unlikely(!new_ioc))
return -ENOMEM;
new_ioc->ioprio = ioc->ioprio;
put_io_context(new_ioc);
}
#endif
return 0;
}
static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
{
struct sighand_struct *sig;
@ -2035,12 +2007,6 @@ static __latent_entropy struct task_struct *copy_process(
siginitsetinv(&p->blocked, sigmask(SIGKILL)|sigmask(SIGSTOP));
}
/*
* This _must_ happen before we call free_task(), i.e. before we jump
* to any of the bad_fork_* labels. This is to avoid freeing
* p->set_child_tid which is (ab)used as a kthread's data pointer for
* kernel threads (PF_KTHREAD).
*/
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
/*
* Clear TID on mm_release()?
@ -2121,12 +2087,16 @@ static __latent_entropy struct task_struct *copy_process(
p->io_context = NULL;
audit_set_context(p, NULL);
cgroup_fork(p);
if (p->flags & PF_KTHREAD) {
if (!set_kthread_struct(p))
goto bad_fork_cleanup_delayacct;
}
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
if (IS_ERR(p->mempolicy)) {
retval = PTR_ERR(p->mempolicy);
p->mempolicy = NULL;
goto bad_fork_cleanup_threadgroup_lock;
goto bad_fork_cleanup_delayacct;
}
#endif
#ifdef CONFIG_CPUSETS
@ -2473,8 +2443,8 @@ static __latent_entropy struct task_struct *copy_process(
lockdep_free_task(p);
#ifdef CONFIG_NUMA
mpol_put(p->mempolicy);
bad_fork_cleanup_threadgroup_lock:
#endif
bad_fork_cleanup_delayacct:
delayacct_tsk_free(p);
bad_fork_cleanup_count:
dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
@ -3038,7 +3008,7 @@ int unshare_fd(unsigned long unshare_flags, unsigned int max_fds,
int ksys_unshare(unsigned long unshare_flags)
{
struct fs_struct *fs, *new_fs = NULL;
struct files_struct *fd, *new_fd = NULL;
struct files_struct *new_fd = NULL;
struct cred *new_cred = NULL;
struct nsproxy *new_nsproxy = NULL;
int do_sysvsem = 0;
@ -3125,11 +3095,8 @@ int ksys_unshare(unsigned long unshare_flags)
spin_unlock(&fs->lock);
}
if (new_fd) {
fd = current->files;
current->files = new_fd;
new_fd = fd;
}
if (new_fd)
swap(current->files, new_fd);
task_unlock(current);

View File

@ -4,7 +4,6 @@ menu "GCOV-based kernel profiling"
config GCOV_KERNEL
bool "Enable gcov-based kernel profiling"
depends on DEBUG_FS
depends on !CC_IS_CLANG || CLANG_VERSION >= 110000
depends on !ARCH_WANTS_NO_INSTR || CC_HAS_NO_PROFILE_FN_ATTR
select CONSTRUCTORS
default n

View File

@ -63,7 +63,9 @@ static struct task_struct *watchdog_task;
* Should we dump all CPUs backtraces in a hung task event?
* Defaults to 0, can be changed via sysctl.
*/
unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace;
static unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace;
#else
#define sysctl_hung_task_all_cpu_backtrace 0
#endif /* CONFIG_SMP */
/*
@ -222,11 +224,13 @@ static long hung_timeout_jiffies(unsigned long last_checked,
MAX_SCHEDULE_TIMEOUT;
}
#ifdef CONFIG_SYSCTL
/*
* Process updating of timeout sysctl
*/
int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
void *buffer, size_t *lenp, loff_t *ppos)
static int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
void __user *buffer,
size_t *lenp, loff_t *ppos)
{
int ret;
@ -241,6 +245,76 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
return ret;
}
/*
* This is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs
* and hung_task_check_interval_secs
*/
static const unsigned long hung_task_timeout_max = (LONG_MAX / HZ);
static struct ctl_table hung_task_sysctls[] = {
#ifdef CONFIG_SMP
{
.procname = "hung_task_all_cpu_backtrace",
.data = &sysctl_hung_task_all_cpu_backtrace,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
#endif /* CONFIG_SMP */
{
.procname = "hung_task_panic",
.data = &sysctl_hung_task_panic,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
.extra2 = SYSCTL_ONE,
},
{
.procname = "hung_task_check_count",
.data = &sysctl_hung_task_check_count,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_ZERO,
},
{
.procname = "hung_task_timeout_secs",
.data = &sysctl_hung_task_timeout_secs,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_dohung_task_timeout_secs,
.extra2 = (void *)&hung_task_timeout_max,
},
{
.procname = "hung_task_check_interval_secs",
.data = &sysctl_hung_task_check_interval_secs,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_dohung_task_timeout_secs,
.extra2 = (void *)&hung_task_timeout_max,
},
{
.procname = "hung_task_warnings",
.data = &sysctl_hung_task_warnings,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = SYSCTL_NEG_ONE,
},
{}
};
static void __init hung_task_sysctl_init(void)
{
register_sysctl_init("kernel", hung_task_sysctls);
}
#else
#define hung_task_sysctl_init() do { } while (0)
#endif /* CONFIG_SYSCTL */
static atomic_t reset_hung_task = ATOMIC_INIT(0);
void reset_hung_task_detector(void)
@ -310,6 +384,7 @@ static int __init hung_task_init(void)
pm_notifier(hungtask_pm_notify, 0);
watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");
hung_task_sysctl_init();
return 0;
}

View File

@ -97,9 +97,6 @@ config GENERIC_MSI_IRQ_DOMAIN
config IRQ_MSI_IOMMU
bool
config HANDLE_DOMAIN_IRQ
bool
config IRQ_TIMINGS
bool
@ -144,3 +141,10 @@ config GENERIC_IRQ_MULTI_HANDLER
bool
help
Allow to specify the low level IRQ handler at run time.
# Cavium Octeon is the last system to use this deprecated option
# Do not even think of enabling this on any new platform
config DEPRECATED_IRQ_CPU_ONOFFLINE
bool
depends on CAVIUM_OCTEON_SOC
default CAVIUM_OCTEON_SOC

View File

@ -575,8 +575,6 @@ EXPORT_SYMBOL_GPL(handle_simple_irq);
*/
void handle_untracked_irq(struct irq_desc *desc)
{
unsigned int flags = 0;
raw_spin_lock(&desc->lock);
if (!irq_may_run(desc))
@ -593,7 +591,7 @@ void handle_untracked_irq(struct irq_desc *desc)
irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
raw_spin_unlock(&desc->lock);
__handle_irq_event_percpu(desc, &flags);
__handle_irq_event_percpu(desc);
raw_spin_lock(&desc->lock);
irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
@ -1122,6 +1120,7 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
}
EXPORT_SYMBOL_GPL(irq_modify_status);
#ifdef CONFIG_DEPRECATED_IRQ_CPU_ONOFFLINE
/**
* irq_cpu_online - Invoke all irq_cpu_online functions.
*
@ -1181,6 +1180,7 @@ void irq_cpu_offline(void)
raw_spin_unlock_irqrestore(&desc->lock, flags);
}
}
#endif
#ifdef CONFIG_IRQ_DOMAIN_HIERARCHY

View File

@ -25,6 +25,7 @@ static DEFINE_RAW_SPINLOCK(gc_lock);
void irq_gc_noop(struct irq_data *d)
{
}
EXPORT_SYMBOL_GPL(irq_gc_noop);
/**
* irq_gc_mask_disable_reg - Mask chip via disable register
@ -44,6 +45,7 @@ void irq_gc_mask_disable_reg(struct irq_data *d)
*ct->mask_cache &= ~mask;
irq_gc_unlock(gc);
}
EXPORT_SYMBOL_GPL(irq_gc_mask_disable_reg);
/**
* irq_gc_mask_set_bit - Mask chip via setting bit in mask register
@ -103,6 +105,7 @@ void irq_gc_unmask_enable_reg(struct irq_data *d)
*ct->mask_cache |= mask;
irq_gc_unlock(gc);
}
EXPORT_SYMBOL_GPL(irq_gc_unmask_enable_reg);
/**
* irq_gc_ack_set_bit - Ack pending interrupt via setting bit
@ -448,7 +451,7 @@ static void irq_unmap_generic_chip(struct irq_domain *d, unsigned int virq)
}
struct irq_domain_ops irq_generic_chip_ops = {
const struct irq_domain_ops irq_generic_chip_ops = {
.map = irq_map_generic_chip,
.unmap = irq_unmap_generic_chip,
.xlate = irq_domain_xlate_onetwocell,

View File

@ -14,6 +14,8 @@
#include <linux/interrupt.h>
#include <linux/kernel_stat.h>
#include <asm/irq_regs.h>
#include <trace/events/irq.h>
#include "internals.h"
@ -134,7 +136,7 @@ void __irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
wake_up_process(action->thread);
}
irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags)
irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc)
{
irqreturn_t retval = IRQ_NONE;
unsigned int irq = desc->irq_data.irq;
@ -172,10 +174,6 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags
}
__irq_wake_thread(desc, action);
fallthrough; /* to add to randomness */
case IRQ_HANDLED:
*flags |= action->flags;
break;
default:
@ -191,11 +189,10 @@ irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags
irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
{
irqreturn_t retval;
unsigned int flags = 0;
retval = __handle_irq_event_percpu(desc, &flags);
retval = __handle_irq_event_percpu(desc);
add_interrupt_randomness(desc->irq_data.irq, flags);
add_interrupt_randomness(desc->irq_data.irq);
if (!irq_settings_no_debug(desc))
note_interrupt(desc, retval);
@ -226,4 +223,20 @@ int __init set_handle_irq(void (*handle_irq)(struct pt_regs *))
handle_arch_irq = handle_irq;
return 0;
}
/**
* generic_handle_arch_irq - root irq handler for architectures which do no
* entry accounting themselves
* @regs: Register file coming from the low-level handling code
*/
asmlinkage void noinstr generic_handle_arch_irq(struct pt_regs *regs)
{
struct pt_regs *old_regs;
irq_enter();
old_regs = set_irq_regs(regs);
handle_arch_irq(regs);
set_irq_regs(old_regs);
irq_exit();
}
#endif
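For context, a minimal sketch of how a root interrupt-controller driver might sit on top of this path; the controller name, register offset and MMIO base below are hypothetical and not part of this commit:

/* Hypothetical root controller built on GENERIC_IRQ_MULTI_HANDLER. */
#include <linux/init.h>
#include <linux/io.h>
#include <linux/irq.h>
#include <linux/irqdesc.h>
#include <linux/irqdomain.h>

static void __iomem *demo_intc_base;          /* assumed to be ioremap()ed elsewhere */
static struct irq_domain *demo_intc_domain;   /* assumed to be created elsewhere */

/*
 * Installed via set_handle_irq(); the low-level arch entry code reaches it
 * through generic_handle_arch_irq(), which performs the irq_enter()/irq_exit()
 * accounting shown above.
 */
static void demo_intc_handle_irq(struct pt_regs *regs)
{
        u32 hwirq = readl_relaxed(demo_intc_base + 0x10);  /* hypothetical "pending" register */

        generic_handle_domain_irq(demo_intc_domain, hwirq);
}

static int __init demo_intc_init(void)
{
        return set_handle_irq(demo_intc_handle_irq);
}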

View File

@ -103,7 +103,7 @@ extern int __irq_get_irqchip_state(struct irq_data *data,
extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc, unsigned int *flags);
irqreturn_t __handle_irq_event_percpu(struct irq_desc *desc);
irqreturn_t handle_irq_event_percpu(struct irq_desc *desc);
irqreturn_t handle_irq_event(struct irq_desc *desc);

View File

@ -646,12 +646,15 @@ int handle_irq_desc(struct irq_desc *desc)
generic_handle_irq_desc(desc);
return 0;
}
EXPORT_SYMBOL_GPL(handle_irq_desc);
/**
* generic_handle_irq - Invoke the handler for a particular irq
* @irq: The irq number to handle
*
* Returns: 0 on success, or -EINVAL if conversion has failed
*
* This function must be called from an IRQ context with irq regs
* initialized.
*/
int generic_handle_irq(unsigned int irq)
{
@ -662,89 +665,39 @@ EXPORT_SYMBOL_GPL(generic_handle_irq);
#ifdef CONFIG_IRQ_DOMAIN
/**
* generic_handle_domain_irq - Invoke the handler for a HW irq belonging
* to a domain, usually for a non-root interrupt
* controller
* to a domain.
* @domain: The domain where to perform the lookup
* @hwirq: The HW irq number to convert to a logical one
*
* Returns: 0 on success, or -EINVAL if conversion has failed
*
* This function must be called from an IRQ context with irq regs
* initialized.
*/
int generic_handle_domain_irq(struct irq_domain *domain, unsigned int hwirq)
{
WARN_ON_ONCE(!in_irq());
return handle_irq_desc(irq_resolve_mapping(domain, hwirq));
}
EXPORT_SYMBOL_GPL(generic_handle_domain_irq);
#ifdef CONFIG_HANDLE_DOMAIN_IRQ
/**
* handle_domain_irq - Invoke the handler for a HW irq belonging to a domain,
* usually for a root interrupt controller
* generic_handle_domain_nmi - Invoke the handler for a HW nmi belonging
* to a domain.
* @domain: The domain where to perform the lookup
* @hwirq: The HW irq number to convert to a logical one
* @regs: Register file coming from the low-level handling code
*
* Returns: 0 on success, or -EINVAL if conversion has failed
*/
int handle_domain_irq(struct irq_domain *domain,
unsigned int hwirq, struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);
struct irq_desc *desc;
int ret = 0;
irq_enter();
/* The irqdomain code provides boundary checks */
desc = irq_resolve_mapping(domain, hwirq);
if (likely(desc))
handle_irq_desc(desc);
else
ret = -EINVAL;
irq_exit();
set_irq_regs(old_regs);
return ret;
}
/**
* handle_domain_nmi - Invoke the handler for a HW irq belonging to a domain
* @domain: The domain where to perform the lookup
* @hwirq: The HW irq number to convert to a logical one
* @regs: Register file coming from the low-level handling code
*
* This function must be called from an NMI context.
*
* Returns: 0 on success, or -EINVAL if conversion has failed
*/
int handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq,
struct pt_regs *regs)
* This function must be called from an NMI context with irq regs
* initialized.
 */
int generic_handle_domain_nmi(struct irq_domain *domain, unsigned int hwirq)
{
struct pt_regs *old_regs = set_irq_regs(regs);
struct irq_desc *desc;
int ret = 0;
/*
* NMI context needs to be setup earlier in order to deal with tracing.
*/
WARN_ON(!in_nmi());
desc = irq_resolve_mapping(domain, hwirq);
/*
* ack_bad_irq is not NMI-safe, just report
* an invalid interrupt.
*/
if (likely(desc))
handle_irq_desc(desc);
else
ret = -EINVAL;
set_irq_regs(old_regs);
return ret;
WARN_ON_ONCE(!in_nmi());
return handle_irq_desc(irq_resolve_mapping(domain, hwirq));
}
#endif
#endif
/* Dynamic interrupt handling */
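As a rough illustration of the new calling convention (hard IRQ context, irq regs already set up by the parent flow), a secondary controller's chained handler could look like the sketch below; the structure and register offset are made up for the example:

#include <linux/bitops.h>
#include <linux/io.h>
#include <linux/irq.h>
#include <linux/irqchip/chained_irq.h>
#include <linux/irqdomain.h>

struct demo_intc {                    /* hypothetical driver state */
        void __iomem *base;
        struct irq_domain *domain;
};

static void demo_chained_handler(struct irq_desc *desc)
{
        struct irq_chip *chip = irq_desc_get_chip(desc);
        struct demo_intc *intc = irq_desc_get_handler_data(desc);
        unsigned long pending;
        unsigned int hwirq;

        chained_irq_enter(chip, desc);

        pending = readl_relaxed(intc->base + 0x04);   /* hypothetical status register */
        for_each_set_bit(hwirq, &pending, 32)
                generic_handle_domain_irq(intc->domain, hwirq);

        chained_irq_exit(chip, desc);
}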

View File

@ -744,9 +744,8 @@ static int irq_domain_translate(struct irq_domain *d,
return 0;
}
static void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
unsigned int count,
struct irq_fwspec *fwspec)
void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
unsigned int count, struct irq_fwspec *fwspec)
{
int i;
@ -756,6 +755,7 @@ static void of_phandle_args_to_fwspec(struct device_node *np, const u32 *args,
for (i = 0; i < count; i++)
fwspec->param[i] = args[i];
}
EXPORT_SYMBOL_GPL(of_phandle_args_to_fwspec);
unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
{
@ -1502,6 +1502,7 @@ int __irq_domain_alloc_irqs(struct irq_domain *domain, int irq_base,
irq_free_descs(virq, nr_irqs);
return ret;
}
EXPORT_SYMBOL_GPL(__irq_domain_alloc_irqs);
/* The irq_data was moved, fix the revmap to refer to the new location */
static void irq_domain_fix_revmap(struct irq_data *d)

View File

@ -486,7 +486,8 @@ int irq_force_affinity(unsigned int irq, const struct cpumask *cpumask)
}
EXPORT_SYMBOL_GPL(irq_force_affinity);
int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
int __irq_apply_affinity_hint(unsigned int irq, const struct cpumask *m,
bool setaffinity)
{
unsigned long flags;
struct irq_desc *desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_GLOBAL);
@ -495,12 +496,11 @@ int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
return -EINVAL;
desc->affinity_hint = m;
irq_put_desc_unlock(desc, flags);
/* set the initial affinity to prevent every interrupt being on CPU0 */
if (m)
if (m && setaffinity)
__irq_set_affinity(irq, m, false);
return 0;
}
EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
EXPORT_SYMBOL_GPL(__irq_apply_affinity_hint);
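Driver-facing use of the setaffinity distinction goes through the wrappers added alongside this change in <linux/interrupt.h> (irq_update_affinity_hint() passes false, irq_set_affinity_and_hint() passes true); a small sketch, assuming those wrappers are present in this tree:

#include <linux/cpumask.h>
#include <linux/interrupt.h>

/* Publish a hint for irqbalance without forcing the affinity right away. */
static int demo_queue_set_hint(unsigned int irq, int cpu)
{
        /* Ends up in __irq_apply_affinity_hint(irq, mask, false). */
        return irq_update_affinity_hint(irq, cpumask_of(cpu));
}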
static void irq_affinity_notify(struct work_struct *work)
{
@ -1259,6 +1259,8 @@ static int irq_thread(void *data)
irqreturn_t (*handler_fn)(struct irq_desc *desc,
struct irqaction *action);
sched_set_fifo(current);
if (force_irqthreads() && test_bit(IRQTF_FORCED_THREAD,
&action->thread_flags))
handler_fn = irq_forced_thread_fn;
@ -1424,8 +1426,6 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary)
if (IS_ERR(t))
return PTR_ERR(t);
sched_set_fifo(t);
/*
* We keep the reference to the task struct even if
* the thread dies to avoid that the interrupt code
@ -2827,7 +2827,7 @@ EXPORT_SYMBOL_GPL(irq_get_irqchip_state);
* This call sets the internal irqchip state of an interrupt,
* depending on the value of @which.
*
* This function should be called with preemption disabled if the
* This function should be called with migration disabled if the
* interrupt controller has per-cpu registers.
*/
int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which,

View File

@ -14,12 +14,15 @@
#include <linux/irqdomain.h>
#include <linux/msi.h>
#include <linux/slab.h>
#include <linux/sysfs.h>
#include <linux/pci.h>
#include "internals.h"
static inline int msi_sysfs_create_group(struct device *dev);
/**
* alloc_msi_entry - Allocate an initialized msi_desc
* msi_alloc_desc - Allocate an initialized msi_desc
* @dev: Pointer to the device for which this is allocated
* @nvec: The number of vectors used in this entry
* @affinity: Optional pointer to an affinity mask array size of @nvec
@ -29,34 +32,134 @@
*
* Return: pointer to allocated &msi_desc on success or %NULL on failure
*/
struct msi_desc *alloc_msi_entry(struct device *dev, int nvec,
static struct msi_desc *msi_alloc_desc(struct device *dev, int nvec,
const struct irq_affinity_desc *affinity)
{
struct msi_desc *desc;
struct msi_desc *desc = kzalloc(sizeof(*desc), GFP_KERNEL);
desc = kzalloc(sizeof(*desc), GFP_KERNEL);
if (!desc)
return NULL;
INIT_LIST_HEAD(&desc->list);
desc->dev = dev;
desc->nvec_used = nvec;
if (affinity) {
desc->affinity = kmemdup(affinity,
nvec * sizeof(*desc->affinity), GFP_KERNEL);
desc->affinity = kmemdup(affinity, nvec * sizeof(*desc->affinity), GFP_KERNEL);
if (!desc->affinity) {
kfree(desc);
return NULL;
}
}
return desc;
}
void free_msi_entry(struct msi_desc *entry)
static void msi_free_desc(struct msi_desc *desc)
{
kfree(entry->affinity);
kfree(entry);
kfree(desc->affinity);
kfree(desc);
}
static int msi_insert_desc(struct msi_device_data *md, struct msi_desc *desc, unsigned int index)
{
int ret;
desc->msi_index = index;
ret = xa_insert(&md->__store, index, desc, GFP_KERNEL);
if (ret)
msi_free_desc(desc);
return ret;
}
/**
* msi_add_msi_desc - Allocate and initialize a MSI descriptor
* @dev: Pointer to the device for which the descriptor is allocated
* @init_desc: Pointer to an MSI descriptor to initialize the new descriptor
*
* Return: 0 on success or an appropriate failure code.
*/
int msi_add_msi_desc(struct device *dev, struct msi_desc *init_desc)
{
struct msi_desc *desc;
lockdep_assert_held(&dev->msi.data->mutex);
desc = msi_alloc_desc(dev, init_desc->nvec_used, init_desc->affinity);
if (!desc)
return -ENOMEM;
/* Copy type specific data to the new descriptor. */
desc->pci = init_desc->pci;
return msi_insert_desc(dev->msi.data, desc, init_desc->msi_index);
}
/**
* msi_add_simple_msi_descs - Allocate and initialize MSI descriptors
* @dev: Pointer to the device for which the descriptors are allocated
* @index: Index for the first MSI descriptor
* @ndesc: Number of descriptors to allocate
*
* Return: 0 on success or an appropriate failure code.
*/
static int msi_add_simple_msi_descs(struct device *dev, unsigned int index, unsigned int ndesc)
{
unsigned int idx, last = index + ndesc - 1;
struct msi_desc *desc;
int ret;
lockdep_assert_held(&dev->msi.data->mutex);
for (idx = index; idx <= last; idx++) {
desc = msi_alloc_desc(dev, 1, NULL);
if (!desc)
goto fail_mem;
ret = msi_insert_desc(dev->msi.data, desc, idx);
if (ret)
goto fail;
}
return 0;
fail_mem:
ret = -ENOMEM;
fail:
msi_free_msi_descs_range(dev, MSI_DESC_NOTASSOCIATED, index, last);
return ret;
}
static bool msi_desc_match(struct msi_desc *desc, enum msi_desc_filter filter)
{
switch (filter) {
case MSI_DESC_ALL:
return true;
case MSI_DESC_NOTASSOCIATED:
return !desc->irq;
case MSI_DESC_ASSOCIATED:
return !!desc->irq;
}
WARN_ON_ONCE(1);
return false;
}
/**
* msi_free_msi_descs_range - Free MSI descriptors of a device
* @dev: Device to free the descriptors
* @filter: Descriptor state filter
* @first_index: Index to start freeing from
* @last_index: Last index to be freed
*/
void msi_free_msi_descs_range(struct device *dev, enum msi_desc_filter filter,
unsigned int first_index, unsigned int last_index)
{
struct xarray *xa = &dev->msi.data->__store;
struct msi_desc *desc;
unsigned long idx;
lockdep_assert_held(&dev->msi.data->mutex);
xa_for_each_range(xa, idx, desc, first_index, last_index) {
if (msi_desc_match(desc, filter)) {
xa_erase(xa, idx);
msi_free_desc(desc);
}
}
}
void __get_cached_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
@ -72,138 +175,289 @@ void get_cached_msi_msg(unsigned int irq, struct msi_msg *msg)
}
EXPORT_SYMBOL_GPL(get_cached_msi_msg);
static void msi_device_data_release(struct device *dev, void *res)
{
struct msi_device_data *md = res;
WARN_ON_ONCE(!xa_empty(&md->__store));
xa_destroy(&md->__store);
dev->msi.data = NULL;
}
/**
* msi_setup_device_data - Setup MSI device data
* @dev: Device for which MSI device data should be set up
*
* Return: 0 on success, appropriate error code otherwise
*
* This can be called more than once for @dev. If the MSI device data is
* already allocated the call succeeds. The allocated memory is
* automatically released when the device is destroyed.
*/
int msi_setup_device_data(struct device *dev)
{
struct msi_device_data *md;
int ret;
if (dev->msi.data)
return 0;
md = devres_alloc(msi_device_data_release, sizeof(*md), GFP_KERNEL);
if (!md)
return -ENOMEM;
ret = msi_sysfs_create_group(dev);
if (ret) {
devres_free(md);
return ret;
}
xa_init(&md->__store);
mutex_init(&md->mutex);
dev->msi.data = md;
devres_add(dev, md);
return 0;
}
/**
* msi_lock_descs - Lock the MSI descriptor storage of a device
* @dev: Device to operate on
*/
void msi_lock_descs(struct device *dev)
{
mutex_lock(&dev->msi.data->mutex);
}
EXPORT_SYMBOL_GPL(msi_lock_descs);
/**
* msi_unlock_descs - Unlock the MSI descriptor storage of a device
* @dev: Device to operate on
*/
void msi_unlock_descs(struct device *dev)
{
/* Invalidate the index which was cached by the iterator */
dev->msi.data->__iter_idx = MSI_MAX_INDEX;
mutex_unlock(&dev->msi.data->mutex);
}
EXPORT_SYMBOL_GPL(msi_unlock_descs);
static struct msi_desc *msi_find_desc(struct msi_device_data *md, enum msi_desc_filter filter)
{
struct msi_desc *desc;
xa_for_each_start(&md->__store, md->__iter_idx, desc, md->__iter_idx) {
if (msi_desc_match(desc, filter))
return desc;
}
md->__iter_idx = MSI_MAX_INDEX;
return NULL;
}
/**
* msi_first_desc - Get the first MSI descriptor of a device
* @dev: Device to operate on
* @filter: Descriptor state filter
*
* Must be called with the MSI descriptor mutex held, i.e. msi_lock_descs()
* must be invoked before the call.
*
* Return: Pointer to the first MSI descriptor matching the search
* criteria, NULL if none found.
*/
struct msi_desc *msi_first_desc(struct device *dev, enum msi_desc_filter filter)
{
struct msi_device_data *md = dev->msi.data;
if (WARN_ON_ONCE(!md))
return NULL;
lockdep_assert_held(&md->mutex);
md->__iter_idx = 0;
return msi_find_desc(md, filter);
}
EXPORT_SYMBOL_GPL(msi_first_desc);
/**
* msi_next_desc - Get the next MSI descriptor of a device
* @dev: Device to operate on
*
* The first invocation of msi_next_desc() has to be preceded by a
* successful invocation of msi_first_desc(). Consecutive invocations are
* only valid if the previous one was successful. All these operations have
* to be done within the same MSI mutex held region.
*
* Return: Pointer to the next MSI descriptor matching the search
* criteria, NULL if none found.
*/
struct msi_desc *msi_next_desc(struct device *dev, enum msi_desc_filter filter)
{
struct msi_device_data *md = dev->msi.data;
if (WARN_ON_ONCE(!md))
return NULL;
lockdep_assert_held(&md->mutex);
if (md->__iter_idx >= (unsigned long)MSI_MAX_INDEX)
return NULL;
md->__iter_idx++;
return msi_find_desc(md, filter);
}
EXPORT_SYMBOL_GPL(msi_next_desc);
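The intended iteration pattern built on these helpers is the one the msi_for_each_desc() users later in this commit follow; a sketch, assuming msi_setup_device_data() has already run for the device:

#include <linux/device.h>
#include <linux/msi.h>

/* Walk every descriptor that already has a Linux interrupt attached. */
static void demo_list_active_msi(struct device *dev)
{
        struct msi_desc *desc;

        msi_lock_descs(dev);
        msi_for_each_desc(desc, dev, MSI_DESC_ASSOCIATED)
                dev_info(dev, "MSI index %u -> irq %u\n", desc->msi_index, desc->irq);
        msi_unlock_descs(dev);
}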
/**
* msi_get_virq - Return Linux interrupt number of a MSI interrupt
* @dev: Device to operate on
* @index: MSI interrupt index to look for (0-based)
*
* Return: The Linux interrupt number on success (> 0), 0 if not found
*/
unsigned int msi_get_virq(struct device *dev, unsigned int index)
{
struct msi_desc *desc;
unsigned int ret = 0;
bool pcimsi;
if (!dev->msi.data)
return 0;
pcimsi = dev_is_pci(dev) ? to_pci_dev(dev)->msi_enabled : false;
msi_lock_descs(dev);
desc = xa_load(&dev->msi.data->__store, pcimsi ? 0 : index);
if (desc && desc->irq) {
/*
* PCI-MSI has only one descriptor for multiple interrupts.
* PCI-MSIX and platform MSI use a descriptor per
* interrupt.
*/
if (pcimsi) {
if (index < desc->nvec_used)
ret = desc->irq + index;
} else {
ret = desc->irq;
}
}
msi_unlock_descs(dev);
return ret;
}
EXPORT_SYMBOL_GPL(msi_get_virq);
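For illustration only, a consumer could translate an MSI index into a Linux irq and request it as below; the handler and name are placeholders:

#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/msi.h>

static irqreturn_t demo_msi_handler(int irq, void *data)
{
        return IRQ_HANDLED;
}

static int demo_request_first_msi(struct device *dev)
{
        unsigned int virq = msi_get_virq(dev, 0);     /* 0-based MSI index */

        if (!virq)
                return -ENOENT;                       /* nothing allocated at that index */

        return request_irq(virq, demo_msi_handler, 0, "demo-msi", dev);
}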
#ifdef CONFIG_SYSFS
static struct attribute *msi_dev_attrs[] = {
NULL
};
static const struct attribute_group msi_irqs_group = {
.name = "msi_irqs",
.attrs = msi_dev_attrs,
};
static inline int msi_sysfs_create_group(struct device *dev)
{
return devm_device_add_group(dev, &msi_irqs_group);
}
static ssize_t msi_mode_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct msi_desc *entry;
bool is_msix = false;
unsigned long irq;
int retval;
retval = kstrtoul(attr->attr.name, 10, &irq);
if (retval)
return retval;
entry = irq_get_msi_desc(irq);
if (!entry)
return -ENODEV;
if (dev_is_pci(dev))
is_msix = entry->msi_attrib.is_msix;
/* MSI vs. MSIX is per device not per interrupt */
bool is_msix = dev_is_pci(dev) ? to_pci_dev(dev)->msix_enabled : false;
return sysfs_emit(buf, "%s\n", is_msix ? "msix" : "msi");
}
/**
* msi_populate_sysfs - Populate msi_irqs sysfs entries for devices
* @dev: The device (PCI, platform etc.) which will get sysfs entries
*
* Return attribute_group ** so that the specific bus MSI code can save it
* somewhere while initializing MSI irqs. If the device has no MSI irqs,
* return NULL; if it fails to populate sysfs, return ERR_PTR
*/
const struct attribute_group **msi_populate_sysfs(struct device *dev)
static void msi_sysfs_remove_desc(struct device *dev, struct msi_desc *desc)
{
const struct attribute_group **msi_irq_groups;
struct attribute **msi_attrs, *msi_attr;
struct device_attribute *msi_dev_attr;
struct attribute_group *msi_irq_group;
struct msi_desc *entry;
int ret = -ENOMEM;
int num_msi = 0;
int count = 0;
struct device_attribute *attrs = desc->sysfs_attrs;
int i;
/* Determine how many msi entries we have */
for_each_msi_entry(entry, dev)
num_msi += entry->nvec_used;
if (!num_msi)
return NULL;
if (!attrs)
return;
/* Dynamically create the MSI attributes for the device */
msi_attrs = kcalloc(num_msi + 1, sizeof(void *), GFP_KERNEL);
if (!msi_attrs)
return ERR_PTR(-ENOMEM);
for_each_msi_entry(entry, dev) {
for (i = 0; i < entry->nvec_used; i++) {
msi_dev_attr = kzalloc(sizeof(*msi_dev_attr), GFP_KERNEL);
if (!msi_dev_attr)
goto error_attrs;
msi_attrs[count] = &msi_dev_attr->attr;
sysfs_attr_init(&msi_dev_attr->attr);
msi_dev_attr->attr.name = kasprintf(GFP_KERNEL, "%d",
entry->irq + i);
if (!msi_dev_attr->attr.name)
goto error_attrs;
msi_dev_attr->attr.mode = 0444;
msi_dev_attr->show = msi_mode_show;
++count;
desc->sysfs_attrs = NULL;
for (i = 0; i < desc->nvec_used; i++) {
if (attrs[i].show)
sysfs_remove_file_from_group(&dev->kobj, &attrs[i].attr, msi_irqs_group.name);
kfree(attrs[i].attr.name);
}
kfree(attrs);
}
msi_irq_group = kzalloc(sizeof(*msi_irq_group), GFP_KERNEL);
if (!msi_irq_group)
goto error_attrs;
msi_irq_group->name = "msi_irqs";
msi_irq_group->attrs = msi_attrs;
static int msi_sysfs_populate_desc(struct device *dev, struct msi_desc *desc)
{
struct device_attribute *attrs;
int ret, i;
msi_irq_groups = kcalloc(2, sizeof(void *), GFP_KERNEL);
if (!msi_irq_groups)
goto error_irq_group;
msi_irq_groups[0] = msi_irq_group;
attrs = kcalloc(desc->nvec_used, sizeof(*attrs), GFP_KERNEL);
if (!attrs)
return -ENOMEM;
ret = sysfs_create_groups(&dev->kobj, msi_irq_groups);
desc->sysfs_attrs = attrs;
for (i = 0; i < desc->nvec_used; i++) {
sysfs_attr_init(&attrs[i].attr);
attrs[i].attr.name = kasprintf(GFP_KERNEL, "%d", desc->irq + i);
if (!attrs[i].attr.name) {
ret = -ENOMEM;
goto fail;
}
attrs[i].attr.mode = 0444;
attrs[i].show = msi_mode_show;
ret = sysfs_add_file_to_group(&dev->kobj, &attrs[i].attr, msi_irqs_group.name);
if (ret) {
attrs[i].show = NULL;
goto fail;
}
}
return 0;
fail:
msi_sysfs_remove_desc(dev, desc);
return ret;
}
#ifdef CONFIG_PCI_MSI_ARCH_FALLBACKS
/**
* msi_device_populate_sysfs - Populate msi_irqs sysfs entries for a device
* @dev: The device (PCI, platform etc) which will get sysfs entries
*/
int msi_device_populate_sysfs(struct device *dev)
{
struct msi_desc *desc;
int ret;
msi_for_each_desc(desc, dev, MSI_DESC_ASSOCIATED) {
if (desc->sysfs_attrs)
continue;
ret = msi_sysfs_populate_desc(dev, desc);
if (ret)
goto error_irq_groups;
return msi_irq_groups;
error_irq_groups:
kfree(msi_irq_groups);
error_irq_group:
kfree(msi_irq_group);
error_attrs:
count = 0;
msi_attr = msi_attrs[count];
while (msi_attr) {
msi_dev_attr = container_of(msi_attr, struct device_attribute, attr);
kfree(msi_attr->name);
kfree(msi_dev_attr);
++count;
msi_attr = msi_attrs[count];
return ret;
}
kfree(msi_attrs);
return ERR_PTR(ret);
return 0;
}
/**
* msi_destroy_sysfs - Destroy msi_irqs sysfs entries for devices
* @dev: The device (PCI, platform etc.) whose sysfs entries will be removed
* @msi_irq_groups: attribute_group for device msi_irqs entries
* msi_device_destroy_sysfs - Destroy msi_irqs sysfs entries for a device
* @dev: The device (PCI, platform etc) for which to remove
* sysfs entries
*/
void msi_destroy_sysfs(struct device *dev, const struct attribute_group **msi_irq_groups)
void msi_device_destroy_sysfs(struct device *dev)
{
struct device_attribute *dev_attr;
struct attribute **msi_attrs;
int count = 0;
struct msi_desc *desc;
if (msi_irq_groups) {
sysfs_remove_groups(&dev->kobj, msi_irq_groups);
msi_attrs = msi_irq_groups[0]->attrs;
while (msi_attrs[count]) {
dev_attr = container_of(msi_attrs[count],
struct device_attribute, attr);
kfree(dev_attr->attr.name);
kfree(dev_attr);
++count;
}
kfree(msi_attrs);
kfree(msi_irq_groups[0]);
kfree(msi_irq_groups);
}
msi_for_each_desc(desc, dev, MSI_DESC_ALL)
msi_sysfs_remove_desc(dev, desc);
}
#endif /* CONFIG_PCI_MSI_ARCH_FALLBACK */
#else /* CONFIG_SYSFS */
static inline int msi_sysfs_create_group(struct device *dev) { return 0; }
static inline int msi_sysfs_populate_desc(struct device *dev, struct msi_desc *desc) { return 0; }
static inline void msi_sysfs_remove_desc(struct device *dev, struct msi_desc *desc) { }
#endif /* !CONFIG_SYSFS */
#ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN
static inline void irq_chip_write_msi_msg(struct irq_data *data,
@ -456,43 +710,38 @@ int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev,
}
int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev,
int virq, int nvec, msi_alloc_info_t *arg)
int virq_base, int nvec, msi_alloc_info_t *arg)
{
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
struct msi_desc *desc;
int ret = 0;
int ret, virq;
for_each_msi_entry(desc, dev) {
/* Don't even try the multi-MSI brain damage. */
if (WARN_ON(!desc->irq || desc->nvec_used != 1)) {
ret = -EINVAL;
break;
}
msi_lock_descs(dev);
ret = msi_add_simple_msi_descs(dev, virq_base, nvec);
if (ret)
goto unlock;
if (!(desc->irq >= virq && desc->irq < (virq + nvec)))
continue;
for (virq = virq_base; virq < virq_base + nvec; virq++) {
desc = xa_load(&dev->msi.data->__store, virq);
desc->irq = virq;
ops->set_desc(arg, desc);
/* Assumes the domain mutex is held! */
ret = irq_domain_alloc_irqs_hierarchy(domain, desc->irq, 1,
arg);
ret = irq_domain_alloc_irqs_hierarchy(domain, virq, 1, arg);
if (ret)
break;
goto fail;
irq_set_msi_desc_off(desc->irq, 0, desc);
}
if (ret) {
/* Mop up the damage */
for_each_msi_entry(desc, dev) {
if (!(desc->irq >= virq && desc->irq < (virq + nvec)))
continue;
irq_domain_free_irqs_common(domain, desc->irq, 1);
}
irq_set_msi_desc(virq, desc);
}
msi_unlock_descs(dev);
return 0;
fail:
for (--virq; virq >= virq_base; virq--)
irq_domain_free_irqs_common(domain, virq, 1);
msi_free_msi_descs_range(dev, MSI_DESC_ALL, virq_base, virq_base + nvec - 1);
unlock:
msi_unlock_descs(dev);
return ret;
}
@ -531,8 +780,59 @@ static bool msi_check_reservation_mode(struct irq_domain *domain,
* Checking the first MSI descriptor is sufficient. MSIX supports
* masking and MSI does so when the can_mask attribute is set.
*/
desc = first_msi_entry(dev);
return desc->msi_attrib.is_msix || desc->msi_attrib.can_mask;
desc = msi_first_desc(dev, MSI_DESC_ALL);
return desc->pci.msi_attrib.is_msix || desc->pci.msi_attrib.can_mask;
}
static int msi_handle_pci_fail(struct irq_domain *domain, struct msi_desc *desc,
int allocated)
{
switch(domain->bus_token) {
case DOMAIN_BUS_PCI_MSI:
case DOMAIN_BUS_VMD_MSI:
if (IS_ENABLED(CONFIG_PCI_MSI))
break;
fallthrough;
default:
return -ENOSPC;
}
/* Let a failed PCI multi MSI allocation retry */
if (desc->nvec_used > 1)
return 1;
/* If there was a successful allocation let the caller know */
return allocated ? allocated : -ENOSPC;
}
#define VIRQ_CAN_RESERVE 0x01
#define VIRQ_ACTIVATE 0x02
#define VIRQ_NOMASK_QUIRK 0x04
static int msi_init_virq(struct irq_domain *domain, int virq, unsigned int vflags)
{
struct irq_data *irqd = irq_domain_get_irq_data(domain, virq);
int ret;
if (!(vflags & VIRQ_CAN_RESERVE)) {
irqd_clr_can_reserve(irqd);
if (vflags & VIRQ_NOMASK_QUIRK)
irqd_set_msi_nomask_quirk(irqd);
}
if (!(vflags & VIRQ_ACTIVATE))
return 0;
ret = irq_domain_activate_irq(irqd, vflags & VIRQ_CAN_RESERVE);
if (ret)
return ret;
/*
* If the interrupt uses reservation mode, clear the activated bit
* so request_irq() will assign the final vector.
*/
if (vflags & VIRQ_CAN_RESERVE)
irqd_clr_activated(irqd);
return 0;
}
int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
@ -540,83 +840,103 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
{
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
struct irq_data *irq_data;
struct msi_desc *desc;
msi_alloc_info_t arg = { };
unsigned int vflags = 0;
struct msi_desc *desc;
int allocated = 0;
int i, ret, virq;
bool can_reserve;
ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg);
if (ret)
return ret;
for_each_msi_entry(desc, dev) {
ops->set_desc(&arg, desc);
virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used,
dev_to_node(dev), &arg, false,
desc->affinity);
if (virq < 0) {
ret = -ENOSPC;
if (ops->handle_error)
ret = ops->handle_error(domain, desc, ret);
if (ops->msi_finish)
ops->msi_finish(&arg, ret);
return ret;
}
for (i = 0; i < desc->nvec_used; i++) {
irq_set_msi_desc_off(virq, i, desc);
irq_debugfs_copy_devname(virq + i, dev);
}
}
if (ops->msi_finish)
ops->msi_finish(&arg, 0);
can_reserve = msi_check_reservation_mode(domain, info, dev);
/*
* This flag is set by the PCI layer as we need to activate
* the MSI entries before the PCI layer enables MSI in the
* card. Otherwise the card latches a random msi message.
*/
if (!(info->flags & MSI_FLAG_ACTIVATE_EARLY))
goto skip_activate;
if (info->flags & MSI_FLAG_ACTIVATE_EARLY)
vflags |= VIRQ_ACTIVATE;
for_each_msi_vector(desc, i, dev) {
if (desc->irq == i) {
virq = desc->irq;
dev_dbg(dev, "irq [%d-%d] for MSI\n",
virq, virq + desc->nvec_used - 1);
}
irq_data = irq_domain_get_irq_data(domain, i);
if (!can_reserve) {
irqd_clr_can_reserve(irq_data);
if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK)
irqd_set_msi_nomask_quirk(irq_data);
}
ret = irq_domain_activate_irq(irq_data, can_reserve);
if (ret)
goto cleanup;
}
skip_activate:
/*
* If these interrupts use reservation mode, clear the activated bit
* so request_irq() will assign the final vector.
* Interrupt can use a reserved vector and will not occupy
* a real device vector until the interrupt is requested.
*/
if (can_reserve) {
for_each_msi_vector(desc, i, dev) {
irq_data = irq_domain_get_irq_data(domain, i);
irqd_clr_activated(irq_data);
if (msi_check_reservation_mode(domain, info, dev)) {
vflags |= VIRQ_CAN_RESERVE;
/*
* MSI affinity setting requires a special quirk (X86) when
* reservation mode is active.
*/
if (domain->flags & IRQ_DOMAIN_MSI_NOMASK_QUIRK)
vflags |= VIRQ_NOMASK_QUIRK;
}
msi_for_each_desc(desc, dev, MSI_DESC_NOTASSOCIATED) {
ops->set_desc(&arg, desc);
virq = __irq_domain_alloc_irqs(domain, -1, desc->nvec_used,
dev_to_node(dev), &arg, false,
desc->affinity);
if (virq < 0)
return msi_handle_pci_fail(domain, desc, allocated);
for (i = 0; i < desc->nvec_used; i++) {
irq_set_msi_desc_off(virq, i, desc);
irq_debugfs_copy_devname(virq + i, dev);
ret = msi_init_virq(domain, virq + i, vflags);
if (ret)
return ret;
}
if (info->flags & MSI_FLAG_DEV_SYSFS) {
ret = msi_sysfs_populate_desc(dev, desc);
if (ret)
return ret;
}
allocated++;
}
return 0;
}
cleanup:
msi_domain_free_irqs(domain, dev);
static int msi_domain_add_simple_msi_descs(struct msi_domain_info *info,
struct device *dev,
unsigned int num_descs)
{
if (!(info->flags & MSI_FLAG_ALLOC_SIMPLE_MSI_DESCS))
return 0;
return msi_add_simple_msi_descs(dev, 0, num_descs);
}
/**
* msi_domain_alloc_irqs_descs_locked - Allocate interrupts from a MSI interrupt domain
* @domain: The domain to allocate from
* @dev: Pointer to device struct of the device for which the interrupts
* are allocated
* @nvec: The number of interrupts to allocate
*
* Must be invoked from within a msi_lock_descs() / msi_unlock_descs()
* pair. Use this for MSI irqdomains which implement their own vector
* allocation/free.
*
* Return: %0 on success or an error code.
*/
int msi_domain_alloc_irqs_descs_locked(struct irq_domain *domain, struct device *dev,
int nvec)
{
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
int ret;
lockdep_assert_held(&dev->msi.data->mutex);
ret = msi_domain_add_simple_msi_descs(info, dev, nvec);
if (ret)
return ret;
ret = ops->domain_alloc_irqs(domain, dev, nvec);
if (ret)
msi_domain_free_irqs_descs_locked(domain, dev);
return ret;
}
@ -629,38 +949,65 @@ int __msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
*
* Return: %0 on success or an error code.
*/
int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev,
int nvec)
int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, int nvec)
{
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
int ret;
return ops->domain_alloc_irqs(domain, dev, nvec);
msi_lock_descs(dev);
ret = msi_domain_alloc_irqs_descs_locked(domain, dev, nvec);
msi_unlock_descs(dev);
return ret;
}
void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
{
struct irq_data *irq_data;
struct msi_domain_info *info = domain->host_data;
struct irq_data *irqd;
struct msi_desc *desc;
int i;
for_each_msi_vector(desc, i, dev) {
irq_data = irq_domain_get_irq_data(domain, i);
if (irqd_is_activated(irq_data))
irq_domain_deactivate_irq(irq_data);
/* Only handle MSI entries which have an interrupt associated */
msi_for_each_desc(desc, dev, MSI_DESC_ASSOCIATED) {
/* Make sure all interrupts are deactivated */
for (i = 0; i < desc->nvec_used; i++) {
irqd = irq_domain_get_irq_data(domain, desc->irq + i);
if (irqd && irqd_is_activated(irqd))
irq_domain_deactivate_irq(irqd);
}
for_each_msi_entry(desc, dev) {
/*
* We might have failed to allocate an MSI early
* enough that there is no IRQ associated to this
* entry. If that's the case, don't do anything.
*/
if (desc->irq) {
irq_domain_free_irqs(desc->irq, desc->nvec_used);
if (info->flags & MSI_FLAG_DEV_SYSFS)
msi_sysfs_remove_desc(dev, desc);
desc->irq = 0;
}
}
static void msi_domain_free_msi_descs(struct msi_domain_info *info,
struct device *dev)
{
if (info->flags & MSI_FLAG_FREE_MSI_DESCS)
msi_free_msi_descs(dev);
}
/**
* msi_domain_free_irqs_descs_locked - Free interrupts from a MSI interrupt @domain associated to @dev
* @domain: The domain managing the interrupts
* @dev: Pointer to device struct of the device for which the interrupts
* are freed
*
* Must be invoked from within a msi_lock_descs() / msi_unlock_descs()
* pair. Use this for MSI irqdomains which implement their own vector
* allocation.
*/
void msi_domain_free_irqs_descs_locked(struct irq_domain *domain, struct device *dev)
{
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
lockdep_assert_held(&dev->msi.data->mutex);
ops->domain_free_irqs(domain, dev);
msi_domain_free_msi_descs(info, dev);
}
/**
@ -671,10 +1018,9 @@ void __msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
*/
void msi_domain_free_irqs(struct irq_domain *domain, struct device *dev)
{
struct msi_domain_info *info = domain->host_data;
struct msi_domain_ops *ops = info->ops;
return ops->domain_free_irqs(domain, dev);
msi_lock_descs(dev);
msi_domain_free_irqs_descs_locked(domain, dev);
msi_unlock_descs(dev);
}
/**

View File

@ -447,6 +447,10 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
static int __init irqfixup_setup(char *str)
{
if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
pr_warn("irqfixup boot option not supported with PREEMPT_RT\n");
return 1;
}
irqfixup = 1;
printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
printk(KERN_WARNING "This may impact system performance.\n");
@ -459,6 +463,10 @@ module_param(irqfixup, int, 0644);
static int __init irqpoll_setup(char *str)
{
if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
pr_warn("irqpoll boot option not supported with PREEMPT_RT\n");
return 1;
}
irqfixup = 2;
printk(KERN_WARNING "Misrouted IRQ fixup and polling support "
"enabled\n");

View File

@ -18,11 +18,36 @@
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/smp.h>
#include <linux/smpboot.h>
#include <asm/processor.h>
#include <linux/kasan.h>
static DEFINE_PER_CPU(struct llist_head, raised_list);
static DEFINE_PER_CPU(struct llist_head, lazy_list);
static DEFINE_PER_CPU(struct task_struct *, irq_workd);
static void wake_irq_workd(void)
{
struct task_struct *tsk = __this_cpu_read(irq_workd);
if (!llist_empty(this_cpu_ptr(&lazy_list)) && tsk)
wake_up_process(tsk);
}
#ifdef CONFIG_SMP
static void irq_work_wake(struct irq_work *entry)
{
wake_irq_workd();
}
static DEFINE_PER_CPU(struct irq_work, irq_work_wakeup) =
IRQ_WORK_INIT_HARD(irq_work_wake);
#endif
static int irq_workd_should_run(unsigned int cpu)
{
return !llist_empty(this_cpu_ptr(&lazy_list));
}
/*
* Claim the entry so that no one else will poke at it.
@ -52,15 +77,29 @@ void __weak arch_irq_work_raise(void)
/* Enqueue on current CPU, work must already be claimed and preempt disabled */
static void __irq_work_queue_local(struct irq_work *work)
{
struct llist_head *list;
bool rt_lazy_work = false;
bool lazy_work = false;
int work_flags;
work_flags = atomic_read(&work->node.a_flags);
if (work_flags & IRQ_WORK_LAZY)
lazy_work = true;
else if (IS_ENABLED(CONFIG_PREEMPT_RT) &&
!(work_flags & IRQ_WORK_HARD_IRQ))
rt_lazy_work = true;
if (lazy_work || rt_lazy_work)
list = this_cpu_ptr(&lazy_list);
else
list = this_cpu_ptr(&raised_list);
if (!llist_add(&work->node.llist, list))
return;
/* If the work is "lazy", handle it from next tick if any */
if (atomic_read(&work->node.a_flags) & IRQ_WORK_LAZY) {
if (llist_add(&work->node.llist, this_cpu_ptr(&lazy_list)) &&
tick_nohz_tick_stopped())
if (!lazy_work || tick_nohz_tick_stopped())
arch_irq_work_raise();
} else {
if (llist_add(&work->node.llist, this_cpu_ptr(&raised_list)))
arch_irq_work_raise();
}
}
/* Enqueue the irq work @work on the current CPU */
@ -104,17 +143,34 @@ bool irq_work_queue_on(struct irq_work *work, int cpu)
if (cpu != smp_processor_id()) {
/* Arch remote IPI send/receive backend aren't NMI safe */
WARN_ON_ONCE(in_nmi());
/*
* On PREEMPT_RT the items which are not marked as
* IRQ_WORK_HARD_IRQ are added to the lazy list and a HARD work
* item is used on the remote CPU to wake the thread.
*/
if (IS_ENABLED(CONFIG_PREEMPT_RT) &&
!(atomic_read(&work->node.a_flags) & IRQ_WORK_HARD_IRQ)) {
if (!llist_add(&work->node.llist, &per_cpu(lazy_list, cpu)))
goto out;
work = &per_cpu(irq_work_wakeup, cpu);
if (!irq_work_claim(work))
goto out;
}
__smp_call_single_queue(cpu, &work->node.llist);
} else {
__irq_work_queue_local(work);
}
out:
preempt_enable();
return true;
#endif /* CONFIG_SMP */
}
bool irq_work_needs_cpu(void)
{
struct llist_head *raised, *lazy;
@ -160,6 +216,10 @@ void irq_work_single(void *arg)
* else claimed it meanwhile.
*/
(void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY);
if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) ||
!arch_irq_work_has_interrupt())
rcuwait_wake_up(&work->irqwait);
}
static void irq_work_run_list(struct llist_head *list)
@ -167,7 +227,12 @@ static void irq_work_run_list(struct llist_head *list)
struct irq_work *work, *tmp;
struct llist_node *llnode;
BUG_ON(!irqs_disabled());
/*
* On PREEMPT_RT IRQ-work which is not marked as HARD will be processed
* in a per-CPU thread in preemptible context. Only the items which are
* marked as IRQ_WORK_HARD_IRQ will be processed in hardirq context.
*/
BUG_ON(!irqs_disabled() && !IS_ENABLED(CONFIG_PREEMPT_RT));
if (llist_empty(list))
return;
@ -184,7 +249,10 @@ static void irq_work_run_list(struct llist_head *list)
void irq_work_run(void)
{
irq_work_run_list(this_cpu_ptr(&raised_list));
if (!IS_ENABLED(CONFIG_PREEMPT_RT))
irq_work_run_list(this_cpu_ptr(&lazy_list));
else
wake_irq_workd();
}
EXPORT_SYMBOL_GPL(irq_work_run);
@ -194,7 +262,11 @@ void irq_work_tick(void)
if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
irq_work_run_list(raised);
if (!IS_ENABLED(CONFIG_PREEMPT_RT))
irq_work_run_list(this_cpu_ptr(&lazy_list));
else
wake_irq_workd();
}
/*
@ -204,8 +276,42 @@ void irq_work_tick(void)
void irq_work_sync(struct irq_work *work)
{
lockdep_assert_irqs_enabled();
might_sleep();
if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) ||
!arch_irq_work_has_interrupt()) {
rcuwait_wait_event(&work->irqwait, !irq_work_is_busy(work),
TASK_UNINTERRUPTIBLE);
return;
}
while (irq_work_is_busy(work))
cpu_relax();
}
EXPORT_SYMBOL_GPL(irq_work_sync);
static void run_irq_workd(unsigned int cpu)
{
irq_work_run_list(this_cpu_ptr(&lazy_list));
}
static void irq_workd_setup(unsigned int cpu)
{
sched_set_fifo_low(current);
}
static struct smp_hotplug_thread irqwork_threads = {
.store = &irq_workd,
.setup = irq_workd_setup,
.thread_should_run = irq_workd_should_run,
.thread_fn = run_irq_workd,
.thread_comm = "irq_work/%u",
};
static __init int irq_work_init_threads(void)
{
if (IS_ENABLED(CONFIG_PREEMPT_RT))
BUG_ON(smpboot_register_percpu_thread(&irqwork_threads));
return 0;
}
early_initcall(irq_work_init_threads);
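To make the hard/lazy split concrete, a minimal sketch of the two kinds of items a caller might queue; under PREEMPT_RT the first stays in hard interrupt context while the second is deferred to the irq_work/N thread registered above (names are illustrative):

#include <linux/irq_work.h>

static void demo_hard_fn(struct irq_work *work)
{
        /* Runs from raised_list in hard interrupt context, also on RT. */
}

static void demo_lazy_fn(struct irq_work *work)
{
        /* On PREEMPT_RT this ends up on lazy_list and runs in irq_work/N. */
}

static struct irq_work demo_hard = IRQ_WORK_INIT_HARD(demo_hard_fn);
static struct irq_work demo_lazy = IRQ_WORK_INIT(demo_lazy_fn);

static void demo_kick(void)
{
        irq_work_queue(&demo_hard);
        irq_work_queue(&demo_lazy);
}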

View File

@ -164,26 +164,46 @@ static unsigned long kallsyms_sym_address(int idx)
return kallsyms_relative_base - 1 - kallsyms_offsets[idx];
}
#if defined(CONFIG_CFI_CLANG) && defined(CONFIG_LTO_CLANG_THIN)
/*
* LLVM appends a hash to static function names when ThinLTO and CFI are
* both enabled, i.e. foo() becomes foo$707af9a22804d33c81801f27dcfe489b.
* This causes confusion and potentially breaks user space tools, so we
* strip the suffix from expanded symbol names.
*/
static inline bool cleanup_symbol_name(char *s)
static bool cleanup_symbol_name(char *s)
{
char *res;
res = strrchr(s, '$');
if (res)
*res = '\0';
if (!IS_ENABLED(CONFIG_LTO_CLANG))
return false;
return res != NULL;
/*
* LLVM appends various suffixes for local functions and variables that
* must be promoted to global scope as part of LTO. This can break
* hooking of static functions with kprobes. '.' is not a valid
* character in an identifier in C. Suffixes observed:
* - foo.llvm.[0-9a-f]+
* - foo.[0-9a-f]+
* - foo.[0-9a-f]+.cfi_jt
*/
res = strchr(s, '.');
if (res) {
*res = '\0';
return true;
}
if (!IS_ENABLED(CONFIG_CFI_CLANG) ||
!IS_ENABLED(CONFIG_LTO_CLANG_THIN) ||
CONFIG_CLANG_VERSION >= 130000)
return false;
/*
* Prior to LLVM 13, the following suffixes were observed when thinLTO
* and CFI are both enabled:
* - foo$[0-9]+
*/
res = strrchr(s, '$');
if (res) {
*res = '\0';
return true;
}
return false;
}
#else
static inline bool cleanup_symbol_name(char *s) { return false; }
#endif
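The practical effect, as an illustrative example rather than part of this patch: a lookup by the plain C name keeps working even when LTO renamed the symbol in the object file:

#include <linux/kallsyms.h>

/*
 * With the suffix stripping above, "foo" matches a symbol that the object
 * file may actually carry as "foo.llvm.9735826b12ab46fa" or "foo.cfi_jt".
 * "foo" is a hypothetical static function name.
 */
static unsigned long demo_lookup_foo(void)
{
        return kallsyms_lookup_name("foo");
}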
/* Lookup the address for this symbol. Returns 0 if not found. */
unsigned long kallsyms_lookup_name(const char *name)
@ -223,6 +243,7 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
ret = fn(data, namebuf, NULL, kallsyms_sym_address(i));
if (ret != 0)
return ret;
cond_resched();
}
return 0;
}

View File

@ -88,6 +88,7 @@ static struct list_head kcov_remote_areas = LIST_HEAD_INIT(kcov_remote_areas);
struct kcov_percpu_data {
void *irq_area;
local_lock_t lock;
unsigned int saved_mode;
unsigned int saved_size;
@ -96,7 +97,9 @@ struct kcov_percpu_data {
int saved_sequence;
};
static DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data);
static DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data) = {
.lock = INIT_LOCAL_LOCK(lock),
};
/* Must be called with kcov_remote_lock locked. */
static struct kcov_remote *kcov_remote_find(u64 handle)
@ -824,7 +827,7 @@ void kcov_remote_start(u64 handle)
if (!in_task() && !in_serving_softirq())
return;
local_irq_save(flags);
local_lock_irqsave(&kcov_percpu_data.lock, flags);
/*
* Check that kcov_remote_start() is not called twice in background
@ -832,7 +835,7 @@ void kcov_remote_start(u64 handle)
*/
mode = READ_ONCE(t->kcov_mode);
if (WARN_ON(in_task() && kcov_mode_enabled(mode))) {
local_irq_restore(flags);
local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
return;
}
/*
@ -841,14 +844,15 @@ void kcov_remote_start(u64 handle)
* happened while collecting coverage from a background thread.
*/
if (WARN_ON(in_serving_softirq() && t->kcov_softirq)) {
local_irq_restore(flags);
local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
return;
}
spin_lock(&kcov_remote_lock);
remote = kcov_remote_find(handle);
if (!remote) {
spin_unlock_irqrestore(&kcov_remote_lock, flags);
spin_unlock(&kcov_remote_lock);
local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
return;
}
kcov_debug("handle = %llx, context: %s\n", handle,
@ -869,19 +873,19 @@ void kcov_remote_start(u64 handle)
size = CONFIG_KCOV_IRQ_AREA_SIZE;
area = this_cpu_ptr(&kcov_percpu_data)->irq_area;
}
spin_unlock_irqrestore(&kcov_remote_lock, flags);
spin_unlock(&kcov_remote_lock);
/* Can only happen when in_task(). */
if (!area) {
local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
area = vmalloc(size * sizeof(unsigned long));
if (!area) {
kcov_put(kcov);
return;
}
local_lock_irqsave(&kcov_percpu_data.lock, flags);
}
local_irq_save(flags);
/* Reset coverage size. */
*(u64 *)area = 0;
@ -891,7 +895,7 @@ void kcov_remote_start(u64 handle)
}
kcov_start(t, kcov, size, area, mode, sequence);
local_irq_restore(flags);
local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
}
EXPORT_SYMBOL(kcov_remote_start);
@ -965,12 +969,12 @@ void kcov_remote_stop(void)
if (!in_task() && !in_serving_softirq())
return;
local_irq_save(flags);
local_lock_irqsave(&kcov_percpu_data.lock, flags);
mode = READ_ONCE(t->kcov_mode);
barrier();
if (!kcov_mode_enabled(mode)) {
local_irq_restore(flags);
local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
return;
}
/*
@ -978,12 +982,12 @@ void kcov_remote_stop(void)
* actually found the remote handle and started collecting coverage.
*/
if (in_serving_softirq() && !t->kcov_softirq) {
local_irq_restore(flags);
local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
return;
}
/* Make sure that kcov_softirq is only set when in softirq. */
if (WARN_ON(!in_serving_softirq() && t->kcov_softirq)) {
local_irq_restore(flags);
local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
return;
}
@ -1013,7 +1017,7 @@ void kcov_remote_stop(void)
spin_unlock(&kcov_remote_lock);
}
local_irq_restore(flags);
local_unlock_irqrestore(&kcov_percpu_data.lock, flags);
/* Get in kcov_remote_start(). */
kcov_put(kcov);
@ -1034,8 +1038,8 @@ static int __init kcov_init(void)
int cpu;
for_each_possible_cpu(cpu) {
void *area = vmalloc(CONFIG_KCOV_IRQ_AREA_SIZE *
sizeof(unsigned long));
void *area = vmalloc_node(CONFIG_KCOV_IRQ_AREA_SIZE *
sizeof(unsigned long), cpu_to_node(cpu));
if (!area)
return -ENOMEM;
per_cpu_ptr(&kcov_percpu_data, cpu)->irq_area = area;
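The conversion above follows the usual local_lock pattern for per-CPU state; a generic sketch of that pattern, unrelated to kcov's internals:

#include <linux/local_lock.h>
#include <linux/percpu.h>

struct demo_pcpu {
        local_lock_t lock;
        unsigned long counter;
};

static DEFINE_PER_CPU(struct demo_pcpu, demo_pcpu) = {
        .lock = INIT_LOCAL_LOCK(lock),
};

static void demo_update(void)
{
        unsigned long flags;

        /*
         * On !PREEMPT_RT this disables interrupts like local_irq_save();
         * on PREEMPT_RT it becomes a per-CPU sleeping lock instead.
         */
        local_lock_irqsave(&demo_pcpu.lock, flags);
        __this_cpu_inc(demo_pcpu.counter);
        local_unlock_irqrestore(&demo_pcpu.lock, flags);
}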

View File

@ -8,9 +8,12 @@ CFLAGS_REMOVE_debugfs.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE)
CFLAGS_core.o := $(call cc-option,-fno-conserve-stack) \
$(call cc-option,-mno-outline-atomics) \
-fno-stack-protector -DDISABLE_BRANCH_PROFILING
obj-y := core.o debugfs.o report.o
KCSAN_INSTRUMENT_BARRIERS_selftest.o := y
obj-$(CONFIG_KCSAN_SELFTEST) += selftest.o
CFLAGS_kcsan_test.o := $(CFLAGS_KCSAN) -g -fno-omit-frame-pointer

View File

@ -40,15 +40,17 @@ module_param_named(udelay_interrupt, kcsan_udelay_interrupt, uint, 0644);
module_param_named(skip_watch, kcsan_skip_watch, long, 0644);
module_param_named(interrupt_watcher, kcsan_interrupt_watcher, bool, 0444);
#ifdef CONFIG_KCSAN_WEAK_MEMORY
static bool kcsan_weak_memory = true;
module_param_named(weak_memory, kcsan_weak_memory, bool, 0644);
#else
#define kcsan_weak_memory false
#endif
bool kcsan_enabled;
/* Per-CPU kcsan_ctx for interrupts */
static DEFINE_PER_CPU(struct kcsan_ctx, kcsan_cpu_ctx) = {
.disable_count = 0,
.atomic_next = 0,
.atomic_nest_count = 0,
.in_flat_atomic = false,
.access_mask = 0,
.scoped_accesses = {LIST_POISON1, NULL},
};
@ -202,22 +204,29 @@ static __always_inline struct kcsan_ctx *get_ctx(void)
return in_task() ? &current->kcsan_ctx : raw_cpu_ptr(&kcsan_cpu_ctx);
}
static __always_inline void
check_access(const volatile void *ptr, size_t size, int type, unsigned long ip);
/* Check scoped accesses; never inline because this is a slow-path! */
static noinline void kcsan_check_scoped_accesses(void)
{
struct kcsan_ctx *ctx = get_ctx();
struct list_head *prev_save = ctx->scoped_accesses.prev;
struct kcsan_scoped_access *scoped_access;
ctx->scoped_accesses.prev = NULL; /* Avoid recursion. */
list_for_each_entry(scoped_access, &ctx->scoped_accesses, list)
__kcsan_check_access(scoped_access->ptr, scoped_access->size, scoped_access->type);
ctx->scoped_accesses.prev = prev_save;
if (ctx->disable_scoped)
return;
ctx->disable_scoped++;
list_for_each_entry(scoped_access, &ctx->scoped_accesses, list) {
check_access(scoped_access->ptr, scoped_access->size,
scoped_access->type, scoped_access->ip);
}
ctx->disable_scoped--;
}
/* Rules for generic atomic accesses. Called from fast-path. */
static __always_inline bool
is_atomic(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *ctx)
is_atomic(struct kcsan_ctx *ctx, const volatile void *ptr, size_t size, int type)
{
if (type & KCSAN_ACCESS_ATOMIC)
return true;
@ -254,7 +263,7 @@ is_atomic(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *ctx
}
static __always_inline bool
should_watch(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *ctx)
should_watch(struct kcsan_ctx *ctx, const volatile void *ptr, size_t size, int type)
{
/*
* Never set up watchpoints when memory operations are atomic.
@ -263,7 +272,7 @@ should_watch(const volatile void *ptr, size_t size, int type, struct kcsan_ctx *
* should not count towards skipped instructions, and (2) to actually
* decrement kcsan_atomic_next for consecutive instruction stream.
*/
if (is_atomic(ptr, size, type, ctx))
if (is_atomic(ctx, ptr, size, type))
return false;
if (this_cpu_dec_return(kcsan_skip) >= 0)
@ -320,6 +329,21 @@ static void delay_access(int type)
udelay(delay);
}
/*
* Reads the instrumented memory for value change detection; value change
* detection is currently done for accesses up to a size of 8 bytes.
*/
static __always_inline u64 read_instrumented_memory(const volatile void *ptr, size_t size)
{
switch (size) {
case 1: return READ_ONCE(*(const u8 *)ptr);
case 2: return READ_ONCE(*(const u16 *)ptr);
case 4: return READ_ONCE(*(const u32 *)ptr);
case 8: return READ_ONCE(*(const u64 *)ptr);
default: return 0; /* Ignore; we do not diff the values. */
}
}
void kcsan_save_irqtrace(struct task_struct *task)
{
#ifdef CONFIG_TRACE_IRQFLAGS
@ -334,6 +358,76 @@ void kcsan_restore_irqtrace(struct task_struct *task)
#endif
}
static __always_inline int get_kcsan_stack_depth(void)
{
#ifdef CONFIG_KCSAN_WEAK_MEMORY
return current->kcsan_stack_depth;
#else
BUILD_BUG();
return 0;
#endif
}
static __always_inline void add_kcsan_stack_depth(int val)
{
#ifdef CONFIG_KCSAN_WEAK_MEMORY
current->kcsan_stack_depth += val;
#else
BUILD_BUG();
#endif
}
static __always_inline struct kcsan_scoped_access *get_reorder_access(struct kcsan_ctx *ctx)
{
#ifdef CONFIG_KCSAN_WEAK_MEMORY
return ctx->disable_scoped ? NULL : &ctx->reorder_access;
#else
return NULL;
#endif
}
static __always_inline bool
find_reorder_access(struct kcsan_ctx *ctx, const volatile void *ptr, size_t size,
int type, unsigned long ip)
{
struct kcsan_scoped_access *reorder_access = get_reorder_access(ctx);
if (!reorder_access)
return false;
/*
* Note: If accesses are repeated while reorder_access is identical, the stored
* reorder_access never matches the new access, because !(type & KCSAN_ACCESS_SCOPED).
*/
return reorder_access->ptr == ptr && reorder_access->size == size &&
reorder_access->type == type && reorder_access->ip == ip;
}
static inline void
set_reorder_access(struct kcsan_ctx *ctx, const volatile void *ptr, size_t size,
int type, unsigned long ip)
{
struct kcsan_scoped_access *reorder_access = get_reorder_access(ctx);
if (!reorder_access || !kcsan_weak_memory)
return;
/*
* To avoid nested interrupts or scheduler (which share kcsan_ctx)
* reading an inconsistent reorder_access, ensure that the below has
* exclusive access to reorder_access by disallowing concurrent use.
*/
ctx->disable_scoped++;
barrier();
reorder_access->ptr = ptr;
reorder_access->size = size;
reorder_access->type = type | KCSAN_ACCESS_SCOPED;
reorder_access->ip = ip;
reorder_access->stack_depth = get_kcsan_stack_depth();
barrier();
ctx->disable_scoped--;
}
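/*
 * Rough lifecycle of reorder_access as used below: it is set at the end of
 * kcsan_setup_watchpoint() (only if !access_mask && !is_assert), re-checked by
 * check_access() as if the access had been reordered past subsequent
 * operations, and invalidated (size = 0) either by a barrier such as
 * __kcsan_mb()/__kcsan_release(), or by __tsan_func_exit() once the function
 * that performed the access returns.
 */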
/*
* Pull everything together: check_access() below contains the performance
* critical operations; the fast-path (including check_access) functions should
@ -350,6 +444,7 @@ void kcsan_restore_irqtrace(struct task_struct *task)
static noinline void kcsan_found_watchpoint(const volatile void *ptr,
size_t size,
int type,
unsigned long ip,
atomic_long_t *watchpoint,
long encoded_watchpoint)
{
@ -371,8 +466,10 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr,
* The access_mask check relies on value-change comparison. To avoid
* reporting a race where e.g. the writer set up the watchpoint, but the
* reader has access_mask!=0, we have to ignore the found watchpoint.
*
* reorder_access is never created from an access with access_mask set.
*/
if (ctx->access_mask)
if (ctx->access_mask && !find_reorder_access(ctx, ptr, size, type, ip))
return;
/*
@ -396,7 +493,7 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr,
if (consumed) {
kcsan_save_irqtrace(current);
kcsan_report_set_info(ptr, size, type, watchpoint - watchpoints);
kcsan_report_set_info(ptr, size, type, ip, watchpoint - watchpoints);
kcsan_restore_irqtrace(current);
} else {
/*
@ -416,17 +513,19 @@ static noinline void kcsan_found_watchpoint(const volatile void *ptr,
}
static noinline void
kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type, unsigned long ip)
{
const bool is_write = (type & KCSAN_ACCESS_WRITE) != 0;
const bool is_assert = (type & KCSAN_ACCESS_ASSERT) != 0;
atomic_long_t *watchpoint;
u64 old, new, diff;
unsigned long access_mask;
enum kcsan_value_change value_change = KCSAN_VALUE_CHANGE_MAYBE;
bool interrupt_watcher = kcsan_interrupt_watcher;
unsigned long ua_flags = user_access_save();
struct kcsan_ctx *ctx = get_ctx();
unsigned long access_mask = ctx->access_mask;
unsigned long irq_flags = 0;
bool is_reorder_access;
/*
* Always reset kcsan_skip counter in slow-path to avoid underflow; see
@ -449,13 +548,33 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
goto out;
}
/*
* The local CPU cannot observe reordering of its own accesses, and
* therefore we need to take care of 2 cases to avoid false positives:
*
* 1. Races of the reordered access with interrupts. To avoid, if
* the current access is reorder_access, disable interrupts.
* 2. Avoid races of scoped accesses from nested interrupts (below).
*/
is_reorder_access = find_reorder_access(ctx, ptr, size, type, ip);
if (is_reorder_access)
interrupt_watcher = false;
/*
* Avoid races of scoped accesses from nested interrupts (or scheduler).
* Assume setting up a watchpoint for a non-scoped (normal) access that
* also conflicts with a current scoped access. In a nested interrupt,
* which shares the context, it would check a conflicting scoped access.
* To avoid, disable scoped access checking.
*/
ctx->disable_scoped++;
/*
* Save and restore the IRQ state trace touched by KCSAN, since KCSAN's
* runtime is entered for every memory access, and potentially useful
* information is lost if dirtied by KCSAN.
*/
kcsan_save_irqtrace(current);
if (!kcsan_interrupt_watcher)
if (!interrupt_watcher)
local_irq_save(irq_flags);
watchpoint = insert_watchpoint((unsigned long)ptr, size, is_write);
@ -476,23 +595,7 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
* Read the current value, to later check and infer a race if the data
* was modified via a non-instrumented access, e.g. from a device.
*/
old = 0;
switch (size) {
case 1:
old = READ_ONCE(*(const u8 *)ptr);
break;
case 2:
old = READ_ONCE(*(const u16 *)ptr);
break;
case 4:
old = READ_ONCE(*(const u32 *)ptr);
break;
case 8:
old = READ_ONCE(*(const u64 *)ptr);
break;
default:
break; /* ignore; we do not diff the values */
}
old = is_reorder_access ? 0 : read_instrumented_memory(ptr, size);
/*
* Delay this thread, to increase probability of observing a racy
@ -504,23 +607,16 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
* Re-read value, and check if it is as expected; if not, we infer a
* racy access.
*/
access_mask = ctx->access_mask;
if (!is_reorder_access) {
new = read_instrumented_memory(ptr, size);
} else {
/*
* Reordered accesses cannot be used for value change detection,
* because the memory location may no longer be accessible and
* could result in a fault.
*/
new = 0;
switch (size) {
case 1:
new = READ_ONCE(*(const u8 *)ptr);
break;
case 2:
new = READ_ONCE(*(const u16 *)ptr);
break;
case 4:
new = READ_ONCE(*(const u32 *)ptr);
break;
case 8:
new = READ_ONCE(*(const u64 *)ptr);
break;
default:
break; /* ignore; we do not diff the values */
access_mask = 0;
}
diff = old ^ new;
@ -568,8 +664,8 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
if (is_assert && value_change == KCSAN_VALUE_CHANGE_TRUE)
atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]);
kcsan_report_known_origin(ptr, size, type, value_change,
watchpoint - watchpoints,
kcsan_report_known_origin(ptr, size, type, ip,
value_change, watchpoint - watchpoints,
old, new, access_mask);
} else if (value_change == KCSAN_VALUE_CHANGE_TRUE) {
/* Inferring a race, since the value should not have changed. */
@ -578,8 +674,10 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
if (is_assert)
atomic_long_inc(&kcsan_counters[KCSAN_COUNTER_ASSERT_FAILURES]);
if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN) || is_assert)
kcsan_report_unknown_origin(ptr, size, type, old, new, access_mask);
if (IS_ENABLED(CONFIG_KCSAN_REPORT_RACE_UNKNOWN_ORIGIN) || is_assert) {
kcsan_report_unknown_origin(ptr, size, type, ip,
old, new, access_mask);
}
}
/*
@ -588,18 +686,27 @@ kcsan_setup_watchpoint(const volatile void *ptr, size_t size, int type)
*/
remove_watchpoint(watchpoint);
atomic_long_dec(&kcsan_counters[KCSAN_COUNTER_USED_WATCHPOINTS]);
out_unlock:
if (!kcsan_interrupt_watcher)
if (!interrupt_watcher)
local_irq_restore(irq_flags);
kcsan_restore_irqtrace(current);
ctx->disable_scoped--;
/*
* Reordered accesses cannot be used for value change detection,
* therefore never consider for reordering if access_mask is set.
* ASSERT_EXCLUSIVE are not real accesses, ignore them as well.
*/
if (!access_mask && !is_assert)
set_reorder_access(ctx, ptr, size, type, ip);
out:
user_access_restore(ua_flags);
}
static __always_inline void check_access(const volatile void *ptr, size_t size,
int type)
static __always_inline void
check_access(const volatile void *ptr, size_t size, int type, unsigned long ip)
{
const bool is_write = (type & KCSAN_ACCESS_WRITE) != 0;
atomic_long_t *watchpoint;
long encoded_watchpoint;
@ -610,12 +717,14 @@ static __always_inline void check_access(const volatile void *ptr, size_t size,
if (unlikely(size == 0))
return;
again:
/*
* Avoid user_access_save in fast-path: find_watchpoint is safe without
* user_access_save, as the address that ptr points to is only used to
* check if a watchpoint exists; ptr is never dereferenced.
*/
watchpoint = find_watchpoint((unsigned long)ptr, size, !is_write,
watchpoint = find_watchpoint((unsigned long)ptr, size,
!(type & KCSAN_ACCESS_WRITE),
&encoded_watchpoint);
/*
* It is safe to check kcsan_is_enabled() after find_watchpoint in the
@ -625,14 +734,46 @@ static __always_inline void check_access(const volatile void *ptr, size_t size,
*/
if (unlikely(watchpoint != NULL))
kcsan_found_watchpoint(ptr, size, type, watchpoint,
encoded_watchpoint);
kcsan_found_watchpoint(ptr, size, type, ip, watchpoint, encoded_watchpoint);
else {
struct kcsan_ctx *ctx = get_ctx(); /* Call only once in fast-path. */
if (unlikely(should_watch(ptr, size, type, ctx)))
kcsan_setup_watchpoint(ptr, size, type);
else if (unlikely(ctx->scoped_accesses.prev))
if (unlikely(should_watch(ctx, ptr, size, type))) {
kcsan_setup_watchpoint(ptr, size, type, ip);
return;
}
if (!(type & KCSAN_ACCESS_SCOPED)) {
struct kcsan_scoped_access *reorder_access = get_reorder_access(ctx);
if (reorder_access) {
/*
* reorder_access check: simulates reordering of
* the access after subsequent operations.
*/
ptr = reorder_access->ptr;
type = reorder_access->type;
ip = reorder_access->ip;
/*
* Upon a nested interrupt, this context's
* reorder_access can be modified (shared ctx).
* We know that upon return, reorder_access is
* always invalidated by setting size to 0 via
* __tsan_func_exit(). Therefore we must read
* and check size after the other fields.
*/
barrier();
size = READ_ONCE(reorder_access->size);
if (size)
goto again;
}
}
/*
* Always checked last, right before returning from runtime;
* if reorder_access is valid, the scoped accesses are checked after it was checked.
*/
if (unlikely(ctx->scoped_accesses.prev))
kcsan_check_scoped_accesses();
}
}
@ -757,7 +898,7 @@ kcsan_begin_scoped_access(const volatile void *ptr, size_t size, int type,
{
struct kcsan_ctx *ctx = get_ctx();
__kcsan_check_access(ptr, size, type);
check_access(ptr, size, type, _RET_IP_);
ctx->disable_count++; /* Disable KCSAN, in case list debugging is on. */
@ -765,6 +906,7 @@ kcsan_begin_scoped_access(const volatile void *ptr, size_t size, int type,
sa->ptr = ptr;
sa->size = size;
sa->type = type;
sa->ip = _RET_IP_;
if (!ctx->scoped_accesses.prev) /* Lazy initialize list head. */
INIT_LIST_HEAD(&ctx->scoped_accesses);
@ -796,16 +938,32 @@ void kcsan_end_scoped_access(struct kcsan_scoped_access *sa)
ctx->disable_count--;
__kcsan_check_access(sa->ptr, sa->size, sa->type);
check_access(sa->ptr, sa->size, sa->type, sa->ip);
}
EXPORT_SYMBOL(kcsan_end_scoped_access);
void __kcsan_check_access(const volatile void *ptr, size_t size, int type)
{
check_access(ptr, size, type);
check_access(ptr, size, type, _RET_IP_);
}
EXPORT_SYMBOL(__kcsan_check_access);
#define DEFINE_MEMORY_BARRIER(name, order_before_cond) \
void __kcsan_##name(void) \
{ \
struct kcsan_scoped_access *sa = get_reorder_access(get_ctx()); \
if (!sa) \
return; \
if (order_before_cond) \
sa->size = 0; \
} \
EXPORT_SYMBOL(__kcsan_##name)
DEFINE_MEMORY_BARRIER(mb, true);
DEFINE_MEMORY_BARRIER(wmb, sa->type & (KCSAN_ACCESS_WRITE | KCSAN_ACCESS_COMPOUND));
DEFINE_MEMORY_BARRIER(rmb, !(sa->type & KCSAN_ACCESS_WRITE) || (sa->type & KCSAN_ACCESS_COMPOUND));
DEFINE_MEMORY_BARRIER(release, true);
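/*
 * For example (matching the conditions above and the kcsan_test expectations):
 * __kcsan_mb() and __kcsan_release() order any prior access, __kcsan_wmb()
 * only orders prior writes or compound read-writes, and __kcsan_rmb() only
 * orders prior reads or compound read-writes; "ordering" here simply means
 * invalidating reorder_access by setting its size to 0.
 */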
/*
* KCSAN uses the same instrumentation that is emitted by supported compilers
* for ThreadSanitizer (TSAN).
@ -823,7 +981,7 @@ EXPORT_SYMBOL(__kcsan_check_access);
void __tsan_read##size(void *ptr); \
void __tsan_read##size(void *ptr) \
{ \
check_access(ptr, size, 0); \
check_access(ptr, size, 0, _RET_IP_); \
} \
EXPORT_SYMBOL(__tsan_read##size); \
void __tsan_unaligned_read##size(void *ptr) \
@ -832,7 +990,7 @@ EXPORT_SYMBOL(__kcsan_check_access);
void __tsan_write##size(void *ptr); \
void __tsan_write##size(void *ptr) \
{ \
check_access(ptr, size, KCSAN_ACCESS_WRITE); \
check_access(ptr, size, KCSAN_ACCESS_WRITE, _RET_IP_); \
} \
EXPORT_SYMBOL(__tsan_write##size); \
void __tsan_unaligned_write##size(void *ptr) \
@ -842,7 +1000,8 @@ EXPORT_SYMBOL(__kcsan_check_access);
void __tsan_read_write##size(void *ptr) \
{ \
check_access(ptr, size, \
KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE); \
KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE, \
_RET_IP_); \
} \
EXPORT_SYMBOL(__tsan_read_write##size); \
void __tsan_unaligned_read_write##size(void *ptr) \
@ -858,14 +1017,14 @@ DEFINE_TSAN_READ_WRITE(16);
void __tsan_read_range(void *ptr, size_t size);
void __tsan_read_range(void *ptr, size_t size)
{
check_access(ptr, size, 0);
check_access(ptr, size, 0, _RET_IP_);
}
EXPORT_SYMBOL(__tsan_read_range);
void __tsan_write_range(void *ptr, size_t size);
void __tsan_write_range(void *ptr, size_t size)
{
check_access(ptr, size, KCSAN_ACCESS_WRITE);
check_access(ptr, size, KCSAN_ACCESS_WRITE, _RET_IP_);
}
EXPORT_SYMBOL(__tsan_write_range);
@ -886,7 +1045,8 @@ EXPORT_SYMBOL(__tsan_write_range);
IS_ALIGNED((unsigned long)ptr, size); \
if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS) && is_atomic) \
return; \
check_access(ptr, size, is_atomic ? KCSAN_ACCESS_ATOMIC : 0); \
check_access(ptr, size, is_atomic ? KCSAN_ACCESS_ATOMIC : 0, \
_RET_IP_); \
} \
EXPORT_SYMBOL(__tsan_volatile_read##size); \
void __tsan_unaligned_volatile_read##size(void *ptr) \
@ -901,7 +1061,8 @@ EXPORT_SYMBOL(__tsan_write_range);
return; \
check_access(ptr, size, \
KCSAN_ACCESS_WRITE | \
(is_atomic ? KCSAN_ACCESS_ATOMIC : 0)); \
(is_atomic ? KCSAN_ACCESS_ATOMIC : 0), \
_RET_IP_); \
} \
EXPORT_SYMBOL(__tsan_volatile_write##size); \
void __tsan_unaligned_volatile_write##size(void *ptr) \
@ -915,19 +1076,56 @@ DEFINE_TSAN_VOLATILE_READ_WRITE(8);
DEFINE_TSAN_VOLATILE_READ_WRITE(16);
/*
* The below are not required by KCSAN, but can still be emitted by the
* compiler.
* Function entry and exit are used to determine the validity of reorder_access.
* Reordering of the access ends at the end of the function scope where the
* access happened. This is done for two reasons:
*
* 1. Artificially limits the scope where missing barriers are detected.
* This minimizes false positives due to uninstrumented functions that
* contain the required barriers but were missed.
*
* 2. Simplifies generating the stack trace of the access.
*/
void __tsan_func_entry(void *call_pc);
void __tsan_func_entry(void *call_pc)
noinline void __tsan_func_entry(void *call_pc)
{
if (!IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY))
return;
add_kcsan_stack_depth(1);
}
EXPORT_SYMBOL(__tsan_func_entry);
void __tsan_func_exit(void);
void __tsan_func_exit(void)
noinline void __tsan_func_exit(void)
{
struct kcsan_scoped_access *reorder_access;
if (!IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY))
return;
reorder_access = get_reorder_access(get_ctx());
if (!reorder_access)
goto out;
if (get_kcsan_stack_depth() <= reorder_access->stack_depth) {
/*
* Access check to catch cases where write without a barrier
* (supposed release) was last access in function: because
* instrumentation is inserted before the real access, a data
* race due to the write giving up a critical section would only be caught if
* we do the conflicting access after.
*/
check_access(reorder_access->ptr, reorder_access->size,
reorder_access->type, reorder_access->ip);
reorder_access->size = 0;
reorder_access->stack_depth = INT_MIN;
}
out:
add_kcsan_stack_depth(-1);
}
EXPORT_SYMBOL(__tsan_func_exit);
void __tsan_init(void);
void __tsan_init(void)
{
@ -950,12 +1148,21 @@ EXPORT_SYMBOL(__tsan_init);
* functions, whose job is to also execute the operation itself.
*/
static __always_inline void kcsan_atomic_builtin_memorder(int memorder)
{
if (memorder == __ATOMIC_RELEASE ||
memorder == __ATOMIC_SEQ_CST ||
memorder == __ATOMIC_ACQ_REL)
__kcsan_release();
}
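/*
 * For the helper above: __atomic_store_n(&flag, 0, __ATOMIC_RELEASE) or any
 * SEQ_CST/ACQ_REL RMW is treated as a release with respect to reorder_access,
 * whereas an __ATOMIC_RELAXED or __ATOMIC_ACQUIRE access leaves a pending
 * reorder_access intact.
 */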
#define DEFINE_TSAN_ATOMIC_LOAD_STORE(bits) \
u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder); \
u##bits __tsan_atomic##bits##_load(const u##bits *ptr, int memorder) \
{ \
kcsan_atomic_builtin_memorder(memorder); \
if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_ATOMIC); \
check_access(ptr, bits / BITS_PER_BYTE, KCSAN_ACCESS_ATOMIC, _RET_IP_); \
} \
return __atomic_load_n(ptr, memorder); \
} \
@ -963,9 +1170,10 @@ EXPORT_SYMBOL(__tsan_init);
void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder); \
void __tsan_atomic##bits##_store(u##bits *ptr, u##bits v, int memorder) \
{ \
kcsan_atomic_builtin_memorder(memorder); \
if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
check_access(ptr, bits / BITS_PER_BYTE, \
KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC); \
KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC, _RET_IP_); \
} \
__atomic_store_n(ptr, v, memorder); \
} \
@ -975,10 +1183,11 @@ EXPORT_SYMBOL(__tsan_init);
u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder); \
u##bits __tsan_atomic##bits##_##op(u##bits *ptr, u##bits v, int memorder) \
{ \
kcsan_atomic_builtin_memorder(memorder); \
if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
check_access(ptr, bits / BITS_PER_BYTE, \
KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \
KCSAN_ACCESS_ATOMIC); \
KCSAN_ACCESS_ATOMIC, _RET_IP_); \
} \
return __atomic_##op##suffix(ptr, v, memorder); \
} \
@ -1007,10 +1216,11 @@ EXPORT_SYMBOL(__tsan_init);
int __tsan_atomic##bits##_compare_exchange_##strength(u##bits *ptr, u##bits *exp, \
u##bits val, int mo, int fail_mo) \
{ \
kcsan_atomic_builtin_memorder(mo); \
if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
check_access(ptr, bits / BITS_PER_BYTE, \
KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \
KCSAN_ACCESS_ATOMIC); \
KCSAN_ACCESS_ATOMIC, _RET_IP_); \
} \
return __atomic_compare_exchange_n(ptr, exp, val, weak, mo, fail_mo); \
} \
@ -1022,10 +1232,11 @@ EXPORT_SYMBOL(__tsan_init);
u##bits __tsan_atomic##bits##_compare_exchange_val(u##bits *ptr, u##bits exp, u##bits val, \
int mo, int fail_mo) \
{ \
kcsan_atomic_builtin_memorder(mo); \
if (!IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)) { \
check_access(ptr, bits / BITS_PER_BYTE, \
KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | \
KCSAN_ACCESS_ATOMIC); \
KCSAN_ACCESS_ATOMIC, _RET_IP_); \
} \
__atomic_compare_exchange_n(ptr, &exp, val, 0, mo, fail_mo); \
return exp; \
@ -1053,10 +1264,47 @@ DEFINE_TSAN_ATOMIC_OPS(64);
void __tsan_atomic_thread_fence(int memorder);
void __tsan_atomic_thread_fence(int memorder)
{
kcsan_atomic_builtin_memorder(memorder);
__atomic_thread_fence(memorder);
}
EXPORT_SYMBOL(__tsan_atomic_thread_fence);
/*
* In instrumented files, we emit instrumentation for barriers by mapping the
* kernel barriers to an __atomic_signal_fence(), which is interpreted specially
* and otherwise has no relation to a real __atomic_signal_fence(). No known
* kernel code uses __atomic_signal_fence().
*
* Since fsanitize=thread instrumentation handles __atomic_signal_fence(), which
* is turned into a call to __tsan_atomic_signal_fence(), such instrumentation
* can be disabled via the __no_kcsan function attribute (vs. an explicit call
* which could not). When __no_kcsan is requested, __atomic_signal_fence()
* generates no code.
*
* Note: The result of using __atomic_signal_fence() with KCSAN enabled is
* potentially limiting the compiler's ability to reorder operations; however,
* if barriers were instrumented with explicit calls (without LTO), the compiler
* couldn't optimize much anyway. The result of a hypothetical architecture
* using __atomic_signal_fence() in normal code would be KCSAN false negatives.
*/
void __tsan_atomic_signal_fence(int memorder);
void __tsan_atomic_signal_fence(int memorder) { }
noinline void __tsan_atomic_signal_fence(int memorder)
{
switch (memorder) {
case __KCSAN_BARRIER_TO_SIGNAL_FENCE_mb:
__kcsan_mb();
break;
case __KCSAN_BARRIER_TO_SIGNAL_FENCE_wmb:
__kcsan_wmb();
break;
case __KCSAN_BARRIER_TO_SIGNAL_FENCE_rmb:
__kcsan_rmb();
break;
case __KCSAN_BARRIER_TO_SIGNAL_FENCE_release:
__kcsan_release();
break;
default:
break;
}
}
EXPORT_SYMBOL(__tsan_atomic_signal_fence);

View File

@ -121,7 +121,7 @@ enum kcsan_value_change {
* to be consumed by the reporting thread. No report is printed yet.
*/
void kcsan_report_set_info(const volatile void *ptr, size_t size, int access_type,
int watchpoint_idx);
unsigned long ip, int watchpoint_idx);
/*
* The calling thread observed that the watchpoint it set up was hit and
@ -129,14 +129,14 @@ void kcsan_report_set_info(const volatile void *ptr, size_t size, int access_typ
* thread.
*/
void kcsan_report_known_origin(const volatile void *ptr, size_t size, int access_type,
enum kcsan_value_change value_change, int watchpoint_idx,
u64 old, u64 new, u64 mask);
unsigned long ip, enum kcsan_value_change value_change,
int watchpoint_idx, u64 old, u64 new, u64 mask);
/*
* No other thread was observed to race with the access, but the data value
* before and after the stall differs. Reports a race of "unknown origin".
*/
void kcsan_report_unknown_origin(const volatile void *ptr, size_t size, int access_type,
u64 old, u64 new, u64 mask);
unsigned long ip, u64 old, u64 new, u64 mask);
#endif /* _KERNEL_KCSAN_KCSAN_H */

View File

@ -16,9 +16,12 @@
#define pr_fmt(fmt) "kcsan_test: " fmt
#include <kunit/test.h>
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/jiffies.h>
#include <linux/kcsan-checks.h>
#include <linux/kernel.h>
#include <linux/mutex.h>
#include <linux/sched.h>
#include <linux/seqlock.h>
#include <linux/spinlock.h>
@ -29,6 +32,11 @@
#include <linux/types.h>
#include <trace/events/printk.h>
#define KCSAN_TEST_REQUIRES(test, cond) do { \
if (!(cond)) \
kunit_skip((test), "Test requires: " #cond); \
} while (0)
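/*
 * The macro above skips the current test via kunit_skip() when @cond does not
 * hold, e.g. KCSAN_TEST_REQUIRES(test, !IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS)).
 */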
#ifdef CONFIG_CC_HAS_TSAN_COMPOUND_READ_BEFORE_WRITE
#define __KCSAN_ACCESS_RW(alt) (KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE)
#else
@ -146,7 +154,7 @@ struct expect_report {
/* Check observed report matches information in @r. */
__no_kcsan
static bool report_matches(const struct expect_report *r)
static bool __report_matches(const struct expect_report *r)
{
const bool is_assert = (r->access[0].type | r->access[1].type) & KCSAN_ACCESS_ASSERT;
bool ret = false;
@ -205,10 +213,12 @@ static bool report_matches(const struct expect_report *r)
"read-write" :
"write") :
"read");
const bool is_atomic = (ty & KCSAN_ACCESS_ATOMIC);
const bool is_scoped = (ty & KCSAN_ACCESS_SCOPED);
const char *const access_type_aux =
(ty & KCSAN_ACCESS_ATOMIC) ?
" (marked)" :
((ty & KCSAN_ACCESS_SCOPED) ? " (scoped)" : "");
(is_atomic && is_scoped) ? " (marked, reordered)"
: (is_atomic ? " (marked)"
: (is_scoped ? " (reordered)" : ""));
if (i == 1) {
/* Access 2 */
@ -246,6 +256,40 @@ static bool report_matches(const struct expect_report *r)
return ret;
}
static __always_inline const struct expect_report *
__report_set_scoped(struct expect_report *r, int accesses)
{
BUILD_BUG_ON(accesses > 3);
if (accesses & 1)
r->access[0].type |= KCSAN_ACCESS_SCOPED;
else
r->access[0].type &= ~KCSAN_ACCESS_SCOPED;
if (accesses & 2)
r->access[1].type |= KCSAN_ACCESS_SCOPED;
else
r->access[1].type &= ~KCSAN_ACCESS_SCOPED;
return r;
}
__no_kcsan
static bool report_matches_any_reordered(struct expect_report *r)
{
return __report_matches(__report_set_scoped(r, 0)) ||
__report_matches(__report_set_scoped(r, 1)) ||
__report_matches(__report_set_scoped(r, 2)) ||
__report_matches(__report_set_scoped(r, 3));
}
#ifdef CONFIG_KCSAN_WEAK_MEMORY
/* Due to reordering accesses, any access may appear as "(reordered)". */
#define report_matches report_matches_any_reordered
#else
#define report_matches __report_matches
#endif
/* ===== Test kernels ===== */
static long test_sink;
@ -256,6 +300,8 @@ static struct {
long val[8];
} test_struct;
static DEFINE_SEQLOCK(test_seqlock);
static DEFINE_SPINLOCK(test_spinlock);
static DEFINE_MUTEX(test_mutex);
/*
* Helper to avoid compiler optimizing out reads, and to generate source values
@ -264,6 +310,16 @@ static DEFINE_SEQLOCK(test_seqlock);
__no_kcsan
static noinline void sink_value(long v) { WRITE_ONCE(test_sink, v); }
/*
* Generates a delay and some accesses that enter the runtime but do not produce
* data races.
*/
static noinline void test_delay(int iter)
{
while (iter--)
sink_value(READ_ONCE(test_sink));
}
static noinline void test_kernel_read(void) { sink_value(test_var); }
static noinline void test_kernel_write(void)
@ -333,7 +389,10 @@ static noinline void test_kernel_assert_bits_nochange(void)
ASSERT_EXCLUSIVE_BITS(test_var, ~TEST_CHANGE_BITS);
}
/* To check that scoped assertions do trigger anywhere in scope. */
/*
* Scoped assertions do trigger anywhere in scope. However, the report should
* still only point at the start of the scope.
*/
static noinline void test_enter_scope(void)
{
int x = 0;
@ -422,19 +481,239 @@ static noinline void test_kernel_xor_1bit(void)
kcsan_nestable_atomic_end();
}
#define TEST_KERNEL_LOCKED(name, acquire, release) \
static noinline void test_kernel_##name(void) \
{ \
long *flag = &test_struct.val[0]; \
long v = 0; \
if (!(acquire)) \
return; \
while (v++ < 100) { \
test_var++; \
barrier(); \
} \
release; \
test_delay(10); \
}
TEST_KERNEL_LOCKED(with_memorder,
cmpxchg_acquire(flag, 0, 1) == 0,
smp_store_release(flag, 0));
TEST_KERNEL_LOCKED(wrong_memorder,
cmpxchg_relaxed(flag, 0, 1) == 0,
WRITE_ONCE(*flag, 0));
TEST_KERNEL_LOCKED(atomic_builtin_with_memorder,
__atomic_compare_exchange_n(flag, &v, 1, 0, __ATOMIC_ACQUIRE, __ATOMIC_RELAXED),
__atomic_store_n(flag, 0, __ATOMIC_RELEASE));
TEST_KERNEL_LOCKED(atomic_builtin_wrong_memorder,
__atomic_compare_exchange_n(flag, &v, 1, 0, __ATOMIC_RELAXED, __ATOMIC_RELAXED),
__atomic_store_n(flag, 0, __ATOMIC_RELAXED));
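/*
 * The four kernels above model a small spinlock-like critical section around
 * test_var: the "with_memorder" variants pair an acquire cmpxchg with a
 * release store, while the "wrong_memorder" variants use only relaxed
 * operations, so with CONFIG_KCSAN_WEAK_MEMORY the latter are expected to be
 * reported as data races due to the missing barriers (see
 * test_missing_barrier() and friends below).
 */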
/* ===== Test cases ===== */
/*
* Tests that various barriers have the expected effect on internal state. Not
* exhaustive on atomic_t operations. Unlike the selftest, also checks for
* too-strict barrier instrumentation; these can be tolerated, because they do
* not cause false positives, but at least we should be aware of such cases.
*/
static void test_barrier_nothreads(struct kunit *test)
{
#ifdef CONFIG_KCSAN_WEAK_MEMORY
struct kcsan_scoped_access *reorder_access = &current->kcsan_ctx.reorder_access;
#else
struct kcsan_scoped_access *reorder_access = NULL;
#endif
arch_spinlock_t arch_spinlock = __ARCH_SPIN_LOCK_UNLOCKED;
atomic_t dummy;
KCSAN_TEST_REQUIRES(test, reorder_access != NULL);
KCSAN_TEST_REQUIRES(test, IS_ENABLED(CONFIG_SMP));
#define __KCSAN_EXPECT_BARRIER(access_type, barrier, order_before, name) \
do { \
reorder_access->type = (access_type) | KCSAN_ACCESS_SCOPED; \
reorder_access->size = sizeof(test_var); \
barrier; \
KUNIT_EXPECT_EQ_MSG(test, reorder_access->size, \
order_before ? 0 : sizeof(test_var), \
"improperly instrumented type=(" #access_type "): " name); \
} while (0)
#define KCSAN_EXPECT_READ_BARRIER(b, o) __KCSAN_EXPECT_BARRIER(0, b, o, #b)
#define KCSAN_EXPECT_WRITE_BARRIER(b, o) __KCSAN_EXPECT_BARRIER(KCSAN_ACCESS_WRITE, b, o, #b)
#define KCSAN_EXPECT_RW_BARRIER(b, o) __KCSAN_EXPECT_BARRIER(KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE, b, o, #b)
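/*
 * @o (order_before) encodes the expectation: true means the barrier must order
 * the access currently held in reorder_access and is therefore expected to
 * reset its size to 0; false means the barrier should leave it untouched, e.g.
 * KCSAN_EXPECT_READ_BARRIER(smp_wmb(), false) below.
 */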
/*
* Lockdep initialization can strengthen certain locking operations due
* to calling into instrumented files; "warm up" our locks.
*/
spin_lock(&test_spinlock);
spin_unlock(&test_spinlock);
mutex_lock(&test_mutex);
mutex_unlock(&test_mutex);
/* Force creating a valid entry in reorder_access first. */
test_var = 0;
while (test_var++ < 1000000 && reorder_access->size != sizeof(test_var))
__kcsan_check_read(&test_var, sizeof(test_var));
KUNIT_ASSERT_EQ(test, reorder_access->size, sizeof(test_var));
kcsan_nestable_atomic_begin(); /* No watchpoints in called functions. */
KCSAN_EXPECT_READ_BARRIER(mb(), true);
KCSAN_EXPECT_READ_BARRIER(wmb(), false);
KCSAN_EXPECT_READ_BARRIER(rmb(), true);
KCSAN_EXPECT_READ_BARRIER(smp_mb(), true);
KCSAN_EXPECT_READ_BARRIER(smp_wmb(), false);
KCSAN_EXPECT_READ_BARRIER(smp_rmb(), true);
KCSAN_EXPECT_READ_BARRIER(dma_wmb(), false);
KCSAN_EXPECT_READ_BARRIER(dma_rmb(), true);
KCSAN_EXPECT_READ_BARRIER(smp_mb__before_atomic(), true);
KCSAN_EXPECT_READ_BARRIER(smp_mb__after_atomic(), true);
KCSAN_EXPECT_READ_BARRIER(smp_mb__after_spinlock(), true);
KCSAN_EXPECT_READ_BARRIER(smp_store_mb(test_var, 0), true);
KCSAN_EXPECT_READ_BARRIER(smp_load_acquire(&test_var), false);
KCSAN_EXPECT_READ_BARRIER(smp_store_release(&test_var, 0), true);
KCSAN_EXPECT_READ_BARRIER(xchg(&test_var, 0), true);
KCSAN_EXPECT_READ_BARRIER(xchg_release(&test_var, 0), true);
KCSAN_EXPECT_READ_BARRIER(xchg_relaxed(&test_var, 0), false);
KCSAN_EXPECT_READ_BARRIER(cmpxchg(&test_var, 0, 0), true);
KCSAN_EXPECT_READ_BARRIER(cmpxchg_release(&test_var, 0, 0), true);
KCSAN_EXPECT_READ_BARRIER(cmpxchg_relaxed(&test_var, 0, 0), false);
KCSAN_EXPECT_READ_BARRIER(atomic_read(&dummy), false);
KCSAN_EXPECT_READ_BARRIER(atomic_read_acquire(&dummy), false);
KCSAN_EXPECT_READ_BARRIER(atomic_set(&dummy, 0), false);
KCSAN_EXPECT_READ_BARRIER(atomic_set_release(&dummy, 0), true);
KCSAN_EXPECT_READ_BARRIER(atomic_add(1, &dummy), false);
KCSAN_EXPECT_READ_BARRIER(atomic_add_return(1, &dummy), true);
KCSAN_EXPECT_READ_BARRIER(atomic_add_return_acquire(1, &dummy), false);
KCSAN_EXPECT_READ_BARRIER(atomic_add_return_release(1, &dummy), true);
KCSAN_EXPECT_READ_BARRIER(atomic_add_return_relaxed(1, &dummy), false);
KCSAN_EXPECT_READ_BARRIER(atomic_fetch_add(1, &dummy), true);
KCSAN_EXPECT_READ_BARRIER(atomic_fetch_add_acquire(1, &dummy), false);
KCSAN_EXPECT_READ_BARRIER(atomic_fetch_add_release(1, &dummy), true);
KCSAN_EXPECT_READ_BARRIER(atomic_fetch_add_relaxed(1, &dummy), false);
KCSAN_EXPECT_READ_BARRIER(test_and_set_bit(0, &test_var), true);
KCSAN_EXPECT_READ_BARRIER(test_and_clear_bit(0, &test_var), true);
KCSAN_EXPECT_READ_BARRIER(test_and_change_bit(0, &test_var), true);
KCSAN_EXPECT_READ_BARRIER(clear_bit_unlock(0, &test_var), true);
KCSAN_EXPECT_READ_BARRIER(__clear_bit_unlock(0, &test_var), true);
KCSAN_EXPECT_READ_BARRIER(arch_spin_lock(&arch_spinlock), false);
KCSAN_EXPECT_READ_BARRIER(arch_spin_unlock(&arch_spinlock), true);
KCSAN_EXPECT_READ_BARRIER(spin_lock(&test_spinlock), false);
KCSAN_EXPECT_READ_BARRIER(spin_unlock(&test_spinlock), true);
KCSAN_EXPECT_READ_BARRIER(mutex_lock(&test_mutex), false);
KCSAN_EXPECT_READ_BARRIER(mutex_unlock(&test_mutex), true);
KCSAN_EXPECT_WRITE_BARRIER(mb(), true);
KCSAN_EXPECT_WRITE_BARRIER(wmb(), true);
KCSAN_EXPECT_WRITE_BARRIER(rmb(), false);
KCSAN_EXPECT_WRITE_BARRIER(smp_mb(), true);
KCSAN_EXPECT_WRITE_BARRIER(smp_wmb(), true);
KCSAN_EXPECT_WRITE_BARRIER(smp_rmb(), false);
KCSAN_EXPECT_WRITE_BARRIER(dma_wmb(), true);
KCSAN_EXPECT_WRITE_BARRIER(dma_rmb(), false);
KCSAN_EXPECT_WRITE_BARRIER(smp_mb__before_atomic(), true);
KCSAN_EXPECT_WRITE_BARRIER(smp_mb__after_atomic(), true);
KCSAN_EXPECT_WRITE_BARRIER(smp_mb__after_spinlock(), true);
KCSAN_EXPECT_WRITE_BARRIER(smp_store_mb(test_var, 0), true);
KCSAN_EXPECT_WRITE_BARRIER(smp_load_acquire(&test_var), false);
KCSAN_EXPECT_WRITE_BARRIER(smp_store_release(&test_var, 0), true);
KCSAN_EXPECT_WRITE_BARRIER(xchg(&test_var, 0), true);
KCSAN_EXPECT_WRITE_BARRIER(xchg_release(&test_var, 0), true);
KCSAN_EXPECT_WRITE_BARRIER(xchg_relaxed(&test_var, 0), false);
KCSAN_EXPECT_WRITE_BARRIER(cmpxchg(&test_var, 0, 0), true);
KCSAN_EXPECT_WRITE_BARRIER(cmpxchg_release(&test_var, 0, 0), true);
KCSAN_EXPECT_WRITE_BARRIER(cmpxchg_relaxed(&test_var, 0, 0), false);
KCSAN_EXPECT_WRITE_BARRIER(atomic_read(&dummy), false);
KCSAN_EXPECT_WRITE_BARRIER(atomic_read_acquire(&dummy), false);
KCSAN_EXPECT_WRITE_BARRIER(atomic_set(&dummy, 0), false);
KCSAN_EXPECT_WRITE_BARRIER(atomic_set_release(&dummy, 0), true);
KCSAN_EXPECT_WRITE_BARRIER(atomic_add(1, &dummy), false);
KCSAN_EXPECT_WRITE_BARRIER(atomic_add_return(1, &dummy), true);
KCSAN_EXPECT_WRITE_BARRIER(atomic_add_return_acquire(1, &dummy), false);
KCSAN_EXPECT_WRITE_BARRIER(atomic_add_return_release(1, &dummy), true);
KCSAN_EXPECT_WRITE_BARRIER(atomic_add_return_relaxed(1, &dummy), false);
KCSAN_EXPECT_WRITE_BARRIER(atomic_fetch_add(1, &dummy), true);
KCSAN_EXPECT_WRITE_BARRIER(atomic_fetch_add_acquire(1, &dummy), false);
KCSAN_EXPECT_WRITE_BARRIER(atomic_fetch_add_release(1, &dummy), true);
KCSAN_EXPECT_WRITE_BARRIER(atomic_fetch_add_relaxed(1, &dummy), false);
KCSAN_EXPECT_WRITE_BARRIER(test_and_set_bit(0, &test_var), true);
KCSAN_EXPECT_WRITE_BARRIER(test_and_clear_bit(0, &test_var), true);
KCSAN_EXPECT_WRITE_BARRIER(test_and_change_bit(0, &test_var), true);
KCSAN_EXPECT_WRITE_BARRIER(clear_bit_unlock(0, &test_var), true);
KCSAN_EXPECT_WRITE_BARRIER(__clear_bit_unlock(0, &test_var), true);
KCSAN_EXPECT_WRITE_BARRIER(arch_spin_lock(&arch_spinlock), false);
KCSAN_EXPECT_WRITE_BARRIER(arch_spin_unlock(&arch_spinlock), true);
KCSAN_EXPECT_WRITE_BARRIER(spin_lock(&test_spinlock), false);
KCSAN_EXPECT_WRITE_BARRIER(spin_unlock(&test_spinlock), true);
KCSAN_EXPECT_WRITE_BARRIER(mutex_lock(&test_mutex), false);
KCSAN_EXPECT_WRITE_BARRIER(mutex_unlock(&test_mutex), true);
KCSAN_EXPECT_RW_BARRIER(mb(), true);
KCSAN_EXPECT_RW_BARRIER(wmb(), true);
KCSAN_EXPECT_RW_BARRIER(rmb(), true);
KCSAN_EXPECT_RW_BARRIER(smp_mb(), true);
KCSAN_EXPECT_RW_BARRIER(smp_wmb(), true);
KCSAN_EXPECT_RW_BARRIER(smp_rmb(), true);
KCSAN_EXPECT_RW_BARRIER(dma_wmb(), true);
KCSAN_EXPECT_RW_BARRIER(dma_rmb(), true);
KCSAN_EXPECT_RW_BARRIER(smp_mb__before_atomic(), true);
KCSAN_EXPECT_RW_BARRIER(smp_mb__after_atomic(), true);
KCSAN_EXPECT_RW_BARRIER(smp_mb__after_spinlock(), true);
KCSAN_EXPECT_RW_BARRIER(smp_store_mb(test_var, 0), true);
KCSAN_EXPECT_RW_BARRIER(smp_load_acquire(&test_var), false);
KCSAN_EXPECT_RW_BARRIER(smp_store_release(&test_var, 0), true);
KCSAN_EXPECT_RW_BARRIER(xchg(&test_var, 0), true);
KCSAN_EXPECT_RW_BARRIER(xchg_release(&test_var, 0), true);
KCSAN_EXPECT_RW_BARRIER(xchg_relaxed(&test_var, 0), false);
KCSAN_EXPECT_RW_BARRIER(cmpxchg(&test_var, 0, 0), true);
KCSAN_EXPECT_RW_BARRIER(cmpxchg_release(&test_var, 0, 0), true);
KCSAN_EXPECT_RW_BARRIER(cmpxchg_relaxed(&test_var, 0, 0), false);
KCSAN_EXPECT_RW_BARRIER(atomic_read(&dummy), false);
KCSAN_EXPECT_RW_BARRIER(atomic_read_acquire(&dummy), false);
KCSAN_EXPECT_RW_BARRIER(atomic_set(&dummy, 0), false);
KCSAN_EXPECT_RW_BARRIER(atomic_set_release(&dummy, 0), true);
KCSAN_EXPECT_RW_BARRIER(atomic_add(1, &dummy), false);
KCSAN_EXPECT_RW_BARRIER(atomic_add_return(1, &dummy), true);
KCSAN_EXPECT_RW_BARRIER(atomic_add_return_acquire(1, &dummy), false);
KCSAN_EXPECT_RW_BARRIER(atomic_add_return_release(1, &dummy), true);
KCSAN_EXPECT_RW_BARRIER(atomic_add_return_relaxed(1, &dummy), false);
KCSAN_EXPECT_RW_BARRIER(atomic_fetch_add(1, &dummy), true);
KCSAN_EXPECT_RW_BARRIER(atomic_fetch_add_acquire(1, &dummy), false);
KCSAN_EXPECT_RW_BARRIER(atomic_fetch_add_release(1, &dummy), true);
KCSAN_EXPECT_RW_BARRIER(atomic_fetch_add_relaxed(1, &dummy), false);
KCSAN_EXPECT_RW_BARRIER(test_and_set_bit(0, &test_var), true);
KCSAN_EXPECT_RW_BARRIER(test_and_clear_bit(0, &test_var), true);
KCSAN_EXPECT_RW_BARRIER(test_and_change_bit(0, &test_var), true);
KCSAN_EXPECT_RW_BARRIER(clear_bit_unlock(0, &test_var), true);
KCSAN_EXPECT_RW_BARRIER(__clear_bit_unlock(0, &test_var), true);
KCSAN_EXPECT_RW_BARRIER(arch_spin_lock(&arch_spinlock), false);
KCSAN_EXPECT_RW_BARRIER(arch_spin_unlock(&arch_spinlock), true);
KCSAN_EXPECT_RW_BARRIER(spin_lock(&test_spinlock), false);
KCSAN_EXPECT_RW_BARRIER(spin_unlock(&test_spinlock), true);
KCSAN_EXPECT_RW_BARRIER(mutex_lock(&test_mutex), false);
KCSAN_EXPECT_RW_BARRIER(mutex_unlock(&test_mutex), true);
#ifdef clear_bit_unlock_is_negative_byte
KCSAN_EXPECT_READ_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true);
KCSAN_EXPECT_WRITE_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true);
KCSAN_EXPECT_RW_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var), true);
#endif
kcsan_nestable_atomic_end();
}
/* Simple test with normal data race. */
__no_kcsan
static void test_basic(struct kunit *test)
{
const struct expect_report expect = {
struct expect_report expect = {
.access = {
{ test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
},
};
static const struct expect_report never = {
struct expect_report never = {
.access = {
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
@ -459,14 +738,14 @@ static void test_basic(struct kunit *test)
__no_kcsan
static void test_concurrent_races(struct kunit *test)
{
const struct expect_report expect = {
struct expect_report expect = {
.access = {
/* NULL will match any address. */
{ test_kernel_rmw_array, NULL, 0, __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
{ test_kernel_rmw_array, NULL, 0, __KCSAN_ACCESS_RW(0) },
},
};
static const struct expect_report never = {
struct expect_report never = {
.access = {
{ test_kernel_rmw_array, NULL, 0, 0 },
{ test_kernel_rmw_array, NULL, 0, 0 },
@ -488,17 +767,24 @@ static void test_concurrent_races(struct kunit *test)
__no_kcsan
static void test_novalue_change(struct kunit *test)
{
const struct expect_report expect = {
struct expect_report expect_rw = {
.access = {
{ test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
},
};
struct expect_report expect_ww = {
.access = {
{ test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
{ test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
},
};
bool match_expect = false;
test_kernel_write_nochange(); /* Reset value. */
begin_test_checks(test_kernel_write_nochange, test_kernel_read);
do {
match_expect = report_matches(&expect);
match_expect = report_matches(&expect_rw) || report_matches(&expect_ww);
} while (!end_test_checks(match_expect));
if (IS_ENABLED(CONFIG_KCSAN_REPORT_VALUE_CHANGE_ONLY))
KUNIT_EXPECT_FALSE(test, match_expect);
@ -513,17 +799,24 @@ static void test_novalue_change(struct kunit *test)
__no_kcsan
static void test_novalue_change_exception(struct kunit *test)
{
const struct expect_report expect = {
struct expect_report expect_rw = {
.access = {
{ test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
},
};
struct expect_report expect_ww = {
.access = {
{ test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
{ test_kernel_write_nochange_rcu, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
},
};
bool match_expect = false;
test_kernel_write_nochange_rcu(); /* Reset value. */
begin_test_checks(test_kernel_write_nochange_rcu, test_kernel_read);
do {
match_expect = report_matches(&expect);
match_expect = report_matches(&expect_rw) || report_matches(&expect_ww);
} while (!end_test_checks(match_expect));
KUNIT_EXPECT_TRUE(test, match_expect);
}
@ -532,7 +825,7 @@ static void test_novalue_change_exception(struct kunit *test)
__no_kcsan
static void test_unknown_origin(struct kunit *test)
{
const struct expect_report expect = {
struct expect_report expect = {
.access = {
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
{ NULL },
@ -554,7 +847,7 @@ static void test_unknown_origin(struct kunit *test)
__no_kcsan
static void test_write_write_assume_atomic(struct kunit *test)
{
const struct expect_report expect = {
struct expect_report expect = {
.access = {
{ test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
{ test_kernel_write, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
@ -580,7 +873,7 @@ static void test_write_write_assume_atomic(struct kunit *test)
__no_kcsan
static void test_write_write_struct(struct kunit *test)
{
const struct expect_report expect = {
struct expect_report expect = {
.access = {
{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
@ -602,7 +895,7 @@ static void test_write_write_struct(struct kunit *test)
__no_kcsan
static void test_write_write_struct_part(struct kunit *test)
{
const struct expect_report expect = {
struct expect_report expect = {
.access = {
{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
{ test_kernel_write_struct_part, &test_struct.val[3], sizeof(test_struct.val[3]), KCSAN_ACCESS_WRITE },
@ -634,7 +927,7 @@ static void test_read_atomic_write_atomic(struct kunit *test)
__no_kcsan
static void test_read_plain_atomic_write(struct kunit *test)
{
const struct expect_report expect = {
struct expect_report expect = {
.access = {
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
{ test_kernel_write_atomic, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC },
@ -642,8 +935,7 @@ static void test_read_plain_atomic_write(struct kunit *test)
};
bool match_expect = false;
if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS))
return;
KCSAN_TEST_REQUIRES(test, !IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS));
begin_test_checks(test_kernel_read, test_kernel_write_atomic);
do {
@ -656,7 +948,7 @@ static void test_read_plain_atomic_write(struct kunit *test)
__no_kcsan
static void test_read_plain_atomic_rmw(struct kunit *test)
{
const struct expect_report expect = {
struct expect_report expect = {
.access = {
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
{ test_kernel_atomic_rmw, &test_var, sizeof(test_var),
@ -665,8 +957,7 @@ static void test_read_plain_atomic_rmw(struct kunit *test)
};
bool match_expect = false;
if (IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS))
return;
KCSAN_TEST_REQUIRES(test, !IS_ENABLED(CONFIG_KCSAN_IGNORE_ATOMICS));
begin_test_checks(test_kernel_read, test_kernel_atomic_rmw);
do {
@ -679,13 +970,13 @@ static void test_read_plain_atomic_rmw(struct kunit *test)
__no_kcsan
static void test_zero_size_access(struct kunit *test)
{
const struct expect_report expect = {
struct expect_report expect = {
.access = {
{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
},
};
const struct expect_report never = {
struct expect_report never = {
.access = {
{ test_kernel_write_struct, &test_struct, sizeof(test_struct), KCSAN_ACCESS_WRITE },
{ test_kernel_read_struct_zero_size, &test_struct.val[3], 0, 0 },
@ -719,7 +1010,7 @@ static void test_data_race(struct kunit *test)
__no_kcsan
static void test_assert_exclusive_writer(struct kunit *test)
{
const struct expect_report expect = {
struct expect_report expect = {
.access = {
{ test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
{ test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
@ -737,7 +1028,7 @@ static void test_assert_exclusive_writer(struct kunit *test)
__no_kcsan
static void test_assert_exclusive_access(struct kunit *test)
{
const struct expect_report expect = {
struct expect_report expect = {
.access = {
{ test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
@ -755,19 +1046,19 @@ static void test_assert_exclusive_access(struct kunit *test)
__no_kcsan
static void test_assert_exclusive_access_writer(struct kunit *test)
{
const struct expect_report expect_access_writer = {
struct expect_report expect_access_writer = {
.access = {
{ test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
{ test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
},
};
const struct expect_report expect_access_access = {
struct expect_report expect_access_access = {
.access = {
{ test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
{ test_kernel_assert_access, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE },
},
};
const struct expect_report never = {
struct expect_report never = {
.access = {
{ test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
{ test_kernel_assert_writer, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
@ -791,7 +1082,7 @@ static void test_assert_exclusive_access_writer(struct kunit *test)
__no_kcsan
static void test_assert_exclusive_bits_change(struct kunit *test)
{
const struct expect_report expect = {
struct expect_report expect = {
.access = {
{ test_kernel_assert_bits_change, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT },
{ test_kernel_change_bits, &test_var, sizeof(test_var),
@ -822,43 +1113,43 @@ static void test_assert_exclusive_bits_nochange(struct kunit *test)
__no_kcsan
static void test_assert_exclusive_writer_scoped(struct kunit *test)
{
const struct expect_report expect_start = {
struct expect_report expect_start = {
.access = {
{ test_kernel_assert_writer_scoped, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_SCOPED },
{ test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
},
};
const struct expect_report expect_anywhere = {
struct expect_report expect_inscope = {
.access = {
{ test_enter_scope, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_SCOPED },
{ test_kernel_write_nochange, &test_var, sizeof(test_var), KCSAN_ACCESS_WRITE },
},
};
bool match_expect_start = false;
bool match_expect_anywhere = false;
bool match_expect_inscope = false;
begin_test_checks(test_kernel_assert_writer_scoped, test_kernel_write_nochange);
do {
match_expect_start |= report_matches(&expect_start);
match_expect_anywhere |= report_matches(&expect_anywhere);
} while (!end_test_checks(match_expect_start && match_expect_anywhere));
match_expect_inscope |= report_matches(&expect_inscope);
} while (!end_test_checks(match_expect_inscope));
KUNIT_EXPECT_TRUE(test, match_expect_start);
KUNIT_EXPECT_TRUE(test, match_expect_anywhere);
KUNIT_EXPECT_FALSE(test, match_expect_inscope);
}
__no_kcsan
static void test_assert_exclusive_access_scoped(struct kunit *test)
{
const struct expect_report expect_start1 = {
struct expect_report expect_start1 = {
.access = {
{ test_kernel_assert_access_scoped, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_SCOPED },
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
},
};
const struct expect_report expect_start2 = {
struct expect_report expect_start2 = {
.access = { expect_start1.access[0], expect_start1.access[0] },
};
const struct expect_report expect_inscope = {
struct expect_report expect_inscope = {
.access = {
{ test_enter_scope, &test_var, sizeof(test_var), KCSAN_ACCESS_ASSERT | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_SCOPED },
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
@ -872,9 +1163,9 @@ static void test_assert_exclusive_access_scoped(struct kunit *test)
do {
match_expect_start |= report_matches(&expect_start1) || report_matches(&expect_start2);
match_expect_inscope |= report_matches(&expect_inscope);
} while (!end_test_checks(match_expect_start && match_expect_inscope));
} while (!end_test_checks(match_expect_inscope));
KUNIT_EXPECT_TRUE(test, match_expect_start);
KUNIT_EXPECT_TRUE(test, match_expect_inscope);
KUNIT_EXPECT_FALSE(test, match_expect_inscope);
}
/*
@ -963,7 +1254,7 @@ static void test_atomic_builtins(struct kunit *test)
__no_kcsan
static void test_1bit_value_change(struct kunit *test)
{
const struct expect_report expect = {
struct expect_report expect = {
.access = {
{ test_kernel_read, &test_var, sizeof(test_var), 0 },
{ test_kernel_xor_1bit, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
@ -983,6 +1274,90 @@ static void test_1bit_value_change(struct kunit *test)
KUNIT_EXPECT_TRUE(test, match);
}
__no_kcsan
static void test_correct_barrier(struct kunit *test)
{
struct expect_report expect = {
.access = {
{ test_kernel_with_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
{ test_kernel_with_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(0) },
},
};
bool match_expect = false;
test_struct.val[0] = 0; /* init unlocked */
begin_test_checks(test_kernel_with_memorder, test_kernel_with_memorder);
do {
match_expect = report_matches_any_reordered(&expect);
} while (!end_test_checks(match_expect));
KUNIT_EXPECT_FALSE(test, match_expect);
}
__no_kcsan
static void test_missing_barrier(struct kunit *test)
{
struct expect_report expect = {
.access = {
{ test_kernel_wrong_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
{ test_kernel_wrong_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(0) },
},
};
bool match_expect = false;
test_struct.val[0] = 0; /* init unlocked */
begin_test_checks(test_kernel_wrong_memorder, test_kernel_wrong_memorder);
do {
match_expect = report_matches_any_reordered(&expect);
} while (!end_test_checks(match_expect));
if (IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY))
KUNIT_EXPECT_TRUE(test, match_expect);
else
KUNIT_EXPECT_FALSE(test, match_expect);
}
__no_kcsan
static void test_atomic_builtins_correct_barrier(struct kunit *test)
{
struct expect_report expect = {
.access = {
{ test_kernel_atomic_builtin_with_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
{ test_kernel_atomic_builtin_with_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(0) },
},
};
bool match_expect = false;
test_struct.val[0] = 0; /* init unlocked */
begin_test_checks(test_kernel_atomic_builtin_with_memorder,
test_kernel_atomic_builtin_with_memorder);
do {
match_expect = report_matches_any_reordered(&expect);
} while (!end_test_checks(match_expect));
KUNIT_EXPECT_FALSE(test, match_expect);
}
__no_kcsan
static void test_atomic_builtins_missing_barrier(struct kunit *test)
{
struct expect_report expect = {
.access = {
{ test_kernel_atomic_builtin_wrong_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(KCSAN_ACCESS_WRITE) },
{ test_kernel_atomic_builtin_wrong_memorder, &test_var, sizeof(test_var), __KCSAN_ACCESS_RW(0) },
},
};
bool match_expect = false;
test_struct.val[0] = 0; /* init unlocked */
begin_test_checks(test_kernel_atomic_builtin_wrong_memorder,
test_kernel_atomic_builtin_wrong_memorder);
do {
match_expect = report_matches_any_reordered(&expect);
} while (!end_test_checks(match_expect));
if (IS_ENABLED(CONFIG_KCSAN_WEAK_MEMORY))
KUNIT_EXPECT_TRUE(test, match_expect);
else
KUNIT_EXPECT_FALSE(test, match_expect);
}
/*
* Generate thread counts for all test cases. Values generated are in interval
* [2, 5] followed by exponentially increasing thread counts from 8 to 32.
@ -1032,6 +1407,7 @@ static const void *nthreads_gen_params(const void *prev, char *desc)
#define KCSAN_KUNIT_CASE(test_name) KUNIT_CASE_PARAM(test_name, nthreads_gen_params)
static struct kunit_case kcsan_test_cases[] = {
KUNIT_CASE(test_barrier_nothreads),
KCSAN_KUNIT_CASE(test_basic),
KCSAN_KUNIT_CASE(test_concurrent_races),
KCSAN_KUNIT_CASE(test_novalue_change),
@ -1056,6 +1432,10 @@ static struct kunit_case kcsan_test_cases[] = {
KCSAN_KUNIT_CASE(test_seqlock_noreport),
KCSAN_KUNIT_CASE(test_atomic_builtins),
KCSAN_KUNIT_CASE(test_1bit_value_change),
KCSAN_KUNIT_CASE(test_correct_barrier),
KCSAN_KUNIT_CASE(test_missing_barrier),
KCSAN_KUNIT_CASE(test_atomic_builtins_correct_barrier),
KCSAN_KUNIT_CASE(test_atomic_builtins_missing_barrier),
{},
};
@ -1120,6 +1500,9 @@ static int test_init(struct kunit *test)
observed.nlines = 0;
spin_unlock_irqrestore(&observed.lock, flags);
if (strstr(test->name, "nothreads"))
return 0;
if (!torture_init_begin((char *)test->name, 1))
return -EBUSY;
@ -1162,6 +1545,9 @@ static void test_exit(struct kunit *test)
struct task_struct **stop_thread;
int i;
if (strstr(test->name, "nothreads"))
return;
if (torture_cleanup_begin())
return;
@ -1224,7 +1610,7 @@ static void kcsan_test_exit(void)
tracepoint_synchronize_unregister();
}
late_initcall(kcsan_test_init);
late_initcall_sync(kcsan_test_init);
module_exit(kcsan_test_exit);
MODULE_LICENSE("GPL v2");

View File

@ -8,6 +8,7 @@
#include <linux/debug_locks.h>
#include <linux/delay.h>
#include <linux/jiffies.h>
#include <linux/kallsyms.h>
#include <linux/kernel.h>
#include <linux/lockdep.h>
#include <linux/preempt.h>
@ -31,6 +32,7 @@ struct access_info {
int access_type;
int task_pid;
int cpu_id;
unsigned long ip;
};
/*
@ -213,9 +215,9 @@ static const char *get_access_type(int type)
if (type & KCSAN_ACCESS_ASSERT) {
if (type & KCSAN_ACCESS_SCOPED) {
if (type & KCSAN_ACCESS_WRITE)
return "assert no accesses (scoped)";
return "assert no accesses (reordered)";
else
return "assert no writes (scoped)";
return "assert no writes (reordered)";
} else {
if (type & KCSAN_ACCESS_WRITE)
return "assert no accesses";
@ -238,13 +240,17 @@ static const char *get_access_type(int type)
case KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC:
return "read-write (marked)";
case KCSAN_ACCESS_SCOPED:
return "read (scoped)";
return "read (reordered)";
case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_ATOMIC:
return "read (marked, scoped)";
return "read (marked, reordered)";
case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE:
return "write (scoped)";
return "write (reordered)";
case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC:
return "write (marked, scoped)";
return "write (marked, reordered)";
case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE:
return "read-write (reordered)";
case KCSAN_ACCESS_SCOPED | KCSAN_ACCESS_COMPOUND | KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ATOMIC:
return "read-write (marked, reordered)";
default:
BUG();
}
@ -300,6 +306,52 @@ static int get_stack_skipnr(const unsigned long stack_entries[], int num_entries
return skip;
}
/*
* Skips to the first entry that matches the function of @ip, and then replaces
* that entry with @ip, returning the entries to skip with @replaced containing
* the replaced entry.
*/
static int
replace_stack_entry(unsigned long stack_entries[], int num_entries, unsigned long ip,
unsigned long *replaced)
{
unsigned long symbolsize, offset;
unsigned long target_func;
int skip;
if (kallsyms_lookup_size_offset(ip, &symbolsize, &offset))
target_func = ip - offset;
else
goto fallback;
for (skip = 0; skip < num_entries; ++skip) {
unsigned long func = stack_entries[skip];
if (!kallsyms_lookup_size_offset(func, &symbolsize, &offset))
goto fallback;
func -= offset;
if (func == target_func) {
*replaced = stack_entries[skip];
stack_entries[skip] = ip;
return skip;
}
}
fallback:
/* Should not happen; the resulting stack trace is likely misleading. */
WARN_ONCE(1, "Cannot find frame for %pS in stack trace", (void *)ip);
return get_stack_skipnr(stack_entries, num_entries);
}
static int
sanitize_stack_entries(unsigned long stack_entries[], int num_entries, unsigned long ip,
unsigned long *replaced)
{
return ip ? replace_stack_entry(stack_entries, num_entries, ip, replaced) :
get_stack_skipnr(stack_entries, num_entries);
}
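/*
 * @ip is only non-zero for scoped/reordered accesses (see prepare_access_info()
 * below), so ordinary accesses keep the plain get_stack_skipnr() behaviour and
 * their stack traces are printed unchanged.
 */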
/* Compares symbolized strings of addr1 and addr2. */
static int sym_strcmp(void *addr1, void *addr2)
{
@ -312,6 +364,14 @@ static int sym_strcmp(void *addr1, void *addr2)
return strncmp(buf1, buf2, sizeof(buf1));
}
static void
print_stack_trace(unsigned long stack_entries[], int num_entries, unsigned long reordered_to)
{
stack_trace_print(stack_entries, num_entries, 0);
if (reordered_to)
pr_err(" |\n +-> reordered to: %pS\n", (void *)reordered_to);
}
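/*
 * With a non-zero @reordered_to the stack trace is followed by a marker of
 * roughly the form:
 *
 *  |
 *  +-> reordered to: some_function+0x10/0x20
 *
 * pointing at where the access is simulated to have been reordered to
 * (the function name and offsets here are only illustrative).
 */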
static void print_verbose_info(struct task_struct *task)
{
if (!task)
@ -327,13 +387,15 @@ static void print_verbose_info(struct task_struct *task)
static void print_report(enum kcsan_value_change value_change,
const struct access_info *ai,
const struct other_info *other_info,
struct other_info *other_info,
u64 old, u64 new, u64 mask)
{
unsigned long reordered_to = 0;
unsigned long stack_entries[NUM_STACK_ENTRIES] = { 0 };
int num_stack_entries = stack_trace_save(stack_entries, NUM_STACK_ENTRIES, 1);
int skipnr = get_stack_skipnr(stack_entries, num_stack_entries);
int skipnr = sanitize_stack_entries(stack_entries, num_stack_entries, ai->ip, &reordered_to);
unsigned long this_frame = stack_entries[skipnr];
unsigned long other_reordered_to = 0;
unsigned long other_frame = 0;
int other_skipnr = 0; /* silence uninit warnings */
@ -344,8 +406,9 @@ static void print_report(enum kcsan_value_change value_change,
return;
if (other_info) {
other_skipnr = get_stack_skipnr(other_info->stack_entries,
other_info->num_stack_entries);
other_skipnr = sanitize_stack_entries(other_info->stack_entries,
other_info->num_stack_entries,
other_info->ai.ip, &other_reordered_to);
other_frame = other_info->stack_entries[other_skipnr];
/* @value_change is only known for the other thread */
@ -385,10 +448,9 @@ static void print_report(enum kcsan_value_change value_change,
other_info->ai.cpu_id);
/* Print the other thread's stack trace. */
stack_trace_print(other_info->stack_entries + other_skipnr,
print_stack_trace(other_info->stack_entries + other_skipnr,
other_info->num_stack_entries - other_skipnr,
0);
other_reordered_to);
if (IS_ENABLED(CONFIG_KCSAN_VERBOSE))
print_verbose_info(other_info->task);
@ -402,9 +464,7 @@ static void print_report(enum kcsan_value_change value_change,
get_thread_desc(ai->task_pid), ai->cpu_id);
}
/* Print stack trace of this thread. */
stack_trace_print(stack_entries + skipnr, num_stack_entries - skipnr,
0);
print_stack_trace(stack_entries + skipnr, num_stack_entries - skipnr, reordered_to);
if (IS_ENABLED(CONFIG_KCSAN_VERBOSE))
print_verbose_info(current);
@ -576,21 +636,23 @@ static bool prepare_report_consumer(unsigned long *flags,
}
static struct access_info prepare_access_info(const volatile void *ptr, size_t size,
int access_type)
int access_type, unsigned long ip)
{
return (struct access_info) {
.ptr = ptr,
.size = size,
.access_type = access_type,
.task_pid = in_task() ? task_pid_nr(current) : -1,
.cpu_id = raw_smp_processor_id()
.cpu_id = raw_smp_processor_id(),
/* Only replace stack entry with @ip if scoped access. */
.ip = (access_type & KCSAN_ACCESS_SCOPED) ? ip : 0,
};
}
void kcsan_report_set_info(const volatile void *ptr, size_t size, int access_type,
int watchpoint_idx)
unsigned long ip, int watchpoint_idx)
{
const struct access_info ai = prepare_access_info(ptr, size, access_type);
const struct access_info ai = prepare_access_info(ptr, size, access_type, ip);
unsigned long flags;
kcsan_disable_current();
@ -603,10 +665,10 @@ void kcsan_report_set_info(const volatile void *ptr, size_t size, int access_typ
}
void kcsan_report_known_origin(const volatile void *ptr, size_t size, int access_type,
enum kcsan_value_change value_change, int watchpoint_idx,
u64 old, u64 new, u64 mask)
unsigned long ip, enum kcsan_value_change value_change,
int watchpoint_idx, u64 old, u64 new, u64 mask)
{
const struct access_info ai = prepare_access_info(ptr, size, access_type);
const struct access_info ai = prepare_access_info(ptr, size, access_type, ip);
struct other_info *other_info = &other_infos[watchpoint_idx];
unsigned long flags = 0;
@ -637,9 +699,9 @@ void kcsan_report_known_origin(const volatile void *ptr, size_t size, int access
}
void kcsan_report_unknown_origin(const volatile void *ptr, size_t size, int access_type,
u64 old, u64 new, u64 mask)
unsigned long ip, u64 old, u64 new, u64 mask)
{
const struct access_info ai = prepare_access_info(ptr, size, access_type);
const struct access_info ai = prepare_access_info(ptr, size, access_type, ip);
unsigned long flags;
kcsan_disable_current();

View File

@ -7,10 +7,15 @@
#define pr_fmt(fmt) "kcsan: " fmt
#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/init.h>
#include <linux/kcsan-checks.h>
#include <linux/kernel.h>
#include <linux/printk.h>
#include <linux/random.h>
#include <linux/sched.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include "encoding.h"
@ -18,7 +23,7 @@
#define ITERS_PER_TEST 2000
/* Test requirements. */
static bool test_requires(void)
static bool __init test_requires(void)
{
/* random should be initialized for the below tests */
return prandom_u32() + prandom_u32() != 0;
@ -28,14 +33,18 @@ static bool test_requires(void)
* Test watchpoint encode and decode: check that encoding some access's info,
* and then subsequent decode preserves the access's info.
*/
static bool test_encode_decode(void)
static bool __init test_encode_decode(void)
{
int i;
for (i = 0; i < ITERS_PER_TEST; ++i) {
size_t size = prandom_u32_max(MAX_ENCODABLE_SIZE) + 1;
bool is_write = !!prandom_u32_max(2);
unsigned long verif_masked_addr;
long encoded_watchpoint;
bool verif_is_write;
unsigned long addr;
size_t verif_size;
prandom_bytes(&addr, sizeof(addr));
if (addr < PAGE_SIZE)
@ -44,31 +53,18 @@ static bool test_encode_decode(void)
if (WARN_ON(!check_encodable(addr, size)))
return false;
/* Encode and decode */
{
const long encoded_watchpoint =
encode_watchpoint(addr, size, is_write);
unsigned long verif_masked_addr;
size_t verif_size;
bool verif_is_write;
encoded_watchpoint = encode_watchpoint(addr, size, is_write);
/* Check special watchpoints */
if (WARN_ON(decode_watchpoint(
INVALID_WATCHPOINT, &verif_masked_addr,
&verif_size, &verif_is_write)))
if (WARN_ON(decode_watchpoint(INVALID_WATCHPOINT, &verif_masked_addr, &verif_size, &verif_is_write)))
return false;
if (WARN_ON(decode_watchpoint(
CONSUMED_WATCHPOINT, &verif_masked_addr,
&verif_size, &verif_is_write)))
if (WARN_ON(decode_watchpoint(CONSUMED_WATCHPOINT, &verif_masked_addr, &verif_size, &verif_is_write)))
return false;
/* Check decoding watchpoint returns same data */
if (WARN_ON(!decode_watchpoint(
encoded_watchpoint, &verif_masked_addr,
&verif_size, &verif_is_write)))
if (WARN_ON(!decode_watchpoint(encoded_watchpoint, &verif_masked_addr, &verif_size, &verif_is_write)))
return false;
if (WARN_ON(verif_masked_addr !=
(addr & WATCHPOINT_ADDR_MASK)))
if (WARN_ON(verif_masked_addr != (addr & WATCHPOINT_ADDR_MASK)))
goto fail;
if (WARN_ON(verif_size != size))
goto fail;
@ -78,19 +74,16 @@ static bool test_encode_decode(void)
continue;
fail:
pr_err("%s fail: %s %zu bytes @ %lx -> encoded: %lx -> %s %zu bytes @ %lx\n",
__func__, is_write ? "write" : "read", size,
addr, encoded_watchpoint,
verif_is_write ? "write" : "read", verif_size,
verif_masked_addr);
__func__, is_write ? "write" : "read", size, addr, encoded_watchpoint,
verif_is_write ? "write" : "read", verif_size, verif_masked_addr);
return false;
}
}
return true;
}
/* Test access matching function. */
static bool test_matching_access(void)
static bool __init test_matching_access(void)
{
if (WARN_ON(!matching_access(10, 1, 10, 1)))
return false;
@ -115,6 +108,143 @@ static bool test_matching_access(void)
return true;
}
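matching_access() simply tests whether the two byte ranges [addr1, addr1+size1) and [addr2, addr2+size2) overlap; a hypothetical open-coded equivalent of the property being exercised:

	/* Hypothetical equivalent of the overlap test (ignoring wrap-around):
	 * e.g. matching_access(10, 1, 10, 1) -> true, disjoint ranges -> false.
	 */
	static bool ranges_overlap(unsigned long a1, size_t s1, unsigned long a2, size_t s2)
	{
		return a1 < a2 + s2 && a2 < a1 + s1;
	}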
/*
* Correct memory barrier instrumentation is critical to avoiding false
* positives: simple test to check at boot certain barriers are always properly
* instrumented. See kcsan_test for a more complete test.
*/
static DEFINE_SPINLOCK(test_spinlock);
static bool __init test_barrier(void)
{
#ifdef CONFIG_KCSAN_WEAK_MEMORY
struct kcsan_scoped_access *reorder_access = &current->kcsan_ctx.reorder_access;
#else
struct kcsan_scoped_access *reorder_access = NULL;
#endif
bool ret = true;
arch_spinlock_t arch_spinlock = __ARCH_SPIN_LOCK_UNLOCKED;
atomic_t dummy;
long test_var;
if (!reorder_access || !IS_ENABLED(CONFIG_SMP))
return true;
#define __KCSAN_CHECK_BARRIER(access_type, barrier, name) \
do { \
reorder_access->type = (access_type) | KCSAN_ACCESS_SCOPED; \
reorder_access->size = 1; \
barrier; \
if (reorder_access->size != 0) { \
pr_err("improperly instrumented type=(" #access_type "): " name "\n"); \
ret = false; \
} \
} while (0)
#define KCSAN_CHECK_READ_BARRIER(b) __KCSAN_CHECK_BARRIER(0, b, #b)
#define KCSAN_CHECK_WRITE_BARRIER(b) __KCSAN_CHECK_BARRIER(KCSAN_ACCESS_WRITE, b, #b)
#define KCSAN_CHECK_RW_BARRIER(b) __KCSAN_CHECK_BARRIER(KCSAN_ACCESS_WRITE | KCSAN_ACCESS_COMPOUND, b, #b)
kcsan_nestable_atomic_begin(); /* No watchpoints in called functions. */
KCSAN_CHECK_READ_BARRIER(mb());
KCSAN_CHECK_READ_BARRIER(rmb());
KCSAN_CHECK_READ_BARRIER(smp_mb());
KCSAN_CHECK_READ_BARRIER(smp_rmb());
KCSAN_CHECK_READ_BARRIER(dma_rmb());
KCSAN_CHECK_READ_BARRIER(smp_mb__before_atomic());
KCSAN_CHECK_READ_BARRIER(smp_mb__after_atomic());
KCSAN_CHECK_READ_BARRIER(smp_mb__after_spinlock());
KCSAN_CHECK_READ_BARRIER(smp_store_mb(test_var, 0));
KCSAN_CHECK_READ_BARRIER(smp_store_release(&test_var, 0));
KCSAN_CHECK_READ_BARRIER(xchg(&test_var, 0));
KCSAN_CHECK_READ_BARRIER(xchg_release(&test_var, 0));
KCSAN_CHECK_READ_BARRIER(cmpxchg(&test_var, 0, 0));
KCSAN_CHECK_READ_BARRIER(cmpxchg_release(&test_var, 0, 0));
KCSAN_CHECK_READ_BARRIER(atomic_set_release(&dummy, 0));
KCSAN_CHECK_READ_BARRIER(atomic_add_return(1, &dummy));
KCSAN_CHECK_READ_BARRIER(atomic_add_return_release(1, &dummy));
KCSAN_CHECK_READ_BARRIER(atomic_fetch_add(1, &dummy));
KCSAN_CHECK_READ_BARRIER(atomic_fetch_add_release(1, &dummy));
KCSAN_CHECK_READ_BARRIER(test_and_set_bit(0, &test_var));
KCSAN_CHECK_READ_BARRIER(test_and_clear_bit(0, &test_var));
KCSAN_CHECK_READ_BARRIER(test_and_change_bit(0, &test_var));
KCSAN_CHECK_READ_BARRIER(clear_bit_unlock(0, &test_var));
KCSAN_CHECK_READ_BARRIER(__clear_bit_unlock(0, &test_var));
arch_spin_lock(&arch_spinlock);
KCSAN_CHECK_READ_BARRIER(arch_spin_unlock(&arch_spinlock));
spin_lock(&test_spinlock);
KCSAN_CHECK_READ_BARRIER(spin_unlock(&test_spinlock));
KCSAN_CHECK_WRITE_BARRIER(mb());
KCSAN_CHECK_WRITE_BARRIER(wmb());
KCSAN_CHECK_WRITE_BARRIER(smp_mb());
KCSAN_CHECK_WRITE_BARRIER(smp_wmb());
KCSAN_CHECK_WRITE_BARRIER(dma_wmb());
KCSAN_CHECK_WRITE_BARRIER(smp_mb__before_atomic());
KCSAN_CHECK_WRITE_BARRIER(smp_mb__after_atomic());
KCSAN_CHECK_WRITE_BARRIER(smp_mb__after_spinlock());
KCSAN_CHECK_WRITE_BARRIER(smp_store_mb(test_var, 0));
KCSAN_CHECK_WRITE_BARRIER(smp_store_release(&test_var, 0));
KCSAN_CHECK_WRITE_BARRIER(xchg(&test_var, 0));
KCSAN_CHECK_WRITE_BARRIER(xchg_release(&test_var, 0));
KCSAN_CHECK_WRITE_BARRIER(cmpxchg(&test_var, 0, 0));
KCSAN_CHECK_WRITE_BARRIER(cmpxchg_release(&test_var, 0, 0));
KCSAN_CHECK_WRITE_BARRIER(atomic_set_release(&dummy, 0));
KCSAN_CHECK_WRITE_BARRIER(atomic_add_return(1, &dummy));
KCSAN_CHECK_WRITE_BARRIER(atomic_add_return_release(1, &dummy));
KCSAN_CHECK_WRITE_BARRIER(atomic_fetch_add(1, &dummy));
KCSAN_CHECK_WRITE_BARRIER(atomic_fetch_add_release(1, &dummy));
KCSAN_CHECK_WRITE_BARRIER(test_and_set_bit(0, &test_var));
KCSAN_CHECK_WRITE_BARRIER(test_and_clear_bit(0, &test_var));
KCSAN_CHECK_WRITE_BARRIER(test_and_change_bit(0, &test_var));
KCSAN_CHECK_WRITE_BARRIER(clear_bit_unlock(0, &test_var));
KCSAN_CHECK_WRITE_BARRIER(__clear_bit_unlock(0, &test_var));
arch_spin_lock(&arch_spinlock);
KCSAN_CHECK_WRITE_BARRIER(arch_spin_unlock(&arch_spinlock));
spin_lock(&test_spinlock);
KCSAN_CHECK_WRITE_BARRIER(spin_unlock(&test_spinlock));
KCSAN_CHECK_RW_BARRIER(mb());
KCSAN_CHECK_RW_BARRIER(wmb());
KCSAN_CHECK_RW_BARRIER(rmb());
KCSAN_CHECK_RW_BARRIER(smp_mb());
KCSAN_CHECK_RW_BARRIER(smp_wmb());
KCSAN_CHECK_RW_BARRIER(smp_rmb());
KCSAN_CHECK_RW_BARRIER(dma_wmb());
KCSAN_CHECK_RW_BARRIER(dma_rmb());
KCSAN_CHECK_RW_BARRIER(smp_mb__before_atomic());
KCSAN_CHECK_RW_BARRIER(smp_mb__after_atomic());
KCSAN_CHECK_RW_BARRIER(smp_mb__after_spinlock());
KCSAN_CHECK_RW_BARRIER(smp_store_mb(test_var, 0));
KCSAN_CHECK_RW_BARRIER(smp_store_release(&test_var, 0));
KCSAN_CHECK_RW_BARRIER(xchg(&test_var, 0));
KCSAN_CHECK_RW_BARRIER(xchg_release(&test_var, 0));
KCSAN_CHECK_RW_BARRIER(cmpxchg(&test_var, 0, 0));
KCSAN_CHECK_RW_BARRIER(cmpxchg_release(&test_var, 0, 0));
KCSAN_CHECK_RW_BARRIER(atomic_set_release(&dummy, 0));
KCSAN_CHECK_RW_BARRIER(atomic_add_return(1, &dummy));
KCSAN_CHECK_RW_BARRIER(atomic_add_return_release(1, &dummy));
KCSAN_CHECK_RW_BARRIER(atomic_fetch_add(1, &dummy));
KCSAN_CHECK_RW_BARRIER(atomic_fetch_add_release(1, &dummy));
KCSAN_CHECK_RW_BARRIER(test_and_set_bit(0, &test_var));
KCSAN_CHECK_RW_BARRIER(test_and_clear_bit(0, &test_var));
KCSAN_CHECK_RW_BARRIER(test_and_change_bit(0, &test_var));
KCSAN_CHECK_RW_BARRIER(clear_bit_unlock(0, &test_var));
KCSAN_CHECK_RW_BARRIER(__clear_bit_unlock(0, &test_var));
arch_spin_lock(&arch_spinlock);
KCSAN_CHECK_RW_BARRIER(arch_spin_unlock(&arch_spinlock));
spin_lock(&test_spinlock);
KCSAN_CHECK_RW_BARRIER(spin_unlock(&test_spinlock));
#ifdef clear_bit_unlock_is_negative_byte
KCSAN_CHECK_RW_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var));
KCSAN_CHECK_READ_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var));
KCSAN_CHECK_WRITE_BARRIER(clear_bit_unlock_is_negative_byte(0, &test_var));
#endif
kcsan_nestable_atomic_end();
return ret;
}
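For illustration, a single check such as KCSAN_CHECK_READ_BARRIER(smp_mb()) expands to roughly the following (simplified from the macro above):

	reorder_access->type = 0 | KCSAN_ACCESS_SCOPED;	/* model a deferred plain read */
	reorder_access->size = 1;			/* pretend one access is pending */
	smp_mb();					/* a properly instrumented barrier ... */
	if (reorder_access->size != 0) {		/* ... must have flushed it back to 0 */
		pr_err("improperly instrumented type=(0): smp_mb()\n");
		ret = false;
	}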
static int __init kcsan_selftest(void)
{
int passed = 0;
@ -132,6 +262,7 @@ static int __init kcsan_selftest(void)
RUN_TEST(test_requires);
RUN_TEST(test_encode_decode);
RUN_TEST(test_matching_access);
RUN_TEST(test_barrier);
pr_info("selftest: %d/%d tests passed\n", passed, total);
if (passed != total)

View File

@ -81,7 +81,7 @@ int kexec_should_crash(struct task_struct *p)
if (crash_kexec_post_notifiers)
return 0;
/*
* There are 4 panic() calls in do_exit() path, each of which
* There are 4 panic() calls in make_task_dead() path, each of which
* corresponds to each of these 4 conditions.
*/
if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)

View File

@ -556,6 +556,11 @@ static int kexec_walk_memblock(struct kexec_buf *kbuf,
if (kbuf->image->type == KEXEC_TYPE_CRASH)
return func(&crashk_res, kbuf);
/*
* Using MEMBLOCK_NONE will properly skip MEMBLOCK_DRIVER_MANAGED. See
* IORESOURCE_SYSRAM_DRIVER_MANAGED handling in
* locate_mem_hole_callback().
*/
if (kbuf->top_down) {
for_each_free_mem_range_reverse(i, NUMA_NO_NODE, MEMBLOCK_NONE,
&mstart, &mend, NULL) {

File diff suppressed because it is too large.

View File

@ -52,6 +52,7 @@ struct kthread_create_info
struct kthread {
unsigned long flags;
unsigned int cpu;
int result;
int (*threadfn)(void *);
void *data;
mm_segment_t oldfs;
@ -60,6 +61,8 @@ struct kthread {
#ifdef CONFIG_BLK_CGROUP
struct cgroup_subsys_state *blkcg_css;
#endif
/* To store the full name if task comm is truncated. */
char *full_name;
};
enum KTHREAD_BITS {
@ -71,7 +74,7 @@ enum KTHREAD_BITS {
static inline struct kthread *to_kthread(struct task_struct *k)
{
WARN_ON(!(k->flags & PF_KTHREAD));
return (__force void *)k->set_child_tid;
return k->worker_private;
}
/*
@ -79,7 +82,7 @@ static inline struct kthread *to_kthread(struct task_struct *k)
*
* Per construction; when:
*
* (p->flags & PF_KTHREAD) && p->set_child_tid
* (p->flags & PF_KTHREAD) && p->worker_private
*
* the task is both a kthread and struct kthread is persistent. However
* PF_KTHREAD on it's own is not, kernel_thread() can exec() (See umh.c and
@ -87,26 +90,41 @@ static inline struct kthread *to_kthread(struct task_struct *k)
*/
static inline struct kthread *__to_kthread(struct task_struct *p)
{
void *kthread = (__force void *)p->set_child_tid;
void *kthread = p->worker_private;
if (kthread && !(p->flags & PF_KTHREAD))
kthread = NULL;
return kthread;
}
void set_kthread_struct(struct task_struct *p)
void get_kthread_comm(char *buf, size_t buf_size, struct task_struct *tsk)
{
struct kthread *kthread = to_kthread(tsk);
if (!kthread || !kthread->full_name) {
__get_task_comm(buf, buf_size, tsk);
return;
}
strscpy_pad(buf, kthread->full_name, buf_size);
}
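A caller that wants the untruncated name can use it as in this hypothetical reporting snippet (task is a placeholder):

	char comm[64];	/* larger than TASK_COMM_LEN so the full name fits */

	get_kthread_comm(comm, sizeof(comm), task);
	pr_info("kthread: %s\n", comm);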
bool set_kthread_struct(struct task_struct *p)
{
struct kthread *kthread;
if (__to_kthread(p))
return;
if (WARN_ON_ONCE(to_kthread(p)))
return false;
kthread = kzalloc(sizeof(*kthread), GFP_KERNEL);
/*
* We abuse ->set_child_tid to avoid the new member and because it
* can't be wrongly copied by copy_process(). We also rely on fact
* that the caller can't exec, so PF_KTHREAD can't be cleared.
*/
p->set_child_tid = (__force void __user *)kthread;
if (!kthread)
return false;
init_completion(&kthread->exited);
init_completion(&kthread->parked);
p->vfork_done = &kthread->exited;
p->worker_private = kthread;
return true;
}
void free_kthread_struct(struct task_struct *k)
@ -114,13 +132,17 @@ void free_kthread_struct(struct task_struct *k)
struct kthread *kthread;
/*
* Can be NULL if this kthread was created by kernel_thread()
* or if kmalloc() in kthread() failed.
* Can be NULL if kmalloc() in set_kthread_struct() failed.
*/
kthread = to_kthread(k);
if (!kthread)
return;
#ifdef CONFIG_BLK_CGROUP
WARN_ON_ONCE(kthread && kthread->blkcg_css);
WARN_ON_ONCE(kthread->blkcg_css);
#endif
k->worker_private = NULL;
kfree(kthread->full_name);
kfree(kthread);
}
@ -268,8 +290,47 @@ void kthread_parkme(void)
}
EXPORT_SYMBOL_GPL(kthread_parkme);
/**
* kthread_exit - Cause the current kthread return @result to kthread_stop().
* @result: The integer value to return to kthread_stop().
*
* While kthread_exit can be called directly, it exists so that
* functions which do some additional work in non-modular code such as
* module_put_and_kthread_exit can be implemented.
*
* Does not return.
*/
void __noreturn kthread_exit(long result)
{
struct kthread *kthread = to_kthread(current);
kthread->result = result;
do_exit(0);
}
/**
* kthread_complete_and_exit - Exit the current kthread.
* @comp: Completion to complete
* @code: The integer value to return to kthread_stop().
*
* If present, complete @comp and then return @code to kthread_stop().
*
* A kernel thread whose module may be removed after the completion of
* @comp can use this function to exit safely.
*
* Does not return.
*/
void __noreturn kthread_complete_and_exit(struct completion *comp, long code)
{
if (comp)
complete(comp);
kthread_exit(code);
}
EXPORT_SYMBOL(kthread_complete_and_exit);
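A module thread that must not return into code that may already be unloaded could use it roughly as follows (my_exited and my_thread_fn are hypothetical names):

	static DECLARE_COMPLETION(my_exited);

	static int my_thread_fn(void *data)
	{
		while (!kthread_should_stop())
			schedule_timeout_interruptible(HZ);

		/* Complete my_exited, then hand 0 back to kthread_stop(); never returns. */
		kthread_complete_and_exit(&my_exited, 0);
	}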
static int kthread(void *_create)
{
static const struct sched_param param = { .sched_priority = 0 };
/* Copy data: it's on kthread's stack */
struct kthread_create_info *create = _create;
int (*threadfn)(void *data) = create->threadfn;
@ -278,27 +339,24 @@ static int kthread(void *_create)
struct kthread *self;
int ret;
set_kthread_struct(current);
self = to_kthread(current);
/* If user was SIGKILLed, I release the structure. */
done = xchg(&create->done, NULL);
if (!done) {
kfree(create);
do_exit(-EINTR);
}
if (!self) {
create->result = ERR_PTR(-ENOMEM);
complete(done);
do_exit(-ENOMEM);
kthread_exit(-EINTR);
}
self->threadfn = threadfn;
self->data = data;
init_completion(&self->exited);
init_completion(&self->parked);
current->vfork_done = &self->exited;
/*
* The new thread inherited kthreadd's priority and CPU mask. Reset
* back to default in case they have been changed.
*/
sched_setscheduler_nocheck(current, SCHED_NORMAL, &param);
set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_KTHREAD));
/* OK, tell user we're spawned, wait for stop or wakeup */
__set_current_state(TASK_UNINTERRUPTIBLE);
@ -318,7 +376,7 @@ static int kthread(void *_create)
__kthread_parkme(self);
ret = threadfn(data);
}
do_exit(ret);
kthread_exit(ret);
}
/* called from kernel_clone() to get node information for about to be created task */
@ -397,22 +455,24 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
}
task = create->result;
if (!IS_ERR(task)) {
static const struct sched_param param = { .sched_priority = 0 };
char name[TASK_COMM_LEN];
va_list aq;
int len;
/*
* task is already visible to other tasks, so updating
* COMM must be protected.
*/
vsnprintf(name, sizeof(name), namefmt, args);
va_copy(aq, args);
len = vsnprintf(name, sizeof(name), namefmt, aq);
va_end(aq);
if (len >= TASK_COMM_LEN) {
struct kthread *kthread = to_kthread(task);
/* leave it truncated when out of memory. */
kthread->full_name = kvasprintf(GFP_KERNEL, namefmt, args);
}
set_task_comm(task, name);
/*
* root may have changed our (kthreadd's) priority or CPU mask.
* The kernel thread should not inherit these properties.
*/
sched_setscheduler_nocheck(task, SCHED_NORMAL, &param);
set_cpus_allowed_ptr(task,
housekeeping_cpumask(HK_FLAG_KTHREAD));
}
kfree(create);
return task;
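With this, a hypothetical long format string keeps the usual truncated comm while the full name stays available to get_kthread_comm() above:

	/* Hypothetical: comm holds only the first TASK_COMM_LEN-1 characters, but
	 * kthread->full_name keeps "my-very-long-worker-name/3" for reporting.
	 */
	kthread_run(my_worker_fn, NULL, "my-very-long-worker-name/%d", 3);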
@ -433,7 +493,7 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data),
* If thread is going to be bound on a particular cpu, give its node
* in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE.
* When woken, the thread will run @threadfn() with @data as its
* argument. @threadfn() can either call do_exit() directly if it is a
* argument. @threadfn() can either return directly if it is a
* standalone thread for which no one will call kthread_stop(), or
* return when 'kthread_should_stop()' is true (which means
* kthread_stop() has been called). The return value should be zero
@ -523,6 +583,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
to_kthread(p)->cpu = cpu;
return p;
}
EXPORT_SYMBOL(kthread_create_on_cpu);
void kthread_set_per_cpu(struct task_struct *k, int cpu)
{
@ -627,7 +688,7 @@ EXPORT_SYMBOL_GPL(kthread_park);
* instead of calling wake_up_process(): the thread will exit without
* calling threadfn().
*
* If threadfn() may call do_exit() itself, the caller must ensure
* If threadfn() may call kthread_exit() itself, the caller must ensure
* task_struct can't go away.
*
* Returns the result of threadfn(), or %-EINTR if wake_up_process()
@ -646,7 +707,7 @@ int kthread_stop(struct task_struct *k)
kthread_unpark(k);
wake_up_process(k);
wait_for_completion(&kthread->exited);
ret = k->exit_code;
ret = kthread->result;
put_task_struct(k);
trace_sched_kthread_stop_ret(ret);

View File

@ -862,14 +862,11 @@ static void klp_init_object_early(struct klp_patch *patch,
list_add_tail(&obj->node, &patch->obj_list);
}
static int klp_init_patch_early(struct klp_patch *patch)
static void klp_init_patch_early(struct klp_patch *patch)
{
struct klp_object *obj;
struct klp_func *func;
if (!patch->objs)
return -EINVAL;
INIT_LIST_HEAD(&patch->list);
INIT_LIST_HEAD(&patch->obj_list);
kobject_init(&patch->kobj, &klp_ktype_patch);
@ -879,20 +876,12 @@ static int klp_init_patch_early(struct klp_patch *patch)
init_completion(&patch->finish);
klp_for_each_object_static(patch, obj) {
if (!obj->funcs)
return -EINVAL;
klp_init_object_early(patch, obj);
klp_for_each_func_static(obj, func) {
klp_init_func_early(obj, func);
}
}
if (!try_module_get(patch->mod))
return -ENODEV;
return 0;
}
static int klp_init_patch(struct klp_patch *patch)
@ -1024,10 +1013,17 @@ static int __klp_enable_patch(struct klp_patch *patch)
int klp_enable_patch(struct klp_patch *patch)
{
int ret;
struct klp_object *obj;
if (!patch || !patch->mod)
if (!patch || !patch->mod || !patch->objs)
return -EINVAL;
klp_for_each_object_static(patch, obj) {
if (!obj->funcs)
return -EINVAL;
}
if (!is_livepatch_module(patch->mod)) {
pr_err("module %s is not marked as a livepatch module\n",
patch->mod->name);
@ -1051,12 +1047,13 @@ int klp_enable_patch(struct klp_patch *patch)
return -EINVAL;
}
ret = klp_init_patch_early(patch);
if (ret) {
if (!try_module_get(patch->mod)) {
mutex_unlock(&klp_mutex);
return ret;
return -ENODEV;
}
klp_init_patch_early(patch);
ret = klp_init_patch(patch);
if (ret)
goto err;

View File

@ -49,14 +49,15 @@ static void notrace klp_ftrace_handler(unsigned long ip,
ops = container_of(fops, struct klp_ops, fops);
/*
* The ftrace_test_recursion_trylock() will disable preemption,
* which is required for the variant of synchronize_rcu() that is
* used to allow patching functions where RCU is not watching.
* See klp_synchronize_transition() for more details.
*/
bit = ftrace_test_recursion_trylock(ip, parent_ip);
if (WARN_ON_ONCE(bit < 0))
return;
/*
* A variant of synchronize_rcu() is used to allow patching functions
* where RCU is not watching, see klp_synchronize_transition().
*/
preempt_disable_notrace();
func = list_first_or_null_rcu(&ops->func_stack, struct klp_func,
stack_node);
@ -120,7 +121,6 @@ static void notrace klp_ftrace_handler(unsigned long ip,
klp_arch_set_pc(fregs, (unsigned long)func->new_func);
unlock:
preempt_enable_notrace();
ftrace_test_recursion_unlock(bit);
}

View File

@ -13,7 +13,6 @@
#include "core.h"
#include "patch.h"
#include "transition.h"
#include "../sched/sched.h"
#define MAX_STACK_ENTRIES 100
#define STACK_ERR_BUF_SIZE 128
@ -240,7 +239,7 @@ static int klp_check_stack_func(struct klp_func *func, unsigned long *entries,
* Determine whether it's safe to transition the task to the target patch state
* by looking for any to-be-patched or to-be-unpatched functions on its stack.
*/
static int klp_check_stack(struct task_struct *task, char *err_buf)
static int klp_check_stack(struct task_struct *task, const char **oldname)
{
static unsigned long entries[MAX_STACK_ENTRIES];
struct klp_object *obj;
@ -248,12 +247,8 @@ static int klp_check_stack(struct task_struct *task, char *err_buf)
int ret, nr_entries;
ret = stack_trace_save_tsk_reliable(task, entries, ARRAY_SIZE(entries));
if (ret < 0) {
snprintf(err_buf, STACK_ERR_BUF_SIZE,
"%s: %s:%d has an unreliable stack\n",
__func__, task->comm, task->pid);
return ret;
}
if (ret < 0)
return -EINVAL;
nr_entries = ret;
klp_for_each_object(klp_transition_patch, obj) {
@ -262,11 +257,8 @@ static int klp_check_stack(struct task_struct *task, char *err_buf)
klp_for_each_func(obj, func) {
ret = klp_check_stack_func(func, entries, nr_entries);
if (ret) {
snprintf(err_buf, STACK_ERR_BUF_SIZE,
"%s: %s:%d is sleeping on function %s\n",
__func__, task->comm, task->pid,
func->old_name);
return ret;
*oldname = func->old_name;
return -EADDRINUSE;
}
}
}
@ -274,6 +266,22 @@ static int klp_check_stack(struct task_struct *task, char *err_buf)
return 0;
}
static int klp_check_and_switch_task(struct task_struct *task, void *arg)
{
int ret;
if (task_curr(task) && task != current)
return -EBUSY;
ret = klp_check_stack(task, arg);
if (ret)
return ret;
clear_tsk_thread_flag(task, TIF_PATCH_PENDING);
task->patch_state = klp_target_state;
return 0;
}
/*
* Try to safely switch a task to the target patch state. If it's currently
* running, or it's sleeping on a to-be-patched or to-be-unpatched function, or
@ -281,13 +289,8 @@ static int klp_check_stack(struct task_struct *task, char *err_buf)
*/
static bool klp_try_switch_task(struct task_struct *task)
{
static char err_buf[STACK_ERR_BUF_SIZE];
struct rq *rq;
struct rq_flags flags;
const char *old_name;
int ret;
bool success = false;
err_buf[0] = '\0';
/* check if this task has already switched over */
if (task->patch_state == klp_target_state)
@ -305,36 +308,31 @@ static bool klp_try_switch_task(struct task_struct *task)
* functions. If all goes well, switch the task to the target patch
* state.
*/
rq = task_rq_lock(task, &flags);
ret = task_call_func(task, klp_check_and_switch_task, &old_name);
switch (ret) {
case 0: /* success */
break;
if (task_running(rq, task) && task != current) {
snprintf(err_buf, STACK_ERR_BUF_SIZE,
"%s: %s:%d is running\n", __func__, task->comm,
task->pid);
goto done;
case -EBUSY: /* klp_check_and_switch_task() */
pr_debug("%s: %s:%d is running\n",
__func__, task->comm, task->pid);
break;
case -EINVAL: /* klp_check_and_switch_task() */
pr_debug("%s: %s:%d has an unreliable stack\n",
__func__, task->comm, task->pid);
break;
case -EADDRINUSE: /* klp_check_and_switch_task() */
pr_debug("%s: %s:%d is sleeping on function %s\n",
__func__, task->comm, task->pid, old_name);
break;
default:
pr_debug("%s: Unknown error code (%d) when trying to switch %s:%d\n",
__func__, ret, task->comm, task->pid);
break;
}
ret = klp_check_stack(task, err_buf);
if (ret)
goto done;
success = true;
clear_tsk_thread_flag(task, TIF_PATCH_PENDING);
task->patch_state = klp_target_state;
done:
task_rq_unlock(rq, task, &flags);
/*
* Due to console deadlock issues, pr_debug() can't be used while
* holding the task rq lock. Instead we have to use a temporary buffer
* and print the debug message after releasing the lock.
*/
if (err_buf[0] != '\0')
pr_debug("%s", err_buf);
return success;
return !ret;
}
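klp_check_and_switch_task() runs via task_call_func(), a scheduler helper (assumed available in this tree) that keeps the task's state stable while the callback executes; a minimal hypothetical use of the same pattern:

	/* Hypothetical callback: refuse unless the task is not currently running. */
	static int check_not_running(struct task_struct *t, void *arg)
	{
		return (task_curr(t) && t != current) ? -EBUSY : 0;
	}

	/* ... */
	int err = task_call_func(task, check_not_running, NULL);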
/*
@ -415,8 +413,11 @@ void klp_try_complete_transition(void)
for_each_possible_cpu(cpu) {
task = idle_task(cpu);
if (cpu_online(cpu)) {
if (!klp_try_switch_task(task))
if (!klp_try_switch_task(task)) {
complete = false;
/* Make idle task go through the main loop. */
wake_up_if_idle(cpu);
}
} else if (task->patch_state != klp_target_state) {
/* offline idle tasks can be switched immediately */
clear_tsk_thread_flag(task, TIF_PATCH_PENDING);

View File

@ -788,6 +788,21 @@ static int very_verbose(struct lock_class *class)
* Is this the address of a static object:
*/
#ifdef __KERNEL__
/*
* Check if an address is part of freed initmem. After initmem is freed,
* memory can be allocated from it, and such allocations would then have
* addresses within the range [_stext, _end].
*/
#ifndef arch_is_kernel_initmem_freed
static int arch_is_kernel_initmem_freed(unsigned long addr)
{
if (system_state < SYSTEM_FREEING_INITMEM)
return 0;
return init_section_contains((void *)addr, 1);
}
#endif
static int static_obj(const void *obj)
{
unsigned long start = (unsigned long) &_stext,
@ -803,9 +818,6 @@ static int static_obj(const void *obj)
if ((addr >= start) && (addr < end))
return 1;
if (arch_is_kernel_data(addr))
return 1;
/*
* in-kernel percpu var?
*/
@ -4671,7 +4683,7 @@ print_lock_invalid_wait_context(struct task_struct *curr,
/*
* Verify the wait_type context.
*
* This check validates we takes locks in the right wait-type order; that is it
* This check validates we take locks in the right wait-type order; that is it
* ensures that we do not take mutexes inside spinlocks and do not attempt to
* acquire spinlocks inside raw_spinlocks and the sort.
*
@ -5473,6 +5485,7 @@ static noinstr void check_flags(unsigned long flags)
}
}
#ifndef CONFIG_PREEMPT_RT
/*
* We dont accurately track softirq state in e.g.
* hardirq contexts (such as on 4KSTACKS), so only
@ -5487,6 +5500,7 @@ static noinstr void check_flags(unsigned long flags)
DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
}
}
#endif
if (!debug_locks)
print_irqtrace_events(current);

View File

@ -1022,23 +1022,23 @@ static int __init lock_torture_init(void)
if (onoff_interval > 0) {
firsterr = torture_onoff_init(onoff_holdoff * HZ,
onoff_interval * HZ, NULL);
if (firsterr)
if (torture_init_error(firsterr))
goto unwind;
}
if (shuffle_interval > 0) {
firsterr = torture_shuffle_init(shuffle_interval);
if (firsterr)
if (torture_init_error(firsterr))
goto unwind;
}
if (shutdown_secs > 0) {
firsterr = torture_shutdown_init(shutdown_secs,
lock_torture_cleanup);
if (firsterr)
if (torture_init_error(firsterr))
goto unwind;
}
if (stutter > 0) {
firsterr = torture_stutter_init(stutter, stutter);
if (firsterr)
if (torture_init_error(firsterr))
goto unwind;
}
@ -1047,7 +1047,7 @@ static int __init lock_torture_init(void)
sizeof(writer_tasks[0]),
GFP_KERNEL);
if (writer_tasks == NULL) {
VERBOSE_TOROUT_ERRSTRING("writer_tasks: Out of memory");
TOROUT_ERRSTRING("writer_tasks: Out of memory");
firsterr = -ENOMEM;
goto unwind;
}
@ -1058,7 +1058,7 @@ static int __init lock_torture_init(void)
sizeof(reader_tasks[0]),
GFP_KERNEL);
if (reader_tasks == NULL) {
VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory");
TOROUT_ERRSTRING("reader_tasks: Out of memory");
kfree(writer_tasks);
writer_tasks = NULL;
firsterr = -ENOMEM;
@ -1082,7 +1082,7 @@ static int __init lock_torture_init(void)
/* Create writer. */
firsterr = torture_create_kthread(lock_torture_writer, &cxt.lwsa[i],
writer_tasks[i]);
if (firsterr)
if (torture_init_error(firsterr))
goto unwind;
create_reader:
@ -1091,13 +1091,13 @@ static int __init lock_torture_init(void)
/* Create reader. */
firsterr = torture_create_kthread(lock_torture_reader, &cxt.lrsa[j],
reader_tasks[j]);
if (firsterr)
if (torture_init_error(firsterr))
goto unwind;
}
if (stat_interval > 0) {
firsterr = torture_create_kthread(lock_torture_stats, NULL,
stats_task);
if (firsterr)
if (torture_init_error(firsterr))
goto unwind;
}
torture_init_end();

View File

@ -94,6 +94,9 @@ static inline unsigned long __owner_flags(unsigned long owner)
return owner & MUTEX_FLAGS;
}
/*
* Returns: __mutex_owner(lock) on failure or NULL on success.
*/
static inline struct task_struct *__mutex_trylock_common(struct mutex *lock, bool handoff)
{
unsigned long owner, curr = (unsigned long)current;
@ -348,21 +351,23 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner,
{
bool ret = true;
rcu_read_lock();
lockdep_assert_preemption_disabled();
while (__mutex_owner(lock) == owner) {
/*
* Ensure we emit the owner->on_cpu, dereference _after_
* checking lock->owner still matches owner. If that fails,
* owner might point to freed memory. If it still matches,
* the rcu_read_lock() ensures the memory stays valid.
* checking lock->owner still matches owner. And we already
* disabled preemption which is equal to the RCU read-side
* critical section in optimistic spinning code. Thus the
* task_struct structure won't go away during the spinning
* period.
*/
barrier();
/*
* Use vcpu_is_preempted to detect lock holder preemption issue.
*/
if (!owner->on_cpu || need_resched() ||
vcpu_is_preempted(task_cpu(owner))) {
if (!owner_on_cpu(owner) || need_resched()) {
ret = false;
break;
}
@ -374,7 +379,6 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner,
cpu_relax();
}
rcu_read_unlock();
return ret;
}
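Both spinning paths now rely on an owner_on_cpu() helper, assumed to come from the same series; it is roughly equivalent to the pair of checks it replaces:

	/* Sketch of the assumed helper: keep spinning only while the owner runs
	 * on a CPU whose (virtual) CPU has not itself been preempted.
	 */
	static inline bool owner_on_cpu(struct task_struct *owner)
	{
		return owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
	}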
@ -387,19 +391,19 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock)
struct task_struct *owner;
int retval = 1;
lockdep_assert_preemption_disabled();
if (need_resched())
return 0;
rcu_read_lock();
owner = __mutex_owner(lock);
/*
* As lock holder preemption issue, we both skip spinning if task is not
* on cpu or its cpu is preempted
* We already disabled preemption which is equal to the RCU read-side
* critical section in optimistic spinning code. Thus the task_struct
* structure won't go away during the spinning period.
*/
owner = __mutex_owner(lock);
if (owner)
retval = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner));
rcu_read_unlock();
retval = owner_on_cpu(owner);
/*
* If lock->owner is not set, the mutex has been released. Return true
@ -736,6 +740,44 @@ __ww_mutex_lock(struct mutex *lock, unsigned int state, unsigned int subclass,
return __mutex_lock_common(lock, state, subclass, NULL, ip, ww_ctx, true);
}
/**
* ww_mutex_trylock - tries to acquire the w/w mutex with optional acquire context
* @ww: mutex to lock
* @ww_ctx: optional w/w acquire context
*
* Trylocks a mutex with the optional acquire context; no deadlock detection is
* possible. Returns 1 if the mutex has been acquired successfully, 0 otherwise.
*
* Unlike ww_mutex_lock, no deadlock handling is performed. However, if @ww_ctx is
* specified, -EALREADY handling may happen in calls to ww_mutex_trylock.
*
* A mutex acquired with this function must be released with ww_mutex_unlock.
*/
int ww_mutex_trylock(struct ww_mutex *ww, struct ww_acquire_ctx *ww_ctx)
{
if (!ww_ctx)
return mutex_trylock(&ww->base);
MUTEX_WARN_ON(ww->base.magic != &ww->base);
/*
* Reset the wounded flag after a kill. No other process can
* race and wound us here, since they can't have a valid owner
* pointer if we don't have any locks held.
*/
if (ww_ctx->acquired == 0)
ww_ctx->wounded = 0;
if (__mutex_trylock(&ww->base)) {
ww_mutex_set_context_fastpath(ww, ww_ctx);
mutex_acquire_nest(&ww->base.dep_map, 0, 1, &ww_ctx->dep_map, _RET_IP_);
return 1;
}
return 0;
}
EXPORT_SYMBOL(ww_mutex_trylock);
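A hypothetical caller that opportunistically takes one lock of a set under an acquire context might look like:

	static int try_take(struct ww_mutex *m, struct ww_acquire_ctx *ctx)
	{
		if (!ww_mutex_trylock(m, ctx))
			return -EBUSY;		/* contended; caller may back off and retry */

		/* ... touch the protected data ... */

		ww_mutex_unlock(m);
		return 0;
	}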
#ifdef CONFIG_DEBUG_LOCK_ALLOC
void __sched
mutex_lock_nested(struct mutex *lock, unsigned int subclass)

View File

@ -446,17 +446,24 @@ static __always_inline void rt_mutex_adjust_prio(struct task_struct *p)
}
/* RT mutex specific wake_q wrappers */
static __always_inline void rt_mutex_wake_q_add_task(struct rt_wake_q_head *wqh,
struct task_struct *task,
unsigned int wake_state)
{
if (IS_ENABLED(CONFIG_PREEMPT_RT) && wake_state == TASK_RTLOCK_WAIT) {
if (IS_ENABLED(CONFIG_PROVE_LOCKING))
WARN_ON_ONCE(wqh->rtlock_task);
get_task_struct(task);
wqh->rtlock_task = task;
} else {
wake_q_add(&wqh->head, task);
}
}
static __always_inline void rt_mutex_wake_q_add(struct rt_wake_q_head *wqh,
struct rt_mutex_waiter *w)
{
if (IS_ENABLED(CONFIG_PREEMPT_RT) && w->wake_state != TASK_NORMAL) {
if (IS_ENABLED(CONFIG_PROVE_LOCKING))
WARN_ON_ONCE(wqh->rtlock_task);
get_task_struct(w->task);
wqh->rtlock_task = w->task;
} else {
wake_q_add(&wqh->head, w->task);
}
rt_mutex_wake_q_add_task(wqh, w->task, w->wake_state);
}
static __always_inline void rt_mutex_wake_up_q(struct rt_wake_q_head *wqh)
@ -1096,8 +1103,11 @@ static int __sched task_blocks_on_rt_mutex(struct rt_mutex_base *lock,
* the other will detect the deadlock and return -EDEADLOCK,
* which is wrong, as the other waiter is not in a deadlock
* situation.
*
* Except for ww_mutex, in that case the chain walk must already deal
* with spurious cycles, see the comments at [3] and [6].
*/
if (owner == task)
if (owner == task && !(build_ww_mutex() && ww_ctx))
return -EDEADLK;
raw_spin_lock(&task->pi_lock);
@ -1372,9 +1382,8 @@ static bool rtmutex_spin_on_owner(struct rt_mutex_base *lock,
* for CONFIG_PREEMPT_RCU=y)
* - the VCPU on which owner runs is preempted
*/
if (!owner->on_cpu || need_resched() ||
!rt_mutex_waiter_is_top_waiter(lock, waiter) ||
vcpu_is_preempted(task_cpu(owner))) {
if (!owner_on_cpu(owner) || need_resched() ||
!rt_mutex_waiter_is_top_waiter(lock, waiter)) {
res = false;
break;
}

View File

@ -21,12 +21,13 @@ int max_lock_depth = 1024;
*/
static __always_inline int __rt_mutex_lock_common(struct rt_mutex *lock,
unsigned int state,
struct lockdep_map *nest_lock,
unsigned int subclass)
{
int ret;
might_sleep();
mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, _RET_IP_);
ret = __rt_mutex_lock(&lock->rtmutex, state);
if (ret)
mutex_release(&lock->dep_map, _RET_IP_);
@ -48,10 +49,16 @@ EXPORT_SYMBOL(rt_mutex_base_init);
*/
void __sched rt_mutex_lock_nested(struct rt_mutex *lock, unsigned int subclass)
{
__rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass);
__rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, subclass);
}
EXPORT_SYMBOL_GPL(rt_mutex_lock_nested);
void __sched _rt_mutex_lock_nest_lock(struct rt_mutex *lock, struct lockdep_map *nest_lock)
{
__rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, nest_lock, 0);
}
EXPORT_SYMBOL_GPL(_rt_mutex_lock_nest_lock);
#else /* !CONFIG_DEBUG_LOCK_ALLOC */
/**
@ -61,7 +68,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock_nested);
*/
void __sched rt_mutex_lock(struct rt_mutex *lock)
{
__rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0);
__rt_mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, NULL, 0);
}
EXPORT_SYMBOL_GPL(rt_mutex_lock);
#endif
@ -77,10 +84,25 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock);
*/
int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
{
return __rt_mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0);
return __rt_mutex_lock_common(lock, TASK_INTERRUPTIBLE, NULL, 0);
}
EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
/**
* rt_mutex_lock_killable - lock a rt_mutex killable
*
* @lock: the rt_mutex to be locked
*
* Returns:
* 0 on success
* -EINTR when interrupted by a signal
*/
int __sched rt_mutex_lock_killable(struct rt_mutex *lock)
{
return __rt_mutex_lock_common(lock, TASK_KILLABLE, NULL, 0);
}
EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
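Hypothetical use (my_rtmutex is a placeholder), giving up cleanly when the task is killed while waiting:

	if (rt_mutex_lock_killable(&my_rtmutex))
		return -EINTR;		/* fatal signal received while blocked */

	/* ... critical section ... */
	rt_mutex_unlock(&my_rtmutex);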
/**
* rt_mutex_trylock - try to lock a rt_mutex
*

Some files were not shown because too many files have changed in this diff.