mirror of https://github.com/Qortal/Brooklyn.git synced 2025-01-30 14:52:17 +00:00
Raziel K. Crowe 2022-04-02 17:10:59 +05:00
parent c06278f256
commit 6ca71e00d3
689 changed files with 29799 additions and 21404 deletions

View File

@ -14,6 +14,8 @@ hostprogs += mktables
ifeq ($(CONFIG_ALTIVEC),y)
altivec_flags := -maltivec $(call cc-option,-mabi=altivec)
# Enable <altivec.h>
altivec_flags += -isystem $(shell $(CC) -print-file-name=include)
ifdef CONFIG_CC_IS_CLANG
# clang ppc port does not yet support -maltivec when -msoft-float is
@ -34,6 +36,8 @@ endif
# ARM/NEON intrinsics in a non C99-compliant environment (such as the kernel)
ifeq ($(CONFIG_KERNEL_MODE_NEON),y)
NEON_FLAGS := -ffreestanding
# Enable <arm_neon.h>
NEON_FLAGS += -isystem $(shell $(CC) -print-file-name=include)
ifeq ($(ARCH),arm)
NEON_FLAGS += -march=armv7-a -mfloat-abi=softfp -mfpu=neon
endif
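For context: kernel objects are normally compiled with -nostdinc, so the compiler's private header directory is not searched by default. "$(CC) -print-file-name=include" prints that directory (for GCC something like /usr/lib/gcc/<target>/<version>/include; the exact path depends on the toolchain), and adding it back with -isystem is what makes <altivec.h> and <arm_neon.h> includable here.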

View File

@ -109,6 +109,13 @@ config NUMA_KEEP_MEMINFO
config MEMORY_ISOLATION
bool
# IORESOURCE_SYSTEM_RAM regions in the kernel resource tree that are marked
# IORESOURCE_EXCLUSIVE cannot be mapped to user space, for example, via
# /dev/mem.
config EXCLUSIVE_SYSTEM_RAM
def_bool y
depends on !DEVMEM || STRICT_DEVMEM
#
# Only to be set on architectures that have completely implemented the memory
# hotplug feature. If you are not sure, don't touch it.
@ -123,15 +130,11 @@ config ARCH_ENABLE_MEMORY_HOTPLUG
config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
select MEMORY_ISOLATION
depends on SPARSEMEM || X86_64_ACPI_NUMA
depends on SPARSEMEM
depends on ARCH_ENABLE_MEMORY_HOTPLUG
depends on 64BIT || BROKEN
depends on 64BIT
select NUMA_KEEP_MEMINFO if NUMA
config MEMORY_HOTPLUG_SPARSE
def_bool y
depends on SPARSEMEM && MEMORY_HOTPLUG
config MEMORY_HOTPLUG_DEFAULT_ONLINE
bool "Online the newly added memory blocks by default"
depends on MEMORY_HOTPLUG
@ -371,7 +374,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
config TRANSPARENT_HUGEPAGE
bool "Transparent Hugepage Support"
depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT
select COMPACTION
select XARRAY_MULTI
help
@ -425,47 +428,24 @@ config THP_SWAP
# UP and nommu archs use km based percpu allocator
#
config NEED_PER_CPU_KM
depends on !SMP
depends on !SMP || !MMU
bool
default y
config CLEANCACHE
bool "Enable cleancache driver to cache clean pages if tmem is present"
help
Cleancache can be thought of as a page-granularity victim cache
for clean pages that the kernel's pageframe replacement algorithm
(PFRA) would like to keep around, but can't since there isn't enough
memory. So when the PFRA "evicts" a page, it first attempts to use
cleancache code to put the data contained in that page into
"transcendent memory", memory that is not directly accessible or
addressable by the kernel and is of unknown and possibly
time-varying size. And when a cleancache-enabled
filesystem wishes to access a page in a file on disk, it first
checks cleancache to see if it already contains it; if it does,
the page is copied into the kernel and a disk access is avoided.
When a transcendent memory driver is available (such as zcache or
Xen transcendent memory), a significant I/O reduction
may be achieved. When none is available, all cleancache calls
are reduced to a single pointer-compare-against-NULL resulting
in a negligible performance hit.
config NEED_PER_CPU_EMBED_FIRST_CHUNK
bool
If unsure, say Y to enable cleancache
config NEED_PER_CPU_PAGE_FIRST_CHUNK
bool
config USE_PERCPU_NUMA_NODE_ID
bool
config HAVE_SETUP_PER_CPU_AREA
bool
config FRONTSWAP
bool "Enable frontswap to cache swap pages if tmem is present"
depends on SWAP
help
Frontswap is so named because it can be thought of as the opposite
of a "backing" store for a swap device. The data is stored into
"transcendent memory", memory that is not directly accessible or
addressable by the kernel and is of unknown and possibly
time-varying size. When space in transcendent memory is available,
a significant swap I/O reduction may be achieved. When none is
available, all frontswap calls are reduced to a single pointer-
compare-against-NULL resulting in a negligible performance hit
and swap data is stored as normal on the matching swap device.
If unsure, say Y to enable frontswap.
bool
config CMA
bool "Contiguous Memory Allocator"
@ -530,7 +510,8 @@ config MEM_SOFT_DIRTY
config ZSWAP
bool "Compressed cache for swap pages (EXPERIMENTAL)"
depends on FRONTSWAP && CRYPTO=y
depends on SWAP && CRYPTO=y
select FRONTSWAP
select ZPOOL
help
A lightweight compressed cache for swap pages. It takes
@ -897,6 +878,20 @@ config IO_MAPPING
config SECRETMEM
def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED
config ANON_VMA_NAME
bool "Anonymous VMA name support"
depends on PROC_FS && ADVISE_SYSCALLS && MMU
help
Allow naming anonymous virtual memory areas.
This feature allows assigning names to virtual memory areas. Assigned
names can be later retrieved from /proc/pid/maps and /proc/pid/smaps
and help identify individual anonymous memory areas.
Assigning a name to an anonymous virtual memory area might prevent that
area from being merged with adjacent virtual memory areas due to the
difference in their names.
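As a userspace usage sketch for this option: in mainline kernels the naming is driven via prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, ...); the snippet below assumes that interface, and the fallback #defines are only for libcs whose headers do not yet carry the constants.
#include <stdio.h>
#include <sys/mman.h>
#include <sys/prctl.h>
#ifndef PR_SET_VMA
#define PR_SET_VMA 0x53564d41
#define PR_SET_VMA_ANON_NAME 0
#endif
int main(void)
{
	size_t len = 2 * 1024 * 1024;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return 1;
	/* The area then appears as "[anon:my_heap]" in /proc/self/maps. */
	if (prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
		  (unsigned long)p, len, (unsigned long)"my_heap"))
		perror("prctl(PR_SET_VMA)");
	return 0;
}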
source "mm/damon/Kconfig"
endmenu

View File

@ -62,6 +62,30 @@ config PAGE_OWNER
If unsure, say N.
config PAGE_TABLE_CHECK
bool "Check for invalid mappings in user page tables"
depends on ARCH_SUPPORTS_PAGE_TABLE_CHECK
select PAGE_EXTENSION
help
Check that an anonymous page is not mapped twice with read-write
permissions, and that anonymous and file pages are not erroneously
shared. Since the checking is performed when entries are added to and
removed from user page tables, leaking, corruption and double-mapping
problems are detected synchronously.
If unsure, say "n".
config PAGE_TABLE_CHECK_ENFORCED
bool "Enforce the page table checking by default"
depends on PAGE_TABLE_CHECK
help
Always enable page table checking. By default, page table checking is
disabled and can be optionally enabled via the page_table_check=on
kernel parameter. This option enforces that page table checking is
always enabled.
If unsure, say "n".
config PAGE_POISONING
bool "Poison pages after freeing"
help

View File

@ -15,6 +15,8 @@ KCSAN_SANITIZE_slab_common.o := n
KCSAN_SANITIZE_slab.o := n
KCSAN_SANITIZE_slub.o := n
KCSAN_SANITIZE_page_alloc.o := n
# But enable explicit instrumentation for memory barriers.
KCSAN_INSTRUMENT_BARRIERS := y
# These files are disabled because they produce non-interesting and/or
# flaky coverage that is not a function of syscall inputs. E.g. slab is out of
@ -46,7 +48,7 @@ mmu-$(CONFIG_MMU) += process_vm_access.o
endif
obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
maccess.o page-writeback.o \
maccess.o page-writeback.o folio-compat.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o percpu.o slab_common.o \
@ -102,7 +104,6 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o
obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o
obj-$(CONFIG_PAGE_OWNER) += page_owner.o
obj-$(CONFIG_CLEANCACHE) += cleancache.o
obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
obj-$(CONFIG_ZPOOL) += zpool.o
obj-$(CONFIG_ZBUD) += zbud.o
@ -112,6 +113,7 @@ obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
obj-$(CONFIG_CMA) += cma.o
obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
obj-$(CONFIG_PAGE_TABLE_CHECK) += page_table_check.o
obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
obj-$(CONFIG_SECRETMEM) += secretmem.o
obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o

View File

@ -2,8 +2,9 @@
#include <linux/wait.h>
#include <linux/rbtree.h>
#include <linux/backing-dev.h>
#include <linux/kthread.h>
#include <linux/backing-dev.h>
#include <linux/blk-cgroup.h>
#include <linux/freezer.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
@ -291,8 +292,6 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
memset(wb, 0, sizeof(*wb));
if (wb != &bdi->wb)
bdi_get(bdi);
wb->bdi = bdi;
wb->last_old_flush = jiffies;
INIT_LIST_HEAD(&wb->b_dirty);
@ -316,7 +315,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
err = fprop_local_init_percpu(&wb->completions, gfp);
if (err)
goto out_put_bdi;
return err;
for (i = 0; i < NR_WB_STAT_ITEMS; i++) {
err = percpu_counter_init(&wb->stat[i], 0, gfp);
@ -330,9 +329,6 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
while (i--)
percpu_counter_destroy(&wb->stat[i]);
fprop_local_destroy_percpu(&wb->completions);
out_put_bdi:
if (wb != &bdi->wb)
bdi_put(bdi);
return err;
}
@ -373,8 +369,6 @@ static void wb_exit(struct bdi_writeback *wb)
percpu_counter_destroy(&wb->stat[i]);
fprop_local_destroy_percpu(&wb->completions);
if (wb != &wb->bdi->wb)
bdi_put(wb->bdi);
}
#ifdef CONFIG_CGROUP_WRITEBACK
@ -397,6 +391,7 @@ static void cgwb_release_workfn(struct work_struct *work)
struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
release_work);
struct blkcg *blkcg = css_to_blkcg(wb->blkcg_css);
struct backing_dev_info *bdi = wb->bdi;
mutex_lock(&wb->bdi->cgwb_release_mutex);
wb_shutdown(wb);
@ -416,6 +411,7 @@ static void cgwb_release_workfn(struct work_struct *work)
percpu_ref_exit(&wb->refcnt);
wb_exit(wb);
bdi_put(bdi);
WARN_ON_ONCE(!list_empty(&wb->b_attached));
kfree_rcu(wb, rcu);
}
@ -497,6 +493,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
INIT_LIST_HEAD(&wb->b_attached);
INIT_WORK(&wb->release_work, cgwb_release_workfn);
set_bit(WB_registered, &wb->state);
bdi_get(bdi);
/*
* The root wb determines the registered state of the whole bdi and
@ -528,6 +525,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
goto out_put;
err_fprop_exit:
bdi_put(bdi);
fprop_local_destroy_percpu(&wb->memcg_completions);
err_ref_exit:
percpu_ref_exit(&wb->refcnt);
@ -965,14 +963,14 @@ void bdi_unregister(struct backing_dev_info *bdi)
bdi->owner = NULL;
}
}
EXPORT_SYMBOL(bdi_unregister);
static void release_bdi(struct kref *ref)
{
struct backing_dev_info *bdi =
container_of(ref, struct backing_dev_info, refcnt);
if (test_bit(WB_registered, &bdi->wb.state))
bdi_unregister(bdi);
WARN_ON_ONCE(test_bit(WB_registered, &bdi->wb.state));
WARN_ON_ONCE(bdi->dev);
wb_exit(&bdi->wb);
kfree(bdi);
@ -984,6 +982,22 @@ void bdi_put(struct backing_dev_info *bdi)
}
EXPORT_SYMBOL(bdi_put);
struct backing_dev_info *inode_to_bdi(struct inode *inode)
{
struct super_block *sb;
if (!inode)
return &noop_backing_dev_info;
sb = inode->i_sb;
#ifdef CONFIG_BLOCK
if (sb_is_blkdev_sb(sb))
return I_BDEV(inode)->bd_disk->bdi;
#endif
return sb->s_bdi;
}
EXPORT_SYMBOL(inode_to_bdi);
const char *bdi_dev_name(struct backing_dev_info *bdi)
{
if (!bdi || !bdi->dev)
@ -1048,51 +1062,3 @@ long congestion_wait(int sync, long timeout)
return ret;
}
EXPORT_SYMBOL(congestion_wait);
/**
* wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a pgdat to complete writes
* @sync: SYNC or ASYNC IO
* @timeout: timeout in jiffies
*
* In the event of a congested backing_dev (any backing_dev) this waits
* for up to @timeout jiffies for either a BDI to exit congestion of the
* given @sync queue or a write to complete.
*
* The return value is 0 if the sleep is for the full timeout. Otherwise,
* it is the number of jiffies that were still remaining when the function
* returned. return_value == timeout implies the function did not sleep.
*/
long wait_iff_congested(int sync, long timeout)
{
long ret;
unsigned long start = jiffies;
DEFINE_WAIT(wait);
wait_queue_head_t *wqh = &congestion_wqh[sync];
/*
* If there is no congestion, yield if necessary instead
* of sleeping on the congestion queue
*/
if (atomic_read(&nr_wb_congested[sync]) == 0) {
cond_resched();
/* In case we scheduled, work out time remaining */
ret = timeout - (jiffies - start);
if (ret < 0)
ret = 0;
goto out;
}
/* Sleep until uncongested or a write happens */
prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
ret = io_schedule_timeout(timeout);
finish_wait(wqh, &wait);
out:
trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
jiffies_to_usecs(jiffies - start));
return ret;
}
EXPORT_SYMBOL(wait_iff_congested);

View File

@ -15,7 +15,7 @@
void get_page_bootmem(unsigned long info, struct page *page, unsigned long type)
{
page->freelist = (void *)type;
page->index = type;
SetPagePrivate(page);
set_page_private(page, info);
page_ref_inc(page);
@ -23,14 +23,13 @@ void get_page_bootmem(unsigned long info, struct page *page, unsigned long type)
void put_page_bootmem(struct page *page)
{
unsigned long type;
unsigned long type = page->index;
type = (unsigned long) page->freelist;
BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
if (page_ref_dec_return(page) == 1) {
page->freelist = NULL;
page->index = 0;
ClearPagePrivate(page);
set_page_private(page, 0);
INIT_LIST_HEAD(&page->lru);

View File

@ -378,7 +378,7 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
return 0;
free_mem:
memblock_free(base, size);
memblock_phys_free(base, size);
err:
pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M);
return ret;
@ -524,6 +524,25 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
return page;
}
bool cma_pages_valid(struct cma *cma, const struct page *pages,
unsigned long count)
{
unsigned long pfn;
if (!cma || !pages)
return false;
pfn = page_to_pfn(pages);
if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count) {
pr_debug("%s(page %p, count %lu)\n", __func__,
(void *)pages, count);
return false;
}
return true;
}
/**
* cma_release() - release allocated pages
* @cma: Contiguous memory region for which the allocation is performed.
@ -539,16 +558,13 @@ bool cma_release(struct cma *cma, const struct page *pages,
{
unsigned long pfn;
if (!cma || !pages)
if (!cma_pages_valid(cma, pages, count))
return false;
pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count);
pfn = page_to_pfn(pages);
if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count)
return false;
VM_BUG_ON(pfn + count > cma->base_pfn + cma->count);
free_contig_range(pfn, count);

View File

@ -761,6 +761,8 @@ isolate_freepages_range(struct compact_control *cc,
/* Similar to reclaim, but different enough that they don't share logic */
static bool too_many_isolated(pg_data_t *pgdat)
{
bool too_many;
unsigned long active, inactive, isolated;
inactive = node_page_state(pgdat, NR_INACTIVE_FILE) +
@ -770,7 +772,11 @@ static bool too_many_isolated(pg_data_t *pgdat)
isolated = node_page_state(pgdat, NR_ISOLATED_FILE) +
node_page_state(pgdat, NR_ISOLATED_ANON);
return isolated > (inactive + active) / 2;
too_many = isolated > (inactive + active) / 2;
if (!too_many)
wake_throttle_isolated(pgdat);
return too_many;
}
/**
@ -822,7 +828,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (cc->mode == MIGRATE_ASYNC)
return -EAGAIN;
congestion_wait(BLK_RW_ASYNC, HZ/10);
reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED);
if (fatal_signal_pending(current))
return -EINTR;
@ -1022,7 +1028,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (!TestClearPageLRU(page))
goto isolate_fail_put;
lruvec = mem_cgroup_page_lruvec(page);
lruvec = folio_lruvec(page_folio(page));
/* If we already hold the lock, we can skip some rechecking */
if (lruvec != locked) {
@ -1032,7 +1038,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
compact_lock_irqsave(&lruvec->lru_lock, &flags, cc);
locked = lruvec;
lruvec_memcg_debug(lruvec, page);
lruvec_memcg_debug(lruvec, page_folio(page));
/* Try get exclusive access under lock */
if (!skip_updated) {
@ -2274,6 +2280,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
unsigned long last_migrated_pfn;
const bool sync = cc->mode != MIGRATE_ASYNC;
bool update_cached;
unsigned int nr_succeeded = 0;
/*
* These counters track activities during zone compaction. Initialize
@ -2392,10 +2399,10 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
err = migrate_pages(&cc->migratepages, compaction_alloc,
compaction_free, (unsigned long)cc, cc->mode,
MR_COMPACTION, NULL);
MR_COMPACTION, &nr_succeeded);
trace_mm_compaction_migratepages(cc->nr_migratepages, err,
&cc->migratepages);
trace_mm_compaction_migratepages(cc->nr_migratepages,
nr_succeeded);
/* All pages were either migrated or will be released */
cc->nr_migratepages = 0;

View File

@ -30,7 +30,15 @@ config DAMON_VADDR
select PAGE_IDLE_FLAG
help
This builds the default data access monitoring primitives for DAMON
that works for virtual address spaces.
that work for virtual address spaces.
config DAMON_PADDR
bool "Data access monitoring primitives for the physical address space"
depends on DAMON && MMU
select PAGE_IDLE_FLAG
help
This builds the default data access monitoring primitives for DAMON
that work for the physical address space.
config DAMON_VADDR_KUNIT_TEST
bool "Test for DAMON primitives" if !KUNIT_ALL_TESTS
@ -46,7 +54,7 @@ config DAMON_VADDR_KUNIT_TEST
config DAMON_DBGFS
bool "DAMON debugfs interface"
depends on DAMON_VADDR && DEBUG_FS
depends on DAMON_VADDR && DAMON_PADDR && DEBUG_FS
help
This builds the debugfs interface for DAMON. The user space admins
can use the interface for arbitrary data access monitoring.
@ -65,4 +73,16 @@ config DAMON_DBGFS_KUNIT_TEST
If unsure, say N.
config DAMON_RECLAIM
bool "Build DAMON-based reclaim (DAMON_RECLAIM)"
depends on DAMON_PADDR
help
This builds the DAMON-based reclamation subsystem. It finds pages
that have not been accessed for a long time (cold) using DAMON and
reclaims them.
This is suggested as a proactive and lightweight reclamation mechanism
under light memory pressure, while the traditional page scanning-based
reclamation is used for heavy pressure.
endmenu

View File

@ -1,5 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
obj-$(CONFIG_DAMON) := core.o
obj-$(CONFIG_DAMON_VADDR) += vaddr.o
obj-$(CONFIG_DAMON_VADDR) += prmtv-common.o vaddr.o
obj-$(CONFIG_DAMON_PADDR) += prmtv-common.o paddr.o
obj-$(CONFIG_DAMON_DBGFS) += dbgfs.o
obj-$(CONFIG_DAMON_RECLAIM) += reclaim.o

View File

@ -10,8 +10,9 @@
#include <linux/damon.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/random.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/string.h>
#define CREATE_TRACE_POINTS
#include <trace/events/damon.h>
@ -21,9 +22,6 @@
#define DAMON_MIN_REGION 1
#endif
/* Get a random number in [l, r) */
#define damon_rand(l, r) (l + prandom_u32_max(r - l))
static DEFINE_MUTEX(damon_lock);
static int nr_running_ctxs;
@ -45,18 +43,10 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end)
region->nr_accesses = 0;
INIT_LIST_HEAD(&region->list);
return region;
}
region->age = 0;
region->last_nr_accesses = 0;
/*
* Add a region between two other regions
*/
inline void damon_insert_region(struct damon_region *r,
struct damon_region *prev, struct damon_region *next,
struct damon_target *t)
{
__list_add(&r->list, &prev->list, &next->list);
t->nr_regions++;
return region;
}
void damon_add_region(struct damon_region *r, struct damon_target *t)
@ -82,6 +72,73 @@ void damon_destroy_region(struct damon_region *r, struct damon_target *t)
damon_free_region(r);
}
struct damos *damon_new_scheme(
unsigned long min_sz_region, unsigned long max_sz_region,
unsigned int min_nr_accesses, unsigned int max_nr_accesses,
unsigned int min_age_region, unsigned int max_age_region,
enum damos_action action, struct damos_quota *quota,
struct damos_watermarks *wmarks)
{
struct damos *scheme;
scheme = kmalloc(sizeof(*scheme), GFP_KERNEL);
if (!scheme)
return NULL;
scheme->min_sz_region = min_sz_region;
scheme->max_sz_region = max_sz_region;
scheme->min_nr_accesses = min_nr_accesses;
scheme->max_nr_accesses = max_nr_accesses;
scheme->min_age_region = min_age_region;
scheme->max_age_region = max_age_region;
scheme->action = action;
scheme->stat = (struct damos_stat){};
INIT_LIST_HEAD(&scheme->list);
scheme->quota.ms = quota->ms;
scheme->quota.sz = quota->sz;
scheme->quota.reset_interval = quota->reset_interval;
scheme->quota.weight_sz = quota->weight_sz;
scheme->quota.weight_nr_accesses = quota->weight_nr_accesses;
scheme->quota.weight_age = quota->weight_age;
scheme->quota.total_charged_sz = 0;
scheme->quota.total_charged_ns = 0;
scheme->quota.esz = 0;
scheme->quota.charged_sz = 0;
scheme->quota.charged_from = 0;
scheme->quota.charge_target_from = NULL;
scheme->quota.charge_addr_from = 0;
scheme->wmarks.metric = wmarks->metric;
scheme->wmarks.interval = wmarks->interval;
scheme->wmarks.high = wmarks->high;
scheme->wmarks.mid = wmarks->mid;
scheme->wmarks.low = wmarks->low;
scheme->wmarks.activated = true;
return scheme;
}
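For illustration, a hedged sketch of how a caller might use this constructor together with damon_add_scheme(); the helper itself is hypothetical and every threshold and quota value is made up, while DAMOS_PAGEOUT and DAMOS_WMARK_NONE are the action and watermark metric referenced elsewhere in this commit.
/* Hypothetical helper: page out regions of any size that saw zero
 * accesses for 50..1000 aggregation intervals, spending at most 10ms of
 * scheme work per 1s window, with no watermark gating. */
static int example_add_pageout_scheme(struct damon_ctx *ctx)
{
	struct damos_quota quota = {
		.ms = 10,
		.sz = 0,
		.reset_interval = 1000,
		.weight_sz = 1,
		.weight_nr_accesses = 1,
		.weight_age = 1,
	};
	struct damos_watermarks wmarks = {
		.metric = DAMOS_WMARK_NONE,
	};
	struct damos *scheme;

	scheme = damon_new_scheme(PAGE_SIZE, ULONG_MAX, 0, 0, 50, 1000,
				  DAMOS_PAGEOUT, &quota, &wmarks);
	if (!scheme)
		return -ENOMEM;
	damon_add_scheme(ctx, scheme);
	return 0;
}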
void damon_add_scheme(struct damon_ctx *ctx, struct damos *s)
{
list_add_tail(&s->list, &ctx->schemes);
}
static void damon_del_scheme(struct damos *s)
{
list_del(&s->list);
}
static void damon_free_scheme(struct damos *s)
{
kfree(s);
}
void damon_destroy_scheme(struct damos *s)
{
damon_del_scheme(s);
damon_free_scheme(s);
}
/*
* Construct a damon_target struct
*
@ -107,6 +164,11 @@ void damon_add_target(struct damon_ctx *ctx, struct damon_target *t)
list_add_tail(&t->list, &ctx->adaptive_targets);
}
bool damon_targets_empty(struct damon_ctx *ctx)
{
return list_empty(&ctx->adaptive_targets);
}
static void damon_del_target(struct damon_target *t)
{
list_del(&t->list);
@ -153,6 +215,7 @@ struct damon_ctx *damon_new_ctx(void)
ctx->max_nr_regions = 1000;
INIT_LIST_HEAD(&ctx->adaptive_targets);
INIT_LIST_HEAD(&ctx->schemes);
return ctx;
}
@ -172,7 +235,13 @@ static void damon_destroy_targets(struct damon_ctx *ctx)
void damon_destroy_ctx(struct damon_ctx *ctx)
{
struct damos *s, *next_s;
damon_destroy_targets(ctx);
damon_for_each_scheme_safe(s, next_s, ctx)
damon_destroy_scheme(s);
kfree(ctx);
}
@ -197,7 +266,6 @@ int damon_set_targets(struct damon_ctx *ctx,
for (i = 0; i < nr_ids; i++) {
t = damon_new_target(ids[i]);
if (!t) {
pr_err("Failed to alloc damon_target\n");
/* The caller should do cleanup of the ids itself */
damon_for_each_target_safe(t, next, ctx)
damon_destroy_target(t);
@ -227,16 +295,10 @@ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int,
unsigned long aggr_int, unsigned long primitive_upd_int,
unsigned long min_nr_reg, unsigned long max_nr_reg)
{
if (min_nr_reg < 3) {
pr_err("min_nr_regions (%lu) must be at least 3\n",
min_nr_reg);
if (min_nr_reg < 3)
return -EINVAL;
}
if (min_nr_reg > max_nr_reg) {
pr_err("invalid nr_regions. min (%lu) > max (%lu)\n",
min_nr_reg, max_nr_reg);
if (min_nr_reg > max_nr_reg)
return -EINVAL;
}
ctx->sample_interval = sample_int;
ctx->aggr_interval = aggr_int;
@ -247,6 +309,30 @@ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int,
return 0;
}
/**
* damon_set_schemes() - Set data access monitoring based operation schemes.
* @ctx: monitoring context
* @schemes: array of the schemes
* @nr_schemes: number of entries in @schemes
*
* This function should not be called while the kdamond of the context is
* running.
*
* Return: 0 if success, or negative error code otherwise.
*/
int damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes,
ssize_t nr_schemes)
{
struct damos *s, *next;
ssize_t i;
damon_for_each_scheme_safe(s, next, ctx)
damon_destroy_scheme(s);
for (i = 0; i < nr_schemes; i++)
damon_add_scheme(ctx, schemes[i]);
return 0;
}
/**
* damon_nr_running_ctxs() - Return number of currently running contexts.
*/
@ -281,17 +367,6 @@ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx)
return sz;
}
static bool damon_kdamond_running(struct damon_ctx *ctx)
{
bool running;
mutex_lock(&ctx->kdamond_lock);
running = ctx->kdamond != NULL;
mutex_unlock(&ctx->kdamond_lock);
return running;
}
static int kdamond_fn(void *data);
/*
@ -309,12 +384,11 @@ static int __damon_start(struct damon_ctx *ctx)
mutex_lock(&ctx->kdamond_lock);
if (!ctx->kdamond) {
err = 0;
ctx->kdamond_stop = false;
ctx->kdamond = kthread_run(kdamond_fn, ctx, "kdamond.%d",
nr_running_ctxs);
if (IS_ERR(ctx->kdamond)) {
err = PTR_ERR(ctx->kdamond);
ctx->kdamond = 0;
ctx->kdamond = NULL;
}
}
mutex_unlock(&ctx->kdamond_lock);
@ -357,15 +431,6 @@ int damon_start(struct damon_ctx **ctxs, int nr_ctxs)
return err;
}
static void kdamond_usleep(unsigned long usecs)
{
/* See Documentation/timers/timers-howto.rst for the thresholds */
if (usecs > 20 * 1000)
schedule_timeout_idle(usecs_to_jiffies(usecs));
else
usleep_idle_range(usecs, usecs + 1);
}
/*
* __damon_stop() - Stops monitoring of given context.
* @ctx: monitoring context
@ -374,12 +439,15 @@ static void kdamond_usleep(unsigned long usecs)
*/
static int __damon_stop(struct damon_ctx *ctx)
{
struct task_struct *tsk;
mutex_lock(&ctx->kdamond_lock);
if (ctx->kdamond) {
ctx->kdamond_stop = true;
tsk = ctx->kdamond;
if (tsk) {
get_task_struct(tsk);
mutex_unlock(&ctx->kdamond_lock);
while (damon_kdamond_running(ctx))
kdamond_usleep(ctx->sample_interval);
kthread_stop(tsk);
put_task_struct(tsk);
return 0;
}
mutex_unlock(&ctx->kdamond_lock);
@ -446,18 +514,221 @@ static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx)
static void kdamond_reset_aggregated(struct damon_ctx *c)
{
struct damon_target *t;
unsigned int ti = 0; /* target's index */
damon_for_each_target(t, c) {
struct damon_region *r;
damon_for_each_region(r, t) {
trace_damon_aggregated(t, r, damon_nr_regions(t));
trace_damon_aggregated(t, ti, r, damon_nr_regions(t));
r->last_nr_accesses = r->nr_accesses;
r->nr_accesses = 0;
}
ti++;
}
}
#define sz_damon_region(r) (r->ar.end - r->ar.start)
static void damon_split_region_at(struct damon_ctx *ctx,
struct damon_target *t, struct damon_region *r,
unsigned long sz_r);
static bool __damos_valid_target(struct damon_region *r, struct damos *s)
{
unsigned long sz;
sz = r->ar.end - r->ar.start;
return s->min_sz_region <= sz && sz <= s->max_sz_region &&
s->min_nr_accesses <= r->nr_accesses &&
r->nr_accesses <= s->max_nr_accesses &&
s->min_age_region <= r->age && r->age <= s->max_age_region;
}
static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t,
struct damon_region *r, struct damos *s)
{
bool ret = __damos_valid_target(r, s);
if (!ret || !s->quota.esz || !c->primitive.get_scheme_score)
return ret;
return c->primitive.get_scheme_score(c, t, r, s) >= s->quota.min_score;
}
static void damon_do_apply_schemes(struct damon_ctx *c,
struct damon_target *t,
struct damon_region *r)
{
struct damos *s;
damon_for_each_scheme(s, c) {
struct damos_quota *quota = &s->quota;
unsigned long sz = r->ar.end - r->ar.start;
struct timespec64 begin, end;
unsigned long sz_applied = 0;
if (!s->wmarks.activated)
continue;
/* Check the quota */
if (quota->esz && quota->charged_sz >= quota->esz)
continue;
/* Skip previously charged regions */
if (quota->charge_target_from) {
if (t != quota->charge_target_from)
continue;
if (r == damon_last_region(t)) {
quota->charge_target_from = NULL;
quota->charge_addr_from = 0;
continue;
}
if (quota->charge_addr_from &&
r->ar.end <= quota->charge_addr_from)
continue;
if (quota->charge_addr_from && r->ar.start <
quota->charge_addr_from) {
sz = ALIGN_DOWN(quota->charge_addr_from -
r->ar.start, DAMON_MIN_REGION);
if (!sz) {
if (r->ar.end - r->ar.start <=
DAMON_MIN_REGION)
continue;
sz = DAMON_MIN_REGION;
}
damon_split_region_at(c, t, r, sz);
r = damon_next_region(r);
sz = r->ar.end - r->ar.start;
}
quota->charge_target_from = NULL;
quota->charge_addr_from = 0;
}
if (!damos_valid_target(c, t, r, s))
continue;
/* Apply the scheme */
if (c->primitive.apply_scheme) {
if (quota->esz &&
quota->charged_sz + sz > quota->esz) {
sz = ALIGN_DOWN(quota->esz - quota->charged_sz,
DAMON_MIN_REGION);
if (!sz)
goto update_stat;
damon_split_region_at(c, t, r, sz);
}
ktime_get_coarse_ts64(&begin);
sz_applied = c->primitive.apply_scheme(c, t, r, s);
ktime_get_coarse_ts64(&end);
quota->total_charged_ns += timespec64_to_ns(&end) -
timespec64_to_ns(&begin);
quota->charged_sz += sz;
if (quota->esz && quota->charged_sz >= quota->esz) {
quota->charge_target_from = t;
quota->charge_addr_from = r->ar.end + 1;
}
}
if (s->action != DAMOS_STAT)
r->age = 0;
update_stat:
s->stat.nr_tried++;
s->stat.sz_tried += sz;
if (sz_applied)
s->stat.nr_applied++;
s->stat.sz_applied += sz_applied;
}
}
/* Shouldn't be called if quota->ms and quota->sz are zero */
static void damos_set_effective_quota(struct damos_quota *quota)
{
unsigned long throughput;
unsigned long esz;
if (!quota->ms) {
quota->esz = quota->sz;
return;
}
if (quota->total_charged_ns)
throughput = quota->total_charged_sz * 1000000 /
quota->total_charged_ns;
else
throughput = PAGE_SIZE * 1024;
esz = throughput * quota->ms;
if (quota->sz && quota->sz < esz)
esz = quota->sz;
quota->esz = esz;
}
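Worked example with assumed numbers: if total_charged_sz is 4 MiB accumulated over total_charged_ns = 2,000,000 ns, the estimated throughput is 4 MiB * 1,000,000 / 2,000,000 = 2 MiB per millisecond, so a time quota of ms = 10 yields an effective size quota esz of 20 MiB, which is then capped to quota->sz whenever a smaller explicit size quota is set.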
static void kdamond_apply_schemes(struct damon_ctx *c)
{
struct damon_target *t;
struct damon_region *r, *next_r;
struct damos *s;
damon_for_each_scheme(s, c) {
struct damos_quota *quota = &s->quota;
unsigned long cumulated_sz;
unsigned int score, max_score = 0;
if (!s->wmarks.activated)
continue;
if (!quota->ms && !quota->sz)
continue;
/* New charge window starts */
if (time_after_eq(jiffies, quota->charged_from +
msecs_to_jiffies(
quota->reset_interval))) {
if (quota->esz && quota->charged_sz >= quota->esz)
s->stat.qt_exceeds++;
quota->total_charged_sz += quota->charged_sz;
quota->charged_from = jiffies;
quota->charged_sz = 0;
damos_set_effective_quota(quota);
}
if (!c->primitive.get_scheme_score)
continue;
/* Fill up the score histogram */
memset(quota->histogram, 0, sizeof(quota->histogram));
damon_for_each_target(t, c) {
damon_for_each_region(r, t) {
if (!__damos_valid_target(r, s))
continue;
score = c->primitive.get_scheme_score(
c, t, r, s);
quota->histogram[score] +=
r->ar.end - r->ar.start;
if (score > max_score)
max_score = score;
}
}
/* Set the min score limit */
for (cumulated_sz = 0, score = max_score; ; score--) {
cumulated_sz += quota->histogram[score];
if (cumulated_sz >= quota->esz || !score)
break;
}
quota->min_score = score;
}
damon_for_each_target(t, c) {
damon_for_each_region_safe(r, next_r, t)
damon_do_apply_schemes(c, t, r);
}
}
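For illustration with assumed sizes: if esz is 10 MiB, regions scoring 90 total 6 MiB and regions scoring 80 total 8 MiB, the cumulated size first reaches esz while walking down at score 80, so min_score becomes 80 and only regions scoring at least 80 pass damos_valid_target() for the rest of this charge window.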
static inline unsigned long sz_damon_region(struct damon_region *r)
{
return r->ar.end - r->ar.start;
}
/*
* Merge two adjacent regions into one region
@ -469,12 +740,11 @@ static void damon_merge_two_regions(struct damon_target *t,
l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) /
(sz_l + sz_r);
l->age = (l->age * sz_l + r->age * sz_r) / (sz_l + sz_r);
l->ar.end = r->ar.end;
damon_destroy_region(r, t);
}
#define diff_of(a, b) (a > b ? a - b : b - a)
/*
* Merge adjacent regions having similar access frequencies
*
@ -488,8 +758,13 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres,
struct damon_region *r, *prev = NULL, *next;
damon_for_each_region_safe(r, next, t) {
if (abs(r->nr_accesses - r->last_nr_accesses) > thres)
r->age = 0;
else
r->age++;
if (prev && prev->ar.end == r->ar.start &&
diff_of(prev->nr_accesses, r->nr_accesses) <= thres &&
abs(prev->nr_accesses - r->nr_accesses) <= thres &&
sz_damon_region(prev) + sz_damon_region(r) <= sz_limit)
damon_merge_two_regions(t, prev, r);
else
@ -535,6 +810,9 @@ static void damon_split_region_at(struct damon_ctx *ctx,
r->ar.end = new->ar.start;
new->age = r->age;
new->last_nr_accesses = r->last_nr_accesses;
damon_insert_region(new, r, damon_next_region(r), t);
}
@ -623,12 +901,8 @@ static bool kdamond_need_update_primitive(struct damon_ctx *ctx)
static bool kdamond_need_stop(struct damon_ctx *ctx)
{
struct damon_target *t;
bool stop;
mutex_lock(&ctx->kdamond_lock);
stop = ctx->kdamond_stop;
mutex_unlock(&ctx->kdamond_lock);
if (stop)
if (kthread_should_stop())
return true;
if (!ctx->primitive.target_valid)
@ -642,11 +916,82 @@ static bool kdamond_need_stop(struct damon_ctx *ctx)
return true;
}
static void set_kdamond_stop(struct damon_ctx *ctx)
static unsigned long damos_wmark_metric_value(enum damos_wmark_metric metric)
{
mutex_lock(&ctx->kdamond_lock);
ctx->kdamond_stop = true;
mutex_unlock(&ctx->kdamond_lock);
struct sysinfo i;
switch (metric) {
case DAMOS_WMARK_FREE_MEM_RATE:
si_meminfo(&i);
return i.freeram * 1000 / i.totalram;
default:
break;
}
return -EINVAL;
}
/*
* Returns zero if the scheme is active. Otherwise, returns the time to wait
* for the next watermark check, in microseconds.
*/
static unsigned long damos_wmark_wait_us(struct damos *scheme)
{
unsigned long metric;
if (scheme->wmarks.metric == DAMOS_WMARK_NONE)
return 0;
metric = damos_wmark_metric_value(scheme->wmarks.metric);
/* higher than high watermark or lower than low watermark */
if (metric > scheme->wmarks.high || scheme->wmarks.low > metric) {
if (scheme->wmarks.activated)
pr_debug("deactivate a scheme (%d) for %s wmark\n",
scheme->action,
metric > scheme->wmarks.high ?
"high" : "low");
scheme->wmarks.activated = false;
return scheme->wmarks.interval;
}
/* inactive and higher than middle watermark */
if ((scheme->wmarks.high >= metric && metric >= scheme->wmarks.mid) &&
!scheme->wmarks.activated)
return scheme->wmarks.interval;
if (!scheme->wmarks.activated)
pr_debug("activate a scheme (%d)\n", scheme->action);
scheme->wmarks.activated = true;
return 0;
}
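For illustration with assumed thresholds: under DAMOS_WMARK_FREE_MEM_RATE the metric is free memory in permille (freeram * 1000 / totalram), so high=500, mid=400, low=200 means the scheme is deactivated whenever more than 50% or less than 20% of memory is free; an inactive scheme is re-activated only once free memory drops to 40% or below (the mid watermark provides the hysteresis), while an already active scheme stays active anywhere in the 20%..50% band.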
static void kdamond_usleep(unsigned long usecs)
{
/* See Documentation/timers/timers-howto.rst for the thresholds */
if (usecs > 20 * USEC_PER_MSEC)
schedule_timeout_idle(usecs_to_jiffies(usecs));
else
usleep_idle_range(usecs, usecs + 1);
}
/* Returns zero once a scheme gets activated, or a negative error code if kdamond should stop before that */
static int kdamond_wait_activation(struct damon_ctx *ctx)
{
struct damos *s;
unsigned long wait_time;
unsigned long min_wait_time = 0;
while (!kdamond_need_stop(ctx)) {
damon_for_each_scheme(s, ctx) {
wait_time = damos_wmark_wait_us(s);
if (!min_wait_time || wait_time < min_wait_time)
min_wait_time = wait_time;
}
if (!min_wait_time)
return 0;
kdamond_usleep(min_wait_time);
}
return -EBUSY;
}
/*
@ -659,24 +1004,26 @@ static int kdamond_fn(void *data)
struct damon_region *r, *next;
unsigned int max_nr_accesses = 0;
unsigned long sz_limit = 0;
bool done = false;
mutex_lock(&ctx->kdamond_lock);
pr_info("kdamond (%d) starts\n", ctx->kdamond->pid);
mutex_unlock(&ctx->kdamond_lock);
pr_debug("kdamond (%d) starts\n", current->pid);
if (ctx->primitive.init)
ctx->primitive.init(ctx);
if (ctx->callback.before_start && ctx->callback.before_start(ctx))
set_kdamond_stop(ctx);
done = true;
sz_limit = damon_region_sz_limit(ctx);
while (!kdamond_need_stop(ctx)) {
while (!kdamond_need_stop(ctx) && !done) {
if (kdamond_wait_activation(ctx))
continue;
if (ctx->primitive.prepare_access_checks)
ctx->primitive.prepare_access_checks(ctx);
if (ctx->callback.after_sampling &&
ctx->callback.after_sampling(ctx))
set_kdamond_stop(ctx);
done = true;
kdamond_usleep(ctx->sample_interval);
@ -689,7 +1036,8 @@ static int kdamond_fn(void *data)
sz_limit);
if (ctx->callback.after_aggregation &&
ctx->callback.after_aggregation(ctx))
set_kdamond_stop(ctx);
done = true;
kdamond_apply_schemes(ctx);
kdamond_reset_aggregated(ctx);
kdamond_split_regions(ctx);
if (ctx->primitive.reset_aggregated)
@ -707,13 +1055,12 @@ static int kdamond_fn(void *data)
damon_destroy_region(r, t);
}
if (ctx->callback.before_terminate &&
ctx->callback.before_terminate(ctx))
set_kdamond_stop(ctx);
if (ctx->callback.before_terminate)
ctx->callback.before_terminate(ctx);
if (ctx->primitive.cleanup)
ctx->primitive.cleanup(ctx);
pr_debug("kdamond (%d) finishes\n", ctx->kdamond->pid);
pr_debug("kdamond (%d) finishes\n", current->pid);
mutex_lock(&ctx->kdamond_lock);
ctx->kdamond = NULL;
mutex_unlock(&ctx->kdamond_lock);
@ -722,7 +1069,7 @@ static int kdamond_fn(void *data)
nr_running_ctxs--;
mutex_unlock(&damon_lock);
do_exit(0);
return 0;
}
#include "core-test.h"

View File

@ -109,9 +109,63 @@ static void damon_dbgfs_test_set_targets(struct kunit *test)
dbgfs_destroy_ctx(ctx);
}
static void damon_dbgfs_test_set_init_regions(struct kunit *test)
{
struct damon_ctx *ctx = damon_new_ctx();
unsigned long ids[] = {1, 2, 3};
/* Each line represents one region in ``<target id> <start> <end>`` */
char * const valid_inputs[] = {"2 10 20\n 2 20 30\n2 35 45",
"2 10 20\n",
"2 10 20\n1 39 59\n1 70 134\n 2 20 25\n",
""};
/* Reading the file again will show sorted, clean output */
char * const valid_expects[] = {"2 10 20\n2 20 30\n2 35 45\n",
"2 10 20\n",
"1 39 59\n1 70 134\n2 10 20\n2 20 25\n",
""};
char * const invalid_inputs[] = {"4 10 20\n", /* target not exists */
"2 10 20\n 2 14 26\n", /* regions overlap */
"1 10 20\n2 30 40\n 1 5 8"}; /* not sorted by address */
char *input, *expect;
int i, rc;
char buf[256];
damon_set_targets(ctx, ids, 3);
/* Put valid inputs and check the results */
for (i = 0; i < ARRAY_SIZE(valid_inputs); i++) {
input = valid_inputs[i];
expect = valid_expects[i];
rc = set_init_regions(ctx, input, strnlen(input, 256));
KUNIT_EXPECT_EQ(test, rc, 0);
memset(buf, 0, 256);
sprint_init_regions(ctx, buf, 256);
KUNIT_EXPECT_STREQ(test, (char *)buf, expect);
}
/* Put invalid inputs and check the return error code */
for (i = 0; i < ARRAY_SIZE(invalid_inputs); i++) {
input = invalid_inputs[i];
pr_info("input: %s\n", input);
rc = set_init_regions(ctx, input, strnlen(input, 256));
KUNIT_EXPECT_EQ(test, rc, -EINVAL);
memset(buf, 0, 256);
sprint_init_regions(ctx, buf, 256);
KUNIT_EXPECT_STREQ(test, (char *)buf, "");
}
damon_set_targets(ctx, NULL, 0);
damon_destroy_ctx(ctx);
}
static struct kunit_case damon_test_cases[] = {
KUNIT_CASE(damon_dbgfs_test_str_to_target_ids),
KUNIT_CASE(damon_dbgfs_test_set_targets),
KUNIT_CASE(damon_dbgfs_test_set_init_regions),
{},
};

View File

@ -69,8 +69,7 @@ static ssize_t dbgfs_attrs_write(struct file *file,
struct damon_ctx *ctx = file->private_data;
unsigned long s, a, r, minr, maxr;
char *kbuf;
ssize_t ret = count;
int err;
ssize_t ret;
kbuf = user_input_str(buf, count, ppos);
if (IS_ERR(kbuf))
@ -88,9 +87,9 @@ static ssize_t dbgfs_attrs_write(struct file *file,
goto unlock_out;
}
err = damon_set_attrs(ctx, s, a, r, minr, maxr);
if (err)
ret = err;
ret = damon_set_attrs(ctx, s, a, r, minr, maxr);
if (!ret)
ret = count;
unlock_out:
mutex_unlock(&ctx->kdamond_lock);
out:
@ -98,6 +97,184 @@ static ssize_t dbgfs_attrs_write(struct file *file,
return ret;
}
static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len)
{
struct damos *s;
int written = 0;
int rc;
damon_for_each_scheme(s, c) {
rc = scnprintf(&buf[written], len - written,
"%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
s->min_sz_region, s->max_sz_region,
s->min_nr_accesses, s->max_nr_accesses,
s->min_age_region, s->max_age_region,
s->action,
s->quota.ms, s->quota.sz,
s->quota.reset_interval,
s->quota.weight_sz,
s->quota.weight_nr_accesses,
s->quota.weight_age,
s->wmarks.metric, s->wmarks.interval,
s->wmarks.high, s->wmarks.mid, s->wmarks.low,
s->stat.nr_tried, s->stat.sz_tried,
s->stat.nr_applied, s->stat.sz_applied,
s->stat.qt_exceeds);
if (!rc)
return -ENOMEM;
written += rc;
}
return written;
}
static ssize_t dbgfs_schemes_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
struct damon_ctx *ctx = file->private_data;
char *kbuf;
ssize_t len;
kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN);
if (!kbuf)
return -ENOMEM;
mutex_lock(&ctx->kdamond_lock);
len = sprint_schemes(ctx, kbuf, count);
mutex_unlock(&ctx->kdamond_lock);
if (len < 0)
goto out;
len = simple_read_from_buffer(buf, count, ppos, kbuf, len);
out:
kfree(kbuf);
return len;
}
static void free_schemes_arr(struct damos **schemes, ssize_t nr_schemes)
{
ssize_t i;
for (i = 0; i < nr_schemes; i++)
kfree(schemes[i]);
kfree(schemes);
}
static bool damos_action_valid(int action)
{
switch (action) {
case DAMOS_WILLNEED:
case DAMOS_COLD:
case DAMOS_PAGEOUT:
case DAMOS_HUGEPAGE:
case DAMOS_NOHUGEPAGE:
case DAMOS_STAT:
return true;
default:
return false;
}
}
/*
* Converts a string into an array of struct damos pointers
*
* Returns an array of struct damos pointers converted from the string if the
* conversion succeeds, or NULL otherwise.
*/
static struct damos **str_to_schemes(const char *str, ssize_t len,
ssize_t *nr_schemes)
{
struct damos *scheme, **schemes;
const int max_nr_schemes = 256;
int pos = 0, parsed, ret;
unsigned long min_sz, max_sz;
unsigned int min_nr_a, max_nr_a, min_age, max_age;
unsigned int action;
schemes = kmalloc_array(max_nr_schemes, sizeof(scheme),
GFP_KERNEL);
if (!schemes)
return NULL;
*nr_schemes = 0;
while (pos < len && *nr_schemes < max_nr_schemes) {
struct damos_quota quota = {};
struct damos_watermarks wmarks;
ret = sscanf(&str[pos],
"%lu %lu %u %u %u %u %u %lu %lu %lu %u %u %u %u %lu %lu %lu %lu%n",
&min_sz, &max_sz, &min_nr_a, &max_nr_a,
&min_age, &max_age, &action, &quota.ms,
&quota.sz, &quota.reset_interval,
&quota.weight_sz, &quota.weight_nr_accesses,
&quota.weight_age, &wmarks.metric,
&wmarks.interval, &wmarks.high, &wmarks.mid,
&wmarks.low, &parsed);
if (ret != 18)
break;
if (!damos_action_valid(action))
goto fail;
if (min_sz > max_sz || min_nr_a > max_nr_a || min_age > max_age)
goto fail;
if (wmarks.high < wmarks.mid || wmarks.high < wmarks.low ||
wmarks.mid < wmarks.low)
goto fail;
pos += parsed;
scheme = damon_new_scheme(min_sz, max_sz, min_nr_a, max_nr_a,
min_age, max_age, action, &quota, &wmarks);
if (!scheme)
goto fail;
schemes[*nr_schemes] = scheme;
*nr_schemes += 1;
}
return schemes;
fail:
free_schemes_arr(schemes, *nr_schemes);
return NULL;
}
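For reference, the 18 fields parsed above are, in order: min_sz max_sz min_nr_accesses max_nr_accesses min_age max_age action quota_ms quota_sz quota_reset_interval weight_sz weight_nr_accesses weight_age wmark_metric wmark_interval wmark_high wmark_mid wmark_low. One illustrative input line (values made up, and assuming the damos_action and damos_wmark_metric enums number their entries in the order listed in damos_action_valid() and damos_wmark_metric_value(), i.e. 2 for DAMOS_PAGEOUT and 1 for DAMOS_WMARK_FREE_MEM_RATE) would be:
4096 18446744073709551615 0 0 10 4294967295 2 10 1073741824 1000 1 1 1 1 5000000 500 400 200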
static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
struct damon_ctx *ctx = file->private_data;
char *kbuf;
struct damos **schemes;
ssize_t nr_schemes = 0, ret;
kbuf = user_input_str(buf, count, ppos);
if (IS_ERR(kbuf))
return PTR_ERR(kbuf);
schemes = str_to_schemes(kbuf, count, &nr_schemes);
if (!schemes) {
ret = -EINVAL;
goto out;
}
mutex_lock(&ctx->kdamond_lock);
if (ctx->kdamond) {
ret = -EBUSY;
goto unlock_out;
}
ret = damon_set_schemes(ctx, schemes, nr_schemes);
if (!ret) {
ret = count;
nr_schemes = 0;
}
unlock_out:
mutex_unlock(&ctx->kdamond_lock);
free_schemes_arr(schemes, nr_schemes);
out:
kfree(kbuf);
return ret;
}
static inline bool targetid_is_pid(const struct damon_ctx *ctx)
{
return ctx->primitive.target_valid == damon_va_target_valid;
@ -186,26 +363,30 @@ static ssize_t dbgfs_target_ids_write(struct file *file,
{
struct damon_ctx *ctx = file->private_data;
struct damon_target *t, *next_t;
char *kbuf, *nrs;
bool id_is_pid = true;
char *kbuf;
unsigned long *targets;
ssize_t nr_targets;
ssize_t ret = count;
ssize_t ret;
int i;
int err;
kbuf = user_input_str(buf, count, ppos);
if (IS_ERR(kbuf))
return PTR_ERR(kbuf);
nrs = kbuf;
if (!strncmp(kbuf, "paddr\n", count)) {
id_is_pid = false;
/* target id is meaningless here, but we set it just for fun */
scnprintf(kbuf, count, "42 ");
}
targets = str_to_target_ids(nrs, ret, &nr_targets);
targets = str_to_target_ids(kbuf, count, &nr_targets);
if (!targets) {
ret = -ENOMEM;
goto out;
}
if (targetid_is_pid(ctx)) {
if (id_is_pid) {
for (i = 0; i < nr_targets; i++) {
targets[i] = (unsigned long)find_get_pid(
(int)targets[i]);
@ -219,7 +400,7 @@ static ssize_t dbgfs_target_ids_write(struct file *file,
mutex_lock(&ctx->kdamond_lock);
if (ctx->kdamond) {
if (targetid_is_pid(ctx))
if (id_is_pid)
dbgfs_put_pids(targets, nr_targets);
ret = -EBUSY;
goto unlock_out;
@ -232,11 +413,18 @@ static ssize_t dbgfs_target_ids_write(struct file *file,
damon_destroy_target(t);
}
err = damon_set_targets(ctx, targets, nr_targets);
if (err) {
if (targetid_is_pid(ctx))
/* Configure the context for the address space type */
if (id_is_pid)
damon_va_set_primitives(ctx);
else
damon_pa_set_primitives(ctx);
ret = damon_set_targets(ctx, targets, nr_targets);
if (ret) {
if (id_is_pid)
dbgfs_put_pids(targets, nr_targets);
ret = err;
} else {
ret = count;
}
unlock_out:
@ -248,6 +436,152 @@ static ssize_t dbgfs_target_ids_write(struct file *file,
return ret;
}
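As a usage sketch (the debugfs mount point and the damon directory name are assumed, e.g. /sys/kernel/debug/damon/target_ids): writing space-separated pids such as "1234 5678" keeps the context on the virtual address space primitives and monitors those processes, while writing the literal string "paddr" switches the context to the physical address space primitives, with the placeholder target id 42 substituted internally as shown above.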
static ssize_t sprint_init_regions(struct damon_ctx *c, char *buf, ssize_t len)
{
struct damon_target *t;
struct damon_region *r;
int written = 0;
int rc;
damon_for_each_target(t, c) {
damon_for_each_region(r, t) {
rc = scnprintf(&buf[written], len - written,
"%lu %lu %lu\n",
t->id, r->ar.start, r->ar.end);
if (!rc)
return -ENOMEM;
written += rc;
}
}
return written;
}
static ssize_t dbgfs_init_regions_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
struct damon_ctx *ctx = file->private_data;
char *kbuf;
ssize_t len;
kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN);
if (!kbuf)
return -ENOMEM;
mutex_lock(&ctx->kdamond_lock);
if (ctx->kdamond) {
mutex_unlock(&ctx->kdamond_lock);
len = -EBUSY;
goto out;
}
len = sprint_init_regions(ctx, kbuf, count);
mutex_unlock(&ctx->kdamond_lock);
if (len < 0)
goto out;
len = simple_read_from_buffer(buf, count, ppos, kbuf, len);
out:
kfree(kbuf);
return len;
}
static int add_init_region(struct damon_ctx *c,
unsigned long target_id, struct damon_addr_range *ar)
{
struct damon_target *t;
struct damon_region *r, *prev;
unsigned long id;
int rc = -EINVAL;
if (ar->start >= ar->end)
return -EINVAL;
damon_for_each_target(t, c) {
id = t->id;
if (targetid_is_pid(c))
id = (unsigned long)pid_vnr((struct pid *)id);
if (id == target_id) {
r = damon_new_region(ar->start, ar->end);
if (!r)
return -ENOMEM;
damon_add_region(r, t);
if (damon_nr_regions(t) > 1) {
prev = damon_prev_region(r);
if (prev->ar.end > r->ar.start) {
damon_destroy_region(r, t);
return -EINVAL;
}
}
rc = 0;
}
}
return rc;
}
static int set_init_regions(struct damon_ctx *c, const char *str, ssize_t len)
{
struct damon_target *t;
struct damon_region *r, *next;
int pos = 0, parsed, ret;
unsigned long target_id;
struct damon_addr_range ar;
int err;
damon_for_each_target(t, c) {
damon_for_each_region_safe(r, next, t)
damon_destroy_region(r, t);
}
while (pos < len) {
ret = sscanf(&str[pos], "%lu %lu %lu%n",
&target_id, &ar.start, &ar.end, &parsed);
if (ret != 3)
break;
err = add_init_region(c, target_id, &ar);
if (err)
goto fail;
pos += parsed;
}
return 0;
fail:
damon_for_each_target(t, c) {
damon_for_each_region_safe(r, next, t)
damon_destroy_region(r, t);
}
return err;
}
static ssize_t dbgfs_init_regions_write(struct file *file,
const char __user *buf, size_t count,
loff_t *ppos)
{
struct damon_ctx *ctx = file->private_data;
char *kbuf;
ssize_t ret = count;
int err;
kbuf = user_input_str(buf, count, ppos);
if (IS_ERR(kbuf))
return PTR_ERR(kbuf);
mutex_lock(&ctx->kdamond_lock);
if (ctx->kdamond) {
ret = -EBUSY;
goto unlock_out;
}
err = set_init_regions(ctx, kbuf, ret);
if (err)
ret = err;
unlock_out:
mutex_unlock(&ctx->kdamond_lock);
kfree(kbuf);
return ret;
}
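As a usage sketch for this file (debugfs path assumed as above): with a target already registered via target_ids, each written line has the form "<target id> <start> <end>", e.g. "4321 4096 8192"; a write replaces any previously set initial regions, overlapping or unordered regions are rejected with -EINVAL, and the file returns -EBUSY while the kdamond is running.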
static ssize_t dbgfs_kdamond_pid_read(struct file *file,
char __user *buf, size_t count, loff_t *ppos)
{
@ -287,12 +621,24 @@ static const struct file_operations attrs_fops = {
.write = dbgfs_attrs_write,
};
static const struct file_operations schemes_fops = {
.open = damon_dbgfs_open,
.read = dbgfs_schemes_read,
.write = dbgfs_schemes_write,
};
static const struct file_operations target_ids_fops = {
.open = damon_dbgfs_open,
.read = dbgfs_target_ids_read,
.write = dbgfs_target_ids_write,
};
static const struct file_operations init_regions_fops = {
.open = damon_dbgfs_open,
.read = dbgfs_init_regions_read,
.write = dbgfs_init_regions_write,
};
static const struct file_operations kdamond_pid_fops = {
.open = damon_dbgfs_open,
.read = dbgfs_kdamond_pid_read,
@ -300,22 +646,22 @@ static const struct file_operations kdamond_pid_fops = {
static void dbgfs_fill_ctx_dir(struct dentry *dir, struct damon_ctx *ctx)
{
const char * const file_names[] = {"attrs", "target_ids",
"kdamond_pid"};
const struct file_operations *fops[] = {&attrs_fops, &target_ids_fops,
&kdamond_pid_fops};
const char * const file_names[] = {"attrs", "schemes", "target_ids",
"init_regions", "kdamond_pid"};
const struct file_operations *fops[] = {&attrs_fops, &schemes_fops,
&target_ids_fops, &init_regions_fops, &kdamond_pid_fops};
int i;
for (i = 0; i < ARRAY_SIZE(file_names); i++)
debugfs_create_file(file_names[i], 0600, dir, ctx, fops[i]);
}
static int dbgfs_before_terminate(struct damon_ctx *ctx)
static void dbgfs_before_terminate(struct damon_ctx *ctx)
{
struct damon_target *t, *next;
if (!targetid_is_pid(ctx))
return 0;
return;
mutex_lock(&ctx->kdamond_lock);
damon_for_each_target_safe(t, next, ctx) {
@ -323,7 +669,6 @@ static int dbgfs_before_terminate(struct damon_ctx *ctx)
damon_destroy_target(t);
}
mutex_unlock(&ctx->kdamond_lock);
return 0;
}
static struct damon_ctx *dbgfs_new_ctx(void)
@ -398,8 +743,7 @@ static ssize_t dbgfs_mk_context_write(struct file *file,
{
char *kbuf;
char *ctx_name;
ssize_t ret = count;
int err;
ssize_t ret;
kbuf = user_input_str(buf, count, ppos);
if (IS_ERR(kbuf))
@ -417,9 +761,9 @@ static ssize_t dbgfs_mk_context_write(struct file *file,
}
mutex_lock(&damon_dbgfs_lock);
err = dbgfs_mk_context(ctx_name);
if (err)
ret = err;
ret = dbgfs_mk_context(ctx_name);
if (!ret)
ret = count;
mutex_unlock(&damon_dbgfs_lock);
out:
@ -488,8 +832,7 @@ static ssize_t dbgfs_rm_context_write(struct file *file,
const char __user *buf, size_t count, loff_t *ppos)
{
char *kbuf;
ssize_t ret = count;
int err;
ssize_t ret;
char *ctx_name;
kbuf = user_input_str(buf, count, ppos);
@ -508,9 +851,9 @@ static ssize_t dbgfs_rm_context_write(struct file *file,
}
mutex_lock(&damon_dbgfs_lock);
err = dbgfs_rm_context(ctx_name);
if (err)
ret = err;
ret = dbgfs_rm_context(ctx_name);
if (!ret)
ret = count;
mutex_unlock(&damon_dbgfs_lock);
out:
@ -534,9 +877,8 @@ static ssize_t dbgfs_monitor_on_read(struct file *file,
static ssize_t dbgfs_monitor_on_write(struct file *file,
const char __user *buf, size_t count, loff_t *ppos)
{
ssize_t ret = count;
ssize_t ret;
char *kbuf;
int err;
kbuf = user_input_str(buf, count, ppos);
if (IS_ERR(kbuf))
@ -549,16 +891,26 @@ static ssize_t dbgfs_monitor_on_write(struct file *file,
}
mutex_lock(&damon_dbgfs_lock);
if (!strncmp(kbuf, "on", count))
err = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs);
else if (!strncmp(kbuf, "off", count))
err = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs);
else
err = -EINVAL;
if (!strncmp(kbuf, "on", count)) {
int i;
for (i = 0; i < dbgfs_nr_ctxs; i++) {
if (damon_targets_empty(dbgfs_ctxs[i])) {
kfree(kbuf);
mutex_unlock(&damon_dbgfs_lock);
return -EINVAL;
}
}
ret = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs);
} else if (!strncmp(kbuf, "off", count)) {
ret = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs);
} else {
ret = -EINVAL;
}
mutex_unlock(&damon_dbgfs_lock);
if (err)
ret = err;
if (!ret)
ret = count;
kfree(kbuf);
return ret;
}

View File

@ -135,7 +135,6 @@ static void damon_do_test_apply_three_regions(struct kunit *test,
struct damon_addr_range *three_regions,
unsigned long *expected, int nr_expected)
{
struct damon_ctx *ctx = damon_new_ctx();
struct damon_target *t;
struct damon_region *r;
int i;
@ -145,7 +144,6 @@ static void damon_do_test_apply_three_regions(struct kunit *test,
r = damon_new_region(regions[i * 2], regions[i * 2 + 1]);
damon_add_region(r, t);
}
damon_add_target(ctx, t);
damon_va_apply_three_regions(t, three_regions);
@ -154,8 +152,6 @@ static void damon_do_test_apply_three_regions(struct kunit *test,
KUNIT_EXPECT_EQ(test, r->ar.start, expected[i * 2]);
KUNIT_EXPECT_EQ(test, r->ar.end, expected[i * 2 + 1]);
}
damon_destroy_ctx(ctx);
}
/*
@ -233,7 +229,7 @@ static void damon_test_apply_three_regions3(struct kunit *test)
* and 70-100) has totally freed and mapped to different area (30-32 and
* 65-68). The target regions which were in the old second and third big
* regions should now be removed and new target regions covering the new second
* and third big regions should be crated.
* and third big regions should be created.
*/
static void damon_test_apply_three_regions4(struct kunit *test)
{
@ -252,60 +248,59 @@ static void damon_test_apply_three_regions4(struct kunit *test)
new_three_regions, expected, ARRAY_SIZE(expected));
}
static void damon_test_split_evenly(struct kunit *test)
static void damon_test_split_evenly_fail(struct kunit *test,
unsigned long start, unsigned long end, unsigned int nr_pieces)
{
struct damon_ctx *c = damon_new_ctx();
struct damon_target *t;
struct damon_region *r;
unsigned long i;
KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(NULL, NULL, 5),
-EINVAL);
t = damon_new_target(42);
r = damon_new_region(0, 100);
KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, 0), -EINVAL);
struct damon_target *t = damon_new_target(42);
struct damon_region *r = damon_new_region(start, end);
damon_add_region(r, t);
KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, 10), 0);
KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 10u);
i = 0;
damon_for_each_region(r, t) {
KUNIT_EXPECT_EQ(test, r->ar.start, i++ * 10);
KUNIT_EXPECT_EQ(test, r->ar.end, i * 10);
}
damon_free_target(t);
t = damon_new_target(42);
r = damon_new_region(5, 59);
damon_add_region(r, t);
KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, 5), 0);
KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 5u);
i = 0;
damon_for_each_region(r, t) {
if (i == 4)
break;
KUNIT_EXPECT_EQ(test, r->ar.start, 5 + 10 * i++);
KUNIT_EXPECT_EQ(test, r->ar.end, 5 + 10 * i);
}
KUNIT_EXPECT_EQ(test, r->ar.start, 5 + 10 * i);
KUNIT_EXPECT_EQ(test, r->ar.end, 59ul);
damon_free_target(t);
t = damon_new_target(42);
r = damon_new_region(5, 6);
damon_add_region(r, t);
KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, 2), -EINVAL);
KUNIT_EXPECT_EQ(test,
damon_va_evenly_split_region(t, r, nr_pieces), -EINVAL);
KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1u);
damon_for_each_region(r, t) {
KUNIT_EXPECT_EQ(test, r->ar.start, 5ul);
KUNIT_EXPECT_EQ(test, r->ar.end, 6ul);
KUNIT_EXPECT_EQ(test, r->ar.start, start);
KUNIT_EXPECT_EQ(test, r->ar.end, end);
}
damon_free_target(t);
damon_destroy_ctx(c);
}
static void damon_test_split_evenly_succ(struct kunit *test,
unsigned long start, unsigned long end, unsigned int nr_pieces)
{
struct damon_target *t = damon_new_target(42);
struct damon_region *r = damon_new_region(start, end);
unsigned long expected_width = (end - start) / nr_pieces;
unsigned long i = 0;
damon_add_region(r, t);
KUNIT_EXPECT_EQ(test,
damon_va_evenly_split_region(t, r, nr_pieces), 0);
KUNIT_EXPECT_EQ(test, damon_nr_regions(t), nr_pieces);
damon_for_each_region(r, t) {
if (i == nr_pieces - 1)
break;
KUNIT_EXPECT_EQ(test,
r->ar.start, start + i++ * expected_width);
KUNIT_EXPECT_EQ(test, r->ar.end, start + i * expected_width);
}
KUNIT_EXPECT_EQ(test, r->ar.start, start + i * expected_width);
KUNIT_EXPECT_EQ(test, r->ar.end, end);
damon_free_target(t);
}
static void damon_test_split_evenly(struct kunit *test)
{
KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(NULL, NULL, 5),
-EINVAL);
damon_test_split_evenly_fail(test, 0, 100, 0);
damon_test_split_evenly_succ(test, 0, 100, 10);
damon_test_split_evenly_succ(test, 5, 59, 5);
damon_test_split_evenly_fail(test, 5, 6, 2);
}
static struct kunit_case damon_test_cases[] = {

View File

@ -7,31 +7,29 @@
#define pr_fmt(fmt) "damon-va: " fmt
#include <linux/damon.h>
#include <linux/hugetlb.h>
#include <linux/mm.h>
#include <linux/mmu_notifier.h>
#include <asm-generic/mman-common.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
#include <linux/pagewalk.h>
#include <linux/random.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include "prmtv-common.h"
#ifdef CONFIG_DAMON_VADDR_KUNIT_TEST
#undef DAMON_MIN_REGION
#define DAMON_MIN_REGION 1
#endif
/* Get a random number in [l, r) */
#define damon_rand(l, r) (l + prandom_u32_max(r - l))
/*
* 't->id' should be the pointer to the relevant 'struct pid' with a reference
* held. The caller must put the returned task, unless it is NULL.
*/
#define damon_get_task_struct(t) \
(get_pid_task((struct pid *)t->id, PIDTYPE_PID))
static inline struct task_struct *damon_get_task_struct(struct damon_target *t)
{
return get_pid_task((struct pid *)t->id, PIDTYPE_PID);
}
/*
* Get the mm_struct of the given target
@ -102,16 +100,6 @@ static unsigned long sz_range(struct damon_addr_range *r)
return r->end - r->start;
}
static void swap_ranges(struct damon_addr_range *r1,
struct damon_addr_range *r2)
{
struct damon_addr_range tmp;
tmp = *r1;
*r1 = *r2;
*r2 = tmp;
}
/*
* Find three regions separated by two biggest unmapped regions
*
@ -150,9 +138,9 @@ static int __damon_va_three_regions(struct vm_area_struct *vma,
gap.start = last_vma->vm_end;
gap.end = vma->vm_start;
if (sz_range(&gap) > sz_range(&second_gap)) {
swap_ranges(&gap, &second_gap);
swap(gap, second_gap);
if (sz_range(&second_gap) > sz_range(&first_gap))
swap_ranges(&second_gap, &first_gap);
swap(second_gap, first_gap);
}
next:
last_vma = vma;
@ -163,7 +151,7 @@ static int __damon_va_three_regions(struct vm_area_struct *vma,
/* Sort the two biggest gaps by address */
if (first_gap.start > second_gap.start)
swap_ranges(&first_gap, &second_gap);
swap(first_gap, second_gap);
/* Store the result */
regions[0].start = ALIGN(start, DAMON_MIN_REGION);
@ -244,13 +232,19 @@ static int damon_va_three_regions(struct damon_target *t,
static void __damon_va_init_regions(struct damon_ctx *ctx,
struct damon_target *t)
{
struct damon_target *ti;
struct damon_region *r;
struct damon_addr_range regions[3];
unsigned long sz = 0, nr_pieces;
int i;
int i, tidx = 0;
if (damon_va_three_regions(t, regions)) {
pr_err("Failed to get three regions of target %lu\n", t->id);
damon_for_each_target(ti, ctx) {
if (ti == t)
break;
tidx++;
}
pr_debug("Failed to get three regions of %dth target\n", tidx);
return;
}
@ -276,7 +270,7 @@ static void __damon_va_init_regions(struct damon_ctx *ctx,
}
/* Initialize '->regions_list' of every target (task) */
void damon_va_init(struct damon_ctx *ctx)
static void damon_va_init(struct damon_ctx *ctx)
{
struct damon_target *t;
@ -296,7 +290,8 @@ void damon_va_init(struct damon_ctx *ctx)
*
* Returns true if it is.
*/
static bool damon_intersect(struct damon_region *r, struct damon_addr_range *re)
static bool damon_intersect(struct damon_region *r,
struct damon_addr_range *re)
{
return !(r->ar.end <= re->start || re->end <= r->ar.start);
}
@ -311,7 +306,7 @@ static void damon_va_apply_three_regions(struct damon_target *t,
struct damon_addr_range bregions[3])
{
struct damon_region *r, *next;
unsigned int i = 0;
unsigned int i;
/* Remove regions which are not in the three big regions now */
damon_for_each_region_safe(r, next, t) {
@ -360,7 +355,7 @@ static void damon_va_apply_three_regions(struct damon_target *t,
/*
* Update regions for current memory mappings
*/
void damon_va_update(struct damon_ctx *ctx)
static void damon_va_update(struct damon_ctx *ctx)
{
struct damon_addr_range three_regions[3];
struct damon_target *t;
@ -372,82 +367,6 @@ void damon_va_update(struct damon_ctx *ctx)
}
}
/*
* Get an online page for a pfn if it's in the LRU list. Otherwise, returns
* NULL.
*
* The body of this function is stolen from the 'page_idle_get_page()'. We
* steal rather than reuse it because the code is quite simple.
*/
static struct page *damon_get_page(unsigned long pfn)
{
struct page *page = pfn_to_online_page(pfn);
if (!page || !PageLRU(page) || !get_page_unless_zero(page))
return NULL;
if (unlikely(!PageLRU(page))) {
put_page(page);
page = NULL;
}
return page;
}
static void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm,
unsigned long addr)
{
bool referenced = false;
struct page *page = damon_get_page(pte_pfn(*pte));
if (!page)
return;
if (pte_young(*pte)) {
referenced = true;
*pte = pte_mkold(*pte);
}
#ifdef CONFIG_MMU_NOTIFIER
if (mmu_notifier_clear_young(mm, addr, addr + PAGE_SIZE))
referenced = true;
#endif /* CONFIG_MMU_NOTIFIER */
if (referenced)
set_page_young(page);
set_page_idle(page);
put_page(page);
}
static void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm,
unsigned long addr)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
bool referenced = false;
struct page *page = damon_get_page(pmd_pfn(*pmd));
if (!page)
return;
if (pmd_young(*pmd)) {
referenced = true;
*pmd = pmd_mkold(*pmd);
}
#ifdef CONFIG_MMU_NOTIFIER
if (mmu_notifier_clear_young(mm, addr,
addr + ((1UL) << HPAGE_PMD_SHIFT)))
referenced = true;
#endif /* CONFIG_MMU_NOTIFIER */
if (referenced)
set_page_young(page);
set_page_idle(page);
put_page(page);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
}
static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
@ -475,8 +394,65 @@ static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr,
return 0;
}
static struct mm_walk_ops damon_mkold_ops = {
#ifdef CONFIG_HUGETLB_PAGE
static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long addr)
{
bool referenced = false;
pte_t entry = huge_ptep_get(pte);
struct page *page = pte_page(entry);
if (!page)
return;
get_page(page);
if (pte_young(entry)) {
referenced = true;
entry = pte_mkold(entry);
huge_ptep_set_access_flags(vma, addr, pte, entry,
vma->vm_flags & VM_WRITE);
}
#ifdef CONFIG_MMU_NOTIFIER
if (mmu_notifier_clear_young(mm, addr,
addr + huge_page_size(hstate_vma(vma))))
referenced = true;
#endif /* CONFIG_MMU_NOTIFIER */
if (referenced)
set_page_young(page);
set_page_idle(page);
put_page(page);
}
static int damon_mkold_hugetlb_entry(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
struct hstate *h = hstate_vma(walk->vma);
spinlock_t *ptl;
pte_t entry;
ptl = huge_pte_lock(h, walk->mm, pte);
entry = huge_ptep_get(pte);
if (!pte_present(entry))
goto out;
damon_hugetlb_mkold(pte, walk->mm, walk->vma, addr);
out:
spin_unlock(ptl);
return 0;
}
#else
#define damon_mkold_hugetlb_entry NULL
#endif /* CONFIG_HUGETLB_PAGE */
static const struct mm_walk_ops damon_mkold_ops = {
.pmd_entry = damon_mkold_pmd_entry,
.hugetlb_entry = damon_mkold_hugetlb_entry,
};
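For context, a table like damon_mkold_ops is handed to the kernel's page-table walker, which now also invokes the new .hugetlb_entry callback for hugetlb mappings. A minimal, hypothetical sketch of that calling pattern (the helper name below is invented; walk_page_range() is assumed to have its usual signature and to be called under the mmap read lock):

	static void mkold_one_address(struct mm_struct *mm, unsigned long addr)
	{
		/* Walk one page worth of address space and age the mapping. */
		mmap_read_lock(mm);
		walk_page_range(mm, addr & PAGE_MASK, addr + 1, &damon_mkold_ops, NULL);
		mmap_read_unlock(mm);
	}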
static void damon_va_mkold(struct mm_struct *mm, unsigned long addr)
@ -490,7 +466,7 @@ static void damon_va_mkold(struct mm_struct *mm, unsigned long addr)
* Functions for the access checking of the regions
*/
static void damon_va_prepare_access_check(struct damon_ctx *ctx,
static void __damon_va_prepare_access_check(struct damon_ctx *ctx,
struct mm_struct *mm, struct damon_region *r)
{
r->sampling_addr = damon_rand(r->ar.start, r->ar.end);
@ -498,7 +474,7 @@ static void damon_va_prepare_access_check(struct damon_ctx *ctx,
damon_va_mkold(mm, r->sampling_addr);
}
void damon_va_prepare_access_checks(struct damon_ctx *ctx)
static void damon_va_prepare_access_checks(struct damon_ctx *ctx)
{
struct damon_target *t;
struct mm_struct *mm;
@ -509,7 +485,7 @@ void damon_va_prepare_access_checks(struct damon_ctx *ctx)
if (!mm)
continue;
damon_for_each_region(r, t)
damon_va_prepare_access_check(ctx, mm, r);
__damon_va_prepare_access_check(ctx, mm, r);
mmput(mm);
}
}
@ -571,8 +547,47 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr,
return 0;
}
static struct mm_walk_ops damon_young_ops = {
#ifdef CONFIG_HUGETLB_PAGE
static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
struct damon_young_walk_private *priv = walk->private;
struct hstate *h = hstate_vma(walk->vma);
struct page *page;
spinlock_t *ptl;
pte_t entry;
ptl = huge_pte_lock(h, walk->mm, pte);
entry = huge_ptep_get(pte);
if (!pte_present(entry))
goto out;
page = pte_page(entry);
if (!page)
goto out;
get_page(page);
if (pte_young(entry) || !page_is_idle(page) ||
mmu_notifier_test_young(walk->mm, addr)) {
*priv->page_sz = huge_page_size(h);
priv->young = true;
}
put_page(page);
out:
spin_unlock(ptl);
return 0;
}
#else
#define damon_young_hugetlb_entry NULL
#endif /* CONFIG_HUGETLB_PAGE */
static const struct mm_walk_ops damon_young_ops = {
.pmd_entry = damon_young_pmd_entry,
.hugetlb_entry = damon_young_hugetlb_entry,
};
static bool damon_va_young(struct mm_struct *mm, unsigned long addr,
@ -595,7 +610,7 @@ static bool damon_va_young(struct mm_struct *mm, unsigned long addr,
* mm 'mm_struct' for the given virtual address space
* r the region to be checked
*/
static void damon_va_check_access(struct damon_ctx *ctx,
static void __damon_va_check_access(struct damon_ctx *ctx,
struct mm_struct *mm, struct damon_region *r)
{
static struct mm_struct *last_mm;
@ -619,7 +634,7 @@ static void damon_va_check_access(struct damon_ctx *ctx,
last_addr = r->sampling_addr;
}
unsigned int damon_va_check_accesses(struct damon_ctx *ctx)
static unsigned int damon_va_check_accesses(struct damon_ctx *ctx)
{
struct damon_target *t;
struct mm_struct *mm;
@ -631,7 +646,7 @@ unsigned int damon_va_check_accesses(struct damon_ctx *ctx)
if (!mm)
continue;
damon_for_each_region(r, t) {
damon_va_check_access(ctx, mm, r);
__damon_va_check_access(ctx, mm, r);
max_nr_accesses = max(r->nr_accesses, max_nr_accesses);
}
mmput(mm);
@ -658,6 +673,78 @@ bool damon_va_target_valid(void *target)
return false;
}
#ifndef CONFIG_ADVISE_SYSCALLS
static unsigned long damos_madvise(struct damon_target *target,
struct damon_region *r, int behavior)
{
return 0;
}
#else
static unsigned long damos_madvise(struct damon_target *target,
struct damon_region *r, int behavior)
{
struct mm_struct *mm;
unsigned long start = PAGE_ALIGN(r->ar.start);
unsigned long len = PAGE_ALIGN(r->ar.end - r->ar.start);
unsigned long applied;
mm = damon_get_mm(target);
if (!mm)
return 0;
applied = do_madvise(mm, start, len, behavior) ? 0 : len;
mmput(mm);
return applied;
}
#endif /* CONFIG_ADVISE_SYSCALLS */
static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
struct damon_target *t, struct damon_region *r,
struct damos *scheme)
{
int madv_action;
switch (scheme->action) {
case DAMOS_WILLNEED:
madv_action = MADV_WILLNEED;
break;
case DAMOS_COLD:
madv_action = MADV_COLD;
break;
case DAMOS_PAGEOUT:
madv_action = MADV_PAGEOUT;
break;
case DAMOS_HUGEPAGE:
madv_action = MADV_HUGEPAGE;
break;
case DAMOS_NOHUGEPAGE:
madv_action = MADV_NOHUGEPAGE;
break;
case DAMOS_STAT:
return 0;
default:
return 0;
}
return damos_madvise(t, r, madv_action);
}
static int damon_va_scheme_score(struct damon_ctx *context,
struct damon_target *t, struct damon_region *r,
struct damos *scheme)
{
switch (scheme->action) {
case DAMOS_PAGEOUT:
return damon_pageout_score(context, r, scheme);
default:
break;
}
return DAMOS_MAX_SCORE;
}
void damon_va_set_primitives(struct damon_ctx *ctx)
{
ctx->primitive.init = damon_va_init;
@ -667,6 +754,8 @@ void damon_va_set_primitives(struct damon_ctx *ctx)
ctx->primitive.reset_aggregated = NULL;
ctx->primitive.target_valid = damon_va_target_valid;
ctx->primitive.cleanup = NULL;
ctx->primitive.apply_scheme = damon_va_apply_scheme;
ctx->primitive.get_scheme_score = damon_va_scheme_score;
}
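As a usage sketch (hedged: this assumes the DAMON core API of this era, i.e. damon_new_ctx(), damon_new_target() and damon_add_target(), which are not shown in this diff), a caller wires the virtual-address primitives into a monitoring context roughly like this:

	/* Sketch: monitor one process's virtual address space. */
	struct damon_ctx *ctx = damon_new_ctx();
	struct damon_target *t;

	damon_va_set_primitives(ctx);
	/* target id must be a refcounted 'struct pid *' held by the caller. */
	t = damon_new_target((unsigned long)target_pid);
	damon_add_target(ctx, t);
	/* ... set attributes and start monitoring as usual ... */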
#include "vaddr-test.h"

View File

@ -16,17 +16,19 @@
#include <linux/ctype.h>
#include "internal.h"
#include <trace/events/migrate.h>
/*
* Define EM() and EMe() so that MIGRATE_REASON from trace/events/migrate.h can
* be used to populate migrate_reason_names[].
*/
#undef EM
#undef EMe
#define EM(a, b) b,
#define EMe(a, b) b
const char *migrate_reason_names[MR_TYPES] = {
"compaction",
"memory_failure",
"memory_hotplug",
"syscall_or_cpuset",
"mempolicy_mbind",
"numa_misplaced",
"contig_range",
"longterm_pin",
"demotion",
MIGRATE_REASON
};
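The EM()/EMe() redefinition above is the usual X-macro trick: trace/events/migrate.h defines MIGRATE_REASON as a list of EM(enum, string) entries, and redefining EM to keep only the string (with a trailing comma) and EMe to keep the final string turns that list into an array initializer. Illustratively, with a made-up two-entry list:

	/* Illustrative subset only; the real list is MIGRATE_REASON. */
	#define EXAMPLE_REASONS			\
		EM(MR_COMPACTION, "compaction")	\
		EMe(MR_DEMOTION, "demotion")

	#undef EM
	#undef EMe
	#define EM(a, b)	b,
	#define EMe(a, b)	b

	static const char *example_reason_names[] = { EXAMPLE_REASONS };
	/* expands to: { "compaction", "demotion" } */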
const struct trace_print_flags pageflag_names[] = {
@ -110,59 +112,11 @@ static void __dump_page(struct page *page)
type = "ksm ";
else if (PageAnon(page))
type = "anon ";
else if (mapping) {
struct inode *host;
const struct address_space_operations *a_ops;
struct hlist_node *dentry_first;
struct dentry *dentry_ptr;
struct dentry dentry;
unsigned long ino;
/*
* mapping can be invalid pointer and we don't want to crash
* accessing it, so probe everything depending on it carefully
*/
if (get_kernel_nofault(host, &mapping->host) ||
get_kernel_nofault(a_ops, &mapping->a_ops)) {
pr_warn("failed to read mapping contents, not a valid kernel address?\n");
goto out_mapping;
}
if (!host) {
pr_warn("aops:%ps\n", a_ops);
goto out_mapping;
}
if (get_kernel_nofault(dentry_first, &host->i_dentry.first) ||
get_kernel_nofault(ino, &host->i_ino)) {
pr_warn("aops:%ps with invalid host inode %px\n",
a_ops, host);
goto out_mapping;
}
if (!dentry_first) {
pr_warn("aops:%ps ino:%lx\n", a_ops, ino);
goto out_mapping;
}
dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias);
if (get_kernel_nofault(dentry, dentry_ptr)) {
pr_warn("aops:%ps ino:%lx with invalid dentry %px\n",
a_ops, ino, dentry_ptr);
} else {
/*
* if dentry is corrupted, the %pd handler may still
* crash, but it's unlikely that we reach here with a
* corrupted struct page
*/
pr_warn("aops:%ps ino:%lx dentry name:\"%pd\"\n",
a_ops, ino, &dentry);
}
}
out_mapping:
else if (mapping)
dump_mapping(mapping);
BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1);
pr_warn("%sflags: %#lx(%pGp)%s\n", type, head->flags, &head->flags,
pr_warn("%sflags: %pGp%s\n", type, &head->flags,
page_cma ? " CMA" : "");
print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32,
sizeof(unsigned long), page,
@ -216,7 +170,7 @@ void dump_mm(const struct mm_struct *mm)
"start_code %lx end_code %lx start_data %lx end_data %lx\n"
"start_brk %lx brk %lx start_stack %lx\n"
"arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
"binfmt %px flags %lx core_state %px\n"
"binfmt %px flags %lx\n"
#ifdef CONFIG_AIO
"ioctx_table %px\n"
#endif
@ -248,7 +202,7 @@ void dump_mm(const struct mm_struct *mm)
mm->start_code, mm->end_code, mm->start_data, mm->end_data,
mm->start_brk, mm->brk, mm->start_stack,
mm->arg_start, mm->arg_end, mm->env_start, mm->env_end,
mm->binfmt, mm->flags, mm->core_state,
mm->binfmt, mm->flags,
#ifdef CONFIG_AIO
mm->ioctx_table,
#endif

View File

@ -654,7 +654,7 @@ static void __init pte_clear_tests(struct pgtable_debug_args *args)
set_pte_at(args->mm, args->vaddr, args->ptep, pte);
flush_dcache_page(page);
barrier();
pte_clear(args->mm, args->vaddr, args->ptep);
ptep_clear(args->mm, args->vaddr, args->ptep);
pte = ptep_get(args->ptep);
WARN_ON(!pte_none(pte));
}
@ -890,8 +890,8 @@ static void __init swap_migration_tests(struct pgtable_debug_args *args)
pr_debug("Validating swap migration\n");
/*
* make_migration_entry() expects given page to be
* locked, otherwise it stumbles upon a BUG_ON().
* make_[readable|writable]_migration_entry() expects given page to
* be locked, otherwise it stumbles upon a BUG_ON().
*/
__SetPageLocked(page);
swp = make_writable_migration_entry(page_to_pfn(page));
@ -1106,13 +1106,14 @@ static int __init init_args(struct pgtable_debug_args *args)
/*
* Initialize the debugging data.
*
* __P000 (or even __S000) will help create page table entries with
* PROT_NONE permission as required for pxx_protnone_tests().
* protection_map[0] (or even protection_map[8]) will help create
* page table entries with PROT_NONE permission as required for
* pxx_protnone_tests().
*/
memset(args, 0, sizeof(*args));
args->vaddr = get_random_vaddr();
args->page_prot = vm_get_page_prot(VMFLAGS);
args->page_prot_none = __P000;
args->page_prot_none = protection_map[0];
args->is_contiguous_page = false;
args->pud_pfn = ULONG_MAX;
args->pmd_pfn = ULONG_MAX;

View File

@ -152,7 +152,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
else if ((boundary < size) || (boundary & (boundary - 1)))
return NULL;
retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev));
retval = kmalloc(sizeof(*retval), GFP_KERNEL);
if (!retval)
return retval;

File diff suppressed because it is too large

View File

@ -27,27 +27,7 @@ DEFINE_STATIC_KEY_FALSE(frontswap_enabled_key);
* may be registered, but implementations can never deregister. This
* is a simple singly-linked list of all registered implementations.
*/
static struct frontswap_ops *frontswap_ops __read_mostly;
#define for_each_frontswap_ops(ops) \
for ((ops) = frontswap_ops; (ops); (ops) = (ops)->next)
/*
* If enabled, frontswap_store will return failure even on success. As
* a result, the swap subsystem will always write the page to swap, in
* effect converting frontswap into a writethrough cache. In this mode,
* there is no direct reduction in swap writes, but a frontswap backend
* can unilaterally "reclaim" any pages in use with no data loss, thus
* providing increased control over maximum memory usage due to frontswap.
*/
static bool frontswap_writethrough_enabled __read_mostly;
/*
* If enabled, the underlying tmem implementation is capable of doing
* exclusive gets, so frontswap_load, on a successful tmem_get must
* mark the page as no longer in frontswap AND mark it dirty.
*/
static bool frontswap_tmem_exclusive_gets_enabled __read_mostly;
static const struct frontswap_ops *frontswap_ops __read_mostly;
#ifdef CONFIG_DEBUG_FS
/*
@ -114,87 +94,22 @@ static inline void inc_frontswap_invalidates(void) { }
/*
* Register operations for frontswap
*/
void frontswap_register_ops(struct frontswap_ops *ops)
int frontswap_register_ops(const struct frontswap_ops *ops)
{
DECLARE_BITMAP(a, MAX_SWAPFILES);
DECLARE_BITMAP(b, MAX_SWAPFILES);
struct swap_info_struct *si;
unsigned int i;
bitmap_zero(a, MAX_SWAPFILES);
bitmap_zero(b, MAX_SWAPFILES);
spin_lock(&swap_lock);
plist_for_each_entry(si, &swap_active_head, list) {
if (!WARN_ON(!si->frontswap_map))
set_bit(si->type, a);
}
spin_unlock(&swap_lock);
/* the new ops needs to know the currently active swap devices */
for_each_set_bit(i, a, MAX_SWAPFILES)
ops->init(i);
/*
* Setting frontswap_ops must happen after the ops->init() calls
* above; cmpxchg implies smp_mb() which will ensure the init is
* complete at this point.
*/
do {
ops->next = frontswap_ops;
} while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next);
if (frontswap_ops)
return -EINVAL;
frontswap_ops = ops;
static_branch_inc(&frontswap_enabled_key);
spin_lock(&swap_lock);
plist_for_each_entry(si, &swap_active_head, list) {
if (si->frontswap_map)
set_bit(si->type, b);
}
spin_unlock(&swap_lock);
/*
* On the very unlikely chance that a swap device was added or
* removed between setting the "a" list bits and the ops init
* calls, we re-check and do init or invalidate for any changed
* bits.
*/
if (unlikely(!bitmap_equal(a, b, MAX_SWAPFILES))) {
for (i = 0; i < MAX_SWAPFILES; i++) {
if (!test_bit(i, a) && test_bit(i, b))
ops->init(i);
else if (test_bit(i, a) && !test_bit(i, b))
ops->invalidate_area(i);
}
}
return 0;
}
EXPORT_SYMBOL(frontswap_register_ops);
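With the backend list reduced to a single frontswap_ops pointer, registration becomes first-come-first-served. A hypothetical backend (the callback names below are invented; the field names follow the frontswap_ops callbacks this file already uses) would now register roughly like this:

	static const struct frontswap_ops my_backend_ops = {
		.init		 = my_backend_init,		/* a swap device was swapon'ed */
		.store		 = my_backend_store,		/* returns 0 on success */
		.load		 = my_backend_load,
		.invalidate_page = my_backend_invalidate_page,
		.invalidate_area = my_backend_invalidate_area,
	};

	static int __init my_backend_setup(void)
	{
		/* Fails with -EINVAL if another backend registered first. */
		return frontswap_register_ops(&my_backend_ops);
	}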
/*
* Enable/disable frontswap writethrough (see above).
*/
void frontswap_writethrough(bool enable)
{
frontswap_writethrough_enabled = enable;
}
EXPORT_SYMBOL(frontswap_writethrough);
/*
* Enable/disable frontswap exclusive gets (see above).
*/
void frontswap_tmem_exclusive_gets(bool enable)
{
frontswap_tmem_exclusive_gets_enabled = enable;
}
EXPORT_SYMBOL(frontswap_tmem_exclusive_gets);
/*
* Called when a swap device is swapon'd.
*/
void __frontswap_init(unsigned type, unsigned long *map)
void frontswap_init(unsigned type, unsigned long *map)
{
struct swap_info_struct *sis = swap_info[type];
struct frontswap_ops *ops;
VM_BUG_ON(sis == NULL);
@ -210,20 +125,16 @@ void __frontswap_init(unsigned type, unsigned long *map)
* p->frontswap set to something valid to work properly.
*/
frontswap_map_set(sis, map);
for_each_frontswap_ops(ops)
ops->init(type);
frontswap_ops->init(type);
}
EXPORT_SYMBOL(__frontswap_init);
bool __frontswap_test(struct swap_info_struct *sis,
static bool __frontswap_test(struct swap_info_struct *sis,
pgoff_t offset)
{
if (sis->frontswap_map)
return test_bit(offset, sis->frontswap_map);
return false;
}
EXPORT_SYMBOL(__frontswap_test);
static inline void __frontswap_set(struct swap_info_struct *sis,
pgoff_t offset)
@ -253,7 +164,6 @@ int __frontswap_store(struct page *page)
int type = swp_type(entry);
struct swap_info_struct *sis = swap_info[type];
pgoff_t offset = swp_offset(entry);
struct frontswap_ops *ops;
VM_BUG_ON(!frontswap_ops);
VM_BUG_ON(!PageLocked(page));
@ -267,28 +177,19 @@ int __frontswap_store(struct page *page)
*/
if (__frontswap_test(sis, offset)) {
__frontswap_clear(sis, offset);
for_each_frontswap_ops(ops)
ops->invalidate_page(type, offset);
frontswap_ops->invalidate_page(type, offset);
}
/* Try to store in each implementation, until one succeeds. */
for_each_frontswap_ops(ops) {
ret = ops->store(type, offset, page);
if (!ret) /* successful store */
break;
}
ret = frontswap_ops->store(type, offset, page);
if (ret == 0) {
__frontswap_set(sis, offset);
inc_frontswap_succ_stores();
} else {
inc_frontswap_failed_stores();
}
if (frontswap_writethrough_enabled)
/* report failure so swap also writes to swap device */
ret = -1;
return ret;
}
EXPORT_SYMBOL(__frontswap_store);
/*
* "Get" data from frontswap associated with swaptype and offset that were
@ -302,7 +203,6 @@ int __frontswap_load(struct page *page)
int type = swp_type(entry);
struct swap_info_struct *sis = swap_info[type];
pgoff_t offset = swp_offset(entry);
struct frontswap_ops *ops;
VM_BUG_ON(!frontswap_ops);
VM_BUG_ON(!PageLocked(page));
@ -312,21 +212,11 @@ int __frontswap_load(struct page *page)
return -1;
/* Try loading from each implementation, until one succeeds. */
for_each_frontswap_ops(ops) {
ret = ops->load(type, offset, page);
if (!ret) /* successful load */
break;
}
if (ret == 0) {
ret = frontswap_ops->load(type, offset, page);
if (ret == 0)
inc_frontswap_loads();
if (frontswap_tmem_exclusive_gets_enabled) {
SetPageDirty(page);
__frontswap_clear(sis, offset);
}
}
return ret;
}
EXPORT_SYMBOL(__frontswap_load);
/*
* Invalidate any data from frontswap associated with the specified swaptype
@ -335,7 +225,6 @@ EXPORT_SYMBOL(__frontswap_load);
void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
{
struct swap_info_struct *sis = swap_info[type];
struct frontswap_ops *ops;
VM_BUG_ON(!frontswap_ops);
VM_BUG_ON(sis == NULL);
@ -343,12 +232,10 @@ void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
if (!__frontswap_test(sis, offset))
return;
for_each_frontswap_ops(ops)
ops->invalidate_page(type, offset);
frontswap_ops->invalidate_page(type, offset);
__frontswap_clear(sis, offset);
inc_frontswap_invalidates();
}
EXPORT_SYMBOL(__frontswap_invalidate_page);
/*
* Invalidate all data from frontswap associated with all offsets for the
@ -357,7 +244,6 @@ EXPORT_SYMBOL(__frontswap_invalidate_page);
void __frontswap_invalidate_area(unsigned type)
{
struct swap_info_struct *sis = swap_info[type];
struct frontswap_ops *ops;
VM_BUG_ON(!frontswap_ops);
VM_BUG_ON(sis == NULL);
@ -365,123 +251,10 @@ void __frontswap_invalidate_area(unsigned type)
if (sis->frontswap_map == NULL)
return;
for_each_frontswap_ops(ops)
ops->invalidate_area(type);
frontswap_ops->invalidate_area(type);
atomic_set(&sis->frontswap_pages, 0);
bitmap_zero(sis->frontswap_map, sis->max);
}
EXPORT_SYMBOL(__frontswap_invalidate_area);
static unsigned long __frontswap_curr_pages(void)
{
unsigned long totalpages = 0;
struct swap_info_struct *si = NULL;
assert_spin_locked(&swap_lock);
plist_for_each_entry(si, &swap_active_head, list)
totalpages += atomic_read(&si->frontswap_pages);
return totalpages;
}
static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
int *swapid)
{
int ret = -EINVAL;
struct swap_info_struct *si = NULL;
int si_frontswap_pages;
unsigned long total_pages_to_unuse = total;
unsigned long pages = 0, pages_to_unuse = 0;
assert_spin_locked(&swap_lock);
plist_for_each_entry(si, &swap_active_head, list) {
si_frontswap_pages = atomic_read(&si->frontswap_pages);
if (total_pages_to_unuse < si_frontswap_pages) {
pages = pages_to_unuse = total_pages_to_unuse;
} else {
pages = si_frontswap_pages;
pages_to_unuse = 0; /* unuse all */
}
/* ensure there is enough RAM to fetch pages from frontswap */
if (security_vm_enough_memory_mm(current->mm, pages)) {
ret = -ENOMEM;
continue;
}
vm_unacct_memory(pages);
*unused = pages_to_unuse;
*swapid = si->type;
ret = 0;
break;
}
return ret;
}
/*
* Used to check if it's necessary and feasible to unuse pages.
* Return 1 when nothing to do, 0 when need to shrink pages,
* error code when there is an error.
*/
static int __frontswap_shrink(unsigned long target_pages,
unsigned long *pages_to_unuse,
int *type)
{
unsigned long total_pages = 0, total_pages_to_unuse;
assert_spin_locked(&swap_lock);
total_pages = __frontswap_curr_pages();
if (total_pages <= target_pages) {
/* Nothing to do */
*pages_to_unuse = 0;
return 1;
}
total_pages_to_unuse = total_pages - target_pages;
return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type);
}
/*
* Frontswap, like a true swap device, may unnecessarily retain pages
* under certain circumstances; "shrink" frontswap is essentially a
* "partial swapoff" and works by calling try_to_unuse to attempt to
* unuse enough frontswap pages to attempt to -- subject to memory
* constraints -- reduce the number of pages in frontswap to the
* number given in the parameter target_pages.
*/
void frontswap_shrink(unsigned long target_pages)
{
unsigned long pages_to_unuse = 0;
int type, ret;
/*
* we don't want to hold swap_lock while doing a very
* lengthy try_to_unuse, but swap_list may change
* so restart scan from swap_active_head each time
*/
spin_lock(&swap_lock);
ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
spin_unlock(&swap_lock);
if (ret == 0)
try_to_unuse(type, true, pages_to_unuse);
return;
}
EXPORT_SYMBOL(frontswap_shrink);
/*
* Count and return the number of frontswap pages across all
* swap devices. This is exported so that backend drivers can
* determine current usage without reading debugfs.
*/
unsigned long frontswap_curr_pages(void)
{
unsigned long totalpages = 0;
spin_lock(&swap_lock);
totalpages = __frontswap_curr_pages();
spin_unlock(&swap_lock);
return totalpages;
}
EXPORT_SYMBOL(frontswap_curr_pages);
static int __init init_frontswap(void)
{

140 mm/gup.c
View File

@ -667,12 +667,17 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
}
retry:
if (!pmd_present(pmdval)) {
/*
* Should never reach here, if thp migration is not supported;
* Otherwise, it must be a thp migration entry.
*/
VM_BUG_ON(!thp_migration_supported() ||
!is_pmd_migration_entry(pmdval));
if (likely(!(flags & FOLL_MIGRATION)))
return no_page_table(vma, flags);
VM_BUG_ON(thp_migration_supported() &&
!is_pmd_migration_entry(pmdval));
if (is_pmd_migration_entry(pmdval))
pmd_migration_entry_wait(mm, pmd);
pmd_migration_entry_wait(mm, pmd);
pmdval = READ_ONCE(*pmd);
/*
* MADV_DONTNEED may convert the pmd to null because
@ -943,6 +948,8 @@ static int faultin_page(struct vm_area_struct *vma,
/* mlock all present pages, but do not fault in new pages */
if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
return -ENOENT;
if (*flags & FOLL_NOFAULT)
return -EFAULT;
if (*flags & FOLL_WRITE)
fault_flags |= FAULT_FLAG_WRITE;
if (*flags & FOLL_REMOTE)
@ -1681,6 +1688,124 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
}
#endif /* !CONFIG_MMU */
/**
* fault_in_writeable - fault in userspace address range for writing
* @uaddr: start of address range
* @size: size of address range
*
* Returns the number of bytes not faulted in (like copy_to_user() and
* copy_from_user()).
*/
size_t fault_in_writeable(char __user *uaddr, size_t size)
{
char __user *start = uaddr, *end;
if (unlikely(size == 0))
return 0;
if (!user_write_access_begin(uaddr, size))
return size;
if (!PAGE_ALIGNED(uaddr)) {
unsafe_put_user(0, uaddr, out);
uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr);
}
end = (char __user *)PAGE_ALIGN((unsigned long)start + size);
if (unlikely(end < start))
end = NULL;
while (uaddr != end) {
unsafe_put_user(0, uaddr, out);
uaddr += PAGE_SIZE;
}
out:
user_write_access_end();
if (size > uaddr - start)
return size - (uaddr - start);
return 0;
}
EXPORT_SYMBOL(fault_in_writeable);
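A typical use of fault_in_writeable() is to pre-fault a user buffer, or to make forward progress after a partial copy failure. A minimal, hypothetical caller sketch (the helper name is invented and not part of this patch):

	/* Copy to user space, faulting the destination in once if needed. */
	static ssize_t copy_out(char __user *ubuf, const char *kbuf, size_t len)
	{
		size_t left = copy_to_user(ubuf, kbuf, len);

		if (left) {
			size_t done = len - left;

			/* Nonzero return: part of the range cannot be faulted in. */
			if (fault_in_writeable(ubuf + done, left))
				return -EFAULT;
			left = copy_to_user(ubuf + done, kbuf + done, left);
			if (left)
				return -EFAULT;
		}
		return len;
	}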
/*
* fault_in_safe_writeable - fault in an address range for writing
* @uaddr: start of address range
* @size: length of address range
*
* Faults in an address range for writing. This is primarily useful when we
* already know that some or all of the pages in the address range aren't in
* memory.
*
* Unlike fault_in_writeable(), this function is non-destructive.
*
* Note that we don't pin or otherwise hold the pages referenced that we fault
* in. There's no guarantee that they'll stay in memory for any duration of
* time.
*
* Returns the number of bytes not faulted in, like copy_to_user() and
* copy_from_user().
*/
size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
{
unsigned long start = (unsigned long)uaddr, end;
struct mm_struct *mm = current->mm;
bool unlocked = false;
if (unlikely(size == 0))
return 0;
end = PAGE_ALIGN(start + size);
if (end < start)
end = 0;
mmap_read_lock(mm);
do {
if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked))
break;
start = (start + PAGE_SIZE) & PAGE_MASK;
} while (start != end);
mmap_read_unlock(mm);
if (size > (unsigned long)uaddr - start)
return size - ((unsigned long)uaddr - start);
return 0;
}
EXPORT_SYMBOL(fault_in_safe_writeable);
/**
* fault_in_readable - fault in userspace address range for reading
* @uaddr: start of user address range
* @size: size of user address range
*
* Returns the number of bytes not faulted in (like copy_to_user() and
* copy_from_user()).
*/
size_t fault_in_readable(const char __user *uaddr, size_t size)
{
const char __user *start = uaddr, *end;
volatile char c;
if (unlikely(size == 0))
return 0;
if (!user_read_access_begin(uaddr, size))
return size;
if (!PAGE_ALIGNED(uaddr)) {
unsafe_get_user(c, uaddr, out);
uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr);
}
end = (const char __user *)PAGE_ALIGN((unsigned long)start + size);
if (unlikely(end < start))
end = NULL;
while (uaddr != end) {
unsafe_get_user(c, uaddr, out);
uaddr += PAGE_SIZE;
}
out:
user_read_access_end();
(void)c;
if (size > uaddr - start)
return size - (uaddr - start);
return 0;
}
EXPORT_SYMBOL(fault_in_readable);
/**
* get_dump_page() - pin user page in memory while writing it to core dump
* @addr: user address
@ -2253,7 +2378,6 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
{
int nr_start = *nr;
struct dev_pagemap *pgmap = NULL;
int ret = 1;
do {
struct page *page = pfn_to_page(pfn);
@ -2261,14 +2385,12 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
pgmap = get_dev_pagemap(pfn, pgmap);
if (unlikely(!pgmap)) {
undo_dev_pagemap(nr, nr_start, flags, pages);
ret = 0;
break;
}
SetPageReferenced(page);
pages[*nr] = page;
if (unlikely(!try_grab_page(page, flags))) {
undo_dev_pagemap(nr, nr_start, flags, pages);
ret = 0;
break;
}
(*nr)++;
@ -2276,7 +2398,7 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
} while (addr += PAGE_SIZE, addr != end);
put_dev_pagemap(pgmap);
return ret;
return addr == end;
}
static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
@ -2733,7 +2855,7 @@ static int internal_get_user_pages_fast(unsigned long start,
if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
FOLL_FORCE | FOLL_PIN | FOLL_GET |
FOLL_FAST_ONLY)))
FOLL_FAST_ONLY | FOLL_NOFAULT)))
return -EINVAL;
if (gup_flags & FOLL_PIN)

View File

@ -23,7 +23,6 @@
#include <linux/bio.h>
#include <linux/pagemap.h>
#include <linux/mempool.h>
#include <linux/blkdev.h>
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/highmem.h>
@ -360,7 +359,6 @@ void kunmap_high(struct page *page)
}
EXPORT_SYMBOL(kunmap_high);
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
void zero_user_segments(struct page *page, unsigned start1, unsigned end1,
unsigned start2, unsigned end2)
{
@ -383,7 +381,7 @@ void zero_user_segments(struct page *page, unsigned start1, unsigned end1,
unsigned this_end = min_t(unsigned, end1, PAGE_SIZE);
if (end1 > start1) {
kaddr = kmap_atomic(page + i);
kaddr = kmap_local_page(page + i);
memset(kaddr + start1, 0, this_end - start1);
}
end1 -= this_end;
@ -398,7 +396,7 @@ void zero_user_segments(struct page *page, unsigned start1, unsigned end1,
if (end2 > start2) {
if (!kaddr)
kaddr = kmap_atomic(page + i);
kaddr = kmap_local_page(page + i);
memset(kaddr + start2, 0, this_end - start2);
}
end2 -= this_end;
@ -406,7 +404,7 @@ void zero_user_segments(struct page *page, unsigned start1, unsigned end1,
}
if (kaddr) {
kunmap_atomic(kaddr);
kunmap_local(kaddr);
flush_dcache_page(page + i);
}
@ -417,7 +415,6 @@ void zero_user_segments(struct page *page, unsigned start1, unsigned end1,
BUG_ON((start1 | start2 | end1 | end2) != 0);
}
EXPORT_SYMBOL(zero_user_segments);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif /* CONFIG_HIGHMEM */
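The kmap_atomic()/kunmap_atomic() pairs above become kmap_local_page()/kunmap_local(). As a reminder of the pattern this file now follows (a minimal sketch, not taken from this patch):

	static void zero_one_page(struct page *page)
	{
		void *kaddr = kmap_local_page(page);	/* mapping is CPU-local and must nest */

		memset(kaddr, 0, PAGE_SIZE);
		kunmap_local(kaddr);
		flush_dcache_page(page);
	}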
#ifdef CONFIG_KMAP_LOCAL

View File

@ -603,7 +603,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
VM_BUG_ON_PAGE(!PageCompound(page), page);
if (mem_cgroup_charge(page, vma->vm_mm, gfp)) {
if (mem_cgroup_charge(page_folio(page), vma->vm_mm, gfp)) {
put_page(page);
count_vm_event(THP_FAULT_FALLBACK);
count_vm_event(THP_FAULT_FALLBACK_CHARGE);
@ -1322,7 +1322,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
* We can only reuse the page if nobody else maps the huge page or it's
* part.
*/
if (reuse_swap_page(page, NULL)) {
if (reuse_swap_page(page)) {
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@ -2405,7 +2405,8 @@ static void __split_huge_page_tail(struct page *head, int tail,
static void __split_huge_page(struct page *page, struct list_head *list,
pgoff_t end)
{
struct page *head = compound_head(page);
struct folio *folio = page_folio(page);
struct page *head = &folio->page;
struct lruvec *lruvec;
struct address_space *swap_cache = NULL;
unsigned long offset = 0;
@ -2424,7 +2425,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
}
/* lock lru list/PageCompound, ref frozen by page_ref_freeze */
lruvec = lock_page_lruvec(head);
lruvec = folio_lruvec_lock(folio);
ClearPageHasHWPoisoned(head);
@ -2541,38 +2542,28 @@ int total_mapcount(struct page *page)
* need full accuracy to avoid breaking page pinning, because
* page_trans_huge_mapcount() is slower than page_mapcount().
*/
int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
int page_trans_huge_mapcount(struct page *page)
{
int i, ret, _total_mapcount, mapcount;
int i, ret;
/* hugetlbfs shouldn't call it */
VM_BUG_ON_PAGE(PageHuge(page), page);
if (likely(!PageTransCompound(page))) {
mapcount = atomic_read(&page->_mapcount) + 1;
if (total_mapcount)
*total_mapcount = mapcount;
return mapcount;
}
if (likely(!PageTransCompound(page)))
return atomic_read(&page->_mapcount) + 1;
page = compound_head(page);
_total_mapcount = ret = 0;
ret = 0;
for (i = 0; i < thp_nr_pages(page); i++) {
mapcount = atomic_read(&page[i]._mapcount) + 1;
int mapcount = atomic_read(&page[i]._mapcount) + 1;
ret = max(ret, mapcount);
_total_mapcount += mapcount;
}
if (PageDoubleMap(page)) {
if (PageDoubleMap(page))
ret -= 1;
_total_mapcount -= thp_nr_pages(page);
}
mapcount = compound_mapcount(page);
ret += mapcount;
_total_mapcount += mapcount;
if (total_mapcount)
*total_mapcount = _total_mapcount;
return ret;
return ret + compound_mapcount(page);
}
/* Racy check whether the huge page can be split */
@ -2613,6 +2604,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
{
struct page *head = compound_head(page);
struct deferred_split *ds_queue = get_deferred_split_queue(head);
XA_STATE(xas, &head->mapping->i_pages, head->index);
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
int extra_pins, ret;
@ -2651,6 +2643,13 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
goto out;
}
xas_split_alloc(&xas, head, compound_order(head),
mapping_gfp_mask(mapping) & GFP_RECLAIM_MASK);
if (xas_error(&xas)) {
ret = xas_error(&xas);
goto out;
}
anon_vma = NULL;
i_mmap_lock_read(mapping);
@ -2680,13 +2679,12 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
/* block interrupt reentry in xa_lock and spinlock */
local_irq_disable();
if (mapping) {
XA_STATE(xas, &mapping->i_pages, page_index(head));
/*
* Check if the head page is present in page cache.
* We assume all tail are present too, if head is there.
*/
xa_lock(&mapping->i_pages);
xas_lock(&xas);
xas_reset(&xas);
if (xas_load(&xas) != head)
goto fail;
}
@ -2702,6 +2700,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
if (mapping) {
int nr = thp_nr_pages(head);
xas_split(&xas, head, thp_order(head));
if (PageSwapBacked(head)) {
__mod_lruvec_page_state(head, NR_SHMEM_THPS,
-nr);
@ -2718,7 +2717,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
spin_unlock(&ds_queue->split_queue_lock);
fail:
if (mapping)
xa_unlock(&mapping->i_pages);
xas_unlock(&xas);
local_irq_enable();
remap_page(head, thp_nr_pages(head));
ret = -EBUSY;
@ -2732,6 +2731,8 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
if (mapping)
i_mmap_unlock_read(mapping);
out:
/* Free any memory we didn't use */
xas_nomem(&xas, 0);
count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
return ret;
}

File diff suppressed because it is too large

View File

@ -27,9 +27,6 @@
#define MEMFILE_IDX(val) (((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val) ((val) & 0xffff)
#define hugetlb_cgroup_from_counter(counter, idx) \
container_of(counter, struct hugetlb_cgroup, hugepage[idx])
static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
static inline struct page_counter *
@ -126,29 +123,58 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
}
}
static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup)
{
int node;
for_each_node(node)
kfree(h_cgroup->nodeinfo[node]);
kfree(h_cgroup);
}
static struct cgroup_subsys_state *
hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css);
struct hugetlb_cgroup *h_cgroup;
int node;
h_cgroup = kzalloc(struct_size(h_cgroup, nodeinfo, nr_node_ids),
GFP_KERNEL);
h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL);
if (!h_cgroup)
return ERR_PTR(-ENOMEM);
if (!parent_h_cgroup)
root_h_cgroup = h_cgroup;
/*
* TODO: this routine can waste much memory for nodes which will
* never be onlined. It's better to use memory hotplug callback
* function.
*/
for_each_node(node) {
/* Set node_to_alloc to -1 for offline nodes. */
int node_to_alloc =
node_state(node, N_NORMAL_MEMORY) ? node : -1;
h_cgroup->nodeinfo[node] =
kzalloc_node(sizeof(struct hugetlb_cgroup_per_node),
GFP_KERNEL, node_to_alloc);
if (!h_cgroup->nodeinfo[node])
goto fail_alloc_nodeinfo;
}
hugetlb_cgroup_init(h_cgroup, parent_h_cgroup);
return &h_cgroup->css;
fail_alloc_nodeinfo:
hugetlb_cgroup_free(h_cgroup);
return ERR_PTR(-ENOMEM);
}
static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct hugetlb_cgroup *h_cgroup;
h_cgroup = hugetlb_cgroup_from_css(css);
kfree(h_cgroup);
hugetlb_cgroup_free(hugetlb_cgroup_from_css(css));
}
/*
@ -292,7 +318,17 @@ static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
return;
__set_hugetlb_cgroup(page, h_cg, rsvd);
return;
if (!rsvd) {
unsigned long usage =
h_cg->nodeinfo[page_to_nid(page)]->usage[idx];
/*
* This write is not atomic due to fetching usage and writing
* to it, but that's fine because we call this with
* hugetlb_lock held anyway.
*/
WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx],
usage + nr_pages);
}
}
void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
@ -331,8 +367,17 @@ static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
if (rsvd)
css_put(&h_cg->css);
return;
else {
unsigned long usage =
h_cg->nodeinfo[page_to_nid(page)]->usage[idx];
/*
* This write is not atomic due to fetching usage and writing
* to it, but that's fine because we call this with
* hugetlb_lock held anyway.
*/
WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx],
usage - nr_pages);
}
}
void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
@ -421,6 +466,59 @@ enum {
RES_RSVD_FAILCNT,
};
static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy)
{
int nid;
struct cftype *cft = seq_cft(seq);
int idx = MEMFILE_IDX(cft->private);
bool legacy = MEMFILE_ATTR(cft->private);
struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
struct cgroup_subsys_state *css;
unsigned long usage;
if (legacy) {
/* Add up usage across all nodes for the non-hierarchical total. */
usage = 0;
for_each_node_state(nid, N_MEMORY)
usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]);
seq_printf(seq, "total=%lu", usage * PAGE_SIZE);
/* Simply print the per-node usage for the non-hierarchical total. */
for_each_node_state(nid, N_MEMORY)
seq_printf(seq, " N%d=%lu", nid,
READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) *
PAGE_SIZE);
seq_putc(seq, '\n');
}
/*
* The hierarchical total is pretty much the value recorded by the
* counter, so use that.
*/
seq_printf(seq, "%stotal=%lu", legacy ? "hierarchical_" : "",
page_counter_read(&h_cg->hugepage[idx]) * PAGE_SIZE);
/*
* For each node, traverse the css tree to obtain the hierarchical
* node usage.
*/
for_each_node_state(nid, N_MEMORY) {
usage = 0;
rcu_read_lock();
css_for_each_descendant_pre(css, &h_cg->css) {
usage += READ_ONCE(hugetlb_cgroup_from_css(css)
->nodeinfo[nid]
->usage[idx]);
}
rcu_read_unlock();
seq_printf(seq, " N%d=%lu", nid, usage * PAGE_SIZE);
}
seq_putc(seq, '\n');
return 0;
}
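Going by the seq_printf() calls above, reading the new file on a hypothetical two-node system (file name and byte counts are made up for illustration) would produce output shaped like:

	hugetlb.2MB.numa_stat (cgroup v1):
	total=8388608 N0=6291456 N1=2097152
	hierarchical_total=12582912 N0=8388608 N1=4194304

	hugetlb.2MB.numa_stat (cgroup v2):
	total=12582912 N0=8388608 N1=4194304

Values are bytes (per-node usage times PAGE_SIZE); the v2 file reports only the hierarchical figures, without the "hierarchical_" prefix.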
static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
struct cftype *cft)
{
@ -671,8 +769,14 @@ static void __init __hugetlb_cgroup_file_dfl_init(int idx)
events_local_file[idx]);
cft->flags = CFTYPE_NOT_ON_ROOT;
/* NULL terminate the last cft */
/* Add the numa stat file */
cft = &h->cgroup_files_dfl[6];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf);
cft->seq_show = hugetlb_cgroup_read_numa_stat;
cft->flags = CFTYPE_NOT_ON_ROOT;
/* NULL terminate the last cft */
cft = &h->cgroup_files_dfl[7];
memset(cft, 0, sizeof(*cft));
WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys,
@ -742,8 +846,14 @@ static void __init __hugetlb_cgroup_file_legacy_init(int idx)
cft->write = hugetlb_cgroup_reset;
cft->read_u64 = hugetlb_cgroup_read_u64;
/* NULL terminate the last cft */
/* Add the numa stat file */
cft = &h->cgroup_files_legacy[8];
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf);
cft->private = MEMFILE_PRIVATE(idx, 1);
cft->seq_show = hugetlb_cgroup_read_numa_stat;
/* NULL terminate the last cft */
cft = &h->cgroup_files_legacy[9];
memset(cft, 0, sizeof(*cft));
WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys,

View File

@ -12,6 +12,8 @@
#include <linux/pagemap.h>
#include <linux/tracepoint-defs.h>
struct folio_batch;
/*
* The set of flags that only affect watermark checking and reclaim
* behaviour. This is used by the MM to obey the caller constraints
@ -21,7 +23,7 @@
#define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\
__GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\
__GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\
__GFP_ATOMIC)
__GFP_ATOMIC|__GFP_NOLOCKDEP)
/* The GFP flags allowed during early boot */
#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS))
@ -34,16 +36,47 @@
void page_writeback_init(void);
static inline void *folio_raw_mapping(struct folio *folio)
{
unsigned long mapping = (unsigned long)folio->mapping;
return (void *)(mapping & ~PAGE_MAPPING_FLAGS);
}
void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
int nr_throttled);
static inline void acct_reclaim_writeback(struct folio *folio)
{
pg_data_t *pgdat = folio_pgdat(folio);
int nr_throttled = atomic_read(&pgdat->nr_writeback_throttled);
if (nr_throttled)
__acct_reclaim_writeback(pgdat, folio, nr_throttled);
}
static inline void wake_throttle_isolated(pg_data_t *pgdat)
{
wait_queue_head_t *wqh;
wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_ISOLATED];
if (waitqueue_active(wqh))
wake_up(wqh);
}
vm_fault_t do_swap_page(struct vm_fault *vmf);
void folio_rotate_reclaimable(struct folio *folio);
bool __folio_end_writeback(struct folio *folio);
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
{
return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP));
}
struct zap_details;
void unmap_page_range(struct mmu_gather *tlb,
struct vm_area_struct *vma,
unsigned long addr, unsigned long end,
@ -60,20 +93,37 @@ static inline void force_page_cache_readahead(struct address_space *mapping,
}
unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
pgoff_t end, struct pagevec *pvec, pgoff_t *indices);
pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
void filemap_free_folio(struct address_space *mapping, struct folio *folio);
int truncate_inode_folio(struct address_space *mapping, struct folio *folio);
bool truncate_inode_partial_folio(struct folio *folio, loff_t start,
loff_t end);
/**
* page_evictable - test whether a page is evictable
* @page: the page to test
* folio_evictable - Test whether a folio is evictable.
* @folio: The folio to test.
*
* Test whether page is evictable--i.e., should be placed on active/inactive
* lists vs unevictable list.
*
* Reasons page might not be evictable:
* (1) page's mapping marked unevictable
* (2) page is part of an mlocked VMA
* Test whether @folio is evictable -- i.e., should be placed on
* active/inactive lists vs unevictable list.
*
* Reasons folio might not be evictable:
* 1. folio's mapping marked unevictable
* 2. One of the pages in the folio is part of an mlocked VMA
*/
static inline bool folio_evictable(struct folio *folio)
{
bool ret;
/* Prevent address_space of inode and swap cache from being freed */
rcu_read_lock();
ret = !mapping_unevictable(folio_mapping(folio)) &&
!folio_test_mlocked(folio);
rcu_read_unlock();
return ret;
}
static inline bool page_evictable(struct page *page)
{
bool ret;
@ -109,17 +159,13 @@ extern unsigned long highest_memmap_pfn;
*/
extern int isolate_lru_page(struct page *page);
extern void putback_lru_page(struct page *page);
extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason);
/*
* in mm/rmap.c:
*/
extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
/*
* in mm/memcontrol.c:
*/
extern bool cgroup_memory_nokmem;
/*
* in mm/page_alloc.c
*/
@ -346,6 +392,7 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma);
#ifdef CONFIG_MMU
void unmap_mapping_folio(struct folio *folio);
extern long populate_vma_page_range(struct vm_area_struct *vma,
unsigned long start, unsigned long end, int *locked);
extern long faultin_vma_page_range(struct vm_area_struct *vma,
@ -449,8 +496,8 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
}
return fpin;
}
#else /* !CONFIG_MMU */
static inline void unmap_mapping_folio(struct folio *folio) { }
static inline void clear_page_mlock(struct page *page) { }
static inline void mlock_vma_page(struct page *page) { }
static inline void vunmap_range_noflush(unsigned long start, unsigned long end)

View File

@ -30,20 +30,19 @@
#include "kasan.h"
#include "../slab.h"
depot_stack_handle_t kasan_save_stack(gfp_t flags)
depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc)
{
unsigned long entries[KASAN_STACK_DEPTH];
unsigned int nr_entries;
nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
nr_entries = filter_irq_stacks(entries, nr_entries);
return stack_depot_save(entries, nr_entries, flags);
return __stack_depot_save(entries, nr_entries, flags, can_alloc);
}
void kasan_set_track(struct kasan_track *track, gfp_t flags)
{
track->pid = current->pid;
track->stack = kasan_save_stack(flags);
track->stack = kasan_save_stack(flags, true);
}
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
@ -247,8 +246,9 @@ struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache,
}
#endif
void __kasan_poison_slab(struct page *page)
void __kasan_poison_slab(struct slab *slab)
{
struct page *page = slab_page(slab);
unsigned long i;
for (i = 0; i < compound_nr(page); i++)
@ -298,7 +298,7 @@ static inline u8 assign_tag(struct kmem_cache *cache,
/* For caches that either have a constructor or SLAB_TYPESAFE_BY_RCU: */
#ifdef CONFIG_SLAB
/* For SLAB assign tags based on the object index in the freelist. */
return (u8)obj_to_index(cache, virt_to_page(object), (void *)object);
return (u8)obj_to_index(cache, virt_to_slab(object), (void *)object);
#else
/*
* For SLUB assign a random tag during slab creation, otherwise reuse
@ -341,7 +341,7 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object,
if (is_kfence_address(object))
return false;
if (unlikely(nearest_obj(cache, virt_to_head_page(object), object) !=
if (unlikely(nearest_obj(cache, virt_to_slab(object), object) !=
object)) {
kasan_report_invalid_free(tagged_object, ip);
return true;
@ -401,9 +401,9 @@ void __kasan_kfree_large(void *ptr, unsigned long ip)
void __kasan_slab_free_mempool(void *ptr, unsigned long ip)
{
struct page *page;
struct folio *folio;
page = virt_to_head_page(ptr);
folio = virt_to_folio(ptr);
/*
* Even though this function is only called for kmem_cache_alloc and
@ -411,12 +411,14 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip)
* !PageSlab() when the size provided to kmalloc is larger than
* KMALLOC_MAX_SIZE, and kmalloc falls back onto page_alloc.
*/
if (unlikely(!PageSlab(page))) {
if (unlikely(!folio_test_slab(folio))) {
if (____kasan_kfree_large(ptr, ip))
return;
kasan_poison(ptr, page_size(page), KASAN_FREE_PAGE, false);
kasan_poison(ptr, folio_size(folio), KASAN_FREE_PAGE, false);
} else {
____kasan_slab_free(page->slab_cache, ptr, ip, false, false);
struct slab *slab = folio_slab(folio);
____kasan_slab_free(slab->slab_cache, ptr, ip, false, false);
}
}
@ -560,7 +562,7 @@ void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size,
void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flags)
{
struct page *page;
struct slab *slab;
if (unlikely(object == ZERO_SIZE_PTR))
return (void *)object;
@ -572,13 +574,13 @@ void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flag
*/
kasan_unpoison(object, size, false);
page = virt_to_head_page(object);
slab = virt_to_slab(object);
/* Piggy-back on kmalloc() instrumentation to poison the redzone. */
if (unlikely(!PageSlab(page)))
if (unlikely(!slab))
return __kasan_kmalloc_large(object, size, flags);
else
return ____kasan_kmalloc(page->slab_cache, object, size, flags);
return ____kasan_kmalloc(slab->slab_cache, object, size, flags);
}
bool __kasan_check_byte(const void *address, unsigned long ip)

View File

@ -328,24 +328,34 @@ DEFINE_ASAN_SET_SHADOW(f3);
DEFINE_ASAN_SET_SHADOW(f5);
DEFINE_ASAN_SET_SHADOW(f8);
void kasan_record_aux_stack(void *addr)
static void __kasan_record_aux_stack(void *addr, bool can_alloc)
{
struct page *page = kasan_addr_to_page(addr);
struct slab *slab = kasan_addr_to_slab(addr);
struct kmem_cache *cache;
struct kasan_alloc_meta *alloc_meta;
void *object;
if (is_kfence_address(addr) || !(page && PageSlab(page)))
if (is_kfence_address(addr) || !slab)
return;
cache = page->slab_cache;
object = nearest_obj(cache, page, addr);
cache = slab->slab_cache;
object = nearest_obj(cache, slab, addr);
alloc_meta = kasan_get_alloc_meta(cache, object);
if (!alloc_meta)
return;
alloc_meta->aux_stack[1] = alloc_meta->aux_stack[0];
alloc_meta->aux_stack[0] = kasan_save_stack(GFP_NOWAIT);
alloc_meta->aux_stack[0] = kasan_save_stack(GFP_NOWAIT, can_alloc);
}
void kasan_record_aux_stack(void *addr)
{
return __kasan_record_aux_stack(addr, true);
}
void kasan_record_aux_stack_noalloc(void *addr)
{
return __kasan_record_aux_stack(addr, false);
}
void kasan_set_free_info(struct kmem_cache *cache,

View File

@ -29,6 +29,7 @@ enum kasan_arg_mode {
KASAN_ARG_MODE_DEFAULT,
KASAN_ARG_MODE_SYNC,
KASAN_ARG_MODE_ASYNC,
KASAN_ARG_MODE_ASYMM,
};
enum kasan_arg_stacktrace {
@ -45,9 +46,9 @@ static enum kasan_arg_stacktrace kasan_arg_stacktrace __ro_after_init;
DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled);
EXPORT_SYMBOL(kasan_flag_enabled);
/* Whether the asynchronous mode is enabled. */
bool kasan_flag_async __ro_after_init;
EXPORT_SYMBOL_GPL(kasan_flag_async);
/* Whether the selected mode is synchronous/asynchronous/asymmetric.*/
enum kasan_mode kasan_mode __ro_after_init;
EXPORT_SYMBOL_GPL(kasan_mode);
/* Whether to collect alloc/free stack traces. */
DEFINE_STATIC_KEY_FALSE(kasan_flag_stacktrace);
@ -69,7 +70,7 @@ static int __init early_kasan_flag(char *arg)
}
early_param("kasan", early_kasan_flag);
/* kasan.mode=sync/async */
/* kasan.mode=sync/async/asymm */
static int __init early_kasan_mode(char *arg)
{
if (!arg)
@ -79,6 +80,8 @@ static int __init early_kasan_mode(char *arg)
kasan_arg_mode = KASAN_ARG_MODE_SYNC;
else if (!strcmp(arg, "async"))
kasan_arg_mode = KASAN_ARG_MODE_ASYNC;
else if (!strcmp(arg, "asymm"))
kasan_arg_mode = KASAN_ARG_MODE_ASYMM;
else
return -EINVAL;
@ -103,6 +106,16 @@ static int __init early_kasan_flag_stacktrace(char *arg)
}
early_param("kasan.stacktrace", early_kasan_flag_stacktrace);
static inline const char *kasan_mode_info(void)
{
if (kasan_mode == KASAN_MODE_ASYNC)
return "async";
else if (kasan_mode == KASAN_MODE_ASYMM)
return "asymm";
else
return "sync";
}
/* kasan_init_hw_tags_cpu() is called for each CPU. */
void kasan_init_hw_tags_cpu(void)
{
@ -116,11 +129,13 @@ void kasan_init_hw_tags_cpu(void)
return;
/*
* Enable async mode only when explicitly requested through
* the command line.
* Enable async or asymm modes only when explicitly requested
* through the command line.
*/
if (kasan_arg_mode == KASAN_ARG_MODE_ASYNC)
hw_enable_tagging_async();
else if (kasan_arg_mode == KASAN_ARG_MODE_ASYMM)
hw_enable_tagging_asymm();
else
hw_enable_tagging_sync();
}
@ -143,15 +158,19 @@ void __init kasan_init_hw_tags(void)
case KASAN_ARG_MODE_DEFAULT:
/*
* Default to sync mode.
* Do nothing, kasan_flag_async keeps its default value.
*/
break;
fallthrough;
case KASAN_ARG_MODE_SYNC:
/* Do nothing, kasan_flag_async keeps its default value. */
/* Sync mode enabled. */
kasan_mode = KASAN_MODE_SYNC;
break;
case KASAN_ARG_MODE_ASYNC:
/* Async mode enabled. */
kasan_flag_async = true;
kasan_mode = KASAN_MODE_ASYNC;
break;
case KASAN_ARG_MODE_ASYMM:
/* Asymm mode enabled. */
kasan_mode = KASAN_MODE_ASYMM;
break;
}
@ -168,7 +187,9 @@ void __init kasan_init_hw_tags(void)
break;
}
pr_info("KernelAddressSanitizer initialized\n");
pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, stacktrace=%s)\n",
kasan_mode_info(),
kasan_stack_collection_enabled() ? "on" : "off");
}
void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags)

View File

@ -13,16 +13,28 @@
#include "../slab.h"
DECLARE_STATIC_KEY_FALSE(kasan_flag_stacktrace);
extern bool kasan_flag_async __ro_after_init;
enum kasan_mode {
KASAN_MODE_SYNC,
KASAN_MODE_ASYNC,
KASAN_MODE_ASYMM,
};
extern enum kasan_mode kasan_mode __ro_after_init;
static inline bool kasan_stack_collection_enabled(void)
{
return static_branch_unlikely(&kasan_flag_stacktrace);
}
static inline bool kasan_async_mode_enabled(void)
static inline bool kasan_async_fault_possible(void)
{
return kasan_flag_async;
return kasan_mode == KASAN_MODE_ASYNC || kasan_mode == KASAN_MODE_ASYMM;
}
static inline bool kasan_sync_fault_possible(void)
{
return kasan_mode == KASAN_MODE_SYNC || kasan_mode == KASAN_MODE_ASYMM;
}
#else
@ -31,14 +43,17 @@ static inline bool kasan_stack_collection_enabled(void)
return true;
}
static inline bool kasan_async_mode_enabled(void)
static inline bool kasan_async_fault_possible(void)
{
return false;
}
#endif
static inline bool kasan_sync_fault_possible(void)
{
return true;
}
extern bool kasan_flag_async __ro_after_init;
#endif
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
#define KASAN_GRANULE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT)
@ -250,8 +265,9 @@ bool kasan_report(unsigned long addr, size_t size,
void kasan_report_invalid_free(void *object, unsigned long ip);
struct page *kasan_addr_to_page(const void *addr);
struct slab *kasan_addr_to_slab(const void *addr);
depot_stack_handle_t kasan_save_stack(gfp_t flags);
depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc);
void kasan_set_track(struct kasan_track *track, gfp_t flags);
void kasan_set_free_info(struct kmem_cache *cache, void *object, u8 tag);
struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
@ -289,6 +305,9 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
#ifndef arch_enable_tagging_async
#define arch_enable_tagging_async()
#endif
#ifndef arch_enable_tagging_asymm
#define arch_enable_tagging_asymm()
#endif
#ifndef arch_force_async_tag_fault
#define arch_force_async_tag_fault()
#endif
@ -304,6 +323,7 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
#define hw_enable_tagging_sync() arch_enable_tagging_sync()
#define hw_enable_tagging_async() arch_enable_tagging_async()
#define hw_enable_tagging_asymm() arch_enable_tagging_asymm()
#define hw_force_async_tag_fault() arch_force_async_tag_fault()
#define hw_get_random_tag() arch_get_random_tag()
#define hw_get_mem_tag(addr) arch_get_mem_tag(addr)
@ -314,6 +334,7 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
#define hw_enable_tagging_sync()
#define hw_enable_tagging_async()
#define hw_enable_tagging_asymm()
#endif /* CONFIG_KASAN_HW_TAGS */

View File

@ -117,7 +117,7 @@ static unsigned long quarantine_batch_size;
static struct kmem_cache *qlink_to_cache(struct qlist_node *qlink)
{
return virt_to_head_page(qlink)->slab_cache;
return virt_to_slab(qlink)->slab_cache;
}
static void *qlink_to_object(struct qlist_node *qlink, struct kmem_cache *cache)

View File

@ -112,7 +112,7 @@ static void start_report(unsigned long *flags)
static void end_report(unsigned long *flags, unsigned long addr)
{
if (!kasan_async_mode_enabled())
if (!kasan_async_fault_possible())
trace_error_report_end(ERROR_DETECTOR_KASAN, addr);
pr_err("==================================================================\n");
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
@ -132,20 +132,11 @@ static void end_report(unsigned long *flags, unsigned long addr)
kasan_enable_current();
}
static void print_stack(depot_stack_handle_t stack)
{
unsigned long *entries;
unsigned int nr_entries;
nr_entries = stack_depot_fetch(stack, &entries);
stack_trace_print(entries, nr_entries, 0);
}
static void print_track(struct kasan_track *track, const char *prefix)
{
pr_err("%s by task %u:\n", prefix, track->pid);
if (track->stack) {
print_stack(track->stack);
stack_depot_print(track->stack);
} else {
pr_err("(stack is not available)\n");
}
@ -159,6 +150,14 @@ struct page *kasan_addr_to_page(const void *addr)
return NULL;
}
struct slab *kasan_addr_to_slab(const void *addr)
{
if ((addr >= (void *)PAGE_OFFSET) &&
(addr < high_memory))
return virt_to_slab(addr);
return NULL;
}
static void describe_object_addr(struct kmem_cache *cache, void *object,
const void *addr)
{
@ -214,12 +213,12 @@ static void describe_object_stacks(struct kmem_cache *cache, void *object,
return;
if (alloc_meta->aux_stack[0]) {
pr_err("Last potentially related work creation:\n");
print_stack(alloc_meta->aux_stack[0]);
stack_depot_print(alloc_meta->aux_stack[0]);
pr_err("\n");
}
if (alloc_meta->aux_stack[1]) {
pr_err("Second to last potentially related work creation:\n");
print_stack(alloc_meta->aux_stack[1]);
stack_depot_print(alloc_meta->aux_stack[1]);
pr_err("\n");
}
#endif
@ -235,7 +234,7 @@ static void describe_object(struct kmem_cache *cache, void *object,
static inline bool kernel_or_module_addr(const void *addr)
{
if (addr >= (void *)_stext && addr < (void *)_end)
if (is_kernel((unsigned long)addr))
return true;
if (is_module_address((unsigned long)addr))
return true;
@ -257,8 +256,9 @@ static void print_address_description(void *addr, u8 tag)
pr_err("\n");
if (page && PageSlab(page)) {
struct kmem_cache *cache = page->slab_cache;
void *object = nearest_obj(cache, page, addr);
struct slab *slab = page_slab(page);
struct kmem_cache *cache = slab->slab_cache;
void *object = nearest_obj(cache, slab, addr);
describe_object(cache, object, addr, tag);
}

View File

@ -12,7 +12,7 @@ const char *kasan_get_bug_type(struct kasan_access_info *info)
#ifdef CONFIG_KASAN_TAGS_IDENTIFY
struct kasan_alloc_meta *alloc_meta;
struct kmem_cache *cache;
struct page *page;
struct slab *slab;
const void *addr;
void *object;
u8 tag;
@ -20,10 +20,10 @@ const char *kasan_get_bug_type(struct kasan_access_info *info)
tag = get_tag(info->access_addr);
addr = kasan_reset_tag(info->access_addr);
page = kasan_addr_to_page(addr);
if (page && PageSlab(page)) {
cache = page->slab_cache;
object = nearest_obj(cache, page, (void *)addr);
slab = kasan_addr_to_slab(addr);
if (slab) {
cache = slab->slab_cache;
object = nearest_obj(cache, slab, (void *)addr);
alloc_meta = kasan_get_alloc_meta(cache, object);
if (alloc_meta) {

View File

@ -254,6 +254,11 @@ core_initcall(kasan_memhotplug_init);
#ifdef CONFIG_KASAN_VMALLOC
void __init __weak kasan_populate_early_vm_area_shadow(void *start,
unsigned long size)
{
}
static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
void *unused)
{

View File

@ -42,7 +42,7 @@ void __init kasan_init_sw_tags(void)
for_each_possible_cpu(cpu)
per_cpu(prng_state, cpu) = (u32)get_cycles();
pr_info("KernelAddressSanitizer initialized\n");
pr_info("KernelAddressSanitizer initialized (sw-tags)\n");
}
/*

View File

@ -10,12 +10,15 @@
#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/debugfs.h>
#include <linux/hash.h>
#include <linux/irq_work.h>
#include <linux/jhash.h>
#include <linux/kcsan-checks.h>
#include <linux/kfence.h>
#include <linux/kmemleak.h>
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/log2.h>
#include <linux/memblock.h>
#include <linux/moduleparam.h>
#include <linux/random.h>
@ -44,7 +47,8 @@
static bool kfence_enabled __read_mostly;
static unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL;
unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL;
EXPORT_SYMBOL_GPL(kfence_sample_interval); /* Export for test modules. */
#ifdef MODULE_PARAM_PREFIX
#undef MODULE_PARAM_PREFIX
@ -82,6 +86,10 @@ static const struct kernel_param_ops sample_interval_param_ops = {
};
module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600);
/* Pool usage% threshold when currently covered allocations are skipped. */
static unsigned long kfence_skip_covered_thresh __read_mostly = 75;
module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644);
/* The pool of pages used for guard pages and objects. */
char *__kfence_pool __ro_after_init;
EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
@ -106,6 +114,32 @@ DEFINE_STATIC_KEY_FALSE(kfence_allocation_key);
/* Gates the allocation, ensuring only one succeeds in a given period. */
atomic_t kfence_allocation_gate = ATOMIC_INIT(1);
/*
* A Counting Bloom filter of allocation coverage: limits currently covered
* allocations of the same source filling up the pool.
*
* Assuming a range of 15%-85% unique allocations in the pool at any point in
* time, the parameters below provide a probability of 0.02-0.33 for false
* positive hits respectively:
*
* P(alloc_traces) = (1 - e^(-HNUM * (alloc_traces / SIZE)))^HNUM
*/
#define ALLOC_COVERED_HNUM 2
#define ALLOC_COVERED_ORDER (const_ilog2(CONFIG_KFENCE_NUM_OBJECTS) + 2)
#define ALLOC_COVERED_SIZE (1 << ALLOC_COVERED_ORDER)
#define ALLOC_COVERED_HNEXT(h) hash_32(h, ALLOC_COVERED_ORDER)
#define ALLOC_COVERED_MASK (ALLOC_COVERED_SIZE - 1)
static atomic_t alloc_covered[ALLOC_COVERED_SIZE];
/* Stack depth used to determine uniqueness of an allocation. */
#define UNIQUE_ALLOC_STACK_DEPTH ((size_t)8)
/*
* Randomness for stack hashes, making the same collisions across reboots and
* different machines less likely.
*/
static u32 stack_hash_seed __ro_after_init;
/* Statistics counters for debugfs. */
enum kfence_counter_id {
KFENCE_COUNTER_ALLOCATED,
@ -113,6 +147,9 @@ enum kfence_counter_id {
KFENCE_COUNTER_FREES,
KFENCE_COUNTER_ZOMBIES,
KFENCE_COUNTER_BUGS,
KFENCE_COUNTER_SKIP_INCOMPAT,
KFENCE_COUNTER_SKIP_CAPACITY,
KFENCE_COUNTER_SKIP_COVERED,
KFENCE_COUNTER_COUNT,
};
static atomic_long_t counters[KFENCE_COUNTER_COUNT];
@ -122,11 +159,59 @@ static const char *const counter_names[] = {
[KFENCE_COUNTER_FREES] = "total frees",
[KFENCE_COUNTER_ZOMBIES] = "zombie allocations",
[KFENCE_COUNTER_BUGS] = "total bugs",
[KFENCE_COUNTER_SKIP_INCOMPAT] = "skipped allocations (incompatible)",
[KFENCE_COUNTER_SKIP_CAPACITY] = "skipped allocations (capacity)",
[KFENCE_COUNTER_SKIP_COVERED] = "skipped allocations (covered)",
};
static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
/* === Internals ============================================================ */
static inline bool should_skip_covered(void)
{
unsigned long thresh = (CONFIG_KFENCE_NUM_OBJECTS * kfence_skip_covered_thresh) / 100;
return atomic_long_read(&counters[KFENCE_COUNTER_ALLOCATED]) > thresh;
}
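As a quick worked example of this threshold: skip_covered_thresh defaults to 75 as set above, and CONFIG_KFENCE_NUM_OBJECTS is assumed here to be its typical default of 255 (an assumption about the configuration, not part of the patch):

/*
 * Worked example (hypothetical default config):
 *   thresh = (255 * 75) / 100 = 191
 * so once more than 191 of the 255 KFENCE objects are in use, allocations
 * whose stack hash is already present in the coverage filter get skipped.
 */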
static u32 get_alloc_stack_hash(unsigned long *stack_entries, size_t num_entries)
{
num_entries = min(num_entries, UNIQUE_ALLOC_STACK_DEPTH);
num_entries = filter_irq_stacks(stack_entries, num_entries);
return jhash(stack_entries, num_entries * sizeof(stack_entries[0]), stack_hash_seed);
}
/*
* Adds (or subtracts) count @val for allocation stack trace hash
* @alloc_stack_hash from the Counting Bloom filter.
*/
static void alloc_covered_add(u32 alloc_stack_hash, int val)
{
int i;
for (i = 0; i < ALLOC_COVERED_HNUM; i++) {
atomic_add(val, &alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]);
alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
}
}
/*
* Returns true if the allocation stack trace hash @alloc_stack_hash is
* currently contained (non-zero count) in the Counting Bloom filter.
*/
static bool alloc_covered_contains(u32 alloc_stack_hash)
{
int i;
for (i = 0; i < ALLOC_COVERED_HNUM; i++) {
if (!atomic_read(&alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]))
return false;
alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
}
return true;
}
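To see the two-probe scheme outside the kernel, here is a minimal standalone C sketch of the same counting Bloom filter mechanics. The table size assumes the typical default of 255 objects (const_ilog2(255) + 2 = 9, i.e. 512 counters), and the multiplier mirrors the kernel's 32-bit golden-ratio hash constant; everything prefixed demo_ is hypothetical illustration code, not kernel API:

#include <stdint.h>
#include <stdio.h>

#define DEMO_HNUM   2
#define DEMO_ORDER  9                     /* const_ilog2(255) + 2 */
#define DEMO_SIZE   (1u << DEMO_ORDER)
#define DEMO_MASK   (DEMO_SIZE - 1)

static int demo_covered[DEMO_SIZE];

/* Same idea as hash_32(): multiply by a 32-bit golden-ratio constant. */
static uint32_t demo_hnext(uint32_t h)
{
	return (h * 0x61C88647u) >> (32 - DEMO_ORDER);
}

/* Increment (or decrement) the DEMO_HNUM probed counters for one hash. */
static void demo_covered_add(uint32_t hash, int val)
{
	for (int i = 0; i < DEMO_HNUM; i++) {
		demo_covered[hash & DEMO_MASK] += val;
		hash = demo_hnext(hash);
	}
}

/* "Covered" only if every probed counter is non-zero. */
static int demo_covered_contains(uint32_t hash)
{
	for (int i = 0; i < DEMO_HNUM; i++) {
		if (!demo_covered[hash & DEMO_MASK])
			return 0;
		hash = demo_hnext(hash);
	}
	return 1;
}

int main(void)
{
	uint32_t h = 0xdeadbeef;              /* stands in for a jhash of a stack trace */

	printf("before: %d\n", demo_covered_contains(h));   /* 0 */
	demo_covered_add(h, 1);
	printf("after:  %d\n", demo_covered_contains(h));   /* 1 */
	demo_covered_add(h, -1);
	printf("freed:  %d\n", demo_covered_contains(h));   /* 0 again */
	return 0;
}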
static bool kfence_protect(unsigned long addr)
{
return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true));
@ -184,19 +269,26 @@ static inline unsigned long metadata_to_pageaddr(const struct kfence_metadata *m
* Update the object's metadata state, including updating the alloc/free stacks
* depending on the state transition.
*/
static noinline void metadata_update_state(struct kfence_metadata *meta,
enum kfence_object_state next)
static noinline void
metadata_update_state(struct kfence_metadata *meta, enum kfence_object_state next,
unsigned long *stack_entries, size_t num_stack_entries)
{
struct kfence_track *track =
next == KFENCE_OBJECT_FREED ? &meta->free_track : &meta->alloc_track;
lockdep_assert_held(&meta->lock);
/*
* Skip over 1 (this) functions; noinline ensures we do not accidentally
* skip over the caller by never inlining.
*/
track->num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1);
if (stack_entries) {
memcpy(track->stack_entries, stack_entries,
num_stack_entries * sizeof(stack_entries[0]));
} else {
/*
* Skip over 1 (this) function; noinline ensures we do not
* accidentally skip over the caller by never inlining.
*/
num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1);
}
track->num_stack_entries = num_stack_entries;
track->pid = task_pid_nr(current);
track->cpu = raw_smp_processor_id();
track->ts_nsec = local_clock(); /* Same source as printk timestamps. */
@ -219,12 +311,19 @@ static inline bool set_canary_byte(u8 *addr)
/* Check canary byte at @addr. */
static inline bool check_canary_byte(u8 *addr)
{
struct kfence_metadata *meta;
unsigned long flags;
if (likely(*addr == KFENCE_CANARY_PATTERN(addr)))
return true;
atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
kfence_report_error((unsigned long)addr, false, NULL, addr_to_metadata((unsigned long)addr),
KFENCE_ERROR_CORRUPTION);
meta = addr_to_metadata((unsigned long)addr);
raw_spin_lock_irqsave(&meta->lock, flags);
kfence_report_error((unsigned long)addr, false, NULL, meta, KFENCE_ERROR_CORRUPTION);
raw_spin_unlock_irqrestore(&meta->lock, flags);
return false;
}
@ -234,8 +333,6 @@ static __always_inline void for_each_canary(const struct kfence_metadata *meta,
const unsigned long pageaddr = ALIGN_DOWN(meta->addr, PAGE_SIZE);
unsigned long addr;
lockdep_assert_held(&meta->lock);
/*
* We'll iterate over each canary byte per-side until fn() returns
* false. However, we'll still iterate over the canary bytes to the
@ -258,11 +355,13 @@ static __always_inline void for_each_canary(const struct kfence_metadata *meta,
}
}
static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp)
static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp,
unsigned long *stack_entries, size_t num_stack_entries,
u32 alloc_stack_hash)
{
struct kfence_metadata *meta = NULL;
unsigned long flags;
struct page *page;
struct slab *slab;
void *addr;
/* Try to obtain a free object. */
@ -272,8 +371,10 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
list_del_init(&meta->list);
}
raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
if (!meta)
if (!meta) {
atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_CAPACITY]);
return NULL;
}
if (unlikely(!raw_spin_trylock_irqsave(&meta->lock, flags))) {
/*
@ -315,23 +416,26 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
addr = (void *)meta->addr;
/* Update remaining metadata. */
metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED);
metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED, stack_entries, num_stack_entries);
/* Pairs with READ_ONCE() in kfence_shutdown_cache(). */
WRITE_ONCE(meta->cache, cache);
meta->size = size;
for_each_canary(meta, set_canary_byte);
/* Set required struct page fields. */
page = virt_to_page(meta->addr);
page->slab_cache = cache;
if (IS_ENABLED(CONFIG_SLUB))
page->objects = 1;
if (IS_ENABLED(CONFIG_SLAB))
page->s_mem = addr;
meta->alloc_stack_hash = alloc_stack_hash;
raw_spin_unlock_irqrestore(&meta->lock, flags);
alloc_covered_add(alloc_stack_hash, 1);
/* Set required slab fields. */
slab = virt_to_slab((void *)meta->addr);
slab->slab_cache = cache;
#if defined(CONFIG_SLUB)
slab->objects = 1;
#elif defined(CONFIG_SLAB)
slab->s_mem = addr;
#endif
/* Memory initialization. */
for_each_canary(meta, set_canary_byte);
/*
* We check slab_want_init_on_alloc() ourselves, rather than letting
@ -356,6 +460,7 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
{
struct kcsan_scoped_access assert_page_exclusive;
unsigned long flags;
bool init;
raw_spin_lock_irqsave(&meta->lock, flags);
@ -383,6 +488,13 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
meta->unprotected_page = 0;
}
/* Mark the object as freed. */
metadata_update_state(meta, KFENCE_OBJECT_FREED, NULL, 0);
init = slab_want_init_on_free(meta->cache);
raw_spin_unlock_irqrestore(&meta->lock, flags);
alloc_covered_add(meta->alloc_stack_hash, -1);
/* Check canary bytes for memory corruption. */
for_each_canary(meta, check_canary_byte);
@ -391,14 +503,9 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
* data is still there, and after a use-after-free is detected, we
* unprotect the page, so the data is still accessible.
*/
if (!zombie && unlikely(slab_want_init_on_free(meta->cache)))
if (!zombie && unlikely(init))
memzero_explicit(addr, meta->size);
/* Mark the object as freed. */
metadata_update_state(meta, KFENCE_OBJECT_FREED);
raw_spin_unlock_irqrestore(&meta->lock, flags);
/* Protect to detect use-after-frees. */
kfence_protect((unsigned long)addr);
@ -665,6 +772,7 @@ void __init kfence_init(void)
if (!kfence_sample_interval)
return;
stack_hash_seed = (u32)random_get_entropy();
if (!kfence_init_pool()) {
pr_err("%s failed\n", __func__);
return;
@ -740,12 +848,18 @@ void kfence_shutdown_cache(struct kmem_cache *s)
void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
{
unsigned long stack_entries[KFENCE_STACK_DEPTH];
size_t num_stack_entries;
u32 alloc_stack_hash;
/*
* Perform size check before switching kfence_allocation_gate, so that
* we don't disable KFENCE without making an allocation.
*/
if (size > PAGE_SIZE)
if (size > PAGE_SIZE) {
atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
return NULL;
}
/*
* Skip allocations from non-default zones, including DMA. We cannot
@ -753,8 +867,10 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
* properties (e.g. reside in DMAable memory).
*/
if ((flags & GFP_ZONEMASK) ||
(s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32)))
(s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) {
atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
return NULL;
}
if (atomic_inc_return(&kfence_allocation_gate) > 1)
return NULL;
@ -775,7 +891,25 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
if (!READ_ONCE(kfence_enabled))
return NULL;
return kfence_guarded_alloc(s, size, flags);
num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 0);
/*
* Do expensive check for coverage of allocation in slow-path after
* allocation_gate has already become non-zero, even though it might
* mean not making any allocation within a given sample interval.
*
* This ensures reasonable allocation coverage when the pool is almost
* full, including avoiding long-lived allocations of the same source
* filling up the pool (e.g. pagecache allocations).
*/
alloc_stack_hash = get_alloc_stack_hash(stack_entries, num_stack_entries);
if (should_skip_covered() && alloc_covered_contains(alloc_stack_hash)) {
atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_COVERED]);
return NULL;
}
return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries,
alloc_stack_hash);
}
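Condensing the rewritten fast path above into one place, as an editor's summary of the control flow shown in this hunk rather than new behaviour:

/*
 * __kfence_alloc() now bails out, in order, when:
 *   1. size > PAGE_SIZE                        -> KFENCE_COUNTER_SKIP_INCOMPAT
 *   2. the zone/cache flags are incompatible   -> KFENCE_COUNTER_SKIP_INCOMPAT
 *   3. the allocation gate was already taken in this sample interval
 *   4. KFENCE has been disabled
 *   5. the pool is above skip_covered_thresh and the saved stack's hash is
 *      already in the coverage filter          -> KFENCE_COUNTER_SKIP_COVERED
 * and only then calls kfence_guarded_alloc() with the saved stack and hash.
 */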
size_t kfence_ksize(const void *addr)

View File

@ -87,6 +87,8 @@ struct kfence_metadata {
/* Allocation and free stack information. */
struct kfence_track alloc_track;
struct kfence_track free_track;
/* For updating alloc_covered on frees. */
u32 alloc_stack_hash;
};
extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];

View File

@ -32,6 +32,11 @@
#define arch_kfence_test_address(addr) (addr)
#endif
#define KFENCE_TEST_REQUIRES(test, cond) do { \
if (!(cond)) \
kunit_skip((test), "Test requires: " #cond); \
} while (0)
/* Report as observed from console. */
static struct {
spinlock_t lock;
@ -263,13 +268,13 @@ static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocat
* 100x the sample interval should be more than enough to ensure we get
* a KFENCE allocation eventually.
*/
timeout = jiffies + msecs_to_jiffies(100 * CONFIG_KFENCE_SAMPLE_INTERVAL);
timeout = jiffies + msecs_to_jiffies(100 * kfence_sample_interval);
/*
* Especially for non-preemption kernels, ensure the allocation-gate
* timer can catch up: after @resched_after, every failed allocation
* attempt yields, to ensure the allocation-gate timer is scheduled.
*/
resched_after = jiffies + msecs_to_jiffies(CONFIG_KFENCE_SAMPLE_INTERVAL);
resched_after = jiffies + msecs_to_jiffies(kfence_sample_interval);
do {
if (test_cache)
alloc = kmem_cache_alloc(test_cache, gfp);
@ -277,7 +282,7 @@ static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocat
alloc = kmalloc(size, gfp);
if (is_kfence_address(alloc)) {
struct page *page = virt_to_head_page(alloc);
struct slab *slab = virt_to_slab(alloc);
struct kmem_cache *s = test_cache ?:
kmalloc_caches[kmalloc_type(GFP_KERNEL)][__kmalloc_index(size, false)];
@ -286,8 +291,8 @@ static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocat
* even for KFENCE objects; these are required so that
* memcg accounting works correctly.
*/
KUNIT_EXPECT_EQ(test, obj_to_index(s, page, alloc), 0U);
KUNIT_EXPECT_EQ(test, objs_per_slab_page(s, page), 1);
KUNIT_EXPECT_EQ(test, obj_to_index(s, slab, alloc), 0U);
KUNIT_EXPECT_EQ(test, objs_per_slab(s, slab), 1);
if (policy == ALLOCATE_ANY)
return alloc;
@ -555,8 +560,7 @@ static void test_init_on_free(struct kunit *test)
};
int i;
if (!IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON))
return;
KFENCE_TEST_REQUIRES(test, IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON));
/* Assume it hasn't been disabled on command line. */
setup_test_cache(test, size, 0, NULL);
@ -603,10 +607,8 @@ static void test_gfpzero(struct kunit *test)
char *buf1, *buf2;
int i;
if (CONFIG_KFENCE_SAMPLE_INTERVAL > 100) {
kunit_warn(test, "skipping ... would take too long\n");
return;
}
/* Skip if we think it'd take too long. */
KFENCE_TEST_REQUIRES(test, kfence_sample_interval <= 100);
setup_test_cache(test, size, 0, NULL);
buf1 = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
@ -737,7 +739,7 @@ static void test_memcache_alloc_bulk(struct kunit *test)
* 100x the sample interval should be more than enough to ensure we get
* a KFENCE allocation eventually.
*/
timeout = jiffies + msecs_to_jiffies(100 * CONFIG_KFENCE_SAMPLE_INTERVAL);
timeout = jiffies + msecs_to_jiffies(100 * kfence_sample_interval);
do {
void *objects[100];
int i, num = kmem_cache_alloc_bulk(test_cache, GFP_ATOMIC, ARRAY_SIZE(objects),

View File

@ -16,6 +16,7 @@
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
#include <linux/page_table_check.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
@ -618,6 +619,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
continue;
} else {
result = SCAN_EXCEED_NONE_PTE;
count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
goto out;
}
}
@ -636,6 +638,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
if (page_mapcount(page) > 1 &&
++shared > khugepaged_max_ptes_shared) {
result = SCAN_EXCEED_SHARED_PTE;
count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
goto out;
}
@ -681,7 +684,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
goto out;
}
if (!pte_write(pteval) && PageSwapCache(page) &&
!reuse_swap_page(page, NULL)) {
!reuse_swap_page(page)) {
/*
* Page is in the swap cache and cannot be re-used.
* It cannot be collapsed into a THP.
@ -756,11 +759,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
* ptl mostly unnecessary.
*/
spin_lock(ptl);
/*
* paravirt calls inside pte_clear here are
* superfluous.
*/
pte_clear(vma->vm_mm, address, _pte);
ptep_clear(vma->vm_mm, address, _pte);
spin_unlock(ptl);
}
} else {
@ -774,11 +773,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
* inside page_remove_rmap().
*/
spin_lock(ptl);
/*
* paravirt calls inside pte_clear here are
* superfluous.
*/
pte_clear(vma->vm_mm, address, _pte);
ptep_clear(vma->vm_mm, address, _pte);
page_remove_rmap(src_page, false);
spin_unlock(ptl);
free_page_and_swap_cache(src_page);
@ -1090,7 +1085,7 @@ static void collapse_huge_page(struct mm_struct *mm,
goto out_nolock;
}
if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) {
if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) {
result = SCAN_CGROUP_CHARGE_FAIL;
goto out_nolock;
}
@ -1214,7 +1209,7 @@ static void collapse_huge_page(struct mm_struct *mm,
mmap_write_unlock(mm);
out_nolock:
if (!IS_ERR_OR_NULL(*hpage))
mem_cgroup_uncharge(*hpage);
mem_cgroup_uncharge(page_folio(*hpage));
trace_mm_collapse_huge_page(mm, isolated, result);
return;
}
@ -1261,6 +1256,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
continue;
} else {
result = SCAN_EXCEED_SWAP_PTE;
count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
goto out_unmap;
}
}
@ -1270,6 +1266,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
continue;
} else {
result = SCAN_EXCEED_NONE_PTE;
count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
goto out_unmap;
}
}
@ -1298,6 +1295,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
if (page_mapcount(page) > 1 &&
++shared > khugepaged_max_ptes_shared) {
result = SCAN_EXCEED_SHARED_PTE;
count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
goto out_unmap;
}
@ -1306,7 +1304,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
/*
* Record which node the original page is from and save this
* information to khugepaged_node_load[].
* Khupaged will allocate hugepage from the node has the max
* Khugepaged will allocate hugepage from the node that has the max
* hit record.
*/
node = page_to_nid(page);
@ -1419,6 +1417,21 @@ static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
return 0;
}
static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pmd_t *pmdp)
{
spinlock_t *ptl;
pmd_t pmd;
mmap_assert_write_locked(mm);
ptl = pmd_lock(vma->vm_mm, pmdp);
pmd = pmdp_collapse_flush(vma, addr, pmdp);
spin_unlock(ptl);
mm_dec_nr_ptes(mm);
page_table_check_pte_clear_range(mm, addr, pmd);
pte_free(mm, pmd_pgtable(pmd));
}
/**
* collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at
* address haddr.
@ -1436,7 +1449,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
struct vm_area_struct *vma = find_vma(mm, haddr);
struct page *hpage;
pte_t *start_pte, *pte;
pmd_t *pmd, _pmd;
pmd_t *pmd;
spinlock_t *ptl;
int count = 0;
int i;
@ -1512,12 +1525,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
}
/* step 4: collapse pmd */
ptl = pmd_lock(vma->vm_mm, pmd);
_pmd = pmdp_collapse_flush(vma, haddr, pmd);
spin_unlock(ptl);
mm_dec_nr_ptes(mm);
pte_free(mm, pmd_pgtable(_pmd));
collapse_and_free_pmd(mm, vma, haddr, pmd);
drop_hpage:
unlock_page(hpage);
put_page(hpage);
@ -1555,7 +1563,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
struct vm_area_struct *vma;
struct mm_struct *mm;
unsigned long addr;
pmd_t *pmd, _pmd;
pmd_t *pmd;
i_mmap_lock_write(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
@ -1594,14 +1602,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
* reverse order. Trylock is a way to avoid deadlock.
*/
if (mmap_write_trylock(mm)) {
if (!khugepaged_test_exit(mm)) {
spinlock_t *ptl = pmd_lock(mm, pmd);
/* assume page table is clear */
_pmd = pmdp_collapse_flush(vma, addr, pmd);
spin_unlock(ptl);
mm_dec_nr_ptes(mm);
pte_free(mm, pmd_pgtable(_pmd));
}
if (!khugepaged_test_exit(mm))
collapse_and_free_pmd(mm, vma, addr, pmd);
mmap_write_unlock(mm);
} else {
/* Try again later */
@ -1661,13 +1663,16 @@ static void collapse_file(struct mm_struct *mm,
goto out;
}
if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) {
if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) {
result = SCAN_CGROUP_CHARGE_FAIL;
goto out;
}
count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
/* This will be less messy when we use multi-index entries */
/*
* Ensure we have slots for all the pages in the range. This is
* almost certainly a no-op because most of the pages must be present.
*/
do {
xas_lock_irq(&xas);
xas_create_range(&xas);
@ -1892,6 +1897,9 @@ static void collapse_file(struct mm_struct *mm,
__mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
}
/* Join all the small entries into a single multi-index entry */
xas_set_order(&xas, start, HPAGE_PMD_ORDER);
xas_store(&xas, new_page);
xa_locked:
xas_unlock_irq(&xas);
xa_unlocked:
@ -1983,7 +1991,7 @@ static void collapse_file(struct mm_struct *mm,
out:
VM_BUG_ON(!list_empty(&pagelist));
if (!IS_ERR_OR_NULL(*hpage))
mem_cgroup_uncharge(*hpage);
mem_cgroup_uncharge(page_folio(*hpage));
/* TODO: tracepoints */
}
@ -2008,11 +2016,16 @@ static void khugepaged_scan_file(struct mm_struct *mm,
if (xa_is_value(page)) {
if (++swap > khugepaged_max_ptes_swap) {
result = SCAN_EXCEED_SWAP_PTE;
count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
break;
}
continue;
}
/*
* XXX: khugepaged should compact smaller compound pages
* into a PMD sized page
*/
if (PageTransCompound(page)) {
result = SCAN_PAGE_COMPOUND;
break;
@ -2054,6 +2067,7 @@ static void khugepaged_scan_file(struct mm_struct *mm,
if (result == SCAN_SUCCEED) {
if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
result = SCAN_EXCEED_NONE_PTE;
count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
} else {
node = khugepaged_find_target_node();
collapse_file(mm, file, start, hpage, node);
@ -2299,6 +2313,11 @@ static void set_recommended_min_free_kbytes(void)
int nr_zones = 0;
unsigned long recommended_min;
if (!khugepaged_enabled()) {
calculate_min_free_kbytes();
goto update_wmarks;
}
for_each_populated_zone(zone) {
/*
* We don't need to worry about fragmentation of
@ -2334,6 +2353,8 @@ static void set_recommended_min_free_kbytes(void)
min_free_kbytes = recommended_min;
}
update_wmarks:
setup_per_zone_wmarks();
}
@ -2355,12 +2376,11 @@ int start_stop_khugepaged(void)
if (!list_empty(&khugepaged_scan.mm_head))
wake_up_interruptible(&khugepaged_wait);
set_recommended_min_free_kbytes();
} else if (khugepaged_thread) {
kthread_stop(khugepaged_thread);
khugepaged_thread = NULL;
}
set_recommended_min_free_kbytes();
fail:
mutex_unlock(&khugepaged_mutex);
return err;

View File

@ -381,15 +381,20 @@ static void dump_object_info(struct kmemleak_object *object)
static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
{
struct rb_node *rb = object_tree_root.rb_node;
unsigned long untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr);
while (rb) {
struct kmemleak_object *object =
rb_entry(rb, struct kmemleak_object, rb_node);
if (ptr < object->pointer)
struct kmemleak_object *object;
unsigned long untagged_objp;
object = rb_entry(rb, struct kmemleak_object, rb_node);
untagged_objp = (unsigned long)kasan_reset_tag((void *)object->pointer);
if (untagged_ptr < untagged_objp)
rb = object->rb_node.rb_left;
else if (object->pointer + object->size <= ptr)
else if (untagged_objp + object->size <= untagged_ptr)
rb = object->rb_node.rb_right;
else if (object->pointer == ptr || alias)
else if (untagged_objp == untagged_ptr || alias)
return object;
else {
kmemleak_warn("Found object by alias at 0x%08lx\n",
@ -576,6 +581,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
struct kmemleak_object *object, *parent;
struct rb_node **link, *rb_parent;
unsigned long untagged_ptr;
unsigned long untagged_objp;
object = mem_pool_alloc(gfp);
if (!object) {
@ -629,9 +635,10 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
while (*link) {
rb_parent = *link;
parent = rb_entry(rb_parent, struct kmemleak_object, rb_node);
if (ptr + size <= parent->pointer)
untagged_objp = (unsigned long)kasan_reset_tag((void *)parent->pointer);
if (untagged_ptr + size <= untagged_objp)
link = &parent->rb_node.rb_left;
else if (parent->pointer + parent->size <= ptr)
else if (untagged_objp + parent->size <= untagged_ptr)
link = &parent->rb_node.rb_right;
else {
kmemleak_stop("Cannot insert 0x%lx into the object search tree (overlaps existing)\n",

View File

@ -15,6 +15,7 @@
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/fs.h>
#include <linux/mman.h>
#include <linux/sched.h>
@ -751,7 +752,7 @@ static struct page *get_ksm_page(struct stable_node *stable_node,
/*
* We come here from above when page->mapping or !PageSwapCache
* suggests that the node is stale; but it might be under migration.
* We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
* We need smp_rmb(), matching the smp_wmb() in folio_migrate_ksm(),
* before checking whether node->kpfn has been changed.
*/
smp_rmb();
@ -852,9 +853,14 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
return err;
}
static inline struct stable_node *folio_stable_node(struct folio *folio)
{
return folio_test_ksm(folio) ? folio_raw_mapping(folio) : NULL;
}
static inline struct stable_node *page_stable_node(struct page *page)
{
return PageKsm(page) ? page_rmapping(page) : NULL;
return folio_stable_node(page_folio(page));
}
static inline void set_page_stable_node(struct page *page,
@ -2570,15 +2576,16 @@ struct page *ksm_might_need_to_copy(struct page *page,
return page; /* no need to copy it */
} else if (!anon_vma) {
return page; /* no need to copy it */
} else if (anon_vma->root == vma->anon_vma->root &&
page->index == linear_page_index(vma, address)) {
} else if (page->index == linear_page_index(vma, address) &&
anon_vma->root == vma->anon_vma->root) {
return page; /* still no need to copy it */
}
if (!PageUptodate(page))
return page; /* let do_swap_page report the error */
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
if (new_page && mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL)) {
if (new_page &&
mem_cgroup_charge(page_folio(new_page), vma->vm_mm, GFP_KERNEL)) {
put_page(new_page);
new_page = NULL;
}
@ -2658,26 +2665,26 @@ void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
}
#ifdef CONFIG_MIGRATION
void ksm_migrate_page(struct page *newpage, struct page *oldpage)
void folio_migrate_ksm(struct folio *newfolio, struct folio *folio)
{
struct stable_node *stable_node;
VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage);
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_FOLIO(!folio_test_locked(newfolio), newfolio);
VM_BUG_ON_FOLIO(newfolio->mapping != folio->mapping, newfolio);
stable_node = page_stable_node(newpage);
stable_node = folio_stable_node(folio);
if (stable_node) {
VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage);
stable_node->kpfn = page_to_pfn(newpage);
VM_BUG_ON_FOLIO(stable_node->kpfn != folio_pfn(folio), folio);
stable_node->kpfn = folio_pfn(newfolio);
/*
* newpage->mapping was set in advance; now we need smp_wmb()
* newfolio->mapping was set in advance; now we need smp_wmb()
* to make sure that the new stable_node->kpfn is visible
* to get_ksm_page() before it can see that oldpage->mapping
* has gone stale (or that PageSwapCache has been cleared).
* to get_ksm_page() before it can see that folio->mapping
* has gone stale (or that folio_test_swapcache has been cleared).
*/
smp_wmb();
set_page_stable_node(oldpage, NULL);
set_page_stable_node(&folio->page, NULL);
}
}
#endif /* CONFIG_MIGRATION */

View File

@ -15,18 +15,29 @@
#include "slab.h"
#ifdef CONFIG_MEMCG_KMEM
static LIST_HEAD(list_lrus);
static LIST_HEAD(memcg_list_lrus);
static DEFINE_MUTEX(list_lrus_mutex);
static inline bool list_lru_memcg_aware(struct list_lru *lru)
{
return lru->memcg_aware;
}
static void list_lru_register(struct list_lru *lru)
{
if (!list_lru_memcg_aware(lru))
return;
mutex_lock(&list_lrus_mutex);
list_add(&lru->list, &list_lrus);
list_add(&lru->list, &memcg_list_lrus);
mutex_unlock(&list_lrus_mutex);
}
static void list_lru_unregister(struct list_lru *lru)
{
if (!list_lru_memcg_aware(lru))
return;
mutex_lock(&list_lrus_mutex);
list_del(&lru->list);
mutex_unlock(&list_lrus_mutex);
@ -37,11 +48,6 @@ static int lru_shrinker_id(struct list_lru *lru)
return lru->shrinker_id;
}
static inline bool list_lru_memcg_aware(struct list_lru *lru)
{
return lru->memcg_aware;
}
static inline struct list_lru_one *
list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
{
@ -176,13 +182,16 @@ unsigned long list_lru_count_one(struct list_lru *lru,
{
struct list_lru_node *nlru = &lru->node[nid];
struct list_lru_one *l;
unsigned long count;
long count;
rcu_read_lock();
l = list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg));
count = READ_ONCE(l->nr_items);
rcu_read_unlock();
if (unlikely(count < 0))
count = 0;
return count;
}
EXPORT_SYMBOL_GPL(list_lru_count_one);
@ -354,8 +363,7 @@ static int memcg_init_list_lru_node(struct list_lru_node *nlru)
struct list_lru_memcg *memcg_lrus;
int size = memcg_nr_cache_ids;
memcg_lrus = kvmalloc(sizeof(*memcg_lrus) +
size * sizeof(void *), GFP_KERNEL);
memcg_lrus = kvmalloc(struct_size(memcg_lrus, lru, size), GFP_KERNEL);
if (!memcg_lrus)
return -ENOMEM;
@ -389,7 +397,7 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,
old = rcu_dereference_protected(nlru->memcg_lrus,
lockdep_is_held(&list_lrus_mutex));
new = kvmalloc(sizeof(*new) + new_size * sizeof(void *), GFP_KERNEL);
new = kvmalloc(struct_size(new, lru, new_size), GFP_KERNEL);
if (!new)
return -ENOMEM;
@ -398,19 +406,8 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,
return -ENOMEM;
}
memcpy(&new->lru, &old->lru, old_size * sizeof(void *));
/*
* The locking below allows readers that hold nlru->lock avoid taking
* rcu_read_lock (see list_lru_from_memcg_idx).
*
* Since list_lru_{add,del} may be called under an IRQ-safe lock,
* we have to use IRQ-safe primitives here to avoid deadlock.
*/
spin_lock_irq(&nlru->lock);
memcpy(&new->lru, &old->lru, flex_array_size(new, lru, old_size));
rcu_assign_pointer(nlru->memcg_lrus, new);
spin_unlock_irq(&nlru->lock);
kvfree_rcu(old, rcu);
return 0;
}
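The two conversions above replace open-coded size arithmetic with the struct_size()/flex_array_size() helpers from <linux/overflow.h>, which compute the same values while saturating on overflow. A standalone sketch of the arithmetic they stand for (the demo struct and the size of 64 are hypothetical, chosen only for illustration):

#include <stdio.h>
#include <stdlib.h>

struct demo_memcg_lrus {
	long rcu_stub;          /* stands in for the real header fields */
	void *lru[];            /* flexible array member */
};

int main(void)
{
	size_t size = 64;       /* pretend memcg_nr_cache_ids */

	/* What the old code spelled out by hand ... */
	size_t open_coded = sizeof(struct demo_memcg_lrus) + size * sizeof(void *);

	/* ... and what struct_size(memcg_lrus, lru, size) evaluates to;
	 * flex_array_size(new, lru, old_size) is just the array part. */
	size_t array_only = size * sizeof(void *);

	struct demo_memcg_lrus *p = malloc(open_coded);
	if (!p)
		return 1;

	printf("total %zu bytes, array part %zu bytes\n", open_coded, array_only);
	free(p);
	return 0;
}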
@ -466,9 +463,6 @@ static int memcg_update_list_lru(struct list_lru *lru,
{
int i;
if (!list_lru_memcg_aware(lru))
return 0;
for_each_node(i) {
if (memcg_update_list_lru_node(&lru->node[i],
old_size, new_size))
@ -491,9 +485,6 @@ static void memcg_cancel_update_list_lru(struct list_lru *lru,
{
int i;
if (!list_lru_memcg_aware(lru))
return;
for_each_node(i)
memcg_cancel_update_list_lru_node(&lru->node[i],
old_size, new_size);
@ -506,7 +497,7 @@ int memcg_update_all_list_lrus(int new_size)
int old_size = memcg_nr_cache_ids;
mutex_lock(&list_lrus_mutex);
list_for_each_entry(lru, &list_lrus, list) {
list_for_each_entry(lru, &memcg_list_lrus, list) {
ret = memcg_update_list_lru(lru, old_size, new_size);
if (ret)
goto fail;
@ -515,7 +506,7 @@ int memcg_update_all_list_lrus(int new_size)
mutex_unlock(&list_lrus_mutex);
return ret;
fail:
list_for_each_entry_continue_reverse(lru, &list_lrus, list)
list_for_each_entry_continue_reverse(lru, &memcg_list_lrus, list)
memcg_cancel_update_list_lru(lru, old_size, new_size);
goto out;
}
@ -552,9 +543,6 @@ static void memcg_drain_list_lru(struct list_lru *lru,
{
int i;
if (!list_lru_memcg_aware(lru))
return;
for_each_node(i)
memcg_drain_list_lru_node(lru, i, src_idx, dst_memcg);
}
@ -564,7 +552,7 @@ void memcg_drain_all_list_lrus(int src_idx, struct mem_cgroup *dst_memcg)
struct list_lru *lru;
mutex_lock(&list_lrus_mutex);
list_for_each_entry(lru, &list_lrus, list)
list_for_each_entry(lru, &memcg_list_lrus, list)
memcg_drain_list_lru(lru, src_idx, dst_memcg);
mutex_unlock(&list_lrus_mutex);
}

View File

@ -18,6 +18,8 @@
#include <linux/fadvise.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/mm_inline.h>
#include <linux/string.h>
#include <linux/uio.h>
#include <linux/ksm.h>
#include <linux/fs.h>
@ -62,83 +64,94 @@ static int madvise_need_mmap_write(int behavior)
}
}
/*
* We can potentially split a vm area into separate
* areas, each area with its own behavior.
*/
static long madvise_behavior(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end, int behavior)
#ifdef CONFIG_ANON_VMA_NAME
struct anon_vma_name *anon_vma_name_alloc(const char *name)
{
struct mm_struct *mm = vma->vm_mm;
int error = 0;
pgoff_t pgoff;
unsigned long new_flags = vma->vm_flags;
struct anon_vma_name *anon_name;
size_t count;
switch (behavior) {
case MADV_NORMAL:
new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
break;
case MADV_SEQUENTIAL:
new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
break;
case MADV_RANDOM:
new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
break;
case MADV_DONTFORK:
new_flags |= VM_DONTCOPY;
break;
case MADV_DOFORK:
if (vma->vm_flags & VM_IO) {
error = -EINVAL;
goto out;
}
new_flags &= ~VM_DONTCOPY;
break;
case MADV_WIPEONFORK:
/* MADV_WIPEONFORK is only supported on anonymous memory. */
if (vma->vm_file || vma->vm_flags & VM_SHARED) {
error = -EINVAL;
goto out;
}
new_flags |= VM_WIPEONFORK;
break;
case MADV_KEEPONFORK:
new_flags &= ~VM_WIPEONFORK;
break;
case MADV_DONTDUMP:
new_flags |= VM_DONTDUMP;
break;
case MADV_DODUMP:
if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) {
error = -EINVAL;
goto out;
}
new_flags &= ~VM_DONTDUMP;
break;
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
error = ksm_madvise(vma, start, end, behavior, &new_flags);
if (error)
goto out_convert_errno;
break;
case MADV_HUGEPAGE:
case MADV_NOHUGEPAGE:
error = hugepage_madvise(vma, &new_flags, behavior);
if (error)
goto out_convert_errno;
break;
/* Add 1 for NUL terminator at the end of the anon_name->name */
count = strlen(name) + 1;
anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL);
if (anon_name) {
kref_init(&anon_name->kref);
memcpy(anon_name->name, name, count);
}
if (new_flags == vma->vm_flags) {
return anon_name;
}
void anon_vma_name_free(struct kref *kref)
{
struct anon_vma_name *anon_name =
container_of(kref, struct anon_vma_name, kref);
kfree(anon_name);
}
struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
{
mmap_assert_locked(vma->vm_mm);
if (vma->vm_file)
return NULL;
return vma->anon_name;
}
/* mmap_lock should be write-locked */
static int replace_anon_vma_name(struct vm_area_struct *vma,
struct anon_vma_name *anon_name)
{
struct anon_vma_name *orig_name = anon_vma_name(vma);
if (!anon_name) {
vma->anon_name = NULL;
anon_vma_name_put(orig_name);
return 0;
}
if (anon_vma_name_eq(orig_name, anon_name))
return 0;
vma->anon_name = anon_vma_name_reuse(anon_name);
anon_vma_name_put(orig_name);
return 0;
}
#else /* CONFIG_ANON_VMA_NAME */
static int replace_anon_vma_name(struct vm_area_struct *vma,
struct anon_vma_name *anon_name)
{
if (anon_name)
return -EINVAL;
return 0;
}
#endif /* CONFIG_ANON_VMA_NAME */
/*
* Update the vm_flags on region of a vma, splitting it or merging it as
* necessary. Must be called with mmap_lock held for writing;
* Caller should ensure anon_name stability by raising its refcount even when
* anon_name belongs to a valid vma because this function might free that vma.
*/
static int madvise_update_vma(struct vm_area_struct *vma,
struct vm_area_struct **prev, unsigned long start,
unsigned long end, unsigned long new_flags,
struct anon_vma_name *anon_name)
{
struct mm_struct *mm = vma->vm_mm;
int error;
pgoff_t pgoff;
if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
*prev = vma;
goto out;
return 0;
}
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
vma->vm_file, pgoff, vma_policy(vma),
vma->vm_userfaultfd_ctx);
vma->vm_userfaultfd_ctx, anon_name);
if (*prev) {
vma = *prev;
goto success;
@ -147,23 +160,19 @@ static long madvise_behavior(struct vm_area_struct *vma,
*prev = vma;
if (start != vma->vm_start) {
if (unlikely(mm->map_count >= sysctl_max_map_count)) {
error = -ENOMEM;
goto out;
}
if (unlikely(mm->map_count >= sysctl_max_map_count))
return -ENOMEM;
error = __split_vma(mm, vma, start, 1);
if (error)
goto out_convert_errno;
return error;
}
if (end != vma->vm_end) {
if (unlikely(mm->map_count >= sysctl_max_map_count)) {
error = -ENOMEM;
goto out;
}
if (unlikely(mm->map_count >= sysctl_max_map_count))
return -ENOMEM;
error = __split_vma(mm, vma, end, 0);
if (error)
goto out_convert_errno;
return error;
}
success:
@ -171,16 +180,13 @@ static long madvise_behavior(struct vm_area_struct *vma,
* vm_flags is protected by the mmap_lock held in write mode.
*/
vma->vm_flags = new_flags;
if (!vma->vm_file) {
error = replace_anon_vma_name(vma, anon_name);
if (error)
return error;
}
out_convert_errno:
/*
* madvise() returns EAGAIN if kernel resources, such as
* slab, are temporarily unavailable.
*/
if (error == -ENOMEM)
error = -EAGAIN;
out:
return error;
return 0;
}
#ifdef CONFIG_SWAP
@ -930,6 +936,99 @@ static long madvise_remove(struct vm_area_struct *vma,
return error;
}
/*
* Apply an madvise behavior to a region of a vma. madvise_update_vma
* will handle splitting a vm area into separate areas, each area with its own
* behavior.
*/
static int madvise_vma_behavior(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end,
unsigned long behavior)
{
int error;
struct anon_vma_name *anon_name;
unsigned long new_flags = vma->vm_flags;
switch (behavior) {
case MADV_REMOVE:
return madvise_remove(vma, prev, start, end);
case MADV_WILLNEED:
return madvise_willneed(vma, prev, start, end);
case MADV_COLD:
return madvise_cold(vma, prev, start, end);
case MADV_PAGEOUT:
return madvise_pageout(vma, prev, start, end);
case MADV_FREE:
case MADV_DONTNEED:
return madvise_dontneed_free(vma, prev, start, end, behavior);
case MADV_POPULATE_READ:
case MADV_POPULATE_WRITE:
return madvise_populate(vma, prev, start, end, behavior);
case MADV_NORMAL:
new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
break;
case MADV_SEQUENTIAL:
new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
break;
case MADV_RANDOM:
new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
break;
case MADV_DONTFORK:
new_flags |= VM_DONTCOPY;
break;
case MADV_DOFORK:
if (vma->vm_flags & VM_IO)
return -EINVAL;
new_flags &= ~VM_DONTCOPY;
break;
case MADV_WIPEONFORK:
/* MADV_WIPEONFORK is only supported on anonymous memory. */
if (vma->vm_file || vma->vm_flags & VM_SHARED)
return -EINVAL;
new_flags |= VM_WIPEONFORK;
break;
case MADV_KEEPONFORK:
new_flags &= ~VM_WIPEONFORK;
break;
case MADV_DONTDUMP:
new_flags |= VM_DONTDUMP;
break;
case MADV_DODUMP:
if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL)
return -EINVAL;
new_flags &= ~VM_DONTDUMP;
break;
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
error = ksm_madvise(vma, start, end, behavior, &new_flags);
if (error)
goto out;
break;
case MADV_HUGEPAGE:
case MADV_NOHUGEPAGE:
error = hugepage_madvise(vma, &new_flags, behavior);
if (error)
goto out;
break;
}
anon_name = anon_vma_name(vma);
anon_vma_name_get(anon_name);
error = madvise_update_vma(vma, prev, start, end, new_flags,
anon_name);
anon_vma_name_put(anon_name);
out:
/*
* madvise() returns EAGAIN if kernel resources, such as
* slab, are temporarily unavailable.
*/
if (error == -ENOMEM)
error = -EAGAIN;
return error;
}
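To keep the restructuring easy to follow, here is a compressed view of the new dispatch, as an editor's summary of the code in this hunk rather than additional behaviour:

/*
 * madvise_vma_behavior() now:
 *   - forwards MADV_REMOVE/WILLNEED/COLD/PAGEOUT/FREE/DONTNEED/POPULATE_*
 *     straight to their existing handlers, and
 *   - for every flag-changing behavior computes new_flags, then calls
 *     madvise_update_vma() with the vma's current anon_vma_name pinned
 *     (get before, put after), because madvise_update_vma() may free the
 *     vma that owns the name.
 * -ENOMEM is still translated to -EAGAIN for the flag-changing cases.
 */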
#ifdef CONFIG_MEMORY_FAILURE
/*
* Error injection support for memory error handling.
@ -978,30 +1077,6 @@ static int madvise_inject_error(int behavior,
}
#endif
static long
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
unsigned long start, unsigned long end, int behavior)
{
switch (behavior) {
case MADV_REMOVE:
return madvise_remove(vma, prev, start, end);
case MADV_WILLNEED:
return madvise_willneed(vma, prev, start, end);
case MADV_COLD:
return madvise_cold(vma, prev, start, end);
case MADV_PAGEOUT:
return madvise_pageout(vma, prev, start, end);
case MADV_FREE:
case MADV_DONTNEED:
return madvise_dontneed_free(vma, prev, start, end, behavior);
case MADV_POPULATE_READ:
case MADV_POPULATE_WRITE:
return madvise_populate(vma, prev, start, end, behavior);
default:
return madvise_behavior(vma, prev, start, end, behavior);
}
}
static bool
madvise_behavior_valid(int behavior)
{
@ -1055,6 +1130,122 @@ process_madvise_behavior_valid(int behavior)
}
}
/*
* Walk the vmas in range [start,end), and call the visit function on each one.
* The visit function will get start and end parameters that cover the overlap
* between the current vma and the original range. Any unmapped regions in the
* original range will result in this function returning -ENOMEM while still
* calling the visit function on all of the existing vmas in the range.
* Must be called with the mmap_lock held for reading or writing.
*/
static
int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned long arg,
int (*visit)(struct vm_area_struct *vma,
struct vm_area_struct **prev, unsigned long start,
unsigned long end, unsigned long arg))
{
struct vm_area_struct *vma;
struct vm_area_struct *prev;
unsigned long tmp;
int unmapped_error = 0;
/*
* If the interval [start,end) covers some unmapped address
* ranges, just ignore them, but return -ENOMEM at the end.
* - different from the way of handling in mlock etc.
*/
vma = find_vma_prev(mm, start, &prev);
if (vma && start > vma->vm_start)
prev = vma;
for (;;) {
int error;
/* Still start < end. */
if (!vma)
return -ENOMEM;
/* Here start < (end|vma->vm_end). */
if (start < vma->vm_start) {
unmapped_error = -ENOMEM;
start = vma->vm_start;
if (start >= end)
break;
}
/* Here vma->vm_start <= start < (end|vma->vm_end) */
tmp = vma->vm_end;
if (end < tmp)
tmp = end;
/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
error = visit(vma, &prev, start, tmp, arg);
if (error)
return error;
start = tmp;
if (prev && start < prev->vm_end)
start = prev->vm_end;
if (start >= end)
break;
if (prev)
vma = prev->vm_next;
else /* madvise_remove dropped mmap_lock */
vma = find_vma(mm, start);
}
return unmapped_error;
}
#ifdef CONFIG_ANON_VMA_NAME
static int madvise_vma_anon_name(struct vm_area_struct *vma,
struct vm_area_struct **prev,
unsigned long start, unsigned long end,
unsigned long anon_name)
{
int error;
/* Only anonymous mappings can be named */
if (vma->vm_file)
return -EBADF;
error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
(struct anon_vma_name *)anon_name);
/*
* madvise() returns EAGAIN if kernel resources, such as
* slab, are temporarily unavailable.
*/
if (error == -ENOMEM)
error = -EAGAIN;
return error;
}
int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
unsigned long len_in, struct anon_vma_name *anon_name)
{
unsigned long end;
unsigned long len;
if (start & ~PAGE_MASK)
return -EINVAL;
len = (len_in + ~PAGE_MASK) & PAGE_MASK;
/* Check to see whether len was rounded up from small -ve to zero */
if (len_in && !len)
return -EINVAL;
end = start + len;
if (end < start)
return -EINVAL;
if (end == start)
return 0;
return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
madvise_vma_anon_name);
}
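The length checks above mirror do_madvise(); a worked example of the rounding guard, assuming 4 KiB pages (the page size is an assumption made only for the arithmetic):

/*
 * With PAGE_SIZE = 4096, PAGE_MASK = ~0xfff, so ~PAGE_MASK = 0xfff:
 *   len_in = 5000              -> len = (5000 + 0xfff) & ~0xfff = 8192   (OK)
 *   len_in = (unsigned long)-100
 *                              -> len_in + 0xfff wraps around, & ~0xfff = 0,
 *                                 so "len_in && !len" catches the bogus
 *                                 small negative length and returns -EINVAL.
 */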
#endif /* CONFIG_ANON_VMA_NAME */
/*
* The madvise(2) system call.
*
@ -1127,10 +1318,8 @@ process_madvise_behavior_valid(int behavior)
*/
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
{
unsigned long end, tmp;
struct vm_area_struct *vma, *prev;
int unmapped_error = 0;
int error = -EINVAL;
unsigned long end;
int error;
int write;
size_t len;
struct blk_plug plug;
@ -1138,23 +1327,22 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
start = untagged_addr(start);
if (!madvise_behavior_valid(behavior))
return error;
return -EINVAL;
if (!PAGE_ALIGNED(start))
return error;
return -EINVAL;
len = PAGE_ALIGN(len_in);
/* Check to see whether len was rounded up from small -ve to zero */
if (len_in && !len)
return error;
return -EINVAL;
end = start + len;
if (end < start)
return error;
return -EINVAL;
error = 0;
if (end == start)
return error;
return 0;
#ifdef CONFIG_MEMORY_FAILURE
if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
@ -1169,51 +1357,9 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
mmap_read_lock(mm);
}
/*
* If the interval [start,end) covers some unmapped address
* ranges, just ignore them, but return -ENOMEM at the end.
* - different from the way of handling in mlock etc.
*/
vma = find_vma_prev(mm, start, &prev);
if (vma && start > vma->vm_start)
prev = vma;
blk_start_plug(&plug);
for (;;) {
/* Still start < end. */
error = -ENOMEM;
if (!vma)
goto out;
/* Here start < (end|vma->vm_end). */
if (start < vma->vm_start) {
unmapped_error = -ENOMEM;
start = vma->vm_start;
if (start >= end)
goto out;
}
/* Here vma->vm_start <= start < (end|vma->vm_end) */
tmp = vma->vm_end;
if (end < tmp)
tmp = end;
/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
error = madvise_vma(vma, &prev, start, tmp, behavior);
if (error)
goto out;
start = tmp;
if (prev && start < prev->vm_end)
start = prev->vm_end;
error = unmapped_error;
if (start >= end)
goto out;
if (prev)
vma = prev->vm_next;
else /* madvise_remove dropped mmap_lock */
vma = find_vma(mm, start);
}
out:
error = madvise_walk_vmas(mm, start, end, behavior,
madvise_vma_behavior);
blk_finish_plug(&plug);
if (write)
mmap_write_unlock(mm);
@ -1235,7 +1381,6 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
struct iovec iovstack[UIO_FASTIOV], iovec;
struct iovec *iov = iovstack;
struct iov_iter iter;
struct pid *pid;
struct task_struct *task;
struct mm_struct *mm;
size_t total_len;
@ -1250,18 +1395,12 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
if (ret < 0)
goto out;
pid = pidfd_get_pid(pidfd, &f_flags);
if (IS_ERR(pid)) {
ret = PTR_ERR(pid);
task = pidfd_get_task(pidfd, &f_flags);
if (IS_ERR(task)) {
ret = PTR_ERR(task);
goto free_iov;
}
task = get_pid_task(pid, PIDTYPE_PID);
if (!task) {
ret = -ESRCH;
goto put_pid;
}
if (!process_madvise_behavior_valid(behavior)) {
ret = -EINVAL;
goto release_task;
@ -1301,8 +1440,6 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
mmput(mm);
release_task:
put_task_struct(task);
put_pid:
put_pid(pid);
free_iov:
kfree(iov);
out:

View File

@ -3,6 +3,7 @@
#include <linux/hugetlb.h>
#include <linux/bitops.h>
#include <linux/mmu_notifier.h>
#include <linux/mm_inline.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

View File

@ -287,7 +287,7 @@ static phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
{
/* pump up @end */
if (end == MEMBLOCK_ALLOC_ACCESSIBLE ||
end == MEMBLOCK_ALLOC_KASAN)
end == MEMBLOCK_ALLOC_NOLEAKTRACE)
end = memblock.current_limit;
/* avoid allocating the first page */
@ -369,7 +369,7 @@ void __init memblock_discard(void)
if (memblock_reserved_in_slab)
kfree(memblock.reserved.regions);
else
__memblock_free_late(addr, size);
memblock_free_late(addr, size);
}
if (memblock.memory.regions != memblock_memory_init_regions) {
@ -379,7 +379,7 @@ void __init memblock_discard(void)
if (memblock_memory_in_slab)
kfree(memblock.memory.regions);
else
__memblock_free_late(addr, size);
memblock_free_late(addr, size);
}
memblock_memory = NULL;
@ -478,7 +478,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
kfree(old_array);
else if (old_array != memblock_memory_init_regions &&
old_array != memblock_reserved_init_regions)
memblock_free_ptr(old_array, old_alloc_size);
memblock_free(old_array, old_alloc_size);
/*
* Reserve the new array if that comes from the memblock. Otherwise, we
@ -661,6 +661,7 @@ static int __init_memblock memblock_add_range(struct memblock_type *type,
* @base: base address of the new region
* @size: size of the new region
* @nid: nid of the new region
* @flags: flags of the new region
*
* Add new memblock region [@base, @base + @size) to the "memory"
* type. See memblock_add_range() description for mode details
@ -669,14 +670,14 @@ static int __init_memblock memblock_add_range(struct memblock_type *type,
* 0 on success, -errno on failure.
*/
int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
int nid)
int nid, enum memblock_flags flags)
{
phys_addr_t end = base + size - 1;
memblock_dbg("%s: [%pa-%pa] nid=%d %pS\n", __func__,
&base, &end, nid, (void *)_RET_IP_);
memblock_dbg("%s: [%pa-%pa] nid=%d flags=%x %pS\n", __func__,
&base, &end, nid, flags, (void *)_RET_IP_);
return memblock_add_range(&memblock.memory, base, size, nid, 0);
return memblock_add_range(&memblock.memory, base, size, nid, flags);
}
/**
@ -802,28 +803,28 @@ int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
}
/**
* memblock_free_ptr - free boot memory allocation
* memblock_free - free boot memory allocation
* @ptr: starting address of the boot memory allocation
* @size: size of the boot memory block in bytes
*
* Free boot memory block previously allocated by memblock_alloc_xx() API.
* The freeing memory will not be released to the buddy allocator.
*/
void __init_memblock memblock_free_ptr(void *ptr, size_t size)
void __init_memblock memblock_free(void *ptr, size_t size)
{
if (ptr)
memblock_free(__pa(ptr), size);
memblock_phys_free(__pa(ptr), size);
}
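A short usage sketch of the renamed pair, illustration only: the caller, sizes and early-boot context are hypothetical, while the function names follow the rename in this hunk.

/* Virtual-address allocation pairs with memblock_free(): */
void *buf = memblock_alloc(SZ_4K, SZ_4K);
if (buf)
	memblock_free(buf, SZ_4K);          /* converts internally via __pa() */

/* Physical-address allocation pairs with memblock_phys_free(): */
phys_addr_t pa = memblock_phys_alloc(SZ_4K, SZ_4K);
if (pa)
	memblock_phys_free(pa, SZ_4K);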
/**
* memblock_free - free boot memory block
* memblock_phys_free - free boot memory block
* @base: phys starting address of the boot memory block
* @size: size of the boot memory block in bytes
*
* Free boot memory block previously allocated by memblock_alloc_xx() API.
* The freeing memory will not be released to the buddy allocator.
*/
int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
int __init_memblock memblock_phys_free(phys_addr_t base, phys_addr_t size)
{
phys_addr_t end = base + size - 1;
@ -987,6 +988,10 @@ static bool should_skip_region(struct memblock_type *type,
if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m))
return true;
/* skip driver-managed memory unless we were asked for it explicitly */
if (!(flags & MEMBLOCK_DRIVER_MANAGED) && memblock_is_driver_managed(m))
return true;
return false;
}
@ -1388,8 +1393,11 @@ phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
return 0;
done:
/* Skip kmemleak for kasan_init() due to high volume. */
if (end != MEMBLOCK_ALLOC_KASAN)
/*
* Skip kmemleak for those places like kasan_init() and
* early_pgtable_alloc() due to high volume.
*/
if (end != MEMBLOCK_ALLOC_NOLEAKTRACE)
/*
* The min_count is set to 0 so that memblock allocated
* blocks are never reported as leaks. This is because many
@ -1595,7 +1603,7 @@ void * __init memblock_alloc_try_nid(
}
/**
* __memblock_free_late - free pages directly to buddy allocator
* memblock_free_late - free pages directly to buddy allocator
* @base: phys starting address of the boot memory block
* @size: size of the boot memory block in bytes
*
@ -1603,7 +1611,7 @@ void * __init memblock_alloc_try_nid(
* down, but we are still initializing the system. Pages are released directly
* to the buddy allocator.
*/
void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
void __init memblock_free_late(phys_addr_t base, phys_addr_t size)
{
phys_addr_t cursor, end;
@ -1943,7 +1951,7 @@ static void __init free_memmap(unsigned long start_pfn, unsigned long end_pfn)
* memmap array.
*/
if (pg < pgend)
memblock_free(pg, pgend - pg);
memblock_phys_free(pg, pgend - pg);
}
/*

File diff suppressed because it is too large

View File

@ -313,9 +313,7 @@ SYSCALL_DEFINE2(memfd_create,
}
if (flags & MFD_HUGETLB) {
struct ucounts *ucounts = NULL;
file = hugetlb_file_setup(name, 0, VM_NORESERVE, &ucounts,
file = hugetlb_file_setup(name, 0, VM_NORESERVE,
HUGETLB_ANONHUGE_INODE,
(flags >> MFD_HUGE_SHIFT) &
MFD_HUGE_MASK);

View File

@ -39,6 +39,7 @@
#include <linux/kernel-page-flags.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/dax.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/export.h>
@ -57,6 +58,7 @@
#include <linux/ratelimit.h>
#include <linux/page-isolation.h>
#include <linux/pagewalk.h>
#include <linux/shmem_fs.h>
#include "internal.h"
#include "ras/ras_event.h"
@ -673,7 +675,7 @@ static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
#define hwpoison_hugetlb_range NULL
#endif
static struct mm_walk_ops hwp_walk_ops = {
static const struct mm_walk_ops hwp_walk_ops = {
.pmd_entry = hwpoison_pte_range,
.hugetlb_entry = hwpoison_hugetlb_range,
};
@ -721,7 +723,6 @@ static const char * const action_page_types[] = {
[MF_MSG_KERNEL_HIGH_ORDER] = "high-order kernel page",
[MF_MSG_SLAB] = "kernel slab page",
[MF_MSG_DIFFERENT_COMPOUND] = "different compound page after locking",
[MF_MSG_POISONED_HUGE] = "huge page already hardware poisoned",
[MF_MSG_HUGE] = "huge page",
[MF_MSG_FREE_HUGE] = "free huge page",
[MF_MSG_NON_PMD_HUGE] = "non-pmd-sized huge page",
@ -736,7 +737,6 @@ static const char * const action_page_types[] = {
[MF_MSG_CLEAN_LRU] = "clean LRU page",
[MF_MSG_TRUNCATED_LRU] = "already truncated LRU page",
[MF_MSG_BUDDY] = "free buddy page",
[MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)",
[MF_MSG_DAX] = "dax page",
[MF_MSG_UNSPLIT_THP] = "unsplit thp",
[MF_MSG_UNKNOWN] = "unknown page",
@ -762,7 +762,7 @@ static int delete_from_lru_cache(struct page *p)
* Poisoned page might never drop its ref count to 0 so we have
* to uncharge it manually from its memcg.
*/
mem_cgroup_uncharge(p);
mem_cgroup_uncharge(page_folio(p));
/*
* drop the page count elevated by isolate_lru_page()
@ -806,12 +806,44 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
return ret;
}
struct page_state {
unsigned long mask;
unsigned long res;
enum mf_action_page_type type;
/* Callback ->action() has to unlock the relevant page inside it. */
int (*action)(struct page_state *ps, struct page *p);
};
/*
* Return true if page is still referenced by others, otherwise return
* false.
*
* The extra_pins is true when one extra refcount is expected.
*/
static bool has_extra_refcount(struct page_state *ps, struct page *p,
bool extra_pins)
{
int count = page_count(p) - 1;
if (extra_pins)
count -= 1;
if (count > 0) {
pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
page_to_pfn(p), action_page_types[ps->type], count);
return true;
}
return false;
}
/*
* Error hit kernel page.
* Do nothing, try to be lucky and not touch this instead. For a few cases we
* could be more sophisticated.
*/
static int me_kernel(struct page *p, unsigned long pfn)
static int me_kernel(struct page_state *ps, struct page *p)
{
unlock_page(p);
return MF_IGNORED;
@ -820,9 +852,9 @@ static int me_kernel(struct page *p, unsigned long pfn)
/*
* Page in unknown state. Do nothing.
*/
static int me_unknown(struct page *p, unsigned long pfn)
static int me_unknown(struct page_state *ps, struct page *p)
{
pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
pr_err("Memory failure: %#lx: Unknown page state\n", page_to_pfn(p));
unlock_page(p);
return MF_FAILED;
}
@ -830,10 +862,11 @@ static int me_unknown(struct page *p, unsigned long pfn)
/*
* Clean (or cleaned) page cache page.
*/
static int me_pagecache_clean(struct page *p, unsigned long pfn)
static int me_pagecache_clean(struct page_state *ps, struct page *p)
{
int ret;
struct address_space *mapping;
bool extra_pins;
delete_from_lru_cache(p);
@ -862,14 +895,24 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
goto out;
}
/*
* The shmem page is kept in page cache instead of truncating
* so is expected to have an extra refcount after error-handling.
*/
extra_pins = shmem_mapping(mapping);
/*
* Truncation is a bit tricky. Enable it per file system for now.
*
* Open: to take i_rwsem or not for this? Right now we don't.
*/
ret = truncate_error_page(p, pfn, mapping);
ret = truncate_error_page(p, page_to_pfn(p), mapping);
if (has_extra_refcount(ps, p, extra_pins))
ret = MF_FAILED;
out:
unlock_page(p);
return ret;
}
@ -878,7 +921,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
* Issues: when the error hit a hole page the error is not properly
* propagated.
*/
static int me_pagecache_dirty(struct page *p, unsigned long pfn)
static int me_pagecache_dirty(struct page_state *ps, struct page *p)
{
struct address_space *mapping = page_mapping(p);
@ -922,7 +965,7 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
mapping_set_error(mapping, -EIO);
}
return me_pagecache_clean(p, pfn);
return me_pagecache_clean(ps, p);
}
/*
@ -944,9 +987,10 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
* Clean swap cache pages can be directly isolated. A later page fault will
* bring in the known good data from disk.
*/
static int me_swapcache_dirty(struct page *p, unsigned long pfn)
static int me_swapcache_dirty(struct page_state *ps, struct page *p)
{
int ret;
bool extra_pins = false;
ClearPageDirty(p);
/* Trigger EIO in shmem: */
@ -954,10 +998,17 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn)
ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED;
unlock_page(p);
if (ret == MF_DELAYED)
extra_pins = true;
if (has_extra_refcount(ps, p, extra_pins))
ret = MF_FAILED;
return ret;
}
static int me_swapcache_clean(struct page *p, unsigned long pfn)
static int me_swapcache_clean(struct page_state *ps, struct page *p)
{
int ret;
@ -965,6 +1016,10 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED;
unlock_page(p);
if (has_extra_refcount(ps, p, false))
ret = MF_FAILED;
return ret;
}
@ -974,7 +1029,7 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
* - Error on hugepage is contained in hugepage unit (not in raw page unit.)
* To narrow down kill region to one page, we need to break up pmd.
*/
static int me_huge_page(struct page *p, unsigned long pfn)
static int me_huge_page(struct page_state *ps, struct page *p)
{
int res;
struct page *hpage = compound_head(p);
@ -985,7 +1040,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
mapping = page_mapping(hpage);
if (mapping) {
res = truncate_error_page(hpage, pfn, mapping);
res = truncate_error_page(hpage, page_to_pfn(p), mapping);
unlock_page(hpage);
} else {
res = MF_FAILED;
@ -1003,6 +1058,9 @@ static int me_huge_page(struct page *p, unsigned long pfn)
}
}
if (has_extra_refcount(ps, p, false))
res = MF_FAILED;
return res;
}
@ -1028,14 +1086,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
#define slab (1UL << PG_slab)
#define reserved (1UL << PG_reserved)
static struct page_state {
unsigned long mask;
unsigned long res;
enum mf_action_page_type type;
/* Callback ->action() has to unlock the relevant page inside it. */
int (*action)(struct page *p, unsigned long pfn);
} error_states[] = {
static struct page_state error_states[] = {
{ reserved, reserved, MF_MSG_KERNEL, me_kernel },
/*
* free pages are specially detected outside this table:
@ -1095,19 +1146,10 @@ static int page_action(struct page_state *ps, struct page *p,
unsigned long pfn)
{
int result;
int count;
/* page p should be unlocked after returning from ps->action(). */
result = ps->action(p, pfn);
result = ps->action(ps, p);
count = page_count(p) - 1;
if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
count--;
if (count > 0) {
pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
pfn, action_page_types[ps->type], count);
result = MF_FAILED;
}
action_result(pfn, ps->type, result);
/* Could do more checks here if page looks ok */
@ -1118,6 +1160,22 @@ static int page_action(struct page_state *ps, struct page *p,
return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
}
static inline bool PageHWPoisonTakenOff(struct page *page)
{
return PageHWPoison(page) && page_private(page) == MAGIC_HWPOISON;
}
void SetPageHWPoisonTakenOff(struct page *page)
{
set_page_private(page, MAGIC_HWPOISON);
}
void ClearPageHWPoisonTakenOff(struct page *page)
{
if (PageHWPoison(page))
set_page_private(page, 0);
}
/*
* Return true if a page type of a given page is supported by hwpoison
* mechanism (while handling could fail), otherwise false. This function
@ -1220,6 +1278,27 @@ static int get_any_page(struct page *p, unsigned long flags)
return ret;
}
static int __get_unpoison_page(struct page *page)
{
struct page *head = compound_head(page);
int ret = 0;
bool hugetlb = false;
ret = get_hwpoison_huge_page(head, &hugetlb);
if (hugetlb)
return ret;
/*
* PageHWPoisonTakenOff pages are not only marked as PG_hwpoison,
* but also isolated from buddy freelist, so need to identify the
* state and have to cancel both operations to unpoison.
*/
if (PageHWPoisonTakenOff(page))
return -EHWPOISON;
return get_page_unless_zero(page) ? 1 : 0;
}
/**
* get_hwpoison_page() - Get refcount for memory error handling
* @p: Raw error page (hit by memory error)
@ -1227,7 +1306,7 @@ static int get_any_page(struct page *p, unsigned long flags)
*
* get_hwpoison_page() takes a page refcount of an error page to handle memory
* error on it, after checking that the error page is in a well-defined state
* (defined as a page-type we can successfully handle the memor error on it,
* (defined as a page-type we can successfully handle the memory error on it,
* such as LRU page and hugetlb page).
*
* Memory error handling could be triggered at any time on any type of page,
@ -1236,18 +1315,26 @@ static int get_any_page(struct page *p, unsigned long flags)
* extra care for the error page's state (as done in __get_hwpoison_page()),
* and has some retry logic in get_any_page().
*
* When called from unpoison_memory(), the caller should already ensure that
* the given page has PG_hwpoison. So it's never reused for other page
* allocations, and __get_unpoison_page() never races with them.
*
* Return: 0 on failure,
* 1 on success for in-use pages in a well-defined state,
* -EIO for pages on which we can not handle memory errors,
* -EBUSY when get_hwpoison_page() has raced with page lifecycle
* operations like allocation and free.
* operations like allocation and free,
* -EHWPOISON when the page is hwpoisoned and taken off from buddy.
*/
static int get_hwpoison_page(struct page *p, unsigned long flags)
{
int ret;
zone_pcp_disable(page_zone(p));
ret = get_any_page(p, flags);
if (flags & MF_UNPOISON)
ret = __get_unpoison_page(p);
else
ret = get_any_page(p, flags);
zone_pcp_enable(page_zone(p));
return ret;
@ -1400,14 +1487,11 @@ static int identify_page_state(unsigned long pfn, struct page *p,
static int try_to_split_thp_page(struct page *page, const char *msg)
{
lock_page(page);
if (!PageAnon(page) || unlikely(split_huge_page(page))) {
if (unlikely(split_huge_page(page))) {
unsigned long pfn = page_to_pfn(page);
unlock_page(page);
if (!PageAnon(page))
pr_info("%s: %#lx: non anonymous thp\n", msg, pfn);
else
pr_info("%s: %#lx: thp split failed\n", msg, pfn);
pr_info("%s: %#lx: thp split failed\n", msg, pfn);
put_page(page);
return -EBUSY;
}
@ -1461,14 +1545,6 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
lock_page(head);
page_flags = head->flags;
if (!PageHWPoison(head)) {
pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
num_poisoned_pages_dec();
unlock_page(head);
put_page(head);
return 0;
}
/*
* TODO: hwpoison for pud-sized hugetlb doesn't work right now, so
* simply disable it. In order to make it work properly, we need
@ -1519,6 +1595,12 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
goto out;
}
/*
* Pages instantiated by device-dax (not filesystem-dax)
* may be compound pages.
*/
page = compound_head(page);
/*
* Prevent the inode from being freed while we are interrogating
* the address_space, typically this would be handled by
@ -1582,6 +1664,8 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
return rc;
}
static DEFINE_MUTEX(mf_mutex);
/**
* memory_failure - Handle memory failure of a page.
* @pfn: Page Number of the corrupted page
@ -1608,26 +1692,32 @@ int memory_failure(unsigned long pfn, int flags)
int res = 0;
unsigned long page_flags;
bool retry = true;
static DEFINE_MUTEX(mf_mutex);
if (!sysctl_memory_failure_recovery)
panic("Memory failure on page %lx", pfn);
mutex_lock(&mf_mutex);
p = pfn_to_online_page(pfn);
if (!p) {
res = arch_memory_failure(pfn, flags);
if (res == 0)
goto unlock_mutex;
if (pfn_valid(pfn)) {
pgmap = get_dev_pagemap(pfn, NULL);
if (pgmap)
return memory_failure_dev_pagemap(pfn, flags,
pgmap);
if (pgmap) {
res = memory_failure_dev_pagemap(pfn, flags,
pgmap);
goto unlock_mutex;
}
}
pr_err("Memory failure: %#lx: memory outside kernel control\n",
pfn);
return -ENXIO;
res = -ENXIO;
goto unlock_mutex;
}
mutex_lock(&mf_mutex);
try_again:
if (PageHuge(p)) {
res = memory_failure_hugetlb(pfn, flags);
@ -1742,16 +1832,6 @@ int memory_failure(unsigned long pfn, int flags)
*/
page_flags = p->flags;
/*
* unpoison always clear PG_hwpoison inside page lock
*/
if (!PageHWPoison(p)) {
pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
num_poisoned_pages_dec();
unlock_page(p);
put_page(p);
goto unlock_mutex;
}
if (hwpoison_filter(p)) {
if (TestClearPageHWPoison(p))
num_poisoned_pages_dec();
@ -1915,6 +1995,28 @@ core_initcall(memory_failure_init);
pr_info(fmt, pfn); \
})
static inline int clear_page_hwpoison(struct ratelimit_state *rs, struct page *p)
{
if (TestClearPageHWPoison(p)) {
unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
page_to_pfn(p), rs);
num_poisoned_pages_dec();
return 1;
}
return 0;
}
static inline int unpoison_taken_off_page(struct ratelimit_state *rs,
struct page *p)
{
if (put_page_back_buddy(p)) {
unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
page_to_pfn(p), rs);
return 0;
}
return -EBUSY;
}
/**
* unpoison_memory - Unpoison a previously poisoned page
* @pfn: Page number of the to be unpoisoned page
@ -1931,8 +2033,7 @@ int unpoison_memory(unsigned long pfn)
{
struct page *page;
struct page *p;
int freeit = 0;
unsigned long flags = 0;
int ret = -EBUSY;
static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
@ -1942,69 +2043,60 @@ int unpoison_memory(unsigned long pfn)
p = pfn_to_page(pfn);
page = compound_head(p);
mutex_lock(&mf_mutex);
if (!PageHWPoison(p)) {
unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
pfn, &unpoison_rs);
return 0;
goto unlock_mutex;
}
if (page_count(page) > 1) {
unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n",
pfn, &unpoison_rs);
return 0;
goto unlock_mutex;
}
if (page_mapped(page)) {
unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
pfn, &unpoison_rs);
return 0;
goto unlock_mutex;
}
if (page_mapping(page)) {
unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n",
pfn, &unpoison_rs);
return 0;
goto unlock_mutex;
}
/*
* unpoison_memory() can encounter thp only when the thp is being
* worked by memory_failure() and the page lock is not held yet.
* In such case, we yield to memory_failure() and make unpoison fail.
*/
if (!PageHuge(page) && PageTransHuge(page)) {
unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n",
pfn, &unpoison_rs);
return 0;
}
if (PageSlab(page) || PageTable(page))
goto unlock_mutex;
if (!get_hwpoison_page(p, flags)) {
if (TestClearPageHWPoison(p))
num_poisoned_pages_dec();
unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n",
pfn, &unpoison_rs);
return 0;
}
ret = get_hwpoison_page(p, MF_UNPOISON);
if (!ret) {
if (clear_page_hwpoison(&unpoison_rs, page))
ret = 0;
else
ret = -EBUSY;
} else if (ret < 0) {
if (ret == -EHWPOISON) {
ret = unpoison_taken_off_page(&unpoison_rs, p);
} else
unpoison_pr_info("Unpoison: failed to grab page %#lx\n",
pfn, &unpoison_rs);
} else {
int freeit = clear_page_hwpoison(&unpoison_rs, p);
lock_page(page);
/*
* This test is racy because PG_hwpoison is set outside of page lock.
* That's acceptable because that won't trigger kernel panic. Instead,
* the PG_hwpoison page will be caught and isolated on the entrance to
* the free buddy page pool.
*/
if (TestClearPageHWPoison(page)) {
unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
pfn, &unpoison_rs);
num_poisoned_pages_dec();
freeit = 1;
}
unlock_page(page);
put_page(page);
if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
put_page(page);
if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) {
put_page(page);
ret = 0;
}
}
return 0;
unlock_mutex:
mutex_unlock(&mf_mutex);
return ret;
}
EXPORT_SYMBOL(unpoison_memory);
@ -2104,14 +2196,14 @@ static int __soft_offline_page(struct page *page)
if (!list_empty(&pagelist))
putback_movable_pages(&pagelist);
pr_info("soft offline: %#lx: %s migration failed %d, type %lx (%pGp)\n",
pfn, msg_page[huge], ret, page->flags, &page->flags);
pr_info("soft offline: %#lx: %s migration failed %d, type %pGp\n",
pfn, msg_page[huge], ret, &page->flags);
if (ret > 0)
ret = -EBUSY;
}
} else {
pr_info("soft offline: %#lx: %s isolation failed, page count %d, type %lx (%pGp)\n",
pfn, msg_page[huge], page_count(page), page->flags, &page->flags);
pr_info("soft offline: %#lx: %s isolation failed, page count %d, type %pGp\n",
pfn, msg_page[huge], page_count(page), &page->flags);
ret = -EBUSY;
}
return ret;
@ -2185,9 +2277,12 @@ int soft_offline_page(unsigned long pfn, int flags)
return -EIO;
}
mutex_lock(&mf_mutex);
if (PageHWPoison(page)) {
pr_info("%s: %#lx page already poisoned\n", __func__, pfn);
put_ref_page(ref_page);
mutex_unlock(&mf_mutex);
return 0;
}
@ -2206,5 +2301,7 @@ int soft_offline_page(unsigned long pfn, int flags)
}
}
mutex_unlock(&mf_mutex);
return ret;
}
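For reference only, not part of this change: the soft-offline path above is normally reached from userspace through madvise(MADV_SOFT_OFFLINE), which requires CAP_SYS_ADMIN and a kernel built with CONFIG_MEMORY_FAILURE. A minimal sketch (the MADV_SOFT_OFFLINE value is copied from the uapi headers in case libc does not define it):

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_SOFT_OFFLINE
#define MADV_SOFT_OFFLINE 101	/* from <asm-generic/mman-common.h> */
#endif

int main(void)
{
	long psize = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, psize, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(p, 0xaa, psize);		/* make sure the page is populated */

	/* Ask the kernel to migrate the contents and take the backing page offline. */
	if (madvise(p, psize, MADV_SOFT_OFFLINE) != 0)
		perror("madvise(MADV_SOFT_OFFLINE)");	/* EPERM without CAP_SYS_ADMIN */

	munmap(p, psize);
	return 0;
}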

View File

@ -41,6 +41,7 @@
#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>
@ -433,35 +434,39 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
}
}
void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
{
spinlock_t *ptl = pmd_lock(mm, pmd);
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
mm_inc_nr_ptes(mm);
/*
* Ensure all pte setup (eg. pte page lock and page clearing) are
* visible before the pte is made visible to other CPUs by being
* put into page tables.
*
* The other side of the story is the pointer chasing in the page
* table walking code (when walking the page table without locking;
* ie. most of the time). Fortunately, these data accesses consist
* of a chain of data-dependent loads, meaning most CPUs (alpha
* being the notable exception) will already guarantee loads are
* seen in-order. See the alpha page table accessors for the
* smp_rmb() barriers in page table walking code.
*/
smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
pmd_populate(mm, pmd, *pte);
*pte = NULL;
}
spin_unlock(ptl);
}
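The ordering comment in pmd_install() describes a publish pattern: fully initialise the new page table, order those stores, then write the pointer that makes the table reachable to lockless walkers. As an illustration only, the same guarantee expressed with userspace C11 atomics (an analogy, not the kernel primitives; the release store stands in for the smp_wmb()/pmd_populate() pair, and the acquire load on the reader side corresponds to the dependent loads discussed for Alpha):

#include <stdatomic.h>
#include <stdlib.h>

struct table { int entries[512]; };

static _Atomic(struct table *) slot;	/* plays the role of the pmd entry */

/* Publisher: no initialising store may be reordered past the release store,
 * so any reader that sees the pointer also sees the initialised contents. */
void publish(void)
{
	struct table *t = calloc(1, sizeof(*t));

	if (!t)
		return;
	t->entries[0] = 42;			/* "pte setup" */
	atomic_store_explicit(&slot, t, memory_order_release);
}

/* Lockless reader, analogous to a page-table walk without locks. */
int read_first(void)
{
	struct table *t = atomic_load_explicit(&slot, memory_order_acquire);

	return t ? t->entries[0] : -1;
}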
int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
{
spinlock_t *ptl;
pgtable_t new = pte_alloc_one(mm);
if (!new)
return -ENOMEM;
/*
* Ensure all pte setup (eg. pte page lock and page clearing) are
* visible before the pte is made visible to other CPUs by being
* put into page tables.
*
* The other side of the story is the pointer chasing in the page
* table walking code (when walking the page table without locking;
* ie. most of the time). Fortunately, these data accesses consist
* of a chain of data-dependent loads, meaning most CPUs (alpha
* being the notable exception) will already guarantee loads are
* seen in-order. See the alpha page table accessors for the
* smp_rmb() barriers in page table walking code.
*/
smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
ptl = pmd_lock(mm, pmd);
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
mm_inc_nr_ptes(mm);
pmd_populate(mm, pmd, new);
new = NULL;
}
spin_unlock(ptl);
pmd_install(mm, pmd, &new);
if (new)
pte_free(mm, new);
return 0;
@ -473,10 +478,9 @@ int __pte_alloc_kernel(pmd_t *pmd)
if (!new)
return -ENOMEM;
smp_wmb(); /* See comment in __pte_alloc */
spin_lock(&init_mm.page_table_lock);
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
smp_wmb(); /* See comment in pmd_install() */
pmd_populate_kernel(&init_mm, pmd, new);
new = NULL;
}
@ -716,8 +720,6 @@ static void restore_exclusive_pte(struct vm_area_struct *vma,
else if (is_writable_device_exclusive_entry(entry))
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
set_pte_at(vma->vm_mm, address, ptep, pte);
/*
* No need to take a page reference as one was already
* created when the swap entry was made.
@ -731,6 +733,8 @@ static void restore_exclusive_pte(struct vm_area_struct *vma,
*/
WARN_ON_ONCE(!PageAnon(page));
set_pte_at(vma->vm_mm, address, ptep, pte);
if (vma->vm_flags & VM_LOCKED)
mlock_vma_page(page);
@ -990,7 +994,7 @@ page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
if (!new_page)
return NULL;
if (mem_cgroup_charge(new_page, src_mm, GFP_KERNEL)) {
if (mem_cgroup_charge(page_folio(new_page), src_mm, GFP_KERNEL)) {
put_page(new_page);
return NULL;
}
@ -1301,6 +1305,28 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
return ret;
}
/*
* Parameter block passed down to zap_pte_range in exceptional cases.
*/
struct zap_details {
struct address_space *zap_mapping; /* Check page->mapping if set */
struct folio *single_folio; /* Locked folio to be unmapped */
};
/*
* We set details->zap_mapping when we want to unmap shared but keep private
* pages. Return true if skip zapping this page, false otherwise.
*/
static inline bool
zap_skip_check_mapping(struct zap_details *details, struct page *page)
{
if (!details || !page)
return false;
return details->zap_mapping &&
(details->zap_mapping != page_rmapping(page));
}
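As a concrete case of the rule encoded above: unmap_mapping_pages() with even_cows == false stores the file's address_space in zap_mapping, so a shared file-backed pte (whose page_rmapping() is that mapping) gets zapped, while a private COW copy (whose rmapping is its anon_vma) compares unequal and is skipped. That is the "unmap shared but keep private pages" behaviour described by the inline comments this helper replaces.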
static unsigned long zap_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
@ -1333,16 +1359,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
struct page *page;
page = vm_normal_page(vma, addr, ptent);
if (unlikely(details) && page) {
/*
* unmap_shared_mapping_pages() wants to
* invalidate cache without truncating:
* unmap shared but keep private pages.
*/
if (details->check_mapping &&
details->check_mapping != page_rmapping(page))
continue;
}
if (unlikely(zap_skip_check_mapping(details, page)))
continue;
ptent = ptep_get_and_clear_full(mm, addr, pte,
tlb->fullmm);
tlb_remove_tlb_entry(tlb, pte, addr);
@ -1375,17 +1393,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
is_device_exclusive_entry(entry)) {
struct page *page = pfn_swap_entry_to_page(entry);
if (unlikely(details && details->check_mapping)) {
/*
* unmap_shared_mapping_pages() wants to
* invalidate cache without truncating:
* unmap shared but keep private pages.
*/
if (details->check_mapping !=
page_rmapping(page))
continue;
}
if (unlikely(zap_skip_check_mapping(details, page)))
continue;
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
rss[mm_counter(page)]--;
@ -1457,8 +1466,8 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
else if (zap_huge_pmd(tlb, vma, pmd, addr))
goto next;
/* fall through */
} else if (details && details->single_page &&
PageTransCompound(details->single_page) &&
} else if (details && details->single_folio &&
folio_test_pmd_mappable(details->single_folio) &&
next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
/*
@ -2724,19 +2733,19 @@ EXPORT_SYMBOL_GPL(apply_to_existing_page_range);
* proceeding (but do_wp_page is only called after already making such a check;
* and do_anonymous_page can safely check later on).
*/
static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
pte_t *page_table, pte_t orig_pte)
static inline int pte_unmap_same(struct vm_fault *vmf)
{
int same = 1;
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
if (sizeof(pte_t) > sizeof(unsigned long)) {
spinlock_t *ptl = pte_lockptr(mm, pmd);
spinlock_t *ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
spin_lock(ptl);
same = pte_same(*page_table, orig_pte);
same = pte_same(*vmf->pte, vmf->orig_pte);
spin_unlock(ptl);
}
#endif
pte_unmap(page_table);
pte_unmap(vmf->pte);
vmf->pte = NULL;
return same;
}
@ -3019,7 +3028,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
}
}
if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
if (mem_cgroup_charge(page_folio(new_page), mm, GFP_KERNEL))
goto oom_free_new;
cgroup_throttle_swaprate(new_page, GFP_KERNEL);
@ -3321,20 +3330,20 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma,
}
static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
pgoff_t first_index,
pgoff_t last_index,
struct zap_details *details)
{
struct vm_area_struct *vma;
pgoff_t vba, vea, zba, zea;
vma_interval_tree_foreach(vma, root,
details->first_index, details->last_index) {
vma_interval_tree_foreach(vma, root, first_index, last_index) {
vba = vma->vm_pgoff;
vea = vba + vma_pages(vma) - 1;
zba = details->first_index;
zba = first_index;
if (zba < vba)
zba = vba;
zea = details->last_index;
zea = last_index;
if (zea > vea)
zea = vea;
@ -3346,32 +3355,35 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
}
/**
* unmap_mapping_page() - Unmap single page from processes.
* @page: The locked page to be unmapped.
* unmap_mapping_folio() - Unmap single folio from processes.
* @folio: The locked folio to be unmapped.
*
* Unmap this page from any userspace process which still has it mmaped.
* Unmap this folio from any userspace process which still has it mmaped.
* Typically, for efficiency, the range of nearby pages has already been
* unmapped by unmap_mapping_pages() or unmap_mapping_range(). But once
* truncation or invalidation holds the lock on a page, it may find that
* the page has been remapped again: and then uses unmap_mapping_page()
* truncation or invalidation holds the lock on a folio, it may find that
* the page has been remapped again: and then uses unmap_mapping_folio()
* to unmap it finally.
*/
void unmap_mapping_page(struct page *page)
void unmap_mapping_folio(struct folio *folio)
{
struct address_space *mapping = page->mapping;
struct address_space *mapping = folio->mapping;
struct zap_details details = { };
pgoff_t first_index;
pgoff_t last_index;
VM_BUG_ON(!PageLocked(page));
VM_BUG_ON(PageTail(page));
VM_BUG_ON(!folio_test_locked(folio));
details.check_mapping = mapping;
details.first_index = page->index;
details.last_index = page->index + thp_nr_pages(page) - 1;
details.single_page = page;
first_index = folio->index;
last_index = folio->index + folio_nr_pages(folio) - 1;
details.zap_mapping = mapping;
details.single_folio = folio;
i_mmap_lock_write(mapping);
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
unmap_mapping_range_tree(&mapping->i_mmap, &details);
unmap_mapping_range_tree(&mapping->i_mmap, first_index,
last_index, &details);
i_mmap_unlock_write(mapping);
}
@ -3391,16 +3403,17 @@ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
pgoff_t nr, bool even_cows)
{
struct zap_details details = { };
pgoff_t first_index = start;
pgoff_t last_index = start + nr - 1;
details.check_mapping = even_cows ? NULL : mapping;
details.first_index = start;
details.last_index = start + nr - 1;
if (details.last_index < details.first_index)
details.last_index = ULONG_MAX;
details.zap_mapping = even_cows ? NULL : mapping;
if (last_index < first_index)
last_index = ULONG_MAX;
i_mmap_lock_write(mapping);
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
unmap_mapping_range_tree(&mapping->i_mmap, &details);
unmap_mapping_range_tree(&mapping->i_mmap, first_index,
last_index, &details);
i_mmap_unlock_write(mapping);
}
EXPORT_SYMBOL_GPL(unmap_mapping_pages);
@ -3488,7 +3501,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
vm_fault_t ret = 0;
void *shadow = NULL;
if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
if (!pte_unmap_same(vmf))
goto out;
entry = pte_to_swp_entry(vmf->orig_pte);
@ -3516,7 +3529,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (unlikely(!si))
goto out;
delayacct_set_flag(current, DELAYACCT_PF_SWAPIN);
page = lookup_swap_cache(entry, vma, vmf->address);
swapcache = page;
@ -3539,7 +3551,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
shadow = get_shadow_from_swap_cache(entry);
if (shadow)
workingset_refault(page, shadow);
workingset_refault(page_folio(page),
shadow);
lru_cache_add(page);
@ -3563,7 +3576,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
vmf->address, &vmf->ptl);
if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
ret = VM_FAULT_OOM;
delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
goto unlock;
}
@ -3577,13 +3589,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
* owner processes (which may be unknown at hwpoison time)
*/
ret = VM_FAULT_HWPOISON;
delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
goto out_release;
}
locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
if (!locked) {
ret |= VM_FAULT_RETRY;
goto out_release;
@ -3634,7 +3644,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
pte = mk_pte(page, vma->vm_page_prot);
if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
vmf->flags &= ~FAULT_FLAG_WRITE;
ret |= VM_FAULT_WRITE;
@ -3647,8 +3657,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
pte = pte_mkuffd_wp(pte);
pte = pte_wrprotect(pte);
}
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
vmf->orig_pte = pte;
/* ksm created a completely new copy */
@ -3659,6 +3667,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
}
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
swap_free(entry);
if (mem_cgroup_swap_full(page) ||
(vma->vm_flags & VM_LOCKED) || PageMlocked(page))
@ -3769,7 +3780,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
if (!page)
goto oom;
if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL))
goto oom_free_page;
cgroup_throttle_swaprate(page, GFP_KERNEL);
@ -3852,7 +3863,6 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
if (!vmf->prealloc_pte)
return VM_FAULT_OOM;
smp_wmb(); /* See comment in __pte_alloc() */
}
ret = vma->vm_ops->fault(vmf);
@ -3923,7 +3933,6 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
if (!vmf->prealloc_pte)
return VM_FAULT_OOM;
smp_wmb(); /* See comment in __pte_alloc() */
}
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
@ -4036,17 +4045,10 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
return ret;
}
if (vmf->prealloc_pte) {
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
if (likely(pmd_none(*vmf->pmd))) {
mm_inc_nr_ptes(vma->vm_mm);
pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
vmf->prealloc_pte = NULL;
}
spin_unlock(vmf->ptl);
} else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) {
if (vmf->prealloc_pte)
pmd_install(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte);
else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd)))
return VM_FAULT_OOM;
}
}
/* See comment in handle_pte_fault() */
@ -4155,7 +4157,6 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf)
vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
if (!vmf->prealloc_pte)
return VM_FAULT_OOM;
smp_wmb(); /* See comment in __pte_alloc() */
}
return vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
@ -4202,7 +4203,8 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf)
if (!vmf->cow_page)
return VM_FAULT_OOM;
if (mem_cgroup_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL)) {
if (mem_cgroup_charge(page_folio(vmf->cow_page), vma->vm_mm,
GFP_KERNEL)) {
put_page(vmf->cow_page);
return VM_FAULT_OOM;
}
@ -4267,7 +4269,7 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults).
* The mmap_lock may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
* return value. See filemap_fault() and __folio_lock_or_retry().
* If mmap_lock is released, vma may become invalid (for example
* by other thread calling munmap()).
*/
@ -4508,7 +4510,7 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
* concurrent faults).
*
* The mmap_lock may have been released depending on flags and our return value.
* See filemap_fault() and __lock_page_or_retry().
* See filemap_fault() and __folio_lock_or_retry().
*/
static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
{
@ -4612,7 +4614,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
* By the time we get here, we already hold the mm semaphore
*
* The mmap_lock may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
* return value. See filemap_fault() and __folio_lock_or_retry().
*/
static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
unsigned long address, unsigned int flags)
@ -4768,7 +4770,7 @@ static inline void mm_account_fault(struct pt_regs *regs,
* By the time we get here, we already hold the mm semaphore
*
* The mmap_lock may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
* return value. See filemap_fault() and __folio_lock_or_retry().
*/
vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
unsigned int flags, struct pt_regs *regs)
@ -4829,13 +4831,13 @@ int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
if (!new)
return -ENOMEM;
smp_wmb(); /* See comment in __pte_alloc */
spin_lock(&mm->page_table_lock);
if (pgd_present(*pgd)) /* Another has populated it */
if (pgd_present(*pgd)) { /* Another has populated it */
p4d_free(mm, new);
else
} else {
smp_wmb(); /* See comment in pmd_install() */
pgd_populate(mm, pgd, new);
}
spin_unlock(&mm->page_table_lock);
return 0;
}
@ -4852,11 +4854,10 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
if (!new)
return -ENOMEM;
smp_wmb(); /* See comment in __pte_alloc */
spin_lock(&mm->page_table_lock);
if (!p4d_present(*p4d)) {
mm_inc_nr_puds(mm);
smp_wmb(); /* See comment in pmd_install() */
p4d_populate(mm, p4d, new);
} else /* Another has populated it */
pud_free(mm, new);
@ -4877,14 +4878,14 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
if (!new)
return -ENOMEM;
smp_wmb(); /* See comment in __pte_alloc */
ptl = pud_lock(mm, pud);
if (!pud_present(*pud)) {
mm_inc_nr_pmds(mm);
smp_wmb(); /* See comment in pmd_install() */
pud_populate(mm, pud, new);
} else /* Another has populated it */
} else { /* Another has populated it */
pmd_free(mm, new);
}
spin_unlock(ptl);
return 0;
}
@ -5265,7 +5266,7 @@ void __might_fault(const char *file, int line)
return;
if (pagefault_disabled())
return;
__might_sleep(file, line, 0);
__might_sleep(file, line);
#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
if (current->mm)
might_lock_read(&current->mm->mmap_lock);
@ -5421,7 +5422,6 @@ long copy_huge_page_from_user(struct page *dst_page,
unsigned int pages_per_huge_page,
bool allow_pagefault)
{
void *src = (void *)usr_src;
void *page_kaddr;
unsigned long i, rc = 0;
unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
@ -5434,8 +5434,7 @@ long copy_huge_page_from_user(struct page *dst_page,
else
page_kaddr = kmap_atomic(subpage);
rc = copy_from_user(page_kaddr,
(const void __user *)(src + i * PAGE_SIZE),
PAGE_SIZE);
usr_src + i * PAGE_SIZE, PAGE_SIZE);
if (allow_pagefault)
kunmap(subpage);
else

View File

@ -21,7 +21,6 @@
#include <linux/memory.h>
#include <linux/memremap.h>
#include <linux/memory_hotplug.h>
#include <linux/highmem.h>
#include <linux/vmalloc.h>
#include <linux/ioport.h>
#include <linux/delay.h>
@ -36,6 +35,7 @@
#include <linux/memblock.h>
#include <linux/compaction.h>
#include <linux/rmap.h>
#include <linux/module.h>
#include <asm/tlbflush.h>
@ -57,7 +57,7 @@ enum {
ONLINE_POLICY_AUTO_MOVABLE,
};
const char *online_policy_to_str[] = {
static const char * const online_policy_to_str[] = {
[ONLINE_POLICY_CONTIG_ZONES] = "contig-zones",
[ONLINE_POLICY_AUTO_MOVABLE] = "auto-movable",
};
@ -220,7 +220,6 @@ static void release_memory_resource(struct resource *res)
kfree(res);
}
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
const char *reason)
{
@ -586,10 +585,6 @@ void generic_online_page(struct page *page, unsigned int order)
debug_pagealloc_map_pages(page, 1 << order);
__free_pages_core(page, order);
totalram_pages_add(1UL << order);
#ifdef CONFIG_HIGHMEM
if (PageHighMem(page))
totalhigh_pages_add(1UL << order);
#endif
}
EXPORT_SYMBOL_GPL(generic_online_page);
@ -626,16 +621,11 @@ static void node_states_check_changes_online(unsigned long nr_pages,
arg->status_change_nid = NUMA_NO_NODE;
arg->status_change_nid_normal = NUMA_NO_NODE;
arg->status_change_nid_high = NUMA_NO_NODE;
if (!node_state(nid, N_MEMORY))
arg->status_change_nid = nid;
if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY))
arg->status_change_nid_normal = nid;
#ifdef CONFIG_HIGHMEM
if (zone_idx(zone) <= ZONE_HIGHMEM && !node_state(nid, N_HIGH_MEMORY))
arg->status_change_nid_high = nid;
#endif
}
static void node_states_set_node(int node, struct memory_notify *arg)
@ -643,9 +633,6 @@ static void node_states_set_node(int node, struct memory_notify *arg)
if (arg->status_change_nid_normal >= 0)
node_set_state(node, N_NORMAL_MEMORY);
if (arg->status_change_nid_high >= 0)
node_set_state(node, N_HIGH_MEMORY);
if (arg->status_change_nid >= 0)
node_set_state(node, N_MEMORY);
}
@ -1163,7 +1150,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
mem_hotplug_done();
return ret;
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
static void reset_node_present_pages(pg_data_t *pgdat)
{
@ -1357,6 +1343,7 @@ bool mhp_supports_memmap_on_memory(unsigned long size)
int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
{
struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
enum memblock_flags memblock_flags = MEMBLOCK_NONE;
struct vmem_altmap mhp_altmap = {};
struct memory_group *group = NULL;
u64 start, size;
@ -1384,8 +1371,13 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
mem_hotplug_begin();
if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
memblock_add_node(start, size, nid);
if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
if (res->flags & IORESOURCE_SYSRAM_DRIVER_MANAGED)
memblock_flags = MEMBLOCK_DRIVER_MANAGED;
ret = memblock_add_node(start, size, nid, memblock_flags);
if (ret)
goto error_mem_hotplug_end;
}
ret = __try_online_node(nid, false);
if (ret < 0)
@ -1458,6 +1450,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
rollback_node_hotadd(nid);
if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
memblock_remove(start, size);
error_mem_hotplug_end:
mem_hotplug_done();
return ret;
}
@ -1803,7 +1796,6 @@ static void node_states_check_changes_offline(unsigned long nr_pages,
arg->status_change_nid = NUMA_NO_NODE;
arg->status_change_nid_normal = NUMA_NO_NODE;
arg->status_change_nid_high = NUMA_NO_NODE;
/*
* Check whether node_states[N_NORMAL_MEMORY] will be changed.
@ -1818,24 +1810,9 @@ static void node_states_check_changes_offline(unsigned long nr_pages,
if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages)
arg->status_change_nid_normal = zone_to_nid(zone);
#ifdef CONFIG_HIGHMEM
/*
* node_states[N_HIGH_MEMORY] contains nodes which
* have normal memory or high memory.
* Here we add the present_pages belonging to ZONE_HIGHMEM.
* If the zone is within the range of [0..ZONE_HIGHMEM), and
* we determine that the zones in that range become empty,
* we need to clear the node for N_HIGH_MEMORY.
*/
present_pages += pgdat->node_zones[ZONE_HIGHMEM].present_pages;
if (zone_idx(zone) <= ZONE_HIGHMEM && nr_pages >= present_pages)
arg->status_change_nid_high = zone_to_nid(zone);
#endif
/*
* We have accounted the pages from [0..ZONE_NORMAL), and
* in case of CONFIG_HIGHMEM the pages from ZONE_HIGHMEM
* as well.
* We have accounted the pages from [0..ZONE_NORMAL); ZONE_HIGHMEM
* does not apply as we don't support 32bit.
* Here we count the possible pages from ZONE_MOVABLE.
* If after having accounted all the pages, we see that the nr_pages
* to be offlined is over or equal to the accounted pages,
@ -1853,9 +1830,6 @@ static void node_states_clear_node(int node, struct memory_notify *arg)
if (arg->status_change_nid_normal >= 0)
node_clear_state(node, N_NORMAL_MEMORY);
if (arg->status_change_nid_high >= 0)
node_clear_state(node, N_HIGH_MEMORY);
if (arg->status_change_nid >= 0)
node_clear_state(node, N_MEMORY);
}
@ -2204,7 +2178,7 @@ static int __ref try_remove_memory(u64 start, u64 size)
arch_remove_memory(start, size, altmap);
if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
memblock_free(start, size);
memblock_phys_free(start, size);
memblock_remove(start, size);
}

View File

@ -134,6 +134,8 @@ static struct mempolicy preferred_node_policy[MAX_NUMNODES];
* @node: Node id to start the search
*
* Lookup the next closest node by distance if @nid is not online.
*
* Return: this @node if it is online, otherwise the closest node by distance
*/
int numa_map_to_online_node(int node)
{
@ -296,6 +298,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
atomic_set(&policy->refcnt, 1);
policy->mode = mode;
policy->flags = flags;
policy->home_node = NUMA_NO_NODE;
return policy;
}
@ -810,7 +813,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
((vmstart - vma->vm_start) >> PAGE_SHIFT);
prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff,
new_pol, vma->vm_userfaultfd_ctx);
new_pol, vma->vm_userfaultfd_ctx,
anon_vma_name(vma));
if (prev) {
vma = prev;
next = vma->vm_next;
@ -1477,6 +1481,77 @@ static long kernel_mbind(unsigned long start, unsigned long len,
return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
}
SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
unsigned long, home_node, unsigned long, flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
struct mempolicy *new;
unsigned long vmstart;
unsigned long vmend;
unsigned long end;
int err = -ENOENT;
start = untagged_addr(start);
if (start & ~PAGE_MASK)
return -EINVAL;
/*
* flags is used for future extension if any.
*/
if (flags != 0)
return -EINVAL;
/*
* Check home_node is online to avoid accessing uninitialized
* NODE_DATA.
*/
if (home_node >= MAX_NUMNODES || !node_online(home_node))
return -EINVAL;
len = (len + PAGE_SIZE - 1) & PAGE_MASK;
end = start + len;
if (end < start)
return -EINVAL;
if (end == start)
return 0;
mmap_write_lock(mm);
vma = find_vma(mm, start);
for (; vma && vma->vm_start < end; vma = vma->vm_next) {
vmstart = max(start, vma->vm_start);
vmend = min(end, vma->vm_end);
new = mpol_dup(vma_policy(vma));
if (IS_ERR(new)) {
err = PTR_ERR(new);
break;
}
/*
* Only update home node if there is an existing vma policy
*/
if (!new)
continue;
/*
* If any vma in the range got policy other than MPOL_BIND
* or MPOL_PREFERRED_MANY we return error. We don't reset
* the home node for vmas we already updated before.
*/
if (new->mode != MPOL_BIND && new->mode != MPOL_PREFERRED_MANY) {
err = -EOPNOTSUPP;
break;
}
new->home_node = home_node;
err = mbind_range(mm, vmstart, vmend, new);
mpol_put(new);
if (err)
break;
}
mmap_write_unlock(mm);
return err;
}
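The new syscall above only updates ranges that already carry an MPOL_BIND or MPOL_PREFERRED_MANY policy, and the home node must be online. A minimal userspace sketch, not part of this commit, invoking it via syscall() since glibc provides no wrapper; the syscall number 450 and the MPOL_BIND value are assumptions copied from the uapi headers, and node 0 is used so the calls succeed on any machine:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_set_mempolicy_home_node
#define __NR_set_mempolicy_home_node 450	/* added along with this interface */
#endif
#ifndef MPOL_BIND
#define MPOL_BIND 2				/* from <linux/mempolicy.h> */
#endif

int main(void)
{
	size_t len = 4UL << 20;
	unsigned long nodes = 1UL << 0;		/* bind to node 0; pick a real node on a NUMA box */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* The range must have a BIND/PREFERRED_MANY policy first ... */
	if (syscall(SYS_mbind, p, len, MPOL_BIND, &nodes, 8 * sizeof(nodes) + 1, 0))
		perror("mbind");

	/* ... then node 0 is declared the "home" node for allocations in it. */
	if (syscall(__NR_set_mempolicy_home_node, (unsigned long)p, len, 0UL, 0UL))
		perror("set_mempolicy_home_node");

	munmap(p, len);
	return 0;
}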
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
unsigned long, mode, const unsigned long __user *, nmask,
unsigned long, maxnode, unsigned int, flags)
@ -1801,6 +1876,11 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
}
if ((policy->mode == MPOL_BIND ||
policy->mode == MPOL_PREFERRED_MANY) &&
policy->home_node != NUMA_NO_NODE)
return policy->home_node;
return nd;
}
@ -2061,7 +2141,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes);
if (!page)
page = __alloc_pages(gfp, order, numa_node_id(), NULL);
page = __alloc_pages(gfp, order, nid, NULL);
return page;
}
@ -2072,7 +2152,6 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
* @order: Order of the GFP allocation.
* @vma: Pointer to VMA or NULL if not available.
* @addr: Virtual address of the allocation. Must be inside @vma.
* @node: Which node to prefer for allocation (modulo policy).
* @hugepage: For hugepages try only the preferred node if possible.
*
* Allocate a page for a specific address in @vma, using the appropriate
@ -2083,9 +2162,10 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
* Return: The page on success or NULL if allocation fails.
*/
struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
unsigned long addr, int node, bool hugepage)
unsigned long addr, bool hugepage)
{
struct mempolicy *pol;
int node = numa_node_id();
struct page *page;
int preferred_nid;
nodemask_t *nmask;
@ -2102,6 +2182,7 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
}
if (pol->mode == MPOL_PREFERRED_MANY) {
node = policy_node(gfp, pol, node);
page = alloc_pages_preferred_many(gfp, order, node, pol);
mpol_cond_put(pol);
goto out;
@ -2185,7 +2266,7 @@ struct page *alloc_pages(gfp_t gfp, unsigned order)
page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
else if (pol->mode == MPOL_PREFERRED_MANY)
page = alloc_pages_preferred_many(gfp, order,
numa_node_id(), pol);
policy_node(gfp, pol, numa_node_id()), pol);
else
page = __alloc_pages(gfp, order,
policy_node(gfp, pol, numa_node_id()),
@ -2195,6 +2276,98 @@ struct page *alloc_pages(gfp_t gfp, unsigned order)
}
EXPORT_SYMBOL(alloc_pages);
struct folio *folio_alloc(gfp_t gfp, unsigned order)
{
struct page *page = alloc_pages(gfp | __GFP_COMP, order);
if (page && order > 1)
prep_transhuge_page(page);
return (struct folio *)page;
}
EXPORT_SYMBOL(folio_alloc);
static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
struct mempolicy *pol, unsigned long nr_pages,
struct page **page_array)
{
int nodes;
unsigned long nr_pages_per_node;
int delta;
int i;
unsigned long nr_allocated;
unsigned long total_allocated = 0;
nodes = nodes_weight(pol->nodes);
nr_pages_per_node = nr_pages / nodes;
delta = nr_pages - nodes * nr_pages_per_node;
for (i = 0; i < nodes; i++) {
if (delta) {
nr_allocated = __alloc_pages_bulk(gfp,
interleave_nodes(pol), NULL,
nr_pages_per_node + 1, NULL,
page_array);
delta--;
} else {
nr_allocated = __alloc_pages_bulk(gfp,
interleave_nodes(pol), NULL,
nr_pages_per_node, NULL, page_array);
}
page_array += nr_allocated;
total_allocated += nr_allocated;
}
return total_allocated;
}
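Worked example of the split above: for nr_pages = 10 over a 3-node interleave mask, nr_pages_per_node is 3 and delta is 1, so the first node visited receives 4 pages and the other two receive 3 each, which keeps the distribution as close to round-robin as a single bulk request allows.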
static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
struct mempolicy *pol, unsigned long nr_pages,
struct page **page_array)
{
gfp_t preferred_gfp;
unsigned long nr_allocated = 0;
preferred_gfp = gfp | __GFP_NOWARN;
preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
nr_allocated = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes,
nr_pages, NULL, page_array);
if (nr_allocated < nr_pages)
nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL,
nr_pages - nr_allocated, NULL,
page_array + nr_allocated);
return nr_allocated;
}
/* alloc pages bulk and mempolicy should be considered at the
* same time in some situation such as vmalloc.
*
* It can accelerate memory allocation especially interleaving
* allocate memory.
*/
unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
unsigned long nr_pages, struct page **page_array)
{
struct mempolicy *pol = &default_policy;
if (!in_interrupt() && !(gfp & __GFP_THISNODE))
pol = get_task_policy(current);
if (pol->mode == MPOL_INTERLEAVE)
return alloc_pages_bulk_array_interleave(gfp, pol,
nr_pages, page_array);
if (pol->mode == MPOL_PREFERRED_MANY)
return alloc_pages_bulk_array_preferred_many(gfp,
numa_node_id(), pol, nr_pages, page_array);
return __alloc_pages_bulk(gfp, policy_node(gfp, pol, numa_node_id()),
policy_nodemask(gfp, pol), nr_pages, NULL,
page_array);
}
int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
{
struct mempolicy *pol = mpol_dup(vma_policy(src));
@ -2249,6 +2422,8 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
return false;
if (a->flags != b->flags)
return false;
if (a->home_node != b->home_node)
return false;
if (mpol_store_user_nodemask(a))
if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
return false;
@ -2792,7 +2967,7 @@ static const char * const policy_modes[] =
* Format of input:
* <mode>[=<flags>][:<nodelist>]
*
* On success, returns 0, else 1
* Return: %0 on success, else %1
*/
int mpol_parse_str(char *str, struct mempolicy **mpol)
{
@ -2974,64 +3149,3 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
nodemask_pr_args(&nodes));
}
bool numa_demotion_enabled = false;
#ifdef CONFIG_SYSFS
static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return sysfs_emit(buf, "%s\n",
numa_demotion_enabled? "true" : "false");
}
static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
numa_demotion_enabled = true;
else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
numa_demotion_enabled = false;
else
return -EINVAL;
return count;
}
static struct kobj_attribute numa_demotion_enabled_attr =
__ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
numa_demotion_enabled_store);
static struct attribute *numa_attrs[] = {
&numa_demotion_enabled_attr.attr,
NULL,
};
static const struct attribute_group numa_attr_group = {
.attrs = numa_attrs,
};
static int __init numa_init_sysfs(void)
{
int err;
struct kobject *numa_kobj;
numa_kobj = kobject_create_and_add("numa", mm_kobj);
if (!numa_kobj) {
pr_err("failed to create numa kobject\n");
return -ENOMEM;
}
err = sysfs_create_group(numa_kobj, &numa_attr_group);
if (err) {
pr_err("failed to register numa group\n");
goto delete_obj;
}
return 0;
delete_obj:
kobject_put(numa_kobj);
return err;
}
subsys_initcall(numa_init_sysfs);
#endif

View File

@ -17,7 +17,6 @@
#include <linux/kmemleak.h>
#include <linux/export.h>
#include <linux/mempool.h>
#include <linux/blkdev.h>
#include <linux/writeback.h>
#include "slab.h"

View File

@ -102,39 +102,22 @@ static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id)
return (range->start + range_len(range)) >> PAGE_SHIFT;
}
static unsigned long pfn_next(unsigned long pfn)
static unsigned long pfn_next(struct dev_pagemap *pgmap, unsigned long pfn)
{
if (pfn % 1024 == 0)
if (pfn % (1024 << pgmap->vmemmap_shift))
cond_resched();
return pfn + 1;
return pfn + pgmap_vmemmap_nr(pgmap);
}
static unsigned long pfn_len(struct dev_pagemap *pgmap, unsigned long range_id)
{
return (pfn_end(pgmap, range_id) -
pfn_first(pgmap, range_id)) >> pgmap->vmemmap_shift;
}
#define for_each_device_pfn(pfn, map, i) \
for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(pfn))
static void dev_pagemap_kill(struct dev_pagemap *pgmap)
{
if (pgmap->ops && pgmap->ops->kill)
pgmap->ops->kill(pgmap);
else
percpu_ref_kill(pgmap->ref);
}
static void dev_pagemap_cleanup(struct dev_pagemap *pgmap)
{
if (pgmap->ops && pgmap->ops->cleanup) {
pgmap->ops->cleanup(pgmap);
} else {
wait_for_completion(&pgmap->done);
percpu_ref_exit(pgmap->ref);
}
/*
* Undo the pgmap ref assignment for the internal case as the
* caller may re-enable the same pgmap.
*/
if (pgmap->ref == &pgmap->internal_ref)
pgmap->ref = NULL;
}
for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); \
pfn = pfn_next(map, pfn))
static void pageunmap_range(struct dev_pagemap *pgmap, int range_id)
{
@ -167,11 +150,12 @@ void memunmap_pages(struct dev_pagemap *pgmap)
unsigned long pfn;
int i;
dev_pagemap_kill(pgmap);
percpu_ref_kill(&pgmap->ref);
for (i = 0; i < pgmap->nr_range; i++)
for_each_device_pfn(pfn, pgmap, i)
put_page(pfn_to_page(pfn));
dev_pagemap_cleanup(pgmap);
wait_for_completion(&pgmap->done);
percpu_ref_exit(&pgmap->ref);
for (i = 0; i < pgmap->nr_range; i++)
pageunmap_range(pgmap, i);
@ -188,8 +172,7 @@ static void devm_memremap_pages_release(void *data)
static void dev_pagemap_percpu_release(struct percpu_ref *ref)
{
struct dev_pagemap *pgmap =
container_of(ref, struct dev_pagemap, internal_ref);
struct dev_pagemap *pgmap = container_of(ref, struct dev_pagemap, ref);
complete(&pgmap->done);
}
@ -295,8 +278,7 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
PHYS_PFN(range->start),
PHYS_PFN(range_len(range)), pgmap);
percpu_ref_get_many(pgmap->ref, pfn_end(pgmap, range_id)
- pfn_first(pgmap, range_id));
percpu_ref_get_many(&pgmap->ref, pfn_len(pgmap, range_id));
return 0;
err_add_memory:
@ -362,22 +344,11 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
break;
}
if (!pgmap->ref) {
if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup))
return ERR_PTR(-EINVAL);
init_completion(&pgmap->done);
error = percpu_ref_init(&pgmap->internal_ref,
dev_pagemap_percpu_release, 0, GFP_KERNEL);
if (error)
return ERR_PTR(error);
pgmap->ref = &pgmap->internal_ref;
} else {
if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) {
WARN(1, "Missing reference count teardown definition\n");
return ERR_PTR(-EINVAL);
}
}
init_completion(&pgmap->done);
error = percpu_ref_init(&pgmap->ref, dev_pagemap_percpu_release, 0,
GFP_KERNEL);
if (error)
return ERR_PTR(error);
devmap_managed_enable_get(pgmap);
@ -486,7 +457,7 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
/* fall back to slow path lookup */
rcu_read_lock();
pgmap = xa_load(&pgmap_array, PHYS_PFN(phys));
if (pgmap && !percpu_ref_tryget_live(pgmap->ref))
if (pgmap && !percpu_ref_tryget_live(&pgmap->ref))
pgmap = NULL;
rcu_read_unlock();
@ -505,7 +476,7 @@ void free_devmap_managed_page(struct page *page)
__ClearPageWaiters(page);
mem_cgroup_uncharge(page);
mem_cgroup_uncharge(page_folio(page));
/*
* When a device_private page is freed, the page->mapping field

File diff suppressed because it is too large

View File

@ -271,6 +271,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
/* Phase 1: page isolation */
for (i = 0; i < nr; i++) {
struct page *page = pvec->pages[i];
struct folio *folio = page_folio(page);
if (TestClearPageMlocked(page)) {
/*
@ -278,7 +279,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
* so we can spare the get_page() here.
*/
if (TestClearPageLRU(page)) {
lruvec = relock_page_lruvec_irq(page, lruvec);
lruvec = folio_lruvec_relock_irq(folio, lruvec);
del_page_from_lru_list(page, lruvec);
continue;
} else
@ -511,7 +512,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
vma->vm_file, pgoff, vma_policy(vma),
vma->vm_userfaultfd_ctx);
vma->vm_userfaultfd_ctx, anon_vma_name(vma));
if (*prev) {
vma = *prev;
goto success;

View File

@ -13,6 +13,7 @@
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/vmacache.h>
#include <linux/shm.h>
#include <linux/mman.h>
@ -1029,7 +1030,8 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
*/
static inline int is_mergeable_vma(struct vm_area_struct *vma,
struct file *file, unsigned long vm_flags,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
struct anon_vma_name *anon_name)
{
/*
* VM_SOFTDIRTY should not prevent from VMA merging, if we
@ -1047,6 +1049,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
return 0;
if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
return 0;
if (!anon_vma_name_eq(anon_vma_name(vma), anon_name))
return 0;
return 1;
}
@ -1079,9 +1083,10 @@ static int
can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t vm_pgoff,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
struct anon_vma_name *anon_name)
{
if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
if (vma->vm_pgoff == vm_pgoff)
return 1;
@ -1100,9 +1105,10 @@ static int
can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t vm_pgoff,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
struct anon_vma_name *anon_name)
{
if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
pgoff_t vm_pglen;
vm_pglen = vma_pages(vma);
@ -1113,9 +1119,9 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
}
/*
* Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
* whether that can be merged with its predecessor or its successor.
* Or both (it neatly fills a hole).
* Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
* figure out whether that can be merged with its predecessor or its
* successor. Or both (it neatly fills a hole).
*
* In most cases - when called for mmap, brk or mremap - [addr,end) is
* certain not to be mapped by the time vma_merge is called; but when
@ -1160,7 +1166,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
unsigned long end, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t pgoff, struct mempolicy *policy,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
struct anon_vma_name *anon_name)
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
struct vm_area_struct *area, *next;
@ -1190,7 +1197,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
mpol_equal(vma_policy(prev), policy) &&
can_vma_merge_after(prev, vm_flags,
anon_vma, file, pgoff,
vm_userfaultfd_ctx)) {
vm_userfaultfd_ctx, anon_name)) {
/*
* OK, it can. Can we now merge in the successor as well?
*/
@ -1199,7 +1206,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
can_vma_merge_before(next, vm_flags,
anon_vma, file,
pgoff+pglen,
vm_userfaultfd_ctx) &&
vm_userfaultfd_ctx, anon_name) &&
is_mergeable_anon_vma(prev->anon_vma,
next->anon_vma, NULL)) {
/* cases 1, 6 */
@ -1222,7 +1229,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
mpol_equal(policy, vma_policy(next)) &&
can_vma_merge_before(next, vm_flags,
anon_vma, file, pgoff+pglen,
vm_userfaultfd_ctx)) {
vm_userfaultfd_ctx, anon_name)) {
if (prev && addr < prev->vm_end) /* case 4 */
err = __vma_adjust(prev, prev->vm_start,
addr, prev->vm_pgoff, NULL, next);
@ -1599,7 +1606,6 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
goto out_fput;
}
} else if (flags & MAP_HUGETLB) {
struct ucounts *ucounts = NULL;
struct hstate *hs;
hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
@ -1615,7 +1621,7 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
*/
file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
VM_NORESERVE,
&ucounts, HUGETLB_ANONHUGE_INODE,
HUGETLB_ANONHUGE_INODE,
(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
if (IS_ERR(file))
return PTR_ERR(file);
@ -1755,7 +1761,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
* Can we just expand an old mapping?
*/
vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
if (vma)
goto out;
@ -1804,7 +1810,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
*/
if (unlikely(vm_flags != vma->vm_flags && prev)) {
merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX);
NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
if (merge) {
/* ->mmap() can change vma->vm_file and fput the original file. So
* fput the vma->vm_file here or we would add an extra fput for file
@ -2929,7 +2935,6 @@ EXPORT_SYMBOL(vm_munmap);
SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
{
addr = untagged_addr(addr);
profile_munmap(addr);
return __vm_munmap(addr, len, true);
}
@ -3057,7 +3062,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla
/* Can we just expand an old private anonymous mapping? */
vma = vma_merge(mm, prev, addr, addr + len, flags,
NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX);
NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
if (vma)
goto out;
@ -3143,25 +3148,27 @@ void exit_mmap(struct mm_struct *mm)
* to mmu_notifier_release(mm) ensures mmu notifier callbacks in
* __oom_reap_task_mm() will not block.
*
* This needs to be done before calling munlock_vma_pages_all(),
* This needs to be done before calling unlock_range(),
* which clears VM_LOCKED, otherwise the oom reaper cannot
* reliably test it.
*/
(void)__oom_reap_task_mm(mm);
set_bit(MMF_OOM_SKIP, &mm->flags);
mmap_write_lock(mm);
mmap_write_unlock(mm);
}
mmap_write_lock(mm);
if (mm->locked_vm)
unlock_range(mm->mmap, ULONG_MAX);
arch_exit_mmap(mm);
vma = mm->mmap;
if (!vma) /* Can happen if dup_mmap() received an OOM */
if (!vma) {
/* Can happen if dup_mmap() received an OOM */
mmap_write_unlock(mm);
return;
}
lru_add_drain();
flush_cache_mm(mm);
@ -3172,16 +3179,15 @@ void exit_mmap(struct mm_struct *mm)
free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb);
/*
* Walk the list again, actually closing and freeing it,
* with preemption enabled, without holding any MM locks.
*/
/* Walk the list again, actually closing and freeing it. */
while (vma) {
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += vma_pages(vma);
vma = remove_vma(vma);
cond_resched();
}
mm->mmap = NULL;
mmap_write_unlock(mm);
vm_unacct_memory(nr_accounted);
}
@ -3250,7 +3256,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
return NULL; /* should never get here */
new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
vma->vm_userfaultfd_ctx);
vma->vm_userfaultfd_ctx, anon_vma_name(vma));
if (new_vma) {
/*
* Source vma may have been merged into new_vma
@ -3332,7 +3338,7 @@ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
{
mm->total_vm += npages;
WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm)+npages);
if (is_exec_mapping(flags))
mm->exec_vm += npages;

View File

@ -3,6 +3,7 @@
#include <linux/kernel.h>
#include <linux/mmdebug.h>
#include <linux/mm_types.h>
#include <linux/mm_inline.h>
#include <linux/pagemap.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>

View File

@ -464,7 +464,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*pprev = vma_merge(mm, *pprev, start, end, newflags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
vma->vm_userfaultfd_ctx);
vma->vm_userfaultfd_ctx, anon_vma_name(vma));
if (*pprev) {
vma = *pprev;
VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY);
@ -563,7 +563,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
error = -ENOMEM;
if (!vma)
goto out;
prev = vma->vm_prev;
if (unlikely(grows & PROT_GROWSDOWN)) {
if (vma->vm_start >= end)
goto out;
@ -581,8 +581,11 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
goto out;
}
}
if (start > vma->vm_start)
prev = vma;
else
prev = vma->vm_prev;
for (nstart = start ; ; ) {
unsigned long mask_off_old_flags;

View File

@ -489,6 +489,10 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
old_end = old_addr + len;
flush_cache_range(vma, old_addr, old_end);
if (is_vm_hugetlb_page(vma))
return move_hugetlb_page_tables(vma, new_vma, old_addr,
new_addr, len);
mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
old_addr, old_end);
mmu_notifier_invalidate_range_start(&range);
@ -565,6 +569,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
bool *locked, unsigned long flags,
struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
{
long to_account = new_len - old_len;
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *new_vma;
unsigned long vm_flags = vma->vm_flags;
@ -583,6 +588,9 @@ static unsigned long move_vma(struct vm_area_struct *vma,
if (mm->map_count >= sysctl_max_map_count - 3)
return -ENOMEM;
if (unlikely(flags & MREMAP_DONTUNMAP))
to_account = new_len;
if (vma->vm_ops && vma->vm_ops->may_split) {
if (vma->vm_start != old_addr)
err = vma->vm_ops->may_split(vma, old_addr);
@ -604,8 +612,8 @@ static unsigned long move_vma(struct vm_area_struct *vma,
if (err)
return err;
if (unlikely(flags & MREMAP_DONTUNMAP && vm_flags & VM_ACCOUNT)) {
if (security_vm_enough_memory_mm(mm, new_len >> PAGE_SHIFT))
if (vm_flags & VM_ACCOUNT) {
if (security_vm_enough_memory_mm(mm, to_account >> PAGE_SHIFT))
return -ENOMEM;
}
@ -613,8 +621,8 @@ static unsigned long move_vma(struct vm_area_struct *vma,
new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
&need_rmap_locks);
if (!new_vma) {
if (unlikely(flags & MREMAP_DONTUNMAP && vm_flags & VM_ACCOUNT))
vm_unacct_memory(new_len >> PAGE_SHIFT);
if (vm_flags & VM_ACCOUNT)
vm_unacct_memory(to_account >> PAGE_SHIFT);
return -ENOMEM;
}
@ -642,6 +650,10 @@ static unsigned long move_vma(struct vm_area_struct *vma,
mremap_userfaultfd_prep(new_vma, uf);
}
if (is_vm_hugetlb_page(vma)) {
clear_vma_resv_huge_pages(vma);
}
/* Conceal VM_ACCOUNT so old reservation is not undone */
if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) {
vma->vm_flags &= ~VM_ACCOUNT;
@ -708,8 +720,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
}
static struct vm_area_struct *vma_to_resize(unsigned long addr,
unsigned long old_len, unsigned long new_len, unsigned long flags,
unsigned long *p)
unsigned long old_len, unsigned long new_len, unsigned long flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
@ -736,9 +747,6 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
(vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
return ERR_PTR(-EINVAL);
if (is_vm_hugetlb_page(vma))
return ERR_PTR(-EINVAL);
/* We can't remap across vm area boundaries */
if (old_len > vma->vm_end - addr)
return ERR_PTR(-EFAULT);
@ -768,13 +776,6 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
(new_len - old_len) >> PAGE_SHIFT))
return ERR_PTR(-ENOMEM);
if (vma->vm_flags & VM_ACCOUNT) {
unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
if (security_vm_enough_memory_mm(mm, charged))
return ERR_PTR(-ENOMEM);
*p = charged;
}
return vma;
}
@ -787,7 +788,6 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
unsigned long ret = -EINVAL;
unsigned long charged = 0;
unsigned long map_flags = 0;
if (offset_in_page(new_addr))
@ -830,7 +830,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
old_len = new_len;
}
vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
vma = vma_to_resize(addr, old_len, new_len, flags);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
goto out;
@ -853,7 +853,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
((addr - vma->vm_start) >> PAGE_SHIFT),
map_flags);
if (IS_ERR_VALUE(ret))
goto out1;
goto out;
/* We got a new mapping */
if (!(flags & MREMAP_FIXED))
@ -862,12 +862,6 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
uf_unmap);
if (!(offset_in_page(ret)))
goto out;
out1:
vm_unacct_memory(charged);
out:
return ret;
}
@ -899,7 +893,6 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
unsigned long ret = -EINVAL;
unsigned long charged = 0;
bool locked = false;
bool downgraded = false;
struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
@ -949,6 +942,31 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
if (mmap_write_lock_killable(current->mm))
return -EINTR;
vma = find_vma(mm, addr);
if (!vma || vma->vm_start > addr) {
ret = -EFAULT;
goto out;
}
if (is_vm_hugetlb_page(vma)) {
struct hstate *h __maybe_unused = hstate_vma(vma);
old_len = ALIGN(old_len, huge_page_size(h));
new_len = ALIGN(new_len, huge_page_size(h));
/* addrs must be huge page aligned */
if (addr & ~huge_page_mask(h))
goto out;
if (new_addr & ~huge_page_mask(h))
goto out;
/*
* Don't allow remap expansion, because the underlying hugetlb
* reservation is not yet capable of handling split reservations.
*/
if (new_len > old_len)
goto out;
}
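
With this block, mremap() now works on hugetlb-backed VMAs as long as lengths and both addresses are huge-page aligned and the mapping does not grow. A rough userspace sketch under those constraints, assuming 2 MB huge pages; the helper name is illustrative:

#define _GNU_SOURCE
#include <sys/mman.h>

/* Move (never grow) a hugetlb mapping; all values must be 2MB aligned. */
static void *sketch_move_huge_mapping(void *old, size_t len, void *new_hint)
{
	return mremap(old, len, len,
		      MREMAP_MAYMOVE | MREMAP_FIXED, new_hint);
}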
if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {
ret = mremap_to(addr, old_len, new_addr, new_len,
@ -981,7 +999,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
/*
* Ok, we need to grow..
*/
vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
vma = vma_to_resize(addr, old_len, new_len, flags);
if (IS_ERR(vma)) {
ret = PTR_ERR(vma);
goto out;
@ -992,10 +1010,18 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
if (old_len == vma->vm_end - addr) {
/* can we just expand the current mapping? */
if (vma_expandable(vma, new_len - old_len)) {
int pages = (new_len - old_len) >> PAGE_SHIFT;
long pages = (new_len - old_len) >> PAGE_SHIFT;
if (vma->vm_flags & VM_ACCOUNT) {
if (security_vm_enough_memory_mm(mm, pages)) {
ret = -ENOMEM;
goto out;
}
}
if (vma_adjust(vma, vma->vm_start, addr + new_len,
vma->vm_pgoff, NULL)) {
vm_unacct_memory(pages);
ret = -ENOMEM;
goto out;
}
@ -1034,10 +1060,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
&locked, flags, &uf, &uf_unmap);
}
out:
if (offset_in_page(ret)) {
vm_unacct_memory(charged);
if (offset_in_page(ret))
locked = false;
}
if (downgraded)
mmap_read_unlock(current->mm);
else

View File

@ -27,7 +27,6 @@
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/compiler.h>
#include <linux/mount.h>
@ -1639,12 +1638,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
}
EXPORT_SYMBOL(remap_vmalloc_range);
unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
return -ENOMEM;
}
vm_fault_t filemap_fault(struct vm_fault *vmf)
{
BUG();

View File

@ -641,6 +641,8 @@ static void oom_reap_task(struct task_struct *tsk)
static int oom_reaper(void *unused)
{
set_freezable();
while (true) {
struct task_struct *tsk = NULL;
@ -787,11 +789,11 @@ static inline bool __task_will_free_mem(struct task_struct *task)
struct signal_struct *sig = task->signal;
/*
* A coredumping process may sleep for an extended period in exit_mm(),
* so the oom killer cannot assume that the process will promptly exit
* and release memory.
* A coredumping process may sleep for an extended period in
* coredump_task_exit(), so the oom killer cannot assume that
* the process will promptly exit and release memory.
*/
if (sig->flags & SIGNAL_GROUP_COREDUMP)
if (sig->core_state)
return false;
if (sig->flags & SIGNAL_GROUP_EXIT)
@ -992,6 +994,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
* If necessary, kill all tasks in the selected memory cgroup.
*/
if (oom_group) {
memcg_memory_event(oom_group, MEMCG_OOM_GROUP_KILL);
mem_cgroup_print_oom_group(oom_group);
mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member,
(void *)message);
@ -1055,7 +1058,7 @@ bool out_of_memory(struct oom_control *oc)
if (!is_memcg_oom(oc)) {
blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
if (freed > 0)
if (freed > 0 && !is_sysrq_oom(oc))
/* Got some memory back in the last second. */
return true;
}
@ -1148,21 +1151,14 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
struct task_struct *p;
unsigned int f_flags;
bool reap = false;
struct pid *pid;
long ret = 0;
if (flags)
return -EINVAL;
pid = pidfd_get_pid(pidfd, &f_flags);
if (IS_ERR(pid))
return PTR_ERR(pid);
task = get_pid_task(pid, PIDTYPE_TGID);
if (!task) {
ret = -ESRCH;
goto put_pid;
}
task = pidfd_get_task(pidfd, &f_flags);
if (IS_ERR(task))
return PTR_ERR(task);
/*
* Make sure to choose a thread which still has a reference to mm
@ -1174,15 +1170,15 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
goto put_task;
}
if (mmget_not_zero(p->mm)) {
mm = p->mm;
if (task_will_free_mem(p))
reap = true;
else {
/* Error only if the work has not been done already */
if (!test_bit(MMF_OOM_SKIP, &mm->flags))
ret = -EINVAL;
}
mm = p->mm;
mmgrab(mm);
if (task_will_free_mem(p))
reap = true;
else {
/* Error only if the work has not been done already */
if (!test_bit(MMF_OOM_SKIP, &mm->flags))
ret = -EINVAL;
}
task_unlock(p);
@ -1193,17 +1189,18 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
ret = -EINTR;
goto drop_mm;
}
if (!__oom_reap_task_mm(mm))
/*
* Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure
* possible change in exit_mmap is seen
*/
if (!test_bit(MMF_OOM_SKIP, &mm->flags) && !__oom_reap_task_mm(mm))
ret = -EAGAIN;
mmap_read_unlock(mm);
drop_mm:
if (mm)
mmput(mm);
mmdrop(mm);
put_task:
put_task_struct(task);
put_pid:
put_pid(pid);
return ret;
#else
return -ENOSYS;
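
For reference, a hedged userspace sketch of driving this syscall through a pidfd; it assumes uapi headers that define __NR_pidfd_open and __NR_process_mrelease, since glibc provides no wrapper for the latter:

#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>

/* Ask the kernel to reap the memory of an already-killed process. */
static int sketch_release_memory(pid_t pid)
{
	int ret, pidfd = syscall(__NR_pidfd_open, pid, 0);

	if (pidfd < 0)
		return -1;
	ret = syscall(__NR_process_mrelease, pidfd, 0);
	close(pidfd);
	return ret;
}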

View File

@ -562,12 +562,12 @@ static unsigned long wp_next_time(unsigned long cur_time)
return cur_time;
}
static void wb_domain_writeout_inc(struct wb_domain *dom,
static void wb_domain_writeout_add(struct wb_domain *dom,
struct fprop_local_percpu *completions,
unsigned int max_prop_frac)
unsigned int max_prop_frac, long nr)
{
__fprop_inc_percpu_max(&dom->completions, completions,
max_prop_frac);
__fprop_add_percpu_max(&dom->completions, completions,
max_prop_frac, nr);
/* First event after period switching was turned off? */
if (unlikely(!dom->period_time)) {
/*
@ -583,20 +583,20 @@ static void wb_domain_writeout_inc(struct wb_domain *dom,
/*
* Increment @wb's writeout completion count and the global writeout
* completion count. Called from test_clear_page_writeback().
* completion count. Called from __folio_end_writeback().
*/
static inline void __wb_writeout_inc(struct bdi_writeback *wb)
static inline void __wb_writeout_add(struct bdi_writeback *wb, long nr)
{
struct wb_domain *cgdom;
inc_wb_stat(wb, WB_WRITTEN);
wb_domain_writeout_inc(&global_wb_domain, &wb->completions,
wb->bdi->max_prop_frac);
wb_stat_mod(wb, WB_WRITTEN, nr);
wb_domain_writeout_add(&global_wb_domain, &wb->completions,
wb->bdi->max_prop_frac, nr);
cgdom = mem_cgroup_wb_domain(wb);
if (cgdom)
wb_domain_writeout_inc(cgdom, wb_memcg_completions(wb),
wb->bdi->max_prop_frac);
wb_domain_writeout_add(cgdom, wb_memcg_completions(wb),
wb->bdi->max_prop_frac, nr);
}
void wb_writeout_inc(struct bdi_writeback *wb)
@ -604,7 +604,7 @@ void wb_writeout_inc(struct bdi_writeback *wb)
unsigned long flags;
local_irq_save(flags);
__wb_writeout_inc(wb);
__wb_writeout_add(wb, 1);
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(wb_writeout_inc);
@ -1084,7 +1084,7 @@ static void wb_update_write_bandwidth(struct bdi_writeback *wb,
* write_bandwidth = ---------------------------------------------------
* period
*
* @written may have decreased due to account_page_redirty().
* @written may have decreased due to folio_account_redirty().
* Avoid underflowing @bw calculation.
*/
bw = written - min(written, wb->written_stamp);
@ -2366,8 +2366,15 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
ret = generic_writepages(mapping, wbc);
if ((ret != -ENOMEM) || (wbc->sync_mode != WB_SYNC_ALL))
break;
cond_resched();
congestion_wait(BLK_RW_ASYNC, HZ/50);
/*
* Lacking an allocation context or the locality or writeback
* state of any of the inode's pages, throttle based on
* writeback activity on the local node. It's as good a
* guess as any.
*/
reclaim_throttle(NODE_DATA(numa_node_id()),
VMSCAN_THROTTLE_WRITEBACK);
}
/*
* Usually few pages are written by now from those we've just submitted
@ -2381,44 +2388,44 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
}
/**
* write_one_page - write out a single page and wait on I/O
* @page: the page to write
* folio_write_one - write out a single folio and wait on I/O.
* @folio: The folio to write.
*
* The page must be locked by the caller and will be unlocked upon return.
* The folio must be locked by the caller and will be unlocked upon return.
*
* Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this
* function returns.
*
* Return: %0 on success, negative error code otherwise
*/
int write_one_page(struct page *page)
int folio_write_one(struct folio *folio)
{
struct address_space *mapping = page->mapping;
struct address_space *mapping = folio->mapping;
int ret = 0;
struct writeback_control wbc = {
.sync_mode = WB_SYNC_ALL,
.nr_to_write = 1,
.nr_to_write = folio_nr_pages(folio),
};
BUG_ON(!PageLocked(page));
BUG_ON(!folio_test_locked(folio));
wait_on_page_writeback(page);
folio_wait_writeback(folio);
if (clear_page_dirty_for_io(page)) {
get_page(page);
ret = mapping->a_ops->writepage(page, &wbc);
if (folio_clear_dirty_for_io(folio)) {
folio_get(folio);
ret = mapping->a_ops->writepage(&folio->page, &wbc);
if (ret == 0)
wait_on_page_writeback(page);
put_page(page);
folio_wait_writeback(folio);
folio_put(folio);
} else {
unlock_page(page);
folio_unlock(folio);
}
if (!ret)
ret = filemap_check_errors(mapping);
return ret;
}
EXPORT_SYMBOL(write_one_page);
EXPORT_SYMBOL(folio_write_one);
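
A hedged sketch of the intended calling convention: the caller locks the folio, and folio_write_one() writes it back synchronously and unlocks it on return. sketch_sync_folio is an illustrative name:

/* Synchronously flush one folio that we have already modified. */
static int sketch_sync_folio(struct folio *folio)
{
	folio_lock(folio);
	folio_mark_dirty(folio);
	/* folio_write_one() unlocks the folio and waits for the I/O. */
	return folio_write_one(folio);
}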
/*
* For address_spaces which do not use buffers nor write back.
@ -2438,29 +2445,30 @@ EXPORT_SYMBOL(__set_page_dirty_no_writeback);
*
* NOTE: This relies on being atomic wrt interrupts.
*/
static void account_page_dirtied(struct page *page,
static void folio_account_dirtied(struct folio *folio,
struct address_space *mapping)
{
struct inode *inode = mapping->host;
trace_writeback_dirty_page(page, mapping);
trace_writeback_dirty_folio(folio, mapping);
if (mapping_can_writeback(mapping)) {
struct bdi_writeback *wb;
long nr = folio_nr_pages(folio);
inode_attach_wb(inode, page);
inode_attach_wb(inode, &folio->page);
wb = inode_to_wb(inode);
__inc_lruvec_page_state(page, NR_FILE_DIRTY);
__inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
__inc_node_page_state(page, NR_DIRTIED);
inc_wb_stat(wb, WB_RECLAIMABLE);
inc_wb_stat(wb, WB_DIRTIED);
task_io_account_write(PAGE_SIZE);
current->nr_dirtied++;
__this_cpu_inc(bdp_ratelimits);
__lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr);
__zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr);
__node_stat_mod_folio(folio, NR_DIRTIED, nr);
wb_stat_mod(wb, WB_RECLAIMABLE, nr);
wb_stat_mod(wb, WB_DIRTIED, nr);
task_io_account_write(nr * PAGE_SIZE);
current->nr_dirtied += nr;
__this_cpu_add(bdp_ratelimits, nr);
mem_cgroup_track_foreign_dirty(page, wb);
mem_cgroup_track_foreign_dirty(folio, wb);
}
}
@ -2469,130 +2477,156 @@ static void account_page_dirtied(struct page *page,
*
* Caller must hold lock_page_memcg().
*/
void account_page_cleaned(struct page *page, struct address_space *mapping,
void folio_account_cleaned(struct folio *folio, struct address_space *mapping,
struct bdi_writeback *wb)
{
if (mapping_can_writeback(mapping)) {
dec_lruvec_page_state(page, NR_FILE_DIRTY);
dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
dec_wb_stat(wb, WB_RECLAIMABLE);
task_io_account_cancelled_write(PAGE_SIZE);
long nr = folio_nr_pages(folio);
lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr);
zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
wb_stat_mod(wb, WB_RECLAIMABLE, -nr);
task_io_account_cancelled_write(nr * PAGE_SIZE);
}
}
/*
* Mark the page dirty, and set it dirty in the page cache, and mark the inode
* dirty.
* Mark the folio dirty, and set it dirty in the page cache, and mark
* the inode dirty.
*
* If warn is true, then emit a warning if the page is not uptodate and has
* If warn is true, then emit a warning if the folio is not uptodate and has
* not been truncated.
*
* The caller must hold lock_page_memcg().
* The caller must hold lock_page_memcg(). Most callers have the folio
* locked. A few have the folio blocked from truncation through other
* means (eg zap_page_range() has it mapped and is holding the page table
* lock). This can also be called from mark_buffer_dirty(), which I
* cannot prove is always protected against truncate.
*/
void __set_page_dirty(struct page *page, struct address_space *mapping,
void __folio_mark_dirty(struct folio *folio, struct address_space *mapping,
int warn)
{
unsigned long flags;
xa_lock_irqsave(&mapping->i_pages, flags);
if (page->mapping) { /* Race with truncate? */
WARN_ON_ONCE(warn && !PageUptodate(page));
account_page_dirtied(page, mapping);
__xa_set_mark(&mapping->i_pages, page_index(page),
if (folio->mapping) { /* Race with truncate? */
WARN_ON_ONCE(warn && !folio_test_uptodate(folio));
folio_account_dirtied(folio, mapping);
__xa_set_mark(&mapping->i_pages, folio_index(folio),
PAGECACHE_TAG_DIRTY);
}
xa_unlock_irqrestore(&mapping->i_pages, flags);
}
/*
* For address_spaces which do not use buffers. Just tag the page as dirty in
* the xarray.
/**
* filemap_dirty_folio - Mark a folio dirty for filesystems which do not use buffer_heads.
* @mapping: Address space this folio belongs to.
* @folio: Folio to be marked as dirty.
*
* This is also used when a single buffer is being dirtied: we want to set the
* page dirty in that case, but not all the buffers. This is a "bottom-up"
* dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
* Filesystems which do not use buffer heads should call this function
* from their set_page_dirty address space operation. It ignores the
* contents of folio_get_private(), so if the filesystem marks individual
* blocks as dirty, the filesystem should handle that itself.
*
* The caller must ensure this doesn't race with truncation. Most will simply
* hold the page lock, but e.g. zap_pte_range() calls with the page mapped and
* the pte lock held, which also locks out truncation.
* This is also sometimes used by filesystems which use buffer_heads when
* a single buffer is being dirtied: we want to set the folio dirty in
* that case, but not all the buffers. This is a "bottom-up" dirtying,
* whereas __set_page_dirty_buffers() is a "top-down" dirtying.
*
* The caller must ensure this doesn't race with truncation. Most will
* simply hold the folio lock, but e.g. zap_pte_range() calls with the
* folio mapped and the pte lock held, which also locks out truncation.
*/
int __set_page_dirty_nobuffers(struct page *page)
bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio)
{
lock_page_memcg(page);
if (!TestSetPageDirty(page)) {
struct address_space *mapping = page_mapping(page);
if (!mapping) {
unlock_page_memcg(page);
return 1;
}
__set_page_dirty(page, mapping, !PagePrivate(page));
unlock_page_memcg(page);
if (mapping->host) {
/* !PageAnon && !swapper_space */
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
return 1;
folio_memcg_lock(folio);
if (folio_test_set_dirty(folio)) {
folio_memcg_unlock(folio);
return false;
}
unlock_page_memcg(page);
return 0;
}
EXPORT_SYMBOL(__set_page_dirty_nobuffers);
/*
* Call this whenever redirtying a page, to de-account the dirty counters
* (NR_DIRTIED, WB_DIRTIED, tsk->nr_dirtied), so that they match the written
* counters (NR_WRITTEN, WB_WRITTEN) in long term. The mismatches will lead to
* systematic errors in balanced_dirty_ratelimit and the dirty pages position
* control.
__folio_mark_dirty(folio, mapping, !folio_test_private(folio));
folio_memcg_unlock(folio);
if (mapping->host) {
/* !PageAnon && !swapper_space */
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
}
return true;
}
EXPORT_SYMBOL(filemap_dirty_folio);
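
The old page-based entry point can remain a thin wrapper along these lines; this is a sketch of the idea, not the exact compat shim in this tree:

/* Legacy wrapper: resolve the folio and delegate to filemap_dirty_folio(). */
int sketch_set_page_dirty_nobuffers(struct page *page)
{
	return filemap_dirty_folio(page_mapping(page), page_folio(page));
}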
/**
* folio_account_redirty - Manually account for redirtying a page.
* @folio: The folio which is being redirtied.
*
* Most filesystems should call folio_redirty_for_writepage() instead
* of this function. If your filesystem is doing writeback outside the
* context of a writeback_control(), it can call this when redirtying
* a folio, to de-account the dirty counters (NR_DIRTIED, WB_DIRTIED,
* tsk->nr_dirtied), so that they match the written counters (NR_WRITTEN,
* WB_WRITTEN) in long term. The mismatches will lead to systematic errors
* in balanced_dirty_ratelimit and the dirty pages position control.
*/
void account_page_redirty(struct page *page)
void folio_account_redirty(struct folio *folio)
{
struct address_space *mapping = page->mapping;
struct address_space *mapping = folio->mapping;
if (mapping && mapping_can_writeback(mapping)) {
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
struct wb_lock_cookie cookie = {};
long nr = folio_nr_pages(folio);
wb = unlocked_inode_to_wb_begin(inode, &cookie);
current->nr_dirtied--;
dec_node_page_state(page, NR_DIRTIED);
dec_wb_stat(wb, WB_DIRTIED);
current->nr_dirtied -= nr;
node_stat_mod_folio(folio, NR_DIRTIED, -nr);
wb_stat_mod(wb, WB_DIRTIED, -nr);
unlocked_inode_to_wb_end(inode, &cookie);
}
}
EXPORT_SYMBOL(account_page_redirty);
EXPORT_SYMBOL(folio_account_redirty);
/*
* When a writepage implementation decides that it doesn't want to write this
* page for some reason, it should redirty the locked page via
* redirty_page_for_writepage() and it should then unlock the page and return 0
/**
* folio_redirty_for_writepage - Decline to write a dirty folio.
* @wbc: The writeback control.
* @folio: The folio.
*
* When a writepage implementation decides that it doesn't want to write
* @folio for some reason, it should call this function, unlock @folio and
* return 0.
*
* Return: True if we redirtied the folio. False if someone else dirtied
* it first.
*/
int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
bool folio_redirty_for_writepage(struct writeback_control *wbc,
struct folio *folio)
{
int ret;
bool ret;
long nr = folio_nr_pages(folio);
wbc->pages_skipped += nr;
ret = filemap_dirty_folio(folio->mapping, folio);
folio_account_redirty(folio);
wbc->pages_skipped++;
ret = __set_page_dirty_nobuffers(page);
account_page_redirty(page);
return ret;
}
EXPORT_SYMBOL(redirty_page_for_writepage);
EXPORT_SYMBOL(folio_redirty_for_writepage);
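
A hedged sketch of a ->writepage() implementation declining to write, as the comment above describes; the function name is illustrative:

/* Illustrative ->writepage() that declines: redirty, unlock, return 0. */
static int sketch_decline_writepage(struct page *page,
				    struct writeback_control *wbc)
{
	struct folio *folio = page_folio(page);

	folio_redirty_for_writepage(wbc, folio);
	folio_unlock(folio);
	return 0;
}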
/*
* Dirty a page.
/**
* folio_mark_dirty - Mark a folio as being modified.
* @folio: The folio.
*
* For pages with a mapping this should be done under the page lock for the
* benefit of asynchronous memory errors who prefer a consistent dirty state.
* This rule can be broken in some special cases, but should be better not to.
* For folios with a mapping this should be done under the page lock
* for the benefit of asynchronous memory errors who prefer a consistent
* dirty state. This rule can be broken in some special cases,
* but it is better not to.
*
* Return: True if the folio was newly dirtied, false if it was already dirty.
*/
int set_page_dirty(struct page *page)
bool folio_mark_dirty(struct folio *folio)
{
struct address_space *mapping = page_mapping(page);
struct address_space *mapping = folio_mapping(folio);
page = compound_head(page);
if (likely(mapping)) {
/*
* readahead/lru_deactivate_page could remain
@ -2604,17 +2638,17 @@ int set_page_dirty(struct page *page)
* it will confuse readahead and make it restart the size rampup
* process. But it's a trivial problem.
*/
if (PageReclaim(page))
ClearPageReclaim(page);
return mapping->a_ops->set_page_dirty(page);
if (folio_test_reclaim(folio))
folio_clear_reclaim(folio);
return mapping->a_ops->set_page_dirty(&folio->page);
}
if (!PageDirty(page)) {
if (!TestSetPageDirty(page))
return 1;
if (!folio_test_dirty(folio)) {
if (!folio_test_set_dirty(folio))
return true;
}
return 0;
return false;
}
EXPORT_SYMBOL(set_page_dirty);
EXPORT_SYMBOL(folio_mark_dirty);
/*
* set_page_dirty() is racy if the caller has no reference against
@ -2650,49 +2684,49 @@ EXPORT_SYMBOL(set_page_dirty_lock);
* page without actually doing it through the VM. Can you say "ext3 is
* horribly ugly"? Thought you could.
*/
void __cancel_dirty_page(struct page *page)
void __folio_cancel_dirty(struct folio *folio)
{
struct address_space *mapping = page_mapping(page);
struct address_space *mapping = folio_mapping(folio);
if (mapping_can_writeback(mapping)) {
struct inode *inode = mapping->host;
struct bdi_writeback *wb;
struct wb_lock_cookie cookie = {};
lock_page_memcg(page);
folio_memcg_lock(folio);
wb = unlocked_inode_to_wb_begin(inode, &cookie);
if (TestClearPageDirty(page))
account_page_cleaned(page, mapping, wb);
if (folio_test_clear_dirty(folio))
folio_account_cleaned(folio, mapping, wb);
unlocked_inode_to_wb_end(inode, &cookie);
unlock_page_memcg(page);
folio_memcg_unlock(folio);
} else {
ClearPageDirty(page);
folio_clear_dirty(folio);
}
}
EXPORT_SYMBOL(__cancel_dirty_page);
EXPORT_SYMBOL(__folio_cancel_dirty);
/*
* Clear a page's dirty flag, while caring for dirty memory accounting.
* Returns true if the page was previously dirty.
* Clear a folio's dirty flag, while caring for dirty memory accounting.
* Returns true if the folio was previously dirty.
*
* This is for preparing to put the page under writeout. We leave the page
* tagged as dirty in the xarray so that a concurrent write-for-sync
* can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage
* implementation will run either set_page_writeback() or set_page_dirty(),
* at which stage we bring the page's dirty flag and xarray dirty tag
* back into sync.
* This is for preparing to put the folio under writeout. We leave
* the folio tagged as dirty in the xarray so that a concurrent
* write-for-sync can discover it via a PAGECACHE_TAG_DIRTY walk.
* The ->writepage implementation will run either folio_start_writeback()
* or folio_mark_dirty(), at which stage we bring the folio's dirty flag
* and xarray dirty tag back into sync.
*
* This incoherency between the page's dirty flag and xarray tag is
* unfortunate, but it only exists while the page is locked.
* This incoherency between the folio's dirty flag and xarray tag is
* unfortunate, but it only exists while the folio is locked.
*/
int clear_page_dirty_for_io(struct page *page)
bool folio_clear_dirty_for_io(struct folio *folio)
{
struct address_space *mapping = page_mapping(page);
int ret = 0;
struct address_space *mapping = folio_mapping(folio);
bool ret = false;
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
if (mapping && mapping_can_writeback(mapping)) {
struct inode *inode = mapping->host;
@ -2705,48 +2739,49 @@ int clear_page_dirty_for_io(struct page *page)
* We use this sequence to make sure that
* (a) we account for dirty stats properly
* (b) we tell the low-level filesystem to
* mark the whole page dirty if it was
* mark the whole folio dirty if it was
* dirty in a pagetable. Only to then
* (c) clean the page again and return 1 to
* (c) clean the folio again and return 1 to
* cause the writeback.
*
* This way we avoid all nasty races with the
* dirty bit in multiple places and clearing
* them concurrently from different threads.
*
* Note! Normally the "set_page_dirty(page)"
* Note! Normally the "folio_mark_dirty(folio)"
* has no effect on the actual dirty bit - since
* that will already usually be set. But we
* need the side effects, and it can help us
* avoid races.
*
* We basically use the page "master dirty bit"
* We basically use the folio "master dirty bit"
* as a serialization point for all the different
* threads doing their things.
*/
if (page_mkclean(page))
set_page_dirty(page);
if (folio_mkclean(folio))
folio_mark_dirty(folio);
/*
* We carefully synchronise fault handlers against
* installing a dirty pte and marking the page dirty
* installing a dirty pte and marking the folio dirty
* at this point. We do this by having them hold the
* page lock while dirtying the page, and pages are
* page lock while dirtying the folio, and folios are
* always locked coming in here, so we get the desired
* exclusion.
*/
wb = unlocked_inode_to_wb_begin(inode, &cookie);
if (TestClearPageDirty(page)) {
dec_lruvec_page_state(page, NR_FILE_DIRTY);
dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
dec_wb_stat(wb, WB_RECLAIMABLE);
ret = 1;
if (folio_test_clear_dirty(folio)) {
long nr = folio_nr_pages(folio);
lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr);
zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
wb_stat_mod(wb, WB_RECLAIMABLE, -nr);
ret = true;
}
unlocked_inode_to_wb_end(inode, &cookie);
return ret;
}
return TestClearPageDirty(page);
return folio_test_clear_dirty(folio);
}
EXPORT_SYMBOL(clear_page_dirty_for_io);
EXPORT_SYMBOL(folio_clear_dirty_for_io);
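
Putting the pieces together, a hedged sketch of the per-folio write-out sequence these helpers are designed for; the I/O submission itself is elided:

/* Typical ordering: clear dirty, mark writeback, submit, end writeback. */
static void sketch_write_out_folio(struct folio *folio)
{
	if (!folio_clear_dirty_for_io(folio)) {
		folio_unlock(folio);		/* already clean */
		return;
	}
	folio_start_writeback(folio);	/* wraps __folio_start_writeback() */
	folio_unlock(folio);
	/*
	 * ... submit the I/O here; the completion handler finishes with
	 * folio_end_writeback(), which ends in __folio_end_writeback().
	 */
}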
static void wb_inode_writeback_start(struct bdi_writeback *wb)
{
@ -2766,27 +2801,28 @@ static void wb_inode_writeback_end(struct bdi_writeback *wb)
queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL);
}
int test_clear_page_writeback(struct page *page)
bool __folio_end_writeback(struct folio *folio)
{
struct address_space *mapping = page_mapping(page);
int ret;
long nr = folio_nr_pages(folio);
struct address_space *mapping = folio_mapping(folio);
bool ret;
lock_page_memcg(page);
folio_memcg_lock(folio);
if (mapping && mapping_use_writeback_tags(mapping)) {
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
unsigned long flags;
xa_lock_irqsave(&mapping->i_pages, flags);
ret = TestClearPageWriteback(page);
ret = folio_test_clear_writeback(folio);
if (ret) {
__xa_clear_mark(&mapping->i_pages, page_index(page),
__xa_clear_mark(&mapping->i_pages, folio_index(folio),
PAGECACHE_TAG_WRITEBACK);
if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
struct bdi_writeback *wb = inode_to_wb(inode);
dec_wb_stat(wb, WB_WRITEBACK);
__wb_writeout_inc(wb);
wb_stat_mod(wb, WB_WRITEBACK, -nr);
__wb_writeout_add(wb, nr);
if (!mapping_tagged(mapping,
PAGECACHE_TAG_WRITEBACK))
wb_inode_writeback_end(wb);
@ -2799,32 +2835,34 @@ int test_clear_page_writeback(struct page *page)
xa_unlock_irqrestore(&mapping->i_pages, flags);
} else {
ret = TestClearPageWriteback(page);
ret = folio_test_clear_writeback(folio);
}
if (ret) {
dec_lruvec_page_state(page, NR_WRITEBACK);
dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
inc_node_page_state(page, NR_WRITTEN);
lruvec_stat_mod_folio(folio, NR_WRITEBACK, -nr);
zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
node_stat_mod_folio(folio, NR_WRITTEN, nr);
}
unlock_page_memcg(page);
folio_memcg_unlock(folio);
return ret;
}
int __test_set_page_writeback(struct page *page, bool keep_write)
bool __folio_start_writeback(struct folio *folio, bool keep_write)
{
struct address_space *mapping = page_mapping(page);
int ret, access_ret;
long nr = folio_nr_pages(folio);
struct address_space *mapping = folio_mapping(folio);
bool ret;
int access_ret;
lock_page_memcg(page);
folio_memcg_lock(folio);
if (mapping && mapping_use_writeback_tags(mapping)) {
XA_STATE(xas, &mapping->i_pages, page_index(page));
XA_STATE(xas, &mapping->i_pages, folio_index(folio));
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
unsigned long flags;
xas_lock_irqsave(&xas, flags);
xas_load(&xas);
ret = TestSetPageWriteback(page);
ret = folio_test_set_writeback(folio);
if (!ret) {
bool on_wblist;
@ -2835,84 +2873,105 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
struct bdi_writeback *wb = inode_to_wb(inode);
inc_wb_stat(wb, WB_WRITEBACK);
wb_stat_mod(wb, WB_WRITEBACK, nr);
if (!on_wblist)
wb_inode_writeback_start(wb);
}
/*
* We can come through here when swapping anonymous
* pages, so we don't necessarily have an inode to track
* for sync.
* We can come through here when swapping
* anonymous folios, so we don't necessarily
* have an inode to track for sync.
*/
if (mapping->host && !on_wblist)
sb_mark_inode_writeback(mapping->host);
}
if (!PageDirty(page))
if (!folio_test_dirty(folio))
xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
if (!keep_write)
xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
xas_unlock_irqrestore(&xas, flags);
} else {
ret = TestSetPageWriteback(page);
ret = folio_test_set_writeback(folio);
}
if (!ret) {
inc_lruvec_page_state(page, NR_WRITEBACK);
inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
lruvec_stat_mod_folio(folio, NR_WRITEBACK, nr);
zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr);
}
unlock_page_memcg(page);
access_ret = arch_make_page_accessible(page);
folio_memcg_unlock(folio);
access_ret = arch_make_folio_accessible(folio);
/*
* If writeback has been triggered on a page that cannot be made
* accessible, it is too late to recover here.
*/
VM_BUG_ON_PAGE(access_ret != 0, page);
VM_BUG_ON_FOLIO(access_ret != 0, folio);
return ret;
}
EXPORT_SYMBOL(__test_set_page_writeback);
EXPORT_SYMBOL(__folio_start_writeback);
/*
* Wait for a page to complete writeback
/**
* folio_wait_writeback - Wait for a folio to finish writeback.
* @folio: The folio to wait for.
*
* If the folio is currently being written back to storage, wait for the
* I/O to complete.
*
* Context: Sleeps. Must be called in process context and with
* no spinlocks held. Caller should hold a reference on the folio.
* If the folio is not locked, writeback may start again after writeback
* has finished.
*/
void wait_on_page_writeback(struct page *page)
void folio_wait_writeback(struct folio *folio)
{
while (PageWriteback(page)) {
trace_wait_on_page_writeback(page, page_mapping(page));
wait_on_page_bit(page, PG_writeback);
while (folio_test_writeback(folio)) {
trace_folio_wait_writeback(folio, folio_mapping(folio));
folio_wait_bit(folio, PG_writeback);
}
}
EXPORT_SYMBOL_GPL(wait_on_page_writeback);
EXPORT_SYMBOL_GPL(folio_wait_writeback);
/*
* Wait for a page to complete writeback. Returns -EINTR if we get a
* fatal signal while waiting.
/**
* folio_wait_writeback_killable - Wait for a folio to finish writeback.
* @folio: The folio to wait for.
*
* If the folio is currently being written back to storage, wait for the
* I/O to complete or a fatal signal to arrive.
*
* Context: Sleeps. Must be called in process context and with
* no spinlocks held. Caller should hold a reference on the folio.
* If the folio is not locked, writeback may start again after writeback
* has finished.
* Return: 0 on success, -EINTR if we get a fatal signal while waiting.
*/
int wait_on_page_writeback_killable(struct page *page)
int folio_wait_writeback_killable(struct folio *folio)
{
while (PageWriteback(page)) {
trace_wait_on_page_writeback(page, page_mapping(page));
if (wait_on_page_bit_killable(page, PG_writeback))
while (folio_test_writeback(folio)) {
trace_folio_wait_writeback(folio, folio_mapping(folio));
if (folio_wait_bit_killable(folio, PG_writeback))
return -EINTR;
}
return 0;
}
EXPORT_SYMBOL_GPL(wait_on_page_writeback_killable);
EXPORT_SYMBOL_GPL(folio_wait_writeback_killable);
/**
* wait_for_stable_page() - wait for writeback to finish, if necessary.
* @page: The page to wait on.
* folio_wait_stable() - wait for writeback to finish, if necessary.
* @folio: The folio to wait on.
*
* This function determines if the given page is related to a backing device
* that requires page contents to be held stable during writeback. If so, then
* it will wait for any pending writeback to complete.
* This function determines if the given folio is related to a backing
* device that requires folio contents to be held stable during writeback.
* If so, then it will wait for any pending writeback to complete.
*
* Context: Sleeps. Must be called in process context and with
* no spinlocks held. Caller should hold a reference on the folio.
* If the folio is not locked, writeback may start again after writeback
* has finished.
*/
void wait_for_stable_page(struct page *page)
void folio_wait_stable(struct folio *folio)
{
page = thp_head(page);
if (page->mapping->host->i_sb->s_iflags & SB_I_STABLE_WRITES)
wait_on_page_writeback(page);
if (folio_inode(folio)->i_sb->s_iflags & SB_I_STABLE_WRITES)
folio_wait_writeback(folio);
}
EXPORT_SYMBOL_GPL(wait_for_stable_page);
EXPORT_SYMBOL_GPL(folio_wait_stable);

View File

@ -19,6 +19,7 @@
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
@ -63,6 +64,7 @@
#include <linux/sched/rt.h>
#include <linux/sched/mm.h>
#include <linux/page_owner.h>
#include <linux/page_table_check.h>
#include <linux/kthread.h>
#include <linux/memcontrol.h>
#include <linux/ftrace.h>
@ -72,6 +74,7 @@
#include <linux/padata.h>
#include <linux/khugepaged.h>
#include <linux/buffer_head.h>
#include <linux/delayacct.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@ -190,6 +193,27 @@ EXPORT_SYMBOL(init_on_alloc);
DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
EXPORT_SYMBOL(init_on_free);
#define ALLOC_IN_CMA_THRESHOLD_MAX 16
#define ALLOC_IN_CMA_THRESHOLD_DEFAULT 12
static unsigned long _alloc_in_cma_threshold __read_mostly
= ALLOC_IN_CMA_THRESHOLD_DEFAULT;
static int __init alloc_in_cma_threshold_setup(char *buf)
{
unsigned long res;
if (kstrtoul(buf, 10, &res) < 0 ||
res > ALLOC_IN_CMA_THRESHOLD_MAX) {
pr_err("Bad alloc_cma_threshold value\n");
return 0;
}
_alloc_in_cma_threshold = res;
pr_info("Setting alloc_in_cma_threshold to %lu\n", res);
return 0;
}
early_param("alloc_in_cma_threshold", alloc_in_cma_threshold_setup);
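
With the default of 12/16, movable allocations start falling back to CMA once free CMA pages exceed roughly 75% of a zone's free pages, rather than the fixed 50% used before. A short sketch of the arithmetic with illustrative numbers:

/*
 * Illustrative check only: with _alloc_in_cma_threshold = 12 and
 * ALLOC_IN_CMA_THRESHOLD_MAX = 16, a zone with 1000 free pages prefers
 * CMA once more than 1000 / 16 * 12 = 744 of them are CMA pages.
 * Booting with alloc_in_cma_threshold=8 restores the old 50% cut-over.
 */
static bool sketch_prefer_cma(unsigned long free_cma, unsigned long free_pages)
{
	return free_cma > free_pages / ALLOC_IN_CMA_THRESHOLD_MAX *
			  _alloc_in_cma_threshold;
}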
static bool _init_on_alloc_enabled_early __read_mostly
= IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
static int __init early_init_on_alloc(char *buf)
@ -677,10 +701,8 @@ static inline int pindex_to_order(unsigned int pindex)
int order = pindex / MIGRATE_PCPTYPES;
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (order > PAGE_ALLOC_COSTLY_ORDER) {
if (order > PAGE_ALLOC_COSTLY_ORDER)
order = pageblock_order;
VM_BUG_ON(order != pageblock_order);
}
#else
VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
#endif
@ -724,27 +746,37 @@ static inline void free_the_page(struct page *page, unsigned int order)
void free_compound_page(struct page *page)
{
mem_cgroup_uncharge(page);
mem_cgroup_uncharge(page_folio(page));
free_the_page(page, compound_order(page));
}
static void prep_compound_head(struct page *page, unsigned int order)
{
set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
set_compound_order(page, order);
atomic_set(compound_mapcount_ptr(page), -1);
if (hpage_pincount_available(page))
atomic_set(compound_pincount_ptr(page), 0);
}
static void prep_compound_tail(struct page *head, int tail_idx)
{
struct page *p = head + tail_idx;
p->mapping = TAIL_MAPPING;
set_compound_head(p, head);
}
void prep_compound_page(struct page *page, unsigned int order)
{
int i;
int nr_pages = 1 << order;
__SetPageHead(page);
for (i = 1; i < nr_pages; i++) {
struct page *p = page + i;
p->mapping = TAIL_MAPPING;
set_compound_head(p, page);
}
for (i = 1; i < nr_pages; i++)
prep_compound_tail(page, i);
set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
set_compound_order(page, order);
atomic_set(compound_mapcount_ptr(page), -1);
if (hpage_pincount_available(page))
atomic_set(compound_pincount_ptr(page), 0);
prep_compound_head(page, order);
}
#ifdef CONFIG_DEBUG_PAGEALLOC
@ -1299,6 +1331,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
if (memcg_kmem_enabled() && PageMemcgKmem(page))
__memcg_kmem_uncharge_page(page, order);
reset_page_owner(page, order);
page_table_check_free(page, order);
return false;
}
@ -1338,6 +1371,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
page_cpupid_reset_last(page);
page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
reset_page_owner(page, order);
page_table_check_free(page, order);
if (!PageHighMem(page)) {
debug_check_no_locks_freed(page_address(page),
@ -1430,14 +1464,8 @@ static inline void prefetch_buddy(struct page *page)
/*
* Frees a number of pages from the PCP lists
* Assumes all pages on list are in same zone, and of same order.
* Assumes all pages on list are in same zone.
* count is the number of pages to free.
*
* If the zone was previously in an "all pages pinned" state then look to
* see if this freeing clears that state.
*
* And clear the zone's pages_scanned counter, to hold off the "all pages are
* pinned" detection logic.
*/
static void free_pcppages_bulk(struct zone *zone, int count,
struct per_cpu_pages *pcp)
@ -1591,7 +1619,7 @@ static void __meminit init_reserved_page(unsigned long pfn)
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
struct zone *zone = &pgdat->node_zones[zid];
if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
if (zone_spans_pfn(zone, pfn))
break;
}
__init_single_page(pfn_to_page(pfn), pfn, zid, nid);
@ -2418,6 +2446,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
}
set_page_owner(page, order, gfp_flags);
page_table_check_alloc(page, order);
}
static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
@ -2980,12 +3009,13 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype,
if (IS_ENABLED(CONFIG_CMA)) {
/*
* Balance movable allocations between regular and CMA areas by
* allocating from CMA when over half of the zone's free memory
* is in the CMA area.
* allocating from CMA when more than a given proportion of
* the zone's free memory is in the CMA area.
*/
if (alloc_flags & ALLOC_CMA &&
zone_page_state(zone, NR_FREE_CMA_PAGES) >
zone_page_state(zone, NR_FREE_PAGES) / 2) {
zone_page_state(zone, NR_FREE_PAGES) / ALLOC_IN_CMA_THRESHOLD_MAX
* _alloc_in_cma_threshold) {
page = __rmqueue_cma_fallback(zone, order);
if (page)
goto out;
@ -3149,9 +3179,9 @@ static void drain_local_pages_wq(struct work_struct *work)
* cpu which is alright but we also have to make sure to not move to
* a different one.
*/
preempt_disable();
migrate_disable();
drain_local_pages(drain->zone);
preempt_enable();
migrate_enable();
}
/*
@ -3968,6 +3998,8 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
}
#ifdef CONFIG_NUMA
int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
@ -4356,6 +4388,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
return NULL;
psi_memstall_enter(&pflags);
delayacct_compact_start();
noreclaim_flag = memalloc_noreclaim_save();
*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
@ -4363,6 +4396,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
memalloc_noreclaim_restore(noreclaim_flag);
psi_memstall_leave(&pflags);
delayacct_compact_end();
if (*compact_result == COMPACT_SKIPPED)
return NULL;
@ -4799,30 +4833,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
trace_reclaim_retry_zone(z, order, reclaimable,
available, min_wmark, *no_progress_loops, wmark);
if (wmark) {
/*
* If we didn't make any progress and have a lot of
* dirty + writeback pages then we should wait for
* an IO to complete to slow down the reclaim and
* prevent from pre mature OOM
*/
if (!did_some_progress) {
unsigned long write_pending;
write_pending = zone_page_state_snapshot(zone,
NR_ZONE_WRITE_PENDING);
if (2 * write_pending > reclaimable) {
congestion_wait(BLK_RW_ASYNC, HZ/10);
return true;
}
}
ret = true;
goto out;
break;
}
}
out:
/*
* Memory allocation/reclaim might be called from a WQ context and the
* current implementation of the WQ concurrency control doesn't
@ -4918,6 +4933,19 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
if (!ac->preferred_zoneref->zone)
goto nopage;
/*
* Check for insane configurations where the cpuset doesn't contain
* any suitable zone to satisfy the request - e.g. non-movable
* GFP_HIGHUSER allocations from MOVABLE nodes only.
*/
if (cpusets_insane_config() && (gfp_mask & __GFP_HARDWALL)) {
struct zoneref *z = first_zones_zonelist(ac->zonelist,
ac->highest_zoneidx,
&cpuset_current_mems_allowed);
if (!z->zone)
goto nopage;
}
if (alloc_flags & ALLOC_KSWAPD)
wake_all_kswapds(order, gfp_mask, ac);
@ -5408,6 +5436,18 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
}
EXPORT_SYMBOL(__alloc_pages);
struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid,
nodemask_t *nodemask)
{
struct page *page = __alloc_pages(gfp | __GFP_COMP, order,
preferred_nid, nodemask);
if (page && order > 1)
prep_transhuge_page(page);
return (struct folio *)page;
}
EXPORT_SYMBOL(__folio_alloc);
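
A hedged usage sketch, assuming the folio_alloc()/folio_put() wrappers built on top of this allocator entry point:

/* Allocate a single-page folio, use its mapping-agnostic helpers, drop it. */
static int sketch_folio_alloc_demo(void)
{
	struct folio *folio = folio_alloc(GFP_KERNEL, 0);

	if (!folio)
		return -ENOMEM;
	memset(folio_address(folio), 0, folio_size(folio));
	folio_put(folio);
	return 0;
}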
/*
* Common helper functions. Never use with __GFP_HIGHMEM because the returned
* address cannot represent highmem pages. Use alloc_pages and then kmap if
@ -5620,8 +5660,8 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
unsigned int order = get_order(size);
unsigned long addr;
if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
gfp_mask &= ~__GFP_COMP;
if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM)))
gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM);
addr = __get_free_pages(gfp_mask, order);
return make_alloc_exact(addr, order, size);
@ -5645,8 +5685,8 @@ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
unsigned int order = get_order(size);
struct page *p;
if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
gfp_mask &= ~__GFP_COMP;
if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM)))
gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM);
p = alloc_pages_node(nid, gfp_mask, order);
if (!p)
@ -5988,6 +6028,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
printk(KERN_CONT
"%s"
" free:%lukB"
" boost:%lukB"
" min:%lukB"
" low:%lukB"
" high:%lukB"
@ -6008,6 +6049,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
"\n",
zone->name,
K(zone_page_state(zone, NR_FREE_PAGES)),
K(zone->watermark_boost),
K(min_wmark_pages(zone)),
K(low_wmark_pages(zone)),
K(high_wmark_pages(zone)),
@ -6263,7 +6305,7 @@ static void build_zonelists(pg_data_t *pgdat)
*/
if (node_distance(local_node, node) !=
node_distance(local_node, prev_node))
node_load[node] = load;
node_load[node] += load;
node_order[nr_nodes++] = node;
prev_node = node;
@ -6272,6 +6314,10 @@ static void build_zonelists(pg_data_t *pgdat)
build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
build_thisnode_zonelists(pgdat);
pr_info("Fallback order for Node %d: ", local_node);
for (node = 0; node < nr_nodes; node++)
pr_cont("%d ", node_order[node]);
pr_cont("\n");
}
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
@ -6558,6 +6604,75 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone
}
#ifdef CONFIG_ZONE_DEVICE
static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
unsigned long zone_idx, int nid,
struct dev_pagemap *pgmap)
{
__init_single_page(page, pfn, zone_idx, nid);
/*
* Mark page reserved as it will need to wait for onlining
* phase for it to be fully associated with a zone.
*
* We can use the non-atomic __set_bit operation for setting
* the flag as we are still initializing the pages.
*/
__SetPageReserved(page);
/*
* ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
* and zone_device_data. It is a bug if a ZONE_DEVICE page is
* ever freed or placed on a driver-private list.
*/
page->pgmap = pgmap;
page->zone_device_data = NULL;
/*
* Mark the block movable so that blocks are reserved for
* movable at startup. This will force kernel allocations
* to reserve their blocks rather than leaking throughout
* the address space during boot when many long-lived
* kernel allocations are made.
*
* Please note that MEMINIT_HOTPLUG path doesn't clear memmap
* because this is done early in section_activate()
*/
if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
cond_resched();
}
}
static void __ref memmap_init_compound(struct page *head,
unsigned long head_pfn,
unsigned long zone_idx, int nid,
struct dev_pagemap *pgmap,
unsigned long nr_pages)
{
unsigned long pfn, end_pfn = head_pfn + nr_pages;
unsigned int order = pgmap->vmemmap_shift;
__SetPageHead(head);
for (pfn = head_pfn + 1; pfn < end_pfn; pfn++) {
struct page *page = pfn_to_page(pfn);
__init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
prep_compound_tail(head, pfn - head_pfn);
set_page_count(page, 0);
/*
* The first tail page stores compound_mapcount_ptr() and
* compound_order() and the second tail page stores
* compound_pincount_ptr(). Call prep_compound_head() after
* the first and second tail pages have been initialized to
* not have the data overwritten.
*/
if (pfn == head_pfn + 2)
prep_compound_head(head, order);
}
}
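
For context, pfns_per_compound comes from pgmap_vmemmap_nr(), which is assumed here to expand to 1UL << pgmap->vmemmap_shift; a driver opts into compound ZONE_DEVICE metadata roughly like this, with illustrative values:

/* Sketch: request 2MB compound ZONE_DEVICE metadata before mapping. */
static void sketch_enable_compound_pages(struct dev_pagemap *pgmap)
{
	/* 512 base pages per compound page on a 4K PAGE_SIZE system. */
	pgmap->vmemmap_shift = PMD_SHIFT - PAGE_SHIFT;
}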
void __ref memmap_init_zone_device(struct zone *zone,
unsigned long start_pfn,
unsigned long nr_pages,
@ -6566,6 +6681,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
unsigned long pfn, end_pfn = start_pfn + nr_pages;
struct pglist_data *pgdat = zone->zone_pgdat;
struct vmem_altmap *altmap = pgmap_altmap(pgmap);
unsigned int pfns_per_compound = pgmap_vmemmap_nr(pgmap);
unsigned long zone_idx = zone_idx(zone);
unsigned long start = jiffies;
int nid = pgdat->node_id;
@ -6583,42 +6699,16 @@ void __ref memmap_init_zone_device(struct zone *zone,
nr_pages = end_pfn - start_pfn;
}
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
for (pfn = start_pfn; pfn < end_pfn; pfn += pfns_per_compound) {
struct page *page = pfn_to_page(pfn);
__init_single_page(page, pfn, zone_idx, nid);
__init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
/*
* Mark page reserved as it will need to wait for onlining
* phase for it to be fully associated with a zone.
*
* We can use the non-atomic __set_bit operation for setting
* the flag as we are still initializing the pages.
*/
__SetPageReserved(page);
if (pfns_per_compound == 1)
continue;
/*
* ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
* and zone_device_data. It is a bug if a ZONE_DEVICE page is
* ever freed or placed on a driver-private list.
*/
page->pgmap = pgmap;
page->zone_device_data = NULL;
/*
* Mark the block movable so that blocks are reserved for
* movable at startup. This will force kernel allocations
* to reserve their blocks rather than leaking throughout
* the address space during boot when many long-lived
* kernel allocations are made.
*
* Please note that MEMINIT_HOTPLUG path doesn't clear memmap
* because this is done early in section_activate()
*/
if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
cond_resched();
}
memmap_init_compound(page, pfn, zone_idx, nid, pgmap,
pfns_per_compound);
}
pr_info("%s initialised %lu pages in %ums\n", __func__,
@ -7397,6 +7487,8 @@ static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
{
int i;
pgdat_resize_init(pgdat);
pgdat_init_split_queue(pgdat);
@ -7405,6 +7497,9 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
init_waitqueue_head(&pgdat->kswapd_wait);
init_waitqueue_head(&pgdat->pfmemalloc_wait);
for (i = 0; i < NR_VMSCAN_THROTTLE; i++)
init_waitqueue_head(&pgdat->reclaim_wait[i]);
pgdat_page_ext_init(pgdat);
lruvec_init(&pgdat->__lruvec);
}
@ -8134,8 +8229,7 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char
}
if (pages && s)
pr_info("Freeing %s memory: %ldK\n",
s, pages << (PAGE_SHIFT - 10));
pr_info("Freeing %s memory: %ldK\n", s, K(pages));
return pages;
}
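The hunk above and the mem_init_print_info() hunk below replace open-coded page-to-KiB shifts with the K() helper. A minimal standalone sketch of that conversion, assuming 4 KiB pages and the conventional definition of K(); this is not code from the commit:

#include <stdio.h>

#define PAGE_SHIFT	12				/* assumed: 4 KiB pages */
#define K(x)		((x) << (PAGE_SHIFT - 10))	/* pages -> KiB */

int main(void)
{
	unsigned long pages = 25600;	/* 100 MiB worth of 4 KiB pages */

	/* prints the same value the old "pages << (PAGE_SHIFT - 10)" did */
	printf("Freeing test memory: %luK\n", K(pages));
	return 0;
}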
@ -8162,7 +8256,7 @@ void __init mem_init_print_info(void)
*/
#define adj_init_size(start, end, size, pos, adj) \
do { \
if (start <= pos && pos < end && size > adj) \
if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \
size -= adj; \
} while (0)
@ -8180,14 +8274,13 @@ void __init mem_init_print_info(void)
", %luK highmem"
#endif
")\n",
nr_free_pages() << (PAGE_SHIFT - 10),
physpages << (PAGE_SHIFT - 10),
K(nr_free_pages()), K(physpages),
codesize >> 10, datasize >> 10, rosize >> 10,
(init_data_size + init_code_size) >> 10, bss_size >> 10,
(physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10),
totalcma_pages << (PAGE_SHIFT - 10)
K(physpages - totalram_pages() - totalcma_pages),
K(totalcma_pages)
#ifdef CONFIG_HIGHMEM
, totalhigh_pages() << (PAGE_SHIFT - 10)
, K(totalhigh_pages())
#endif
);
}
@ -8460,7 +8553,7 @@ void setup_per_zone_wmarks(void)
* 8192MB: 11584k
* 16384MB: 16384k
*/
int __meminit init_per_zone_wmark_min(void)
void calculate_min_free_kbytes(void)
{
unsigned long lowmem_kbytes;
int new_min_free_kbytes;
@ -8468,16 +8561,17 @@ int __meminit init_per_zone_wmark_min(void)
lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
if (new_min_free_kbytes > user_min_free_kbytes) {
min_free_kbytes = new_min_free_kbytes;
if (min_free_kbytes < 128)
min_free_kbytes = 128;
if (min_free_kbytes > 262144)
min_free_kbytes = 262144;
} else {
if (new_min_free_kbytes > user_min_free_kbytes)
min_free_kbytes = clamp(new_min_free_kbytes, 128, 262144);
else
pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
new_min_free_kbytes, user_min_free_kbytes);
}
}
int __meminit init_per_zone_wmark_min(void)
{
calculate_min_free_kbytes();
setup_per_zone_wmarks();
refresh_zone_stat_thresholds();
setup_per_zone_lowmem_reserve();
@ -8764,7 +8858,8 @@ void *__init alloc_large_system_hash(const char *tablename,
} else if (get_order(size) >= MAX_ORDER || hashdist) {
table = __vmalloc(size, gfp_flags);
virt = true;
huge = is_vm_area_hugepages(table);
if (table)
huge = is_vm_area_hugepages(table);
} else {
/*
* If bucketsize is not a power-of-two, we may free
@ -9205,8 +9300,8 @@ static bool zone_spans_last_pfn(const struct zone *zone,
* for allocation requests which can not be fulfilled with the buddy allocator.
*
* The allocated memory is always aligned to a page boundary. If nr_pages is a
* power of two then the alignment is guaranteed to be to the given nr_pages
* (e.g. 1GB request would be aligned to 1GB).
* power of two, then the allocated range is also guaranteed to be aligned to
* the same nr_pages (e.g. 1GB request would be aligned to 1GB).
*
* Allocated pages can be freed with free_contig_range() or by manually calling
* __free_page() on each allocated page.
@ -9361,21 +9456,21 @@ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
}
#endif
/*
* This function returns a stable result only if called under zone lock.
*/
bool is_free_buddy_page(struct page *page)
{
struct zone *zone = page_zone(page);
unsigned long pfn = page_to_pfn(page);
unsigned long flags;
unsigned int order;
spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
struct page *page_head = page - (pfn & ((1 << order) - 1));
if (PageBuddy(page_head) && buddy_order(page_head) >= order)
if (PageBuddy(page_head) &&
buddy_order_unsafe(page_head) >= order)
break;
}
spin_unlock_irqrestore(&zone->lock, flags);
return order < MAX_ORDER;
}
@ -9439,6 +9534,7 @@ bool take_page_off_buddy(struct page *page)
del_page_from_free_list(page_head, zone, page_order);
break_down_buddy_pages(zone, page_head, page, 0,
page_order, migratetype);
SetPageHWPoisonTakenOff(page);
if (!is_migrate_isolate(migratetype))
__mod_zone_freepage_state(zone, -1, migratetype);
ret = true;
@ -9450,6 +9546,31 @@ bool take_page_off_buddy(struct page *page)
spin_unlock_irqrestore(&zone->lock, flags);
return ret;
}
/*
* Cancel takeoff done by take_page_off_buddy().
*/
bool put_page_back_buddy(struct page *page)
{
struct zone *zone = page_zone(page);
unsigned long pfn = page_to_pfn(page);
unsigned long flags;
int migratetype = get_pfnblock_migratetype(page, pfn);
bool ret = false;
spin_lock_irqsave(&zone->lock, flags);
if (put_page_testzero(page)) {
ClearPageHWPoisonTakenOff(page);
__free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE);
if (TestClearPageHWPoison(page)) {
num_poisoned_pages_dec();
ret = true;
}
}
spin_unlock_irqrestore(&zone->lock, flags);
return ret;
}
#endif
#ifdef CONFIG_ZONE_DMA


@ -120,7 +120,6 @@ bool page_counter_try_charge(struct page_counter *counter,
new = atomic_long_add_return(nr_pages, &c->usage);
if (new > c->max) {
atomic_long_sub(nr_pages, &c->usage);
propagate_protected_usage(c, new);
/*
* This is racy, but we can live with some
* inaccuracy in the failcnt which is only used


@ -8,6 +8,7 @@
#include <linux/kmemleak.h>
#include <linux/page_owner.h>
#include <linux/page_idle.h>
#include <linux/page_table_check.h>
/*
* struct page extension
@ -63,18 +64,21 @@ static bool need_page_idle(void)
{
return true;
}
struct page_ext_operations page_idle_ops = {
static struct page_ext_operations page_idle_ops __initdata = {
.need = need_page_idle,
};
#endif
static struct page_ext_operations *page_ext_ops[] = {
static struct page_ext_operations *page_ext_ops[] __initdata = {
#ifdef CONFIG_PAGE_OWNER
&page_owner_ops,
#endif
#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
&page_idle_ops,
#endif
#ifdef CONFIG_PAGE_TABLE_CHECK
&page_table_check_ops,
#endif
};
unsigned long page_ext_size = sizeof(struct page_ext);
@ -201,7 +205,7 @@ void __init page_ext_init_flatmem(void)
panic("Out of memory");
}
#else /* CONFIG_FLATMEM */
#else /* CONFIG_SPARSEMEM */
struct page_ext *lookup_page_ext(const struct page *page)
{


@ -25,6 +25,7 @@
#include <linux/psi.h>
#include <linux/uio.h>
#include <linux/sched/task.h>
#include <linux/delayacct.h>
void end_swap_bio_write(struct bio *bio)
{
@ -38,7 +39,7 @@ void end_swap_bio_write(struct bio *bio)
* Also print a dire warning that things will go BAD (tm)
* very quickly.
*
* Also clear PG_reclaim to avoid rotate_reclaimable_page()
* Also clear PG_reclaim to avoid folio_rotate_reclaimable()
*/
set_page_dirty(page);
pr_alert_ratelimited("Write-error on swap-device (%u:%u:%llu)\n",
@ -317,7 +318,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
* temporary failure if the system has limited
* memory for allocating transmit buffers.
* Mark the page dirty and avoid
* rotate_reclaimable_page but rate-limit the
* folio_rotate_reclaimable but rate-limit the
* messages but do not flag PageError like
* the normal direct-to-bio case as it could
* be temporary.
@ -358,8 +359,6 @@ int swap_readpage(struct page *page, bool synchronous)
struct bio *bio;
int ret = 0;
struct swap_info_struct *sis = page_swap_info(page);
blk_qc_t qc;
struct gendisk *disk;
unsigned long pflags;
VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page);
@ -372,6 +371,7 @@ int swap_readpage(struct page *page, bool synchronous)
* significant part of overall IO time.
*/
psi_memstall_enter(&pflags);
delayacct_swapin_start();
if (frontswap_load(page) == 0) {
SetPageUptodate(page);
@ -409,26 +409,24 @@ int swap_readpage(struct page *page, bool synchronous)
bio->bi_iter.bi_sector = swap_page_sector(page);
bio->bi_end_io = end_swap_bio_read;
bio_add_page(bio, page, thp_size(page), 0);
disk = bio->bi_bdev->bd_disk;
/*
* Keep this task valid during swap readpage because the oom killer may
* attempt to access it in the page fault retry time check.
*/
if (synchronous) {
bio->bi_opf |= REQ_HIPRI;
bio->bi_opf |= REQ_POLLED;
get_task_struct(current);
bio->bi_private = current;
}
count_vm_event(PSWPIN);
bio_get(bio);
qc = submit_bio(bio);
submit_bio(bio);
while (synchronous) {
set_current_state(TASK_UNINTERRUPTIBLE);
if (!READ_ONCE(bio->bi_private))
break;
if (!blk_poll(disk->queue, qc, true))
if (!bio_poll(bio, NULL, 0))
blk_io_schedule();
}
__set_current_state(TASK_RUNNING);
@ -436,6 +434,7 @@ int swap_readpage(struct page *page, bool synchronous)
out:
psi_memstall_leave(&pflags);
delayacct_swapin_end();
return ret;
}


@ -94,8 +94,13 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
buddy = page + (buddy_pfn - pfn);
if (!is_migrate_isolate_page(buddy)) {
__isolate_free_page(page, order);
isolated_page = true;
isolated_page = !!__isolate_free_page(page, order);
/*
* Isolating a free page in an isolated pageblock
* is expected to always work as watermarks don't
* apply here.
*/
VM_WARN_ON(!isolated_page);
}
}
}
@ -183,7 +188,6 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
unsigned migratetype, int flags)
{
unsigned long pfn;
unsigned long undo_pfn;
struct page *page;
BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
@ -193,25 +197,12 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
pfn < end_pfn;
pfn += pageblock_nr_pages) {
page = __first_valid_page(pfn, pageblock_nr_pages);
if (page) {
if (set_migratetype_isolate(page, migratetype, flags)) {
undo_pfn = pfn;
goto undo;
}
if (page && set_migratetype_isolate(page, migratetype, flags)) {
undo_isolate_page_range(start_pfn, pfn, migratetype);
return -EBUSY;
}
}
return 0;
undo:
for (pfn = start_pfn;
pfn < undo_pfn;
pfn += pageblock_nr_pages) {
struct page *page = pfn_to_online_page(pfn);
if (!page)
continue;
unset_migratetype_isolate(page, migratetype);
}
return -EBUSY;
}
/*


@ -46,7 +46,7 @@ static int __init early_page_owner_param(char *buf)
}
early_param("page_owner", early_page_owner_param);
static bool need_page_owner(void)
static __init bool need_page_owner(void)
{
return page_owner_enabled;
}
@ -75,11 +75,13 @@ static noinline void register_early_stack(void)
early_handle = create_dummy_stack();
}
static void init_page_owner(void)
static __init void init_page_owner(void)
{
if (!page_owner_enabled)
return;
stack_depot_init();
register_dummy_stack();
register_failure_stack();
register_early_stack();
@ -125,7 +127,7 @@ static noinline depot_stack_handle_t save_stack(gfp_t flags)
return handle;
}
void __reset_page_owner(struct page *page, unsigned int order)
void __reset_page_owner(struct page *page, unsigned short order)
{
int i;
struct page_ext *page_ext;
@ -149,7 +151,7 @@ void __reset_page_owner(struct page *page, unsigned int order)
static inline void __set_page_owner_handle(struct page_ext *page_ext,
depot_stack_handle_t handle,
unsigned int order, gfp_t gfp_mask)
unsigned short order, gfp_t gfp_mask)
{
struct page_owner *page_owner;
int i;
@ -169,7 +171,7 @@ static inline void __set_page_owner_handle(struct page_ext *page_ext,
}
}
noinline void __set_page_owner(struct page *page, unsigned int order,
noinline void __set_page_owner(struct page *page, unsigned short order,
gfp_t gfp_mask)
{
struct page_ext *page_ext = lookup_page_ext(page);
@ -210,10 +212,10 @@ void __split_page_owner(struct page *page, unsigned int nr)
}
}
void __copy_page_owner(struct page *oldpage, struct page *newpage)
void __folio_copy_owner(struct folio *newfolio, struct folio *old)
{
struct page_ext *old_ext = lookup_page_ext(oldpage);
struct page_ext *new_ext = lookup_page_ext(newpage);
struct page_ext *old_ext = lookup_page_ext(&old->page);
struct page_ext *new_ext = lookup_page_ext(&newfolio->page);
struct page_owner *old_page_owner, *new_page_owner;
if (unlikely(!old_ext || !new_ext))
@ -231,11 +233,11 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage)
new_page_owner->free_ts_nsec = old_page_owner->ts_nsec;
/*
* We don't clear the bit on the oldpage as it's going to be freed
* We don't clear the bit on the old folio as it's going to be freed
* after migration. Until then, the info can be useful in case of
* a bug, and the overall stats will be off a bit only temporarily.
* Also, migrate_misplaced_transhuge_page() can still fail the
* migration and then we want the oldpage to retain the info. But
* migration and then we want the old folio to retain the info. But
* in that case we also don't need to explicitly clear the info from
* the new page, which will be freed.
*/
@ -329,8 +331,6 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
depot_stack_handle_t handle)
{
int ret, pageblock_mt, page_mt;
unsigned long *entries;
unsigned int nr_entries;
char *kbuf;
count = min_t(size_t, count, PAGE_SIZE);
@ -351,18 +351,17 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
pageblock_mt = get_pageblock_migratetype(page);
page_mt = gfp_migratetype(page_owner->gfp_mask);
ret += snprintf(kbuf + ret, count - ret,
"PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n",
"PFN %lu type %s Block %lu type %s Flags %pGp\n",
pfn,
migratetype_names[page_mt],
pfn >> pageblock_order,
migratetype_names[pageblock_mt],
page->flags, &page->flags);
&page->flags);
if (ret >= count)
goto err;
nr_entries = stack_depot_fetch(handle, &entries);
ret += stack_trace_snprint(kbuf + ret, count - ret, entries, nr_entries, 0);
ret += stack_depot_snprint(handle, kbuf + ret, count - ret, 0);
if (ret >= count)
goto err;
@ -394,8 +393,6 @@ void __dump_page_owner(const struct page *page)
struct page_ext *page_ext = lookup_page_ext(page);
struct page_owner *page_owner;
depot_stack_handle_t handle;
unsigned long *entries;
unsigned int nr_entries;
gfp_t gfp_mask;
int mt;
@ -423,20 +420,17 @@ void __dump_page_owner(const struct page *page)
page_owner->pid, page_owner->ts_nsec, page_owner->free_ts_nsec);
handle = READ_ONCE(page_owner->handle);
if (!handle) {
if (!handle)
pr_alert("page_owner allocation stack trace missing\n");
} else {
nr_entries = stack_depot_fetch(handle, &entries);
stack_trace_print(entries, nr_entries, 0);
}
else
stack_depot_print(handle);
handle = READ_ONCE(page_owner->free_handle);
if (!handle) {
pr_alert("page_owner free stack trace missing\n");
} else {
nr_entries = stack_depot_fetch(handle, &entries);
pr_alert("page last free stack trace:\n");
stack_trace_print(entries, nr_entries, 0);
stack_depot_print(handle);
}
if (page_owner->last_migrate_reason != -1)


@ -113,6 +113,24 @@ static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk)
return pcpu_nr_pages_to_map_bits(chunk->nr_pages);
}
#ifdef CONFIG_MEMCG_KMEM
/**
* pcpu_obj_full_size - helper to calculate size of each accounted object
* @size: size of area to allocate in bytes
*
* For each accounted object there is an extra space which is used to store
* obj_cgroup membership. Charge it too.
*/
static inline size_t pcpu_obj_full_size(size_t size)
{
size_t extra_size;
extra_size = size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *);
return size * num_possible_cpus() + extra_size;
}
#endif /* CONFIG_MEMCG_KMEM */
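As a rough standalone illustration of the charge size computed above, with num_possible_cpus() and the obj_cgroup pointer replaced by assumed example values (a 64-bit, 4-CPU configuration); this is a sketch, not kernel code:

#include <stdio.h>

#define PCPU_MIN_ALLOC_SIZE	4	/* assumed minimum allocation unit */

static size_t obj_full_size(size_t size, unsigned int nr_possible_cpus)
{
	/* one cgroup pointer per minimum-sized sub-object, charged as well */
	size_t extra_size = size / PCPU_MIN_ALLOC_SIZE * sizeof(void *);

	return size * nr_possible_cpus + extra_size;
}

int main(void)
{
	/* 64-byte per-cpu object, 4 CPUs: 64*4 + 16*8 = 384 bytes charged */
	printf("%zu bytes charged\n", obj_full_size(64, 4));
	return 0;
}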
#ifdef CONFIG_PERCPU_STATS
#include <linux/spinlock.h>


@ -779,7 +779,7 @@ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
{
struct pcpu_block_md *block = chunk->md_blocks + index;
unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
unsigned int rs, re, start; /* region start, region end */
unsigned int start, end; /* region start, region end */
/* promote scan_hint to contig_hint */
if (block->scan_hint) {
@ -795,9 +795,8 @@ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
block->right_free = 0;
/* iterate over free areas and update the contig hints */
bitmap_for_each_clear_region(alloc_map, rs, re, start,
PCPU_BITMAP_BLOCK_BITS)
pcpu_block_update(block, rs, re);
for_each_clear_bitrange_from(start, end, alloc_map, PCPU_BITMAP_BLOCK_BITS)
pcpu_block_update(block, start, end);
}
/**
@ -1070,17 +1069,18 @@ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
int *next_off)
{
unsigned int page_start, page_end, rs, re;
unsigned int start, end;
page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
rs = page_start;
bitmap_next_clear_region(chunk->populated, &rs, &re, page_end);
if (rs >= page_end)
start = find_next_zero_bit(chunk->populated, end, start);
if (start >= end)
return true;
*next_off = re * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
end = find_next_bit(chunk->populated, end, start + 1);
*next_off = end * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
return false;
}
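For readers unfamiliar with the bitmap helpers, the rewritten check amounts to the loop below; a sketch with a plain bool array standing in for chunk->populated and without the page/unit conversion, not the kernel implementation:

#include <stdbool.h>
#include <stdio.h>

/* true if bits [start, end) are all set; otherwise report where the
 * first clear run ends via *next_off */
static bool range_is_populated(const bool *populated, unsigned int start,
			       unsigned int end, unsigned int *next_off)
{
	unsigned int i = start;

	while (i < end && populated[i])		/* find_next_zero_bit() */
		i++;
	if (i >= end)
		return true;
	while (i < end && !populated[i])	/* find_next_bit() */
		i++;
	*next_off = i;
	return false;
}

int main(void)
{
	bool populated[8] = { true, true, false, false, true, true, true, true };
	unsigned int next;

	if (!range_is_populated(populated, 0, 8, &next))
		printf("first clear run ends at bit %u\n", next);	/* 4 */
	return 0;
}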
@ -1635,7 +1635,7 @@ static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
if (!objcg)
return true;
if (obj_cgroup_charge(objcg, gfp, size * num_possible_cpus())) {
if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size))) {
obj_cgroup_put(objcg);
return false;
}
@ -1656,10 +1656,10 @@ static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
rcu_read_lock();
mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
size * num_possible_cpus());
pcpu_obj_full_size(size));
rcu_read_unlock();
} else {
obj_cgroup_uncharge(objcg, size * num_possible_cpus());
obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size));
obj_cgroup_put(objcg);
}
}
@ -1676,11 +1676,11 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
return;
chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = NULL;
obj_cgroup_uncharge(objcg, size * num_possible_cpus());
obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size));
rcu_read_lock();
mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
-(size * num_possible_cpus()));
-pcpu_obj_full_size(size));
rcu_read_unlock();
obj_cgroup_put(objcg);
@ -1851,13 +1851,12 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
/* populate if not all pages are already there */
if (!is_atomic) {
unsigned int page_start, page_end, rs, re;
unsigned int page_end, rs, re;
page_start = PFN_DOWN(off);
rs = PFN_DOWN(off);
page_end = PFN_UP(off + size);
bitmap_for_each_clear_region(chunk->populated, rs, re,
page_start, page_end) {
for_each_clear_bitrange_from(rs, re, chunk->populated, page_end) {
WARN_ON(chunk->immutable);
ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp);
@ -2013,8 +2012,7 @@ static void pcpu_balance_free(bool empty_only)
list_for_each_entry_safe(chunk, next, &to_free, list) {
unsigned int rs, re;
bitmap_for_each_set_region(chunk->populated, rs, re, 0,
chunk->nr_pages) {
for_each_set_bitrange(rs, re, chunk->populated, chunk->nr_pages) {
pcpu_depopulate_chunk(chunk, rs, re);
spin_lock_irq(&pcpu_lock);
pcpu_chunk_depopulated(chunk, rs, re);
@ -2084,8 +2082,7 @@ static void pcpu_balance_populated(void)
continue;
/* @chunk can't go away while pcpu_alloc_mutex is held */
bitmap_for_each_clear_region(chunk->populated, rs, re, 0,
chunk->nr_pages) {
for_each_clear_bitrange(rs, re, chunk->populated, chunk->nr_pages) {
int nr = min_t(int, re - rs, nr_to_pop);
spin_unlock_irq(&pcpu_lock);
@ -2472,7 +2469,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
*/
void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
{
memblock_free_early(__pa(ai), ai->__ai_size);
memblock_free(ai, ai->__ai_size);
}
/**
@ -2992,6 +2989,42 @@ static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info(
return ai;
}
static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align,
pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
{
const unsigned long goal = __pa(MAX_DMA_ADDRESS);
#ifdef CONFIG_NUMA
int node = NUMA_NO_NODE;
void *ptr;
if (cpu_to_nd_fn)
node = cpu_to_nd_fn(cpu);
if (node == NUMA_NO_NODE || !node_online(node) || !NODE_DATA(node)) {
ptr = memblock_alloc_from(size, align, goal);
pr_info("cpu %d has no node %d or node-local memory\n",
cpu, node);
pr_debug("per cpu data for cpu%d %zu bytes at 0x%llx\n",
cpu, size, (u64)__pa(ptr));
} else {
ptr = memblock_alloc_try_nid(size, align, goal,
MEMBLOCK_ALLOC_ACCESSIBLE,
node);
pr_debug("per cpu data for cpu%d %zu bytes on node%d at 0x%llx\n",
cpu, size, node, (u64)__pa(ptr));
}
return ptr;
#else
return memblock_alloc_from(size, align, goal);
#endif
}
static void __init pcpu_fc_free(void *ptr, size_t size)
{
memblock_free(ptr, size);
}
#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */
#if defined(BUILD_EMBED_FIRST_CHUNK)
@ -3001,14 +3034,13 @@ static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info(
* @dyn_size: minimum free size for dynamic allocation in bytes
* @atom_size: allocation atom size
* @cpu_distance_fn: callback to determine distance between cpus, optional
* @alloc_fn: function to allocate percpu page
* @free_fn: function to free percpu page
* @cpu_to_nd_fn: callback to convert cpu to its node, optional
*
* This is a helper to ease setting up embedded first percpu chunk and
* can be called where pcpu_setup_first_chunk() is expected.
*
* If this function is used to setup the first chunk, it is allocated
* by calling @alloc_fn and used as-is without being mapped into
* by calling pcpu_fc_alloc and used as-is without being mapped into
* vmalloc area. Allocations are always whole multiples of @atom_size
* aligned to @atom_size.
*
@ -3022,7 +3054,7 @@ static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info(
* @dyn_size specifies the minimum dynamic area size.
*
* If the needed size is smaller than the minimum or specified unit
* size, the leftover is returned using @free_fn.
* size, the leftover is returned using pcpu_fc_free.
*
* RETURNS:
* 0 on success, -errno on failure.
@ -3030,8 +3062,7 @@ static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info(
int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
size_t atom_size,
pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
pcpu_fc_alloc_fn_t alloc_fn,
pcpu_fc_free_fn_t free_fn)
pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
{
void *base = (void *)ULONG_MAX;
void **areas = NULL;
@ -3066,7 +3097,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
BUG_ON(cpu == NR_CPUS);
/* allocate space for the whole group */
ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
ptr = pcpu_fc_alloc(cpu, gi->nr_units * ai->unit_size, atom_size, cpu_to_nd_fn);
if (!ptr) {
rc = -ENOMEM;
goto out_free_areas;
@ -3105,12 +3136,12 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
if (gi->cpu_map[i] == NR_CPUS) {
/* unused unit, free whole */
free_fn(ptr, ai->unit_size);
pcpu_fc_free(ptr, ai->unit_size);
continue;
}
/* copy and return the unused part */
memcpy(ptr, __per_cpu_load, ai->static_size);
free_fn(ptr + size_sum, ai->unit_size - size_sum);
pcpu_fc_free(ptr + size_sum, ai->unit_size - size_sum);
}
}
@ -3129,23 +3160,90 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
out_free_areas:
for (group = 0; group < ai->nr_groups; group++)
if (areas[group])
free_fn(areas[group],
pcpu_fc_free(areas[group],
ai->groups[group].nr_units * ai->unit_size);
out_free:
pcpu_free_alloc_info(ai);
if (areas)
memblock_free_early(__pa(areas), areas_size);
memblock_free(areas, areas_size);
return rc;
}
#endif /* BUILD_EMBED_FIRST_CHUNK */
#ifdef BUILD_PAGE_FIRST_CHUNK
#include <asm/pgalloc.h>
#ifndef P4D_TABLE_SIZE
#define P4D_TABLE_SIZE PAGE_SIZE
#endif
#ifndef PUD_TABLE_SIZE
#define PUD_TABLE_SIZE PAGE_SIZE
#endif
#ifndef PMD_TABLE_SIZE
#define PMD_TABLE_SIZE PAGE_SIZE
#endif
#ifndef PTE_TABLE_SIZE
#define PTE_TABLE_SIZE PAGE_SIZE
#endif
void __init __weak pcpu_populate_pte(unsigned long addr)
{
pgd_t *pgd = pgd_offset_k(addr);
p4d_t *p4d;
pud_t *pud;
pmd_t *pmd;
if (pgd_none(*pgd)) {
p4d_t *new;
new = memblock_alloc(P4D_TABLE_SIZE, P4D_TABLE_SIZE);
if (!new)
goto err_alloc;
pgd_populate(&init_mm, pgd, new);
}
p4d = p4d_offset(pgd, addr);
if (p4d_none(*p4d)) {
pud_t *new;
new = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
if (!new)
goto err_alloc;
p4d_populate(&init_mm, p4d, new);
}
pud = pud_offset(p4d, addr);
if (pud_none(*pud)) {
pmd_t *new;
new = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE);
if (!new)
goto err_alloc;
pud_populate(&init_mm, pud, new);
}
pmd = pmd_offset(pud, addr);
if (!pmd_present(*pmd)) {
pte_t *new;
new = memblock_alloc(PTE_TABLE_SIZE, PTE_TABLE_SIZE);
if (!new)
goto err_alloc;
pmd_populate_kernel(&init_mm, pmd, new);
}
return;
err_alloc:
panic("%s: Failed to allocate memory\n", __func__);
}
/**
* pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
* @reserved_size: the size of reserved percpu area in bytes
* @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
* @free_fn: function to free percpu page, always called with PAGE_SIZE
* @populate_pte_fn: function to populate pte
* @cpu_to_nd_fn: callback to convert cpu to its node, optional
*
* This is a helper to ease setting up page-remapped first percpu
* chunk and can be called where pcpu_setup_first_chunk() is expected.
@ -3156,10 +3254,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
* RETURNS:
* 0 on success, -errno on failure.
*/
int __init pcpu_page_first_chunk(size_t reserved_size,
pcpu_fc_alloc_fn_t alloc_fn,
pcpu_fc_free_fn_t free_fn,
pcpu_fc_populate_pte_fn_t populate_pte_fn)
int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
{
static struct vm_struct vm;
struct pcpu_alloc_info *ai;
@ -3201,7 +3296,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
for (i = 0; i < unit_pages; i++) {
void *ptr;
ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
ptr = pcpu_fc_alloc(cpu, PAGE_SIZE, PAGE_SIZE, cpu_to_nd_fn);
if (!ptr) {
pr_warn("failed to allocate %s page for cpu%u\n",
psize_str, cpu);
@ -3223,7 +3318,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
(unsigned long)vm.addr + unit * ai->unit_size;
for (i = 0; i < unit_pages; i++)
populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
pcpu_populate_pte(unit_addr + (i << PAGE_SHIFT));
/* pte already populated, the following shouldn't fail */
rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
@ -3253,10 +3348,10 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
enomem:
while (--j >= 0)
free_fn(page_address(pages[j]), PAGE_SIZE);
pcpu_fc_free(page_address(pages[j]), PAGE_SIZE);
rc = -ENOMEM;
out_free_ar:
memblock_free_early(__pa(pages), pages_size);
memblock_free(pages, pages_size);
pcpu_free_alloc_info(ai);
return rc;
}
@ -3278,17 +3373,6 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);
static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
size_t align)
{
return memblock_alloc_from(size, align, __pa(MAX_DMA_ADDRESS));
}
static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
{
memblock_free_early(__pa(ptr), size);
}
void __init setup_per_cpu_areas(void)
{
unsigned long delta;
@ -3299,9 +3383,8 @@ void __init setup_per_cpu_areas(void)
* Always reserve area for module percpu variables. That's
* what the legacy allocator did.
*/
rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE,
PAGE_SIZE, NULL, NULL);
if (rc < 0)
panic("Failed to initialize percpu areas.");


@ -10,6 +10,7 @@
#include <linux/pagemap.h>
#include <linux/hugetlb.h>
#include <linux/pgtable.h>
#include <linux/mm_inline.h>
#include <asm/tlb.h>
/*


@ -12,7 +12,6 @@
#include <linux/dax.h>
#include <linux/gfp.h>
#include <linux/export.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/pagevec.h>
@ -197,9 +196,9 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
* Preallocate as many pages as we will need.
*/
for (i = 0; i < nr_to_read; i++) {
struct page *page = xa_load(&mapping->i_pages, index + i);
struct folio *folio = xa_load(&mapping->i_pages, index + i);
if (page && !xa_is_value(page)) {
if (folio && !xa_is_value(folio)) {
/*
* Page already present? Kick off the current batch
* of contiguous pages before continuing with the
@ -213,21 +212,21 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
continue;
}
page = __page_cache_alloc(gfp_mask);
if (!page)
folio = filemap_alloc_folio(gfp_mask, 0);
if (!folio)
break;
if (mapping->a_ops->readpages) {
page->index = index + i;
list_add(&page->lru, &page_pool);
} else if (add_to_page_cache_lru(page, mapping, index + i,
folio->index = index + i;
list_add(&folio->lru, &page_pool);
} else if (filemap_add_folio(mapping, folio, index + i,
gfp_mask) < 0) {
put_page(page);
folio_put(folio);
read_pages(ractl, &page_pool, true);
i = ractl->_index + ractl->_nr_pages - index - 1;
continue;
}
if (i == nr_to_read - lookahead_size)
SetPageReadahead(page);
folio_set_readahead(folio);
ractl->_nr_pages++;
}
@ -309,7 +308,7 @@ void force_page_cache_ra(struct readahead_control *ractl,
* Set the initial window size, round to next power of 2 and square
* for small size, x 4 for medium, and x 2 for large
* for 128k (32 page) max ra
* 1-8 page = 32k initial, > 8 page = 128k initial
* 1-2 page = 16k, 3-4 page = 32k, 5-8 page = 64k, > 8 page = 128k initial
*/
static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
{
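This hunk only corrects the comment; as a sketch of the sizing it describes, assuming 4 KiB pages, a 32-page (128 KiB) maximum window and a simple round-up-to-power-of-two helper (not the kernel's verbatim function):

#include <stdio.h>

static unsigned long roundup_pow2(unsigned long n)	/* assumed helper */
{
	unsigned long p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

static unsigned long init_ra_size(unsigned long size, unsigned long max)
{
	unsigned long newsize = roundup_pow2(size);

	if (newsize <= max / 32)
		newsize *= 4;		/* tiny request */
	else if (newsize <= max / 4)
		newsize *= 2;		/* small request */
	else
		newsize = max;		/* large request: cap at the maximum */
	return newsize;
}

int main(void)
{
	unsigned long max = 32;		/* 128 KiB in 4 KiB pages */

	/* prints 16k for 1-2 pages, 32k for 3-4, 64k for 5-8, 128k beyond */
	for (unsigned long size = 1; size <= 9; size++)
		printf("%lu page(s) -> %luK\n", size, init_ra_size(size, max) * 4);
	return 0;
}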
@ -582,7 +581,7 @@ void page_cache_sync_ra(struct readahead_control *ractl,
EXPORT_SYMBOL_GPL(page_cache_sync_ra);
void page_cache_async_ra(struct readahead_control *ractl,
struct page *page, unsigned long req_count)
struct folio *folio, unsigned long req_count)
{
/* no read-ahead */
if (!ractl->ra->ra_pages)
@ -591,10 +590,10 @@ void page_cache_async_ra(struct readahead_control *ractl,
/*
* Same bit is used for PG_readahead and PG_reclaim.
*/
if (PageWriteback(page))
if (folio_test_writeback(folio))
return;
ClearPageReadahead(page);
folio_clear_readahead(folio);
/*
* Defer asynchronous read-ahead on IO congestion.


@ -34,7 +34,7 @@
* mapping->private_lock (in __set_page_dirty_buffers)
* lock_page_memcg move_lock (in __set_page_dirty_buffers)
* i_pages lock (widely used)
* lruvec->lru_lock (in lock_page_lruvec_irq)
* lruvec->lru_lock (in folio_lruvec_lock_irq)
* inode->i_lock (in set_page_dirty's __mark_inode_dirty)
* bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
* sb_lock (within inode_lock in fs/fs-writeback.c)
@ -621,9 +621,20 @@ void try_to_unmap_flush_dirty(void)
try_to_unmap_flush();
}
/*
* Bits 0-14 of mm->tlb_flush_batched record pending generations.
* Bits 16-30 of mm->tlb_flush_batched record flushed generations.
*/
#define TLB_FLUSH_BATCH_FLUSHED_SHIFT 16
#define TLB_FLUSH_BATCH_PENDING_MASK \
((1 << (TLB_FLUSH_BATCH_FLUSHED_SHIFT - 1)) - 1)
#define TLB_FLUSH_BATCH_PENDING_LARGE \
(TLB_FLUSH_BATCH_PENDING_MASK / 2)
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
{
struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
int batch, nbatch;
arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
tlb_ubc->flush_required = true;
@ -633,7 +644,22 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
* before the PTE is cleared.
*/
barrier();
mm->tlb_flush_batched = true;
batch = atomic_read(&mm->tlb_flush_batched);
retry:
if ((batch & TLB_FLUSH_BATCH_PENDING_MASK) > TLB_FLUSH_BATCH_PENDING_LARGE) {
/*
* Prevent `pending' from catching up with `flushed' because of
* overflow. Reset `pending' and `flushed' to be 1 and 0 if
* `pending' becomes large.
*/
nbatch = atomic_cmpxchg(&mm->tlb_flush_batched, batch, 1);
if (nbatch != batch) {
batch = nbatch;
goto retry;
}
} else {
atomic_inc(&mm->tlb_flush_batched);
}
/*
* If the PTE was dirty then it's best to assume it's writable. The
@ -680,15 +706,18 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
*/
void flush_tlb_batched_pending(struct mm_struct *mm)
{
if (data_race(mm->tlb_flush_batched)) {
flush_tlb_mm(mm);
int batch = atomic_read(&mm->tlb_flush_batched);
int pending = batch & TLB_FLUSH_BATCH_PENDING_MASK;
int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT;
if (pending != flushed) {
flush_tlb_mm(mm);
/*
* Do not allow the compiler to re-order the clearing of
* tlb_flush_batched before the tlb is flushed.
* If a new TLB flush became pending while we were flushing, leave
* mm->tlb_flush_batched as is to avoid losing that flush.
*/
barrier();
mm->tlb_flush_batched = false;
atomic_cmpxchg(&mm->tlb_flush_batched, batch,
pending | (pending << TLB_FLUSH_BATCH_FLUSHED_SHIFT));
}
}
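The packing used above can be demonstrated in isolation: pending flush generations live in bits 0-14 and flushed generations in bits 16-30 of one word, and flushing copies pending into flushed. A standalone sketch (not kernel code, no atomics or overflow handling):

#include <stdio.h>

#define FLUSHED_SHIFT	16
#define PENDING_MASK	((1u << (FLUSHED_SHIFT - 1)) - 1)	/* bits 0-14 */

int main(void)
{
	unsigned int batch = 0;

	batch += 3;	/* three deferred (pending) TLB flushes */

	unsigned int pending = batch & PENDING_MASK;
	unsigned int flushed = batch >> FLUSHED_SHIFT;

	if (pending != flushed) {
		/* flush_tlb_mm() would run here */
		batch = pending | (pending << FLUSHED_SHIFT);
	}

	printf("pending=%u flushed=%u\n",
	       batch & PENDING_MASK, batch >> FLUSHED_SHIFT);	/* 3 3 */
	return 0;
}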
#else
@ -981,7 +1010,7 @@ static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
return true;
}
int page_mkclean(struct page *page)
int folio_mkclean(struct folio *folio)
{
int cleaned = 0;
struct address_space *mapping;
@ -991,20 +1020,20 @@ int page_mkclean(struct page *page)
.invalid_vma = invalid_mkclean_vma,
};
BUG_ON(!PageLocked(page));
BUG_ON(!folio_test_locked(folio));
if (!page_mapped(page))
if (!folio_mapped(folio))
return 0;
mapping = page_mapping(page);
mapping = folio_mapping(folio);
if (!mapping)
return 0;
rmap_walk(page, &rwc);
rmap_walk(&folio->page, &rwc);
return cleaned;
}
EXPORT_SYMBOL_GPL(page_mkclean);
EXPORT_SYMBOL_GPL(folio_mkclean);
/**
* page_move_anon_rmap - move a page to our anon_vma
@ -1807,6 +1836,7 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma,
update_hiwater_rss(mm);
if (is_zone_device_page(page)) {
unsigned long pfn = page_to_pfn(page);
swp_entry_t entry;
pte_t swp_pte;
@ -1815,8 +1845,11 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma,
* pte. do_swap_page() will wait until the migration
* pte is removed and then restart fault handling.
*/
entry = make_readable_migration_entry(
page_to_pfn(page));
entry = pte_to_swp_entry(pteval);
if (is_writable_device_private_entry(entry))
entry = make_writable_migration_entry(pfn);
else
entry = make_readable_migration_entry(pfn);
swp_pte = swp_entry_to_pte(entry);
/*


@ -36,7 +36,6 @@
#include <linux/uio.h>
#include <linux/khugepaged.h>
#include <linux/hugetlb.h>
#include <linux/frontswap.h>
#include <linux/fs_parser.h>
#include <linux/swapfile.h>
@ -59,7 +58,6 @@ static struct vfsmount *shm_mnt;
#include <linux/backing-dev.h>
#include <linux/shmem_fs.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/percpu_counter.h>
#include <linux/falloc.h>
@ -700,7 +698,6 @@ static int shmem_add_to_page_cache(struct page *page,
struct mm_struct *charge_mm)
{
XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page));
unsigned long i = 0;
unsigned long nr = compound_nr(page);
int error;
@ -715,7 +712,7 @@ static int shmem_add_to_page_cache(struct page *page,
page->index = index;
if (!PageSwapCache(page)) {
error = mem_cgroup_charge(page, charge_mm, gfp);
error = mem_cgroup_charge(page_folio(page), charge_mm, gfp);
if (error) {
if (PageTransHuge(page)) {
count_vm_event(THP_FILE_FALLBACK);
@ -727,20 +724,18 @@ static int shmem_add_to_page_cache(struct page *page,
cgroup_throttle_swaprate(page, gfp);
do {
void *entry;
xas_lock_irq(&xas);
entry = xas_find_conflict(&xas);
if (entry != expected)
if (expected != xas_find_conflict(&xas)) {
xas_set_err(&xas, -EEXIST);
xas_create_range(&xas);
goto unlock;
}
if (expected && xas_find_conflict(&xas)) {
xas_set_err(&xas, -EEXIST);
goto unlock;
}
xas_store(&xas, page);
if (xas_error(&xas))
goto unlock;
next:
xas_store(&xas, page);
if (++i < nr) {
xas_next(&xas);
goto next;
}
if (PageTransHuge(page)) {
count_vm_event(THP_FILE_ALLOC);
__mod_lruvec_page_state(page, NR_SHMEM_THPS, nr);
@ -861,9 +856,8 @@ unsigned long shmem_swap_usage(struct vm_area_struct *vma)
return swapped << PAGE_SHIFT;
/* Here comes the more involved part */
return shmem_partial_swap_usage(mapping,
linear_page_index(vma, vma->vm_start),
linear_page_index(vma, vma->vm_end));
return shmem_partial_swap_usage(mapping, vma->vm_pgoff,
vma->vm_pgoff + vma_pages(vma));
}
/*
@ -887,30 +881,26 @@ void shmem_unlock_mapping(struct address_space *mapping)
}
}
/*
* Check whether a hole-punch or truncation needs to split a huge page,
* returning true if no split was required, or the split has been successful.
*
* Eviction (or truncation to 0 size) should never need to split a huge page;
* but in rare cases might do so, if shmem_undo_range() failed to trylock on
* head, and then succeeded to trylock on tail.
*
* A split can only succeed when there are no additional references on the
* huge page: so the split below relies upon find_get_entries() having stopped
* when it found a subpage of the huge page, without getting further references.
*/
static bool shmem_punch_compound(struct page *page, pgoff_t start, pgoff_t end)
static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
{
if (!PageTransCompound(page))
return true;
struct folio *folio;
struct page *page;
/* Just proceed to delete a huge page wholly within the range punched */
if (PageHead(page) &&
page->index >= start && page->index + HPAGE_PMD_NR <= end)
return true;
/* Try to split huge page, so we can truly punch the hole or truncate */
return split_huge_page(page) >= 0;
/*
* At first avoid shmem_getpage(,,,SGP_READ): that fails
* beyond i_size, and reports fallocated pages as holes.
*/
folio = __filemap_get_folio(inode->i_mapping, index,
FGP_ENTRY | FGP_LOCK, 0);
if (!xa_is_value(folio))
return folio;
/*
* But read a page back from swap if any of it is within i_size
* (although in some cases this is just a waste of time).
*/
page = NULL;
shmem_getpage(inode, index, &page, SGP_READ);
return page ? page_folio(page) : NULL;
}
/*
@ -924,10 +914,10 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
struct shmem_inode_info *info = SHMEM_I(inode);
pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
pgoff_t end = (lend + 1) >> PAGE_SHIFT;
unsigned int partial_start = lstart & (PAGE_SIZE - 1);
unsigned int partial_end = (lend + 1) & (PAGE_SIZE - 1);
struct pagevec pvec;
struct folio_batch fbatch;
pgoff_t indices[PAGEVEC_SIZE];
struct folio *folio;
bool same_folio;
long nr_swaps_freed = 0;
pgoff_t index;
int i;
@ -938,67 +928,64 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
if (info->fallocend > start && info->fallocend <= end && !unfalloc)
info->fallocend = start;
pagevec_init(&pvec);
folio_batch_init(&fbatch);
index = start;
while (index < end && find_lock_entries(mapping, index, end - 1,
&pvec, indices)) {
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
&fbatch, indices)) {
for (i = 0; i < folio_batch_count(&fbatch); i++) {
folio = fbatch.folios[i];
index = indices[i];
if (xa_is_value(page)) {
if (xa_is_value(folio)) {
if (unfalloc)
continue;
nr_swaps_freed += !shmem_free_swap(mapping,
index, page);
index, folio);
continue;
}
index += thp_nr_pages(page) - 1;
index += folio_nr_pages(folio) - 1;
if (!unfalloc || !PageUptodate(page))
truncate_inode_page(mapping, page);
unlock_page(page);
if (!unfalloc || !folio_test_uptodate(folio))
truncate_inode_folio(mapping, folio);
folio_unlock(folio);
}
pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
folio_batch_remove_exceptionals(&fbatch);
folio_batch_release(&fbatch);
cond_resched();
index++;
}
if (partial_start) {
struct page *page = NULL;
shmem_getpage(inode, start - 1, &page, SGP_READ);
if (page) {
unsigned int top = PAGE_SIZE;
if (start > end) {
top = partial_end;
partial_end = 0;
}
zero_user_segment(page, partial_start, top);
set_page_dirty(page);
unlock_page(page);
put_page(page);
same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT);
if (folio) {
same_folio = lend < folio_pos(folio) + folio_size(folio);
folio_mark_dirty(folio);
if (!truncate_inode_partial_folio(folio, lstart, lend)) {
start = folio->index + folio_nr_pages(folio);
if (same_folio)
end = folio->index;
}
folio_unlock(folio);
folio_put(folio);
folio = NULL;
}
if (partial_end) {
struct page *page = NULL;
shmem_getpage(inode, end, &page, SGP_READ);
if (page) {
zero_user_segment(page, 0, partial_end);
set_page_dirty(page);
unlock_page(page);
put_page(page);
}
if (!same_folio)
folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT);
if (folio) {
folio_mark_dirty(folio);
if (!truncate_inode_partial_folio(folio, lstart, lend))
end = folio->index;
folio_unlock(folio);
folio_put(folio);
}
if (start >= end)
return;
index = start;
while (index < end) {
cond_resched();
if (!find_get_entries(mapping, index, end - 1, &pvec,
if (!find_get_entries(mapping, index, end - 1, &fbatch,
indices)) {
/* If all gone or hole-punch or unfalloc, we're done */
if (index == start || end != -1)
@ -1007,14 +994,14 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
index = start;
continue;
}
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
for (i = 0; i < folio_batch_count(&fbatch); i++) {
folio = fbatch.folios[i];
index = indices[i];
if (xa_is_value(page)) {
if (xa_is_value(folio)) {
if (unfalloc)
continue;
if (shmem_free_swap(mapping, index, page)) {
if (shmem_free_swap(mapping, index, folio)) {
/* Swap was replaced by page: retry */
index--;
break;
@ -1023,32 +1010,24 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
continue;
}
lock_page(page);
folio_lock(folio);
if (!unfalloc || !PageUptodate(page)) {
if (page_mapping(page) != mapping) {
if (!unfalloc || !folio_test_uptodate(folio)) {
if (folio_mapping(folio) != mapping) {
/* Page was replaced by swap: retry */
unlock_page(page);
folio_unlock(folio);
index--;
break;
}
VM_BUG_ON_PAGE(PageWriteback(page), page);
if (shmem_punch_compound(page, start, end))
truncate_inode_page(mapping, page);
else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
/* Wipe the page and don't get stuck */
clear_highpage(page);
flush_dcache_page(page);
set_page_dirty(page);
if (index <
round_up(start, HPAGE_PMD_NR))
start = index + 1;
}
VM_BUG_ON_FOLIO(folio_test_writeback(folio),
folio);
truncate_inode_folio(mapping, folio);
}
unlock_page(page);
index = folio->index + folio_nr_pages(folio) - 1;
folio_unlock(folio);
}
pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
folio_batch_remove_exceptionals(&fbatch);
folio_batch_release(&fbatch);
index++;
}
@ -1172,7 +1151,7 @@ static void shmem_evict_inode(struct inode *inode)
static int shmem_find_swap_entries(struct address_space *mapping,
pgoff_t start, unsigned int nr_entries,
struct page **entries, pgoff_t *indices,
unsigned int type, bool frontswap)
unsigned int type)
{
XA_STATE(xas, &mapping->i_pages, start);
struct page *page;
@ -1193,9 +1172,6 @@ static int shmem_find_swap_entries(struct address_space *mapping,
entry = radix_to_swp_entry(page);
if (swp_type(entry) != type)
continue;
if (frontswap &&
!frontswap_test(swap_info[type], swp_offset(entry)))
continue;
indices[ret] = xas.xa_index;
entries[ret] = page;
@ -1248,26 +1224,20 @@ static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec,
/*
* If swap found in inode, free it and move page from swapcache to filecache.
*/
static int shmem_unuse_inode(struct inode *inode, unsigned int type,
bool frontswap, unsigned long *fs_pages_to_unuse)
static int shmem_unuse_inode(struct inode *inode, unsigned int type)
{
struct address_space *mapping = inode->i_mapping;
pgoff_t start = 0;
struct pagevec pvec;
pgoff_t indices[PAGEVEC_SIZE];
bool frontswap_partial = (frontswap && *fs_pages_to_unuse > 0);
int ret = 0;
pagevec_init(&pvec);
do {
unsigned int nr_entries = PAGEVEC_SIZE;
if (frontswap_partial && *fs_pages_to_unuse < PAGEVEC_SIZE)
nr_entries = *fs_pages_to_unuse;
pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries,
pvec.pages, indices,
type, frontswap);
pvec.pages, indices, type);
if (pvec.nr == 0) {
ret = 0;
break;
@ -1277,14 +1247,6 @@ static int shmem_unuse_inode(struct inode *inode, unsigned int type,
if (ret < 0)
break;
if (frontswap_partial) {
*fs_pages_to_unuse -= ret;
if (*fs_pages_to_unuse == 0) {
ret = FRONTSWAP_PAGES_UNUSED;
break;
}
}
start = indices[pvec.nr - 1];
} while (true);
@ -1296,8 +1258,7 @@ static int shmem_unuse_inode(struct inode *inode, unsigned int type,
* device 'type' back into memory, so the swap device can be
* unused.
*/
int shmem_unuse(unsigned int type, bool frontswap,
unsigned long *fs_pages_to_unuse)
int shmem_unuse(unsigned int type)
{
struct shmem_inode_info *info, *next;
int error = 0;
@ -1320,8 +1281,7 @@ int shmem_unuse(unsigned int type, bool frontswap,
atomic_inc(&info->stop_eviction);
mutex_unlock(&shmem_swaplist_mutex);
error = shmem_unuse_inode(&info->vfs_inode, type, frontswap,
fs_pages_to_unuse);
error = shmem_unuse_inode(&info->vfs_inode, type);
cond_resched();
mutex_lock(&shmem_swaplist_mutex);
@ -1566,8 +1526,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
return NULL;
shmem_pseudo_vma_init(&pvma, info, hindex);
page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(),
true);
page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, true);
shmem_pseudo_vma_destroy(&pvma);
if (page)
prep_transhuge_page(page);
@ -1642,6 +1601,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
{
struct page *oldpage, *newpage;
struct folio *old, *new;
struct address_space *swap_mapping;
swp_entry_t entry;
pgoff_t swap_index;
@ -1678,7 +1638,9 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
xa_lock_irq(&swap_mapping->i_pages);
error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage);
if (!error) {
mem_cgroup_migrate(oldpage, newpage);
old = page_folio(oldpage);
new = page_folio(newpage);
mem_cgroup_migrate(old, new);
__inc_lruvec_page_state(newpage, NR_FILE_PAGES);
__dec_lruvec_page_state(oldpage, NR_FILE_PAGES);
}
@ -2307,6 +2269,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
INIT_LIST_HEAD(&info->swaplist);
simple_xattrs_init(&info->xattrs);
cache_no_acl(inode);
mapping_set_large_folios(inode->i_mapping);
switch (mode & S_IFMT) {
default:
@ -2429,7 +2392,6 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
shmem_recalc_inode(inode);
spin_unlock_irq(&info->lock);
SetPageDirty(page);
unlock_page(page);
return 0;
out_delete_from_cache:
@ -2461,6 +2423,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
struct inode *inode = mapping->host;
struct shmem_inode_info *info = SHMEM_I(inode);
pgoff_t index = pos >> PAGE_SHIFT;
int ret = 0;
/* i_rwsem is held by caller */
if (unlikely(info->seals & (F_SEAL_GROW |
@ -2471,7 +2434,19 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
return -EPERM;
}
return shmem_getpage(inode, index, pagep, SGP_WRITE);
ret = shmem_getpage(inode, index, pagep, SGP_WRITE);
if (ret)
return ret;
if (PageHWPoison(*pagep)) {
unlock_page(*pagep);
put_page(*pagep);
*pagep = NULL;
return -EIO;
}
return 0;
}
static int
@ -2558,6 +2533,12 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (sgp == SGP_CACHE)
set_page_dirty(page);
unlock_page(page);
if (PageHWPoison(page)) {
put_page(page);
error = -EIO;
break;
}
}
/*
@ -2950,28 +2931,6 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
return shmem_unlink(dir, dentry);
}
static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
{
bool old_is_dir = d_is_dir(old_dentry);
bool new_is_dir = d_is_dir(new_dentry);
if (old_dir != new_dir && old_is_dir != new_is_dir) {
if (old_is_dir) {
drop_nlink(old_dir);
inc_nlink(new_dir);
} else {
drop_nlink(new_dir);
inc_nlink(old_dir);
}
}
old_dir->i_ctime = old_dir->i_mtime =
new_dir->i_ctime = new_dir->i_mtime =
d_inode(old_dentry)->i_ctime =
d_inode(new_dentry)->i_ctime = current_time(old_dir);
return 0;
}
static int shmem_whiteout(struct user_namespace *mnt_userns,
struct inode *old_dir, struct dentry *old_dentry)
{
@ -3017,7 +2976,7 @@ static int shmem_rename2(struct user_namespace *mnt_userns,
return -EINVAL;
if (flags & RENAME_EXCHANGE)
return shmem_exchange(old_dir, old_dentry, new_dir, new_dentry);
return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
if (!simple_empty(new_dentry))
return -ENOTEMPTY;
@ -3119,7 +3078,8 @@ static const char *shmem_get_link(struct dentry *dentry,
page = find_get_page(inode->i_mapping, 0);
if (!page)
return ERR_PTR(-ECHILD);
if (!PageUptodate(page)) {
if (PageHWPoison(page) ||
!PageUptodate(page)) {
put_page(page);
return ERR_PTR(-ECHILD);
}
@ -3127,6 +3087,13 @@ static const char *shmem_get_link(struct dentry *dentry,
error = shmem_getpage(inode, 0, &page, SGP_READ);
if (error)
return ERR_PTR(error);
if (!page)
return ERR_PTR(-ECHILD);
if (PageHWPoison(page)) {
unlock_page(page);
put_page(page);
return ERR_PTR(-ECHILD);
}
unlock_page(page);
}
set_delayed_call(done, shmem_put_link, page);
@ -3777,6 +3744,13 @@ static void shmem_destroy_inodecache(void)
kmem_cache_destroy(shmem_inode_cachep);
}
/* Keep the page in page cache instead of truncating it */
static int shmem_error_remove_page(struct address_space *mapping,
struct page *page)
{
return 0;
}
const struct address_space_operations shmem_aops = {
.writepage = shmem_writepage,
.set_page_dirty = __set_page_dirty_no_writeback,
@ -3787,7 +3761,7 @@ const struct address_space_operations shmem_aops = {
#ifdef CONFIG_MIGRATION
.migratepage = migrate_page,
#endif
.error_remove_page = generic_error_remove_page,
.error_remove_page = shmem_error_remove_page,
};
EXPORT_SYMBOL(shmem_aops);
@ -3897,7 +3871,7 @@ static struct file_system_type shmem_fs_type = {
.parameters = shmem_fs_parameters,
#endif
.kill_sb = kill_litter_super,
.fs_flags = FS_USERNS_MOUNT | FS_THP_SUPPORT,
.fs_flags = FS_USERNS_MOUNT,
};
int __init shmem_init(void)
@ -4021,8 +3995,7 @@ int __init shmem_init(void)
return 0;
}
int shmem_unuse(unsigned int type, bool frontswap,
unsigned long *fs_pages_to_unuse)
int shmem_unuse(unsigned int type)
{
return 0;
}
@ -4195,9 +4168,14 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE,
gfp, NULL, NULL, NULL);
if (error)
page = ERR_PTR(error);
else
unlock_page(page);
return ERR_PTR(error);
unlock_page(page);
if (PageHWPoison(page)) {
put_page(page);
return ERR_PTR(-EIO);
}
return page;
#else
/*

mm/slab.c: 477 changes (diff suppressed because it is too large)
mm/slab.h: 301 changes

@ -5,6 +5,191 @@
* Internal slab definitions
*/
/* Reuses the bits in struct page */
struct slab {
unsigned long __page_flags;
#if defined(CONFIG_SLAB)
union {
struct list_head slab_list;
struct rcu_head rcu_head;
};
struct kmem_cache *slab_cache;
void *freelist; /* array of free object indexes */
void *s_mem; /* first object */
unsigned int active;
#elif defined(CONFIG_SLUB)
union {
struct list_head slab_list;
struct rcu_head rcu_head;
#ifdef CONFIG_SLUB_CPU_PARTIAL
struct {
struct slab *next;
int slabs; /* Nr of slabs left */
};
#endif
};
struct kmem_cache *slab_cache;
/* Double-word boundary */
void *freelist; /* first free object */
union {
unsigned long counters;
struct {
unsigned inuse:16;
unsigned objects:15;
unsigned frozen:1;
};
};
unsigned int __unused;
#elif defined(CONFIG_SLOB)
struct list_head slab_list;
void *__unused_1;
void *freelist; /* first free block */
long units;
unsigned int __unused_2;
#else
#error "Unexpected slab allocator configured"
#endif
atomic_t __page_refcount;
#ifdef CONFIG_MEMCG
unsigned long memcg_data;
#endif
};
#define SLAB_MATCH(pg, sl) \
static_assert(offsetof(struct page, pg) == offsetof(struct slab, sl))
SLAB_MATCH(flags, __page_flags);
SLAB_MATCH(compound_head, slab_list); /* Ensure bit 0 is clear */
#ifndef CONFIG_SLOB
SLAB_MATCH(rcu_head, rcu_head);
#endif
SLAB_MATCH(_refcount, __page_refcount);
#ifdef CONFIG_MEMCG
SLAB_MATCH(memcg_data, memcg_data);
#endif
#undef SLAB_MATCH
static_assert(sizeof(struct slab) <= sizeof(struct page));
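The SLAB_MATCH()/static_assert pattern above pins struct slab to the layout of struct page at compile time. A toy version of the same pattern with stand-in types (the struct names and fields below are invented for the example):

#include <assert.h>
#include <stddef.h>

struct outer {			/* stand-in for struct page */
	unsigned long flags;
	void *payload;
	int refcount;
};

struct view {			/* stand-in for struct slab */
	unsigned long __flags;
	void *freelist;
	int __refcount;
};

#define MATCH(a, b) \
	static_assert(offsetof(struct outer, a) == offsetof(struct view, b), \
		      "fields must overlay")

MATCH(flags, __flags);
MATCH(payload, freelist);
MATCH(refcount, __refcount);
static_assert(sizeof(struct view) <= sizeof(struct outer), "view must fit");

int main(void)
{
	return 0;	/* nothing to run: the checks happen at compile time */
}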
/**
* folio_slab - Converts from folio to slab.
* @folio: The folio.
*
* Currently struct slab is a different representation of a folio where
* folio_test_slab() is true.
*
* Return: The slab which contains this folio.
*/
#define folio_slab(folio) (_Generic((folio), \
const struct folio *: (const struct slab *)(folio), \
struct folio *: (struct slab *)(folio)))
/**
* slab_folio - The folio allocated for a slab
* @slab: The slab.
*
* Slabs are allocated as folios that contain the individual objects and are
* using some fields in the first struct page of the folio - those fields are
* now accessed by struct slab. It is occasionally necessary to convert back to
* a folio in order to communicate with the rest of the mm. Please use this
* helper function instead of casting yourself, as the implementation may change
* in the future.
*/
#define slab_folio(s) (_Generic((s), \
const struct slab *: (const struct folio *)s, \
struct slab *: (struct folio *)s))
/**
* page_slab - Converts from first struct page to slab.
* @p: The first (either head of compound or single) page of slab.
*
* A temporary wrapper to convert struct page to struct slab in situations where
* we know the page is the compound head, or single order-0 page.
*
* Long-term ideally everything would work with struct slab directly or go
* through folio to struct slab.
*
* Return: The slab which contains this page
*/
#define page_slab(p) (_Generic((p), \
const struct page *: (const struct slab *)(p), \
struct page *: (struct slab *)(p)))
/**
* slab_page - The first struct page allocated for a slab
* @slab: The slab.
*
* A convenience wrapper for converting slab to the first struct page of the
* underlying folio, to communicate with code not yet converted to folio or
* struct slab.
*/
#define slab_page(s) folio_page(slab_folio(s), 0)
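The _Generic selections above give cast helpers that preserve constness. A toy demonstration of the same pattern with invented types (struct foo and struct bar are stand-ins; only the pattern mirrors folio_slab()/slab_folio()):

#include <stdio.h>

struct foo { int data; };
struct bar { int data; };

#define foo_to_bar(p) (_Generic((p),			\
	const struct foo *: (const struct bar *)(p),	\
	struct foo *: (struct bar *)(p)))

int main(void)
{
	struct foo f = { 42 };
	const struct foo *cf = &f;

	struct bar *b = foo_to_bar(&f);		/* non-const in, non-const out */
	const struct bar *cb = foo_to_bar(cf);	/* const in, const out */

	printf("%d %d\n", (void *)b == (void *)&f,
	       (const void *)cb == (const void *)cf);	/* prints "1 1" */
	return 0;
}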
/*
* If network-based swap is enabled, sl*b must keep track of whether pages
* were allocated from pfmemalloc reserves.
*/
static inline bool slab_test_pfmemalloc(const struct slab *slab)
{
return folio_test_active((struct folio *)slab_folio(slab));
}
static inline void slab_set_pfmemalloc(struct slab *slab)
{
folio_set_active(slab_folio(slab));
}
static inline void slab_clear_pfmemalloc(struct slab *slab)
{
folio_clear_active(slab_folio(slab));
}
static inline void __slab_clear_pfmemalloc(struct slab *slab)
{
__folio_clear_active(slab_folio(slab));
}
static inline void *slab_address(const struct slab *slab)
{
return folio_address(slab_folio(slab));
}
static inline int slab_nid(const struct slab *slab)
{
return folio_nid(slab_folio(slab));
}
static inline pg_data_t *slab_pgdat(const struct slab *slab)
{
return folio_pgdat(slab_folio(slab));
}
static inline struct slab *virt_to_slab(const void *addr)
{
struct folio *folio = virt_to_folio(addr);
if (!folio_test_slab(folio))
return NULL;
return folio_slab(folio);
}
static inline int slab_order(const struct slab *slab)
{
return folio_order((struct folio *)slab_folio(slab));
}
static inline size_t slab_size(const struct slab *slab)
{
return PAGE_SIZE << slab_order(slab);
}
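A short usage sketch of the helpers above; report_slab_of() is a hypothetical debugging aid, not part of the patch.
/* Sketch: look up slab metadata for an arbitrary slab-allocated object. */
static void report_slab_of(const void *obj)
{
	struct slab *slab = virt_to_slab(obj);

	if (!slab)	/* e.g. a kmalloc_large() allocation: no slab behind it */
		return;
	pr_info("obj %p: node %d, backed by %zu bytes of slab memory\n",
		obj, slab_nid(slab), slab_size(slab));
}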
#ifdef CONFIG_SLOB
/*
* Common fields provided in kmem_cache by all slab allocators
@ -245,15 +430,33 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla
}
#ifdef CONFIG_MEMCG_KMEM
int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
gfp_t gfp, bool new_page);
/*
* slab_objcgs - get the object cgroups vector associated with a slab
* @slab: a pointer to the slab struct
*
* Returns a pointer to the object cgroups vector associated with the slab,
* or NULL if no such vector has been associated yet.
*/
static inline struct obj_cgroup **slab_objcgs(struct slab *slab)
{
unsigned long memcg_data = READ_ONCE(slab->memcg_data);
VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS),
slab_page(slab));
VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, slab_page(slab));
return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
}
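For illustration only, the vector can be indexed with obj_to_index(), exactly as the hooks further down do; objcg_of_object() is a hypothetical name.
/* Sketch: find the obj_cgroup an individual object is charged to, if any. */
static struct obj_cgroup *objcg_of_object(struct kmem_cache *s,
					  struct slab *slab, void *obj)
{
	struct obj_cgroup **vec = slab_objcgs(slab);

	return vec ? vec[obj_to_index(s, slab, obj)] : NULL;
}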
int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s,
gfp_t gfp, bool new_slab);
void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
enum node_stat_item idx, int nr);
static inline void memcg_free_page_obj_cgroups(struct page *page)
static inline void memcg_free_slab_cgroups(struct slab *slab)
{
kfree(page_objcgs(page));
page->memcg_data = 0;
kfree(slab_objcgs(slab));
slab->memcg_data = 0;
}
static inline size_t obj_full_size(struct kmem_cache *s)
@ -298,7 +501,7 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
gfp_t flags, size_t size,
void **p)
{
struct page *page;
struct slab *slab;
unsigned long off;
size_t i;
@ -307,19 +510,19 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
for (i = 0; i < size; i++) {
if (likely(p[i])) {
page = virt_to_head_page(p[i]);
slab = virt_to_slab(p[i]);
if (!page_objcgs(page) &&
memcg_alloc_page_obj_cgroups(page, s, flags,
if (!slab_objcgs(slab) &&
memcg_alloc_slab_cgroups(slab, s, flags,
false)) {
obj_cgroup_uncharge(objcg, obj_full_size(s));
continue;
}
off = obj_to_index(s, page, p[i]);
off = obj_to_index(s, slab, p[i]);
obj_cgroup_get(objcg);
page_objcgs(page)[off] = objcg;
mod_objcg_state(objcg, page_pgdat(page),
slab_objcgs(slab)[off] = objcg;
mod_objcg_state(objcg, slab_pgdat(slab),
cache_vmstat_idx(s), obj_full_size(s));
} else {
obj_cgroup_uncharge(objcg, obj_full_size(s));
@ -334,7 +537,7 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s_orig,
struct kmem_cache *s;
struct obj_cgroup **objcgs;
struct obj_cgroup *objcg;
struct page *page;
struct slab *slab;
unsigned int off;
int i;
@ -345,43 +548,52 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s_orig,
if (unlikely(!p[i]))
continue;
page = virt_to_head_page(p[i]);
objcgs = page_objcgs_check(page);
slab = virt_to_slab(p[i]);
/* we could be given a kmalloc_large() object, skip those */
if (!slab)
continue;
objcgs = slab_objcgs(slab);
if (!objcgs)
continue;
if (!s_orig)
s = page->slab_cache;
s = slab->slab_cache;
else
s = s_orig;
off = obj_to_index(s, page, p[i]);
off = obj_to_index(s, slab, p[i]);
objcg = objcgs[off];
if (!objcg)
continue;
objcgs[off] = NULL;
obj_cgroup_uncharge(objcg, obj_full_size(s));
mod_objcg_state(objcg, page_pgdat(page), cache_vmstat_idx(s),
mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s),
-obj_full_size(s));
obj_cgroup_put(objcg);
}
}
#else /* CONFIG_MEMCG_KMEM */
static inline struct obj_cgroup **slab_objcgs(struct slab *slab)
{
return NULL;
}
static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr)
{
return NULL;
}
static inline int memcg_alloc_page_obj_cgroups(struct page *page,
static inline int memcg_alloc_slab_cgroups(struct slab *slab,
struct kmem_cache *s, gfp_t gfp,
bool new_page)
bool new_slab)
{
return 0;
}
static inline void memcg_free_page_obj_cgroups(struct page *page)
static inline void memcg_free_slab_cgroups(struct slab *slab)
{
}
@ -405,35 +617,35 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s,
}
#endif /* CONFIG_MEMCG_KMEM */
#ifndef CONFIG_SLOB
static inline struct kmem_cache *virt_to_cache(const void *obj)
{
struct page *page;
struct slab *slab;
page = virt_to_head_page(obj);
if (WARN_ONCE(!PageSlab(page), "%s: Object is not a Slab page!\n",
slab = virt_to_slab(obj);
if (WARN_ONCE(!slab, "%s: Object is not a Slab page!\n",
__func__))
return NULL;
return page->slab_cache;
return slab->slab_cache;
}
static __always_inline void account_slab_page(struct page *page, int order,
struct kmem_cache *s,
gfp_t gfp)
static __always_inline void account_slab(struct slab *slab, int order,
struct kmem_cache *s, gfp_t gfp)
{
if (memcg_kmem_enabled() && (s->flags & SLAB_ACCOUNT))
memcg_alloc_page_obj_cgroups(page, s, gfp, true);
memcg_alloc_slab_cgroups(slab, s, gfp, true);
mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
PAGE_SIZE << order);
}
static __always_inline void unaccount_slab_page(struct page *page, int order,
struct kmem_cache *s)
static __always_inline void unaccount_slab(struct slab *slab, int order,
struct kmem_cache *s)
{
if (memcg_kmem_enabled())
memcg_free_page_obj_cgroups(page);
memcg_free_slab_cgroups(slab);
mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
-(PAGE_SIZE << order));
}
@ -452,6 +664,7 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
print_tracking(cachep, x);
return cachep;
}
#endif /* CONFIG_SLOB */
static inline size_t slab_ksize(const struct kmem_cache *s)
{
@ -575,11 +788,6 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
#endif
void *slab_start(struct seq_file *m, loff_t *pos);
void *slab_next(struct seq_file *m, void *p, loff_t *pos);
void slab_stop(struct seq_file *m, void *p);
int memcg_slab_show(struct seq_file *m, void *p);
#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
void dump_unreclaimable_slab(void);
#else
@ -635,7 +843,7 @@ static inline void debugfs_slab_release(struct kmem_cache *s) { }
#define KS_ADDRS_COUNT 16
struct kmem_obj_info {
void *kp_ptr;
struct page *kp_page;
struct slab *kp_slab;
void *kp_objp;
unsigned long kp_data_offset;
struct kmem_cache *kp_slab_cache;
@ -643,7 +851,18 @@ struct kmem_obj_info {
void *kp_stack[KS_ADDRS_COUNT];
void *kp_free_stack[KS_ADDRS_COUNT];
};
void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page);
void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab);
#endif
#ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR
void __check_heap_object(const void *ptr, unsigned long n,
const struct slab *slab, bool to_user);
#else
static inline
void __check_heap_object(const void *ptr, unsigned long n,
const struct slab *slab, bool to_user)
{
}
#endif
#endif /* MM_SLAB_H */


@ -37,14 +37,6 @@ LIST_HEAD(slab_caches);
DEFINE_MUTEX(slab_mutex);
struct kmem_cache *kmem_cache;
#ifdef CONFIG_HARDENED_USERCOPY
bool usercopy_fallback __ro_after_init =
IS_ENABLED(CONFIG_HARDENED_USERCOPY_FALLBACK);
module_param(usercopy_fallback, bool, 0400);
MODULE_PARM_DESC(usercopy_fallback,
"WARN instead of reject usercopy whitelist violations");
#endif
static LIST_HEAD(slab_caches_to_rcu_destroy);
static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work);
static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
@ -497,9 +489,7 @@ void slab_kmem_cache_release(struct kmem_cache *s)
void kmem_cache_destroy(struct kmem_cache *s)
{
int err;
if (unlikely(!s))
if (unlikely(!s) || !kasan_check_byte(s))
return;
cpus_read_lock();
@ -509,12 +499,9 @@ void kmem_cache_destroy(struct kmem_cache *s)
if (s->refcount)
goto out_unlock;
err = shutdown_cache(s);
if (err) {
pr_err("%s %s: Slab cache still has objects\n",
__func__, s->name);
dump_stack();
}
WARN(shutdown_cache(s),
"%s %s: Slab cache still has objects when called from %pS",
__func__, s->name, (void *)_RET_IP_);
out_unlock:
mutex_unlock(&slab_mutex);
cpus_read_unlock();
@ -558,13 +545,13 @@ bool slab_is_available(void)
*/
bool kmem_valid_obj(void *object)
{
struct page *page;
struct folio *folio;
/* Some arches consider ZERO_SIZE_PTR to be a valid address. */
if (object < (void *)PAGE_SIZE || !virt_addr_valid(object))
return false;
page = virt_to_head_page(object);
return PageSlab(page);
folio = virt_to_folio(object);
return folio_test_slab(folio);
}
EXPORT_SYMBOL_GPL(kmem_valid_obj);
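A minimal usage sketch of the intended pairing with kmem_dump_obj(); dump_if_slab_obj() is a made-up name.
/* Sketch: only ask for a dump when the pointer really is a slab object. */
static void dump_if_slab_obj(void *p)
{
	if (kmem_valid_obj(p))
		kmem_dump_obj(p);
}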
@ -587,18 +574,18 @@ void kmem_dump_obj(void *object)
{
char *cp = IS_ENABLED(CONFIG_MMU) ? "" : "/vmalloc";
int i;
struct page *page;
struct slab *slab;
unsigned long ptroffset;
struct kmem_obj_info kp = { };
if (WARN_ON_ONCE(!virt_addr_valid(object)))
return;
page = virt_to_head_page(object);
if (WARN_ON_ONCE(!PageSlab(page))) {
slab = virt_to_slab(object);
if (WARN_ON_ONCE(!slab)) {
pr_cont(" non-slab memory.\n");
return;
}
kmem_obj_info(&kp, object, page);
kmem_obj_info(&kp, object, slab);
if (kp.kp_slab_cache)
pr_cont(" slab%s %s", cp, kp.kp_slab_cache->name);
else
@ -832,7 +819,7 @@ void __init setup_kmalloc_cache_index_table(void)
if (KMALLOC_MIN_SIZE >= 64) {
/*
* The 96 byte size cache is not used if the alignment
* The 96 byte sized cache is not used if the alignment
* is 64 byte.
*/
for (i = 64 + 8; i <= 96; i += 8)
@ -857,7 +844,7 @@ new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags)
if (type == KMALLOC_RECLAIM) {
flags |= SLAB_RECLAIM_ACCOUNT;
} else if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_CGROUP)) {
if (cgroup_memory_nokmem) {
if (mem_cgroup_kmem_disabled()) {
kmalloc_caches[type][idx] = kmalloc_caches[KMALLOC_NORMAL][idx];
return;
}
@ -1052,18 +1039,18 @@ static void print_slabinfo_header(struct seq_file *m)
seq_putc(m, '\n');
}
void *slab_start(struct seq_file *m, loff_t *pos)
static void *slab_start(struct seq_file *m, loff_t *pos)
{
mutex_lock(&slab_mutex);
return seq_list_start(&slab_caches, *pos);
}
void *slab_next(struct seq_file *m, void *p, loff_t *pos)
static void *slab_next(struct seq_file *m, void *p, loff_t *pos)
{
return seq_list_next(p, &slab_caches, pos);
}
void slab_stop(struct seq_file *m, void *p)
static void slab_stop(struct seq_file *m, void *p)
{
mutex_unlock(&slab_mutex);
}
@ -1131,17 +1118,6 @@ void dump_unreclaimable_slab(void)
mutex_unlock(&slab_mutex);
}
#if defined(CONFIG_MEMCG_KMEM)
int memcg_slab_show(struct seq_file *m, void *p)
{
/*
* Deprecated.
* Please, take a look at tools/cgroup/slabinfo.py .
*/
return 0;
}
#endif
/*
* slabinfo_op - iterator that generates /proc/slabinfo
*


@ -30,7 +30,7 @@
* If kmalloc is asked for objects of PAGE_SIZE or larger, it calls
* alloc_pages() directly, allocating compound pages so the page order
* does not have to be separately tracked.
* These objects are detected in kfree() because PageSlab()
* These objects are detected in kfree() because folio_test_slab()
* is false for them.
*
* SLAB is emulated on top of SLOB by simply calling constructors and
@ -105,21 +105,21 @@ static LIST_HEAD(free_slob_large);
/*
* slob_page_free: true for pages on free_slob_pages list.
*/
static inline int slob_page_free(struct page *sp)
static inline int slob_page_free(struct slab *slab)
{
return PageSlobFree(sp);
return PageSlobFree(slab_page(slab));
}
static void set_slob_page_free(struct page *sp, struct list_head *list)
static void set_slob_page_free(struct slab *slab, struct list_head *list)
{
list_add(&sp->slab_list, list);
__SetPageSlobFree(sp);
list_add(&slab->slab_list, list);
__SetPageSlobFree(slab_page(slab));
}
static inline void clear_slob_page_free(struct page *sp)
static inline void clear_slob_page_free(struct slab *slab)
{
list_del(&sp->slab_list);
__ClearPageSlobFree(sp);
list_del(&slab->slab_list);
__ClearPageSlobFree(slab_page(slab));
}
#define SLOB_UNIT sizeof(slob_t)
@ -234,7 +234,7 @@ static void slob_free_pages(void *b, int order)
* freelist, in this case @page_removed_from_list will be set to
* true (set to false otherwise).
*/
static void *slob_page_alloc(struct page *sp, size_t size, int align,
static void *slob_page_alloc(struct slab *sp, size_t size, int align,
int align_offset, bool *page_removed_from_list)
{
slob_t *prev, *cur, *aligned = NULL;
@ -301,7 +301,8 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align,
static void *slob_alloc(size_t size, gfp_t gfp, int align, int node,
int align_offset)
{
struct page *sp;
struct folio *folio;
struct slab *sp;
struct list_head *slob_list;
slob_t *b = NULL;
unsigned long flags;
@ -323,7 +324,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node,
* If there's a node specification, search for a partial
* page with a matching node id in the freelist.
*/
if (node != NUMA_NO_NODE && page_to_nid(sp) != node)
if (node != NUMA_NO_NODE && slab_nid(sp) != node)
continue;
#endif
/* Enough room on this page? */
@ -358,8 +359,9 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node,
b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node);
if (!b)
return NULL;
sp = virt_to_page(b);
__SetPageSlab(sp);
folio = virt_to_folio(b);
__folio_set_slab(folio);
sp = folio_slab(folio);
spin_lock_irqsave(&slob_lock, flags);
sp->units = SLOB_UNITS(PAGE_SIZE);
@ -381,7 +383,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node,
*/
static void slob_free(void *block, int size)
{
struct page *sp;
struct slab *sp;
slob_t *prev, *next, *b = (slob_t *)block;
slobidx_t units;
unsigned long flags;
@ -391,7 +393,7 @@ static void slob_free(void *block, int size)
return;
BUG_ON(!size);
sp = virt_to_page(block);
sp = virt_to_slab(block);
units = SLOB_UNITS(size);
spin_lock_irqsave(&slob_lock, flags);
@ -401,8 +403,7 @@ static void slob_free(void *block, int size)
if (slob_page_free(sp))
clear_slob_page_free(sp);
spin_unlock_irqrestore(&slob_lock, flags);
__ClearPageSlab(sp);
page_mapcount_reset(sp);
__folio_clear_slab(slab_folio(sp));
slob_free_pages(b, 0);
return;
}
@ -462,10 +463,10 @@ static void slob_free(void *block, int size)
}
#ifdef CONFIG_PRINTK
void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
{
kpp->kp_ptr = object;
kpp->kp_page = page;
kpp->kp_slab = slab;
}
#endif
@ -544,7 +545,7 @@ EXPORT_SYMBOL(__kmalloc_node_track_caller);
void kfree(const void *block)
{
struct page *sp;
struct folio *sp;
trace_kfree(_RET_IP_, block);
@ -552,16 +553,17 @@ void kfree(const void *block)
return;
kmemleak_free(block);
sp = virt_to_page(block);
if (PageSlab(sp)) {
sp = virt_to_folio(block);
if (folio_test_slab(sp)) {
int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
unsigned int *m = (unsigned int *)(block - align);
slob_free(m, *m + align);
} else {
unsigned int order = compound_order(sp);
mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B,
unsigned int order = folio_order(sp);
mod_node_page_state(folio_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B,
-(PAGE_SIZE << order));
__free_pages(sp, order);
__free_pages(folio_page(sp, 0), order);
}
}
@ -570,7 +572,7 @@ EXPORT_SYMBOL(kfree);
/* can't use ksize for kmem_cache_alloc memory, only kmalloc */
size_t __ksize(const void *block)
{
struct page *sp;
struct folio *folio;
int align;
unsigned int *m;
@ -578,9 +580,9 @@ size_t __ksize(const void *block)
if (unlikely(block == ZERO_SIZE_PTR))
return 0;
sp = virt_to_page(block);
if (unlikely(!PageSlab(sp)))
return page_size(sp);
folio = virt_to_folio(block);
if (unlikely(!folio_test_slab(folio)))
return folio_size(folio);
align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
m = (unsigned int *)(block - align);
@ -666,6 +668,7 @@ static void kmem_rcu_free(struct rcu_head *head)
void kmem_cache_free(struct kmem_cache *c, void *b)
{
kmemleak_free_recursive(b, c->flags);
trace_kmem_cache_free(_RET_IP_, b, c->name);
if (unlikely(c->flags & SLAB_TYPESAFE_BY_RCU)) {
struct slob_rcu *slob_rcu;
slob_rcu = b + (c->size - sizeof(struct slob_rcu));
@ -674,8 +677,6 @@ void kmem_cache_free(struct kmem_cache *c, void *b)
} else {
__kmem_cache_free(b, c->size);
}
trace_kmem_cache_free(_RET_IP_, b, c->name);
}
EXPORT_SYMBOL(kmem_cache_free);

mm/slub.c

File diff suppressed because it is too large

@ -451,7 +451,7 @@ static void *sparsemap_buf_end __meminitdata;
static inline void __meminit sparse_buffer_free(unsigned long size)
{
WARN_ON(!sparsemap_buf || size == 0);
memblock_free_early(__pa(sparsemap_buf), size);
memblock_free(sparsemap_buf, size);
}
static void __init sparse_buffer_init(unsigned long size, int nid)
@ -722,7 +722,7 @@ static void free_map_bootmem(struct page *memmap)
>> PAGE_SHIFT;
for (i = 0; i < nr_pages; i++, page++) {
magic = (unsigned long) page->freelist;
magic = page->index;
BUG_ON(magic == NODE_INFO);

mm/swap.c

@ -80,10 +80,11 @@ static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = {
static void __page_cache_release(struct page *page)
{
if (PageLRU(page)) {
struct folio *folio = page_folio(page);
struct lruvec *lruvec;
unsigned long flags;
lruvec = lock_page_lruvec_irqsave(page, &flags);
lruvec = folio_lruvec_lock_irqsave(folio, &flags);
del_page_from_lru_list(page, lruvec);
__clear_page_lru_flags(page);
unlock_page_lruvec_irqrestore(lruvec, flags);
@ -94,7 +95,7 @@ static void __page_cache_release(struct page *page)
static void __put_single_page(struct page *page)
{
__page_cache_release(page);
mem_cgroup_uncharge(page);
mem_cgroup_uncharge(page_folio(page));
free_unref_page(page, 0);
}
@ -134,18 +135,28 @@ EXPORT_SYMBOL(__put_page);
* put_pages_list() - release a list of pages
* @pages: list of pages threaded on page->lru
*
* Release a list of pages which are strung together on page.lru. Currently
* used by read_cache_pages() and related error recovery code.
* Release a list of pages which are strung together on page.lru.
*/
void put_pages_list(struct list_head *pages)
{
while (!list_empty(pages)) {
struct page *victim;
struct page *page, *next;
victim = lru_to_page(pages);
list_del(&victim->lru);
put_page(victim);
list_for_each_entry_safe(page, next, pages, lru) {
if (!put_page_testzero(page)) {
list_del(&page->lru);
continue;
}
if (PageHead(page)) {
list_del(&page->lru);
__put_compound_page(page);
continue;
}
/* Cannot be PageLRU because it's passed to us using the lru */
__ClearPageWaiters(page);
}
free_unref_page_list(pages);
INIT_LIST_HEAD(pages);
}
EXPORT_SYMBOL(put_pages_list);
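A hedged usage sketch (not from the patch): callers collect pages on a private list threaded through page->lru and hand the whole list over; drop_collected_pages() is a hypothetical wrapper.
/* Sketch: release a privately collected list of pages in one call. */
static void drop_collected_pages(struct list_head *pages)
{
	put_pages_list(pages);		/* drops the references and empties the list */
	WARN_ON(!list_empty(pages));	/* the list head is reinitialised on return */
}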
@ -188,12 +199,13 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
struct folio *folio = page_folio(page);
/* block memcg migration during page moving between lru */
if (!TestClearPageLRU(page))
continue;
lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags);
lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags);
(*move_fn)(page, lruvec);
SetPageLRU(page);
@ -206,11 +218,13 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec)
{
if (!PageUnevictable(page)) {
del_page_from_lru_list(page, lruvec);
ClearPageActive(page);
add_page_to_lru_list_tail(page, lruvec);
__count_vm_events(PGROTATED, thp_nr_pages(page));
struct folio *folio = page_folio(page);
if (!folio_test_unevictable(folio)) {
lruvec_del_folio(lruvec, folio);
folio_clear_active(folio);
lruvec_add_folio_tail(lruvec, folio);
__count_vm_events(PGROTATED, folio_nr_pages(folio));
}
}
@ -227,23 +241,23 @@ static bool pagevec_add_and_need_flush(struct pagevec *pvec, struct page *page)
}
/*
* Writeback is about to end against a page which has been marked for immediate
* reclaim. If it still appears to be reclaimable, move it to the tail of the
* inactive list.
* Writeback is about to end against a folio which has been marked for
* immediate reclaim. If it still appears to be reclaimable, move it
* to the tail of the inactive list.
*
* rotate_reclaimable_page() must disable IRQs, to prevent nasty races.
* folio_rotate_reclaimable() must disable IRQs, to prevent nasty races.
*/
void rotate_reclaimable_page(struct page *page)
void folio_rotate_reclaimable(struct folio *folio)
{
if (!PageLocked(page) && !PageDirty(page) &&
!PageUnevictable(page) && PageLRU(page)) {
if (!folio_test_locked(folio) && !folio_test_dirty(folio) &&
!folio_test_unevictable(folio) && folio_test_lru(folio)) {
struct pagevec *pvec;
unsigned long flags;
get_page(page);
folio_get(folio);
local_lock_irqsave(&lru_rotate.lock, flags);
pvec = this_cpu_ptr(&lru_rotate.pvec);
if (pagevec_add_and_need_flush(pvec, page))
if (pagevec_add_and_need_flush(pvec, &folio->page))
pagevec_lru_move_fn(pvec, pagevec_move_tail_fn);
local_unlock_irqrestore(&lru_rotate.lock, flags);
}
@ -289,21 +303,21 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
} while ((lruvec = parent_lruvec(lruvec)));
}
void lru_note_cost_page(struct page *page)
void lru_note_cost_folio(struct folio *folio)
{
lru_note_cost(mem_cgroup_page_lruvec(page),
page_is_file_lru(page), thp_nr_pages(page));
lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio),
folio_nr_pages(folio));
}
static void __activate_page(struct page *page, struct lruvec *lruvec)
static void __folio_activate(struct folio *folio, struct lruvec *lruvec)
{
if (!PageActive(page) && !PageUnevictable(page)) {
int nr_pages = thp_nr_pages(page);
if (!folio_test_active(folio) && !folio_test_unevictable(folio)) {
long nr_pages = folio_nr_pages(folio);
del_page_from_lru_list(page, lruvec);
SetPageActive(page);
add_page_to_lru_list(page, lruvec);
trace_mm_lru_activate(page);
lruvec_del_folio(lruvec, folio);
folio_set_active(folio);
lruvec_add_folio(lruvec, folio);
trace_mm_lru_activate(folio);
__count_vm_events(PGACTIVATE, nr_pages);
__count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE,
@ -312,6 +326,11 @@ static void __activate_page(struct page *page, struct lruvec *lruvec)
}
#ifdef CONFIG_SMP
static void __activate_page(struct page *page, struct lruvec *lruvec)
{
return __folio_activate(page_folio(page), lruvec);
}
static void activate_page_drain(int cpu)
{
struct pagevec *pvec = &per_cpu(lru_pvecs.activate_page, cpu);
@ -325,16 +344,16 @@ static bool need_activate_page_drain(int cpu)
return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0;
}
static void activate_page(struct page *page)
static void folio_activate(struct folio *folio)
{
page = compound_head(page);
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
if (folio_test_lru(folio) && !folio_test_active(folio) &&
!folio_test_unevictable(folio)) {
struct pagevec *pvec;
folio_get(folio);
local_lock(&lru_pvecs.lock);
pvec = this_cpu_ptr(&lru_pvecs.activate_page);
get_page(page);
if (pagevec_add_and_need_flush(pvec, page))
if (pagevec_add_and_need_flush(pvec, &folio->page))
pagevec_lru_move_fn(pvec, __activate_page);
local_unlock(&lru_pvecs.lock);
}
@ -345,21 +364,20 @@ static inline void activate_page_drain(int cpu)
{
}
static void activate_page(struct page *page)
static void folio_activate(struct folio *folio)
{
struct lruvec *lruvec;
page = compound_head(page);
if (TestClearPageLRU(page)) {
lruvec = lock_page_lruvec_irq(page);
__activate_page(page, lruvec);
if (folio_test_clear_lru(folio)) {
lruvec = folio_lruvec_lock_irq(folio);
__folio_activate(folio, lruvec);
unlock_page_lruvec_irq(lruvec);
SetPageLRU(page);
folio_set_lru(folio);
}
}
#endif
static void __lru_cache_activate_page(struct page *page)
static void __lru_cache_activate_folio(struct folio *folio)
{
struct pagevec *pvec;
int i;
@ -380,8 +398,8 @@ static void __lru_cache_activate_page(struct page *page)
for (i = pagevec_count(pvec) - 1; i >= 0; i--) {
struct page *pagevec_page = pvec->pages[i];
if (pagevec_page == page) {
SetPageActive(page);
if (pagevec_page == &folio->page) {
folio_set_active(folio);
break;
}
}
@ -399,61 +417,59 @@ static void __lru_cache_activate_page(struct page *page)
* When a newly allocated page is not yet visible, so safe for non-atomic ops,
* __SetPageReferenced(page) may be substituted for mark_page_accessed(page).
*/
void mark_page_accessed(struct page *page)
void folio_mark_accessed(struct folio *folio)
{
page = compound_head(page);
if (!PageReferenced(page)) {
SetPageReferenced(page);
} else if (PageUnevictable(page)) {
if (!folio_test_referenced(folio)) {
folio_set_referenced(folio);
} else if (folio_test_unevictable(folio)) {
/*
* Unevictable pages are on the "LRU_UNEVICTABLE" list. But,
* this list is never rotated or maintained, so marking an
* unevictable page accessed has no effect.
*/
} else if (!PageActive(page)) {
} else if (!folio_test_active(folio)) {
/*
* If the page is on the LRU, queue it for activation via
* lru_pvecs.activate_page. Otherwise, assume the page is on a
* pagevec, mark it active and it'll be moved to the active
* LRU on the next drain.
*/
if (PageLRU(page))
activate_page(page);
if (folio_test_lru(folio))
folio_activate(folio);
else
__lru_cache_activate_page(page);
ClearPageReferenced(page);
workingset_activation(page);
__lru_cache_activate_folio(folio);
folio_clear_referenced(folio);
workingset_activation(folio);
}
if (page_is_idle(page))
clear_page_idle(page);
if (folio_test_idle(folio))
folio_clear_idle(folio);
}
EXPORT_SYMBOL(mark_page_accessed);
EXPORT_SYMBOL(folio_mark_accessed);
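As a sketch of the policy above (hypothetical helper name), a folio already sitting on the inactive LRU needs two accesses before it is queued for activation:
/* Sketch: first touch sets PG_referenced, second touch queues activation. */
static void promote_on_second_touch(struct folio *folio)
{
	folio_mark_accessed(folio);	/* referenced */
	folio_mark_accessed(folio);	/* referenced + on LRU -> folio_activate() */
}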
/**
* lru_cache_add - add a page to a page list
* @page: the page to be added to the LRU.
* folio_add_lru - Add a folio to an LRU list.
* @folio: The folio to be added to the LRU.
*
* Queue the page for addition to the LRU via pagevec. The decision on whether
* Queue the folio for addition to the LRU. The decision on whether
* to add the page to the [in]active [file|anon] list is deferred until the
* pagevec is drained. This gives a chance for the caller of lru_cache_add()
* to have the page added to the active list using mark_page_accessed().
* pagevec is drained. This gives a chance for the caller of folio_add_lru()
* to have the folio added to the active list using folio_mark_accessed().
*/
void lru_cache_add(struct page *page)
void folio_add_lru(struct folio *folio)
{
struct pagevec *pvec;
VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
VM_BUG_ON_PAGE(PageLRU(page), page);
VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio);
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
get_page(page);
folio_get(folio);
local_lock(&lru_pvecs.lock);
pvec = this_cpu_ptr(&lru_pvecs.lru_add);
if (pagevec_add_and_need_flush(pvec, page))
if (pagevec_add_and_need_flush(pvec, &folio->page))
__pagevec_lru_add(pvec);
local_unlock(&lru_pvecs.lock);
}
EXPORT_SYMBOL(lru_cache_add);
EXPORT_SYMBOL(folio_add_lru);
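A usage sketch assuming a freshly allocated folio that is not yet visible on any LRU; add_folio_active() is a hypothetical name following the pattern the comment describes.
/* Sketch: queue a new folio for the LRU, then mark it accessed so the
 * pagevec drain moves it straight to the active list. */
static void add_folio_active(struct folio *folio)
{
	folio_add_lru(folio);
	folio_mark_accessed(folio);	/* finds the folio in the pagevec, sets PG_active */
}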
/**
* lru_cache_add_inactive_or_unevictable
@ -866,7 +882,7 @@ void lru_cache_disable(void)
* all online CPUs so any calls of lru_cache_disabled wrapped by
* local_lock or preemption disabled would be ordered by that.
* The atomic operation doesn't need to have stronger ordering
* requirements because that is enforeced by the scheduling
* requirements because that is enforced by the scheduling
* guarantees.
*/
__lru_add_drain_all(true);
@ -888,11 +904,12 @@ void release_pages(struct page **pages, int nr)
int i;
LIST_HEAD(pages_to_free);
struct lruvec *lruvec = NULL;
unsigned long flags;
unsigned long flags = 0;
unsigned int lock_batch;
for (i = 0; i < nr; i++) {
struct page *page = pages[i];
struct folio *folio = page_folio(page);
/*
* Make sure the IRQ-safe lock-holding time does not get
@ -904,7 +921,7 @@ void release_pages(struct page **pages, int nr)
lruvec = NULL;
}
page = compound_head(page);
page = &folio->page;
if (is_huge_zero_page(page))
continue;
@ -943,7 +960,7 @@ void release_pages(struct page **pages, int nr)
if (PageLRU(page)) {
struct lruvec *prev_lruvec = lruvec;
lruvec = relock_page_lruvec_irqsave(page, lruvec,
lruvec = folio_lruvec_relock_irqsave(folio, lruvec,
&flags);
if (prev_lruvec != lruvec)
lock_batch = 0;
@ -985,17 +1002,18 @@ void __pagevec_release(struct pagevec *pvec)
}
EXPORT_SYMBOL(__pagevec_release);
static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec)
static void __pagevec_lru_add_fn(struct folio *folio, struct lruvec *lruvec)
{
int was_unevictable = TestClearPageUnevictable(page);
int nr_pages = thp_nr_pages(page);
int was_unevictable = folio_test_clear_unevictable(folio);
long nr_pages = folio_nr_pages(folio);
VM_BUG_ON_PAGE(PageLRU(page), page);
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
/*
* Page becomes evictable in two ways:
* A folio becomes evictable in two ways:
* 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()].
* 2) Before acquiring LRU lock to put the page to correct LRU and then
* 2) Before acquiring LRU lock to put the folio on the correct LRU
* and then
* a) do PageLRU check with lock [check_move_unevictable_pages]
* b) do PageLRU check before lock [clear_page_mlock]
*
@ -1004,35 +1022,36 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec)
*
* #0: __pagevec_lru_add_fn #1: clear_page_mlock
*
* SetPageLRU() TestClearPageMlocked()
* folio_set_lru() folio_test_clear_mlocked()
* smp_mb() // explicit ordering // above provides strict
* // ordering
* PageMlocked() PageLRU()
* folio_test_mlocked() folio_test_lru()
*
*
* if '#1' does not observe setting of PG_lru by '#0' and fails
* isolation, the explicit barrier will make sure that page_evictable
* check will put the page in correct LRU. Without smp_mb(), SetPageLRU
* can be reordered after PageMlocked check and can make '#1' to fail
* the isolation of the page whose Mlocked bit is cleared (#0 is also
* looking at the same page) and the evictable page will be stranded
* in an unevictable LRU.
* if '#1' does not observe setting of PG_lru by '#0' and
* fails isolation, the explicit barrier will make sure that
* folio_evictable check will put the folio on the correct
* LRU. Without smp_mb(), folio_set_lru() can be reordered
* after folio_test_mlocked() check and can make '#1' fail the
* isolation of the folio whose mlocked bit is cleared (#0 is
* also looking at the same folio) and the evictable folio will
* be stranded on an unevictable LRU.
*/
SetPageLRU(page);
folio_set_lru(folio);
smp_mb__after_atomic();
if (page_evictable(page)) {
if (folio_evictable(folio)) {
if (was_unevictable)
__count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
} else {
ClearPageActive(page);
SetPageUnevictable(page);
folio_clear_active(folio);
folio_set_unevictable(folio);
if (!was_unevictable)
__count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
}
add_page_to_lru_list(page, lruvec);
trace_mm_lru_insertion(page);
lruvec_add_folio(lruvec, folio);
trace_mm_lru_insertion(folio);
}
/*
@ -1046,10 +1065,10 @@ void __pagevec_lru_add(struct pagevec *pvec)
unsigned long flags = 0;
for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
struct folio *folio = page_folio(pvec->pages[i]);
lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags);
__pagevec_lru_add_fn(page, lruvec);
lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags);
__pagevec_lru_add_fn(folio, lruvec);
}
if (lruvec)
unlock_page_lruvec_irqrestore(lruvec, flags);
@ -1058,24 +1077,24 @@ void __pagevec_lru_add(struct pagevec *pvec)
}
/**
* pagevec_remove_exceptionals - pagevec exceptionals pruning
* @pvec: The pagevec to prune
* folio_batch_remove_exceptionals() - Prune non-folios from a batch.
* @fbatch: The batch to prune
*
* find_get_entries() fills both pages and XArray value entries (aka
* exceptional entries) into the pagevec. This function prunes all
* exceptionals from @pvec without leaving holes, so that it can be
* passed on to page-only pagevec operations.
* find_get_entries() fills a batch with both folios and shadow/swap/DAX
* entries. This function prunes all the non-folio entries from @fbatch
* without leaving holes, so that it can be passed on to folio-only batch
* operations.
*/
void pagevec_remove_exceptionals(struct pagevec *pvec)
void folio_batch_remove_exceptionals(struct folio_batch *fbatch)
{
int i, j;
unsigned int i, j;
for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
if (!xa_is_value(page))
pvec->pages[j++] = page;
for (i = 0, j = 0; i < folio_batch_count(fbatch); i++) {
struct folio *folio = fbatch->folios[i];
if (!xa_is_value(folio))
fbatch->folios[j++] = folio;
}
pvec->nr = j;
fbatch->nr = j;
}
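A minimal sketch of the intended call pattern, assuming the find_get_entries() signature used elsewhere in this series; walk_found_folios() is a hypothetical name.
/* Sketch: fetch mixed entries, prune the value entries, keep only folios. */
static void walk_found_folios(struct address_space *mapping,
			      pgoff_t start, pgoff_t end)
{
	struct folio_batch fbatch;
	pgoff_t indices[PAGEVEC_SIZE];
	unsigned int nr;

	folio_batch_init(&fbatch);
	while ((nr = find_get_entries(mapping, start, end, &fbatch, indices))) {
		start = indices[nr - 1] + 1;	/* advance past what was returned */
		folio_batch_remove_exceptionals(&fbatch);
		/* fbatch now holds only real folios */
		folio_batch_release(&fbatch);
	}
}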
/**


@ -30,6 +30,7 @@
#include <linux/swap_slots.h>
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mutex.h>
#include <linux/mm.h>


@ -478,7 +478,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* __read_swap_cache_async(), which has set SWAP_HAS_CACHE
* in swap_map, but not yet added its page to swap cache.
*/
cond_resched();
schedule_timeout_uninterruptible(1);
}
/*
@ -498,7 +498,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
mem_cgroup_swapin_uncharge_swap(entry);
if (shadow)
workingset_refault(page, shadow);
workingset_refault(page_folio(page), shadow);
/* Caller will initiate read into locked page */
lru_cache_add(page);


@ -18,7 +18,7 @@
#include <linux/pagemap.h>
#include <linux/namei.h>
#include <linux/shmem_fs.h>
#include <linux/blkdev.h>
#include <linux/blk-cgroup.h>
#include <linux/random.h>
#include <linux/writeback.h>
#include <linux/proc_fs.h>
@ -49,7 +49,7 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);
DEFINE_SPINLOCK(swap_lock);
static DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
atomic_long_t nr_swap_pages;
/*
@ -71,7 +71,7 @@ static const char Unused_offset[] = "Unused swap offset entry ";
* all active swap_info_structs
* protected with swap_lock, and ordered by priority.
*/
PLIST_HEAD(swap_active_head);
static PLIST_HEAD(swap_active_head);
/*
* all available (active, not full) swap_info_structs
@ -1601,31 +1601,30 @@ static bool page_swapped(struct page *page)
return false;
}
static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
static int page_trans_huge_map_swapcount(struct page *page,
int *total_swapcount)
{
int i, map_swapcount, _total_mapcount, _total_swapcount;
int i, map_swapcount, _total_swapcount;
unsigned long offset = 0;
struct swap_info_struct *si;
struct swap_cluster_info *ci = NULL;
unsigned char *map = NULL;
int mapcount, swapcount = 0;
int swapcount = 0;
/* hugetlbfs shouldn't call it */
VM_BUG_ON_PAGE(PageHuge(page), page);
if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) {
mapcount = page_trans_huge_mapcount(page, total_mapcount);
if (PageSwapCache(page))
swapcount = page_swapcount(page);
if (total_swapcount)
*total_swapcount = swapcount;
return mapcount + swapcount;
return swapcount + page_trans_huge_mapcount(page);
}
page = compound_head(page);
_total_mapcount = _total_swapcount = map_swapcount = 0;
_total_swapcount = map_swapcount = 0;
if (PageSwapCache(page)) {
swp_entry_t entry;
@ -1639,8 +1638,7 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
if (map)
ci = lock_cluster(si, offset);
for (i = 0; i < HPAGE_PMD_NR; i++) {
mapcount = atomic_read(&page[i]._mapcount) + 1;
_total_mapcount += mapcount;
int mapcount = atomic_read(&page[i]._mapcount) + 1;
if (map) {
swapcount = swap_count(map[offset + i]);
_total_swapcount += swapcount;
@ -1648,19 +1646,14 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
map_swapcount = max(map_swapcount, mapcount + swapcount);
}
unlock_cluster(ci);
if (PageDoubleMap(page)) {
if (PageDoubleMap(page))
map_swapcount -= 1;
_total_mapcount -= HPAGE_PMD_NR;
}
mapcount = compound_mapcount(page);
map_swapcount += mapcount;
_total_mapcount += mapcount;
if (total_mapcount)
*total_mapcount = _total_mapcount;
if (total_swapcount)
*total_swapcount = _total_swapcount;
return map_swapcount;
return map_swapcount + compound_mapcount(page);
}
/*
@ -1668,22 +1661,15 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
* to it. And as a side-effect, free up its swap: because the old content
* on disk will never be read, and seeking back there to write new content
* later would only waste time away from clustering.
*
* NOTE: total_map_swapcount should not be relied upon by the caller if
* reuse_swap_page() returns false, but it may be always overwritten
* (see the other implementation for CONFIG_SWAP=n).
*/
bool reuse_swap_page(struct page *page, int *total_map_swapcount)
bool reuse_swap_page(struct page *page)
{
int count, total_mapcount, total_swapcount;
int count, total_swapcount;
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (unlikely(PageKsm(page)))
return false;
count = page_trans_huge_map_swapcount(page, &total_mapcount,
&total_swapcount);
if (total_map_swapcount)
*total_map_swapcount = total_mapcount + total_swapcount;
count = page_trans_huge_map_swapcount(page, &total_swapcount);
if (count == 1 && PageSwapCache(page) &&
(likely(!PageTransCompound(page)) ||
/* The remaining swap count will be freed soon */
@ -1917,14 +1903,14 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
get_page(page);
set_pte_at(vma->vm_mm, addr, pte,
pte_mkold(mk_pte(page, vma->vm_page_prot)));
if (page == swapcache) {
page_add_anon_rmap(page, vma, addr, false);
} else { /* ksm created a completely new copy */
page_add_new_anon_rmap(page, vma, addr, false);
lru_cache_add_inactive_or_unevictable(page, vma);
}
set_pte_at(vma->vm_mm, addr, pte,
pte_mkold(mk_pte(page, vma->vm_page_prot)));
swap_free(entry);
out:
pte_unmap_unlock(pte, ptl);
@ -1937,8 +1923,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
unsigned int type, bool frontswap,
unsigned long *fs_pages_to_unuse)
unsigned int type)
{
struct page *page;
swp_entry_t entry;
@ -1959,9 +1944,6 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
continue;
offset = swp_offset(entry);
if (frontswap && !frontswap_test(si, offset))
continue;
pte_unmap(pte);
swap_map = &si->swap_map[offset];
page = lookup_swap_cache(entry, vma, addr);
@ -1993,11 +1975,6 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
try_to_free_swap(page);
unlock_page(page);
put_page(page);
if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) {
ret = FRONTSWAP_PAGES_UNUSED;
goto out;
}
try_next:
pte = pte_offset_map(pmd, addr);
} while (pte++, addr += PAGE_SIZE, addr != end);
@ -2010,8 +1987,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end,
unsigned int type, bool frontswap,
unsigned long *fs_pages_to_unuse)
unsigned int type)
{
pmd_t *pmd;
unsigned long next;
@ -2023,8 +1999,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
next = pmd_addr_end(addr, end);
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
continue;
ret = unuse_pte_range(vma, pmd, addr, next, type,
frontswap, fs_pages_to_unuse);
ret = unuse_pte_range(vma, pmd, addr, next, type);
if (ret)
return ret;
} while (pmd++, addr = next, addr != end);
@ -2033,8 +2008,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
unsigned long addr, unsigned long end,
unsigned int type, bool frontswap,
unsigned long *fs_pages_to_unuse)
unsigned int type)
{
pud_t *pud;
unsigned long next;
@ -2045,8 +2019,7 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
ret = unuse_pmd_range(vma, pud, addr, next, type,
frontswap, fs_pages_to_unuse);
ret = unuse_pmd_range(vma, pud, addr, next, type);
if (ret)
return ret;
} while (pud++, addr = next, addr != end);
@ -2055,8 +2028,7 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
unsigned long addr, unsigned long end,
unsigned int type, bool frontswap,
unsigned long *fs_pages_to_unuse)
unsigned int type)
{
p4d_t *p4d;
unsigned long next;
@ -2067,16 +2039,14 @@ static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
next = p4d_addr_end(addr, end);
if (p4d_none_or_clear_bad(p4d))
continue;
ret = unuse_pud_range(vma, p4d, addr, next, type,
frontswap, fs_pages_to_unuse);
ret = unuse_pud_range(vma, p4d, addr, next, type);
if (ret)
return ret;
} while (p4d++, addr = next, addr != end);
return 0;
}
static int unuse_vma(struct vm_area_struct *vma, unsigned int type,
bool frontswap, unsigned long *fs_pages_to_unuse)
static int unuse_vma(struct vm_area_struct *vma, unsigned int type)
{
pgd_t *pgd;
unsigned long addr, end, next;
@ -2090,16 +2060,14 @@ static int unuse_vma(struct vm_area_struct *vma, unsigned int type,
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
ret = unuse_p4d_range(vma, pgd, addr, next, type,
frontswap, fs_pages_to_unuse);
ret = unuse_p4d_range(vma, pgd, addr, next, type);
if (ret)
return ret;
} while (pgd++, addr = next, addr != end);
return 0;
}
static int unuse_mm(struct mm_struct *mm, unsigned int type,
bool frontswap, unsigned long *fs_pages_to_unuse)
static int unuse_mm(struct mm_struct *mm, unsigned int type)
{
struct vm_area_struct *vma;
int ret = 0;
@ -2107,8 +2075,7 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type,
mmap_read_lock(mm);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (vma->anon_vma) {
ret = unuse_vma(vma, type, frontswap,
fs_pages_to_unuse);
ret = unuse_vma(vma, type);
if (ret)
break;
}
@ -2124,7 +2091,7 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type,
* if there are no inuse entries after prev till end of the map.
*/
static unsigned int find_next_to_unuse(struct swap_info_struct *si,
unsigned int prev, bool frontswap)
unsigned int prev)
{
unsigned int i;
unsigned char count;
@ -2138,8 +2105,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
for (i = prev + 1; i < si->max; i++) {
count = READ_ONCE(si->swap_map[i]);
if (count && swap_count(count) != SWAP_MAP_BAD)
if (!frontswap || frontswap_test(si, i))
break;
break;
if ((i % LATENCY_LIMIT) == 0)
cond_resched();
}
@ -2150,12 +2116,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
return i;
}
/*
* If the boolean frontswap is true, only unuse pages_to_unuse pages;
* pages_to_unuse==0 means all pages; ignored if frontswap is false
*/
int try_to_unuse(unsigned int type, bool frontswap,
unsigned long pages_to_unuse)
static int try_to_unuse(unsigned int type)
{
struct mm_struct *prev_mm;
struct mm_struct *mm;
@ -2169,13 +2130,10 @@ int try_to_unuse(unsigned int type, bool frontswap,
if (!READ_ONCE(si->inuse_pages))
return 0;
if (!frontswap)
pages_to_unuse = 0;
retry:
retval = shmem_unuse(type, frontswap, &pages_to_unuse);
retval = shmem_unuse(type);
if (retval)
goto out;
return retval;
prev_mm = &init_mm;
mmget(prev_mm);
@ -2192,11 +2150,10 @@ int try_to_unuse(unsigned int type, bool frontswap,
spin_unlock(&mmlist_lock);
mmput(prev_mm);
prev_mm = mm;
retval = unuse_mm(mm, type, frontswap, &pages_to_unuse);
retval = unuse_mm(mm, type);
if (retval) {
mmput(prev_mm);
goto out;
return retval;
}
/*
@ -2213,7 +2170,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
i = 0;
while (READ_ONCE(si->inuse_pages) &&
!signal_pending(current) &&
(i = find_next_to_unuse(si, i, frontswap)) != 0) {
(i = find_next_to_unuse(si, i)) != 0) {
entry = swp_entry(type, i);
page = find_get_page(swap_address_space(entry), i);
@ -2231,14 +2188,6 @@ int try_to_unuse(unsigned int type, bool frontswap,
try_to_free_swap(page);
unlock_page(page);
put_page(page);
/*
* For frontswap, we just need to unuse pages_to_unuse, if
* it was specified. Need not check frontswap again here as
* we already zeroed out pages_to_unuse if not frontswap.
*/
if (pages_to_unuse && --pages_to_unuse == 0)
goto out;
}
/*
@ -2256,10 +2205,10 @@ int try_to_unuse(unsigned int type, bool frontswap,
if (READ_ONCE(si->inuse_pages)) {
if (!signal_pending(current))
goto retry;
retval = -EINTR;
return -EINTR;
}
out:
return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval;
return 0;
}
/*
@ -2477,7 +2426,8 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
struct swap_cluster_info *cluster_info,
unsigned long *frontswap_map)
{
frontswap_init(p->type, frontswap_map);
if (IS_ENABLED(CONFIG_FRONTSWAP))
frontswap_init(p->type, frontswap_map);
spin_lock(&swap_lock);
spin_lock(&p->lock);
setup_swap_info(p, prio, swap_map, cluster_info);
@ -2590,7 +2540,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
disable_swap_slots_cache_lock();
set_current_oom_origin();
err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
err = try_to_unuse(p->type);
clear_current_oom_origin();
if (err) {
@ -2763,7 +2713,7 @@ static int swap_show(struct seq_file *swap, void *v)
struct swap_info_struct *si = v;
struct file *file;
int len;
unsigned int bytes, inuse;
unsigned long bytes, inuse;
if (si == SEQ_START_TOKEN) {
seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
@ -2775,7 +2725,7 @@ static int swap_show(struct seq_file *swap, void *v)
file = si->swap_file;
len = seq_file_path(swap, file, " \t\n\\");
seq_printf(swap, "%*s%s\t%u\t%s%u\t%s%d\n",
seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n",
len < 40 ? 40 - len : 1, " ",
S_ISBLK(file_inode(file)->i_mode) ?
"partition" : "file\t",
@ -3118,7 +3068,7 @@ static bool swap_discardable(struct swap_info_struct *si)
{
struct request_queue *q = bdev_get_queue(si->bdev);
if (!q || !blk_queue_discard(q))
if (!blk_queue_discard(q))
return false;
return true;
@ -3534,13 +3484,13 @@ struct swap_info_struct *page_swap_info(struct page *page)
}
/*
* out-of-line __page_file_ methods to avoid include hell.
* out-of-line methods to avoid include hell.
*/
struct address_space *__page_file_mapping(struct page *page)
struct address_space *swapcache_mapping(struct folio *folio)
{
return page_swap_info(page)->swap_file->f_mapping;
return page_swap_info(&folio->page)->swap_file->f_mapping;
}
EXPORT_SYMBOL_GPL(__page_file_mapping);
EXPORT_SYMBOL_GPL(swapcache_mapping);
pgoff_t __page_file_index(struct page *page)
{


@ -22,7 +22,6 @@
#include <linux/buffer_head.h> /* grr. try_to_release_page,
do_invalidatepage */
#include <linux/shmem_fs.h>
#include <linux/cleancache.h>
#include <linux/rmap.h>
#include "internal.h"
@ -45,18 +44,22 @@ static inline void __clear_shadow_entry(struct address_space *mapping,
static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
void *entry)
{
spin_lock(&mapping->host->i_lock);
xa_lock_irq(&mapping->i_pages);
__clear_shadow_entry(mapping, index, entry);
xa_unlock_irq(&mapping->i_pages);
if (mapping_shrinkable(mapping))
inode_add_lru(mapping->host);
spin_unlock(&mapping->host->i_lock);
}
/*
* Unconditionally remove exceptional entries. Usually called from truncate
* path. Note that the pagevec may be altered by this function by removing
* exceptional entries similar to what pagevec_remove_exceptionals does.
* path. Note that the folio_batch may be altered by this function by removing
* exceptional entries similar to what folio_batch_remove_exceptionals() does.
*/
static void truncate_exceptional_pvec_entries(struct address_space *mapping,
struct pagevec *pvec, pgoff_t *indices)
static void truncate_folio_batch_exceptionals(struct address_space *mapping,
struct folio_batch *fbatch, pgoff_t *indices)
{
int i, j;
bool dax;
@ -65,23 +68,25 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping,
if (shmem_mapping(mapping))
return;
for (j = 0; j < pagevec_count(pvec); j++)
if (xa_is_value(pvec->pages[j]))
for (j = 0; j < folio_batch_count(fbatch); j++)
if (xa_is_value(fbatch->folios[j]))
break;
if (j == pagevec_count(pvec))
if (j == folio_batch_count(fbatch))
return;
dax = dax_mapping(mapping);
if (!dax)
if (!dax) {
spin_lock(&mapping->host->i_lock);
xa_lock_irq(&mapping->i_pages);
}
for (i = j; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
for (i = j; i < folio_batch_count(fbatch); i++) {
struct folio *folio = fbatch->folios[i];
pgoff_t index = indices[i];
if (!xa_is_value(page)) {
pvec->pages[j++] = page;
if (!xa_is_value(folio)) {
fbatch->folios[j++] = folio;
continue;
}
@ -90,12 +95,16 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping,
continue;
}
__clear_shadow_entry(mapping, index, page);
__clear_shadow_entry(mapping, index, folio);
}
if (!dax)
if (!dax) {
xa_unlock_irq(&mapping->i_pages);
pvec->nr = j;
if (mapping_shrinkable(mapping))
inode_add_lru(mapping->host);
spin_unlock(&mapping->host->i_lock);
}
fbatch->nr = j;
}
/*
@ -167,21 +176,21 @@ void do_invalidatepage(struct page *page, unsigned int offset,
* its lock, b) when a concurrent invalidate_mapping_pages got there first and
* c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
*/
static void truncate_cleanup_page(struct page *page)
static void truncate_cleanup_folio(struct folio *folio)
{
if (page_mapped(page))
unmap_mapping_page(page);
if (folio_mapped(folio))
unmap_mapping_folio(folio);
if (page_has_private(page))
do_invalidatepage(page, 0, thp_size(page));
if (folio_has_private(folio))
do_invalidatepage(&folio->page, 0, folio_size(folio));
/*
* Some filesystems seem to re-dirty the page even after
* the VM has canceled the dirty bit (eg ext3 journaling).
* Hence dirty accounting check is placed after invalidation.
*/
cancel_dirty_page(page);
ClearPageMappedToDisk(page);
folio_cancel_dirty(folio);
folio_clear_mappedtodisk(folio);
}
/*
@ -195,7 +204,6 @@ static void truncate_cleanup_page(struct page *page)
static int
invalidate_complete_page(struct address_space *mapping, struct page *page)
{
int ret;
if (page->mapping != mapping)
return 0;
@ -203,28 +211,77 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
if (page_has_private(page) && !try_to_release_page(page, 0))
return 0;
ret = remove_mapping(mapping, page);
return ret;
return remove_mapping(mapping, page);
}
int truncate_inode_page(struct address_space *mapping, struct page *page)
int truncate_inode_folio(struct address_space *mapping, struct folio *folio)
{
VM_BUG_ON_PAGE(PageTail(page), page);
if (page->mapping != mapping)
if (folio->mapping != mapping)
return -EIO;
truncate_cleanup_page(page);
delete_from_page_cache(page);
truncate_cleanup_folio(folio);
filemap_remove_folio(folio);
return 0;
}
/*
* Handle partial folios. The folio may be entirely within the
* range if a split has raced with us. If not, we zero the part of the
* folio that's within the [start, end] range, and then split the folio if
* it's large. split_page_range() will discard pages which now lie beyond
* i_size, and we rely on the caller to discard pages which lie within a
* newly created hole.
*
* Returns false if splitting failed so the caller can avoid
* discarding the entire folio which is stubbornly unsplit.
*/
bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
{
loff_t pos = folio_pos(folio);
unsigned int offset, length;
if (pos < start)
offset = start - pos;
else
offset = 0;
length = folio_size(folio);
if (pos + length <= (u64)end)
length = length - offset;
else
length = end + 1 - pos - offset;
folio_wait_writeback(folio);
if (length == folio_size(folio)) {
truncate_inode_folio(folio->mapping, folio);
return true;
}
/*
* We may be zeroing pages we're about to discard, but it avoids
* doing a complex calculation here, and then doing the zeroing
* anyway if the page split fails.
*/
folio_zero_range(folio, offset, length);
if (folio_has_private(folio))
do_invalidatepage(&folio->page, offset, length);
if (!folio_test_large(folio))
return true;
if (split_huge_page(&folio->page) == 0)
return true;
if (folio_test_dirty(folio))
return false;
truncate_inode_folio(folio->mapping, folio);
return true;
}
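A worked illustration of the offset/length arithmetic above, under assumed numbers (a 64KiB folio at file position 0, with the range 4096..20479 being punched); partial_range_example() exists only for this sketch.
/* Sketch: which bytes of the folio get zeroed for one concrete range. */
static void partial_range_example(void)
{
	loff_t pos = 0, lstart = 4096, lend = 20479;	/* hole inside a 64KiB folio */
	unsigned int offset = lstart > pos ? lstart - pos : 0;	/* 4096 */
	unsigned int length = lend + 1 - pos - offset;		/* 16384 bytes zeroed */

	pr_info("zero %u bytes at offset %u, then try to split the folio\n",
		length, offset);
}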
/*
* Used to get rid of pages on hardware memory corruption.
*/
int generic_error_remove_page(struct address_space *mapping, struct page *page)
{
VM_BUG_ON_PAGE(PageTail(page), page);
if (!mapping)
return -EINVAL;
/*
@ -233,7 +290,7 @@ int generic_error_remove_page(struct address_space *mapping, struct page *page)
*/
if (!S_ISREG(mapping->host->i_mode))
return -EIO;
return truncate_inode_page(mapping, page);
return truncate_inode_folio(mapping, page_folio(page));
}
EXPORT_SYMBOL(generic_error_remove_page);
@ -284,19 +341,15 @@ void truncate_inode_pages_range(struct address_space *mapping,
{
pgoff_t start; /* inclusive */
pgoff_t end; /* exclusive */
unsigned int partial_start; /* inclusive */
unsigned int partial_end; /* exclusive */
struct pagevec pvec;
struct folio_batch fbatch;
pgoff_t indices[PAGEVEC_SIZE];
pgoff_t index;
int i;
struct folio *folio;
bool same_folio;
if (mapping_empty(mapping))
goto out;
/* Offsets within partial pages */
partial_start = lstart & (PAGE_SIZE - 1);
partial_end = (lend + 1) & (PAGE_SIZE - 1);
return;
/*
* 'start' and 'end' always covers the range of pages to be fully
@ -315,64 +368,49 @@ void truncate_inode_pages_range(struct address_space *mapping,
else
end = (lend + 1) >> PAGE_SHIFT;
pagevec_init(&pvec);
folio_batch_init(&fbatch);
index = start;
while (index < end && find_lock_entries(mapping, index, end - 1,
&pvec, indices)) {
index = indices[pagevec_count(&pvec) - 1] + 1;
truncate_exceptional_pvec_entries(mapping, &pvec, indices);
for (i = 0; i < pagevec_count(&pvec); i++)
truncate_cleanup_page(pvec.pages[i]);
delete_from_page_cache_batch(mapping, &pvec);
for (i = 0; i < pagevec_count(&pvec); i++)
unlock_page(pvec.pages[i]);
pagevec_release(&pvec);
&fbatch, indices)) {
index = indices[folio_batch_count(&fbatch) - 1] + 1;
truncate_folio_batch_exceptionals(mapping, &fbatch, indices);
for (i = 0; i < folio_batch_count(&fbatch); i++)
truncate_cleanup_folio(fbatch.folios[i]);
delete_from_page_cache_batch(mapping, &fbatch);
for (i = 0; i < folio_batch_count(&fbatch); i++)
folio_unlock(fbatch.folios[i]);
folio_batch_release(&fbatch);
cond_resched();
}
if (partial_start) {
struct page *page = find_lock_page(mapping, start - 1);
if (page) {
unsigned int top = PAGE_SIZE;
if (start > end) {
/* Truncation within a single page */
top = partial_end;
partial_end = 0;
}
wait_on_page_writeback(page);
zero_user_segment(page, partial_start, top);
cleancache_invalidate_page(mapping, page);
if (page_has_private(page))
do_invalidatepage(page, partial_start,
top - partial_start);
unlock_page(page);
put_page(page);
same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
folio = __filemap_get_folio(mapping, lstart >> PAGE_SHIFT, FGP_LOCK, 0);
if (folio) {
same_folio = lend < folio_pos(folio) + folio_size(folio);
if (!truncate_inode_partial_folio(folio, lstart, lend)) {
start = folio->index + folio_nr_pages(folio);
if (same_folio)
end = folio->index;
}
folio_unlock(folio);
folio_put(folio);
folio = NULL;
}
if (partial_end) {
struct page *page = find_lock_page(mapping, end);
if (page) {
wait_on_page_writeback(page);
zero_user_segment(page, 0, partial_end);
cleancache_invalidate_page(mapping, page);
if (page_has_private(page))
do_invalidatepage(page, 0,
partial_end);
unlock_page(page);
put_page(page);
}
if (!same_folio)
folio = __filemap_get_folio(mapping, lend >> PAGE_SHIFT,
FGP_LOCK, 0);
if (folio) {
if (!truncate_inode_partial_folio(folio, lstart, lend))
end = folio->index;
folio_unlock(folio);
folio_put(folio);
}
/*
* If the truncation happened within a single page no pages
* will be released, just zeroed, so we can bail out now.
*/
if (start >= end)
goto out;
index = start;
for ( ; ; ) {
while (index < end) {
cond_resched();
if (!find_get_entries(mapping, index, end - 1, &pvec,
if (!find_get_entries(mapping, index, end - 1, &fbatch,
indices)) {
/* If all gone from start onwards, we're done */
if (index == start)
@ -382,28 +420,26 @@ void truncate_inode_pages_range(struct address_space *mapping,
continue;
}
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
for (i = 0; i < folio_batch_count(&fbatch); i++) {
struct folio *folio = fbatch.folios[i];
/* We rely upon deletion not changing page->index */
index = indices[i];
if (xa_is_value(page))
if (xa_is_value(folio))
continue;
lock_page(page);
WARN_ON(page_to_index(page) != index);
wait_on_page_writeback(page);
truncate_inode_page(mapping, page);
unlock_page(page);
folio_lock(folio);
VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
folio_wait_writeback(folio);
truncate_inode_folio(mapping, folio);
folio_unlock(folio);
index = folio_index(folio) + folio_nr_pages(folio) - 1;
}
truncate_exceptional_pvec_entries(mapping, &pvec, indices);
pagevec_release(&pvec);
truncate_folio_batch_exceptionals(mapping, &fbatch, indices);
folio_batch_release(&fbatch);
index++;
}
out:
cleancache_invalidate_inode(mapping);
}
EXPORT_SYMBOL(truncate_inode_pages_range);
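Since this view interleaves the old pagevec lines with the new folio_batch lines, here is a minimal sketch of the new-side lookup/teardown loop in truncate_inode_pages_range(), assembled only from helpers named in the hunks above (the surrounding declarations of fbatch, indices, index, start, end and i are assumed):

	folio_batch_init(&fbatch);
	index = start;
	while (index < end && find_lock_entries(mapping, index, end - 1,
			&fbatch, indices)) {
		index = indices[folio_batch_count(&fbatch) - 1] + 1;
		truncate_folio_batch_exceptionals(mapping, &fbatch, indices);
		for (i = 0; i < folio_batch_count(&fbatch); i++)
			truncate_cleanup_folio(fbatch.folios[i]);
		delete_from_page_cache_batch(mapping, &fbatch);
		for (i = 0; i < folio_batch_count(&fbatch); i++)
			folio_unlock(fbatch.folios[i]);
		folio_batch_release(&fbatch);
		cond_resched();
	}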
@ -457,10 +493,6 @@ void truncate_inode_pages_final(struct address_space *mapping)
xa_unlock_irq(&mapping->i_pages);
}
/*
* Cleancache needs notification even if there are no pages or shadow
* entries.
*/
truncate_inode_pages(mapping, 0);
}
EXPORT_SYMBOL(truncate_inode_pages_final);
@ -469,16 +501,16 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t end, unsigned long *nr_pagevec)
{
pgoff_t indices[PAGEVEC_SIZE];
struct pagevec pvec;
struct folio_batch fbatch;
pgoff_t index = start;
unsigned long ret;
unsigned long count = 0;
int i;
pagevec_init(&pvec);
while (find_lock_entries(mapping, index, end, &pvec, indices)) {
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
folio_batch_init(&fbatch);
while (find_lock_entries(mapping, index, end, &fbatch, indices)) {
for (i = 0; i < folio_batch_count(&fbatch); i++) {
struct page *page = &fbatch.folios[i]->page;
/* We rely upon deletion not changing page->index */
index = indices[i];
@ -505,8 +537,8 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping,
}
count += ret;
}
pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
folio_batch_remove_exceptionals(&fbatch);
folio_batch_release(&fbatch);
cond_resched();
index++;
}
@ -558,40 +590,43 @@ void invalidate_mapping_pagevec(struct address_space *mapping,
* shrink_page_list() has a temp ref on them, or because they're transiently
* sitting in the lru_cache_add() pagevecs.
*/
static int
invalidate_complete_page2(struct address_space *mapping, struct page *page)
static int invalidate_complete_folio2(struct address_space *mapping,
struct folio *folio)
{
if (page->mapping != mapping)
if (folio->mapping != mapping)
return 0;
if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
if (folio_has_private(folio) &&
!filemap_release_folio(folio, GFP_KERNEL))
return 0;
spin_lock(&mapping->host->i_lock);
xa_lock_irq(&mapping->i_pages);
if (PageDirty(page))
if (folio_test_dirty(folio))
goto failed;
BUG_ON(page_has_private(page));
__delete_from_page_cache(page, NULL);
BUG_ON(folio_has_private(folio));
__filemap_remove_folio(folio, NULL);
xa_unlock_irq(&mapping->i_pages);
if (mapping_shrinkable(mapping))
inode_add_lru(mapping->host);
spin_unlock(&mapping->host->i_lock);
if (mapping->a_ops->freepage)
mapping->a_ops->freepage(page);
put_page(page); /* pagecache ref */
filemap_free_folio(mapping, folio);
return 1;
failed:
xa_unlock_irq(&mapping->i_pages);
spin_unlock(&mapping->host->i_lock);
return 0;
}
static int do_launder_page(struct address_space *mapping, struct page *page)
static int do_launder_folio(struct address_space *mapping, struct folio *folio)
{
if (!PageDirty(page))
if (!folio_test_dirty(folio))
return 0;
if (page->mapping != mapping || mapping->a_ops->launder_page == NULL)
if (folio->mapping != mapping || mapping->a_ops->launder_page == NULL)
return 0;
return mapping->a_ops->launder_page(page);
return mapping->a_ops->launder_page(&folio->page);
}
/**
@ -609,7 +644,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
pgoff_t indices[PAGEVEC_SIZE];
struct pagevec pvec;
struct folio_batch fbatch;
pgoff_t index;
int i;
int ret = 0;
@ -617,27 +652,27 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
int did_range_unmap = 0;
if (mapping_empty(mapping))
goto out;
return 0;
pagevec_init(&pvec);
folio_batch_init(&fbatch);
index = start;
while (find_get_entries(mapping, index, end, &pvec, indices)) {
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *page = pvec.pages[i];
while (find_get_entries(mapping, index, end, &fbatch, indices)) {
for (i = 0; i < folio_batch_count(&fbatch); i++) {
struct folio *folio = fbatch.folios[i];
/* We rely upon deletion not changing page->index */
/* We rely upon deletion not changing folio->index */
index = indices[i];
if (xa_is_value(page)) {
if (xa_is_value(folio)) {
if (!invalidate_exceptional_entry2(mapping,
index, page))
index, folio))
ret = -EBUSY;
continue;
}
if (!did_range_unmap && page_mapped(page)) {
if (!did_range_unmap && folio_mapped(folio)) {
/*
* If page is mapped, before taking its lock,
* If folio is mapped, before taking its lock,
* zap the rest of the file in one hit.
*/
unmap_mapping_pages(mapping, index,
@ -645,29 +680,29 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
did_range_unmap = 1;
}
lock_page(page);
WARN_ON(page_to_index(page) != index);
if (page->mapping != mapping) {
unlock_page(page);
folio_lock(folio);
VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
if (folio->mapping != mapping) {
folio_unlock(folio);
continue;
}
wait_on_page_writeback(page);
folio_wait_writeback(folio);
if (page_mapped(page))
unmap_mapping_page(page);
BUG_ON(page_mapped(page));
if (folio_mapped(folio))
unmap_mapping_folio(folio);
BUG_ON(folio_mapped(folio));
ret2 = do_launder_page(mapping, page);
ret2 = do_launder_folio(mapping, folio);
if (ret2 == 0) {
if (!invalidate_complete_page2(mapping, page))
if (!invalidate_complete_folio2(mapping, folio))
ret2 = -EBUSY;
}
if (ret2 < 0)
ret = ret2;
unlock_page(page);
folio_unlock(folio);
}
pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
folio_batch_remove_exceptionals(&fbatch);
folio_batch_release(&fbatch);
cond_resched();
index++;
}
@ -681,8 +716,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
if (dax_mapping(mapping)) {
unmap_mapping_pages(mapping, start, end - start + 1, false);
}
out:
cleancache_invalidate_inode(mapping);
return ret;
}
EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);


@ -20,6 +20,7 @@
#include <linux/atomic.h>
#include <linux/jump_label.h>
#include <asm/sections.h>
#include "slab.h"
/*
* Checks if a given pointer and length is contained by the current
@ -223,7 +224,7 @@ static inline void check_page_span(const void *ptr, unsigned long n,
static inline void check_heap_object(const void *ptr, unsigned long n,
bool to_user)
{
struct page *page;
struct folio *folio;
if (!virt_addr_valid(ptr))
return;
@ -231,16 +232,16 @@ static inline void check_heap_object(const void *ptr, unsigned long n,
/*
* When CONFIG_HIGHMEM=y, kmap_to_page() will give either the
* highmem page or fallback to virt_to_page(). The following
* is effectively a highmem-aware virt_to_head_page().
* is effectively a highmem-aware virt_to_slab().
*/
page = compound_head(kmap_to_page((void *)ptr));
folio = page_folio(kmap_to_page((void *)ptr));
if (PageSlab(page)) {
if (folio_test_slab(folio)) {
/* Check slab allocator for flags and size. */
__check_heap_object(ptr, n, page, to_user);
__check_heap_object(ptr, n, folio_slab(folio), to_user);
} else {
/* Verify object does not incorrectly span multiple pages. */
check_page_span(ptr, n, page, to_user);
check_page_span(ptr, n, folio_page(folio, 0), to_user);
}
}


@ -69,10 +69,9 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
pgoff_t offset, max_off;
_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
_dst_pte = pte_mkdirty(_dst_pte);
if (page_in_cache && !vm_shared)
writable = false;
if (writable || !page_in_cache)
_dst_pte = pte_mkdirty(_dst_pte);
if (writable) {
if (wp_copy)
_dst_pte = pte_mkuffd_wp(_dst_pte);
@ -164,7 +163,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
__SetPageUptodate(page);
ret = -ENOMEM;
if (mem_cgroup_charge(page, dst_mm, GFP_KERNEL))
if (mem_cgroup_charge(page_folio(page), dst_mm, GFP_KERNEL))
goto out_release;
ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
@ -233,6 +232,11 @@ static int mcontinue_atomic_pte(struct mm_struct *dst_mm,
goto out;
}
if (PageHWPoison(page)) {
ret = -EIO;
goto out_release;
}
ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
page, false, wp_copy);
if (ret)

mm/util.c

@ -549,13 +549,10 @@ EXPORT_SYMBOL(vm_mmap);
* Uses kmalloc to get the memory but if the allocation fails then falls back
* to the vmalloc allocator. Use kvfree for freeing the memory.
*
* Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported.
 * GFP_NOWAIT and GFP_ATOMIC are not supported, nor is the __GFP_NORETRY modifier.
* __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
* preferable to the vmalloc fallback, due to visible performance drawbacks.
*
* Please note that any use of gfp flags outside of GFP_KERNEL is careful to not
* fall back to vmalloc.
*
 * Return: pointer to the allocated memory or %NULL in case of failure
*/
void *kvmalloc_node(size_t size, gfp_t flags, int node)
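A minimal usage sketch of the semantics documented above, assuming a caller that simply needs a large buffer (the size and error handling are illustrative only):

	/*
	 * GFP_KERNEL is the common case; __GFP_RETRY_MAYFAIL may be added
	 * only when the kmalloc path is clearly preferable to the vmalloc
	 * fallback, as the comment above explains.
	 */
	void *buf = kvmalloc(64 * 1024, GFP_KERNEL);

	if (!buf)
		return -ENOMEM;
	/* ... use buf ... */
	kvfree(buf);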
@ -563,13 +560,6 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
gfp_t kmalloc_flags = flags;
void *ret;
/*
* vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables)
* so the given set of flags has to be compatible.
*/
if ((flags & GFP_KERNEL) != GFP_KERNEL)
return kmalloc_node(size, flags, node);
/*
* We want to attempt a large physically contiguous block first because
* it is less likely to fragment multiple larger blocks and therefore
@ -582,6 +572,9 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL))
kmalloc_flags |= __GFP_NORETRY;
/* nofail semantic is implemented by the vmalloc fallback */
kmalloc_flags &= ~__GFP_NOFAIL;
}
ret = kmalloc_node(size, kmalloc_flags, node);
@ -656,81 +649,78 @@ void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
}
EXPORT_SYMBOL(kvrealloc);
static inline void *__page_rmapping(struct page *page)
{
unsigned long mapping;
mapping = (unsigned long)page->mapping;
mapping &= ~PAGE_MAPPING_FLAGS;
return (void *)mapping;
}
/* Neutral page->mapping pointer to address_space or anon_vma or other */
void *page_rmapping(struct page *page)
{
page = compound_head(page);
return __page_rmapping(page);
return folio_raw_mapping(page_folio(page));
}
/*
* Return true if this page is mapped into pagetables.
* For compound page it returns true if any subpage of compound page is mapped.
/**
* folio_mapped - Is this folio mapped into userspace?
* @folio: The folio.
*
* Return: True if any page in this folio is referenced by user page tables.
*/
bool page_mapped(struct page *page)
bool folio_mapped(struct folio *folio)
{
int i;
long i, nr;
if (likely(!PageCompound(page)))
return atomic_read(&page->_mapcount) >= 0;
page = compound_head(page);
if (atomic_read(compound_mapcount_ptr(page)) >= 0)
if (!folio_test_large(folio))
return atomic_read(&folio->_mapcount) >= 0;
if (atomic_read(folio_mapcount_ptr(folio)) >= 0)
return true;
if (PageHuge(page))
if (folio_test_hugetlb(folio))
return false;
for (i = 0; i < compound_nr(page); i++) {
if (atomic_read(&page[i]._mapcount) >= 0)
nr = folio_nr_pages(folio);
for (i = 0; i < nr; i++) {
if (atomic_read(&folio_page(folio, i)->_mapcount) >= 0)
return true;
}
return false;
}
EXPORT_SYMBOL(page_mapped);
EXPORT_SYMBOL(folio_mapped);
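A caller-side sketch of the helper documented above; the truncation-style caller here is hypothetical and only meant to show the check:

	/*
	 * Hypothetical caller: only tear down a pagecache folio once no
	 * process still has it mapped.
	 */
	folio_lock(folio);
	if (!folio_mapped(folio))
		truncate_inode_folio(folio->mapping, folio);
	folio_unlock(folio);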
struct anon_vma *page_anon_vma(struct page *page)
{
unsigned long mapping;
struct folio *folio = page_folio(page);
unsigned long mapping = (unsigned long)folio->mapping;
page = compound_head(page);
mapping = (unsigned long)page->mapping;
if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
return NULL;
return __page_rmapping(page);
return (void *)(mapping - PAGE_MAPPING_ANON);
}
struct address_space *page_mapping(struct page *page)
/**
* folio_mapping - Find the mapping where this folio is stored.
* @folio: The folio.
*
* For folios which are in the page cache, return the mapping that this
* page belongs to. Folios in the swap cache return the swap mapping
* this page is stored in (which is different from the mapping for the
* swap file or swap device where the data is stored).
*
* You can call this for folios which aren't in the swap cache or page
* cache and it will return NULL.
*/
struct address_space *folio_mapping(struct folio *folio)
{
struct address_space *mapping;
page = compound_head(page);
/* This happens if someone calls flush_dcache_page on slab page */
if (unlikely(PageSlab(page)))
if (unlikely(folio_test_slab(folio)))
return NULL;
if (unlikely(PageSwapCache(page))) {
swp_entry_t entry;
if (unlikely(folio_test_swapcache(folio)))
return swap_address_space(folio_swap_entry(folio));
entry.val = page_private(page);
return swap_address_space(entry);
}
mapping = page->mapping;
mapping = folio->mapping;
if ((unsigned long)mapping & PAGE_MAPPING_ANON)
return NULL;
return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS);
}
EXPORT_SYMBOL(page_mapping);
EXPORT_SYMBOL(folio_mapping);
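A short sketch of how the helper above is typically consulted; the surrounding writeback-style caller is hypothetical:

	struct address_space *mapping = folio_mapping(folio);

	/* NULL means anonymous, slab, or otherwise not in any file cache. */
	if (!mapping)
		return;
	/* ... mapping->a_ops can now be consulted ... */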
/* Slow path of page_mapcount() for compound pages */
int __page_mapcount(struct page *page)
@ -752,13 +742,26 @@ int __page_mapcount(struct page *page)
}
EXPORT_SYMBOL_GPL(__page_mapcount);
void copy_huge_page(struct page *dst, struct page *src)
/**
* folio_copy - Copy the contents of one folio to another.
* @dst: Folio to copy to.
* @src: Folio to copy from.
*
* The bytes in the folio represented by @src are copied to @dst.
* Assumes the caller has validated that @dst is at least as large as @src.
* Can be called in atomic context for order-0 folios, but if the folio is
* larger, it may sleep.
*/
void folio_copy(struct folio *dst, struct folio *src)
{
unsigned i, nr = compound_nr(src);
long i = 0;
long nr = folio_nr_pages(src);
for (i = 0; i < nr; i++) {
for (;;) {
copy_highpage(folio_page(dst, i), folio_page(src, i));
if (++i == nr)
break;
cond_resched();
copy_highpage(nth_page(dst, i), nth_page(src, i));
}
}
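A minimal sketch of the documented contract, assuming a migration-style caller that has already allocated @dst at least as large as @src and is in a context that may sleep:

	folio_copy(dst, src);		/* may cond_resched() between pages */
	flush_dcache_folio(dst);	/* see the helper added further down */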
@ -1081,3 +1084,14 @@ void page_offline_end(void)
up_write(&page_offline_rwsem);
}
EXPORT_SYMBOL(page_offline_end);
#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO
void flush_dcache_folio(struct folio *folio)
{
long i, nr = folio_nr_pages(folio);
for (i = 0; i < nr; i++)
flush_dcache_page(folio_page(folio, i));
}
EXPORT_SYMBOL(flush_dcache_folio);
#endif


@ -31,6 +31,7 @@
#include <linux/kmemleak.h>
#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/memcontrol.h>
#include <linux/llist.h>
#include <linux/bitops.h>
#include <linux/rbtree_augmented.h>
@ -38,6 +39,7 @@
#include <linux/pgtable.h>
#include <linux/uaccess.h>
#include <linux/hugetlb.h>
#include <linux/sched/mm.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
@ -1195,18 +1197,14 @@ find_vmap_lowest_match(unsigned long size,
{
struct vmap_area *va;
struct rb_node *node;
unsigned long length;
/* Start from the root. */
node = free_vmap_area_root.rb_node;
/* Adjust the search size for alignment overhead. */
length = size + align - 1;
while (node) {
va = rb_entry(node, struct vmap_area, rb_node);
if (get_subtree_max_size(node->rb_left) >= length &&
if (get_subtree_max_size(node->rb_left) >= size &&
vstart < va->va_start) {
node = node->rb_left;
} else {
@ -1216,9 +1214,9 @@ find_vmap_lowest_match(unsigned long size,
/*
* Does not make sense to go deeper towards the right
* sub-tree if it does not have a free block that is
* equal or bigger to the requested search length.
* equal or bigger to the requested search size.
*/
if (get_subtree_max_size(node->rb_right) >= length) {
if (get_subtree_max_size(node->rb_right) >= size) {
node = node->rb_right;
continue;
}
@ -1226,15 +1224,23 @@ find_vmap_lowest_match(unsigned long size,
/*
* OK. We roll back and find the first right sub-tree,
* that will satisfy the search criteria. It can happen
* only once due to "vstart" restriction.
* due to "vstart" restriction or an alignment overhead
 * that is bigger than PAGE_SIZE.
*/
while ((node = rb_parent(node))) {
va = rb_entry(node, struct vmap_area, rb_node);
if (is_within_this_va(va, size, align, vstart))
return va;
if (get_subtree_max_size(node->rb_right) >= length &&
if (get_subtree_max_size(node->rb_right) >= size &&
vstart <= va->va_start) {
/*
* Shift the vstart forward. Please note, we update it with
* parent's start address adding "1" because we do not want
 * to enter the same sub-tree after it has already been checked
* and no suitable free block found there.
*/
vstart = va->va_start + 1;
node = node->rb_right;
break;
}
@ -1265,7 +1271,7 @@ find_vmap_lowest_linear_match(unsigned long size,
}
static void
find_vmap_lowest_match_check(unsigned long size)
find_vmap_lowest_match_check(unsigned long size, unsigned long align)
{
struct vmap_area *va_1, *va_2;
unsigned long vstart;
@ -1274,8 +1280,8 @@ find_vmap_lowest_match_check(unsigned long size)
get_random_bytes(&rnd, sizeof(rnd));
vstart = VMALLOC_START + rnd;
va_1 = find_vmap_lowest_match(size, 1, vstart);
va_2 = find_vmap_lowest_linear_match(size, 1, vstart);
va_1 = find_vmap_lowest_match(size, align, vstart);
va_2 = find_vmap_lowest_linear_match(size, align, vstart);
if (va_1 != va_2)
pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
@ -1454,7 +1460,7 @@ __alloc_vmap_area(unsigned long size, unsigned long align,
return vend;
#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
find_vmap_lowest_match_check(size);
find_vmap_lowest_match_check(size, align);
#endif
return nva_start_addr;
@ -2272,15 +2278,22 @@ void __init vm_area_add_early(struct vm_struct *vm)
*/
void __init vm_area_register_early(struct vm_struct *vm, size_t align)
{
static size_t vm_init_off __initdata;
unsigned long addr;
unsigned long addr = ALIGN(VMALLOC_START, align);
struct vm_struct *cur, **p;
addr = ALIGN(VMALLOC_START + vm_init_off, align);
vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START;
BUG_ON(vmap_initialized);
for (p = &vmlist; (cur = *p) != NULL; p = &cur->next) {
if ((unsigned long)cur->addr - addr >= vm->size)
break;
addr = ALIGN((unsigned long)cur->addr + cur->size, align);
}
BUG_ON(addr > VMALLOC_END - vm->size);
vm->addr = (void *)addr;
vm_area_add_early(vm);
vm->next = *p;
*p = vm;
kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
}
static void vmap_init_free_space(void)
@ -2612,12 +2625,13 @@ static void __vunmap(const void *addr, int deallocate_pages)
if (deallocate_pages) {
unsigned int page_order = vm_area_page_order(area);
int i;
int i, step = 1U << page_order;
for (i = 0; i < area->nr_pages; i += 1U << page_order) {
for (i = 0; i < area->nr_pages; i += step) {
struct page *page = area->pages[i];
BUG_ON(!page);
mod_memcg_page_state(page, MEMCG_VMALLOC, -step);
__free_pages(page, page_order);
cond_resched();
}
@ -2743,6 +2757,13 @@ void *vmap(struct page **pages, unsigned int count,
might_sleep();
/*
* Your top guard is someone else's bottom guard. Not having a top
* guard compromises someone else's mappings too.
*/
if (WARN_ON_ONCE(flags & VM_NO_GUARD))
flags &= ~VM_NO_GUARD;
if (count > totalram_pages())
return NULL;
@ -2825,7 +2846,9 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
* to fails, fallback to a single page allocator that is
* more permissive.
*/
if (!order && nid != NUMA_NO_NODE) {
if (!order) {
gfp_t bulk_gfp = gfp & ~__GFP_NOFAIL;
while (nr_allocated < nr_pages) {
unsigned int nr, nr_pages_request;
@ -2837,8 +2860,20 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
*/
nr_pages_request = min(100U, nr_pages - nr_allocated);
nr = alloc_pages_bulk_array_node(gfp, nid,
nr_pages_request, pages + nr_allocated);
/* memory allocation should consider mempolicy, we can't
* wrongly use nearest node when nid == NUMA_NO_NODE,
* otherwise memory may be allocated in only one node,
 * but mempolicy wants to alloc memory by interleaving.
*/
if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE)
nr = alloc_pages_bulk_array_mempolicy(bulk_gfp,
nr_pages_request,
pages + nr_allocated);
else
nr = alloc_pages_bulk_array_node(bulk_gfp, nid,
nr_pages_request,
pages + nr_allocated);
nr_allocated += nr;
cond_resched();
@ -2850,7 +2885,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
if (nr != nr_pages_request)
break;
}
} else if (order)
} else
/*
* Compound pages required for remap_vmalloc_page if
* high-order pages.
@ -2860,6 +2895,9 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
/* High-order pages or fallback path if "bulk" fails. */
while (nr_allocated < nr_pages) {
if (fatal_signal_pending(current))
break;
if (nid == NUMA_NO_NODE)
page = alloc_pages(gfp, order);
else
@ -2887,11 +2925,15 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
int node)
{
const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
const gfp_t orig_gfp_mask = gfp_mask;
bool nofail = gfp_mask & __GFP_NOFAIL;
unsigned long addr = (unsigned long)area->addr;
unsigned long size = get_vm_area_size(area);
unsigned long array_size;
unsigned int nr_small_pages = size >> PAGE_SHIFT;
unsigned int page_order;
unsigned int flags;
int ret;
array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
gfp_mask |= __GFP_NOWARN;
@ -2907,7 +2949,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
}
if (!area->pages) {
warn_alloc(gfp_mask, NULL,
warn_alloc(orig_gfp_mask, NULL,
"vmalloc error: size %lu, failed to allocated page array size %lu",
nr_small_pages * PAGE_SIZE, array_size);
free_vm_area(area);
@ -2921,21 +2963,48 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
page_order, nr_small_pages, area->pages);
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
if (gfp_mask & __GFP_ACCOUNT) {
int i, step = 1U << page_order;
for (i = 0; i < area->nr_pages; i += step)
mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC,
step);
}
/*
* If not enough pages were obtained to accomplish an
* allocation request, free them via __vfree() if any.
*/
if (area->nr_pages != nr_small_pages) {
warn_alloc(gfp_mask, NULL,
warn_alloc(orig_gfp_mask, NULL,
"vmalloc error: size %lu, page order %u, failed to allocate pages",
area->nr_pages * PAGE_SIZE, page_order);
goto fail;
}
if (vmap_pages_range(addr, addr + size, prot, area->pages,
page_shift) < 0) {
warn_alloc(gfp_mask, NULL,
/*
 * page table allocations ignore the external gfp mask, enforce it
 * by the scope API
*/
if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
flags = memalloc_nofs_save();
else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
flags = memalloc_noio_save();
do {
ret = vmap_pages_range(addr, addr + size, prot, area->pages,
page_shift);
if (nofail && (ret < 0))
schedule_timeout_uninterruptible(1);
} while (nofail && (ret < 0));
if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
memalloc_nofs_restore(flags);
else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
memalloc_noio_restore(flags);
if (ret < 0) {
warn_alloc(orig_gfp_mask, NULL,
"vmalloc error: size %lu, failed to map pages",
area->nr_pages * PAGE_SIZE);
goto fail;
@ -2961,8 +3030,18 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
* @caller: caller's return address
*
* Allocate enough pages to cover @size from the page level
* allocator with @gfp_mask flags. Map them into contiguous
* kernel virtual space, using a pagetable protection of @prot.
* allocator with @gfp_mask flags. Please note that the full set of gfp
 * flags is not supported. GFP_KERNEL, GFP_NOFS and GFP_NOIO are all
* supported.
* Zone modifiers are not supported. From the reclaim modifiers
* __GFP_DIRECT_RECLAIM is required (aka GFP_NOWAIT is not supported)
* and only __GFP_NOFAIL is supported (i.e. __GFP_NORETRY and
* __GFP_RETRY_MAYFAIL are not supported).
*
 * __GFP_NOWARN can be used to suppress failure messages.
*
* Map them into contiguous kernel virtual space, using a pagetable
* protection of @prot.
*
* Return: the address of the area or %NULL on failure
*/
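A caller-side sketch of the gfp rules spelled out above; the size and the GFP_NOFS context are assumptions for the example:

	/*
	 * GFP_KERNEL, GFP_NOFS or GFP_NOIO are the supported bases here;
	 * __GFP_NOFAIL may be added, while __GFP_NORETRY, __GFP_RETRY_MAYFAIL,
	 * GFP_NOWAIT and GFP_ATOMIC are not supported.
	 */
	void *buf = __vmalloc(16 * PAGE_SIZE, GFP_NOFS | __GFP_NOWARN);

	if (!buf)
		return -ENOMEM;
	/* ... use buf ... */
	vfree(buf);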
@ -3014,9 +3093,14 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
VM_UNINITIALIZED | vm_flags, start, end, node,
gfp_mask, caller);
if (!area) {
bool nofail = gfp_mask & __GFP_NOFAIL;
warn_alloc(gfp_mask, NULL,
"vmalloc error: size %lu, vm_struct allocation failed",
real_size);
"vmalloc error: size %lu, vm_struct allocation failed%s",
real_size, (nofail) ? ". Retrying." : "");
if (nofail) {
schedule_timeout_uninterruptible(1);
goto again;
}
goto fail;
}
@ -3857,6 +3941,7 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
{
if (IS_ENABLED(CONFIG_NUMA)) {
unsigned int nr, *counters = m->private;
unsigned int step = 1U << vm_area_page_order(v);
if (!counters)
return;
@ -3868,9 +3953,8 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
memset(counters, 0, nr_node_ids * sizeof(unsigned int));
for (nr = 0; nr < v->nr_pages; nr++)
counters[page_to_nid(v->pages[nr])]++;
for (nr = 0; nr < v->nr_pages; nr += step)
counters[page_to_nid(v->pages[nr])] += step;
for_each_node_state(nr, N_HIGH_MEMORY)
if (counters[nr])
seq_printf(m, " N%u=%u", nr, counters[nr]);
@ -3906,7 +3990,7 @@ static int s_show(struct seq_file *m, void *p)
(void *)va->va_start, (void *)va->va_end,
va->va_end - va->va_start);
return 0;
goto final;
}
v = va->vm;
@ -3947,6 +4031,7 @@ static int s_show(struct seq_file *m, void *p)
/*
* As a final step, dump "unpurged" areas.
*/
final:
if (list_is_last(&va->list, &vmap_area_list))
show_purge_info(m);


@ -308,7 +308,7 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
* asserted for a second in which subsequent
* pressure events can occur.
*/
memcg->socket_pressure = jiffies + HZ;
WRITE_ONCE(memcg->socket_pressure, jiffies + HZ);
}
}
}
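The WRITE_ONCE() above is only half of the pattern; it pairs with a READ_ONCE() on whichever path samples socket_pressure. A simplified, hypothetical reader (the helper name is made up for this sketch):

	static bool under_socket_pressure(struct mem_cgroup *memcg)
	{
		return time_before(jiffies, READ_ONCE(memcg->socket_pressure));
	}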


@ -687,6 +687,21 @@ void unregister_shrinker(struct shrinker *shrinker)
}
EXPORT_SYMBOL(unregister_shrinker);
/**
* synchronize_shrinkers - Wait for all running shrinkers to complete.
*
 * This is equivalent to calling unregister_shrinker() and register_shrinker(),
 * but atomically and with less overhead. This is useful to guarantee that all
 * shrinker invocations have seen an update before freeing memory, similar to
 * RCU.
*/
void synchronize_shrinkers(void)
{
down_write(&shrinker_rwsem);
up_write(&shrinker_rwsem);
}
EXPORT_SYMBOL(synchronize_shrinkers);
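A minimal sketch of the intended caller pattern, with hypothetical names for the driver-side state (my_cache, new_cache):

	struct my_cache *old = my_cache;

	my_cache = new_cache;		/* state the shrinker callback reads */
	synchronize_shrinkers();	/* every in-flight shrinker has finished */
	kfree(old);			/* now safe to free the old state */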
#define SHRINK_BATCH 128
static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
@ -936,7 +951,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
return freed;
}
void drop_slab_node(int nid)
static void drop_slab_node(int nid)
{
unsigned long freed;
int shift = 0;
@ -1006,6 +1021,134 @@ static void handle_write_error(struct address_space *mapping,
unlock_page(page);
}
static bool skip_throttle_noprogress(pg_data_t *pgdat)
{
int reclaimable = 0, write_pending = 0;
int i;
/*
* If kswapd is disabled, reschedule if necessary but do not
* throttle as the system is likely near OOM.
*/
if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
return true;
/*
* If there are a lot of dirty/writeback pages then do not
* throttle as throttling will occur when the pages cycle
* towards the end of the LRU if still under writeback.
*/
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;
if (!populated_zone(zone))
continue;
reclaimable += zone_reclaimable_pages(zone);
write_pending += zone_page_state_snapshot(zone,
NR_ZONE_WRITE_PENDING);
}
if (2 * write_pending <= reclaimable)
return true;
return false;
}
void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason)
{
wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason];
long timeout, ret;
DEFINE_WAIT(wait);
/*
* Do not throttle IO workers, kthreads other than kswapd or
* workqueues. They may be required for reclaim to make
* forward progress (e.g. journalling workqueues or kthreads).
*/
if (!current_is_kswapd() &&
current->flags & (PF_IO_WORKER|PF_KTHREAD)) {
cond_resched();
return;
}
/*
* These figures are pulled out of thin air.
* VMSCAN_THROTTLE_ISOLATED is a transient condition based on too many
* parallel reclaimers which is a short-lived event so the timeout is
* short. Failing to make progress or waiting on writeback are
* potentially long-lived events so use a longer timeout. This is shaky
* logic as a failure to make progress could be due to anything from
 * writeback to a slow device to excessively referenced pages at the tail
* of the inactive LRU.
*/
switch(reason) {
case VMSCAN_THROTTLE_WRITEBACK:
timeout = HZ/10;
if (atomic_inc_return(&pgdat->nr_writeback_throttled) == 1) {
WRITE_ONCE(pgdat->nr_reclaim_start,
node_page_state(pgdat, NR_THROTTLED_WRITTEN));
}
break;
case VMSCAN_THROTTLE_CONGESTED:
fallthrough;
case VMSCAN_THROTTLE_NOPROGRESS:
if (skip_throttle_noprogress(pgdat)) {
cond_resched();
return;
}
timeout = 1;
break;
case VMSCAN_THROTTLE_ISOLATED:
timeout = HZ/50;
break;
default:
WARN_ON_ONCE(1);
timeout = HZ;
break;
}
prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
ret = schedule_timeout(timeout);
finish_wait(wqh, &wait);
if (reason == VMSCAN_THROTTLE_WRITEBACK)
atomic_dec(&pgdat->nr_writeback_throttled);
trace_mm_vmscan_throttled(pgdat->node_id, jiffies_to_usecs(timeout),
jiffies_to_usecs(timeout - ret),
reason);
}
/*
* Account for pages written if tasks are throttled waiting on dirty
* pages to clean. If enough pages have been cleaned since throttling
* started then wakeup the throttled tasks.
*/
void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
int nr_throttled)
{
unsigned long nr_written;
node_stat_add_folio(folio, NR_THROTTLED_WRITTEN);
/*
* This is an inaccurate read as the per-cpu deltas may not
* be synchronised. However, given that the system is
* writeback throttled, it is not worth taking the penalty
* of getting an accurate count. At worst, the throttle
* timeout guarantees forward progress.
*/
nr_written = node_page_state(pgdat, NR_THROTTLED_WRITTEN) -
READ_ONCE(pgdat->nr_reclaim_start);
if (nr_written > SWAP_CLUSTER_MAX * nr_throttled)
wake_up(&pgdat->reclaim_wait[VMSCAN_THROTTLE_WRITEBACK]);
}
/* possible outcome of pageout() */
typedef enum {
/* failed to write page out, page is locked */
@ -1105,6 +1248,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
BUG_ON(!PageLocked(page));
BUG_ON(mapping != page_mapping(page));
if (!PageSwapCache(page))
spin_lock(&mapping->host->i_lock);
xa_lock_irq(&mapping->i_pages);
/*
* The non racy check for a busy page.
@ -1173,6 +1318,9 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
shadow = workingset_eviction(page, target_memcg);
__delete_from_page_cache(page, shadow);
xa_unlock_irq(&mapping->i_pages);
if (mapping_shrinkable(mapping))
inode_add_lru(mapping->host);
spin_unlock(&mapping->host->i_lock);
if (freepage != NULL)
freepage(page);
@ -1182,6 +1330,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
cannot_free:
xa_unlock_irq(&mapping->i_pages);
if (!PageSwapCache(page))
spin_unlock(&mapping->host->i_lock);
return 0;
}
@ -1337,7 +1487,6 @@ static unsigned int demote_page_list(struct list_head *demote_pages,
{
int target_nid = next_demotion_node(pgdat->node_id);
unsigned int nr_succeeded;
int err;
if (list_empty(demote_pages))
return 0;
@ -1346,7 +1495,7 @@ static unsigned int demote_page_list(struct list_head *demote_pages,
return 0;
/* Demotion ignores all cpuset and mempolicy settings */
err = migrate_pages(demote_pages, alloc_demote_page, NULL,
migrate_pages(demote_pages, alloc_demote_page, NULL,
target_nid, MIGRATE_ASYNC, MR_DEMOTION,
&nr_succeeded);
@ -1412,9 +1561,8 @@ static unsigned int shrink_page_list(struct list_head *page_list,
/*
* The number of dirty pages determines if a node is marked
* reclaim_congested which affects wait_iff_congested. kswapd
* will stall and start writing pages if the tail of the LRU
* is all dirty unqueued pages.
* reclaim_congested. kswapd will stall and start writing
* pages if the tail of the LRU is all dirty unqueued pages.
*/
page_check_dirty_writeback(page, &dirty, &writeback);
if (dirty || writeback)
@ -2090,6 +2238,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
*/
int isolate_lru_page(struct page *page)
{
struct folio *folio = page_folio(page);
int ret = -EBUSY;
VM_BUG_ON_PAGE(!page_count(page), page);
@ -2099,7 +2248,7 @@ int isolate_lru_page(struct page *page)
struct lruvec *lruvec;
get_page(page);
lruvec = lock_page_lruvec_irq(page);
lruvec = folio_lruvec_lock_irq(folio);
del_page_from_lru_list(page, lruvec);
unlock_page_lruvec_irq(lruvec);
ret = 0;
@ -2119,6 +2268,7 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
struct scan_control *sc)
{
unsigned long inactive, isolated;
bool too_many;
if (current_is_kswapd())
return 0;
@ -2142,7 +2292,13 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
inactive >>= 3;
return isolated > inactive;
too_many = isolated > inactive;
/* Wake up tasks throttled due to too_many_isolated. */
if (!too_many)
wake_throttle_isolated(pgdat);
return too_many;
}
/*
@ -2199,7 +2355,7 @@ static unsigned int move_pages_to_lru(struct lruvec *lruvec,
* All pages were isolated from the same lruvec (and isolation
* inhibits memcg migration).
*/
VM_BUG_ON_PAGE(!page_matches_lruvec(page, lruvec), page);
VM_BUG_ON_PAGE(!folio_matches_lruvec(page_folio(page), lruvec), page);
add_page_to_lru_list(page, lruvec);
nr_pages = thp_nr_pages(page);
nr_moved += nr_pages;
@ -2251,8 +2407,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
return 0;
/* wait a bit for the reclaimer. */
msleep(100);
stalled = true;
reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED);
/* We are about to die and free our memory. Return now. */
if (fatal_signal_pending(current))
@ -3180,19 +3336,19 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
* If kswapd scans pages marked for immediate
* reclaim and under writeback (nr_immediate), it
* implies that pages are cycling through the LRU
* faster than they are written so also forcibly stall.
* faster than they are written so forcibly stall
* until some pages complete writeback.
*/
if (sc->nr.immediate)
congestion_wait(BLK_RW_ASYNC, HZ/10);
reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
}
/*
* Tag a node/memcg as congested if all the dirty pages
* scanned were backed by a congested BDI and
* wait_iff_congested will stall.
* Tag a node/memcg as congested if all the dirty pages were marked
* for writeback and immediate reclaim (counted in nr.congested).
*
* Legacy memcg will stall in page writeback so avoid forcibly
* stalling in wait_iff_congested().
* stalling in reclaim_throttle().
*/
if ((current_is_kswapd() ||
(cgroup_reclaim(sc) && writeback_throttling_sane(sc))) &&
@ -3200,15 +3356,15 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
set_bit(LRUVEC_CONGESTED, &target_lruvec->flags);
/*
* Stall direct reclaim for IO completions if underlying BDIs
* and node is congested. Allow kswapd to continue until it
 * Stall direct reclaim for IO completions if the lruvec is
 * congested. Allow kswapd to continue until it
* starts encountering unqueued dirty pages or cycling through
* the LRU too quickly.
*/
if (!current_is_kswapd() && current_may_throttle() &&
!sc->hibernation_mode &&
test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
wait_iff_congested(BLK_RW_ASYNC, HZ/10);
reclaim_throttle(pgdat, VMSCAN_THROTTLE_CONGESTED);
if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
sc))
@ -3256,6 +3412,36 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
}
static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc)
{
/*
* If reclaim is making progress greater than 12% efficiency then
* wake all the NOPROGRESS throttled tasks.
*/
if (sc->nr_reclaimed > (sc->nr_scanned >> 3)) {
wait_queue_head_t *wqh;
wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_NOPROGRESS];
if (waitqueue_active(wqh))
wake_up(wqh);
return;
}
/*
* Do not throttle kswapd or cgroup reclaim on NOPROGRESS as it will
* throttle on VMSCAN_THROTTLE_WRITEBACK if there are too many pages
* under writeback and marked for immediate reclaim at the tail of the
* LRU.
*/
if (current_is_kswapd() || cgroup_reclaim(sc))
return;
	/* Throttle if making no progress at high priorities. */
if (sc->priority == 1 && !sc->nr_reclaimed)
reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS);
}
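The "12%" mentioned in the comment above is the shift arithmetic in the condition; strictly speaking it is 12.5%:

	/* sc->nr_reclaimed > (sc->nr_scanned >> 3)  <=>  reclaimed > scanned / 8, i.e. > 12.5% */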
/*
* This is the direct reclaim path, for page-allocating processes. We only
* try to reclaim pages from zones which will satisfy the caller's allocation
@ -3272,6 +3458,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
unsigned long nr_soft_scanned;
gfp_t orig_mask;
pg_data_t *last_pgdat = NULL;
pg_data_t *first_pgdat = NULL;
/*
* If the number of buffer_heads in the machine exceeds the maximum
@ -3335,6 +3522,9 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
/* need some check for avoid more shrink_zone() */
}
if (!first_pgdat)
first_pgdat = zone->zone_pgdat;
/* See comment about same check for global reclaim above */
if (zone->zone_pgdat == last_pgdat)
continue;
@ -3342,6 +3532,9 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
shrink_node(zone->zone_pgdat, sc);
}
if (first_pgdat)
consider_reclaim_throttle(first_pgdat, sc);
/*
* Restore to original mask to avoid the impact on the caller if we
* promoted it to __GFP_HIGHMEM.
@ -4286,6 +4479,7 @@ static int kswapd(void *p)
WRITE_ONCE(pgdat->kswapd_order, 0);
WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
atomic_set(&pgdat->nr_writeback_throttled, 0);
for ( ; ; ) {
bool ret;
@ -4665,6 +4859,7 @@ void check_move_unevictable_pages(struct pagevec *pvec)
for (i = 0; i < pvec->nr; i++) {
struct page *page = pvec->pages[i];
struct folio *folio = page_folio(page);
int nr_pages;
if (PageTransTail(page))
@ -4677,7 +4872,7 @@ void check_move_unevictable_pages(struct pagevec *pvec)
if (!TestClearPageLRU(page))
continue;
lruvec = relock_page_lruvec_irq(page, lruvec);
lruvec = folio_lruvec_relock_irq(folio, lruvec);
if (page_evictable(page) && PageUnevictable(page)) {
del_page_from_lru_list(page, lruvec);
ClearPageUnevictable(page);


@ -165,6 +165,34 @@ atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS] __cacheline_aligned_in_smp;
EXPORT_SYMBOL(vm_zone_stat);
EXPORT_SYMBOL(vm_node_stat);
#ifdef CONFIG_NUMA
static void fold_vm_zone_numa_events(struct zone *zone)
{
unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, };
int cpu;
enum numa_stat_item item;
for_each_online_cpu(cpu) {
struct per_cpu_zonestat *pzstats;
pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0);
}
for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
zone_numa_event_add(zone_numa_events[item], zone, item);
}
void fold_vm_numa_events(void)
{
struct zone *zone;
for_each_populated_zone(zone)
fold_vm_zone_numa_events(zone);
}
#endif
#ifdef CONFIG_SMP
int calculate_pressure_threshold(struct zone *zone)
@ -771,34 +799,6 @@ static int fold_diff(int *zone_diff, int *node_diff)
return changes;
}
#ifdef CONFIG_NUMA
static void fold_vm_zone_numa_events(struct zone *zone)
{
unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, };
int cpu;
enum numa_stat_item item;
for_each_online_cpu(cpu) {
struct per_cpu_zonestat *pzstats;
pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0);
}
for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
zone_numa_event_add(zone_numa_events[item], zone, item);
}
void fold_vm_numa_events(void)
{
struct zone *zone;
for_each_populated_zone(zone)
fold_vm_zone_numa_events(zone);
}
#endif
/*
* Update the zone counters for the current cpu.
*
@ -1070,8 +1070,13 @@ static void fill_contig_page_info(struct zone *zone,
for (order = 0; order < MAX_ORDER; order++) {
unsigned long blocks;
/* Count number of free blocks */
blocks = zone->free_area[order].nr_free;
/*
* Count number of free blocks.
*
* Access to nr_free is lockless as nr_free is used only for
* diagnostic purposes. Use data_race to avoid KCSAN warning.
*/
blocks = data_race(zone->free_area[order].nr_free);
info->free_blocks_total += blocks;
/* Count free base pages */
@ -1225,6 +1230,7 @@ const char * const vmstat_text[] = {
"nr_vmscan_immediate_reclaim",
"nr_dirtied",
"nr_written",
"nr_throttled_written",
"nr_kernel_misc_reclaimable",
"nr_foll_pin_acquired",
"nr_foll_pin_released",
@ -1347,6 +1353,9 @@ const char * const vmstat_text[] = {
"thp_split_page_failed",
"thp_deferred_split_page",
"thp_split_pmd",
"thp_scan_exceed_none_pte",
"thp_scan_exceed_swap_pte",
"thp_scan_exceed_share_pte",
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
"thp_split_pud",
#endif
@ -1445,7 +1454,11 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
for (order = 0; order < MAX_ORDER; ++order)
seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
/*
* Access to nr_free is lockless as nr_free is used only for
* printing purposes. Use data_race to avoid KCSAN warning.
*/
seq_printf(m, "%6lu ", data_race(zone->free_area[order].nr_free));
seq_putc(m, '\n');
}
@ -1656,6 +1669,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
}
seq_printf(m,
"\n pages free %lu"
"\n boost %lu"
"\n min %lu"
"\n low %lu"
"\n high %lu"
@ -1664,6 +1678,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
"\n managed %lu"
"\n cma %lu",
zone_page_state(zone, NR_FREE_PAGES),
zone->watermark_boost,
min_wmark_pages(zone),
low_wmark_pages(zone),
high_wmark_pages(zone),
@ -2179,7 +2194,7 @@ static void extfrag_show_print(struct seq_file *m,
for (order = 0; order < MAX_ORDER; ++order) {
fill_contig_page_info(zone, order, &info);
index = __fragmentation_index(order, &info);
seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
seq_printf(m, "%2d.%03d ", index / 1000, index % 1000);
}
seq_putc(m, '\n');


@ -273,17 +273,17 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
}
/**
* workingset_refault - evaluate the refault of a previously evicted page
* @page: the freshly allocated replacement page
* @shadow: shadow entry of the evicted page
* workingset_refault - Evaluate the refault of a previously evicted folio.
* @folio: The freshly allocated replacement folio.
* @shadow: Shadow entry of the evicted folio.
*
* Calculates and evaluates the refault distance of the previously
* evicted page in the context of the node and the memcg whose memory
* evicted folio in the context of the node and the memcg whose memory
* pressure caused the eviction.
*/
void workingset_refault(struct page *page, void *shadow)
void workingset_refault(struct folio *folio, void *shadow)
{
bool file = page_is_file_lru(page);
bool file = folio_is_file_lru(folio);
struct mem_cgroup *eviction_memcg;
struct lruvec *eviction_lruvec;
unsigned long refault_distance;
@ -295,16 +295,17 @@ void workingset_refault(struct page *page, void *shadow)
unsigned long refault;
bool workingset;
int memcgid;
long nr;
unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
rcu_read_lock();
/*
* Look up the memcg associated with the stored ID. It might
* have been deleted since the page's eviction.
* have been deleted since the folio's eviction.
*
* Note that in rare events the ID could have been recycled
* for a new cgroup that refaults a shared page. This is
* for a new cgroup that refaults a shared folio. This is
* impossible to tell from the available data. However, this
* should be a rare and limited disturbance, and activations
* are always speculative anyway. Ultimately, it's the aging
@ -340,17 +341,18 @@ void workingset_refault(struct page *page, void *shadow)
refault_distance = (refault - eviction) & EVICTION_MASK;
/*
* The activation decision for this page is made at the level
* The activation decision for this folio is made at the level
* where the eviction occurred, as that is where the LRU order
* during page reclaim is being determined.
* during folio reclaim is being determined.
*
* However, the cgroup that will own the page is the one that
* However, the cgroup that will own the folio is the one that
* is actually experiencing the refault event.
*/
memcg = page_memcg(page);
nr = folio_nr_pages(folio);
memcg = folio_memcg(folio);
lruvec = mem_cgroup_lruvec(memcg, pgdat);
inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file);
mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
mem_cgroup_flush_stats();
/*
@ -376,16 +378,16 @@ void workingset_refault(struct page *page, void *shadow)
if (refault_distance > workingset_size)
goto out;
SetPageActive(page);
workingset_age_nonresident(lruvec, thp_nr_pages(page));
inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file);
folio_set_active(folio);
workingset_age_nonresident(lruvec, nr);
mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr);
/* Page was active prior to eviction */
/* Folio was active prior to eviction */
if (workingset) {
SetPageWorkingset(page);
folio_set_workingset(folio);
/* XXX: Move to lru_cache_add() when it supports new vs putback */
lru_note_cost_page(page);
inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file);
lru_note_cost_folio(folio);
mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr);
}
out:
rcu_read_unlock();
@ -393,12 +395,11 @@ void workingset_refault(struct page *page, void *shadow)
/**
* workingset_activation - note a page activation
* @page: page that is being activated
* @folio: Folio that is being activated.
*/
void workingset_activation(struct page *page)
void workingset_activation(struct folio *folio)
{
struct mem_cgroup *memcg;
struct lruvec *lruvec;
rcu_read_lock();
/*
@ -408,11 +409,10 @@ void workingset_activation(struct page *page)
* XXX: See workingset_refault() - this should return
* root_mem_cgroup even for !CONFIG_MEMCG.
*/
memcg = page_memcg_rcu(page);
memcg = folio_memcg_rcu(folio);
if (!mem_cgroup_disabled() && !memcg)
goto out;
lruvec = mem_cgroup_page_lruvec(page);
workingset_age_nonresident(lruvec, thp_nr_pages(page));
workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
out:
rcu_read_unlock();
}
@ -543,6 +543,13 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
goto out;
}
if (!spin_trylock(&mapping->host->i_lock)) {
xa_unlock(&mapping->i_pages);
spin_unlock_irq(lru_lock);
ret = LRU_RETRY;
goto out;
}
list_lru_isolate(lru, item);
__dec_lruvec_kmem_state(node, WORKINGSET_NODES);
@ -562,6 +569,9 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
out_invalid:
xa_unlock_irq(&mapping->i_pages);
if (mapping_shrinkable(mapping))
inode_add_lru(mapping->host);
spin_unlock(&mapping->host->i_lock);
ret = LRU_REMOVED_RETRY;
out:
cond_resched();


@ -24,16 +24,11 @@ struct zpool {
const struct zpool_ops *ops;
bool evictable;
bool can_sleep_mapped;
struct list_head list;
};
static LIST_HEAD(drivers_head);
static DEFINE_SPINLOCK(drivers_lock);
static LIST_HEAD(pools_head);
static DEFINE_SPINLOCK(pools_lock);
/**
* zpool_register_driver() - register a zpool implementation.
* @driver: driver to register
@ -195,10 +190,6 @@ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp,
pr_debug("created pool type %s\n", type);
spin_lock(&pools_lock);
list_add(&zpool->list, &pools_head);
spin_unlock(&pools_lock);
return zpool;
}
@ -217,9 +208,6 @@ void zpool_destroy_pool(struct zpool *zpool)
{
pr_debug("destroying pool type %s\n", zpool->driver->type);
spin_lock(&pools_lock);
list_del(&zpool->list);
spin_unlock(&pools_lock);
zpool->driver->destroy(zpool->pool);
zpool_put_driver(zpool->driver);
kfree(zpool);

File diff suppressed because it is too large


@ -1394,7 +1394,7 @@ static void zswap_frontswap_init(unsigned type)
zswap_trees[type] = tree;
}
static struct frontswap_ops zswap_frontswap_ops = {
static const struct frontswap_ops zswap_frontswap_ops = {
.store = zswap_frontswap_store,
.load = zswap_frontswap_load,
.invalidate_page = zswap_frontswap_invalidate_page,
@ -1479,7 +1479,9 @@ static int __init init_zswap(void)
if (!shrink_wq)
goto hp_fail;
frontswap_register_ops(&zswap_frontswap_ops);
ret = frontswap_register_ops(&zswap_frontswap_ops);
if (ret)
goto destroy_wq;
if (zswap_debugfs_init())
pr_warn("debugfs initialization failed\n");
@ -1488,6 +1490,8 @@ static int __init init_zswap(void)
return 0;
destroy_wq:
destroy_workqueue(shrink_wq);
hp_fail:
cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE);
dstmem_fail:


@ -65,7 +65,7 @@ static int hippi_header(struct sk_buff *skb, struct net_device *dev,
hip->le.src_addr_type = 2; /* 12 bit SC address */
memcpy(hip->le.src_switch_addr, dev->dev_addr + 3, 3);
memset(&hip->le.reserved, 0, 16);
memset_startat(&hip->le, 0, reserved);
hip->snap.dsap = HIPPI_EXTENDED_SAP;
hip->snap.ssap = HIPPI_EXTENDED_SAP;
@ -121,7 +121,7 @@ int hippi_mac_addr(struct net_device *dev, void *p)
struct sockaddr *addr = p;
if (netif_running(dev))
return -EBUSY;
memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
dev_addr_set(dev, addr->sa_data);
return 0;
}
EXPORT_SYMBOL(hippi_mac_addr);


@ -23,7 +23,7 @@
#include <net/p8022.h>
static int p8022_request(struct datalink_proto *dl, struct sk_buff *skb,
unsigned char *dest)
const unsigned char *dest)
{
llc_build_and_send_ui_pkt(dl->sap, skb, dest, dl->sap->laddr.lsap);
return 0;


@ -79,7 +79,7 @@ static int snap_rcv(struct sk_buff *skb, struct net_device *dev,
* Put a SNAP header on a frame and pass to 802.2
*/
static int snap_request(struct datalink_proto *dl,
struct sk_buff *skb, u8 *dest)
struct sk_buff *skb, const u8 *dest)
{
memcpy(skb_push(skb, 5), dl->type, 5);
llc_build_and_send_ui_pkt(snap_sap, skb, dest, snap_sap->laddr.lsap);


@ -319,8 +319,8 @@ static void vlan_transfer_features(struct net_device *dev,
{
struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev);
vlandev->gso_max_size = dev->gso_max_size;
vlandev->gso_max_segs = dev->gso_max_segs;
netif_set_gso_max_size(vlandev, dev->gso_max_size);
netif_set_gso_max_segs(vlandev, dev->gso_max_segs);
if (vlan_hw_offload_capable(dev->features, vlan->vlan_proto))
vlandev->hard_header_len = dev->hard_header_len;

Some files were not shown because too many files have changed in this diff