Mirror of https://github.com/Qortal/Brooklyn.git, synced 2025-01-30 14:52:17 +00:00

Commit 6ca71e00d3 ("phase 4"), parent c06278f256
@@ -14,6 +14,8 @@ hostprogs += mktables

ifeq ($(CONFIG_ALTIVEC),y)
altivec_flags := -maltivec $(call cc-option,-mabi=altivec)
# Enable <altivec.h>
altivec_flags += -isystem $(shell $(CC) -print-file-name=include)

ifdef CONFIG_CC_IS_CLANG
# clang ppc port does not yet support -maltivec when -msoft-float is
@@ -34,6 +36,8 @@ endif
# ARM/NEON intrinsics in a non C99-compliant environment (such as the kernel)
ifeq ($(CONFIG_KERNEL_MODE_NEON),y)
NEON_FLAGS := -ffreestanding
# Enable <arm_neon.h>
NEON_FLAGS += -isystem $(shell $(CC) -print-file-name=include)
ifeq ($(ARCH),arm)
NEON_FLAGS += -march=armv7-a -mfloat-abi=softfp -mfpu=neon
endif
mm/Kconfig (81 lines changed)
@@ -109,6 +109,13 @@ config NUMA_KEEP_MEMINFO
config MEMORY_ISOLATION
	bool

# IORESOURCE_SYSTEM_RAM regions in the kernel resource tree that are marked
# IORESOURCE_EXCLUSIVE cannot be mapped to user space, for example, via
# /dev/mem.
config EXCLUSIVE_SYSTEM_RAM
	def_bool y
	depends on !DEVMEM || STRICT_DEVMEM

#
# Only be set on architectures that have completely implemented memory hotplug
# feature. If you are not sure, don't touch it.
@@ -123,15 +130,11 @@ config ARCH_ENABLE_MEMORY_HOTPLUG
config MEMORY_HOTPLUG
	bool "Allow for memory hot-add"
	select MEMORY_ISOLATION
	depends on SPARSEMEM || X86_64_ACPI_NUMA
	depends on SPARSEMEM
	depends on ARCH_ENABLE_MEMORY_HOTPLUG
	depends on 64BIT || BROKEN
	depends on 64BIT
	select NUMA_KEEP_MEMINFO if NUMA

config MEMORY_HOTPLUG_SPARSE
	def_bool y
	depends on SPARSEMEM && MEMORY_HOTPLUG

config MEMORY_HOTPLUG_DEFAULT_ONLINE
	bool "Online the newly added memory blocks by default"
	depends on MEMORY_HOTPLUG
@@ -371,7 +374,7 @@ config NOMMU_INITIAL_TRIM_EXCESS

config TRANSPARENT_HUGEPAGE
	bool "Transparent Hugepage Support"
	depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
	depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT
	select COMPACTION
	select XARRAY_MULTI
	help
@@ -425,47 +428,24 @@ config THP_SWAP
# UP and nommu archs use km based percpu allocator
#
config NEED_PER_CPU_KM
	depends on !SMP
	depends on !SMP || !MMU
	bool
	default y

config CLEANCACHE
	bool "Enable cleancache driver to cache clean pages if tmem is present"
	help
	  Cleancache can be thought of as a page-granularity victim cache
	  for clean pages that the kernel's pageframe replacement algorithm
	  (PFRA) would like to keep around, but can't since there isn't enough
	  memory.  So when the PFRA "evicts" a page, it first attempts to use
	  cleancache code to put the data contained in that page into
	  "transcendent memory", memory that is not directly accessible or
	  addressable by the kernel and is of unknown and possibly
	  time-varying size.  And when a cleancache-enabled
	  filesystem wishes to access a page in a file on disk, it first
	  checks cleancache to see if it already contains it; if it does,
	  the page is copied into the kernel and a disk access is avoided.
	  When a transcendent memory driver is available (such as zcache or
	  Xen transcendent memory), a significant I/O reduction
	  may be achieved.  When none is available, all cleancache calls
	  are reduced to a single pointer-compare-against-NULL resulting
	  in a negligible performance hit.
config NEED_PER_CPU_EMBED_FIRST_CHUNK
	bool

	  If unsure, say Y to enable cleancache
config NEED_PER_CPU_PAGE_FIRST_CHUNK
	bool

config USE_PERCPU_NUMA_NODE_ID
	bool

config HAVE_SETUP_PER_CPU_AREA
	bool

config FRONTSWAP
	bool "Enable frontswap to cache swap pages if tmem is present"
	depends on SWAP
	help
	  Frontswap is so named because it can be thought of as the opposite
	  of a "backing" store for a swap device.  The data is stored into
	  "transcendent memory", memory that is not directly accessible or
	  addressable by the kernel and is of unknown and possibly
	  time-varying size.  When space in transcendent memory is available,
	  a significant swap I/O reduction may be achieved.  When none is
	  available, all frontswap calls are reduced to a single pointer-
	  compare-against-NULL resulting in a negligible performance hit
	  and swap data is stored as normal on the matching swap device.

	  If unsure, say Y to enable frontswap.
	bool

config CMA
	bool "Contiguous Memory Allocator"
@@ -530,7 +510,8 @@ config MEM_SOFT_DIRTY

config ZSWAP
	bool "Compressed cache for swap pages (EXPERIMENTAL)"
	depends on FRONTSWAP && CRYPTO=y
	depends on SWAP && CRYPTO=y
	select FRONTSWAP
	select ZPOOL
	help
	  A lightweight compressed cache for swap pages.  It takes
@@ -897,6 +878,20 @@ config IO_MAPPING
config SECRETMEM
	def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED

config ANON_VMA_NAME
	bool "Anonymous VMA name support"
	depends on PROC_FS && ADVISE_SYSCALLS && MMU

	help
	  Allow naming anonymous virtual memory areas.

	  This feature allows assigning names to virtual memory areas. Assigned
	  names can be later retrieved from /proc/pid/maps and /proc/pid/smaps
	  and help identifying individual anonymous memory areas.
	  Assigning a name to anonymous virtual memory area might prevent that
	  area from being merged with adjacent virtual memory areas due to the
	  difference in their name.

source "mm/damon/Kconfig"

endmenu
@@ -62,6 +62,30 @@ config PAGE_OWNER

	  If unsure, say N.

config PAGE_TABLE_CHECK
	bool "Check for invalid mappings in user page tables"
	depends on ARCH_SUPPORTS_PAGE_TABLE_CHECK
	select PAGE_EXTENSION
	help
	  Check that anonymous page is not being mapped twice with read write
	  permissions. Check that anonymous and file pages are not being
	  erroneously shared. Since the checking is performed at the time
	  entries are added and removed to user page tables, leaking, corruption
	  and double mapping problems are detected synchronously.

	  If unsure say "n".

config PAGE_TABLE_CHECK_ENFORCED
	bool "Enforce the page table checking by default"
	depends on PAGE_TABLE_CHECK
	help
	  Always enable page table checking.  By default the page table checking
	  is disabled, and can be optionally enabled via page_table_check=on
	  kernel parameter. This config enforces that page table check is always
	  enabled.

	  If unsure say "n".

config PAGE_POISONING
	bool "Poison pages after freeing"
	help
@@ -15,6 +15,8 @@ KCSAN_SANITIZE_slab_common.o := n
KCSAN_SANITIZE_slab.o := n
KCSAN_SANITIZE_slub.o := n
KCSAN_SANITIZE_page_alloc.o := n
# But enable explicit instrumentation for memory barriers.
KCSAN_INSTRUMENT_BARRIERS := y

# These files are disabled because they produce non-interesting and/or
# flaky coverage that is not a function of syscall inputs. E.g. slab is out of
@@ -46,7 +48,7 @@ mmu-$(CONFIG_MMU) += process_vm_access.o
endif

obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
	maccess.o page-writeback.o \
	maccess.o page-writeback.o folio-compat.o \
	readahead.o swap.o truncate.o vmscan.o shmem.o \
	util.o mmzone.o vmstat.o backing-dev.o \
	mm_init.o percpu.o slab_common.o \
@@ -102,7 +104,6 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o
obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o
obj-$(CONFIG_PAGE_OWNER) += page_owner.o
obj-$(CONFIG_CLEANCACHE) += cleancache.o
obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
obj-$(CONFIG_ZPOOL) += zpool.o
obj-$(CONFIG_ZBUD) += zbud.o
@@ -112,6 +113,7 @@ obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
obj-$(CONFIG_CMA) += cma.o
obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
obj-$(CONFIG_PAGE_TABLE_CHECK) += page_table_check.o
obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
obj-$(CONFIG_SECRETMEM) += secretmem.o
obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o
@@ -2,8 +2,9 @@

#include <linux/wait.h>
#include <linux/rbtree.h>
#include <linux/backing-dev.h>
#include <linux/kthread.h>
#include <linux/backing-dev.h>
#include <linux/blk-cgroup.h>
#include <linux/freezer.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
@@ -291,8 +292,6 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,

	memset(wb, 0, sizeof(*wb));

	if (wb != &bdi->wb)
		bdi_get(bdi);
	wb->bdi = bdi;
	wb->last_old_flush = jiffies;
	INIT_LIST_HEAD(&wb->b_dirty);
@@ -316,7 +315,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,

	err = fprop_local_init_percpu(&wb->completions, gfp);
	if (err)
		goto out_put_bdi;
		return err;

	for (i = 0; i < NR_WB_STAT_ITEMS; i++) {
		err = percpu_counter_init(&wb->stat[i], 0, gfp);
@@ -330,9 +329,6 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
	while (i--)
		percpu_counter_destroy(&wb->stat[i]);
	fprop_local_destroy_percpu(&wb->completions);
out_put_bdi:
	if (wb != &bdi->wb)
		bdi_put(bdi);
	return err;
}

@@ -373,8 +369,6 @@ static void wb_exit(struct bdi_writeback *wb)
		percpu_counter_destroy(&wb->stat[i]);

	fprop_local_destroy_percpu(&wb->completions);
	if (wb != &wb->bdi->wb)
		bdi_put(wb->bdi);
}

#ifdef CONFIG_CGROUP_WRITEBACK
@@ -397,6 +391,7 @@ static void cgwb_release_workfn(struct work_struct *work)
	struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
						release_work);
	struct blkcg *blkcg = css_to_blkcg(wb->blkcg_css);
	struct backing_dev_info *bdi = wb->bdi;

	mutex_lock(&wb->bdi->cgwb_release_mutex);
	wb_shutdown(wb);
@@ -416,6 +411,7 @@ static void cgwb_release_workfn(struct work_struct *work)

	percpu_ref_exit(&wb->refcnt);
	wb_exit(wb);
	bdi_put(bdi);
	WARN_ON_ONCE(!list_empty(&wb->b_attached));
	kfree_rcu(wb, rcu);
}
@@ -497,6 +493,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
	INIT_LIST_HEAD(&wb->b_attached);
	INIT_WORK(&wb->release_work, cgwb_release_workfn);
	set_bit(WB_registered, &wb->state);
	bdi_get(bdi);

	/*
	 * The root wb determines the registered state of the whole bdi and
@@ -528,6 +525,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
	goto out_put;

err_fprop_exit:
	bdi_put(bdi);
	fprop_local_destroy_percpu(&wb->memcg_completions);
err_ref_exit:
	percpu_ref_exit(&wb->refcnt);
@@ -965,14 +963,14 @@ void bdi_unregister(struct backing_dev_info *bdi)
		bdi->owner = NULL;
	}
}
EXPORT_SYMBOL(bdi_unregister);

static void release_bdi(struct kref *ref)
{
	struct backing_dev_info *bdi =
			container_of(ref, struct backing_dev_info, refcnt);

	if (test_bit(WB_registered, &bdi->wb.state))
		bdi_unregister(bdi);
	WARN_ON_ONCE(test_bit(WB_registered, &bdi->wb.state));
	WARN_ON_ONCE(bdi->dev);
	wb_exit(&bdi->wb);
	kfree(bdi);
@@ -984,6 +982,22 @@ void bdi_put(struct backing_dev_info *bdi)
}
EXPORT_SYMBOL(bdi_put);

struct backing_dev_info *inode_to_bdi(struct inode *inode)
{
	struct super_block *sb;

	if (!inode)
		return &noop_backing_dev_info;

	sb = inode->i_sb;
#ifdef CONFIG_BLOCK
	if (sb_is_blkdev_sb(sb))
		return I_BDEV(inode)->bd_disk->bdi;
#endif
	return sb->s_bdi;
}
EXPORT_SYMBOL(inode_to_bdi);

const char *bdi_dev_name(struct backing_dev_info *bdi)
{
	if (!bdi || !bdi->dev)
@@ -1048,51 +1062,3 @@ long congestion_wait(int sync, long timeout)
	return ret;
}
EXPORT_SYMBOL(congestion_wait);

/**
 * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a pgdat to complete writes
 * @sync: SYNC or ASYNC IO
 * @timeout: timeout in jiffies
 *
 * In the event of a congested backing_dev (any backing_dev) this waits
 * for up to @timeout jiffies for either a BDI to exit congestion of the
 * given @sync queue or a write to complete.
 *
 * The return value is 0 if the sleep is for the full timeout. Otherwise,
 * it is the number of jiffies that were still remaining when the function
 * returned. return_value == timeout implies the function did not sleep.
 */
long wait_iff_congested(int sync, long timeout)
{
	long ret;
	unsigned long start = jiffies;
	DEFINE_WAIT(wait);
	wait_queue_head_t *wqh = &congestion_wqh[sync];

	/*
	 * If there is no congestion, yield if necessary instead
	 * of sleeping on the congestion queue
	 */
	if (atomic_read(&nr_wb_congested[sync]) == 0) {
		cond_resched();

		/* In case we scheduled, work out time remaining */
		ret = timeout - (jiffies - start);
		if (ret < 0)
			ret = 0;

		goto out;
	}

	/* Sleep until uncongested or a write happens */
	prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
	ret = io_schedule_timeout(timeout);
	finish_wait(wqh, &wait);

out:
	trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
					jiffies_to_usecs(jiffies - start));

	return ret;
}
EXPORT_SYMBOL(wait_iff_congested);
@@ -15,7 +15,7 @@

void get_page_bootmem(unsigned long info, struct page *page, unsigned long type)
{
	page->freelist = (void *)type;
	page->index = type;
	SetPagePrivate(page);
	set_page_private(page, info);
	page_ref_inc(page);
@@ -23,14 +23,13 @@ void get_page_bootmem(unsigned long info, struct page *page, unsigned long type)

void put_page_bootmem(struct page *page)
{
	unsigned long type;
	unsigned long type = page->index;

	type = (unsigned long) page->freelist;
	BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
	       type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);

	if (page_ref_dec_return(page) == 1) {
		page->freelist = NULL;
		page->index = 0;
		ClearPagePrivate(page);
		set_page_private(page, 0);
		INIT_LIST_HEAD(&page->lru);
mm/cma.c (26 lines changed)
@@ -378,7 +378,7 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
	return 0;

free_mem:
	memblock_free(base, size);
	memblock_phys_free(base, size);
err:
	pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M);
	return ret;
@@ -524,6 +524,25 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
	return page;
}

bool cma_pages_valid(struct cma *cma, const struct page *pages,
		     unsigned long count)
{
	unsigned long pfn;

	if (!cma || !pages)
		return false;

	pfn = page_to_pfn(pages);

	if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count) {
		pr_debug("%s(page %p, count %lu)\n", __func__,
						(void *)pages, count);
		return false;
	}

	return true;
}

/**
 * cma_release() - release allocated pages
 * @cma:   Contiguous memory region for which the allocation is performed.
@@ -539,16 +558,13 @@ bool cma_release(struct cma *cma, const struct page *pages,
{
	unsigned long pfn;

	if (!cma || !pages)
	if (!cma_pages_valid(cma, pages, count))
		return false;

	pr_debug("%s(page %p, count %lu)\n", __func__, (void *)pages, count);

	pfn = page_to_pfn(pages);

	if (pfn < cma->base_pfn || pfn >= cma->base_pfn + cma->count)
		return false;

	VM_BUG_ON(pfn + count > cma->base_pfn + cma->count);

	free_contig_range(pfn, count);
@@ -761,6 +761,8 @@ isolate_freepages_range(struct compact_control *cc,

/* Similar to reclaim, but different enough that they don't share logic */
static bool too_many_isolated(pg_data_t *pgdat)
{
	bool too_many;

	unsigned long active, inactive, isolated;

	inactive = node_page_state(pgdat, NR_INACTIVE_FILE) +
@@ -770,7 +772,11 @@ static bool too_many_isolated(pg_data_t *pgdat)
	isolated = node_page_state(pgdat, NR_ISOLATED_FILE) +
			node_page_state(pgdat, NR_ISOLATED_ANON);

	return isolated > (inactive + active) / 2;
	too_many = isolated > (inactive + active) / 2;
	if (!too_many)
		wake_throttle_isolated(pgdat);

	return too_many;
}

/**
@@ -822,7 +828,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
		if (cc->mode == MIGRATE_ASYNC)
			return -EAGAIN;

		congestion_wait(BLK_RW_ASYNC, HZ/10);
		reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED);

		if (fatal_signal_pending(current))
			return -EINTR;
@@ -1022,7 +1028,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
		if (!TestClearPageLRU(page))
			goto isolate_fail_put;

		lruvec = mem_cgroup_page_lruvec(page);
		lruvec = folio_lruvec(page_folio(page));

		/* If we already hold the lock, we can skip some rechecking */
		if (lruvec != locked) {
@@ -1032,7 +1038,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
			compact_lock_irqsave(&lruvec->lru_lock, &flags, cc);
			locked = lruvec;

			lruvec_memcg_debug(lruvec, page);
			lruvec_memcg_debug(lruvec, page_folio(page));

			/* Try get exclusive access under lock */
			if (!skip_updated) {
@@ -2274,6 +2280,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
	unsigned long last_migrated_pfn;
	const bool sync = cc->mode != MIGRATE_ASYNC;
	bool update_cached;
	unsigned int nr_succeeded = 0;

	/*
	 * These counters track activities during zone compaction.  Initialize
@@ -2392,10 +2399,10 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)

		err = migrate_pages(&cc->migratepages, compaction_alloc,
				compaction_free, (unsigned long)cc, cc->mode,
				MR_COMPACTION, NULL);
				MR_COMPACTION, &nr_succeeded);

		trace_mm_compaction_migratepages(cc->nr_migratepages, err,
							&cc->migratepages);
		trace_mm_compaction_migratepages(cc->nr_migratepages,
						 nr_succeeded);

		/* All pages were either migrated or will be released */
		cc->nr_migratepages = 0;
@@ -30,7 +30,15 @@ config DAMON_VADDR
	select PAGE_IDLE_FLAG
	help
	  This builds the default data access monitoring primitives for DAMON
	  that works for virtual address spaces.
	  that work for virtual address spaces.

config DAMON_PADDR
	bool "Data access monitoring primitives for the physical address space"
	depends on DAMON && MMU
	select PAGE_IDLE_FLAG
	help
	  This builds the default data access monitoring primitives for DAMON
	  that works for the physical address space.

config DAMON_VADDR_KUNIT_TEST
	bool "Test for DAMON primitives" if !KUNIT_ALL_TESTS
@@ -46,7 +54,7 @@ config DAMON_VADDR_KUNIT_TEST

config DAMON_DBGFS
	bool "DAMON debugfs interface"
	depends on DAMON_VADDR && DEBUG_FS
	depends on DAMON_VADDR && DAMON_PADDR && DEBUG_FS
	help
	  This builds the debugfs interface for DAMON.  The user space admins
	  can use the interface for arbitrary data access monitoring.
@@ -65,4 +73,16 @@ config DAMON_DBGFS_KUNIT_TEST

	  If unsure, say N.

config DAMON_RECLAIM
	bool "Build DAMON-based reclaim (DAMON_RECLAIM)"
	depends on DAMON_PADDR
	help
	  This builds the DAMON-based reclamation subsystem.  It finds pages
	  that not accessed for a long time (cold) using DAMON and reclaim
	  those.

	  This is suggested to be used as a proactive and lightweight
	  reclamation under light memory pressure, while the traditional page
	  scanning-based reclamation is used for heavy pressure.

endmenu
@@ -1,5 +1,7 @@
# SPDX-License-Identifier: GPL-2.0

obj-$(CONFIG_DAMON)		:= core.o
obj-$(CONFIG_DAMON_VADDR)	+= vaddr.o
obj-$(CONFIG_DAMON_VADDR)	+= prmtv-common.o vaddr.o
obj-$(CONFIG_DAMON_PADDR)	+= prmtv-common.o paddr.o
obj-$(CONFIG_DAMON_DBGFS)	+= dbgfs.o
obj-$(CONFIG_DAMON_RECLAIM)	+= reclaim.o
mm/damon/core.c (499 lines changed)
@@ -10,8 +10,9 @@
#include <linux/damon.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/random.h>
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/string.h>

#define CREATE_TRACE_POINTS
#include <trace/events/damon.h>
@@ -21,9 +22,6 @@
#define DAMON_MIN_REGION 1
#endif

/* Get a random number in [l, r) */
#define damon_rand(l, r) (l + prandom_u32_max(r - l))

static DEFINE_MUTEX(damon_lock);
static int nr_running_ctxs;

@@ -45,18 +43,10 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end)
	region->nr_accesses = 0;
	INIT_LIST_HEAD(&region->list);

	return region;
}
	region->age = 0;
	region->last_nr_accesses = 0;

/*
 * Add a region between two other regions
 */
inline void damon_insert_region(struct damon_region *r,
		struct damon_region *prev, struct damon_region *next,
		struct damon_target *t)
{
	__list_add(&r->list, &prev->list, &next->list);
	t->nr_regions++;
	return region;
}

void damon_add_region(struct damon_region *r, struct damon_target *t)
@@ -82,6 +72,73 @@ void damon_destroy_region(struct damon_region *r, struct damon_target *t)
	damon_free_region(r);
}

struct damos *damon_new_scheme(
		unsigned long min_sz_region, unsigned long max_sz_region,
		unsigned int min_nr_accesses, unsigned int max_nr_accesses,
		unsigned int min_age_region, unsigned int max_age_region,
		enum damos_action action, struct damos_quota *quota,
		struct damos_watermarks *wmarks)
{
	struct damos *scheme;

	scheme = kmalloc(sizeof(*scheme), GFP_KERNEL);
	if (!scheme)
		return NULL;
	scheme->min_sz_region = min_sz_region;
	scheme->max_sz_region = max_sz_region;
	scheme->min_nr_accesses = min_nr_accesses;
	scheme->max_nr_accesses = max_nr_accesses;
	scheme->min_age_region = min_age_region;
	scheme->max_age_region = max_age_region;
	scheme->action = action;
	scheme->stat = (struct damos_stat){};
	INIT_LIST_HEAD(&scheme->list);

	scheme->quota.ms = quota->ms;
	scheme->quota.sz = quota->sz;
	scheme->quota.reset_interval = quota->reset_interval;
	scheme->quota.weight_sz = quota->weight_sz;
	scheme->quota.weight_nr_accesses = quota->weight_nr_accesses;
	scheme->quota.weight_age = quota->weight_age;
	scheme->quota.total_charged_sz = 0;
	scheme->quota.total_charged_ns = 0;
	scheme->quota.esz = 0;
	scheme->quota.charged_sz = 0;
	scheme->quota.charged_from = 0;
	scheme->quota.charge_target_from = NULL;
	scheme->quota.charge_addr_from = 0;

	scheme->wmarks.metric = wmarks->metric;
	scheme->wmarks.interval = wmarks->interval;
	scheme->wmarks.high = wmarks->high;
	scheme->wmarks.mid = wmarks->mid;
	scheme->wmarks.low = wmarks->low;
	scheme->wmarks.activated = true;

	return scheme;
}

void damon_add_scheme(struct damon_ctx *ctx, struct damos *s)
{
	list_add_tail(&s->list, &ctx->schemes);
}

static void damon_del_scheme(struct damos *s)
{
	list_del(&s->list);
}

static void damon_free_scheme(struct damos *s)
{
	kfree(s);
}

void damon_destroy_scheme(struct damos *s)
{
	damon_del_scheme(s);
	damon_free_scheme(s);
}

/*
 * Construct a damon_target struct
 *
@@ -107,6 +164,11 @@ void damon_add_target(struct damon_ctx *ctx, struct damon_target *t)
	list_add_tail(&t->list, &ctx->adaptive_targets);
}

bool damon_targets_empty(struct damon_ctx *ctx)
{
	return list_empty(&ctx->adaptive_targets);
}

static void damon_del_target(struct damon_target *t)
{
	list_del(&t->list);
@@ -153,6 +215,7 @@ struct damon_ctx *damon_new_ctx(void)
	ctx->max_nr_regions = 1000;

	INIT_LIST_HEAD(&ctx->adaptive_targets);
	INIT_LIST_HEAD(&ctx->schemes);

	return ctx;
}
@@ -172,7 +235,13 @@ static void damon_destroy_targets(struct damon_ctx *ctx)

void damon_destroy_ctx(struct damon_ctx *ctx)
{
	struct damos *s, *next_s;

	damon_destroy_targets(ctx);

	damon_for_each_scheme_safe(s, next_s, ctx)
		damon_destroy_scheme(s);

	kfree(ctx);
}

@@ -197,7 +266,6 @@ int damon_set_targets(struct damon_ctx *ctx,
	for (i = 0; i < nr_ids; i++) {
		t = damon_new_target(ids[i]);
		if (!t) {
			pr_err("Failed to alloc damon_target\n");
			/* The caller should do cleanup of the ids itself */
			damon_for_each_target_safe(t, next, ctx)
				damon_destroy_target(t);
@@ -227,16 +295,10 @@ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int,
		unsigned long aggr_int, unsigned long primitive_upd_int,
		unsigned long min_nr_reg, unsigned long max_nr_reg)
{
	if (min_nr_reg < 3) {
		pr_err("min_nr_regions (%lu) must be at least 3\n",
				min_nr_reg);
	if (min_nr_reg < 3)
		return -EINVAL;
	}
	if (min_nr_reg > max_nr_reg) {
		pr_err("invalid nr_regions.  min (%lu) > max (%lu)\n",
				min_nr_reg, max_nr_reg);
	if (min_nr_reg > max_nr_reg)
		return -EINVAL;
	}

	ctx->sample_interval = sample_int;
	ctx->aggr_interval = aggr_int;
@@ -247,6 +309,30 @@ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int,
	return 0;
}

/**
 * damon_set_schemes() - Set data access monitoring based operation schemes.
 * @ctx:	monitoring context
 * @schemes:	array of the schemes
 * @nr_schemes:	number of entries in @schemes
 *
 * This function should not be called while the kdamond of the context is
 * running.
 *
 * Return: 0 if success, or negative error code otherwise.
 */
int damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes,
			ssize_t nr_schemes)
{
	struct damos *s, *next;
	ssize_t i;

	damon_for_each_scheme_safe(s, next, ctx)
		damon_destroy_scheme(s);
	for (i = 0; i < nr_schemes; i++)
		damon_add_scheme(ctx, schemes[i]);
	return 0;
}

/**
 * damon_nr_running_ctxs() - Return number of currently running contexts.
 */
@@ -281,17 +367,6 @@ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx)
	return sz;
}

static bool damon_kdamond_running(struct damon_ctx *ctx)
{
	bool running;

	mutex_lock(&ctx->kdamond_lock);
	running = ctx->kdamond != NULL;
	mutex_unlock(&ctx->kdamond_lock);

	return running;
}

static int kdamond_fn(void *data);

/*
@@ -309,12 +384,11 @@ static int __damon_start(struct damon_ctx *ctx)
	mutex_lock(&ctx->kdamond_lock);
	if (!ctx->kdamond) {
		err = 0;
		ctx->kdamond_stop = false;
		ctx->kdamond = kthread_run(kdamond_fn, ctx, "kdamond.%d",
				nr_running_ctxs);
		if (IS_ERR(ctx->kdamond)) {
			err = PTR_ERR(ctx->kdamond);
			ctx->kdamond = 0;
			ctx->kdamond = NULL;
		}
	}
	mutex_unlock(&ctx->kdamond_lock);
@@ -357,15 +431,6 @@ int damon_start(struct damon_ctx **ctxs, int nr_ctxs)
	return err;
}

static void kdamond_usleep(unsigned long usecs)
{
	/* See Documentation/timers/timers-howto.rst for the thresholds */
	if (usecs > 20 * 1000)
		schedule_timeout_idle(usecs_to_jiffies(usecs));
	else
		usleep_idle_range(usecs, usecs + 1);
}

/*
 * __damon_stop() - Stops monitoring of given context.
 * @ctx:	monitoring context
@@ -374,12 +439,15 @@ static void kdamond_usleep(unsigned long usecs)
 */
static int __damon_stop(struct damon_ctx *ctx)
{
	struct task_struct *tsk;

	mutex_lock(&ctx->kdamond_lock);
	if (ctx->kdamond) {
		ctx->kdamond_stop = true;
	tsk = ctx->kdamond;
	if (tsk) {
		get_task_struct(tsk);
		mutex_unlock(&ctx->kdamond_lock);
		while (damon_kdamond_running(ctx))
			kdamond_usleep(ctx->sample_interval);
		kthread_stop(tsk);
		put_task_struct(tsk);
		return 0;
	}
	mutex_unlock(&ctx->kdamond_lock);
@@ -446,18 +514,221 @@ static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx)
static void kdamond_reset_aggregated(struct damon_ctx *c)
{
	struct damon_target *t;
	unsigned int ti = 0;	/* target's index */

	damon_for_each_target(t, c) {
		struct damon_region *r;

		damon_for_each_region(r, t) {
			trace_damon_aggregated(t, r, damon_nr_regions(t));
			trace_damon_aggregated(t, ti, r, damon_nr_regions(t));
			r->last_nr_accesses = r->nr_accesses;
			r->nr_accesses = 0;
		}
		ti++;
	}
}

#define sz_damon_region(r) (r->ar.end - r->ar.start)
static void damon_split_region_at(struct damon_ctx *ctx,
		struct damon_target *t, struct damon_region *r,
		unsigned long sz_r);

static bool __damos_valid_target(struct damon_region *r, struct damos *s)
{
	unsigned long sz;

	sz = r->ar.end - r->ar.start;
	return s->min_sz_region <= sz && sz <= s->max_sz_region &&
		s->min_nr_accesses <= r->nr_accesses &&
		r->nr_accesses <= s->max_nr_accesses &&
		s->min_age_region <= r->age && r->age <= s->max_age_region;
}

static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t,
		struct damon_region *r, struct damos *s)
{
	bool ret = __damos_valid_target(r, s);

	if (!ret || !s->quota.esz || !c->primitive.get_scheme_score)
		return ret;

	return c->primitive.get_scheme_score(c, t, r, s) >= s->quota.min_score;
}

static void damon_do_apply_schemes(struct damon_ctx *c,
				   struct damon_target *t,
				   struct damon_region *r)
{
	struct damos *s;

	damon_for_each_scheme(s, c) {
		struct damos_quota *quota = &s->quota;
		unsigned long sz = r->ar.end - r->ar.start;
		struct timespec64 begin, end;
		unsigned long sz_applied = 0;

		if (!s->wmarks.activated)
			continue;

		/* Check the quota */
		if (quota->esz && quota->charged_sz >= quota->esz)
			continue;

		/* Skip previously charged regions */
		if (quota->charge_target_from) {
			if (t != quota->charge_target_from)
				continue;
			if (r == damon_last_region(t)) {
				quota->charge_target_from = NULL;
				quota->charge_addr_from = 0;
				continue;
			}
			if (quota->charge_addr_from &&
					r->ar.end <= quota->charge_addr_from)
				continue;

			if (quota->charge_addr_from && r->ar.start <
					quota->charge_addr_from) {
				sz = ALIGN_DOWN(quota->charge_addr_from -
						r->ar.start, DAMON_MIN_REGION);
				if (!sz) {
					if (r->ar.end - r->ar.start <=
							DAMON_MIN_REGION)
						continue;
					sz = DAMON_MIN_REGION;
				}
				damon_split_region_at(c, t, r, sz);
				r = damon_next_region(r);
				sz = r->ar.end - r->ar.start;
			}
			quota->charge_target_from = NULL;
			quota->charge_addr_from = 0;
		}

		if (!damos_valid_target(c, t, r, s))
			continue;

		/* Apply the scheme */
		if (c->primitive.apply_scheme) {
			if (quota->esz &&
					quota->charged_sz + sz > quota->esz) {
				sz = ALIGN_DOWN(quota->esz - quota->charged_sz,
						DAMON_MIN_REGION);
				if (!sz)
					goto update_stat;
				damon_split_region_at(c, t, r, sz);
			}
			ktime_get_coarse_ts64(&begin);
			sz_applied = c->primitive.apply_scheme(c, t, r, s);
			ktime_get_coarse_ts64(&end);
			quota->total_charged_ns += timespec64_to_ns(&end) -
				timespec64_to_ns(&begin);
			quota->charged_sz += sz;
			if (quota->esz && quota->charged_sz >= quota->esz) {
				quota->charge_target_from = t;
				quota->charge_addr_from = r->ar.end + 1;
			}
		}
		if (s->action != DAMOS_STAT)
			r->age = 0;

update_stat:
		s->stat.nr_tried++;
		s->stat.sz_tried += sz;
		if (sz_applied)
			s->stat.nr_applied++;
		s->stat.sz_applied += sz_applied;
	}
}

/* Shouldn't be called if quota->ms and quota->sz are zero */
static void damos_set_effective_quota(struct damos_quota *quota)
{
	unsigned long throughput;
	unsigned long esz;

	if (!quota->ms) {
		quota->esz = quota->sz;
		return;
	}

	if (quota->total_charged_ns)
		throughput = quota->total_charged_sz * 1000000 /
			quota->total_charged_ns;
	else
		throughput = PAGE_SIZE * 1024;
	esz = throughput * quota->ms;

	if (quota->sz && quota->sz < esz)
		esz = quota->sz;
	quota->esz = esz;
}

static void kdamond_apply_schemes(struct damon_ctx *c)
{
	struct damon_target *t;
	struct damon_region *r, *next_r;
	struct damos *s;

	damon_for_each_scheme(s, c) {
		struct damos_quota *quota = &s->quota;
		unsigned long cumulated_sz;
		unsigned int score, max_score = 0;

		if (!s->wmarks.activated)
			continue;

		if (!quota->ms && !quota->sz)
			continue;

		/* New charge window starts */
		if (time_after_eq(jiffies, quota->charged_from +
					msecs_to_jiffies(
						quota->reset_interval))) {
			if (quota->esz && quota->charged_sz >= quota->esz)
				s->stat.qt_exceeds++;
			quota->total_charged_sz += quota->charged_sz;
			quota->charged_from = jiffies;
			quota->charged_sz = 0;
			damos_set_effective_quota(quota);
		}

		if (!c->primitive.get_scheme_score)
			continue;

		/* Fill up the score histogram */
		memset(quota->histogram, 0, sizeof(quota->histogram));
		damon_for_each_target(t, c) {
			damon_for_each_region(r, t) {
				if (!__damos_valid_target(r, s))
					continue;
				score = c->primitive.get_scheme_score(
						c, t, r, s);
				quota->histogram[score] +=
					r->ar.end - r->ar.start;
				if (score > max_score)
					max_score = score;
			}
		}

		/* Set the min score limit */
		for (cumulated_sz = 0, score = max_score; ; score--) {
			cumulated_sz += quota->histogram[score];
			if (cumulated_sz >= quota->esz || !score)
				break;
		}
		quota->min_score = score;
	}

	damon_for_each_target(t, c) {
		damon_for_each_region_safe(r, next_r, t)
			damon_do_apply_schemes(c, t, r);
	}
}

static inline unsigned long sz_damon_region(struct damon_region *r)
{
	return r->ar.end - r->ar.start;
}

/*
 * Merge two adjacent regions into one region
@@ -469,12 +740,11 @@ static void damon_merge_two_regions(struct damon_target *t,

	l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) /
			(sz_l + sz_r);
	l->age = (l->age * sz_l + r->age * sz_r) / (sz_l + sz_r);
	l->ar.end = r->ar.end;
	damon_destroy_region(r, t);
}

#define diff_of(a, b) (a > b ? a - b : b - a)

/*
 * Merge adjacent regions having similar access frequencies
 *
@@ -488,8 +758,13 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres,
	struct damon_region *r, *prev = NULL, *next;

	damon_for_each_region_safe(r, next, t) {
		if (abs(r->nr_accesses - r->last_nr_accesses) > thres)
			r->age = 0;
		else
			r->age++;

		if (prev && prev->ar.end == r->ar.start &&
		    diff_of(prev->nr_accesses, r->nr_accesses) <= thres &&
		    abs(prev->nr_accesses - r->nr_accesses) <= thres &&
		    sz_damon_region(prev) + sz_damon_region(r) <= sz_limit)
			damon_merge_two_regions(t, prev, r);
		else
@@ -535,6 +810,9 @@ static void damon_split_region_at(struct damon_ctx *ctx,

	r->ar.end = new->ar.start;

	new->age = r->age;
	new->last_nr_accesses = r->last_nr_accesses;

	damon_insert_region(new, r, damon_next_region(r), t);
}

@@ -623,12 +901,8 @@ static bool kdamond_need_update_primitive(struct damon_ctx *ctx)
static bool kdamond_need_stop(struct damon_ctx *ctx)
{
	struct damon_target *t;
	bool stop;

	mutex_lock(&ctx->kdamond_lock);
	stop = ctx->kdamond_stop;
	mutex_unlock(&ctx->kdamond_lock);
	if (stop)
	if (kthread_should_stop())
		return true;

	if (!ctx->primitive.target_valid)
@@ -642,11 +916,82 @@ static bool kdamond_need_stop(struct damon_ctx *ctx)
	return true;
}

static void set_kdamond_stop(struct damon_ctx *ctx)
static unsigned long damos_wmark_metric_value(enum damos_wmark_metric metric)
{
	mutex_lock(&ctx->kdamond_lock);
	ctx->kdamond_stop = true;
	mutex_unlock(&ctx->kdamond_lock);
	struct sysinfo i;

	switch (metric) {
	case DAMOS_WMARK_FREE_MEM_RATE:
		si_meminfo(&i);
		return i.freeram * 1000 / i.totalram;
	default:
		break;
	}
	return -EINVAL;
}

/*
 * Returns zero if the scheme is active.  Else, returns time to wait for next
 * watermark check in micro-seconds.
 */
static unsigned long damos_wmark_wait_us(struct damos *scheme)
{
	unsigned long metric;

	if (scheme->wmarks.metric == DAMOS_WMARK_NONE)
		return 0;

	metric = damos_wmark_metric_value(scheme->wmarks.metric);
	/* higher than high watermark or lower than low watermark */
	if (metric > scheme->wmarks.high || scheme->wmarks.low > metric) {
		if (scheme->wmarks.activated)
			pr_debug("deactivate a scheme (%d) for %s wmark\n",
					scheme->action,
					metric > scheme->wmarks.high ?
					"high" : "low");
		scheme->wmarks.activated = false;
		return scheme->wmarks.interval;
	}

	/* inactive and higher than middle watermark */
	if ((scheme->wmarks.high >= metric && metric >= scheme->wmarks.mid) &&
			!scheme->wmarks.activated)
		return scheme->wmarks.interval;

	if (!scheme->wmarks.activated)
		pr_debug("activate a scheme (%d)\n", scheme->action);
	scheme->wmarks.activated = true;
	return 0;
}

static void kdamond_usleep(unsigned long usecs)
{
	/* See Documentation/timers/timers-howto.rst for the thresholds */
	if (usecs > 20 * USEC_PER_MSEC)
		schedule_timeout_idle(usecs_to_jiffies(usecs));
	else
		usleep_idle_range(usecs, usecs + 1);
}

/* Returns negative error code if it's not activated but should return */
static int kdamond_wait_activation(struct damon_ctx *ctx)
{
	struct damos *s;
	unsigned long wait_time;
	unsigned long min_wait_time = 0;

	while (!kdamond_need_stop(ctx)) {
		damon_for_each_scheme(s, ctx) {
			wait_time = damos_wmark_wait_us(s);
			if (!min_wait_time || wait_time < min_wait_time)
				min_wait_time = wait_time;
		}
		if (!min_wait_time)
			return 0;

		kdamond_usleep(min_wait_time);
	}
	return -EBUSY;
}

/*
@@ -659,24 +1004,26 @@ static int kdamond_fn(void *data)
	struct damon_region *r, *next;
	unsigned int max_nr_accesses = 0;
	unsigned long sz_limit = 0;
	bool done = false;

	mutex_lock(&ctx->kdamond_lock);
	pr_info("kdamond (%d) starts\n", ctx->kdamond->pid);
	mutex_unlock(&ctx->kdamond_lock);
	pr_debug("kdamond (%d) starts\n", current->pid);

	if (ctx->primitive.init)
		ctx->primitive.init(ctx);
	if (ctx->callback.before_start && ctx->callback.before_start(ctx))
		set_kdamond_stop(ctx);
		done = true;

	sz_limit = damon_region_sz_limit(ctx);

	while (!kdamond_need_stop(ctx)) {
	while (!kdamond_need_stop(ctx) && !done) {
		if (kdamond_wait_activation(ctx))
			continue;

		if (ctx->primitive.prepare_access_checks)
			ctx->primitive.prepare_access_checks(ctx);
		if (ctx->callback.after_sampling &&
				ctx->callback.after_sampling(ctx))
			set_kdamond_stop(ctx);
			done = true;

		kdamond_usleep(ctx->sample_interval);

@@ -689,7 +1036,8 @@ static int kdamond_fn(void *data)
					sz_limit);
			if (ctx->callback.after_aggregation &&
					ctx->callback.after_aggregation(ctx))
				set_kdamond_stop(ctx);
				done = true;
			kdamond_apply_schemes(ctx);
			kdamond_reset_aggregated(ctx);
			kdamond_split_regions(ctx);
			if (ctx->primitive.reset_aggregated)
@@ -707,13 +1055,12 @@ static int kdamond_fn(void *data)
			damon_destroy_region(r, t);
	}

	if (ctx->callback.before_terminate &&
			ctx->callback.before_terminate(ctx))
		set_kdamond_stop(ctx);
	if (ctx->callback.before_terminate)
		ctx->callback.before_terminate(ctx);
	if (ctx->primitive.cleanup)
		ctx->primitive.cleanup(ctx);

	pr_debug("kdamond (%d) finishes\n", ctx->kdamond->pid);
	pr_debug("kdamond (%d) finishes\n", current->pid);
	mutex_lock(&ctx->kdamond_lock);
	ctx->kdamond = NULL;
	mutex_unlock(&ctx->kdamond_lock);
@@ -722,7 +1069,7 @@ static int kdamond_fn(void *data)
	nr_running_ctxs--;
	mutex_unlock(&damon_lock);

	do_exit(0);
	return 0;
}

#include "core-test.h"
@@ -109,9 +109,63 @@ static void damon_dbgfs_test_set_targets(struct kunit *test)
	dbgfs_destroy_ctx(ctx);
}

static void damon_dbgfs_test_set_init_regions(struct kunit *test)
{
	struct damon_ctx *ctx = damon_new_ctx();
	unsigned long ids[] = {1, 2, 3};
	/* Each line represents one region in ``<target id> <start> <end>`` */
	char * const valid_inputs[] = {"2 10 20\n 2 20 30\n2 35 45",
		"2 10 20\n",
		"2 10 20\n1 39 59\n1 70 134\n 2 20 25\n",
		""};
	/* Reading the file again will show sorted, clean output */
	char * const valid_expects[] = {"2 10 20\n2 20 30\n2 35 45\n",
		"2 10 20\n",
		"1 39 59\n1 70 134\n2 10 20\n2 20 25\n",
		""};
	char * const invalid_inputs[] = {"4 10 20\n",	/* target not exists */
		"2 10 20\n 2 14 26\n",		/* regions overlap */
		"1 10 20\n2 30 40\n 1 5 8"};	/* not sorted by address */
	char *input, *expect;
	int i, rc;
	char buf[256];

	damon_set_targets(ctx, ids, 3);

	/* Put valid inputs and check the results */
	for (i = 0; i < ARRAY_SIZE(valid_inputs); i++) {
		input = valid_inputs[i];
		expect = valid_expects[i];

		rc = set_init_regions(ctx, input, strnlen(input, 256));
		KUNIT_EXPECT_EQ(test, rc, 0);

		memset(buf, 0, 256);
		sprint_init_regions(ctx, buf, 256);

		KUNIT_EXPECT_STREQ(test, (char *)buf, expect);
	}
	/* Put invalid inputs and check the return error code */
	for (i = 0; i < ARRAY_SIZE(invalid_inputs); i++) {
		input = invalid_inputs[i];
		pr_info("input: %s\n", input);
		rc = set_init_regions(ctx, input, strnlen(input, 256));
		KUNIT_EXPECT_EQ(test, rc, -EINVAL);

		memset(buf, 0, 256);
		sprint_init_regions(ctx, buf, 256);

		KUNIT_EXPECT_STREQ(test, (char *)buf, "");
	}

	damon_set_targets(ctx, NULL, 0);
	damon_destroy_ctx(ctx);
}

static struct kunit_case damon_test_cases[] = {
	KUNIT_CASE(damon_dbgfs_test_str_to_target_ids),
	KUNIT_CASE(damon_dbgfs_test_set_targets),
	KUNIT_CASE(damon_dbgfs_test_set_init_regions),
	{},
};
mm/damon/dbgfs.c (438 lines changed)
@ -69,8 +69,7 @@ static ssize_t dbgfs_attrs_write(struct file *file,
|
||||
struct damon_ctx *ctx = file->private_data;
|
||||
unsigned long s, a, r, minr, maxr;
|
||||
char *kbuf;
|
||||
ssize_t ret = count;
|
||||
int err;
|
||||
ssize_t ret;
|
||||
|
||||
kbuf = user_input_str(buf, count, ppos);
|
||||
if (IS_ERR(kbuf))
|
||||
@ -88,9 +87,9 @@ static ssize_t dbgfs_attrs_write(struct file *file,
|
||||
goto unlock_out;
|
||||
}
|
||||
|
||||
err = damon_set_attrs(ctx, s, a, r, minr, maxr);
|
||||
if (err)
|
||||
ret = err;
|
||||
ret = damon_set_attrs(ctx, s, a, r, minr, maxr);
|
||||
if (!ret)
|
||||
ret = count;
|
||||
unlock_out:
|
||||
mutex_unlock(&ctx->kdamond_lock);
|
||||
out:
|
||||
@ -98,6 +97,184 @@ static ssize_t dbgfs_attrs_write(struct file *file,
|
||||
return ret;
|
||||
}
|
||||
|
||||
static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len)
|
||||
{
|
||||
struct damos *s;
|
||||
int written = 0;
|
||||
int rc;
|
||||
|
||||
damon_for_each_scheme(s, c) {
|
||||
rc = scnprintf(&buf[written], len - written,
|
||||
"%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
|
||||
s->min_sz_region, s->max_sz_region,
|
||||
s->min_nr_accesses, s->max_nr_accesses,
|
||||
s->min_age_region, s->max_age_region,
|
||||
s->action,
|
||||
s->quota.ms, s->quota.sz,
|
||||
s->quota.reset_interval,
|
||||
s->quota.weight_sz,
|
||||
s->quota.weight_nr_accesses,
|
||||
s->quota.weight_age,
|
||||
s->wmarks.metric, s->wmarks.interval,
|
||||
s->wmarks.high, s->wmarks.mid, s->wmarks.low,
|
||||
s->stat.nr_tried, s->stat.sz_tried,
|
||||
s->stat.nr_applied, s->stat.sz_applied,
|
||||
s->stat.qt_exceeds);
|
||||
if (!rc)
|
||||
return -ENOMEM;
|
||||
|
||||
written += rc;
|
||||
}
|
||||
return written;
|
||||
}
|
||||
|
||||
static ssize_t dbgfs_schemes_read(struct file *file, char __user *buf,
|
||||
size_t count, loff_t *ppos)
|
||||
{
|
||||
struct damon_ctx *ctx = file->private_data;
|
||||
char *kbuf;
|
||||
ssize_t len;
|
||||
|
||||
kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN);
|
||||
if (!kbuf)
|
||||
return -ENOMEM;
|
||||
|
||||
mutex_lock(&ctx->kdamond_lock);
|
||||
len = sprint_schemes(ctx, kbuf, count);
|
||||
mutex_unlock(&ctx->kdamond_lock);
|
||||
if (len < 0)
|
||||
goto out;
|
||||
len = simple_read_from_buffer(buf, count, ppos, kbuf, len);
|
||||
|
||||
out:
|
||||
kfree(kbuf);
|
||||
return len;
|
||||
}
|
||||
|
||||
static void free_schemes_arr(struct damos **schemes, ssize_t nr_schemes)
|
||||
{
|
||||
ssize_t i;
|
||||
|
||||
for (i = 0; i < nr_schemes; i++)
|
||||
kfree(schemes[i]);
|
||||
kfree(schemes);
|
||||
}
|
||||
|
||||
static bool damos_action_valid(int action)
|
||||
{
|
||||
switch (action) {
|
||||
case DAMOS_WILLNEED:
|
||||
case DAMOS_COLD:
|
||||
case DAMOS_PAGEOUT:
|
||||
case DAMOS_HUGEPAGE:
|
||||
case DAMOS_NOHUGEPAGE:
|
||||
case DAMOS_STAT:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Converts a string into an array of struct damos pointers
|
||||
*
|
||||
* Returns an array of struct damos pointers that converted if the conversion
|
||||
* success, or NULL otherwise.
|
||||
*/
|
||||
static struct damos **str_to_schemes(const char *str, ssize_t len,
|
||||
ssize_t *nr_schemes)
|
||||
{
|
||||
struct damos *scheme, **schemes;
|
||||
const int max_nr_schemes = 256;
|
||||
int pos = 0, parsed, ret;
|
||||
unsigned long min_sz, max_sz;
|
||||
unsigned int min_nr_a, max_nr_a, min_age, max_age;
|
||||
unsigned int action;
|
||||
|
||||
schemes = kmalloc_array(max_nr_schemes, sizeof(scheme),
|
||||
GFP_KERNEL);
|
||||
if (!schemes)
|
||||
return NULL;
|
||||
|
||||
*nr_schemes = 0;
|
||||
while (pos < len && *nr_schemes < max_nr_schemes) {
|
||||
struct damos_quota quota = {};
|
||||
struct damos_watermarks wmarks;
|
||||
|
||||
ret = sscanf(&str[pos],
|
||||
"%lu %lu %u %u %u %u %u %lu %lu %lu %u %u %u %u %lu %lu %lu %lu%n",
|
||||
&min_sz, &max_sz, &min_nr_a, &max_nr_a,
|
||||
&min_age, &max_age, &action, "a.ms,
|
||||
"a.sz, "a.reset_interval,
|
||||
"a.weight_sz, "a.weight_nr_accesses,
|
||||
"a.weight_age, &wmarks.metric,
|
||||
&wmarks.interval, &wmarks.high, &wmarks.mid,
|
||||
&wmarks.low, &parsed);
|
||||
if (ret != 18)
|
||||
break;
|
||||
if (!damos_action_valid(action))
|
||||
goto fail;
|
||||
|
||||
if (min_sz > max_sz || min_nr_a > max_nr_a || min_age > max_age)
|
||||
goto fail;
|
||||
|
||||
if (wmarks.high < wmarks.mid || wmarks.high < wmarks.low ||
|
||||
wmarks.mid < wmarks.low)
|
||||
goto fail;
|
||||
|
||||
pos += parsed;
|
||||
scheme = damon_new_scheme(min_sz, max_sz, min_nr_a, max_nr_a,
|
||||
min_age, max_age, action, "a, &wmarks);
|
||||
if (!scheme)
|
||||
goto fail;
|
||||
|
||||
schemes[*nr_schemes] = scheme;
|
||||
*nr_schemes += 1;
|
||||
}
|
||||
return schemes;
|
||||
fail:
|
||||
free_schemes_arr(schemes, *nr_schemes);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf,
|
||||
size_t count, loff_t *ppos)
|
||||
{
|
||||
struct damon_ctx *ctx = file->private_data;
|
||||
char *kbuf;
|
||||
struct damos **schemes;
|
||||
ssize_t nr_schemes = 0, ret;
|
||||
|
||||
kbuf = user_input_str(buf, count, ppos);
|
||||
if (IS_ERR(kbuf))
|
||||
return PTR_ERR(kbuf);
|
||||
|
||||
schemes = str_to_schemes(kbuf, count, &nr_schemes);
|
||||
if (!schemes) {
|
||||
ret = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
|
||||
mutex_lock(&ctx->kdamond_lock);
|
||||
if (ctx->kdamond) {
|
||||
ret = -EBUSY;
|
||||
goto unlock_out;
|
||||
}
|
||||
|
||||
ret = damon_set_schemes(ctx, schemes, nr_schemes);
|
||||
if (!ret) {
|
||||
ret = count;
|
||||
nr_schemes = 0;
|
||||
}
|
||||
|
||||
unlock_out:
|
||||
mutex_unlock(&ctx->kdamond_lock);
|
||||
free_schemes_arr(schemes, nr_schemes);
|
||||
out:
|
||||
kfree(kbuf);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline bool targetid_is_pid(const struct damon_ctx *ctx)
|
||||
{
|
||||
return ctx->primitive.target_valid == damon_va_target_valid;
|
||||
@ -186,26 +363,30 @@ static ssize_t dbgfs_target_ids_write(struct file *file,
|
||||
{
|
||||
struct damon_ctx *ctx = file->private_data;
|
||||
struct damon_target *t, *next_t;
|
||||
char *kbuf, *nrs;
|
||||
bool id_is_pid = true;
|
||||
char *kbuf;
|
||||
unsigned long *targets;
	ssize_t nr_targets;
	ssize_t ret = count;
	ssize_t ret;
	int i;
	int err;

	kbuf = user_input_str(buf, count, ppos);
	if (IS_ERR(kbuf))
		return PTR_ERR(kbuf);

	nrs = kbuf;
	if (!strncmp(kbuf, "paddr\n", count)) {
		id_is_pid = false;
		/* target id is meaningless here, but we set it just for fun */
		scnprintf(kbuf, count, "42 ");
	}

	targets = str_to_target_ids(nrs, ret, &nr_targets);
	targets = str_to_target_ids(kbuf, count, &nr_targets);
	if (!targets) {
		ret = -ENOMEM;
		goto out;
	}

	if (targetid_is_pid(ctx)) {
	if (id_is_pid) {
		for (i = 0; i < nr_targets; i++) {
			targets[i] = (unsigned long)find_get_pid(
					(int)targets[i]);
@@ -219,7 +400,7 @@ static ssize_t dbgfs_target_ids_write(struct file *file,

	mutex_lock(&ctx->kdamond_lock);
	if (ctx->kdamond) {
		if (targetid_is_pid(ctx))
		if (id_is_pid)
			dbgfs_put_pids(targets, nr_targets);
		ret = -EBUSY;
		goto unlock_out;
@@ -232,11 +413,18 @@ static ssize_t dbgfs_target_ids_write(struct file *file,
		damon_destroy_target(t);
	}

	err = damon_set_targets(ctx, targets, nr_targets);
	if (err) {
		if (targetid_is_pid(ctx))
	/* Configure the context for the address space type */
	if (id_is_pid)
		damon_va_set_primitives(ctx);
	else
		damon_pa_set_primitives(ctx);

	ret = damon_set_targets(ctx, targets, nr_targets);
	if (ret) {
		if (id_is_pid)
			dbgfs_put_pids(targets, nr_targets);
		ret = err;
	} else {
		ret = count;
	}

unlock_out:
@@ -248,6 +436,152 @@ static ssize_t dbgfs_target_ids_write(struct file *file,
	return ret;
}

static ssize_t sprint_init_regions(struct damon_ctx *c, char *buf, ssize_t len)
{
	struct damon_target *t;
	struct damon_region *r;
	int written = 0;
	int rc;

	damon_for_each_target(t, c) {
		damon_for_each_region(r, t) {
			rc = scnprintf(&buf[written], len - written,
					"%lu %lu %lu\n",
					t->id, r->ar.start, r->ar.end);
			if (!rc)
				return -ENOMEM;
			written += rc;
		}
	}
	return written;
}

static ssize_t dbgfs_init_regions_read(struct file *file, char __user *buf,
		size_t count, loff_t *ppos)
{
	struct damon_ctx *ctx = file->private_data;
	char *kbuf;
	ssize_t len;

	kbuf = kmalloc(count, GFP_KERNEL | __GFP_NOWARN);
	if (!kbuf)
		return -ENOMEM;

	mutex_lock(&ctx->kdamond_lock);
	if (ctx->kdamond) {
		mutex_unlock(&ctx->kdamond_lock);
		len = -EBUSY;
		goto out;
	}

	len = sprint_init_regions(ctx, kbuf, count);
	mutex_unlock(&ctx->kdamond_lock);
	if (len < 0)
		goto out;
	len = simple_read_from_buffer(buf, count, ppos, kbuf, len);

out:
	kfree(kbuf);
	return len;
}

static int add_init_region(struct damon_ctx *c,
			 unsigned long target_id, struct damon_addr_range *ar)
{
	struct damon_target *t;
	struct damon_region *r, *prev;
	unsigned long id;
	int rc = -EINVAL;

	if (ar->start >= ar->end)
		return -EINVAL;

	damon_for_each_target(t, c) {
		id = t->id;
		if (targetid_is_pid(c))
			id = (unsigned long)pid_vnr((struct pid *)id);
		if (id == target_id) {
			r = damon_new_region(ar->start, ar->end);
			if (!r)
				return -ENOMEM;
			damon_add_region(r, t);
			if (damon_nr_regions(t) > 1) {
				prev = damon_prev_region(r);
				if (prev->ar.end > r->ar.start) {
					damon_destroy_region(r, t);
					return -EINVAL;
				}
			}
			rc = 0;
		}
	}
	return rc;
}

static int set_init_regions(struct damon_ctx *c, const char *str, ssize_t len)
{
	struct damon_target *t;
	struct damon_region *r, *next;
	int pos = 0, parsed, ret;
	unsigned long target_id;
	struct damon_addr_range ar;
	int err;

	damon_for_each_target(t, c) {
		damon_for_each_region_safe(r, next, t)
			damon_destroy_region(r, t);
	}

	while (pos < len) {
		ret = sscanf(&str[pos], "%lu %lu %lu%n",
				&target_id, &ar.start, &ar.end, &parsed);
		if (ret != 3)
			break;
		err = add_init_region(c, target_id, &ar);
		if (err)
			goto fail;
		pos += parsed;
	}

	return 0;

fail:
	damon_for_each_target(t, c) {
		damon_for_each_region_safe(r, next, t)
			damon_destroy_region(r, t);
	}
	return err;
}

static ssize_t dbgfs_init_regions_write(struct file *file,
					  const char __user *buf, size_t count,
					  loff_t *ppos)
{
	struct damon_ctx *ctx = file->private_data;
	char *kbuf;
	ssize_t ret = count;
	int err;

	kbuf = user_input_str(buf, count, ppos);
	if (IS_ERR(kbuf))
		return PTR_ERR(kbuf);

	mutex_lock(&ctx->kdamond_lock);
	if (ctx->kdamond) {
		ret = -EBUSY;
		goto unlock_out;
	}

	err = set_init_regions(ctx, kbuf, ret);
	if (err)
		ret = err;

unlock_out:
	mutex_unlock(&ctx->kdamond_lock);
	kfree(kbuf);
	return ret;
}
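Note on the input format: set_init_regions() above parses whitespace-separated "<target id> <start address> <end address>" triplets with sscanf(). The following userspace sketch is not part of the patch; it only illustrates feeding one such triplet to the new "init_regions" file. The debugfs mount point and the "damon" directory name are assumptions based on DAMON's usual debugfs layout.

	/* Hedged sketch: write one monitoring region (target 0, 4096..8192)
	 * into DAMON's debugfs "init_regions" file. Assumes debugfs is
	 * mounted at /sys/kernel/debug and the default "damon" directory.
	 */
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		const char *path = "/sys/kernel/debug/damon/init_regions";
		char buf[64];
		int fd, len;

		/* Same "<id> <start> <end>" format that set_init_regions() parses. */
		len = snprintf(buf, sizeof(buf), "%d %lu %lu\n", 0, 4096UL, 8192UL);

		fd = open(path, O_WRONLY);
		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (write(fd, buf, len) != len)
			perror("write");
		close(fd);
		return 0;
	}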

static ssize_t dbgfs_kdamond_pid_read(struct file *file,
		char __user *buf, size_t count, loff_t *ppos)
{
@@ -287,12 +621,24 @@ static const struct file_operations attrs_fops = {
	.write = dbgfs_attrs_write,
};

static const struct file_operations schemes_fops = {
	.open = damon_dbgfs_open,
	.read = dbgfs_schemes_read,
	.write = dbgfs_schemes_write,
};

static const struct file_operations target_ids_fops = {
	.open = damon_dbgfs_open,
	.read = dbgfs_target_ids_read,
	.write = dbgfs_target_ids_write,
};

static const struct file_operations init_regions_fops = {
	.open = damon_dbgfs_open,
	.read = dbgfs_init_regions_read,
	.write = dbgfs_init_regions_write,
};

static const struct file_operations kdamond_pid_fops = {
	.open = damon_dbgfs_open,
	.read = dbgfs_kdamond_pid_read,
@@ -300,22 +646,22 @@ static const struct file_operations kdamond_pid_fops = {

static void dbgfs_fill_ctx_dir(struct dentry *dir, struct damon_ctx *ctx)
{
	const char * const file_names[] = {"attrs", "target_ids",
		"kdamond_pid"};
	const struct file_operations *fops[] = {&attrs_fops, &target_ids_fops,
		&kdamond_pid_fops};
	const char * const file_names[] = {"attrs", "schemes", "target_ids",
		"init_regions", "kdamond_pid"};
	const struct file_operations *fops[] = {&attrs_fops, &schemes_fops,
		&target_ids_fops, &init_regions_fops, &kdamond_pid_fops};
	int i;

	for (i = 0; i < ARRAY_SIZE(file_names); i++)
		debugfs_create_file(file_names[i], 0600, dir, ctx, fops[i]);
}

static int dbgfs_before_terminate(struct damon_ctx *ctx)
static void dbgfs_before_terminate(struct damon_ctx *ctx)
{
	struct damon_target *t, *next;

	if (!targetid_is_pid(ctx))
		return 0;
		return;

	mutex_lock(&ctx->kdamond_lock);
	damon_for_each_target_safe(t, next, ctx) {
@@ -323,7 +669,6 @@ static int dbgfs_before_terminate(struct damon_ctx *ctx)
		damon_destroy_target(t);
	}
	mutex_unlock(&ctx->kdamond_lock);
	return 0;
}

static struct damon_ctx *dbgfs_new_ctx(void)
@@ -398,8 +743,7 @@ static ssize_t dbgfs_mk_context_write(struct file *file,
{
	char *kbuf;
	char *ctx_name;
	ssize_t ret = count;
	int err;
	ssize_t ret;

	kbuf = user_input_str(buf, count, ppos);
	if (IS_ERR(kbuf))
@@ -417,9 +761,9 @@ static ssize_t dbgfs_mk_context_write(struct file *file,
	}

	mutex_lock(&damon_dbgfs_lock);
	err = dbgfs_mk_context(ctx_name);
	if (err)
		ret = err;
	ret = dbgfs_mk_context(ctx_name);
	if (!ret)
		ret = count;
	mutex_unlock(&damon_dbgfs_lock);

out:
@@ -488,8 +832,7 @@ static ssize_t dbgfs_rm_context_write(struct file *file,
		const char __user *buf, size_t count, loff_t *ppos)
{
	char *kbuf;
	ssize_t ret = count;
	int err;
	ssize_t ret;
	char *ctx_name;

	kbuf = user_input_str(buf, count, ppos);
@@ -508,9 +851,9 @@ static ssize_t dbgfs_rm_context_write(struct file *file,
	}

	mutex_lock(&damon_dbgfs_lock);
	err = dbgfs_rm_context(ctx_name);
	if (err)
		ret = err;
	ret = dbgfs_rm_context(ctx_name);
	if (!ret)
		ret = count;
	mutex_unlock(&damon_dbgfs_lock);

out:
@@ -534,9 +877,8 @@ static ssize_t dbgfs_monitor_on_read(struct file *file,
static ssize_t dbgfs_monitor_on_write(struct file *file,
		const char __user *buf, size_t count, loff_t *ppos)
{
	ssize_t ret = count;
	ssize_t ret;
	char *kbuf;
	int err;

	kbuf = user_input_str(buf, count, ppos);
	if (IS_ERR(kbuf))
@@ -549,16 +891,26 @@ static ssize_t dbgfs_monitor_on_write(struct file *file,
	}

	mutex_lock(&damon_dbgfs_lock);
	if (!strncmp(kbuf, "on", count))
		err = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs);
	else if (!strncmp(kbuf, "off", count))
		err = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs);
	else
		err = -EINVAL;
	if (!strncmp(kbuf, "on", count)) {
		int i;

		for (i = 0; i < dbgfs_nr_ctxs; i++) {
			if (damon_targets_empty(dbgfs_ctxs[i])) {
				kfree(kbuf);
				mutex_unlock(&damon_dbgfs_lock);
				return -EINVAL;
			}
		}
		ret = damon_start(dbgfs_ctxs, dbgfs_nr_ctxs);
	} else if (!strncmp(kbuf, "off", count)) {
		ret = damon_stop(dbgfs_ctxs, dbgfs_nr_ctxs);
	} else {
		ret = -EINVAL;
	}
	mutex_unlock(&damon_dbgfs_lock);

	if (err)
		ret = err;
	if (!ret)
		ret = count;
	kfree(kbuf);
	return ret;
}

@@ -135,7 +135,6 @@ static void damon_do_test_apply_three_regions(struct kunit *test,
		struct damon_addr_range *three_regions,
		unsigned long *expected, int nr_expected)
{
	struct damon_ctx *ctx = damon_new_ctx();
	struct damon_target *t;
	struct damon_region *r;
	int i;
@@ -145,7 +144,6 @@ static void damon_do_test_apply_three_regions(struct kunit *test,
		r = damon_new_region(regions[i * 2], regions[i * 2 + 1]);
		damon_add_region(r, t);
	}
	damon_add_target(ctx, t);

	damon_va_apply_three_regions(t, three_regions);

@@ -154,8 +152,6 @@ static void damon_do_test_apply_three_regions(struct kunit *test,
		KUNIT_EXPECT_EQ(test, r->ar.start, expected[i * 2]);
		KUNIT_EXPECT_EQ(test, r->ar.end, expected[i * 2 + 1]);
	}

	damon_destroy_ctx(ctx);
}

/*
@@ -233,7 +229,7 @@ static void damon_test_apply_three_regions3(struct kunit *test)
 * and 70-100) has totally freed and mapped to different area (30-32 and
 * 65-68). The target regions which were in the old second and third big
 * regions should now be removed and new target regions covering the new second
 * and third big regions should be crated.
 * and third big regions should be created.
 */
static void damon_test_apply_three_regions4(struct kunit *test)
{
@@ -252,60 +248,59 @@ static void damon_test_apply_three_regions4(struct kunit *test)
			new_three_regions, expected, ARRAY_SIZE(expected));
}

static void damon_test_split_evenly(struct kunit *test)
static void damon_test_split_evenly_fail(struct kunit *test,
		unsigned long start, unsigned long end, unsigned int nr_pieces)
{
	struct damon_ctx *c = damon_new_ctx();
	struct damon_target *t;
	struct damon_region *r;
	unsigned long i;

	KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(NULL, NULL, 5),
			-EINVAL);

	t = damon_new_target(42);
	r = damon_new_region(0, 100);
	KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, 0), -EINVAL);
	struct damon_target *t = damon_new_target(42);
	struct damon_region *r = damon_new_region(start, end);

	damon_add_region(r, t);
	KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, 10), 0);
	KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 10u);

	i = 0;
	damon_for_each_region(r, t) {
		KUNIT_EXPECT_EQ(test, r->ar.start, i++ * 10);
		KUNIT_EXPECT_EQ(test, r->ar.end, i * 10);
	}
	damon_free_target(t);

	t = damon_new_target(42);
	r = damon_new_region(5, 59);
	damon_add_region(r, t);
	KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, 5), 0);
	KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 5u);

	i = 0;
	damon_for_each_region(r, t) {
		if (i == 4)
			break;
		KUNIT_EXPECT_EQ(test, r->ar.start, 5 + 10 * i++);
		KUNIT_EXPECT_EQ(test, r->ar.end, 5 + 10 * i);
	}
	KUNIT_EXPECT_EQ(test, r->ar.start, 5 + 10 * i);
	KUNIT_EXPECT_EQ(test, r->ar.end, 59ul);
	damon_free_target(t);

	t = damon_new_target(42);
	r = damon_new_region(5, 6);
	damon_add_region(r, t);
	KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(t, r, 2), -EINVAL);
	KUNIT_EXPECT_EQ(test,
			damon_va_evenly_split_region(t, r, nr_pieces), -EINVAL);
	KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 1u);

	damon_for_each_region(r, t) {
		KUNIT_EXPECT_EQ(test, r->ar.start, 5ul);
		KUNIT_EXPECT_EQ(test, r->ar.end, 6ul);
		KUNIT_EXPECT_EQ(test, r->ar.start, start);
		KUNIT_EXPECT_EQ(test, r->ar.end, end);
	}

	damon_free_target(t);
	damon_destroy_ctx(c);
}

static void damon_test_split_evenly_succ(struct kunit *test,
		unsigned long start, unsigned long end, unsigned int nr_pieces)
{
	struct damon_target *t = damon_new_target(42);
	struct damon_region *r = damon_new_region(start, end);
	unsigned long expected_width = (end - start) / nr_pieces;
	unsigned long i = 0;

	damon_add_region(r, t);
	KUNIT_EXPECT_EQ(test,
			damon_va_evenly_split_region(t, r, nr_pieces), 0);
	KUNIT_EXPECT_EQ(test, damon_nr_regions(t), nr_pieces);

	damon_for_each_region(r, t) {
		if (i == nr_pieces - 1)
			break;
		KUNIT_EXPECT_EQ(test,
				r->ar.start, start + i++ * expected_width);
		KUNIT_EXPECT_EQ(test, r->ar.end, start + i * expected_width);
	}
	KUNIT_EXPECT_EQ(test, r->ar.start, start + i * expected_width);
	KUNIT_EXPECT_EQ(test, r->ar.end, end);
	damon_free_target(t);
}

static void damon_test_split_evenly(struct kunit *test)
{
	KUNIT_EXPECT_EQ(test, damon_va_evenly_split_region(NULL, NULL, 5),
			-EINVAL);

	damon_test_split_evenly_fail(test, 0, 100, 0);
	damon_test_split_evenly_succ(test, 0, 100, 10);
	damon_test_split_evenly_succ(test, 5, 59, 5);
	damon_test_split_evenly_fail(test, 5, 6, 2);
}

static struct kunit_case damon_test_cases[] = {
317 mm/damon/vaddr.c
@@ -7,31 +7,29 @@

#define pr_fmt(fmt) "damon-va: " fmt

#include <linux/damon.h>
#include <linux/hugetlb.h>
#include <linux/mm.h>
#include <linux/mmu_notifier.h>
#include <asm-generic/mman-common.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/mmu_notifier.h>
#include <linux/page_idle.h>
#include <linux/pagewalk.h>
#include <linux/random.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>

#include "prmtv-common.h"

#ifdef CONFIG_DAMON_VADDR_KUNIT_TEST
#undef DAMON_MIN_REGION
#define DAMON_MIN_REGION 1
#endif

/* Get a random number in [l, r) */
#define damon_rand(l, r) (l + prandom_u32_max(r - l))

/*
 * 't->id' should be the pointer to the relevant 'struct pid' having reference
 * count. Caller must put the returned task, unless it is NULL.
 */
#define damon_get_task_struct(t) \
	(get_pid_task((struct pid *)t->id, PIDTYPE_PID))
static inline struct task_struct *damon_get_task_struct(struct damon_target *t)
{
	return get_pid_task((struct pid *)t->id, PIDTYPE_PID);
}

/*
 * Get the mm_struct of the given target
@@ -102,16 +100,6 @@ static unsigned long sz_range(struct damon_addr_range *r)
	return r->end - r->start;
}

static void swap_ranges(struct damon_addr_range *r1,
			struct damon_addr_range *r2)
{
	struct damon_addr_range tmp;

	tmp = *r1;
	*r1 = *r2;
	*r2 = tmp;
}

/*
 * Find three regions separated by two biggest unmapped regions
 *
@@ -150,9 +138,9 @@ static int __damon_va_three_regions(struct vm_area_struct *vma,
		gap.start = last_vma->vm_end;
		gap.end = vma->vm_start;
		if (sz_range(&gap) > sz_range(&second_gap)) {
			swap_ranges(&gap, &second_gap);
			swap(gap, second_gap);
			if (sz_range(&second_gap) > sz_range(&first_gap))
				swap_ranges(&second_gap, &first_gap);
				swap(second_gap, first_gap);
		}
next:
		last_vma = vma;
@@ -163,7 +151,7 @@ static int __damon_va_three_regions(struct vm_area_struct *vma,

	/* Sort the two biggest gaps by address */
	if (first_gap.start > second_gap.start)
		swap_ranges(&first_gap, &second_gap);
		swap(first_gap, second_gap);

	/* Store the result */
	regions[0].start = ALIGN(start, DAMON_MIN_REGION);
@@ -244,13 +232,19 @@ static int damon_va_three_regions(struct damon_target *t,
static void __damon_va_init_regions(struct damon_ctx *ctx,
				     struct damon_target *t)
{
	struct damon_target *ti;
	struct damon_region *r;
	struct damon_addr_range regions[3];
	unsigned long sz = 0, nr_pieces;
	int i;
	int i, tidx = 0;

	if (damon_va_three_regions(t, regions)) {
		pr_err("Failed to get three regions of target %lu\n", t->id);
		damon_for_each_target(ti, ctx) {
			if (ti == t)
				break;
			tidx++;
		}
		pr_debug("Failed to get three regions of %dth target\n", tidx);
		return;
	}

@@ -276,7 +270,7 @@ static void __damon_va_init_regions(struct damon_ctx *ctx,
}

/* Initialize '->regions_list' of every target (task) */
void damon_va_init(struct damon_ctx *ctx)
static void damon_va_init(struct damon_ctx *ctx)
{
	struct damon_target *t;

@@ -296,7 +290,8 @@ void damon_va_init(struct damon_ctx *ctx)
 *
 * Returns true if it is.
 */
static bool damon_intersect(struct damon_region *r, struct damon_addr_range *re)
static bool damon_intersect(struct damon_region *r,
		struct damon_addr_range *re)
{
	return !(r->ar.end <= re->start || re->end <= r->ar.start);
}
@@ -311,7 +306,7 @@ static void damon_va_apply_three_regions(struct damon_target *t,
		struct damon_addr_range bregions[3])
{
	struct damon_region *r, *next;
	unsigned int i = 0;
	unsigned int i;

	/* Remove regions which are not in the three big regions now */
	damon_for_each_region_safe(r, next, t) {
@@ -360,7 +355,7 @@ static void damon_va_apply_three_regions(struct damon_target *t,
/*
 * Update regions for current memory mappings
 */
void damon_va_update(struct damon_ctx *ctx)
static void damon_va_update(struct damon_ctx *ctx)
{
	struct damon_addr_range three_regions[3];
	struct damon_target *t;
@@ -372,82 +367,6 @@ void damon_va_update(struct damon_ctx *ctx)
	}
}

/*
 * Get an online page for a pfn if it's in the LRU list. Otherwise, returns
 * NULL.
 *
 * The body of this function is stolen from the 'page_idle_get_page()'. We
 * steal rather than reuse it because the code is quite simple.
 */
static struct page *damon_get_page(unsigned long pfn)
{
	struct page *page = pfn_to_online_page(pfn);

	if (!page || !PageLRU(page) || !get_page_unless_zero(page))
		return NULL;

	if (unlikely(!PageLRU(page))) {
		put_page(page);
		page = NULL;
	}
	return page;
}

static void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm,
			     unsigned long addr)
{
	bool referenced = false;
	struct page *page = damon_get_page(pte_pfn(*pte));

	if (!page)
		return;

	if (pte_young(*pte)) {
		referenced = true;
		*pte = pte_mkold(*pte);
	}

#ifdef CONFIG_MMU_NOTIFIER
	if (mmu_notifier_clear_young(mm, addr, addr + PAGE_SIZE))
		referenced = true;
#endif /* CONFIG_MMU_NOTIFIER */

	if (referenced)
		set_page_young(page);

	set_page_idle(page);
	put_page(page);
}

static void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm,
			     unsigned long addr)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	bool referenced = false;
	struct page *page = damon_get_page(pmd_pfn(*pmd));

	if (!page)
		return;

	if (pmd_young(*pmd)) {
		referenced = true;
		*pmd = pmd_mkold(*pmd);
	}

#ifdef CONFIG_MMU_NOTIFIER
	if (mmu_notifier_clear_young(mm, addr,
				addr + ((1UL) << HPAGE_PMD_SHIFT)))
		referenced = true;
#endif /* CONFIG_MMU_NOTIFIER */

	if (referenced)
		set_page_young(page);

	set_page_idle(page);
	put_page(page);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
}

static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr,
		unsigned long next, struct mm_walk *walk)
{
@@ -475,8 +394,65 @@ static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr,
	return 0;
}

static struct mm_walk_ops damon_mkold_ops = {
#ifdef CONFIG_HUGETLB_PAGE
static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm,
				struct vm_area_struct *vma, unsigned long addr)
{
	bool referenced = false;
	pte_t entry = huge_ptep_get(pte);
	struct page *page = pte_page(entry);

	if (!page)
		return;

	get_page(page);

	if (pte_young(entry)) {
		referenced = true;
		entry = pte_mkold(entry);
		huge_ptep_set_access_flags(vma, addr, pte, entry,
					   vma->vm_flags & VM_WRITE);
	}

#ifdef CONFIG_MMU_NOTIFIER
	if (mmu_notifier_clear_young(mm, addr,
				     addr + huge_page_size(hstate_vma(vma))))
		referenced = true;
#endif /* CONFIG_MMU_NOTIFIER */

	if (referenced)
		set_page_young(page);

	set_page_idle(page);
	put_page(page);
}

static int damon_mkold_hugetlb_entry(pte_t *pte, unsigned long hmask,
				     unsigned long addr, unsigned long end,
				     struct mm_walk *walk)
{
	struct hstate *h = hstate_vma(walk->vma);
	spinlock_t *ptl;
	pte_t entry;

	ptl = huge_pte_lock(h, walk->mm, pte);
	entry = huge_ptep_get(pte);
	if (!pte_present(entry))
		goto out;

	damon_hugetlb_mkold(pte, walk->mm, walk->vma, addr);

out:
	spin_unlock(ptl);
	return 0;
}
#else
#define damon_mkold_hugetlb_entry NULL
#endif /* CONFIG_HUGETLB_PAGE */

static const struct mm_walk_ops damon_mkold_ops = {
	.pmd_entry = damon_mkold_pmd_entry,
	.hugetlb_entry = damon_mkold_hugetlb_entry,
};

static void damon_va_mkold(struct mm_struct *mm, unsigned long addr)
@@ -490,7 +466,7 @@ static void damon_va_mkold(struct mm_struct *mm, unsigned long addr)
 * Functions for the access checking of the regions
 */

static void damon_va_prepare_access_check(struct damon_ctx *ctx,
static void __damon_va_prepare_access_check(struct damon_ctx *ctx,
			struct mm_struct *mm, struct damon_region *r)
{
	r->sampling_addr = damon_rand(r->ar.start, r->ar.end);
@@ -498,7 +474,7 @@ static void damon_va_prepare_access_check(struct damon_ctx *ctx,
	damon_va_mkold(mm, r->sampling_addr);
}

void damon_va_prepare_access_checks(struct damon_ctx *ctx)
static void damon_va_prepare_access_checks(struct damon_ctx *ctx)
{
	struct damon_target *t;
	struct mm_struct *mm;
@@ -509,7 +485,7 @@ void damon_va_prepare_access_checks(struct damon_ctx *ctx)
		if (!mm)
			continue;
		damon_for_each_region(r, t)
			damon_va_prepare_access_check(ctx, mm, r);
			__damon_va_prepare_access_check(ctx, mm, r);
		mmput(mm);
	}
}
@@ -571,8 +547,47 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr,
	return 0;
}

static struct mm_walk_ops damon_young_ops = {
#ifdef CONFIG_HUGETLB_PAGE
static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask,
				     unsigned long addr, unsigned long end,
				     struct mm_walk *walk)
{
	struct damon_young_walk_private *priv = walk->private;
	struct hstate *h = hstate_vma(walk->vma);
	struct page *page;
	spinlock_t *ptl;
	pte_t entry;

	ptl = huge_pte_lock(h, walk->mm, pte);
	entry = huge_ptep_get(pte);
	if (!pte_present(entry))
		goto out;

	page = pte_page(entry);
	if (!page)
		goto out;

	get_page(page);

	if (pte_young(entry) || !page_is_idle(page) ||
	    mmu_notifier_test_young(walk->mm, addr)) {
		*priv->page_sz = huge_page_size(h);
		priv->young = true;
	}

	put_page(page);

out:
	spin_unlock(ptl);
	return 0;
}
#else
#define damon_young_hugetlb_entry NULL
#endif /* CONFIG_HUGETLB_PAGE */

static const struct mm_walk_ops damon_young_ops = {
	.pmd_entry = damon_young_pmd_entry,
	.hugetlb_entry = damon_young_hugetlb_entry,
};

static bool damon_va_young(struct mm_struct *mm, unsigned long addr,
@@ -595,7 +610,7 @@ static bool damon_va_young(struct mm_struct *mm, unsigned long addr,
 * mm	'mm_struct' for the given virtual address space
 * r	the region to be checked
 */
static void damon_va_check_access(struct damon_ctx *ctx,
static void __damon_va_check_access(struct damon_ctx *ctx,
			       struct mm_struct *mm, struct damon_region *r)
{
	static struct mm_struct *last_mm;
@@ -619,7 +634,7 @@ static void damon_va_check_access(struct damon_ctx *ctx,
	last_addr = r->sampling_addr;
}

unsigned int damon_va_check_accesses(struct damon_ctx *ctx)
static unsigned int damon_va_check_accesses(struct damon_ctx *ctx)
{
	struct damon_target *t;
	struct mm_struct *mm;
@@ -631,7 +646,7 @@ unsigned int damon_va_check_accesses(struct damon_ctx *ctx)
		if (!mm)
			continue;
		damon_for_each_region(r, t) {
			damon_va_check_access(ctx, mm, r);
			__damon_va_check_access(ctx, mm, r);
			max_nr_accesses = max(r->nr_accesses, max_nr_accesses);
		}
		mmput(mm);
@@ -658,6 +673,78 @@ bool damon_va_target_valid(void *target)
	return false;
}

#ifndef CONFIG_ADVISE_SYSCALLS
static unsigned long damos_madvise(struct damon_target *target,
		struct damon_region *r, int behavior)
{
	return 0;
}
#else
static unsigned long damos_madvise(struct damon_target *target,
		struct damon_region *r, int behavior)
{
	struct mm_struct *mm;
	unsigned long start = PAGE_ALIGN(r->ar.start);
	unsigned long len = PAGE_ALIGN(r->ar.end - r->ar.start);
	unsigned long applied;

	mm = damon_get_mm(target);
	if (!mm)
		return 0;

	applied = do_madvise(mm, start, len, behavior) ? 0 : len;
	mmput(mm);

	return applied;
}
#endif	/* CONFIG_ADVISE_SYSCALLS */

static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
		struct damon_target *t, struct damon_region *r,
		struct damos *scheme)
{
	int madv_action;

	switch (scheme->action) {
	case DAMOS_WILLNEED:
		madv_action = MADV_WILLNEED;
		break;
	case DAMOS_COLD:
		madv_action = MADV_COLD;
		break;
	case DAMOS_PAGEOUT:
		madv_action = MADV_PAGEOUT;
		break;
	case DAMOS_HUGEPAGE:
		madv_action = MADV_HUGEPAGE;
		break;
	case DAMOS_NOHUGEPAGE:
		madv_action = MADV_NOHUGEPAGE;
		break;
	case DAMOS_STAT:
		return 0;
	default:
		return 0;
	}

	return damos_madvise(t, r, madv_action);
}

static int damon_va_scheme_score(struct damon_ctx *context,
		struct damon_target *t, struct damon_region *r,
		struct damos *scheme)
{

	switch (scheme->action) {
	case DAMOS_PAGEOUT:
		return damon_pageout_score(context, r, scheme);
	default:
		break;
	}

	return DAMOS_MAX_SCORE;
}

void damon_va_set_primitives(struct damon_ctx *ctx)
{
	ctx->primitive.init = damon_va_init;
@@ -667,6 +754,8 @@ void damon_va_set_primitives(struct damon_ctx *ctx)
	ctx->primitive.reset_aggregated = NULL;
	ctx->primitive.target_valid = damon_va_target_valid;
	ctx->primitive.cleanup = NULL;
	ctx->primitive.apply_scheme = damon_va_apply_scheme;
	ctx->primitive.get_scheme_score = damon_va_scheme_score;
}

#include "vaddr-test.h"
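For context, damon_va_set_primitives() is the only entry point through which the new apply_scheme/get_scheme_score hooks become visible to a monitoring context. A minimal, hedged sketch of how a caller wires them up (mirroring what dbgfs_target_ids_write() does for "pid" targets; new_vaddr_ctx() is an illustrative helper, not a kernel function):

	#include <linux/damon.h>

	/* Hedged sketch: create a context and install the virtual-address
	 * primitives, including the scheme hooks added in this hunk.
	 */
	static struct damon_ctx *new_vaddr_ctx(void)
	{
		struct damon_ctx *ctx = damon_new_ctx();

		if (!ctx)
			return NULL;
		damon_va_set_primitives(ctx);	/* fills ctx->primitive.* callbacks */
		return ctx;
	}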
78 mm/debug.c
@@ -16,17 +16,19 @@
#include <linux/ctype.h>

#include "internal.h"
#include <trace/events/migrate.h>

/*
 * Define EM() and EMe() so that MIGRATE_REASON from trace/events/migrate.h can
 * be used to populate migrate_reason_names[].
 */
#undef EM
#undef EMe
#define EM(a, b)	b,
#define EMe(a, b)	b

const char *migrate_reason_names[MR_TYPES] = {
	"compaction",
	"memory_failure",
	"memory_hotplug",
	"syscall_or_cpuset",
	"mempolicy_mbind",
	"numa_misplaced",
	"contig_range",
	"longterm_pin",
	"demotion",
	MIGRATE_REASON
};
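The hunk above drops the hand-maintained string list in favour of expanding the MIGRATE_REASON list macro from <trace/events/migrate.h> with EM()/EMe() redefined to emit names. The following self-contained sketch only illustrates that general "X-macro" pattern; FRUIT_LIST and the fruit names are invented for the example and are not kernel definitions.

	#include <stdio.h>

	/* One list macro drives both the enum and its name table. */
	#define FRUIT_LIST \
		EM(FRUIT_APPLE,  "apple")  \
		EM(FRUIT_PEAR,   "pear")   \
		EMe(FRUIT_GRAPE, "grape")

	/* First expansion: enum values. */
	#define EM(a, b)	a,
	#define EMe(a, b)	a
	enum fruit { FRUIT_LIST };

	/* Second expansion: matching string table, kept in sync automatically. */
	#undef EM
	#undef EMe
	#define EM(a, b)	b,
	#define EMe(a, b)	b
	static const char *fruit_names[] = { FRUIT_LIST };

	int main(void)
	{
		printf("%s\n", fruit_names[FRUIT_PEAR]);	/* prints "pear" */
		return 0;
	}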

const struct trace_print_flags pageflag_names[] = {
@@ -110,59 +112,11 @@ static void __dump_page(struct page *page)
		type = "ksm ";
	else if (PageAnon(page))
		type = "anon ";
	else if (mapping) {
		struct inode *host;
		const struct address_space_operations *a_ops;
		struct hlist_node *dentry_first;
		struct dentry *dentry_ptr;
		struct dentry dentry;
		unsigned long ino;

		/*
		 * mapping can be invalid pointer and we don't want to crash
		 * accessing it, so probe everything depending on it carefully
		 */
		if (get_kernel_nofault(host, &mapping->host) ||
		    get_kernel_nofault(a_ops, &mapping->a_ops)) {
			pr_warn("failed to read mapping contents, not a valid kernel address?\n");
			goto out_mapping;
		}

		if (!host) {
			pr_warn("aops:%ps\n", a_ops);
			goto out_mapping;
		}

		if (get_kernel_nofault(dentry_first, &host->i_dentry.first) ||
		    get_kernel_nofault(ino, &host->i_ino)) {
			pr_warn("aops:%ps with invalid host inode %px\n",
					a_ops, host);
			goto out_mapping;
		}

		if (!dentry_first) {
			pr_warn("aops:%ps ino:%lx\n", a_ops, ino);
			goto out_mapping;
		}

		dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias);
		if (get_kernel_nofault(dentry, dentry_ptr)) {
			pr_warn("aops:%ps ino:%lx with invalid dentry %px\n",
					a_ops, ino, dentry_ptr);
		} else {
			/*
			 * if dentry is corrupted, the %pd handler may still
			 * crash, but it's unlikely that we reach here with a
			 * corrupted struct page
			 */
			pr_warn("aops:%ps ino:%lx dentry name:\"%pd\"\n",
					a_ops, ino, &dentry);
		}
	}
out_mapping:
	else if (mapping)
		dump_mapping(mapping);
	BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1);

	pr_warn("%sflags: %#lx(%pGp)%s\n", type, head->flags, &head->flags,
	pr_warn("%sflags: %pGp%s\n", type, &head->flags,
		page_cma ? " CMA" : "");
	print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32,
			sizeof(unsigned long), page,
@@ -216,7 +170,7 @@ void dump_mm(const struct mm_struct *mm)
		"start_code %lx end_code %lx start_data %lx end_data %lx\n"
		"start_brk %lx brk %lx start_stack %lx\n"
		"arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
		"binfmt %px flags %lx core_state %px\n"
		"binfmt %px flags %lx\n"
#ifdef CONFIG_AIO
		"ioctx_table %px\n"
#endif
@@ -248,7 +202,7 @@ void dump_mm(const struct mm_struct *mm)
		mm->start_code, mm->end_code, mm->start_data, mm->end_data,
		mm->start_brk, mm->brk, mm->start_stack,
		mm->arg_start, mm->arg_end, mm->env_start, mm->env_end,
		mm->binfmt, mm->flags, mm->core_state,
		mm->binfmt, mm->flags,
#ifdef CONFIG_AIO
		mm->ioctx_table,
#endif

@@ -654,7 +654,7 @@ static void __init pte_clear_tests(struct pgtable_debug_args *args)
	set_pte_at(args->mm, args->vaddr, args->ptep, pte);
	flush_dcache_page(page);
	barrier();
	pte_clear(args->mm, args->vaddr, args->ptep);
	ptep_clear(args->mm, args->vaddr, args->ptep);
	pte = ptep_get(args->ptep);
	WARN_ON(!pte_none(pte));
}
@@ -890,8 +890,8 @@ static void __init swap_migration_tests(struct pgtable_debug_args *args)
	pr_debug("Validating swap migration\n");

	/*
	 * make_migration_entry() expects given page to be
	 * locked, otherwise it stumbles upon a BUG_ON().
	 * make_[readable|writable]_migration_entry() expects given page to
	 * be locked, otherwise it stumbles upon a BUG_ON().
	 */
	__SetPageLocked(page);
	swp = make_writable_migration_entry(page_to_pfn(page));
@@ -1106,13 +1106,14 @@ static int __init init_args(struct pgtable_debug_args *args)
	/*
	 * Initialize the debugging data.
	 *
	 * __P000 (or even __S000) will help create page table entries with
	 * PROT_NONE permission as required for pxx_protnone_tests().
	 * protection_map[0] (or even protection_map[8]) will help create
	 * page table entries with PROT_NONE permission as required for
	 * pxx_protnone_tests().
	 */
	memset(args, 0, sizeof(*args));
	args->vaddr = get_random_vaddr();
	args->page_prot = vm_get_page_prot(VMFLAGS);
	args->page_prot_none = __P000;
	args->page_prot_none = protection_map[0];
	args->is_contiguous_page = false;
	args->pud_pfn = ULONG_MAX;
	args->pmd_pfn = ULONG_MAX;

@@ -152,7 +152,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
	else if ((boundary < size) || (boundary & (boundary - 1)))
		return NULL;

	retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev));
	retval = kmalloc(sizeof(*retval), GFP_KERNEL);
	if (!retval)
		return retval;

1733 mm/filemap.c (diff suppressed because it is too large)
259 mm/frontswap.c
@@ -27,27 +27,7 @@ DEFINE_STATIC_KEY_FALSE(frontswap_enabled_key);
 * may be registered, but implementations can never deregister.  This
 * is a simple singly-linked list of all registered implementations.
 */
static struct frontswap_ops *frontswap_ops __read_mostly;

#define for_each_frontswap_ops(ops)		\
	for ((ops) = frontswap_ops; (ops); (ops) = (ops)->next)

/*
 * If enabled, frontswap_store will return failure even on success.  As
 * a result, the swap subsystem will always write the page to swap, in
 * effect converting frontswap into a writethrough cache.  In this mode,
 * there is no direct reduction in swap writes, but a frontswap backend
 * can unilaterally "reclaim" any pages in use with no data loss, thus
 * providing increases control over maximum memory usage due to frontswap.
 */
static bool frontswap_writethrough_enabled __read_mostly;

/*
 * If enabled, the underlying tmem implementation is capable of doing
 * exclusive gets, so frontswap_load, on a successful tmem_get must
 * mark the page as no longer in frontswap AND mark it dirty.
 */
static bool frontswap_tmem_exclusive_gets_enabled __read_mostly;
static const struct frontswap_ops *frontswap_ops __read_mostly;

#ifdef CONFIG_DEBUG_FS
/*
@@ -114,87 +94,22 @@ static inline void inc_frontswap_invalidates(void) { }
/*
 * Register operations for frontswap
 */
void frontswap_register_ops(struct frontswap_ops *ops)
int frontswap_register_ops(const struct frontswap_ops *ops)
{
	DECLARE_BITMAP(a, MAX_SWAPFILES);
	DECLARE_BITMAP(b, MAX_SWAPFILES);
	struct swap_info_struct *si;
	unsigned int i;

	bitmap_zero(a, MAX_SWAPFILES);
	bitmap_zero(b, MAX_SWAPFILES);

	spin_lock(&swap_lock);
	plist_for_each_entry(si, &swap_active_head, list) {
		if (!WARN_ON(!si->frontswap_map))
			set_bit(si->type, a);
	}
	spin_unlock(&swap_lock);

	/* the new ops needs to know the currently active swap devices */
	for_each_set_bit(i, a, MAX_SWAPFILES)
		ops->init(i);

	/*
	 * Setting frontswap_ops must happen after the ops->init() calls
	 * above; cmpxchg implies smp_mb() which will ensure the init is
	 * complete at this point.
	 */
	do {
		ops->next = frontswap_ops;
	} while (cmpxchg(&frontswap_ops, ops->next, ops) != ops->next);
	if (frontswap_ops)
		return -EINVAL;

	frontswap_ops = ops;
	static_branch_inc(&frontswap_enabled_key);

	spin_lock(&swap_lock);
	plist_for_each_entry(si, &swap_active_head, list) {
		if (si->frontswap_map)
			set_bit(si->type, b);
	}
	spin_unlock(&swap_lock);

	/*
	 * On the very unlikely chance that a swap device was added or
	 * removed between setting the "a" list bits and the ops init
	 * calls, we re-check and do init or invalidate for any changed
	 * bits.
	 */
	if (unlikely(!bitmap_equal(a, b, MAX_SWAPFILES))) {
		for (i = 0; i < MAX_SWAPFILES; i++) {
			if (!test_bit(i, a) && test_bit(i, b))
				ops->init(i);
			else if (test_bit(i, a) && !test_bit(i, b))
				ops->invalidate_area(i);
		}
	}
	return 0;
}
EXPORT_SYMBOL(frontswap_register_ops);
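With the single-backend API above, a backend fills a const struct frontswap_ops and registers it exactly once; a second registration now fails with -EINVAL instead of being chained. The sketch below is hedged: the callback names (init, store, load, invalidate_page, invalidate_area) come from how frontswap_ops members are invoked in this file, but the exact prototypes are an assumption and should be checked against include/linux/frontswap.h; "dummy_frontswap" is an invented example backend, not kernel code.

	#include <linux/frontswap.h>
	#include <linux/module.h>

	static void dummy_init(unsigned type) { }

	static int dummy_store(unsigned type, pgoff_t offset, struct page *page)
	{
		return -1;	/* reject everything; swap falls back to the device */
	}

	static int dummy_load(unsigned type, pgoff_t offset, struct page *page)
	{
		return -1;	/* nothing was stored, so nothing to load */
	}

	static void dummy_invalidate_page(unsigned type, pgoff_t offset) { }
	static void dummy_invalidate_area(unsigned type) { }

	static const struct frontswap_ops dummy_frontswap_ops = {
		.init		 = dummy_init,
		.store		 = dummy_store,
		.load		 = dummy_load,
		.invalidate_page = dummy_invalidate_page,
		.invalidate_area = dummy_invalidate_area,
	};

	static int __init dummy_frontswap_init(void)
	{
		/* Fails with -EINVAL if another backend registered first. */
		return frontswap_register_ops(&dummy_frontswap_ops);
	}
	module_init(dummy_frontswap_init);
	MODULE_LICENSE("GPL");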

/*
 * Enable/disable frontswap writethrough (see above).
 */
void frontswap_writethrough(bool enable)
{
	frontswap_writethrough_enabled = enable;
}
EXPORT_SYMBOL(frontswap_writethrough);

/*
 * Enable/disable frontswap exclusive gets (see above).
 */
void frontswap_tmem_exclusive_gets(bool enable)
{
	frontswap_tmem_exclusive_gets_enabled = enable;
}
EXPORT_SYMBOL(frontswap_tmem_exclusive_gets);

/*
 * Called when a swap device is swapon'd.
 */
void __frontswap_init(unsigned type, unsigned long *map)
void frontswap_init(unsigned type, unsigned long *map)
{
	struct swap_info_struct *sis = swap_info[type];
	struct frontswap_ops *ops;

	VM_BUG_ON(sis == NULL);

@@ -210,20 +125,16 @@ void __frontswap_init(unsigned type, unsigned long *map)
	 * p->frontswap set to something valid to work properly.
	 */
	frontswap_map_set(sis, map);

	for_each_frontswap_ops(ops)
		ops->init(type);
	frontswap_ops->init(type);
}
EXPORT_SYMBOL(__frontswap_init);

bool __frontswap_test(struct swap_info_struct *sis,
static bool __frontswap_test(struct swap_info_struct *sis,
				pgoff_t offset)
{
	if (sis->frontswap_map)
		return test_bit(offset, sis->frontswap_map);
	return false;
}
EXPORT_SYMBOL(__frontswap_test);

static inline void __frontswap_set(struct swap_info_struct *sis,
				   pgoff_t offset)
@@ -253,7 +164,6 @@ int __frontswap_store(struct page *page)
	int type = swp_type(entry);
	struct swap_info_struct *sis = swap_info[type];
	pgoff_t offset = swp_offset(entry);
	struct frontswap_ops *ops;

	VM_BUG_ON(!frontswap_ops);
	VM_BUG_ON(!PageLocked(page));
@@ -267,28 +177,19 @@ int __frontswap_store(struct page *page)
	 */
	if (__frontswap_test(sis, offset)) {
		__frontswap_clear(sis, offset);
		for_each_frontswap_ops(ops)
			ops->invalidate_page(type, offset);
		frontswap_ops->invalidate_page(type, offset);
	}

	/* Try to store in each implementation, until one succeeds. */
	for_each_frontswap_ops(ops) {
		ret = ops->store(type, offset, page);
		if (!ret) /* successful store */
			break;
	}
	ret = frontswap_ops->store(type, offset, page);
	if (ret == 0) {
		__frontswap_set(sis, offset);
		inc_frontswap_succ_stores();
	} else {
		inc_frontswap_failed_stores();
	}
	if (frontswap_writethrough_enabled)
		/* report failure so swap also writes to swap device */
		ret = -1;

	return ret;
}
EXPORT_SYMBOL(__frontswap_store);

/*
 * "Get" data from frontswap associated with swaptype and offset that were
@@ -302,7 +203,6 @@ int __frontswap_load(struct page *page)
	int type = swp_type(entry);
	struct swap_info_struct *sis = swap_info[type];
	pgoff_t offset = swp_offset(entry);
	struct frontswap_ops *ops;

	VM_BUG_ON(!frontswap_ops);
	VM_BUG_ON(!PageLocked(page));
@@ -312,21 +212,11 @@ int __frontswap_load(struct page *page)
		return -1;

	/* Try loading from each implementation, until one succeeds. */
	for_each_frontswap_ops(ops) {
		ret = ops->load(type, offset, page);
		if (!ret) /* successful load */
			break;
	}
	if (ret == 0) {
	ret = frontswap_ops->load(type, offset, page);
	if (ret == 0)
		inc_frontswap_loads();
		if (frontswap_tmem_exclusive_gets_enabled) {
			SetPageDirty(page);
			__frontswap_clear(sis, offset);
		}
	}
	return ret;
}
EXPORT_SYMBOL(__frontswap_load);

/*
 * Invalidate any data from frontswap associated with the specified swaptype
@@ -335,7 +225,6 @@ EXPORT_SYMBOL(__frontswap_load);
void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
{
	struct swap_info_struct *sis = swap_info[type];
	struct frontswap_ops *ops;

	VM_BUG_ON(!frontswap_ops);
	VM_BUG_ON(sis == NULL);
@@ -343,12 +232,10 @@ void __frontswap_invalidate_page(unsigned type, pgoff_t offset)
	if (!__frontswap_test(sis, offset))
		return;

	for_each_frontswap_ops(ops)
		ops->invalidate_page(type, offset);
	frontswap_ops->invalidate_page(type, offset);
	__frontswap_clear(sis, offset);
	inc_frontswap_invalidates();
}
EXPORT_SYMBOL(__frontswap_invalidate_page);

/*
 * Invalidate all data from frontswap associated with all offsets for the
@@ -357,7 +244,6 @@ EXPORT_SYMBOL(__frontswap_invalidate_page);
void __frontswap_invalidate_area(unsigned type)
{
	struct swap_info_struct *sis = swap_info[type];
	struct frontswap_ops *ops;

	VM_BUG_ON(!frontswap_ops);
	VM_BUG_ON(sis == NULL);
@@ -365,123 +251,10 @@ void __frontswap_invalidate_area(unsigned type)
	if (sis->frontswap_map == NULL)
		return;

	for_each_frontswap_ops(ops)
		ops->invalidate_area(type);
	frontswap_ops->invalidate_area(type);
	atomic_set(&sis->frontswap_pages, 0);
	bitmap_zero(sis->frontswap_map, sis->max);
}
EXPORT_SYMBOL(__frontswap_invalidate_area);

static unsigned long __frontswap_curr_pages(void)
{
	unsigned long totalpages = 0;
	struct swap_info_struct *si = NULL;

	assert_spin_locked(&swap_lock);
	plist_for_each_entry(si, &swap_active_head, list)
		totalpages += atomic_read(&si->frontswap_pages);
	return totalpages;
}

static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
					int *swapid)
{
	int ret = -EINVAL;
	struct swap_info_struct *si = NULL;
	int si_frontswap_pages;
	unsigned long total_pages_to_unuse = total;
	unsigned long pages = 0, pages_to_unuse = 0;

	assert_spin_locked(&swap_lock);
	plist_for_each_entry(si, &swap_active_head, list) {
		si_frontswap_pages = atomic_read(&si->frontswap_pages);
		if (total_pages_to_unuse < si_frontswap_pages) {
			pages = pages_to_unuse = total_pages_to_unuse;
		} else {
			pages = si_frontswap_pages;
			pages_to_unuse = 0; /* unuse all */
		}
		/* ensure there is enough RAM to fetch pages from frontswap */
		if (security_vm_enough_memory_mm(current->mm, pages)) {
			ret = -ENOMEM;
			continue;
		}
		vm_unacct_memory(pages);
		*unused = pages_to_unuse;
		*swapid = si->type;
		ret = 0;
		break;
	}

	return ret;
}

/*
 * Used to check if it's necessary and feasible to unuse pages.
 * Return 1 when nothing to do, 0 when need to shrink pages,
 * error code when there is an error.
 */
static int __frontswap_shrink(unsigned long target_pages,
				unsigned long *pages_to_unuse,
				int *type)
{
	unsigned long total_pages = 0, total_pages_to_unuse;

	assert_spin_locked(&swap_lock);

	total_pages = __frontswap_curr_pages();
	if (total_pages <= target_pages) {
		/* Nothing to do */
		*pages_to_unuse = 0;
		return 1;
	}
	total_pages_to_unuse = total_pages - target_pages;
	return __frontswap_unuse_pages(total_pages_to_unuse, pages_to_unuse, type);
}

/*
 * Frontswap, like a true swap device, may unnecessarily retain pages
 * under certain circumstances; "shrink" frontswap is essentially a
 * "partial swapoff" and works by calling try_to_unuse to attempt to
 * unuse enough frontswap pages to attempt to -- subject to memory
 * constraints -- reduce the number of pages in frontswap to the
 * number given in the parameter target_pages.
 */
void frontswap_shrink(unsigned long target_pages)
{
	unsigned long pages_to_unuse = 0;
	int type, ret;

	/*
	 * we don't want to hold swap_lock while doing a very
	 * lengthy try_to_unuse, but swap_list may change
	 * so restart scan from swap_active_head each time
	 */
	spin_lock(&swap_lock);
	ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
	spin_unlock(&swap_lock);
	if (ret == 0)
		try_to_unuse(type, true, pages_to_unuse);
	return;
}
EXPORT_SYMBOL(frontswap_shrink);

/*
 * Count and return the number of frontswap pages across all
 * swap devices.  This is exported so that backend drivers can
 * determine current usage without reading debugfs.
 */
unsigned long frontswap_curr_pages(void)
{
	unsigned long totalpages = 0;

	spin_lock(&swap_lock);
	totalpages = __frontswap_curr_pages();
	spin_unlock(&swap_lock);

	return totalpages;
}
EXPORT_SYMBOL(frontswap_curr_pages);

static int __init init_frontswap(void)
{
140 mm/gup.c
@@ -667,12 +667,17 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
	}
retry:
	if (!pmd_present(pmdval)) {
		/*
		 * Should never reach here, if thp migration is not supported;
		 * Otherwise, it must be a thp migration entry.
		 */
		VM_BUG_ON(!thp_migration_supported() ||
				  !is_pmd_migration_entry(pmdval));

		if (likely(!(flags & FOLL_MIGRATION)))
			return no_page_table(vma, flags);
		VM_BUG_ON(thp_migration_supported() &&
			  !is_pmd_migration_entry(pmdval));
		if (is_pmd_migration_entry(pmdval))
			pmd_migration_entry_wait(mm, pmd);

		pmd_migration_entry_wait(mm, pmd);
		pmdval = READ_ONCE(*pmd);
		/*
		 * MADV_DONTNEED may convert the pmd to null because
@@ -943,6 +948,8 @@ static int faultin_page(struct vm_area_struct *vma,
	/* mlock all present pages, but do not fault in new pages */
	if ((*flags & (FOLL_POPULATE | FOLL_MLOCK)) == FOLL_MLOCK)
		return -ENOENT;
	if (*flags & FOLL_NOFAULT)
		return -EFAULT;
	if (*flags & FOLL_WRITE)
		fault_flags |= FAULT_FLAG_WRITE;
	if (*flags & FOLL_REMOTE)
@@ -1681,6 +1688,124 @@ static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
}
#endif /* !CONFIG_MMU */

/**
 * fault_in_writeable - fault in userspace address range for writing
 * @uaddr: start of address range
 * @size: size of address range
 *
 * Returns the number of bytes not faulted in (like copy_to_user() and
 * copy_from_user()).
 */
size_t fault_in_writeable(char __user *uaddr, size_t size)
{
	char __user *start = uaddr, *end;

	if (unlikely(size == 0))
		return 0;
	if (!user_write_access_begin(uaddr, size))
		return size;
	if (!PAGE_ALIGNED(uaddr)) {
		unsafe_put_user(0, uaddr, out);
		uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr);
	}
	end = (char __user *)PAGE_ALIGN((unsigned long)start + size);
	if (unlikely(end < start))
		end = NULL;
	while (uaddr != end) {
		unsafe_put_user(0, uaddr, out);
		uaddr += PAGE_SIZE;
	}

out:
	user_write_access_end();
	if (size > uaddr - start)
		return size - (uaddr - start);
	return 0;
}
EXPORT_SYMBOL(fault_in_writeable);

/*
 * fault_in_safe_writeable - fault in an address range for writing
 * @uaddr: start of address range
 * @size: length of address range
 *
 * Faults in an address range for writing.  This is primarily useful when we
 * already know that some or all of the pages in the address range aren't in
 * memory.
 *
 * Unlike fault_in_writeable(), this function is non-destructive.
 *
 * Note that we don't pin or otherwise hold the pages referenced that we fault
 * in.  There's no guarantee that they'll stay in memory for any duration of
 * time.
 *
 * Returns the number of bytes not faulted in, like copy_to_user() and
 * copy_from_user().
 */
size_t fault_in_safe_writeable(const char __user *uaddr, size_t size)
{
	unsigned long start = (unsigned long)uaddr, end;
	struct mm_struct *mm = current->mm;
	bool unlocked = false;

	if (unlikely(size == 0))
		return 0;
	end = PAGE_ALIGN(start + size);
	if (end < start)
		end = 0;

	mmap_read_lock(mm);
	do {
		if (fixup_user_fault(mm, start, FAULT_FLAG_WRITE, &unlocked))
			break;
		start = (start + PAGE_SIZE) & PAGE_MASK;
	} while (start != end);
	mmap_read_unlock(mm);

	if (size > (unsigned long)uaddr - start)
		return size - ((unsigned long)uaddr - start);
	return 0;
}
EXPORT_SYMBOL(fault_in_safe_writeable);

/**
 * fault_in_readable - fault in userspace address range for reading
 * @uaddr: start of user address range
 * @size: size of user address range
 *
 * Returns the number of bytes not faulted in (like copy_to_user() and
 * copy_from_user()).
 */
size_t fault_in_readable(const char __user *uaddr, size_t size)
{
	const char __user *start = uaddr, *end;
	volatile char c;

	if (unlikely(size == 0))
		return 0;
	if (!user_read_access_begin(uaddr, size))
		return size;
	if (!PAGE_ALIGNED(uaddr)) {
		unsafe_get_user(c, uaddr, out);
		uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr);
	}
	end = (const char __user *)PAGE_ALIGN((unsigned long)start + size);
	if (unlikely(end < start))
		end = NULL;
	while (uaddr != end) {
		unsafe_get_user(c, uaddr, out);
		uaddr += PAGE_SIZE;
	}

out:
	user_read_access_end();
	(void)c;
	if (size > uaddr - start)
		return size - (uaddr - start);
	return 0;
}
EXPORT_SYMBOL(fault_in_readable);
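As the kerneldoc above notes, the new fault_in_* helpers return the number of bytes they could NOT fault in, mirroring copy_to_user()/copy_from_user(). The sketch below is not part of this patch; it only illustrates the pre-fault-then-copy pattern such helpers are typically paired with, and copy_out_prefaulted() is an invented name for the example.

	/* Hedged sketch: pre-fault a user buffer before copying to it.
	 * Returns bytes copied, or -EFAULT if the destination could not be
	 * faulted in or the copy itself failed.
	 */
	static ssize_t copy_out_prefaulted(char __user *ubuf, const char *kbuf,
					   size_t len)
	{
		/* fault_in_writeable() returns the bytes NOT faulted in. */
		if (fault_in_writeable(ubuf, len) == len)
			return -EFAULT;

		if (copy_to_user(ubuf, kbuf, len))
			return -EFAULT;

		return len;
	}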

/**
 * get_dump_page() - pin user page in memory while writing it to core dump
 * @addr: user address
 *
@@ -2253,7 +2378,6 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
{
	int nr_start = *nr;
	struct dev_pagemap *pgmap = NULL;
	int ret = 1;

	do {
		struct page *page = pfn_to_page(pfn);
@@ -2261,14 +2385,12 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
		pgmap = get_dev_pagemap(pfn, pgmap);
		if (unlikely(!pgmap)) {
			undo_dev_pagemap(nr, nr_start, flags, pages);
			ret = 0;
			break;
		}
		SetPageReferenced(page);
		pages[*nr] = page;
		if (unlikely(!try_grab_page(page, flags))) {
			undo_dev_pagemap(nr, nr_start, flags, pages);
			ret = 0;
			break;
		}
		(*nr)++;
@@ -2276,7 +2398,7 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
	} while (addr += PAGE_SIZE, addr != end);

	put_dev_pagemap(pgmap);
	return ret;
	return addr == end;
}

static int __gup_device_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
@@ -2733,7 +2855,7 @@ static int internal_get_user_pages_fast(unsigned long start,

	if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
				       FOLL_FORCE | FOLL_PIN | FOLL_GET |
				       FOLL_FAST_ONLY)))
				       FOLL_FAST_ONLY | FOLL_NOFAULT)))
		return -EINVAL;

	if (gup_flags & FOLL_PIN)

@@ -23,7 +23,6 @@
#include <linux/bio.h>
#include <linux/pagemap.h>
#include <linux/mempool.h>
#include <linux/blkdev.h>
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/highmem.h>
@@ -360,7 +359,6 @@ void kunmap_high(struct page *page)
}
EXPORT_SYMBOL(kunmap_high);

#ifdef CONFIG_TRANSPARENT_HUGEPAGE
void zero_user_segments(struct page *page, unsigned start1, unsigned end1,
		unsigned start2, unsigned end2)
{
@@ -383,7 +381,7 @@ void zero_user_segments(struct page *page, unsigned start1, unsigned end1,
		unsigned this_end = min_t(unsigned, end1, PAGE_SIZE);

		if (end1 > start1) {
			kaddr = kmap_atomic(page + i);
			kaddr = kmap_local_page(page + i);
			memset(kaddr + start1, 0, this_end - start1);
		}
		end1 -= this_end;
@@ -398,7 +396,7 @@ void zero_user_segments(struct page *page, unsigned start1, unsigned end1,

		if (end2 > start2) {
			if (!kaddr)
				kaddr = kmap_atomic(page + i);
				kaddr = kmap_local_page(page + i);
			memset(kaddr + start2, 0, this_end - start2);
		}
		end2 -= this_end;
@@ -406,7 +404,7 @@ void zero_user_segments(struct page *page, unsigned start1, unsigned end1,
		}

		if (kaddr) {
			kunmap_atomic(kaddr);
			kunmap_local(kaddr);
			flush_dcache_page(page + i);
		}

@@ -417,7 +415,6 @@ void zero_user_segments(struct page *page, unsigned start1, unsigned end1,
	BUG_ON((start1 | start2 | end1 | end2) != 0);
}
EXPORT_SYMBOL(zero_user_segments);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif /* CONFIG_HIGHMEM */

#ifdef CONFIG_KMAP_LOCAL
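The hunks above switch zero_user_segments() from kmap_atomic()/kunmap_atomic() to kmap_local_page()/kunmap_local(), which give a CPU-local, strictly nested mapping without the atomic-context restrictions. A hedged, minimal sketch of that idiom (zero_first_bytes() is an illustrative helper, not kernel code):

	#include <linux/highmem.h>
	#include <linux/mm.h>

	/* Hedged sketch of the kmap_local_page() pattern: map, touch, unmap. */
	static void zero_first_bytes(struct page *page, unsigned int n)
	{
		void *kaddr = kmap_local_page(page);	/* per-CPU local mapping */

		memset(kaddr, 0, min_t(unsigned int, n, PAGE_SIZE));
		kunmap_local(kaddr);			/* unmap in LIFO order */
		flush_dcache_page(page);
	}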
@@ -603,7 +603,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,

    VM_BUG_ON_PAGE(!PageCompound(page), page);

-    if (mem_cgroup_charge(page, vma->vm_mm, gfp)) {
+    if (mem_cgroup_charge(page_folio(page), vma->vm_mm, gfp)) {
        put_page(page);
        count_vm_event(THP_FAULT_FALLBACK);
        count_vm_event(THP_FAULT_FALLBACK_CHARGE);
@@ -1322,7 +1322,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
     * We can only reuse the page if nobody else maps the huge page or it's
     * part.
     */
-    if (reuse_swap_page(page, NULL)) {
+    if (reuse_swap_page(page)) {
        pmd_t entry;
        entry = pmd_mkyoung(orig_pmd);
        entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
@@ -2405,7 +2405,8 @@ static void __split_huge_page_tail(struct page *head, int tail,
static void __split_huge_page(struct page *page, struct list_head *list,
        pgoff_t end)
{
-    struct page *head = compound_head(page);
+    struct folio *folio = page_folio(page);
+    struct page *head = &folio->page;
    struct lruvec *lruvec;
    struct address_space *swap_cache = NULL;
    unsigned long offset = 0;
@@ -2424,7 +2425,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
    }

    /* lock lru list/PageCompound, ref frozen by page_ref_freeze */
-    lruvec = lock_page_lruvec(head);
+    lruvec = folio_lruvec_lock(folio);

    ClearPageHasHWPoisoned(head);

@@ -2541,38 +2542,28 @@ int total_mapcount(struct page *page)
 * need full accuracy to avoid breaking page pinning, because
 * page_trans_huge_mapcount() is slower than page_mapcount().
 */
-int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
+int page_trans_huge_mapcount(struct page *page)
{
-    int i, ret, _total_mapcount, mapcount;
+    int i, ret;

    /* hugetlbfs shouldn't call it */
    VM_BUG_ON_PAGE(PageHuge(page), page);

-    if (likely(!PageTransCompound(page))) {
-        mapcount = atomic_read(&page->_mapcount) + 1;
-        if (total_mapcount)
-            *total_mapcount = mapcount;
-        return mapcount;
-    }
+    if (likely(!PageTransCompound(page)))
+        return atomic_read(&page->_mapcount) + 1;

    page = compound_head(page);

-    _total_mapcount = ret = 0;
+    ret = 0;
    for (i = 0; i < thp_nr_pages(page); i++) {
-        mapcount = atomic_read(&page[i]._mapcount) + 1;
+        int mapcount = atomic_read(&page[i]._mapcount) + 1;
        ret = max(ret, mapcount);
-        _total_mapcount += mapcount;
    }
-    if (PageDoubleMap(page)) {
+
+    if (PageDoubleMap(page))
        ret -= 1;
-        _total_mapcount -= thp_nr_pages(page);
-    }
-    mapcount = compound_mapcount(page);
-    ret += mapcount;
-    _total_mapcount += mapcount;
-    if (total_mapcount)
-        *total_mapcount = _total_mapcount;
-    return ret;
+
+    return ret + compound_mapcount(page);
}

/* Racy check whether the huge page can be split */
@@ -2613,6 +2604,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
{
    struct page *head = compound_head(page);
    struct deferred_split *ds_queue = get_deferred_split_queue(head);
+    XA_STATE(xas, &head->mapping->i_pages, head->index);
    struct anon_vma *anon_vma = NULL;
    struct address_space *mapping = NULL;
    int extra_pins, ret;
@@ -2651,6 +2643,13 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
            goto out;
        }

+        xas_split_alloc(&xas, head, compound_order(head),
+                mapping_gfp_mask(mapping) & GFP_RECLAIM_MASK);
+        if (xas_error(&xas)) {
+            ret = xas_error(&xas);
+            goto out;
+        }
+
        anon_vma = NULL;
        i_mmap_lock_read(mapping);

@@ -2680,13 +2679,12 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
    /* block interrupt reentry in xa_lock and spinlock */
    local_irq_disable();
    if (mapping) {
-        XA_STATE(xas, &mapping->i_pages, page_index(head));
-
        /*
         * Check if the head page is present in page cache.
         * We assume all tail are present too, if head is there.
         */
-        xa_lock(&mapping->i_pages);
+        xas_lock(&xas);
+        xas_reset(&xas);
        if (xas_load(&xas) != head)
            goto fail;
    }
@@ -2702,6 +2700,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
        if (mapping) {
            int nr = thp_nr_pages(head);

+            xas_split(&xas, head, thp_order(head));
            if (PageSwapBacked(head)) {
                __mod_lruvec_page_state(head, NR_SHMEM_THPS,
                            -nr);
@@ -2718,7 +2717,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
        spin_unlock(&ds_queue->split_queue_lock);
fail:
        if (mapping)
-            xa_unlock(&mapping->i_pages);
+            xas_unlock(&xas);
        local_irq_enable();
        remap_page(head, thp_nr_pages(head));
        ret = -EBUSY;
@@ -2732,6 +2731,8 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
    if (mapping)
        i_mmap_unlock_read(mapping);
out:
+    /* Free any memory we didn't use */
+    xas_nomem(&xas, 0);
    count_vm_event(!ret ? THP_SPLIT_PAGE : THP_SPLIT_PAGE_FAILED);
    return ret;
}
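The split_huge_page_to_list() hunks above move the XA_STATE to function scope so that node memory for splitting the multi-index page-cache entry can be allocated before any locks are taken; the split itself is then committed under xas_lock(). A hedged sketch of that allocate-then-commit pattern follows; the function name and parameters are illustrative, the xas_*() calls mirror those in the diff.

#include <linux/gfp.h>
#include <linux/xarray.h>

/* Sketch, not the kernel function itself. */
static int demo_split_entry(struct xarray *xa, unsigned long index,
                            void *entry, unsigned int order, gfp_t gfp)
{
    XA_STATE(xas, xa, index);

    xas_split_alloc(&xas, entry, order, gfp);   /* may sleep, may fail */
    if (xas_error(&xas))
        return xas_error(&xas);

    xas_lock(&xas);
    xas_split(&xas, entry, order);              /* uses pre-allocated nodes */
    xas_unlock(&xas);

    xas_nomem(&xas, 0);                         /* free anything unused */
    return 0;
}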
mm/hugetlb.c (708 lines changed): file diff suppressed because it is too large.
@ -27,9 +27,6 @@
|
||||
#define MEMFILE_IDX(val) (((val) >> 16) & 0xffff)
|
||||
#define MEMFILE_ATTR(val) ((val) & 0xffff)
|
||||
|
||||
#define hugetlb_cgroup_from_counter(counter, idx) \
|
||||
container_of(counter, struct hugetlb_cgroup, hugepage[idx])
|
||||
|
||||
static struct hugetlb_cgroup *root_h_cgroup __read_mostly;
|
||||
|
||||
static inline struct page_counter *
|
||||
@ -126,29 +123,58 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
|
||||
}
|
||||
}
|
||||
|
||||
static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup)
|
||||
{
|
||||
int node;
|
||||
|
||||
for_each_node(node)
|
||||
kfree(h_cgroup->nodeinfo[node]);
|
||||
kfree(h_cgroup);
|
||||
}
|
||||
|
||||
static struct cgroup_subsys_state *
|
||||
hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
|
||||
{
|
||||
struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css);
|
||||
struct hugetlb_cgroup *h_cgroup;
|
||||
int node;
|
||||
|
||||
h_cgroup = kzalloc(struct_size(h_cgroup, nodeinfo, nr_node_ids),
|
||||
GFP_KERNEL);
|
||||
|
||||
h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL);
|
||||
if (!h_cgroup)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
if (!parent_h_cgroup)
|
||||
root_h_cgroup = h_cgroup;
|
||||
|
||||
/*
|
||||
* TODO: this routine can waste much memory for nodes which will
|
||||
* never be onlined. It's better to use memory hotplug callback
|
||||
* function.
|
||||
*/
|
||||
for_each_node(node) {
|
||||
/* Set node_to_alloc to -1 for offline nodes. */
|
||||
int node_to_alloc =
|
||||
node_state(node, N_NORMAL_MEMORY) ? node : -1;
|
||||
h_cgroup->nodeinfo[node] =
|
||||
kzalloc_node(sizeof(struct hugetlb_cgroup_per_node),
|
||||
GFP_KERNEL, node_to_alloc);
|
||||
if (!h_cgroup->nodeinfo[node])
|
||||
goto fail_alloc_nodeinfo;
|
||||
}
|
||||
|
||||
hugetlb_cgroup_init(h_cgroup, parent_h_cgroup);
|
||||
return &h_cgroup->css;
|
||||
|
||||
fail_alloc_nodeinfo:
|
||||
hugetlb_cgroup_free(h_cgroup);
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
|
||||
static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
|
||||
{
|
||||
struct hugetlb_cgroup *h_cgroup;
|
||||
|
||||
h_cgroup = hugetlb_cgroup_from_css(css);
|
||||
kfree(h_cgroup);
|
||||
hugetlb_cgroup_free(hugetlb_cgroup_from_css(css));
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -292,7 +318,17 @@ static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
        return;

    __set_hugetlb_cgroup(page, h_cg, rsvd);
-    return;
+    if (!rsvd) {
+        unsigned long usage =
+            h_cg->nodeinfo[page_to_nid(page)]->usage[idx];
+        /*
+         * This write is not atomic due to fetching usage and writing
+         * to it, but that's fine because we call this with
+         * hugetlb_lock held anyway.
+         */
+        WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx],
+               usage + nr_pages);
+    }
}

void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
@@ -331,8 +367,17 @@ static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,

    if (rsvd)
        css_put(&h_cg->css);
-
-    return;
+    else {
+        unsigned long usage =
+            h_cg->nodeinfo[page_to_nid(page)]->usage[idx];
+        /*
+         * This write is not atomic due to fetching usage and writing
+         * to it, but that's fine because we call this with
+         * hugetlb_lock held anyway.
+         */
+        WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx],
+               usage - nr_pages);
+    }
}

void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
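The two hunks above add per-node hugetlb usage accounting as a plain read-modify-write wrapped in WRITE_ONCE(), which is safe only because hugetlb_lock serializes all writers, while the numa_stat readers added later in this diff sample the counters locklessly with READ_ONCE(). A standalone model of that pattern, using C11 relaxed atomics in place of the kernel macros, is sketched below; all names are illustrative.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NR_NODES 2

static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER; /* stands in for hugetlb_lock */
static atomic_ulong usage[NR_NODES];

static void demo_commit_charge(int nid, unsigned long nr_pages)
{
    pthread_mutex_lock(&demo_lock);
    /* Not an atomic RMW: correct only because every writer holds demo_lock. */
    unsigned long cur = atomic_load_explicit(&usage[nid], memory_order_relaxed);
    atomic_store_explicit(&usage[nid], cur + nr_pages, memory_order_relaxed);
    pthread_mutex_unlock(&demo_lock);
}

static unsigned long demo_read_usage(int nid)
{
    /* Lockless reader, like the numa_stat file: may race, never tears. */
    return atomic_load_explicit(&usage[nid], memory_order_relaxed);
}

int main(void)
{
    demo_commit_charge(0, 512);
    printf("node 0 usage: %lu pages\n", demo_read_usage(0));
    return 0;
}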
@ -421,6 +466,59 @@ enum {
|
||||
RES_RSVD_FAILCNT,
|
||||
};
|
||||
|
||||
static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy)
|
||||
{
|
||||
int nid;
|
||||
struct cftype *cft = seq_cft(seq);
|
||||
int idx = MEMFILE_IDX(cft->private);
|
||||
bool legacy = MEMFILE_ATTR(cft->private);
|
||||
struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
|
||||
struct cgroup_subsys_state *css;
|
||||
unsigned long usage;
|
||||
|
||||
if (legacy) {
|
||||
/* Add up usage across all nodes for the non-hierarchical total. */
|
||||
usage = 0;
|
||||
for_each_node_state(nid, N_MEMORY)
|
||||
usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]);
|
||||
seq_printf(seq, "total=%lu", usage * PAGE_SIZE);
|
||||
|
||||
/* Simply print the per-node usage for the non-hierarchical total. */
|
||||
for_each_node_state(nid, N_MEMORY)
|
||||
seq_printf(seq, " N%d=%lu", nid,
|
||||
READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) *
|
||||
PAGE_SIZE);
|
||||
seq_putc(seq, '\n');
|
||||
}
|
||||
|
||||
/*
|
||||
* The hierarchical total is pretty much the value recorded by the
|
||||
* counter, so use that.
|
||||
*/
|
||||
seq_printf(seq, "%stotal=%lu", legacy ? "hierarchical_" : "",
|
||||
page_counter_read(&h_cg->hugepage[idx]) * PAGE_SIZE);
|
||||
|
||||
/*
|
||||
* For each node, transverse the css tree to obtain the hierarchical
|
||||
* node usage.
|
||||
*/
|
||||
for_each_node_state(nid, N_MEMORY) {
|
||||
usage = 0;
|
||||
rcu_read_lock();
|
||||
css_for_each_descendant_pre(css, &h_cg->css) {
|
||||
usage += READ_ONCE(hugetlb_cgroup_from_css(css)
|
||||
->nodeinfo[nid]
|
||||
->usage[idx]);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
seq_printf(seq, " N%d=%lu", nid, usage * PAGE_SIZE);
|
||||
}
|
||||
|
||||
seq_putc(seq, '\n');
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
|
||||
struct cftype *cft)
|
||||
{
|
||||
@ -671,8 +769,14 @@ static void __init __hugetlb_cgroup_file_dfl_init(int idx)
|
||||
events_local_file[idx]);
|
||||
cft->flags = CFTYPE_NOT_ON_ROOT;
|
||||
|
||||
/* NULL terminate the last cft */
|
||||
/* Add the numa stat file */
|
||||
cft = &h->cgroup_files_dfl[6];
|
||||
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf);
|
||||
cft->seq_show = hugetlb_cgroup_read_numa_stat;
|
||||
cft->flags = CFTYPE_NOT_ON_ROOT;
|
||||
|
||||
/* NULL terminate the last cft */
|
||||
cft = &h->cgroup_files_dfl[7];
|
||||
memset(cft, 0, sizeof(*cft));
|
||||
|
||||
WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys,
|
||||
@ -742,8 +846,14 @@ static void __init __hugetlb_cgroup_file_legacy_init(int idx)
|
||||
cft->write = hugetlb_cgroup_reset;
|
||||
cft->read_u64 = hugetlb_cgroup_read_u64;
|
||||
|
||||
/* NULL terminate the last cft */
|
||||
/* Add the numa stat file */
|
||||
cft = &h->cgroup_files_legacy[8];
|
||||
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf);
|
||||
cft->private = MEMFILE_PRIVATE(idx, 1);
|
||||
cft->seq_show = hugetlb_cgroup_read_numa_stat;
|
||||
|
||||
/* NULL terminate the last cft */
|
||||
cft = &h->cgroup_files_legacy[9];
|
||||
memset(cft, 0, sizeof(*cft));
|
||||
|
||||
WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys,
|
||||
|
@ -12,6 +12,8 @@
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/tracepoint-defs.h>
|
||||
|
||||
struct folio_batch;
|
||||
|
||||
/*
|
||||
* The set of flags that only affect watermark checking and reclaim
|
||||
* behaviour. This is used by the MM to obey the caller constraints
|
||||
@ -21,7 +23,7 @@
|
||||
#define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\
|
||||
__GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\
|
||||
__GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\
|
||||
__GFP_ATOMIC)
|
||||
__GFP_ATOMIC|__GFP_NOLOCKDEP)
|
||||
|
||||
/* The GFP flags allowed during early boot */
|
||||
#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS))
|
||||
@ -34,16 +36,47 @@
|
||||
|
||||
void page_writeback_init(void);
|
||||
|
||||
static inline void *folio_raw_mapping(struct folio *folio)
|
||||
{
|
||||
unsigned long mapping = (unsigned long)folio->mapping;
|
||||
|
||||
return (void *)(mapping & ~PAGE_MAPPING_FLAGS);
|
||||
}
|
||||
|
||||
void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
|
||||
int nr_throttled);
|
||||
static inline void acct_reclaim_writeback(struct folio *folio)
|
||||
{
|
||||
pg_data_t *pgdat = folio_pgdat(folio);
|
||||
int nr_throttled = atomic_read(&pgdat->nr_writeback_throttled);
|
||||
|
||||
if (nr_throttled)
|
||||
__acct_reclaim_writeback(pgdat, folio, nr_throttled);
|
||||
}
|
||||
|
||||
static inline void wake_throttle_isolated(pg_data_t *pgdat)
|
||||
{
|
||||
wait_queue_head_t *wqh;
|
||||
|
||||
wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_ISOLATED];
|
||||
if (waitqueue_active(wqh))
|
||||
wake_up(wqh);
|
||||
}
|
||||
|
||||
vm_fault_t do_swap_page(struct vm_fault *vmf);
|
||||
void folio_rotate_reclaimable(struct folio *folio);
|
||||
bool __folio_end_writeback(struct folio *folio);
|
||||
|
||||
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
|
||||
unsigned long floor, unsigned long ceiling);
|
||||
void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte);
|
||||
|
||||
static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
|
||||
{
|
||||
return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP));
|
||||
}
|
||||
|
||||
struct zap_details;
|
||||
void unmap_page_range(struct mmu_gather *tlb,
|
||||
struct vm_area_struct *vma,
|
||||
unsigned long addr, unsigned long end,
|
||||
@ -60,20 +93,37 @@ static inline void force_page_cache_readahead(struct address_space *mapping,
|
||||
}
|
||||
|
||||
unsigned find_lock_entries(struct address_space *mapping, pgoff_t start,
|
||||
pgoff_t end, struct pagevec *pvec, pgoff_t *indices);
|
||||
pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
|
||||
unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
|
||||
pgoff_t end, struct folio_batch *fbatch, pgoff_t *indices);
|
||||
void filemap_free_folio(struct address_space *mapping, struct folio *folio);
|
||||
int truncate_inode_folio(struct address_space *mapping, struct folio *folio);
|
||||
bool truncate_inode_partial_folio(struct folio *folio, loff_t start,
|
||||
loff_t end);
|
||||
|
||||
/**
|
||||
* page_evictable - test whether a page is evictable
|
||||
* @page: the page to test
|
||||
* folio_evictable - Test whether a folio is evictable.
|
||||
* @folio: The folio to test.
|
||||
*
|
||||
* Test whether page is evictable--i.e., should be placed on active/inactive
|
||||
* lists vs unevictable list.
|
||||
*
|
||||
* Reasons page might not be evictable:
|
||||
* (1) page's mapping marked unevictable
|
||||
* (2) page is part of an mlocked VMA
|
||||
* Test whether @folio is evictable -- i.e., should be placed on
|
||||
* active/inactive lists vs unevictable list.
|
||||
*
|
||||
* Reasons folio might not be evictable:
|
||||
* 1. folio's mapping marked unevictable
|
||||
* 2. One of the pages in the folio is part of an mlocked VMA
|
||||
*/
|
||||
static inline bool folio_evictable(struct folio *folio)
|
||||
{
|
||||
bool ret;
|
||||
|
||||
/* Prevent address_space of inode and swap cache from being freed */
|
||||
rcu_read_lock();
|
||||
ret = !mapping_unevictable(folio_mapping(folio)) &&
|
||||
!folio_test_mlocked(folio);
|
||||
rcu_read_unlock();
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline bool page_evictable(struct page *page)
|
||||
{
|
||||
bool ret;
|
||||
@ -109,17 +159,13 @@ extern unsigned long highest_memmap_pfn;
|
||||
*/
|
||||
extern int isolate_lru_page(struct page *page);
|
||||
extern void putback_lru_page(struct page *page);
|
||||
extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason);
|
||||
|
||||
/*
|
||||
* in mm/rmap.c:
|
||||
*/
|
||||
extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
|
||||
|
||||
/*
|
||||
* in mm/memcontrol.c:
|
||||
*/
|
||||
extern bool cgroup_memory_nokmem;
|
||||
|
||||
/*
|
||||
* in mm/page_alloc.c
|
||||
*/
|
||||
@ -346,6 +392,7 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma);
|
||||
|
||||
#ifdef CONFIG_MMU
|
||||
void unmap_mapping_folio(struct folio *folio);
|
||||
extern long populate_vma_page_range(struct vm_area_struct *vma,
|
||||
unsigned long start, unsigned long end, int *locked);
|
||||
extern long faultin_vma_page_range(struct vm_area_struct *vma,
|
||||
@ -449,8 +496,8 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
|
||||
}
|
||||
return fpin;
|
||||
}
|
||||
|
||||
#else /* !CONFIG_MMU */
|
||||
static inline void unmap_mapping_folio(struct folio *folio) { }
|
||||
static inline void clear_page_mlock(struct page *page) { }
|
||||
static inline void mlock_vma_page(struct page *page) { }
|
||||
static inline void vunmap_range_noflush(unsigned long start, unsigned long end)
|
||||
|
@@ -30,20 +30,19 @@
#include "kasan.h"
#include "../slab.h"

-depot_stack_handle_t kasan_save_stack(gfp_t flags)
+depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc)
{
    unsigned long entries[KASAN_STACK_DEPTH];
    unsigned int nr_entries;

    nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
-    nr_entries = filter_irq_stacks(entries, nr_entries);
-    return stack_depot_save(entries, nr_entries, flags);
+    return __stack_depot_save(entries, nr_entries, flags, can_alloc);
}

void kasan_set_track(struct kasan_track *track, gfp_t flags)
{
    track->pid = current->pid;
-    track->stack = kasan_save_stack(flags);
+    track->stack = kasan_save_stack(flags, true);
}

#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
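kasan_save_stack() now takes a can_alloc hint and forwards it to __stack_depot_save(), so callers running where allocating new depot pool space is not allowed can still try to record a stack and simply get a zero handle back; the dropped filter_irq_stacks() call suggests the IRQ-frame filtering now happens inside the depot. A minimal sketch with a hypothetical caller name and a local depth constant:

#include <linux/gfp.h>
#include <linux/stackdepot.h>
#include <linux/stacktrace.h>

#define DEMO_STACK_DEPTH 64

static depot_stack_handle_t demo_save_current_stack(bool can_alloc)
{
    unsigned long entries[DEMO_STACK_DEPTH];
    unsigned int nr_entries;

    /* Capture the current call chain, then intern it in the stack depot. */
    nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
    return __stack_depot_save(entries, nr_entries, GFP_NOWAIT, can_alloc);
}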
@ -247,8 +246,9 @@ struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache,
|
||||
}
|
||||
#endif
|
||||
|
||||
void __kasan_poison_slab(struct page *page)
|
||||
void __kasan_poison_slab(struct slab *slab)
|
||||
{
|
||||
struct page *page = slab_page(slab);
|
||||
unsigned long i;
|
||||
|
||||
for (i = 0; i < compound_nr(page); i++)
|
||||
@ -298,7 +298,7 @@ static inline u8 assign_tag(struct kmem_cache *cache,
|
||||
/* For caches that either have a constructor or SLAB_TYPESAFE_BY_RCU: */
|
||||
#ifdef CONFIG_SLAB
|
||||
/* For SLAB assign tags based on the object index in the freelist. */
|
||||
return (u8)obj_to_index(cache, virt_to_page(object), (void *)object);
|
||||
return (u8)obj_to_index(cache, virt_to_slab(object), (void *)object);
|
||||
#else
|
||||
/*
|
||||
* For SLUB assign a random tag during slab creation, otherwise reuse
|
||||
@ -341,7 +341,7 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object,
|
||||
if (is_kfence_address(object))
|
||||
return false;
|
||||
|
||||
if (unlikely(nearest_obj(cache, virt_to_head_page(object), object) !=
|
||||
if (unlikely(nearest_obj(cache, virt_to_slab(object), object) !=
|
||||
object)) {
|
||||
kasan_report_invalid_free(tagged_object, ip);
|
||||
return true;
|
||||
@ -401,9 +401,9 @@ void __kasan_kfree_large(void *ptr, unsigned long ip)
|
||||
|
||||
void __kasan_slab_free_mempool(void *ptr, unsigned long ip)
|
||||
{
|
||||
struct page *page;
|
||||
struct folio *folio;
|
||||
|
||||
page = virt_to_head_page(ptr);
|
||||
folio = virt_to_folio(ptr);
|
||||
|
||||
/*
|
||||
* Even though this function is only called for kmem_cache_alloc and
|
||||
@ -411,12 +411,14 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip)
|
||||
* !PageSlab() when the size provided to kmalloc is larger than
|
||||
* KMALLOC_MAX_SIZE, and kmalloc falls back onto page_alloc.
|
||||
*/
|
||||
if (unlikely(!PageSlab(page))) {
|
||||
if (unlikely(!folio_test_slab(folio))) {
|
||||
if (____kasan_kfree_large(ptr, ip))
|
||||
return;
|
||||
kasan_poison(ptr, page_size(page), KASAN_FREE_PAGE, false);
|
||||
kasan_poison(ptr, folio_size(folio), KASAN_FREE_PAGE, false);
|
||||
} else {
|
||||
____kasan_slab_free(page->slab_cache, ptr, ip, false, false);
|
||||
struct slab *slab = folio_slab(folio);
|
||||
|
||||
____kasan_slab_free(slab->slab_cache, ptr, ip, false, false);
|
||||
}
|
||||
}
|
||||
|
||||
@ -560,7 +562,7 @@ void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size,
|
||||
|
||||
void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flags)
|
||||
{
|
||||
struct page *page;
|
||||
struct slab *slab;
|
||||
|
||||
if (unlikely(object == ZERO_SIZE_PTR))
|
||||
return (void *)object;
|
||||
@ -572,13 +574,13 @@ void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flag
|
||||
*/
|
||||
kasan_unpoison(object, size, false);
|
||||
|
||||
page = virt_to_head_page(object);
|
||||
slab = virt_to_slab(object);
|
||||
|
||||
/* Piggy-back on kmalloc() instrumentation to poison the redzone. */
|
||||
if (unlikely(!PageSlab(page)))
|
||||
if (unlikely(!slab))
|
||||
return __kasan_kmalloc_large(object, size, flags);
|
||||
else
|
||||
return ____kasan_kmalloc(page->slab_cache, object, size, flags);
|
||||
return ____kasan_kmalloc(slab->slab_cache, object, size, flags);
|
||||
}
|
||||
|
||||
bool __kasan_check_byte(const void *address, unsigned long ip)
|
||||
|
@@ -328,24 +328,34 @@ DEFINE_ASAN_SET_SHADOW(f3);
DEFINE_ASAN_SET_SHADOW(f5);
DEFINE_ASAN_SET_SHADOW(f8);

-void kasan_record_aux_stack(void *addr)
+static void __kasan_record_aux_stack(void *addr, bool can_alloc)
{
-    struct page *page = kasan_addr_to_page(addr);
+    struct slab *slab = kasan_addr_to_slab(addr);
    struct kmem_cache *cache;
    struct kasan_alloc_meta *alloc_meta;
    void *object;

-    if (is_kfence_address(addr) || !(page && PageSlab(page)))
+    if (is_kfence_address(addr) || !slab)
        return;

-    cache = page->slab_cache;
-    object = nearest_obj(cache, page, addr);
+    cache = slab->slab_cache;
+    object = nearest_obj(cache, slab, addr);
    alloc_meta = kasan_get_alloc_meta(cache, object);
    if (!alloc_meta)
        return;

    alloc_meta->aux_stack[1] = alloc_meta->aux_stack[0];
-    alloc_meta->aux_stack[0] = kasan_save_stack(GFP_NOWAIT);
+    alloc_meta->aux_stack[0] = kasan_save_stack(GFP_NOWAIT, can_alloc);
}

+void kasan_record_aux_stack(void *addr)
+{
+    return __kasan_record_aux_stack(addr, true);
+}
+
+void kasan_record_aux_stack_noalloc(void *addr)
+{
+    return __kasan_record_aux_stack(addr, false);
+}
+
void kasan_set_free_info(struct kmem_cache *cache,
@ -29,6 +29,7 @@ enum kasan_arg_mode {
|
||||
KASAN_ARG_MODE_DEFAULT,
|
||||
KASAN_ARG_MODE_SYNC,
|
||||
KASAN_ARG_MODE_ASYNC,
|
||||
KASAN_ARG_MODE_ASYMM,
|
||||
};
|
||||
|
||||
enum kasan_arg_stacktrace {
|
||||
@ -45,9 +46,9 @@ static enum kasan_arg_stacktrace kasan_arg_stacktrace __ro_after_init;
|
||||
DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled);
|
||||
EXPORT_SYMBOL(kasan_flag_enabled);
|
||||
|
||||
/* Whether the asynchronous mode is enabled. */
|
||||
bool kasan_flag_async __ro_after_init;
|
||||
EXPORT_SYMBOL_GPL(kasan_flag_async);
|
||||
/* Whether the selected mode is synchronous/asynchronous/asymmetric.*/
|
||||
enum kasan_mode kasan_mode __ro_after_init;
|
||||
EXPORT_SYMBOL_GPL(kasan_mode);
|
||||
|
||||
/* Whether to collect alloc/free stack traces. */
|
||||
DEFINE_STATIC_KEY_FALSE(kasan_flag_stacktrace);
|
||||
@ -69,7 +70,7 @@ static int __init early_kasan_flag(char *arg)
|
||||
}
|
||||
early_param("kasan", early_kasan_flag);
|
||||
|
||||
/* kasan.mode=sync/async */
|
||||
/* kasan.mode=sync/async/asymm */
|
||||
static int __init early_kasan_mode(char *arg)
|
||||
{
|
||||
if (!arg)
|
||||
@ -79,6 +80,8 @@ static int __init early_kasan_mode(char *arg)
|
||||
kasan_arg_mode = KASAN_ARG_MODE_SYNC;
|
||||
else if (!strcmp(arg, "async"))
|
||||
kasan_arg_mode = KASAN_ARG_MODE_ASYNC;
|
||||
else if (!strcmp(arg, "asymm"))
|
||||
kasan_arg_mode = KASAN_ARG_MODE_ASYMM;
|
||||
else
|
||||
return -EINVAL;
|
||||
|
||||
@@ -103,6 +106,16 @@ static int __init early_kasan_flag_stacktrace(char *arg)
}
early_param("kasan.stacktrace", early_kasan_flag_stacktrace);

+static inline const char *kasan_mode_info(void)
+{
+    if (kasan_mode == KASAN_MODE_ASYNC)
+        return "async";
+    else if (kasan_mode == KASAN_MODE_ASYMM)
+        return "asymm";
+    else
+        return "sync";
+}
+
/* kasan_init_hw_tags_cpu() is called for each CPU. */
void kasan_init_hw_tags_cpu(void)
{
@ -116,11 +129,13 @@ void kasan_init_hw_tags_cpu(void)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Enable async mode only when explicitly requested through
|
||||
* the command line.
|
||||
* Enable async or asymm modes only when explicitly requested
|
||||
* through the command line.
|
||||
*/
|
||||
if (kasan_arg_mode == KASAN_ARG_MODE_ASYNC)
|
||||
hw_enable_tagging_async();
|
||||
else if (kasan_arg_mode == KASAN_ARG_MODE_ASYMM)
|
||||
hw_enable_tagging_asymm();
|
||||
else
|
||||
hw_enable_tagging_sync();
|
||||
}
|
||||
@ -143,15 +158,19 @@ void __init kasan_init_hw_tags(void)
|
||||
case KASAN_ARG_MODE_DEFAULT:
|
||||
/*
|
||||
* Default to sync mode.
|
||||
* Do nothing, kasan_flag_async keeps its default value.
|
||||
*/
|
||||
break;
|
||||
fallthrough;
|
||||
case KASAN_ARG_MODE_SYNC:
|
||||
/* Do nothing, kasan_flag_async keeps its default value. */
|
||||
/* Sync mode enabled. */
|
||||
kasan_mode = KASAN_MODE_SYNC;
|
||||
break;
|
||||
case KASAN_ARG_MODE_ASYNC:
|
||||
/* Async mode enabled. */
|
||||
kasan_flag_async = true;
|
||||
kasan_mode = KASAN_MODE_ASYNC;
|
||||
break;
|
||||
case KASAN_ARG_MODE_ASYMM:
|
||||
/* Asymm mode enabled. */
|
||||
kasan_mode = KASAN_MODE_ASYMM;
|
||||
break;
|
||||
}
|
||||
|
||||
@ -168,7 +187,9 @@ void __init kasan_init_hw_tags(void)
|
||||
break;
|
||||
}
|
||||
|
||||
pr_info("KernelAddressSanitizer initialized\n");
|
||||
pr_info("KernelAddressSanitizer initialized (hw-tags, mode=%s, stacktrace=%s)\n",
|
||||
kasan_mode_info(),
|
||||
kasan_stack_collection_enabled() ? "on" : "off");
|
||||
}
|
||||
|
||||
void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags)
|
||||
|
@@ -13,16 +13,28 @@
#include "../slab.h"

DECLARE_STATIC_KEY_FALSE(kasan_flag_stacktrace);
-extern bool kasan_flag_async __ro_after_init;

+enum kasan_mode {
+    KASAN_MODE_SYNC,
+    KASAN_MODE_ASYNC,
+    KASAN_MODE_ASYMM,
+};
+
+extern enum kasan_mode kasan_mode __ro_after_init;
+
static inline bool kasan_stack_collection_enabled(void)
{
    return static_branch_unlikely(&kasan_flag_stacktrace);
}

-static inline bool kasan_async_mode_enabled(void)
+static inline bool kasan_async_fault_possible(void)
{
-    return kasan_flag_async;
+    return kasan_mode == KASAN_MODE_ASYNC || kasan_mode == KASAN_MODE_ASYMM;
}

+static inline bool kasan_sync_fault_possible(void)
+{
+    return kasan_mode == KASAN_MODE_SYNC || kasan_mode == KASAN_MODE_ASYMM;
+}
#else
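For reference, a standalone C translation of the new mode predicates: asymmetric mode counts as both "sync faults possible" and "async faults possible", since on arm64 MTE asymm is expected to check reads synchronously and writes asynchronously. The program below is an illustration, not kernel code.

#include <stdbool.h>
#include <stdio.h>

enum kasan_mode { KASAN_MODE_SYNC, KASAN_MODE_ASYNC, KASAN_MODE_ASYMM };

static bool async_fault_possible(enum kasan_mode m)
{
    return m == KASAN_MODE_ASYNC || m == KASAN_MODE_ASYMM;
}

static bool sync_fault_possible(enum kasan_mode m)
{
    return m == KASAN_MODE_SYNC || m == KASAN_MODE_ASYMM;
}

int main(void)
{
    const char *names[] = { "sync", "async", "asymm" };

    for (int m = KASAN_MODE_SYNC; m <= KASAN_MODE_ASYMM; m++)
        printf("%-5s: sync faults %s, async faults %s\n", names[m],
               sync_fault_possible(m) ? "possible" : "no",
               async_fault_possible(m) ? "possible" : "no");
    return 0;
}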
@ -31,14 +43,17 @@ static inline bool kasan_stack_collection_enabled(void)
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline bool kasan_async_mode_enabled(void)
|
||||
static inline bool kasan_async_fault_possible(void)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
#endif
|
||||
static inline bool kasan_sync_fault_possible(void)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
|
||||
extern bool kasan_flag_async __ro_after_init;
|
||||
#endif
|
||||
|
||||
#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
|
||||
#define KASAN_GRANULE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT)
|
||||
@ -250,8 +265,9 @@ bool kasan_report(unsigned long addr, size_t size,
|
||||
void kasan_report_invalid_free(void *object, unsigned long ip);
|
||||
|
||||
struct page *kasan_addr_to_page(const void *addr);
|
||||
struct slab *kasan_addr_to_slab(const void *addr);
|
||||
|
||||
depot_stack_handle_t kasan_save_stack(gfp_t flags);
|
||||
depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc);
|
||||
void kasan_set_track(struct kasan_track *track, gfp_t flags);
|
||||
void kasan_set_free_info(struct kmem_cache *cache, void *object, u8 tag);
|
||||
struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
|
||||
@ -289,6 +305,9 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
|
||||
#ifndef arch_enable_tagging_async
|
||||
#define arch_enable_tagging_async()
|
||||
#endif
|
||||
#ifndef arch_enable_tagging_asymm
|
||||
#define arch_enable_tagging_asymm()
|
||||
#endif
|
||||
#ifndef arch_force_async_tag_fault
|
||||
#define arch_force_async_tag_fault()
|
||||
#endif
|
||||
@ -304,6 +323,7 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
|
||||
|
||||
#define hw_enable_tagging_sync() arch_enable_tagging_sync()
|
||||
#define hw_enable_tagging_async() arch_enable_tagging_async()
|
||||
#define hw_enable_tagging_asymm() arch_enable_tagging_asymm()
|
||||
#define hw_force_async_tag_fault() arch_force_async_tag_fault()
|
||||
#define hw_get_random_tag() arch_get_random_tag()
|
||||
#define hw_get_mem_tag(addr) arch_get_mem_tag(addr)
|
||||
@ -314,6 +334,7 @@ static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
|
||||
|
||||
#define hw_enable_tagging_sync()
|
||||
#define hw_enable_tagging_async()
|
||||
#define hw_enable_tagging_asymm()
|
||||
|
||||
#endif /* CONFIG_KASAN_HW_TAGS */
|
||||
|
||||
|
@@ -117,7 +117,7 @@ static unsigned long quarantine_batch_size;

static struct kmem_cache *qlink_to_cache(struct qlist_node *qlink)
{
-    return virt_to_head_page(qlink)->slab_cache;
+    return virt_to_slab(qlink)->slab_cache;
}

static void *qlink_to_object(struct qlist_node *qlink, struct kmem_cache *cache)
@ -112,7 +112,7 @@ static void start_report(unsigned long *flags)
|
||||
|
||||
static void end_report(unsigned long *flags, unsigned long addr)
|
||||
{
|
||||
if (!kasan_async_mode_enabled())
|
||||
if (!kasan_async_fault_possible())
|
||||
trace_error_report_end(ERROR_DETECTOR_KASAN, addr);
|
||||
pr_err("==================================================================\n");
|
||||
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
|
||||
@ -132,20 +132,11 @@ static void end_report(unsigned long *flags, unsigned long addr)
|
||||
kasan_enable_current();
|
||||
}
|
||||
|
||||
static void print_stack(depot_stack_handle_t stack)
|
||||
{
|
||||
unsigned long *entries;
|
||||
unsigned int nr_entries;
|
||||
|
||||
nr_entries = stack_depot_fetch(stack, &entries);
|
||||
stack_trace_print(entries, nr_entries, 0);
|
||||
}
|
||||
|
||||
static void print_track(struct kasan_track *track, const char *prefix)
|
||||
{
|
||||
pr_err("%s by task %u:\n", prefix, track->pid);
|
||||
if (track->stack) {
|
||||
print_stack(track->stack);
|
||||
stack_depot_print(track->stack);
|
||||
} else {
|
||||
pr_err("(stack is not available)\n");
|
||||
}
|
||||
@ -159,6 +150,14 @@ struct page *kasan_addr_to_page(const void *addr)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
struct slab *kasan_addr_to_slab(const void *addr)
|
||||
{
|
||||
if ((addr >= (void *)PAGE_OFFSET) &&
|
||||
(addr < high_memory))
|
||||
return virt_to_slab(addr);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void describe_object_addr(struct kmem_cache *cache, void *object,
|
||||
const void *addr)
|
||||
{
|
||||
@ -214,12 +213,12 @@ static void describe_object_stacks(struct kmem_cache *cache, void *object,
|
||||
return;
|
||||
if (alloc_meta->aux_stack[0]) {
|
||||
pr_err("Last potentially related work creation:\n");
|
||||
print_stack(alloc_meta->aux_stack[0]);
|
||||
stack_depot_print(alloc_meta->aux_stack[0]);
|
||||
pr_err("\n");
|
||||
}
|
||||
if (alloc_meta->aux_stack[1]) {
|
||||
pr_err("Second to last potentially related work creation:\n");
|
||||
print_stack(alloc_meta->aux_stack[1]);
|
||||
stack_depot_print(alloc_meta->aux_stack[1]);
|
||||
pr_err("\n");
|
||||
}
|
||||
#endif
|
||||
@ -235,7 +234,7 @@ static void describe_object(struct kmem_cache *cache, void *object,
|
||||
|
||||
static inline bool kernel_or_module_addr(const void *addr)
|
||||
{
|
||||
if (addr >= (void *)_stext && addr < (void *)_end)
|
||||
if (is_kernel((unsigned long)addr))
|
||||
return true;
|
||||
if (is_module_address((unsigned long)addr))
|
||||
return true;
|
||||
@ -257,8 +256,9 @@ static void print_address_description(void *addr, u8 tag)
|
||||
pr_err("\n");
|
||||
|
||||
if (page && PageSlab(page)) {
|
||||
struct kmem_cache *cache = page->slab_cache;
|
||||
void *object = nearest_obj(cache, page, addr);
|
||||
struct slab *slab = page_slab(page);
|
||||
struct kmem_cache *cache = slab->slab_cache;
|
||||
void *object = nearest_obj(cache, slab, addr);
|
||||
|
||||
describe_object(cache, object, addr, tag);
|
||||
}
|
||||
|
@@ -12,7 +12,7 @@ const char *kasan_get_bug_type(struct kasan_access_info *info)
#ifdef CONFIG_KASAN_TAGS_IDENTIFY
    struct kasan_alloc_meta *alloc_meta;
    struct kmem_cache *cache;
-    struct page *page;
+    struct slab *slab;
    const void *addr;
    void *object;
    u8 tag;
@@ -20,10 +20,10 @@ const char *kasan_get_bug_type(struct kasan_access_info *info)

    tag = get_tag(info->access_addr);
    addr = kasan_reset_tag(info->access_addr);
-    page = kasan_addr_to_page(addr);
-    if (page && PageSlab(page)) {
-        cache = page->slab_cache;
-        object = nearest_obj(cache, page, (void *)addr);
+    slab = kasan_addr_to_slab(addr);
+    if (slab) {
+        cache = slab->slab_cache;
+        object = nearest_obj(cache, slab, (void *)addr);
        alloc_meta = kasan_get_alloc_meta(cache, object);

        if (alloc_meta) {
@@ -254,6 +254,11 @@ core_initcall(kasan_memhotplug_init);

#ifdef CONFIG_KASAN_VMALLOC

+void __init __weak kasan_populate_early_vm_area_shadow(void *start,
+                                                       unsigned long size)
+{
+}
+
static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
                                      void *unused)
{
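The new kasan_populate_early_vm_area_shadow() stub relies on the __weak linkage pattern: a generic no-op default that an architecture can replace with a strong definition at link time. A standalone illustration of the pattern (not kernel code, names are made up):

#include <stddef.h>
#include <stdio.h>

/* Generic default: does nothing, like the stub added in the hunk above. */
__attribute__((weak)) void populate_early_shadow(void *start, unsigned long size)
{
    (void)start;
    (void)size;
}

int main(void)
{
    /* If no strong definition is linked in, the weak no-op runs instead. */
    populate_early_shadow(NULL, 0);
    printf("weak default or arch override executed\n");
    return 0;
}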
@@ -42,7 +42,7 @@ void __init kasan_init_sw_tags(void)
    for_each_possible_cpu(cpu)
        per_cpu(prng_state, cpu) = (u32)get_cycles();

-    pr_info("KernelAddressSanitizer initialized\n");
+    pr_info("KernelAddressSanitizer initialized (sw-tags)\n");
}

/*
mm/kfence/core.c (204 lines changed)
@ -10,12 +10,15 @@
|
||||
#include <linux/atomic.h>
|
||||
#include <linux/bug.h>
|
||||
#include <linux/debugfs.h>
|
||||
#include <linux/hash.h>
|
||||
#include <linux/irq_work.h>
|
||||
#include <linux/jhash.h>
|
||||
#include <linux/kcsan-checks.h>
|
||||
#include <linux/kfence.h>
|
||||
#include <linux/kmemleak.h>
|
||||
#include <linux/list.h>
|
||||
#include <linux/lockdep.h>
|
||||
#include <linux/log2.h>
|
||||
#include <linux/memblock.h>
|
||||
#include <linux/moduleparam.h>
|
||||
#include <linux/random.h>
|
||||
@ -44,7 +47,8 @@
|
||||
|
||||
static bool kfence_enabled __read_mostly;
|
||||
|
||||
static unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL;
|
||||
unsigned long kfence_sample_interval __read_mostly = CONFIG_KFENCE_SAMPLE_INTERVAL;
|
||||
EXPORT_SYMBOL_GPL(kfence_sample_interval); /* Export for test modules. */
|
||||
|
||||
#ifdef MODULE_PARAM_PREFIX
|
||||
#undef MODULE_PARAM_PREFIX
|
||||
@ -82,6 +86,10 @@ static const struct kernel_param_ops sample_interval_param_ops = {
|
||||
};
|
||||
module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600);
|
||||
|
||||
/* Pool usage% threshold when currently covered allocations are skipped. */
|
||||
static unsigned long kfence_skip_covered_thresh __read_mostly = 75;
|
||||
module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644);
|
||||
|
||||
/* The pool of pages used for guard pages and objects. */
|
||||
char *__kfence_pool __ro_after_init;
|
||||
EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
|
||||
@@ -106,6 +114,32 @@ DEFINE_STATIC_KEY_FALSE(kfence_allocation_key);
/* Gates the allocation, ensuring only one succeeds in a given period. */
atomic_t kfence_allocation_gate = ATOMIC_INIT(1);

+/*
+ * A Counting Bloom filter of allocation coverage: limits currently covered
+ * allocations of the same source filling up the pool.
+ *
+ * Assuming a range of 15%-85% unique allocations in the pool at any point in
+ * time, the below parameters provide a probability of 0.02-0.33 for false
+ * positive hits respectively:
+ *
+ *	P(alloc_traces) = (1 - e^(-HNUM * (alloc_traces / SIZE)))^HNUM
+ */
+#define ALLOC_COVERED_HNUM	2
+#define ALLOC_COVERED_ORDER	(const_ilog2(CONFIG_KFENCE_NUM_OBJECTS) + 2)
+#define ALLOC_COVERED_SIZE	(1 << ALLOC_COVERED_ORDER)
+#define ALLOC_COVERED_HNEXT(h)	hash_32(h, ALLOC_COVERED_ORDER)
+#define ALLOC_COVERED_MASK	(ALLOC_COVERED_SIZE - 1)
+static atomic_t alloc_covered[ALLOC_COVERED_SIZE];
+
+/* Stack depth used to determine uniqueness of an allocation. */
+#define UNIQUE_ALLOC_STACK_DEPTH ((size_t)8)
+
+/*
+ * Randomness for stack hashes, making the same collisions across reboots and
+ * different machines less likely.
+ */
+static u32 stack_hash_seed __ro_after_init;
+
/* Statistics counters for debugfs. */
enum kfence_counter_id {
    KFENCE_COUNTER_ALLOCATED,
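The comment above describes a Counting Bloom filter keyed by a hash of the allocation stack trace: each allocation increments ALLOC_COVERED_HNUM counters, each free decrements the same ones, and a source counts as "covered" only while all of its counters are non-zero, with P = (1 - e^(-HNUM*n/SIZE))^HNUM as the false-positive estimate. A small standalone model follows; the hash mix is illustrative, not the kernel's hash_32()/jhash().

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define COVERED_HNUM  2
#define COVERED_ORDER 8
#define COVERED_SIZE  (1u << COVERED_ORDER)
#define COVERED_MASK  (COVERED_SIZE - 1)

static int covered[COVERED_SIZE];

static uint32_t next_probe(uint32_t h)
{
    /* Cheap 32-bit mix standing in for hash_32(h, COVERED_ORDER). */
    h ^= h >> 16;
    h *= 0x45d9f3bu;
    h ^= h >> 16;
    return h;
}

static void covered_add(uint32_t stack_hash, int val)
{
    for (int i = 0; i < COVERED_HNUM; i++) {
        covered[stack_hash & COVERED_MASK] += val;
        stack_hash = next_probe(stack_hash);
    }
}

static bool covered_contains(uint32_t stack_hash)
{
    for (int i = 0; i < COVERED_HNUM; i++) {
        if (!covered[stack_hash & COVERED_MASK])
            return false;
        stack_hash = next_probe(stack_hash);
    }
    return true;    /* may be a false positive, never a false negative */
}

int main(void)
{
    uint32_t h = 0xdeadbeef;    /* pretend stack-trace hash */

    covered_add(h, 1);
    printf("covered after alloc: %d\n", covered_contains(h));
    covered_add(h, -1);
    printf("covered after free:  %d\n", covered_contains(h));
    return 0;
}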
@ -113,6 +147,9 @@ enum kfence_counter_id {
|
||||
KFENCE_COUNTER_FREES,
|
||||
KFENCE_COUNTER_ZOMBIES,
|
||||
KFENCE_COUNTER_BUGS,
|
||||
KFENCE_COUNTER_SKIP_INCOMPAT,
|
||||
KFENCE_COUNTER_SKIP_CAPACITY,
|
||||
KFENCE_COUNTER_SKIP_COVERED,
|
||||
KFENCE_COUNTER_COUNT,
|
||||
};
|
||||
static atomic_long_t counters[KFENCE_COUNTER_COUNT];
|
||||
@ -122,11 +159,59 @@ static const char *const counter_names[] = {
|
||||
[KFENCE_COUNTER_FREES] = "total frees",
|
||||
[KFENCE_COUNTER_ZOMBIES] = "zombie allocations",
|
||||
[KFENCE_COUNTER_BUGS] = "total bugs",
|
||||
[KFENCE_COUNTER_SKIP_INCOMPAT] = "skipped allocations (incompatible)",
|
||||
[KFENCE_COUNTER_SKIP_CAPACITY] = "skipped allocations (capacity)",
|
||||
[KFENCE_COUNTER_SKIP_COVERED] = "skipped allocations (covered)",
|
||||
};
|
||||
static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
|
||||
|
||||
/* === Internals ============================================================ */
|
||||
|
||||
static inline bool should_skip_covered(void)
|
||||
{
|
||||
unsigned long thresh = (CONFIG_KFENCE_NUM_OBJECTS * kfence_skip_covered_thresh) / 100;
|
||||
|
||||
return atomic_long_read(&counters[KFENCE_COUNTER_ALLOCATED]) > thresh;
|
||||
}
|
||||
|
||||
static u32 get_alloc_stack_hash(unsigned long *stack_entries, size_t num_entries)
|
||||
{
|
||||
num_entries = min(num_entries, UNIQUE_ALLOC_STACK_DEPTH);
|
||||
num_entries = filter_irq_stacks(stack_entries, num_entries);
|
||||
return jhash(stack_entries, num_entries * sizeof(stack_entries[0]), stack_hash_seed);
|
||||
}
|
||||
|
||||
/*
|
||||
* Adds (or subtracts) count @val for allocation stack trace hash
|
||||
* @alloc_stack_hash from Counting Bloom filter.
|
||||
*/
|
||||
static void alloc_covered_add(u32 alloc_stack_hash, int val)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < ALLOC_COVERED_HNUM; i++) {
|
||||
atomic_add(val, &alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]);
|
||||
alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns true if the allocation stack trace hash @alloc_stack_hash is
|
||||
* currently contained (non-zero count) in Counting Bloom filter.
|
||||
*/
|
||||
static bool alloc_covered_contains(u32 alloc_stack_hash)
|
||||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < ALLOC_COVERED_HNUM; i++) {
|
||||
if (!atomic_read(&alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]))
|
||||
return false;
|
||||
alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool kfence_protect(unsigned long addr)
|
||||
{
|
||||
return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true));
|
||||
@ -184,19 +269,26 @@ static inline unsigned long metadata_to_pageaddr(const struct kfence_metadata *m
|
||||
* Update the object's metadata state, including updating the alloc/free stacks
|
||||
* depending on the state transition.
|
||||
*/
|
||||
static noinline void metadata_update_state(struct kfence_metadata *meta,
|
||||
enum kfence_object_state next)
|
||||
static noinline void
|
||||
metadata_update_state(struct kfence_metadata *meta, enum kfence_object_state next,
|
||||
unsigned long *stack_entries, size_t num_stack_entries)
|
||||
{
|
||||
struct kfence_track *track =
|
||||
next == KFENCE_OBJECT_FREED ? &meta->free_track : &meta->alloc_track;
|
||||
|
||||
lockdep_assert_held(&meta->lock);
|
||||
|
||||
/*
|
||||
* Skip over 1 (this) functions; noinline ensures we do not accidentally
|
||||
* skip over the caller by never inlining.
|
||||
*/
|
||||
track->num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1);
|
||||
if (stack_entries) {
|
||||
memcpy(track->stack_entries, stack_entries,
|
||||
num_stack_entries * sizeof(stack_entries[0]));
|
||||
} else {
|
||||
/*
|
||||
* Skip over 1 (this) functions; noinline ensures we do not
|
||||
* accidentally skip over the caller by never inlining.
|
||||
*/
|
||||
num_stack_entries = stack_trace_save(track->stack_entries, KFENCE_STACK_DEPTH, 1);
|
||||
}
|
||||
track->num_stack_entries = num_stack_entries;
|
||||
track->pid = task_pid_nr(current);
|
||||
track->cpu = raw_smp_processor_id();
|
||||
track->ts_nsec = local_clock(); /* Same source as printk timestamps. */
|
||||
@ -219,12 +311,19 @@ static inline bool set_canary_byte(u8 *addr)
|
||||
/* Check canary byte at @addr. */
|
||||
static inline bool check_canary_byte(u8 *addr)
|
||||
{
|
||||
struct kfence_metadata *meta;
|
||||
unsigned long flags;
|
||||
|
||||
if (likely(*addr == KFENCE_CANARY_PATTERN(addr)))
|
||||
return true;
|
||||
|
||||
atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
|
||||
kfence_report_error((unsigned long)addr, false, NULL, addr_to_metadata((unsigned long)addr),
|
||||
KFENCE_ERROR_CORRUPTION);
|
||||
|
||||
meta = addr_to_metadata((unsigned long)addr);
|
||||
raw_spin_lock_irqsave(&meta->lock, flags);
|
||||
kfence_report_error((unsigned long)addr, false, NULL, meta, KFENCE_ERROR_CORRUPTION);
|
||||
raw_spin_unlock_irqrestore(&meta->lock, flags);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -234,8 +333,6 @@ static __always_inline void for_each_canary(const struct kfence_metadata *meta,
|
||||
const unsigned long pageaddr = ALIGN_DOWN(meta->addr, PAGE_SIZE);
|
||||
unsigned long addr;
|
||||
|
||||
lockdep_assert_held(&meta->lock);
|
||||
|
||||
/*
|
||||
* We'll iterate over each canary byte per-side until fn() returns
|
||||
* false. However, we'll still iterate over the canary bytes to the
|
||||
@ -258,11 +355,13 @@ static __always_inline void for_each_canary(const struct kfence_metadata *meta,
|
||||
}
|
||||
}
|
||||
|
||||
static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp)
|
||||
static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp,
|
||||
unsigned long *stack_entries, size_t num_stack_entries,
|
||||
u32 alloc_stack_hash)
|
||||
{
|
||||
struct kfence_metadata *meta = NULL;
|
||||
unsigned long flags;
|
||||
struct page *page;
|
||||
struct slab *slab;
|
||||
void *addr;
|
||||
|
||||
/* Try to obtain a free object. */
|
||||
@ -272,8 +371,10 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
|
||||
list_del_init(&meta->list);
|
||||
}
|
||||
raw_spin_unlock_irqrestore(&kfence_freelist_lock, flags);
|
||||
if (!meta)
|
||||
if (!meta) {
|
||||
atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_CAPACITY]);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (unlikely(!raw_spin_trylock_irqsave(&meta->lock, flags))) {
|
||||
/*
|
||||
@ -315,23 +416,26 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
|
||||
addr = (void *)meta->addr;
|
||||
|
||||
/* Update remaining metadata. */
|
||||
metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED);
|
||||
metadata_update_state(meta, KFENCE_OBJECT_ALLOCATED, stack_entries, num_stack_entries);
|
||||
/* Pairs with READ_ONCE() in kfence_shutdown_cache(). */
|
||||
WRITE_ONCE(meta->cache, cache);
|
||||
meta->size = size;
|
||||
for_each_canary(meta, set_canary_byte);
|
||||
|
||||
/* Set required struct page fields. */
|
||||
page = virt_to_page(meta->addr);
|
||||
page->slab_cache = cache;
|
||||
if (IS_ENABLED(CONFIG_SLUB))
|
||||
page->objects = 1;
|
||||
if (IS_ENABLED(CONFIG_SLAB))
|
||||
page->s_mem = addr;
|
||||
|
||||
meta->alloc_stack_hash = alloc_stack_hash;
|
||||
raw_spin_unlock_irqrestore(&meta->lock, flags);
|
||||
|
||||
alloc_covered_add(alloc_stack_hash, 1);
|
||||
|
||||
/* Set required slab fields. */
|
||||
slab = virt_to_slab((void *)meta->addr);
|
||||
slab->slab_cache = cache;
|
||||
#if defined(CONFIG_SLUB)
|
||||
slab->objects = 1;
|
||||
#elif defined(CONFIG_SLAB)
|
||||
slab->s_mem = addr;
|
||||
#endif
|
||||
|
||||
/* Memory initialization. */
|
||||
for_each_canary(meta, set_canary_byte);
|
||||
|
||||
/*
|
||||
* We check slab_want_init_on_alloc() ourselves, rather than letting
|
||||
@ -356,6 +460,7 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
|
||||
{
|
||||
struct kcsan_scoped_access assert_page_exclusive;
|
||||
unsigned long flags;
|
||||
bool init;
|
||||
|
||||
raw_spin_lock_irqsave(&meta->lock, flags);
|
||||
|
||||
@ -383,6 +488,13 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
|
||||
meta->unprotected_page = 0;
|
||||
}
|
||||
|
||||
/* Mark the object as freed. */
|
||||
metadata_update_state(meta, KFENCE_OBJECT_FREED, NULL, 0);
|
||||
init = slab_want_init_on_free(meta->cache);
|
||||
raw_spin_unlock_irqrestore(&meta->lock, flags);
|
||||
|
||||
alloc_covered_add(meta->alloc_stack_hash, -1);
|
||||
|
||||
/* Check canary bytes for memory corruption. */
|
||||
for_each_canary(meta, check_canary_byte);
|
||||
|
||||
@ -391,14 +503,9 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
|
||||
* data is still there, and after a use-after-free is detected, we
|
||||
* unprotect the page, so the data is still accessible.
|
||||
*/
|
||||
if (!zombie && unlikely(slab_want_init_on_free(meta->cache)))
|
||||
if (!zombie && unlikely(init))
|
||||
memzero_explicit(addr, meta->size);
|
||||
|
||||
/* Mark the object as freed. */
|
||||
metadata_update_state(meta, KFENCE_OBJECT_FREED);
|
||||
|
||||
raw_spin_unlock_irqrestore(&meta->lock, flags);
|
||||
|
||||
/* Protect to detect use-after-frees. */
|
||||
kfence_protect((unsigned long)addr);
|
||||
|
||||
@ -665,6 +772,7 @@ void __init kfence_init(void)
|
||||
if (!kfence_sample_interval)
|
||||
return;
|
||||
|
||||
stack_hash_seed = (u32)random_get_entropy();
|
||||
if (!kfence_init_pool()) {
|
||||
pr_err("%s failed\n", __func__);
|
||||
return;
|
||||
@ -740,12 +848,18 @@ void kfence_shutdown_cache(struct kmem_cache *s)
|
||||
|
||||
void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
|
||||
{
|
||||
unsigned long stack_entries[KFENCE_STACK_DEPTH];
|
||||
size_t num_stack_entries;
|
||||
u32 alloc_stack_hash;
|
||||
|
||||
/*
|
||||
* Perform size check before switching kfence_allocation_gate, so that
|
||||
* we don't disable KFENCE without making an allocation.
|
||||
*/
|
||||
if (size > PAGE_SIZE)
|
||||
if (size > PAGE_SIZE) {
|
||||
atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Skip allocations from non-default zones, including DMA. We cannot
|
||||
@ -753,8 +867,10 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
|
||||
* properties (e.g. reside in DMAable memory).
|
||||
*/
|
||||
if ((flags & GFP_ZONEMASK) ||
|
||||
(s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32)))
|
||||
(s->flags & (SLAB_CACHE_DMA | SLAB_CACHE_DMA32))) {
|
||||
atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_INCOMPAT]);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
if (atomic_inc_return(&kfence_allocation_gate) > 1)
|
||||
return NULL;
|
||||
@ -775,7 +891,25 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
|
||||
if (!READ_ONCE(kfence_enabled))
|
||||
return NULL;
|
||||
|
||||
return kfence_guarded_alloc(s, size, flags);
|
||||
num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 0);
|
||||
|
||||
/*
|
||||
* Do expensive check for coverage of allocation in slow-path after
|
||||
* allocation_gate has already become non-zero, even though it might
|
||||
* mean not making any allocation within a given sample interval.
|
||||
*
|
||||
* This ensures reasonable allocation coverage when the pool is almost
|
||||
* full, including avoiding long-lived allocations of the same source
|
||||
* filling up the pool (e.g. pagecache allocations).
|
||||
*/
|
||||
alloc_stack_hash = get_alloc_stack_hash(stack_entries, num_stack_entries);
|
||||
if (should_skip_covered() && alloc_covered_contains(alloc_stack_hash)) {
|
||||
atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_COVERED]);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries,
|
||||
alloc_stack_hash);
|
||||
}
|
||||
|
||||
size_t kfence_ksize(const void *addr)
|
||||
|
@@ -87,6 +87,8 @@ struct kfence_metadata {
    /* Allocation and free stack information. */
    struct kfence_track alloc_track;
    struct kfence_track free_track;
+    /* For updating alloc_covered on frees. */
+    u32 alloc_stack_hash;
};

extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
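Storing the hash is what lets the free path decrement the same Counting Bloom filter slots that the allocation incremented, since the allocation stack cannot be recomputed at free time; the kfence_guarded_free() hunk earlier in this diff does exactly that. A one-line sketch with an illustrative wrapper name:

/* Mirrors the kfence_guarded_free() hunk: undo the +1 done at allocation. */
static void demo_release_coverage(struct kfence_metadata *meta)
{
    alloc_covered_add(meta->alloc_stack_hash, -1);
}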
@@ -32,6 +32,11 @@
#define arch_kfence_test_address(addr) (addr)
#endif

+#define KFENCE_TEST_REQUIRES(test, cond) do {				\
+	if (!(cond))							\
+		kunit_skip((test), "Test requires: " #cond);		\
+} while (0)
+
/* Report as observed from console. */
static struct {
    spinlock_t lock;
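KFENCE_TEST_REQUIRES() turns an unmet precondition into an explicit KUnit skip instead of a silent early return, as the later test_init_on_free() and test_gfpzero() hunks show. A usage sketch with a placeholder test body:

static void example_test(struct kunit *test)
{
    /* Skip (rather than silently pass) when the precondition is missing. */
    KFENCE_TEST_REQUIRES(test, IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON));

    /* ... the actual test body would go here ... */
    KUNIT_EXPECT_TRUE(test, true);
}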
@ -263,13 +268,13 @@ static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocat
|
||||
* 100x the sample interval should be more than enough to ensure we get
|
||||
* a KFENCE allocation eventually.
|
||||
*/
|
||||
timeout = jiffies + msecs_to_jiffies(100 * CONFIG_KFENCE_SAMPLE_INTERVAL);
|
||||
timeout = jiffies + msecs_to_jiffies(100 * kfence_sample_interval);
|
||||
/*
|
||||
* Especially for non-preemption kernels, ensure the allocation-gate
|
||||
* timer can catch up: after @resched_after, every failed allocation
|
||||
* attempt yields, to ensure the allocation-gate timer is scheduled.
|
||||
*/
|
||||
resched_after = jiffies + msecs_to_jiffies(CONFIG_KFENCE_SAMPLE_INTERVAL);
|
||||
resched_after = jiffies + msecs_to_jiffies(kfence_sample_interval);
|
||||
do {
|
||||
if (test_cache)
|
||||
alloc = kmem_cache_alloc(test_cache, gfp);
|
||||
@ -277,7 +282,7 @@ static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocat
|
||||
alloc = kmalloc(size, gfp);
|
||||
|
||||
if (is_kfence_address(alloc)) {
|
||||
struct page *page = virt_to_head_page(alloc);
|
||||
struct slab *slab = virt_to_slab(alloc);
|
||||
struct kmem_cache *s = test_cache ?:
|
||||
kmalloc_caches[kmalloc_type(GFP_KERNEL)][__kmalloc_index(size, false)];
|
||||
|
||||
@ -286,8 +291,8 @@ static void *test_alloc(struct kunit *test, size_t size, gfp_t gfp, enum allocat
|
||||
* even for KFENCE objects; these are required so that
|
||||
* memcg accounting works correctly.
|
||||
*/
|
||||
KUNIT_EXPECT_EQ(test, obj_to_index(s, page, alloc), 0U);
|
||||
KUNIT_EXPECT_EQ(test, objs_per_slab_page(s, page), 1);
|
||||
KUNIT_EXPECT_EQ(test, obj_to_index(s, slab, alloc), 0U);
|
||||
KUNIT_EXPECT_EQ(test, objs_per_slab(s, slab), 1);
|
||||
|
||||
if (policy == ALLOCATE_ANY)
|
||||
return alloc;
|
||||
@ -555,8 +560,7 @@ static void test_init_on_free(struct kunit *test)
|
||||
};
|
||||
int i;
|
||||
|
||||
if (!IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON))
|
||||
return;
|
||||
KFENCE_TEST_REQUIRES(test, IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON));
|
||||
/* Assume it hasn't been disabled on command line. */
|
||||
|
||||
setup_test_cache(test, size, 0, NULL);
|
||||
@ -603,10 +607,8 @@ static void test_gfpzero(struct kunit *test)
|
||||
char *buf1, *buf2;
|
||||
int i;
|
||||
|
||||
if (CONFIG_KFENCE_SAMPLE_INTERVAL > 100) {
|
||||
kunit_warn(test, "skipping ... would take too long\n");
|
||||
return;
|
||||
}
|
||||
/* Skip if we think it'd take too long. */
|
||||
KFENCE_TEST_REQUIRES(test, kfence_sample_interval <= 100);
|
||||
|
||||
setup_test_cache(test, size, 0, NULL);
|
||||
buf1 = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY);
|
||||
@ -737,7 +739,7 @@ static void test_memcache_alloc_bulk(struct kunit *test)
|
||||
* 100x the sample interval should be more than enough to ensure we get
|
||||
* a KFENCE allocation eventually.
|
||||
*/
|
||||
timeout = jiffies + msecs_to_jiffies(100 * CONFIG_KFENCE_SAMPLE_INTERVAL);
|
||||
timeout = jiffies + msecs_to_jiffies(100 * kfence_sample_interval);
|
||||
do {
|
||||
void *objects[100];
|
||||
int i, num = kmem_cache_alloc_bulk(test_cache, GFP_ATOMIC, ARRAY_SIZE(objects),
|
||||
|
@@ -16,6 +16,7 @@
#include <linux/hashtable.h>
#include <linux/userfaultfd_k.h>
#include <linux/page_idle.h>
+#include <linux/page_table_check.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
@ -618,6 +619,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
|
||||
continue;
|
||||
} else {
|
||||
result = SCAN_EXCEED_NONE_PTE;
|
||||
count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
@ -636,6 +638,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
|
||||
if (page_mapcount(page) > 1 &&
|
||||
++shared > khugepaged_max_ptes_shared) {
|
||||
result = SCAN_EXCEED_SHARED_PTE;
|
||||
count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
|
||||
goto out;
|
||||
}
|
||||
|
||||
@ -681,7 +684,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
|
||||
goto out;
|
||||
}
|
||||
if (!pte_write(pteval) && PageSwapCache(page) &&
|
||||
!reuse_swap_page(page, NULL)) {
|
||||
!reuse_swap_page(page)) {
|
||||
/*
|
||||
* Page is in the swap cache and cannot be re-used.
|
||||
* It cannot be collapsed into a THP.
|
||||
@ -756,11 +759,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
|
||||
* ptl mostly unnecessary.
|
||||
*/
|
||||
spin_lock(ptl);
|
||||
/*
|
||||
* paravirt calls inside pte_clear here are
|
||||
* superfluous.
|
||||
*/
|
||||
pte_clear(vma->vm_mm, address, _pte);
|
||||
ptep_clear(vma->vm_mm, address, _pte);
|
||||
spin_unlock(ptl);
|
||||
}
|
||||
} else {
|
||||
@ -774,11 +773,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
|
||||
* inside page_remove_rmap().
|
||||
*/
|
||||
spin_lock(ptl);
|
||||
/*
|
||||
* paravirt calls inside pte_clear here are
|
||||
* superfluous.
|
||||
*/
|
||||
pte_clear(vma->vm_mm, address, _pte);
|
||||
ptep_clear(vma->vm_mm, address, _pte);
|
||||
page_remove_rmap(src_page, false);
|
||||
spin_unlock(ptl);
|
||||
free_page_and_swap_cache(src_page);
|
||||
@ -1090,7 +1085,7 @@ static void collapse_huge_page(struct mm_struct *mm,
|
||||
goto out_nolock;
|
||||
}
|
||||
|
||||
if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) {
|
||||
if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) {
|
||||
result = SCAN_CGROUP_CHARGE_FAIL;
|
||||
goto out_nolock;
|
||||
}
|
||||
@ -1214,7 +1209,7 @@ static void collapse_huge_page(struct mm_struct *mm,
|
||||
mmap_write_unlock(mm);
|
||||
out_nolock:
|
||||
if (!IS_ERR_OR_NULL(*hpage))
|
||||
mem_cgroup_uncharge(*hpage);
|
||||
mem_cgroup_uncharge(page_folio(*hpage));
|
||||
trace_mm_collapse_huge_page(mm, isolated, result);
|
||||
return;
|
||||
}
|
||||
@ -1261,6 +1256,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
|
||||
continue;
|
||||
} else {
|
||||
result = SCAN_EXCEED_SWAP_PTE;
|
||||
count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
|
||||
goto out_unmap;
|
||||
}
|
||||
}
|
||||
@ -1270,6 +1266,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
|
||||
continue;
|
||||
} else {
|
||||
result = SCAN_EXCEED_NONE_PTE;
|
||||
count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
|
||||
goto out_unmap;
|
||||
}
|
||||
}
|
||||
@ -1298,6 +1295,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
|
||||
if (page_mapcount(page) > 1 &&
|
||||
++shared > khugepaged_max_ptes_shared) {
|
||||
result = SCAN_EXCEED_SHARED_PTE;
|
||||
count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
|
||||
goto out_unmap;
|
||||
}
|
||||
|
||||
@ -1306,7 +1304,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
|
||||
/*
|
||||
* Record which node the original page is from and save this
|
||||
* information to khugepaged_node_load[].
|
||||
* Khupaged will allocate hugepage from the node has the max
|
||||
* Khugepaged will allocate hugepage from the node has the max
|
||||
* hit record.
|
||||
*/
|
||||
node = page_to_nid(page);
|
||||
@ -1419,6 +1417,21 @@ static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
|
||||
unsigned long addr, pmd_t *pmdp)
|
||||
{
|
||||
spinlock_t *ptl;
|
||||
pmd_t pmd;
|
||||
|
||||
mmap_assert_write_locked(mm);
|
||||
ptl = pmd_lock(vma->vm_mm, pmdp);
|
||||
pmd = pmdp_collapse_flush(vma, addr, pmdp);
|
||||
spin_unlock(ptl);
|
||||
mm_dec_nr_ptes(mm);
|
||||
page_table_check_pte_clear_range(mm, addr, pmd);
|
||||
pte_free(mm, pmd_pgtable(pmd));
|
||||
}
|
||||
|
||||
/**
|
||||
* collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at
|
||||
* address haddr.
|
||||
@ -1436,7 +1449,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
|
||||
struct vm_area_struct *vma = find_vma(mm, haddr);
|
||||
struct page *hpage;
|
||||
pte_t *start_pte, *pte;
|
||||
pmd_t *pmd, _pmd;
|
||||
pmd_t *pmd;
|
||||
spinlock_t *ptl;
|
||||
int count = 0;
|
||||
int i;
|
||||
@ -1512,12 +1525,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
|
||||
}
|
||||
|
||||
/* step 4: collapse pmd */
|
||||
ptl = pmd_lock(vma->vm_mm, pmd);
|
||||
_pmd = pmdp_collapse_flush(vma, haddr, pmd);
|
||||
spin_unlock(ptl);
|
||||
mm_dec_nr_ptes(mm);
|
||||
pte_free(mm, pmd_pgtable(_pmd));
|
||||
|
||||
collapse_and_free_pmd(mm, vma, haddr, pmd);
|
||||
drop_hpage:
|
||||
unlock_page(hpage);
|
||||
put_page(hpage);
|
||||
@ -1555,7 +1563,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
|
||||
struct vm_area_struct *vma;
|
||||
struct mm_struct *mm;
|
||||
unsigned long addr;
|
||||
pmd_t *pmd, _pmd;
|
||||
pmd_t *pmd;
|
||||
|
||||
i_mmap_lock_write(mapping);
|
||||
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
|
||||
@ -1594,14 +1602,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
|
||||
* reverse order. Trylock is a way to avoid deadlock.
|
||||
*/
|
||||
if (mmap_write_trylock(mm)) {
|
||||
if (!khugepaged_test_exit(mm)) {
|
||||
spinlock_t *ptl = pmd_lock(mm, pmd);
|
||||
/* assume page table is clear */
|
||||
_pmd = pmdp_collapse_flush(vma, addr, pmd);
|
||||
spin_unlock(ptl);
|
||||
mm_dec_nr_ptes(mm);
|
||||
pte_free(mm, pmd_pgtable(_pmd));
|
||||
}
|
||||
if (!khugepaged_test_exit(mm))
|
||||
collapse_and_free_pmd(mm, vma, addr, pmd);
|
||||
mmap_write_unlock(mm);
|
||||
} else {
|
||||
/* Try again later */
|
||||
@ -1661,13 +1663,16 @@ static void collapse_file(struct mm_struct *mm,
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) {
|
||||
if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) {
|
||||
result = SCAN_CGROUP_CHARGE_FAIL;
|
||||
goto out;
|
||||
}
|
||||
count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
|
||||
|
||||
/* This will be less messy when we use multi-index entries */
|
||||
/*
|
||||
* Ensure we have slots for all the pages in the range. This is
|
||||
* almost certainly a no-op because most of the pages must be present
|
||||
*/
|
||||
do {
|
||||
xas_lock_irq(&xas);
|
||||
xas_create_range(&xas);
|
||||
@ -1892,6 +1897,9 @@ static void collapse_file(struct mm_struct *mm,
|
||||
__mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
|
||||
}
|
||||
|
||||
/* Join all the small entries into a single multi-index entry */
|
||||
xas_set_order(&xas, start, HPAGE_PMD_ORDER);
|
||||
xas_store(&xas, new_page);
|
||||
xa_locked:
|
||||
xas_unlock_irq(&xas);
|
||||
xa_unlocked:
|
||||
@ -1983,7 +1991,7 @@ static void collapse_file(struct mm_struct *mm,
|
||||
out:
|
||||
VM_BUG_ON(!list_empty(&pagelist));
|
||||
if (!IS_ERR_OR_NULL(*hpage))
|
||||
mem_cgroup_uncharge(*hpage);
|
||||
mem_cgroup_uncharge(page_folio(*hpage));
|
||||
/* TODO: tracepoints */
|
||||
}
|
||||
|
||||
@ -2008,11 +2016,16 @@ static void khugepaged_scan_file(struct mm_struct *mm,
|
||||
if (xa_is_value(page)) {
|
||||
if (++swap > khugepaged_max_ptes_swap) {
|
||||
result = SCAN_EXCEED_SWAP_PTE;
|
||||
count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
|
||||
break;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* XXX: khugepaged should compact smaller compound pages
|
||||
* into a PMD sized page
|
||||
*/
|
||||
if (PageTransCompound(page)) {
|
||||
result = SCAN_PAGE_COMPOUND;
|
||||
break;
|
||||
@ -2054,6 +2067,7 @@ static void khugepaged_scan_file(struct mm_struct *mm,
|
||||
if (result == SCAN_SUCCEED) {
|
||||
if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
|
||||
result = SCAN_EXCEED_NONE_PTE;
|
||||
count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
|
||||
} else {
|
||||
node = khugepaged_find_target_node();
|
||||
collapse_file(mm, file, start, hpage, node);
|
||||
@ -2299,6 +2313,11 @@ static void set_recommended_min_free_kbytes(void)
|
||||
int nr_zones = 0;
|
||||
unsigned long recommended_min;
|
||||
|
||||
if (!khugepaged_enabled()) {
|
||||
calculate_min_free_kbytes();
|
||||
goto update_wmarks;
|
||||
}
|
||||
|
||||
for_each_populated_zone(zone) {
|
||||
/*
|
||||
* We don't need to worry about fragmentation of
|
||||
@ -2334,6 +2353,8 @@ static void set_recommended_min_free_kbytes(void)
|
||||
|
||||
min_free_kbytes = recommended_min;
|
||||
}
|
||||
|
||||
update_wmarks:
|
||||
setup_per_zone_wmarks();
|
||||
}
|
||||
|
||||
@ -2355,12 +2376,11 @@ int start_stop_khugepaged(void)
|
||||
|
||||
if (!list_empty(&khugepaged_scan.mm_head))
|
||||
wake_up_interruptible(&khugepaged_wait);
|
||||
|
||||
set_recommended_min_free_kbytes();
|
||||
} else if (khugepaged_thread) {
|
||||
kthread_stop(khugepaged_thread);
|
||||
khugepaged_thread = NULL;
|
||||
}
|
||||
set_recommended_min_free_kbytes();
|
||||
fail:
|
||||
mutex_unlock(&khugepaged_mutex);
|
||||
return err;
|
||||
|
@ -381,15 +381,20 @@ static void dump_object_info(struct kmemleak_object *object)
static struct kmemleak_object *lookup_object(unsigned long ptr, int alias)
{
struct rb_node *rb = object_tree_root.rb_node;
unsigned long untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr);

while (rb) {
struct kmemleak_object *object =
rb_entry(rb, struct kmemleak_object, rb_node);
if (ptr < object->pointer)
struct kmemleak_object *object;
unsigned long untagged_objp;

object = rb_entry(rb, struct kmemleak_object, rb_node);
untagged_objp = (unsigned long)kasan_reset_tag((void *)object->pointer);

if (untagged_ptr < untagged_objp)
rb = object->rb_node.rb_left;
else if (object->pointer + object->size <= ptr)
else if (untagged_objp + object->size <= untagged_ptr)
rb = object->rb_node.rb_right;
else if (object->pointer == ptr || alias)
else if (untagged_objp == untagged_ptr || alias)
return object;
else {
kmemleak_warn("Found object by alias at 0x%08lx\n",
@ -576,6 +581,7 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
struct kmemleak_object *object, *parent;
struct rb_node **link, *rb_parent;
unsigned long untagged_ptr;
unsigned long untagged_objp;

object = mem_pool_alloc(gfp);
if (!object) {
@ -629,9 +635,10 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size,
while (*link) {
rb_parent = *link;
parent = rb_entry(rb_parent, struct kmemleak_object, rb_node);
if (ptr + size <= parent->pointer)
untagged_objp = (unsigned long)kasan_reset_tag((void *)parent->pointer);
if (untagged_ptr + size <= untagged_objp)
link = &parent->rb_node.rb_left;
else if (parent->pointer + parent->size <= ptr)
else if (untagged_objp + parent->size <= untagged_ptr)
link = &parent->rb_node.rb_right;
else {
kmemleak_stop("Cannot insert 0x%lx into the object search tree (overlaps existing)\n",

39
mm/ksm.c
39
mm/ksm.c
@ -15,6 +15,7 @@

#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/fs.h>
#include <linux/mman.h>
#include <linux/sched.h>
@ -751,7 +752,7 @@ static struct page *get_ksm_page(struct stable_node *stable_node,
/*
* We come here from above when page->mapping or !PageSwapCache
* suggests that the node is stale; but it might be under migration.
* We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(),
* We need smp_rmb(), matching the smp_wmb() in folio_migrate_ksm(),
* before checking whether node->kpfn has been changed.
*/
smp_rmb();
@ -852,9 +853,14 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
return err;
}

static inline struct stable_node *folio_stable_node(struct folio *folio)
{
return folio_test_ksm(folio) ? folio_raw_mapping(folio) : NULL;
}

static inline struct stable_node *page_stable_node(struct page *page)
{
return PageKsm(page) ? page_rmapping(page) : NULL;
return folio_stable_node(page_folio(page));
}

static inline void set_page_stable_node(struct page *page,
@ -2570,15 +2576,16 @@ struct page *ksm_might_need_to_copy(struct page *page,
return page; /* no need to copy it */
} else if (!anon_vma) {
return page; /* no need to copy it */
} else if (anon_vma->root == vma->anon_vma->root &&
page->index == linear_page_index(vma, address)) {
} else if (page->index == linear_page_index(vma, address) &&
anon_vma->root == vma->anon_vma->root) {
return page; /* still no need to copy it */
}
if (!PageUptodate(page))
return page; /* let do_swap_page report the error */

new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
if (new_page && mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL)) {
if (new_page &&
mem_cgroup_charge(page_folio(new_page), vma->vm_mm, GFP_KERNEL)) {
put_page(new_page);
new_page = NULL;
}
@ -2658,26 +2665,26 @@ void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
}

#ifdef CONFIG_MIGRATION
void ksm_migrate_page(struct page *newpage, struct page *oldpage)
void folio_migrate_ksm(struct folio *newfolio, struct folio *folio)
{
struct stable_node *stable_node;

VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
VM_BUG_ON_PAGE(newpage->mapping != oldpage->mapping, newpage);
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_FOLIO(!folio_test_locked(newfolio), newfolio);
VM_BUG_ON_FOLIO(newfolio->mapping != folio->mapping, newfolio);

stable_node = page_stable_node(newpage);
stable_node = folio_stable_node(folio);
if (stable_node) {
VM_BUG_ON_PAGE(stable_node->kpfn != page_to_pfn(oldpage), oldpage);
stable_node->kpfn = page_to_pfn(newpage);
VM_BUG_ON_FOLIO(stable_node->kpfn != folio_pfn(folio), folio);
stable_node->kpfn = folio_pfn(newfolio);
/*
* newpage->mapping was set in advance; now we need smp_wmb()
* newfolio->mapping was set in advance; now we need smp_wmb()
* to make sure that the new stable_node->kpfn is visible
* to get_ksm_page() before it can see that oldpage->mapping
* has gone stale (or that PageSwapCache has been cleared).
* to get_ksm_page() before it can see that folio->mapping
* has gone stale (or that folio_test_swapcache has been cleared).
*/
smp_wmb();
set_page_stable_node(oldpage, NULL);
set_page_stable_node(&folio->page, NULL);
}
}
#endif /* CONFIG_MIGRATION */

@ -15,18 +15,29 @@
#include "slab.h"

#ifdef CONFIG_MEMCG_KMEM
static LIST_HEAD(list_lrus);
static LIST_HEAD(memcg_list_lrus);
static DEFINE_MUTEX(list_lrus_mutex);

static inline bool list_lru_memcg_aware(struct list_lru *lru)
{
return lru->memcg_aware;
}

static void list_lru_register(struct list_lru *lru)
{
if (!list_lru_memcg_aware(lru))
return;

mutex_lock(&list_lrus_mutex);
list_add(&lru->list, &list_lrus);
list_add(&lru->list, &memcg_list_lrus);
mutex_unlock(&list_lrus_mutex);
}

static void list_lru_unregister(struct list_lru *lru)
{
if (!list_lru_memcg_aware(lru))
return;

mutex_lock(&list_lrus_mutex);
list_del(&lru->list);
mutex_unlock(&list_lrus_mutex);
@ -37,11 +48,6 @@ static int lru_shrinker_id(struct list_lru *lru)
return lru->shrinker_id;
}

static inline bool list_lru_memcg_aware(struct list_lru *lru)
{
return lru->memcg_aware;
}

static inline struct list_lru_one *
list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
{
@ -176,13 +182,16 @@ unsigned long list_lru_count_one(struct list_lru *lru,
{
struct list_lru_node *nlru = &lru->node[nid];
struct list_lru_one *l;
unsigned long count;
long count;

rcu_read_lock();
l = list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg));
count = READ_ONCE(l->nr_items);
rcu_read_unlock();

if (unlikely(count < 0))
count = 0;

return count;
}
EXPORT_SYMBOL_GPL(list_lru_count_one);
@ -354,8 +363,7 @@ static int memcg_init_list_lru_node(struct list_lru_node *nlru)
struct list_lru_memcg *memcg_lrus;
int size = memcg_nr_cache_ids;

memcg_lrus = kvmalloc(sizeof(*memcg_lrus) +
size * sizeof(void *), GFP_KERNEL);
memcg_lrus = kvmalloc(struct_size(memcg_lrus, lru, size), GFP_KERNEL);
if (!memcg_lrus)
return -ENOMEM;

@ -389,7 +397,7 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,

old = rcu_dereference_protected(nlru->memcg_lrus,
lockdep_is_held(&list_lrus_mutex));
new = kvmalloc(sizeof(*new) + new_size * sizeof(void *), GFP_KERNEL);
new = kvmalloc(struct_size(new, lru, new_size), GFP_KERNEL);
if (!new)
return -ENOMEM;

@ -398,19 +406,8 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,
return -ENOMEM;
}

memcpy(&new->lru, &old->lru, old_size * sizeof(void *));

/*
* The locking below allows readers that hold nlru->lock avoid taking
* rcu_read_lock (see list_lru_from_memcg_idx).
*
* Since list_lru_{add,del} may be called under an IRQ-safe lock,
* we have to use IRQ-safe primitives here to avoid deadlock.
*/
spin_lock_irq(&nlru->lock);
memcpy(&new->lru, &old->lru, flex_array_size(new, lru, old_size));
rcu_assign_pointer(nlru->memcg_lrus, new);
spin_unlock_irq(&nlru->lock);

kvfree_rcu(old, rcu);
return 0;
}
@ -466,9 +463,6 @@ static int memcg_update_list_lru(struct list_lru *lru,
{
int i;

if (!list_lru_memcg_aware(lru))
return 0;

for_each_node(i) {
if (memcg_update_list_lru_node(&lru->node[i],
old_size, new_size))
@ -491,9 +485,6 @@ static void memcg_cancel_update_list_lru(struct list_lru *lru,
{
int i;

if (!list_lru_memcg_aware(lru))
return;

for_each_node(i)
memcg_cancel_update_list_lru_node(&lru->node[i],
old_size, new_size);
@ -506,7 +497,7 @@ int memcg_update_all_list_lrus(int new_size)
int old_size = memcg_nr_cache_ids;

mutex_lock(&list_lrus_mutex);
list_for_each_entry(lru, &list_lrus, list) {
list_for_each_entry(lru, &memcg_list_lrus, list) {
ret = memcg_update_list_lru(lru, old_size, new_size);
if (ret)
goto fail;
@ -515,7 +506,7 @@ int memcg_update_all_list_lrus(int new_size)
mutex_unlock(&list_lrus_mutex);
return ret;
fail:
list_for_each_entry_continue_reverse(lru, &list_lrus, list)
list_for_each_entry_continue_reverse(lru, &memcg_list_lrus, list)
memcg_cancel_update_list_lru(lru, old_size, new_size);
goto out;
}
@ -552,9 +543,6 @@ static void memcg_drain_list_lru(struct list_lru *lru,
{
int i;

if (!list_lru_memcg_aware(lru))
return;

for_each_node(i)
memcg_drain_list_lru_node(lru, i, src_idx, dst_memcg);
}
@ -564,7 +552,7 @@ void memcg_drain_all_list_lrus(int src_idx, struct mem_cgroup *dst_memcg)
struct list_lru *lru;

mutex_lock(&list_lrus_mutex);
list_for_each_entry(lru, &list_lrus, list)
list_for_each_entry(lru, &memcg_list_lrus, list)
memcg_drain_list_lru(lru, src_idx, dst_memcg);
mutex_unlock(&list_lrus_mutex);
}

489
mm/madvise.c
489
mm/madvise.c
@ -18,6 +18,8 @@
|
||||
#include <linux/fadvise.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/sched/mm.h>
|
||||
#include <linux/mm_inline.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/uio.h>
|
||||
#include <linux/ksm.h>
|
||||
#include <linux/fs.h>
|
||||
@ -62,83 +64,94 @@ static int madvise_need_mmap_write(int behavior)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* We can potentially split a vm area into separate
|
||||
* areas, each area with its own behavior.
|
||||
*/
|
||||
static long madvise_behavior(struct vm_area_struct *vma,
|
||||
struct vm_area_struct **prev,
|
||||
unsigned long start, unsigned long end, int behavior)
|
||||
#ifdef CONFIG_ANON_VMA_NAME
|
||||
struct anon_vma_name *anon_vma_name_alloc(const char *name)
|
||||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
int error = 0;
|
||||
pgoff_t pgoff;
|
||||
unsigned long new_flags = vma->vm_flags;
|
||||
struct anon_vma_name *anon_name;
|
||||
size_t count;
|
||||
|
||||
switch (behavior) {
|
||||
case MADV_NORMAL:
|
||||
new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
|
||||
break;
|
||||
case MADV_SEQUENTIAL:
|
||||
new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
|
||||
break;
|
||||
case MADV_RANDOM:
|
||||
new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
|
||||
break;
|
||||
case MADV_DONTFORK:
|
||||
new_flags |= VM_DONTCOPY;
|
||||
break;
|
||||
case MADV_DOFORK:
|
||||
if (vma->vm_flags & VM_IO) {
|
||||
error = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
new_flags &= ~VM_DONTCOPY;
|
||||
break;
|
||||
case MADV_WIPEONFORK:
|
||||
/* MADV_WIPEONFORK is only supported on anonymous memory. */
|
||||
if (vma->vm_file || vma->vm_flags & VM_SHARED) {
|
||||
error = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
new_flags |= VM_WIPEONFORK;
|
||||
break;
|
||||
case MADV_KEEPONFORK:
|
||||
new_flags &= ~VM_WIPEONFORK;
|
||||
break;
|
||||
case MADV_DONTDUMP:
|
||||
new_flags |= VM_DONTDUMP;
|
||||
break;
|
||||
case MADV_DODUMP:
|
||||
if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) {
|
||||
error = -EINVAL;
|
||||
goto out;
|
||||
}
|
||||
new_flags &= ~VM_DONTDUMP;
|
||||
break;
|
||||
case MADV_MERGEABLE:
|
||||
case MADV_UNMERGEABLE:
|
||||
error = ksm_madvise(vma, start, end, behavior, &new_flags);
|
||||
if (error)
|
||||
goto out_convert_errno;
|
||||
break;
|
||||
case MADV_HUGEPAGE:
|
||||
case MADV_NOHUGEPAGE:
|
||||
error = hugepage_madvise(vma, &new_flags, behavior);
|
||||
if (error)
|
||||
goto out_convert_errno;
|
||||
break;
|
||||
/* Add 1 for NUL terminator at the end of the anon_name->name */
|
||||
count = strlen(name) + 1;
|
||||
anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL);
|
||||
if (anon_name) {
|
||||
kref_init(&anon_name->kref);
|
||||
memcpy(anon_name->name, name, count);
|
||||
}
|
||||
|
||||
if (new_flags == vma->vm_flags) {
|
||||
return anon_name;
|
||||
}
|
||||
|
||||
void anon_vma_name_free(struct kref *kref)
|
||||
{
|
||||
struct anon_vma_name *anon_name =
|
||||
container_of(kref, struct anon_vma_name, kref);
|
||||
kfree(anon_name);
|
||||
}
|
||||
|
||||
struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma)
|
||||
{
|
||||
mmap_assert_locked(vma->vm_mm);
|
||||
|
||||
if (vma->vm_file)
|
||||
return NULL;
|
||||
|
||||
return vma->anon_name;
|
||||
}
|
||||
|
||||
/* mmap_lock should be write-locked */
|
||||
static int replace_anon_vma_name(struct vm_area_struct *vma,
|
||||
struct anon_vma_name *anon_name)
|
||||
{
|
||||
struct anon_vma_name *orig_name = anon_vma_name(vma);
|
||||
|
||||
if (!anon_name) {
|
||||
vma->anon_name = NULL;
|
||||
anon_vma_name_put(orig_name);
|
||||
return 0;
|
||||
}
|
||||
|
||||
if (anon_vma_name_eq(orig_name, anon_name))
|
||||
return 0;
|
||||
|
||||
vma->anon_name = anon_vma_name_reuse(anon_name);
|
||||
anon_vma_name_put(orig_name);
|
||||
|
||||
return 0;
|
||||
}
|
||||
#else /* CONFIG_ANON_VMA_NAME */
|
||||
static int replace_anon_vma_name(struct vm_area_struct *vma,
|
||||
struct anon_vma_name *anon_name)
|
||||
{
|
||||
if (anon_name)
|
||||
return -EINVAL;
|
||||
|
||||
return 0;
|
||||
}
|
||||
#endif /* CONFIG_ANON_VMA_NAME */
|
||||
/*
|
||||
* Update the vm_flags on region of a vma, splitting it or merging it as
|
||||
* necessary. Must be called with mmap_sem held for writing;
|
||||
* Caller should ensure anon_name stability by raising its refcount even when
|
||||
* anon_name belongs to a valid vma because this function might free that vma.
|
||||
*/
|
||||
static int madvise_update_vma(struct vm_area_struct *vma,
|
||||
struct vm_area_struct **prev, unsigned long start,
|
||||
unsigned long end, unsigned long new_flags,
|
||||
struct anon_vma_name *anon_name)
|
||||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
int error;
|
||||
pgoff_t pgoff;
|
||||
|
||||
if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
|
||||
*prev = vma;
|
||||
goto out;
|
||||
return 0;
|
||||
}
|
||||
|
||||
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
|
||||
*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
|
||||
vma->vm_file, pgoff, vma_policy(vma),
|
||||
vma->vm_userfaultfd_ctx);
|
||||
vma->vm_userfaultfd_ctx, anon_name);
|
||||
if (*prev) {
|
||||
vma = *prev;
|
||||
goto success;
|
||||
@ -147,23 +160,19 @@ static long madvise_behavior(struct vm_area_struct *vma,
|
||||
*prev = vma;
|
||||
|
||||
if (start != vma->vm_start) {
|
||||
if (unlikely(mm->map_count >= sysctl_max_map_count)) {
|
||||
error = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
if (unlikely(mm->map_count >= sysctl_max_map_count))
|
||||
return -ENOMEM;
|
||||
error = __split_vma(mm, vma, start, 1);
|
||||
if (error)
|
||||
goto out_convert_errno;
|
||||
return error;
|
||||
}
|
||||
|
||||
if (end != vma->vm_end) {
|
||||
if (unlikely(mm->map_count >= sysctl_max_map_count)) {
|
||||
error = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
if (unlikely(mm->map_count >= sysctl_max_map_count))
|
||||
return -ENOMEM;
|
||||
error = __split_vma(mm, vma, end, 0);
|
||||
if (error)
|
||||
goto out_convert_errno;
|
||||
return error;
|
||||
}
|
||||
|
||||
success:
|
||||
@ -171,16 +180,13 @@ static long madvise_behavior(struct vm_area_struct *vma,
|
||||
* vm_flags is protected by the mmap_lock held in write mode.
|
||||
*/
|
||||
vma->vm_flags = new_flags;
|
||||
if (!vma->vm_file) {
|
||||
error = replace_anon_vma_name(vma, anon_name);
|
||||
if (error)
|
||||
return error;
|
||||
}
|
||||
|
||||
out_convert_errno:
|
||||
/*
|
||||
* madvise() returns EAGAIN if kernel resources, such as
|
||||
* slab, are temporarily unavailable.
|
||||
*/
|
||||
if (error == -ENOMEM)
|
||||
error = -EAGAIN;
|
||||
out:
|
||||
return error;
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SWAP
|
||||
@ -930,6 +936,99 @@ static long madvise_remove(struct vm_area_struct *vma,
|
||||
return error;
|
||||
}
|
||||
|
||||
/*
|
||||
* Apply an madvise behavior to a region of a vma. madvise_update_vma
|
||||
* will handle splitting a vm area into separate areas, each area with its own
|
||||
* behavior.
|
||||
*/
|
||||
static int madvise_vma_behavior(struct vm_area_struct *vma,
|
||||
struct vm_area_struct **prev,
|
||||
unsigned long start, unsigned long end,
|
||||
unsigned long behavior)
|
||||
{
|
||||
int error;
|
||||
struct anon_vma_name *anon_name;
|
||||
unsigned long new_flags = vma->vm_flags;
|
||||
|
||||
switch (behavior) {
|
||||
case MADV_REMOVE:
|
||||
return madvise_remove(vma, prev, start, end);
|
||||
case MADV_WILLNEED:
|
||||
return madvise_willneed(vma, prev, start, end);
|
||||
case MADV_COLD:
|
||||
return madvise_cold(vma, prev, start, end);
|
||||
case MADV_PAGEOUT:
|
||||
return madvise_pageout(vma, prev, start, end);
|
||||
case MADV_FREE:
|
||||
case MADV_DONTNEED:
|
||||
return madvise_dontneed_free(vma, prev, start, end, behavior);
|
||||
case MADV_POPULATE_READ:
|
||||
case MADV_POPULATE_WRITE:
|
||||
return madvise_populate(vma, prev, start, end, behavior);
|
||||
case MADV_NORMAL:
|
||||
new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ;
|
||||
break;
|
||||
case MADV_SEQUENTIAL:
|
||||
new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ;
|
||||
break;
|
||||
case MADV_RANDOM:
|
||||
new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ;
|
||||
break;
|
||||
case MADV_DONTFORK:
|
||||
new_flags |= VM_DONTCOPY;
|
||||
break;
|
||||
case MADV_DOFORK:
|
||||
if (vma->vm_flags & VM_IO)
|
||||
return -EINVAL;
|
||||
new_flags &= ~VM_DONTCOPY;
|
||||
break;
|
||||
case MADV_WIPEONFORK:
|
||||
/* MADV_WIPEONFORK is only supported on anonymous memory. */
|
||||
if (vma->vm_file || vma->vm_flags & VM_SHARED)
|
||||
return -EINVAL;
|
||||
new_flags |= VM_WIPEONFORK;
|
||||
break;
|
||||
case MADV_KEEPONFORK:
|
||||
new_flags &= ~VM_WIPEONFORK;
|
||||
break;
|
||||
case MADV_DONTDUMP:
|
||||
new_flags |= VM_DONTDUMP;
|
||||
break;
|
||||
case MADV_DODUMP:
|
||||
if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL)
|
||||
return -EINVAL;
|
||||
new_flags &= ~VM_DONTDUMP;
|
||||
break;
|
||||
case MADV_MERGEABLE:
|
||||
case MADV_UNMERGEABLE:
|
||||
error = ksm_madvise(vma, start, end, behavior, &new_flags);
|
||||
if (error)
|
||||
goto out;
|
||||
break;
|
||||
case MADV_HUGEPAGE:
|
||||
case MADV_NOHUGEPAGE:
|
||||
error = hugepage_madvise(vma, &new_flags, behavior);
|
||||
if (error)
|
||||
goto out;
|
||||
break;
|
||||
}
|
||||
|
||||
anon_name = anon_vma_name(vma);
|
||||
anon_vma_name_get(anon_name);
|
||||
error = madvise_update_vma(vma, prev, start, end, new_flags,
|
||||
anon_name);
|
||||
anon_vma_name_put(anon_name);
|
||||
|
||||
out:
|
||||
/*
|
||||
* madvise() returns EAGAIN if kernel resources, such as
|
||||
* slab, are temporarily unavailable.
|
||||
*/
|
||||
if (error == -ENOMEM)
|
||||
error = -EAGAIN;
|
||||
return error;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMORY_FAILURE
|
||||
/*
|
||||
* Error injection support for memory error handling.
|
||||
@ -978,30 +1077,6 @@ static int madvise_inject_error(int behavior,
|
||||
}
|
||||
#endif
|
||||
|
||||
static long
|
||||
madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
|
||||
unsigned long start, unsigned long end, int behavior)
|
||||
{
|
||||
switch (behavior) {
|
||||
case MADV_REMOVE:
|
||||
return madvise_remove(vma, prev, start, end);
|
||||
case MADV_WILLNEED:
|
||||
return madvise_willneed(vma, prev, start, end);
|
||||
case MADV_COLD:
|
||||
return madvise_cold(vma, prev, start, end);
|
||||
case MADV_PAGEOUT:
|
||||
return madvise_pageout(vma, prev, start, end);
|
||||
case MADV_FREE:
|
||||
case MADV_DONTNEED:
|
||||
return madvise_dontneed_free(vma, prev, start, end, behavior);
|
||||
case MADV_POPULATE_READ:
|
||||
case MADV_POPULATE_WRITE:
|
||||
return madvise_populate(vma, prev, start, end, behavior);
|
||||
default:
|
||||
return madvise_behavior(vma, prev, start, end, behavior);
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
madvise_behavior_valid(int behavior)
|
||||
{
|
||||
@ -1055,6 +1130,122 @@ process_madvise_behavior_valid(int behavior)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Walk the vmas in range [start,end), and call the visit function on each one.
|
||||
* The visit function will get start and end parameters that cover the overlap
|
||||
* between the current vma and the original range. Any unmapped regions in the
|
||||
* original range will result in this function returning -ENOMEM while still
|
||||
* calling the visit function on all of the existing vmas in the range.
|
||||
* Must be called with the mmap_lock held for reading or writing.
|
||||
*/
|
||||
static
|
||||
int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
|
||||
unsigned long end, unsigned long arg,
|
||||
int (*visit)(struct vm_area_struct *vma,
|
||||
struct vm_area_struct **prev, unsigned long start,
|
||||
unsigned long end, unsigned long arg))
|
||||
{
|
||||
struct vm_area_struct *vma;
|
||||
struct vm_area_struct *prev;
|
||||
unsigned long tmp;
|
||||
int unmapped_error = 0;
|
||||
|
||||
/*
|
||||
* If the interval [start,end) covers some unmapped address
|
||||
* ranges, just ignore them, but return -ENOMEM at the end.
|
||||
* - different from the way of handling in mlock etc.
|
||||
*/
|
||||
vma = find_vma_prev(mm, start, &prev);
|
||||
if (vma && start > vma->vm_start)
|
||||
prev = vma;
|
||||
|
||||
for (;;) {
|
||||
int error;
|
||||
|
||||
/* Still start < end. */
|
||||
if (!vma)
|
||||
return -ENOMEM;
|
||||
|
||||
/* Here start < (end|vma->vm_end). */
|
||||
if (start < vma->vm_start) {
|
||||
unmapped_error = -ENOMEM;
|
||||
start = vma->vm_start;
|
||||
if (start >= end)
|
||||
break;
|
||||
}
|
||||
|
||||
/* Here vma->vm_start <= start < (end|vma->vm_end) */
|
||||
tmp = vma->vm_end;
|
||||
if (end < tmp)
|
||||
tmp = end;
|
||||
|
||||
/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
|
||||
error = visit(vma, &prev, start, tmp, arg);
|
||||
if (error)
|
||||
return error;
|
||||
start = tmp;
|
||||
if (prev && start < prev->vm_end)
|
||||
start = prev->vm_end;
|
||||
if (start >= end)
|
||||
break;
|
||||
if (prev)
|
||||
vma = prev->vm_next;
|
||||
else /* madvise_remove dropped mmap_lock */
|
||||
vma = find_vma(mm, start);
|
||||
}
|
||||
|
||||
return unmapped_error;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_ANON_VMA_NAME
|
||||
static int madvise_vma_anon_name(struct vm_area_struct *vma,
|
||||
struct vm_area_struct **prev,
|
||||
unsigned long start, unsigned long end,
|
||||
unsigned long anon_name)
|
||||
{
|
||||
int error;
|
||||
|
||||
/* Only anonymous mappings can be named */
|
||||
if (vma->vm_file)
|
||||
return -EBADF;
|
||||
|
||||
error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
|
||||
(struct anon_vma_name *)anon_name);
|
||||
|
||||
/*
|
||||
* madvise() returns EAGAIN if kernel resources, such as
|
||||
* slab, are temporarily unavailable.
|
||||
*/
|
||||
if (error == -ENOMEM)
|
||||
error = -EAGAIN;
|
||||
return error;
|
||||
}
|
||||
|
||||
int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
|
||||
unsigned long len_in, struct anon_vma_name *anon_name)
|
||||
{
|
||||
unsigned long end;
|
||||
unsigned long len;
|
||||
|
||||
if (start & ~PAGE_MASK)
|
||||
return -EINVAL;
|
||||
len = (len_in + ~PAGE_MASK) & PAGE_MASK;
|
||||
|
||||
/* Check to see whether len was rounded up from small -ve to zero */
|
||||
if (len_in && !len)
|
||||
return -EINVAL;
|
||||
|
||||
end = start + len;
|
||||
if (end < start)
|
||||
return -EINVAL;
|
||||
|
||||
if (end == start)
|
||||
return 0;
|
||||
|
||||
return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
|
||||
madvise_vma_anon_name);
|
||||
}
|
||||
#endif /* CONFIG_ANON_VMA_NAME */
|
||||
/*
|
||||
* The madvise(2) system call.
|
||||
*
|
||||
@ -1127,10 +1318,8 @@ process_madvise_behavior_valid(int behavior)
|
||||
*/
|
||||
int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior)
|
||||
{
|
||||
unsigned long end, tmp;
|
||||
struct vm_area_struct *vma, *prev;
|
||||
int unmapped_error = 0;
|
||||
int error = -EINVAL;
|
||||
unsigned long end;
|
||||
int error;
|
||||
int write;
|
||||
size_t len;
|
||||
struct blk_plug plug;
|
||||
@ -1138,23 +1327,22 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
|
||||
start = untagged_addr(start);
|
||||
|
||||
if (!madvise_behavior_valid(behavior))
|
||||
return error;
|
||||
return -EINVAL;
|
||||
|
||||
if (!PAGE_ALIGNED(start))
|
||||
return error;
|
||||
return -EINVAL;
|
||||
len = PAGE_ALIGN(len_in);
|
||||
|
||||
/* Check to see whether len was rounded up from small -ve to zero */
|
||||
if (len_in && !len)
|
||||
return error;
|
||||
return -EINVAL;
|
||||
|
||||
end = start + len;
|
||||
if (end < start)
|
||||
return error;
|
||||
return -EINVAL;
|
||||
|
||||
error = 0;
|
||||
if (end == start)
|
||||
return error;
|
||||
return 0;
|
||||
|
||||
#ifdef CONFIG_MEMORY_FAILURE
|
||||
if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
|
||||
@ -1169,51 +1357,9 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
|
||||
mmap_read_lock(mm);
|
||||
}
|
||||
|
||||
/*
|
||||
* If the interval [start,end) covers some unmapped address
|
||||
* ranges, just ignore them, but return -ENOMEM at the end.
|
||||
* - different from the way of handling in mlock etc.
|
||||
*/
|
||||
vma = find_vma_prev(mm, start, &prev);
|
||||
if (vma && start > vma->vm_start)
|
||||
prev = vma;
|
||||
|
||||
blk_start_plug(&plug);
|
||||
for (;;) {
|
||||
/* Still start < end. */
|
||||
error = -ENOMEM;
|
||||
if (!vma)
|
||||
goto out;
|
||||
|
||||
/* Here start < (end|vma->vm_end). */
|
||||
if (start < vma->vm_start) {
|
||||
unmapped_error = -ENOMEM;
|
||||
start = vma->vm_start;
|
||||
if (start >= end)
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* Here vma->vm_start <= start < (end|vma->vm_end) */
|
||||
tmp = vma->vm_end;
|
||||
if (end < tmp)
|
||||
tmp = end;
|
||||
|
||||
/* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
|
||||
error = madvise_vma(vma, &prev, start, tmp, behavior);
|
||||
if (error)
|
||||
goto out;
|
||||
start = tmp;
|
||||
if (prev && start < prev->vm_end)
|
||||
start = prev->vm_end;
|
||||
error = unmapped_error;
|
||||
if (start >= end)
|
||||
goto out;
|
||||
if (prev)
|
||||
vma = prev->vm_next;
|
||||
else /* madvise_remove dropped mmap_lock */
|
||||
vma = find_vma(mm, start);
|
||||
}
|
||||
out:
|
||||
error = madvise_walk_vmas(mm, start, end, behavior,
|
||||
madvise_vma_behavior);
|
||||
blk_finish_plug(&plug);
|
||||
if (write)
|
||||
mmap_write_unlock(mm);
|
||||
@ -1235,7 +1381,6 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
|
||||
struct iovec iovstack[UIO_FASTIOV], iovec;
|
||||
struct iovec *iov = iovstack;
|
||||
struct iov_iter iter;
|
||||
struct pid *pid;
|
||||
struct task_struct *task;
|
||||
struct mm_struct *mm;
|
||||
size_t total_len;
|
||||
@ -1250,18 +1395,12 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
pid = pidfd_get_pid(pidfd, &f_flags);
|
||||
if (IS_ERR(pid)) {
|
||||
ret = PTR_ERR(pid);
|
||||
task = pidfd_get_task(pidfd, &f_flags);
|
||||
if (IS_ERR(task)) {
|
||||
ret = PTR_ERR(task);
|
||||
goto free_iov;
|
||||
}
|
||||
|
||||
task = get_pid_task(pid, PIDTYPE_PID);
|
||||
if (!task) {
|
||||
ret = -ESRCH;
|
||||
goto put_pid;
|
||||
}
|
||||
|
||||
if (!process_madvise_behavior_valid(behavior)) {
|
||||
ret = -EINVAL;
|
||||
goto release_task;
|
||||
@ -1301,8 +1440,6 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
|
||||
mmput(mm);
|
||||
release_task:
|
||||
put_task_struct(task);
|
||||
put_pid:
|
||||
put_pid(pid);
|
||||
free_iov:
|
||||
kfree(iov);
|
||||
out:
|
||||
|
@ -3,6 +3,7 @@
#include <linux/hugetlb.h>
#include <linux/bitops.h>
#include <linux/mmu_notifier.h>
#include <linux/mm_inline.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

@ -287,7 +287,7 @@ static phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
{
/* pump up @end */
if (end == MEMBLOCK_ALLOC_ACCESSIBLE ||
end == MEMBLOCK_ALLOC_KASAN)
end == MEMBLOCK_ALLOC_NOLEAKTRACE)
end = memblock.current_limit;

/* avoid allocating the first page */
@ -369,7 +369,7 @@ void __init memblock_discard(void)
if (memblock_reserved_in_slab)
kfree(memblock.reserved.regions);
else
__memblock_free_late(addr, size);
memblock_free_late(addr, size);
}

if (memblock.memory.regions != memblock_memory_init_regions) {
@ -379,7 +379,7 @@ void __init memblock_discard(void)
if (memblock_memory_in_slab)
kfree(memblock.memory.regions);
else
__memblock_free_late(addr, size);
memblock_free_late(addr, size);
}

memblock_memory = NULL;
@ -478,7 +478,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type,
kfree(old_array);
else if (old_array != memblock_memory_init_regions &&
old_array != memblock_reserved_init_regions)
memblock_free_ptr(old_array, old_alloc_size);
memblock_free(old_array, old_alloc_size);

/*
* Reserve the new array if that comes from the memblock. Otherwise, we
@ -661,6 +661,7 @@ static int __init_memblock memblock_add_range(struct memblock_type *type,
* @base: base address of the new region
* @size: size of the new region
* @nid: nid of the new region
* @flags: flags of the new region
*
* Add new memblock region [@base, @base + @size) to the "memory"
* type. See memblock_add_range() description for mode details
@ -669,14 +670,14 @@ static int __init_memblock memblock_add_range(struct memblock_type *type,
* 0 on success, -errno on failure.
*/
int __init_memblock memblock_add_node(phys_addr_t base, phys_addr_t size,
int nid)
int nid, enum memblock_flags flags)
{
phys_addr_t end = base + size - 1;

memblock_dbg("%s: [%pa-%pa] nid=%d %pS\n", __func__,
&base, &end, nid, (void *)_RET_IP_);
memblock_dbg("%s: [%pa-%pa] nid=%d flags=%x %pS\n", __func__,
&base, &end, nid, flags, (void *)_RET_IP_);

return memblock_add_range(&memblock.memory, base, size, nid, 0);
return memblock_add_range(&memblock.memory, base, size, nid, flags);
}

/**
@ -802,28 +803,28 @@ int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
}

/**
* memblock_free_ptr - free boot memory allocation
* memblock_free - free boot memory allocation
* @ptr: starting address of the boot memory allocation
* @size: size of the boot memory block in bytes
*
* Free boot memory block previously allocated by memblock_alloc_xx() API.
* The freeing memory will not be released to the buddy allocator.
*/
void __init_memblock memblock_free_ptr(void *ptr, size_t size)
void __init_memblock memblock_free(void *ptr, size_t size)
{
if (ptr)
memblock_free(__pa(ptr), size);
memblock_phys_free(__pa(ptr), size);
}

/**
* memblock_free - free boot memory block
* memblock_phys_free - free boot memory block
* @base: phys starting address of the boot memory block
* @size: size of the boot memory block in bytes
*
* Free boot memory block previously allocated by memblock_alloc_xx() API.
* The freeing memory will not be released to the buddy allocator.
*/
int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
int __init_memblock memblock_phys_free(phys_addr_t base, phys_addr_t size)
{
phys_addr_t end = base + size - 1;

@ -987,6 +988,10 @@ static bool should_skip_region(struct memblock_type *type,
if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m))
return true;

/* skip driver-managed memory unless we were asked for it explicitly */
if (!(flags & MEMBLOCK_DRIVER_MANAGED) && memblock_is_driver_managed(m))
return true;

return false;
}

@ -1388,8 +1393,11 @@ phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
return 0;

done:
/* Skip kmemleak for kasan_init() due to high volume. */
if (end != MEMBLOCK_ALLOC_KASAN)
/*
* Skip kmemleak for those places like kasan_init() and
* early_pgtable_alloc() due to high volume.
*/
if (end != MEMBLOCK_ALLOC_NOLEAKTRACE)
/*
* The min_count is set to 0 so that memblock allocated
* blocks are never reported as leaks. This is because many
@ -1595,7 +1603,7 @@ void * __init memblock_alloc_try_nid(
}

/**
* __memblock_free_late - free pages directly to buddy allocator
* memblock_free_late - free pages directly to buddy allocator
* @base: phys starting address of the boot memory block
* @size: size of the boot memory block in bytes
*
@ -1603,7 +1611,7 @@ void * __init memblock_alloc_try_nid(
* down, but we are still initializing the system. Pages are released directly
* to the buddy allocator.
*/
void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
void __init memblock_free_late(phys_addr_t base, phys_addr_t size)
{
phys_addr_t cursor, end;

@ -1943,7 +1951,7 @@ static void __init free_memmap(unsigned long start_pfn, unsigned long end_pfn)
* memmap array.
*/
if (pg < pgend)
memblock_free(pg, pgend - pg);
memblock_phys_free(pg, pgend - pg);
}

/*

634
mm/memcontrol.c
634
mm/memcontrol.c
File diff suppressed because it is too large
@ -313,9 +313,7 @@ SYSCALL_DEFINE2(memfd_create,
}

if (flags & MFD_HUGETLB) {
struct ucounts *ucounts = NULL;

file = hugetlb_file_setup(name, 0, VM_NORESERVE, &ucounts,
file = hugetlb_file_setup(name, 0, VM_NORESERVE,
HUGETLB_ANONHUGE_INODE,
(flags >> MFD_HUGE_SHIFT) &
MFD_HUGE_MASK);

@ -39,6 +39,7 @@
|
||||
#include <linux/kernel-page-flags.h>
|
||||
#include <linux/sched/signal.h>
|
||||
#include <linux/sched/task.h>
|
||||
#include <linux/dax.h>
|
||||
#include <linux/ksm.h>
|
||||
#include <linux/rmap.h>
|
||||
#include <linux/export.h>
|
||||
@ -57,6 +58,7 @@
|
||||
#include <linux/ratelimit.h>
|
||||
#include <linux/page-isolation.h>
|
||||
#include <linux/pagewalk.h>
|
||||
#include <linux/shmem_fs.h>
|
||||
#include "internal.h"
|
||||
#include "ras/ras_event.h"
|
||||
|
||||
@ -673,7 +675,7 @@ static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
|
||||
#define hwpoison_hugetlb_range NULL
|
||||
#endif
|
||||
|
||||
static struct mm_walk_ops hwp_walk_ops = {
|
||||
static const struct mm_walk_ops hwp_walk_ops = {
|
||||
.pmd_entry = hwpoison_pte_range,
|
||||
.hugetlb_entry = hwpoison_hugetlb_range,
|
||||
};
|
||||
@ -721,7 +723,6 @@ static const char * const action_page_types[] = {
|
||||
[MF_MSG_KERNEL_HIGH_ORDER] = "high-order kernel page",
|
||||
[MF_MSG_SLAB] = "kernel slab page",
|
||||
[MF_MSG_DIFFERENT_COMPOUND] = "different compound page after locking",
|
||||
[MF_MSG_POISONED_HUGE] = "huge page already hardware poisoned",
|
||||
[MF_MSG_HUGE] = "huge page",
|
||||
[MF_MSG_FREE_HUGE] = "free huge page",
|
||||
[MF_MSG_NON_PMD_HUGE] = "non-pmd-sized huge page",
|
||||
@ -736,7 +737,6 @@ static const char * const action_page_types[] = {
|
||||
[MF_MSG_CLEAN_LRU] = "clean LRU page",
|
||||
[MF_MSG_TRUNCATED_LRU] = "already truncated LRU page",
|
||||
[MF_MSG_BUDDY] = "free buddy page",
|
||||
[MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)",
|
||||
[MF_MSG_DAX] = "dax page",
|
||||
[MF_MSG_UNSPLIT_THP] = "unsplit thp",
|
||||
[MF_MSG_UNKNOWN] = "unknown page",
|
||||
@ -762,7 +762,7 @@ static int delete_from_lru_cache(struct page *p)
|
||||
* Poisoned page might never drop its ref count to 0 so we have
|
||||
* to uncharge it manually from its memcg.
|
||||
*/
|
||||
mem_cgroup_uncharge(p);
|
||||
mem_cgroup_uncharge(page_folio(p));
|
||||
|
||||
/*
|
||||
* drop the page count elevated by isolate_lru_page()
|
||||
@ -806,12 +806,44 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
|
||||
return ret;
|
||||
}
|
||||
|
||||
struct page_state {
|
||||
unsigned long mask;
|
||||
unsigned long res;
|
||||
enum mf_action_page_type type;
|
||||
|
||||
/* Callback ->action() has to unlock the relevant page inside it. */
|
||||
int (*action)(struct page_state *ps, struct page *p);
|
||||
};
|
||||
|
||||
/*
|
||||
* Return true if page is still referenced by others, otherwise return
|
||||
* false.
|
||||
*
|
||||
* The extra_pins is true when one extra refcount is expected.
|
||||
*/
|
||||
static bool has_extra_refcount(struct page_state *ps, struct page *p,
|
||||
bool extra_pins)
|
||||
{
|
||||
int count = page_count(p) - 1;
|
||||
|
||||
if (extra_pins)
|
||||
count -= 1;
|
||||
|
||||
if (count > 0) {
|
||||
pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
|
||||
page_to_pfn(p), action_page_types[ps->type], count);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* Error hit kernel page.
|
||||
* Do nothing, try to be lucky and not touch this instead. For a few cases we
|
||||
* could be more sophisticated.
|
||||
*/
|
||||
static int me_kernel(struct page *p, unsigned long pfn)
|
||||
static int me_kernel(struct page_state *ps, struct page *p)
|
||||
{
|
||||
unlock_page(p);
|
||||
return MF_IGNORED;
|
||||
@ -820,9 +852,9 @@ static int me_kernel(struct page *p, unsigned long pfn)
|
||||
/*
|
||||
* Page in unknown state. Do nothing.
|
||||
*/
|
||||
static int me_unknown(struct page *p, unsigned long pfn)
|
||||
static int me_unknown(struct page_state *ps, struct page *p)
|
||||
{
|
||||
pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
|
||||
pr_err("Memory failure: %#lx: Unknown page state\n", page_to_pfn(p));
|
||||
unlock_page(p);
|
||||
return MF_FAILED;
|
||||
}
|
||||
@ -830,10 +862,11 @@ static int me_unknown(struct page *p, unsigned long pfn)
|
||||
/*
|
||||
* Clean (or cleaned) page cache page.
|
||||
*/
|
||||
static int me_pagecache_clean(struct page *p, unsigned long pfn)
|
||||
static int me_pagecache_clean(struct page_state *ps, struct page *p)
|
||||
{
|
||||
int ret;
|
||||
struct address_space *mapping;
|
||||
bool extra_pins;
|
||||
|
||||
delete_from_lru_cache(p);
|
||||
|
||||
@ -862,14 +895,24 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* The shmem page is kept in page cache instead of truncating
|
||||
* so is expected to have an extra refcount after error-handling.
|
||||
*/
|
||||
extra_pins = shmem_mapping(mapping);
|
||||
|
||||
/*
|
||||
* Truncation is a bit tricky. Enable it per file system for now.
|
||||
*
|
||||
* Open: to take i_rwsem or not for this? Right now we don't.
|
||||
*/
|
||||
ret = truncate_error_page(p, pfn, mapping);
|
||||
ret = truncate_error_page(p, page_to_pfn(p), mapping);
|
||||
if (has_extra_refcount(ps, p, extra_pins))
|
||||
ret = MF_FAILED;
|
||||
|
||||
out:
|
||||
unlock_page(p);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -878,7 +921,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
|
||||
* Issues: when the error hit a hole page the error is not properly
|
||||
* propagated.
|
||||
*/
|
||||
static int me_pagecache_dirty(struct page *p, unsigned long pfn)
|
||||
static int me_pagecache_dirty(struct page_state *ps, struct page *p)
|
||||
{
|
||||
struct address_space *mapping = page_mapping(p);
|
||||
|
||||
@ -922,7 +965,7 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
|
||||
mapping_set_error(mapping, -EIO);
|
||||
}
|
||||
|
||||
return me_pagecache_clean(p, pfn);
|
||||
return me_pagecache_clean(ps, p);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -944,9 +987,10 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
|
||||
* Clean swap cache pages can be directly isolated. A later page fault will
|
||||
* bring in the known good data from disk.
|
||||
*/
|
||||
static int me_swapcache_dirty(struct page *p, unsigned long pfn)
|
||||
static int me_swapcache_dirty(struct page_state *ps, struct page *p)
|
||||
{
|
||||
int ret;
|
||||
bool extra_pins = false;
|
||||
|
||||
ClearPageDirty(p);
|
||||
/* Trigger EIO in shmem: */
|
||||
@ -954,10 +998,17 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn)
|
||||
|
||||
ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED;
|
||||
unlock_page(p);
|
||||
|
||||
if (ret == MF_DELAYED)
|
||||
extra_pins = true;
|
||||
|
||||
if (has_extra_refcount(ps, p, extra_pins))
|
||||
ret = MF_FAILED;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int me_swapcache_clean(struct page *p, unsigned long pfn)
|
||||
static int me_swapcache_clean(struct page_state *ps, struct page *p)
|
||||
{
|
||||
int ret;
|
||||
|
||||
@ -965,6 +1016,10 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
|
||||
|
||||
ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED;
|
||||
unlock_page(p);
|
||||
|
||||
if (has_extra_refcount(ps, p, false))
|
||||
ret = MF_FAILED;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
@ -974,7 +1029,7 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
|
||||
* - Error on hugepage is contained in hugepage unit (not in raw page unit.)
|
||||
* To narrow down kill region to one page, we need to break up pmd.
|
||||
*/
|
||||
static int me_huge_page(struct page *p, unsigned long pfn)
|
||||
static int me_huge_page(struct page_state *ps, struct page *p)
|
||||
{
|
||||
int res;
|
||||
struct page *hpage = compound_head(p);
|
||||
@ -985,7 +1040,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
|
||||
|
||||
mapping = page_mapping(hpage);
|
||||
if (mapping) {
|
||||
res = truncate_error_page(hpage, pfn, mapping);
|
||||
res = truncate_error_page(hpage, page_to_pfn(p), mapping);
|
||||
unlock_page(hpage);
|
||||
} else {
|
||||
res = MF_FAILED;
|
||||
@ -1003,6 +1058,9 @@ static int me_huge_page(struct page *p, unsigned long pfn)
|
||||
}
|
||||
}
|
||||
|
||||
if (has_extra_refcount(ps, p, false))
|
||||
res = MF_FAILED;
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
@ -1028,14 +1086,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
|
||||
#define slab (1UL << PG_slab)
|
||||
#define reserved (1UL << PG_reserved)
|
||||
|
||||
static struct page_state {
|
||||
unsigned long mask;
|
||||
unsigned long res;
|
||||
enum mf_action_page_type type;
|
||||
|
||||
/* Callback ->action() has to unlock the relevant page inside it. */
|
||||
int (*action)(struct page *p, unsigned long pfn);
|
||||
} error_states[] = {
|
||||
static struct page_state error_states[] = {
|
||||
{ reserved, reserved, MF_MSG_KERNEL, me_kernel },
|
||||
/*
|
||||
* free pages are specially detected outside this table:
|
||||
@ -1095,19 +1146,10 @@ static int page_action(struct page_state *ps, struct page *p,
|
||||
unsigned long pfn)
|
||||
{
|
||||
int result;
|
||||
int count;
|
||||
|
||||
/* page p should be unlocked after returning from ps->action(). */
|
||||
result = ps->action(p, pfn);
|
||||
result = ps->action(ps, p);
|
||||
|
||||
count = page_count(p) - 1;
|
||||
if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
|
||||
count--;
|
||||
if (count > 0) {
|
||||
pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
|
||||
pfn, action_page_types[ps->type], count);
|
||||
result = MF_FAILED;
|
||||
}
|
||||
action_result(pfn, ps->type, result);
|
||||
|
||||
/* Could do more checks here if page looks ok */
|
||||
@ -1118,6 +1160,22 @@ static int page_action(struct page_state *ps, struct page *p,
|
||||
return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
|
||||
}
|
||||
|
||||
static inline bool PageHWPoisonTakenOff(struct page *page)
|
||||
{
|
||||
return PageHWPoison(page) && page_private(page) == MAGIC_HWPOISON;
|
||||
}
|
||||
|
||||
void SetPageHWPoisonTakenOff(struct page *page)
|
||||
{
|
||||
set_page_private(page, MAGIC_HWPOISON);
|
||||
}
|
||||
|
||||
void ClearPageHWPoisonTakenOff(struct page *page)
|
||||
{
|
||||
if (PageHWPoison(page))
|
||||
set_page_private(page, 0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Return true if a page type of a given page is supported by hwpoison
|
||||
* mechanism (while handling could fail), otherwise false. This function
|
||||
@ -1220,6 +1278,27 @@ static int get_any_page(struct page *p, unsigned long flags)
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int __get_unpoison_page(struct page *page)
|
||||
{
|
||||
struct page *head = compound_head(page);
|
||||
int ret = 0;
|
||||
bool hugetlb = false;
|
||||
|
||||
ret = get_hwpoison_huge_page(head, &hugetlb);
|
||||
if (hugetlb)
|
||||
return ret;
|
||||
|
||||
/*
|
||||
* PageHWPoisonTakenOff pages are not only marked as PG_hwpoison,
|
||||
* but also isolated from buddy freelist, so need to identify the
|
||||
* state and have to cancel both operations to unpoison.
|
||||
*/
|
||||
if (PageHWPoisonTakenOff(page))
|
||||
return -EHWPOISON;
|
||||
|
||||
return get_page_unless_zero(page) ? 1 : 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* get_hwpoison_page() - Get refcount for memory error handling
|
||||
* @p: Raw error page (hit by memory error)
|
||||
@ -1227,7 +1306,7 @@ static int get_any_page(struct page *p, unsigned long flags)
|
||||
*
|
||||
* get_hwpoison_page() takes a page refcount of an error page to handle memory
|
||||
* error on it, after checking that the error page is in a well-defined state
|
||||
* (defined as a page-type we can successfully handle the memor error on it,
|
||||
* (defined as a page-type we can successfully handle the memory error on it,
|
||||
* such as LRU page and hugetlb page).
|
||||
*
|
||||
* Memory error handling could be triggered at any time on any type of page,
|
||||
@ -1236,18 +1315,26 @@ static int get_any_page(struct page *p, unsigned long flags)
|
||||
* extra care for the error page's state (as done in __get_hwpoison_page()),
|
||||
* and has some retry logic in get_any_page().
|
||||
*
|
||||
* When called from unpoison_memory(), the caller should already ensure that
|
||||
* the given page has PG_hwpoison. So it's never reused for other page
|
||||
* allocations, and __get_unpoison_page() never races with them.
|
||||
*
|
||||
* Return: 0 on failure,
|
||||
* 1 on success for in-use pages in a well-defined state,
|
||||
* -EIO for pages on which we can not handle memory errors,
|
||||
* -EBUSY when get_hwpoison_page() has raced with page lifecycle
|
||||
* operations like allocation and free.
|
||||
* operations like allocation and free,
|
||||
* -EHWPOISON when the page is hwpoisoned and taken off from buddy.
|
||||
*/
|
||||
static int get_hwpoison_page(struct page *p, unsigned long flags)
|
||||
{
|
||||
int ret;
|
||||
|
||||
zone_pcp_disable(page_zone(p));
|
||||
ret = get_any_page(p, flags);
|
||||
if (flags & MF_UNPOISON)
|
||||
ret = __get_unpoison_page(p);
|
||||
else
|
||||
ret = get_any_page(p, flags);
|
||||
zone_pcp_enable(page_zone(p));
|
||||
|
||||
return ret;
|
||||
@ -1400,14 +1487,11 @@ static int identify_page_state(unsigned long pfn, struct page *p,
|
||||
static int try_to_split_thp_page(struct page *page, const char *msg)
|
||||
{
|
||||
lock_page(page);
|
||||
if (!PageAnon(page) || unlikely(split_huge_page(page))) {
|
||||
if (unlikely(split_huge_page(page))) {
|
||||
unsigned long pfn = page_to_pfn(page);
|
||||
|
||||
unlock_page(page);
|
||||
if (!PageAnon(page))
|
||||
pr_info("%s: %#lx: non anonymous thp\n", msg, pfn);
|
||||
else
|
||||
pr_info("%s: %#lx: thp split failed\n", msg, pfn);
|
||||
pr_info("%s: %#lx: thp split failed\n", msg, pfn);
|
||||
put_page(page);
|
||||
return -EBUSY;
|
||||
}
|
||||
@ -1461,14 +1545,6 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
|
||||
lock_page(head);
|
||||
page_flags = head->flags;
|
||||
|
||||
if (!PageHWPoison(head)) {
|
||||
pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
|
||||
num_poisoned_pages_dec();
|
||||
unlock_page(head);
|
||||
put_page(head);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* TODO: hwpoison for pud-sized hugetlb doesn't work right now, so
|
||||
* simply disable it. In order to make it work properly, we need
|
||||
@ -1519,6 +1595,12 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* Pages instantiated by device-dax (not filesystem-dax)
|
||||
* may be compound pages.
|
||||
*/
|
||||
page = compound_head(page);
|
||||
|
||||
/*
|
||||
* Prevent the inode from being freed while we are interrogating
|
||||
* the address_space, typically this would be handled by
|
||||
@ -1582,6 +1664,8 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
|
||||
return rc;
|
||||
}
|
||||
|
||||
static DEFINE_MUTEX(mf_mutex);
|
||||
|
||||
/**
|
||||
* memory_failure - Handle memory failure of a page.
|
||||
* @pfn: Page Number of the corrupted page
|
||||
@ -1608,26 +1692,32 @@ int memory_failure(unsigned long pfn, int flags)
|
||||
int res = 0;
|
||||
unsigned long page_flags;
|
||||
bool retry = true;
|
||||
static DEFINE_MUTEX(mf_mutex);
|
||||
|
||||
if (!sysctl_memory_failure_recovery)
|
||||
panic("Memory failure on page %lx", pfn);
|
||||
|
||||
mutex_lock(&mf_mutex);
|
||||
|
||||
p = pfn_to_online_page(pfn);
|
||||
if (!p) {
|
||||
res = arch_memory_failure(pfn, flags);
|
||||
if (res == 0)
|
||||
goto unlock_mutex;
|
||||
|
||||
if (pfn_valid(pfn)) {
|
||||
pgmap = get_dev_pagemap(pfn, NULL);
|
||||
if (pgmap)
|
||||
return memory_failure_dev_pagemap(pfn, flags,
|
||||
pgmap);
|
||||
if (pgmap) {
|
||||
res = memory_failure_dev_pagemap(pfn, flags,
|
||||
pgmap);
|
||||
goto unlock_mutex;
|
||||
}
|
||||
}
|
||||
pr_err("Memory failure: %#lx: memory outside kernel control\n",
|
||||
pfn);
|
||||
return -ENXIO;
|
||||
res = -ENXIO;
|
||||
goto unlock_mutex;
|
||||
}
|
||||
|
||||
mutex_lock(&mf_mutex);
|
||||
|
||||
try_again:
|
||||
if (PageHuge(p)) {
|
||||
res = memory_failure_hugetlb(pfn, flags);
|
||||
@ -1742,16 +1832,6 @@ int memory_failure(unsigned long pfn, int flags)
|
||||
*/
|
||||
page_flags = p->flags;
|
||||
|
||||
/*
|
||||
* unpoison always clear PG_hwpoison inside page lock
|
||||
*/
|
||||
if (!PageHWPoison(p)) {
|
||||
pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
|
||||
num_poisoned_pages_dec();
|
||||
unlock_page(p);
|
||||
put_page(p);
|
||||
goto unlock_mutex;
|
||||
}
|
||||
if (hwpoison_filter(p)) {
|
||||
if (TestClearPageHWPoison(p))
|
||||
num_poisoned_pages_dec();
|
||||
@ -1915,6 +1995,28 @@ core_initcall(memory_failure_init);
		pr_info(fmt, pfn);			\
})

static inline int clear_page_hwpoison(struct ratelimit_state *rs, struct page *p)
{
	if (TestClearPageHWPoison(p)) {
		unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
				 page_to_pfn(p), rs);
		num_poisoned_pages_dec();
		return 1;
	}
	return 0;
}

static inline int unpoison_taken_off_page(struct ratelimit_state *rs,
					  struct page *p)
{
	if (put_page_back_buddy(p)) {
		unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
				 page_to_pfn(p), rs);
		return 0;
	}
	return -EBUSY;
}

/**
|
||||
* unpoison_memory - Unpoison a previously poisoned page
|
||||
* @pfn: Page number of the to be unpoisoned page
|
||||
@ -1931,8 +2033,7 @@ int unpoison_memory(unsigned long pfn)
|
||||
{
|
||||
struct page *page;
|
||||
struct page *p;
|
||||
int freeit = 0;
|
||||
unsigned long flags = 0;
|
||||
int ret = -EBUSY;
|
||||
static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
|
||||
DEFAULT_RATELIMIT_BURST);
|
||||
|
||||
@ -1942,69 +2043,60 @@ int unpoison_memory(unsigned long pfn)
|
||||
p = pfn_to_page(pfn);
|
||||
page = compound_head(p);
|
||||
|
||||
mutex_lock(&mf_mutex);
|
||||
|
||||
if (!PageHWPoison(p)) {
|
||||
unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
|
||||
pfn, &unpoison_rs);
|
||||
return 0;
|
||||
goto unlock_mutex;
|
||||
}
|
||||
|
||||
if (page_count(page) > 1) {
|
||||
unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n",
|
||||
pfn, &unpoison_rs);
|
||||
return 0;
|
||||
goto unlock_mutex;
|
||||
}
|
||||
|
||||
if (page_mapped(page)) {
|
||||
unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
|
||||
pfn, &unpoison_rs);
|
||||
return 0;
|
||||
goto unlock_mutex;
|
||||
}
|
||||
|
||||
if (page_mapping(page)) {
|
||||
unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n",
|
||||
pfn, &unpoison_rs);
|
||||
return 0;
|
||||
goto unlock_mutex;
|
||||
}
|
||||
|
||||
/*
|
||||
* unpoison_memory() can encounter thp only when the thp is being
|
||||
* worked by memory_failure() and the page lock is not held yet.
|
||||
* In such case, we yield to memory_failure() and make unpoison fail.
|
||||
*/
|
||||
if (!PageHuge(page) && PageTransHuge(page)) {
|
||||
unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n",
|
||||
pfn, &unpoison_rs);
|
||||
return 0;
|
||||
}
|
||||
if (PageSlab(page) || PageTable(page))
|
||||
goto unlock_mutex;
|
||||
|
||||
if (!get_hwpoison_page(p, flags)) {
|
||||
if (TestClearPageHWPoison(p))
|
||||
num_poisoned_pages_dec();
|
||||
unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n",
|
||||
pfn, &unpoison_rs);
|
||||
return 0;
|
||||
}
|
||||
ret = get_hwpoison_page(p, MF_UNPOISON);
|
||||
if (!ret) {
|
||||
if (clear_page_hwpoison(&unpoison_rs, page))
|
||||
ret = 0;
|
||||
else
|
||||
ret = -EBUSY;
|
||||
} else if (ret < 0) {
|
||||
if (ret == -EHWPOISON) {
|
||||
ret = unpoison_taken_off_page(&unpoison_rs, p);
|
||||
} else
|
||||
unpoison_pr_info("Unpoison: failed to grab page %#lx\n",
|
||||
pfn, &unpoison_rs);
|
||||
} else {
|
||||
int freeit = clear_page_hwpoison(&unpoison_rs, p);
|
||||
|
||||
lock_page(page);
|
||||
/*
|
||||
* This test is racy because PG_hwpoison is set outside of page lock.
|
||||
* That's acceptable because that won't trigger kernel panic. Instead,
|
||||
* the PG_hwpoison page will be caught and isolated on the entrance to
|
||||
* the free buddy page pool.
|
||||
*/
|
||||
if (TestClearPageHWPoison(page)) {
|
||||
unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
|
||||
pfn, &unpoison_rs);
|
||||
num_poisoned_pages_dec();
|
||||
freeit = 1;
|
||||
}
|
||||
unlock_page(page);
|
||||
|
||||
put_page(page);
|
||||
if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
|
||||
put_page(page);
|
||||
if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) {
|
||||
put_page(page);
|
||||
ret = 0;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
unlock_mutex:
|
||||
mutex_unlock(&mf_mutex);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(unpoison_memory);
|
||||
|
||||
@ -2104,14 +2196,14 @@ static int __soft_offline_page(struct page *page)
|
||||
if (!list_empty(&pagelist))
|
||||
putback_movable_pages(&pagelist);
|
||||
|
||||
pr_info("soft offline: %#lx: %s migration failed %d, type %lx (%pGp)\n",
|
||||
pfn, msg_page[huge], ret, page->flags, &page->flags);
|
||||
pr_info("soft offline: %#lx: %s migration failed %d, type %pGp\n",
|
||||
pfn, msg_page[huge], ret, &page->flags);
|
||||
if (ret > 0)
|
||||
ret = -EBUSY;
|
||||
}
|
||||
} else {
|
||||
pr_info("soft offline: %#lx: %s isolation failed, page count %d, type %lx (%pGp)\n",
|
||||
pfn, msg_page[huge], page_count(page), page->flags, &page->flags);
|
||||
pr_info("soft offline: %#lx: %s isolation failed, page count %d, type %pGp\n",
|
||||
pfn, msg_page[huge], page_count(page), &page->flags);
|
||||
ret = -EBUSY;
|
||||
}
|
||||
return ret;
|
||||
@ -2185,9 +2277,12 @@ int soft_offline_page(unsigned long pfn, int flags)
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
mutex_lock(&mf_mutex);
|
||||
|
||||
if (PageHWPoison(page)) {
|
||||
pr_info("%s: %#lx page already poisoned\n", __func__, pfn);
|
||||
put_ref_page(ref_page);
|
||||
mutex_unlock(&mf_mutex);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -2206,5 +2301,7 @@ int soft_offline_page(unsigned long pfn, int flags)
|
||||
}
|
||||
}
|
||||
|
||||
mutex_unlock(&mf_mutex);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
245 mm/memory.c
@ -41,6 +41,7 @@
|
||||
|
||||
#include <linux/kernel_stat.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/mm_inline.h>
|
||||
#include <linux/sched/mm.h>
|
||||
#include <linux/sched/coredump.h>
|
||||
#include <linux/sched/numa_balancing.h>
|
||||
@ -433,35 +434,39 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
	}
}

void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
{
	spinlock_t *ptl = pmd_lock(mm, pmd);

	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		mm_inc_nr_ptes(mm);
		/*
		 * Ensure all pte setup (eg. pte page lock and page clearing) are
		 * visible before the pte is made visible to other CPUs by being
		 * put into page tables.
		 *
		 * The other side of the story is the pointer chasing in the page
		 * table walking code (when walking the page table without locking;
		 * ie. most of the time). Fortunately, these data accesses consist
		 * of a chain of data-dependent loads, meaning most CPUs (alpha
		 * being the notable exception) will already guarantee loads are
		 * seen in-order. See the alpha page table accessors for the
		 * smp_rmb() barriers in page table walking code.
		 */
		smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
		pmd_populate(mm, pmd, *pte);
		*pte = NULL;
	}
	spin_unlock(ptl);
}
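
The barrier comment above boils down to a publish pattern: initialize the new page-table page completely, then make the pmd entry visible, so that lockless walkers, whose accesses are data-dependent loads, never observe a half-initialized table. A rough user-space analogy follows; it is a sketch only, using C11 release/acquire atomics in place of smp_wmb() and the data-dependent loads, and none of the names below come from the kernel.

```c
/* Illustrative analogy only: publish an initialized structure so readers
 * who follow the pointer see fully initialized contents. Release ordering
 * plays the role of smp_wmb(); acquire models the dependent load. */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct table {
	int entries[4];
};

static _Atomic(struct table *) shared_table;

static void publisher(void)
{
	struct table *t = malloc(sizeof(*t));

	for (int i = 0; i < 4; i++)	/* initialize first ... */
		t->entries[i] = i;
	/* ... then publish */
	atomic_store_explicit(&shared_table, t, memory_order_release);
}

static void reader(void)
{
	struct table *t = atomic_load_explicit(&shared_table, memory_order_acquire);

	if (t)
		printf("entry[3] = %d\n", t->entries[3]);
}

int main(void)
{
	publisher();
	reader();
	free(atomic_load(&shared_table));
	return 0;
}
```

On most architectures the reader side needs no explicit barrier for dependent loads, which is exactly why the kernel comment singles out Alpha.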
|
||||
int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
|
||||
{
|
||||
spinlock_t *ptl;
|
||||
pgtable_t new = pte_alloc_one(mm);
|
||||
if (!new)
|
||||
return -ENOMEM;
|
||||
|
||||
/*
|
||||
* Ensure all pte setup (eg. pte page lock and page clearing) are
|
||||
* visible before the pte is made visible to other CPUs by being
|
||||
* put into page tables.
|
||||
*
|
||||
* The other side of the story is the pointer chasing in the page
|
||||
* table walking code (when walking the page table without locking;
|
||||
* ie. most of the time). Fortunately, these data accesses consist
|
||||
* of a chain of data-dependent loads, meaning most CPUs (alpha
|
||||
* being the notable exception) will already guarantee loads are
|
||||
* seen in-order. See the alpha page table accessors for the
|
||||
* smp_rmb() barriers in page table walking code.
|
||||
*/
|
||||
smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
|
||||
|
||||
ptl = pmd_lock(mm, pmd);
|
||||
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
|
||||
mm_inc_nr_ptes(mm);
|
||||
pmd_populate(mm, pmd, new);
|
||||
new = NULL;
|
||||
}
|
||||
spin_unlock(ptl);
|
||||
pmd_install(mm, pmd, &new);
|
||||
if (new)
|
||||
pte_free(mm, new);
|
||||
return 0;
|
||||
@ -473,10 +478,9 @@ int __pte_alloc_kernel(pmd_t *pmd)
|
||||
if (!new)
|
||||
return -ENOMEM;
|
||||
|
||||
smp_wmb(); /* See comment in __pte_alloc */
|
||||
|
||||
spin_lock(&init_mm.page_table_lock);
|
||||
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
|
||||
smp_wmb(); /* See comment in pmd_install() */
|
||||
pmd_populate_kernel(&init_mm, pmd, new);
|
||||
new = NULL;
|
||||
}
|
||||
@ -716,8 +720,6 @@ static void restore_exclusive_pte(struct vm_area_struct *vma,
|
||||
else if (is_writable_device_exclusive_entry(entry))
|
||||
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
|
||||
|
||||
set_pte_at(vma->vm_mm, address, ptep, pte);
|
||||
|
||||
/*
|
||||
* No need to take a page reference as one was already
|
||||
* created when the swap entry was made.
|
||||
@ -731,6 +733,8 @@ static void restore_exclusive_pte(struct vm_area_struct *vma,
|
||||
*/
|
||||
WARN_ON_ONCE(!PageAnon(page));
|
||||
|
||||
set_pte_at(vma->vm_mm, address, ptep, pte);
|
||||
|
||||
if (vma->vm_flags & VM_LOCKED)
|
||||
mlock_vma_page(page);
|
||||
|
||||
@ -990,7 +994,7 @@ page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
|
||||
if (!new_page)
|
||||
return NULL;
|
||||
|
||||
if (mem_cgroup_charge(new_page, src_mm, GFP_KERNEL)) {
|
||||
if (mem_cgroup_charge(page_folio(new_page), src_mm, GFP_KERNEL)) {
|
||||
put_page(new_page);
|
||||
return NULL;
|
||||
}
|
||||
@ -1301,6 +1305,28 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
 * Parameter block passed down to zap_pte_range in exceptional cases.
 */
struct zap_details {
	struct address_space *zap_mapping;	/* Check page->mapping if set */
	struct folio *single_folio;	/* Locked folio to be unmapped */
};

/*
 * We set details->zap_mapping when we want to unmap shared but keep private
 * pages. Return true if skip zapping this page, false otherwise.
 */
static inline bool
zap_skip_check_mapping(struct zap_details *details, struct page *page)
{
	if (!details || !page)
		return false;

	return details->zap_mapping &&
		(details->zap_mapping != page_rmapping(page));
}
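
A self-contained model of the predicate just added: zapping is skipped only when a mapping filter is present and the page belongs to a different mapping, which is how unmap_shared_mapping_pages() can drop shared pages while keeping private COW copies. The struct definitions below are simplified stand-ins, not the kernel's.

```c
/* Stub model of zap_skip_check_mapping(); layouts are simplified assumptions. */
#include <stdbool.h>
#include <stdio.h>

struct mapping { int id; };
struct page { struct mapping *mapping; };
struct zap_details { struct mapping *zap_mapping; };

static bool zap_skip_check_mapping(struct zap_details *details, struct page *page)
{
	if (!details || !page)
		return false;		/* no filter or no page: never skip */

	return details->zap_mapping && details->zap_mapping != page->mapping;
}

int main(void)
{
	struct mapping file = { 1 };
	struct page shared = { &file };		/* page cache page of "file" */
	struct page private_cow = { NULL };	/* private anonymous COW copy */
	struct zap_details only_file = { &file };

	printf("skip shared page?  %d\n", zap_skip_check_mapping(&only_file, &shared));      /* 0: zap it */
	printf("skip private page? %d\n", zap_skip_check_mapping(&only_file, &private_cow)); /* 1: keep it */
	return 0;
}
```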
|
||||
static unsigned long zap_pte_range(struct mmu_gather *tlb,
|
||||
struct vm_area_struct *vma, pmd_t *pmd,
|
||||
unsigned long addr, unsigned long end,
|
||||
@ -1333,16 +1359,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
|
||||
struct page *page;
|
||||
|
||||
page = vm_normal_page(vma, addr, ptent);
|
||||
if (unlikely(details) && page) {
|
||||
/*
|
||||
* unmap_shared_mapping_pages() wants to
|
||||
* invalidate cache without truncating:
|
||||
* unmap shared but keep private pages.
|
||||
*/
|
||||
if (details->check_mapping &&
|
||||
details->check_mapping != page_rmapping(page))
|
||||
continue;
|
||||
}
|
||||
if (unlikely(zap_skip_check_mapping(details, page)))
|
||||
continue;
|
||||
ptent = ptep_get_and_clear_full(mm, addr, pte,
|
||||
tlb->fullmm);
|
||||
tlb_remove_tlb_entry(tlb, pte, addr);
|
||||
@ -1375,17 +1393,8 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
|
||||
is_device_exclusive_entry(entry)) {
|
||||
struct page *page = pfn_swap_entry_to_page(entry);
|
||||
|
||||
if (unlikely(details && details->check_mapping)) {
|
||||
/*
|
||||
* unmap_shared_mapping_pages() wants to
|
||||
* invalidate cache without truncating:
|
||||
* unmap shared but keep private pages.
|
||||
*/
|
||||
if (details->check_mapping !=
|
||||
page_rmapping(page))
|
||||
continue;
|
||||
}
|
||||
|
||||
if (unlikely(zap_skip_check_mapping(details, page)))
|
||||
continue;
|
||||
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
|
||||
rss[mm_counter(page)]--;
|
||||
|
||||
@ -1457,8 +1466,8 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
|
||||
else if (zap_huge_pmd(tlb, vma, pmd, addr))
|
||||
goto next;
|
||||
/* fall through */
|
||||
} else if (details && details->single_page &&
|
||||
PageTransCompound(details->single_page) &&
|
||||
} else if (details && details->single_folio &&
|
||||
folio_test_pmd_mappable(details->single_folio) &&
|
||||
next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
|
||||
spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
|
||||
/*
|
||||
@ -2724,19 +2733,19 @@ EXPORT_SYMBOL_GPL(apply_to_existing_page_range);
|
||||
* proceeding (but do_wp_page is only called after already making such a check;
|
||||
* and do_anonymous_page can safely check later on).
|
||||
*/
|
||||
static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
|
||||
pte_t *page_table, pte_t orig_pte)
|
||||
static inline int pte_unmap_same(struct vm_fault *vmf)
|
||||
{
|
||||
int same = 1;
|
||||
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION)
|
||||
if (sizeof(pte_t) > sizeof(unsigned long)) {
|
||||
spinlock_t *ptl = pte_lockptr(mm, pmd);
|
||||
spinlock_t *ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
|
||||
spin_lock(ptl);
|
||||
same = pte_same(*page_table, orig_pte);
|
||||
same = pte_same(*vmf->pte, vmf->orig_pte);
|
||||
spin_unlock(ptl);
|
||||
}
|
||||
#endif
|
||||
pte_unmap(page_table);
|
||||
pte_unmap(vmf->pte);
|
||||
vmf->pte = NULL;
|
||||
return same;
|
||||
}
|
||||
|
||||
@ -3019,7 +3028,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
|
||||
}
|
||||
}
|
||||
|
||||
if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
|
||||
if (mem_cgroup_charge(page_folio(new_page), mm, GFP_KERNEL))
|
||||
goto oom_free_new;
|
||||
cgroup_throttle_swaprate(new_page, GFP_KERNEL);
|
||||
|
||||
@ -3321,20 +3330,20 @@ static void unmap_mapping_range_vma(struct vm_area_struct *vma,
|
||||
}
|
||||
|
||||
static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
|
||||
pgoff_t first_index,
|
||||
pgoff_t last_index,
|
||||
struct zap_details *details)
|
||||
{
|
||||
struct vm_area_struct *vma;
|
||||
pgoff_t vba, vea, zba, zea;
|
||||
|
||||
vma_interval_tree_foreach(vma, root,
|
||||
details->first_index, details->last_index) {
|
||||
|
||||
vma_interval_tree_foreach(vma, root, first_index, last_index) {
|
||||
vba = vma->vm_pgoff;
|
||||
vea = vba + vma_pages(vma) - 1;
|
||||
zba = details->first_index;
|
||||
zba = first_index;
|
||||
if (zba < vba)
|
||||
zba = vba;
|
||||
zea = details->last_index;
|
||||
zea = last_index;
|
||||
if (zea > vea)
|
||||
zea = vea;
|
||||
|
||||
@ -3346,32 +3355,35 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root,
|
||||
}
|
||||
|
||||
/**
|
||||
* unmap_mapping_page() - Unmap single page from processes.
|
||||
* @page: The locked page to be unmapped.
|
||||
* unmap_mapping_folio() - Unmap single folio from processes.
|
||||
* @folio: The locked folio to be unmapped.
|
||||
*
|
||||
* Unmap this page from any userspace process which still has it mmaped.
|
||||
* Unmap this folio from any userspace process which still has it mmaped.
|
||||
* Typically, for efficiency, the range of nearby pages has already been
|
||||
* unmapped by unmap_mapping_pages() or unmap_mapping_range(). But once
|
||||
* truncation or invalidation holds the lock on a page, it may find that
|
||||
* the page has been remapped again: and then uses unmap_mapping_page()
|
||||
* truncation or invalidation holds the lock on a folio, it may find that
|
||||
* the page has been remapped again: and then uses unmap_mapping_folio()
|
||||
* to unmap it finally.
|
||||
*/
|
||||
void unmap_mapping_page(struct page *page)
|
||||
void unmap_mapping_folio(struct folio *folio)
|
||||
{
|
||||
struct address_space *mapping = page->mapping;
|
||||
struct address_space *mapping = folio->mapping;
|
||||
struct zap_details details = { };
|
||||
pgoff_t first_index;
|
||||
pgoff_t last_index;
|
||||
|
||||
VM_BUG_ON(!PageLocked(page));
|
||||
VM_BUG_ON(PageTail(page));
|
||||
VM_BUG_ON(!folio_test_locked(folio));
|
||||
|
||||
details.check_mapping = mapping;
|
||||
details.first_index = page->index;
|
||||
details.last_index = page->index + thp_nr_pages(page) - 1;
|
||||
details.single_page = page;
|
||||
first_index = folio->index;
|
||||
last_index = folio->index + folio_nr_pages(folio) - 1;
|
||||
|
||||
details.zap_mapping = mapping;
|
||||
details.single_folio = folio;
|
||||
|
||||
i_mmap_lock_write(mapping);
|
||||
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
|
||||
unmap_mapping_range_tree(&mapping->i_mmap, &details);
|
||||
unmap_mapping_range_tree(&mapping->i_mmap, first_index,
|
||||
last_index, &details);
|
||||
i_mmap_unlock_write(mapping);
|
||||
}
|
||||
|
||||
@ -3391,16 +3403,17 @@ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start,
|
||||
pgoff_t nr, bool even_cows)
|
||||
{
|
||||
struct zap_details details = { };
|
||||
pgoff_t first_index = start;
|
||||
pgoff_t last_index = start + nr - 1;
|
||||
|
||||
details.check_mapping = even_cows ? NULL : mapping;
|
||||
details.first_index = start;
|
||||
details.last_index = start + nr - 1;
|
||||
if (details.last_index < details.first_index)
|
||||
details.last_index = ULONG_MAX;
|
||||
details.zap_mapping = even_cows ? NULL : mapping;
|
||||
if (last_index < first_index)
|
||||
last_index = ULONG_MAX;
|
||||
|
||||
i_mmap_lock_write(mapping);
|
||||
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
|
||||
unmap_mapping_range_tree(&mapping->i_mmap, &details);
|
||||
unmap_mapping_range_tree(&mapping->i_mmap, first_index,
|
||||
last_index, &details);
|
||||
i_mmap_unlock_write(mapping);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(unmap_mapping_pages);
|
||||
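
The unmap_mapping_pages() hunk above keeps the existing overflow guard: when start + nr - 1 wraps past ULONG_MAX, the range is clamped to "everything from start onwards". A tiny stand-alone check of that arithmetic, with arbitrary illustrative values:

```c
/* Demonstrates the first_index/last_index wrap-around clamp used by
 * unmap_mapping_pages(); the values are arbitrary examples. */
#include <limits.h>
#include <stdio.h>

int main(void)
{
	unsigned long start = ULONG_MAX - 2;		/* pgoff near the top */
	unsigned long nr = 10;				/* length that overflows */
	unsigned long first_index = start;
	unsigned long last_index = start + nr - 1;	/* wraps around */

	if (last_index < first_index)
		last_index = ULONG_MAX;			/* clamp: zap to the end */

	printf("first=%lu last=%lu\n", first_index, last_index);
	return 0;
}
```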
@ -3488,7 +3501,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
|
||||
vm_fault_t ret = 0;
|
||||
void *shadow = NULL;
|
||||
|
||||
if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
|
||||
if (!pte_unmap_same(vmf))
|
||||
goto out;
|
||||
|
||||
entry = pte_to_swp_entry(vmf->orig_pte);
|
||||
@ -3516,7 +3529,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
|
||||
if (unlikely(!si))
|
||||
goto out;
|
||||
|
||||
delayacct_set_flag(current, DELAYACCT_PF_SWAPIN);
|
||||
page = lookup_swap_cache(entry, vma, vmf->address);
|
||||
swapcache = page;
|
||||
|
||||
@ -3539,7 +3551,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
|
||||
|
||||
shadow = get_shadow_from_swap_cache(entry);
|
||||
if (shadow)
|
||||
workingset_refault(page, shadow);
|
||||
workingset_refault(page_folio(page),
|
||||
shadow);
|
||||
|
||||
lru_cache_add(page);
|
||||
|
||||
@ -3563,7 +3576,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
|
||||
vmf->address, &vmf->ptl);
|
||||
if (likely(pte_same(*vmf->pte, vmf->orig_pte)))
|
||||
ret = VM_FAULT_OOM;
|
||||
delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
@ -3577,13 +3589,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
|
||||
* owner processes (which may be unknown at hwpoison time)
|
||||
*/
|
||||
ret = VM_FAULT_HWPOISON;
|
||||
delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
|
||||
goto out_release;
|
||||
}
|
||||
|
||||
locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags);
|
||||
|
||||
delayacct_clear_flag(current, DELAYACCT_PF_SWAPIN);
|
||||
if (!locked) {
|
||||
ret |= VM_FAULT_RETRY;
|
||||
goto out_release;
|
||||
@ -3634,7 +3644,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
|
||||
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
|
||||
dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
|
||||
pte = mk_pte(page, vma->vm_page_prot);
|
||||
if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
|
||||
if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
|
||||
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
|
||||
vmf->flags &= ~FAULT_FLAG_WRITE;
|
||||
ret |= VM_FAULT_WRITE;
|
||||
@ -3647,8 +3657,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
|
||||
pte = pte_mkuffd_wp(pte);
|
||||
pte = pte_wrprotect(pte);
|
||||
}
|
||||
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
|
||||
arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
|
||||
vmf->orig_pte = pte;
|
||||
|
||||
/* ksm created a completely new copy */
|
||||
@ -3659,6 +3667,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
|
||||
do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
|
||||
}
|
||||
|
||||
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
|
||||
arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
|
||||
|
||||
swap_free(entry);
|
||||
if (mem_cgroup_swap_full(page) ||
|
||||
(vma->vm_flags & VM_LOCKED) || PageMlocked(page))
|
||||
@ -3769,7 +3780,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
|
||||
if (!page)
|
||||
goto oom;
|
||||
|
||||
if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
|
||||
if (mem_cgroup_charge(page_folio(page), vma->vm_mm, GFP_KERNEL))
|
||||
goto oom_free_page;
|
||||
cgroup_throttle_swaprate(page, GFP_KERNEL);
|
||||
|
||||
@ -3852,7 +3863,6 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
|
||||
vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
|
||||
if (!vmf->prealloc_pte)
|
||||
return VM_FAULT_OOM;
|
||||
smp_wmb(); /* See comment in __pte_alloc() */
|
||||
}
|
||||
|
||||
ret = vma->vm_ops->fault(vmf);
|
||||
@ -3923,7 +3933,6 @@ vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
|
||||
vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
|
||||
if (!vmf->prealloc_pte)
|
||||
return VM_FAULT_OOM;
|
||||
smp_wmb(); /* See comment in __pte_alloc() */
|
||||
}
|
||||
|
||||
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
|
||||
@ -4036,17 +4045,10 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (vmf->prealloc_pte) {
|
||||
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
|
||||
if (likely(pmd_none(*vmf->pmd))) {
|
||||
mm_inc_nr_ptes(vma->vm_mm);
|
||||
pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
|
||||
vmf->prealloc_pte = NULL;
|
||||
}
|
||||
spin_unlock(vmf->ptl);
|
||||
} else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) {
|
||||
if (vmf->prealloc_pte)
|
||||
pmd_install(vma->vm_mm, vmf->pmd, &vmf->prealloc_pte);
|
||||
else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd)))
|
||||
return VM_FAULT_OOM;
|
||||
}
|
||||
}
|
||||
|
||||
/* See comment in handle_pte_fault() */
|
||||
@ -4155,7 +4157,6 @@ static vm_fault_t do_fault_around(struct vm_fault *vmf)
|
||||
vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
|
||||
if (!vmf->prealloc_pte)
|
||||
return VM_FAULT_OOM;
|
||||
smp_wmb(); /* See comment in __pte_alloc() */
|
||||
}
|
||||
|
||||
return vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff);
|
||||
@ -4202,7 +4203,8 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf)
|
||||
if (!vmf->cow_page)
|
||||
return VM_FAULT_OOM;
|
||||
|
||||
if (mem_cgroup_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL)) {
|
||||
if (mem_cgroup_charge(page_folio(vmf->cow_page), vma->vm_mm,
|
||||
GFP_KERNEL)) {
|
||||
put_page(vmf->cow_page);
|
||||
return VM_FAULT_OOM;
|
||||
}
|
||||
@ -4267,7 +4269,7 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
|
||||
* We enter with non-exclusive mmap_lock (to exclude vma changes,
|
||||
* but allow concurrent faults).
|
||||
* The mmap_lock may have been released depending on flags and our
|
||||
* return value. See filemap_fault() and __lock_page_or_retry().
|
||||
* return value. See filemap_fault() and __folio_lock_or_retry().
|
||||
* If mmap_lock is released, vma may become invalid (for example
|
||||
* by other thread calling munmap()).
|
||||
*/
|
||||
@ -4508,7 +4510,7 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
|
||||
* concurrent faults).
|
||||
*
|
||||
* The mmap_lock may have been released depending on flags and our return value.
|
||||
* See filemap_fault() and __lock_page_or_retry().
|
||||
* See filemap_fault() and __folio_lock_or_retry().
|
||||
*/
|
||||
static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
|
||||
{
|
||||
@ -4612,7 +4614,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
|
||||
* By the time we get here, we already hold the mm semaphore
|
||||
*
|
||||
* The mmap_lock may have been released depending on flags and our
|
||||
* return value. See filemap_fault() and __lock_page_or_retry().
|
||||
* return value. See filemap_fault() and __folio_lock_or_retry().
|
||||
*/
|
||||
static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
|
||||
unsigned long address, unsigned int flags)
|
||||
@ -4768,7 +4770,7 @@ static inline void mm_account_fault(struct pt_regs *regs,
|
||||
* By the time we get here, we already hold the mm semaphore
|
||||
*
|
||||
* The mmap_lock may have been released depending on flags and our
|
||||
* return value. See filemap_fault() and __lock_page_or_retry().
|
||||
* return value. See filemap_fault() and __folio_lock_or_retry().
|
||||
*/
|
||||
vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
|
||||
unsigned int flags, struct pt_regs *regs)
|
||||
@ -4829,13 +4831,13 @@ int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
|
||||
if (!new)
|
||||
return -ENOMEM;
|
||||
|
||||
smp_wmb(); /* See comment in __pte_alloc */
|
||||
|
||||
spin_lock(&mm->page_table_lock);
|
||||
if (pgd_present(*pgd)) /* Another has populated it */
|
||||
if (pgd_present(*pgd)) { /* Another has populated it */
|
||||
p4d_free(mm, new);
|
||||
else
|
||||
} else {
|
||||
smp_wmb(); /* See comment in pmd_install() */
|
||||
pgd_populate(mm, pgd, new);
|
||||
}
|
||||
spin_unlock(&mm->page_table_lock);
|
||||
return 0;
|
||||
}
|
||||
@ -4852,11 +4854,10 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
|
||||
if (!new)
|
||||
return -ENOMEM;
|
||||
|
||||
smp_wmb(); /* See comment in __pte_alloc */
|
||||
|
||||
spin_lock(&mm->page_table_lock);
|
||||
if (!p4d_present(*p4d)) {
|
||||
mm_inc_nr_puds(mm);
|
||||
smp_wmb(); /* See comment in pmd_install() */
|
||||
p4d_populate(mm, p4d, new);
|
||||
} else /* Another has populated it */
|
||||
pud_free(mm, new);
|
||||
@ -4877,14 +4878,14 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
|
||||
if (!new)
|
||||
return -ENOMEM;
|
||||
|
||||
smp_wmb(); /* See comment in __pte_alloc */
|
||||
|
||||
ptl = pud_lock(mm, pud);
|
||||
if (!pud_present(*pud)) {
|
||||
mm_inc_nr_pmds(mm);
|
||||
smp_wmb(); /* See comment in pmd_install() */
|
||||
pud_populate(mm, pud, new);
|
||||
} else /* Another has populated it */
|
||||
} else { /* Another has populated it */
|
||||
pmd_free(mm, new);
|
||||
}
|
||||
spin_unlock(ptl);
|
||||
return 0;
|
||||
}
|
||||
@ -5265,7 +5266,7 @@ void __might_fault(const char *file, int line)
|
||||
return;
|
||||
if (pagefault_disabled())
|
||||
return;
|
||||
__might_sleep(file, line, 0);
|
||||
__might_sleep(file, line);
|
||||
#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
|
||||
if (current->mm)
|
||||
might_lock_read(¤t->mm->mmap_lock);
|
||||
@ -5421,7 +5422,6 @@ long copy_huge_page_from_user(struct page *dst_page,
|
||||
unsigned int pages_per_huge_page,
|
||||
bool allow_pagefault)
|
||||
{
|
||||
void *src = (void *)usr_src;
|
||||
void *page_kaddr;
|
||||
unsigned long i, rc = 0;
|
||||
unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
|
||||
@ -5434,8 +5434,7 @@ long copy_huge_page_from_user(struct page *dst_page,
|
||||
else
|
||||
page_kaddr = kmap_atomic(subpage);
|
||||
rc = copy_from_user(page_kaddr,
|
||||
(const void __user *)(src + i * PAGE_SIZE),
|
||||
PAGE_SIZE);
|
||||
usr_src + i * PAGE_SIZE, PAGE_SIZE);
|
||||
if (allow_pagefault)
|
||||
kunmap(subpage);
|
||||
else
|
||||
|
@ -21,7 +21,6 @@
|
||||
#include <linux/memory.h>
|
||||
#include <linux/memremap.h>
|
||||
#include <linux/memory_hotplug.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/vmalloc.h>
|
||||
#include <linux/ioport.h>
|
||||
#include <linux/delay.h>
|
||||
@ -36,6 +35,7 @@
|
||||
#include <linux/memblock.h>
|
||||
#include <linux/compaction.h>
|
||||
#include <linux/rmap.h>
|
||||
#include <linux/module.h>
|
||||
|
||||
#include <asm/tlbflush.h>
|
||||
|
||||
@ -57,7 +57,7 @@ enum {
|
||||
ONLINE_POLICY_AUTO_MOVABLE,
|
||||
};
|
||||
|
||||
const char *online_policy_to_str[] = {
|
||||
static const char * const online_policy_to_str[] = {
|
||||
[ONLINE_POLICY_CONTIG_ZONES] = "contig-zones",
|
||||
[ONLINE_POLICY_AUTO_MOVABLE] = "auto-movable",
|
||||
};
|
||||
@ -220,7 +220,6 @@ static void release_memory_resource(struct resource *res)
|
||||
kfree(res);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
|
||||
static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
|
||||
const char *reason)
|
||||
{
|
||||
@ -586,10 +585,6 @@ void generic_online_page(struct page *page, unsigned int order)
|
||||
debug_pagealloc_map_pages(page, 1 << order);
|
||||
__free_pages_core(page, order);
|
||||
totalram_pages_add(1UL << order);
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
if (PageHighMem(page))
|
||||
totalhigh_pages_add(1UL << order);
|
||||
#endif
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(generic_online_page);
|
||||
|
||||
@ -626,16 +621,11 @@ static void node_states_check_changes_online(unsigned long nr_pages,
|
||||
|
||||
arg->status_change_nid = NUMA_NO_NODE;
|
||||
arg->status_change_nid_normal = NUMA_NO_NODE;
|
||||
arg->status_change_nid_high = NUMA_NO_NODE;
|
||||
|
||||
if (!node_state(nid, N_MEMORY))
|
||||
arg->status_change_nid = nid;
|
||||
if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY))
|
||||
arg->status_change_nid_normal = nid;
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
if (zone_idx(zone) <= ZONE_HIGHMEM && !node_state(nid, N_HIGH_MEMORY))
|
||||
arg->status_change_nid_high = nid;
|
||||
#endif
|
||||
}
|
||||
|
||||
static void node_states_set_node(int node, struct memory_notify *arg)
|
||||
@ -643,9 +633,6 @@ static void node_states_set_node(int node, struct memory_notify *arg)
|
||||
if (arg->status_change_nid_normal >= 0)
|
||||
node_set_state(node, N_NORMAL_MEMORY);
|
||||
|
||||
if (arg->status_change_nid_high >= 0)
|
||||
node_set_state(node, N_HIGH_MEMORY);
|
||||
|
||||
if (arg->status_change_nid >= 0)
|
||||
node_set_state(node, N_MEMORY);
|
||||
}
|
||||
@ -1163,7 +1150,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
|
||||
mem_hotplug_done();
|
||||
return ret;
|
||||
}
|
||||
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
|
||||
|
||||
static void reset_node_present_pages(pg_data_t *pgdat)
|
||||
{
|
||||
@ -1357,6 +1343,7 @@ bool mhp_supports_memmap_on_memory(unsigned long size)
|
||||
int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
|
||||
{
|
||||
struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
|
||||
enum memblock_flags memblock_flags = MEMBLOCK_NONE;
|
||||
struct vmem_altmap mhp_altmap = {};
|
||||
struct memory_group *group = NULL;
|
||||
u64 start, size;
|
||||
@ -1384,8 +1371,13 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
|
||||
|
||||
mem_hotplug_begin();
|
||||
|
||||
if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
|
||||
memblock_add_node(start, size, nid);
|
||||
if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
|
||||
if (res->flags & IORESOURCE_SYSRAM_DRIVER_MANAGED)
|
||||
memblock_flags = MEMBLOCK_DRIVER_MANAGED;
|
||||
ret = memblock_add_node(start, size, nid, memblock_flags);
|
||||
if (ret)
|
||||
goto error_mem_hotplug_end;
|
||||
}
|
||||
|
||||
ret = __try_online_node(nid, false);
|
||||
if (ret < 0)
|
||||
@ -1458,6 +1450,7 @@ int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
|
||||
rollback_node_hotadd(nid);
|
||||
if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
|
||||
memblock_remove(start, size);
|
||||
error_mem_hotplug_end:
|
||||
mem_hotplug_done();
|
||||
return ret;
|
||||
}
|
||||
@ -1803,7 +1796,6 @@ static void node_states_check_changes_offline(unsigned long nr_pages,
|
||||
|
||||
arg->status_change_nid = NUMA_NO_NODE;
|
||||
arg->status_change_nid_normal = NUMA_NO_NODE;
|
||||
arg->status_change_nid_high = NUMA_NO_NODE;
|
||||
|
||||
/*
|
||||
* Check whether node_states[N_NORMAL_MEMORY] will be changed.
|
||||
@ -1818,24 +1810,9 @@ static void node_states_check_changes_offline(unsigned long nr_pages,
|
||||
if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages)
|
||||
arg->status_change_nid_normal = zone_to_nid(zone);
|
||||
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
/*
|
||||
* node_states[N_HIGH_MEMORY] contains nodes which
|
||||
* have normal memory or high memory.
|
||||
* Here we add the present_pages belonging to ZONE_HIGHMEM.
|
||||
* If the zone is within the range of [0..ZONE_HIGHMEM), and
|
||||
* we determine that the zones in that range become empty,
|
||||
* we need to clear the node for N_HIGH_MEMORY.
|
||||
*/
|
||||
present_pages += pgdat->node_zones[ZONE_HIGHMEM].present_pages;
|
||||
if (zone_idx(zone) <= ZONE_HIGHMEM && nr_pages >= present_pages)
|
||||
arg->status_change_nid_high = zone_to_nid(zone);
|
||||
#endif
|
||||
|
||||
/*
|
||||
* We have accounted the pages from [0..ZONE_NORMAL), and
|
||||
* in case of CONFIG_HIGHMEM the pages from ZONE_HIGHMEM
|
||||
* as well.
|
||||
* We have accounted the pages from [0..ZONE_NORMAL); ZONE_HIGHMEM
|
||||
* does not apply as we don't support 32bit.
|
||||
* Here we count the possible pages from ZONE_MOVABLE.
|
||||
* If after having accounted all the pages, we see that the nr_pages
|
||||
* to be offlined is over or equal to the accounted pages,
|
||||
@ -1853,9 +1830,6 @@ static void node_states_clear_node(int node, struct memory_notify *arg)
|
||||
if (arg->status_change_nid_normal >= 0)
|
||||
node_clear_state(node, N_NORMAL_MEMORY);
|
||||
|
||||
if (arg->status_change_nid_high >= 0)
|
||||
node_clear_state(node, N_HIGH_MEMORY);
|
||||
|
||||
if (arg->status_change_nid >= 0)
|
||||
node_clear_state(node, N_MEMORY);
|
||||
}
|
||||
@ -2204,7 +2178,7 @@ static int __ref try_remove_memory(u64 start, u64 size)
|
||||
arch_remove_memory(start, size, altmap);
|
||||
|
||||
if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
|
||||
memblock_free(start, size);
|
||||
memblock_phys_free(start, size);
|
||||
memblock_remove(start, size);
|
||||
}
|
||||
|
||||
|
248 mm/mempolicy.c
@ -134,6 +134,8 @@ static struct mempolicy preferred_node_policy[MAX_NUMNODES];
|
||||
* @node: Node id to start the search
|
||||
*
|
||||
* Lookup the next closest node by distance if @nid is not online.
|
||||
*
|
||||
* Return: this @node if it is online, otherwise the closest node by distance
|
||||
*/
|
||||
int numa_map_to_online_node(int node)
|
||||
{
|
||||
@ -296,6 +298,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
|
||||
atomic_set(&policy->refcnt, 1);
|
||||
policy->mode = mode;
|
||||
policy->flags = flags;
|
||||
policy->home_node = NUMA_NO_NODE;
|
||||
|
||||
return policy;
|
||||
}
|
||||
@ -810,7 +813,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
|
||||
((vmstart - vma->vm_start) >> PAGE_SHIFT);
|
||||
prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
|
||||
vma->anon_vma, vma->vm_file, pgoff,
|
||||
new_pol, vma->vm_userfaultfd_ctx);
|
||||
new_pol, vma->vm_userfaultfd_ctx,
|
||||
anon_vma_name(vma));
|
||||
if (prev) {
|
||||
vma = prev;
|
||||
next = vma->vm_next;
|
||||
@ -1477,6 +1481,77 @@ static long kernel_mbind(unsigned long start, unsigned long len,
|
||||
return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
|
||||
}
|
||||
|
||||
SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
|
||||
unsigned long, home_node, unsigned long, flags)
|
||||
{
|
||||
struct mm_struct *mm = current->mm;
|
||||
struct vm_area_struct *vma;
|
||||
struct mempolicy *new;
|
||||
unsigned long vmstart;
|
||||
unsigned long vmend;
|
||||
unsigned long end;
|
||||
int err = -ENOENT;
|
||||
|
||||
start = untagged_addr(start);
|
||||
if (start & ~PAGE_MASK)
|
||||
return -EINVAL;
|
||||
/*
|
||||
* flags is used for future extension if any.
|
||||
*/
|
||||
if (flags != 0)
|
||||
return -EINVAL;
|
||||
|
||||
/*
|
||||
* Check home_node is online to avoid accessing uninitialized
|
||||
* NODE_DATA.
|
||||
*/
|
||||
if (home_node >= MAX_NUMNODES || !node_online(home_node))
|
||||
return -EINVAL;
|
||||
|
||||
len = (len + PAGE_SIZE - 1) & PAGE_MASK;
|
||||
end = start + len;
|
||||
|
||||
if (end < start)
|
||||
return -EINVAL;
|
||||
if (end == start)
|
||||
return 0;
|
||||
mmap_write_lock(mm);
|
||||
vma = find_vma(mm, start);
|
||||
for (; vma && vma->vm_start < end; vma = vma->vm_next) {
|
||||
|
||||
vmstart = max(start, vma->vm_start);
|
||||
vmend = min(end, vma->vm_end);
|
||||
new = mpol_dup(vma_policy(vma));
|
||||
if (IS_ERR(new)) {
|
||||
err = PTR_ERR(new);
|
||||
break;
|
||||
}
|
||||
/*
|
||||
* Only update home node if there is an existing vma policy
|
||||
*/
|
||||
if (!new)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* If any vma in the range got policy other than MPOL_BIND
|
||||
* or MPOL_PREFERRED_MANY we return error. We don't reset
|
||||
* the home node for vmas we already updated before.
|
||||
*/
|
||||
if (new->mode != MPOL_BIND && new->mode != MPOL_PREFERRED_MANY) {
|
||||
err = -EOPNOTSUPP;
|
||||
break;
|
||||
}
|
||||
|
||||
new->home_node = home_node;
|
||||
err = mbind_range(mm, vmstart, vmend, new);
|
||||
mpol_put(new);
|
||||
if (err)
|
||||
break;
|
||||
}
|
||||
mmap_write_unlock(mm);
|
||||
return err;
|
||||
}
|
||||
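
The new syscall above takes a page-aligned range, a home node, and a flags word that must currently be zero. A hedged user-space sketch of invoking it via syscall(2) follows; the fallback syscall number (450) matches the generic table at the time of this series but is an assumption to verify against your kernel headers, and the mmap'd test range is illustrative.

```c
/* Sketch of calling set_mempolicy_home_node() directly; check
 * <asm/unistd.h> on your system for the real syscall number. */
#define _GNU_SOURCE
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_set_mempolicy_home_node
#define __NR_set_mempolicy_home_node 450	/* assumption: generic table */
#endif

int main(void)
{
	size_t len = 4 * 1024 * 1024;
	void *addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (addr == MAP_FAILED)
		return 1;

	/* A real caller would first apply MPOL_BIND or MPOL_PREFERRED_MANY
	 * to the range (e.g. via mbind()); otherwise the kernel returns
	 * -ENOENT or -EOPNOTSUPP as the code above shows. */
	long ret = syscall(__NR_set_mempolicy_home_node,
			   (unsigned long)addr, len, 0 /* home node */, 0 /* flags */);
	if (ret)
		fprintf(stderr, "set_mempolicy_home_node: %s\n", strerror(errno));

	munmap(addr, len);
	return ret ? 1 : 0;
}
```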
|
||||
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
|
||||
unsigned long, mode, const unsigned long __user *, nmask,
|
||||
unsigned long, maxnode, unsigned int, flags)
|
||||
@ -1801,6 +1876,11 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
|
||||
WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
|
||||
}
|
||||
|
||||
if ((policy->mode == MPOL_BIND ||
|
||||
policy->mode == MPOL_PREFERRED_MANY) &&
|
||||
policy->home_node != NUMA_NO_NODE)
|
||||
return policy->home_node;
|
||||
|
||||
return nd;
|
||||
}
|
||||
|
||||
@ -2061,7 +2141,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
|
||||
preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
|
||||
page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes);
|
||||
if (!page)
|
||||
page = __alloc_pages(gfp, order, numa_node_id(), NULL);
|
||||
page = __alloc_pages(gfp, order, nid, NULL);
|
||||
|
||||
return page;
|
||||
}
|
||||
@ -2072,7 +2152,6 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
|
||||
* @order: Order of the GFP allocation.
|
||||
* @vma: Pointer to VMA or NULL if not available.
|
||||
* @addr: Virtual address of the allocation. Must be inside @vma.
|
||||
* @node: Which node to prefer for allocation (modulo policy).
|
||||
* @hugepage: For hugepages try only the preferred node if possible.
|
||||
*
|
||||
* Allocate a page for a specific address in @vma, using the appropriate
|
||||
@ -2083,9 +2162,10 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
|
||||
* Return: The page on success or NULL if allocation fails.
|
||||
*/
|
||||
struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
|
||||
unsigned long addr, int node, bool hugepage)
|
||||
unsigned long addr, bool hugepage)
|
||||
{
|
||||
struct mempolicy *pol;
|
||||
int node = numa_node_id();
|
||||
struct page *page;
|
||||
int preferred_nid;
|
||||
nodemask_t *nmask;
|
||||
@ -2102,6 +2182,7 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
|
||||
}
|
||||
|
||||
if (pol->mode == MPOL_PREFERRED_MANY) {
|
||||
node = policy_node(gfp, pol, node);
|
||||
page = alloc_pages_preferred_many(gfp, order, node, pol);
|
||||
mpol_cond_put(pol);
|
||||
goto out;
|
||||
@ -2185,7 +2266,7 @@ struct page *alloc_pages(gfp_t gfp, unsigned order)
|
||||
page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
|
||||
else if (pol->mode == MPOL_PREFERRED_MANY)
|
||||
page = alloc_pages_preferred_many(gfp, order,
|
||||
numa_node_id(), pol);
|
||||
policy_node(gfp, pol, numa_node_id()), pol);
|
||||
else
|
||||
page = __alloc_pages(gfp, order,
|
||||
policy_node(gfp, pol, numa_node_id()),
|
||||
@ -2195,6 +2276,98 @@ struct page *alloc_pages(gfp_t gfp, unsigned order)
|
||||
}
|
||||
EXPORT_SYMBOL(alloc_pages);
|
||||
|
||||
struct folio *folio_alloc(gfp_t gfp, unsigned order)
|
||||
{
|
||||
struct page *page = alloc_pages(gfp | __GFP_COMP, order);
|
||||
|
||||
if (page && order > 1)
|
||||
prep_transhuge_page(page);
|
||||
return (struct folio *)page;
|
||||
}
|
||||
EXPORT_SYMBOL(folio_alloc);
|
||||
|
||||
static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
		struct mempolicy *pol, unsigned long nr_pages,
		struct page **page_array)
{
	int nodes;
	unsigned long nr_pages_per_node;
	int delta;
	int i;
	unsigned long nr_allocated;
	unsigned long total_allocated = 0;

	nodes = nodes_weight(pol->nodes);
	nr_pages_per_node = nr_pages / nodes;
	delta = nr_pages - nodes * nr_pages_per_node;

	for (i = 0; i < nodes; i++) {
		if (delta) {
			nr_allocated = __alloc_pages_bulk(gfp,
					interleave_nodes(pol), NULL,
					nr_pages_per_node + 1, NULL,
					page_array);
			delta--;
		} else {
			nr_allocated = __alloc_pages_bulk(gfp,
					interleave_nodes(pol), NULL,
					nr_pages_per_node, NULL, page_array);
		}

		page_array += nr_allocated;
		total_allocated += nr_allocated;
	}

	return total_allocated;
}
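
The interleaved bulk path above splits nr_pages evenly across the weight of the node mask and spreads the remainder one extra page per node. A small stand-alone sketch of just that arithmetic (no allocator involved, names are illustrative):

```c
/* Demonstrates the nr_pages_per_node / delta split used by the
 * interleaved bulk-allocation path; purely arithmetic, no allocation. */
#include <stdio.h>

static void split_interleave(unsigned long nr_pages, int nodes)
{
	unsigned long per_node = nr_pages / nodes;
	int delta = nr_pages - (unsigned long)nodes * per_node;

	for (int i = 0; i < nodes; i++) {
		unsigned long chunk = per_node + (delta > 0 ? 1 : 0);

		if (delta > 0)
			delta--;
		printf("node slot %d: %lu pages\n", i, chunk);
	}
}

int main(void)
{
	split_interleave(10, 3);	/* prints 4 + 3 + 3 */
	return 0;
}
```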
|
||||
static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
|
||||
struct mempolicy *pol, unsigned long nr_pages,
|
||||
struct page **page_array)
|
||||
{
|
||||
gfp_t preferred_gfp;
|
||||
unsigned long nr_allocated = 0;
|
||||
|
||||
preferred_gfp = gfp | __GFP_NOWARN;
|
||||
preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
|
||||
|
||||
nr_allocated = __alloc_pages_bulk(preferred_gfp, nid, &pol->nodes,
|
||||
nr_pages, NULL, page_array);
|
||||
|
||||
if (nr_allocated < nr_pages)
|
||||
nr_allocated += __alloc_pages_bulk(gfp, numa_node_id(), NULL,
|
||||
nr_pages - nr_allocated, NULL,
|
||||
page_array + nr_allocated);
|
||||
return nr_allocated;
|
||||
}
|
||||
|
||||
/* alloc pages bulk and mempolicy should be considered at the
|
||||
* same time in some situation such as vmalloc.
|
||||
*
|
||||
* It can accelerate memory allocation especially interleaving
|
||||
* allocate memory.
|
||||
*/
|
||||
unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp,
|
||||
unsigned long nr_pages, struct page **page_array)
|
||||
{
|
||||
struct mempolicy *pol = &default_policy;
|
||||
|
||||
if (!in_interrupt() && !(gfp & __GFP_THISNODE))
|
||||
pol = get_task_policy(current);
|
||||
|
||||
if (pol->mode == MPOL_INTERLEAVE)
|
||||
return alloc_pages_bulk_array_interleave(gfp, pol,
|
||||
nr_pages, page_array);
|
||||
|
||||
if (pol->mode == MPOL_PREFERRED_MANY)
|
||||
return alloc_pages_bulk_array_preferred_many(gfp,
|
||||
numa_node_id(), pol, nr_pages, page_array);
|
||||
|
||||
return __alloc_pages_bulk(gfp, policy_node(gfp, pol, numa_node_id()),
|
||||
policy_nodemask(gfp, pol), nr_pages, NULL,
|
||||
page_array);
|
||||
}
|
||||
|
||||
int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
|
||||
{
|
||||
struct mempolicy *pol = mpol_dup(vma_policy(src));
|
||||
@ -2249,6 +2422,8 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
|
||||
return false;
|
||||
if (a->flags != b->flags)
|
||||
return false;
|
||||
if (a->home_node != b->home_node)
|
||||
return false;
|
||||
if (mpol_store_user_nodemask(a))
|
||||
if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
|
||||
return false;
|
||||
@ -2792,7 +2967,7 @@ static const char * const policy_modes[] =
|
||||
* Format of input:
|
||||
* <mode>[=<flags>][:<nodelist>]
|
||||
*
|
||||
* On success, returns 0, else 1
|
||||
* Return: %0 on success, else %1
|
||||
*/
|
||||
int mpol_parse_str(char *str, struct mempolicy **mpol)
|
||||
{
|
||||
@ -2974,64 +3149,3 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
|
||||
p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
|
||||
nodemask_pr_args(&nodes));
|
||||
}
|
||||
|
||||
bool numa_demotion_enabled = false;
|
||||
|
||||
#ifdef CONFIG_SYSFS
|
||||
static ssize_t numa_demotion_enabled_show(struct kobject *kobj,
|
||||
struct kobj_attribute *attr, char *buf)
|
||||
{
|
||||
return sysfs_emit(buf, "%s\n",
|
||||
numa_demotion_enabled? "true" : "false");
|
||||
}
|
||||
|
||||
static ssize_t numa_demotion_enabled_store(struct kobject *kobj,
|
||||
struct kobj_attribute *attr,
|
||||
const char *buf, size_t count)
|
||||
{
|
||||
if (!strncmp(buf, "true", 4) || !strncmp(buf, "1", 1))
|
||||
numa_demotion_enabled = true;
|
||||
else if (!strncmp(buf, "false", 5) || !strncmp(buf, "0", 1))
|
||||
numa_demotion_enabled = false;
|
||||
else
|
||||
return -EINVAL;
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
static struct kobj_attribute numa_demotion_enabled_attr =
|
||||
__ATTR(demotion_enabled, 0644, numa_demotion_enabled_show,
|
||||
numa_demotion_enabled_store);
|
||||
|
||||
static struct attribute *numa_attrs[] = {
|
||||
&numa_demotion_enabled_attr.attr,
|
||||
NULL,
|
||||
};
|
||||
|
||||
static const struct attribute_group numa_attr_group = {
|
||||
.attrs = numa_attrs,
|
||||
};
|
||||
|
||||
static int __init numa_init_sysfs(void)
|
||||
{
|
||||
int err;
|
||||
struct kobject *numa_kobj;
|
||||
|
||||
numa_kobj = kobject_create_and_add("numa", mm_kobj);
|
||||
if (!numa_kobj) {
|
||||
pr_err("failed to create numa kobject\n");
|
||||
return -ENOMEM;
|
||||
}
|
||||
err = sysfs_create_group(numa_kobj, &numa_attr_group);
|
||||
if (err) {
|
||||
pr_err("failed to register numa group\n");
|
||||
goto delete_obj;
|
||||
}
|
||||
return 0;
|
||||
|
||||
delete_obj:
|
||||
kobject_put(numa_kobj);
|
||||
return err;
|
||||
}
|
||||
subsys_initcall(numa_init_sysfs);
|
||||
#endif
|
||||
|
@ -17,7 +17,6 @@
#include <linux/kmemleak.h>
#include <linux/export.h>
#include <linux/mempool.h>
#include <linux/blkdev.h>
#include <linux/writeback.h>
#include "slab.h"

@ -102,39 +102,22 @@ static unsigned long pfn_end(struct dev_pagemap *pgmap, int range_id)
	return (range->start + range_len(range)) >> PAGE_SHIFT;
}

static unsigned long pfn_next(unsigned long pfn)
static unsigned long pfn_next(struct dev_pagemap *pgmap, unsigned long pfn)
{
	if (pfn % 1024 == 0)
	if (pfn % (1024 << pgmap->vmemmap_shift))
		cond_resched();
	return pfn + 1;
	return pfn + pgmap_vmemmap_nr(pgmap);
}

static unsigned long pfn_len(struct dev_pagemap *pgmap, unsigned long range_id)
{
	return (pfn_end(pgmap, range_id) -
		pfn_first(pgmap, range_id)) >> pgmap->vmemmap_shift;
}
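
With the vmemmap_shift-aware iteration above, pfn_next() advances by one compound page (2^vmemmap_shift base pages) and pfn_len() counts compound steps rather than base pages. A quick arithmetic sketch of that stride, with made-up illustrative values:

```c
/* Models the stride arithmetic of the reworked pfn_next()/pfn_len():
 * with vmemmap_shift = s, each step covers 1 << s base pages. */
#include <stdio.h>

int main(void)
{
	unsigned long pfn_first = 0x100000, pfn_end = 0x100000 + 4096;
	unsigned int vmemmap_shift = 4;		/* e.g. 16 base pages per step */
	unsigned long stride = 1UL << vmemmap_shift;
	unsigned long steps = 0;

	for (unsigned long pfn = pfn_first; pfn < pfn_end; pfn += stride)
		steps++;

	/* pfn_len() computes the same count directly */
	printf("iterated %lu steps, pfn_len = %lu\n",
	       steps, (pfn_end - pfn_first) >> vmemmap_shift);
	return 0;
}
```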
|
||||
#define for_each_device_pfn(pfn, map, i) \
|
||||
for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); pfn = pfn_next(pfn))
|
||||
|
||||
static void dev_pagemap_kill(struct dev_pagemap *pgmap)
|
||||
{
|
||||
if (pgmap->ops && pgmap->ops->kill)
|
||||
pgmap->ops->kill(pgmap);
|
||||
else
|
||||
percpu_ref_kill(pgmap->ref);
|
||||
}
|
||||
|
||||
static void dev_pagemap_cleanup(struct dev_pagemap *pgmap)
|
||||
{
|
||||
if (pgmap->ops && pgmap->ops->cleanup) {
|
||||
pgmap->ops->cleanup(pgmap);
|
||||
} else {
|
||||
wait_for_completion(&pgmap->done);
|
||||
percpu_ref_exit(pgmap->ref);
|
||||
}
|
||||
/*
|
||||
* Undo the pgmap ref assignment for the internal case as the
|
||||
* caller may re-enable the same pgmap.
|
||||
*/
|
||||
if (pgmap->ref == &pgmap->internal_ref)
|
||||
pgmap->ref = NULL;
|
||||
}
|
||||
for (pfn = pfn_first(map, i); pfn < pfn_end(map, i); \
|
||||
pfn = pfn_next(map, pfn))
|
||||
|
||||
static void pageunmap_range(struct dev_pagemap *pgmap, int range_id)
|
||||
{
|
||||
@ -167,11 +150,12 @@ void memunmap_pages(struct dev_pagemap *pgmap)
|
||||
unsigned long pfn;
|
||||
int i;
|
||||
|
||||
dev_pagemap_kill(pgmap);
|
||||
percpu_ref_kill(&pgmap->ref);
|
||||
for (i = 0; i < pgmap->nr_range; i++)
|
||||
for_each_device_pfn(pfn, pgmap, i)
|
||||
put_page(pfn_to_page(pfn));
|
||||
dev_pagemap_cleanup(pgmap);
|
||||
wait_for_completion(&pgmap->done);
|
||||
percpu_ref_exit(&pgmap->ref);
|
||||
|
||||
for (i = 0; i < pgmap->nr_range; i++)
|
||||
pageunmap_range(pgmap, i);
|
||||
@ -188,8 +172,7 @@ static void devm_memremap_pages_release(void *data)
|
||||
|
||||
static void dev_pagemap_percpu_release(struct percpu_ref *ref)
|
||||
{
|
||||
struct dev_pagemap *pgmap =
|
||||
container_of(ref, struct dev_pagemap, internal_ref);
|
||||
struct dev_pagemap *pgmap = container_of(ref, struct dev_pagemap, ref);
|
||||
|
||||
complete(&pgmap->done);
|
||||
}
|
||||
@ -295,8 +278,7 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params,
|
||||
memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
|
||||
PHYS_PFN(range->start),
|
||||
PHYS_PFN(range_len(range)), pgmap);
|
||||
percpu_ref_get_many(pgmap->ref, pfn_end(pgmap, range_id)
|
||||
- pfn_first(pgmap, range_id));
|
||||
percpu_ref_get_many(&pgmap->ref, pfn_len(pgmap, range_id));
|
||||
return 0;
|
||||
|
||||
err_add_memory:
|
||||
@ -362,22 +344,11 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
|
||||
break;
|
||||
}
|
||||
|
||||
if (!pgmap->ref) {
|
||||
if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup))
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
init_completion(&pgmap->done);
|
||||
error = percpu_ref_init(&pgmap->internal_ref,
|
||||
dev_pagemap_percpu_release, 0, GFP_KERNEL);
|
||||
if (error)
|
||||
return ERR_PTR(error);
|
||||
pgmap->ref = &pgmap->internal_ref;
|
||||
} else {
|
||||
if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) {
|
||||
WARN(1, "Missing reference count teardown definition\n");
|
||||
return ERR_PTR(-EINVAL);
|
||||
}
|
||||
}
|
||||
init_completion(&pgmap->done);
|
||||
error = percpu_ref_init(&pgmap->ref, dev_pagemap_percpu_release, 0,
|
||||
GFP_KERNEL);
|
||||
if (error)
|
||||
return ERR_PTR(error);
|
||||
|
||||
devmap_managed_enable_get(pgmap);
|
||||
|
||||
@ -486,7 +457,7 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
|
||||
/* fall back to slow path lookup */
|
||||
rcu_read_lock();
|
||||
pgmap = xa_load(&pgmap_array, PHYS_PFN(phys));
|
||||
if (pgmap && !percpu_ref_tryget_live(pgmap->ref))
|
||||
if (pgmap && !percpu_ref_tryget_live(&pgmap->ref))
|
||||
pgmap = NULL;
|
||||
rcu_read_unlock();
|
||||
|
||||
@ -505,7 +476,7 @@ void free_devmap_managed_page(struct page *page)
|
||||
|
||||
__ClearPageWaiters(page);
|
||||
|
||||
mem_cgroup_uncharge(page);
|
||||
mem_cgroup_uncharge(page_folio(page));
|
||||
|
||||
/*
|
||||
* When a device_private page is freed, the page->mapping field
|
||||
|
817 mm/migrate.c
File diff suppressed because it is too large
@ -271,6 +271,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
|
||||
/* Phase 1: page isolation */
|
||||
for (i = 0; i < nr; i++) {
|
||||
struct page *page = pvec->pages[i];
|
||||
struct folio *folio = page_folio(page);
|
||||
|
||||
if (TestClearPageMlocked(page)) {
|
||||
/*
|
||||
@ -278,7 +279,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
|
||||
* so we can spare the get_page() here.
|
||||
*/
|
||||
if (TestClearPageLRU(page)) {
|
||||
lruvec = relock_page_lruvec_irq(page, lruvec);
|
||||
lruvec = folio_lruvec_relock_irq(folio, lruvec);
|
||||
del_page_from_lru_list(page, lruvec);
|
||||
continue;
|
||||
} else
|
||||
@ -511,7 +512,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
|
||||
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
|
||||
*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
|
||||
vma->vm_file, pgoff, vma_policy(vma),
|
||||
vma->vm_userfaultfd_ctx);
|
||||
vma->vm_userfaultfd_ctx, anon_vma_name(vma));
|
||||
if (*prev) {
|
||||
vma = *prev;
|
||||
goto success;
|
||||
|
62 mm/mmap.c
@ -13,6 +13,7 @@
|
||||
#include <linux/slab.h>
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/mm_inline.h>
|
||||
#include <linux/vmacache.h>
|
||||
#include <linux/shm.h>
|
||||
#include <linux/mman.h>
|
||||
@ -1029,7 +1030,8 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
|
||||
*/
|
||||
static inline int is_mergeable_vma(struct vm_area_struct *vma,
|
||||
struct file *file, unsigned long vm_flags,
|
||||
struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
|
||||
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
|
||||
struct anon_vma_name *anon_name)
|
||||
{
|
||||
/*
|
||||
* VM_SOFTDIRTY should not prevent from VMA merging, if we
|
||||
@ -1047,6 +1049,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
|
||||
return 0;
|
||||
if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
|
||||
return 0;
|
||||
if (!anon_vma_name_eq(anon_vma_name(vma), anon_name))
|
||||
return 0;
|
||||
return 1;
|
||||
}
|
||||
|
||||
@ -1079,9 +1083,10 @@ static int
|
||||
can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
|
||||
struct anon_vma *anon_vma, struct file *file,
|
||||
pgoff_t vm_pgoff,
|
||||
struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
|
||||
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
|
||||
struct anon_vma_name *anon_name)
|
||||
{
|
||||
if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
|
||||
if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
|
||||
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
|
||||
if (vma->vm_pgoff == vm_pgoff)
|
||||
return 1;
|
||||
@ -1100,9 +1105,10 @@ static int
|
||||
can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
|
||||
struct anon_vma *anon_vma, struct file *file,
|
||||
pgoff_t vm_pgoff,
|
||||
struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
|
||||
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
|
||||
struct anon_vma_name *anon_name)
|
||||
{
|
||||
if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
|
||||
if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
|
||||
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
|
||||
pgoff_t vm_pglen;
|
||||
vm_pglen = vma_pages(vma);
|
||||
@ -1113,9 +1119,9 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
|
||||
}
|
||||
|
||||
/*
|
||||
* Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
|
||||
* whether that can be merged with its predecessor or its successor.
|
||||
* Or both (it neatly fills a hole).
|
||||
* Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
|
||||
* figure out whether that can be merged with its predecessor or its
|
||||
* successor. Or both (it neatly fills a hole).
|
||||
*
|
||||
* In most cases - when called for mmap, brk or mremap - [addr,end) is
|
||||
* certain not to be mapped by the time vma_merge is called; but when
|
||||
@ -1160,7 +1166,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
|
||||
unsigned long end, unsigned long vm_flags,
|
||||
struct anon_vma *anon_vma, struct file *file,
|
||||
pgoff_t pgoff, struct mempolicy *policy,
|
||||
struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
|
||||
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
|
||||
struct anon_vma_name *anon_name)
|
||||
{
|
||||
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
|
||||
struct vm_area_struct *area, *next;
|
||||
@ -1190,7 +1197,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
|
||||
mpol_equal(vma_policy(prev), policy) &&
|
||||
can_vma_merge_after(prev, vm_flags,
|
||||
anon_vma, file, pgoff,
|
||||
vm_userfaultfd_ctx)) {
|
||||
vm_userfaultfd_ctx, anon_name)) {
|
||||
/*
|
||||
* OK, it can. Can we now merge in the successor as well?
|
||||
*/
|
||||
@ -1199,7 +1206,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
|
||||
can_vma_merge_before(next, vm_flags,
|
||||
anon_vma, file,
|
||||
pgoff+pglen,
|
||||
vm_userfaultfd_ctx) &&
|
||||
vm_userfaultfd_ctx, anon_name) &&
|
||||
is_mergeable_anon_vma(prev->anon_vma,
|
||||
next->anon_vma, NULL)) {
|
||||
/* cases 1, 6 */
|
||||
@ -1222,7 +1229,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
|
||||
mpol_equal(policy, vma_policy(next)) &&
|
||||
can_vma_merge_before(next, vm_flags,
|
||||
anon_vma, file, pgoff+pglen,
|
||||
vm_userfaultfd_ctx)) {
|
||||
vm_userfaultfd_ctx, anon_name)) {
|
||||
if (prev && addr < prev->vm_end) /* case 4 */
|
||||
err = __vma_adjust(prev, prev->vm_start,
|
||||
addr, prev->vm_pgoff, NULL, next);
|
||||
@ -1599,7 +1606,6 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
|
||||
goto out_fput;
|
||||
}
|
||||
} else if (flags & MAP_HUGETLB) {
|
||||
struct ucounts *ucounts = NULL;
|
||||
struct hstate *hs;
|
||||
|
||||
hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
|
||||
@ -1615,7 +1621,7 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
|
||||
*/
|
||||
file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
|
||||
VM_NORESERVE,
|
||||
&ucounts, HUGETLB_ANONHUGE_INODE,
|
||||
HUGETLB_ANONHUGE_INODE,
|
||||
(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
|
||||
if (IS_ERR(file))
|
||||
return PTR_ERR(file);
|
||||
@ -1755,7 +1761,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
|
||||
* Can we just expand an old mapping?
|
||||
*/
|
||||
vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
|
||||
NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
|
||||
NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
|
||||
if (vma)
|
||||
goto out;
|
||||
|
||||
@ -1804,7 +1810,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
|
||||
*/
|
||||
if (unlikely(vm_flags != vma->vm_flags && prev)) {
|
||||
merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
|
||||
NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX);
|
||||
NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
|
||||
if (merge) {
|
||||
/* ->mmap() can change vma->vm_file and fput the original file. So
|
||||
* fput the vma->vm_file here or we would add an extra fput for file
|
||||
@ -2929,7 +2935,6 @@ EXPORT_SYMBOL(vm_munmap);
|
||||
SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
|
||||
{
|
||||
addr = untagged_addr(addr);
|
||||
profile_munmap(addr);
|
||||
return __vm_munmap(addr, len, true);
|
||||
}
|
||||
|
||||
@ -3057,7 +3062,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla
|
||||
|
||||
/* Can we just expand an old private anonymous mapping? */
|
||||
vma = vma_merge(mm, prev, addr, addr + len, flags,
|
||||
NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX);
|
||||
NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
|
||||
if (vma)
|
||||
goto out;
|
||||
|
||||
@ -3143,25 +3148,27 @@ void exit_mmap(struct mm_struct *mm)
|
||||
* to mmu_notifier_release(mm) ensures mmu notifier callbacks in
|
||||
* __oom_reap_task_mm() will not block.
|
||||
*
|
||||
* This needs to be done before calling munlock_vma_pages_all(),
|
||||
* This needs to be done before calling unlock_range(),
|
||||
* which clears VM_LOCKED, otherwise the oom reaper cannot
|
||||
* reliably test it.
|
||||
*/
|
||||
(void)__oom_reap_task_mm(mm);
|
||||
|
||||
set_bit(MMF_OOM_SKIP, &mm->flags);
|
||||
mmap_write_lock(mm);
|
||||
mmap_write_unlock(mm);
|
||||
}
|
||||
|
||||
mmap_write_lock(mm);
|
||||
if (mm->locked_vm)
|
||||
unlock_range(mm->mmap, ULONG_MAX);
|
||||
|
||||
arch_exit_mmap(mm);
|
||||
|
||||
vma = mm->mmap;
|
||||
if (!vma) /* Can happen if dup_mmap() received an OOM */
|
||||
if (!vma) {
|
||||
/* Can happen if dup_mmap() received an OOM */
|
||||
mmap_write_unlock(mm);
|
||||
return;
|
||||
}
|
||||
|
||||
lru_add_drain();
|
||||
flush_cache_mm(mm);
|
||||
@ -3172,16 +3179,15 @@ void exit_mmap(struct mm_struct *mm)
|
||||
free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
|
||||
tlb_finish_mmu(&tlb);
|
||||
|
||||
/*
|
||||
* Walk the list again, actually closing and freeing it,
|
||||
* with preemption enabled, without holding any MM locks.
|
||||
*/
|
||||
/* Walk the list again, actually closing and freeing it. */
|
||||
while (vma) {
|
||||
if (vma->vm_flags & VM_ACCOUNT)
|
||||
nr_accounted += vma_pages(vma);
|
||||
vma = remove_vma(vma);
|
||||
cond_resched();
|
||||
}
|
||||
mm->mmap = NULL;
|
||||
mmap_write_unlock(mm);
|
||||
vm_unacct_memory(nr_accounted);
|
||||
}
|
||||
|
||||
@ -3250,7 +3256,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
|
||||
return NULL; /* should never get here */
|
||||
new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
|
||||
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
|
||||
vma->vm_userfaultfd_ctx);
|
||||
vma->vm_userfaultfd_ctx, anon_vma_name(vma));
|
||||
if (new_vma) {
|
||||
/*
|
||||
* Source vma may have been merged into new_vma
|
||||
@ -3332,7 +3338,7 @@ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
|
||||
|
||||
void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
|
||||
{
|
||||
mm->total_vm += npages;
|
||||
WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm)+npages);
|
||||
|
||||
if (is_exec_mapping(flags))
|
||||
mm->exec_vm += npages;
|
||||
|
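Note: the vma_merge()/is_mergeable_vma() changes above thread a struct anon_vma_name * through the merge paths so that anonymous VMAs carrying a user-supplied name only merge with identically named neighbours. A minimal userspace sketch of the feature this plumbing serves, assuming a kernel built with the anonymous-VMA-naming support; the fallback defines are assumptions for older headers:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>
#include <sys/prctl.h>

#ifndef PR_SET_VMA              /* assumption: values from <linux/prctl.h> */
#define PR_SET_VMA 0x53564d41
#define PR_SET_VMA_ANON_NAME 0
#endif

int main(void)
{
	size_t len = 4096 * 16;
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		return 1;

	/* Names the mapping; it shows up as [anon:my-buffer] in /proc/self/maps */
	if (prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
		  (unsigned long)p, len, (unsigned long)"my-buffer"))
		perror("prctl(PR_SET_VMA)");

	return 0;
}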
@@ -3,6 +3,7 @@
#include <linux/kernel.h>
#include <linux/mmdebug.h>
#include <linux/mm_types.h>
#include <linux/mm_inline.h>
#include <linux/pagemap.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>
@@ -464,7 +464,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*pprev = vma_merge(mm, *pprev, start, end, newflags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
vma->vm_userfaultfd_ctx);
vma->vm_userfaultfd_ctx, anon_vma_name(vma));
if (*pprev) {
vma = *pprev;
VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY);
@@ -563,7 +563,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
error = -ENOMEM;
if (!vma)
goto out;
prev = vma->vm_prev;

if (unlikely(grows & PROT_GROWSDOWN)) {
if (vma->vm_start >= end)
goto out;
@@ -581,8 +581,11 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
goto out;
}
}

if (start > vma->vm_start)
prev = vma;
else
prev = vma->vm_prev;

for (nstart = start ; ; ) {
unsigned long mask_off_old_flags;
86	mm/mremap.c
@ -489,6 +489,10 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
|
||||
old_end = old_addr + len;
|
||||
flush_cache_range(vma, old_addr, old_end);
|
||||
|
||||
if (is_vm_hugetlb_page(vma))
|
||||
return move_hugetlb_page_tables(vma, new_vma, old_addr,
|
||||
new_addr, len);
|
||||
|
||||
mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
|
||||
old_addr, old_end);
|
||||
mmu_notifier_invalidate_range_start(&range);
|
||||
@ -565,6 +569,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
|
||||
bool *locked, unsigned long flags,
|
||||
struct vm_userfaultfd_ctx *uf, struct list_head *uf_unmap)
|
||||
{
|
||||
long to_account = new_len - old_len;
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
struct vm_area_struct *new_vma;
|
||||
unsigned long vm_flags = vma->vm_flags;
|
||||
@ -583,6 +588,9 @@ static unsigned long move_vma(struct vm_area_struct *vma,
|
||||
if (mm->map_count >= sysctl_max_map_count - 3)
|
||||
return -ENOMEM;
|
||||
|
||||
if (unlikely(flags & MREMAP_DONTUNMAP))
|
||||
to_account = new_len;
|
||||
|
||||
if (vma->vm_ops && vma->vm_ops->may_split) {
|
||||
if (vma->vm_start != old_addr)
|
||||
err = vma->vm_ops->may_split(vma, old_addr);
|
||||
@ -604,8 +612,8 @@ static unsigned long move_vma(struct vm_area_struct *vma,
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
if (unlikely(flags & MREMAP_DONTUNMAP && vm_flags & VM_ACCOUNT)) {
|
||||
if (security_vm_enough_memory_mm(mm, new_len >> PAGE_SHIFT))
|
||||
if (vm_flags & VM_ACCOUNT) {
|
||||
if (security_vm_enough_memory_mm(mm, to_account >> PAGE_SHIFT))
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
@ -613,8 +621,8 @@ static unsigned long move_vma(struct vm_area_struct *vma,
|
||||
new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
|
||||
&need_rmap_locks);
|
||||
if (!new_vma) {
|
||||
if (unlikely(flags & MREMAP_DONTUNMAP && vm_flags & VM_ACCOUNT))
|
||||
vm_unacct_memory(new_len >> PAGE_SHIFT);
|
||||
if (vm_flags & VM_ACCOUNT)
|
||||
vm_unacct_memory(to_account >> PAGE_SHIFT);
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
@ -642,6 +650,10 @@ static unsigned long move_vma(struct vm_area_struct *vma,
|
||||
mremap_userfaultfd_prep(new_vma, uf);
|
||||
}
|
||||
|
||||
if (is_vm_hugetlb_page(vma)) {
|
||||
clear_vma_resv_huge_pages(vma);
|
||||
}
|
||||
|
||||
/* Conceal VM_ACCOUNT so old reservation is not undone */
|
||||
if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) {
|
||||
vma->vm_flags &= ~VM_ACCOUNT;
|
||||
@ -708,8 +720,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
|
||||
}
|
||||
|
||||
static struct vm_area_struct *vma_to_resize(unsigned long addr,
|
||||
unsigned long old_len, unsigned long new_len, unsigned long flags,
|
||||
unsigned long *p)
|
||||
unsigned long old_len, unsigned long new_len, unsigned long flags)
|
||||
{
|
||||
struct mm_struct *mm = current->mm;
|
||||
struct vm_area_struct *vma;
|
||||
@ -736,9 +747,6 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
|
||||
(vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
if (is_vm_hugetlb_page(vma))
|
||||
return ERR_PTR(-EINVAL);
|
||||
|
||||
/* We can't remap across vm area boundaries */
|
||||
if (old_len > vma->vm_end - addr)
|
||||
return ERR_PTR(-EFAULT);
|
||||
@ -768,13 +776,6 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
|
||||
(new_len - old_len) >> PAGE_SHIFT))
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
if (vma->vm_flags & VM_ACCOUNT) {
|
||||
unsigned long charged = (new_len - old_len) >> PAGE_SHIFT;
|
||||
if (security_vm_enough_memory_mm(mm, charged))
|
||||
return ERR_PTR(-ENOMEM);
|
||||
*p = charged;
|
||||
}
|
||||
|
||||
return vma;
|
||||
}
|
||||
|
||||
@ -787,7 +788,6 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
|
||||
struct mm_struct *mm = current->mm;
|
||||
struct vm_area_struct *vma;
|
||||
unsigned long ret = -EINVAL;
|
||||
unsigned long charged = 0;
|
||||
unsigned long map_flags = 0;
|
||||
|
||||
if (offset_in_page(new_addr))
|
||||
@ -830,7 +830,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
|
||||
old_len = new_len;
|
||||
}
|
||||
|
||||
vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
|
||||
vma = vma_to_resize(addr, old_len, new_len, flags);
|
||||
if (IS_ERR(vma)) {
|
||||
ret = PTR_ERR(vma);
|
||||
goto out;
|
||||
@ -853,7 +853,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
|
||||
((addr - vma->vm_start) >> PAGE_SHIFT),
|
||||
map_flags);
|
||||
if (IS_ERR_VALUE(ret))
|
||||
goto out1;
|
||||
goto out;
|
||||
|
||||
/* We got a new mapping */
|
||||
if (!(flags & MREMAP_FIXED))
|
||||
@ -862,12 +862,6 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
|
||||
ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf,
|
||||
uf_unmap);
|
||||
|
||||
if (!(offset_in_page(ret)))
|
||||
goto out;
|
||||
|
||||
out1:
|
||||
vm_unacct_memory(charged);
|
||||
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
@ -899,7 +893,6 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
|
||||
struct mm_struct *mm = current->mm;
|
||||
struct vm_area_struct *vma;
|
||||
unsigned long ret = -EINVAL;
|
||||
unsigned long charged = 0;
|
||||
bool locked = false;
|
||||
bool downgraded = false;
|
||||
struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
|
||||
@ -949,6 +942,31 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
|
||||
|
||||
if (mmap_write_lock_killable(current->mm))
|
||||
return -EINTR;
|
||||
vma = find_vma(mm, addr);
|
||||
if (!vma || vma->vm_start > addr) {
|
||||
ret = EFAULT;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (is_vm_hugetlb_page(vma)) {
|
||||
struct hstate *h __maybe_unused = hstate_vma(vma);
|
||||
|
||||
old_len = ALIGN(old_len, huge_page_size(h));
|
||||
new_len = ALIGN(new_len, huge_page_size(h));
|
||||
|
||||
/* addrs must be huge page aligned */
|
||||
if (addr & ~huge_page_mask(h))
|
||||
goto out;
|
||||
if (new_addr & ~huge_page_mask(h))
|
||||
goto out;
|
||||
|
||||
/*
|
||||
* Don't allow remap expansion, because the underlying hugetlb
|
||||
* reservation is not yet capable to handle split reservation.
|
||||
*/
|
||||
if (new_len > old_len)
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {
|
||||
ret = mremap_to(addr, old_len, new_addr, new_len,
|
||||
@ -981,7 +999,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
|
||||
/*
|
||||
* Ok, we need to grow..
|
||||
*/
|
||||
vma = vma_to_resize(addr, old_len, new_len, flags, &charged);
|
||||
vma = vma_to_resize(addr, old_len, new_len, flags);
|
||||
if (IS_ERR(vma)) {
|
||||
ret = PTR_ERR(vma);
|
||||
goto out;
|
||||
@ -992,10 +1010,18 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
|
||||
if (old_len == vma->vm_end - addr) {
|
||||
/* can we just expand the current mapping? */
|
||||
if (vma_expandable(vma, new_len - old_len)) {
|
||||
int pages = (new_len - old_len) >> PAGE_SHIFT;
|
||||
long pages = (new_len - old_len) >> PAGE_SHIFT;
|
||||
|
||||
if (vma->vm_flags & VM_ACCOUNT) {
|
||||
if (security_vm_enough_memory_mm(mm, pages)) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
|
||||
if (vma_adjust(vma, vma->vm_start, addr + new_len,
|
||||
vma->vm_pgoff, NULL)) {
|
||||
vm_unacct_memory(pages);
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
}
|
||||
@ -1034,10 +1060,8 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
|
||||
&locked, flags, &uf, &uf_unmap);
|
||||
}
|
||||
out:
|
||||
if (offset_in_page(ret)) {
|
||||
vm_unacct_memory(charged);
|
||||
if (offset_in_page(ret))
|
||||
locked = false;
|
||||
}
|
||||
if (downgraded)
|
||||
mmap_read_unlock(current->mm);
|
||||
else
|
||||
|
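Note: the mremap changes above fold the MREMAP_DONTUNMAP charge into a single to_account value and add hugetlb support (lengths and addresses must be huge-page aligned, and expansion is refused). A minimal userspace sketch of MREMAP_DONTUNMAP itself, assuming headers that expose the flag; the fallback define is an assumption taken from <linux/mman.h>:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

#ifndef MREMAP_DONTUNMAP        /* assumption: flag value, kernel >= 5.7 */
#define MREMAP_DONTUNMAP 4
#endif

int main(void)
{
	size_t len = 4096 * 4;
	void *old = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (old == MAP_FAILED)
		return 1;

	/*
	 * Move the pages to a new range but keep the old range mapped;
	 * the old range is then backed by fresh zero-fill-on-demand pages.
	 */
	void *new = mremap(old, len, len, MREMAP_MAYMOVE | MREMAP_DONTUNMAP);
	if (new == MAP_FAILED)
		perror("mremap");
	else
		printf("moved %p -> %p, old range still mapped\n", old, new);

	return 0;
}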
@@ -27,7 +27,6 @@
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/compiler.h>
#include <linux/mount.h>
@@ -1639,12 +1638,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
}
EXPORT_SYMBOL(remap_vmalloc_range);

unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
return -ENOMEM;
}

vm_fault_t filemap_fault(struct vm_fault *vmf)
{
BUG();
@ -641,6 +641,8 @@ static void oom_reap_task(struct task_struct *tsk)
|
||||
|
||||
static int oom_reaper(void *unused)
|
||||
{
|
||||
set_freezable();
|
||||
|
||||
while (true) {
|
||||
struct task_struct *tsk = NULL;
|
||||
|
||||
@ -787,11 +789,11 @@ static inline bool __task_will_free_mem(struct task_struct *task)
|
||||
struct signal_struct *sig = task->signal;
|
||||
|
||||
/*
|
||||
* A coredumping process may sleep for an extended period in exit_mm(),
|
||||
* so the oom killer cannot assume that the process will promptly exit
|
||||
* and release memory.
|
||||
* A coredumping process may sleep for an extended period in
|
||||
* coredump_task_exit(), so the oom killer cannot assume that
|
||||
* the process will promptly exit and release memory.
|
||||
*/
|
||||
if (sig->flags & SIGNAL_GROUP_COREDUMP)
|
||||
if (sig->core_state)
|
||||
return false;
|
||||
|
||||
if (sig->flags & SIGNAL_GROUP_EXIT)
|
||||
@ -992,6 +994,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message)
|
||||
* If necessary, kill all tasks in the selected memory cgroup.
|
||||
*/
|
||||
if (oom_group) {
|
||||
memcg_memory_event(oom_group, MEMCG_OOM_GROUP_KILL);
|
||||
mem_cgroup_print_oom_group(oom_group);
|
||||
mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member,
|
||||
(void *)message);
|
||||
@ -1055,7 +1058,7 @@ bool out_of_memory(struct oom_control *oc)
|
||||
|
||||
if (!is_memcg_oom(oc)) {
|
||||
blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
|
||||
if (freed > 0)
|
||||
if (freed > 0 && !is_sysrq_oom(oc))
|
||||
/* Got some memory back in the last second. */
|
||||
return true;
|
||||
}
|
||||
@ -1148,21 +1151,14 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
|
||||
struct task_struct *p;
|
||||
unsigned int f_flags;
|
||||
bool reap = false;
|
||||
struct pid *pid;
|
||||
long ret = 0;
|
||||
|
||||
if (flags)
|
||||
return -EINVAL;
|
||||
|
||||
pid = pidfd_get_pid(pidfd, &f_flags);
|
||||
if (IS_ERR(pid))
|
||||
return PTR_ERR(pid);
|
||||
|
||||
task = get_pid_task(pid, PIDTYPE_TGID);
|
||||
if (!task) {
|
||||
ret = -ESRCH;
|
||||
goto put_pid;
|
||||
}
|
||||
task = pidfd_get_task(pidfd, &f_flags);
|
||||
if (IS_ERR(task))
|
||||
return PTR_ERR(task);
|
||||
|
||||
/*
|
||||
* Make sure to choose a thread which still has a reference to mm
|
||||
@ -1174,15 +1170,15 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
|
||||
goto put_task;
|
||||
}
|
||||
|
||||
if (mmget_not_zero(p->mm)) {
|
||||
mm = p->mm;
|
||||
if (task_will_free_mem(p))
|
||||
reap = true;
|
||||
else {
|
||||
/* Error only if the work has not been done already */
|
||||
if (!test_bit(MMF_OOM_SKIP, &mm->flags))
|
||||
ret = -EINVAL;
|
||||
}
|
||||
mm = p->mm;
|
||||
mmgrab(mm);
|
||||
|
||||
if (task_will_free_mem(p))
|
||||
reap = true;
|
||||
else {
|
||||
/* Error only if the work has not been done already */
|
||||
if (!test_bit(MMF_OOM_SKIP, &mm->flags))
|
||||
ret = -EINVAL;
|
||||
}
|
||||
task_unlock(p);
|
||||
|
||||
@ -1193,17 +1189,18 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
|
||||
ret = -EINTR;
|
||||
goto drop_mm;
|
||||
}
|
||||
if (!__oom_reap_task_mm(mm))
|
||||
/*
|
||||
* Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure
|
||||
* possible change in exit_mmap is seen
|
||||
*/
|
||||
if (!test_bit(MMF_OOM_SKIP, &mm->flags) && !__oom_reap_task_mm(mm))
|
||||
ret = -EAGAIN;
|
||||
mmap_read_unlock(mm);
|
||||
|
||||
drop_mm:
|
||||
if (mm)
|
||||
mmput(mm);
|
||||
mmdrop(mm);
|
||||
put_task:
|
||||
put_task_struct(task);
|
||||
put_pid:
|
||||
put_pid(pid);
|
||||
return ret;
|
||||
#else
|
||||
return -ENOSYS;
|
||||
|
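Note: the process_mrelease() hunks above switch the syscall to pidfd_get_task(), pin the mm with mmgrab() instead of mmget_not_zero(), and re-check MMF_OOM_SKIP under mmap_read_lock() before reaping. For context, a hedged userspace caller sketch; the __NR_* fallback numbers are assumptions from the generic syscall table and should be checked against your headers:

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

#ifndef __NR_pidfd_open
#define __NR_pidfd_open 434        /* assumption: generic syscall table number */
#endif
#ifndef __NR_process_mrelease
#define __NR_process_mrelease 448  /* assumption: generic syscall table number */
#endif

/* Kill @pid, then ask the kernel to reap its address space right away. */
int kill_and_release(pid_t pid)
{
	int pidfd = syscall(__NR_pidfd_open, pid, 0);
	if (pidfd < 0)
		return -1;

	kill(pid, SIGKILL);

	/* flags must be 0; fails with EINVAL unless the task is already dying */
	if (syscall(__NR_process_mrelease, pidfd, 0) < 0)
		perror("process_mrelease");

	close(pidfd);
	return 0;
}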
@ -562,12 +562,12 @@ static unsigned long wp_next_time(unsigned long cur_time)
|
||||
return cur_time;
|
||||
}
|
||||
|
||||
static void wb_domain_writeout_inc(struct wb_domain *dom,
|
||||
static void wb_domain_writeout_add(struct wb_domain *dom,
|
||||
struct fprop_local_percpu *completions,
|
||||
unsigned int max_prop_frac)
|
||||
unsigned int max_prop_frac, long nr)
|
||||
{
|
||||
__fprop_inc_percpu_max(&dom->completions, completions,
|
||||
max_prop_frac);
|
||||
__fprop_add_percpu_max(&dom->completions, completions,
|
||||
max_prop_frac, nr);
|
||||
/* First event after period switching was turned off? */
|
||||
if (unlikely(!dom->period_time)) {
|
||||
/*
|
||||
@ -583,20 +583,20 @@ static void wb_domain_writeout_inc(struct wb_domain *dom,
|
||||
|
||||
/*
|
||||
* Increment @wb's writeout completion count and the global writeout
|
||||
* completion count. Called from test_clear_page_writeback().
|
||||
* completion count. Called from __folio_end_writeback().
|
||||
*/
|
||||
static inline void __wb_writeout_inc(struct bdi_writeback *wb)
|
||||
static inline void __wb_writeout_add(struct bdi_writeback *wb, long nr)
|
||||
{
|
||||
struct wb_domain *cgdom;
|
||||
|
||||
inc_wb_stat(wb, WB_WRITTEN);
|
||||
wb_domain_writeout_inc(&global_wb_domain, &wb->completions,
|
||||
wb->bdi->max_prop_frac);
|
||||
wb_stat_mod(wb, WB_WRITTEN, nr);
|
||||
wb_domain_writeout_add(&global_wb_domain, &wb->completions,
|
||||
wb->bdi->max_prop_frac, nr);
|
||||
|
||||
cgdom = mem_cgroup_wb_domain(wb);
|
||||
if (cgdom)
|
||||
wb_domain_writeout_inc(cgdom, wb_memcg_completions(wb),
|
||||
wb->bdi->max_prop_frac);
|
||||
wb_domain_writeout_add(cgdom, wb_memcg_completions(wb),
|
||||
wb->bdi->max_prop_frac, nr);
|
||||
}
|
||||
|
||||
void wb_writeout_inc(struct bdi_writeback *wb)
|
||||
@ -604,7 +604,7 @@ void wb_writeout_inc(struct bdi_writeback *wb)
|
||||
unsigned long flags;
|
||||
|
||||
local_irq_save(flags);
|
||||
__wb_writeout_inc(wb);
|
||||
__wb_writeout_add(wb, 1);
|
||||
local_irq_restore(flags);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(wb_writeout_inc);
|
||||
@ -1084,7 +1084,7 @@ static void wb_update_write_bandwidth(struct bdi_writeback *wb,
|
||||
* write_bandwidth = ---------------------------------------------------
|
||||
* period
|
||||
*
|
||||
* @written may have decreased due to account_page_redirty().
|
||||
* @written may have decreased due to folio_account_redirty().
|
||||
* Avoid underflowing @bw calculation.
|
||||
*/
|
||||
bw = written - min(written, wb->written_stamp);
|
||||
@ -2366,8 +2366,15 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
|
||||
ret = generic_writepages(mapping, wbc);
|
||||
if ((ret != -ENOMEM) || (wbc->sync_mode != WB_SYNC_ALL))
|
||||
break;
|
||||
cond_resched();
|
||||
congestion_wait(BLK_RW_ASYNC, HZ/50);
|
||||
|
||||
/*
|
||||
* Lacking an allocation context or the locality or writeback
|
||||
* state of any of the inode's pages, throttle based on
|
||||
* writeback activity on the local node. It's as good a
|
||||
* guess as any.
|
||||
*/
|
||||
reclaim_throttle(NODE_DATA(numa_node_id()),
|
||||
VMSCAN_THROTTLE_WRITEBACK);
|
||||
}
|
||||
/*
|
||||
* Usually few pages are written by now from those we've just submitted
|
||||
@ -2381,44 +2388,44 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
|
||||
}
|
||||
|
||||
/**
|
||||
* write_one_page - write out a single page and wait on I/O
|
||||
* @page: the page to write
|
||||
* folio_write_one - write out a single folio and wait on I/O.
|
||||
* @folio: The folio to write.
|
||||
*
|
||||
* The page must be locked by the caller and will be unlocked upon return.
|
||||
* The folio must be locked by the caller and will be unlocked upon return.
|
||||
*
|
||||
* Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this
|
||||
* function returns.
|
||||
*
|
||||
* Return: %0 on success, negative error code otherwise
|
||||
*/
|
||||
int write_one_page(struct page *page)
|
||||
int folio_write_one(struct folio *folio)
|
||||
{
|
||||
struct address_space *mapping = page->mapping;
|
||||
struct address_space *mapping = folio->mapping;
|
||||
int ret = 0;
|
||||
struct writeback_control wbc = {
|
||||
.sync_mode = WB_SYNC_ALL,
|
||||
.nr_to_write = 1,
|
||||
.nr_to_write = folio_nr_pages(folio),
|
||||
};
|
||||
|
||||
BUG_ON(!PageLocked(page));
|
||||
BUG_ON(!folio_test_locked(folio));
|
||||
|
||||
wait_on_page_writeback(page);
|
||||
folio_wait_writeback(folio);
|
||||
|
||||
if (clear_page_dirty_for_io(page)) {
|
||||
get_page(page);
|
||||
ret = mapping->a_ops->writepage(page, &wbc);
|
||||
if (folio_clear_dirty_for_io(folio)) {
|
||||
folio_get(folio);
|
||||
ret = mapping->a_ops->writepage(&folio->page, &wbc);
|
||||
if (ret == 0)
|
||||
wait_on_page_writeback(page);
|
||||
put_page(page);
|
||||
folio_wait_writeback(folio);
|
||||
folio_put(folio);
|
||||
} else {
|
||||
unlock_page(page);
|
||||
folio_unlock(folio);
|
||||
}
|
||||
|
||||
if (!ret)
|
||||
ret = filemap_check_errors(mapping);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(write_one_page);
|
||||
EXPORT_SYMBOL(folio_write_one);
|
||||
|
||||
/*
|
||||
* For address_spaces which do not use buffers nor write back.
|
||||
@ -2438,29 +2445,30 @@ EXPORT_SYMBOL(__set_page_dirty_no_writeback);
|
||||
*
|
||||
* NOTE: This relies on being atomic wrt interrupts.
|
||||
*/
|
||||
static void account_page_dirtied(struct page *page,
|
||||
static void folio_account_dirtied(struct folio *folio,
|
||||
struct address_space *mapping)
|
||||
{
|
||||
struct inode *inode = mapping->host;
|
||||
|
||||
trace_writeback_dirty_page(page, mapping);
|
||||
trace_writeback_dirty_folio(folio, mapping);
|
||||
|
||||
if (mapping_can_writeback(mapping)) {
|
||||
struct bdi_writeback *wb;
|
||||
long nr = folio_nr_pages(folio);
|
||||
|
||||
inode_attach_wb(inode, page);
|
||||
inode_attach_wb(inode, &folio->page);
|
||||
wb = inode_to_wb(inode);
|
||||
|
||||
__inc_lruvec_page_state(page, NR_FILE_DIRTY);
|
||||
__inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
|
||||
__inc_node_page_state(page, NR_DIRTIED);
|
||||
inc_wb_stat(wb, WB_RECLAIMABLE);
|
||||
inc_wb_stat(wb, WB_DIRTIED);
|
||||
task_io_account_write(PAGE_SIZE);
|
||||
current->nr_dirtied++;
|
||||
__this_cpu_inc(bdp_ratelimits);
|
||||
__lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, nr);
|
||||
__zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr);
|
||||
__node_stat_mod_folio(folio, NR_DIRTIED, nr);
|
||||
wb_stat_mod(wb, WB_RECLAIMABLE, nr);
|
||||
wb_stat_mod(wb, WB_DIRTIED, nr);
|
||||
task_io_account_write(nr * PAGE_SIZE);
|
||||
current->nr_dirtied += nr;
|
||||
__this_cpu_add(bdp_ratelimits, nr);
|
||||
|
||||
mem_cgroup_track_foreign_dirty(page, wb);
|
||||
mem_cgroup_track_foreign_dirty(folio, wb);
|
||||
}
|
||||
}
|
||||
|
||||
@ -2469,130 +2477,156 @@ static void account_page_dirtied(struct page *page,
|
||||
*
|
||||
* Caller must hold lock_page_memcg().
|
||||
*/
|
||||
void account_page_cleaned(struct page *page, struct address_space *mapping,
|
||||
void folio_account_cleaned(struct folio *folio, struct address_space *mapping,
|
||||
struct bdi_writeback *wb)
|
||||
{
|
||||
if (mapping_can_writeback(mapping)) {
|
||||
dec_lruvec_page_state(page, NR_FILE_DIRTY);
|
||||
dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
|
||||
dec_wb_stat(wb, WB_RECLAIMABLE);
|
||||
task_io_account_cancelled_write(PAGE_SIZE);
|
||||
long nr = folio_nr_pages(folio);
|
||||
lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr);
|
||||
zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
|
||||
wb_stat_mod(wb, WB_RECLAIMABLE, -nr);
|
||||
task_io_account_cancelled_write(nr * PAGE_SIZE);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Mark the page dirty, and set it dirty in the page cache, and mark the inode
|
||||
* dirty.
|
||||
* Mark the folio dirty, and set it dirty in the page cache, and mark
|
||||
* the inode dirty.
|
||||
*
|
||||
* If warn is true, then emit a warning if the page is not uptodate and has
|
||||
* If warn is true, then emit a warning if the folio is not uptodate and has
|
||||
* not been truncated.
|
||||
*
|
||||
* The caller must hold lock_page_memcg().
|
||||
* The caller must hold lock_page_memcg(). Most callers have the folio
|
||||
* locked. A few have the folio blocked from truncation through other
|
||||
* means (eg zap_page_range() has it mapped and is holding the page table
|
||||
* lock). This can also be called from mark_buffer_dirty(), which I
|
||||
* cannot prove is always protected against truncate.
|
||||
*/
|
||||
void __set_page_dirty(struct page *page, struct address_space *mapping,
|
||||
void __folio_mark_dirty(struct folio *folio, struct address_space *mapping,
|
||||
int warn)
|
||||
{
|
||||
unsigned long flags;
|
||||
|
||||
xa_lock_irqsave(&mapping->i_pages, flags);
|
||||
if (page->mapping) { /* Race with truncate? */
|
||||
WARN_ON_ONCE(warn && !PageUptodate(page));
|
||||
account_page_dirtied(page, mapping);
|
||||
__xa_set_mark(&mapping->i_pages, page_index(page),
|
||||
if (folio->mapping) { /* Race with truncate? */
|
||||
WARN_ON_ONCE(warn && !folio_test_uptodate(folio));
|
||||
folio_account_dirtied(folio, mapping);
|
||||
__xa_set_mark(&mapping->i_pages, folio_index(folio),
|
||||
PAGECACHE_TAG_DIRTY);
|
||||
}
|
||||
xa_unlock_irqrestore(&mapping->i_pages, flags);
|
||||
}
|
||||
|
||||
/*
|
||||
* For address_spaces which do not use buffers. Just tag the page as dirty in
|
||||
* the xarray.
|
||||
/**
|
||||
* filemap_dirty_folio - Mark a folio dirty for filesystems which do not use buffer_heads.
|
||||
* @mapping: Address space this folio belongs to.
|
||||
* @folio: Folio to be marked as dirty.
|
||||
*
|
||||
* This is also used when a single buffer is being dirtied: we want to set the
|
||||
* page dirty in that case, but not all the buffers. This is a "bottom-up"
|
||||
* dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
|
||||
* Filesystems which do not use buffer heads should call this function
|
||||
* from their set_page_dirty address space operation. It ignores the
|
||||
* contents of folio_get_private(), so if the filesystem marks individual
|
||||
* blocks as dirty, the filesystem should handle that itself.
|
||||
*
|
||||
* The caller must ensure this doesn't race with truncation. Most will simply
|
||||
* hold the page lock, but e.g. zap_pte_range() calls with the page mapped and
|
||||
* the pte lock held, which also locks out truncation.
|
||||
* This is also sometimes used by filesystems which use buffer_heads when
|
||||
* a single buffer is being dirtied: we want to set the folio dirty in
|
||||
* that case, but not all the buffers. This is a "bottom-up" dirtying,
|
||||
* whereas __set_page_dirty_buffers() is a "top-down" dirtying.
|
||||
*
|
||||
* The caller must ensure this doesn't race with truncation. Most will
|
||||
* simply hold the folio lock, but e.g. zap_pte_range() calls with the
|
||||
* folio mapped and the pte lock held, which also locks out truncation.
|
||||
*/
|
||||
int __set_page_dirty_nobuffers(struct page *page)
|
||||
bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio)
|
||||
{
|
||||
lock_page_memcg(page);
|
||||
if (!TestSetPageDirty(page)) {
|
||||
struct address_space *mapping = page_mapping(page);
|
||||
|
||||
if (!mapping) {
|
||||
unlock_page_memcg(page);
|
||||
return 1;
|
||||
}
|
||||
__set_page_dirty(page, mapping, !PagePrivate(page));
|
||||
unlock_page_memcg(page);
|
||||
|
||||
if (mapping->host) {
|
||||
/* !PageAnon && !swapper_space */
|
||||
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
|
||||
}
|
||||
return 1;
|
||||
folio_memcg_lock(folio);
|
||||
if (folio_test_set_dirty(folio)) {
|
||||
folio_memcg_unlock(folio);
|
||||
return false;
|
||||
}
|
||||
unlock_page_memcg(page);
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(__set_page_dirty_nobuffers);
|
||||
|
||||
/*
|
||||
* Call this whenever redirtying a page, to de-account the dirty counters
|
||||
* (NR_DIRTIED, WB_DIRTIED, tsk->nr_dirtied), so that they match the written
|
||||
* counters (NR_WRITTEN, WB_WRITTEN) in long term. The mismatches will lead to
|
||||
* systematic errors in balanced_dirty_ratelimit and the dirty pages position
|
||||
* control.
|
||||
__folio_mark_dirty(folio, mapping, !folio_test_private(folio));
|
||||
folio_memcg_unlock(folio);
|
||||
|
||||
if (mapping->host) {
|
||||
/* !PageAnon && !swapper_space */
|
||||
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
EXPORT_SYMBOL(filemap_dirty_folio);
|
||||
|
||||
/**
|
||||
* folio_account_redirty - Manually account for redirtying a page.
|
||||
* @folio: The folio which is being redirtied.
|
||||
*
|
||||
* Most filesystems should call folio_redirty_for_writepage() instead
|
||||
* of this fuction. If your filesystem is doing writeback outside the
|
||||
* context of a writeback_control(), it can call this when redirtying
|
||||
* a folio, to de-account the dirty counters (NR_DIRTIED, WB_DIRTIED,
|
||||
* tsk->nr_dirtied), so that they match the written counters (NR_WRITTEN,
|
||||
* WB_WRITTEN) in long term. The mismatches will lead to systematic errors
|
||||
* in balanced_dirty_ratelimit and the dirty pages position control.
|
||||
*/
|
||||
void account_page_redirty(struct page *page)
|
||||
void folio_account_redirty(struct folio *folio)
|
||||
{
|
||||
struct address_space *mapping = page->mapping;
|
||||
struct address_space *mapping = folio->mapping;
|
||||
|
||||
if (mapping && mapping_can_writeback(mapping)) {
|
||||
struct inode *inode = mapping->host;
|
||||
struct bdi_writeback *wb;
|
||||
struct wb_lock_cookie cookie = {};
|
||||
long nr = folio_nr_pages(folio);
|
||||
|
||||
wb = unlocked_inode_to_wb_begin(inode, &cookie);
|
||||
current->nr_dirtied--;
|
||||
dec_node_page_state(page, NR_DIRTIED);
|
||||
dec_wb_stat(wb, WB_DIRTIED);
|
||||
current->nr_dirtied -= nr;
|
||||
node_stat_mod_folio(folio, NR_DIRTIED, -nr);
|
||||
wb_stat_mod(wb, WB_DIRTIED, -nr);
|
||||
unlocked_inode_to_wb_end(inode, &cookie);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(account_page_redirty);
|
||||
EXPORT_SYMBOL(folio_account_redirty);
|
||||
|
||||
/*
|
||||
* When a writepage implementation decides that it doesn't want to write this
|
||||
* page for some reason, it should redirty the locked page via
|
||||
* redirty_page_for_writepage() and it should then unlock the page and return 0
|
||||
/**
|
||||
* folio_redirty_for_writepage - Decline to write a dirty folio.
|
||||
* @wbc: The writeback control.
|
||||
* @folio: The folio.
|
||||
*
|
||||
* When a writepage implementation decides that it doesn't want to write
|
||||
* @folio for some reason, it should call this function, unlock @folio and
|
||||
* return 0.
|
||||
*
|
||||
* Return: True if we redirtied the folio. False if someone else dirtied
|
||||
* it first.
|
||||
*/
|
||||
int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
|
||||
bool folio_redirty_for_writepage(struct writeback_control *wbc,
|
||||
struct folio *folio)
|
||||
{
|
||||
int ret;
|
||||
bool ret;
|
||||
long nr = folio_nr_pages(folio);
|
||||
|
||||
wbc->pages_skipped += nr;
|
||||
ret = filemap_dirty_folio(folio->mapping, folio);
|
||||
folio_account_redirty(folio);
|
||||
|
||||
wbc->pages_skipped++;
|
||||
ret = __set_page_dirty_nobuffers(page);
|
||||
account_page_redirty(page);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(redirty_page_for_writepage);
|
||||
EXPORT_SYMBOL(folio_redirty_for_writepage);
|
||||
|
||||
/*
|
||||
* Dirty a page.
|
||||
/**
|
||||
* folio_mark_dirty - Mark a folio as being modified.
|
||||
* @folio: The folio.
|
||||
*
|
||||
* For pages with a mapping this should be done under the page lock for the
|
||||
* benefit of asynchronous memory errors who prefer a consistent dirty state.
|
||||
* This rule can be broken in some special cases, but should be better not to.
|
||||
* For folios with a mapping this should be done under the page lock
|
||||
* for the benefit of asynchronous memory errors who prefer a consistent
|
||||
* dirty state. This rule can be broken in some special cases,
|
||||
* but should be better not to.
|
||||
*
|
||||
* Return: True if the folio was newly dirtied, false if it was already dirty.
|
||||
*/
|
||||
int set_page_dirty(struct page *page)
|
||||
bool folio_mark_dirty(struct folio *folio)
|
||||
{
|
||||
struct address_space *mapping = page_mapping(page);
|
||||
struct address_space *mapping = folio_mapping(folio);
|
||||
|
||||
page = compound_head(page);
|
||||
if (likely(mapping)) {
|
||||
/*
|
||||
* readahead/lru_deactivate_page could remain
|
||||
@ -2604,17 +2638,17 @@ int set_page_dirty(struct page *page)
|
||||
* it will confuse readahead and make it restart the size rampup
|
||||
* process. But it's a trivial problem.
|
||||
*/
|
||||
if (PageReclaim(page))
|
||||
ClearPageReclaim(page);
|
||||
return mapping->a_ops->set_page_dirty(page);
|
||||
if (folio_test_reclaim(folio))
|
||||
folio_clear_reclaim(folio);
|
||||
return mapping->a_ops->set_page_dirty(&folio->page);
|
||||
}
|
||||
if (!PageDirty(page)) {
|
||||
if (!TestSetPageDirty(page))
|
||||
return 1;
|
||||
if (!folio_test_dirty(folio)) {
|
||||
if (!folio_test_set_dirty(folio))
|
||||
return true;
|
||||
}
|
||||
return 0;
|
||||
return false;
|
||||
}
|
||||
EXPORT_SYMBOL(set_page_dirty);
|
||||
EXPORT_SYMBOL(folio_mark_dirty);
|
||||
|
||||
/*
|
||||
* set_page_dirty() is racy if the caller has no reference against
|
||||
@ -2650,49 +2684,49 @@ EXPORT_SYMBOL(set_page_dirty_lock);
|
||||
* page without actually doing it through the VM. Can you say "ext3 is
|
||||
* horribly ugly"? Thought you could.
|
||||
*/
|
||||
void __cancel_dirty_page(struct page *page)
|
||||
void __folio_cancel_dirty(struct folio *folio)
|
||||
{
|
||||
struct address_space *mapping = page_mapping(page);
|
||||
struct address_space *mapping = folio_mapping(folio);
|
||||
|
||||
if (mapping_can_writeback(mapping)) {
|
||||
struct inode *inode = mapping->host;
|
||||
struct bdi_writeback *wb;
|
||||
struct wb_lock_cookie cookie = {};
|
||||
|
||||
lock_page_memcg(page);
|
||||
folio_memcg_lock(folio);
|
||||
wb = unlocked_inode_to_wb_begin(inode, &cookie);
|
||||
|
||||
if (TestClearPageDirty(page))
|
||||
account_page_cleaned(page, mapping, wb);
|
||||
if (folio_test_clear_dirty(folio))
|
||||
folio_account_cleaned(folio, mapping, wb);
|
||||
|
||||
unlocked_inode_to_wb_end(inode, &cookie);
|
||||
unlock_page_memcg(page);
|
||||
folio_memcg_unlock(folio);
|
||||
} else {
|
||||
ClearPageDirty(page);
|
||||
folio_clear_dirty(folio);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(__cancel_dirty_page);
|
||||
EXPORT_SYMBOL(__folio_cancel_dirty);
|
||||
|
||||
/*
|
||||
* Clear a page's dirty flag, while caring for dirty memory accounting.
|
||||
* Returns true if the page was previously dirty.
|
||||
* Clear a folio's dirty flag, while caring for dirty memory accounting.
|
||||
* Returns true if the folio was previously dirty.
|
||||
*
|
||||
* This is for preparing to put the page under writeout. We leave the page
|
||||
* tagged as dirty in the xarray so that a concurrent write-for-sync
|
||||
* can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage
|
||||
* implementation will run either set_page_writeback() or set_page_dirty(),
|
||||
* at which stage we bring the page's dirty flag and xarray dirty tag
|
||||
* back into sync.
|
||||
* This is for preparing to put the folio under writeout. We leave
|
||||
* the folio tagged as dirty in the xarray so that a concurrent
|
||||
* write-for-sync can discover it via a PAGECACHE_TAG_DIRTY walk.
|
||||
* The ->writepage implementation will run either folio_start_writeback()
|
||||
* or folio_mark_dirty(), at which stage we bring the folio's dirty flag
|
||||
* and xarray dirty tag back into sync.
|
||||
*
|
||||
* This incoherency between the page's dirty flag and xarray tag is
|
||||
* unfortunate, but it only exists while the page is locked.
|
||||
* This incoherency between the folio's dirty flag and xarray tag is
|
||||
* unfortunate, but it only exists while the folio is locked.
|
||||
*/
|
||||
int clear_page_dirty_for_io(struct page *page)
|
||||
bool folio_clear_dirty_for_io(struct folio *folio)
|
||||
{
|
||||
struct address_space *mapping = page_mapping(page);
|
||||
int ret = 0;
|
||||
struct address_space *mapping = folio_mapping(folio);
|
||||
bool ret = false;
|
||||
|
||||
VM_BUG_ON_PAGE(!PageLocked(page), page);
|
||||
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
|
||||
|
||||
if (mapping && mapping_can_writeback(mapping)) {
|
||||
struct inode *inode = mapping->host;
|
||||
@ -2705,48 +2739,49 @@ int clear_page_dirty_for_io(struct page *page)
|
||||
* We use this sequence to make sure that
|
||||
* (a) we account for dirty stats properly
|
||||
* (b) we tell the low-level filesystem to
|
||||
* mark the whole page dirty if it was
|
||||
* mark the whole folio dirty if it was
|
||||
* dirty in a pagetable. Only to then
|
||||
* (c) clean the page again and return 1 to
|
||||
* (c) clean the folio again and return 1 to
|
||||
* cause the writeback.
|
||||
*
|
||||
* This way we avoid all nasty races with the
|
||||
* dirty bit in multiple places and clearing
|
||||
* them concurrently from different threads.
|
||||
*
|
||||
* Note! Normally the "set_page_dirty(page)"
|
||||
* Note! Normally the "folio_mark_dirty(folio)"
|
||||
* has no effect on the actual dirty bit - since
|
||||
* that will already usually be set. But we
|
||||
* need the side effects, and it can help us
|
||||
* avoid races.
|
||||
*
|
||||
* We basically use the page "master dirty bit"
|
||||
* We basically use the folio "master dirty bit"
|
||||
* as a serialization point for all the different
|
||||
* threads doing their things.
|
||||
*/
|
||||
if (page_mkclean(page))
|
||||
set_page_dirty(page);
|
||||
if (folio_mkclean(folio))
|
||||
folio_mark_dirty(folio);
|
||||
/*
|
||||
* We carefully synchronise fault handlers against
|
||||
* installing a dirty pte and marking the page dirty
|
||||
* installing a dirty pte and marking the folio dirty
|
||||
* at this point. We do this by having them hold the
|
||||
* page lock while dirtying the page, and pages are
|
||||
* page lock while dirtying the folio, and folios are
|
||||
* always locked coming in here, so we get the desired
|
||||
* exclusion.
|
||||
*/
|
||||
wb = unlocked_inode_to_wb_begin(inode, &cookie);
|
||||
if (TestClearPageDirty(page)) {
|
||||
dec_lruvec_page_state(page, NR_FILE_DIRTY);
|
||||
dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
|
||||
dec_wb_stat(wb, WB_RECLAIMABLE);
|
||||
ret = 1;
|
||||
if (folio_test_clear_dirty(folio)) {
|
||||
long nr = folio_nr_pages(folio);
|
||||
lruvec_stat_mod_folio(folio, NR_FILE_DIRTY, -nr);
|
||||
zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
|
||||
wb_stat_mod(wb, WB_RECLAIMABLE, -nr);
|
||||
ret = true;
|
||||
}
|
||||
unlocked_inode_to_wb_end(inode, &cookie);
|
||||
return ret;
|
||||
}
|
||||
return TestClearPageDirty(page);
|
||||
return folio_test_clear_dirty(folio);
|
||||
}
|
||||
EXPORT_SYMBOL(clear_page_dirty_for_io);
|
||||
EXPORT_SYMBOL(folio_clear_dirty_for_io);
|
||||
|
||||
static void wb_inode_writeback_start(struct bdi_writeback *wb)
|
||||
{
|
||||
@ -2766,27 +2801,28 @@ static void wb_inode_writeback_end(struct bdi_writeback *wb)
|
||||
queue_delayed_work(bdi_wq, &wb->bw_dwork, BANDWIDTH_INTERVAL);
|
||||
}
|
||||
|
||||
int test_clear_page_writeback(struct page *page)
|
||||
bool __folio_end_writeback(struct folio *folio)
|
||||
{
|
||||
struct address_space *mapping = page_mapping(page);
|
||||
int ret;
|
||||
long nr = folio_nr_pages(folio);
|
||||
struct address_space *mapping = folio_mapping(folio);
|
||||
bool ret;
|
||||
|
||||
lock_page_memcg(page);
|
||||
folio_memcg_lock(folio);
|
||||
if (mapping && mapping_use_writeback_tags(mapping)) {
|
||||
struct inode *inode = mapping->host;
|
||||
struct backing_dev_info *bdi = inode_to_bdi(inode);
|
||||
unsigned long flags;
|
||||
|
||||
xa_lock_irqsave(&mapping->i_pages, flags);
|
||||
ret = TestClearPageWriteback(page);
|
||||
ret = folio_test_clear_writeback(folio);
|
||||
if (ret) {
|
||||
__xa_clear_mark(&mapping->i_pages, page_index(page),
|
||||
__xa_clear_mark(&mapping->i_pages, folio_index(folio),
|
||||
PAGECACHE_TAG_WRITEBACK);
|
||||
if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
|
||||
struct bdi_writeback *wb = inode_to_wb(inode);
|
||||
|
||||
dec_wb_stat(wb, WB_WRITEBACK);
|
||||
__wb_writeout_inc(wb);
|
||||
wb_stat_mod(wb, WB_WRITEBACK, -nr);
|
||||
__wb_writeout_add(wb, nr);
|
||||
if (!mapping_tagged(mapping,
|
||||
PAGECACHE_TAG_WRITEBACK))
|
||||
wb_inode_writeback_end(wb);
|
||||
@ -2799,32 +2835,34 @@ int test_clear_page_writeback(struct page *page)
|
||||
|
||||
xa_unlock_irqrestore(&mapping->i_pages, flags);
|
||||
} else {
|
||||
ret = TestClearPageWriteback(page);
|
||||
ret = folio_test_clear_writeback(folio);
|
||||
}
|
||||
if (ret) {
|
||||
dec_lruvec_page_state(page, NR_WRITEBACK);
|
||||
dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
|
||||
inc_node_page_state(page, NR_WRITTEN);
|
||||
lruvec_stat_mod_folio(folio, NR_WRITEBACK, -nr);
|
||||
zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr);
|
||||
node_stat_mod_folio(folio, NR_WRITTEN, nr);
|
||||
}
|
||||
unlock_page_memcg(page);
|
||||
folio_memcg_unlock(folio);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int __test_set_page_writeback(struct page *page, bool keep_write)
|
||||
bool __folio_start_writeback(struct folio *folio, bool keep_write)
|
||||
{
|
||||
struct address_space *mapping = page_mapping(page);
|
||||
int ret, access_ret;
|
||||
long nr = folio_nr_pages(folio);
|
||||
struct address_space *mapping = folio_mapping(folio);
|
||||
bool ret;
|
||||
int access_ret;
|
||||
|
||||
lock_page_memcg(page);
|
||||
folio_memcg_lock(folio);
|
||||
if (mapping && mapping_use_writeback_tags(mapping)) {
|
||||
XA_STATE(xas, &mapping->i_pages, page_index(page));
|
||||
XA_STATE(xas, &mapping->i_pages, folio_index(folio));
|
||||
struct inode *inode = mapping->host;
|
||||
struct backing_dev_info *bdi = inode_to_bdi(inode);
|
||||
unsigned long flags;
|
||||
|
||||
xas_lock_irqsave(&xas, flags);
|
||||
xas_load(&xas);
|
||||
ret = TestSetPageWriteback(page);
|
||||
ret = folio_test_set_writeback(folio);
|
||||
if (!ret) {
|
||||
bool on_wblist;
|
||||
|
||||
@ -2835,84 +2873,105 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
|
||||
if (bdi->capabilities & BDI_CAP_WRITEBACK_ACCT) {
|
||||
struct bdi_writeback *wb = inode_to_wb(inode);
|
||||
|
||||
inc_wb_stat(wb, WB_WRITEBACK);
|
||||
wb_stat_mod(wb, WB_WRITEBACK, nr);
|
||||
if (!on_wblist)
|
||||
wb_inode_writeback_start(wb);
|
||||
}
|
||||
|
||||
/*
|
||||
* We can come through here when swapping anonymous
|
||||
* pages, so we don't necessarily have an inode to track
|
||||
* for sync.
|
||||
* We can come through here when swapping
|
||||
* anonymous folios, so we don't necessarily
|
||||
* have an inode to track for sync.
|
||||
*/
|
||||
if (mapping->host && !on_wblist)
|
||||
sb_mark_inode_writeback(mapping->host);
|
||||
}
|
||||
if (!PageDirty(page))
|
||||
if (!folio_test_dirty(folio))
|
||||
xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
|
||||
if (!keep_write)
|
||||
xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
|
||||
xas_unlock_irqrestore(&xas, flags);
|
||||
} else {
|
||||
ret = TestSetPageWriteback(page);
|
||||
ret = folio_test_set_writeback(folio);
|
||||
}
|
||||
if (!ret) {
|
||||
inc_lruvec_page_state(page, NR_WRITEBACK);
|
||||
inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
|
||||
lruvec_stat_mod_folio(folio, NR_WRITEBACK, nr);
|
||||
zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr);
|
||||
}
|
||||
unlock_page_memcg(page);
|
||||
access_ret = arch_make_page_accessible(page);
|
||||
folio_memcg_unlock(folio);
|
||||
access_ret = arch_make_folio_accessible(folio);
|
||||
/*
|
||||
* If writeback has been triggered on a page that cannot be made
|
||||
* accessible, it is too late to recover here.
|
||||
*/
|
||||
VM_BUG_ON_PAGE(access_ret != 0, page);
|
||||
VM_BUG_ON_FOLIO(access_ret != 0, folio);
|
||||
|
||||
return ret;
|
||||
|
||||
}
|
||||
EXPORT_SYMBOL(__test_set_page_writeback);
|
||||
EXPORT_SYMBOL(__folio_start_writeback);
|
||||
|
||||
/*
|
||||
* Wait for a page to complete writeback
|
||||
/**
|
||||
* folio_wait_writeback - Wait for a folio to finish writeback.
|
||||
* @folio: The folio to wait for.
|
||||
*
|
||||
* If the folio is currently being written back to storage, wait for the
|
||||
* I/O to complete.
|
||||
*
|
||||
* Context: Sleeps. Must be called in process context and with
|
||||
* no spinlocks held. Caller should hold a reference on the folio.
|
||||
* If the folio is not locked, writeback may start again after writeback
|
||||
* has finished.
|
||||
*/
|
||||
void wait_on_page_writeback(struct page *page)
|
||||
void folio_wait_writeback(struct folio *folio)
|
||||
{
|
||||
while (PageWriteback(page)) {
|
||||
trace_wait_on_page_writeback(page, page_mapping(page));
|
||||
wait_on_page_bit(page, PG_writeback);
|
||||
while (folio_test_writeback(folio)) {
|
||||
trace_folio_wait_writeback(folio, folio_mapping(folio));
|
||||
folio_wait_bit(folio, PG_writeback);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(wait_on_page_writeback);
|
||||
EXPORT_SYMBOL_GPL(folio_wait_writeback);
|
||||
|
||||
/*
|
||||
* Wait for a page to complete writeback. Returns -EINTR if we get a
|
||||
* fatal signal while waiting.
|
||||
/**
|
||||
* folio_wait_writeback_killable - Wait for a folio to finish writeback.
|
||||
* @folio: The folio to wait for.
|
||||
*
|
||||
* If the folio is currently being written back to storage, wait for the
|
||||
* I/O to complete or a fatal signal to arrive.
|
||||
*
|
||||
* Context: Sleeps. Must be called in process context and with
|
||||
* no spinlocks held. Caller should hold a reference on the folio.
|
||||
* If the folio is not locked, writeback may start again after writeback
|
||||
* has finished.
|
||||
* Return: 0 on success, -EINTR if we get a fatal signal while waiting.
|
||||
*/
|
||||
int wait_on_page_writeback_killable(struct page *page)
|
||||
int folio_wait_writeback_killable(struct folio *folio)
|
||||
{
|
||||
while (PageWriteback(page)) {
|
||||
trace_wait_on_page_writeback(page, page_mapping(page));
|
||||
if (wait_on_page_bit_killable(page, PG_writeback))
|
||||
while (folio_test_writeback(folio)) {
|
||||
trace_folio_wait_writeback(folio, folio_mapping(folio));
|
||||
if (folio_wait_bit_killable(folio, PG_writeback))
|
||||
return -EINTR;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(wait_on_page_writeback_killable);
|
||||
EXPORT_SYMBOL_GPL(folio_wait_writeback_killable);
|
||||
|
||||
/**
|
||||
* wait_for_stable_page() - wait for writeback to finish, if necessary.
|
||||
* @page: The page to wait on.
|
||||
* folio_wait_stable() - wait for writeback to finish, if necessary.
|
||||
* @folio: The folio to wait on.
|
||||
*
|
||||
* This function determines if the given page is related to a backing device
|
||||
* that requires page contents to be held stable during writeback. If so, then
|
||||
* it will wait for any pending writeback to complete.
|
||||
* This function determines if the given folio is related to a backing
|
||||
* device that requires folio contents to be held stable during writeback.
|
||||
* If so, then it will wait for any pending writeback to complete.
|
||||
*
|
||||
* Context: Sleeps. Must be called in process context and with
|
||||
* no spinlocks held. Caller should hold a reference on the folio.
|
||||
* If the folio is not locked, writeback may start again after writeback
|
||||
* has finished.
|
||||
*/
|
||||
void wait_for_stable_page(struct page *page)
|
||||
void folio_wait_stable(struct folio *folio)
|
||||
{
|
||||
page = thp_head(page);
|
||||
if (page->mapping->host->i_sb->s_iflags & SB_I_STABLE_WRITES)
|
||||
wait_on_page_writeback(page);
|
||||
if (folio_inode(folio)->i_sb->s_iflags & SB_I_STABLE_WRITES)
|
||||
folio_wait_writeback(folio);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(wait_for_stable_page);
|
||||
EXPORT_SYMBOL_GPL(folio_wait_stable);
|
||||
|
339	mm/page_alloc.c
@ -19,6 +19,7 @@
|
||||
#include <linux/mm.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/swapops.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/jiffies.h>
|
||||
@ -63,6 +64,7 @@
|
||||
#include <linux/sched/rt.h>
|
||||
#include <linux/sched/mm.h>
|
||||
#include <linux/page_owner.h>
|
||||
#include <linux/page_table_check.h>
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/memcontrol.h>
|
||||
#include <linux/ftrace.h>
|
||||
@ -72,6 +74,7 @@
|
||||
#include <linux/padata.h>
|
||||
#include <linux/khugepaged.h>
|
||||
#include <linux/buffer_head.h>
|
||||
#include <linux/delayacct.h>
|
||||
#include <asm/sections.h>
|
||||
#include <asm/tlbflush.h>
|
||||
#include <asm/div64.h>
|
||||
@ -190,6 +193,27 @@ EXPORT_SYMBOL(init_on_alloc);
|
||||
DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
|
||||
EXPORT_SYMBOL(init_on_free);
|
||||
|
||||
#define ALLOC_IN_CMA_THRESHOLD_MAX 16
|
||||
#define ALLOC_IN_CMA_THRESHOLD_DEFAULT 12
|
||||
|
||||
static unsigned long _alloc_in_cma_threshold __read_mostly
|
||||
= ALLOC_IN_CMA_THRESHOLD_DEFAULT;
|
||||
|
||||
static int __init alloc_in_cma_threshold_setup(char *buf)
|
||||
{
|
||||
unsigned long res;
|
||||
|
||||
if (kstrtoul(buf, 10, &res) < 0 ||
|
||||
res > ALLOC_IN_CMA_THRESHOLD_MAX) {
|
||||
pr_err("Bad alloc_cma_threshold value\n");
|
||||
return 0;
|
||||
}
|
||||
_alloc_in_cma_threshold = res;
|
||||
pr_info("Setting alloc_in_cma_threshold to %lu\n", res);
|
||||
return 0;
|
||||
}
|
||||
early_param("alloc_in_cma_threshold", alloc_in_cma_threshold_setup);
|
||||
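Note: the alloc_in_cma_threshold boot parameter above replaces the hard-coded half-of-free-memory rule in __rmqueue(): movable allocations fall back to CMA once free CMA pages exceed threshold/16 of the zone's free pages (default 12/16, i.e. 75%; booting with alloc_in_cma_threshold=8 restores behaviour close to the upstream rule). A sketch of the same predicate, for illustration only, with the zone accessors reduced to plain integers and all names here made up:

#include <stdbool.h>

#define ALLOC_IN_CMA_THRESHOLD_MAX 16

/* Mirrors the __rmqueue() heuristic: prefer CMA when it dominates free memory. */
static bool should_try_cma_first(unsigned long free_cma_pages,
				 unsigned long free_pages,
				 unsigned long threshold /* 0..16, default 12 */)
{
	return free_cma_pages >
	       free_pages / ALLOC_IN_CMA_THRESHOLD_MAX * threshold;
}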
|
||||
static bool _init_on_alloc_enabled_early __read_mostly
|
||||
= IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
|
||||
static int __init early_init_on_alloc(char *buf)
|
||||
@ -677,10 +701,8 @@ static inline int pindex_to_order(unsigned int pindex)
|
||||
int order = pindex / MIGRATE_PCPTYPES;
|
||||
|
||||
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||||
if (order > PAGE_ALLOC_COSTLY_ORDER) {
|
||||
if (order > PAGE_ALLOC_COSTLY_ORDER)
|
||||
order = pageblock_order;
|
||||
VM_BUG_ON(order != pageblock_order);
|
||||
}
|
||||
#else
|
||||
VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER);
|
||||
#endif
|
||||
@ -724,27 +746,37 @@ static inline void free_the_page(struct page *page, unsigned int order)
|
||||
|
||||
void free_compound_page(struct page *page)
|
||||
{
|
||||
mem_cgroup_uncharge(page);
|
||||
mem_cgroup_uncharge(page_folio(page));
|
||||
free_the_page(page, compound_order(page));
|
||||
}
|
||||
|
||||
static void prep_compound_head(struct page *page, unsigned int order)
|
||||
{
|
||||
set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
|
||||
set_compound_order(page, order);
|
||||
atomic_set(compound_mapcount_ptr(page), -1);
|
||||
if (hpage_pincount_available(page))
|
||||
atomic_set(compound_pincount_ptr(page), 0);
|
||||
}
|
||||
|
||||
static void prep_compound_tail(struct page *head, int tail_idx)
|
||||
{
|
||||
struct page *p = head + tail_idx;
|
||||
|
||||
p->mapping = TAIL_MAPPING;
|
||||
set_compound_head(p, head);
|
||||
}
|
||||
|
||||
void prep_compound_page(struct page *page, unsigned int order)
|
||||
{
|
||||
int i;
|
||||
int nr_pages = 1 << order;
|
||||
|
||||
__SetPageHead(page);
|
||||
for (i = 1; i < nr_pages; i++) {
|
||||
struct page *p = page + i;
|
||||
p->mapping = TAIL_MAPPING;
|
||||
set_compound_head(p, page);
|
||||
}
|
||||
for (i = 1; i < nr_pages; i++)
|
||||
prep_compound_tail(page, i);
|
||||
|
||||
set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
|
||||
set_compound_order(page, order);
|
||||
atomic_set(compound_mapcount_ptr(page), -1);
|
||||
if (hpage_pincount_available(page))
|
||||
atomic_set(compound_pincount_ptr(page), 0);
|
||||
prep_compound_head(page, order);
|
||||
}
|
||||
|
||||
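A simplified, self-contained model (not the kernel's struct page) of what the new prep_compound_head()/prep_compound_tail() split establishes: every tail page records its head, and only the head records the compound order.

/* Illustrative sketch only: head/tail wiring of a compound allocation. */
#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct fake_page {
	struct fake_page *compound_head;	/* NULL for the head itself */
	unsigned int compound_order;		/* meaningful only on the head */
};

static void prep_compound(struct fake_page *pages, unsigned int order)
{
	size_t nr = 1UL << order;

	for (size_t i = 1; i < nr; i++)
		pages[i].compound_head = &pages[0];	/* prep_compound_tail() role */
	pages[0].compound_head = NULL;			/* head */
	pages[0].compound_order = order;		/* prep_compound_head() role */
}

int main(void)
{
	struct fake_page pages[8] = { 0 };

	prep_compound(pages, 3);			/* 2^3 = 8 pages */
	assert(pages[5].compound_head == &pages[0]);
	printf("order %u\n", pages[0].compound_order);	/* order 3 */
	return 0;
}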
#ifdef CONFIG_DEBUG_PAGEALLOC
|
||||
@ -1299,6 +1331,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
|
||||
if (memcg_kmem_enabled() && PageMemcgKmem(page))
|
||||
__memcg_kmem_uncharge_page(page, order);
|
||||
reset_page_owner(page, order);
|
||||
page_table_check_free(page, order);
|
||||
return false;
|
||||
}
|
||||
|
||||
@ -1338,6 +1371,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
|
||||
page_cpupid_reset_last(page);
|
||||
page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
|
||||
reset_page_owner(page, order);
|
||||
page_table_check_free(page, order);
|
||||
|
||||
if (!PageHighMem(page)) {
|
||||
debug_check_no_locks_freed(page_address(page),
|
||||
@ -1430,14 +1464,8 @@ static inline void prefetch_buddy(struct page *page)
|
||||
|
||||
/*
|
||||
* Frees a number of pages from the PCP lists
|
||||
* Assumes all pages on list are in same zone, and of same order.
|
||||
* Assumes all pages on list are in same zone.
|
||||
* count is the number of pages to free.
|
||||
*
|
||||
* If the zone was previously in an "all pages pinned" state then look to
|
||||
* see if this freeing clears that state.
|
||||
*
|
||||
* And clear the zone's pages_scanned counter, to hold off the "all pages are
|
||||
* pinned" detection logic.
|
||||
*/
|
||||
static void free_pcppages_bulk(struct zone *zone, int count,
|
||||
struct per_cpu_pages *pcp)
|
||||
@ -1591,7 +1619,7 @@ static void __meminit init_reserved_page(unsigned long pfn)
|
||||
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
|
||||
struct zone *zone = &pgdat->node_zones[zid];
|
||||
|
||||
if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone))
|
||||
if (zone_spans_pfn(zone, pfn))
|
||||
break;
|
||||
}
|
||||
__init_single_page(pfn_to_page(pfn), pfn, zid, nid);
|
||||
@ -2418,6 +2446,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
|
||||
}
|
||||
|
||||
set_page_owner(page, order, gfp_flags);
|
||||
page_table_check_alloc(page, order);
|
||||
}
|
||||
|
||||
static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
|
||||
@ -2980,12 +3009,13 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype,
	if (IS_ENABLED(CONFIG_CMA)) {
		/*
		 * Balance movable allocations between regular and CMA areas by
		 * allocating from CMA when over half of the zone's free memory
		 * is in the CMA area.
		 * allocating from CMA when more than a given proportion of
		 * the zone's free memory is in the CMA area.
		 */
		if (alloc_flags & ALLOC_CMA &&
		    zone_page_state(zone, NR_FREE_CMA_PAGES) >
		    zone_page_state(zone, NR_FREE_PAGES) / 2) {
		    zone_page_state(zone, NR_FREE_PAGES) / ALLOC_IN_CMA_THRESHOLD_MAX
		    * _alloc_in_cma_threshold) {
			page = __rmqueue_cma_fallback(zone, order);
			if (page)
				goto out;
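A worked check of the new condition, assuming the default threshold of 12: CMA is preferred once free CMA pages exceed free_pages / 16 * 12, i.e. 75% of the zone's free memory, where the old code hard-coded 50%.

/* Illustrative sketch only: evaluates the proportion test used above. */
#include <stdbool.h>
#include <stdio.h>

#define ALLOC_IN_CMA_THRESHOLD_MAX 16

static bool should_use_cma(unsigned long free_cma, unsigned long free_pages,
			   unsigned long threshold)
{
	/* Same shape as the patched test: free_cma > free_pages / 16 * threshold */
	return free_cma > free_pages / ALLOC_IN_CMA_THRESHOLD_MAX * threshold;
}

int main(void)
{
	/* 100000 free pages, 80000 of them in CMA: 80000 > 75000 -> use CMA */
	printf("%d\n", should_use_cma(80000, 100000, 12));	/* 1 */
	/* 60000 CMA pages is below the 75% mark -> keep using regular areas */
	printf("%d\n", should_use_cma(60000, 100000, 12));	/* 0 */
	return 0;
}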
@ -3149,9 +3179,9 @@ static void drain_local_pages_wq(struct work_struct *work)
|
||||
* cpu which is alright but we also have to make sure to not move to
|
||||
* a different one.
|
||||
*/
|
||||
preempt_disable();
|
||||
migrate_disable();
|
||||
drain_local_pages(drain->zone);
|
||||
preempt_enable();
|
||||
migrate_enable();
|
||||
}
|
||||
|
||||
/*
|
||||
@ -3968,6 +3998,8 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NUMA
|
||||
int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
|
||||
|
||||
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
|
||||
{
|
||||
return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
|
||||
@ -4356,6 +4388,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
|
||||
return NULL;
|
||||
|
||||
psi_memstall_enter(&pflags);
|
||||
delayacct_compact_start();
|
||||
noreclaim_flag = memalloc_noreclaim_save();
|
||||
|
||||
*compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
|
||||
@ -4363,6 +4396,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
|
||||
|
||||
memalloc_noreclaim_restore(noreclaim_flag);
|
||||
psi_memstall_leave(&pflags);
|
||||
delayacct_compact_end();
|
||||
|
||||
if (*compact_result == COMPACT_SKIPPED)
|
||||
return NULL;
|
||||
@ -4799,30 +4833,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
|
||||
trace_reclaim_retry_zone(z, order, reclaimable,
|
||||
available, min_wmark, *no_progress_loops, wmark);
|
||||
if (wmark) {
|
||||
/*
|
||||
* If we didn't make any progress and have a lot of
|
||||
* dirty + writeback pages then we should wait for
|
||||
* an IO to complete to slow down the reclaim and
|
||||
* prevent a premature OOM
|
||||
*/
|
||||
if (!did_some_progress) {
|
||||
unsigned long write_pending;
|
||||
|
||||
write_pending = zone_page_state_snapshot(zone,
|
||||
NR_ZONE_WRITE_PENDING);
|
||||
|
||||
if (2 * write_pending > reclaimable) {
|
||||
congestion_wait(BLK_RW_ASYNC, HZ/10);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
ret = true;
|
||||
goto out;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
out:
|
||||
/*
|
||||
* Memory allocation/reclaim might be called from a WQ context and the
|
||||
* current implementation of the WQ concurrency control doesn't
|
||||
@ -4918,6 +4933,19 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
|
||||
if (!ac->preferred_zoneref->zone)
|
||||
goto nopage;
|
||||
|
||||
/*
|
||||
* Check for insane configurations where the cpuset doesn't contain
|
||||
* any suitable zone to satisfy the request - e.g. non-movable
|
||||
* GFP_HIGHUSER allocations from MOVABLE nodes only.
|
||||
*/
|
||||
if (cpusets_insane_config() && (gfp_mask & __GFP_HARDWALL)) {
|
||||
struct zoneref *z = first_zones_zonelist(ac->zonelist,
|
||||
ac->highest_zoneidx,
|
||||
&cpuset_current_mems_allowed);
|
||||
if (!z->zone)
|
||||
goto nopage;
|
||||
}
|
||||
|
||||
if (alloc_flags & ALLOC_KSWAPD)
|
||||
wake_all_kswapds(order, gfp_mask, ac);
|
||||
|
||||
@ -5408,6 +5436,18 @@ struct page *__alloc_pages(gfp_t gfp, unsigned int order, int preferred_nid,
|
||||
}
|
||||
EXPORT_SYMBOL(__alloc_pages);
|
||||
|
||||
struct folio *__folio_alloc(gfp_t gfp, unsigned int order, int preferred_nid,
|
||||
nodemask_t *nodemask)
|
||||
{
|
||||
struct page *page = __alloc_pages(gfp | __GFP_COMP, order,
|
||||
preferred_nid, nodemask);
|
||||
|
||||
if (page && order > 1)
|
||||
prep_transhuge_page(page);
|
||||
return (struct folio *)page;
|
||||
}
|
||||
EXPORT_SYMBOL(__folio_alloc);
|
||||
|
||||
/*
|
||||
* Common helper functions. Never use with __GFP_HIGHMEM because the returned
|
||||
* address cannot represent highmem pages. Use alloc_pages and then kmap if
|
||||
@ -5620,8 +5660,8 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
|
||||
unsigned int order = get_order(size);
|
||||
unsigned long addr;
|
||||
|
||||
if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
|
||||
gfp_mask &= ~__GFP_COMP;
|
||||
if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM)))
|
||||
gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM);
|
||||
|
||||
addr = __get_free_pages(gfp_mask, order);
|
||||
return make_alloc_exact(addr, order, size);
|
||||
@ -5645,8 +5685,8 @@ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
|
||||
unsigned int order = get_order(size);
|
||||
struct page *p;
|
||||
|
||||
if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
|
||||
gfp_mask &= ~__GFP_COMP;
|
||||
if (WARN_ON_ONCE(gfp_mask & (__GFP_COMP | __GFP_HIGHMEM)))
|
||||
gfp_mask &= ~(__GFP_COMP | __GFP_HIGHMEM);
|
||||
|
||||
p = alloc_pages_node(nid, gfp_mask, order);
|
||||
if (!p)
|
||||
@ -5988,6 +6028,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
|
||||
printk(KERN_CONT
|
||||
"%s"
|
||||
" free:%lukB"
|
||||
" boost:%lukB"
|
||||
" min:%lukB"
|
||||
" low:%lukB"
|
||||
" high:%lukB"
|
||||
@ -6008,6 +6049,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
|
||||
"\n",
|
||||
zone->name,
|
||||
K(zone_page_state(zone, NR_FREE_PAGES)),
|
||||
K(zone->watermark_boost),
|
||||
K(min_wmark_pages(zone)),
|
||||
K(low_wmark_pages(zone)),
|
||||
K(high_wmark_pages(zone)),
|
||||
@ -6263,7 +6305,7 @@ static void build_zonelists(pg_data_t *pgdat)
|
||||
*/
|
||||
if (node_distance(local_node, node) !=
|
||||
node_distance(local_node, prev_node))
|
||||
node_load[node] = load;
|
||||
node_load[node] += load;
|
||||
|
||||
node_order[nr_nodes++] = node;
|
||||
prev_node = node;
|
||||
@ -6272,6 +6314,10 @@ static void build_zonelists(pg_data_t *pgdat)
|
||||
|
||||
build_zonelists_in_node_order(pgdat, node_order, nr_nodes);
|
||||
build_thisnode_zonelists(pgdat);
|
||||
pr_info("Fallback order for Node %d: ", local_node);
|
||||
for (node = 0; node < nr_nodes; node++)
|
||||
pr_cont("%d ", node_order[node]);
|
||||
pr_cont("\n");
|
||||
}
|
||||
|
||||
#ifdef CONFIG_HAVE_MEMORYLESS_NODES
|
||||
@ -6558,6 +6604,75 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone
|
||||
}
|
||||
|
||||
#ifdef CONFIG_ZONE_DEVICE
|
||||
static void __ref __init_zone_device_page(struct page *page, unsigned long pfn,
|
||||
unsigned long zone_idx, int nid,
|
||||
struct dev_pagemap *pgmap)
|
||||
{
|
||||
|
||||
__init_single_page(page, pfn, zone_idx, nid);
|
||||
|
||||
/*
|
||||
* Mark page reserved as it will need to wait for onlining
|
||||
* phase for it to be fully associated with a zone.
|
||||
*
|
||||
* We can use the non-atomic __set_bit operation for setting
|
||||
* the flag as we are still initializing the pages.
|
||||
*/
|
||||
__SetPageReserved(page);
|
||||
|
||||
/*
|
||||
* ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
|
||||
* and zone_device_data. It is a bug if a ZONE_DEVICE page is
|
||||
* ever freed or placed on a driver-private list.
|
||||
*/
|
||||
page->pgmap = pgmap;
|
||||
page->zone_device_data = NULL;
|
||||
|
||||
/*
|
||||
* Mark the block movable so that blocks are reserved for
|
||||
* movable at startup. This will force kernel allocations
|
||||
* to reserve their blocks rather than leaking throughout
|
||||
* the address space during boot when many long-lived
|
||||
* kernel allocations are made.
|
||||
*
|
||||
* Please note that MEMINIT_HOTPLUG path doesn't clear memmap
|
||||
* because this is done early in section_activate()
|
||||
*/
|
||||
if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
|
||||
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
|
||||
cond_resched();
|
||||
}
|
||||
}
|
||||
|
||||
static void __ref memmap_init_compound(struct page *head,
|
||||
unsigned long head_pfn,
|
||||
unsigned long zone_idx, int nid,
|
||||
struct dev_pagemap *pgmap,
|
||||
unsigned long nr_pages)
|
||||
{
|
||||
unsigned long pfn, end_pfn = head_pfn + nr_pages;
|
||||
unsigned int order = pgmap->vmemmap_shift;
|
||||
|
||||
__SetPageHead(head);
|
||||
for (pfn = head_pfn + 1; pfn < end_pfn; pfn++) {
|
||||
struct page *page = pfn_to_page(pfn);
|
||||
|
||||
__init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
|
||||
prep_compound_tail(head, pfn - head_pfn);
|
||||
set_page_count(page, 0);
|
||||
|
||||
/*
|
||||
* The first tail page stores compound_mapcount_ptr() and
|
||||
* compound_order() and the second tail page stores
|
||||
* compound_pincount_ptr(). Call prep_compound_head() after
|
||||
* the first and second tail pages have been initialized to
|
||||
* not have the data overwritten.
|
||||
*/
|
||||
if (pfn == head_pfn + 2)
|
||||
prep_compound_head(head, order);
|
||||
}
|
||||
}
|
||||
|
||||
void __ref memmap_init_zone_device(struct zone *zone,
|
||||
unsigned long start_pfn,
|
||||
unsigned long nr_pages,
|
||||
@ -6566,6 +6681,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
|
||||
unsigned long pfn, end_pfn = start_pfn + nr_pages;
|
||||
struct pglist_data *pgdat = zone->zone_pgdat;
|
||||
struct vmem_altmap *altmap = pgmap_altmap(pgmap);
|
||||
unsigned int pfns_per_compound = pgmap_vmemmap_nr(pgmap);
|
||||
unsigned long zone_idx = zone_idx(zone);
|
||||
unsigned long start = jiffies;
|
||||
int nid = pgdat->node_id;
|
||||
@ -6583,42 +6699,16 @@ void __ref memmap_init_zone_device(struct zone *zone,
|
||||
nr_pages = end_pfn - start_pfn;
|
||||
}
|
||||
|
||||
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
|
||||
for (pfn = start_pfn; pfn < end_pfn; pfn += pfns_per_compound) {
|
||||
struct page *page = pfn_to_page(pfn);
|
||||
|
||||
__init_single_page(page, pfn, zone_idx, nid);
|
||||
__init_zone_device_page(page, pfn, zone_idx, nid, pgmap);
|
||||
|
||||
/*
|
||||
* Mark page reserved as it will need to wait for onlining
|
||||
* phase for it to be fully associated with a zone.
|
||||
*
|
||||
* We can use the non-atomic __set_bit operation for setting
|
||||
* the flag as we are still initializing the pages.
|
||||
*/
|
||||
__SetPageReserved(page);
|
||||
if (pfns_per_compound == 1)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
|
||||
* and zone_device_data. It is a bug if a ZONE_DEVICE page is
|
||||
* ever freed or placed on a driver-private list.
|
||||
*/
|
||||
page->pgmap = pgmap;
|
||||
page->zone_device_data = NULL;
|
||||
|
||||
/*
|
||||
* Mark the block movable so that blocks are reserved for
|
||||
* movable at startup. This will force kernel allocations
|
||||
* to reserve their blocks rather than leaking throughout
|
||||
* the address space during boot when many long-lived
|
||||
* kernel allocations are made.
|
||||
*
|
||||
* Please note that MEMINIT_HOTPLUG path doesn't clear memmap
|
||||
* because this is done early in section_activate()
|
||||
*/
|
||||
if (IS_ALIGNED(pfn, pageblock_nr_pages)) {
|
||||
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
|
||||
cond_resched();
|
||||
}
|
||||
memmap_init_compound(page, pfn, zone_idx, nid, pgmap,
|
||||
pfns_per_compound);
|
||||
}
|
||||
|
||||
pr_info("%s initialised %lu pages in %ums\n", __func__,
|
||||
@ -7397,6 +7487,8 @@ static void pgdat_init_kcompactd(struct pglist_data *pgdat) {}
|
||||
|
||||
static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
|
||||
{
|
||||
int i;
|
||||
|
||||
pgdat_resize_init(pgdat);
|
||||
|
||||
pgdat_init_split_queue(pgdat);
|
||||
@ -7405,6 +7497,9 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
|
||||
init_waitqueue_head(&pgdat->kswapd_wait);
|
||||
init_waitqueue_head(&pgdat->pfmemalloc_wait);
|
||||
|
||||
for (i = 0; i < NR_VMSCAN_THROTTLE; i++)
|
||||
init_waitqueue_head(&pgdat->reclaim_wait[i]);
|
||||
|
||||
pgdat_page_ext_init(pgdat);
|
||||
lruvec_init(&pgdat->__lruvec);
|
||||
}
|
||||
@ -8134,8 +8229,7 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char
|
||||
}
|
||||
|
||||
if (pages && s)
|
||||
pr_info("Freeing %s memory: %ldK\n",
|
||||
s, pages << (PAGE_SHIFT - 10));
|
||||
pr_info("Freeing %s memory: %ldK\n", s, K(pages));
|
||||
|
||||
return pages;
|
||||
}
|
||||
@ -8162,7 +8256,7 @@ void __init mem_init_print_info(void)
|
||||
*/
|
||||
#define adj_init_size(start, end, size, pos, adj) \
|
||||
do { \
|
||||
if (start <= pos && pos < end && size > adj) \
|
||||
if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \
|
||||
size -= adj; \
|
||||
} while (0)
|
||||
|
||||
@ -8180,14 +8274,13 @@ void __init mem_init_print_info(void)
|
||||
", %luK highmem"
|
||||
#endif
|
||||
")\n",
|
||||
nr_free_pages() << (PAGE_SHIFT - 10),
|
||||
physpages << (PAGE_SHIFT - 10),
|
||||
K(nr_free_pages()), K(physpages),
|
||||
codesize >> 10, datasize >> 10, rosize >> 10,
|
||||
(init_data_size + init_code_size) >> 10, bss_size >> 10,
|
||||
(physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10),
|
||||
totalcma_pages << (PAGE_SHIFT - 10)
|
||||
K(physpages - totalram_pages() - totalcma_pages),
|
||||
K(totalcma_pages)
|
||||
#ifdef CONFIG_HIGHMEM
|
||||
, totalhigh_pages() << (PAGE_SHIFT - 10)
|
||||
, K(totalhigh_pages())
|
||||
#endif
|
||||
);
|
||||
}
|
||||
@ -8460,7 +8553,7 @@ void setup_per_zone_wmarks(void)
 * 8192MB:	11584k
 * 16384MB:	16384k
 */
int __meminit init_per_zone_wmark_min(void)
void calculate_min_free_kbytes(void)
{
	unsigned long lowmem_kbytes;
	int new_min_free_kbytes;
@ -8468,16 +8561,17 @@ int __meminit init_per_zone_wmark_min(void)
	lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
	new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);

	if (new_min_free_kbytes > user_min_free_kbytes) {
		min_free_kbytes = new_min_free_kbytes;
		if (min_free_kbytes < 128)
			min_free_kbytes = 128;
		if (min_free_kbytes > 262144)
			min_free_kbytes = 262144;
	} else {
	if (new_min_free_kbytes > user_min_free_kbytes)
		min_free_kbytes = clamp(new_min_free_kbytes, 128, 262144);
	else
		pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
				new_min_free_kbytes, user_min_free_kbytes);
	}
}

int __meminit init_per_zone_wmark_min(void)
{
	calculate_min_free_kbytes();
	setup_per_zone_wmarks();
	refresh_zone_stat_thresholds();
	setup_per_zone_lowmem_reserve();
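calculate_min_free_kbytes() keeps the existing heuristic: min_free_kbytes = int_sqrt(lowmem_kbytes * 16), clamped to the range 128..262144. A standalone check of that formula; for 16 GiB of lowmem it yields the 16384k value quoted in the comment above (sqrt() stands in for the kernel's int_sqrt(), so link with -lm):

/* Illustrative sketch only: the min_free_kbytes heuristic with clamping. */
#include <math.h>
#include <stdio.h>

static long min_free_kbytes_for(unsigned long lowmem_kbytes)
{
	long v = (long)sqrt((double)lowmem_kbytes * 16);

	if (v < 128)
		v = 128;
	if (v > 262144)
		v = 262144;
	return v;
}

int main(void)
{
	printf("%ldk\n", min_free_kbytes_for(16UL * 1024 * 1024));	/* 16384k */
	printf("%ldk\n", min_free_kbytes_for(64));			/* clamped to 128k */
	return 0;
}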
@ -8764,7 +8858,8 @@ void *__init alloc_large_system_hash(const char *tablename,
|
||||
} else if (get_order(size) >= MAX_ORDER || hashdist) {
|
||||
table = __vmalloc(size, gfp_flags);
|
||||
virt = true;
|
||||
huge = is_vm_area_hugepages(table);
|
||||
if (table)
|
||||
huge = is_vm_area_hugepages(table);
|
||||
} else {
|
||||
/*
|
||||
* If bucketsize is not a power-of-two, we may free
|
||||
@ -9205,8 +9300,8 @@ static bool zone_spans_last_pfn(const struct zone *zone,
|
||||
* for allocation requests which can not be fulfilled with the buddy allocator.
|
||||
*
|
||||
* The allocated memory is always aligned to a page boundary. If nr_pages is a
|
||||
* power of two then the alignment is guaranteed to be to the given nr_pages
|
||||
* (e.g. 1GB request would be aligned to 1GB).
|
||||
* power of two, then allocated range is also guaranteed to be aligned to same
|
||||
* nr_pages (e.g. 1GB request would be aligned to 1GB).
|
||||
*
|
||||
* Allocated pages can be freed with free_contig_range() or by manually calling
|
||||
* __free_page() on each allocated page.
|
||||
@ -9361,21 +9456,21 @@ void __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* This function returns a stable result only if called under zone lock.
|
||||
*/
|
||||
bool is_free_buddy_page(struct page *page)
|
||||
{
|
||||
struct zone *zone = page_zone(page);
|
||||
unsigned long pfn = page_to_pfn(page);
|
||||
unsigned long flags;
|
||||
unsigned int order;
|
||||
|
||||
spin_lock_irqsave(&zone->lock, flags);
|
||||
for (order = 0; order < MAX_ORDER; order++) {
|
||||
struct page *page_head = page - (pfn & ((1 << order) - 1));
|
||||
|
||||
if (PageBuddy(page_head) && buddy_order(page_head) >= order)
|
||||
if (PageBuddy(page_head) &&
|
||||
buddy_order_unsafe(page_head) >= order)
|
||||
break;
|
||||
}
|
||||
spin_unlock_irqrestore(&zone->lock, flags);
|
||||
|
||||
return order < MAX_ORDER;
|
||||
}
|
||||
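The loop above locates a candidate buddy head by masking off the low order bits of the pfn: page - (pfn & ((1 << order) - 1)) is the first page of the naturally aligned 2^order block containing pfn. A standalone illustration of that alignment arithmetic:

/* Illustrative sketch only: order-aligned block start for a given pfn. */
#include <stdio.h>

static unsigned long block_start_pfn(unsigned long pfn, unsigned int order)
{
	/* Clear the low "order" bits: a block of 2^order pages is naturally aligned. */
	return pfn - (pfn & ((1UL << order) - 1));
}

int main(void)
{
	/* pfn 1027 sits in the order-3 block starting at pfn 1024 */
	printf("%lu\n", block_start_pfn(1027, 3));	/* 1024 */
	/* and in the order-0 block that is just itself */
	printf("%lu\n", block_start_pfn(1027, 0));	/* 1027 */
	return 0;
}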
@ -9439,6 +9534,7 @@ bool take_page_off_buddy(struct page *page)
|
||||
del_page_from_free_list(page_head, zone, page_order);
|
||||
break_down_buddy_pages(zone, page_head, page, 0,
|
||||
page_order, migratetype);
|
||||
SetPageHWPoisonTakenOff(page);
|
||||
if (!is_migrate_isolate(migratetype))
|
||||
__mod_zone_freepage_state(zone, -1, migratetype);
|
||||
ret = true;
|
||||
@ -9450,6 +9546,31 @@ bool take_page_off_buddy(struct page *page)
|
||||
spin_unlock_irqrestore(&zone->lock, flags);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Cancel takeoff done by take_page_off_buddy().
|
||||
*/
|
||||
bool put_page_back_buddy(struct page *page)
|
||||
{
|
||||
struct zone *zone = page_zone(page);
|
||||
unsigned long pfn = page_to_pfn(page);
|
||||
unsigned long flags;
|
||||
int migratetype = get_pfnblock_migratetype(page, pfn);
|
||||
bool ret = false;
|
||||
|
||||
spin_lock_irqsave(&zone->lock, flags);
|
||||
if (put_page_testzero(page)) {
|
||||
ClearPageHWPoisonTakenOff(page);
|
||||
__free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE);
|
||||
if (TestClearPageHWPoison(page)) {
|
||||
num_poisoned_pages_dec();
|
||||
ret = true;
|
||||
}
|
||||
}
|
||||
spin_unlock_irqrestore(&zone->lock, flags);
|
||||
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_ZONE_DMA
|
||||
|
@ -120,7 +120,6 @@ bool page_counter_try_charge(struct page_counter *counter,
|
||||
new = atomic_long_add_return(nr_pages, &c->usage);
|
||||
if (new > c->max) {
|
||||
atomic_long_sub(nr_pages, &c->usage);
|
||||
propagate_protected_usage(c, new);
|
||||
/*
|
||||
* This is racy, but we can live with some
|
||||
* inaccuracy in the failcnt which is only used
|
||||
|
@ -8,6 +8,7 @@
|
||||
#include <linux/kmemleak.h>
|
||||
#include <linux/page_owner.h>
|
||||
#include <linux/page_idle.h>
|
||||
#include <linux/page_table_check.h>
|
||||
|
||||
/*
|
||||
* struct page extension
|
||||
@ -63,18 +64,21 @@ static bool need_page_idle(void)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
struct page_ext_operations page_idle_ops = {
|
||||
static struct page_ext_operations page_idle_ops __initdata = {
|
||||
.need = need_page_idle,
|
||||
};
|
||||
#endif
|
||||
|
||||
static struct page_ext_operations *page_ext_ops[] = {
|
||||
static struct page_ext_operations *page_ext_ops[] __initdata = {
|
||||
#ifdef CONFIG_PAGE_OWNER
|
||||
&page_owner_ops,
|
||||
#endif
|
||||
#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
|
||||
&page_idle_ops,
|
||||
#endif
|
||||
#ifdef CONFIG_PAGE_TABLE_CHECK
|
||||
&page_table_check_ops,
|
||||
#endif
|
||||
};
|
||||
|
||||
unsigned long page_ext_size = sizeof(struct page_ext);
|
||||
@ -201,7 +205,7 @@ void __init page_ext_init_flatmem(void)
|
||||
panic("Out of memory");
|
||||
}
|
||||
|
||||
#else /* CONFIG_FLATMEM */
|
||||
#else /* CONFIG_SPARSEMEM */
|
||||
|
||||
struct page_ext *lookup_page_ext(const struct page *page)
|
||||
{
|
||||
|
mm/page_io.c (17 changed lines)
@ -25,6 +25,7 @@
|
||||
#include <linux/psi.h>
|
||||
#include <linux/uio.h>
|
||||
#include <linux/sched/task.h>
|
||||
#include <linux/delayacct.h>
|
||||
|
||||
void end_swap_bio_write(struct bio *bio)
|
||||
{
|
||||
@ -38,7 +39,7 @@ void end_swap_bio_write(struct bio *bio)
|
||||
* Also print a dire warning that things will go BAD (tm)
|
||||
* very quickly.
|
||||
*
|
||||
* Also clear PG_reclaim to avoid rotate_reclaimable_page()
|
||||
* Also clear PG_reclaim to avoid folio_rotate_reclaimable()
|
||||
*/
|
||||
set_page_dirty(page);
|
||||
pr_alert_ratelimited("Write-error on swap-device (%u:%u:%llu)\n",
|
||||
@ -317,7 +318,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
|
||||
* temporary failure if the system has limited
|
||||
* memory for allocating transmit buffers.
|
||||
* Mark the page dirty and avoid
|
||||
* rotate_reclaimable_page but rate-limit the
|
||||
* folio_rotate_reclaimable but rate-limit the
|
||||
* messages but do not flag PageError like
|
||||
* the normal direct-to-bio case as it could
|
||||
* be temporary.
|
||||
@ -358,8 +359,6 @@ int swap_readpage(struct page *page, bool synchronous)
|
||||
struct bio *bio;
|
||||
int ret = 0;
|
||||
struct swap_info_struct *sis = page_swap_info(page);
|
||||
blk_qc_t qc;
|
||||
struct gendisk *disk;
|
||||
unsigned long pflags;
|
||||
|
||||
VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page);
|
||||
@ -372,6 +371,7 @@ int swap_readpage(struct page *page, bool synchronous)
|
||||
* significant part of overall IO time.
|
||||
*/
|
||||
psi_memstall_enter(&pflags);
|
||||
delayacct_swapin_start();
|
||||
|
||||
if (frontswap_load(page) == 0) {
|
||||
SetPageUptodate(page);
|
||||
@ -409,26 +409,24 @@ int swap_readpage(struct page *page, bool synchronous)
|
||||
bio->bi_iter.bi_sector = swap_page_sector(page);
|
||||
bio->bi_end_io = end_swap_bio_read;
|
||||
bio_add_page(bio, page, thp_size(page), 0);
|
||||
|
||||
disk = bio->bi_bdev->bd_disk;
|
||||
/*
|
||||
* Keep this task valid during swap readpage because the oom killer may
|
||||
* attempt to access it in the page fault retry time check.
|
||||
*/
|
||||
if (synchronous) {
|
||||
bio->bi_opf |= REQ_HIPRI;
|
||||
bio->bi_opf |= REQ_POLLED;
|
||||
get_task_struct(current);
|
||||
bio->bi_private = current;
|
||||
}
|
||||
count_vm_event(PSWPIN);
|
||||
bio_get(bio);
|
||||
qc = submit_bio(bio);
|
||||
submit_bio(bio);
|
||||
while (synchronous) {
|
||||
set_current_state(TASK_UNINTERRUPTIBLE);
|
||||
if (!READ_ONCE(bio->bi_private))
|
||||
break;
|
||||
|
||||
if (!blk_poll(disk->queue, qc, true))
|
||||
if (!bio_poll(bio, NULL, 0))
|
||||
blk_io_schedule();
|
||||
}
|
||||
__set_current_state(TASK_RUNNING);
|
||||
@ -436,6 +434,7 @@ int swap_readpage(struct page *page, bool synchronous)
|
||||
|
||||
out:
|
||||
psi_memstall_leave(&pflags);
|
||||
delayacct_swapin_end();
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
@ -94,8 +94,13 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
|
||||
buddy = page + (buddy_pfn - pfn);
|
||||
|
||||
if (!is_migrate_isolate_page(buddy)) {
|
||||
__isolate_free_page(page, order);
|
||||
isolated_page = true;
|
||||
isolated_page = !!__isolate_free_page(page, order);
|
||||
/*
|
||||
* Isolating a free page in an isolated pageblock
|
||||
* is expected to always work as watermarks don't
|
||||
* apply here.
|
||||
*/
|
||||
VM_WARN_ON(!isolated_page);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -183,7 +188,6 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
|
||||
unsigned migratetype, int flags)
|
||||
{
|
||||
unsigned long pfn;
|
||||
unsigned long undo_pfn;
|
||||
struct page *page;
|
||||
|
||||
BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
|
||||
@ -193,25 +197,12 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
|
||||
pfn < end_pfn;
|
||||
pfn += pageblock_nr_pages) {
|
||||
page = __first_valid_page(pfn, pageblock_nr_pages);
|
||||
if (page) {
|
||||
if (set_migratetype_isolate(page, migratetype, flags)) {
|
||||
undo_pfn = pfn;
|
||||
goto undo;
|
||||
}
|
||||
if (page && set_migratetype_isolate(page, migratetype, flags)) {
|
||||
undo_isolate_page_range(start_pfn, pfn, migratetype);
|
||||
return -EBUSY;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
undo:
|
||||
for (pfn = start_pfn;
|
||||
pfn < undo_pfn;
|
||||
pfn += pageblock_nr_pages) {
|
||||
struct page *page = pfn_to_online_page(pfn);
|
||||
if (!page)
|
||||
continue;
|
||||
unset_migratetype_isolate(page, migratetype);
|
||||
}
|
||||
|
||||
return -EBUSY;
|
||||
}
|
||||
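The rewritten error path above drops the separate undo label: as soon as one pageblock fails to isolate, everything isolated so far (start_pfn up to the failing pfn) is rolled back in place and -EBUSY is returned. A generic sketch of that apply-or-roll-back pattern, with hypothetical apply()/undo() helpers standing in for set_migratetype_isolate()/unset_migratetype_isolate():

/* Illustrative sketch only: undo already-applied steps when one step fails. */
#include <stdio.h>

static int apply(int i) { return i == 3 ? -1 : 0; }	/* pretend step 3 fails */
static void undo(int i) { printf("undo %d\n", i); }

static int apply_range(int start, int end)
{
	for (int i = start; i < end; i++) {
		if (apply(i) < 0) {
			while (--i >= start)	/* roll back what already succeeded */
				undo(i);
			return -1;		/* -EBUSY in the kernel code */
		}
	}
	return 0;
}

int main(void)
{
	printf("%d\n", apply_range(0, 5));	/* undoes 2, 1, 0 and returns -1 */
	return 0;
}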
|
||||
/*
|
||||
|
@ -46,7 +46,7 @@ static int __init early_page_owner_param(char *buf)
|
||||
}
|
||||
early_param("page_owner", early_page_owner_param);
|
||||
|
||||
static bool need_page_owner(void)
|
||||
static __init bool need_page_owner(void)
|
||||
{
|
||||
return page_owner_enabled;
|
||||
}
|
||||
@ -75,11 +75,13 @@ static noinline void register_early_stack(void)
|
||||
early_handle = create_dummy_stack();
|
||||
}
|
||||
|
||||
static void init_page_owner(void)
|
||||
static __init void init_page_owner(void)
|
||||
{
|
||||
if (!page_owner_enabled)
|
||||
return;
|
||||
|
||||
stack_depot_init();
|
||||
|
||||
register_dummy_stack();
|
||||
register_failure_stack();
|
||||
register_early_stack();
|
||||
@ -125,7 +127,7 @@ static noinline depot_stack_handle_t save_stack(gfp_t flags)
|
||||
return handle;
|
||||
}
|
||||
|
||||
void __reset_page_owner(struct page *page, unsigned int order)
|
||||
void __reset_page_owner(struct page *page, unsigned short order)
|
||||
{
|
||||
int i;
|
||||
struct page_ext *page_ext;
|
||||
@ -149,7 +151,7 @@ void __reset_page_owner(struct page *page, unsigned int order)
|
||||
|
||||
static inline void __set_page_owner_handle(struct page_ext *page_ext,
|
||||
depot_stack_handle_t handle,
|
||||
unsigned int order, gfp_t gfp_mask)
|
||||
unsigned short order, gfp_t gfp_mask)
|
||||
{
|
||||
struct page_owner *page_owner;
|
||||
int i;
|
||||
@ -169,7 +171,7 @@ static inline void __set_page_owner_handle(struct page_ext *page_ext,
|
||||
}
|
||||
}
|
||||
|
||||
noinline void __set_page_owner(struct page *page, unsigned int order,
|
||||
noinline void __set_page_owner(struct page *page, unsigned short order,
|
||||
gfp_t gfp_mask)
|
||||
{
|
||||
struct page_ext *page_ext = lookup_page_ext(page);
|
||||
@ -210,10 +212,10 @@ void __split_page_owner(struct page *page, unsigned int nr)
|
||||
}
|
||||
}
|
||||
|
||||
void __copy_page_owner(struct page *oldpage, struct page *newpage)
|
||||
void __folio_copy_owner(struct folio *newfolio, struct folio *old)
|
||||
{
|
||||
struct page_ext *old_ext = lookup_page_ext(oldpage);
|
||||
struct page_ext *new_ext = lookup_page_ext(newpage);
|
||||
struct page_ext *old_ext = lookup_page_ext(&old->page);
|
||||
struct page_ext *new_ext = lookup_page_ext(&newfolio->page);
|
||||
struct page_owner *old_page_owner, *new_page_owner;
|
||||
|
||||
if (unlikely(!old_ext || !new_ext))
|
||||
@ -231,11 +233,11 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage)
|
||||
new_page_owner->free_ts_nsec = old_page_owner->ts_nsec;
|
||||
|
||||
/*
|
||||
* We don't clear the bit on the oldpage as it's going to be freed
|
||||
* We don't clear the bit on the old folio as it's going to be freed
|
||||
* after migration. Until then, the info can be useful in case of
|
||||
* a bug, and the overall stats will be off a bit only temporarily.
|
||||
* Also, migrate_misplaced_transhuge_page() can still fail the
|
||||
* migration and then we want the oldpage to retain the info. But
|
||||
* migration and then we want the old folio to retain the info. But
|
||||
* in that case we also don't need to explicitly clear the info from
|
||||
* the new page, which will be freed.
|
||||
*/
|
||||
@ -329,8 +331,6 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
|
||||
depot_stack_handle_t handle)
|
||||
{
|
||||
int ret, pageblock_mt, page_mt;
|
||||
unsigned long *entries;
|
||||
unsigned int nr_entries;
|
||||
char *kbuf;
|
||||
|
||||
count = min_t(size_t, count, PAGE_SIZE);
|
||||
@ -351,18 +351,17 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
|
||||
pageblock_mt = get_pageblock_migratetype(page);
|
||||
page_mt = gfp_migratetype(page_owner->gfp_mask);
|
||||
ret += snprintf(kbuf + ret, count - ret,
|
||||
"PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n",
|
||||
"PFN %lu type %s Block %lu type %s Flags %pGp\n",
|
||||
pfn,
|
||||
migratetype_names[page_mt],
|
||||
pfn >> pageblock_order,
|
||||
migratetype_names[pageblock_mt],
|
||||
page->flags, &page->flags);
|
||||
&page->flags);
|
||||
|
||||
if (ret >= count)
|
||||
goto err;
|
||||
|
||||
nr_entries = stack_depot_fetch(handle, &entries);
|
||||
ret += stack_trace_snprint(kbuf + ret, count - ret, entries, nr_entries, 0);
|
||||
ret += stack_depot_snprint(handle, kbuf + ret, count - ret, 0);
|
||||
if (ret >= count)
|
||||
goto err;
|
||||
|
||||
@ -394,8 +393,6 @@ void __dump_page_owner(const struct page *page)
|
||||
struct page_ext *page_ext = lookup_page_ext(page);
|
||||
struct page_owner *page_owner;
|
||||
depot_stack_handle_t handle;
|
||||
unsigned long *entries;
|
||||
unsigned int nr_entries;
|
||||
gfp_t gfp_mask;
|
||||
int mt;
|
||||
|
||||
@ -423,20 +420,17 @@ void __dump_page_owner(const struct page *page)
|
||||
page_owner->pid, page_owner->ts_nsec, page_owner->free_ts_nsec);
|
||||
|
||||
handle = READ_ONCE(page_owner->handle);
|
||||
if (!handle) {
|
||||
if (!handle)
|
||||
pr_alert("page_owner allocation stack trace missing\n");
|
||||
} else {
|
||||
nr_entries = stack_depot_fetch(handle, &entries);
|
||||
stack_trace_print(entries, nr_entries, 0);
|
||||
}
|
||||
else
|
||||
stack_depot_print(handle);
|
||||
|
||||
handle = READ_ONCE(page_owner->free_handle);
|
||||
if (!handle) {
|
||||
pr_alert("page_owner free stack trace missing\n");
|
||||
} else {
|
||||
nr_entries = stack_depot_fetch(handle, &entries);
|
||||
pr_alert("page last free stack trace:\n");
|
||||
stack_trace_print(entries, nr_entries, 0);
|
||||
stack_depot_print(handle);
|
||||
}
|
||||
|
||||
if (page_owner->last_migrate_reason != -1)
|
||||
|
@ -113,6 +113,24 @@ static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk)
|
||||
return pcpu_nr_pages_to_map_bits(chunk->nr_pages);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMCG_KMEM
|
||||
/**
|
||||
* pcpu_obj_full_size - helper to calculate size of each accounted object
|
||||
* @size: size of area to allocate in bytes
|
||||
*
|
||||
* For each accounted object there is an extra space which is used to store
|
||||
* obj_cgroup membership. Charge it too.
|
||||
*/
|
||||
static inline size_t pcpu_obj_full_size(size_t size)
|
||||
{
|
||||
size_t extra_size;
|
||||
|
||||
extra_size = size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *);
|
||||
|
||||
return size * num_possible_cpus() + extra_size;
|
||||
}
|
||||
#endif /* CONFIG_MEMCG_KMEM */
|
||||
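A worked instance of the accounting rule above, under assumed values of 8 possible CPUs, a PCPU_MIN_ALLOC_SIZE of 4 bytes and 8-byte pointers: a 64-byte request is charged 64 * 8 = 512 bytes for the per-CPU copies plus 16 * 8 = 128 bytes for the obj_cgroup pointers, 640 bytes in total.

/* Illustrative sketch only: the accounted size computed by pcpu_obj_full_size(). */
#include <stddef.h>
#include <stdio.h>

#define PCPU_MIN_ALLOC_SIZE	4	/* assumed value for the example */
#define NR_POSSIBLE_CPUS	8	/* assumed value for the example */

static size_t pcpu_obj_full_size(size_t size)
{
	size_t extra_size = size / PCPU_MIN_ALLOC_SIZE * sizeof(void *);

	return size * NR_POSSIBLE_CPUS + extra_size;
}

int main(void)
{
	printf("%zu\n", pcpu_obj_full_size(64));	/* 640 on LP64 */
	return 0;
}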
|
||||
#ifdef CONFIG_PERCPU_STATS
|
||||
|
||||
#include <linux/spinlock.h>
|
||||
|
mm/percpu.c (205 changed lines)
@ -779,7 +779,7 @@ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
|
||||
{
|
||||
struct pcpu_block_md *block = chunk->md_blocks + index;
|
||||
unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
|
||||
unsigned int rs, re, start; /* region start, region end */
|
||||
unsigned int start, end; /* region start, region end */
|
||||
|
||||
/* promote scan_hint to contig_hint */
|
||||
if (block->scan_hint) {
|
||||
@ -795,9 +795,8 @@ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index)
|
||||
block->right_free = 0;
|
||||
|
||||
/* iterate over free areas and update the contig hints */
|
||||
bitmap_for_each_clear_region(alloc_map, rs, re, start,
|
||||
PCPU_BITMAP_BLOCK_BITS)
|
||||
pcpu_block_update(block, rs, re);
|
||||
for_each_clear_bitrange_from(start, end, alloc_map, PCPU_BITMAP_BLOCK_BITS)
|
||||
pcpu_block_update(block, start, end);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -1070,17 +1069,18 @@ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
|
||||
static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits,
|
||||
int *next_off)
|
||||
{
|
||||
unsigned int page_start, page_end, rs, re;
|
||||
unsigned int start, end;
|
||||
|
||||
page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
|
||||
page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
|
||||
start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE);
|
||||
end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
|
||||
|
||||
rs = page_start;
|
||||
bitmap_next_clear_region(chunk->populated, &rs, &re, page_end);
|
||||
if (rs >= page_end)
|
||||
start = find_next_zero_bit(chunk->populated, end, start);
|
||||
if (start >= end)
|
||||
return true;
|
||||
|
||||
*next_off = re * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
|
||||
end = find_next_bit(chunk->populated, end, start + 1);
|
||||
|
||||
*next_off = end * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE;
|
||||
return false;
|
||||
}
|
||||
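The rewritten pcpu_is_populated() scans the populated bitmap directly: find the first zero bit (an unpopulated page) in [start, end); if there is none the range is fully populated, otherwise the next set bit after the hole tells the caller where to restart. A small standalone model of that scan over a plain bitmask:

/* Illustrative sketch only: "is the page range fully populated?" over a bitmask. */
#include <stdbool.h>
#include <stdio.h>

static bool range_populated(unsigned long map, unsigned int start, unsigned int end,
			    unsigned int *next_start)
{
	unsigned int i = start;

	while (i < end && (map & (1UL << i)))	/* find_next_zero_bit() role */
		i++;
	if (i >= end)
		return true;			/* no hole: fully populated */

	while (i < end && !(map & (1UL << i)))	/* find_next_bit() after the hole */
		i++;
	*next_start = i;			/* restart point for the caller */
	return false;
}

int main(void)
{
	unsigned int next;
	bool ok;

	/* pages 0-3 populated, 4-5 not, 6-7 populated (bit i = page i) */
	printf("%d\n", range_populated(0xCFUL, 0, 4, &next));	/* 1 */
	ok = range_populated(0xCFUL, 2, 8, &next);
	printf("%d %u\n", ok, next);				/* 0 6 */
	return 0;
}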
|
||||
@ -1635,7 +1635,7 @@ static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
|
||||
if (!objcg)
|
||||
return true;
|
||||
|
||||
if (obj_cgroup_charge(objcg, gfp, size * num_possible_cpus())) {
|
||||
if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size))) {
|
||||
obj_cgroup_put(objcg);
|
||||
return false;
|
||||
}
|
||||
@ -1656,10 +1656,10 @@ static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
|
||||
|
||||
rcu_read_lock();
|
||||
mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
|
||||
size * num_possible_cpus());
|
||||
pcpu_obj_full_size(size));
|
||||
rcu_read_unlock();
|
||||
} else {
|
||||
obj_cgroup_uncharge(objcg, size * num_possible_cpus());
|
||||
obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size));
|
||||
obj_cgroup_put(objcg);
|
||||
}
|
||||
}
|
||||
@ -1676,11 +1676,11 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
|
||||
return;
|
||||
chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = NULL;
|
||||
|
||||
obj_cgroup_uncharge(objcg, size * num_possible_cpus());
|
||||
obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size));
|
||||
|
||||
rcu_read_lock();
|
||||
mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
|
||||
-(size * num_possible_cpus()));
|
||||
-pcpu_obj_full_size(size));
|
||||
rcu_read_unlock();
|
||||
|
||||
obj_cgroup_put(objcg);
|
||||
@ -1851,13 +1851,12 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
|
||||
|
||||
/* populate if not all pages are already there */
|
||||
if (!is_atomic) {
|
||||
unsigned int page_start, page_end, rs, re;
|
||||
unsigned int page_end, rs, re;
|
||||
|
||||
page_start = PFN_DOWN(off);
|
||||
rs = PFN_DOWN(off);
|
||||
page_end = PFN_UP(off + size);
|
||||
|
||||
bitmap_for_each_clear_region(chunk->populated, rs, re,
|
||||
page_start, page_end) {
|
||||
for_each_clear_bitrange_from(rs, re, chunk->populated, page_end) {
|
||||
WARN_ON(chunk->immutable);
|
||||
|
||||
ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp);
|
||||
@ -2013,8 +2012,7 @@ static void pcpu_balance_free(bool empty_only)
|
||||
list_for_each_entry_safe(chunk, next, &to_free, list) {
|
||||
unsigned int rs, re;
|
||||
|
||||
bitmap_for_each_set_region(chunk->populated, rs, re, 0,
|
||||
chunk->nr_pages) {
|
||||
for_each_set_bitrange(rs, re, chunk->populated, chunk->nr_pages) {
|
||||
pcpu_depopulate_chunk(chunk, rs, re);
|
||||
spin_lock_irq(&pcpu_lock);
|
||||
pcpu_chunk_depopulated(chunk, rs, re);
|
||||
@ -2084,8 +2082,7 @@ static void pcpu_balance_populated(void)
|
||||
continue;
|
||||
|
||||
/* @chunk can't go away while pcpu_alloc_mutex is held */
|
||||
bitmap_for_each_clear_region(chunk->populated, rs, re, 0,
|
||||
chunk->nr_pages) {
|
||||
for_each_clear_bitrange(rs, re, chunk->populated, chunk->nr_pages) {
|
||||
int nr = min_t(int, re - rs, nr_to_pop);
|
||||
|
||||
spin_unlock_irq(&pcpu_lock);
|
||||
@ -2472,7 +2469,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
|
||||
*/
|
||||
void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
|
||||
{
|
||||
memblock_free_early(__pa(ai), ai->__ai_size);
|
||||
memblock_free(ai, ai->__ai_size);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -2992,6 +2989,42 @@ static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info(
|
||||
|
||||
return ai;
|
||||
}
|
||||
|
||||
static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align,
|
||||
pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
|
||||
{
|
||||
const unsigned long goal = __pa(MAX_DMA_ADDRESS);
|
||||
#ifdef CONFIG_NUMA
|
||||
int node = NUMA_NO_NODE;
|
||||
void *ptr;
|
||||
|
||||
if (cpu_to_nd_fn)
|
||||
node = cpu_to_nd_fn(cpu);
|
||||
|
||||
if (node == NUMA_NO_NODE || !node_online(node) || !NODE_DATA(node)) {
|
||||
ptr = memblock_alloc_from(size, align, goal);
|
||||
pr_info("cpu %d has no node %d or node-local memory\n",
|
||||
cpu, node);
|
||||
pr_debug("per cpu data for cpu%d %zu bytes at 0x%llx\n",
|
||||
cpu, size, (u64)__pa(ptr));
|
||||
} else {
|
||||
ptr = memblock_alloc_try_nid(size, align, goal,
|
||||
MEMBLOCK_ALLOC_ACCESSIBLE,
|
||||
node);
|
||||
|
||||
pr_debug("per cpu data for cpu%d %zu bytes on node%d at 0x%llx\n",
|
||||
cpu, size, node, (u64)__pa(ptr));
|
||||
}
|
||||
return ptr;
|
||||
#else
|
||||
return memblock_alloc_from(size, align, goal);
|
||||
#endif
|
||||
}
|
||||
|
||||
static void __init pcpu_fc_free(void *ptr, size_t size)
|
||||
{
|
||||
memblock_free(ptr, size);
|
||||
}
|
||||
#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */
|
||||
|
||||
#if defined(BUILD_EMBED_FIRST_CHUNK)
|
||||
@ -3001,14 +3034,13 @@ static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info(
|
||||
* @dyn_size: minimum free size for dynamic allocation in bytes
|
||||
* @atom_size: allocation atom size
|
||||
* @cpu_distance_fn: callback to determine distance between cpus, optional
|
||||
* @alloc_fn: function to allocate percpu page
|
||||
* @free_fn: function to free percpu page
|
||||
* @cpu_to_nd_fn: callback to convert cpu to its node, optional
|
||||
*
|
||||
* This is a helper to ease setting up embedded first percpu chunk and
|
||||
* can be called where pcpu_setup_first_chunk() is expected.
|
||||
*
|
||||
* If this function is used to setup the first chunk, it is allocated
|
||||
* by calling @alloc_fn and used as-is without being mapped into
|
||||
* by calling pcpu_fc_alloc and used as-is without being mapped into
|
||||
* vmalloc area. Allocations are always whole multiples of @atom_size
|
||||
* aligned to @atom_size.
|
||||
*
|
||||
@ -3022,7 +3054,7 @@ static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info(
|
||||
* @dyn_size specifies the minimum dynamic area size.
|
||||
*
|
||||
* If the needed size is smaller than the minimum or specified unit
|
||||
* size, the leftover is returned using @free_fn.
|
||||
* size, the leftover is returned using pcpu_fc_free.
|
||||
*
|
||||
* RETURNS:
|
||||
* 0 on success, -errno on failure.
|
||||
@ -3030,8 +3062,7 @@ static struct pcpu_alloc_info * __init __flatten pcpu_build_alloc_info(
|
||||
int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
|
||||
size_t atom_size,
|
||||
pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
|
||||
pcpu_fc_alloc_fn_t alloc_fn,
|
||||
pcpu_fc_free_fn_t free_fn)
|
||||
pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
|
||||
{
|
||||
void *base = (void *)ULONG_MAX;
|
||||
void **areas = NULL;
|
||||
@ -3066,7 +3097,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
|
||||
BUG_ON(cpu == NR_CPUS);
|
||||
|
||||
/* allocate space for the whole group */
|
||||
ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
|
||||
ptr = pcpu_fc_alloc(cpu, gi->nr_units * ai->unit_size, atom_size, cpu_to_nd_fn);
|
||||
if (!ptr) {
|
||||
rc = -ENOMEM;
|
||||
goto out_free_areas;
|
||||
@ -3105,12 +3136,12 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
|
||||
for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
|
||||
if (gi->cpu_map[i] == NR_CPUS) {
|
||||
/* unused unit, free whole */
|
||||
free_fn(ptr, ai->unit_size);
|
||||
pcpu_fc_free(ptr, ai->unit_size);
|
||||
continue;
|
||||
}
|
||||
/* copy and return the unused part */
|
||||
memcpy(ptr, __per_cpu_load, ai->static_size);
|
||||
free_fn(ptr + size_sum, ai->unit_size - size_sum);
|
||||
pcpu_fc_free(ptr + size_sum, ai->unit_size - size_sum);
|
||||
}
|
||||
}
|
||||
|
||||
@ -3129,23 +3160,90 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
|
||||
out_free_areas:
|
||||
for (group = 0; group < ai->nr_groups; group++)
|
||||
if (areas[group])
|
||||
free_fn(areas[group],
|
||||
pcpu_fc_free(areas[group],
|
||||
ai->groups[group].nr_units * ai->unit_size);
|
||||
out_free:
|
||||
pcpu_free_alloc_info(ai);
|
||||
if (areas)
|
||||
memblock_free_early(__pa(areas), areas_size);
|
||||
memblock_free(areas, areas_size);
|
||||
return rc;
|
||||
}
|
||||
#endif /* BUILD_EMBED_FIRST_CHUNK */
|
||||
|
||||
#ifdef BUILD_PAGE_FIRST_CHUNK
|
||||
#include <asm/pgalloc.h>
|
||||
|
||||
#ifndef P4D_TABLE_SIZE
|
||||
#define P4D_TABLE_SIZE PAGE_SIZE
|
||||
#endif
|
||||
|
||||
#ifndef PUD_TABLE_SIZE
|
||||
#define PUD_TABLE_SIZE PAGE_SIZE
|
||||
#endif
|
||||
|
||||
#ifndef PMD_TABLE_SIZE
|
||||
#define PMD_TABLE_SIZE PAGE_SIZE
|
||||
#endif
|
||||
|
||||
#ifndef PTE_TABLE_SIZE
|
||||
#define PTE_TABLE_SIZE PAGE_SIZE
|
||||
#endif
|
||||
void __init __weak pcpu_populate_pte(unsigned long addr)
|
||||
{
|
||||
pgd_t *pgd = pgd_offset_k(addr);
|
||||
p4d_t *p4d;
|
||||
pud_t *pud;
|
||||
pmd_t *pmd;
|
||||
|
||||
if (pgd_none(*pgd)) {
|
||||
p4d_t *new;
|
||||
|
||||
new = memblock_alloc(P4D_TABLE_SIZE, P4D_TABLE_SIZE);
|
||||
if (!new)
|
||||
goto err_alloc;
|
||||
pgd_populate(&init_mm, pgd, new);
|
||||
}
|
||||
|
||||
p4d = p4d_offset(pgd, addr);
|
||||
if (p4d_none(*p4d)) {
|
||||
pud_t *new;
|
||||
|
||||
new = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE);
|
||||
if (!new)
|
||||
goto err_alloc;
|
||||
p4d_populate(&init_mm, p4d, new);
|
||||
}
|
||||
|
||||
pud = pud_offset(p4d, addr);
|
||||
if (pud_none(*pud)) {
|
||||
pmd_t *new;
|
||||
|
||||
new = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE);
|
||||
if (!new)
|
||||
goto err_alloc;
|
||||
pud_populate(&init_mm, pud, new);
|
||||
}
|
||||
|
||||
pmd = pmd_offset(pud, addr);
|
||||
if (!pmd_present(*pmd)) {
|
||||
pte_t *new;
|
||||
|
||||
new = memblock_alloc(PTE_TABLE_SIZE, PTE_TABLE_SIZE);
|
||||
if (!new)
|
||||
goto err_alloc;
|
||||
pmd_populate_kernel(&init_mm, pmd, new);
|
||||
}
|
||||
|
||||
return;
|
||||
|
||||
err_alloc:
|
||||
panic("%s: Failed to allocate memory\n", __func__);
|
||||
}
|
||||
|
||||
/**
|
||||
* pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
|
||||
* @reserved_size: the size of reserved percpu area in bytes
|
||||
* @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
|
||||
* @free_fn: function to free percpu page, always called with PAGE_SIZE
|
||||
* @populate_pte_fn: function to populate pte
|
||||
* @cpu_to_nd_fn: callback to convert cpu to its node, optional
|
||||
*
|
||||
* This is a helper to ease setting up page-remapped first percpu
|
||||
* chunk and can be called where pcpu_setup_first_chunk() is expected.
|
||||
@ -3156,10 +3254,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
|
||||
* RETURNS:
|
||||
* 0 on success, -errno on failure.
|
||||
*/
|
||||
int __init pcpu_page_first_chunk(size_t reserved_size,
|
||||
pcpu_fc_alloc_fn_t alloc_fn,
|
||||
pcpu_fc_free_fn_t free_fn,
|
||||
pcpu_fc_populate_pte_fn_t populate_pte_fn)
|
||||
int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_cpu_to_node_fn_t cpu_to_nd_fn)
|
||||
{
|
||||
static struct vm_struct vm;
|
||||
struct pcpu_alloc_info *ai;
|
||||
@ -3201,7 +3296,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
|
||||
for (i = 0; i < unit_pages; i++) {
|
||||
void *ptr;
|
||||
|
||||
ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
|
||||
ptr = pcpu_fc_alloc(cpu, PAGE_SIZE, PAGE_SIZE, cpu_to_nd_fn);
|
||||
if (!ptr) {
|
||||
pr_warn("failed to allocate %s page for cpu%u\n",
|
||||
psize_str, cpu);
|
||||
@ -3223,7 +3318,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
|
||||
(unsigned long)vm.addr + unit * ai->unit_size;
|
||||
|
||||
for (i = 0; i < unit_pages; i++)
|
||||
populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
|
||||
pcpu_populate_pte(unit_addr + (i << PAGE_SHIFT));
|
||||
|
||||
/* pte already populated, the following shouldn't fail */
|
||||
rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
|
||||
@ -3253,10 +3348,10 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
|
||||
|
||||
enomem:
|
||||
while (--j >= 0)
|
||||
free_fn(page_address(pages[j]), PAGE_SIZE);
|
||||
pcpu_fc_free(page_address(pages[j]), PAGE_SIZE);
|
||||
rc = -ENOMEM;
|
||||
out_free_ar:
|
||||
memblock_free_early(__pa(pages), pages_size);
|
||||
memblock_free(pages, pages_size);
|
||||
pcpu_free_alloc_info(ai);
|
||||
return rc;
|
||||
}
|
||||
@ -3278,17 +3373,6 @@ int __init pcpu_page_first_chunk(size_t reserved_size,
|
||||
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
|
||||
EXPORT_SYMBOL(__per_cpu_offset);
|
||||
|
||||
static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
|
||||
size_t align)
|
||||
{
|
||||
return memblock_alloc_from(size, align, __pa(MAX_DMA_ADDRESS));
|
||||
}
|
||||
|
||||
static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
|
||||
{
|
||||
memblock_free_early(__pa(ptr), size);
|
||||
}
|
||||
|
||||
void __init setup_per_cpu_areas(void)
|
||||
{
|
||||
unsigned long delta;
|
||||
@ -3299,9 +3383,8 @@ void __init setup_per_cpu_areas(void)
|
||||
* Always reserve area for module percpu variables. That's
|
||||
* what the legacy allocator did.
|
||||
*/
|
||||
rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
|
||||
PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
|
||||
pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
|
||||
rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE,
|
||||
PAGE_SIZE, NULL, NULL);
|
||||
if (rc < 0)
|
||||
panic("Failed to initialize percpu areas.");
|
||||
|
||||
|
@ -10,6 +10,7 @@
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/hugetlb.h>
|
||||
#include <linux/pgtable.h>
|
||||
#include <linux/mm_inline.h>
|
||||
#include <asm/tlb.h>
|
||||
|
||||
/*
|
||||
|
@ -12,7 +12,6 @@
|
||||
#include <linux/dax.h>
|
||||
#include <linux/gfp.h>
|
||||
#include <linux/export.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/task_io_accounting_ops.h>
|
||||
#include <linux/pagevec.h>
|
||||
@ -197,9 +196,9 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
|
||||
* Preallocate as many pages as we will need.
|
||||
*/
|
||||
for (i = 0; i < nr_to_read; i++) {
|
||||
struct page *page = xa_load(&mapping->i_pages, index + i);
|
||||
struct folio *folio = xa_load(&mapping->i_pages, index + i);
|
||||
|
||||
if (page && !xa_is_value(page)) {
|
||||
if (folio && !xa_is_value(folio)) {
|
||||
/*
|
||||
* Page already present? Kick off the current batch
|
||||
* of contiguous pages before continuing with the
|
||||
@ -213,21 +212,21 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
|
||||
continue;
|
||||
}
|
||||
|
||||
page = __page_cache_alloc(gfp_mask);
|
||||
if (!page)
|
||||
folio = filemap_alloc_folio(gfp_mask, 0);
|
||||
if (!folio)
|
||||
break;
|
||||
if (mapping->a_ops->readpages) {
|
||||
page->index = index + i;
|
||||
list_add(&page->lru, &page_pool);
|
||||
} else if (add_to_page_cache_lru(page, mapping, index + i,
|
||||
folio->index = index + i;
|
||||
list_add(&folio->lru, &page_pool);
|
||||
} else if (filemap_add_folio(mapping, folio, index + i,
|
||||
gfp_mask) < 0) {
|
||||
put_page(page);
|
||||
folio_put(folio);
|
||||
read_pages(ractl, &page_pool, true);
|
||||
i = ractl->_index + ractl->_nr_pages - index - 1;
|
||||
continue;
|
||||
}
|
||||
if (i == nr_to_read - lookahead_size)
|
||||
SetPageReadahead(page);
|
||||
folio_set_readahead(folio);
|
||||
ractl->_nr_pages++;
|
||||
}
|
||||
|
||||
@ -309,7 +308,7 @@ void force_page_cache_ra(struct readahead_control *ractl,
|
||||
* Set the initial window size, round to next power of 2 and square
|
||||
* for small size, x 4 for medium, and x 2 for large
|
||||
* for 128k (32 page) max ra
|
||||
* 1-8 page = 32k initial, > 8 page = 128k initial
|
||||
* 1-2 page = 16k, 3-4 page 32k, 5-8 page = 64k, > 8 page = 128k initial
|
||||
*/
|
||||
static unsigned long get_init_ra_size(unsigned long size, unsigned long max)
|
||||
{
|
||||
@ -582,7 +581,7 @@ void page_cache_sync_ra(struct readahead_control *ractl,
|
||||
EXPORT_SYMBOL_GPL(page_cache_sync_ra);
|
||||
|
||||
void page_cache_async_ra(struct readahead_control *ractl,
|
||||
struct page *page, unsigned long req_count)
|
||||
struct folio *folio, unsigned long req_count)
|
||||
{
|
||||
/* no read-ahead */
|
||||
if (!ractl->ra->ra_pages)
|
||||
@ -591,10 +590,10 @@ void page_cache_async_ra(struct readahead_control *ractl,
|
||||
/*
|
||||
* Same bit is used for PG_readahead and PG_reclaim.
|
||||
*/
|
||||
if (PageWriteback(page))
|
||||
if (folio_test_writeback(folio))
|
||||
return;
|
||||
|
||||
ClearPageReadahead(page);
|
||||
folio_clear_readahead(folio);
|
||||
|
||||
/*
|
||||
* Defer asynchronous read-ahead on IO congestion.
|
||||
|
mm/rmap.c (65 changed lines)
@ -34,7 +34,7 @@
|
||||
* mapping->private_lock (in __set_page_dirty_buffers)
|
||||
* lock_page_memcg move_lock (in __set_page_dirty_buffers)
|
||||
* i_pages lock (widely used)
|
||||
* lruvec->lru_lock (in lock_page_lruvec_irq)
|
||||
* lruvec->lru_lock (in folio_lruvec_lock_irq)
|
||||
* inode->i_lock (in set_page_dirty's __mark_inode_dirty)
|
||||
* bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
|
||||
* sb_lock (within inode_lock in fs/fs-writeback.c)
|
||||
@ -621,9 +621,20 @@ void try_to_unmap_flush_dirty(void)
|
||||
try_to_unmap_flush();
|
||||
}
|
||||
|
||||
/*
|
||||
* Bits 0-14 of mm->tlb_flush_batched record pending generations.
|
||||
* Bits 16-30 of mm->tlb_flush_batched bit record flushed generations.
|
||||
*/
|
||||
#define TLB_FLUSH_BATCH_FLUSHED_SHIFT 16
|
||||
#define TLB_FLUSH_BATCH_PENDING_MASK \
|
||||
((1 << (TLB_FLUSH_BATCH_FLUSHED_SHIFT - 1)) - 1)
|
||||
#define TLB_FLUSH_BATCH_PENDING_LARGE \
|
||||
(TLB_FLUSH_BATCH_PENDING_MASK / 2)
|
||||
|
||||
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
|
||||
{
|
||||
struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
|
||||
int batch, nbatch;
|
||||
|
||||
arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
|
||||
tlb_ubc->flush_required = true;
|
||||
@ -633,7 +644,22 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
|
||||
* before the PTE is cleared.
|
||||
*/
|
||||
barrier();
|
||||
mm->tlb_flush_batched = true;
|
||||
batch = atomic_read(&mm->tlb_flush_batched);
|
||||
retry:
|
||||
if ((batch & TLB_FLUSH_BATCH_PENDING_MASK) > TLB_FLUSH_BATCH_PENDING_LARGE) {
|
||||
/*
|
||||
* Prevent `pending' from catching up with `flushed' because of
|
||||
* overflow. Reset `pending' and `flushed' to be 1 and 0 if
|
||||
* `pending' becomes large.
|
||||
*/
|
||||
nbatch = atomic_cmpxchg(&mm->tlb_flush_batched, batch, 1);
|
||||
if (nbatch != batch) {
|
||||
batch = nbatch;
|
||||
goto retry;
|
||||
}
|
||||
} else {
|
||||
atomic_inc(&mm->tlb_flush_batched);
|
||||
}
|
||||
|
||||
/*
|
||||
* If the PTE was dirty then it's best to assume it's writable. The
|
||||
@ -680,15 +706,18 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
|
||||
*/
|
||||
void flush_tlb_batched_pending(struct mm_struct *mm)
|
||||
{
|
||||
if (data_race(mm->tlb_flush_batched)) {
|
||||
flush_tlb_mm(mm);
|
||||
int batch = atomic_read(&mm->tlb_flush_batched);
|
||||
int pending = batch & TLB_FLUSH_BATCH_PENDING_MASK;
|
||||
int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT;
|
||||
|
||||
if (pending != flushed) {
|
||||
flush_tlb_mm(mm);
|
||||
/*
|
||||
* Do not allow the compiler to re-order the clearing of
|
||||
* tlb_flush_batched before the tlb is flushed.
|
||||
* If the new TLB flushing is pending during flushing, leave
|
||||
* mm->tlb_flush_batched as is, to avoid losing flushing.
|
||||
*/
|
||||
barrier();
|
||||
mm->tlb_flush_batched = false;
|
||||
atomic_cmpxchg(&mm->tlb_flush_batched, batch,
|
||||
pending | (pending << TLB_FLUSH_BATCH_FLUSHED_SHIFT));
|
||||
}
|
||||
}
|
||||
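The change above turns mm->tlb_flush_batched from a bool into a single value packing two generation counters: bits 0-14 count batches that became pending, bits 16-30 count batches already flushed, and a flush is needed whenever the two halves differ. A standalone model of that encoding (without the atomics and overflow reset of the real code):

/* Illustrative sketch only: pending/flushed generations packed in one word. */
#include <stdio.h>

#define FLUSHED_SHIFT	16
#define PENDING_MASK	((1 << (FLUSHED_SHIFT - 1)) - 1)

static int batch;	/* stand-in for the atomic mm->tlb_flush_batched */

static void mark_pending(void)
{
	batch++;			/* atomic_inc() in the kernel code */
}

static void flush_if_needed(void)
{
	int pending = batch & PENDING_MASK;
	int flushed = batch >> FLUSHED_SHIFT;

	if (pending != flushed) {
		printf("flush TLB\n");
		/* publish: flushed catches up with pending */
		batch = pending | (pending << FLUSHED_SHIFT);
	}
}

int main(void)
{
	mark_pending();
	mark_pending();
	flush_if_needed();	/* pending=2, flushed=0 -> flushes */
	flush_if_needed();	/* pending=2, flushed=2 -> nothing to do */
	return 0;
}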
#else
|
||||
@ -981,7 +1010,7 @@ static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg)
|
||||
return true;
|
||||
}
|
||||
|
||||
int page_mkclean(struct page *page)
|
||||
int folio_mkclean(struct folio *folio)
|
||||
{
|
||||
int cleaned = 0;
|
||||
struct address_space *mapping;
|
||||
@ -991,20 +1020,20 @@ int page_mkclean(struct page *page)
|
||||
.invalid_vma = invalid_mkclean_vma,
|
||||
};
|
||||
|
||||
BUG_ON(!PageLocked(page));
|
||||
BUG_ON(!folio_test_locked(folio));
|
||||
|
||||
if (!page_mapped(page))
|
||||
if (!folio_mapped(folio))
|
||||
return 0;
|
||||
|
||||
mapping = page_mapping(page);
|
||||
mapping = folio_mapping(folio);
|
||||
if (!mapping)
|
||||
return 0;
|
||||
|
||||
rmap_walk(page, &rwc);
|
||||
rmap_walk(&folio->page, &rwc);
|
||||
|
||||
return cleaned;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(page_mkclean);
|
||||
EXPORT_SYMBOL_GPL(folio_mkclean);
|
||||
|
||||
/**
|
||||
* page_move_anon_rmap - move a page to our anon_vma
|
||||
@ -1807,6 +1836,7 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma,
|
||||
update_hiwater_rss(mm);
|
||||
|
||||
if (is_zone_device_page(page)) {
|
||||
unsigned long pfn = page_to_pfn(page);
|
||||
swp_entry_t entry;
|
||||
pte_t swp_pte;
|
||||
|
||||
@ -1815,8 +1845,11 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma,
|
||||
* pte. do_swap_page() will wait until the migration
|
||||
* pte is removed and then restart fault handling.
|
||||
*/
|
||||
entry = make_readable_migration_entry(
|
||||
page_to_pfn(page));
|
||||
entry = pte_to_swp_entry(pteval);
|
||||
if (is_writable_device_private_entry(entry))
|
||||
entry = make_writable_migration_entry(pfn);
|
||||
else
|
||||
entry = make_readable_migration_entry(pfn);
|
||||
swp_pte = swp_entry_to_pte(entry);
|
||||
|
||||
/*
|
||||
|
302
mm/shmem.c
@ -36,7 +36,6 @@
|
||||
#include <linux/uio.h>
|
||||
#include <linux/khugepaged.h>
|
||||
#include <linux/hugetlb.h>
|
||||
#include <linux/frontswap.h>
|
||||
#include <linux/fs_parser.h>
|
||||
#include <linux/swapfile.h>
|
||||
|
||||
@ -59,7 +58,6 @@ static struct vfsmount *shm_mnt;
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/shmem_fs.h>
|
||||
#include <linux/writeback.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/pagevec.h>
|
||||
#include <linux/percpu_counter.h>
|
||||
#include <linux/falloc.h>
|
||||
@ -700,7 +698,6 @@ static int shmem_add_to_page_cache(struct page *page,
|
||||
struct mm_struct *charge_mm)
|
||||
{
|
||||
XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page));
|
||||
unsigned long i = 0;
|
||||
unsigned long nr = compound_nr(page);
|
||||
int error;
|
||||
|
||||
@ -715,7 +712,7 @@ static int shmem_add_to_page_cache(struct page *page,
|
||||
page->index = index;
|
||||
|
||||
if (!PageSwapCache(page)) {
|
||||
error = mem_cgroup_charge(page, charge_mm, gfp);
|
||||
error = mem_cgroup_charge(page_folio(page), charge_mm, gfp);
|
||||
if (error) {
|
||||
if (PageTransHuge(page)) {
|
||||
count_vm_event(THP_FILE_FALLBACK);
|
||||
@ -727,20 +724,18 @@ static int shmem_add_to_page_cache(struct page *page,
|
||||
cgroup_throttle_swaprate(page, gfp);
|
||||
|
||||
do {
|
||||
void *entry;
|
||||
xas_lock_irq(&xas);
|
||||
entry = xas_find_conflict(&xas);
|
||||
if (entry != expected)
|
||||
if (expected != xas_find_conflict(&xas)) {
|
||||
xas_set_err(&xas, -EEXIST);
|
||||
xas_create_range(&xas);
|
||||
goto unlock;
|
||||
}
|
||||
if (expected && xas_find_conflict(&xas)) {
|
||||
xas_set_err(&xas, -EEXIST);
|
||||
goto unlock;
|
||||
}
|
||||
xas_store(&xas, page);
|
||||
if (xas_error(&xas))
|
||||
goto unlock;
|
||||
next:
|
||||
xas_store(&xas, page);
|
||||
if (++i < nr) {
|
||||
xas_next(&xas);
|
||||
goto next;
|
||||
}
|
||||
if (PageTransHuge(page)) {
|
||||
count_vm_event(THP_FILE_ALLOC);
|
||||
__mod_lruvec_page_state(page, NR_SHMEM_THPS, nr);
|
||||
@ -861,9 +856,8 @@ unsigned long shmem_swap_usage(struct vm_area_struct *vma)
|
||||
return swapped << PAGE_SHIFT;
|
||||
|
||||
/* Here comes the more involved part */
|
||||
return shmem_partial_swap_usage(mapping,
|
||||
linear_page_index(vma, vma->vm_start),
|
||||
linear_page_index(vma, vma->vm_end));
|
||||
return shmem_partial_swap_usage(mapping, vma->vm_pgoff,
|
||||
vma->vm_pgoff + vma_pages(vma));
|
||||
}
|
||||
|
||||
/*
|
||||
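The shmem_swap_usage() hunk above replaces the two linear_page_index() calls with vma->vm_pgoff and vma->vm_pgoff + vma_pages(vma). The two forms are equivalent because, for a normal VMA, linear_page_index() is just the distance from vm_start in pages plus vm_pgoff. A stand-alone check of that arithmetic, with a hypothetical 4 KiB page size and example values, not kernel code:

#include <assert.h>
#include <stdio.h>

#define PAGE_SHIFT 12UL                    /* assume 4 KiB pages for the demo */

struct vma { unsigned long vm_start, vm_end, vm_pgoff; };

/* Demo version of the index calculation for page-aligned addresses. */
static unsigned long linear_page_index(const struct vma *v, unsigned long addr)
{
        return ((addr - v->vm_start) >> PAGE_SHIFT) + v->vm_pgoff;
}

static unsigned long vma_pages(const struct vma *v)
{
        return (v->vm_end - v->vm_start) >> PAGE_SHIFT;
}

int main(void)
{
        struct vma v = { .vm_start = 0x7f0000000000UL,
                         .vm_end   = 0x7f0000040000UL,   /* 64 pages */
                         .vm_pgoff = 100 };

        assert(linear_page_index(&v, v.vm_start) == v.vm_pgoff);
        assert(linear_page_index(&v, v.vm_end) == v.vm_pgoff + vma_pages(&v));
        printf("swap range scanned: [%lu, %lu)\n",
               v.vm_pgoff, v.vm_pgoff + vma_pages(&v));
        return 0;
}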
@ -887,30 +881,26 @@ void shmem_unlock_mapping(struct address_space *mapping)
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Check whether a hole-punch or truncation needs to split a huge page,
|
||||
* returning true if no split was required, or the split has been successful.
|
||||
*
|
||||
* Eviction (or truncation to 0 size) should never need to split a huge page;
|
||||
* but in rare cases might do so, if shmem_undo_range() failed to trylock on
|
||||
* head, and then succeeded to trylock on tail.
|
||||
*
|
||||
* A split can only succeed when there are no additional references on the
|
||||
* huge page: so the split below relies upon find_get_entries() having stopped
|
||||
* when it found a subpage of the huge page, without getting further references.
|
||||
*/
|
||||
static bool shmem_punch_compound(struct page *page, pgoff_t start, pgoff_t end)
|
||||
static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
|
||||
{
|
||||
if (!PageTransCompound(page))
|
||||
return true;
|
||||
struct folio *folio;
|
||||
struct page *page;
|
||||
|
||||
/* Just proceed to delete a huge page wholly within the range punched */
|
||||
if (PageHead(page) &&
|
||||
page->index >= start && page->index + HPAGE_PMD_NR <= end)
|
||||
return true;
|
||||
|
||||
/* Try to split huge page, so we can truly punch the hole or truncate */
|
||||
return split_huge_page(page) >= 0;
|
||||
/*
|
||||
* At first avoid shmem_getpage(,,,SGP_READ): that fails
|
||||
* beyond i_size, and reports fallocated pages as holes.
|
||||
*/
|
||||
folio = __filemap_get_folio(inode->i_mapping, index,
|
||||
FGP_ENTRY | FGP_LOCK, 0);
|
||||
if (!xa_is_value(folio))
|
||||
return folio;
|
||||
/*
|
||||
* But read a page back from swap if any of it is within i_size
|
||||
* (although in some cases this is just a waste of time).
|
||||
*/
|
||||
page = NULL;
|
||||
shmem_getpage(inode, index, &page, SGP_READ);
|
||||
return page ? page_folio(page) : NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -924,10 +914,10 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
|
||||
struct shmem_inode_info *info = SHMEM_I(inode);
|
||||
pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
|
||||
pgoff_t end = (lend + 1) >> PAGE_SHIFT;
|
||||
unsigned int partial_start = lstart & (PAGE_SIZE - 1);
|
||||
unsigned int partial_end = (lend + 1) & (PAGE_SIZE - 1);
|
||||
struct pagevec pvec;
|
||||
struct folio_batch fbatch;
|
||||
pgoff_t indices[PAGEVEC_SIZE];
|
||||
struct folio *folio;
|
||||
bool same_folio;
|
||||
long nr_swaps_freed = 0;
|
||||
pgoff_t index;
|
||||
int i;
|
||||
@ -938,67 +928,64 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
|
||||
if (info->fallocend > start && info->fallocend <= end && !unfalloc)
|
||||
info->fallocend = start;
|
||||
|
||||
pagevec_init(&pvec);
|
||||
folio_batch_init(&fbatch);
|
||||
index = start;
|
||||
while (index < end && find_lock_entries(mapping, index, end - 1,
|
||||
&pvec, indices)) {
|
||||
for (i = 0; i < pagevec_count(&pvec); i++) {
|
||||
struct page *page = pvec.pages[i];
|
||||
&fbatch, indices)) {
|
||||
for (i = 0; i < folio_batch_count(&fbatch); i++) {
|
||||
folio = fbatch.folios[i];
|
||||
|
||||
index = indices[i];
|
||||
|
||||
if (xa_is_value(page)) {
|
||||
if (xa_is_value(folio)) {
|
||||
if (unfalloc)
|
||||
continue;
|
||||
nr_swaps_freed += !shmem_free_swap(mapping,
|
||||
index, page);
|
||||
index, folio);
|
||||
continue;
|
||||
}
|
||||
index += thp_nr_pages(page) - 1;
|
||||
index += folio_nr_pages(folio) - 1;
|
||||
|
||||
if (!unfalloc || !PageUptodate(page))
|
||||
truncate_inode_page(mapping, page);
|
||||
unlock_page(page);
|
||||
if (!unfalloc || !folio_test_uptodate(folio))
|
||||
truncate_inode_folio(mapping, folio);
|
||||
folio_unlock(folio);
|
||||
}
|
||||
pagevec_remove_exceptionals(&pvec);
|
||||
pagevec_release(&pvec);
|
||||
folio_batch_remove_exceptionals(&fbatch);
|
||||
folio_batch_release(&fbatch);
|
||||
cond_resched();
|
||||
index++;
|
||||
}
|
||||
|
||||
if (partial_start) {
|
||||
struct page *page = NULL;
|
||||
shmem_getpage(inode, start - 1, &page, SGP_READ);
|
||||
if (page) {
|
||||
unsigned int top = PAGE_SIZE;
|
||||
if (start > end) {
|
||||
top = partial_end;
|
||||
partial_end = 0;
|
||||
}
|
||||
zero_user_segment(page, partial_start, top);
|
||||
set_page_dirty(page);
|
||||
unlock_page(page);
|
||||
put_page(page);
|
||||
same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
|
||||
folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT);
|
||||
if (folio) {
|
||||
same_folio = lend < folio_pos(folio) + folio_size(folio);
|
||||
folio_mark_dirty(folio);
|
||||
if (!truncate_inode_partial_folio(folio, lstart, lend)) {
|
||||
start = folio->index + folio_nr_pages(folio);
|
||||
if (same_folio)
|
||||
end = folio->index;
|
||||
}
|
||||
folio_unlock(folio);
|
||||
folio_put(folio);
|
||||
folio = NULL;
|
||||
}
|
||||
if (partial_end) {
|
||||
struct page *page = NULL;
|
||||
shmem_getpage(inode, end, &page, SGP_READ);
|
||||
if (page) {
|
||||
zero_user_segment(page, 0, partial_end);
|
||||
set_page_dirty(page);
|
||||
unlock_page(page);
|
||||
put_page(page);
|
||||
}
|
||||
|
||||
if (!same_folio)
|
||||
folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT);
|
||||
if (folio) {
|
||||
folio_mark_dirty(folio);
|
||||
if (!truncate_inode_partial_folio(folio, lstart, lend))
|
||||
end = folio->index;
|
||||
folio_unlock(folio);
|
||||
folio_put(folio);
|
||||
}
|
||||
if (start >= end)
|
||||
return;
|
||||
|
||||
index = start;
|
||||
while (index < end) {
|
||||
cond_resched();
|
||||
|
||||
if (!find_get_entries(mapping, index, end - 1, &pvec,
|
||||
if (!find_get_entries(mapping, index, end - 1, &fbatch,
|
||||
indices)) {
|
||||
/* If all gone or hole-punch or unfalloc, we're done */
|
||||
if (index == start || end != -1)
|
||||
@ -1007,14 +994,14 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
|
||||
index = start;
|
||||
continue;
|
||||
}
|
||||
for (i = 0; i < pagevec_count(&pvec); i++) {
|
||||
struct page *page = pvec.pages[i];
|
||||
for (i = 0; i < folio_batch_count(&fbatch); i++) {
|
||||
folio = fbatch.folios[i];
|
||||
|
||||
index = indices[i];
|
||||
if (xa_is_value(page)) {
|
||||
if (xa_is_value(folio)) {
|
||||
if (unfalloc)
|
||||
continue;
|
||||
if (shmem_free_swap(mapping, index, page)) {
|
||||
if (shmem_free_swap(mapping, index, folio)) {
|
||||
/* Swap was replaced by page: retry */
|
||||
index--;
|
||||
break;
|
||||
@ -1023,32 +1010,24 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
|
||||
continue;
|
||||
}
|
||||
|
||||
lock_page(page);
|
||||
folio_lock(folio);
|
||||
|
||||
if (!unfalloc || !PageUptodate(page)) {
|
||||
if (page_mapping(page) != mapping) {
|
||||
if (!unfalloc || !folio_test_uptodate(folio)) {
|
||||
if (folio_mapping(folio) != mapping) {
|
||||
/* Page was replaced by swap: retry */
|
||||
unlock_page(page);
|
||||
folio_unlock(folio);
|
||||
index--;
|
||||
break;
|
||||
}
|
||||
VM_BUG_ON_PAGE(PageWriteback(page), page);
|
||||
if (shmem_punch_compound(page, start, end))
|
||||
truncate_inode_page(mapping, page);
|
||||
else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
|
||||
/* Wipe the page and don't get stuck */
|
||||
clear_highpage(page);
|
||||
flush_dcache_page(page);
|
||||
set_page_dirty(page);
|
||||
if (index <
|
||||
round_up(start, HPAGE_PMD_NR))
|
||||
start = index + 1;
|
||||
}
|
||||
VM_BUG_ON_FOLIO(folio_test_writeback(folio),
|
||||
folio);
|
||||
truncate_inode_folio(mapping, folio);
|
||||
}
|
||||
unlock_page(page);
|
||||
index = folio->index + folio_nr_pages(folio) - 1;
|
||||
folio_unlock(folio);
|
||||
}
|
||||
pagevec_remove_exceptionals(&pvec);
|
||||
pagevec_release(&pvec);
|
||||
folio_batch_remove_exceptionals(&fbatch);
|
||||
folio_batch_release(&fbatch);
|
||||
index++;
|
||||
}
|
||||
|
||||
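In the shmem_undo_range() rework above, the old partial_start/partial_end byte offsets give way to whole-folio logic: same_folio records whether lstart and lend fall in the same page-sized unit (a large folio only widens that unit), and truncate_inode_partial_folio() is asked to handle each end. The index arithmetic it relies on can be checked in isolation; the snippet below assumes 4 KiB pages and is an illustration, not kernel code:

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT 12UL
#define PAGE_SIZE  (1UL << PAGE_SHIFT)

int main(void)
{
        /* Punch a hole from byte 1000 to byte 9999 inclusive (lend is inclusive). */
        unsigned long lstart = 1000, lend = 9999;

        unsigned long start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT; /* first whole page: 1 */
        unsigned long end = (lend + 1) >> PAGE_SHIFT;                 /* first page past the whole-page run: 2 */
        bool same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);

        printf("whole pages to drop: [%lu, %lu)\n", start, end);
        printf("partial head in page %lu, partial tail in page %lu, same_folio=%d\n",
               lstart >> PAGE_SHIFT, lend >> PAGE_SHIFT, same_folio);
        return 0;
}

With these values only page 1 is dropped outright, while pages 0 and 2 are the partial ends that the new code hands to truncate_inode_partial_folio().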
@ -1172,7 +1151,7 @@ static void shmem_evict_inode(struct inode *inode)
|
||||
static int shmem_find_swap_entries(struct address_space *mapping,
|
||||
pgoff_t start, unsigned int nr_entries,
|
||||
struct page **entries, pgoff_t *indices,
|
||||
unsigned int type, bool frontswap)
|
||||
unsigned int type)
|
||||
{
|
||||
XA_STATE(xas, &mapping->i_pages, start);
|
||||
struct page *page;
|
||||
@ -1193,9 +1172,6 @@ static int shmem_find_swap_entries(struct address_space *mapping,
|
||||
entry = radix_to_swp_entry(page);
|
||||
if (swp_type(entry) != type)
|
||||
continue;
|
||||
if (frontswap &&
|
||||
!frontswap_test(swap_info[type], swp_offset(entry)))
|
||||
continue;
|
||||
|
||||
indices[ret] = xas.xa_index;
|
||||
entries[ret] = page;
|
||||
@ -1248,26 +1224,20 @@ static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec,
|
||||
/*
|
||||
* If swap found in inode, free it and move page from swapcache to filecache.
|
||||
*/
|
||||
static int shmem_unuse_inode(struct inode *inode, unsigned int type,
|
||||
bool frontswap, unsigned long *fs_pages_to_unuse)
|
||||
static int shmem_unuse_inode(struct inode *inode, unsigned int type)
|
||||
{
|
||||
struct address_space *mapping = inode->i_mapping;
|
||||
pgoff_t start = 0;
|
||||
struct pagevec pvec;
|
||||
pgoff_t indices[PAGEVEC_SIZE];
|
||||
bool frontswap_partial = (frontswap && *fs_pages_to_unuse > 0);
|
||||
int ret = 0;
|
||||
|
||||
pagevec_init(&pvec);
|
||||
do {
|
||||
unsigned int nr_entries = PAGEVEC_SIZE;
|
||||
|
||||
if (frontswap_partial && *fs_pages_to_unuse < PAGEVEC_SIZE)
|
||||
nr_entries = *fs_pages_to_unuse;
|
||||
|
||||
pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries,
|
||||
pvec.pages, indices,
|
||||
type, frontswap);
|
||||
pvec.pages, indices, type);
|
||||
if (pvec.nr == 0) {
|
||||
ret = 0;
|
||||
break;
|
||||
@ -1277,14 +1247,6 @@ static int shmem_unuse_inode(struct inode *inode, unsigned int type,
|
||||
if (ret < 0)
|
||||
break;
|
||||
|
||||
if (frontswap_partial) {
|
||||
*fs_pages_to_unuse -= ret;
|
||||
if (*fs_pages_to_unuse == 0) {
|
||||
ret = FRONTSWAP_PAGES_UNUSED;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
start = indices[pvec.nr - 1];
|
||||
} while (true);
|
||||
|
||||
@ -1296,8 +1258,7 @@ static int shmem_unuse_inode(struct inode *inode, unsigned int type,
|
||||
* device 'type' back into memory, so the swap device can be
|
||||
* unused.
|
||||
*/
|
||||
int shmem_unuse(unsigned int type, bool frontswap,
|
||||
unsigned long *fs_pages_to_unuse)
|
||||
int shmem_unuse(unsigned int type)
|
||||
{
|
||||
struct shmem_inode_info *info, *next;
|
||||
int error = 0;
|
||||
@ -1320,8 +1281,7 @@ int shmem_unuse(unsigned int type, bool frontswap,
|
||||
atomic_inc(&info->stop_eviction);
|
||||
mutex_unlock(&shmem_swaplist_mutex);
|
||||
|
||||
error = shmem_unuse_inode(&info->vfs_inode, type, frontswap,
|
||||
fs_pages_to_unuse);
|
||||
error = shmem_unuse_inode(&info->vfs_inode, type);
|
||||
cond_resched();
|
||||
|
||||
mutex_lock(&shmem_swaplist_mutex);
|
||||
@ -1566,8 +1526,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
|
||||
return NULL;
|
||||
|
||||
shmem_pseudo_vma_init(&pvma, info, hindex);
|
||||
page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(),
|
||||
true);
|
||||
page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, true);
|
||||
shmem_pseudo_vma_destroy(&pvma);
|
||||
if (page)
|
||||
prep_transhuge_page(page);
|
||||
@ -1642,6 +1601,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
|
||||
struct shmem_inode_info *info, pgoff_t index)
|
||||
{
|
||||
struct page *oldpage, *newpage;
|
||||
struct folio *old, *new;
|
||||
struct address_space *swap_mapping;
|
||||
swp_entry_t entry;
|
||||
pgoff_t swap_index;
|
||||
@ -1678,7 +1638,9 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
|
||||
xa_lock_irq(&swap_mapping->i_pages);
|
||||
error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage);
|
||||
if (!error) {
|
||||
mem_cgroup_migrate(oldpage, newpage);
|
||||
old = page_folio(oldpage);
|
||||
new = page_folio(newpage);
|
||||
mem_cgroup_migrate(old, new);
|
||||
__inc_lruvec_page_state(newpage, NR_FILE_PAGES);
|
||||
__dec_lruvec_page_state(oldpage, NR_FILE_PAGES);
|
||||
}
|
||||
@ -2307,6 +2269,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
|
||||
INIT_LIST_HEAD(&info->swaplist);
|
||||
simple_xattrs_init(&info->xattrs);
|
||||
cache_no_acl(inode);
|
||||
mapping_set_large_folios(inode->i_mapping);
|
||||
|
||||
switch (mode & S_IFMT) {
|
||||
default:
|
||||
@ -2429,7 +2392,6 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
|
||||
shmem_recalc_inode(inode);
|
||||
spin_unlock_irq(&info->lock);
|
||||
|
||||
SetPageDirty(page);
|
||||
unlock_page(page);
|
||||
return 0;
|
||||
out_delete_from_cache:
|
||||
@ -2461,6 +2423,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
|
||||
struct inode *inode = mapping->host;
|
||||
struct shmem_inode_info *info = SHMEM_I(inode);
|
||||
pgoff_t index = pos >> PAGE_SHIFT;
|
||||
int ret = 0;
|
||||
|
||||
/* i_rwsem is held by caller */
|
||||
if (unlikely(info->seals & (F_SEAL_GROW |
|
||||
@ -2471,7 +2434,19 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
|
||||
return -EPERM;
|
||||
}
|
||||
|
||||
return shmem_getpage(inode, index, pagep, SGP_WRITE);
|
||||
ret = shmem_getpage(inode, index, pagep, SGP_WRITE);
|
||||
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
if (PageHWPoison(*pagep)) {
|
||||
unlock_page(*pagep);
|
||||
put_page(*pagep);
|
||||
*pagep = NULL;
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int
|
||||
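shmem_write_begin() above now rejects hardware-poisoned pages before handing them to the caller, and the same shape repeats later in this commit for shmem_file_read_iter(), shmem_get_link() and shmem_read_mapping_page_gfp(): look the page up, and if it is poisoned, drop the lock and the reference and fail (with -EIO here, -ECHILD for symlinks). A toy user-space rendering of that error path, with invented helpers standing in for shmem_getpage()/unlock_page()/put_page(), not kernel code:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_page { bool hwpoison; int refs; bool locked; };

static int toy_getpage(struct toy_page **out, bool poisoned)
{
        struct toy_page *p = calloc(1, sizeof(*p));

        if (!p)
                return -ENOMEM;
        p->hwpoison = poisoned;
        p->refs = 1;
        p->locked = true;
        *out = p;
        return 0;
}

static void toy_unlock(struct toy_page *p) { p->locked = false; }
static void toy_put(struct toy_page *p)    { if (--p->refs == 0) free(p); }

/* The write_begin-style pattern: fail with -EIO and leave *pagep NULL. */
static int toy_write_begin(struct toy_page **pagep, bool poisoned)
{
        int ret = toy_getpage(pagep, poisoned);

        if (ret)
                return ret;
        if ((*pagep)->hwpoison) {
                toy_unlock(*pagep);
                toy_put(*pagep);
                *pagep = NULL;
                return -EIO;
        }
        return 0;
}

int main(void)
{
        struct toy_page *page = NULL;

        printf("healthy:  %d\n", toy_write_begin(&page, false));  /* 0 */
        if (page) { toy_unlock(page); toy_put(page); }
        printf("poisoned: %d\n", toy_write_begin(&page, true));   /* -EIO */
        return 0;
}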
@ -2558,6 +2533,12 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
|
||||
if (sgp == SGP_CACHE)
|
||||
set_page_dirty(page);
|
||||
unlock_page(page);
|
||||
|
||||
if (PageHWPoison(page)) {
|
||||
put_page(page);
|
||||
error = -EIO;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
@ -2950,28 +2931,6 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
|
||||
return shmem_unlink(dir, dentry);
|
||||
}
|
||||
|
||||
static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
|
||||
{
|
||||
bool old_is_dir = d_is_dir(old_dentry);
|
||||
bool new_is_dir = d_is_dir(new_dentry);
|
||||
|
||||
if (old_dir != new_dir && old_is_dir != new_is_dir) {
|
||||
if (old_is_dir) {
|
||||
drop_nlink(old_dir);
|
||||
inc_nlink(new_dir);
|
||||
} else {
|
||||
drop_nlink(new_dir);
|
||||
inc_nlink(old_dir);
|
||||
}
|
||||
}
|
||||
old_dir->i_ctime = old_dir->i_mtime =
|
||||
new_dir->i_ctime = new_dir->i_mtime =
|
||||
d_inode(old_dentry)->i_ctime =
|
||||
d_inode(new_dentry)->i_ctime = current_time(old_dir);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int shmem_whiteout(struct user_namespace *mnt_userns,
|
||||
struct inode *old_dir, struct dentry *old_dentry)
|
||||
{
|
||||
@ -3017,7 +2976,7 @@ static int shmem_rename2(struct user_namespace *mnt_userns,
|
||||
return -EINVAL;
|
||||
|
||||
if (flags & RENAME_EXCHANGE)
|
||||
return shmem_exchange(old_dir, old_dentry, new_dir, new_dentry);
|
||||
return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
|
||||
|
||||
if (!simple_empty(new_dentry))
|
||||
return -ENOTEMPTY;
|
||||
@ -3119,7 +3078,8 @@ static const char *shmem_get_link(struct dentry *dentry,
|
||||
page = find_get_page(inode->i_mapping, 0);
|
||||
if (!page)
|
||||
return ERR_PTR(-ECHILD);
|
||||
if (!PageUptodate(page)) {
|
||||
if (PageHWPoison(page) ||
|
||||
!PageUptodate(page)) {
|
||||
put_page(page);
|
||||
return ERR_PTR(-ECHILD);
|
||||
}
|
||||
@ -3127,6 +3087,13 @@ static const char *shmem_get_link(struct dentry *dentry,
|
||||
error = shmem_getpage(inode, 0, &page, SGP_READ);
|
||||
if (error)
|
||||
return ERR_PTR(error);
|
||||
if (!page)
|
||||
return ERR_PTR(-ECHILD);
|
||||
if (PageHWPoison(page)) {
|
||||
unlock_page(page);
|
||||
put_page(page);
|
||||
return ERR_PTR(-ECHILD);
|
||||
}
|
||||
unlock_page(page);
|
||||
}
|
||||
set_delayed_call(done, shmem_put_link, page);
|
||||
@ -3777,6 +3744,13 @@ static void shmem_destroy_inodecache(void)
|
||||
kmem_cache_destroy(shmem_inode_cachep);
|
||||
}
|
||||
|
||||
/* Keep the page in page cache instead of truncating it */
|
||||
static int shmem_error_remove_page(struct address_space *mapping,
|
||||
struct page *page)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
const struct address_space_operations shmem_aops = {
|
||||
.writepage = shmem_writepage,
|
||||
.set_page_dirty = __set_page_dirty_no_writeback,
|
||||
@ -3787,7 +3761,7 @@ const struct address_space_operations shmem_aops = {
|
||||
#ifdef CONFIG_MIGRATION
|
||||
.migratepage = migrate_page,
|
||||
#endif
|
||||
.error_remove_page = generic_error_remove_page,
|
||||
.error_remove_page = shmem_error_remove_page,
|
||||
};
|
||||
EXPORT_SYMBOL(shmem_aops);
|
||||
|
||||
@ -3897,7 +3871,7 @@ static struct file_system_type shmem_fs_type = {
|
||||
.parameters = shmem_fs_parameters,
|
||||
#endif
|
||||
.kill_sb = kill_litter_super,
|
||||
.fs_flags = FS_USERNS_MOUNT | FS_THP_SUPPORT,
|
||||
.fs_flags = FS_USERNS_MOUNT,
|
||||
};
|
||||
|
||||
int __init shmem_init(void)
|
||||
@ -4021,8 +3995,7 @@ int __init shmem_init(void)
|
||||
return 0;
|
||||
}
|
||||
|
||||
int shmem_unuse(unsigned int type, bool frontswap,
|
||||
unsigned long *fs_pages_to_unuse)
|
||||
int shmem_unuse(unsigned int type)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
@ -4195,9 +4168,14 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
|
||||
error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE,
|
||||
gfp, NULL, NULL, NULL);
|
||||
if (error)
|
||||
page = ERR_PTR(error);
|
||||
else
|
||||
unlock_page(page);
|
||||
return ERR_PTR(error);
|
||||
|
||||
unlock_page(page);
|
||||
if (PageHWPoison(page)) {
|
||||
put_page(page);
|
||||
return ERR_PTR(-EIO);
|
||||
}
|
||||
|
||||
return page;
|
||||
#else
|
||||
/*
|
||||
|
301
mm/slab.h
@ -5,6 +5,191 @@
|
||||
* Internal slab definitions
|
||||
*/
|
||||
|
||||
/* Reuses the bits in struct page */
|
||||
struct slab {
|
||||
unsigned long __page_flags;
|
||||
|
||||
#if defined(CONFIG_SLAB)
|
||||
|
||||
union {
|
||||
struct list_head slab_list;
|
||||
struct rcu_head rcu_head;
|
||||
};
|
||||
struct kmem_cache *slab_cache;
|
||||
void *freelist; /* array of free object indexes */
|
||||
void *s_mem; /* first object */
|
||||
unsigned int active;
|
||||
|
||||
#elif defined(CONFIG_SLUB)
|
||||
|
||||
union {
|
||||
struct list_head slab_list;
|
||||
struct rcu_head rcu_head;
|
||||
#ifdef CONFIG_SLUB_CPU_PARTIAL
|
||||
struct {
|
||||
struct slab *next;
|
||||
int slabs; /* Nr of slabs left */
|
||||
};
|
||||
#endif
|
||||
};
|
||||
struct kmem_cache *slab_cache;
|
||||
/* Double-word boundary */
|
||||
void *freelist; /* first free object */
|
||||
union {
|
||||
unsigned long counters;
|
||||
struct {
|
||||
unsigned inuse:16;
|
||||
unsigned objects:15;
|
||||
unsigned frozen:1;
|
||||
};
|
||||
};
|
||||
unsigned int __unused;
|
||||
|
||||
#elif defined(CONFIG_SLOB)
|
||||
|
||||
struct list_head slab_list;
|
||||
void *__unused_1;
|
||||
void *freelist; /* first free block */
|
||||
long units;
|
||||
unsigned int __unused_2;
|
||||
|
||||
#else
|
||||
#error "Unexpected slab allocator configured"
|
||||
#endif
|
||||
|
||||
atomic_t __page_refcount;
|
||||
#ifdef CONFIG_MEMCG
|
||||
unsigned long memcg_data;
|
||||
#endif
|
||||
};
|
||||
|
||||
#define SLAB_MATCH(pg, sl) \
|
||||
static_assert(offsetof(struct page, pg) == offsetof(struct slab, sl))
|
||||
SLAB_MATCH(flags, __page_flags);
|
||||
SLAB_MATCH(compound_head, slab_list); /* Ensure bit 0 is clear */
|
||||
#ifndef CONFIG_SLOB
|
||||
SLAB_MATCH(rcu_head, rcu_head);
|
||||
#endif
|
||||
SLAB_MATCH(_refcount, __page_refcount);
|
||||
#ifdef CONFIG_MEMCG
|
||||
SLAB_MATCH(memcg_data, memcg_data);
|
||||
#endif
|
||||
#undef SLAB_MATCH
|
||||
static_assert(sizeof(struct slab) <= sizeof(struct page));
|
||||
|
||||
/**
|
||||
* folio_slab - Converts from folio to slab.
|
||||
* @folio: The folio.
|
||||
*
|
||||
* Currently struct slab is a different representation of a folio where
|
||||
* folio_test_slab() is true.
|
||||
*
|
||||
* Return: The slab which contains this folio.
|
||||
*/
|
||||
#define folio_slab(folio) (_Generic((folio), \
|
||||
const struct folio *: (const struct slab *)(folio), \
|
||||
struct folio *: (struct slab *)(folio)))
|
||||
|
||||
/**
|
||||
* slab_folio - The folio allocated for a slab
|
||||
* @slab: The slab.
|
||||
*
|
||||
* Slabs are allocated as folios that contain the individual objects and are
|
||||
* using some fields in the first struct page of the folio - those fields are
|
||||
* now accessed by struct slab. It is occasionally necessary to convert back to
|
||||
* a folio in order to communicate with the rest of the mm. Please use this
|
||||
* helper function instead of casting yourself, as the implementation may change
|
||||
* in the future.
|
||||
*/
|
||||
#define slab_folio(s) (_Generic((s), \
|
||||
const struct slab *: (const struct folio *)s, \
|
||||
struct slab *: (struct folio *)s))
|
||||
|
||||
/**
|
||||
* page_slab - Converts from first struct page to slab.
|
||||
* @p: The first (either head of compound or single) page of slab.
|
||||
*
|
||||
* A temporary wrapper to convert struct page to struct slab in situations where
|
||||
* we know the page is the compound head, or single order-0 page.
|
||||
*
|
||||
* Long-term ideally everything would work with struct slab directly or go
|
||||
* through folio to struct slab.
|
||||
*
|
||||
* Return: The slab which contains this page
|
||||
*/
|
||||
#define page_slab(p) (_Generic((p), \
|
||||
const struct page *: (const struct slab *)(p), \
|
||||
struct page *: (struct slab *)(p)))
|
||||
|
||||
/**
|
||||
* slab_page - The first struct page allocated for a slab
|
||||
* @slab: The slab.
|
||||
*
|
||||
* A convenience wrapper for converting slab to the first struct page of the
|
||||
* underlying folio, to communicate with code not yet converted to folio or
|
||||
* struct slab.
|
||||
*/
|
||||
#define slab_page(s) folio_page(slab_folio(s), 0)
|
||||
|
||||
/*
|
||||
* If network-based swap is enabled, sl*b must keep track of whether pages
|
||||
* were allocated from pfmemalloc reserves.
|
||||
*/
|
||||
static inline bool slab_test_pfmemalloc(const struct slab *slab)
|
||||
{
|
||||
return folio_test_active((struct folio *)slab_folio(slab));
|
||||
}
|
||||
|
||||
static inline void slab_set_pfmemalloc(struct slab *slab)
|
||||
{
|
||||
folio_set_active(slab_folio(slab));
|
||||
}
|
||||
|
||||
static inline void slab_clear_pfmemalloc(struct slab *slab)
|
||||
{
|
||||
folio_clear_active(slab_folio(slab));
|
||||
}
|
||||
|
||||
static inline void __slab_clear_pfmemalloc(struct slab *slab)
|
||||
{
|
||||
__folio_clear_active(slab_folio(slab));
|
||||
}
|
||||
|
||||
static inline void *slab_address(const struct slab *slab)
|
||||
{
|
||||
return folio_address(slab_folio(slab));
|
||||
}
|
||||
|
||||
static inline int slab_nid(const struct slab *slab)
|
||||
{
|
||||
return folio_nid(slab_folio(slab));
|
||||
}
|
||||
|
||||
static inline pg_data_t *slab_pgdat(const struct slab *slab)
|
||||
{
|
||||
return folio_pgdat(slab_folio(slab));
|
||||
}
|
||||
|
||||
static inline struct slab *virt_to_slab(const void *addr)
|
||||
{
|
||||
struct folio *folio = virt_to_folio(addr);
|
||||
|
||||
if (!folio_test_slab(folio))
|
||||
return NULL;
|
||||
|
||||
return folio_slab(folio);
|
||||
}
|
||||
|
||||
static inline int slab_order(const struct slab *slab)
|
||||
{
|
||||
return folio_order((struct folio *)slab_folio(slab));
|
||||
}
|
||||
|
||||
static inline size_t slab_size(const struct slab *slab)
|
||||
{
|
||||
return PAGE_SIZE << slab_order(slab);
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SLOB
|
||||
/*
|
||||
* Common fields provided in kmem_cache by all slab allocators
|
||||
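The new struct slab above only works because its fields sit at exactly the offsets of the struct page fields they alias, which is what the SLAB_MATCH() static_asserts enforce, and the folio_slab()/slab_folio() helpers use _Generic() so that a const pointer stays const across the cast. Both techniques can be exercised in a stand-alone program; the struct names below are hypothetical, and the aliasing itself is only legitimate in the kernel because it is built with -fno-strict-aliasing:

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

/* Two views of the same memory, kept in sync the way struct page/struct slab are. */
struct wide_view   { unsigned long flags;   void *first;     void *second;   unsigned int refcount;   };
struct narrow_view { unsigned long __flags; void *list_head; void *freelist; unsigned int __refcount; };

#define MATCH(a, b) \
        static_assert(offsetof(struct wide_view, a) == offsetof(struct narrow_view, b), \
                      "field layout diverged")
MATCH(flags, __flags);
MATCH(first, list_head);
MATCH(second, freelist);
MATCH(refcount, __refcount);
#undef MATCH
static_assert(sizeof(struct narrow_view) <= sizeof(struct wide_view), "narrow view too big");

/* _Generic cast helper in the style of folio_slab(): constness is preserved. */
#define narrow(p) (_Generic((p),                                    \
        const struct wide_view *: (const struct narrow_view *)(p),  \
        struct wide_view *:       (struct narrow_view *)(p)))

int main(void)
{
        struct wide_view w = { .flags = 1 };
        const struct wide_view *cw = &w;

        struct narrow_view *n = narrow(&w);        /* mutable in, mutable out */
        const struct narrow_view *cn = narrow(cw); /* const in, const out */

        printf("%lu %lu\n", n->__flags, cn->__flags);
        return 0;
}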
@ -245,15 +430,33 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla
|
||||
}
|
||||
|
||||
#ifdef CONFIG_MEMCG_KMEM
|
||||
int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
|
||||
gfp_t gfp, bool new_page);
|
||||
/*
|
||||
* slab_objcgs - get the object cgroups vector associated with a slab
|
||||
* @slab: a pointer to the slab struct
|
||||
*
|
||||
* Returns a pointer to the object cgroups vector associated with the slab,
|
||||
* or NULL if no such vector has been associated yet.
|
||||
*/
|
||||
static inline struct obj_cgroup **slab_objcgs(struct slab *slab)
|
||||
{
|
||||
unsigned long memcg_data = READ_ONCE(slab->memcg_data);
|
||||
|
||||
VM_BUG_ON_PAGE(memcg_data && !(memcg_data & MEMCG_DATA_OBJCGS),
|
||||
slab_page(slab));
|
||||
VM_BUG_ON_PAGE(memcg_data & MEMCG_DATA_KMEM, slab_page(slab));
|
||||
|
||||
return (struct obj_cgroup **)(memcg_data & ~MEMCG_DATA_FLAGS_MASK);
|
||||
}
|
||||
|
||||
int memcg_alloc_slab_cgroups(struct slab *slab, struct kmem_cache *s,
|
||||
gfp_t gfp, bool new_slab);
|
||||
void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
|
||||
enum node_stat_item idx, int nr);
|
||||
|
||||
static inline void memcg_free_page_obj_cgroups(struct page *page)
|
||||
static inline void memcg_free_slab_cgroups(struct slab *slab)
|
||||
{
|
||||
kfree(page_objcgs(page));
|
||||
page->memcg_data = 0;
|
||||
kfree(slab_objcgs(slab));
|
||||
slab->memcg_data = 0;
|
||||
}
|
||||
|
||||
static inline size_t obj_full_size(struct kmem_cache *s)
|
||||
@ -298,7 +501,7 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
|
||||
gfp_t flags, size_t size,
|
||||
void **p)
|
||||
{
|
||||
struct page *page;
|
||||
struct slab *slab;
|
||||
unsigned long off;
|
||||
size_t i;
|
||||
|
||||
@ -307,19 +510,19 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
|
||||
|
||||
for (i = 0; i < size; i++) {
|
||||
if (likely(p[i])) {
|
||||
page = virt_to_head_page(p[i]);
|
||||
slab = virt_to_slab(p[i]);
|
||||
|
||||
if (!page_objcgs(page) &&
|
||||
memcg_alloc_page_obj_cgroups(page, s, flags,
|
||||
if (!slab_objcgs(slab) &&
|
||||
memcg_alloc_slab_cgroups(slab, s, flags,
|
||||
false)) {
|
||||
obj_cgroup_uncharge(objcg, obj_full_size(s));
|
||||
continue;
|
||||
}
|
||||
|
||||
off = obj_to_index(s, page, p[i]);
|
||||
off = obj_to_index(s, slab, p[i]);
|
||||
obj_cgroup_get(objcg);
|
||||
page_objcgs(page)[off] = objcg;
|
||||
mod_objcg_state(objcg, page_pgdat(page),
|
||||
slab_objcgs(slab)[off] = objcg;
|
||||
mod_objcg_state(objcg, slab_pgdat(slab),
|
||||
cache_vmstat_idx(s), obj_full_size(s));
|
||||
} else {
|
||||
obj_cgroup_uncharge(objcg, obj_full_size(s));
|
||||
@ -334,7 +537,7 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s_orig,
|
||||
struct kmem_cache *s;
|
||||
struct obj_cgroup **objcgs;
|
||||
struct obj_cgroup *objcg;
|
||||
struct page *page;
|
||||
struct slab *slab;
|
||||
unsigned int off;
|
||||
int i;
|
||||
|
||||
@ -345,43 +548,52 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s_orig,
|
||||
if (unlikely(!p[i]))
|
||||
continue;
|
||||
|
||||
page = virt_to_head_page(p[i]);
|
||||
objcgs = page_objcgs_check(page);
|
||||
slab = virt_to_slab(p[i]);
|
||||
/* we could be given a kmalloc_large() object, skip those */
|
||||
if (!slab)
|
||||
continue;
|
||||
|
||||
objcgs = slab_objcgs(slab);
|
||||
if (!objcgs)
|
||||
continue;
|
||||
|
||||
if (!s_orig)
|
||||
s = page->slab_cache;
|
||||
s = slab->slab_cache;
|
||||
else
|
||||
s = s_orig;
|
||||
|
||||
off = obj_to_index(s, page, p[i]);
|
||||
off = obj_to_index(s, slab, p[i]);
|
||||
objcg = objcgs[off];
|
||||
if (!objcg)
|
||||
continue;
|
||||
|
||||
objcgs[off] = NULL;
|
||||
obj_cgroup_uncharge(objcg, obj_full_size(s));
|
||||
mod_objcg_state(objcg, page_pgdat(page), cache_vmstat_idx(s),
|
||||
mod_objcg_state(objcg, slab_pgdat(slab), cache_vmstat_idx(s),
|
||||
-obj_full_size(s));
|
||||
obj_cgroup_put(objcg);
|
||||
}
|
||||
}
|
||||
|
||||
#else /* CONFIG_MEMCG_KMEM */
|
||||
static inline struct obj_cgroup **slab_objcgs(struct slab *slab)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline int memcg_alloc_page_obj_cgroups(struct page *page,
|
||||
static inline int memcg_alloc_slab_cgroups(struct slab *slab,
|
||||
struct kmem_cache *s, gfp_t gfp,
|
||||
bool new_page)
|
||||
bool new_slab)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void memcg_free_page_obj_cgroups(struct page *page)
|
||||
static inline void memcg_free_slab_cgroups(struct slab *slab)
|
||||
{
|
||||
}
|
||||
|
||||
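memcg_slab_post_alloc_hook()/memcg_slab_free_hook() above keep one obj_cgroup pointer per object slot, indexed with obj_to_index(), which is just the object's position inside its slab. A toy model of that bookkeeping, with invented sizes and an owner string standing in for the obj_cgroup pointer:

#include <stdio.h>

#define OBJ_SIZE   64
#define NR_OBJECTS 8

struct toy_slab {
        unsigned char mem[OBJ_SIZE * NR_OBJECTS]; /* the objects themselves */
        const char *owner[NR_OBJECTS];            /* stands in for the objcgs vector */
};

/* Same idea as obj_to_index(): offset inside the slab divided by object size. */
static unsigned int obj_to_index(const struct toy_slab *s, const void *obj)
{
        return (unsigned int)(((const unsigned char *)obj - s->mem) / OBJ_SIZE);
}

int main(void)
{
        struct toy_slab slab = { 0 };
        void *obj = slab.mem + 3 * OBJ_SIZE;   /* pretend this came from the allocator */

        slab.owner[obj_to_index(&slab, obj)] = "group A";       /* charge on alloc */
        printf("object %u owned by %s\n", obj_to_index(&slab, obj),
               slab.owner[obj_to_index(&slab, obj)]);
        slab.owner[obj_to_index(&slab, obj)] = NULL;             /* uncharge on free */
        return 0;
}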
@ -405,35 +617,35 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s,
|
||||
}
|
||||
#endif /* CONFIG_MEMCG_KMEM */
|
||||
|
||||
#ifndef CONFIG_SLOB
|
||||
static inline struct kmem_cache *virt_to_cache(const void *obj)
|
||||
{
|
||||
struct page *page;
|
||||
struct slab *slab;
|
||||
|
||||
page = virt_to_head_page(obj);
|
||||
if (WARN_ONCE(!PageSlab(page), "%s: Object is not a Slab page!\n",
|
||||
slab = virt_to_slab(obj);
|
||||
if (WARN_ONCE(!slab, "%s: Object is not a Slab page!\n",
|
||||
__func__))
|
||||
return NULL;
|
||||
return page->slab_cache;
|
||||
return slab->slab_cache;
|
||||
}
|
||||
|
||||
static __always_inline void account_slab_page(struct page *page, int order,
|
||||
struct kmem_cache *s,
|
||||
gfp_t gfp)
|
||||
static __always_inline void account_slab(struct slab *slab, int order,
|
||||
struct kmem_cache *s, gfp_t gfp)
|
||||
{
|
||||
if (memcg_kmem_enabled() && (s->flags & SLAB_ACCOUNT))
|
||||
memcg_alloc_page_obj_cgroups(page, s, gfp, true);
|
||||
memcg_alloc_slab_cgroups(slab, s, gfp, true);
|
||||
|
||||
mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
|
||||
mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
|
||||
PAGE_SIZE << order);
|
||||
}
|
||||
|
||||
static __always_inline void unaccount_slab_page(struct page *page, int order,
|
||||
struct kmem_cache *s)
|
||||
static __always_inline void unaccount_slab(struct slab *slab, int order,
|
||||
struct kmem_cache *s)
|
||||
{
|
||||
if (memcg_kmem_enabled())
|
||||
memcg_free_page_obj_cgroups(page);
|
||||
memcg_free_slab_cgroups(slab);
|
||||
|
||||
mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
|
||||
mod_node_page_state(slab_pgdat(slab), cache_vmstat_idx(s),
|
||||
-(PAGE_SIZE << order));
|
||||
}
|
||||
|
||||
@ -452,6 +664,7 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
|
||||
print_tracking(cachep, x);
|
||||
return cachep;
|
||||
}
|
||||
#endif /* CONFIG_SLOB */
|
||||
|
||||
static inline size_t slab_ksize(const struct kmem_cache *s)
|
||||
{
|
||||
@ -575,11 +788,6 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
|
||||
|
||||
#endif
|
||||
|
||||
void *slab_start(struct seq_file *m, loff_t *pos);
|
||||
void *slab_next(struct seq_file *m, void *p, loff_t *pos);
|
||||
void slab_stop(struct seq_file *m, void *p);
|
||||
int memcg_slab_show(struct seq_file *m, void *p);
|
||||
|
||||
#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
|
||||
void dump_unreclaimable_slab(void);
|
||||
#else
|
||||
@ -635,7 +843,7 @@ static inline void debugfs_slab_release(struct kmem_cache *s) { }
|
||||
#define KS_ADDRS_COUNT 16
|
||||
struct kmem_obj_info {
|
||||
void *kp_ptr;
|
||||
struct page *kp_page;
|
||||
struct slab *kp_slab;
|
||||
void *kp_objp;
|
||||
unsigned long kp_data_offset;
|
||||
struct kmem_cache *kp_slab_cache;
|
||||
@ -643,7 +851,18 @@ struct kmem_obj_info {
|
||||
void *kp_stack[KS_ADDRS_COUNT];
|
||||
void *kp_free_stack[KS_ADDRS_COUNT];
|
||||
};
|
||||
void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page);
|
||||
void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab);
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR
|
||||
void __check_heap_object(const void *ptr, unsigned long n,
|
||||
const struct slab *slab, bool to_user);
|
||||
#else
|
||||
static inline
|
||||
void __check_heap_object(const void *ptr, unsigned long n,
|
||||
const struct slab *slab, bool to_user)
|
||||
{
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* MM_SLAB_H */
|
||||
|
@ -37,14 +37,6 @@ LIST_HEAD(slab_caches);
|
||||
DEFINE_MUTEX(slab_mutex);
|
||||
struct kmem_cache *kmem_cache;
|
||||
|
||||
#ifdef CONFIG_HARDENED_USERCOPY
|
||||
bool usercopy_fallback __ro_after_init =
|
||||
IS_ENABLED(CONFIG_HARDENED_USERCOPY_FALLBACK);
|
||||
module_param(usercopy_fallback, bool, 0400);
|
||||
MODULE_PARM_DESC(usercopy_fallback,
|
||||
"WARN instead of reject usercopy whitelist violations");
|
||||
#endif
|
||||
|
||||
static LIST_HEAD(slab_caches_to_rcu_destroy);
|
||||
static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work);
|
||||
static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
|
||||
@ -497,9 +489,7 @@ void slab_kmem_cache_release(struct kmem_cache *s)
|
||||
|
||||
void kmem_cache_destroy(struct kmem_cache *s)
|
||||
{
|
||||
int err;
|
||||
|
||||
if (unlikely(!s))
|
||||
if (unlikely(!s) || !kasan_check_byte(s))
|
||||
return;
|
||||
|
||||
cpus_read_lock();
|
||||
@ -509,12 +499,9 @@ void kmem_cache_destroy(struct kmem_cache *s)
|
||||
if (s->refcount)
|
||||
goto out_unlock;
|
||||
|
||||
err = shutdown_cache(s);
|
||||
if (err) {
|
||||
pr_err("%s %s: Slab cache still has objects\n",
|
||||
__func__, s->name);
|
||||
dump_stack();
|
||||
}
|
||||
WARN(shutdown_cache(s),
|
||||
"%s %s: Slab cache still has objects when called from %pS",
|
||||
__func__, s->name, (void *)_RET_IP_);
|
||||
out_unlock:
|
||||
mutex_unlock(&slab_mutex);
|
||||
cpus_read_unlock();
|
||||
@ -558,13 +545,13 @@ bool slab_is_available(void)
|
||||
*/
|
||||
bool kmem_valid_obj(void *object)
|
||||
{
|
||||
struct page *page;
|
||||
struct folio *folio;
|
||||
|
||||
/* Some arches consider ZERO_SIZE_PTR to be a valid address. */
|
||||
if (object < (void *)PAGE_SIZE || !virt_addr_valid(object))
|
||||
return false;
|
||||
page = virt_to_head_page(object);
|
||||
return PageSlab(page);
|
||||
folio = virt_to_folio(object);
|
||||
return folio_test_slab(folio);
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(kmem_valid_obj);
|
||||
|
||||
@ -587,18 +574,18 @@ void kmem_dump_obj(void *object)
|
||||
{
|
||||
char *cp = IS_ENABLED(CONFIG_MMU) ? "" : "/vmalloc";
|
||||
int i;
|
||||
struct page *page;
|
||||
struct slab *slab;
|
||||
unsigned long ptroffset;
|
||||
struct kmem_obj_info kp = { };
|
||||
|
||||
if (WARN_ON_ONCE(!virt_addr_valid(object)))
|
||||
return;
|
||||
page = virt_to_head_page(object);
|
||||
if (WARN_ON_ONCE(!PageSlab(page))) {
|
||||
slab = virt_to_slab(object);
|
||||
if (WARN_ON_ONCE(!slab)) {
|
||||
pr_cont(" non-slab memory.\n");
|
||||
return;
|
||||
}
|
||||
kmem_obj_info(&kp, object, page);
|
||||
kmem_obj_info(&kp, object, slab);
|
||||
if (kp.kp_slab_cache)
|
||||
pr_cont(" slab%s %s", cp, kp.kp_slab_cache->name);
|
||||
else
|
||||
@ -832,7 +819,7 @@ void __init setup_kmalloc_cache_index_table(void)
|
||||
|
||||
if (KMALLOC_MIN_SIZE >= 64) {
|
||||
/*
|
||||
* The 96 byte size cache is not used if the alignment
|
||||
* The 96 byte sized cache is not used if the alignment
|
||||
* is 64 byte.
|
||||
*/
|
||||
for (i = 64 + 8; i <= 96; i += 8)
|
||||
@ -857,7 +844,7 @@ new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags)
|
||||
if (type == KMALLOC_RECLAIM) {
|
||||
flags |= SLAB_RECLAIM_ACCOUNT;
|
||||
} else if (IS_ENABLED(CONFIG_MEMCG_KMEM) && (type == KMALLOC_CGROUP)) {
|
||||
if (cgroup_memory_nokmem) {
|
||||
if (mem_cgroup_kmem_disabled()) {
|
||||
kmalloc_caches[type][idx] = kmalloc_caches[KMALLOC_NORMAL][idx];
|
||||
return;
|
||||
}
|
||||
@ -1052,18 +1039,18 @@ static void print_slabinfo_header(struct seq_file *m)
|
||||
seq_putc(m, '\n');
|
||||
}
|
||||
|
||||
void *slab_start(struct seq_file *m, loff_t *pos)
|
||||
static void *slab_start(struct seq_file *m, loff_t *pos)
|
||||
{
|
||||
mutex_lock(&slab_mutex);
|
||||
return seq_list_start(&slab_caches, *pos);
|
||||
}
|
||||
|
||||
void *slab_next(struct seq_file *m, void *p, loff_t *pos)
|
||||
static void *slab_next(struct seq_file *m, void *p, loff_t *pos)
|
||||
{
|
||||
return seq_list_next(p, &slab_caches, pos);
|
||||
}
|
||||
|
||||
void slab_stop(struct seq_file *m, void *p)
|
||||
static void slab_stop(struct seq_file *m, void *p)
|
||||
{
|
||||
mutex_unlock(&slab_mutex);
|
||||
}
|
||||
@ -1131,17 +1118,6 @@ void dump_unreclaimable_slab(void)
|
||||
mutex_unlock(&slab_mutex);
|
||||
}
|
||||
|
||||
#if defined(CONFIG_MEMCG_KMEM)
|
||||
int memcg_slab_show(struct seq_file *m, void *p)
|
||||
{
|
||||
/*
|
||||
* Deprecated.
|
||||
* Please, take a look at tools/cgroup/slabinfo.py .
|
||||
*/
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* slabinfo_op - iterator that generates /proc/slabinfo
|
||||
*
|
||||
|
65
mm/slob.c
@ -30,7 +30,7 @@
|
||||
* If kmalloc is asked for objects of PAGE_SIZE or larger, it calls
|
||||
* alloc_pages() directly, allocating compound pages so the page order
|
||||
* does not have to be separately tracked.
|
||||
* These objects are detected in kfree() because PageSlab()
|
||||
* These objects are detected in kfree() because folio_test_slab()
|
||||
* is false for them.
|
||||
*
|
||||
* SLAB is emulated on top of SLOB by simply calling constructors and
|
||||
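The comment above (and the kfree()/__ksize() hunks further down) describe SLOB's trick for large allocations: hand them straight to the page allocator and tell them apart later purely by the absence of the slab flag on the backing folio. A toy model of that dispatch, with a fake flag instead of the real page flags, purely as an illustration:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy stand-in for the backing folio: only the bit kfree() cares about. */
struct toy_folio { bool is_slab; size_t size; };

struct allocation { struct toy_folio folio; unsigned char data[]; };

static void *toy_alloc(size_t size, bool from_slab)
{
        struct allocation *a = malloc(sizeof(*a) + size);

        if (!a)
                return NULL;
        a->folio.is_slab = from_slab;
        a->folio.size = size;
        return a->data;
}

static void toy_kfree(void *p)
{
        struct allocation *a =
                (struct allocation *)((unsigned char *)p - offsetof(struct allocation, data));

        if (a->folio.is_slab)
                printf("slob_free(): return %zu bytes to the slab page\n", a->folio.size);
        else
                printf("__free_pages(): hand %zu bytes back to the page allocator\n", a->folio.size);
        free(a);
}

int main(void)
{
        void *small = toy_alloc(64, true);     /* would have come from a slab page */
        void *large = toy_alloc(8192, false);  /* PAGE_SIZE or more: page allocator */

        if (!small || !large)
                return 1;
        toy_kfree(small);
        toy_kfree(large);
        return 0;
}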
@ -105,21 +105,21 @@ static LIST_HEAD(free_slob_large);
|
||||
/*
|
||||
* slob_page_free: true for pages on free_slob_pages list.
|
||||
*/
|
||||
static inline int slob_page_free(struct page *sp)
|
||||
static inline int slob_page_free(struct slab *slab)
|
||||
{
|
||||
return PageSlobFree(sp);
|
||||
return PageSlobFree(slab_page(slab));
|
||||
}
|
||||
|
||||
static void set_slob_page_free(struct page *sp, struct list_head *list)
|
||||
static void set_slob_page_free(struct slab *slab, struct list_head *list)
|
||||
{
|
||||
list_add(&sp->slab_list, list);
|
||||
__SetPageSlobFree(sp);
|
||||
list_add(&slab->slab_list, list);
|
||||
__SetPageSlobFree(slab_page(slab));
|
||||
}
|
||||
|
||||
static inline void clear_slob_page_free(struct page *sp)
|
||||
static inline void clear_slob_page_free(struct slab *slab)
|
||||
{
|
||||
list_del(&sp->slab_list);
|
||||
__ClearPageSlobFree(sp);
|
||||
list_del(&slab->slab_list);
|
||||
__ClearPageSlobFree(slab_page(slab));
|
||||
}
|
||||
|
||||
#define SLOB_UNIT sizeof(slob_t)
|
||||
@ -234,7 +234,7 @@ static void slob_free_pages(void *b, int order)
|
||||
* freelist, in this case @page_removed_from_list will be set to
|
||||
* true (set to false otherwise).
|
||||
*/
|
||||
static void *slob_page_alloc(struct page *sp, size_t size, int align,
|
||||
static void *slob_page_alloc(struct slab *sp, size_t size, int align,
|
||||
int align_offset, bool *page_removed_from_list)
|
||||
{
|
||||
slob_t *prev, *cur, *aligned = NULL;
|
||||
@ -301,7 +301,8 @@ static void *slob_page_alloc(struct page *sp, size_t size, int align,
|
||||
static void *slob_alloc(size_t size, gfp_t gfp, int align, int node,
|
||||
int align_offset)
|
||||
{
|
||||
struct page *sp;
|
||||
struct folio *folio;
|
||||
struct slab *sp;
|
||||
struct list_head *slob_list;
|
||||
slob_t *b = NULL;
|
||||
unsigned long flags;
|
||||
@ -323,7 +324,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node,
|
||||
* If there's a node specification, search for a partial
|
||||
* page with a matching node id in the freelist.
|
||||
*/
|
||||
if (node != NUMA_NO_NODE && page_to_nid(sp) != node)
|
||||
if (node != NUMA_NO_NODE && slab_nid(sp) != node)
|
||||
continue;
|
||||
#endif
|
||||
/* Enough room on this page? */
|
||||
@ -358,8 +359,9 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node,
|
||||
b = slob_new_pages(gfp & ~__GFP_ZERO, 0, node);
|
||||
if (!b)
|
||||
return NULL;
|
||||
sp = virt_to_page(b);
|
||||
__SetPageSlab(sp);
|
||||
folio = virt_to_folio(b);
|
||||
__folio_set_slab(folio);
|
||||
sp = folio_slab(folio);
|
||||
|
||||
spin_lock_irqsave(&slob_lock, flags);
|
||||
sp->units = SLOB_UNITS(PAGE_SIZE);
|
||||
@ -381,7 +383,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node,
|
||||
*/
|
||||
static void slob_free(void *block, int size)
|
||||
{
|
||||
struct page *sp;
|
||||
struct slab *sp;
|
||||
slob_t *prev, *next, *b = (slob_t *)block;
|
||||
slobidx_t units;
|
||||
unsigned long flags;
|
||||
@ -391,7 +393,7 @@ static void slob_free(void *block, int size)
|
||||
return;
|
||||
BUG_ON(!size);
|
||||
|
||||
sp = virt_to_page(block);
|
||||
sp = virt_to_slab(block);
|
||||
units = SLOB_UNITS(size);
|
||||
|
||||
spin_lock_irqsave(&slob_lock, flags);
|
||||
@ -401,8 +403,7 @@ static void slob_free(void *block, int size)
|
||||
if (slob_page_free(sp))
|
||||
clear_slob_page_free(sp);
|
||||
spin_unlock_irqrestore(&slob_lock, flags);
|
||||
__ClearPageSlab(sp);
|
||||
page_mapcount_reset(sp);
|
||||
__folio_clear_slab(slab_folio(sp));
|
||||
slob_free_pages(b, 0);
|
||||
return;
|
||||
}
|
||||
@ -462,10 +463,10 @@ static void slob_free(void *block, int size)
|
||||
}
|
||||
|
||||
#ifdef CONFIG_PRINTK
|
||||
void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
|
||||
void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
|
||||
{
|
||||
kpp->kp_ptr = object;
|
||||
kpp->kp_page = page;
|
||||
kpp->kp_slab = slab;
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -544,7 +545,7 @@ EXPORT_SYMBOL(__kmalloc_node_track_caller);
|
||||
|
||||
void kfree(const void *block)
|
||||
{
|
||||
struct page *sp;
|
||||
struct folio *sp;
|
||||
|
||||
trace_kfree(_RET_IP_, block);
|
||||
|
||||
@ -552,16 +553,17 @@ void kfree(const void *block)
|
||||
return;
|
||||
kmemleak_free(block);
|
||||
|
||||
sp = virt_to_page(block);
|
||||
if (PageSlab(sp)) {
|
||||
sp = virt_to_folio(block);
|
||||
if (folio_test_slab(sp)) {
|
||||
int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
|
||||
unsigned int *m = (unsigned int *)(block - align);
|
||||
slob_free(m, *m + align);
|
||||
} else {
|
||||
unsigned int order = compound_order(sp);
|
||||
mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B,
|
||||
unsigned int order = folio_order(sp);
|
||||
|
||||
mod_node_page_state(folio_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B,
|
||||
-(PAGE_SIZE << order));
|
||||
__free_pages(sp, order);
|
||||
__free_pages(folio_page(sp, 0), order);
|
||||
|
||||
}
|
||||
}
|
||||
@ -570,7 +572,7 @@ EXPORT_SYMBOL(kfree);
|
||||
/* can't use ksize for kmem_cache_alloc memory, only kmalloc */
|
||||
size_t __ksize(const void *block)
|
||||
{
|
||||
struct page *sp;
|
||||
struct folio *folio;
|
||||
int align;
|
||||
unsigned int *m;
|
||||
|
||||
@ -578,9 +580,9 @@ size_t __ksize(const void *block)
|
||||
if (unlikely(block == ZERO_SIZE_PTR))
|
||||
return 0;
|
||||
|
||||
sp = virt_to_page(block);
|
||||
if (unlikely(!PageSlab(sp)))
|
||||
return page_size(sp);
|
||||
folio = virt_to_folio(block);
|
||||
if (unlikely(!folio_test_slab(folio)))
|
||||
return folio_size(folio);
|
||||
|
||||
align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
|
||||
m = (unsigned int *)(block - align);
|
||||
@ -666,6 +668,7 @@ static void kmem_rcu_free(struct rcu_head *head)
|
||||
void kmem_cache_free(struct kmem_cache *c, void *b)
|
||||
{
|
||||
kmemleak_free_recursive(b, c->flags);
|
||||
trace_kmem_cache_free(_RET_IP_, b, c->name);
|
||||
if (unlikely(c->flags & SLAB_TYPESAFE_BY_RCU)) {
|
||||
struct slob_rcu *slob_rcu;
|
||||
slob_rcu = b + (c->size - sizeof(struct slob_rcu));
|
||||
@ -674,8 +677,6 @@ void kmem_cache_free(struct kmem_cache *c, void *b)
|
||||
} else {
|
||||
__kmem_cache_free(b, c->size);
|
||||
}
|
||||
|
||||
trace_kmem_cache_free(_RET_IP_, b, c->name);
|
||||
}
|
||||
EXPORT_SYMBOL(kmem_cache_free);
|
||||
|
||||
|
@ -451,7 +451,7 @@ static void *sparsemap_buf_end __meminitdata;
|
||||
static inline void __meminit sparse_buffer_free(unsigned long size)
|
||||
{
|
||||
WARN_ON(!sparsemap_buf || size == 0);
|
||||
memblock_free_early(__pa(sparsemap_buf), size);
|
||||
memblock_free(sparsemap_buf, size);
|
||||
}
|
||||
|
||||
static void __init sparse_buffer_init(unsigned long size, int nid)
|
||||
@ -722,7 +722,7 @@ static void free_map_bootmem(struct page *memmap)
|
||||
>> PAGE_SHIFT;
|
||||
|
||||
for (i = 0; i < nr_pages; i++, page++) {
|
||||
magic = (unsigned long) page->freelist;
|
||||
magic = page->index;
|
||||
|
||||
BUG_ON(magic == NODE_INFO);
|
||||
|
||||
|
249
mm/swap.c
@ -80,10 +80,11 @@ static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = {
|
||||
static void __page_cache_release(struct page *page)
|
||||
{
|
||||
if (PageLRU(page)) {
|
||||
struct folio *folio = page_folio(page);
|
||||
struct lruvec *lruvec;
|
||||
unsigned long flags;
|
||||
|
||||
lruvec = lock_page_lruvec_irqsave(page, &flags);
|
||||
lruvec = folio_lruvec_lock_irqsave(folio, &flags);
|
||||
del_page_from_lru_list(page, lruvec);
|
||||
__clear_page_lru_flags(page);
|
||||
unlock_page_lruvec_irqrestore(lruvec, flags);
|
||||
@ -94,7 +95,7 @@ static void __page_cache_release(struct page *page)
|
||||
static void __put_single_page(struct page *page)
|
||||
{
|
||||
__page_cache_release(page);
|
||||
mem_cgroup_uncharge(page);
|
||||
mem_cgroup_uncharge(page_folio(page));
|
||||
free_unref_page(page, 0);
|
||||
}
|
||||
|
||||
@ -134,18 +135,28 @@ EXPORT_SYMBOL(__put_page);
|
||||
* put_pages_list() - release a list of pages
|
||||
* @pages: list of pages threaded on page->lru
|
||||
*
|
||||
* Release a list of pages which are strung together on page.lru. Currently
|
||||
* used by read_cache_pages() and related error recovery code.
|
||||
* Release a list of pages which are strung together on page.lru.
|
||||
*/
|
||||
void put_pages_list(struct list_head *pages)
|
||||
{
|
||||
while (!list_empty(pages)) {
|
||||
struct page *victim;
|
||||
struct page *page, *next;
|
||||
|
||||
victim = lru_to_page(pages);
|
||||
list_del(&victim->lru);
|
||||
put_page(victim);
|
||||
list_for_each_entry_safe(page, next, pages, lru) {
|
||||
if (!put_page_testzero(page)) {
|
||||
list_del(&page->lru);
|
||||
continue;
|
||||
}
|
||||
if (PageHead(page)) {
|
||||
list_del(&page->lru);
|
||||
__put_compound_page(page);
|
||||
continue;
|
||||
}
|
||||
/* Cannot be PageLRU because it's passed to us using the lru */
|
||||
__ClearPageWaiters(page);
|
||||
}
|
||||
|
||||
free_unref_page_list(pages);
|
||||
INIT_LIST_HEAD(pages);
|
||||
}
|
||||
EXPORT_SYMBOL(put_pages_list);
|
||||
|
||||
@ -188,12 +199,13 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
|
||||
|
||||
for (i = 0; i < pagevec_count(pvec); i++) {
|
||||
struct page *page = pvec->pages[i];
|
||||
struct folio *folio = page_folio(page);
|
||||
|
||||
/* block memcg migration during page moving between lru */
|
||||
if (!TestClearPageLRU(page))
|
||||
continue;
|
||||
|
||||
lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags);
|
||||
lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags);
|
||||
(*move_fn)(page, lruvec);
|
||||
|
||||
SetPageLRU(page);
|
||||
@ -206,11 +218,13 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,
|
||||
|
||||
static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec)
|
||||
{
|
||||
if (!PageUnevictable(page)) {
|
||||
del_page_from_lru_list(page, lruvec);
|
||||
ClearPageActive(page);
|
||||
add_page_to_lru_list_tail(page, lruvec);
|
||||
__count_vm_events(PGROTATED, thp_nr_pages(page));
|
||||
struct folio *folio = page_folio(page);
|
||||
|
||||
if (!folio_test_unevictable(folio)) {
|
||||
lruvec_del_folio(lruvec, folio);
|
||||
folio_clear_active(folio);
|
||||
lruvec_add_folio_tail(lruvec, folio);
|
||||
__count_vm_events(PGROTATED, folio_nr_pages(folio));
|
||||
}
|
||||
}
|
||||
|
||||
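The put_pages_list() rework above walks the list once, unthreads pages whose reference count has not dropped to zero (and compound heads, which need their own teardown), then hands whatever is left to free_unref_page_list() in one batch. The control flow is easier to see on a toy structure with plain reference counts; everything below is an illustration, not kernel API:

#include <stdbool.h>
#include <stdio.h>

struct toy_page {
        int refcount;
        bool compound_head;
        bool on_list;          /* still threaded on the caller's list */
};

/* Drop one reference; true means the caller just dropped the last one. */
static bool put_testzero(struct toy_page *p)
{
        return --p->refcount == 0;
}

static void put_pages_list(struct toy_page *pages, int nr)
{
        /* Pass 1: unthread anything that cannot go to the batched free. */
        for (int i = 0; i < nr; i++) {
                struct toy_page *p = &pages[i];

                if (!put_testzero(p)) {
                        p->on_list = false;            /* someone else still holds it */
                        continue;
                }
                if (p->compound_head) {
                        p->on_list = false;
                        printf("page %d: __put_compound_page()\n", i);
                }
        }
        /* Pass 2 (free_unref_page_list() in the kernel): free the rest in bulk. */
        for (int i = 0; i < nr; i++)
                if (pages[i].on_list)
                        printf("page %d: freed in batch\n", i);
}

int main(void)
{
        struct toy_page pages[] = {
                { .refcount = 2, .on_list = true },                        /* still in use */
                { .refcount = 1, .on_list = true },                        /* order-0, batched */
                { .refcount = 1, .compound_head = true, .on_list = true }, /* THP head */
        };

        put_pages_list(pages, 3);
        return 0;
}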
@ -227,23 +241,23 @@ static bool pagevec_add_and_need_flush(struct pagevec *pvec, struct page *page)
|
||||
}
|
||||
|
||||
/*
|
||||
* Writeback is about to end against a page which has been marked for immediate
|
||||
* reclaim. If it still appears to be reclaimable, move it to the tail of the
|
||||
* inactive list.
|
||||
* Writeback is about to end against a folio which has been marked for
|
||||
* immediate reclaim. If it still appears to be reclaimable, move it
|
||||
* to the tail of the inactive list.
|
||||
*
|
||||
* rotate_reclaimable_page() must disable IRQs, to prevent nasty races.
|
||||
* folio_rotate_reclaimable() must disable IRQs, to prevent nasty races.
|
||||
*/
|
||||
void rotate_reclaimable_page(struct page *page)
|
||||
void folio_rotate_reclaimable(struct folio *folio)
|
||||
{
|
||||
if (!PageLocked(page) && !PageDirty(page) &&
|
||||
!PageUnevictable(page) && PageLRU(page)) {
|
||||
if (!folio_test_locked(folio) && !folio_test_dirty(folio) &&
|
||||
!folio_test_unevictable(folio) && folio_test_lru(folio)) {
|
||||
struct pagevec *pvec;
|
||||
unsigned long flags;
|
||||
|
||||
get_page(page);
|
||||
folio_get(folio);
|
||||
local_lock_irqsave(&lru_rotate.lock, flags);
|
||||
pvec = this_cpu_ptr(&lru_rotate.pvec);
|
||||
if (pagevec_add_and_need_flush(pvec, page))
|
||||
if (pagevec_add_and_need_flush(pvec, &folio->page))
|
||||
pagevec_lru_move_fn(pvec, pagevec_move_tail_fn);
|
||||
local_unlock_irqrestore(&lru_rotate.lock, flags);
|
||||
}
|
||||
@ -289,21 +303,21 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
|
||||
} while ((lruvec = parent_lruvec(lruvec)));
|
||||
}
|
||||
|
||||
void lru_note_cost_page(struct page *page)
|
||||
void lru_note_cost_folio(struct folio *folio)
|
||||
{
|
||||
lru_note_cost(mem_cgroup_page_lruvec(page),
|
||||
page_is_file_lru(page), thp_nr_pages(page));
|
||||
lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio),
|
||||
folio_nr_pages(folio));
|
||||
}
|
||||
|
||||
static void __activate_page(struct page *page, struct lruvec *lruvec)
|
||||
static void __folio_activate(struct folio *folio, struct lruvec *lruvec)
|
||||
{
|
||||
if (!PageActive(page) && !PageUnevictable(page)) {
|
||||
int nr_pages = thp_nr_pages(page);
|
||||
if (!folio_test_active(folio) && !folio_test_unevictable(folio)) {
|
||||
long nr_pages = folio_nr_pages(folio);
|
||||
|
||||
del_page_from_lru_list(page, lruvec);
|
||||
SetPageActive(page);
|
||||
add_page_to_lru_list(page, lruvec);
|
||||
trace_mm_lru_activate(page);
|
||||
lruvec_del_folio(lruvec, folio);
|
||||
folio_set_active(folio);
|
||||
lruvec_add_folio(lruvec, folio);
|
||||
trace_mm_lru_activate(folio);
|
||||
|
||||
__count_vm_events(PGACTIVATE, nr_pages);
|
||||
__count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE,
|
||||
@ -312,6 +326,11 @@ static void __activate_page(struct page *page, struct lruvec *lruvec)
|
||||
}
|
||||
|
||||
#ifdef CONFIG_SMP
|
||||
static void __activate_page(struct page *page, struct lruvec *lruvec)
|
||||
{
|
||||
return __folio_activate(page_folio(page), lruvec);
|
||||
}
|
||||
|
||||
static void activate_page_drain(int cpu)
|
||||
{
|
||||
struct pagevec *pvec = &per_cpu(lru_pvecs.activate_page, cpu);
|
||||
@ -325,16 +344,16 @@ static bool need_activate_page_drain(int cpu)
|
||||
return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0;
|
||||
}
|
||||
|
||||
static void activate_page(struct page *page)
|
||||
static void folio_activate(struct folio *folio)
|
||||
{
|
||||
page = compound_head(page);
|
||||
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
|
||||
if (folio_test_lru(folio) && !folio_test_active(folio) &&
|
||||
!folio_test_unevictable(folio)) {
|
||||
struct pagevec *pvec;
|
||||
|
||||
folio_get(folio);
|
||||
local_lock(&lru_pvecs.lock);
|
||||
pvec = this_cpu_ptr(&lru_pvecs.activate_page);
|
||||
get_page(page);
|
||||
if (pagevec_add_and_need_flush(pvec, page))
|
||||
if (pagevec_add_and_need_flush(pvec, &folio->page))
|
||||
pagevec_lru_move_fn(pvec, __activate_page);
|
||||
local_unlock(&lru_pvecs.lock);
|
||||
}
|
||||
@ -345,21 +364,20 @@ static inline void activate_page_drain(int cpu)
|
||||
{
|
||||
}
|
||||
|
||||
static void activate_page(struct page *page)
|
||||
static void folio_activate(struct folio *folio)
|
||||
{
|
||||
struct lruvec *lruvec;
|
||||
|
||||
page = compound_head(page);
|
||||
if (TestClearPageLRU(page)) {
|
||||
lruvec = lock_page_lruvec_irq(page);
|
||||
__activate_page(page, lruvec);
|
||||
if (folio_test_clear_lru(folio)) {
|
||||
lruvec = folio_lruvec_lock_irq(folio);
|
||||
__folio_activate(folio, lruvec);
|
||||
unlock_page_lruvec_irq(lruvec);
|
||||
SetPageLRU(page);
|
||||
folio_set_lru(folio);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
static void __lru_cache_activate_page(struct page *page)
|
||||
static void __lru_cache_activate_folio(struct folio *folio)
|
||||
{
|
||||
struct pagevec *pvec;
|
||||
int i;
|
||||
@ -380,8 +398,8 @@ static void __lru_cache_activate_page(struct page *page)
|
||||
for (i = pagevec_count(pvec) - 1; i >= 0; i--) {
|
||||
struct page *pagevec_page = pvec->pages[i];
|
||||
|
||||
if (pagevec_page == page) {
|
||||
SetPageActive(page);
|
||||
if (pagevec_page == &folio->page) {
|
||||
folio_set_active(folio);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -399,61 +417,59 @@ static void __lru_cache_activate_page(struct page *page)
|
||||
* When a newly allocated page is not yet visible, so safe for non-atomic ops,
|
||||
* __SetPageReferenced(page) may be substituted for mark_page_accessed(page).
|
||||
*/
|
||||
void mark_page_accessed(struct page *page)
|
||||
void folio_mark_accessed(struct folio *folio)
|
||||
{
|
||||
page = compound_head(page);
|
||||
|
||||
if (!PageReferenced(page)) {
|
||||
SetPageReferenced(page);
|
||||
} else if (PageUnevictable(page)) {
|
||||
if (!folio_test_referenced(folio)) {
|
||||
folio_set_referenced(folio);
|
||||
} else if (folio_test_unevictable(folio)) {
|
||||
/*
|
||||
* Unevictable pages are on the "LRU_UNEVICTABLE" list. But,
|
||||
* this list is never rotated or maintained, so marking an
|
||||
* evictable page accessed has no effect.
|
||||
*/
|
||||
} else if (!PageActive(page)) {
|
||||
} else if (!folio_test_active(folio)) {
|
||||
/*
|
||||
* If the page is on the LRU, queue it for activation via
|
||||
* lru_pvecs.activate_page. Otherwise, assume the page is on a
|
||||
* pagevec, mark it active and it'll be moved to the active
|
||||
* LRU on the next drain.
|
||||
*/
|
||||
if (PageLRU(page))
|
||||
activate_page(page);
|
||||
if (folio_test_lru(folio))
|
||||
folio_activate(folio);
|
||||
else
|
||||
__lru_cache_activate_page(page);
|
||||
ClearPageReferenced(page);
|
||||
workingset_activation(page);
|
||||
__lru_cache_activate_folio(folio);
|
||||
folio_clear_referenced(folio);
|
||||
workingset_activation(folio);
|
||||
}
|
||||
if (page_is_idle(page))
|
||||
clear_page_idle(page);
|
||||
if (folio_test_idle(folio))
|
||||
folio_clear_idle(folio);
|
||||
}
|
||||
EXPORT_SYMBOL(mark_page_accessed);
|
||||
EXPORT_SYMBOL(folio_mark_accessed);
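Note: unconverted page-based callers are not touched by this hunk; they can reach the new entry point through page_folio(). A minimal compatibility-wrapper sketch, not part of this diff (upstream keeps such wrappers, though the exact file they live in is an assumption here):

/* Illustrative sketch only, not part of this commit. */
void mark_page_accessed(struct page *page)
{
	folio_mark_accessed(page_folio(page));
}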
|
||||
|
||||
/**
|
||||
* lru_cache_add - add a page to a page list
|
||||
* @page: the page to be added to the LRU.
|
||||
* folio_add_lru - Add a folio to an LRU list.
|
||||
* @folio: The folio to be added to the LRU.
|
||||
*
|
||||
* Queue the page for addition to the LRU via pagevec. The decision on whether
|
||||
* Queue the folio for addition to the LRU. The decision on whether
|
||||
* to add the page to the [in]active [file|anon] list is deferred until the
|
||||
* pagevec is drained. This gives a chance for the caller of lru_cache_add()
|
||||
* have the page added to the active list using mark_page_accessed().
|
||||
* pagevec is drained. This gives a chance for the caller of folio_add_lru()
|
||||
* have the folio added to the active list using folio_mark_accessed().
|
||||
*/
|
||||
void lru_cache_add(struct page *page)
|
||||
void folio_add_lru(struct folio *folio)
|
||||
{
|
||||
struct pagevec *pvec;
|
||||
|
||||
VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
|
||||
VM_BUG_ON_PAGE(PageLRU(page), page);
|
||||
VM_BUG_ON_FOLIO(folio_test_active(folio) && folio_test_unevictable(folio), folio);
|
||||
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
|
||||
|
||||
get_page(page);
|
||||
folio_get(folio);
|
||||
local_lock(&lru_pvecs.lock);
|
||||
pvec = this_cpu_ptr(&lru_pvecs.lru_add);
|
||||
if (pagevec_add_and_need_flush(pvec, page))
|
||||
if (pagevec_add_and_need_flush(pvec, &folio->page))
|
||||
__pagevec_lru_add(pvec);
|
||||
local_unlock(&lru_pvecs.lock);
|
||||
}
|
||||
EXPORT_SYMBOL(lru_cache_add);
|
||||
EXPORT_SYMBOL(folio_add_lru);
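Note: the kernel-doc above says a caller of folio_add_lru() can still get the folio onto the active list via folio_mark_accessed(). A minimal caller-side sketch of that pattern, using only names introduced in this diff (the wrapper function itself is hypothetical):

/* Illustrative sketch only, not part of this commit. */
static void example_cache_new_folio(struct folio *folio)
{
	folio_add_lru(folio);		/* queued on the per-CPU pagevec */
	folio_mark_accessed(folio);	/* promoted to the active list at drain time */
}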
|
||||
|
||||
/**
|
||||
* lru_cache_add_inactive_or_unevictable
|
||||
@ -866,7 +882,7 @@ void lru_cache_disable(void)
|
||||
* all online CPUs so any calls of lru_cache_disabled wrapped by
|
||||
* local_lock or preemption disabled would be ordered by that.
|
||||
* The atomic operation doesn't need to have stronger ordering
|
||||
* requirements because that is enforeced by the scheduling
|
||||
* requirements because that is enforced by the scheduling
|
||||
* guarantees.
|
||||
*/
|
||||
__lru_add_drain_all(true);
|
||||
@ -888,11 +904,12 @@ void release_pages(struct page **pages, int nr)
|
||||
int i;
|
||||
LIST_HEAD(pages_to_free);
|
||||
struct lruvec *lruvec = NULL;
|
||||
unsigned long flags;
|
||||
unsigned long flags = 0;
|
||||
unsigned int lock_batch;
|
||||
|
||||
for (i = 0; i < nr; i++) {
|
||||
struct page *page = pages[i];
|
||||
struct folio *folio = page_folio(page);
|
||||
|
||||
/*
|
||||
* Make sure the IRQ-safe lock-holding time does not get
|
||||
@ -904,7 +921,7 @@ void release_pages(struct page **pages, int nr)
|
||||
lruvec = NULL;
|
||||
}
|
||||
|
||||
page = compound_head(page);
|
||||
page = &folio->page;
|
||||
if (is_huge_zero_page(page))
|
||||
continue;
|
||||
|
||||
@ -943,7 +960,7 @@ void release_pages(struct page **pages, int nr)
|
||||
if (PageLRU(page)) {
|
||||
struct lruvec *prev_lruvec = lruvec;
|
||||
|
||||
lruvec = relock_page_lruvec_irqsave(page, lruvec,
|
||||
lruvec = folio_lruvec_relock_irqsave(folio, lruvec,
|
||||
&flags);
|
||||
if (prev_lruvec != lruvec)
|
||||
lock_batch = 0;
|
||||
@ -985,17 +1002,18 @@ void __pagevec_release(struct pagevec *pvec)
|
||||
}
|
||||
EXPORT_SYMBOL(__pagevec_release);
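Note: the release_pages() hunks above and the __pagevec_lru_add() hunk below both rely on the batched relock idiom. A minimal sketch of that idiom in isolation; the walker function and its loop body are hypothetical, the lruvec helpers are the ones named in this diff:

/* Illustrative sketch only, not part of this commit: hold the lruvec
 * lock across consecutive folios that belong to the same lruvec and
 * re-acquire it only when the lruvec changes. */
static void example_walk(struct folio_batch *fbatch)
{
	struct lruvec *lruvec = NULL;
	unsigned long flags = 0;
	unsigned int i;

	for (i = 0; i < folio_batch_count(fbatch); i++) {
		struct folio *folio = fbatch->folios[i];

		lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags);
		/* ... operate on folio under the lruvec lock ... */
	}
	if (lruvec)
		unlock_page_lruvec_irqrestore(lruvec, flags);
}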
|
||||
|
||||
static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec)
|
||||
static void __pagevec_lru_add_fn(struct folio *folio, struct lruvec *lruvec)
|
||||
{
|
||||
int was_unevictable = TestClearPageUnevictable(page);
|
||||
int nr_pages = thp_nr_pages(page);
|
||||
int was_unevictable = folio_test_clear_unevictable(folio);
|
||||
long nr_pages = folio_nr_pages(folio);
|
||||
|
||||
VM_BUG_ON_PAGE(PageLRU(page), page);
|
||||
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
|
||||
|
||||
/*
|
||||
* Page becomes evictable in two ways:
|
||||
* A folio becomes evictable in two ways:
|
||||
* 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()].
|
||||
* 2) Before acquiring LRU lock to put the page to correct LRU and then
|
||||
* 2) Before acquiring LRU lock to put the folio on the correct LRU
|
||||
* and then
|
||||
* a) do PageLRU check with lock [check_move_unevictable_pages]
|
||||
* b) do PageLRU check before lock [clear_page_mlock]
|
||||
*
|
||||
@ -1004,35 +1022,36 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec)
|
||||
*
|
||||
* #0: __pagevec_lru_add_fn #1: clear_page_mlock
|
||||
*
|
||||
* SetPageLRU() TestClearPageMlocked()
|
||||
* folio_set_lru() folio_test_clear_mlocked()
|
||||
* smp_mb() // explicit ordering // above provides strict
|
||||
* // ordering
|
||||
* PageMlocked() PageLRU()
|
||||
* folio_test_mlocked() folio_test_lru()
|
||||
*
|
||||
*
|
||||
* if '#1' does not observe setting of PG_lru by '#0' and fails
|
||||
* isolation, the explicit barrier will make sure that page_evictable
|
||||
* check will put the page in correct LRU. Without smp_mb(), SetPageLRU
|
||||
* can be reordered after PageMlocked check and can make '#1' to fail
|
||||
* the isolation of the page whose Mlocked bit is cleared (#0 is also
|
||||
* looking at the same page) and the evictable page will be stranded
|
||||
* in an unevictable LRU.
|
||||
* if '#1' does not observe setting of PG_lru by '#0' and
|
||||
* fails isolation, the explicit barrier will make sure that
|
||||
* folio_evictable check will put the folio on the correct
|
||||
* LRU. Without smp_mb(), folio_set_lru() can be reordered
|
||||
* after folio_test_mlocked() check and can make '#1' fail the
|
||||
* isolation of the folio whose mlocked bit is cleared (#0 is
|
||||
* also looking at the same folio) and the evictable folio will
|
||||
* be stranded on an unevictable LRU.
|
||||
*/
|
||||
SetPageLRU(page);
|
||||
folio_set_lru(folio);
|
||||
smp_mb__after_atomic();
|
||||
|
||||
if (page_evictable(page)) {
|
||||
if (folio_evictable(folio)) {
|
||||
if (was_unevictable)
|
||||
__count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
|
||||
} else {
|
||||
ClearPageActive(page);
|
||||
SetPageUnevictable(page);
|
||||
folio_clear_active(folio);
|
||||
folio_set_unevictable(folio);
|
||||
if (!was_unevictable)
|
||||
__count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
|
||||
}
|
||||
|
||||
add_page_to_lru_list(page, lruvec);
|
||||
trace_mm_lru_insertion(page);
|
||||
lruvec_add_folio(lruvec, folio);
|
||||
trace_mm_lru_insertion(folio);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1046,10 +1065,10 @@ void __pagevec_lru_add(struct pagevec *pvec)
|
||||
unsigned long flags = 0;
|
||||
|
||||
for (i = 0; i < pagevec_count(pvec); i++) {
|
||||
struct page *page = pvec->pages[i];
|
||||
struct folio *folio = page_folio(pvec->pages[i]);
|
||||
|
||||
lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags);
|
||||
__pagevec_lru_add_fn(page, lruvec);
|
||||
lruvec = folio_lruvec_relock_irqsave(folio, lruvec, &flags);
|
||||
__pagevec_lru_add_fn(folio, lruvec);
|
||||
}
|
||||
if (lruvec)
|
||||
unlock_page_lruvec_irqrestore(lruvec, flags);
|
||||
@ -1058,24 +1077,24 @@ void __pagevec_lru_add(struct pagevec *pvec)
|
||||
}
|
||||
|
||||
/**
|
||||
* pagevec_remove_exceptionals - pagevec exceptionals pruning
|
||||
* @pvec: The pagevec to prune
|
||||
* folio_batch_remove_exceptionals() - Prune non-folios from a batch.
|
||||
* @fbatch: The batch to prune
|
||||
*
|
||||
* find_get_entries() fills both pages and XArray value entries (aka
|
||||
* exceptional entries) into the pagevec. This function prunes all
|
||||
* exceptionals from @pvec without leaving holes, so that it can be
|
||||
* passed on to page-only pagevec operations.
|
||||
* find_get_entries() fills a batch with both folios and shadow/swap/DAX
|
||||
* entries. This function prunes all the non-folio entries from @fbatch
|
||||
* without leaving holes, so that it can be passed on to folio-only batch
|
||||
* operations.
|
||||
*/
|
||||
void pagevec_remove_exceptionals(struct pagevec *pvec)
|
||||
void folio_batch_remove_exceptionals(struct folio_batch *fbatch)
|
||||
{
|
||||
int i, j;
|
||||
unsigned int i, j;
|
||||
|
||||
for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
|
||||
struct page *page = pvec->pages[i];
|
||||
if (!xa_is_value(page))
|
||||
pvec->pages[j++] = page;
|
||||
for (i = 0, j = 0; i < folio_batch_count(fbatch); i++) {
|
||||
struct folio *folio = fbatch->folios[i];
|
||||
if (!xa_is_value(folio))
|
||||
fbatch->folios[j++] = folio;
|
||||
}
|
||||
pvec->nr = j;
|
||||
fbatch->nr = j;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -30,6 +30,7 @@
|
||||
#include <linux/swap_slots.h>
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/cpumask.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/vmalloc.h>
|
||||
#include <linux/mutex.h>
|
||||
#include <linux/mm.h>
|
||||
|
@ -478,7 +478,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
|
||||
* __read_swap_cache_async(), which has set SWAP_HAS_CACHE
|
||||
* in swap_map, but not yet added its page to swap cache.
|
||||
*/
|
||||
cond_resched();
|
||||
schedule_timeout_uninterruptible(1);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -498,7 +498,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
|
||||
mem_cgroup_swapin_uncharge_swap(entry);
|
||||
|
||||
if (shadow)
|
||||
workingset_refault(page, shadow);
|
||||
workingset_refault(page_folio(page), shadow);
|
||||
|
||||
/* Caller will initiate read into locked page */
|
||||
lru_cache_add(page);
|
||||
|
150
mm/swapfile.c
150
mm/swapfile.c
@ -18,7 +18,7 @@
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/namei.h>
|
||||
#include <linux/shmem_fs.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/blk-cgroup.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/writeback.h>
|
||||
#include <linux/proc_fs.h>
|
||||
@ -49,7 +49,7 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
|
||||
unsigned char);
|
||||
static void free_swap_count_continuations(struct swap_info_struct *);
|
||||
|
||||
DEFINE_SPINLOCK(swap_lock);
|
||||
static DEFINE_SPINLOCK(swap_lock);
|
||||
static unsigned int nr_swapfiles;
|
||||
atomic_long_t nr_swap_pages;
|
||||
/*
|
||||
@ -71,7 +71,7 @@ static const char Unused_offset[] = "Unused swap offset entry ";
|
||||
* all active swap_info_structs
|
||||
* protected with swap_lock, and ordered by priority.
|
||||
*/
|
||||
PLIST_HEAD(swap_active_head);
|
||||
static PLIST_HEAD(swap_active_head);
|
||||
|
||||
/*
|
||||
* all available (active, not full) swap_info_structs
|
||||
@ -1601,31 +1601,30 @@ static bool page_swapped(struct page *page)
|
||||
return false;
|
||||
}
|
||||
|
||||
static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
|
||||
static int page_trans_huge_map_swapcount(struct page *page,
|
||||
int *total_swapcount)
|
||||
{
|
||||
int i, map_swapcount, _total_mapcount, _total_swapcount;
|
||||
int i, map_swapcount, _total_swapcount;
|
||||
unsigned long offset = 0;
|
||||
struct swap_info_struct *si;
|
||||
struct swap_cluster_info *ci = NULL;
|
||||
unsigned char *map = NULL;
|
||||
int mapcount, swapcount = 0;
|
||||
int swapcount = 0;
|
||||
|
||||
/* hugetlbfs shouldn't call it */
|
||||
VM_BUG_ON_PAGE(PageHuge(page), page);
|
||||
|
||||
if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) {
|
||||
mapcount = page_trans_huge_mapcount(page, total_mapcount);
|
||||
if (PageSwapCache(page))
|
||||
swapcount = page_swapcount(page);
|
||||
if (total_swapcount)
|
||||
*total_swapcount = swapcount;
|
||||
return mapcount + swapcount;
|
||||
return swapcount + page_trans_huge_mapcount(page);
|
||||
}
|
||||
|
||||
page = compound_head(page);
|
||||
|
||||
_total_mapcount = _total_swapcount = map_swapcount = 0;
|
||||
_total_swapcount = map_swapcount = 0;
|
||||
if (PageSwapCache(page)) {
|
||||
swp_entry_t entry;
|
||||
|
||||
@ -1639,8 +1638,7 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
|
||||
if (map)
|
||||
ci = lock_cluster(si, offset);
|
||||
for (i = 0; i < HPAGE_PMD_NR; i++) {
|
||||
mapcount = atomic_read(&page[i]._mapcount) + 1;
|
||||
_total_mapcount += mapcount;
|
||||
int mapcount = atomic_read(&page[i]._mapcount) + 1;
|
||||
if (map) {
|
||||
swapcount = swap_count(map[offset + i]);
|
||||
_total_swapcount += swapcount;
|
||||
@ -1648,19 +1646,14 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
|
||||
map_swapcount = max(map_swapcount, mapcount + swapcount);
|
||||
}
|
||||
unlock_cluster(ci);
|
||||
if (PageDoubleMap(page)) {
|
||||
|
||||
if (PageDoubleMap(page))
|
||||
map_swapcount -= 1;
|
||||
_total_mapcount -= HPAGE_PMD_NR;
|
||||
}
|
||||
mapcount = compound_mapcount(page);
|
||||
map_swapcount += mapcount;
|
||||
_total_mapcount += mapcount;
|
||||
if (total_mapcount)
|
||||
*total_mapcount = _total_mapcount;
|
||||
|
||||
if (total_swapcount)
|
||||
*total_swapcount = _total_swapcount;
|
||||
|
||||
return map_swapcount;
|
||||
return map_swapcount + compound_mapcount(page);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1668,22 +1661,15 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount,
|
||||
* to it. And as a side-effect, free up its swap: because the old content
|
||||
* on disk will never be read, and seeking back there to write new content
|
||||
* later would only waste time away from clustering.
|
||||
*
|
||||
* NOTE: total_map_swapcount should not be relied upon by the caller if
|
||||
* reuse_swap_page() returns false, but it may be always overwritten
|
||||
* (see the other implementation for CONFIG_SWAP=n).
|
||||
*/
|
||||
bool reuse_swap_page(struct page *page, int *total_map_swapcount)
|
||||
bool reuse_swap_page(struct page *page)
|
||||
{
|
||||
int count, total_mapcount, total_swapcount;
|
||||
int count, total_swapcount;
|
||||
|
||||
VM_BUG_ON_PAGE(!PageLocked(page), page);
|
||||
if (unlikely(PageKsm(page)))
|
||||
return false;
|
||||
count = page_trans_huge_map_swapcount(page, &total_mapcount,
|
||||
&total_swapcount);
|
||||
if (total_map_swapcount)
|
||||
*total_map_swapcount = total_mapcount + total_swapcount;
|
||||
count = page_trans_huge_map_swapcount(page, &total_swapcount);
|
||||
if (count == 1 && PageSwapCache(page) &&
|
||||
(likely(!PageTransCompound(page)) ||
|
||||
/* The remaining swap count will be freed soon */
|
||||
@ -1917,14 +1903,14 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
|
||||
dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
|
||||
inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
|
||||
get_page(page);
|
||||
set_pte_at(vma->vm_mm, addr, pte,
|
||||
pte_mkold(mk_pte(page, vma->vm_page_prot)));
|
||||
if (page == swapcache) {
|
||||
page_add_anon_rmap(page, vma, addr, false);
|
||||
} else { /* ksm created a completely new copy */
|
||||
page_add_new_anon_rmap(page, vma, addr, false);
|
||||
lru_cache_add_inactive_or_unevictable(page, vma);
|
||||
}
|
||||
set_pte_at(vma->vm_mm, addr, pte,
|
||||
pte_mkold(mk_pte(page, vma->vm_page_prot)));
|
||||
swap_free(entry);
|
||||
out:
|
||||
pte_unmap_unlock(pte, ptl);
|
||||
@ -1937,8 +1923,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
|
||||
|
||||
static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
|
||||
unsigned long addr, unsigned long end,
|
||||
unsigned int type, bool frontswap,
|
||||
unsigned long *fs_pages_to_unuse)
|
||||
unsigned int type)
|
||||
{
|
||||
struct page *page;
|
||||
swp_entry_t entry;
|
||||
@ -1959,9 +1944,6 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
|
||||
continue;
|
||||
|
||||
offset = swp_offset(entry);
|
||||
if (frontswap && !frontswap_test(si, offset))
|
||||
continue;
|
||||
|
||||
pte_unmap(pte);
|
||||
swap_map = &si->swap_map[offset];
|
||||
page = lookup_swap_cache(entry, vma, addr);
|
||||
@ -1993,11 +1975,6 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
|
||||
try_to_free_swap(page);
|
||||
unlock_page(page);
|
||||
put_page(page);
|
||||
|
||||
if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) {
|
||||
ret = FRONTSWAP_PAGES_UNUSED;
|
||||
goto out;
|
||||
}
|
||||
try_next:
|
||||
pte = pte_offset_map(pmd, addr);
|
||||
} while (pte++, addr += PAGE_SIZE, addr != end);
|
||||
@ -2010,8 +1987,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
|
||||
|
||||
static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
|
||||
unsigned long addr, unsigned long end,
|
||||
unsigned int type, bool frontswap,
|
||||
unsigned long *fs_pages_to_unuse)
|
||||
unsigned int type)
|
||||
{
|
||||
pmd_t *pmd;
|
||||
unsigned long next;
|
||||
@ -2023,8 +1999,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
|
||||
next = pmd_addr_end(addr, end);
|
||||
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
|
||||
continue;
|
||||
ret = unuse_pte_range(vma, pmd, addr, next, type,
|
||||
frontswap, fs_pages_to_unuse);
|
||||
ret = unuse_pte_range(vma, pmd, addr, next, type);
|
||||
if (ret)
|
||||
return ret;
|
||||
} while (pmd++, addr = next, addr != end);
|
||||
@ -2033,8 +2008,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
|
||||
|
||||
static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
|
||||
unsigned long addr, unsigned long end,
|
||||
unsigned int type, bool frontswap,
|
||||
unsigned long *fs_pages_to_unuse)
|
||||
unsigned int type)
|
||||
{
|
||||
pud_t *pud;
|
||||
unsigned long next;
|
||||
@ -2045,8 +2019,7 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
|
||||
next = pud_addr_end(addr, end);
|
||||
if (pud_none_or_clear_bad(pud))
|
||||
continue;
|
||||
ret = unuse_pmd_range(vma, pud, addr, next, type,
|
||||
frontswap, fs_pages_to_unuse);
|
||||
ret = unuse_pmd_range(vma, pud, addr, next, type);
|
||||
if (ret)
|
||||
return ret;
|
||||
} while (pud++, addr = next, addr != end);
|
||||
@ -2055,8 +2028,7 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
|
||||
|
||||
static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
|
||||
unsigned long addr, unsigned long end,
|
||||
unsigned int type, bool frontswap,
|
||||
unsigned long *fs_pages_to_unuse)
|
||||
unsigned int type)
|
||||
{
|
||||
p4d_t *p4d;
|
||||
unsigned long next;
|
||||
@ -2067,16 +2039,14 @@ static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
|
||||
next = p4d_addr_end(addr, end);
|
||||
if (p4d_none_or_clear_bad(p4d))
|
||||
continue;
|
||||
ret = unuse_pud_range(vma, p4d, addr, next, type,
|
||||
frontswap, fs_pages_to_unuse);
|
||||
ret = unuse_pud_range(vma, p4d, addr, next, type);
|
||||
if (ret)
|
||||
return ret;
|
||||
} while (p4d++, addr = next, addr != end);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int unuse_vma(struct vm_area_struct *vma, unsigned int type,
|
||||
bool frontswap, unsigned long *fs_pages_to_unuse)
|
||||
static int unuse_vma(struct vm_area_struct *vma, unsigned int type)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
unsigned long addr, end, next;
|
||||
@ -2090,16 +2060,14 @@ static int unuse_vma(struct vm_area_struct *vma, unsigned int type,
|
||||
next = pgd_addr_end(addr, end);
|
||||
if (pgd_none_or_clear_bad(pgd))
|
||||
continue;
|
||||
ret = unuse_p4d_range(vma, pgd, addr, next, type,
|
||||
frontswap, fs_pages_to_unuse);
|
||||
ret = unuse_p4d_range(vma, pgd, addr, next, type);
|
||||
if (ret)
|
||||
return ret;
|
||||
} while (pgd++, addr = next, addr != end);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int unuse_mm(struct mm_struct *mm, unsigned int type,
|
||||
bool frontswap, unsigned long *fs_pages_to_unuse)
|
||||
static int unuse_mm(struct mm_struct *mm, unsigned int type)
|
||||
{
|
||||
struct vm_area_struct *vma;
|
||||
int ret = 0;
|
||||
@ -2107,8 +2075,7 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type,
|
||||
mmap_read_lock(mm);
|
||||
for (vma = mm->mmap; vma; vma = vma->vm_next) {
|
||||
if (vma->anon_vma) {
|
||||
ret = unuse_vma(vma, type, frontswap,
|
||||
fs_pages_to_unuse);
|
||||
ret = unuse_vma(vma, type);
|
||||
if (ret)
|
||||
break;
|
||||
}
|
||||
@ -2124,7 +2091,7 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type,
|
||||
* if there are no inuse entries after prev till end of the map.
|
||||
*/
|
||||
static unsigned int find_next_to_unuse(struct swap_info_struct *si,
|
||||
unsigned int prev, bool frontswap)
|
||||
unsigned int prev)
|
||||
{
|
||||
unsigned int i;
|
||||
unsigned char count;
|
||||
@ -2138,8 +2105,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
|
||||
for (i = prev + 1; i < si->max; i++) {
|
||||
count = READ_ONCE(si->swap_map[i]);
|
||||
if (count && swap_count(count) != SWAP_MAP_BAD)
|
||||
if (!frontswap || frontswap_test(si, i))
|
||||
break;
|
||||
break;
|
||||
if ((i % LATENCY_LIMIT) == 0)
|
||||
cond_resched();
|
||||
}
|
||||
@ -2150,12 +2116,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si,
|
||||
return i;
|
||||
}
|
||||
|
||||
/*
|
||||
* If the boolean frontswap is true, only unuse pages_to_unuse pages;
|
||||
* pages_to_unuse==0 means all pages; ignored if frontswap is false
|
||||
*/
|
||||
int try_to_unuse(unsigned int type, bool frontswap,
|
||||
unsigned long pages_to_unuse)
|
||||
static int try_to_unuse(unsigned int type)
|
||||
{
|
||||
struct mm_struct *prev_mm;
|
||||
struct mm_struct *mm;
|
||||
@ -2169,13 +2130,10 @@ int try_to_unuse(unsigned int type, bool frontswap,
|
||||
if (!READ_ONCE(si->inuse_pages))
|
||||
return 0;
|
||||
|
||||
if (!frontswap)
|
||||
pages_to_unuse = 0;
|
||||
|
||||
retry:
|
||||
retval = shmem_unuse(type, frontswap, &pages_to_unuse);
|
||||
retval = shmem_unuse(type);
|
||||
if (retval)
|
||||
goto out;
|
||||
return retval;
|
||||
|
||||
prev_mm = &init_mm;
|
||||
mmget(prev_mm);
|
||||
@ -2192,11 +2150,10 @@ int try_to_unuse(unsigned int type, bool frontswap,
|
||||
spin_unlock(&mmlist_lock);
|
||||
mmput(prev_mm);
|
||||
prev_mm = mm;
|
||||
retval = unuse_mm(mm, type, frontswap, &pages_to_unuse);
|
||||
|
||||
retval = unuse_mm(mm, type);
|
||||
if (retval) {
|
||||
mmput(prev_mm);
|
||||
goto out;
|
||||
return retval;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -2213,7 +2170,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
|
||||
i = 0;
|
||||
while (READ_ONCE(si->inuse_pages) &&
|
||||
!signal_pending(current) &&
|
||||
(i = find_next_to_unuse(si, i, frontswap)) != 0) {
|
||||
(i = find_next_to_unuse(si, i)) != 0) {
|
||||
|
||||
entry = swp_entry(type, i);
|
||||
page = find_get_page(swap_address_space(entry), i);
|
||||
@ -2231,14 +2188,6 @@ int try_to_unuse(unsigned int type, bool frontswap,
|
||||
try_to_free_swap(page);
|
||||
unlock_page(page);
|
||||
put_page(page);
|
||||
|
||||
/*
|
||||
* For frontswap, we just need to unuse pages_to_unuse, if
|
||||
* it was specified. Need not check frontswap again here as
|
||||
* we already zeroed out pages_to_unuse if not frontswap.
|
||||
*/
|
||||
if (pages_to_unuse && --pages_to_unuse == 0)
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -2256,10 +2205,10 @@ int try_to_unuse(unsigned int type, bool frontswap,
|
||||
if (READ_ONCE(si->inuse_pages)) {
|
||||
if (!signal_pending(current))
|
||||
goto retry;
|
||||
retval = -EINTR;
|
||||
return -EINTR;
|
||||
}
|
||||
out:
|
||||
return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -2477,7 +2426,8 @@ static void enable_swap_info(struct swap_info_struct *p, int prio,
|
||||
struct swap_cluster_info *cluster_info,
|
||||
unsigned long *frontswap_map)
|
||||
{
|
||||
frontswap_init(p->type, frontswap_map);
|
||||
if (IS_ENABLED(CONFIG_FRONTSWAP))
|
||||
frontswap_init(p->type, frontswap_map);
|
||||
spin_lock(&swap_lock);
|
||||
spin_lock(&p->lock);
|
||||
setup_swap_info(p, prio, swap_map, cluster_info);
|
||||
@ -2590,7 +2540,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
|
||||
disable_swap_slots_cache_lock();
|
||||
|
||||
set_current_oom_origin();
|
||||
err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
|
||||
err = try_to_unuse(p->type);
|
||||
clear_current_oom_origin();
|
||||
|
||||
if (err) {
|
||||
@ -2763,7 +2713,7 @@ static int swap_show(struct seq_file *swap, void *v)
|
||||
struct swap_info_struct *si = v;
|
||||
struct file *file;
|
||||
int len;
|
||||
unsigned int bytes, inuse;
|
||||
unsigned long bytes, inuse;
|
||||
|
||||
if (si == SEQ_START_TOKEN) {
|
||||
seq_puts(swap, "Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
|
||||
@ -2775,7 +2725,7 @@ static int swap_show(struct seq_file *swap, void *v)
|
||||
|
||||
file = si->swap_file;
|
||||
len = seq_file_path(swap, file, " \t\n\\");
|
||||
seq_printf(swap, "%*s%s\t%u\t%s%u\t%s%d\n",
|
||||
seq_printf(swap, "%*s%s\t%lu\t%s%lu\t%s%d\n",
|
||||
len < 40 ? 40 - len : 1, " ",
|
||||
S_ISBLK(file_inode(file)->i_mode) ?
|
||||
"partition" : "file\t",
|
||||
@ -3118,7 +3068,7 @@ static bool swap_discardable(struct swap_info_struct *si)
|
||||
{
|
||||
struct request_queue *q = bdev_get_queue(si->bdev);
|
||||
|
||||
if (!q || !blk_queue_discard(q))
|
||||
if (!blk_queue_discard(q))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
@ -3534,13 +3484,13 @@ struct swap_info_struct *page_swap_info(struct page *page)
|
||||
}
|
||||
|
||||
/*
|
||||
* out-of-line __page_file_ methods to avoid include hell.
|
||||
* out-of-line methods to avoid include hell.
|
||||
*/
|
||||
struct address_space *__page_file_mapping(struct page *page)
|
||||
struct address_space *swapcache_mapping(struct folio *folio)
|
||||
{
|
||||
return page_swap_info(page)->swap_file->f_mapping;
|
||||
return page_swap_info(&folio->page)->swap_file->f_mapping;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__page_file_mapping);
|
||||
EXPORT_SYMBOL_GPL(swapcache_mapping);
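Note: page-based users of the old __page_file_mapping() can be bridged through page_folio() in the same way; whether the old helper is actually retained is an assumption, so this is only a sketch:

/* Illustrative sketch only, not part of this commit. */
struct address_space *__page_file_mapping(struct page *page)
{
	return swapcache_mapping(page_folio(page));
}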
|
||||
|
||||
pgoff_t __page_file_index(struct page *page)
|
||||
{
|
||||
|
341
mm/truncate.c
341
mm/truncate.c
@ -22,7 +22,6 @@
|
||||
#include <linux/buffer_head.h> /* grr. try_to_release_page,
|
||||
do_invalidatepage */
|
||||
#include <linux/shmem_fs.h>
|
||||
#include <linux/cleancache.h>
|
||||
#include <linux/rmap.h>
|
||||
#include "internal.h"
|
||||
|
||||
@ -45,18 +44,22 @@ static inline void __clear_shadow_entry(struct address_space *mapping,
|
||||
static void clear_shadow_entry(struct address_space *mapping, pgoff_t index,
|
||||
void *entry)
|
||||
{
|
||||
spin_lock(&mapping->host->i_lock);
|
||||
xa_lock_irq(&mapping->i_pages);
|
||||
__clear_shadow_entry(mapping, index, entry);
|
||||
xa_unlock_irq(&mapping->i_pages);
|
||||
if (mapping_shrinkable(mapping))
|
||||
inode_add_lru(mapping->host);
|
||||
spin_unlock(&mapping->host->i_lock);
|
||||
}
|
||||
|
||||
/*
|
||||
* Unconditionally remove exceptional entries. Usually called from truncate
|
||||
* path. Note that the pagevec may be altered by this function by removing
|
||||
* exceptional entries similar to what pagevec_remove_exceptionals does.
|
||||
* path. Note that the folio_batch may be altered by this function by removing
|
||||
* exceptional entries similar to what folio_batch_remove_exceptionals() does.
|
||||
*/
|
||||
static void truncate_exceptional_pvec_entries(struct address_space *mapping,
|
||||
struct pagevec *pvec, pgoff_t *indices)
|
||||
static void truncate_folio_batch_exceptionals(struct address_space *mapping,
|
||||
struct folio_batch *fbatch, pgoff_t *indices)
|
||||
{
|
||||
int i, j;
|
||||
bool dax;
|
||||
@ -65,23 +68,25 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping,
|
||||
if (shmem_mapping(mapping))
|
||||
return;
|
||||
|
||||
for (j = 0; j < pagevec_count(pvec); j++)
|
||||
if (xa_is_value(pvec->pages[j]))
|
||||
for (j = 0; j < folio_batch_count(fbatch); j++)
|
||||
if (xa_is_value(fbatch->folios[j]))
|
||||
break;
|
||||
|
||||
if (j == pagevec_count(pvec))
|
||||
if (j == folio_batch_count(fbatch))
|
||||
return;
|
||||
|
||||
dax = dax_mapping(mapping);
|
||||
if (!dax)
|
||||
if (!dax) {
|
||||
spin_lock(&mapping->host->i_lock);
|
||||
xa_lock_irq(&mapping->i_pages);
|
||||
}
|
||||
|
||||
for (i = j; i < pagevec_count(pvec); i++) {
|
||||
struct page *page = pvec->pages[i];
|
||||
for (i = j; i < folio_batch_count(fbatch); i++) {
|
||||
struct folio *folio = fbatch->folios[i];
|
||||
pgoff_t index = indices[i];
|
||||
|
||||
if (!xa_is_value(page)) {
|
||||
pvec->pages[j++] = page;
|
||||
if (!xa_is_value(folio)) {
|
||||
fbatch->folios[j++] = folio;
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -90,12 +95,16 @@ static void truncate_exceptional_pvec_entries(struct address_space *mapping,
|
||||
continue;
|
||||
}
|
||||
|
||||
__clear_shadow_entry(mapping, index, page);
|
||||
__clear_shadow_entry(mapping, index, folio);
|
||||
}
|
||||
|
||||
if (!dax)
|
||||
if (!dax) {
|
||||
xa_unlock_irq(&mapping->i_pages);
|
||||
pvec->nr = j;
|
||||
if (mapping_shrinkable(mapping))
|
||||
inode_add_lru(mapping->host);
|
||||
spin_unlock(&mapping->host->i_lock);
|
||||
}
|
||||
fbatch->nr = j;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -167,21 +176,21 @@ void do_invalidatepage(struct page *page, unsigned int offset,
|
||||
* its lock, b) when a concurrent invalidate_mapping_pages got there first and
|
||||
* c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
|
||||
*/
|
||||
static void truncate_cleanup_page(struct page *page)
|
||||
static void truncate_cleanup_folio(struct folio *folio)
|
||||
{
|
||||
if (page_mapped(page))
|
||||
unmap_mapping_page(page);
|
||||
if (folio_mapped(folio))
|
||||
unmap_mapping_folio(folio);
|
||||
|
||||
if (page_has_private(page))
|
||||
do_invalidatepage(page, 0, thp_size(page));
|
||||
if (folio_has_private(folio))
|
||||
do_invalidatepage(&folio->page, 0, folio_size(folio));
|
||||
|
||||
/*
|
||||
* Some filesystems seem to re-dirty the page even after
|
||||
* the VM has canceled the dirty bit (eg ext3 journaling).
|
||||
* Hence dirty accounting check is placed after invalidation.
|
||||
*/
|
||||
cancel_dirty_page(page);
|
||||
ClearPageMappedToDisk(page);
|
||||
folio_cancel_dirty(folio);
|
||||
folio_clear_mappedtodisk(folio);
|
||||
}
|
||||
|
||||
/*
|
||||
@ -195,7 +204,6 @@ static void truncate_cleanup_page(struct page *page)
|
||||
static int
|
||||
invalidate_complete_page(struct address_space *mapping, struct page *page)
|
||||
{
|
||||
int ret;
|
||||
|
||||
if (page->mapping != mapping)
|
||||
return 0;
|
||||
@ -203,28 +211,77 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
|
||||
if (page_has_private(page) && !try_to_release_page(page, 0))
|
||||
return 0;
|
||||
|
||||
ret = remove_mapping(mapping, page);
|
||||
|
||||
return ret;
|
||||
return remove_mapping(mapping, page);
|
||||
}
|
||||
|
||||
int truncate_inode_page(struct address_space *mapping, struct page *page)
|
||||
int truncate_inode_folio(struct address_space *mapping, struct folio *folio)
|
||||
{
|
||||
VM_BUG_ON_PAGE(PageTail(page), page);
|
||||
|
||||
if (page->mapping != mapping)
|
||||
if (folio->mapping != mapping)
|
||||
return -EIO;
|
||||
|
||||
truncate_cleanup_page(page);
|
||||
delete_from_page_cache(page);
|
||||
truncate_cleanup_folio(folio);
|
||||
filemap_remove_folio(folio);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Handle partial folios. The folio may be entirely within the
|
||||
* range if a split has raced with us. If not, we zero the part of the
|
||||
* folio that's within the [start, end] range, and then split the folio if
|
||||
* it's large. split_page_range() will discard pages which now lie beyond
|
||||
* i_size, and we rely on the caller to discard pages which lie within a
|
||||
* newly created hole.
|
||||
*
|
||||
* Returns false if splitting failed so the caller can avoid
|
||||
* discarding the entire folio which is stubbornly unsplit.
|
||||
*/
|
||||
bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
|
||||
{
|
||||
loff_t pos = folio_pos(folio);
|
||||
unsigned int offset, length;
|
||||
|
||||
if (pos < start)
|
||||
offset = start - pos;
|
||||
else
|
||||
offset = 0;
|
||||
length = folio_size(folio);
|
||||
if (pos + length <= (u64)end)
|
||||
length = length - offset;
|
||||
else
|
||||
length = end + 1 - pos - offset;
|
||||
|
||||
folio_wait_writeback(folio);
|
||||
if (length == folio_size(folio)) {
|
||||
truncate_inode_folio(folio->mapping, folio);
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* We may be zeroing pages we're about to discard, but it avoids
|
||||
* doing a complex calculation here, and then doing the zeroing
|
||||
* anyway if the page split fails.
|
||||
*/
|
||||
folio_zero_range(folio, offset, length);
|
||||
|
||||
if (folio_has_private(folio))
|
||||
do_invalidatepage(&folio->page, offset, length);
|
||||
if (!folio_test_large(folio))
|
||||
return true;
|
||||
if (split_huge_page(&folio->page) == 0)
|
||||
return true;
|
||||
if (folio_test_dirty(folio))
|
||||
return false;
|
||||
truncate_inode_folio(folio->mapping, folio);
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
* Used to get rid of pages on hardware memory corruption.
|
||||
*/
|
||||
int generic_error_remove_page(struct address_space *mapping, struct page *page)
|
||||
{
|
||||
VM_BUG_ON_PAGE(PageTail(page), page);
|
||||
|
||||
if (!mapping)
|
||||
return -EINVAL;
|
||||
/*
|
||||
@ -233,7 +290,7 @@ int generic_error_remove_page(struct address_space *mapping, struct page *page)
|
||||
*/
|
||||
if (!S_ISREG(mapping->host->i_mode))
|
||||
return -EIO;
|
||||
return truncate_inode_page(mapping, page);
|
||||
return truncate_inode_folio(mapping, page_folio(page));
|
||||
}
|
||||
EXPORT_SYMBOL(generic_error_remove_page);
|
||||
|
||||
@ -284,19 +341,15 @@ void truncate_inode_pages_range(struct address_space *mapping,
|
||||
{
|
||||
pgoff_t start; /* inclusive */
|
||||
pgoff_t end; /* exclusive */
|
||||
unsigned int partial_start; /* inclusive */
|
||||
unsigned int partial_end; /* exclusive */
|
||||
struct pagevec pvec;
|
||||
struct folio_batch fbatch;
|
||||
pgoff_t indices[PAGEVEC_SIZE];
|
||||
pgoff_t index;
|
||||
int i;
|
||||
struct folio *folio;
|
||||
bool same_folio;
|
||||
|
||||
if (mapping_empty(mapping))
|
||||
goto out;
|
||||
|
||||
/* Offsets within partial pages */
|
||||
partial_start = lstart & (PAGE_SIZE - 1);
|
||||
partial_end = (lend + 1) & (PAGE_SIZE - 1);
|
||||
return;
|
||||
|
||||
/*
|
||||
* 'start' and 'end' always covers the range of pages to be fully
|
||||
@ -315,64 +368,49 @@ void truncate_inode_pages_range(struct address_space *mapping,
|
||||
else
|
||||
end = (lend + 1) >> PAGE_SHIFT;
|
||||
|
||||
pagevec_init(&pvec);
|
||||
folio_batch_init(&fbatch);
|
||||
index = start;
|
||||
while (index < end && find_lock_entries(mapping, index, end - 1,
|
||||
&pvec, indices)) {
|
||||
index = indices[pagevec_count(&pvec) - 1] + 1;
|
||||
truncate_exceptional_pvec_entries(mapping, &pvec, indices);
|
||||
for (i = 0; i < pagevec_count(&pvec); i++)
|
||||
truncate_cleanup_page(pvec.pages[i]);
|
||||
delete_from_page_cache_batch(mapping, &pvec);
|
||||
for (i = 0; i < pagevec_count(&pvec); i++)
|
||||
unlock_page(pvec.pages[i]);
|
||||
pagevec_release(&pvec);
|
||||
&fbatch, indices)) {
|
||||
index = indices[folio_batch_count(&fbatch) - 1] + 1;
|
||||
truncate_folio_batch_exceptionals(mapping, &fbatch, indices);
|
||||
for (i = 0; i < folio_batch_count(&fbatch); i++)
|
||||
truncate_cleanup_folio(fbatch.folios[i]);
|
||||
delete_from_page_cache_batch(mapping, &fbatch);
|
||||
for (i = 0; i < folio_batch_count(&fbatch); i++)
|
||||
folio_unlock(fbatch.folios[i]);
|
||||
folio_batch_release(&fbatch);
|
||||
cond_resched();
|
||||
}
|
||||
|
||||
if (partial_start) {
|
||||
struct page *page = find_lock_page(mapping, start - 1);
|
||||
if (page) {
|
||||
unsigned int top = PAGE_SIZE;
|
||||
if (start > end) {
|
||||
/* Truncation within a single page */
|
||||
top = partial_end;
|
||||
partial_end = 0;
|
||||
}
|
||||
wait_on_page_writeback(page);
|
||||
zero_user_segment(page, partial_start, top);
|
||||
cleancache_invalidate_page(mapping, page);
|
||||
if (page_has_private(page))
|
||||
do_invalidatepage(page, partial_start,
|
||||
top - partial_start);
|
||||
unlock_page(page);
|
||||
put_page(page);
|
||||
same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
|
||||
folio = __filemap_get_folio(mapping, lstart >> PAGE_SHIFT, FGP_LOCK, 0);
|
||||
if (folio) {
|
||||
same_folio = lend < folio_pos(folio) + folio_size(folio);
|
||||
if (!truncate_inode_partial_folio(folio, lstart, lend)) {
|
||||
start = folio->index + folio_nr_pages(folio);
|
||||
if (same_folio)
|
||||
end = folio->index;
|
||||
}
|
||||
folio_unlock(folio);
|
||||
folio_put(folio);
|
||||
folio = NULL;
|
||||
}
|
||||
if (partial_end) {
|
||||
struct page *page = find_lock_page(mapping, end);
|
||||
if (page) {
|
||||
wait_on_page_writeback(page);
|
||||
zero_user_segment(page, 0, partial_end);
|
||||
cleancache_invalidate_page(mapping, page);
|
||||
if (page_has_private(page))
|
||||
do_invalidatepage(page, 0,
|
||||
partial_end);
|
||||
unlock_page(page);
|
||||
put_page(page);
|
||||
}
|
||||
|
||||
if (!same_folio)
|
||||
folio = __filemap_get_folio(mapping, lend >> PAGE_SHIFT,
|
||||
FGP_LOCK, 0);
|
||||
if (folio) {
|
||||
if (!truncate_inode_partial_folio(folio, lstart, lend))
|
||||
end = folio->index;
|
||||
folio_unlock(folio);
|
||||
folio_put(folio);
|
||||
}
|
||||
/*
|
||||
* If the truncation happened within a single page no pages
|
||||
* will be released, just zeroed, so we can bail out now.
|
||||
*/
|
||||
if (start >= end)
|
||||
goto out;
|
||||
|
||||
index = start;
|
||||
for ( ; ; ) {
|
||||
while (index < end) {
|
||||
cond_resched();
|
||||
if (!find_get_entries(mapping, index, end - 1, &pvec,
|
||||
if (!find_get_entries(mapping, index, end - 1, &fbatch,
|
||||
indices)) {
|
||||
/* If all gone from start onwards, we're done */
|
||||
if (index == start)
|
||||
@ -382,28 +420,26 @@ void truncate_inode_pages_range(struct address_space *mapping,
|
||||
continue;
|
||||
}
|
||||
|
||||
for (i = 0; i < pagevec_count(&pvec); i++) {
|
||||
struct page *page = pvec.pages[i];
|
||||
for (i = 0; i < folio_batch_count(&fbatch); i++) {
|
||||
struct folio *folio = fbatch.folios[i];
|
||||
|
||||
/* We rely upon deletion not changing page->index */
|
||||
index = indices[i];
|
||||
|
||||
if (xa_is_value(page))
|
||||
if (xa_is_value(folio))
|
||||
continue;
|
||||
|
||||
lock_page(page);
|
||||
WARN_ON(page_to_index(page) != index);
|
||||
wait_on_page_writeback(page);
|
||||
truncate_inode_page(mapping, page);
|
||||
unlock_page(page);
|
||||
folio_lock(folio);
|
||||
VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
|
||||
folio_wait_writeback(folio);
|
||||
truncate_inode_folio(mapping, folio);
|
||||
folio_unlock(folio);
|
||||
index = folio_index(folio) + folio_nr_pages(folio) - 1;
|
||||
}
|
||||
truncate_exceptional_pvec_entries(mapping, &pvec, indices);
|
||||
pagevec_release(&pvec);
|
||||
truncate_folio_batch_exceptionals(mapping, &fbatch, indices);
|
||||
folio_batch_release(&fbatch);
|
||||
index++;
|
||||
}
|
||||
|
||||
out:
|
||||
cleancache_invalidate_inode(mapping);
|
||||
}
|
||||
EXPORT_SYMBOL(truncate_inode_pages_range);
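Note: a worked example of the offset/length arithmetic in truncate_inode_partial_folio() above, with hypothetical numbers:

/*
 * Hypothetical values: a 64KiB folio at pos = 0, truncation range
 * lstart = 3000, lend = 20000 (lend is inclusive).
 *   pos < start            -> offset = 3000 - 0 = 3000
 *   pos + 65536 > end + 1   -> length = 20000 + 1 - 0 - 3000 = 17001
 * so bytes [3000, 20001) of the folio are zeroed before the split of
 * the large folio is attempted.
 */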
|
||||
|
||||
@ -457,10 +493,6 @@ void truncate_inode_pages_final(struct address_space *mapping)
|
||||
xa_unlock_irq(&mapping->i_pages);
|
||||
}
|
||||
|
||||
/*
|
||||
* Cleancache needs notification even if there are no pages or shadow
|
||||
* entries.
|
||||
*/
|
||||
truncate_inode_pages(mapping, 0);
|
||||
}
|
||||
EXPORT_SYMBOL(truncate_inode_pages_final);
|
||||
@ -469,16 +501,16 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping,
|
||||
pgoff_t start, pgoff_t end, unsigned long *nr_pagevec)
|
||||
{
|
||||
pgoff_t indices[PAGEVEC_SIZE];
|
||||
struct pagevec pvec;
|
||||
struct folio_batch fbatch;
|
||||
pgoff_t index = start;
|
||||
unsigned long ret;
|
||||
unsigned long count = 0;
|
||||
int i;
|
||||
|
||||
pagevec_init(&pvec);
|
||||
while (find_lock_entries(mapping, index, end, &pvec, indices)) {
|
||||
for (i = 0; i < pagevec_count(&pvec); i++) {
|
||||
struct page *page = pvec.pages[i];
|
||||
folio_batch_init(&fbatch);
|
||||
while (find_lock_entries(mapping, index, end, &fbatch, indices)) {
|
||||
for (i = 0; i < folio_batch_count(&fbatch); i++) {
|
||||
struct page *page = &fbatch.folios[i]->page;
|
||||
|
||||
/* We rely upon deletion not changing page->index */
|
||||
index = indices[i];
|
||||
@ -505,8 +537,8 @@ static unsigned long __invalidate_mapping_pages(struct address_space *mapping,
|
||||
}
|
||||
count += ret;
|
||||
}
|
||||
pagevec_remove_exceptionals(&pvec);
|
||||
pagevec_release(&pvec);
|
||||
folio_batch_remove_exceptionals(&fbatch);
|
||||
folio_batch_release(&fbatch);
|
||||
cond_resched();
|
||||
index++;
|
||||
}
|
||||
@ -558,40 +590,43 @@ void invalidate_mapping_pagevec(struct address_space *mapping,
|
||||
* shrink_page_list() has a temp ref on them, or because they're transiently
|
||||
* sitting in the lru_cache_add() pagevecs.
|
||||
*/
|
||||
static int
|
||||
invalidate_complete_page2(struct address_space *mapping, struct page *page)
|
||||
static int invalidate_complete_folio2(struct address_space *mapping,
|
||||
struct folio *folio)
|
||||
{
|
||||
if (page->mapping != mapping)
|
||||
if (folio->mapping != mapping)
|
||||
return 0;
|
||||
|
||||
if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
|
||||
if (folio_has_private(folio) &&
|
||||
!filemap_release_folio(folio, GFP_KERNEL))
|
||||
return 0;
|
||||
|
||||
spin_lock(&mapping->host->i_lock);
|
||||
xa_lock_irq(&mapping->i_pages);
|
||||
if (PageDirty(page))
|
||||
if (folio_test_dirty(folio))
|
||||
goto failed;
|
||||
|
||||
BUG_ON(page_has_private(page));
|
||||
__delete_from_page_cache(page, NULL);
|
||||
BUG_ON(folio_has_private(folio));
|
||||
__filemap_remove_folio(folio, NULL);
|
||||
xa_unlock_irq(&mapping->i_pages);
|
||||
if (mapping_shrinkable(mapping))
|
||||
inode_add_lru(mapping->host);
|
||||
spin_unlock(&mapping->host->i_lock);
|
||||
|
||||
if (mapping->a_ops->freepage)
|
||||
mapping->a_ops->freepage(page);
|
||||
|
||||
put_page(page); /* pagecache ref */
|
||||
filemap_free_folio(mapping, folio);
|
||||
return 1;
|
||||
failed:
|
||||
xa_unlock_irq(&mapping->i_pages);
|
||||
spin_unlock(&mapping->host->i_lock);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int do_launder_page(struct address_space *mapping, struct page *page)
|
||||
static int do_launder_folio(struct address_space *mapping, struct folio *folio)
|
||||
{
|
||||
if (!PageDirty(page))
|
||||
if (!folio_test_dirty(folio))
|
||||
return 0;
|
||||
if (page->mapping != mapping || mapping->a_ops->launder_page == NULL)
|
||||
if (folio->mapping != mapping || mapping->a_ops->launder_page == NULL)
|
||||
return 0;
|
||||
return mapping->a_ops->launder_page(page);
|
||||
return mapping->a_ops->launder_page(&folio->page);
|
||||
}
|
||||
|
||||
/**
|
||||
@ -609,7 +644,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
|
||||
pgoff_t start, pgoff_t end)
|
||||
{
|
||||
pgoff_t indices[PAGEVEC_SIZE];
|
||||
struct pagevec pvec;
|
||||
struct folio_batch fbatch;
|
||||
pgoff_t index;
|
||||
int i;
|
||||
int ret = 0;
|
||||
@ -617,27 +652,27 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
|
||||
int did_range_unmap = 0;
|
||||
|
||||
if (mapping_empty(mapping))
|
||||
goto out;
|
||||
return 0;
|
||||
|
||||
pagevec_init(&pvec);
|
||||
folio_batch_init(&fbatch);
|
||||
index = start;
|
||||
while (find_get_entries(mapping, index, end, &pvec, indices)) {
|
||||
for (i = 0; i < pagevec_count(&pvec); i++) {
|
||||
struct page *page = pvec.pages[i];
|
||||
while (find_get_entries(mapping, index, end, &fbatch, indices)) {
|
||||
for (i = 0; i < folio_batch_count(&fbatch); i++) {
|
||||
struct folio *folio = fbatch.folios[i];
|
||||
|
||||
/* We rely upon deletion not changing page->index */
|
||||
/* We rely upon deletion not changing folio->index */
|
||||
index = indices[i];
|
||||
|
||||
if (xa_is_value(page)) {
|
||||
if (xa_is_value(folio)) {
|
||||
if (!invalidate_exceptional_entry2(mapping,
|
||||
index, page))
|
||||
index, folio))
|
||||
ret = -EBUSY;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!did_range_unmap && page_mapped(page)) {
|
||||
if (!did_range_unmap && folio_mapped(folio)) {
|
||||
/*
|
||||
* If page is mapped, before taking its lock,
|
||||
* If folio is mapped, before taking its lock,
|
||||
* zap the rest of the file in one hit.
|
||||
*/
|
||||
unmap_mapping_pages(mapping, index,
|
||||
@ -645,29 +680,29 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
|
||||
did_range_unmap = 1;
|
||||
}
|
||||
|
||||
lock_page(page);
|
||||
WARN_ON(page_to_index(page) != index);
|
||||
if (page->mapping != mapping) {
|
||||
unlock_page(page);
|
||||
folio_lock(folio);
|
||||
VM_BUG_ON_FOLIO(!folio_contains(folio, index), folio);
|
||||
if (folio->mapping != mapping) {
|
||||
folio_unlock(folio);
|
||||
continue;
|
||||
}
|
||||
wait_on_page_writeback(page);
|
||||
folio_wait_writeback(folio);
|
||||
|
||||
if (page_mapped(page))
|
||||
unmap_mapping_page(page);
|
||||
BUG_ON(page_mapped(page));
|
||||
if (folio_mapped(folio))
|
||||
unmap_mapping_folio(folio);
|
||||
BUG_ON(folio_mapped(folio));
|
||||
|
||||
ret2 = do_launder_page(mapping, page);
|
||||
ret2 = do_launder_folio(mapping, folio);
|
||||
if (ret2 == 0) {
|
||||
if (!invalidate_complete_page2(mapping, page))
|
||||
if (!invalidate_complete_folio2(mapping, folio))
|
||||
ret2 = -EBUSY;
|
||||
}
|
||||
if (ret2 < 0)
|
||||
ret = ret2;
|
||||
unlock_page(page);
|
||||
folio_unlock(folio);
|
||||
}
|
||||
pagevec_remove_exceptionals(&pvec);
|
||||
pagevec_release(&pvec);
|
||||
folio_batch_remove_exceptionals(&fbatch);
|
||||
folio_batch_release(&fbatch);
|
||||
cond_resched();
|
||||
index++;
|
||||
}
|
||||
@ -681,8 +716,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
|
||||
if (dax_mapping(mapping)) {
|
||||
unmap_mapping_pages(mapping, start, end - start + 1, false);
|
||||
}
|
||||
out:
|
||||
cleancache_invalidate_inode(mapping);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range);
|
||||
|
@ -20,6 +20,7 @@
|
||||
#include <linux/atomic.h>
|
||||
#include <linux/jump_label.h>
|
||||
#include <asm/sections.h>
|
||||
#include "slab.h"
|
||||
|
||||
/*
|
||||
* Checks if a given pointer and length is contained by the current
|
||||
@ -223,7 +224,7 @@ static inline void check_page_span(const void *ptr, unsigned long n,
|
||||
static inline void check_heap_object(const void *ptr, unsigned long n,
|
||||
bool to_user)
|
||||
{
|
||||
struct page *page;
|
||||
struct folio *folio;
|
||||
|
||||
if (!virt_addr_valid(ptr))
|
||||
return;
|
||||
@ -231,16 +232,16 @@ static inline void check_heap_object(const void *ptr, unsigned long n,
|
||||
/*
|
||||
* When CONFIG_HIGHMEM=y, kmap_to_page() will give either the
|
||||
* highmem page or fallback to virt_to_page(). The following
|
||||
* is effectively a highmem-aware virt_to_head_page().
|
||||
* is effectively a highmem-aware virt_to_slab().
|
||||
*/
|
||||
page = compound_head(kmap_to_page((void *)ptr));
|
||||
folio = page_folio(kmap_to_page((void *)ptr));
|
||||
|
||||
if (PageSlab(page)) {
|
||||
if (folio_test_slab(folio)) {
|
||||
/* Check slab allocator for flags and size. */
|
||||
__check_heap_object(ptr, n, page, to_user);
|
||||
__check_heap_object(ptr, n, folio_slab(folio), to_user);
|
||||
} else {
|
||||
/* Verify object does not incorrectly span multiple pages. */
|
||||
check_page_span(ptr, n, page, to_user);
|
||||
check_page_span(ptr, n, folio_page(folio, 0), to_user);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -69,10 +69,9 @@ int mfill_atomic_install_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd,
|
||||
pgoff_t offset, max_off;
|
||||
|
||||
_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
|
||||
_dst_pte = pte_mkdirty(_dst_pte);
|
||||
if (page_in_cache && !vm_shared)
|
||||
writable = false;
|
||||
if (writable || !page_in_cache)
|
||||
_dst_pte = pte_mkdirty(_dst_pte);
|
||||
if (writable) {
|
||||
if (wp_copy)
|
||||
_dst_pte = pte_mkuffd_wp(_dst_pte);
|
||||
@ -164,7 +163,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
|
||||
__SetPageUptodate(page);
|
||||
|
||||
ret = -ENOMEM;
|
||||
if (mem_cgroup_charge(page, dst_mm, GFP_KERNEL))
|
||||
if (mem_cgroup_charge(page_folio(page), dst_mm, GFP_KERNEL))
|
||||
goto out_release;
|
||||
|
||||
ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
|
||||
@ -233,6 +232,11 @@ static int mcontinue_atomic_pte(struct mm_struct *dst_mm,
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (PageHWPoison(page)) {
|
||||
ret = -EIO;
|
||||
goto out_release;
|
||||
}
|
||||
|
||||
ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
|
||||
page, false, wp_copy);
|
||||
if (ret)
|
||||
|
126
mm/util.c
126
mm/util.c
@ -549,13 +549,10 @@ EXPORT_SYMBOL(vm_mmap);
|
||||
* Uses kmalloc to get the memory but if the allocation fails then falls back
|
||||
* to the vmalloc allocator. Use kvfree for freeing the memory.
|
||||
*
|
||||
* Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported.
|
||||
* GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier.
|
||||
* __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is
|
||||
* preferable to the vmalloc fallback, due to visible performance drawbacks.
|
||||
*
|
||||
* Please note that any use of gfp flags outside of GFP_KERNEL is careful to not
|
||||
* fall back to vmalloc.
|
||||
*
|
||||
* Return: pointer to the allocated memory of %NULL in case of failure
|
||||
*/
|
||||
void *kvmalloc_node(size_t size, gfp_t flags, int node)
|
||||
@ -563,13 +560,6 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
|
||||
gfp_t kmalloc_flags = flags;
|
||||
void *ret;
|
||||
|
||||
/*
|
||||
* vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables)
|
||||
* so the given set of flags has to be compatible.
|
||||
*/
|
||||
if ((flags & GFP_KERNEL) != GFP_KERNEL)
|
||||
return kmalloc_node(size, flags, node);
|
||||
|
||||
/*
|
||||
* We want to attempt a large physically contiguous block first because
|
||||
* it is less likely to fragment multiple larger blocks and therefore
|
||||
@ -582,6 +572,9 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
|
||||
|
||||
if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL))
|
||||
kmalloc_flags |= __GFP_NORETRY;
|
||||
|
||||
/* nofail semantic is implemented by the vmalloc fallback */
|
||||
kmalloc_flags &= ~__GFP_NOFAIL;
|
||||
}
|
||||
|
||||
ret = kmalloc_node(size, kmalloc_flags, node);
|
||||
@ -656,81 +649,78 @@ void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
|
||||
}
|
||||
EXPORT_SYMBOL(kvrealloc);
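Note: after the kvmalloc_node() hunks above, non-GFP_KERNEL flags no longer force an early kmalloc-only path, and __GFP_NOFAIL is honoured through the vmalloc fallback. A hedged usage sketch; the caller name and size parameter are hypothetical:

/* Illustrative sketch only, not part of this commit. */
static void *example_big_table(size_t size)
{
	/* GFP_KERNEL-compatible flags; may fall back to vmalloc. */
	return kvmalloc_node(size, GFP_KERNEL | __GFP_NOFAIL, NUMA_NO_NODE);
}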
|
||||
|
||||
static inline void *__page_rmapping(struct page *page)
|
||||
{
|
||||
unsigned long mapping;
|
||||
|
||||
mapping = (unsigned long)page->mapping;
|
||||
mapping &= ~PAGE_MAPPING_FLAGS;
|
||||
|
||||
return (void *)mapping;
|
||||
}
|
||||
|
||||
/* Neutral page->mapping pointer to address_space or anon_vma or other */
|
||||
void *page_rmapping(struct page *page)
|
||||
{
|
||||
page = compound_head(page);
|
||||
return __page_rmapping(page);
|
||||
return folio_raw_mapping(page_folio(page));
|
||||
}
|
||||
|
||||
/*
|
||||
* Return true if this page is mapped into pagetables.
|
||||
* For compound page it returns true if any subpage of compound page is mapped.
|
||||
/**
|
||||
* folio_mapped - Is this folio mapped into userspace?
|
||||
* @folio: The folio.
|
||||
*
|
||||
* Return: True if any page in this folio is referenced by user page tables.
|
||||
*/
|
||||
bool page_mapped(struct page *page)
|
||||
bool folio_mapped(struct folio *folio)
|
||||
{
|
||||
int i;
|
||||
long i, nr;
|
||||
|
||||
if (likely(!PageCompound(page)))
|
||||
return atomic_read(&page->_mapcount) >= 0;
|
||||
page = compound_head(page);
|
||||
if (atomic_read(compound_mapcount_ptr(page)) >= 0)
|
||||
if (!folio_test_large(folio))
|
||||
return atomic_read(&folio->_mapcount) >= 0;
|
||||
if (atomic_read(folio_mapcount_ptr(folio)) >= 0)
|
||||
return true;
|
||||
if (PageHuge(page))
|
||||
if (folio_test_hugetlb(folio))
|
||||
return false;
|
||||
for (i = 0; i < compound_nr(page); i++) {
|
||||
if (atomic_read(&page[i]._mapcount) >= 0)
|
||||
|
||||
nr = folio_nr_pages(folio);
|
||||
for (i = 0; i < nr; i++) {
|
||||
if (atomic_read(&folio_page(folio, i)->_mapcount) >= 0)
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
EXPORT_SYMBOL(page_mapped);
|
||||
EXPORT_SYMBOL(folio_mapped);
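Note: for a single page, folio_mapped() on its folio answers the same question the old page_mapped() did. A minimal bridge sketch, not part of this diff; whether such a wrapper is kept for old callers is an assumption:

/* Illustrative sketch only, not part of this commit. */
bool page_mapped(struct page *page)
{
	return folio_mapped(page_folio(page));
}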
|
||||
|
||||
struct anon_vma *page_anon_vma(struct page *page)
|
||||
{
|
||||
unsigned long mapping;
|
||||
struct folio *folio = page_folio(page);
|
||||
unsigned long mapping = (unsigned long)folio->mapping;
|
||||
|
||||
page = compound_head(page);
|
||||
mapping = (unsigned long)page->mapping;
|
||||
if ((mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON)
|
||||
return NULL;
|
||||
return __page_rmapping(page);
|
||||
return (void *)(mapping - PAGE_MAPPING_ANON);
|
||||
}
|
||||
|
||||
struct address_space *page_mapping(struct page *page)
|
||||
/**
|
||||
* folio_mapping - Find the mapping where this folio is stored.
|
||||
* @folio: The folio.
|
||||
*
|
||||
* For folios which are in the page cache, return the mapping that this
|
||||
* page belongs to. Folios in the swap cache return the swap mapping
|
||||
* this page is stored in (which is different from the mapping for the
|
||||
* swap file or swap device where the data is stored).
|
||||
*
|
||||
* You can call this for folios which aren't in the swap cache or page
|
||||
* cache and it will return NULL.
|
||||
*/
|
||||
struct address_space *folio_mapping(struct folio *folio)
|
||||
{
|
||||
struct address_space *mapping;
|
||||
|
||||
page = compound_head(page);
|
||||
|
||||
/* This happens if someone calls flush_dcache_page on slab page */
|
||||
if (unlikely(PageSlab(page)))
|
||||
if (unlikely(folio_test_slab(folio)))
|
||||
return NULL;
|
||||
|
||||
if (unlikely(PageSwapCache(page))) {
|
||||
swp_entry_t entry;
|
||||
if (unlikely(folio_test_swapcache(folio)))
|
||||
return swap_address_space(folio_swap_entry(folio));
|
||||
|
||||
entry.val = page_private(page);
|
||||
return swap_address_space(entry);
|
||||
}
|
||||
|
||||
mapping = page->mapping;
|
||||
mapping = folio->mapping;
|
||||
if ((unsigned long)mapping & PAGE_MAPPING_ANON)
|
||||
return NULL;
|
||||
|
||||
return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS);
|
||||
}
|
||||
EXPORT_SYMBOL(page_mapping);
|
||||
EXPORT_SYMBOL(folio_mapping);
|
||||
|
||||
/* Slow path of page_mapcount() for compound pages */
|
||||
int __page_mapcount(struct page *page)
|
||||
@ -752,13 +742,26 @@ int __page_mapcount(struct page *page)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(__page_mapcount);
|
||||
|
||||
void copy_huge_page(struct page *dst, struct page *src)
|
||||
/**
|
||||
* folio_copy - Copy the contents of one folio to another.
|
||||
* @dst: Folio to copy to.
|
||||
* @src: Folio to copy from.
|
||||
*
|
||||
* The bytes in the folio represented by @src are copied to @dst.
|
||||
* Assumes the caller has validated that @dst is at least as large as @src.
|
||||
* Can be called in atomic context for order-0 folios, but if the folio is
|
||||
* larger, it may sleep.
|
||||
*/
|
||||
void folio_copy(struct folio *dst, struct folio *src)
|
||||
{
|
||||
unsigned i, nr = compound_nr(src);
|
||||
long i = 0;
|
||||
long nr = folio_nr_pages(src);
|
||||
|
||||
for (i = 0; i < nr; i++) {
|
||||
for (;;) {
|
||||
copy_highpage(folio_page(dst, i), folio_page(src, i));
|
||||
if (++i == nr)
|
||||
break;
|
||||
cond_resched();
|
||||
copy_highpage(nth_page(dst, i), nth_page(src, i));
|
||||
}
|
||||
}
|
||||
|
@ -1081,3 +1084,14 @@ void page_offline_end(void)
up_write(&page_offline_rwsem);
}
EXPORT_SYMBOL(page_offline_end);

#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_FOLIO
void flush_dcache_folio(struct folio *folio)
{
long i, nr = folio_nr_pages(folio);

for (i = 0; i < nr; i++)
flush_dcache_page(folio_page(folio, i));
}
EXPORT_SYMBOL(flush_dcache_folio);
#endif

161
mm/vmalloc.c
161
mm/vmalloc.c
@ -31,6 +31,7 @@
#include <linux/kmemleak.h>
#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/memcontrol.h>
#include <linux/llist.h>
#include <linux/bitops.h>
#include <linux/rbtree_augmented.h>
@ -38,6 +39,7 @@
#include <linux/pgtable.h>
#include <linux/uaccess.h>
#include <linux/hugetlb.h>
#include <linux/sched/mm.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>

@ -1195,18 +1197,14 @@ find_vmap_lowest_match(unsigned long size,
{
struct vmap_area *va;
struct rb_node *node;
unsigned long length;

/* Start from the root. */
node = free_vmap_area_root.rb_node;

/* Adjust the search size for alignment overhead. */
length = size + align - 1;

while (node) {
va = rb_entry(node, struct vmap_area, rb_node);

if (get_subtree_max_size(node->rb_left) >= length &&
if (get_subtree_max_size(node->rb_left) >= size &&
vstart < va->va_start) {
node = node->rb_left;
} else {
@ -1216,9 +1214,9 @@ find_vmap_lowest_match(unsigned long size,
/*
* Does not make sense to go deeper towards the right
* sub-tree if it does not have a free block that is
* equal or bigger to the requested search length.
* equal or bigger to the requested search size.
*/
if (get_subtree_max_size(node->rb_right) >= length) {
if (get_subtree_max_size(node->rb_right) >= size) {
node = node->rb_right;
continue;
}
@ -1226,15 +1224,23 @@ find_vmap_lowest_match(unsigned long size,
/*
* OK. We roll back and find the first right sub-tree,
* that will satisfy the search criteria. It can happen
* only once due to "vstart" restriction.
* due to "vstart" restriction or an alignment overhead
* that is bigger then PAGE_SIZE.
*/
while ((node = rb_parent(node))) {
va = rb_entry(node, struct vmap_area, rb_node);
if (is_within_this_va(va, size, align, vstart))
return va;

if (get_subtree_max_size(node->rb_right) >= length &&
if (get_subtree_max_size(node->rb_right) >= size &&
vstart <= va->va_start) {
/*
* Shift the vstart forward. Please note, we update it with
* parent's start address adding "1" because we do not want
* to enter same sub-tree after it has already been checked
* and no suitable free block found there.
*/
vstart = va->va_start + 1;
node = node->rb_right;
break;
}
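For context on the dropped length = size + align - 1 padding above: the "alignment overhead" it accounted for is the gap between a free block's start and the first align-aligned address inside it, which is at most align - 1 bytes. A tiny standalone sketch of that arithmetic only (example values are arbitrary; this says nothing about the full search policy):

/* Illustrative only: worst-case front padding for an aligned allocation. */
#include <stdio.h>

static unsigned long align_up(unsigned long addr, unsigned long align)
{
	return (addr + align - 1) & ~(align - 1); /* align must be a power of two */
}

int main(void)
{
	unsigned long addr = 0x1234, align = 0x1000;   /* example block start and alignment */
	unsigned long overhead = align_up(addr, align) - addr;

	printf("aligned start 0x%lx, overhead 0x%lx (max 0x%lx)\n",
	       align_up(addr, align), overhead, align - 1);
	return 0;
}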
@ -1265,7 +1271,7 @@ find_vmap_lowest_linear_match(unsigned long size,
}

static void
find_vmap_lowest_match_check(unsigned long size)
find_vmap_lowest_match_check(unsigned long size, unsigned long align)
{
struct vmap_area *va_1, *va_2;
unsigned long vstart;
@ -1274,8 +1280,8 @@ find_vmap_lowest_match_check(unsigned long size)
get_random_bytes(&rnd, sizeof(rnd));
vstart = VMALLOC_START + rnd;

va_1 = find_vmap_lowest_match(size, 1, vstart);
va_2 = find_vmap_lowest_linear_match(size, 1, vstart);
va_1 = find_vmap_lowest_match(size, align, vstart);
va_2 = find_vmap_lowest_linear_match(size, align, vstart);

if (va_1 != va_2)
pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
@ -1454,7 +1460,7 @@ __alloc_vmap_area(unsigned long size, unsigned long align,
return vend;

#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
find_vmap_lowest_match_check(size);
find_vmap_lowest_match_check(size, align);
#endif

return nva_start_addr;
@ -2272,15 +2278,22 @@ void __init vm_area_add_early(struct vm_struct *vm)
*/
void __init vm_area_register_early(struct vm_struct *vm, size_t align)
{
static size_t vm_init_off __initdata;
unsigned long addr;
unsigned long addr = ALIGN(VMALLOC_START, align);
struct vm_struct *cur, **p;

addr = ALIGN(VMALLOC_START + vm_init_off, align);
vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START;
BUG_ON(vmap_initialized);

for (p = &vmlist; (cur = *p) != NULL; p = &cur->next) {
if ((unsigned long)cur->addr - addr >= vm->size)
break;
addr = ALIGN((unsigned long)cur->addr + cur->size, align);
}

BUG_ON(addr > VMALLOC_END - vm->size);
vm->addr = (void *)addr;

vm_area_add_early(vm);
vm->next = *p;
*p = vm;
kasan_populate_early_vm_area_shadow(vm->addr, vm->size);
}

static void vmap_init_free_space(void)
@ -2612,12 +2625,13 @@ static void __vunmap(const void *addr, int deallocate_pages)

if (deallocate_pages) {
unsigned int page_order = vm_area_page_order(area);
int i;
int i, step = 1U << page_order;

for (i = 0; i < area->nr_pages; i += 1U << page_order) {
for (i = 0; i < area->nr_pages; i += step) {
struct page *page = area->pages[i];

BUG_ON(!page);
mod_memcg_page_state(page, MEMCG_VMALLOC, -step);
__free_pages(page, page_order);
cond_resched();
}
@ -2743,6 +2757,13 @@ void *vmap(struct page **pages, unsigned int count,

might_sleep();

/*
* Your top guard is someone else's bottom guard. Not having a top
* guard compromises someone else's mappings too.
*/
if (WARN_ON_ONCE(flags & VM_NO_GUARD))
flags &= ~VM_NO_GUARD;

if (count > totalram_pages())
return NULL;

@ -2825,7 +2846,9 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
* to fails, fallback to a single page allocator that is
* more permissive.
*/
if (!order && nid != NUMA_NO_NODE) {
if (!order) {
gfp_t bulk_gfp = gfp & ~__GFP_NOFAIL;

while (nr_allocated < nr_pages) {
unsigned int nr, nr_pages_request;

@ -2837,8 +2860,20 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
*/
nr_pages_request = min(100U, nr_pages - nr_allocated);

nr = alloc_pages_bulk_array_node(gfp, nid,
nr_pages_request, pages + nr_allocated);
/* memory allocation should consider mempolicy, we can't
* wrongly use nearest node when nid == NUMA_NO_NODE,
* otherwise memory may be allocated in only one node,
* but mempolcy want to alloc memory by interleaving.
*/
if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE)
nr = alloc_pages_bulk_array_mempolicy(bulk_gfp,
nr_pages_request,
pages + nr_allocated);

else
nr = alloc_pages_bulk_array_node(bulk_gfp, nid,
nr_pages_request,
pages + nr_allocated);

nr_allocated += nr;
cond_resched();
@ -2850,7 +2885,7 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
if (nr != nr_pages_request)
break;
}
} else if (order)
} else
/*
* Compound pages required for remap_vmalloc_page if
* high-order pages.
@ -2860,6 +2895,9 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
/* High-order pages or fallback path if "bulk" fails. */

while (nr_allocated < nr_pages) {
if (fatal_signal_pending(current))
break;

if (nid == NUMA_NO_NODE)
page = alloc_pages(gfp, order);
else
@ -2887,11 +2925,15 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
int node)
{
const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
const gfp_t orig_gfp_mask = gfp_mask;
bool nofail = gfp_mask & __GFP_NOFAIL;
unsigned long addr = (unsigned long)area->addr;
unsigned long size = get_vm_area_size(area);
unsigned long array_size;
unsigned int nr_small_pages = size >> PAGE_SHIFT;
unsigned int page_order;
unsigned int flags;
int ret;

array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
gfp_mask |= __GFP_NOWARN;
@ -2907,7 +2949,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
}

if (!area->pages) {
warn_alloc(gfp_mask, NULL,
warn_alloc(orig_gfp_mask, NULL,
"vmalloc error: size %lu, failed to allocated page array size %lu",
nr_small_pages * PAGE_SIZE, array_size);
free_vm_area(area);
@ -2921,21 +2963,48 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
page_order, nr_small_pages, area->pages);

atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
if (gfp_mask & __GFP_ACCOUNT) {
int i, step = 1U << page_order;

for (i = 0; i < area->nr_pages; i += step)
mod_memcg_page_state(area->pages[i], MEMCG_VMALLOC,
step);
}

/*
* If not enough pages were obtained to accomplish an
* allocation request, free them via __vfree() if any.
*/
if (area->nr_pages != nr_small_pages) {
warn_alloc(gfp_mask, NULL,
warn_alloc(orig_gfp_mask, NULL,
"vmalloc error: size %lu, page order %u, failed to allocate pages",
area->nr_pages * PAGE_SIZE, page_order);
goto fail;
}

if (vmap_pages_range(addr, addr + size, prot, area->pages,
page_shift) < 0) {
warn_alloc(gfp_mask, NULL,
/*
* page tables allocations ignore external gfp mask, enforce it
* by the scope API
*/
if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
flags = memalloc_nofs_save();
else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
flags = memalloc_noio_save();

do {
ret = vmap_pages_range(addr, addr + size, prot, area->pages,
page_shift);
if (nofail && (ret < 0))
schedule_timeout_uninterruptible(1);
} while (nofail && (ret < 0));

if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO)
memalloc_nofs_restore(flags);
else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0)
memalloc_noio_restore(flags);

if (ret < 0) {
warn_alloc(orig_gfp_mask, NULL,
"vmalloc error: size %lu, failed to map pages",
area->nr_pages * PAGE_SIZE);
goto fail;
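The hunk above wraps the page-table mapping step in the memory-allocation scope API so that GFP_NOFS/GFP_NOIO callers are honoured even though page-table allocations do not take the caller's gfp mask directly. A minimal kernel-context sketch of that pattern (a fragment, not a standalone program; memalloc_nofs_save()/memalloc_nofs_restore() are the existing scope-API helpers):

/* Kernel-context sketch: every allocation between save/restore
 * implicitly behaves as if __GFP_FS had been cleared. */
unsigned int flags;

flags = memalloc_nofs_save();   /* enter NOFS scope */
/* ... allocations here act as GFP_NOFS ... */
memalloc_nofs_restore(flags);   /* leave scope, restore previous state */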
@ -2961,8 +3030,18 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
* @caller: caller's return address
*
* Allocate enough pages to cover @size from the page level
* allocator with @gfp_mask flags. Map them into contiguous
* kernel virtual space, using a pagetable protection of @prot.
* allocator with @gfp_mask flags. Please note that the full set of gfp
* flags are not supported. GFP_KERNEL, GFP_NOFS and GFP_NOIO are all
* supported.
* Zone modifiers are not supported. From the reclaim modifiers
* __GFP_DIRECT_RECLAIM is required (aka GFP_NOWAIT is not supported)
* and only __GFP_NOFAIL is supported (i.e. __GFP_NORETRY and
* __GFP_RETRY_MAYFAIL are not supported).
*
* __GFP_NOWARN can be used to suppress failures messages.
*
* Map them into contiguous kernel virtual space, using a pagetable
* protection of @prot.
*
* Return: the address of the area or %NULL on failure
*/
@ -3014,9 +3093,14 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
VM_UNINITIALIZED | vm_flags, start, end, node,
gfp_mask, caller);
if (!area) {
bool nofail = gfp_mask & __GFP_NOFAIL;
warn_alloc(gfp_mask, NULL,
"vmalloc error: size %lu, vm_struct allocation failed",
real_size);
"vmalloc error: size %lu, vm_struct allocation failed%s",
real_size, (nofail) ? ". Retrying." : "");
if (nofail) {
schedule_timeout_uninterruptible(1);
goto again;
}
goto fail;
}

@ -3857,6 +3941,7 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
{
if (IS_ENABLED(CONFIG_NUMA)) {
unsigned int nr, *counters = m->private;
unsigned int step = 1U << vm_area_page_order(v);

if (!counters)
return;
@ -3868,9 +3953,8 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)

memset(counters, 0, nr_node_ids * sizeof(unsigned int));

for (nr = 0; nr < v->nr_pages; nr++)
counters[page_to_nid(v->pages[nr])]++;

for (nr = 0; nr < v->nr_pages; nr += step)
counters[page_to_nid(v->pages[nr])] += step;
for_each_node_state(nr, N_HIGH_MEMORY)
if (counters[nr])
seq_printf(m, " N%u=%u", nr, counters[nr]);
@ -3906,7 +3990,7 @@ static int s_show(struct seq_file *m, void *p)
(void *)va->va_start, (void *)va->va_end,
va->va_end - va->va_start);

return 0;
goto final;
}

v = va->vm;
@ -3947,6 +4031,7 @@ static int s_show(struct seq_file *m, void *p)
/*
* As a final step, dump "unpurged" areas.
*/
final:
if (list_is_last(&va->list, &vmap_area_list))
show_purge_info(m);

mm/vmpressure.c

@ -308,7 +308,7 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
* asserted for a second in which subsequent
* pressure events can occur.
*/
memcg->socket_pressure = jiffies + HZ;
WRITE_ONCE(memcg->socket_pressure, jiffies + HZ);
}
}
}

235
mm/vmscan.c
235
mm/vmscan.c
@ -687,6 +687,21 @@ void unregister_shrinker(struct shrinker *shrinker)
}
EXPORT_SYMBOL(unregister_shrinker);

/**
* synchronize_shrinkers - Wait for all running shrinkers to complete.
*
* This is equivalent to calling unregister_shrink() and register_shrinker(),
* but atomically and with less overhead. This is useful to guarantee that all
* shrinker invocations have seen an update, before freeing memory, similar to
* rcu.
*/
void synchronize_shrinkers(void)
{
down_write(&shrinker_rwsem);
up_write(&shrinker_rwsem);
}
EXPORT_SYMBOL(synchronize_shrinkers);

#define SHRINK_BATCH 128

static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
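Because running shrinkers hold shrinker_rwsem for read, the write-lock/unlock pair in synchronize_shrinkers() above returns only once every in-flight shrinker invocation has finished. A hedged kernel-context fragment of the intended usage pattern; my_table/global_table and the surrounding shrinker are hypothetical names, not part of the hunk:

/* Sketch only: the shrinker stays registered; we just make sure every
 * invocation that could still see "old" has returned before freeing it,
 * the "similar to rcu" use the comment above describes. */
struct my_table *old = global_table;   /* hypothetical state read by the shrinker */

global_table = new_table;              /* publish the update */
synchronize_shrinkers();               /* all shrinkers running before this are done */
kfree(old);                            /* safe: no shrinker can still be using it */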
@ -936,7 +951,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
return freed;
}

void drop_slab_node(int nid)
static void drop_slab_node(int nid)
{
unsigned long freed;
int shift = 0;
@ -1006,6 +1021,134 @@ static void handle_write_error(struct address_space *mapping,
unlock_page(page);
}

static bool skip_throttle_noprogress(pg_data_t *pgdat)
{
int reclaimable = 0, write_pending = 0;
int i;

/*
* If kswapd is disabled, reschedule if necessary but do not
* throttle as the system is likely near OOM.
*/
if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
return true;

/*
* If there are a lot of dirty/writeback pages then do not
* throttle as throttling will occur when the pages cycle
* towards the end of the LRU if still under writeback.
*/
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;

if (!populated_zone(zone))
continue;

reclaimable += zone_reclaimable_pages(zone);
write_pending += zone_page_state_snapshot(zone,
NR_ZONE_WRITE_PENDING);
}
if (2 * write_pending <= reclaimable)
return true;

return false;
}

void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason)
{
wait_queue_head_t *wqh = &pgdat->reclaim_wait[reason];
long timeout, ret;
DEFINE_WAIT(wait);

/*
* Do not throttle IO workers, kthreads other than kswapd or
* workqueues. They may be required for reclaim to make
* forward progress (e.g. journalling workqueues or kthreads).
*/
if (!current_is_kswapd() &&
current->flags & (PF_IO_WORKER|PF_KTHREAD)) {
cond_resched();
return;
}

/*
* These figures are pulled out of thin air.
* VMSCAN_THROTTLE_ISOLATED is a transient condition based on too many
* parallel reclaimers which is a short-lived event so the timeout is
* short. Failing to make progress or waiting on writeback are
* potentially long-lived events so use a longer timeout. This is shaky
* logic as a failure to make progress could be due to anything from
* writeback to a slow device to excessive references pages at the tail
* of the inactive LRU.
*/
switch(reason) {
case VMSCAN_THROTTLE_WRITEBACK:
timeout = HZ/10;

if (atomic_inc_return(&pgdat->nr_writeback_throttled) == 1) {
WRITE_ONCE(pgdat->nr_reclaim_start,
node_page_state(pgdat, NR_THROTTLED_WRITTEN));
}

break;
case VMSCAN_THROTTLE_CONGESTED:
fallthrough;
case VMSCAN_THROTTLE_NOPROGRESS:
if (skip_throttle_noprogress(pgdat)) {
cond_resched();
return;
}

timeout = 1;

break;
case VMSCAN_THROTTLE_ISOLATED:
timeout = HZ/50;
break;
default:
WARN_ON_ONCE(1);
timeout = HZ;
break;
}

prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
ret = schedule_timeout(timeout);
finish_wait(wqh, &wait);

if (reason == VMSCAN_THROTTLE_WRITEBACK)
atomic_dec(&pgdat->nr_writeback_throttled);

trace_mm_vmscan_throttled(pgdat->node_id, jiffies_to_usecs(timeout),
jiffies_to_usecs(timeout - ret),
reason);
}

/*
* Account for pages written if tasks are throttled waiting on dirty
* pages to clean. If enough pages have been cleaned since throttling
* started then wakeup the throttled tasks.
*/
void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio,
int nr_throttled)
{
unsigned long nr_written;

node_stat_add_folio(folio, NR_THROTTLED_WRITTEN);

/*
* This is an inaccurate read as the per-cpu deltas may not
* be synchronised. However, given that the system is
* writeback throttled, it is not worth taking the penalty
* of getting an accurate count. At worst, the throttle
* timeout guarantees forward progress.
*/
nr_written = node_page_state(pgdat, NR_THROTTLED_WRITTEN) -
READ_ONCE(pgdat->nr_reclaim_start);

if (nr_written > SWAP_CLUSTER_MAX * nr_throttled)
wake_up(&pgdat->reclaim_wait[VMSCAN_THROTTLE_WRITEBACK]);
}

/* possible outcome of pageout() */
typedef enum {
/* failed to write page out, page is locked */
@ -1105,6 +1248,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
BUG_ON(!PageLocked(page));
BUG_ON(mapping != page_mapping(page));

if (!PageSwapCache(page))
spin_lock(&mapping->host->i_lock);
xa_lock_irq(&mapping->i_pages);
/*
* The non racy check for a busy page.
@ -1173,6 +1318,9 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
shadow = workingset_eviction(page, target_memcg);
__delete_from_page_cache(page, shadow);
xa_unlock_irq(&mapping->i_pages);
if (mapping_shrinkable(mapping))
inode_add_lru(mapping->host);
spin_unlock(&mapping->host->i_lock);

if (freepage != NULL)
freepage(page);
@ -1182,6 +1330,8 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,

cannot_free:
xa_unlock_irq(&mapping->i_pages);
if (!PageSwapCache(page))
spin_unlock(&mapping->host->i_lock);
return 0;
}

@ -1337,7 +1487,6 @@ static unsigned int demote_page_list(struct list_head *demote_pages,
{
int target_nid = next_demotion_node(pgdat->node_id);
unsigned int nr_succeeded;
int err;

if (list_empty(demote_pages))
return 0;
@ -1346,7 +1495,7 @@ static unsigned int demote_page_list(struct list_head *demote_pages,
return 0;

/* Demotion ignores all cpuset and mempolicy settings */
err = migrate_pages(demote_pages, alloc_demote_page, NULL,
migrate_pages(demote_pages, alloc_demote_page, NULL,
target_nid, MIGRATE_ASYNC, MR_DEMOTION,
&nr_succeeded);

@ -1412,9 +1561,8 @@ static unsigned int shrink_page_list(struct list_head *page_list,

/*
* The number of dirty pages determines if a node is marked
* reclaim_congested which affects wait_iff_congested. kswapd
* will stall and start writing pages if the tail of the LRU
* is all dirty unqueued pages.
* reclaim_congested. kswapd will stall and start writing
* pages if the tail of the LRU is all dirty unqueued pages.
*/
page_check_dirty_writeback(page, &dirty, &writeback);
if (dirty || writeback)
@ -2090,6 +2238,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
*/
int isolate_lru_page(struct page *page)
{
struct folio *folio = page_folio(page);
int ret = -EBUSY;

VM_BUG_ON_PAGE(!page_count(page), page);
@ -2099,7 +2248,7 @@ int isolate_lru_page(struct page *page)
struct lruvec *lruvec;

get_page(page);
lruvec = lock_page_lruvec_irq(page);
lruvec = folio_lruvec_lock_irq(folio);
del_page_from_lru_list(page, lruvec);
unlock_page_lruvec_irq(lruvec);
ret = 0;
@ -2119,6 +2268,7 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
struct scan_control *sc)
{
unsigned long inactive, isolated;
bool too_many;

if (current_is_kswapd())
return 0;
@ -2142,7 +2292,13 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
if ((sc->gfp_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS))
inactive >>= 3;

return isolated > inactive;
too_many = isolated > inactive;

/* Wake up tasks throttled due to too_many_isolated. */
if (!too_many)
wake_throttle_isolated(pgdat);

return too_many;
}

/*
@ -2199,7 +2355,7 @@ static unsigned int move_pages_to_lru(struct lruvec *lruvec,
* All pages were isolated from the same lruvec (and isolation
* inhibits memcg migration).
*/
VM_BUG_ON_PAGE(!page_matches_lruvec(page, lruvec), page);
VM_BUG_ON_PAGE(!folio_matches_lruvec(page_folio(page), lruvec), page);
add_page_to_lru_list(page, lruvec);
nr_pages = thp_nr_pages(page);
nr_moved += nr_pages;
@ -2251,8 +2407,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
return 0;

/* wait a bit for the reclaimer. */
msleep(100);
stalled = true;
reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED);

/* We are about to die and free our memory. Return now. */
if (fatal_signal_pending(current))
@ -3180,19 +3336,19 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
* If kswapd scans pages marked for immediate
* reclaim and under writeback (nr_immediate), it
* implies that pages are cycling through the LRU
* faster than they are written so also forcibly stall.
* faster than they are written so forcibly stall
* until some pages complete writeback.
*/
if (sc->nr.immediate)
congestion_wait(BLK_RW_ASYNC, HZ/10);
reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
}

/*
* Tag a node/memcg as congested if all the dirty pages
* scanned were backed by a congested BDI and
* wait_iff_congested will stall.
* Tag a node/memcg as congested if all the dirty pages were marked
* for writeback and immediate reclaim (counted in nr.congested).
*
* Legacy memcg will stall in page writeback so avoid forcibly
* stalling in wait_iff_congested().
* stalling in reclaim_throttle().
*/
if ((current_is_kswapd() ||
(cgroup_reclaim(sc) && writeback_throttling_sane(sc))) &&
@ -3200,15 +3356,15 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc)
set_bit(LRUVEC_CONGESTED, &target_lruvec->flags);

/*
* Stall direct reclaim for IO completions if underlying BDIs
* and node is congested. Allow kswapd to continue until it
* Stall direct reclaim for IO completions if the lruvec is
* node is congested. Allow kswapd to continue until it
* starts encountering unqueued dirty pages or cycling through
* the LRU too quickly.
*/
if (!current_is_kswapd() && current_may_throttle() &&
!sc->hibernation_mode &&
test_bit(LRUVEC_CONGESTED, &target_lruvec->flags))
wait_iff_congested(BLK_RW_ASYNC, HZ/10);
reclaim_throttle(pgdat, VMSCAN_THROTTLE_CONGESTED);

if (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
sc))
@ -3256,6 +3412,36 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc)
return zone_watermark_ok_safe(zone, 0, watermark, sc->reclaim_idx);
}

static void consider_reclaim_throttle(pg_data_t *pgdat, struct scan_control *sc)
{
/*
* If reclaim is making progress greater than 12% efficiency then
* wake all the NOPROGRESS throttled tasks.
*/
if (sc->nr_reclaimed > (sc->nr_scanned >> 3)) {
wait_queue_head_t *wqh;

wqh = &pgdat->reclaim_wait[VMSCAN_THROTTLE_NOPROGRESS];
if (waitqueue_active(wqh))
wake_up(wqh);

return;
}

/*
* Do not throttle kswapd or cgroup reclaim on NOPROGRESS as it will
* throttle on VMSCAN_THROTTLE_WRITEBACK if there are too many pages
* under writeback and marked for immediate reclaim at the tail of the
* LRU.
*/
if (current_is_kswapd() || cgroup_reclaim(sc))
return;

/* Throttle if making no progress at high prioities. */
if (sc->priority == 1 && !sc->nr_reclaimed)
reclaim_throttle(pgdat, VMSCAN_THROTTLE_NOPROGRESS);
}

/*
* This is the direct reclaim path, for page-allocating processes. We only
* try to reclaim pages from zones which will satisfy the caller's allocation
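A small note on the "12% efficiency" check in consider_reclaim_throttle() above: sc->nr_reclaimed > (sc->nr_scanned >> 3) compares reclaimed pages against one eighth of the pages scanned, i.e. 12.5%, which the comment rounds down to 12%. For example, after scanning 1000 pages the NOPROGRESS waiters are woken once more than 125 of those pages were reclaimed.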
@ -3272,6 +3458,7 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
unsigned long nr_soft_scanned;
gfp_t orig_mask;
pg_data_t *last_pgdat = NULL;
pg_data_t *first_pgdat = NULL;

/*
* If the number of buffer_heads in the machine exceeds the maximum
@ -3335,6 +3522,9 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
/* need some check for avoid more shrink_zone() */
}

if (!first_pgdat)
first_pgdat = zone->zone_pgdat;

/* See comment about same check for global reclaim above */
if (zone->zone_pgdat == last_pgdat)
continue;
@ -3342,6 +3532,9 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
shrink_node(zone->zone_pgdat, sc);
}

if (first_pgdat)
consider_reclaim_throttle(first_pgdat, sc);

/*
* Restore to original mask to avoid the impact on the caller if we
* promoted it to __GFP_HIGHMEM.
@ -4286,6 +4479,7 @@ static int kswapd(void *p)

WRITE_ONCE(pgdat->kswapd_order, 0);
WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
atomic_set(&pgdat->nr_writeback_throttled, 0);
for ( ; ; ) {
bool ret;

@ -4665,6 +4859,7 @@ void check_move_unevictable_pages(struct pagevec *pvec)

for (i = 0; i < pvec->nr; i++) {
struct page *page = pvec->pages[i];
struct folio *folio = page_folio(page);
int nr_pages;

if (PageTransTail(page))
@ -4677,7 +4872,7 @@ void check_move_unevictable_pages(struct pagevec *pvec)
if (!TestClearPageLRU(page))
continue;

lruvec = relock_page_lruvec_irq(page, lruvec);
lruvec = folio_lruvec_relock_irq(folio, lruvec);
if (page_evictable(page) && PageUnevictable(page)) {
del_page_from_lru_list(page, lruvec);
ClearPageUnevictable(page);

79
mm/vmstat.c
79
mm/vmstat.c
@ -165,6 +165,34 @@ atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS] __cacheline_aligned_in_smp;
EXPORT_SYMBOL(vm_zone_stat);
EXPORT_SYMBOL(vm_node_stat);

#ifdef CONFIG_NUMA
static void fold_vm_zone_numa_events(struct zone *zone)
{
unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, };
int cpu;
enum numa_stat_item item;

for_each_online_cpu(cpu) {
struct per_cpu_zonestat *pzstats;

pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0);
}

for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
zone_numa_event_add(zone_numa_events[item], zone, item);
}

void fold_vm_numa_events(void)
{
struct zone *zone;

for_each_populated_zone(zone)
fold_vm_zone_numa_events(zone);
}
#endif

#ifdef CONFIG_SMP

int calculate_pressure_threshold(struct zone *zone)
@ -771,34 +799,6 @@ static int fold_diff(int *zone_diff, int *node_diff)
return changes;
}

#ifdef CONFIG_NUMA
static void fold_vm_zone_numa_events(struct zone *zone)
{
unsigned long zone_numa_events[NR_VM_NUMA_EVENT_ITEMS] = { 0, };
int cpu;
enum numa_stat_item item;

for_each_online_cpu(cpu) {
struct per_cpu_zonestat *pzstats;

pzstats = per_cpu_ptr(zone->per_cpu_zonestats, cpu);
for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
zone_numa_events[item] += xchg(&pzstats->vm_numa_event[item], 0);
}

for (item = 0; item < NR_VM_NUMA_EVENT_ITEMS; item++)
zone_numa_event_add(zone_numa_events[item], zone, item);
}

void fold_vm_numa_events(void)
{
struct zone *zone;

for_each_populated_zone(zone)
fold_vm_zone_numa_events(zone);
}
#endif

/*
* Update the zone counters for the current cpu.
*
@ -1070,8 +1070,13 @@ static void fill_contig_page_info(struct zone *zone,
for (order = 0; order < MAX_ORDER; order++) {
unsigned long blocks;

/* Count number of free blocks */
blocks = zone->free_area[order].nr_free;
/*
* Count number of free blocks.
*
* Access to nr_free is lockless as nr_free is used only for
* diagnostic purposes. Use data_race to avoid KCSAN warning.
*/
blocks = data_race(zone->free_area[order].nr_free);
info->free_blocks_total += blocks;

/* Count free base pages */
@ -1225,6 +1230,7 @@ const char * const vmstat_text[] = {
"nr_vmscan_immediate_reclaim",
"nr_dirtied",
"nr_written",
"nr_throttled_written",
"nr_kernel_misc_reclaimable",
"nr_foll_pin_acquired",
"nr_foll_pin_released",
@ -1347,6 +1353,9 @@ const char * const vmstat_text[] = {
"thp_split_page_failed",
"thp_deferred_split_page",
"thp_split_pmd",
"thp_scan_exceed_none_pte",
"thp_scan_exceed_swap_pte",
"thp_scan_exceed_share_pte",
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
"thp_split_pud",
#endif
@ -1445,7 +1454,11 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,

seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
for (order = 0; order < MAX_ORDER; ++order)
seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
/*
* Access to nr_free is lockless as nr_free is used only for
* printing purposes. Use data_race to avoid KCSAN warning.
*/
seq_printf(m, "%6lu ", data_race(zone->free_area[order].nr_free));
seq_putc(m, '\n');
}

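Both nr_free readers above are now wrapped in data_race(), the kernel's annotation that marks a lockless read as an intentional, benign race so KCSAN does not warn about it; it adds no synchronisation. A minimal kernel-context fragment of the pattern ("counter" is a hypothetical shared variable used only for illustration):

/* Sketch: a diagnostic-only lockless read.  data_race() documents that
 * the race is intentional and silences KCSAN for this access only. */
unsigned long approx = data_race(counter);
pr_info("approximately %lu items\n", approx);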
@ -1656,6 +1669,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
}
seq_printf(m,
"\n pages free %lu"
"\n boost %lu"
"\n min %lu"
"\n low %lu"
"\n high %lu"
@ -1664,6 +1678,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
"\n managed %lu"
"\n cma %lu",
zone_page_state(zone, NR_FREE_PAGES),
zone->watermark_boost,
min_wmark_pages(zone),
low_wmark_pages(zone),
high_wmark_pages(zone),
@ -2179,7 +2194,7 @@ static void extfrag_show_print(struct seq_file *m,
for (order = 0; order < MAX_ORDER; ++order) {
fill_contig_page_info(zone, order, &info);
index = __fragmentation_index(order, &info);
seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
seq_printf(m, "%2d.%03d ", index / 1000, index % 1000);
}

seq_putc(m, '\n');

|
||||
}
|
||||
|
||||
/**
|
||||
* workingset_refault - evaluate the refault of a previously evicted page
|
||||
* @page: the freshly allocated replacement page
|
||||
* @shadow: shadow entry of the evicted page
|
||||
* workingset_refault - Evaluate the refault of a previously evicted folio.
|
||||
* @folio: The freshly allocated replacement folio.
|
||||
* @shadow: Shadow entry of the evicted folio.
|
||||
*
|
||||
* Calculates and evaluates the refault distance of the previously
|
||||
* evicted page in the context of the node and the memcg whose memory
|
||||
* evicted folio in the context of the node and the memcg whose memory
|
||||
* pressure caused the eviction.
|
||||
*/
|
||||
void workingset_refault(struct page *page, void *shadow)
|
||||
void workingset_refault(struct folio *folio, void *shadow)
|
||||
{
|
||||
bool file = page_is_file_lru(page);
|
||||
bool file = folio_is_file_lru(folio);
|
||||
struct mem_cgroup *eviction_memcg;
|
||||
struct lruvec *eviction_lruvec;
|
||||
unsigned long refault_distance;
|
||||
@ -295,16 +295,17 @@ void workingset_refault(struct page *page, void *shadow)
|
||||
unsigned long refault;
|
||||
bool workingset;
|
||||
int memcgid;
|
||||
long nr;
|
||||
|
||||
unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
|
||||
|
||||
rcu_read_lock();
|
||||
/*
|
||||
* Look up the memcg associated with the stored ID. It might
|
||||
* have been deleted since the page's eviction.
|
||||
* have been deleted since the folio's eviction.
|
||||
*
|
||||
* Note that in rare events the ID could have been recycled
|
||||
* for a new cgroup that refaults a shared page. This is
|
||||
* for a new cgroup that refaults a shared folio. This is
|
||||
* impossible to tell from the available data. However, this
|
||||
* should be a rare and limited disturbance, and activations
|
||||
* are always speculative anyway. Ultimately, it's the aging
|
||||
@ -340,17 +341,18 @@ void workingset_refault(struct page *page, void *shadow)
|
||||
refault_distance = (refault - eviction) & EVICTION_MASK;
|
||||
|
||||
/*
|
||||
* The activation decision for this page is made at the level
|
||||
* The activation decision for this folio is made at the level
|
||||
* where the eviction occurred, as that is where the LRU order
|
||||
* during page reclaim is being determined.
|
||||
* during folio reclaim is being determined.
|
||||
*
|
||||
* However, the cgroup that will own the page is the one that
|
||||
* However, the cgroup that will own the folio is the one that
|
||||
* is actually experiencing the refault event.
|
||||
*/
|
||||
memcg = page_memcg(page);
|
||||
nr = folio_nr_pages(folio);
|
||||
memcg = folio_memcg(folio);
|
||||
lruvec = mem_cgroup_lruvec(memcg, pgdat);
|
||||
|
||||
inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file);
|
||||
mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr);
|
||||
|
||||
mem_cgroup_flush_stats();
|
||||
/*
|
||||
@ -376,16 +378,16 @@ void workingset_refault(struct page *page, void *shadow)
|
||||
if (refault_distance > workingset_size)
|
||||
goto out;
|
||||
|
||||
SetPageActive(page);
|
||||
workingset_age_nonresident(lruvec, thp_nr_pages(page));
|
||||
inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file);
|
||||
folio_set_active(folio);
|
||||
workingset_age_nonresident(lruvec, nr);
|
||||
mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr);
|
||||
|
||||
/* Page was active prior to eviction */
|
||||
/* Folio was active prior to eviction */
|
||||
if (workingset) {
|
||||
SetPageWorkingset(page);
|
||||
folio_set_workingset(folio);
|
||||
/* XXX: Move to lru_cache_add() when it supports new vs putback */
|
||||
lru_note_cost_page(page);
|
||||
inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file);
|
||||
lru_note_cost_folio(folio);
|
||||
mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr);
|
||||
}
|
||||
out:
|
||||
rcu_read_unlock();
|
||||
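One consequence of the folio conversion above worth spelling out: the workingset statistics now scale with the folio size. With nr = folio_nr_pages(folio), the former inc_lruvec_state() calls become mod_lruvec_state(..., nr), so refaulting a 16-page folio, for example, adds 16 to WORKINGSET_REFAULT/ACTIVATE/RESTORE rather than 1, keeping the counters in units of base pages.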
@ -393,12 +395,11 @@ void workingset_refault(struct page *page, void *shadow)
|
||||
|
||||
/**
|
||||
* workingset_activation - note a page activation
|
||||
* @page: page that is being activated
|
||||
* @folio: Folio that is being activated.
|
||||
*/
|
||||
void workingset_activation(struct page *page)
|
||||
void workingset_activation(struct folio *folio)
|
||||
{
|
||||
struct mem_cgroup *memcg;
|
||||
struct lruvec *lruvec;
|
||||
|
||||
rcu_read_lock();
|
||||
/*
|
||||
@ -408,11 +409,10 @@ void workingset_activation(struct page *page)
|
||||
* XXX: See workingset_refault() - this should return
|
||||
* root_mem_cgroup even for !CONFIG_MEMCG.
|
||||
*/
|
||||
memcg = page_memcg_rcu(page);
|
||||
memcg = folio_memcg_rcu(folio);
|
||||
if (!mem_cgroup_disabled() && !memcg)
|
||||
goto out;
|
||||
lruvec = mem_cgroup_page_lruvec(page);
|
||||
workingset_age_nonresident(lruvec, thp_nr_pages(page));
|
||||
workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio));
|
||||
out:
|
||||
rcu_read_unlock();
|
||||
}
|
||||
@ -543,6 +543,13 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (!spin_trylock(&mapping->host->i_lock)) {
|
||||
xa_unlock(&mapping->i_pages);
|
||||
spin_unlock_irq(lru_lock);
|
||||
ret = LRU_RETRY;
|
||||
goto out;
|
||||
}
|
||||
|
||||
list_lru_isolate(lru, item);
|
||||
__dec_lruvec_kmem_state(node, WORKINGSET_NODES);
|
||||
|
||||
@ -562,6 +569,9 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
|
||||
|
||||
out_invalid:
|
||||
xa_unlock_irq(&mapping->i_pages);
|
||||
if (mapping_shrinkable(mapping))
|
||||
inode_add_lru(mapping->host);
|
||||
spin_unlock(&mapping->host->i_lock);
|
||||
ret = LRU_REMOVED_RETRY;
|
||||
out:
|
||||
cond_resched();
|
||||
|
12
mm/zpool.c
12
mm/zpool.c
@ -24,16 +24,11 @@ struct zpool {
const struct zpool_ops *ops;
bool evictable;
bool can_sleep_mapped;

struct list_head list;
};

static LIST_HEAD(drivers_head);
static DEFINE_SPINLOCK(drivers_lock);

static LIST_HEAD(pools_head);
static DEFINE_SPINLOCK(pools_lock);

/**
* zpool_register_driver() - register a zpool implementation.
* @driver: driver to register
@ -195,10 +190,6 @@ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp,

pr_debug("created pool type %s\n", type);

spin_lock(&pools_lock);
list_add(&zpool->list, &pools_head);
spin_unlock(&pools_lock);

return zpool;
}

@ -217,9 +208,6 @@ void zpool_destroy_pool(struct zpool *zpool)
{
pr_debug("destroying pool type %s\n", zpool->driver->type);

spin_lock(&pools_lock);
list_del(&zpool->list);
spin_unlock(&pools_lock);
zpool->driver->destroy(zpool->pool);
zpool_put_driver(zpool->driver);
kfree(zpool);

547
mm/zsmalloc.c
547
mm/zsmalloc.c
File diff suppressed because it is too large
mm/zswap.c

@ -1394,7 +1394,7 @@ static void zswap_frontswap_init(unsigned type)
zswap_trees[type] = tree;
}

static struct frontswap_ops zswap_frontswap_ops = {
static const struct frontswap_ops zswap_frontswap_ops = {
.store = zswap_frontswap_store,
.load = zswap_frontswap_load,
.invalidate_page = zswap_frontswap_invalidate_page,
@ -1479,7 +1479,9 @@ static int __init init_zswap(void)
if (!shrink_wq)
goto hp_fail;

frontswap_register_ops(&zswap_frontswap_ops);
ret = frontswap_register_ops(&zswap_frontswap_ops);
if (ret)
goto destroy_wq;
if (zswap_debugfs_init())
pr_warn("debugfs initialization failed\n");

@ -1488,6 +1490,8 @@ static int __init init_zswap(void)

return 0;

destroy_wq:
destroy_workqueue(shrink_wq);
hp_fail:
cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE);
dstmem_fail:

net/802/hippi.c

@ -65,7 +65,7 @@ static int hippi_header(struct sk_buff *skb, struct net_device *dev,
hip->le.src_addr_type = 2; /* 12 bit SC address */

memcpy(hip->le.src_switch_addr, dev->dev_addr + 3, 3);
memset(&hip->le.reserved, 0, 16);
memset_startat(&hip->le, 0, reserved);

hip->snap.dsap = HIPPI_EXTENDED_SAP;
hip->snap.ssap = HIPPI_EXTENDED_SAP;
@ -121,7 +121,7 @@ int hippi_mac_addr(struct net_device *dev, void *p)
struct sockaddr *addr = p;
if (netif_running(dev))
return -EBUSY;
memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
dev_addr_set(dev, addr->sa_data);
return 0;
}
EXPORT_SYMBOL(hippi_mac_addr);

net/802/p8022.c

@ -23,7 +23,7 @@
#include <net/p8022.h>

static int p8022_request(struct datalink_proto *dl, struct sk_buff *skb,
unsigned char *dest)
const unsigned char *dest)
{
llc_build_and_send_ui_pkt(dl->sap, skb, dest, dl->sap->laddr.lsap);
return 0;

net/802/psnap.c

@ -79,7 +79,7 @@ static int snap_rcv(struct sk_buff *skb, struct net_device *dev,
* Put a SNAP header on a frame and pass to 802.2
*/
static int snap_request(struct datalink_proto *dl,
struct sk_buff *skb, u8 *dest)
struct sk_buff *skb, const u8 *dest)
{
memcpy(skb_push(skb, 5), dl->type, 5);
llc_build_and_send_ui_pkt(snap_sap, skb, dest, snap_sap->laddr.lsap);

net/8021q/vlan.c

@ -319,8 +319,8 @@ static void vlan_transfer_features(struct net_device *dev,
{
struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev);

vlandev->gso_max_size = dev->gso_max_size;
vlandev->gso_max_segs = dev->gso_max_segs;
netif_set_gso_max_size(vlandev, dev->gso_max_size);
netif_set_gso_max_segs(vlandev, dev->gso_max_segs);

if (vlan_hw_offload_capable(dev->features, vlan->vlan_proto))
vlandev->hard_header_len = dev->hard_header_len;

Some files were not shown because too many files have changed in this diff.