From 966e274c871584d88c850b244ab21d7b64a6a713 Mon Sep 17 00:00:00 2001
From: Kaihao Bai
Date: Sat, 7 Oct 2023 13:54:27 +0800
Subject: [PATCH] anolis: mm: support allocating page table pages bound with
 local numa node

ANBZ: #6618

Currently, page table pages cannot be migrated through NUMA balancing.
If most of a task's page table pages are located on a remote NUMA node,
performance can degrade in some scenarios. Thus this patch provides a way
to allocate page table pages on the local NUMA node, using a reserved
memory range.

To switch on the pgtable bind feature globally:
echo 2 > /sys/kernel/mm/pgtable_bind/enabled

To collect misplaced page table page statistics only:
echo 1 > /sys/kernel/mm/pgtable_bind/enabled

To enable the pgtable bind feature at the cgroup level:
echo 1 > /sys/fs/cgroup/memory/<cgroup>/memory.pgtable_bind

To read the number of misplaced page table pages:
cat /sys/fs/cgroup/memory/<cgroup>/memory.pgtable_misplaced

To reset the value of pgtable_misplaced:
echo 0 > /sys/fs/cgroup/memory/<cgroup>/memory.pgtable_misplaced
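For example, one possible end-to-end workflow (the cgroup name "test"
below is illustrative, not part of the interface):

# collect statistics only, without binding
echo 1 > /sys/kernel/mm/pgtable_bind/enabled
echo 1 > /sys/fs/cgroup/memory/test/memory.pgtable_bind
# ... run the workload inside the "test" cgroup ...
cat /sys/fs/cgroup/memory/test/memory.pgtable_misplaced
# reset the counter, then enable binding and compare
echo 0 > /sys/fs/cgroup/memory/test/memory.pgtable_misplaced
echo 2 > /sys/kernel/mm/pgtable_bind/enabled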
Signed-off-by: Kaihao Bai
Reviewed-by: Xu Yu
Reviewed-by: Baolin Wang
Link: https://gitee.com/anolis/cloud-kernel/pulls/4625
---
 .../L1-RECOMMEND/default/CONFIG_PGTABLE_BIND  |  1 +
 arch/arm64/mm/pgd.c                           | 21 ++++++
 include/asm-generic/pgalloc.h                 | 40 +++++++++++
 include/linux/gfp_types.h                     | 12 ++++
 include/linux/memcontrol.h                    |  5 ++
 include/linux/pgtable_bind.h                  | 33 +++++++++
 include/trace/events/mmflags.h                |  1 +
 mm/Kconfig                                    | 12 ++++
 mm/Makefile                                   |  1 +
 mm/memcontrol-v1.c                            | 12 ++++
 mm/memcontrol-v1.h                            | 11 +++
 mm/memcontrol.c                               | 57 +++++++++++++++
 mm/page_alloc.c                               | 28 ++++++++
 mm/pgtable_bind.c                             | 70 +++++++++++++++++++
 tools/perf/builtin-kmem.c                     |  1 +
 15 files changed, 305 insertions(+)
 create mode 100644 anolis/configs/L1-RECOMMEND/default/CONFIG_PGTABLE_BIND
 create mode 100644 include/linux/pgtable_bind.h
 create mode 100644 mm/pgtable_bind.c

diff --git a/anolis/configs/L1-RECOMMEND/default/CONFIG_PGTABLE_BIND b/anolis/configs/L1-RECOMMEND/default/CONFIG_PGTABLE_BIND
new file mode 100644
index 000000000000..8e7a91a7d53d
--- /dev/null
+++ b/anolis/configs/L1-RECOMMEND/default/CONFIG_PGTABLE_BIND
@@ -0,0 +1 @@
+CONFIG_PGTABLE_BIND=y
diff --git a/arch/arm64/mm/pgd.c b/arch/arm64/mm/pgd.c
index bf5110b91e2f..b1aed149cac2 100644
--- a/arch/arm64/mm/pgd.c
+++ b/arch/arm64/mm/pgd.c
@@ -10,6 +10,8 @@
 #include
 #include
 #include
+#include <linux/memcontrol.h>
+#include <linux/pgtable_bind.h>
 #include
 #include
@@ -31,7 +33,26 @@ static bool pgdir_is_page_size(void)
 pgd_t *pgd_alloc(struct mm_struct *mm)
 {
 	gfp_t gfp = GFP_PGTABLE_USER;
+#ifdef CONFIG_PGTABLE_BIND
+	struct mem_cgroup *memcg;
+	bool pgtable_alloc = false;
+
+	if (pgtable_stat_enabled()) {
+		memcg = get_mem_cgroup_from_mm(mm);
+		if (memcg) {
+			pgtable_alloc = memcg->allow_pgtable_bind;
+			css_put(&memcg->css);
+		}
+
+		/* Only target user processes */
+		if (pgtable_alloc) {
+			gfp |= __GFP_PGTABLE;
+
+			if (pgtable_bind_enabled())
+				gfp |= __GFP_HIGH | __GFP_THISNODE;
+		}
+	}
+#endif
 
 	if (pgdir_is_page_size())
 		return __pgd_alloc(mm, 0);
 	else
diff --git a/include/asm-generic/pgalloc.h b/include/asm-generic/pgalloc.h
index dedefcca1774..39743e92f89a 100644
--- a/include/asm-generic/pgalloc.h
+++ b/include/asm-generic/pgalloc.h
@@ -2,11 +2,45 @@
 #ifndef __ASM_GENERIC_PGALLOC_H
 #define __ASM_GENERIC_PGALLOC_H
 
+#include <linux/memcontrol.h>
+#include <linux/pgtable_bind.h>
+
 #ifdef CONFIG_MMU
 
 #define GFP_PGTABLE_KERNEL	(GFP_KERNEL | __GFP_ZERO | __GFP_NOKFENCE)
 #define GFP_PGTABLE_USER	(GFP_PGTABLE_KERNEL | __GFP_ACCOUNT)
 
+#ifdef CONFIG_PGTABLE_BIND
+static inline gfp_t gfp_pgtable_alloc(struct mm_struct *mm, gfp_t gfp)
+{
+	struct mem_cgroup *memcg;
+	bool pgtable_alloc = false;
+
+	if (pgtable_stat_enabled()) {
+		memcg = get_mem_cgroup_from_mm(mm);
+		if (memcg) {
+			pgtable_alloc = memcg->allow_pgtable_bind;
+			css_put(&memcg->css);
+		}
+
+		/* Only target user processes */
+		if (pgtable_alloc) {
+			gfp |= __GFP_PGTABLE;
+
+			if (pgtable_bind_enabled())
+				gfp |= __GFP_HIGH | __GFP_THISNODE;
+		}
+	}
+
+	return gfp;
+}
+#else
+static inline gfp_t gfp_pgtable_alloc(struct mm_struct *mm, gfp_t gfp)
+{
+	return gfp;
+}
+#endif
+
 /**
  * __pte_alloc_one_kernel - allocate memory for a PTE-level kernel page table
  * @mm: the mm_struct of the current context
@@ -73,6 +107,8 @@ static inline pgtable_t __pte_alloc_one_noprof(struct mm_struct *mm, gfp_t gfp)
 {
 	struct ptdesc *ptdesc;
 
+	gfp = gfp_pgtable_alloc(mm, gfp);
+
 	ptdesc = pagetable_alloc_noprof(gfp, 0);
 	if (!ptdesc)
 		return NULL;
@@ -138,6 +174,8 @@ static inline pmd_t *pmd_alloc_one_noprof(struct mm_struct *mm, unsigned long ad
 	struct ptdesc *ptdesc;
 	gfp_t gfp = GFP_PGTABLE_USER;
 
+	gfp = gfp_pgtable_alloc(mm, gfp);
+
 	if (mm == &init_mm)
 		gfp = GFP_PGTABLE_KERNEL;
 	ptdesc = pagetable_alloc_noprof(gfp, 0);
@@ -175,6 +213,8 @@ static inline pud_t *__pud_alloc_one_noprof(struct mm_struct *mm, unsigned long
 	gfp_t gfp = GFP_PGTABLE_USER;
 	struct ptdesc *ptdesc;
 
+	gfp = gfp_pgtable_alloc(mm, gfp);
+
 	if (mm == &init_mm)
 		gfp = GFP_PGTABLE_KERNEL;
 
diff --git a/include/linux/gfp_types.h b/include/linux/gfp_types.h
index 1ac8604bf750..af485b2dc15b 100644
--- a/include/linux/gfp_types.h
+++ b/include/linux/gfp_types.h
@@ -58,6 +58,9 @@ enum {
 	___GFP_NO_OBJ_EXT_BIT,
 #ifdef CONFIG_KFENCE
 	___GFP_NOKFENCE_BIT,
+#endif
+#ifdef CONFIG_PGTABLE_BIND
+	___GFP_PGTABLE_BIT,
 #endif
 	___GFP_LAST_BIT
 };
@@ -105,6 +108,12 @@ enum {
 #else
 #define ___GFP_NOKFENCE	0
 #endif
+#ifdef CONFIG_PGTABLE_BIND
+#define ___GFP_PGTABLE	BIT(___GFP_PGTABLE_BIT)
+#else
+#define ___GFP_PGTABLE	0
+#endif
+/* If the above are modified, __GFP_BITS_SHIFT may need updating */
 
 /*
  * Physical address zone modifiers (see linux/mmzone.h - low four bits)
@@ -151,6 +160,8 @@ enum {
  * with this flag to indicate that their NULL tags are expected and normal.
  *
  * %__GFP_NOKFENCE informs DO NOT try to alloc page from kfence pool.
+ * %__GFP_PGTABLE indicates an allocation of page table pages.
+ *
  */
 #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE)
 #define __GFP_WRITE	((__force gfp_t)___GFP_WRITE)
@@ -159,6 +170,7 @@
 #define __GFP_ACCOUNT	((__force gfp_t)___GFP_ACCOUNT)
 #define __GFP_NO_OBJ_EXT	((__force gfp_t)___GFP_NO_OBJ_EXT)
 #define __GFP_NOKFENCE	((__force gfp_t)___GFP_NOKFENCE)
+#define __GFP_PGTABLE	((__force gfp_t)___GFP_PGTABLE)
 
 /**
  * DOC: Watermark modifiers
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 61f19fcd375b..5e4e5c9dc17d 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -334,6 +334,11 @@ struct mem_cgroup {
 	struct lru_gen_mm_list mm_list;
 #endif
 
+#ifdef CONFIG_PGTABLE_BIND
+	unsigned long pgtable_misplaced;
+	bool allow_pgtable_bind;
+#endif
+
 #ifdef CONFIG_MEMCG_V1
 	/* Legacy consumer-oriented counters */
 	struct page_counter kmem;		/* v1 only */
diff --git a/include/linux/pgtable_bind.h b/include/linux/pgtable_bind.h
new file mode 100644
index 000000000000..e88a75554949
--- /dev/null
+++ b/include/linux/pgtable_bind.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_PGTABLE_BIND_H_
+#define _LINUX_PGTABLE_BIND_H_
+
+#include
+#include
+#include
+
+#ifdef CONFIG_PGTABLE_BIND
+DECLARE_STATIC_KEY_FALSE(pgtable_bind_enabled_key);
+DECLARE_STATIC_KEY_FALSE(pgtable_stat_enabled_key);
+
+static inline bool pgtable_bind_enabled(void)
+{
+	return static_key_enabled(&pgtable_bind_enabled_key);
+}
+
+static inline bool pgtable_stat_enabled(void)
+{
+	return static_key_enabled(&pgtable_stat_enabled_key);
+}
+#else
+static inline bool pgtable_bind_enabled(void)
+{
+	return false;
+}
+
+static inline bool pgtable_stat_enabled(void)
+{
+	return false;
+}
+#endif
+
+#endif /* _LINUX_PGTABLE_BIND_H_ */
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 4583bdcb017b..dba1e2de2d34 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -109,6 +109,7 @@ TRACE_DEFINE_ENUM(___GFP_LAST_BIT);
 	gfpflag_string(GFP_DMA),		\
 	gfpflag_string(GFP_DMA32),		\
 	gfpflag_string(__GFP_RECLAIM),		\
+	gfpflag_string(__GFP_PGTABLE),		\
 	TRACE_GFP_FLAGS \
 	{ 0, NULL }
diff --git a/mm/Kconfig b/mm/Kconfig
index 56b90bff84a2..1cad535748b3 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1489,6 +1489,18 @@ config PRE_OOM
 	  This feature is used to ensure that higher priority tasks would
 	  not enter the direct reclaim path when applying for memory allocation.
 
+config PGTABLE_BIND
+	bool "Enable NUMA-local page table allocation and misplaced-page statistics"
+	depends on MEMCG
+	default n
+	help
+	  Page table pages cannot be migrated through NUMA balancing, so if
+	  they are located on a remote NUMA node, performance can degrade in
+	  some scenarios. This option provides a way to allocate page table
+	  pages on the local NUMA node, using a reserved memory range.
+
+	  If unsure, say N.
+
 source "mm/damon/Kconfig"
 
 endmenu
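Because __GFP_PGTABLE is added to the gfp-flag name table used by the mm
tracepoints (see the mmflags.h hunk above), page table allocations can be
observed at runtime. One possible spot check, assuming tracefs is mounted
at the usual /sys/kernel/tracing:

echo 1 > /sys/kernel/tracing/events/kmem/mm_page_alloc/enable
grep __GFP_PGTABLE /sys/kernel/tracing/trace | head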
diff --git a/mm/Makefile b/mm/Makefile
index 581a396bb7b3..778c357cdb0c 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -151,3 +151,4 @@ obj-$(CONFIG_EXECMEM) += execmem.o
 obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o
 obj-$(CONFIG_LAZY_MMU_MODE_KUNIT_TEST) += tests/lazy_mmu_mode_kunit.o
 obj-$(CONFIG_PRE_OOM) += pre_oom.o
+obj-$(CONFIG_PGTABLE_BIND) += pgtable_bind.o
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index 79b87c1450f9..deaa0fd879a8 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -2372,6 +2372,18 @@ struct cftype mem_cgroup_legacy_files[] = {
 		.write = memcg_lru_gen_write,
 	},
 #endif
+#ifdef CONFIG_PGTABLE_BIND
+	{
+		.name = "pgtable_bind",
+		.write_u64 = memcg_pgtable_bind_write,
+		.read_u64 = memcg_pgtable_bind_read,
+	},
+	{
+		.name = "pgtable_misplaced",
+		.write_u64 = memcg_pgtable_misplaced_write,
+		.read_u64 = memcg_pgtable_misplaced_read,
+	},
+#endif
 #ifdef CONFIG_PRE_OOM
 	{
 		.name = "pre_oom",
diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
index b0f363cc0d35..6ad5e557a594 100644
--- a/mm/memcontrol-v1.h
+++ b/mm/memcontrol-v1.h
@@ -141,4 +141,15 @@ static inline void memcg1_uncharge_skmem(struct mem_cgroup *memcg, unsigned int
 
 #endif /* CONFIG_MEMCG_V1 */
 
+#ifdef CONFIG_PGTABLE_BIND
+u64 memcg_pgtable_bind_read(struct cgroup_subsys_state *css,
+			    struct cftype *cft);
+int memcg_pgtable_bind_write(struct cgroup_subsys_state *css,
+			     struct cftype *cft, u64 val);
+u64 memcg_pgtable_misplaced_read(struct cgroup_subsys_state *css,
+				 struct cftype *cft);
+int memcg_pgtable_misplaced_write(struct cgroup_subsys_state *css,
+				  struct cftype *cft, u64 val);
+#endif
+
 #endif /* __MM_MEMCONTROL_V1_H */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 930257220428..0e9152c3e2cf 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4122,6 +4122,51 @@ static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
 
 #endif	/* CONFIG_CGROUP_WRITEBACK */
 
+#ifdef CONFIG_PGTABLE_BIND
+u64 memcg_pgtable_bind_read(struct cgroup_subsys_state *css,
+			    struct cftype *cft)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+	return READ_ONCE(memcg->allow_pgtable_bind);
+}
+
+int memcg_pgtable_bind_write(struct cgroup_subsys_state *css,
+			     struct cftype *cft, u64 val)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+	if (val)
+		memcg->allow_pgtable_bind = true;
+	else
+		memcg->allow_pgtable_bind = false;
+
+	return 0;
+}
+
+u64 memcg_pgtable_misplaced_read(struct cgroup_subsys_state *css,
+				 struct cftype *cft)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+	return READ_ONCE(memcg->pgtable_misplaced);
+}
+
+int memcg_pgtable_misplaced_write(struct cgroup_subsys_state *css,
+				  struct cftype *cft, u64 val)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+
+	if (val)
+		return -EINVAL;
+
+	/* reset the stat of current memcg */
+	memcg->pgtable_misplaced = 0;
+
+	return 0;
+}
+#endif /* CONFIG_PGTABLE_BIND */
+
 /*
  * Private memory cgroup IDR
  *
@@ -5566,6 +5611,18 @@ static struct cftype memory_files[] = {
 		.write = memcg_lru_gen_write,
 	},
 #endif
+#ifdef CONFIG_PGTABLE_BIND
+	{
+		.name = "pgtable_bind",
+		.write_u64 = memcg_pgtable_bind_write,
+		.read_u64 = memcg_pgtable_bind_read,
+	},
+	{
+		.name = "pgtable_misplaced",
+		.write_u64 = memcg_pgtable_misplaced_write,
+		.read_u64 = memcg_pgtable_misplaced_read,
+	},
+#endif
 #ifdef CONFIG_PRE_OOM
 	{
 		.name = "pre_oom",
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ce5b14c70936..6f19a9da8762 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3971,6 +3971,24 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
 		if (page) {
 			prep_new_page(page, order, gfp_mask, alloc_flags);
 
+#ifdef CONFIG_PGTABLE_BIND
+			/*
+			 * If the allocated page lives on a remote NUMA node,
+			 * bump memcg->pgtable_misplaced to record how many
+			 * page table pages ended up off-node.
+			 */
+			if ((gfp_mask & __GFP_PGTABLE) &&
+			    (zone_to_nid(ac->preferred_zoneref->zone) != zone_to_nid(zone))) {
+				struct mem_cgroup *memcg;
+
+				memcg = get_mem_cgroup_from_mm(current->mm);
+				if (memcg) {
+					memcg->pgtable_misplaced++;
+					css_put(&memcg->css);
+				}
+			}
+#endif
+
 			return page;
 		} else {
 			if (cond_accept_memory(zone, order, alloc_flags))
@@ -5306,6 +5324,16 @@ struct page *__alloc_frozen_pages_noprof(gfp_t gfp, unsigned int order,
 	 */
 	ac.nodemask = nodemask;
 
+	/*
+	 * Drop the __GFP_THISNODE restriction before entering the slowpath
+	 * for page table allocations, so they can fall back to remote nodes
+	 * instead of failing when the local node is exhausted.
+	 */
+	if (gfp & __GFP_PGTABLE) {
+		gfp &= ~__GFP_THISNODE;
+		alloc_gfp &= ~__GFP_THISNODE;
+		ac.zonelist = node_zonelist(preferred_nid, gfp);
+	}
+
 	page = __alloc_pages_slowpath(alloc_gfp, order, &ac);
 
 out:
diff --git a/mm/pgtable_bind.c b/mm/pgtable_bind.c
new file mode 100644
index 000000000000..1f6caa54f1da
--- /dev/null
+++ b/mm/pgtable_bind.c
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: GPL-2.0
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifdef CONFIG_PGTABLE_BIND
+#ifdef CONFIG_SYSFS
+DEFINE_STATIC_KEY_FALSE(pgtable_bind_enabled_key);
+DEFINE_STATIC_KEY_FALSE(pgtable_stat_enabled_key);
+
+static ssize_t pgtable_bind_enabled_show(struct kobject *kobj,
+					 struct kobj_attribute *attr, char *buf)
+{
+	/* 2: bind + stat, 1: stat only, 0: disabled */
+	return sprintf(buf, "%d\n", !!static_key_enabled(&pgtable_bind_enabled_key) +
+		       !!static_key_enabled(&pgtable_stat_enabled_key));
+}
+
+static ssize_t pgtable_bind_enabled_store(struct kobject *kobj,
+					  struct kobj_attribute *attr,
+					  const char *buf, size_t count)
+{
+	static DEFINE_MUTEX(mutex);
+	ssize_t ret = count;
+
+	mutex_lock(&mutex);
+
+	if (!strncmp(buf, "2", 1)) {
+		static_branch_enable(&pgtable_bind_enabled_key);
+		static_branch_enable(&pgtable_stat_enabled_key);
+	} else if (!strncmp(buf, "1", 1)) {
+		static_branch_disable(&pgtable_bind_enabled_key);
+		static_branch_enable(&pgtable_stat_enabled_key);
+	} else if (!strncmp(buf, "0", 1)) {
+		static_branch_disable(&pgtable_bind_enabled_key);
+		static_branch_disable(&pgtable_stat_enabled_key);
+	}
+
+	mutex_unlock(&mutex);
+	return ret;
+}
+
+static struct kobj_attribute pgtable_bind_enabled_attr =
+	__ATTR(enabled, 0644, pgtable_bind_enabled_show,
+	       pgtable_bind_enabled_store);
+
+static struct attribute *pgtable_bind_attrs[] = {
+	&pgtable_bind_enabled_attr.attr,
+	NULL,
+};
+
+static const struct attribute_group pgtable_bind_attr_group = {
+	.attrs = pgtable_bind_attrs,
+	.name = "pgtable_bind",
+};
+
+static int __init pgtable_bind_init(void)
+{
+	int ret;
+
+	ret = sysfs_create_group(mm_kobj, &pgtable_bind_attr_group);
+	if (ret)
+		pr_err("pgtable_bind: register sysfs failed\n");
+
+	return ret;
+}
+subsys_initcall(pgtable_bind_init);
+#endif /* CONFIG_SYSFS */
+#endif /* CONFIG_PGTABLE_BIND */
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index 9c64a0d74823..35d012500751 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -682,6 +682,7 @@ static const struct {
 	{ "__GFP_RECLAIM",		"R" },
 	{ "__GFP_DIRECT_RECLAIM",	"DR" },
 	{ "__GFP_KSWAPD_RECLAIM",	"KR" },
+	{ "__GFP_PGTABLE",		"PT" },
 };
 
 static size_t max_gfp_len;
-- 
Gitee
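With the builtin-kmem hunk above, __GFP_PGTABLE is rendered as "PT" in
perf's compact gfp strings, so page-table allocations can be picked out of
a page allocation profile. A possible (untested) invocation, where
<workload> is whatever command exercises the feature:

perf kmem record --page -- <workload>
perf kmem stat --page --caller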