diff options
45 files changed, 1938 insertions, 172 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 20e248cc03a..ea8e5b48557 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2032,6 +2032,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted. nr_uarts= [SERIAL] maximum number of UARTs to be registered. + numa_balancing= [KNL,X86] Enable or disable automatic NUMA balancing. + Allowed values are enable and disable + numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA. one of ['zone', 'node', 'default'] can be specified This can be set from sysctl after boot. diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig index cb8f9920f4d..0f7c852f355 100644 --- a/arch/sh/mm/Kconfig +++ b/arch/sh/mm/Kconfig @@ -111,6 +111,7 @@ config VSYSCALL config NUMA bool "Non Uniform Memory Access (NUMA) Support" depends on MMU && SYS_SUPPORTS_NUMA && EXPERIMENTAL + select ARCH_WANT_NUMA_VARIABLE_LOCALITY default n help Some SH systems have many various memories scattered around diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 65a872bf72f..97f8c5ad8c2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -22,6 +22,8 @@ config X86 def_bool y select HAVE_AOUT if X86_32 select HAVE_UNSTABLE_SCHED_CLOCK + select ARCH_SUPPORTS_NUMA_BALANCING + select ARCH_WANTS_PROT_NUMA_PROT_NONE select HAVE_IDE select HAVE_OPROFILE select HAVE_PCSPKR_PLATFORM diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index a1f780d45f7..5199db2923d 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -404,7 +404,14 @@ static inline int pte_same(pte_t a, pte_t b) static inline int pte_present(pte_t a) { - return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); + return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE | + _PAGE_NUMA); +} + +#define pte_accessible pte_accessible +static inline int pte_accessible(pte_t a) +{ + return pte_flags(a) & _PAGE_PRESENT; } static inline int 
pte_hidden(pte_t pte) @@ -420,7 +427,8 @@ static inline int pmd_present(pmd_t pmd) * the _PAGE_PSE flag will remain set at all times while the * _PAGE_PRESENT bit is clear). */ - return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE); + return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE | + _PAGE_NUMA); } static inline int pmd_none(pmd_t pmd) @@ -479,6 +487,11 @@ static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) static inline int pmd_bad(pmd_t pmd) { +#ifdef CONFIG_NUMA_BALANCING + /* pmd_numa check */ + if ((pmd_flags(pmd) & (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA) + return 0; +#endif return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE; } diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index ec8a1fc9505..3c32db8c539 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -64,6 +64,26 @@ #define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) +/* + * _PAGE_NUMA indicates that this page will trigger a numa hinting + * minor page fault to gather numa placement statistics (see + * pte_numa()). The bit picked (8) is within the range between + * _PAGE_FILE (6) and _PAGE_PROTNONE (8) bits. Therefore, it doesn't + * require changes to the swp entry format because that bit is always + * zero when the pte is not present. + * + * The bit picked must be always zero when the pmd is present and not + * present, so that we don't lose information when we set it while + * atomically clearing the present bit. + * + * Because we shared the same bit (8) with _PAGE_PROTNONE this can be + * interpreted as _PAGE_NUMA only in places that _PAGE_PROTNONE + * couldn't reach, like handle_mm_fault() (see access_error in + * arch/x86/mm/fault.c, the vma protection must not be PROT_NONE for + * handle_mm_fault() to be invoked). 
+ */ +#define _PAGE_NUMA _PAGE_PROTNONE + #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ _PAGE_ACCESSED | _PAGE_DIRTY) #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 217eb705fac..e27fbf887f3 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -301,6 +301,13 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd) free_page((unsigned long)pgd); } +/* + * Used to set accessed or dirty bits in the page table entries + * on other architectures. On x86, the accessed and dirty bits + * are tracked by hardware. However, do_wp_page calls this function + * to also make the pte writeable at the same time the dirty bit is + * set. In that case we do actually need to write the PTE. + */ int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t entry, int dirty) @@ -310,7 +317,6 @@ int ptep_set_access_flags(struct vm_area_struct *vma, if (changed && dirty) { *ptep = entry; pte_update_defer(vma->vm_mm, address, ptep); - flush_tlb_page(vma, address); } return changed; diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 284e80831d2..701beab27aa 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -219,6 +219,10 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) #define move_pte(pte, prot, old_addr, new_addr) (pte) #endif +#ifndef pte_accessible +# define pte_accessible(pte) ((void)(pte),1) +#endif + #ifndef flush_tlb_fix_spurious_fault #define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address) #endif @@ -580,6 +584,112 @@ static inline int pmd_trans_unstable(pmd_t *pmd) #endif } +#ifdef CONFIG_NUMA_BALANCING +#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE +/* + * _PAGE_NUMA works identically to _PAGE_PROTNONE (it's actually the + * same bit too). It's set only when _PAGE_PRESENT is not set and it's + * never set if _PAGE_PRESENT is set. 
+ * + * pte/pmd_present() returns true if pte/pmd_numa returns true. Page + * fault triggers on those regions if pte/pmd_numa returns true + * (because _PAGE_PRESENT is not set). + */ +#ifndef pte_numa +static inline int pte_numa(pte_t pte) +{ + return (pte_flags(pte) & + (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA; +} +#endif + +#ifndef pmd_numa +static inline int pmd_numa(pmd_t pmd) +{ + return (pmd_flags(pmd) & + (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA; +} +#endif + +/* + * pte/pmd_mknonnuma sets the _PAGE_ACCESSED bitflag automatically + * because they're called by the NUMA hinting minor page fault. If we + * didn't set the _PAGE_ACCESSED bitflag here, the TLB miss handler + * would be forced to set it later while filling the TLB after we + * return to userland. That would trigger a second write to memory + * that we optimize away by setting _PAGE_ACCESSED here. + */ +#ifndef pte_mknonnuma +static inline pte_t pte_mknonnuma(pte_t pte) +{ + pte = pte_clear_flags(pte, _PAGE_NUMA); + return pte_set_flags(pte, _PAGE_PRESENT|_PAGE_ACCESSED); +} +#endif + +#ifndef pmd_mknonnuma +static inline pmd_t pmd_mknonnuma(pmd_t pmd) +{ + pmd = pmd_clear_flags(pmd, _PAGE_NUMA); + return pmd_set_flags(pmd, _PAGE_PRESENT|_PAGE_ACCESSED); +} +#endif + +#ifndef pte_mknuma +static inline pte_t pte_mknuma(pte_t pte) +{ + pte = pte_set_flags(pte, _PAGE_NUMA); + return pte_clear_flags(pte, _PAGE_PRESENT); +} +#endif + +#ifndef pmd_mknuma +static inline pmd_t pmd_mknuma(pmd_t pmd) +{ + pmd = pmd_set_flags(pmd, _PAGE_NUMA); + return pmd_clear_flags(pmd, _PAGE_PRESENT); +} +#endif +#else +extern int pte_numa(pte_t pte); +extern int pmd_numa(pmd_t pmd); +extern pte_t pte_mknonnuma(pte_t pte); +extern pmd_t pmd_mknonnuma(pmd_t pmd); +extern pte_t pte_mknuma(pte_t pte); +extern pmd_t pmd_mknuma(pmd_t pmd); +#endif /* CONFIG_ARCH_USES_NUMA_PROT_NONE */ +#else +static inline int pmd_numa(pmd_t pmd) +{ + return 0; +} + +static inline int pte_numa(pte_t pte) +{ + return 0; +} + +static inline 
pte_t pte_mknonnuma(pte_t pte) +{ + return pte; +} + +static inline pmd_t pmd_mknonnuma(pmd_t pmd) +{ + return pmd; +} + +static inline pte_t pte_mknuma(pte_t pte) +{ + return pte; +} + +static inline pmd_t pmd_mknuma(pmd_t pmd) +{ + return pmd; +} +#endif /* CONFIG_NUMA_BALANCING */ + #endif /* CONFIG_MMU */ #endif /* !__ASSEMBLY__ */ diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 092dc5305a3..1d76f8ca90f 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -31,7 +31,8 @@ extern int move_huge_pmd(struct vm_area_struct *vma, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd); extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, pgprot_t newprot); + unsigned long addr, pgprot_t newprot, + int prot_numa); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, @@ -111,7 +112,7 @@ extern void __split_huge_page_pmd(struct vm_area_struct *vma, #define wait_split_huge_page(__anon_vma, __pmd) \ do { \ pmd_t *____pmd = (__pmd); \ - anon_vma_lock(__anon_vma); \ + anon_vma_lock_write(__anon_vma); \ anon_vma_unlock(__anon_vma); \ BUG_ON(pmd_trans_splitting(*____pmd) || \ pmd_trans_huge(*____pmd)); \ @@ -171,6 +172,10 @@ static inline struct page *compound_trans_head(struct page *page) } return page; } + +extern int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd, pmd_t *pmdp); + #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) @@ -209,6 +214,13 @@ static inline int pmd_trans_huge_lock(pmd_t *pmd, { return 0; } + +static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd, pmd_t *pmdp) +{ + return 0; +} + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* _LINUX_HUGE_MM_H */ diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 3e7fa1acf09..0c80d3f57a5 100644 
--- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -87,7 +87,7 @@ struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, pud_t *pud, int write); int pmd_huge(pmd_t pmd); int pud_huge(pud_t pmd); -void hugetlb_change_protection(struct vm_area_struct *vma, +unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot); #else /* !CONFIG_HUGETLB_PAGE */ @@ -132,7 +132,11 @@ static inline void copy_huge_page(struct page *dst, struct page *src) { } -#define hugetlb_change_protection(vma, address, end, newprot) +static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma, + unsigned long address, unsigned long end, pgprot_t newprot) +{ + return 0; +} static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index dbd212723b7..9adc270de7e 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -188,6 +188,8 @@ static inline int vma_migratable(struct vm_area_struct *vma) return 1; } +extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long); + #else struct mempolicy {}; @@ -307,5 +309,11 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, return 0; } +static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma, + unsigned long address) +{ + return -1; /* no node preference */ +} + #endif /* CONFIG_NUMA */ #endif diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 0b5865c61ef..1e9f627967a 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -23,6 +23,15 @@ typedef struct page *new_page_t(struct page *, unsigned long private, int **); #define MIGRATEPAGE_BALLOON_SUCCESS 1 /* special ret code for balloon page * sucessful migration case. 
*/ +enum migrate_reason { + MR_COMPACTION, + MR_MEMORY_FAILURE, + MR_MEMORY_HOTPLUG, + MR_SYSCALL, /* also applies to cpusets */ + MR_MEMPOLICY_MBIND, + MR_NUMA_MISPLACED, + MR_CMA +}; #ifdef CONFIG_MIGRATION @@ -32,7 +41,7 @@ extern int migrate_page(struct address_space *, struct page *, struct page *, enum migrate_mode); extern int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, bool offlining, - enum migrate_mode mode); + enum migrate_mode mode, int reason); extern int migrate_huge_page(struct page *, new_page_t x, unsigned long private, bool offlining, enum migrate_mode mode); @@ -54,7 +63,7 @@ static inline void putback_lru_pages(struct list_head *l) {} static inline void putback_movable_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, bool offlining, - enum migrate_mode mode) { return -ENOSYS; } + enum migrate_mode mode, int reason) { return -ENOSYS; } static inline int migrate_huge_page(struct page *page, new_page_t x, unsigned long private, bool offlining, enum migrate_mode mode) { return -ENOSYS; } @@ -83,4 +92,37 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping, #define fail_migrate_page NULL #endif /* CONFIG_MIGRATION */ + +#ifdef CONFIG_NUMA_BALANCING +extern int migrate_misplaced_page(struct page *page, int node); +extern int migrate_misplaced_page(struct page *page, int node); +extern bool migrate_ratelimited(int node); +#else +static inline int migrate_misplaced_page(struct page *page, int node) +{ + return -EAGAIN; /* can't migrate now */ +} +static inline bool migrate_ratelimited(int node) +{ + return false; +} +#endif /* CONFIG_NUMA_BALANCING */ + +#if defined(CONFIG_NUMA_BALANCING) && defined(CONFIG_TRANSPARENT_HUGEPAGE) +extern int migrate_misplaced_transhuge_page(struct mm_struct *mm, + struct vm_area_struct *vma, + pmd_t *pmd, pmd_t entry, + unsigned long address, + struct page *page, int node); +#else +static inline 
int migrate_misplaced_transhuge_page(struct mm_struct *mm, + struct vm_area_struct *vma, + pmd_t *pmd, pmd_t entry, + unsigned long address, + struct page *page, int node) +{ + return -EAGAIN; +} +#endif /* CONFIG_NUMA_BALANCING && CONFIG_TRANSPARENT_HUGEPAGE*/ + #endif /* _LINUX_MIGRATE_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 4af4f0b1be4..7f4f906190b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -693,6 +693,36 @@ static inline int page_to_nid(const struct page *page) } #endif +#ifdef CONFIG_NUMA_BALANCING +static inline int page_xchg_last_nid(struct page *page, int nid) +{ + return xchg(&page->_last_nid, nid); +} + +static inline int page_last_nid(struct page *page) +{ + return page->_last_nid; +} +static inline void reset_page_last_nid(struct page *page) +{ + page->_last_nid = -1; +} +#else +static inline int page_xchg_last_nid(struct page *page, int nid) +{ + return page_to_nid(page); +} + +static inline int page_last_nid(struct page *page) +{ + return page_to_nid(page); +} + +static inline void reset_page_last_nid(struct page *page) +{ +} +#endif + static inline struct zone *page_zone(const struct page *page) { return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)]; @@ -1078,6 +1108,9 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma, extern unsigned long do_mremap(unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr); +extern unsigned long change_protection(struct vm_area_struct *vma, unsigned long start, + unsigned long end, pgprot_t newprot, + int dirty_accountable, int prot_numa); extern int mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, unsigned long start, unsigned long end, unsigned long newflags); @@ -1579,6 +1612,11 @@ static inline pgprot_t vm_get_page_prot(unsigned long vm_flags) } #endif +#ifdef CONFIG_ARCH_USES_NUMA_PROT_NONE +unsigned long change_prot_numa(struct vm_area_struct *vma, + unsigned 
+ long start, unsigned long end); +#endif + struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); int remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t); @@ -1600,6 +1638,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address, #define FOLL_MLOCK 0x40 /* mark page as mlocked */ #define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ +#define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, void *data); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7ade2731b5d..7d9ebb7cc98 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -175,6 +175,10 @@ struct page { */ void *shadow; #endif + +#ifdef CONFIG_NUMA_BALANCING + int _last_nid; +#endif } /* * The struct page can be forced to be double word aligned so that atomic ops @@ -411,9 +415,36 @@ struct mm_struct { #ifdef CONFIG_CPUMASK_OFFSTACK struct cpumask cpumask_allocation; #endif +#ifdef CONFIG_NUMA_BALANCING + /* + * numa_next_scan is the next time when the PTEs will be marked + * pte_numa to gather statistics and migrate pages to new nodes + * if necessary + */ + unsigned long numa_next_scan; + + /* numa_next_reset is when the PTE scanner period will be reset */ + unsigned long numa_next_reset; + + /* Restart point for scanning and setting pte_numa */ + unsigned long numa_scan_offset; + + /* numa_scan_seq prevents two threads setting pte_numa */ + int numa_scan_seq; + + /* + * The first node a task was scheduled on. If a task runs on + * a different node than this one, make the PTE scan go now. 
+ */ + int first_nid; +#endif struct uprobes_state uprobes_state; }; +/* first nid will either be a valid NID or one of these values */ +#define NUMA_PTE_SCAN_INIT -1 +#define NUMA_PTE_SCAN_ACTIVE -2 + static inline void mm_init_cpumask(struct mm_struct *mm) { #ifdef CONFIG_CPUMASK_OFFSTACK diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index cd55dad56aa..4bec5be82ca 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -735,6 +735,19 @@ typedef struct pglist_data { struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */ int kswapd_max_order; enum zone_type classzone_idx; +#ifdef CONFIG_NUMA_BALANCING + /* + * Lock serializing the per destination node AutoNUMA memory + * migration rate limiting data. + */ + spinlock_t numabalancing_migrate_lock; + + /* Rate limiting time interval */ + unsigned long numabalancing_migrate_next_window; + + /* Number of pages migrated during the rate limiting time interval */ + unsigned long numabalancing_migrate_nr_pages; +#endif } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bfe1f478064..c20635c527a 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -7,7 +7,7 @@ #include <linux/list.h> #include <linux/slab.h> #include <linux/mm.h> -#include <linux/mutex.h> +#include <linux/rwsem.h> #include <linux/memcontrol.h> /* @@ -25,8 +25,8 @@ * pointing to this anon_vma once its vma list is empty. 
*/ struct anon_vma { - struct anon_vma *root; /* Root of this anon_vma tree */ - struct mutex mutex; /* Serialize access to vma list */ + struct anon_vma *root; /* Root of this anon_vma tree */ + struct rw_semaphore rwsem; /* W: modification, R: walking the list */ /* * The refcount is taken on an anon_vma when there is no * guarantee that the vma of page tables will exist for @@ -64,7 +64,7 @@ struct anon_vma_chain { struct vm_area_struct *vma; struct anon_vma *anon_vma; struct list_head same_vma; /* locked by mmap_sem & page_table_lock */ - struct rb_node rb; /* locked by anon_vma->mutex */ + struct rb_node rb; /* locked by anon_vma->rwsem */ unsigned long rb_subtree_last; #ifdef CONFIG_DEBUG_VM_RB unsigned long cached_vma_start, cached_vma_last; @@ -108,26 +108,37 @@ static inline void vma_lock_anon_vma(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; if (anon_vma) - mutex_lock(&anon_vma->root->mutex); + down_write(&anon_vma->root->rwsem); } static inline void vma_unlock_anon_vma(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; if (anon_vma) - mutex_unlock(&anon_vma->root->mutex); + up_write(&anon_vma->root->rwsem); } -static inline void anon_vma_lock(struct anon_vma *anon_vma) +static inline void anon_vma_lock_write(struct anon_vma *anon_vma) { - mutex_lock(&anon_vma->root->mutex); + down_write(&anon_vma->root->rwsem); } static inline void anon_vma_unlock(struct anon_vma *anon_vma) { - mutex_unlock(&anon_vma->root->mutex); + up_write(&anon_vma->root->rwsem); } +static inline void anon_vma_lock_read(struct anon_vma *anon_vma) +{ + down_read(&anon_vma->root->rwsem); +} + +static inline void anon_vma_unlock_read(struct anon_vma *anon_vma) +{ + up_read(&anon_vma->root->rwsem); +} + + /* * anon_vma helper functions. */ @@ -220,8 +231,8 @@ int try_to_munlock(struct page *); /* * Called by memory-failure.c to kill processes. 
*/ -struct anon_vma *page_lock_anon_vma(struct page *page); -void page_unlock_anon_vma(struct anon_vma *anon_vma); +struct anon_vma *page_lock_anon_vma_read(struct page *page); +void page_unlock_anon_vma_read(struct anon_vma *anon_vma); int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); /* diff --git a/include/linux/sched.h b/include/linux/sched.h index 2c2f3072bee..b089c92c609 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1527,6 +1527,14 @@ struct task_struct { short il_next; short pref_node_fork; #endif +#ifdef CONFIG_NUMA_BALANCING + int numa_scan_seq; + int numa_migrate_seq; + unsigned int numa_scan_period; + u64 node_stamp; /* migration stamp */ + struct callback_head numa_work; +#endif /* CONFIG_NUMA_BALANCING */ + struct rcu_head rcu; /* @@ -1601,6 +1609,18 @@ struct task_struct { /* Future-safe accessor for struct task_struct's cpus_allowed. */ #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) +#ifdef CONFIG_NUMA_BALANCING +extern void task_numa_fault(int node, int pages, bool migrated); +extern void set_numabalancing_state(bool enabled); +#else +static inline void task_numa_fault(int node, int pages, bool migrated) +{ +} +static inline void set_numabalancing_state(bool enabled) +{ +} +#endif + /* * Priority of a process goes from 0..MAX_PRIO-1, valid RT * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH @@ -2030,6 +2050,13 @@ enum sched_tunable_scaling { }; extern enum sched_tunable_scaling sysctl_sched_tunable_scaling; +extern unsigned int sysctl_numa_balancing_scan_delay; +extern unsigned int sysctl_numa_balancing_scan_period_min; +extern unsigned int sysctl_numa_balancing_scan_period_max; +extern unsigned int sysctl_numa_balancing_scan_period_reset; +extern unsigned int sysctl_numa_balancing_scan_size; +extern unsigned int sysctl_numa_balancing_settle_count; + #ifdef CONFIG_SCHED_DEBUG extern unsigned int sysctl_sched_migration_cost; extern unsigned int sysctl_sched_nr_migrate; diff --git 
a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index fe786f07d2b..fce0a2799d4 100644 --- a/include/linux/vm_event_item.h +++ b/include/linu |