From 57c1ffcefb5acb3c8b5f8436c325a6bdbd8e9c78 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 14 Nov 2013 14:30:45 -0800 Subject: mm: rename USE_SPLIT_PTLOCKS to USE_SPLIT_PTE_PTLOCKS We're going to introduce split page table lock for PMD level. Let's rename existing split ptlock for PTE level to avoid confusion. Signed-off-by: Kirill A. Shutemov Tested-by: Alex Thorlton Cc: Ingo Molnar Cc: Naoya Horiguchi Cc: "Eric W . Biederman" Cc: "Paul E . McKenney" Cc: Al Viro Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Dave Hansen Cc: Dave Jones Cc: David Howells Cc: Frederic Weisbecker Cc: Johannes Weiner Cc: Kees Cook Cc: Mel Gorman Cc: Michael Kerrisk Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Robin Holt Cc: Sedat Dilek Cc: Srikar Dronamraju Cc: Thomas Gleixner Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 6 +++--- include/linux/mm_types.h | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 42a35d94b82..dc3333d6c98 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1316,7 +1316,7 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a } #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */ -#if USE_SPLIT_PTLOCKS +#if USE_SPLIT_PTE_PTLOCKS /* * We tuck a spinlock to guard each pagetable page into its struct page, * at page->private, with BUILD_BUG_ON to make sure that this will not @@ -1329,14 +1329,14 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a } while (0) #define pte_lock_deinit(page) ((page)->mapping = NULL) #define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));}) -#else /* !USE_SPLIT_PTLOCKS */ +#else /* !USE_SPLIT_PTE_PTLOCKS */ /* * We use mm->page_table_lock to guard all pagetable pages of the mm. */ #define pte_lock_init(page) do {} while (0) #define pte_lock_deinit(page) do {} while (0) #define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;}) -#endif /* USE_SPLIT_PTLOCKS */ +#endif /* USE_SPLIT_PTE_PTLOCKS */ static inline void pgtable_page_ctor(struct page *page) { diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index a3198e5aaf4..f1ff66dfa79 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -23,7 +23,7 @@ struct address_space; -#define USE_SPLIT_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS) +#define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS) /* * Each physical page in the system has a struct page associated with @@ -141,7 +141,7 @@ struct page { * indicates order in the buddy * system if PG_buddy is set. */ -#if USE_SPLIT_PTLOCKS +#if USE_SPLIT_PTE_PTLOCKS spinlock_t ptl; #endif struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */ @@ -309,14 +309,14 @@ enum { NR_MM_COUNTERS }; -#if USE_SPLIT_PTLOCKS && defined(CONFIG_MMU) +#if USE_SPLIT_PTE_PTLOCKS && defined(CONFIG_MMU) #define SPLIT_RSS_COUNTING /* per-thread cached information, */ struct task_rss_stat { int events; /* for synchronization threshold */ int count[NR_MM_COUNTERS]; }; -#endif /* USE_SPLIT_PTLOCKS */ +#endif /* USE_SPLIT_PTE_PTLOCKS */ struct mm_rss_stat { atomic_long_t count[NR_MM_COUNTERS]; -- cgit v1.2.3-70-g09d2 From e1f56c89b040134add93f686931cc266541d239a Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 14 Nov 2013 14:30:48 -0800 Subject: mm: convert mm->nr_ptes to atomic_long_t With split page table lock for PMD level we can't hold mm->page_table_lock while updating nr_ptes. Let's convert it to atomic_long_t to avoid races. Signed-off-by: Kirill A. Shutemov Tested-by: Alex Thorlton Cc: Ingo Molnar Cc: Naoya Horiguchi Cc: "Eric W . Biederman" Cc: "Paul E . McKenney" Cc: Al Viro Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Dave Hansen Cc: Dave Jones Cc: David Howells Cc: Frederic Weisbecker Cc: Johannes Weiner Cc: Kees Cook Cc: Mel Gorman Cc: Michael Kerrisk Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Robin Holt Cc: Sedat Dilek Cc: Srikar Dronamraju Cc: Thomas Gleixner Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 3 ++- include/linux/mm_types.h | 2 +- kernel/fork.c | 2 +- mm/huge_memory.c | 10 +++++----- mm/memory.c | 4 ++-- mm/mmap.c | 3 ++- mm/oom_kill.c | 6 +++--- 7 files changed, 16 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index abbe825d20f..8faaebdc6b0 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -62,7 +62,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) total_rss << (PAGE_SHIFT-10), data << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, - (PTRS_PER_PTE*sizeof(pte_t)*mm->nr_ptes) >> 10, + (PTRS_PER_PTE * sizeof(pte_t) * + atomic_long_read(&mm->nr_ptes)) >> 10, swap << (PAGE_SHIFT-10)); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f1ff66dfa79..566df579c51 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -339,6 +339,7 @@ struct mm_struct { pgd_t * pgd; atomic_t mm_users; /* How many users with user space? */ atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ + atomic_long_t nr_ptes; /* Page table pages */ int map_count; /* number of VMAs */ spinlock_t page_table_lock; /* Protects page tables and some counters */ @@ -360,7 +361,6 @@ struct mm_struct { unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE */ unsigned long stack_vm; /* VM_GROWSUP/DOWN */ unsigned long def_flags; - unsigned long nr_ptes; /* Page table pages */ unsigned long start_code, end_code, start_data, end_data; unsigned long start_brk, brk, start_stack; unsigned long arg_start, arg_end, env_start, env_end; diff --git a/kernel/fork.c b/kernel/fork.c index f6d11fc67f7..e2520756e00 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -532,7 +532,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p) mm->flags = (current->mm) ? (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; mm->core_state = NULL; - mm->nr_ptes = 0; + atomic_long_set(&mm->nr_ptes, 0); memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); mm_init_aio(mm); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0556c6a4495..e5b2d316be2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -738,7 +738,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); - mm->nr_ptes++; + atomic_long_inc(&mm->nr_ptes); spin_unlock(&mm->page_table_lock); } @@ -771,7 +771,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, entry = pmd_mkhuge(entry); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); - mm->nr_ptes++; + atomic_long_inc(&mm->nr_ptes); return true; } @@ -896,7 +896,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd = pmd_mkold(pmd_wrprotect(pmd)); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); set_pmd_at(dst_mm, addr, dst_pmd, pmd); - dst_mm->nr_ptes++; + atomic_long_inc(&dst_mm->nr_ptes); ret = 0; out_unlock: @@ -1392,7 +1392,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, tlb_remove_pmd_tlb_entry(tlb, pmd, addr); pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); if (is_huge_zero_pmd(orig_pmd)) { - tlb->mm->nr_ptes--; + atomic_long_dec(&tlb->mm->nr_ptes); spin_unlock(&tlb->mm->page_table_lock); put_huge_zero_page(); } else { @@ -1401,7 +1401,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, VM_BUG_ON(page_mapcount(page) < 0); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); VM_BUG_ON(!PageHead(page)); - tlb->mm->nr_ptes--; + atomic_long_dec(&tlb->mm->nr_ptes); spin_unlock(&tlb->mm->page_table_lock); tlb_remove_page(tlb, page); } diff --git a/mm/memory.c b/mm/memory.c index bf8665849a5..0b5a93a49f2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -382,7 +382,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, pgtable_t token = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free_tlb(tlb, token, addr); - tlb->mm->nr_ptes--; + atomic_long_dec(&tlb->mm->nr_ptes); } static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, @@ -573,7 +573,7 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, spin_lock(&mm->page_table_lock); wait_split_huge_page = 0; if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ - mm->nr_ptes++; + atomic_long_inc(&mm->nr_ptes); pmd_populate(mm, pmd, new); new = NULL; } else if (unlikely(pmd_trans_splitting(*pmd))) diff --git a/mm/mmap.c b/mm/mmap.c index 5a6baddde15..834b2d785f1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2724,7 +2724,8 @@ void exit_mmap(struct mm_struct *mm) } vm_unacct_memory(nr_accounted); - WARN_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); + WARN_ON(atomic_long_read(&mm->nr_ptes) > + (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); } /* Insert vm structure into process list sorted by address diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 6738c47f1f7..1e4a600a616 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -161,7 +161,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * The baseline for the badness score is the proportion of RAM that each * task's rss, pagetable and swap space use. */ - points = get_mm_rss(p->mm) + p->mm->nr_ptes + + points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) + get_mm_counter(p->mm, MM_SWAPENTS); task_unlock(p); @@ -364,10 +364,10 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas continue; } - pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5hd %s\n", + pr_info("[%5d] %5d %5d %8lu %8lu %7ld %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), task->tgid, task->mm->total_vm, get_mm_rss(task->mm), - task->mm->nr_ptes, + atomic_long_read(&task->mm->nr_ptes), get_mm_counter(task->mm, MM_SWAPENTS), task->signal->oom_score_adj, task->comm); task_unlock(task); -- cgit v1.2.3-70-g09d2 From 9a86cb7bdc4ccbe3f99a1ca275b90a322a90f9ce Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 14 Nov 2013 14:30:51 -0800 Subject: mm: introduce api for split page table lock for PMD level Basic api, backed by mm->page_table_lock for now. Actual implementation will be added later. Signed-off-by: Naoya Horiguchi Signed-off-by: Kirill A. Shutemov Tested-by: Alex Thorlton Cc: Ingo Molnar Cc: "Eric W . Biederman" Cc: "Paul E . McKenney" Cc: Al Viro Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Dave Hansen Cc: Dave Jones Cc: David Howells Cc: Frederic Weisbecker Cc: Johannes Weiner Cc: Kees Cook Cc: Mel Gorman Cc: Michael Kerrisk Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Robin Holt Cc: Sedat Dilek Cc: Srikar Dronamraju Cc: Thomas Gleixner Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index dc3333d6c98..4f4ca41fcf2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1378,6 +1378,19 @@ static inline void pgtable_page_dtor(struct page *page) ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ NULL: pte_offset_kernel(pmd, address)) +static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) +{ + return &mm->page_table_lock; +} + + +static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd) +{ + spinlock_t *ptl = pmd_lockptr(mm, pmd); + spin_lock(ptl); + return ptl; +} + extern void free_area_init(unsigned long * zones_size); extern void free_area_init_node(int nid, unsigned long * zones_size, unsigned long zone_start_pfn, unsigned long *zholes_size); -- cgit v1.2.3-70-g09d2 From bf929152e9f6c49b66fad4ebf08cc95b02ce48f5 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 14 Nov 2013 14:30:54 -0800 Subject: mm, thp: change pmd_trans_huge_lock() to return taken lock With split ptlock it's important to know which lock pmd_trans_huge_lock() took. This patch adds one more parameter to the function to return the lock. In most places migration to new api is trivial. Exception is move_huge_pmd(): we need to take two locks if pmd tables are different. Signed-off-by: Naoya Horiguchi Signed-off-by: Kirill A. Shutemov Tested-by: Alex Thorlton Cc: Ingo Molnar Cc: "Eric W . Biederman" Cc: "Paul E . McKenney" Cc: Al Viro Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Dave Hansen Cc: Dave Jones Cc: David Howells Cc: Frederic Weisbecker Cc: Johannes Weiner Cc: Kees Cook Cc: Mel Gorman Cc: Michael Kerrisk Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Robin Holt Cc: Sedat Dilek Cc: Srikar Dronamraju Cc: Thomas Gleixner Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 13 +++++++------ include/linux/huge_mm.h | 14 +++++++------- mm/huge_memory.c | 40 +++++++++++++++++++++++++++------------- mm/memcontrol.c | 10 +++++----- 4 files changed, 46 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 8faaebdc6b0..42b5cf5d032 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -506,9 +506,9 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte_t *pte; spinlock_t *ptl; - if (pmd_trans_huge_lock(pmd, vma) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { smaps_pte_entry(*(pte_t *)pmd, addr, HPAGE_PMD_SIZE, walk); - spin_unlock(&walk->mm->page_table_lock); + spin_unlock(ptl); mss->anonymous_thp += HPAGE_PMD_SIZE; return 0; } @@ -999,13 +999,14 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, { struct vm_area_struct *vma; struct pagemapread *pm = walk->private; + spinlock_t *ptl; pte_t *pte; int err = 0; pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); /* find the first VMA at or above 'addr' */ vma = find_vma(walk->mm, addr); - if (vma && pmd_trans_huge_lock(pmd, vma) == 1) { + if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { int pmd_flags2; if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd)) @@ -1023,7 +1024,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (err) break; } - spin_unlock(&walk->mm->page_table_lock); + spin_unlock(ptl); return err; } @@ -1325,7 +1326,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, md = walk->private; - if (pmd_trans_huge_lock(pmd, md->vma) == 1) { + if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) { pte_t huge_pte = *(pte_t *)pmd; struct page *page; @@ -1333,7 +1334,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, if (page) gather_stats(page, md, pte_dirty(huge_pte), HPAGE_PMD_SIZE/PAGE_SIZE); - spin_unlock(&walk->mm->page_table_lock); + spin_unlock(ptl); return 0; } diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 3935428c57c..4aca0d8da11 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -129,15 +129,15 @@ extern void __vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next); -extern int __pmd_trans_huge_lock(pmd_t *pmd, - struct vm_area_struct *vma); +extern int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, + spinlock_t **ptl); /* mmap_sem must be held on entry */ -static inline int pmd_trans_huge_lock(pmd_t *pmd, - struct vm_area_struct *vma) +static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, + spinlock_t **ptl) { VM_BUG_ON(!rwsem_is_locked(&vma->vm_mm->mmap_sem)); if (pmd_trans_huge(*pmd)) - return __pmd_trans_huge_lock(pmd, vma); + return __pmd_trans_huge_lock(pmd, vma, ptl); else return 0; } @@ -215,8 +215,8 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, long adjust_next) { } -static inline int pmd_trans_huge_lock(pmd_t *pmd, - struct vm_area_struct *vma) +static inline int pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, + spinlock_t **ptl) { return 0; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e5b2d316be2..471eb04066f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1376,9 +1376,10 @@ out: int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { + spinlock_t *ptl; int ret = 0; - if (__pmd_trans_huge_lock(pmd, vma) == 1) { + if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { struct page *page; pgtable_t pgtable; pmd_t orig_pmd; @@ -1393,7 +1394,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); if (is_huge_zero_pmd(orig_pmd)) { atomic_long_dec(&tlb->mm->nr_ptes); - spin_unlock(&tlb->mm->page_table_lock); + spin_unlock(ptl); put_huge_zero_page(); } else { page = pmd_page(orig_pmd); @@ -1402,7 +1403,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); VM_BUG_ON(!PageHead(page)); atomic_long_dec(&tlb->mm->nr_ptes); - spin_unlock(&tlb->mm->page_table_lock); + spin_unlock(ptl); tlb_remove_page(tlb, page); } pte_free(tlb->mm, pgtable); @@ -1415,14 +1416,15 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned char *vec) { + spinlock_t *ptl; int ret = 0; - if (__pmd_trans_huge_lock(pmd, vma) == 1) { + if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { /* * All logical pages in the range are present * if backed by a huge page. */ - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); memset(vec, 1, (end - addr) >> PAGE_SHIFT); ret = 1; } @@ -1435,6 +1437,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd) { + spinlock_t *old_ptl, *new_ptl; int ret = 0; pmd_t pmd; @@ -1455,12 +1458,21 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, goto out; } - ret = __pmd_trans_huge_lock(old_pmd, vma); + /* + * We don't have to worry about the ordering of src and dst + * ptlocks because exclusive mmap_sem prevents deadlock. + */ + ret = __pmd_trans_huge_lock(old_pmd, vma, &old_ptl); if (ret == 1) { + new_ptl = pmd_lockptr(mm, new_pmd); + if (new_ptl != old_ptl) + spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); VM_BUG_ON(!pmd_none(*new_pmd)); set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); - spin_unlock(&mm->page_table_lock); + if (new_ptl != old_ptl) + spin_unlock(new_ptl); + spin_unlock(old_ptl); } out: return ret; @@ -1476,9 +1488,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, pgprot_t newprot, int prot_numa) { struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; int ret = 0; - if (__pmd_trans_huge_lock(pmd, vma) == 1) { + if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { pmd_t entry; ret = 1; if (!prot_numa) { @@ -1507,7 +1520,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, if (ret == HPAGE_PMD_NR) set_pmd_at(mm, addr, pmd, entry); - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); } return ret; @@ -1520,12 +1533,13 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, * Note that if it returns 1, this routine returns without unlocking page * table locks. So callers must unlock them. */ -int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) +int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, + spinlock_t **ptl) { - spin_lock(&vma->vm_mm->page_table_lock); + *ptl = pmd_lock(vma->vm_mm, pmd); if (likely(pmd_trans_huge(*pmd))) { if (unlikely(pmd_trans_splitting(*pmd))) { - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(*ptl); wait_split_huge_page(vma->anon_vma, pmd); return -1; } else { @@ -1534,7 +1548,7 @@ int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) return 1; } } - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(*ptl); return 0; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e3cd40b2d5d..f1a0ae6e11b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6605,10 +6605,10 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, pte_t *pte; spinlock_t *ptl; - if (pmd_trans_huge_lock(pmd, vma) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) mc.precharge += HPAGE_PMD_NR; - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); return 0; } @@ -6797,9 +6797,9 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, * to be unlocked in __split_huge_page_splitting(), where the main * part of thp split is not executed yet. */ - if (pmd_trans_huge_lock(pmd, vma) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { if (mc.precharge < HPAGE_PMD_NR) { - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); return 0; } target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); @@ -6816,7 +6816,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, } put_page(page); } - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); return 0; } -- cgit v1.2.3-70-g09d2 From 117b0791ac42f2ec447bc864e70ad622b5604059 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 14 Nov 2013 14:30:56 -0800 Subject: mm, thp: move ptl taking inside page_check_address_pmd() With split page table lock we can't know which lock we need to take before we find the relevant pmd. Let's move lock taking inside the function. Signed-off-by: Naoya Horiguchi Signed-off-by: Kirill A. Shutemov Tested-by: Alex Thorlton Cc: Ingo Molnar Cc: "Eric W . Biederman" Cc: "Paul E . McKenney" Cc: Al Viro Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Dave Hansen Cc: Dave Jones Cc: David Howells Cc: Frederic Weisbecker Cc: Johannes Weiner Cc: Kees Cook Cc: Mel Gorman Cc: Michael Kerrisk Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Robin Holt Cc: Sedat Dilek Cc: Srikar Dronamraju Cc: Thomas Gleixner Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 3 ++- mm/huge_memory.c | 43 +++++++++++++++++++++++++++---------------- mm/rmap.c | 13 +++++-------- 3 files changed, 34 insertions(+), 25 deletions(-) (limited to 'include') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 4aca0d8da11..91672e2deec 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -54,7 +54,8 @@ enum page_check_address_pmd_flag { extern pmd_t *page_check_address_pmd(struct page *page, struct mm_struct *mm, unsigned long address, - enum page_check_address_pmd_flag flag); + enum page_check_address_pmd_flag flag, + spinlock_t **ptl); #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) #define HPAGE_PMD_NR (1<vm_mm; + spinlock_t *ptl; pmd_t *pmd; int ret = 0; /* For mmu_notifiers */ @@ -1600,9 +1612,8 @@ static int __split_huge_page_splitting(struct page *page, const unsigned long mmun_end = address + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - spin_lock(&mm->page_table_lock); pmd = page_check_address_pmd(page, mm, address, - PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); + PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, &ptl); if (pmd) { /* * We can't temporarily set the pmd to null in order @@ -1613,8 +1624,8 @@ static int __split_huge_page_splitting(struct page *page, */ pmdp_splitting_flush(vma, address, pmd); ret = 1; + spin_unlock(ptl); } - spin_unlock(&mm->page_table_lock); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); return ret; @@ -1745,14 +1756,14 @@ static int __split_huge_page_map(struct page *page, unsigned long address) { struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; pmd_t *pmd, _pmd; int ret = 0, i; pgtable_t pgtable; unsigned long haddr; - spin_lock(&mm->page_table_lock); pmd = page_check_address_pmd(page, mm, address, - PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); + PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, &ptl); if (pmd) { pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); @@ -1807,8 +1818,8 @@ static int __split_huge_page_map(struct page *page, pmdp_invalidate(vma, address, pmd); pmd_populate(mm, pmd, pgtable); ret = 1; + spin_unlock(ptl); } - spin_unlock(&mm->page_table_lock); return ret; } diff --git a/mm/rmap.c b/mm/rmap.c index fd3ee7a54a1..b59d741dcf6 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -665,25 +665,23 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, unsigned long *vm_flags) { struct mm_struct *mm = vma->vm_mm; + spinlock_t *ptl; int referenced = 0; if (unlikely(PageTransHuge(page))) { pmd_t *pmd; - spin_lock(&mm->page_table_lock); /* * rmap might return false positives; we must filter * these out using page_check_address_pmd(). */ pmd = page_check_address_pmd(page, mm, address, - PAGE_CHECK_ADDRESS_PMD_FLAG); - if (!pmd) { - spin_unlock(&mm->page_table_lock); + PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); + if (!pmd) goto out; - } if (vma->vm_flags & VM_LOCKED) { - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); *mapcount = 0; /* break early from loop */ *vm_flags |= VM_LOCKED; goto out; @@ -692,10 +690,9 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, /* go ahead even if the pmd is pmd_trans_splitting() */ if (pmdp_clear_flush_young_notify(vma, address, pmd)) referenced++; - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); } else { pte_t *pte; - spinlock_t *ptl; /* * rmap might return false positives; we must filter -- cgit v1.2.3-70-g09d2 From c389a250ab4cfa4a3775d9f2c45271618af6d5b2 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 14 Nov 2013 14:30:59 -0800 Subject: mm, thp: do not access mm->pmd_huge_pte directly Currently mm->pmd_huge_pte protected by page table lock. It will not work with split lock. We have to have per-pmd pmd_huge_pte for proper access serialization. For now, let's just introduce wrapper to access mm->pmd_huge_pte. Signed-off-by: Kirill A. Shutemov Tested-by: Alex Thorlton Cc: Alex Thorlton Cc: Ingo Molnar Cc: Naoya Horiguchi Cc: "Eric W . Biederman" Cc: "Paul E . McKenney" Cc: Al Viro Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Dave Hansen Cc: Dave Jones Cc: David Howells Cc: Frederic Weisbecker Cc: Johannes Weiner Cc: Kees Cook Cc: Mel Gorman Cc: Michael Kerrisk Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Robin Holt Cc: Sedat Dilek Cc: Srikar Dronamraju Cc: Thomas Gleixner Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/mm/pgtable.c | 12 ++++++------ arch/sparc/mm/tlb.c | 12 ++++++------ include/linux/mm.h | 1 + mm/pgtable-generic.c | 12 ++++++------ 4 files changed, 19 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 0a2e5e08674..1ea18fcfa21 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -1244,11 +1244,11 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, assert_spin_locked(&mm->page_table_lock); /* FIFO */ - if (!mm->pmd_huge_pte) + if (!pmd_huge_pte(mm, pmdp)) INIT_LIST_HEAD(lh); else - list_add(lh, (struct list_head *) mm->pmd_huge_pte); - mm->pmd_huge_pte = pgtable; + list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp)); + pmd_huge_pte(mm, pmdp) = pgtable; } pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) @@ -1260,12 +1260,12 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) assert_spin_locked(&mm->page_table_lock); /* FIFO */ - pgtable = mm->pmd_huge_pte; + pgtable = pmd_huge_pte(mm, pmdp); lh = (struct list_head *) pgtable; if (list_empty(lh)) - mm->pmd_huge_pte = NULL; + pmd_huge_pte(mm, pmdp) = NULL; else { - mm->pmd_huge_pte = (pgtable_t) lh->next; + pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next; list_del(lh); } ptep = (pte_t *) pgtable; diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c index 7a91f288c70..656cc46a81f 100644 --- a/arch/sparc/mm/tlb.c +++ b/arch/sparc/mm/tlb.c @@ -196,11 +196,11 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, assert_spin_locked(&mm->page_table_lock); /* FIFO */ - if (!mm->pmd_huge_pte) + if (!pmd_huge_pte(mm, pmdp)) INIT_LIST_HEAD(lh); else - list_add(lh, (struct list_head *) mm->pmd_huge_pte); - mm->pmd_huge_pte = pgtable; + list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp)); + pmd_huge_pte(mm, pmdp) = pgtable; } pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) @@ -211,12 +211,12 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) assert_spin_locked(&mm->page_table_lock); /* FIFO */ - pgtable = mm->pmd_huge_pte; + pgtable = pmd_huge_pte(mm, pmdp); lh = (struct list_head *) pgtable; if (list_empty(lh)) - mm->pmd_huge_pte = NULL; + pmd_huge_pte(mm, pmdp) = NULL; else { - mm->pmd_huge_pte = (pgtable_t) lh->next; + pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next; list_del(lh); } pte_val(pgtable[0]) = 0; diff --git a/include/linux/mm.h b/include/linux/mm.h index 4f4ca41fcf2..861cad53b74 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1383,6 +1383,7 @@ static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) return &mm->page_table_lock; } +#define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte) static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd) { diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 3929a40bd6c..41fee3e5d57 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -154,11 +154,11 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, assert_spin_locked(&mm->page_table_lock); /* FIFO */ - if (!mm->pmd_huge_pte) + if (!pmd_huge_pte(mm, pmdp)) INIT_LIST_HEAD(&pgtable->lru); else - list_add(&pgtable->lru, &mm->pmd_huge_pte->lru); - mm->pmd_huge_pte = pgtable; + list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru); + pmd_huge_pte(mm, pmdp) = pgtable; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif @@ -173,11 +173,11 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) assert_spin_locked(&mm->page_table_lock); /* FIFO */ - pgtable = mm->pmd_huge_pte; + pgtable = pmd_huge_pte(mm, pmdp); if (list_empty(&pgtable->lru)) - mm->pmd_huge_pte = NULL; + pmd_huge_pte(mm, pmdp) = NULL; else { - mm->pmd_huge_pte = list_entry(pgtable->lru.next, + pmd_huge_pte(mm, pmdp) = list_entry(pgtable->lru.next, struct page, lru); list_del(&pgtable->lru); } -- cgit v1.2.3-70-g09d2 From cb900f41215447433cbc456d1c4294e858a84d7c Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 14 Nov 2013 14:31:02 -0800 Subject: mm, hugetlb: convert hugetlbfs to use split pmd lock Hugetlb supports multiple page sizes. We use split lock only for PMD level, but not for PUD. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Naoya Horiguchi Signed-off-by: Kirill A. Shutemov Tested-by: Alex Thorlton Cc: Ingo Molnar Cc: "Eric W . Biederman" Cc: "Paul E . McKenney" Cc: Al Viro Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Dave Hansen Cc: Dave Jones Cc: David Howells Cc: Frederic Weisbecker Cc: Johannes Weiner Cc: Kees Cook Cc: Mel Gorman Cc: Michael Kerrisk Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Robin Holt Cc: Sedat Dilek Cc: Srikar Dronamraju Cc: Thomas Gleixner Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/meminfo.c | 2 +- include/linux/hugetlb.h | 26 ++++++++++++ include/linux/swapops.h | 7 +-- mm/hugetlb.c | 110 +++++++++++++++++++++++++++++------------------- mm/mempolicy.c | 5 ++- mm/migrate.c | 7 +-- mm/rmap.c | 2 +- 7 files changed, 105 insertions(+), 54 deletions(-) (limited to 'include') diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index c805d5b69ba..a77d2b29919 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -1,8 +1,8 @@ #include -#include #include #include #include +#include #include #include #include diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 0393270466c..acd2010328f 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -392,6 +392,15 @@ static inline int hugepage_migration_support(struct hstate *h) return pmd_huge_support() && (huge_page_shift(h) == PMD_SHIFT); } +static inline spinlock_t *huge_pte_lockptr(struct hstate *h, + struct mm_struct *mm, pte_t *pte) +{ + if (huge_page_size(h) == PMD_SIZE) + return pmd_lockptr(mm, (pmd_t *) pte); + VM_BUG_ON(huge_page_size(h) == PAGE_SIZE); + return &mm->page_table_lock; +} + #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; #define alloc_huge_page_node(h, nid) NULL @@ -401,6 +410,7 @@ struct hstate {}; #define hstate_sizelog(s) NULL #define hstate_vma(v) NULL #define hstate_inode(i) NULL +#define page_hstate(page) NULL #define huge_page_size(h) PAGE_SIZE #define huge_page_mask(h) PAGE_MASK #define vma_kernel_pagesize(v) PAGE_SIZE @@ -421,6 +431,22 @@ static inline pgoff_t basepage_index(struct page *page) #define dissolve_free_huge_pages(s, e) do {} while (0) #define pmd_huge_support() 0 #define hugepage_migration_support(h) 0 + +static inline spinlock_t *huge_pte_lockptr(struct hstate *h, + struct mm_struct *mm, pte_t *pte) +{ + return &mm->page_table_lock; +} #endif /* CONFIG_HUGETLB_PAGE */ +static inline spinlock_t *huge_pte_lock(struct hstate *h, + struct mm_struct *mm, pte_t *pte) +{ + spinlock_t *ptl; + + ptl = huge_pte_lockptr(h, mm, pte); + spin_lock(ptl); + return ptl; +} + #endif /* _LINUX_HUGETLB_H */ diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 8d4fa82bfb9..c0f75261a72 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -139,7 +139,8 @@ static inline void make_migration_entry_read(swp_entry_t *entry) extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address); -extern void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte); +extern void migration_entry_wait_huge(struct vm_area_struct *vma, + struct mm_struct *mm, pte_t *pte); #else #define make_migration_entry(page, write) swp_entry(0, 0) @@ -151,8 +152,8 @@ static inline int is_migration_entry(swp_entry_t swp) static inline void make_migration_entry_read(swp_entry_t *entryp) { } static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, unsigned long address) { } -static inline void migration_entry_wait_huge(struct mm_struct *mm, - pte_t *pte) { } +static inline void migration_entry_wait_huge(struct vm_area_struct *vma, + struct mm_struct *mm, pte_t *pte) { } static inline int is_write_migration_entry(swp_entry_t entry) { return 0; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0b7656e804d..7d57af21f49 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2376,6 +2376,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { + spinlock_t *src_ptl, *dst_ptl; src_pte = huge_pte_offset(src, addr); if (!src_pte) continue; @@ -2387,8 +2388,9 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, if (dst_pte == src_pte) continue; - spin_lock(&dst->page_table_lock); - spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING); + dst_ptl = huge_pte_lock(h, dst, dst_pte); + src_ptl = huge_pte_lockptr(h, src, src_pte); + spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); if (!huge_pte_none(huge_ptep_get(src_pte))) { if (cow) huge_ptep_set_wrprotect(src, addr, src_pte); @@ -2398,8 +2400,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, page_dup_rmap(ptepage); set_huge_pte_at(dst, addr, dst_pte, entry); } - spin_unlock(&src->page_table_lock); - spin_unlock(&dst->page_table_lock); + spin_unlock(src_ptl); + spin_unlock(dst_ptl); } return 0; @@ -2442,6 +2444,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long address; pte_t *ptep; pte_t pte; + spinlock_t *ptl; struct page *page; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); @@ -2455,25 +2458,25 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, tlb_start_vma(tlb, vma); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); again: - spin_lock(&mm->page_table_lock); for (address = start; address < end; address += sz) { ptep = huge_pte_offset(mm, address); if (!ptep) continue; + ptl = huge_pte_lock(h, mm, ptep); if (huge_pmd_unshare(mm, &address, ptep)) - continue; + goto unlock; pte = huge_ptep_get(ptep); if (huge_pte_none(pte)) - continue; + goto unlock; /* * HWPoisoned hugepage is already unmapped and dropped reference */ if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { huge_pte_clear(mm, address, ptep); - continue; + goto unlock; } page = pte_page(pte); @@ -2484,7 +2487,7 @@ again: */ if (ref_page) { if (page != ref_page) - continue; + goto unlock; /* * Mark the VMA as having unmapped its page so that @@ -2501,13 +2504,18 @@ again: page_remove_rmap(page); force_flush = !__tlb_remove_page(tlb, page); - if (force_flush) + if (force_flush) { + spin_unlock(ptl); break; + } /* Bail out after unmapping reference page if supplied */ - if (ref_page) + if (ref_page) { + spin_unlock(ptl); break; + } +unlock: + spin_unlock(ptl); } - spin_unlock(&mm->page_table_lock); /* * mmu_gather ran out of room to batch pages, we break out of * the PTE lock to avoid doing the potential expensive TLB invalidate @@ -2613,7 +2621,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, */ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t pte, - struct page *pagecache_page) + struct page *pagecache_page, spinlock_t *ptl) { struct hstate *h = hstate_vma(vma); struct page *old_page, *new_page; @@ -2647,8 +2655,8 @@ retry_avoidcopy: page_cache_get(old_page); - /* Drop page_table_lock as buddy allocator may be called */ - spin_unlock(&mm->page_table_lock); + /* Drop page table lock as buddy allocator may be called */ + spin_unlock(ptl); new_page = alloc_huge_page(vma, address, outside_reserve); if (IS_ERR(new_page)) { @@ -2666,13 +2674,13 @@ retry_avoidcopy: BUG_ON(huge_pte_none(pte)); if (unmap_ref_private(mm, vma, old_page, address)) { BUG_ON(huge_pte_none(pte)); - spin_lock(&mm->page_table_lock); + spin_lock(ptl); ptep = huge_pte_offset(mm, address & huge_page_mask(h)); if (likely(pte_same(huge_ptep_get(ptep), pte))) goto retry_avoidcopy; /* - * race occurs while re-acquiring page_table_lock, and - * our job is done. + * race occurs while re-acquiring page table + * lock, and our job is done. */ return 0; } @@ -2680,7 +2688,7 @@ retry_avoidcopy: } /* Caller expects lock to be held */ - spin_lock(&mm->page_table_lock); + spin_lock(ptl); if (err == -ENOMEM) return VM_FAULT_OOM; else @@ -2695,7 +2703,7 @@ retry_avoidcopy: page_cache_release(new_page); page_cache_release(old_page); /* Caller expects lock to be held */ - spin_lock(&mm->page_table_lock); + spin_lock(ptl); return VM_FAULT_OOM; } @@ -2707,10 +2715,10 @@ retry_avoidcopy: mmun_end = mmun_start + huge_page_size(h); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); /* - * Retake the page_table_lock to check for racing updates + * Retake the page table lock to check for racing updates * before the page tables are altered */ - spin_lock(&mm->page_table_lock); + spin_lock(ptl); ptep = huge_pte_offset(mm, address & huge_page_mask(h)); if (likely(pte_same(huge_ptep_get(ptep), pte))) { ClearPagePrivate(new_page); @@ -2724,13 +2732,13 @@ retry_avoidcopy: /* Make the old page be freed below */ new_page = old_page; } - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); page_cache_release(new_page); page_cache_release(old_page); /* Caller expects lock to be held */ - spin_lock(&mm->page_table_lock); + spin_lock(ptl); return 0; } @@ -2778,6 +2786,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page; struct address_space *mapping; pte_t new_pte; + spinlock_t *ptl; /* * Currently, we are forced to kill the process in the event the @@ -2864,7 +2873,8 @@ retry: goto backout_unlocked; } - spin_lock(&mm->page_table_lock); + ptl = huge_pte_lockptr(h, mm, ptep); + spin_lock(ptl); size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) goto backout; @@ -2885,16 +2895,16 @@ retry: if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page); + ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl); } - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); unlock_page(page); out: return ret; backout: - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); backout_unlocked: unlock_page(page); put_page(page); @@ -2906,6 +2916,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, { pte_t *ptep; pte_t entry; + spinlock_t *ptl; int ret; struct page *page = NULL; struct page *pagecache_page = NULL; @@ -2918,7 +2929,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (ptep) { entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { - migration_entry_wait_huge(mm, ptep); + migration_entry_wait_huge(vma, mm, ptep); return 0; } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON_LARGE | @@ -2974,17 +2985,18 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (page != pagecache_page) lock_page(page); - spin_lock(&mm->page_table_lock); + ptl = huge_pte_lockptr(h, mm, ptep); + spin_lock(ptl); /* Check for a racing update before calling hugetlb_cow */ if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) - goto out_page_table_lock; + goto out_ptl; if (flags & FAULT_FLAG_WRITE) { if (!huge_pte_write(entry)) { ret = hugetlb_cow(mm, vma, address, ptep, entry, - pagecache_page); - goto out_page_table_lock; + pagecache_page, ptl); + goto out_ptl; } entry = huge_pte_mkdirty(entry); } @@ -2993,8 +3005,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, flags & FAULT_FLAG_WRITE)) update_mmu_cache(vma, address, ptep); -out_page_table_lock: - spin_unlock(&mm->page_table_lock); +out_ptl: + spin_unlock(ptl); if (pagecache_page) { unlock_page(pagecache_page); @@ -3020,9 +3032,9 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long remainder = *nr_pages; struct hstate *h = hstate_vma(vma); - spin_lock(&mm->page_table_lock); while (vaddr < vma->vm_end && remainder) { pte_t *pte; + spinlock_t *ptl = NULL; int absent; struct page *page; @@ -3030,8 +3042,12 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * Some archs (sparc64, sh*) have multiple pte_ts to * each hugepage. We have to make sure we get the * first, for the page indexing below to work. + * + * Note that page table lock is not held when pte is null. */ pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); + if (pte) + ptl = huge_pte_lock(h, mm, pte); absent = !pte || huge_pte_none(huge_ptep_get(pte)); /* @@ -3043,6 +3059,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, */ if (absent && (flags & FOLL_DUMP) && !hugetlbfs_pagecache_present(h, vma, vaddr)) { + if (pte) + spin_unlock(ptl); remainder = 0; break; } @@ -3062,10 +3080,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, !huge_pte_write(huge_ptep_get(pte)))) { int ret; - spin_unlock(&mm->page_table_lock); + if (pte) + spin_unlock(ptl); ret = hugetlb_fault(mm, vma, vaddr, (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0); - spin_lock(&mm->page_table_lock); if (!(ret & VM_FAULT_ERROR)) continue; @@ -3096,8 +3114,8 @@ same_page: */ goto same_page; } + spin_unlock(ptl); } - spin_unlock(&mm->page_table_lock); *nr_pages = remainder; *position = vaddr; @@ -3118,13 +3136,15 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, flush_cache_range(vma, address, end); mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); - spin_lock(&mm->page_table_lock); for (; address < end; address += huge_page_size(h)) { + spinlock_t *ptl; ptep = huge_pte_offset(mm, address); if (!ptep) continue; + ptl = huge_pte_lock(h, mm, ptep); if (huge_pmd_unshare(mm, &address, ptep)) { pages++; + spin_unlock(ptl); continue; } if (!huge_pte_none(huge_ptep_get(ptep))) { @@ -3134,8 +3154,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, set_huge_pte_at(mm, address, ptep, pte); pages++; } + spin_unlock(ptl); } - spin_unlock(&mm->page_table_lock); /* * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare * may have cleared our pud entry and done put_page on the page table: @@ -3298,6 +3318,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) unsigned long saddr; pte_t *spte = NULL; pte_t *pte; + spinlock_t *ptl; if (!vma_shareable(vma, addr)) return (pte_t *)pmd_alloc(mm, pud, addr); @@ -3320,13 +3341,14 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (!spte) goto out; - spin_lock(&mm->page_table_lock); + ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte); + spin_lock(ptl); if (pud_none(*pud)) pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK)); else put_page(virt_to_page(spte)); - spin_unlock(&mm->page_table_lock); + spin_unlock(ptl); out: pte = (pte_t *)pmd_alloc(mm, pud, addr); mutex_unlock(&mapping->i_mmap_mutex); @@ -3340,7 +3362,7 @@ out: * indicated by page_count > 1, unmap is achieved by clearing pud and * decrementing the ref count. If count == 1, the pte page is not shared. * - * called with vma->vm_mm->page_table_lock held. + * called with page table lock held. * * returns: 1 successfully unmapped a shared pte page * 0 the underlying pte page is not shared, or it is the last user diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4cc19f6ab6c..c4403cdf343 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -525,8 +525,9 @@ static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma, #ifdef CONFIG_HUGETLB_PAGE int nid; struct page *page; + spinlock_t *ptl; - spin_lock(&vma->vm_mm->page_table_lock); + ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd); page = pte_page(huge_ptep_get((pte_t *)pmd)); nid = page_to_nid(page); if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) @@ -536,7 +537,7 @@ static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma, (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) isolate_huge_page(page, private); unlock: - spin_unlock(&vma->vm_mm->page_table_lock); + spin_unlock(ptl); #else BUG(); #endif diff --git a/mm/migrate.c b/mm/migrate.c index dfc8300ecbb..01f45cefa4c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -130,7 +130,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, ptep = huge_pte_offset(mm, addr); if (!ptep) goto out; - ptl = &mm->page_table_lock; + ptl = huge_pte_lockptr(hstate_vma(vma), mm, ptep); } else { pmd = mm_find_pmd(mm, addr); if (!pmd) @@ -249,9 +249,10 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, __migration_entry_wait(mm, ptep, ptl); } -void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte) +void migration_entry_wait_huge(struct vm_area_struct *vma, + struct mm_struct *mm, pte_t *pte) { - spinlock_t *ptl = &(mm)->page_table_lock; + spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte); __migration_entry_wait(mm, pte, ptl); } diff --git a/mm/rmap.c b/mm/rmap.c index b59d741dcf6..55c8b8dc9ff 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -601,7 +601,7 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm, if (unlikely(PageHuge(page))) { pte = huge_pte_offset(mm, address); - ptl = &mm->page_table_lock; + ptl = huge_pte_lockptr(page_hstate(page), mm, pte); goto check; } -- cgit v1.2.3-70-g09d2 From e009bb30c8df8a52a9622b616b67436b6a03a0cd Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 14 Nov 2013 14:31:07 -0800 Subject: mm: implement split page table lock for PMD level The basic idea is the same as with PTE level: the lock is embedded into struct page of table's page. We can't use mm->pmd_huge_pte to store pgtables for THP, since we don't take mm->page_table_lock anymore. Let's reuse page->lru of table's page for that. pgtable_pmd_page_ctor() returns true, if initialization is successful and false otherwise. Current implementation never fails, but assumption that constructor can fail will help to port it to -rt where spinlock_t is rather huge and cannot be embedded into struct page -- dynamic allocation is required. Signed-off-by: Naoya Horiguchi Signed-off-by: Kirill A. Shutemov Tested-by: Alex Thorlton Cc: Ingo Molnar Cc: "Eric W . Biederman" Cc: "Paul E . McKenney" Cc: Al Viro Cc: Andi Kleen Cc: Andrea Arcangeli Cc: Dave Hansen Cc: Dave Jones Cc: David Howells Cc: Frederic Weisbecker Cc: Johannes Weiner Cc: Kees Cook Cc: Mel Gorman Cc: Michael Kerrisk Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Robin Holt Cc: Sedat Dilek Cc: Srikar Dronamraju Cc: Thomas Gleixner Cc: Hugh Dickins Reviewed-by: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 32 ++++++++++++++++++++++++++++++++ include/linux/mm_types.h | 7 ++++++- kernel/fork.c | 4 ++-- mm/Kconfig | 3 +++ 4 files changed, 43 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 861cad53b74..255750d9b1b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1378,13 +1378,45 @@ static inline void pgtable_page_dtor(struct page *page) ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ NULL: pte_offset_kernel(pmd, address)) +#if USE_SPLIT_PMD_PTLOCKS + +static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) +{ + return &virt_to_page(pmd)->ptl; +} + +static inline bool pgtable_pmd_page_ctor(struct page *page) +{ + spin_lock_init(&page->ptl); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + page->pmd_huge_pte = NULL; +#endif + return true; +} + +static inline void pgtable_pmd_page_dtor(struct page *page) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + VM_BUG_ON(page->pmd_huge_pte); +#endif +} + +#define pmd_huge_pte(mm, pmd) (virt_to_page(pmd)->pmd_huge_pte) + +#else + static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) { return &mm->page_table_lock; } +static inline bool pgtable_pmd_page_ctor(struct page *page) { return true; } +static inline void pgtable_pmd_page_dtor(struct page *page) {} + #define pmd_huge_pte(mm, pmd) ((mm)->pmd_huge_pte) +#endif + static inline spinlock_t *pmd_lock(struct mm_struct *mm, pmd_t *pmd) { spinlock_t *ptl = pmd_lockptr(mm, pmd); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 566df579c51..93426109997 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -24,6 +24,8 @@ struct address_space; #define USE_SPLIT_PTE_PTLOCKS (NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS) +#define USE_SPLIT_PMD_PTLOCKS (USE_SPLIT_PTE_PTLOCKS && \ + IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK)) /* * Each physical page in the system has a struct page associated with @@ -63,6 +65,9 @@ struct page { * this page is only used to * free other pages. */ +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS + pgtable_t pmd_huge_pte; /* protected by page->ptl */ +#endif }; union { @@ -406,7 +411,7 @@ struct mm_struct { #ifdef CONFIG_MMU_NOTIFIER struct mmu_notifier_mm *mmu_notifier_mm; #endif -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS pgtable_t pmd_huge_pte; /* protected by page_table_lock */ #endif #ifdef CONFIG_CPUMASK_OFFSTACK diff --git a/kernel/fork.c b/kernel/fork.c index e2520756e00..728d5be9548 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -560,7 +560,7 @@ static void check_mm(struct mm_struct *mm) "mm:%p idx:%d val:%ld\n", mm, i, x); } -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS VM_BUG_ON(mm->pmd_huge_pte); #endif } @@ -814,7 +814,7 @@ struct mm_struct *dup_mm(struct task_struct *tsk) memcpy(mm, oldmm, sizeof(*mm)); mm_init_cpumask(mm); -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS mm->pmd_huge_pte = NULL; #endif if (!mm_init(mm, tsk)) diff --git a/mm/Kconfig b/mm/Kconfig index c28d247e524..7aa02def466 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -222,6 +222,9 @@ config SPLIT_PTLOCK_CPUS default "999999" if !64BIT && GENERIC_LOCKBREAK default "4" +config ARCH_ENABLE_SPLIT_PMD_PTLOCK + boolean + # # support for memory balloon compaction config BALLOON_COMPACTION -- cgit v1.2.3-70-g09d2 From 390f44e2aa2ab83f08231d7d05f066dc3494490e Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 14 Nov 2013 14:31:20 -0800 Subject: mm: allow pgtable_page_ctor() to fail Change pgtable_page_ctor() return type from void to bool. Returns true, if initialization is successful and false otherwise. Current implementation never fails, but it will change later. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 255750d9b1b..e855351c336 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1338,10 +1338,11 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a #define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;}) #endif /* USE_SPLIT_PTE_PTLOCKS */ -static inline void pgtable_page_ctor(struct page *page) +static inline bool pgtable_page_ctor(struct page *page) { pte_lock_init(page); inc_zone_page_state(page, NR_PAGETABLE); + return true; } static inline void pgtable_page_dtor(struct page *page) -- cgit v1.2.3-70-g09d2 From 49076ec2ccaf68610aa03d96bced9a6694b93ca1 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 14 Nov 2013 14:31:51 -0800 Subject: mm: dynamically allocate page->ptl if it cannot be embedded to struct page If split page table lock is in use, we embed the lock into struct page of table's page. We have to disable split lock, if spinlock_t is too big be to be embedded, like when DEBUG_SPINLOCK or DEBUG_LOCK_ALLOC enabled. This patch add support for dynamic allocation of split page table lock if we can't embed it to struct page. page->ptl is unsigned long now and we use it as spinlock_t if sizeof(spinlock_t) <= sizeof(long), otherwise it's pointer to spinlock_t. The spinlock_t allocated in pgtable_page_ctor() for PTE table and in pgtable_pmd_page_ctor() for PMD table. All other helpers converted to support dynamically allocated page->ptl. Signed-off-by: Kirill A. Shutemov Reviewed-by: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/split_page_table_lock | 94 ++++++++++++++++++++++++++++++++++ arch/x86/xen/mmu.c | 2 +- include/linux/mm.h | 81 +++++++++++++++++++++-------- include/linux/mm_types.h | 5 +- mm/Kconfig | 2 - mm/memory.c | 19 +++++++ 6 files changed, 179 insertions(+), 24 deletions(-) create mode 100644 Documentation/vm/split_page_table_lock (limited to 'include') diff --git a/Documentation/vm/split_page_table_lock b/Documentation/vm/split_page_table_lock new file mode 100644 index 00000000000..7521d367f21 --- /dev/null +++ b/Documentation/vm/split_page_table_lock @@ -0,0 +1,94 @@ +Split page table lock +===================== + +Originally, mm->page_table_lock spinlock protected all page tables of the +mm_struct. But this approach leads to poor page fault scalability of +multi-threaded applications due high contention on the lock. To improve +scalability, split page table lock was introduced. + +With split page table lock we have separate per-table lock to serialize +access to the table. At the moment we use split lock for PTE and PMD +tables. Access to higher level tables protected by mm->page_table_lock. + +There are helpers to lock/unlock a table and other accessor functions: + - pte_offset_map_lock() + maps pte and takes PTE table lock, returns pointer to the taken + lock; + - pte_unmap_unlock() + unlocks and unmaps PTE table; + - pte_alloc_map_lock() + allocates PTE table if needed and take the lock, returns pointer + to taken lock or NULL if allocation failed; + - pte_lockptr() + returns pointer to PTE table lock; + - pmd_lock() + takes PMD table lock, returns pointer to taken lock; + - pmd_lockptr() + returns pointer to PMD table lock; + +Split page table lock for PTE tables is enabled compile-time if +CONFIG_SPLIT_PTLOCK_CPUS (usually 4) is less or equal to NR_CPUS. +If split lock is disabled, all tables guaded by mm->page_table_lock. + +Split page table lock for PMD tables is enabled, if it's enabled for PTE +tables and the architecture supports it (see below). + +Hugetlb and split page table lock +--------------------------------- + +Hugetlb can support several page sizes. We use split lock only for PMD +level, but not for PUD. + +Hugetlb-specific helpers: + - huge_pte_lock() + takes pmd split lock for PMD_SIZE page, mm->page_table_lock + otherwise; + - huge_pte_lockptr() + returns pointer to table lock; + +Support of split page table lock by an architecture +--------------------------------------------------- + +There's no need in special enabling of PTE split page table lock: +everything required is done by pgtable_page_ctor() and pgtable_page_dtor(), +which must be called on PTE table allocation / freeing. + +Make sure the architecture doesn't use slab allocator for page table +allocation: slab uses page->slab_cache and page->first_page for its pages. +These fields share storage with page->ptl. + +PMD split lock only makes sense if you have more than two page table +levels. + +PMD split lock enabling requires pgtable_pmd_page_ctor() call on PMD table +allocation and pgtable_pmd_page_dtor() on freeing. + +Allocation usually happens in pmd_alloc_one(), freeing in pmd_free(), but +make sure you cover all PMD table allocation / freeing paths: i.e X86_PAE +preallocate few PMDs on pgd_alloc(). + +With everything in place you can set CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK. + +NOTE: pgtable_page_ctor() and pgtable_pmd_page_ctor() can fail -- it must +be handled properly. + +page->ptl +--------- + +page->ptl is used to access split page table lock, where 'page' is struct +page of page containing the table. It shares storage with page->private +(and few other fields in union). + +To avoid increasing size of struct page and have best performance, we use a +trick: + - if spinlock_t fits into long, we use page->ptr as spinlock, so we + can avoid indirect access and save a cache line. + - if size of spinlock_t is bigger then size of long, we use page->ptl as + pointer to spinlock_t and allocate it dynamically. This allows to use + split lock with enabled DEBUG_SPINLOCK or DEBUG_LOCK_ALLOC, but costs + one more cache line for indirect access; + +The spinlock_t allocated in pgtable_page_ctor() for PTE table and in +pgtable_pmd_page_ctor() for PMD table. + +Please, never access page->ptl directly -- use appropriate helper. diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 455c873ce00..49c962fe7e6 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -797,7 +797,7 @@ static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm) spinlock_t *ptl = NULL; #if USE_SPLIT_PTE_PTLOCKS - ptl = __pte_lockptr(page); + ptl = ptlock_ptr(page); spin_lock_nest_lock(ptl, &mm->page_table_lock); #endif diff --git a/include/linux/mm.h b/include/linux/mm.h index e855351c336..d0339741b6c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1317,32 +1317,73 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */ #if USE_SPLIT_PTE_PTLOCKS -/* - * We tuck a spinlock to guard each pagetable page into its struct page, - * at page->private, with BUILD_BUG_ON to make sure that this will not - * overflow into the next struct page (as it might with DEBUG_SPINLOCK). - * When freeing, reset page->mapping so free_pages_check won't complain. - */ -#define __pte_lockptr(page) &((page)->ptl) -#define pte_lock_init(_page) do { \ - spin_lock_init(__pte_lockptr(_page)); \ -} while (0) -#define pte_lock_deinit(page) ((page)->mapping = NULL) -#define pte_lockptr(mm, pmd) ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));}) +bool __ptlock_alloc(struct page *page); +void __ptlock_free(struct page *page); +static inline bool ptlock_alloc(struct page *page) +{ + if (sizeof(spinlock_t) > sizeof(page->ptl)) + return __ptlock_alloc(page); + return true; +} +static inline void ptlock_free(struct page *page) +{ + if (sizeof(spinlock_t) > sizeof(page->ptl)) + __ptlock_free(page); +} + +static inline spinlock_t *ptlock_ptr(struct page *page) +{ + if (sizeof(spinlock_t) > sizeof(page->ptl)) + return (spinlock_t *) page->ptl; + else + return (spinlock_t *) &page->ptl; +} + +static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) +{ + return ptlock_ptr(pmd_page(*pmd)); +} + +static inline bool ptlock_init(struct page *page) +{ + /* + * prep_new_page() initialize page->private (and therefore page->ptl) + * with 0. Make sure nobody took it in use in between. + * + * It can happen if arch try to use slab for page table allocation: + * slab code uses page->slab_cache and page->first_page (for tail + * pages), which share storage with page->ptl. + */ + VM_BUG_ON(page->ptl); + if (!ptlock_alloc(page)) + return false; + spin_lock_init(ptlock_ptr(page)); + return true; +} + +/* Reset page->mapping so free_pages_check won't complain. */ +static inline void pte_lock_deinit(struct page *page) +{ + page->mapping = NULL; + ptlock_free(page); +} + #else /* !USE_SPLIT_PTE_PTLOCKS */ /* * We use mm->page_table_lock to guard all pagetable pages of the mm. */ -#define pte_lock_init(page) do {} while (0) -#define pte_lock_deinit(page) do {} while (0) -#define pte_lockptr(mm, pmd) ({(void)(pmd); &(mm)->page_table_lock;}) +static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) +{ + return &mm->page_table_lock; +} +static inline bool ptlock_init(struct page *page) { return true; } +static inline void pte_lock_deinit(struct page *page) {} #endif /* USE_SPLIT_PTE_PTLOCKS */ static inline bool pgtable_page_ctor(struct page *page) { - pte_lock_init(page); inc_zone_page_state(page, NR_PAGETABLE); - return true; + return ptlock_init(page); } static inline void pgtable_page_dtor(struct page *page) @@ -1383,16 +1424,15 @@ static inline void pgtable_page_dtor(struct page *page) static inline spinlock_t *pmd_lockptr(struct mm_struct *mm, pmd_t *pmd) { - return &virt_to_page(pmd)->ptl; + return ptlock_ptr(virt_to_page(pmd)); } static inline bool pgtable_pmd_page_ctor(struct page *page) { - spin_lock_init(&page->ptl); #ifdef CONFIG_TRANSPARENT_HUGEPAGE page->pmd_huge_pte = NULL; #endif - return true; + return ptlock_init(page); } static inline void pgtable_pmd_page_dtor(struct page *page) @@ -1400,6 +1440,7 @@ static inline void pgtable_pmd_page_dtor(struct page *page) #ifdef CONFIG_TRANSPARENT_HUGEPAGE VM_BUG_ON(page->pmd_huge_pte); #endif + ptlock_free(page); } #define pmd_huge_pte(mm, pmd) (virt_to_page(pmd)->pmd_huge_pte) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 93426109997..423da79877e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -147,7 +147,10 @@ struct page { * system if PG_buddy is set. */ #if USE_SPLIT_PTE_PTLOCKS - spinlock_t ptl; + unsigned long ptl; /* It's spinlock_t if it fits to long, + * otherwise it's pointer to dynamicaly + * allocated spinlock_t. + */ #endif struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */ struct page *first_page; /* Compound tail pages */ diff --git a/mm/Kconfig b/mm/Kconfig index 7aa02def466..de31af25620 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -218,8 +218,6 @@ config SPLIT_PTLOCK_CPUS int default "999999" if ARM && !CPU_CACHE_VIPT default "999999" if PARISC && !PA20 - default "999999" if DEBUG_SPINLOCK || DEBUG_LOCK_ALLOC - default "999999" if !64BIT && GENERIC_LOCKBREAK default "4" config ARCH_ENABLE_SPLIT_PMD_PTLOCK diff --git a/mm/memory.c b/mm/memory.c index d05c6b10e92..24ffae2a530 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4270,3 +4270,22 @@ void copy_user_huge_page(struct page *dst, struct page *src, } } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ + +#if USE_SPLIT_PTE_PTLOCKS +bool __ptlock_alloc(struct page *page) +{ + spinlock_t *ptl; + + ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL); + if (!ptl) + return false; + page->ptl = (unsigned long)ptl; + return true; +} + +void __ptlock_free(struct page *page) +{ + if (sizeof(spinlock_t) > sizeof(page->ptl)) + kfree((spinlock_t *)page->ptl); +} +#endif -- cgit v1.2.3-70-g09d2 From 539edb5846c740d78a8b6c2e43a99ca4323df68f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 Nov 2013 14:31:52 -0800 Subject: mm: properly separate the bloated ptl from the regular case Use kernel/bounds.c to convert build-time spinlock_t size check into a preprocessor symbol and apply that to properly separate the page::ptl situation. Signed-off-by: Peter Zijlstra Signed-off-by: Kirill A. Shutemov Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 24 +++++++++++++----------- include/linux/mm_types.h | 9 +++++---- kernel/bounds.c | 2 ++ mm/memory.c | 11 +++++------ 4 files changed, 25 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index d0339741b6c..1cedd000cf2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1317,27 +1317,29 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */ #if USE_SPLIT_PTE_PTLOCKS -bool __ptlock_alloc(struct page *page); -void __ptlock_free(struct page *page); +#if BLOATED_SPINLOCKS +extern bool ptlock_alloc(struct page *page); +extern void ptlock_free(struct page *page); + +static inline spinlock_t *ptlock_ptr(struct page *page) +{ + return page->ptl; +} +#else /* BLOATED_SPINLOCKS */ static inline bool ptlock_alloc(struct page *page) { - if (sizeof(spinlock_t) > sizeof(page->ptl)) - return __ptlock_alloc(page); return true; } + static inline void ptlock_free(struct page *page) { - if (sizeof(spinlock_t) > sizeof(page->ptl)) - __ptlock_free(page); } static inline spinlock_t *ptlock_ptr(struct page *page) { - if (sizeof(spinlock_t) > sizeof(page->ptl)) - return (spinlock_t *) page->ptl; - else - return (spinlock_t *) &page->ptl; + return &page->ptl; } +#endif /* BLOATED_SPINLOCKS */ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) { @@ -1354,7 +1356,7 @@ static inline bool ptlock_init(struct page *page) * slab code uses page->slab_cache and page->first_page (for tail * pages), which share storage with page->ptl. */ - VM_BUG_ON(page->ptl); + VM_BUG_ON(*(unsigned long *)&page->ptl); if (!ptlock_alloc(page)) return false; spin_lock_init(ptlock_ptr(page)); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 423da79877e..10f5a7272b8 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -147,10 +147,11 @@ struct page { * system if PG_buddy is set. */ #if USE_SPLIT_PTE_PTLOCKS - unsigned long ptl; /* It's spinlock_t if it fits to long, - * otherwise it's pointer to dynamicaly - * allocated spinlock_t. - */ +#if BLOATED_SPINLOCKS + spinlock_t *ptl; +#else + spinlock_t ptl; +#endif #endif struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */ struct page *first_page; /* Compound tail pages */ diff --git a/kernel/bounds.c b/kernel/bounds.c index e8ca97b5c38..578782ef6ae 100644 --- a/kernel/bounds.c +++ b/kernel/bounds.c @@ -11,6 +11,7 @@ #include #include #include +#include void foo(void) { @@ -21,5 +22,6 @@ void foo(void) #ifdef CONFIG_SMP DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS)); #endif + DEFINE(BLOATED_SPINLOCKS, sizeof(spinlock_t) > sizeof(int)); /* End of constants */ } diff --git a/mm/memory.c b/mm/memory.c index 24ffae2a530..5d9025f3b3e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4271,21 +4271,20 @@ void copy_user_huge_page(struct page *dst, struct page *src, } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ -#if USE_SPLIT_PTE_PTLOCKS -bool __ptlock_alloc(struct page *page) +#if USE_SPLIT_PTE_PTLOCKS && BLOATED_SPINLOCKS +bool ptlock_alloc(struct page *page) { spinlock_t *ptl; ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL); if (!ptl) return false; - page->ptl = (unsigned long)ptl; + page->ptl = ptl; return true; } -void __ptlock_free(struct page *page) +void ptlock_free(struct page *page) { - if (sizeof(spinlock_t) > sizeof(page->ptl)) - kfree((spinlock_t *)page->ptl); + kfree(page->ptl); } #endif -- cgit v1.2.3-70-g09d2 From ea1e7ed33708c7a760419ff9ded0a6cb90586a50 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 14 Nov 2013 14:31:53 -0800 Subject: mm: create a separate slab for page->ptl allocation If DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC are enabled spinlock_t on x86_64 is 72 bytes. For page->ptl they will be allocated from kmalloc-96 slab, so we loose 24 on each. An average system can easily allocate few tens thousands of page->ptl and overhead is significant. Let's create a separate slab for page->ptl allocation to solve this. Signed-off-by: Kirill A. Shutemov Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 9 +++++++++ init/main.c | 2 +- mm/memory.c | 7 +++++++ 3 files changed, 17 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 1cedd000cf2..0548eb201e0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1318,6 +1318,7 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a #if USE_SPLIT_PTE_PTLOCKS #if BLOATED_SPINLOCKS +void __init ptlock_cache_init(void); extern bool ptlock_alloc(struct page *page); extern void ptlock_free(struct page *page); @@ -1326,6 +1327,7 @@ static inline spinlock_t *ptlock_ptr(struct page *page) return page->ptl; } #else /* BLOATED_SPINLOCKS */ +static inline void ptlock_cache_init(void) {} static inline bool ptlock_alloc(struct page *page) { return true; @@ -1378,10 +1380,17 @@ static inline spinlock_t *pte_lockptr(struct mm_struct *mm, pmd_t *pmd) { return &mm->page_table_lock; } +static inline void ptlock_cache_init(void) {} static inline bool ptlock_init(struct page *page) { return true; } static inline void pte_lock_deinit(struct page *page) {} #endif /* USE_SPLIT_PTE_PTLOCKS */ +static inline void pgtable_init(void) +{ + ptlock_cache_init(); + pgtable_cache_init(); +} + static inline bool pgtable_page_ctor(struct page *page) { inc_zone_page_state(page, NR_PAGETABLE); diff --git a/init/main.c b/init/main.c index 6ad1a533a8c..5f191133376 100644 --- a/init/main.c +++ b/init/main.c @@ -473,7 +473,7 @@ static void __init mm_init(void) mem_init(); kmem_cache_init(); percpu_init_late(); - pgtable_cache_init(); + pgtable_init(); vmalloc_init(); } diff --git a/mm/memory.c b/mm/memory.c index 5d9025f3b3e..0409e8f43fa 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4272,6 +4272,13 @@ void copy_user_huge_page(struct page *dst, struct page *src, #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ #if USE_SPLIT_PTE_PTLOCKS && BLOATED_SPINLOCKS +static struct kmem_cache *page_ptl_cachep; +void __init ptlock_cache_init(void) +{ + page_ptl_cachep = kmem_cache_create("page->ptl", sizeof(spinlock_t), 0, + SLAB_PANIC, NULL); +} + bool ptlock_alloc(struct page *page) { spinlock_t *ptl; -- cgit v1.2.3-70-g09d2 From 57f4257eae33e036125973858934730250d464e3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 Nov 2013 14:31:54 -0800 Subject: lockref: use BLOATED_SPINLOCKS to avoid explicit config dependencies Avoid the fragile Kconfig construct guestimating spinlock_t sizes; use a friendly compile-time test to determine this. [kirill.shutemov@linux.intel.com: drop CONFIG_CMPXCHG_LOCKREF] Signed-off-by: Peter Zijlstra Signed-off-by: Kirill A. Shutemov Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/lockref.h | 7 ++++++- lib/Kconfig | 7 ------- lib/lockref.c | 2 +- 3 files changed, 7 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/lockref.h b/include/linux/lockref.h index 13dfd36a329..c8929c3832d 100644 --- a/include/linux/lockref.h +++ b/include/linux/lockref.h @@ -15,10 +15,15 @@ */ #include +#include + +#define USE_CMPXCHG_LOCKREF \ + (IS_ENABLED(CONFIG_ARCH_USE_CMPXCHG_LOCKREF) && \ + IS_ENABLED(CONFIG_SMP) && !BLOATED_SPINLOCKS) struct lockref { union { -#ifdef CONFIG_CMPXCHG_LOCKREF +#if USE_CMPXCHG_LOCKREF aligned_u64 lock_count; #endif struct { diff --git a/lib/Kconfig b/lib/Kconfig index 75485e163ca..06dc74200a5 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -51,13 +51,6 @@ config PERCPU_RWSEM config ARCH_USE_CMPXCHG_LOCKREF bool -config CMPXCHG_LOCKREF - def_bool y if ARCH_USE_CMPXCHG_LOCKREF - depends on SMP - depends on !GENERIC_LOCKBREAK - depends on !DEBUG_SPINLOCK - depends on !DEBUG_LOCK_ALLOC - config CRC_CCITT tristate "CRC-CCITT functions" help diff --git a/lib/lockref.c b/lib/lockref.c index af6e95d0bed..d2b123f8456 100644 --- a/lib/lockref.c +++ b/lib/lockref.c @@ -1,7 +1,7 @@ #include #include -#ifdef CONFIG_CMPXCHG_LOCKREF +#if USE_CMPXCHG_LOCKREF /* * Allow weakly-ordered memory architectures to provide barrier-less -- cgit v1.2.3-70-g09d2 From 839cc2a94cc3665bafe32203c2f095f4dd470a80 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 14 Nov 2013 14:31:56 -0800 Subject: seq_file: introduce seq_setwidth() and seq_pad() There are several users who want to know bytes written by seq_*() for alignment purpose. Currently they are using %n format for knowing it because seq_*() returns 0 on success. This patch introduces seq_setwidth() and seq_pad() for allowing them to align without using %n format. Signed-off-by: Tetsuo Handa Signed-off-by: Kees Cook Cc: Joe Perches Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/seq_file.c | 15 +++++++++++++++ include/linux/seq_file.h | 15 +++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'include') diff --git a/fs/seq_file.c b/fs/seq_file.c index a290157265e..1cd2388ca5b 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -766,6 +766,21 @@ int seq_write(struct seq_file *seq, const void *data, size_t len) } EXPORT_SYMBOL(seq_write); +/** + * seq_pad - write padding spaces to buffer + * @m: seq_file identifying the buffer to which data should be written + * @c: the byte to append after padding if non-zero + */ +void seq_pad(struct seq_file *m, char c) +{ + int size = m->pad_until - m->count; + if (size > 0) + seq_printf(m, "%*s", size, ""); + if (c) + seq_putc(m, c); +} +EXPORT_SYMBOL(seq_pad); + struct list_head *seq_list_start(struct list_head *head, loff_t pos) { struct list_head *lh; diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 4e32edc8f50..52e0097f61f 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -20,6 +20,7 @@ struct seq_file { size_t size; size_t from; size_t count; + size_t pad_until; loff_t index; loff_t read_pos; u64 version; @@ -79,6 +80,20 @@ static inline void seq_commit(struct seq_file *m, int num) } } +/** + * seq_setwidth - set padding width + * @m: the seq_file handle + * @size: the max number of bytes to pad. + * + * Call seq_setwidth() for setting max width, then call seq_printf() etc. and + * finally call seq_pad() to pad the remaining bytes. + */ +static inline void seq_setwidth(struct seq_file *m, size_t size) +{ + m->pad_until = m->count + size; +} +void seq_pad(struct seq_file *m, char c); + char *mangle_path(char *s, const char *p, const char *esc); int seq_open(struct file *, const struct seq_operations *); ssize_t seq_read(struct file *, char __user *, size_t, loff_t *); -- cgit v1.2.3-70-g09d2 From c32f74ab2872994bc8336ed367313da3139350ca Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 14 Nov 2013 14:32:01 -0800 Subject: sched: replace INIT_COMPLETION with reinit_completion For the casual device driver writer, it is hard to remember when to use init_completion (to init a completion structure) or INIT_COMPLETION (to *reinit* a completion structure). Furthermore, while all other completion functions exepct a pointer as a parameter, INIT_COMPLETION does not. To make it easier to remember which function to use and to make code more readable, introduce a new inline function with the proper name and consistent argument type. Update the kernel-doc for init_completion while we are here. Signed-off-by: Wolfram Sang Acked-by: Linus Walleij (personally at LCE13) Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/completion.h | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/completion.h b/include/linux/completion.h index 22c33e35bcb..124e4b4334c 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -19,8 +19,8 @@ * * See also: complete(), wait_for_completion() (and friends _timeout, * _interruptible, _interruptible_timeout, and _killable), init_completion(), - * and macros DECLARE_COMPLETION(), DECLARE_COMPLETION_ONSTACK(), and - * INIT_COMPLETION(). + * reinit_completion(), and macros DECLARE_COMPLETION(), + * DECLARE_COMPLETION_ONSTACK(). */ struct completion { unsigned int done; @@ -65,7 +65,7 @@ struct completion { /** * init_completion - Initialize a dynamically allocated completion - * @x: completion structure that is to be initialized + * @x: pointer to completion structure that is to be initialized * * This inline function will initialize a dynamically created completion * structure. @@ -76,6 +76,18 @@ static inline void init_completion(struct completion *x) init_waitqueue_head(&x->wait); } +/** + * reinit_completion - reinitialize a completion structure + * @x: pointer to completion structure that is to be reinitialized + * + * This inline function should be used to reinitialize a completion structure so it can + * be reused. This is especially important after complete_all() is used. + */ +static inline void reinit_completion(struct completion *x) +{ + x->done = 0; +} + extern void wait_for_completion(struct completion *); extern void wait_for_completion_io(struct completion *); extern int wait_for_completion_interruptible(struct completion *x); -- cgit v1.2.3-70-g09d2 From 62026aedaacedbe1ffe94a3599ad4acd8ecdf587 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 14 Nov 2013 14:32:03 -0800 Subject: sched: remove INIT_COMPLETION All users are converted over to reinit_completion(). Remove the old macro now. Signed-off-by: Wolfram Sang Cc: Linus Walleij Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/completion.h | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'include') diff --git a/include/linux/completion.h b/include/linux/completion.h index 124e4b4334c..5d5aaae3af4 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -106,14 +106,4 @@ extern bool completion_done(struct completion *x); extern void complete(struct completion *); extern void complete_all(struct completion *); -/** - * INIT_COMPLETION - reinitialize a completion structure - * @x: completion structure to be reinitialized - * - * This macro should be used to reinitialize a completion structure so it can - * be reused. This is especially important after complete_all() is used. - */ -#define INIT_COMPLETION(x) ((x).done = 0) - - #endif -- cgit v1.2.3-70-g09d2 From fc21c0cff2f425891b28ff6fb6b03b325c977428 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 14 Nov 2013 14:32:06 -0800 Subject: revert "softirq: Add support for triggering softirq work on softirqs" This commit was incomplete in that code to remove items from the per-cpu lists was missing and never acquired a user in the 5 years it has been in the tree. We're going to implement what it seems to try to archive in a simpler way, and this code is in the way of doing so. Signed-off-by: Christoph Hellwig Cc: Jan Kara Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/interrupt.h | 22 -------- kernel/softirq.c | 131 ---------------------------------------------- 2 files changed, 153 deletions(-) (limited to 'include') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index c9e831dc80b..db43b58a335 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -11,8 +11,6 @@ #include #include #include -#include -#include #include #include #include @@ -392,15 +390,6 @@ extern void __raise_softirq_irqoff(unsigned int nr); extern void raise_softirq_irqoff(unsigned int nr); extern void raise_softirq(unsigned int nr); -/* This is the worklist that queues up per-cpu softirq work. - * - * send_remote_sendirq() adds work to these lists, and - * the softirq handler itself dequeues from them. The queues - * are protected by disabling local cpu interrupts and they must - * only be accessed by the local cpu that they are for. - */ -DECLARE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); - DECLARE_PER_CPU(struct task_struct *, ksoftirqd); static inline struct task_struct *this_cpu_ksoftirqd(void) @@ -408,17 +397,6 @@ static inline struct task_struct *this_cpu_ksoftirqd(void) return this_cpu_read(ksoftirqd); } -/* Try to send a softirq to a remote cpu. If this cannot be done, the - * work will be queued to the local cpu. - */ -extern void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq); - -/* Like send_remote_softirq(), but the caller must disable local cpu interrupts - * and compute the current cpu, passed in as 'this_cpu'. - */ -extern void __send_remote_softirq(struct call_single_data *cp, int cpu, - int this_cpu, int softirq); - /* Tasklets --- multithreaded analogue of BHs. Main feature differing them of generic softirqs: tasklet diff --git a/kernel/softirq.c b/kernel/softirq.c index b2498835345..11025ccc06d 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -6,8 +6,6 @@ * Distribute under GPLv2. * * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) - * - * Remote softirq infrastructure is by Jens Axboe. */ #include @@ -627,146 +625,17 @@ void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, } EXPORT_SYMBOL_GPL(tasklet_hrtimer_init); -/* - * Remote softirq bits - */ - -DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); -EXPORT_PER_CPU_SYMBOL(softirq_work_list); - -static void __local_trigger(struct call_single_data *cp, int softirq) -{ - struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]); - - list_add_tail(&cp->list, head); - - /* Trigger the softirq only if the list was previously empty. */ - if (head->next == &cp->list) - raise_softirq_irqoff(softirq); -} - -#ifdef CONFIG_USE_GENERIC_SMP_HELPERS -static void remote_softirq_receive(void *data) -{ - struct call_single_data *cp = data; - unsigned long flags; - int softirq; - - softirq = *(int *)cp->info; - local_irq_save(flags); - __local_trigger(cp, softirq); - local_irq_restore(flags); -} - -static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) -{ - if (cpu_online(cpu)) { - cp->func = remote_softirq_receive; - cp->info = &softirq; - cp->flags = 0; - - __smp_call_function_single(cpu, cp, 0); - return 0; - } - return 1; -} -#else /* CONFIG_USE_GENERIC_SMP_HELPERS */ -static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq) -{ - return 1; -} -#endif - -/** - * __send_remote_softirq - try to schedule softirq work on a remote cpu - * @cp: private SMP call function data area - * @cpu: the remote cpu - * @this_cpu: the currently executing cpu - * @softirq: the softirq for the work - * - * Attempt to schedule softirq work on a remote cpu. If this cannot be - * done, the work is instead queued up on the local cpu. - * - * Interrupts must be disabled. - */ -void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq) -{ - if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq)) - __local_trigger(cp, softirq); -} -EXPORT_SYMBOL(__send_remote_softirq); - -/** - * send_remote_softirq - try to schedule softirq work on a remote cpu - * @cp: private SMP call function data area - * @cpu: the remote cpu - * @softirq: the softirq for the work - * - * Like __send_remote_softirq except that disabling interrupts and - * computing the current cpu is done for the caller. - */ -void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq) -{ - unsigned long flags; - int this_cpu; - - local_irq_save(flags); - this_cpu = smp_processor_id(); - __send_remote_softirq(cp, cpu, this_cpu, softirq); - local_irq_restore(flags); -} -EXPORT_SYMBOL(send_remote_softirq); - -static int remote_softirq_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - /* - * If a CPU goes away, splice its entries to the current CPU - * and trigger a run of the softirq - */ - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { - int cpu = (unsigned long) hcpu; - int i; - - local_irq_disable(); - for (i = 0; i < NR_SOFTIRQS; i++) { - struct list_head *head = &per_cpu(softirq_work_list[i], cpu); - struct list_head *local_head; - - if (list_empty(head)) - continue; - - local_head = &__get_cpu_var(softirq_work_list[i]); - list_splice_init(head, local_head); - raise_softirq_irqoff(i); - } - local_irq_enable(); - } - - return NOTIFY_OK; -} - -static struct notifier_block remote_softirq_cpu_notifier = { - .notifier_call = remote_softirq_cpu_notify, -}; - void __init softirq_init(void) { int cpu; for_each_possible_cpu(cpu) { - int i; - per_cpu(tasklet_vec, cpu).tail = &per_cpu(tasklet_vec, cpu).head; per_cpu(tasklet_hi_vec, cpu).tail = &per_cpu(tasklet_hi_vec, cpu).head; - for (i = 0; i < NR_SOFTIRQS; i++) - INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu)); } - register_hotcpu_notifier(&remote_softirq_cpu_notifier); - open_softirq(TASKLET_SOFTIRQ, tasklet_action); open_softirq(HI_SOFTIRQ, tasklet_hi_action); } -- cgit v1.2.3-70-g09d2 From 0a06ff068f1255bcd7965ab07bc0f4adc3eb639a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 14 Nov 2013 14:32:07 -0800 Subject: kernel: remove CONFIG_USE_GENERIC_SMP_HELPERS We've switched over every architecture that supports SMP to it, so remove the new useless config variable. Signed-off-by: Christoph Hellwig Cc: Jan Kara Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/Kconfig | 3 --- arch/alpha/Kconfig | 1 - arch/arc/Kconfig | 1 - arch/arm/Kconfig | 1 - arch/arm64/Kconfig | 1 - arch/blackfin/Kconfig | 1 - arch/hexagon/Kconfig | 1 - arch/ia64/Kconfig | 1 - arch/m32r/Kconfig | 1 - arch/metag/Kconfig | 1 - arch/mips/Kconfig | 1 - arch/mn10300/Kconfig | 1 - arch/parisc/Kconfig | 1 - arch/powerpc/Kconfig | 1 - arch/s390/Kconfig | 1 - arch/sh/Kconfig | 1 - arch/sparc/Kconfig | 1 - arch/tile/Kconfig | 1 - arch/x86/Kconfig | 1 - block/blk-mq.c | 4 ++-- block/blk-softirq.c | 4 ++-- block/blk-sysfs.c | 2 +- include/linux/smp.h | 4 ---- kernel/Kconfig.hz | 2 +- kernel/smp.c | 2 -- 25 files changed, 6 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/arch/Kconfig b/arch/Kconfig index ded747c7b74..f1cf895c040 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -207,9 +207,6 @@ config HAVE_DMA_ATTRS config HAVE_DMA_CONTIGUOUS bool -config USE_GENERIC_SMP_HELPERS - bool - config GENERIC_SMP_IDLE_THREAD bool diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 35a300d4a9f..8d2a4833acd 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -522,7 +522,6 @@ config ARCH_MAY_HAVE_PC_FDC config SMP bool "Symmetric multi-processing support" depends on ALPHA_SABLE || ALPHA_LYNX || ALPHA_RAWHIDE || ALPHA_DP264 || ALPHA_WILDFIRE || ALPHA_TITAN || ALPHA_GENERIC || ALPHA_SHARK || ALPHA_MARVEL - select USE_GENERIC_SMP_HELPERS ---help--- This enables support for systems with more than one CPU. If you have a system with only one CPU, like most personal computers, say N. If diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 5ede5460c80..2ee0c9bfd03 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -125,7 +125,6 @@ config ARC_PLAT_NEEDS_CPU_TO_DMA config SMP bool "Symmetric Multi-Processing (Incomplete)" default n - select USE_GENERIC_SMP_HELPERS help This enables support for systems with more than one CPU. If you have a system with only one CPU, like most personal computers, say N. If diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 603d661b445..00c1ff45a15 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1432,7 +1432,6 @@ config SMP depends on GENERIC_CLOCKEVENTS depends on HAVE_SMP depends on MMU || ARM_MPU - select USE_GENERIC_SMP_HELPERS help This enables support for systems with more than one CPU. If you have a system with only one CPU, like most personal computers, say N. If diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index bb0bf1bfc05..9714fe0403b 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -143,7 +143,6 @@ config CPU_BIG_ENDIAN config SMP bool "Symmetric Multi-Processing" - select USE_GENERIC_SMP_HELPERS help This enables support for systems with more than one CPU. If you say N here, the kernel will run on single and diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig index e887b57c317..9ceccef9c64 100644 --- a/arch/blackfin/Kconfig +++ b/arch/blackfin/Kconfig @@ -34,7 +34,6 @@ config BLACKFIN select ARCH_WANT_IPC_PARSE_VERSION select GENERIC_ATOMIC64 select GENERIC_IRQ_PROBE - select USE_GENERIC_SMP_HELPERS if SMP select HAVE_NMI_WATCHDOG if NMI_WATCHDOG select GENERIC_SMP_IDLE_THREAD select ARCH_USES_GETTIMEOFFSET if !GENERIC_CLOCKEVENTS diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig index 99041b07e61..09df2608f40 100644 --- a/arch/hexagon/Kconfig +++ b/arch/hexagon/Kconfig @@ -4,7 +4,6 @@ comment "Linux Kernel Configuration for Hexagon" config HEXAGON def_bool y select HAVE_OPROFILE - select USE_GENERIC_SMP_HELPERS if SMP # Other pending projects/to-do items. # select HAVE_REGS_AND_STACK_ACCESS_API # select HAVE_HW_BREAKPOINT if PERF_EVENTS diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index 7740ab10a17..dfe85e92ca2 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -343,7 +343,6 @@ config FORCE_MAX_ZONEORDER config SMP bool "Symmetric multi-processing support" - select USE_GENERIC_SMP_HELPERS help This enables support for systems with more than one CPU. If you have a system with only one CPU, say N. If you have a system with more diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index 75661fbf452..09ef94a8a7c 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -275,7 +275,6 @@ source "kernel/Kconfig.preempt" config SMP bool "Symmetric multi-processing support" - select USE_GENERIC_SMP_HELPERS ---help--- This enables support for systems with more than one CPU. If you have a system with only one CPU, like most personal computers, say N. If diff --git a/arch/metag/Kconfig b/arch/metag/Kconfig index 36368eb07e1..e56abd2c1b4 100644 --- a/arch/metag/Kconfig +++ b/arch/metag/Kconfig @@ -111,7 +111,6 @@ config METAG_META21 config SMP bool "Symmetric multi-processing support" depends on METAG_META21 && METAG_META21_MMU - select USE_GENERIC_SMP_HELPERS help This enables support for systems with more than one thread running Linux. If you have a system with only one thread running Linux, diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 17cc7ff8458..867d7db1158 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2125,7 +2125,6 @@ source "mm/Kconfig" config SMP bool "Multi-Processing support" depends on SYS_SUPPORTS_SMP - select USE_GENERIC_SMP_HELPERS help This enables support for systems with more than one CPU. If you have a system with only one CPU, like most personal computers, say N. If diff --git a/arch/mn10300/Kconfig b/arch/mn10300/Kconfig index 6aaa1607001..8bde9237d13 100644 --- a/arch/mn10300/Kconfig +++ b/arch/mn10300/Kconfig @@ -181,7 +181,6 @@ endmenu config SMP bool "Symmetric multi-processing support" default y - select USE_GENERIC_SMP_HELPERS depends on MN10300_PROC_MN2WS0038 || MN10300_PROC_MN2WS0050 ---help--- This enables support for systems with more than one CPU. If you have diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 7dcde539d61..c03567a9a91 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -226,7 +226,6 @@ endchoice config SMP bool "Symmetric multi-processing support" - select USE_GENERIC_SMP_HELPERS ---help--- This enables support for systems with more than one CPU. If you have a system with only one CPU, like most personal computers, say N. If diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 2f898d63eb9..4740b0a15fa 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -106,7 +106,6 @@ config PPC select HAVE_MEMBLOCK_NODE_MAP select HAVE_DMA_ATTRS select HAVE_DMA_API_DEBUG - select USE_GENERIC_SMP_HELPERS if SMP select HAVE_OPROFILE select HAVE_DEBUG_KMEMLEAK select GENERIC_ATOMIC64 if PPC32 diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index f75d7e51792..314fced4fc1 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -141,7 +141,6 @@ config S390 select OLD_SIGACTION select OLD_SIGSUSPEND3 select SYSCTL_EXCEPTION_TRACE - select USE_GENERIC_SMP_HELPERS if SMP select VIRT_CPU_ACCOUNTING select VIRT_TO_BUS diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 224f4bc9925..e78561bc30e 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -711,7 +711,6 @@ config CC_STACKPROTECTOR config SMP bool "Symmetric multi-processing support" depends on SYS_SUPPORTS_SMP - select USE_GENERIC_SMP_HELPERS ---help--- This enables support for systems with more than one CPU. If you have a system with only one CPU, like most personal computers, say N. If diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 78c4fdb91bc..8591b201d9c 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -28,7 +28,6 @@ config SPARC select HAVE_ARCH_JUMP_LABEL select GENERIC_IRQ_SHOW select ARCH_WANT_IPC_PARSE_VERSION - select USE_GENERIC_SMP_HELPERS if SMP select GENERIC_PCI_IOMAP select HAVE_NMI_WATCHDOG if SPARC64 select HAVE_BPF_JIT diff --git a/arch/tile/Kconfig b/arch/tile/Kconfig index d45a2c48f18..b3692ce78f9 100644 --- a/arch/tile/Kconfig +++ b/arch/tile/Kconfig @@ -8,7 +8,6 @@ config TILE select HAVE_KVM if !TILEGX select GENERIC_FIND_FIRST_BIT select SYSCTL_EXCEPTION_TRACE - select USE_GENERIC_SMP_HELPERS select CC_OPTIMIZE_FOR_SIZE select HAVE_DEBUG_KMEMLEAK select GENERIC_IRQ_PROBE diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index af5513e5798..83f521aa103 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -90,7 +90,6 @@ config X86 select GENERIC_IRQ_SHOW select GENERIC_CLOCKEVENTS_MIN_ADJUST select IRQ_FORCED_THREADING - select USE_GENERIC_SMP_HELPERS if SMP select HAVE_BPF_JIT if X86_64 select HAVE_ARCH_TRANSPARENT_HUGEPAGE select CLKEVT_I8253 diff --git a/block/blk-mq.c b/block/blk-mq.c index 88d4e864d4c..c661896e246 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -319,7 +319,7 @@ void __blk_mq_end_io(struct request *rq, int error) blk_mq_complete_request(rq, error); } -#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS) +#if defined(CONFIG_SMP) /* * Called with interrupts disabled. @@ -361,7 +361,7 @@ static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu, return true; } -#else /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */ +#else /* CONFIG_SMP */ static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu, struct request *rq, const int error) { diff --git a/block/blk-softirq.c b/block/blk-softirq.c index ce4b8bfd3d2..57790c1a97e 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -36,7 +36,7 @@ static void blk_done_softirq(struct softirq_action *h) } } -#if defined(CONFIG_SMP) && defined(CONFIG_USE_GENERIC_SMP_HELPERS) +#ifdef CONFIG_SMP static void trigger_softirq(void *data) { struct request *rq = data; @@ -71,7 +71,7 @@ static int raise_blk_irq(int cpu, struct request *rq) return 1; } -#else /* CONFIG_SMP && CONFIG_USE_GENERIC_SMP_HELPERS */ +#else /* CONFIG_SMP */ static int raise_blk_irq(int cpu, struct request *rq) { return 1; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 4f8c4d90ec7..97779522472 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -288,7 +288,7 @@ static ssize_t queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) { ssize_t ret = -EINVAL; -#if defined(CONFIG_USE_GENERIC_SMP_HELPERS) +#ifdef CONFIG_SMP unsigned long val; ret = queue_var_store(&val, page, count); diff --git a/include/linux/smp.h b/include/linux/smp.h index 731f5237d5f..78851513030 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -106,14 +106,10 @@ void kick_all_cpus_sync(void); /* * Generic and arch helpers */ -#ifdef CONFIG_USE_GENERIC_SMP_HELPERS void __init call_function_init(void); void generic_smp_call_function_single_interrupt(void); #define generic_smp_call_function_interrupt \ generic_smp_call_function_single_interrupt -#else -static inline void call_function_init(void) { } -#endif /* * Mark the boot cpu "online" so that it can call console drivers in diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz index 94fabd534b0..2a202a84675 100644 --- a/kernel/Kconfig.hz +++ b/kernel/Kconfig.hz @@ -55,4 +55,4 @@ config HZ default 1000 if HZ_1000 config SCHED_HRTICK - def_bool HIGH_RES_TIMERS && (!SMP || USE_GENERIC_SMP_HELPERS) + def_bool HIGH_RES_TIMERS diff --git a/kernel/smp.c b/kernel/smp.c index 46116100f0e..1c194e20e94 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -15,7 +15,6 @@ #include "smpboot.h" -#ifdef CONFIG_USE_GENERIC_SMP_HELPERS enum { CSD_FLAG_LOCK = 0x01, CSD_FLAG_WAIT = 0x02, @@ -464,7 +463,6 @@ int smp_call_function(smp_call_func_t func, void *info, int wait) return 0; } EXPORT_SYMBOL(smp_call_function); -#endif /* USE_GENERIC_SMP_HELPERS */ /* Setup configured maximum number of CPUs to activate */ unsigned int setup_max_cpus = NR_CPUS; -- cgit v1.2.3-70-g09d2 From 7cf64f861b7a43b395f0855994003254b06a7e5a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 14 Nov 2013 14:32:09 -0800 Subject: kernel-provide-a-__smp_call_function_single-stub-for-config_smp-fix x86_64 allnoconfig: kernel/up.c:25: error: redefinition of '__smp_call_function_single' include/linux/smp.h:154: note: previous definition of '__smp_call_function_single' was here Cc: Christoph Hellwig Cc: Christoph Hellwig Cc: Jan Kara Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/smp.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/smp.h b/include/linux/smp.h index 78851513030..5da22ee42e1 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -49,6 +49,9 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), smp_call_func_t func, void *info, bool wait, gfp_t gfp_flags); +void __smp_call_function_single(int cpuid, struct call_single_data *data, + int wait); + #ifdef CONFIG_SMP #include @@ -95,9 +98,6 @@ int smp_call_function(smp_call_func_t func, void *info, int wait); void smp_call_function_many(const struct cpumask *mask, smp_call_func_t func, void *info, bool wait); -void __smp_call_function_single(int cpuid, struct call_single_data *data, - int wait); - int smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, void *info, int wait); @@ -151,12 +151,6 @@ smp_call_function_any(const struct cpumask *mask, smp_call_func_t func, static inline void kick_all_cpus_sync(void) { } -static inline void __smp_call_function_single(int cpuid, - struct call_single_data *data, int wait) -{ - on_each_cpu(data->func, data->info, wait); -} - #endif /* !SMP */ /* -- cgit v1.2.3-70-g09d2 From b89241e8cdb8321c20546d47645a9b65b58113b5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 14 Nov 2013 14:32:11 -0800 Subject: llists: move llist_reverse_order from raid5 to llist.c Make this useful helper available for other users. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Cc: Jens Axboe Cc: Neil Brown Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/md/raid5.c | 14 -------------- include/linux/llist.h | 2 ++ lib/llist.c | 22 ++++++++++++++++++++++ 3 files changed, 24 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index f8b90684392..7f0e17a27ae 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -293,20 +293,6 @@ static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) do_release_stripe(conf, sh); } -static struct llist_node *llist_reverse_order(struct llist_node *head) -{ - struct llist_node *new_head = NULL; - - while (head) { - struct llist_node *tmp = head; - head = head->next; - tmp->next = new_head; - new_head = tmp; - } - - return new_head; -} - /* should hold conf->device_lock already */ static int release_stripe_list(struct r5conf *conf) { diff --git a/include/linux/llist.h b/include/linux/llist.h index 8828a78dec9..fbf10a0bc09 100644 --- a/include/linux/llist.h +++ b/include/linux/llist.h @@ -195,4 +195,6 @@ static inline struct llist_node *llist_del_all(struct llist_head *head) extern struct llist_node *llist_del_first(struct llist_head *head); +struct llist_node *llist_reverse_order(struct llist_node *head); + #endif /* LLIST_H */ diff --git a/lib/llist.c b/lib/llist.c index 4a70d120138..ef48b87247e 100644 --- a/lib/llist.c +++ b/lib/llist.c @@ -81,3 +81,25 @@ struct llist_node *llist_del_first(struct llist_head *head) return entry; } EXPORT_SYMBOL_GPL(llist_del_first); + +/** + * llist_reverse_order - reverse order of a llist chain + * @head: first item of the list to be reversed + * + * Reverse the oder of a chain of llist entries and return the + * new first entry. + */ +struct llist_node *llist_reverse_order(struct llist_node *head) +{ + struct llist_node *new_head = NULL; + + while (head) { + struct llist_node *tmp = head; + head = head->next; + tmp->next = new_head; + new_head = tmp; + } + + return new_head; +} +EXPORT_SYMBOL_GPL(llist_reverse_order); -- cgit v1.2.3-70-g09d2 From 498d319bb512992ef0784c278fa03679f2f5649d Mon Sep 17 00:00:00 2001 From: Stefani Seibold Date: Thu, 14 Nov 2013 14:32:17 -0800 Subject: kfifo API type safety This patch enhances the type safety for the kfifo API. It is now safe to put const data into a non const FIFO and the API will now generate a compiler warning when reading from the fifo where the destination address is pointing to a const variable. As a side effect the kfifo_put() does now expect the value of an element instead a pointer to the element. This was suggested Russell King. It make the handling of the kfifo_put easier since there is no need to create a helper variable for getting the address of a pointer or to pass integers of different sizes. IMHO the API break is okay, since there are currently only six users of kfifo_put(). The code is also cleaner by kicking out the "if (0)" expressions. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Stefani Seibold Cc: Russell King Cc: Hauke Mehrtens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/drm_flip_work.c | 2 +- drivers/iio/industrialio-event.c | 2 +- drivers/net/wireless/rt2x00/rt2800mmio.c | 2 +- drivers/net/wireless/rt2x00/rt2800usb.c | 2 +- drivers/pci/pcie/aer/aerdrv_core.c | 2 +- include/linux/kfifo.h | 47 ++++++++++---------------------- mm/memory-failure.c | 2 +- samples/kfifo/bytestream-example.c | 4 +-- samples/kfifo/dma-example.c | 2 +- samples/kfifo/inttype-example.c | 4 +-- 10 files changed, 25 insertions(+), 44 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/drm_flip_work.c b/drivers/gpu/drm/drm_flip_work.c index e788882d902..f9c7fa3d001 100644 --- a/drivers/gpu/drm/drm_flip_work.c +++ b/drivers/gpu/drm/drm_flip_work.c @@ -34,7 +34,7 @@ */ void drm_flip_work_queue(struct drm_flip_work *work, void *val) { - if (kfifo_put(&work->fifo, (const void **)&val)) { + if (kfifo_put(&work->fifo, val)) { atomic_inc(&work->pending); } else { DRM_ERROR("%s fifo full!\n", work->name); diff --git a/drivers/iio/industrialio-event.c b/drivers/iio/industrialio-event.c index dac15b9f9df..c10eab64bc0 100644 --- a/drivers/iio/industrialio-event.c +++ b/drivers/iio/industrialio-event.c @@ -56,7 +56,7 @@ int iio_push_event(struct iio_dev *indio_dev, u64 ev_code, s64 timestamp) ev.id = ev_code; ev.timestamp = timestamp; - copied = kfifo_put(&ev_int->det_events, &ev); + copied = kfifo_put(&ev_int->det_events, ev); if (copied != 0) wake_up_locked_poll(&ev_int->wait, POLLIN); } diff --git a/drivers/net/wireless/rt2x00/rt2800mmio.c b/drivers/net/wireless/rt2x00/rt2800mmio.c index ae152280e07..a8cc736b506 100644 --- a/drivers/net/wireless/rt2x00/rt2800mmio.c +++ b/drivers/net/wireless/rt2x00/rt2800mmio.c @@ -446,7 +446,7 @@ static void rt2800mmio_txstatus_interrupt(struct rt2x00_dev *rt2x00dev) if (!rt2x00_get_field32(status, TX_STA_FIFO_VALID)) break; - if (!kfifo_put(&rt2x00dev->txstatus_fifo, &status)) { + if (!kfifo_put(&rt2x00dev->txstatus_fifo, status)) { rt2x00_warn(rt2x00dev, "TX status FIFO overrun, drop tx status report\n"); break; } diff --git a/drivers/net/wireless/rt2x00/rt2800usb.c b/drivers/net/wireless/rt2x00/rt2800usb.c index 997df03a0c2..a81ceb61d74 100644 --- a/drivers/net/wireless/rt2x00/rt2800usb.c +++ b/drivers/net/wireless/rt2x00/rt2800usb.c @@ -164,7 +164,7 @@ static bool rt2800usb_tx_sta_fifo_read_completed(struct rt2x00_dev *rt2x00dev, valid = rt2x00_get_field32(tx_status, TX_STA_FIFO_VALID); if (valid) { - if (!kfifo_put(&rt2x00dev->txstatus_fifo, &tx_status)) + if (!kfifo_put(&rt2x00dev->txstatus_fifo, tx_status)) rt2x00_warn(rt2x00dev, "TX status FIFO overrun\n"); queue_work(rt2x00dev->workqueue, &rt2x00dev->txdone_work); diff --git a/drivers/pci/pcie/aer/aerdrv_core.c b/drivers/pci/pcie/aer/aerdrv_core.c index 85ca36f2136..6b3a958e1be 100644 --- a/drivers/pci/pcie/aer/aerdrv_core.c +++ b/drivers/pci/pcie/aer/aerdrv_core.c @@ -574,7 +574,7 @@ void aer_recover_queue(int domain, unsigned int bus, unsigned int devfn, }; spin_lock_irqsave(&aer_recover_ring_lock, flags); - if (kfifo_put(&aer_recover_ring, &entry)) + if (kfifo_put(&aer_recover_ring, entry)) schedule_work(&aer_recover_work); else pr_err("AER recover: Buffer overflow when recovering AER for %04x:%02x:%02x:%x\n", diff --git a/include/linux/kfifo.h b/include/linux/kfifo.h index 10308c6a3d1..552d51efb42 100644 --- a/include/linux/kfifo.h +++ b/include/linux/kfifo.h @@ -1,7 +1,7 @@ /* * A generic kernel FIFO implementation * - * Copyright (C) 2009/2010 Stefani Seibold + * Copyright (C) 2013 Stefani Seibold * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -67,9 +67,10 @@ struct __kfifo { union { \ struct __kfifo kfifo; \ datatype *type; \ + const datatype *const_type; \ char (*rectype)[recsize]; \ ptrtype *ptr; \ - const ptrtype *ptr_const; \ + ptrtype const *ptr_const; \ } #define __STRUCT_KFIFO(type, size, recsize, ptrtype) \ @@ -386,16 +387,12 @@ __kfifo_int_must_check_helper( \ #define kfifo_put(fifo, val) \ ({ \ typeof((fifo) + 1) __tmp = (fifo); \ - typeof((val) + 1) __val = (val); \ + typeof(*__tmp->const_type) __val = (val); \ unsigned int __ret; \ - const size_t __recsize = sizeof(*__tmp->rectype); \ + size_t __recsize = sizeof(*__tmp->rectype); \ struct __kfifo *__kfifo = &__tmp->kfifo; \ - if (0) { \ - typeof(__tmp->ptr_const) __dummy __attribute__ ((unused)); \ - __dummy = (typeof(__val))NULL; \ - } \ if (__recsize) \ - __ret = __kfifo_in_r(__kfifo, __val, sizeof(*__val), \ + __ret = __kfifo_in_r(__kfifo, &__val, sizeof(__val), \ __recsize); \ else { \ __ret = !kfifo_is_full(__tmp); \ @@ -404,7 +401,7 @@ __kfifo_int_must_check_helper( \ ((typeof(__tmp->type))__kfifo->data) : \ (__tmp->buf) \ )[__kfifo->in & __tmp->kfifo.mask] = \ - *(typeof(__tmp->type))__val; \ + (typeof(*__tmp->type))__val; \ smp_wmb(); \ __kfifo->in++; \ } \ @@ -415,7 +412,7 @@ __kfifo_int_must_check_helper( \ /** * kfifo_get - get data from the fifo * @fifo: address of the fifo to be used - * @val: the var where to store the data to be added + * @val: address where to store the data * * This macro reads the data from the fifo. * It returns 0 if the fifo was empty. Otherwise it returns the number @@ -428,12 +425,10 @@ __kfifo_int_must_check_helper( \ __kfifo_uint_must_check_helper( \ ({ \ typeof((fifo) + 1) __tmp = (fifo); \ - typeof((val) + 1) __val = (val); \ + typeof(__tmp->ptr) __val = (val); \ unsigned int __ret; \ const size_t __recsize = sizeof(*__tmp->rectype); \ struct __kfifo *__kfifo = &__tmp->kfifo; \ - if (0) \ - __val = (typeof(__tmp->ptr))0; \ if (__recsize) \ __ret = __kfifo_out_r(__kfifo, __val, sizeof(*__val), \ __recsize); \ @@ -456,7 +451,7 @@ __kfifo_uint_must_check_helper( \ /** * kfifo_peek - get data from the fifo without removing * @fifo: address of the fifo to be used - * @val: the var where to store the data to be added + * @val: address where to store the data * * This reads the data from the fifo without removing it from the fifo. * It returns 0 if the fifo was empty. Otherwise it returns the number @@ -469,12 +464,10 @@ __kfifo_uint_must_check_helper( \ __kfifo_uint_must_check_helper( \ ({ \ typeof((fifo) + 1) __tmp = (fifo); \ - typeof((val) + 1) __val = (val); \ + typeof(__tmp->ptr) __val = (val); \ unsigned int __ret; \ const size_t __recsize = sizeof(*__tmp->rectype); \ struct __kfifo *__kfifo = &__tmp->kfifo; \ - if (0) \ - __val = (typeof(__tmp->ptr))NULL; \ if (__recsize) \ __ret = __kfifo_out_peek_r(__kfifo, __val, sizeof(*__val), \ __recsize); \ @@ -508,14 +501,10 @@ __kfifo_uint_must_check_helper( \ #define kfifo_in(fifo, buf, n) \ ({ \ typeof((fifo) + 1) __tmp = (fifo); \ - typeof((buf) + 1) __buf = (buf); \ + typeof(__tmp->ptr_const) __buf = (buf); \ unsigned long __n = (n); \ const size_t __recsize = sizeof(*__tmp->rectype); \ struct __kfifo *__kfifo = &__tmp->kfifo; \ - if (0) { \ - typeof(__tmp->ptr_const) __dummy __attribute__ ((unused)); \ - __dummy = (typeof(__buf))NULL; \ - } \ (__recsize) ?\ __kfifo_in_r(__kfifo, __buf, __n, __recsize) : \ __kfifo_in(__kfifo, __buf, __n); \ @@ -561,14 +550,10 @@ __kfifo_uint_must_check_helper( \ __kfifo_uint_must_check_helper( \ ({ \ typeof((fifo) + 1) __tmp = (fifo); \ - typeof((buf) + 1) __buf = (buf); \ + typeof(__tmp->ptr) __buf = (buf); \ unsigned long __n = (n); \ const size_t __recsize = sizeof(*__tmp->rectype); \ struct __kfifo *__kfifo = &__tmp->kfifo; \ - if (0) { \ - typeof(__tmp->ptr) __dummy = NULL; \ - __buf = __dummy; \ - } \ (__recsize) ?\ __kfifo_out_r(__kfifo, __buf, __n, __recsize) : \ __kfifo_out(__kfifo, __buf, __n); \ @@ -773,14 +758,10 @@ __kfifo_uint_must_check_helper( \ __kfifo_uint_must_check_helper( \ ({ \ typeof((fifo) + 1) __tmp = (fifo); \ - typeof((buf) + 1) __buf = (buf); \ + typeof(__tmp->ptr) __buf = (buf); \ unsigned long __n = (n); \ const size_t __recsize = sizeof(*__tmp->rectype); \ struct __kfifo *__kfifo = &__tmp->kfifo; \ - if (0) { \ - typeof(__tmp->ptr) __dummy __attribute__ ((unused)) = NULL; \ - __buf = __dummy; \ - } \ (__recsize) ? \ __kfifo_out_peek_r(__kfifo, __buf, __n, __recsize) : \ __kfifo_out_peek(__kfifo, __buf, __n); \ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index f9d78ec7831..b7c171602ba 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1269,7 +1269,7 @@ void memory_failure_queue(unsigned long pfn, int trapno, int flags) mf_cpu = &get_cpu_var(memory_failure_cpu); spin_lock_irqsave(&mf_cpu->lock, proc_flags); - if (kfifo_put(&mf_cpu->fifo, &entry)) + if (kfifo_put(&mf_cpu->fifo, entry)) schedule_work_on(smp_processor_id(), &mf_cpu->work); else pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n", diff --git a/samples/kfifo/bytestream-example.c b/samples/kfifo/bytestream-example.c index cfe40addda7..2fca916d9ed 100644 --- a/samples/kfifo/bytestream-example.c +++ b/samples/kfifo/bytestream-example.c @@ -64,7 +64,7 @@ static int __init testfunc(void) /* put values into the fifo */ for (i = 0; i != 10; i++) - kfifo_put(&test, &i); + kfifo_put(&test, i); /* show the number of used elements */ printk(KERN_INFO "fifo len: %u\n", kfifo_len(&test)); @@ -85,7 +85,7 @@ static int __init testfunc(void) kfifo_skip(&test); /* put values into the fifo until is full */ - for (i = 20; kfifo_put(&test, &i); i++) + for (i = 20; kfifo_put(&test, i); i++) ; printk(KERN_INFO "queue len: %u\n", kfifo_len(&test)); diff --git a/samples/kfifo/dma-example.c b/samples/kfifo/dma-example.c index 06473791c08..aa243db93f0 100644 --- a/samples/kfifo/dma-example.c +++ b/samples/kfifo/dma-example.c @@ -39,7 +39,7 @@ static int __init example_init(void) kfifo_in(&fifo, "test", 4); for (i = 0; i != 9; i++) - kfifo_put(&fifo, &i); + kfifo_put(&fifo, i); /* kick away first byte */ kfifo_skip(&fifo); diff --git a/samples/kfifo/inttype-example.c b/samples/kfifo/inttype-example.c index 6f8e79e76c9..8dc3c2e7105 100644 --- a/samples/kfifo/inttype-example.c +++ b/samples/kfifo/inttype-example.c @@ -61,7 +61,7 @@ static int __init testfunc(void) /* put values into the fifo */ for (i = 0; i != 10; i++) - kfifo_put(&test, &i); + kfifo_put(&test, i); /* show the number of used elements */ printk(KERN_INFO "fifo len: %u\n", kfifo_len(&test)); @@ -78,7 +78,7 @@ static int __init testfunc(void) kfifo_skip(&test); /* put values into the fifo until is full */ - for (i = 20; kfifo_put(&test, &i); i++) + for (i = 20; kfifo_put(&test, i); i++) ; printk(KERN_INFO "queue len: %u\n", kfifo_len(&test)); -- cgit v1.2.3-70-g09d2 From 8d3ef556aba2b5b7d8b7144f7be1814d75ea3cc6 Mon Sep 17 00:00:00 2001 From: Alexander Beregalov Date: Thu, 14 Nov 2013 14:32:19 -0800 Subject: cmdline-parser: fix build Fix following errors: include/linux/cmdline-parser.h:17:12: error: 'BDEVNAME_SIZE' undeclared here block/cmdline-parser.c:17:2: error: implicit declaration of function 'kzalloc' Signed-off-by: Alexander Beregalov Cc: CaiZhiyong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cmdline-parser.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/cmdline-parser.h b/include/linux/cmdline-parser.h index 98e892ef6d5..a0f9280421e 100644 --- a/include/linux/cmdline-parser.h +++ b/include/linux/cmdline-parser.h @@ -8,6 +8,8 @@ #define CMDLINEPARSEH #include +#include +#include /* partition flags */ #define PF_RDONLY 0x01 /* Device is read only */ -- cgit v1.2.3-70-g09d2