From b726b7dfb400c937546fa91cf8523dcb1aa2fc6e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 7 Oct 2013 11:28:53 +0100 Subject: Revert "mm: sched: numa: Delay PTE scanning until a task is scheduled on a new node" PTE scanning and NUMA hinting fault handling are expensive so commit 5bca2303 ("mm: sched: numa: Delay PTE scanning until a task is scheduled on a new node") deferred the PTE scan until a task had been scheduled on another node. The problem is that in the purely shared memory case this may never happen and no NUMA hinting fault information will be captured. We are not ruling out the possibility that something better can be done here but for now, this patch needs to be reverted and we depend entirely on the scan_delay to avoid punishing short-lived processes. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-16-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- kernel/fork.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/fork.c b/kernel/fork.c index 086fe73ad6b..7192d91b541 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -816,9 +816,6 @@ struct mm_struct *dup_mm(struct task_struct *tsk) #ifdef CONFIG_TRANSPARENT_HUGEPAGE mm->pmd_huge_pte = NULL; -#endif -#ifdef CONFIG_NUMA_BALANCING - mm->first_nid = NUMA_PTE_SCAN_INIT; #endif if (!mm_init(mm, tsk)) goto fail_nomem; -- cgit v1.2.3-70-g09d2 From 5e1576ed0e54d419286a8096133029062b6ad456 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Mon, 7 Oct 2013 11:29:26 +0100 Subject: sched/numa: Stay on the same node if CLONE_VM A newly spawned thread inside a process should stay on the same NUMA node as its parent. This prevents processes from being "torn" across multiple NUMA nodes every time they spawn a new thread. 
Signed-off-by: Rik van Riel Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Srikar Dronamraju Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1381141781-10992-49-git-send-email-mgorman@suse.de Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 +- kernel/fork.c | 2 +- kernel/sched/core.c | 14 +++++++++----- 3 files changed, 11 insertions(+), 7 deletions(-) (limited to 'kernel/fork.c') diff --git a/include/linux/sched.h b/include/linux/sched.h index ff543851a18..8563e3dd5c0 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2021,7 +2021,7 @@ extern void wake_up_new_task(struct task_struct *tsk); #else static inline void kick_process(struct task_struct *tsk) { } #endif -extern void sched_fork(struct task_struct *p); +extern void sched_fork(unsigned long clone_flags, struct task_struct *p); extern void sched_dead(struct task_struct *p); extern void proc_caches_init(void); diff --git a/kernel/fork.c b/kernel/fork.c index 7192d91b541..c93be06dee8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1310,7 +1310,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, #endif /* Perform scheduler related setup. Assign this task to a CPU. 
*/ - sched_fork(p); + sched_fork(clone_flags, p); retval = perf_event_init_task(p); if (retval) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 51092d5cc64..3e2c893df17 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1696,7 +1696,7 @@ int wake_up_state(struct task_struct *p, unsigned int state) * * __sched_fork() is basic setup used by init_idle() too: */ -static void __sched_fork(struct task_struct *p) +static void __sched_fork(unsigned long clone_flags, struct task_struct *p) { p->on_rq = 0; @@ -1725,11 +1725,15 @@ static void __sched_fork(struct task_struct *p) p->mm->numa_scan_seq = 0; } + if (clone_flags & CLONE_VM) + p->numa_preferred_nid = current->numa_preferred_nid; + else + p->numa_preferred_nid = -1; + p->node_stamp = 0ULL; p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; p->numa_migrate_seq = 1; p->numa_scan_period = sysctl_numa_balancing_scan_delay; - p->numa_preferred_nid = -1; p->numa_work.next = &p->numa_work; p->numa_faults = NULL; p->numa_faults_buffer = NULL; @@ -1761,12 +1765,12 @@ void set_numabalancing_state(bool enabled) /* * fork()/clone()-time setup: */ -void sched_fork(struct task_struct *p) +void sched_fork(unsigned long clone_flags, struct task_struct *p) { unsigned long flags; int cpu = get_cpu(); - __sched_fork(p); + __sched_fork(clone_flags, p); /* * We mark the process as running here. This guarantees that * nobody will actually run it, and a signal or other external @@ -4287,7 +4291,7 @@ void init_idle(struct task_struct *idle, int cpu) raw_spin_lock_irqsave(&rq->lock, flags); - __sched_fork(idle); + __sched_fork(0, idle); idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); -- cgit v1.2.3-70-g09d2 From b68e0749100e1b901bf11330f149b321c082178e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 13 Oct 2013 21:18:31 +0200 Subject: uprobes: Change the callsite of uprobe_copy_process() Preparation for the next patches. 
Move the callsite of uprobe_copy_process() in copy_process() down to the successful return. We do not care if copy_process() fails, uprobe_free_utask() won't be called in this case so the wrong ->utask != NULL doesn't matter. OTOH, with this change we know that copy_process() can't fail when uprobe_copy_process() is called, the new task should either return to user-mode or call do_exit(). This way uprobe_copy_process() can: 1. setup p->utask != NULL if necessary 2. setup uprobes_state.xol_area 3. use task_work_add(p) Also, move the definition of uprobe_copy_process() down so that it can see get_utask(). Signed-off-by: Oleg Nesterov Acked-by: Srikar Dronamraju --- kernel/events/uprobes.c | 16 ++++++++-------- kernel/fork.c | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel/fork.c') diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index ad8e1bdca70..db7a1dcb3dd 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1344,14 +1344,6 @@ void uprobe_free_utask(struct task_struct *t) t->utask = NULL; } -/* - * Called in context of a new clone/fork from copy_process. - */ -void uprobe_copy_process(struct task_struct *t) -{ - t->utask = NULL; -} - /* * Allocate a uprobe_task object for the task if if necessary. * Called when the thread hits a breakpoint. @@ -1367,6 +1359,14 @@ static struct uprobe_task *get_utask(void) return current->utask; } +/* + * Called in context of a new clone/fork from copy_process. + */ +void uprobe_copy_process(struct task_struct *t) +{ + t->utask = NULL; +} + /* * Current area->vaddr notion assume the trampoline address is always * equal area->vaddr. 
diff --git a/kernel/fork.c b/kernel/fork.c index 086fe73ad6b..d3603b81246 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1373,7 +1373,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, INIT_LIST_HEAD(&p->pi_state_list); p->pi_state_cache = NULL; #endif - uprobe_copy_process(p); /* * sigaltstack should be cleared when sharing the same VM */ @@ -1490,6 +1489,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, perf_event_fork(p); trace_task_newtask(p, clone_flags); + uprobe_copy_process(p); return p; -- cgit v1.2.3-70-g09d2 From 3ab679661721b1ec2aaad99a801870ed59ab1110 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 16 Oct 2013 19:39:37 +0200 Subject: uprobes: Teach uprobe_copy_process() to handle CLONE_VFORK uprobe_copy_process() does nothing if the child shares ->mm with the forking process, but there is a special case: CLONE_VFORK. In this case it would be more correct to do dup_utask() but avoid dup_xol(). This is not that important, the child should not unwind its stack too much, this can corrupt the parent's stack, but at least we need this to allow ret-probing __vfork() itself. Note: in theory, it would be better to check task_pt_regs(p)->sp instead of CLONE_VFORK, we need to dup_utask() if and only if the child can return from the function called by the parent. But this needs the arch-dependent helper, and I think that nobody actually does clone(same_stack, CLONE_VM). 
Reported-by: Martin Cermak Reported-by: David Smith Signed-off-by: Oleg Nesterov --- include/linux/uprobes.h | 4 ++-- kernel/events/uprobes.c | 10 ++++++++-- kernel/fork.c | 2 +- 3 files changed, 11 insertions(+), 5 deletions(-) (limited to 'kernel/fork.c') diff --git a/include/linux/uprobes.h b/include/linux/uprobes.h index e6fba627ea4..9e0d5a6fe7a 100644 --- a/include/linux/uprobes.h +++ b/include/linux/uprobes.h @@ -117,7 +117,7 @@ extern void uprobe_start_dup_mmap(void); extern void uprobe_end_dup_mmap(void); extern void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm); extern void uprobe_free_utask(struct task_struct *t); -extern void uprobe_copy_process(struct task_struct *t); +extern void uprobe_copy_process(struct task_struct *t, unsigned long flags); extern unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs); extern int uprobe_post_sstep_notifier(struct pt_regs *regs); extern int uprobe_pre_sstep_notifier(struct pt_regs *regs); @@ -174,7 +174,7 @@ static inline unsigned long uprobe_get_swbp_addr(struct pt_regs *regs) static inline void uprobe_free_utask(struct task_struct *t) { } -static inline void uprobe_copy_process(struct task_struct *t) +static inline void uprobe_copy_process(struct task_struct *t, unsigned long flags) { } static inline void uprobe_clear_state(struct mm_struct *mm) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 9f282e14925..ae9e1d2ef25 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1415,7 +1415,7 @@ static void dup_xol_work(struct callback_head *work) /* * Called in context of a new clone/fork from copy_process. 
*/ -void uprobe_copy_process(struct task_struct *t) +void uprobe_copy_process(struct task_struct *t, unsigned long flags) { struct uprobe_task *utask = current->utask; struct mm_struct *mm = current->mm; @@ -1424,7 +1424,10 @@ void uprobe_copy_process(struct task_struct *t) t->utask = NULL; - if (mm == t->mm || !utask || !utask->return_instances) + if (!utask || !utask->return_instances) + return; + + if (mm == t->mm && !(flags & CLONE_VFORK)) return; if (dup_utask(t, utask)) @@ -1435,6 +1438,9 @@ void uprobe_copy_process(struct task_struct *t) if (!area) return uprobe_warn(t, "dup xol area"); + if (mm == t->mm) + return; + /* TODO: move it into the union in uprobe_task */ work = kmalloc(sizeof(*work), GFP_KERNEL); if (!work) diff --git a/kernel/fork.c b/kernel/fork.c index d3603b81246..8531609b6a8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1489,7 +1489,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, perf_event_fork(p); trace_task_newtask(p, clone_flags); - uprobe_copy_process(p); + uprobe_copy_process(p, clone_flags); return p; -- cgit v1.2.3-70-g09d2