Diffstat (limited to 'kernel')
35 files changed, 1631 insertions, 1422 deletions
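Most of the churn below is the scheduler wakeup rework: the old `int sync` argument is replaced by a `wake_flags` bitmask (`WF_SYNC`, `WF_FORK`) threaded through `try_to_wake_up()`, `check_preempt_curr()` and the sched-class `select_task_rq()` hooks, which now also receive the balance type (`SD_BALANCE_WAKE`/`FORK`/`EXEC`). The standalone sketch below only illustrates the flag-passing pattern; the flag names match the diff, but the function, its body and the printouts are invented for illustration and are not kernel code.

	/*
	 * Standalone sketch (not kernel code): the wakeup path in this series
	 * stops passing a boolean "sync" and instead threads a wake_flags
	 * bitmask through the preemption/placement hooks. Flag names mirror
	 * the diff (WF_SYNC, WF_FORK); everything else is simplified.
	 */
	#include <stdio.h>

	#define WF_SYNC 0x01	/* waker intends to sleep right after the wakeup */
	#define WF_FORK 0x02	/* wakeup of a freshly forked child */

	/* stand-in for rq->curr->sched_class->check_preempt_curr(rq, p, flags) */
	static void check_preempt_curr(const char *task, int wake_flags)
	{
		if (wake_flags & WF_SYNC)
			printf("%s: sync wakeup, consider preempting current\n", task);
		if (wake_flags & WF_FORK)
			printf("%s: fork wakeup, skip next-buddy hint\n", task);
		if (!wake_flags)
			printf("%s: plain wakeup\n", task);
	}

	int main(void)
	{
		check_preempt_curr("pipe reader", WF_SYNC);	/* was: sync = 1 */
		check_preempt_curr("new child", WF_FORK);	/* not expressible before */
		check_preempt_curr("timer task", 0);		/* was: sync = 0 */
		return 0;
	}

Folding the fork case into the same bitmask is what lets check_preempt_wakeup() in sched_fair.c skip the next-buddy hint for freshly forked tasks (the `!(wake_flags & WF_FORK)` test) without growing yet another parameter.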
diff --git a/kernel/Makefile b/kernel/Makefile index 961379caf66..3d9c7e27e3f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -90,7 +90,6 @@ obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o  obj-$(CONFIG_MARKERS) += marker.o  obj-$(CONFIG_TRACEPOINTS) += tracepoint.o  obj-$(CONFIG_LATENCYTOP) += latencytop.o -obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o  obj-$(CONFIG_FUNCTION_TRACER) += trace/  obj-$(CONFIG_TRACING) += trace/  obj-$(CONFIG_X86_DS) += trace/ diff --git a/kernel/cpu.c b/kernel/cpu.c index 8ce10043e4a..6ba0f1ecb21 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -401,6 +401,7 @@ int disable_nonboot_cpus(void)  			break;  		}  	} +  	if (!error) {  		BUG_ON(num_online_cpus() > 1);  		/* Make sure the CPUs won't be enabled by someone else */ @@ -413,6 +414,14 @@ int disable_nonboot_cpus(void)  	return error;  } +void __weak arch_enable_nonboot_cpus_begin(void) +{ +} + +void __weak arch_enable_nonboot_cpus_end(void) +{ +} +  void __ref enable_nonboot_cpus(void)  {  	int cpu, error; @@ -424,6 +433,9 @@ void __ref enable_nonboot_cpus(void)  		goto out;  	printk("Enabling non-boot CPUs ...\n"); + +	arch_enable_nonboot_cpus_begin(); +  	for_each_cpu(cpu, frozen_cpus) {  		error = _cpu_up(cpu, 1);  		if (!error) { @@ -432,6 +444,9 @@ void __ref enable_nonboot_cpus(void)  		}  		printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);  	} + +	arch_enable_nonboot_cpus_end(); +  	cpumask_clear(frozen_cpus);  out:  	cpu_maps_update_done(); diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c deleted file mode 100644 index 962a3b574f2..00000000000 --- a/kernel/dma-coherent.c +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Coherent per-device memory handling. - * Borrowed from i386 - */ -#include <linux/kernel.h> -#include <linux/dma-mapping.h> - -struct dma_coherent_mem { -	void		*virt_base; -	u32		device_base; -	int		size; -	int		flags; -	unsigned long	*bitmap; -}; - -int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, -				dma_addr_t device_addr, size_t size, int flags) -{ -	void __iomem *mem_base = NULL; -	int pages = size >> PAGE_SHIFT; -	int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); - -	if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0) -		goto out; -	if (!size) -		goto out; -	if (dev->dma_mem) -		goto out; - -	/* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */ - -	mem_base = ioremap(bus_addr, size); -	if (!mem_base) -		goto out; - -	dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); -	if (!dev->dma_mem) -		goto out; -	dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL); -	if (!dev->dma_mem->bitmap) -		goto free1_out; - -	dev->dma_mem->virt_base = mem_base; -	dev->dma_mem->device_base = device_addr; -	dev->dma_mem->size = pages; -	dev->dma_mem->flags = flags; - -	if (flags & DMA_MEMORY_MAP) -		return DMA_MEMORY_MAP; - -	return DMA_MEMORY_IO; - - free1_out: -	kfree(dev->dma_mem); - out: -	if (mem_base) -		iounmap(mem_base); -	return 0; -} -EXPORT_SYMBOL(dma_declare_coherent_memory); - -void dma_release_declared_memory(struct device *dev) -{ -	struct dma_coherent_mem *mem = dev->dma_mem; - -	if (!mem) -		return; -	dev->dma_mem = NULL; -	iounmap(mem->virt_base); -	kfree(mem->bitmap); -	kfree(mem); -} -EXPORT_SYMBOL(dma_release_declared_memory); - -void *dma_mark_declared_memory_occupied(struct device *dev, -					dma_addr_t device_addr, size_t size) -{ -	struct dma_coherent_mem *mem = dev->dma_mem; -	int pos, err; - -	size += device_addr & ~PAGE_MASK; - -	if (!mem) -		return ERR_PTR(-EINVAL); - -	
pos = (device_addr - mem->device_base) >> PAGE_SHIFT; -	err = bitmap_allocate_region(mem->bitmap, pos, get_order(size)); -	if (err != 0) -		return ERR_PTR(err); -	return mem->virt_base + (pos << PAGE_SHIFT); -} -EXPORT_SYMBOL(dma_mark_declared_memory_occupied); - -/** - * dma_alloc_from_coherent() - try to allocate memory from the per-device coherent area - * - * @dev:	device from which we allocate memory - * @size:	size of requested memory area - * @dma_handle:	This will be filled with the correct dma handle - * @ret:	This pointer will be filled with the virtual address - *		to allocated area. - * - * This function should be only called from per-arch dma_alloc_coherent() - * to support allocation from per-device coherent memory pools. - * - * Returns 0 if dma_alloc_coherent should continue with allocating from - * generic memory areas, or !0 if dma_alloc_coherent should return @ret. - */ -int dma_alloc_from_coherent(struct device *dev, ssize_t size, -				       dma_addr_t *dma_handle, void **ret) -{ -	struct dma_coherent_mem *mem; -	int order = get_order(size); -	int pageno; - -	if (!dev) -		return 0; -	mem = dev->dma_mem; -	if (!mem) -		return 0; - -	*ret = NULL; - -	if (unlikely(size > (mem->size << PAGE_SHIFT))) -		goto err; - -	pageno = bitmap_find_free_region(mem->bitmap, mem->size, order); -	if (unlikely(pageno < 0)) -		goto err; - -	/* -	 * Memory was found in the per-device area. -	 */ -	*dma_handle = mem->device_base + (pageno << PAGE_SHIFT); -	*ret = mem->virt_base + (pageno << PAGE_SHIFT); -	memset(*ret, 0, size); - -	return 1; - -err: -	/* -	 * In the case where the allocation can not be satisfied from the -	 * per-device area, try to fall back to generic memory if the -	 * constraints allow it. -	 */ -	return mem->flags & DMA_MEMORY_EXCLUSIVE; -} -EXPORT_SYMBOL(dma_alloc_from_coherent); - -/** - * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool - * @dev:	device from which the memory was allocated - * @order:	the order of pages allocated - * @vaddr:	virtual address of allocated pages - * - * This checks whether the memory was allocated from the per-device - * coherent memory pool and if so, releases that memory. - * - * Returns 1 if we correctly released the memory, or 0 if - * dma_release_coherent() should proceed with releasing memory from - * generic pools. - */ -int dma_release_from_coherent(struct device *dev, int order, void *vaddr) -{ -	struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; - -	if (mem && vaddr >= mem->virt_base && vaddr < -		   (mem->virt_base + (mem->size << PAGE_SHIFT))) { -		int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; - -		bitmap_release_region(mem->bitmap, page, order); -		return 1; -	} -	return 0; -} -EXPORT_SYMBOL(dma_release_from_coherent); diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index 22e9dcfaa3d..654efd09f6a 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -34,7 +34,7 @@ config GCOV_KERNEL  config GCOV_PROFILE_ALL  	bool "Profile entire Kernel"  	depends on GCOV_KERNEL -	depends on S390 || X86 +	depends on S390 || X86 || (PPC && EXPERIMENTAL)  	default n  	---help---  	This options activates profiling for the entire kernel. 
diff --git a/kernel/module.c b/kernel/module.c index 46580edff0c..05ce49ced8f 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -369,7 +369,7 @@ EXPORT_SYMBOL_GPL(find_module);  #ifdef CONFIG_SMP -#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA +#ifndef CONFIG_HAVE_LEGACY_PER_CPU_AREA  static void *percpu_modalloc(unsigned long size, unsigned long align,  			     const char *name) @@ -394,7 +394,7 @@ static void percpu_modfree(void *freeme)  	free_percpu(freeme);  } -#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#else /* ... CONFIG_HAVE_LEGACY_PER_CPU_AREA */  /* Number of blocks used and allocated. */  static unsigned int pcpu_num_used, pcpu_num_allocated; @@ -540,7 +540,7 @@ static int percpu_modinit(void)  }  __initcall(percpu_modinit); -#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */ +#endif /* CONFIG_HAVE_LEGACY_PER_CPU_AREA */  static unsigned int find_pcpusec(Elf_Ehdr *hdr,  				 Elf_Shdr *sechdrs, diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index e0d91fdf0c3..8cb94a52d1b 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -106,16 +106,16 @@ hw_perf_group_sched_in(struct perf_counter *group_leader,  void __weak perf_counter_print_debug(void)	{ } -static DEFINE_PER_CPU(int, disable_count); +static DEFINE_PER_CPU(int, perf_disable_count);  void __perf_disable(void)  { -	__get_cpu_var(disable_count)++; +	__get_cpu_var(perf_disable_count)++;  }  bool __perf_enable(void)  { -	return !--__get_cpu_var(disable_count); +	return !--__get_cpu_var(perf_disable_count);  }  void perf_disable(void) @@ -4215,6 +4215,7 @@ static int perf_copy_attr(struct perf_counter_attr __user *uattr,  			if (val)  				goto err_size;  		} +		size = sizeof(*attr);  	}  	ret = copy_from_user(attr, uattr, size); diff --git a/kernel/sched.c b/kernel/sched.c index e27a53685ed..faf4d463bbf 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -119,8 +119,6 @@   */  #define RUNTIME_INF	((u64)~0ULL) -static void double_rq_lock(struct rq *rq1, struct rq *rq2); -  static inline int rt_policy(int policy)  {  	if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) @@ -295,12 +293,12 @@ struct task_group root_task_group;  /* Default task group's sched entity on each cpu */  static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);  /* Default task group's cfs_rq on each cpu */ -static DEFINE_PER_CPU(struct cfs_rq, init_tg_cfs_rq) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct cfs_rq, init_tg_cfs_rq);  #endif /* CONFIG_FAIR_GROUP_SCHED */  #ifdef CONFIG_RT_GROUP_SCHED  static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); -static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);  #endif /* CONFIG_RT_GROUP_SCHED */  #else /* !CONFIG_USER_SCHED */  #define root_task_group init_task_group @@ -378,13 +376,6 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)  #else -#ifdef CONFIG_SMP -static int root_task_group_empty(void) -{ -	return 1; -} -#endif -  static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }  static inline struct task_group *task_group(struct task_struct *p)  { @@ -514,14 +505,6 @@ struct root_domain {  #ifdef CONFIG_SMP  	struct cpupri cpupri;  #endif -#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -	/* -	 * Preferred wake up cpu nominated by sched_mc balance that will be -	 * used when most cpus are idle in the system indicating overall very -	 * low system utilisation. 
Triggered at POWERSAVINGS_BALANCE_WAKEUP(2) -	 */ -	unsigned int sched_mc_preferred_wakeup_cpu; -#endif  };  /* @@ -646,9 +629,10 @@ struct rq {  static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); -static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync) +static inline +void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)  { -	rq->curr->sched_class->check_preempt_curr(rq, p, sync); +	rq->curr->sched_class->check_preempt_curr(rq, p, flags);  }  static inline int cpu_of(struct rq *rq) @@ -1509,8 +1493,65 @@ static int tg_nop(struct task_group *tg, void *data)  #endif  #ifdef CONFIG_SMP -static unsigned long source_load(int cpu, int type); -static unsigned long target_load(int cpu, int type); +/* Used instead of source_load when we know the type == 0 */ +static unsigned long weighted_cpuload(const int cpu) +{ +	return cpu_rq(cpu)->load.weight; +} + +/* + * Return a low guess at the load of a migration-source cpu weighted + * according to the scheduling class and "nice" value. + * + * We want to under-estimate the load of migration sources, to + * balance conservatively. + */ +static unsigned long source_load(int cpu, int type) +{ +	struct rq *rq = cpu_rq(cpu); +	unsigned long total = weighted_cpuload(cpu); + +	if (type == 0 || !sched_feat(LB_BIAS)) +		return total; + +	return min(rq->cpu_load[type-1], total); +} + +/* + * Return a high guess at the load of a migration-target cpu weighted + * according to the scheduling class and "nice" value. + */ +static unsigned long target_load(int cpu, int type) +{ +	struct rq *rq = cpu_rq(cpu); +	unsigned long total = weighted_cpuload(cpu); + +	if (type == 0 || !sched_feat(LB_BIAS)) +		return total; + +	return max(rq->cpu_load[type-1], total); +} + +static struct sched_group *group_of(int cpu) +{ +	struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); + +	if (!sd) +		return NULL; + +	return sd->groups; +} + +static unsigned long power_of(int cpu) +{ +	struct sched_group *group = group_of(cpu); + +	if (!group) +		return SCHED_LOAD_SCALE; + +	return group->cpu_power; +} +  static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);  static unsigned long cpu_avg_load_per_task(int cpu) @@ -1695,6 +1736,8 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)  #ifdef CONFIG_PREEMPT +static void double_rq_lock(struct rq *rq1, struct rq *rq2); +  /*   * fair double_lock_balance: Safely acquires both rq->locks in a fair   * way at the expense of forcing extra atomic operations in all @@ -1959,13 +2002,6 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,  }  #ifdef CONFIG_SMP - -/* Used instead of source_load when we know the type == 0 */ -static unsigned long weighted_cpuload(const int cpu) -{ -	return cpu_rq(cpu)->load.weight; -} -  /*   * Is this task likely cache-hot:   */ @@ -2239,185 +2275,6 @@ void kick_process(struct task_struct *p)  	preempt_enable();  }  EXPORT_SYMBOL_GPL(kick_process); - -/* - * Return a low guess at the load of a migration-source cpu weighted - * according to the scheduling class and "nice" value. - * - * We want to under-estimate the load of migration sources, to - * balance conservatively. 
- */ -static unsigned long source_load(int cpu, int type) -{ -	struct rq *rq = cpu_rq(cpu); -	unsigned long total = weighted_cpuload(cpu); - -	if (type == 0 || !sched_feat(LB_BIAS)) -		return total; - -	return min(rq->cpu_load[type-1], total); -} - -/* - * Return a high guess at the load of a migration-target cpu weighted - * according to the scheduling class and "nice" value. - */ -static unsigned long target_load(int cpu, int type) -{ -	struct rq *rq = cpu_rq(cpu); -	unsigned long total = weighted_cpuload(cpu); - -	if (type == 0 || !sched_feat(LB_BIAS)) -		return total; - -	return max(rq->cpu_load[type-1], total); -} - -/* - * find_idlest_group finds and returns the least busy CPU group within the - * domain. - */ -static struct sched_group * -find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) -{ -	struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; -	unsigned long min_load = ULONG_MAX, this_load = 0; -	int load_idx = sd->forkexec_idx; -	int imbalance = 100 + (sd->imbalance_pct-100)/2; - -	do { -		unsigned long load, avg_load; -		int local_group; -		int i; - -		/* Skip over this group if it has no CPUs allowed */ -		if (!cpumask_intersects(sched_group_cpus(group), -					&p->cpus_allowed)) -			continue; - -		local_group = cpumask_test_cpu(this_cpu, -					       sched_group_cpus(group)); - -		/* Tally up the load of all CPUs in the group */ -		avg_load = 0; - -		for_each_cpu(i, sched_group_cpus(group)) { -			/* Bias balancing toward cpus of our domain */ -			if (local_group) -				load = source_load(i, load_idx); -			else -				load = target_load(i, load_idx); - -			avg_load += load; -		} - -		/* Adjust by relative CPU power of the group */ -		avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; - -		if (local_group) { -			this_load = avg_load; -			this = group; -		} else if (avg_load < min_load) { -			min_load = avg_load; -			idlest = group; -		} -	} while (group = group->next, group != sd->groups); - -	if (!idlest || 100*this_load < imbalance*min_load) -		return NULL; -	return idlest; -} - -/* - * find_idlest_cpu - find the idlest cpu among the cpus in group. - */ -static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) -{ -	unsigned long load, min_load = ULONG_MAX; -	int idlest = -1; -	int i; - -	/* Traverse only the allowed CPUs */ -	for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { -		load = weighted_cpuload(i); - -		if (load < min_load || (load == min_load && i == this_cpu)) { -			min_load = load; -			idlest = i; -		} -	} - -	return idlest; -} - -/* - * sched_balance_self: balance the current task (running on cpu) in domains - * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and - * SD_BALANCE_EXEC. - * - * Balance, ie. select the least loaded group. - * - * Returns the target CPU number, or the same CPU if no balancing is needed. - * - * preempt must be disabled. - */ -static int sched_balance_self(int cpu, int flag) -{ -	struct task_struct *t = current; -	struct sched_domain *tmp, *sd = NULL; - -	for_each_domain(cpu, tmp) { -		/* -		 * If power savings logic is enabled for a domain, stop there. 
-		 */ -		if (tmp->flags & SD_POWERSAVINGS_BALANCE) -			break; -		if (tmp->flags & flag) -			sd = tmp; -	} - -	if (sd) -		update_shares(sd); - -	while (sd) { -		struct sched_group *group; -		int new_cpu, weight; - -		if (!(sd->flags & flag)) { -			sd = sd->child; -			continue; -		} - -		group = find_idlest_group(sd, t, cpu); -		if (!group) { -			sd = sd->child; -			continue; -		} - -		new_cpu = find_idlest_cpu(group, t, cpu); -		if (new_cpu == -1 || new_cpu == cpu) { -			/* Now try balancing at a lower domain level of cpu */ -			sd = sd->child; -			continue; -		} - -		/* Now try balancing at a lower domain level of new_cpu */ -		cpu = new_cpu; -		weight = cpumask_weight(sched_domain_span(sd)); -		sd = NULL; -		for_each_domain(cpu, tmp) { -			if (weight <= cpumask_weight(sched_domain_span(tmp))) -				break; -			if (tmp->flags & flag) -				sd = tmp; -		} -		/* while loop will break here if sd == NULL */ -	} - -	return cpu; -} -  #endif /* CONFIG_SMP */  /** @@ -2455,37 +2312,22 @@ void task_oncpu_function_call(struct task_struct *p,   *   * returns failure only if the task is already active.   */ -static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) +static int try_to_wake_up(struct task_struct *p, unsigned int state, +			  int wake_flags)  {  	int cpu, orig_cpu, this_cpu, success = 0;  	unsigned long flags; -	long old_state;  	struct rq *rq;  	if (!sched_feat(SYNC_WAKEUPS)) -		sync = 0; - -#ifdef CONFIG_SMP -	if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) { -		struct sched_domain *sd; +		wake_flags &= ~WF_SYNC; -		this_cpu = raw_smp_processor_id(); -		cpu = task_cpu(p); - -		for_each_domain(this_cpu, sd) { -			if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { -				update_shares(sd); -				break; -			} -		} -	} -#endif +	this_cpu = get_cpu();  	smp_wmb();  	rq = task_rq_lock(p, &flags);  	update_rq_clock(rq); -	old_state = p->state; -	if (!(old_state & state)) +	if (!(p->state & state))  		goto out;  	if (p->se.on_rq) @@ -2493,27 +2335,29 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)  	cpu = task_cpu(p);  	orig_cpu = cpu; -	this_cpu = smp_processor_id();  #ifdef CONFIG_SMP  	if (unlikely(task_running(rq, p)))  		goto out_activate; -	cpu = p->sched_class->select_task_rq(p, sync); -	if (cpu != orig_cpu) { +	/* +	 * In order to handle concurrent wakeups and release the rq->lock +	 * we put the task in TASK_WAKING state. 
+	 * +	 * First fix up the nr_uninterruptible count: +	 */ +	if (task_contributes_to_load(p)) +		rq->nr_uninterruptible--; +	p->state = TASK_WAKING; +	task_rq_unlock(rq, &flags); + +	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_WAKE, wake_flags); +	if (cpu != orig_cpu)  		set_task_cpu(p, cpu); -		task_rq_unlock(rq, &flags); -		/* might preempt at this point */ -		rq = task_rq_lock(p, &flags); -		old_state = p->state; -		if (!(old_state & state)) -			goto out; -		if (p->se.on_rq) -			goto out_running; -		this_cpu = smp_processor_id(); -		cpu = task_cpu(p); -	} +	rq = task_rq_lock(p, &flags); +	WARN_ON(p->state != TASK_WAKING); +	cpu = task_cpu(p);  #ifdef CONFIG_SCHEDSTATS  	schedstat_inc(rq, ttwu_count); @@ -2533,7 +2377,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)  out_activate:  #endif /* CONFIG_SMP */  	schedstat_inc(p, se.nr_wakeups); -	if (sync) +	if (wake_flags & WF_SYNC)  		schedstat_inc(p, se.nr_wakeups_sync);  	if (orig_cpu != cpu)  		schedstat_inc(p, se.nr_wakeups_migrate); @@ -2562,7 +2406,7 @@ out_activate:  out_running:  	trace_sched_wakeup(rq, p, success); -	check_preempt_curr(rq, p, sync); +	check_preempt_curr(rq, p, wake_flags);  	p->state = TASK_RUNNING;  #ifdef CONFIG_SMP @@ -2571,6 +2415,7 @@ out_running:  #endif  out:  	task_rq_unlock(rq, &flags); +	put_cpu();  	return success;  } @@ -2613,6 +2458,7 @@ static void __sched_fork(struct task_struct *p)  	p->se.avg_overlap		= 0;  	p->se.start_runtime		= 0;  	p->se.avg_wakeup		= sysctl_sched_wakeup_granularity; +	p->se.avg_running		= 0;  #ifdef CONFIG_SCHEDSTATS  	p->se.wait_start			= 0; @@ -2674,11 +2520,6 @@ void sched_fork(struct task_struct *p, int clone_flags)  	__sched_fork(p); -#ifdef CONFIG_SMP -	cpu = sched_balance_self(cpu, SD_BALANCE_FORK); -#endif -	set_task_cpu(p, cpu); -  	/*  	 * Make sure we do not leak PI boosting priority to the child.  	 
*/ @@ -2709,6 +2550,11 @@ void sched_fork(struct task_struct *p, int clone_flags)  	if (!rt_prio(p->prio))  		p->sched_class = &fair_sched_class; +#ifdef CONFIG_SMP +	cpu = p->sched_class->select_task_rq(p, SD_BALANCE_FORK, 0); +#endif +	set_task_cpu(p, cpu); +  #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)  	if (likely(sched_info_on()))  		memset(&p->sched_info, 0, sizeof(p->sched_info)); @@ -2754,7 +2600,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)  		inc_nr_running(rq);  	}  	trace_sched_wakeup_new(rq, p, 1); -	check_preempt_curr(rq, p, 0); +	check_preempt_curr(rq, p, WF_FORK);  #ifdef CONFIG_SMP  	if (p->sched_class->task_wake_up)  		p->sched_class->task_wake_up(rq, p); @@ -3263,7 +3109,7 @@ out:  void sched_exec(void)  {  	int new_cpu, this_cpu = get_cpu(); -	new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); +	new_cpu = current->sched_class->select_task_rq(current, SD_BALANCE_EXEC, 0);  	put_cpu();  	if (new_cpu != this_cpu)  		sched_migrate_task(current, new_cpu); @@ -3683,11 +3529,6 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,  	*imbalance = sds->min_load_per_task;  	sds->busiest = sds->group_min; -	if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) { -		cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu = -			group_first_cpu(sds->group_leader); -	} -  	return 1;  } @@ -3711,7 +3552,18 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,  }  #endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ -unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) + +unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) +{ +	return SCHED_LOAD_SCALE; +} + +unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu) +{ +	return default_scale_freq_power(sd, cpu); +} + +unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)  {  	unsigned long weight = cpumask_weight(sched_domain_span(sd));  	unsigned long smt_gain = sd->smt_gain; @@ -3721,6 +3573,11 @@ unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)  	return smt_gain;  } +unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu) +{ +	return default_scale_smt_power(sd, cpu); +} +  unsigned long scale_rt_power(int cpu)  {  	struct rq *rq = cpu_rq(cpu); @@ -3745,10 +3602,19 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)  	unsigned long power = SCHED_LOAD_SCALE;  	struct sched_group *sdg = sd->groups; -	/* here we could scale based on cpufreq */ +	if (sched_feat(ARCH_POWER)) +		power *= arch_scale_freq_power(sd, cpu); +	else +		power *= default_scale_freq_power(sd, cpu); + +	power >>= SCHED_LOAD_SHIFT;  	if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) { -		power *= arch_scale_smt_power(sd, cpu); +		if (sched_feat(ARCH_POWER)) +			power *= arch_scale_smt_power(sd, cpu); +		else +			power *= default_scale_smt_power(sd, cpu); +  		power >>= SCHED_LOAD_SHIFT;  	} @@ -4161,26 +4027,6 @@ ret:  	return NULL;  } -static struct sched_group *group_of(int cpu) -{ -	struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); - -	if (!sd) -		return NULL; - -	return sd->groups; -} - -static unsigned long power_of(int cpu) -{ -	struct sched_group *group = group_of(cpu); - -	if (!group) -		return SCHED_LOAD_SCALE; - -	return group->cpu_power; -} -  /*   * find_busiest_queue - find the busiest runqueue among the cpus in group.   
*/ @@ -5465,14 +5311,13 @@ static inline void schedule_debug(struct task_struct *prev)  #endif  } -static void put_prev_task(struct rq *rq, struct task_struct *prev) +static void put_prev_task(struct rq *rq, struct task_struct *p)  { -	if (prev->state == TASK_RUNNING) { -		u64 runtime = prev->se.sum_exec_runtime; +	u64 runtime = p->se.sum_exec_runtime - p->se.prev_sum_exec_runtime; -		runtime -= prev->se.prev_sum_exec_runtime; -		runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); +	update_avg(&p->se.avg_running, runtime); +	if (p->state == TASK_RUNNING) {  		/*  		 * In order to avoid avg_overlap growing stale when we are  		 * indeed overlapping and hence not getting put to sleep, grow @@ -5482,9 +5327,12 @@ static void put_prev_task(struct rq *rq, struct task_struct *prev)  		 * correlates to the amount of cache footprint a task can  		 * build up.  		 */ -		update_avg(&prev->se.avg_overlap, runtime); +		runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost); +		update_avg(&p->se.avg_overlap, runtime); +	} else { +		update_avg(&p->se.avg_running, 0);  	} -	prev->sched_class->put_prev_task(rq, prev); +	p->sched_class->put_prev_task(rq, p);  }  /* @@ -5716,10 +5564,10 @@ asmlinkage void __sched preempt_schedule_irq(void)  #endif /* CONFIG_PREEMPT */ -int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, +int default_wake_function(wait_queue_t *curr, unsigned mode, int wake_flags,  			  void *key)  { -	return try_to_wake_up(curr->private, mode, sync); +	return try_to_wake_up(curr->private, mode, wake_flags);  }  EXPORT_SYMBOL(default_wake_function); @@ -5733,14 +5581,14 @@ EXPORT_SYMBOL(default_wake_function);   * zero in this (rare) case, and we handle it by continuing to scan the queue.   */  static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, -			int nr_exclusive, int sync, void *key) +			int nr_exclusive, int wake_flags, void *key)  {  	wait_queue_t *curr, *next;  	list_for_each_entry_safe(curr, next, &q->task_list, task_list) {  		unsigned flags = curr->flags; -		if (curr->func(curr, mode, sync, key) && +		if (curr->func(curr, mode, wake_flags, key) &&  				(flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)  			break;  	} @@ -5801,16 +5649,16 @@ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,  			int nr_exclusive, void *key)  {  	unsigned long flags; -	int sync = 1; +	int wake_flags = WF_SYNC;  	if (unlikely(!q))  		return;  	if (unlikely(!nr_exclusive)) -		sync = 0; +		wake_flags = 0;  	spin_lock_irqsave(&q->lock, flags); -	__wake_up_common(q, mode, nr_exclusive, sync, key); +	__wake_up_common(q, mode, nr_exclusive, wake_flags, key);  	spin_unlock_irqrestore(&q->lock, flags);  }  EXPORT_SYMBOL_GPL(__wake_up_sync_key); @@ -8000,9 +7848,7 @@ static int sd_degenerate(struct sched_domain *sd)  	}  	/* Following flags don't use groups */ -	if (sd->flags & (SD_WAKE_IDLE | -			 SD_WAKE_AFFINE | -			 SD_WAKE_BALANCE)) +	if (sd->flags & (SD_WAKE_AFFINE))  		return 0;  	return 1; @@ -8019,10 +7865,6 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)  	if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))  		return 0; -	/* Does parent contain flags not in child? 
*/ -	/* WAKE_BALANCE is a subset of WAKE_AFFINE */ -	if (cflags & SD_WAKE_AFFINE) -		pflags &= ~SD_WAKE_BALANCE;  	/* Flags needing groups don't count if only 1 group in parent */  	if (parent->groups == parent->groups->next) {  		pflags &= ~(SD_LOAD_BALANCE | @@ -8708,10 +8550,10 @@ static void set_domain_attribute(struct sched_domain *sd,  		request = attr->relax_domain_level;  	if (request < sd->level) {  		/* turn off idle balance on this domain */ -		sd->flags &= ~(SD_WAKE_IDLE|SD_BALANCE_NEWIDLE); +		sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);  	} else {  		/* turn on idle balance on this domain */ -		sd->flags |= (SD_WAKE_IDLE_FAR|SD_BALANCE_NEWIDLE); +		sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);  	}  } diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 5ddbd089126..efb84409bc4 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -395,6 +395,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)  	PN(se.sum_exec_runtime);  	PN(se.avg_overlap);  	PN(se.avg_wakeup); +	PN(se.avg_running);  	nr_switches = p->nvcsw + p->nivcsw; diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index aa7f8412101..10d218ab69f 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -711,7 +711,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)  	if (!initial) {  		/* sleeps upto a single latency don't count. */ -		if (sched_feat(NEW_FAIR_SLEEPERS)) { +		if (sched_feat(FAIR_SLEEPERS)) {  			unsigned long thresh = sysctl_sched_latency;  			/* @@ -725,6 +725,13 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)  					 task_of(se)->policy != SCHED_IDLE))  				thresh = calc_delta_fair(thresh, se); +			/* +			 * Halve their sleep time's effect, to allow +			 * for a gentler effect of sleepers: +			 */ +			if (sched_feat(GENTLE_FAIR_SLEEPERS)) +				thresh >>= 1; +  			vruntime -= thresh;  		}  	} @@ -757,10 +764,10 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)  static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)  { -	if (cfs_rq->last == se) +	if (!se || cfs_rq->last == se)  		cfs_rq->last = NULL; -	if (cfs_rq->next == se) +	if (!se || cfs_rq->next == se)  		cfs_rq->next = NULL;  } @@ -1062,83 +1069,6 @@ static void yield_task_fair(struct rq *rq)  	se->vruntime = rightmost->vruntime + 1;  } -/* - * wake_idle() will wake a task on an idle cpu if task->cpu is - * not idle and an idle cpu is available.  The span of cpus to - * search starts with cpus closest then further out as needed, - * so we always favor a closer, idle cpu. - * Domains may include CPUs that are not usable for migration, - * hence we need to mask them out (rq->rd->online) - * - * Returns the CPU we should wake onto. - */ -#if defined(ARCH_HAS_SCHED_WAKE_IDLE) - -#define cpu_rd_active(cpu, rq) cpumask_test_cpu(cpu, rq->rd->online) - -static int wake_idle(int cpu, struct task_struct *p) -{ -	struct sched_domain *sd; -	int i; -	unsigned int chosen_wakeup_cpu; -	int this_cpu; -	struct rq *task_rq = task_rq(p); - -	/* -	 * At POWERSAVINGS_BALANCE_WAKEUP level, if both this_cpu and prev_cpu -	 * are idle and this is not a kernel thread and this task's affinity -	 * allows it to be moved to preferred cpu, then just move! 
-	 */ - -	this_cpu = smp_processor_id(); -	chosen_wakeup_cpu = -		cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu; - -	if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP && -		idle_cpu(cpu) && idle_cpu(this_cpu) && -		p->mm && !(p->flags & PF_KTHREAD) && -		cpu_isset(chosen_wakeup_cpu, p->cpus_allowed)) -		return chosen_wakeup_cpu; - -	/* -	 * If it is idle, then it is the best cpu to run this task. -	 * -	 * This cpu is also the best, if it has more than one task already. -	 * Siblings must be also busy(in most cases) as they didn't already -	 * pickup the extra load from this cpu and hence we need not check -	 * sibling runqueue info. This will avoid the checks and cache miss -	 * penalities associated with that. -	 */ -	if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1) -		return cpu; - -	for_each_domain(cpu, sd) { -		if ((sd->flags & SD_WAKE_IDLE) -		    || ((sd->flags & SD_WAKE_IDLE_FAR) -			&& !task_hot(p, task_rq->clock, sd))) { -			for_each_cpu_and(i, sched_domain_span(sd), -					 &p->cpus_allowed) { -				if (cpu_rd_active(i, task_rq) && idle_cpu(i)) { -					if (i != task_cpu(p)) { -						schedstat_inc(p, -						       se.nr_wakeups_idle); -					} -					return i; -				} -			} -		} else { -			break; -		} -	} -	return cpu; -} -#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/ -static inline int wake_idle(int cpu, struct task_struct *p) -{ -	return cpu; -} -#endif -  #ifdef CONFIG_SMP  #ifdef CONFIG_FAIR_GROUP_SCHED @@ -1225,25 +1155,34 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,  #endif -static int -wake_affine(struct sched_domain *this_sd, struct rq *this_rq, -	    struct task_struct *p, int prev_cpu, int this_cpu, int sync, -	    int idx, unsigned long load, unsigned long this_load, -	    unsigned int imbalance) +static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)  { -	struct task_struct *curr = this_rq->curr; -	struct task_group *tg; -	unsigned long tl = this_load; +	struct task_struct *curr = current; +	unsigned long this_load, load; +	int idx, this_cpu, prev_cpu;  	unsigned long tl_per_task; +	unsigned int imbalance; +	struct task_group *tg;  	unsigned long weight;  	int balanced; -	if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) -		return 0; +	idx	  = sd->wake_idx; +	this_cpu  = smp_processor_id(); +	prev_cpu  = task_cpu(p); +	load	  = source_load(prev_cpu, idx); +	this_load = target_load(this_cpu, idx); -	if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost || -			p->se.avg_overlap > sysctl_sched_migration_cost)) -		sync = 0; +	if (sync) { +	       if (sched_feat(SYNC_LESS) && +		   (curr->se.avg_overlap > sysctl_sched_migration_cost || +		    p->se.avg_overlap > sysctl_sched_migration_cost)) +		       sync = 0; +	} else { +		if (sched_feat(SYNC_MORE) && +		    (curr->se.avg_overlap < sysctl_sched_migration_cost && +		     p->se.avg_overlap < sysctl_sched_migration_cost)) +			sync = 1; +	}  	/*  	 * If sync wakeup then subtract the (maximum possible) @@ -1254,24 +1193,26 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,  		tg = task_group(current);  		weight = current->se.load.weight; -		tl += effective_load(tg, this_cpu, -weight, -weight); +		this_load += effective_load(tg, this_cpu, -weight, -weight);  		load += effective_load(tg, prev_cpu, 0, -weight);  	}  	tg = task_group(p);  	weight = p->se.load.weight; +	imbalance = 100 + (sd->imbalance_pct - 100) / 2; +  	/*  	 * In low-load situations, where prev_cpu is idle and this_cpu is idle -	 * due to the sync cause 
above having dropped tl to 0, we'll always have -	 * an imbalance, but there's really nothing you can do about that, so -	 * that's good too. +	 * due to the sync cause above having dropped this_load to 0, we'll +	 * always have an imbalance, but there's really nothing you can do +	 * about that, so that's good too.  	 *  	 * Otherwise check if either cpus are near enough in load to allow this  	 * task to be woken on this_cpu.  	 */ -	balanced = !tl || -		100*(tl + effective_load(tg, this_cpu, weight, weight)) <= +	balanced = !this_load || +		100*(this_load + effective_load(tg, this_cpu, weight, weight)) <=  		imbalance*(load + effective_load(tg, prev_cpu, 0, weight));  	/* @@ -1285,14 +1226,15 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,  	schedstat_inc(p, se.nr_wakeups_affine_attempts);  	tl_per_task = cpu_avg_load_per_task(this_cpu); -	if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <= -			tl_per_task)) { +	if (balanced || +	    (this_load <= load && +	     this_load + target_load(prev_cpu, idx) <= tl_per_task)) {  		/*  		 * This domain has SD_WAKE_AFFINE and  		 * p is cache cold in this domain, and  		 * there is no bad imbalance.  		 */ -		schedstat_inc(this_sd, ttwu_move_affine); +		schedstat_inc(sd, ttwu_move_affine);  		schedstat_inc(p, se.nr_wakeups_affine);  		return 1; @@ -1300,65 +1242,215 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,  	return 0;  } -static int select_task_rq_fair(struct task_struct *p, int sync) +/* + * find_idlest_group finds and returns the least busy CPU group within the + * domain. + */ +static struct sched_group * +find_idlest_group(struct sched_domain *sd, struct task_struct *p, +		  int this_cpu, int load_idx)  { -	struct sched_domain *sd, *this_sd = NULL; -	int prev_cpu, this_cpu, new_cpu; -	unsigned long load, this_load; -	struct rq *this_rq; -	unsigned int imbalance; -	int idx; +	struct sched_group *idlest = NULL, *this = NULL, *group = sd->groups; +	unsigned long min_load = ULONG_MAX, this_load = 0; +	int imbalance = 100 + (sd->imbalance_pct-100)/2; -	prev_cpu	= task_cpu(p); -	this_cpu	= smp_processor_id(); -	this_rq		= cpu_rq(this_cpu); -	new_cpu		= prev_cpu; +	do { +		unsigned long load, avg_load; +		int local_group; +		int i; -	/* -	 * 'this_sd' is the first domain that both -	 * this_cpu and prev_cpu are present in: -	 */ -	for_each_domain(this_cpu, sd) { -		if (cpumask_test_cpu(prev_cpu, sched_domain_span(sd))) { -			this_sd = sd; -			break; +		/* Skip over this group if it has no CPUs allowed */ +		if (!cpumask_intersects(sched_group_cpus(group), +					&p->cpus_allowed)) +			continue; + +		local_group = cpumask_test_cpu(this_cpu, +					       sched_group_cpus(group)); + +		/* Tally up the load of all CPUs in the group */ +		avg_load = 0; + +		for_each_cpu(i, sched_group_cpus(group)) { +			/* Bias balancing toward cpus of our domain */ +			if (local_group) +				load = source_load(i, load_idx); +			else +				load = target_load(i, load_idx); + +			avg_load += load; +		} + +		/* Adjust by relative CPU power of the group */ +		avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + +		if (local_group) { +			this_load = avg_load; +			this = group; +		} else if (avg_load < min_load) { +			min_load = avg_load; +			idlest = group; +		} +	} while (group = group->next, group != sd->groups); + +	if (!idlest || 100*this_load < imbalance*min_load) +		return NULL; +	return idlest; +} + +/* + * find_idlest_cpu - find the idlest cpu among the cpus in group. 
+ */ +static int +find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +{ +	unsigned long load, min_load = ULONG_MAX; +	int idlest = -1; +	int i; + +	/* Traverse only the allowed CPUs */ +	for_each_cpu_and(i, sched_group_cpus(group), &p->cpus_allowed) { +		load = weighted_cpuload(i); + +		if (load < min_load || (load == min_load && i == this_cpu)) { +			min_load = load; +			idlest = i;  		}  	} -	if (unlikely(!cpumask_test_cpu(this_cpu, &p->cpus_allowed))) -		goto out; +	return idlest; +} -	/* -	 * Check for affine wakeup and passive balancing possibilities. -	 */ -	if (!this_sd) +/* + * sched_balance_self: balance the current task (running on cpu) in domains + * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and + * SD_BALANCE_EXEC. + * + * Balance, ie. select the least loaded group. + * + * Returns the target CPU number, or the same CPU if no balancing is needed. + * + * preempt must be disabled. + */ +static int select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) +{ +	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; +	int cpu = smp_processor_id(); +	int prev_cpu = task_cpu(p); +	int new_cpu = cpu; +	int want_affine = 0; +	int want_sd = 1; +	int sync = wake_flags & WF_SYNC; + +	if (sd_flag & SD_BALANCE_WAKE) { +		if (sched_feat(AFFINE_WAKEUPS)) +			want_affine = 1; +		new_cpu = prev_cpu; +	} + +	rcu_read_lock(); +	for_each_domain(cpu, tmp) { +		/* +		 * If power savings logic is enabled for a domain, see if we +		 * are not overloaded, if so, don't balance wider. +		 */ +		if (tmp->flags & (SD_POWERSAVINGS_BALANCE|SD_PREFER_LOCAL)) { +			unsigned long power = 0; +			unsigned long nr_running = 0; +			unsigned long capacity; +			int i; + +			for_each_cpu(i, sched_domain_span(tmp)) { +				power += power_of(i); +				nr_running += cpu_rq(i)->cfs.nr_running; +			} + +			capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE); + +			if (tmp->flags & SD_POWERSAVINGS_BALANCE) +				nr_running /= 2; + +			if (nr_running < capacity) +				want_sd = 0; +		} + +		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && +		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { + +			affine_sd = tmp; +			want_affine = 0; +		} + +		if (!want_sd && !want_affine) +			break; + +		if (!(tmp->flags & sd_flag)) +			continue; + +		if (want_sd) +			sd = tmp; +	} + +	if (sched_feat(LB_SHARES_UPDATE)) { +		/* +		 * Pick the largest domain to update shares over +		 */ +		tmp = sd; +		if (affine_sd && (!tmp || +				  cpumask_weight(sched_domain_span(affine_sd)) > +				  cpumask_weight(sched_domain_span(sd)))) +			tmp = affine_sd; + +		if (tmp) +			update_shares(tmp); +	} + +	if (affine_sd && wake_affine(affine_sd, p, sync)) { +		new_cpu = cpu;  		goto out; +	} -	idx = this_sd->wake_idx; +	while (sd) { +		int load_idx = sd->forkexec_idx; +		struct sched_group *group; +		int weight; -	imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; +		if (!(sd->flags & sd_flag)) { +			sd = sd->child; +			continue; +		} -	load = source_load(prev_cpu, idx); -	this_load = target_load(this_cpu, idx); +		if (sd_flag & SD_BALANCE_WAKE) +			load_idx = sd->wake_idx; -	if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, -				     load, this_load, imbalance)) -		return this_cpu; +		group = find_idlest_group(sd, p, cpu, load_idx); +		if (!group) { +			sd = sd->child; +			continue; +		} -	/* -	 * Start passive balancing when half the imbalance_pct -	 * limit is reached. 
-	 */ -	if (this_sd->flags & SD_WAKE_BALANCE) { -		if (imbalance*this_load <= 100*load) { -			schedstat_inc(this_sd, ttwu_move_balance); -			schedstat_inc(p, se.nr_wakeups_passive); -			return this_cpu; +		new_cpu = find_idlest_cpu(group, p, cpu); +		if (new_cpu == -1 || new_cpu == cpu) { +			/* Now try balancing at a lower domain level of cpu */ +			sd = sd->child; +			continue;  		} + +		/* Now try balancing at a lower domain level of new_cpu */ +		cpu = new_cpu; +		weight = cpumask_weight(sched_domain_span(sd)); +		sd = NULL; +		for_each_domain(cpu, tmp) { +			if (weight <= cpumask_weight(sched_domain_span(tmp))) +				break; +			if (tmp->flags & sd_flag) +				sd = tmp; +		} +		/* while loop will break here if sd == NULL */  	}  out: -	return wake_idle(new_cpu, p); +	rcu_read_unlock(); +	return new_cpu;  }  #endif /* CONFIG_SMP */ @@ -1471,11 +1563,12 @@ static void set_next_buddy(struct sched_entity *se)  /*   * Preempt the current task with a newly woken task if needed:   */ -static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) +static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)  {  	struct task_struct *curr = rq->curr;  	struct sched_entity *se = &curr->se, *pse = &p->se;  	struct cfs_rq *cfs_rq = task_cfs_rq(curr); +	int sync = wake_flags & WF_SYNC;  	update_curr(cfs_rq); @@ -1501,7 +1594,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)  	 */  	if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))  		set_last_buddy(se); -	set_next_buddy(pse); +	if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK)) +		set_next_buddy(pse);  	/*  	 * We can come here with TIF_NEED_RESCHED already set from new task @@ -1523,16 +1617,25 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)  		return;  	} -	if (!sched_feat(WAKEUP_PREEMPT)) -		return; - -	if (sched_feat(WAKEUP_OVERLAP) && (sync || -			(se->avg_overlap < sysctl_sched_migration_cost && -			 pse->avg_overlap < sysctl_sched_migration_cost))) { +	if ((sched_feat(WAKEUP_SYNC) && sync) || +	    (sched_feat(WAKEUP_OVERLAP) && +	     (se->avg_overlap < sysctl_sched_migration_cost && +	      pse->avg_overlap < sysctl_sched_migration_cost))) {  		resched_task(curr);  		return;  	} +	if (sched_feat(WAKEUP_RUNNING)) { +		if (pse->avg_running < se->avg_running) { +			set_next_buddy(pse); +			resched_task(curr); +			return; +		} +	} + +	if (!sched_feat(WAKEUP_PREEMPT)) +		return; +  	find_matching_se(&se, &pse);  	BUG_ON(!pse); @@ -1555,8 +1658,13 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)  		/*  		 * If se was a buddy, clear it so that it will have to earn  		 * the favour again. +		 * +		 * If se was not a buddy, clear the buddies because neither +		 * was elegible to run, let them earn it again. +		 * +		 * IOW. unconditionally clear buddies.  		 */ -		__clear_buddies(cfs_rq, se); +		__clear_buddies(cfs_rq, NULL);  		set_next_entity(cfs_rq, se);  		cfs_rq = group_cfs_rq(se);  	} while (cfs_rq); diff --git a/kernel/sched_features.h b/kernel/sched_features.h index e2dc63a5815..0d94083582c 100644 --- a/kernel/sched_features.h +++ b/kernel/sched_features.h @@ -1,17 +1,123 @@ -SCHED_FEAT(NEW_FAIR_SLEEPERS, 0) +/* + * Disregards a certain amount of sleep time (sched_latency_ns) and + * considers the task to be running during that period. This gives it + * a service deficit on wakeup, allowing it to run sooner. 
+ */ +SCHED_FEAT(FAIR_SLEEPERS, 1) + +/* + * Only give sleepers 50% of their service deficit. This allows + * them to run sooner, but does not allow tons of sleepers to + * rip the spread apart. + */ +SCHED_FEAT(GENTLE_FAIR_SLEEPERS, 1) + +/* + * By not normalizing the sleep time, heavy tasks get an effective + * longer period, and lighter task an effective shorter period they + * are considered running. + */  SCHED_FEAT(NORMALIZED_SLEEPER, 0) -SCHED_FEAT(ADAPTIVE_GRAN, 1) -SCHED_FEAT(WAKEUP_PREEMPT, 1) + +/* + * Place new tasks ahead so that they do not starve already running + * tasks + */  SCHED_FEAT(START_DEBIT, 1) + +/* + * Should wakeups try to preempt running tasks. + */ +SCHED_FEAT(WAKEUP_PREEMPT, 1) + +/* + * Compute wakeup_gran based on task behaviour, clipped to + *  [0, sched_wakeup_gran_ns] + */ +SCHED_FEAT(ADAPTIVE_GRAN, 1) + +/* + * When converting the wakeup granularity to virtual time, do it such + * that heavier tasks preempting a lighter task have an edge. + */ +SCHED_FEAT(ASYM_GRAN, 1) + +/* + * Always wakeup-preempt SYNC wakeups, see SYNC_WAKEUPS. + */ +SCHED_FEAT(WAKEUP_SYNC, 0) + +/* + * Wakeup preempt based on task behaviour. Tasks that do not overlap + * don't get preempted. + */ +SCHED_FEAT(WAKEUP_OVERLAP, 0) + +/* + * Wakeup preemption towards tasks that run short + */ +SCHED_FEAT(WAKEUP_RUNNING, 0) + +/* + * Use the SYNC wakeup hint, pipes and the likes use this to indicate + * the remote end is likely to consume the data we just wrote, and + * therefore has cache benefit from being placed on the same cpu, see + * also AFFINE_WAKEUPS. + */ +SCHED_FEAT(SYNC_WAKEUPS, 1) + +/* + * Based on load and program behaviour, see if it makes sense to place + * a newly woken task on the same cpu as the task that woke it -- + * improve cache locality. Typically used with SYNC wakeups as + * generated by pipes and the like, see also SYNC_WAKEUPS. + */  SCHED_FEAT(AFFINE_WAKEUPS, 1) + +/* + * Weaken SYNC hint based on overlap + */ +SCHED_FEAT(SYNC_LESS, 1) + +/* + * Add SYNC hint based on overlap + */ +SCHED_FEAT(SYNC_MORE, 0) + +/* + * Prefer to schedule the task we woke last (assuming it failed + * wakeup-preemption), since its likely going to consume data we + * touched, increases cache locality. + */ +SCHED_FEAT(NEXT_BUDDY, 0) + +/* + * Prefer to schedule the task that ran last (when we did + * wake-preempt) as that likely will touch the same data, increases + * cache locality. + */ +SCHED_FEAT(LAST_BUDDY, 1) + +/* + * Consider buddies to be cache hot, decreases the likelyness of a + * cache buddy being migrated away, increases cache locality. + */  SCHED_FEAT(CACHE_HOT_BUDDY, 1) -SCHED_FEAT(SYNC_WAKEUPS, 1) + +/* + * Use arch dependent cpu power functions + */ +SCHED_FEAT(ARCH_POWER, 0) +  SCHED_FEAT(HRTICK, 0)  SCHED_FEAT(DOUBLE_TICK, 0) -SCHED_FEAT(ASYM_GRAN, 1)  SCHED_FEAT(LB_BIAS, 1) -SCHED_FEAT(LB_WAKEUP_UPDATE, 1) +SCHED_FEAT(LB_SHARES_UPDATE, 1)  SCHED_FEAT(ASYM_EFF_LOAD, 1) -SCHED_FEAT(WAKEUP_OVERLAP, 0) -SCHED_FEAT(LAST_BUDDY, 1) + +/* + * Spin-wait on mutex acquisition when the mutex owner is running on + * another cpu -- assumes that when the owner is running, it will soon + * release the lock. Decreases scheduling overhead. 
+ */  SCHED_FEAT(OWNER_SPIN, 1) diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 499672c10cb..a8b448af004 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -6,7 +6,7 @@   */  #ifdef CONFIG_SMP -static int select_task_rq_idle(struct task_struct *p, int sync) +static int select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)  {  	return task_cpu(p); /* IDLE tasks as never migrated */  } @@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync)  /*   * Idle tasks are unconditionally rescheduled:   */ -static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync) +static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)  {  	resched_task(rq->idle);  } diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 2eb4bd6a526..13de7126a6a 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -938,10 +938,13 @@ static void yield_task_rt(struct rq *rq)  #ifdef CONFIG_SMP  static int find_lowest_rq(struct task_struct *task); -static int select_task_rq_rt(struct task_struct *p, int sync) +static int select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)  {  	struct rq *rq = task_rq(p); +	if (sd_flag != SD_BALANCE_WAKE) +		return smp_processor_id(); +  	/*  	 * If the current task is an RT task, then  	 * try to see if we can wake this RT task up on another @@ -999,7 +1002,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)  /*   * Preempt the current task with a newly woken task if needed:   */ -static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync) +static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)  {  	if (p->prio < rq->curr->prio) {  		resched_task(rq->curr); diff --git a/kernel/smp.c b/kernel/smp.c index 94188b8ecc3..8e218500ab1 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -177,6 +177,11 @@ void generic_smp_call_function_interrupt(void)  	int cpu = get_cpu();  	/* +	 * Shouldn't receive this interrupt on a cpu that is not yet online. +	 */ +	WARN_ON_ONCE(!cpu_online(cpu)); + +	/*  	 * Ensure entry is visible on call_function_queue after we have  	 * entered the IPI. See comment in smp_call_function_many.  	 * If we don't have this, then we may miss an entry on the list @@ -230,6 +235,11 @@ void generic_smp_call_function_single_interrupt(void)  	unsigned int data_flags;  	LIST_HEAD(list); +	/* +	 * Shouldn't receive this interrupt on a cpu that is not yet online. +	 */ +	WARN_ON_ONCE(!cpu_online(smp_processor_id())); +  	spin_lock(&q->lock);  	list_replace_init(&q->list, &list);  	spin_unlock(&q->lock); @@ -285,8 +295,14 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,  	 */  	this_cpu = get_cpu(); -	/* Can deadlock when called with interrupts disabled */ -	WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); +	/* +	 * Can deadlock when called with interrupts disabled. +	 * We allow cpu's that are not yet online though, as no one else can +	 * send smp call function interrupt to this cpu and as such deadlocks +	 * can't happen. 
+	 */ +	WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() +		     && !oops_in_progress);  	if (cpu == this_cpu) {  		local_irq_save(flags); @@ -329,8 +345,14 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,  {  	csd_lock(data); -	/* Can deadlock when called with interrupts disabled */ -	WARN_ON_ONCE(wait && irqs_disabled() && !oops_in_progress); +	/* +	 * Can deadlock when called with interrupts disabled. +	 * We allow cpu's that are not yet online though, as no one else can +	 * send smp call function interrupt to this cpu and as such deadlocks +	 * can't happen. +	 */ +	WARN_ON_ONCE(cpu_online(smp_processor_id()) && wait && irqs_disabled() +		     && !oops_in_progress);  	generic_exec_single(cpu, data, wait);  } @@ -365,8 +387,14 @@ void smp_call_function_many(const struct cpumask *mask,  	unsigned long flags;  	int cpu, next_cpu, this_cpu = smp_processor_id(); -	/* Can deadlock when called with interrupts disabled */ -	WARN_ON_ONCE(irqs_disabled() && !oops_in_progress); +	/* +	 * Can deadlock when called with interrupts disabled. +	 * We allow cpu's that are not yet online though, as no one else can +	 * send smp call function interrupt to this cpu and as such deadlocks +	 * can't happen. +	 */ +	WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() +		     && !oops_in_progress);  	/* So, what's a CPU they want? Ignoring this one. */  	cpu = cpumask_first_and(mask, cpu_online_mask); diff --git a/kernel/softirq.c b/kernel/softirq.c index 7db25067cd2..f8749e5216e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -57,7 +57,7 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp  static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);  char *softirq_to_name[NR_SOFTIRQS] = { -	"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", +	"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",  	"TASKLET", "SCHED", "HRTIMER",	"RCU"  }; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 6bb59f70740..1a631ba684a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -91,7 +91,9 @@ extern int sysctl_nr_trim_pages;  #ifdef CONFIG_RCU_TORTURE_TEST  extern int rcutorture_runnable;  #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ +#ifdef CONFIG_BLOCK  extern int blk_iopoll_enabled; +#endif  /* Constants used for minimum and  maximum */  #ifdef CONFIG_DETECT_SOFTLOCKUP @@ -998,6 +1000,7 @@ static struct ctl_table kern_table[] = {  		.proc_handler	= &proc_dointvec,  	},  #endif +#ifdef CONFIG_BLOCK  	{  		.ctl_name	= CTL_UNNUMBERED,  		.procname	= "blk_iopoll", @@ -1006,6 +1009,7 @@ static struct ctl_table kern_table[] = {  		.mode		= 0644,  		.proc_handler	= &proc_dointvec,  	}, +#endif  /*   * NOTE: do not add new entries to this table unless you have read   * Documentation/sysctl/ctl_unnumbered.txt diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 1ea0d1234f4..e7163460440 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -11,12 +11,18 @@ config NOP_TRACER  config HAVE_FTRACE_NMI_ENTER  	bool +	help +	  See Documentation/trace/ftrace-implementation.txt  config HAVE_FUNCTION_TRACER  	bool +	help +	  See Documentation/trace/ftrace-implementation.txt  config HAVE_FUNCTION_GRAPH_TRACER  	bool +	help +	  See Documentation/trace/ftrace-implementation.txt  config HAVE_FUNCTION_GRAPH_FP_TEST  	bool @@ -28,21 +34,25 @@ config HAVE_FUNCTION_GRAPH_FP_TEST  config HAVE_FUNCTION_TRACE_MCOUNT_TEST  	bool  	help -	 This gets selected when the arch tests the function_trace_stop -	 variable at the mcount call site. 
Otherwise, this variable -	 is tested by the called function. +	  See Documentation/trace/ftrace-implementation.txt  config HAVE_DYNAMIC_FTRACE  	bool +	help +	  See Documentation/trace/ftrace-implementation.txt  config HAVE_FTRACE_MCOUNT_RECORD  	bool +	help +	  See Documentation/trace/ftrace-implementation.txt  config HAVE_HW_BRANCH_TRACER  	bool  config HAVE_SYSCALL_TRACEPOINTS  	bool +	help +	  See Documentation/trace/ftrace-implementation.txt  config TRACER_MAX_TRACE  	bool @@ -469,6 +479,18 @@ config FTRACE_STARTUP_TEST  	  functioning properly. It will do tests on all the configured  	  tracers of ftrace. +config EVENT_TRACE_TEST_SYSCALLS +	bool "Run selftest on syscall events" +	depends on FTRACE_STARTUP_TEST +	help +	 This option will also enable testing every syscall event. +	 It only enables the event and disables it and runs various loads +	 with the event enabled. This adds a bit more time for kernel boot +	 up since it runs this on every system call defined. + +	 TBD - enable a way to actually call the syscalls as we test their +	       events +  config MMIOTRACE  	bool "Memory mapped IO tracing"  	depends on HAVE_MMIOTRACE_SUPPORT && PCI diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8c804e24f96..cc615f84751 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1323,11 +1323,10 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)  enum {  	FTRACE_ITER_FILTER	= (1 << 0), -	FTRACE_ITER_CONT	= (1 << 1), -	FTRACE_ITER_NOTRACE	= (1 << 2), -	FTRACE_ITER_FAILURES	= (1 << 3), -	FTRACE_ITER_PRINTALL	= (1 << 4), -	FTRACE_ITER_HASH	= (1 << 5), +	FTRACE_ITER_NOTRACE	= (1 << 1), +	FTRACE_ITER_FAILURES	= (1 << 2), +	FTRACE_ITER_PRINTALL	= (1 << 3), +	FTRACE_ITER_HASH	= (1 << 4),  };  #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ @@ -1337,8 +1336,7 @@ struct ftrace_iterator {  	int			hidx;  	int			idx;  	unsigned		flags; -	unsigned char		buffer[FTRACE_BUFF_MAX+1]; -	unsigned		buffer_idx; +	struct trace_parser	parser;  };  static void * @@ -1407,7 +1405,7 @@ static int t_hash_show(struct seq_file *m, void *v)  	if (rec->ops->print)  		return rec->ops->print(m, rec->ip, rec->ops, rec->data); -	seq_printf(m, "%pf:%pf", (void *)rec->ip, (void *)rec->ops->func); +	seq_printf(m, "%ps:%ps", (void *)rec->ip, (void *)rec->ops->func);  	if (rec->data)  		seq_printf(m, ":%p", rec->data); @@ -1517,7 +1515,7 @@ static int t_show(struct seq_file *m, void *v)  	if (!rec)  		return 0; -	seq_printf(m, "%pf\n", (void *)rec->ip); +	seq_printf(m, "%ps\n", (void *)rec->ip);  	return 0;  } @@ -1604,6 +1602,11 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)  	if (!iter)  		return -ENOMEM; +	if (trace_parser_get_init(&iter->parser, FTRACE_BUFF_MAX)) { +		kfree(iter); +		return -ENOMEM; +	} +  	mutex_lock(&ftrace_regex_lock);  	if ((file->f_mode & FMODE_WRITE) &&  	    (file->f_flags & O_TRUNC)) @@ -2059,9 +2062,9 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,  	int i, len = 0;  	char *search; -	if (glob && (strcmp(glob, "*") || !strlen(glob))) +	if (glob && (strcmp(glob, "*") == 0 || !strlen(glob)))  		glob = NULL; -	else { +	else if (glob) {  		int not;  		type = ftrace_setup_glob(glob, strlen(glob), &search, ¬); @@ -2196,9 +2199,8 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,  		   size_t cnt, loff_t *ppos, int enable)  {  	struct ftrace_iterator *iter; -	char ch; -	size_t read = 0; -	ssize_t ret; +	struct trace_parser *parser; +	ssize_t ret, read;  	if (!cnt || 
cnt < 0)  		return 0; @@ -2211,72 +2213,23 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,  	} else  		iter = file->private_data; -	if (!*ppos) { -		iter->flags &= ~FTRACE_ITER_CONT; -		iter->buffer_idx = 0; -	} - -	ret = get_user(ch, ubuf++); -	if (ret) -		goto out; -	read++; -	cnt--; +	parser = &iter->parser; +	read = trace_get_user(parser, ubuf, cnt, ppos); -	/* -	 * If the parser haven't finished with the last write, -	 * continue reading the user input without skipping spaces. -	 */ -	if (!(iter->flags & FTRACE_ITER_CONT)) { -		/* skip white space */ -		while (cnt && isspace(ch)) { -			ret = get_user(ch, ubuf++); -			if (ret) -				goto out; -			read++; -			cnt--; -		} - -		/* only spaces were written */ -		if (isspace(ch)) { -			*ppos += read; -			ret = read; -			goto out; -		} - -		iter->buffer_idx = 0; -	} - -	while (cnt && !isspace(ch)) { -		if (iter->buffer_idx < FTRACE_BUFF_MAX) -			iter->buffer[iter->buffer_idx++] = ch; -		else { -			ret = -EINVAL; -			goto out; -		} -		ret = get_user(ch, ubuf++); +	if (trace_parser_loaded(parser) && +	    !trace_parser_cont(parser)) { +		ret = ftrace_process_regex(parser->buffer, +					   parser->idx, enable);  		if (ret)  			goto out; -		read++; -		cnt--; -	} -	if (isspace(ch)) { -		iter->buffer[iter->buffer_idx] = 0; -		ret = ftrace_process_regex(iter->buffer, -					   iter->buffer_idx, enable); -		if (ret) -			goto out; -		iter->buffer_idx = 0; -	} else { -		iter->flags |= FTRACE_ITER_CONT; -		iter->buffer[iter->buffer_idx++] = ch; +		trace_parser_clear(parser);  	} -	*ppos += read;  	ret = read; - out: -	mutex_unlock(&ftrace_regex_lock); +	mutex_unlock(&ftrace_regex_lock); +out:  	return ret;  } @@ -2381,6 +2334,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)  {  	struct seq_file *m = (struct seq_file *)file->private_data;  	struct ftrace_iterator *iter; +	struct trace_parser *parser;  	mutex_lock(&ftrace_regex_lock);  	if (file->f_mode & FMODE_READ) { @@ -2390,9 +2344,10 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)  	} else  		iter = file->private_data; -	if (iter->buffer_idx) { -		iter->buffer[iter->buffer_idx] = 0; -		ftrace_match_records(iter->buffer, iter->buffer_idx, enable); +	parser = &iter->parser; +	if (trace_parser_loaded(parser)) { +		parser->buffer[parser->idx] = 0; +		ftrace_match_records(parser->buffer, parser->idx, enable);  	}  	mutex_lock(&ftrace_lock); @@ -2400,7 +2355,9 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)  		ftrace_run_update_code(FTRACE_ENABLE_CALLS);  	mutex_unlock(&ftrace_lock); +	trace_parser_put(parser);  	kfree(iter); +  	mutex_unlock(&ftrace_regex_lock);  	return 0;  } @@ -2499,7 +2456,7 @@ static int g_show(struct seq_file *m, void *v)  		return 0;  	} -	seq_printf(m, "%pf\n", v); +	seq_printf(m, "%ps\n", (void *)*ptr);  	return 0;  } @@ -2602,12 +2559,10 @@ static ssize_t  ftrace_graph_write(struct file *file, const char __user *ubuf,  		   size_t cnt, loff_t *ppos)  { -	unsigned char buffer[FTRACE_BUFF_MAX+1]; +	struct trace_parser parser;  	unsigned long *array;  	size_t read = 0;  	ssize_t ret; -	int index = 0; -	char ch;  	if (!cnt || cnt < 0)  		return 0; @@ -2625,51 +2580,26 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,  	} else  		array = file->private_data; -	ret = get_user(ch, ubuf++); -	if (ret) +	if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { +		ret = -ENOMEM;  		goto out; -	read++; -	cnt--; - -	/* skip white space */ -	while (cnt && isspace(ch)) { -		ret = 
get_user(ch, ubuf++); -		if (ret) -			goto out; -		read++; -		cnt--;  	} -	if (isspace(ch)) { -		*ppos += read; -		ret = read; -		goto out; -	} +	read = trace_get_user(&parser, ubuf, cnt, ppos); -	while (cnt && !isspace(ch)) { -		if (index < FTRACE_BUFF_MAX) -			buffer[index++] = ch; -		else { -			ret = -EINVAL; -			goto out; -		} -		ret = get_user(ch, ubuf++); +	if (trace_parser_loaded((&parser))) { +		parser.buffer[parser.idx] = 0; + +		/* we allow only one expression at a time */ +		ret = ftrace_set_func(array, &ftrace_graph_count, +					parser.buffer);  		if (ret)  			goto out; -		read++; -		cnt--;  	} -	buffer[index] = 0; - -	/* we allow only one expression at a time */ -	ret = ftrace_set_func(array, &ftrace_graph_count, buffer); -	if (ret) -		goto out; - -	file->f_pos += read;  	ret = read;   out: +	trace_parser_put(&parser);  	mutex_unlock(&graph_lock);  	return ret; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 454e74e718c..6eef38923b0 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -701,8 +701,8 @@ static int rb_head_page_set(struct ring_buffer_per_cpu *cpu_buffer,  	val &= ~RB_FLAG_MASK; -	ret = (unsigned long)cmpxchg(&list->next, -				     val | old_flag, val | new_flag); +	ret = cmpxchg((unsigned long *)&list->next, +		      val | old_flag, val | new_flag);  	/* check if the reader took the page */  	if ((ret & ~RB_FLAG_MASK) != val) @@ -794,7 +794,7 @@ static int rb_head_page_replace(struct buffer_page *old,  	val = *ptr & ~RB_FLAG_MASK;  	val |= RB_PAGE_HEAD; -	ret = cmpxchg(ptr, val, &new->list); +	ret = cmpxchg(ptr, val, (unsigned long)&new->list);  	return ret == val;  } @@ -2997,15 +2997,12 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)  }  static struct ring_buffer_event * -rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) +rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts)  { -	struct ring_buffer_per_cpu *cpu_buffer;  	struct ring_buffer_event *event;  	struct buffer_page *reader;  	int nr_loops = 0; -	cpu_buffer = buffer->buffers[cpu]; -   again:  	/*  	 * We repeat when a timestamp is encountered. 
It is possible @@ -3049,7 +3046,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)  	case RINGBUF_TYPE_DATA:  		if (ts) {  			*ts = cpu_buffer->read_stamp + event->time_delta; -			ring_buffer_normalize_time_stamp(buffer, +			ring_buffer_normalize_time_stamp(cpu_buffer->buffer,  							 cpu_buffer->cpu, ts);  		}  		return event; @@ -3168,7 +3165,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)  	local_irq_save(flags);  	if (dolock)  		spin_lock(&cpu_buffer->reader_lock); -	event = rb_buffer_peek(buffer, cpu, ts); +	event = rb_buffer_peek(cpu_buffer, ts);  	if (event && event->type_len == RINGBUF_TYPE_PADDING)  		rb_advance_reader(cpu_buffer);  	if (dolock) @@ -3237,7 +3234,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)  	if (dolock)  		spin_lock(&cpu_buffer->reader_lock); -	event = rb_buffer_peek(buffer, cpu, ts); +	event = rb_buffer_peek(cpu_buffer, ts);  	if (event)  		rb_advance_reader(cpu_buffer); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5c75deeefe3..fd52a19dd17 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -339,6 +339,112 @@ static struct {  int trace_clock_id; +/* + * trace_parser_get_init - gets the buffer for trace parser + */ +int trace_parser_get_init(struct trace_parser *parser, int size) +{ +	memset(parser, 0, sizeof(*parser)); + +	parser->buffer = kmalloc(size, GFP_KERNEL); +	if (!parser->buffer) +		return 1; + +	parser->size = size; +	return 0; +} + +/* + * trace_parser_put - frees the buffer for trace parser + */ +void trace_parser_put(struct trace_parser *parser) +{ +	kfree(parser->buffer); +} + +/* + * trace_get_user - reads the user input string separated by  space + * (matched by isspace(ch)) + * + * For each string found the 'struct trace_parser' is updated, + * and the function returns. + * + * Returns number of bytes read. + * + * See kernel/trace/trace.h for 'struct trace_parser' details. + */ +int trace_get_user(struct trace_parser *parser, const char __user *ubuf, +	size_t cnt, loff_t *ppos) +{ +	char ch; +	size_t read = 0; +	ssize_t ret; + +	if (!*ppos) +		trace_parser_clear(parser); + +	ret = get_user(ch, ubuf++); +	if (ret) +		goto out; + +	read++; +	cnt--; + +	/* +	 * The parser is not finished with the last write, +	 * continue reading the user input without skipping spaces. +	 */ +	if (!parser->cont) { +		/* skip white space */ +		while (cnt && isspace(ch)) { +			ret = get_user(ch, ubuf++); +			if (ret) +				goto out; +			read++; +			cnt--; +		} + +		/* only spaces were written */ +		if (isspace(ch)) { +			*ppos += read; +			ret = read; +			goto out; +		} + +		parser->idx = 0; +	} + +	/* read the non-space input */ +	while (cnt && !isspace(ch)) { +		if (parser->idx < parser->size) +			parser->buffer[parser->idx++] = ch; +		else { +			ret = -EINVAL; +			goto out; +		} +		ret = get_user(ch, ubuf++); +		if (ret) +			goto out; +		read++; +		cnt--; +	} + +	/* We either got finished input or we have to wait for another call. 
*/ +	if (isspace(ch)) { +		parser->buffer[parser->idx] = 0; +		parser->cont = false; +	} else { +		parser->cont = true; +		parser->buffer[parser->idx++] = ch; +	} + +	*ppos += read; +	ret = read; + +out: +	return ret; +} +  ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)  {  	int len; @@ -719,6 +825,11 @@ static void trace_init_cmdlines(void)  	cmdline_idx = 0;  } +int is_tracing_stopped(void) +{ +	return trace_stop_count; +} +  /**   * ftrace_off_permanent - disable all ftrace code permanently   * @@ -886,7 +997,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,  	entry->preempt_count		= pc & 0xff;  	entry->pid			= (tsk) ? tsk->pid : 0; -	entry->tgid			= (tsk) ? tsk->tgid : 0; +	entry->lock_depth		= (tsk) ? tsk->lock_depth : 0;  	entry->flags =  #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT  		(irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | @@ -1068,6 +1179,7 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)  		return;  	entry	= ring_buffer_event_data(event); +	entry->tgid		= current->tgid;  	memset(&entry->caller, 0, sizeof(entry->caller));  	trace.nr_entries	= 0; @@ -1094,6 +1206,7 @@ ftrace_trace_special(void *__tr,  		     unsigned long arg1, unsigned long arg2, unsigned long arg3,  		     int pc)  { +	struct ftrace_event_call *call = &event_special;  	struct ring_buffer_event *event;  	struct trace_array *tr = __tr;  	struct ring_buffer *buffer = tr->buffer; @@ -1107,7 +1220,9 @@ ftrace_trace_special(void *__tr,  	entry->arg1			= arg1;  	entry->arg2			= arg2;  	entry->arg3			= arg3; -	trace_buffer_unlock_commit(buffer, event, 0, pc); + +	if (!filter_check_discard(call, entry, buffer, event)) +		trace_buffer_unlock_commit(buffer, event, 0, pc);  }  void @@ -1530,10 +1645,10 @@ static void print_lat_help_header(struct seq_file *m)  	seq_puts(m, "#                | / _----=> need-resched    \n");  	seq_puts(m, "#                || / _---=> hardirq/softirq \n");  	seq_puts(m, "#                ||| / _--=> preempt-depth   \n"); -	seq_puts(m, "#                |||| /                      \n"); -	seq_puts(m, "#                |||||     delay             \n"); -	seq_puts(m, "#  cmd     pid   ||||| time  |   caller      \n"); -	seq_puts(m, "#     \\   /      |||||   \\   |   /           \n"); +	seq_puts(m, "#                |||| /_--=> lock-depth       \n"); +	seq_puts(m, "#                |||||/     delay             \n"); +	seq_puts(m, "#  cmd     pid   |||||| time  |   caller      \n"); +	seq_puts(m, "#     \\   /      ||||||   \\   |   /           \n");  }  static void print_func_help_header(struct seq_file *m) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index fa1dccb579d..86bcff94791 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -7,6 +7,7 @@  #include <linux/clocksource.h>  #include <linux/ring_buffer.h>  #include <linux/mmiotrace.h> +#include <linux/tracepoint.h>  #include <linux/ftrace.h>  #include <trace/boot.h>  #include <linux/kmemtrace.h> @@ -42,157 +43,54 @@ enum trace_type {  	__TRACE_LAST_TYPE,  }; -/* - * Function trace entry - function address and parent function addres: - */ -struct ftrace_entry { -	struct trace_entry	ent; -	unsigned long		ip; -	unsigned long		parent_ip; -}; - -/* Function call entry */ -struct ftrace_graph_ent_entry { -	struct trace_entry		ent; -	struct ftrace_graph_ent		graph_ent; +enum kmemtrace_type_id { +	KMEMTRACE_TYPE_KMALLOC = 0,	/* kmalloc() or kfree(). */ +	KMEMTRACE_TYPE_CACHE,		/* kmem_cache_*(). 
*/ +	KMEMTRACE_TYPE_PAGES,		/* __get_free_pages() and friends. */  }; -/* Function return entry */ -struct ftrace_graph_ret_entry { -	struct trace_entry		ent; -	struct ftrace_graph_ret		ret; -};  extern struct tracer boot_tracer; -/* - * Context switch trace entry - which task (and prio) we switched from/to: - */ -struct ctx_switch_entry { -	struct trace_entry	ent; -	unsigned int		prev_pid; -	unsigned char		prev_prio; -	unsigned char		prev_state; -	unsigned int		next_pid; -	unsigned char		next_prio; -	unsigned char		next_state; -	unsigned int		next_cpu; -}; - -/* - * Special (free-form) trace entry: - */ -struct special_entry { -	struct trace_entry	ent; -	unsigned long		arg1; -	unsigned long		arg2; -	unsigned long		arg3; -}; - -/* - * Stack-trace entry: - */ - -#define FTRACE_STACK_ENTRIES	8 - -struct stack_entry { -	struct trace_entry	ent; -	unsigned long		caller[FTRACE_STACK_ENTRIES]; -}; - -struct userstack_entry { -	struct trace_entry	ent; -	unsigned long		caller[FTRACE_STACK_ENTRIES]; -}; - -/* - * trace_printk entry: - */ -struct bprint_entry { -	struct trace_entry	ent; -	unsigned long		ip; -	const char		*fmt; -	u32			buf[]; -}; +#undef __field +#define __field(type, item)		type	item; -struct print_entry { -	struct trace_entry	ent; -	unsigned long		ip; -	char			buf[]; -}; +#undef __field_struct +#define __field_struct(type, item)	__field(type, item) -#define TRACE_OLD_SIZE		88 +#undef __field_desc +#define __field_desc(type, container, item) -struct trace_field_cont { -	unsigned char		type; -	/* Temporary till we get rid of this completely */ -	char			buf[TRACE_OLD_SIZE - 1]; -}; +#undef __array +#define __array(type, item, size)	type	item[size]; -struct trace_mmiotrace_rw { -	struct trace_entry	ent; -	struct mmiotrace_rw	rw; -}; +#undef __array_desc +#define __array_desc(type, container, item, size) -struct trace_mmiotrace_map { -	struct trace_entry	ent; -	struct mmiotrace_map	map; -}; +#undef __dynamic_array +#define __dynamic_array(type, item)	type	item[]; -struct trace_boot_call { -	struct trace_entry	ent; -	struct boot_trace_call boot_call; -}; +#undef F_STRUCT +#define F_STRUCT(args...)		args -struct trace_boot_ret { -	struct trace_entry	ent; -	struct boot_trace_ret boot_ret; -}; - -#define TRACE_FUNC_SIZE 30 -#define TRACE_FILE_SIZE 20 -struct trace_branch { -	struct trace_entry	ent; -	unsigned	        line; -	char			func[TRACE_FUNC_SIZE+1]; -	char			file[TRACE_FILE_SIZE+1]; -	char			correct; -}; - -struct hw_branch_entry { -	struct trace_entry	ent; -	u64			from; -	u64			to; -}; - -struct trace_power { -	struct trace_entry	ent; -	struct power_trace	state_data; -}; +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print)	\ +	struct struct_name {					\ +		struct trace_entry	ent;			\ +		tstruct						\ +	} -enum kmemtrace_type_id { -	KMEMTRACE_TYPE_KMALLOC = 0,	/* kmalloc() or kfree(). */ -	KMEMTRACE_TYPE_CACHE,		/* kmem_cache_*(). */ -	KMEMTRACE_TYPE_PAGES,		/* __get_free_pages() and friends. */ -}; +#undef TP_ARGS +#define TP_ARGS(args...)	
args -struct kmemtrace_alloc_entry { -	struct trace_entry	ent; -	enum kmemtrace_type_id type_id; -	unsigned long call_site; -	const void *ptr; -	size_t bytes_req; -	size_t bytes_alloc; -	gfp_t gfp_flags; -	int node; -}; +#undef FTRACE_ENTRY_DUP +#define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk) -struct kmemtrace_free_entry { -	struct trace_entry	ent; -	enum kmemtrace_type_id type_id; -	unsigned long call_site; -	const void *ptr; -}; +#include "trace_entries.h" +/* + * syscalls are special, and need special handling, this is why + * they are not included in trace_entries.h + */  struct syscall_trace_enter {  	struct trace_entry	ent;  	int			nr; @@ -205,13 +103,12 @@ struct syscall_trace_exit {  	unsigned long		ret;  }; -  /*   * trace_flag_type is an enumeration that holds different   * states when a trace occurs. These are:   *  IRQS_OFF		- interrupts were disabled   *  IRQS_NOSUPPORT	- arch does not support irqs_disabled_flags - *  NEED_RESCED		- reschedule is requested + *  NEED_RESCHED	- reschedule is requested   *  HARDIRQ		- inside an interrupt handler   *  SOFTIRQ		- inside a softirq handler   */ @@ -390,7 +287,6 @@ struct tracer {  	struct tracer		*next;  	int			print_max;  	struct tracer_flags	*flags; -	struct tracer_stat	*stats;  }; @@ -469,6 +365,7 @@ void tracing_stop_sched_switch_record(void);  void tracing_start_sched_switch_record(void);  int register_tracer(struct tracer *type);  void unregister_tracer(struct tracer *type); +int is_tracing_stopped(void);  extern unsigned long nsecs_to_usecs(unsigned long nsecs); @@ -509,20 +406,6 @@ static inline void __trace_stack(struct trace_array *tr, unsigned long flags,  extern cycle_t ftrace_now(int cpu); -#ifdef CONFIG_CONTEXT_SWITCH_TRACER -typedef void -(*tracer_switch_func_t)(void *private, -			void *__rq, -			struct task_struct *prev, -			struct task_struct *next); - -struct tracer_switch_ops { -	tracer_switch_func_t		func; -	void				*private; -	struct tracer_switch_ops	*next; -}; -#endif /* CONFIG_CONTEXT_SWITCH_TRACER */ -  extern void trace_find_cmdline(int pid, char comm[]);  #ifdef CONFIG_DYNAMIC_FTRACE @@ -638,6 +521,41 @@ static inline int ftrace_trace_task(struct task_struct *task)  #endif  /* + * struct trace_parser - servers for reading the user input separated by spaces + * @cont: set if the input is not complete - no final space char was found + * @buffer: holds the parsed user input + * @idx: user input lenght + * @size: buffer size + */ +struct trace_parser { +	bool		cont; +	char		*buffer; +	unsigned	idx; +	unsigned	size; +}; + +static inline bool trace_parser_loaded(struct trace_parser *parser) +{ +	return (parser->idx != 0); +} + +static inline bool trace_parser_cont(struct trace_parser *parser) +{ +	return parser->cont; +} + +static inline void trace_parser_clear(struct trace_parser *parser) +{ +	parser->cont = false; +	parser->idx = 0; +} + +extern int trace_parser_get_init(struct trace_parser *parser, int size); +extern void trace_parser_put(struct trace_parser *parser); +extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, +	size_t cnt, loff_t *ppos); + +/*   * trace_iterator_flags is an enumeration that defines bit   * positions into trace_flags that controls the output.   
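Returning to the trace_parser API declared just above: a minimal usage sketch, modeled on the reworked ftrace_regex_write() and ftrace_event_write() elsewhere in this series. example_write() is a made-up debugfs-style write handler, and pr_info() stands in for whatever would actually consume the parsed token:

static ssize_t example_write(struct file *filp, const char __user *ubuf,
			     size_t cnt, loff_t *ppos)
{
	struct trace_parser parser;
	ssize_t read;

	/* allocate the token buffer; non-zero means allocation failed */
	if (trace_parser_get_init(&parser, 128))
		return -ENOMEM;

	/* copy one whitespace-separated token from user space */
	read = trace_get_user(&parser, ubuf, cnt, ppos);

	/* act only once a complete token is buffered (no continuation pending) */
	if (trace_parser_loaded(&parser) && !trace_parser_cont(&parser)) {
		parser.buffer[parser.idx] = 0;		/* terminate the token */
		pr_info("token: %s\n", parser.buffer);	/* placeholder consumer */
		trace_parser_clear(&parser);		/* ready for the next token */
	}

	trace_parser_put(&parser);
	return read;
}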
* @@ -823,58 +741,18 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,  	return 0;  } -#define DEFINE_COMPARISON_PRED(type)					\ -static int filter_pred_##type(struct filter_pred *pred, void *event,	\ -			      int val1, int val2)			\ -{									\ -	type *addr = (type *)(event + pred->offset);			\ -	type val = (type)pred->val;					\ -	int match = 0;							\ -									\ -	switch (pred->op) {						\ -	case OP_LT:							\ -		match = (*addr < val);					\ -		break;							\ -	case OP_LE:							\ -		match = (*addr <= val);					\ -		break;							\ -	case OP_GT:							\ -		match = (*addr > val);					\ -		break;							\ -	case OP_GE:							\ -		match = (*addr >= val);					\ -		break;							\ -	default:							\ -		break;							\ -	}								\ -									\ -	return match;							\ -} - -#define DEFINE_EQUALITY_PRED(size)					\ -static int filter_pred_##size(struct filter_pred *pred, void *event,	\ -			      int val1, int val2)			\ -{									\ -	u##size *addr = (u##size *)(event + pred->offset);		\ -	u##size val = (u##size)pred->val;				\ -	int match;							\ -									\ -	match = (val == *addr) ^ pred->not;				\ -									\ -	return match;							\ -} -  extern struct mutex event_mutex;  extern struct list_head ftrace_events;  extern const char *__start___trace_bprintk_fmt[];  extern const char *__stop___trace_bprintk_fmt[]; -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\ +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(call, struct_name, id, tstruct, print)		\  	extern struct ftrace_event_call event_##call; -#undef TRACE_EVENT_FORMAT_NOFILTER -#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, tpfmt) -#include "trace_event_types.h" +#undef FTRACE_ENTRY_DUP +#define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print)		\ +	FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) +#include "trace_entries.h"  #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 19bfc75d467..c21d5f3956a 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -129,6 +129,7 @@ struct tracer boot_tracer __read_mostly =  void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)  { +	struct ftrace_event_call *call = &event_boot_call;  	struct ring_buffer_event *event;  	struct ring_buffer *buffer;  	struct trace_boot_call *entry; @@ -150,13 +151,15 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)  		goto out;  	entry	= ring_buffer_event_data(event);  	entry->boot_call = *bt; -	trace_buffer_unlock_commit(buffer, event, 0, 0); +	if (!filter_check_discard(call, entry, buffer, event)) +		trace_buffer_unlock_commit(buffer, event, 0, 0);   out:  	preempt_enable();  }  void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)  { +	struct ftrace_event_call *call = &event_boot_ret;  	struct ring_buffer_event *event;  	struct ring_buffer *buffer;  	struct trace_boot_ret *entry; @@ -175,7 +178,8 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)  		goto out;  	entry	= ring_buffer_event_data(event);  	entry->boot_ret = *bt; -	trace_buffer_unlock_commit(buffer, event, 0, 0); +	if (!filter_check_discard(call, entry, buffer, event)) +		trace_buffer_unlock_commit(buffer, event, 0, 0);   out:  	preempt_enable();  } diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index b588fd81f7f..20c5f92e28a 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -66,10 +66,14 @@ u64 notrace trace_clock(void)   * 
Used by plugins that need globally coherent timestamps.   */ -static u64 prev_trace_clock_time; - -static raw_spinlock_t trace_clock_lock ____cacheline_aligned_in_smp = -	(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; +/* keep prev_time and lock in the same cacheline. */ +static struct { +	u64 prev_time; +	raw_spinlock_t lock; +} trace_clock_struct ____cacheline_aligned_in_smp = +	{ +		.lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED, +	};  u64 notrace trace_clock_global(void)  { @@ -88,19 +92,19 @@ u64 notrace trace_clock_global(void)  	if (unlikely(in_nmi()))  		goto out; -	__raw_spin_lock(&trace_clock_lock); +	__raw_spin_lock(&trace_clock_struct.lock);  	/*  	 * TODO: if this happens often then maybe we should reset -	 * my_scd->clock to prev_trace_clock_time+1, to make sure +	 * my_scd->clock to prev_time+1, to make sure  	 * we start ticking with the local clock from now on?  	 */ -	if ((s64)(now - prev_trace_clock_time) < 0) -		now = prev_trace_clock_time + 1; +	if ((s64)(now - trace_clock_struct.prev_time) < 0) +		now = trace_clock_struct.prev_time + 1; -	prev_trace_clock_time = now; +	trace_clock_struct.prev_time = now; -	__raw_spin_unlock(&trace_clock_lock); +	__raw_spin_unlock(&trace_clock_struct.lock);   out:  	raw_local_irq_restore(flags); diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h new file mode 100644 index 00000000000..a431748ddd6 --- /dev/null +++ b/kernel/trace/trace_entries.h @@ -0,0 +1,383 @@ +/* + * This file defines the trace event structures that go into the ring + * buffer directly. They are created via macros so that changes for them + * appear in the format file. Using macros will automate this process. + * + * The macro used to create a ftrace data structure is: + * + * FTRACE_ENTRY( name, struct_name, id, structure, print ) + * + * @name: the name used the event name, as well as the name of + *   the directory that holds the format file. + * + * @struct_name: the name of the structure that is created. + * + * @id: The event identifier that is used to detect what event + *    this is from the ring buffer. + * + * @structure: the structure layout + * + *  - __field(	type,	item	) + *	  This is equivalent to declaring + *		type	item; + *	  in the structure. + *  - __array(	type,	item,	size	) + *	  This is equivalent to declaring + *		type	item[size]; + *	  in the structure. + * + *   * for structures within structures, the format of the internal + *	structure is layed out. This allows the internal structure + *	to be deciphered for the format file. Although these macros + *	may become out of sync with the internal structure, they + *	will create a compile error if it happens. Since the + *	internel structures are just tracing helpers, this is not + *	an issue. + * + *	When an internal structure is used, it should use: + * + *	__field_struct(	type,	item	) + * + *	instead of __field. This will prevent it from being shown in + *	the output file. The fields in the structure should use. + * + *	__field_desc(	type,	container,	item		) + *	__array_desc(	type,	container,	item,	len	) + * + *	type, item and len are the same as __field and __array, but + *	container is added. This is the name of the item in + *	__field_struct that this is describing. + * + * + * @print: the print format shown to users in the format file. 
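By way of illustration (a sketch based on the struct-generating FTRACE_ENTRY definition that trace.h now carries): the first entry below, FTRACE_ENTRY(function, ftrace_entry, TRACE_FN, ...), expands to the same layout that used to be open-coded in trace.h:

struct ftrace_entry {
	struct trace_entry	ent;		/* common entry header */
	unsigned long		ip;		/* from __field(unsigned long, ip) */
	unsigned long		parent_ip;	/* from __field(unsigned long, parent_ip) */
};

Entries built with __field_struct()/__field_desc() keep only the embedded structure as a member; the *_desc lines exist solely so the format file can describe its fields.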
+ */ + +/* + * Function trace entry - function address and parent function addres: + */ +FTRACE_ENTRY(function, ftrace_entry, + +	TRACE_FN, + +	F_STRUCT( +		__field(	unsigned long,	ip		) +		__field(	unsigned long,	parent_ip	) +	), + +	F_printk(" %lx <-- %lx", __entry->ip, __entry->parent_ip) +); + +/* Function call entry */ +FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, + +	TRACE_GRAPH_ENT, + +	F_STRUCT( +		__field_struct(	struct ftrace_graph_ent,	graph_ent	) +		__field_desc(	unsigned long,	graph_ent,	func		) +		__field_desc(	int,		graph_ent,	depth		) +	), + +	F_printk("--> %lx (%d)", __entry->func, __entry->depth) +); + +/* Function return entry */ +FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry, + +	TRACE_GRAPH_RET, + +	F_STRUCT( +		__field_struct(	struct ftrace_graph_ret,	ret	) +		__field_desc(	unsigned long,	ret,		func	) +		__field_desc(	unsigned long long, ret,	calltime) +		__field_desc(	unsigned long long, ret,	rettime	) +		__field_desc(	unsigned long,	ret,		overrun	) +		__field_desc(	int,		ret,		depth	) +	), + +	F_printk("<-- %lx (%d) (start: %llx  end: %llx) over: %d", +		 __entry->func, __entry->depth, +		 __entry->calltime, __entry->rettime, +		 __entry->depth) +); + +/* + * Context switch trace entry - which task (and prio) we switched from/to: + * + * This is used for both wakeup and context switches. We only want + * to create one structure, but we need two outputs for it. + */ +#define FTRACE_CTX_FIELDS					\ +	__field(	unsigned int,	prev_pid	)	\ +	__field(	unsigned char,	prev_prio	)	\ +	__field(	unsigned char,	prev_state	)	\ +	__field(	unsigned int,	next_pid	)	\ +	__field(	unsigned char,	next_prio	)	\ +	__field(	unsigned char,	next_state	)	\ +	__field(	unsigned int,	next_cpu	) + +FTRACE_ENTRY(context_switch, ctx_switch_entry, + +	TRACE_CTX, + +	F_STRUCT( +		FTRACE_CTX_FIELDS +	), + +	F_printk("%u:%u:%u  ==> %u:%u:%u [%03u]", +		 __entry->prev_pid, __entry->prev_prio, __entry->prev_state, +		 __entry->next_pid, __entry->next_prio, __entry->next_state, +		 __entry->next_cpu +		) +); + +/* + * FTRACE_ENTRY_DUP only creates the format file, it will not + *  create another structure. 
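Put concretely (same sketch conventions as above): context_switch generates the structure once, and the wakeup entry below records that very same type:

/* generated once by FTRACE_ENTRY(context_switch, ctx_switch_entry, ...) */
struct ctx_switch_entry {
	struct trace_entry	ent;
	unsigned int		prev_pid;
	unsigned char		prev_prio;
	unsigned char		prev_state;
	unsigned int		next_pid;
	unsigned char		next_prio;
	unsigned char		next_state;
	unsigned int		next_cpu;
};
/*
 * FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, ...) emits no second
 * definition; only the wakeup event's format/print text differs.
 */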
+ */ +FTRACE_ENTRY_DUP(wakeup, ctx_switch_entry, + +	TRACE_WAKE, + +	F_STRUCT( +		FTRACE_CTX_FIELDS +	), + +	F_printk("%u:%u:%u  ==+ %u:%u:%u [%03u]", +		 __entry->prev_pid, __entry->prev_prio, __entry->prev_state, +		 __entry->next_pid, __entry->next_prio, __entry->next_state, +		 __entry->next_cpu +		) +); + +/* + * Special (free-form) trace entry: + */ +FTRACE_ENTRY(special, special_entry, + +	TRACE_SPECIAL, + +	F_STRUCT( +		__field(	unsigned long,	arg1	) +		__field(	unsigned long,	arg2	) +		__field(	unsigned long,	arg3	) +	), + +	F_printk("(%08lx) (%08lx) (%08lx)", +		 __entry->arg1, __entry->arg2, __entry->arg3) +); + +/* + * Stack-trace entry: + */ + +#define FTRACE_STACK_ENTRIES	8 + +FTRACE_ENTRY(kernel_stack, stack_entry, + +	TRACE_STACK, + +	F_STRUCT( +		__array(	unsigned long,	caller, FTRACE_STACK_ENTRIES	) +	), + +	F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" +		 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", +		 __entry->caller[0], __entry->caller[1], __entry->caller[2], +		 __entry->caller[3], __entry->caller[4], __entry->caller[5], +		 __entry->caller[6], __entry->caller[7]) +); + +FTRACE_ENTRY(user_stack, userstack_entry, + +	TRACE_USER_STACK, + +	F_STRUCT( +		__field(	unsigned int,	tgid	) +		__array(	unsigned long,	caller, FTRACE_STACK_ENTRIES	) +	), + +	F_printk("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" +		 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n", +		 __entry->caller[0], __entry->caller[1], __entry->caller[2], +		 __entry->caller[3], __entry->caller[4], __entry->caller[5], +		 __entry->caller[6], __entry->caller[7]) +); + +/* + * trace_printk entry: + */ +FTRACE_ENTRY(bprint, bprint_entry, + +	TRACE_BPRINT, + +	F_STRUCT( +		__field(	unsigned long,	ip	) +		__field(	const char *,	fmt	) +		__dynamic_array(	u32,	buf	) +	), + +	F_printk("%08lx fmt:%p", +		 __entry->ip, __entry->fmt) +); + +FTRACE_ENTRY(print, print_entry, + +	TRACE_PRINT, + +	F_STRUCT( +		__field(	unsigned long,	ip	) +		__dynamic_array(	char,	buf	) +	), + +	F_printk("%08lx %s", +		 __entry->ip, __entry->buf) +); + +FTRACE_ENTRY(mmiotrace_rw, trace_mmiotrace_rw, + +	TRACE_MMIO_RW, + +	F_STRUCT( +		__field_struct(	struct mmiotrace_rw,	rw	) +		__field_desc(	resource_size_t, rw,	phys	) +		__field_desc(	unsigned long,	rw,	value	) +		__field_desc(	unsigned long,	rw,	pc	) +		__field_desc(	int, 		rw,	map_id	) +		__field_desc(	unsigned char,	rw,	opcode	) +		__field_desc(	unsigned char,	rw,	width	) +	), + +	F_printk("%lx %lx %lx %d %x %x", +		 (unsigned long)__entry->phys, __entry->value, __entry->pc, +		 __entry->map_id, __entry->opcode, __entry->width) +); + +FTRACE_ENTRY(mmiotrace_map, trace_mmiotrace_map, + +	TRACE_MMIO_MAP, + +	F_STRUCT( +		__field_struct(	struct mmiotrace_map,	map	) +		__field_desc(	resource_size_t, map,	phys	) +		__field_desc(	unsigned long,	map,	virt	) +		__field_desc(	unsigned long,	map,	len	) +		__field_desc(	int, 		map,	map_id	) +		__field_desc(	unsigned char,	map,	opcode	) +	), + +	F_printk("%lx %lx %lx %d %x", +		 (unsigned long)__entry->phys, __entry->virt, __entry->len, +		 __entry->map_id, __entry->opcode) +); + +FTRACE_ENTRY(boot_call, trace_boot_call, + +	TRACE_BOOT_CALL, + +	F_STRUCT( +		__field_struct(	struct boot_trace_call,	boot_call	) +		__field_desc(	pid_t,	boot_call,	caller		) +		__array_desc(	char,	boot_call,	func,	KSYM_SYMBOL_LEN) +	), + +	F_printk("%d  %s", __entry->caller, __entry->func) +); + +FTRACE_ENTRY(boot_ret, trace_boot_ret, + +	TRACE_BOOT_RET, + +	F_STRUCT( +		__field_struct(	struct 
boot_trace_ret,	boot_ret	) +		__array_desc(	char,	boot_ret,	func,	KSYM_SYMBOL_LEN) +		__field_desc(	int,	boot_ret,	result		) +		__field_desc(	unsigned long, boot_ret, duration	) +	), + +	F_printk("%s %d %lx", +		 __entry->func, __entry->result, __entry->duration) +); + +#define TRACE_FUNC_SIZE 30 +#define TRACE_FILE_SIZE 20 + +FTRACE_ENTRY(branch, trace_branch, + +	TRACE_BRANCH, + +	F_STRUCT( +		__field(	unsigned int,	line				) +		__array(	char,		func,	TRACE_FUNC_SIZE+1	) +		__array(	char,		file,	TRACE_FILE_SIZE+1	) +		__field(	char,		correct				) +	), + +	F_printk("%u:%s:%s (%u)", +		 __entry->line, +		 __entry->func, __entry->file, __entry->correct) +); + +FTRACE_ENTRY(hw_branch, hw_branch_entry, + +	TRACE_HW_BRANCHES, + +	F_STRUCT( +		__field(	u64,	from	) +		__field(	u64,	to	) +	), + +	F_printk("from: %llx to: %llx", __entry->from, __entry->to) +); + +FTRACE_ENTRY(power, trace_power, + +	TRACE_POWER, + +	F_STRUCT( +		__field_struct(	struct power_trace,	state_data	) +		__field_desc(	s64,	state_data,	stamp		) +		__field_desc(	s64,	state_data,	end		) +		__field_desc(	int,	state_data,	type		) +		__field_desc(	int,	state_data,	state		) +	), + +	F_printk("%llx->%llx type:%u state:%u", +		 __entry->stamp, __entry->end, +		 __entry->type, __entry->state) +); + +FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry, + +	TRACE_KMEM_ALLOC, + +	F_STRUCT( +		__field(	enum kmemtrace_type_id,	type_id		) +		__field(	unsigned long,		call_site	) +		__field(	const void *,		ptr		) +		__field(	size_t,			bytes_req	) +		__field(	size_t,			bytes_alloc	) +		__field(	gfp_t,			gfp_flags	) +		__field(	int,			node		) +	), + +	F_printk("type:%u call_site:%lx ptr:%p req:%zi alloc:%zi" +		 " flags:%x node:%d", +		 __entry->type_id, __entry->call_site, __entry->ptr, +		 __entry->bytes_req, __entry->bytes_alloc, +		 __entry->gfp_flags, __entry->node) +); + +FTRACE_ENTRY(kmem_free, kmemtrace_free_entry, + +	TRACE_KMEM_FREE, + +	F_STRUCT( +		__field(	enum kmemtrace_type_id,	type_id		) +		__field(	unsigned long,		call_site	) +		__field(	const void *,		ptr		) +	), + +	F_printk("type:%u call_site:%lx ptr:%p", +		 __entry->type_id, __entry->call_site, __entry->ptr) +); diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 11ba5bb4ed0..55a25c933d1 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -5,6 +5,7 @@   *   */ +#include <linux/module.h>  #include "trace.h"  int ftrace_profile_enable(int event_id) @@ -14,7 +15,8 @@ int ftrace_profile_enable(int event_id)  	mutex_lock(&event_mutex);  	list_for_each_entry(event, &ftrace_events, list) { -		if (event->id == event_id && event->profile_enable) { +		if (event->id == event_id && event->profile_enable && +		    try_module_get(event->mod)) {  			ret = event->profile_enable(event);  			break;  		} @@ -32,6 +34,7 @@ void ftrace_profile_disable(int event_id)  	list_for_each_entry(event, &ftrace_events, list) {  		if (event->id == event_id) {  			event->profile_disable(event); +			module_put(event->mod);  			break;  		}  	} diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h deleted file mode 100644 index 6db005e1248..00000000000 --- a/kernel/trace/trace_event_types.h +++ /dev/null @@ -1,178 +0,0 @@ -#undef TRACE_SYSTEM -#define TRACE_SYSTEM	ftrace - -/* - * We cheat and use the proto type field as the ID - * and args as the entry type (minus 'struct') - */ -TRACE_EVENT_FORMAT(function, TRACE_FN, ftrace_entry, ignore, -	TRACE_STRUCT( -		TRACE_FIELD(unsigned long, ip, ip) -		
TRACE_FIELD(unsigned long, parent_ip, parent_ip) -	), -	TP_RAW_FMT(" %lx <-- %lx") -); - -TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT, -		   ftrace_graph_ent_entry, ignore, -	TRACE_STRUCT( -		TRACE_FIELD(unsigned long, graph_ent.func, func) -		TRACE_FIELD(int, graph_ent.depth, depth) -	), -	TP_RAW_FMT("--> %lx (%d)") -); - -TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET, -		   ftrace_graph_ret_entry, ignore, -	TRACE_STRUCT( -		TRACE_FIELD(unsigned long, ret.func, func) -		TRACE_FIELD(unsigned long long, ret.calltime, calltime) -		TRACE_FIELD(unsigned long long, ret.rettime, rettime) -		TRACE_FIELD(unsigned long, ret.overrun, overrun) -		TRACE_FIELD(int, ret.depth, depth) -	), -	TP_RAW_FMT("<-- %lx (%d)") -); - -TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore, -	TRACE_STRUCT( -		TRACE_FIELD(unsigned int, prev_pid, prev_pid) -		TRACE_FIELD(unsigned char, prev_prio, prev_prio) -		TRACE_FIELD(unsigned char, prev_state, prev_state) -		TRACE_FIELD(unsigned int, next_pid, next_pid) -		TRACE_FIELD(unsigned char, next_prio, next_prio) -		TRACE_FIELD(unsigned char, next_state, next_state) -		TRACE_FIELD(unsigned int, next_cpu, next_cpu) -	), -	TP_RAW_FMT("%u:%u:%u  ==+ %u:%u:%u [%03u]") -); - -TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore, -	TRACE_STRUCT( -		TRACE_FIELD(unsigned int, prev_pid, prev_pid) -		TRACE_FIELD(unsigned char, prev_prio, prev_prio) -		TRACE_FIELD(unsigned char, prev_state, prev_state) -		TRACE_FIELD(unsigned int, next_pid, next_pid) -		TRACE_FIELD(unsigned char, next_prio, next_prio) -		TRACE_FIELD(unsigned char, next_state, next_state) -		TRACE_FIELD(unsigned int, next_cpu, next_cpu) -	), -	TP_RAW_FMT("%u:%u:%u  ==+ %u:%u:%u [%03u]") -); - -TRACE_EVENT_FORMAT_NOFILTER(special, TRACE_SPECIAL, special_entry, ignore, -	TRACE_STRUCT( -		TRACE_FIELD(unsigned long, arg1, arg1) -		TRACE_FIELD(unsigned long, arg2, arg2) -		TRACE_FIELD(unsigned long, arg3, arg3) -	), -	TP_RAW_FMT("(%08lx) (%08lx) (%08lx)") -); - -/* - * Stack-trace entry: - */ - -/* #define FTRACE_STACK_ENTRIES   8 */ - -TRACE_EVENT_FORMAT(kernel_stack, TRACE_STACK, stack_entry, ignore, -	TRACE_STRUCT( -		TRACE_FIELD(unsigned long, caller[0], stack0) -		TRACE_FIELD(unsigned long, caller[1], stack1) -		TRACE_FIELD(unsigned long, caller[2], stack2) -		TRACE_FIELD(unsigned long, caller[3], stack3) -		TRACE_FIELD(unsigned long, caller[4], stack4) -		TRACE_FIELD(unsigned long, caller[5], stack5) -		TRACE_FIELD(unsigned long, caller[6], stack6) -		TRACE_FIELD(unsigned long, caller[7], stack7) -	), -	TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" -		 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") -); - -TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore, -	TRACE_STRUCT( -		TRACE_FIELD(unsigned long, caller[0], stack0) -		TRACE_FIELD(unsigned long, caller[1], stack1) -		TRACE_FIELD(unsigned long, caller[2], stack2) -		TRACE_FIELD(unsigned long, caller[3], stack3) -		TRACE_FIELD(unsigned long, caller[4], stack4) -		TRACE_FIELD(unsigned long, caller[5], stack5) -		TRACE_FIELD(unsigned long, caller[6], stack6) -		TRACE_FIELD(unsigned long, caller[7], stack7) -	), -	TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n" -		 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n") -); - -TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore, -	TRACE_STRUCT( -		TRACE_FIELD(unsigned long, ip, ip) -		TRACE_FIELD(char *, fmt, fmt) -		TRACE_FIELD_ZERO_CHAR(buf) -	), -	TP_RAW_FMT("%08lx (%d) fmt:%p 
%s") -); - -TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, -	TRACE_STRUCT( -		TRACE_FIELD(unsigned long, ip, ip) -		TRACE_FIELD_ZERO_CHAR(buf) -	), -	TP_RAW_FMT("%08lx (%d) fmt:%p %s") -); - -TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, -	TRACE_STRUCT( -		TRACE_FIELD(unsigned int, line, line) -		TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, -				    TRACE_FUNC_SIZE+1, func) -		TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, -				    TRACE_FUNC_SIZE+1, file) -		TRACE_FIELD(char, correct, correct) -	), -	TP_RAW_FMT("%u:%s:%s (%u)") -); - -TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore, -	TRACE_STRUCT( -		TRACE_FIELD(u64, from, from) -		TRACE_FIELD(u64, to, to) -	), -	TP_RAW_FMT("from: %llx to: %llx") -); - -TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore, -	TRACE_STRUCT( -		TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1) -		TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1) -		TRACE_FIELD(int, state_data.type, type) -		TRACE_FIELD(int, state_data.state, state) -	), -	TP_RAW_FMT("%llx->%llx type:%u state:%u") -); - -TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore, -	TRACE_STRUCT( -		TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id) -		TRACE_FIELD(unsigned long, call_site, call_site) -		TRACE_FIELD(const void *, ptr, ptr) -		TRACE_FIELD(size_t, bytes_req, bytes_req) -		TRACE_FIELD(size_t, bytes_alloc, bytes_alloc) -		TRACE_FIELD(gfp_t, gfp_flags, gfp_flags) -		TRACE_FIELD(int, node, node) -	), -	TP_RAW_FMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu" -		 " flags:%x node:%d") -); - -TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore, -	TRACE_STRUCT( -		TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id) -		TRACE_FIELD(unsigned long, call_site, call_site) -		TRACE_FIELD(const void *, ptr, ptr) -	), -	TP_RAW_FMT("type:%u call_site:%lx ptr:%p") -); - -#undef TRACE_SYSTEM diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 78b1ed23017..56c260b83a9 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -21,6 +21,7 @@  #include "trace_output.h" +#undef TRACE_SYSTEM  #define TRACE_SYSTEM "TRACE_SYSTEM"  DEFINE_MUTEX(event_mutex); @@ -86,7 +87,7 @@ int trace_define_common_fields(struct ftrace_event_call *call)  	__common_field(unsigned char, flags);  	__common_field(unsigned char, preempt_count);  	__common_field(int, pid); -	__common_field(int, tgid); +	__common_field(int, lock_depth);  	return ret;  } @@ -230,11 +231,9 @@ static ssize_t  ftrace_event_write(struct file *file, const char __user *ubuf,  		   size_t cnt, loff_t *ppos)  { +	struct trace_parser parser;  	size_t read = 0; -	int i, set = 1;  	ssize_t ret; -	char *buf; -	char ch;  	if (!cnt || cnt < 0)  		return 0; @@ -243,60 +242,28 @@ ftrace_event_write(struct file *file, const char __user *ubuf,  	if (ret < 0)  		return ret; -	ret = get_user(ch, ubuf++); -	if (ret) -		return ret; -	read++; -	cnt--; - -	/* skip white space */ -	while (cnt && isspace(ch)) { -		ret = get_user(ch, ubuf++); -		if (ret) -			return ret; -		read++; -		cnt--; -	} - -	/* Only white space found? 
*/ -	if (isspace(ch)) { -		file->f_pos += read; -		ret = read; -		return ret; -	} - -	buf = kmalloc(EVENT_BUF_SIZE+1, GFP_KERNEL); -	if (!buf) +	if (trace_parser_get_init(&parser, EVENT_BUF_SIZE + 1))  		return -ENOMEM; -	if (cnt > EVENT_BUF_SIZE) -		cnt = EVENT_BUF_SIZE; +	read = trace_get_user(&parser, ubuf, cnt, ppos); + +	if (trace_parser_loaded((&parser))) { +		int set = 1; -	i = 0; -	while (cnt && !isspace(ch)) { -		if (!i && ch == '!') +		if (*parser.buffer == '!')  			set = 0; -		else -			buf[i++] = ch; -		ret = get_user(ch, ubuf++); +		parser.buffer[parser.idx] = 0; + +		ret = ftrace_set_clr_event(parser.buffer + !set, set);  		if (ret) -			goto out_free; -		read++; -		cnt--; +			goto out_put;  	} -	buf[i] = 0; - -	file->f_pos += read; - -	ret = ftrace_set_clr_event(buf, set); -	if (ret) -		goto out_free;  	ret = read; - out_free: -	kfree(buf); + out_put: +	trace_parser_put(&parser);  	return ret;  } @@ -578,7 +545,7 @@ static int trace_write_header(struct trace_seq *s)  				FIELD(unsigned char, flags),  				FIELD(unsigned char, preempt_count),  				FIELD(int, pid), -				FIELD(int, tgid)); +				FIELD(int, lock_depth));  }  static ssize_t @@ -1187,7 +1154,7 @@ static int trace_module_notify(struct notifier_block *self,  }  #endif /* CONFIG_MODULES */ -struct notifier_block trace_module_nb = { +static struct notifier_block trace_module_nb = {  	.notifier_call = trace_module_notify,  	.priority = 0,  }; @@ -1359,6 +1326,18 @@ static __init void event_trace_self_tests(void)  		if (!call->regfunc)  			continue; +/* + * Testing syscall events here is pretty useless, but + * we still do it if configured. But this is time consuming. + * What we really need is a user thread to perform the + * syscalls as we test. + */ +#ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS +		if (call->system && +		    strcmp(call->system, "syscalls") == 0) +			continue; +#endif +  		pr_info("Testing event %s: ", call->name);  		/* @@ -1432,7 +1411,7 @@ static __init void event_trace_self_tests(void)  #ifdef CONFIG_FUNCTION_TRACER -static DEFINE_PER_CPU(atomic_t, test_event_disable); +static DEFINE_PER_CPU(atomic_t, ftrace_test_event_disable);  static void  function_test_events_call(unsigned long ip, unsigned long parent_ip) @@ -1449,7 +1428,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)  	pc = preempt_count();  	resched = ftrace_preempt_disable();  	cpu = raw_smp_processor_id(); -	disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu)); +	disabled = atomic_inc_return(&per_cpu(ftrace_test_event_disable, cpu));  	if (disabled != 1)  		goto out; @@ -1468,7 +1447,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip)  	trace_nowake_buffer_unlock_commit(buffer, event, flags, pc);   out: -	atomic_dec(&per_cpu(test_event_disable, cpu)); +	atomic_dec(&per_cpu(ftrace_test_event_disable, cpu));  	ftrace_preempt_enable(resched);  } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 93660fbbf62..23245785927 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -121,6 +121,47 @@ struct filter_parse_state {  	} operand;  }; +#define DEFINE_COMPARISON_PRED(type)					\ +static int filter_pred_##type(struct filter_pred *pred, void *event,	\ +			      int val1, int val2)			\ +{									\ +	type *addr = (type *)(event + pred->offset);			\ +	type val = (type)pred->val;					\ +	int match = 0;							\ +									\ +	switch (pred->op) {						\ +	case OP_LT:							\ +		match = (*addr < val);					\ +		break;							
\ +	case OP_LE:							\ +		match = (*addr <= val);					\ +		break;							\ +	case OP_GT:							\ +		match = (*addr > val);					\ +		break;							\ +	case OP_GE:							\ +		match = (*addr >= val);					\ +		break;							\ +	default:							\ +		break;							\ +	}								\ +									\ +	return match;							\ +} + +#define DEFINE_EQUALITY_PRED(size)					\ +static int filter_pred_##size(struct filter_pred *pred, void *event,	\ +			      int val1, int val2)			\ +{									\ +	u##size *addr = (u##size *)(event + pred->offset);		\ +	u##size val = (u##size)pred->val;				\ +	int match;							\ +									\ +	match = (val == *addr) ^ pred->not;				\ +									\ +	return match;							\ +} +  DEFINE_COMPARISON_PRED(s64);  DEFINE_COMPARISON_PRED(u64);  DEFINE_COMPARISON_PRED(s32); diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index df1bf6e48bb..9753fcc61bc 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -15,146 +15,125 @@  #include "trace_output.h" +#undef TRACE_SYSTEM +#define TRACE_SYSTEM	ftrace -#undef TRACE_STRUCT -#define TRACE_STRUCT(args...) args +/* not needed for this file */ +#undef __field_struct +#define __field_struct(type, item) -extern void __bad_type_size(void); +#undef __field +#define __field(type, item)				type item; -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign)					\ -	if (sizeof(type) != sizeof(field.item))				\ -		__bad_type_size();					\ +#undef __field_desc +#define __field_desc(type, container, item)		type item; + +#undef __array +#define __array(type, item, size)			type item[size]; + +#undef __array_desc +#define __array_desc(type, container, item, size)	type item[size]; + +#undef __dynamic_array +#define __dynamic_array(type, item)			type item[]; + +#undef F_STRUCT +#define F_STRUCT(args...)				args + +#undef F_printk +#define F_printk(fmt, args...) 
fmt, args + +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print)	\ +struct ____ftrace_##name {					\ +	tstruct							\ +};								\ +static void __used ____ftrace_check_##name(void)		\ +{								\ +	struct ____ftrace_##name *__entry = NULL;		\ +								\ +	/* force cmpile-time check on F_printk() */		\ +	printk(print);						\ +} + +#undef FTRACE_ENTRY_DUP +#define FTRACE_ENTRY_DUP(name, struct_name, id, tstruct, print)	\ +	FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) + +#include "trace_entries.h" + + +#undef __field +#define __field(type, item)						\  	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\ -			       "offset:%u;\tsize:%u;\n",		\ -			       (unsigned int)offsetof(typeof(field), item), \ -			       (unsigned int)sizeof(field.item));	\ +			       "offset:%zu;\tsize:%zu;\n",		\ +			       offsetof(typeof(field), item),		\ +			       sizeof(field.item));			\  	if (!ret)							\  		return 0; +#undef __field_desc +#define __field_desc(type, container, item)				\ +	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\ +			       "offset:%zu;\tsize:%zu;\n",		\ +			       offsetof(typeof(field), container.item),	\ +			       sizeof(field.container.item));		\ +	if (!ret)							\ +		return 0; -#undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd)			\ -	ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t"	\ -			       "offset:%u;\tsize:%u;\n",		\ -			       (unsigned int)offsetof(typeof(field), item), \ -			       (unsigned int)sizeof(field.item));	\ +#undef __array +#define __array(type, item, len)					\ +	ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ +			       "offset:%zu;\tsize:%zu;\n",		\ +			       offsetof(typeof(field), item),	\ +			       sizeof(field.item));		\  	if (!ret)							\  		return 0; -#undef TRACE_FIELD_ZERO_CHAR -#define TRACE_FIELD_ZERO_CHAR(item)					\ -	ret = trace_seq_printf(s, "\tfield:char " #item ";\t"		\ -			       "offset:%u;\tsize:0;\n",			\ -			       (unsigned int)offsetof(typeof(field), item)); \ +#undef __array_desc +#define __array_desc(type, container, item, len)			\ +	ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ +			       "offset:%zu;\tsize:%zu;\n",		\ +			       offsetof(typeof(field), container.item),	\ +			       sizeof(field.container.item));		\  	if (!ret)							\  		return 0; -#undef TRACE_FIELD_SIGN -#define TRACE_FIELD_SIGN(type, item, assign, is_signed)	\ -	TRACE_FIELD(type, item, assign) +#undef __dynamic_array +#define __dynamic_array(type, item)					\ +	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\ +			       "offset:%zu;\tsize:0;\n",		\ +			       offsetof(typeof(field), item));		\ +	if (!ret)							\ +		return 0; -#undef TP_RAW_FMT -#define TP_RAW_FMT(args...) args +#undef F_printk +#define F_printk(fmt, args...) 
"%s, %s\n", #fmt, __stringify(args) -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\ -static int								\ -ftrace_format_##call(struct ftrace_event_call *unused,			\ -		      struct trace_seq *s)				\ -{									\ -	struct args field;						\ -	int ret;							\ -									\ -	tstruct;							\ -									\ -	trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt);		\ -									\ -	return ret;							\ -} +#undef __entry +#define __entry REC -#undef TRACE_EVENT_FORMAT_NOFILTER -#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct,	\ -				    tpfmt)				\ +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print)		\  static int								\ -ftrace_format_##call(struct ftrace_event_call *unused,			\ -		      struct trace_seq *s)				\ +ftrace_format_##name(struct ftrace_event_call *unused,			\ +		     struct trace_seq *s)				\  {									\ -	struct args field;						\ -	int ret;							\ +	struct struct_name field __attribute__((unused));		\ +	int ret = 0;							\  									\  	tstruct;							\  									\ -	trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt);		\ +	trace_seq_printf(s, "\nprint fmt: " print);			\  									\  	return ret;							\  } -#include "trace_event_types.h" - -#undef TRACE_ZERO_CHAR -#define TRACE_ZERO_CHAR(arg) - -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign)\ -	entry->item = assign; - -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign)\ -	entry->item = assign; - -#undef TRACE_FIELD_SIGN -#define TRACE_FIELD_SIGN(type, item, assign, is_signed)	\ -	TRACE_FIELD(type, item, assign) - -#undef TP_CMD -#define TP_CMD(cmd...)	cmd - -#undef TRACE_ENTRY -#define TRACE_ENTRY	entry - -#undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd)	\ -	cmd; - -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\ -int ftrace_define_fields_##call(struct ftrace_event_call *event_call);	\ -static int ftrace_raw_init_event_##call(void);				\ -									\ -struct ftrace_event_call __used						\ -__attribute__((__aligned__(4)))						\ -__attribute__((section("_ftrace_events"))) event_##call = {		\ -	.name			= #call,				\ -	.id			= proto,				\ -	.system			= __stringify(TRACE_SYSTEM),		\ -	.raw_init		= ftrace_raw_init_event_##call,		\ -	.show_format		= ftrace_format_##call,			\ -	.define_fields		= ftrace_define_fields_##call,		\ -};									\ -static int ftrace_raw_init_event_##call(void)				\ -{									\ -	INIT_LIST_HEAD(&event_##call.fields);				\ -	return 0;							\ -}									\ - -#undef TRACE_EVENT_FORMAT_NOFILTER -#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct,	\ -				    tpfmt)				\ -									\ -struct ftrace_event_call __used						\ -__attribute__((__aligned__(4)))						\ -__attribute__((section("_ftrace_events"))) event_##call = {		\ -	.name			= #call,				\ -	.id			= proto,				\ -	.system			= __stringify(TRACE_SYSTEM),		\ -	.show_format		= ftrace_format_##call,			\ -}; +#include "trace_entries.h" -#include "trace_event_types.h" -#undef TRACE_FIELD -#define TRACE_FIELD(type, item, assign)					\ +#undef __field +#define __field(type, item)						\  	ret = trace_define_field(event_call, #type, #item,		\  				 offsetof(typeof(field), item),		\  				 sizeof(field.item),			\ @@ -162,32 +141,45 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\  	if (ret)							\  		return ret; -#undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type, item, len, cmd)			\ +#undef __field_desc +#define 
__field_desc(type, container, item)	\ +	ret = trace_define_field(event_call, #type, #item,		\ +				 offsetof(typeof(field),		\ +					  container.item),		\ +				 sizeof(field.container.item),		\ +				 is_signed_type(type), FILTER_OTHER);	\ +	if (ret)							\ +		return ret; + +#undef __array +#define __array(type, item, len)					\ +	BUILD_BUG_ON(len > MAX_FILTER_STR_VAL);				\  	ret = trace_define_field(event_call, #type "[" #len "]", #item,	\  				 offsetof(typeof(field), item),		\  				 sizeof(field.item), 0, FILTER_OTHER);	\  	if (ret)							\  		return ret; -#undef TRACE_FIELD_SIGN -#define TRACE_FIELD_SIGN(type, item, assign, is_signed)			\ -	ret = trace_define_field(event_call, #type, #item,		\ -				 offsetof(typeof(field), item),		\ -				 sizeof(field.item), is_signed,		\ +#undef __array_desc +#define __array_desc(type, container, item, len)			\ +	BUILD_BUG_ON(len > MAX_FILTER_STR_VAL);				\ +	ret = trace_define_field(event_call, #type "[" #len "]", #item,	\ +				 offsetof(typeof(field),		\ +					  container.item),		\ +				 sizeof(field.container.item), 0,	\  				 FILTER_OTHER);				\  	if (ret)							\  		return ret; -#undef TRACE_FIELD_ZERO_CHAR -#define TRACE_FIELD_ZERO_CHAR(item) +#undef __dynamic_array +#define __dynamic_array(type, item) -#undef TRACE_EVENT_FORMAT -#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\ +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(name, struct_name, id, tstruct, print)		\  int									\ -ftrace_define_fields_##call(struct ftrace_event_call *event_call)	\ +ftrace_define_fields_##name(struct ftrace_event_call *event_call)	\  {									\ -	struct args field;						\ +	struct struct_name field;					\  	int ret;							\  									\  	ret = trace_define_common_fields(event_call);			\ @@ -199,8 +191,42 @@ ftrace_define_fields_##call(struct ftrace_event_call *event_call)	\  	return ret;							\  } -#undef TRACE_EVENT_FORMAT_NOFILTER -#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct,	\ -				    tpfmt) +#include "trace_entries.h" + + +#undef __field +#define __field(type, item) + +#undef __field_desc +#define __field_desc(type, container, item) + +#undef __array +#define __array(type, item, len) + +#undef __array_desc +#define __array_desc(type, container, item, len) + +#undef __dynamic_array +#define __dynamic_array(type, item) + +#undef FTRACE_ENTRY +#define FTRACE_ENTRY(call, struct_name, type, tstruct, print)		\ +static int ftrace_raw_init_event_##call(void);				\ +									\ +struct ftrace_event_call __used						\ +__attribute__((__aligned__(4)))						\ +__attribute__((section("_ftrace_events"))) event_##call = {		\ +	.name			= #call,				\ +	.id			= type,					\ +	.system			= __stringify(TRACE_SYSTEM),		\ +	.raw_init		= ftrace_raw_init_event_##call,		\ +	.show_format		= ftrace_format_##call,			\ +	.define_fields		= ftrace_define_fields_##call,		\ +};									\ +static int ftrace_raw_init_event_##call(void)				\ +{									\ +	INIT_LIST_HEAD(&event_##call.fields);				\ +	return 0;							\ +}									\ -#include "trace_event_types.h" +#include "trace_entries.h" diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 5b01b94518f..b3f3776b0cd 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -290,7 +290,7 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,  {  	long count = (long)data; -	seq_printf(m, "%pf:", (void *)ip); +	seq_printf(m, "%ps:", (void *)ip);  	if (ops == &traceon_probe_ops)  		seq_printf(m, "traceon"); diff --git 
a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index b3749a2c313..45e6c01b2e4 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -124,7 +124,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret,  	if (unlikely(current->ret_stack[index].fp != frame_pointer)) {  		ftrace_graph_stop();  		WARN(1, "Bad frame pointer: expected %lx, received %lx\n" -		     "  from func %pF return to %lx\n", +		     "  from func %ps return to %lx\n",  		     current->ret_stack[index].fp,  		     frame_pointer,  		     (void *)current->ret_stack[index].func, @@ -364,6 +364,15 @@ print_graph_proc(struct trace_seq *s, pid_t pid)  } +static enum print_line_t +print_graph_lat_fmt(struct trace_seq *s, struct trace_entry *entry) +{ +	if (!trace_seq_putc(s, ' ')) +		return 0; + +	return trace_print_lat_fmt(s, entry); +} +  /* If the pid changed since the last trace, output this event */  static enum print_line_t  verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data) @@ -521,6 +530,7 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,  		if (ret == TRACE_TYPE_PARTIAL_LINE)  			return TRACE_TYPE_PARTIAL_LINE;  	} +  	/* Proc */  	if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {  		ret = print_graph_proc(s, pid); @@ -659,7 +669,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,  			return TRACE_TYPE_PARTIAL_LINE;  	} -	ret = trace_seq_printf(s, "%pf();\n", (void *)call->func); +	ret = trace_seq_printf(s, "%ps();\n", (void *)call->func);  	if (!ret)  		return TRACE_TYPE_PARTIAL_LINE; @@ -702,7 +712,7 @@ print_graph_entry_nested(struct trace_iterator *iter,  			return TRACE_TYPE_PARTIAL_LINE;  	} -	ret = trace_seq_printf(s, "%pf() {\n", (void *)call->func); +	ret = trace_seq_printf(s, "%ps() {\n", (void *)call->func);  	if (!ret)  		return TRACE_TYPE_PARTIAL_LINE; @@ -758,6 +768,13 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,  			return TRACE_TYPE_PARTIAL_LINE;  	} +	/* Latency format */ +	if (trace_flags & TRACE_ITER_LATENCY_FMT) { +		ret = print_graph_lat_fmt(s, ent); +		if (ret == TRACE_TYPE_PARTIAL_LINE) +			return TRACE_TYPE_PARTIAL_LINE; +	} +  	return 0;  } @@ -952,28 +969,59 @@ print_graph_function(struct trace_iterator *iter)  	return TRACE_TYPE_HANDLED;  } +static void print_lat_header(struct seq_file *s) +{ +	static const char spaces[] = "                "	/* 16 spaces */ +		"    "					/* 4 spaces */ +		"                 ";			/* 17 spaces */ +	int size = 0; + +	if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) +		size += 16; +	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) +		size += 4; +	if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) +		size += 17; + +	seq_printf(s, "#%.*s  _-----=> irqs-off        \n", size, spaces); +	seq_printf(s, "#%.*s / _----=> need-resched    \n", size, spaces); +	seq_printf(s, "#%.*s| / _---=> hardirq/softirq \n", size, spaces); +	seq_printf(s, "#%.*s|| / _--=> preempt-depth   \n", size, spaces); +	seq_printf(s, "#%.*s||| / _-=> lock-depth      \n", size, spaces); +	seq_printf(s, "#%.*s|||| /                     \n", size, spaces); +} +  static void print_graph_headers(struct seq_file *s)  { +	int lat = trace_flags & TRACE_ITER_LATENCY_FMT; + +	if (lat) +		print_lat_header(s); +  	/* 1st line */ -	seq_printf(s, "# "); +	seq_printf(s, "#");  	if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)  		seq_printf(s, "     TIME       ");  	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) -		seq_printf(s, "CPU"); +		
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 5555b75a0d1..3aa7eaa2114 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -129,15 +129,10 @@ check_critical_timing(struct trace_array *tr,
 		      unsigned long parent_ip,
 		      int cpu)
 {
-	unsigned long latency, t0, t1;
 	cycle_t T0, T1, delta;
 	unsigned long flags;
 	int pc;
-	/*
-	 * usecs conversion is slow so we try to delay the conversion
-	 * as long as possible:
-	 */
 	T0 = data->preempt_timestamp;
 	T1 = ftrace_now(cpu);
 	delta = T1-T0;
@@ -157,18 +152,15 @@ check_critical_timing(struct trace_array *tr,
 	trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
-	latency = nsecs_to_usecs(delta);
-
 	if (data->critical_sequence != max_sequence)
 		goto out_unlock;
-	tracing_max_latency = delta;
-	t0 = nsecs_to_usecs(T0);
-	t1 = nsecs_to_usecs(T1);
-
 	data->critical_end = parent_ip;
-	update_max_tr_single(tr, current, cpu);
+	if (likely(!is_tracing_stopped())) {
+		tracing_max_latency = delta;
+		update_max_tr_single(tr, current, cpu);
+	}
 	max_sequence++;
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index c4c9bbda53d..0acd834659e 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -307,6 +307,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
 				struct trace_array_cpu *data,
 				struct mmiotrace_rw *rw)
 {
+	struct ftrace_event_call *call = &event_mmiotrace_rw;
 	struct ring_buffer *buffer = tr->buffer;
 	struct ring_buffer_event *event;
 	struct trace_mmiotrace_rw *entry;
@@ -320,7 +321,9 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
 	}
 	entry	= ring_buffer_event_data(event);
 	entry->rw			= *rw;
-	trace_buffer_unlock_commit(buffer, event, 0, pc);
+
+	if (!filter_check_discard(call, entry, buffer, event))
+		trace_buffer_unlock_commit(buffer, event, 0, pc);
 }
 void mmio_trace_rw(struct mmiotrace_rw *rw)
@@ -334,6 +337,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
 				struct trace_array_cpu *data,
 				struct mmiotrace_map *map)
 {
+	struct ftrace_event_call *call = &event_mmiotrace_map;
 	struct ring_buffer *buffer = tr->buffer;
 	struct ring_buffer_event *event;
 	struct trace_mmiotrace_map *entry;
@@ -347,7 +351,9 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
 	}
 	entry	= ring_buffer_event_data(event);
 	entry->map			= *map;
-	trace_buffer_unlock_commit(buffer, event, 0, pc);
+
+	if (!filter_check_discard(call, entry, buffer, event))
+		trace_buffer_unlock_commit(buffer, event, 0, pc);
 }
 void mmio_trace_mapping(struct mmiotrace_map *map)
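
Both mmiotrace hunks above follow the same commit pattern used elsewhere in this tree: reserve an event in the ring buffer, fill it in, then give the event filter a chance to discard it before committing. A hedged sketch of that pattern follows; the reserve call and its arguments are inferred from the surrounding (elided) context rather than quoted from it, and TRACE_EXAMPLE, struct example_entry and event_example are illustrative names only.

static void __trace_example(struct trace_array *tr, int pc)
{
	struct ftrace_event_call *call = &event_example;
	struct ring_buffer *buffer = tr->buffer;
	struct ring_buffer_event *event;
	struct example_entry *entry;

	/* Reserve space for one entry; bail out quietly if the buffer is full. */
	event = trace_buffer_lock_reserve(buffer, TRACE_EXAMPLE,
					  sizeof(*entry), 0, pc);
	if (!event)
		return;
	entry = ring_buffer_event_data(event);
	/* ... fill in *entry ... */

	/* Only commit the entry if the event filter did not discard it. */
	if (!filter_check_discard(call, entry, buffer, event))
		trace_buffer_unlock_commit(buffer, event, 0, pc);
}
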
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index e0c2545622e..f572f44c6e1 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -407,7 +407,7 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
 		 * since individual threads might have already quit!
 		 */
 		rcu_read_lock();
-		task = find_task_by_vpid(entry->ent.tgid);
+		task = find_task_by_vpid(entry->tgid);
 		if (task)
 			mm = get_task_mm(task);
 		rcu_read_unlock();
@@ -460,18 +460,23 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
 	return ret;
 }
-static int
-lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
+/**
+ * trace_print_lat_fmt - print the irq, preempt and lockdep fields
+ * @s: trace seq struct to write to
+ * @entry: The trace entry field from the ring buffer
+ *
+ * Prints the generic fields of irqs off, in hard or softirq, preempt
+ * count and lock depth.
+ */
+int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
 {
 	int hardirq, softirq;
-	char comm[TASK_COMM_LEN];
+	int ret;
-	trace_find_cmdline(entry->pid, comm);
 	hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
 	softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
-	if (!trace_seq_printf(s, "%8.8s-%-5d %3d%c%c%c",
-			      comm, entry->pid, cpu,
+	if (!trace_seq_printf(s, "%c%c%c",
 			      (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
 				(entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ?
 				  'X' : '.',
@@ -481,9 +486,30 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
 				hardirq ? 'h' : softirq ? 's' : '.'))
 		return 0;
+	if (entry->lock_depth < 0)
+		ret = trace_seq_putc(s, '.');
+	else
+		ret = trace_seq_printf(s, "%d", entry->lock_depth);
+	if (!ret)
+		return 0;
+
 	if (entry->preempt_count)
 		return trace_seq_printf(s, "%x", entry->preempt_count);
-	return trace_seq_puts(s, ".");
+	return trace_seq_putc(s, '.');
+}
+
+static int
+lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
+{
+	char comm[TASK_COMM_LEN];
+
+	trace_find_cmdline(entry->pid, comm);
+
+	if (!trace_seq_printf(s, "%8.8s-%-5d %3d",
+			      comm, entry->pid, cpu))
+		return 0;
+
+	return trace_print_lat_fmt(s, entry);
 }
 static unsigned long preempt_mark_thresh = 100;
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index d38bec4a9c3..9d91c72ba38 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -26,6 +26,8 @@ extern struct trace_event *ftrace_find_event(int type);
 extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
 					 int flags);
+extern int
+trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
 /* used by module unregistering */
 extern int __unregister_ftrace_event(struct trace_event *event);
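
trace_print_lat_fmt() above condenses per-event state into the short latency column: 'd' for irqs-off ('X' when irq state is not supported, '.' otherwise), 'h'/'s'/'.' for hardirq/softirq context, plus the lock depth and preempt count. A small user-space sketch of the flag-to-character mapping for the first and third characters; the need-resched column sits in context that is not shown here, and the TRACE_FLAG_* values below are illustrative stand-ins, not copied from trace.h.

#include <stdio.h>

/* Illustrative flag bits; the real TRACE_FLAG_* values live in kernel/trace/trace.h. */
#define TRACE_FLAG_IRQS_OFF		0x01
#define TRACE_FLAG_IRQS_NOSUPPORT	0x02
#define TRACE_FLAG_HARDIRQ		0x08
#define TRACE_FLAG_SOFTIRQ		0x10

/* Mirrors the character choices made in trace_print_lat_fmt() above. */
static void print_lat_chars(unsigned char flags)
{
	int hardirq = flags & TRACE_FLAG_HARDIRQ;
	int softirq = flags & TRACE_FLAG_SOFTIRQ;

	printf("%c%c\n",
	       (flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
			(flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.',
	       hardirq ? 'h' : softirq ? 's' : '.');
}

int main(void)
{
	print_lat_chars(TRACE_FLAG_IRQS_OFF | TRACE_FLAG_SOFTIRQ);	/* prints "ds" */
	return 0;
}
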
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index ad69f105a7c..26185d72767 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -24,6 +24,7 @@ static int __read_mostly	tracer_enabled;
 static struct task_struct	*wakeup_task;
 static int			wakeup_cpu;
+static int			wakeup_current_cpu;
 static unsigned			wakeup_prio = -1;
 static int			wakeup_rt;
@@ -56,33 +57,23 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
 	resched = ftrace_preempt_disable();
 	cpu = raw_smp_processor_id();
+	if (cpu != wakeup_current_cpu)
+		goto out_enable;
+
 	data = tr->data[cpu];
 	disabled = atomic_inc_return(&data->disabled);
 	if (unlikely(disabled != 1))
 		goto out;
 	local_irq_save(flags);
-	__raw_spin_lock(&wakeup_lock);
-
-	if (unlikely(!wakeup_task))
-		goto unlock;
-
-	/*
-	 * The task can't disappear because it needs to
-	 * wake up first, and we have the wakeup_lock.
-	 */
-	if (task_cpu(wakeup_task) != cpu)
-		goto unlock;
 	trace_function(tr, ip, parent_ip, flags, pc);
- unlock:
-	__raw_spin_unlock(&wakeup_lock);
 	local_irq_restore(flags);
  out:
 	atomic_dec(&data->disabled);
-
+ out_enable:
 	ftrace_preempt_enable(resched);
 }
@@ -107,11 +98,18 @@ static int report_latency(cycle_t delta)
 	return 1;
 }
+static void probe_wakeup_migrate_task(struct task_struct *task, int cpu)
+{
+	if (task != wakeup_task)
+		return;
+
+	wakeup_current_cpu = cpu;
+}
+
 static void notrace
 probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
 	struct task_struct *next)
 {
-	unsigned long latency = 0, t0 = 0, t1 = 0;
 	struct trace_array_cpu *data;
 	cycle_t T0, T1, delta;
 	unsigned long flags;
@@ -157,10 +155,6 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
 	trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
 	tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
-	/*
-	 * usecs conversion is slow so we try to delay the conversion
-	 * as long as possible:
-	 */
 	T0 = data->preempt_timestamp;
 	T1 = ftrace_now(cpu);
 	delta = T1-T0;
@@ -168,13 +162,10 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
 	if (!report_latency(delta))
 		goto out_unlock;
-	latency = nsecs_to_usecs(delta);
-
-	tracing_max_latency = delta;
-	t0 = nsecs_to_usecs(T0);
-	t1 = nsecs_to_usecs(T1);
-
-	update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
+	if (likely(!is_tracing_stopped())) {
+		tracing_max_latency = delta;
+		update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
+	}
 out_unlock:
 	__wakeup_reset(wakeup_trace);
@@ -244,6 +235,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
 	__wakeup_reset(wakeup_trace);
 	wakeup_cpu = task_cpu(p);
+	wakeup_current_cpu = wakeup_cpu;
 	wakeup_prio = p->prio;
 	wakeup_task = p;
@@ -293,6 +285,13 @@ static void start_wakeup_tracer(struct trace_array *tr)
 		goto fail_deprobe_wake_new;
 	}
+	ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task);
+	if (ret) {
+		pr_info("wakeup trace: Couldn't activate tracepoint"
+			" probe to kernel_sched_migrate_task\n");
+		return;
+	}
+
 	wakeup_reset(tr);
 	/*
@@ -325,6 +324,7 @@ static void stop_wakeup_tracer(struct trace_array *tr)
 	unregister_trace_sched_switch(probe_wakeup_sched_switch);
 	unregister_trace_sched_wakeup_new(probe_wakeup);
 	unregister_trace_sched_wakeup(probe_wakeup);
+	unregister_trace_sched_migrate_task(probe_wakeup_migrate_task);
 }
 static int __wakeup_tracer_init(struct trace_array *tr)
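
The wakeup tracer changes above hang a probe off the sched_migrate_task tracepoint so that wakeup_current_cpu can follow the traced task around. A hedged sketch of that register/probe/unregister pattern; my_migrate_probe and the pr_* messages are illustrative and not part of this patch.

#include <linux/sched.h>
#include <trace/events/sched.h>

static void my_migrate_probe(struct task_struct *task, int cpu)
{
	/* Runs every time the scheduler moves 'task' to 'cpu'. */
	pr_debug("task %d migrating to cpu %d\n", task->pid, cpu);
}

static int my_probe_start(void)
{
	int ret = register_trace_sched_migrate_task(my_migrate_probe);

	if (ret)
		pr_info("could not attach to sched_migrate_task\n");
	return ret;
}

static void my_probe_stop(void)
{
	/* Every successful register must be paired with an unregister. */
	unregister_trace_sched_migrate_task(my_migrate_probe);
}
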
