Diffstat (limited to 'tools/perf/bench')
| -rw-r--r-- | tools/perf/bench/bench.h | 32 |
| -rw-r--r-- | tools/perf/bench/futex-hash.c | 212 |
| -rw-r--r-- | tools/perf/bench/futex-requeue.c | 211 |
| -rw-r--r-- | tools/perf/bench/futex-wake.c | 201 |
| -rw-r--r-- | tools/perf/bench/futex.h | 71 |
| -rw-r--r-- | tools/perf/bench/mem-memcpy-arch.h | 12 |
| -rw-r--r-- | tools/perf/bench/mem-memcpy-x86-64-asm-def.h | 12 |
| -rw-r--r-- | tools/perf/bench/mem-memcpy-x86-64-asm.S | 12 |
| -rw-r--r-- | tools/perf/bench/mem-memcpy.c | 253 |
| -rw-r--r-- | tools/perf/bench/mem-memset-arch.h | 12 |
| -rw-r--r-- | tools/perf/bench/mem-memset-x86-64-asm-def.h | 12 |
| -rw-r--r-- | tools/perf/bench/mem-memset-x86-64-asm.S | 13 |
| -rw-r--r-- | tools/perf/bench/mem-memset.c | 297 |
| -rw-r--r-- | tools/perf/bench/numa.c | 1744 |
| -rw-r--r-- | tools/perf/bench/sched-messaging.c | 2 |
| -rw-r--r-- | tools/perf/bench/sched-pipe.c | 123 |
16 files changed, 3114 insertions, 105 deletions
diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h index f7781c6267c..eba46709b27 100644 --- a/tools/perf/bench/bench.h +++ b/tools/perf/bench/bench.h @@ -1,9 +1,39 @@  #ifndef BENCH_H  #define BENCH_H +/* + * The madvise transparent hugepage constants were added in glibc + * 2.13. For compatibility with older versions of glibc, define these + * tokens if they are not already defined. + * + * PA-RISC uses different madvise values from other architectures and + * needs to be special-cased. + */ +#ifdef __hppa__ +# ifndef MADV_HUGEPAGE +#  define MADV_HUGEPAGE		67 +# endif +# ifndef MADV_NOHUGEPAGE +#  define MADV_NOHUGEPAGE	68 +# endif +#else +# ifndef MADV_HUGEPAGE +#  define MADV_HUGEPAGE		14 +# endif +# ifndef MADV_NOHUGEPAGE +#  define MADV_NOHUGEPAGE	15 +# endif +#endif + +extern int bench_numa(int argc, const char **argv, const char *prefix);  extern int bench_sched_messaging(int argc, const char **argv, const char *prefix);  extern int bench_sched_pipe(int argc, const char **argv, const char *prefix); -extern int bench_mem_memcpy(int argc, const char **argv, const char *prefix __used); +extern int bench_mem_memcpy(int argc, const char **argv, +			    const char *prefix __maybe_unused); +extern int bench_mem_memset(int argc, const char **argv, const char *prefix); +extern int bench_futex_hash(int argc, const char **argv, const char *prefix); +extern int bench_futex_wake(int argc, const char **argv, const char *prefix); +extern int bench_futex_requeue(int argc, const char **argv, const char *prefix);  #define BENCH_FORMAT_DEFAULT_STR	"default"  #define BENCH_FORMAT_DEFAULT		0 diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c new file mode 100644 index 00000000000..a84206e9c4a --- /dev/null +++ b/tools/perf/bench/futex-hash.c @@ -0,0 +1,212 @@ +/* + * Copyright (C) 2013  Davidlohr Bueso <davidlohr@hp.com> + * + * futex-hash: Stress the hell out of the Linux kernel futex uaddr hashing. + * + * This program is particularly useful for measuring the kernel's futex hash + * table/function implementation. In order for it to make sense, use with as + * many threads and futexes as possible. 
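The failure path this benchmark leans on can be seen in isolation. A minimal standalone sketch (not part of the patch): FUTEX_WAIT checks *uaddr against the expected value under the hash-bucket lock and, on mismatch, returns immediately with EWOULDBLOCK, so every call exercises the uaddr hashing without ever sleeping:

#include <errno.h>
#include <linux/futex.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	unsigned int futex = 0;	/* current value is 0... */
	long ret = syscall(SYS_futex, &futex, FUTEX_WAIT | FUTEX_PRIVATE_FLAG,
			   1234, NULL, NULL, 0);	/* ...but we claim to expect 1234 */

	/* expected outcome: ret == -1, errno == EWOULDBLOCK (== EAGAIN on Linux) */
	printf("ret=%ld errno=%d EWOULDBLOCK=%d\n", ret, errno, EWOULDBLOCK);
	return 0;
}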
+ */ + +#include "../perf.h" +#include "../util/util.h" +#include "../util/stat.h" +#include "../util/parse-options.h" +#include "../util/header.h" +#include "bench.h" +#include "futex.h" + +#include <err.h> +#include <stdlib.h> +#include <sys/time.h> +#include <pthread.h> + +static unsigned int nthreads = 0; +static unsigned int nsecs    = 10; +/* amount of futexes per thread */ +static unsigned int nfutexes = 1024; +static bool fshared = false, done = false, silent = false; + +struct timeval start, end, runtime; +static pthread_mutex_t thread_lock; +static unsigned int threads_starting; +static struct stats throughput_stats; +static pthread_cond_t thread_parent, thread_worker; + +struct worker { +	int tid; +	u_int32_t *futex; +	pthread_t thread; +	unsigned long ops; +}; + +static const struct option options[] = { +	OPT_UINTEGER('t', "threads", &nthreads, "Specify amount of threads"), +	OPT_UINTEGER('r', "runtime", &nsecs,    "Specify runtime (in seconds)"), +	OPT_UINTEGER('f', "futexes", &nfutexes, "Specify amount of futexes per thread"), +	OPT_BOOLEAN( 's', "silent",  &silent,   "Silent mode: do not display data/details"), +	OPT_BOOLEAN( 'S', "shared",  &fshared,  "Use shared futexes instead of private ones"), +	OPT_END() +}; + +static const char * const bench_futex_hash_usage[] = { +	"perf bench futex hash <options>", +	NULL +}; + +static void *workerfn(void *arg) +{ +	int ret; +	unsigned int i; +	struct worker *w = (struct worker *) arg; + +	pthread_mutex_lock(&thread_lock); +	threads_starting--; +	if (!threads_starting) +		pthread_cond_signal(&thread_parent); +	pthread_cond_wait(&thread_worker, &thread_lock); +	pthread_mutex_unlock(&thread_lock); + +	do { +		for (i = 0; i < nfutexes; i++, w->ops++) { +			/* +			 * We want the futex calls to fail in order to stress +			 * the hashing of uaddr and not measure other steps, +			 * such as internal waitqueue handling, thus enlarging +			 * the critical region protected by hb->lock. +			 */ +			ret = futex_wait(&w->futex[i], 1234, NULL, +					 fshared ? 0 : FUTEX_PRIVATE_FLAG); +			if (!silent && +			    (!ret || (errno != EAGAIN && errno != EWOULDBLOCK))) +				warn("Unexpected futex return call"); +		} +	} while (!done); + +	return NULL; +} + +static void toggle_done(int sig __maybe_unused, +			siginfo_t *info __maybe_unused, +			void *uc __maybe_unused) +{ +	/* inform all threads that we're done for the day */ +	done = true; +	gettimeofday(&end, NULL); +	timersub(&end, &start, &runtime); +} + +static void print_summary(void) +{ +	unsigned long avg = avg_stats(&throughput_stats); +	double stddev = stddev_stats(&throughput_stats); + +	printf("%sAveraged %ld operations/sec (+- %.2f%%), total secs = %d\n", +	       !silent ?
"\n" : "", avg, rel_stddev_stats(stddev, avg), +	       (int) runtime.tv_sec); +} + +int bench_futex_hash(int argc, const char **argv, +		     const char *prefix __maybe_unused) +{ +	int ret = 0; +	cpu_set_t cpu; +	struct sigaction act; +	unsigned int i, ncpus; +	pthread_attr_t thread_attr; +	struct worker *worker = NULL; + +	argc = parse_options(argc, argv, options, bench_futex_hash_usage, 0); +	if (argc) { +		usage_with_options(bench_futex_hash_usage, options); +		exit(EXIT_FAILURE); +	} + +	ncpus = sysconf(_SC_NPROCESSORS_ONLN); + +	sigfillset(&act.sa_mask); +	act.sa_sigaction = toggle_done; +	sigaction(SIGINT, &act, NULL); + +	if (!nthreads) /* default to the number of CPUs */ +		nthreads = ncpus; + +	worker = calloc(nthreads, sizeof(*worker)); +	if (!worker) +		goto errmem; + +	printf("Run summary [PID %d]: %d threads, each operating on %d [%s] futexes for %d secs.\n\n", +	       getpid(), nthreads, nfutexes, fshared ? "shared":"private", nsecs); + +	init_stats(&throughput_stats); +	pthread_mutex_init(&thread_lock, NULL); +	pthread_cond_init(&thread_parent, NULL); +	pthread_cond_init(&thread_worker, NULL); + +	threads_starting = nthreads; +	pthread_attr_init(&thread_attr); +	gettimeofday(&start, NULL); +	for (i = 0; i < nthreads; i++) { +		worker[i].tid = i; +		worker[i].futex = calloc(nfutexes, sizeof(*worker[i].futex)); +		if (!worker[i].futex) +			goto errmem; + +		CPU_ZERO(&cpu); +		CPU_SET(i % ncpus, &cpu); + +		ret = pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpu); +		if (ret) +			err(EXIT_FAILURE, "pthread_attr_setaffinity_np"); + +		ret = pthread_create(&worker[i].thread, &thread_attr, workerfn, +				     (void *)(struct worker *) &worker[i]); +		if (ret) +			err(EXIT_FAILURE, "pthread_create"); + +	} +	pthread_attr_destroy(&thread_attr); + +	pthread_mutex_lock(&thread_lock); +	while (threads_starting) +		pthread_cond_wait(&thread_parent, &thread_lock); +	pthread_cond_broadcast(&thread_worker); +	pthread_mutex_unlock(&thread_lock); + +	sleep(nsecs); +	toggle_done(0, NULL, NULL); + +	for (i = 0; i < nthreads; i++) { +		ret = pthread_join(worker[i].thread, NULL); +		if (ret) +			err(EXIT_FAILURE, "pthread_join"); +	} + +	/* cleanup & report results */ +	pthread_cond_destroy(&thread_parent); +	pthread_cond_destroy(&thread_worker); +	pthread_mutex_destroy(&thread_lock); + +	for (i = 0; i < nthreads; i++) { +		unsigned long t = worker[i].ops/runtime.tv_sec; +		update_stats(&throughput_stats, t); +		if (!silent) { +			if (nfutexes == 1) +				printf("[thread %2d] futex: %p [ %ld ops/sec ]\n", +				       worker[i].tid, &worker[i].futex[0], t); +			else +				printf("[thread %2d] futexes: %p ... %p [ %ld ops/sec ]\n", +				       worker[i].tid, &worker[i].futex[0], +				       &worker[i].futex[nfutexes-1], t); +		} + +		free(worker[i].futex); +	} + +	print_summary(); + +	free(worker); +	return ret; +errmem: +	err(EXIT_FAILURE, "calloc"); +} diff --git a/tools/perf/bench/futex-requeue.c b/tools/perf/bench/futex-requeue.c new file mode 100644 index 00000000000..a16255876f1 --- /dev/null +++ b/tools/perf/bench/futex-requeue.c @@ -0,0 +1,211 @@ +/* + * Copyright (C) 2013  Davidlohr Bueso <davidlohr@hp.com> + * + * futex-requeue: Block a bunch of threads on futex1 and requeue them + *                on futex2, N at a time. + * + * This program is particularly useful to measure the latency of nthread + * requeues without waking up any tasks -- thus mimicking a regular futex_wait. 
+ */ + +#include "../perf.h" +#include "../util/util.h" +#include "../util/stat.h" +#include "../util/parse-options.h" +#include "../util/header.h" +#include "bench.h" +#include "futex.h" + +#include <err.h> +#include <stdlib.h> +#include <sys/time.h> +#include <pthread.h> + +static u_int32_t futex1 = 0, futex2 = 0; + +/* + * How many tasks to requeue at a time. + * Default to 1 in order to make the kernel work more. + */ +static unsigned int nrequeue = 1; + +/* + * There can be significant variance from run to run, + * the more repeats, the more exact the overall avg and + * the better idea of the futex latency. + */ +static unsigned int repeat = 10; + +static pthread_t *worker; +static bool done = 0, silent = 0; +static pthread_mutex_t thread_lock; +static pthread_cond_t thread_parent, thread_worker; +static struct stats requeuetime_stats, requeued_stats; +static unsigned int ncpus, threads_starting, nthreads = 0; + +static const struct option options[] = { +	OPT_UINTEGER('t', "threads",  &nthreads, "Specify amount of threads"), +	OPT_UINTEGER('q', "nrequeue", &nrequeue, "Specify amount of threads to requeue at once"), +	OPT_UINTEGER('r', "repeat",   &repeat,   "Specify amount of times to repeat the run"), +	OPT_BOOLEAN( 's', "silent",   &silent,   "Silent mode: do not display data/details"), +	OPT_END() +}; + +static const char * const bench_futex_requeue_usage[] = { +	"perf bench futex requeue <options>", +	NULL +}; + +static void print_summary(void) +{ +	double requeuetime_avg = avg_stats(&requeuetime_stats); +	double requeuetime_stddev = stddev_stats(&requeuetime_stats); +	unsigned int requeued_avg = avg_stats(&requeued_stats); + +	printf("Requeued %d of %d threads in %.4f ms (+-%.2f%%)\n", +	       requeued_avg, +	       nthreads, +	       requeuetime_avg/1e3, +	       rel_stddev_stats(requeuetime_stddev, requeuetime_avg)); +} + +static void *workerfn(void *arg __maybe_unused) +{ +	pthread_mutex_lock(&thread_lock); +	threads_starting--; +	if (!threads_starting) +		pthread_cond_signal(&thread_parent); +	pthread_cond_wait(&thread_worker, &thread_lock); +	pthread_mutex_unlock(&thread_lock); + +	futex_wait(&futex1, 0, NULL, FUTEX_PRIVATE_FLAG); +	return NULL; +} + +static void block_threads(pthread_t *w, +			  pthread_attr_t thread_attr) +{ +	cpu_set_t cpu; +	unsigned int i; + +	threads_starting = nthreads; + +	/* create and block all threads */ +	for (i = 0; i < nthreads; i++) { +		CPU_ZERO(&cpu); +		CPU_SET(i % ncpus, &cpu); + +		if (pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpu)) +			err(EXIT_FAILURE, "pthread_attr_setaffinity_np"); + +		if (pthread_create(&w[i], &thread_attr, workerfn, NULL)) +			err(EXIT_FAILURE, "pthread_create"); +	} +} + +static void toggle_done(int sig __maybe_unused, +			siginfo_t *info __maybe_unused, +			void *uc __maybe_unused) +{ +	done = true; +} + +int bench_futex_requeue(int argc, const char **argv, +			const char *prefix __maybe_unused) +{ +	int ret = 0; +	unsigned int i, j; +	struct sigaction act; +	pthread_attr_t thread_attr; + +	argc = parse_options(argc, argv, options, bench_futex_requeue_usage, 0); +	if (argc) +		goto err; + +	ncpus = sysconf(_SC_NPROCESSORS_ONLN); + +	sigfillset(&act.sa_mask); +	act.sa_sigaction = toggle_done; +	sigaction(SIGINT, &act, NULL); + +	if (!nthreads) +		nthreads = ncpus; + +	worker = calloc(nthreads, sizeof(*worker)); +	if (!worker) +		err(EXIT_FAILURE, "calloc"); + +	printf("Run summary [PID %d]: Requeuing %d threads (from %p to %p), " +	       "%d at a time.\n\n", +	       getpid(), nthreads, 
&futex1, &futex2, nrequeue); + +	init_stats(&requeued_stats); +	init_stats(&requeuetime_stats); +	pthread_attr_init(&thread_attr); +	pthread_mutex_init(&thread_lock, NULL); +	pthread_cond_init(&thread_parent, NULL); +	pthread_cond_init(&thread_worker, NULL); + +	for (j = 0; j < repeat && !done; j++) { +		unsigned int nrequeued = 0; +		struct timeval start, end, runtime; + +		/* create, launch & block all threads */ +		block_threads(worker, thread_attr); + +		/* make sure all threads are already blocked */ +		pthread_mutex_lock(&thread_lock); +		while (threads_starting) +			pthread_cond_wait(&thread_parent, &thread_lock); +		pthread_cond_broadcast(&thread_worker); +		pthread_mutex_unlock(&thread_lock); + +		usleep(100000); + +		/* Ok, all threads are patiently blocked, start requeueing */ +		gettimeofday(&start, NULL); +		for (nrequeued = 0; nrequeued < nthreads; nrequeued += nrequeue) +			/* +			 * Do not wakeup any tasks blocked on futex1, allowing +			 * us to really measure futex_wait functionality. +			 */ +			futex_cmp_requeue(&futex1, 0, &futex2, 0, nrequeue, +					  FUTEX_PRIVATE_FLAG); +		gettimeofday(&end, NULL); +		timersub(&end, &start, &runtime); + +		update_stats(&requeued_stats, nrequeued); +		update_stats(&requeuetime_stats, runtime.tv_usec); + +		if (!silent) { +			printf("[Run %d]: Requeued %d of %d threads in %.4f ms\n", +			       j + 1, nrequeued, nthreads, runtime.tv_usec/1e3); +		} + +		/* everybody should be blocked on futex2, wake'em up */ +		nrequeued = futex_wake(&futex2, nthreads, FUTEX_PRIVATE_FLAG); +		if (nthreads != nrequeued) +			warnx("couldn't wakeup all tasks (%d/%d)", nrequeued, nthreads); + +		for (i = 0; i < nthreads; i++) { +			ret = pthread_join(worker[i], NULL); +			if (ret) +				err(EXIT_FAILURE, "pthread_join"); +		} + +	} + +	/* cleanup & report results */ +	pthread_cond_destroy(&thread_parent); +	pthread_cond_destroy(&thread_worker); +	pthread_mutex_destroy(&thread_lock); +	pthread_attr_destroy(&thread_attr); + +	print_summary(); + +	free(worker); +	return ret; +err: +	usage_with_options(bench_futex_requeue_usage, options); +	exit(EXIT_FAILURE); +} diff --git a/tools/perf/bench/futex-wake.c b/tools/perf/bench/futex-wake.c new file mode 100644 index 00000000000..d096169b161 --- /dev/null +++ b/tools/perf/bench/futex-wake.c @@ -0,0 +1,201 @@ +/* + * Copyright (C) 2013  Davidlohr Bueso <davidlohr@hp.com> + * + * futex-wake: Block a bunch of threads on a futex and wake'em up, N at a time. + * + * This program is particularly useful to measure the latency of nthread wakeups + * in non-error situations:  all waiters are queued and all wake calls wakeup + * one or more tasks, and thus the waitqueue is never empty. + */ + +#include "../perf.h" +#include "../util/util.h" +#include "../util/stat.h" +#include "../util/parse-options.h" +#include "../util/header.h" +#include "bench.h" +#include "futex.h" + +#include <err.h> +#include <stdlib.h> +#include <sys/time.h> +#include <pthread.h> + +/* all threads will block on the same futex */ +static u_int32_t futex1 = 0; + +/* + * How many wakeups to do at a time. + * Default to 1 in order to make the kernel work more. + */ +static unsigned int nwakes = 1; + +/* + * There can be significant variance from run to run, + * the more repeats, the more exact the overall avg and + * the better idea of the futex latency. 
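The per-run measurements below feed perf's running-stats helpers (util/stat.h). A self-contained sketch of the same scheme, with local names rather than the perf API, shows how repeats turn into the reported avg +- relative stddev:

#include <math.h>
#include <stdio.h>

struct rstats { double n, mean, M2; };	/* Welford's online mean/variance */

static void rstats_update(struct rstats *s, double val)
{
	double delta = val - s->mean;

	s->n += 1.0;
	s->mean += delta / s->n;
	s->M2 += delta * (val - s->mean);
}

int main(void)
{
	struct rstats s = { 0.0, 0.0, 0.0 };
	double runs[] = { 310.0, 295.0, 330.0, 301.0 };	/* hypothetical usecs per run */
	unsigned int i;

	for (i = 0; i < sizeof(runs)/sizeof(runs[0]); i++)
		rstats_update(&s, runs[i]);

	/* sample stddev, printed relative to the mean as in print_summary() */
	printf("%.4f ms (+-%.2f%%)\n", s.mean / 1e3,
	       100.0 * sqrt(s.M2 / (s.n - 1.0)) / s.mean);
	return 0;
}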
+ */ +static unsigned int repeat = 10; + +pthread_t *worker; +static bool done = 0, silent = 0; +static pthread_mutex_t thread_lock; +static pthread_cond_t thread_parent, thread_worker; +static struct stats waketime_stats, wakeup_stats; +static unsigned int ncpus, threads_starting, nthreads = 0; + +static const struct option options[] = { +	OPT_UINTEGER('t', "threads", &nthreads, "Specify amount of threads"), +	OPT_UINTEGER('w', "nwakes",  &nwakes,   "Specify amount of threads to wake at once"), +	OPT_UINTEGER('r', "repeat",  &repeat,   "Specify amount of times to repeat the run"), +	OPT_BOOLEAN( 's', "silent",  &silent,   "Silent mode: do not display data/details"), +	OPT_END() +}; + +static const char * const bench_futex_wake_usage[] = { +	"perf bench futex wake <options>", +	NULL +}; + +static void *workerfn(void *arg __maybe_unused) +{ +	pthread_mutex_lock(&thread_lock); +	threads_starting--; +	if (!threads_starting) +		pthread_cond_signal(&thread_parent); +	pthread_cond_wait(&thread_worker, &thread_lock); +	pthread_mutex_unlock(&thread_lock); + +	futex_wait(&futex1, 0, NULL, FUTEX_PRIVATE_FLAG); +	return NULL; +} + +static void print_summary(void) +{ +	double waketime_avg = avg_stats(&waketime_stats); +	double waketime_stddev = stddev_stats(&waketime_stats); +	unsigned int wakeup_avg = avg_stats(&wakeup_stats); + +	printf("Wokeup %d of %d threads in %.4f ms (+-%.2f%%)\n", +	       wakeup_avg, +	       nthreads, +	       waketime_avg/1e3, +	       rel_stddev_stats(waketime_stddev, waketime_avg)); +} + +static void block_threads(pthread_t *w, +			  pthread_attr_t thread_attr) +{ +	cpu_set_t cpu; +	unsigned int i; + +	threads_starting = nthreads; + +	/* create and block all threads */ +	for (i = 0; i < nthreads; i++) { +		CPU_ZERO(&cpu); +		CPU_SET(i % ncpus, &cpu); + +		if (pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpu)) +			err(EXIT_FAILURE, "pthread_attr_setaffinity_np"); + +		if (pthread_create(&w[i], &thread_attr, workerfn, NULL)) +			err(EXIT_FAILURE, "pthread_create"); +	} +} + +static void toggle_done(int sig __maybe_unused, +			siginfo_t *info __maybe_unused, +			void *uc __maybe_unused) +{ +	done = true; +} + +int bench_futex_wake(int argc, const char **argv, +		     const char *prefix __maybe_unused) +{ +	int ret = 0; +	unsigned int i, j; +	struct sigaction act; +	pthread_attr_t thread_attr; + +	argc = parse_options(argc, argv, options, bench_futex_wake_usage, 0); +	if (argc) { +		usage_with_options(bench_futex_wake_usage, options); +		exit(EXIT_FAILURE); +	} + +	ncpus = sysconf(_SC_NPROCESSORS_ONLN); + +	sigfillset(&act.sa_mask); +	act.sa_sigaction = toggle_done; +	sigaction(SIGINT, &act, NULL); + +	if (!nthreads) +		nthreads = ncpus; + +	worker = calloc(nthreads, sizeof(*worker)); +	if (!worker) +		err(EXIT_FAILURE, "calloc"); + +	printf("Run summary [PID %d]: blocking on %d threads (at futex %p), " +	       "waking up %d at a time.\n\n", +	       getpid(), nthreads, &futex1, nwakes); + +	init_stats(&wakeup_stats); +	init_stats(&waketime_stats); +	pthread_attr_init(&thread_attr); +	pthread_mutex_init(&thread_lock, NULL); +	pthread_cond_init(&thread_parent, NULL); +	pthread_cond_init(&thread_worker, NULL); + +	for (j = 0; j < repeat && !done; j++) { +		unsigned int nwoken = 0; +		struct timeval start, end, runtime; + +		/* create, launch & block all threads */ +		block_threads(worker, thread_attr); + +		/* make sure all threads are already blocked */ +		pthread_mutex_lock(&thread_lock); +		while (threads_starting) +			pthread_cond_wait(&thread_parent, 
&thread_lock); +		pthread_cond_broadcast(&thread_worker); +		pthread_mutex_unlock(&thread_lock); + +		usleep(100000); + +		/* Ok, all threads are patiently blocked, start waking folks up */ +		gettimeofday(&start, NULL); +		while (nwoken != nthreads) +			nwoken += futex_wake(&futex1, nwakes, FUTEX_PRIVATE_FLAG); +		gettimeofday(&end, NULL); +		timersub(&end, &start, &runtime); + +		update_stats(&wakeup_stats, nwoken); +		update_stats(&waketime_stats, runtime.tv_usec); + +		if (!silent) { +			printf("[Run %d]: Wokeup %d of %d threads in %.4f ms\n", +			       j + 1, nwoken, nthreads, runtime.tv_usec/1e3); +		} + +		for (i = 0; i < nthreads; i++) { +			ret = pthread_join(worker[i], NULL); +			if (ret) +				err(EXIT_FAILURE, "pthread_join"); +		} + +	} + +	/* cleanup & report results */ +	pthread_cond_destroy(&thread_parent); +	pthread_cond_destroy(&thread_worker); +	pthread_mutex_destroy(&thread_lock); +	pthread_attr_destroy(&thread_attr); + +	print_summary(); + +	free(worker); +	return ret; +} diff --git a/tools/perf/bench/futex.h b/tools/perf/bench/futex.h new file mode 100644 index 00000000000..71f2844cf97 --- /dev/null +++ b/tools/perf/bench/futex.h @@ -0,0 +1,71 @@ +/* + * Glibc independent futex library for testing kernel functionality. + * Shamelessly stolen from Darren Hart <dvhltc@us.ibm.com> + *    http://git.kernel.org/cgit/linux/kernel/git/dvhart/futextest.git/ + */ + +#ifndef _FUTEX_H +#define _FUTEX_H + +#include <unistd.h> +#include <sys/syscall.h> +#include <sys/types.h> +#include <linux/futex.h> + +/** + * futex() - SYS_futex syscall wrapper + * @uaddr:	address of first futex + * @op:		futex op code + * @val:	typically expected value of uaddr, but varies by op + * @timeout:	typically an absolute struct timespec (except where noted + *		otherwise). Overloaded by some ops + * @uaddr2:	address of second futex for some ops + * @val3:	varies by op + * @opflags:	flags to be bitwise OR'd with op, such as FUTEX_PRIVATE_FLAG + * + * futex() is used by all the following futex op wrappers. It can also be + * used for misuse and abuse testing. Generally, the specific op wrappers + * should be used instead. It is a macro instead of a static inline function as + * some of the types are overloaded (timeout is used for nr_requeue for + * example). + * + * These argument descriptions are the defaults for all + * like-named arguments in the following wrappers except where noted below.
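The overloading mentioned above is easy to see side by side (a sketch under the same headers, not part of the patch): the fourth SYS_futex argument is a struct timespec pointer for FUTEX_WAIT but a plain nr_requeue count for FUTEX_CMP_REQUEUE, so no single C prototype fits every op:

#include <linux/futex.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>

static u_int32_t f1, f2;

static void slot4_demo(void)
{
	struct timespec ts = { .tv_sec = 1, .tv_nsec = 0 };

	/* slot 4 carries a timeout pointer here... */
	syscall(SYS_futex, &f1, FUTEX_WAIT | FUTEX_PRIVATE_FLAG, 1, &ts, NULL, 0);

	/* ...and a plain integer count here */
	syscall(SYS_futex, &f1, FUTEX_CMP_REQUEUE | FUTEX_PRIVATE_FLAG, 1, 8, &f2, 0);
}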
+ */ +#define futex(uaddr, op, val, timeout, uaddr2, val3, opflags) \ +	syscall(SYS_futex, uaddr, op | opflags, val, timeout, uaddr2, val3) + +/** + * futex_wait() - block on uaddr with optional timeout + * @timeout:	relative timeout + */ +static inline int +futex_wait(u_int32_t *uaddr, u_int32_t val, struct timespec *timeout, int opflags) +{ +	return futex(uaddr, FUTEX_WAIT, val, timeout, NULL, 0, opflags); +} + +/** + * futex_wake() - wake one or more tasks blocked on uaddr + * @nr_wake:	wake up to this many tasks + */ +static inline int +futex_wake(u_int32_t *uaddr, int nr_wake, int opflags) +{ +	return futex(uaddr, FUTEX_WAKE, nr_wake, NULL, NULL, 0, opflags); +} + +/** +* futex_cmp_requeue() - requeue tasks from uaddr to uaddr2 +* @nr_wake:        wake up to this many tasks +* @nr_requeue:        requeue up to this many tasks +*/ +static inline int +futex_cmp_requeue(u_int32_t *uaddr, u_int32_t val, u_int32_t *uaddr2, int nr_wake, +		 int nr_requeue, int opflags) +{ +	return futex(uaddr, FUTEX_CMP_REQUEUE, nr_wake, nr_requeue, uaddr2, +		 val, opflags); +} + +#endif /* _FUTEX_H */ diff --git a/tools/perf/bench/mem-memcpy-arch.h b/tools/perf/bench/mem-memcpy-arch.h new file mode 100644 index 00000000000..57b4ed87145 --- /dev/null +++ b/tools/perf/bench/mem-memcpy-arch.h @@ -0,0 +1,12 @@ + +#ifdef HAVE_ARCH_X86_64_SUPPORT + +#define MEMCPY_FN(fn, name, desc)		\ +	extern void *fn(void *, const void *, size_t); + +#include "mem-memcpy-x86-64-asm-def.h" + +#undef MEMCPY_FN + +#endif + diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h new file mode 100644 index 00000000000..d66ab799b35 --- /dev/null +++ b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h @@ -0,0 +1,12 @@ + +MEMCPY_FN(__memcpy, +	"x86-64-unrolled", +	"unrolled memcpy() in arch/x86/lib/memcpy_64.S") + +MEMCPY_FN(memcpy_c, +	"x86-64-movsq", +	"movsq-based memcpy() in arch/x86/lib/memcpy_64.S") + +MEMCPY_FN(memcpy_c_e, +	"x86-64-movsb", +	"movsb-based memcpy() in arch/x86/lib/memcpy_64.S") diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm.S b/tools/perf/bench/mem-memcpy-x86-64-asm.S new file mode 100644 index 00000000000..fcd9cf00600 --- /dev/null +++ b/tools/perf/bench/mem-memcpy-x86-64-asm.S @@ -0,0 +1,12 @@ +#define memcpy MEMCPY /* don't hide glibc's memcpy() */ +#define altinstr_replacement text +#define globl p2align 4; .globl +#define Lmemcpy_c globl memcpy_c; memcpy_c +#define Lmemcpy_c_e globl memcpy_c_e; memcpy_c_e +#include "../../../arch/x86/lib/memcpy_64.S" +/* + * We need to provide note.GNU-stack section, saying that we want + * NOT executable stack. Otherwise the final linking will assume that + * the ELF stack should not be restricted at all and set it RWX. 
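mem-memcpy-arch.h and mem-memcpy-x86-64-asm-def.h above are a classic X-macro pair: the same MEMCPY_FN list is expanded once into extern prototypes and once into the routines[] table in mem-memcpy.c. A reduced sketch of the pattern (hypothetical list with only glibc's memcpy):

#include <stddef.h>
#include <string.h>

#define MEMCPY_LIST(FN) \
	FN(memcpy, "default", "glibc memcpy()")

struct routine {
	const char *name;
	const char *desc;
	void *(*fn)(void *, const void *, size_t);
};

/* one table entry per list row, just as mem-memcpy.c builds routines[] */
#define MEMCPY_FN(fn, name, desc) { name, desc, fn },
static struct routine routines[] = {
	MEMCPY_LIST(MEMCPY_FN)
	{ NULL, NULL, NULL }
};
#undef MEMCPY_FN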
+ */ +.section .note.GNU-stack,"",@progbits diff --git a/tools/perf/bench/mem-memcpy.c b/tools/perf/bench/mem-memcpy.c index 38dae746514..5ce71d3b72c 100644 --- a/tools/perf/bench/mem-memcpy.c +++ b/tools/perf/bench/mem-memcpy.c @@ -5,13 +5,13 @@   *   * Written by Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>   */ -#include <ctype.h>  #include "../perf.h"  #include "../util/util.h"  #include "../util/parse-options.h"  #include "../util/header.h"  #include "bench.h" +#include "mem-memcpy-arch.h"  #include <stdio.h>  #include <stdlib.h> @@ -23,30 +23,49 @@  static const char	*length_str	= "1MB";  static const char	*routine	= "default"; -static bool		use_clock	= false; -static int		clock_fd; +static int		iterations	= 1; +static bool		use_cycle; +static int		cycle_fd; +static bool		only_prefault; +static bool		no_prefault;  static const struct option options[] = {  	OPT_STRING('l', "length", &length_str, "1MB",  		    "Specify length of memory to copy. " -		    "available unit: B, MB, GB (upper and lower)"), +		    "Available units: B, KB, MB, GB and TB (upper and lower)"),  	OPT_STRING('r', "routine", &routine, "default",  		    "Specify routine to copy"), -	OPT_BOOLEAN('c', "clock", &use_clock, -		    "Use CPU clock for measuring"), +	OPT_INTEGER('i', "iterations", &iterations, +		    "repeat memcpy() invocation this number of times"), +	OPT_BOOLEAN('c', "cycle", &use_cycle, +		    "Use cycles event instead of gettimeofday() for measuring"), +	OPT_BOOLEAN('o', "only-prefault", &only_prefault, +		    "Show only the result with page faults before memcpy()"), +	OPT_BOOLEAN('n', "no-prefault", &no_prefault, +		    "Show only the result without page faults before memcpy()"),  	OPT_END()  }; +typedef void *(*memcpy_t)(void *, const void *, size_t); +  struct routine {  	const char *name;  	const char *desc; -	void * (*fn)(void *dst, const void *src, size_t len); +	memcpy_t fn;  };  struct routine routines[] = {  	{ "default",  	  "Default memcpy() provided by glibc",  	  memcpy }, +#ifdef HAVE_ARCH_X86_64_SUPPORT + +#define MEMCPY_FN(fn, name, desc) { name, desc, fn }, +#include "mem-memcpy-x86-64-asm-def.h" +#undef MEMCPY_FN + +#endif +  	{ NULL,  	  NULL,  	  NULL   } @@ -57,27 +76,27 @@ static const char * const bench_mem_memcpy_usage[] = {  	NULL  }; -static struct perf_event_attr clock_attr = { +static struct perf_event_attr cycle_attr = {  	.type		= PERF_TYPE_HARDWARE,  	.config		= PERF_COUNT_HW_CPU_CYCLES  }; -static void init_clock(void) +static void init_cycle(void)  { -	clock_fd = sys_perf_event_open(&clock_attr, getpid(), -1, -1, 0); +	cycle_fd = sys_perf_event_open(&cycle_attr, getpid(), -1, -1, 0); -	if (clock_fd < 0 && errno == ENOSYS) +	if (cycle_fd < 0 && errno == ENOSYS)  		die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");  	else -		BUG_ON(clock_fd < 0); +		BUG_ON(cycle_fd < 0);  } -static u64 get_clock(void) +static u64 get_cycle(void)  {  	int ret;  	u64 clk; -	ret = read(clock_fd, &clk, sizeof(u64)); +	ret = read(cycle_fd, &clk, sizeof(u64));  	BUG_ON(ret != sizeof(u64));  	return clk; @@ -89,29 +108,104 @@ static double timeval2double(struct timeval *ts)  		(double)ts->tv_usec / (double)1000000;  } -int bench_mem_memcpy(int argc, const char **argv, -		     const char *prefix __used) +static void alloc_mem(void **dst, void **src, size_t length) +{ +	*dst = zalloc(length); +	if (!*dst) +		die("memory allocation failed - maybe length is too large?\n"); + +	*src = zalloc(length); +	if (!*src) +		die("memory allocation failed - maybe length is too large?\n"); +	/* Make 
sure to always replace the zero pages even if MMAP_THRESH is crossed */ +	memset(*src, 0, length); +} + +static u64 do_memcpy_cycle(memcpy_t fn, size_t len, bool prefault) +{ +	u64 cycle_start = 0ULL, cycle_end = 0ULL; +	void *src = NULL, *dst = NULL;  	int i; -	void *dst, *src; -	size_t length; -	double bps = 0.0; + +	alloc_mem(&dst, &src, len); + +	if (prefault) +		fn(dst, src, len); + +	cycle_start = get_cycle(); +	for (i = 0; i < iterations; ++i) +		fn(dst, src, len); +	cycle_end = get_cycle(); + +	free(src); +	free(dst); +	return cycle_end - cycle_start; +} + +static double do_memcpy_gettimeofday(memcpy_t fn, size_t len, bool prefault) +{  	struct timeval tv_start, tv_end, tv_diff; -	u64 clock_start, clock_end, clock_diff; +	void *src = NULL, *dst = NULL; +	int i; + +	alloc_mem(&dst, &src, len); + +	if (prefault) +		fn(dst, src, len); + +	BUG_ON(gettimeofday(&tv_start, NULL)); +	for (i = 0; i < iterations; ++i) +		fn(dst, src, len); +	BUG_ON(gettimeofday(&tv_end, NULL)); + +	timersub(&tv_end, &tv_start, &tv_diff); + +	free(src); +	free(dst); +	return (double)((double)len / timeval2double(&tv_diff)); +} + +#define pf (no_prefault ? 0 : 1) + +#define print_bps(x) do {					\ +		if (x < K)					\ +			printf(" %14lf B/Sec", x);		\ +		else if (x < K * K)				\ +			printf(" %14lf KB/Sec", x / K);	\ +		else if (x < K * K * K)				\ +			printf(" %14lf MB/Sec", x / K / K);	\ +		else						\ +			printf(" %14lf GB/Sec", x / K / K / K); \ +	} while (0) + +int bench_mem_memcpy(int argc, const char **argv, +		     const char *prefix __maybe_unused) +{ +	int i; +	size_t len; +	double result_bps[2]; +	u64 result_cycle[2]; -	clock_start = clock_end = clock_diff = 0ULL;  	argc = parse_options(argc, argv, options,  			     bench_mem_memcpy_usage, 0); -	tv_diff.tv_sec = 0; -	tv_diff.tv_usec = 0; -	length = (size_t)perf_atoll((char *)length_str); +	if (use_cycle) +		init_cycle(); -	if ((s64)length <= 0) { +	len = (size_t)perf_atoll((char *)length_str); + +	result_cycle[0] = result_cycle[1] = 0ULL; +	result_bps[0] = result_bps[1] = 0.0; + +	if ((s64)len <= 0) {  		fprintf(stderr, "Invalid length:%s\n", length_str);  		return 1;  	} +	/* the same as specifying neither prefault nor no-prefault */ +	if (only_prefault && no_prefault) +		only_prefault = no_prefault = false; +  	for (i = 0; routines[i].name; i++) {  		if (!strcmp(routines[i].name, routine))  			break; @@ -126,61 +220,80 @@ int bench_mem_memcpy(int argc, const char **argv,  		return 1;  	} -	dst = zalloc(length); -	if (!dst) -		die("memory allocation failed - maybe length is too large?\n"); - -	src = zalloc(length); -	if (!src) -		die("memory allocation failed - maybe length is too large?\n"); - -	if (bench_format == BENCH_FORMAT_DEFAULT) { -		printf("# Copying %s Bytes from %p to %p ...\n\n", -		       length_str, src, dst); -	} - -	if (use_clock) { -		init_clock(); -		clock_start = get_clock(); -	} else { -		BUG_ON(gettimeofday(&tv_start, NULL)); -	} +	if (bench_format == BENCH_FORMAT_DEFAULT) +		printf("# Copying %s Bytes ...\n\n", length_str); -	routines[i].fn(dst, src, length); - -	if (use_clock) { -		clock_end = get_clock(); -		clock_diff = clock_end - clock_start; +	if (!only_prefault && !no_prefault) { +		/* show both results */ +		if (use_cycle) { +			result_cycle[0] = +				do_memcpy_cycle(routines[i].fn, len, false); +			result_cycle[1] = +				do_memcpy_cycle(routines[i].fn, len, true); +		} else { +			result_bps[0] = +				do_memcpy_gettimeofday(routines[i].fn, +						len, false); +			result_bps[1] = +				
do_memcpy_gettimeofday(routines[i].fn, +						len, true); +		}  	} else { -		BUG_ON(gettimeofday(&tv_end, NULL)); -		timersub(&tv_end, &tv_start, &tv_diff); -		bps = (double)((double)length / timeval2double(&tv_diff)); +		if (use_cycle) { +			result_cycle[pf] = +				do_memcpy_cycle(routines[i].fn, +						len, only_prefault); +		} else { +			result_bps[pf] = +				do_memcpy_gettimeofday(routines[i].fn, +						len, only_prefault); +		}  	}  	switch (bench_format) {  	case BENCH_FORMAT_DEFAULT: -		if (use_clock) { -			printf(" %14lf Clock/Byte\n", -			       (double)clock_diff / (double)length); -		} else { -			if (bps < K) -				printf(" %14lf B/Sec\n", bps); -			else if (bps < K * K) -				printf(" %14lfd KB/Sec\n", bps / 1024); -			else if (bps < K * K * K) -				printf(" %14lf MB/Sec\n", bps / 1024 / 1024); -			else { -				printf(" %14lf GB/Sec\n", -				       bps / 1024 / 1024 / 1024); +		if (!only_prefault && !no_prefault) { +			if (use_cycle) { +				printf(" %14lf Cycle/Byte\n", +					(double)result_cycle[0] +					/ (double)len); +				printf(" %14lf Cycle/Byte (with prefault)\n", +					(double)result_cycle[1] +					/ (double)len); +			} else { +				print_bps(result_bps[0]); +				printf("\n"); +				print_bps(result_bps[1]); +				printf(" (with prefault)\n");  			} +		} else { +			if (use_cycle) { +				printf(" %14lf Cycle/Byte", +					(double)result_cycle[pf] +					/ (double)len); +			} else +				print_bps(result_bps[pf]); + +			printf("%s\n", only_prefault ? " (with prefault)" : "");  		}  		break;  	case BENCH_FORMAT_SIMPLE: -		if (use_clock) { -			printf("%14lf\n", -			       (double)clock_diff / (double)length); -		} else -			printf("%lf\n", bps); +		if (!only_prefault && !no_prefault) { +			if (use_cycle) { +				printf("%lf %lf\n", +					(double)result_cycle[0] / (double)len, +					(double)result_cycle[1] / (double)len); +			} else { +				printf("%lf %lf\n", +					result_bps[0], result_bps[1]); +			} +		} else { +			if (use_cycle) { +				printf("%lf\n", (double)result_cycle[pf] +					/ (double)len); +			} else +				printf("%lf\n", result_bps[pf]); +		}  		break;  	default:  		/* reaching this means there's some disaster: */ diff --git a/tools/perf/bench/mem-memset-arch.h b/tools/perf/bench/mem-memset-arch.h new file mode 100644 index 00000000000..633800cb0dc --- /dev/null +++ b/tools/perf/bench/mem-memset-arch.h @@ -0,0 +1,12 @@ + +#ifdef HAVE_ARCH_X86_64_SUPPORT + +#define MEMSET_FN(fn, name, desc)		\ +	extern void *fn(void *, int, size_t); + +#include "mem-memset-x86-64-asm-def.h" + +#undef MEMSET_FN + +#endif + diff --git a/tools/perf/bench/mem-memset-x86-64-asm-def.h b/tools/perf/bench/mem-memset-x86-64-asm-def.h new file mode 100644 index 00000000000..a71dff97c1f --- /dev/null +++ b/tools/perf/bench/mem-memset-x86-64-asm-def.h @@ -0,0 +1,12 @@ + +MEMSET_FN(__memset, +	"x86-64-unrolled", +	"unrolled memset() in arch/x86/lib/memset_64.S") + +MEMSET_FN(memset_c, +	"x86-64-stosq", +	"stosq-based memset() in arch/x86/lib/memset_64.S") + +MEMSET_FN(memset_c_e, +	"x86-64-stosb", +	"stosb-based memset() in arch/x86/lib/memset_64.S") diff --git a/tools/perf/bench/mem-memset-x86-64-asm.S b/tools/perf/bench/mem-memset-x86-64-asm.S new file mode 100644 index 00000000000..9e5af89ed13 --- /dev/null +++ b/tools/perf/bench/mem-memset-x86-64-asm.S @@ -0,0 +1,13 @@ +#define memset MEMSET /* don't hide glibc's memset() */ +#define altinstr_replacement text +#define globl p2align 4; .globl +#define Lmemset_c globl memset_c; memset_c +#define Lmemset_c_e globl memset_c_e; memset_c_e +#include
"../../../arch/x86/lib/memset_64.S" + +/* + * We need to provide note.GNU-stack section, saying that we want + * NOT executable stack. Otherwise the final linking will assume that + * the ELF stack should not be restricted at all and set it RWX. + */ +.section .note.GNU-stack,"",@progbits diff --git a/tools/perf/bench/mem-memset.c b/tools/perf/bench/mem-memset.c new file mode 100644 index 00000000000..9af79d2b18e --- /dev/null +++ b/tools/perf/bench/mem-memset.c @@ -0,0 +1,297 @@ +/* + * mem-memset.c + * + * memset: Simple memory set in various ways + * + * Trivial clone of mem-memcpy.c. + */ + +#include "../perf.h" +#include "../util/util.h" +#include "../util/parse-options.h" +#include "../util/header.h" +#include "bench.h" +#include "mem-memset-arch.h" + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/time.h> +#include <errno.h> + +#define K 1024 + +static const char	*length_str	= "1MB"; +static const char	*routine	= "default"; +static int		iterations	= 1; +static bool		use_cycle; +static int		cycle_fd; +static bool		only_prefault; +static bool		no_prefault; + +static const struct option options[] = { +	OPT_STRING('l', "length", &length_str, "1MB", +		    "Specify length of memory to set. " +		    "Available units: B, KB, MB, GB and TB (upper and lower)"), +	OPT_STRING('r', "routine", &routine, "default", +		    "Specify routine to set"), +	OPT_INTEGER('i', "iterations", &iterations, +		    "repeat memset() invocation this number of times"), +	OPT_BOOLEAN('c', "cycle", &use_cycle, +		    "Use cycles event instead of gettimeofday() for measuring"), +	OPT_BOOLEAN('o', "only-prefault", &only_prefault, +		    "Show only the result with page faults before memset()"), +	OPT_BOOLEAN('n', "no-prefault", &no_prefault, +		    "Show only the result without page faults before memset()"), +	OPT_END() +}; + +typedef void *(*memset_t)(void *, int, size_t); + +struct routine { +	const char *name; +	const char *desc; +	memset_t fn; +}; + +static const struct routine routines[] = { +	{ "default", +	  "Default memset() provided by glibc", +	  memset }, +#ifdef HAVE_ARCH_X86_64_SUPPORT + +#define MEMSET_FN(fn, name, desc) { name, desc, fn }, +#include "mem-memset-x86-64-asm-def.h" +#undef MEMSET_FN + +#endif + +	{ NULL, +	  NULL, +	  NULL   } +}; + +static const char * const bench_mem_memset_usage[] = { +	"perf bench mem memset <options>", +	NULL +}; + +static struct perf_event_attr cycle_attr = { +	.type		= PERF_TYPE_HARDWARE, +	.config		= PERF_COUNT_HW_CPU_CYCLES +}; + +static void init_cycle(void) +{ +	cycle_fd = sys_perf_event_open(&cycle_attr, getpid(), -1, -1, 0); + +	if (cycle_fd < 0 && errno == ENOSYS) +		die("No CONFIG_PERF_EVENTS=y kernel support configured?\n"); +	else +		BUG_ON(cycle_fd < 0); +} + +static u64 get_cycle(void) +{ +	int ret; +	u64 clk; + +	ret = read(cycle_fd, &clk, sizeof(u64)); +	BUG_ON(ret != sizeof(u64)); + +	return clk; +} + +static double timeval2double(struct timeval *ts) +{ +	return (double)ts->tv_sec + +		(double)ts->tv_usec / (double)1000000; +} + +static void alloc_mem(void **dst, size_t length) +{ +	*dst = zalloc(length); +	if (!*dst) +		die("memory allocation failed - maybe length is too large?\n"); +} + +static u64 do_memset_cycle(memset_t fn, size_t len, bool prefault) +{ +	u64 cycle_start = 0ULL, cycle_end = 0ULL; +	void *dst = NULL; +	int i; + +	alloc_mem(&dst, len); + +	if (prefault) +		fn(dst, -1, len); + +	cycle_start = get_cycle(); +	for (i = 0; i < iterations; ++i) +		fn(dst, i, len); +	cycle_end = get_cycle(); + +	free(dst); +	
return cycle_end - cycle_start; +} + +static double do_memset_gettimeofday(memset_t fn, size_t len, bool prefault) +{ +	struct timeval tv_start, tv_end, tv_diff; +	void *dst = NULL; +	int i; + +	alloc_mem(&dst, len); + +	if (prefault) +		fn(dst, -1, len); + +	BUG_ON(gettimeofday(&tv_start, NULL)); +	for (i = 0; i < iterations; ++i) +		fn(dst, i, len); +	BUG_ON(gettimeofday(&tv_end, NULL)); + +	timersub(&tv_end, &tv_start, &tv_diff); + +	free(dst); +	return (double)((double)len / timeval2double(&tv_diff)); +} + +#define pf (no_prefault ? 0 : 1) + +#define print_bps(x) do {					\ +		if (x < K)					\ +			printf(" %14lf B/Sec", x);		\ +		else if (x < K * K)				\ +			printf(" %14lf KB/Sec", x / K);	\ +		else if (x < K * K * K)				\ +			printf(" %14lf MB/Sec", x / K / K);	\ +		else						\ +			printf(" %14lf GB/Sec", x / K / K / K); \ +	} while (0) + +int bench_mem_memset(int argc, const char **argv, +		     const char *prefix __maybe_unused) +{ +	int i; +	size_t len; +	double result_bps[2]; +	u64 result_cycle[2]; + +	argc = parse_options(argc, argv, options, +			     bench_mem_memset_usage, 0); + +	if (use_cycle) +		init_cycle(); + +	len = (size_t)perf_atoll((char *)length_str); + +	result_cycle[0] = result_cycle[1] = 0ULL; +	result_bps[0] = result_bps[1] = 0.0; + +	if ((s64)len <= 0) { +		fprintf(stderr, "Invalid length:%s\n", length_str); +		return 1; +	} + +	/* the same as specifying neither prefault nor no-prefault */ +	if (only_prefault && no_prefault) +		only_prefault = no_prefault = false; + +	for (i = 0; routines[i].name; i++) { +		if (!strcmp(routines[i].name, routine)) +			break; +	} +	if (!routines[i].name) { +		printf("Unknown routine:%s\n", routine); +		printf("Available routines...\n"); +		for (i = 0; routines[i].name; i++) { +			printf("\t%s ... %s\n", +			       routines[i].name, routines[i].desc); +		} +		return 1; +	} + +	if (bench_format == BENCH_FORMAT_DEFAULT) +		printf("# Setting %s Bytes ...\n\n", length_str); + +	if (!only_prefault && !no_prefault) { +		/* show both results */ +		if (use_cycle) { +			result_cycle[0] = +				do_memset_cycle(routines[i].fn, len, false); +			result_cycle[1] = +				do_memset_cycle(routines[i].fn, len, true); +		} else { +			result_bps[0] = +				do_memset_gettimeofday(routines[i].fn, +						len, false); +			result_bps[1] = +				do_memset_gettimeofday(routines[i].fn, +						len, true); +		} +	} else { +		if (use_cycle) { +			result_cycle[pf] = +				do_memset_cycle(routines[i].fn, +						len, only_prefault); +		} else { +			result_bps[pf] = +				do_memset_gettimeofday(routines[i].fn, +						len, only_prefault); +		} +	} + +	switch (bench_format) { +	case BENCH_FORMAT_DEFAULT: +		if (!only_prefault && !no_prefault) { +			if (use_cycle) { +				printf(" %14lf Cycle/Byte\n", +					(double)result_cycle[0] +					/ (double)len); +				printf(" %14lf Cycle/Byte (with prefault)\n", +					(double)result_cycle[1] +					/ (double)len); +			} else { +				print_bps(result_bps[0]); +				printf("\n"); +				print_bps(result_bps[1]); +				printf(" (with prefault)\n"); +			} +		} else { +			if (use_cycle) { +				printf(" %14lf Cycle/Byte", +					(double)result_cycle[pf] +					/ (double)len); +			} else +				print_bps(result_bps[pf]); + +			printf("%s\n", only_prefault ?
" (with prefault)" : ""); +		} +		break; +	case BENCH_FORMAT_SIMPLE: +		if (!only_prefault && !no_prefault) { +			if (use_cycle) { +				printf("%lf %lf\n", +					(double)result_cycle[0] / (double)len, +					(double)result_cycle[1] / (double)len); +			} else { +				printf("%lf %lf\n", +					result_bps[0], result_bps[1]); +			} +		} else { +			if (use_cycle) { +				printf("%lf\n", (double)result_cycle[pf] +					/ (double)len); +			} else +				printf("%lf\n", result_bps[pf]); +		} +		break; +	default: +		/* reaching this means there's some disaster: */ +		die("unknown format: %d\n", bench_format); +		break; +	} + +	return 0; +} diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c new file mode 100644 index 00000000000..ebfa163b80b --- /dev/null +++ b/tools/perf/bench/numa.c @@ -0,0 +1,1744 @@ +/* + * numa.c + * + * numa: Simulate NUMA-sensitive workload and measure their NUMA performance + */ + +#include "../perf.h" +#include "../builtin.h" +#include "../util/util.h" +#include "../util/parse-options.h" + +#include "bench.h" + +#include <errno.h> +#include <sched.h> +#include <stdio.h> +#include <assert.h> +#include <malloc.h> +#include <signal.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <pthread.h> +#include <sys/mman.h> +#include <sys/time.h> +#include <sys/wait.h> +#include <sys/prctl.h> +#include <sys/types.h> + +#include <numa.h> +#include <numaif.h> + +/* + * Regular printout to the terminal, supressed if -q is specified: + */ +#define tprintf(x...) do { if (g && g->p.show_details >= 0) printf(x); } while (0) + +/* + * Debug printf: + */ +#define dprintf(x...) do { if (g && g->p.show_details >= 1) printf(x); } while (0) + +struct thread_data { +	int			curr_cpu; +	cpu_set_t		bind_cpumask; +	int			bind_node; +	u8			*process_data; +	int			process_nr; +	int			thread_nr; +	int			task_nr; +	unsigned int		loops_done; +	u64			val; +	u64			runtime_ns; +	pthread_mutex_t		*process_lock; +}; + +/* Parameters set by options: */ + +struct params { +	/* Startup synchronization: */ +	bool			serialize_startup; + +	/* Task hierarchy: */ +	int			nr_proc; +	int			nr_threads; + +	/* Working set sizes: */ +	const char		*mb_global_str; +	const char		*mb_proc_str; +	const char		*mb_proc_locked_str; +	const char		*mb_thread_str; + +	double			mb_global; +	double			mb_proc; +	double			mb_proc_locked; +	double			mb_thread; + +	/* Access patterns to the working set: */ +	bool			data_reads; +	bool			data_writes; +	bool			data_backwards; +	bool			data_zero_memset; +	bool			data_rand_walk; +	u32			nr_loops; +	u32			nr_secs; +	u32			sleep_usecs; + +	/* Working set initialization: */ +	bool			init_zero; +	bool			init_random; +	bool			init_cpu0; + +	/* Misc options: */ +	int			show_details; +	int			run_all; +	int			thp; + +	long			bytes_global; +	long			bytes_process; +	long			bytes_process_locked; +	long			bytes_thread; + +	int			nr_tasks; +	bool			show_quiet; + +	bool			show_convergence; +	bool			measure_convergence; + +	int			perturb_secs; +	int			nr_cpus; +	int			nr_nodes; + +	/* Affinity options -C and -N: */ +	char			*cpu_list_str; +	char			*node_list_str; +}; + + +/* Global, read-writable area, accessible to all processes and threads: */ + +struct global_info { +	u8			*data; + +	pthread_mutex_t		startup_mutex; +	int			nr_tasks_started; + +	pthread_mutex_t		startup_done_mutex; + +	pthread_mutex_t		start_work_mutex; +	int			nr_tasks_working; + +	pthread_mutex_t		stop_work_mutex; +	u64			bytes_done; + +	struct thread_data	*threads; + +	/* Convergence latency measurement: */ +	
bool			all_converged; +	bool			stop_work; + +	int			print_once; + +	struct params		p; +}; + +static struct global_info	*g = NULL; + +static int parse_cpus_opt(const struct option *opt, const char *arg, int unset); +static int parse_nodes_opt(const struct option *opt, const char *arg, int unset); + +struct params p0; + +static const struct option options[] = { +	OPT_INTEGER('p', "nr_proc"	, &p0.nr_proc,		"number of processes"), +	OPT_INTEGER('t', "nr_threads"	, &p0.nr_threads,	"number of threads per process"), + +	OPT_STRING('G', "mb_global"	, &p0.mb_global_str,	"MB", "global  memory (MBs)"), +	OPT_STRING('P', "mb_proc"	, &p0.mb_proc_str,	"MB", "process memory (MBs)"), +	OPT_STRING('L', "mb_proc_locked", &p0.mb_proc_locked_str,"MB", "process serialized/locked memory access (MBs), <= process_memory"), +	OPT_STRING('T', "mb_thread"	, &p0.mb_thread_str,	"MB", "thread  memory (MBs)"), + +	OPT_UINTEGER('l', "nr_loops"	, &p0.nr_loops,		"max number of loops to run"), +	OPT_UINTEGER('s', "nr_secs"	, &p0.nr_secs,		"max number of seconds to run"), +	OPT_UINTEGER('u', "usleep"	, &p0.sleep_usecs,	"usecs to sleep per loop iteration"), + +	OPT_BOOLEAN('R', "data_reads"	, &p0.data_reads,	"access the data via reads (can be mixed with -W)"), +	OPT_BOOLEAN('W', "data_writes"	, &p0.data_writes,	"access the data via writes (can be mixed with -R)"), +	OPT_BOOLEAN('B', "data_backwards", &p0.data_backwards,	"access the data backwards as well"), +	OPT_BOOLEAN('Z', "data_zero_memset", &p0.data_zero_memset,"access the data via glibc bzero only"), +	OPT_BOOLEAN('r', "data_rand_walk", &p0.data_rand_walk,	"access the data with random (32bit LFSR) walk"), + + +	OPT_BOOLEAN('z', "init_zero"	, &p0.init_zero,	"bzero the initial allocations"), +	OPT_BOOLEAN('I', "init_random"	, &p0.init_random,	"randomize the contents of the initial allocations"), +	OPT_BOOLEAN('0', "init_cpu0"	, &p0.init_cpu0,	"do the initial allocations on CPU#0"), +	OPT_INTEGER('x', "perturb_secs", &p0.perturb_secs,	"perturb thread 0/0 every X secs, to test convergence stability"), + +	OPT_INCR   ('d', "show_details"	, &p0.show_details,	"Show details"), +	OPT_INCR   ('a', "all"		, &p0.run_all,		"Run all tests in the suite"), +	OPT_INTEGER('H', "thp"		, &p0.thp,		"MADV_NOHUGEPAGE < 0 < MADV_HUGEPAGE"), +	OPT_BOOLEAN('c', "show_convergence", &p0.show_convergence, "show convergence details"), +	OPT_BOOLEAN('m', "measure_convergence",	&p0.measure_convergence, "measure convergence latency"), +	OPT_BOOLEAN('q', "quiet"	, &p0.show_quiet,	"quiet mode: suppress regular printouts"), +	OPT_BOOLEAN('S', "serialize-startup", &p0.serialize_startup,"serialize thread startup"), + +	/* Special option string parsing callbacks: */ +        OPT_CALLBACK('C', "cpus", NULL, "cpu[,cpu2,...cpuN]", +			"bind the first N tasks to these specific cpus (the rest is unbound)", +			parse_cpus_opt), +        OPT_CALLBACK('M', "memnodes", NULL, "node[,node2,...nodeN]", +			"bind the first N tasks to these specific memory nodes (the rest is unbound)", +			parse_nodes_opt), +	OPT_END() +}; + +static const char * const bench_numa_usage[] = { +	"perf bench numa <options>", +	NULL +}; + +static const char * const numa_usage[] = { +	"perf bench numa mem [<options>]", +	NULL +}; + +static cpu_set_t bind_to_cpu(int target_cpu) +{ +	cpu_set_t orig_mask, mask; +	int ret; + +	ret = sched_getaffinity(0, sizeof(orig_mask), &orig_mask); +	BUG_ON(ret); + +	CPU_ZERO(&mask); + +	if (target_cpu == -1) { +		int cpu; + +		for (cpu = 0; cpu < g->p.nr_cpus; cpu++) +			CPU_SET(cpu, &mask); +	} else { +		
BUG_ON(target_cpu < 0 || target_cpu >= g->p.nr_cpus); +		CPU_SET(target_cpu, &mask); +	} + +	ret = sched_setaffinity(0, sizeof(mask), &mask); +	BUG_ON(ret); + +	return orig_mask; +} + +static cpu_set_t bind_to_node(int target_node) +{ +	int cpus_per_node = g->p.nr_cpus/g->p.nr_nodes; +	cpu_set_t orig_mask, mask; +	int cpu; +	int ret; + +	BUG_ON(cpus_per_node*g->p.nr_nodes != g->p.nr_cpus); +	BUG_ON(!cpus_per_node); + +	ret = sched_getaffinity(0, sizeof(orig_mask), &orig_mask); +	BUG_ON(ret); + +	CPU_ZERO(&mask); + +	if (target_node == -1) { +		for (cpu = 0; cpu < g->p.nr_cpus; cpu++) +			CPU_SET(cpu, &mask); +	} else { +		int cpu_start = (target_node + 0) * cpus_per_node; +		int cpu_stop  = (target_node + 1) * cpus_per_node; + +		BUG_ON(cpu_stop > g->p.nr_cpus); + +		for (cpu = cpu_start; cpu < cpu_stop; cpu++) +			CPU_SET(cpu, &mask); +	} + +	ret = sched_setaffinity(0, sizeof(mask), &mask); +	BUG_ON(ret); + +	return orig_mask; +} + +static void bind_to_cpumask(cpu_set_t mask) +{ +	int ret; + +	ret = sched_setaffinity(0, sizeof(mask), &mask); +	BUG_ON(ret); +} + +static void mempol_restore(void) +{ +	int ret; + +	ret = set_mempolicy(MPOL_DEFAULT, NULL, g->p.nr_nodes-1); + +	BUG_ON(ret); +} + +static void bind_to_memnode(int node) +{ +	unsigned long nodemask; +	int ret; + +	if (node == -1) +		return; + +	BUG_ON(g->p.nr_nodes > (int)sizeof(nodemask)); +	nodemask = 1L << node; + +	ret = set_mempolicy(MPOL_BIND, &nodemask, sizeof(nodemask)*8); +	dprintf("binding to node %d, mask: %016lx => %d\n", node, nodemask, ret); + +	BUG_ON(ret); +} + +#define HPSIZE (2*1024*1024) + +#define set_taskname(fmt...)				\ +do {							\ +	char name[20];					\ +							\ +	snprintf(name, 20, fmt);			\ +	prctl(PR_SET_NAME, name);			\ +} while (0) + +static u8 *alloc_data(ssize_t bytes0, int map_flags, +		      int init_zero, int init_cpu0, int thp, int init_random) +{ +	cpu_set_t orig_mask; +	ssize_t bytes; +	u8 *buf; +	int ret; + +	if (!bytes0) +		return NULL; + +	/* Allocate and initialize all memory on CPU#0: */ +	if (init_cpu0) { +		orig_mask = bind_to_node(0); +		bind_to_memnode(0); +	} + +	bytes = bytes0 + HPSIZE; + +	buf = (void *)mmap(0, bytes, PROT_READ|PROT_WRITE, MAP_ANON|map_flags, -1, 0); +	BUG_ON(buf == (void *)-1); + +	if (map_flags == MAP_PRIVATE) { +		if (thp > 0) { +			ret = madvise(buf, bytes, MADV_HUGEPAGE); +			if (ret && !g->print_once) { +				g->print_once = 1; +				printf("WARNING: Could not enable THP - do: 'echo madvise > /sys/kernel/mm/transparent_hugepage/enabled'\n"); +			} +		} +		if (thp < 0) { +			ret = madvise(buf, bytes, MADV_NOHUGEPAGE); +			if (ret && !g->print_once) { +				g->print_once = 1; +				printf("WARNING: Could not disable THP: run a CONFIG_TRANSPARENT_HUGEPAGE kernel?\n"); +			} +		} +	} + +	if (init_zero) { +		bzero(buf, bytes); +	} else { +		/* Initialize random contents, different in each word: */ +		if (init_random) { +			u64 *wbuf = (void *)buf; +			long off = rand(); +			long i; + +			for (i = 0; i < bytes/8; i++) +				wbuf[i] = i + off; +		} +	} + +	/* Align to 2MB boundary: */ +	buf = (void *)(((unsigned long)buf + HPSIZE-1) & ~(HPSIZE-1)); + +	/* Restore affinity: */ +	if (init_cpu0) { +		bind_to_cpumask(orig_mask); +		mempol_restore(); +	} + +	return buf; +} + +static void free_data(void *data, ssize_t bytes) +{ +	int ret; + +	if (!data) +		return; + +	ret = munmap(data, bytes); +	BUG_ON(ret); +} + +/* + * Create a shared memory buffer that can be shared between processes, zeroed: + */ +static void * zalloc_shared_data(ssize_t bytes) +{ +	return 
alloc_data(bytes, MAP_SHARED, 1, g->p.init_cpu0,  g->p.thp, g->p.init_random); +} + +/* + * Create a shared memory buffer that can be shared between processes: + */ +static void * setup_shared_data(ssize_t bytes) +{ +	return alloc_data(bytes, MAP_SHARED, 0, g->p.init_cpu0,  g->p.thp, g->p.init_random); +} + +/* + * Allocate process-local memory - this will either be shared between + * threads of this process, or only be accessed by this thread: + */ +static void * setup_private_data(ssize_t bytes) +{ +	return alloc_data(bytes, MAP_PRIVATE, 0, g->p.init_cpu0,  g->p.thp, g->p.init_random); +} + +/* + * Return a process-shared (global) mutex: + */ +static void init_global_mutex(pthread_mutex_t *mutex) +{ +	pthread_mutexattr_t attr; + +	pthread_mutexattr_init(&attr); +	pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED); +	pthread_mutex_init(mutex, &attr); +} + +static int parse_cpu_list(const char *arg) +{ +	p0.cpu_list_str = strdup(arg); + +	dprintf("got CPU list: {%s}\n", p0.cpu_list_str); + +	return 0; +} + +static int parse_setup_cpu_list(void) +{ +	struct thread_data *td; +	char *str0, *str; +	int t; + +	if (!g->p.cpu_list_str) +		return 0; + +	dprintf("g->p.nr_tasks: %d\n", g->p.nr_tasks); + +	str0 = str = strdup(g->p.cpu_list_str); +	t = 0; + +	BUG_ON(!str); + +	tprintf("# binding tasks to CPUs:\n"); +	tprintf("#  "); + +	while (true) { +		int bind_cpu, bind_cpu_0, bind_cpu_1; +		char *tok, *tok_end, *tok_step, *tok_len, *tok_mul; +		int bind_len; +		int step; +		int mul; + +		tok = strsep(&str, ","); +		if (!tok) +			break; + +		tok_end = strstr(tok, "-"); + +		dprintf("\ntoken: {%s}, end: {%s}\n", tok, tok_end); +		if (!tok_end) { +			/* Single CPU specified: */ +			bind_cpu_0 = bind_cpu_1 = atol(tok); +		} else { +			/* CPU range specified (for example: "5-11"): */ +			bind_cpu_0 = atol(tok); +			bind_cpu_1 = atol(tok_end + 1); +		} + +		step = 1; +		tok_step = strstr(tok, "#"); +		if (tok_step) { +			step = atol(tok_step + 1); +			BUG_ON(step <= 0 || step >= g->p.nr_cpus); +		} + +		/* +		 * Mask length. +		 * Eg: "--cpus 8_4-16#4" means: '--cpus 8_4,12_4,16_4', +		 * where the _4 means the next 4 CPUs are allowed. 
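The token grammar described above is easiest to see expanded. A standalone sketch (not the parser itself) of how one "A_L-B#S" token turns into CPU/length pairs:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char tok[] = "8_4-16#4";	/* expands to 8_4,12_4,16_4 */
	int bind_len = 1, step = 1;
	char *p;

	if ((p = strstr(tok, "#"))) {	/* step suffix */
		step = atoi(p + 1);
		*p = '\0';
	}
	if ((p = strstr(tok, "_")))	/* mask length */
		bind_len = atoi(p + 1);

	int first = atoi(tok);
	int last = (p = strstr(tok, "-")) ? atoi(p + 1) : first;

	for (int cpu = first; cpu <= last; cpu += step)
		printf("bind_cpu %2d, next %d CPUs allowed\n", cpu, bind_len);
	return 0;
}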
+		 */
+		bind_len = 1;
+		tok_len = strstr(tok, "_");
+		if (tok_len) {
+			bind_len = atol(tok_len + 1);
+			BUG_ON(bind_len <= 0 || bind_len > g->p.nr_cpus);
+		}
+
+		/* Multiplier shortcut: "0x8" expands to "0,0,0,0,0,0,0,0" */
+		mul = 1;
+		tok_mul = strstr(tok, "x");
+		if (tok_mul) {
+			mul = atol(tok_mul + 1);
+			BUG_ON(mul <= 0);
+		}
+
+		dprintf("CPUs: %d_%d-%d#%dx%d\n", bind_cpu_0, bind_len, bind_cpu_1, step, mul);
+
+		if (bind_cpu_0 >= g->p.nr_cpus || bind_cpu_1 >= g->p.nr_cpus) {
+			printf("\nTest not applicable, system has only %d CPUs.\n", g->p.nr_cpus);
+			return -1;
+		}
+
+		BUG_ON(bind_cpu_0 < 0 || bind_cpu_1 < 0);
+		BUG_ON(bind_cpu_0 > bind_cpu_1);
+
+		for (bind_cpu = bind_cpu_0; bind_cpu <= bind_cpu_1; bind_cpu += step) {
+			int i;
+
+			for (i = 0; i < mul; i++) {
+				int cpu;
+
+				if (t >= g->p.nr_tasks) {
+					printf("\n# NOTE: ignoring bind CPUs starting at CPU#%d\n #", bind_cpu);
+					goto out;
+				}
+				td = g->threads + t;
+
+				if (t)
+					tprintf(",");
+				if (bind_len > 1) {
+					tprintf("%2d/%d", bind_cpu, bind_len);
+				} else {
+					tprintf("%2d", bind_cpu);
+				}
+
+				CPU_ZERO(&td->bind_cpumask);
+				for (cpu = bind_cpu; cpu < bind_cpu+bind_len; cpu++) {
+					BUG_ON(cpu < 0 || cpu >= g->p.nr_cpus);
+					CPU_SET(cpu, &td->bind_cpumask);
+				}
+				t++;
+			}
+		}
+	}
+out:
+
+	tprintf("\n");
+
+	if (t < g->p.nr_tasks)
+		printf("# NOTE: %d tasks bound, %d tasks unbound\n", t, g->p.nr_tasks - t);
+
+	free(str0);
+	return 0;
+}
+
+static int parse_cpus_opt(const struct option *opt __maybe_unused,
+			  const char *arg, int unset __maybe_unused)
+{
+	if (!arg)
+		return -1;
+
+	return parse_cpu_list(arg);
+}
+
+static int parse_node_list(const char *arg)
+{
+	p0.node_list_str = strdup(arg);
+
+	dprintf("got NODE list: {%s}\n", p0.node_list_str);
+
+	return 0;
+}
+
+static int parse_setup_node_list(void)
+{
+	struct thread_data *td;
+	char *str0, *str;
+	int t;
+
+	if (!g->p.node_list_str)
+		return 0;
+
+	dprintf("g->p.nr_tasks: %d\n", g->p.nr_tasks);
+
+	str0 = str = strdup(g->p.node_list_str);
+	t = 0;
+
+	BUG_ON(!str);
+
+	tprintf("# binding tasks to NODEs:\n");
+	tprintf("# ");
+
+	while (true) {
+		int bind_node, bind_node_0, bind_node_1;
+		char *tok, *tok_end, *tok_step, *tok_mul;
+		int step;
+		int mul;
+
+		tok = strsep(&str, ",");
+		if (!tok)
+			break;
+
+		tok_end = strstr(tok, "-");
+
+		dprintf("\ntoken: {%s}, end: {%s}\n", tok, tok_end);
+		if (!tok_end) {
+			/* Single NODE specified: */
+			bind_node_0 = bind_node_1 = atol(tok);
+		} else {
+			/* NODE range specified (for example: "5-11"): */
+			bind_node_0 = atol(tok);
+			bind_node_1 = atol(tok_end + 1);
+		}
+
+		step = 1;
+		tok_step = strstr(tok, "#");
+		if (tok_step) {
+			step = atol(tok_step + 1);
+			BUG_ON(step <= 0 || step >= g->p.nr_nodes);
+		}
+
+		/* Multiplier shortcut: "0x8" expands to "0,0,0,0,0,0,0,0" */
+		mul = 1;
+		tok_mul = strstr(tok, "x");
+		if (tok_mul) {
+			mul = atol(tok_mul + 1);
+			BUG_ON(mul <= 0);
+		}
+
+		dprintf("NODEs: %d-%d #%d\n", bind_node_0, bind_node_1, step);
+
+		if (bind_node_0 >= g->p.nr_nodes || bind_node_1 >= g->p.nr_nodes) {
+			printf("\nTest not applicable, system has only %d nodes.\n", g->p.nr_nodes);
+			return -1;
+		}
+
+		BUG_ON(bind_node_0 < 0 || bind_node_1 < 0);
+		BUG_ON(bind_node_0 > bind_node_1);
+
+		for (bind_node = bind_node_0; bind_node <= bind_node_1; bind_node += step) {
+			int i;
+
+			for (i = 0; i < mul; i++) {
+				if (t >= g->p.nr_tasks) {
+					printf("\n# NOTE: ignoring bind NODEs starting at NODE#%d\n", bind_node);
+					goto out;
+				}
+				td = g->threads + t;
+
+				if (!t)
+					tprintf(" %2d", bind_node);
+				else
+					tprintf(",%2d", bind_node);
+
+				td->bind_node = bind_node;
+				t++;
+			}
+		}
+	}
+out:
+
+	tprintf("\n");
+
+	if (t < g->p.nr_tasks)
+		printf("# NOTE: %d tasks mem-bound, %d tasks unbound\n", t, g->p.nr_tasks - t);
+
+	free(str0);
+	return 0;
+}
+
+static int parse_nodes_opt(const struct option *opt __maybe_unused,
+			  const char *arg, int unset __maybe_unused)
+{
+	if (!arg)
+		return -1;
+
+	return parse_node_list(arg);
+}
+
+#define BIT(x) (1ul << (x))
+
+/*
+ * Right-shifting 32-bit Galois LFSR step: with a nonzero seed it walks
+ * through a long pseudo-random sequence, which do_work() below uses to
+ * pick random chunks of the buffer to access:
+ */
+static inline uint32_t lfsr_32(uint32_t lfsr)
+{
+	const uint32_t taps = BIT(1) | BIT(5) | BIT(6) | BIT(31);
+	return (lfsr>>1) ^ ((0x0u - (lfsr & 0x1u)) & taps);
+}
+
+/*
+ * Make sure there's real data dependency to RAM (when read
+ * accesses are enabled), so the compiler, the CPU and the
+ * kernel (KSM, zero page, etc.) cannot optimize away RAM
+ * accesses:
+ */
+static inline u64 access_data(u64 *data __attribute__((unused)), u64 val)
+{
+	if (g->p.data_reads)
+		val += *data;
+	if (g->p.data_writes)
+		*data = val + 1;
+	return val;
+}
+
+/*
+ * The worker process does two types of work, a forward-going
+ * loop and a backward-going loop.
+ *
+ * We do this so that on multiprocessor systems we do not create
+ * a 'train' of processing, with highly synchronized processes,
+ * skewing the whole benchmark.
+ */
+static u64 do_work(u8 *__data, long bytes, int nr, int nr_max, int loop, u64 val)
+{
+	long words = bytes/sizeof(u64);
+	u64 *data = (void *)__data;
+	long chunk_0, chunk_1;
+	u64 *d0, *d, *d1;
+	long off;
+	long i;
+
+	BUG_ON(!data && words);
+	BUG_ON(data && !words);
+
+	if (!data)
+		return val;
+
+	/* Very simple memset() work variant: */
+	if (g->p.data_zero_memset && !g->p.data_rand_walk) {
+		bzero(data, bytes);
+		return val;
+	}
+
+	/* Spread out by PID/TID nr and by loop nr: */
+	chunk_0 = words/nr_max;
+	chunk_1 = words/g->p.nr_loops;
+	off = nr*chunk_0 + loop*chunk_1;
+
+	while (off >= words)
+		off -= words;
+
+	if (g->p.data_rand_walk) {
+		u32 lfsr = nr + loop + val;
+		int j;
+
+		for (i = 0; i < words/1024; i++) {
+			long start, end;
+
+			lfsr = lfsr_32(lfsr);
+
+			start = lfsr % words;
+			end = min(start + 1024, words-1);
+
+			if (g->p.data_zero_memset) {
+				bzero(data + start, (end-start) * sizeof(u64));
+			} else {
+				for (j = start; j < end; j++)
+					val = access_data(data + j, val);
+			}
+		}
+	} else if (!g->p.data_backwards || (nr + loop) & 1) {
+
+		d0 = data + off;
+		d  = data + off + 1;
+		d1 = data + words;
+
+		/* Process data forwards: */
+		for (;;) {
+			if (unlikely(d >= d1))
+				d = data;
+			if (unlikely(d == d0))
+				break;
+
+			val = access_data(d, val);
+
+			d++;
+		}
+	} else {
+		/* Process data backwards: */
+
+		d0 = data + off;
+		d  = data + off - 1;
+		d1 = data + words;
+
+		for (;;) {
+			if (unlikely(d < data))
+				d = data + words-1;
+			if (unlikely(d == d0))
+				break;
+
+			val = access_data(d, val);
+
+			d--;
+		}
+	}
+
+	return val;
+}
+
+static void update_curr_cpu(int task_nr, unsigned long bytes_worked)
+{
+	unsigned int cpu;
+
+	cpu = sched_getcpu();
+
+	g->threads[task_nr].curr_cpu = cpu;
+	prctl(0, bytes_worked);
+}
+
+#define MAX_NR_NODES	64
+
+/*
+ * Count the number of nodes a process's threads
+ * are spread out on.
+ *
+ * A count of 1 means that the process is compressed
+ * to a single node. A count of g->p.nr_nodes means it's
+ * spread out on the whole system.
+ */
+static int count_process_nodes(int process_nr)
+{
+	char node_present[MAX_NR_NODES] = { 0, };
+	int nodes;
+	int n, t;
+
+	for (t = 0; t < g->p.nr_threads; t++) {
+		struct thread_data *td;
+		int task_nr;
+		int node;
+
+		task_nr = process_nr*g->p.nr_threads + t;
+		td = g->threads + task_nr;
+
+		node = numa_node_of_cpu(td->curr_cpu);
+		if (node < 0) /* curr_cpu was likely still -1 */
+			return 0;
+
+		node_present[node] = 1;
+	}
+
+	nodes = 0;
+
+	for (n = 0; n < MAX_NR_NODES; n++)
+		nodes += node_present[n];
+
+	return nodes;
+}
+
+/*
+ * Count the number of distinct process-threads a node contains.
+ *
+ * A count of 1 means that the node contains only a single
+ * process. If all nodes on the system contain at most one
+ * process then we are well-converged.
+ */
+static int count_node_processes(int node)
+{
+	int processes = 0;
+	int t, p;
+
+	for (p = 0; p < g->p.nr_proc; p++) {
+		for (t = 0; t < g->p.nr_threads; t++) {
+			struct thread_data *td;
+			int task_nr;
+			int n;
+
+			task_nr = p*g->p.nr_threads + t;
+			td = g->threads + task_nr;
+
+			n = numa_node_of_cpu(td->curr_cpu);
+			if (n == node) {
+				processes++;
+				break;
+			}
+		}
+	}
+
+	return processes;
+}
+
+static void calc_convergence_compression(int *strong)
+{
+	unsigned int nodes_min, nodes_max;
+	int p;
+
+	nodes_min = -1;
+	nodes_max =  0;
+
+	for (p = 0; p < g->p.nr_proc; p++) {
+		unsigned int nodes = count_process_nodes(p);
+
+		nodes_min = min(nodes, nodes_min);
+		nodes_max = max(nodes, nodes_max);
+	}
+
+	/* Strong convergence: all threads compress on a single node: */
+	if (nodes_min == 1 && nodes_max == 1) {
+		*strong = 1;
+	} else {
+		*strong = 0;
+		tprintf(" {%d-%d}", nodes_min, nodes_max);
+	}
+}
+
+static void calc_convergence(double runtime_ns_max, double *convergence)
+{
+	unsigned int loops_done_min, loops_done_max;
+	int process_groups;
+	int nodes[MAX_NR_NODES];
+	int distance;
+	int nr_min;
+	int nr_max;
+	int strong;
+	int sum;
+	int nr;
+	int node;
+	int cpu;
+	int t;
+
+	if (!g->p.show_convergence && !g->p.measure_convergence)
+		return;
+
+	for (node = 0; node < g->p.nr_nodes; node++)
+		nodes[node] = 0;
+
+	loops_done_min = -1;
+	loops_done_max = 0;
+
+	for (t = 0; t < g->p.nr_tasks; t++) {
+		struct thread_data *td = g->threads + t;
+		unsigned int loops_done;
+
+		cpu = td->curr_cpu;
+
+		/* Not all threads have written it yet: */
+		if (cpu < 0)
+			continue;
+
+		node = numa_node_of_cpu(cpu);
+
+		nodes[node]++;
+
+		loops_done = td->loops_done;
+		loops_done_min = min(loops_done, loops_done_min);
+		loops_done_max = max(loops_done, loops_done_max);
+	}
+
+	nr_max = 0;
+	nr_min = g->p.nr_tasks;
+	sum = 0;
+
+	for (node = 0; node < g->p.nr_nodes; node++) {
+		nr = nodes[node];
+		nr_min = min(nr, nr_min);
+		nr_max = max(nr, nr_max);
+		sum += nr;
+	}
+	BUG_ON(nr_min > nr_max);
+
+	BUG_ON(sum > g->p.nr_tasks);
+
+	if (0 && (sum < g->p.nr_tasks))
+		return;
+
+	/*
+	 * Count the number of distinct process groups present
+	 * on nodes - when we are converged this will decrease
+	 * to g->p.nr_proc:
+	 */
+	process_groups = 0;
+
+	for (node = 0; node < g->p.nr_nodes; node++) {
+		int processes = count_node_processes(node);
+
+		nr = nodes[node];
+		tprintf(" %2d/%-2d", nr, processes);
+
+		process_groups += processes;
+	}
+
+	distance = nr_max - nr_min;
+
+	tprintf(" [%2d/%-2d]", distance, process_groups);
+
+	tprintf(" l:%3d-%-3d (%3d)",
+		loops_done_min, loops_done_max, loops_done_max-loops_done_min);
+
+	if (loops_done_min && loops_done_max) {
+		double skew = 1.0 - (double)loops_done_min/loops_done_max;
+
+		tprintf(" [%4.1f%%]", skew * 100.0);
+	}
+
+	calc_convergence_compression(&strong);
+
+	if (strong && process_groups == g->p.nr_proc) {
+		if (!*convergence) {
+			*convergence = runtime_ns_max;
+			tprintf(" (%6.1fs converged)\n", *convergence/1e9);
+			if (g->p.measure_convergence) {
+				g->all_converged = true;
+				g->stop_work = true;
+			}
+		}
+	} else {
+		if (*convergence) {
+			tprintf(" (%6.1fs de-converged)", runtime_ns_max/1e9);
+			*convergence = 0;
+		}
+		tprintf("\n");
+	}
+}
+
+static void show_summary(double runtime_ns_max, int l, double *convergence)
+{
+	tprintf("\r #  %5.1f%%  [%.1f mins]",
+		(double)(l+1)/g->p.nr_loops*100.0, runtime_ns_max/1e9 / 60.0);
+
+	calc_convergence(runtime_ns_max, convergence);
+
+	if (g->p.show_details >= 0)
+		fflush(stdout);
+}
+
+static void *worker_thread(void *__tdata)
+{
+	struct thread_data *td = __tdata;
+	struct timeval start0, start, stop, diff;
+	int process_nr = td->process_nr;
+	int thread_nr = td->thread_nr;
+	unsigned long last_perturbance;
+	int task_nr = td->task_nr;
+	int details = g->p.show_details;
+	int first_task, last_task;
+	double convergence = 0;
+	u64 val = td->val;
+	double runtime_ns_max;
+	u8 *global_data;
+	u8 *process_data;
+	u8 *thread_data;
+	u64 bytes_done;
+	long work_done;
+	u32 l;
+
+	bind_to_cpumask(td->bind_cpumask);
+	bind_to_memnode(td->bind_node);
+
+	set_taskname("thread %d/%d", process_nr, thread_nr);
+
+	global_data = g->data;
+	process_data = td->process_data;
+	thread_data = setup_private_data(g->p.bytes_thread);
+
+	bytes_done = 0;
+
+	last_task = 0;
+	if (process_nr == g->p.nr_proc-1 && thread_nr == g->p.nr_threads-1)
+		last_task = 1;
+
+	first_task = 0;
+	if (process_nr == 0 && thread_nr == 0)
+		first_task = 1;
+
+	if (details >= 2) {
+		printf("#  thread %2d / %2d global mem: %p, process mem: %p, thread mem: %p\n",
+			process_nr, thread_nr, global_data, process_data, thread_data);
+	}
+
+	if (g->p.serialize_startup) {
+		pthread_mutex_lock(&g->startup_mutex);
+		g->nr_tasks_started++;
+		pthread_mutex_unlock(&g->startup_mutex);
+
+		/* Here we will wait for the main process to start us all at once: */
+		pthread_mutex_lock(&g->start_work_mutex);
+		g->nr_tasks_working++;
+
+		/* The last one wakes the main process: */
+		if (g->nr_tasks_working == g->p.nr_tasks)
+			pthread_mutex_unlock(&g->startup_done_mutex);
+
+		pthread_mutex_unlock(&g->start_work_mutex);
+	}
+
+	gettimeofday(&start0, NULL);
+
+	start = stop = start0;
+	last_perturbance = start.tv_sec;
+
+	for (l = 0; l < g->p.nr_loops; l++) {
+		start = stop;
+
+		if (g->stop_work)
+			break;
+
+		val += do_work(global_data,  g->p.bytes_global,  process_nr, g->p.nr_proc,	l, val);
+		val += do_work(process_data, g->p.bytes_process, thread_nr,  g->p.nr_threads,	l, val);
+		val += do_work(thread_data,  g->p.bytes_thread,  0,          1,		l, val);
+
+		if (g->p.sleep_usecs) {
+			pthread_mutex_lock(td->process_lock);
+			usleep(g->p.sleep_usecs);
+			pthread_mutex_unlock(td->process_lock);
+		}
+		/*
+		 * Amount of work to be done under a process-global lock:
+		 */
+		if (g->p.bytes_process_locked) {
+			pthread_mutex_lock(td->process_lock);
+			val += do_work(process_data, g->p.bytes_process_locked, thread_nr,  g->p.nr_threads,	l, val);
+			pthread_mutex_unlock(td->process_lock);
+		}
+
+		work_done = g->p.bytes_global + g->p.bytes_process +
+			    g->p.bytes_process_locked + g->p.bytes_thread;
+
+		update_curr_cpu(task_nr, work_done);
+		bytes_done += work_done;
+
+		if (details < 0 && !g->p.perturb_secs && !g->p.measure_convergence && !g->p.nr_secs)
+			continue;
+
+		td->loops_done = l;
+
+		gettimeofday(&stop, NULL);
+
+		/* Check whether our max runtime timed out: */
+		if (g->p.nr_secs) {
+			timersub(&stop, &start0, &diff);
+			if ((u32)diff.tv_sec >= g->p.nr_secs) {
+				g->stop_work = true;
+				break;
+			}
+		}
+
+		/* Update the summary at most once per second: */
+		if (start.tv_sec == stop.tv_sec)
+			continue;
+
+		/*
+		 * Perturb the first task's equilibrium every g->p.perturb_secs seconds,
+		 * by migrating to CPU#0:
+		 */
+		if (first_task && g->p.perturb_secs && (int)(stop.tv_sec - last_perturbance) >= g->p.perturb_secs) {
+			cpu_set_t orig_mask;
+			int target_cpu;
+			int this_cpu;
+
+			last_perturbance = stop.tv_sec;
+
+			/*
+			 * Depending on where we are running, move into
+			 * the other half of the system, to create some
+			 * real disturbance:
+			 */
+			this_cpu = g->threads[task_nr].curr_cpu;
+			if (this_cpu < g->p.nr_cpus/2)
+				target_cpu = g->p.nr_cpus-1;
+			else
+				target_cpu = 0;
+
+			orig_mask = bind_to_cpu(target_cpu);
+
+			/* Here we are running on the target CPU already */
+			if (details >= 1)
+				printf(" (injecting perturbance, moved to CPU#%d)\n", target_cpu);
+
+			bind_to_cpumask(orig_mask);
+		}
+
+		if (details >= 3) {
+			timersub(&stop, &start, &diff);
+			runtime_ns_max = diff.tv_sec * 1000000000ULL;
+			runtime_ns_max += diff.tv_usec * 1000ULL;
+
+			if (details >= 0) {
+				printf(" #%2d / %2d: %14.2lf nsecs/op [val: %016"PRIx64"]\n",
+					process_nr, thread_nr, runtime_ns_max / bytes_done, val);
+			}
+			fflush(stdout);
+		}
+		if (!last_task)
+			continue;
+
+		timersub(&stop, &start0, &diff);
+		runtime_ns_max = diff.tv_sec * 1000000000ULL;
+		runtime_ns_max += diff.tv_usec * 1000ULL;
+
+		show_summary(runtime_ns_max, l, &convergence);
+	}
+
+	gettimeofday(&stop, NULL);
+	timersub(&stop, &start0, &diff);
+	td->runtime_ns = diff.tv_sec * 1000000000ULL;
+	td->runtime_ns += diff.tv_usec * 1000ULL;
+
+	free_data(thread_data, g->p.bytes_thread);
+
+	pthread_mutex_lock(&g->stop_work_mutex);
+	g->bytes_done += bytes_done;
+	pthread_mutex_unlock(&g->stop_work_mutex);
+
+	return NULL;
+}
+
+/*
+ * A worker process starts a couple of threads:
+ */
+static void worker_process(int process_nr)
+{
+	pthread_mutex_t process_lock;
+	struct thread_data *td;
+	pthread_t *pthreads;
+	u8 *process_data;
+	int task_nr;
+	int ret;
+	int t;
+
+	pthread_mutex_init(&process_lock, NULL);
+	set_taskname("process %d", process_nr);
+
+	/*
+	 * Pick up the memory policy and the CPU binding of our first thread,
+	 * so that we initialize memory accordingly:
+	 */
+	task_nr = process_nr*g->p.nr_threads;
+	td = g->threads + task_nr;
+
+	bind_to_memnode(td->bind_node);
+	bind_to_cpumask(td->bind_cpumask);
+
+	pthreads = zalloc(g->p.nr_threads * sizeof(pthread_t));
+	process_data = setup_private_data(g->p.bytes_process);
+
+	if (g->p.show_details >= 3) {
+		printf(" # process %2d global mem: %p, process mem: %p\n",
+			process_nr, g->data, process_data);
+	}
+
+	for (t = 0; t < g->p.nr_threads; t++) {
+		task_nr = process_nr*g->p.nr_threads + t;
+		td = g->threads + task_nr;
+
+		td->process_data = process_data;
+		td->process_nr   = process_nr;
+		td->thread_nr    = t;
+		td->task_nr	 = task_nr;
+		td->val          = rand();
+		td->curr_cpu	 = -1;
+		td->process_lock = &process_lock;
+
+		ret = pthread_create(pthreads + t, NULL, worker_thread, td);
+		BUG_ON(ret);
+	}
+
+	for (t = 0; t < g->p.nr_threads; t++) {
+		ret =
pthread_join(pthreads[t], NULL); +		BUG_ON(ret); +	} + +	free_data(process_data, g->p.bytes_process); +	free(pthreads); +} + +static void print_summary(void) +{ +	if (g->p.show_details < 0) +		return; + +	printf("\n ###\n"); +	printf(" # %d %s will execute (on %d nodes, %d CPUs):\n", +		g->p.nr_tasks, g->p.nr_tasks == 1 ? "task" : "tasks", g->p.nr_nodes, g->p.nr_cpus); +	printf(" #      %5dx %5ldMB global  shared mem operations\n", +			g->p.nr_loops, g->p.bytes_global/1024/1024); +	printf(" #      %5dx %5ldMB process shared mem operations\n", +			g->p.nr_loops, g->p.bytes_process/1024/1024); +	printf(" #      %5dx %5ldMB thread  local  mem operations\n", +			g->p.nr_loops, g->p.bytes_thread/1024/1024); + +	printf(" ###\n"); + +	printf("\n ###\n"); fflush(stdout); +} + +static void init_thread_data(void) +{ +	ssize_t size = sizeof(*g->threads)*g->p.nr_tasks; +	int t; + +	g->threads = zalloc_shared_data(size); + +	for (t = 0; t < g->p.nr_tasks; t++) { +		struct thread_data *td = g->threads + t; +		int cpu; + +		/* Allow all nodes by default: */ +		td->bind_node = -1; + +		/* Allow all CPUs by default: */ +		CPU_ZERO(&td->bind_cpumask); +		for (cpu = 0; cpu < g->p.nr_cpus; cpu++) +			CPU_SET(cpu, &td->bind_cpumask); +	} +} + +static void deinit_thread_data(void) +{ +	ssize_t size = sizeof(*g->threads)*g->p.nr_tasks; + +	free_data(g->threads, size); +} + +static int init(void) +{ +	g = (void *)alloc_data(sizeof(*g), MAP_SHARED, 1, 0, 0 /* THP */, 0); + +	/* Copy over options: */ +	g->p = p0; + +	g->p.nr_cpus = numa_num_configured_cpus(); + +	g->p.nr_nodes = numa_max_node() + 1; + +	/* char array in count_process_nodes(): */ +	BUG_ON(g->p.nr_nodes > MAX_NR_NODES || g->p.nr_nodes < 0); + +	if (g->p.show_quiet && !g->p.show_details) +		g->p.show_details = -1; + +	/* Some memory should be specified: */ +	if (!g->p.mb_global_str && !g->p.mb_proc_str && !g->p.mb_thread_str) +		return -1; + +	if (g->p.mb_global_str) { +		g->p.mb_global = atof(g->p.mb_global_str); +		BUG_ON(g->p.mb_global < 0); +	} + +	if (g->p.mb_proc_str) { +		g->p.mb_proc = atof(g->p.mb_proc_str); +		BUG_ON(g->p.mb_proc < 0); +	} + +	if (g->p.mb_proc_locked_str) { +		g->p.mb_proc_locked = atof(g->p.mb_proc_locked_str); +		BUG_ON(g->p.mb_proc_locked < 0); +		BUG_ON(g->p.mb_proc_locked > g->p.mb_proc); +	} + +	if (g->p.mb_thread_str) { +		g->p.mb_thread = atof(g->p.mb_thread_str); +		BUG_ON(g->p.mb_thread < 0); +	} + +	BUG_ON(g->p.nr_threads <= 0); +	BUG_ON(g->p.nr_proc <= 0); + +	g->p.nr_tasks = g->p.nr_proc*g->p.nr_threads; + +	g->p.bytes_global		= g->p.mb_global	*1024L*1024L; +	g->p.bytes_process		= g->p.mb_proc		*1024L*1024L; +	g->p.bytes_process_locked	= g->p.mb_proc_locked	*1024L*1024L; +	g->p.bytes_thread		= g->p.mb_thread	*1024L*1024L; + +	g->data = setup_shared_data(g->p.bytes_global); + +	/* Startup serialization: */ +	init_global_mutex(&g->start_work_mutex); +	init_global_mutex(&g->startup_mutex); +	init_global_mutex(&g->startup_done_mutex); +	init_global_mutex(&g->stop_work_mutex); + +	init_thread_data(); + +	tprintf("#\n"); +	if (parse_setup_cpu_list() || parse_setup_node_list()) +		return -1; +	tprintf("#\n"); + +	print_summary(); + +	return 0; +} + +static void deinit(void) +{ +	free_data(g->data, g->p.bytes_global); +	g->data = NULL; + +	deinit_thread_data(); + +	free_data(g, sizeof(*g)); +	g = NULL; +} + +/* + * Print a short or long result, depending on the verbosity setting: + */ +static void print_res(const char *name, double val, +		      const char *txt_unit, const char *txt_short, const char *txt_long) +{ +	if 
(!name) +		name = "main,"; + +	if (g->p.show_quiet) +		printf(" %-30s %15.3f, %-15s %s\n", name, val, txt_unit, txt_short); +	else +		printf(" %14.3f %s\n", val, txt_long); +} + +static int __bench_numa(const char *name) +{ +	struct timeval start, stop, diff; +	u64 runtime_ns_min, runtime_ns_sum; +	pid_t *pids, pid, wpid; +	double delta_runtime; +	double runtime_avg; +	double runtime_sec_max; +	double runtime_sec_min; +	int wait_stat; +	double bytes; +	int i, t; + +	if (init()) +		return -1; + +	pids = zalloc(g->p.nr_proc * sizeof(*pids)); +	pid = -1; + +	/* All threads try to acquire it, this way we can wait for them to start up: */ +	pthread_mutex_lock(&g->start_work_mutex); + +	if (g->p.serialize_startup) { +		tprintf(" #\n"); +		tprintf(" # Startup synchronization: ..."); fflush(stdout); +	} + +	gettimeofday(&start, NULL); + +	for (i = 0; i < g->p.nr_proc; i++) { +		pid = fork(); +		dprintf(" # process %2d: PID %d\n", i, pid); + +		BUG_ON(pid < 0); +		if (!pid) { +			/* Child process: */ +			worker_process(i); + +			exit(0); +		} +		pids[i] = pid; + +	} +	/* Wait for all the threads to start up: */ +	while (g->nr_tasks_started != g->p.nr_tasks) +		usleep(1000); + +	BUG_ON(g->nr_tasks_started != g->p.nr_tasks); + +	if (g->p.serialize_startup) { +		double startup_sec; + +		pthread_mutex_lock(&g->startup_done_mutex); + +		/* This will start all threads: */ +		pthread_mutex_unlock(&g->start_work_mutex); + +		/* This mutex is locked - the last started thread will wake us: */ +		pthread_mutex_lock(&g->startup_done_mutex); + +		gettimeofday(&stop, NULL); + +		timersub(&stop, &start, &diff); + +		startup_sec = diff.tv_sec * 1000000000.0; +		startup_sec += diff.tv_usec * 1000.0; +		startup_sec /= 1e9; + +		tprintf(" threads initialized in %.6f seconds.\n", startup_sec); +		tprintf(" #\n"); + +		start = stop; +		pthread_mutex_unlock(&g->startup_done_mutex); +	} else { +		gettimeofday(&start, NULL); +	} + +	/* Parent process: */ + + +	for (i = 0; i < g->p.nr_proc; i++) { +		wpid = waitpid(pids[i], &wait_stat, 0); +		BUG_ON(wpid < 0); +		BUG_ON(!WIFEXITED(wait_stat)); + +	} + +	runtime_ns_sum = 0; +	runtime_ns_min = -1LL; + +	for (t = 0; t < g->p.nr_tasks; t++) { +		u64 thread_runtime_ns = g->threads[t].runtime_ns; + +		runtime_ns_sum += thread_runtime_ns; +		runtime_ns_min = min(thread_runtime_ns, runtime_ns_min); +	} + +	gettimeofday(&stop, NULL); +	timersub(&stop, &start, &diff); + +	BUG_ON(bench_format != BENCH_FORMAT_DEFAULT); + +	tprintf("\n ###\n"); +	tprintf("\n"); + +	runtime_sec_max = diff.tv_sec * 1000000000.0; +	runtime_sec_max += diff.tv_usec * 1000.0; +	runtime_sec_max /= 1e9; + +	runtime_sec_min = runtime_ns_min/1e9; + +	bytes = g->bytes_done; +	runtime_avg = (double)runtime_ns_sum / g->p.nr_tasks / 1e9; + +	if (g->p.measure_convergence) { +		print_res(name, runtime_sec_max, +			"secs,", "NUMA-convergence-latency", "secs latency to NUMA-converge"); +	} + +	print_res(name, runtime_sec_max, +		"secs,", "runtime-max/thread",	"secs slowest (max) thread-runtime"); + +	print_res(name, runtime_sec_min, +		"secs,", "runtime-min/thread",	"secs fastest (min) thread-runtime"); + +	print_res(name, runtime_avg, +		"secs,", "runtime-avg/thread",	"secs average thread-runtime"); + +	delta_runtime = (runtime_sec_max - runtime_sec_min)/2.0; +	print_res(name, delta_runtime / runtime_sec_max * 100.0, +		"%,", "spread-runtime/thread",	"% difference between max/avg runtime"); + +	print_res(name, bytes / g->p.nr_tasks / 1e9, +		"GB,", "data/thread",		"GB data processed, per thread"); + +	print_res(name, 
bytes / 1e9, +		"GB,", "data-total",		"GB data processed, total"); + +	print_res(name, runtime_sec_max * 1e9 / (bytes / g->p.nr_tasks), +		"nsecs,", "runtime/byte/thread","nsecs/byte/thread runtime"); + +	print_res(name, bytes / g->p.nr_tasks / 1e9 / runtime_sec_max, +		"GB/sec,", "thread-speed",	"GB/sec/thread speed"); + +	print_res(name, bytes / runtime_sec_max / 1e9, +		"GB/sec,", "total-speed",	"GB/sec total speed"); + +	free(pids); + +	deinit(); + +	return 0; +} + +#define MAX_ARGS 50 + +static int command_size(const char **argv) +{ +	int size = 0; + +	while (*argv) { +		size++; +		argv++; +	} + +	BUG_ON(size >= MAX_ARGS); + +	return size; +} + +static void init_params(struct params *p, const char *name, int argc, const char **argv) +{ +	int i; + +	printf("\n # Running %s \"perf bench numa", name); + +	for (i = 0; i < argc; i++) +		printf(" %s", argv[i]); + +	printf("\"\n"); + +	memset(p, 0, sizeof(*p)); + +	/* Initialize nonzero defaults: */ + +	p->serialize_startup		= 1; +	p->data_reads			= true; +	p->data_writes			= true; +	p->data_backwards		= true; +	p->data_rand_walk		= true; +	p->nr_loops			= -1; +	p->init_random			= true; +	p->mb_global_str		= "1"; +	p->nr_proc			= 1; +	p->nr_threads			= 1; +	p->nr_secs			= 5; +	p->run_all			= argc == 1; +} + +static int run_bench_numa(const char *name, const char **argv) +{ +	int argc = command_size(argv); + +	init_params(&p0, name, argc, argv); +	argc = parse_options(argc, argv, options, bench_numa_usage, 0); +	if (argc) +		goto err; + +	if (__bench_numa(name)) +		goto err; + +	return 0; + +err: +	return -1; +} + +#define OPT_BW_RAM		"-s",  "20", "-zZq",    "--thp", " 1", "--no-data_rand_walk" +#define OPT_BW_RAM_NOTHP	OPT_BW_RAM,		"--thp", "-1" + +#define OPT_CONV		"-s", "100", "-zZ0qcm", "--thp", " 1" +#define OPT_CONV_NOTHP		OPT_CONV,		"--thp", "-1" + +#define OPT_BW			"-s",  "20", "-zZ0q",   "--thp", " 1" +#define OPT_BW_NOTHP		OPT_BW,			"--thp", "-1" + +/* + * The built-in test-suite executed by "perf bench numa -a". + * + * (A minimum of 4 nodes and 16 GB of RAM is recommended.) 
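+ *
+ * Each entry is also runnable stand-alone; e.g. the "RAM-bw-local-2x"
+ * row below (shown here with the OPT_BW_RAM shorthand expanded, as an
+ * illustration) is equivalent to:
+ *
+ *   perf bench numa mem -p 2 -t 1 -P 1024 -C 0,2 -M 0x2 \
+ *                       -s 20 -zZq --thp 1 --no-data_rand_walk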
+ */
+static const char *tests[][MAX_ARGS] = {
+   /* Basic single-stream NUMA bandwidth measurements: */
+   { "RAM-bw-local,",	  "mem",  "-p",  "1",  "-t",  "1", "-P", "1024",
+			  "-C" ,   "0", "-M",   "0", OPT_BW_RAM },
+   { "RAM-bw-local-NOTHP,",
+			  "mem",  "-p",  "1",  "-t",  "1", "-P", "1024",
+			  "-C" ,   "0", "-M",   "0", OPT_BW_RAM_NOTHP },
+   { "RAM-bw-remote,",	  "mem",  "-p",  "1",  "-t",  "1", "-P", "1024",
+			  "-C" ,   "0", "-M",   "1", OPT_BW_RAM },
+
+   /* 2-stream NUMA bandwidth measurements: */
+   { "RAM-bw-local-2x,",  "mem",  "-p",  "2",  "-t",  "1", "-P", "1024",
+			   "-C", "0,2", "-M", "0x2", OPT_BW_RAM },
+   { "RAM-bw-remote-2x,", "mem",  "-p",  "2",  "-t",  "1", "-P", "1024",
+			   "-C", "0,2", "-M", "1x2", OPT_BW_RAM },
+
+   /* Cross-stream NUMA bandwidth measurement: */
+   { "RAM-bw-cross,",     "mem",  "-p",  "2",  "-t",  "1", "-P", "1024",
+			   "-C", "0,8", "-M", "1,0", OPT_BW_RAM },
+
+   /* Convergence latency measurements: */
+   { " 1x3-convergence,", "mem",  "-p",  "1", "-t",  "3", "-P",  "512", OPT_CONV },
+   { " 1x4-convergence,", "mem",  "-p",  "1", "-t",  "4", "-P",  "512", OPT_CONV },
+   { " 1x6-convergence,", "mem",  "-p",  "1", "-t",  "6", "-P", "1020", OPT_CONV },
+   { " 2x3-convergence,", "mem",  "-p",  "2", "-t",  "3", "-P", "1020", OPT_CONV },
+   { " 3x3-convergence,", "mem",  "-p",  "3", "-t",  "3", "-P", "1020", OPT_CONV },
+   { " 4x4-convergence,", "mem",  "-p",  "4", "-t",  "4", "-P",  "512", OPT_CONV },
+   { " 4x4-convergence-NOTHP,",
+			  "mem",  "-p",  "4", "-t",  "4", "-P",  "512", OPT_CONV_NOTHP },
+   { " 4x6-convergence,", "mem",  "-p",  "4", "-t",  "6", "-P", "1020", OPT_CONV },
+   { " 4x8-convergence,", "mem",  "-p",  "4", "-t",  "8", "-P",  "512", OPT_CONV },
+   { " 8x4-convergence,", "mem",  "-p",  "8", "-t",  "4", "-P",  "512", OPT_CONV },
+   { " 8x4-convergence-NOTHP,",
+			  "mem",  "-p",  "8", "-t",  "4", "-P",  "512", OPT_CONV_NOTHP },
+   { " 3x1-convergence,", "mem",  "-p",  "3", "-t",  "1", "-P",  "512", OPT_CONV },
+   { " 4x1-convergence,", "mem",  "-p",  "4", "-t",  "1", "-P",  "512", OPT_CONV },
+   { " 8x1-convergence,", "mem",  "-p",  "8", "-t",  "1", "-P",  "512", OPT_CONV },
+   { "16x1-convergence,", "mem",  "-p", "16", "-t",  "1", "-P",  "256", OPT_CONV },
+   { "32x1-convergence,", "mem",  "-p", "32", "-t",  "1", "-P",  "128", OPT_CONV },
+
+   /* Various NUMA process/thread layout bandwidth measurements: */
+   { " 2x1-bw-process,",  "mem",  "-p",  "2", "-t",  "1", "-P", "1024", OPT_BW },
+   { " 3x1-bw-process,",  "mem",  "-p",  "3", "-t",  "1", "-P", "1024", OPT_BW },
+   { " 4x1-bw-process,",  "mem",  "-p",  "4", "-t",  "1", "-P", "1024", OPT_BW },
+   { " 8x1-bw-process,",  "mem",  "-p",  "8", "-t",  "1", "-P", " 512", OPT_BW },
+   { " 8x1-bw-process-NOTHP,",
+			  "mem",  "-p",  "8", "-t",  "1", "-P", " 512", OPT_BW_NOTHP },
+   { "16x1-bw-process,",  "mem",  "-p", "16", "-t",  "1", "-P",  "256", OPT_BW },
+
+   { " 4x1-bw-thread,",	  "mem",  "-p",  "1", "-t",  "4", "-T",  "256", OPT_BW },
+   { " 8x1-bw-thread,",	  "mem",  "-p",  "1", "-t",  "8", "-T",  "256", OPT_BW },
+   { "16x1-bw-thread,",   "mem",  "-p",  "1", "-t", "16", "-T",  "128", OPT_BW },
+   { "32x1-bw-thread,",   "mem",  "-p",  "1", "-t", "32", "-T",   "64", OPT_BW },
+
+   { " 2x3-bw-thread,",	  "mem",  "-p",  "2", "-t",  "3", "-P",  "512", OPT_BW },
+   { " 4x4-bw-thread,",	  "mem",  "-p",  "4", "-t",  "4", "-P",  "512", OPT_BW },
+   { " 4x6-bw-thread,",	  "mem",  "-p",  "4", "-t",  "6", "-P",  "512",
OPT_BW }, +   { " 4x8-bw-thread,",	  "mem",  "-p",  "4", "-t",  "8", "-P",  "512", OPT_BW }, +   { " 4x8-bw-thread-NOTHP,", +			  "mem",  "-p",  "4", "-t",  "8", "-P",  "512", OPT_BW_NOTHP }, +   { " 3x3-bw-thread,",	  "mem",  "-p",  "3", "-t",  "3", "-P",  "512", OPT_BW }, +   { " 5x5-bw-thread,",	  "mem",  "-p",  "5", "-t",  "5", "-P",  "512", OPT_BW }, + +   { "2x16-bw-thread,",   "mem",  "-p",  "2", "-t", "16", "-P",  "512", OPT_BW }, +   { "1x32-bw-thread,",   "mem",  "-p",  "1", "-t", "32", "-P", "2048", OPT_BW }, + +   { "numa02-bw,",	  "mem",  "-p",  "1", "-t", "32", "-T",   "32", OPT_BW }, +   { "numa02-bw-NOTHP,",  "mem",  "-p",  "1", "-t", "32", "-T",   "32", OPT_BW_NOTHP }, +   { "numa01-bw-thread,", "mem",  "-p",  "2", "-t", "16", "-T",  "192", OPT_BW }, +   { "numa01-bw-thread-NOTHP,", +			  "mem",  "-p",  "2", "-t", "16", "-T",  "192", OPT_BW_NOTHP }, +}; + +static int bench_all(void) +{ +	int nr = ARRAY_SIZE(tests); +	int ret; +	int i; + +	ret = system("echo ' #'; echo ' # Running test on: '$(uname -a); echo ' #'"); +	BUG_ON(ret < 0); + +	for (i = 0; i < nr; i++) { +		run_bench_numa(tests[i][0], tests[i] + 1); +	} + +	printf("\n"); + +	return 0; +} + +int bench_numa(int argc, const char **argv, const char *prefix __maybe_unused) +{ +	init_params(&p0, "main,", argc, argv); +	argc = parse_options(argc, argv, options, bench_numa_usage, 0); +	if (argc) +		goto err; + +	if (p0.run_all) +		return bench_all(); + +	if (__bench_numa(NULL)) +		goto err; + +	return 0; + +err: +	usage_with_options(numa_usage, options); +	return -1; +} diff --git a/tools/perf/bench/sched-messaging.c b/tools/perf/bench/sched-messaging.c index d1d1b30f99c..cc1190a0849 100644 --- a/tools/perf/bench/sched-messaging.c +++ b/tools/perf/bench/sched-messaging.c @@ -267,7 +267,7 @@ static const char * const bench_sched_message_usage[] = {  };  int bench_sched_messaging(int argc, const char **argv, -		    const char *prefix __used) +		    const char *prefix __maybe_unused)  {  	unsigned int i, total_children;  	struct timeval start, stop, diff; diff --git a/tools/perf/bench/sched-pipe.c b/tools/perf/bench/sched-pipe.c index d9ab3ce446a..07a8d7646a1 100644 --- a/tools/perf/bench/sched-pipe.c +++ b/tools/perf/bench/sched-pipe.c @@ -7,9 +7,7 @@   * Based on pipe-test-1m.c by Ingo Molnar <mingo@redhat.com>   *  http://people.redhat.com/mingo/cfs-scheduler/tools/pipe-test-1m.c   * Ported to perf by Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp> - *   */ -  #include "../perf.h"  #include "../util/util.h"  #include "../util/parse-options.h" @@ -28,12 +26,24 @@  #include <sys/time.h>  #include <sys/types.h> +#include <pthread.h> + +struct thread_data { +	int			nr; +	int			pipe_read; +	int			pipe_write; +	pthread_t		pthread; +}; +  #define LOOPS_DEFAULT 1000000 -static int loops = LOOPS_DEFAULT; +static	int			loops = LOOPS_DEFAULT; + +/* Use processes by default: */ +static bool			threaded;  static const struct option options[] = { -	OPT_INTEGER('l', "loop", &loops, -		    "Specify number of loops"), +	OPT_INTEGER('l', "loop",	&loops,		"Specify number of loops"), +	OPT_BOOLEAN('T', "threaded",	&threaded,	"Specify threads/process based task setup"),  	OPT_END()  }; @@ -42,59 +52,106 @@ static const char * const bench_sched_pipe_usage[] = {  	NULL  }; -int bench_sched_pipe(int argc, const char **argv, -		     const char *prefix __used) +static void *worker_thread(void *__tdata)  { -	int pipe_1[2], pipe_2[2]; +	struct thread_data *td = __tdata;  	int m = 0, i; +	int ret; + +	for (i = 0; i < loops; i++) { +		if (!td->nr) { +			
ret = read(td->pipe_read, &m, sizeof(int)); +			BUG_ON(ret != sizeof(int)); +			ret = write(td->pipe_write, &m, sizeof(int)); +			BUG_ON(ret != sizeof(int)); +		} else { +			ret = write(td->pipe_write, &m, sizeof(int)); +			BUG_ON(ret != sizeof(int)); +			ret = read(td->pipe_read, &m, sizeof(int)); +			BUG_ON(ret != sizeof(int)); +		} +	} + +	return NULL; +} + +int bench_sched_pipe(int argc, const char **argv, const char *prefix __maybe_unused) +{ +	struct thread_data threads[2], *td; +	int pipe_1[2], pipe_2[2];  	struct timeval start, stop, diff;  	unsigned long long result_usec = 0; +	int nr_threads = 2; +	int t;  	/*  	 * why does "ret" exist?  	 * discarding returned value of read(), write()  	 * causes error in building environment for perf  	 */ -	int ret, wait_stat; -	pid_t pid, retpid; +	int __maybe_unused ret, wait_stat; +	pid_t pid, retpid __maybe_unused; -	argc = parse_options(argc, argv, options, -			     bench_sched_pipe_usage, 0); +	argc = parse_options(argc, argv, options, bench_sched_pipe_usage, 0); -	assert(!pipe(pipe_1)); -	assert(!pipe(pipe_2)); - -	pid = fork(); -	assert(pid >= 0); +	BUG_ON(pipe(pipe_1)); +	BUG_ON(pipe(pipe_2));  	gettimeofday(&start, NULL); -	if (!pid) { -		for (i = 0; i < loops; i++) { -			ret = read(pipe_1[0], &m, sizeof(int)); -			ret = write(pipe_2[1], &m, sizeof(int)); -		} -	} else { -		for (i = 0; i < loops; i++) { -			ret = write(pipe_1[1], &m, sizeof(int)); -			ret = read(pipe_2[0], &m, sizeof(int)); +	for (t = 0; t < nr_threads; t++) { +		td = threads + t; + +		td->nr = t; + +		if (t == 0) { +			td->pipe_read = pipe_1[0]; +			td->pipe_write = pipe_2[1]; +		} else { +			td->pipe_write = pipe_1[1]; +			td->pipe_read = pipe_2[0];  		}  	} -	gettimeofday(&stop, NULL); -	timersub(&stop, &start, &diff); -	if (pid) { +	if (threaded) { + +		for (t = 0; t < nr_threads; t++) { +			td = threads + t; + +			ret = pthread_create(&td->pthread, NULL, worker_thread, td); +			BUG_ON(ret); +		} + +		for (t = 0; t < nr_threads; t++) { +			td = threads + t; + +			ret = pthread_join(td->pthread, NULL); +			BUG_ON(ret); +		} + +	} else { +		pid = fork(); +		assert(pid >= 0); + +		if (!pid) { +			worker_thread(threads + 0); +			exit(0); +		} else { +			worker_thread(threads + 1); +		} +  		retpid = waitpid(pid, &wait_stat, 0);  		assert((retpid == pid) && WIFEXITED(wait_stat)); -	} else { -		exit(0);  	} +	gettimeofday(&stop, NULL); +	timersub(&stop, &start, &diff); +  	switch (bench_format) {  	case BENCH_FORMAT_DEFAULT: -		printf("# Executed %d pipe operations between two tasks\n\n", -			loops); +		printf("# Executed %d pipe operations between two %s\n\n", +			loops, threaded ? "threads" : "processes");  		result_usec = diff.tv_sec * 1000000;  		result_usec += diff.tv_usec;  | 
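The measurement the new sched-pipe code performs reduces to one integer bouncing between two tasks through a pair of pipes, with the loop count amortizing the gettimeofday() overhead. A minimal stand-alone sketch of the same ping-pong (an illustrative pipe-pingpong.c, not part of this patch):

/*
 * pipe-pingpong.c - stand-alone sketch of what "perf bench sched pipe"
 * measures (illustrative only): bounce one int between two processes.
 */
#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <unistd.h>

#define LOOPS 1000000

int main(void)
{
	struct timeval start, stop;
	int pipe_1[2], pipe_2[2];
	unsigned long long usecs;
	int m = 0, i;
	pid_t pid;

	assert(!pipe(pipe_1));
	assert(!pipe(pipe_2));

	pid = fork();
	assert(pid >= 0);

	gettimeofday(&start, NULL);

	for (i = 0; i < LOOPS; i++) {
		if (!pid) {
			/* Child: wait for the token, echo it back: */
			assert(read(pipe_1[0], &m, sizeof(int)) == sizeof(int));
			assert(write(pipe_2[1], &m, sizeof(int)) == sizeof(int));
		} else {
			/* Parent: send the token, wait for the echo: */
			assert(write(pipe_1[1], &m, sizeof(int)) == sizeof(int));
			assert(read(pipe_2[0], &m, sizeof(int)) == sizeof(int));
		}
	}

	if (!pid)
		exit(0);

	gettimeofday(&stop, NULL);
	waitpid(pid, NULL, 0);

	usecs  = (stop.tv_sec - start.tv_sec) * 1000000ULL;
	usecs += stop.tv_usec - start.tv_usec;

	/* One loop = one round trip: 2 pipe transfers, >= 2 context switches. */
	printf("%.3f usecs/op\n", (double)usecs / LOOPS);

	return 0;
}

Each iteration costs one round trip, so elapsed microseconds divided by LOOPS approximates the per-operation wakeup-plus-pipe latency that perf bench sched pipe reports.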
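Likewise, the random data walk in numa.c's do_work() is driven by the lfsr_32() helper earlier in this patch; the following sketch (a hypothetical demo using the same tap mask) shows how successive LFSR steps yield pseudo-random word offsets into the benchmark buffer:

#include <stdint.h>
#include <stdio.h>

#define BIT(x) (1ul << (x))

/* Same right-shifting Galois LFSR step as in numa.c: */
static inline uint32_t lfsr_32(uint32_t lfsr)
{
	const uint32_t taps = BIT(1) | BIT(5) | BIT(6) | BIT(31);

	return (lfsr >> 1) ^ ((0x0u - (lfsr & 0x1u)) & taps);
}

int main(void)
{
	uint32_t lfsr = 1;		/* any nonzero seed works */
	unsigned long words = 1024 * 1024;
	int i;

	/* Print the first few chunk offsets of the pseudo-random walk: */
	for (i = 0; i < 8; i++) {
		lfsr = lfsr_32(lfsr);
		printf("chunk %d starts at word %lu\n", i, lfsr % words);
	}

	return 0;
}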
