Diffstat (limited to 'arch/x86/um/shared/sysdep/faultinfo_32.h')
-rw-r--r--  arch/x86/um/shared/sysdep/faultinfo_32.h | 35
1 file changed, 35 insertions, 0 deletions
diff --git a/arch/x86/um/shared/sysdep/faultinfo_32.h b/arch/x86/um/shared/sysdep/faultinfo_32.h
new file mode 100644
index 00000000000..a26086b8a80
--- /dev/null
+++ b/arch/x86/um/shared/sysdep/faultinfo_32.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2004 Fujitsu Siemens Computers GmbH
+ * Author: Bodo Stroesser <bstroesser@fujitsu-siemens.com>
+ * Licensed under the GPL
+ */
+
+#ifndef __FAULTINFO_I386_H
+#define __FAULTINFO_I386_H
+
+/* this structure contains the full arch-specific faultinfo
+ * from the traps.
+ * On i386, ptrace_faultinfo unfortunately doesn't provide
+ * all the info, since trap_no is missing.
+ * All common elements are defined at the same position in
+ * both structures, thus making it easy to copy the
+ * contents without knowledge about the structure elements.
+ */
+struct faultinfo {
+ int error_code; /* in ptrace_faultinfo misleadingly called is_write */
+ unsigned long cr2; /* in ptrace_faultinfo called addr */
+ int trap_no; /* missing in ptrace_faultinfo */
+};
+
+#define FAULT_WRITE(fi) ((fi).error_code & 2)
+#define FAULT_ADDRESS(fi) ((fi).cr2)
+
+/* This is Page Fault */
+#define SEGV_IS_FIXABLE(fi) ((fi)->trap_no == 14)
+
+/* SKAS3 has no trap_no on i386, but get_skas_faultinfo() sets it to 0. */
+#define SEGV_MAYBE_FIXABLE(fi) ((fi)->trap_no == 0 && ptrace_faultinfo)
+
+#define PTRACE_FULL_FAULTINFO 0
+
+#endif
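
The accessors above are meant to be consumed by arch-independent fault handling code. As a usage illustration only (a sketch, not the actual arch/um handler), a consumer might look roughly like the following; the function name, the include line and the way the faultinfo is obtained are assumptions made for the example:

#include <stdbool.h>
#include "faultinfo_32.h"	/* illustrative; real code reaches this via the sysdep headers */

/*
 * Hypothetical consumer: decide whether a fault reported for the
 * traced process can be handled as an ordinary page fault.  Only
 * the accessors defined above are used; how 'fi' was filled in
 * (copied from ptrace_faultinfo or via get_skas_faultinfo()) is
 * deliberately left out of the sketch.
 */
static bool segv_is_page_fault(struct faultinfo fi)
{
	unsigned long address = FAULT_ADDRESS(fi);	/* cr2 */
	int is_write = FAULT_WRITE(fi);			/* bit 1 of error_code */

	if (!SEGV_IS_FIXABLE(&fi))	/* trap_no != 14: not a page fault */
		return false;

	/* real code would fix up the mapping for 'address',
	 * honouring 'is_write' ... */
	(void)address;
	(void)is_write;
	return true;
}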
-rw-r--r--tools/hv/hv_fcopy_daemon.c197
-rwxr-xr-xtools/hv/hv_get_dhcp_info.sh28
-rwxr-xr-xtools/hv/hv_get_dns_info.sh13
-rw-r--r--tools/hv/hv_kvp_daemon.c1742
-rwxr-xr-xtools/hv/hv_set_ifconfig.sh64
-rw-r--r--tools/hv/hv_vss_daemon.c267
-rw-r--r--tools/include/asm/bug.h (renamed from tools/perf/util/include/asm/bug.h)9
-rw-r--r--tools/include/linux/compiler.h40
-rw-r--r--tools/include/linux/export.h10
-rw-r--r--tools/include/linux/hash.h5
-rw-r--r--tools/include/linux/types.h75
-rw-r--r--tools/include/tools/be_byteshift.h70
-rw-r--r--tools/include/tools/le_byteshift.h70
-rw-r--r--tools/lguest/.gitignore1
-rw-r--r--tools/lguest/Makefile7
-rw-r--r--tools/lguest/extract58
-rw-r--r--tools/lguest/lguest.c2072
-rw-r--r--tools/lguest/lguest.txt125
-rw-r--r--tools/lib/api/Makefile44
-rw-r--r--tools/lib/api/fs/debugfs.c100
-rw-r--r--tools/lib/api/fs/debugfs.h29
-rw-r--r--tools/lib/api/fs/fs.c165
-rw-r--r--tools/lib/api/fs/fs.h14
-rw-r--r--tools/lib/lockdep/Makefile243
-rw-r--r--tools/lib/lockdep/common.c33
-rw-r--r--tools/lib/lockdep/include/liblockdep/common.h50
-rw-r--r--tools/lib/lockdep/include/liblockdep/mutex.h70
-rw-r--r--tools/lib/lockdep/include/liblockdep/rwlock.h86
-rwxr-xr-xtools/lib/lockdep/lockdep3
-rw-r--r--tools/lib/lockdep/lockdep.c2
-rw-r--r--tools/lib/lockdep/lockdep_internals.h1
-rw-r--r--tools/lib/lockdep/lockdep_states.h1
-rw-r--r--tools/lib/lockdep/preload.c445
-rw-r--r--tools/lib/lockdep/rbtree.c1
-rwxr-xr-xtools/lib/lockdep/run_tests.sh27
-rw-r--r--tools/lib/lockdep/tests/AA.c13
-rw-r--r--tools/lib/lockdep/tests/ABBA.c13
-rw-r--r--tools/lib/lockdep/tests/ABBCCA.c15
-rw-r--r--tools/lib/lockdep/tests/ABBCCDDA.c17
-rw-r--r--tools/lib/lockdep/tests/ABCABC.c15
-rw-r--r--tools/lib/lockdep/tests/ABCDBCDA.c17
-rw-r--r--tools/lib/lockdep/tests/ABCDBDDA.c17
-rw-r--r--tools/lib/lockdep/tests/WW.c13
-rw-r--r--tools/lib/lockdep/tests/common.h12
-rw-r--r--tools/lib/lockdep/tests/unlock_balance.c12
-rw-r--r--tools/lib/lockdep/uinclude/asm/hash.h6
-rw-r--r--tools/lib/lockdep/uinclude/asm/hweight.h3
-rw-r--r--tools/lib/lockdep/uinclude/asm/sections.h3
-rw-r--r--tools/lib/lockdep/uinclude/linux/bitops.h3
-rw-r--r--tools/lib/lockdep/uinclude/linux/compiler.h7
-rw-r--r--tools/lib/lockdep/uinclude/linux/debug_locks.h12
-rw-r--r--tools/lib/lockdep/uinclude/linux/delay.h3
-rw-r--r--tools/lib/lockdep/uinclude/linux/ftrace.h3
-rw-r--r--tools/lib/lockdep/uinclude/linux/gfp.h3
-rw-r--r--tools/lib/lockdep/uinclude/linux/hardirq.h11
-rw-r--r--tools/lib/lockdep/uinclude/linux/hash.h1
-rw-r--r--tools/lib/lockdep/uinclude/linux/interrupt.h3
-rw-r--r--tools/lib/lockdep/uinclude/linux/irqflags.h38
-rw-r--r--tools/lib/lockdep/uinclude/linux/kallsyms.h32
-rw-r--r--tools/lib/lockdep/uinclude/linux/kern_levels.h25
-rw-r--r--tools/lib/lockdep/uinclude/linux/kernel.h44
-rw-r--r--tools/lib/lockdep/uinclude/linux/kmemcheck.h8
-rw-r--r--tools/lib/lockdep/uinclude/linux/linkage.h3
-rw-r--r--tools/lib/lockdep/uinclude/linux/list.h1
-rw-r--r--tools/lib/lockdep/uinclude/linux/lockdep.h58
-rw-r--r--tools/lib/lockdep/uinclude/linux/module.h6
-rw-r--r--tools/lib/lockdep/uinclude/linux/mutex.h3
-rw-r--r--tools/lib/lockdep/uinclude/linux/poison.h1
-rw-r--r--tools/lib/lockdep/uinclude/linux/prefetch.h6
-rw-r--r--tools/lib/lockdep/uinclude/linux/proc_fs.h3
-rw-r--r--tools/lib/lockdep/uinclude/linux/rbtree.h1
-rw-r--r--tools/lib/lockdep/uinclude/linux/rbtree_augmented.h2
-rw-r--r--tools/lib/lockdep/uinclude/linux/rcu.h21
-rw-r--r--tools/lib/lockdep/uinclude/linux/seq_file.h3
-rw-r--r--tools/lib/lockdep/uinclude/linux/spinlock.h25
-rw-r--r--tools/lib/lockdep/uinclude/linux/stacktrace.h32
-rw-r--r--tools/lib/lockdep/uinclude/linux/stringify.h7
-rw-r--r--tools/lib/lockdep/uinclude/trace/events/lock.h3
-rw-r--r--tools/lib/symbol/kallsyms.c58
-rw-r--r--tools/lib/symbol/kallsyms.h24
-rw-r--r--tools/lib/traceevent/.gitignore1
-rw-r--r--tools/lib/traceevent/Makefile340
-rw-r--r--tools/lib/traceevent/event-parse.c6025
-rw-r--r--tools/lib/traceevent/event-parse.h944
-rw-r--r--tools/lib/traceevent/event-plugin.c416
-rw-r--r--tools/lib/traceevent/event-utils.h81
-rw-r--r--tools/lib/traceevent/kbuffer-parse.c732
-rw-r--r--tools/lib/traceevent/kbuffer.h67
-rw-r--r--tools/lib/traceevent/parse-filter.c2432
-rw-r--r--tools/lib/traceevent/parse-utils.c85
-rw-r--r--tools/lib/traceevent/plugin_cfg80211.c30
-rw-r--r--tools/lib/traceevent/plugin_function.c194
-rw-r--r--tools/lib/traceevent/plugin_hrtimer.c88
-rw-r--r--tools/lib/traceevent/plugin_jbd2.c77
-rw-r--r--tools/lib/traceevent/plugin_kmem.c94
-rw-r--r--tools/lib/traceevent/plugin_kvm.c465
-rw-r--r--tools/lib/traceevent/plugin_mac80211.c102
-rw-r--r--tools/lib/traceevent/plugin_sched_switch.c160
-rw-r--r--tools/lib/traceevent/plugin_scsi.c429
-rw-r--r--tools/lib/traceevent/plugin_xen.c136
-rw-r--r--tools/lib/traceevent/trace-seq.c249
-rw-r--r--tools/net/Makefile34
-rw-r--r--tools/net/bpf_asm.c52
-rw-r--r--tools/net/bpf_dbg.c1395
-rw-r--r--tools/net/bpf_exp.l144
-rw-r--r--tools/net/bpf_exp.y771
-rw-r--r--tools/net/bpf_jit_disasm.c197
-rwxr-xr-xtools/nfsd/inject_fault.sh49
-rw-r--r--tools/perf/.gitignore8
-rw-r--r--tools/perf/Documentation/Makefile177
-rw-r--r--tools/perf/Documentation/android.txt78
-rw-r--r--tools/perf/Documentation/examples.txt38
-rw-r--r--tools/perf/Documentation/jit-interface.txt15
-rw-r--r--tools/perf/Documentation/perf-annotate.txt80
-rw-r--r--tools/perf/Documentation/perf-archive.txt6
-rw-r--r--tools/perf/Documentation/perf-bench.txt108
-rw-r--r--tools/perf/Documentation/perf-buildid-cache.txt32
-rw-r--r--tools/perf/Documentation/perf-buildid-list.txt11
-rw-r--r--tools/perf/Documentation/perf-diff.txt173
-rw-r--r--tools/perf/Documentation/perf-evlist.txt38
-rw-r--r--tools/perf/Documentation/perf-inject.txt46
-rw-r--r--tools/perf/Documentation/perf-kmem.txt4
-rw-r--r--tools/perf/Documentation/perf-kvm.txt156
-rw-r--r--tools/perf/Documentation/perf-list.txt103
-rw-r--r--tools/perf/Documentation/perf-lock.txt43
-rw-r--r--tools/perf/Documentation/perf-mem.txt52
-rw-r--r--tools/perf/Documentation/perf-probe.txt88
-rw-r--r--tools/perf/Documentation/perf-record.txt137
-rw-r--r--tools/perf/Documentation/perf-report.txt274
-rw-r--r--tools/perf/Documentation/perf-sched.txt24
-rw-r--r--tools/perf/Documentation/perf-script-perl.txt (renamed from tools/perf/Documentation/perf-trace-perl.txt)31
-rw-r--r--tools/perf/Documentation/perf-script-python.txt (renamed from tools/perf/Documentation/perf-trace-python.txt)97
-rw-r--r--tools/perf/Documentation/perf-script.txt221
-rw-r--r--tools/perf/Documentation/perf-stat.txt113
-rw-r--r--tools/perf/Documentation/perf-test.txt32
-rw-r--r--tools/perf/Documentation/perf-timechart.txt56
-rw-r--r--tools/perf/Documentation/perf-top.txt130
-rw-r--r--tools/perf/Documentation/perf-trace.txt132
-rw-r--r--tools/perf/Documentation/perfconfig.example29
-rw-r--r--tools/perf/MANIFEST39
-rw-r--r--tools/perf/Makefile1199
-rw-r--r--tools/perf/Makefile.perf938
-rw-r--r--tools/perf/arch/arm/Makefile14
-rw-r--r--tools/perf/arch/arm/include/perf_regs.h59
-rw-r--r--tools/perf/arch/arm/tests/dwarf-unwind.c60
-rw-r--r--tools/perf/arch/arm/tests/regs_load.S58
-rw-r--r--tools/perf/arch/arm/util/dwarf-regs.c64
-rw-r--r--tools/perf/arch/arm/util/unwind-libdw.c36
-rw-r--r--tools/perf/arch/arm/util/unwind-libunwind.c48
-rw-r--r--tools/perf/arch/arm64/Makefile7
-rw-r--r--tools/perf/arch/arm64/include/perf_regs.h88
-rw-r--r--tools/perf/arch/arm64/util/dwarf-regs.c80
-rw-r--r--tools/perf/arch/arm64/util/unwind-libunwind.c82
-rw-r--r--tools/perf/arch/common.c211
-rw-r--r--tools/perf/arch/common.h10
-rw-r--r--tools/perf/arch/powerpc/Makefile5
-rw-r--r--tools/perf/arch/powerpc/util/dwarf-regs.c88
-rw-r--r--tools/perf/arch/powerpc/util/header.c36
-rw-r--r--tools/perf/arch/s390/Makefile4
-rw-r--r--tools/perf/arch/s390/util/dwarf-regs.c22
-rw-r--r--tools/perf/arch/sh/Makefile4
-rw-r--r--tools/perf/arch/sh/util/dwarf-regs.c55
-rw-r--r--tools/perf/arch/sparc/Makefile4
-rw-r--r--tools/perf/arch/sparc/util/dwarf-regs.c43
-rw-r--r--tools/perf/arch/x86/Makefile17
-rw-r--r--tools/perf/arch/x86/include/perf_regs.h86
-rw-r--r--tools/perf/arch/x86/tests/dwarf-unwind.c60
-rw-r--r--tools/perf/arch/x86/tests/regs_load.S98
-rw-r--r--tools/perf/arch/x86/util/dwarf-regs.c75
-rw-r--r--tools/perf/arch/x86/util/header.c59
-rw-r--r--tools/perf/arch/x86/util/tsc.c59
-rw-r--r--tools/perf/arch/x86/util/tsc.h20
-rw-r--r--tools/perf/arch/x86/util/unwind-libdw.c51
-rw-r--r--tools/perf/arch/x86/util/unwind-libunwind.c111
-rw-r--r--tools/perf/bench/bench.h32
-rw-r--r--tools/perf/bench/futex-hash.c212
-rw-r--r--tools/perf/bench/futex-requeue.c211
-rw-r--r--tools/perf/bench/futex-wake.c201
-rw-r--r--tools/perf/bench/futex.h71
-rw-r--r--tools/perf/bench/mem-memcpy-arch.h12
-rw-r--r--tools/perf/bench/mem-memcpy-x86-64-asm-def.h12
-rw-r--r--tools/perf/bench/mem-memcpy-x86-64-asm.S12
-rw-r--r--tools/perf/bench/mem-memcpy.c254
-rw-r--r--tools/perf/bench/mem-memset-arch.h12
-rw-r--r--tools/perf/bench/mem-memset-x86-64-asm-def.h12
-rw-r--r--tools/perf/bench/mem-memset-x86-64-asm.S13
-rw-r--r--tools/perf/bench/mem-memset.c297
-rw-r--r--tools/perf/bench/numa.c1744
-rw-r--r--tools/perf/bench/sched-messaging.c12
-rw-r--r--tools/perf/bench/sched-pipe.c123
-rw-r--r--tools/perf/builtin-annotate.c738
-rw-r--r--tools/perf/builtin-bench.c253
-rw-r--r--tools/perf/builtin-buildid-cache.c321
-rw-r--r--tools/perf/builtin-buildid-list.c105
-rw-r--r--tools/perf/builtin-diff.c1148
-rw-r--r--tools/perf/builtin-evlist.c66
-rw-r--r--tools/perf/builtin-help.c86
-rw-r--r--tools/perf/builtin-inject.c463
-rw-r--r--tools/perf/builtin-kmem.c365
-rw-r--r--tools/perf/builtin-kvm.c1737
-rw-r--r--tools/perf/builtin-list.c60
-rw-r--r--tools/perf/builtin-lock.c1111
-rw-r--r--tools/perf/builtin-mem.c246
-rw-r--r--tools/perf/builtin-probe.c590
-rw-r--r--tools/perf/builtin-record.c1264
-rw-r--r--tools/perf/builtin-report.c942
-rw-r--r--tools/perf/builtin-sched.c1664
-rw-r--r--tools/perf/builtin-script.c1826
-rw-r--r--tools/perf/builtin-stat.c1861
-rw-r--r--tools/perf/builtin-timechart.c1034
-rw-r--r--tools/perf/builtin-top.c1689
-rw-r--r--tools/perf/builtin-trace.c2599
-rw-r--r--tools/perf/builtin.h10
-rw-r--r--tools/perf/command-list.txt14
-rw-r--r--tools/perf/config/Makefile738
-rw-r--r--tools/perf/config/Makefile.arch23
-rw-r--r--tools/perf/config/feature-checks/.gitignore2
-rw-r--r--tools/perf/config/feature-checks/Makefile149
-rw-r--r--tools/perf/config/feature-checks/test-all.c116
-rw-r--r--tools/perf/config/feature-checks/test-backtrace.c13
-rw-r--r--tools/perf/config/feature-checks/test-bionic.c6
-rw-r--r--tools/perf/config/feature-checks/test-cplus-demangle.c14
-rw-r--r--tools/perf/config/feature-checks/test-dwarf.c10
-rw-r--r--tools/perf/config/feature-checks/test-fortify-source.c6
-rw-r--r--tools/perf/config/feature-checks/test-glibc.c8
-rw-r--r--tools/perf/config/feature-checks/test-gtk2-infobar.c11
-rw-r--r--tools/perf/config/feature-checks/test-gtk2.c10
-rw-r--r--tools/perf/config/feature-checks/test-hello.c6
-rw-r--r--tools/perf/config/feature-checks/test-libaudit.c10
-rw-r--r--tools/perf/config/feature-checks/test-libbfd.c15
-rw-r--r--tools/perf/config/feature-checks/test-libdw-dwarf-unwind.c13
-rw-r--r--tools/perf/config/feature-checks/test-libelf-getphdrnum.c8
-rw-r--r--tools/perf/config/feature-checks/test-libelf-mmap.c8
-rw-r--r--tools/perf/config/feature-checks/test-libelf.c8
-rw-r--r--tools/perf/config/feature-checks/test-libnuma.c9
-rw-r--r--tools/perf/config/feature-checks/test-libperl.c9
-rw-r--r--tools/perf/config/feature-checks/test-libpython-version.c10
-rw-r--r--tools/perf/config/feature-checks/test-libpython.c8
-rw-r--r--tools/perf/config/feature-checks/test-libslang.c6
-rw-r--r--tools/perf/config/feature-checks/test-libunwind-debug-frame.c16
-rw-r--r--tools/perf/config/feature-checks/test-libunwind.c27
-rw-r--r--tools/perf/config/feature-checks/test-stackprotector-all.c6
-rw-r--r--tools/perf/config/feature-checks/test-timerfd.c18
-rw-r--r--tools/perf/config/utilities.mak180
-rw-r--r--tools/perf/design.txt20
-rw-r--r--tools/perf/perf-archive.sh28
-rw-r--r--tools/perf/perf-completion.sh206
-rw-r--r--tools/perf/perf-sys.h190
-rw-r--r--tools/perf/perf.c211
-rw-r--r--tools/perf/perf.h152
-rwxr-xr-xtools/perf/python/twatch.py41
-rw-r--r--tools/perf/scripts/perl/Perf-Trace-Util/Context.c2
-rw-r--r--tools/perf/scripts/perl/Perf-Trace-Util/Context.xs2
-rw-r--r--tools/perf/scripts/perl/Perf-Trace-Util/README4
-rw-r--r--tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Context.pm2
-rw-r--r--tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Core.pm4
-rw-r--r--tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Util.pm10
-rw-r--r--tools/perf/scripts/perl/bin/check-perf-trace-record2
-rw-r--r--tools/perf/scripts/perl/bin/failed-syscalls-record2
-rw-r--r--tools/perf/scripts/perl/bin/failed-syscalls-report8
-rw-r--r--tools/perf/scripts/perl/bin/rw-by-file-record3
-rw-r--r--tools/perf/scripts/perl/bin/rw-by-file-report11
-rw-r--r--tools/perf/scripts/perl/bin/rw-by-pid-record2
-rw-r--r--tools/perf/scripts/perl/bin/rw-by-pid-report5
-rw-r--r--tools/perf/scripts/perl/bin/rwtop-record2
-rw-r--r--tools/perf/scripts/perl/bin/rwtop-report20
-rw-r--r--tools/perf/scripts/perl/bin/wakeup-latency-record2
-rw-r--r--tools/perf/scripts/perl/bin/wakeup-latency-report5
-rw-r--r--tools/perf/scripts/perl/bin/workqueue-stats-record2
-rw-r--r--tools/perf/scripts/perl/bin/workqueue-stats-report7
-rw-r--r--tools/perf/scripts/perl/check-perf-trace.pl2
-rw-r--r--tools/perf/scripts/perl/failed-syscalls.pl6
-rw-r--r--tools/perf/scripts/perl/rw-by-file.pl2
-rw-r--r--tools/perf/scripts/perl/rw-by-pid.pl60
-rw-r--r--tools/perf/scripts/perl/rwtop.pl203
-rw-r--r--tools/perf/scripts/perl/wakeup-latency.pl12
-rw-r--r--tools/perf/scripts/perl/workqueue-stats.pl129
-rw-r--r--tools/perf/scripts/python/Perf-Trace-Util/Context.c8
-rw-r--r--tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Core.py32
-rwxr-xr-xtools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/EventClass.py94
-rw-r--r--tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/SchedGui.py184
-rw-r--r--tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py63
-rw-r--r--tools/perf/scripts/python/bin/event_analyzing_sample-record8
-rw-r--r--tools/perf/scripts/python/bin/event_analyzing_sample-report3
-rw-r--r--tools/perf/scripts/python/bin/failed-syscalls-by-pid-record2
-rw-r--r--tools/perf/scripts/python/bin/failed-syscalls-by-pid-report8
-rw-r--r--tools/perf/scripts/python/bin/futex-contention-record2
-rw-r--r--tools/perf/scripts/python/bin/futex-contention-report4
-rwxr-xr-xtools/perf/scripts/python/bin/net_dropmonitor-record2
-rwxr-xr-xtools/perf/scripts/python/bin/net_dropmonitor-report4
-rw-r--r--tools/perf/scripts/python/bin/netdev-times-record8
-rw-r--r--tools/perf/scripts/python/bin/netdev-times-report5
-rw-r--r--tools/perf/scripts/python/bin/sched-migration-record2
-rw-r--r--tools/perf/scripts/python/bin/sched-migration-report3
-rw-r--r--tools/perf/scripts/python/bin/sctop-record2
-rw-r--r--tools/perf/scripts/python/bin/sctop-report24
-rw-r--r--tools/perf/scripts/python/bin/syscall-counts-by-pid-record2
-rw-r--r--tools/perf/scripts/python/bin/syscall-counts-by-pid-report8
-rw-r--r--tools/perf/scripts/python/bin/syscall-counts-record2
-rw-r--r--tools/perf/scripts/python/bin/syscall-counts-report8
-rw-r--r--tools/perf/scripts/python/check-perf-trace.py5
-rw-r--r--tools/perf/scripts/python/event_analyzing_sample.py189
-rw-r--r--tools/perf/scripts/python/failed-syscalls-by-pid.py21
-rw-r--r--tools/perf/scripts/python/futex-contention.py50
-rwxr-xr-xtools/perf/scripts/python/net_dropmonitor.py75
-rw-r--r--tools/perf/scripts/python/netdev-times.py464
-rw-r--r--tools/perf/scripts/python/sched-migration.py461
-rw-r--r--tools/perf/scripts/python/sctop.py75
-rw-r--r--tools/perf/scripts/python/syscall-counts-by-pid.py23
-rw-r--r--tools/perf/scripts/python/syscall-counts.py7
-rw-r--r--tools/perf/tests/attr.c176
-rw-r--r--tools/perf/tests/attr.py332
-rw-r--r--tools/perf/tests/attr/README64
-rw-r--r--tools/perf/tests/attr/base-record40
-rw-r--r--tools/perf/tests/attr/base-stat40
-rw-r--r--tools/perf/tests/attr/test-record-C013
-rw-r--r--tools/perf/tests/attr/test-record-basic5
-rw-r--r--tools/perf/tests/attr/test-record-branch-any8
-rw-r--r--tools/perf/tests/attr/test-record-branch-filter-any8
-rw-r--r--tools/perf/tests/attr/test-record-branch-filter-any_call8
-rw-r--r--tools/perf/tests/attr/test-record-branch-filter-any_ret8
-rw-r--r--tools/perf/tests/attr/test-record-branch-filter-hv8
-rw-r--r--tools/perf/tests/attr/test-record-branch-filter-ind_call8
-rw-r--r--tools/perf/tests/attr/test-record-branch-filter-k8
-rw-r--r--tools/perf/tests/attr/test-record-branch-filter-u8
-rw-r--r--tools/perf/tests/attr/test-record-count8
-rw-r--r--tools/perf/tests/attr/test-record-data11
-rw-r--r--tools/perf/tests/attr/test-record-freq6
-rw-r--r--tools/perf/tests/attr/test-record-graph-default6
-rw-r--r--tools/perf/tests/attr/test-record-graph-dwarf10
-rw-r--r--tools/perf/tests/attr/test-record-graph-fp6
-rw-r--r--tools/perf/tests/attr/test-record-group20
-rw-r--r--tools/perf/tests/attr/test-record-group-sampling36
-rw-r--r--tools/perf/tests/attr/test-record-group121
-rw-r--r--tools/perf/tests/attr/test-record-no-delay9
-rw-r--r--tools/perf/tests/attr/test-record-no-inherit7
-rw-r--r--tools/perf/tests/attr/test-record-no-samples6
-rw-r--r--tools/perf/tests/attr/test-record-period7
-rw-r--r--tools/perf/tests/attr/test-record-raw7
-rw-r--r--tools/perf/tests/attr/test-stat-C09
-rw-r--r--tools/perf/tests/attr/test-stat-basic6
-rw-r--r--tools/perf/tests/attr/test-stat-default64
-rw-r--r--tools/perf/tests/attr/test-stat-detailed-1101
-rw-r--r--tools/perf/tests/attr/test-stat-detailed-2155
-rw-r--r--tools/perf/tests/attr/test-stat-detailed-3173
-rw-r--r--tools/perf/tests/attr/test-stat-group15
-rw-r--r--tools/perf/tests/attr/test-stat-group115
-rw-r--r--tools/perf/tests/attr/test-stat-no-inherit7
-rw-r--r--tools/perf/tests/bp_signal.c192
-rw-r--r--tools/perf/tests/bp_signal_overflow.c132
-rw-r--r--tools/perf/tests/builtin-test.c307
-rw-r--r--tools/perf/tests/code-reading.c581
-rw-r--r--tools/perf/tests/dso-data.c358
-rw-r--r--tools/perf/tests/dwarf-unwind.c144
-rw-r--r--tools/perf/tests/evsel-roundtrip-name.c114
-rw-r--r--tools/perf/tests/evsel-tp-sched.c81
-rw-r--r--tools/perf/tests/hists_common.c209
-rw-r--r--tools/perf/tests/hists_common.h75
-rw-r--r--tools/perf/tests/hists_cumulate.c726
-rw-r--r--tools/perf/tests/hists_filter.c289
-rw-r--r--tools/perf/tests/hists_link.c339
-rw-r--r--tools/perf/tests/hists_output.c621
-rw-r--r--tools/perf/tests/keep-tracking.c152
-rw-r--r--tools/perf/tests/make233
-rw-r--r--tools/perf/tests/mmap-basic.c148
-rw-r--r--tools/perf/tests/mmap-thread-lookup.c233
-rw-r--r--tools/perf/tests/open-syscall-all-cpus.c109
-rw-r--r--tools/perf/tests/open-syscall-tp-fields.c117
-rw-r--r--tools/perf/tests/open-syscall.c55
-rw-r--r--tools/perf/tests/parse-events.c1590
-rw-r--r--tools/perf/tests/parse-no-sample-id-all.c108
-rw-r--r--tools/perf/tests/perf-record.c309
-rwxr-xr-xtools/perf/tests/perf-targz-src-pkg21
-rw-r--r--tools/perf/tests/perf-time-to-tsc.c172
-rw-r--r--tools/perf/tests/pmu.c173
-rw-r--r--tools/perf/tests/python-use.c23
-rw-r--r--tools/perf/tests/rdpmc.c173
-rw-r--r--tools/perf/tests/sample-parsing.c320
-rw-r--r--tools/perf/tests/sw-clock.c122
-rw-r--r--tools/perf/tests/task-exit.c118
-rw-r--r--tools/perf/tests/tests.h60
-rw-r--r--tools/perf/tests/thread-mg-share.c90
-rw-r--r--tools/perf/tests/vmlinux-kallsyms.c242
-rw-r--r--tools/perf/ui/browser.c719
-rw-r--r--tools/perf/ui/browser.h74
-rw-r--r--tools/perf/ui/browsers/annotate.c1023
-rw-r--r--tools/perf/ui/browsers/header.c127
-rw-r--r--tools/perf/ui/browsers/hists.c2007
-rw-r--r--tools/perf/ui/browsers/map.c130
-rw-r--r--tools/perf/ui/browsers/map.h6
-rw-r--r--tools/perf/ui/browsers/scripts.c187
-rw-r--r--tools/perf/ui/gtk/annotate.c252
-rw-r--r--tools/perf/ui/gtk/browser.c87
-rw-r--r--tools/perf/ui/gtk/gtk.h67
-rw-r--r--tools/perf/ui/gtk/helpline.c57
-rw-r--r--tools/perf/ui/gtk/hists.c365
-rw-r--r--tools/perf/ui/gtk/progress.c59
-rw-r--r--tools/perf/ui/gtk/setup.c23
-rw-r--r--tools/perf/ui/gtk/util.c112
-rw-r--r--tools/perf/ui/helpline.c73
-rw-r--r--tools/perf/ui/helpline.h29
-rw-r--r--tools/perf/ui/hist.c624
-rw-r--r--tools/perf/ui/keysyms.h28
-rw-r--r--tools/perf/ui/libslang.h29
-rw-r--r--tools/perf/ui/progress.c38
-rw-r--r--tools/perf/ui/progress.h23
-rw-r--r--tools/perf/ui/setup.c107
-rw-r--r--tools/perf/ui/stdio/hist.c514
-rw-r--r--tools/perf/ui/tui/helpline.c58
-rw-r--r--tools/perf/ui/tui/progress.c44
-rw-r--r--tools/perf/ui/tui/setup.c151
-rw-r--r--tools/perf/ui/tui/tui.h6
-rw-r--r--tools/perf/ui/tui/util.c256
-rw-r--r--tools/perf/ui/ui.h29
-rw-r--r--tools/perf/ui/util.c84
-rw-r--r--tools/perf/ui/util.h21
-rwxr-xr-xtools/perf/util/PERF-VERSION-GEN50
-rw-r--r--tools/perf/util/abspath.c81
-rw-r--r--tools/perf/util/alias.c9
-rw-r--r--tools/perf/util/annotate.c1412
-rw-r--r--tools/perf/util/annotate.h177
-rw-r--r--tools/perf/util/bitmap.c31
-rw-r--r--tools/perf/util/build-id.c83
-rw-r--r--tools/perf/util/build-id.h14
-rw-r--r--tools/perf/util/cache.h59
-rw-r--r--tools/perf/util/callchain.c503
-rw-r--r--tools/perf/util/callchain.h148
-rw-r--r--tools/perf/util/cgroup.c177
-rw-r--r--tools/perf/util/cgroup.h17
-rw-r--r--tools/perf/util/color.c67
-rw-r--r--tools/perf/util/color.h5
-rw-r--r--tools/perf/util/comm.c122
-rw-r--r--tools/perf/util/comm.h21
-rw-r--r--tools/perf/util/config.c548
-rw-r--r--tools/perf/util/cpumap.c464
-rw-r--r--tools/perf/util/cpumap.h81
-rw-r--r--tools/perf/util/ctype.c2
-rw-r--r--tools/perf/util/data.c133
-rw-r--r--tools/perf/util/data.h50
-rw-r--r--tools/perf/util/debug.c81
-rw-r--r--tools/perf/util/debug.h15
-rw-r--r--tools/perf/util/debugfs.c240
-rw-r--r--tools/perf/util/debugfs.h25
-rw-r--r--tools/perf/util/dso.c900
-rw-r--r--tools/perf/util/dso.h232
-rw-r--r--tools/perf/util/dwarf-aux.c884
-rw-r--r--tools/perf/util/dwarf-aux.h118
-rw-r--r--tools/perf/util/event.c1030
-rw-r--r--tools/perf/util/event.h271
-rw-r--r--tools/perf/util/evlist.c1245
-rw-r--r--tools/perf/util/evlist.h265
-rw-r--r--tools/perf/util/evsel.c2036
-rw-r--r--tools/perf/util/evsel.h365
-rw-r--r--tools/perf/util/exec_cmd.c25
-rw-r--r--tools/perf/util/exec_cmd.h1
-rwxr-xr-xtools/perf/util/generate-cmdlist.sh15
-rw-r--r--tools/perf/util/header.c3084
-rw-r--r--tools/perf/util/header.h157
-rw-r--r--tools/perf/util/help.c41
-rw-r--r--tools/perf/util/hist.c1630
-rw-r--r--tools/perf/util/hist.h348
-rw-r--r--tools/perf/util/hweight.c31
-rw-r--r--tools/perf/util/include/asm/alternative-asm.h8
-rw-r--r--tools/perf/util/include/asm/bitops.h18
-rw-r--r--tools/perf/util/include/asm/byteorder.h2
-rw-r--r--tools/perf/util/include/asm/cpufeature.h9
-rw-r--r--tools/perf/util/include/asm/dwarf2.h13
-rw-r--r--tools/perf/util/include/asm/hash.h6
-rw-r--r--tools/perf/util/include/asm/hweight.h8
-rw-r--r--tools/perf/util/include/asm/unistd_32.h1
-rw-r--r--tools/perf/util/include/asm/unistd_64.h1
-rw-r--r--tools/perf/util/include/dwarf-regs.h8
-rw-r--r--tools/perf/util/include/linux/bitmap.h52
-rw-r--r--tools/perf/util/include/linux/bitops.h149
-rw-r--r--tools/perf/util/include/linux/compiler.h10
-rw-r--r--tools/perf/util/include/linux/const.h1
-rw-r--r--tools/perf/util/include/linux/hash.h5
-rw-r--r--tools/perf/util/include/linux/kernel.h42
-rw-r--r--tools/perf/util/include/linux/linkage.h13
-rw-r--r--tools/perf/util/include/linux/list.h11
-rw-r--r--tools/perf/util/include/linux/module.h6
-rw-r--r--tools/perf/util/include/linux/prefetch.h6
-rw-r--r--tools/perf/util/include/linux/rbtree.h1
-rw-r--r--tools/perf/util/include/linux/rbtree_augmented.h2
-rw-r--r--tools/perf/util/include/linux/string.h3
-rw-r--r--tools/perf/util/include/linux/types.h9
-rw-r--r--tools/perf/util/intlist.c146
-rw-r--r--tools/perf/util/intlist.h77
-rw-r--r--tools/perf/util/machine.c1422
-rw-r--r--tools/perf/util/machine.h194
-rw-r--r--tools/perf/util/map.c729
-rw-r--r--tools/perf/util/map.h162
-rw-r--r--tools/perf/util/pager.c8
-rw-r--r--tools/perf/util/parse-events.c1490
-rw-r--r--tools/perf/util/parse-events.h98
-rw-r--r--tools/perf/util/parse-events.l218
-rw-r--r--tools/perf/util/parse-events.y454
-rw-r--r--tools/perf/util/parse-options.c329
-rw-r--r--tools/perf/util/parse-options.h52
-rw-r--r--tools/perf/util/path.c213
-rw-r--r--tools/perf/util/perf_regs.c27
-rw-r--r--tools/perf/util/perf_regs.h29
-rw-r--r--tools/perf/util/pmu.c796
-rw-r--r--tools/perf/util/pmu.h49
-rw-r--r--tools/perf/util/pmu.l43
-rw-r--r--tools/perf/util/pmu.y92
-rw-r--r--tools/perf/util/probe-event.c2579
-rw-r--r--tools/perf/util/probe-event.h143
-rw-r--r--tools/perf/util/probe-finder.c1852
-rw-r--r--tools/perf/util/probe-finder.h112
-rw-r--r--tools/perf/util/pstack.c75
-rw-r--r--tools/perf/util/pstack.h14
-rw-r--r--tools/perf/util/python-ext-sources22
-rw-r--r--tools/perf/util/python.c1074
-rw-r--r--tools/perf/util/quote.c433
-rw-r--r--tools/perf/util/quote.h39
-rw-r--r--tools/perf/util/rblist.c128
-rw-r--r--tools/perf/util/rblist.h48
-rw-r--r--tools/perf/util/record.c215
-rw-r--r--tools/perf/util/run-command.c90
-rw-r--r--tools/perf/util/run-command.h30
-rw-r--r--tools/perf/util/scripting-engines/trace-event-perl.c130
-rw-r--r--tools/perf/util/scripting-engines/trace-event-python.c214
-rw-r--r--tools/perf/util/session.c1813
-rw-r--r--tools/perf/util/session.h154
-rw-r--r--tools/perf/util/setup.py48
-rw-r--r--tools/perf/util/sigchain.c2
-rw-r--r--tools/perf/util/sigchain.h1
-rw-r--r--tools/perf/util/sort.c1728
-rw-r--r--tools/perf/util/sort.h202
-rw-r--r--tools/perf/util/srcline.c299
-rw-r--r--tools/perf/util/stat.c63
-rw-r--r--tools/perf/util/stat.h25
-rw-r--r--tools/perf/util/strbuf.c234
-rw-r--r--tools/perf/util/strbuf.h45
-rw-r--r--tools/perf/util/strfilter.c199
-rw-r--r--tools/perf/util/strfilter.h48
-rw-r--r--tools/perf/util/string.c168
-rw-r--r--tools/perf/util/string.h18
-rw-r--r--tools/perf/util/strlist.c169
-rw-r--r--tools/perf/util/strlist.h47
-rw-r--r--tools/perf/util/svghelper.c250
-rw-r--r--tools/perf/util/svghelper.h19
-rw-r--r--tools/perf/util/symbol-elf.c1625
-rw-r--r--tools/perf/util/symbol-minimal.c330
-rw-r--r--tools/perf/util/symbol.c2490
-rw-r--r--tools/perf/util/symbol.h267
-rw-r--r--tools/perf/util/target.c158
-rw-r--r--tools/perf/util/target.h79
-rw-r--r--tools/perf/util/thread.c393
-rw-r--r--tools/perf/util/thread.h114
-rw-r--r--tools/perf/util/thread_map.c294
-rw-r--r--tools/perf/util/thread_map.h29
-rw-r--r--tools/perf/util/tool.h47
-rw-r--r--tools/perf/util/top.c117
-rw-r--r--tools/perf/util/top.h47
-rw-r--r--tools/perf/util/trace-event-info.c604
-rw-r--r--tools/perf/util/trace-event-parse.c3276
-rw-r--r--tools/perf/util/trace-event-read.c602
-rw-r--r--tools/perf/util/trace-event-scripting.c36
-rw-r--r--tools/perf/util/trace-event.c82
-rw-r--r--tools/perf/util/trace-event.h303
-rw-r--r--tools/perf/util/types.h17
-rw-r--r--tools/perf/util/unwind-libdw.c210
-rw-r--r--tools/perf/util/unwind-libdw.h21
-rw-r--r--tools/perf/util/unwind-libunwind.c579
-rw-r--r--tools/perf/util/unwind.h37
-rw-r--r--tools/perf/util/usage.c6
-rw-r--r--tools/perf/util/util.c458
-rw-r--r--tools/perf/util/util.h294
-rw-r--r--tools/perf/util/values.c23
-rw-r--r--tools/perf/util/values.h2
-rw-r--r--tools/perf/util/vdso.c111
-rw-r--r--tools/perf/util/vdso.h18
-rw-r--r--tools/perf/util/wrapper.c113
-rw-r--r--tools/perf/util/xyarray.c20
-rw-r--r--tools/perf/util/xyarray.h20
-rw-r--r--tools/power/acpi/Makefile156
-rw-r--r--tools/power/acpi/common/cmfsize.c101
-rw-r--r--tools/power/acpi/common/getopt.c239
-rw-r--r--tools/power/acpi/man/acpidump.8120
-rw-r--r--tools/power/acpi/os_specific/service_layers/oslinuxtbl.c1329
-rw-r--r--tools/power/acpi/os_specific/service_layers/osunixdir.c204
-rw-r--r--tools/power/acpi/os_specific/service_layers/osunixmap.c151
-rw-r--r--tools/power/acpi/tools/acpidump/acpidump.h130
-rw-r--r--tools/power/acpi/tools/acpidump/apdump.c451
-rw-r--r--tools/power/acpi/tools/acpidump/apfiles.c228
-rw-r--r--tools/power/acpi/tools/acpidump/apmain.c351
-rw-r--r--tools/power/acpi/tools/ec/Makefile22
-rw-r--r--tools/power/acpi/tools/ec/ec_access.c238
-rw-r--r--tools/power/cpupower/.gitignore29
-rw-r--r--tools/power/cpupower/Makefile310
-rw-r--r--tools/power/cpupower/README47
-rw-r--r--tools/power/cpupower/ToDo10
-rw-r--r--tools/power/cpupower/bench/Makefile36
-rw-r--r--tools/power/cpupower/bench/README-BENCH124
-rw-r--r--tools/power/cpupower/bench/benchmark.c194
-rw-r--r--tools/power/cpupower/bench/benchmark.h29
-rw-r--r--tools/power/cpupower/bench/config.h36
-rw-r--r--tools/power/cpupower/bench/cpufreq-bench_plot.sh104
-rw-r--r--tools/power/cpupower/bench/cpufreq-bench_script.sh101
-rw-r--r--tools/power/cpupower/bench/example.cfg11
-rw-r--r--tools/power/cpupower/bench/main.c202
-rw-r--r--tools/power/cpupower/bench/parse.c225
-rw-r--r--tools/power/cpupower/bench/parse.h53
-rw-r--r--tools/power/cpupower/bench/system.c191
-rw-r--r--tools/power/cpupower/bench/system.h29
-rw-r--r--tools/power/cpupower/debug/i386/Makefile41
-rw-r--r--tools/power/cpupower/debug/i386/centrino-decode.c113
-rw-r--r--tools/power/cpupower/debug/i386/dump_psb.c196
-rw-r--r--tools/power/cpupower/debug/i386/intel_gsic.c78
-rw-r--r--tools/power/cpupower/debug/i386/powernow-k8-decode.c96
-rw-r--r--tools/power/cpupower/debug/kernel/Makefile23
-rw-r--r--tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c110
-rw-r--r--tools/power/cpupower/debug/x86_64/Makefile30
-rw-r--r--tools/power/cpupower/lib/cpufreq.c208
-rw-r--r--tools/power/cpupower/lib/cpufreq.h223
-rw-r--r--tools/power/cpupower/lib/sysfs.c672
-rw-r--r--tools/power/cpupower/lib/sysfs.h31
-rw-r--r--tools/power/cpupower/man/cpupower-frequency-info.177
-rw-r--r--tools/power/cpupower/man/cpupower-frequency-set.152
-rw-r--r--tools/power/cpupower/man/cpupower-idle-info.191
-rw-r--r--tools/power/cpupower/man/cpupower-idle-set.177
-rw-r--r--tools/power/cpupower/man/cpupower-info.119
-rw-r--r--tools/power/cpupower/man/cpupower-monitor.1198
-rw-r--r--tools/power/cpupower/man/cpupower-set.165
-rw-r--r--tools/power/cpupower/man/cpupower.172
-rw-r--r--tools/power/cpupower/po/cs.po944
-rw-r--r--tools/power/cpupower/po/de.po961
-rw-r--r--tools/power/cpupower/po/fr.po947
-rw-r--r--tools/power/cpupower/po/it.po961
-rw-r--r--tools/power/cpupower/po/pt.po957
-rw-r--r--tools/power/cpupower/utils/builtin.h12
-rw-r--r--tools/power/cpupower/utils/cpufreq-info.c698
-rw-r--r--tools/power/cpupower/utils/cpufreq-set.c331
-rw-r--r--tools/power/cpupower/utils/cpuidle-info.c208
-rw-r--r--tools/power/cpupower/utils/cpuidle-set.c181
-rw-r--r--tools/power/cpupower/utils/cpupower-info.c103
-rw-r--r--tools/power/cpupower/utils/cpupower-set.c95
-rw-r--r--tools/power/cpupower/utils/cpupower.c229
-rw-r--r--tools/power/cpupower/utils/helpers/amd.c135
-rw-r--r--tools/power/cpupower/utils/helpers/bitmask.c292
-rw-r--r--tools/power/cpupower/utils/helpers/bitmask.h33
-rw-r--r--tools/power/cpupower/utils/helpers/cpuid.c178
-rw-r--r--tools/power/cpupower/utils/helpers/helpers.h195
-rw-r--r--tools/power/cpupower/utils/helpers/misc.c27
-rw-r--r--tools/power/cpupower/utils/helpers/msr.c115
-rw-r--r--tools/power/cpupower/utils/helpers/pci.c55
-rw-r--r--tools/power/cpupower/utils/helpers/sysfs.c472
-rw-r--r--tools/power/cpupower/utils/helpers/sysfs.h38
-rw-r--r--tools/power/cpupower/utils/helpers/topology.c116
-rw-r--r--tools/power/cpupower/utils/idle_monitor/amd_fam14h_idle.c335
-rw-r--r--tools/power/cpupower/utils/idle_monitor/cpuidle_sysfs.c196
-rw-r--r--tools/power/cpupower/utils/idle_monitor/cpupower-monitor.c455
-rw-r--r--tools/power/cpupower/utils/idle_monitor/cpupower-monitor.h85
-rw-r--r--tools/power/cpupower/utils/idle_monitor/hsw_ext_idle.c196
-rw-r--r--tools/power/cpupower/utils/idle_monitor/idle_monitors.def8
-rw-r--r--tools/power/cpupower/utils/idle_monitor/idle_monitors.h18
-rw-r--r--tools/power/cpupower/utils/idle_monitor/mperf_monitor.c338
-rw-r--r--tools/power/cpupower/utils/idle_monitor/nhm_idle.c216
-rw-r--r--tools/power/cpupower/utils/idle_monitor/snb_idle.c200
-rwxr-xr-xtools/power/cpupower/utils/version-gen.sh35
-rw-r--r--tools/power/x86/turbostat/.gitignore1
-rw-r--r--tools/power/x86/turbostat/Makefile22
-rw-r--r--tools/power/x86/turbostat/turbostat.8221
-rw-r--r--tools/power/x86/turbostat/turbostat.c2462
-rw-r--r--tools/power/x86/x86_energy_perf_policy/Makefile10
-rw-r--r--tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.8104
-rw-r--r--tools/power/x86/x86_energy_perf_policy/x86_energy_perf_policy.c324
-rw-r--r--tools/scripts/Makefile.include84
-rw-r--r--tools/testing/fault-injection/failcmd.sh219
-rwxr-xr-xtools/testing/ktest/compare-ktest-sample.pl32
-rw-r--r--tools/testing/ktest/examples/README32
-rw-r--r--tools/testing/ktest/examples/crosstests.conf254
-rw-r--r--tools/testing/ktest/examples/include/bisect.conf90
-rw-r--r--tools/testing/ktest/examples/include/defaults.conf157
-rw-r--r--tools/testing/ktest/examples/include/min-config.conf60
-rw-r--r--tools/testing/ktest/examples/include/patchcheck.conf111
-rw-r--r--tools/testing/ktest/examples/include/tests.conf74
-rw-r--r--tools/testing/ktest/examples/kvm.conf92
-rw-r--r--tools/testing/ktest/examples/snowball.conf53
-rw-r--r--tools/testing/ktest/examples/test.conf62
-rwxr-xr-xtools/testing/ktest/ktest.pl4181
-rw-r--r--tools/testing/ktest/sample.conf1289
-rw-r--r--tools/testing/selftests/Makefile28
-rw-r--r--tools/testing/selftests/README.txt42
-rw-r--r--tools/testing/selftests/breakpoints/Makefile23
-rw-r--r--tools/testing/selftests/breakpoints/breakpoint_test.c394
-rw-r--r--tools/testing/selftests/cpu-hotplug/Makefile6
-rw-r--r--tools/testing/selftests/cpu-hotplug/on-off-test.sh221
-rw-r--r--tools/testing/selftests/efivarfs/Makefile12
-rw-r--r--tools/testing/selftests/efivarfs/create-read.c38
-rw-r--r--tools/testing/selftests/efivarfs/efivarfs.sh198
-rw-r--r--tools/testing/selftests/efivarfs/open-unlink.c63
-rw-r--r--tools/testing/selftests/ipc/Makefile25
-rw-r--r--tools/testing/selftests/ipc/msgque.c252
-rw-r--r--tools/testing/selftests/kcmp/.gitignore2
-rw-r--r--tools/testing/selftests/kcmp/Makefile28
-rw-r--r--tools/testing/selftests/kcmp/kcmp_test.c96
-rw-r--r--tools/testing/selftests/memory-hotplug/Makefile6
-rw-r--r--tools/testing/selftests/memory-hotplug/on-off-test.sh230
-rw-r--r--tools/testing/selftests/mqueue/.gitignore2
-rw-r--r--tools/testing/selftests/mqueue/Makefile10
-rw-r--r--tools/testing/selftests/mqueue/mq_open_tests.c492
-rw-r--r--tools/testing/selftests/mqueue/mq_perf_tests.c741
-rw-r--r--tools/testing/selftests/net/.gitignore3
-rw-r--r--tools/testing/selftests/net/Makefile25
-rw-r--r--tools/testing/selftests/net/psock_fanout.c312
-rw-r--r--tools/testing/selftests/net/psock_lib.h127
-rw-r--r--tools/testing/selftests/net/psock_tpacket.c805
-rw-r--r--tools/testing/selftests/net/run_afpackettests26
-rw-r--r--tools/testing/selftests/net/run_netsocktests12
-rw-r--r--tools/testing/selftests/net/socket.c92
-rw-r--r--tools/testing/selftests/powerpc/Makefile39
-rw-r--r--tools/testing/selftests/powerpc/copyloops/Makefile29
-rw-r--r--tools/testing/selftests/powerpc/copyloops/asm/ppc_asm.h89
-rw-r--r--tools/testing/selftests/powerpc/copyloops/asm/processor.h0
l---------tools/testing/selftests/powerpc/copyloops/copyuser_64.S1
l---------tools/testing/selftests/powerpc/copyloops/copyuser_power7.S1
l---------tools/testing/selftests/powerpc/copyloops/memcpy_64.S1
l---------tools/testing/selftests/powerpc/copyloops/memcpy_power7.S1
-rw-r--r--tools/testing/selftests/powerpc/copyloops/validate.c99
-rw-r--r--tools/testing/selftests/powerpc/harness.c114
-rw-r--r--tools/testing/selftests/powerpc/mm/Makefile18
-rw-r--r--tools/testing/selftests/powerpc/mm/hugetlb_vs_thp_test.c72
-rw-r--r--tools/testing/selftests/powerpc/pmu/Makefile41
-rw-r--r--tools/testing/selftests/powerpc/pmu/count_instructions.c135
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/Makefile32
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/back_to_back_ebbs_test.c106
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/close_clears_pmcc_test.c59
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/cpu_event_pinned_vs_ebb_test.c93
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/cpu_event_vs_ebb_test.c89
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/cycles_test.c58
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/cycles_with_freeze_test.c117
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/ebb.c727
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/ebb.h78
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/ebb_handler.S365
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/ebb_on_child_test.c86
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/ebb_on_willing_child_test.c92
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/ebb_vs_cpu_event_test.c86
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/event_attributes_test.c131
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/fixed_instruction_loop.S43
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/fork_cleanup_test.c79
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/instruction_count_test.c164
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/lost_exception_test.c100
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/multi_counter_test.c91
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/multi_ebb_procs_test.c109
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/no_handler_test.c61
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/pmae_handling_test.c106
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/pmc56_overflow_test.c93
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/reg.h49
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/reg_access_test.c39
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/task_event_pinned_vs_ebb_test.c91
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/task_event_vs_ebb_test.c83
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/trace.c300
-rw-r--r--tools/testing/selftests/powerpc/pmu/ebb/trace.h41
-rw-r--r--tools/testing/selftests/powerpc/pmu/event.c131
-rw-r--r--tools/testing/selftests/powerpc/pmu/event.h43
-rw-r--r--tools/testing/selftests/powerpc/pmu/lib.c252
-rw-r--r--tools/testing/selftests/powerpc/pmu/lib.h41
-rw-r--r--tools/testing/selftests/powerpc/pmu/loop.S43
-rw-r--r--tools/testing/selftests/powerpc/subunit.h52
-rw-r--r--tools/testing/selftests/powerpc/tm/Makefile15
-rw-r--r--tools/testing/selftests/powerpc/tm/tm-resched-dscr.c98
-rw-r--r--tools/testing/selftests/powerpc/utils.h49
-rw-r--r--tools/testing/selftests/ptrace/Makefile10
-rw-r--r--tools/testing/selftests/ptrace/peeksiginfo.c214
-rw-r--r--tools/testing/selftests/rcutorture/.gitignore6
-rw-r--r--tools/testing/selftests/rcutorture/bin/config2frag.sh25
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/configNR_CPUS.sh45
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/configcheck.sh54
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/configinit.sh74
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/cpus2use.sh41
-rw-r--r--tools/testing/selftests/rcutorture/bin/functions.sh223
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-build.sh71
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-recheck-lock.sh51
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-recheck-rcu.sh51
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-recheck.sh63
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/kvm-test-1-run.sh222
-rw-r--r--tools/testing/selftests/rcutorture/bin/kvm.sh410
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/parse-build.sh57
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/parse-console.sh41
-rwxr-xr-xtools/testing/selftests/rcutorture/bin/parse-torture.sh106
-rw-r--r--tools/testing/selftests/rcutorture/configs/lock/BUSTED6
-rw-r--r--tools/testing/selftests/rcutorture/configs/lock/BUSTED.boot1
-rw-r--r--tools/testing/selftests/rcutorture/configs/lock/CFLIST1
-rw-r--r--tools/testing/selftests/rcutorture/configs/lock/CFcommon2
-rw-r--r--tools/testing/selftests/rcutorture/configs/lock/LOCK016
-rw-r--r--tools/testing/selftests/rcutorture/configs/lock/ver_functions.sh43
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/BUSTED7
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/BUSTED.boot1
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/CFLIST13
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/CFcommon2
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/SRCU-N7
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/SRCU-N.boot1
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/SRCU-P7
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/SRCU-P.boot1
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TINY0112
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TINY0212
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE0122
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE01.boot1
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE0225
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE02-T25
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE0322
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE0424
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE04.boot1
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE0524
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE05.boot1
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE0625
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE0723
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE0825
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE08-T25
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE08.boot1
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/TREE0920
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v0.0/CFLIST14
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v0.0/N1-S-T-NH-SD-SMP-HP18
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v0.0/N2-2-t-nh-sd-SMP-hp20
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v0.0/N3-3-T-nh-SD-SMP-hp22
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v0.0/N4-A-t-NH-sd-SMP-HP18
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v0.0/N5-U-T-NH-sd-SMP-hp22
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v0.0/NT1-nh23
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v0.0/NT3-NH20
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v0.0/P1-S-T-NH-SD-SMP-HP19
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v0.0/P2-2-t-nh-sd-SMP-hp20
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v0.0/P3-3-T-nh-SD-SMP-hp20
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v0.0/P4-A-t-NH-sd-SMP-HP22
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v0.0/P5-U-T-NH-sd-SMP-hp28
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v0.0/PT1-nh23
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v0.0/PT2-NH22
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v0.0/ver_functions.sh33
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.12/CFLIST17
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.12/N1-S-T-NH-SD-SMP-HP19
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.12/N2-2-t-nh-sd-SMP-hp20
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.12/N3-3-T-nh-SD-SMP-hp22
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.12/N4-A-t-NH-sd-SMP-HP18
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.12/N5-U-T-NH-sd-SMP-hp22
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.12/N6---t-nh-SD-smp-hp19
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.12/N7-4-T-NH-SD-SMP-HP26
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.12/N8-2-T-NH-SD-SMP-HP22
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.12/NT1-nh23
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.12/NT3-NH20
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.12/P1-S-T-NH-SD-SMP-HP20
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.12/P2-2-t-nh-sd-SMP-hp20
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.12/P3-3-T-nh-SD-SMP-hp20
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.12/P4-A-t-NH-sd-SMP-HP22
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.12/P5-U-T-NH-sd-SMP-hp28
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.12/P6---t-nh-SD-smp-hp18
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.12/P7-4-T-NH-SD-SMP-HP30
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.12/P7-4-T-NH-SD-SMP-HP-all30
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.12/P7-4-T-NH-SD-SMP-HP-none30
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.12/P7-4-T-NH-SD-SMP-hp30
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.12/PT1-nh23
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.12/PT2-NH22
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.3/CFLIST14
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.3/N1-S-T-NH-SD-SMP-HP19
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.3/N2-2-t-nh-sd-SMP-hp20
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.3/N3-3-T-nh-SD-SMP-hp22
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.3/N4-A-t-NH-sd-SMP-HP18
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.3/N5-U-T-NH-sd-SMP-hp22
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.3/NT1-nh23
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.3/NT3-NH20
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.3/P1-S-T-NH-SD-SMP-HP20
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.3/P2-2-t-nh-sd-SMP-hp20
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.3/P3-3-T-nh-SD-SMP-hp20
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.3/P4-A-t-NH-sd-SMP-HP22
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.3/P5-U-T-NH-sd-SMP-hp28
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.3/PT1-nh23
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.3/PT2-NH22
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.3/ver_functions.sh44
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.5/CFLIST14
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.5/N1-S-T-NH-SD-SMP-HP19
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.5/N2-2-t-nh-sd-SMP-hp20
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.5/N3-3-T-nh-SD-SMP-hp22
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.5/N4-A-t-NH-sd-SMP-HP18
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.5/N5-U-T-NH-sd-SMP-hp22
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.5/NT1-nh23
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.5/NT3-NH20
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.5/P1-S-T-NH-SD-SMP-HP20
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.5/P2-2-t-nh-sd-SMP-hp20
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.5/P3-3-T-nh-SD-SMP-hp20
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.5/P4-A-t-NH-sd-SMP-HP22
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.5/P5-U-T-NH-sd-SMP-hp28
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.5/PT1-nh23
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.5/PT2-NH22
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/v3.5/ver_functions.sh57
-rw-r--r--tools/testing/selftests/rcutorture/configs/rcu/ver_functions.sh57
-rw-r--r--tools/testing/selftests/rcutorture/doc/TINY_RCU.txt40
-rw-r--r--tools/testing/selftests/rcutorture/doc/TREE_RCU-kconfig.txt95
-rw-r--r--tools/testing/selftests/rcutorture/doc/initrd.txt90
-rw-r--r--tools/testing/selftests/rcutorture/doc/rcu-test-image.txt42
-rw-r--r--tools/testing/selftests/sysctl/Makefile19
-rw-r--r--tools/testing/selftests/sysctl/common_tests109
-rw-r--r--tools/testing/selftests/sysctl/run_numerictests10
-rw-r--r--tools/testing/selftests/sysctl/run_stringtests77
-rw-r--r--tools/testing/selftests/timers/Makefile8
-rw-r--r--tools/testing/selftests/timers/posix_timers.c221
-rw-r--r--tools/testing/selftests/user/Makefile13
-rw-r--r--tools/testing/selftests/vm/.gitignore4
-rw-r--r--tools/testing/selftests/vm/Makefile15
-rw-r--r--tools/testing/selftests/vm/hugepage-mmap.c92
-rw-r--r--tools/testing/selftests/vm/hugepage-shm.c100
-rw-r--r--tools/testing/selftests/vm/hugetlbfstest.c84
-rw-r--r--tools/testing/selftests/vm/map_hugetlb.c79
-rw-r--r--tools/testing/selftests/vm/run_vmtests93
-rw-r--r--tools/testing/selftests/vm/thuge-gen.c254
-rw-r--r--tools/thermal/tmon/Makefile47
-rw-r--r--tools/thermal/tmon/README50
-rw-r--r--tools/thermal/tmon/pid.c131
-rw-r--r--tools/thermal/tmon/sysfs.c596
-rw-r--r--tools/thermal/tmon/tmon.8142
-rw-r--r--tools/thermal/tmon/tmon.c376
-rw-r--r--tools/thermal/tmon/tmon.h204
-rw-r--r--tools/thermal/tmon/tui.c638
-rw-r--r--tools/usb/Makefile14
-rw-r--r--tools/usb/ffs-aio-example/multibuff/device_app/aio_multibuff.c349
-rw-r--r--tools/usb/ffs-aio-example/multibuff/host_app/Makefile13
-rw-r--r--tools/usb/ffs-aio-example/multibuff/host_app/test.c146
-rw-r--r--tools/usb/ffs-aio-example/simple/device_app/aio_simple.c335
-rw-r--r--tools/usb/ffs-aio-example/simple/host_app/Makefile13
-rw-r--r--tools/usb/ffs-aio-example/simple/host_app/test.c148
-rw-r--r--tools/usb/ffs-test.c527
-rw-r--r--tools/usb/hcd-tests.sh275
-rw-r--r--tools/usb/testusb.c537
-rw-r--r--tools/virtio/.gitignore3
-rw-r--r--tools/virtio/Makefile14
-rw-r--r--tools/virtio/asm/barrier.h14
-rw-r--r--tools/virtio/linux/bug.h10
-rw-r--r--tools/virtio/linux/device.h2
-rw-r--r--tools/virtio/linux/err.h26
-rw-r--r--tools/virtio/linux/hrtimer.h0
-rw-r--r--tools/virtio/linux/irqreturn.h1
-rw-r--r--tools/virtio/linux/kernel.h105
-rw-r--r--tools/virtio/linux/kmemleak.h3
-rw-r--r--tools/virtio/linux/module.h6
-rw-r--r--tools/virtio/linux/printk.h4
-rw-r--r--tools/virtio/linux/ratelimit.h4
-rw-r--r--tools/virtio/linux/scatterlist.h189
-rw-r--r--tools/virtio/linux/slab.h2
-rw-r--r--tools/virtio/linux/uaccess.h50
-rw-r--r--tools/virtio/linux/uio.h3
-rw-r--r--tools/virtio/linux/virtio.h87
-rw-r--r--tools/virtio/linux/virtio_config.h6
-rw-r--r--tools/virtio/linux/virtio_ring.h1
-rw-r--r--tools/virtio/linux/vringh.h1
-rw-r--r--tools/virtio/uapi/linux/uio.h1
-rw-r--r--tools/virtio/uapi/linux/virtio_config.h1
-rw-r--r--tools/virtio/uapi/linux/virtio_ring.h4
-rw-r--r--tools/virtio/vhost_test/Makefile2
-rw-r--r--tools/virtio/vhost_test/vhost_test.c1
-rw-r--r--tools/virtio/virtio-trace/Makefile13
-rw-r--r--tools/virtio/virtio-trace/README118
-rw-r--r--tools/virtio/virtio-trace/trace-agent-ctl.c137
-rw-r--r--tools/virtio/virtio-trace/trace-agent-rw.c192
-rw-r--r--tools/virtio/virtio-trace/trace-agent.c270
-rw-r--r--tools/virtio/virtio-trace/trace-agent.h75
-rw-r--r--tools/virtio/virtio_test.c290
-rw-r--r--tools/virtio/vringh_test.c746
-rw-r--r--tools/vm/.gitignore2
-rw-r--r--tools/vm/Makefile22
-rw-r--r--tools/vm/page-types.c1176
-rw-r--r--tools/vm/slabinfo.c1393
971 files changed, 158705 insertions, 19762 deletions
diff --git a/tools/Makefile b/tools/Makefile
new file mode 100644
index 00000000000..9a617adc667
--- /dev/null
+++ b/tools/Makefile
@@ -0,0 +1,119 @@
+include scripts/Makefile.include
+
+help:
+ @echo 'Possible targets:'
+ @echo ''
+ @echo ' acpi - ACPI tools'
+ @echo ' cgroup - cgroup tools'
+ @echo ' cpupower - a tool for all things x86 CPU power'
+ @echo ' firewire - the userspace part of nosy, an IEEE-1394 traffic sniffer'
+ @echo ' hv - tools used when in Hyper-V clients'
+ @echo ' lguest - a minimal 32-bit x86 hypervisor'
+ @echo ' perf - Linux performance measurement and analysis tool'
+ @echo ' selftests - various kernel selftests'
+ @echo ' turbostat - Intel CPU idle stats and freq reporting tool'
+ @echo ' usb - USB testing tools'
+ @echo ' virtio - vhost test module'
+ @echo ' net - misc networking tools'
+ @echo ' vm - misc vm tools'
+ @echo ' x86_energy_perf_policy - Intel energy policy tool'
+ @echo ' tmon - thermal monitoring and tuning tool'
+ @echo ''
+ @echo 'You can do:'
+ @echo ' $$ make -C tools/ <tool>_install'
+ @echo ''
+ @echo ' from the kernel command line to build and install one of'
+ @echo ' the tools above'
+ @echo ''
+ @echo ' $$ make tools/install'
+ @echo ''
+ @echo ' installs all tools.'
+ @echo ''
+ @echo 'Cleaning targets:'
+ @echo ''
+ @echo ' all of the above with the "_clean" string appended cleans'
+ @echo ' the respective build directory.'
+ @echo ' clean: a summary clean target to clean _all_ folders'
+
+acpi: FORCE
+ $(call descend,power/$@)
+
+cpupower: FORCE
+ $(call descend,power/$@)
+
+cgroup firewire hv lguest usb virtio vm net: FORCE
+ $(call descend,$@)
+
+liblockdep: FORCE
+ $(call descend,lib/lockdep)
+
+libapikfs: FORCE
+ $(call descend,lib/api)
+
+perf: libapikfs FORCE
+ $(call descend,$@)
+
+selftests: FORCE
+ $(call descend,testing/$@)
+
+turbostat x86_energy_perf_policy: FORCE
+ $(call descend,power/x86/$@)
+
+tmon: FORCE
+ $(call descend,thermal/$@)
+
+acpi_install:
+ $(call descend,power/$(@:_install=),install)
+
+cpupower_install:
+ $(call descend,power/$(@:_install=),install)
+
+cgroup_install firewire_install hv_install lguest_install perf_install usb_install virtio_install vm_install net_install:
+ $(call descend,$(@:_install=),install)
+
+selftests_install:
+	$(call descend,testing/$(@:_install=),install)
+
+turbostat_install x86_energy_perf_policy_install:
+ $(call descend,power/x86/$(@:_install=),install)
+
+tmon_install:
+ $(call descend,thermal/$(@:_install=),install)
+
+install: acpi_install cgroup_install cpupower_install hv_install firewire_install lguest_install \
+ perf_install selftests_install turbostat_install usb_install \
+ virtio_install vm_install net_install x86_energy_perf_policy_install \
+	tmon_install
+
+acpi_clean:
+ $(call descend,power/acpi,clean)
+
+cpupower_clean:
+ $(call descend,power/cpupower,clean)
+
+cgroup_clean hv_clean firewire_clean lguest_clean usb_clean virtio_clean vm_clean net_clean:
+ $(call descend,$(@:_clean=),clean)
+
+liblockdep_clean:
+ $(call descend,lib/lockdep,clean)
+
+libapikfs_clean:
+ $(call descend,lib/api,clean)
+
+perf_clean: libapikfs_clean
+ $(call descend,$(@:_clean=),clean)
+
+selftests_clean:
+ $(call descend,testing/$(@:_clean=),clean)
+
+turbostat_clean x86_energy_perf_policy_clean:
+ $(call descend,power/x86/$(@:_clean=),clean)
+
+tmon_clean:
+ $(call descend,thermal/tmon,clean)
+
+clean: acpi_clean cgroup_clean cpupower_clean hv_clean firewire_clean lguest_clean \
+ perf_clean selftests_clean turbostat_clean usb_clean virtio_clean \
+ vm_clean net_clean x86_energy_perf_policy_clean tmon_clean
+
+.PHONY: FORCE
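
Two conventions carry this Makefile: the descend helper, pulled in from scripts/Makefile.include at the top, recurses into the named subdirectory with the given target (roughly make -C <dir> <target>), and GNU make substitution references such as $(@:_install=) strip the suffix from the target name, so for example perf_install descends into perf and selftests_clean into testing/selftests.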
diff --git a/tools/cgroup/.gitignore b/tools/cgroup/.gitignore
new file mode 100644
index 00000000000..633cd9b874f
--- /dev/null
+++ b/tools/cgroup/.gitignore
@@ -0,0 +1 @@
+cgroup_event_listener
diff --git a/tools/cgroup/Makefile b/tools/cgroup/Makefile
new file mode 100644
index 00000000000..b4286196b76
--- /dev/null
+++ b/tools/cgroup/Makefile
@@ -0,0 +1,11 @@
+# Makefile for cgroup tools
+
+CC = $(CROSS_COMPILE)gcc
+CFLAGS = -Wall -Wextra
+
+all: cgroup_event_listener
+%: %.c
+ $(CC) $(CFLAGS) -o $@ $^
+
+clean:
+ $(RM) cgroup_event_listener
diff --git a/tools/cgroup/cgroup_event_listener.c b/tools/cgroup/cgroup_event_listener.c
new file mode 100644
index 00000000000..4eb5507205c
--- /dev/null
+++ b/tools/cgroup/cgroup_event_listener.c
@@ -0,0 +1,82 @@
+/*
+ * cgroup_event_listener.c - Simple listener of cgroup events
+ *
+ * Copyright (C) Kirill A. Shutemov <kirill@shutemov.name>
+ */
+
+#include <assert.h>
+#include <err.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <libgen.h>
+#include <limits.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <sys/eventfd.h>
+
+#define USAGE_STR "Usage: cgroup_event_listener <path-to-control-file> <args>"
+
+int main(int argc, char **argv)
+{
+ int efd = -1;
+ int cfd = -1;
+ int event_control = -1;
+ char event_control_path[PATH_MAX];
+ char line[LINE_MAX];
+ int ret;
+
+ if (argc != 3)
+ errx(1, "%s", USAGE_STR);
+
+ cfd = open(argv[1], O_RDONLY);
+ if (cfd == -1)
+ err(1, "Cannot open %s", argv[1]);
+
+ ret = snprintf(event_control_path, PATH_MAX, "%s/cgroup.event_control",
+ dirname(argv[1]));
+ if (ret >= PATH_MAX)
+ errx(1, "Path to cgroup.event_control is too long");
+
+ event_control = open(event_control_path, O_WRONLY);
+ if (event_control == -1)
+ err(1, "Cannot open %s", event_control_path);
+
+ efd = eventfd(0, 0);
+ if (efd == -1)
+ err(1, "eventfd() failed");
+
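+ /* cgroup v1 event registration line: "<event_fd> <control_fd> <args>". */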
+ ret = snprintf(line, LINE_MAX, "%d %d %s", efd, cfd, argv[2]);
+ if (ret >= LINE_MAX)
+ errx(1, "Arguments string is too long");
+
+ ret = write(event_control, line, strlen(line) + 1);
+ if (ret == -1)
+ err(1, "Cannot write to cgroup.event_control");
+
+ while (1) {
+ uint64_t result;
+
+ ret = read(efd, &result, sizeof(result));
+ if (ret == -1) {
+ if (errno == EINTR)
+ continue;
+ err(1, "Cannot read from eventfd");
+ }
+ assert(ret == sizeof(result));
+
+ ret = access(event_control_path, W_OK);
+ if ((ret == -1) && (errno == ENOENT)) {
+ puts("The cgroup seems to have removed.");
+ break;
+ }
+
+ if (ret == -1)
+ err(1, "cgroup.event_control is not accessible any more");
+
+ printf("%s %s: crossed\n", argv[1], argv[2]);
+ }
+
+ return 0;
+}
diff --git a/tools/firewire/Makefile b/tools/firewire/Makefile
new file mode 100644
index 00000000000..81767adaae7
--- /dev/null
+++ b/tools/firewire/Makefile
@@ -0,0 +1,19 @@
+prefix = /usr
+nosy-dump-version = 0.4
+
+CC = gcc
+
+all : nosy-dump
+
+nosy-dump : CFLAGS = -Wall -O2 -g
+nosy-dump : CPPFLAGS = -DVERSION=\"$(nosy-dump-version)\" -I../../drivers/firewire
+nosy-dump : LDFLAGS = -g
+nosy-dump : LDLIBS = -lpopt
+
+nosy-dump : nosy-dump.o decode-fcp.o
+
+clean :
+ rm -rf *.o nosy-dump
+
+install :
+ install nosy-dump $(prefix)/bin/nosy-dump
diff --git a/tools/firewire/decode-fcp.c b/tools/firewire/decode-fcp.c
new file mode 100644
index 00000000000..e41223b6a4c
--- /dev/null
+++ b/tools/firewire/decode-fcp.c
@@ -0,0 +1,213 @@
+#include <linux/firewire-constants.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "list.h"
+#include "nosy-dump.h"
+
+#define CSR_FCP_COMMAND 0xfffff0000b00ull
+#define CSR_FCP_RESPONSE 0xfffff0000d00ull
+
+static const char * const ctype_names[] = {
+ [0x0] = "control", [0x8] = "not implemented",
+ [0x1] = "status", [0x9] = "accepted",
+ [0x2] = "specific inquiry", [0xa] = "rejected",
+ [0x3] = "notify", [0xb] = "in transition",
+ [0x4] = "general inquiry", [0xc] = "stable",
+ [0x5] = "(reserved 0x05)", [0xd] = "changed",
+ [0x6] = "(reserved 0x06)", [0xe] = "(reserved 0x0e)",
+ [0x7] = "(reserved 0x07)", [0xf] = "interim",
+};
+
+static const char * const subunit_type_names[] = {
+ [0x00] = "monitor", [0x10] = "(reserved 0x10)",
+ [0x01] = "audio", [0x11] = "(reserved 0x11)",
+ [0x02] = "printer", [0x12] = "(reserved 0x12)",
+ [0x03] = "disc", [0x13] = "(reserved 0x13)",
+ [0x04] = "tape recorder/player",[0x14] = "(reserved 0x14)",
+ [0x05] = "tuner", [0x15] = "(reserved 0x15)",
+ [0x06] = "ca", [0x16] = "(reserved 0x16)",
+ [0x07] = "camera", [0x17] = "(reserved 0x17)",
+ [0x08] = "(reserved 0x08)", [0x18] = "(reserved 0x18)",
+ [0x09] = "panel", [0x19] = "(reserved 0x19)",
+ [0x0a] = "bulletin board", [0x1a] = "(reserved 0x1a)",
+ [0x0b] = "camera storage", [0x1b] = "(reserved 0x1b)",
+ [0x0c] = "(reserved 0x0c)", [0x1c] = "vendor unique",
+ [0x0d] = "(reserved 0x0d)", [0x1d] = "all subunit types",
+ [0x0e] = "(reserved 0x0e)", [0x1e] = "subunit_type extended to next byte",
+ [0x0f] = "(reserved 0x0f)", [0x1f] = "unit",
+};
+
+struct avc_enum {
+ int value;
+ const char *name;
+};
+
+struct avc_field {
+ const char *name; /* Short name for field. */
+ int offset; /* Location of field, specified in bits; */
+ /* negative means from end of packet. */
+ int width; /* Width of field, 0 means use data_length. */
+ struct avc_enum *names;
+};
+
+struct avc_opcode_info {
+ const char *name;
+ struct avc_field fields[8];
+};
+
+struct avc_enum power_field_names[] = {
+ { 0x70, "on" },
+ { 0x60, "off" },
+ { }
+};
+
+static const struct avc_opcode_info opcode_info[256] = {
+
+ /* TA Document 1999026 */
+ /* AV/C Digital Interface Command Set General Specification 4.0 */
+ [0xb2] = { "power", {
+ { "state", 0, 8, power_field_names }
+ }
+ },
+ [0x30] = { "unit info", {
+ { "foo", 0, 8 },
+ { "unit_type", 8, 5 },
+ { "unit", 13, 3 },
+ { "company id", 16, 24 },
+ }
+ },
+ [0x31] = { "subunit info" },
+ [0x01] = { "reserve" },
+ [0xb0] = { "version" },
+ [0x00] = { "vendor dependent" },
+ [0x02] = { "plug info" },
+ [0x12] = { "channel usage" },
+ [0x24] = { "connect" },
+ [0x20] = { "connect av" },
+ [0x22] = { "connections" },
+ [0x11] = { "digital input" },
+ [0x10] = { "digital output" },
+ [0x25] = { "disconnect" },
+ [0x21] = { "disconnect av" },
+ [0x19] = { "input plug signal format" },
+ [0x18] = { "output plug signal format" },
+ [0x1f] = { "general bus setup" },
+
+ /* TA Document 1999025 */
+ /* AV/C Descriptor Mechanism Specification Version 1.0 */
+ [0x0c] = { "create descriptor" },
+ [0x08] = { "open descriptor" },
+ [0x09] = { "read descriptor" },
+ [0x0a] = { "write descriptor" },
+ [0x05] = { "open info block" },
+ [0x06] = { "read info block" },
+ [0x07] = { "write info block" },
+ [0x0b] = { "search descriptor" },
+ [0x0d] = { "object number select" },
+
+ /* TA Document 1999015 */
+ /* AV/C Command Set for Rate Control of Isochronous Data Flow 1.0 */
+ [0xb3] = { "rate", {
+ { "subfunction", 0, 8 },
+ { "result", 8, 8 },
+ { "plug_type", 16, 8 },
+ { "plug_id", 16, 8 },
+ }
+ },
+
+ /* TA Document 1999008 */
+ /* AV/C Audio Subunit Specification 1.0 */
+ [0xb8] = { "function block" },
+
+ /* TA Document 2001001 */
+ /* AV/C Panel Subunit Specification 1.1 */
+ [0x7d] = { "gui update" },
+ [0x7e] = { "push gui data" },
+ [0x7f] = { "user action" },
+ [0x7c] = { "pass through" },
+
+ /* */
+ [0x26] = { "asynchronous connection" },
+};
+
+struct avc_frame {
+ uint32_t operand0:8;
+ uint32_t opcode:8;
+ uint32_t subunit_id:3;
+ uint32_t subunit_type:5;
+ uint32_t ctype:4;
+ uint32_t cts:4;
+};
+
+static void
+decode_avc(struct link_transaction *t)
+{
+ struct avc_frame *frame =
+ (struct avc_frame *) t->request->packet.write_block.data;
+ const struct avc_opcode_info *info;
+ const char *name;
+ char buffer[32];
+ int i;
+
+ info = &opcode_info[frame->opcode];
+ if (info->name == NULL) {
+ snprintf(buffer, sizeof(buffer),
+ "(unknown opcode 0x%02x)", frame->opcode);
+ name = buffer;
+ } else {
+ name = info->name;
+ }
+
+ printf("av/c %s, subunit_type=%s, subunit_id=%d, opcode=%s",
+ ctype_names[frame->ctype], subunit_type_names[frame->subunit_type],
+ frame->subunit_id, name);
+
+ for (i = 0; info->fields[i].name != NULL; i++)
+ printf(", %s", info->fields[i].name);
+
+ printf("\n");
+}
+
+int
+decode_fcp(struct link_transaction *t)
+{
+ struct avc_frame *frame =
+ (struct avc_frame *) t->request->packet.write_block.data;
+ unsigned long long offset =
+ ((unsigned long long) t->request->packet.common.offset_high << 32) |
+ t->request->packet.common.offset_low;
+
+ if (t->request->packet.common.tcode != TCODE_WRITE_BLOCK_REQUEST)
+ return 0;
+
+ if (offset == CSR_FCP_COMMAND || offset == CSR_FCP_RESPONSE) {
+ switch (frame->cts) {
+ case 0x00:
+ decode_avc(t);
+ break;
+ case 0x01:
+ printf("cal fcp frame (cts=0x01)\n");
+ break;
+ case 0x02:
+ printf("ehs fcp frame (cts=0x02)\n");
+ break;
+ case 0x03:
+ printf("havi fcp frame (cts=0x03)\n");
+ break;
+ case 0x0e:
+ printf("vendor specific fcp frame (cts=0x0e)\n");
+ break;
+ case 0x0f:
+ printf("extended cts\n");
+ break;
+ default:
+ printf("reserved fcp frame (ctx=0x%02x)\n", frame->cts);
+ break;
+ }
+ return 1;
+ }
+
+ return 0;
+}
+
diff --git a/tools/firewire/list.h b/tools/firewire/list.h
new file mode 100644
index 00000000000..41f4bdadf63
--- /dev/null
+++ b/tools/firewire/list.h
@@ -0,0 +1,62 @@
+struct list {
+ struct list *next, *prev;
+};
+
+static inline void
+list_init(struct list *list)
+{
+ list->next = list;
+ list->prev = list;
+}
+
+static inline int
+list_empty(struct list *list)
+{
+ return list->next == list;
+}
+
+static inline void
+list_insert(struct list *link, struct list *new_link)
+{
+ new_link->prev = link->prev;
+ new_link->next = link;
+ new_link->prev->next = new_link;
+ new_link->next->prev = new_link;
+}
+
+static inline void
+list_append(struct list *list, struct list *new_link)
+{
+ list_insert((struct list *)list, new_link);
+}
+
+static inline void
+list_prepend(struct list *list, struct list *new_link)
+{
+ list_insert(list->next, new_link);
+}
+
+static inline void
+list_remove(struct list *link)
+{
+ link->prev->next = link->next;
+ link->next->prev = link->prev;
+}
+
+#define list_entry(link, type, member) \
+ ((type *)((char *)(link)-(unsigned long)(&((type *)0)->member)))
+
+#define list_head(list, type, member) \
+ list_entry((list)->next, type, member)
+
+#define list_tail(list, type, member) \
+ list_entry((list)->prev, type, member)
+
+#define list_next(elm, member) \
+ list_entry((elm)->member.next, typeof(*elm), member)
+
+#define list_for_each_entry(pos, list, member) \
+ for (pos = list_head(list, typeof(*pos), member); \
+ &pos->member != (list); \
+ pos = list_next(pos, member))
+
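A minimal usage sketch of the intrusive list API above, assuming "list.h" is on the include path; struct item is a hypothetical name used only for illustration. It shows the pattern nosy-dump relies on: embed a struct list member in the object, append nodes to a sentinel head, then walk them with list_for_each_entry(), so no separate node allocations are needed.

#include <stdio.h>
#include <stdlib.h>

#include "list.h"

struct item {
	int value;
	struct list link;	/* link member embedded in the object itself */
};

/* Statically initialized sentinel head, same idiom as nosy-dump's
 * pending_transaction_list. */
static struct list items = { &items, &items };

int main(void)
{
	struct item *it;
	int i;

	for (i = 0; i < 3; i++) {
		it = malloc(sizeof *it);
		if (!it)
			exit(EXIT_FAILURE);
		it->value = i;
		list_append(&items, &it->link);	/* insert at the tail */
	}

	list_for_each_entry(it, &items, link)
		printf("%d\n", it->value);

	return 0;
}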
diff --git a/tools/firewire/nosy-dump.c b/tools/firewire/nosy-dump.c
new file mode 100644
index 00000000000..3179c711bd6
--- /dev/null
+++ b/tools/firewire/nosy-dump.c
@@ -0,0 +1,1035 @@
+/*
+ * nosy-dump - Interface to snoop mode driver for TI PCILynx 1394 controllers
+ * Copyright (C) 2002-2006 Kristian Høgsberg
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ */
+
+#include <byteswap.h>
+#include <endian.h>
+#include <fcntl.h>
+#include <linux/firewire-constants.h>
+#include <poll.h>
+#include <popt.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <termios.h>
+#include <unistd.h>
+
+#include "list.h"
+#include "nosy-dump.h"
+#include "nosy-user.h"
+
+enum {
+ PACKET_FIELD_DETAIL = 0x01,
+ PACKET_FIELD_DATA_LENGTH = 0x02,
+ /* Marks the fields we print in transaction view. */
+ PACKET_FIELD_TRANSACTION = 0x04,
+};
+
+static void print_packet(uint32_t *data, size_t length);
+static void decode_link_packet(struct link_packet *packet, size_t length,
+ int include_flags, int exclude_flags);
+static int run = 1;
+sig_t sys_sigint_handler;
+
+static char *option_nosy_device = "/dev/nosy";
+static char *option_view = "packet";
+static char *option_output;
+static char *option_input;
+static int option_hex;
+static int option_iso;
+static int option_cycle_start;
+static int option_version;
+static int option_verbose;
+
+enum {
+ VIEW_TRANSACTION,
+ VIEW_PACKET,
+ VIEW_STATS,
+};
+
+static const struct poptOption options[] = {
+ {
+ .longName = "device",
+ .shortName = 'd',
+ .argInfo = POPT_ARG_STRING,
+ .arg = &option_nosy_device,
+ .descrip = "Path to nosy device.",
+ .argDescrip = "DEVICE"
+ },
+ {
+ .longName = "view",
+ .argInfo = POPT_ARG_STRING,
+ .arg = &option_view,
+ .descrip = "Specify view of bus traffic: packet, transaction or stats.",
+ .argDescrip = "VIEW"
+ },
+ {
+ .longName = "hex",
+ .shortName = 'x',
+ .argInfo = POPT_ARG_NONE,
+ .arg = &option_hex,
+ .descrip = "Print each packet in hex.",
+ },
+ {
+ .longName = "iso",
+ .argInfo = POPT_ARG_NONE,
+ .arg = &option_iso,
+ .descrip = "Print iso packets.",
+ },
+ {
+ .longName = "cycle-start",
+ .argInfo = POPT_ARG_NONE,
+ .arg = &option_cycle_start,
+ .descrip = "Print cycle start packets.",
+ },
+ {
+ .longName = "verbose",
+ .shortName = 'v',
+ .argInfo = POPT_ARG_NONE,
+ .arg = &option_verbose,
+ .descrip = "Verbose packet view.",
+ },
+ {
+ .longName = "output",
+ .shortName = 'o',
+ .argInfo = POPT_ARG_STRING,
+ .arg = &option_output,
+ .descrip = "Log to output file.",
+ .argDescrip = "FILENAME"
+ },
+ {
+ .longName = "input",
+ .shortName = 'i',
+ .argInfo = POPT_ARG_STRING,
+ .arg = &option_input,
+ .descrip = "Decode log from file.",
+ .argDescrip = "FILENAME"
+ },
+ {
+ .longName = "version",
+ .argInfo = POPT_ARG_NONE,
+ .arg = &option_version,
+ .descrip = "Specify print version info.",
+ },
+ POPT_AUTOHELP
+ POPT_TABLEEND
+};
+
+/* Allow all ^C except the first to interrupt the program in the usual way. */
+static void
+sigint_handler(int signal_num)
+{
+ if (run == 1) {
+ run = 0;
+ signal(SIGINT, SIG_DFL);
+ }
+}
+
+static struct subaction *
+subaction_create(uint32_t *data, size_t length)
+{
+ struct subaction *sa;
+
+ /* we put the ack in the subaction struct for easy access. */
+ sa = malloc(sizeof *sa - sizeof sa->packet + length);
+ if (!sa)
+ exit(EXIT_FAILURE);
+ sa->ack = data[length / 4 - 1];
+ sa->length = length;
+ memcpy(&sa->packet, data, length);
+
+ return sa;
+}
+
+static void
+subaction_destroy(struct subaction *sa)
+{
+ free(sa);
+}
+
+static struct list pending_transaction_list = {
+ &pending_transaction_list, &pending_transaction_list
+};
+
+static struct link_transaction *
+link_transaction_lookup(int request_node, int response_node, int tlabel)
+{
+ struct link_transaction *t;
+
+ list_for_each_entry(t, &pending_transaction_list, link) {
+ if (t->request_node == request_node &&
+ t->response_node == response_node &&
+ t->tlabel == tlabel)
+ return t;
+ }
+
+ t = malloc(sizeof *t);
+ if (!t)
+ exit(EXIT_FAILURE);
+ t->request_node = request_node;
+ t->response_node = response_node;
+ t->tlabel = tlabel;
+ list_init(&t->request_list);
+ list_init(&t->response_list);
+
+ list_append(&pending_transaction_list, &t->link);
+
+ return t;
+}
+
+static void
+link_transaction_destroy(struct link_transaction *t)
+{
+ struct subaction *sa;
+
+ while (!list_empty(&t->request_list)) {
+ sa = list_head(&t->request_list, struct subaction, link);
+ list_remove(&sa->link);
+ subaction_destroy(sa);
+ }
+ while (!list_empty(&t->response_list)) {
+ sa = list_head(&t->response_list, struct subaction, link);
+ list_remove(&sa->link);
+ subaction_destroy(sa);
+ }
+ free(t);
+}
+
+struct protocol_decoder {
+ const char *name;
+ int (*decode)(struct link_transaction *t);
+};
+
+static const struct protocol_decoder protocol_decoders[] = {
+ { "FCP", decode_fcp }
+};
+
+static void
+handle_transaction(struct link_transaction *t)
+{
+ struct subaction *sa;
+ int i;
+
+ if (!t->request) {
+ printf("BUG in handle_transaction\n");
+ return;
+ }
+
+ for (i = 0; i < array_length(protocol_decoders); i++)
+ if (protocol_decoders[i].decode(t))
+ break;
+
+ /* HACK: decode only fcp right now. */
+ return;
+
+ decode_link_packet(&t->request->packet, t->request->length,
+ PACKET_FIELD_TRANSACTION, 0);
+ if (t->response)
+ decode_link_packet(&t->response->packet, t->request->length,
+ PACKET_FIELD_TRANSACTION, 0);
+ else
+ printf("[no response]");
+
+ if (option_verbose) {
+ list_for_each_entry(sa, &t->request_list, link)
+ print_packet((uint32_t *) &sa->packet, sa->length);
+ list_for_each_entry(sa, &t->response_list, link)
+ print_packet((uint32_t *) &sa->packet, sa->length);
+ }
+ printf("\r\n");
+
+ link_transaction_destroy(t);
+}
+
+static void
+clear_pending_transaction_list(void)
+{
+ struct link_transaction *t;
+
+ while (!list_empty(&pending_transaction_list)) {
+ t = list_head(&pending_transaction_list,
+ struct link_transaction, link);
+ list_remove(&t->link);
+ link_transaction_destroy(t);
+ /* print unfinished transactions */
+ }
+}
+
+static const char * const tcode_names[] = {
+ [0x0] = "write_quadlet_request", [0x6] = "read_quadlet_response",
+ [0x1] = "write_block_request", [0x7] = "read_block_response",
+ [0x2] = "write_response", [0x8] = "cycle_start",
+ [0x3] = "reserved", [0x9] = "lock_request",
+ [0x4] = "read_quadlet_request", [0xa] = "iso_data",
+ [0x5] = "read_block_request", [0xb] = "lock_response",
+};
+
+static const char * const ack_names[] = {
+ [0x0] = "no ack", [0x8] = "reserved (0x08)",
+ [0x1] = "ack_complete", [0x9] = "reserved (0x09)",
+ [0x2] = "ack_pending", [0xa] = "reserved (0x0a)",
+ [0x3] = "reserved (0x03)", [0xb] = "reserved (0x0b)",
+ [0x4] = "ack_busy_x", [0xc] = "reserved (0x0c)",
+ [0x5] = "ack_busy_a", [0xd] = "ack_data_error",
+ [0x6] = "ack_busy_b", [0xe] = "ack_type_error",
+ [0x7] = "reserved (0x07)", [0xf] = "reserved (0x0f)",
+};
+
+static const char * const rcode_names[] = {
+ [0x0] = "complete", [0x4] = "conflict_error",
+ [0x1] = "reserved (0x01)", [0x5] = "data_error",
+ [0x2] = "reserved (0x02)", [0x6] = "type_error",
+ [0x3] = "reserved (0x03)", [0x7] = "address_error",
+};
+
+static const char * const retry_names[] = {
+ [0x0] = "retry_1",
+ [0x1] = "retry_x",
+ [0x2] = "retry_a",
+ [0x3] = "retry_b",
+};
+
+enum {
+ PACKET_RESERVED,
+ PACKET_REQUEST,
+ PACKET_RESPONSE,
+ PACKET_OTHER,
+};
+
+struct packet_info {
+ const char *name;
+ int type;
+ int response_tcode;
+ const struct packet_field *fields;
+ int field_count;
+};
+
+struct packet_field {
+ const char *name; /* Short name for field. */
+ int offset; /* Location of field, specified in bits; */
+ /* negative means from end of packet. */
+ int width; /* Width of field, 0 means use data_length. */
+ int flags; /* Show options. */
+ const char * const *value_names;
+};
+
+#define COMMON_REQUEST_FIELDS \
+ { "dest", 0, 16, PACKET_FIELD_TRANSACTION }, \
+ { "tl", 16, 6 }, \
+ { "rt", 22, 2, PACKET_FIELD_DETAIL, retry_names }, \
+ { "tcode", 24, 4, PACKET_FIELD_TRANSACTION, tcode_names }, \
+ { "pri", 28, 4, PACKET_FIELD_DETAIL }, \
+ { "src", 32, 16, PACKET_FIELD_TRANSACTION }, \
+ { "offs", 48, 48, PACKET_FIELD_TRANSACTION }
+
+#define COMMON_RESPONSE_FIELDS \
+ { "dest", 0, 16 }, \
+ { "tl", 16, 6 }, \
+ { "rt", 22, 2, PACKET_FIELD_DETAIL, retry_names }, \
+ { "tcode", 24, 4, 0, tcode_names }, \
+ { "pri", 28, 4, PACKET_FIELD_DETAIL }, \
+ { "src", 32, 16 }, \
+ { "rcode", 48, 4, PACKET_FIELD_TRANSACTION, rcode_names }
+
+static const struct packet_field read_quadlet_request_fields[] = {
+ COMMON_REQUEST_FIELDS,
+ { "crc", 96, 32, PACKET_FIELD_DETAIL },
+ { "ack", 156, 4, 0, ack_names },
+};
+
+static const struct packet_field read_quadlet_response_fields[] = {
+ COMMON_RESPONSE_FIELDS,
+ { "data", 96, 32, PACKET_FIELD_TRANSACTION },
+ { "crc", 128, 32, PACKET_FIELD_DETAIL },
+ { "ack", 188, 4, 0, ack_names },
+};
+
+static const struct packet_field read_block_request_fields[] = {
+ COMMON_REQUEST_FIELDS,
+ { "data_length", 96, 16, PACKET_FIELD_TRANSACTION },
+ { "extended_tcode", 112, 16 },
+ { "crc", 128, 32, PACKET_FIELD_DETAIL },
+ { "ack", 188, 4, 0, ack_names },
+};
+
+static const struct packet_field block_response_fields[] = {
+ COMMON_RESPONSE_FIELDS,
+ { "data_length", 96, 16, PACKET_FIELD_DATA_LENGTH },
+ { "extended_tcode", 112, 16 },
+ { "crc", 128, 32, PACKET_FIELD_DETAIL },
+ { "data", 160, 0, PACKET_FIELD_TRANSACTION },
+ { "crc", -64, 32, PACKET_FIELD_DETAIL },
+ { "ack", -4, 4, 0, ack_names },
+};
+
+static const struct packet_field write_quadlet_request_fields[] = {
+ COMMON_REQUEST_FIELDS,
+ { "data", 96, 32, PACKET_FIELD_TRANSACTION },
+ { "ack", -4, 4, 0, ack_names },
+};
+
+static const struct packet_field block_request_fields[] = {
+ COMMON_REQUEST_FIELDS,
+ { "data_length", 96, 16, PACKET_FIELD_DATA_LENGTH | PACKET_FIELD_TRANSACTION },
+ { "extended_tcode", 112, 16, PACKET_FIELD_TRANSACTION },
+ { "crc", 128, 32, PACKET_FIELD_DETAIL },
+ { "data", 160, 0, PACKET_FIELD_TRANSACTION },
+ { "crc", -64, 32, PACKET_FIELD_DETAIL },
+ { "ack", -4, 4, 0, ack_names },
+};
+
+static const struct packet_field write_response_fields[] = {
+ COMMON_RESPONSE_FIELDS,
+ { "reserved", 64, 32, PACKET_FIELD_DETAIL },
+ { "ack", -4, 4, 0, ack_names },
+};
+
+static const struct packet_field iso_data_fields[] = {
+ { "data_length", 0, 16, PACKET_FIELD_DATA_LENGTH },
+ { "tag", 16, 2 },
+ { "channel", 18, 6 },
+ { "tcode", 24, 4, 0, tcode_names },
+ { "sy", 28, 4 },
+ { "crc", 32, 32, PACKET_FIELD_DETAIL },
+ { "data", 64, 0 },
+ { "crc", -64, 32, PACKET_FIELD_DETAIL },
+ { "ack", -4, 4, 0, ack_names },
+};
+
+static const struct packet_info packet_info[] = {
+ {
+ .name = "write_quadlet_request",
+ .type = PACKET_REQUEST,
+ .response_tcode = TCODE_WRITE_RESPONSE,
+ .fields = write_quadlet_request_fields,
+ .field_count = array_length(write_quadlet_request_fields)
+ },
+ {
+ .name = "write_block_request",
+ .type = PACKET_REQUEST,
+ .response_tcode = TCODE_WRITE_RESPONSE,
+ .fields = block_request_fields,
+ .field_count = array_length(block_request_fields)
+ },
+ {
+ .name = "write_response",
+ .type = PACKET_RESPONSE,
+ .fields = write_response_fields,
+ .field_count = array_length(write_response_fields)
+ },
+ {
+ .name = "reserved",
+ .type = PACKET_RESERVED,
+ },
+ {
+ .name = "read_quadlet_request",
+ .type = PACKET_REQUEST,
+ .response_tcode = TCODE_READ_QUADLET_RESPONSE,
+ .fields = read_quadlet_request_fields,
+ .field_count = array_length(read_quadlet_request_fields)
+ },
+ {
+ .name = "read_block_request",
+ .type = PACKET_REQUEST,
+ .response_tcode = TCODE_READ_BLOCK_RESPONSE,
+ .fields = read_block_request_fields,
+ .field_count = array_length(read_block_request_fields)
+ },
+ {
+ .name = "read_quadlet_response",
+ .type = PACKET_RESPONSE,
+ .fields = read_quadlet_response_fields,
+ .field_count = array_length(read_quadlet_response_fields)
+ },
+ {
+ .name = "read_block_response",
+ .type = PACKET_RESPONSE,
+ .fields = block_response_fields,
+ .field_count = array_length(block_response_fields)
+ },
+ {
+ .name = "cycle_start",
+ .type = PACKET_OTHER,
+ .fields = write_quadlet_request_fields,
+ .field_count = array_length(write_quadlet_request_fields)
+ },
+ {
+ .name = "lock_request",
+ .type = PACKET_REQUEST,
+ .fields = block_request_fields,
+ .field_count = array_length(block_request_fields)
+ },
+ {
+ .name = "iso_data",
+ .type = PACKET_OTHER,
+ .fields = iso_data_fields,
+ .field_count = array_length(iso_data_fields)
+ },
+ {
+ .name = "lock_response",
+ .type = PACKET_RESPONSE,
+ .fields = block_response_fields,
+ .field_count = array_length(block_response_fields)
+ },
+};
+
+static int
+handle_request_packet(uint32_t *data, size_t length)
+{
+ struct link_packet *p = (struct link_packet *) data;
+ struct subaction *sa, *prev;
+ struct link_transaction *t;
+
+ t = link_transaction_lookup(p->common.source, p->common.destination,
+ p->common.tlabel);
+ sa = subaction_create(data, length);
+ t->request = sa;
+
+ if (!list_empty(&t->request_list)) {
+ prev = list_tail(&t->request_list,
+ struct subaction, link);
+
+ if (!ACK_BUSY(prev->ack)) {
+ /*
+ * error, we should only see ack_busy_* before the
+ * ack_pending/ack_complete -- this is an ack_pending
+ * instead (ack_complete would have finished the
+ * transaction).
+ */
+ }
+
+ if (prev->packet.common.tcode != sa->packet.common.tcode ||
+ prev->packet.common.tlabel != sa->packet.common.tlabel) {
+ /* memcmp() ? */
+ /* error, these should match for retries. */
+ }
+ }
+
+ list_append(&t->request_list, &sa->link);
+
+ switch (sa->ack) {
+ case ACK_COMPLETE:
+ if (p->common.tcode != TCODE_WRITE_QUADLET_REQUEST &&
+ p->common.tcode != TCODE_WRITE_BLOCK_REQUEST)
+ /* error, unified transactions only allowed for write */;
+ list_remove(&t->link);
+ handle_transaction(t);
+ break;
+
+ case ACK_NO_ACK:
+ case ACK_DATA_ERROR:
+ case ACK_TYPE_ERROR:
+ list_remove(&t->link);
+ handle_transaction(t);
+ break;
+
+ case ACK_PENDING:
+ /* request subaction phase over, wait for response. */
+ break;
+
+ case ACK_BUSY_X:
+ case ACK_BUSY_A:
+ case ACK_BUSY_B:
+ /* ok, wait for retry. */
+ /* check that retry protocol is respected. */
+ break;
+ }
+
+ return 1;
+}
+
+static int
+handle_response_packet(uint32_t *data, size_t length)
+{
+ struct link_packet *p = (struct link_packet *) data;
+ struct subaction *sa, *prev;
+ struct link_transaction *t;
+
+ t = link_transaction_lookup(p->common.destination, p->common.source,
+ p->common.tlabel);
+ if (list_empty(&t->request_list)) {
+ /* unsolicited response */
+ }
+
+ sa = subaction_create(data, length);
+ t->response = sa;
+
+ if (!list_empty(&t->response_list)) {
+ prev = list_tail(&t->response_list, struct subaction, link);
+
+ if (!ACK_BUSY(prev->ack)) {
+ /*
+ * error, we should only see ack_busy_* before the
+ * ack_pending/ack_complete
+ */
+ }
+
+ if (prev->packet.common.tcode != sa->packet.common.tcode ||
+ prev->packet.common.tlabel != sa->packet.common.tlabel) {
+ /* use memcmp() instead? */
+ /* error, these should match for retries. */
+ }
+ } else {
+ prev = list_tail(&t->request_list, struct subaction, link);
+ if (prev->ack != ACK_PENDING) {
+ /*
+ * error, should not get response unless last request got
+ * ack_pending.
+ */
+ }
+
+ if (packet_info[prev->packet.common.tcode].response_tcode !=
+ sa->packet.common.tcode) {
+ /* error, tcode mismatch */
+ }
+ }
+
+ list_append(&t->response_list, &sa->link);
+
+ switch (sa->ack) {
+ case ACK_COMPLETE:
+ case ACK_NO_ACK:
+ case ACK_DATA_ERROR:
+ case ACK_TYPE_ERROR:
+ list_remove(&t->link);
+ handle_transaction(t);
+ /* transaction complete, remove t from pending list. */
+ break;
+
+ case ACK_PENDING:
+ /* error for responses. */
+ break;
+
+ case ACK_BUSY_X:
+ case ACK_BUSY_A:
+ case ACK_BUSY_B:
+ /* no problem, wait for next retry */
+ break;
+ }
+
+ return 1;
+}
+
+static int
+handle_packet(uint32_t *data, size_t length)
+{
+ if (length == 0) {
+ printf("bus reset\r\n");
+ clear_pending_transaction_list();
+ } else if (length > sizeof(struct phy_packet)) {
+ struct link_packet *p = (struct link_packet *) data;
+
+ switch (packet_info[p->common.tcode].type) {
+ case PACKET_REQUEST:
+ return handle_request_packet(data, length);
+
+ case PACKET_RESPONSE:
+ return handle_response_packet(data, length);
+
+ case PACKET_OTHER:
+ case PACKET_RESERVED:
+ return 0;
+ }
+ }
+
+ return 1;
+}
+
+static unsigned int
+get_bits(struct link_packet *packet, int offset, int width)
+{
+ uint32_t *data = (uint32_t *) packet;
+ uint32_t index, shift, mask;
+
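+ /* data[0] holds the capture timestamp; the field offsets in the tables
+ * above are relative to the packet header, hence the extra quadlet ("+ 1"). */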
+ index = offset / 32 + 1;
+ shift = 32 - (offset & 31) - width;
+ mask = width == 32 ? ~0 : (1 << width) - 1;
+
+ return (data[index] >> shift) & mask;
+}
+
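+/*
+ * dump_data() prints bytes in wire (big-endian) order, so on little-endian
+ * hosts the bytes within each captured quadlet are re-indexed.
+ */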
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define byte_index(i) ((i) ^ 3)
+#elif __BYTE_ORDER == __BIG_ENDIAN
+#define byte_index(i) (i)
+#else
+#error unsupported byte order.
+#endif
+
+static void
+dump_data(unsigned char *data, int length)
+{
+ int i, print_length;
+
+ if (length > 128)
+ print_length = 128;
+ else
+ print_length = length;
+
+ for (i = 0; i < print_length; i++)
+ printf("%s%02hhx",
+ (i % 4 == 0 && i != 0) ? " " : "",
+ data[byte_index(i)]);
+
+ if (print_length < length)
+ printf(" (%d more bytes)", length - print_length);
+}
+
+static void
+decode_link_packet(struct link_packet *packet, size_t length,
+ int include_flags, int exclude_flags)
+{
+ const struct packet_info *pi;
+ int data_length = 0;
+ int i;
+
+ pi = &packet_info[packet->common.tcode];
+
+ for (i = 0; i < pi->field_count; i++) {
+ const struct packet_field *f = &pi->fields[i];
+ int offset;
+
+ if (f->flags & exclude_flags)
+ continue;
+ if (include_flags && !(f->flags & include_flags))
+ continue;
+
+ if (f->offset < 0)
+ offset = length * 8 + f->offset - 32;
+ else
+ offset = f->offset;
+
+ if (f->value_names != NULL) {
+ uint32_t bits;
+
+ bits = get_bits(packet, offset, f->width);
+ printf("%s", f->value_names[bits]);
+ } else if (f->width == 0) {
+ printf("%s=[", f->name);
+ dump_data((unsigned char *) packet + (offset / 8 + 4), data_length);
+ printf("]");
+ } else {
+ unsigned long long bits;
+ int high_width, low_width;
+
+ if ((offset & ~31) != ((offset + f->width - 1) & ~31)) {
+ /* Bit field spans quadlet boundary. */
+ high_width = ((offset + 31) & ~31) - offset;
+ low_width = f->width - high_width;
+
+ bits = get_bits(packet, offset, high_width);
+ bits = (bits << low_width) |
+ get_bits(packet, offset + high_width, low_width);
+ } else {
+ bits = get_bits(packet, offset, f->width);
+ }
+
+ printf("%s=0x%0*llx", f->name, (f->width + 3) / 4, bits);
+
+ if (f->flags & PACKET_FIELD_DATA_LENGTH)
+ data_length = bits;
+ }
+
+ if (i < pi->field_count - 1)
+ printf(", ");
+ }
+}
+
+static void
+print_packet(uint32_t *data, size_t length)
+{
+ int i;
+
+ printf("%6u ", data[0]);
+
+ if (length == 4) {
+ printf("bus reset");
+ } else if (length < sizeof(struct phy_packet)) {
+ printf("short packet: ");
+ for (i = 1; i < length / 4; i++)
+ printf("%s%08x", i == 0 ? "[" : " ", data[i]);
+ printf("]");
+
+ } else if (length == sizeof(struct phy_packet) && data[1] == ~data[2]) {
+ struct phy_packet *pp = (struct phy_packet *) data;
+
+ /* PHY packets are three quadlets: the one-quadlet payload,
+ * the bitwise inverse of the payload, and the snoop
+ * mode ack. */
+
+ switch (pp->common.identifier) {
+ case PHY_PACKET_CONFIGURATION:
+ if (!pp->phy_config.set_root && !pp->phy_config.set_gap_count) {
+ printf("ext phy config: phy_id=%02x", pp->phy_config.root_id);
+ } else {
+ printf("phy config:");
+ if (pp->phy_config.set_root)
+ printf(" set_root_id=%02x", pp->phy_config.root_id);
+ if (pp->phy_config.set_gap_count)
+ printf(" set_gap_count=%d", pp->phy_config.gap_count);
+ }
+ break;
+
+ case PHY_PACKET_LINK_ON:
+ printf("link-on packet, phy_id=%02x", pp->link_on.phy_id);
+ break;
+
+ case PHY_PACKET_SELF_ID:
+ if (pp->self_id.extended) {
+ printf("extended self id: phy_id=%02x, seq=%d",
+ pp->ext_self_id.phy_id, pp->ext_self_id.sequence);
+ } else {
+ static const char * const speed_names[] = {
+ "S100", "S200", "S400", "BETA"
+ };
+ printf("self id: phy_id=%02x, link %s, gap_count=%d, speed=%s%s%s",
+ pp->self_id.phy_id,
+ (pp->self_id.link_active ? "active" : "not active"),
+ pp->self_id.gap_count,
+ speed_names[pp->self_id.phy_speed],
+ (pp->self_id.contender ? ", irm contender" : ""),
+ (pp->self_id.initiated_reset ? ", initiator" : ""));
+ }
+ break;
+ default:
+ printf("unknown phy packet: ");
+ for (i = 1; i < length / 4; i++)
+ printf("%s%08x", i == 0 ? "[" : " ", data[i]);
+ printf("]");
+ break;
+ }
+ } else {
+ struct link_packet *packet = (struct link_packet *) data;
+
+ decode_link_packet(packet, length, 0,
+ option_verbose ? 0 : PACKET_FIELD_DETAIL);
+ }
+
+ if (option_hex) {
+ printf(" [");
+ dump_data((unsigned char *) data + 4, length - 4);
+ printf("]");
+ }
+
+ printf("\r\n");
+}
+
+#define HIDE_CURSOR "\033[?25l"
+#define SHOW_CURSOR "\033[?25h"
+#define CLEAR "\033[H\033[2J"
+
+static void
+print_stats(uint32_t *data, size_t length)
+{
+ static int bus_reset_count, short_packet_count, phy_packet_count;
+ static int tcode_count[16];
+ static struct timeval last_update;
+ struct timeval now;
+ int i;
+
+ if (length == 0)
+ bus_reset_count++;
+ else if (length < sizeof(struct phy_packet))
+ short_packet_count++;
+ else if (length == sizeof(struct phy_packet) && data[1] == ~data[2])
+ phy_packet_count++;
+ else {
+ struct link_packet *packet = (struct link_packet *) data;
+ tcode_count[packet->common.tcode]++;
+ }
+
+ gettimeofday(&now, NULL);
+ if (now.tv_sec <= last_update.tv_sec &&
+ now.tv_usec < last_update.tv_usec + 500000)
+ return;
+
+ last_update = now;
+ printf(CLEAR HIDE_CURSOR
+ " bus resets : %8d\n"
+ " short packets : %8d\n"
+ " phy packets : %8d\n",
+ bus_reset_count, short_packet_count, phy_packet_count);
+
+ for (i = 0; i < array_length(packet_info); i++)
+ if (packet_info[i].type != PACKET_RESERVED)
+ printf(" %-24s: %8d\n", packet_info[i].name, tcode_count[i]);
+ printf(SHOW_CURSOR "\n");
+}
+
+static struct termios saved_attributes;
+
+static void
+reset_input_mode(void)
+{
+ tcsetattr(STDIN_FILENO, TCSANOW, &saved_attributes);
+}
+
+static void
+set_input_mode(void)
+{
+ struct termios tattr;
+
+ /* Make sure stdin is a terminal. */
+ if (!isatty(STDIN_FILENO)) {
+ fprintf(stderr, "Not a terminal.\n");
+ exit(EXIT_FAILURE);
+ }
+
+ /* Save the terminal attributes so we can restore them later. */
+ tcgetattr(STDIN_FILENO, &saved_attributes);
+ atexit(reset_input_mode);
+
+ /* Set the funny terminal modes. */
+ tcgetattr(STDIN_FILENO, &tattr);
+ tattr.c_lflag &= ~(ICANON|ECHO); /* Clear ICANON and ECHO. */
+ tattr.c_cc[VMIN] = 1;
+ tattr.c_cc[VTIME] = 0;
+ tcsetattr(STDIN_FILENO, TCSAFLUSH, &tattr);
+}
+
+int main(int argc, const char *argv[])
+{
+ uint32_t buf[128 * 1024];
+ uint32_t filter;
+ int length, retval, view;
+ int fd = -1;
+ FILE *output = NULL, *input = NULL;
+ poptContext con;
+ char c;
+ struct pollfd pollfds[2];
+
+ sys_sigint_handler = signal(SIGINT, sigint_handler);
+
+ con = poptGetContext(NULL, argc, argv, options, 0);
+ retval = poptGetNextOpt(con);
+ if (retval < -1) {
+ poptPrintUsage(con, stdout, 0);
+ return -1;
+ }
+
+ if (option_version) {
+ printf("dump tool for nosy sniffer, version %s\n", VERSION);
+ return 0;
+ }
+
+ if (__BYTE_ORDER != __LITTLE_ENDIAN)
+ fprintf(stderr, "warning: nosy has only been tested on little "
+ "endian machines\n");
+
+ if (option_input != NULL) {
+ input = fopen(option_input, "r");
+ if (input == NULL) {
+ fprintf(stderr, "Could not open %s, %m\n", option_input);
+ return -1;
+ }
+ } else {
+ fd = open(option_nosy_device, O_RDWR);
+ if (fd < 0) {
+ fprintf(stderr, "Could not open %s, %m\n", option_nosy_device);
+ return -1;
+ }
+ set_input_mode();
+ }
+
+ if (strcmp(option_view, "transaction") == 0)
+ view = VIEW_TRANSACTION;
+ else if (strcmp(option_view, "stats") == 0)
+ view = VIEW_STATS;
+ else
+ view = VIEW_PACKET;
+
+ if (option_output) {
+ output = fopen(option_output, "w");
+ if (output == NULL) {
+ fprintf(stderr, "Could not open %s, %m\n", option_output);
+ return -1;
+ }
+ }
+
+ setvbuf(stdout, NULL, _IOLBF, BUFSIZ);
+
+ filter = ~0;
+ if (!option_iso)
+ filter &= ~(1 << TCODE_STREAM_DATA);
+ if (!option_cycle_start)
+ filter &= ~(1 << TCODE_CYCLE_START);
+ if (view == VIEW_STATS)
+ filter = ~(1 << TCODE_CYCLE_START);
+
+ ioctl(fd, NOSY_IOC_FILTER, filter);
+
+ ioctl(fd, NOSY_IOC_START);
+
+ pollfds[0].fd = fd;
+ pollfds[0].events = POLLIN;
+ pollfds[1].fd = STDIN_FILENO;
+ pollfds[1].events = POLLIN;
+
+ while (run) {
+ if (input != NULL) {
+ if (fread(&length, sizeof length, 1, input) != 1)
+ return 0;
+ fread(buf, 1, length, input);
+ } else {
+ poll(pollfds, 2, -1);
+ if (pollfds[1].revents) {
+ read(STDIN_FILENO, &c, sizeof c);
+ switch (c) {
+ case 'q':
+ if (output != NULL)
+ fclose(output);
+ return 0;
+ }
+ }
+
+ if (pollfds[0].revents)
+ length = read(fd, buf, sizeof buf);
+ else
+ continue;
+ }
+
+ if (output != NULL) {
+ fwrite(&length, sizeof length, 1, output);
+ fwrite(buf, 1, length, output);
+ }
+
+ switch (view) {
+ case VIEW_TRANSACTION:
+ handle_packet(buf, length);
+ break;
+ case VIEW_PACKET:
+ print_packet(buf, length);
+ break;
+ case VIEW_STATS:
+ print_stats(buf, length);
+ break;
+ }
+ }
+
+ if (output != NULL)
+ fclose(output);
+
+ close(fd);
+
+ poptFreeContext(con);
+
+ return 0;
+}
diff --git a/tools/firewire/nosy-dump.h b/tools/firewire/nosy-dump.h
new file mode 100644
index 00000000000..3a4b5b33ba5
--- /dev/null
+++ b/tools/firewire/nosy-dump.h
@@ -0,0 +1,173 @@
+#ifndef __nosy_dump_h__
+#define __nosy_dump_h__
+
+#define array_length(array) (sizeof(array) / sizeof(array[0]))
+
+#define ACK_NO_ACK 0x0
+#define ACK_DONE(a) ((a >> 2) == 0)
+#define ACK_BUSY(a) ((a >> 2) == 1)
+#define ACK_ERROR(a) ((a >> 2) == 3)
+
+#include <stdint.h>
+
+struct phy_packet {
+ uint32_t timestamp;
+ union {
+ struct {
+ uint32_t zero:24;
+ uint32_t phy_id:6;
+ uint32_t identifier:2;
+ } common, link_on;
+
+ struct {
+ uint32_t zero:16;
+ uint32_t gap_count:6;
+ uint32_t set_gap_count:1;
+ uint32_t set_root:1;
+ uint32_t root_id:6;
+ uint32_t identifier:2;
+ } phy_config;
+
+ struct {
+ uint32_t more_packets:1;
+ uint32_t initiated_reset:1;
+ uint32_t port2:2;
+ uint32_t port1:2;
+ uint32_t port0:2;
+ uint32_t power_class:3;
+ uint32_t contender:1;
+ uint32_t phy_delay:2;
+ uint32_t phy_speed:2;
+ uint32_t gap_count:6;
+ uint32_t link_active:1;
+ uint32_t extended:1;
+ uint32_t phy_id:6;
+ uint32_t identifier:2;
+ } self_id;
+
+ struct {
+ uint32_t more_packets:1;
+ uint32_t reserved1:1;
+ uint32_t porth:2;
+ uint32_t portg:2;
+ uint32_t portf:2;
+ uint32_t porte:2;
+ uint32_t portd:2;
+ uint32_t portc:2;
+ uint32_t portb:2;
+ uint32_t porta:2;
+ uint32_t reserved0:2;
+ uint32_t sequence:3;
+ uint32_t extended:1;
+ uint32_t phy_id:6;
+ uint32_t identifier:2;
+ } ext_self_id;
+ };
+ uint32_t inverted;
+ uint32_t ack;
+};
+
+#define TCODE_PHY_PACKET 0x10
+
+#define PHY_PACKET_CONFIGURATION 0x00
+#define PHY_PACKET_LINK_ON 0x01
+#define PHY_PACKET_SELF_ID 0x02
+
+struct link_packet {
+ uint32_t timestamp;
+ union {
+ struct {
+ uint32_t priority:4;
+ uint32_t tcode:4;
+ uint32_t rt:2;
+ uint32_t tlabel:6;
+ uint32_t destination:16;
+
+ uint32_t offset_high:16;
+ uint32_t source:16;
+
+ uint32_t offset_low;
+ } common;
+
+ struct {
+ uint32_t common[3];
+ uint32_t crc;
+ } read_quadlet;
+
+ struct {
+ uint32_t common[3];
+ uint32_t data;
+ uint32_t crc;
+ } read_quadlet_response;
+
+ struct {
+ uint32_t common[3];
+ uint32_t extended_tcode:16;
+ uint32_t data_length:16;
+ uint32_t crc;
+ } read_block;
+
+ struct {
+ uint32_t common[3];
+ uint32_t extended_tcode:16;
+ uint32_t data_length:16;
+ uint32_t crc;
+ uint32_t data[0];
+ /* crc and ack follows. */
+ } read_block_response;
+
+ struct {
+ uint32_t common[3];
+ uint32_t data;
+ uint32_t crc;
+ } write_quadlet;
+
+ struct {
+ uint32_t common[3];
+ uint32_t extended_tcode:16;
+ uint32_t data_length:16;
+ uint32_t crc;
+ uint32_t data[0];
+ /* crc and ack follows. */
+ } write_block;
+
+ struct {
+ uint32_t common[3];
+ uint32_t crc;
+ } write_response;
+
+ struct {
+ uint32_t common[3];
+ uint32_t data;
+ uint32_t crc;
+ } cycle_start;
+
+ struct {
+ uint32_t sy:4;
+ uint32_t tcode:4;
+ uint32_t channel:6;
+ uint32_t tag:2;
+ uint32_t data_length:16;
+
+ uint32_t crc;
+ } iso_data;
+ };
+};
+
+struct subaction {
+ uint32_t ack;
+ size_t length;
+ struct list link;
+ struct link_packet packet;
+};
+
+struct link_transaction {
+ int request_node, response_node, tlabel;
+ struct subaction *request, *response;
+ struct list request_list, response_list;
+ struct list link;
+};
+
+int decode_fcp(struct link_transaction *t);
+
+#endif /* __nosy_dump_h__ */
diff --git a/tools/hv/Makefile b/tools/hv/Makefile
new file mode 100644
index 00000000000..bd22f786a60
--- /dev/null
+++ b/tools/hv/Makefile
@@ -0,0 +1,13 @@
+# Makefile for Hyper-V tools
+
+CC = $(CROSS_COMPILE)gcc
+PTHREAD_LIBS = -lpthread
+WARNINGS = -Wall -Wextra
+CFLAGS = $(WARNINGS) -g $(PTHREAD_LIBS)
+
+all: hv_kvp_daemon hv_vss_daemon
+%: %.c
+ $(CC) $(CFLAGS) -o $@ $^
+
+clean:
+ $(RM) hv_kvp_daemon hv_vss_daemon
diff --git a/tools/hv/hv_fcopy_daemon.c b/tools/hv/hv_fcopy_daemon.c
new file mode 100644
index 00000000000..fba1c75aa48
--- /dev/null
+++ b/tools/hv/hv_fcopy_daemon.c
@@ -0,0 +1,197 @@
+/*
+ * An implementation of host to guest copy functionality for Linux.
+ *
+ * Copyright (C) 2014, Microsoft, Inc.
+ *
+ * Author : K. Y. Srinivasan <kys@microsoft.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ */
+
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/poll.h>
+#include <linux/types.h>
+#include <linux/kdev_t.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <ctype.h>
+#include <errno.h>
+#include <linux/hyperv.h>
+#include <syslog.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <dirent.h>
+
+static int target_fd;
+static char target_fname[W_MAX_PATH];
+
+static int hv_start_fcopy(struct hv_start_fcopy *smsg)
+{
+ int error = HV_E_FAIL;
+ char *q, *p;
+
+ /*
+ * If possible, append a path separator to the path.
+ */
+ if (strlen((char *)smsg->path_name) < (W_MAX_PATH - 2))
+ strcat((char *)smsg->path_name, "/");
+
+ p = (char *)smsg->path_name;
+ snprintf(target_fname, sizeof(target_fname), "%s/%s",
+ (char *)smsg->path_name, smsg->file_name);
+
+ syslog(LOG_INFO, "Target file name: %s", target_fname);
+ /*
+ * Check to see if the path is already in place; if not,
+ * create if required.
+ */
+ while ((q = strchr(p, '/')) != NULL) {
+ if (q == p) {
+ p++;
+ continue;
+ }
+ *q = '\0';
+ if (access((char *)smsg->path_name, F_OK)) {
+ if (smsg->copy_flags & CREATE_PATH) {
+ if (mkdir((char *)smsg->path_name, 0755)) {
+ syslog(LOG_ERR, "Failed to create %s",
+ (char *)smsg->path_name);
+ goto done;
+ }
+ } else {
+ syslog(LOG_ERR, "Invalid path: %s",
+ (char *)smsg->path_name);
+ goto done;
+ }
+ }
+ p = q + 1;
+ *q = '/';
+ }
+
+ if (!access(target_fname, F_OK)) {
+ syslog(LOG_INFO, "File: %s exists", target_fname);
+ if (!(smsg->copy_flags & OVER_WRITE)) {
+ error = HV_ERROR_ALREADY_EXISTS;
+ goto done;
+ }
+ }
+
+ target_fd = open(target_fname, O_RDWR | O_CREAT | O_CLOEXEC, 0744);
+ if (target_fd == -1) {
+ syslog(LOG_INFO, "Open Failed: %s", strerror(errno));
+ goto done;
+ }
+
+ error = 0;
+done:
+ return error;
+}
+
+static int hv_copy_data(struct hv_do_fcopy *cpmsg)
+{
+ ssize_t bytes_written;
+
+ bytes_written = pwrite(target_fd, cpmsg->data, cpmsg->size,
+ cpmsg->offset);
+
+ if (bytes_written != cpmsg->size)
+ return HV_E_FAIL;
+
+ return 0;
+}
+
+static int hv_copy_finished(void)
+{
+ close(target_fd);
+ return 0;
+}
+
+static int hv_copy_cancel(void)
+{
+ close(target_fd);
+ unlink(target_fname);
+ return 0;
+}
+
+int main(void)
+{
+ int fd, fcopy_fd, len;
+ int error;
+ int version = FCOPY_CURRENT_VERSION;
+ char buffer[4096 * 2];
+ struct hv_fcopy_hdr *in_msg;
+
+ if (daemon(1, 0)) {
+ syslog(LOG_ERR, "daemon() failed; error: %s", strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+ openlog("HV_FCOPY", 0, LOG_USER);
+ syslog(LOG_INFO, "HV_FCOPY starting; pid is:%d", getpid());
+
+ fcopy_fd = open("/dev/vmbus/hv_fcopy", O_RDWR);
+
+ if (fcopy_fd < 0) {
+ syslog(LOG_ERR, "open /dev/vmbus/hv_fcopy failed; error: %d %s",
+ errno, strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+ /*
+ * Register with the kernel.
+ */
+ if ((write(fcopy_fd, &version, sizeof(int))) != sizeof(int)) {
+ syslog(LOG_ERR, "Registration failed: %s", strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+
+ while (1) {
+ /*
+ * In this loop we process fcopy messages after the
+ * handshake is complete.
+ */
+ len = pread(fcopy_fd, buffer, (4096 * 2), 0);
+ if (len < 0) {
+ syslog(LOG_ERR, "pread failed: %s", strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ in_msg = (struct hv_fcopy_hdr *)buffer;
+
+ switch (in_msg->operation) {
+ case START_FILE_COPY:
+ error = hv_start_fcopy((struct hv_start_fcopy *)in_msg);
+ break;
+ case WRITE_TO_FILE:
+ error = hv_copy_data((struct hv_do_fcopy *)in_msg);
+ break;
+ case COMPLETE_FCOPY:
+ error = hv_copy_finished();
+ break;
+ case CANCEL_FCOPY:
+ error = hv_copy_cancel();
+ break;
+
+ default:
+ syslog(LOG_ERR, "Unknown operation: %d",
+ in_msg->operation);
+
+ }
+
+ if (pwrite(fcopy_fd, &error, sizeof(int), 0) != sizeof(int)) {
+ syslog(LOG_ERR, "pwrite failed: %s", strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ }
+}
diff --git a/tools/hv/hv_get_dhcp_info.sh b/tools/hv/hv_get_dhcp_info.sh
new file mode 100755
index 00000000000..ccd3e953276
--- /dev/null
+++ b/tools/hv/hv_get_dhcp_info.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+# This example script retrieves the DHCP state of a given interface.
+# In the interest of keeping the KVP daemon code free of distro-specific
+# information, the KVP daemon invokes this external script to gather the
+# DHCP setting for the specified interface.
+#
+# Input: Name of the interface
+#
+# Output: The script prints the string "Enabled" to stdout to indicate
+# that DHCP is enabled on the interface. If DHCP is not enabled,
+# the script prints the string "Disabled" to stdout.
+#
+# Each distro is expected to implement this script in a distro-specific
+# fashion. For instance, on distros that ship with Network Manager enabled,
+# this script can be based on the Network Manager APIs for retrieving DHCP
+# information.
+
+if_file="/etc/sysconfig/network-scripts/ifcfg-"$1
+
+dhcp=$(grep "dhcp" $if_file 2>/dev/null)
+
+if [ "$dhcp" != "" ];
+then
+echo "Enabled"
+else
+echo "Disabled"
+fi
diff --git a/tools/hv/hv_get_dns_info.sh b/tools/hv/hv_get_dns_info.sh
new file mode 100755
index 00000000000..058c17b46ff
--- /dev/null
+++ b/tools/hv/hv_get_dns_info.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# This example script parses /etc/resolv.conf to retrieve DNS information.
+# In the interest of keeping the KVP daemon code free of distro-specific
+# information, the KVP daemon invokes this external script to gather
+# DNS information.
+# This script is expected to print the nameserver values to stdout.
+# Each distro is expected to implement this script in a distro-specific
+# fashion. For instance, on distros that ship with Network Manager enabled,
+# this script can be based on the Network Manager APIs for retrieving DNS
+# entries.
+
+cat /etc/resolv.conf 2>/dev/null | awk '/^nameserver/ { print $2 }'
diff --git a/tools/hv/hv_kvp_daemon.c b/tools/hv/hv_kvp_daemon.c
new file mode 100644
index 00000000000..4088b816a3e
--- /dev/null
+++ b/tools/hv/hv_kvp_daemon.c
@@ -0,0 +1,1742 @@
+/*
+ * An implementation of key value pair (KVP) functionality for Linux.
+ *
+ *
+ * Copyright (C) 2010, Novell, Inc.
+ * Author : K. Y. Srinivasan <ksrinivasan@novell.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/poll.h>
+#include <sys/utsname.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <ctype.h>
+#include <errno.h>
+#include <arpa/inet.h>
+#include <linux/connector.h>
+#include <linux/hyperv.h>
+#include <linux/netlink.h>
+#include <ifaddrs.h>
+#include <netdb.h>
+#include <syslog.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <dirent.h>
+#include <net/if.h>
+
+/*
+ * KVP protocol: The user mode component first registers with the
+ * kernel component. Subsequently, the kernel component requests data
+ * for the specified keys. In response to this message the user mode component
+ * fills in the value corresponding to the specified key. We overload the
+ * sequence field in the cn_msg header to define our KVP message types.
+ *
+ * We also use this infrastructure to support queries from user mode
+ * applications for state that may be maintained in the KVP kernel component.
+ *
+ */
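+
+/*
+ * Illustrative sketch only (hypothetical helper, not used elsewhere in this
+ * file): one way the registration message described above can be framed
+ * with the netlink/connector headers.  CN_KVP_IDX and CN_KVP_VAL come from
+ * <linux/connector.h>; the KVP payload itself is left opaque here.
+ */
+#if 0
+static int kvp_example_register(int fd, const void *payload, size_t payload_len)
+{
+ char buf[NLMSG_SPACE(sizeof(struct cn_msg)) + 1024];
+ struct nlmsghdr *nlh = (struct nlmsghdr *)buf;
+ struct cn_msg *msg;
+
+ if (payload_len > 1024)
+ return -1;
+
+ memset(buf, 0, sizeof(buf));
+ nlh->nlmsg_len = NLMSG_LENGTH(sizeof(*msg) + payload_len);
+ nlh->nlmsg_type = NLMSG_DONE;
+
+ msg = (struct cn_msg *)NLMSG_DATA(nlh);
+ msg->id.idx = CN_KVP_IDX; /* connector channel used for HV KVP */
+ msg->id.val = CN_KVP_VAL;
+ msg->len = payload_len;
+ memcpy(msg->data, payload, payload_len);
+
+ if (write(fd, nlh, nlh->nlmsg_len) != (ssize_t)nlh->nlmsg_len)
+ return -1;
+ return 0;
+}
+#endif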
+
+
+enum key_index {
+ FullyQualifiedDomainName = 0,
+ IntegrationServicesVersion, /*This key is serviced in the kernel*/
+ NetworkAddressIPv4,
+ NetworkAddressIPv6,
+ OSBuildNumber,
+ OSName,
+ OSMajorVersion,
+ OSMinorVersion,
+ OSVersion,
+ ProcessorArchitecture
+};
+
+
+enum {
+ IPADDR = 0,
+ NETMASK,
+ GATEWAY,
+ DNS
+};
+
+static struct sockaddr_nl addr;
+static int in_hand_shake = 1;
+
+static char *os_name = "";
+static char *os_major = "";
+static char *os_minor = "";
+static char *processor_arch;
+static char *os_build;
+static char *os_version;
+static char *lic_version = "Unknown version";
+static char full_domain_name[HV_KVP_EXCHANGE_MAX_VALUE_SIZE];
+static struct utsname uts_buf;
+
+/*
+ * The location of the interface configuration file.
+ */
+
+#define KVP_CONFIG_LOC "/var/lib/hyperv"
+
+#define MAX_FILE_NAME 100
+#define ENTRIES_PER_BLOCK 50
+
+#ifndef SOL_NETLINK
+#define SOL_NETLINK 270
+#endif
+
+struct kvp_record {
+ char key[HV_KVP_EXCHANGE_MAX_KEY_SIZE];
+ char value[HV_KVP_EXCHANGE_MAX_VALUE_SIZE];
+};
+
+struct kvp_file_state {
+ int fd;
+ int num_blocks;
+ struct kvp_record *records;
+ int num_records;
+ char fname[MAX_FILE_NAME];
+};
+
+static struct kvp_file_state kvp_file_info[KVP_POOL_COUNT];
+
+static void kvp_acquire_lock(int pool)
+{
+ struct flock fl = {F_WRLCK, SEEK_SET, 0, 0, 0};
+ fl.l_pid = getpid();
+
+ if (fcntl(kvp_file_info[pool].fd, F_SETLKW, &fl) == -1) {
+ syslog(LOG_ERR, "Failed to acquire the lock pool: %d; error: %d %s", pool,
+ errno, strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+}
+
+static void kvp_release_lock(int pool)
+{
+ struct flock fl = {F_UNLCK, SEEK_SET, 0, 0, 0};
+ fl.l_pid = getpid();
+
+ if (fcntl(kvp_file_info[pool].fd, F_SETLK, &fl) == -1) {
+ syslog(LOG_ERR, "Failed to release the lock pool: %d; error: %d %s", pool,
+ errno, strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+}
+
+static void kvp_update_file(int pool)
+{
+ FILE *filep;
+ size_t bytes_written;
+
+ /*
+ * We are going to write our in-memory registry out to
+ * disk; acquire the lock first.
+ */
+ kvp_acquire_lock(pool);
+
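+ /* "e" in the mode string is the glibc extension that sets O_CLOEXEC. */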
+ filep = fopen(kvp_file_info[pool].fname, "we");
+ if (!filep) {
+ syslog(LOG_ERR, "Failed to open file, pool: %d; error: %d %s", pool,
+ errno, strerror(errno));
+ kvp_release_lock(pool);
+ exit(EXIT_FAILURE);
+ }
+
+ bytes_written = fwrite(kvp_file_info[pool].records,
+ sizeof(struct kvp_record),
+ kvp_file_info[pool].num_records, filep);
+
+ if (ferror(filep) || fclose(filep)) {
+ kvp_release_lock(pool);
+ syslog(LOG_ERR, "Failed to write file, pool: %d", pool);
+ exit(EXIT_FAILURE);
+ }
+
+ kvp_release_lock(pool);
+}
+
+static void kvp_update_mem_state(int pool)
+{
+ FILE *filep;
+ size_t records_read = 0;
+ struct kvp_record *record = kvp_file_info[pool].records;
+ struct kvp_record *readp;
+ int num_blocks = kvp_file_info[pool].num_blocks;
+ int alloc_unit = sizeof(struct kvp_record) * ENTRIES_PER_BLOCK;
+
+ kvp_acquire_lock(pool);
+
+ filep = fopen(kvp_file_info[pool].fname, "re");
+ if (!filep) {
+ syslog(LOG_ERR, "Failed to open file, pool: %d; error: %d %s", pool,
+ errno, strerror(errno));
+ kvp_release_lock(pool);
+ exit(EXIT_FAILURE);
+ }
+ for (;;) {
+ readp = &record[records_read];
+ records_read += fread(readp, sizeof(struct kvp_record),
+ ENTRIES_PER_BLOCK * num_blocks,
+ filep);
+
+ if (ferror(filep)) {
+ syslog(LOG_ERR, "Failed to read file, pool: %d", pool);
+ exit(EXIT_FAILURE);
+ }
+
+ if (!feof(filep)) {
+ /*
+ * We have more data to read.
+ */
+ num_blocks++;
+ record = realloc(record, alloc_unit * num_blocks);
+
+ if (record == NULL) {
+ syslog(LOG_ERR, "malloc failed");
+ exit(EXIT_FAILURE);
+ }
+ continue;
+ }
+ break;
+ }
+
+ kvp_file_info[pool].num_blocks = num_blocks;
+ kvp_file_info[pool].records = record;
+ kvp_file_info[pool].num_records = records_read;
+
+ fclose(filep);
+ kvp_release_lock(pool);
+}
+
+static int kvp_file_init(void)
+{
+ int fd;
+ FILE *filep;
+ size_t records_read;
+ char *fname;
+ struct kvp_record *record;
+ struct kvp_record *readp;
+ int num_blocks;
+ int i;
+ int alloc_unit = sizeof(struct kvp_record) * ENTRIES_PER_BLOCK;
+
+ if (access(KVP_CONFIG_LOC, F_OK)) {
+ if (mkdir(KVP_CONFIG_LOC, 0755 /* rwxr-xr-x */)) {
+ syslog(LOG_ERR, "Failed to create '%s'; error: %d %s", KVP_CONFIG_LOC,
+ errno, strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ }
+
+ for (i = 0; i < KVP_POOL_COUNT; i++) {
+ fname = kvp_file_info[i].fname;
+ records_read = 0;
+ num_blocks = 1;
+ sprintf(fname, "%s/.kvp_pool_%d", KVP_CONFIG_LOC, i);
+ fd = open(fname, O_RDWR | O_CREAT | O_CLOEXEC, 0644 /* rw-r--r-- */);
+
+ if (fd == -1)
+ return 1;
+
+
+ filep = fopen(fname, "re");
+ if (!filep) {
+ close(fd);
+ return 1;
+ }
+
+ record = malloc(alloc_unit * num_blocks);
+ if (record == NULL) {
+ fclose(filep);
+ close(fd);
+ return 1;
+ }
+ for (;;) {
+ readp = &record[records_read];
+ records_read += fread(readp, sizeof(struct kvp_record),
+ ENTRIES_PER_BLOCK,
+ filep);
+
+ if (ferror(filep)) {
+ syslog(LOG_ERR, "Failed to read file, pool: %d",
+ i);
+ exit(EXIT_FAILURE);
+ }
+
+ if (!feof(filep)) {
+ /*
+ * We have more data to read.
+ */
+ num_blocks++;
+ record = realloc(record, alloc_unit *
+ num_blocks);
+ if (record == NULL) {
+ fclose(filep);
+ close(fd);
+ return 1;
+ }
+ continue;
+ }
+ break;
+ }
+ kvp_file_info[i].fd = fd;
+ kvp_file_info[i].num_blocks = num_blocks;
+ kvp_file_info[i].records = record;
+ kvp_file_info[i].num_records = records_read;
+ fclose(filep);
+
+ }
+
+ return 0;
+}
+
+static int kvp_key_delete(int pool, const char *key, int key_size)
+{
+ int i;
+ int j, k;
+ int num_records;
+ struct kvp_record *record;
+
+ /*
+ * First update the in-memory state.
+ */
+ kvp_update_mem_state(pool);
+
+ num_records = kvp_file_info[pool].num_records;
+ record = kvp_file_info[pool].records;
+
+ for (i = 0; i < num_records; i++) {
+ if (memcmp(key, record[i].key, key_size))
+ continue;
+ /*
+ * Found a match; just move the remaining
+ * entries up.
+ */
+ if (i == num_records) {
+ kvp_file_info[pool].num_records--;
+ kvp_update_file(pool);
+ return 0;
+ }
+
+ j = i;
+ k = j + 1;
+ for (; k < num_records; k++) {
+ strcpy(record[j].key, record[k].key);
+ strcpy(record[j].value, record[k].value);
+ j++;
+ }
+
+ kvp_file_info[pool].num_records--;
+ kvp_update_file(pool);
+ return 0;
+ }
+ return 1;
+}
+
+static int kvp_key_add_or_modify(int pool, const char *key, int key_size, const char *value,
+ int value_size)
+{
+ int i;
+ int num_records;
+ struct kvp_record *record;
+ int num_blocks;
+
+ if ((key_size > HV_KVP_EXCHANGE_MAX_KEY_SIZE) ||
+ (value_size > HV_KVP_EXCHANGE_MAX_VALUE_SIZE))
+ return 1;
+
+ /*
+ * First update the in-memory state.
+ */
+ kvp_update_mem_state(pool);
+
+ num_records = kvp_file_info[pool].num_records;
+ record = kvp_file_info[pool].records;
+ num_blocks = kvp_file_info[pool].num_blocks;
+
+ for (i = 0; i < num_records; i++) {
+ if (memcmp(key, record[i].key, key_size))
+ continue;
+ /*
+ * Found a match; just update the value -
+ * this is the modify case.
+ */
+ memcpy(record[i].value, value, value_size);
+ kvp_update_file(pool);
+ return 0;
+ }
+
+ /*
+ * Need to add a new entry;
+ */
+ if (num_records == (ENTRIES_PER_BLOCK * num_blocks)) {
+ /* Need to allocate a larger array for reg entries. */
+ record = realloc(record, sizeof(struct kvp_record) *
+ ENTRIES_PER_BLOCK * (num_blocks + 1));
+
+ if (record == NULL)
+ return 1;
+ kvp_file_info[pool].num_blocks++;
+
+ }
+ memcpy(record[i].value, value, value_size);
+ memcpy(record[i].key, key, key_size);
+ kvp_file_info[pool].records = record;
+ kvp_file_info[pool].num_records++;
+ kvp_update_file(pool);
+ return 0;
+}
+
+static int kvp_get_value(int pool, const char *key, int key_size, char *value,
+ int value_size)
+{
+ int i;
+ int num_records;
+ struct kvp_record *record;
+
+ if ((key_size > HV_KVP_EXCHANGE_MAX_KEY_SIZE) ||
+ (value_size > HV_KVP_EXCHANGE_MAX_VALUE_SIZE))
+ return 1;
+
+ /*
+ * First update the in-memory state.
+ */
+ kvp_update_mem_state(pool);
+
+ num_records = kvp_file_info[pool].num_records;
+ record = kvp_file_info[pool].records;
+
+ for (i = 0; i < num_records; i++) {
+ if (memcmp(key, record[i].key, key_size))
+ continue;
+ /*
+ * Found a match; just copy the value out.
+ */
+ memcpy(value, record[i].value, value_size);
+ return 0;
+ }
+
+ return 1;
+}
+
+static int kvp_pool_enumerate(int pool, int index, char *key, int key_size,
+ char *value, int value_size)
+{
+ struct kvp_record *record;
+
+ /*
+ * First update our in-memory database.
+ */
+ kvp_update_mem_state(pool);
+ record = kvp_file_info[pool].records;
+
+ if (index >= kvp_file_info[pool].num_records) {
+ return 1;
+ }
+
+ memcpy(key, record[index].key, key_size);
+ memcpy(value, record[index].value, value_size);
+ return 0;
+}
+
+
+void kvp_get_os_info(void)
+{
+ FILE *file;
+ char *p, buf[512];
+
+ uname(&uts_buf);
+ os_version = uts_buf.release;
+ os_build = strdup(uts_buf.release);
+
+ os_name = uts_buf.sysname;
+ processor_arch = uts_buf.machine;
+
+ /*
+ * The current windows host (win7) expects the build
+ * string to be of the form: x.y.z
+ * Strip additional information we may have.
+ */
+ p = strchr(os_version, '-');
+ if (p)
+ *p = '\0';
+
+ /*
+ * Parse the /etc/os-release file if present:
+ * http://www.freedesktop.org/software/systemd/man/os-release.html
+ */
+ file = fopen("/etc/os-release", "r");
+ if (file != NULL) {
+ while (fgets(buf, sizeof(buf), file)) {
+ char *value, *q;
+
+ /* Ignore comments */
+ if (buf[0] == '#')
+ continue;
+
+ /* Split into name=value */
+ p = strchr(buf, '=');
+ if (!p)
+ continue;
+ *p++ = 0;
+
+ /* Remove quotes and newline; un-escape */
+ value = p;
+ q = p;
+ while (*p) {
+ if (*p == '\\') {
+ ++p;
+ if (!*p)
+ break;
+ *q++ = *p++;
+ } else if (*p == '\'' || *p == '"' ||
+ *p == '\n') {
+ ++p;
+ } else {
+ *q++ = *p++;
+ }
+ }
+ *q = 0;
+
+ if (!strcmp(buf, "NAME")) {
+ p = strdup(value);
+ if (!p)
+ break;
+ os_name = p;
+ } else if (!strcmp(buf, "VERSION_ID")) {
+ p = strdup(value);
+ if (!p)
+ break;
+ os_major = p;
+ }
+ }
+ fclose(file);
+ return;
+ }
+
+ /* Fallback for older RH/SUSE releases */
+ file = fopen("/etc/SuSE-release", "r");
+ if (file != NULL)
+ goto kvp_osinfo_found;
+ file = fopen("/etc/redhat-release", "r");
+ if (file != NULL)
+ goto kvp_osinfo_found;
+
+ /*
+ * We don't have information about the os.
+ */
+ return;
+
+kvp_osinfo_found:
+ /* up to three lines */
+ p = fgets(buf, sizeof(buf), file);
+ if (p) {
+ p = strchr(buf, '\n');
+ if (p)
+ *p = '\0';
+ p = strdup(buf);
+ if (!p)
+ goto done;
+ os_name = p;
+
+ /* second line */
+ p = fgets(buf, sizeof(buf), file);
+ if (p) {
+ p = strchr(buf, '\n');
+ if (p)
+ *p = '\0';
+ p = strdup(buf);
+ if (!p)
+ goto done;
+ os_major = p;
+
+ /* third line */
+ p = fgets(buf, sizeof(buf), file);
+ if (p) {
+ p = strchr(buf, '\n');
+ if (p)
+ *p = '\0';
+ p = strdup(buf);
+ if (p)
+ os_minor = p;
+ }
+ }
+ }
+
+done:
+ fclose(file);
+ return;
+}
+
+
+
+/*
+ * Retrieve an interface name corresponding to the specified guid.
+ * If there is a match, the function returns a pointer
+ * to the interface name and if not, a NULL is returned.
+ * If a match is found, the caller is responsible for
+ * freeing the memory.
+ */
+
+static char *kvp_get_if_name(char *guid)
+{
+ DIR *dir;
+ struct dirent *entry;
+ FILE *file;
+ char *p, *q, *x;
+ char *if_name = NULL;
+ char buf[256];
+ char *kvp_net_dir = "/sys/class/net/";
+ char dev_id[256];
+
+ dir = opendir(kvp_net_dir);
+ if (dir == NULL)
+ return NULL;
+
+ snprintf(dev_id, sizeof(dev_id), "%s", kvp_net_dir);
+ q = dev_id + strlen(kvp_net_dir);
+
+ while ((entry = readdir(dir)) != NULL) {
+ /*
+ * Set the state for the next pass.
+ */
+ *q = '\0';
+ strcat(dev_id, entry->d_name);
+ strcat(dev_id, "/device/device_id");
+
+ file = fopen(dev_id, "r");
+ if (file == NULL)
+ continue;
+
+ p = fgets(buf, sizeof(buf), file);
+ if (p) {
+ x = strchr(p, '\n');
+ if (x)
+ *x = '\0';
+
+ if (!strcmp(p, guid)) {
+ /*
+ * Found the guid match; return the interface
+ * name. The caller will free the memory.
+ */
+ if_name = strdup(entry->d_name);
+ fclose(file);
+ break;
+ }
+ }
+ fclose(file);
+ }
+
+ closedir(dir);
+ return if_name;
+}
+
+/*
+ * Retrieve the MAC address given the interface name.
+ */
+
+static char *kvp_if_name_to_mac(char *if_name)
+{
+ FILE *file;
+ char *p, *x;
+ char buf[256];
+ char addr_file[256];
+ int i;
+ char *mac_addr = NULL;
+
+ snprintf(addr_file, sizeof(addr_file), "%s%s%s", "/sys/class/net/",
+ if_name, "/address");
+
+ file = fopen(addr_file, "r");
+ if (file == NULL)
+ return NULL;
+
+ p = fgets(buf, sizeof(buf), file);
+ if (p) {
+ x = strchr(p, '\n');
+ if (x)
+ *x = '\0';
+ for (i = 0; i < strlen(p); i++)
+ p[i] = toupper(p[i]);
+ mac_addr = strdup(p);
+ }
+
+ fclose(file);
+ return mac_addr;
+}
+
+
+/*
+ * Retrieve the interface name given the MAC address.
+ */
+
+static char *kvp_mac_to_if_name(char *mac)
+{
+ DIR *dir;
+ struct dirent *entry;
+ FILE *file;
+ char *p, *q, *x;
+ char *if_name = NULL;
+ char buf[256];
+ char *kvp_net_dir = "/sys/class/net/";
+ char dev_id[256];
+ int i;
+
+ dir = opendir(kvp_net_dir);
+ if (dir == NULL)
+ return NULL;
+
+	snprintf(dev_id, sizeof(dev_id), "%s", kvp_net_dir);
+ q = dev_id + strlen(kvp_net_dir);
+
+ while ((entry = readdir(dir)) != NULL) {
+ /*
+ * Set the state for the next pass.
+ */
+ *q = '\0';
+
+ strcat(dev_id, entry->d_name);
+ strcat(dev_id, "/address");
+
+ file = fopen(dev_id, "r");
+ if (file == NULL)
+ continue;
+
+ p = fgets(buf, sizeof(buf), file);
+ if (p) {
+ x = strchr(p, '\n');
+ if (x)
+ *x = '\0';
+
+ for (i = 0; i < strlen(p); i++)
+ p[i] = toupper(p[i]);
+
+ if (!strcmp(p, mac)) {
+ /*
+ * Found the MAC match; return the interface
+ * name. The caller will free the memory.
+ */
+ if_name = strdup(entry->d_name);
+ fclose(file);
+ break;
+ }
+ }
+ fclose(file);
+ }
+
+ closedir(dir);
+ return if_name;
+}
+
+
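+/*
+ * Run the given command and collect its output, one line per entry,
+ * into config_buf as a ';' separated list.  A zero 'offset' clears the
+ * buffer first while a non-zero 'offset' appends to it, which is how
+ * the ipv4 and ipv6 gateway queries share a single buffer.  We stop
+ * once fewer than (element_size + 1) bytes of space remain.
+ */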
+static void kvp_process_ipconfig_file(char *cmd,
+ char *config_buf, int len,
+ int element_size, int offset)
+{
+ char buf[256];
+ char *p;
+ char *x;
+ FILE *file;
+
+ /*
+ * First execute the command.
+ */
+ file = popen(cmd, "r");
+ if (file == NULL)
+ return;
+
+ if (offset == 0)
+ memset(config_buf, 0, len);
+ while ((p = fgets(buf, sizeof(buf), file)) != NULL) {
+ if ((len - strlen(config_buf)) < (element_size + 1))
+ break;
+
+ x = strchr(p, '\n');
+ if (x)
+ *x = '\0';
+
+ strcat(config_buf, p);
+ strcat(config_buf, ";");
+ }
+ pclose(file);
+}
+
+static void kvp_get_ipconfig_info(char *if_name,
+ struct hv_kvp_ipaddr_value *buffer)
+{
+ char cmd[512];
+ char dhcp_info[128];
+ char *p;
+ FILE *file;
+
+ /*
+ * Get the address of default gateway (ipv4).
+ */
+ sprintf(cmd, "%s %s", "ip route show dev", if_name);
+ strcat(cmd, " | awk '/default/ {print $3 }'");
+
+ /*
+ * Execute the command to gather gateway info.
+ */
+ kvp_process_ipconfig_file(cmd, (char *)buffer->gate_way,
+ (MAX_GATEWAY_SIZE * 2), INET_ADDRSTRLEN, 0);
+
+ /*
+ * Get the address of default gateway (ipv6).
+ */
+ sprintf(cmd, "%s %s", "ip -f inet6 route show dev", if_name);
+ strcat(cmd, " | awk '/default/ {print $3 }'");
+
+ /*
+ * Execute the command to gather gateway info (ipv6).
+ */
+ kvp_process_ipconfig_file(cmd, (char *)buffer->gate_way,
+ (MAX_GATEWAY_SIZE * 2), INET6_ADDRSTRLEN, 1);
+
+
+ /*
+ * Gather the DNS state.
+	 * Since there is no standard way to get this information
+	 * across distributions, we just invoke an external script
+	 * that needs to be ported to each distro of interest.
+ *
+ * Following is the expected format of the information from the script:
+ *
+ * ipaddr1 (nameserver1)
+ * ipaddr2 (nameserver2)
+ * .
+ * .
+ */
+
+ sprintf(cmd, "%s", "hv_get_dns_info");
+
+ /*
+ * Execute the command to gather DNS info.
+ */
+ kvp_process_ipconfig_file(cmd, (char *)buffer->dns_addr,
+ (MAX_IP_ADDR_SIZE * 2), INET_ADDRSTRLEN, 0);
+
+ /*
+ * Gather the DHCP state.
+ * We will gather this state by invoking an external script.
+ * The parameter to the script is the interface name.
+ * Here is the expected output:
+ *
+ * Enabled: DHCP enabled.
+ */
+
+ sprintf(cmd, "%s %s", "hv_get_dhcp_info", if_name);
+
+ file = popen(cmd, "r");
+ if (file == NULL)
+ return;
+
+ p = fgets(dhcp_info, sizeof(dhcp_info), file);
+ if (p == NULL) {
+ pclose(file);
+ return;
+ }
+
+ if (!strncmp(p, "Enabled", 7))
+ buffer->dhcp_enabled = 1;
+ else
+ buffer->dhcp_enabled = 0;
+
+ pclose(file);
+}
+
+
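+/*
+ * Population count (number of set bits) via the classic parallel
+ * bit-summing reduction; used below to turn an IPv6 netmask into a
+ * CIDR prefix length.
+ */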
+static unsigned int hweight32(unsigned int *w)
+{
+ unsigned int res = *w - ((*w >> 1) & 0x55555555);
+ res = (res & 0x33333333) + ((res >> 2) & 0x33333333);
+ res = (res + (res >> 4)) & 0x0F0F0F0F;
+ res = res + (res >> 8);
+ return (res + (res >> 16)) & 0x000000FF;
+}
+
+static int kvp_process_ip_address(void *addrp,
+ int family, char *buffer,
+ int length, int *offset)
+{
+ struct sockaddr_in *addr;
+ struct sockaddr_in6 *addr6;
+ int addr_length;
+ char tmp[50];
+ const char *str;
+
+ if (family == AF_INET) {
+ addr = (struct sockaddr_in *)addrp;
+ str = inet_ntop(family, &addr->sin_addr, tmp, 50);
+ addr_length = INET_ADDRSTRLEN;
+ } else {
+ addr6 = (struct sockaddr_in6 *)addrp;
+ str = inet_ntop(family, &addr6->sin6_addr.s6_addr, tmp, 50);
+ addr_length = INET6_ADDRSTRLEN;
+ }
+
+ if ((length - *offset) < addr_length + 2)
+ return HV_E_FAIL;
+ if (str == NULL) {
+ strcpy(buffer, "inet_ntop failed\n");
+ return HV_E_FAIL;
+ }
+ if (*offset == 0)
+ strcpy(buffer, tmp);
+ else {
+ strcat(buffer, ";");
+ strcat(buffer, tmp);
+ }
+
+ *offset += strlen(str) + 1;
+
+ return 0;
+}
+
+static int
+kvp_get_ip_info(int family, char *if_name, int op,
+ void *out_buffer, int length)
+{
+ struct ifaddrs *ifap;
+ struct ifaddrs *curp;
+ int offset = 0;
+ int sn_offset = 0;
+ int error = 0;
+ char *buffer;
+ struct hv_kvp_ipaddr_value *ip_buffer;
+ char cidr_mask[5]; /* /xyz */
+ int weight;
+ int i;
+ unsigned int *w;
+ char *sn_str;
+ struct sockaddr_in6 *addr6;
+
+ if (op == KVP_OP_ENUMERATE) {
+ buffer = out_buffer;
+ } else {
+ ip_buffer = out_buffer;
+ buffer = (char *)ip_buffer->ip_addr;
+ ip_buffer->addr_family = 0;
+ }
+ /*
+ * On entry into this function, the buffer is capable of holding the
+ * maximum key value.
+ */
+
+ if (getifaddrs(&ifap)) {
+ strcpy(buffer, "getifaddrs failed\n");
+ return HV_E_FAIL;
+ }
+
+ curp = ifap;
+ while (curp != NULL) {
+ if (curp->ifa_addr == NULL) {
+ curp = curp->ifa_next;
+ continue;
+ }
+
+ if ((if_name != NULL) &&
+ (strncmp(curp->ifa_name, if_name, strlen(if_name)))) {
+ /*
+ * We want info about a specific interface;
+ * just continue.
+ */
+ curp = curp->ifa_next;
+ continue;
+ }
+
+ /*
+ * We only support two address families: AF_INET and AF_INET6.
+ * If a family value of 0 is specified, we collect both
+ * supported address families; if not we gather info on
+ * the specified address family.
+ */
+ if ((((family != 0) &&
+ (curp->ifa_addr->sa_family != family))) ||
+ (curp->ifa_flags & IFF_LOOPBACK)) {
+ curp = curp->ifa_next;
+ continue;
+ }
+ if ((curp->ifa_addr->sa_family != AF_INET) &&
+ (curp->ifa_addr->sa_family != AF_INET6)) {
+ curp = curp->ifa_next;
+ continue;
+ }
+
+ if (op == KVP_OP_GET_IP_INFO) {
+ /*
+ * Gather info other than the IP address.
+ * IP address info will be gathered later.
+ */
+ if (curp->ifa_addr->sa_family == AF_INET) {
+ ip_buffer->addr_family |= ADDR_FAMILY_IPV4;
+ /*
+ * Get subnet info.
+ */
+ error = kvp_process_ip_address(
+ curp->ifa_netmask,
+ AF_INET,
+ (char *)
+ ip_buffer->sub_net,
+ length,
+ &sn_offset);
+ if (error)
+ goto gather_ipaddr;
+ } else {
+ ip_buffer->addr_family |= ADDR_FAMILY_IPV6;
+
+ /*
+ * Get subnet info in CIDR format.
+ */
+ weight = 0;
+ sn_str = (char *)ip_buffer->sub_net;
+ addr6 = (struct sockaddr_in6 *)
+ curp->ifa_netmask;
+ w = addr6->sin6_addr.s6_addr32;
+
+ for (i = 0; i < 4; i++)
+ weight += hweight32(&w[i]);
+
+ sprintf(cidr_mask, "/%d", weight);
+ if ((length - sn_offset) <
+ (strlen(cidr_mask) + 1))
+ goto gather_ipaddr;
+
+ if (sn_offset == 0)
+ strcpy(sn_str, cidr_mask);
+ else {
+ strcat((char *)ip_buffer->sub_net, ";");
+ strcat(sn_str, cidr_mask);
+ }
+ sn_offset += strlen(sn_str) + 1;
+ }
+
+ /*
+ * Collect other ip related configuration info.
+ */
+
+ kvp_get_ipconfig_info(if_name, ip_buffer);
+ }
+
+gather_ipaddr:
+ error = kvp_process_ip_address(curp->ifa_addr,
+ curp->ifa_addr->sa_family,
+ buffer,
+ length, &offset);
+ if (error)
+ goto getaddr_done;
+
+ curp = curp->ifa_next;
+ }
+
+getaddr_done:
+ freeifaddrs(ifap);
+ return error;
+}
+
+
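+/*
+ * Rewrite an IPv6 address into its fully expanded textual form so that
+ * the values written to the config file are canonical.  Returns 1 on
+ * success.  A string that does not parse as an address is rejected,
+ * except for NETMASK entries: IPv6 netmasks arrive as CIDR prefix
+ * strings (e.g. "/64") and are passed through unchanged.
+ */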
+static int expand_ipv6(char *addr, int type)
+{
+ int ret;
+ struct in6_addr v6_addr;
+
+ ret = inet_pton(AF_INET6, addr, &v6_addr);
+
+ if (ret != 1) {
+ if (type == NETMASK)
+ return 1;
+ return 0;
+ }
+
+ sprintf(addr, "%02x%02x:%02x%02x:%02x%02x:%02x%02x:%02x%02x:"
+ "%02x%02x:%02x%02x:%02x%02x",
+ (int)v6_addr.s6_addr[0], (int)v6_addr.s6_addr[1],
+ (int)v6_addr.s6_addr[2], (int)v6_addr.s6_addr[3],
+ (int)v6_addr.s6_addr[4], (int)v6_addr.s6_addr[5],
+ (int)v6_addr.s6_addr[6], (int)v6_addr.s6_addr[7],
+ (int)v6_addr.s6_addr[8], (int)v6_addr.s6_addr[9],
+ (int)v6_addr.s6_addr[10], (int)v6_addr.s6_addr[11],
+ (int)v6_addr.s6_addr[12], (int)v6_addr.s6_addr[13],
+ (int)v6_addr.s6_addr[14], (int)v6_addr.s6_addr[15]);
+
+ return 1;
+
+}
+
+static int is_ipv4(char *addr)
+{
+ int ret;
+ struct in_addr ipv4_addr;
+
+ ret = inet_pton(AF_INET, addr, &ipv4_addr);
+
+ if (ret == 1)
+ return 1;
+ return 0;
+}
+
+static int parse_ip_val_buffer(char *in_buf, int *offset,
+ char *out_buf, int out_len)
+{
+ char *x;
+ char *start;
+
+ /*
+	 * in_buf has a sequence of characters that are separated by
+ * the character ';'. The last sequence does not have the
+ * terminating ";" character.
+ */
+ start = in_buf + *offset;
+
+ x = strchr(start, ';');
+ if (x)
+ *x = 0;
+ else
+ x = start + strlen(start);
+
+ if (strlen(start) != 0) {
+ int i = 0;
+ /*
+ * Get rid of leading spaces.
+ */
+ while (start[i] == ' ')
+ i++;
+
+ if ((x - start) <= out_len) {
+ strcpy(out_buf, (start + i));
+ *offset += (x - start) + 1;
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static int kvp_write_file(FILE *f, char *s1, char *s2, char *s3)
+{
+ int ret;
+
+ ret = fprintf(f, "%s%s%s%s\n", s1, s2, "=", s3);
+
+ if (ret < 0)
+ return HV_E_FAIL;
+
+ return 0;
+}
+
+
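+/*
+ * Walk a ';' separated list of addresses and emit one key=value line
+ * per entry into the config file, using the IPADDR/NETMASK/GATEWAY/DNS
+ * keys for ipv4 entries and their IPV6 counterparts for ipv6 entries.
+ */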
+static int process_ip_string(FILE *f, char *ip_string, int type)
+{
+ int error = 0;
+ char addr[INET6_ADDRSTRLEN];
+ int i = 0;
+ int j = 0;
+ char str[256];
+ char sub_str[10];
+ int offset = 0;
+
+ memset(addr, 0, sizeof(addr));
+
+ while (parse_ip_val_buffer(ip_string, &offset, addr,
+ (MAX_IP_ADDR_SIZE * 2))) {
+
+ sub_str[0] = 0;
+ if (is_ipv4(addr)) {
+ switch (type) {
+ case IPADDR:
+ snprintf(str, sizeof(str), "%s", "IPADDR");
+ break;
+ case NETMASK:
+ snprintf(str, sizeof(str), "%s", "NETMASK");
+ break;
+ case GATEWAY:
+ snprintf(str, sizeof(str), "%s", "GATEWAY");
+ break;
+ case DNS:
+ snprintf(str, sizeof(str), "%s", "DNS");
+ break;
+ }
+
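+			/*
+			 * Suffix rules for the ipv4 keys: DNS entries are
+			 * numbered from 1, the first GATEWAY is written
+			 * without a suffix, and everything else is numbered
+			 * from 0 (IPADDR0, NETMASK0, ...).
+			 */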
+ if (type == DNS) {
+ snprintf(sub_str, sizeof(sub_str), "%d", ++i);
+ } else if (type == GATEWAY && i == 0) {
+ ++i;
+ } else {
+ snprintf(sub_str, sizeof(sub_str), "%d", i++);
+ }
+
+
+ } else if (expand_ipv6(addr, type)) {
+ switch (type) {
+ case IPADDR:
+ snprintf(str, sizeof(str), "%s", "IPV6ADDR");
+ break;
+ case NETMASK:
+ snprintf(str, sizeof(str), "%s", "IPV6NETMASK");
+ break;
+ case GATEWAY:
+ snprintf(str, sizeof(str), "%s",
+ "IPV6_DEFAULTGW");
+ break;
+ case DNS:
+ snprintf(str, sizeof(str), "%s", "DNS");
+ break;
+ }
+
+ if (type == DNS) {
+ snprintf(sub_str, sizeof(sub_str), "%d", ++i);
+ } else if (j == 0) {
+ ++j;
+ } else {
+ snprintf(sub_str, sizeof(sub_str), "_%d", j++);
+ }
+ } else {
+ return HV_INVALIDARG;
+ }
+
+ error = kvp_write_file(f, str, sub_str, addr);
+ if (error)
+ return error;
+ memset(addr, 0, sizeof(addr));
+ }
+
+ return 0;
+}
+
+static int kvp_set_ip_info(char *if_name, struct hv_kvp_ipaddr_value *new_val)
+{
+ int error = 0;
+ char if_file[128];
+ FILE *file;
+ char cmd[512];
+ char *mac_addr;
+
+ /*
+ * Set the configuration for the specified interface with
+ * the information provided. Since there is no standard
+ * way to configure an interface, we will have an external
+ * script that does the job of configuring the interface and
+ * flushing the configuration.
+ *
+ * The parameters passed to this external script are:
+ * 1. A configuration file that has the specified configuration.
+ *
+ * We will embed the name of the interface in the configuration
+ * file: ifcfg-ethx (where ethx is the interface name).
+ *
+	 * The information provided here may be more than what is needed
+	 * in a given distro to configure the interface, and distros are
+	 * free to ignore information that may not be relevant.
+ *
+ * Here is the format of the ip configuration file:
+ *
+ * HWADDR=macaddr
+ * DEVICE=interface name
+ * BOOTPROTO=<protocol> (where <protocol> is "dhcp" if DHCP is configured
+ * or "none" if no boot-time protocol should be used)
+ *
+ * IPADDR0=ipaddr1
+ * IPADDR1=ipaddr2
+ * IPADDRx=ipaddry (where y = x + 1)
+ *
+ * NETMASK0=netmask1
+ * NETMASKx=netmasky (where y = x + 1)
+ *
+ * GATEWAY=ipaddr1
+ * GATEWAYx=ipaddry (where y = x + 1)
+ *
+ * DNSx=ipaddrx (where first DNS address is tagged as DNS1 etc)
+ *
+ * IPV6 addresses will be tagged as IPV6ADDR, IPV6 gateway will be
+ * tagged as IPV6_DEFAULTGW and IPV6 NETMASK will be tagged as
+ * IPV6NETMASK.
+ *
+ * The host can specify multiple ipv4 and ipv6 addresses to be
+ * configured for the interface. Furthermore, the configuration
+ * needs to be persistent. A subsequent GET call on the interface
+ * is expected to return the configuration that is set via the SET
+ * call.
+ */
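+	/*
+	 * For illustration only (the addresses below are made up), a
+	 * static ipv4 configuration would come out roughly as:
+	 *
+	 *	HWADDR=00:15:5D:AA:BB:CC
+	 *	DEVICE=eth0
+	 *	BOOTPROTO=none
+	 *	IPADDR0=192.168.1.10
+	 *	NETMASK0=255.255.255.0
+	 *	GATEWAY=192.168.1.1
+	 *	DNS1=192.168.1.2
+	 */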
+
+ snprintf(if_file, sizeof(if_file), "%s%s%s", KVP_CONFIG_LOC,
+ "/ifcfg-", if_name);
+
+ file = fopen(if_file, "w");
+
+ if (file == NULL) {
+ syslog(LOG_ERR, "Failed to open config file; error: %d %s",
+ errno, strerror(errno));
+ return HV_E_FAIL;
+ }
+
+ /*
+ * First write out the MAC address.
+ */
+
+ mac_addr = kvp_if_name_to_mac(if_name);
+ if (mac_addr == NULL) {
+ error = HV_E_FAIL;
+ goto setval_error;
+ }
+
+ error = kvp_write_file(file, "HWADDR", "", mac_addr);
+ free(mac_addr);
+ if (error)
+ goto setval_error;
+
+ error = kvp_write_file(file, "DEVICE", "", if_name);
+ if (error)
+ goto setval_error;
+
+ if (new_val->dhcp_enabled) {
+ error = kvp_write_file(file, "BOOTPROTO", "", "dhcp");
+ if (error)
+ goto setval_error;
+
+ /*
+		 * We are done!
+ */
+ goto setval_done;
+
+ } else {
+ error = kvp_write_file(file, "BOOTPROTO", "", "none");
+ if (error)
+ goto setval_error;
+ }
+
+ /*
+ * Write the configuration for ipaddress, netmask, gateway and
+ * name servers.
+ */
+
+ error = process_ip_string(file, (char *)new_val->ip_addr, IPADDR);
+ if (error)
+ goto setval_error;
+
+ error = process_ip_string(file, (char *)new_val->sub_net, NETMASK);
+ if (error)
+ goto setval_error;
+
+ error = process_ip_string(file, (char *)new_val->gate_way, GATEWAY);
+ if (error)
+ goto setval_error;
+
+ error = process_ip_string(file, (char *)new_val->dns_addr, DNS);
+ if (error)
+ goto setval_error;
+
+setval_done:
+ fclose(file);
+
+ /*
+ * Now that we have populated the configuration file,
+ * invoke the external script to do its magic.
+ */
+
+ snprintf(cmd, sizeof(cmd), "%s %s", "hv_set_ifconfig", if_file);
+ if (system(cmd)) {
+ syslog(LOG_ERR, "Failed to execute cmd '%s'; error: %d %s",
+ cmd, errno, strerror(errno));
+ return HV_E_FAIL;
+ }
+ return 0;
+
+setval_error:
+ syslog(LOG_ERR, "Failed to write config file");
+ fclose(file);
+ return error;
+}
+
+
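+/*
+ * Ask the resolver for the canonical (fully qualified) form of the
+ * local host name.  On failure the buffer is filled with the error
+ * text instead, and that is what gets reported back to the host.
+ */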
+static void
+kvp_get_domain_name(char *buffer, int length)
+{
+	struct addrinfo hints, *info;
+ int error = 0;
+
+ gethostname(buffer, length);
+ memset(&hints, 0, sizeof(hints));
+ hints.ai_family = AF_INET; /*Get only ipv4 addrinfo. */
+ hints.ai_socktype = SOCK_STREAM;
+ hints.ai_flags = AI_CANONNAME;
+
+ error = getaddrinfo(buffer, NULL, &hints, &info);
+ if (error != 0) {
+ snprintf(buffer, length, "getaddrinfo failed: 0x%x %s",
+ error, gai_strerror(error));
+ return;
+ }
+ snprintf(buffer, length, "%s", info->ai_canonname);
+ freeaddrinfo(info);
+}
+
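+/*
+ * Wrap the connector message in a netlink header and hand both pieces
+ * to the kernel in a single two-element iovec.
+ */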
+static int
+netlink_send(int fd, struct cn_msg *msg)
+{
+ struct nlmsghdr nlh = { .nlmsg_type = NLMSG_DONE };
+ unsigned int size;
+ struct msghdr message;
+ struct iovec iov[2];
+
+ size = sizeof(struct cn_msg) + msg->len;
+
+ nlh.nlmsg_pid = getpid();
+ nlh.nlmsg_len = NLMSG_LENGTH(size);
+
+ iov[0].iov_base = &nlh;
+ iov[0].iov_len = sizeof(nlh);
+
+ iov[1].iov_base = msg;
+ iov[1].iov_len = size;
+
+ memset(&message, 0, sizeof(message));
+ message.msg_name = &addr;
+ message.msg_namelen = sizeof(addr);
+ message.msg_iov = iov;
+ message.msg_iovlen = 2;
+
+ return sendmsg(fd, &message, 0);
+}
+
+int main(void)
+{
+ int fd, len, nl_group;
+ int error;
+ struct cn_msg *message;
+ struct pollfd pfd;
+ struct nlmsghdr *incoming_msg;
+ struct cn_msg *incoming_cn_msg;
+ struct hv_kvp_msg *hv_msg;
+ char *p;
+ char *key_value;
+ char *key_name;
+ int op;
+ int pool;
+ char *if_name;
+ struct hv_kvp_ipaddr_value *kvp_ip_val;
+ char *kvp_recv_buffer;
+ size_t kvp_recv_buffer_len;
+
+ if (daemon(1, 0))
+ return 1;
+ openlog("KVP", 0, LOG_USER);
+ syslog(LOG_INFO, "KVP starting; pid is:%d", getpid());
+
+ kvp_recv_buffer_len = NLMSG_LENGTH(0) + sizeof(struct cn_msg) + sizeof(struct hv_kvp_msg);
+ kvp_recv_buffer = calloc(1, kvp_recv_buffer_len);
+ if (!kvp_recv_buffer) {
+ syslog(LOG_ERR, "Failed to allocate netlink buffer");
+ exit(EXIT_FAILURE);
+ }
+ /*
+ * Retrieve OS release information.
+ */
+ kvp_get_os_info();
+ /*
+ * Cache Fully Qualified Domain Name because getaddrinfo takes an
+ * unpredictable amount of time to finish.
+ */
+ kvp_get_domain_name(full_domain_name, sizeof(full_domain_name));
+
+ if (kvp_file_init()) {
+ syslog(LOG_ERR, "Failed to initialize the pools");
+ exit(EXIT_FAILURE);
+ }
+
+ fd = socket(AF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR);
+ if (fd < 0) {
+ syslog(LOG_ERR, "netlink socket creation failed; error: %d %s", errno,
+ strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ addr.nl_family = AF_NETLINK;
+ addr.nl_pad = 0;
+ addr.nl_pid = 0;
+ addr.nl_groups = 0;
+
+
+ error = bind(fd, (struct sockaddr *)&addr, sizeof(addr));
+ if (error < 0) {
+ syslog(LOG_ERR, "bind failed; error: %d %s", errno, strerror(errno));
+ close(fd);
+ exit(EXIT_FAILURE);
+ }
+ nl_group = CN_KVP_IDX;
+
+ if (setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &nl_group, sizeof(nl_group)) < 0) {
+ syslog(LOG_ERR, "setsockopt failed; error: %d %s", errno, strerror(errno));
+ close(fd);
+ exit(EXIT_FAILURE);
+ }
+
+ /*
+ * Register ourselves with the kernel.
+ */
+ message = (struct cn_msg *)kvp_recv_buffer;
+ message->id.idx = CN_KVP_IDX;
+ message->id.val = CN_KVP_VAL;
+
+ hv_msg = (struct hv_kvp_msg *)message->data;
+ hv_msg->kvp_hdr.operation = KVP_OP_REGISTER1;
+ message->ack = 0;
+ message->len = sizeof(struct hv_kvp_msg);
+
+ len = netlink_send(fd, message);
+ if (len < 0) {
+ syslog(LOG_ERR, "netlink_send failed; error: %d %s", errno, strerror(errno));
+ close(fd);
+ exit(EXIT_FAILURE);
+ }
+
+ pfd.fd = fd;
+
+ while (1) {
+ struct sockaddr *addr_p = (struct sockaddr *) &addr;
+ socklen_t addr_l = sizeof(addr);
+ pfd.events = POLLIN;
+ pfd.revents = 0;
+
+ if (poll(&pfd, 1, -1) < 0) {
+ syslog(LOG_ERR, "poll failed; error: %d %s", errno, strerror(errno));
+ if (errno == EINVAL) {
+ close(fd);
+ exit(EXIT_FAILURE);
+			} else {
+				continue;
+			}
+ }
+
+ len = recvfrom(fd, kvp_recv_buffer, kvp_recv_buffer_len, 0,
+ addr_p, &addr_l);
+
+ if (len < 0) {
+ syslog(LOG_ERR, "recvfrom failed; pid:%u error:%d %s",
+ addr.nl_pid, errno, strerror(errno));
+ close(fd);
+ return -1;
+ }
+
+ if (addr.nl_pid) {
+ syslog(LOG_WARNING, "Received packet from untrusted pid:%u",
+ addr.nl_pid);
+ continue;
+ }
+
+ incoming_msg = (struct nlmsghdr *)kvp_recv_buffer;
+
+ if (incoming_msg->nlmsg_type != NLMSG_DONE)
+ continue;
+
+ incoming_cn_msg = (struct cn_msg *)NLMSG_DATA(incoming_msg);
+ hv_msg = (struct hv_kvp_msg *)incoming_cn_msg->data;
+
+ /*
+ * We will use the KVP header information to pass back
+ * the error from this daemon. So, first copy the state
+ * and set the error code to success.
+ */
+ op = hv_msg->kvp_hdr.operation;
+ pool = hv_msg->kvp_hdr.pool;
+ hv_msg->error = HV_S_OK;
+
+ if ((in_hand_shake) && (op == KVP_OP_REGISTER1)) {
+ /*
+ * Driver is registering with us; stash away the version
+ * information.
+ */
+ in_hand_shake = 0;
+ p = (char *)hv_msg->body.kvp_register.version;
+ lic_version = malloc(strlen(p) + 1);
+ if (lic_version) {
+ strcpy(lic_version, p);
+ syslog(LOG_INFO, "KVP LIC Version: %s",
+ lic_version);
+ } else {
+ syslog(LOG_ERR, "malloc failed");
+ }
+ continue;
+ }
+
+ switch (op) {
+ case KVP_OP_GET_IP_INFO:
+ kvp_ip_val = &hv_msg->body.kvp_ip_val;
+ if_name =
+ kvp_mac_to_if_name((char *)kvp_ip_val->adapter_id);
+
+ if (if_name == NULL) {
+ /*
+ * We could not map the mac address to an
+ * interface name; return error.
+ */
+ hv_msg->error = HV_E_FAIL;
+ break;
+ }
+ error = kvp_get_ip_info(
+ 0, if_name, KVP_OP_GET_IP_INFO,
+ kvp_ip_val,
+ (MAX_IP_ADDR_SIZE * 2));
+
+ if (error)
+ hv_msg->error = error;
+
+ free(if_name);
+ break;
+
+ case KVP_OP_SET_IP_INFO:
+ kvp_ip_val = &hv_msg->body.kvp_ip_val;
+ if_name = kvp_get_if_name(
+ (char *)kvp_ip_val->adapter_id);
+ if (if_name == NULL) {
+ /*
+ * We could not map the guid to an
+ * interface name; return error.
+ */
+ hv_msg->error = HV_GUID_NOTFOUND;
+ break;
+ }
+ error = kvp_set_ip_info(if_name, kvp_ip_val);
+ if (error)
+ hv_msg->error = error;
+
+ free(if_name);
+ break;
+
+ case KVP_OP_SET:
+ if (kvp_key_add_or_modify(pool,
+ hv_msg->body.kvp_set.data.key,
+ hv_msg->body.kvp_set.data.key_size,
+ hv_msg->body.kvp_set.data.value,
+ hv_msg->body.kvp_set.data.value_size))
+ hv_msg->error = HV_S_CONT;
+ break;
+
+ case KVP_OP_GET:
+ if (kvp_get_value(pool,
+ hv_msg->body.kvp_set.data.key,
+ hv_msg->body.kvp_set.data.key_size,
+ hv_msg->body.kvp_set.data.value,
+ hv_msg->body.kvp_set.data.value_size))
+ hv_msg->error = HV_S_CONT;
+ break;
+
+ case KVP_OP_DELETE:
+ if (kvp_key_delete(pool,
+ hv_msg->body.kvp_delete.key,
+ hv_msg->body.kvp_delete.key_size))
+ hv_msg->error = HV_S_CONT;
+ break;
+
+ default:
+ break;
+ }
+
+ if (op != KVP_OP_ENUMERATE)
+ goto kvp_done;
+
+ /*
+ * If the pool is KVP_POOL_AUTO, dynamically generate
+ * both the key and the value; if not read from the
+ * appropriate pool.
+ */
+ if (pool != KVP_POOL_AUTO) {
+ if (kvp_pool_enumerate(pool,
+ hv_msg->body.kvp_enum_data.index,
+ hv_msg->body.kvp_enum_data.data.key,
+ HV_KVP_EXCHANGE_MAX_KEY_SIZE,
+ hv_msg->body.kvp_enum_data.data.value,
+ HV_KVP_EXCHANGE_MAX_VALUE_SIZE))
+ hv_msg->error = HV_S_CONT;
+ goto kvp_done;
+ }
+
+ hv_msg = (struct hv_kvp_msg *)incoming_cn_msg->data;
+ key_name = (char *)hv_msg->body.kvp_enum_data.data.key;
+ key_value = (char *)hv_msg->body.kvp_enum_data.data.value;
+
+ switch (hv_msg->body.kvp_enum_data.index) {
+ case FullyQualifiedDomainName:
+ strcpy(key_value, full_domain_name);
+ strcpy(key_name, "FullyQualifiedDomainName");
+ break;
+ case IntegrationServicesVersion:
+ strcpy(key_name, "IntegrationServicesVersion");
+ strcpy(key_value, lic_version);
+ break;
+ case NetworkAddressIPv4:
+ kvp_get_ip_info(AF_INET, NULL, KVP_OP_ENUMERATE,
+ key_value, HV_KVP_EXCHANGE_MAX_VALUE_SIZE);
+ strcpy(key_name, "NetworkAddressIPv4");
+ break;
+ case NetworkAddressIPv6:
+ kvp_get_ip_info(AF_INET6, NULL, KVP_OP_ENUMERATE,
+ key_value, HV_KVP_EXCHANGE_MAX_VALUE_SIZE);
+ strcpy(key_name, "NetworkAddressIPv6");
+ break;
+ case OSBuildNumber:
+ strcpy(key_value, os_build);
+ strcpy(key_name, "OSBuildNumber");
+ break;
+ case OSName:
+ strcpy(key_value, os_name);
+ strcpy(key_name, "OSName");
+ break;
+ case OSMajorVersion:
+ strcpy(key_value, os_major);
+ strcpy(key_name, "OSMajorVersion");
+ break;
+ case OSMinorVersion:
+ strcpy(key_value, os_minor);
+ strcpy(key_name, "OSMinorVersion");
+ break;
+ case OSVersion:
+ strcpy(key_value, os_version);
+ strcpy(key_name, "OSVersion");
+ break;
+ case ProcessorArchitecture:
+ strcpy(key_value, processor_arch);
+ strcpy(key_name, "ProcessorArchitecture");
+ break;
+ default:
+ hv_msg->error = HV_S_CONT;
+ break;
+ }
+ /*
+ * Send the value back to the kernel. The response is
+ * already in the receive buffer. Update the cn_msg header to
+ * reflect the key value that has been added to the message
+ */
+kvp_done:
+
+ incoming_cn_msg->id.idx = CN_KVP_IDX;
+ incoming_cn_msg->id.val = CN_KVP_VAL;
+ incoming_cn_msg->ack = 0;
+ incoming_cn_msg->len = sizeof(struct hv_kvp_msg);
+
+ len = netlink_send(fd, incoming_cn_msg);
+ if (len < 0) {
+ syslog(LOG_ERR, "net_link send failed; error: %d %s", errno,
+ strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ }
+
+}
diff --git a/tools/hv/hv_set_ifconfig.sh b/tools/hv/hv_set_ifconfig.sh
new file mode 100755
index 00000000000..735aafd64a3
--- /dev/null
+++ b/tools/hv/hv_set_ifconfig.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+# This example script activates an interface based on the specified
+# configuration.
+#
+# In the interest of keeping the KVP daemon code free of distro specific
+# information; the kvp daemon code invokes this external script to configure
+# the interface.
+#
+# The only argument to this script is the configuration file that is to
+# be used to configure the interface.
+#
+# Each Distro is expected to implement this script in a distro specific
+# fashion. For instance on Distros that ship with Network Manager enabled,
+# this script can be based on the Network Manager APIs for configuring the
+# interface.
+#
+# This example script is based on a RHEL environment.
+#
+# Here is the format of the ip configuration file:
+#
+# HWADDR=macaddr
+# DEVICE=interface name
+# BOOTPROTO=<protocol> (where <protocol> is "dhcp" if DHCP is configured
+# or "none" if no boot-time protocol should be used)
+#
+# IPADDR0=ipaddr1
+# IPADDR1=ipaddr2
+# IPADDRx=ipaddry (where y = x + 1)
+#
+# NETMASK0=netmask1
+# NETMASKx=netmasky (where y = x + 1)
+#
+# GATEWAY=ipaddr1
+# GATEWAYx=ipaddry (where y = x + 1)
+#
+# DNSx=ipaddrx (where first DNS address is tagged as DNS1 etc)
+#
+# IPV6 addresses will be tagged as IPV6ADDR, IPV6 gateway will be
+# tagged as IPV6_DEFAULTGW and IPV6 NETMASK will be tagged as
+# IPV6NETMASK.
+#
+# The host can specify multiple ipv4 and ipv6 addresses to be
+# configured for the interface. Furthermore, the configuration
+# needs to be persistent. A subsequent GET call on the interface
+# is expected to return the configuration that is set via the SET
+# call.
+#
+
+
+
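+# Append the RHEL-specific settings the daemon does not know about,
+# install the completed file under /etc/sysconfig/network-scripts/ and
+# bounce the interface so the new configuration takes effect.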
+echo "IPV6INIT=yes" >> $1
+echo "NM_CONTROLLED=no" >> $1
+echo "PEERDNS=yes" >> $1
+echo "ONBOOT=yes" >> $1
+
+
+cp $1 /etc/sysconfig/network-scripts/
+
+
+interface=$(echo $1 | awk -F - '{ print $2 }')
+
+/sbin/ifdown $interface 2>/dev/null
+/sbin/ifup $interface 2>/dev/null
diff --git a/tools/hv/hv_vss_daemon.c b/tools/hv/hv_vss_daemon.c
new file mode 100644
index 00000000000..6a213b8cd7b
--- /dev/null
+++ b/tools/hv/hv_vss_daemon.c
@@ -0,0 +1,267 @@
+/*
+ * An implementation of the host initiated guest snapshot for Hyper-V.
+ *
+ *
+ * Copyright (C) 2013, Microsoft, Inc.
+ * Author : K. Y. Srinivasan <kys@microsoft.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
+ * NON INFRINGEMENT. See the GNU General Public License for more
+ * details.
+ *
+ */
+
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/poll.h>
+#include <sys/ioctl.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <mntent.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <ctype.h>
+#include <errno.h>
+#include <arpa/inet.h>
+#include <linux/fs.h>
+#include <linux/connector.h>
+#include <linux/hyperv.h>
+#include <linux/netlink.h>
+#include <syslog.h>
+
+static struct sockaddr_nl addr;
+
+#ifndef SOL_NETLINK
+#define SOL_NETLINK 270
+#endif
+
+
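+/*
+ * Issue the freeze/thaw ioctl (FIFREEZE or FITHAW) on the given mount
+ * point.  Returns 0 on success and 1 if the mount point could not be
+ * opened or the ioctl failed.
+ */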
+static int vss_do_freeze(char *dir, unsigned int cmd, char *fs_op)
+{
+ int ret, fd = open(dir, O_RDONLY);
+
+ if (fd < 0)
+ return 1;
+ ret = ioctl(fd, cmd, 0);
+ syslog(LOG_INFO, "VSS: %s of %s: %s\n", fs_op, dir, strerror(errno));
+ close(fd);
+ return !!ret;
+}
+
+static int vss_operate(int operation)
+{
+ char *fs_op;
+ char match[] = "/dev/";
+ FILE *mounts;
+ struct mntent *ent;
+ unsigned int cmd;
+ int error = 0, root_seen = 0;
+
+ switch (operation) {
+ case VSS_OP_FREEZE:
+ cmd = FIFREEZE;
+ fs_op = "freeze";
+ break;
+ case VSS_OP_THAW:
+ cmd = FITHAW;
+ fs_op = "thaw";
+ break;
+ default:
+ return -1;
+ }
+
+ mounts = setmntent("/proc/mounts", "r");
+ if (mounts == NULL)
+ return -1;
+
+ while ((ent = getmntent(mounts))) {
+ if (strncmp(ent->mnt_fsname, match, strlen(match)))
+ continue;
+ if (strcmp(ent->mnt_type, "iso9660") == 0)
+ continue;
+ if (strcmp(ent->mnt_type, "vfat") == 0)
+ continue;
+ if (strcmp(ent->mnt_dir, "/") == 0) {
+ root_seen = 1;
+ continue;
+ }
+ error |= vss_do_freeze(ent->mnt_dir, cmd, fs_op);
+ }
+ endmntent(mounts);
+
+ if (root_seen) {
+ error |= vss_do_freeze("/", cmd, fs_op);
+ }
+
+ return error;
+}
+
+static int netlink_send(int fd, struct cn_msg *msg)
+{
+ struct nlmsghdr nlh = { .nlmsg_type = NLMSG_DONE };
+ unsigned int size;
+ struct msghdr message;
+ struct iovec iov[2];
+
+ size = sizeof(struct cn_msg) + msg->len;
+
+ nlh.nlmsg_pid = getpid();
+ nlh.nlmsg_len = NLMSG_LENGTH(size);
+
+ iov[0].iov_base = &nlh;
+ iov[0].iov_len = sizeof(nlh);
+
+ iov[1].iov_base = msg;
+ iov[1].iov_len = size;
+
+ memset(&message, 0, sizeof(message));
+ message.msg_name = &addr;
+ message.msg_namelen = sizeof(addr);
+ message.msg_iov = iov;
+ message.msg_iovlen = 2;
+
+ return sendmsg(fd, &message, 0);
+}
+
+int main(void)
+{
+ int fd, len, nl_group;
+ int error;
+ struct cn_msg *message;
+ struct pollfd pfd;
+ struct nlmsghdr *incoming_msg;
+ struct cn_msg *incoming_cn_msg;
+ int op;
+ struct hv_vss_msg *vss_msg;
+ char *vss_recv_buffer;
+ size_t vss_recv_buffer_len;
+
+ if (daemon(1, 0))
+ return 1;
+
+ openlog("Hyper-V VSS", 0, LOG_USER);
+ syslog(LOG_INFO, "VSS starting; pid is:%d", getpid());
+
+ vss_recv_buffer_len = NLMSG_LENGTH(0) + sizeof(struct cn_msg) + sizeof(struct hv_vss_msg);
+ vss_recv_buffer = calloc(1, vss_recv_buffer_len);
+ if (!vss_recv_buffer) {
+ syslog(LOG_ERR, "Failed to allocate netlink buffers");
+ exit(EXIT_FAILURE);
+ }
+
+ fd = socket(AF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR);
+ if (fd < 0) {
+ syslog(LOG_ERR, "netlink socket creation failed; error:%d %s",
+ errno, strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ addr.nl_family = AF_NETLINK;
+ addr.nl_pad = 0;
+ addr.nl_pid = 0;
+ addr.nl_groups = 0;
+
+
+ error = bind(fd, (struct sockaddr *)&addr, sizeof(addr));
+ if (error < 0) {
+ syslog(LOG_ERR, "bind failed; error:%d %s", errno, strerror(errno));
+ close(fd);
+ exit(EXIT_FAILURE);
+ }
+ nl_group = CN_VSS_IDX;
+ if (setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &nl_group, sizeof(nl_group)) < 0) {
+ syslog(LOG_ERR, "setsockopt failed; error:%d %s", errno, strerror(errno));
+ close(fd);
+ exit(EXIT_FAILURE);
+ }
+ /*
+ * Register ourselves with the kernel.
+ */
+ message = (struct cn_msg *)vss_recv_buffer;
+ message->id.idx = CN_VSS_IDX;
+ message->id.val = CN_VSS_VAL;
+ message->ack = 0;
+ vss_msg = (struct hv_vss_msg *)message->data;
+ vss_msg->vss_hdr.operation = VSS_OP_REGISTER;
+
+ message->len = sizeof(struct hv_vss_msg);
+
+ len = netlink_send(fd, message);
+ if (len < 0) {
+ syslog(LOG_ERR, "netlink_send failed; error:%d %s", errno, strerror(errno));
+ close(fd);
+ exit(EXIT_FAILURE);
+ }
+
+ pfd.fd = fd;
+
+ while (1) {
+ struct sockaddr *addr_p = (struct sockaddr *) &addr;
+ socklen_t addr_l = sizeof(addr);
+ pfd.events = POLLIN;
+ pfd.revents = 0;
+
+ if (poll(&pfd, 1, -1) < 0) {
+ syslog(LOG_ERR, "poll failed; error:%d %s", errno, strerror(errno));
+ if (errno == EINVAL) {
+ close(fd);
+ exit(EXIT_FAILURE);
+			} else {
+				continue;
+			}
+ }
+
+ len = recvfrom(fd, vss_recv_buffer, vss_recv_buffer_len, 0,
+ addr_p, &addr_l);
+
+ if (len < 0) {
+ syslog(LOG_ERR, "recvfrom failed; pid:%u error:%d %s",
+ addr.nl_pid, errno, strerror(errno));
+ close(fd);
+ return -1;
+ }
+
+ if (addr.nl_pid) {
+ syslog(LOG_WARNING,
+ "Received packet from untrusted pid:%u",
+ addr.nl_pid);
+ continue;
+ }
+
+ incoming_msg = (struct nlmsghdr *)vss_recv_buffer;
+
+ if (incoming_msg->nlmsg_type != NLMSG_DONE)
+ continue;
+
+ incoming_cn_msg = (struct cn_msg *)NLMSG_DATA(incoming_msg);
+ vss_msg = (struct hv_vss_msg *)incoming_cn_msg->data;
+ op = vss_msg->vss_hdr.operation;
+ error = HV_S_OK;
+
+ switch (op) {
+ case VSS_OP_FREEZE:
+ case VSS_OP_THAW:
+ error = vss_operate(op);
+ if (error)
+ error = HV_E_FAIL;
+ break;
+ default:
+ syslog(LOG_ERR, "Illegal op:%d\n", op);
+ }
+ vss_msg->error = error;
+ len = netlink_send(fd, incoming_cn_msg);
+ if (len < 0) {
+ syslog(LOG_ERR, "net_link send failed; error:%d %s",
+ errno, strerror(errno));
+ exit(EXIT_FAILURE);
+ }
+ }
+
+}
diff --git a/tools/perf/util/include/asm/bug.h b/tools/include/asm/bug.h
index 7fcc6810adc..9e5f4846967 100644
--- a/tools/perf/util/include/asm/bug.h
+++ b/tools/include/asm/bug.h
@@ -1,5 +1,7 @@
-#ifndef _PERF_ASM_GENERIC_BUG_H
-#define _PERF_ASM_GENERIC_BUG_H
+#ifndef _TOOLS_ASM_BUG_H
+#define _TOOLS_ASM_BUG_H
+
+#include <linux/compiler.h>
#define __WARN_printf(arg...) do { fprintf(stderr, arg); } while (0)
@@ -19,4 +21,5 @@
__warned = 1; \
unlikely(__ret_warn_once); \
})
-#endif
+
+#endif /* _TOOLS_ASM_BUG_H */
diff --git a/tools/include/linux/compiler.h b/tools/include/linux/compiler.h
new file mode 100644
index 00000000000..88461f09cc8
--- /dev/null
+++ b/tools/include/linux/compiler.h
@@ -0,0 +1,40 @@
+#ifndef _TOOLS_LINUX_COMPILER_H_
+#define _TOOLS_LINUX_COMPILER_H_
+
+#ifndef __always_inline
+# define __always_inline inline __attribute__((always_inline))
+#endif
+
+#define __user
+
+#ifndef __attribute_const__
+# define __attribute_const__
+#endif
+
+#ifndef __maybe_unused
+# define __maybe_unused __attribute__((unused))
+#endif
+
+#ifndef __packed
+# define __packed __attribute__((__packed__))
+#endif
+
+#ifndef __force
+# define __force
+#endif
+
+#ifndef __weak
+# define __weak __attribute__((weak))
+#endif
+
+#ifndef likely
+# define likely(x) __builtin_expect(!!(x), 1)
+#endif
+
+#ifndef unlikely
+# define unlikely(x) __builtin_expect(!!(x), 0)
+#endif
+
+#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
+
+#endif /* _TOOLS_LINUX_COMPILER_H */
diff --git a/tools/include/linux/export.h b/tools/include/linux/export.h
new file mode 100644
index 00000000000..d07e586b9ba
--- /dev/null
+++ b/tools/include/linux/export.h
@@ -0,0 +1,10 @@
+#ifndef _TOOLS_LINUX_EXPORT_H_
+#define _TOOLS_LINUX_EXPORT_H_
+
+#define EXPORT_SYMBOL(sym)
+#define EXPORT_SYMBOL_GPL(sym)
+#define EXPORT_SYMBOL_GPL_FUTURE(sym)
+#define EXPORT_UNUSED_SYMBOL(sym)
+#define EXPORT_UNUSED_SYMBOL_GPL(sym)
+
+#endif
diff --git a/tools/include/linux/hash.h b/tools/include/linux/hash.h
new file mode 100644
index 00000000000..d026c657301
--- /dev/null
+++ b/tools/include/linux/hash.h
@@ -0,0 +1,5 @@
+#include "../../../include/linux/hash.h"
+
+#ifndef _TOOLS_LINUX_HASH_H
+#define _TOOLS_LINUX_HASH_H
+#endif
diff --git a/tools/include/linux/types.h b/tools/include/linux/types.h
new file mode 100644
index 00000000000..b5cf25e05df
--- /dev/null
+++ b/tools/include/linux/types.h
@@ -0,0 +1,75 @@
+#ifndef _TOOLS_LINUX_TYPES_H_
+#define _TOOLS_LINUX_TYPES_H_
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+#define __SANE_USERSPACE_TYPES__ /* For PPC64, to get LL64 types */
+#include <asm/types.h>
+
+struct page;
+struct kmem_cache;
+
+typedef enum {
+ GFP_KERNEL,
+ GFP_ATOMIC,
+ __GFP_HIGHMEM,
+ __GFP_HIGH
+} gfp_t;
+
+/*
+ * We define u64 as uint64_t for every architecture
+ * so that we can print it with "%"PRIx64 without getting warnings.
+ *
+ * typedef __u64 u64;
+ * typedef __s64 s64;
+ */
+typedef uint64_t u64;
+typedef int64_t s64;
+
+typedef __u32 u32;
+typedef __s32 s32;
+
+typedef __u16 u16;
+typedef __s16 s16;
+
+typedef __u8 u8;
+typedef __s8 s8;
+
+#ifdef __CHECKER__
+#define __bitwise__ __attribute__((bitwise))
+#else
+#define __bitwise__
+#endif
+#ifdef __CHECK_ENDIAN__
+#define __bitwise __bitwise__
+#else
+#define __bitwise
+#endif
+
+#define __force
+#define __user
+#define __must_check
+#define __cold
+
+typedef __u16 __bitwise __le16;
+typedef __u16 __bitwise __be16;
+typedef __u32 __bitwise __le32;
+typedef __u32 __bitwise __be32;
+typedef __u64 __bitwise __le64;
+typedef __u64 __bitwise __be64;
+
+struct list_head {
+ struct list_head *next, *prev;
+};
+
+struct hlist_head {
+ struct hlist_node *first;
+};
+
+struct hlist_node {
+ struct hlist_node *next, **pprev;
+};
+
+#endif /* _TOOLS_LINUX_TYPES_H_ */
diff --git a/tools/include/tools/be_byteshift.h b/tools/include/tools/be_byteshift.h
new file mode 100644
index 00000000000..84c17d83657
--- /dev/null
+++ b/tools/include/tools/be_byteshift.h
@@ -0,0 +1,70 @@
+#ifndef _TOOLS_BE_BYTESHIFT_H
+#define _TOOLS_BE_BYTESHIFT_H
+
+#include <stdint.h>
+
+static inline uint16_t __get_unaligned_be16(const uint8_t *p)
+{
+ return p[0] << 8 | p[1];
+}
+
+static inline uint32_t __get_unaligned_be32(const uint8_t *p)
+{
+ return p[0] << 24 | p[1] << 16 | p[2] << 8 | p[3];
+}
+
+static inline uint64_t __get_unaligned_be64(const uint8_t *p)
+{
+ return (uint64_t)__get_unaligned_be32(p) << 32 |
+ __get_unaligned_be32(p + 4);
+}
+
+static inline void __put_unaligned_be16(uint16_t val, uint8_t *p)
+{
+ *p++ = val >> 8;
+ *p++ = val;
+}
+
+static inline void __put_unaligned_be32(uint32_t val, uint8_t *p)
+{
+ __put_unaligned_be16(val >> 16, p);
+ __put_unaligned_be16(val, p + 2);
+}
+
+static inline void __put_unaligned_be64(uint64_t val, uint8_t *p)
+{
+ __put_unaligned_be32(val >> 32, p);
+ __put_unaligned_be32(val, p + 4);
+}
+
+static inline uint16_t get_unaligned_be16(const void *p)
+{
+ return __get_unaligned_be16((const uint8_t *)p);
+}
+
+static inline uint32_t get_unaligned_be32(const void *p)
+{
+ return __get_unaligned_be32((const uint8_t *)p);
+}
+
+static inline uint64_t get_unaligned_be64(const void *p)
+{
+ return __get_unaligned_be64((const uint8_t *)p);
+}
+
+static inline void put_unaligned_be16(uint16_t val, void *p)
+{
+ __put_unaligned_be16(val, p);
+}
+
+static inline void put_unaligned_be32(uint32_t val, void *p)
+{
+ __put_unaligned_be32(val, p);
+}
+
+static inline void put_unaligned_be64(uint64_t val, void *p)
+{
+ __put_unaligned_be64(val, p);
+}
+
+#endif /* _TOOLS_BE_BYTESHIFT_H */
diff --git a/tools/include/tools/le_byteshift.h b/tools/include/tools/le_byteshift.h
new file mode 100644
index 00000000000..8fe9f2488ec
--- /dev/null
+++ b/tools/include/tools/le_byteshift.h
@@ -0,0 +1,70 @@
+#ifndef _TOOLS_LE_BYTESHIFT_H
+#define _TOOLS_LE_BYTESHIFT_H
+
+#include <stdint.h>
+
+static inline uint16_t __get_unaligned_le16(const uint8_t *p)
+{
+ return p[0] | p[1] << 8;
+}
+
+static inline uint32_t __get_unaligned_le32(const uint8_t *p)
+{
+ return p[0] | p[1] << 8 | p[2] << 16 | p[3] << 24;
+}
+
+static inline uint64_t __get_unaligned_le64(const uint8_t *p)
+{
+ return (uint64_t)__get_unaligned_le32(p + 4) << 32 |
+ __get_unaligned_le32(p);
+}
+
+static inline void __put_unaligned_le16(uint16_t val, uint8_t *p)
+{
+ *p++ = val;
+ *p++ = val >> 8;
+}
+
+static inline void __put_unaligned_le32(uint32_t val, uint8_t *p)
+{
+ __put_unaligned_le16(val >> 16, p + 2);
+ __put_unaligned_le16(val, p);
+}
+
+static inline void __put_unaligned_le64(uint64_t val, uint8_t *p)
+{
+ __put_unaligned_le32(val >> 32, p + 4);
+ __put_unaligned_le32(val, p);
+}
+
+static inline uint16_t get_unaligned_le16(const void *p)
+{
+ return __get_unaligned_le16((const uint8_t *)p);
+}
+
+static inline uint32_t get_unaligned_le32(const void *p)
+{
+ return __get_unaligned_le32((const uint8_t *)p);
+}
+
+static inline uint64_t get_unaligned_le64(const void *p)
+{
+ return __get_unaligned_le64((const uint8_t *)p);
+}
+
+static inline void put_unaligned_le16(uint16_t val, void *p)
+{
+ __put_unaligned_le16(val, p);
+}
+
+static inline void put_unaligned_le32(uint32_t val, void *p)
+{
+ __put_unaligned_le32(val, p);
+}
+
+static inline void put_unaligned_le64(uint64_t val, void *p)
+{
+ __put_unaligned_le64(val, p);
+}
+
+#endif /* _TOOLS_LE_BYTESHIFT_H */
diff --git a/tools/lguest/.gitignore b/tools/lguest/.gitignore
new file mode 100644
index 00000000000..115587fd5f6
--- /dev/null
+++ b/tools/lguest/.gitignore
@@ -0,0 +1 @@
+lguest
diff --git a/tools/lguest/Makefile b/tools/lguest/Makefile
new file mode 100644
index 00000000000..97bca4871ea
--- /dev/null
+++ b/tools/lguest/Makefile
@@ -0,0 +1,7 @@
+# This creates the demonstration utility "lguest" which runs a Linux guest.
+CFLAGS:=-m32 -Wall -Wmissing-declarations -Wmissing-prototypes -O3 -U_FORTIFY_SOURCE
+
+all: lguest
+
+clean:
+ rm -f lguest
diff --git a/tools/lguest/extract b/tools/lguest/extract
new file mode 100644
index 00000000000..7730bb6e4b9
--- /dev/null
+++ b/tools/lguest/extract
@@ -0,0 +1,58 @@
+#! /bin/sh
+
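+# Pull the numbered documentation sections ("$PREFIX:NNN" markers,
+# closed by a ":*" marker) out of the given source files and print them
+# in section-number order, noting which file each section came from.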
+set -e
+
+PREFIX=$1
+shift
+
+trap 'rm -r $TMPDIR' 0
+TMPDIR=`mktemp -d`
+
+exec 3>/dev/null
+for f; do
+ while IFS="
+" read -r LINE; do
+ case "$LINE" in
+ *$PREFIX:[0-9]*:\**)
+ NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"`
+ if [ -f $TMPDIR/$NUM ]; then
+			echo "$TMPDIR/$NUM already exists prior to $f"
+ exit 1
+ fi
+ exec 3>>$TMPDIR/$NUM
+ echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM
+ /bin/echo "$LINE" | sed -e "s/$PREFIX:[0-9]*//" -e "s/:\*/*/" >&3
+ ;;
+ *$PREFIX:[0-9]*)
+ NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"`
+ if [ -f $TMPDIR/$NUM ]; then
+			echo "$TMPDIR/$NUM already exists prior to $f"
+ exit 1
+ fi
+ exec 3>>$TMPDIR/$NUM
+ echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM
+ /bin/echo "$LINE" | sed "s/$PREFIX:[0-9]*//" >&3
+ ;;
+ *:\**)
+ /bin/echo "$LINE" | sed -e "s/:\*/*/" -e "s,/\*\*/,," >&3
+ echo >&3
+ exec 3>/dev/null
+ ;;
+ *)
+ /bin/echo "$LINE" >&3
+ ;;
+ esac
+ done < $f
+ echo >&3
+ exec 3>/dev/null
+done
+
+LASTFILE=""
+for f in $TMPDIR/*; do
+ if [ "$LASTFILE" != $(cat $TMPDIR/.$(basename $f) ) ]; then
+ LASTFILE=$(cat $TMPDIR/.$(basename $f) )
+ echo "[ $LASTFILE ]"
+ fi
+ cat $f
+done
+
diff --git a/tools/lguest/lguest.c b/tools/lguest/lguest.c
new file mode 100644
index 00000000000..32cf2ce15d6
--- /dev/null
+++ b/tools/lguest/lguest.c
@@ -0,0 +1,2072 @@
+/*P:100
+ * This is the Launcher code, a simple program which lays out the "physical"
+ * memory for the new Guest by mapping the kernel image and the virtual
+ * devices, then opens /dev/lguest to tell the kernel about the Guest and
+ * control it.
+:*/
+#define _LARGEFILE64_SOURCE
+#define _GNU_SOURCE
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <err.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <elf.h>
+#include <sys/mman.h>
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <sys/eventfd.h>
+#include <fcntl.h>
+#include <stdbool.h>
+#include <errno.h>
+#include <ctype.h>
+#include <sys/socket.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <time.h>
+#include <netinet/in.h>
+#include <net/if.h>
+#include <linux/sockios.h>
+#include <linux/if_tun.h>
+#include <sys/uio.h>
+#include <termios.h>
+#include <getopt.h>
+#include <assert.h>
+#include <sched.h>
+#include <limits.h>
+#include <stddef.h>
+#include <signal.h>
+#include <pwd.h>
+#include <grp.h>
+
+#ifndef VIRTIO_F_ANY_LAYOUT
+#define VIRTIO_F_ANY_LAYOUT 27
+#endif
+
+/*L:110
+ * We can ignore the 43 include files we need for this program, but I do want
+ * to draw attention to the use of kernel-style types.
+ *
+ * As Linus said, "C is a Spartan language, and so should your naming be." I
+ * like these abbreviations, so we define them here. Note that u64 is always
+ * unsigned long long, which works on all Linux systems: this means that we can
+ * use %llu in printf for any u64.
+ */
+typedef unsigned long long u64;
+typedef uint32_t u32;
+typedef uint16_t u16;
+typedef uint8_t u8;
+/*:*/
+
+#include <linux/virtio_config.h>
+#include <linux/virtio_net.h>
+#include <linux/virtio_blk.h>
+#include <linux/virtio_console.h>
+#include <linux/virtio_rng.h>
+#include <linux/virtio_ring.h>
+#include <asm/bootparam.h>
+#include "../../include/linux/lguest_launcher.h"
+
+#define BRIDGE_PFX "bridge:"
+#ifndef SIOCBRADDIF
+#define SIOCBRADDIF 0x89a2 /* add interface to bridge */
+#endif
+/* We can have up to 256 pages for devices. */
+#define DEVICE_PAGES 256
+/* This will occupy 3 pages: it must be a power of 2. */
+#define VIRTQUEUE_NUM 256
+
+/*L:120
+ * verbose is both a global flag and a macro. The C preprocessor allows
+ * this, and although I wouldn't recommend it, it works quite nicely here.
+ */
+static bool verbose;
+#define verbose(args...) \
+ do { if (verbose) printf(args); } while(0)
+/*:*/
+
+/* The pointer to the start of guest memory. */
+static void *guest_base;
+/* The maximum guest physical address allowed, and maximum possible. */
+static unsigned long guest_limit, guest_max;
+/* The /dev/lguest file descriptor. */
+static int lguest_fd;
+
+/* a per-cpu variable indicating whose vcpu is currently running */
+static unsigned int __thread cpu_id;
+
+/* This is our list of devices. */
+struct device_list {
+ /* Counter to assign interrupt numbers. */
+ unsigned int next_irq;
+
+ /* Counter to print out convenient device numbers. */
+ unsigned int device_num;
+
+ /* The descriptor page for the devices. */
+ u8 *descpage;
+
+ /* A single linked list of devices. */
+ struct device *dev;
+ /* And a pointer to the last device for easy append. */
+ struct device *lastdev;
+};
+
+/* The list of Guest devices, based on command line arguments. */
+static struct device_list devices;
+
+/* The device structure describes a single device. */
+struct device {
+ /* The linked-list pointer. */
+ struct device *next;
+
+ /* The device's descriptor, as mapped into the Guest. */
+ struct lguest_device_desc *desc;
+
+ /* We can't trust desc values once Guest has booted: we use these. */
+ unsigned int feature_len;
+ unsigned int num_vq;
+
+ /* The name of this device, for --verbose. */
+ const char *name;
+
+ /* Any queues attached to this device */
+ struct virtqueue *vq;
+
+ /* Is it operational */
+ bool running;
+
+ /* Device-specific data. */
+ void *priv;
+};
+
+/* The virtqueue structure describes a queue attached to a device. */
+struct virtqueue {
+ struct virtqueue *next;
+
+ /* Which device owns me. */
+ struct device *dev;
+
+ /* The configuration for this queue. */
+ struct lguest_vqconfig config;
+
+ /* The actual ring of buffers. */
+ struct vring vring;
+
+ /* Last available index we saw. */
+ u16 last_avail_idx;
+
+ /* How many are used since we sent last irq? */
+ unsigned int pending_used;
+
+ /* Eventfd where Guest notifications arrive. */
+ int eventfd;
+
+ /* Function for the thread which is servicing this virtqueue. */
+ void (*service)(struct virtqueue *vq);
+ pid_t thread;
+};
+
+/* Remember the arguments to the program so we can "reboot" */
+static char **main_args;
+
+/* The original tty settings to restore on exit. */
+static struct termios orig_term;
+
+/*
+ * We have to be careful with barriers: our devices are all run in separate
+ * threads and so we need to make sure that changes visible to the Guest happen
+ * in precise order.
+ */
+#define wmb() __asm__ __volatile__("" : : : "memory")
+#define rmb() __asm__ __volatile__("lock; addl $0,0(%%esp)" : : : "memory")
+#define mb() __asm__ __volatile__("lock; addl $0,0(%%esp)" : : : "memory")
+
+/* Wrapper for the last available index. Makes it easier to change. */
+#define lg_last_avail(vq) ((vq)->last_avail_idx)
+
+/*
+ * The virtio configuration space is defined to be little-endian. x86 is
+ * little-endian too, but it's nice to be explicit so we have these helpers.
+ */
+#define cpu_to_le16(v16) (v16)
+#define cpu_to_le32(v32) (v32)
+#define cpu_to_le64(v64) (v64)
+#define le16_to_cpu(v16) (v16)
+#define le32_to_cpu(v32) (v32)
+#define le64_to_cpu(v64) (v64)
+
+/* Is this iovec empty? */
+static bool iov_empty(const struct iovec iov[], unsigned int num_iov)
+{
+ unsigned int i;
+
+ for (i = 0; i < num_iov; i++)
+ if (iov[i].iov_len)
+ return false;
+ return true;
+}
+
+/* Take len bytes from the front of this iovec. */
+static void iov_consume(struct iovec iov[], unsigned num_iov,
+ void *dest, unsigned len)
+{
+ unsigned int i;
+
+ for (i = 0; i < num_iov; i++) {
+ unsigned int used;
+
+ used = iov[i].iov_len < len ? iov[i].iov_len : len;
+ if (dest) {
+ memcpy(dest, iov[i].iov_base, used);
+ dest += used;
+ }
+ iov[i].iov_base += used;
+ iov[i].iov_len -= used;
+ len -= used;
+ }
+ if (len != 0)
+ errx(1, "iovec too short!");
+}
+
+/* The device virtqueue descriptors are followed by feature bitmasks. */
+static u8 *get_feature_bits(struct device *dev)
+{
+ return (u8 *)(dev->desc + 1)
+ + dev->num_vq * sizeof(struct lguest_vqconfig);
+}
+
+/*L:100
+ * The Launcher code itself takes us out into userspace, that scary place where
+ * pointers run wild and free! Unfortunately, like most userspace programs,
+ * it's quite boring (which is why everyone likes to hack on the kernel!).
+ * Perhaps if you make up an Lguest Drinking Game at this point, it will get
+ * you through this section. Or, maybe not.
+ *
+ * The Launcher sets up a big chunk of memory to be the Guest's "physical"
+ * memory and stores it in "guest_base". In other words, Guest physical ==
+ * Launcher virtual with an offset.
+ *
+ * This can be tough to get your head around, but usually it just means that we
+ * use these trivial conversion functions when the Guest gives us its
+ * "physical" addresses:
+ */
+static void *from_guest_phys(unsigned long addr)
+{
+ return guest_base + addr;
+}
+
+static unsigned long to_guest_phys(const void *addr)
+{
+ return (addr - guest_base);
+}
+
+/*L:130
+ * Loading the Kernel.
+ *
+ * We start with a couple of simple helper routines. open_or_die() avoids
+ * error-checking code cluttering the callers:
+ */
+static int open_or_die(const char *name, int flags)
+{
+ int fd = open(name, flags);
+ if (fd < 0)
+ err(1, "Failed to open %s", name);
+ return fd;
+}
+
+/* map_zeroed_pages() takes a number of pages. */
+static void *map_zeroed_pages(unsigned int num)
+{
+ int fd = open_or_die("/dev/zero", O_RDONLY);
+ void *addr;
+
+ /*
+ * We use a private mapping (ie. if we write to the page, it will be
+ * copied). We allocate an extra two pages PROT_NONE to act as guard
+ * pages against read/write attempts that exceed allocated space.
+ */
+ addr = mmap(NULL, getpagesize() * (num+2),
+ PROT_NONE, MAP_PRIVATE, fd, 0);
+
+ if (addr == MAP_FAILED)
+ err(1, "Mmapping %u pages of /dev/zero", num);
+
+ if (mprotect(addr + getpagesize(), getpagesize() * num,
+ PROT_READ|PROT_WRITE) == -1)
+ err(1, "mprotect rw %u pages failed", num);
+
+ /*
+ * One neat mmap feature is that you can close the fd, and it
+ * stays mapped.
+ */
+ close(fd);
+
+ /* Return address after PROT_NONE page */
+ return addr + getpagesize();
+}
+
+/* Get some more pages for a device. */
+static void *get_pages(unsigned int num)
+{
+ void *addr = from_guest_phys(guest_limit);
+
+ guest_limit += num * getpagesize();
+ if (guest_limit > guest_max)
+ errx(1, "Not enough memory for devices");
+ return addr;
+}
+
+/*
+ * This routine is used to load the kernel or initrd. It tries mmap, but if
+ * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries),
+ * it falls back to reading the memory in.
+ */
+static void map_at(int fd, void *addr, unsigned long offset, unsigned long len)
+{
+ ssize_t r;
+
+ /*
+	 * We map writable even though some of the segments are marked read-only.
+ * The kernel really wants to be writable: it patches its own
+ * instructions.
+ *
+ * MAP_PRIVATE means that the page won't be copied until a write is
+ * done to it. This allows us to share untouched memory between
+ * Guests.
+ */
+ if (mmap(addr, len, PROT_READ|PROT_WRITE,
+ MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED)
+ return;
+
+ /* pread does a seek and a read in one shot: saves a few lines. */
+ r = pread(fd, addr, len, offset);
+ if (r != len)
+ err(1, "Reading offset %lu len %lu gave %zi", offset, len, r);
+}
+
+/*
+ * This routine takes an open vmlinux image, which is in ELF, and maps it into
+ * the Guest memory. ELF = Executable and Linkable Format, which is the format
+ * used by all modern binaries on Linux including the kernel.
+ * by all modern binaries on Linux including the kernel.
+ *
+ * The ELF headers give *two* addresses: a physical address, and a virtual
+ * address. We use the physical address; the Guest will map itself to the
+ * virtual address.
+ *
+ * We return the starting address.
+ */
+static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr)
+{
+ Elf32_Phdr phdr[ehdr->e_phnum];
+ unsigned int i;
+
+ /*
+ * Sanity checks on the main ELF header: an x86 executable with a
+ * reasonable number of correctly-sized program headers.
+ */
+ if (ehdr->e_type != ET_EXEC
+ || ehdr->e_machine != EM_386
+ || ehdr->e_phentsize != sizeof(Elf32_Phdr)
+ || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr))
+ errx(1, "Malformed elf header");
+
+ /*
+ * An ELF executable contains an ELF header and a number of "program"
+ * headers which indicate which parts ("segments") of the program to
+ * load where.
+ */
+
+ /* We read in all the program headers at once: */
+ if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
+ err(1, "Seeking to program headers");
+ if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
+ err(1, "Reading program headers");
+
+ /*
+ * Try all the headers: there are usually only three. A read-only one,
+ * a read-write one, and a "note" section which we don't load.
+ */
+ for (i = 0; i < ehdr->e_phnum; i++) {
+ /* If this isn't a loadable segment, we ignore it */
+ if (phdr[i].p_type != PT_LOAD)
+ continue;
+
+ verbose("Section %i: size %i addr %p\n",
+ i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);
+
+ /* We map this section of the file at its physical address. */
+ map_at(elf_fd, from_guest_phys(phdr[i].p_paddr),
+ phdr[i].p_offset, phdr[i].p_filesz);
+ }
+
+ /* The entry point is given in the ELF header. */
+ return ehdr->e_entry;
+}
+
+/*L:150
+ * A bzImage, unlike an ELF file, is not meant to be loaded. You're supposed
+ * to jump into it and it will unpack itself. We used to have to perform some
+ * hairy magic because the unpacking code scared me.
+ *
+ * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote
+ * a small patch to jump over the tricky bits in the Guest, so now we just read
+ * the funky header so we know where in the file to load, and away we go!
+ */
+static unsigned long load_bzimage(int fd)
+{
+ struct boot_params boot;
+ int r;
+ /* Modern bzImages get loaded at 1M. */
+ void *p = from_guest_phys(0x100000);
+
+ /*
+ * Go back to the start of the file and read the header. It should be
+ * a Linux boot header (see Documentation/x86/boot.txt)
+ */
+ lseek(fd, 0, SEEK_SET);
+ read(fd, &boot, sizeof(boot));
+
+ /* Inside the setup_hdr, we expect the magic "HdrS" */
+ if (memcmp(&boot.hdr.header, "HdrS", 4) != 0)
+ errx(1, "This doesn't look like a bzImage to me");
+
+ /* Skip over the extra sectors of the header. */
+ lseek(fd, (boot.hdr.setup_sects+1) * 512, SEEK_SET);
+
+ /* Now read everything into memory, in nice big chunks. */
+ while ((r = read(fd, p, 65536)) > 0)
+ p += r;
+
+ /* Finally, code32_start tells us where to enter the kernel. */
+ return boot.hdr.code32_start;
+}
+
+/*L:140
+ * Loading the kernel is easy when it's a "vmlinux", but most kernels
+ * come wrapped up in the self-decompressing "bzImage" format. With a little
+ * work, we can load those, too.
+ */
+static unsigned long load_kernel(int fd)
+{
+ Elf32_Ehdr hdr;
+
+ /* Read in the first few bytes. */
+ if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
+ err(1, "Reading kernel");
+
+ /* If it's an ELF file, it starts with "\177ELF" */
+ if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
+ return map_elf(fd, &hdr);
+
+ /* Otherwise we assume it's a bzImage, and try to load it. */
+ return load_bzimage(fd);
+}
+
+/*
+ * This is a trivial little helper to align pages. Andi Kleen hated it because
+ * it calls getpagesize() twice: "it's dumb code."
+ *
+ * Kernel guys get really het up about optimization, even when it's not
+ * necessary. I leave this code as a reaction against that.
+ */
+static inline unsigned long page_align(unsigned long addr)
+{
+ /* Add upwards and truncate downwards. */
+ return ((addr + getpagesize()-1) & ~(getpagesize()-1));
+}
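+
+/*
+ * For example, with 4096-byte pages page_align(5000) gives 8192, and
+ * page_align(8192) stays 8192: already-aligned addresses are unchanged.
+ */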
+
+/*L:180
+ * An "initial ram disk" is a disk image loaded into memory along with the
+ * kernel which the kernel can use to boot from without needing any drivers.
+ * Most distributions now use this as standard: the initrd contains the code to
+ * load the appropriate driver modules for the current machine.
+ *
+ * Importantly, James Morris works for RedHat, and Fedora uses initrds for its
+ * kernels. He sent me this (and tells me when I break it).
+ */
+static unsigned long load_initrd(const char *name, unsigned long mem)
+{
+ int ifd;
+ struct stat st;
+ unsigned long len;
+
+ ifd = open_or_die(name, O_RDONLY);
+ /* fstat() is needed to get the file size. */
+ if (fstat(ifd, &st) < 0)
+ err(1, "fstat() on initrd '%s'", name);
+
+ /*
+ * We map the initrd at the top of memory, but mmap wants it to be
+ * page-aligned, so we round the size up for that.
+ */
+ len = page_align(st.st_size);
+ map_at(ifd, from_guest_phys(mem - len), 0, st.st_size);
+ /*
+ * Once a file is mapped, you can close the file descriptor. It's a
+ * little odd, but quite useful.
+ */
+ close(ifd);
+ verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len);
+
+ /* We return the initrd size. */
+ return len;
+}
+/*:*/
+
+/*
+ * Simple routine to roll all the commandline arguments together with spaces
+ * between them.
+ */
+static void concat(char *dst, char *args[])
+{
+ unsigned int i, len = 0;
+
+ for (i = 0; args[i]; i++) {
+ if (i) {
+ strcat(dst+len, " ");
+ len++;
+ }
+ strcpy(dst+len, args[i]);
+ len += strlen(args[i]);
+ }
+ /* In case it's empty. */
+ dst[len] = '\0';
+}
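+
+/*
+ * For instance, concat(dst, (char *[]){ "root=/dev/vda", "quiet", NULL })
+ * leaves dst holding "root=/dev/vda quiet" (a made-up command line, just to
+ * show the spacing).
+ */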
+
+/*L:185
+ * This is where we actually tell the kernel to initialize the Guest. We
+ * saw the arguments it expects when we looked at initialize() in lguest_user.c:
+ * the base of Guest "physical" memory, the top physical page to allow and the
+ * entry point for the Guest.
+ */
+static void tell_kernel(unsigned long start)
+{
+ unsigned long args[] = { LHREQ_INITIALIZE,
+ (unsigned long)guest_base,
+ guest_limit / getpagesize(), start };
+ verbose("Guest: %p - %p (%#lx)\n",
+ guest_base, guest_base + guest_limit, guest_limit);
+ lguest_fd = open_or_die("/dev/lguest", O_RDWR);
+ if (write(lguest_fd, args, sizeof(args)) < 0)
+ err(1, "Writing to /dev/lguest");
+}
+/*:*/
+
+/*L:200
+ * Device Handling.
+ *
+ * When the Guest gives us a buffer, it sends an array of addresses and sizes.
+ * We need to make sure it's not trying to reach into the Launcher itself, so
+ * we have a convenient routine which checks it and exits with an error message
+ * if something funny is going on:
+ */
+static void *_check_pointer(unsigned long addr, unsigned int size,
+ unsigned int line)
+{
+ /*
+ * Check if the requested address and size exceed the allocated memory,
+ * or addr + size wraps around.
+ */
+ if ((addr + size) > guest_limit || (addr + size) < addr)
+ errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr);
+ /*
+ * We return a pointer for the caller's convenience, now we know it's
+ * safe to use.
+ */
+ return from_guest_phys(addr);
+}
+/* A macro which transparently hands the line number to the real function. */
+#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
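+
+/*
+ * So with a 64MB Guest, say, a 16-byte buffer claimed to live at 0x3FFFFF8
+ * dies here: the end of the buffer crosses guest_limit (0x4000000).
+ */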
+
+/*
+ * Each buffer in the virtqueues is actually a chain of descriptors. This
+ * function returns the next descriptor in the chain, or vq->vring.num if we're
+ * at the end.
+ */
+static unsigned next_desc(struct vring_desc *desc,
+ unsigned int i, unsigned int max)
+{
+ unsigned int next;
+
+ /* If this descriptor says it doesn't chain, we're done. */
+ if (!(desc[i].flags & VRING_DESC_F_NEXT))
+ return max;
+
+ /* Check they're not leading us off the end of the descriptors. */
+ next = desc[i].next;
+ /* Make sure compiler knows to grab that: we don't want it changing! */
+ wmb();
+
+ if (next >= max)
+ errx(1, "Desc next is %u", next);
+
+ return next;
+}
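+
+/*
+ * A sketch of how a chain is usually walked with next_desc() (this is
+ * essentially what wait_for_vq_desc() does below):
+ *
+ *    i = head;
+ *    do {
+ *        ... use desc[i] ...
+ *    } while ((i = next_desc(desc, i, max)) != max);
+ */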
+
+/*
+ * This actually sends the interrupt for this virtqueue, if we've used a
+ * buffer.
+ */
+static void trigger_irq(struct virtqueue *vq)
+{
+ unsigned long buf[] = { LHREQ_IRQ, vq->config.irq };
+
+ /* Don't inform them if nothing used. */
+ if (!vq->pending_used)
+ return;
+ vq->pending_used = 0;
+
+ /* If they don't want an interrupt, don't send one... */
+ if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) {
+ return;
+ }
+
+ /* Send the Guest an interrupt to tell them we used something up. */
+ if (write(lguest_fd, buf, sizeof(buf)) != 0)
+ err(1, "Triggering irq %i", vq->config.irq);
+}
+
+/*
+ * This looks in the virtqueue for the first available buffer, and converts
+ * it to an iovec for convenient access. Since descriptors consist of some
+ * number of output then some number of input descriptors, it's actually two
+ * iovecs, but we pack them into one and note how many of each there were.
+ *
+ * This function waits if necessary, and returns the descriptor number found.
+ */
+static unsigned wait_for_vq_desc(struct virtqueue *vq,
+ struct iovec iov[],
+ unsigned int *out_num, unsigned int *in_num)
+{
+ unsigned int i, head, max;
+ struct vring_desc *desc;
+ u16 last_avail = lg_last_avail(vq);
+
+ /* There's nothing available? */
+ while (last_avail == vq->vring.avail->idx) {
+ u64 event;
+
+ /*
+ * Since we're about to sleep, now is a good time to tell the
+ * Guest about what we've used up to now.
+ */
+ trigger_irq(vq);
+
+ /* OK, now we need to know about added descriptors. */
+ vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY;
+
+ /*
+ * They could have slipped one in as we were doing that: make
+ * sure it's written, then check again.
+ */
+ mb();
+ if (last_avail != vq->vring.avail->idx) {
+ vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
+ break;
+ }
+
+ /* Nothing new? Wait for eventfd to tell us they refilled. */
+ if (read(vq->eventfd, &event, sizeof(event)) != sizeof(event))
+ errx(1, "Event read failed?");
+
+ /* We don't need to be notified again. */
+ vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY;
+ }
+
+ /* Check it isn't doing very strange things with descriptor numbers. */
+ if ((u16)(vq->vring.avail->idx - last_avail) > vq->vring.num)
+ errx(1, "Guest moved used index from %u to %u",
+ last_avail, vq->vring.avail->idx);
+
+ /*
+ * Make sure we read the descriptor number *after* we read the ring
+ * update; don't let the cpu or compiler change the order.
+ */
+ rmb();
+
+ /*
+ * Grab the next descriptor number they're advertising, and increment
+ * the index we've seen.
+ */
+ head = vq->vring.avail->ring[last_avail % vq->vring.num];
+ lg_last_avail(vq)++;
+
+ /* If their number is silly, that's a fatal mistake. */
+ if (head >= vq->vring.num)
+ errx(1, "Guest says index %u is available", head);
+
+ /* When we start, there are neither input nor output descriptors. */
+ *out_num = *in_num = 0;
+
+ max = vq->vring.num;
+ desc = vq->vring.desc;
+ i = head;
+
+ /*
+ * We have to read the descriptor after we read the descriptor number,
+ * but there's a data dependency there so the CPU shouldn't reorder
+ * that: no rmb() required.
+ */
+
+ /*
+ * If this is an indirect entry, then this buffer contains a descriptor
+ * table which we handle as if it's any normal descriptor chain.
+ */
+ if (desc[i].flags & VRING_DESC_F_INDIRECT) {
+ if (desc[i].len % sizeof(struct vring_desc))
+ errx(1, "Invalid size for indirect buffer table");
+
+ max = desc[i].len / sizeof(struct vring_desc);
+ desc = check_pointer(desc[i].addr, desc[i].len);
+ i = 0;
+ }
+
+ do {
+ /* Grab the first descriptor, and check it's OK. */
+ iov[*out_num + *in_num].iov_len = desc[i].len;
+ iov[*out_num + *in_num].iov_base
+ = check_pointer(desc[i].addr, desc[i].len);
+ /* If this is an input descriptor, increment that count. */
+ if (desc[i].flags & VRING_DESC_F_WRITE)
+ (*in_num)++;
+ else {
+ /*
+ * If it's an output descriptor, they're all supposed
+ * to come before any input descriptors.
+ */
+ if (*in_num)
+ errx(1, "Descriptor has out after in");
+ (*out_num)++;
+ }
+
+ /* If we've got too many, that implies a descriptor loop. */
+ if (*out_num + *in_num > max)
+ errx(1, "Looped descriptor");
+ } while ((i = next_desc(desc, i, max)) != max);
+
+ return head;
+}
+
+/*
+ * After we've used one of their buffers, we tell the Guest about it. Sometime
+ * later we'll want to send them an interrupt using trigger_irq(); note that
+ * wait_for_vq_desc() does that for us if it has to wait.
+ */
+static void add_used(struct virtqueue *vq, unsigned int head, int len)
+{
+ struct vring_used_elem *used;
+
+ /*
+ * The virtqueue contains a ring of used buffers. Get a pointer to the
+ * next entry in that used ring.
+ */
+ used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num];
+ used->id = head;
+ used->len = len;
+ /* Make sure buffer is written before we update index. */
+ wmb();
+ vq->vring.used->idx++;
+ vq->pending_used++;
+}
+
+/* And here's the combo meal deal. Supersize me! */
+static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len)
+{
+ add_used(vq, head, len);
+ trigger_irq(vq);
+}
+
+/*
+ * The Console
+ *
+ * We associate some data with the console for our exit hack.
+ */
+struct console_abort {
+ /* How many times have they hit ^C? */
+ int count;
+ /* When did they start? */
+ struct timeval start;
+};
+
+/* This is the routine which handles console input (ie. stdin). */
+static void console_input(struct virtqueue *vq)
+{
+ int len;
+ unsigned int head, in_num, out_num;
+ struct console_abort *abort = vq->dev->priv;
+ struct iovec iov[vq->vring.num];
+
+ /* Make sure there's a descriptor available. */
+ head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
+ if (out_num)
+ errx(1, "Output buffers in console in queue?");
+
+ /* Read into it. This is where we usually wait. */
+ len = readv(STDIN_FILENO, iov, in_num);
+ if (len <= 0) {
+ /* Ran out of input? */
+ warnx("Failed to get console input, ignoring console.");
+ /*
+ * For simplicity, dying threads kill the whole Launcher. So
+ * just nap here.
+ */
+ for (;;)
+ pause();
+ }
+
+ /* Tell the Guest we used a buffer. */
+ add_used_and_trigger(vq, head, len);
+
+ /*
+ * Three ^C within one second? Exit.
+ *
+ * This is such a hack, but works surprisingly well. Each ^C has to
+ * be in a buffer by itself, so they can't be too fast. But we check
+ * that we get three within about a second, so they can't be too
+ * slow.
+ */
+ if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) {
+ abort->count = 0;
+ return;
+ }
+
+ abort->count++;
+ if (abort->count == 1)
+ gettimeofday(&abort->start, NULL);
+ else if (abort->count == 3) {
+ struct timeval now;
+ gettimeofday(&now, NULL);
+ /* Kill all Launcher processes with SIGINT, like normal ^C */
+ if (now.tv_sec <= abort->start.tv_sec+1)
+ kill(0, SIGINT);
+ abort->count = 0;
+ }
+}
+
+/* This is the routine which handles console output (ie. stdout). */
+static void console_output(struct virtqueue *vq)
+{
+ unsigned int head, out, in;
+ struct iovec iov[vq->vring.num];
+
+ /* We usually wait in here, for the Guest to give us something. */
+ head = wait_for_vq_desc(vq, iov, &out, &in);
+ if (in)
+ errx(1, "Input buffers in console output queue?");
+
+ /* writev can return a partial write, so we loop here. */
+ while (!iov_empty(iov, out)) {
+ int len = writev(STDOUT_FILENO, iov, out);
+ if (len <= 0) {
+ warn("Write to stdout gave %i (%d)", len, errno);
+ break;
+ }
+ iov_consume(iov, out, NULL, len);
+ }
+
+ /*
+ * We're finished with that buffer: if we're going to sleep,
+ * wait_for_vq_desc() will prod the Guest with an interrupt.
+ */
+ add_used(vq, head, 0);
+}
+
+/*
+ * The Network
+ *
+ * Handling output for network is also simple: we get all the output buffers
+ * and write them to /dev/net/tun.
+ */
+struct net_info {
+ int tunfd;
+};
+
+static void net_output(struct virtqueue *vq)
+{
+ struct net_info *net_info = vq->dev->priv;
+ unsigned int head, out, in;
+ struct iovec iov[vq->vring.num];
+
+ /* We usually wait in here for the Guest to give us a packet. */
+ head = wait_for_vq_desc(vq, iov, &out, &in);
+ if (in)
+ errx(1, "Input buffers in net output queue?");
+ /*
+ * Send the whole thing through to /dev/net/tun. It expects the exact
+ * same format: what a coincidence!
+ */
+ if (writev(net_info->tunfd, iov, out) < 0)
+ warnx("Write to tun failed (%d)?", errno);
+
+ /*
+ * Done with that one; wait_for_vq_desc() will send the interrupt if
+ * all packets are processed.
+ */
+ add_used(vq, head, 0);
+}
+
+/*
+ * Handling network input is a bit trickier, because I've tried to optimize it.
+ *
+ * First we have a helper routine which tells us if reading from this file
+ * descriptor (ie. the /dev/net/tun device) will block:
+ */
+static bool will_block(int fd)
+{
+ fd_set fdset;
+ struct timeval zero = { 0, 0 };
+ FD_ZERO(&fdset);
+ FD_SET(fd, &fdset);
+ return select(fd+1, &fdset, NULL, NULL, &zero) != 1;
+}
+
+/*
+ * This handles packets coming in from the tun device to our Guest. Like all
+ * service routines, it gets called again as soon as it returns, so you don't
+ * see a while(1) loop here.
+ */
+static void net_input(struct virtqueue *vq)
+{
+ int len;
+ unsigned int head, out, in;
+ struct iovec iov[vq->vring.num];
+ struct net_info *net_info = vq->dev->priv;
+
+ /*
+ * Get a descriptor to write an incoming packet into. This will also
+ * send an interrupt if they're out of descriptors.
+ */
+ head = wait_for_vq_desc(vq, iov, &out, &in);
+ if (out)
+ errx(1, "Output buffers in net input queue?");
+
+ /*
+ * If it looks like we'll block reading from the tun device, send them
+ * an interrupt.
+ */
+ if (vq->pending_used && will_block(net_info->tunfd))
+ trigger_irq(vq);
+
+ /*
+ * Read in the packet. This is where we normally wait (when there's no
+ * incoming network traffic).
+ */
+ len = readv(net_info->tunfd, iov, in);
+ if (len <= 0)
+ warn("Failed to read from tun (%d).", errno);
+
+ /*
+ * Mark that packet buffer as used, but don't interrupt here. We want
+ * to wait until we've done as much work as we can.
+ */
+ add_used(vq, head, len);
+}
+/*:*/
+
+/* This is the helper to create threads: run the service routine in a loop. */
+static int do_thread(void *_vq)
+{
+ struct virtqueue *vq = _vq;
+
+ for (;;)
+ vq->service(vq);
+ return 0;
+}
+
+/*
+ * When a child dies, we kill our entire process group with SIGTERM. This
+ * also has the side effect that the shell restores the console for us!
+ */
+static void kill_launcher(int signal)
+{
+ kill(0, SIGTERM);
+}
+
+static void reset_device(struct device *dev)
+{
+ struct virtqueue *vq;
+
+ verbose("Resetting device %s\n", dev->name);
+
+ /* Clear any features they've acked. */
+ memset(get_feature_bits(dev) + dev->feature_len, 0, dev->feature_len);
+
+ /* We're going to be explicitly killing threads, so ignore them. */
+ signal(SIGCHLD, SIG_IGN);
+
+ /* Zero out the virtqueues, get rid of their threads */
+ for (vq = dev->vq; vq; vq = vq->next) {
+ if (vq->thread != (pid_t)-1) {
+ kill(vq->thread, SIGTERM);
+ waitpid(vq->thread, NULL, 0);
+ vq->thread = (pid_t)-1;
+ }
+ memset(vq->vring.desc, 0,
+ vring_size(vq->config.num, LGUEST_VRING_ALIGN));
+ lg_last_avail(vq) = 0;
+ }
+ dev->running = false;
+
+ /* Now we care if threads die. */
+ signal(SIGCHLD, (void *)kill_launcher);
+}
+
+/*L:216
+ * This actually creates the thread which services the virtqueue for a device.
+ */
+static void create_thread(struct virtqueue *vq)
+{
+ /*
+ * Create a stack for the thread. Since the stack grows downwards, we
+ * pass clone() a pointer to the end of this region as its stack pointer.
+ */
+ char *stack = malloc(32768);
+ unsigned long args[] = { LHREQ_EVENTFD,
+ vq->config.pfn*getpagesize(), 0 };
+
+ /* Create a zero-initialized eventfd. */
+ vq->eventfd = eventfd(0, 0);
+ if (vq->eventfd < 0)
+ err(1, "Creating eventfd");
+ args[2] = vq->eventfd;
+
+ /*
+ * Attach an eventfd to this virtqueue: it will go off when the Guest
+ * does an LHCALL_NOTIFY for this vq.
+ */
+ if (write(lguest_fd, &args, sizeof(args)) != 0)
+ err(1, "Attaching eventfd");
+
+ /*
+ * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so
+ * we get a signal if it dies.
+ */
+ vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq);
+ if (vq->thread == (pid_t)-1)
+ err(1, "Creating clone");
+
+ /* We close our local copy now the child has it. */
+ close(vq->eventfd);
+}
+
+static void start_device(struct device *dev)
+{
+ unsigned int i;
+ struct virtqueue *vq;
+
+ verbose("Device %s OK: offered", dev->name);
+ for (i = 0; i < dev->feature_len; i++)
+ verbose(" %02x", get_feature_bits(dev)[i]);
+ verbose(", accepted");
+ for (i = 0; i < dev->feature_len; i++)
+ verbose(" %02x", get_feature_bits(dev)
+ [dev->feature_len+i]);
+
+ for (vq = dev->vq; vq; vq = vq->next) {
+ if (vq->service)
+ create_thread(vq);
+ }
+ dev->running = true;
+}
+
+static void cleanup_devices(void)
+{
+ struct device *dev;
+
+ for (dev = devices.dev; dev; dev = dev->next)
+ reset_device(dev);
+
+ /* If we saved off the original terminal settings, restore them now. */
+ if (orig_term.c_lflag & (ISIG|ICANON|ECHO))
+ tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
+}
+
+/* When the Guest tells us they updated the status field, we handle it. */
+static void update_device_status(struct device *dev)
+{
+ /* A zero status is a reset, otherwise it's a set of flags. */
+ if (dev->desc->status == 0)
+ reset_device(dev);
+ else if (dev->desc->status & VIRTIO_CONFIG_S_FAILED) {
+ warnx("Device %s configuration FAILED", dev->name);
+ if (dev->running)
+ reset_device(dev);
+ } else {
+ if (dev->running)
+ err(1, "Device %s features finalized twice", dev->name);
+ start_device(dev);
+ }
+}
+
+/*L:215
+ * This is the generic routine we call when the Guest uses LHCALL_NOTIFY. In
+ * particular, it's used to notify us of device status changes during boot.
+ */
+static void handle_output(unsigned long addr)
+{
+ struct device *i;
+
+ /* Check each device. */
+ for (i = devices.dev; i; i = i->next) {
+ struct virtqueue *vq;
+
+ /*
+ * Notifications to device descriptors mean they updated the
+ * device status.
+ */
+ if (from_guest_phys(addr) == i->desc) {
+ update_device_status(i);
+ return;
+ }
+
+ /* Devices should not be used before features are finalized. */
+ for (vq = i->vq; vq; vq = vq->next) {
+ if (addr != vq->config.pfn*getpagesize())
+ continue;
+ errx(1, "Notification on %s before setup!", i->name);
+ }
+ }
+
+ /*
+ * Early console write is done using notify on a nul-terminated string
+ * in Guest memory. It's also great for hacking debugging messages
+ * into a Guest.
+ */
+ if (addr >= guest_limit)
+ errx(1, "Bad NOTIFY %#lx", addr);
+
+ write(STDOUT_FILENO, from_guest_phys(addr),
+ strnlen(from_guest_phys(addr), guest_limit - addr));
+}
+
+/*L:190
+ * Device Setup
+ *
+ * All devices need a descriptor so the Guest knows it exists, and a "struct
+ * device" so the Launcher can keep track of it. We have common helper
+ * routines to allocate and manage them.
+ */
+
+/*
+ * The layout of the device page is a "struct lguest_device_desc" followed by a
+ * number of virtqueue descriptors, then two sets of feature bits, then an
+ * array of configuration bytes. This routine returns the configuration
+ * pointer.
+ */
+static u8 *device_config(const struct device *dev)
+{
+ return (void *)(dev->desc + 1)
+ + dev->num_vq * sizeof(struct lguest_vqconfig)
+ + dev->feature_len * 2;
+}
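+
+/*
+ * In other words, each device's slice of the page looks roughly like this
+ * (sizes depend on num_vq, feature_len and config_len):
+ *
+ *    struct lguest_device_desc
+ *    struct lguest_vqconfig       [num_vq of these]
+ *    u8 features[feature_len]     [what we offer]
+ *    u8 features[feature_len]     [what the Guest accepted]
+ *    u8 config[config_len]        <- device_config() points here
+ */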
+
+/*
+ * This routine allocates a new "struct lguest_device_desc" from the descriptor
+ * table page just above the Guest's normal memory. It returns a pointer to
+ * that descriptor.
+ */
+static struct lguest_device_desc *new_dev_desc(u16 type)
+{
+ struct lguest_device_desc d = { .type = type };
+ void *p;
+
+ /* Figure out where the next device config is, based on the last one. */
+ if (devices.lastdev)
+ p = device_config(devices.lastdev)
+ + devices.lastdev->desc->config_len;
+ else
+ p = devices.descpage;
+
+ /* We only have one page for all the descriptors. */
+ if (p + sizeof(d) > (void *)devices.descpage + getpagesize())
+ errx(1, "Too many devices");
+
+ /* p might not be aligned, so we memcpy in. */
+ return memcpy(p, &d, sizeof(d));
+}
+
+/*
+ * Each device descriptor is followed by the description of its virtqueues. We
+ * specify how many descriptors the virtqueue is to have.
+ */
+static void add_virtqueue(struct device *dev, unsigned int num_descs,
+ void (*service)(struct virtqueue *))
+{
+ unsigned int pages;
+ struct virtqueue **i, *vq = malloc(sizeof(*vq));
+ void *p;
+
+ /* First we need some memory for this virtqueue. */
+ pages = (vring_size(num_descs, LGUEST_VRING_ALIGN) + getpagesize() - 1)
+ / getpagesize();
+ p = get_pages(pages);
+
+ /* Initialize the virtqueue */
+ vq->next = NULL;
+ vq->last_avail_idx = 0;
+ vq->dev = dev;
+
+ /*
+ * This is the routine the service thread will run, and its Process ID
+ * once it's running.
+ */
+ vq->service = service;
+ vq->thread = (pid_t)-1;
+
+ /* Initialize the configuration. */
+ vq->config.num = num_descs;
+ vq->config.irq = devices.next_irq++;
+ vq->config.pfn = to_guest_phys(p) / getpagesize();
+
+ /* Initialize the vring. */
+ vring_init(&vq->vring, num_descs, p, LGUEST_VRING_ALIGN);
+
+ /*
+ * Append virtqueue to this device's descriptor. We use
+ * device_config() to get the end of the device's current virtqueues;
+ * we check that we haven't added any config or feature information
+ * yet, otherwise we'd be overwriting them.
+ */
+ assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0);
+ memcpy(device_config(dev), &vq->config, sizeof(vq->config));
+ dev->num_vq++;
+ dev->desc->num_vq++;
+
+ verbose("Virtqueue page %#lx\n", to_guest_phys(p));
+
+ /*
+ * Add to tail of list, so dev->vq is first vq, dev->vq->next is
+ * second.
+ */
+ for (i = &dev->vq; *i; i = &(*i)->next);
+ *i = vq;
+}
+
+/*
+ * The first half of the feature bitmask is for us to advertise features. The
+ * second half is for the Guest to accept features.
+ */
+static void add_feature(struct device *dev, unsigned bit)
+{
+ u8 *features = get_feature_bits(dev);
+
+ /* We can't extend the feature bits once we've added config bytes */
+ if (dev->desc->feature_len <= bit / CHAR_BIT) {
+ assert(dev->desc->config_len == 0);
+ dev->feature_len = dev->desc->feature_len = (bit/CHAR_BIT) + 1;
+ }
+
+ features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT));
+}
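+
+/*
+ * For example, feature bit 5 lands in features[0] as 1 << 5 (0x20), and
+ * feature bit 9 lands in features[1] as 1 << 1 (0x02), growing feature_len
+ * to 2 if needed.
+ */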
+
+/*
+ * This routine sets the configuration fields for an existing device's
+ * descriptor. It only works for the last device, but that's OK because that's
+ * how we use it.
+ */
+static void set_config(struct device *dev, unsigned len, const void *conf)
+{
+ /* Check we haven't overflowed our single page. */
+ if (device_config(dev) + len > devices.descpage + getpagesize())
+ errx(1, "Too many devices");
+
+ /* Copy in the config information, and store the length. */
+ memcpy(device_config(dev), conf, len);
+ dev->desc->config_len = len;
+
+ /* Size must fit in config_len field (8 bits)! */
+ assert(dev->desc->config_len == len);
+}
+
+/*
+ * This routine does all the creation and setup of a new device, including
+ * calling new_dev_desc() to allocate the descriptor and device memory. We
+ * don't actually start the service threads until later.
+ *
+ * See what I mean about userspace being boring?
+ */
+static struct device *new_device(const char *name, u16 type)
+{
+ struct device *dev = malloc(sizeof(*dev));
+
+ /* Now we populate the fields one at a time. */
+ dev->desc = new_dev_desc(type);
+ dev->name = name;
+ dev->vq = NULL;
+ dev->feature_len = 0;
+ dev->num_vq = 0;
+ dev->running = false;
+ dev->next = NULL;
+
+ /*
+ * Append to device list. Prepending to a singly-linked list is
+ * easier, but the user expects the devices to be arranged on the bus
+ * in command-line order. The first network device on the command line
+ * is eth0, the first block device /dev/vda, etc.
+ */
+ if (devices.lastdev)
+ devices.lastdev->next = dev;
+ else
+ devices.dev = dev;
+ devices.lastdev = dev;
+
+ return dev;
+}
+
+/*
+ * Our first setup routine is the console. It's a fairly simple device, but
+ * UNIX tty handling makes it uglier than it could be.
+ */
+static void setup_console(void)
+{
+ struct device *dev;
+
+ /* If we can save the initial standard input settings... */
+ if (tcgetattr(STDIN_FILENO, &orig_term) == 0) {
+ struct termios term = orig_term;
+ /*
+ * Then we turn off echo, line buffering and ^C etc: We want a
+ * raw input stream to the Guest.
+ */
+ term.c_lflag &= ~(ISIG|ICANON|ECHO);
+ tcsetattr(STDIN_FILENO, TCSANOW, &term);
+ }
+
+ dev = new_device("console", VIRTIO_ID_CONSOLE);
+
+ /* We store the console state in dev->priv, and initialize it. */
+ dev->priv = malloc(sizeof(struct console_abort));
+ ((struct console_abort *)dev->priv)->count = 0;
+
+ /*
+ * The console needs two virtqueues: the input then the output. When
+ * they put something in the input queue, we make sure we're listening to
+ * stdin. When they put something in the output queue, we write it to
+ * stdout.
+ */
+ add_virtqueue(dev, VIRTQUEUE_NUM, console_input);
+ add_virtqueue(dev, VIRTQUEUE_NUM, console_output);
+
+ verbose("device %u: console\n", ++devices.device_num);
+}
+/*:*/
+
+/*M:010
+ * Inter-guest networking is an interesting area. Simplest is to have a
+ * --sharenet=<name> option which opens or creates a named pipe. This can be
+ * used to send packets to another guest in a 1:1 manner.
+ *
+ * More sophisticated is to use one of the tools developed for projects like UML
+ * to do networking.
+ *
+ * Faster is to do virtio bonding in kernel. Doing this 1:1 would be
+ * completely generic ("here's my vring, attach to your vring") and would work
+ * for any traffic. Of course, namespace and permissions issues need to be
+ * dealt with. A more sophisticated "multi-channel" virtio_net.c could hide
+ * multiple inter-guest channels behind one interface, although it would
+ * require some manner of hotplugging new virtio channels.
+ *
+ * Finally, we could use a virtio network switch in the kernel, ie. vhost.
+:*/
+
+static u32 str2ip(const char *ipaddr)
+{
+ unsigned int b[4];
+
+ if (sscanf(ipaddr, "%u.%u.%u.%u", &b[0], &b[1], &b[2], &b[3]) != 4)
+ errx(1, "Failed to parse IP address '%s'", ipaddr);
+ return (b[0] << 24) | (b[1] << 16) | (b[2] << 8) | b[3];
+}
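+
+/* For example, str2ip("192.168.19.1") returns 0xC0A81301 (host order). */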
+
+static void str2mac(const char *macaddr, unsigned char mac[6])
+{
+ unsigned int m[6];
+ if (sscanf(macaddr, "%02x:%02x:%02x:%02x:%02x:%02x",
+ &m[0], &m[1], &m[2], &m[3], &m[4], &m[5]) != 6)
+ errx(1, "Failed to parse mac address '%s'", macaddr);
+ mac[0] = m[0];
+ mac[1] = m[1];
+ mac[2] = m[2];
+ mac[3] = m[3];
+ mac[4] = m[4];
+ mac[5] = m[5];
+}
+
+/*
+ * This code is "adapted" from libbridge: it attaches the Host end of the
+ * network device to the bridge device specified by the command line.
+ *
+ * This is yet another James Morris contribution (I'm an IP-level guy, so I
+ * dislike bridging), and I just try not to break it.
+ */
+static void add_to_bridge(int fd, const char *if_name, const char *br_name)
+{
+ int ifidx;
+ struct ifreq ifr;
+
+ if (!*br_name)
+ errx(1, "must specify bridge name");
+
+ ifidx = if_nametoindex(if_name);
+ if (!ifidx)
+ errx(1, "interface %s does not exist!", if_name);
+
+ strncpy(ifr.ifr_name, br_name, IFNAMSIZ);
+ ifr.ifr_name[IFNAMSIZ-1] = '\0';
+ ifr.ifr_ifindex = ifidx;
+ if (ioctl(fd, SIOCBRADDIF, &ifr) < 0)
+ err(1, "can't add %s to bridge %s", if_name, br_name);
+}
+
+/*
+ * This sets up the Host end of the network device with an IP address and
+ * brings it up so packets will flow.
+ */
+static void configure_device(int fd, const char *tapif, u32 ipaddr)
+{
+ struct ifreq ifr;
+ struct sockaddr_in sin;
+
+ memset(&ifr, 0, sizeof(ifr));
+ strcpy(ifr.ifr_name, tapif);
+
+ /* Don't read these incantations. Just cut & paste them like I did! */
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = htonl(ipaddr);
+ memcpy(&ifr.ifr_addr, &sin, sizeof(sin));
+ if (ioctl(fd, SIOCSIFADDR, &ifr) != 0)
+ err(1, "Setting %s interface address", tapif);
+ ifr.ifr_flags = IFF_UP;
+ if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0)
+ err(1, "Bringing interface %s up", tapif);
+}
+
+static int get_tun_device(char tapif[IFNAMSIZ])
+{
+ struct ifreq ifr;
+ int netfd;
+
+ /* Start with this zeroed. Messy but sure. */
+ memset(&ifr, 0, sizeof(ifr));
+
+ /*
+ * We open the /dev/net/tun device and tell it we want a tap device. A
+ * tap device is like a tun device, only somehow different. To tell
+ * the truth, I completely blundered my way through this code, but it
+ * works now!
+ */
+ netfd = open_or_die("/dev/net/tun", O_RDWR);
+ ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR;
+ strcpy(ifr.ifr_name, "tap%d");
+ if (ioctl(netfd, TUNSETIFF, &ifr) != 0)
+ err(1, "configuring /dev/net/tun");
+
+ if (ioctl(netfd, TUNSETOFFLOAD,
+ TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0)
+ err(1, "Could not set features for tun device");
+
+ /*
+ * We don't need checksums calculated for packets coming in this
+ * device: trust us!
+ */
+ ioctl(netfd, TUNSETNOCSUM, 1);
+
+ memcpy(tapif, ifr.ifr_name, IFNAMSIZ);
+ return netfd;
+}
+
+/*L:195
+ * Our network is a Host<->Guest network. This can either use bridging or
+ * routing, but the principle is the same: it uses the "tun" device to inject
+ * packets into the Host as if they came in from a normal network card. We
+ * just shunt packets between the Guest and the tun device.
+ */
+static void setup_tun_net(char *arg)
+{
+ struct device *dev;
+ struct net_info *net_info = malloc(sizeof(*net_info));
+ int ipfd;
+ u32 ip = INADDR_ANY;
+ bool bridging = false;
+ char tapif[IFNAMSIZ], *p;
+ struct virtio_net_config conf;
+
+ net_info->tunfd = get_tun_device(tapif);
+
+ /* First we create a new network device. */
+ dev = new_device("net", VIRTIO_ID_NET);
+ dev->priv = net_info;
+
+ /* Network devices need a recv and a send queue, just like console. */
+ add_virtqueue(dev, VIRTQUEUE_NUM, net_input);
+ add_virtqueue(dev, VIRTQUEUE_NUM, net_output);
+
+ /*
+ * We need a socket to perform the magic network ioctls to bring up the
+ * tap interface, connect to the bridge etc. Any socket will do!
+ */
+ ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
+ if (ipfd < 0)
+ err(1, "opening IP socket");
+
+ /* If the command line was --tunnet=bridge:<name> do bridging. */
+ if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) {
+ arg += strlen(BRIDGE_PFX);
+ bridging = true;
+ }
+
+ /* A MAC address may follow the bridge name or IP address. */
+ p = strchr(arg, ':');
+ if (p) {
+ str2mac(p+1, conf.mac);
+ add_feature(dev, VIRTIO_NET_F_MAC);
+ *p = '\0';
+ }
+
+ /* arg is now either an IP address or a bridge name */
+ if (bridging)
+ add_to_bridge(ipfd, tapif, arg);
+ else
+ ip = str2ip(arg);
+
+ /* Set up the tun device. */
+ configure_device(ipfd, tapif, ip);
+
+ /* Expect Guest to handle everything except UFO */
+ add_feature(dev, VIRTIO_NET_F_CSUM);
+ add_feature(dev, VIRTIO_NET_F_GUEST_CSUM);
+ add_feature(dev, VIRTIO_NET_F_GUEST_TSO4);
+ add_feature(dev, VIRTIO_NET_F_GUEST_TSO6);
+ add_feature(dev, VIRTIO_NET_F_GUEST_ECN);
+ add_feature(dev, VIRTIO_NET_F_HOST_TSO4);
+ add_feature(dev, VIRTIO_NET_F_HOST_TSO6);
+ add_feature(dev, VIRTIO_NET_F_HOST_ECN);
+ /* We handle indirect ring entries */
+ add_feature(dev, VIRTIO_RING_F_INDIRECT_DESC);
+ /* We're compliant with the damn spec. */
+ add_feature(dev, VIRTIO_F_ANY_LAYOUT);
+ set_config(dev, sizeof(conf), &conf);
+
+ /* We don't need the socket any more; setup is done. */
+ close(ipfd);
+
+ devices.device_num++;
+
+ if (bridging)
+ verbose("device %u: tun %s attached to bridge: %s\n",
+ devices.device_num, tapif, arg);
+ else
+ verbose("device %u: tun %s: %s\n",
+ devices.device_num, tapif, arg);
+}
+/*:*/
+
+/* This hangs off device->priv. */
+struct vblk_info {
+ /* The size of the file. */
+ off64_t len;
+
+ /* The file descriptor for the file. */
+ int fd;
+
+};
+
+/*L:210
+ * The Disk
+ *
+ * The disk only has one virtqueue, so it only has one thread. It is really
+ * simple: the Guest asks for a block number and we read or write that position
+ * in the file.
+ *
+ * Before we serviced each virtqueue in a separate thread, this was
+ * unacceptably slow: the Guest would wait until the read finished before
+ * running anything else, even if it could have been doing useful work.
+ *
+ * We could have used async I/O, except it's reputed to suck so hard that
+ * characters actually go missing from your code when you try to use it.
+ */
+static void blk_request(struct virtqueue *vq)
+{
+ struct vblk_info *vblk = vq->dev->priv;
+ unsigned int head, out_num, in_num, wlen;
+ int ret, i;
+ u8 *in;
+ struct virtio_blk_outhdr out;
+ struct iovec iov[vq->vring.num];
+ off64_t off;
+
+ /*
+ * Get the next request, where we normally wait. It triggers the
+ * interrupt to acknowledge previously serviced requests (if any).
+ */
+ head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
+
+ /* Copy the output header from the front of the iov (adjusts iov) */
+ iov_consume(iov, out_num, &out, sizeof(out));
+
+ /* Find and trim end of iov input array, for our status byte. */
+ in = NULL;
+ for (i = out_num + in_num - 1; i >= out_num; i--) {
+ if (iov[i].iov_len > 0) {
+ in = iov[i].iov_base + iov[i].iov_len - 1;
+ iov[i].iov_len--;
+ break;
+ }
+ }
+ if (!in)
+ errx(1, "Bad virtblk cmd with no room for status");
+
+ /*
+ * For historical reasons, block operations are expressed in 512 byte
+ * "sectors".
+ */
+ off = out.sector * 512;
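+ /* A request for sector 2048, for instance, is byte offset 1048576. */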
+
+ /*
+ * In general the virtio block driver is allowed to try SCSI commands.
+ * It'd be nice if we supported eject, for example, but we don't.
+ */
+ if (out.type & VIRTIO_BLK_T_SCSI_CMD) {
+ fprintf(stderr, "Scsi commands unsupported\n");
+ *in = VIRTIO_BLK_S_UNSUPP;
+ wlen = sizeof(*in);
+ } else if (out.type & VIRTIO_BLK_T_OUT) {
+ /*
+ * Write
+ *
+ * Move to the right location in the block file. This can fail
+ * if they try to write past end.
+ */
+ if (lseek64(vblk->fd, off, SEEK_SET) != off)
+ err(1, "Bad seek to sector %llu", out.sector);
+
+ ret = writev(vblk->fd, iov, out_num);
+ verbose("WRITE to sector %llu: %i\n", out.sector, ret);
+
+ /*
+ * Grr... Now we know how long the descriptor they sent was, we
+ * make sure they didn't try to write over the end of the block
+ * file (possibly extending it).
+ */
+ if (ret > 0 && off + ret > vblk->len) {
+ /* Trim it back to the correct length */
+ ftruncate64(vblk->fd, vblk->len);
+ /* Die, bad Guest, die. */
+ errx(1, "Write past end %llu+%u", off, ret);
+ }
+
+ wlen = sizeof(*in);
+ *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
+ } else if (out.type & VIRTIO_BLK_T_FLUSH) {
+ /* Flush */
+ ret = fdatasync(vblk->fd);
+ verbose("FLUSH fdatasync: %i\n", ret);
+ wlen = sizeof(*in);
+ *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR);
+ } else {
+ /*
+ * Read
+ *
+ * Move to the right location in the block file. This can fail
+ * if they try to read past end.
+ */
+ if (lseek64(vblk->fd, off, SEEK_SET) != off)
+ err(1, "Bad seek to sector %llu", out.sector);
+
+ ret = readv(vblk->fd, iov + out_num, in_num);
+ if (ret >= 0) {
+ wlen = sizeof(*in) + ret;
+ *in = VIRTIO_BLK_S_OK;
+ } else {
+ wlen = sizeof(*in);
+ *in = VIRTIO_BLK_S_IOERR;
+ }
+ }
+
+ /* Finished that request. */
+ add_used(vq, head, wlen);
+}
+
+/*L:198 This actually sets up a virtual block device. */
+static void setup_block_file(const char *filename)
+{
+ struct device *dev;
+ struct vblk_info *vblk;
+ struct virtio_blk_config conf;
+
+ /* Create the device. */
+ dev = new_device("block", VIRTIO_ID_BLOCK);
+
+ /* The device has one virtqueue, where the Guest places requests. */
+ add_virtqueue(dev, VIRTQUEUE_NUM, blk_request);
+
+ /* Allocate the room for our own bookkeeping */
+ vblk = dev->priv = malloc(sizeof(*vblk));
+
+ /* First we open the file and store the length. */
+ vblk->fd = open_or_die(filename, O_RDWR|O_LARGEFILE);
+ vblk->len = lseek64(vblk->fd, 0, SEEK_END);
+
+ /* We support FLUSH. */
+ add_feature(dev, VIRTIO_BLK_F_FLUSH);
+
+ /* Tell Guest how many sectors this device has. */
+ conf.capacity = cpu_to_le64(vblk->len / 512);
+
+ /*
+ * Tell Guest not to put in too many descriptors at once: two are used
+ * for the in and out elements.
+ */
+ add_feature(dev, VIRTIO_BLK_F_SEG_MAX);
+ conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2);
+
+ /* Don't try to put in the whole struct: config_len is only 8 bits. */
+ set_config(dev, offsetof(struct virtio_blk_config, geometry), &conf);
+
+ verbose("device %u: virtblock %llu sectors\n",
+ ++devices.device_num, le64_to_cpu(conf.capacity));
+}
+
+/*L:211
+ * Our random number generator device reads from /dev/random into the Guest's
+ * input buffers. The usual case is that the Guest doesn't want random numbers
+ * and so has no buffers although /dev/random is still readable, whereas
+ * console is the reverse.
+ *
+ * The same logic applies, however.
+ */
+struct rng_info {
+ int rfd;
+};
+
+static void rng_input(struct virtqueue *vq)
+{
+ int len;
+ unsigned int head, in_num, out_num, totlen = 0;
+ struct rng_info *rng_info = vq->dev->priv;
+ struct iovec iov[vq->vring.num];
+
+ /* First we need a buffer from the Guest's virtqueue. */
+ head = wait_for_vq_desc(vq, iov, &out_num, &in_num);
+ if (out_num)
+ errx(1, "Output buffers in rng?");
+
+ /*
+ * Just like the console write, we loop to cover the whole iovec.
+ * In this case, short reads actually happen quite a bit.
+ */
+ while (!iov_empty(iov, in_num)) {
+ len = readv(rng_info->rfd, iov, in_num);
+ if (len <= 0)
+ err(1, "Read from /dev/random gave %i", len);
+ iov_consume(iov, in_num, NULL, len);
+ totlen += len;
+ }
+
+ /* Tell the Guest about the new input. */
+ add_used(vq, head, totlen);
+}
+
+/*L:199
+ * This creates a "hardware" random number device for the Guest.
+ */
+static void setup_rng(void)
+{
+ struct device *dev;
+ struct rng_info *rng_info = malloc(sizeof(*rng_info));
+
+ /* Our device's private info simply contains the /dev/random fd. */
+ rng_info->rfd = open_or_die("/dev/random", O_RDONLY);
+
+ /* Create the new device. */
+ dev = new_device("rng", VIRTIO_ID_RNG);
+ dev->priv = rng_info;
+
+ /* The device has one virtqueue, where the Guest places inbufs. */
+ add_virtqueue(dev, VIRTQUEUE_NUM, rng_input);
+
+ verbose("device %u: rng\n", devices.device_num++);
+}
+/* That's the end of device setup. */
+
+/*L:230 Reboot is pretty easy: clean up and exec() the Launcher afresh. */
+static void __attribute__((noreturn)) restart_guest(void)
+{
+ unsigned int i;
+
+ /*
+ * Since we don't track all open fds, we simply close everything beyond
+ * stderr.
+ */
+ for (i = 3; i < FD_SETSIZE; i++)
+ close(i);
+
+ /* Reset all the devices (kills all threads). */
+ cleanup_devices();
+
+ execv(main_args[0], main_args);
+ err(1, "Could not exec %s", main_args[0]);
+}
+
+/*L:220
+ * Finally we reach the core of the Launcher which runs the Guest, serves
+ * its input and output, and finally, lays it to rest.
+ */
+static void __attribute__((noreturn)) run_guest(void)
+{
+ for (;;) {
+ unsigned long notify_addr;
+ int readval;
+
+ /* We read from the /dev/lguest device to run the Guest. */
+ readval = pread(lguest_fd, &notify_addr,
+ sizeof(notify_addr), cpu_id);
+
+ /* One unsigned long means the Guest did HCALL_NOTIFY */
+ if (readval == sizeof(notify_addr)) {
+ verbose("Notify on address %#lx\n", notify_addr);
+ handle_output(notify_addr);
+ /* ENOENT means the Guest died. Reading tells us why. */
+ } else if (errno == ENOENT) {
+ char reason[1024] = { 0 };
+ pread(lguest_fd, reason, sizeof(reason)-1, cpu_id);
+ errx(1, "%s", reason);
+ /* ERESTART means that we need to reboot the guest */
+ } else if (errno == ERESTART) {
+ restart_guest();
+ /* Anything else means a bug or incompatible change. */
+ } else
+ err(1, "Running guest failed");
+ }
+}
+/*L:240
+ * This is the end of the Launcher. The good news: we are over halfway
+ * through! The bad news: the most fiendish part of the code still lies ahead
+ * of us.
+ *
+ * Are you ready? Take a deep breath and join me in the core of the Host, in
+ * "make Host".
+:*/
+
+static struct option opts[] = {
+ { "verbose", 0, NULL, 'v' },
+ { "tunnet", 1, NULL, 't' },
+ { "block", 1, NULL, 'b' },
+ { "rng", 0, NULL, 'r' },
+ { "initrd", 1, NULL, 'i' },
+ { "username", 1, NULL, 'u' },
+ { "chroot", 1, NULL, 'c' },
+ { NULL },
+};
+static void usage(void)
+{
+ errx(1, "Usage: lguest [--verbose] "
+ "[--tunnet=(<ipaddr>:<macaddr>|bridge:<bridgename>:<macaddr>)\n"
+ "|--block=<filename>|--initrd=<filename>]...\n"
+ "<mem-in-mb> vmlinux [args...]");
+}
+
+/*L:105 The main routine is where the real work begins: */
+int main(int argc, char *argv[])
+{
+ /* Memory, code startpoint and size of the (optional) initrd. */
+ unsigned long mem = 0, start, initrd_size = 0;
+ /* Two temporaries. */
+ int i, c;
+ /* The boot information for the Guest. */
+ struct boot_params *boot;
+ /* If they specify an initrd file to load. */
+ const char *initrd_name = NULL;
+
+ /* Password structure for initgroups/setres[gu]id */
+ struct passwd *user_details = NULL;
+
+ /* Directory to chroot to */
+ char *chroot_path = NULL;
+
+ /* Save the args: we "reboot" by execing ourselves again. */
+ main_args = argv;
+
+ /*
+ * First we initialize the device list. We keep a pointer to the last
+ * device, and the next interrupt number to use for devices (1:
+ * remember that 0 is used by the timer).
+ */
+ devices.lastdev = NULL;
+ devices.next_irq = 1;
+
+ /* We're CPU 0. In fact, that's the only CPU possible right now. */
+ cpu_id = 0;
+
+ /*
+ * We need to know how much memory so we can set up the device
+ * descriptor and memory pages for the devices as we parse the command
+ * line. So we quickly look through the arguments to find the amount
+ * of memory now.
+ */
+ for (i = 1; i < argc; i++) {
+ if (argv[i][0] != '-') {
+ mem = atoi(argv[i]) * 1024 * 1024;
+ /*
+ * We start by mapping anonymous pages over all of
+ * guest-physical memory range. This fills it with 0,
+ * and ensures that the Guest won't be killed when it
+ * tries to access it.
+ */
+ guest_base = map_zeroed_pages(mem / getpagesize()
+ + DEVICE_PAGES);
+ guest_limit = mem;
+ guest_max = mem + DEVICE_PAGES*getpagesize();
+ devices.descpage = get_pages(1);
+ break;
+ }
+ }
+
+ /* The options are fairly straight-forward */
+ while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) {
+ switch (c) {
+ case 'v':
+ verbose = true;
+ break;
+ case 't':
+ setup_tun_net(optarg);
+ break;
+ case 'b':
+ setup_block_file(optarg);
+ break;
+ case 'r':
+ setup_rng();
+ break;
+ case 'i':
+ initrd_name = optarg;
+ break;
+ case 'u':
+ user_details = getpwnam(optarg);
+ if (!user_details)
+ err(1, "getpwnam failed, incorrect username?");
+ break;
+ case 'c':
+ chroot_path = optarg;
+ break;
+ default:
+ warnx("Unknown argument %s", argv[optind]);
+ usage();
+ }
+ }
+ /*
+ * After the other arguments we expect memory and kernel image name,
+ * followed by command line arguments for the kernel.
+ */
+ if (optind + 2 > argc)
+ usage();
+
+ verbose("Guest base is at %p\n", guest_base);
+
+ /* We always have a console device */
+ setup_console();
+
+ /* Now we load the kernel */
+ start = load_kernel(open_or_die(argv[optind+1], O_RDONLY));
+
+ /* Boot information is stashed at physical address 0 */
+ boot = from_guest_phys(0);
+
+ /* Map the initrd image if requested (at top of physical memory) */
+ if (initrd_name) {
+ initrd_size = load_initrd(initrd_name, mem);
+ /*
+ * These are the location in the Linux boot header where the
+ * start and size of the initrd are expected to be found.
+ */
+ boot->hdr.ramdisk_image = mem - initrd_size;
+ boot->hdr.ramdisk_size = initrd_size;
+ /* The bootloader type 0xFF means "unknown"; that's OK. */
+ boot->hdr.type_of_loader = 0xFF;
+ }
+
+ /*
+ * The Linux boot header contains an "E820" memory map: ours is a
+ * simple, single region.
+ */
+ boot->e820_entries = 1;
+ boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM });
+ /*
+ * The boot header contains a command line pointer: we put the command
+ * line after the boot header.
+ */
+ boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1);
+ /* We use a simple helper to copy the arguments separated by spaces. */
+ concat((char *)(boot + 1), argv+optind+2);
+
+ /* Set kernel alignment to 16M (CONFIG_PHYSICAL_ALIGN) */
+ boot->hdr.kernel_alignment = 0x1000000;
+
+ /* Boot protocol version: 2.07 supports the fields for lguest. */
+ boot->hdr.version = 0x207;
+
+ /* The hardware_subarch value of "1" tells the Guest it's an lguest. */
+ boot->hdr.hardware_subarch = 1;
+
+ /* Tell the entry path not to try to reload segment registers. */
+ boot->hdr.loadflags |= KEEP_SEGMENTS;
+
+ /* We tell the kernel to initialize the Guest. */
+ tell_kernel(start);
+
+ /* Ensure that we terminate if a device-servicing child dies. */
+ signal(SIGCHLD, kill_launcher);
+
+ /* If we exit via err(), this kills all the threads, restores tty. */
+ atexit(cleanup_devices);
+
+ /* If requested, chroot to a directory */
+ if (chroot_path) {
+ if (chroot(chroot_path) != 0)
+ err(1, "chroot(\"%s\") failed", chroot_path);
+
+ if (chdir("/") != 0)
+ err(1, "chdir(\"/\") failed");
+
+ verbose("chroot done\n");
+ }
+
+ /* If requested, drop privileges */
+ if (user_details) {
+ uid_t u;
+ gid_t g;
+
+ u = user_details->pw_uid;
+ g = user_details->pw_gid;
+
+ if (initgroups(user_details->pw_name, g) != 0)
+ err(1, "initgroups failed");
+
+ if (setresgid(g, g, g) != 0)
+ err(1, "setresgid failed");
+
+ if (setresuid(u, u, u) != 0)
+ err(1, "setresuid failed");
+
+ verbose("Dropping privileges completed\n");
+ }
+
+ /* Finally, run the Guest. This doesn't return. */
+ run_guest();
+}
+/*:*/
+
+/*M:999
+ * Mastery is done: you now know everything I do.
+ *
+ * But surely you have seen code, features and bugs in your wanderings which
+ * you now yearn to attack? That is the real game, and I look forward to you
+ * patching and forking lguest into the Your-Name-Here-visor.
+ *
+ * Farewell, and good coding!
+ * Rusty Russell.
+ */
diff --git a/tools/lguest/lguest.txt b/tools/lguest/lguest.txt
new file mode 100644
index 00000000000..06e1f464951
--- /dev/null
+++ b/tools/lguest/lguest.txt
@@ -0,0 +1,125 @@
+ __
+ (___()'`; Rusty's Remarkably Unreliable Guide to Lguest
+ /, /` - or, A Young Coder's Illustrated Hypervisor
+ \\"--\\ http://lguest.ozlabs.org
+
+Lguest is designed to be a minimal 32-bit x86 hypervisor for the Linux kernel,
+for Linux developers and users to experiment with virtualization with the
+minimum of complexity. Nonetheless, it should have sufficient features to
+make it useful for specific tasks, and, of course, you are encouraged to fork
+and enhance it (see drivers/lguest/README).
+
+Features:
+
+- Kernel module which runs in a normal kernel.
+- Simple I/O model for communication.
+- Simple program to create new guests.
+- Logo contains cute puppies: http://lguest.ozlabs.org
+
+Developer features:
+
+- Fun to hack on.
+- No ABI: being tied to a specific kernel anyway, you can change anything.
+- Many opportunities for improvement or feature implementation.
+
+Running Lguest:
+
+- The easiest way to run lguest is to use the same kernel as guest and host.
+ You can configure them differently, but usually it's easiest not to.
+
+ You will need to configure your kernel with the following options:
+
+ "Processor type and features":
+ "Paravirtualized guest support" = Y
+ "Lguest guest support" = Y
+ "High Memory Support" = off/4GB
+ "Alignment value to which kernel should be aligned" = 0x100000
+ (CONFIG_PARAVIRT=y, CONFIG_LGUEST_GUEST=y, CONFIG_HIGHMEM64G=n and
+ CONFIG_PHYSICAL_ALIGN=0x100000)
+
+ "Device Drivers":
+ "Block devices"
+ "Virtio block driver" = M/Y
+ "Network device support"
+ "Universal TUN/TAP device driver support" = M/Y
+ "Virtio network driver" = M/Y
+ (CONFIG_VIRTIO_BLK=m, CONFIG_VIRTIO_NET=m and CONFIG_TUN=m)
+
+ "Virtualization"
+ "Linux hypervisor example code" = M/Y
+ (CONFIG_LGUEST=m)
+
+- A tool called "lguest" is available in this directory: type "make"
+ to build it. If you didn't build your kernel in-tree, use "make
+ O=<builddir>".
+
+- Create or find a root disk image. There are several useful ones
+ around, such as the xm-test tiny root image at
+ http://xm-test.xensource.com/ramdisks/initrd-1.1-i386.img
+
+ For more serious work, I usually use a distribution ISO image and
+ install it under qemu, then make multiple copies:
+
+ dd if=/dev/zero of=rootfile bs=1M count=2048
+ qemu -cdrom image.iso -hda rootfile -net user -net nic -boot d
+
+ Make sure that you install a getty on /dev/hvc0 if you want to log in on the
+ console!
+
+- "modprobe lg" if you built it as a module.
+
+- Run an lguest as root:
+
+ tools/lguest/lguest 64 vmlinux --tunnet=192.168.19.1 \
+ --block=rootfile root=/dev/vda
+
+ Explanation:
+ 64: the amount of memory to use, in MB.
+
+ vmlinux: the kernel image found in the top of your build directory. You
+ can also use a standard bzImage.
+
+ --tunnet=192.168.19.1: configures a "tap" device for networking with this
+ IP address.
+
+ --block=rootfile: a file or block device which becomes /dev/vda
+ inside the guest.
+
+ root=/dev/vda: this (and anything else on the command line) are
+ kernel boot parameters.
+
+- Configuring networking. I usually have the host masquerade, using
+ "iptables -t nat -A POSTROUTING -o eth0 -j MASQUERADE" and "echo 1 >
+ /proc/sys/net/ipv4/ip_forward". In this example, I would configure
+ eth0 inside the guest at 192.168.19.2.
+
+ Another method is to bridge the tap device to an external interface
+ using --tunnet=bridge:<bridgename>, and perhaps run dhcp on the guest
+ to obtain an IP address. The bridge needs to be configured first:
+ this option simply adds the tap interface to it.
+
+ A simple example on my system:
+
+ ifconfig eth0 0.0.0.0
+ brctl addbr lg0
+ ifconfig lg0 up
+ brctl addif lg0 eth0
+ dhclient lg0
+
+ Then use --tunnet=bridge:lg0 when launching the guest.
+
+ See:
+
+ http://www.linuxfoundation.org/collaborate/workgroups/networking/bridge
+
+ for general information on how to get bridging to work.
+
+- Random number generation. Using the --rng option will provide a
+ /dev/hwrng in the guest that will read from the host's /dev/random.
+ Use this option in conjunction with rng-tools (see ../hw_random.txt)
+ to provide entropy to the guest kernel's /dev/random.
+
+There is a helpful mailing list at http://ozlabs.org/mailman/listinfo/lguest
+
+Good luck!
+Rusty Russell rusty@rustcorp.com.au.
diff --git a/tools/lib/api/Makefile b/tools/lib/api/Makefile
new file mode 100644
index 00000000000..ce00f7ee645
--- /dev/null
+++ b/tools/lib/api/Makefile
@@ -0,0 +1,44 @@
+include ../../scripts/Makefile.include
+include ../../perf/config/utilities.mak # QUIET_CLEAN
+
+CC = $(CROSS_COMPILE)gcc
+AR = $(CROSS_COMPILE)ar
+
+# guard against environment variables
+LIB_H=
+LIB_OBJS=
+
+LIB_H += fs/debugfs.h
+LIB_H += fs/fs.h
+
+LIB_OBJS += $(OUTPUT)fs/debugfs.o
+LIB_OBJS += $(OUTPUT)fs/fs.o
+
+LIBFILE = libapikfs.a
+
+CFLAGS = -ggdb3 -Wall -Wextra -std=gnu99 -Werror -O6 -D_FORTIFY_SOURCE=2 $(EXTRA_WARNINGS) $(EXTRA_CFLAGS) -fPIC
+EXTLIBS = -lelf -lpthread -lrt -lm
+ALL_CFLAGS = $(CFLAGS) $(BASIC_CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64
+ALL_LDFLAGS = $(LDFLAGS)
+
+RM = rm -f
+
+$(LIBFILE): $(LIB_OBJS)
+ $(QUIET_AR)$(RM) $@ && $(AR) rcs $(OUTPUT)$@ $(LIB_OBJS)
+
+$(LIB_OBJS): $(LIB_H)
+
+libapi_dirs:
+ $(QUIET_MKDIR)mkdir -p $(OUTPUT)fs/
+
+$(OUTPUT)%.o: %.c libapi_dirs
+ $(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) $<
+$(OUTPUT)%.s: %.c libapi_dirs
+ $(QUIET_CC)$(CC) -S $(ALL_CFLAGS) $<
+$(OUTPUT)%.o: %.S libapi_dirs
+ $(QUIET_CC)$(CC) -o $@ -c $(ALL_CFLAGS) $<
+
+clean:
+ $(call QUIET_CLEAN, libapi) $(RM) $(LIB_OBJS) $(LIBFILE)
+
+.PHONY: clean
diff --git a/tools/lib/api/fs/debugfs.c b/tools/lib/api/fs/debugfs.c
new file mode 100644
index 00000000000..a74fba6d774
--- /dev/null
+++ b/tools/lib/api/fs/debugfs.c
@@ -0,0 +1,100 @@
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <sys/vfs.h>
+#include <sys/mount.h>
+#include <linux/kernel.h>
+
+#include "debugfs.h"
+
+char debugfs_mountpoint[PATH_MAX + 1] = "/sys/kernel/debug";
+
+static const char * const debugfs_known_mountpoints[] = {
+ "/sys/kernel/debug",
+ "/debug",
+ 0,
+};
+
+static bool debugfs_found;
+
+/* find the path to the mounted debugfs */
+const char *debugfs_find_mountpoint(void)
+{
+ const char * const *ptr;
+ char type[100];
+ FILE *fp;
+
+ if (debugfs_found)
+ return (const char *)debugfs_mountpoint;
+
+ ptr = debugfs_known_mountpoints;
+ while (*ptr) {
+ if (debugfs_valid_mountpoint(*ptr) == 0) {
+ debugfs_found = true;
+ strcpy(debugfs_mountpoint, *ptr);
+ return debugfs_mountpoint;
+ }
+ ptr++;
+ }
+
+ /* give up and parse /proc/mounts */
+ fp = fopen("/proc/mounts", "r");
+ if (fp == NULL)
+ return NULL;
+
+ while (fscanf(fp, "%*s %" STR(PATH_MAX) "s %99s %*s %*d %*d\n",
+ debugfs_mountpoint, type) == 2) {
+ if (strcmp(type, "debugfs") == 0)
+ break;
+ }
+ fclose(fp);
+
+ if (strcmp(type, "debugfs") != 0)
+ return NULL;
+
+ debugfs_found = true;
+
+ return debugfs_mountpoint;
+}
+
+/* verify that a mountpoint is actually a debugfs instance */
+
+int debugfs_valid_mountpoint(const char *debugfs)
+{
+ struct statfs st_fs;
+
+ if (statfs(debugfs, &st_fs) < 0)
+ return -ENOENT;
+ else if (st_fs.f_type != (long) DEBUGFS_MAGIC)
+ return -ENOENT;
+
+ return 0;
+}
+
+/* mount the debugfs somewhere if it's not mounted */
+char *debugfs_mount(const char *mountpoint)
+{
+ /* see if it's already mounted */
+ if (debugfs_find_mountpoint())
+ goto out;
+
+ /* if not mounted and no argument */
+ if (mountpoint == NULL) {
+ /* see if environment variable set */
+ mountpoint = getenv(PERF_DEBUGFS_ENVIRONMENT);
+ /* if no environment variable, use default */
+ if (mountpoint == NULL)
+ mountpoint = "/sys/kernel/debug";
+ }
+
+ if (mount(NULL, mountpoint, "debugfs", 0, NULL) < 0)
+ return NULL;
+
+ /* save the mountpoint */
+ debugfs_found = true;
+ strncpy(debugfs_mountpoint, mountpoint, sizeof(debugfs_mountpoint));
+out:
+ return debugfs_mountpoint;
+}
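
A minimal usage sketch for the API above, assuming a tools/ program built with -I pointing at tools/lib/api (so "fs/debugfs.h" resolves to the header added just below) and linked against the libapikfs.a produced by the Makefile above; the error handling and messages are illustrative only:

    /* Locate debugfs, mounting it at the default location if needed. */
    #include <stdio.h>
    #include "fs/debugfs.h"

    int main(void)
    {
        const char *path = debugfs_find_mountpoint();

        if (path == NULL) {
            /* Not mounted yet; mounting usually requires root. */
            path = debugfs_mount(NULL);
            if (path == NULL) {
                fprintf(stderr, "debugfs not available\n");
                return 1;
            }
        }

        printf("debugfs is at %s\n", path);
        return 0;
    }

Passing NULL to debugfs_mount() falls back to $PERF_DEBUGFS_DIR and then /sys/kernel/debug, matching the logic in debugfs_mount() above.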
diff --git a/tools/lib/api/fs/debugfs.h b/tools/lib/api/fs/debugfs.h
new file mode 100644
index 00000000000..f19d3df9609
--- /dev/null
+++ b/tools/lib/api/fs/debugfs.h
@@ -0,0 +1,29 @@
+#ifndef __API_DEBUGFS_H__
+#define __API_DEBUGFS_H__
+
+#define _STR(x) #x
+#define STR(x) _STR(x)
+
+/*
+ * On most systems <limits.h> would have given us this, but not on some systems
+ * (e.g. GNU/Hurd).
+ */
+#ifndef PATH_MAX
+#define PATH_MAX 4096
+#endif
+
+#ifndef DEBUGFS_MAGIC
+#define DEBUGFS_MAGIC 0x64626720
+#endif
+
+#ifndef PERF_DEBUGFS_ENVIRONMENT
+#define PERF_DEBUGFS_ENVIRONMENT "PERF_DEBUGFS_DIR"
+#endif
+
+const char *debugfs_find_mountpoint(void);
+int debugfs_valid_mountpoint(const char *debugfs);
+char *debugfs_mount(const char *mountpoint);
+
+extern char debugfs_mountpoint[];
+
+#endif /* __API_DEBUGFS_H__ */
diff --git a/tools/lib/api/fs/fs.c b/tools/lib/api/fs/fs.c
new file mode 100644
index 00000000000..c1b49c36a95
--- /dev/null
+++ b/tools/lib/api/fs/fs.c
@@ -0,0 +1,165 @@
+/* TODO merge/factor in debugfs.c here */
+
+#include <ctype.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/vfs.h>
+
+#include "debugfs.h"
+#include "fs.h"
+
+static const char * const sysfs__fs_known_mountpoints[] = {
+ "/sys",
+ 0,
+};
+
+static const char * const procfs__known_mountpoints[] = {
+ "/proc",
+ 0,
+};
+
+struct fs {
+ const char *name;
+ const char * const *mounts;
+ char path[PATH_MAX + 1];
+ bool found;
+ long magic;
+};
+
+enum {
+ FS__SYSFS = 0,
+ FS__PROCFS = 1,
+};
+
+static struct fs fs__entries[] = {
+ [FS__SYSFS] = {
+ .name = "sysfs",
+ .mounts = sysfs__fs_known_mountpoints,
+ .magic = SYSFS_MAGIC,
+ },
+ [FS__PROCFS] = {
+ .name = "proc",
+ .mounts = procfs__known_mountpoints,
+ .magic = PROC_SUPER_MAGIC,
+ },
+};
+
+static bool fs__read_mounts(struct fs *fs)
+{
+ bool found = false;
+ char type[100];
+ FILE *fp;
+
+ fp = fopen("/proc/mounts", "r");
+ if (fp == NULL)
+		return false;
+
+ while (!found &&
+ fscanf(fp, "%*s %" STR(PATH_MAX) "s %99s %*s %*d %*d\n",
+ fs->path, type) == 2) {
+
+ if (strcmp(type, fs->name) == 0)
+ found = true;
+ }
+
+ fclose(fp);
+ return fs->found = found;
+}
+
+static int fs__valid_mount(const char *fs, long magic)
+{
+ struct statfs st_fs;
+
+ if (statfs(fs, &st_fs) < 0)
+ return -ENOENT;
+ else if (st_fs.f_type != magic)
+ return -ENOENT;
+
+ return 0;
+}
+
+static bool fs__check_mounts(struct fs *fs)
+{
+ const char * const *ptr;
+
+ ptr = fs->mounts;
+ while (*ptr) {
+ if (fs__valid_mount(*ptr, fs->magic) == 0) {
+ fs->found = true;
+ strcpy(fs->path, *ptr);
+ return true;
+ }
+ ptr++;
+ }
+
+ return false;
+}
+
+static void mem_toupper(char *f, size_t len)
+{
+ while (len) {
+ *f = toupper(*f);
+ f++;
+ len--;
+ }
+}
+
+/*
+ * Check for "NAME_PATH" environment variable to override fs location (for
+ * testing). This matches the recommendation in Documentation/sysfs-rules.txt
+ * for SYSFS_PATH.
+ */
+static bool fs__env_override(struct fs *fs)
+{
+ char *override_path;
+ size_t name_len = strlen(fs->name);
+ /* name + "_PATH" + '\0' */
+ char upper_name[name_len + 5 + 1];
+ memcpy(upper_name, fs->name, name_len);
+ mem_toupper(upper_name, name_len);
+ strcpy(&upper_name[name_len], "_PATH");
+
+ override_path = getenv(upper_name);
+ if (!override_path)
+ return false;
+
+ fs->found = true;
+ strncpy(fs->path, override_path, sizeof(fs->path));
+ return true;
+}
+
+static const char *fs__get_mountpoint(struct fs *fs)
+{
+ if (fs__env_override(fs))
+ return fs->path;
+
+ if (fs__check_mounts(fs))
+ return fs->path;
+
+ if (fs__read_mounts(fs))
+ return fs->path;
+
+ return NULL;
+}
+
+static const char *fs__mountpoint(int idx)
+{
+ struct fs *fs = &fs__entries[idx];
+
+ if (fs->found)
+ return (const char *)fs->path;
+
+ return fs__get_mountpoint(fs);
+}
+
+#define FS__MOUNTPOINT(name, idx) \
+const char *name##__mountpoint(void) \
+{ \
+ return fs__mountpoint(idx); \
+}
+
+FS__MOUNTPOINT(sysfs, FS__SYSFS);
+FS__MOUNTPOINT(procfs, FS__PROCFS);
diff --git a/tools/lib/api/fs/fs.h b/tools/lib/api/fs/fs.h
new file mode 100644
index 00000000000..cb7049551f3
--- /dev/null
+++ b/tools/lib/api/fs/fs.h
@@ -0,0 +1,14 @@
+#ifndef __API_FS__
+#define __API_FS__
+
+#ifndef SYSFS_MAGIC
+#define SYSFS_MAGIC 0x62656572
+#endif
+
+#ifndef PROC_SUPER_MAGIC
+#define PROC_SUPER_MAGIC 0x9fa0
+#endif
+
+const char *sysfs__mountpoint(void);
+const char *procfs__mountpoint(void);
+#endif /* __API_FS__ */
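
A similar sketch for the sysfs/procfs helpers declared above, under the same build assumptions (tools/ include paths, linked against libapikfs.a); the cpu0 path is just an example. Note that, per the comment above fs__env_override() in fs.c, an environment variable named after the filesystem ("SYSFS_PATH", and by the same rule "PROC_PATH") overrides the detected location, which is handy for tests:

    /* Build a sysfs path without hard-coding "/sys". */
    #include <stdio.h>
    #include "fs/fs.h"

    int main(void)
    {
        const char *sysfs = sysfs__mountpoint();

        if (sysfs == NULL) {
            fprintf(stderr, "sysfs not found\n");
            return 1;
        }

        printf("%s/devices/system/cpu/cpu0\n", sysfs);
        return 0;
    }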
diff --git a/tools/lib/lockdep/Makefile b/tools/lib/lockdep/Makefile
new file mode 100644
index 00000000000..52f9279c6c1
--- /dev/null
+++ b/tools/lib/lockdep/Makefile
@@ -0,0 +1,243 @@
+# file format version
+FILE_VERSION = 1
+
+LIBLOCKDEP_VERSION=$(shell make --no-print-directory -sC ../../.. kernelversion)
+
+# Makefiles suck: This macro sets a default value of $(2) for the
+# variable named by $(1), unless the variable has been set by
+# environment or command line. This is necessary for CC and AR
+# because make sets default values, so the simpler ?= approach
+# won't work as expected.
+define allow-override
+ $(if $(or $(findstring environment,$(origin $(1))),\
+ $(findstring command line,$(origin $(1)))),,\
+ $(eval $(1) = $(2)))
+endef
+
+# Allow setting CC and AR, or setting CROSS_COMPILE as a prefix.
+$(call allow-override,CC,$(CROSS_COMPILE)gcc)
+$(call allow-override,AR,$(CROSS_COMPILE)ar)
+
+INSTALL = install
+
+# Use DESTDIR for installing into a different root directory.
+# This is useful for building a package. The program will be
+# installed in this directory as if it was the root directory.
+# Then the build tool can move it later.
+DESTDIR ?=
+DESTDIR_SQ = '$(subst ','\'',$(DESTDIR))'
+
+prefix ?= /usr/local
+libdir_relative = lib
+libdir = $(prefix)/$(libdir_relative)
+bindir_relative = bin
+bindir = $(prefix)/$(bindir_relative)
+
+export DESTDIR DESTDIR_SQ INSTALL
+
+# copy a bit from Linux kbuild
+
+ifeq ("$(origin V)", "command line")
+ VERBOSE = $(V)
+endif
+ifndef VERBOSE
+ VERBOSE = 0
+endif
+
+ifeq ("$(origin O)", "command line")
+ BUILD_OUTPUT := $(O)
+endif
+
+ifeq ($(BUILD_SRC),)
+ifneq ($(BUILD_OUTPUT),)
+
+define build_output
+ $(if $(VERBOSE:1=),@)$(MAKE) -C $(BUILD_OUTPUT) \
+ BUILD_SRC=$(CURDIR) -f $(CURDIR)/Makefile $1
+endef
+
+saved-output := $(BUILD_OUTPUT)
+BUILD_OUTPUT := $(shell cd $(BUILD_OUTPUT) && /bin/pwd)
+$(if $(BUILD_OUTPUT),, \
+ $(error output directory "$(saved-output)" does not exist))
+
+all: sub-make
+
+gui: force
+ $(call build_output, all_cmd)
+
+$(filter-out gui,$(MAKECMDGOALS)): sub-make
+
+sub-make: force
+ $(call build_output, $(MAKECMDGOALS))
+
+
+# Leave processing to above invocation of make
+skip-makefile := 1
+
+endif # BUILD_OUTPUT
+endif # BUILD_SRC
+
+# We process the rest of the Makefile if this is the final invocation of make
+ifeq ($(skip-makefile),)
+
+srctree := $(realpath $(if $(BUILD_SRC),$(BUILD_SRC),$(CURDIR)))
+objtree := $(realpath $(CURDIR))
+src := $(srctree)
+obj := $(objtree)
+
+export prefix libdir bindir src obj
+
+# Shell quotes
+libdir_SQ = $(subst ','\'',$(libdir))
+bindir_SQ = $(subst ','\'',$(bindir))
+
+LIB_FILE = liblockdep.a liblockdep.so.$(LIBLOCKDEP_VERSION)
+BIN_FILE = lockdep
+
+CONFIG_INCLUDES =
+CONFIG_LIBS =
+CONFIG_FLAGS =
+
+OBJ = $@
+N =
+
+export Q VERBOSE
+
+INCLUDES = -I. -I/usr/local/include -I./uinclude -I./include -I../../include $(CONFIG_INCLUDES)
+
+# Set compile option CFLAGS if not set elsewhere
+CFLAGS ?= -g -DCONFIG_LOCKDEP -DCONFIG_STACKTRACE -DCONFIG_PROVE_LOCKING -DBITS_PER_LONG=__WORDSIZE -DLIBLOCKDEP_VERSION='"$(LIBLOCKDEP_VERSION)"' -rdynamic -O0 -g
+
+override CFLAGS += $(CONFIG_FLAGS) $(INCLUDES) $(PLUGIN_DIR_SQ)
+
+ifeq ($(VERBOSE),1)
+ Q =
+ print_compile =
+ print_app_build =
+ print_fpic_compile =
+ print_shared_lib_compile =
+ print_install =
+else
+ Q = @
+ print_compile = echo ' CC '$(OBJ);
+ print_app_build = echo ' BUILD '$(OBJ);
+ print_fpic_compile = echo ' CC FPIC '$(OBJ);
+ print_shared_lib_compile = echo ' BUILD SHARED LIB '$(OBJ);
+ print_static_lib_build = echo ' BUILD STATIC LIB '$(OBJ);
+ print_install = echo ' INSTALL '$1' to $(DESTDIR_SQ)$2';
+endif
+
+do_fpic_compile = \
+ ($(print_fpic_compile) \
+ $(CC) -c $(CFLAGS) $(EXT) -fPIC $< -o $@)
+
+do_app_build = \
+ ($(print_app_build) \
+ $(CC) $^ -rdynamic -o $@ $(CONFIG_LIBS) $(LIBS))
+
+do_compile_shared_library = \
+ ($(print_shared_lib_compile) \
+ $(CC) --shared $^ -o $@ -lpthread -ldl -Wl,-soname='"$@"';$(shell ln -s $@ liblockdep.so))
+
+do_build_static_lib = \
+ ($(print_static_lib_build) \
+ $(RM) $@; $(AR) rcs $@ $^)
+
+
+define do_compile
+ $(print_compile) \
+ $(CC) -c $(CFLAGS) $(EXT) $< -o $(obj)/$@;
+endef
+
+$(obj)/%.o: $(src)/%.c
+ $(Q)$(call do_compile)
+
+%.o: $(src)/%.c
+ $(Q)$(call do_compile)
+
+PEVENT_LIB_OBJS = common.o lockdep.o preload.o rbtree.o
+
+ALL_OBJS = $(PEVENT_LIB_OBJS)
+
+CMD_TARGETS = $(LIB_FILE)
+
+TARGETS = $(CMD_TARGETS)
+
+
+all: all_cmd
+
+all_cmd: $(CMD_TARGETS)
+
+liblockdep.so.$(LIBLOCKDEP_VERSION): $(PEVENT_LIB_OBJS)
+ $(Q)$(do_compile_shared_library)
+
+liblockdep.a: $(PEVENT_LIB_OBJS)
+ $(Q)$(do_build_static_lib)
+
+$(PEVENT_LIB_OBJS): %.o: $(src)/%.c
+ $(Q)$(do_fpic_compile)
+
+## make deps
+
+all_objs := $(sort $(ALL_OBJS))
+all_deps := $(all_objs:%.o=.%.d)
+
+# let .d file also depends on the source and header files
+define check_deps
+ @set -e; $(RM) $@; \
+ $(CC) -MM $(CFLAGS) $< > $@.$$$$; \
+ sed 's,\($*\)\.o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \
+ $(RM) $@.$$$$
+endef
+
+$(all_deps): .%.d: $(src)/%.c
+ $(Q)$(call check_deps)
+
+$(all_objs) : %.o : .%.d
+
+dep_includes := $(wildcard $(all_deps))
+
+ifneq ($(dep_includes),)
+ include $(dep_includes)
+endif
+
+### Detect environment changes
+TRACK_CFLAGS = $(subst ','\'',$(CFLAGS)):$(ARCH):$(CROSS_COMPILE)
+
+tags: force
+ $(RM) tags
+ find . -name '*.[ch]' | xargs ctags --extra=+f --c-kinds=+px \
+ --regex-c++='/_PE\(([^,)]*).*/PEVENT_ERRNO__\1/'
+
+TAGS: force
+ $(RM) TAGS
+ find . -name '*.[ch]' | xargs etags \
+ --regex='/_PE(\([^,)]*\).*/PEVENT_ERRNO__\1/'
+
+define do_install
+ $(print_install) \
+ if [ ! -d '$(DESTDIR_SQ)$2' ]; then \
+ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$2'; \
+ fi; \
+ $(INSTALL) $1 '$(DESTDIR_SQ)$2'
+endef
+
+install_lib: all_cmd
+ $(Q)$(call do_install,$(LIB_FILE),$(libdir_SQ))
+ $(Q)$(call do_install,$(BIN_FILE),$(bindir_SQ))
+
+install: install_lib
+
+clean:
+ $(RM) *.o *~ $(TARGETS) *.a *liblockdep*.so* $(VERSION_FILES) .*.d
+ $(RM) tags TAGS
+
+endif # skip-makefile
+
+PHONY += force
+force:
+
+# Declare the contents of the .PHONY variable as phony. We keep that
+# information in a variable so we can use it in if_changed and friends.
+.PHONY: $(PHONY)
diff --git a/tools/lib/lockdep/common.c b/tools/lib/lockdep/common.c
new file mode 100644
index 00000000000..8ef602f18a3
--- /dev/null
+++ b/tools/lib/lockdep/common.c
@@ -0,0 +1,33 @@
+#include <stddef.h>
+#include <stdbool.h>
+#include <linux/compiler.h>
+#include <linux/lockdep.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+
+static __thread struct task_struct current_obj;
+
+/* lockdep wants these */
+bool debug_locks = true;
+bool debug_locks_silent;
+
+__attribute__((constructor)) static void liblockdep_init(void)
+{
+ lockdep_init();
+}
+
+__attribute__((destructor)) static void liblockdep_exit(void)
+{
+ debug_check_no_locks_held(&current_obj);
+}
+
+struct task_struct *__curr(void)
+{
+ if (current_obj.pid == 0) {
+ /* Makes lockdep output pretty */
+ prctl(PR_GET_NAME, current_obj.comm);
+ current_obj.pid = syscall(__NR_gettid);
+ }
+
+ return &current_obj;
+}
diff --git a/tools/lib/lockdep/include/liblockdep/common.h b/tools/lib/lockdep/include/liblockdep/common.h
new file mode 100644
index 00000000000..0bda630027c
--- /dev/null
+++ b/tools/lib/lockdep/include/liblockdep/common.h
@@ -0,0 +1,50 @@
+#ifndef _LIBLOCKDEP_COMMON_H
+#define _LIBLOCKDEP_COMMON_H
+
+#include <pthread.h>
+
+#define NR_LOCKDEP_CACHING_CLASSES 2
+#define MAX_LOCKDEP_SUBCLASSES 8UL
+
+#ifndef CALLER_ADDR0
+#define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
+#endif
+
+#ifndef _RET_IP_
+#define _RET_IP_ CALLER_ADDR0
+#endif
+
+#ifndef _THIS_IP_
+#define _THIS_IP_ ({ __label__ __here; __here: (unsigned long)&&__here; })
+#endif
+
+struct lockdep_subclass_key {
+ char __one_byte;
+};
+
+struct lock_class_key {
+ struct lockdep_subclass_key subkeys[MAX_LOCKDEP_SUBCLASSES];
+};
+
+struct lockdep_map {
+ struct lock_class_key *key;
+ struct lock_class *class_cache[NR_LOCKDEP_CACHING_CLASSES];
+ const char *name;
+#ifdef CONFIG_LOCK_STAT
+ int cpu;
+ unsigned long ip;
+#endif
+};
+
+void lockdep_init_map(struct lockdep_map *lock, const char *name,
+ struct lock_class_key *key, int subclass);
+void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
+ int trylock, int read, int check,
+ struct lockdep_map *nest_lock, unsigned long ip);
+void lock_release(struct lockdep_map *lock, int nested,
+ unsigned long ip);
+
+#define STATIC_LOCKDEP_MAP_INIT(_name, _key) \
+ { .name = (_name), .key = (void *)(_key), }
+
+#endif
diff --git a/tools/lib/lockdep/include/liblockdep/mutex.h b/tools/lib/lockdep/include/liblockdep/mutex.h
new file mode 100644
index 00000000000..ee53a42818c
--- /dev/null
+++ b/tools/lib/lockdep/include/liblockdep/mutex.h
@@ -0,0 +1,70 @@
+#ifndef _LIBLOCKDEP_MUTEX_H
+#define _LIBLOCKDEP_MUTEX_H
+
+#include <pthread.h>
+#include "common.h"
+
+struct liblockdep_pthread_mutex {
+ pthread_mutex_t mutex;
+ struct lockdep_map dep_map;
+};
+
+typedef struct liblockdep_pthread_mutex liblockdep_pthread_mutex_t;
+
+#define LIBLOCKDEP_PTHREAD_MUTEX_INITIALIZER(mtx) \
+ (const struct liblockdep_pthread_mutex) { \
+ .mutex = PTHREAD_MUTEX_INITIALIZER, \
+ .dep_map = STATIC_LOCKDEP_MAP_INIT(#mtx, &((&(mtx))->dep_map)), \
+}
+
+static inline int __mutex_init(liblockdep_pthread_mutex_t *lock,
+ const char *name,
+ struct lock_class_key *key,
+ const pthread_mutexattr_t *__mutexattr)
+{
+ lockdep_init_map(&lock->dep_map, name, key, 0);
+ return pthread_mutex_init(&lock->mutex, __mutexattr);
+}
+
+#define liblockdep_pthread_mutex_init(mutex, mutexattr) \
+({ \
+ static struct lock_class_key __key; \
+ \
+ __mutex_init((mutex), #mutex, &__key, (mutexattr)); \
+})
+
+static inline int liblockdep_pthread_mutex_lock(liblockdep_pthread_mutex_t *lock)
+{
+ lock_acquire(&lock->dep_map, 0, 0, 0, 1, NULL, (unsigned long)_RET_IP_);
+ return pthread_mutex_lock(&lock->mutex);
+}
+
+static inline int liblockdep_pthread_mutex_unlock(liblockdep_pthread_mutex_t *lock)
+{
+ lock_release(&lock->dep_map, 0, (unsigned long)_RET_IP_);
+ return pthread_mutex_unlock(&lock->mutex);
+}
+
+static inline int liblockdep_pthread_mutex_trylock(liblockdep_pthread_mutex_t *lock)
+{
+ lock_acquire(&lock->dep_map, 0, 1, 0, 1, NULL, (unsigned long)_RET_IP_);
+ return pthread_mutex_trylock(&lock->mutex) == 0 ? 1 : 0;
+}
+
+static inline int liblockdep_pthread_mutex_destroy(liblockdep_pthread_mutex_t *lock)
+{
+ return pthread_mutex_destroy(&lock->mutex);
+}
+
+#ifdef __USE_LIBLOCKDEP
+
+#define pthread_mutex_t liblockdep_pthread_mutex_t
+#define pthread_mutex_init liblockdep_pthread_mutex_init
+#define pthread_mutex_lock liblockdep_pthread_mutex_lock
+#define pthread_mutex_unlock liblockdep_pthread_mutex_unlock
+#define pthread_mutex_trylock liblockdep_pthread_mutex_trylock
+#define pthread_mutex_destroy liblockdep_pthread_mutex_destroy
+
+#endif
+
+#endif
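
The tests added further down (ABBA.c and friends) reach these wrappers through the __USE_LIBLOCKDEP name remapping; the sketch below calls the liblockdep_-prefixed API directly instead. It is an illustration only, built the same way run_tests.sh below builds the tests (with -Iinclude and liblockdep.a); the second, inverted ordering is what should trigger a lockdep report:

    #include <liblockdep/mutex.h>

    int main(void)
    {
        liblockdep_pthread_mutex_t a, b;

        liblockdep_pthread_mutex_init(&a, NULL);
        liblockdep_pthread_mutex_init(&b, NULL);

        /* Take a then b: records the dependency a -> b. */
        liblockdep_pthread_mutex_lock(&a);
        liblockdep_pthread_mutex_lock(&b);
        liblockdep_pthread_mutex_unlock(&b);
        liblockdep_pthread_mutex_unlock(&a);

        /* Now b then a: the inversion lockdep warns about. */
        liblockdep_pthread_mutex_lock(&b);
        liblockdep_pthread_mutex_lock(&a);
        liblockdep_pthread_mutex_unlock(&a);
        liblockdep_pthread_mutex_unlock(&b);

        liblockdep_pthread_mutex_destroy(&b);
        liblockdep_pthread_mutex_destroy(&a);
        return 0;
    }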
diff --git a/tools/lib/lockdep/include/liblockdep/rwlock.h b/tools/lib/lockdep/include/liblockdep/rwlock.h
new file mode 100644
index 00000000000..4ec03f86155
--- /dev/null
+++ b/tools/lib/lockdep/include/liblockdep/rwlock.h
@@ -0,0 +1,86 @@
+#ifndef _LIBLOCKDEP_RWLOCK_H
+#define _LIBLOCKDEP_RWLOCK_H
+
+#include <pthread.h>
+#include "common.h"
+
+struct liblockdep_pthread_rwlock {
+ pthread_rwlock_t rwlock;
+ struct lockdep_map dep_map;
+};
+
+typedef struct liblockdep_pthread_rwlock liblockdep_pthread_rwlock_t;
+
+#define LIBLOCKDEP_PTHREAD_RWLOCK_INITIALIZER(rwl) \
+ (struct liblockdep_pthread_rwlock) { \
+ .rwlock = PTHREAD_RWLOCK_INITIALIZER, \
+ .dep_map = STATIC_LOCKDEP_MAP_INIT(#rwl, &((&(rwl))->dep_map)), \
+}
+
+static inline int __rwlock_init(liblockdep_pthread_rwlock_t *lock,
+ const char *name,
+ struct lock_class_key *key,
+ const pthread_rwlockattr_t *attr)
+{
+ lockdep_init_map(&lock->dep_map, name, key, 0);
+
+ return pthread_rwlock_init(&lock->rwlock, attr);
+}
+
+#define liblockdep_pthread_rwlock_init(lock, attr) \
+({ \
+ static struct lock_class_key __key; \
+ \
+ __rwlock_init((lock), #lock, &__key, (attr)); \
+})
+
+static inline int liblockdep_pthread_rwlock_rdlock(liblockdep_pthread_rwlock_t *lock)
+{
+ lock_acquire(&lock->dep_map, 0, 0, 2, 1, NULL, (unsigned long)_RET_IP_);
+ return pthread_rwlock_rdlock(&lock->rwlock);
+
+}
+
+static inline int liblockdep_pthread_rwlock_unlock(liblockdep_pthread_rwlock_t *lock)
+{
+ lock_release(&lock->dep_map, 0, (unsigned long)_RET_IP_);
+ return pthread_rwlock_unlock(&lock->rwlock);
+}
+
+static inline int liblockdep_pthread_rwlock_wrlock(liblockdep_pthread_rwlock_t *lock)
+{
+ lock_acquire(&lock->dep_map, 0, 0, 0, 1, NULL, (unsigned long)_RET_IP_);
+ return pthread_rwlock_wrlock(&lock->rwlock);
+}
+
+static inline int liblockdep_pthread_rwlock_tryrdlock(liblockdep_pthread_rwlock_t *lock)
+{
+ lock_acquire(&lock->dep_map, 0, 1, 2, 1, NULL, (unsigned long)_RET_IP_);
+ return pthread_rwlock_tryrdlock(&lock->rwlock) == 0 ? 1 : 0;
+}
+
+static inline int liblockdep_pthread_rwlock_trywrlock(liblockdep_pthread_rwlock_t *lock)
+{
+	lock_acquire(&lock->dep_map, 0, 1, 0, 1, NULL, (unsigned long)_RET_IP_);
+	return pthread_rwlock_trywrlock(&lock->rwlock) == 0 ? 1 : 0;
+}
+
+static inline int liblockdep_rwlock_destroy(liblockdep_pthread_rwlock_t *lock)
+{
+ return pthread_rwlock_destroy(&lock->rwlock);
+}
+
+#ifdef __USE_LIBLOCKDEP
+
+#define pthread_rwlock_t liblockdep_pthread_rwlock_t
+#define pthread_rwlock_init liblockdep_pthread_rwlock_init
+#define pthread_rwlock_rdlock liblockdep_pthread_rwlock_rdlock
+#define pthread_rwlock_unlock liblockdep_pthread_rwlock_unlock
+#define pthread_rwlock_wrlock liblockdep_pthread_rwlock_wrlock
+#define pthread_rwlock_tryrdlock liblockdep_pthread_rwlock_tryrdlock
+#define pthread_rwlock_trywrlock liblockdep_pthread_rwlock_trywrlock
+#define pthread_rwlock_destroy liblockdep_rwlock_destroy
+
+#endif
+
+#endif
diff --git a/tools/lib/lockdep/lockdep b/tools/lib/lockdep/lockdep
new file mode 100755
index 00000000000..49af9fe19f5
--- /dev/null
+++ b/tools/lib/lockdep/lockdep
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+LD_PRELOAD="./liblockdep.so $LD_PRELOAD" "$@"
diff --git a/tools/lib/lockdep/lockdep.c b/tools/lib/lockdep/lockdep.c
new file mode 100644
index 00000000000..f42b7e9aa48
--- /dev/null
+++ b/tools/lib/lockdep/lockdep.c
@@ -0,0 +1,2 @@
+#include <linux/lockdep.h>
+#include "../../../kernel/locking/lockdep.c"
diff --git a/tools/lib/lockdep/lockdep_internals.h b/tools/lib/lockdep/lockdep_internals.h
new file mode 100644
index 00000000000..29d0c954cc2
--- /dev/null
+++ b/tools/lib/lockdep/lockdep_internals.h
@@ -0,0 +1 @@
+#include "../../../kernel/locking/lockdep_internals.h"
diff --git a/tools/lib/lockdep/lockdep_states.h b/tools/lib/lockdep/lockdep_states.h
new file mode 100644
index 00000000000..248d235efda
--- /dev/null
+++ b/tools/lib/lockdep/lockdep_states.h
@@ -0,0 +1 @@
+#include "../../../kernel/locking/lockdep_states.h"
diff --git a/tools/lib/lockdep/preload.c b/tools/lib/lockdep/preload.c
new file mode 100644
index 00000000000..6f803609e49
--- /dev/null
+++ b/tools/lib/lockdep/preload.c
@@ -0,0 +1,445 @@
+#define _GNU_SOURCE
+#include <pthread.h>
+#include <stdio.h>
+#include <dlfcn.h>
+#include <stdlib.h>
+#include <sysexits.h>
+#include "include/liblockdep/mutex.h"
+#include "../../../include/linux/rbtree.h"
+
+/**
+ * struct lock_lookup - liblockdep's view of a single unique lock
+ * @orig: pointer to the original pthread lock, used for lookups
+ * @dep_map: lockdep's dep_map structure
+ * @key: lockdep's key structure
+ * @node: rb-tree node used to store the lock in a global tree
+ * @name: a unique name for the lock
+ */
+struct lock_lookup {
+ void *orig; /* Original pthread lock, used for lookups */
+ struct lockdep_map dep_map; /* Since all locks are dynamic, we need
+ * a dep_map and a key for each lock */
+ /*
+ * Wait, there's no support for key classes? Yup :(
+ * Most big projects wrap the pthread api with their own calls to
+ * be compatible with different locking methods. This means that
+	 * "classes" will be broken since the function that creates all
+ * locks will point to a generic locking function instead of the
+ * actual code that wants to do the locking.
+ */
+ struct lock_class_key key;
+ struct rb_node node;
+#define LIBLOCKDEP_MAX_LOCK_NAME 22
+ char name[LIBLOCKDEP_MAX_LOCK_NAME];
+};
+
+/* This is where we store our locks */
+static struct rb_root locks = RB_ROOT;
+static pthread_rwlock_t locks_rwlock = PTHREAD_RWLOCK_INITIALIZER;
+
+/* pthread mutex API */
+
+#ifdef __GLIBC__
+extern int __pthread_mutex_init(pthread_mutex_t *mutex, const pthread_mutexattr_t *attr);
+extern int __pthread_mutex_lock(pthread_mutex_t *mutex);
+extern int __pthread_mutex_trylock(pthread_mutex_t *mutex);
+extern int __pthread_mutex_unlock(pthread_mutex_t *mutex);
+extern int __pthread_mutex_destroy(pthread_mutex_t *mutex);
+#else
+#define __pthread_mutex_init NULL
+#define __pthread_mutex_lock NULL
+#define __pthread_mutex_trylock NULL
+#define __pthread_mutex_unlock NULL
+#define __pthread_mutex_destroy NULL
+#endif
+static int (*ll_pthread_mutex_init)(pthread_mutex_t *mutex,
+ const pthread_mutexattr_t *attr) = __pthread_mutex_init;
+static int (*ll_pthread_mutex_lock)(pthread_mutex_t *mutex) = __pthread_mutex_lock;
+static int (*ll_pthread_mutex_trylock)(pthread_mutex_t *mutex) = __pthread_mutex_trylock;
+static int (*ll_pthread_mutex_unlock)(pthread_mutex_t *mutex) = __pthread_mutex_unlock;
+static int (*ll_pthread_mutex_destroy)(pthread_mutex_t *mutex) = __pthread_mutex_destroy;
+
+/* pthread rwlock API */
+
+#ifdef __GLIBC__
+extern int __pthread_rwlock_init(pthread_rwlock_t *rwlock, const pthread_rwlockattr_t *attr);
+extern int __pthread_rwlock_destroy(pthread_rwlock_t *rwlock);
+extern int __pthread_rwlock_wrlock(pthread_rwlock_t *rwlock);
+extern int __pthread_rwlock_trywrlock(pthread_rwlock_t *rwlock);
+extern int __pthread_rwlock_rdlock(pthread_rwlock_t *rwlock);
+extern int __pthread_rwlock_tryrdlock(pthread_rwlock_t *rwlock);
+extern int __pthread_rwlock_unlock(pthread_rwlock_t *rwlock);
+#else
+#define __pthread_rwlock_init NULL
+#define __pthread_rwlock_destroy NULL
+#define __pthread_rwlock_wrlock NULL
+#define __pthread_rwlock_trywrlock NULL
+#define __pthread_rwlock_rdlock NULL
+#define __pthread_rwlock_tryrdlock NULL
+#define __pthread_rwlock_unlock NULL
+#endif
+
+static int (*ll_pthread_rwlock_init)(pthread_rwlock_t *rwlock,
+ const pthread_rwlockattr_t *attr) = __pthread_rwlock_init;
+static int (*ll_pthread_rwlock_destroy)(pthread_rwlock_t *rwlock) = __pthread_rwlock_destroy;
+static int (*ll_pthread_rwlock_rdlock)(pthread_rwlock_t *rwlock) = __pthread_rwlock_rdlock;
+static int (*ll_pthread_rwlock_tryrdlock)(pthread_rwlock_t *rwlock) = __pthread_rwlock_tryrdlock;
+static int (*ll_pthread_rwlock_trywrlock)(pthread_rwlock_t *rwlock) = __pthread_rwlock_trywrlock;
+static int (*ll_pthread_rwlock_wrlock)(pthread_rwlock_t *rwlock) = __pthread_rwlock_wrlock;
+static int (*ll_pthread_rwlock_unlock)(pthread_rwlock_t *rwlock) = __pthread_rwlock_unlock;
+
+enum { none, prepare, done, } __init_state;
+static void init_preload(void);
+static void try_init_preload(void)
+{
+ if (__init_state != done)
+ init_preload();
+}
+
+static struct rb_node **__get_lock_node(void *lock, struct rb_node **parent)
+{
+ struct rb_node **node = &locks.rb_node;
+ struct lock_lookup *l;
+
+ *parent = NULL;
+
+ while (*node) {
+ l = rb_entry(*node, struct lock_lookup, node);
+
+ *parent = *node;
+ if (lock < l->orig)
+ node = &l->node.rb_left;
+ else if (lock > l->orig)
+ node = &l->node.rb_right;
+ else
+ return node;
+ }
+
+ return node;
+}
+
+#ifndef LIBLOCKDEP_STATIC_ENTRIES
+#define LIBLOCKDEP_STATIC_ENTRIES 1024
+#endif
+
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+
+static struct lock_lookup __locks[LIBLOCKDEP_STATIC_ENTRIES];
+static int __locks_nr;
+
+static inline bool is_static_lock(struct lock_lookup *lock)
+{
+ return lock >= __locks && lock < __locks + ARRAY_SIZE(__locks);
+}
+
+static struct lock_lookup *alloc_lock(void)
+{
+ if (__init_state != done) {
+ /*
+ * Some programs attempt to initialize and use locks in their
+ * allocation path. This means that a call to malloc() would
+ * result in locks being initialized and locked.
+ *
+ * Why is it an issue for us? dlsym() below will try allocating
+ * to give us the original function. Since this allocation will
+		 * result in a locking operation, we have to let pthread deal
+		 * with it, but we can't: we don't have the pointer to the
+		 * original API yet, since we're inside dlsym() trying to get it.
+ */
+
+ int idx = __locks_nr++;
+ if (idx >= ARRAY_SIZE(__locks)) {
+ fprintf(stderr,
+ "LOCKDEP error: insufficient LIBLOCKDEP_STATIC_ENTRIES\n");
+ exit(EX_UNAVAILABLE);
+ }
+ return __locks + idx;
+ }
+
+ return malloc(sizeof(struct lock_lookup));
+}
+
+static inline void free_lock(struct lock_lookup *lock)
+{
+ if (likely(!is_static_lock(lock)))
+ free(lock);
+}
+
+/**
+ * __get_lock - find or create a lock instance
+ * @lock: pointer to a pthread lock function
+ *
+ * Try to find an existing lock in the rbtree using the provided pointer. If
+ * one wasn't found - create it.
+ */
+static struct lock_lookup *__get_lock(void *lock)
+{
+ struct rb_node **node, *parent;
+ struct lock_lookup *l;
+
+ ll_pthread_rwlock_rdlock(&locks_rwlock);
+ node = __get_lock_node(lock, &parent);
+ ll_pthread_rwlock_unlock(&locks_rwlock);
+ if (*node) {
+ return rb_entry(*node, struct lock_lookup, node);
+ }
+
+ /* We didn't find the lock, let's create it */
+ l = alloc_lock();
+ if (l == NULL)
+ return NULL;
+
+ l->orig = lock;
+ /*
+	 * Currently the name of the lock is the pointer value of the pthread
+	 * lock; while not optimal, it makes debugging a bit easier.
+ *
+ * TODO: Get the real name of the lock using libdwarf
+ */
+ sprintf(l->name, "%p", lock);
+ lockdep_init_map(&l->dep_map, l->name, &l->key, 0);
+
+ ll_pthread_rwlock_wrlock(&locks_rwlock);
+ /* This might have changed since the last time we fetched it */
+ node = __get_lock_node(lock, &parent);
+ rb_link_node(&l->node, parent, node);
+ rb_insert_color(&l->node, &locks);
+ ll_pthread_rwlock_unlock(&locks_rwlock);
+
+ return l;
+}
+
+static void __del_lock(struct lock_lookup *lock)
+{
+ ll_pthread_rwlock_wrlock(&locks_rwlock);
+ rb_erase(&lock->node, &locks);
+ ll_pthread_rwlock_unlock(&locks_rwlock);
+ free_lock(lock);
+}
+
+int pthread_mutex_init(pthread_mutex_t *mutex,
+ const pthread_mutexattr_t *attr)
+{
+ int r;
+
+ /*
+ * We keep trying to init our preload module because there might be
+ * code in init sections that tries to touch locks before we are
+ * initialized, in that case we'll need to manually call preload
+ * to get us going.
+ *
+ * Funny enough, kernel's lockdep had the same issue, and used
+ * (almost) the same solution. See look_up_lock_class() in
+ * kernel/locking/lockdep.c for details.
+ */
+ try_init_preload();
+
+ r = ll_pthread_mutex_init(mutex, attr);
+ if (r == 0)
+ /*
+		 * We do a dummy initialization here so that lockdep can
+ * warn us if something fishy is going on - such as
+ * initializing a held lock.
+ */
+ __get_lock(mutex);
+
+ return r;
+}
+
+int pthread_mutex_lock(pthread_mutex_t *mutex)
+{
+ int r;
+
+ try_init_preload();
+
+ lock_acquire(&__get_lock(mutex)->dep_map, 0, 0, 0, 1, NULL,
+ (unsigned long)_RET_IP_);
+ /*
+ * Here's the thing with pthread mutexes: unlike the kernel variant,
+ * they can fail.
+ *
+ * This means that the behaviour here is a bit different from what's
+ * going on in the kernel: there we just tell lockdep that we took the
+ * lock before actually taking it, but here we must deal with the case
+ * that locking failed.
+ *
+ * To do that we'll "release" the lock if locking failed - this way
+ * we'll get lockdep doing the correct checks when we try to take
+ * the lock, and if that fails - we'll be back to the correct
+ * state by releasing it.
+ */
+ r = ll_pthread_mutex_lock(mutex);
+ if (r)
+ lock_release(&__get_lock(mutex)->dep_map, 0, (unsigned long)_RET_IP_);
+
+ return r;
+}
+
+int pthread_mutex_trylock(pthread_mutex_t *mutex)
+{
+ int r;
+
+ try_init_preload();
+
+ lock_acquire(&__get_lock(mutex)->dep_map, 0, 1, 0, 1, NULL, (unsigned long)_RET_IP_);
+ r = ll_pthread_mutex_trylock(mutex);
+ if (r)
+ lock_release(&__get_lock(mutex)->dep_map, 0, (unsigned long)_RET_IP_);
+
+ return r;
+}
+
+int pthread_mutex_unlock(pthread_mutex_t *mutex)
+{
+ int r;
+
+ try_init_preload();
+
+ lock_release(&__get_lock(mutex)->dep_map, 0, (unsigned long)_RET_IP_);
+ /*
+ * Just like taking a lock, only in reverse!
+ *
+ * If we fail releasing the lock, tell lockdep we're holding it again.
+ */
+ r = ll_pthread_mutex_unlock(mutex);
+ if (r)
+ lock_acquire(&__get_lock(mutex)->dep_map, 0, 0, 0, 1, NULL, (unsigned long)_RET_IP_);
+
+ return r;
+}
+
+int pthread_mutex_destroy(pthread_mutex_t *mutex)
+{
+ try_init_preload();
+
+ /*
+ * Let's see if we're releasing a lock that's held.
+ *
+ * TODO: Hook into free() and add that check there as well.
+ */
+	debug_check_no_locks_freed(mutex, sizeof(*mutex));
+ __del_lock(__get_lock(mutex));
+ return ll_pthread_mutex_destroy(mutex);
+}
+
+/* This is the rwlock part, very similar to what happened with mutex above */
+int pthread_rwlock_init(pthread_rwlock_t *rwlock,
+ const pthread_rwlockattr_t *attr)
+{
+ int r;
+
+ try_init_preload();
+
+ r = ll_pthread_rwlock_init(rwlock, attr);
+ if (r == 0)
+ __get_lock(rwlock);
+
+ return r;
+}
+
+int pthread_rwlock_destroy(pthread_rwlock_t *rwlock)
+{
+ try_init_preload();
+
+	debug_check_no_locks_freed(rwlock, sizeof(*rwlock));
+ __del_lock(__get_lock(rwlock));
+ return ll_pthread_rwlock_destroy(rwlock);
+}
+
+int pthread_rwlock_rdlock(pthread_rwlock_t *rwlock)
+{
+ int r;
+
+ init_preload();
+
+ lock_acquire(&__get_lock(rwlock)->dep_map, 0, 0, 2, 1, NULL, (unsigned long)_RET_IP_);
+ r = ll_pthread_rwlock_rdlock(rwlock);
+ if (r)
+ lock_release(&__get_lock(rwlock)->dep_map, 0, (unsigned long)_RET_IP_);
+
+ return r;
+}
+
+int pthread_rwlock_tryrdlock(pthread_rwlock_t *rwlock)
+{
+ int r;
+
+ init_preload();
+
+ lock_acquire(&__get_lock(rwlock)->dep_map, 0, 1, 2, 1, NULL, (unsigned long)_RET_IP_);
+ r = ll_pthread_rwlock_tryrdlock(rwlock);
+ if (r)
+ lock_release(&__get_lock(rwlock)->dep_map, 0, (unsigned long)_RET_IP_);
+
+ return r;
+}
+
+int pthread_rwlock_trywrlock(pthread_rwlock_t *rwlock)
+{
+ int r;
+
+ init_preload();
+
+ lock_acquire(&__get_lock(rwlock)->dep_map, 0, 1, 0, 1, NULL, (unsigned long)_RET_IP_);
+ r = ll_pthread_rwlock_trywrlock(rwlock);
+ if (r)
+ lock_release(&__get_lock(rwlock)->dep_map, 0, (unsigned long)_RET_IP_);
+
+ return r;
+}
+
+int pthread_rwlock_wrlock(pthread_rwlock_t *rwlock)
+{
+ int r;
+
+ init_preload();
+
+ lock_acquire(&__get_lock(rwlock)->dep_map, 0, 0, 0, 1, NULL, (unsigned long)_RET_IP_);
+ r = ll_pthread_rwlock_wrlock(rwlock);
+ if (r)
+ lock_release(&__get_lock(rwlock)->dep_map, 0, (unsigned long)_RET_IP_);
+
+ return r;
+}
+
+int pthread_rwlock_unlock(pthread_rwlock_t *rwlock)
+{
+ int r;
+
+ init_preload();
+
+ lock_release(&__get_lock(rwlock)->dep_map, 0, (unsigned long)_RET_IP_);
+ r = ll_pthread_rwlock_unlock(rwlock);
+ if (r)
+ lock_acquire(&__get_lock(rwlock)->dep_map, 0, 0, 0, 1, NULL, (unsigned long)_RET_IP_);
+
+ return r;
+}
+
+__attribute__((constructor)) static void init_preload(void)
+{
+ if (__init_state == done)
+ return;
+
+#ifndef __GLIBC__
+ __init_state = prepare;
+
+ ll_pthread_mutex_init = dlsym(RTLD_NEXT, "pthread_mutex_init");
+ ll_pthread_mutex_lock = dlsym(RTLD_NEXT, "pthread_mutex_lock");
+ ll_pthread_mutex_trylock = dlsym(RTLD_NEXT, "pthread_mutex_trylock");
+ ll_pthread_mutex_unlock = dlsym(RTLD_NEXT, "pthread_mutex_unlock");
+ ll_pthread_mutex_destroy = dlsym(RTLD_NEXT, "pthread_mutex_destroy");
+
+ ll_pthread_rwlock_init = dlsym(RTLD_NEXT, "pthread_rwlock_init");
+ ll_pthread_rwlock_destroy = dlsym(RTLD_NEXT, "pthread_rwlock_destroy");
+ ll_pthread_rwlock_rdlock = dlsym(RTLD_NEXT, "pthread_rwlock_rdlock");
+ ll_pthread_rwlock_tryrdlock = dlsym(RTLD_NEXT, "pthread_rwlock_tryrdlock");
+ ll_pthread_rwlock_wrlock = dlsym(RTLD_NEXT, "pthread_rwlock_wrlock");
+ ll_pthread_rwlock_trywrlock = dlsym(RTLD_NEXT, "pthread_rwlock_trywrlock");
+ ll_pthread_rwlock_unlock = dlsym(RTLD_NEXT, "pthread_rwlock_unlock");
+#endif
+
+ lockdep_init();
+
+ __init_state = done;
+}
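
The interposition pattern used throughout this file, reduced to a single wrapped call. This is a standalone sketch rather than part of the patch: the message is illustrative, and it deliberately ignores the early-allocation re-entrancy problem that alloc_lock()'s static __locks[] pool works around above. Compiled as a shared object (roughly gcc -shared -fPIC wrap.c -o wrap.so -ldl) and injected with LD_PRELOAD, which is how the lockdep wrapper script above injects liblockdep.so:

    #define _GNU_SOURCE
    #include <dlfcn.h>
    #include <pthread.h>
    #include <stdio.h>

    /* Pointer to the real libpthread implementation, resolved lazily. */
    static int (*real_mutex_lock)(pthread_mutex_t *mutex);

    int pthread_mutex_lock(pthread_mutex_t *mutex)
    {
        if (!real_mutex_lock)
            real_mutex_lock = dlsym(RTLD_NEXT, "pthread_mutex_lock");

        fprintf(stderr, "intercepted lock of %p\n", (void *)mutex);
        return real_mutex_lock(mutex);  /* forward to the real call */
    }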
diff --git a/tools/lib/lockdep/rbtree.c b/tools/lib/lockdep/rbtree.c
new file mode 100644
index 00000000000..f7f43033c8b
--- /dev/null
+++ b/tools/lib/lockdep/rbtree.c
@@ -0,0 +1 @@
+#include "../../../lib/rbtree.c"
diff --git a/tools/lib/lockdep/run_tests.sh b/tools/lib/lockdep/run_tests.sh
new file mode 100755
index 00000000000..5334ad9d39b
--- /dev/null
+++ b/tools/lib/lockdep/run_tests.sh
@@ -0,0 +1,27 @@
+#! /bin/bash
+
+make &> /dev/null
+
+for i in `ls tests/*.c`; do
+ testname=$(basename -s .c "$i")
+ gcc -o tests/$testname -pthread -lpthread $i liblockdep.a -Iinclude -D__USE_LIBLOCKDEP &> /dev/null
+ echo -ne "$testname... "
+ if [ $(timeout 1 ./tests/$testname | wc -l) -gt 0 ]; then
+ echo "PASSED!"
+ else
+ echo "FAILED!"
+ fi
+ rm tests/$testname
+done
+
+for i in `ls tests/*.c`; do
+ testname=$(basename -s .c "$i")
+ gcc -o tests/$testname -pthread -lpthread -Iinclude $i &> /dev/null
+ echo -ne "(PRELOAD) $testname... "
+ if [ $(timeout 1 ./lockdep ./tests/$testname | wc -l) -gt 0 ]; then
+ echo "PASSED!"
+ else
+ echo "FAILED!"
+ fi
+ rm tests/$testname
+done
diff --git a/tools/lib/lockdep/tests/AA.c b/tools/lib/lockdep/tests/AA.c
new file mode 100644
index 00000000000..0f782ff404a
--- /dev/null
+++ b/tools/lib/lockdep/tests/AA.c
@@ -0,0 +1,13 @@
+#include <liblockdep/mutex.h>
+
+void main(void)
+{
+ pthread_mutex_t a, b;
+
+ pthread_mutex_init(&a, NULL);
+ pthread_mutex_init(&b, NULL);
+
+ pthread_mutex_lock(&a);
+ pthread_mutex_lock(&b);
+ pthread_mutex_lock(&a);
+}
diff --git a/tools/lib/lockdep/tests/ABBA.c b/tools/lib/lockdep/tests/ABBA.c
new file mode 100644
index 00000000000..07f0e29d548
--- /dev/null
+++ b/tools/lib/lockdep/tests/ABBA.c
@@ -0,0 +1,13 @@
+#include <liblockdep/mutex.h>
+#include "common.h"
+
+void main(void)
+{
+ pthread_mutex_t a, b;
+
+ pthread_mutex_init(&a, NULL);
+ pthread_mutex_init(&b, NULL);
+
+ LOCK_UNLOCK_2(a, b);
+ LOCK_UNLOCK_2(b, a);
+}
diff --git a/tools/lib/lockdep/tests/ABBCCA.c b/tools/lib/lockdep/tests/ABBCCA.c
new file mode 100644
index 00000000000..843db09ac66
--- /dev/null
+++ b/tools/lib/lockdep/tests/ABBCCA.c
@@ -0,0 +1,15 @@
+#include <liblockdep/mutex.h>
+#include "common.h"
+
+void main(void)
+{
+ pthread_mutex_t a, b, c;
+
+ pthread_mutex_init(&a, NULL);
+ pthread_mutex_init(&b, NULL);
+ pthread_mutex_init(&c, NULL);
+
+ LOCK_UNLOCK_2(a, b);
+ LOCK_UNLOCK_2(b, c);
+ LOCK_UNLOCK_2(c, a);
+}
diff --git a/tools/lib/lockdep/tests/ABBCCDDA.c b/tools/lib/lockdep/tests/ABBCCDDA.c
new file mode 100644
index 00000000000..33620e268f8
--- /dev/null
+++ b/tools/lib/lockdep/tests/ABBCCDDA.c
@@ -0,0 +1,17 @@
+#include <liblockdep/mutex.h>
+#include "common.h"
+
+void main(void)
+{
+ pthread_mutex_t a, b, c, d;
+
+ pthread_mutex_init(&a, NULL);
+ pthread_mutex_init(&b, NULL);
+ pthread_mutex_init(&c, NULL);
+ pthread_mutex_init(&d, NULL);
+
+ LOCK_UNLOCK_2(a, b);
+ LOCK_UNLOCK_2(b, c);
+ LOCK_UNLOCK_2(c, d);
+ LOCK_UNLOCK_2(d, a);
+}
diff --git a/tools/lib/lockdep/tests/ABCABC.c b/tools/lib/lockdep/tests/ABCABC.c
new file mode 100644
index 00000000000..3fee51e3a68
--- /dev/null
+++ b/tools/lib/lockdep/tests/ABCABC.c
@@ -0,0 +1,15 @@
+#include <liblockdep/mutex.h>
+#include "common.h"
+
+void main(void)
+{
+ pthread_mutex_t a, b, c;
+
+ pthread_mutex_init(&a, NULL);
+ pthread_mutex_init(&b, NULL);
+ pthread_mutex_init(&c, NULL);
+
+ LOCK_UNLOCK_2(a, b);
+ LOCK_UNLOCK_2(c, a);
+ LOCK_UNLOCK_2(b, c);
+}
diff --git a/tools/lib/lockdep/tests/ABCDBCDA.c b/tools/lib/lockdep/tests/ABCDBCDA.c
new file mode 100644
index 00000000000..427ba562c75
--- /dev/null
+++ b/tools/lib/lockdep/tests/ABCDBCDA.c
@@ -0,0 +1,17 @@
+#include <liblockdep/mutex.h>
+#include "common.h"
+
+void main(void)
+{
+ pthread_mutex_t a, b, c, d;
+
+ pthread_mutex_init(&a, NULL);
+ pthread_mutex_init(&b, NULL);
+ pthread_mutex_init(&c, NULL);
+ pthread_mutex_init(&d, NULL);
+
+ LOCK_UNLOCK_2(a, b);
+ LOCK_UNLOCK_2(c, d);
+ LOCK_UNLOCK_2(b, c);
+ LOCK_UNLOCK_2(d, a);
+}
diff --git a/tools/lib/lockdep/tests/ABCDBDDA.c b/tools/lib/lockdep/tests/ABCDBDDA.c
new file mode 100644
index 00000000000..680c6cf3e91
--- /dev/null
+++ b/tools/lib/lockdep/tests/ABCDBDDA.c
@@ -0,0 +1,17 @@
+#include <liblockdep/mutex.h>
+#include "common.h"
+
+void main(void)
+{
+ pthread_mutex_t a, b, c, d;
+
+ pthread_mutex_init(&a, NULL);
+ pthread_mutex_init(&b, NULL);
+ pthread_mutex_init(&c, NULL);
+ pthread_mutex_init(&d, NULL);
+
+ LOCK_UNLOCK_2(a, b);
+ LOCK_UNLOCK_2(c, d);
+ LOCK_UNLOCK_2(b, d);
+ LOCK_UNLOCK_2(d, a);
+}
diff --git a/tools/lib/lockdep/tests/WW.c b/tools/lib/lockdep/tests/WW.c
new file mode 100644
index 00000000000..d44f77d7102
--- /dev/null
+++ b/tools/lib/lockdep/tests/WW.c
@@ -0,0 +1,13 @@
+#include <liblockdep/rwlock.h>
+
+void main(void)
+{
+ pthread_rwlock_t a, b;
+
+ pthread_rwlock_init(&a, NULL);
+ pthread_rwlock_init(&b, NULL);
+
+ pthread_rwlock_wrlock(&a);
+ pthread_rwlock_rdlock(&b);
+ pthread_rwlock_wrlock(&a);
+}
diff --git a/tools/lib/lockdep/tests/common.h b/tools/lib/lockdep/tests/common.h
new file mode 100644
index 00000000000..d89e94d47d8
--- /dev/null
+++ b/tools/lib/lockdep/tests/common.h
@@ -0,0 +1,12 @@
+#ifndef _LIBLOCKDEP_TEST_COMMON_H
+#define _LIBLOCKDEP_TEST_COMMON_H
+
+#define LOCK_UNLOCK_2(a, b) \
+ do { \
+ pthread_mutex_lock(&(a)); \
+ pthread_mutex_lock(&(b)); \
+ pthread_mutex_unlock(&(b)); \
+ pthread_mutex_unlock(&(a)); \
+ } while(0)
+
+#endif
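
For reference, LOCK_UNLOCK_2() is a strictly nested lock/unlock pair, so the body of ABBA.c above, LOCK_UNLOCK_2(a, b) followed by LOCK_UNLOCK_2(b, a), expands to the sequence below; with -D__USE_LIBLOCKDEP these calls resolve to the liblockdep wrappers, and the second pair is the ordering inversion that lockdep reports:

    pthread_mutex_lock(&a);
    pthread_mutex_lock(&b);         /* records the dependency a -> b */
    pthread_mutex_unlock(&b);
    pthread_mutex_unlock(&a);

    pthread_mutex_lock(&b);
    pthread_mutex_lock(&a);         /* b -> a conflicts with a -> b */
    pthread_mutex_unlock(&a);
    pthread_mutex_unlock(&b);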
diff --git a/tools/lib/lockdep/tests/unlock_balance.c b/tools/lib/lockdep/tests/unlock_balance.c
new file mode 100644
index 00000000000..0bc62de686f
--- /dev/null
+++ b/tools/lib/lockdep/tests/unlock_balance.c
@@ -0,0 +1,12 @@
+#include <liblockdep/mutex.h>
+
+void main(void)
+{
+ pthread_mutex_t a;
+
+ pthread_mutex_init(&a, NULL);
+
+ pthread_mutex_lock(&a);
+ pthread_mutex_unlock(&a);
+ pthread_mutex_unlock(&a);
+}
diff --git a/tools/lib/lockdep/uinclude/asm/hash.h b/tools/lib/lockdep/uinclude/asm/hash.h
new file mode 100644
index 00000000000..d82b170bb21
--- /dev/null
+++ b/tools/lib/lockdep/uinclude/asm/hash.h
@@ -0,0 +1,6 @@
+#ifndef __ASM_GENERIC_HASH_H
+#define __ASM_GENERIC_HASH_H
+
+/* Stub */
+
+#endif /* __ASM_GENERIC_HASH_H */
diff --git a/tools/lib/lockdep/uinclude/asm/hweight.h b/tools/lib/lockdep/uinclude/asm/hweight.h
new file mode 100644
index 00000000000..fab00ff936d
--- /dev/null
+++ b/tools/lib/lockdep/uinclude/asm/hweight.h
@@ -0,0 +1,3 @@
+
+/* empty file */
+
diff --git a/tools/lib/lockdep/uinclude/asm/sections.h b/tools/lib/lockdep/uinclude/asm/sections.h
new file mode 100644
index 00000000000..fab00ff936d
--- /dev/null
+++ b/tools/lib/lockdep/uinclude/asm/sections.h
@@ -0,0 +1,3 @@
+
+/* empty file */
+
diff --git a/tools/lib/lockdep/uinclude/linux/bitops.h b/tools/lib/lockdep/uinclude/linux/bitops.h
new file mode 100644
index 00000000000..fab00ff936d
--- /dev/null
+++ b/tools/lib/lockdep/uinclude/linux/bitops.h
@@ -0,0 +1,3 @@
+
+/* empty file */
+
diff --git a/tools/lib/lockdep/uinclude/linux/compiler.h b/tools/lib/lockdep/uinclude/linux/compiler.h
new file mode 100644
index 00000000000..7ac838a1f19
--- /dev/null
+++ b/tools/lib/lockdep/uinclude/linux/compiler.h
@@ -0,0 +1,7 @@
+#ifndef _LIBLOCKDEP_LINUX_COMPILER_H_
+#define _LIBLOCKDEP_LINUX_COMPILER_H_
+
+#define __used __attribute__((__unused__))
+#define unlikely
+
+#endif
diff --git a/tools/lib/lockdep/uinclude/linux/debug_locks.h b/tools/lib/lockdep/uinclude/linux/debug_locks.h
new file mode 100644
index 00000000000..f38eb64df79
--- /dev/null
+++ b/tools/lib/lockdep/uinclude/linux/debug_locks.h
@@ -0,0 +1,12 @@
+#ifndef _LIBLOCKDEP_DEBUG_LOCKS_H_
+#define _LIBLOCKDEP_DEBUG_LOCKS_H_
+
+#include <stddef.h>
+#include <linux/compiler.h>
+
+#define DEBUG_LOCKS_WARN_ON(x) (x)
+
+extern bool debug_locks;
+extern bool debug_locks_silent;
+
+#endif
diff --git a/tools/lib/lockdep/uinclude/linux/delay.h b/tools/lib/lockdep/uinclude/linux/delay.h
new file mode 100644
index 00000000000..fab00ff936d
--- /dev/null
+++ b/tools/lib/lockdep/uinclude/linux/delay.h
@@ -0,0 +1,3 @@
+
+/* empty file */
+
diff --git a/tools/lib/lockdep/uinclude/linux/ftrace.h b/tools/lib/lockdep/uinclude/linux/ftrace.h
new file mode 100644
index 00000000000..fab00ff936d
--- /dev/null
+++ b/tools/lib/lockdep/uinclude/linux/ftrace.h
@@ -0,0 +1,3 @@
+
+/* empty file */
+
diff --git a/tools/lib/lockdep/uinclude/linux/gfp.h b/tools/lib/lockdep/uinclude/linux/gfp.h
new file mode 100644
index 00000000000..fab00ff936d
--- /dev/null
+++ b/tools/lib/lockdep/uinclude/linux/gfp.h
@@ -0,0 +1,3 @@
+
+/* empty file */
+
diff --git a/tools/lib/lockdep/uinclude/linux/hardirq.h b/tools/lib/lockdep/uinclude/linux/hardirq.h
new file mode 100644
index 00000000000..c8f3f8f5872
--- /dev/null
+++ b/tools/lib/lockdep/uinclude/linux/hardirq.h
@@ -0,0 +1,11 @@
+#ifndef _LIBLOCKDEP_LINUX_HARDIRQ_H_
+#define _LIBLOCKDEP_LINUX_HARDIRQ_H_
+
+#define SOFTIRQ_BITS 0UL
+#define HARDIRQ_BITS 0UL
+#define SOFTIRQ_SHIFT 0UL
+#define HARDIRQ_SHIFT 0UL
+#define hardirq_count() 0UL
+#define softirq_count() 0UL
+
+#endif
diff --git a/tools/lib/lockdep/uinclude/linux/hash.h b/tools/lib/lockdep/uinclude/linux/hash.h
new file mode 100644
index 00000000000..0f8479858dc
--- /dev/null
+++ b/tools/lib/lockdep/uinclude/linux/hash.h
@@ -0,0 +1 @@
+#include "../../../include/linux/hash.h"
diff --git a/tools/lib/lockdep/uinclude/linux/interrupt.h b/tools/lib/lockdep/uinclude/linux/interrupt.h
new file mode 100644
index 00000000000..fab00ff936d
--- /dev/null
+++ b/tools/lib/lockdep/uinclude/linux/interrupt.h
@@ -0,0 +1,3 @@
+
+/* empty file */
+
diff --git a/tools/lib/lockdep/uinclude/linux/irqflags.h b/tools/lib/lockdep/uinclude/linux/irqflags.h
new file mode 100644
index 00000000000..6cc296f0fad
--- /dev/null
+++ b/tools/lib/lockdep/uinclude/linux/irqflags.h
@@ -0,0 +1,38 @@
+#ifndef _LIBLOCKDEP_LINUX_TRACE_IRQFLAGS_H_
+#define _LIBLOCKDEP_LINUX_TRACE_IRQFLAGS_H_
+
+# define trace_hardirq_context(p) 0
+# define trace_softirq_context(p) 0
+# define trace_hardirqs_enabled(p) 0
+# define trace_softirqs_enabled(p) 0
+# define trace_hardirq_enter() do { } while (0)
+# define trace_hardirq_exit() do { } while (0)
+# define lockdep_softirq_enter() do { } while (0)
+# define lockdep_softirq_exit() do { } while (0)
+# define INIT_TRACE_IRQFLAGS
+
+# define stop_critical_timings() do { } while (0)
+# define start_critical_timings() do { } while (0)
+
+#define raw_local_irq_disable() do { } while (0)
+#define raw_local_irq_enable() do { } while (0)
+#define raw_local_irq_save(flags) ((flags) = 0)
+#define raw_local_irq_restore(flags) do { } while (0)
+#define raw_local_save_flags(flags) ((flags) = 0)
+#define raw_irqs_disabled_flags(flags) do { } while (0)
+#define raw_irqs_disabled() 0
+#define raw_safe_halt()
+
+#define local_irq_enable() do { } while (0)
+#define local_irq_disable() do { } while (0)
+#define local_irq_save(flags) ((flags) = 0)
+#define local_irq_restore(flags) do { } while (0)
+#define local_save_flags(flags) ((flags) = 0)
+#define irqs_disabled() (1)
+#define irqs_disabled_flags(flags) (0)
+#define safe_halt() do { } while (0)
+
+#define trace_lock_release(x, y)
+#define trace_lock_acquire(a, b, c, d, e, f, g)
+
+#endif
diff --git a/tools/lib/lockdep/uinclude/linux/kallsyms.h b/tools/lib/lockdep/uinclude/linux/kallsyms.h
new file mode 100644
index 00000000000..b0f2dbdf1a1
--- /dev/null
+++ b/tools/lib/lockdep/uinclude/linux/kallsyms.h
@@ -0,0 +1,32 @@
+#ifndef _LIBLOCKDEP_LINUX_KALLSYMS_H_
+#define _LIBLOCKDEP_LINUX_KALLSYMS_H_
+
+#include <linux/kernel.h>
+#include <stdio.h>
+
+#define KSYM_NAME_LEN 128
+
+struct module;
+
+static inline const char *kallsyms_lookup(unsigned long addr,
+ unsigned long *symbolsize,
+ unsigned long *offset,
+ char **modname, char *namebuf)
+{
+ return NULL;
+}
+
+#include <execinfo.h>
+#include <stdlib.h>
+static inline void print_ip_sym(unsigned long ip)
+{
+ char **name;
+
+ name = backtrace_symbols((void **)&ip, 1);
+
+ printf("%s\n", *name);
+
+ free(name);
+}
+
+#endif
diff --git a/tools/lib/lockdep/uinclude/linux/kern_levels.h b/tools/lib/lockdep/uinclude/linux/kern_levels.h
new file mode 100644
index 00000000000..3b9bade2869
--- /dev/null
+++ b/tools/lib/lockdep/uinclude/linux/kern_levels.h
@@ -0,0 +1,25 @@
+#ifndef __KERN_LEVELS_H__
+#define __KERN_LEVELS_H__
+
+#define KERN_SOH "" /* ASCII Start Of Header */
+#define KERN_SOH_ASCII ''
+
+#define KERN_EMERG KERN_SOH "" /* system is unusable */
+#define KERN_ALERT KERN_SOH "" /* action must be taken immediately */
+#define KERN_CRIT KERN_SOH "" /* critical conditions */
+#define KERN_ERR KERN_SOH "" /* error conditions */
+#define KERN_WARNING KERN_SOH "" /* warning conditions */
+#define KERN_NOTICE KERN_SOH "" /* normal but significant condition */
+#define KERN_INFO KERN_SOH "" /* informational */
+#define KERN_DEBUG KERN_SOH "" /* debug-level messages */
+
+#define KERN_DEFAULT KERN_SOH "" /* the default kernel loglevel */
+
+/*
+ * Annotation for a "continued" line of log printout (only done after a
+ * line that had no enclosing \n). Only to be used by core/arch code
+ * during early bootup (a continued line is not SMP-safe otherwise).
+ */
+#define KERN_CONT ""
+
+#endif
diff --git a/tools/lib/lockdep/uinclude/linux/kernel.h b/tools/lib/lockdep/uinclude/linux/kernel.h
new file mode 100644
index 00000000000..a11e3c357be
--- /dev/null
+++ b/tools/lib/lockdep/uinclude/linux/kernel.h
@@ -0,0 +1,44 @@
+#ifndef _LIBLOCKDEP_LINUX_KERNEL_H_
+#define _LIBLOCKDEP_LINUX_KERNEL_H_
+
+#include <linux/export.h>
+#include <linux/types.h>
+#include <linux/rcu.h>
+#include <linux/hardirq.h>
+#include <linux/kern_levels.h>
+
+#ifndef container_of
+#define container_of(ptr, type, member) ({ \
+ const typeof(((type *)0)->member) * __mptr = (ptr); \
+ (type *)((char *)__mptr - offsetof(type, member)); })
+#endif
+
+#define max(x, y) ({ \
+ typeof(x) _max1 = (x); \
+ typeof(y) _max2 = (y); \
+ (void) (&_max1 == &_max2); \
+ _max1 > _max2 ? _max1 : _max2; })
+
+#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
+#define WARN_ON(x) (x)
+#define WARN_ON_ONCE(x) (x)
+#define likely(x) (x)
+#define WARN(x, y, z) (x)
+#define uninitialized_var(x) x
+#define __init
+#define noinline
+#define list_add_tail_rcu list_add_tail
+
+#ifndef CALLER_ADDR0
+#define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
+#endif
+
+#ifndef _RET_IP_
+#define _RET_IP_ CALLER_ADDR0
+#endif
+
+#ifndef _THIS_IP_
+#define _THIS_IP_ ({ __label__ __here; __here: (unsigned long)&&__here; })
+#endif
+
+#endif
diff --git a/tools/lib/lockdep/uinclude/linux/kmemcheck.h b/tools/lib/lockdep/uinclude/linux/kmemcheck.h
new file mode 100644
index 00000000000..94d598bc6ab
--- /dev/null
+++ b/tools/lib/lockdep/uinclude/linux/kmemcheck.h
@@ -0,0 +1,8 @@
+#ifndef _LIBLOCKDEP_LINUX_KMEMCHECK_H_
+#define _LIBLOCKDEP_LINUX_KMEMCHECK_H_
+
+static inline void kmemcheck_mark_initialized(void *address, unsigned int n)
+{
+}
+
+#endif
diff --git a/tools/lib/lockdep/uinclude/linux/linkage.h b/tools/lib/lockdep/uinclude/linux/linkage.h
new file mode 100644
index 00000000000..fab00ff936d
--- /dev/null
+++ b/tools/lib/lockdep/uinclude/linux/linkage.h
@@ -0,0 +1,3 @@
+
+/* empty file */
+
diff --git a/tools/lib/lockdep/uinclude/linux/list.h b/tools/lib/lockdep/uinclude/linux/list.h
new file mode 100644
index 00000000000..6e9ef31ed82
--- /dev/null
+++ b/tools/lib/lockdep/uinclude/linux/list.h
@@ -0,0 +1 @@
+#include "../../../include/linux/list.h"
diff --git a/tools/lib/lockdep/uinclude/linux/lockdep.h b/tools/lib/lockdep/uinclude/linux/lockdep.h
new file mode 100644
index 00000000000..c1552c28507
--- /dev/null
+++ b/tools/lib/lockdep/uinclude/linux/lockdep.h
@@ -0,0 +1,58 @@
+#ifndef _LIBLOCKDEP_LOCKDEP_H_
+#define _LIBLOCKDEP_LOCKDEP_H_
+
+#include <sys/prctl.h>
+#include <sys/syscall.h>
+#include <string.h>
+#include <limits.h>
+#include <linux/utsname.h>
+
+
+#define MAX_LOCK_DEPTH 2000UL
+
+#define asmlinkage
+#define __visible
+
+#include "../../../include/linux/lockdep.h"
+
+struct task_struct {
+ u64 curr_chain_key;
+ int lockdep_depth;
+ unsigned int lockdep_recursion;
+ struct held_lock held_locks[MAX_LOCK_DEPTH];
+ gfp_t lockdep_reclaim_gfp;
+ int pid;
+ char comm[17];
+};
+
+extern struct task_struct *__curr(void);
+
+#define current (__curr())
+
+#define debug_locks_off() 1
+#define task_pid_nr(tsk) ((tsk)->pid)
+
+#define KSYM_NAME_LEN 128
+#define printk printf
+
+#define list_del_rcu list_del
+
+#define atomic_t unsigned long
+#define atomic_inc(x) ((*(x))++)
+
+static struct new_utsname *init_utsname(void)
+{
+ static struct new_utsname n = (struct new_utsname) {
+ .release = "liblockdep",
+ .version = LIBLOCKDEP_VERSION,
+ };
+
+ return &n;
+}
+
+#define print_tainted() ""
+#define static_obj(x) 1
+
+#define debug_show_all_locks()
+
+#endif
diff --git a/tools/lib/lockdep/uinclude/linux/module.h b/tools/lib/lockdep/uinclude/linux/module.h
new file mode 100644
index 00000000000..09c7a7be8cc
--- /dev/null
+++ b/tools/lib/lockdep/uinclude/linux/module.h
@@ -0,0 +1,6 @@
+#ifndef _LIBLOCKDEP_LINUX_MODULE_H_
+#define _LIBLOCKDEP_LINUX_MODULE_H_
+
+#define module_param(name, type, perm)
+
+#endif
diff --git a/tools/lib/lockdep/uinclude/linux/mutex.h b/tools/lib/lockdep/uinclude/linux/mutex.h
new file mode 100644
index 00000000000..fab00ff936d
--- /dev/null
+++ b/tools/lib/lockdep/uinclude/linux/mutex.h
@@ -0,0 +1,3 @@
+
+/* empty file */
+
diff --git a/tools/lib/lockdep/uinclude/linux/poison.h b/tools/lib/lockdep/uinclude/linux/poison.h
new file mode 100644
index 00000000000..0c27bdf1423
--- /dev/null
+++ b/tools/lib/lockdep/uinclude/linux/poison.h
@@ -0,0 +1 @@
+#include "../../../include/linux/poison.h"
diff --git a/tools/lib/lockdep/uinclude/linux/prefetch.h b/tools/lib/lockdep/uinclude/linux/prefetch.h
new file mode 100644
index 00000000000..d73fe6f850a
--- /dev/null
+++ b/tools/lib/lockdep/uinclude/linux/prefetch.h
@@ -0,0 +1,6 @@
+#ifndef _LIBLOCKDEP_LINUX_PREFETCH_H_
+#define _LIBLOCKDEP_LINUX_PREFETCH_H_
+
+static inline void prefetch(void *a __attribute__((unused))) { }
+
+#endif
diff --git a/tools/lib/lockdep/uinclude/linux/proc_fs.h b/tools/lib/lockdep/uinclude/linux/proc_fs.h
new file mode 100644
index 00000000000..fab00ff936d
--- /dev/null
+++ b/tools/lib/lockdep/uinclude/linux/proc_fs.h
@@ -0,0 +1,3 @@
+
+/* empty file */
+
diff --git a/tools/lib/lockdep/uinclude/linux/rbtree.h b/tools/lib/lockdep/uinclude/linux/rbtree.h
new file mode 100644
index 00000000000..965901db486
--- /dev/null
+++ b/tools/lib/lockdep/uinclude/linux/rbtree.h
@@ -0,0 +1 @@
+#include "../../../include/linux/rbtree.h"
diff --git a/tools/lib/lockdep/uinclude/linux/rbtree_augmented.h b/tools/lib/lockdep/uinclude/linux/rbtree_augmented.h
new file mode 100644
index 00000000000..c3759477379
--- /dev/null
+++ b/tools/lib/lockdep/uinclude/linux/rbtree_augmented.h
@@ -0,0 +1,2 @@
+#define __always_inline
+#include "../../../include/linux/rbtree_augmented.h"
diff --git a/tools/lib/lockdep/uinclude/linux/rcu.h b/tools/lib/lockdep/uinclude/linux/rcu.h
new file mode 100644
index 00000000000..042ee8e463c
--- /dev/null
+++ b/tools/lib/lockdep/uinclude/linux/rcu.h
@@ -0,0 +1,21 @@
+#ifndef _LIBLOCKDEP_RCU_H_
+#define _LIBLOCKDEP_RCU_H_
+
+int rcu_scheduler_active;
+
+static inline int rcu_lockdep_current_cpu_online(void)
+{
+ return 1;
+}
+
+static inline int rcu_is_cpu_idle(void)
+{
+ return 1;
+}
+
+static inline bool rcu_is_watching(void)
+{
+ return false;
+}
+
+#endif
diff --git a/tools/lib/lockdep/uinclude/linux/seq_file.h b/tools/lib/lockdep/uinclude/linux/seq_file.h
new file mode 100644
index 00000000000..fab00ff936d
--- /dev/null
+++ b/tools/lib/lockdep/uinclude/linux/seq_file.h
@@ -0,0 +1,3 @@
+
+/* empty file */
+
diff --git a/tools/lib/lockdep/uinclude/linux/spinlock.h b/tools/lib/lockdep/uinclude/linux/spinlock.h
new file mode 100644
index 00000000000..68c1aa2bcba
--- /dev/null
+++ b/tools/lib/lockdep/uinclude/linux/spinlock.h
@@ -0,0 +1,25 @@
+#ifndef _LIBLOCKDEP_SPINLOCK_H_
+#define _LIBLOCKDEP_SPINLOCK_H_
+
+#include <pthread.h>
+#include <stdbool.h>
+
+#define arch_spinlock_t pthread_mutex_t
+#define __ARCH_SPIN_LOCK_UNLOCKED PTHREAD_MUTEX_INITIALIZER
+
+static inline void arch_spin_lock(arch_spinlock_t *mutex)
+{
+ pthread_mutex_lock(mutex);
+}
+
+static inline void arch_spin_unlock(arch_spinlock_t *mutex)
+{
+ pthread_mutex_unlock(mutex);
+}
+
+static inline bool arch_spin_is_locked(arch_spinlock_t *mutex)
+{
+ return true;
+}
+
+#endif
diff --git a/tools/lib/lockdep/uinclude/linux/stacktrace.h b/tools/lib/lockdep/uinclude/linux/stacktrace.h
new file mode 100644
index 00000000000..39aecc6b19d
--- /dev/null
+++ b/tools/lib/lockdep/uinclude/linux/stacktrace.h
@@ -0,0 +1,32 @@
+#ifndef _LIBLOCKDEP_LINUX_STACKTRACE_H_
+#define _LIBLOCKDEP_LINUX_STACKTRACE_H_
+
+#include <execinfo.h>
+
+struct stack_trace {
+ unsigned int nr_entries, max_entries;
+ unsigned long *entries;
+ int skip;
+};
+
+static inline void print_stack_trace(struct stack_trace *trace, int spaces)
+{
+ backtrace_symbols_fd((void **)trace->entries, trace->nr_entries, 1);
+}
+
+#define save_stack_trace(trace) \
+ ((trace)->nr_entries = \
+ backtrace((void **)(trace)->entries, (trace)->max_entries))
+
+static inline int dump_stack(void)
+{
+ void *array[64];
+ size_t size;
+
+ size = backtrace(array, 64);
+ backtrace_symbols_fd(array, size, 1);
+
+ return 0;
+}
+
+#endif
diff --git a/tools/lib/lockdep/uinclude/linux/stringify.h b/tools/lib/lockdep/uinclude/linux/stringify.h
new file mode 100644
index 00000000000..05dfcd1ac11
--- /dev/null
+++ b/tools/lib/lockdep/uinclude/linux/stringify.h
@@ -0,0 +1,7 @@
+#ifndef _LIBLOCKDEP_LINUX_STRINGIFY_H_
+#define _LIBLOCKDEP_LINUX_STRINGIFY_H_
+
+#define __stringify_1(x...) #x
+#define __stringify(x...) __stringify_1(x)
+
+#endif
diff --git a/tools/lib/lockdep/uinclude/trace/events/lock.h b/tools/lib/lockdep/uinclude/trace/events/lock.h
new file mode 100644
index 00000000000..fab00ff936d
--- /dev/null
+++ b/tools/lib/lockdep/uinclude/trace/events/lock.h
@@ -0,0 +1,3 @@
+
+/* empty file */
+
diff --git a/tools/lib/symbol/kallsyms.c b/tools/lib/symbol/kallsyms.c
new file mode 100644
index 00000000000..18bc271a4bb
--- /dev/null
+++ b/tools/lib/symbol/kallsyms.c
@@ -0,0 +1,58 @@
+#include "symbol/kallsyms.h"
+#include <stdio.h>
+#include <stdlib.h>
+
+int kallsyms__parse(const char *filename, void *arg,
+ int (*process_symbol)(void *arg, const char *name,
+ char type, u64 start))
+{
+ char *line = NULL;
+ size_t n;
+ int err = -1;
+ FILE *file = fopen(filename, "r");
+
+ if (file == NULL)
+ goto out_failure;
+
+ err = 0;
+
+ while (!feof(file)) {
+ u64 start;
+ int line_len, len;
+ char symbol_type;
+ char *symbol_name;
+
+ line_len = getline(&line, &n, file);
+ if (line_len < 0 || !line)
+ break;
+
+ line[--line_len] = '\0'; /* \n */
+
+ len = hex2u64(line, &start);
+
+ len++;
+ if (len + 2 >= line_len)
+ continue;
+
+ symbol_type = line[len];
+ len += 2;
+ symbol_name = line + len;
+ len = line_len - len;
+
+ if (len >= KSYM_NAME_LEN) {
+ err = -1;
+ break;
+ }
+
+ err = process_symbol(arg, symbol_name, symbol_type, start);
+ if (err)
+ break;
+ }
+
+ free(line);
+ fclose(file);
+ return err;
+
+out_failure:
+ return -1;
+}
diff --git a/tools/lib/symbol/kallsyms.h b/tools/lib/symbol/kallsyms.h
new file mode 100644
index 00000000000..6084f5e18b3
--- /dev/null
+++ b/tools/lib/symbol/kallsyms.h
@@ -0,0 +1,24 @@
+#ifndef __TOOLS_KALLSYMS_H_
+#define __TOOLS_KALLSYMS_H_ 1
+
+#include <elf.h>
+#include <linux/ctype.h>
+#include <linux/types.h>
+
+#ifndef KSYM_NAME_LEN
+#define KSYM_NAME_LEN 256
+#endif
+
+static inline u8 kallsyms2elf_type(char type)
+{
+ if (type == 'W')
+ return STB_WEAK;
+
+ return isupper(type) ? STB_GLOBAL : STB_LOCAL;
+}
+
+int kallsyms__parse(const char *filename, void *arg,
+ int (*process_symbol)(void *arg, const char *name,
+ char type, u64 start));
+
+#endif /* __TOOLS_KALLSYMS_H_ */
diff --git a/tools/lib/traceevent/.gitignore b/tools/lib/traceevent/.gitignore
new file mode 100644
index 00000000000..35f56be5a4c
--- /dev/null
+++ b/tools/lib/traceevent/.gitignore
@@ -0,0 +1 @@
+TRACEEVENT-CFLAGS
diff --git a/tools/lib/traceevent/Makefile b/tools/lib/traceevent/Makefile
new file mode 100644
index 00000000000..005c9cc0693
--- /dev/null
+++ b/tools/lib/traceevent/Makefile
@@ -0,0 +1,340 @@
+# trace-cmd version
+EP_VERSION = 1
+EP_PATCHLEVEL = 1
+EP_EXTRAVERSION = 0
+
+# file format version
+FILE_VERSION = 6
+
+MAKEFLAGS += --no-print-directory
+
+
+# Makefiles suck: This macro sets a default value of $(2) for the
+# variable named by $(1), unless the variable has been set by
+# environment or command line. This is necessary for CC and AR
+# because make sets default values, so the simpler ?= approach
+# won't work as expected.
+define allow-override
+ $(if $(or $(findstring environment,$(origin $(1))),\
+ $(findstring command line,$(origin $(1)))),,\
+ $(eval $(1) = $(2)))
+endef
+
+# Allow setting CC and AR, or setting CROSS_COMPILE as a prefix.
+$(call allow-override,CC,$(CROSS_COMPILE)gcc)
+$(call allow-override,AR,$(CROSS_COMPILE)ar)
+
+EXT = -std=gnu99
+INSTALL = install
+
+# Use DESTDIR for installing into a different root directory.
+# This is useful for building a package. The program will be
+# installed in this directory as if it was the root directory.
+# Then the build tool can move it later.
+DESTDIR ?=
+DESTDIR_SQ = '$(subst ','\'',$(DESTDIR))'
+
+prefix ?= /usr/local
+bindir_relative = bin
+bindir = $(prefix)/$(bindir_relative)
+man_dir = $(prefix)/share/man
+man_dir_SQ = '$(subst ','\'',$(man_dir))'
+
+export man_dir man_dir_SQ INSTALL
+export DESTDIR DESTDIR_SQ
+
+set_plugin_dir := 1
+
+# Set plugin_dir to the preferred global plugin location.
+# If we install under the $HOME directory we go under
+# $(HOME)/.traceevent/plugins
+#
+# We don't set PLUGIN_DIR when installing under the $HOME
+# directory, because the code already looks under
+# $(HOME)/.traceevent/plugins by default.
+#
+ifeq ($(plugin_dir),)
+ifeq ($(prefix),$(HOME))
+override plugin_dir = $(HOME)/.traceevent/plugins
+set_plugin_dir := 0
+else
+override plugin_dir = $(prefix)/lib/traceevent/plugins
+endif
+endif
+
+ifeq ($(set_plugin_dir),1)
+PLUGIN_DIR = -DPLUGIN_DIR="$(plugin_dir)"
+PLUGIN_DIR_SQ = '$(subst ','\'',$(PLUGIN_DIR))'
+endif
+
+include $(if $(BUILD_SRC),$(BUILD_SRC)/)../../scripts/Makefile.include
+
+# copy a bit from Linux kbuild
+
+ifeq ("$(origin V)", "command line")
+ VERBOSE = $(V)
+endif
+ifndef VERBOSE
+ VERBOSE = 0
+endif
+
+ifeq ("$(origin O)", "command line")
+ BUILD_OUTPUT := $(O)
+endif
+
+ifeq ($(BUILD_SRC),)
+ifneq ($(OUTPUT),)
+
+define build_output
+ $(if $(VERBOSE:1=),@)+$(MAKE) -C $(OUTPUT) \
+ BUILD_SRC=$(CURDIR)/ -f $(CURDIR)/Makefile $1
+endef
+
+all: sub-make
+
+$(MAKECMDGOALS): sub-make
+
+sub-make: force
+ $(call build_output, $(MAKECMDGOALS))
+
+
+# Leave processing to above invocation of make
+skip-makefile := 1
+
+endif # OUTPUT
+endif # BUILD_SRC
+
+# We process the rest of the Makefile if this is the final invocation of make
+ifeq ($(skip-makefile),)
+
+srctree := $(if $(BUILD_SRC),$(BUILD_SRC),$(CURDIR))
+objtree := $(CURDIR)
+src := $(srctree)
+obj := $(objtree)
+
+export prefix bindir src obj
+
+# Shell quotes
+bindir_SQ = $(subst ','\'',$(bindir))
+bindir_relative_SQ = $(subst ','\'',$(bindir_relative))
+plugin_dir_SQ = $(subst ','\'',$(plugin_dir))
+
+LIB_FILE = libtraceevent.a libtraceevent.so
+
+CONFIG_INCLUDES =
+CONFIG_LIBS =
+CONFIG_FLAGS =
+
+VERSION = $(EP_VERSION)
+PATCHLEVEL = $(EP_PATCHLEVEL)
+EXTRAVERSION = $(EP_EXTRAVERSION)
+
+OBJ = $@
+N =
+
+export Q VERBOSE
+
+EVENT_PARSE_VERSION = $(EP_VERSION).$(EP_PATCHLEVEL).$(EP_EXTRAVERSION)
+
+INCLUDES = -I. -I $(srctree)/../../include $(CONFIG_INCLUDES)
+
+# Set compile option CFLAGS if not set elsewhere
+CFLAGS ?= -g -Wall
+
+# Append required CFLAGS
+override CFLAGS += $(CONFIG_FLAGS) $(INCLUDES) $(PLUGIN_DIR_SQ)
+override CFLAGS += $(udis86-flags) -D_GNU_SOURCE
+
+ifeq ($(VERBOSE),1)
+ Q =
+else
+ Q = @
+endif
+
+do_compile_shared_library = \
+ ($(print_shared_lib_compile) \
+ $(CC) --shared $^ -o $@)
+
+do_plugin_build = \
+ ($(print_plugin_build) \
+ $(CC) $(CFLAGS) -shared -nostartfiles -o $@ $<)
+
+do_build_static_lib = \
+ ($(print_static_lib_build) \
+ $(RM) $@; $(AR) rcs $@ $^)
+
+
+do_compile = $(QUIET_CC)$(CC) -c $(CFLAGS) $(EXT) $< -o $(obj)/$@;
+
+$(obj)/%.o: $(src)/%.c
+ $(call do_compile)
+
+%.o: $(src)/%.c
+ $(call do_compile)
+
+PEVENT_LIB_OBJS = event-parse.o
+PEVENT_LIB_OBJS += event-plugin.o
+PEVENT_LIB_OBJS += trace-seq.o
+PEVENT_LIB_OBJS += parse-filter.o
+PEVENT_LIB_OBJS += parse-utils.o
+PEVENT_LIB_OBJS += kbuffer-parse.o
+
+PLUGIN_OBJS = plugin_jbd2.o
+PLUGIN_OBJS += plugin_hrtimer.o
+PLUGIN_OBJS += plugin_kmem.o
+PLUGIN_OBJS += plugin_kvm.o
+PLUGIN_OBJS += plugin_mac80211.o
+PLUGIN_OBJS += plugin_sched_switch.o
+PLUGIN_OBJS += plugin_function.o
+PLUGIN_OBJS += plugin_xen.o
+PLUGIN_OBJS += plugin_scsi.o
+PLUGIN_OBJS += plugin_cfg80211.o
+
+PLUGINS := $(PLUGIN_OBJS:.o=.so)
+
+ALL_OBJS = $(PEVENT_LIB_OBJS) $(PLUGIN_OBJS)
+
+CMD_TARGETS = $(LIB_FILE) $(PLUGINS)
+
+TARGETS = $(CMD_TARGETS)
+
+
+all: all_cmd
+
+all_cmd: $(CMD_TARGETS)
+
+libtraceevent.so: $(PEVENT_LIB_OBJS)
+ $(QUIET_LINK)$(CC) --shared $^ -o $@
+
+libtraceevent.a: $(PEVENT_LIB_OBJS)
+ $(QUIET_LINK)$(RM) $@; $(AR) rcs $@ $^
+
+plugins: $(PLUGINS)
+
+$(PEVENT_LIB_OBJS): %.o: $(src)/%.c TRACEEVENT-CFLAGS
+ $(QUIET_CC_FPIC)$(CC) -c $(CFLAGS) $(EXT) -fPIC $< -o $@
+
+$(PLUGIN_OBJS): %.o : $(src)/%.c
+ $(QUIET_CC_FPIC)$(CC) -c $(CFLAGS) -fPIC -o $@ $<
+
+$(PLUGINS): %.so: %.o
+ $(QUIET_LINK)$(CC) $(CFLAGS) -shared -nostartfiles -o $@ $<
+
+define make_version.h
+ (echo '/* This file is automatically generated. Do not modify. */'; \
+ echo \#define VERSION_CODE $(shell \
+ expr $(VERSION) \* 256 + $(PATCHLEVEL)); \
+ echo '#define EXTRAVERSION ' $(EXTRAVERSION); \
+ echo '#define VERSION_STRING "'$(VERSION).$(PATCHLEVEL).$(EXTRAVERSION)'"'; \
+ echo '#define FILE_VERSION '$(FILE_VERSION); \
+ ) > $1
+endef
+
+define update_version.h
+ ($(call make_version.h, $@.tmp); \
+ if [ -r $@ ] && cmp -s $@ $@.tmp; then \
+ rm -f $@.tmp; \
+ else \
+ echo ' UPDATE $@'; \
+ mv -f $@.tmp $@; \
+ fi);
+endef
+
+ep_version.h: force
+ $(Q)$(N)$(call update_version.h)
+
+VERSION_FILES = ep_version.h
+
+define update_dir
+ (echo $1 > $@.tmp; \
+ if [ -r $@ ] && cmp -s $@ $@.tmp; then \
+ rm -f $@.tmp; \
+ else \
+ echo ' UPDATE $@'; \
+ mv -f $@.tmp $@; \
+ fi);
+endef
+
+## make deps
+
+all_objs := $(sort $(ALL_OBJS))
+all_deps := $(all_objs:%.o=.%.d)
+
+# let the .d files also depend on the source and header files
+define check_deps
+ @set -e; $(RM) $@; \
+ $(CC) -MM $(CFLAGS) $< > $@.$$$$; \
+ sed 's,\($*\)\.o[ :]*,\1.o $@ : ,g' < $@.$$$$ > $@; \
+ $(RM) $@.$$$$
+endef
+
+$(all_deps): .%.d: $(src)/%.c
+ $(Q)$(call check_deps)
+
+$(all_objs) : %.o : .%.d
+
+dep_includes := $(wildcard $(all_deps))
+
+ifneq ($(dep_includes),)
+ include $(dep_includes)
+endif
+
+### Detect environment changes
+TRACK_CFLAGS = $(subst ','\'',$(CFLAGS)):$(ARCH):$(CROSS_COMPILE)
+
+TRACEEVENT-CFLAGS: force
+ @FLAGS='$(TRACK_CFLAGS)'; \
+ if test x"$$FLAGS" != x"`cat TRACEEVENT-CFLAGS 2>/dev/null`" ; then \
+ echo 1>&2 " FLAGS: * new build flags or cross compiler"; \
+ echo "$$FLAGS" >TRACEEVENT-CFLAGS; \
+ fi
+
+tags: force
+ $(RM) tags
+ find . -name '*.[ch]' | xargs ctags --extra=+f --c-kinds=+px \
+ --regex-c++='/_PE\(([^,)]*).*/PEVENT_ERRNO__\1/'
+
+TAGS: force
+ $(RM) TAGS
+ find . -name '*.[ch]' | xargs etags \
+ --regex='/_PE(\([^,)]*\).*/PEVENT_ERRNO__\1/'
+
+define do_install
+ if [ ! -d '$(DESTDIR_SQ)$2' ]; then \
+ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$2'; \
+ fi; \
+ $(INSTALL) $1 '$(DESTDIR_SQ)$2'
+endef
+
+define do_install_plugins
+ for plugin in $1; do \
+ $(call do_install,$$plugin,$(plugin_dir_SQ)); \
+ done
+endef
+
+install_lib: all_cmd install_plugins
+ $(call QUIET_INSTALL, $(LIB_FILE)) \
+ $(call do_install,$(LIB_FILE),$(bindir_SQ))
+
+install_plugins: $(PLUGINS)
+ $(call QUIET_INSTALL, trace_plugins) \
+ $(call do_install_plugins, $(PLUGINS))
+
+install: install_lib
+
+clean:
+ $(call QUIET_CLEAN, libtraceevent) \
+ $(RM) *.o *~ $(TARGETS) *.a *.so $(VERSION_FILES) .*.d \
+ $(RM) TRACEEVENT-CFLAGS tags TAGS
+
+endif # skip-makefile
+
+PHONY += force plugins
+force:
+
+plugins:
+ @echo > /dev/null
+
+# Declare the contents of the .PHONY variable as phony. We keep that
+# information in a variable so we can use it in if_changed and friends.
+.PHONY: $(PHONY)
diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c
new file mode 100644
index 00000000000..93825a17dcc
--- /dev/null
+++ b/tools/lib/traceevent/event-parse.c
@@ -0,0 +1,6025 @@
+/*
+ * Copyright (C) 2009, 2010 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License (not later!)
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ *
+ * The parts for function graph printing were taken and modified from the
+ * Linux kernel and were written by
+ * - Copyright (C) 2009 Frederic Weisbecker,
+ * Frederic Weisbecker gave his permission to relicense the code to
+ * the Lesser General Public License.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <ctype.h>
+#include <errno.h>
+#include <stdint.h>
+#include <limits.h>
+
+#include "event-parse.h"
+#include "event-utils.h"
+
+static const char *input_buf;
+static unsigned long long input_buf_ptr;
+static unsigned long long input_buf_siz;
+
+static int is_flag_field;
+static int is_symbolic_field;
+
+static int show_warning = 1;
+
+#define do_warning(fmt, ...) \
+ do { \
+ if (show_warning) \
+ warning(fmt, ##__VA_ARGS__); \
+ } while (0)
+
+#define do_warning_event(event, fmt, ...) \
+ do { \
+ if (!show_warning) \
+ continue; \
+ \
+ if (event) \
+ warning("[%s:%s] " fmt, event->system, \
+ event->name, ##__VA_ARGS__); \
+ else \
+ warning(fmt, ##__VA_ARGS__); \
+ } while (0)
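Both macros rely on the usual do { ... } while (0) wrapper so they can be used as ordinary statements, and the bare continue in do_warning_event() jumps straight to the (false) while test, acting as an early exit from the macro body. A standalone sketch of that idiom (demo_warn is hypothetical, not part of the library):

#include <stdio.h>

#define demo_warn(enabled, msg)					\
	do {							\
		if (!(enabled))					\
			continue;	/* leaves the macro body, as above */ \
		fprintf(stderr, "warning: %s\n", msg);		\
	} while (0)

int main(void)
{
	demo_warn(1, "this one is printed");
	demo_warn(0, "this one is skipped");
	return 0;
}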
+
+static void init_input_buf(const char *buf, unsigned long long size)
+{
+ input_buf = buf;
+ input_buf_siz = size;
+ input_buf_ptr = 0;
+}
+
+const char *pevent_get_input_buf(void)
+{
+ return input_buf;
+}
+
+unsigned long long pevent_get_input_buf_ptr(void)
+{
+ return input_buf_ptr;
+}
+
+struct event_handler {
+ struct event_handler *next;
+ int id;
+ const char *sys_name;
+ const char *event_name;
+ pevent_event_handler_func func;
+ void *context;
+};
+
+struct pevent_func_params {
+ struct pevent_func_params *next;
+ enum pevent_func_arg_type type;
+};
+
+struct pevent_function_handler {
+ struct pevent_function_handler *next;
+ enum pevent_func_arg_type ret_type;
+ char *name;
+ pevent_func_handler func;
+ struct pevent_func_params *params;
+ int nr_args;
+};
+
+static unsigned long long
+process_defined_func(struct trace_seq *s, void *data, int size,
+ struct event_format *event, struct print_arg *arg);
+
+static void free_func_handle(struct pevent_function_handler *func);
+
+/**
+ * pevent_buffer_init - init buffer for parsing
+ * @buf: buffer to parse
+ * @size: the size of the buffer
+ *
+ * For use with pevent_read_token(), this initializes the internal
+ * buffer that pevent_read_token() will parse.
+ */
+void pevent_buffer_init(const char *buf, unsigned long long size)
+{
+ init_input_buf(buf, size);
+}
+
+void breakpoint(void)
+{
+ static int x;
+ x++;
+}
+
+struct print_arg *alloc_arg(void)
+{
+ return calloc(1, sizeof(struct print_arg));
+}
+
+struct cmdline {
+ char *comm;
+ int pid;
+};
+
+static int cmdline_cmp(const void *a, const void *b)
+{
+ const struct cmdline *ca = a;
+ const struct cmdline *cb = b;
+
+ if (ca->pid < cb->pid)
+ return -1;
+ if (ca->pid > cb->pid)
+ return 1;
+
+ return 0;
+}
+
+struct cmdline_list {
+ struct cmdline_list *next;
+ char *comm;
+ int pid;
+};
+
+static int cmdline_init(struct pevent *pevent)
+{
+ struct cmdline_list *cmdlist = pevent->cmdlist;
+ struct cmdline_list *item;
+ struct cmdline *cmdlines;
+ int i;
+
+ cmdlines = malloc(sizeof(*cmdlines) * pevent->cmdline_count);
+ if (!cmdlines)
+ return -1;
+
+ i = 0;
+ while (cmdlist) {
+ cmdlines[i].pid = cmdlist->pid;
+ cmdlines[i].comm = cmdlist->comm;
+ i++;
+ item = cmdlist;
+ cmdlist = cmdlist->next;
+ free(item);
+ }
+
+ qsort(cmdlines, pevent->cmdline_count, sizeof(*cmdlines), cmdline_cmp);
+
+ pevent->cmdlines = cmdlines;
+ pevent->cmdlist = NULL;
+
+ return 0;
+}
+
+static const char *find_cmdline(struct pevent *pevent, int pid)
+{
+ const struct cmdline *comm;
+ struct cmdline key;
+
+ if (!pid)
+ return "<idle>";
+
+ if (!pevent->cmdlines && cmdline_init(pevent))
+ return "<not enough memory for cmdlines!>";
+
+ key.pid = pid;
+
+ comm = bsearch(&key, pevent->cmdlines, pevent->cmdline_count,
+ sizeof(*pevent->cmdlines), cmdline_cmp);
+
+ if (comm)
+ return comm->comm;
+ return "<...>";
+}
+
+/**
+ * pevent_pid_is_registered - return if a pid has a cmdline registered
+ * @pevent: handle for the pevent
+ * @pid: The pid to check if it has a cmdline registered with.
+ *
+ * Returns 1 if the pid has a cmdline mapped to it
+ * 0 otherwise.
+ */
+int pevent_pid_is_registered(struct pevent *pevent, int pid)
+{
+ const struct cmdline *comm;
+ struct cmdline key;
+
+ if (!pid)
+ return 1;
+
+ if (!pevent->cmdlines && cmdline_init(pevent))
+ return 0;
+
+ key.pid = pid;
+
+ comm = bsearch(&key, pevent->cmdlines, pevent->cmdline_count,
+ sizeof(*pevent->cmdlines), cmdline_cmp);
+
+ if (comm)
+ return 1;
+ return 0;
+}
+
+/*
+ * If the command lines have been converted to an array, then
+ * we must add this pid. This is much slower than when cmdlines
+ * are added before the array is initialized.
+ */
+static int add_new_comm(struct pevent *pevent, const char *comm, int pid)
+{
+ struct cmdline *cmdlines = pevent->cmdlines;
+ const struct cmdline *cmdline;
+ struct cmdline key;
+
+ if (!pid)
+ return 0;
+
+ /* avoid duplicates */
+ key.pid = pid;
+
+ cmdline = bsearch(&key, pevent->cmdlines, pevent->cmdline_count,
+ sizeof(*pevent->cmdlines), cmdline_cmp);
+ if (cmdline) {
+ errno = EEXIST;
+ return -1;
+ }
+
+ cmdlines = realloc(cmdlines, sizeof(*cmdlines) * (pevent->cmdline_count + 1));
+ if (!cmdlines) {
+ errno = ENOMEM;
+ return -1;
+ }
+
+ /* The realloc may have moved the array; keep the handle consistent. */
+ pevent->cmdlines = cmdlines;
+
+ cmdlines[pevent->cmdline_count].comm = strdup(comm);
+ if (!cmdlines[pevent->cmdline_count].comm) {
+ errno = ENOMEM;
+ return -1;
+ }
+
+ cmdlines[pevent->cmdline_count].pid = pid;
+ pevent->cmdline_count++;
+
+ qsort(cmdlines, pevent->cmdline_count, sizeof(*cmdlines), cmdline_cmp);
+
+ return 0;
+}
+
+/**
+ * pevent_register_comm - register a pid / comm mapping
+ * @pevent: handle for the pevent
+ * @comm: the command line to register
+ * @pid: the pid to map the command line to
+ *
+ * This adds a mapping to search for command line names with
+ * a given pid. The comm is duplicated.
+ */
+int pevent_register_comm(struct pevent *pevent, const char *comm, int pid)
+{
+ struct cmdline_list *item;
+
+ if (pevent->cmdlines)
+ return add_new_comm(pevent, comm, pid);
+
+ item = malloc(sizeof(*item));
+ if (!item)
+ return -1;
+
+ item->comm = strdup(comm);
+ if (!item->comm) {
+ free(item);
+ return -1;
+ }
+ item->pid = pid;
+ item->next = pevent->cmdlist;
+
+ pevent->cmdlist = item;
+ pevent->cmdline_count++;
+
+ return 0;
+}
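Registered comms accumulate on a linked list and are only converted into the sorted array used by bsearch() on the first lookup; add_new_comm() above handles the slower post-conversion case. A hedged usage sketch, assuming pevent_alloc() and pevent_free() from event-parse.h:

#include <stdio.h>

#include "event-parse.h"

int main(void)
{
	struct pevent *pevent = pevent_alloc();

	if (!pevent)
		return 1;

	if (pevent_register_comm(pevent, "bash", 1234) < 0)
		fprintf(stderr, "failed to register comm\n");

	/* 1 if a cmdline is mapped to the pid, 0 otherwise */
	printf("pid 1234 registered: %d\n",
	       pevent_pid_is_registered(pevent, 1234));

	pevent_free(pevent);
	return 0;
}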
+
+void pevent_register_trace_clock(struct pevent *pevent, char *trace_clock)
+{
+ pevent->trace_clock = trace_clock;
+}
+
+struct func_map {
+ unsigned long long addr;
+ char *func;
+ char *mod;
+};
+
+struct func_list {
+ struct func_list *next;
+ unsigned long long addr;
+ char *func;
+ char *mod;
+};
+
+static int func_cmp(const void *a, const void *b)
+{
+ const struct func_map *fa = a;
+ const struct func_map *fb = b;
+
+ if (fa->addr < fb->addr)
+ return -1;
+ if (fa->addr > fb->addr)
+ return 1;
+
+ return 0;
+}
+
+/*
+ * We are searching for a record in between, not an exact
+ * match.
+ */
+static int func_bcmp(const void *a, const void *b)
+{
+ const struct func_map *fa = a;
+ const struct func_map *fb = b;
+
+ if ((fa->addr == fb->addr) ||
+ (fa->addr > fb->addr &&
+ fa->addr < (fb+1)->addr))
+ return 0;
+
+ if (fa->addr < fb->addr)
+ return -1;
+
+ return 1;
+}
+
+static int func_map_init(struct pevent *pevent)
+{
+ struct func_list *funclist;
+ struct func_list *item;
+ struct func_map *func_map;
+ int i;
+
+ func_map = malloc(sizeof(*func_map) * (pevent->func_count + 1));
+ if (!func_map)
+ return -1;
+
+ funclist = pevent->funclist;
+
+ i = 0;
+ while (funclist) {
+ func_map[i].func = funclist->func;
+ func_map[i].addr = funclist->addr;
+ func_map[i].mod = funclist->mod;
+ i++;
+ item = funclist;
+ funclist = funclist->next;
+ free(item);
+ }
+
+ qsort(func_map, pevent->func_count, sizeof(*func_map), func_cmp);
+
+ /*
+ * Add a special record at the end.
+ */
+ func_map[pevent->func_count].func = NULL;
+ func_map[pevent->func_count].addr = 0;
+ func_map[pevent->func_count].mod = NULL;
+
+ pevent->func_map = func_map;
+ pevent->funclist = NULL;
+
+ return 0;
+}
+
+static struct func_map *
+find_func(struct pevent *pevent, unsigned long long addr)
+{
+ struct func_map *func;
+ struct func_map key;
+
+ if (!pevent->func_map)
+ func_map_init(pevent);
+
+ key.addr = addr;
+
+ func = bsearch(&key, pevent->func_map, pevent->func_count,
+ sizeof(*pevent->func_map), func_bcmp);
+
+ return func;
+}
+
+/**
+ * pevent_find_function - find a function by a given address
+ * @pevent: handle for the pevent
+ * @addr: the address to find the function with
+ *
+ * Returns a pointer to the function stored that has the given
+ * address. Note, the address does not have to be exact, it
+ * will select the function that would contain the address.
+ */
+const char *pevent_find_function(struct pevent *pevent, unsigned long long addr)
+{
+ struct func_map *map;
+
+ map = find_func(pevent, addr);
+ if (!map)
+ return NULL;
+
+ return map->func;
+}
+
+/**
+ * pevent_find_function_address - find a function address by a given address
+ * @pevent: handle for the pevent
+ * @addr: the address to find the function with
+ *
+ * Returns the address the function starts at. This can be used in
+ * conjunction with pevent_find_function to print both the function
+ * name and the function offset.
+ */
+unsigned long long
+pevent_find_function_address(struct pevent *pevent, unsigned long long addr)
+{
+ struct func_map *map;
+
+ map = find_func(pevent, addr);
+ if (!map)
+ return 0;
+
+ return map->addr;
+}
+
+/**
+ * pevent_register_function - register a function with a given address
+ * @pevent: handle for the pevent
+ * @function: the function name to register
+ * @addr: the address the function starts at
+ * @mod: the kernel module the function may be in (NULL for none)
+ *
+ * This registers a function name with an address and module.
+ * The @func passed in is duplicated.
+ */
+int pevent_register_function(struct pevent *pevent, char *func,
+ unsigned long long addr, char *mod)
+{
+ struct func_list *item = malloc(sizeof(*item));
+
+ if (!item)
+ return -1;
+
+ item->next = pevent->funclist;
+ item->func = strdup(func);
+ if (!item->func)
+ goto out_free;
+
+ if (mod) {
+ item->mod = strdup(mod);
+ if (!item->mod)
+ goto out_free_func;
+ } else
+ item->mod = NULL;
+ item->addr = addr;
+
+ pevent->funclist = item;
+ pevent->func_count++;
+
+ return 0;
+
+out_free_func:
+ free(item->func);
+ item->func = NULL;
+out_free:
+ free(item);
+ errno = ENOMEM;
+ return -1;
+}
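Function registration follows the same lazy pattern: entries pile up on funclist and are sorted into func_map on the first lookup, and because find_func() uses the range comparator func_bcmp(), pevent_find_function() resolves any address that falls inside a registered function. A hedged sketch (the addresses and module name are made up; assumes pevent_alloc()/pevent_free() from event-parse.h):

#include <stdio.h>

#include "event-parse.h"

int main(void)
{
	struct pevent *pevent = pevent_alloc();
	const char *name;

	if (!pevent)
		return 1;

	pevent_register_function(pevent, "do_sys_open", 0xffffffff81200000ULL, NULL);
	pevent_register_function(pevent, "ext4_file_open", 0xffffffff81210000ULL, "ext4");

	/* Not an exact match: the containing function is returned. */
	name = pevent_find_function(pevent, 0xffffffff81200040ULL);
	printf("%s\n", name ? name : "<not found>");

	pevent_free(pevent);
	return 0;
}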
+
+/**
+ * pevent_print_funcs - print out the stored functions
+ * @pevent: handle for the pevent
+ *
+ * This prints out the stored functions.
+ */
+void pevent_print_funcs(struct pevent *pevent)
+{
+ int i;
+
+ if (!pevent->func_map)
+ func_map_init(pevent);
+
+ for (i = 0; i < (int)pevent->func_count; i++) {
+ printf("%016llx %s",
+ pevent->func_map[i].addr,
+ pevent->func_map[i].func);
+ if (pevent->func_map[i].mod)
+ printf(" [%s]\n", pevent->func_map[i].mod);
+ else
+ printf("\n");
+ }
+}
+
+struct printk_map {
+ unsigned long long addr;
+ char *printk;
+};
+
+struct printk_list {
+ struct printk_list *next;
+ unsigned long long addr;
+ char *printk;
+};
+
+static int printk_cmp(const void *a, const void *b)
+{
+ const struct printk_map *pa = a;
+ const struct printk_map *pb = b;
+
+ if (pa->addr < pb->addr)
+ return -1;
+ if (pa->addr > pb->addr)
+ return 1;
+
+ return 0;
+}
+
+static int printk_map_init(struct pevent *pevent)
+{
+ struct printk_list *printklist;
+ struct printk_list *item;
+ struct printk_map *printk_map;
+ int i;
+
+ printk_map = malloc(sizeof(*printk_map) * (pevent->printk_count + 1));
+ if (!printk_map)
+ return -1;
+
+ printklist = pevent->printklist;
+
+ i = 0;
+ while (printklist) {
+ printk_map[i].printk = printklist->printk;
+ printk_map[i].addr = printklist->addr;
+ i++;
+ item = printklist;
+ printklist = printklist->next;
+ free(item);
+ }
+
+ qsort(printk_map, pevent->printk_count, sizeof(*printk_map), printk_cmp);
+
+ pevent->printk_map = printk_map;
+ pevent->printklist = NULL;
+
+ return 0;
+}
+
+static struct printk_map *
+find_printk(struct pevent *pevent, unsigned long long addr)
+{
+ struct printk_map *printk;
+ struct printk_map key;
+
+ if (!pevent->printk_map && printk_map_init(pevent))
+ return NULL;
+
+ key.addr = addr;
+
+ printk = bsearch(&key, pevent->printk_map, pevent->printk_count,
+ sizeof(*pevent->printk_map), printk_cmp);
+
+ return printk;
+}
+
+/**
+ * pevent_register_print_string - register a string by its address
+ * @pevent: handle for the pevent
+ * @fmt: the string format to register
+ * @addr: the address the string was located at
+ *
+ * This registers a string by the address it was stored in the kernel.
+ * The @fmt passed in is duplicated.
+ */
+int pevent_register_print_string(struct pevent *pevent, const char *fmt,
+ unsigned long long addr)
+{
+ struct printk_list *item = malloc(sizeof(*item));
+ char *p;
+
+ if (!item)
+ return -1;
+
+ item->next = pevent->printklist;
+ item->addr = addr;
+
+ /* Strip off quotes and '\n' from the end */
+ if (fmt[0] == '"')
+ fmt++;
+ item->printk = strdup(fmt);
+ if (!item->printk)
+ goto out_free;
+
+ p = item->printk + strlen(item->printk) - 1;
+ if (*p == '"')
+ *p = 0;
+
+ p -= 2;
+ if (strcmp(p, "\\n") == 0)
+ *p = 0;
+
+ pevent->printklist = item;
+ pevent->printk_count++;
+
+ return 0;
+
+out_free:
+ free(item);
+ errno = ENOMEM;
+ return -1;
+}
+
+/**
+ * pevent_print_printk - print out the stored strings
+ * @pevent: handle for the pevent
+ *
+ * This prints the string formats that were stored.
+ */
+void pevent_print_printk(struct pevent *pevent)
+{
+ int i;
+
+ if (!pevent->printk_map)
+ printk_map_init(pevent);
+
+ for (i = 0; i < (int)pevent->printk_count; i++) {
+ printf("%016llx %s\n",
+ pevent->printk_map[i].addr,
+ pevent->printk_map[i].printk);
+ }
+}
+
+static struct event_format *alloc_event(void)
+{
+ return calloc(1, sizeof(struct event_format));
+}
+
+static int add_event(struct pevent *pevent, struct event_format *event)
+{
+ int i;
+ struct event_format **events = realloc(pevent->events, sizeof(event) *
+ (pevent->nr_events + 1));
+ if (!events)
+ return -1;
+
+ pevent->events = events;
+
+ for (i = 0; i < pevent->nr_events; i++) {
+ if (pevent->events[i]->id > event->id)
+ break;
+ }
+ if (i < pevent->nr_events)
+ memmove(&pevent->events[i + 1],
+ &pevent->events[i],
+ sizeof(event) * (pevent->nr_events - i));
+
+ pevent->events[i] = event;
+ pevent->nr_events++;
+
+ event->pevent = pevent;
+
+ return 0;
+}
+
+static int event_item_type(enum event_type type)
+{
+ switch (type) {
+ case EVENT_ITEM ... EVENT_SQUOTE:
+ return 1;
+ case EVENT_ERROR ... EVENT_DELIM:
+ default:
+ return 0;
+ }
+}
+
+static void free_flag_sym(struct print_flag_sym *fsym)
+{
+ struct print_flag_sym *next;
+
+ while (fsym) {
+ next = fsym->next;
+ free(fsym->value);
+ free(fsym->str);
+ free(fsym);
+ fsym = next;
+ }
+}
+
+static void free_arg(struct print_arg *arg)
+{
+ struct print_arg *farg;
+
+ if (!arg)
+ return;
+
+ switch (arg->type) {
+ case PRINT_ATOM:
+ free(arg->atom.atom);
+ break;
+ case PRINT_FIELD:
+ free(arg->field.name);
+ break;
+ case PRINT_FLAGS:
+ free_arg(arg->flags.field);
+ free(arg->flags.delim);
+ free_flag_sym(arg->flags.flags);
+ break;
+ case PRINT_SYMBOL:
+ free_arg(arg->symbol.field);
+ free_flag_sym(arg->symbol.symbols);
+ break;
+ case PRINT_HEX:
+ free_arg(arg->hex.field);
+ free_arg(arg->hex.size);
+ break;
+ case PRINT_TYPE:
+ free(arg->typecast.type);
+ free_arg(arg->typecast.item);
+ break;
+ case PRINT_STRING:
+ case PRINT_BSTRING:
+ free(arg->string.string);
+ break;
+ case PRINT_BITMASK:
+ free(arg->bitmask.bitmask);
+ break;
+ case PRINT_DYNAMIC_ARRAY:
+ free(arg->dynarray.index);
+ break;
+ case PRINT_OP:
+ free(arg->op.op);
+ free_arg(arg->op.left);
+ free_arg(arg->op.right);
+ break;
+ case PRINT_FUNC:
+ while (arg->func.args) {
+ farg = arg->func.args;
+ arg->func.args = farg->next;
+ free_arg(farg);
+ }
+ break;
+
+ case PRINT_NULL:
+ default:
+ break;
+ }
+
+ free(arg);
+}
+
+static enum event_type get_type(int ch)
+{
+ if (ch == '\n')
+ return EVENT_NEWLINE;
+ if (isspace(ch))
+ return EVENT_SPACE;
+ if (isalnum(ch) || ch == '_')
+ return EVENT_ITEM;
+ if (ch == '\'')
+ return EVENT_SQUOTE;
+ if (ch == '"')
+ return EVENT_DQUOTE;
+ if (!isprint(ch))
+ return EVENT_NONE;
+ if (ch == '(' || ch == ')' || ch == ',')
+ return EVENT_DELIM;
+
+ return EVENT_OP;
+}
+
+static int __read_char(void)
+{
+ if (input_buf_ptr >= input_buf_siz)
+ return -1;
+
+ return input_buf[input_buf_ptr++];
+}
+
+static int __peek_char(void)
+{
+ if (input_buf_ptr >= input_buf_siz)
+ return -1;
+
+ return input_buf[input_buf_ptr];
+}
+
+/**
+ * pevent_peek_char - peek at the next character that will be read
+ *
+ * Returns the next character read, or -1 if end of buffer.
+ */
+int pevent_peek_char(void)
+{
+ return __peek_char();
+}
+
+static int extend_token(char **tok, char *buf, int size)
+{
+ char *newtok = realloc(*tok, size);
+
+ if (!newtok) {
+ free(*tok);
+ *tok = NULL;
+ return -1;
+ }
+
+ if (!*tok)
+ strcpy(newtok, buf);
+ else
+ strcat(newtok, buf);
+ *tok = newtok;
+
+ return 0;
+}
+
+static enum event_type force_token(const char *str, char **tok);
+
+static enum event_type __read_token(char **tok)
+{
+ char buf[BUFSIZ];
+ int ch, last_ch, quote_ch, next_ch;
+ int i = 0;
+ int tok_size = 0;
+ enum event_type type;
+
+ *tok = NULL;
+
+
+ ch = __read_char();
+ if (ch < 0)
+ return EVENT_NONE;
+
+ type = get_type(ch);
+ if (type == EVENT_NONE)
+ return type;
+
+ buf[i++] = ch;
+
+ switch (type) {
+ case EVENT_NEWLINE:
+ case EVENT_DELIM:
+ if (asprintf(tok, "%c", ch) < 0)
+ return EVENT_ERROR;
+
+ return type;
+
+ case EVENT_OP:
+ switch (ch) {
+ case '-':
+ next_ch = __peek_char();
+ if (next_ch == '>') {
+ buf[i++] = __read_char();
+ break;
+ }
+ /* fall through */
+ case '+':
+ case '|':
+ case '&':
+ case '>':
+ case '<':
+ last_ch = ch;
+ ch = __peek_char();
+ if (ch != last_ch)
+ goto test_equal;
+ buf[i++] = __read_char();
+ switch (last_ch) {
+ case '>':
+ case '<':
+ goto test_equal;
+ default:
+ break;
+ }
+ break;
+ case '!':
+ case '=':
+ goto test_equal;
+ default: /* what should we do instead? */
+ break;
+ }
+ buf[i] = 0;
+ *tok = strdup(buf);
+ return type;
+
+ test_equal:
+ ch = __peek_char();
+ if (ch == '=')
+ buf[i++] = __read_char();
+ goto out;
+
+ case EVENT_DQUOTE:
+ case EVENT_SQUOTE:
+ /* don't keep quotes */
+ i--;
+ quote_ch = ch;
+ last_ch = 0;
+ concat:
+ do {
+ if (i == (BUFSIZ - 1)) {
+ buf[i] = 0;
+ tok_size += BUFSIZ;
+
+ if (extend_token(tok, buf, tok_size) < 0)
+ return EVENT_NONE;
+ i = 0;
+ }
+ last_ch = ch;
+ ch = __read_char();
+ buf[i++] = ch;
+ /* the '\' '\' will cancel itself */
+ if (ch == '\\' && last_ch == '\\')
+ last_ch = 0;
+ } while (ch != quote_ch || last_ch == '\\');
+ /* remove the last quote */
+ i--;
+
+ /*
+ * For strings (double quotes) check the next token.
+ * If it is another string, concatenate the two.
+ */
+ if (type == EVENT_DQUOTE) {
+ unsigned long long save_input_buf_ptr = input_buf_ptr;
+
+ do {
+ ch = __read_char();
+ } while (isspace(ch));
+ if (ch == '"')
+ goto concat;
+ input_buf_ptr = save_input_buf_ptr;
+ }
+
+ goto out;
+
+ case EVENT_ERROR ... EVENT_SPACE:
+ case EVENT_ITEM:
+ default:
+ break;
+ }
+
+ while (get_type(__peek_char()) == type) {
+ if (i == (BUFSIZ - 1)) {
+ buf[i] = 0;
+ tok_size += BUFSIZ;
+
+ if (extend_token(tok, buf, tok_size) < 0)
+ return EVENT_NONE;
+ i = 0;
+ }
+ ch = __read_char();
+ buf[i++] = ch;
+ }
+
+ out:
+ buf[i] = 0;
+ if (extend_token(tok, buf, tok_size + i + 1) < 0)
+ return EVENT_NONE;
+
+ if (type == EVENT_ITEM) {
+ /*
+ * Older versions of the kernel have a bug that
+ * creates invalid symbols and will break the mac80211
+ * parsing. This is a workaround for that bug.
+ *
+ * See Linux kernel commit:
+ * 811cb50baf63461ce0bdb234927046131fc7fa8b
+ */
+ if (strcmp(*tok, "LOCAL_PR_FMT") == 0) {
+ free(*tok);
+ *tok = NULL;
+ return force_token("\"%s\" ", tok);
+ } else if (strcmp(*tok, "STA_PR_FMT") == 0) {
+ free(*tok);
+ *tok = NULL;
+ return force_token("\" sta:%pM\" ", tok);
+ } else if (strcmp(*tok, "VIF_PR_FMT") == 0) {
+ free(*tok);
+ *tok = NULL;
+ return force_token("\" vif:%p(%d)\" ", tok);
+ }
+ }
+
+ return type;
+}
+
+static enum event_type force_token(const char *str, char **tok)
+{
+ const char *save_input_buf;
+ unsigned long long save_input_buf_ptr;
+ unsigned long long save_input_buf_siz;
+ enum event_type type;
+
+ /* save off the current input pointers */
+ save_input_buf = input_buf;
+ save_input_buf_ptr = input_buf_ptr;
+ save_input_buf_siz = input_buf_siz;
+
+ init_input_buf(str, strlen(str));
+
+ type = __read_token(tok);
+
+ /* reset back to original token */
+ input_buf = save_input_buf;
+ input_buf_ptr = save_input_buf_ptr;
+ input_buf_siz = save_input_buf_siz;
+
+ return type;
+}
+
+static void free_token(char *tok)
+{
+ if (tok)
+ free(tok);
+}
+
+static enum event_type read_token(char **tok)
+{
+ enum event_type type;
+
+ for (;;) {
+ type = __read_token(tok);
+ if (type != EVENT_SPACE)
+ return type;
+
+ free_token(*tok);
+ }
+
+ /* not reached */
+ *tok = NULL;
+ return EVENT_NONE;
+}
+
+/**
+ * pevent_read_token - access to utilities to use the pevent parser
+ * @tok: The token to return
+ *
+ * This will parse tokens from the string given by
+ * pevent_init_data().
+ *
+ * Returns the token type.
+ */
+enum event_type pevent_read_token(char **tok)
+{
+ return read_token(tok);
+}
+
+/**
+ * pevent_free_token - free a token returned by pevent_read_token
+ * @token: the token to free
+ */
+void pevent_free_token(char *token)
+{
+ free_token(token);
+}
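Together, pevent_buffer_init(), pevent_read_token() and pevent_free_token() expose the tokenizer to library users. A hedged sketch that walks the tokens of a small expression (assumes the declarations in event-parse.h):

#include <stdio.h>
#include <string.h>

#include "event-parse.h"

int main(void)
{
	const char *expr = "REC->pid != 0";
	enum event_type type;
	char *tok;

	pevent_buffer_init(expr, strlen(expr));

	/* Spaces are skipped internally; stop at end of buffer or on error. */
	while ((type = pevent_read_token(&tok)) != EVENT_NONE &&
	       type != EVENT_ERROR) {
		printf("type=%d token='%s'\n", type, tok);
		pevent_free_token(tok);
	}
	return 0;
}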
+
+/* no newline */
+static enum event_type read_token_item(char **tok)
+{
+ enum event_type type;
+
+ for (;;) {
+ type = __read_token(tok);
+ if (type != EVENT_SPACE && type != EVENT_NEWLINE)
+ return type;
+ free_token(*tok);
+ *tok = NULL;
+ }
+
+ /* not reached */
+ *tok = NULL;
+ return EVENT_NONE;
+}
+
+static int test_type(enum event_type type, enum event_type expect)
+{
+ if (type != expect) {
+ do_warning("Error: expected type %d but read %d",
+ expect, type);
+ return -1;
+ }
+ return 0;
+}
+
+static int test_type_token(enum event_type type, const char *token,
+ enum event_type expect, const char *expect_tok)
+{
+ if (type != expect) {
+ do_warning("Error: expected type %d but read %d",
+ expect, type);
+ return -1;
+ }
+
+ if (strcmp(token, expect_tok) != 0) {
+ do_warning("Error: expected '%s' but read '%s'",
+ expect_tok, token);
+ return -1;
+ }
+ return 0;
+}
+
+static int __read_expect_type(enum event_type expect, char **tok, int newline_ok)
+{
+ enum event_type type;
+
+ if (newline_ok)
+ type = read_token(tok);
+ else
+ type = read_token_item(tok);
+ return test_type(type, expect);
+}
+
+static int read_expect_type(enum event_type expect, char **tok)
+{
+ return __read_expect_type(expect, tok, 1);
+}
+
+static int __read_expected(enum event_type expect, const char *str,
+ int newline_ok)
+{
+ enum event_type type;
+ char *token;
+ int ret;
+
+ if (newline_ok)
+ type = read_token(&token);
+ else
+ type = read_token_item(&token);
+
+ ret = test_type_token(type, token, expect, str);
+
+ free_token(token);
+
+ return ret;
+}
+
+static int read_expected(enum event_type expect, const char *str)
+{
+ return __read_expected(expect, str, 1);
+}
+
+static int read_expected_item(enum event_type expect, const char *str)
+{
+ return __read_expected(expect, str, 0);
+}
+
+static char *event_read_name(void)
+{
+ char *token;
+
+ if (read_expected(EVENT_ITEM, "name") < 0)
+ return NULL;
+
+ if (read_expected(EVENT_OP, ":") < 0)
+ return NULL;
+
+ if (read_expect_type(EVENT_ITEM, &token) < 0)
+ goto fail;
+
+ return token;
+
+ fail:
+ free_token(token);
+ return NULL;
+}
+
+static int event_read_id(void)
+{
+ char *token;
+ int id;
+
+ if (read_expected_item(EVENT_ITEM, "ID") < 0)
+ return -1;
+
+ if (read_expected(EVENT_OP, ":") < 0)
+ return -1;
+
+ if (read_expect_type(EVENT_ITEM, &token) < 0)
+ goto fail;
+
+ id = strtoul(token, NULL, 0);
+ free_token(token);
+ return id;
+
+ fail:
+ free_token(token);
+ return -1;
+}
+
+static int field_is_string(struct format_field *field)
+{
+ if ((field->flags & FIELD_IS_ARRAY) &&
+ (strstr(field->type, "char") || strstr(field->type, "u8") ||
+ strstr(field->type, "s8")))
+ return 1;
+
+ return 0;
+}
+
+static int field_is_dynamic(struct format_field *field)
+{
+ if (strncmp(field->type, "__data_loc", 10) == 0)
+ return 1;
+
+ return 0;
+}
+
+static int field_is_long(struct format_field *field)
+{
+ /* includes long long */
+ if (strstr(field->type, "long"))
+ return 1;
+
+ return 0;
+}
+
+static unsigned int type_size(const char *name)
+{
+ /* This covers all FIELD_IS_STRING types. */
+ static struct {
+ const char *type;
+ unsigned int size;
+ } table[] = {
+ { "u8", 1 },
+ { "u16", 2 },
+ { "u32", 4 },
+ { "u64", 8 },
+ { "s8", 1 },
+ { "s16", 2 },
+ { "s32", 4 },
+ { "s64", 8 },
+ { "char", 1 },
+ { },
+ };
+ int i;
+
+ for (i = 0; table[i].type; i++) {
+ if (!strcmp(table[i].type, name))
+ return table[i].size;
+ }
+
+ return 0;
+}
+
+static int event_read_fields(struct event_format *event, struct format_field **fields)
+{
+ struct format_field *field = NULL;
+ enum event_type type;
+ char *token;
+ char *last_token;
+ int count = 0;
+
+ do {
+ unsigned int size_dynamic = 0;
+
+ type = read_token(&token);
+ if (type == EVENT_NEWLINE) {
+ free_token(token);
+ return count;
+ }
+
+ count++;
+
+ if (test_type_token(type, token, EVENT_ITEM, "field"))
+ goto fail;
+ free_token(token);
+
+ type = read_token(&token);
+ /*
+ * The ftrace fields may still use the "special" name.
+ * Just ignore it.
+ */
+ if (event->flags & EVENT_FL_ISFTRACE &&
+ type == EVENT_ITEM && strcmp(token, "special") == 0) {
+ free_token(token);
+ type = read_token(&token);
+ }
+
+ if (test_type_token(type, token, EVENT_OP, ":") < 0)
+ goto fail;
+
+ free_token(token);
+ if (read_expect_type(EVENT_ITEM, &token) < 0)
+ goto fail;
+
+ last_token = token;
+
+ field = calloc(1, sizeof(*field));
+ if (!field)
+ goto fail;
+
+ field->event = event;
+
+ /* read the rest of the type */
+ for (;;) {
+ type = read_token(&token);
+ if (type == EVENT_ITEM ||
+ (type == EVENT_OP && strcmp(token, "*") == 0) ||
+ /*
+ * Some of the ftrace fields are broken and have
+ * an illegal "." in them.
+ */
+ (event->flags & EVENT_FL_ISFTRACE &&
+ type == EVENT_OP && strcmp(token, ".") == 0)) {
+
+ if (strcmp(token, "*") == 0)
+ field->flags |= FIELD_IS_POINTER;
+
+ if (field->type) {
+ char *new_type;
+ new_type = realloc(field->type,
+ strlen(field->type) +
+ strlen(last_token) + 2);
+ if (!new_type) {
+ free(last_token);
+ goto fail;
+ }
+ field->type = new_type;
+ strcat(field->type, " ");
+ strcat(field->type, last_token);
+ free(last_token);
+ } else
+ field->type = last_token;
+ last_token = token;
+ continue;
+ }
+
+ break;
+ }
+
+ if (!field->type) {
+ do_warning_event(event, "%s: no type found", __func__);
+ goto fail;
+ }
+ field->name = last_token;
+
+ if (test_type(type, EVENT_OP))
+ goto fail;
+
+ if (strcmp(token, "[") == 0) {
+ enum event_type last_type = type;
+ char *brackets = token;
+ char *new_brackets;
+ int len;
+
+ field->flags |= FIELD_IS_ARRAY;
+
+ type = read_token(&token);
+
+ if (type == EVENT_ITEM)
+ field->arraylen = strtoul(token, NULL, 0);
+ else
+ field->arraylen = 0;
+
+ while (strcmp(token, "]") != 0) {
+ if (last_type == EVENT_ITEM &&
+ type == EVENT_ITEM)
+ len = 2;
+ else
+ len = 1;
+ last_type = type;
+
+ new_brackets = realloc(brackets,
+ strlen(brackets) +
+ strlen(token) + len);
+ if (!new_brackets) {
+ free(brackets);
+ goto fail;
+ }
+ brackets = new_brackets;
+ if (len == 2)
+ strcat(brackets, " ");
+ strcat(brackets, token);
+ /* We only care about the last token */
+ field->arraylen = strtoul(token, NULL, 0);
+ free_token(token);
+ type = read_token(&token);
+ if (type == EVENT_NONE) {
+ do_warning_event(event, "failed to find token");
+ goto fail;
+ }
+ }
+
+ free_token(token);
+
+ new_brackets = realloc(brackets, strlen(brackets) + 2);
+ if (!new_brackets) {
+ free(brackets);
+ goto fail;
+ }
+ brackets = new_brackets;
+ strcat(brackets, "]");
+
+ /* add brackets to type */
+
+ type = read_token(&token);
+ /*
+ * If the next token is not an OP, then it is of
+ * the format: type [] item;
+ */
+ if (type == EVENT_ITEM) {
+ char *new_type;
+ new_type = realloc(field->type,
+ strlen(field->type) +
+ strlen(field->name) +
+ strlen(brackets) + 2);
+ if (!new_type) {
+ free(brackets);
+ goto fail;
+ }
+ field->type = new_type;
+ strcat(field->type, " ");
+ strcat(field->type, field->name);
+ size_dynamic = type_size(field->name);
+ free_token(field->name);
+ strcat(field->type, brackets);
+ field->name = token;
+ type = read_token(&token);
+ } else {
+ char *new_type;
+ new_type = realloc(field->type,
+ strlen(field->type) +
+ strlen(brackets) + 1);
+ if (!new_type) {
+ free(brackets);
+ goto fail;
+ }
+ field->type = new_type;
+ strcat(field->type, brackets);
+ }
+ free(brackets);
+ }
+
+ if (field_is_string(field))
+ field->flags |= FIELD_IS_STRING;
+ if (field_is_dynamic(field))
+ field->flags |= FIELD_IS_DYNAMIC;
+ if (field_is_long(field))
+ field->flags |= FIELD_IS_LONG;
+
+ if (test_type_token(type, token, EVENT_OP, ";"))
+ goto fail;
+ free_token(token);
+
+ if (read_expected(EVENT_ITEM, "offset") < 0)
+ goto fail_expect;
+
+ if (read_expected(EVENT_OP, ":") < 0)
+ goto fail_expect;
+
+ if (read_expect_type(EVENT_ITEM, &token))
+ goto fail;
+ field->offset = strtoul(token, NULL, 0);
+ free_token(token);
+
+ if (read_expected(EVENT_OP, ";") < 0)
+ goto fail_expect;
+
+ if (read_expected(EVENT_ITEM, "size") < 0)
+ goto fail_expect;
+
+ if (read_expected(EVENT_OP, ":") < 0)
+ goto fail_expect;
+
+ if (read_expect_type(EVENT_ITEM, &token))
+ goto fail;
+ field->size = strtoul(token, NULL, 0);
+ free_token(token);
+
+ if (read_expected(EVENT_OP, ";") < 0)
+ goto fail_expect;
+
+ type = read_token(&token);
+ if (type != EVENT_NEWLINE) {
+ /* newer versions of the kernel have a "signed" type */
+ if (test_type_token(type, token, EVENT_ITEM, "signed"))
+ goto fail;
+
+ free_token(token);
+
+ if (read_expected(EVENT_OP, ":") < 0)
+ goto fail_expect;
+
+ if (read_expect_type(EVENT_ITEM, &token))
+ goto fail;
+
+ if (strtoul(token, NULL, 0))
+ field->flags |= FIELD_IS_SIGNED;
+
+ free_token(token);
+ if (read_expected(EVENT_OP, ";") < 0)
+ goto fail_expect;
+
+ if (read_expect_type(EVENT_NEWLINE, &token))
+ goto fail;
+ }
+
+ free_token(token);
+
+ if (field->flags & FIELD_IS_ARRAY) {
+ if (field->arraylen)
+ field->elementsize = field->size / field->arraylen;
+ else if (field->flags & FIELD_IS_DYNAMIC)
+ field->elementsize = size_dynamic;
+ else if (field->flags & FIELD_IS_STRING)
+ field->elementsize = 1;
+ else if (field->flags & FIELD_IS_LONG)
+ field->elementsize = event->pevent ?
+ event->pevent->long_size :
+ sizeof(long);
+ } else
+ field->elementsize = field->size;
+
+ *fields = field;
+ fields = &field->next;
+
+ } while (1);
+
+ return 0;
+
+fail:
+ free_token(token);
+fail_expect:
+ if (field) {
+ free(field->type);
+ free(field->name);
+ free(field);
+ }
+ return -1;
+}
+
+static int event_read_format(struct event_format *event)
+{
+ char *token;
+ int ret;
+
+ if (read_expected_item(EVENT_ITEM, "format") < 0)
+ return -1;
+
+ if (read_expected(EVENT_OP, ":") < 0)
+ return -1;
+
+ if (read_expect_type(EVENT_NEWLINE, &token))
+ goto fail;
+ free_token(token);
+
+ ret = event_read_fields(event, &event->format.common_fields);
+ if (ret < 0)
+ return ret;
+ event->format.nr_common = ret;
+
+ ret = event_read_fields(event, &event->format.fields);
+ if (ret < 0)
+ return ret;
+ event->format.nr_fields = ret;
+
+ return 0;
+
+ fail:
+ free_token(token);
+ return -1;
+}
+
+static enum event_type
+process_arg_token(struct event_format *event, struct print_arg *arg,
+ char **tok, enum event_type type);
+
+static enum event_type
+process_arg(struct event_format *event, struct print_arg *arg, char **tok)
+{
+ enum event_type type;
+ char *token;
+
+ type = read_token(&token);
+ *tok = token;
+
+ return process_arg_token(event, arg, tok, type);
+}
+
+static enum event_type
+process_op(struct event_format *event, struct print_arg *arg, char **tok);
+
+/*
+ * For __print_symbolic() and __print_flags, we need to completely
+ * evaluate the first argument, which defines what to print next.
+ */
+static enum event_type
+process_field_arg(struct event_format *event, struct print_arg *arg, char **tok)
+{
+ enum event_type type;
+
+ type = process_arg(event, arg, tok);
+
+ while (type == EVENT_OP) {
+ type = process_op(event, arg, tok);
+ }
+
+ return type;
+}
+
+static enum event_type
+process_cond(struct event_format *event, struct print_arg *top, char **tok)
+{
+ struct print_arg *arg, *left, *right;
+ enum event_type type;
+ char *token = NULL;
+
+ arg = alloc_arg();
+ left = alloc_arg();
+ right = alloc_arg();
+
+ if (!arg || !left || !right) {
+ do_warning_event(event, "%s: not enough memory!", __func__);
+ /* arg will be freed at out_free */
+ free_arg(left);
+ free_arg(right);
+ goto out_free;
+ }
+
+ arg->type = PRINT_OP;
+ arg->op.left = left;
+ arg->op.right = right;
+
+ *tok = NULL;
+ type = process_arg(event, left, &token);
+
+ again:
+ /* Handle other operations in the arguments */
+ if (type == EVENT_OP && strcmp(token, ":") != 0) {
+ type = process_op(event, left, &token);
+ goto again;
+ }
+
+ if (test_type_token(type, token, EVENT_OP, ":"))
+ goto out_free;
+
+ arg->op.op = token;
+
+ type = process_arg(event, right, &token);
+
+ top->op.right = arg;
+
+ *tok = token;
+ return type;
+
+out_free:
+ /* Top may point to itself */
+ top->op.right = NULL;
+ free_token(token);
+ free_arg(arg);
+ return EVENT_ERROR;
+}
+
+static enum event_type
+process_array(struct event_format *event, struct print_arg *top, char **tok)
+{
+ struct print_arg *arg;
+ enum event_type type;
+ char *token = NULL;
+
+ arg = alloc_arg();
+ if (!arg) {
+ do_warning_event(event, "%s: not enough memory!", __func__);
+ /* '*tok' is set to top->op.op. No need to free. */
+ *tok = NULL;
+ return EVENT_ERROR;
+ }
+
+ *tok = NULL;
+ type = process_arg(event, arg, &token);
+ if (test_type_token(type, token, EVENT_OP, "]"))
+ goto out_free;
+
+ top->op.right = arg;
+
+ free_token(token);
+ type = read_token_item(&token);
+ *tok = token;
+
+ return type;
+
+out_free:
+ free_token(token);
+ free_arg(arg);
+ return EVENT_ERROR;
+}
+
+static int get_op_prio(char *op)
+{
+ if (!op[1]) {
+ switch (op[0]) {
+ case '~':
+ case '!':
+ return 4;
+ case '*':
+ case '/':
+ case '%':
+ return 6;
+ case '+':
+ case '-':
+ return 7;
+ /* '>>' and '<<' are 8 */
+ case '<':
+ case '>':
+ return 9;
+ /* '==' and '!=' are 10 */
+ case '&':
+ return 11;
+ case '^':
+ return 12;
+ case '|':
+ return 13;
+ case '?':
+ return 16;
+ default:
+ do_warning("unknown op '%c'", op[0]);
+ return -1;
+ }
+ } else {
+ if (strcmp(op, "++") == 0 ||
+ strcmp(op, "--") == 0) {
+ return 3;
+ } else if (strcmp(op, ">>") == 0 ||
+ strcmp(op, "<<") == 0) {
+ return 8;
+ } else if (strcmp(op, ">=") == 0 ||
+ strcmp(op, "<=") == 0) {
+ return 9;
+ } else if (strcmp(op, "==") == 0 ||
+ strcmp(op, "!=") == 0) {
+ return 10;
+ } else if (strcmp(op, "&&") == 0) {
+ return 14;
+ } else if (strcmp(op, "||") == 0) {
+ return 15;
+ } else {
+ do_warning("unknown op '%s'", op);
+ return -1;
+ }
+ }
+}
+
+static int set_op_prio(struct print_arg *arg)
+{
+
+ /* single ops are the greatest */
+ if (!arg->op.left || arg->op.left->type == PRINT_NULL)
+ arg->op.prio = 0;
+ else
+ arg->op.prio = get_op_prio(arg->op.op);
+
+ return arg->op.prio;
+}
+
+/* Note, *tok does not get freed, but will most likely be saved */
+static enum event_type
+process_op(struct event_format *event, struct print_arg *arg, char **tok)
+{
+ struct print_arg *left, *right = NULL;
+ enum event_type type;
+ char *token;
+
+ /* the op is passed in via tok */
+ token = *tok;
+
+ if (arg->type == PRINT_OP && !arg->op.left) {
+ /* handle single op */
+ if (token[1]) {
+ do_warning_event(event, "bad op token %s", token);
+ goto out_free;
+ }
+ switch (token[0]) {
+ case '~':
+ case '!':
+ case '+':
+ case '-':
+ break;
+ default:
+ do_warning_event(event, "bad op token %s", token);
+ goto out_free;
+
+ }
+
+ /* make an empty left */
+ left = alloc_arg();
+ if (!left)
+ goto out_warn_free;
+
+ left->type = PRINT_NULL;
+ arg->op.left = left;
+
+ right = alloc_arg();
+ if (!right)
+ goto out_warn_free;
+
+ arg->op.right = right;
+
+ /* do not free the token, it belongs to an op */
+ *tok = NULL;
+ type = process_arg(event, right, tok);
+
+ } else if (strcmp(token, "?") == 0) {
+
+ left = alloc_arg();
+ if (!left)
+ goto out_warn_free;
+
+ /* copy the top arg to the left */
+ *left = *arg;
+
+ arg->type = PRINT_OP;
+ arg->op.op = token;
+ arg->op.left = left;
+ arg->op.prio = 0;
+
+ /* it will set arg->op.right */
+ type = process_cond(event, arg, tok);
+
+ } else if (strcmp(token, ">>") == 0 ||
+ strcmp(token, "<<") == 0 ||
+ strcmp(token, "&") == 0 ||
+ strcmp(token, "|") == 0 ||
+ strcmp(token, "&&") == 0 ||
+ strcmp(token, "||") == 0 ||
+ strcmp(token, "-") == 0 ||
+ strcmp(token, "+") == 0 ||
+ strcmp(token, "*") == 0 ||
+ strcmp(token, "^") == 0 ||
+ strcmp(token, "/") == 0 ||
+ strcmp(token, "<") == 0 ||
+ strcmp(token, ">") == 0 ||
+ strcmp(token, "<=") == 0 ||
+ strcmp(token, ">=") == 0 ||
+ strcmp(token, "==") == 0 ||
+ strcmp(token, "!=") == 0) {
+
+ left = alloc_arg();
+ if (!left)
+ goto out_warn_free;
+
+ /* copy the top arg to the left */
+ *left = *arg;
+
+ arg->type = PRINT_OP;
+ arg->op.op = token;
+ arg->op.left = left;
+ arg->op.right = NULL;
+
+ if (set_op_prio(arg) == -1) {
+ event->flags |= EVENT_FL_FAILED;
+ /* arg->op.op (= token) will be freed at out_free */
+ arg->op.op = NULL;
+ goto out_free;
+ }
+
+ type = read_token_item(&token);
+ *tok = token;
+
+ /* could just be a type pointer */
+ if ((strcmp(arg->op.op, "*") == 0) &&
+ type == EVENT_DELIM && (strcmp(token, ")") == 0)) {
+ char *new_atom;
+
+ if (left->type != PRINT_ATOM) {
+ do_warning_event(event, "bad pointer type");
+ goto out_free;
+ }
+ new_atom = realloc(left->atom.atom,
+ strlen(left->atom.atom) + 3);
+ if (!new_atom)
+ goto out_warn_free;
+
+ left->atom.atom = new_atom;
+ strcat(left->atom.atom, " *");
+ free(arg->op.op);
+ *arg = *left;
+ free(left);
+
+ return type;
+ }
+
+ right = alloc_arg();
+ if (!right)
+ goto out_warn_free;
+
+ type = process_arg_token(event, right, tok, type);
+ arg->op.right = right;
+
+ } else if (strcmp(token, "[") == 0) {
+
+ left = alloc_arg();
+ if (!left)
+ goto out_warn_free;
+
+ *left = *arg;
+
+ arg->type = PRINT_OP;
+ arg->op.op = token;
+ arg->op.left = left;
+
+ arg->op.prio = 0;
+
+ /* it will set arg->op.right */
+ type = process_array(event, arg, tok);
+
+ } else {
+ do_warning_event(event, "unknown op '%s'", token);
+ event->flags |= EVENT_FL_FAILED;
+ /* the arg is now the left side */
+ goto out_free;
+ }
+
+ if (type == EVENT_OP && strcmp(*tok, ":") != 0) {
+ int prio;
+
+ /* higher prios need to be closer to the root */
+ prio = get_op_prio(*tok);
+
+ if (prio > arg->op.prio)
+ return process_op(event, arg, tok);
+
+ return process_op(event, right, tok);
+ }
+
+ return type;
+
+out_warn_free:
+ do_warning_event(event, "%s: not enough memory!", __func__);
+out_free:
+ free_token(token);
+ *tok = NULL;
+ return EVENT_ERROR;
+}
+
+static enum event_type
+process_entry(struct event_format *event __maybe_unused, struct print_arg *arg,
+ char **tok)
+{
+ enum event_type type;
+ char *field;
+ char *token;
+
+ if (read_expected(EVENT_OP, "->") < 0)
+ goto out_err;
+
+ if (read_expect_type(EVENT_ITEM, &token) < 0)
+ goto out_free;
+ field = token;
+
+ arg->type = PRINT_FIELD;
+ arg->field.name = field;
+
+ if (is_flag_field) {
+ arg->field.field = pevent_find_any_field(event, arg->field.name);
+ arg->field.field->flags |= FIELD_IS_FLAG;
+ is_flag_field = 0;
+ } else if (is_symbolic_field) {
+ arg->field.field = pevent_find_any_field(event, arg->field.name);
+ arg->field.field->flags |= FIELD_IS_SYMBOLIC;
+ is_symbolic_field = 0;
+ }
+
+ type = read_token(&token);
+ *tok = token;
+
+ return type;
+
+ out_free:
+ free_token(token);
+ out_err:
+ *tok = NULL;
+ return EVENT_ERROR;
+}
+
+static char *arg_eval (struct print_arg *arg);
+
+static unsigned long long
+eval_type_str(unsigned long long val, const char *type, int pointer)
+{
+ int sign = 0;
+ char *ref;
+ int len;
+
+ len = strlen(type);
+
+ if (pointer) {
+
+ if (type[len-1] != '*') {
+ do_warning("pointer expected with non pointer type");
+ return val;
+ }
+
+ ref = malloc(len);
+ if (!ref) {
+ do_warning("%s: not enough memory!", __func__);
+ return val;
+ }
+ memcpy(ref, type, len);
+
+ /* chop off the " *" */
+ ref[len - 2] = 0;
+
+ val = eval_type_str(val, ref, 0);
+ free(ref);
+ return val;
+ }
+
+ /* check if this is a pointer */
+ if (type[len - 1] == '*')
+ return val;
+
+ /* Try to figure out the arg size */
+ if (strncmp(type, "struct", 6) == 0)
+ /* all bets off */
+ return val;
+
+ if (strcmp(type, "u8") == 0)
+ return val & 0xff;
+
+ if (strcmp(type, "u16") == 0)
+ return val & 0xffff;
+
+ if (strcmp(type, "u32") == 0)
+ return val & 0xffffffff;
+
+ if (strcmp(type, "u64") == 0 ||
+ strcmp(type, "s64") == 0)
+ return val;
+
+ if (strcmp(type, "s8") == 0)
+ return (unsigned long long)(char)val & 0xff;
+
+ if (strcmp(type, "s16") == 0)
+ return (unsigned long long)(short)val & 0xffff;
+
+ if (strcmp(type, "s32") == 0)
+ return (unsigned long long)(int)val & 0xffffffff;
+
+ if (strncmp(type, "unsigned ", 9) == 0) {
+ sign = 0;
+ type += 9;
+ }
+
+ if (strcmp(type, "char") == 0) {
+ if (sign)
+ return (unsigned long long)(char)val & 0xff;
+ else
+ return val & 0xff;
+ }
+
+ if (strcmp(type, "short") == 0) {
+ if (sign)
+ return (unsigned long long)(short)val & 0xffff;
+ else
+ return val & 0xffff;
+ }
+
+ if (strcmp(type, "int") == 0) {
+ if (sign)
+ return (unsigned long long)(int)val & 0xffffffff;
+ else
+ return val & 0xffffffff;
+ }
+
+ return val;
+}
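In effect, eval_type_str() truncates the raw trace value to the width of the C type named in the format string (casting through the signed type first for s8/s16/s32 and signed char/short/int). A standalone sketch of what the masks amount to:

#include <stdio.h>

int main(void)
{
	unsigned long long val = 0x1122334455667788ULL;

	printf("u8  -> 0x%llx\n", val & 0xff);		/* 0x88 */
	printf("u16 -> 0x%llx\n", val & 0xffff);	/* 0x7788 */
	printf("u32 -> 0x%llx\n", val & 0xffffffff);	/* 0x55667788 */
	printf("u64 -> 0x%llx\n", val);			/* unchanged */
	return 0;
}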
+
+/*
+ * Try to figure out the type.
+ */
+static unsigned long long
+eval_type(unsigned long long val, struct print_arg *arg, int pointer)
+{
+ if (arg->type != PRINT_TYPE) {
+ do_warning("expected type argument");
+ return 0;
+ }
+
+ return eval_type_str(val, arg->typecast.type, pointer);
+}
+
+static int arg_num_eval(struct print_arg *arg, long long *val)
+{
+ long long left, right;
+ int ret = 1;
+
+ switch (arg->type) {
+ case PRINT_ATOM:
+ *val = strtoll(arg->atom.atom, NULL, 0);
+ break;
+ case PRINT_TYPE:
+ ret = arg_num_eval(arg->typecast.item, val);
+ if (!ret)
+ break;
+ *val = eval_type(*val, arg, 0);
+ break;
+ case PRINT_OP:
+ switch (arg->op.op[0]) {
+ case '|':
+ ret = arg_num_eval(arg->op.left, &left);
+ if (!ret)
+ break;
+ ret = arg_num_eval(arg->op.right, &right);
+ if (!ret)
+ break;
+ if (arg->op.op[1])
+ *val = left || right;
+ else
+ *val = left | right;
+ break;
+ case '&':
+ ret = arg_num_eval(arg->op.left, &left);
+ if (!ret)
+ break;
+ ret = arg_num_eval(arg->op.right, &right);
+ if (!ret)
+ break;
+ if (arg->op.op[1])
+ *val = left && right;
+ else
+ *val = left & right;
+ break;
+ case '<':
+ ret = arg_num_eval(arg->op.left, &left);
+ if (!ret)
+ break;
+ ret = arg_num_eval(arg->op.right, &right);
+ if (!ret)
+ break;
+ switch (arg->op.op[1]) {
+ case 0:
+ *val = left < right;
+ break;
+ case '<':
+ *val = left << right;
+ break;
+ case '=':
+ *val = left <= right;
+ break;
+ default:
+ do_warning("unknown op '%s'", arg->op.op);
+ ret = 0;
+ }
+ break;
+ case '>':
+ ret = arg_num_eval(arg->op.left, &left);
+ if (!ret)
+ break;
+ ret = arg_num_eval(arg->op.right, &right);
+ if (!ret)
+ break;
+ switch (arg->op.op[1]) {
+ case 0:
+ *val = left > right;
+ break;
+ case '>':
+ *val = left >> right;
+ break;
+ case '=':
+ *val = left >= right;
+ break;
+ default:
+ do_warning("unknown op '%s'", arg->op.op);
+ ret = 0;
+ }
+ break;
+ case '=':
+ ret = arg_num_eval(arg->op.left, &left);
+ if (!ret)
+ break;
+ ret = arg_num_eval(arg->op.right, &right);
+ if (!ret)
+ break;
+
+ if (arg->op.op[1] != '=') {
+ do_warning("unknown op '%s'", arg->op.op);
+ ret = 0;
+ } else
+ *val = left == right;
+ break;
+ case '!':
+ ret = arg_num_eval(arg->op.left, &left);
+ if (!ret)
+ break;
+ ret = arg_num_eval(arg->op.right, &right);
+ if (!ret)
+ break;
+
+ switch (arg->op.op[1]) {
+ case '=':
+ *val = left != right;
+ break;
+ default:
+ do_warning("unknown op '%s'", arg->op.op);
+ ret = 0;
+ }
+ break;
+ case '-':
+ /* check for negative */
+ if (arg->op.left->type == PRINT_NULL)
+ left = 0;
+ else
+ ret = arg_num_eval(arg->op.left, &left);
+ if (!ret)
+ break;
+ ret = arg_num_eval(arg->op.right, &right);
+ if (!ret)
+ break;
+ *val = left - right;
+ break;
+ case '+':
+ if (arg->op.left->type == PRINT_NULL)
+ left = 0;
+ else
+ ret = arg_num_eval(arg->op.left, &left);
+ if (!ret)
+ break;
+ ret = arg_num_eval(arg->op.right, &right);
+ if (!ret)
+ break;
+ *val = left + right;
+ break;
+ default:
+ do_warning("unknown op '%s'", arg->op.op);
+ ret = 0;
+ }
+ break;
+
+ case PRINT_NULL:
+ case PRINT_FIELD ... PRINT_SYMBOL:
+ case PRINT_STRING:
+ case PRINT_BSTRING:
+ case PRINT_BITMASK:
+ default:
+ do_warning("invalid eval type %d", arg->type);
+ ret = 0;
+
+ }
+ return ret;
+}
+
+static char *arg_eval (struct print_arg *arg)
+{
+ long long val;
+ static char buf[20];
+
+ switch (arg->type) {
+ case PRINT_ATOM:
+ return arg->atom.atom;
+ case PRINT_TYPE:
+ return arg_eval(arg->typecast.item);
+ case PRINT_OP:
+ if (!arg_num_eval(arg, &val))
+ break;
+ sprintf(buf, "%lld", val);
+ return buf;
+
+ case PRINT_NULL:
+ case PRINT_FIELD ... PRINT_SYMBOL:
+ case PRINT_STRING:
+ case PRINT_BSTRING:
+ case PRINT_BITMASK:
+ default:
+ do_warning("invalid eval type %d", arg->type);
+ break;
+ }
+
+ return NULL;
+}
+
+static enum event_type
+process_fields(struct event_format *event, struct print_flag_sym **list, char **tok)
+{
+ enum event_type type;
+ struct print_arg *arg = NULL;
+ struct print_flag_sym *field;
+ char *token = *tok;
+ char *value;
+
+ do {
+ free_token(token);
+ type = read_token_item(&token);
+ if (test_type_token(type, token, EVENT_OP, "{"))
+ break;
+
+ arg = alloc_arg();
+ if (!arg)
+ goto out_free;
+
+ free_token(token);
+ type = process_arg(event, arg, &token);
+
+ if (type == EVENT_OP)
+ type = process_op(event, arg, &token);
+
+ if (type == EVENT_ERROR)
+ goto out_free;
+
+ if (test_type_token(type, token, EVENT_DELIM, ","))
+ goto out_free;
+
+ field = calloc(1, sizeof(*field));
+ if (!field)
+ goto out_free;
+
+ value = arg_eval(arg);
+ if (value == NULL)
+ goto out_free_field;
+ field->value = strdup(value);
+ if (field->value == NULL)
+ goto out_free_field;
+
+ free_arg(arg);
+ arg = alloc_arg();
+ if (!arg)
+ goto out_free;
+
+ free_token(token);
+ type = process_arg(event, arg, &token);
+ if (test_type_token(type, token, EVENT_OP, "}"))
+ goto out_free_field;
+
+ value = arg_eval(arg);
+ if (value == NULL)
+ goto out_free_field;
+ field->str = strdup(value);
+ if (field->str == NULL)
+ goto out_free_field;
+ free_arg(arg);
+ arg = NULL;
+
+ *list = field;
+ list = &field->next;
+
+ free_token(token);
+ type = read_token_item(&token);
+ } while (type == EVENT_DELIM && strcmp(token, ",") == 0);
+
+ *tok = token;
+ return type;
+
+out_free_field:
+ free_flag_sym(field);
+out_free:
+ free_arg(arg);
+ free_token(token);
+ *tok = NULL;
+
+ return EVENT_ERROR;
+}
+
+static enum event_type
+process_flags(struct event_format *event, struct print_arg *arg, char **tok)
+{
+ struct print_arg *field;
+ enum event_type type;
+ char *token;
+
+ memset(arg, 0, sizeof(*arg));
+ arg->type = PRINT_FLAGS;
+
+ field = alloc_arg();
+ if (!field) {
+ do_warning_event(event, "%s: not enough memory!", __func__);
+ goto out_free;
+ }
+
+ type = process_field_arg(event, field, &token);
+
+ /* Handle operations in the first argument */
+ while (type == EVENT_OP)
+ type = process_op(event, field, &token);
+
+ if (test_type_token(type, token, EVENT_DELIM, ","))
+ goto out_free_field;
+ free_token(token);
+
+ arg->flags.field = field;
+
+ type = read_token_item(&token);
+ if (event_item_type(type)) {
+ arg->flags.delim = token;
+ type = read_token_item(&token);
+ }
+
+ if (test_type_token(type, token, EVENT_DELIM, ","))
+ goto out_free;
+
+ type = process_fields(event, &arg->flags.flags, &token);
+ if (test_type_token(type, token, EVENT_DELIM, ")"))
+ goto out_free;
+
+ free_token(token);
+ type = read_token_item(tok);
+ return type;
+
+out_free_field:
+ free_arg(field);
+out_free:
+ free_token(token);
+ *tok = NULL;
+ return EVENT_ERROR;
+}
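+
+/*
+ * A minimal sketch of the input that process_flags() and the
+ * process_symbols() routine below expect to find in an event's
+ * "print fmt" line (the field names and values are illustrative only):
+ *
+ *   __print_flags(REC->flags, "|", { 1, "FLAG_A" }, { 2, "FLAG_B" })
+ *   __print_symbolic(REC->state, { 0, "IDLE" }, { 1, "RUNNING" })
+ *
+ * process_fields() above handles the trailing { value, "string" } list.
+ */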
+
+static enum event_type
+process_symbols(struct event_format *event, struct print_arg *arg, char **tok)
+{
+ struct print_arg *field;
+ enum event_type type;
+ char *token;
+
+ memset(arg, 0, sizeof(*arg));
+ arg->type = PRINT_SYMBOL;
+
+ field = alloc_arg();
+ if (!field) {
+ do_warning_event(event, "%s: not enough memory!", __func__);
+ goto out_free;
+ }
+
+ type = process_field_arg(event, field, &token);
+
+ if (test_type_token(type, token, EVENT_DELIM, ","))
+ goto out_free_field;
+
+ arg->symbol.field = field;
+
+ type = process_fields(event, &arg->symbol.symbols, &token);
+ if (test_type_token(type, token, EVENT_DELIM, ")"))
+ goto out_free;
+
+ free_token(token);
+ type = read_token_item(tok);
+ return type;
+
+out_free_field:
+ free_arg(field);
+out_free:
+ free_token(token);
+ *tok = NULL;
+ return EVENT_ERROR;
+}
+
+static enum event_type
+process_hex(struct event_format *event, struct print_arg *arg, char **tok)
+{
+ struct print_arg *field;
+ enum event_type type;
+ char *token;
+
+ memset(arg, 0, sizeof(*arg));
+ arg->type = PRINT_HEX;
+
+ field = alloc_arg();
+ if (!field) {
+ do_warning_event(event, "%s: not enough memory!", __func__);
+ goto out_free;
+ }
+
+ type = process_arg(event, field, &token);
+
+ if (test_type_token(type, token, EVENT_DELIM, ","))
+ goto out_free;
+
+ arg->hex.field = field;
+
+ free_token(token);
+
+ field = alloc_arg();
+ if (!field) {
+ do_warning_event(event, "%s: not enough memory!", __func__);
+ *tok = NULL;
+ return EVENT_ERROR;
+ }
+
+ type = process_arg(event, field, &token);
+
+ if (test_type_token(type, token, EVENT_DELIM, ")"))
+ goto out_free;
+
+ arg->hex.size = field;
+
+ free_token(token);
+ type = read_token_item(tok);
+ return type;
+
+ out_free:
+ free_arg(field);
+ free_token(token);
+ *tok = NULL;
+ return EVENT_ERROR;
+}
+
+static enum event_type
+process_dynamic_array(struct event_format *event, struct print_arg *arg, char **tok)
+{
+ struct format_field *field;
+ enum event_type type;
+ char *token;
+
+ memset(arg, 0, sizeof(*arg));
+ arg->type = PRINT_DYNAMIC_ARRAY;
+
+ /*
+ * The item within the parentheses is another field that holds
+ * the offset to where the array starts.
+ */
+ type = read_token(&token);
+ *tok = token;
+ if (type != EVENT_ITEM)
+ goto out_free;
+
+ /* Find the field */
+
+ field = pevent_find_field(event, token);
+ if (!field)
+ goto out_free;
+
+ arg->dynarray.field = field;
+ arg->dynarray.index = 0;
+
+ if (read_expected(EVENT_DELIM, ")") < 0)
+ goto out_free;
+
+ free_token(token);
+ type = read_token_item(&token);
+ *tok = token;
+ if (type != EVENT_OP || strcmp(token, "[") != 0)
+ return type;
+
+ free_token(token);
+ arg = alloc_arg();
+ if (!arg) {
+ do_warning_event(event, "%s: not enough memory!", __func__);
+ *tok = NULL;
+ return EVENT_ERROR;
+ }
+
+ type = process_arg(event, arg, &token);
+ if (type == EVENT_ERROR)
+ goto out_free_arg;
+
+ if (!test_type_token(type, token, EVENT_OP, "]"))
+ goto out_free_arg;
+
+ free_token(token);
+ type = read_token_item(tok);
+ return type;
+
+ out_free_arg:
+ free_arg(arg);
+ out_free:
+ free_token(token);
+ *tok = NULL;
+ return EVENT_ERROR;
+}
+
+static enum event_type
+process_paren(struct event_format *event, struct print_arg *arg, char **tok)
+{
+ struct print_arg *item_arg;
+ enum event_type type;
+ char *token;
+
+ type = process_arg(event, arg, &token);
+
+ if (type == EVENT_ERROR)
+ goto out_free;
+
+ if (type == EVENT_OP)
+ type = process_op(event, arg, &token);
+
+ if (type == EVENT_ERROR)
+ goto out_free;
+
+ if (test_type_token(type, token, EVENT_DELIM, ")"))
+ goto out_free;
+
+ free_token(token);
+ type = read_token_item(&token);
+
+ /*
+ * If the next token is an item or another open paren, then
+ * this was a typecast.
+ */
+ if (event_item_type(type) ||
+ (type == EVENT_DELIM && strcmp(token, "(") == 0)) {
+
+ /* make this a typecast and continue */
+
+ /* previous must be an atom */
+ if (arg->type != PRINT_ATOM) {
+ do_warning_event(event, "previous needed to be PRINT_ATOM");
+ goto out_free;
+ }
+
+ item_arg = alloc_arg();
+ if (!item_arg) {
+ do_warning_event(event, "%s: not enough memory!",
+ __func__);
+ goto out_free;
+ }
+
+ arg->type = PRINT_TYPE;
+ arg->typecast.type = arg->atom.atom;
+ arg->typecast.item = item_arg;
+ type = process_arg_token(event, item_arg, &token, type);
+
+ }
+
+ *tok = token;
+ return type;
+
+ out_free:
+ free_token(token);
+ *tok = NULL;
+ return EVENT_ERROR;
+}
+
+
+static enum event_type
+process_str(struct event_format *event __maybe_unused, struct print_arg *arg,
+ char **tok)
+{
+ enum event_type type;
+ char *token;
+
+ if (read_expect_type(EVENT_ITEM, &token) < 0)
+ goto out_free;
+
+ arg->type = PRINT_STRING;
+ arg->string.string = token;
+ arg->string.offset = -1;
+
+ if (read_expected(EVENT_DELIM, ")") < 0)
+ goto out_err;
+
+ type = read_token(&token);
+ *tok = token;
+
+ return type;
+
+ out_free:
+ free_token(token);
+ out_err:
+ *tok = NULL;
+ return EVENT_ERROR;
+}
+
+static enum event_type
+process_bitmask(struct event_format *event __maybe_unused, struct print_arg *arg,
+ char **tok)
+{
+ enum event_type type;
+ char *token;
+
+ if (read_expect_type(EVENT_ITEM, &token) < 0)
+ goto out_free;
+
+ arg->type = PRINT_BITMASK;
+ arg->bitmask.bitmask = token;
+ arg->bitmask.offset = -1;
+
+ if (read_expected(EVENT_DELIM, ")") < 0)
+ goto out_err;
+
+ type = read_token(&token);
+ *tok = token;
+
+ return type;
+
+ out_free:
+ free_token(token);
+ out_err:
+ *tok = NULL;
+ return EVENT_ERROR;
+}
+
+static struct pevent_function_handler *
+find_func_handler(struct pevent *pevent, char *func_name)
+{
+ struct pevent_function_handler *func;
+
+ if (!pevent)
+ return NULL;
+
+ for (func = pevent->func_handlers; func; func = func->next) {
+ if (strcmp(func->name, func_name) == 0)
+ break;
+ }
+
+ return func;
+}
+
+static void remove_func_handler(struct pevent *pevent, char *func_name)
+{
+ struct pevent_function_handler *func;
+ struct pevent_function_handler **next;
+
+ next = &pevent->func_handlers;
+ while ((func = *next)) {
+ if (strcmp(func->name, func_name) == 0) {
+ *next = func->next;
+ free_func_handle(func);
+ break;
+ }
+ next = &func->next;
+ }
+}
+
+static enum event_type
+process_func_handler(struct event_format *event, struct pevent_function_handler *func,
+ struct print_arg *arg, char **tok)
+{
+ struct print_arg **next_arg;
+ struct print_arg *farg;
+ enum event_type type;
+ char *token;
+ int i;
+
+ arg->type = PRINT_FUNC;
+ arg->func.func = func;
+
+ *tok = NULL;
+
+ next_arg = &(arg->func.args);
+ for (i = 0; i < func->nr_args; i++) {
+ farg = alloc_arg();
+ if (!farg) {
+ do_warning_event(event, "%s: not enough memory!",
+ __func__);
+ return EVENT_ERROR;
+ }
+
+ type = process_arg(event, farg, &token);
+ if (i < (func->nr_args - 1)) {
+ if (type != EVENT_DELIM || strcmp(token, ",") != 0) {
+ do_warning_event(event,
+ "Error: function '%s()' expects %d arguments but event %s only uses %d",
+ func->name, func->nr_args,
+ event->name, i + 1);
+ goto err;
+ }
+ } else {
+ if (type != EVENT_DELIM || strcmp(token, ")") != 0) {
+ do_warning_event(event,
+ "Error: function '%s()' only expects %d arguments but event %s has more",
+ func->name, func->nr_args, event->name);
+ goto err;
+ }
+ }
+
+ *next_arg = farg;
+ next_arg = &(farg->next);
+ free_token(token);
+ }
+
+ type = read_token(&token);
+ *tok = token;
+
+ return type;
+
+err:
+ free_arg(farg);
+ free_token(token);
+ return EVENT_ERROR;
+}
+
+static enum event_type
+process_function(struct event_format *event, struct print_arg *arg,
+ char *token, char **tok)
+{
+ struct pevent_function_handler *func;
+
+ if (strcmp(token, "__print_flags") == 0) {
+ free_token(token);
+ is_flag_field = 1;
+ return process_flags(event, arg, tok);
+ }
+ if (strcmp(token, "__print_symbolic") == 0) {
+ free_token(token);
+ is_symbolic_field = 1;
+ return process_symbols(event, arg, tok);
+ }
+ if (strcmp(token, "__print_hex") == 0) {
+ free_token(token);
+ return process_hex(event, arg, tok);
+ }
+ if (strcmp(token, "__get_str") == 0) {
+ free_token(token);
+ return process_str(event, arg, tok);
+ }
+ if (strcmp(token, "__get_bitmask") == 0) {
+ free_token(token);
+ return process_bitmask(event, arg, tok);
+ }
+ if (strcmp(token, "__get_dynamic_array") == 0) {
+ free_token(token);
+ return process_dynamic_array(event, arg, tok);
+ }
+
+ func = find_func_handler(event->pevent, token);
+ if (func) {
+ free_token(token);
+ return process_func_handler(event, func, arg, tok);
+ }
+
+ do_warning_event(event, "function %s not defined", token);
+ free_token(token);
+ return EVENT_ERROR;
+}
+
+static enum event_type
+process_arg_token(struct event_format *event, struct print_arg *arg,
+ char **tok, enum event_type type)
+{
+ char *token;
+ char *atom;
+
+ token = *tok;
+
+ switch (type) {
+ case EVENT_ITEM:
+ if (strcmp(token, "REC") == 0) {
+ free_token(token);
+ type = process_entry(event, arg, &token);
+ break;
+ }
+ atom = token;
+ /* test the next token */
+ type = read_token_item(&token);
+
+ /*
+ * If the next token is a parenthesis, then this
+ * is a function.
+ */
+ if (type == EVENT_DELIM && strcmp(token, "(") == 0) {
+ free_token(token);
+ token = NULL;
+ /* this will free atom. */
+ type = process_function(event, arg, atom, &token);
+ break;
+ }
+ /* atoms can be more than one token long */
+ while (type == EVENT_ITEM) {
+ char *new_atom;
+ new_atom = realloc(atom,
+ strlen(atom) + strlen(token) + 2);
+ if (!new_atom) {
+ free(atom);
+ *tok = NULL;
+ free_token(token);
+ return EVENT_ERROR;
+ }
+ atom = new_atom;
+ strcat(atom, " ");
+ strcat(atom, token);
+ free_token(token);
+ type = read_token_item(&token);
+ }
+
+ arg->type = PRINT_ATOM;
+ arg->atom.atom = atom;
+ break;
+
+ case EVENT_DQUOTE:
+ case EVENT_SQUOTE:
+ arg->type = PRINT_ATOM;
+ arg->atom.atom = token;
+ type = read_token_item(&token);
+ break;
+ case EVENT_DELIM:
+ if (strcmp(token, "(") == 0) {
+ free_token(token);
+ type = process_paren(event, arg, &token);
+ break;
+ }
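+ /* fall through */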
+ case EVENT_OP:
+ /* handle single ops */
+ arg->type = PRINT_OP;
+ arg->op.op = token;
+ arg->op.left = NULL;
+ type = process_op(event, arg, &token);
+
+ /* On error, the op is freed */
+ if (type == EVENT_ERROR)
+ arg->op.op = NULL;
+
+ /* return error type if errored */
+ break;
+
+ case EVENT_ERROR ... EVENT_NEWLINE:
+ default:
+ do_warning_event(event, "unexpected type %d", type);
+ return EVENT_ERROR;
+ }
+ *tok = token;
+
+ return type;
+}
+
+static int event_read_print_args(struct event_format *event, struct print_arg **list)
+{
+ enum event_type type = EVENT_ERROR;
+ struct print_arg *arg;
+ char *token;
+ int args = 0;
+
+ do {
+ if (type == EVENT_NEWLINE) {
+ type = read_token_item(&token);
+ continue;
+ }
+
+ arg = alloc_arg();
+ if (!arg) {
+ do_warning_event(event, "%s: not enough memory!",
+ __func__);
+ return -1;
+ }
+
+ type = process_arg(event, arg, &token);
+
+ if (type == EVENT_ERROR) {
+ free_token(token);
+ free_arg(arg);
+ return -1;
+ }
+
+ *list = arg;
+ args++;
+
+ if (type == EVENT_OP) {
+ type = process_op(event, arg, &token);
+ free_token(token);
+ if (type == EVENT_ERROR) {
+ *list = NULL;
+ free_arg(arg);
+ return -1;
+ }
+ list = &arg->next;
+ continue;
+ }
+
+ if (type == EVENT_DELIM && strcmp(token, ",") == 0) {
+ free_token(token);
+ *list = arg;
+ list = &arg->next;
+ continue;
+ }
+ break;
+ } while (type != EVENT_NONE);
+
+ if (type != EVENT_NONE && type != EVENT_ERROR)
+ free_token(token);
+
+ return args;
+}
+
+static int event_read_print(struct event_format *event)
+{
+ enum event_type type;
+ char *token;
+ int ret;
+
+ if (read_expected_item(EVENT_ITEM, "print") < 0)
+ return -1;
+
+ if (read_expected(EVENT_ITEM, "fmt") < 0)
+ return -1;
+
+ if (read_expected(EVENT_OP, ":") < 0)
+ return -1;
+
+ if (read_expect_type(EVENT_DQUOTE, &token) < 0)
+ goto fail;
+
+ concat:
+ event->print_fmt.format = token;
+ event->print_fmt.args = NULL;
+
+ /* ok to have no arg */
+ type = read_token_item(&token);
+
+ if (type == EVENT_NONE)
+ return 0;
+
+ /* Handle concatenation of print lines */
+ if (type == EVENT_DQUOTE) {
+ char *cat;
+
+ if (asprintf(&cat, "%s%s", event->print_fmt.format, token) < 0)
+ goto fail;
+ free_token(token);
+ free_token(event->print_fmt.format);
+ event->print_fmt.format = NULL;
+ token = cat;
+ goto concat;
+ }
+
+ if (test_type_token(type, token, EVENT_DELIM, ","))
+ goto fail;
+
+ free_token(token);
+
+ ret = event_read_print_args(event, &event->print_fmt.args);
+ if (ret < 0)
+ return -1;
+
+ return ret;
+
+ fail:
+ free_token(token);
+ return -1;
+}
+
+/**
+ * pevent_find_common_field - return a common field by event
+ * @event: handle for the event
+ * @name: the name of the common field to return
+ *
+ * Returns a common field from the event by the given @name.
+ * This only searches the common fields and not all fields.
+ */
+struct format_field *
+pevent_find_common_field(struct event_format *event, const char *name)
+{
+ struct format_field *format;
+
+ for (format = event->format.common_fields;
+ format; format = format->next) {
+ if (strcmp(format->name, name) == 0)
+ break;
+ }
+
+ return format;
+}
+
+/**
+ * pevent_find_field - find a non-common field
+ * @event: handle for the event
+ * @name: the name of the non-common field
+ *
+ * Returns a non-common field by the given @name.
+ * This does not search common fields.
+ */
+struct format_field *
+pevent_find_field(struct event_format *event, const char *name)
+{
+ struct format_field *format;
+
+ for (format = event->format.fields;
+ format; format = format->next) {
+ if (strcmp(format->name, name) == 0)
+ break;
+ }
+
+ return format;
+}
+
+/**
+ * pevent_find_any_field - find any field by name
+ * @event: handle for the event
+ * @name: the name of the field
+ *
+ * Returns a field by the given @name.
+ * This searches the common field names first, then
+ * the non-common ones if a common one was not found.
+ */
+struct format_field *
+pevent_find_any_field(struct event_format *event, const char *name)
+{
+ struct format_field *format;
+
+ format = pevent_find_common_field(event, name);
+ if (format)
+ return format;
+ return pevent_find_field(event, name);
+}
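+
+/*
+ * A short sketch of how the three lookups above differ (the
+ * event-specific field names are illustrative; "common_pid" is one of
+ * the common header fields):
+ *
+ *   pevent_find_common_field(event, "common_pid");
+ *     searches only the common_* header fields
+ *   pevent_find_field(event, "prev_pid");
+ *     searches only the event-specific fields
+ *   pevent_find_any_field(event, "prev_pid");
+ *     tries the common fields first, then the event fields
+ */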
+
+/**
+ * pevent_read_number - read a number from data
+ * @pevent: handle for the pevent
+ * @ptr: the raw data
+ * @size: the size of the data that holds the number
+ *
+ * Returns the number (converted to host byte order) from the
+ * raw data.
+ */
+unsigned long long pevent_read_number(struct pevent *pevent,
+ const void *ptr, int size)
+{
+ switch (size) {
+ case 1:
+ return *(unsigned char *)ptr;
+ case 2:
+ return data2host2(pevent, ptr);
+ case 4:
+ return data2host4(pevent, ptr);
+ case 8:
+ return data2host8(pevent, ptr);
+ default:
+ /* BUG! */
+ return 0;
+ }
+}
+
+/**
+ * pevent_read_number_field - read a number from data
+ * @field: a handle to the field
+ * @data: the raw data to read
+ * @value: the value to place the number in
+ *
+ * Reads raw data according to a field offset and size,
+ * and translates it into @value.
+ *
+ * Returns 0 on success, -1 otherwise.
+ */
+int pevent_read_number_field(struct format_field *field, const void *data,
+ unsigned long long *value)
+{
+ if (!field)
+ return -1;
+ switch (field->size) {
+ case 1:
+ case 2:
+ case 4:
+ case 8:
+ *value = pevent_read_number(field->event->pevent,
+ data + field->offset, field->size);
+ return 0;
+ default:
+ return -1;
+ }
+}
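+
+/*
+ * Sketch of reading one numeric field out of a record, assuming
+ * @event and @record were obtained elsewhere and the event carries a
+ * field named "prev_state" (an illustrative name):
+ *
+ *   struct format_field *field;
+ *   unsigned long long val;
+ *
+ *   field = pevent_find_field(event, "prev_state");
+ *   if (pevent_read_number_field(field, record->data, &val) == 0)
+ *     printf("prev_state=%llu\n", val);
+ *
+ * A NULL @field is handled by pevent_read_number_field() itself.
+ */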
+
+static int get_common_info(struct pevent *pevent,
+ const char *type, int *offset, int *size)
+{
+ struct event_format *event;
+ struct format_field *field;
+
+ /*
+ * All events should have the same common elements.
+ * Pick any event to find where the type is;
+ */
+ if (!pevent->events) {
+ do_warning("no event_list!");
+ return -1;
+ }
+
+ event = pevent->events[0];
+ field = pevent_find_common_field(event, type);
+ if (!field)
+ return -1;
+
+ *offset = field->offset;
+ *size = field->size;
+
+ return 0;
+}
+
+static int __parse_common(struct pevent *pevent, void *data,
+ int *size, int *offset, const char *name)
+{
+ int ret;
+
+ if (!*size) {
+ ret = get_common_info(pevent, name, offset, size);
+ if (ret < 0)
+ return ret;
+ }
+ return pevent_read_number(pevent, data + *offset, *size);
+}
+
+static int trace_parse_common_type(struct pevent *pevent, void *data)
+{
+ return __parse_common(pevent, data,
+ &pevent->type_size, &pevent->type_offset,
+ "common_type");
+}
+
+static int parse_common_pid(struct pevent *pevent, void *data)
+{
+ return __parse_common(pevent, data,
+ &pevent->pid_size, &pevent->pid_offset,
+ "common_pid");
+}
+
+static int parse_common_pc(struct pevent *pevent, void *data)
+{
+ return __parse_common(pevent, data,
+ &pevent->pc_size, &pevent->pc_offset,
+ "common_preempt_count");
+}
+
+static int parse_common_flags(struct pevent *pevent, void *data)
+{
+ return __parse_common(pevent, data,
+ &pevent->flags_size, &pevent->flags_offset,
+ "common_flags");
+}
+
+static int parse_common_lock_depth(struct pevent *pevent, void *data)
+{
+ return __parse_common(pevent, data,
+ &pevent->ld_size, &pevent->ld_offset,
+ "common_lock_depth");
+}
+
+static int parse_common_migrate_disable(struct pevent *pevent, void *data)
+{
+ return __parse_common(pevent, data,
+ &pevent->ld_size, &pevent->ld_offset,
+ "common_migrate_disable");
+}
+
+static int events_id_cmp(const void *a, const void *b);
+
+/**
+ * pevent_find_event - find an event by given id
+ * @pevent: a handle to the pevent
+ * @id: the id of the event
+ *
+ * Returns an event that has a given @id.
+ */
+struct event_format *pevent_find_event(struct pevent *pevent, int id)
+{
+ struct event_format **eventptr;
+ struct event_format key;
+ struct event_format *pkey = &key;
+
+ /* Check cache first */
+ if (pevent->last_event && pevent->last_event->id == id)
+ return pevent->last_event;
+
+ key.id = id;
+
+ eventptr = bsearch(&pkey, pevent->events, pevent->nr_events,
+ sizeof(*pevent->events), events_id_cmp);
+
+ if (eventptr) {
+ pevent->last_event = *eventptr;
+ return *eventptr;
+ }
+
+ return NULL;
+}
+
+/**
+ * pevent_find_event_by_name - find an event by given name
+ * @pevent: a handle to the pevent
+ * @sys: the system name to search for
+ * @name: the name of the event to search for
+ *
+ * This returns an event with a given @name and under the system
+ * @sys. If @sys is NULL the first event with @name is returned.
+ */
+struct event_format *
+pevent_find_event_by_name(struct pevent *pevent,
+ const char *sys, const char *name)
+{
+ struct event_format *event;
+ int i;
+
+ if (pevent->last_event &&
+ strcmp(pevent->last_event->name, name) == 0 &&
+ (!sys || strcmp(pevent->last_event->system, sys) == 0))
+ return pevent->last_event;
+
+ for (i = 0; i < pevent->nr_events; i++) {
+ event = pevent->events[i];
+ if (strcmp(event->name, name) == 0) {
+ if (!sys)
+ break;
+ if (strcmp(event->system, sys) == 0)
+ break;
+ }
+ }
+ if (i == pevent->nr_events)
+ event = NULL;
+
+ pevent->last_event = event;
+ return event;
+}
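+
+/*
+ * Both lookups cache their last hit in pevent->last_event, so repeated
+ * lookups of the same event are cheap.  A sketch (the system/event
+ * pair is only an example):
+ *
+ *   struct event_format *event;
+ *
+ *   event = pevent_find_event_by_name(pevent, "sched", "sched_switch");
+ *   if (event)
+ *     printf("id=%d\n", event->id);
+ */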
+
+static unsigned long long
+eval_num_arg(void *data, int size, struct event_format *event, struct print_arg *arg)
+{
+ struct pevent *pevent = event->pevent;
+ unsigned long long val = 0;
+ unsigned long long left, right;
+ struct print_arg *typearg = NULL;
+ struct print_arg *larg;
+ unsigned long offset;
+ unsigned int field_size;
+
+ switch (arg->type) {
+ case PRINT_NULL:
+ /* ?? */
+ return 0;
+ case PRINT_ATOM:
+ return strtoull(arg->atom.atom, NULL, 0);
+ case PRINT_FIELD:
+ if (!arg->field.field) {
+ arg->field.field = pevent_find_any_field(event, arg->field.name);
+ if (!arg->field.field)
+ goto out_warning_field;
+
+ }
+ /* must be a number */
+ val = pevent_read_number(pevent, data + arg->field.field->offset,
+ arg->field.field->size);
+ break;
+ case PRINT_FLAGS:
+ case PRINT_SYMBOL:
+ case PRINT_HEX:
+ break;
+ case PRINT_TYPE:
+ val = eval_num_arg(data, size, event, arg->typecast.item);
+ return eval_type(val, arg, 0);
+ case PRINT_STRING:
+ case PRINT_BSTRING:
+ case PRINT_BITMASK:
+ return 0;
+ case PRINT_FUNC: {
+ struct trace_seq s;
+ trace_seq_init(&s);
+ val = process_defined_func(&s, data, size, event, arg);
+ trace_seq_destroy(&s);
+ return val;
+ }
+ case PRINT_OP:
+ if (strcmp(arg->op.op, "[") == 0) {
+ /*
+ * Arrays are special, since we don't want
+ * to read the arg as is.
+ */
+ right = eval_num_arg(data, size, event, arg->op.right);
+
+ /* handle typecasts */
+ larg = arg->op.left;
+ while (larg->type == PRINT_TYPE) {
+ if (!typearg)
+ typearg = larg;
+ larg = larg->typecast.item;
+ }
+
+ /* Default to long size */
+ field_size = pevent->long_size;
+
+ switch (larg->type) {
+ case PRINT_DYNAMIC_ARRAY:
+ offset = pevent_read_number(pevent,
+ data + larg->dynarray.field->offset,
+ larg->dynarray.field->size);
+ if (larg->dynarray.field->elementsize)
+ field_size = larg->dynarray.field->elementsize;
+ /*
+ * The actual length of the dynamic array is stored
+ * in the top half of the field, and the offset
+ * is in the bottom half of the 32 bit field.
+ */
+ offset &= 0xffff;
+ offset += right;
+ break;
+ case PRINT_FIELD:
+ if (!larg->field.field) {
+ larg->field.field =
+ pevent_find_any_field(event, larg->field.name);
+ if (!larg->field.field) {
+ arg = larg;
+ goto out_warning_field;
+ }
+ }
+ field_size = larg->field.field->elementsize;
+ offset = larg->field.field->offset +
+ right * larg->field.field->elementsize;
+ break;
+ default:
+ goto default_op; /* oops, all bets off */
+ }
+ val = pevent_read_number(pevent,
+ data + offset, field_size);
+ if (typearg)
+ val = eval_type(val, typearg, 1);
+ break;
+ } else if (strcmp(arg->op.op, "?") == 0) {
+ left = eval_num_arg(data, size, event, arg->op.left);
+ arg = arg->op.right;
+ if (left)
+ val = eval_num_arg(data, size, event, arg->op.left);
+ else
+ val = eval_num_arg(data, size, event, arg->op.right);
+ break;
+ }
+ default_op:
+ left = eval_num_arg(data, size, event, arg->op.left);
+ right = eval_num_arg(data, size, event, arg->op.right);
+ switch (arg->op.op[0]) {
+ case '!':
+ switch (arg->op.op[1]) {
+ case 0:
+ val = !right;
+ break;
+ case '=':
+ val = left != right;
+ break;
+ default:
+ goto out_warning_op;
+ }
+ break;
+ case '~':
+ val = ~right;
+ break;
+ case '|':
+ if (arg->op.op[1])
+ val = left || right;
+ else
+ val = left | right;
+ break;
+ case '&':
+ if (arg->op.op[1])
+ val = left && right;
+ else
+ val = left & right;
+ break;
+ case '<':
+ switch (arg->op.op[1]) {
+ case 0:
+ val = left < right;
+ break;
+ case '<':
+ val = left << right;
+ break;
+ case '=':
+ val = left <= right;
+ break;
+ default:
+ goto out_warning_op;
+ }
+ break;
+ case '>':
+ switch (arg->op.op[1]) {
+ case 0:
+ val = left > right;
+ break;
+ case '>':
+ val = left >> right;
+ break;
+ case '=':
+ val = left >= right;
+ break;
+ default:
+ goto out_warning_op;
+ }
+ break;
+ case '=':
+ if (arg->op.op[1] != '=')
+ goto out_warning_op;
+
+ val = left == right;
+ break;
+ case '-':
+ val = left - right;
+ break;
+ case '+':
+ val = left + right;
+ break;
+ case '/':
+ val = left / right;
+ break;
+ case '*':
+ val = left * right;
+ break;
+ default:
+ goto out_warning_op;
+ }
+ break;
+ case PRINT_DYNAMIC_ARRAY:
+ /* Without [], we pass the address of the dynamic data */
+ offset = pevent_read_number(pevent,
+ data + arg->dynarray.field->offset,
+ arg->dynarray.field->size);
+ /*
+ * The actual length of the dynamic array is stored
+ * in the top half of the field, and the offset
+ * is in the bottom half of the 32 bit field.
+ */
+ offset &= 0xffff;
+ val = (unsigned long long)((unsigned long)data + offset);
+ break;
+ default: /* not sure what to do there */
+ return 0;
+ }
+ return val;
+
+out_warning_op:
+ do_warning_event(event, "%s: unknown op '%s'", __func__, arg->op.op);
+ return 0;
+
+out_warning_field:
+ do_warning_event(event, "%s: field %s not found",
+ __func__, arg->field.name);
+ return 0;
+}
+
+struct flag {
+ const char *name;
+ unsigned long long value;
+};
+
+static const struct flag flags[] = {
+ { "HI_SOFTIRQ", 0 },
+ { "TIMER_SOFTIRQ", 1 },
+ { "NET_TX_SOFTIRQ", 2 },
+ { "NET_RX_SOFTIRQ", 3 },
+ { "BLOCK_SOFTIRQ", 4 },
+ { "BLOCK_IOPOLL_SOFTIRQ", 5 },
+ { "TASKLET_SOFTIRQ", 6 },
+ { "SCHED_SOFTIRQ", 7 },
+ { "HRTIMER_SOFTIRQ", 8 },
+ { "RCU_SOFTIRQ", 9 },
+
+ { "HRTIMER_NORESTART", 0 },
+ { "HRTIMER_RESTART", 1 },
+};
+
+static unsigned long long eval_flag(const char *flag)
+{
+ int i;
+
+ /*
+ * Some flags in the format files do not get converted.
+ * If the flag is not numeric, see if it is something that
+ * we already know about.
+ */
+ if (isdigit(flag[0]))
+ return strtoull(flag, NULL, 0);
+
+ for (i = 0; i < (int)(sizeof(flags)/sizeof(flags[0])); i++)
+ if (strcmp(flags[i].name, flag) == 0)
+ return flags[i].value;
+
+ return 0;
+}
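+
+/*
+ * For example, eval_flag("3") returns 3, eval_flag("HRTIMER_RESTART")
+ * returns 1 from the table above, and an unknown non-numeric name
+ * quietly evaluates to 0.
+ */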
+
+static void print_str_to_seq(struct trace_seq *s, const char *format,
+ int len_arg, const char *str)
+{
+ if (len_arg >= 0)
+ trace_seq_printf(s, format, len_arg, str);
+ else
+ trace_seq_printf(s, format, str);
+}
+
+static void print_bitmask_to_seq(struct pevent *pevent,
+ struct trace_seq *s, const char *format,
+ int len_arg, const void *data, int size)
+{
+ int nr_bits = size * 8;
+ int str_size = (nr_bits + 3) / 4;
+ int len = 0;
+ char buf[3];
+ char *str;
+ int index;
+ int i;
+
+ /*
+ * The kernel likes to put in commas every 32 bits; we
+ * can do the same.
+ */
+ str_size += (nr_bits - 1) / 32;
+
+ str = malloc(str_size + 1);
+ if (!str) {
+ do_warning("%s: not enough memory!", __func__);
+ return;
+ }
+ str[str_size] = 0;
+
+ /* Start out with -2 for the two chars per byte */
+ for (i = str_size - 2; i >= 0; i -= 2) {
+ /*
+ * data points to a bit mask of size bytes.
+ * In the kernel, this is an array of long words, thus
+ * endianness is very important.
+ */
+ if (pevent->file_bigendian)
+ index = size - (len + 1);
+ else
+ index = len;
+
+ snprintf(buf, 3, "%02x", *((unsigned char *)data + index));
+ memcpy(str + i, buf, 2);
+ len++;
+ if (!(len & 3) && i > 0) {
+ i--;
+ str[i] = ',';
+ }
+ }
+
+ if (len_arg >= 0)
+ trace_seq_printf(s, format, len_arg, str);
+ else
+ trace_seq_printf(s, format, str);
+
+ free(str);
+}
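+
+/*
+ * Output format example: on a little-endian trace file, an 8-byte
+ * bitmask holding the value 1 is printed as "00000000,00000001":
+ * two hex digits per byte, most significant byte first, with a comma
+ * every 32 bits.
+ */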
+
+static void print_str_arg(struct trace_seq *s, void *data, int size,
+ struct event_format *event, const char *format,
+ int len_arg, struct print_arg *arg)
+{
+ struct pevent *pevent = event->pevent;
+ struct print_flag_sym *flag;
+ struct format_field *field;
+ struct printk_map *printk;
+ unsigned long long val, fval;
+ unsigned long addr;
+ char *str;
+ unsigned char *hex;
+ int print;
+ int i, len;
+
+ switch (arg->type) {
+ case PRINT_NULL:
+ /* ?? */
+ return;
+ case PRINT_ATOM:
+ print_str_to_seq(s, format, len_arg, arg->atom.atom);
+ return;
+ case PRINT_FIELD:
+ field = arg->field.field;
+ if (!field) {
+ field = pevent_find_any_field(event, arg->field.name);
+ if (!field) {
+ str = arg->field.name;
+ goto out_warning_field;
+ }
+ arg->field.field = field;
+ }
+ /* Zero-sized fields mean the rest of the data */
+ len = field->size ? : size - field->offset;
+
+ /*
+ * Some events pass in pointers. If this is not an array
+ * and the size is the same as long_size, assume that it
+ * is a pointer.
+ */
+ if (!(field->flags & FIELD_IS_ARRAY) &&
+ field->size == pevent->long_size) {
+ addr = *(unsigned long *)(data + field->offset);
+ /* Check if it matches a print format */
+ printk = find_printk(pevent, addr);
+ if (printk)
+ trace_seq_puts(s, printk->printk);
+ else
+ trace_seq_printf(s, "%lx", addr);
+ break;
+ }
+ str = malloc(len + 1);
+ if (!str) {
+ do_warning_event(event, "%s: not enough memory!",
+ __func__);
+ return;
+ }
+ memcpy(str, data + field->offset, len);
+ str[len] = 0;
+ print_str_to_seq(s, format, len_arg, str);
+ free(str);
+ break;
+ case PRINT_FLAGS:
+ val = eval_num_arg(data, size, event, arg->flags.field);
+ print = 0;
+ for (flag = arg->flags.flags; flag; flag = flag->next) {
+ fval = eval_flag(flag->value);
+ if (!val && !fval) {
+ print_str_to_seq(s, format, len_arg, flag->str);
+ break;
+ }
+ if (fval && (val & fval) == fval) {
+ if (print && arg->flags.delim)
+ trace_seq_puts(s, arg->flags.delim);
+ print_str_to_seq(s, format, len_arg, flag->str);
+ print = 1;
+ val &= ~fval;
+ }
+ }
+ break;
+ case PRINT_SYMBOL:
+ val = eval_num_arg(data, size, event, arg->symbol.field);
+ for (flag = arg->symbol.symbols; flag; flag = flag->next) {
+ fval = eval_flag(flag->value);
+ if (val == fval) {
+ print_str_to_seq(s, format, len_arg, flag->str);
+ break;
+ }
+ }
+ break;
+ case PRINT_HEX:
+ if (arg->hex.field->type == PRINT_DYNAMIC_ARRAY) {
+ unsigned long offset;
+ offset = pevent_read_number(pevent,
+ data + arg->hex.field->dynarray.field->offset,
+ arg->hex.field->dynarray.field->size);
+ hex = data + (offset & 0xffff);
+ } else {
+ field = arg->hex.field->field.field;
+ if (!field) {
+ str = arg->hex.field->field.name;
+ field = pevent_find_any_field(event, str);
+ if (!field)
+ goto out_warning_field;
+ arg->hex.field->field.field = field;
+ }
+ hex = data + field->offset;
+ }
+ len = eval_num_arg(data, size, event, arg->hex.size);
+ for (i = 0; i < len; i++) {
+ if (i)
+ trace_seq_putc(s, ' ');
+ trace_seq_printf(s, "%02x", hex[i]);
+ }
+ break;
+
+ case PRINT_TYPE:
+ break;
+ case PRINT_STRING: {
+ int str_offset;
+
+ if (arg->string.offset == -1) {
+ struct format_field *f;
+
+ f = pevent_find_any_field(event, arg->string.string);
+ arg->string.offset = f->offset;
+ }
+ str_offset = data2host4(pevent, data + arg->string.offset);
+ str_offset &= 0xffff;
+ print_str_to_seq(s, format, len_arg, ((char *)data) + str_offset);
+ break;
+ }
+ case PRINT_BSTRING:
+ print_str_to_seq(s, format, len_arg, arg->string.string);
+ break;
+ case PRINT_BITMASK: {
+ int bitmask_offset;
+ int bitmask_size;
+
+ if (arg->bitmask.offset == -1) {
+ struct format_field *f;
+
+ f = pevent_find_any_field(event, arg->bitmask.bitmask);
+ arg->bitmask.offset = f->offset;
+ }
+ bitmask_offset = data2host4(pevent, data + arg->bitmask.offset);
+ bitmask_size = bitmask_offset >> 16;
+ bitmask_offset &= 0xffff;
+ print_bitmask_to_seq(pevent, s, format, len_arg,
+ data + bitmask_offset, bitmask_size);
+ break;
+ }
+ case PRINT_OP:
+ /*
+ * The only op for string should be ? :
+ */
+ if (arg->op.op[0] != '?')
+ return;
+ val = eval_num_arg(data, size, event, arg->op.left);
+ if (val)
+ print_str_arg(s, data, size, event,
+ format, len_arg, arg->op.right->op.left);
+ else
+ print_str_arg(s, data, size, event,
+ format, len_arg, arg->op.right->op.right);
+ break;
+ case PRINT_FUNC:
+ process_defined_func(s, data, size, event, arg);
+ break;
+ default:
+ /* well... */
+ break;
+ }
+
+ return;
+
+out_warning_field:
+ do_warning_event(event, "%s: field %s not found",
+ __func__, arg->field.name);
+}
+
+static unsigned long long
+process_defined_func(struct trace_seq *s, void *data, int size,
+ struct event_format *event, struct print_arg *arg)
+{
+ struct pevent_function_handler *func_handle = arg->func.func;
+ struct pevent_func_params *param;
+ unsigned long long *args;
+ unsigned long long ret;
+ struct print_arg *farg;
+ struct trace_seq str;
+ struct save_str {
+ struct save_str *next;
+ char *str;
+ } *strings = NULL, *string;
+ int i;
+
+ if (!func_handle->nr_args) {
+ ret = (*func_handle->func)(s, NULL);
+ goto out;
+ }
+
+ farg = arg->func.args;
+ param = func_handle->params;
+
+ ret = ULLONG_MAX;
+ args = malloc(sizeof(*args) * func_handle->nr_args);
+ if (!args)
+ goto out;
+
+ for (i = 0; i < func_handle->nr_args; i++) {
+ switch (param->type) {
+ case PEVENT_FUNC_ARG_INT:
+ case PEVENT_FUNC_ARG_LONG:
+ case PEVENT_FUNC_ARG_PTR:
+ args[i] = eval_num_arg(data, size, event, farg);
+ break;
+ case PEVENT_FUNC_ARG_STRING:
+ trace_seq_init(&str);
+ print_str_arg(&str, data, size, event, "%s", -1, farg);
+ trace_seq_terminate(&str);
+ string = malloc(sizeof(*string));
+ if (!string) {
+ do_warning_event(event, "%s(%d): malloc str",
+ __func__, __LINE__);
+ goto out_free;
+ }
+ string->next = strings;
+ string->str = strdup(str.buffer);
+ if (!string->str) {
+ free(string);
+ do_warning_event(event, "%s(%d): malloc str",
+ __func__, __LINE__);
+ goto out_free;
+ }
+ args[i] = (uintptr_t)string->str;
+ strings = string;
+ trace_seq_destroy(&str);
+ break;
+ default:
+ /*
+ * Something went totally wrong; this is not
+ * an input error, something in this code broke.
+ */
+ do_warning_event(event, "Unexpected end of arguments\n");
+ goto out_free;
+ }
+ farg = farg->next;
+ param = param->next;
+ }
+
+ ret = (*func_handle->func)(s, args);
+out_free:
+ free(args);
+ while (strings) {
+ string = strings;
+ strings = string->next;
+ free(string->str);
+ free(string);
+ }
+
+ out:
+ /* TBD : handle return type here */
+ return ret;
+}
+
+static void free_args(struct print_arg *args)
+{
+ struct print_arg *next;
+
+ while (args) {
+ next = args->next;
+
+ free_arg(args);
+ args = next;
+ }
+}
+
+static struct print_arg *make_bprint_args(char *fmt, void *data, int size, struct event_format *event)
+{
+ struct pevent *pevent = event->pevent;
+ struct format_field *field, *ip_field;
+ struct print_arg *args, *arg, **next;
+ unsigned long long ip, val;
+ char *ptr;
+ void *bptr;
+ int vsize;
+
+ field = pevent->bprint_buf_field;
+ ip_field = pevent->bprint_ip_field;
+
+ if (!field) {
+ field = pevent_find_field(event, "buf");
+ if (!field) {
+ do_warning_event(event, "can't find buffer field for binary printk");
+ return NULL;
+ }
+ ip_field = pevent_find_field(event, "ip");
+ if (!ip_field) {
+ do_warning_event(event, "can't find ip field for binary printk");
+ return NULL;
+ }
+ pevent->bprint_buf_field = field;
+ pevent->bprint_ip_field = ip_field;
+ }
+
+ ip = pevent_read_number(pevent, data + ip_field->offset, ip_field->size);
+
+ /*
+ * The first arg is the IP pointer.
+ */
+ args = alloc_arg();
+ if (!args) {
+ do_warning_event(event, "%s(%d): not enough memory!",
+ __func__, __LINE__);
+ return NULL;
+ }
+ arg = args;
+ arg->next = NULL;
+ next = &arg->next;
+
+ arg->type = PRINT_ATOM;
+
+ if (asprintf(&arg->atom.atom, "%lld", ip) < 0)
+ goto out_free;
+
+ /* skip the first "%pf: " */
+ for (ptr = fmt + 5, bptr = data + field->offset;
+ bptr < data + size && *ptr; ptr++) {
+ int ls = 0;
+
+ if (*ptr == '%') {
+ process_again:
+ ptr++;
+ switch (*ptr) {
+ case '%':
+ break;
+ case 'l':
+ ls++;
+ goto process_again;
+ case 'L':
+ ls = 2;
+ goto process_again;
+ case '0' ... '9':
+ goto process_again;
+ case '.':
+ goto process_again;
+ case 'p':
+ ls = 1;
+ /* fall through */
+ case 'd':
+ case 'u':
+ case 'x':
+ case 'i':
+ switch (ls) {
+ case 0:
+ vsize = 4;
+ break;
+ case 1:
+ vsize = pevent->long_size;
+ break;
+ case 2:
+ vsize = 8;
+ break;
+ default:
+ vsize = ls; /* ? */
+ break;
+ }
+ /* fall through */
+ case '*':
+ if (*ptr == '*')
+ vsize = 4;
+
+ /* the pointers are always 4-byte aligned */
+ bptr = (void *)(((unsigned long)bptr + 3) &
+ ~3);
+ val = pevent_read_number(pevent, bptr, vsize);
+ bptr += vsize;
+ arg = alloc_arg();
+ if (!arg) {
+ do_warning_event(event, "%s(%d): not enough memory!",
+ __func__, __LINE__);
+ goto out_free;
+ }
+ arg->next = NULL;
+ arg->type = PRINT_ATOM;
+ if (asprintf(&arg->atom.atom, "%lld", val) < 0) {
+ free(arg);
+ goto out_free;
+ }
+ *next = arg;
+ next = &arg->next;
+ /*
+ * The '*' case means that an arg is used as the length.
+ * We need to continue to figure out for what.
+ */
+ if (*ptr == '*')
+ goto process_again;
+
+ break;
+ case 's':
+ arg = alloc_arg();
+ if (!arg) {
+ do_warning_event(event, "%s(%d): not enough memory!",
+ __func__, __LINE__);
+ goto out_free;
+ }
+ arg->next = NULL;
+ arg->type = PRINT_BSTRING;
+ arg->string.string = strdup(bptr);
+ if (!arg->string.string)
+ goto out_free;
+ bptr += strlen(bptr) + 1;
+ *next = arg;
+ next = &arg->next;
+ default:
+ break;
+ }
+ }
+ }
+
+ return args;
+
+out_free:
+ free_args(args);
+ return NULL;
+}
+
+static char *
+get_bprint_format(void *data, int size __maybe_unused,
+ struct event_format *event)
+{
+ struct pevent *pevent = event->pevent;
+ unsigned long long addr;
+ struct format_field *field;
+ struct printk_map *printk;
+ char *format;
+
+ field = pevent->bprint_fmt_field;
+
+ if (!field) {
+ field = pevent_find_field(event, "fmt");
+ if (!field) {
+ do_warning_event(event, "can't find format field for binary printk");
+ return NULL;
+ }
+ pevent->bprint_fmt_field = field;
+ }
+
+ addr = pevent_read_number(pevent, data + field->offset, field->size);
+
+ printk = find_printk(pevent, addr);
+ if (!printk) {
+ if (asprintf(&format, "%%pf: (NO FORMAT FOUND at %llx)\n", addr) < 0)
+ return NULL;
+ return format;
+ }
+
+ if (asprintf(&format, "%s: %s", "%pf", printk->printk) < 0)
+ return NULL;
+
+ return format;
+}
+
+static void print_mac_arg(struct trace_seq *s, int mac, void *data, int size,
+ struct event_format *event, struct print_arg *arg)
+{
+ unsigned char *buf;
+ const char *fmt = "%.2x:%.2x:%.2x:%.2x:%.2x:%.2x";
+
+ if (arg->type == PRINT_FUNC) {
+ process_defined_func(s, data, size, event, arg);
+ return;
+ }
+
+ if (arg->type != PRINT_FIELD) {
+ trace_seq_printf(s, "ARG TYPE NOT FIELD BUT %d",
+ arg->type);
+ return;
+ }
+
+ if (mac == 'm')
+ fmt = "%.2x%.2x%.2x%.2x%.2x%.2x";
+ if (!arg->field.field) {
+ arg->field.field =
+ pevent_find_any_field(event, arg->field.name);
+ if (!arg->field.field) {
+ do_warning_event(event, "%s: field %s not found",
+ __func__, arg->field.name);
+ return;
+ }
+ }
+ if (arg->field.field->size != 6) {
+ trace_seq_printf(s, "INVALIDMAC");
+ return;
+ }
+ buf = data + arg->field.field->offset;
+ trace_seq_printf(s, fmt, buf[0], buf[1], buf[2], buf[3], buf[4], buf[5]);
+}
+
+static int is_printable_array(char *p, unsigned int len)
+{
+ unsigned int i;
+
+ for (i = 0; i < len && p[i]; i++)
+ if (!isprint(p[i]) && !isspace(p[i]))
+ return 0;
+ return 1;
+}
+
+static void print_event_fields(struct trace_seq *s, void *data,
+ int size __maybe_unused,
+ struct event_format *event)
+{
+ struct format_field *field;
+ unsigned long long val;
+ unsigned int offset, len, i;
+
+ field = event->format.fields;
+ while (field) {
+ trace_seq_printf(s, " %s=", field->name);
+ if (field->flags & FIELD_IS_ARRAY) {
+ offset = field->offset;
+ len = field->size;
+ if (field->flags & FIELD_IS_DYNAMIC) {
+ val = pevent_read_number(event->pevent, data + offset, len);
+ offset = val;
+ len = offset >> 16;
+ offset &= 0xffff;
+ }
+ if (field->flags & FIELD_IS_STRING &&
+ is_printable_array(data + offset, len)) {
+ trace_seq_printf(s, "%s", (char *)data + offset);
+ } else {
+ trace_seq_puts(s, "ARRAY[");
+ for (i = 0; i < len; i++) {
+ if (i)
+ trace_seq_puts(s, ", ");
+ trace_seq_printf(s, "%02x",
+ *((unsigned char *)data + offset + i));
+ }
+ trace_seq_putc(s, ']');
+ field->flags &= ~FIELD_IS_STRING;
+ }
+ } else {
+ val = pevent_read_number(event->pevent, data + field->offset,
+ field->size);
+ if (field->flags & FIELD_IS_POINTER) {
+ trace_seq_printf(s, "0x%llx", val);
+ } else if (field->flags & FIELD_IS_SIGNED) {
+ switch (field->size) {
+ case 4:
+ /*
+ * If field is long then print it in hex.
+ * A long usually stores pointers.
+ */
+ if (field->flags & FIELD_IS_LONG)
+ trace_seq_printf(s, "0x%x", (int)val);
+ else
+ trace_seq_printf(s, "%d", (int)val);
+ break;
+ case 2:
+ trace_seq_printf(s, "%2d", (short)val);
+ break;
+ case 1:
+ trace_seq_printf(s, "%1d", (char)val);
+ break;
+ default:
+ trace_seq_printf(s, "%lld", val);
+ }
+ } else {
+ if (field->flags & FIELD_IS_LONG)
+ trace_seq_printf(s, "0x%llx", val);
+ else
+ trace_seq_printf(s, "%llu", val);
+ }
+ }
+ field = field->next;
+ }
+}
+
+static void pretty_print(struct trace_seq *s, void *data, int size, struct event_format *event)
+{
+ struct pevent *pevent = event->pevent;
+ struct print_fmt *print_fmt = &event->print_fmt;
+ struct print_arg *arg = print_fmt->args;
+ struct print_arg *args = NULL;
+ const char *ptr = print_fmt->format;
+ unsigned long long val;
+ struct func_map *func;
+ const char *saveptr;
+ struct trace_seq p;
+ char *bprint_fmt = NULL;
+ char format[32];
+ int show_func;
+ int len_as_arg;
+ int len_arg;
+ int len;
+ int ls;
+
+ if (event->flags & EVENT_FL_FAILED) {
+ trace_seq_printf(s, "[FAILED TO PARSE]");
+ print_event_fields(s, data, size, event);
+ return;
+ }
+
+ if (event->flags & EVENT_FL_ISBPRINT) {
+ bprint_fmt = get_bprint_format(data, size, event);
+ args = make_bprint_args(bprint_fmt, data, size, event);
+ arg = args;
+ ptr = bprint_fmt;
+ }
+
+ for (; *ptr; ptr++) {
+ ls = 0;
+ if (*ptr == '\\') {
+ ptr++;
+ switch (*ptr) {
+ case 'n':
+ trace_seq_putc(s, '\n');
+ break;
+ case 't':
+ trace_seq_putc(s, '\t');
+ break;
+ case 'r':
+ trace_seq_putc(s, '\r');
+ break;
+ case '\\':
+ trace_seq_putc(s, '\\');
+ break;
+ default:
+ trace_seq_putc(s, *ptr);
+ break;
+ }
+
+ } else if (*ptr == '%') {
+ saveptr = ptr;
+ show_func = 0;
+ len_as_arg = 0;
+ cont_process:
+ ptr++;
+ switch (*ptr) {
+ case '%':
+ trace_seq_putc(s, '%');
+ break;
+ case '#':
+ /* FIXME: need to handle properly */
+ goto cont_process;
+ case 'h':
+ ls--;
+ goto cont_process;
+ case 'l':
+ ls++;
+ goto cont_process;
+ case 'L':
+ ls = 2;
+ goto cont_process;
+ case '*':
+ /* The argument is the length. */
+ if (!arg) {
+ do_warning_event(event, "no argument match");
+ event->flags |= EVENT_FL_FAILED;
+ goto out_failed;
+ }
+ len_arg = eval_num_arg(data, size, event, arg);
+ len_as_arg = 1;
+ arg = arg->next;
+ goto cont_process;
+ case '.':
+ case 'z':
+ case 'Z':
+ case '0' ... '9':
+ goto cont_process;
+ case 'p':
+ if (pevent->long_size == 4)
+ ls = 1;
+ else
+ ls = 2;
+
+ if (*(ptr+1) == 'F' ||
+ *(ptr+1) == 'f') {
+ ptr++;
+ show_func = *ptr;
+ } else if (*(ptr+1) == 'M' || *(ptr+1) == 'm') {
+ print_mac_arg(s, *(ptr+1), data, size, event, arg);
+ ptr++;
+ arg = arg->next;
+ break;
+ }
+
+ /* fall through */
+ case 'd':
+ case 'i':
+ case 'x':
+ case 'X':
+ case 'u':
+ if (!arg) {
+ do_warning_event(event, "no argument match");
+ event->flags |= EVENT_FL_FAILED;
+ goto out_failed;
+ }
+
+ len = ((unsigned long)ptr + 1) -
+ (unsigned long)saveptr;
+
+ /* should never happen */
+ if (len > 31) {
+ do_warning_event(event, "bad format!");
+ event->flags |= EVENT_FL_FAILED;
+ len = 31;
+ }
+
+ memcpy(format, saveptr, len);
+ format[len] = 0;
+
+ val = eval_num_arg(data, size, event, arg);
+ arg = arg->next;
+
+ if (show_func) {
+ func = find_func(pevent, val);
+ if (func) {
+ trace_seq_puts(s, func->func);
+ if (show_func == 'F')
+ trace_seq_printf(s,
+ "+0x%llx",
+ val - func->addr);
+ break;
+ }
+ }
+ if (pevent->long_size == 8 && ls &&
+ sizeof(long) != 8) {
+ char *p;
+
+ ls = 2;
+ /* make %l into %ll */
+ p = strchr(format, 'l');
+ if (p)
+ memmove(p+1, p, strlen(p)+1);
+ else if (strcmp(format, "%p") == 0)
+ strcpy(format, "0x%llx");
+ }
+ switch (ls) {
+ case -2:
+ if (len_as_arg)
+ trace_seq_printf(s, format, len_arg, (char)val);
+ else
+ trace_seq_printf(s, format, (char)val);
+ break;
+ case -1:
+ if (len_as_arg)
+ trace_seq_printf(s, format, len_arg, (short)val);
+ else
+ trace_seq_printf(s, format, (short)val);
+ break;
+ case 0:
+ if (len_as_arg)
+ trace_seq_printf(s, format, len_arg, (int)val);
+ else
+ trace_seq_printf(s, format, (int)val);
+ break;
+ case 1:
+ if (len_as_arg)
+ trace_seq_printf(s, format, len_arg, (long)val);
+ else
+ trace_seq_printf(s, format, (long)val);
+ break;
+ case 2:
+ if (len_as_arg)
+ trace_seq_printf(s, format, len_arg,
+ (long long)val);
+ else
+ trace_seq_printf(s, format, (long long)val);
+ break;
+ default:
+ do_warning_event(event, "bad count (%d)", ls);
+ event->flags |= EVENT_FL_FAILED;
+ }
+ break;
+ case 's':
+ if (!arg) {
+ do_warning_event(event, "no matching argument");
+ event->flags |= EVENT_FL_FAILED;
+ goto out_failed;
+ }
+
+ len = ((unsigned long)ptr + 1) -
+ (unsigned long)saveptr;
+
+ /* should never happen */
+ if (len > 31) {
+ do_warning_event(event, "bad format!");
+ event->flags |= EVENT_FL_FAILED;
+ len = 31;
+ }
+
+ memcpy(format, saveptr, len);
+ format[len] = 0;
+ if (!len_as_arg)
+ len_arg = -1;
+ /* Use helper trace_seq */
+ trace_seq_init(&p);
+ print_str_arg(&p, data, size, event,
+ format, len_arg, arg);
+ trace_seq_terminate(&p);
+ trace_seq_puts(s, p.buffer);
+ trace_seq_destroy(&p);
+ arg = arg->next;
+ break;
+ default:
+ trace_seq_printf(s, ">%c<", *ptr);
+
+ }
+ } else
+ trace_seq_putc(s, *ptr);
+ }
+
+ if (event->flags & EVENT_FL_FAILED) {
+out_failed:
+ trace_seq_printf(s, "[FAILED TO PARSE]");
+ }
+
+ if (args) {
+ free_args(args);
+ free(bprint_fmt);
+ }
+}
+
+/**
+ * pevent_data_lat_fmt - parse the data for the latency format
+ * @pevent: a handle to the pevent
+ * @s: the trace_seq to write to
+ * @record: the record to read from
+ *
+ * This parses out the Latency format (interrupts disabled,
+ * need rescheduling, in hard/soft interrupt, preempt count
+ * and lock depth) and places it into the trace_seq.
+ */
+void pevent_data_lat_fmt(struct pevent *pevent,
+ struct trace_seq *s, struct pevent_record *record)
+{
+ static int check_lock_depth = 1;
+ static int check_migrate_disable = 1;
+ static int lock_depth_exists;
+ static int migrate_disable_exists;
+ unsigned int lat_flags;
+ unsigned int pc;
+ int lock_depth;
+ int migrate_disable;
+ int hardirq;
+ int softirq;
+ void *data = record->data;
+
+ lat_flags = parse_common_flags(pevent, data);
+ pc = parse_common_pc(pevent, data);
+ /* lock_depth may not always exist */
+ if (lock_depth_exists)
+ lock_depth = parse_common_lock_depth(pevent, data);
+ else if (check_lock_depth) {
+ lock_depth = parse_common_lock_depth(pevent, data);
+ if (lock_depth < 0)
+ check_lock_depth = 0;
+ else
+ lock_depth_exists = 1;
+ }
+
+ /* migrate_disable may not always exist */
+ if (migrate_disable_exists)
+ migrate_disable = parse_common_migrate_disable(pevent, data);
+ else if (check_migrate_disable) {
+ migrate_disable = parse_common_migrate_disable(pevent, data);
+ if (migrate_disable < 0)
+ check_migrate_disable = 0;
+ else
+ migrate_disable_exists = 1;
+ }
+
+ hardirq = lat_flags & TRACE_FLAG_HARDIRQ;
+ softirq = lat_flags & TRACE_FLAG_SOFTIRQ;
+
+ trace_seq_printf(s, "%c%c%c",
+ (lat_flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
+ (lat_flags & TRACE_FLAG_IRQS_NOSUPPORT) ?
+ 'X' : '.',
+ (lat_flags & TRACE_FLAG_NEED_RESCHED) ?
+ 'N' : '.',
+ (hardirq && softirq) ? 'H' :
+ hardirq ? 'h' : softirq ? 's' : '.');
+
+ if (pc)
+ trace_seq_printf(s, "%x", pc);
+ else
+ trace_seq_putc(s, '.');
+
+ if (migrate_disable_exists) {
+ if (migrate_disable < 0)
+ trace_seq_putc(s, '.');
+ else
+ trace_seq_printf(s, "%d", migrate_disable);
+ }
+
+ if (lock_depth_exists) {
+ if (lock_depth < 0)
+ trace_seq_putc(s, '.');
+ else
+ trace_seq_printf(s, "%d", lock_depth);
+ }
+
+ trace_seq_terminate(s);
+}
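+
+/*
+ * Example of the latency prefix this produces: "dNs1" means interrupts
+ * disabled, need-resched set, inside a softirq, and a preempt count of
+ * 1; a flag that does not apply is printed as '.'.
+ */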
+
+/**
+ * pevent_data_type - parse out the given event type
+ * @pevent: a handle to the pevent
+ * @rec: the record to read from
+ *
+ * This returns the event id from the @rec.
+ */
+int pevent_data_type(struct pevent *pevent, struct pevent_record *rec)
+{
+ return trace_parse_common_type(pevent, rec->data);
+}
+
+/**
+ * pevent_data_event_from_type - find the event by a given type
+ * @pevent: a handle to the pevent
+ * @type: the type of the event.
+ *
+ * This returns the event for a given @type.
+ */
+struct event_format *pevent_data_event_from_type(struct pevent *pevent, int type)
+{
+ return pevent_find_event(pevent, type);
+}
+
+/**
+ * pevent_data_pid - parse the PID from raw data
+ * @pevent: a handle to the pevent
+ * @rec: the record to parse
+ *
+ * This returns the PID from the raw data.
+ */
+int pevent_data_pid(struct pevent *pevent, struct pevent_record *rec)
+{
+ return parse_common_pid(pevent, rec->data);
+}
+
+/**
+ * pevent_data_comm_from_pid - return the command line from PID
+ * @pevent: a handle to the pevent
+ * @pid: the PID of the task to search for
+ *
+ * This returns a pointer to the command line (comm) of the task
+ * with the given @pid.
+ */
+const char *pevent_data_comm_from_pid(struct pevent *pevent, int pid)
+{
+ const char *comm;
+
+ comm = find_cmdline(pevent, pid);
+ return comm;
+}
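+
+/*
+ * Typical use of the record accessors above, assuming @record came
+ * from the trace reader:
+ *
+ *   int type = pevent_data_type(pevent, record);
+ *   struct event_format *event = pevent_data_event_from_type(pevent, type);
+ *   int pid = pevent_data_pid(pevent, record);
+ *   const char *comm = pevent_data_comm_from_pid(pevent, pid);
+ */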
+
+/**
+ * pevent_event_info - parse the data into the print format
+ * @s: the trace_seq to write to
+ * @event: the handle to the event
+ * @record: the record to read from
+ *
+ * This parses the raw @data using the given @event information and
+ * writes the print format into the trace_seq.
+ */
+void pevent_event_info(struct trace_seq *s, struct event_format *event,
+ struct pevent_record *record)
+{
+ int print_pretty = 1;
+
+ if (event->pevent->print_raw || (event->flags & EVENT_FL_PRINTRAW))
+ print_event_fields(s, record->data, record->size, event);
+ else {
+
+ if (event->handler && !(event->flags & EVENT_FL_NOHANDLE))
+ print_pretty = event->handler(s, record, event,
+ event->context);
+
+ if (print_pretty)
+ pretty_print(s, record->data, record->size, event);
+ }
+
+ trace_seq_terminate(s);
+}
+
+static bool is_timestamp_in_us(char *trace_clock, bool use_trace_clock)
+{
+ if (!use_trace_clock)
+ return true;
+
+ if (!strcmp(trace_clock, "local") || !strcmp(trace_clock, "global")
+ || !strcmp(trace_clock, "uptime") || !strcmp(trace_clock, "perf"))
+ return true;
+
+ /* trace_clock is set to tsc or counter mode */
+ return false;
+}
+
+void pevent_print_event(struct pevent *pevent, struct trace_seq *s,
+ struct pevent_record *record, bool use_trace_clock)
+{
+ static const char *spaces = "                    "; /* 20 spaces */
+ struct event_format *event;
+ unsigned long secs;
+ unsigned long usecs;
+ unsigned long nsecs;
+ const char *comm;
+ void *data = record->data;
+ int type;
+ int pid;
+ int len;
+ int p;
+ bool use_usec_format;
+
+ use_usec_format = is_timestamp_in_us(pevent->trace_clock,
+ use_trace_clock);
+ if (use_usec_format) {
+ secs = record->ts / NSECS_PER_SEC;
+ nsecs = record->ts - secs * NSECS_PER_SEC;
+ }
+
+ if (record->size < 0) {
+ do_warning("ug! negative record size %d", record->size);
+ return;
+ }
+
+ type = trace_parse_common_type(pevent, data);
+
+ event = pevent_find_event(pevent, type);
+ if (!event) {
+ do_warning("ug! no event found for type %d", type);
+ return;
+ }
+
+ pid = parse_common_pid(pevent, data);
+ comm = find_cmdline(pevent, pid);
+
+ if (pevent->latency_format) {
+ trace_seq_printf(s, "%8.8s-%-5d %3d",
+ comm, pid, record->cpu);
+ pevent_data_lat_fmt(pevent, s, record);
+ } else
+ trace_seq_printf(s, "%16s-%-5d [%03d]", comm, pid, record->cpu);
+
+ if (use_usec_format) {
+ if (pevent->flags & PEVENT_NSEC_OUTPUT) {
+ usecs = nsecs;
+ p = 9;
+ } else {
+ usecs = (nsecs + 500) / NSECS_PER_USEC;
+ p = 6;
+ }
+
+ trace_seq_printf(s, " %5lu.%0*lu: %s: ",
+ secs, p, usecs, event->name);
+ } else
+ trace_seq_printf(s, " %12llu: %s: ",
+ record->ts, event->name);
+
+ /* Space out the event names evenly. */
+ len = strlen(event->name);
+ if (len < 20)
+ trace_seq_printf(s, "%.*s", 20 - len, spaces);
+
+ pevent_event_info(s, event, record);
+}
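+
+/*
+ * Putting it together, a caller typically formats one record like this
+ * (a sketch; error handling and the record source are omitted):
+ *
+ *   struct trace_seq s;
+ *
+ *   trace_seq_init(&s);
+ *   pevent_print_event(pevent, &s, record, true);
+ *   trace_seq_do_printf(&s);
+ *   trace_seq_destroy(&s);
+ */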
+
+static int events_id_cmp(const void *a, const void *b)
+{
+ struct event_format * const * ea = a;
+ struct event_format * const * eb = b;
+
+ if ((*ea)->id < (*eb)->id)
+ return -1;
+
+ if ((*ea)->id > (*eb)->id)
+ return 1;
+
+ return 0;
+}
+
+static int events_name_cmp(const void *a, const void *b)
+{
+ struct event_format * const * ea = a;
+ struct event_format * const * eb = b;
+ int res;
+
+ res = strcmp((*ea)->name, (*eb)->name);
+ if (res)
+ return res;
+
+ res = strcmp((*ea)->system, (*eb)->system);
+ if (res)
+ return res;
+
+ return events_id_cmp(a, b);
+}
+
+static int events_system_cmp(const void *a, const void *b)
+{
+ struct event_format * const * ea = a;
+ struct event_format * const * eb = b;
+ int res;
+
+ res = strcmp((*ea)->system, (*eb)->system);
+ if (res)
+ return res;
+
+ res = strcmp((*ea)->name, (*eb)->name);
+ if (res)
+ return res;
+
+ return events_id_cmp(a, b);
+}
+
+struct event_format **pevent_list_events(struct pevent *pevent, enum event_sort_type sort_type)
+{
+ struct event_format **events;
+ int (*sort)(const void *a, const void *b);
+
+ events = pevent->sort_events;
+
+ if (events && pevent->last_type == sort_type)
+ return events;
+
+ if (!events) {
+ events = malloc(sizeof(*events) * (pevent->nr_events + 1));
+ if (!events)
+ return NULL;
+
+ memcpy(events, pevent->events, sizeof(*events) * pevent->nr_events);
+ events[pevent->nr_events] = NULL;
+
+ pevent->sort_events = events;
+
+ /* the internal events are sorted by id */
+ if (sort_type == EVENT_SORT_ID) {
+ pevent->last_type = sort_type;
+ return events;
+ }
+ }
+
+ switch (sort_type) {
+ case EVENT_SORT_ID:
+ sort = events_id_cmp;
+ break;
+ case EVENT_SORT_NAME:
+ sort = events_name_cmp;
+ break;
+ case EVENT_SORT_SYSTEM:
+ sort = events_system_cmp;
+ break;
+ default:
+ return events;
+ }
+
+ qsort(events, pevent->nr_events, sizeof(*events), sort);
+ pevent->last_type = sort_type;
+
+ return events;
+}
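+
+/*
+ * The returned array is NULL terminated and cached in @pevent, so the
+ * caller must not free it:
+ *
+ *   struct event_format **events;
+ *   int i;
+ *
+ *   events = pevent_list_events(pevent, EVENT_SORT_NAME);
+ *   for (i = 0; events && events[i]; i++)
+ *     printf("%s:%s\n", events[i]->system, events[i]->name);
+ */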
+
+static struct format_field **
+get_event_fields(const char *type, const char *name,
+ int count, struct format_field *list)
+{
+ struct format_field **fields;
+ struct format_field *field;
+ int i = 0;
+
+ fields = malloc(sizeof(*fields) * (count + 1));
+ if (!fields)
+ return NULL;
+
+ for (field = list; field; field = field->next) {
+ fields[i++] = field;
+ if (i == count + 1) {
+ do_warning("event %s has more %s fields than specified",
+ name, type);
+ i--;
+ break;
+ }
+ }
+
+ if (i != count)
+ do_warning("event %s has less %s fields than specified",
+ name, type);
+
+ fields[i] = NULL;
+
+ return fields;
+}
+
+/**
+ * pevent_event_common_fields - return a list of common fields for an event
+ * @event: the event to return the common fields of.
+ *
+ * Returns an allocated array of fields. The last item in the array is NULL.
+ * The array must be freed with free().
+ */
+struct format_field **pevent_event_common_fields(struct event_format *event)
+{
+ return get_event_fields("common", event->name,
+ event->format.nr_common,
+ event->format.common_fields);
+}
+
+/**
+ * pevent_event_fields - return a list of event specific fields for an event
+ * @event: the event to return the fields of.
+ *
+ * Returns an allocated array of fields. The last item in the array is NULL.
+ * The array must be freed with free().
+ */
+struct format_field **pevent_event_fields(struct event_format *event)
+{
+ return get_event_fields("event", event->name,
+ event->format.nr_fields,
+ event->format.fields);
+}
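+
+/*
+ * Unlike pevent_list_events(), these two return a freshly allocated,
+ * NULL terminated array that the caller must free() (the fields
+ * themselves still belong to the event):
+ *
+ *   struct format_field **fields = pevent_event_fields(event);
+ *   int i;
+ *
+ *   for (i = 0; fields && fields[i]; i++)
+ *     printf("%s\n", fields[i]->name);
+ *   free(fields);
+ */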
+
+static void print_fields(struct trace_seq *s, struct print_flag_sym *field)
+{
+ trace_seq_printf(s, "{ %s, %s }", field->value, field->str);
+ if (field->next) {
+ trace_seq_puts(s, ", ");
+ print_fields(s, field->next);
+ }
+}
+
+/* for debugging */
+static void print_args(struct print_arg *args)
+{
+ int print_paren = 1;
+ struct trace_seq s;
+
+ switch (args->type) {
+ case PRINT_NULL:
+ printf("null");
+ break;
+ case PRINT_ATOM:
+ printf("%s", args->atom.atom);
+ break;
+ case PRINT_FIELD:
+ printf("REC->%s", args->field.name);
+ break;
+ case PRINT_FLAGS:
+ printf("__print_flags(");
+ print_args(args->flags.field);
+ printf(", %s, ", args->flags.delim);
+ trace_seq_init(&s);
+ print_fields(&s, args->flags.flags);
+ trace_seq_do_printf(&s);
+ trace_seq_destroy(&s);
+ printf(")");
+ break;
+ case PRINT_SYMBOL:
+ printf("__print_symbolic(");
+ print_args(args->symbol.field);
+ printf(", ");
+ trace_seq_init(&s);
+ print_fields(&s, args->symbol.symbols);
+ trace_seq_do_printf(&s);
+ trace_seq_destroy(&s);
+ printf(")");
+ break;
+ case PRINT_HEX:
+ printf("__print_hex(");
+ print_args(args->hex.field);
+ printf(", ");
+ print_args(args->hex.size);
+ printf(")");
+ break;
+ case PRINT_STRING:
+ case PRINT_BSTRING:
+ printf("__get_str(%s)", args->string.string);
+ break;
+ case PRINT_BITMASK:
+ printf("__get_bitmask(%s)", args->bitmask.bitmask);
+ break;
+ case PRINT_TYPE:
+ printf("(%s)", args->typecast.type);
+ print_args(args->typecast.item);
+ break;
+ case PRINT_OP:
+ if (strcmp(args->op.op, ":") == 0)
+ print_paren = 0;
+ if (print_paren)
+ printf("(");
+ print_args(args->op.left);
+ printf(" %s ", args->op.op);
+ print_args(args->op.right);
+ if (print_paren)
+ printf(")");
+ break;
+ default:
+ /* we should warn... */
+ return;
+ }
+ if (args->next) {
+ printf("\n");
+ print_args(args->next);
+ }
+}
+
+static void parse_header_field(const char *field,
+ int *offset, int *size, int mandatory)
+{
+ unsigned long long save_input_buf_ptr;
+ unsigned long long save_input_buf_siz;
+ char *token;
+ int type;
+
+ save_input_buf_ptr = input_buf_ptr;
+ save_input_buf_siz = input_buf_siz;
+
+ if (read_expected(EVENT_ITEM, "field") < 0)
+ return;
+ if (read_expected(EVENT_OP, ":") < 0)
+ return;
+
+ /* type */
+ if (read_expect_type(EVENT_ITEM, &token) < 0)
+ goto fail;
+ free_token(token);
+
+ /*
+ * If this is not a mandatory field, then test it first.
+ */
+ if (mandatory) {
+ if (read_expected(EVENT_ITEM, field) < 0)
+ return;
+ } else {
+ if (read_expect_type(EVENT_ITEM, &token) < 0)
+ goto fail;
+ if (strcmp(token, field) != 0)
+ goto discard;
+ free_token(token);
+ }
+
+ if (read_expected(EVENT_OP, ";") < 0)
+ return;
+ if (read_expected(EVENT_ITEM, "offset") < 0)
+ return;
+ if (read_expected(EVENT_OP, ":") < 0)
+ return;
+ if (read_expect_type(EVENT_ITEM, &token) < 0)
+ goto fail;
+ *offset = atoi(token);
+ free_token(token);
+ if (read_expected(EVENT_OP, ";") < 0)
+ return;
+ if (read_expected(EVENT_ITEM, "size") < 0)
+ return;
+ if (read_expected(EVENT_OP, ":") < 0)
+ return;
+ if (read_expect_type(EVENT_ITEM, &token) < 0)
+ goto fail;
+ *size = atoi(token);
+ free_token(token);
+ if (read_expected(EVENT_OP, ";") < 0)
+ return;
+ type = read_token(&token);
+ if (type != EVENT_NEWLINE) {
+ /* newer versions of the kernel have a "signed" type */
+ if (type != EVENT_ITEM)
+ goto fail;
+
+ if (strcmp(token, "signed") != 0)
+ goto fail;
+
+ free_token(token);
+
+ if (read_expected(EVENT_OP, ":") < 0)
+ return;
+
+ if (read_expect_type(EVENT_ITEM, &token))
+ goto fail;
+
+ free_token(token);
+ if (read_expected(EVENT_OP, ";") < 0)
+ return;
+
+ if (read_expect_type(EVENT_NEWLINE, &token))
+ goto fail;
+ }
+ fail:
+ free_token(token);
+ return;
+
+ discard:
+ input_buf_ptr = save_input_buf_ptr;
+ input_buf_siz = save_input_buf_siz;
+ *offset = 0;
+ *size = 0;
+ free_token(token);
+}
+
+/**
+ * pevent_parse_header_page - parse the data stored in the header page
+ * @pevent: the handle to the pevent
+ * @buf: the buffer storing the header page format string
+ * @size: the size of @buf
+ * @long_size: the long size to use if there is no header
+ *
+ * This parses the header page format for information on the
+ * ring buffer used. The @buf should be copied from
+ *
+ * /sys/kernel/debug/tracing/events/header_page
+ */
+int pevent_parse_header_page(struct pevent *pevent, char *buf, unsigned long size,
+ int long_size)
+{
+ int ignore;
+
+ if (!size) {
+ /*
+ * Old kernels did not have header page info.
+ * Sorry but we just use what we find here in user space.
+ */
+ pevent->header_page_ts_size = sizeof(long long);
+ pevent->header_page_size_size = long_size;
+ pevent->header_page_data_offset = sizeof(long long) + long_size;
+ pevent->old_format = 1;
+ return -1;
+ }
+ init_input_buf(buf, size);
+
+ parse_header_field("timestamp", &pevent->header_page_ts_offset,
+ &pevent->header_page_ts_size, 1);
+ parse_header_field("commit", &pevent->header_page_size_offset,
+ &pevent->header_page_size_size, 1);
+ parse_header_field("overwrite", &pevent->header_page_overwrite,
+ &ignore, 0);
+ parse_header_field("data", &pevent->header_page_data_offset,
+ &pevent->header_page_data_size, 1);
+
+ return 0;
+}
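
/*
 * Usage sketch (illustrative only): read the header_page description from
 * debugfs and hand it to pevent_parse_header_page().  The path and the use
 * of sizeof(long) for @long_size are assumptions made for the example.
 */
#include <stdio.h>
#include "event-parse.h"

static int parse_header_page_file(struct pevent *pevent)
{
	const char *path = "/sys/kernel/debug/tracing/events/header_page";
	char buf[4096];
	size_t size;
	FILE *fp;

	fp = fopen(path, "r");
	if (!fp)
		return -1;

	size = fread(buf, 1, sizeof(buf), fp);
	fclose(fp);

	return pevent_parse_header_page(pevent, buf, size, sizeof(long));
}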
+
+static int event_matches(struct event_format *event,
+ int id, const char *sys_name,
+ const char *event_name)
+{
+ if (id >= 0 && id != event->id)
+ return 0;
+
+ if (event_name && (strcmp(event_name, event->name) != 0))
+ return 0;
+
+ if (sys_name && (strcmp(sys_name, event->system) != 0))
+ return 0;
+
+ return 1;
+}
+
+static void free_handler(struct event_handler *handle)
+{
+ free((void *)handle->sys_name);
+ free((void *)handle->event_name);
+ free(handle);
+}
+
+static int find_event_handle(struct pevent *pevent, struct event_format *event)
+{
+ struct event_handler *handle, **next;
+
+ for (next = &pevent->handlers; *next;
+ next = &(*next)->next) {
+ handle = *next;
+ if (event_matches(event, handle->id,
+ handle->sys_name,
+ handle->event_name))
+ break;
+ }
+
+ if (!(*next))
+ return 0;
+
+ pr_stat("overriding event (%d) %s:%s with new print handler",
+ event->id, event->system, event->name);
+
+ event->handler = handle->func;
+ event->context = handle->context;
+
+ *next = handle->next;
+ free_handler(handle);
+
+ return 1;
+}
+
+/**
+ * __pevent_parse_format - parse the event format
+ * @buf: the buffer storing the event format string
+ * @size: the size of @buf
+ * @sys: the system the event belongs to
+ *
+ * This parses the event format and creates an event structure
+ * to quickly parse raw data for a given event.
+ *
+ * These files currently come from:
+ *
+ * /sys/kernel/debug/tracing/events/.../.../format
+ */
+enum pevent_errno __pevent_parse_format(struct event_format **eventp,
+ struct pevent *pevent, const char *buf,
+ unsigned long size, const char *sys)
+{
+ struct event_format *event;
+ int ret;
+
+ init_input_buf(buf, size);
+
+ *eventp = event = alloc_event();
+ if (!event)
+ return PEVENT_ERRNO__MEM_ALLOC_FAILED;
+
+ event->name = event_read_name();
+ if (!event->name) {
+ /* Bad event? */
+ ret = PEVENT_ERRNO__MEM_ALLOC_FAILED;
+ goto event_alloc_failed;
+ }
+
+ if (strcmp(sys, "ftrace") == 0) {
+ event->flags |= EVENT_FL_ISFTRACE;
+
+ if (strcmp(event->name, "bprint") == 0)
+ event->flags |= EVENT_FL_ISBPRINT;
+ }
+
+ event->id = event_read_id();
+ if (event->id < 0) {
+ ret = PEVENT_ERRNO__READ_ID_FAILED;
+ /*
+ * This isn't an allocation error actually.
+ * But as the ID is critical, just bail out.
+ */
+ goto event_alloc_failed;
+ }
+
+ event->system = strdup(sys);
+ if (!event->system) {
+ ret = PEVENT_ERRNO__MEM_ALLOC_FAILED;
+ goto event_alloc_failed;
+ }
+
+ /* Add pevent to event so that it can be referenced */
+ event->pevent = pevent;
+
+ ret = event_read_format(event);
+ if (ret < 0) {
+ ret = PEVENT_ERRNO__READ_FORMAT_FAILED;
+ goto event_parse_failed;
+ }
+
+ /*
+ * If the event has an override, don't print warnings if the event
+ * print format fails to parse.
+ */
+ if (pevent && find_event_handle(pevent, event))
+ show_warning = 0;
+
+ ret = event_read_print(event);
+ show_warning = 1;
+
+ if (ret < 0) {
+ ret = PEVENT_ERRNO__READ_PRINT_FAILED;
+ goto event_parse_failed;
+ }
+
+ if (!ret && (event->flags & EVENT_FL_ISFTRACE)) {
+ struct format_field *field;
+ struct print_arg *arg, **list;
+
+ /* old ftrace had no args */
+ list = &event->print_fmt.args;
+ for (field = event->format.fields; field; field = field->next) {
+ arg = alloc_arg();
+ if (!arg) {
+ event->flags |= EVENT_FL_FAILED;
+ return PEVENT_ERRNO__OLD_FTRACE_ARG_FAILED;
+ }
+ arg->type = PRINT_FIELD;
+ arg->field.name = strdup(field->name);
+ if (!arg->field.name) {
+ event->flags |= EVENT_FL_FAILED;
+ free_arg(arg);
+ return PEVENT_ERRNO__OLD_FTRACE_ARG_FAILED;
+ }
+ arg->field.field = field;
+ *list = arg;
+ list = &arg->next;
+ }
+ return 0;
+ }
+
+ return 0;
+
+ event_parse_failed:
+ event->flags |= EVENT_FL_FAILED;
+ return ret;
+
+ event_alloc_failed:
+ free(event->system);
+ free(event->name);
+ free(event);
+ *eventp = NULL;
+ return ret;
+}
+
+static enum pevent_errno
+__pevent_parse_event(struct pevent *pevent,
+ struct event_format **eventp,
+ const char *buf, unsigned long size,
+ const char *sys)
+{
+ int ret = __pevent_parse_format(eventp, pevent, buf, size, sys);
+ struct event_format *event = *eventp;
+
+ if (event == NULL)
+ return ret;
+
+ if (pevent && add_event(pevent, event)) {
+ ret = PEVENT_ERRNO__MEM_ALLOC_FAILED;
+ goto event_add_failed;
+ }
+
+#define PRINT_ARGS 0
+ if (PRINT_ARGS && event->print_fmt.args)
+ print_args(event->print_fmt.args);
+
+ return 0;
+
+event_add_failed:
+ pevent_free_format(event);
+ return ret;
+}
+
+/**
+ * pevent_parse_format - parse the event format
+ * @pevent: the handle to the pevent
+ * @eventp: returned format
+ * @buf: the buffer storing the event format string
+ * @size: the size of @buf
+ * @sys: the system the event belongs to
+ *
+ * This parses the event format and creates an event structure
+ * to quickly parse raw data for a given event.
+ *
+ * These files currently come from:
+ *
+ * /sys/kernel/debug/tracing/events/.../.../format
+ */
+enum pevent_errno pevent_parse_format(struct pevent *pevent,
+ struct event_format **eventp,
+ const char *buf,
+ unsigned long size, const char *sys)
+{
+ return __pevent_parse_event(pevent, eventp, buf, size, sys);
+}
+
+/**
+ * pevent_parse_event - parse the event format
+ * @pevent: the handle to the pevent
+ * @buf: the buffer storing the event format string
+ * @size: the size of @buf
+ * @sys: the system the event belongs to
+ *
+ * This parses the event format and creates an event structure
+ * to quickly parse raw data for a given event.
+ *
+ * These files currently come from:
+ *
+ * /sys/kernel/debug/tracing/events/.../.../format
+ */
+enum pevent_errno pevent_parse_event(struct pevent *pevent, const char *buf,
+ unsigned long size, const char *sys)
+{
+ struct event_format *event = NULL;
+ return __pevent_parse_event(pevent, &event, buf, size, sys);
+}
+
+#undef _PE
+#define _PE(code, str) str
+static const char * const pevent_error_str[] = {
+ PEVENT_ERRORS
+};
+#undef _PE
+
+int pevent_strerror(struct pevent *pevent __maybe_unused,
+ enum pevent_errno errnum, char *buf, size_t buflen)
+{
+ int idx;
+ const char *msg;
+
+ if (errnum >= 0) {
+ msg = strerror_r(errnum, buf, buflen);
+ if (msg != buf) {
+ size_t len = strlen(msg);
+ memcpy(buf, msg, min(buflen - 1, len));
+ *(buf + min(buflen - 1, len)) = '\0';
+ }
+ return 0;
+ }
+
+ if (errnum <= __PEVENT_ERRNO__START ||
+ errnum >= __PEVENT_ERRNO__END)
+ return -1;
+
+ idx = errnum - __PEVENT_ERRNO__START - 1;
+ msg = pevent_error_str[idx];
+ snprintf(buf, buflen, "%s", msg);
+
+ return 0;
+}
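
/*
 * Usage sketch (illustrative only): parse the contents of an event "format"
 * file and report failures through pevent_strerror().  Reading the file into
 * @buf is left to the caller; the "sched" system name is an assumption.
 */
#include <stdio.h>
#include "event-parse.h"

static int load_format(struct pevent *pevent,
		       const char *buf, unsigned long size)
{
	enum pevent_errno err;
	char msg[200];

	err = pevent_parse_event(pevent, buf, size, "sched");
	if (err) {
		pevent_strerror(pevent, err, msg, sizeof(msg));
		fprintf(stderr, "failed to parse event format: %s\n", msg);
		return -1;
	}

	return 0;
}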
+
+int get_field_val(struct trace_seq *s, struct format_field *field,
+ const char *name, struct pevent_record *record,
+ unsigned long long *val, int err)
+{
+ if (!field) {
+ if (err)
+ trace_seq_printf(s, "<CANT FIND FIELD %s>", name);
+ return -1;
+ }
+
+ if (pevent_read_number_field(field, record->data, val)) {
+ if (err)
+ trace_seq_printf(s, " %s=INVALID", name);
+ return -1;
+ }
+
+ return 0;
+}
+
+/**
+ * pevent_get_field_raw - return the raw pointer into the data field
+ * @s: The seq to print to on error
+ * @event: the event that the field is for
+ * @name: The name of the field
+ * @record: The record with the field name.
+ * @len: place to store the field length.
+ * @err: print default error if failed.
+ *
+ * Returns a pointer into record->data of the field and places
+ * the length of the field in @len.
+ *
+ * On failure, it returns NULL.
+ */
+void *pevent_get_field_raw(struct trace_seq *s, struct event_format *event,
+ const char *name, struct pevent_record *record,
+ int *len, int err)
+{
+ struct format_field *field;
+ void *data = record->data;
+ unsigned offset;
+ int dummy;
+
+ if (!event)
+ return NULL;
+
+ field = pevent_find_field(event, name);
+
+ if (!field) {
+ if (err)
+ trace_seq_printf(s, "<CANT FIND FIELD %s>", name);
+ return NULL;
+ }
+
+ /* Allow @len to be NULL */
+ if (!len)
+ len = &dummy;
+
+ offset = field->offset;
+ if (field->flags & FIELD_IS_DYNAMIC) {
+ offset = pevent_read_number(event->pevent,
+ data + offset, field->size);
+ *len = offset >> 16;
+ offset &= 0xffff;
+ } else
+ *len = field->size;
+
+ return data + offset;
+}
+
+/**
+ * pevent_get_field_val - find a field and return its value
+ * @s: The seq to print to on error
+ * @event: the event that the field is for
+ * @name: The name of the field
+ * @record: The record with the field name.
+ * @val: place to store the value of the field.
+ * @err: print default error if failed.
+ *
+ * Returns 0 on success, -1 if the field is not found.
+ */
+int pevent_get_field_val(struct trace_seq *s, struct event_format *event,
+ const char *name, struct pevent_record *record,
+ unsigned long long *val, int err)
+{
+ struct format_field *field;
+
+ if (!event)
+ return -1;
+
+ field = pevent_find_field(event, name);
+
+ return get_field_val(s, field, name, record, val, err);
+}
+
+/**
+ * pevent_get_common_field_val - find a common field and return its value
+ * @s: The seq to print to on error
+ * @event: the event that the field is for
+ * @name: The name of the field
+ * @record: The record with the field name.
+ * @val: place to store the value of the field.
+ * @err: print default error if failed.
+ *
+ * Returns 0 on success, -1 if the field is not found.
+ */
+int pevent_get_common_field_val(struct trace_seq *s, struct event_format *event,
+ const char *name, struct pevent_record *record,
+ unsigned long long *val, int err)
+{
+ struct format_field *field;
+
+ if (!event)
+ return -1;
+
+ field = pevent_find_common_field(event, name);
+
+ return get_field_val(s, field, name, record, val, err);
+}
+
+/**
+ * pevent_get_any_field_val - find any field and return its value
+ * @s: The seq to print to on error
+ * @event: the event that the field is for
+ * @name: The name of the field
+ * @record: The record with the field name.
+ * @val: place to store the value of the field.
+ * @err: print default error if failed.
+ *
+ * Returns 0 on success, -1 if the field is not found.
+ */
+int pevent_get_any_field_val(struct trace_seq *s, struct event_format *event,
+ const char *name, struct pevent_record *record,
+ unsigned long long *val, int err)
+{
+ struct format_field *field;
+
+ if (!event)
+ return -1;
+
+ field = pevent_find_any_field(event, name);
+
+ return get_field_val(s, field, name, record, val, err);
+}
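
/*
 * Usage sketch (illustrative only): pull a numeric field out of a record.
 * The "prev_pid" field name is an assumption; passing err=1 makes the
 * helper print its own error message into the trace_seq on failure.
 */
#include "event-parse.h"

static void show_prev_pid(struct event_format *event,
			  struct pevent_record *record)
{
	unsigned long long pid;
	struct trace_seq s;

	trace_seq_init(&s);
	if (pevent_get_field_val(&s, event, "prev_pid", record, &pid, 1) == 0)
		trace_seq_printf(&s, "prev_pid=%llu", pid);
	trace_seq_do_printf(&s);
	trace_seq_destroy(&s);
}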
+
+/**
+ * pevent_print_num_field - print a field and a format
+ * @s: The seq to print to
+ * @fmt: The printf format to print the field with.
+ * @event: the event that the field is for
+ * @name: The name of the field
+ * @record: The record with the field name.
+ * @err: print default error if failed.
+ *
+ * Returns: 0 on success, -1 if the field is not found, or 1 if the buffer is full.
+ */
+int pevent_print_num_field(struct trace_seq *s, const char *fmt,
+ struct event_format *event, const char *name,
+ struct pevent_record *record, int err)
+{
+ struct format_field *field = pevent_find_field(event, name);
+ unsigned long long val;
+
+ if (!field)
+ goto failed;
+
+ if (pevent_read_number_field(field, record->data, &val))
+ goto failed;
+
+ return trace_seq_printf(s, fmt, val);
+
+ failed:
+ if (err)
+ trace_seq_printf(s, "CAN'T FIND FIELD \"%s\"", name);
+ return -1;
+}
+
+/**
+ * pevent_print_func_field - print a field and a format for function pointers
+ * @s: The seq to print to
+ * @fmt: The printf format to print the field with.
+ * @event: the event that the field is for
+ * @name: The name of the field
+ * @record: The record with the field name.
+ * @err: print default error if failed.
+ *
+ * Returns: 0 on success, -1 if the field is not found, or 1 if the buffer is full.
+ */
+int pevent_print_func_field(struct trace_seq *s, const char *fmt,
+ struct event_format *event, const char *name,
+ struct pevent_record *record, int err)
+{
+ struct format_field *field = pevent_find_field(event, name);
+ struct pevent *pevent = event->pevent;
+ unsigned long long val;
+ struct func_map *func;
+ char tmp[128];
+
+ if (!field)
+ goto failed;
+
+ if (pevent_read_number_field(field, record->data, &val))
+ goto failed;
+
+ func = find_func(pevent, val);
+
+ if (func)
+ snprintf(tmp, 128, "%s/0x%llx", func->func, func->addr - val);
+ else
+ sprintf(tmp, "0x%08llx", val);
+
+ return trace_seq_printf(s, fmt, tmp);
+
+ failed:
+ if (err)
+ trace_seq_printf(s, "CAN'T FIND FIELD \"%s\"", name);
+ return -1;
+}
+
+static void free_func_handle(struct pevent_function_handler *func)
+{
+ struct pevent_func_params *params;
+
+ free(func->name);
+
+ while (func->params) {
+ params = func->params;
+ func->params = params->next;
+ free(params);
+ }
+
+ free(func);
+}
+
+/**
+ * pevent_register_print_function - register a helper function
+ * @pevent: the handle to the pevent
+ * @func: the function to process the helper function
+ * @ret_type: the return type of the helper function
+ * @name: the name of the helper function
+ * @parameters: A list of enum pevent_func_arg_type
+ *
+ * Some events may have helper functions in the print format arguments.
+ * This allows a plugin to dynamically create a way to process one
+ * of these functions.
+ *
+ * @parameters is a variable-length list of pevent_func_arg_type enums that
+ * must end with PEVENT_FUNC_ARG_VOID.
+ */
+int pevent_register_print_function(struct pevent *pevent,
+ pevent_func_handler func,
+ enum pevent_func_arg_type ret_type,
+ char *name, ...)
+{
+ struct pevent_function_handler *func_handle;
+ struct pevent_func_params **next_param;
+ struct pevent_func_params *param;
+ enum pevent_func_arg_type type;
+ va_list ap;
+ int ret;
+
+ func_handle = find_func_handler(pevent, name);
+ if (func_handle) {
+ /*
+		 * This is most likely caused by the user's own
+ * plugins updating the function. This overrides the
+ * system defaults.
+ */
+ pr_stat("override of function helper '%s'", name);
+ remove_func_handler(pevent, name);
+ }
+
+ func_handle = calloc(1, sizeof(*func_handle));
+ if (!func_handle) {
+ do_warning("Failed to allocate function handler");
+ return PEVENT_ERRNO__MEM_ALLOC_FAILED;
+ }
+
+ func_handle->ret_type = ret_type;
+ func_handle->name = strdup(name);
+ func_handle->func = func;
+ if (!func_handle->name) {
+ do_warning("Failed to allocate function name");
+ free(func_handle);
+ return PEVENT_ERRNO__MEM_ALLOC_FAILED;
+ }
+
+ next_param = &(func_handle->params);
+ va_start(ap, name);
+ for (;;) {
+ type = va_arg(ap, enum pevent_func_arg_type);
+ if (type == PEVENT_FUNC_ARG_VOID)
+ break;
+
+ if (type >= PEVENT_FUNC_ARG_MAX_TYPES) {
+ do_warning("Invalid argument type %d", type);
+ ret = PEVENT_ERRNO__INVALID_ARG_TYPE;
+ goto out_free;
+ }
+
+ param = malloc(sizeof(*param));
+ if (!param) {
+ do_warning("Failed to allocate function param");
+ ret = PEVENT_ERRNO__MEM_ALLOC_FAILED;
+ goto out_free;
+ }
+ param->type = type;
+ param->next = NULL;
+
+ *next_param = param;
+ next_param = &(param->next);
+
+ func_handle->nr_args++;
+ }
+ va_end(ap);
+
+ func_handle->next = pevent->func_handlers;
+ pevent->func_handlers = func_handle;
+
+ return 0;
+ out_free:
+ va_end(ap);
+ free_func_handle(func_handle);
+ return ret;
+}
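
/*
 * Usage sketch (illustrative only): register a print helper the way a
 * plugin would.  The "my_hex" helper and its behaviour are invented for the
 * example; it appends its first argument as hex and returns it.
 */
#include "event-parse.h"

static unsigned long long my_hex_handler(struct trace_seq *s,
					 unsigned long long *args)
{
	trace_seq_printf(s, "0x%llx", args[0]);
	return args[0];
}

static int register_my_hex(struct pevent *pevent)
{
	return pevent_register_print_function(pevent, my_hex_handler,
					      PEVENT_FUNC_ARG_LONG,
					      "my_hex",
					      PEVENT_FUNC_ARG_LONG,
					      PEVENT_FUNC_ARG_VOID);
}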
+
+/**
+ * pevent_unregister_print_function - unregister a helper function
+ * @pevent: the handle to the pevent
+ * @func: the function to process the helper function
+ * @name: the name of the helper function
+ *
+ * This function removes the existing print handler for function @name.
+ *
+ * Returns 0 if the handler was removed successfully, -1 otherwise.
+ */
+int pevent_unregister_print_function(struct pevent *pevent,
+ pevent_func_handler func, char *name)
+{
+ struct pevent_function_handler *func_handle;
+
+ func_handle = find_func_handler(pevent, name);
+ if (func_handle && func_handle->func == func) {
+ remove_func_handler(pevent, name);
+ return 0;
+ }
+ return -1;
+}
+
+static struct event_format *pevent_search_event(struct pevent *pevent, int id,
+ const char *sys_name,
+ const char *event_name)
+{
+ struct event_format *event;
+
+ if (id >= 0) {
+ /* search by id */
+ event = pevent_find_event(pevent, id);
+ if (!event)
+ return NULL;
+ if (event_name && (strcmp(event_name, event->name) != 0))
+ return NULL;
+ if (sys_name && (strcmp(sys_name, event->system) != 0))
+ return NULL;
+ } else {
+ event = pevent_find_event_by_name(pevent, sys_name, event_name);
+ if (!event)
+ return NULL;
+ }
+ return event;
+}
+
+/**
+ * pevent_register_event_handler - register a way to parse an event
+ * @pevent: the handle to the pevent
+ * @id: the id of the event to register
+ * @sys_name: the system name the event belongs to
+ * @event_name: the name of the event
+ * @func: the function to call to parse the event information
+ * @context: the data to be passed to @func
+ *
+ * This function allows a developer to override the parsing of
+ * a given event. If for some reason the default print format
+ * is not sufficient, this function will register a function
+ * for an event to be used to parse the data instead.
+ *
+ * If @id is >= 0, then it is used to find the event;
+ * otherwise @sys_name and @event_name are used.
+ */
+int pevent_register_event_handler(struct pevent *pevent, int id,
+ const char *sys_name, const char *event_name,
+ pevent_event_handler_func func, void *context)
+{
+ struct event_format *event;
+ struct event_handler *handle;
+
+ event = pevent_search_event(pevent, id, sys_name, event_name);
+ if (event == NULL)
+ goto not_found;
+
+ pr_stat("overriding event (%d) %s:%s with new print handler",
+ event->id, event->system, event->name);
+
+ event->handler = func;
+ event->context = context;
+ return 0;
+
+ not_found:
+ /* Save for later use. */
+ handle = calloc(1, sizeof(*handle));
+ if (!handle) {
+ do_warning("Failed to allocate event handler");
+ return PEVENT_ERRNO__MEM_ALLOC_FAILED;
+ }
+
+ handle->id = id;
+ if (event_name)
+ handle->event_name = strdup(event_name);
+ if (sys_name)
+ handle->sys_name = strdup(sys_name);
+
+ if ((event_name && !handle->event_name) ||
+ (sys_name && !handle->sys_name)) {
+ do_warning("Failed to allocate event/sys name");
+ free((void *)handle->event_name);
+ free((void *)handle->sys_name);
+ free(handle);
+ return PEVENT_ERRNO__MEM_ALLOC_FAILED;
+ }
+
+ handle->func = func;
+ handle->next = pevent->handlers;
+ pevent->handlers = handle;
+ handle->context = context;
+
+ return -1;
+}
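
/*
 * Usage sketch (illustrative only): override the output of one event with a
 * private handler.  The system, event and field names are assumptions.
 */
#include "event-parse.h"

static int my_sched_switch_handler(struct trace_seq *s,
				   struct pevent_record *record,
				   struct event_format *event,
				   void *context)
{
	unsigned long long next_pid;

	if (pevent_get_field_val(s, event, "next_pid", record, &next_pid, 1))
		return -1;

	return trace_seq_printf(s, "switching to %llu", next_pid);
}

static void install_handler(struct pevent *pevent)
{
	pevent_register_event_handler(pevent, -1, "sched", "sched_switch",
				      my_sched_switch_handler, NULL);
}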
+
+static int handle_matches(struct event_handler *handler, int id,
+ const char *sys_name, const char *event_name,
+ pevent_event_handler_func func, void *context)
+{
+ if (id >= 0 && id != handler->id)
+ return 0;
+
+ if (event_name && (strcmp(event_name, handler->event_name) != 0))
+ return 0;
+
+ if (sys_name && (strcmp(sys_name, handler->sys_name) != 0))
+ return 0;
+
+ if (func != handler->func || context != handler->context)
+ return 0;
+
+ return 1;
+}
+
+/**
+ * pevent_unregister_event_handler - unregister an existing event handler
+ * @pevent: the handle to the pevent
+ * @id: the id of the event to unregister
+ * @sys_name: the system name the handler belongs to
+ * @event_name: the name of the event handler
+ * @func: the function to call to parse the event information
+ * @context: the data to be passed to @func
+ *
+ * This function removes an existing event handler (parser).
+ *
+ * If @id is >= 0, then it is used to find the event;
+ * otherwise @sys_name and @event_name are used.
+ *
+ * Returns 0 if the handler was removed successfully, -1 if the event was not found.
+ */
+int pevent_unregister_event_handler(struct pevent *pevent, int id,
+ const char *sys_name, const char *event_name,
+ pevent_event_handler_func func, void *context)
+{
+ struct event_format *event;
+ struct event_handler *handle;
+ struct event_handler **next;
+
+ event = pevent_search_event(pevent, id, sys_name, event_name);
+ if (event == NULL)
+ goto not_found;
+
+ if (event->handler == func && event->context == context) {
+ pr_stat("removing override handler for event (%d) %s:%s. Going back to default handler.",
+ event->id, event->system, event->name);
+
+ event->handler = NULL;
+ event->context = NULL;
+ return 0;
+ }
+
+not_found:
+ for (next = &pevent->handlers; *next; next = &(*next)->next) {
+ handle = *next;
+ if (handle_matches(handle, id, sys_name, event_name,
+ func, context))
+ break;
+ }
+
+ if (!(*next))
+ return -1;
+
+ *next = handle->next;
+ free_handler(handle);
+
+ return 0;
+}
+
+/**
+ * pevent_alloc - create a pevent handle
+ */
+struct pevent *pevent_alloc(void)
+{
+ struct pevent *pevent = calloc(1, sizeof(*pevent));
+
+ if (pevent)
+ pevent->ref_count = 1;
+
+ return pevent;
+}
+
+void pevent_ref(struct pevent *pevent)
+{
+ pevent->ref_count++;
+}
+
+static void free_format_fields(struct format_field *field)
+{
+ struct format_field *next;
+
+ while (field) {
+ next = field->next;
+ free(field->type);
+ free(field->name);
+ free(field);
+ field = next;
+ }
+}
+
+static void free_formats(struct format *format)
+{
+ free_format_fields(format->common_fields);
+ free_format_fields(format->fields);
+}
+
+void pevent_free_format(struct event_format *event)
+{
+ free(event->name);
+ free(event->system);
+
+ free_formats(&event->format);
+
+ free(event->print_fmt.format);
+ free_args(event->print_fmt.args);
+
+ free(event);
+}
+
+/**
+ * pevent_free - free a pevent handle
+ * @pevent: the pevent handle to free
+ */
+void pevent_free(struct pevent *pevent)
+{
+ struct cmdline_list *cmdlist, *cmdnext;
+ struct func_list *funclist, *funcnext;
+ struct printk_list *printklist, *printknext;
+ struct pevent_function_handler *func_handler;
+ struct event_handler *handle;
+ int i;
+
+ if (!pevent)
+ return;
+
+ cmdlist = pevent->cmdlist;
+ funclist = pevent->funclist;
+ printklist = pevent->printklist;
+
+ pevent->ref_count--;
+ if (pevent->ref_count)
+ return;
+
+ if (pevent->cmdlines) {
+ for (i = 0; i < pevent->cmdline_count; i++)
+ free(pevent->cmdlines[i].comm);
+ free(pevent->cmdlines);
+ }
+
+ while (cmdlist) {
+ cmdnext = cmdlist->next;
+ free(cmdlist->comm);
+ free(cmdlist);
+ cmdlist = cmdnext;
+ }
+
+ if (pevent->func_map) {
+ for (i = 0; i < (int)pevent->func_count; i++) {
+ free(pevent->func_map[i].func);
+ free(pevent->func_map[i].mod);
+ }
+ free(pevent->func_map);
+ }
+
+ while (funclist) {
+ funcnext = funclist->next;
+ free(funclist->func);
+ free(funclist->mod);
+ free(funclist);
+ funclist = funcnext;
+ }
+
+ while (pevent->func_handlers) {
+ func_handler = pevent->func_handlers;
+ pevent->func_handlers = func_handler->next;
+ free_func_handle(func_handler);
+ }
+
+ if (pevent->printk_map) {
+ for (i = 0; i < (int)pevent->printk_count; i++)
+ free(pevent->printk_map[i].printk);
+ free(pevent->printk_map);
+ }
+
+ while (printklist) {
+ printknext = printklist->next;
+ free(printklist->printk);
+ free(printklist);
+ printklist = printknext;
+ }
+
+ for (i = 0; i < pevent->nr_events; i++)
+ pevent_free_format(pevent->events[i]);
+
+ while (pevent->handlers) {
+ handle = pevent->handlers;
+ pevent->handlers = handle->next;
+ free_handler(handle);
+ }
+
+ free(pevent->events);
+ free(pevent->sort_events);
+
+ free(pevent);
+}
+
+void pevent_unref(struct pevent *pevent)
+{
+ pevent_free(pevent);
+}
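
/*
 * Usage sketch (illustrative only): typical handle lifecycle.  The
 * little-endian file assumption and the long size are placeholders; a real
 * caller derives them from the trace data it reads.
 */
#include "event-parse.h"

static struct pevent *make_handle(void)
{
	struct pevent *pevent = pevent_alloc();

	if (!pevent)
		return NULL;

	pevent_set_host_bigendian(pevent, traceevent_host_bigendian());
	pevent_set_file_bigendian(pevent, 0);	/* assume little-endian data */
	pevent_set_long_size(pevent, sizeof(long));

	return pevent;	/* release with pevent_free() or pevent_unref() */
}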
diff --git a/tools/lib/traceevent/event-parse.h b/tools/lib/traceevent/event-parse.h
new file mode 100644
index 00000000000..7a3873ff9a4
--- /dev/null
+++ b/tools/lib/traceevent/event-parse.h
@@ -0,0 +1,944 @@
+/*
+ * Copyright (C) 2009, 2010 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License (not later!)
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+#ifndef _PARSE_EVENTS_H
+#define _PARSE_EVENTS_H
+
+#include <stdbool.h>
+#include <stdarg.h>
+#include <regex.h>
+#include <string.h>
+
+#ifndef __maybe_unused
+#define __maybe_unused __attribute__((unused))
+#endif
+
+/* ----------------------- trace_seq ----------------------- */
+
+
+#ifndef TRACE_SEQ_BUF_SIZE
+#define TRACE_SEQ_BUF_SIZE 4096
+#endif
+
+#ifndef DEBUG_RECORD
+#define DEBUG_RECORD 0
+#endif
+
+struct pevent_record {
+ unsigned long long ts;
+ unsigned long long offset;
+ long long missed_events; /* buffer dropped events before */
+ int record_size; /* size of binary record */
+ int size; /* size of data */
+ void *data;
+ int cpu;
+ int ref_count;
+ int locked; /* Do not free, even if ref_count is zero */
+ void *priv;
+#if DEBUG_RECORD
+ struct pevent_record *prev;
+ struct pevent_record *next;
+ long alloc_addr;
+#endif
+};
+
+enum trace_seq_fail {
+ TRACE_SEQ__GOOD,
+ TRACE_SEQ__BUFFER_POISONED,
+ TRACE_SEQ__MEM_ALLOC_FAILED,
+};
+
+/*
+ * Trace sequences are used to allow a function to call several other functions
+ * to create a string of data to use (up to a max of PAGE_SIZE).
+ */
+
+struct trace_seq {
+ char *buffer;
+ unsigned int buffer_size;
+ unsigned int len;
+ unsigned int readpos;
+ enum trace_seq_fail state;
+};
+
+void trace_seq_init(struct trace_seq *s);
+void trace_seq_reset(struct trace_seq *s);
+void trace_seq_destroy(struct trace_seq *s);
+
+extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
+ __attribute__ ((format (printf, 2, 3)));
+extern int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
+ __attribute__ ((format (printf, 2, 0)));
+
+extern int trace_seq_puts(struct trace_seq *s, const char *str);
+extern int trace_seq_putc(struct trace_seq *s, unsigned char c);
+
+extern void trace_seq_terminate(struct trace_seq *s);
+
+extern int trace_seq_do_printf(struct trace_seq *s);
+
+
+/* ----------------------- pevent ----------------------- */
+
+struct pevent;
+struct event_format;
+
+typedef int (*pevent_event_handler_func)(struct trace_seq *s,
+ struct pevent_record *record,
+ struct event_format *event,
+ void *context);
+
+typedef int (*pevent_plugin_load_func)(struct pevent *pevent);
+typedef int (*pevent_plugin_unload_func)(struct pevent *pevent);
+
+struct pevent_plugin_option {
+ struct pevent_plugin_option *next;
+ void *handle;
+ char *file;
+ char *name;
+ char *plugin_alias;
+ char *description;
+ char *value;
+ void *priv;
+ int set;
+};
+
+/*
+ * Plugin hooks that can be called:
+ *
+ * PEVENT_PLUGIN_LOADER: (required)
+ *   The function name to initialize the plugin.
+ *
+ * int PEVENT_PLUGIN_LOADER(struct pevent *pevent)
+ *
+ * PEVENT_PLUGIN_UNLOADER: (optional)
+ * The function called just before unloading
+ *
+ * int PEVENT_PLUGIN_UNLOADER(struct pevent *pevent)
+ *
+ * PEVENT_PLUGIN_OPTIONS: (optional)
+ * Plugin options that can be set before loading
+ *
+ * struct pevent_plugin_option PEVENT_PLUGIN_OPTIONS[] = {
+ * {
+ * .name = "option-name",
+ *		.plugin_alias = "override-file-name", (optional)
+ * .description = "description of option to show users",
+ * },
+ * {
+ * .name = NULL,
+ * },
+ * };
+ *
+ * Array must end with .name = NULL;
+ *
+ *
+ * .plugin_alias is used to give a shorter name to access
+ * the variable. Useful if a plugin handles more than one event.
+ *
+ * PEVENT_PLUGIN_ALIAS: (optional)
+ * The name to use for finding options (uses filename if not defined)
+ */
+#define PEVENT_PLUGIN_LOADER pevent_plugin_loader
+#define PEVENT_PLUGIN_UNLOADER pevent_plugin_unloader
+#define PEVENT_PLUGIN_OPTIONS pevent_plugin_options
+#define PEVENT_PLUGIN_ALIAS pevent_plugin_alias
+#define _MAKE_STR(x) #x
+#define MAKE_STR(x) _MAKE_STR(x)
+#define PEVENT_PLUGIN_LOADER_NAME MAKE_STR(PEVENT_PLUGIN_LOADER)
+#define PEVENT_PLUGIN_UNLOADER_NAME MAKE_STR(PEVENT_PLUGIN_UNLOADER)
+#define PEVENT_PLUGIN_OPTIONS_NAME MAKE_STR(PEVENT_PLUGIN_OPTIONS)
+#define PEVENT_PLUGIN_ALIAS_NAME MAKE_STR(PEVENT_PLUGIN_ALIAS)
+
+#define NSECS_PER_SEC 1000000000ULL
+#define NSECS_PER_USEC 1000ULL
+
+enum format_flags {
+ FIELD_IS_ARRAY = 1,
+ FIELD_IS_POINTER = 2,
+ FIELD_IS_SIGNED = 4,
+ FIELD_IS_STRING = 8,
+ FIELD_IS_DYNAMIC = 16,
+ FIELD_IS_LONG = 32,
+ FIELD_IS_FLAG = 64,
+ FIELD_IS_SYMBOLIC = 128,
+};
+
+struct format_field {
+ struct format_field *next;
+ struct event_format *event;
+ char *type;
+ char *name;
+ int offset;
+ int size;
+ unsigned int arraylen;
+ unsigned int elementsize;
+ unsigned long flags;
+};
+
+struct format {
+ int nr_common;
+ int nr_fields;
+ struct format_field *common_fields;
+ struct format_field *fields;
+};
+
+struct print_arg_atom {
+ char *atom;
+};
+
+struct print_arg_string {
+ char *string;
+ int offset;
+};
+
+struct print_arg_bitmask {
+ char *bitmask;
+ int offset;
+};
+
+struct print_arg_field {
+ char *name;
+ struct format_field *field;
+};
+
+struct print_flag_sym {
+ struct print_flag_sym *next;
+ char *value;
+ char *str;
+};
+
+struct print_arg_typecast {
+ char *type;
+ struct print_arg *item;
+};
+
+struct print_arg_flags {
+ struct print_arg *field;
+ char *delim;
+ struct print_flag_sym *flags;
+};
+
+struct print_arg_symbol {
+ struct print_arg *field;
+ struct print_flag_sym *symbols;
+};
+
+struct print_arg_hex {
+ struct print_arg *field;
+ struct print_arg *size;
+};
+
+struct print_arg_dynarray {
+ struct format_field *field;
+ struct print_arg *index;
+};
+
+struct print_arg;
+
+struct print_arg_op {
+ char *op;
+ int prio;
+ struct print_arg *left;
+ struct print_arg *right;
+};
+
+struct pevent_function_handler;
+
+struct print_arg_func {
+ struct pevent_function_handler *func;
+ struct print_arg *args;
+};
+
+enum print_arg_type {
+ PRINT_NULL,
+ PRINT_ATOM,
+ PRINT_FIELD,
+ PRINT_FLAGS,
+ PRINT_SYMBOL,
+ PRINT_HEX,
+ PRINT_TYPE,
+ PRINT_STRING,
+ PRINT_BSTRING,
+ PRINT_DYNAMIC_ARRAY,
+ PRINT_OP,
+ PRINT_FUNC,
+ PRINT_BITMASK,
+};
+
+struct print_arg {
+ struct print_arg *next;
+ enum print_arg_type type;
+ union {
+ struct print_arg_atom atom;
+ struct print_arg_field field;
+ struct print_arg_typecast typecast;
+ struct print_arg_flags flags;
+ struct print_arg_symbol symbol;
+ struct print_arg_hex hex;
+ struct print_arg_func func;
+ struct print_arg_string string;
+ struct print_arg_bitmask bitmask;
+ struct print_arg_op op;
+ struct print_arg_dynarray dynarray;
+ };
+};
+
+struct print_fmt {
+ char *format;
+ struct print_arg *args;
+};
+
+struct event_format {
+ struct pevent *pevent;
+ char *name;
+ int id;
+ int flags;
+ struct format format;
+ struct print_fmt print_fmt;
+ char *system;
+ pevent_event_handler_func handler;
+ void *context;
+};
+
+enum {
+ EVENT_FL_ISFTRACE = 0x01,
+ EVENT_FL_ISPRINT = 0x02,
+ EVENT_FL_ISBPRINT = 0x04,
+ EVENT_FL_ISFUNCENT = 0x10,
+ EVENT_FL_ISFUNCRET = 0x20,
+ EVENT_FL_NOHANDLE = 0x40,
+ EVENT_FL_PRINTRAW = 0x80,
+
+ EVENT_FL_FAILED = 0x80000000
+};
+
+enum event_sort_type {
+ EVENT_SORT_ID,
+ EVENT_SORT_NAME,
+ EVENT_SORT_SYSTEM,
+};
+
+enum event_type {
+ EVENT_ERROR,
+ EVENT_NONE,
+ EVENT_SPACE,
+ EVENT_NEWLINE,
+ EVENT_OP,
+ EVENT_DELIM,
+ EVENT_ITEM,
+ EVENT_DQUOTE,
+ EVENT_SQUOTE,
+};
+
+typedef unsigned long long (*pevent_func_handler)(struct trace_seq *s,
+ unsigned long long *args);
+
+enum pevent_func_arg_type {
+ PEVENT_FUNC_ARG_VOID,
+ PEVENT_FUNC_ARG_INT,
+ PEVENT_FUNC_ARG_LONG,
+ PEVENT_FUNC_ARG_STRING,
+ PEVENT_FUNC_ARG_PTR,
+ PEVENT_FUNC_ARG_MAX_TYPES
+};
+
+enum pevent_flag {
+ PEVENT_NSEC_OUTPUT = 1, /* output in NSECS */
+ PEVENT_DISABLE_SYS_PLUGINS = 1 << 1,
+ PEVENT_DISABLE_PLUGINS = 1 << 2,
+};
+
+#define PEVENT_ERRORS \
+ _PE(MEM_ALLOC_FAILED, "failed to allocate memory"), \
+ _PE(PARSE_EVENT_FAILED, "failed to parse event"), \
+ _PE(READ_ID_FAILED, "failed to read event id"), \
+ _PE(READ_FORMAT_FAILED, "failed to read event format"), \
+ _PE(READ_PRINT_FAILED, "failed to read event print fmt"), \
+ _PE(OLD_FTRACE_ARG_FAILED,"failed to allocate field name for ftrace"),\
+ _PE(INVALID_ARG_TYPE, "invalid argument type"), \
+ _PE(INVALID_EXP_TYPE, "invalid expression type"), \
+ _PE(INVALID_OP_TYPE, "invalid operator type"), \
+ _PE(INVALID_EVENT_NAME, "invalid event name"), \
+ _PE(EVENT_NOT_FOUND, "no event found"), \
+ _PE(SYNTAX_ERROR, "syntax error"), \
+ _PE(ILLEGAL_RVALUE, "illegal rvalue"), \
+ _PE(ILLEGAL_LVALUE, "illegal lvalue for string comparison"), \
+ _PE(INVALID_REGEX, "regex did not compute"), \
+ _PE(ILLEGAL_STRING_CMP, "illegal comparison for string"), \
+ _PE(ILLEGAL_INTEGER_CMP,"illegal comparison for integer"), \
+ _PE(REPARENT_NOT_OP, "cannot reparent other than OP"), \
+ _PE(REPARENT_FAILED, "failed to reparent filter OP"), \
+ _PE(BAD_FILTER_ARG, "bad arg in filter tree"), \
+ _PE(UNEXPECTED_TYPE, "unexpected type (not a value)"), \
+ _PE(ILLEGAL_TOKEN, "illegal token"), \
+ _PE(INVALID_PAREN, "open parenthesis cannot come here"), \
+ _PE(UNBALANCED_PAREN, "unbalanced number of parenthesis"), \
+ _PE(UNKNOWN_TOKEN, "unknown token"), \
+ _PE(FILTER_NOT_FOUND, "no filter found"), \
+ _PE(NOT_A_NUMBER, "must have number field"), \
+ _PE(NO_FILTER, "no filters exists"), \
+ _PE(FILTER_MISS, "record does not match to filter")
+
+#undef _PE
+#define _PE(__code, __str) PEVENT_ERRNO__ ## __code
+enum pevent_errno {
+ PEVENT_ERRNO__SUCCESS = 0,
+ PEVENT_ERRNO__FILTER_MATCH = PEVENT_ERRNO__SUCCESS,
+
+ /*
+ * Choose an arbitrary negative big number not to clash with standard
+ * errno since SUS requires the errno has distinct positive values.
+ * See 'Issue 6' in the link below.
+ *
+ * http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/errno.h.html
+ */
+ __PEVENT_ERRNO__START = -100000,
+
+ PEVENT_ERRORS,
+
+ __PEVENT_ERRNO__END,
+};
+#undef _PE
+
+struct plugin_list;
+
+#define INVALID_PLUGIN_LIST_OPTION ((char **)((unsigned long)-1))
+
+struct plugin_list *traceevent_load_plugins(struct pevent *pevent);
+void traceevent_unload_plugins(struct plugin_list *plugin_list,
+ struct pevent *pevent);
+char **traceevent_plugin_list_options(void);
+void traceevent_plugin_free_options_list(char **list);
+int traceevent_plugin_add_options(const char *name,
+ struct pevent_plugin_option *options);
+void traceevent_plugin_remove_options(struct pevent_plugin_option *options);
+void traceevent_print_plugins(struct trace_seq *s,
+ const char *prefix, const char *suffix,
+ const struct plugin_list *list);
+
+struct cmdline;
+struct cmdline_list;
+struct func_map;
+struct func_list;
+struct event_handler;
+
+struct pevent {
+ int ref_count;
+
+ int header_page_ts_offset;
+ int header_page_ts_size;
+ int header_page_size_offset;
+ int header_page_size_size;
+ int header_page_data_offset;
+ int header_page_data_size;
+ int header_page_overwrite;
+
+ int file_bigendian;
+ int host_bigendian;
+
+ int latency_format;
+
+ int old_format;
+
+ int cpus;
+ int long_size;
+ int page_size;
+
+ struct cmdline *cmdlines;
+ struct cmdline_list *cmdlist;
+ int cmdline_count;
+
+ struct func_map *func_map;
+ struct func_list *funclist;
+ unsigned int func_count;
+
+ struct printk_map *printk_map;
+ struct printk_list *printklist;
+ unsigned int printk_count;
+
+
+ struct event_format **events;
+ int nr_events;
+ struct event_format **sort_events;
+ enum event_sort_type last_type;
+
+ int type_offset;
+ int type_size;
+
+ int pid_offset;
+ int pid_size;
+
+ int pc_offset;
+ int pc_size;
+
+ int flags_offset;
+ int flags_size;
+
+ int ld_offset;
+ int ld_size;
+
+ int print_raw;
+
+ int test_filters;
+
+ int flags;
+
+ struct format_field *bprint_ip_field;
+ struct format_field *bprint_fmt_field;
+ struct format_field *bprint_buf_field;
+
+ struct event_handler *handlers;
+ struct pevent_function_handler *func_handlers;
+
+ /* cache */
+ struct event_format *last_event;
+
+ char *trace_clock;
+};
+
+static inline void pevent_set_flag(struct pevent *pevent, int flag)
+{
+ pevent->flags |= flag;
+}
+
+static inline unsigned short
+__data2host2(struct pevent *pevent, unsigned short data)
+{
+ unsigned short swap;
+
+ if (pevent->host_bigendian == pevent->file_bigendian)
+ return data;
+
+ swap = ((data & 0xffULL) << 8) |
+ ((data & (0xffULL << 8)) >> 8);
+
+ return swap;
+}
+
+static inline unsigned int
+__data2host4(struct pevent *pevent, unsigned int data)
+{
+ unsigned int swap;
+
+ if (pevent->host_bigendian == pevent->file_bigendian)
+ return data;
+
+ swap = ((data & 0xffULL) << 24) |
+ ((data & (0xffULL << 8)) << 8) |
+ ((data & (0xffULL << 16)) >> 8) |
+ ((data & (0xffULL << 24)) >> 24);
+
+ return swap;
+}
+
+static inline unsigned long long
+__data2host8(struct pevent *pevent, unsigned long long data)
+{
+ unsigned long long swap;
+
+ if (pevent->host_bigendian == pevent->file_bigendian)
+ return data;
+
+ swap = ((data & 0xffULL) << 56) |
+ ((data & (0xffULL << 8)) << 40) |
+ ((data & (0xffULL << 16)) << 24) |
+ ((data & (0xffULL << 24)) << 8) |
+ ((data & (0xffULL << 32)) >> 8) |
+ ((data & (0xffULL << 40)) >> 24) |
+ ((data & (0xffULL << 48)) >> 40) |
+ ((data & (0xffULL << 56)) >> 56);
+
+ return swap;
+}
+
+#define data2host2(pevent, ptr) __data2host2(pevent, *(unsigned short *)(ptr))
+#define data2host4(pevent, ptr) __data2host4(pevent, *(unsigned int *)(ptr))
+#define data2host8(pevent, ptr) \
+({ \
+ unsigned long long __val; \
+ \
+ memcpy(&__val, (ptr), sizeof(unsigned long long)); \
+ __data2host8(pevent, __val); \
+})
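
/*
 * Worked example (illustrative only): when the trace file and the host
 * differ in endianness, __data2host4(pevent, 0x01020304) returns
 * 0x04030201; when they match, the value passes through unchanged.
 */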
+
+static inline int traceevent_host_bigendian(void)
+{
+ unsigned char str[] = { 0x1, 0x2, 0x3, 0x4 };
+ unsigned int val;
+
+ memcpy(&val, str, 4);
+ return val == 0x01020304;
+}
+
+/* taken from kernel/trace/trace.h */
+enum trace_flag_type {
+ TRACE_FLAG_IRQS_OFF = 0x01,
+ TRACE_FLAG_IRQS_NOSUPPORT = 0x02,
+ TRACE_FLAG_NEED_RESCHED = 0x04,
+ TRACE_FLAG_HARDIRQ = 0x08,
+ TRACE_FLAG_SOFTIRQ = 0x10,
+};
+
+int pevent_register_comm(struct pevent *pevent, const char *comm, int pid);
+void pevent_register_trace_clock(struct pevent *pevent, char *trace_clock);
+int pevent_register_function(struct pevent *pevent, char *name,
+ unsigned long long addr, char *mod);
+int pevent_register_print_string(struct pevent *pevent, const char *fmt,
+ unsigned long long addr);
+int pevent_pid_is_registered(struct pevent *pevent, int pid);
+
+void pevent_print_event(struct pevent *pevent, struct trace_seq *s,
+ struct pevent_record *record, bool use_trace_clock);
+
+int pevent_parse_header_page(struct pevent *pevent, char *buf, unsigned long size,
+ int long_size);
+
+enum pevent_errno pevent_parse_event(struct pevent *pevent, const char *buf,
+ unsigned long size, const char *sys);
+enum pevent_errno pevent_parse_format(struct pevent *pevent,
+ struct event_format **eventp,
+ const char *buf,
+ unsigned long size, const char *sys);
+void pevent_free_format(struct event_format *event);
+
+void *pevent_get_field_raw(struct trace_seq *s, struct event_format *event,
+ const char *name, struct pevent_record *record,
+ int *len, int err);
+
+int pevent_get_field_val(struct trace_seq *s, struct event_format *event,
+ const char *name, struct pevent_record *record,
+ unsigned long long *val, int err);
+int pevent_get_common_field_val(struct trace_seq *s, struct event_format *event,
+ const char *name, struct pevent_record *record,
+ unsigned long long *val, int err);
+int pevent_get_any_field_val(struct trace_seq *s, struct event_format *event,
+ const char *name, struct pevent_record *record,
+ unsigned long long *val, int err);
+
+int pevent_print_num_field(struct trace_seq *s, const char *fmt,
+ struct event_format *event, const char *name,
+ struct pevent_record *record, int err);
+
+int pevent_print_func_field(struct trace_seq *s, const char *fmt,
+ struct event_format *event, const char *name,
+ struct pevent_record *record, int err);
+
+int pevent_register_event_handler(struct pevent *pevent, int id,
+ const char *sys_name, const char *event_name,
+ pevent_event_handler_func func, void *context);
+int pevent_unregister_event_handler(struct pevent *pevent, int id,
+ const char *sys_name, const char *event_name,
+ pevent_event_handler_func func, void *context);
+int pevent_register_print_function(struct pevent *pevent,
+ pevent_func_handler func,
+ enum pevent_func_arg_type ret_type,
+ char *name, ...);
+int pevent_unregister_print_function(struct pevent *pevent,
+ pevent_func_handler func, char *name);
+
+struct format_field *pevent_find_common_field(struct event_format *event, const char *name);
+struct format_field *pevent_find_field(struct event_format *event, const char *name);
+struct format_field *pevent_find_any_field(struct event_format *event, const char *name);
+
+const char *pevent_find_function(struct pevent *pevent, unsigned long long addr);
+unsigned long long
+pevent_find_function_address(struct pevent *pevent, unsigned long long addr);
+unsigned long long pevent_read_number(struct pevent *pevent, const void *ptr, int size);
+int pevent_read_number_field(struct format_field *field, const void *data,
+ unsigned long long *value);
+
+struct event_format *pevent_find_event(struct pevent *pevent, int id);
+
+struct event_format *
+pevent_find_event_by_name(struct pevent *pevent, const char *sys, const char *name);
+
+void pevent_data_lat_fmt(struct pevent *pevent,
+ struct trace_seq *s, struct pevent_record *record);
+int pevent_data_type(struct pevent *pevent, struct pevent_record *rec);
+struct event_format *pevent_data_event_from_type(struct pevent *pevent, int type);
+int pevent_data_pid(struct pevent *pevent, struct pevent_record *rec);
+const char *pevent_data_comm_from_pid(struct pevent *pevent, int pid);
+void pevent_event_info(struct trace_seq *s, struct event_format *event,
+ struct pevent_record *record);
+int pevent_strerror(struct pevent *pevent, enum pevent_errno errnum,
+ char *buf, size_t buflen);
+
+struct event_format **pevent_list_events(struct pevent *pevent, enum event_sort_type);
+struct format_field **pevent_event_common_fields(struct event_format *event);
+struct format_field **pevent_event_fields(struct event_format *event);
+
+static inline int pevent_get_cpus(struct pevent *pevent)
+{
+ return pevent->cpus;
+}
+
+static inline void pevent_set_cpus(struct pevent *pevent, int cpus)
+{
+ pevent->cpus = cpus;
+}
+
+static inline int pevent_get_long_size(struct pevent *pevent)
+{
+ return pevent->long_size;
+}
+
+static inline void pevent_set_long_size(struct pevent *pevent, int long_size)
+{
+ pevent->long_size = long_size;
+}
+
+static inline int pevent_get_page_size(struct pevent *pevent)
+{
+ return pevent->page_size;
+}
+
+static inline void pevent_set_page_size(struct pevent *pevent, int _page_size)
+{
+ pevent->page_size = _page_size;
+}
+
+static inline int pevent_is_file_bigendian(struct pevent *pevent)
+{
+ return pevent->file_bigendian;
+}
+
+static inline void pevent_set_file_bigendian(struct pevent *pevent, int endian)
+{
+ pevent->file_bigendian = endian;
+}
+
+static inline int pevent_is_host_bigendian(struct pevent *pevent)
+{
+ return pevent->host_bigendian;
+}
+
+static inline void pevent_set_host_bigendian(struct pevent *pevent, int endian)
+{
+ pevent->host_bigendian = endian;
+}
+
+static inline int pevent_is_latency_format(struct pevent *pevent)
+{
+ return pevent->latency_format;
+}
+
+static inline void pevent_set_latency_format(struct pevent *pevent, int lat)
+{
+ pevent->latency_format = lat;
+}
+
+struct pevent *pevent_alloc(void);
+void pevent_free(struct pevent *pevent);
+void pevent_ref(struct pevent *pevent);
+void pevent_unref(struct pevent *pevent);
+
+/* access to the internal parser */
+void pevent_buffer_init(const char *buf, unsigned long long size);
+enum event_type pevent_read_token(char **tok);
+void pevent_free_token(char *token);
+int pevent_peek_char(void);
+const char *pevent_get_input_buf(void);
+unsigned long long pevent_get_input_buf_ptr(void);
+
+/* for debugging */
+void pevent_print_funcs(struct pevent *pevent);
+void pevent_print_printk(struct pevent *pevent);
+
+/* ----------------------- filtering ----------------------- */
+
+enum filter_boolean_type {
+ FILTER_FALSE,
+ FILTER_TRUE,
+};
+
+enum filter_op_type {
+ FILTER_OP_AND = 1,
+ FILTER_OP_OR,
+ FILTER_OP_NOT,
+};
+
+enum filter_cmp_type {
+ FILTER_CMP_NONE,
+ FILTER_CMP_EQ,
+ FILTER_CMP_NE,
+ FILTER_CMP_GT,
+ FILTER_CMP_LT,
+ FILTER_CMP_GE,
+ FILTER_CMP_LE,
+ FILTER_CMP_MATCH,
+ FILTER_CMP_NOT_MATCH,
+ FILTER_CMP_REGEX,
+ FILTER_CMP_NOT_REGEX,
+};
+
+enum filter_exp_type {
+ FILTER_EXP_NONE,
+ FILTER_EXP_ADD,
+ FILTER_EXP_SUB,
+ FILTER_EXP_MUL,
+ FILTER_EXP_DIV,
+ FILTER_EXP_MOD,
+ FILTER_EXP_RSHIFT,
+ FILTER_EXP_LSHIFT,
+ FILTER_EXP_AND,
+ FILTER_EXP_OR,
+ FILTER_EXP_XOR,
+ FILTER_EXP_NOT,
+};
+
+enum filter_arg_type {
+ FILTER_ARG_NONE,
+ FILTER_ARG_BOOLEAN,
+ FILTER_ARG_VALUE,
+ FILTER_ARG_FIELD,
+ FILTER_ARG_EXP,
+ FILTER_ARG_OP,
+ FILTER_ARG_NUM,
+ FILTER_ARG_STR,
+};
+
+enum filter_value_type {
+ FILTER_NUMBER,
+ FILTER_STRING,
+ FILTER_CHAR
+};
+
+struct filter_arg;
+
+struct filter_arg_boolean {
+ enum filter_boolean_type value;
+};
+
+struct filter_arg_field {
+ struct format_field *field;
+};
+
+struct filter_arg_value {
+ enum filter_value_type type;
+ union {
+ char *str;
+ unsigned long long val;
+ };
+};
+
+struct filter_arg_op {
+ enum filter_op_type type;
+ struct filter_arg *left;
+ struct filter_arg *right;
+};
+
+struct filter_arg_exp {
+ enum filter_exp_type type;
+ struct filter_arg *left;
+ struct filter_arg *right;
+};
+
+struct filter_arg_num {
+ enum filter_cmp_type type;
+ struct filter_arg *left;
+ struct filter_arg *right;
+};
+
+struct filter_arg_str {
+ enum filter_cmp_type type;
+ struct format_field *field;
+ char *val;
+ char *buffer;
+ regex_t reg;
+};
+
+struct filter_arg {
+ enum filter_arg_type type;
+ union {
+ struct filter_arg_boolean boolean;
+ struct filter_arg_field field;
+ struct filter_arg_value value;
+ struct filter_arg_op op;
+ struct filter_arg_exp exp;
+ struct filter_arg_num num;
+ struct filter_arg_str str;
+ };
+};
+
+struct filter_type {
+ int event_id;
+ struct event_format *event;
+ struct filter_arg *filter;
+};
+
+#define PEVENT_FILTER_ERROR_BUFSZ 1024
+
+struct event_filter {
+ struct pevent *pevent;
+ int filters;
+ struct filter_type *event_filters;
+ char error_buffer[PEVENT_FILTER_ERROR_BUFSZ];
+};
+
+struct event_filter *pevent_filter_alloc(struct pevent *pevent);
+
+/* for backward compatibility */
+#define FILTER_NONE PEVENT_ERRNO__NO_FILTER
+#define FILTER_NOEXIST PEVENT_ERRNO__FILTER_NOT_FOUND
+#define FILTER_MISS PEVENT_ERRNO__FILTER_MISS
+#define FILTER_MATCH PEVENT_ERRNO__FILTER_MATCH
+
+enum filter_trivial_type {
+ FILTER_TRIVIAL_FALSE,
+ FILTER_TRIVIAL_TRUE,
+ FILTER_TRIVIAL_BOTH,
+};
+
+enum pevent_errno pevent_filter_add_filter_str(struct event_filter *filter,
+ const char *filter_str);
+
+enum pevent_errno pevent_filter_match(struct event_filter *filter,
+ struct pevent_record *record);
+
+int pevent_filter_strerror(struct event_filter *filter, enum pevent_errno err,
+ char *buf, size_t buflen);
+
+int pevent_event_filtered(struct event_filter *filter,
+ int event_id);
+
+void pevent_filter_reset(struct event_filter *filter);
+
+int pevent_filter_clear_trivial(struct event_filter *filter,
+ enum filter_trivial_type type);
+
+void pevent_filter_free(struct event_filter *filter);
+
+char *pevent_filter_make_string(struct event_filter *filter, int event_id);
+
+int pevent_filter_remove_event(struct event_filter *filter,
+ int event_id);
+
+int pevent_filter_event_has_trivial(struct event_filter *filter,
+ int event_id,
+ enum filter_trivial_type type);
+
+int pevent_filter_copy(struct event_filter *dest, struct event_filter *source);
+
+int pevent_update_trivial(struct event_filter *dest, struct event_filter *source,
+ enum filter_trivial_type type);
+
+int pevent_filter_compare(struct event_filter *filter1, struct event_filter *filter2);
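
/*
 * Usage sketch (illustrative only): build a filter and test a record
 * against it.  The filter string and its exact syntax are assumptions for
 * the example; real callers should inspect the pevent_errno results.
 */
static int keep_record(struct pevent *pevent, struct pevent_record *record)
{
	struct event_filter *filter = pevent_filter_alloc(pevent);
	int match;

	if (!filter)
		return 1;

	if (pevent_filter_add_filter_str(filter, "sched_switch:next_pid > 0")) {
		pevent_filter_free(filter);
		return 1;
	}

	match = pevent_filter_match(filter, record) == PEVENT_ERRNO__FILTER_MATCH;
	pevent_filter_free(filter);

	return match;
}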
+
+#endif /* _PARSE_EVENTS_H */
diff --git a/tools/lib/traceevent/event-plugin.c b/tools/lib/traceevent/event-plugin.c
new file mode 100644
index 00000000000..136162c03af
--- /dev/null
+++ b/tools/lib/traceevent/event-plugin.c
@@ -0,0 +1,416 @@
+/*
+ * Copyright (C) 2009, 2010 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License (not later!)
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <dlfcn.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <dirent.h>
+#include "event-parse.h"
+#include "event-utils.h"
+
+#define LOCAL_PLUGIN_DIR ".traceevent/plugins"
+
+static struct registered_plugin_options {
+ struct registered_plugin_options *next;
+ struct pevent_plugin_option *options;
+} *registered_options;
+
+static struct trace_plugin_options {
+ struct trace_plugin_options *next;
+ char *plugin;
+ char *option;
+ char *value;
+} *trace_plugin_options;
+
+struct plugin_list {
+ struct plugin_list *next;
+ char *name;
+ void *handle;
+};
+
+/**
+ * traceevent_plugin_list_options - get list of plugin options
+ *
+ * Returns an array of char strings that list the currently registered
+ * plugin options in the format of <plugin>:<option>. This list can be
+ * used for toggling the options.
+ *
+ * Returns NULL if no options are registered. On error it returns
+ * INVALID_PLUGIN_LIST_OPTION.
+ *
+ * Must be freed with traceevent_plugin_free_options_list().
+ */
+char **traceevent_plugin_list_options(void)
+{
+ struct registered_plugin_options *reg;
+ struct pevent_plugin_option *op;
+ char **list = NULL;
+ char *name;
+ int count = 0;
+
+ for (reg = registered_options; reg; reg = reg->next) {
+ for (op = reg->options; op->name; op++) {
+ char *alias = op->plugin_alias ? op->plugin_alias : op->file;
+ char **temp = list;
+
+ name = malloc(strlen(op->name) + strlen(alias) + 2);
+ if (!name)
+ goto err;
+
+ sprintf(name, "%s:%s", alias, op->name);
+ list = realloc(list, count + 2);
+ if (!list) {
+ list = temp;
+ free(name);
+ goto err;
+ }
+ list[count++] = name;
+ list[count] = NULL;
+ }
+ }
+ return list;
+
+ err:
+ while (--count >= 0)
+ free(list[count]);
+ free(list);
+
+ return INVALID_PLUGIN_LIST_OPTION;
+}
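
/*
 * Usage sketch (illustrative only): print every registered plugin option,
 * relying on the includes at the top of this file.
 */
static void show_plugin_options(void)
{
	char **options = traceevent_plugin_list_options();
	int i;

	if (!options || options == INVALID_PLUGIN_LIST_OPTION)
		return;

	for (i = 0; options[i]; i++)
		printf("%s\n", options[i]);

	traceevent_plugin_free_options_list(options);
}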
+
+void traceevent_plugin_free_options_list(char **list)
+{
+ int i;
+
+ if (!list)
+ return;
+
+ if (list == INVALID_PLUGIN_LIST_OPTION)
+ return;
+
+ for (i = 0; list[i]; i++)
+ free(list[i]);
+
+ free(list);
+}
+
+static int
+update_option(const char *file, struct pevent_plugin_option *option)
+{
+ struct trace_plugin_options *op;
+ char *plugin;
+
+ if (option->plugin_alias) {
+ plugin = strdup(option->plugin_alias);
+ if (!plugin)
+ return -1;
+ } else {
+ char *p;
+ plugin = strdup(file);
+ if (!plugin)
+ return -1;
+ p = strstr(plugin, ".");
+ if (p)
+ *p = '\0';
+ }
+
+ /* first look for named options */
+ for (op = trace_plugin_options; op; op = op->next) {
+ if (!op->plugin)
+ continue;
+ if (strcmp(op->plugin, plugin) != 0)
+ continue;
+ if (strcmp(op->option, option->name) != 0)
+ continue;
+
+ option->value = op->value;
+ option->set ^= 1;
+ goto out;
+ }
+
+	/* then look for unnamed options */
+ for (op = trace_plugin_options; op; op = op->next) {
+ if (op->plugin)
+ continue;
+ if (strcmp(op->option, option->name) != 0)
+ continue;
+
+ option->value = op->value;
+ option->set ^= 1;
+ break;
+ }
+
+ out:
+ free(plugin);
+ return 0;
+}
+
+/**
+ * traceevent_plugin_add_options - Add a set of options by a plugin
+ * @name: The name of the plugin adding the options
+ * @options: The set of options being loaded
+ *
+ * Sets the options to the values that have been added by the user.
+ */
+int traceevent_plugin_add_options(const char *name,
+ struct pevent_plugin_option *options)
+{
+ struct registered_plugin_options *reg;
+
+ reg = malloc(sizeof(*reg));
+ if (!reg)
+ return -1;
+ reg->next = registered_options;
+ reg->options = options;
+ registered_options = reg;
+
+ while (options->name) {
+ update_option(name, options);
+ options++;
+ }
+ return 0;
+}
+
+/**
+ * traceevent_plugin_remove_options - remove plugin options that were registered
+ * @options: Options to remove that were registered with traceevent_plugin_add_options()
+ */
+void traceevent_plugin_remove_options(struct pevent_plugin_option *options)
+{
+ struct registered_plugin_options **last;
+ struct registered_plugin_options *reg;
+
+ for (last = &registered_options; *last; last = &(*last)->next) {
+ if ((*last)->options == options) {
+ reg = *last;
+ *last = reg->next;
+ free(reg);
+ return;
+ }
+ }
+}
+
+/**
+ * traceevent_print_plugins - print out the list of plugins loaded
+ * @s: the trace_seq descriptor to write to
+ * @prefix: The prefix string to add before listing the plugin name
+ * @suffix: The suffix string to append after the plugin name
+ * @list: The list of plugins (usually returned by traceevent_load_plugins())
+ *
+ * Writes to the trace_seq @s the list of plugins (files) that was
+ * returned by traceevent_load_plugins(). Use @prefix and @suffix for
+ * formatting, e.g. @prefix = " ", @suffix = "\n".
+ */
+void traceevent_print_plugins(struct trace_seq *s,
+ const char *prefix, const char *suffix,
+ const struct plugin_list *list)
+{
+ while (list) {
+ trace_seq_printf(s, "%s%s%s", prefix, list->name, suffix);
+ list = list->next;
+ }
+}
+
+static void
+load_plugin(struct pevent *pevent, const char *path,
+ const char *file, void *data)
+{
+ struct plugin_list **plugin_list = data;
+ pevent_plugin_load_func func;
+ struct plugin_list *list;
+ const char *alias;
+ char *plugin;
+ void *handle;
+
+ plugin = malloc(strlen(path) + strlen(file) + 2);
+ if (!plugin) {
+ warning("could not allocate plugin memory\n");
+ return;
+ }
+
+ strcpy(plugin, path);
+ strcat(plugin, "/");
+ strcat(plugin, file);
+
+ handle = dlopen(plugin, RTLD_NOW | RTLD_GLOBAL);
+ if (!handle) {
+ warning("could not load plugin '%s'\n%s\n",
+ plugin, dlerror());
+ goto out_free;
+ }
+
+ alias = dlsym(handle, PEVENT_PLUGIN_ALIAS_NAME);
+ if (!alias)
+ alias = file;
+
+ func = dlsym(handle, PEVENT_PLUGIN_LOADER_NAME);
+ if (!func) {
+ warning("could not find func '%s' in plugin '%s'\n%s\n",
+ PEVENT_PLUGIN_LOADER_NAME, plugin, dlerror());
+ goto out_free;
+ }
+
+ list = malloc(sizeof(*list));
+ if (!list) {
+ warning("could not allocate plugin memory\n");
+ goto out_free;
+ }
+
+ list->next = *plugin_list;
+ list->handle = handle;
+ list->name = plugin;
+ *plugin_list = list;
+
+ pr_stat("registering plugin: %s", plugin);
+ func(pevent);
+ return;
+
+ out_free:
+ free(plugin);
+}
+
+static void
+load_plugins_dir(struct pevent *pevent, const char *suffix,
+ const char *path,
+ void (*load_plugin)(struct pevent *pevent,
+ const char *path,
+ const char *name,
+ void *data),
+ void *data)
+{
+ struct dirent *dent;
+ struct stat st;
+ DIR *dir;
+ int ret;
+
+ ret = stat(path, &st);
+ if (ret < 0)
+ return;
+
+ if (!S_ISDIR(st.st_mode))
+ return;
+
+ dir = opendir(path);
+ if (!dir)
+ return;
+
+ while ((dent = readdir(dir))) {
+ const char *name = dent->d_name;
+
+ if (strcmp(name, ".") == 0 ||
+ strcmp(name, "..") == 0)
+ continue;
+
+		/* Only load plugins that end in suffix */
+		if (strlen(name) < strlen(suffix))
+			continue;
+		if (strcmp(name + (strlen(name) - strlen(suffix)), suffix) != 0)
+			continue;
+
+ load_plugin(pevent, path, name, data);
+ }
+
+ closedir(dir);
+}
+
+static void
+load_plugins(struct pevent *pevent, const char *suffix,
+ void (*load_plugin)(struct pevent *pevent,
+ const char *path,
+ const char *name,
+ void *data),
+ void *data)
+{
+ char *home;
+ char *path;
+ char *envdir;
+
+ if (pevent->flags & PEVENT_DISABLE_PLUGINS)
+ return;
+
+ /*
+ * If a system plugin directory was defined,
+ * check that first.
+ */
+#ifdef PLUGIN_DIR
+ if (!(pevent->flags & PEVENT_DISABLE_SYS_PLUGINS))
+ load_plugins_dir(pevent, suffix, PLUGIN_DIR,
+ load_plugin, data);
+#endif
+
+ /*
+ * Next let the environment-set plugin directory
+ * override the system defaults.
+ */
+ envdir = getenv("TRACEEVENT_PLUGIN_DIR");
+ if (envdir)
+ load_plugins_dir(pevent, suffix, envdir, load_plugin, data);
+
+ /*
+ * Now let the home directory override the environment
+ * or system defaults.
+ */
+ home = getenv("HOME");
+ if (!home)
+ return;
+
+ path = malloc(strlen(home) + strlen(LOCAL_PLUGIN_DIR) + 2);
+ if (!path) {
+ warning("could not allocate plugin memory\n");
+ return;
+ }
+
+ strcpy(path, home);
+ strcat(path, "/");
+ strcat(path, LOCAL_PLUGIN_DIR);
+
+ load_plugins_dir(pevent, suffix, path, load_plugin, data);
+
+ free(path);
+}
+
+struct plugin_list*
+traceevent_load_plugins(struct pevent *pevent)
+{
+ struct plugin_list *list = NULL;
+
+ load_plugins(pevent, ".so", load_plugin, &list);
+ return list;
+}
+
+void
+traceevent_unload_plugins(struct plugin_list *plugin_list, struct pevent *pevent)
+{
+ pevent_plugin_unload_func func;
+ struct plugin_list *list;
+
+ while (plugin_list) {
+ list = plugin_list;
+ plugin_list = list->next;
+ func = dlsym(list->handle, PEVENT_PLUGIN_UNLOADER_NAME);
+ if (func)
+ func(pevent);
+ dlclose(list->handle);
+ free(list->name);
+ free(list);
+ }
+}
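
Putting the pieces of this file together, a rough usage sketch; the struct pevent and struct trace_seq handles are assumed to have been set up elsewhere in the library.

static void demo_plugins(struct pevent *pevent, struct trace_seq *s)
{
	struct plugin_list *plugins;

	/* Scans the system dir, $TRACEEVENT_PLUGIN_DIR and ~/LOCAL_PLUGIN_DIR */
	plugins = traceevent_load_plugins(pevent);

	/* One line per loaded .so file */
	traceevent_print_plugins(s, "  ", "\n", plugins);

	/* Calls each plugin's unloader, then dlclose()s and frees the list */
	traceevent_unload_plugins(plugins, pevent);
}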
diff --git a/tools/lib/traceevent/event-utils.h b/tools/lib/traceevent/event-utils.h
new file mode 100644
index 00000000000..d1dc2170e40
--- /dev/null
+++ b/tools/lib/traceevent/event-utils.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (C) 2010 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License (not later!)
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+#ifndef __UTIL_H
+#define __UTIL_H
+
+#include <ctype.h>
+#include <stdarg.h>
+
+/* Can be overridden */
+void warning(const char *fmt, ...);
+void pr_stat(const char *fmt, ...);
+void vpr_stat(const char *fmt, va_list ap);
+
+/* Always available */
+void __warning(const char *fmt, ...);
+void __pr_stat(const char *fmt, ...);
+
+void __vwarning(const char *fmt, ...);
+void __vpr_stat(const char *fmt, ...);
+
+#define min(x, y) ({ \
+ typeof(x) _min1 = (x); \
+ typeof(y) _min2 = (y); \
+ (void) (&_min1 == &_min2); \
+ _min1 < _min2 ? _min1 : _min2; })
+
+static inline char *strim(char *string)
+{
+ char *ret;
+
+ if (!string)
+ return NULL;
+ while (*string) {
+ if (!isspace(*string))
+ break;
+ string++;
+ }
+ ret = string;
+
+ string = ret + strlen(ret) - 1;
+ while (string > ret) {
+ if (!isspace(*string))
+ break;
+ string--;
+ }
+ string[1] = 0;
+
+ return ret;
+}
+
+static inline int has_text(const char *text)
+{
+ if (!text)
+ return 0;
+
+ while (*text) {
+ if (!isspace(*text))
+ return 1;
+ text++;
+ }
+
+ return 0;
+}
+
+#endif
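
A small usage sketch for the helpers above; note that strim() edits the buffer in place and returns a pointer into it, so the buffer must outlive the returned string.

static int parse_event_name(char *buf)	/* e.g. buf = "  sched_switch  \n" */
{
	char *name = strim(buf);

	if (!has_text(name))
		return -1;	/* blank or whitespace-only line */

	/* 'name' now points at "sched_switch" inside buf */
	return 0;
}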
diff --git a/tools/lib/traceevent/kbuffer-parse.c b/tools/lib/traceevent/kbuffer-parse.c
new file mode 100644
index 00000000000..dcc665228c7
--- /dev/null
+++ b/tools/lib/traceevent/kbuffer-parse.c
@@ -0,0 +1,732 @@
+/*
+ * Copyright (C) 2009, 2010 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License (not later!)
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "kbuffer.h"
+
+#define MISSING_EVENTS (1 << 31)
+#define MISSING_STORED (1 << 30)
+
+#define COMMIT_MASK ((1 << 27) - 1)
+
+enum {
+ KBUFFER_FL_HOST_BIG_ENDIAN = (1<<0),
+ KBUFFER_FL_BIG_ENDIAN = (1<<1),
+ KBUFFER_FL_LONG_8 = (1<<2),
+ KBUFFER_FL_OLD_FORMAT = (1<<3),
+};
+
+#define ENDIAN_MASK (KBUFFER_FL_HOST_BIG_ENDIAN | KBUFFER_FL_BIG_ENDIAN)
+
+/**
+ * struct kbuffer - sub-buffer parsing state
+ * @timestamp - timestamp of current event
+ * @lost_events - # of lost events between this subbuffer and previous
+ * @flags - special flags of the kbuffer
+ * @subbuffer - pointer to the sub-buffer page
+ * @data - pointer to the start of data on the sub-buffer page
+ * @index - index from @data to the @curr event data
+ * @curr - offset from @data to the start of current event
+ * (includes metadata)
+ * @next - offset from @data to the start of next event
+ * @size - The size of data on @data
+ * @start - The offset from @subbuffer where @data lives
+ *
+ * @read_4 - Function to read 4 raw bytes (may swap)
+ * @read_8 - Function to read 8 raw bytes (may swap)
+ * @read_long - Function to read a long word (4 or 8 bytes with needed swap)
+ */
+struct kbuffer {
+ unsigned long long timestamp;
+ long long lost_events;
+ unsigned long flags;
+ void *subbuffer;
+ void *data;
+ unsigned int index;
+ unsigned int curr;
+ unsigned int next;
+ unsigned int size;
+ unsigned int start;
+
+ unsigned int (*read_4)(void *ptr);
+ unsigned long long (*read_8)(void *ptr);
+ unsigned long long (*read_long)(struct kbuffer *kbuf, void *ptr);
+ int (*next_event)(struct kbuffer *kbuf);
+};
+
+static void *zmalloc(size_t size)
+{
+ return calloc(1, size);
+}
+
+static int host_is_bigendian(void)
+{
+ unsigned char str[] = { 0x1, 0x2, 0x3, 0x4 };
+ unsigned int *ptr;
+
+ ptr = (unsigned int *)str;
+ return *ptr == 0x01020304;
+}
+
+static int do_swap(struct kbuffer *kbuf)
+{
+ return ((kbuf->flags & KBUFFER_FL_HOST_BIG_ENDIAN) + kbuf->flags) &
+ ENDIAN_MASK;
+}
+
+static unsigned long long __read_8(void *ptr)
+{
+ unsigned long long data = *(unsigned long long *)ptr;
+
+ return data;
+}
+
+static unsigned long long __read_8_sw(void *ptr)
+{
+ unsigned long long data = *(unsigned long long *)ptr;
+ unsigned long long swap;
+
+ swap = ((data & 0xffULL) << 56) |
+ ((data & (0xffULL << 8)) << 40) |
+ ((data & (0xffULL << 16)) << 24) |
+ ((data & (0xffULL << 24)) << 8) |
+ ((data & (0xffULL << 32)) >> 8) |
+ ((data & (0xffULL << 40)) >> 24) |
+ ((data & (0xffULL << 48)) >> 40) |
+ ((data & (0xffULL << 56)) >> 56);
+
+ return swap;
+}
+
+static unsigned int __read_4(void *ptr)
+{
+ unsigned int data = *(unsigned int *)ptr;
+
+ return data;
+}
+
+static unsigned int __read_4_sw(void *ptr)
+{
+ unsigned int data = *(unsigned int *)ptr;
+ unsigned int swap;
+
+ swap = ((data & 0xffULL) << 24) |
+ ((data & (0xffULL << 8)) << 8) |
+ ((data & (0xffULL << 16)) >> 8) |
+ ((data & (0xffULL << 24)) >> 24);
+
+ return swap;
+}
+
+static unsigned long long read_8(struct kbuffer *kbuf, void *ptr)
+{
+ return kbuf->read_8(ptr);
+}
+
+static unsigned int read_4(struct kbuffer *kbuf, void *ptr)
+{
+ return kbuf->read_4(ptr);
+}
+
+static unsigned long long __read_long_8(struct kbuffer *kbuf, void *ptr)
+{
+ return kbuf->read_8(ptr);
+}
+
+static unsigned long long __read_long_4(struct kbuffer *kbuf, void *ptr)
+{
+ return kbuf->read_4(ptr);
+}
+
+static unsigned long long read_long(struct kbuffer *kbuf, void *ptr)
+{
+ return kbuf->read_long(kbuf, ptr);
+}
+
+static int calc_index(struct kbuffer *kbuf, void *ptr)
+{
+ return (unsigned long)ptr - (unsigned long)kbuf->data;
+}
+
+static int __next_event(struct kbuffer *kbuf);
+
+/**
+ * kbuffer_alloc - allocate a new kbuffer
+ * @size: enum to denote the size of a long word
+ * @endian: enum to denote endianness
+ *
+ * Allocates and returns a new kbuffer.
+ */
+struct kbuffer *
+kbuffer_alloc(enum kbuffer_long_size size, enum kbuffer_endian endian)
+{
+ struct kbuffer *kbuf;
+ int flags = 0;
+
+ switch (size) {
+ case KBUFFER_LSIZE_4:
+ break;
+ case KBUFFER_LSIZE_8:
+ flags |= KBUFFER_FL_LONG_8;
+ break;
+ default:
+ return NULL;
+ }
+
+ switch (endian) {
+ case KBUFFER_ENDIAN_LITTLE:
+ break;
+ case KBUFFER_ENDIAN_BIG:
+ flags |= KBUFFER_FL_BIG_ENDIAN;
+ break;
+ default:
+ return NULL;
+ }
+
+ kbuf = zmalloc(sizeof(*kbuf));
+ if (!kbuf)
+ return NULL;
+
+ kbuf->flags = flags;
+
+ if (host_is_bigendian())
+ kbuf->flags |= KBUFFER_FL_HOST_BIG_ENDIAN;
+
+ if (do_swap(kbuf)) {
+ kbuf->read_8 = __read_8_sw;
+ kbuf->read_4 = __read_4_sw;
+ } else {
+ kbuf->read_8 = __read_8;
+ kbuf->read_4 = __read_4;
+ }
+
+ if (kbuf->flags & KBUFFER_FL_LONG_8)
+ kbuf->read_long = __read_long_8;
+ else
+ kbuf->read_long = __read_long_4;
+
+ /* May be changed by kbuffer_set_old_format() */
+ kbuf->next_event = __next_event;
+
+ return kbuf;
+}
+
+/**
+ * kbuffer_free - free an allocated kbuffer
+ * @kbuf: The kbuffer to free
+ *
+ * Can take NULL as a parameter.
+ */
+void kbuffer_free(struct kbuffer *kbuf)
+{
+ free(kbuf);
+}
+
+static unsigned int type4host(struct kbuffer *kbuf,
+ unsigned int type_len_ts)
+{
+ if (kbuf->flags & KBUFFER_FL_BIG_ENDIAN)
+ return (type_len_ts >> 29) & 3;
+ else
+ return type_len_ts & 3;
+}
+
+static unsigned int len4host(struct kbuffer *kbuf,
+ unsigned int type_len_ts)
+{
+ if (kbuf->flags & KBUFFER_FL_BIG_ENDIAN)
+ return (type_len_ts >> 27) & 7;
+ else
+ return (type_len_ts >> 2) & 7;
+}
+
+static unsigned int type_len4host(struct kbuffer *kbuf,
+ unsigned int type_len_ts)
+{
+ if (kbuf->flags & KBUFFER_FL_BIG_ENDIAN)
+ return (type_len_ts >> 27) & ((1 << 5) - 1);
+ else
+ return type_len_ts & ((1 << 5) - 1);
+}
+
+static unsigned int ts4host(struct kbuffer *kbuf,
+ unsigned int type_len_ts)
+{
+ if (kbuf->flags & KBUFFER_FL_BIG_ENDIAN)
+ return type_len_ts & ((1 << 27) - 1);
+ else
+ return type_len_ts >> 5;
+}
+
+/*
+ * Linux 2.6.30 and earlier (not much earlier) had a different
+ * ring buffer format. It should be obsolete, but we handle it anyway.
+ */
+enum old_ring_buffer_type {
+ OLD_RINGBUF_TYPE_PADDING,
+ OLD_RINGBUF_TYPE_TIME_EXTEND,
+ OLD_RINGBUF_TYPE_TIME_STAMP,
+ OLD_RINGBUF_TYPE_DATA,
+};
+
+static unsigned int old_update_pointers(struct kbuffer *kbuf)
+{
+ unsigned long long extend;
+ unsigned int type_len_ts;
+ unsigned int type;
+ unsigned int len;
+ unsigned int delta;
+ unsigned int length;
+ void *ptr = kbuf->data + kbuf->curr;
+
+ type_len_ts = read_4(kbuf, ptr);
+ ptr += 4;
+
+ type = type4host(kbuf, type_len_ts);
+ len = len4host(kbuf, type_len_ts);
+ delta = ts4host(kbuf, type_len_ts);
+
+ switch (type) {
+ case OLD_RINGBUF_TYPE_PADDING:
+ kbuf->next = kbuf->size;
+ return 0;
+
+ case OLD_RINGBUF_TYPE_TIME_EXTEND:
+ extend = read_4(kbuf, ptr);
+ extend <<= TS_SHIFT;
+ extend += delta;
+ delta = extend;
+ ptr += 4;
+ break;
+
+ case OLD_RINGBUF_TYPE_TIME_STAMP:
+ /* should never happen! */
+ kbuf->curr = kbuf->size;
+ kbuf->next = kbuf->size;
+ kbuf->index = kbuf->size;
+ return -1;
+ default:
+ if (len)
+ length = len * 4;
+ else {
+ length = read_4(kbuf, ptr);
+ length -= 4;
+ ptr += 4;
+ }
+ break;
+ }
+
+ kbuf->timestamp += delta;
+ kbuf->index = calc_index(kbuf, ptr);
+ kbuf->next = kbuf->index + length;
+
+ return type;
+}
+
+static int __old_next_event(struct kbuffer *kbuf)
+{
+ int type;
+
+ do {
+ kbuf->curr = kbuf->next;
+ if (kbuf->next >= kbuf->size)
+ return -1;
+ type = old_update_pointers(kbuf);
+ } while (type == OLD_RINGBUF_TYPE_TIME_EXTEND || type == OLD_RINGBUF_TYPE_PADDING);
+
+ return 0;
+}
+
+static unsigned int
+translate_data(struct kbuffer *kbuf, void *data, void **rptr,
+ unsigned long long *delta, int *length)
+{
+ unsigned long long extend;
+ unsigned int type_len_ts;
+ unsigned int type_len;
+
+ type_len_ts = read_4(kbuf, data);
+ data += 4;
+
+ type_len = type_len4host(kbuf, type_len_ts);
+ *delta = ts4host(kbuf, type_len_ts);
+
+ switch (type_len) {
+ case KBUFFER_TYPE_PADDING:
+ *length = read_4(kbuf, data);
+ data += *length;
+ break;
+
+ case KBUFFER_TYPE_TIME_EXTEND:
+ extend = read_4(kbuf, data);
+ data += 4;
+ extend <<= TS_SHIFT;
+ extend += *delta;
+ *delta = extend;
+ *length = 0;
+ break;
+
+ case KBUFFER_TYPE_TIME_STAMP:
+ data += 12;
+ *length = 0;
+ break;
+ case 0:
+ *length = read_4(kbuf, data) - 4;
+ *length = (*length + 3) & ~3;
+ data += 4;
+ break;
+ default:
+ *length = type_len * 4;
+ break;
+ }
+
+ *rptr = data;
+
+ return type_len;
+}
+
+static unsigned int update_pointers(struct kbuffer *kbuf)
+{
+ unsigned long long delta;
+ unsigned int type_len;
+ int length;
+ void *ptr = kbuf->data + kbuf->curr;
+
+ type_len = translate_data(kbuf, ptr, &ptr, &delta, &length);
+
+ kbuf->timestamp += delta;
+ kbuf->index = calc_index(kbuf, ptr);
+ kbuf->next = kbuf->index + length;
+
+ return type_len;
+}
+
+/**
+ * kbuffer_translate_data - read raw data to get a record
+ * @swap: Set to 1 if bytes in words need to be swapped when read
+ * @data: The raw data to read
+ * @size: Address to store the size of the event data.
+ *
+ * Returns a pointer to the event data. To determine the entire
+ * record size (record metadata + data) just add the difference between
+ * @data and the returned value to @size.
+ */
+void *kbuffer_translate_data(int swap, void *data, unsigned int *size)
+{
+ unsigned long long delta;
+ struct kbuffer kbuf;
+ int type_len;
+ int length;
+ void *ptr;
+
+ if (swap) {
+ kbuf.read_8 = __read_8_sw;
+ kbuf.read_4 = __read_4_sw;
+ kbuf.flags = host_is_bigendian() ? 0 : KBUFFER_FL_BIG_ENDIAN;
+ } else {
+ kbuf.read_8 = __read_8;
+ kbuf.read_4 = __read_4;
+		kbuf.flags = host_is_bigendian() ? KBUFFER_FL_BIG_ENDIAN : 0;
+ }
+
+ type_len = translate_data(&kbuf, data, &ptr, &delta, &length);
+ switch (type_len) {
+ case KBUFFER_TYPE_PADDING:
+ case KBUFFER_TYPE_TIME_EXTEND:
+ case KBUFFER_TYPE_TIME_STAMP:
+ return NULL;
+ };
+
+ *size = length;
+
+ return ptr;
+}
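
A sketch of using kbuffer_translate_data() on one raw record, per the comment above; 'raw' and 'needs_swap' are assumptions of the caller (raw points at the record metadata, needs_swap says whether file and host endianness differ).

static void show_one_record(void *raw, int needs_swap)
{
	unsigned int size;
	void *payload;

	payload = kbuffer_translate_data(needs_swap, raw, &size);
	if (!payload)
		return;	/* padding, time-extend or time-stamp record */

	/* total record size = (payload - raw) + size, as documented above */
}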
+
+static int __next_event(struct kbuffer *kbuf)
+{
+ int type;
+
+ do {
+ kbuf->curr = kbuf->next;
+ if (kbuf->next >= kbuf->size)
+ return -1;
+ type = update_pointers(kbuf);
+ } while (type == KBUFFER_TYPE_TIME_EXTEND || type == KBUFFER_TYPE_PADDING);
+
+ return 0;
+}
+
+static int next_event(struct kbuffer *kbuf)
+{
+ return kbuf->next_event(kbuf);
+}
+
+/**
+ * kbuffer_next_event - increment the current pointer
+ * @kbuf: The kbuffer to read
+ * @ts: Address to store the next record's timestamp (may be NULL to ignore)
+ *
+ * Increments the pointers into the subbuffer of the kbuffer to point to the
+ * next event so that the next kbuffer_read_event() will return a
+ * new event.
+ *
+ * Returns the data of the next event if a new event exists on the subbuffer,
+ * NULL otherwise.
+ */
+void *kbuffer_next_event(struct kbuffer *kbuf, unsigned long long *ts)
+{
+ int ret;
+
+ if (!kbuf || !kbuf->subbuffer)
+ return NULL;
+
+ ret = next_event(kbuf);
+ if (ret < 0)
+ return NULL;
+
+ if (ts)
+ *ts = kbuf->timestamp;
+
+ return kbuf->data + kbuf->index;
+}
+
+/**
+ * kbuffer_load_subbuffer - load a new subbuffer into the kbuffer
+ * @kbuf: The kbuffer to load
+ * @subbuffer: The subbuffer to load into @kbuf.
+ *
+ * Load a new subbuffer (page) into @kbuf. This will reset all
+ * the pointers and update the @kbuf timestamp. The next read will
+ * return the first event on @subbuffer.
+ *
+ * Returns 0 on success, -1 otherwise.
+ */
+int kbuffer_load_subbuffer(struct kbuffer *kbuf, void *subbuffer)
+{
+ unsigned long long flags;
+ void *ptr = subbuffer;
+
+ if (!kbuf || !subbuffer)
+ return -1;
+
+ kbuf->subbuffer = subbuffer;
+
+ kbuf->timestamp = read_8(kbuf, ptr);
+ ptr += 8;
+
+ kbuf->curr = 0;
+
+ if (kbuf->flags & KBUFFER_FL_LONG_8)
+ kbuf->start = 16;
+ else
+ kbuf->start = 12;
+
+ kbuf->data = subbuffer + kbuf->start;
+
+ flags = read_long(kbuf, ptr);
+ kbuf->size = (unsigned int)flags & COMMIT_MASK;
+
+ if (flags & MISSING_EVENTS) {
+ if (flags & MISSING_STORED) {
+ ptr = kbuf->data + kbuf->size;
+ kbuf->lost_events = read_long(kbuf, ptr);
+ } else
+ kbuf->lost_events = -1;
+ } else
+ kbuf->lost_events = 0;
+
+ kbuf->index = 0;
+ kbuf->next = 0;
+
+ next_event(kbuf);
+
+ return 0;
+}
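
A sketch of the typical read loop built on the functions above; 'page' is assumed to hold one raw sub-buffer (for example read from per_cpu/*/trace_pipe_raw), and the word size and endianness arguments are whatever matches the traced kernel.

static void walk_subbuffer(void *page)
{
	struct kbuffer *kbuf;
	unsigned long long ts;
	void *event;

	kbuf = kbuffer_alloc(KBUFFER_LSIZE_8, KBUFFER_ENDIAN_LITTLE);
	if (!kbuf)
		return;

	if (kbuffer_load_subbuffer(kbuf, page) == 0) {
		/* The first event is already loaded; step through the rest */
		for (event = kbuffer_read_event(kbuf, &ts); event;
		     event = kbuffer_next_event(kbuf, &ts)) {
			/* 'event' is the record payload, 'ts' its timestamp */
		}
	}

	kbuffer_free(kbuf);
}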
+
+/**
+ * kbuffer_read_event - read the next event in the kbuffer subbuffer
+ * @kbuf: The kbuffer to read from
+ * @ts: The address to store the timestamp of the event (may be NULL to ignore)
+ *
+ * Returns a pointer to the data part of the current event.
+ * NULL if no event is left on the subbuffer.
+ */
+void *kbuffer_read_event(struct kbuffer *kbuf, unsigned long long *ts)
+{
+ if (!kbuf || !kbuf->subbuffer)
+ return NULL;
+
+ if (kbuf->curr >= kbuf->size)
+ return NULL;
+
+ if (ts)
+ *ts = kbuf->timestamp;
+ return kbuf->data + kbuf->index;
+}
+
+/**
+ * kbuffer_timestamp - Return the timestamp of the current event
+ * @kbuf: The kbuffer to read from
+ *
+ * Returns the timestamp of the current (next) event.
+ */
+unsigned long long kbuffer_timestamp(struct kbuffer *kbuf)
+{
+ return kbuf->timestamp;
+}
+
+/**
+ * kbuffer_read_at_offset - read the event that is at offset
+ * @kbuf: The kbuffer to read from
+ * @offset: The offset into the subbuffer
+ * @ts: The address to store the timestamp of the event (may be NULL to ignore)
+ *
+ * The @offset must be an index from the @kbuf subbuffer beginning.
+ * If @offset is bigger than the stored subbuffer, NULL will be returned.
+ *
+ * Returns the data of the record that is at @offset. Note, @offset does
+ * not need to be the start of the record, the offset just needs to be
+ * in the record (or beginning of it).
+ *
+ * Note, the kbuf timestamp and pointers are updated to the
+ * returned record. That is, kbuffer_read_event() will return the same
+ * data and timestamp, and kbuffer_next_event() will increment from
+ * this record.
+ */
+void *kbuffer_read_at_offset(struct kbuffer *kbuf, int offset,
+ unsigned long long *ts)
+{
+ void *data;
+
+ if (offset < kbuf->start)
+ offset = 0;
+ else
+ offset -= kbuf->start;
+
+ /* Reset the buffer */
+	kbuffer_load_subbuffer(kbuf, kbuf->subbuffer);
+
+	/* Offsets at or before the first record return the first record */
+	data = kbuffer_read_event(kbuf, ts);
+
+	while (kbuf->curr < offset) {
+ data = kbuffer_next_event(kbuf, ts);
+ if (!data)
+ break;
+ }
+
+ return data;
+}
+
+/**
+ * kbuffer_subbuffer_size - the size of the loaded subbuffer
+ * @kbuf: The kbuffer to read from
+ *
+ * Returns the size of the subbuffer. Note, this size covers the
+ * data only up to the end of the last event. The stored subbuffer may
+ * actually be bigger due to padding and such.
+ */
+int kbuffer_subbuffer_size(struct kbuffer *kbuf)
+{
+ return kbuf->size;
+}
+
+/**
+ * kbuffer_curr_index - Return the index of the record
+ * @kbuf: The kbuffer to read from
+ *
+ * Returns the index from the start of the data part of
+ * the subbuffer to the current location. Note this is not
+ * from the start of the subbuffer. An index of zero will
+ * point to the first record. Use kbuffer_curr_offset() for
+ * the actual offset (which can be used by kbuffer_read_at_offset()).
+ */
+int kbuffer_curr_index(struct kbuffer *kbuf)
+{
+ return kbuf->curr;
+}
+
+/**
+ * kbuffer_curr_offset - Return the offset of the record
+ * @kbuf: The kbuffer to read from
+ *
+ * Returns the offset from the start of the subbuffer to the
+ * current location.
+ */
+int kbuffer_curr_offset(struct kbuffer *kbuf)
+{
+ return kbuf->curr + kbuf->start;
+}
+
+/**
+ * kbuffer_event_size - return the size of the event data
+ * @kbuf: The kbuffer to read
+ *
+ * Returns the size of the event data (the payload not counting
+ * the meta data of the record) of the current event.
+ */
+int kbuffer_event_size(struct kbuffer *kbuf)
+{
+ return kbuf->next - kbuf->index;
+}
+
+/**
+ * kbuffer_curr_size - return the size of the entire record
+ * @kbuf: The kbuffer to read
+ *
+ * Returns the size of the entire record (meta data and payload)
+ * of the current event.
+ */
+int kbuffer_curr_size(struct kbuffer *kbuf)
+{
+ return kbuf->next - kbuf->curr;
+}
+
+/**
+ * kbuffer_missed_events - return the # of missed events from last event.
+ * @kbuf: The kbuffer to read from
+ *
+ * Returns the # of missed events (if recorded) before the current
+ * event. Note, only events at the beginning of a subbuffer can
+ * have missed events; for all other events within the buffer this
+ * returns zero.
+ */
+int kbuffer_missed_events(struct kbuffer *kbuf)
+{
+ /* Only the first event can have missed events */
+ if (kbuf->curr)
+ return 0;
+
+ return kbuf->lost_events;
+}
+
+/**
+ * kbuffer_set_old_format - set the kbuffer to use the old format parsing
+ * @kbuf: The kbuffer to set
+ *
+ * This is obsolete (or should be). The first kernels to use the
+ * new ring buffer had a slightly different ring buffer format
+ * (2.6.30 and earlier). It is still somewhat supported by kbuffer,
+ * but should not be counted on in the future.
+ */
+void kbuffer_set_old_format(struct kbuffer *kbuf)
+{
+ kbuf->flags |= KBUFFER_FL_OLD_FORMAT;
+
+ kbuf->next_event = __old_next_event;
+}
diff --git a/tools/lib/traceevent/kbuffer.h b/tools/lib/traceevent/kbuffer.h
new file mode 100644
index 00000000000..c831f64b17a
--- /dev/null
+++ b/tools/lib/traceevent/kbuffer.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (C) 2012 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License (not later!)
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+#ifndef _KBUFFER_H
+#define _KBUFFER_H
+
+#ifndef TS_SHIFT
+#define TS_SHIFT 27
+#endif
+
+enum kbuffer_endian {
+ KBUFFER_ENDIAN_BIG,
+ KBUFFER_ENDIAN_LITTLE,
+};
+
+enum kbuffer_long_size {
+ KBUFFER_LSIZE_4,
+ KBUFFER_LSIZE_8,
+};
+
+enum {
+ KBUFFER_TYPE_PADDING = 29,
+ KBUFFER_TYPE_TIME_EXTEND = 30,
+ KBUFFER_TYPE_TIME_STAMP = 31,
+};
+
+struct kbuffer;
+
+struct kbuffer *kbuffer_alloc(enum kbuffer_long_size size, enum kbuffer_endian endian);
+void kbuffer_free(struct kbuffer *kbuf);
+int kbuffer_load_subbuffer(struct kbuffer *kbuf, void *subbuffer);
+void *kbuffer_read_event(struct kbuffer *kbuf, unsigned long long *ts);
+void *kbuffer_next_event(struct kbuffer *kbuf, unsigned long long *ts);
+unsigned long long kbuffer_timestamp(struct kbuffer *kbuf);
+
+void *kbuffer_translate_data(int swap, void *data, unsigned int *size);
+
+void *kbuffer_read_at_offset(struct kbuffer *kbuf, int offset, unsigned long long *ts);
+
+int kbuffer_curr_index(struct kbuffer *kbuf);
+
+int kbuffer_curr_offset(struct kbuffer *kbuf);
+int kbuffer_curr_size(struct kbuffer *kbuf);
+int kbuffer_event_size(struct kbuffer *kbuf);
+int kbuffer_missed_events(struct kbuffer *kbuf);
+int kbuffer_subbuffer_size(struct kbuffer *kbuf);
+
+void kbuffer_set_old_format(struct kbuffer *kbuf);
+
+#endif /* _KBUFFER_H */
diff --git a/tools/lib/traceevent/parse-filter.c b/tools/lib/traceevent/parse-filter.c
new file mode 100644
index 00000000000..b50234402fc
--- /dev/null
+++ b/tools/lib/traceevent/parse-filter.c
@@ -0,0 +1,2432 @@
+/*
+ * Copyright (C) 2010 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License (not later!)
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <errno.h>
+#include <sys/types.h>
+
+#include "event-parse.h"
+#include "event-utils.h"
+
+#define COMM "COMM"
+
+static struct format_field comm = {
+ .name = "COMM",
+};
+
+struct event_list {
+ struct event_list *next;
+ struct event_format *event;
+};
+
+static void show_error(char *error_buf, const char *fmt, ...)
+{
+ unsigned long long index;
+ const char *input;
+ va_list ap;
+ int len;
+ int i;
+
+ input = pevent_get_input_buf();
+ index = pevent_get_input_buf_ptr();
+ len = input ? strlen(input) : 0;
+
+ if (len) {
+ strcpy(error_buf, input);
+ error_buf[len] = '\n';
+ for (i = 1; i < len && i < index; i++)
+ error_buf[len+i] = ' ';
+ error_buf[len + i] = '^';
+ error_buf[len + i + 1] = '\n';
+ len += i+2;
+ }
+
+ va_start(ap, fmt);
+ vsnprintf(error_buf + len, PEVENT_FILTER_ERROR_BUFSZ - len, fmt, ap);
+ va_end(ap);
+}
+
+static void free_token(char *token)
+{
+ pevent_free_token(token);
+}
+
+static enum event_type read_token(char **tok)
+{
+ enum event_type type;
+ char *token = NULL;
+
+ do {
+ free_token(token);
+ type = pevent_read_token(&token);
+ } while (type == EVENT_NEWLINE || type == EVENT_SPACE);
+
+ /* If token is = or ! check to see if the next char is ~ */
+ if (token &&
+ (strcmp(token, "=") == 0 || strcmp(token, "!") == 0) &&
+ pevent_peek_char() == '~') {
+ /* append it */
+ *tok = malloc(3);
+ if (*tok == NULL) {
+ free_token(token);
+ return EVENT_ERROR;
+ }
+ sprintf(*tok, "%c%c", *token, '~');
+ free_token(token);
+ /* Now remove the '~' from the buffer */
+ pevent_read_token(&token);
+ free_token(token);
+ } else
+ *tok = token;
+
+ return type;
+}
+
+static int filter_cmp(const void *a, const void *b)
+{
+ const struct filter_type *ea = a;
+ const struct filter_type *eb = b;
+
+ if (ea->event_id < eb->event_id)
+ return -1;
+
+ if (ea->event_id > eb->event_id)
+ return 1;
+
+ return 0;
+}
+
+static struct filter_type *
+find_filter_type(struct event_filter *filter, int id)
+{
+ struct filter_type *filter_type;
+ struct filter_type key;
+
+ key.event_id = id;
+
+ filter_type = bsearch(&key, filter->event_filters,
+ filter->filters,
+ sizeof(*filter->event_filters),
+ filter_cmp);
+
+ return filter_type;
+}
+
+static struct filter_type *
+add_filter_type(struct event_filter *filter, int id)
+{
+ struct filter_type *filter_type;
+ int i;
+
+ filter_type = find_filter_type(filter, id);
+ if (filter_type)
+ return filter_type;
+
+ filter_type = realloc(filter->event_filters,
+ sizeof(*filter->event_filters) *
+ (filter->filters + 1));
+ if (!filter_type)
+ return NULL;
+
+ filter->event_filters = filter_type;
+
+ for (i = 0; i < filter->filters; i++) {
+ if (filter->event_filters[i].event_id > id)
+ break;
+ }
+
+ if (i < filter->filters)
+ memmove(&filter->event_filters[i+1],
+ &filter->event_filters[i],
+ sizeof(*filter->event_filters) *
+ (filter->filters - i));
+
+ filter_type = &filter->event_filters[i];
+ filter_type->event_id = id;
+ filter_type->event = pevent_find_event(filter->pevent, id);
+ filter_type->filter = NULL;
+
+ filter->filters++;
+
+ return filter_type;
+}
+
+/**
+ * pevent_filter_alloc - create a new event filter
+ * @pevent: The pevent that this filter is associated with
+ */
+struct event_filter *pevent_filter_alloc(struct pevent *pevent)
+{
+ struct event_filter *filter;
+
+ filter = malloc(sizeof(*filter));
+ if (filter == NULL)
+ return NULL;
+
+ memset(filter, 0, sizeof(*filter));
+ filter->pevent = pevent;
+ pevent_ref(pevent);
+
+ return filter;
+}
+
+static struct filter_arg *allocate_arg(void)
+{
+ return calloc(1, sizeof(struct filter_arg));
+}
+
+static void free_arg(struct filter_arg *arg)
+{
+ if (!arg)
+ return;
+
+ switch (arg->type) {
+ case FILTER_ARG_NONE:
+ case FILTER_ARG_BOOLEAN:
+ break;
+
+ case FILTER_ARG_NUM:
+ free_arg(arg->num.left);
+ free_arg(arg->num.right);
+ break;
+
+ case FILTER_ARG_EXP:
+ free_arg(arg->exp.left);
+ free_arg(arg->exp.right);
+ break;
+
+ case FILTER_ARG_STR:
+ free(arg->str.val);
+ regfree(&arg->str.reg);
+ free(arg->str.buffer);
+ break;
+
+ case FILTER_ARG_VALUE:
+ if (arg->value.type == FILTER_STRING ||
+ arg->value.type == FILTER_CHAR)
+ free(arg->value.str);
+ break;
+
+ case FILTER_ARG_OP:
+ free_arg(arg->op.left);
+ free_arg(arg->op.right);
+ default:
+ break;
+ }
+
+ free(arg);
+}
+
+static int add_event(struct event_list **events,
+ struct event_format *event)
+{
+ struct event_list *list;
+
+ list = malloc(sizeof(*list));
+ if (list == NULL)
+ return -1;
+
+ list->next = *events;
+ *events = list;
+ list->event = event;
+ return 0;
+}
+
+static int event_match(struct event_format *event,
+ regex_t *sreg, regex_t *ereg)
+{
+ if (sreg) {
+ return !regexec(sreg, event->system, 0, NULL, 0) &&
+ !regexec(ereg, event->name, 0, NULL, 0);
+ }
+
+ return !regexec(ereg, event->system, 0, NULL, 0) ||
+ !regexec(ereg, event->name, 0, NULL, 0);
+}
+
+static enum pevent_errno
+find_event(struct pevent *pevent, struct event_list **events,
+ char *sys_name, char *event_name)
+{
+ struct event_format *event;
+ regex_t ereg;
+ regex_t sreg;
+ int match = 0;
+ int fail = 0;
+ char *reg;
+ int ret;
+ int i;
+
+ if (!event_name) {
+ /* if no name is given, then swap sys and name */
+ event_name = sys_name;
+ sys_name = NULL;
+ }
+
+ reg = malloc(strlen(event_name) + 3);
+ if (reg == NULL)
+ return PEVENT_ERRNO__MEM_ALLOC_FAILED;
+
+ sprintf(reg, "^%s$", event_name);
+
+ ret = regcomp(&ereg, reg, REG_ICASE|REG_NOSUB);
+ free(reg);
+
+ if (ret)
+ return PEVENT_ERRNO__INVALID_EVENT_NAME;
+
+ if (sys_name) {
+ reg = malloc(strlen(sys_name) + 3);
+ if (reg == NULL) {
+ regfree(&ereg);
+ return PEVENT_ERRNO__MEM_ALLOC_FAILED;
+ }
+
+ sprintf(reg, "^%s$", sys_name);
+ ret = regcomp(&sreg, reg, REG_ICASE|REG_NOSUB);
+ free(reg);
+ if (ret) {
+ regfree(&ereg);
+ return PEVENT_ERRNO__INVALID_EVENT_NAME;
+ }
+ }
+
+ for (i = 0; i < pevent->nr_events; i++) {
+ event = pevent->events[i];
+ if (event_match(event, sys_name ? &sreg : NULL, &ereg)) {
+ match = 1;
+ if (add_event(events, event) < 0) {
+ fail = 1;
+ break;
+ }
+ }
+ }
+
+ regfree(&ereg);
+ if (sys_name)
+ regfree(&sreg);
+
+ if (!match)
+ return PEVENT_ERRNO__EVENT_NOT_FOUND;
+ if (fail)
+ return PEVENT_ERRNO__MEM_ALLOC_FAILED;
+
+ return 0;
+}
+
+static void free_events(struct event_list *events)
+{
+ struct event_list *event;
+
+ while (events) {
+ event = events;
+ events = events->next;
+ free(event);
+ }
+}
+
+static enum pevent_errno
+create_arg_item(struct event_format *event, const char *token,
+ enum event_type type, struct filter_arg **parg, char *error_str)
+{
+ struct format_field *field;
+ struct filter_arg *arg;
+
+ arg = allocate_arg();
+ if (arg == NULL) {
+ show_error(error_str, "failed to allocate filter arg");
+ return PEVENT_ERRNO__MEM_ALLOC_FAILED;
+ }
+
+ switch (type) {
+
+ case EVENT_SQUOTE:
+ case EVENT_DQUOTE:
+ arg->type = FILTER_ARG_VALUE;
+ arg->value.type =
+ type == EVENT_DQUOTE ? FILTER_STRING : FILTER_CHAR;
+ arg->value.str = strdup(token);
+ if (!arg->value.str) {
+ free_arg(arg);
+ show_error(error_str, "failed to allocate string filter arg");
+ return PEVENT_ERRNO__MEM_ALLOC_FAILED;
+ }
+ break;
+ case EVENT_ITEM:
+ /* if it is a number, then convert it */
+ if (isdigit(token[0])) {
+ arg->type = FILTER_ARG_VALUE;
+ arg->value.type = FILTER_NUMBER;
+ arg->value.val = strtoull(token, NULL, 0);
+ break;
+ }
+ /* Consider this a field */
+ field = pevent_find_any_field(event, token);
+ if (!field) {
+ if (strcmp(token, COMM) != 0) {
+				/* not a field, make it false */
+ arg->type = FILTER_ARG_BOOLEAN;
+ arg->boolean.value = FILTER_FALSE;
+ break;
+ }
+ /* If token is 'COMM' then it is special */
+ field = &comm;
+ }
+ arg->type = FILTER_ARG_FIELD;
+ arg->field.field = field;
+ break;
+ default:
+ free_arg(arg);
+ show_error(error_str, "expected a value but found %s", token);
+ return PEVENT_ERRNO__UNEXPECTED_TYPE;
+ }
+ *parg = arg;
+ return 0;
+}
+
+static struct filter_arg *
+create_arg_op(enum filter_op_type btype)
+{
+ struct filter_arg *arg;
+
+ arg = allocate_arg();
+ if (!arg)
+ return NULL;
+
+ arg->type = FILTER_ARG_OP;
+ arg->op.type = btype;
+
+ return arg;
+}
+
+static struct filter_arg *
+create_arg_exp(enum filter_exp_type etype)
+{
+ struct filter_arg *arg;
+
+ arg = allocate_arg();
+ if (!arg)
+ return NULL;
+
+ arg->type = FILTER_ARG_EXP;
+ arg->op.type = etype;
+
+ return arg;
+}
+
+static struct filter_arg *
+create_arg_cmp(enum filter_exp_type etype)
+{
+ struct filter_arg *arg;
+
+ arg = allocate_arg();
+ if (!arg)
+ return NULL;
+
+ /* Use NUM and change if necessary */
+ arg->type = FILTER_ARG_NUM;
+ arg->op.type = etype;
+
+ return arg;
+}
+
+static enum pevent_errno
+add_right(struct filter_arg *op, struct filter_arg *arg, char *error_str)
+{
+ struct filter_arg *left;
+ char *str;
+ int op_type;
+ int ret;
+
+ switch (op->type) {
+ case FILTER_ARG_EXP:
+ if (op->exp.right)
+ goto out_fail;
+ op->exp.right = arg;
+ break;
+
+ case FILTER_ARG_OP:
+ if (op->op.right)
+ goto out_fail;
+ op->op.right = arg;
+ break;
+
+ case FILTER_ARG_NUM:
+ if (op->op.right)
+ goto out_fail;
+ /*
+ * The arg must be num, str, or field
+ */
+ switch (arg->type) {
+ case FILTER_ARG_VALUE:
+ case FILTER_ARG_FIELD:
+ break;
+ default:
+ show_error(error_str, "Illegal rvalue");
+ return PEVENT_ERRNO__ILLEGAL_RVALUE;
+ }
+
+ /*
+ * Depending on the type, we may need to
+ * convert this to a string or regex.
+ */
+ switch (arg->value.type) {
+ case FILTER_CHAR:
+ /*
+ * A char should be converted to number if
+ * the string is 1 byte, and the compare
+ * is not a REGEX.
+ */
+ if (strlen(arg->value.str) == 1 &&
+ op->num.type != FILTER_CMP_REGEX &&
+ op->num.type != FILTER_CMP_NOT_REGEX) {
+ arg->value.type = FILTER_NUMBER;
+ goto do_int;
+ }
+ /* fall through */
+ case FILTER_STRING:
+
+ /* convert op to a string arg */
+ op_type = op->num.type;
+ left = op->num.left;
+ str = arg->value.str;
+
+ /* reset the op for the new field */
+ memset(op, 0, sizeof(*op));
+
+ /*
+ * If left arg was a field not found then
+ * NULL the entire op.
+ */
+ if (left->type == FILTER_ARG_BOOLEAN) {
+ free_arg(left);
+ free_arg(arg);
+ op->type = FILTER_ARG_BOOLEAN;
+ op->boolean.value = FILTER_FALSE;
+ break;
+ }
+
+ /* Left arg must be a field */
+ if (left->type != FILTER_ARG_FIELD) {
+ show_error(error_str,
+ "Illegal lvalue for string comparison");
+ return PEVENT_ERRNO__ILLEGAL_LVALUE;
+ }
+
+ /* Make sure this is a valid string compare */
+ switch (op_type) {
+ case FILTER_CMP_EQ:
+ op_type = FILTER_CMP_MATCH;
+ break;
+ case FILTER_CMP_NE:
+ op_type = FILTER_CMP_NOT_MATCH;
+ break;
+
+ case FILTER_CMP_REGEX:
+ case FILTER_CMP_NOT_REGEX:
+ ret = regcomp(&op->str.reg, str, REG_ICASE|REG_NOSUB);
+ if (ret) {
+ show_error(error_str,
+ "RegEx '%s' did not compute",
+ str);
+ return PEVENT_ERRNO__INVALID_REGEX;
+ }
+ break;
+ default:
+ show_error(error_str,
+ "Illegal comparison for string");
+ return PEVENT_ERRNO__ILLEGAL_STRING_CMP;
+ }
+
+ op->type = FILTER_ARG_STR;
+ op->str.type = op_type;
+ op->str.field = left->field.field;
+ op->str.val = strdup(str);
+ if (!op->str.val) {
+ show_error(error_str, "Failed to allocate string filter");
+ return PEVENT_ERRNO__MEM_ALLOC_FAILED;
+ }
+ /*
+ * Need a buffer to copy data for tests
+ */
+ op->str.buffer = malloc(op->str.field->size + 1);
+ if (!op->str.buffer) {
+ show_error(error_str, "Failed to allocate string filter");
+ return PEVENT_ERRNO__MEM_ALLOC_FAILED;
+ }
+ /* Null terminate this buffer */
+ op->str.buffer[op->str.field->size] = 0;
+
+ /* We no longer have left or right args */
+ free_arg(arg);
+ free_arg(left);
+
+ break;
+
+ case FILTER_NUMBER:
+
+ do_int:
+ switch (op->num.type) {
+ case FILTER_CMP_REGEX:
+ case FILTER_CMP_NOT_REGEX:
+ show_error(error_str,
+ "Op not allowed with integers");
+ return PEVENT_ERRNO__ILLEGAL_INTEGER_CMP;
+
+ default:
+ break;
+ }
+
+ /* numeric compare */
+ op->num.right = arg;
+ break;
+ default:
+ goto out_fail;
+ }
+ break;
+ default:
+ goto out_fail;
+ }
+
+ return 0;
+
+ out_fail:
+ show_error(error_str, "Syntax error");
+ return PEVENT_ERRNO__SYNTAX_ERROR;
+}
+
+static struct filter_arg *
+rotate_op_right(struct filter_arg *a, struct filter_arg *b)
+{
+ struct filter_arg *arg;
+
+ arg = a->op.right;
+ a->op.right = b;
+ return arg;
+}
+
+static enum pevent_errno add_left(struct filter_arg *op, struct filter_arg *arg)
+{
+ switch (op->type) {
+ case FILTER_ARG_EXP:
+ if (arg->type == FILTER_ARG_OP)
+ arg = rotate_op_right(arg, op);
+ op->exp.left = arg;
+ break;
+
+ case FILTER_ARG_OP:
+ op->op.left = arg;
+ break;
+ case FILTER_ARG_NUM:
+ if (arg->type == FILTER_ARG_OP)
+ arg = rotate_op_right(arg, op);
+
+ /* left arg of compares must be a field */
+ if (arg->type != FILTER_ARG_FIELD &&
+ arg->type != FILTER_ARG_BOOLEAN)
+ return PEVENT_ERRNO__INVALID_ARG_TYPE;
+ op->num.left = arg;
+ break;
+ default:
+ return PEVENT_ERRNO__INVALID_ARG_TYPE;
+ }
+ return 0;
+}
+
+enum op_type {
+ OP_NONE,
+ OP_BOOL,
+ OP_NOT,
+ OP_EXP,
+ OP_CMP,
+};
+
+static enum op_type process_op(const char *token,
+ enum filter_op_type *btype,
+ enum filter_cmp_type *ctype,
+ enum filter_exp_type *etype)
+{
+ *btype = FILTER_OP_NOT;
+ *etype = FILTER_EXP_NONE;
+ *ctype = FILTER_CMP_NONE;
+
+ if (strcmp(token, "&&") == 0)
+ *btype = FILTER_OP_AND;
+ else if (strcmp(token, "||") == 0)
+ *btype = FILTER_OP_OR;
+ else if (strcmp(token, "!") == 0)
+ return OP_NOT;
+
+ if (*btype != FILTER_OP_NOT)
+ return OP_BOOL;
+
+ /* Check for value expressions */
+ if (strcmp(token, "+") == 0) {
+ *etype = FILTER_EXP_ADD;
+ } else if (strcmp(token, "-") == 0) {
+ *etype = FILTER_EXP_SUB;
+ } else if (strcmp(token, "*") == 0) {
+ *etype = FILTER_EXP_MUL;
+ } else if (strcmp(token, "/") == 0) {
+ *etype = FILTER_EXP_DIV;
+ } else if (strcmp(token, "%") == 0) {
+ *etype = FILTER_EXP_MOD;
+ } else if (strcmp(token, ">>") == 0) {
+ *etype = FILTER_EXP_RSHIFT;
+ } else if (strcmp(token, "<<") == 0) {
+ *etype = FILTER_EXP_LSHIFT;
+ } else if (strcmp(token, "&") == 0) {
+ *etype = FILTER_EXP_AND;
+ } else if (strcmp(token, "|") == 0) {
+ *etype = FILTER_EXP_OR;
+ } else if (strcmp(token, "^") == 0) {
+ *etype = FILTER_EXP_XOR;
+ } else if (strcmp(token, "~") == 0)
+ *etype = FILTER_EXP_NOT;
+
+ if (*etype != FILTER_EXP_NONE)
+ return OP_EXP;
+
+ /* Check for compares */
+ if (strcmp(token, "==") == 0)
+ *ctype = FILTER_CMP_EQ;
+ else if (strcmp(token, "!=") == 0)
+ *ctype = FILTER_CMP_NE;
+ else if (strcmp(token, "<") == 0)
+ *ctype = FILTER_CMP_LT;
+ else if (strcmp(token, ">") == 0)
+ *ctype = FILTER_CMP_GT;
+ else if (strcmp(token, "<=") == 0)
+ *ctype = FILTER_CMP_LE;
+ else if (strcmp(token, ">=") == 0)
+ *ctype = FILTER_CMP_GE;
+ else if (strcmp(token, "=~") == 0)
+ *ctype = FILTER_CMP_REGEX;
+ else if (strcmp(token, "!~") == 0)
+ *ctype = FILTER_CMP_NOT_REGEX;
+ else
+ return OP_NONE;
+
+ return OP_CMP;
+}
+
+static int check_op_done(struct filter_arg *arg)
+{
+ switch (arg->type) {
+ case FILTER_ARG_EXP:
+ return arg->exp.right != NULL;
+
+ case FILTER_ARG_OP:
+ return arg->op.right != NULL;
+
+ case FILTER_ARG_NUM:
+ return arg->num.right != NULL;
+
+ case FILTER_ARG_STR:
+ /* A string conversion is always done */
+ return 1;
+
+ case FILTER_ARG_BOOLEAN:
+		/* field not found, that is ok */
+ return 1;
+
+ default:
+ return 0;
+ }
+}
+
+enum filter_vals {
+ FILTER_VAL_NORM,
+ FILTER_VAL_FALSE,
+ FILTER_VAL_TRUE,
+};
+
+static enum pevent_errno
+reparent_op_arg(struct filter_arg *parent, struct filter_arg *old_child,
+ struct filter_arg *arg, char *error_str)
+{
+ struct filter_arg *other_child;
+ struct filter_arg **ptr;
+
+ if (parent->type != FILTER_ARG_OP &&
+ arg->type != FILTER_ARG_OP) {
+ show_error(error_str, "can not reparent other than OP");
+ return PEVENT_ERRNO__REPARENT_NOT_OP;
+ }
+
+ /* Get the sibling */
+ if (old_child->op.right == arg) {
+ ptr = &old_child->op.right;
+ other_child = old_child->op.left;
+ } else if (old_child->op.left == arg) {
+ ptr = &old_child->op.left;
+ other_child = old_child->op.right;
+ } else {
+ show_error(error_str, "Error in reparent op, find other child");
+ return PEVENT_ERRNO__REPARENT_FAILED;
+ }
+
+ /* Detach arg from old_child */
+ *ptr = NULL;
+
+ /* Check for root */
+ if (parent == old_child) {
+ free_arg(other_child);
+ *parent = *arg;
+		/* Free arg without recursion */
+ free(arg);
+ return 0;
+ }
+
+ if (parent->op.right == old_child)
+ ptr = &parent->op.right;
+ else if (parent->op.left == old_child)
+ ptr = &parent->op.left;
+ else {
+ show_error(error_str, "Error in reparent op");
+ return PEVENT_ERRNO__REPARENT_FAILED;
+ }
+
+ *ptr = arg;
+
+ free_arg(old_child);
+ return 0;
+}
+
+/* Returns either filter_vals (success) or pevent_errno (failure) */
+static int test_arg(struct filter_arg *parent, struct filter_arg *arg,
+ char *error_str)
+{
+ int lval, rval;
+
+ switch (arg->type) {
+
+ /* bad case */
+ case FILTER_ARG_BOOLEAN:
+ return FILTER_VAL_FALSE + arg->boolean.value;
+
+ /* good cases: */
+ case FILTER_ARG_STR:
+ case FILTER_ARG_VALUE:
+ case FILTER_ARG_FIELD:
+ return FILTER_VAL_NORM;
+
+ case FILTER_ARG_EXP:
+ lval = test_arg(arg, arg->exp.left, error_str);
+ if (lval != FILTER_VAL_NORM)
+ return lval;
+ rval = test_arg(arg, arg->exp.right, error_str);
+ if (rval != FILTER_VAL_NORM)
+ return rval;
+ return FILTER_VAL_NORM;
+
+ case FILTER_ARG_NUM:
+ lval = test_arg(arg, arg->num.left, error_str);
+ if (lval != FILTER_VAL_NORM)
+ return lval;
+ rval = test_arg(arg, arg->num.right, error_str);
+ if (rval != FILTER_VAL_NORM)
+ return rval;
+ return FILTER_VAL_NORM;
+
+ case FILTER_ARG_OP:
+ if (arg->op.type != FILTER_OP_NOT) {
+ lval = test_arg(arg, arg->op.left, error_str);
+ switch (lval) {
+ case FILTER_VAL_NORM:
+ break;
+ case FILTER_VAL_TRUE:
+ if (arg->op.type == FILTER_OP_OR)
+ return FILTER_VAL_TRUE;
+ rval = test_arg(arg, arg->op.right, error_str);
+ if (rval != FILTER_VAL_NORM)
+ return rval;
+
+ return reparent_op_arg(parent, arg, arg->op.right,
+ error_str);
+
+ case FILTER_VAL_FALSE:
+ if (arg->op.type == FILTER_OP_AND)
+ return FILTER_VAL_FALSE;
+ rval = test_arg(arg, arg->op.right, error_str);
+ if (rval != FILTER_VAL_NORM)
+ return rval;
+
+ return reparent_op_arg(parent, arg, arg->op.right,
+ error_str);
+
+ default:
+ return lval;
+ }
+ }
+
+ rval = test_arg(arg, arg->op.right, error_str);
+ switch (rval) {
+ case FILTER_VAL_NORM:
+ default:
+ break;
+
+ case FILTER_VAL_TRUE:
+ if (arg->op.type == FILTER_OP_OR)
+ return FILTER_VAL_TRUE;
+ if (arg->op.type == FILTER_OP_NOT)
+ return FILTER_VAL_FALSE;
+
+ return reparent_op_arg(parent, arg, arg->op.left,
+ error_str);
+
+ case FILTER_VAL_FALSE:
+ if (arg->op.type == FILTER_OP_AND)
+ return FILTER_VAL_FALSE;
+ if (arg->op.type == FILTER_OP_NOT)
+ return FILTER_VAL_TRUE;
+
+ return reparent_op_arg(parent, arg, arg->op.left,
+ error_str);
+ }
+
+ return rval;
+ default:
+ show_error(error_str, "bad arg in filter tree");
+ return PEVENT_ERRNO__BAD_FILTER_ARG;
+ }
+ return FILTER_VAL_NORM;
+}
+
+/* Remove any unknown event fields */
+static int collapse_tree(struct filter_arg *arg,
+ struct filter_arg **arg_collapsed, char *error_str)
+{
+ int ret;
+
+ ret = test_arg(arg, arg, error_str);
+ switch (ret) {
+ case FILTER_VAL_NORM:
+ break;
+
+ case FILTER_VAL_TRUE:
+ case FILTER_VAL_FALSE:
+ free_arg(arg);
+ arg = allocate_arg();
+ if (arg) {
+ arg->type = FILTER_ARG_BOOLEAN;
+ arg->boolean.value = ret == FILTER_VAL_TRUE;
+ } else {
+ show_error(error_str, "Failed to allocate filter arg");
+ ret = PEVENT_ERRNO__MEM_ALLOC_FAILED;
+ }
+ break;
+
+ default:
+ /* test_arg() already set the error_str */
+ free_arg(arg);
+ arg = NULL;
+ break;
+ }
+
+ *arg_collapsed = arg;
+ return ret;
+}
+
+static enum pevent_errno
+process_filter(struct event_format *event, struct filter_arg **parg,
+ char *error_str, int not)
+{
+ enum event_type type;
+ char *token = NULL;
+ struct filter_arg *current_op = NULL;
+ struct filter_arg *current_exp = NULL;
+ struct filter_arg *left_item = NULL;
+ struct filter_arg *arg = NULL;
+ enum op_type op_type;
+ enum filter_op_type btype;
+ enum filter_exp_type etype;
+ enum filter_cmp_type ctype;
+ enum pevent_errno ret;
+
+ *parg = NULL;
+
+ do {
+ free(token);
+ type = read_token(&token);
+ switch (type) {
+ case EVENT_SQUOTE:
+ case EVENT_DQUOTE:
+ case EVENT_ITEM:
+ ret = create_arg_item(event, token, type, &arg, error_str);
+ if (ret < 0)
+ goto fail;
+ if (!left_item)
+ left_item = arg;
+ else if (current_exp) {
+ ret = add_right(current_exp, arg, error_str);
+ if (ret < 0)
+ goto fail;
+ left_item = NULL;
+				/* A not takes only one expression */
+ if (not) {
+ arg = NULL;
+ if (current_op)
+ goto fail_syntax;
+ free(token);
+ *parg = current_exp;
+ return 0;
+ }
+ } else
+ goto fail_syntax;
+ arg = NULL;
+ break;
+
+ case EVENT_DELIM:
+ if (*token == ',') {
+ show_error(error_str, "Illegal token ','");
+ ret = PEVENT_ERRNO__ILLEGAL_TOKEN;
+ goto fail;
+ }
+
+ if (*token == '(') {
+ if (left_item) {
+ show_error(error_str,
+ "Open paren can not come after item");
+ ret = PEVENT_ERRNO__INVALID_PAREN;
+ goto fail;
+ }
+ if (current_exp) {
+ show_error(error_str,
+ "Open paren can not come after expression");
+ ret = PEVENT_ERRNO__INVALID_PAREN;
+ goto fail;
+ }
+
+ ret = process_filter(event, &arg, error_str, 0);
+ if (ret != PEVENT_ERRNO__UNBALANCED_PAREN) {
+ if (ret == 0) {
+ show_error(error_str,
+ "Unbalanced number of '('");
+ ret = PEVENT_ERRNO__UNBALANCED_PAREN;
+ }
+ goto fail;
+ }
+ ret = 0;
+
+ /* A not wants just one expression */
+ if (not) {
+ if (current_op)
+ goto fail_syntax;
+ *parg = arg;
+ return 0;
+ }
+
+ if (current_op)
+ ret = add_right(current_op, arg, error_str);
+ else
+ current_exp = arg;
+
+ if (ret < 0)
+ goto fail;
+
+ } else { /* ')' */
+ if (!current_op && !current_exp)
+ goto fail_syntax;
+
+ /* Make sure everything is finished at this level */
+ if (current_exp && !check_op_done(current_exp))
+ goto fail_syntax;
+ if (current_op && !check_op_done(current_op))
+ goto fail_syntax;
+
+ if (current_op)
+ *parg = current_op;
+ else
+ *parg = current_exp;
+ return PEVENT_ERRNO__UNBALANCED_PAREN;
+ }
+ break;
+
+ case EVENT_OP:
+ op_type = process_op(token, &btype, &ctype, &etype);
+
+ /* All expect a left arg except for NOT */
+ switch (op_type) {
+ case OP_BOOL:
+ /* Logic ops need a left expression */
+ if (!current_exp && !current_op)
+ goto fail_syntax;
+ /* fall through */
+ case OP_NOT:
+ /* logic only processes ops and exp */
+ if (left_item)
+ goto fail_syntax;
+ break;
+ case OP_EXP:
+ case OP_CMP:
+ if (!left_item)
+ goto fail_syntax;
+ break;
+ case OP_NONE:
+ show_error(error_str,
+ "Unknown op token %s", token);
+ ret = PEVENT_ERRNO__UNKNOWN_TOKEN;
+ goto fail;
+ }
+
+ ret = 0;
+ switch (op_type) {
+ case OP_BOOL:
+ arg = create_arg_op(btype);
+ if (arg == NULL)
+ goto fail_alloc;
+ if (current_op)
+ ret = add_left(arg, current_op);
+ else
+ ret = add_left(arg, current_exp);
+ current_op = arg;
+ current_exp = NULL;
+ break;
+
+ case OP_NOT:
+ arg = create_arg_op(btype);
+ if (arg == NULL)
+ goto fail_alloc;
+ if (current_op)
+ ret = add_right(current_op, arg, error_str);
+ if (ret < 0)
+ goto fail;
+ current_exp = arg;
+ ret = process_filter(event, &arg, error_str, 1);
+ if (ret < 0)
+ goto fail;
+ ret = add_right(current_exp, arg, error_str);
+ if (ret < 0)
+ goto fail;
+ break;
+
+ case OP_EXP:
+ case OP_CMP:
+ if (op_type == OP_EXP)
+ arg = create_arg_exp(etype);
+ else
+ arg = create_arg_cmp(ctype);
+ if (arg == NULL)
+ goto fail_alloc;
+
+ if (current_op)
+ ret = add_right(current_op, arg, error_str);
+ if (ret < 0)
+ goto fail;
+ ret = add_left(arg, left_item);
+ if (ret < 0) {
+ arg = NULL;
+ goto fail_syntax;
+ }
+ current_exp = arg;
+ break;
+ default:
+ break;
+ }
+ arg = NULL;
+ if (ret < 0)
+ goto fail_syntax;
+ break;
+ case EVENT_NONE:
+ break;
+ case EVENT_ERROR:
+ goto fail_alloc;
+ default:
+ goto fail_syntax;
+ }
+ } while (type != EVENT_NONE);
+
+ if (!current_op && !current_exp)
+ goto fail_syntax;
+
+ if (!current_op)
+ current_op = current_exp;
+
+	ret = collapse_tree(current_op, parg, error_str);
+	/* collapse_tree() sets *parg and may free or replace current_op */
+	current_op = current_exp = NULL;
+	if (ret < 0)
+		goto fail;
+
+ return 0;
+
+ fail_alloc:
+ show_error(error_str, "failed to allocate filter arg");
+ ret = PEVENT_ERRNO__MEM_ALLOC_FAILED;
+ goto fail;
+ fail_syntax:
+ show_error(error_str, "Syntax error");
+ ret = PEVENT_ERRNO__SYNTAX_ERROR;
+ fail:
+ free_arg(current_op);
+ free_arg(current_exp);
+ free_arg(arg);
+ free(token);
+ return ret;
+}
+
+static enum pevent_errno
+process_event(struct event_format *event, const char *filter_str,
+ struct filter_arg **parg, char *error_str)
+{
+ int ret;
+
+ pevent_buffer_init(filter_str, strlen(filter_str));
+
+ ret = process_filter(event, parg, error_str, 0);
+ if (ret < 0)
+ return ret;
+
+ /* If parg is NULL, then make it into FALSE */
+ if (!*parg) {
+ *parg = allocate_arg();
+ if (*parg == NULL)
+ return PEVENT_ERRNO__MEM_ALLOC_FAILED;
+
+ (*parg)->type = FILTER_ARG_BOOLEAN;
+ (*parg)->boolean.value = FILTER_FALSE;
+ }
+
+ return 0;
+}
+
+static enum pevent_errno
+filter_event(struct event_filter *filter, struct event_format *event,
+ const char *filter_str, char *error_str)
+{
+ struct filter_type *filter_type;
+ struct filter_arg *arg;
+ enum pevent_errno ret;
+
+ if (filter_str) {
+ ret = process_event(event, filter_str, &arg, error_str);
+ if (ret < 0)
+ return ret;
+
+ } else {
+ /* just add a TRUE arg */
+ arg = allocate_arg();
+ if (arg == NULL)
+ return PEVENT_ERRNO__MEM_ALLOC_FAILED;
+
+ arg->type = FILTER_ARG_BOOLEAN;
+ arg->boolean.value = FILTER_TRUE;
+ }
+
+ filter_type = add_filter_type(filter, event->id);
+ if (filter_type == NULL)
+ return PEVENT_ERRNO__MEM_ALLOC_FAILED;
+
+ if (filter_type->filter)
+ free_arg(filter_type->filter);
+ filter_type->filter = arg;
+
+ return 0;
+}
+
+static void filter_init_error_buf(struct event_filter *filter)
+{
+ /* clear buffer to reset show error */
+ pevent_buffer_init("", 0);
+ filter->error_buffer[0] = '\0';
+}
+
+/**
+ * pevent_filter_add_filter_str - add a new filter
+ * @filter: the event filter to add to
+ * @filter_str: the filter string that contains the filter
+ *
+ * Returns 0 if the filter was successfully added or a
+ * negative error code. Use pevent_filter_strerror() to see
+ * actual error message in case of error.
+ */
+enum pevent_errno pevent_filter_add_filter_str(struct event_filter *filter,
+ const char *filter_str)
+{
+ struct pevent *pevent = filter->pevent;
+ struct event_list *event;
+ struct event_list *events = NULL;
+ const char *filter_start;
+ const char *next_event;
+ char *this_event;
+ char *event_name = NULL;
+ char *sys_name = NULL;
+ char *sp;
+ enum pevent_errno rtn = 0; /* PEVENT_ERRNO__SUCCESS */
+ int len;
+ int ret;
+
+ filter_init_error_buf(filter);
+
+ filter_start = strchr(filter_str, ':');
+ if (filter_start)
+ len = filter_start - filter_str;
+ else
+ len = strlen(filter_str);
+
+ do {
+ next_event = strchr(filter_str, ',');
+ if (next_event &&
+ (!filter_start || next_event < filter_start))
+ len = next_event - filter_str;
+ else if (filter_start)
+ len = filter_start - filter_str;
+ else
+ len = strlen(filter_str);
+
+ this_event = malloc(len + 1);
+ if (this_event == NULL) {
+ /* This can only happen when events is NULL, but still */
+ free_events(events);
+ return PEVENT_ERRNO__MEM_ALLOC_FAILED;
+ }
+ memcpy(this_event, filter_str, len);
+ this_event[len] = 0;
+
+ if (next_event)
+ next_event++;
+
+ filter_str = next_event;
+
+ sys_name = strtok_r(this_event, "/", &sp);
+ event_name = strtok_r(NULL, "/", &sp);
+
+ if (!sys_name) {
+ /* This can only happen when events is NULL, but still */
+ free_events(events);
+ free(this_event);
+ return PEVENT_ERRNO__FILTER_NOT_FOUND;
+ }
+
+ /* Find this event */
+ ret = find_event(pevent, &events, strim(sys_name), strim(event_name));
+ if (ret < 0) {
+ free_events(events);
+ free(this_event);
+ return ret;
+ }
+ free(this_event);
+ } while (filter_str);
+
+ /* Skip the ':' */
+ if (filter_start)
+ filter_start++;
+
+ /* filter starts here */
+ for (event = events; event; event = event->next) {
+ ret = filter_event(filter, event->event, filter_start,
+ filter->error_buffer);
+ /* Failures are returned if a parse error happened */
+ if (ret < 0)
+ rtn = ret;
+
+ if (ret >= 0 && pevent->test_filters) {
+ char *test;
+ test = pevent_filter_make_string(filter, event->event->id);
+ if (test) {
+ printf(" '%s: %s'\n", event->event->name, test);
+ free(test);
+ }
+ }
+ }
+
+ free_events(events);
+
+ if (rtn >= 0 && pevent->test_filters)
+ exit(0);
+
+ return rtn;
+}
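
As a rough usage sketch (not part of the patch): a caller that already has a populated struct pevent would typically build a filter as below. The pevent_filter_alloc() helper is assumed to come from earlier in this file, the filter string follows the "system/event: expression" form parsed above, and errors are turned into text with pevent_filter_strerror(), defined just below.

static int install_switch_filter(struct pevent *pevent,
				 struct event_filter **filterp)
{
	struct event_filter *filter = pevent_filter_alloc(pevent);
	enum pevent_errno err;
	char buf[256];

	if (!filter)
		return -1;

	/* keep only switches into the idle task */
	err = pevent_filter_add_filter_str(filter,
			"sched/sched_switch: next_pid == 0");
	if (err < 0) {
		pevent_filter_strerror(filter, err, buf, sizeof(buf));
		fprintf(stderr, "filter error: %s\n", buf);
		pevent_filter_free(filter);
		return -1;
	}

	*filterp = filter;
	return 0;
}
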
+
+static void free_filter_type(struct filter_type *filter_type)
+{
+ free_arg(filter_type->filter);
+}
+
+/**
+ * pevent_filter_strerror - fill error message in a buffer
+ * @filter: the event filter that contains the error
+ * @err: the error code
+ * @buf: the buffer to be filled in
+ * @buflen: the size of the buffer
+ *
+ * Returns 0 if the message was filled successfully, -1 on error
+ */
+int pevent_filter_strerror(struct event_filter *filter, enum pevent_errno err,
+ char *buf, size_t buflen)
+{
+ if (err <= __PEVENT_ERRNO__START || err >= __PEVENT_ERRNO__END)
+ return -1;
+
+ if (strlen(filter->error_buffer) > 0) {
+ size_t len = snprintf(buf, buflen, "%s", filter->error_buffer);
+
+ if (len > buflen)
+ return -1;
+ return 0;
+ }
+
+ return pevent_strerror(filter->pevent, err, buf, buflen);
+}
+
+/**
+ * pevent_filter_remove_event - remove a filter for an event
+ * @filter: the event filter to remove from
+ * @event_id: the event to remove a filter for
+ *
+ * Removes the filter saved for an event defined by @event_id
+ * from the @filter.
+ *
+ * Returns 1: if an event was removed
+ * 0: if the event was not found
+ */
+int pevent_filter_remove_event(struct event_filter *filter,
+ int event_id)
+{
+ struct filter_type *filter_type;
+ unsigned long len;
+
+ if (!filter->filters)
+ return 0;
+
+ filter_type = find_filter_type(filter, event_id);
+
+ if (!filter_type)
+ return 0;
+
+ free_filter_type(filter_type);
+
+ /* The filter_type points into the event_filters array */
+ len = (unsigned long)(filter->event_filters + filter->filters) -
+ (unsigned long)(filter_type + 1);
+
+ memmove(filter_type, filter_type + 1, len);
+ filter->filters--;
+
+ memset(&filter->event_filters[filter->filters], 0,
+ sizeof(*filter_type));
+
+ return 1;
+}
+
+/**
+ * pevent_filter_reset - clear all filters in a filter
+ * @filter: the event filter to reset
+ *
+ * Removes all filters from a filter and resets it.
+ */
+void pevent_filter_reset(struct event_filter *filter)
+{
+ int i;
+
+ for (i = 0; i < filter->filters; i++)
+ free_filter_type(&filter->event_filters[i]);
+
+ free(filter->event_filters);
+ filter->filters = 0;
+ filter->event_filters = NULL;
+}
+
+void pevent_filter_free(struct event_filter *filter)
+{
+ pevent_unref(filter->pevent);
+
+ pevent_filter_reset(filter);
+
+ free(filter);
+}
+
+static char *arg_to_str(struct event_filter *filter, struct filter_arg *arg);
+
+static int copy_filter_type(struct event_filter *filter,
+ struct event_filter *source,
+ struct filter_type *filter_type)
+{
+ struct filter_arg *arg;
+ struct event_format *event;
+ const char *sys;
+ const char *name;
+ char *str;
+
+	/* Can't assume that the pevents are the same */
+ sys = filter_type->event->system;
+ name = filter_type->event->name;
+ event = pevent_find_event_by_name(filter->pevent, sys, name);
+ if (!event)
+ return -1;
+
+ str = arg_to_str(source, filter_type->filter);
+ if (!str)
+ return -1;
+
+ if (strcmp(str, "TRUE") == 0 || strcmp(str, "FALSE") == 0) {
+ /* Add trivial event */
+ arg = allocate_arg();
+ if (arg == NULL)
+ return -1;
+
+ arg->type = FILTER_ARG_BOOLEAN;
+ if (strcmp(str, "TRUE") == 0)
+ arg->boolean.value = 1;
+ else
+ arg->boolean.value = 0;
+
+ filter_type = add_filter_type(filter, event->id);
+ if (filter_type == NULL)
+ return -1;
+
+ filter_type->filter = arg;
+
+ free(str);
+ return 0;
+ }
+
+ filter_event(filter, event, str, NULL);
+ free(str);
+
+ return 0;
+}
+
+/**
+ * pevent_filter_copy - copy a filter using another filter
+ * @dest: the filter to copy to
+ * @source: the filter to copy from
+ *
+ * Returns 0 on success and -1 if not all filters were copied
+ */
+int pevent_filter_copy(struct event_filter *dest, struct event_filter *source)
+{
+ int ret = 0;
+ int i;
+
+ pevent_filter_reset(dest);
+
+ for (i = 0; i < source->filters; i++) {
+ if (copy_filter_type(dest, source, &source->event_filters[i]))
+ ret = -1;
+ }
+ return ret;
+}
+
+
+/**
+ * pevent_update_trivial - update the trivial filters with the given filter
+ * @dest: the filter to update
+ * @source: the filter to use as the source of the update
+ * @type: the type of trivial filter to update
+ *
+ * Scan @dest for trivial events matching @type and replace them with
+ * the corresponding filter from @source.
+ *
+ * Returns 0 on success and -1 if there was a problem updating, but
+ * events may have still been updated on error.
+ */
+int pevent_update_trivial(struct event_filter *dest, struct event_filter *source,
+ enum filter_trivial_type type)
+{
+ struct pevent *src_pevent;
+ struct pevent *dest_pevent;
+ struct event_format *event;
+ struct filter_type *filter_type;
+ struct filter_arg *arg;
+ char *str;
+ int i;
+
+ src_pevent = source->pevent;
+ dest_pevent = dest->pevent;
+
+ /* Do nothing if either of the filters has nothing to filter */
+ if (!dest->filters || !source->filters)
+ return 0;
+
+ for (i = 0; i < dest->filters; i++) {
+ filter_type = &dest->event_filters[i];
+ arg = filter_type->filter;
+ if (arg->type != FILTER_ARG_BOOLEAN)
+ continue;
+ if ((arg->boolean.value && type == FILTER_TRIVIAL_FALSE) ||
+ (!arg->boolean.value && type == FILTER_TRIVIAL_TRUE))
+ continue;
+
+ event = filter_type->event;
+
+ if (src_pevent != dest_pevent) {
+ /* do a look up */
+ event = pevent_find_event_by_name(src_pevent,
+ event->system,
+ event->name);
+ if (!event)
+ return -1;
+ }
+
+ str = pevent_filter_make_string(source, event->id);
+ if (!str)
+ continue;
+
+ /* Don't bother if the filter is trivial too */
+ if (strcmp(str, "TRUE") != 0 && strcmp(str, "FALSE") != 0)
+ filter_event(dest, event, str, NULL);
+ free(str);
+ }
+ return 0;
+}
+
+/**
+ * pevent_filter_clear_trivial - clear TRUE and FALSE filters
+ * @filter: the filter to remove trivial filters from
+ * @type: remove only true, false, or both
+ *
+ * Removes filters that only contain a TRUE or FALSE boolean arg.
+ *
+ * Returns 0 on success and -1 if there was a problem.
+ */
+int pevent_filter_clear_trivial(struct event_filter *filter,
+ enum filter_trivial_type type)
+{
+ struct filter_type *filter_type;
+ int count = 0;
+ int *ids = NULL;
+ int i;
+
+ if (!filter->filters)
+ return 0;
+
+ /*
+ * Two steps, first get all ids with trivial filters.
+ * then remove those ids.
+ */
+ for (i = 0; i < filter->filters; i++) {
+ int *new_ids;
+
+ filter_type = &filter->event_filters[i];
+ if (filter_type->filter->type != FILTER_ARG_BOOLEAN)
+ continue;
+ switch (type) {
+		case FILTER_TRIVIAL_FALSE:
+			if (filter_type->filter->boolean.value)
+				continue;
+			/* this is an always-FALSE filter: collect it below */
+			break;
+ case FILTER_TRIVIAL_TRUE:
+ if (!filter_type->filter->boolean.value)
+ continue;
+ default:
+ break;
+ }
+
+ new_ids = realloc(ids, sizeof(*ids) * (count + 1));
+ if (!new_ids) {
+ free(ids);
+ return -1;
+ }
+
+ ids = new_ids;
+ ids[count++] = filter_type->event_id;
+ }
+
+ if (!count)
+ return 0;
+
+ for (i = 0; i < count; i++)
+ pevent_filter_remove_event(filter, ids[i]);
+
+ free(ids);
+ return 0;
+}
+
+/**
+ * pevent_filter_event_has_trivial - return true if the event contains a trivial filter
+ * @filter: the filter with the information
+ * @event_id: the id of the event to test
+ * @type: trivial type to test for (TRUE, FALSE, EITHER)
+ *
+ * Returns 1 if the event contains a matching trivial type,
+ * otherwise 0.
+ */
+int pevent_filter_event_has_trivial(struct event_filter *filter,
+ int event_id,
+ enum filter_trivial_type type)
+{
+ struct filter_type *filter_type;
+
+ if (!filter->filters)
+ return 0;
+
+ filter_type = find_filter_type(filter, event_id);
+
+ if (!filter_type)
+ return 0;
+
+ if (filter_type->filter->type != FILTER_ARG_BOOLEAN)
+ return 0;
+
+ switch (type) {
+ case FILTER_TRIVIAL_FALSE:
+ return !filter_type->filter->boolean.value;
+
+ case FILTER_TRIVIAL_TRUE:
+ return filter_type->filter->boolean.value;
+ default:
+ return 1;
+ }
+}
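
The trivial-filter helpers above are meant to work together with pevent_filter_copy(): after copying, the TRUE/FALSE entries can be refreshed from the source and the events that became unconditionally filtered out can be dropped. A sketch of that pattern, assuming the FILTER_TRIVIAL_BOTH enumerator from event-parse.h and caller-owned dst/src filters:

static int clone_filter(struct event_filter *dst, struct event_filter *src)
{
	if (pevent_filter_copy(dst, src) < 0)
		return -1;

	/* re-sync TRUE/FALSE entries, then drop the always-false ones */
	pevent_update_trivial(dst, src, FILTER_TRIVIAL_BOTH);
	return pevent_filter_clear_trivial(dst, FILTER_TRIVIAL_FALSE);
}
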
+
+static int test_filter(struct event_format *event, struct filter_arg *arg,
+ struct pevent_record *record, enum pevent_errno *err);
+
+static const char *
+get_comm(struct event_format *event, struct pevent_record *record)
+{
+ const char *comm;
+ int pid;
+
+ pid = pevent_data_pid(event->pevent, record);
+ comm = pevent_data_comm_from_pid(event->pevent, pid);
+ return comm;
+}
+
+static unsigned long long
+get_value(struct event_format *event,
+ struct format_field *field, struct pevent_record *record)
+{
+ unsigned long long val;
+
+ /* Handle our dummy "comm" field */
+ if (field == &comm) {
+ const char *name;
+
+ name = get_comm(event, record);
+ return (unsigned long)name;
+ }
+
+ pevent_read_number_field(field, record->data, &val);
+
+ if (!(field->flags & FIELD_IS_SIGNED))
+ return val;
+
+ switch (field->size) {
+ case 1:
+ return (char)val;
+ case 2:
+ return (short)val;
+ case 4:
+ return (int)val;
+ case 8:
+ return (long long)val;
+ }
+ return val;
+}
+
+static unsigned long long
+get_arg_value(struct event_format *event, struct filter_arg *arg,
+ struct pevent_record *record, enum pevent_errno *err);
+
+static unsigned long long
+get_exp_value(struct event_format *event, struct filter_arg *arg,
+ struct pevent_record *record, enum pevent_errno *err)
+{
+ unsigned long long lval, rval;
+
+ lval = get_arg_value(event, arg->exp.left, record, err);
+ rval = get_arg_value(event, arg->exp.right, record, err);
+
+ if (*err) {
+ /*
+ * There was an error, no need to process anymore.
+ */
+ return 0;
+ }
+
+ switch (arg->exp.type) {
+ case FILTER_EXP_ADD:
+ return lval + rval;
+
+ case FILTER_EXP_SUB:
+ return lval - rval;
+
+ case FILTER_EXP_MUL:
+ return lval * rval;
+
+ case FILTER_EXP_DIV:
+ return lval / rval;
+
+ case FILTER_EXP_MOD:
+ return lval % rval;
+
+ case FILTER_EXP_RSHIFT:
+ return lval >> rval;
+
+ case FILTER_EXP_LSHIFT:
+ return lval << rval;
+
+ case FILTER_EXP_AND:
+ return lval & rval;
+
+ case FILTER_EXP_OR:
+ return lval | rval;
+
+ case FILTER_EXP_XOR:
+ return lval ^ rval;
+
+ case FILTER_EXP_NOT:
+ default:
+ if (!*err)
+ *err = PEVENT_ERRNO__INVALID_EXP_TYPE;
+ }
+ return 0;
+}
+
+static unsigned long long
+get_arg_value(struct event_format *event, struct filter_arg *arg,
+ struct pevent_record *record, enum pevent_errno *err)
+{
+ switch (arg->type) {
+ case FILTER_ARG_FIELD:
+ return get_value(event, arg->field.field, record);
+
+ case FILTER_ARG_VALUE:
+ if (arg->value.type != FILTER_NUMBER) {
+ if (!*err)
+ *err = PEVENT_ERRNO__NOT_A_NUMBER;
+ }
+ return arg->value.val;
+
+ case FILTER_ARG_EXP:
+ return get_exp_value(event, arg, record, err);
+
+ default:
+ if (!*err)
+ *err = PEVENT_ERRNO__INVALID_ARG_TYPE;
+ }
+ return 0;
+}
+
+static int test_num(struct event_format *event, struct filter_arg *arg,
+ struct pevent_record *record, enum pevent_errno *err)
+{
+ unsigned long long lval, rval;
+
+ lval = get_arg_value(event, arg->num.left, record, err);
+ rval = get_arg_value(event, arg->num.right, record, err);
+
+ if (*err) {
+ /*
+ * There was an error, no need to process anymore.
+ */
+ return 0;
+ }
+
+ switch (arg->num.type) {
+ case FILTER_CMP_EQ:
+ return lval == rval;
+
+ case FILTER_CMP_NE:
+ return lval != rval;
+
+ case FILTER_CMP_GT:
+ return lval > rval;
+
+ case FILTER_CMP_LT:
+ return lval < rval;
+
+ case FILTER_CMP_GE:
+ return lval >= rval;
+
+ case FILTER_CMP_LE:
+ return lval <= rval;
+
+ default:
+ if (!*err)
+ *err = PEVENT_ERRNO__ILLEGAL_INTEGER_CMP;
+ return 0;
+ }
+}
+
+static const char *get_field_str(struct filter_arg *arg, struct pevent_record *record)
+{
+ struct event_format *event;
+ struct pevent *pevent;
+ unsigned long long addr;
+ const char *val = NULL;
+ char hex[64];
+
+ /* If the field is not a string convert it */
+ if (arg->str.field->flags & FIELD_IS_STRING) {
+ val = record->data + arg->str.field->offset;
+
+ /*
+ * We need to copy the data since we can't be sure the field
+ * is null terminated.
+ */
+ if (*(val + arg->str.field->size - 1)) {
+ /* copy it */
+ memcpy(arg->str.buffer, val, arg->str.field->size);
+ /* the buffer is already NULL terminated */
+ val = arg->str.buffer;
+ }
+
+ } else {
+ event = arg->str.field->event;
+ pevent = event->pevent;
+ addr = get_value(event, arg->str.field, record);
+
+ if (arg->str.field->flags & (FIELD_IS_POINTER | FIELD_IS_LONG))
+ /* convert to a kernel symbol */
+ val = pevent_find_function(pevent, addr);
+
+ if (val == NULL) {
+ /* just use the hex of the string name */
+ snprintf(hex, 64, "0x%llx", addr);
+ val = hex;
+ }
+ }
+
+ return val;
+}
+
+static int test_str(struct event_format *event, struct filter_arg *arg,
+ struct pevent_record *record, enum pevent_errno *err)
+{
+ const char *val;
+
+ if (arg->str.field == &comm)
+ val = get_comm(event, record);
+ else
+ val = get_field_str(arg, record);
+
+ switch (arg->str.type) {
+ case FILTER_CMP_MATCH:
+ return strcmp(val, arg->str.val) == 0;
+
+ case FILTER_CMP_NOT_MATCH:
+ return strcmp(val, arg->str.val) != 0;
+
+ case FILTER_CMP_REGEX:
+ /* Returns zero on match */
+ return !regexec(&arg->str.reg, val, 0, NULL, 0);
+
+ case FILTER_CMP_NOT_REGEX:
+ return regexec(&arg->str.reg, val, 0, NULL, 0);
+
+ default:
+ if (!*err)
+ *err = PEVENT_ERRNO__ILLEGAL_STRING_CMP;
+ return 0;
+ }
+}
+
+static int test_op(struct event_format *event, struct filter_arg *arg,
+ struct pevent_record *record, enum pevent_errno *err)
+{
+ switch (arg->op.type) {
+ case FILTER_OP_AND:
+ return test_filter(event, arg->op.left, record, err) &&
+ test_filter(event, arg->op.right, record, err);
+
+ case FILTER_OP_OR:
+ return test_filter(event, arg->op.left, record, err) ||
+ test_filter(event, arg->op.right, record, err);
+
+ case FILTER_OP_NOT:
+ return !test_filter(event, arg->op.right, record, err);
+
+ default:
+ if (!*err)
+ *err = PEVENT_ERRNO__INVALID_OP_TYPE;
+ return 0;
+ }
+}
+
+static int test_filter(struct event_format *event, struct filter_arg *arg,
+ struct pevent_record *record, enum pevent_errno *err)
+{
+ if (*err) {
+ /*
+ * There was an error, no need to process anymore.
+ */
+ return 0;
+ }
+
+ switch (arg->type) {
+ case FILTER_ARG_BOOLEAN:
+ /* easy case */
+ return arg->boolean.value;
+
+ case FILTER_ARG_OP:
+ return test_op(event, arg, record, err);
+
+ case FILTER_ARG_NUM:
+ return test_num(event, arg, record, err);
+
+ case FILTER_ARG_STR:
+ return test_str(event, arg, record, err);
+
+ case FILTER_ARG_EXP:
+ case FILTER_ARG_VALUE:
+ case FILTER_ARG_FIELD:
+ /*
+ * Expressions, fields and values evaluate
+ * to true if they return non zero
+ */
+ return !!get_arg_value(event, arg, record, err);
+
+ default:
+ if (!*err)
+ *err = PEVENT_ERRNO__INVALID_ARG_TYPE;
+ return 0;
+ }
+}
+
+/**
+ * pevent_event_filtered - return true if event has filter
+ * @filter: filter struct with filter information
+ * @event_id: event id to test if filter exists
+ *
+ * Returns 1 if a filter is found for @event_id,
+ * otherwise 0.
+ */
+int pevent_event_filtered(struct event_filter *filter, int event_id)
+{
+ struct filter_type *filter_type;
+
+ if (!filter->filters)
+ return 0;
+
+ filter_type = find_filter_type(filter, event_id);
+
+ return filter_type ? 1 : 0;
+}
+
+/**
+ * pevent_filter_match - test if a record matches a filter
+ * @filter: filter struct with filter information
+ * @record: the record to test against the filter
+ *
+ * Returns: match result or error code (prefixed with PEVENT_ERRNO__)
+ * FILTER_MATCH - filter found for event and @record matches
+ * FILTER_MISS - filter found for event and @record does not match
+ * FILTER_NOT_FOUND - no filter found for @record's event
+ * NO_FILTER - if no filters exist
+ * otherwise - error occurred during test
+ */
+enum pevent_errno pevent_filter_match(struct event_filter *filter,
+ struct pevent_record *record)
+{
+ struct pevent *pevent = filter->pevent;
+ struct filter_type *filter_type;
+ int event_id;
+ int ret;
+ enum pevent_errno err = 0;
+
+ filter_init_error_buf(filter);
+
+ if (!filter->filters)
+ return PEVENT_ERRNO__NO_FILTER;
+
+ event_id = pevent_data_type(pevent, record);
+
+ filter_type = find_filter_type(filter, event_id);
+ if (!filter_type)
+ return PEVENT_ERRNO__FILTER_NOT_FOUND;
+
+ ret = test_filter(filter_type->event, filter_type->filter, record, &err);
+ if (err)
+ return err;
+
+ return ret ? PEVENT_ERRNO__FILTER_MATCH : PEVENT_ERRNO__FILTER_MISS;
+}
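
For reference, a caller usually folds the match result into a per-record keep/drop decision; whether the "no filter" cases count as a keep is the caller's policy, not something this function decides. A minimal sketch:

static int keep_record(struct event_filter *filter,
		       struct pevent_record *record)
{
	switch (pevent_filter_match(filter, record)) {
	case PEVENT_ERRNO__FILTER_MATCH:
	case PEVENT_ERRNO__FILTER_NOT_FOUND:	/* no filter for this event */
	case PEVENT_ERRNO__NO_FILTER:		/* no filters at all */
		return 1;
	case PEVENT_ERRNO__FILTER_MISS:
	default:				/* evaluation error */
		return 0;
	}
}
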
+
+static char *op_to_str(struct event_filter *filter, struct filter_arg *arg)
+{
+ char *str = NULL;
+ char *left = NULL;
+ char *right = NULL;
+ char *op = NULL;
+ int left_val = -1;
+ int right_val = -1;
+ int val;
+
+ switch (arg->op.type) {
+ case FILTER_OP_AND:
+ op = "&&";
+ /* fall through */
+ case FILTER_OP_OR:
+ if (!op)
+ op = "||";
+
+ left = arg_to_str(filter, arg->op.left);
+ right = arg_to_str(filter, arg->op.right);
+ if (!left || !right)
+ break;
+
+ /* Try to consolidate boolean values */
+ if (strcmp(left, "TRUE") == 0)
+ left_val = 1;
+ else if (strcmp(left, "FALSE") == 0)
+ left_val = 0;
+
+ if (strcmp(right, "TRUE") == 0)
+ right_val = 1;
+ else if (strcmp(right, "FALSE") == 0)
+ right_val = 0;
+
+ if (left_val >= 0) {
+ if ((arg->op.type == FILTER_OP_AND && !left_val) ||
+ (arg->op.type == FILTER_OP_OR && left_val)) {
+ /* Just return left value */
+ str = left;
+ left = NULL;
+ break;
+ }
+ if (right_val >= 0) {
+ /* just evaluate this. */
+ val = 0;
+ switch (arg->op.type) {
+ case FILTER_OP_AND:
+ val = left_val && right_val;
+ break;
+ case FILTER_OP_OR:
+ val = left_val || right_val;
+ break;
+ default:
+ break;
+ }
+ asprintf(&str, val ? "TRUE" : "FALSE");
+ break;
+ }
+ }
+ if (right_val >= 0) {
+ if ((arg->op.type == FILTER_OP_AND && !right_val) ||
+ (arg->op.type == FILTER_OP_OR && right_val)) {
+ /* Just return right value */
+ str = right;
+ right = NULL;
+ break;
+ }
+ /* The right value is meaningless */
+ str = left;
+ left = NULL;
+ break;
+ }
+
+ asprintf(&str, "(%s) %s (%s)", left, op, right);
+ break;
+
+ case FILTER_OP_NOT:
+ op = "!";
+ right = arg_to_str(filter, arg->op.right);
+ if (!right)
+ break;
+
+ /* See if we can consolidate */
+ if (strcmp(right, "TRUE") == 0)
+ right_val = 1;
+ else if (strcmp(right, "FALSE") == 0)
+ right_val = 0;
+ if (right_val >= 0) {
+ /* just return the opposite */
+ asprintf(&str, right_val ? "FALSE" : "TRUE");
+ break;
+ }
+ asprintf(&str, "%s(%s)", op, right);
+ break;
+
+ default:
+ /* ?? */
+ break;
+ }
+ free(left);
+ free(right);
+ return str;
+}
+
+static char *val_to_str(struct event_filter *filter, struct filter_arg *arg)
+{
+ char *str = NULL;
+
+ asprintf(&str, "%lld", arg->value.val);
+
+ return str;
+}
+
+static char *field_to_str(struct event_filter *filter, struct filter_arg *arg)
+{
+ return strdup(arg->field.field->name);
+}
+
+static char *exp_to_str(struct event_filter *filter, struct filter_arg *arg)
+{
+ char *lstr;
+ char *rstr;
+ char *op;
+ char *str = NULL;
+
+ lstr = arg_to_str(filter, arg->exp.left);
+ rstr = arg_to_str(filter, arg->exp.right);
+ if (!lstr || !rstr)
+ goto out;
+
+ switch (arg->exp.type) {
+ case FILTER_EXP_ADD:
+ op = "+";
+ break;
+ case FILTER_EXP_SUB:
+ op = "-";
+ break;
+ case FILTER_EXP_MUL:
+ op = "*";
+ break;
+ case FILTER_EXP_DIV:
+ op = "/";
+ break;
+ case FILTER_EXP_MOD:
+ op = "%";
+ break;
+ case FILTER_EXP_RSHIFT:
+ op = ">>";
+ break;
+ case FILTER_EXP_LSHIFT:
+ op = "<<";
+ break;
+ case FILTER_EXP_AND:
+ op = "&";
+ break;
+ case FILTER_EXP_OR:
+ op = "|";
+ break;
+ case FILTER_EXP_XOR:
+ op = "^";
+ break;
+ default:
+ op = "[ERROR IN EXPRESSION TYPE]";
+ break;
+ }
+
+ asprintf(&str, "%s %s %s", lstr, op, rstr);
+out:
+ free(lstr);
+ free(rstr);
+
+ return str;
+}
+
+static char *num_to_str(struct event_filter *filter, struct filter_arg *arg)
+{
+ char *lstr;
+ char *rstr;
+ char *str = NULL;
+ char *op = NULL;
+
+ lstr = arg_to_str(filter, arg->num.left);
+ rstr = arg_to_str(filter, arg->num.right);
+ if (!lstr || !rstr)
+ goto out;
+
+ switch (arg->num.type) {
+ case FILTER_CMP_EQ:
+ op = "==";
+ /* fall through */
+ case FILTER_CMP_NE:
+ if (!op)
+ op = "!=";
+ /* fall through */
+ case FILTER_CMP_GT:
+ if (!op)
+ op = ">";
+ /* fall through */
+ case FILTER_CMP_LT:
+ if (!op)
+ op = "<";
+ /* fall through */
+ case FILTER_CMP_GE:
+ if (!op)
+ op = ">=";
+ /* fall through */
+ case FILTER_CMP_LE:
+ if (!op)
+ op = "<=";
+
+ asprintf(&str, "%s %s %s", lstr, op, rstr);
+ break;
+
+ default:
+ /* ?? */
+ break;
+ }
+
+out:
+ free(lstr);
+ free(rstr);
+ return str;
+}
+
+static char *str_to_str(struct event_filter *filter, struct filter_arg *arg)
+{
+ char *str = NULL;
+ char *op = NULL;
+
+ switch (arg->str.type) {
+ case FILTER_CMP_MATCH:
+ op = "==";
+ /* fall through */
+ case FILTER_CMP_NOT_MATCH:
+ if (!op)
+ op = "!=";
+ /* fall through */
+ case FILTER_CMP_REGEX:
+ if (!op)
+ op = "=~";
+ /* fall through */
+ case FILTER_CMP_NOT_REGEX:
+ if (!op)
+ op = "!~";
+
+ asprintf(&str, "%s %s \"%s\"",
+ arg->str.field->name, op, arg->str.val);
+ break;
+
+ default:
+ /* ?? */
+ break;
+ }
+ return str;
+}
+
+static char *arg_to_str(struct event_filter *filter, struct filter_arg *arg)
+{
+ char *str = NULL;
+
+ switch (arg->type) {
+ case FILTER_ARG_BOOLEAN:
+ asprintf(&str, arg->boolean.value ? "TRUE" : "FALSE");
+ return str;
+
+ case FILTER_ARG_OP:
+ return op_to_str(filter, arg);
+
+ case FILTER_ARG_NUM:
+ return num_to_str(filter, arg);
+
+ case FILTER_ARG_STR:
+ return str_to_str(filter, arg);
+
+ case FILTER_ARG_VALUE:
+ return val_to_str(filter, arg);
+
+ case FILTER_ARG_FIELD:
+ return field_to_str(filter, arg);
+
+ case FILTER_ARG_EXP:
+ return exp_to_str(filter, arg);
+
+ default:
+ /* ?? */
+ return NULL;
+ }
+
+}
+
+/**
+ * pevent_filter_make_string - return a string showing the filter
+ * @filter: filter struct with filter information
+ * @event_id: the event id to return the filter string with
+ *
+ * Returns a string that displays the filter contents.
+ * This string must be freed with free(str).
+ * NULL is returned if no filter is found or allocation failed.
+ */
+char *
+pevent_filter_make_string(struct event_filter *filter, int event_id)
+{
+ struct filter_type *filter_type;
+
+ if (!filter->filters)
+ return NULL;
+
+ filter_type = find_filter_type(filter, event_id);
+
+ if (!filter_type)
+ return NULL;
+
+ return arg_to_str(filter, filter_type->filter);
+}
+
+/**
+ * pevent_filter_compare - compare two filters and return if they are the same
+ * @filter1: Filter to compare with @filter2
+ * @filter2: Filter to compare with @filter1
+ *
+ * Returns:
+ * 1 if the two filters hold the same content.
+ * 0 if they do not.
+ */
+int pevent_filter_compare(struct event_filter *filter1, struct event_filter *filter2)
+{
+ struct filter_type *filter_type1;
+ struct filter_type *filter_type2;
+ char *str1, *str2;
+ int result;
+ int i;
+
+ /* Do the easy checks first */
+ if (filter1->filters != filter2->filters)
+ return 0;
+ if (!filter1->filters && !filter2->filters)
+ return 1;
+
+ /*
+	 * Now take a look at each of the events to see if they have the same
+	 * filters attached to them.
+ */
+ for (i = 0; i < filter1->filters; i++) {
+ filter_type1 = &filter1->event_filters[i];
+ filter_type2 = find_filter_type(filter2, filter_type1->event_id);
+ if (!filter_type2)
+ break;
+ if (filter_type1->filter->type != filter_type2->filter->type)
+ break;
+ switch (filter_type1->filter->type) {
+ case FILTER_TRIVIAL_FALSE:
+ case FILTER_TRIVIAL_TRUE:
+ /* trivial types just need the type compared */
+ continue;
+ default:
+ break;
+ }
+ /* The best way to compare complex filters is with strings */
+ str1 = arg_to_str(filter1, filter_type1->filter);
+ str2 = arg_to_str(filter2, filter_type2->filter);
+ if (str1 && str2)
+ result = strcmp(str1, str2) != 0;
+ else
+ /* bail out if allocation fails */
+ result = 1;
+
+ free(str1);
+ free(str2);
+ if (result)
+ break;
+ }
+
+ if (i < filter1->filters)
+ return 0;
+ return 1;
+}
+
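
To round off the API in this file: a filter can also be rendered back to text for display, and pevent_filter_compare() then tells a caller whether two filters are interchangeable. A short sketch, assuming filter and event_id come from the caller:

static void dump_filter(struct event_filter *filter, int event_id)
{
	char *str = pevent_filter_make_string(filter, event_id);

	if (!str)
		return;
	printf("active filter: %s\n", str);
	free(str);
}
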
diff --git a/tools/lib/traceevent/parse-utils.c b/tools/lib/traceevent/parse-utils.c
new file mode 100644
index 00000000000..eda07fa31dc
--- /dev/null
+++ b/tools/lib/traceevent/parse-utils.c
@@ -0,0 +1,85 @@
+/*
+ * Copyright (C) 2010 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License (not later!)
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <errno.h>
+
+#define __weak __attribute__((weak))
+
+void __vwarning(const char *fmt, va_list ap)
+{
+ if (errno)
+ perror("trace-cmd");
+ errno = 0;
+
+ fprintf(stderr, " ");
+ vfprintf(stderr, fmt, ap);
+
+ fprintf(stderr, "\n");
+}
+
+void __warning(const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ __vwarning(fmt, ap);
+ va_end(ap);
+}
+
+void __weak warning(const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ __vwarning(fmt, ap);
+ va_end(ap);
+}
+
+void __vpr_stat(const char *fmt, va_list ap)
+{
+ vprintf(fmt, ap);
+ printf("\n");
+}
+
+void __pr_stat(const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ __vpr_stat(fmt, ap);
+ va_end(ap);
+}
+
+void __weak vpr_stat(const char *fmt, va_list ap)
+{
+ __vpr_stat(fmt, ap);
+}
+
+void __weak pr_stat(const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ __vpr_stat(fmt, ap);
+ va_end(ap);
+}
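
Because warning(), vpr_stat() and pr_stat() are defined __weak above, an application linked against the library can supply strong definitions and all library diagnostics are routed through them. A sketch of such an override (the prototype mirrors the one above, the body is illustrative):

#include <stdarg.h>
#include <stdio.h>

/* strong definition in the application overrides the weak one in the library */
void warning(const char *fmt, ...)
{
	va_list ap;

	va_start(ap, fmt);
	fprintf(stderr, "[myapp] ");
	vfprintf(stderr, fmt, ap);
	fprintf(stderr, "\n");
	va_end(ap);
}
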
diff --git a/tools/lib/traceevent/plugin_cfg80211.c b/tools/lib/traceevent/plugin_cfg80211.c
new file mode 100644
index 00000000000..c066b25905f
--- /dev/null
+++ b/tools/lib/traceevent/plugin_cfg80211.c
@@ -0,0 +1,30 @@
+#include <stdio.h>
+#include <string.h>
+#include <inttypes.h>
+#include <endian.h>
+#include "event-parse.h"
+
+static unsigned long long
+process___le16_to_cpup(struct trace_seq *s,
+ unsigned long long *args)
+{
+ uint16_t *val = (uint16_t *) (unsigned long) args[0];
+ return val ? (long long) le16toh(*val) : 0;
+}
+
+int PEVENT_PLUGIN_LOADER(struct pevent *pevent)
+{
+ pevent_register_print_function(pevent,
+ process___le16_to_cpup,
+ PEVENT_FUNC_ARG_INT,
+ "__le16_to_cpup",
+ PEVENT_FUNC_ARG_PTR,
+ PEVENT_FUNC_ARG_VOID);
+ return 0;
+}
+
+void PEVENT_PLUGIN_UNLOADER(struct pevent *pevent)
+{
+ pevent_unregister_print_function(pevent, process___le16_to_cpup,
+ "__le16_to_cpup");
+}
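
The registration above follows a fixed convention: the callback, the constant describing its return type, the helper's name exactly as it appears in the event's print format, then the argument types terminated by PEVENT_FUNC_ARG_VOID. As a hedged sketch, a plugin for a hypothetical my_hash() helper taking one long argument would look like this ("my_hash" and process_my_hash are illustrative names only):

#include "event-parse.h"

static unsigned long long
process_my_hash(struct trace_seq *s, unsigned long long *args)
{
	/* arbitrary example transform of the single argument */
	return args[0] * 2654435761ULL;
}

int PEVENT_PLUGIN_LOADER(struct pevent *pevent)
{
	pevent_register_print_function(pevent,
				       process_my_hash,
				       PEVENT_FUNC_ARG_LONG,
				       "my_hash",
				       PEVENT_FUNC_ARG_LONG,
				       PEVENT_FUNC_ARG_VOID);
	return 0;
}
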
diff --git a/tools/lib/traceevent/plugin_function.c b/tools/lib/traceevent/plugin_function.c
new file mode 100644
index 00000000000..a00ec190821
--- /dev/null
+++ b/tools/lib/traceevent/plugin_function.c
@@ -0,0 +1,194 @@
+/*
+ * Copyright (C) 2009, 2010 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License (not later!)
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "event-parse.h"
+#include "event-utils.h"
+
+static struct func_stack {
+ int size;
+ char **stack;
+} *fstack;
+
+static int cpus = -1;
+
+#define STK_BLK 10
+
+struct pevent_plugin_option plugin_options[] =
+{
+ {
+ .name = "parent",
+ .plugin_alias = "ftrace",
+ .description =
+ "Print parent of functions for function events",
+ },
+ {
+ .name = "indent",
+ .plugin_alias = "ftrace",
+ .description =
+ "Try to show function call indents, based on parents",
+ .set = 1,
+ },
+ {
+ .name = NULL,
+ }
+};
+
+static struct pevent_plugin_option *ftrace_parent = &plugin_options[0];
+static struct pevent_plugin_option *ftrace_indent = &plugin_options[1];
+
+static void add_child(struct func_stack *stack, const char *child, int pos)
+{
+ int i;
+
+ if (!child)
+ return;
+
+ if (pos < stack->size)
+ free(stack->stack[pos]);
+ else {
+ char **ptr;
+
+ ptr = realloc(stack->stack, sizeof(char *) *
+ (stack->size + STK_BLK));
+ if (!ptr) {
+ warning("could not allocate plugin memory\n");
+ return;
+ }
+
+ stack->stack = ptr;
+
+ for (i = stack->size; i < stack->size + STK_BLK; i++)
+ stack->stack[i] = NULL;
+ stack->size += STK_BLK;
+ }
+
+ stack->stack[pos] = strdup(child);
+}
+
+static int add_and_get_index(const char *parent, const char *child, int cpu)
+{
+ int i;
+
+ if (cpu < 0)
+ return 0;
+
+ if (cpu > cpus) {
+ struct func_stack *ptr;
+
+ ptr = realloc(fstack, sizeof(*fstack) * (cpu + 1));
+ if (!ptr) {
+ warning("could not allocate plugin memory\n");
+ return 0;
+ }
+
+ fstack = ptr;
+
+ /* Account for holes in the cpu count */
+ for (i = cpus + 1; i <= cpu; i++)
+ memset(&fstack[i], 0, sizeof(fstack[i]));
+ cpus = cpu;
+ }
+
+ for (i = 0; i < fstack[cpu].size && fstack[cpu].stack[i]; i++) {
+ if (strcmp(parent, fstack[cpu].stack[i]) == 0) {
+ add_child(&fstack[cpu], child, i+1);
+ return i;
+ }
+ }
+
+ /* Not found */
+ add_child(&fstack[cpu], parent, 0);
+ add_child(&fstack[cpu], child, 1);
+ return 0;
+}
+
+static int function_handler(struct trace_seq *s, struct pevent_record *record,
+ struct event_format *event, void *context)
+{
+ struct pevent *pevent = event->pevent;
+ unsigned long long function;
+ unsigned long long pfunction;
+ const char *func;
+ const char *parent;
+	int index = 0;
+
+ if (pevent_get_field_val(s, event, "ip", record, &function, 1))
+ return trace_seq_putc(s, '!');
+
+ func = pevent_find_function(pevent, function);
+
+ if (pevent_get_field_val(s, event, "parent_ip", record, &pfunction, 1))
+ return trace_seq_putc(s, '!');
+
+ parent = pevent_find_function(pevent, pfunction);
+
+ if (parent && ftrace_indent->set)
+ index = add_and_get_index(parent, func, record->cpu);
+
+ trace_seq_printf(s, "%*s", index*3, "");
+
+ if (func)
+ trace_seq_printf(s, "%s", func);
+ else
+ trace_seq_printf(s, "0x%llx", function);
+
+ if (ftrace_parent->set) {
+ trace_seq_printf(s, " <-- ");
+ if (parent)
+ trace_seq_printf(s, "%s", parent);
+ else
+ trace_seq_printf(s, "0x%llx", pfunction);
+ }
+
+ return 0;
+}
+
+int PEVENT_PLUGIN_LOADER(struct pevent *pevent)
+{
+ pevent_register_event_handler(pevent, -1, "ftrace", "function",
+ function_handler, NULL);
+
+ traceevent_plugin_add_options("ftrace", plugin_options);
+
+ return 0;
+}
+
+void PEVENT_PLUGIN_UNLOADER(struct pevent *pevent)
+{
+ int i, x;
+
+ pevent_unregister_event_handler(pevent, -1, "ftrace", "function",
+ function_handler, NULL);
+
+ for (i = 0; i <= cpus; i++) {
+ for (x = 0; x < fstack[i].size && fstack[i].stack[x]; x++)
+ free(fstack[i].stack[x]);
+ free(fstack[i].stack);
+ }
+
+ traceevent_plugin_remove_options(plugin_options);
+
+ free(fstack);
+ fstack = NULL;
+ cpus = -1;
+}
diff --git a/tools/lib/traceevent/plugin_hrtimer.c b/tools/lib/traceevent/plugin_hrtimer.c
new file mode 100644
index 00000000000..12bf14cc115
--- /dev/null
+++ b/tools/lib/traceevent/plugin_hrtimer.c
@@ -0,0 +1,88 @@
+/*
+ * Copyright (C) 2009 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2009 Johannes Berg <johannes@sipsolutions.net>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License (not later!)
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "event-parse.h"
+
+static int timer_expire_handler(struct trace_seq *s,
+ struct pevent_record *record,
+ struct event_format *event, void *context)
+{
+ trace_seq_printf(s, "hrtimer=");
+
+ if (pevent_print_num_field(s, "0x%llx", event, "timer",
+ record, 0) == -1)
+ pevent_print_num_field(s, "0x%llx", event, "hrtimer",
+ record, 1);
+
+ trace_seq_printf(s, " now=");
+
+ pevent_print_num_field(s, "%llu", event, "now", record, 1);
+
+ pevent_print_func_field(s, " function=%s", event, "function",
+ record, 0);
+ return 0;
+}
+
+static int timer_start_handler(struct trace_seq *s,
+ struct pevent_record *record,
+ struct event_format *event, void *context)
+{
+ trace_seq_printf(s, "hrtimer=");
+
+ if (pevent_print_num_field(s, "0x%llx", event, "timer",
+ record, 0) == -1)
+ pevent_print_num_field(s, "0x%llx", event, "hrtimer",
+ record, 1);
+
+ pevent_print_func_field(s, " function=%s", event, "function",
+ record, 0);
+
+ trace_seq_printf(s, " expires=");
+ pevent_print_num_field(s, "%llu", event, "expires", record, 1);
+
+ trace_seq_printf(s, " softexpires=");
+ pevent_print_num_field(s, "%llu", event, "softexpires", record, 1);
+ return 0;
+}
+
+int PEVENT_PLUGIN_LOADER(struct pevent *pevent)
+{
+ pevent_register_event_handler(pevent, -1,
+ "timer", "hrtimer_expire_entry",
+ timer_expire_handler, NULL);
+
+ pevent_register_event_handler(pevent, -1, "timer", "hrtimer_start",
+ timer_start_handler, NULL);
+ return 0;
+}
+
+void PEVENT_PLUGIN_UNLOADER(struct pevent *pevent)
+{
+ pevent_unregister_event_handler(pevent, -1,
+ "timer", "hrtimer_expire_entry",
+ timer_expire_handler, NULL);
+
+ pevent_unregister_event_handler(pevent, -1, "timer", "hrtimer_start",
+ timer_start_handler, NULL);
+}
diff --git a/tools/lib/traceevent/plugin_jbd2.c b/tools/lib/traceevent/plugin_jbd2.c
new file mode 100644
index 00000000000..0db714c721b
--- /dev/null
+++ b/tools/lib/traceevent/plugin_jbd2.c
@@ -0,0 +1,77 @@
+/*
+ * Copyright (C) 2010 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License (not later!)
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "event-parse.h"
+
+#define MINORBITS 20
+#define MINORMASK ((1U << MINORBITS) - 1)
+
+#define MAJOR(dev) ((unsigned int) ((dev) >> MINORBITS))
+#define MINOR(dev) ((unsigned int) ((dev) & MINORMASK))
+
+static unsigned long long
+process_jbd2_dev_to_name(struct trace_seq *s,
+ unsigned long long *args)
+{
+ unsigned int dev = args[0];
+
+ trace_seq_printf(s, "%d:%d", MAJOR(dev), MINOR(dev));
+ return 0;
+}
+
+static unsigned long long
+process_jiffies_to_msecs(struct trace_seq *s,
+ unsigned long long *args)
+{
+ unsigned long long jiffies = args[0];
+
+ trace_seq_printf(s, "%lld", jiffies);
+ return jiffies;
+}
+
+int PEVENT_PLUGIN_LOADER(struct pevent *pevent)
+{
+ pevent_register_print_function(pevent,
+ process_jbd2_dev_to_name,
+ PEVENT_FUNC_ARG_STRING,
+ "jbd2_dev_to_name",
+ PEVENT_FUNC_ARG_INT,
+ PEVENT_FUNC_ARG_VOID);
+
+ pevent_register_print_function(pevent,
+ process_jiffies_to_msecs,
+ PEVENT_FUNC_ARG_LONG,
+ "jiffies_to_msecs",
+ PEVENT_FUNC_ARG_LONG,
+ PEVENT_FUNC_ARG_VOID);
+ return 0;
+}
+
+void PEVENT_PLUGIN_UNLOADER(struct pevent *pevent)
+{
+ pevent_unregister_print_function(pevent, process_jbd2_dev_to_name,
+ "jbd2_dev_to_name");
+
+ pevent_unregister_print_function(pevent, process_jiffies_to_msecs,
+ "jiffies_to_msecs");
+}
diff --git a/tools/lib/traceevent/plugin_kmem.c b/tools/lib/traceevent/plugin_kmem.c
new file mode 100644
index 00000000000..70650ff48d7
--- /dev/null
+++ b/tools/lib/traceevent/plugin_kmem.c
@@ -0,0 +1,94 @@
+/*
+ * Copyright (C) 2009 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License (not later!)
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "event-parse.h"
+
+static int call_site_handler(struct trace_seq *s, struct pevent_record *record,
+ struct event_format *event, void *context)
+{
+ struct format_field *field;
+ unsigned long long val, addr;
+ void *data = record->data;
+ const char *func;
+
+ field = pevent_find_field(event, "call_site");
+ if (!field)
+ return 1;
+
+ if (pevent_read_number_field(field, data, &val))
+ return 1;
+
+ func = pevent_find_function(event->pevent, val);
+ if (!func)
+ return 1;
+
+ addr = pevent_find_function_address(event->pevent, val);
+
+ trace_seq_printf(s, "(%s+0x%x) ", func, (int)(val - addr));
+ return 1;
+}
+
+int PEVENT_PLUGIN_LOADER(struct pevent *pevent)
+{
+ pevent_register_event_handler(pevent, -1, "kmem", "kfree",
+ call_site_handler, NULL);
+
+ pevent_register_event_handler(pevent, -1, "kmem", "kmalloc",
+ call_site_handler, NULL);
+
+ pevent_register_event_handler(pevent, -1, "kmem", "kmalloc_node",
+ call_site_handler, NULL);
+
+ pevent_register_event_handler(pevent, -1, "kmem", "kmem_cache_alloc",
+ call_site_handler, NULL);
+
+ pevent_register_event_handler(pevent, -1, "kmem",
+ "kmem_cache_alloc_node",
+ call_site_handler, NULL);
+
+ pevent_register_event_handler(pevent, -1, "kmem", "kmem_cache_free",
+ call_site_handler, NULL);
+ return 0;
+}
+
+void PEVENT_PLUGIN_UNLOADER(struct pevent *pevent)
+{
+ pevent_unregister_event_handler(pevent, -1, "kmem", "kfree",
+ call_site_handler, NULL);
+
+ pevent_unregister_event_handler(pevent, -1, "kmem", "kmalloc",
+ call_site_handler, NULL);
+
+ pevent_unregister_event_handler(pevent, -1, "kmem", "kmalloc_node",
+ call_site_handler, NULL);
+
+ pevent_unregister_event_handler(pevent, -1, "kmem", "kmem_cache_alloc",
+ call_site_handler, NULL);
+
+ pevent_unregister_event_handler(pevent, -1, "kmem",
+ "kmem_cache_alloc_node",
+ call_site_handler, NULL);
+
+ pevent_unregister_event_handler(pevent, -1, "kmem", "kmem_cache_free",
+ call_site_handler, NULL);
+}
diff --git a/tools/lib/traceevent/plugin_kvm.c b/tools/lib/traceevent/plugin_kvm.c
new file mode 100644
index 00000000000..9e0e8c61b43
--- /dev/null
+++ b/tools/lib/traceevent/plugin_kvm.c
@@ -0,0 +1,465 @@
+/*
+ * Copyright (C) 2009 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License (not later!)
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdint.h>
+
+#include "event-parse.h"
+
+#ifdef HAVE_UDIS86
+
+#include <udis86.h>
+
+static ud_t ud;
+
+static void init_disassembler(void)
+{
+ ud_init(&ud);
+ ud_set_syntax(&ud, UD_SYN_ATT);
+}
+
+static const char *disassemble(unsigned char *insn, int len, uint64_t rip,
+ int cr0_pe, int eflags_vm,
+ int cs_d, int cs_l)
+{
+ int mode;
+
+ if (!cr0_pe)
+ mode = 16;
+ else if (eflags_vm)
+ mode = 16;
+ else if (cs_l)
+ mode = 64;
+ else if (cs_d)
+ mode = 32;
+ else
+ mode = 16;
+
+ ud_set_pc(&ud, rip);
+ ud_set_mode(&ud, mode);
+ ud_set_input_buffer(&ud, insn, len);
+ ud_disassemble(&ud);
+ return ud_insn_asm(&ud);
+}
+
+#else
+
+static void init_disassembler(void)
+{
+}
+
+static const char *disassemble(unsigned char *insn, int len, uint64_t rip,
+ int cr0_pe, int eflags_vm,
+ int cs_d, int cs_l)
+{
+ static char out[15*3+1];
+ int i;
+
+ for (i = 0; i < len; ++i)
+ sprintf(out + i * 3, "%02x ", insn[i]);
+ out[len*3-1] = '\0';
+ return out;
+}
+
+#endif
+
+
+#define VMX_EXIT_REASONS \
+ _ER(EXCEPTION_NMI, 0) \
+ _ER(EXTERNAL_INTERRUPT, 1) \
+ _ER(TRIPLE_FAULT, 2) \
+ _ER(PENDING_INTERRUPT, 7) \
+ _ER(NMI_WINDOW, 8) \
+ _ER(TASK_SWITCH, 9) \
+ _ER(CPUID, 10) \
+ _ER(HLT, 12) \
+ _ER(INVD, 13) \
+ _ER(INVLPG, 14) \
+ _ER(RDPMC, 15) \
+ _ER(RDTSC, 16) \
+ _ER(VMCALL, 18) \
+ _ER(VMCLEAR, 19) \
+ _ER(VMLAUNCH, 20) \
+ _ER(VMPTRLD, 21) \
+ _ER(VMPTRST, 22) \
+ _ER(VMREAD, 23) \
+ _ER(VMRESUME, 24) \
+ _ER(VMWRITE, 25) \
+ _ER(VMOFF, 26) \
+ _ER(VMON, 27) \
+ _ER(CR_ACCESS, 28) \
+ _ER(DR_ACCESS, 29) \
+ _ER(IO_INSTRUCTION, 30) \
+ _ER(MSR_READ, 31) \
+ _ER(MSR_WRITE, 32) \
+ _ER(MWAIT_INSTRUCTION, 36) \
+ _ER(MONITOR_INSTRUCTION, 39) \
+ _ER(PAUSE_INSTRUCTION, 40) \
+ _ER(MCE_DURING_VMENTRY, 41) \
+ _ER(TPR_BELOW_THRESHOLD, 43) \
+ _ER(APIC_ACCESS, 44) \
+ _ER(EOI_INDUCED, 45) \
+ _ER(EPT_VIOLATION, 48) \
+ _ER(EPT_MISCONFIG, 49) \
+ _ER(INVEPT, 50) \
+ _ER(PREEMPTION_TIMER, 52) \
+ _ER(WBINVD, 54) \
+ _ER(XSETBV, 55) \
+ _ER(APIC_WRITE, 56) \
+ _ER(INVPCID, 58)
+
+#define SVM_EXIT_REASONS \
+ _ER(EXIT_READ_CR0, 0x000) \
+ _ER(EXIT_READ_CR3, 0x003) \
+ _ER(EXIT_READ_CR4, 0x004) \
+ _ER(EXIT_READ_CR8, 0x008) \
+ _ER(EXIT_WRITE_CR0, 0x010) \
+ _ER(EXIT_WRITE_CR3, 0x013) \
+ _ER(EXIT_WRITE_CR4, 0x014) \
+ _ER(EXIT_WRITE_CR8, 0x018) \
+ _ER(EXIT_READ_DR0, 0x020) \
+ _ER(EXIT_READ_DR1, 0x021) \
+ _ER(EXIT_READ_DR2, 0x022) \
+ _ER(EXIT_READ_DR3, 0x023) \
+ _ER(EXIT_READ_DR4, 0x024) \
+ _ER(EXIT_READ_DR5, 0x025) \
+ _ER(EXIT_READ_DR6, 0x026) \
+ _ER(EXIT_READ_DR7, 0x027) \
+ _ER(EXIT_WRITE_DR0, 0x030) \
+ _ER(EXIT_WRITE_DR1, 0x031) \
+ _ER(EXIT_WRITE_DR2, 0x032) \
+ _ER(EXIT_WRITE_DR3, 0x033) \
+ _ER(EXIT_WRITE_DR4, 0x034) \
+ _ER(EXIT_WRITE_DR5, 0x035) \
+ _ER(EXIT_WRITE_DR6, 0x036) \
+ _ER(EXIT_WRITE_DR7, 0x037) \
+ _ER(EXIT_EXCP_BASE, 0x040) \
+ _ER(EXIT_INTR, 0x060) \
+ _ER(EXIT_NMI, 0x061) \
+ _ER(EXIT_SMI, 0x062) \
+ _ER(EXIT_INIT, 0x063) \
+ _ER(EXIT_VINTR, 0x064) \
+ _ER(EXIT_CR0_SEL_WRITE, 0x065) \
+ _ER(EXIT_IDTR_READ, 0x066) \
+ _ER(EXIT_GDTR_READ, 0x067) \
+ _ER(EXIT_LDTR_READ, 0x068) \
+ _ER(EXIT_TR_READ, 0x069) \
+ _ER(EXIT_IDTR_WRITE, 0x06a) \
+ _ER(EXIT_GDTR_WRITE, 0x06b) \
+ _ER(EXIT_LDTR_WRITE, 0x06c) \
+ _ER(EXIT_TR_WRITE, 0x06d) \
+ _ER(EXIT_RDTSC, 0x06e) \
+ _ER(EXIT_RDPMC, 0x06f) \
+ _ER(EXIT_PUSHF, 0x070) \
+ _ER(EXIT_POPF, 0x071) \
+ _ER(EXIT_CPUID, 0x072) \
+ _ER(EXIT_RSM, 0x073) \
+ _ER(EXIT_IRET, 0x074) \
+ _ER(EXIT_SWINT, 0x075) \
+ _ER(EXIT_INVD, 0x076) \
+ _ER(EXIT_PAUSE, 0x077) \
+ _ER(EXIT_HLT, 0x078) \
+ _ER(EXIT_INVLPG, 0x079) \
+ _ER(EXIT_INVLPGA, 0x07a) \
+ _ER(EXIT_IOIO, 0x07b) \
+ _ER(EXIT_MSR, 0x07c) \
+ _ER(EXIT_TASK_SWITCH, 0x07d) \
+ _ER(EXIT_FERR_FREEZE, 0x07e) \
+ _ER(EXIT_SHUTDOWN, 0x07f) \
+ _ER(EXIT_VMRUN, 0x080) \
+ _ER(EXIT_VMMCALL, 0x081) \
+ _ER(EXIT_VMLOAD, 0x082) \
+ _ER(EXIT_VMSAVE, 0x083) \
+ _ER(EXIT_STGI, 0x084) \
+ _ER(EXIT_CLGI, 0x085) \
+ _ER(EXIT_SKINIT, 0x086) \
+ _ER(EXIT_RDTSCP, 0x087) \
+ _ER(EXIT_ICEBP, 0x088) \
+ _ER(EXIT_WBINVD, 0x089) \
+ _ER(EXIT_MONITOR, 0x08a) \
+ _ER(EXIT_MWAIT, 0x08b) \
+ _ER(EXIT_MWAIT_COND, 0x08c) \
+ _ER(EXIT_NPF, 0x400) \
+ _ER(EXIT_ERR, -1)
+
+#define _ER(reason, val) { #reason, val },
+struct str_values {
+ const char *str;
+ int val;
+};
+
+static struct str_values vmx_exit_reasons[] = {
+ VMX_EXIT_REASONS
+ { NULL, -1}
+};
+
+static struct str_values svm_exit_reasons[] = {
+ SVM_EXIT_REASONS
+ { NULL, -1}
+};
+
+static struct isa_exit_reasons {
+ unsigned isa;
+ struct str_values *strings;
+} isa_exit_reasons[] = {
+ { .isa = 1, .strings = vmx_exit_reasons },
+ { .isa = 2, .strings = svm_exit_reasons },
+ { }
+};
+
+static const char *find_exit_reason(unsigned isa, int val)
+{
+ struct str_values *strings = NULL;
+ int i;
+
+ for (i = 0; isa_exit_reasons[i].strings; ++i)
+ if (isa_exit_reasons[i].isa == isa) {
+ strings = isa_exit_reasons[i].strings;
+ break;
+ }
+ if (!strings)
+ return "UNKNOWN-ISA";
+ for (i = 0; strings[i].val >= 0; i++)
+ if (strings[i].val == val)
+ break;
+ if (strings[i].str)
+ return strings[i].str;
+ return "UNKNOWN";
+}
+
+static int kvm_exit_handler(struct trace_seq *s, struct pevent_record *record,
+ struct event_format *event, void *context)
+{
+ unsigned long long isa;
+ unsigned long long val;
+ unsigned long long info1 = 0, info2 = 0;
+
+ if (pevent_get_field_val(s, event, "exit_reason", record, &val, 1) < 0)
+ return -1;
+
+ if (pevent_get_field_val(s, event, "isa", record, &isa, 0) < 0)
+ isa = 1;
+
+ trace_seq_printf(s, "reason %s", find_exit_reason(isa, val));
+
+ pevent_print_num_field(s, " rip 0x%lx", event, "guest_rip", record, 1);
+
+ if (pevent_get_field_val(s, event, "info1", record, &info1, 0) >= 0
+ && pevent_get_field_val(s, event, "info2", record, &info2, 0) >= 0)
+ trace_seq_printf(s, " info %llx %llx", info1, info2);
+
+ return 0;
+}
+
+#define KVM_EMUL_INSN_F_CR0_PE (1 << 0)
+#define KVM_EMUL_INSN_F_EFL_VM (1 << 1)
+#define KVM_EMUL_INSN_F_CS_D (1 << 2)
+#define KVM_EMUL_INSN_F_CS_L (1 << 3)
+
+static int kvm_emulate_insn_handler(struct trace_seq *s,
+ struct pevent_record *record,
+ struct event_format *event, void *context)
+{
+ unsigned long long rip, csbase, len, flags, failed;
+ int llen;
+ uint8_t *insn;
+ const char *disasm;
+
+ if (pevent_get_field_val(s, event, "rip", record, &rip, 1) < 0)
+ return -1;
+
+ if (pevent_get_field_val(s, event, "csbase", record, &csbase, 1) < 0)
+ return -1;
+
+ if (pevent_get_field_val(s, event, "len", record, &len, 1) < 0)
+ return -1;
+
+ if (pevent_get_field_val(s, event, "flags", record, &flags, 1) < 0)
+ return -1;
+
+ if (pevent_get_field_val(s, event, "failed", record, &failed, 1) < 0)
+ return -1;
+
+ insn = pevent_get_field_raw(s, event, "insn", record, &llen, 1);
+ if (!insn)
+ return -1;
+
+ disasm = disassemble(insn, len, rip,
+ flags & KVM_EMUL_INSN_F_CR0_PE,
+ flags & KVM_EMUL_INSN_F_EFL_VM,
+ flags & KVM_EMUL_INSN_F_CS_D,
+ flags & KVM_EMUL_INSN_F_CS_L);
+
+ trace_seq_printf(s, "%llx:%llx: %s%s", csbase, rip, disasm,
+ failed ? " FAIL" : "");
+ return 0;
+}
+
+union kvm_mmu_page_role {
+ unsigned word;
+ struct {
+ unsigned glevels:4;
+ unsigned level:4;
+ unsigned quadrant:2;
+ unsigned pad_for_nice_hex_output:6;
+ unsigned direct:1;
+ unsigned access:3;
+ unsigned invalid:1;
+ unsigned cr4_pge:1;
+ unsigned nxe:1;
+ };
+};
+
+static int kvm_mmu_print_role(struct trace_seq *s, struct pevent_record *record,
+ struct event_format *event, void *context)
+{
+ unsigned long long val;
+ static const char *access_str[] = {
+ "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux"
+ };
+ union kvm_mmu_page_role role;
+
+ if (pevent_get_field_val(s, event, "role", record, &val, 1) < 0)
+ return -1;
+
+ role.word = (int)val;
+
+ /*
+	 * We can only use the structure if the file is of the same
+	 * endianness as the host.
+ */
+ if (pevent_is_file_bigendian(event->pevent) ==
+ pevent_is_host_bigendian(event->pevent)) {
+
+ trace_seq_printf(s, "%u/%u q%u%s %s%s %spge %snxe",
+ role.level,
+ role.glevels,
+ role.quadrant,
+ role.direct ? " direct" : "",
+ access_str[role.access],
+ role.invalid ? " invalid" : "",
+ role.cr4_pge ? "" : "!",
+ role.nxe ? "" : "!");
+ } else
+ trace_seq_printf(s, "WORD: %08x", role.word);
+
+ pevent_print_num_field(s, " root %u ", event,
+ "root_count", record, 1);
+
+ if (pevent_get_field_val(s, event, "unsync", record, &val, 1) < 0)
+ return -1;
+
+ trace_seq_printf(s, "%s%c", val ? "unsync" : "sync", 0);
+ return 0;
+}
+
+static int kvm_mmu_get_page_handler(struct trace_seq *s,
+ struct pevent_record *record,
+ struct event_format *event, void *context)
+{
+ unsigned long long val;
+
+ if (pevent_get_field_val(s, event, "created", record, &val, 1) < 0)
+ return -1;
+
+ trace_seq_printf(s, "%s ", val ? "new" : "existing");
+
+ if (pevent_get_field_val(s, event, "gfn", record, &val, 1) < 0)
+ return -1;
+
+ trace_seq_printf(s, "sp gfn %llx ", val);
+ return kvm_mmu_print_role(s, record, event, context);
+}
+
+#define PT_WRITABLE_SHIFT 1
+#define PT_WRITABLE_MASK (1ULL << PT_WRITABLE_SHIFT)
+
+static unsigned long long
+process_is_writable_pte(struct trace_seq *s, unsigned long long *args)
+{
+ unsigned long pte = args[0];
+ return pte & PT_WRITABLE_MASK;
+}
+
+int PEVENT_PLUGIN_LOADER(struct pevent *pevent)
+{
+ init_disassembler();
+
+ pevent_register_event_handler(pevent, -1, "kvm", "kvm_exit",
+ kvm_exit_handler, NULL);
+
+ pevent_register_event_handler(pevent, -1, "kvm", "kvm_emulate_insn",
+ kvm_emulate_insn_handler, NULL);
+
+ pevent_register_event_handler(pevent, -1, "kvmmmu", "kvm_mmu_get_page",
+ kvm_mmu_get_page_handler, NULL);
+
+ pevent_register_event_handler(pevent, -1, "kvmmmu", "kvm_mmu_sync_page",
+ kvm_mmu_print_role, NULL);
+
+ pevent_register_event_handler(pevent, -1,
+ "kvmmmu", "kvm_mmu_unsync_page",
+ kvm_mmu_print_role, NULL);
+
+ pevent_register_event_handler(pevent, -1, "kvmmmu", "kvm_mmu_zap_page",
+ kvm_mmu_print_role, NULL);
+
+ pevent_register_event_handler(pevent, -1, "kvmmmu",
+ "kvm_mmu_prepare_zap_page", kvm_mmu_print_role,
+ NULL);
+
+ pevent_register_print_function(pevent,
+ process_is_writable_pte,
+ PEVENT_FUNC_ARG_INT,
+ "is_writable_pte",
+ PEVENT_FUNC_ARG_LONG,
+ PEVENT_FUNC_ARG_VOID);
+ return 0;
+}
+
+void PEVENT_PLUGIN_UNLOADER(struct pevent *pevent)
+{
+ pevent_unregister_event_handler(pevent, -1, "kvm", "kvm_exit",
+ kvm_exit_handler, NULL);
+
+ pevent_unregister_event_handler(pevent, -1, "kvm", "kvm_emulate_insn",
+ kvm_emulate_insn_handler, NULL);
+
+ pevent_unregister_event_handler(pevent, -1, "kvmmmu", "kvm_mmu_get_page",
+ kvm_mmu_get_page_handler, NULL);
+
+ pevent_unregister_event_handler(pevent, -1, "kvmmmu", "kvm_mmu_sync_page",
+ kvm_mmu_print_role, NULL);
+
+ pevent_unregister_event_handler(pevent, -1,
+ "kvmmmu", "kvm_mmu_unsync_page",
+ kvm_mmu_print_role, NULL);
+
+ pevent_unregister_event_handler(pevent, -1, "kvmmmu", "kvm_mmu_zap_page",
+ kvm_mmu_print_role, NULL);
+
+ pevent_unregister_event_handler(pevent, -1, "kvmmmu",
+ "kvm_mmu_prepare_zap_page", kvm_mmu_print_role,
+ NULL);
+
+ pevent_unregister_print_function(pevent, process_is_writable_pte,
+ "is_writable_pte");
+}
diff --git a/tools/lib/traceevent/plugin_mac80211.c b/tools/lib/traceevent/plugin_mac80211.c
new file mode 100644
index 00000000000..7e15a0f1c2f
--- /dev/null
+++ b/tools/lib/traceevent/plugin_mac80211.c
@@ -0,0 +1,102 @@
+/*
+ * Copyright (C) 2009 Johannes Berg <johannes@sipsolutions.net>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License (not later!)
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "event-parse.h"
+
+#define INDENT 65
+
+static void print_string(struct trace_seq *s, struct event_format *event,
+ const char *name, const void *data)
+{
+ struct format_field *f = pevent_find_field(event, name);
+ int offset;
+ int length;
+
+ if (!f) {
+ trace_seq_printf(s, "NOTFOUND:%s", name);
+ return;
+ }
+
+ offset = f->offset;
+ length = f->size;
+
+ if (!strncmp(f->type, "__data_loc", 10)) {
+ unsigned long long v;
+ if (pevent_read_number_field(f, data, &v)) {
+ trace_seq_printf(s, "invalid_data_loc");
+ return;
+ }
+ offset = v & 0xffff;
+ length = v >> 16;
+ }
+
+ trace_seq_printf(s, "%.*s", length, (char *)data + offset);
+}
+
+#define SF(fn) pevent_print_num_field(s, fn ":%d", event, fn, record, 0)
+#define SFX(fn) pevent_print_num_field(s, fn ":%#x", event, fn, record, 0)
+#define SP() trace_seq_putc(s, ' ')
+
+static int drv_bss_info_changed(struct trace_seq *s,
+ struct pevent_record *record,
+ struct event_format *event, void *context)
+{
+ void *data = record->data;
+
+ print_string(s, event, "wiphy_name", data);
+ trace_seq_printf(s, " vif:");
+ print_string(s, event, "vif_name", data);
+ pevent_print_num_field(s, "(%d)", event, "vif_type", record, 1);
+
+ trace_seq_printf(s, "\n%*s", INDENT, "");
+ SF("assoc"); SP();
+ SF("aid"); SP();
+ SF("cts"); SP();
+ SF("shortpre"); SP();
+ SF("shortslot"); SP();
+ SF("dtimper"); SP();
+ trace_seq_printf(s, "\n%*s", INDENT, "");
+ SF("bcnint"); SP();
+ SFX("assoc_cap"); SP();
+ SFX("basic_rates"); SP();
+ SF("enable_beacon");
+ trace_seq_printf(s, "\n%*s", INDENT, "");
+ SF("ht_operation_mode");
+
+ return 0;
+}
+
+int PEVENT_PLUGIN_LOADER(struct pevent *pevent)
+{
+ pevent_register_event_handler(pevent, -1, "mac80211",
+ "drv_bss_info_changed",
+ drv_bss_info_changed, NULL);
+ return 0;
+}
+
+void PEVENT_PLUGIN_UNLOADER(struct pevent *pevent)
+{
+ pevent_unregister_event_handler(pevent, -1, "mac80211",
+ "drv_bss_info_changed",
+ drv_bss_info_changed, NULL);
+}
diff --git a/tools/lib/traceevent/plugin_sched_switch.c b/tools/lib/traceevent/plugin_sched_switch.c
new file mode 100644
index 00000000000..f1ce6006525
--- /dev/null
+++ b/tools/lib/traceevent/plugin_sched_switch.c
@@ -0,0 +1,160 @@
+/*
+ * Copyright (C) 2009, 2010 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License (not later!)
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "event-parse.h"
+
+static void write_state(struct trace_seq *s, int val)
+{
+ const char states[] = "SDTtZXxW";
+ int found = 0;
+ int i;
+
+ for (i = 0; i < (sizeof(states) - 1); i++) {
+ if (!(val & (1 << i)))
+ continue;
+
+ if (found)
+ trace_seq_putc(s, '|');
+
+ found = 1;
+ trace_seq_putc(s, states[i]);
+ }
+
+ if (!found)
+ trace_seq_putc(s, 'R');
+}
+
+static void write_and_save_comm(struct format_field *field,
+ struct pevent_record *record,
+ struct trace_seq *s, int pid)
+{
+ const char *comm;
+ int len;
+
+ comm = (char *)(record->data + field->offset);
+ len = s->len;
+ trace_seq_printf(s, "%.*s",
+ field->size, comm);
+
+ /* make sure the comm has a \0 at the end. */
+ trace_seq_terminate(s);
+ comm = &s->buffer[len];
+
+ /* Help map the comm to its pid; this also handles duplicates */
+ pevent_register_comm(field->event->pevent, comm, pid);
+}
+
+static int sched_wakeup_handler(struct trace_seq *s,
+ struct pevent_record *record,
+ struct event_format *event, void *context)
+{
+ struct format_field *field;
+ unsigned long long val;
+
+ if (pevent_get_field_val(s, event, "pid", record, &val, 1))
+ return trace_seq_putc(s, '!');
+
+ field = pevent_find_any_field(event, "comm");
+ if (field) {
+ write_and_save_comm(field, record, s, val);
+ trace_seq_putc(s, ':');
+ }
+ trace_seq_printf(s, "%lld", val);
+
+ if (pevent_get_field_val(s, event, "prio", record, &val, 0) == 0)
+ trace_seq_printf(s, " [%lld]", val);
+
+ if (pevent_get_field_val(s, event, "success", record, &val, 1) == 0)
+ trace_seq_printf(s, " success=%lld", val);
+
+ if (pevent_get_field_val(s, event, "target_cpu", record, &val, 0) == 0)
+ trace_seq_printf(s, " CPU:%03llu", val);
+
+ return 0;
+}
+
+static int sched_switch_handler(struct trace_seq *s,
+ struct pevent_record *record,
+ struct event_format *event, void *context)
+{
+ struct format_field *field;
+ unsigned long long val;
+
+ if (pevent_get_field_val(s, event, "prev_pid", record, &val, 1))
+ return trace_seq_putc(s, '!');
+
+ field = pevent_find_any_field(event, "prev_comm");
+ if (field) {
+ write_and_save_comm(field, record, s, val);
+ trace_seq_putc(s, ':');
+ }
+ trace_seq_printf(s, "%lld ", val);
+
+ if (pevent_get_field_val(s, event, "prev_prio", record, &val, 0) == 0)
+ trace_seq_printf(s, "[%lld] ", val);
+
+ if (pevent_get_field_val(s, event, "prev_state", record, &val, 0) == 0)
+ write_state(s, val);
+
+ trace_seq_puts(s, " ==> ");
+
+ if (pevent_get_field_val(s, event, "next_pid", record, &val, 1))
+ return trace_seq_putc(s, '!');
+
+ field = pevent_find_any_field(event, "next_comm");
+ if (field) {
+ write_and_save_comm(field, record, s, val);
+ trace_seq_putc(s, ':');
+ }
+ trace_seq_printf(s, "%lld", val);
+
+ if (pevent_get_field_val(s, event, "next_prio", record, &val, 0) == 0)
+ trace_seq_printf(s, " [%lld]", val);
+
+ return 0;
+}
+
+int PEVENT_PLUGIN_LOADER(struct pevent *pevent)
+{
+ pevent_register_event_handler(pevent, -1, "sched", "sched_switch",
+ sched_switch_handler, NULL);
+
+ pevent_register_event_handler(pevent, -1, "sched", "sched_wakeup",
+ sched_wakeup_handler, NULL);
+
+ pevent_register_event_handler(pevent, -1, "sched", "sched_wakeup_new",
+ sched_wakeup_handler, NULL);
+ return 0;
+}
+
+void PEVENT_PLUGIN_UNLOADER(struct pevent *pevent)
+{
+ pevent_unregister_event_handler(pevent, -1, "sched", "sched_switch",
+ sched_switch_handler, NULL);
+
+ pevent_unregister_event_handler(pevent, -1, "sched", "sched_wakeup",
+ sched_wakeup_handler, NULL);
+
+ pevent_unregister_event_handler(pevent, -1, "sched", "sched_wakeup_new",
+ sched_wakeup_handler, NULL);
+}
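
As a side note (not part of the patch above): write_state() maps the prev_state bitmask onto the familiar one-letter task states, one bit per character of "SDTtZXxW", falling back to 'R' (runnable) when no bit is set. A small self-contained sketch of the same decoding, with made-up state values for illustration:

    #include <stdio.h>

    static void print_state(unsigned int val)
    {
    	const char states[] = "SDTtZXxW";
    	int found = 0, i;

    	for (i = 0; i < (int)(sizeof(states) - 1); i++) {
    		if (!(val & (1 << i)))
    			continue;
    		if (found)
    			putchar('|');	/* several bits set: separate with '|' */
    		found = 1;
    		putchar(states[i]);	/* bit 0 -> S, bit 1 -> D, ... */
    	}
    	if (!found)
    		putchar('R');		/* no bit set: task is runnable */
    	putchar('\n');
    }

    int main(void)
    {
    	print_state(0);	/* R */
    	print_state(1);	/* S (interruptible sleep) */
    	print_state(2);	/* D (uninterruptible sleep) */
    	return 0;
    }
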
diff --git a/tools/lib/traceevent/plugin_scsi.c b/tools/lib/traceevent/plugin_scsi.c
new file mode 100644
index 00000000000..eda326fc862
--- /dev/null
+++ b/tools/lib/traceevent/plugin_scsi.c
@@ -0,0 +1,429 @@
+#include <stdio.h>
+#include <string.h>
+#include <inttypes.h>
+#include "event-parse.h"
+
+typedef unsigned long sector_t;
+typedef uint64_t u64;
+typedef unsigned int u32;
+
+/*
+ * SCSI opcodes
+ */
+#define TEST_UNIT_READY 0x00
+#define REZERO_UNIT 0x01
+#define REQUEST_SENSE 0x03
+#define FORMAT_UNIT 0x04
+#define READ_BLOCK_LIMITS 0x05
+#define REASSIGN_BLOCKS 0x07
+#define INITIALIZE_ELEMENT_STATUS 0x07
+#define READ_6 0x08
+#define WRITE_6 0x0a
+#define SEEK_6 0x0b
+#define READ_REVERSE 0x0f
+#define WRITE_FILEMARKS 0x10
+#define SPACE 0x11
+#define INQUIRY 0x12
+#define RECOVER_BUFFERED_DATA 0x14
+#define MODE_SELECT 0x15
+#define RESERVE 0x16
+#define RELEASE 0x17
+#define COPY 0x18
+#define ERASE 0x19
+#define MODE_SENSE 0x1a
+#define START_STOP 0x1b
+#define RECEIVE_DIAGNOSTIC 0x1c
+#define SEND_DIAGNOSTIC 0x1d
+#define ALLOW_MEDIUM_REMOVAL 0x1e
+
+#define READ_FORMAT_CAPACITIES 0x23
+#define SET_WINDOW 0x24
+#define READ_CAPACITY 0x25
+#define READ_10 0x28
+#define WRITE_10 0x2a
+#define SEEK_10 0x2b
+#define POSITION_TO_ELEMENT 0x2b
+#define WRITE_VERIFY 0x2e
+#define VERIFY 0x2f
+#define SEARCH_HIGH 0x30
+#define SEARCH_EQUAL 0x31
+#define SEARCH_LOW 0x32
+#define SET_LIMITS 0x33
+#define PRE_FETCH 0x34
+#define READ_POSITION 0x34
+#define SYNCHRONIZE_CACHE 0x35
+#define LOCK_UNLOCK_CACHE 0x36
+#define READ_DEFECT_DATA 0x37
+#define MEDIUM_SCAN 0x38
+#define COMPARE 0x39
+#define COPY_VERIFY 0x3a
+#define WRITE_BUFFER 0x3b
+#define READ_BUFFER 0x3c
+#define UPDATE_BLOCK 0x3d
+#define READ_LONG 0x3e
+#define WRITE_LONG 0x3f
+#define CHANGE_DEFINITION 0x40
+#define WRITE_SAME 0x41
+#define UNMAP 0x42
+#define READ_TOC 0x43
+#define READ_HEADER 0x44
+#define GET_EVENT_STATUS_NOTIFICATION 0x4a
+#define LOG_SELECT 0x4c
+#define LOG_SENSE 0x4d
+#define XDWRITEREAD_10 0x53
+#define MODE_SELECT_10 0x55
+#define RESERVE_10 0x56
+#define RELEASE_10 0x57
+#define MODE_SENSE_10 0x5a
+#define PERSISTENT_RESERVE_IN 0x5e
+#define PERSISTENT_RESERVE_OUT 0x5f
+#define VARIABLE_LENGTH_CMD 0x7f
+#define REPORT_LUNS 0xa0
+#define SECURITY_PROTOCOL_IN 0xa2
+#define MAINTENANCE_IN 0xa3
+#define MAINTENANCE_OUT 0xa4
+#define MOVE_MEDIUM 0xa5
+#define EXCHANGE_MEDIUM 0xa6
+#define READ_12 0xa8
+#define WRITE_12 0xaa
+#define READ_MEDIA_SERIAL_NUMBER 0xab
+#define WRITE_VERIFY_12 0xae
+#define VERIFY_12 0xaf
+#define SEARCH_HIGH_12 0xb0
+#define SEARCH_EQUAL_12 0xb1
+#define SEARCH_LOW_12 0xb2
+#define SECURITY_PROTOCOL_OUT 0xb5
+#define READ_ELEMENT_STATUS 0xb8
+#define SEND_VOLUME_TAG 0xb6
+#define WRITE_LONG_2 0xea
+#define EXTENDED_COPY 0x83
+#define RECEIVE_COPY_RESULTS 0x84
+#define ACCESS_CONTROL_IN 0x86
+#define ACCESS_CONTROL_OUT 0x87
+#define READ_16 0x88
+#define WRITE_16 0x8a
+#define READ_ATTRIBUTE 0x8c
+#define WRITE_ATTRIBUTE 0x8d
+#define VERIFY_16 0x8f
+#define SYNCHRONIZE_CACHE_16 0x91
+#define WRITE_SAME_16 0x93
+#define SERVICE_ACTION_IN 0x9e
+/* values for service action in */
+#define SAI_READ_CAPACITY_16 0x10
+#define SAI_GET_LBA_STATUS 0x12
+/* values for VARIABLE_LENGTH_CMD service action codes
+ * see spc4r17 Section D.3.5, table D.7 and D.8 */
+#define VLC_SA_RECEIVE_CREDENTIAL 0x1800
+/* values for maintenance in */
+#define MI_REPORT_IDENTIFYING_INFORMATION 0x05
+#define MI_REPORT_TARGET_PGS 0x0a
+#define MI_REPORT_ALIASES 0x0b
+#define MI_REPORT_SUPPORTED_OPERATION_CODES 0x0c
+#define MI_REPORT_SUPPORTED_TASK_MANAGEMENT_FUNCTIONS 0x0d
+#define MI_REPORT_PRIORITY 0x0e
+#define MI_REPORT_TIMESTAMP 0x0f
+#define MI_MANAGEMENT_PROTOCOL_IN 0x10
+/* value for MI_REPORT_TARGET_PGS ext header */
+#define MI_EXT_HDR_PARAM_FMT 0x20
+/* values for maintenance out */
+#define MO_SET_IDENTIFYING_INFORMATION 0x06
+#define MO_SET_TARGET_PGS 0x0a
+#define MO_CHANGE_ALIASES 0x0b
+#define MO_SET_PRIORITY 0x0e
+#define MO_SET_TIMESTAMP 0x0f
+#define MO_MANAGEMENT_PROTOCOL_OUT 0x10
+/* values for variable length command */
+#define XDREAD_32 0x03
+#define XDWRITE_32 0x04
+#define XPWRITE_32 0x06
+#define XDWRITEREAD_32 0x07
+#define READ_32 0x09
+#define VERIFY_32 0x0a
+#define WRITE_32 0x0b
+#define WRITE_SAME_32 0x0d
+
+#define SERVICE_ACTION16(cdb) (cdb[1] & 0x1f)
+#define SERVICE_ACTION32(cdb) ((cdb[8] << 8) | cdb[9])
+
+static const char *
+scsi_trace_misc(struct trace_seq *, unsigned char *, int);
+
+static const char *
+scsi_trace_rw6(struct trace_seq *p, unsigned char *cdb, int len)
+{
+ const char *ret = p->buffer + p->len;
+ sector_t lba = 0, txlen = 0;
+
+ lba |= ((cdb[1] & 0x1F) << 16);
+ lba |= (cdb[2] << 8);
+ lba |= cdb[3];
+ txlen = cdb[4];
+
+ trace_seq_printf(p, "lba=%llu txlen=%llu",
+ (unsigned long long)lba, (unsigned long long)txlen);
+ trace_seq_putc(p, 0);
+ return ret;
+}
+
+static const char *
+scsi_trace_rw10(struct trace_seq *p, unsigned char *cdb, int len)
+{
+ const char *ret = p->buffer + p->len;
+ sector_t lba = 0, txlen = 0;
+
+ lba |= (cdb[2] << 24);
+ lba |= (cdb[3] << 16);
+ lba |= (cdb[4] << 8);
+ lba |= cdb[5];
+ txlen |= (cdb[7] << 8);
+ txlen |= cdb[8];
+
+ trace_seq_printf(p, "lba=%llu txlen=%llu protect=%u",
+ (unsigned long long)lba, (unsigned long long)txlen,
+ cdb[1] >> 5);
+
+ if (cdb[0] == WRITE_SAME)
+ trace_seq_printf(p, " unmap=%u", cdb[1] >> 3 & 1);
+
+ trace_seq_putc(p, 0);
+ return ret;
+}
+
+static const char *
+scsi_trace_rw12(struct trace_seq *p, unsigned char *cdb, int len)
+{
+ const char *ret = p->buffer + p->len;
+ sector_t lba = 0, txlen = 0;
+
+ lba |= (cdb[2] << 24);
+ lba |= (cdb[3] << 16);
+ lba |= (cdb[4] << 8);
+ lba |= cdb[5];
+ txlen |= (cdb[6] << 24);
+ txlen |= (cdb[7] << 16);
+ txlen |= (cdb[8] << 8);
+ txlen |= cdb[9];
+
+ trace_seq_printf(p, "lba=%llu txlen=%llu protect=%u",
+ (unsigned long long)lba, (unsigned long long)txlen,
+ cdb[1] >> 5);
+ trace_seq_putc(p, 0);
+ return ret;
+}
+
+static const char *
+scsi_trace_rw16(struct trace_seq *p, unsigned char *cdb, int len)
+{
+ const char *ret = p->buffer + p->len;
+ sector_t lba = 0, txlen = 0;
+
+ lba |= ((u64)cdb[2] << 56);
+ lba |= ((u64)cdb[3] << 48);
+ lba |= ((u64)cdb[4] << 40);
+ lba |= ((u64)cdb[5] << 32);
+ lba |= (cdb[6] << 24);
+ lba |= (cdb[7] << 16);
+ lba |= (cdb[8] << 8);
+ lba |= cdb[9];
+ txlen |= (cdb[10] << 24);
+ txlen |= (cdb[11] << 16);
+ txlen |= (cdb[12] << 8);
+ txlen |= cdb[13];
+
+ trace_seq_printf(p, "lba=%llu txlen=%llu protect=%u",
+ (unsigned long long)lba, (unsigned long long)txlen,
+ cdb[1] >> 5);
+
+ if (cdb[0] == WRITE_SAME_16)
+ trace_seq_printf(p, " unmap=%u", cdb[1] >> 3 & 1);
+
+ trace_seq_putc(p, 0);
+ return ret;
+}
+
+static const char *
+scsi_trace_rw32(struct trace_seq *p, unsigned char *cdb, int len)
+{
+ const char *ret = p->buffer + p->len, *cmd;
+ sector_t lba = 0, txlen = 0;
+ u32 ei_lbrt = 0;
+
+ switch (SERVICE_ACTION32(cdb)) {
+ case READ_32:
+ cmd = "READ";
+ break;
+ case VERIFY_32:
+ cmd = "VERIFY";
+ break;
+ case WRITE_32:
+ cmd = "WRITE";
+ break;
+ case WRITE_SAME_32:
+ cmd = "WRITE_SAME";
+ break;
+ default:
+ trace_seq_printf(p, "UNKNOWN");
+ goto out;
+ }
+
+ lba |= ((u64)cdb[12] << 56);
+ lba |= ((u64)cdb[13] << 48);
+ lba |= ((u64)cdb[14] << 40);
+ lba |= ((u64)cdb[15] << 32);
+ lba |= (cdb[16] << 24);
+ lba |= (cdb[17] << 16);
+ lba |= (cdb[18] << 8);
+ lba |= cdb[19];
+ ei_lbrt |= (cdb[20] << 24);
+ ei_lbrt |= (cdb[21] << 16);
+ ei_lbrt |= (cdb[22] << 8);
+ ei_lbrt |= cdb[23];
+ txlen |= (cdb[28] << 24);
+ txlen |= (cdb[29] << 16);
+ txlen |= (cdb[30] << 8);
+ txlen |= cdb[31];
+
+ trace_seq_printf(p, "%s_32 lba=%llu txlen=%llu protect=%u ei_lbrt=%u",
+ cmd, (unsigned long long)lba,
+ (unsigned long long)txlen, cdb[10] >> 5, ei_lbrt);
+
+ if (SERVICE_ACTION32(cdb) == WRITE_SAME_32)
+ trace_seq_printf(p, " unmap=%u", cdb[10] >> 3 & 1);
+
+out:
+ trace_seq_putc(p, 0);
+ return ret;
+}
+
+static const char *
+scsi_trace_unmap(struct trace_seq *p, unsigned char *cdb, int len)
+{
+ const char *ret = p->buffer + p->len;
+ unsigned int regions = cdb[7] << 8 | cdb[8];
+
+ trace_seq_printf(p, "regions=%u", (regions - 8) / 16);
+ trace_seq_putc(p, 0);
+ return ret;
+}
+
+static const char *
+scsi_trace_service_action_in(struct trace_seq *p, unsigned char *cdb, int len)
+{
+ const char *ret = p->buffer + p->len, *cmd;
+ sector_t lba = 0;
+ u32 alloc_len = 0;
+
+ switch (SERVICE_ACTION16(cdb)) {
+ case SAI_READ_CAPACITY_16:
+ cmd = "READ_CAPACITY_16";
+ break;
+ case SAI_GET_LBA_STATUS:
+ cmd = "GET_LBA_STATUS";
+ break;
+ default:
+ trace_seq_printf(p, "UNKNOWN");
+ goto out;
+ }
+
+ lba |= ((u64)cdb[2] << 56);
+ lba |= ((u64)cdb[3] << 48);
+ lba |= ((u64)cdb[4] << 40);
+ lba |= ((u64)cdb[5] << 32);
+ lba |= (cdb[6] << 24);
+ lba |= (cdb[7] << 16);
+ lba |= (cdb[8] << 8);
+ lba |= cdb[9];
+ alloc_len |= (cdb[10] << 24);
+ alloc_len |= (cdb[11] << 16);
+ alloc_len |= (cdb[12] << 8);
+ alloc_len |= cdb[13];
+
+ trace_seq_printf(p, "%s lba=%llu alloc_len=%u", cmd,
+ (unsigned long long)lba, alloc_len);
+
+out:
+ trace_seq_putc(p, 0);
+ return ret;
+}
+
+static const char *
+scsi_trace_varlen(struct trace_seq *p, unsigned char *cdb, int len)
+{
+ switch (SERVICE_ACTION32(cdb)) {
+ case READ_32:
+ case VERIFY_32:
+ case WRITE_32:
+ case WRITE_SAME_32:
+ return scsi_trace_rw32(p, cdb, len);
+ default:
+ return scsi_trace_misc(p, cdb, len);
+ }
+}
+
+static const char *
+scsi_trace_misc(struct trace_seq *p, unsigned char *cdb, int len)
+{
+ const char *ret = p->buffer + p->len;
+
+ trace_seq_printf(p, "-");
+ trace_seq_putc(p, 0);
+ return ret;
+}
+
+const char *
+scsi_trace_parse_cdb(struct trace_seq *p, unsigned char *cdb, int len)
+{
+ switch (cdb[0]) {
+ case READ_6:
+ case WRITE_6:
+ return scsi_trace_rw6(p, cdb, len);
+ case READ_10:
+ case VERIFY:
+ case WRITE_10:
+ case WRITE_SAME:
+ return scsi_trace_rw10(p, cdb, len);
+ case READ_12:
+ case VERIFY_12:
+ case WRITE_12:
+ return scsi_trace_rw12(p, cdb, len);
+ case READ_16:
+ case VERIFY_16:
+ case WRITE_16:
+ case WRITE_SAME_16:
+ return scsi_trace_rw16(p, cdb, len);
+ case UNMAP:
+ return scsi_trace_unmap(p, cdb, len);
+ case SERVICE_ACTION_IN:
+ return scsi_trace_service_action_in(p, cdb, len);
+ case VARIABLE_LENGTH_CMD:
+ return scsi_trace_varlen(p, cdb, len);
+ default:
+ return scsi_trace_misc(p, cdb, len);
+ }
+}
+
+unsigned long long process_scsi_trace_parse_cdb(struct trace_seq *s,
+ unsigned long long *args)
+{
+ scsi_trace_parse_cdb(s, (unsigned char *) (unsigned long) args[1], args[2]);
+ return 0;
+}
+
+int PEVENT_PLUGIN_LOADER(struct pevent *pevent)
+{
+ pevent_register_print_function(pevent,
+ process_scsi_trace_parse_cdb,
+ PEVENT_FUNC_ARG_STRING,
+ "scsi_trace_parse_cdb",
+ PEVENT_FUNC_ARG_PTR,
+ PEVENT_FUNC_ARG_PTR,
+ PEVENT_FUNC_ARG_INT,
+ PEVENT_FUNC_ARG_VOID);
+ return 0;
+}
+
+void PEVENT_PLUGIN_UNLOADER(struct pevent *pevent)
+{
+ pevent_unregister_print_function(pevent, process_scsi_trace_parse_cdb,
+ "scsi_trace_parse_cdb");
+}
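
As a side note (not part of the patch above): the scsi_trace_rw*() helpers simply reassemble big-endian CDB fields byte by byte; for READ(10), for instance, the LBA occupies bytes 2-5 and the transfer length bytes 7-8. A standalone sketch of that decoding for a hypothetical CDB (the byte values are invented for illustration):

    #include <stdio.h>

    int main(void)
    {
    	/* Hypothetical READ(10) CDB: opcode 0x28, LBA 0x12345678, txlen 0x10 */
    	unsigned char cdb[10] = { 0x28, 0x00, 0x12, 0x34, 0x56, 0x78,
    				  0x00, 0x00, 0x10, 0x00 };
    	unsigned long long lba = 0, txlen = 0;

    	lba |= ((unsigned long long)cdb[2] << 24);	/* big-endian bytes 2..5 */
    	lba |= (cdb[3] << 16);
    	lba |= (cdb[4] << 8);
    	lba |= cdb[5];
    	txlen |= (cdb[7] << 8);				/* big-endian bytes 7..8 */
    	txlen |= cdb[8];

    	printf("lba=%llu txlen=%llu protect=%u\n", lba, txlen, cdb[1] >> 5);
    	return 0;
    }
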
diff --git a/tools/lib/traceevent/plugin_xen.c b/tools/lib/traceevent/plugin_xen.c
new file mode 100644
index 00000000000..3a413eaada6
--- /dev/null
+++ b/tools/lib/traceevent/plugin_xen.c
@@ -0,0 +1,136 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "event-parse.h"
+
+#define __HYPERVISOR_set_trap_table 0
+#define __HYPERVISOR_mmu_update 1
+#define __HYPERVISOR_set_gdt 2
+#define __HYPERVISOR_stack_switch 3
+#define __HYPERVISOR_set_callbacks 4
+#define __HYPERVISOR_fpu_taskswitch 5
+#define __HYPERVISOR_sched_op_compat 6
+#define __HYPERVISOR_dom0_op 7
+#define __HYPERVISOR_set_debugreg 8
+#define __HYPERVISOR_get_debugreg 9
+#define __HYPERVISOR_update_descriptor 10
+#define __HYPERVISOR_memory_op 12
+#define __HYPERVISOR_multicall 13
+#define __HYPERVISOR_update_va_mapping 14
+#define __HYPERVISOR_set_timer_op 15
+#define __HYPERVISOR_event_channel_op_compat 16
+#define __HYPERVISOR_xen_version 17
+#define __HYPERVISOR_console_io 18
+#define __HYPERVISOR_physdev_op_compat 19
+#define __HYPERVISOR_grant_table_op 20
+#define __HYPERVISOR_vm_assist 21
+#define __HYPERVISOR_update_va_mapping_otherdomain 22
+#define __HYPERVISOR_iret 23 /* x86 only */
+#define __HYPERVISOR_vcpu_op 24
+#define __HYPERVISOR_set_segment_base 25 /* x86/64 only */
+#define __HYPERVISOR_mmuext_op 26
+#define __HYPERVISOR_acm_op 27
+#define __HYPERVISOR_nmi_op 28
+#define __HYPERVISOR_sched_op 29
+#define __HYPERVISOR_callback_op 30
+#define __HYPERVISOR_xenoprof_op 31
+#define __HYPERVISOR_event_channel_op 32
+#define __HYPERVISOR_physdev_op 33
+#define __HYPERVISOR_hvm_op 34
+#define __HYPERVISOR_tmem_op 38
+
+/* Architecture-specific hypercall definitions. */
+#define __HYPERVISOR_arch_0 48
+#define __HYPERVISOR_arch_1 49
+#define __HYPERVISOR_arch_2 50
+#define __HYPERVISOR_arch_3 51
+#define __HYPERVISOR_arch_4 52
+#define __HYPERVISOR_arch_5 53
+#define __HYPERVISOR_arch_6 54
+#define __HYPERVISOR_arch_7 55
+
+#define N(x) [__HYPERVISOR_##x] = "("#x")"
+static const char *xen_hypercall_names[] = {
+ N(set_trap_table),
+ N(mmu_update),
+ N(set_gdt),
+ N(stack_switch),
+ N(set_callbacks),
+ N(fpu_taskswitch),
+ N(sched_op_compat),
+ N(dom0_op),
+ N(set_debugreg),
+ N(get_debugreg),
+ N(update_descriptor),
+ N(memory_op),
+ N(multicall),
+ N(update_va_mapping),
+ N(set_timer_op),
+ N(event_channel_op_compat),
+ N(xen_version),
+ N(console_io),
+ N(physdev_op_compat),
+ N(grant_table_op),
+ N(vm_assist),
+ N(update_va_mapping_otherdomain),
+ N(iret),
+ N(vcpu_op),
+ N(set_segment_base),
+ N(mmuext_op),
+ N(acm_op),
+ N(nmi_op),
+ N(sched_op),
+ N(callback_op),
+ N(xenoprof_op),
+ N(event_channel_op),
+ N(physdev_op),
+ N(hvm_op),
+
+/* Architecture-specific hypercall definitions. */
+ N(arch_0),
+ N(arch_1),
+ N(arch_2),
+ N(arch_3),
+ N(arch_4),
+ N(arch_5),
+ N(arch_6),
+ N(arch_7),
+};
+#undef N
+
+#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))
+
+static const char *xen_hypercall_name(unsigned op)
+{
+ if (op < ARRAY_SIZE(xen_hypercall_names) &&
+ xen_hypercall_names[op] != NULL)
+ return xen_hypercall_names[op];
+
+ return "";
+}
+
+unsigned long long process_xen_hypercall_name(struct trace_seq *s,
+ unsigned long long *args)
+{
+ unsigned int op = args[0];
+
+ trace_seq_printf(s, "%s", xen_hypercall_name(op));
+ return 0;
+}
+
+int PEVENT_PLUGIN_LOADER(struct pevent *pevent)
+{
+ pevent_register_print_function(pevent,
+ process_xen_hypercall_name,
+ PEVENT_FUNC_ARG_STRING,
+ "xen_hypercall_name",
+ PEVENT_FUNC_ARG_INT,
+ PEVENT_FUNC_ARG_VOID);
+ return 0;
+}
+
+void PEVENT_PLUGIN_UNLOADER(struct pevent *pevent)
+{
+ pevent_unregister_print_function(pevent, process_xen_hypercall_name,
+ "xen_hypercall_name");
+}
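
As a side note (not part of the patch above): the N() macro builds a sparse, designated-initializer table, so hypercall numbers without a #define (11, or 35-37, for example) simply stay NULL; that is why xen_hypercall_name() checks both the array bound and the entry before returning. A tiny sketch of the same lookup pattern with a made-up table:

    #include <stdio.h>

    #define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]))

    static const char *names[] = {
    	[10] = "(update_descriptor)",
    	/* index 11 is intentionally left out and stays NULL */
    	[12] = "(memory_op)",
    };

    static const char *name_of(unsigned int op)
    {
    	if (op < ARRAY_SIZE(names) && names[op] != NULL)
    		return names[op];
    	return "";	/* unknown or unnamed op */
    }

    int main(void)
    {
    	printf("10 -> %s\n", name_of(10));	/* (update_descriptor) */
    	printf("11 -> '%s'\n", name_of(11));	/* '' (gap in the table) */
    	printf("99 -> '%s'\n", name_of(99));	/* '' (out of range) */
    	return 0;
    }
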
diff --git a/tools/lib/traceevent/trace-seq.c b/tools/lib/traceevent/trace-seq.c
new file mode 100644
index 00000000000..ec3bd16a548
--- /dev/null
+++ b/tools/lib/traceevent/trace-seq.c
@@ -0,0 +1,249 @@
+/*
+ * Copyright (C) 2009 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation;
+ * version 2.1 of the License (not later!)
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this program; if not, see <http://www.gnu.org/licenses>
+ *
+ * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+
+#include <asm/bug.h>
+#include "event-parse.h"
+#include "event-utils.h"
+
+/*
+ * TRACE_SEQ_POISON is used to catch any use of
+ * a trace_seq structure after it has been destroyed.
+ */
+#define TRACE_SEQ_POISON ((void *)0xdeadbeef)
+#define TRACE_SEQ_CHECK(s) \
+do { \
+ if (WARN_ONCE((s)->buffer == TRACE_SEQ_POISON, \
+ "Usage of trace_seq after it was destroyed")) \
+ (s)->state = TRACE_SEQ__BUFFER_POISONED; \
+} while (0)
+
+#define TRACE_SEQ_CHECK_RET_N(s, n) \
+do { \
+ TRACE_SEQ_CHECK(s); \
+ if ((s)->state != TRACE_SEQ__GOOD) \
+ return n; \
+} while (0)
+
+#define TRACE_SEQ_CHECK_RET(s) TRACE_SEQ_CHECK_RET_N(s, )
+#define TRACE_SEQ_CHECK_RET0(s) TRACE_SEQ_CHECK_RET_N(s, 0)
+
+/**
+ * trace_seq_init - initialize the trace_seq structure
+ * @s: a pointer to the trace_seq structure to initialize
+ */
+void trace_seq_init(struct trace_seq *s)
+{
+ s->len = 0;
+ s->readpos = 0;
+ s->buffer_size = TRACE_SEQ_BUF_SIZE;
+ s->buffer = malloc(s->buffer_size);
+ if (s->buffer != NULL)
+ s->state = TRACE_SEQ__GOOD;
+ else
+ s->state = TRACE_SEQ__MEM_ALLOC_FAILED;
+}
+
+/**
+ * trace_seq_reset - re-initialize the trace_seq structure
+ * @s: a pointer to the trace_seq structure to reset
+ */
+void trace_seq_reset(struct trace_seq *s)
+{
+ if (!s)
+ return;
+ TRACE_SEQ_CHECK(s);
+ s->len = 0;
+ s->readpos = 0;
+}
+
+/**
+ * trace_seq_destroy - free up memory of a trace_seq
+ * @s: a pointer to the trace_seq to free the buffer
+ *
+ * Only frees the buffer, not the trace_seq struct itself.
+ */
+void trace_seq_destroy(struct trace_seq *s)
+{
+ if (!s)
+ return;
+ TRACE_SEQ_CHECK_RET(s);
+ free(s->buffer);
+ s->buffer = TRACE_SEQ_POISON;
+}
+
+static void expand_buffer(struct trace_seq *s)
+{
+ char *buf;
+
+ buf = realloc(s->buffer, s->buffer_size + TRACE_SEQ_BUF_SIZE);
+ if (WARN_ONCE(!buf, "Can't allocate trace_seq buffer memory")) {
+ s->state = TRACE_SEQ__MEM_ALLOC_FAILED;
+ return;
+ }
+
+ s->buffer = buf;
+ s->buffer_size += TRACE_SEQ_BUF_SIZE;
+}
+
+/**
+ * trace_seq_printf - sequence printing of trace information
+ * @s: trace sequence descriptor
+ * @fmt: printf format string
+ *
+ * It returns 0 if the trace_seq is not in a usable state (for
+ * example if its buffer could not be allocated), 1 otherwise.
+ *
+ * The tracer may use either sequence operations or its own
+ * copy to user routines. To simplify formatting of a trace,
+ * trace_seq_printf is used to store strings into a special
+ * buffer (@s). Then the output may be either used by
+ * the sequencer or pulled into another buffer.
+ */
+int
+trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
+{
+ va_list ap;
+ int len;
+ int ret;
+
+ try_again:
+ TRACE_SEQ_CHECK_RET0(s);
+
+ len = (s->buffer_size - 1) - s->len;
+
+ va_start(ap, fmt);
+ ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
+ va_end(ap);
+
+ if (ret >= len) {
+ expand_buffer(s);
+ goto try_again;
+ }
+
+ s->len += ret;
+
+ return 1;
+}
+
+/**
+ * trace_seq_vprintf - sequence printing of trace information
+ * @s: trace sequence descriptor
+ * @fmt: printf format string
+ *
+ * The tracer may use either sequence operations or its own
+ * copy to user routines. To simplify formatting of a trace,
+ * trace_seq_printf is used to store strings into a special
+ * buffer (@s). Then the output may be either used by
+ * the sequencer or pulled into another buffer.
+ */
+int
+trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
+{
+ int len;
+ int ret;
+
+ try_again:
+ TRACE_SEQ_CHECK_RET0(s);
+
+ len = (s->buffer_size - 1) - s->len;
+
+ ret = vsnprintf(s->buffer + s->len, len, fmt, args);
+
+ if (ret >= len) {
+ expand_buffer(s);
+ goto try_again;
+ }
+
+ s->len += ret;
+
+ return len;
+}
+
+/**
+ * trace_seq_puts - trace sequence printing of simple string
+ * @s: trace sequence descriptor
+ * @str: simple string to record
+ *
+ * The tracer may use either the sequence operations or its own
+ * copy to user routines. This function records a simple string
+ * into a special buffer (@s) for later retrieval by a sequencer
+ * or other mechanism.
+ */
+int trace_seq_puts(struct trace_seq *s, const char *str)
+{
+ int len;
+
+ TRACE_SEQ_CHECK_RET0(s);
+
+ len = strlen(str);
+
+ while (len > ((s->buffer_size - 1) - s->len))
+ expand_buffer(s);
+
+ TRACE_SEQ_CHECK_RET0(s);
+
+ memcpy(s->buffer + s->len, str, len);
+ s->len += len;
+
+ return len;
+}
+
+int trace_seq_putc(struct trace_seq *s, unsigned char c)
+{
+ TRACE_SEQ_CHECK_RET0(s);
+
+ while (s->len >= (s->buffer_size - 1))
+ expand_buffer(s);
+
+ TRACE_SEQ_CHECK_RET0(s);
+
+ s->buffer[s->len++] = c;
+
+ return 1;
+}
+
+void trace_seq_terminate(struct trace_seq *s)
+{
+ TRACE_SEQ_CHECK_RET(s);
+
+ /* There's always one byte left in the buffer for the terminating '\0' */
+ s->buffer[s->len] = 0;
+}
+
+int trace_seq_do_printf(struct trace_seq *s)
+{
+ TRACE_SEQ_CHECK(s);
+
+ switch (s->state) {
+ case TRACE_SEQ__GOOD:
+ return printf("%.*s", s->len, s->buffer);
+ case TRACE_SEQ__BUFFER_POISONED:
+ puts("Usage of trace_seq after it was destroyed");
+ break;
+ case TRACE_SEQ__MEM_ALLOC_FAILED:
+ puts("Can't allocate trace_seq buffer memory");
+ break;
+ }
+ return -1;
+}
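
As a side note (not part of the patch above): a minimal usage sketch of the trace_seq API defined in this file, assuming event-parse.h is on the include path and libtraceevent is linked. It exercises the normal init/print/terminate/destroy lifecycle; no pevent handle is needed for the sequence buffer itself:

    #include "event-parse.h"

    int main(void)
    {
    	struct trace_seq s;

    	trace_seq_init(&s);			/* allocates the first buffer */
    	trace_seq_printf(&s, "pid=%d", 1234);	/* grows the buffer on demand */
    	trace_seq_putc(&s, ' ');
    	trace_seq_puts(&s, "comm=bash\n");
    	trace_seq_terminate(&s);		/* ensure a trailing '\0' */

    	trace_seq_do_printf(&s);		/* dump the buffer to stdout */
    	trace_seq_destroy(&s);			/* frees and poisons the buffer */
    	return 0;
    }
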
diff --git a/tools/net/Makefile b/tools/net/Makefile
new file mode 100644
index 00000000000..ee577ea03ba
--- /dev/null
+++ b/tools/net/Makefile
@@ -0,0 +1,34 @@
+prefix = /usr
+
+CC = gcc
+LEX = flex
+YACC = bison
+
+%.yacc.c: %.y
+ $(YACC) -o $@ -d $<
+
+%.lex.c: %.l
+ $(LEX) -o $@ $<
+
+all : bpf_jit_disasm bpf_dbg bpf_asm
+
+bpf_jit_disasm : CFLAGS = -Wall -O2 -DPACKAGE='bpf_jit_disasm'
+bpf_jit_disasm : LDLIBS = -lopcodes -lbfd -ldl
+bpf_jit_disasm : bpf_jit_disasm.o
+
+bpf_dbg : CFLAGS = -Wall -O2
+bpf_dbg : LDLIBS = -lreadline
+bpf_dbg : bpf_dbg.o
+
+bpf_asm : CFLAGS = -Wall -O2 -I.
+bpf_asm : LDLIBS =
+bpf_asm : bpf_asm.o bpf_exp.yacc.o bpf_exp.lex.o
+bpf_exp.lex.o : bpf_exp.yacc.c
+
+clean :
+ rm -rf *.o bpf_jit_disasm bpf_dbg bpf_asm bpf_exp.yacc.* bpf_exp.lex.*
+
+install :
+ install bpf_jit_disasm $(prefix)/bin/bpf_jit_disasm
+ install bpf_dbg $(prefix)/bin/bpf_dbg
+ install bpf_asm $(prefix)/bin/bpf_asm
diff --git a/tools/net/bpf_asm.c b/tools/net/bpf_asm.c
new file mode 100644
index 00000000000..c15aef097b0
--- /dev/null
+++ b/tools/net/bpf_asm.c
@@ -0,0 +1,52 @@
+/*
+ * Minimal BPF assembler
+ *
+ * Instead of libpcap high-level filter expressions, it can be quite
+ * useful to define filters in low-level BPF assembler (that is kept
+ * close to Steven McCanne and Van Jacobson's original BPF paper),
+ * in particular for BPF JIT implementors, JIT security auditors, or
+ * for defining BPF expressions that contain extensions which are
+ * not supported by compilers.
+ *
+ * How to get into it:
+ *
+ * 1) read Documentation/networking/filter.txt
+ * 2) Run `bpf_asm [-c] <filter-prog file>` to translate it into a
+ * binary blob that is loadable with xt_bpf, cls_bpf et al. Note:
+ * -c will pretty-print the result as a C-like construct.
+ *
+ * Copyright 2013 Daniel Borkmann <borkmann@redhat.com>
+ * Licensed under the GNU General Public License, version 2.0 (GPLv2)
+ */
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+
+extern void bpf_asm_compile(FILE *fp, bool cstyle);
+
+int main(int argc, char **argv)
+{
+ FILE *fp = stdin;
+ bool cstyle = false;
+ int i;
+
+ for (i = 1; i < argc; i++) {
+ if (!strncmp("-c", argv[i], 2)) {
+ cstyle = true;
+ continue;
+ }
+
+ fp = fopen(argv[i], "r");
+ if (!fp) {
+ fp = stdin;
+ continue;
+ }
+
+ break;
+ }
+
+ bpf_asm_compile(fp, cstyle);
+
+ return 0;
+}
diff --git a/tools/net/bpf_dbg.c b/tools/net/bpf_dbg.c
new file mode 100644
index 00000000000..9a287bec695
--- /dev/null
+++ b/tools/net/bpf_dbg.c
@@ -0,0 +1,1395 @@
+/*
+ * Minimal BPF debugger
+ *
+ * Minimal BPF debugger that mimics the kernel's engine (w/o extensions)
+ * and allows for single stepping through selected packets from a pcap
+ * with a provided user filter in order to facilitate verification of a
+ * BPF program. Among other things, this is useful for verifying BPF
+ * programs before attaching them to a live system. Such filters are
+ * used by socket filters, cls_bpf, xt_bpf, the team driver and e.g.
+ * PTP code, in particular when a single, more complex BPF program is
+ * in use. A more complex program is typically the result of combining
+ * multiple simple BPF programs into one, so that the same headers do
+ * not have to be parsed multiple times to reach a verdict.
+ *
+ * For more on how to debug BPF opcodes, see Documentation/networking/filter.txt,
+ * which is the main document on BPF. Mini howto for getting started:
+ *
+ * 1) `./bpf_dbg` to enter the shell (shell cmds denoted with '>'):
+ * 2) > load bpf 6,40 0 0 12,21 0 3 20... (output from `bpf_asm` or
+ * `tcpdump -iem1 -ddd port 22 | tr '\n' ','` to load as filter)
+ * 3) > load pcap foo.pcap
+ * 4) > run <n>/disassemble/dump/quit (self-explanatory)
+ * 5) > breakpoint 2 (sets bp at loaded BPF insns 2, do `run` then;
+ * multiple bps can be set, of course, a call to `breakpoint`
+ * w/o args shows currently loaded bps, `breakpoint reset` for
+ * resetting all breakpoints)
+ * 6) > select 3 (`run` etc will start from the 3rd packet in the pcap)
+ * 7) > step [-<n>, +<n>] (performs single stepping through the BPF)
+ *
+ * Copyright 2013 Daniel Borkmann <borkmann@redhat.com>
+ * Licensed under the GNU General Public License, version 2.0 (GPLv2)
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <stdbool.h>
+#include <stdarg.h>
+#include <setjmp.h>
+#include <linux/filter.h>
+#include <linux/if_packet.h>
+#include <readline/readline.h>
+#include <readline/history.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/stat.h>
+#include <sys/mman.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <signal.h>
+#include <arpa/inet.h>
+#include <net/ethernet.h>
+
+#define TCPDUMP_MAGIC 0xa1b2c3d4
+
+#define BPF_LDX_B (BPF_LDX | BPF_B)
+#define BPF_LDX_W (BPF_LDX | BPF_W)
+#define BPF_JMP_JA (BPF_JMP | BPF_JA)
+#define BPF_JMP_JEQ (BPF_JMP | BPF_JEQ)
+#define BPF_JMP_JGT (BPF_JMP | BPF_JGT)
+#define BPF_JMP_JGE (BPF_JMP | BPF_JGE)
+#define BPF_JMP_JSET (BPF_JMP | BPF_JSET)
+#define BPF_ALU_ADD (BPF_ALU | BPF_ADD)
+#define BPF_ALU_SUB (BPF_ALU | BPF_SUB)
+#define BPF_ALU_MUL (BPF_ALU | BPF_MUL)
+#define BPF_ALU_DIV (BPF_ALU | BPF_DIV)
+#define BPF_ALU_MOD (BPF_ALU | BPF_MOD)
+#define BPF_ALU_NEG (BPF_ALU | BPF_NEG)
+#define BPF_ALU_AND (BPF_ALU | BPF_AND)
+#define BPF_ALU_OR (BPF_ALU | BPF_OR)
+#define BPF_ALU_XOR (BPF_ALU | BPF_XOR)
+#define BPF_ALU_LSH (BPF_ALU | BPF_LSH)
+#define BPF_ALU_RSH (BPF_ALU | BPF_RSH)
+#define BPF_MISC_TAX (BPF_MISC | BPF_TAX)
+#define BPF_MISC_TXA (BPF_MISC | BPF_TXA)
+#define BPF_LD_B (BPF_LD | BPF_B)
+#define BPF_LD_H (BPF_LD | BPF_H)
+#define BPF_LD_W (BPF_LD | BPF_W)
+
+#ifndef array_size
+# define array_size(x) (sizeof(x) / sizeof((x)[0]))
+#endif
+
+#ifndef __check_format_printf
+# define __check_format_printf(pos_fmtstr, pos_fmtargs) \
+ __attribute__ ((format (printf, (pos_fmtstr), (pos_fmtargs))))
+#endif
+
+enum {
+ CMD_OK,
+ CMD_ERR,
+ CMD_EX,
+};
+
+struct shell_cmd {
+ const char *name;
+ int (*func)(char *args);
+};
+
+struct pcap_filehdr {
+ uint32_t magic;
+ uint16_t version_major;
+ uint16_t version_minor;
+ int32_t thiszone;
+ uint32_t sigfigs;
+ uint32_t snaplen;
+ uint32_t linktype;
+};
+
+struct pcap_timeval {
+ int32_t tv_sec;
+ int32_t tv_usec;
+};
+
+struct pcap_pkthdr {
+ struct pcap_timeval ts;
+ uint32_t caplen;
+ uint32_t len;
+};
+
+struct bpf_regs {
+ uint32_t A;
+ uint32_t X;
+ uint32_t M[BPF_MEMWORDS];
+ uint32_t R;
+ bool Rs;
+ uint16_t Pc;
+};
+
+static struct sock_filter bpf_image[BPF_MAXINSNS + 1];
+static unsigned int bpf_prog_len = 0;
+
+static int bpf_breakpoints[64];
+static struct bpf_regs bpf_regs[BPF_MAXINSNS + 1];
+static struct bpf_regs bpf_curr;
+static unsigned int bpf_regs_len = 0;
+
+static int pcap_fd = -1;
+static unsigned int pcap_packet = 0;
+static size_t pcap_map_size = 0;
+static char *pcap_ptr_va_start, *pcap_ptr_va_curr;
+
+static const char * const op_table[] = {
+ [BPF_ST] = "st",
+ [BPF_STX] = "stx",
+ [BPF_LD_B] = "ldb",
+ [BPF_LD_H] = "ldh",
+ [BPF_LD_W] = "ld",
+ [BPF_LDX] = "ldx",
+ [BPF_LDX_B] = "ldxb",
+ [BPF_JMP_JA] = "ja",
+ [BPF_JMP_JEQ] = "jeq",
+ [BPF_JMP_JGT] = "jgt",
+ [BPF_JMP_JGE] = "jge",
+ [BPF_JMP_JSET] = "jset",
+ [BPF_ALU_ADD] = "add",
+ [BPF_ALU_SUB] = "sub",
+ [BPF_ALU_MUL] = "mul",
+ [BPF_ALU_DIV] = "div",
+ [BPF_ALU_MOD] = "mod",
+ [BPF_ALU_NEG] = "neg",
+ [BPF_ALU_AND] = "and",
+ [BPF_ALU_OR] = "or",
+ [BPF_ALU_XOR] = "xor",
+ [BPF_ALU_LSH] = "lsh",
+ [BPF_ALU_RSH] = "rsh",
+ [BPF_MISC_TAX] = "tax",
+ [BPF_MISC_TXA] = "txa",
+ [BPF_RET] = "ret",
+};
+
+static __check_format_printf(1, 2) int rl_printf(const char *fmt, ...)
+{
+ int ret;
+ va_list vl;
+
+ va_start(vl, fmt);
+ ret = vfprintf(rl_outstream, fmt, vl);
+ va_end(vl);
+
+ return ret;
+}
+
+static int matches(const char *cmd, const char *pattern)
+{
+ int len = strlen(cmd);
+
+ if (len > strlen(pattern))
+ return -1;
+
+ return memcmp(pattern, cmd, len);
+}
+
+static void hex_dump(const uint8_t *buf, size_t len)
+{
+ int i;
+
+ rl_printf("%3u: ", 0);
+ for (i = 0; i < len; i++) {
+ if (i && !(i % 16))
+ rl_printf("\n%3u: ", i);
+ rl_printf("%02x ", buf[i]);
+ }
+ rl_printf("\n");
+}
+
+static bool bpf_prog_loaded(void)
+{
+ if (bpf_prog_len == 0)
+ rl_printf("no bpf program loaded!\n");
+
+ return bpf_prog_len > 0;
+}
+
+static void bpf_disasm(const struct sock_filter f, unsigned int i)
+{
+ const char *op, *fmt;
+ int val = f.k;
+ char buf[256];
+
+ switch (f.code) {
+ case BPF_RET | BPF_K:
+ op = op_table[BPF_RET];
+ fmt = "#%#x";
+ break;
+ case BPF_RET | BPF_A:
+ op = op_table[BPF_RET];
+ fmt = "a";
+ break;
+ case BPF_RET | BPF_X:
+ op = op_table[BPF_RET];
+ fmt = "x";
+ break;
+ case BPF_MISC_TAX:
+ op = op_table[BPF_MISC_TAX];
+ fmt = "";
+ break;
+ case BPF_MISC_TXA:
+ op = op_table[BPF_MISC_TXA];
+ fmt = "";
+ break;
+ case BPF_ST:
+ op = op_table[BPF_ST];
+ fmt = "M[%d]";
+ break;
+ case BPF_STX:
+ op = op_table[BPF_STX];
+ fmt = "M[%d]";
+ break;
+ case BPF_LD_W | BPF_ABS:
+ op = op_table[BPF_LD_W];
+ fmt = "[%d]";
+ break;
+ case BPF_LD_H | BPF_ABS:
+ op = op_table[BPF_LD_H];
+ fmt = "[%d]";
+ break;
+ case BPF_LD_B | BPF_ABS:
+ op = op_table[BPF_LD_B];
+ fmt = "[%d]";
+ break;
+ case BPF_LD_W | BPF_LEN:
+ op = op_table[BPF_LD_W];
+ fmt = "#len";
+ break;
+ case BPF_LD_W | BPF_IND:
+ op = op_table[BPF_LD_W];
+ fmt = "[x+%d]";
+ break;
+ case BPF_LD_H | BPF_IND:
+ op = op_table[BPF_LD_H];
+ fmt = "[x+%d]";
+ break;
+ case BPF_LD_B | BPF_IND:
+ op = op_table[BPF_LD_B];
+ fmt = "[x+%d]";
+ break;
+ case BPF_LD | BPF_IMM:
+ op = op_table[BPF_LD_W];
+ fmt = "#%#x";
+ break;
+ case BPF_LDX | BPF_IMM:
+ op = op_table[BPF_LDX];
+ fmt = "#%#x";
+ break;
+ case BPF_LDX_B | BPF_MSH:
+ op = op_table[BPF_LDX_B];
+ fmt = "4*([%d]&0xf)";
+ break;
+ case BPF_LD | BPF_MEM:
+ op = op_table[BPF_LD_W];
+ fmt = "M[%d]";
+ break;
+ case BPF_LDX | BPF_MEM:
+ op = op_table[BPF_LDX];
+ fmt = "M[%d]";
+ break;
+ case BPF_JMP_JA:
+ op = op_table[BPF_JMP_JA];
+ fmt = "%d";
+ val = i + 1 + f.k;
+ break;
+ case BPF_JMP_JGT | BPF_X:
+ op = op_table[BPF_JMP_JGT];
+ fmt = "x";
+ break;
+ case BPF_JMP_JGT | BPF_K:
+ op = op_table[BPF_JMP_JGT];
+ fmt = "#%#x";
+ break;
+ case BPF_JMP_JGE | BPF_X:
+ op = op_table[BPF_JMP_JGE];
+ fmt = "x";
+ break;
+ case BPF_JMP_JGE | BPF_K:
+ op = op_table[BPF_JMP_JGE];
+ fmt = "#%#x";
+ break;
+ case BPF_JMP_JEQ | BPF_X:
+ op = op_table[BPF_JMP_JEQ];
+ fmt = "x";
+ break;
+ case BPF_JMP_JEQ | BPF_K:
+ op = op_table[BPF_JMP_JEQ];
+ fmt = "#%#x";
+ break;
+ case BPF_JMP_JSET | BPF_X:
+ op = op_table[BPF_JMP_JSET];
+ fmt = "x";
+ break;
+ case BPF_JMP_JSET | BPF_K:
+ op = op_table[BPF_JMP_JSET];
+ fmt = "#%#x";
+ break;
+ case BPF_ALU_NEG:
+ op = op_table[BPF_ALU_NEG];
+ fmt = "";
+ break;
+ case BPF_ALU_LSH | BPF_X:
+ op = op_table[BPF_ALU_LSH];
+ fmt = "x";
+ break;
+ case BPF_ALU_LSH | BPF_K:
+ op = op_table[BPF_ALU_LSH];
+ fmt = "#%d";
+ break;
+ case BPF_ALU_RSH | BPF_X:
+ op = op_table[BPF_ALU_RSH];
+ fmt = "x";
+ break;
+ case BPF_ALU_RSH | BPF_K:
+ op = op_table[BPF_ALU_RSH];
+ fmt = "#%d";
+ break;
+ case BPF_ALU_ADD | BPF_X:
+ op = op_table[BPF_ALU_ADD];
+ fmt = "x";
+ break;
+ case BPF_ALU_ADD | BPF_K:
+ op = op_table[BPF_ALU_ADD];
+ fmt = "#%d";
+ break;
+ case BPF_ALU_SUB | BPF_X:
+ op = op_table[BPF_ALU_SUB];
+ fmt = "x";
+ break;
+ case BPF_ALU_SUB | BPF_K:
+ op = op_table[BPF_ALU_SUB];
+ fmt = "#%d";
+ break;
+ case BPF_ALU_MUL | BPF_X:
+ op = op_table[BPF_ALU_MUL];
+ fmt = "x";
+ break;
+ case BPF_ALU_MUL | BPF_K:
+ op = op_table[BPF_ALU_MUL];
+ fmt = "#%d";
+ break;
+ case BPF_ALU_DIV | BPF_X:
+ op = op_table[BPF_ALU_DIV];
+ fmt = "x";
+ break;
+ case BPF_ALU_DIV | BPF_K:
+ op = op_table[BPF_ALU_DIV];
+ fmt = "#%d";
+ break;
+ case BPF_ALU_MOD | BPF_X:
+ op = op_table[BPF_ALU_MOD];
+ fmt = "x";
+ break;
+ case BPF_ALU_MOD | BPF_K:
+ op = op_table[BPF_ALU_MOD];
+ fmt = "#%d";
+ break;
+ case BPF_ALU_AND | BPF_X:
+ op = op_table[BPF_ALU_AND];
+ fmt = "x";
+ break;
+ case BPF_ALU_AND | BPF_K:
+ op = op_table[BPF_ALU_AND];
+ fmt = "#%#x";
+ break;
+ case BPF_ALU_OR | BPF_X:
+ op = op_table[BPF_ALU_OR];
+ fmt = "x";
+ break;
+ case BPF_ALU_OR | BPF_K:
+ op = op_table[BPF_ALU_OR];
+ fmt = "#%#x";
+ break;
+ case BPF_ALU_XOR | BPF_X:
+ op = op_table[BPF_ALU_XOR];
+ fmt = "x";
+ break;
+ case BPF_ALU_XOR | BPF_K:
+ op = op_table[BPF_ALU_XOR];
+ fmt = "#%#x";
+ break;
+ default:
+ op = "nosup";
+ fmt = "%#x";
+ val = f.code;
+ break;
+ }
+
+ memset(buf, 0, sizeof(buf));
+ snprintf(buf, sizeof(buf), fmt, val);
+ buf[sizeof(buf) - 1] = 0;
+
+ if ((BPF_CLASS(f.code) == BPF_JMP && BPF_OP(f.code) != BPF_JA))
+ rl_printf("l%d:\t%s %s, l%d, l%d\n", i, op, buf,
+ i + 1 + f.jt, i + 1 + f.jf);
+ else
+ rl_printf("l%d:\t%s %s\n", i, op, buf);
+}
+
+static void bpf_dump_curr(struct bpf_regs *r, struct sock_filter *f)
+{
+ int i, m = 0;
+
+ rl_printf("pc: [%u]\n", r->Pc);
+ rl_printf("code: [%u] jt[%u] jf[%u] k[%u]\n",
+ f->code, f->jt, f->jf, f->k);
+ rl_printf("curr: ");
+ bpf_disasm(*f, r->Pc);
+
+ if (f->jt || f->jf) {
+ rl_printf("jt: ");
+ bpf_disasm(*(f + f->jt + 1), r->Pc + f->jt + 1);
+ rl_printf("jf: ");
+ bpf_disasm(*(f + f->jf + 1), r->Pc + f->jf + 1);
+ }
+
+ rl_printf("A: [%#08x][%u]\n", r->A, r->A);
+ rl_printf("X: [%#08x][%u]\n", r->X, r->X);
+ if (r->Rs)
+ rl_printf("ret: [%#08x][%u]!\n", r->R, r->R);
+
+ for (i = 0; i < BPF_MEMWORDS; i++) {
+ if (r->M[i]) {
+ m++;
+ rl_printf("M[%d]: [%#08x][%u]\n", i, r->M[i], r->M[i]);
+ }
+ }
+ if (m == 0)
+ rl_printf("M[0,%d]: [%#08x][%u]\n", BPF_MEMWORDS - 1, 0, 0);
+}
+
+static void bpf_dump_pkt(uint8_t *pkt, uint32_t pkt_caplen, uint32_t pkt_len)
+{
+ if (pkt_caplen != pkt_len)
+ rl_printf("cap: %u, len: %u\n", pkt_caplen, pkt_len);
+ else
+ rl_printf("len: %u\n", pkt_len);
+
+ hex_dump(pkt, pkt_caplen);
+}
+
+static void bpf_disasm_all(const struct sock_filter *f, unsigned int len)
+{
+ unsigned int i;
+
+ for (i = 0; i < len; i++)
+ bpf_disasm(f[i], i);
+}
+
+static void bpf_dump_all(const struct sock_filter *f, unsigned int len)
+{
+ unsigned int i;
+
+ rl_printf("/* { op, jt, jf, k }, */\n");
+ for (i = 0; i < len; i++)
+ rl_printf("{ %#04x, %2u, %2u, %#010x },\n",
+ f[i].code, f[i].jt, f[i].jf, f[i].k);
+}
+
+static bool bpf_runnable(struct sock_filter *f, unsigned int len)
+{
+ int sock, ret, i;
+ struct sock_fprog bpf = {
+ .filter = f,
+ .len = len,
+ };
+
+ sock = socket(AF_INET, SOCK_DGRAM, 0);
+ if (sock < 0) {
+ rl_printf("cannot open socket!\n");
+ return false;
+ }
+ ret = setsockopt(sock, SOL_SOCKET, SO_ATTACH_FILTER, &bpf, sizeof(bpf));
+ close(sock);
+ if (ret < 0) {
+ rl_printf("program not allowed to run by kernel!\n");
+ return false;
+ }
+ for (i = 0; i < len; i++) {
+ if (BPF_CLASS(f[i].code) == BPF_LD &&
+ f[i].k > SKF_AD_OFF) {
+ rl_printf("extensions currently not supported!\n");
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static void bpf_reset_breakpoints(void)
+{
+ int i;
+
+ for (i = 0; i < array_size(bpf_breakpoints); i++)
+ bpf_breakpoints[i] = -1;
+}
+
+static void bpf_set_breakpoints(unsigned int where)
+{
+ int i;
+ bool set = false;
+
+ for (i = 0; i < array_size(bpf_breakpoints); i++) {
+ if (bpf_breakpoints[i] == (int) where) {
+ rl_printf("breakpoint already set!\n");
+ set = true;
+ break;
+ }
+
+ if (bpf_breakpoints[i] == -1 && set == false) {
+ bpf_breakpoints[i] = where;
+ set = true;
+ }
+ }
+
+ if (!set)
+ rl_printf("too many breakpoints set, reset first!\n");
+}
+
+static void bpf_dump_breakpoints(void)
+{
+ int i;
+
+ rl_printf("breakpoints: ");
+
+ for (i = 0; i < array_size(bpf_breakpoints); i++) {
+ if (bpf_breakpoints[i] < 0)
+ continue;
+ rl_printf("%d ", bpf_breakpoints[i]);
+ }
+
+ rl_printf("\n");
+}
+
+static void bpf_reset(void)
+{
+ bpf_regs_len = 0;
+
+ memset(bpf_regs, 0, sizeof(bpf_regs));
+ memset(&bpf_curr, 0, sizeof(bpf_curr));
+}
+
+static void bpf_safe_regs(void)
+{
+ memcpy(&bpf_regs[bpf_regs_len++], &bpf_curr, sizeof(bpf_curr));
+}
+
+static bool bpf_restore_regs(int off)
+{
+ unsigned int index = bpf_regs_len - 1 + off;
+
+ if (index == 0) {
+ bpf_reset();
+ return true;
+ } else if (index < bpf_regs_len) {
+ memcpy(&bpf_curr, &bpf_regs[index], sizeof(bpf_curr));
+ bpf_regs_len = index;
+ return true;
+ } else {
+ rl_printf("reached bottom of register history stack!\n");
+ return false;
+ }
+}
+
+static uint32_t extract_u32(uint8_t *pkt, uint32_t off)
+{
+ uint32_t r;
+
+ memcpy(&r, &pkt[off], sizeof(r));
+
+ return ntohl(r);
+}
+
+static uint16_t extract_u16(uint8_t *pkt, uint32_t off)
+{
+ uint16_t r;
+
+ memcpy(&r, &pkt[off], sizeof(r));
+
+ return ntohs(r);
+}
+
+static uint8_t extract_u8(uint8_t *pkt, uint32_t off)
+{
+ return pkt[off];
+}
+
+static void set_return(struct bpf_regs *r)
+{
+ r->R = 0;
+ r->Rs = true;
+}
+
+static void bpf_single_step(struct bpf_regs *r, struct sock_filter *f,
+ uint8_t *pkt, uint32_t pkt_caplen,
+ uint32_t pkt_len)
+{
+ uint32_t K = f->k;
+ int d;
+
+ switch (f->code) {
+ case BPF_RET | BPF_K:
+ r->R = K;
+ r->Rs = true;
+ break;
+ case BPF_RET | BPF_A:
+ r->R = r->A;
+ r->Rs = true;
+ break;
+ case BPF_RET | BPF_X:
+ r->R = r->X;
+ r->Rs = true;
+ break;
+ case BPF_MISC_TAX:
+ r->X = r->A;
+ break;
+ case BPF_MISC_TXA:
+ r->A = r->X;
+ break;
+ case BPF_ST:
+ r->M[K] = r->A;
+ break;
+ case BPF_STX:
+ r->M[K] = r->X;
+ break;
+ case BPF_LD_W | BPF_ABS:
+ d = pkt_caplen - K;
+ if (d >= sizeof(uint32_t))
+ r->A = extract_u32(pkt, K);
+ else
+ set_return(r);
+ break;
+ case BPF_LD_H | BPF_ABS:
+ d = pkt_caplen - K;
+ if (d >= sizeof(uint16_t))
+ r->A = extract_u16(pkt, K);
+ else
+ set_return(r);
+ break;
+ case BPF_LD_B | BPF_ABS:
+ d = pkt_caplen - K;
+ if (d >= sizeof(uint8_t))
+ r->A = extract_u8(pkt, K);
+ else
+ set_return(r);
+ break;
+ case BPF_LD_W | BPF_IND:
+ d = pkt_caplen - (r->X + K);
+ if (d >= sizeof(uint32_t))
+ r->A = extract_u32(pkt, r->X + K);
+ else
+ set_return(r);
+ break;
+ case BPF_LD_H | BPF_IND:
+ d = pkt_caplen - (r->X + K);
+ if (d >= sizeof(uint16_t))
+ r->A = extract_u16(pkt, r->X + K);
+ else
+ set_return(r);
+ break;
+ case BPF_LD_B | BPF_IND:
+ d = pkt_caplen - (r->X + K);
+ if (d >= sizeof(uint8_t))
+ r->A = extract_u8(pkt, r->X + K);
+ else
+ set_return(r);
+ break;
+ case BPF_LDX_B | BPF_MSH:
+ d = pkt_caplen - K;
+ if (d >= sizeof(uint8_t)) {
+ r->X = extract_u8(pkt, K);
+ r->X = (r->X & 0xf) << 2;
+ } else
+ set_return(r);
+ break;
+ case BPF_LD_W | BPF_LEN:
+ r->A = pkt_len;
+ break;
+ case BPF_LDX_W | BPF_LEN:
+ /* ldx #len loads the packet length into X, not A */
+ r->X = pkt_len;
+ break;
+ case BPF_LD | BPF_IMM:
+ r->A = K;
+ break;
+ case BPF_LDX | BPF_IMM:
+ r->X = K;
+ break;
+ case BPF_LD | BPF_MEM:
+ r->A = r->M[K];
+ break;
+ case BPF_LDX | BPF_MEM:
+ r->X = r->M[K];
+ break;
+ case BPF_JMP_JA:
+ r->Pc += K;
+ break;
+ case BPF_JMP_JGT | BPF_X:
+ r->Pc += r->A > r->X ? f->jt : f->jf;
+ break;
+ case BPF_JMP_JGT | BPF_K:
+ r->Pc += r->A > K ? f->jt : f->jf;
+ break;
+ case BPF_JMP_JGE | BPF_X:
+ r->Pc += r->A >= r->X ? f->jt : f->jf;
+ break;
+ case BPF_JMP_JGE | BPF_K:
+ r->Pc += r->A >= K ? f->jt : f->jf;
+ break;
+ case BPF_JMP_JEQ | BPF_X:
+ r->Pc += r->A == r->X ? f->jt : f->jf;
+ break;
+ case BPF_JMP_JEQ | BPF_K:
+ r->Pc += r->A == K ? f->jt : f->jf;
+ break;
+ case BPF_JMP_JSET | BPF_X:
+ r->Pc += r->A & r->X ? f->jt : f->jf;
+ break;
+ case BPF_JMP_JSET | BPF_K:
+ r->Pc += r->A & K ? f->jt : f->jf;
+ break;
+ case BPF_ALU_NEG:
+ r->A = -r->A;
+ break;
+ case BPF_ALU_LSH | BPF_X:
+ r->A <<= r->X;
+ break;
+ case BPF_ALU_LSH | BPF_K:
+ r->A <<= K;
+ break;
+ case BPF_ALU_RSH | BPF_X:
+ r->A >>= r->X;
+ break;
+ case BPF_ALU_RSH | BPF_K:
+ r->A >>= K;
+ break;
+ case BPF_ALU_ADD | BPF_X:
+ r->A += r->X;
+ break;
+ case BPF_ALU_ADD | BPF_K:
+ r->A += K;
+ break;
+ case BPF_ALU_SUB | BPF_X:
+ r->A -= r->X;
+ break;
+ case BPF_ALU_SUB | BPF_K:
+ r->A -= K;
+ break;
+ case BPF_ALU_MUL | BPF_X:
+ r->A *= r->X;
+ break;
+ case BPF_ALU_MUL | BPF_K:
+ r->A *= K;
+ break;
+ case BPF_ALU_DIV | BPF_X:
+ case BPF_ALU_MOD | BPF_X:
+ if (r->X == 0) {
+ set_return(r);
+ break;
+ }
+ goto do_div;
+ case BPF_ALU_DIV | BPF_K:
+ case BPF_ALU_MOD | BPF_K:
+ if (K == 0) {
+ set_return(r);
+ break;
+ }
+do_div:
+ switch (f->code) {
+ case BPF_ALU_DIV | BPF_X:
+ r->A /= r->X;
+ break;
+ case BPF_ALU_DIV | BPF_K:
+ r->A /= K;
+ break;
+ case BPF_ALU_MOD | BPF_X:
+ r->A %= r->X;
+ break;
+ case BPF_ALU_MOD | BPF_K:
+ r->A %= K;
+ break;
+ }
+ break;
+ case BPF_ALU_AND | BPF_X:
+ r->A &= r->X;
+ break;
+ case BPF_ALU_AND | BPF_K:
+ r->A &= K;
+ break;
+ case BPF_ALU_OR | BPF_X:
+ r->A |= r->X;
+ break;
+ case BPF_ALU_OR | BPF_K:
+ r->A |= K;
+ break;
+ case BPF_ALU_XOR | BPF_X:
+ r->A ^= r->X;
+ break;
+ case BPF_ALU_XOR | BPF_K:
+ r->A ^= K;
+ break;
+ }
+}
+
+static bool bpf_pc_has_breakpoint(uint16_t pc)
+{
+ int i;
+
+ for (i = 0; i < array_size(bpf_breakpoints); i++) {
+ if (bpf_breakpoints[i] < 0)
+ continue;
+ if (bpf_breakpoints[i] == pc)
+ return true;
+ }
+
+ return false;
+}
+
+static bool bpf_handle_breakpoint(struct bpf_regs *r, struct sock_filter *f,
+ uint8_t *pkt, uint32_t pkt_caplen,
+ uint32_t pkt_len)
+{
+ rl_printf("-- register dump --\n");
+ bpf_dump_curr(r, &f[r->Pc]);
+ rl_printf("-- packet dump --\n");
+ bpf_dump_pkt(pkt, pkt_caplen, pkt_len);
+ rl_printf("(breakpoint)\n");
+ return true;
+}
+
+static int bpf_run_all(struct sock_filter *f, uint16_t bpf_len, uint8_t *pkt,
+ uint32_t pkt_caplen, uint32_t pkt_len)
+{
+ bool stop = false;
+
+ while (bpf_curr.Rs == false && stop == false) {
+ bpf_safe_regs();
+
+ if (bpf_pc_has_breakpoint(bpf_curr.Pc))
+ stop = bpf_handle_breakpoint(&bpf_curr, f, pkt,
+ pkt_caplen, pkt_len);
+
+ bpf_single_step(&bpf_curr, &f[bpf_curr.Pc], pkt, pkt_caplen,
+ pkt_len);
+ bpf_curr.Pc++;
+ }
+
+ return stop ? -1 : bpf_curr.R;
+}
+
+static int bpf_run_stepping(struct sock_filter *f, uint16_t bpf_len,
+ uint8_t *pkt, uint32_t pkt_caplen,
+ uint32_t pkt_len, int next)
+{
+ bool stop = false;
+ int i = 1;
+
+ while (bpf_curr.Rs == false && stop == false) {
+ bpf_safe_regs();
+
+ if (i++ == next)
+ stop = bpf_handle_breakpoint(&bpf_curr, f, pkt,
+ pkt_caplen, pkt_len);
+
+ bpf_single_step(&bpf_curr, &f[bpf_curr.Pc], pkt, pkt_caplen,
+ pkt_len);
+ bpf_curr.Pc++;
+ }
+
+ return stop ? -1 : bpf_curr.R;
+}
+
+static bool pcap_loaded(void)
+{
+ if (pcap_fd < 0)
+ rl_printf("no pcap file loaded!\n");
+
+ return pcap_fd >= 0;
+}
+
+static struct pcap_pkthdr *pcap_curr_pkt(void)
+{
+ return (void *) pcap_ptr_va_curr;
+}
+
+static bool pcap_next_pkt(void)
+{
+ struct pcap_pkthdr *hdr = pcap_curr_pkt();
+
+ if (pcap_ptr_va_curr + sizeof(*hdr) -
+ pcap_ptr_va_start >= pcap_map_size)
+ return false;
+ if (hdr->caplen == 0 || hdr->len == 0 || hdr->caplen > hdr->len)
+ return false;
+ if (pcap_ptr_va_curr + sizeof(*hdr) + hdr->caplen -
+ pcap_ptr_va_start >= pcap_map_size)
+ return false;
+
+ pcap_ptr_va_curr += (sizeof(*hdr) + hdr->caplen);
+ return true;
+}
+
+static void pcap_reset_pkt(void)
+{
+ pcap_ptr_va_curr = pcap_ptr_va_start + sizeof(struct pcap_filehdr);
+}
+
+static int try_load_pcap(const char *file)
+{
+ struct pcap_filehdr *hdr;
+ struct stat sb;
+ int ret;
+
+ pcap_fd = open(file, O_RDONLY);
+ if (pcap_fd < 0) {
+ rl_printf("cannot open pcap [%s]!\n", strerror(errno));
+ return CMD_ERR;
+ }
+
+ ret = fstat(pcap_fd, &sb);
+ if (ret < 0) {
+ rl_printf("cannot fstat pcap file!\n");
+ return CMD_ERR;
+ }
+
+ if (!S_ISREG(sb.st_mode)) {
+ rl_printf("not a regular pcap file, duh!\n");
+ return CMD_ERR;
+ }
+
+ pcap_map_size = sb.st_size;
+ if (pcap_map_size <= sizeof(struct pcap_filehdr)) {
+ rl_printf("pcap file too small!\n");
+ return CMD_ERR;
+ }
+
+ pcap_ptr_va_start = mmap(NULL, pcap_map_size, PROT_READ,
+ MAP_SHARED | MAP_LOCKED, pcap_fd, 0);
+ if (pcap_ptr_va_start == MAP_FAILED) {
+ rl_printf("mmap of file failed!");
+ return CMD_ERR;
+ }
+
+ hdr = (void *) pcap_ptr_va_start;
+ if (hdr->magic != TCPDUMP_MAGIC) {
+ rl_printf("wrong pcap magic!\n");
+ return CMD_ERR;
+ }
+
+ pcap_reset_pkt();
+
+ return CMD_OK;
+
+}
+
+static void try_close_pcap(void)
+{
+ if (pcap_fd >= 0) {
+ munmap(pcap_ptr_va_start, pcap_map_size);
+ close(pcap_fd);
+
+ pcap_ptr_va_start = pcap_ptr_va_curr = NULL;
+ pcap_map_size = 0;
+ pcap_packet = 0;
+ pcap_fd = -1;
+ }
+}
+
+static int cmd_load_bpf(char *bpf_string)
+{
+ char sp, *token, separator = ',';
+ unsigned short bpf_len, i = 0;
+ struct sock_filter tmp;
+
+ bpf_prog_len = 0;
+ memset(bpf_image, 0, sizeof(bpf_image));
+
+ if (sscanf(bpf_string, "%hu%c", &bpf_len, &sp) != 2 ||
+ sp != separator || bpf_len > BPF_MAXINSNS || bpf_len == 0) {
+ rl_printf("syntax error in head length encoding!\n");
+ return CMD_ERR;
+ }
+
+ token = bpf_string;
+ while ((token = strchr(token, separator)) && (++token)[0]) {
+ if (i >= bpf_len) {
+ rl_printf("program exceeds encoded length!\n");
+ return CMD_ERR;
+ }
+
+ if (sscanf(token, "%hu %hhu %hhu %u,",
+ &tmp.code, &tmp.jt, &tmp.jf, &tmp.k) != 4) {
+ rl_printf("syntax error at instruction %d!\n", i);
+ return CMD_ERR;
+ }
+
+ bpf_image[i].code = tmp.code;
+ bpf_image[i].jt = tmp.jt;
+ bpf_image[i].jf = tmp.jf;
+ bpf_image[i].k = tmp.k;
+
+ i++;
+ }
+
+ if (i != bpf_len) {
+ rl_printf("syntax error exceeding encoded length!\n");
+ return CMD_ERR;
+ } else
+ bpf_prog_len = bpf_len;
+ if (!bpf_runnable(bpf_image, bpf_prog_len))
+ bpf_prog_len = 0;
+
+ return CMD_OK;
+}
+
+static int cmd_load_pcap(char *file)
+{
+ char *file_trim, *tmp;
+
+ file_trim = strtok_r(file, " ", &tmp);
+ if (file_trim == NULL)
+ return CMD_ERR;
+
+ try_close_pcap();
+
+ return try_load_pcap(file_trim);
+}
+
+static int cmd_load(char *arg)
+{
+ char *subcmd, *cont, *tmp = strdup(arg);
+ int ret = CMD_OK;
+
+ subcmd = strtok_r(tmp, " ", &cont);
+ if (subcmd == NULL)
+ goto out;
+ if (matches(subcmd, "bpf") == 0) {
+ bpf_reset();
+ bpf_reset_breakpoints();
+
+ ret = cmd_load_bpf(cont);
+ } else if (matches(subcmd, "pcap") == 0) {
+ ret = cmd_load_pcap(cont);
+ } else {
+out:
+ rl_printf("bpf <code>: load bpf code\n");
+ rl_printf("pcap <file>: load pcap file\n");
+ ret = CMD_ERR;
+ }
+
+ free(tmp);
+ return ret;
+}
+
+static int cmd_step(char *num)
+{
+ struct pcap_pkthdr *hdr;
+ int steps, ret;
+
+ if (!bpf_prog_loaded() || !pcap_loaded())
+ return CMD_ERR;
+
+ steps = strtol(num, NULL, 10);
+ if (steps == 0 || strlen(num) == 0)
+ steps = 1;
+ if (steps < 0) {
+ if (!bpf_restore_regs(steps))
+ return CMD_ERR;
+ steps = 1;
+ }
+
+ hdr = pcap_curr_pkt();
+ ret = bpf_run_stepping(bpf_image, bpf_prog_len,
+ (uint8_t *) hdr + sizeof(*hdr),
+ hdr->caplen, hdr->len, steps);
+ if (ret >= 0 || bpf_curr.Rs) {
+ bpf_reset();
+ if (!pcap_next_pkt()) {
+ rl_printf("(going back to first packet)\n");
+ pcap_reset_pkt();
+ } else {
+ rl_printf("(next packet)\n");
+ }
+ }
+
+ return CMD_OK;
+}
+
+static int cmd_select(char *num)
+{
+ unsigned int which, i;
+ bool have_next = true;
+
+ if (!pcap_loaded() || strlen(num) == 0)
+ return CMD_ERR;
+
+ which = strtoul(num, NULL, 10);
+ if (which == 0) {
+ rl_printf("packet count starts with 1, clamping!\n");
+ which = 1;
+ }
+
+ pcap_reset_pkt();
+ bpf_reset();
+
+ for (i = 0; i < which && (have_next = pcap_next_pkt()); i++)
+ /* noop */;
+ if (!have_next || pcap_curr_pkt() == NULL) {
+ rl_printf("no packet #%u available!\n", which);
+ pcap_reset_pkt();
+ return CMD_ERR;
+ }
+
+ return CMD_OK;
+}
+
+static int cmd_breakpoint(char *subcmd)
+{
+ if (!bpf_prog_loaded())
+ return CMD_ERR;
+ if (strlen(subcmd) == 0)
+ bpf_dump_breakpoints();
+ else if (matches(subcmd, "reset") == 0)
+ bpf_reset_breakpoints();
+ else {
+ unsigned int where = strtoul(subcmd, NULL, 10);
+
+ if (where < bpf_prog_len) {
+ bpf_set_breakpoints(where);
+ rl_printf("breakpoint at: ");
+ bpf_disasm(bpf_image[where], where);
+ }
+ }
+
+ return CMD_OK;
+}
+
+static int cmd_run(char *num)
+{
+ static uint32_t pass = 0, fail = 0;
+ bool has_limit = true;
+ int pkts = 0, i = 0;
+
+ if (!bpf_prog_loaded() || !pcap_loaded())
+ return CMD_ERR;
+
+ pkts = strtol(num, NULL, 10);
+ if (pkts == 0 || strlen(num) == 0)
+ has_limit = false;
+
+ do {
+ struct pcap_pkthdr *hdr = pcap_curr_pkt();
+ int ret = bpf_run_all(bpf_image, bpf_prog_len,
+ (uint8_t *) hdr + sizeof(*hdr),
+ hdr->caplen, hdr->len);
+ if (ret > 0)
+ pass++;
+ else if (ret == 0)
+ fail++;
+ else
+ return CMD_OK;
+ bpf_reset();
+ } while (pcap_next_pkt() && (!has_limit || (has_limit && ++i < pkts)));
+
+ rl_printf("bpf passes:%u fails:%u\n", pass, fail);
+
+ pcap_reset_pkt();
+ bpf_reset();
+
+ pass = fail = 0;
+ return CMD_OK;
+}
+
+static int cmd_disassemble(char *line_string)
+{
+ bool single_line = false;
+ unsigned long line;
+
+ if (!bpf_prog_loaded())
+ return CMD_ERR;
+ if (strlen(line_string) > 0 &&
+ (line = strtoul(line_string, NULL, 10)) < bpf_prog_len)
+ single_line = true;
+ if (single_line)
+ bpf_disasm(bpf_image[line], line);
+ else
+ bpf_disasm_all(bpf_image, bpf_prog_len);
+
+ return CMD_OK;
+}
+
+static int cmd_dump(char *dontcare)
+{
+ if (!bpf_prog_loaded())
+ return CMD_ERR;
+
+ bpf_dump_all(bpf_image, bpf_prog_len);
+
+ return CMD_OK;
+}
+
+static int cmd_quit(char *dontcare)
+{
+ return CMD_EX;
+}
+
+static const struct shell_cmd cmds[] = {
+ { .name = "load", .func = cmd_load },
+ { .name = "select", .func = cmd_select },
+ { .name = "step", .func = cmd_step },
+ { .name = "run", .func = cmd_run },
+ { .name = "breakpoint", .func = cmd_breakpoint },
+ { .name = "disassemble", .func = cmd_disassemble },
+ { .name = "dump", .func = cmd_dump },
+ { .name = "quit", .func = cmd_quit },
+};
+
+static int execf(char *arg)
+{
+ char *cmd, *cont, *tmp = strdup(arg);
+ int i, ret = 0, len;
+
+ cmd = strtok_r(tmp, " ", &cont);
+ if (cmd == NULL)
+ goto out;
+ len = strlen(cmd);
+ for (i = 0; i < array_size(cmds); i++) {
+ if (len != strlen(cmds[i].name))
+ continue;
+ if (strncmp(cmds[i].name, cmd, len) == 0) {
+ ret = cmds[i].func(cont);
+ break;
+ }
+ }
+out:
+ free(tmp);
+ return ret;
+}
+
+static char *shell_comp_gen(const char *buf, int state)
+{
+ static int list_index, len;
+
+ if (!state) {
+ list_index = 0;
+ len = strlen(buf);
+ }
+
+ for (; list_index < array_size(cmds); ) {
+ const char *name = cmds[list_index].name;
+
+ list_index++;
+ if (strncmp(name, buf, len) == 0)
+ return strdup(name);
+ }
+
+ return NULL;
+}
+
+static char **shell_completion(const char *buf, int start, int end)
+{
+ char **matches = NULL;
+
+ if (start == 0)
+ matches = rl_completion_matches(buf, shell_comp_gen);
+
+ return matches;
+}
+
+static void intr_shell(int sig)
+{
+ if (rl_end)
+ rl_kill_line(-1, 0);
+
+ rl_crlf();
+ rl_refresh_line(0, 0);
+ rl_free_line_state();
+}
+
+static void init_shell(FILE *fin, FILE *fout)
+{
+ char file[128];
+
+ snprintf(file, sizeof(file), "%s/.bpf_dbg_history", getenv("HOME"));
+ read_history(file);
+
+ rl_instream = fin;
+ rl_outstream = fout;
+
+ rl_readline_name = "bpf_dbg";
+ rl_terminal_name = getenv("TERM");
+
+ rl_catch_signals = 0;
+ rl_catch_sigwinch = 1;
+
+ rl_attempted_completion_function = shell_completion;
+
+ rl_bind_key('\t', rl_complete);
+
+ rl_bind_key_in_map('\t', rl_complete, emacs_meta_keymap);
+ rl_bind_key_in_map('\033', rl_complete, emacs_meta_keymap);
+
+ snprintf(file, sizeof(file), "%s/.bpf_dbg_init", getenv("HOME"));
+ rl_read_init_file(file);
+
+ rl_prep_terminal(0);
+ rl_set_signals();
+
+ signal(SIGINT, intr_shell);
+}
+
+static void exit_shell(FILE *fin, FILE *fout)
+{
+ char file[128];
+
+ snprintf(file, sizeof(file), "%s/.bpf_dbg_history", getenv("HOME"));
+ write_history(file);
+
+ clear_history();
+ rl_deprep_terminal();
+
+ try_close_pcap();
+
+ if (fin != stdin)
+ fclose(fin);
+ if (fout != stdout)
+ fclose(fout);
+}
+
+static int run_shell_loop(FILE *fin, FILE *fout)
+{
+ char *buf;
+
+ init_shell(fin, fout);
+
+ while ((buf = readline("> ")) != NULL) {
+ int ret = execf(buf);
+ if (ret == CMD_EX)
+ break;
+ if (ret == CMD_OK && strlen(buf) > 0)
+ add_history(buf);
+
+ free(buf);
+ }
+
+ exit_shell(fin, fout);
+ return 0;
+}
+
+int main(int argc, char **argv)
+{
+ FILE *fin = NULL, *fout = NULL;
+
+ if (argc >= 2)
+ fin = fopen(argv[1], "r");
+ if (argc >= 3)
+ fout = fopen(argv[2], "w");
+
+ return run_shell_loop(fin ? : stdin, fout ? : stdout);
+}
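
As a side note (not part of the patch above): bpf_runnable() validates a program by briefly attaching it with SO_ATTACH_FILTER, which is also how a filter verified in bpf_dbg would be attached to a live socket. A hedged, self-contained sketch of that attach step; the single-instruction accept-all program below is just an example, not a filter from the tool:

    #include <stdio.h>
    #include <unistd.h>
    #include <sys/socket.h>
    #include <linux/filter.h>

    int main(void)
    {
    	/* Classic accept-all program: one "ret #0xffff" instruction. */
    	struct sock_filter code[] = {
    		{ 0x06, 0, 0, 0x0000ffff },	/* BPF_RET | BPF_K, k = 0xffff */
    	};
    	struct sock_fprog prog = {
    		.len = sizeof(code) / sizeof(code[0]),
    		.filter = code,
    	};
    	int sock = socket(AF_INET, SOCK_DGRAM, 0);

    	if (sock < 0)
    		return 1;
    	if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_FILTER,
    		       &prog, sizeof(prog)) < 0)
    		perror("SO_ATTACH_FILTER");
    	close(sock);
    	return 0;
    }
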
diff --git a/tools/net/bpf_exp.l b/tools/net/bpf_exp.l
new file mode 100644
index 00000000000..833a96611da
--- /dev/null
+++ b/tools/net/bpf_exp.l
@@ -0,0 +1,144 @@
+/*
+ * BPF asm code lexer
+ *
+ * This program is free software; you can distribute it and/or modify
+ * it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License,
+ * or (at your option) any later version.
+ *
+ * Syntax kept close to:
+ *
+ * Steven McCanne and Van Jacobson. 1993. The BSD packet filter: a new
+ * architecture for user-level packet capture. In Proceedings of the
+ * USENIX Winter 1993 Conference Proceedings on USENIX Winter 1993
+ * Conference Proceedings (USENIX'93). USENIX Association, Berkeley,
+ * CA, USA, 2-2.
+ *
+ * Copyright 2013 Daniel Borkmann <borkmann@redhat.com>
+ * Licensed under the GNU General Public License, version 2.0 (GPLv2)
+ */
+
+%{
+
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include "bpf_exp.yacc.h"
+
+extern void yyerror(const char *str);
+
+%}
+
+%option align
+%option ecs
+
+%option nounput
+%option noreject
+%option noinput
+%option noyywrap
+
+%option 8bit
+%option caseless
+%option yylineno
+
+%%
+
+"ldb" { return OP_LDB; }
+"ldh" { return OP_LDH; }
+"ld" { return OP_LD; }
+"ldi" { return OP_LDI; }
+"ldx" { return OP_LDX; }
+"ldxi" { return OP_LDXI; }
+"ldxb" { return OP_LDXB; }
+"st" { return OP_ST; }
+"stx" { return OP_STX; }
+"jmp" { return OP_JMP; }
+"ja" { return OP_JMP; }
+"jeq" { return OP_JEQ; }
+"jneq" { return OP_JNEQ; }
+"jne" { return OP_JNEQ; }
+"jlt" { return OP_JLT; }
+"jle" { return OP_JLE; }
+"jgt" { return OP_JGT; }
+"jge" { return OP_JGE; }
+"jset" { return OP_JSET; }
+"add" { return OP_ADD; }
+"sub" { return OP_SUB; }
+"mul" { return OP_MUL; }
+"div" { return OP_DIV; }
+"mod" { return OP_MOD; }
+"neg" { return OP_NEG; }
+"and" { return OP_AND; }
+"xor" { return OP_XOR; }
+"or" { return OP_OR; }
+"lsh" { return OP_LSH; }
+"rsh" { return OP_RSH; }
+"ret" { return OP_RET; }
+"tax" { return OP_TAX; }
+"txa" { return OP_TXA; }
+
+"#"?("len") { return K_PKT_LEN; }
+"#"?("proto") { return K_PROTO; }
+"#"?("type") { return K_TYPE; }
+"#"?("poff") { return K_POFF; }
+"#"?("ifidx") { return K_IFIDX; }
+"#"?("nla") { return K_NLATTR; }
+"#"?("nlan") { return K_NLATTR_NEST; }
+"#"?("mark") { return K_MARK; }
+"#"?("queue") { return K_QUEUE; }
+"#"?("hatype") { return K_HATYPE; }
+"#"?("rxhash") { return K_RXHASH; }
+"#"?("cpu") { return K_CPU; }
+"#"?("vlan_tci") { return K_VLANT; }
+"#"?("vlan_pr") { return K_VLANP; }
+"#"?("rand") { return K_RAND; }
+
+":" { return ':'; }
+"," { return ','; }
+"#" { return '#'; }
+"%" { return '%'; }
+"[" { return '['; }
+"]" { return ']'; }
+"(" { return '('; }
+")" { return ')'; }
+"x" { return 'x'; }
+"a" { return 'a'; }
+"+" { return '+'; }
+"M" { return 'M'; }
+"*" { return '*'; }
+"&" { return '&'; }
+
+([0][x][a-fA-F0-9]+) {
+ yylval.number = strtoul(yytext, NULL, 16);
+ return number;
+ }
+([0][b][0-1]+) {
+ yylval.number = strtol(yytext + 2, NULL, 2);
+ return number;
+ }
+(([0])|([-+]?[1-9][0-9]*)) {
+ yylval.number = strtol(yytext, NULL, 10);
+ return number;
+ }
+([0][0-9]+) {
+ yylval.number = strtol(yytext + 1, NULL, 8);
+ return number;
+ }
+[a-zA-Z_][a-zA-Z0-9_]+ {
+ yylval.label = strdup(yytext);
+ return label;
+ }
+
+"/*"([^\*]|\*[^/])*"*/" { /* NOP */ }
+";"[^\n]* { /* NOP */ }
+^#.* { /* NOP */ }
+[ \t]+ { /* NOP */ }
+[ \n]+ { /* NOP */ }
+
+. {
+ printf("unknown character \'%s\'", yytext);
+ yyerror("lex unknown character");
+ }
+
+%%
diff --git a/tools/net/bpf_exp.y b/tools/net/bpf_exp.y
new file mode 100644
index 00000000000..e6306c51c26
--- /dev/null
+++ b/tools/net/bpf_exp.y
@@ -0,0 +1,771 @@
+/*
+ * BPF asm code parser
+ *
+ * This program is free software; you can distribute it and/or modify
+ * it under the terms of the GNU General Public License as published
+ * by the Free Software Foundation; either version 2 of the License,
+ * or (at your option) any later version.
+ *
+ * Syntax kept close to:
+ *
+ * Steven McCanne and Van Jacobson. 1993. The BSD packet filter: a new
+ * architecture for user-level packet capture. In Proceedings of the
+ * USENIX Winter 1993 Conference Proceedings on USENIX Winter 1993
+ * Conference Proceedings (USENIX'93). USENIX Association, Berkeley,
+ * CA, USA, 2-2.
+ *
+ * Copyright 2013 Daniel Borkmann <borkmann@redhat.com>
+ * Licensed under the GNU General Public License, version 2.0 (GPLv2)
+ */
+
+%{
+
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <unistd.h>
+#include <errno.h>
+#include <assert.h>
+#include <linux/filter.h>
+
+#include "bpf_exp.yacc.h"
+
+enum jmp_type { JTL, JFL, JKL };
+
+extern FILE *yyin;
+extern int yylex(void);
+extern void yyerror(const char *str);
+
+extern void bpf_asm_compile(FILE *fp, bool cstyle);
+static void bpf_set_curr_instr(uint16_t op, uint8_t jt, uint8_t jf, uint32_t k);
+static void bpf_set_curr_label(char *label);
+static void bpf_set_jmp_label(char *label, enum jmp_type type);
+
+%}
+
+%union {
+ char *label;
+ uint32_t number;
+}
+
+%token OP_LDB OP_LDH OP_LD OP_LDX OP_ST OP_STX OP_JMP OP_JEQ OP_JGT OP_JGE
+%token OP_JSET OP_ADD OP_SUB OP_MUL OP_DIV OP_AND OP_OR OP_XOR OP_LSH OP_RSH
+%token OP_RET OP_TAX OP_TXA OP_LDXB OP_MOD OP_NEG OP_JNEQ OP_JLT OP_JLE OP_LDI
+%token OP_LDXI
+
+%token K_PKT_LEN K_PROTO K_TYPE K_NLATTR K_NLATTR_NEST K_MARK K_QUEUE K_HATYPE
+%token K_RXHASH K_CPU K_IFIDX K_VLANT K_VLANP K_POFF K_RAND
+
+%token ':' ',' '[' ']' '(' ')' 'x' 'a' '+' 'M' '*' '&' '#' '%'
+
+%token number label
+
+%type <label> label
+%type <number> number
+
+%%
+
+prog
+ : line
+ | prog line
+ ;
+
+line
+ : instr
+ | labelled_instr
+ ;
+
+labelled_instr
+ : labelled instr
+ ;
+
+instr
+ : ldb
+ | ldh
+ | ld
+ | ldi
+ | ldx
+ | ldxi
+ | st
+ | stx
+ | jmp
+ | jeq
+ | jneq
+ | jlt
+ | jle
+ | jgt
+ | jge
+ | jset
+ | add
+ | sub
+ | mul
+ | div
+ | mod
+ | neg
+ | and
+ | or
+ | xor
+ | lsh
+ | rsh
+ | ret
+ | tax
+ | txa
+ ;
+
+labelled
+ : label ':' { bpf_set_curr_label($1); }
+ ;
+
+ldb
+ : OP_LDB '[' 'x' '+' number ']' {
+ bpf_set_curr_instr(BPF_LD | BPF_B | BPF_IND, 0, 0, $5); }
+ | OP_LDB '[' '%' 'x' '+' number ']' {
+ bpf_set_curr_instr(BPF_LD | BPF_B | BPF_IND, 0, 0, $6); }
+ | OP_LDB '[' number ']' {
+ bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0, $3); }
+ | OP_LDB K_PROTO {
+ bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_PROTOCOL); }
+ | OP_LDB K_TYPE {
+ bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_PKTTYPE); }
+ | OP_LDB K_IFIDX {
+ bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_IFINDEX); }
+ | OP_LDB K_NLATTR {
+ bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_NLATTR); }
+ | OP_LDB K_NLATTR_NEST {
+ bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_NLATTR_NEST); }
+ | OP_LDB K_MARK {
+ bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_MARK); }
+ | OP_LDB K_QUEUE {
+ bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_QUEUE); }
+ | OP_LDB K_HATYPE {
+ bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_HATYPE); }
+ | OP_LDB K_RXHASH {
+ bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_RXHASH); }
+ | OP_LDB K_CPU {
+ bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_CPU); }
+ | OP_LDB K_VLANT {
+ bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_VLAN_TAG); }
+ | OP_LDB K_VLANP {
+ bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT); }
+ | OP_LDB K_POFF {
+ bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_PAY_OFFSET); }
+ | OP_LDB K_RAND {
+ bpf_set_curr_instr(BPF_LD | BPF_B | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_RANDOM); }
+ ;
+
+ldh
+ : OP_LDH '[' 'x' '+' number ']' {
+ bpf_set_curr_instr(BPF_LD | BPF_H | BPF_IND, 0, 0, $5); }
+ | OP_LDH '[' '%' 'x' '+' number ']' {
+ bpf_set_curr_instr(BPF_LD | BPF_H | BPF_IND, 0, 0, $6); }
+ | OP_LDH '[' number ']' {
+ bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0, $3); }
+ | OP_LDH K_PROTO {
+ bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_PROTOCOL); }
+ | OP_LDH K_TYPE {
+ bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_PKTTYPE); }
+ | OP_LDH K_IFIDX {
+ bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_IFINDEX); }
+ | OP_LDH K_NLATTR {
+ bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_NLATTR); }
+ | OP_LDH K_NLATTR_NEST {
+ bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_NLATTR_NEST); }
+ | OP_LDH K_MARK {
+ bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_MARK); }
+ | OP_LDH K_QUEUE {
+ bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_QUEUE); }
+ | OP_LDH K_HATYPE {
+ bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_HATYPE); }
+ | OP_LDH K_RXHASH {
+ bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_RXHASH); }
+ | OP_LDH K_CPU {
+ bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_CPU); }
+ | OP_LDH K_VLANT {
+ bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_VLAN_TAG); }
+ | OP_LDH K_VLANP {
+ bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT); }
+ | OP_LDH K_POFF {
+ bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_PAY_OFFSET); }
+ | OP_LDH K_RAND {
+ bpf_set_curr_instr(BPF_LD | BPF_H | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_RANDOM); }
+ ;
+
+ldi
+ : OP_LDI '#' number {
+ bpf_set_curr_instr(BPF_LD | BPF_IMM, 0, 0, $3); }
+ | OP_LDI number {
+ bpf_set_curr_instr(BPF_LD | BPF_IMM, 0, 0, $2); }
+ ;
+
+ld
+ : OP_LD '#' number {
+ bpf_set_curr_instr(BPF_LD | BPF_IMM, 0, 0, $3); }
+ | OP_LD K_PKT_LEN {
+ bpf_set_curr_instr(BPF_LD | BPF_W | BPF_LEN, 0, 0, 0); }
+ | OP_LD K_PROTO {
+ bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_PROTOCOL); }
+ | OP_LD K_TYPE {
+ bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_PKTTYPE); }
+ | OP_LD K_IFIDX {
+ bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_IFINDEX); }
+ | OP_LD K_NLATTR {
+ bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_NLATTR); }
+ | OP_LD K_NLATTR_NEST {
+ bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_NLATTR_NEST); }
+ | OP_LD K_MARK {
+ bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_MARK); }
+ | OP_LD K_QUEUE {
+ bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_QUEUE); }
+ | OP_LD K_HATYPE {
+ bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_HATYPE); }
+ | OP_LD K_RXHASH {
+ bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_RXHASH); }
+ | OP_LD K_CPU {
+ bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_CPU); }
+ | OP_LD K_VLANT {
+ bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_VLAN_TAG); }
+ | OP_LD K_VLANP {
+ bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_VLAN_TAG_PRESENT); }
+ | OP_LD K_POFF {
+ bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_PAY_OFFSET); }
+ | OP_LD K_RAND {
+ bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0,
+ SKF_AD_OFF + SKF_AD_RANDOM); }
+ | OP_LD 'M' '[' number ']' {
+ bpf_set_curr_instr(BPF_LD | BPF_MEM, 0, 0, $4); }
+ | OP_LD '[' 'x' '+' number ']' {
+ bpf_set_curr_instr(BPF_LD | BPF_W | BPF_IND, 0, 0, $5); }
+ | OP_LD '[' '%' 'x' '+' number ']' {
+ bpf_set_curr_instr(BPF_LD | BPF_W | BPF_IND, 0, 0, $6); }
+ | OP_LD '[' number ']' {
+ bpf_set_curr_instr(BPF_LD | BPF_W | BPF_ABS, 0, 0, $3); }
+ ;
+
+ldxi
+ : OP_LDXI '#' number {
+ bpf_set_curr_instr(BPF_LDX | BPF_IMM, 0, 0, $3); }
+ | OP_LDXI number {
+ bpf_set_curr_instr(BPF_LDX | BPF_IMM, 0, 0, $2); }
+ ;
+
+ldx
+ : OP_LDX '#' number {
+ bpf_set_curr_instr(BPF_LDX | BPF_IMM, 0, 0, $3); }
+ | OP_LDX K_PKT_LEN {
+ bpf_set_curr_instr(BPF_LDX | BPF_W | BPF_LEN, 0, 0, 0); }
+ | OP_LDX 'M' '[' number ']' {
+ bpf_set_curr_instr(BPF_LDX | BPF_MEM, 0, 0, $4); }
+ | OP_LDXB number '*' '(' '[' number ']' '&' number ')' {
+ if ($2 != 4 || $9 != 0xf) {
+ fprintf(stderr, "ldxb offset not supported!\n");
+ exit(0);
+ } else {
+ bpf_set_curr_instr(BPF_LDX | BPF_MSH | BPF_B, 0, 0, $6); } }
+ | OP_LDX number '*' '(' '[' number ']' '&' number ')' {
+ if ($2 != 4 || $9 != 0xf) {
+ fprintf(stderr, "ldxb offset not supported!\n");
+ exit(0);
+ } else {
+ bpf_set_curr_instr(BPF_LDX | BPF_MSH | BPF_B, 0, 0, $6); } }
+ ;
+
+st
+ : OP_ST 'M' '[' number ']' {
+ bpf_set_curr_instr(BPF_ST, 0, 0, $4); }
+ ;
+
+stx
+ : OP_STX 'M' '[' number ']' {
+ bpf_set_curr_instr(BPF_STX, 0, 0, $4); }
+ ;
+
+jmp
+ : OP_JMP label {
+ bpf_set_jmp_label($2, JKL);
+ bpf_set_curr_instr(BPF_JMP | BPF_JA, 0, 0, 0); }
+ ;
+
+jeq
+ : OP_JEQ '#' number ',' label ',' label {
+ bpf_set_jmp_label($5, JTL);
+ bpf_set_jmp_label($7, JFL);
+ bpf_set_curr_instr(BPF_JMP | BPF_JEQ | BPF_K, 0, 0, $3); }
+ | OP_JEQ 'x' ',' label ',' label {
+ bpf_set_jmp_label($4, JTL);
+ bpf_set_jmp_label($6, JFL);
+ bpf_set_curr_instr(BPF_JMP | BPF_JEQ | BPF_X, 0, 0, 0); }
+ | OP_JEQ '%' 'x' ',' label ',' label {
+ bpf_set_jmp_label($5, JTL);
+ bpf_set_jmp_label($7, JFL);
+ bpf_set_curr_instr(BPF_JMP | BPF_JEQ | BPF_X, 0, 0, 0); }
+ | OP_JEQ '#' number ',' label {
+ bpf_set_jmp_label($5, JTL);
+ bpf_set_curr_instr(BPF_JMP | BPF_JEQ | BPF_K, 0, 0, $3); }
+ | OP_JEQ 'x' ',' label {
+ bpf_set_jmp_label($4, JTL);
+ bpf_set_curr_instr(BPF_JMP | BPF_JEQ | BPF_X, 0, 0, 0); }
+ | OP_JEQ '%' 'x' ',' label {
+ bpf_set_jmp_label($5, JTL);
+ bpf_set_curr_instr(BPF_JMP | BPF_JEQ | BPF_X, 0, 0, 0); }
+ ;
+
+jneq
+ : OP_JNEQ '#' number ',' label {
+ bpf_set_jmp_label($5, JFL);
+ bpf_set_curr_instr(BPF_JMP | BPF_JEQ | BPF_K, 0, 0, $3); }
+ | OP_JNEQ 'x' ',' label {
+ bpf_set_jmp_label($4, JFL);
+ bpf_set_curr_instr(BPF_JMP | BPF_JEQ | BPF_X, 0, 0, 0); }
+ | OP_JNEQ '%' 'x' ',' label {
+ bpf_set_jmp_label($5, JFL);
+ bpf_set_curr_instr(BPF_JMP | BPF_JEQ | BPF_X, 0, 0, 0); }
+ ;
+
+jlt
+ : OP_JLT '#' number ',' label {
+ bpf_set_jmp_label($5, JFL);
+ bpf_set_curr_instr(BPF_JMP | BPF_JGE | BPF_K, 0, 0, $3); }
+ | OP_JLT 'x' ',' label {
+ bpf_set_jmp_label($4, JFL);
+ bpf_set_curr_instr(BPF_JMP | BPF_JGE | BPF_X, 0, 0, 0); }
+ | OP_JLT '%' 'x' ',' label {
+ bpf_set_jmp_label($5, JFL);
+ bpf_set_curr_instr(BPF_JMP | BPF_JGE | BPF_X, 0, 0, 0); }
+ ;
+
+jle
+ : OP_JLE '#' number ',' label {
+ bpf_set_jmp_label($5, JFL);
+ bpf_set_curr_instr(BPF_JMP | BPF_JGT | BPF_K, 0, 0, $3); }
+ | OP_JLE 'x' ',' label {
+ bpf_set_jmp_label($4, JFL);
+ bpf_set_curr_instr(BPF_JMP | BPF_JGT | BPF_X, 0, 0, 0); }
+ | OP_JLE '%' 'x' ',' label {
+ bpf_set_jmp_label($5, JFL);
+ bpf_set_curr_instr(BPF_JMP | BPF_JGT | BPF_X, 0, 0, 0); }
+ ;
+
+jgt
+ : OP_JGT '#' number ',' label ',' label {
+ bpf_set_jmp_label($5, JTL);
+ bpf_set_jmp_label($7, JFL);
+ bpf_set_curr_instr(BPF_JMP | BPF_JGT | BPF_K, 0, 0, $3); }
+ | OP_JGT 'x' ',' label ',' label {
+ bpf_set_jmp_label($4, JTL);
+ bpf_set_jmp_label($6, JFL);
+ bpf_set_curr_instr(BPF_JMP | BPF_JGT | BPF_X, 0, 0, 0); }
+ | OP_JGT '%' 'x' ',' label ',' label {
+ bpf_set_jmp_label($5, JTL);
+ bpf_set_jmp_label($7, JFL);
+ bpf_set_curr_instr(BPF_JMP | BPF_JGT | BPF_X, 0, 0, 0); }
+ | OP_JGT '#' number ',' label {
+ bpf_set_jmp_label($5, JTL);
+ bpf_set_curr_instr(BPF_JMP | BPF_JGT | BPF_K, 0, 0, $3); }
+ | OP_JGT 'x' ',' label {
+ bpf_set_jmp_label($4, JTL);
+ bpf_set_curr_instr(BPF_JMP | BPF_JGT | BPF_X, 0, 0, 0); }
+ | OP_JGT '%' 'x' ',' label {
+ bpf_set_jmp_label($5, JTL);
+ bpf_set_curr_instr(BPF_JMP | BPF_JGT | BPF_X, 0, 0, 0); }
+ ;
+
+jge
+ : OP_JGE '#' number ',' label ',' label {
+ bpf_set_jmp_label($5, JTL);
+ bpf_set_jmp_label($7, JFL);
+ bpf_set_curr_instr(BPF_JMP | BPF_JGE | BPF_K, 0, 0, $3); }
+ | OP_JGE 'x' ',' label ',' label {
+ bpf_set_jmp_label($4, JTL);
+ bpf_set_jmp_label($6, JFL);
+ bpf_set_curr_instr(BPF_JMP | BPF_JGE | BPF_X, 0, 0, 0); }
+ | OP_JGE '%' 'x' ',' label ',' label {
+ bpf_set_jmp_label($5, JTL);
+ bpf_set_jmp_label($7, JFL);
+ bpf_set_curr_instr(BPF_JMP | BPF_JGE | BPF_X, 0, 0, 0); }
+ | OP_JGE '#' number ',' label {
+ bpf_set_jmp_label($5, JTL);
+ bpf_set_curr_instr(BPF_JMP | BPF_JGE | BPF_K, 0, 0, $3); }
+ | OP_JGE 'x' ',' label {
+ bpf_set_jmp_label($4, JTL);
+ bpf_set_curr_instr(BPF_JMP | BPF_JGE | BPF_X, 0, 0, 0); }
+ | OP_JGE '%' 'x' ',' label {
+ bpf_set_jmp_label($5, JTL);
+ bpf_set_curr_instr(BPF_JMP | BPF_JGE | BPF_X, 0, 0, 0); }
+ ;
+
+jset
+ : OP_JSET '#' number ',' label ',' label {
+ bpf_set_jmp_label($5, JTL);
+ bpf_set_jmp_label($7, JFL);
+ bpf_set_curr_instr(BPF_JMP | BPF_JSET | BPF_K, 0, 0, $3); }
+ | OP_JSET 'x' ',' label ',' label {
+ bpf_set_jmp_label($4, JTL);
+ bpf_set_jmp_label($6, JFL);
+ bpf_set_curr_instr(BPF_JMP | BPF_JSET | BPF_X, 0, 0, 0); }
+ | OP_JSET '%' 'x' ',' label ',' label {
+ bpf_set_jmp_label($5, JTL);
+ bpf_set_jmp_label($7, JFL);
+ bpf_set_curr_instr(BPF_JMP | BPF_JSET | BPF_X, 0, 0, 0); }
+ | OP_JSET '#' number ',' label {
+ bpf_set_jmp_label($5, JTL);
+ bpf_set_curr_instr(BPF_JMP | BPF_JSET | BPF_K, 0, 0, $3); }
+ | OP_JSET 'x' ',' label {
+ bpf_set_jmp_label($4, JTL);
+ bpf_set_curr_instr(BPF_JMP | BPF_JSET | BPF_X, 0, 0, 0); }
+ | OP_JSET '%' 'x' ',' label {
+ bpf_set_jmp_label($5, JTL);
+ bpf_set_curr_instr(BPF_JMP | BPF_JSET | BPF_X, 0, 0, 0); }
+ ;
+
+add
+ : OP_ADD '#' number {
+ bpf_set_curr_instr(BPF_ALU | BPF_ADD | BPF_K, 0, 0, $3); }
+ | OP_ADD 'x' {
+ bpf_set_curr_instr(BPF_ALU | BPF_ADD | BPF_X, 0, 0, 0); }
+ | OP_ADD '%' 'x' {
+ bpf_set_curr_instr(BPF_ALU | BPF_ADD | BPF_X, 0, 0, 0); }
+ ;
+
+sub
+ : OP_SUB '#' number {
+ bpf_set_curr_instr(BPF_ALU | BPF_SUB | BPF_K, 0, 0, $3); }
+ | OP_SUB 'x' {
+ bpf_set_curr_instr(BPF_ALU | BPF_SUB | BPF_X, 0, 0, 0); }
+ | OP_SUB '%' 'x' {
+ bpf_set_curr_instr(BPF_ALU | BPF_SUB | BPF_X, 0, 0, 0); }
+ ;
+
+mul
+ : OP_MUL '#' number {
+ bpf_set_curr_instr(BPF_ALU | BPF_MUL | BPF_K, 0, 0, $3); }
+ | OP_MUL 'x' {
+ bpf_set_curr_instr(BPF_ALU | BPF_MUL | BPF_X, 0, 0, 0); }
+ | OP_MUL '%' 'x' {
+ bpf_set_curr_instr(BPF_ALU | BPF_MUL | BPF_X, 0, 0, 0); }
+ ;
+
+div
+ : OP_DIV '#' number {
+ bpf_set_curr_instr(BPF_ALU | BPF_DIV | BPF_K, 0, 0, $3); }
+ | OP_DIV 'x' {
+ bpf_set_curr_instr(BPF_ALU | BPF_DIV | BPF_X, 0, 0, 0); }
+ | OP_DIV '%' 'x' {
+ bpf_set_curr_instr(BPF_ALU | BPF_DIV | BPF_X, 0, 0, 0); }
+ ;
+
+mod
+ : OP_MOD '#' number {
+ bpf_set_curr_instr(BPF_ALU | BPF_MOD | BPF_K, 0, 0, $3); }
+ | OP_MOD 'x' {
+ bpf_set_curr_instr(BPF_ALU | BPF_MOD | BPF_X, 0, 0, 0); }
+ | OP_MOD '%' 'x' {
+ bpf_set_curr_instr(BPF_ALU | BPF_MOD | BPF_X, 0, 0, 0); }
+ ;
+
+neg
+ : OP_NEG {
+ bpf_set_curr_instr(BPF_ALU | BPF_NEG, 0, 0, 0); }
+ ;
+
+and
+ : OP_AND '#' number {
+ bpf_set_curr_instr(BPF_ALU | BPF_AND | BPF_K, 0, 0, $3); }
+ | OP_AND 'x' {
+ bpf_set_curr_instr(BPF_ALU | BPF_AND | BPF_X, 0, 0, 0); }
+ | OP_AND '%' 'x' {
+ bpf_set_curr_instr(BPF_ALU | BPF_AND | BPF_X, 0, 0, 0); }
+ ;
+
+or
+ : OP_OR '#' number {
+ bpf_set_curr_instr(BPF_ALU | BPF_OR | BPF_K, 0, 0, $3); }
+ | OP_OR 'x' {
+ bpf_set_curr_instr(BPF_ALU | BPF_OR | BPF_X, 0, 0, 0); }
+ | OP_OR '%' 'x' {
+ bpf_set_curr_instr(BPF_ALU | BPF_OR | BPF_X, 0, 0, 0); }
+ ;
+
+xor
+ : OP_XOR '#' number {
+ bpf_set_curr_instr(BPF_ALU | BPF_XOR | BPF_K, 0, 0, $3); }
+ | OP_XOR 'x' {
+ bpf_set_curr_instr(BPF_ALU | BPF_XOR | BPF_X, 0, 0, 0); }
+ | OP_XOR '%' 'x' {
+ bpf_set_curr_instr(BPF_ALU | BPF_XOR | BPF_X, 0, 0, 0); }
+ ;
+
+lsh
+ : OP_LSH '#' number {
+ bpf_set_curr_instr(BPF_ALU | BPF_LSH | BPF_K, 0, 0, $3); }
+ | OP_LSH 'x' {
+ bpf_set_curr_instr(BPF_ALU | BPF_LSH | BPF_X, 0, 0, 0); }
+ | OP_LSH '%' 'x' {
+ bpf_set_curr_instr(BPF_ALU | BPF_LSH | BPF_X, 0, 0, 0); }
+ ;
+
+rsh
+ : OP_RSH '#' number {
+ bpf_set_curr_instr(BPF_ALU | BPF_RSH | BPF_K, 0, 0, $3); }
+ | OP_RSH 'x' {
+ bpf_set_curr_instr(BPF_ALU | BPF_RSH | BPF_X, 0, 0, 0); }
+ | OP_RSH '%' 'x' {
+ bpf_set_curr_instr(BPF_ALU | BPF_RSH | BPF_X, 0, 0, 0); }
+ ;
+
+ret
+ : OP_RET 'a' {
+ bpf_set_curr_instr(BPF_RET | BPF_A, 0, 0, 0); }
+ | OP_RET '%' 'a' {
+ bpf_set_curr_instr(BPF_RET | BPF_A, 0, 0, 0); }
+ | OP_RET 'x' {
+ bpf_set_curr_instr(BPF_RET | BPF_X, 0, 0, 0); }
+ | OP_RET '%' 'x' {
+ bpf_set_curr_instr(BPF_RET | BPF_X, 0, 0, 0); }
+ | OP_RET '#' number {
+ bpf_set_curr_instr(BPF_RET | BPF_K, 0, 0, $3); }
+ ;
+
+tax
+ : OP_TAX {
+ bpf_set_curr_instr(BPF_MISC | BPF_TAX, 0, 0, 0); }
+ ;
+
+txa
+ : OP_TXA {
+ bpf_set_curr_instr(BPF_MISC | BPF_TXA, 0, 0, 0); }
+ ;
+
+%%
+
+static int curr_instr = 0;
+static struct sock_filter out[BPF_MAXINSNS];
+static char **labels, **labels_jt, **labels_jf, **labels_k;
+
+static void bpf_assert_max(void)
+{
+ if (curr_instr >= BPF_MAXINSNS) {
+ fprintf(stderr, "only max %u insns allowed!\n", BPF_MAXINSNS);
+ exit(0);
+ }
+}
+
+static void bpf_set_curr_instr(uint16_t code, uint8_t jt, uint8_t jf,
+ uint32_t k)
+{
+ bpf_assert_max();
+ out[curr_instr].code = code;
+ out[curr_instr].jt = jt;
+ out[curr_instr].jf = jf;
+ out[curr_instr].k = k;
+ curr_instr++;
+}
+
+static void bpf_set_curr_label(char *label)
+{
+ bpf_assert_max();
+ labels[curr_instr] = label;
+}
+
+static void bpf_set_jmp_label(char *label, enum jmp_type type)
+{
+ bpf_assert_max();
+ switch (type) {
+ case JTL:
+ labels_jt[curr_instr] = label;
+ break;
+ case JFL:
+ labels_jf[curr_instr] = label;
+ break;
+ case JKL:
+ labels_k[curr_instr] = label;
+ break;
+ }
+}
+
+static int bpf_find_insns_offset(const char *label)
+{
+ int i, max = curr_instr, ret = -ENOENT;
+
+ for (i = 0; i < max; i++) {
+ if (labels[i] && !strcmp(label, labels[i])) {
+ ret = i;
+ break;
+ }
+ }
+
+ if (ret == -ENOENT) {
+ fprintf(stderr, "no such label \'%s\'!\n", label);
+ exit(0);
+ }
+
+ return ret;
+}
+
+static void bpf_stage_1_insert_insns(void)
+{
+ yyparse();
+}
+
+static void bpf_reduce_k_jumps(void)
+{
+ int i;
+
+ for (i = 0; i < curr_instr; i++) {
+ if (labels_k[i]) {
+ int off = bpf_find_insns_offset(labels_k[i]);
+ out[i].k = (uint32_t) (off - i - 1);
+ }
+ }
+}
+
+static void bpf_reduce_jt_jumps(void)
+{
+ int i;
+
+ for (i = 0; i < curr_instr; i++) {
+ if (labels_jt[i]) {
+ int off = bpf_find_insns_offset(labels_jt[i]);
+			out[i].jt = (uint8_t) (off - i - 1);
+ }
+ }
+}
+
+static void bpf_reduce_jf_jumps(void)
+{
+ int i;
+
+ for (i = 0; i < curr_instr; i++) {
+ if (labels_jf[i]) {
+ int off = bpf_find_insns_offset(labels_jf[i]);
+ out[i].jf = (uint8_t) (off - i - 1);
+ }
+ }
+}
+
+static void bpf_stage_2_reduce_labels(void)
+{
+ bpf_reduce_k_jumps();
+ bpf_reduce_jt_jumps();
+ bpf_reduce_jf_jumps();
+}
+
+static void bpf_pretty_print_c(void)
+{
+ int i;
+
+ for (i = 0; i < curr_instr; i++)
+ printf("{ %#04x, %2u, %2u, %#010x },\n", out[i].code,
+ out[i].jt, out[i].jf, out[i].k);
+}
+
+static void bpf_pretty_print(void)
+{
+ int i;
+
+ printf("%u,", curr_instr);
+ for (i = 0; i < curr_instr; i++)
+ printf("%u %u %u %u,", out[i].code,
+ out[i].jt, out[i].jf, out[i].k);
+ printf("\n");
+}
+
+static void bpf_init(void)
+{
+ memset(out, 0, sizeof(out));
+
+ labels = calloc(BPF_MAXINSNS, sizeof(*labels));
+ assert(labels);
+ labels_jt = calloc(BPF_MAXINSNS, sizeof(*labels_jt));
+ assert(labels_jt);
+ labels_jf = calloc(BPF_MAXINSNS, sizeof(*labels_jf));
+ assert(labels_jf);
+ labels_k = calloc(BPF_MAXINSNS, sizeof(*labels_k));
+ assert(labels_k);
+}
+
+static void bpf_destroy_labels(void)
+{
+ int i;
+
+ for (i = 0; i < curr_instr; i++) {
+ free(labels_jf[i]);
+ free(labels_jt[i]);
+ free(labels_k[i]);
+ free(labels[i]);
+ }
+}
+
+static void bpf_destroy(void)
+{
+ bpf_destroy_labels();
+ free(labels_jt);
+ free(labels_jf);
+ free(labels_k);
+ free(labels);
+}
+
+void bpf_asm_compile(FILE *fp, bool cstyle)
+{
+ yyin = fp;
+
+ bpf_init();
+ bpf_stage_1_insert_insns();
+ bpf_stage_2_reduce_labels();
+ bpf_destroy();
+
+ if (cstyle)
+ bpf_pretty_print_c();
+ else
+ bpf_pretty_print();
+
+ if (fp != stdin)
+ fclose(yyin);
+}
+
+void yyerror(const char *str)
+{
+ exit(1);
+}
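
For orientation (not part of the patch), a minimal worked example of what the assembler above produces. The opcode values follow the BPF_* constants from <linux/filter.h>, and the jump offsets are filled in by the label-reduction pass as target - current - 1. The four-instruction filter below passes ARP frames and drops everything else; when C-style output is selected (the cstyle argument to bpf_asm_compile()), bpf_pretty_print_c() emits roughly the array shown:

	/* bpf_asm input (illustrative):
	 *
	 *    ldh [12]            ; load the EtherType half-word
	 *    jneq #0x806, drop   ; not ARP -> jump to drop
	 *    ret #-1             ; accept the packet
	 *    drop: ret #0        ; drop it
	 */
	#include <linux/filter.h>

	struct sock_filter arp_only[] = {
		{ 0x28, 0, 0, 0x0000000c },	/* ldh [12]:          BPF_LD|BPF_H|BPF_ABS          */
		{ 0x15, 0, 1, 0x00000806 },	/* jneq #0x806, drop: BPF_JMP|BPF_JEQ|BPF_K, jf = 1 */
		{ 0x06, 0, 0, 0xffffffff },	/* ret #-1:           BPF_RET|BPF_K                  */
		{ 0x06, 0, 0, 0x00000000 },	/* drop: ret #0:      BPF_RET|BPF_K                  */
	};
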
diff --git a/tools/net/bpf_jit_disasm.c b/tools/net/bpf_jit_disasm.c
new file mode 100644
index 00000000000..c5baf9c591b
--- /dev/null
+++ b/tools/net/bpf_jit_disasm.c
@@ -0,0 +1,197 @@
+/*
+ * Minimal BPF JIT image disassembler
+ *
+ * Disassembles BPF JIT compiler emitted opcodes back to asm insn's for
+ * debugging or verification purposes.
+ *
+ * To get the disassembly of the JIT code, do the following:
+ *
+ * 1) `echo 2 > /proc/sys/net/core/bpf_jit_enable`
+ * 2) Load a BPF filter (e.g. `tcpdump -p -n -s 0 -i eth1 host 192.168.20.0/24`)
+ * 3) Run e.g. `bpf_jit_disasm -o` to read out the last JIT code
+ *
+ * Copyright 2013 Daniel Borkmann <borkmann@redhat.com>
+ * Licensed under the GNU General Public License, version 2.0 (GPLv2)
+ */
+
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <unistd.h>
+#include <string.h>
+#include <bfd.h>
+#include <dis-asm.h>
+#include <sys/klog.h>
+#include <sys/types.h>
+#include <regex.h>
+
+static void get_exec_path(char *tpath, size_t size)
+{
+ char *path;
+ ssize_t len;
+
+ snprintf(tpath, size, "/proc/%d/exe", (int) getpid());
+ tpath[size - 1] = 0;
+
+ path = strdup(tpath);
+ assert(path);
+
+ len = readlink(path, tpath, size);
+ tpath[len] = 0;
+
+ free(path);
+}
+
+static void get_asm_insns(uint8_t *image, size_t len, int opcodes)
+{
+ int count, i, pc = 0;
+ char tpath[256];
+ struct disassemble_info info;
+ disassembler_ftype disassemble;
+ bfd *bfdf;
+
+ memset(tpath, 0, sizeof(tpath));
+ get_exec_path(tpath, sizeof(tpath));
+
+ bfdf = bfd_openr(tpath, NULL);
+ assert(bfdf);
+ assert(bfd_check_format(bfdf, bfd_object));
+
+ init_disassemble_info(&info, stdout, (fprintf_ftype) fprintf);
+ info.arch = bfd_get_arch(bfdf);
+ info.mach = bfd_get_mach(bfdf);
+ info.buffer = image;
+ info.buffer_length = len;
+
+ disassemble_init_for_target(&info);
+
+ disassemble = disassembler(bfdf);
+ assert(disassemble);
+
+ do {
+ printf("%4x:\t", pc);
+
+ count = disassemble(pc, &info);
+
+ if (opcodes) {
+ printf("\n\t");
+ for (i = 0; i < count; ++i)
+ printf("%02x ", (uint8_t) image[pc + i]);
+ }
+ printf("\n");
+
+ pc += count;
+ } while(count > 0 && pc < len);
+
+ bfd_close(bfdf);
+}
+
+static char *get_klog_buff(int *klen)
+{
+ int ret, len = klogctl(10, NULL, 0);
+ char *buff = malloc(len);
+
+ assert(buff && klen);
+ ret = klogctl(3, buff, len);
+ assert(ret >= 0);
+ *klen = ret;
+
+ return buff;
+}
+
+static void put_klog_buff(char *buff)
+{
+ free(buff);
+}
+
+static int get_last_jit_image(char *haystack, size_t hlen,
+ uint8_t *image, size_t ilen)
+{
+ char *ptr, *pptr, *tmp;
+ off_t off = 0;
+ int ret, flen, proglen, pass, ulen = 0;
+ regmatch_t pmatch[1];
+ unsigned long base;
+ regex_t regex;
+
+ if (hlen == 0)
+ return 0;
+
+ ret = regcomp(&regex, "flen=[[:alnum:]]+ proglen=[[:digit:]]+ "
+ "pass=[[:digit:]]+ image=[[:xdigit:]]+", REG_EXTENDED);
+ assert(ret == 0);
+
+ ptr = haystack;
+ while (1) {
+ ret = regexec(&regex, ptr, 1, pmatch, 0);
+ if (ret == 0) {
+ ptr += pmatch[0].rm_eo;
+ off += pmatch[0].rm_eo;
+ assert(off < hlen);
+ } else
+ break;
+ }
+
+ ptr = haystack + off - (pmatch[0].rm_eo - pmatch[0].rm_so);
+ ret = sscanf(ptr, "flen=%d proglen=%d pass=%d image=%lx",
+ &flen, &proglen, &pass, &base);
+ if (ret != 4)
+ return 0;
+
+ tmp = ptr = haystack + off;
+ while ((ptr = strtok(tmp, "\n")) != NULL && ulen < ilen) {
+ tmp = NULL;
+ if (!strstr(ptr, "JIT code"))
+ continue;
+ pptr = ptr;
+ while ((ptr = strstr(pptr, ":")))
+ pptr = ptr + 1;
+ ptr = pptr;
+ do {
+ image[ulen++] = (uint8_t) strtoul(pptr, &pptr, 16);
+ if (ptr == pptr || ulen >= ilen) {
+ ulen--;
+ break;
+ }
+ ptr = pptr;
+ } while (1);
+ }
+
+ assert(ulen == proglen);
+ printf("%d bytes emitted from JIT compiler (pass:%d, flen:%d)\n",
+ proglen, pass, flen);
+ printf("%lx + <x>:\n", base);
+
+ regfree(&regex);
+ return ulen;
+}
+
+int main(int argc, char **argv)
+{
+ int len, klen, opcodes = 0;
+ char *kbuff;
+ static uint8_t image[32768];
+
+ if (argc > 1) {
+ if (!strncmp("-o", argv[argc - 1], 2)) {
+ opcodes = 1;
+ } else {
+ printf("usage: bpf_jit_disasm [-o: show opcodes]\n");
+ exit(0);
+ }
+ }
+
+ bfd_init();
+ memset(image, 0, sizeof(image));
+
+ kbuff = get_klog_buff(&klen);
+
+ len = get_last_jit_image(kbuff, klen, image, sizeof(image));
+ if (len > 0)
+ get_asm_insns(image, len, opcodes);
+
+ put_klog_buff(kbuff);
+
+ return 0;
+}
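
For reference, get_last_jit_image() above scans the kernel log for the dump the JIT emits when bpf_jit_enable is set to 2. The lines it matches look roughly like the following (the addresses and opcode bytes here are purely illustrative):

	flen=6 proglen=70 pass=3 image=ffffffffa0069c8f
	JIT code: 00000000: 55 48 89 e5 48 83 ec 60 48 89 5d f8 44 8b 4f 68
	JIT code: 00000010: 44 2b 4f 6c 4c 8b 87 d8 00 00 00 be 0c 00 00 00
	...

The regex selects the last flen=/proglen=/pass=/image= header, sscanf() recovers proglen and the base address from it, and the hex bytes after the final ':' of each "JIT code" line are accumulated into image[] for disassembly.
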
diff --git a/tools/nfsd/inject_fault.sh b/tools/nfsd/inject_fault.sh
new file mode 100755
index 00000000000..06a399ac8b2
--- /dev/null
+++ b/tools/nfsd/inject_fault.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+#
+# Copyright (c) 2011 Bryan Schumaker <bjschuma@netapp.com>
+#
+# Script for easier NFSD fault injection
+
+# Check that debugfs has been mounted
+DEBUGFS=`cat /proc/mounts | grep debugfs`
+if [ "$DEBUGFS" == "" ]; then
+ echo "debugfs does not appear to be mounted!"
+ echo "Please mount debugfs and try again"
+ exit 1
+fi
+
+# Check that the fault injection directory exists
+DEBUGDIR=`echo $DEBUGFS | awk '{print $2}'`/nfsd
+if [ ! -d "$DEBUGDIR" ]; then
+ echo "$DEBUGDIR does not exist"
+ echo "Check that your .config selects CONFIG_NFSD_FAULT_INJECTION"
+ exit 1
+fi
+
+function help()
+{
+ echo "Usage $0 injection_type [count]"
+ echo ""
+ echo "Injection types are:"
+ ls $DEBUGDIR
+ exit 1
+}
+
+if [ $# == 0 ]; then
+ help
+elif [ ! -f $DEBUGDIR/$1 ]; then
+ help
+elif [ $# != 2 ]; then
+ COUNT=0
+else
+ COUNT=$2
+fi
+
+BEFORE=`mktemp`
+AFTER=`mktemp`
+dmesg > $BEFORE
+echo $COUNT > $DEBUGDIR/$1
+dmesg > $AFTER
+# Capture lines that only exist in the $AFTER file
+diff $BEFORE $AFTER | grep ">"
+rm -f $BEFORE $AFTER
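
For example, assuming the running kernel exposes a forget_clients injection type (the available types are simply the files listed by the script's help output), running "./inject_fault.sh forget_clients 1" writes 1 to that debugfs file and prints the dmesg lines produced by the injection.
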
diff --git a/tools/perf/.gitignore b/tools/perf/.gitignore
index e1d60d78078..782d86e961b 100644
--- a/tools/perf/.gitignore
+++ b/tools/perf/.gitignore
@@ -1,4 +1,3 @@
-PERF-BUILD-OPTIONS
PERF-CFLAGS
PERF-GUI-VARS
PERF-VERSION-FILE
@@ -14,7 +13,14 @@ perf*.html
common-cmds.h
perf.data
perf.data.old
+output.svg
perf-archive
tags
TAGS
cscope*
+config.mak
+config.mak.autogen
+*-bison.*
+*-flex.*
+*.pyc
+*.pyo
diff --git a/tools/perf/Documentation/Makefile b/tools/perf/Documentation/Makefile
index bd498d49695..3ba1c0b0990 100644
--- a/tools/perf/Documentation/Makefile
+++ b/tools/perf/Documentation/Makefile
@@ -1,3 +1,6 @@
+include ../../scripts/Makefile.include
+include ../config/utilities.mak
+
MAN1_TXT= \
$(filter-out $(addsuffix .txt, $(ARTICLES) $(SP_ARTICLES)), \
$(wildcard perf-*.txt)) \
@@ -6,10 +9,11 @@ MAN5_TXT=
MAN7_TXT=
MAN_TXT = $(MAN1_TXT) $(MAN5_TXT) $(MAN7_TXT)
-MAN_XML=$(patsubst %.txt,%.xml,$(MAN_TXT))
-MAN_HTML=$(patsubst %.txt,%.html,$(MAN_TXT))
+_MAN_XML=$(patsubst %.txt,%.xml,$(MAN_TXT))
+_MAN_HTML=$(patsubst %.txt,%.html,$(MAN_TXT))
-DOC_HTML=$(MAN_HTML)
+MAN_XML=$(addprefix $(OUTPUT),$(_MAN_XML))
+MAN_HTML=$(addprefix $(OUTPUT),$(_MAN_HTML))
ARTICLES =
# with their own formatting rules.
@@ -18,11 +22,17 @@ API_DOCS = $(patsubst %.txt,%,$(filter-out technical/api-index-skel.txt technica
SP_ARTICLES += $(API_DOCS)
SP_ARTICLES += technical/api-index
-DOC_HTML += $(patsubst %,%.html,$(ARTICLES) $(SP_ARTICLES))
+_DOC_HTML = $(_MAN_HTML)
+_DOC_HTML+=$(patsubst %,%.html,$(ARTICLES) $(SP_ARTICLES))
+DOC_HTML=$(addprefix $(OUTPUT),$(_DOC_HTML))
+
+_DOC_MAN1=$(patsubst %.txt,%.1,$(MAN1_TXT))
+_DOC_MAN5=$(patsubst %.txt,%.5,$(MAN5_TXT))
+_DOC_MAN7=$(patsubst %.txt,%.7,$(MAN7_TXT))
-DOC_MAN1=$(patsubst %.txt,%.1,$(MAN1_TXT))
-DOC_MAN5=$(patsubst %.txt,%.5,$(MAN5_TXT))
-DOC_MAN7=$(patsubst %.txt,%.7,$(MAN7_TXT))
+DOC_MAN1=$(addprefix $(OUTPUT),$(_DOC_MAN1))
+DOC_MAN5=$(addprefix $(OUTPUT),$(_DOC_MAN5))
+DOC_MAN7=$(addprefix $(OUTPUT),$(_DOC_MAN7))
# Make the path relative to DESTDIR, not prefix
ifndef DESTDIR
@@ -50,6 +60,7 @@ MAKEINFO=makeinfo
INSTALL_INFO=install-info
DOCBOOK2X_TEXI=docbook2x-texi
DBLATEX=dblatex
+XMLTO=xmlto
ifndef PERL_PATH
PERL_PATH = /usr/bin/perl
endif
@@ -57,6 +68,16 @@ endif
-include ../config.mak.autogen
-include ../config.mak
+_tmp_tool_path := $(call get-executable,$(ASCIIDOC))
+ifeq ($(_tmp_tool_path),)
+ missing_tools = $(ASCIIDOC)
+endif
+
+_tmp_tool_path := $(call get-executable,$(XMLTO))
+ifeq ($(_tmp_tool_path),)
+ missing_tools += $(XMLTO)
+endif
+
#
# For asciidoc ...
# -7.1.2, no extra settings are needed.
@@ -123,17 +144,18 @@ NO_SUBDIR = :
endif
ifneq ($(findstring $(MAKEFLAGS),s),s)
-ifndef V
- QUIET_ASCIIDOC = @echo ' ' ASCIIDOC $@;
- QUIET_XMLTO = @echo ' ' XMLTO $@;
- QUIET_DB2TEXI = @echo ' ' DB2TEXI $@;
- QUIET_MAKEINFO = @echo ' ' MAKEINFO $@;
- QUIET_DBLATEX = @echo ' ' DBLATEX $@;
- QUIET_XSLTPROC = @echo ' ' XSLTPROC $@;
- QUIET_GEN = @echo ' ' GEN $@;
+ifneq ($(V),1)
+ QUIET_ASCIIDOC = @echo ' ASCIIDOC '$@;
+ QUIET_XMLTO = @echo ' XMLTO '$@;
+ QUIET_DB2TEXI = @echo ' DB2TEXI '$@;
+ QUIET_MAKEINFO = @echo ' MAKEINFO '$@;
+ QUIET_DBLATEX = @echo ' DBLATEX '$@;
+ QUIET_XSLTPROC = @echo ' XSLTPROC '$@;
+ QUIET_GEN = @echo ' GEN '$@;
QUIET_STDERR = 2> /dev/null
QUIET_SUBDIR0 = +@subdir=
- QUIET_SUBDIR1 = ;$(NO_SUBDIR) echo ' ' SUBDIR $$subdir; \
+ QUIET_SUBDIR1 = ;$(NO_SUBDIR) \
+ echo ' SUBDIR ' $$subdir; \
$(MAKE) $(PRINT_DIR) -C $$subdir
export V
endif
@@ -150,53 +172,67 @@ man1: $(DOC_MAN1)
man5: $(DOC_MAN5)
man7: $(DOC_MAN7)
-info: perf.info perfman.info
+info: $(OUTPUT)perf.info $(OUTPUT)perfman.info
-pdf: user-manual.pdf
+pdf: $(OUTPUT)user-manual.pdf
install: install-man
-install-man: man
- $(INSTALL) -d -m 755 $(DESTDIR)$(man1dir)
-# $(INSTALL) -d -m 755 $(DESTDIR)$(man5dir)
-# $(INSTALL) -d -m 755 $(DESTDIR)$(man7dir)
- $(INSTALL) -m 644 $(DOC_MAN1) $(DESTDIR)$(man1dir)
-# $(INSTALL) -m 644 $(DOC_MAN5) $(DESTDIR)$(man5dir)
-# $(INSTALL) -m 644 $(DOC_MAN7) $(DESTDIR)$(man7dir)
+check-man-tools:
+ifdef missing_tools
+ $(error "You need to install $(missing_tools) for man pages")
+endif
+
+do-install-man: man
+ $(call QUIET_INSTALL, Documentation-man) \
+ $(INSTALL) -d -m 755 $(DESTDIR)$(man1dir); \
+# $(INSTALL) -d -m 755 $(DESTDIR)$(man5dir); \
+# $(INSTALL) -d -m 755 $(DESTDIR)$(man7dir); \
+ $(INSTALL) -m 644 $(DOC_MAN1) $(DESTDIR)$(man1dir); \
+# $(INSTALL) -m 644 $(DOC_MAN5) $(DESTDIR)$(man5dir); \
+# $(INSTALL) -m 644 $(DOC_MAN7) $(DESTDIR)$(man7dir)
+
+install-man: check-man-tools man
+
+ifdef missing_tools
+ DO_INSTALL_MAN = $(warning Please install $(missing_tools) to have the man pages installed)
+else
+ DO_INSTALL_MAN = do-install-man
+endif
+
+try-install-man: $(DO_INSTALL_MAN)
install-info: info
- $(INSTALL) -d -m 755 $(DESTDIR)$(infodir)
- $(INSTALL) -m 644 perf.info perfman.info $(DESTDIR)$(infodir)
+ $(call QUIET_INSTALL, Documentation-info) \
+ $(INSTALL) -d -m 755 $(DESTDIR)$(infodir); \
+ $(INSTALL) -m 644 $(OUTPUT)perf.info $(OUTPUT)perfman.info $(DESTDIR)$(infodir); \
if test -r $(DESTDIR)$(infodir)/dir; then \
- $(INSTALL_INFO) --info-dir=$(DESTDIR)$(infodir) perf.info ;\
- $(INSTALL_INFO) --info-dir=$(DESTDIR)$(infodir) perfman.info ;\
+ $(INSTALL_INFO) --info-dir=$(DESTDIR)$(infodir) perf.info ;\
+ $(INSTALL_INFO) --info-dir=$(DESTDIR)$(infodir) perfman.info ;\
else \
echo "No directory found in $(DESTDIR)$(infodir)" >&2 ; \
fi
install-pdf: pdf
- $(INSTALL) -d -m 755 $(DESTDIR)$(pdfdir)
- $(INSTALL) -m 644 user-manual.pdf $(DESTDIR)$(pdfdir)
-
-install-html: html
- '$(SHELL_PATH_SQ)' ./install-webdoc.sh $(DESTDIR)$(htmldir)
+ $(call QUIET_INSTALL, Documentation-pdf) \
+ $(INSTALL) -d -m 755 $(DESTDIR)$(pdfdir); \
+ $(INSTALL) -m 644 $(OUTPUT)user-manual.pdf $(DESTDIR)$(pdfdir)
-../PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
- $(QUIET_SUBDIR0)../ $(QUIET_SUBDIR1) PERF-VERSION-FILE
+#install-html: html
+# '$(SHELL_PATH_SQ)' ./install-webdoc.sh $(DESTDIR)$(htmldir)
--include ../PERF-VERSION-FILE
#
# Determine "include::" file references in asciidoc files.
#
-doc.dep : $(wildcard *.txt) build-docdep.perl
+$(OUTPUT)doc.dep : $(wildcard *.txt) build-docdep.perl
$(QUIET_GEN)$(RM) $@+ $@ && \
$(PERL_PATH) ./build-docdep.perl >$@+ $(QUIET_STDERR) && \
mv $@+ $@
--include doc.dep
+-include $(OUTPUT)doc.dep
-cmds_txt = cmds-ancillaryinterrogators.txt \
+_cmds_txt = cmds-ancillaryinterrogators.txt \
cmds-ancillarymanipulators.txt \
cmds-mainporcelain.txt \
cmds-plumbinginterrogators.txt \
@@ -205,32 +241,38 @@ cmds_txt = cmds-ancillaryinterrogators.txt \
cmds-synchelpers.txt \
cmds-purehelpers.txt \
cmds-foreignscminterface.txt
+cmds_txt=$(addprefix $(OUTPUT),$(_cmds_txt))
-$(cmds_txt): cmd-list.made
+$(cmds_txt): $(OUTPUT)cmd-list.made
-cmd-list.made: cmd-list.perl ../command-list.txt $(MAN1_TXT)
+$(OUTPUT)cmd-list.made: cmd-list.perl ../command-list.txt $(MAN1_TXT)
$(QUIET_GEN)$(RM) $@ && \
$(PERL_PATH) ./cmd-list.perl ../command-list.txt $(QUIET_STDERR) && \
date >$@
+CLEAN_FILES = \
+ $(MAN_XML) $(addsuffix +,$(MAN_XML)) \
+ $(MAN_HTML) $(addsuffix +,$(MAN_HTML)) \
+ $(DOC_HTML) $(DOC_MAN1) $(DOC_MAN5) $(DOC_MAN7) \
+ $(OUTPUT)*.texi $(OUTPUT)*.texi+ $(OUTPUT)*.texi++ \
+ $(OUTPUT)perf.info $(OUTPUT)perfman.info \
+ $(OUTPUT)howto-index.txt $(OUTPUT)howto/*.html $(OUTPUT)doc.dep \
+ $(OUTPUT)technical/api-*.html $(OUTPUT)technical/api-index.txt \
+ $(cmds_txt) $(OUTPUT)*.made
clean:
- $(RM) *.xml *.xml+ *.html *.html+ *.1 *.5 *.7
- $(RM) *.texi *.texi+ *.texi++ perf.info perfman.info
- $(RM) howto-index.txt howto/*.html doc.dep
- $(RM) technical/api-*.html technical/api-index.txt
- $(RM) $(cmds_txt) *.made
+ $(call QUIET_CLEAN, Documentation) $(RM) $(CLEAN_FILES)
-$(MAN_HTML): %.html : %.txt
+$(MAN_HTML): $(OUTPUT)%.html : %.txt
$(QUIET_ASCIIDOC)$(RM) $@+ $@ && \
$(ASCIIDOC) -b xhtml11 -d manpage -f asciidoc.conf \
$(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) -o $@+ $< && \
mv $@+ $@
-%.1 %.5 %.7 : %.xml
+$(OUTPUT)%.1 $(OUTPUT)%.5 $(OUTPUT)%.7 : $(OUTPUT)%.xml
$(QUIET_XMLTO)$(RM) $@ && \
- xmlto -m $(MANPAGE_XSL) $(XMLTO_EXTRA) man $<
+ $(XMLTO) -o $(OUTPUT). -m $(MANPAGE_XSL) $(XMLTO_EXTRA) man $<
-%.xml : %.txt
+$(OUTPUT)%.xml : %.txt
$(QUIET_ASCIIDOC)$(RM) $@+ $@ && \
$(ASCIIDOC) -b docbook -d manpage -f asciidoc.conf \
$(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) -o $@+ $< && \
@@ -239,25 +281,25 @@ $(MAN_HTML): %.html : %.txt
XSLT = docbook.xsl
XSLTOPTS = --xinclude --stringparam html.stylesheet docbook-xsl.css
-user-manual.html: user-manual.xml
+$(OUTPUT)user-manual.html: $(OUTPUT)user-manual.xml
$(QUIET_XSLTPROC)xsltproc $(XSLTOPTS) -o $@ $(XSLT) $<
-perf.info: user-manual.texi
- $(QUIET_MAKEINFO)$(MAKEINFO) --no-split -o $@ user-manual.texi
+$(OUTPUT)perf.info: $(OUTPUT)user-manual.texi
+ $(QUIET_MAKEINFO)$(MAKEINFO) --no-split -o $@ $(OUTPUT)user-manual.texi
-user-manual.texi: user-manual.xml
+$(OUTPUT)user-manual.texi: $(OUTPUT)user-manual.xml
$(QUIET_DB2TEXI)$(RM) $@+ $@ && \
- $(DOCBOOK2X_TEXI) user-manual.xml --encoding=UTF-8 --to-stdout >$@++ && \
+ $(DOCBOOK2X_TEXI) $(OUTPUT)user-manual.xml --encoding=UTF-8 --to-stdout >$@++ && \
$(PERL_PATH) fix-texi.perl <$@++ >$@+ && \
rm $@++ && \
mv $@+ $@
-user-manual.pdf: user-manual.xml
+$(OUTPUT)user-manual.pdf: $(OUTPUT)user-manual.xml
$(QUIET_DBLATEX)$(RM) $@+ $@ && \
$(DBLATEX) -o $@+ -p /etc/asciidoc/dblatex/asciidoc-dblatex.xsl -s /etc/asciidoc/dblatex/asciidoc-dblatex.sty $< && \
mv $@+ $@
-perfman.texi: $(MAN_XML) cat-texi.perl
+$(OUTPUT)perfman.texi: $(MAN_XML) cat-texi.perl
$(QUIET_DB2TEXI)$(RM) $@+ $@ && \
($(foreach xml,$(MAN_XML),$(DOCBOOK2X_TEXI) --encoding=UTF-8 \
--to-stdout $(xml) &&) true) > $@++ && \
@@ -265,7 +307,7 @@ perfman.texi: $(MAN_XML) cat-texi.perl
rm $@++ && \
mv $@+ $@
-perfman.info: perfman.texi
+$(OUTPUT)perfman.info: $(OUTPUT)perfman.texi
$(QUIET_MAKEINFO)$(MAKEINFO) --no-split --no-validate $*.texi
$(patsubst %.txt,%.texi,$(MAN_TXT)): %.texi : %.xml
@@ -288,15 +330,14 @@ $(patsubst %.txt,%.html,$(wildcard howto/*.txt)): %.html : %.txt
sed -e '1,/^$$/d' $< | $(ASCIIDOC) -b xhtml11 - >$@+ && \
mv $@+ $@
-install-webdoc : html
- '$(SHELL_PATH_SQ)' ./install-webdoc.sh $(WEBDOC_DEST)
-
-quick-install: quick-install-man
+# UNIMPLEMENTED
+#install-webdoc : html
+# '$(SHELL_PATH_SQ)' ./install-webdoc.sh $(WEBDOC_DEST)
-quick-install-man:
- '$(SHELL_PATH_SQ)' ./install-doc-quick.sh $(DOC_REF) $(DESTDIR)$(mandir)
+# quick-install: quick-install-man
-quick-install-html:
- '$(SHELL_PATH_SQ)' ./install-doc-quick.sh $(HTML_REF) $(DESTDIR)$(htmldir)
+# quick-install-man:
+# '$(SHELL_PATH_SQ)' ./install-doc-quick.sh $(DOC_REF) $(DESTDIR)$(mandir)
-.PHONY: .FORCE-PERF-VERSION-FILE
+#quick-install-html:
+# '$(SHELL_PATH_SQ)' ./install-doc-quick.sh $(HTML_REF) $(DESTDIR)$(htmldir)
diff --git a/tools/perf/Documentation/android.txt b/tools/perf/Documentation/android.txt
new file mode 100644
index 00000000000..8484c3a04a6
--- /dev/null
+++ b/tools/perf/Documentation/android.txt
@@ -0,0 +1,78 @@
+How to compile perf for Android
+=========================================
+
+I. Set the Android NDK environment
+------------------------------------------------
+
+(a). Use the Android NDK
+------------------------------------------------
+1. You need to download and install the Android Native Development Kit (NDK).
+Set the NDK variable to point to the path where you installed the NDK:
+ export NDK=/path/to/android-ndk
+
+2. Set cross-compiling environment variables for NDK toolchain and sysroot.
+For arm:
+ export NDK_TOOLCHAIN=${NDK}/toolchains/arm-linux-androideabi-4.6/prebuilt/linux-x86/bin/arm-linux-androideabi-
+ export NDK_SYSROOT=${NDK}/platforms/android-9/arch-arm
+For x86:
+ export NDK_TOOLCHAIN=${NDK}/toolchains/x86-4.6/prebuilt/linux-x86/bin/i686-linux-android-
+ export NDK_SYSROOT=${NDK}/platforms/android-9/arch-x86
+
+This method does not work for Android NDK versions up to Revision 8b.
+perf uses some bionic enhancements that are not included in these NDK versions.
+You can use method (b) described below instead.
+
+(b). Use the Android source tree
+-----------------------------------------------
+1. Download the master branch of the Android source tree.
+Set the environment for the target you want using:
+ source build/envsetup.sh
+ lunch
+
+2. Build your own NDK sysroot to contain latest bionic changes and set the
+NDK sysroot environment variable.
+ cd ${ANDROID_BUILD_TOP}/ndk
+For arm:
+ ./build/tools/build-ndk-sysroot.sh --abi=arm
+ export NDK_SYSROOT=${ANDROID_BUILD_TOP}/ndk/build/platforms/android-3/arch-arm
+For x86:
+ ./build/tools/build-ndk-sysroot.sh --abi=x86
+ export NDK_SYSROOT=${ANDROID_BUILD_TOP}/ndk/build/platforms/android-3/arch-x86
+
+3. Set the NDK toolchain environment variable.
+For arm:
+ export NDK_TOOLCHAIN=${ANDROID_TOOLCHAIN}/arm-linux-androideabi-
+For x86:
+ export NDK_TOOLCHAIN=${ANDROID_TOOLCHAIN}/i686-linux-android-
+
+II. Compile perf for Android
+------------------------------------------------
+You need to run make with the NDK toolchain and sysroot defined above:
+For arm:
+ make ARCH=arm CROSS_COMPILE=${NDK_TOOLCHAIN} CFLAGS="--sysroot=${NDK_SYSROOT}"
+For x86:
+ make ARCH=x86 CROSS_COMPILE=${NDK_TOOLCHAIN} CFLAGS="--sysroot=${NDK_SYSROOT}"
+
+III. Install perf
+-----------------------------------------------
+You need to connect to your Android device/emulator using adb.
+Install perf using:
+ adb push perf /data/perf
+
+If you also want to use perf-archive you need busybox tools for Android.
+For installing perf-archive, you first need to replace #!/bin/bash with #!/system/bin/sh:
+ sed 's/#!\/bin\/bash/#!\/system\/bin\/sh/g' perf-archive >> /tmp/perf-archive
+ chmod +x /tmp/perf-archive
+ adb push /tmp/perf-archive /data/perf-archive
+
+IV. Environment settings for running perf
+------------------------------------------------
+Some perf features need environment variables to run properly.
+You need to set these before running perf on the target:
+ adb shell
+ # PERF_PAGER=cat
+
+V. Run perf
+------------------------------------------------
+Run perf on your device/emulator to which you previously connected using adb:
+ # ./data/perf
diff --git a/tools/perf/Documentation/examples.txt b/tools/perf/Documentation/examples.txt
index 8eb6c489fb1..a4e39215648 100644
--- a/tools/perf/Documentation/examples.txt
+++ b/tools/perf/Documentation/examples.txt
@@ -17,8 +17,8 @@ titan:~> perf list
kmem:kmem_cache_alloc_node [Tracepoint event]
kmem:kfree [Tracepoint event]
kmem:kmem_cache_free [Tracepoint event]
- kmem:mm_page_free_direct [Tracepoint event]
- kmem:mm_pagevec_free [Tracepoint event]
+ kmem:mm_page_free [Tracepoint event]
+ kmem:mm_page_free_batched [Tracepoint event]
kmem:mm_page_alloc [Tracepoint event]
kmem:mm_page_alloc_zone_locked [Tracepoint event]
kmem:mm_page_pcpu_drain [Tracepoint event]
@@ -29,15 +29,15 @@ measured. For example the page alloc/free properties of a 'hackbench
run' are:
titan:~> perf stat -e kmem:mm_page_pcpu_drain -e kmem:mm_page_alloc
- -e kmem:mm_pagevec_free -e kmem:mm_page_free_direct ./hackbench 10
+ -e kmem:mm_page_free_batched -e kmem:mm_page_free ./hackbench 10
Time: 0.575
Performance counter stats for './hackbench 10':
13857 kmem:mm_page_pcpu_drain
27576 kmem:mm_page_alloc
- 6025 kmem:mm_pagevec_free
- 20934 kmem:mm_page_free_direct
+ 6025 kmem:mm_page_free_batched
+ 20934 kmem:mm_page_free
0.613972165 seconds time elapsed
@@ -45,8 +45,8 @@ You can observe the statistical properties as well, by using the
'repeat the workload N times' feature of perf stat:
titan:~> perf stat --repeat 5 -e kmem:mm_page_pcpu_drain -e
- kmem:mm_page_alloc -e kmem:mm_pagevec_free -e
- kmem:mm_page_free_direct ./hackbench 10
+ kmem:mm_page_alloc -e kmem:mm_page_free_batched -e
+ kmem:mm_page_free ./hackbench 10
Time: 0.627
Time: 0.644
Time: 0.564
@@ -57,8 +57,8 @@ You can observe the statistical properties as well, by using the
12920 kmem:mm_page_pcpu_drain ( +- 3.359% )
25035 kmem:mm_page_alloc ( +- 3.783% )
- 6104 kmem:mm_pagevec_free ( +- 0.934% )
- 18376 kmem:mm_page_free_direct ( +- 4.941% )
+ 6104 kmem:mm_page_free_batched ( +- 0.934% )
+ 18376 kmem:mm_page_free ( +- 4.941% )
0.643954516 seconds time elapsed ( +- 2.363% )
@@ -66,7 +66,7 @@ Furthermore, these tracepoints can be used to sample the workload as
well. For example the page allocations done by a 'git gc' can be
captured the following way:
- titan:~/git> perf record -f -e kmem:mm_page_alloc -c 1 ./git gc
+ titan:~/git> perf record -e kmem:mm_page_alloc -c 1 ./git gc
Counting objects: 1148, done.
Delta compression using up to 2 threads.
Compressing objects: 100% (450/450), done.
@@ -120,7 +120,7 @@ Furthermore, call-graph sampling can be done too, of page
allocations - to see precisely what kind of page allocations there
are:
- titan:~/git> perf record -f -g -e kmem:mm_page_alloc -c 1 ./git gc
+ titan:~/git> perf record -g -e kmem:mm_page_alloc -c 1 ./git gc
Counting objects: 1148, done.
Delta compression using up to 2 threads.
Compressing objects: 100% (450/450), done.
@@ -158,15 +158,15 @@ Or you can observe the whole system's page allocations for 10
seconds:
titan:~/git> perf stat -a -e kmem:mm_page_pcpu_drain -e
-kmem:mm_page_alloc -e kmem:mm_pagevec_free -e
-kmem:mm_page_free_direct sleep 10
+kmem:mm_page_alloc -e kmem:mm_page_free_batched -e
+kmem:mm_page_free sleep 10
Performance counter stats for 'sleep 10':
171585 kmem:mm_page_pcpu_drain
322114 kmem:mm_page_alloc
- 73623 kmem:mm_pagevec_free
- 254115 kmem:mm_page_free_direct
+ 73623 kmem:mm_page_free_batched
+ 254115 kmem:mm_page_free
10.000591410 seconds time elapsed
@@ -174,15 +174,15 @@ Or observe how fluctuating the page allocations are, via statistical
analysis done over ten 1-second intervals:
titan:~/git> perf stat --repeat 10 -a -e kmem:mm_page_pcpu_drain -e
- kmem:mm_page_alloc -e kmem:mm_pagevec_free -e
- kmem:mm_page_free_direct sleep 1
+ kmem:mm_page_alloc -e kmem:mm_page_free_batched -e
+ kmem:mm_page_free sleep 1
Performance counter stats for 'sleep 1' (10 runs):
17254 kmem:mm_page_pcpu_drain ( +- 3.709% )
34394 kmem:mm_page_alloc ( +- 4.617% )
- 7509 kmem:mm_pagevec_free ( +- 4.820% )
- 25653 kmem:mm_page_free_direct ( +- 3.672% )
+ 7509 kmem:mm_page_free_batched ( +- 4.820% )
+ 25653 kmem:mm_page_free ( +- 3.672% )
1.058135029 seconds time elapsed ( +- 3.089% )
diff --git a/tools/perf/Documentation/jit-interface.txt b/tools/perf/Documentation/jit-interface.txt
new file mode 100644
index 00000000000..a8656f56491
--- /dev/null
+++ b/tools/perf/Documentation/jit-interface.txt
@@ -0,0 +1,15 @@
+perf supports a simple JIT interface to resolve symbols for dynamic code generated
+by a JIT.
+
+The JIT has to write a /tmp/perf-%d.map (%d = pid of the process) file.
+
+This is a text file.
+
+Each line has the following format, fields separated with spaces:
+
+START SIZE symbolname
+
+START and SIZE are hex numbers without 0x.
+symbolname is the rest of the line, so it could contain special characters.
+
+The ownership of the file has to match the process.
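
A minimal sketch of a JIT emitting such a map file (not part of the patch; the function and symbol names are made up for illustration, only the path and line format follow the description above):

	#include <stdio.h>
	#include <unistd.h>

	/* Append one "START SIZE symbolname" line; START and SIZE in hex without 0x. */
	static void perf_map_add(unsigned long start, unsigned long size,
				 const char *name)
	{
		char path[64];
		FILE *f;

		snprintf(path, sizeof(path), "/tmp/perf-%d.map", (int) getpid());
		f = fopen(path, "a");	/* file must be owned by the JIT process itself */
		if (!f)
			return;
		fprintf(f, "%lx %lx %s\n", start, size, name);
		fclose(f);
	}

	/* e.g. perf_map_add((unsigned long) code, code_len, "jitted_hot_loop"); */
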
diff --git a/tools/perf/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt
index c9dcade0683..e9cd39a92dc 100644
--- a/tools/perf/Documentation/perf-annotate.txt
+++ b/tools/perf/Documentation/perf-annotate.txt
@@ -1,5 +1,5 @@
perf-annotate(1)
-==============
+================
NAME
----
@@ -8,7 +8,7 @@ perf-annotate - Read perf.data (created by perf record) and display annotated co
SYNOPSIS
--------
[verse]
-'perf annotate' [-i <file> | --input=file] symbol_name
+'perf annotate' [-i <file> | --input=file] [symbol_name]
DESCRIPTION
-----------
@@ -22,8 +22,80 @@ OPTIONS
-------
-i::
--input=::
- Input file name. (default: perf.data)
+ Input file name. (default: perf.data unless stdin is a fifo)
+
+-d::
+--dsos=<dso[,dso...]>::
+ Only consider symbols in these dsos.
+-s::
+--symbol=<symbol>::
+ Symbol to annotate.
+
+-f::
+--force::
+ Don't complain, do it.
+
+-v::
+--verbose::
+ Be more verbose. (Show symbol address, etc)
+
+-D::
+--dump-raw-trace::
+ Dump raw trace in ASCII.
+
+-k::
+--vmlinux=<file>::
+ vmlinux pathname.
+
+-m::
+--modules::
+ Load module symbols. WARNING: use only with -k and LIVE kernel.
+
+-l::
+--print-line::
+ Print matching source lines (may be slow).
+
+-P::
+--full-paths::
+ Don't shorten the displayed pathnames.
+
+--stdio:: Use the stdio interface.
+
+--tui:: Use the TUI interface. Use of --tui requires a tty; if one is not
+	present, as when piping to other commands, the stdio interface is
+	used. This interface starts by centering on the line with the most
+	samples; TAB/UNTAB cycles through the lines with more samples.
+
+--gtk:: Use the GTK interface.
+
+-C::
+--cpu:: Only report samples for the list of CPUs provided. Multiple CPUs can
+ be provided as a comma-separated list with no space: 0,1. Ranges of
+ CPUs are specified with -: 0-2. Default is to report samples on all
+ CPUs.
+
+--asm-raw::
+ Show raw instruction encoding of assembly instructions.
+
+--source::
+ Interleave source code with assembly code. Enabled by default,
+ disable with --no-source.
+
+--symfs=<directory>::
+ Look for files with symbols relative to this directory.
+
+-M::
+--disassembler-style=:: Set disassembler style for objdump.
+
+--objdump=<path>::
+ Path to objdump binary.
+
+--skip-missing::
+ Skip symbols that cannot be annotated.
+
+--group::
+	Show event group information together.
SEE ALSO
--------
-linkperf:perf-record[1]
+linkperf:perf-record[1], linkperf:perf-report[1]
diff --git a/tools/perf/Documentation/perf-archive.txt b/tools/perf/Documentation/perf-archive.txt
index fae174dc7d0..ac6ecbb3e66 100644
--- a/tools/perf/Documentation/perf-archive.txt
+++ b/tools/perf/Documentation/perf-archive.txt
@@ -12,9 +12,9 @@ SYNOPSIS
DESCRIPTION
-----------
-This command runs runs perf-buildid-list --with-hits, and collects the files
-with the buildids found so that analisys of perf.data contents can be possible
-on another machine.
+This command runs perf-buildid-list --with-hits, and collects the files with the
+buildids found so that analysis of perf.data contents can be possible on another
+machine.
SEE ALSO
diff --git a/tools/perf/Documentation/perf-bench.txt b/tools/perf/Documentation/perf-bench.txt
index ae525ac5a2c..4464ad770d5 100644
--- a/tools/perf/Documentation/perf-bench.txt
+++ b/tools/perf/Documentation/perf-bench.txt
@@ -1,5 +1,5 @@
perf-bench(1)
-============
+=============
NAME
----
@@ -12,19 +12,19 @@ SYNOPSIS
DESCRIPTION
-----------
-This 'perf bench' command is general framework for benchmark suites.
+This 'perf bench' command is a general framework for benchmark suites.
COMMON OPTIONS
--------------
-f::
--format=::
Specify format style.
-Current available format styles are,
+Currently available format styles are:
'default'::
Default style. This is mainly for human reading.
---------------------
-% perf bench sched pipe # with no style specify
+% perf bench sched pipe # with no style specified
(executing 1000000 pipe operations between two tasks)
Total time:5.855 sec
5.855061 usecs/op
@@ -45,14 +45,26 @@ SUBSYSTEM
'sched'::
Scheduler and IPC mechanisms.
+'mem'::
+ Memory access performance.
+
+'numa'::
+ NUMA scheduling and MM benchmarks.
+
+'futex'::
+ Futex stressing benchmarks.
+
+'all'::
+ All benchmark subsystems.
+
SUITES FOR 'sched'
~~~~~~~~~~~~~~~~~~
*messaging*::
Suite for evaluating performance of scheduler and IPC mechanisms.
Based on hackbench by Rusty Russell.
-Options of *pipe*
-^^^^^^^^^^^^^^^^^
+Options of *messaging*
+^^^^^^^^^^^^^^^^^^^^^^
-p::
--pipe::
Use pipe() instead of socketpair()
@@ -79,7 +91,7 @@ options (20 sender and receiver processes per group)
Total time:0.308 sec
-% perf bench sched messaging -t -g 20 # be multi-thread,with 20 groups
+% perf bench sched messaging -t -g 20 # be multi-thread, with 20 groups
(20 sender and receiver threads per group)
(20 groups == 800 threads run)
@@ -115,6 +127,88 @@ Example of *pipe*
59004 ops/sec
---------------------
+SUITES FOR 'mem'
+~~~~~~~~~~~~~~~~
+*memcpy*::
+Suite for evaluating performance of simple memory copy in various ways.
+
+Options of *memcpy*
+^^^^^^^^^^^^^^^^^^^
+-l::
+--length::
+Specify length of memory to copy (default: 1MB).
+Available units are B, KB, MB, GB and TB (case insensitive).
+
+-r::
+--routine::
+Specify routine to copy (default: default).
+Available routines depend on the architecture.
+On x86-64, x86-64-unrolled, x86-64-movsq and x86-64-movsb are supported.
+
+-i::
+--iterations::
+Repeat memcpy invocation this number of times.
+
+-c::
+--cycle::
+Use perf's cpu-cycles event instead of gettimeofday syscall.
+
+-o::
+--only-prefault::
+Show only the result with page faults before memcpy.
+
+-n::
+--no-prefault::
+Show only the result without page faults before memcpy.
+
+*memset*::
+Suite for evaluating performance of simple memory set in various ways.
+
+Options of *memset*
+^^^^^^^^^^^^^^^^^^^
+-l::
+--length::
+Specify length of memory to set (default: 1MB).
+Available units are B, KB, MB, GB and TB (case insensitive).
+
+-r::
+--routine::
+Specify routine to set (default: default).
+Available routines depend on the architecture.
+On x86-64, x86-64-unrolled, x86-64-stosq and x86-64-stosb are supported.
+
+-i::
+--iterations::
+Repeat memset invocation this number of times.
+
+-c::
+--cycle::
+Use perf's cpu-cycles event instead of gettimeofday syscall.
+
+-o::
+--only-prefault::
+Show only the result with page faults before memset.
+
+-n::
+--no-prefault::
+Show only the result without page faults before memset.
+
+SUITES FOR 'numa'
+~~~~~~~~~~~~~~~~~
+*mem*::
+Suite for evaluating NUMA workloads.
+
+SUITES FOR 'futex'
+~~~~~~~~~~~~~~~~~~
+*hash*::
+Suite for evaluating hash tables.
+
+*wake*::
+Suite for evaluating wake calls.
+
+*requeue*::
+Suite for evaluating requeue calls.
+
SEE ALSO
--------
linkperf:perf[1]
diff --git a/tools/perf/Documentation/perf-buildid-cache.txt b/tools/perf/Documentation/perf-buildid-cache.txt
index 88bc3b51974..fd77d81ea74 100644
--- a/tools/perf/Documentation/perf-buildid-cache.txt
+++ b/tools/perf/Documentation/perf-buildid-cache.txt
@@ -8,26 +8,46 @@ perf-buildid-cache - Manage build-id cache.
SYNOPSIS
--------
[verse]
-'perf buildid-list <options>'
+'perf buildid-cache <options>'
DESCRIPTION
-----------
-This command manages the build-id cache. It can add and remove files to the
-cache. In the future it should as well purge older entries, set upper limits
-for the space used by the cache, etc.
+This command manages the build-id cache. It can add and remove files to/from
+the cache. In the future it should also be able to purge older entries, set
+upper limits for the space used by the cache, etc.
OPTIONS
-------
-a::
--add=::
Add specified file to the cache.
+-k::
+--kcore::
+	Add the specified kcore file to the cache. For the current host, that is
+	/proc/kcore, which requires root permissions to read. Be aware that
+	running 'perf buildid-cache' as root may update root's build-id cache,
+	not the user's. Use the -v option to see where the file is created.
+	Note that the copied file contains only code sections, not the whole core
+	image. Note also that the files "kallsyms" and "modules" must be in the
+ same directory and are also copied. All 3 files are created with read
+ permissions for root only. kcore will not be added if there is already a
+ kcore in the cache (with the same build-id) that has the same modules at
+ the same addresses. Use the -v option to see if a copy of kcore is
+ actually made.
-r::
--remove=::
- Remove specified file to the cache.
+ Remove specified file from the cache.
+-M::
+--missing=::
+ List missing build ids in the cache for the specified file.
+-u::
+--update::
+	Update the specified file in the cache. It can be used to update the
+	kallsyms kernel DSO to vmlinux in order to support annotation.
-v::
--verbose::
Be more verbose.
SEE ALSO
--------
-linkperf:perf-record[1], linkperf:perf-report[1]
+linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-buildid-list[1]
diff --git a/tools/perf/Documentation/perf-buildid-list.txt b/tools/perf/Documentation/perf-buildid-list.txt
index 01b642c0bf8..25c52efcc7f 100644
--- a/tools/perf/Documentation/perf-buildid-list.txt
+++ b/tools/perf/Documentation/perf-buildid-list.txt
@@ -16,14 +16,23 @@ This command displays the buildids found in a perf.data file, so that other
tools can be used to fetch packages with matching symbol tables for use by
perf report.
+It can also be used to show the build id of the running kernel, or that of an
+ELF file, using -i/--input.
+
OPTIONS
-------
+-H::
+--with-hits::
+ Show only DSOs with hits.
-i::
--input=::
- Input file name. (default: perf.data)
+ Input file name. (default: perf.data unless stdin is a fifo)
-f::
--force::
Don't do ownership validation.
+-k::
+--kernel::
+ Show running kernel build id.
-v::
--verbose::
Be more verbose.
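+
+For example (a sketch), the running kernel's build id, or the build ids with
+hits in the default perf.data file, can be shown with:
+
+  perf buildid-list -k
+  perf buildid-list --with-hits
+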
diff --git a/tools/perf/Documentation/perf-diff.txt b/tools/perf/Documentation/perf-diff.txt
index 8974e208cba..b3b8abae62b 100644
--- a/tools/perf/Documentation/perf-diff.txt
+++ b/tools/perf/Documentation/perf-diff.txt
@@ -1,48 +1,63 @@
perf-diff(1)
-==============
+============
NAME
----
-perf-diff - Read two perf.data files and display the differential profile
+perf-diff - Read perf.data files and display the differential profile
SYNOPSIS
--------
[verse]
-'perf diff' [oldfile] [newfile]
+'perf diff' [baseline file] [data file1] [[data file2] ... ]
DESCRIPTION
-----------
-This command displays the performance difference amongst two perf.data files
-captured via perf record.
+This command displays the performance difference amongst two or more perf.data
+files captured via perf record.
If no parameters are passed it will assume perf.data.old and perf.data.
+The differential profile is displayed only for events matching both
+specified perf.data files.
+
OPTIONS
-------
+-D::
+--dump-raw-trace::
+ Dump raw trace in ASCII.
+
+-m::
+--modules::
+	Load module symbols. WARNING: use only with -k and a LIVE kernel
+
-d::
--dsos=::
Only consider symbols in these dsos. CSV that understands
- file://filename entries.
+ file://filename entries. This option will affect the percentage
+ of the Baseline/Delta column. See --percentage for more info.
-C::
--comms=::
Only consider symbols in these comms. CSV that understands
- file://filename entries.
+ file://filename entries. This option will affect the percentage
+ of the Baseline/Delta column. See --percentage for more info.
-S::
--symbols=::
Only consider these symbols. CSV that understands
- file://filename entries.
+ file://filename entries. This option will affect the percentage
+ of the Baseline/Delta column. See --percentage for more info.
-s::
--sort=::
- Sort by key(s): pid, comm, dso, symbol.
+ Sort by key(s): pid, comm, dso, symbol, cpu, parent, srcline.
+ Please see description of --sort in the perf-report man page.
-t::
--field-separator=::
Use a special separator character and don't pad with spaces, replacing
- all occurances of this separator in symbol names (and other output)
+ all occurrences of this separator in symbol names (and other output)
with a '.' character, that thus it's the only non valid separator.
-v::
@@ -50,6 +65,142 @@ OPTIONS
Be verbose, for instance, show the raw counts in addition to the
diff.
+-f::
+--force::
+ Don't complain, do it.
+
+--symfs=<directory>::
+ Look for files with symbols relative to this directory.
+
+-b::
+--baseline-only::
+ Show only items with match in baseline.
+
+-c::
+--compute::
+ Differential computation selection - delta,ratio,wdiff (default is delta).
+ See COMPARISON METHODS section for more info.
+
+-p::
+--period::
+ Show period values for both compared hist entries.
+
+-F::
+--formula::
+ Show formula for given computation.
+
+-o::
+--order::
+ Specify compute sorting column number.
+
+--percentage::
+ Determine how to display the overhead percentage of filtered entries.
+ Filters can be applied by --comms, --dsos and/or --symbols options.
+
+ "relative" means it's relative to filtered entries only so that the
+ sum of shown entries will be always 100%. "absolute" means it retains
+ the original value before and after the filter is applied.
+
+COMPARISON
+----------
+The comparison is governed by the baseline file. The baseline perf.data
+file is iterated for samples. All other perf.data files specified on
+the command line are searched for the baseline sample pair. If the pair
+is found, the specified computation is made and the result is displayed.
+
+All samples from non-baseline perf.data files that do not match any
+baseline entry are displayed with empty space within the baseline column
+and possible computation results (delta) in their related column.
+
+Example files and their samples:
+- file A with samples f1, f2, f3, f4, f6
+- file B with samples f2, f4, f5
+- file C with samples f1, f2, f5
+
+Example output:
+ x - computation takes place for pair
+ b - baseline sample percentage
+
+- perf diff A B C
+
+ baseline/A compute/B compute/C samples
+ ---------------------------------------
+ b x f1
+ b x x f2
+ b f3
+ b x f4
+ b f6
+ x x f5
+
+- perf diff B A C
+
+ baseline/B compute/A compute/C samples
+ ---------------------------------------
+ b x x f2
+ b x f4
+ b x f5
+ x x f1
+ x f3
+ x f6
+
+- perf diff C B A
+
+ baseline/C compute/B compute/A samples
+ ---------------------------------------
+ b x f1
+ b x x f2
+ b x f5
+ x f3
+ x x f4
+ x f6
+
+COMPARISON METHODS
+------------------
+delta
+~~~~~
+If specified, the 'Delta' column is displayed with value 'd' computed as:
+
+ d = A->period_percent - B->period_percent
+
+with:
+ - A/B being matching hist entry from data/baseline file specified
+ (or perf.data/perf.data.old) respectively.
+
+ - period_percent being the % of the hist entry period value within
+ single data file
+
+ - with filtering by -C, -d and/or -S, period_percent might be changed
+ relative to how entries are filtered. Use --percentage=absolute to
+ prevent such fluctuation.
+
+ratio
+~~~~~
+If specified, the 'Ratio' column is displayed with value 'r' computed as:
+
+ r = A->period / B->period
+
+with:
+ - A/B being matching hist entry from data/baseline file specified
+ (or perf.data/perf.data.old) respectively.
+
+ - period being the hist entry period value
+
+wdiff:WEIGHT-B,WEIGHT-A
+~~~~~~~~~~~~~~~~~~~~~~~
+If specified, the 'Weighted diff' column is displayed with value 'd' computed as:
+
+ d = B->period * WEIGHT-A - A->period * WEIGHT-B
+
+ - A/B being matching hist entry from data/baseline file specified
+ (or perf.data/perf.data.old) respectively.
+
+ - period being the hist entry period value
+
+ - WEIGHT-A/WEIGHT-B being user supplied weights in the '-c' option
+   behind the ':' separator, like '-c wdiff:1,2'.
+   - WEIGHT-A being the weight of the data file
+   - WEIGHT-B being the weight of the baseline data file
+
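+As a sketch of selecting a comparison method, a weighted diff with weights
+1 and 2 could be requested as:
+
+  perf diff -c wdiff:1,2 perf.data.old perf.data
+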
SEE ALSO
--------
-linkperf:perf-record[1]
+linkperf:perf-record[1], linkperf:perf-report[1]
diff --git a/tools/perf/Documentation/perf-evlist.txt b/tools/perf/Documentation/perf-evlist.txt
new file mode 100644
index 00000000000..1ceb3700ffb
--- /dev/null
+++ b/tools/perf/Documentation/perf-evlist.txt
@@ -0,0 +1,38 @@
+perf-evlist(1)
+==============
+
+NAME
+----
+perf-evlist - List the event names in a perf.data file
+
+SYNOPSIS
+--------
+[verse]
+'perf evlist <options>'
+
+DESCRIPTION
+-----------
+This command displays the names of events sampled in a perf.data file.
+
+OPTIONS
+-------
+-i::
+--input=::
+ Input file name. (default: perf.data unless stdin is a fifo)
+
+-F::
+--freq=::
+ Show just the sample frequency used for each event.
+
+-v::
+--verbose=::
+ Show all fields.
+
+-g::
+--group::
+ Show event group information.
+
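+For example (a sketch), the events and their group information in the
+default perf.data file can be listed with:
+
+  perf evlist -g
+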
+SEE ALSO
+--------
+linkperf:perf-record[1], linkperf:perf-list[1],
+linkperf:perf-report[1]
diff --git a/tools/perf/Documentation/perf-inject.txt b/tools/perf/Documentation/perf-inject.txt
new file mode 100644
index 00000000000..a00a34276c5
--- /dev/null
+++ b/tools/perf/Documentation/perf-inject.txt
@@ -0,0 +1,46 @@
+perf-inject(1)
+==============
+
+NAME
+----
+perf-inject - Filter to augment the events stream with additional information
+
+SYNOPSIS
+--------
+[verse]
+'perf inject <options>'
+
+DESCRIPTION
+-----------
+perf-inject reads a perf-record event stream and repipes it to stdout. At any
+point the processing code can inject other events into the event stream - in
+this case build-ids (-b option) are read and injected as needed into the event
+stream.
+
+Build-ids are just the first user of perf-inject - potentially anything that
+needs userspace processing to augment the events stream with additional
+information could make use of this facility.
+
+OPTIONS
+-------
+-b::
+--build-ids=::
+ Inject build-ids into the output stream
+-v::
+--verbose::
+ Be more verbose.
+-i::
+--input=::
+ Input file name. (default: stdin)
+-o::
+--output=::
+ Output file name. (default: stdout)
+-s::
+--sched-stat::
+	Merge sched_stat and sched_switch to determine where and for how long
+	tasks slept. sched_switch contains a callchain of where a task slept, and
+	sched_stat contains the timeslice of how long a task slept.
+
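+As a sketch of the intended pipe usage (./workload here stands for any
+command), build-ids can be injected on the fly with:
+
+  perf record -o - ./workload | perf inject -b | perf report -i -
+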
+SEE ALSO
+--------
+linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-archive[1]
diff --git a/tools/perf/Documentation/perf-kmem.txt b/tools/perf/Documentation/perf-kmem.txt
index eac4d852e7c..7c8fbbf3f61 100644
--- a/tools/perf/Documentation/perf-kmem.txt
+++ b/tools/perf/Documentation/perf-kmem.txt
@@ -1,5 +1,5 @@
perf-kmem(1)
-==============
+============
NAME
----
@@ -23,7 +23,7 @@ OPTIONS
-------
-i <file>::
--input=<file>::
- Select the input file (default: perf.data)
+ Select the input file (default: perf.data unless stdin is a fifo)
--caller::
Show per-callsite statistics
diff --git a/tools/perf/Documentation/perf-kvm.txt b/tools/perf/Documentation/perf-kvm.txt
new file mode 100644
index 00000000000..52276a6d2b7
--- /dev/null
+++ b/tools/perf/Documentation/perf-kvm.txt
@@ -0,0 +1,156 @@
+perf-kvm(1)
+===========
+
+NAME
+----
+perf-kvm - Tool to trace/measure kvm guest os
+
+SYNOPSIS
+--------
+[verse]
+'perf kvm' [--host] [--guest] [--guestmount=<path>
+ [--guestkallsyms=<path> --guestmodules=<path> | --guestvmlinux=<path>]]
+ {top|record|report|diff|buildid-list} [<options>]
+'perf kvm' [--host] [--guest] [--guestkallsyms=<path> --guestmodules=<path>
+ | --guestvmlinux=<path>] {top|record|report|diff|buildid-list|stat} [<options>]
+'perf kvm stat' [record|report|live] [<options>]
+
+DESCRIPTION
+-----------
+There are a couple of variants of perf kvm:
+
+  'perf kvm [options] top <command>' to generate and display
+ a performance counter profile of guest os in realtime
+ of an arbitrary workload.
+
+ 'perf kvm record <command>' to record the performance counter profile
+ of an arbitrary workload and save it into a perf data file. We set the
+ default behavior of perf kvm as --guest, so if neither --host nor --guest
+ is input, the perf data file name is perf.data.guest. If --host is input,
+ the perf data file name is perf.data.kvm. If you want to record data into
+  perf.data.host, please input --host --no-guest. The behaviors are as
+  follows:
+ Default('') -> perf.data.guest
+ --host -> perf.data.kvm
+ --guest -> perf.data.guest
+ --host --guest -> perf.data.kvm
+ --host --no-guest -> perf.data.host
+
+ 'perf kvm report' to display the performance counter profile information
+ recorded via perf kvm record.
+
+  'perf kvm diff' to display the performance difference amongst two perf.data
+ files captured via perf record.
+
+ 'perf kvm buildid-list' to display the buildids found in a perf data file,
+ so that other tools can be used to fetch packages with matching symbol tables
+  for use by perf report. As the buildid is read from /sys/kernel/notes in the
+  guest os, if you want to list the buildid for a guest, please make sure your
+  perf data file was captured with --guestmount in perf kvm record.
+
+ 'perf kvm stat <command>' to run a command and gather performance counter
+ statistics.
+  In particular, 'perf kvm stat record/report' generates a statistical analysis
+  of KVM events. Currently, vmexit, mmio and ioport events are supported.
+  'perf kvm stat record <command>' records the kvm events generated between the
+  start and the end of <command>, and produces a file which contains the tracing
+  results of those kvm events.
+
+ 'perf kvm stat report' reports statistical data which includes events
+ handled time, samples, and so on.
+
+ 'perf kvm stat live' reports statistical data in a live mode (similar to
+ record + report but with statistical data updated live at a given display
+ rate).
+
+OPTIONS
+-------
+-i::
+--input=<path>::
+ Input file name.
+-o::
+--output=<path>::
+ Output file name.
+--host::
+ Collect host side performance profile.
+--guest::
+ Collect guest side performance profile.
+--guestmount=<path>::
+	Guest os root file system mount directory. Users mount guest os
+	root directories under <path> using a specific filesystem access method,
+	typically sshfs. For example, start 2 guest oses. One's pid is 8888
+	and the other's is 9999.
+ #mkdir ~/guestmount; cd ~/guestmount
+ #sshfs -o allow_other,direct_io -p 5551 localhost:/ 8888/
+ #sshfs -o allow_other,direct_io -p 5552 localhost:/ 9999/
+ #perf kvm --host --guest --guestmount=~/guestmount top
+--guestkallsyms=<path>::
+	Guest os /proc/kallsyms file copy. 'perf kvm' reads it to get guest
+ kernel symbols. Users copy it out from guest os.
+--guestmodules=<path>::
+	Guest os /proc/modules file copy. 'perf kvm' reads it to get guest
+ kernel module information. Users copy it out from guest os.
+--guestvmlinux=<path>::
+ Guest os kernel vmlinux.
+-v::
+--verbose::
+ Be more verbose (show counter open errors, etc).
+
+STAT REPORT OPTIONS
+-------------------
+--vcpu=<value>::
+	Analyze events which occur on this vcpu. (default: all vcpus)
+
+--event=<value>::
+ event to be analyzed. Possible values: vmexit, mmio, ioport.
+ (default: vmexit)
+-k::
+--key=<value>::
+ Sorting key. Possible values: sample (default, sort by samples
+ number), time (sort by average time).
+-p::
+--pid=::
+ Analyze events only for given process ID(s) (comma separated list).
+
+STAT LIVE OPTIONS
+-----------------
+-d::
+--display::
+ Time in seconds between display updates
+
+-m::
+--mmap-pages=::
+ Number of mmap data pages (must be a power of two) or size
+ specification with appended unit character - B/K/M/G. The
+	size is rounded up to the nearest power-of-two number of pages.
+
+-a::
+--all-cpus::
+ System-wide collection from all CPUs.
+
+-p::
+--pid=::
+ Analyze events only for given process ID(s) (comma separated list).
+
+--vcpu=<value>::
+	Analyze events which occur on this vcpu. (default: all vcpus)
+
+
+--event=<value>::
+ event to be analyzed. Possible values: vmexit, mmio, ioport.
+ (default: vmexit)
+
+-k::
+--key=<value>::
+ Sorting key. Possible values: sample (default, sort by samples
+ number), time (sort by average time).
+
+--duration=<value>::
+ Show events other than HLT that take longer than duration usecs.
+
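+For example, a possible record/report sequence for kvm events (a sketch;
+'sleep 10' is just an arbitrary measurement window) is:
+
+  perf kvm stat record sleep 10
+  perf kvm stat report --event=vmexit --key=time
+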
+SEE ALSO
+--------
+linkperf:perf-top[1], linkperf:perf-record[1], linkperf:perf-report[1],
+linkperf:perf-diff[1], linkperf:perf-buildid-list[1],
+linkperf:perf-stat[1]
diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt
index 8290b942266..6fce6a62220 100644
--- a/tools/perf/Documentation/perf-list.txt
+++ b/tools/perf/Documentation/perf-list.txt
@@ -8,18 +8,115 @@ perf-list - List all symbolic event types
SYNOPSIS
--------
[verse]
-'perf list'
+'perf list' [hw|sw|cache|tracepoint|pmu|event_glob]
DESCRIPTION
-----------
This command displays the symbolic event types which can be selected in the
various perf commands with the -e option.
+[[EVENT_MODIFIERS]]
+EVENT MODIFIERS
+---------------
+
+Events can optionally have a modifier by appending a colon and one or
+more modifiers. Modifiers allow the user to restrict the events to be
+counted. The following modifiers exist:
+
+ u - user-space counting
+ k - kernel counting
+ h - hypervisor counting
+ G - guest counting (in KVM guests)
+ H - host counting (not in KVM guests)
+ p - precise level
+ S - read sample value (PERF_SAMPLE_READ)
+ D - pin the event to the PMU
+
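+For example (a sketch), cycles could be counted only in user space and
+instructions only in the kernel with:
+
+  perf stat -e cycles:u -e instructions:k sleep 1
+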
+The 'p' modifier can be used for specifying how precise the instruction
+address should be. The 'p' modifier can be specified multiple times:
+
+ 0 - SAMPLE_IP can have arbitrary skid
+ 1 - SAMPLE_IP must have constant skid
+ 2 - SAMPLE_IP requested to have 0 skid
+ 3 - SAMPLE_IP must have 0 skid
+
+For Intel systems precise event sampling is implemented with PEBS
+which supports up to precise-level 2.
+
+On AMD systems it is implemented using IBS (up to precise-level 2).
+The precise modifier works with event types 0x76 (cpu-cycles, CPU
+clocks not halted) and 0xC1 (micro-ops retired). Both events map to
+IBS execution sampling (IBS op) with the IBS Op Counter Control bit
+(IbsOpCntCtl) set respectively (see AMD64 Architecture Programmer’s
+Manual Volume 2: System Programming, 13.3 Instruction-Based
+Sampling). Examples to use IBS:
+
+ perf record -a -e cpu-cycles:p ... # use ibs op counting cycles
+ perf record -a -e r076:p ... # same as -e cpu-cycles:p
+ perf record -a -e r0C1:p ... # use ibs op counting micro-ops
+
+RAW HARDWARE EVENT DESCRIPTOR
+-----------------------------
+Even when an event is not available in a symbolic form within perf right now,
+it can be encoded in a per processor specific way.
+
+For instance, for x86 CPUs NNN represents the raw register encoding with the
+layout of IA32_PERFEVTSELx MSRs (see [Intel® 64 and IA-32 Architectures Software Developer's Manual Volume 3B: System Programming Guide] Figure 30-1 Layout
+of IA32_PERFEVTSELx MSRs) or AMD's PerfEvtSeln (see [AMD64 Architecture Programmer’s Manual Volume 2: System Programming], Page 344,
+Figure 13-7 Performance Event-Select Register (PerfEvtSeln)).
+
+Note: Only the following bit fields can be set in x86 counter
+registers: event, umask, edge, inv, cmask. In particular, guest/host only and
+OS/user mode flags must be set up using <<EVENT_MODIFIERS, EVENT
+MODIFIERS>>.
+
+Example:
+
+If the Intel docs for a QM720 Core i7 describe an event as:
+
+ Event Umask Event Mask
+ Num. Value Mnemonic Description Comment
+
+ A8H 01H LSD.UOPS Counts the number of micro-ops Use cmask=1 and
+ delivered by loop stream detector invert to count
+ cycles
+
+raw encoding of 0x1A8 can be used:
+
+ perf stat -e r1a8 -a sleep 1
+ perf record -e r1a8 ...
+
+You should refer to the processor specific documentation for getting these
+details. Some of them are referenced in the SEE ALSO section below.
+
OPTIONS
-------
-None
+
+Without options all known events will be listed.
+
+To limit the list use:
+
+. 'hw' or 'hardware' to list hardware events such as cache-misses, etc.
+
+. 'sw' or 'software' to list software events such as context switches, etc.
+
+. 'cache' or 'hwcache' to list hardware cache events such as L1-dcache-loads, etc.
+
+. 'tracepoint' to list all tracepoint events, alternatively use
+ 'subsys_glob:event_glob' to filter by tracepoint subsystems such as sched,
+ block, etc.
+
+. 'pmu' to print the kernel supplied PMU events.
+
+. If none of the above is matched, it will apply the supplied glob to all
+ events, printing the ones that match.
+
+One or more types can be used at the same time, listing the events for the
+types specified.
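+
+For example (a sketch), the listing can be limited to software events, or to
+the sched tracepoint subsystem, with:
+
+  perf list sw
+  perf list 'sched:*'
+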
SEE ALSO
--------
linkperf:perf-stat[1], linkperf:perf-top[1],
-linkperf:perf-record[1]
+linkperf:perf-record[1],
+http://www.intel.com/Assets/PDF/manual/253669.pdf[Intel® 64 and IA-32 Architectures Software Developer's Manual Volume 3B: System Programming Guide],
+http://support.amd.com/us/Processor_TechDocs/24593_APM_v2.pdf[AMD64 Architecture Programmer’s Manual Volume 2: System Programming]
diff --git a/tools/perf/Documentation/perf-lock.txt b/tools/perf/Documentation/perf-lock.txt
index b317102138c..ab25be28c9d 100644
--- a/tools/perf/Documentation/perf-lock.txt
+++ b/tools/perf/Documentation/perf-lock.txt
@@ -8,7 +8,7 @@ perf-lock - Analyze lock events
SYNOPSIS
--------
[verse]
-'perf lock' {record|report|trace}
+'perf lock' {record|report|script|info}
DESCRIPTION
-----------
@@ -20,10 +20,47 @@ and statistics with this 'perf lock' command.
produces the file "perf.data" which contains tracing
results of lock events.
- 'perf lock trace' shows raw lock events.
-
'perf lock report' reports statistical data.
+ 'perf lock script' shows raw lock events.
+
+ 'perf lock info' shows metadata like threads or addresses
+ of lock instances.
+
+COMMON OPTIONS
+--------------
+
+-i::
+--input=<file>::
+ Input file name. (default: perf.data unless stdin is a fifo)
+
+-v::
+--verbose::
+ Be more verbose (show symbol address, etc).
+
+-D::
+--dump-raw-trace::
+ Dump raw trace in ASCII.
+
+REPORT OPTIONS
+--------------
+
+-k::
+--key=<value>::
+ Sorting key. Possible values: acquired (default), contended,
+ avg_wait, wait_total, wait_max, wait_min.
+
+INFO OPTIONS
+------------
+
+-t::
+--threads::
+ dump thread list in perf.data
+
+-m::
+--map::
+ dump map of lock instances (address:name table)
+
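+A typical session could therefore look like this (a sketch; 'sleep 1' is only
+a placeholder workload, and lock events must be available in the kernel):
+
+  perf lock record sleep 1
+  perf lock report -k wait_total
+  perf lock info -m
+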
SEE ALSO
--------
linkperf:perf[1]
diff --git a/tools/perf/Documentation/perf-mem.txt b/tools/perf/Documentation/perf-mem.txt
new file mode 100644
index 00000000000..1d78a4064da
--- /dev/null
+++ b/tools/perf/Documentation/perf-mem.txt
@@ -0,0 +1,52 @@
+perf-mem(1)
+===========
+
+NAME
+----
+perf-mem - Profile memory accesses
+
+SYNOPSIS
+--------
+[verse]
+'perf mem' [<options>] (record [<command>] | report)
+
+DESCRIPTION
+-----------
+"perf mem -t <TYPE> record" runs a command and gathers memory operation data
+from it, into perf.data. Perf record options are accepted and are passed through.
+
+"perf mem -t <TYPE> report" displays the result. It invokes perf report with the
+right set of options to display a memory access profile.
+
+Note that on Intel systems the memory latency reported is the use-latency,
+not the pure load (or store) latency. Use latency includes any pipeline
+queueing delays in addition to the memory subsystem latency.
+
+OPTIONS
+-------
+<command>...::
+ Any command you can specify in a shell.
+
+-t::
+--type=::
+ Select the memory operation type: load or store (default: load)
+
+-D::
+--dump-raw-samples=::
+ Dump the raw decoded samples on the screen in a format that is easy to parse with
+ one sample per line.
+
+-x::
+--field-separator::
+	Specify the field separator used when dumping raw samples (-D option). By
+	default, the separator is the space character.
+
+-C::
+--cpu-list::
+ Restrict dump of raw samples to those provided via this option. Note that the same
+ option can be passed in record mode. It will be interpreted the same way as perf
+ record.
+
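+Putting the two modes together, a possible load-profiling session could be
+(a sketch; ./workload stands for any command):
+
+  perf mem -t load record ./workload
+  perf mem -t load report
+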
+SEE ALSO
+--------
+linkperf:perf-record[1], linkperf:perf-report[1]
diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt
index 34202b1be0b..1513935c399 100644
--- a/tools/perf/Documentation/perf-probe.txt
+++ b/tools/perf/Documentation/perf-probe.txt
@@ -16,7 +16,9 @@ or
or
'perf probe' --list
or
-'perf probe' --line='FUNC[:RLN[+NUM|:RLN2]]|SRC:ALN[+NUM|:ALN2]'
+'perf probe' [options] --line='LINE'
+or
+'perf probe' [options] --vars='PROBEPOINT'
DESCRIPTION
-----------
@@ -31,6 +33,17 @@ OPTIONS
--vmlinux=PATH::
Specify vmlinux path which has debuginfo (Dwarf binary).
+-m::
+--module=MODNAME|PATH::
+ Specify module name in which perf-probe searches probe points
+ or lines. If a path of module file is passed, perf-probe
+	or lines. If a path to a module file is passed, perf-probe
+	treats it as an offline module (this means you can add a probe on
+
+-s::
+--source=PATH::
+ Specify path to kernel source.
+
-v::
--verbose::
Be more verbose (show parsed arguments, etc).
@@ -53,10 +66,48 @@ OPTIONS
Show source code lines which can be probed. This needs an argument
which specifies a range of the source code. (see LINE SYNTAX for detail)
+-V::
+--vars=::
+ Show available local variables at given probe point. The argument
+	syntax is the same as PROBE SYNTAX, but with NO ARGs.
+
+--externs::
+	(Only for --vars) Show externally defined variables in addition to local
+ variables.
+
+-F::
+--funcs::
+ Show available functions in given module or kernel. With -x/--exec,
+ can also list functions in a user space executable / shared library.
+
+--filter=FILTER::
+	(Only for --vars and --funcs) Set a filter. FILTER is a combination of glob
+	patterns; see FILTER PATTERN for details.
+ Default FILTER is "!__k???tab_* & !__crc_*" for --vars, and "!_*"
+ for --funcs.
+ If several filters are specified, only the last filter is used.
+
-f::
--force::
Forcibly add events with existing name.
+-n::
+--dry-run::
+	Dry run. With this option, --add and --del don't execute the actual
+	adding and removal operations.
+
+--max-probes::
+ Set the maximum number of probe points for an event. Default is 128.
+
+-x::
+--exec=PATH::
+ Specify path to the executable or shared library file for user
+ space tracing. Can also be used with --funcs option.
+
+In the absence of the -m/-x options, perf probe checks if the first argument after
+the options is an absolute path name. If it's an absolute path, perf probe
+uses it as a target module/target user space binary to probe.
+
PROBE SYNTAX
------------
Probe points are defined by following syntax.
@@ -74,19 +125,31 @@ Probe points are defined by following syntax.
 'EVENT' specifies the name of the new event; if omitted, it will be set to the name of the probed function. Currently, the event group name is set as 'probe'.
'FUNC' specifies a probed function name, and it may have one of the following options; '+OFFS' is the offset from function entry address in bytes, ':RLN' is the relative-line number from function entry line, and '%return' means that it probes function return. And ';PTN' means lazy matching pattern (see LAZY MATCHING). Note that ';PTN' must be the end of the probe point definition. In addition, '@SRC' specifies a source file which has that function.
It is also possible to specify a probe point by the source line number or lazy matching by using 'SRC:ALN' or 'SRC;PTN' syntax, where 'SRC' is the source file path, ':ALN' is the line number and ';PTN' is the lazy matching pattern.
-'ARG' specifies the arguments of this probe point. You can use the name of local variable, or kprobe-tracer argument format (e.g. $retval, %ax, etc).
+'ARG' specifies the arguments of this probe point, (see PROBE ARGUMENT).
+
+PROBE ARGUMENT
+--------------
+Each probe argument follows below syntax.
+
+ [NAME=]LOCALVAR|$retval|%REG|@SYMBOL[:TYPE]
+
+'NAME' specifies the name of this argument (optional). You can use the name of local variable, local data structure member (e.g. var->field, var.field2), local array with fixed index (e.g. array[1], var->array[0], var->pointer[2]), or kprobe-tracer argument format (e.g. $retval, %ax, etc). Note that the name of this argument will be set as the last member name if you specify a local data structure member (e.g. field2 for 'var->field1.field2'.)
+'TYPE' casts the type of this argument (optional). If omitted, perf probe automatically sets the type based on debuginfo. You can specify the 'string' type only for a local variable or structure member which is an array of, or a pointer to, 'char' or 'unsigned char' type.
+
+On x86 systems %REG is always the short form of the register: for example %AX. %RAX or %EAX is not valid.
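+
+As a sketch of the argument syntax (the variable names here are only an
+illustration and must exist in the probed function's debuginfo):
+
+ ./perf probe 'do_sys_open fname=filename:string flags'
+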
LINE SYNTAX
-----------
-Line range is descripted by following syntax.
+Line range is described by following syntax.
- "FUNC[:RLN[+NUM|:RLN2]]|SRC:ALN[+NUM|:ALN2]"
+ "FUNC[@SRC][:RLN[+NUM|-RLN2]]|SRC[:ALN[+NUM|-ALN2]]"
FUNC specifies the function name of showing lines. 'RLN' is the start line
number from function entry line, and 'RLN2' is the end line number. As same as
probe syntax, 'SRC' means the source file path, 'ALN' is start line number,
and 'ALN2' is end line number in the file. It is also possible to specify how
-many lines to show by using 'NUM'.
+many lines to show by using 'NUM'. Moreover, the 'FUNC@SRC' combination is useful
+for searching for a specific function when several functions share the same name.
So, "source.c:100-120" shows lines between 100th to l20th in source.c file. And "func:10+20" shows 20 lines from 10th line of func function.
LAZY MATCHING
@@ -98,6 +161,14 @@ e.g.
This provides some sort of flexibility and robustness to probe point definitions against minor code changes. For example, actual 10th line of schedule() can be moved easily by modifying schedule(), but the same line matching 'rq=cpu_rq*' may still exist in the function.)
+FILTER PATTERN
+--------------
+ The filter pattern is a glob matching pattern(s) to filter variables.
+ In addition, you can use "!" to specify a filter-out rule. You can also give several rules combined with "&" or "|", and fold those rules into one rule by using "(" ")".
+
+e.g.
+ With --filter "foo* | bar*", perf probe -V shows variables which start with "foo" or "bar".
+ With --filter "!foo* & *bar", perf probe -V shows variables which don't start with "foo" and end with "bar", like "fizzbar". But "foobar" is filtered out.
EXAMPLES
--------
@@ -123,6 +194,13 @@ Delete all probes on schedule().
./perf probe --del='schedule*'
+Add probes at zfree() function on /bin/zsh
+
+ ./perf probe -x /bin/zsh zfree or ./perf probe /bin/zsh zfree
+
+Add probes at malloc() function on libc
+
+ ./perf probe -x /lib/libc.so.6 malloc or ./perf probe /lib/libc.so.6 malloc
SEE ALSO
--------
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index fc46c0b40f6..d460049cae8 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -39,26 +39,37 @@ OPTIONS
be passed as follows: '\mem:addr[:[r][w][x]]'.
If you want to profile read-write accesses in 0x1000, just set
'mem:0x1000:rw'.
+
+--filter=<filter>::
+ Event filter.
+
-a::
- System-wide collection.
+--all-cpus::
+ System-wide collection from all CPUs.
-l::
Scale counter values.
-p::
--pid=::
- Record events on existing pid.
+ Record events on existing process ID (comma separated list).
+
+-t::
+--tid=::
+ Record events on existing thread ID (comma separated list).
+ This option also disables inheritance by default. Enable it by adding
+ --inherit.
+
+-u::
+--uid=::
+ Record events in threads owned by uid. Name or number.
-r::
--realtime=::
Collect data with this RT SCHED_FIFO priority.
--A::
---append::
- Append to the output file to do incremental profiling.
--f::
---force::
- Overwrite existing data file.
+--no-buffering::
+ Collect data without buffering.
-c::
--count=::
@@ -69,19 +80,37 @@ OPTIONS
Output file name.
-i::
---inherit::
- Child tasks inherit counters.
+--no-inherit::
+ Child tasks do not inherit counters.
-F::
--freq=::
Profile at this frequency.
-m::
--mmap-pages=::
- Number of mmap data pages.
+ Number of mmap data pages (must be a power of two) or size
+ specification with appended unit character - B/K/M/G. The
+	size is rounded up to the nearest power-of-two number of pages.
-g::
+ Enables call-graph (stack chain/backtrace) recording.
+
--call-graph::
- Do call-graph (stack chain/backtrace) recording.
+ Setup and enable call-graph (stack chain/backtrace) recording,
+ implies -g.
+
+ Allows specifying "fp" (frame pointer) or "dwarf"
+ (DWARF's CFI - Call Frame Information) as the method to collect
+ the information used to show the call graphs.
+
+	On systems where binaries are built with gcc
+	-fomit-frame-pointer, the "fp" method will produce bogus
+	call graphs; "dwarf", if available (perf tools linked to
+	the libunwind library), should be used instead.
+
+-q::
+--quiet::
+ Don't print any message, useful for scripting.
-v::
--verbose::
@@ -95,13 +124,95 @@ OPTIONS
--data::
Sample addresses.
+-T::
+--timestamp::
+ Sample timestamps. Use it with 'perf report -D' to see the timestamps,
+ for instance.
+
-n::
--no-samples::
Don't sample.
-R::
--raw-samples::
-Collect raw sample records from all opened counters (typically for tracepoint counters).
+Collect raw sample records from all opened counters (default for tracepoint counters).
+
+-C::
+--cpu::
+Collect samples only on the list of CPUs provided. Multiple CPUs can be provided as a
+comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2.
+In per-thread mode with inheritance mode on (default), samples are captured only when
+the thread executes on the designated CPUs. Default is to monitor all CPUs.
+
+-N::
+--no-buildid-cache::
+Do not update the buildid cache. This saves some overhead in situations
+where the information in the perf.data file (which includes buildids)
+is sufficient.
+
+-G name,...::
+--cgroup name,...::
+monitor only in the container (cgroup) called "name". This option is available only
+in per-cpu mode. The cgroup filesystem must be mounted. All threads belonging to
+container "name" are monitored when they run on the monitored CPUs. Multiple cgroups
+can be provided. Each cgroup is applied to the corresponding event, i.e., first cgroup
+to first event, second cgroup to second event and so on. It is possible to provide
+an empty cgroup (monitor all the time) using, e.g., -G foo,,bar. Cgroups must have
+corresponding events, i.e., they always refer to events defined earlier on the command
+line.
+
+-b::
+--branch-any::
+Enable taken branch stack sampling. Any type of taken branch may be sampled.
+This is a shortcut for --branch-filter any. See --branch-filter for more info.
+
+-j::
+--branch-filter::
+Enable taken branch stack sampling. Each sample captures a series of consecutive
+taken branches. The number of branches captured with each sample depends on the
+underlying hardware, the type of branches of interest, and the executed code.
+It is possible to select the types of branches captured by enabling filters. The
+following filters are defined:
+
+ - any: any type of branches
+ - any_call: any function call or system call
+ - any_ret: any function return or system call return
+ - ind_call: any indirect branch
+ - u: only when the branch target is at the user level
+ - k: only when the branch target is in the kernel
+ - hv: only when the target is at the hypervisor level
+ - in_tx: only when the target is in a hardware transaction
+ - no_tx: only when the target is not in a hardware transaction
+ - abort_tx: only when the target is a hardware transaction abort
+ - cond: conditional branches
+
++
+The option requires at least one branch type among any, any_call, any_ret, ind_call, cond.
+The privilege levels may be omitted, in which case, the privilege levels of the associated
+event are applied to the branch filter. Both kernel (k) and hypervisor (hv) privilege
+levels are subject to permissions. When sampling on multiple events, branch stack sampling
+is enabled for all the sampling events. The sampled branch type is the same for all events.
+The various filters must be specified as a comma separated list: --branch-filter any_ret,u,k
+Note that this feature may not be available on all processors.
+
+--weight::
+Enable weighted sampling. An additional weight is recorded per sample and can be
+displayed with the weight and local_weight sort keys. This currently works for TSX
+abort events and some memory events in precise mode on modern Intel CPUs.
+
+--transaction::
+Record transaction flags for transaction related events.
+
+--per-thread::
+Use per-thread mmaps. By default per-cpu mmaps are created. This option
+overrides that and uses per-thread mmaps. A side-effect of that is that
+inheritance is automatically disabled. --per-thread is ignored with a warning
+if combined with -a or -C options.
+
+-D::
+--delay=::
+After starting the program, wait msecs before measuring. This is useful to
+filter out the startup phase of the program, which is often very different.
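+
+Combining several of the options above, a DWARF call-graph recording pinned
+to two CPUs might look like this (a sketch; ./workload stands for any command):
+
+  perf record --call-graph dwarf -C 0,1 -o perf.data ./workload
+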
SEE ALSO
--------
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index abfabe9147a..d2b59af62bc 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -19,52 +19,292 @@ OPTIONS
-------
-i::
--input=::
- Input file name. (default: perf.data)
--d::
---dsos=::
- Only consider symbols in these dsos. CSV that understands
- file://filename entries.
+ Input file name. (default: perf.data unless stdin is a fifo)
+
+-v::
+--verbose::
+ Be more verbose. (show symbol address, etc)
+
-n::
--show-nr-samples::
Show the number of samples for each symbol
+
+--showcpuutilization::
+ Show sample percentage for different cpu modes.
+
-T::
--threads::
Show per-thread event counters
--C::
+-c::
--comms=::
Only consider symbols in these comms. CSV that understands
- file://filename entries.
+ file://filename entries. This option will affect the percentage of
+ the overhead column. See --percentage for more info.
+-d::
+--dsos=::
+ Only consider symbols in these dsos. CSV that understands
+ file://filename entries. This option will affect the percentage of
+ the overhead column. See --percentage for more info.
-S::
--symbols=::
Only consider these symbols. CSV that understands
- file://filename entries.
+ file://filename entries. This option will affect the percentage of
+ the overhead column. See --percentage for more info.
+
+--symbol-filter=::
+ Only show symbols that match (partially) with this filter.
+
+-U::
+--hide-unresolved::
+ Only display entries resolved to a symbol.
-s::
--sort=::
- Sort by key(s): pid, comm, dso, symbol, parent.
+ Sort histogram entries by given key(s) - multiple keys can be specified
+ in CSV format. Following sort keys are available:
+ pid, comm, dso, symbol, parent, cpu, srcline, weight, local_weight.
+
+ Each key has following meaning:
+
+ - comm: command (name) of the task which can be read via /proc/<pid>/comm
+ - pid: command and tid of the task
+ - dso: name of library or module executed at the time of sample
+ - symbol: name of function executed at the time of sample
+ - parent: name of function matched to the parent regex filter. Unmatched
+ entries are displayed as "[other]".
+ - cpu: cpu number the task ran at the time of sample
+ - srcline: filename and line number executed at the time of sample. The
+ DWARF debugging info must be provided.
+ - weight: Event specific weight, e.g. memory latency or transaction
+ abort cost. This is the global weight.
+ - local_weight: Local weight version of the weight above.
+ - transaction: Transaction abort flags.
+ - overhead: Overhead percentage of sample
+ - overhead_sys: Overhead percentage of sample running in system mode
+ - overhead_us: Overhead percentage of sample running in user mode
+ - overhead_guest_sys: Overhead percentage of sample running in system mode
+ on guest machine
+ - overhead_guest_us: Overhead percentage of sample running in user mode on
+ guest machine
+	- sample: Number of samples
+ - period: Raw number of event count of sample
+
+ By default, comm, dso and symbol keys are used.
+ (i.e. --sort comm,dso,symbol)
+
+ If --branch-stack option is used, following sort keys are also
+ available:
+ dso_from, dso_to, symbol_from, symbol_to, mispredict.
+
+ - dso_from: name of library or module branched from
+ - dso_to: name of library or module branched to
+ - symbol_from: name of function branched from
+ - symbol_to: name of function branched to
+ - mispredict: "N" for predicted branch, "Y" for mispredicted branch
+ - in_tx: branch in TSX transaction
+ - abort: TSX transaction abort.
+
+ And default sort keys are changed to comm, dso_from, symbol_from, dso_to
+ and symbol_to, see '--branch-stack'.
+
+-F::
+--fields=::
+ Specify output field - multiple keys can be specified in CSV format.
+ Following fields are available:
+ overhead, overhead_sys, overhead_us, overhead_children, sample and period.
+ Also it can contain any sort key(s).
+
+ By default, every sort keys not specified in -F will be appended
+ automatically.
+
+ If --mem-mode option is used, following sort keys are also available
+ (incompatible with --branch-stack):
+ symbol_daddr, dso_daddr, locked, tlb, mem, snoop, dcacheline.
+
+ - symbol_daddr: name of data symbol being executed on at the time of sample
+ - dso_daddr: name of library or module containing the data being executed
+ on at the time of sample
+ - locked: whether the bus was locked at the time of sample
+ - tlb: type of tlb access for the data at the time of sample
+ - mem: type of memory access for the data at the time of sample
+ - snoop: type of snoop (if any) for the data at the time of sample
+ - dcacheline: the cacheline the data address is on at the time of sample
+
+ And default sort keys are changed to local_weight, mem, sym, dso,
+ symbol_daddr, dso_daddr, snoop, tlb, locked, see '--mem-mode'.
+
+-p::
+--parent=<regex>::
+	A regex filter to identify the parent. The parent is a caller of this
+	function and is searched for through the callchain; thus it requires that
+	callchain information be recorded. The pattern is in the extended regex format and
+ defaults to "\^sys_|^do_page_fault", see '--sort parent'.
+
+-x::
+--exclude-other::
+ Only display entries with parent-match.
-w::
---field-width=::
+--column-widths=<width[,width...]>::
Force each column width to the provided list, for large terminal
readability.
-t::
--field-separator=::
-
Use a special separator character and don't pad with spaces, replacing
- all occurances of this separator in symbol names (and other output)
+ all occurrences of this separator in symbol names (and other output)
with a '.' character, that thus it's the only non valid separator.
--g [type,min]::
+-D::
+--dump-raw-trace::
+ Dump raw trace in ASCII.
+
+-g [type,min[,limit],order[,key]]::
--call-graph::
- Display callchains using type and min percent threshold.
+ Display call chains using type, min percent threshold, optional print
+ limit and order.
type can be either:
- - flat: single column, linear exposure of callchains.
+ - flat: single column, linear exposure of call chains.
- graph: use a graph tree, displaying absolute overhead rates.
- fractal: like graph, but displays relative rates. Each branch of
the tree is considered as a new profiled object. +
- Default: fractal,0.5.
+
+ order can be either:
+ - callee: callee based call graph.
+ - caller: inverted caller based call graph.
+
+ key can be:
+ - function: compare on functions
+ - address: compare on individual code addresses
+
+ Default: fractal,0.5,callee,function.
+
+--children::
+	Accumulate callchains of children into the parent entry so that they can
+	show up in the output. The output will have a new "Children" column
+	and will be sorted on the data. It requires that callchains be recorded.
+
+--max-stack::
+ Set the stack depth limit when parsing the callchain, anything
+ beyond the specified depth will be ignored. This is a trade-off
+ between information loss and faster processing especially for
+ workloads that can have a very long callchain stack.
+
+ Default: 127
+
+-G::
+--inverted::
+	Alias for inverted caller based call graph.
+
+--ignore-callees=<regex>::
+ Ignore callees of the function(s) matching the given regex.
+ This has the effect of collecting the callers of each such
+ function into one place in the call-graph tree.
+
+--pretty=<key>::
+ Pretty printing style. key: normal, raw
+
+--stdio:: Use the stdio interface.
+
+--tui:: Use the TUI interface, which is integrated with annotate and allows
+	zooming into DSOs or threads, among other features. Use of --tui
+	requires a tty; if one is not present, as when piping to other
+	commands, the stdio interface is used.
+
+--gtk:: Use the GTK2 interface.
+
+-k::
+--vmlinux=<file>::
+ vmlinux pathname
+
+--kallsyms=<file>::
+ kallsyms pathname
+
+-m::
+--modules::
+ Load module symbols. WARNING: This should only be used with -k and
+ a LIVE kernel.
+
+-f::
+--force::
+ Don't complain, do it.
+
+--symfs=<directory>::
+ Look for files with symbols relative to this directory.
+
+-C::
+--cpu:: Only report samples for the list of CPUs provided. Multiple CPUs can
+ be provided as a comma-separated list with no space: 0,1. Ranges of
+ CPUs are specified with -: 0-2. Default is to report samples on all
+ CPUs.
+
+-M::
+--disassembler-style=:: Set disassembler style for objdump.
+
+--source::
+ Interleave source code with assembly code. Enabled by default,
+ disable with --no-source.
+
+--asm-raw::
+ Show raw instruction encoding of assembly instructions.
+
+--show-total-period:: Show a column with the sum of periods.
+
+-I::
+--show-info::
+ Display extended information about the perf.data file. This adds
+ information which may be very large and thus may clutter the display.
+ It currently includes: cpu and numa topology of the host system.
+
+-b::
+--branch-stack::
+ Use the addresses of sampled taken branches instead of the instruction
+ address to build the histograms. To generate meaningful output, the
+ perf.data file must have been obtained using perf record -b or
+ perf record --branch-filter xxx where xxx is a branch filter option.
+ perf report is able to auto-detect whether a perf.data file contains
+ branch stacks and it will automatically switch to the branch view mode,
+ unless --no-branch-stack is used.
+
+--objdump=<path>::
+ Path to objdump binary.
+
+--group::
+ Show event group information together.
+
+--demangle::
+ Demangle symbol names to human readable form. It's enabled by default,
+ disable with --no-demangle.
+
+--mem-mode::
+ Use the data addresses of samples in addition to instruction addresses
+ to build the histograms. To generate meaningful output, the perf.data
+ file must have been obtained using perf record -d -W and using a
+ special event -e cpu/mem-loads/ or -e cpu/mem-stores/. See
+ 'perf mem' for simpler access.
+
+--percent-limit::
+ Do not show entries which have an overhead under that percent.
+ (Default: 0).
+
+--percentage::
+ Determine how to display the overhead percentage of filtered entries.
+ Filters can be applied by --comms, --dsos and/or --symbols options and
+ Zoom operations on the TUI (thread, dso, etc).
+
+ "relative" means it's relative to filtered entries only so that the
+ sum of shown entries will be always 100%. "absolute" means it retains
+ the original value before and after the filter is applied.
+
+--header::
+ Show header information in the perf.data file. This includes
+ various information like hostname, OS and perf version, cpu/mem
+ info, perf command line, event list and so on. Currently only
+ --stdio output supports this feature.
+
+--header-only::
+ Show only perf.data header (forces --stdio).
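+
+As an illustration of combining the options above (a sketch), a text-mode
+report sorted by command and DSO, with absolute percentages, could be:
+
+  perf report --stdio -s comm,dso --percentage=absolute -i perf.data
+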
SEE ALSO
--------
-linkperf:perf-stat[1]
+linkperf:perf-stat[1], linkperf:perf-annotate[1]
diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt
index 1ce79198997..8ff4df95695 100644
--- a/tools/perf/Documentation/perf-sched.txt
+++ b/tools/perf/Documentation/perf-sched.txt
@@ -8,11 +8,11 @@ perf-sched - Tool to trace/measure scheduler properties (latencies)
SYNOPSIS
--------
[verse]
-'perf sched' {record|latency|replay|trace}
+'perf sched' {record|latency|map|replay|script}
DESCRIPTION
-----------
-There's four variants of perf sched:
+There are five variants of perf sched:
'perf sched record <command>' to record the scheduling events
of an arbitrary workload.
@@ -20,18 +20,32 @@ There's four variants of perf sched:
'perf sched latency' to report the per task scheduling latencies
and other scheduling properties of the workload.
- 'perf sched trace' to see a detailed trace of the workload that
- was recorded.
+ 'perf sched script' to see a detailed trace of the workload that
+ was recorded (aliased to 'perf script' for now).
'perf sched replay' to simulate the workload that was recorded
via perf sched record. (this is done by starting up mockup threads
that mimic the workload based on the events in the trace. These
threads can then replay the timings (CPU runtime and sleep patterns)
- of the workload as it occured when it was recorded - and can repeat
+ of the workload as it occurred when it was recorded - and can repeat
it a number of times, measuring its performance.)
+ 'perf sched map' to print a textual context-switching outline of
+ workload captured via perf sched record. Columns stand for
+ individual CPUs, and the two-letter shortcuts stand for tasks that
+ are running on a CPU. A '*' denotes the CPU that had the event, and
+ a dot signals an idle CPU.
+
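+A typical workflow over these variants might be (a sketch; 'sleep 1' is only
+a placeholder workload):
+
+  perf sched record sleep 1
+  perf sched latency
+  perf sched map
+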
OPTIONS
-------
+-i::
+--input=<file>::
+ Input file name. (default: perf.data unless stdin is a fifo)
+
+-v::
+--verbose::
+ Be more verbose. (show symbol address, etc)
+
-D::
--dump-raw-trace=::
Display verbose dump of the sched data.
diff --git a/tools/perf/Documentation/perf-trace-perl.txt b/tools/perf/Documentation/perf-script-perl.txt
index d729cee8d98..d00bef23134 100644
--- a/tools/perf/Documentation/perf-trace-perl.txt
+++ b/tools/perf/Documentation/perf-script-perl.txt
@@ -1,19 +1,19 @@
-perf-trace-perl(1)
+perf-script-perl(1)
==================
NAME
----
-perf-trace-perl - Process trace data with a Perl script
+perf-script-perl - Process trace data with a Perl script
SYNOPSIS
--------
[verse]
-'perf trace' [-s [Perl]:script[.pl] ]
+'perf script' [-s [Perl]:script[.pl] ]
DESCRIPTION
-----------
-This perf trace option is used to process perf trace data using perf's
+This perf script option is used to process perf script data using perf's
built-in Perl interpreter. It reads and processes the input file and
displays the results of the trace analysis implemented in the given
Perl script, if any.
@@ -21,7 +21,7 @@ Perl script, if any.
STARTER SCRIPTS
---------------
-You can avoid reading the rest of this document by running 'perf trace
+You can avoid reading the rest of this document by running 'perf script
-g perl' in the same directory as an existing perf.data trace file.
That will generate a starter script containing a handler for each of
the event types in the trace file; it simply prints every available
@@ -30,13 +30,13 @@ field for each event in the trace file.
You can also look at the existing scripts in
~/libexec/perf-core/scripts/perl for typical examples showing how to
do basic things like aggregate event data, print results, etc. Also,
-the check-perf-trace.pl script, while not interesting for its results,
+the check-perf-script.pl script, while not interesting for its results,
attempts to exercise all of the main scripting features.
EVENT HANDLERS
--------------
-When perf trace is invoked using a trace script, a user-defined
+When perf script is invoked using a trace script, a user-defined
'handler function' is called for each event in the trace. If there's
no handler function defined for a given event type, the event is
ignored (or passed to a 'trace_handled' function, see below) and the
@@ -49,12 +49,10 @@ available as calls back into the perf executable (see below).
As an example, the following perf record command can be used to record
all sched_wakeup events in the system:
- # perf record -c 1 -f -a -M -R -e sched:sched_wakeup
+ # perf record -a -e sched:sched_wakeup
Traces meant to be processed using a script should be recorded with
-the above options: -c 1 says to sample every event, -a to enable
-system-wide collection, -M to multiplex the output, and -R to collect
-raw samples.
+the above option: -a to enable system-wide collection.
The format file for the sched_wakep event defines the following fields
(see /sys/kernel/debug/tracing/events/sched/sched_wakeup/format):
@@ -65,7 +63,6 @@ The format file for the sched_wakep event defines the following fields
field:unsigned char common_flags;
field:unsigned char common_preempt_count;
field:int common_pid;
- field:int common_lock_depth;
field:char comm[TASK_COMM_LEN];
field:pid_t pid;
@@ -114,7 +111,7 @@ write a useful trace script. The sections below cover the rest.
SCRIPT LAYOUT
-------------
-Every perf trace Perl script should start by setting up a Perl module
+Every perf script Perl script should start by setting up a Perl module
search path and 'use'ing a few support modules (see module
descriptions below):
@@ -164,7 +161,7 @@ sub trace_unhandled
----
The remaining sections provide descriptions of each of the available
-built-in perf trace Perl modules and their associated functions.
+built-in perf script Perl modules and their associated functions.
AVAILABLE MODULES AND FUNCTIONS
-------------------------------
@@ -172,7 +169,7 @@ AVAILABLE MODULES AND FUNCTIONS
The following sections describe the functions and variables available
via the various Perf::Trace::* Perl modules. To use the functions and
variables from the given module, add the corresponding 'use
-Perf::Trace::XXX' line to your perf trace script.
+Perf::Trace::XXX' line to your perf script script.
Perf::Trace::Core Module
~~~~~~~~~~~~~~~~~~~~~~~~
@@ -206,7 +203,7 @@ argument.
Perf::Trace::Util Module
~~~~~~~~~~~~~~~~~~~~~~~~
-Various utility functions for use with perf trace:
+Various utility functions for use with perf script:
nsecs($secs, $nsecs) - returns total nsecs given secs/nsecs pair
nsecs_secs($nsecs) - returns whole secs portion given nsecs
@@ -216,4 +213,4 @@ Various utility functions for use with perf trace:
SEE ALSO
--------
-linkperf:perf-trace[1]
+linkperf:perf-script[1]
diff --git a/tools/perf/Documentation/perf-trace-python.txt b/tools/perf/Documentation/perf-script-python.txt
index a241aca7718..9f1f054b843 100644
--- a/tools/perf/Documentation/perf-trace-python.txt
+++ b/tools/perf/Documentation/perf-script-python.txt
@@ -1,19 +1,19 @@
-perf-trace-python(1)
-==================
+perf-script-python(1)
+====================
NAME
----
-perf-trace-python - Process trace data with a Python script
+perf-script-python - Process trace data with a Python script
SYNOPSIS
--------
[verse]
-'perf trace' [-s [Python]:script[.py] ]
+'perf script' [-s [Python]:script[.py] ]
DESCRIPTION
-----------
-This perf trace option is used to process perf trace data using perf's
+This perf script option is used to process perf script data using perf's
built-in Python interpreter. It reads and processes the input file and
displays the results of the trace analysis implemented in the given
Python script, if any.
@@ -23,15 +23,15 @@ A QUICK EXAMPLE
This section shows the process, start to finish, of creating a working
Python script that aggregates and extracts useful information from a
-raw perf trace stream. You can avoid reading the rest of this
+raw perf script stream. You can avoid reading the rest of this
document if an example is enough for you; the rest of the document
provides more details on each step and lists the library functions
available to script writers.
This example actually details the steps that were used to create the
-'syscall-counts' script you see when you list the available perf trace
-scripts via 'perf trace -l'. As such, this script also shows how to
-integrate your script into the list of general-purpose 'perf trace'
+'syscall-counts' script you see when you list the available perf script
+scripts via 'perf script -l'. As such, this script also shows how to
+integrate your script into the list of general-purpose 'perf script'
scripts listed by that command.
The syscall-counts script is a simple script, but demonstrates all the
@@ -93,7 +93,7 @@ don't care how it exited, so we'll use 'perf record' to record only
the sys_enter events:
----
-# perf record -c 1 -f -a -M -R -e raw_syscalls:sys_enter
+# perf record -a -e raw_syscalls:sys_enter
^C[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 56.545 MB perf.data (~2470503 samples) ]
@@ -105,25 +105,25 @@ That single stream will be recorded in a file in the current directory
called perf.data.
Once we have a perf.data file containing our data, we can use the -g
-'perf trace' option to generate a Python script that will contain a
+'perf script' option to generate a Python script that will contain a
callback handler for each event type found in the perf.data trace
stream (for more details, see the STARTER SCRIPTS section).
----
-# perf trace -g python
-generated Python script: perf-trace.py
+# perf script -g python
+generated Python script: perf-script.py
The output file created also in the current directory is named
-perf-trace.py. Here's the file in its entirety:
+perf-script.py. Here's the file in its entirety:
-# perf trace event handlers, generated by perf trace -g python
+# perf script event handlers, generated by perf script -g python
# Licensed under the terms of the GNU GPL License version 2
# The common_* event handler fields are the most useful fields common to
# all events. They don't necessarily correspond to the 'common_*' fields
# in the format files. Those fields not available as handler params can
# be retrieved using Python functions of the form common_*(context).
-# See the perf-trace-python Documentation for the list of available functions.
+# See the perf-script-python Documentation for the list of available functions.
import os
import sys
@@ -160,7 +160,7 @@ def print_header(event_name, cpu, secs, nsecs, pid, comm):
----
At the top is a comment block followed by some import statements and a
-path append which every perf trace script should include.
+path append which every perf script script should include.
Following that are a couple generated functions, trace_begin() and
trace_end(), which are called at the beginning and the end of the
@@ -182,15 +182,15 @@ mean either that the record step recorded event types that it wasn't
really interested in, or the script was run against a trace file that
doesn't correspond to the script.
-The script generated by -g option option simply prints a line for each
+The script generated by -g option simply prints a line for each
event found in the trace stream i.e. it basically just dumps the event
and its parameter values to stdout. The print_header() function is
simply a utility function used for that purpose. Let's rename the
script and run it to see the default output:
----
-# mv perf-trace.py syscall-counts.py
-# perf trace -s syscall-counts.py
+# mv perf-script.py syscall-counts.py
+# perf script -s syscall-counts.py
raw_syscalls__sys_enter 1 00840.847582083 7506 perf id=1, args=
raw_syscalls__sys_enter 1 00840.847595764 7506 perf id=1, args=
@@ -315,7 +315,7 @@ def print_syscall_totals():
The script can be run just as before:
- # perf trace -s syscall-counts.py
+ # perf script -s syscall-counts.py
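Filled in, a counting script of this kind has roughly the following shape. This is a simplified, hedged sketch written for this walkthrough, not the syscall-counts.py shipped with perf; the handler name and parameter order follow the event-name__tracepoint convention described in the EVENT HANDLERS section below.

----
# Simplified, illustrative counting script (not the shipped syscall-counts.py).
from collections import defaultdict

syscalls = defaultdict(int)          # syscall id -> number of sys_enter events

def trace_begin():
    print "Press control+C to stop and show the summary"

def raw_syscalls__sys_enter(event_name, context, common_cpu, common_secs,
                            common_nsecs, common_pid, common_comm, id, args):
    syscalls[id] += 1                # tally each syscall by id

def trace_end():
    print "%-10s %10s" % ("syscall", "count")
    for id, count in sorted(syscalls.items(), key=lambda kv: kv[1], reverse=True):
        print "%-10d %10d" % (id, count)
----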
So those are the essential steps in writing and running a script. The
process can be generalized to any tracepoint or set of tracepoints
@@ -324,19 +324,18 @@ interested in by looking at the list of available events shown by
'perf list' and/or look in /sys/kernel/debug/tracing events for
detailed event and field info, record the corresponding trace data
using 'perf record', passing it the list of interesting events,
-generate a skeleton script using 'perf trace -g python' and modify the
+generate a skeleton script using 'perf script -g python' and modify the
code to aggregate and display it for your particular needs.
After you've done that you may end up with a general-purpose script
that you want to keep around and have available for future use. By
writing a couple of very simple shell scripts and putting them in the
right place, you can have your script listed alongside the other
-scripts listed by the 'perf trace -l' command e.g.:
+scripts listed by the 'perf script -l' command e.g.:
----
-root@tropicana:~# perf trace -l
+root@tropicana:~# perf script -l
List of available trace scripts:
- workqueue-stats workqueue stats (ins/exe/create/destroy)
wakeup-latency system-wide min/max/avg wakeup latency
rw-by-file <comm> r/w activity for a program, by file
rw-by-pid system-wide r/w activity
@@ -359,20 +358,20 @@ your script:
# cat kernel-source/tools/perf/scripts/python/bin/syscall-counts-record
#!/bin/bash
-perf record -c 1 -f -a -M -R -e raw_syscalls:sys_enter
+perf record -a -e raw_syscalls:sys_enter
----
The 'report' script is also a shell script with the same base name as
your script, but with -report appended. It should also be located in
the perf/scripts/python/bin directory. In that script, you write the
-'perf trace -s' command-line needed for running your script:
+'perf script -s' command-line needed for running your script:
----
# cat kernel-source/tools/perf/scripts/python/bin/syscall-counts-report
#!/bin/bash
# description: system-wide syscall counts
-perf trace -s ~/libexec/perf-core/scripts/python/syscall-counts.py
+perf script -s ~/libexec/perf-core/scripts/python/syscall-counts.py
----
Note that the location of the Python script given in the shell script
@@ -390,38 +389,37 @@ total 32
drwxr-xr-x 4 trz trz 4096 2010-01-26 22:30 .
drwxr-xr-x 4 trz trz 4096 2010-01-26 22:29 ..
drwxr-xr-x 2 trz trz 4096 2010-01-26 22:29 bin
--rw-r--r-- 1 trz trz 2548 2010-01-26 22:29 check-perf-trace.py
+-rw-r--r-- 1 trz trz 2548 2010-01-26 22:29 check-perf-script.py
drwxr-xr-x 3 trz trz 4096 2010-01-26 22:49 Perf-Trace-Util
-rw-r--r-- 1 trz trz 1462 2010-01-26 22:30 syscall-counts.py
----
Once you've done that (don't forget to do a new 'make install',
-otherwise your script won't show up at run-time), 'perf trace -l'
+otherwise your script won't show up at run-time), 'perf script -l'
should show a new entry for your script:
----
-root@tropicana:~# perf trace -l
+root@tropicana:~# perf script -l
List of available trace scripts:
- workqueue-stats workqueue stats (ins/exe/create/destroy)
wakeup-latency system-wide min/max/avg wakeup latency
rw-by-file <comm> r/w activity for a program, by file
rw-by-pid system-wide r/w activity
syscall-counts system-wide syscall counts
----
-You can now perform the record step via 'perf trace record':
+You can now perform the record step via 'perf script record':
- # perf trace record syscall-counts
+ # perf script record syscall-counts
-and display the output using 'perf trace report':
+and display the output using 'perf script report':
- # perf trace report syscall-counts
+ # perf script report syscall-counts
STARTER SCRIPTS
---------------
You can quickly get started writing a script for a particular set of
-trace data by generating a skeleton script using 'perf trace -g
+trace data by generating a skeleton script using 'perf script -g
python' in the same directory as an existing perf.data trace file.
That will generate a starter script containing a handler for each of
the event types in the trace file; it simply prints every available
@@ -430,13 +428,13 @@ field for each event in the trace file.
You can also look at the existing scripts in
~/libexec/perf-core/scripts/python for typical examples showing how to
do basic things like aggregate event data, print results, etc. Also,
-the check-perf-trace.py script, while not interesting for its results,
+the check-perf-script.py script, while not interesting for its results,
attempts to exercise all of the main scripting features.
EVENT HANDLERS
--------------
-When perf trace is invoked using a trace script, a user-defined
+When perf script is invoked using a trace script, a user-defined
'handler function' is called for each event in the trace. If there's
no handler function defined for a given event type, the event is
ignored (or passed to a 'trace_handled' function, see below) and the
@@ -449,12 +447,10 @@ available as calls back into the perf executable (see below).
As an example, the following perf record command can be used to record
all sched_wakeup events in the system:
- # perf record -c 1 -f -a -M -R -e sched:sched_wakeup
+ # perf record -a -e sched:sched_wakeup
Traces meant to be processed using a script should be recorded with
-the above options: -c 1 says to sample every event, -a to enable
-system-wide collection, -M to multiplex the output, and -R to collect
-raw samples.
+the above option: -a to enable system-wide collection.
The format file for the sched_wakeup event defines the following fields
(see /sys/kernel/debug/tracing/events/sched/sched_wakeup/format):
@@ -465,7 +461,6 @@ The format file for the sched_wakep event defines the following fields
field:unsigned char common_flags;
field:unsigned char common_preempt_count;
field:int common_pid;
- field:int common_lock_depth;
field:char comm[TASK_COMM_LEN];
field:pid_t pid;
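To connect these format fields to a handler: each non-common field becomes a positional parameter after the common_* ones, in the order the format file lists them. The field list above is truncated by the diff context, so the extra parameters in this sketch (prio, success, target_cpu) are assumed from the usual sched_wakeup layout rather than taken from this document.

----
# Hedged sketch of a sched:sched_wakeup handler; prio, success and target_cpu
# are assumed fields that are not visible in the truncated listing above.
def sched__sched_wakeup(event_name, context, common_cpu, common_secs,
                        common_nsecs, common_pid, common_comm,
                        comm, pid, prio, success, target_cpu):
    print "%-16s woke %-16s (pid %d) on cpu %d" % \
          (common_comm, comm, pid, target_cpu)
----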
@@ -512,7 +507,7 @@ write a useful trace script. The sections below cover the rest.
SCRIPT LAYOUT
-------------
-Every perf trace Python script should start by setting up a Python
+Every perf script Python script should start by setting up a Python
module search path and 'import'ing a few support modules (see module
descriptions below):
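In code, that setup is the boilerplate the generated starter scripts put at the top of every script, roughly as below; the PERF_EXEC_PATH-relative path is taken from those starter scripts and may differ between installations, so treat this as a sketch rather than a fixed recipe.

----
# Typical preamble of a perf script Python script (sketch; the exact path is
# what the generated starter scripts use and may vary by installation).
import os
import sys

sys.path.append(os.environ['PERF_EXEC_PATH'] + \
        '/scripts/python/Perf-Trace-Util/lib/Perf/Trace')

from perf_trace_context import *
from Core import *
----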
@@ -561,15 +556,15 @@ def trace_unhandled(event_name, context, common_cpu, common_secs,
----
The remaining sections provide descriptions of each of the available
-built-in perf trace Python modules and their associated functions.
+built-in perf script Python modules and their associated functions.
AVAILABLE MODULES AND FUNCTIONS
-------------------------------
The following sections describe the functions and variables available
-via the various perf trace Python modules. To use the functions and
+via the various perf script Python modules. To use the functions and
variables from the given module, add the corresponding 'from XXXX
-import' line to your perf trace script.
+import' line to your perf script script.
Core.py Module
~~~~~~~~~~~~~~
@@ -584,7 +579,7 @@ files:
    flag_str(event_name, field_name, field_value) - returns the string representation corresponding to field_value for the flag field field_name of event event_name
    symbol_str(event_name, field_name, field_value) - returns the string representation corresponding to field_value for the symbolic field field_name of event event_name
-The *autodict* function returns a special special kind of Python
+The *autodict* function returns a special kind of Python
dictionary that implements Perl's 'autovivifying' hashes in Python
i.e. with autovivifying hashes, you can assign nested hash values
without having to go to the trouble of creating intermediate levels if
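For readers unfamiliar with autovivification, the snippet below is a minimal stand-in for the behaviour being described; it is an illustration built on collections.defaultdict, not the actual Core.py implementation of autodict().

----
# Minimal autovivifying dictionary (illustration only, not Core.py's autodict).
from collections import defaultdict

def autodict_sketch():
    return defaultdict(autodict_sketch)

counts = autodict_sketch()
counts['comm']['pid']['syscall'] = 1   # intermediate levels spring into existence
----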
@@ -612,7 +607,7 @@ argument.
Util.py Module
~~~~~~~~~~~~~~
-Various utility functions for use with perf trace:
+Various utility functions for use with perf script:
nsecs(secs, nsecs) - returns total nsecs given secs/nsecs pair
nsecs_secs(nsecs) - returns whole secs portion given nsecs
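As a rough illustration of how these helpers fit together, converting a secs/nsecs pair into a printable timestamp might look like the sketch below. This is an illustrative re-implementation, not the shipped Util.py; nsecs_nsecs() and nsecs_str() are additional helper names assumed here beyond the two listed above.

----
# Illustrative re-implementation of the kind of helpers Util.py provides.
NSECS_PER_SEC = 1000000000

def nsecs(secs, nsecs):
    return secs * NSECS_PER_SEC + nsecs

def nsecs_secs(ns):
    return ns // NSECS_PER_SEC       # whole seconds portion

def nsecs_nsecs(ns):                 # assumed helper, not listed above
    return ns % NSECS_PER_SEC

def nsecs_str(ns):                   # assumed helper, not listed above
    return "%5u.%09u" % (nsecs_secs(ns), nsecs_nsecs(ns))

print nsecs_str(nsecs(840, 847582083))   # -> "  840.847582083"
----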
@@ -622,4 +617,4 @@ Various utility functions for use with perf trace:
SEE ALSO
--------
-linkperf:perf-trace[1]
+linkperf:perf-script[1]
diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
new file mode 100644
index 00000000000..05f9a0a6784
--- /dev/null
+++ b/tools/perf/Documentation/perf-script.txt
@@ -0,0 +1,221 @@
+perf-script(1)
+=============
+
+NAME
+----
+perf-script - Read perf.data (created by perf record) and display trace output
+
+SYNOPSIS
+--------
+[verse]
+'perf script' [<options>]
+'perf script' [<options>] record <script> [<record-options>] <command>
+'perf script' [<options>] report <script> [script-args]
+'perf script' [<options>] <script> <required-script-args> [<record-options>] <command>
+'perf script' [<options>] <top-script> [script-args]
+
+DESCRIPTION
+-----------
+This command reads the input file and displays the trace recorded.
+
+There are several variants of perf script:
+
+ 'perf script' to see a detailed trace of the workload that was
+ recorded.
+
+ You can also run a set of pre-canned scripts that aggregate and
+ summarize the raw trace data in various ways (the list of scripts is
+ available via 'perf script -l'). The following variants allow you to
+ record and run those scripts:
+
+ 'perf script record <script> <command>' to record the events required
+ for 'perf script report'. <script> is the name displayed in the
+ output of 'perf script --list' i.e. the actual script name minus any
+ language extension. If <command> is not specified, the events are
+ recorded using the -a (system-wide) 'perf record' option.
+
+ 'perf script report <script> [args]' to run and display the results
+ of <script>. <script> is the name displayed in the output of 'perf
+	script --list' i.e. the actual script name minus any language
+ extension. The perf.data output from a previous run of 'perf script
+ record <script>' is used and should be present for this command to
+ succeed. [args] refers to the (mainly optional) args expected by
+ the script.
+
+ 'perf script <script> <required-script-args> <command>' to both
+ record the events required for <script> and to run the <script>
+ using 'live-mode' i.e. without writing anything to disk. <script>
+ is the name displayed in the output of 'perf script --list' i.e. the
+ actual script name minus any language extension. If <command> is
+ not specified, the events are recorded using the -a (system-wide)
+ 'perf record' option. If <script> has any required args, they
+ should be specified before <command>. This mode doesn't allow for
+ optional script args to be specified; if optional script args are
+ desired, they can be specified using separate 'perf script record'
+ and 'perf script report' commands, with the stdout of the record step
+ piped to the stdin of the report script, using the '-o -' and '-i -'
+ options of the corresponding commands.
+
+ 'perf script <top-script>' to both record the events required for
+ <top-script> and to run the <top-script> using 'live-mode'
+ i.e. without writing anything to disk. <top-script> is the name
+ displayed in the output of 'perf script --list' i.e. the actual
+ script name minus any language extension; a <top-script> is defined
+ as any script name ending with the string 'top'.
+
+ [<record-options>] can be passed to the record steps of 'perf script
+ record' and 'live-mode' variants; this isn't possible however for
+ <top-script> 'live-mode' or 'perf script report' variants.
+
+ See the 'SEE ALSO' section for links to language-specific
+ information on how to write and run your own trace scripts.
+
+OPTIONS
+-------
+<command>...::
+ Any command you can specify in a shell.
+
+-D::
+--dump-raw-script=::
+ Display verbose dump of the trace data.
+
+-L::
+--Latency=::
+ Show latency attributes (irqs/preemption disabled, etc).
+
+-l::
+--list=::
+ Display a list of available trace scripts.
+
+-s ['lang']::
+--script=::
+ Process trace data with the given script ([lang]:script[.ext]).
+ If the string 'lang' is specified in place of a script name, a
+ list of supported languages will be displayed instead.
+
+-g::
+--gen-script=::
+ Generate perf-script.[ext] starter script for given language,
+ using current perf.data.
+
+-a::
+ Force system-wide collection. Scripts run without a <command>
+ normally use -a by default, while scripts run with a <command>
+ normally don't - this option allows the latter to be run in
+ system-wide mode.
+
+-i::
+--input=::
+ Input file name. (default: perf.data unless stdin is a fifo)
+
+-d::
+--debug-mode::
+ Do various checks like samples ordering and lost events.
+
+-f::
+--fields::
+ Comma separated list of fields to print. Options are:
+ comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff, srcline.
+ Field list can be prepended with the type, trace, sw or hw,
+ to indicate to which event type the field list applies.
+ e.g., -f sw:comm,tid,time,ip,sym and -f trace:time,cpu,trace
+
+ perf script -f <fields>
+
+ is equivalent to:
+
+ perf script -f trace:<fields> -f sw:<fields> -f hw:<fields>
+
+ i.e., the specified fields apply to all event types if the type string
+ is not given.
+
+ The arguments are processed in the order received. A later usage can
+ reset a prior request. e.g.:
+
+ -f trace: -f comm,tid,time,ip,sym
+
+ The first -f suppresses trace events (field list is ""), but then the
+ second invocation sets the fields to comm,tid,time,ip,sym. In this case a
+ warning is given to the user:
+
+ "Overriding previous field request for all events."
+
+	Alternatively, consider the order:
+
+ -f comm,tid,time,ip,sym -f trace:
+
+ The first -f sets the fields for all events and the second -f
+ suppresses trace events. The user is given a warning message about
+ the override, and the result of the above is that only S/W and H/W
+ events are displayed with the given fields.
+
+	For the 'wildcard' option, if a user-selected field is invalid for an
+	event type, a message is displayed telling the user that the option is
+	ignored for that type. For example:
+
+ $ perf script -f comm,tid,trace
+ 'trace' not valid for hardware events. Ignoring.
+ 'trace' not valid for software events. Ignoring.
+
+	Alternatively, if the type is given and an invalid field is specified,
+	it is an error. For example:
+
+ perf script -v -f sw:comm,tid,trace
+ 'trace' not valid for software events.
+
+ At this point usage is displayed, and perf-script exits.
+
+ Finally, a user may not set fields to none for all event types.
+ i.e., -f "" is not allowed.
+
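Since field lists are often assembled programmatically, a small Python sketch of driving 'perf script -f' is shown below. It assumes a perf.data file in the current directory; the helper name is ours, not part of perf.

----
# Hypothetical helper that runs 'perf script' with a field list and returns
# its output lines; assumes a perf.data file in the current directory.
import subprocess

def perf_script_fields(fieldspec, perf_data="perf.data"):
    cmd = ["perf", "script", "-i", perf_data, "-f", fieldspec]
    return subprocess.check_output(cmd).splitlines()

# Default handling for all event types:
for line in perf_script_fields("comm,tid,time,sym")[:10]:
    print line

# Per-type field list, as described above:
for line in perf_script_fields("trace:time,cpu,trace")[:10]:
    print line
----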
+-k::
+--vmlinux=<file>::
+ vmlinux pathname
+
+--kallsyms=<file>::
+ kallsyms pathname
+
+--symfs=<directory>::
+ Look for files with symbols relative to this directory.
+
+-G::
+--hide-call-graph::
+ When printing symbols do not display call chain.
+
+-C::
+--cpu:: Only report samples for the list of CPUs provided. Multiple CPUs can
+ be provided as a comma-separated list with no space: 0,1. Ranges of
+ CPUs are specified with -: 0-2. Default is to report samples on all
+ CPUs.
+
+-c::
+--comms=::
+ Only display events for these comms. CSV that understands
+ file://filename entries.
+
+-I::
+--show-info::
+ Display extended information about the perf.data file. This adds
+ information which may be very large and thus may clutter the display.
+ It currently includes: cpu and numa topology of the host system.
+ It can only be used with the perf script report mode.
+
+--show-kernel-path::
+ Try to resolve the path of [kernel.kallsyms]
+
+--show-task-events::
+ Display task related events (e.g. FORK, COMM, EXIT).
+
+--show-mmap-events::
+ Display mmap related events (e.g. MMAP, MMAP2).
+
+--header::
+ Show perf.data header.
+
+--header-only::
+ Show only perf.data header.
+
+SEE ALSO
+--------
+linkperf:perf-record[1], linkperf:perf-script-perl[1],
+linkperf:perf-script-python[1]
diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index 484080dd5b6..29ee857c09c 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -8,8 +8,8 @@ perf-stat - Run a command and gather performance counter statistics
SYNOPSIS
--------
[verse]
-'perf stat' [-e <EVENT> | --event=EVENT] [-S] [-a] <command>
-'perf stat' [-e <EVENT> | --event=EVENT] [-S] [-a] -- <command> [<options>]
+'perf stat' [-e <EVENT> | --event=EVENT] [-a] <command>
+'perf stat' [-e <EVENT> | --event=EVENT] [-a] -- <command> [<options>]
DESCRIPTION
-----------
@@ -31,17 +31,116 @@ OPTIONS
hexadecimal event descriptor.
-i::
---inherit::
- child tasks inherit counters
+--no-inherit::
+ child tasks do not inherit counters
-p::
--pid=<pid>::
- stat events on existing pid
+ stat events on existing process id (comma separated list)
+
+-t::
+--tid=<tid>::
+ stat events on existing thread id (comma separated list)
+
-a::
- system-wide collection
+--all-cpus::
+ system-wide collection from all CPUs
-c::
- scale counter values
+--scale::
+ scale/normalize counter values
+
+-r::
+--repeat=<n>::
+ repeat command and print average + stddev (max: 100). 0 means forever.
+
+-B::
+--big-num::
+ print large numbers with thousands' separators according to locale
+
+-C::
+--cpu=::
+Count only on the list of CPUs provided. Multiple CPUs can be provided as a
+comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2.
+In per-thread mode, this option is ignored. The -a option is still necessary
+to activate system-wide monitoring. Default is to count on all CPUs.
+
+-A::
+--no-aggr::
+Do not aggregate counts across all monitored CPUs in system-wide mode (-a).
+This option is only valid in system-wide mode.
+
+-n::
+--null::
+ null run - don't start any counters
+
+-v::
+--verbose::
+ be more verbose (show counter open errors, etc)
+
+-x SEP::
+--field-separator SEP::
+print counts using a CSV-style output to make it easy to import directly into
+spreadsheets. Columns are separated by the string specified in SEP.
+
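Since -x is aimed at machine consumption, a small consumer might look like the sketch below. Note that perf stat writes its counter results to stderr unless -o or --log-fd is used, and the exact column layout of the CSV output can vary between perf versions, so the parsing here is deliberately loose and the chosen events and workload are only examples.

----
# Loose sketch of consuming 'perf stat -x,' output from Python.
import subprocess

cmd = ["perf", "stat", "-x", ",", "-e", "cycles,instructions", "--", "sleep", "1"]
proc = subprocess.Popen(cmd, stderr=subprocess.PIPE)
_, counts = proc.communicate()       # counter results arrive on stderr

for line in counts.splitlines():
    fields = [f.strip() for f in line.split(",")]
    if len(fields) >= 2 and fields[0]:
        # the first field is the counter value; the remaining columns (unit,
        # event name, ...) differ between perf versions, so just show them all
        print fields
----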
+-G name::
+--cgroup name::
+monitor only in the container (cgroup) called "name". This option is available only
+in per-cpu mode. The cgroup filesystem must be mounted. All threads belonging to
+container "name" are monitored when they run on the monitored CPUs. Multiple cgroups
+can be provided. Each cgroup is applied to the corresponding event, i.e., first cgroup
+to first event, second cgroup to second event and so on. It is possible to provide
+an empty cgroup (monitor all the time) using, e.g., -G foo,,bar. Cgroups must have
+corresponding events, i.e., they always refer to events defined earlier on the command
+line.
+
+-o file::
+--output file::
+Print the output into the designated file.
+
+--append::
+Append to the output file designated with the -o option. Ignored if -o is not specified.
+
+--log-fd::
+
+Log output to fd, instead of stderr. Complementary to --output, and mutually exclusive
+with it. --append may be used here. Examples:
+ 3>results perf stat --log-fd 3 -- $cmd
+ 3>>results perf stat --log-fd 3 --append -- $cmd
+
+--pre::
+--post::
+ Pre and post measurement hooks, e.g.:
+
+perf stat --repeat 10 --null --sync --pre 'make -s O=defconfig-build/clean' -- make -s -j64 O=defconfig-build/ bzImage
+
+-I msecs::
+--interval-print msecs::
+ Print count deltas every N milliseconds (minimum: 100ms)
+ example: perf stat -I 1000 -e cycles -a sleep 5
+
+--per-socket::
+Aggregate counts per processor socket for system-wide mode measurements. This
+is a useful mode to detect imbalance between sockets. To enable this mode,
+use --per-socket in addition to -a. (system-wide). The output includes the
+socket number and the number of online processors on that socket. This is
+useful to gauge the amount of aggregation.
+
+--per-core::
+Aggregate counts per physical processor for system-wide mode measurements. This
+is a useful mode to detect imbalance between physical cores. To enable this mode,
+use --per-core in addition to -a. (system-wide). The output includes the
+core number and the number of online logical processors on that physical processor.
+
+-D msecs::
+--delay msecs::
+After starting the program, wait msecs before measuring. This is useful to
+filter out the startup phase of the program, which is often very different.
+
+-T::
+--transaction::
+
+Print statistics of transactional execution if supported.
EXAMPLES
--------
diff --git a/tools/perf/Documentation/perf-test.txt b/tools/perf/Documentation/perf-test.txt
new file mode 100644
index 00000000000..d1d3e5121f8
--- /dev/null
+++ b/tools/perf/Documentation/perf-test.txt
@@ -0,0 +1,32 @@
+perf-test(1)
+============
+
+NAME
+----
+perf-test - Runs sanity tests.
+
+SYNOPSIS
+--------
+[verse]
+'perf test [<options>] [{list <test-name-fragment>|[<test-name-fragments>|<test-numbers>]}]'
+
+DESCRIPTION
+-----------
+This command runs assorted sanity tests, initially via linked-in routines, but
+it will also look for a directory with more tests in the form of scripts.
+
+To get a list of available tests use 'perf test list'; specifying a test name
+fragment will show all tests that contain it.
+
+To run specific tests, pass test name fragments or the numbers obtained
+from 'perf test list'.
+
+OPTIONS
+-------
+-s::
+--skip::
+	Tests to skip (comma separated numeric list).
+
+-v::
+--verbose::
+ Be more verbose.
diff --git a/tools/perf/Documentation/perf-timechart.txt b/tools/perf/Documentation/perf-timechart.txt
index 4b1788355ec..5e0f986dff3 100644
--- a/tools/perf/Documentation/perf-timechart.txt
+++ b/tools/perf/Documentation/perf-timechart.txt
@@ -8,7 +8,7 @@ perf-timechart - Tool to visualize total system behavior during a workload
SYNOPSIS
--------
[verse]
-'perf timechart' {record}
+'perf timechart' [<timechart options>] {record} [<record options>]
DESCRIPTION
-----------
@@ -20,24 +20,72 @@ There are two variants of perf timechart:
'perf timechart' to turn a trace into a Scalable Vector Graphics file,
that can be viewed with popular SVG viewers such as 'Inkscape'.
-OPTIONS
--------
+TIMECHART OPTIONS
+-----------------
-o::
--output=::
Select the output file (default: output.svg)
-i::
--input=::
- Select the input file (default: perf.data)
+ Select the input file (default: perf.data unless stdin is a fifo)
-w::
--width=::
Select the width of the SVG file (default: 1000)
-P::
--power-only::
Only output the CPU power section of the diagram
+-T::
+--tasks-only::
+ Don't output processor state transitions
-p::
--process::
Select the processes to display, by name or PID
+--symfs=<directory>::
+ Look for files with symbols relative to this directory.
+-n::
+--proc-num::
+	Print task info for at least the given number of tasks.
+-t::
+--topology::
+ Sort CPUs according to topology.
+--highlight=<duration_nsecs|task_name>::
+	Highlight tasks (using a different color) that run for more than the
+	given duration or that have the given name. If a number is given, it is
+	interpreted as a duration in nanoseconds; if a non-numeric string is
+	given, it is interpreted as a task name.
+
+RECORD OPTIONS
+--------------
+-P::
+--power-only::
+ Record only power-related events
+-T::
+--tasks-only::
+ Record only tasks-related events
+-g::
+--callchain::
+ Do call-graph (stack chain/backtrace) recording
+
+EXAMPLES
+--------
+
+$ perf timechart record git pull
+
+ [ perf record: Woken up 13 times to write data ]
+ [ perf record: Captured and wrote 4.253 MB perf.data (~185801 samples) ]
+
+$ perf timechart
+
+ Written 10.2 seconds of trace to output.svg.
+
+Record system-wide timechart:
+
+ $ perf timechart record
+
+ then generate timechart and highlight 'gcc' tasks:
+
+ $ perf timechart --highlight gcc
SEE ALSO
--------
diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt
index 785b9fc32a4..180ae02137a 100644
--- a/tools/perf/Documentation/perf-top.txt
+++ b/tools/perf/Documentation/perf-top.txt
@@ -12,7 +12,7 @@ SYNOPSIS
DESCRIPTION
-----------
-This command generates and displays a performance counter profile in realtime.
+This command generates and displays a performance counter profile in real time.
OPTIONS
@@ -25,9 +25,11 @@ OPTIONS
--count=<count>::
Event period to sample.
--C <cpu>::
---CPU=<cpu>::
- CPU to profile.
+-C <cpu-list>::
+--cpu=<cpu>::
+Monitor only on the list of CPUs provided. Multiple CPUs can be provided as a
+comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2.
+Default is to monitor all CPUs.
-d <seconds>::
--delay=<seconds>::
@@ -48,13 +50,16 @@ OPTIONS
--count-filter=<count>::
Only display functions with more events than this.
+--group::
+ Put the counters into a counter group.
+
-F <freq>::
--freq=<freq>::
Profile at this frequency.
-i::
--inherit::
- Child tasks inherit counters, only makes sens with -p option.
+ Child tasks do not inherit counters.
-k <path>::
--vmlinux=<path>::
@@ -62,20 +67,41 @@ OPTIONS
-m <pages>::
--mmap-pages=<pages>::
- Number of mmapped data pages.
+ Number of mmap data pages (must be a power of two) or size
+ specification with appended unit character - B/K/M/G. The
+	size is rounded up to the nearest power-of-two number of pages.
-p <pid>::
--pid=<pid>::
- Profile events on existing pid.
+ Profile events on existing Process ID (comma separated list).
+
+-t <tid>::
+--tid=<tid>::
+ Profile events on existing thread ID (comma separated list).
+
+-u::
+--uid=::
+ Record events in threads owned by uid. Name or number.
-r <priority>::
--realtime=<priority>::
Collect data with this RT SCHED_FIFO priority.
--s <symbol>::
--sym-annotate=<symbol>::
Annotate this symbol.
+-K::
+--hide_kernel_symbols::
+ Hide kernel symbols.
+
+-U::
+--hide_user_symbols::
+ Hide user symbols.
+
+-D::
+--dump-symtab::
+ Dump the symbol table used for profiling.
+
-v::
--verbose::
Be more verbose (show counter open errors, etc).
@@ -84,6 +110,89 @@ OPTIONS
--zero::
Zero history across display updates.
+-s::
+--sort::
+ Sort by key(s): pid, comm, dso, symbol, parent, srcline, weight,
+ local_weight, abort, in_tx, transaction, overhead, sample, period.
+ Please see description of --sort in the perf-report man page.
+
+--fields=::
+ Specify output field - multiple keys can be specified in CSV format.
+ Following fields are available:
+ overhead, overhead_sys, overhead_us, overhead_children, sample and period.
+ Also it can contain any sort key(s).
+
+	By default, every sort key not specified in --fields will be appended
+	automatically.
+
+-n::
+--show-nr-samples::
+ Show a column with the number of samples.
+
+--show-total-period::
+ Show a column with the sum of periods.
+
+--dsos::
+ Only consider symbols in these dsos. This option will affect the
+ percentage of the overhead column. See --percentage for more info.
+
+--comms::
+ Only consider symbols in these comms. This option will affect the
+ percentage of the overhead column. See --percentage for more info.
+
+--symbols::
+ Only consider these symbols. This option will affect the
+ percentage of the overhead column. See --percentage for more info.
+
+-M::
+--disassembler-style=:: Set disassembler style for objdump.
+
+--source::
+ Interleave source code with assembly code. Enabled by default,
+ disable with --no-source.
+
+--asm-raw::
+ Show raw instruction encoding of assembly instructions.
+
+-g::
+ Enables call-graph (stack chain/backtrace) recording.
+
+--call-graph::
+ Setup and enable call-graph (stack chain/backtrace) recording,
+ implies -g.
+
+--children::
+	Accumulate callchain of children to parent entry so that they can
+ show up in the output. The output will have a new "Children" column
+ and will be sorted on the data. It requires -g/--call-graph option
+ enabled.
+
+--max-stack::
+ Set the stack depth limit when parsing the callchain, anything
+ beyond the specified depth will be ignored. This is a trade-off
+ between information loss and faster processing especially for
+ workloads that can have a very long callchain stack.
+
+ Default: 127
+
+--ignore-callees=<regex>::
+ Ignore callees of the function(s) matching the given regex.
+ This has the effect of collecting the callers of each such
+ function into one place in the call-graph tree.
+
+--percent-limit::
+ Do not show entries which have an overhead under that percent.
+ (Default: 0).
+
+--percentage::
+ Determine how to display the overhead percentage of filtered entries.
+ Filters can be applied by --comms, --dsos and/or --symbols options and
+ Zoom operations on the TUI (thread, dso, etc).
+
+ "relative" means it's relative to filtered entries only so that the
+ sum of shown entries will be always 100%. "absolute" means it retains
+ the original value before and after the filter is applied.
+
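A small worked example of the difference (illustrative arithmetic only, not perf source code): suppose three symbols have 60, 30 and 10 samples and a filter keeps only the last two. The symbol names below are made up for the example.

----
# Illustrative arithmetic for --percentage; symbol names are hypothetical.
all_samples      = {"sym_a": 60, "sym_b": 30, "sym_c": 10}
filtered_samples = {"sym_b": 30, "sym_c": 10}      # e.g. after a --symbols filter

total_all      = sum(all_samples.values())          # 100
total_filtered = sum(filtered_samples.values())     # 40

for sym, n in filtered_samples.items():
    relative = 100.0 * n / total_filtered            # shown entries sum to 100%
    absolute = 100.0 * n / total_all                 # keeps the pre-filter value
    print "%-6s relative: %5.1f%%  absolute: %5.1f%%" % (sym, relative, absolute)
----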
INTERACTIVE PROMPTING KEYS
--------------------------
@@ -108,9 +217,6 @@ INTERACTIVE PROMPTING KEYS
[S]::
Stop annotation, return to full profile display.
-[w]::
- Toggle between weighted sum and individual count[E]r profile.
-
[z]::
Toggle event count zeroing across display updates.
@@ -122,4 +228,4 @@ Pressing any unmapped key displays a menu, and prompts for input.
SEE ALSO
--------
-linkperf:perf-stat[1], linkperf:perf-list[1]
+linkperf:perf-stat[1], linkperf:perf-list[1], linkperf:perf-report[1]
diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
index 8879299cd9d..fae38d9a44a 100644
--- a/tools/perf/Documentation/perf-trace.txt
+++ b/tools/perf/Documentation/perf-trace.txt
@@ -1,70 +1,112 @@
perf-trace(1)
-==============
+=============
NAME
----
-perf-trace - Read perf.data (created by perf record) and display trace output
+perf-trace - strace inspired tool
SYNOPSIS
--------
[verse]
-'perf trace' {record <script> | report <script> [args] }
+'perf trace'
+'perf trace record'
DESCRIPTION
-----------
-This command reads the input file and displays the trace recorded.
+This command shows the events associated with the target: initially
+syscalls, but also other system events like pagefaults, task lifetime events,
+scheduling events, etc.
-There are several variants of perf trace:
+This is a live mode tool in addition to working with perf.data files like
+the other perf tools. Files can be generated using the 'perf record' command
+but the session needs to include the raw_syscalls events (-e 'raw_syscalls:*').
+Alternatively, 'perf trace record' can be used as a shortcut to
+automatically include the raw_syscalls events when writing events to a file.
- 'perf trace' to see a detailed trace of the workload that was
- recorded.
+The following options apply to perf trace; options to perf trace record are
+found in the perf record man page.
- You can also run a set of pre-canned scripts that aggregate and
- summarize the raw trace data in various ways (the list of scripts is
- available via 'perf trace -l'). The following variants allow you to
- record and run those scripts:
+OPTIONS
+-------
- 'perf trace record <script>' to record the events required for 'perf
- trace report'. <script> is the name displayed in the output of
- 'perf trace --list' i.e. the actual script name minus any language
- extension.
+-a::
+--all-cpus::
+ System-wide collection from all CPUs.
- 'perf trace report <script>' to run and display the results of
- <script>. <script> is the name displayed in the output of 'perf
- trace --list' i.e. the actual script name minus any language
- extension. The perf.data output from a previous run of 'perf trace
- record <script>' is used and should be present for this command to
- succeed.
+-e::
+--expr::
+ List of events to show, currently only syscall names.
+ Prefixing with ! shows all syscalls but the ones specified. You may
+ need to escape it.
- See the 'SEE ALSO' section for links to language-specific
- information on how to write and run your own trace scripts.
+-o::
+--output=::
+ Output file name.
-OPTIONS
--------
--D::
---dump-raw-trace=::
- Display verbose dump of the trace data.
+-p::
+--pid=::
+ Record events on existing process ID (comma separated list).
+
+-t::
+--tid=::
+ Record events on existing thread ID (comma separated list).
+
+-u::
+--uid=::
+ Record events in threads owned by uid. Name or number.
+
+-v::
+--verbose=::
+ Verbosity level.
+
+-i::
+--no-inherit::
+ Child tasks do not inherit counters.
+
+-m::
+--mmap-pages=::
+ Number of mmap data pages (must be a power of two) or size
+ specification with appended unit character - B/K/M/G. The
+	size is rounded up to the nearest power-of-two number of pages.
+
+-C::
+--cpu::
+Collect samples only on the list of CPUs provided. Multiple CPUs can be provided as a
+comma-separated list with no space: 0,1. Ranges of CPUs are specified with -: 0-2.
+In per-thread mode with inheritance mode on (default), events are captured only when
+the thread executes on the designated CPUs. Default is to monitor all CPUs.
+
+--duration::
+	Show only events that had a duration greater than N.M ms.
+
+--sched::
+	Accrue thread runtime and provide a summary at the end of the session.
+
+-i::
+--input::
+	Process events from a given perf data file.
+
+-T::
+--time::
+	Print the full timestamp rather than the time relative to the first sample.
--L::
---Latency=::
- Show latency attributes (irqs/preemption disabled, etc).
+--comm::
+ Show process COMM right beside its ID, on by default, disable with --no-comm.
--l::
---list=::
- Display a list of available trace scripts.
+-s::
+--summary::
+ Show only a summary of syscalls by thread with min, max, and average times
+ (in msec) and relative stddev.
--s ['lang']::
---script=::
- Process trace data with the given script ([lang]:script[.ext]).
- If the string 'lang' is specified in place of a script name, a
- list of supported languages will be displayed instead.
+-S::
+--with-summary::
+ Show all syscalls followed by a summary by thread with min, max, and
+ average times (in msec) and relative stddev.
--g::
---gen-script=::
- Generate perf-trace.[ext] starter script for given language,
- using current perf.data.
+--tool_stats::
+	Show tool stats such as the number of times fd->pathname was discovered through
+ hooking the open syscall return + vfs_getname or via reading /proc/pid/fd, etc.
SEE ALSO
--------
-linkperf:perf-record[1], linkperf:perf-trace-perl[1],
-linkperf:perf-trace-python[1]
+linkperf:perf-record[1], linkperf:perf-script[1]
diff --git a/tools/perf/Documentation/perfconfig.example b/tools/perf/Documentation/perfconfig.example
new file mode 100644
index 00000000000..767ea2436e1
--- /dev/null
+++ b/tools/perf/Documentation/perfconfig.example
@@ -0,0 +1,29 @@
+[colors]
+
+ # These were the old defaults
+ top = red, lightgray
+ medium = green, lightgray
+ normal = black, lightgray
+ selected = lightgray, magenta
+ code = blue, lightgray
+ addr = magenta, lightgray
+
+[tui]
+
+ # Defaults if linked with libslang
+ report = on
+ annotate = on
+ top = on
+
+[buildid]
+
+ # Default, disable using /dev/null
+ dir = /root/.debug
+
+[annotate]
+
+ # Defaults
+ hide_src_code = false
+ use_offset = true
+ jump_arrows = true
+ show_nr_jumps = false
diff --git a/tools/perf/MANIFEST b/tools/perf/MANIFEST
new file mode 100644
index 00000000000..45da209b6ed
--- /dev/null
+++ b/tools/perf/MANIFEST
@@ -0,0 +1,39 @@
+tools/perf
+tools/scripts
+tools/lib/traceevent
+tools/lib/api
+tools/lib/symbol/kallsyms.c
+tools/lib/symbol/kallsyms.h
+tools/include/asm/bug.h
+tools/include/linux/compiler.h
+tools/include/linux/hash.h
+tools/include/linux/export.h
+tools/include/linux/types.h
+include/linux/const.h
+include/linux/perf_event.h
+include/linux/rbtree.h
+include/linux/list.h
+include/linux/hash.h
+include/linux/stringify.h
+lib/rbtree.c
+include/linux/swab.h
+arch/*/include/asm/unistd*.h
+arch/*/include/asm/perf_regs.h
+arch/*/include/uapi/asm/unistd*.h
+arch/*/include/uapi/asm/perf_regs.h
+arch/*/lib/memcpy*.S
+arch/*/lib/memset*.S
+include/linux/poison.h
+include/linux/magic.h
+include/linux/hw_breakpoint.h
+include/linux/rbtree_augmented.h
+include/uapi/linux/perf_event.h
+include/uapi/linux/const.h
+include/uapi/linux/swab.h
+include/uapi/linux/hw_breakpoint.h
+arch/x86/include/asm/svm.h
+arch/x86/include/asm/vmx.h
+arch/x86/include/asm/kvm_host.h
+arch/x86/include/uapi/asm/svm.h
+arch/x86/include/uapi/asm/vmx.h
+arch/x86/include/uapi/asm/kvm.h
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index bc0f670a833..cb2e5868c8e 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -1,1181 +1,90 @@
-# The default target of this Makefile is...
-all::
-
-# Define V=1 to have a more verbose compile.
-# Define V=2 to have an even more verbose compile.
-#
-# Define SNPRINTF_RETURNS_BOGUS if your are on a system which snprintf()
-# or vsnprintf() return -1 instead of number of characters which would
-# have been written to the final string if enough space had been available.
-#
-# Define FREAD_READS_DIRECTORIES if your are on a system which succeeds
-# when attempting to read from an fopen'ed directory.
-#
-# Define NO_OPENSSL environment variable if you do not have OpenSSL.
-# This also implies MOZILLA_SHA1.
-#
-# Define CURLDIR=/foo/bar if your curl header and library files are in
-# /foo/bar/include and /foo/bar/lib directories.
-#
-# Define EXPATDIR=/foo/bar if your expat header and library files are in
-# /foo/bar/include and /foo/bar/lib directories.
-#
-# Define NO_D_INO_IN_DIRENT if you don't have d_ino in your struct dirent.
-#
-# Define NO_D_TYPE_IN_DIRENT if your platform defines DT_UNKNOWN but lacks
-# d_type in struct dirent (latest Cygwin -- will be fixed soonish).
-#
-# Define NO_C99_FORMAT if your formatted IO functions (printf/scanf et.al.)
-# do not support the 'size specifiers' introduced by C99, namely ll, hh,
-# j, z, t. (representing long long int, char, intmax_t, size_t, ptrdiff_t).
-# some C compilers supported these specifiers prior to C99 as an extension.
-#
-# Define NO_STRCASESTR if you don't have strcasestr.
-#
-# Define NO_MEMMEM if you don't have memmem.
-#
-# Define NO_STRTOUMAX if you don't have strtoumax in the C library.
-# If your compiler also does not support long long or does not have
-# strtoull, define NO_STRTOULL.
-#
-# Define NO_SETENV if you don't have setenv in the C library.
-#
-# Define NO_UNSETENV if you don't have unsetenv in the C library.
-#
-# Define NO_MKDTEMP if you don't have mkdtemp in the C library.
-#
-# Define NO_SYS_SELECT_H if you don't have sys/select.h.
-#
-# Define NO_SYMLINK_HEAD if you never want .perf/HEAD to be a symbolic link.
-# Enable it on Windows. By default, symrefs are still used.
-#
-# Define NO_SVN_TESTS if you want to skip time-consuming SVN interoperability
-# tests. These tests take up a significant amount of the total test time
-# but are not needed unless you plan to talk to SVN repos.
-#
-# Define NO_FINK if you are building on Darwin/Mac OS X, have Fink
-# installed in /sw, but don't want PERF to link against any libraries
-# installed there. If defined you may specify your own (or Fink's)
-# include directories and library directories by defining CFLAGS
-# and LDFLAGS appropriately.
-#
-# Define NO_DARWIN_PORTS if you are building on Darwin/Mac OS X,
-# have DarwinPorts installed in /opt/local, but don't want PERF to
-# link against any libraries installed there. If defined you may
-# specify your own (or DarwinPort's) include directories and
-# library directories by defining CFLAGS and LDFLAGS appropriately.
-#
-# Define PPC_SHA1 environment variable when running make to make use of
-# a bundled SHA1 routine optimized for PowerPC.
-#
-# Define ARM_SHA1 environment variable when running make to make use of
-# a bundled SHA1 routine optimized for ARM.
-#
-# Define MOZILLA_SHA1 environment variable when running make to make use of
-# a bundled SHA1 routine coming from Mozilla. It is GPL'd and should be fast
-# on non-x86 architectures (e.g. PowerPC), while the OpenSSL version (default
-# choice) has very fast version optimized for i586.
-#
-# Define NEEDS_SSL_WITH_CRYPTO if you need -lcrypto with -lssl (Darwin).
-#
-# Define NEEDS_LIBICONV if linking with libc is not enough (Darwin).
-#
-# Define NEEDS_SOCKET if linking with libc is not enough (SunOS,
-# Patrick Mauritz).
-#
-# Define NO_MMAP if you want to avoid mmap.
-#
-# Define NO_PTHREADS if you do not have or do not want to use Pthreads.
-#
-# Define NO_PREAD if you have a problem with pread() system call (e.g.
-# cygwin.dll before v1.5.22).
-#
-# Define NO_FAST_WORKING_DIRECTORY if accessing objects in pack files is
-# generally faster on your platform than accessing the working directory.
-#
-# Define NO_TRUSTABLE_FILEMODE if your filesystem may claim to support
-# the executable mode bit, but doesn't really do so.
-#
-# Define NO_IPV6 if you lack IPv6 support and getaddrinfo().
-#
-# Define NO_SOCKADDR_STORAGE if your platform does not have struct
-# sockaddr_storage.
-#
-# Define NO_ICONV if your libc does not properly support iconv.
-#
-# Define OLD_ICONV if your library has an old iconv(), where the second
-# (input buffer pointer) parameter is declared with type (const char **).
#
-# Define NO_DEFLATE_BOUND if your zlib does not have deflateBound.
+# This is a simple wrapper Makefile that calls the main Makefile.perf
+# with a -j option to do parallel builds
#
-# Define NO_R_TO_GCC_LINKER if your gcc does not like "-R/path/lib"
-# that tells runtime paths to dynamic libraries;
-# "-Wl,-rpath=/path/lib" is used instead.
+# If you want to invoke the perf build in some non-standard way then
+# you can use the 'make -f Makefile.perf' method to invoke it.
#
-# Define USE_NSEC below if you want perf to care about sub-second file mtimes
-# and ctimes. Note that you need recent glibc (at least 2.2.4) for this, and
-# it will BREAK YOUR LOCAL DIFFS! show-diff and anything using it will likely
-# randomly break unless your underlying filesystem supports those sub-second
-# times (my ext3 doesn't).
-#
-# Define USE_ST_TIMESPEC if your "struct stat" uses "st_ctimespec" instead of
-# "st_ctim"
-#
-# Define NO_NSEC if your "struct stat" does not have "st_ctim.tv_nsec"
-# available. This automatically turns USE_NSEC off.
-#
-# Define USE_STDEV below if you want perf to care about the underlying device
-# change being considered an inode change from the update-index perspective.
-#
-# Define NO_ST_BLOCKS_IN_STRUCT_STAT if your platform does not have st_blocks
-# field that counts the on-disk footprint in 512-byte blocks.
-#
-# Define ASCIIDOC8 if you want to format documentation with AsciiDoc 8
-#
-# Define DOCBOOK_XSL_172 if you want to format man pages with DocBook XSL v1.72.
-#
-# Define NO_PERL_MAKEMAKER if you cannot use Makefiles generated by perl's
-# MakeMaker (e.g. using ActiveState under Cygwin).
-#
-# Define NO_PERL if you do not want Perl scripts or libraries at all.
-#
-# Define INTERNAL_QSORT to use Git's implementation of qsort(), which
-# is a simplified version of the merge sort used in glibc. This is
-# recommended if Git triggers O(n^2) behavior in your platform's qsort().
-#
-# Define NO_EXTERNAL_GREP if you don't want "perf grep" to ever call
-# your external grep (e.g., if your system lacks grep, if its grep is
-# broken, or spawning external process is slower than built-in grep perf has).
-#
-# Define LDFLAGS=-static to build a static binary.
-#
-# Define EXTRA_CFLAGS=-m64 or EXTRA_CFLAGS=-m32 as appropriate for cross-builds.
-
-PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE
- @$(SHELL_PATH) util/PERF-VERSION-GEN
--include PERF-VERSION-FILE
-
-uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not')
-uname_M := $(shell sh -c 'uname -m 2>/dev/null || echo not')
-uname_O := $(shell sh -c 'uname -o 2>/dev/null || echo not')
-uname_R := $(shell sh -c 'uname -r 2>/dev/null || echo not')
-uname_P := $(shell sh -c 'uname -p 2>/dev/null || echo not')
-uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not')
-
-# CFLAGS and LDFLAGS are for the users to override from the command line.
#
-# Include saner warnings here, which can catch bugs:
+# Clear out the built-in rules GNU make defines by default (such as .o targets),
+# so that we pass through all targets to Makefile.perf:
#
-
-EXTRA_WARNINGS := -Wformat
-EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wformat-security
-EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wformat-y2k
-EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wshadow
-EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Winit-self
-EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wpacked
-EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wredundant-decls
-EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wstack-protector
-EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wstrict-aliasing=3
-EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wswitch-default
-EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wswitch-enum
-EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wno-system-headers
-EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wundef
-EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wvolatile-register-var
-EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wwrite-strings
-EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wbad-function-cast
-EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wmissing-declarations
-EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wmissing-prototypes
-EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wnested-externs
-EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wold-style-definition
-EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wstrict-prototypes
-EXTRA_WARNINGS := $(EXTRA_WARNINGS) -Wdeclaration-after-statement
-
-ifeq ("$(origin DEBUG)", "command line")
- PERF_DEBUG = $(DEBUG)
-endif
-ifndef PERF_DEBUG
- CFLAGS_OPTIMIZE = -O6
-endif
-
-CFLAGS = -ggdb3 -Wall -Wextra -std=gnu99 -Werror $(CFLAGS_OPTIMIZE) -D_FORTIFY_SOURCE=2 $(EXTRA_WARNINGS) $(EXTRA_CFLAGS)
-EXTLIBS = -lpthread -lrt -lelf -lm
-ALL_CFLAGS = $(CFLAGS) -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64
-ALL_LDFLAGS = $(LDFLAGS)
-STRIP ?= strip
-
-# Among the variables below, these:
-# perfexecdir
-# template_dir
-# mandir
-# infodir
-# htmldir
-# ETC_PERFCONFIG (but not sysconfdir)
-# can be specified as a relative path some/where/else;
-# this is interpreted as relative to $(prefix) and "perf" at
-# runtime figures out where they are based on the path to the executable.
-# This can help installing the suite in a relocatable way.
-
-# Make the path relative to DESTDIR, not to prefix
-ifndef DESTDIR
-prefix = $(HOME)
-endif
-bindir_relative = bin
-bindir = $(prefix)/$(bindir_relative)
-mandir = share/man
-infodir = share/info
-perfexecdir = libexec/perf-core
-sharedir = $(prefix)/share
-template_dir = share/perf-core/templates
-htmldir = share/doc/perf-doc
-ifeq ($(prefix),/usr)
-sysconfdir = /etc
-ETC_PERFCONFIG = $(sysconfdir)/perfconfig
-else
-sysconfdir = $(prefix)/etc
-ETC_PERFCONFIG = etc/perfconfig
-endif
-lib = lib
-
-export prefix bindir sharedir sysconfdir
-
-CC = $(CROSS_COMPILE)gcc
-AR = $(CROSS_COMPILE)ar
-RM = rm -f
-TAR = tar
-FIND = find
-INSTALL = install
-RPMBUILD = rpmbuild
-PTHREAD_LIBS = -lpthread
-
-# sparse is architecture-neutral, which means that we need to tell it
-# explicitly what architecture to check for. Fix this up for yours..
-SPARSE_FLAGS = -D__BIG_ENDIAN__ -D__powerpc__
-
-ifeq ($(V), 2)
- QUIET_STDERR = ">/dev/null"
-else
- QUIET_STDERR = ">/dev/null 2>&1"
-endif
-
-BITBUCKET = "/dev/null"
-
-ifneq ($(shell sh -c "(echo '\#include <stdio.h>'; echo 'int main(void) { return puts(\"hi\"); }') | $(CC) -x c - $(ALL_CFLAGS) -o $(BITBUCKET) "$(QUIET_STDERR)" && echo y"), y)
- BITBUCKET = .perf.dev.null
-endif
-
-ifeq ($(shell sh -c "echo 'int foo(void) {char X[2]; return 3;}' | $(CC) -x c -c -Werror -fstack-protector-all - -o $(BITBUCKET) "$(QUIET_STDERR)" && echo y"), y)
- CFLAGS := $(CFLAGS) -fstack-protector-all
-endif
-
-
-### --- END CONFIGURATION SECTION ---
-
-# Those must not be GNU-specific; they are shared with perl/ which may
-# be built by a different compiler. (Note that this is an artifact now
-# but it still might be nice to keep that distinction.)
-BASIC_CFLAGS = -Iutil/include
-BASIC_LDFLAGS =
-
-# Guard against environment variables
-BUILTIN_OBJS =
-BUILT_INS =
-COMPAT_CFLAGS =
-COMPAT_OBJS =
-LIB_H =
-LIB_OBJS =
-SCRIPT_PERL =
-SCRIPT_SH =
-TEST_PROGRAMS =
-
-SCRIPT_SH += perf-archive.sh
+.SUFFIXES:
#
-# No Perl scripts right now:
+# We don't want to pass along options like -j:
#
-
-# SCRIPT_PERL += perf-add--interactive.perl
-
-SCRIPTS = $(patsubst %.sh,%,$(SCRIPT_SH)) \
- $(patsubst %.perl,%,$(SCRIPT_PERL))
-
-# Empty...
-EXTRA_PROGRAMS =
-
-# ... and all the rest that could be moved out of bindir to perfexecdir
-PROGRAMS += $(EXTRA_PROGRAMS)
+unexport MAKEFLAGS
#
-# Single 'perf' binary right now:
+# Do a parallel build with multiple jobs, based on the number of CPUs online
+# in this system: 'make -j8' on an 8-CPU system, etc.
#
-PROGRAMS += perf
-
-# List built-in command $C whose implementation cmd_$C() is not in
-# builtin-$C.o but is linked in as part of some other command.
+# (To override it, run 'make JOBS=1' and similar.)
#
-
-# what 'all' will build and 'install' will install, in perfexecdir
-ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS)
-
-# what 'all' will build but not install in perfexecdir
-OTHER_PROGRAMS = perf$X
-
-# Set paths to tools early so that they can be used for version tests.
-ifndef SHELL_PATH
- SHELL_PATH = /bin/sh
-endif
-ifndef PERL_PATH
- PERL_PATH = /usr/bin/perl
+ifeq ($(JOBS),)
+ JOBS := $(shell grep -c ^processor /proc/cpuinfo 2>/dev/null)
+ ifeq ($(JOBS),)
+ JOBS := 1
+ endif
endif
-export PERL_PATH
-
-LIB_FILE=libperf.a
-
-LIB_H += ../../include/linux/perf_event.h
-LIB_H += ../../include/linux/rbtree.h
-LIB_H += ../../include/linux/list.h
-LIB_H += ../../include/linux/hash.h
-LIB_H += ../../include/linux/stringify.h
-LIB_H += util/include/linux/bitmap.h
-LIB_H += util/include/linux/bitops.h
-LIB_H += util/include/linux/compiler.h
-LIB_H += util/include/linux/ctype.h
-LIB_H += util/include/linux/kernel.h
-LIB_H += util/include/linux/list.h
-LIB_H += util/include/linux/module.h
-LIB_H += util/include/linux/poison.h
-LIB_H += util/include/linux/prefetch.h
-LIB_H += util/include/linux/rbtree.h
-LIB_H += util/include/linux/string.h
-LIB_H += util/include/linux/types.h
-LIB_H += util/include/asm/asm-offsets.h
-LIB_H += util/include/asm/bitops.h
-LIB_H += util/include/asm/bug.h
-LIB_H += util/include/asm/byteorder.h
-LIB_H += util/include/asm/swab.h
-LIB_H += util/include/asm/system.h
-LIB_H += util/include/asm/uaccess.h
-LIB_H += perf.h
-LIB_H += util/cache.h
-LIB_H += util/callchain.h
-LIB_H += util/build-id.h
-LIB_H += util/debug.h
-LIB_H += util/debugfs.h
-LIB_H += util/event.h
-LIB_H += util/exec_cmd.h
-LIB_H += util/types.h
-LIB_H += util/levenshtein.h
-LIB_H += util/map.h
-LIB_H += util/parse-options.h
-LIB_H += util/parse-events.h
-LIB_H += util/quote.h
-LIB_H += util/util.h
-LIB_H += util/header.h
-LIB_H += util/help.h
-LIB_H += util/session.h
-LIB_H += util/strbuf.h
-LIB_H += util/string.h
-LIB_H += util/strlist.h
-LIB_H += util/svghelper.h
-LIB_H += util/run-command.h
-LIB_H += util/sigchain.h
-LIB_H += util/symbol.h
-LIB_H += util/color.h
-LIB_H += util/values.h
-LIB_H += util/sort.h
-LIB_H += util/hist.h
-LIB_H += util/thread.h
-LIB_H += util/trace-event.h
-LIB_H += util/probe-finder.h
-LIB_H += util/probe-event.h
-LIB_H += util/cpumap.h
-
-LIB_OBJS += util/abspath.o
-LIB_OBJS += util/alias.o
-LIB_OBJS += util/build-id.o
-LIB_OBJS += util/config.o
-LIB_OBJS += util/ctype.o
-LIB_OBJS += util/debugfs.o
-LIB_OBJS += util/environment.o
-LIB_OBJS += util/event.o
-LIB_OBJS += util/exec_cmd.o
-LIB_OBJS += util/help.o
-LIB_OBJS += util/levenshtein.o
-LIB_OBJS += util/parse-options.o
-LIB_OBJS += util/parse-events.o
-LIB_OBJS += util/path.o
-LIB_OBJS += util/rbtree.o
-LIB_OBJS += util/bitmap.o
-LIB_OBJS += util/hweight.o
-LIB_OBJS += util/find_next_bit.o
-LIB_OBJS += util/run-command.o
-LIB_OBJS += util/quote.o
-LIB_OBJS += util/strbuf.o
-LIB_OBJS += util/string.o
-LIB_OBJS += util/strlist.o
-LIB_OBJS += util/usage.o
-LIB_OBJS += util/wrapper.o
-LIB_OBJS += util/sigchain.o
-LIB_OBJS += util/symbol.o
-LIB_OBJS += util/color.o
-LIB_OBJS += util/pager.o
-LIB_OBJS += util/header.o
-LIB_OBJS += util/callchain.o
-LIB_OBJS += util/values.o
-LIB_OBJS += util/debug.o
-LIB_OBJS += util/map.o
-LIB_OBJS += util/session.o
-LIB_OBJS += util/thread.o
-LIB_OBJS += util/trace-event-parse.o
-LIB_OBJS += util/trace-event-read.o
-LIB_OBJS += util/trace-event-info.o
-LIB_OBJS += util/trace-event-scripting.o
-LIB_OBJS += util/svghelper.o
-LIB_OBJS += util/sort.o
-LIB_OBJS += util/hist.o
-LIB_OBJS += util/probe-event.o
-LIB_OBJS += util/util.o
-LIB_OBJS += util/cpumap.o
-
-BUILTIN_OBJS += builtin-annotate.o
-
-BUILTIN_OBJS += builtin-bench.o
-
-# Benchmark modules
-BUILTIN_OBJS += bench/sched-messaging.o
-BUILTIN_OBJS += bench/sched-pipe.o
-BUILTIN_OBJS += bench/mem-memcpy.o
-
-BUILTIN_OBJS += builtin-diff.o
-BUILTIN_OBJS += builtin-help.o
-BUILTIN_OBJS += builtin-sched.o
-BUILTIN_OBJS += builtin-buildid-list.o
-BUILTIN_OBJS += builtin-buildid-cache.o
-BUILTIN_OBJS += builtin-list.o
-BUILTIN_OBJS += builtin-record.o
-BUILTIN_OBJS += builtin-report.o
-BUILTIN_OBJS += builtin-stat.o
-BUILTIN_OBJS += builtin-timechart.o
-BUILTIN_OBJS += builtin-top.o
-BUILTIN_OBJS += builtin-trace.o
-BUILTIN_OBJS += builtin-probe.o
-BUILTIN_OBJS += builtin-kmem.o
-BUILTIN_OBJS += builtin-lock.o
-
-PERFLIBS = $(LIB_FILE)
-
#
-# Platform specific tweaks
+# Only pass canonical directory names as the output directory:
#
-
-# We choose to avoid "if .. else if .. else .. endif endif"
-# because maintaining the nesting to match is a pain. If
-# we had "elif" things would have been much nicer...
-
--include config.mak.autogen
--include config.mak
-
-ifeq ($(uname_S),Darwin)
- ifndef NO_FINK
- ifeq ($(shell test -d /sw/lib && echo y),y)
- BASIC_CFLAGS += -I/sw/include
- BASIC_LDFLAGS += -L/sw/lib
- endif
- endif
- ifndef NO_DARWIN_PORTS
- ifeq ($(shell test -d /opt/local/lib && echo y),y)
- BASIC_CFLAGS += -I/opt/local/include
- BASIC_LDFLAGS += -L/opt/local/lib
- endif
- endif
- PTHREAD_LIBS =
-endif
-
-ifeq ($(shell sh -c "(echo '\#include <libelf.h>'; echo 'int main(void) { Elf * elf = elf_begin(0, ELF_C_READ, 0); return (long)elf; }') | $(CC) -x c - $(ALL_CFLAGS) -o $(BITBUCKET) $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
-ifneq ($(shell sh -c "(echo '\#include <gnu/libc-version.h>'; echo 'int main(void) { const char * version = gnu_get_libc_version(); return (long)version; }') | $(CC) -x c - $(ALL_CFLAGS) -o $(BITBUCKET) $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
- msg := $(error No gnu/libc-version.h found, please install glibc-dev[el]/glibc-static);
-endif
-
- ifneq ($(shell sh -c "(echo '\#include <libelf.h>'; echo 'int main(void) { Elf * elf = elf_begin(0, ELF_C_READ_MMAP, 0); return (long)elf; }') | $(CC) -x c - $(ALL_CFLAGS) -o $(BITBUCKET) $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
- BASIC_CFLAGS += -DLIBELF_NO_MMAP
- endif
-else
- msg := $(error No libelf.h/libelf found, please install libelf-dev/elfutils-libelf-devel and glibc-dev[el]);
-endif
-
-ifneq ($(shell sh -c "(echo '\#include <dwarf.h>'; echo '\#include <libdw.h>'; echo 'int main(void) { Dwarf *dbg; dbg = dwarf_begin(0, DWARF_C_READ); return (long)dbg; }') | $(CC) -x c - $(ALL_CFLAGS) -I/usr/include/elfutils -ldw -lelf -o $(BITBUCKET) $(ALL_LDFLAGS) $(EXTLIBS) "$(QUIET_STDERR)" && echo y"), y)
- msg := $(warning No libdw.h found or old libdw.h found, disables dwarf support. Please install elfutils-devel/elfutils-dev);
- BASIC_CFLAGS += -DNO_DWARF_SUPPORT
-else
- BASIC_CFLAGS += -I/usr/include/elfutils
- EXTLIBS += -lelf -ldw
- LIB_OBJS += util/probe-finder.o
-endif
-
-ifndef NO_LIBPERL
-PERL_EMBED_LDOPTS = `perl -MExtUtils::Embed -e ldopts 2>/dev/null`
-PERL_EMBED_CCOPTS = `perl -MExtUtils::Embed -e ccopts 2>/dev/null`
-endif
-
-ifneq ($(shell sh -c "(echo '\#include <EXTERN.h>'; echo '\#include <perl.h>'; echo 'int main(void) { perl_alloc(); return 0; }') | $(CC) -x c - $(PERL_EMBED_CCOPTS) -o $(BITBUCKET) $(PERL_EMBED_LDOPTS) > /dev/null 2>&1 && echo y"), y)
- BASIC_CFLAGS += -DNO_LIBPERL
-else
- ALL_LDFLAGS += $(PERL_EMBED_LDOPTS)
- LIB_OBJS += util/scripting-engines/trace-event-perl.o
- LIB_OBJS += scripts/perl/Perf-Trace-Util/Context.o
-endif
-
-ifndef NO_LIBPYTHON
-PYTHON_EMBED_LDOPTS = `python-config --ldflags 2>/dev/null`
-PYTHON_EMBED_CCOPTS = `python-config --cflags 2>/dev/null`
-endif
-
-ifneq ($(shell sh -c "(echo '\#include <Python.h>'; echo 'int main(void) { Py_Initialize(); return 0; }') | $(CC) -x c - $(PYTHON_EMBED_CCOPTS) -o /dev/null $(PYTHON_EMBED_LDOPTS) > /dev/null 2>&1 && echo y"), y)
- BASIC_CFLAGS += -DNO_LIBPYTHON
-else
- ALL_LDFLAGS += $(PYTHON_EMBED_LDOPTS)
- LIB_OBJS += util/scripting-engines/trace-event-python.o
- LIB_OBJS += scripts/python/Perf-Trace-Util/Context.o
-endif
-
-ifdef NO_DEMANGLE
- BASIC_CFLAGS += -DNO_DEMANGLE
-else
- has_bfd := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o $(BITBUCKET) $(ALL_LDFLAGS) $(EXTLIBS) -lbfd "$(QUIET_STDERR)" && echo y")
-
- ifeq ($(has_bfd),y)
- EXTLIBS += -lbfd
- else
- has_bfd_iberty := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o $(BITBUCKET) $(ALL_LDFLAGS) $(EXTLIBS) -lbfd -liberty "$(QUIET_STDERR)" && echo y")
- ifeq ($(has_bfd_iberty),y)
- EXTLIBS += -lbfd -liberty
- else
- has_bfd_iberty_z := $(shell sh -c "(echo '\#include <bfd.h>'; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o $(BITBUCKET) $(ALL_LDFLAGS) $(EXTLIBS) -lbfd -liberty -lz "$(QUIET_STDERR)" && echo y")
- ifeq ($(has_bfd_iberty_z),y)
- EXTLIBS += -lbfd -liberty -lz
- else
- has_cplus_demangle := $(shell sh -c "(echo 'extern char *cplus_demangle(const char *, int);'; echo 'int main(void) { cplus_demangle(0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o $(BITBUCKET) $(ALL_LDFLAGS) $(EXTLIBS) -liberty "$(QUIET_STDERR)" && echo y")
- ifeq ($(has_cplus_demangle),y)
- EXTLIBS += -liberty
- BASIC_CFLAGS += -DHAVE_CPLUS_DEMANGLE
- else
- msg := $(warning No bfd.h/libbfd found, install binutils-dev[el]/zlib-static to gain symbol demangling)
- BASIC_CFLAGS += -DNO_DEMANGLE
- endif
- endif
- endif
- endif
-endif
-
-ifndef CC_LD_DYNPATH
- ifdef NO_R_TO_GCC_LINKER
- # Some gcc does not accept and pass -R to the linker to specify
- # the runtime dynamic library path.
- CC_LD_DYNPATH = -Wl,-rpath,
- else
- CC_LD_DYNPATH = -R
- endif
-endif
-
-ifdef NEEDS_SOCKET
- EXTLIBS += -lsocket
-endif
-ifdef NEEDS_NSL
- EXTLIBS += -lnsl
-endif
-ifdef NO_D_TYPE_IN_DIRENT
- BASIC_CFLAGS += -DNO_D_TYPE_IN_DIRENT
-endif
-ifdef NO_D_INO_IN_DIRENT
- BASIC_CFLAGS += -DNO_D_INO_IN_DIRENT
-endif
-ifdef NO_ST_BLOCKS_IN_STRUCT_STAT
- BASIC_CFLAGS += -DNO_ST_BLOCKS_IN_STRUCT_STAT
-endif
-ifdef USE_NSEC
- BASIC_CFLAGS += -DUSE_NSEC
-endif
-ifdef USE_ST_TIMESPEC
- BASIC_CFLAGS += -DUSE_ST_TIMESPEC
-endif
-ifdef NO_NSEC
- BASIC_CFLAGS += -DNO_NSEC
-endif
-ifdef NO_C99_FORMAT
- BASIC_CFLAGS += -DNO_C99_FORMAT
-endif
-ifdef SNPRINTF_RETURNS_BOGUS
- COMPAT_CFLAGS += -DSNPRINTF_RETURNS_BOGUS
- COMPAT_OBJS += compat/snprintf.o
-endif
-ifdef FREAD_READS_DIRECTORIES
- COMPAT_CFLAGS += -DFREAD_READS_DIRECTORIES
- COMPAT_OBJS += compat/fopen.o
-endif
-ifdef NO_SYMLINK_HEAD
- BASIC_CFLAGS += -DNO_SYMLINK_HEAD
-endif
-ifdef NO_STRCASESTR
- COMPAT_CFLAGS += -DNO_STRCASESTR
- COMPAT_OBJS += compat/strcasestr.o
-endif
-ifdef NO_STRTOUMAX
- COMPAT_CFLAGS += -DNO_STRTOUMAX
- COMPAT_OBJS += compat/strtoumax.o
-endif
-ifdef NO_STRTOULL
- COMPAT_CFLAGS += -DNO_STRTOULL
-endif
-ifdef NO_SETENV
- COMPAT_CFLAGS += -DNO_SETENV
- COMPAT_OBJS += compat/setenv.o
+ifneq ($(O),)
+ FULL_O := $(shell readlink -f $(O) || echo $(O))
endif
-ifdef NO_MKDTEMP
- COMPAT_CFLAGS += -DNO_MKDTEMP
- COMPAT_OBJS += compat/mkdtemp.o
-endif
-ifdef NO_UNSETENV
- COMPAT_CFLAGS += -DNO_UNSETENV
- COMPAT_OBJS += compat/unsetenv.o
-endif
-ifdef NO_SYS_SELECT_H
- BASIC_CFLAGS += -DNO_SYS_SELECT_H
-endif
-ifdef NO_MMAP
- COMPAT_CFLAGS += -DNO_MMAP
- COMPAT_OBJS += compat/mmap.o
-else
- ifdef USE_WIN32_MMAP
- COMPAT_CFLAGS += -DUSE_WIN32_MMAP
- COMPAT_OBJS += compat/win32mmap.o
- endif
-endif
-ifdef NO_PREAD
- COMPAT_CFLAGS += -DNO_PREAD
- COMPAT_OBJS += compat/pread.o
-endif
-ifdef NO_FAST_WORKING_DIRECTORY
- BASIC_CFLAGS += -DNO_FAST_WORKING_DIRECTORY
-endif
-ifdef NO_TRUSTABLE_FILEMODE
- BASIC_CFLAGS += -DNO_TRUSTABLE_FILEMODE
-endif
-ifdef NO_IPV6
- BASIC_CFLAGS += -DNO_IPV6
-endif
-ifdef NO_UINTMAX_T
- BASIC_CFLAGS += -Duintmax_t=uint32_t
-endif
-ifdef NO_SOCKADDR_STORAGE
-ifdef NO_IPV6
- BASIC_CFLAGS += -Dsockaddr_storage=sockaddr_in
-else
- BASIC_CFLAGS += -Dsockaddr_storage=sockaddr_in6
-endif
-endif
-ifdef NO_INET_NTOP
- LIB_OBJS += compat/inet_ntop.o
-endif
-ifdef NO_INET_PTON
- LIB_OBJS += compat/inet_pton.o
-endif
-
-ifdef NO_ICONV
- BASIC_CFLAGS += -DNO_ICONV
-endif
-
-ifdef OLD_ICONV
- BASIC_CFLAGS += -DOLD_ICONV
-endif
-
-ifdef NO_DEFLATE_BOUND
- BASIC_CFLAGS += -DNO_DEFLATE_BOUND
-endif
-
-ifdef PPC_SHA1
- SHA1_HEADER = "ppc/sha1.h"
- LIB_OBJS += ppc/sha1.o ppc/sha1ppc.o
-else
-ifdef ARM_SHA1
- SHA1_HEADER = "arm/sha1.h"
- LIB_OBJS += arm/sha1.o arm/sha1_arm.o
-else
-ifdef MOZILLA_SHA1
- SHA1_HEADER = "mozilla-sha1/sha1.h"
- LIB_OBJS += mozilla-sha1/sha1.o
-else
- SHA1_HEADER = <openssl/sha.h>
- EXTLIBS += $(LIB_4_CRYPTO)
-endif
-endif
-endif
-ifdef NO_PERL_MAKEMAKER
- export NO_PERL_MAKEMAKER
-endif
-ifdef NO_HSTRERROR
- COMPAT_CFLAGS += -DNO_HSTRERROR
- COMPAT_OBJS += compat/hstrerror.o
-endif
-ifdef NO_MEMMEM
- COMPAT_CFLAGS += -DNO_MEMMEM
- COMPAT_OBJS += compat/memmem.o
-endif
-ifdef INTERNAL_QSORT
- COMPAT_CFLAGS += -DINTERNAL_QSORT
- COMPAT_OBJS += compat/qsort.o
-endif
-ifdef RUNTIME_PREFIX
- COMPAT_CFLAGS += -DRUNTIME_PREFIX
-endif
-
-ifdef DIR_HAS_BSD_GROUP_SEMANTICS
- COMPAT_CFLAGS += -DDIR_HAS_BSD_GROUP_SEMANTICS
-endif
-ifdef NO_EXTERNAL_GREP
- BASIC_CFLAGS += -DNO_EXTERNAL_GREP
-endif
-
-ifeq ($(PERL_PATH),)
-NO_PERL=NoThanks
-endif
-
-QUIET_SUBDIR0 = +$(MAKE) -C # space to separate -C and subdir
-QUIET_SUBDIR1 =
-
-ifneq ($(findstring $(MAKEFLAGS),w),w)
-PRINT_DIR = --no-print-directory
-else # "make -w"
-NO_SUBDIR = :
-endif
-
-ifneq ($(findstring $(MAKEFLAGS),s),s)
-ifndef V
- QUIET_CC = @echo ' ' CC $@;
- QUIET_AR = @echo ' ' AR $@;
- QUIET_LINK = @echo ' ' LINK $@;
- QUIET_BUILT_IN = @echo ' ' BUILTIN $@;
- QUIET_GEN = @echo ' ' GEN $@;
- QUIET_SUBDIR0 = +@subdir=
- QUIET_SUBDIR1 = ;$(NO_SUBDIR) echo ' ' SUBDIR $$subdir; \
- $(MAKE) $(PRINT_DIR) -C $$subdir
- export V
- export QUIET_GEN
- export QUIET_BUILT_IN
-endif
-endif
-
-ifdef ASCIIDOC8
- export ASCIIDOC8
-endif
-
-# Shell quote (do not use $(call) to accommodate ancient setups);
-
-SHA1_HEADER_SQ = $(subst ','\'',$(SHA1_HEADER))
-ETC_PERFCONFIG_SQ = $(subst ','\'',$(ETC_PERFCONFIG))
-
-DESTDIR_SQ = $(subst ','\'',$(DESTDIR))
-bindir_SQ = $(subst ','\'',$(bindir))
-bindir_relative_SQ = $(subst ','\'',$(bindir_relative))
-mandir_SQ = $(subst ','\'',$(mandir))
-infodir_SQ = $(subst ','\'',$(infodir))
-perfexecdir_SQ = $(subst ','\'',$(perfexecdir))
-template_dir_SQ = $(subst ','\'',$(template_dir))
-htmldir_SQ = $(subst ','\'',$(htmldir))
-prefix_SQ = $(subst ','\'',$(prefix))
-
-SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH))
-PERL_PATH_SQ = $(subst ','\'',$(PERL_PATH))
-
-LIBS = $(PERFLIBS) $(EXTLIBS)
-
-BASIC_CFLAGS += -DSHA1_HEADER='$(SHA1_HEADER_SQ)' \
- $(COMPAT_CFLAGS)
-LIB_OBJS += $(COMPAT_OBJS)
-
-ALL_CFLAGS += $(BASIC_CFLAGS)
-ALL_LDFLAGS += $(BASIC_LDFLAGS)
-
-export TAR INSTALL DESTDIR SHELL_PATH
-
-
-### Build rules
-
-SHELL = $(SHELL_PATH)
-
-all:: .perf.dev.null shell_compatibility_test $(ALL_PROGRAMS) $(BUILT_INS) $(OTHER_PROGRAMS) PERF-BUILD-OPTIONS
-ifneq (,$X)
- $(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), test '$p' -ef '$p$X' || $(RM) '$p';)
-endif
-
-all::
-
-please_set_SHELL_PATH_to_a_more_modern_shell:
- @$$(:)
-
-shell_compatibility_test: please_set_SHELL_PATH_to_a_more_modern_shell
-
-strip: $(PROGRAMS) perf$X
- $(STRIP) $(STRIP_OPTS) $(PROGRAMS) perf$X
-
-perf.o: perf.c common-cmds.h PERF-CFLAGS
- $(QUIET_CC)$(CC) -DPERF_VERSION='"$(PERF_VERSION)"' \
- '-DPERF_HTML_PATH="$(htmldir_SQ)"' \
- $(ALL_CFLAGS) -c $(filter %.c,$^)
-
-perf$X: perf.o $(BUILTIN_OBJS) $(PERFLIBS)
- $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ perf.o \
- $(BUILTIN_OBJS) $(ALL_LDFLAGS) $(LIBS)
-
-builtin-help.o: builtin-help.c common-cmds.h PERF-CFLAGS
- $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \
- '-DPERF_HTML_PATH="$(htmldir_SQ)"' \
- '-DPERF_MAN_PATH="$(mandir_SQ)"' \
- '-DPERF_INFO_PATH="$(infodir_SQ)"' $<
-
-builtin-timechart.o: builtin-timechart.c common-cmds.h PERF-CFLAGS
- $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \
- '-DPERF_HTML_PATH="$(htmldir_SQ)"' \
- '-DPERF_MAN_PATH="$(mandir_SQ)"' \
- '-DPERF_INFO_PATH="$(infodir_SQ)"' $<
-
-$(BUILT_INS): perf$X
- $(QUIET_BUILT_IN)$(RM) $@ && \
- ln perf$X $@ 2>/dev/null || \
- ln -s perf$X $@ 2>/dev/null || \
- cp perf$X $@
-
-common-cmds.h: util/generate-cmdlist.sh command-list.txt
-
-common-cmds.h: $(wildcard Documentation/perf-*.txt)
- $(QUIET_GEN). util/generate-cmdlist.sh > $@+ && mv $@+ $@
-
-$(patsubst %.sh,%,$(SCRIPT_SH)) : % : %.sh
- $(QUIET_GEN)$(RM) $@ $@+ && \
- sed -e '1s|#!.*/sh|#!$(SHELL_PATH_SQ)|' \
- -e 's|@SHELL_PATH@|$(SHELL_PATH_SQ)|' \
- -e 's|@@PERL@@|$(PERL_PATH_SQ)|g' \
- -e 's/@@PERF_VERSION@@/$(PERF_VERSION)/g' \
- -e 's/@@NO_CURL@@/$(NO_CURL)/g' \
- $@.sh >$@+ && \
- chmod +x $@+ && \
- mv $@+ $@
-
-configure: configure.ac
- $(QUIET_GEN)$(RM) $@ $<+ && \
- sed -e 's/@@PERF_VERSION@@/$(PERF_VERSION)/g' \
- $< > $<+ && \
- autoconf -o $@ $<+ && \
- $(RM) $<+
-
-# These can record PERF_VERSION
-perf.o perf.spec \
- $(patsubst %.sh,%,$(SCRIPT_SH)) \
- $(patsubst %.perl,%,$(SCRIPT_PERL)) \
- : PERF-VERSION-FILE
-
-%.o: %.c PERF-CFLAGS
- $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) $<
-%.s: %.c PERF-CFLAGS
- $(QUIET_CC)$(CC) -S $(ALL_CFLAGS) $<
-%.o: %.S
- $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) $<
-
-util/exec_cmd.o: util/exec_cmd.c PERF-CFLAGS
- $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \
- '-DPERF_EXEC_PATH="$(perfexecdir_SQ)"' \
- '-DBINDIR="$(bindir_relative_SQ)"' \
- '-DPREFIX="$(prefix_SQ)"' \
- $<
-
-builtin-init-db.o: builtin-init-db.c PERF-CFLAGS
- $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DDEFAULT_PERF_TEMPLATE_DIR='"$(template_dir_SQ)"' $<
-
-util/config.o: util/config.c PERF-CFLAGS
- $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
-
-util/rbtree.o: ../../lib/rbtree.c PERF-CFLAGS
- $(QUIET_CC)$(CC) -o util/rbtree.o -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
-
-# some perf warning policies can't fit to lib/bitmap.c, eg: it warns about variable shadowing
-# from <string.h> that comes from kernel headers wrapping.
-KBITMAP_FLAGS=`echo $(ALL_CFLAGS) | sed s/-Wshadow// | sed s/-Wswitch-default// | sed s/-Wextra//`
-
-util/bitmap.o: ../../lib/bitmap.c PERF-CFLAGS
- $(QUIET_CC)$(CC) -o util/bitmap.o -c $(KBITMAP_FLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
-
-util/hweight.o: ../../lib/hweight.c PERF-CFLAGS
- $(QUIET_CC)$(CC) -o util/hweight.o -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
-
-util/find_next_bit.o: ../../lib/find_next_bit.c PERF-CFLAGS
- $(QUIET_CC)$(CC) -o util/find_next_bit.o -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
-
-util/scripting-engines/trace-event-perl.o: util/scripting-engines/trace-event-perl.c PERF-CFLAGS
- $(QUIET_CC)$(CC) -o util/scripting-engines/trace-event-perl.o -c $(ALL_CFLAGS) $(PERL_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-shadow $<
-
-scripts/perl/Perf-Trace-Util/Context.o: scripts/perl/Perf-Trace-Util/Context.c PERF-CFLAGS
- $(QUIET_CC)$(CC) -o scripts/perl/Perf-Trace-Util/Context.o -c $(ALL_CFLAGS) $(PERL_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-nested-externs $<
-
-util/scripting-engines/trace-event-python.o: util/scripting-engines/trace-event-python.c PERF-CFLAGS
- $(QUIET_CC)$(CC) -o util/scripting-engines/trace-event-python.o -c $(ALL_CFLAGS) $(PYTHON_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-shadow $<
-
-scripts/python/Perf-Trace-Util/Context.o: scripts/python/Perf-Trace-Util/Context.c PERF-CFLAGS
- $(QUIET_CC)$(CC) -o scripts/python/Perf-Trace-Util/Context.o -c $(ALL_CFLAGS) $(PYTHON_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-nested-externs $<
-
-perf-%$X: %.o $(PERFLIBS)
- $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) $(LIBS)
-
-$(LIB_OBJS) $(BUILTIN_OBJS): $(LIB_H)
-$(patsubst perf-%$X,%.o,$(PROGRAMS)): $(LIB_H) $(wildcard */*.h)
-builtin-revert.o wt-status.o: wt-status.h
-
-$(LIB_FILE): $(LIB_OBJS)
- $(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(LIB_OBJS)
-
-doc:
- $(MAKE) -C Documentation all
-
-man:
- $(MAKE) -C Documentation man
-
-html:
- $(MAKE) -C Documentation html
-
-info:
- $(MAKE) -C Documentation info
-
-pdf:
- $(MAKE) -C Documentation pdf
-
-TAGS:
- $(RM) TAGS
- $(FIND) . -name '*.[hcS]' -print | xargs etags -a
-
-tags:
- $(RM) tags
- $(FIND) . -name '*.[hcS]' -print | xargs ctags -a
-
-cscope:
- $(RM) cscope*
- $(FIND) . -name '*.[hcS]' -print | xargs cscope -b
-
-### Detect prefix changes
-TRACK_CFLAGS = $(subst ','\'',$(ALL_CFLAGS)):\
- $(bindir_SQ):$(perfexecdir_SQ):$(template_dir_SQ):$(prefix_SQ)
-
-PERF-CFLAGS: .FORCE-PERF-CFLAGS
- @FLAGS='$(TRACK_CFLAGS)'; \
- if test x"$$FLAGS" != x"`cat PERF-CFLAGS 2>/dev/null`" ; then \
- echo 1>&2 " * new build flags or prefix"; \
- echo "$$FLAGS" >PERF-CFLAGS; \
- fi
-
-# We need to apply sq twice, once to protect from the shell
-# that runs PERF-BUILD-OPTIONS, and then again to protect it
-# and the first level quoting from the shell that runs "echo".
-PERF-BUILD-OPTIONS: .FORCE-PERF-BUILD-OPTIONS
- @echo SHELL_PATH=\''$(subst ','\'',$(SHELL_PATH_SQ))'\' >$@
- @echo TAR=\''$(subst ','\'',$(subst ','\'',$(TAR)))'\' >>$@
- @echo NO_CURL=\''$(subst ','\'',$(subst ','\'',$(NO_CURL)))'\' >>$@
- @echo NO_PERL=\''$(subst ','\'',$(subst ','\'',$(NO_PERL)))'\' >>$@
-
-### Testing rules
#
-# None right now:
+# Only accept the 'DEBUG' variable from the command line:
#
-# TEST_PROGRAMS += test-something$X
-
-all:: $(TEST_PROGRAMS)
-
-# GNU make supports exporting all variables by "export" without parameters.
-# However, the environment gets quite big, and some programs have problems
-# with that.
-
-export NO_SVN_TESTS
-
-check: common-cmds.h
- if sparse; \
- then \
- for i in *.c */*.c; \
- do \
- sparse $(ALL_CFLAGS) $(SPARSE_FLAGS) $$i || exit; \
- done; \
- else \
- echo 2>&1 "Did you mean 'make test'?"; \
- exit 1; \
- fi
-
-remove-dashes:
- ./fixup-builtins $(BUILT_INS) $(PROGRAMS) $(SCRIPTS)
-
-### Installation rules
-
-ifneq ($(filter /%,$(firstword $(template_dir))),)
-template_instdir = $(template_dir)
-else
-template_instdir = $(prefix)/$(template_dir)
-endif
-export template_instdir
-
-ifneq ($(filter /%,$(firstword $(perfexecdir))),)
-perfexec_instdir = $(perfexecdir)
+ifeq ("$(origin DEBUG)", "command line")
+ ifeq ($(DEBUG),)
+ override DEBUG = 0
+ else
+ SET_DEBUG = "DEBUG=$(DEBUG)"
+ endif
else
-perfexec_instdir = $(prefix)/$(perfexecdir)
+ override DEBUG = 0
endif
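A quick sketch of how this DEBUG handling behaves in practice (a hedged illustration, assuming the wrapper is invoked from tools/perf; output-directory handling is unchanged):

    make              # DEBUG not given on the command line: forced to 0
    make DEBUG=1      # origin is "command line": passed down as "DEBUG=1"
    make DEBUG=       # empty command-line value: overridden back to 0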
-perfexec_instdir_SQ = $(subst ','\'',$(perfexec_instdir))
-export perfexec_instdir
-
-install: all
- $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)'
- $(INSTALL) perf$X '$(DESTDIR_SQ)$(bindir_SQ)'
- $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/Perf-Trace-Util/lib/Perf/Trace'
- $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/bin'
- $(INSTALL) perf-archive -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
- $(INSTALL) scripts/perl/Perf-Trace-Util/lib/Perf/Trace/* -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/Perf-Trace-Util/lib/Perf/Trace'
- $(INSTALL) scripts/perl/*.pl -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl'
- $(INSTALL) scripts/perl/bin/* -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/bin'
- $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python/Perf-Trace-Util/lib/Perf/Trace'
- $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python/bin'
- $(INSTALL) scripts/python/Perf-Trace-Util/lib/Perf/Trace/* -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python/Perf-Trace-Util/lib/Perf/Trace'
- $(INSTALL) scripts/python/*.py -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python'
- $(INSTALL) scripts/python/bin/* -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python/bin'
-
-ifdef BUILT_INS
- $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
- $(INSTALL) $(BUILT_INS) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
-ifneq (,$X)
- $(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), $(RM) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$p';)
-endif
-endif
-
-install-doc:
- $(MAKE) -C Documentation install
-
-install-man:
- $(MAKE) -C Documentation install-man
-
-install-html:
- $(MAKE) -C Documentation install-html
-
-install-info:
- $(MAKE) -C Documentation install-info
-
-install-pdf:
- $(MAKE) -C Documentation install-pdf
-
-quick-install-doc:
- $(MAKE) -C Documentation quick-install
-quick-install-man:
- $(MAKE) -C Documentation quick-install-man
+define print_msg
+ @printf ' BUILD: Doing '\''make \033[33m-j'$(JOBS)'\033[m'\'' parallel build\n'
+endef
-quick-install-html:
- $(MAKE) -C Documentation quick-install-html
+define make
+ @$(MAKE) -f Makefile.perf --no-print-directory -j$(JOBS) O=$(FULL_O) $(SET_DEBUG) $@
+endef
-
-### Maintainer's dist rules
-#
-# None right now
-#
#
-# perf.spec: perf.spec.in
-# sed -e 's/@@VERSION@@/$(PERF_VERSION)/g' < $< > $@+
-# mv $@+ $@
+# Needed if no target is specified:
+# (Except for the tags and TAGS targets. The reason is that the
+# Makefile does not treat tags/TAGS as targets but as files
+# and thus won't rebuild them once they are in place.)
#
-# PERF_TARNAME=perf-$(PERF_VERSION)
-# dist: perf.spec perf-archive$(X) configure
-# ./perf-archive --format=tar \
-# --prefix=$(PERF_TARNAME)/ HEAD^{tree} > $(PERF_TARNAME).tar
-# @mkdir -p $(PERF_TARNAME)
-# @cp perf.spec configure $(PERF_TARNAME)
-# @echo $(PERF_VERSION) > $(PERF_TARNAME)/version
-# $(TAR) rf $(PERF_TARNAME).tar \
-# $(PERF_TARNAME)/perf.spec \
-# $(PERF_TARNAME)/configure \
-# $(PERF_TARNAME)/version
-# @$(RM) -r $(PERF_TARNAME)
-# gzip -f -9 $(PERF_TARNAME).tar
+all tags TAGS:
+ $(print_msg)
+ $(make)
+
#
-# htmldocs = perf-htmldocs-$(PERF_VERSION)
-# manpages = perf-manpages-$(PERF_VERSION)
-# dist-doc:
-# $(RM) -r .doc-tmp-dir
-# mkdir .doc-tmp-dir
-# $(MAKE) -C Documentation WEBDOC_DEST=../.doc-tmp-dir install-webdoc
-# cd .doc-tmp-dir && $(TAR) cf ../$(htmldocs).tar .
-# gzip -n -9 -f $(htmldocs).tar
-# :
-# $(RM) -r .doc-tmp-dir
-# mkdir -p .doc-tmp-dir/man1 .doc-tmp-dir/man5 .doc-tmp-dir/man7
-# $(MAKE) -C Documentation DESTDIR=./ \
-# man1dir=../.doc-tmp-dir/man1 \
-# man5dir=../.doc-tmp-dir/man5 \
-# man7dir=../.doc-tmp-dir/man7 \
-# install
-# cd .doc-tmp-dir && $(TAR) cf ../$(manpages).tar .
-# gzip -n -9 -f $(manpages).tar
-# $(RM) -r .doc-tmp-dir
+# The clean target is not really parallel, don't print the jobs info:
#
-# rpm: dist
-# $(RPMBUILD) -ta $(PERF_TARNAME).tar.gz
-
-### Cleaning rules
-
-distclean: clean
-# $(RM) configure
-
clean:
- $(RM) *.o */*.o */*/*.o */*/*/*.o $(LIB_FILE)
- $(RM) $(ALL_PROGRAMS) $(BUILT_INS) perf$X
- $(RM) $(TEST_PROGRAMS)
- $(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo common-cmds.h TAGS tags cscope*
- $(RM) -r autom4te.cache
- $(RM) config.log config.mak.autogen config.mak.append config.status config.cache
- $(RM) -r $(PERF_TARNAME) .doc-tmp-dir
- $(RM) $(PERF_TARNAME).tar.gz perf-core_$(PERF_VERSION)-*.tar.gz
- $(RM) $(htmldocs).tar.gz $(manpages).tar.gz
- $(MAKE) -C Documentation/ clean
- $(RM) PERF-VERSION-FILE PERF-CFLAGS PERF-BUILD-OPTIONS
-
-.PHONY: all install clean strip
-.PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell
-.PHONY: .FORCE-PERF-VERSION-FILE TAGS tags cscope .FORCE-PERF-CFLAGS
-.PHONY: .FORCE-PERF-BUILD-OPTIONS
-
-.perf.dev.null:
- touch .perf.dev.null
+ $(make)
-.INTERMEDIATE: .perf.dev.null
-
-### Make sure built-ins do not have dups and listed in perf.c
-#
-check-builtins::
- ./check-builtins.sh
-
-### Test suite coverage testing
-#
-# None right now
#
-# .PHONY: coverage coverage-clean coverage-build coverage-report
+# The build-test target is not really parallel, don't print the jobs info:
#
-# coverage:
-# $(MAKE) coverage-build
-# $(MAKE) coverage-report
-#
-# coverage-clean:
-# rm -f *.gcda *.gcno
-#
-# COVERAGE_CFLAGS = $(CFLAGS) -O0 -ftest-coverage -fprofile-arcs
-# COVERAGE_LDFLAGS = $(CFLAGS) -O0 -lgcov
+build-test:
+ @$(MAKE) -f tests/make --no-print-directory
+
#
-# coverage-build: coverage-clean
-# $(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" all
-# $(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" \
-# -j1 test
+# All other targets get passed through:
#
-# coverage-report:
-# gcov -b *.c */*.c
-# grep '^function.*called 0 ' *.c.gcov */*.c.gcov \
-# | sed -e 's/\([^:]*\)\.gcov: *function \([^ ]*\) called.*/\1: \2/' \
-# | tee coverage-untested-functions
+%:
+ $(print_msg)
+ $(make)
+
+.PHONY: tags TAGS
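To make the pass-through explicit, here is a hedged sketch of what a typical invocation of this wrapper expands to (the -j value comes from the JOBS detection elsewhere in the wrapper, the O path is whatever FULL_O resolved to, and SET_DEBUG is appended only when DEBUG was given on the command line):

    make -C tools/perf install
    # is forwarded roughly as:
    #   make -f Makefile.perf --no-print-directory -j<JOBS> O=<FULL_O> install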
diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf
new file mode 100644
index 00000000000..9670a16fa57
--- /dev/null
+++ b/tools/perf/Makefile.perf
@@ -0,0 +1,938 @@
+include ../scripts/Makefile.include
+
+# The default target of this Makefile is...
+all:
+
+include config/utilities.mak
+
+# Define V to have a more verbose compile.
+#
+# Define VF to have a more verbose feature check output.
+#
+# Define O to save output files in a separate directory.
+#
+# Define ARCH as name of target architecture if you want cross-builds.
+#
+# Define CROSS_COMPILE as prefix name of compiler if you want cross-builds.
+#
+# Define NO_LIBPERL to disable perl script extension.
+#
+# Define NO_LIBPYTHON to disable python script extension.
+#
+# Define PYTHON to point to the python binary if the default
+# `python' is not correct; for example: PYTHON=python2
+#
+# Define PYTHON_CONFIG to point to the python-config binary if
+# the default `$(PYTHON)-config' is not correct.
+#
+# Define ASCIIDOC8 if you want to format documentation with AsciiDoc 8
+#
+# Define DOCBOOK_XSL_172 if you want to format man pages with DocBook XSL v1.72.
+#
+# Define LDFLAGS=-static to build a static binary.
+#
+# Define EXTRA_CFLAGS=-m64 or EXTRA_CFLAGS=-m32 as appropriate for cross-builds.
+#
+# Define NO_DWARF if you do not want debug-info analysis feature at all.
+#
+# Define WERROR=0 to disable treating any warnings as errors.
+#
+# Define NO_NEWT if you do not want TUI support. (deprecated)
+#
+# Define NO_SLANG if you do not want TUI support.
+#
+# Define NO_GTK2 if you do not want GTK+ GUI support.
+#
+# Define NO_DEMANGLE if you do not want C++ symbol demangling.
+#
+# Define NO_LIBELF if you do not want libelf dependency (e.g. cross-builds)
+#
+# Define NO_LIBUNWIND if you do not want libunwind dependency for dwarf
+# backtrace post unwind.
+#
+# Define NO_BACKTRACE if you do not want stack backtrace debug feature
+#
+# Define NO_LIBNUMA if you do not want numa perf benchmark
+#
+# Define NO_LIBAUDIT if you do not want libaudit support
+#
+# Define NO_LIBBIONIC if you do not want bionic support
+#
+# Define NO_LIBDW_DWARF_UNWIND if you do not want libdw support
+# for dwarf backtrace post unwind.
+
+ifeq ($(srctree),)
+srctree := $(patsubst %/,%,$(dir $(shell pwd)))
+srctree := $(patsubst %/,%,$(dir $(srctree)))
+#$(info Determined 'srctree' to be $(srctree))
+endif
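A worked example of the two patsubst/dir steps above (paths are purely illustrative; the point is that srctree ends up two levels above tools/perf):

    # $(shell pwd)                -> /usr/src/linux/tools/perf
    # first  dir + patsubst %/,%  -> /usr/src/linux/tools
    # second dir + patsubst %/,%  -> /usr/src/linux      (used as srctree)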
+
+ifneq ($(objtree),)
+#$(info Determined 'objtree' to be $(objtree))
+endif
+
+ifneq ($(OUTPUT),)
+#$(info Determined 'OUTPUT' to be $(OUTPUT))
+endif
+
+$(OUTPUT)PERF-VERSION-FILE: ../../.git/HEAD
+ @$(SHELL_PATH) util/PERF-VERSION-GEN $(OUTPUT)
+ @touch $(OUTPUT)PERF-VERSION-FILE
+
+CC = $(CROSS_COMPILE)gcc
+AR = $(CROSS_COMPILE)ar
+PKG_CONFIG = $(CROSS_COMPILE)pkg-config
+
+RM = rm -f
+LN = ln -f
+MKDIR = mkdir
+FIND = find
+INSTALL = install
+FLEX = flex
+BISON = bison
+STRIP = strip
+
+LIB_DIR = $(srctree)/tools/lib/api/
+TRACE_EVENT_DIR = $(srctree)/tools/lib/traceevent/
+
+# include config/Makefile by default and rule out
+# non-config cases
+config := 1
+
+NON_CONFIG_TARGETS := clean TAGS tags cscope help
+
+ifdef MAKECMDGOALS
+ifeq ($(filter-out $(NON_CONFIG_TARGETS),$(MAKECMDGOALS)),)
+ config := 0
+endif
+endif
+
+ifeq ($(config),1)
+include config/Makefile
+endif
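A short sketch of how the NON_CONFIG_TARGETS filter plays out (goal lists are illustrative):

    make              # no goals: config stays 1, config/Makefile is included
    make clean        # only non-config goals: config becomes 0, feature checks are skipped
    make clean all    # 'all' survives the filter-out, so config/Makefile is included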
+
+export prefix bindir sharedir sysconfdir DESTDIR
+
+# sparse is architecture-neutral, which means that we need to tell it
+# explicitly which architecture to check for. Fix this up for your architecture.
+SPARSE_FLAGS = -D__BIG_ENDIAN__ -D__powerpc__
+
+# Guard against environment variables
+BUILTIN_OBJS =
+LIB_H =
+LIB_OBJS =
+GTK_OBJS =
+PYRF_OBJS =
+SCRIPT_SH =
+
+SCRIPT_SH += perf-archive.sh
+
+grep-libs = $(filter -l%,$(1))
+strip-libs = $(filter-out -l%,$(1))
+
+ifneq ($(OUTPUT),)
+ TE_PATH=$(OUTPUT)
+ifneq ($(subdir),)
+ LIB_PATH=$(OUTPUT)/../lib/api/
+else
+ LIB_PATH=$(OUTPUT)
+endif
+else
+ TE_PATH=$(TRACE_EVENT_DIR)
+ LIB_PATH=$(LIB_DIR)
+endif
+
+LIBTRACEEVENT = $(TE_PATH)libtraceevent.a
+export LIBTRACEEVENT
+
+LIBAPIKFS = $(LIB_PATH)libapikfs.a
+export LIBAPIKFS
+
+# python extension build directories
+PYTHON_EXTBUILD := $(OUTPUT)python_ext_build/
+PYTHON_EXTBUILD_LIB := $(PYTHON_EXTBUILD)lib/
+PYTHON_EXTBUILD_TMP := $(PYTHON_EXTBUILD)tmp/
+export PYTHON_EXTBUILD_LIB PYTHON_EXTBUILD_TMP
+
+python-clean := $(call QUIET_CLEAN, python) $(RM) -r $(PYTHON_EXTBUILD) $(OUTPUT)python/perf.so
+
+PYTHON_EXT_SRCS := $(shell grep -v ^\# util/python-ext-sources)
+PYTHON_EXT_DEPS := util/python-ext-sources util/setup.py $(LIBTRACEEVENT) $(LIBAPIKFS)
+
+$(OUTPUT)python/perf.so: $(PYTHON_EXT_SRCS) $(PYTHON_EXT_DEPS)
+ $(QUIET_GEN)CFLAGS='$(CFLAGS)' $(PYTHON_WORD) util/setup.py \
+ --quiet build_ext; \
+ mkdir -p $(OUTPUT)python && \
+ cp $(PYTHON_EXTBUILD_LIB)perf.so $(OUTPUT)python/
+#
+# No Perl scripts right now:
+#
+
+SCRIPTS = $(patsubst %.sh,%,$(SCRIPT_SH))
+
+#
+# Single 'perf' binary right now:
+#
+PROGRAMS += $(OUTPUT)perf
+
+# what 'all' will build and 'install' will install, in perfexecdir
+ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS)
+
+# what 'all' will build but not install in perfexecdir
+OTHER_PROGRAMS = $(OUTPUT)perf
+
+# Set paths to tools early so that they can be used for version tests.
+ifndef SHELL_PATH
+ SHELL_PATH = /bin/sh
+endif
+ifndef PERL_PATH
+ PERL_PATH = /usr/bin/perl
+endif
+
+export PERL_PATH
+
+$(OUTPUT)util/parse-events-flex.c: util/parse-events.l $(OUTPUT)util/parse-events-bison.c
+ $(QUIET_FLEX)$(FLEX) -o $@ --header-file=$(OUTPUT)util/parse-events-flex.h $(PARSER_DEBUG_FLEX) util/parse-events.l
+
+$(OUTPUT)util/parse-events-bison.c: util/parse-events.y
+ $(QUIET_BISON)$(BISON) -v util/parse-events.y -d $(PARSER_DEBUG_BISON) -o $(OUTPUT)util/parse-events-bison.c -p parse_events_
+
+$(OUTPUT)util/pmu-flex.c: util/pmu.l $(OUTPUT)util/pmu-bison.c
+ $(QUIET_FLEX)$(FLEX) -o $@ --header-file=$(OUTPUT)util/pmu-flex.h util/pmu.l
+
+$(OUTPUT)util/pmu-bison.c: util/pmu.y
+ $(QUIET_BISON)$(BISON) -v util/pmu.y -d -o $(OUTPUT)util/pmu-bison.c -p perf_pmu_
+
+$(OUTPUT)util/parse-events.o: $(OUTPUT)util/parse-events-flex.c $(OUTPUT)util/parse-events-bison.c
+$(OUTPUT)util/pmu.o: $(OUTPUT)util/pmu-flex.c $(OUTPUT)util/pmu-bison.c
+
+LIB_FILE=$(OUTPUT)libperf.a
+
+LIB_H += ../lib/symbol/kallsyms.h
+LIB_H += ../../include/uapi/linux/perf_event.h
+LIB_H += ../../include/linux/rbtree.h
+LIB_H += ../../include/linux/list.h
+LIB_H += ../../include/uapi/linux/const.h
+LIB_H += ../include/linux/hash.h
+LIB_H += ../../include/linux/stringify.h
+LIB_H += util/include/linux/bitmap.h
+LIB_H += util/include/linux/bitops.h
+LIB_H += ../include/linux/compiler.h
+LIB_H += util/include/linux/const.h
+LIB_H += util/include/linux/ctype.h
+LIB_H += util/include/linux/kernel.h
+LIB_H += util/include/linux/list.h
+LIB_H += ../include/linux/export.h
+LIB_H += util/include/linux/poison.h
+LIB_H += util/include/linux/rbtree.h
+LIB_H += util/include/linux/rbtree_augmented.h
+LIB_H += util/include/linux/string.h
+LIB_H += ../include/linux/types.h
+LIB_H += util/include/linux/linkage.h
+LIB_H += util/include/asm/asm-offsets.h
+LIB_H += ../include/asm/bug.h
+LIB_H += util/include/asm/byteorder.h
+LIB_H += util/include/asm/hweight.h
+LIB_H += util/include/asm/swab.h
+LIB_H += util/include/asm/system.h
+LIB_H += util/include/asm/uaccess.h
+LIB_H += util/include/dwarf-regs.h
+LIB_H += util/include/asm/dwarf2.h
+LIB_H += util/include/asm/cpufeature.h
+LIB_H += util/include/asm/unistd_32.h
+LIB_H += util/include/asm/unistd_64.h
+LIB_H += perf.h
+LIB_H += util/annotate.h
+LIB_H += util/cache.h
+LIB_H += util/callchain.h
+LIB_H += util/build-id.h
+LIB_H += util/debug.h
+LIB_H += util/pmu.h
+LIB_H += util/event.h
+LIB_H += util/evsel.h
+LIB_H += util/evlist.h
+LIB_H += util/exec_cmd.h
+LIB_H += util/levenshtein.h
+LIB_H += util/machine.h
+LIB_H += util/map.h
+LIB_H += util/parse-options.h
+LIB_H += util/parse-events.h
+LIB_H += util/quote.h
+LIB_H += util/util.h
+LIB_H += util/xyarray.h
+LIB_H += util/header.h
+LIB_H += util/help.h
+LIB_H += util/session.h
+LIB_H += util/strbuf.h
+LIB_H += util/strlist.h
+LIB_H += util/strfilter.h
+LIB_H += util/svghelper.h
+LIB_H += util/tool.h
+LIB_H += util/run-command.h
+LIB_H += util/sigchain.h
+LIB_H += util/dso.h
+LIB_H += util/symbol.h
+LIB_H += util/color.h
+LIB_H += util/values.h
+LIB_H += util/sort.h
+LIB_H += util/hist.h
+LIB_H += util/comm.h
+LIB_H += util/thread.h
+LIB_H += util/thread_map.h
+LIB_H += util/trace-event.h
+LIB_H += util/probe-finder.h
+LIB_H += util/dwarf-aux.h
+LIB_H += util/probe-event.h
+LIB_H += util/pstack.h
+LIB_H += util/cpumap.h
+LIB_H += util/top.h
+LIB_H += $(ARCH_INCLUDE)
+LIB_H += util/cgroup.h
+LIB_H += $(LIB_INCLUDE)traceevent/event-parse.h
+LIB_H += util/target.h
+LIB_H += util/rblist.h
+LIB_H += util/intlist.h
+LIB_H += util/perf_regs.h
+LIB_H += util/unwind.h
+LIB_H += util/vdso.h
+LIB_H += ui/helpline.h
+LIB_H += ui/progress.h
+LIB_H += ui/util.h
+LIB_H += ui/ui.h
+LIB_H += util/data.h
+
+LIB_OBJS += $(OUTPUT)util/abspath.o
+LIB_OBJS += $(OUTPUT)util/alias.o
+LIB_OBJS += $(OUTPUT)util/annotate.o
+LIB_OBJS += $(OUTPUT)util/build-id.o
+LIB_OBJS += $(OUTPUT)util/config.o
+LIB_OBJS += $(OUTPUT)util/ctype.o
+LIB_OBJS += $(OUTPUT)util/pmu.o
+LIB_OBJS += $(OUTPUT)util/environment.o
+LIB_OBJS += $(OUTPUT)util/event.o
+LIB_OBJS += $(OUTPUT)util/evlist.o
+LIB_OBJS += $(OUTPUT)util/evsel.o
+LIB_OBJS += $(OUTPUT)util/exec_cmd.o
+LIB_OBJS += $(OUTPUT)util/help.o
+LIB_OBJS += $(OUTPUT)util/kallsyms.o
+LIB_OBJS += $(OUTPUT)util/levenshtein.o
+LIB_OBJS += $(OUTPUT)util/parse-options.o
+LIB_OBJS += $(OUTPUT)util/parse-events.o
+LIB_OBJS += $(OUTPUT)util/path.o
+LIB_OBJS += $(OUTPUT)util/rbtree.o
+LIB_OBJS += $(OUTPUT)util/bitmap.o
+LIB_OBJS += $(OUTPUT)util/hweight.o
+LIB_OBJS += $(OUTPUT)util/run-command.o
+LIB_OBJS += $(OUTPUT)util/quote.o
+LIB_OBJS += $(OUTPUT)util/strbuf.o
+LIB_OBJS += $(OUTPUT)util/string.o
+LIB_OBJS += $(OUTPUT)util/strlist.o
+LIB_OBJS += $(OUTPUT)util/strfilter.o
+LIB_OBJS += $(OUTPUT)util/top.o
+LIB_OBJS += $(OUTPUT)util/usage.o
+LIB_OBJS += $(OUTPUT)util/wrapper.o
+LIB_OBJS += $(OUTPUT)util/sigchain.o
+LIB_OBJS += $(OUTPUT)util/dso.o
+LIB_OBJS += $(OUTPUT)util/symbol.o
+LIB_OBJS += $(OUTPUT)util/symbol-elf.o
+LIB_OBJS += $(OUTPUT)util/color.o
+LIB_OBJS += $(OUTPUT)util/pager.o
+LIB_OBJS += $(OUTPUT)util/header.o
+LIB_OBJS += $(OUTPUT)util/callchain.o
+LIB_OBJS += $(OUTPUT)util/values.o
+LIB_OBJS += $(OUTPUT)util/debug.o
+LIB_OBJS += $(OUTPUT)util/machine.o
+LIB_OBJS += $(OUTPUT)util/map.o
+LIB_OBJS += $(OUTPUT)util/pstack.o
+LIB_OBJS += $(OUTPUT)util/session.o
+LIB_OBJS += $(OUTPUT)util/comm.o
+LIB_OBJS += $(OUTPUT)util/thread.o
+LIB_OBJS += $(OUTPUT)util/thread_map.o
+LIB_OBJS += $(OUTPUT)util/trace-event-parse.o
+LIB_OBJS += $(OUTPUT)util/parse-events-flex.o
+LIB_OBJS += $(OUTPUT)util/parse-events-bison.o
+LIB_OBJS += $(OUTPUT)util/pmu-flex.o
+LIB_OBJS += $(OUTPUT)util/pmu-bison.o
+LIB_OBJS += $(OUTPUT)util/trace-event-read.o
+LIB_OBJS += $(OUTPUT)util/trace-event-info.o
+LIB_OBJS += $(OUTPUT)util/trace-event-scripting.o
+LIB_OBJS += $(OUTPUT)util/trace-event.o
+LIB_OBJS += $(OUTPUT)util/svghelper.o
+LIB_OBJS += $(OUTPUT)util/sort.o
+LIB_OBJS += $(OUTPUT)util/hist.o
+LIB_OBJS += $(OUTPUT)util/probe-event.o
+LIB_OBJS += $(OUTPUT)util/util.o
+LIB_OBJS += $(OUTPUT)util/xyarray.o
+LIB_OBJS += $(OUTPUT)util/cpumap.o
+LIB_OBJS += $(OUTPUT)util/cgroup.o
+LIB_OBJS += $(OUTPUT)util/target.o
+LIB_OBJS += $(OUTPUT)util/rblist.o
+LIB_OBJS += $(OUTPUT)util/intlist.o
+LIB_OBJS += $(OUTPUT)util/vdso.o
+LIB_OBJS += $(OUTPUT)util/stat.o
+LIB_OBJS += $(OUTPUT)util/record.o
+LIB_OBJS += $(OUTPUT)util/srcline.o
+LIB_OBJS += $(OUTPUT)util/data.o
+
+LIB_OBJS += $(OUTPUT)ui/setup.o
+LIB_OBJS += $(OUTPUT)ui/helpline.o
+LIB_OBJS += $(OUTPUT)ui/progress.o
+LIB_OBJS += $(OUTPUT)ui/util.o
+LIB_OBJS += $(OUTPUT)ui/hist.o
+LIB_OBJS += $(OUTPUT)ui/stdio/hist.o
+
+LIB_OBJS += $(OUTPUT)arch/common.o
+
+LIB_OBJS += $(OUTPUT)tests/parse-events.o
+LIB_OBJS += $(OUTPUT)tests/dso-data.o
+LIB_OBJS += $(OUTPUT)tests/attr.o
+LIB_OBJS += $(OUTPUT)tests/vmlinux-kallsyms.o
+LIB_OBJS += $(OUTPUT)tests/open-syscall.o
+LIB_OBJS += $(OUTPUT)tests/open-syscall-all-cpus.o
+LIB_OBJS += $(OUTPUT)tests/open-syscall-tp-fields.o
+LIB_OBJS += $(OUTPUT)tests/mmap-basic.o
+LIB_OBJS += $(OUTPUT)tests/perf-record.o
+LIB_OBJS += $(OUTPUT)tests/rdpmc.o
+LIB_OBJS += $(OUTPUT)tests/evsel-roundtrip-name.o
+LIB_OBJS += $(OUTPUT)tests/evsel-tp-sched.o
+LIB_OBJS += $(OUTPUT)tests/pmu.o
+LIB_OBJS += $(OUTPUT)tests/hists_common.o
+LIB_OBJS += $(OUTPUT)tests/hists_link.o
+LIB_OBJS += $(OUTPUT)tests/hists_filter.o
+LIB_OBJS += $(OUTPUT)tests/hists_output.o
+LIB_OBJS += $(OUTPUT)tests/hists_cumulate.o
+LIB_OBJS += $(OUTPUT)tests/python-use.o
+LIB_OBJS += $(OUTPUT)tests/bp_signal.o
+LIB_OBJS += $(OUTPUT)tests/bp_signal_overflow.o
+LIB_OBJS += $(OUTPUT)tests/task-exit.o
+LIB_OBJS += $(OUTPUT)tests/sw-clock.o
+ifeq ($(ARCH),x86)
+LIB_OBJS += $(OUTPUT)tests/perf-time-to-tsc.o
+endif
+LIB_OBJS += $(OUTPUT)tests/code-reading.o
+LIB_OBJS += $(OUTPUT)tests/sample-parsing.o
+LIB_OBJS += $(OUTPUT)tests/parse-no-sample-id-all.o
+ifndef NO_DWARF_UNWIND
+ifeq ($(ARCH),$(filter $(ARCH),x86 arm))
+LIB_OBJS += $(OUTPUT)tests/dwarf-unwind.o
+endif
+endif
+LIB_OBJS += $(OUTPUT)tests/mmap-thread-lookup.o
+LIB_OBJS += $(OUTPUT)tests/thread-mg-share.o
+
+BUILTIN_OBJS += $(OUTPUT)builtin-annotate.o
+BUILTIN_OBJS += $(OUTPUT)builtin-bench.o
+# Benchmark modules
+BUILTIN_OBJS += $(OUTPUT)bench/sched-messaging.o
+BUILTIN_OBJS += $(OUTPUT)bench/sched-pipe.o
+ifeq ($(RAW_ARCH),x86_64)
+BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy-x86-64-asm.o
+BUILTIN_OBJS += $(OUTPUT)bench/mem-memset-x86-64-asm.o
+endif
+BUILTIN_OBJS += $(OUTPUT)bench/mem-memcpy.o
+BUILTIN_OBJS += $(OUTPUT)bench/mem-memset.o
+BUILTIN_OBJS += $(OUTPUT)bench/futex-hash.o
+BUILTIN_OBJS += $(OUTPUT)bench/futex-wake.o
+BUILTIN_OBJS += $(OUTPUT)bench/futex-requeue.o
+
+BUILTIN_OBJS += $(OUTPUT)builtin-diff.o
+BUILTIN_OBJS += $(OUTPUT)builtin-evlist.o
+BUILTIN_OBJS += $(OUTPUT)builtin-help.o
+BUILTIN_OBJS += $(OUTPUT)builtin-sched.o
+BUILTIN_OBJS += $(OUTPUT)builtin-buildid-list.o
+BUILTIN_OBJS += $(OUTPUT)builtin-buildid-cache.o
+BUILTIN_OBJS += $(OUTPUT)builtin-list.o
+BUILTIN_OBJS += $(OUTPUT)builtin-record.o
+BUILTIN_OBJS += $(OUTPUT)builtin-report.o
+BUILTIN_OBJS += $(OUTPUT)builtin-stat.o
+BUILTIN_OBJS += $(OUTPUT)builtin-timechart.o
+BUILTIN_OBJS += $(OUTPUT)builtin-top.o
+BUILTIN_OBJS += $(OUTPUT)builtin-script.o
+BUILTIN_OBJS += $(OUTPUT)builtin-probe.o
+BUILTIN_OBJS += $(OUTPUT)builtin-kmem.o
+BUILTIN_OBJS += $(OUTPUT)builtin-lock.o
+BUILTIN_OBJS += $(OUTPUT)builtin-kvm.o
+BUILTIN_OBJS += $(OUTPUT)builtin-inject.o
+BUILTIN_OBJS += $(OUTPUT)tests/builtin-test.o
+BUILTIN_OBJS += $(OUTPUT)builtin-mem.o
+
+PERFLIBS = $(LIB_FILE) $(LIBAPIKFS) $(LIBTRACEEVENT)
+
+# We choose to avoid "if .. else if .. else .. endif endif"
+# because maintaining the nesting to match is a pain. If
+# we had "elif" things would have been much nicer...
+
+-include arch/$(ARCH)/Makefile
+
+ifneq ($(OUTPUT),)
+ CFLAGS += -I$(OUTPUT)
+endif
+
+ifdef NO_LIBELF
+EXTLIBS := $(filter-out -lelf,$(EXTLIBS))
+
+# Remove ELF/DWARF dependent code
+LIB_OBJS := $(filter-out $(OUTPUT)util/symbol-elf.o,$(LIB_OBJS))
+LIB_OBJS := $(filter-out $(OUTPUT)util/dwarf-aux.o,$(LIB_OBJS))
+LIB_OBJS := $(filter-out $(OUTPUT)util/probe-event.o,$(LIB_OBJS))
+LIB_OBJS := $(filter-out $(OUTPUT)util/probe-finder.o,$(LIB_OBJS))
+
+BUILTIN_OBJS := $(filter-out $(OUTPUT)builtin-probe.o,$(BUILTIN_OBJS))
+
+# Use minimal symbol handling
+LIB_OBJS += $(OUTPUT)util/symbol-minimal.o
+
+else # NO_LIBELF
+ifndef NO_DWARF
+ LIB_OBJS += $(OUTPUT)util/probe-finder.o
+ LIB_OBJS += $(OUTPUT)util/dwarf-aux.o
+endif # NO_DWARF
+endif # NO_LIBELF
+
+ifndef NO_LIBDW_DWARF_UNWIND
+ LIB_OBJS += $(OUTPUT)util/unwind-libdw.o
+ LIB_H += util/unwind-libdw.h
+endif
+
+ifndef NO_LIBUNWIND
+ LIB_OBJS += $(OUTPUT)util/unwind-libunwind.o
+endif
+LIB_OBJS += $(OUTPUT)tests/keep-tracking.o
+
+ifndef NO_LIBAUDIT
+ BUILTIN_OBJS += $(OUTPUT)builtin-trace.o
+endif
+
+ifndef NO_SLANG
+ LIB_OBJS += $(OUTPUT)ui/browser.o
+ LIB_OBJS += $(OUTPUT)ui/browsers/annotate.o
+ LIB_OBJS += $(OUTPUT)ui/browsers/hists.o
+ LIB_OBJS += $(OUTPUT)ui/browsers/map.o
+ LIB_OBJS += $(OUTPUT)ui/browsers/scripts.o
+ LIB_OBJS += $(OUTPUT)ui/browsers/header.o
+ LIB_OBJS += $(OUTPUT)ui/tui/setup.o
+ LIB_OBJS += $(OUTPUT)ui/tui/util.o
+ LIB_OBJS += $(OUTPUT)ui/tui/helpline.o
+ LIB_OBJS += $(OUTPUT)ui/tui/progress.o
+ LIB_H += ui/tui/tui.h
+ LIB_H += ui/browser.h
+ LIB_H += ui/browsers/map.h
+ LIB_H += ui/keysyms.h
+ LIB_H += ui/libslang.h
+endif
+
+ifndef NO_GTK2
+ ALL_PROGRAMS += $(OUTPUT)libperf-gtk.so
+
+ GTK_OBJS += $(OUTPUT)ui/gtk/browser.o
+ GTK_OBJS += $(OUTPUT)ui/gtk/hists.o
+ GTK_OBJS += $(OUTPUT)ui/gtk/setup.o
+ GTK_OBJS += $(OUTPUT)ui/gtk/util.o
+ GTK_OBJS += $(OUTPUT)ui/gtk/helpline.o
+ GTK_OBJS += $(OUTPUT)ui/gtk/progress.o
+ GTK_OBJS += $(OUTPUT)ui/gtk/annotate.o
+
+install-gtk: $(OUTPUT)libperf-gtk.so
+ $(call QUIET_INSTALL, 'GTK UI') \
+ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(libdir_SQ)'; \
+ $(INSTALL) $(OUTPUT)libperf-gtk.so '$(DESTDIR_SQ)$(libdir_SQ)'
+endif
+
+ifndef NO_LIBPERL
+ LIB_OBJS += $(OUTPUT)util/scripting-engines/trace-event-perl.o
+ LIB_OBJS += $(OUTPUT)scripts/perl/Perf-Trace-Util/Context.o
+endif
+
+ifndef NO_LIBPYTHON
+ LIB_OBJS += $(OUTPUT)util/scripting-engines/trace-event-python.o
+ LIB_OBJS += $(OUTPUT)scripts/python/Perf-Trace-Util/Context.o
+endif
+
+ifeq ($(NO_PERF_REGS),0)
+ ifeq ($(ARCH),x86)
+ LIB_H += arch/x86/include/perf_regs.h
+ endif
+ LIB_OBJS += $(OUTPUT)util/perf_regs.o
+endif
+
+ifndef NO_LIBNUMA
+ BUILTIN_OBJS += $(OUTPUT)bench/numa.o
+endif
+
+ifdef ASCIIDOC8
+ export ASCIIDOC8
+endif
+
+LIBS = -Wl,--whole-archive $(PERFLIBS) -Wl,--no-whole-archive -Wl,--start-group $(EXTLIBS) -Wl,--end-group
+
+export INSTALL SHELL_PATH
+
+### Build rules
+
+SHELL = $(SHELL_PATH)
+
+all: shell_compatibility_test $(ALL_PROGRAMS) $(LANG_BINDINGS) $(OTHER_PROGRAMS)
+
+please_set_SHELL_PATH_to_a_more_modern_shell:
+ @$$(:)
+
+shell_compatibility_test: please_set_SHELL_PATH_to_a_more_modern_shell
+
+strip: $(PROGRAMS) $(OUTPUT)perf
+ $(STRIP) $(STRIP_OPTS) $(PROGRAMS) $(OUTPUT)perf
+
+$(OUTPUT)perf.o: perf.c $(OUTPUT)common-cmds.h $(OUTPUT)PERF-CFLAGS
+ $(QUIET_CC)$(CC) -include $(OUTPUT)PERF-VERSION-FILE \
+ '-DPERF_HTML_PATH="$(htmldir_SQ)"' \
+ $(CFLAGS) -c $(filter %.c,$^) -o $@
+
+$(OUTPUT)perf: $(OUTPUT)perf.o $(BUILTIN_OBJS) $(PERFLIBS)
+ $(QUIET_LINK)$(CC) $(CFLAGS) $(LDFLAGS) $(OUTPUT)perf.o \
+ $(BUILTIN_OBJS) $(LIBS) -o $@
+
+$(GTK_OBJS): $(OUTPUT)%.o: %.c $(LIB_H)
+ $(QUIET_CC)$(CC) -o $@ -c -fPIC $(CFLAGS) $(GTK_CFLAGS) $<
+
+$(OUTPUT)libperf-gtk.so: $(GTK_OBJS) $(PERFLIBS)
+ $(QUIET_LINK)$(CC) -o $@ -shared $(LDFLAGS) $(filter %.o,$^) $(GTK_LIBS)
+
+$(OUTPUT)builtin-help.o: builtin-help.c $(OUTPUT)common-cmds.h $(OUTPUT)PERF-CFLAGS
+ $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) \
+ '-DPERF_HTML_PATH="$(htmldir_SQ)"' \
+ '-DPERF_MAN_PATH="$(mandir_SQ)"' \
+ '-DPERF_INFO_PATH="$(infodir_SQ)"' $<
+
+$(OUTPUT)builtin-timechart.o: builtin-timechart.c $(OUTPUT)common-cmds.h $(OUTPUT)PERF-CFLAGS
+ $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) \
+ '-DPERF_HTML_PATH="$(htmldir_SQ)"' \
+ '-DPERF_MAN_PATH="$(mandir_SQ)"' \
+ '-DPERF_INFO_PATH="$(infodir_SQ)"' $<
+
+$(OUTPUT)common-cmds.h: util/generate-cmdlist.sh command-list.txt
+
+$(OUTPUT)common-cmds.h: $(wildcard Documentation/perf-*.txt)
+ $(QUIET_GEN). util/generate-cmdlist.sh > $@+ && mv $@+ $@
+
+$(SCRIPTS) : % : %.sh
+ $(QUIET_GEN)$(INSTALL) '$@.sh' '$(OUTPUT)$@'
+
+# These can record PERF_VERSION
+$(OUTPUT)perf.o perf.spec \
+ $(SCRIPTS) \
+ : $(OUTPUT)PERF-VERSION-FILE
+
+.SUFFIXES:
+
+#
+# If a target does not match any of the later rules, prefix it with $(OUTPUT).
+# This makes targets like 'make O=/tmp/perf perf.o' work in a natural way.
+#
+ifneq ($(OUTPUT),)
+%.o: $(OUTPUT)%.o
+ @echo " # Redirected target $@ => $(OUTPUT)$@"
+util/%.o: $(OUTPUT)util/%.o
+ @echo " # Redirected target $@ => $(OUTPUT)$@"
+bench/%.o: $(OUTPUT)bench/%.o
+ @echo " # Redirected target $@ => $(OUTPUT)$@"
+tests/%.o: $(OUTPUT)tests/%.o
+ @echo " # Redirected target $@ => $(OUTPUT)$@"
+endif
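A hedged sketch of what these redirection rules print when O= is in effect (path and target are illustrative, and assume OUTPUT resolves to /tmp/perf/):

    make O=/tmp/perf util/evsel.o
    #  # Redirected target util/evsel.o => /tmp/perf/util/evsel.o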
+
+# These two need to be here so that when O= is not used they take precedence
+# over the general rule for .o
+
+$(OUTPUT)util/%-flex.o: $(OUTPUT)util/%-flex.c $(OUTPUT)PERF-CFLAGS
+ $(QUIET_CC)$(CC) -o $@ -c -Iutil/ $(CFLAGS) -w $<
+
+$(OUTPUT)util/%-bison.o: $(OUTPUT)util/%-bison.c $(OUTPUT)PERF-CFLAGS
+ $(QUIET_CC)$(CC) -o $@ -c -Iutil/ $(CFLAGS) -DYYENABLE_NLS=0 -DYYLTYPE_IS_TRIVIAL=0 -w $<
+
+$(OUTPUT)%.o: %.c $(OUTPUT)PERF-CFLAGS
+ $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) $<
+$(OUTPUT)%.i: %.c $(OUTPUT)PERF-CFLAGS
+ $(QUIET_CC)$(CC) -o $@ -E $(CFLAGS) $<
+$(OUTPUT)%.s: %.c $(OUTPUT)PERF-CFLAGS
+ $(QUIET_CC)$(CC) -o $@ -S $(CFLAGS) $<
+$(OUTPUT)%.o: %.S
+ $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) $<
+$(OUTPUT)%.s: %.S
+ $(QUIET_CC)$(CC) -o $@ -E $(CFLAGS) $<
+
+$(OUTPUT)util/exec_cmd.o: util/exec_cmd.c $(OUTPUT)PERF-CFLAGS
+ $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) \
+ '-DPERF_EXEC_PATH="$(perfexecdir_SQ)"' \
+ '-DPREFIX="$(prefix_SQ)"' \
+ $<
+
+$(OUTPUT)tests/attr.o: tests/attr.c $(OUTPUT)PERF-CFLAGS
+ $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) \
+ '-DBINDIR="$(bindir_SQ)"' -DPYTHON='"$(PYTHON_WORD)"' \
+ $<
+
+$(OUTPUT)tests/python-use.o: tests/python-use.c $(OUTPUT)PERF-CFLAGS
+ $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) \
+ -DPYTHONPATH='"$(OUTPUT)python"' \
+ -DPYTHON='"$(PYTHON_WORD)"' \
+ $<
+
+$(OUTPUT)tests/dwarf-unwind.o: tests/dwarf-unwind.c
+ $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -fno-optimize-sibling-calls $<
+
+$(OUTPUT)util/config.o: util/config.c $(OUTPUT)PERF-CFLAGS
+ $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
+
+$(OUTPUT)ui/setup.o: ui/setup.c $(OUTPUT)PERF-CFLAGS
+ $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -DLIBDIR='"$(libdir_SQ)"' $<
+
+$(OUTPUT)ui/browser.o: ui/browser.c $(OUTPUT)PERF-CFLAGS
+ $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -DENABLE_SLFUTURE_CONST $<
+
+$(OUTPUT)ui/browsers/annotate.o: ui/browsers/annotate.c $(OUTPUT)PERF-CFLAGS
+ $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -DENABLE_SLFUTURE_CONST $<
+
+$(OUTPUT)ui/browsers/hists.o: ui/browsers/hists.c $(OUTPUT)PERF-CFLAGS
+ $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -DENABLE_SLFUTURE_CONST $<
+
+$(OUTPUT)ui/browsers/map.o: ui/browsers/map.c $(OUTPUT)PERF-CFLAGS
+ $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -DENABLE_SLFUTURE_CONST $<
+
+$(OUTPUT)ui/browsers/scripts.o: ui/browsers/scripts.c $(OUTPUT)PERF-CFLAGS
+ $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -DENABLE_SLFUTURE_CONST $<
+
+$(OUTPUT)util/kallsyms.o: ../lib/symbol/kallsyms.c $(OUTPUT)PERF-CFLAGS
+ $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) $<
+
+$(OUTPUT)util/rbtree.o: ../../lib/rbtree.c $(OUTPUT)PERF-CFLAGS
+ $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -Wno-unused-parameter -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $<
+
+$(OUTPUT)util/parse-events.o: util/parse-events.c $(OUTPUT)PERF-CFLAGS
+ $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) -Wno-redundant-decls $<
+
+$(OUTPUT)util/scripting-engines/trace-event-perl.o: util/scripting-engines/trace-event-perl.c $(OUTPUT)PERF-CFLAGS
+ $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) $(PERL_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-shadow -Wno-undef -Wno-switch-default $<
+
+$(OUTPUT)scripts/perl/Perf-Trace-Util/Context.o: scripts/perl/Perf-Trace-Util/Context.c $(OUTPUT)PERF-CFLAGS
+ $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) $(PERL_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-nested-externs -Wno-undef -Wno-switch-default $<
+
+$(OUTPUT)util/scripting-engines/trace-event-python.o: util/scripting-engines/trace-event-python.c $(OUTPUT)PERF-CFLAGS
+ $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) $(PYTHON_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-shadow $<
+
+$(OUTPUT)scripts/python/Perf-Trace-Util/Context.o: scripts/python/Perf-Trace-Util/Context.c $(OUTPUT)PERF-CFLAGS
+ $(QUIET_CC)$(CC) -o $@ -c $(CFLAGS) $(PYTHON_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-nested-externs $<
+
+$(OUTPUT)perf-%: %.o $(PERFLIBS)
+ $(QUIET_LINK)$(CC) $(CFLAGS) -o $@ $(LDFLAGS) $(filter %.o,$^) $(LIBS)
+
+$(LIB_OBJS) $(BUILTIN_OBJS): $(LIB_H)
+$(patsubst perf-%,%.o,$(PROGRAMS)): $(LIB_H) $(wildcard */*.h)
+
+# We compile into subdirectories. If the target directory is not the source directory, they might not exist. So
+# we make the various files depend on their directories.
+DIRECTORY_DEPS = $(LIB_OBJS) $(BUILTIN_OBJS) $(GTK_OBJS)
+DIRECTORY_DEPS += $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h
+# no need to add flex objects, because they depend on bison ones
+DIRECTORY_DEPS += $(OUTPUT)util/parse-events-bison.c
+DIRECTORY_DEPS += $(OUTPUT)util/pmu-bison.c
+
+OUTPUT_DIRECTORIES := $(sort $(dir $(DIRECTORY_DEPS)))
+
+$(DIRECTORY_DEPS): | $(OUTPUT_DIRECTORIES)
+# In the second step, we make a rule to actually create these directories
+$(OUTPUT_DIRECTORIES):
+ $(QUIET_MKDIR)$(MKDIR) -p $@ 2>/dev/null
+
+$(LIB_FILE): $(LIB_OBJS)
+ $(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(LIB_OBJS)
+
+# libtraceevent.a
+TE_SOURCES = $(wildcard $(TRACE_EVENT_DIR)*.[ch])
+
+LIBTRACEEVENT_FLAGS = $(QUIET_SUBDIR1) O=$(OUTPUT)
+LIBTRACEEVENT_FLAGS += CFLAGS="-g -Wall $(EXTRA_CFLAGS)"
+LIBTRACEEVENT_FLAGS += plugin_dir=$(plugindir_SQ)
+
+$(LIBTRACEEVENT): $(TE_SOURCES) $(OUTPUT)PERF-CFLAGS
+ $(QUIET_SUBDIR0)$(TRACE_EVENT_DIR) $(LIBTRACEEVENT_FLAGS) libtraceevent.a plugins
+
+$(LIBTRACEEVENT)-clean:
+ $(call QUIET_CLEAN, libtraceevent)
+ @$(MAKE) -C $(TRACE_EVENT_DIR) O=$(OUTPUT) clean >/dev/null
+
+install-traceevent-plugins: $(LIBTRACEEVENT)
+ $(QUIET_SUBDIR0)$(TRACE_EVENT_DIR) $(LIBTRACEEVENT_FLAGS) install_plugins
+
+LIBAPIKFS_SOURCES = $(wildcard $(LIB_PATH)fs/*.[ch])
+
+# If subdir is set, we've been called from above, so the target has already
+# been built
+$(LIBAPIKFS): $(LIBAPIKFS_SOURCES)
+ifeq ($(subdir),)
+ $(QUIET_SUBDIR0)$(LIB_DIR) $(QUIET_SUBDIR1) O=$(OUTPUT) libapikfs.a
+endif
+
+$(LIBAPIKFS)-clean:
+ifeq ($(subdir),)
+ $(call QUIET_CLEAN, libapikfs)
+ @$(MAKE) -C $(LIB_DIR) O=$(OUTPUT) clean >/dev/null
+endif
+
+help:
+ @echo 'Perf make targets:'
+ @echo ' doc - make *all* documentation (see below)'
+ @echo ' man - make manpage documentation (access with man <foo>)'
+ @echo ' html - make html documentation'
+ @echo ' info - make GNU info documentation (access with info <foo>)'
+ @echo ' pdf - make pdf documentation'
+ @echo ' TAGS - use etags to make tag information for source browsing'
+ @echo ' tags - use ctags to make tag information for source browsing'
+ @echo ' cscope - use cscope to make interactive browsing database'
+ @echo ''
+ @echo 'Perf install targets:'
+ @echo ' NOTE: documentation build requires asciidoc, xmlto packages to be installed'
+ @echo ' HINT: use "prefix" or "DESTDIR" to install to a particular'
+ @echo ' path like "make prefix=/usr/local install install-doc"'
+ @echo ' install - install compiled binaries'
+ @echo ' install-doc - install *all* documentation'
+ @echo ' install-man - install manpage documentation'
+ @echo ' install-html - install html documentation'
+ @echo ' install-info - install GNU info documentation'
+ @echo ' install-pdf - install pdf documentation'
+ @echo ''
+ @echo ' quick-install-doc - alias for quick-install-man'
+ @echo ' quick-install-man - install the documentation quickly'
+ @echo ' quick-install-html - install the html documentation quickly'
+ @echo ''
+ @echo 'Perf maintainer targets:'
+ @echo ' clean - clean all binary objects and build output'
+
+
+DOC_TARGETS := doc man html info pdf
+
+INSTALL_DOC_TARGETS := $(patsubst %,install-%,$(DOC_TARGETS)) try-install-man
+INSTALL_DOC_TARGETS += quick-install-doc quick-install-man quick-install-html
+
+# 'make doc' should call 'make -C Documentation all'
+$(DOC_TARGETS):
+ $(QUIET_SUBDIR0)Documentation $(QUIET_SUBDIR1) $(@:doc=all)
+
+TAG_FOLDERS= . ../lib/traceevent ../lib/api ../lib/symbol
+TAG_FILES= ../../include/uapi/linux/perf_event.h
+
+TAGS:
+ $(QUIET_GEN)$(RM) TAGS; \
+ $(FIND) $(TAG_FOLDERS) -name '*.[hcS]' -print | xargs etags -a $(TAG_FILES)
+
+tags:
+ $(QUIET_GEN)$(RM) tags; \
+ $(FIND) $(TAG_FOLDERS) -name '*.[hcS]' -print | xargs ctags -a $(TAG_FILES)
+
+cscope:
+ $(QUIET_GEN)$(RM) cscope*; \
+ $(FIND) $(TAG_FOLDERS) -name '*.[hcS]' -print | xargs cscope -b $(TAG_FILES)
+
+### Detect prefix changes
+TRACK_CFLAGS = $(subst ','\'',$(CFLAGS)):\
+ $(bindir_SQ):$(perfexecdir_SQ):$(template_dir_SQ):$(prefix_SQ):$(plugindir_SQ)
+
+$(OUTPUT)PERF-CFLAGS: .FORCE-PERF-CFLAGS
+ @FLAGS='$(TRACK_CFLAGS)'; \
+ if test x"$$FLAGS" != x"`cat $(OUTPUT)PERF-CFLAGS 2>/dev/null`" ; then \
+ echo 1>&2 " FLAGS: * new build flags or prefix"; \
+ echo "$$FLAGS" >$(OUTPUT)PERF-CFLAGS; \
+ fi
+
+### Testing rules
+
+# GNU make supports exporting all variables by "export" without parameters.
+# However, the environment gets quite big, and some programs have problems
+# with that.
+
+check: $(OUTPUT)common-cmds.h
+ if sparse; \
+ then \
+ for i in *.c */*.c; \
+ do \
+ sparse $(CFLAGS) $(SPARSE_FLAGS) $$i || exit; \
+ done; \
+ else \
+ exit 1; \
+ fi
+
+### Installation rules
+
+install-gtk:
+
+install-bin: all install-gtk
+ $(call QUIET_INSTALL, binaries) \
+ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)'; \
+ $(INSTALL) $(OUTPUT)perf '$(DESTDIR_SQ)$(bindir_SQ)'; \
+ $(LN) '$(DESTDIR_SQ)$(bindir_SQ)/perf' '$(DESTDIR_SQ)$(bindir_SQ)/trace'
+ $(call QUIET_INSTALL, libexec) \
+ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
+ $(call QUIET_INSTALL, perf-archive) \
+ $(INSTALL) $(OUTPUT)perf-archive -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)'
+ifndef NO_LIBPERL
+ $(call QUIET_INSTALL, perl-scripts) \
+ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/Perf-Trace-Util/lib/Perf/Trace'; \
+ $(INSTALL) scripts/perl/Perf-Trace-Util/lib/Perf/Trace/* -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/Perf-Trace-Util/lib/Perf/Trace'; \
+ $(INSTALL) scripts/perl/*.pl -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl'; \
+ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/bin'; \
+ $(INSTALL) scripts/perl/bin/* -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/bin'
+endif
+ifndef NO_LIBPYTHON
+ $(call QUIET_INSTALL, python-scripts) \
+ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python/Perf-Trace-Util/lib/Perf/Trace'; \
+ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python/bin'; \
+ $(INSTALL) scripts/python/Perf-Trace-Util/lib/Perf/Trace/* -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python/Perf-Trace-Util/lib/Perf/Trace'; \
+ $(INSTALL) scripts/python/*.py -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python'; \
+ $(INSTALL) scripts/python/bin/* -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/python/bin'
+endif
+ $(call QUIET_INSTALL, perf_completion-script) \
+ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(sysconfdir_SQ)/bash_completion.d'; \
+ $(INSTALL) perf-completion.sh '$(DESTDIR_SQ)$(sysconfdir_SQ)/bash_completion.d/perf'
+ $(call QUIET_INSTALL, tests) \
+ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests'; \
+ $(INSTALL) tests/attr.py '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests'; \
+ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/attr'; \
+ $(INSTALL) tests/attr/* '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/tests/attr'
+
+install: install-bin try-install-man install-traceevent-plugins
+
+install-python_ext:
+ $(PYTHON_WORD) util/setup.py --quiet install --root='/$(DESTDIR_SQ)'
+
+# 'make install-doc' should call 'make -C Documentation install'
+$(INSTALL_DOC_TARGETS):
+ $(QUIET_SUBDIR0)Documentation $(QUIET_SUBDIR1) $(@:-doc=)
+
+### Cleaning rules
+
+#
+# This is here, not in config/Makefile, because config/Makefile does
+# not get included for the clean target:
+#
+config-clean:
+ $(call QUIET_CLEAN, config)
+ @$(MAKE) -C config/feature-checks clean >/dev/null
+
+clean: $(LIBTRACEEVENT)-clean $(LIBAPIKFS)-clean config-clean
+ $(call QUIET_CLEAN, core-objs) $(RM) $(LIB_OBJS) $(BUILTIN_OBJS) $(LIB_FILE) $(OUTPUT)perf-archive $(OUTPUT)perf.o $(LANG_BINDINGS) $(GTK_OBJS)
+ $(call QUIET_CLEAN, core-progs) $(RM) $(ALL_PROGRAMS) perf
+ $(call QUIET_CLEAN, core-gen) $(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo $(OUTPUT)common-cmds.h TAGS tags cscope* $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)PERF-CFLAGS $(OUTPUT)PERF-FEATURES $(OUTPUT)util/*-bison* $(OUTPUT)util/*-flex*
+ $(QUIET_SUBDIR0)Documentation $(QUIET_SUBDIR1) clean
+ $(python-clean)
+
+#
+# Trick: if ../../.git does not exist (for example, when building out of tree),
+# force version regeneration:
+#
+ifeq ($(wildcard ../../.git/HEAD),)
+ GIT-HEAD-PHONY = ../../.git/HEAD
+else
+ GIT-HEAD-PHONY =
+endif
+
+.PHONY: all install clean config-clean strip install-gtk
+.PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell
+.PHONY: $(GIT-HEAD-PHONY) TAGS tags cscope .FORCE-PERF-CFLAGS
+
diff --git a/tools/perf/arch/arm/Makefile b/tools/perf/arch/arm/Makefile
new file mode 100644
index 00000000000..09d62153d38
--- /dev/null
+++ b/tools/perf/arch/arm/Makefile
@@ -0,0 +1,14 @@
+ifndef NO_DWARF
+PERF_HAVE_DWARF_REGS := 1
+LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o
+endif
+ifndef NO_LIBUNWIND
+LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/unwind-libunwind.o
+endif
+ifndef NO_LIBDW_DWARF_UNWIND
+LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/unwind-libdw.o
+endif
+ifndef NO_DWARF_UNWIND
+LIB_OBJS += $(OUTPUT)arch/$(ARCH)/tests/regs_load.o
+LIB_OBJS += $(OUTPUT)arch/$(ARCH)/tests/dwarf-unwind.o
+endif
diff --git a/tools/perf/arch/arm/include/perf_regs.h b/tools/perf/arch/arm/include/perf_regs.h
new file mode 100644
index 00000000000..f619c9c5a4b
--- /dev/null
+++ b/tools/perf/arch/arm/include/perf_regs.h
@@ -0,0 +1,59 @@
+#ifndef ARCH_PERF_REGS_H
+#define ARCH_PERF_REGS_H
+
+#include <stdlib.h>
+#include <linux/types.h>
+#include <asm/perf_regs.h>
+
+void perf_regs_load(u64 *regs);
+
+#define PERF_REGS_MASK ((1ULL << PERF_REG_ARM_MAX) - 1)
+#define PERF_REGS_MAX PERF_REG_ARM_MAX
+#define PERF_SAMPLE_REGS_ABI PERF_SAMPLE_REGS_ABI_32
+
+#define PERF_REG_IP PERF_REG_ARM_PC
+#define PERF_REG_SP PERF_REG_ARM_SP
+
+static inline const char *perf_reg_name(int id)
+{
+ switch (id) {
+ case PERF_REG_ARM_R0:
+ return "r0";
+ case PERF_REG_ARM_R1:
+ return "r1";
+ case PERF_REG_ARM_R2:
+ return "r2";
+ case PERF_REG_ARM_R3:
+ return "r3";
+ case PERF_REG_ARM_R4:
+ return "r4";
+ case PERF_REG_ARM_R5:
+ return "r5";
+ case PERF_REG_ARM_R6:
+ return "r6";
+ case PERF_REG_ARM_R7:
+ return "r7";
+ case PERF_REG_ARM_R8:
+ return "r8";
+ case PERF_REG_ARM_R9:
+ return "r9";
+ case PERF_REG_ARM_R10:
+ return "r10";
+ case PERF_REG_ARM_FP:
+ return "fp";
+ case PERF_REG_ARM_IP:
+ return "ip";
+ case PERF_REG_ARM_SP:
+ return "sp";
+ case PERF_REG_ARM_LR:
+ return "lr";
+ case PERF_REG_ARM_PC:
+ return "pc";
+ default:
+ return NULL;
+ }
+
+ return NULL;
+}
+
+#endif /* ARCH_PERF_REGS_H */
diff --git a/tools/perf/arch/arm/tests/dwarf-unwind.c b/tools/perf/arch/arm/tests/dwarf-unwind.c
new file mode 100644
index 00000000000..9f870d27cb3
--- /dev/null
+++ b/tools/perf/arch/arm/tests/dwarf-unwind.c
@@ -0,0 +1,60 @@
+#include <string.h>
+#include "perf_regs.h"
+#include "thread.h"
+#include "map.h"
+#include "event.h"
+#include "tests/tests.h"
+
+#define STACK_SIZE 8192
+
+static int sample_ustack(struct perf_sample *sample,
+ struct thread *thread, u64 *regs)
+{
+ struct stack_dump *stack = &sample->user_stack;
+ struct map *map;
+ unsigned long sp;
+ u64 stack_size, *buf;
+
+ buf = malloc(STACK_SIZE);
+ if (!buf) {
+ pr_debug("failed to allocate sample uregs data\n");
+ return -1;
+ }
+
+ sp = (unsigned long) regs[PERF_REG_ARM_SP];
+
+ map = map_groups__find(thread->mg, MAP__VARIABLE, (u64) sp);
+ if (!map) {
+ pr_debug("failed to get stack map\n");
+ free(buf);
+ return -1;
+ }
+
+ stack_size = map->end - sp;
+ stack_size = stack_size > STACK_SIZE ? STACK_SIZE : stack_size;
+
+ memcpy(buf, (void *) sp, stack_size);
+ stack->data = (char *) buf;
+ stack->size = stack_size;
+ return 0;
+}
+
+int test__arch_unwind_sample(struct perf_sample *sample,
+ struct thread *thread)
+{
+ struct regs_dump *regs = &sample->user_regs;
+ u64 *buf;
+
+ buf = calloc(1, sizeof(u64) * PERF_REGS_MAX);
+ if (!buf) {
+ pr_debug("failed to allocate sample uregs data\n");
+ return -1;
+ }
+
+ perf_regs_load(buf);
+ regs->abi = PERF_SAMPLE_REGS_ABI;
+ regs->regs = buf;
+ regs->mask = PERF_REGS_MASK;
+
+ return sample_ustack(sample, thread, buf);
+}
diff --git a/tools/perf/arch/arm/tests/regs_load.S b/tools/perf/arch/arm/tests/regs_load.S
new file mode 100644
index 00000000000..e09e983946f
--- /dev/null
+++ b/tools/perf/arch/arm/tests/regs_load.S
@@ -0,0 +1,58 @@
+#include <linux/linkage.h>
+
+#define R0 0x00
+#define R1 0x08
+#define R2 0x10
+#define R3 0x18
+#define R4 0x20
+#define R5 0x28
+#define R6 0x30
+#define R7 0x38
+#define R8 0x40
+#define R9 0x48
+#define SL 0x50
+#define FP 0x58
+#define IP 0x60
+#define SP 0x68
+#define LR 0x70
+#define PC 0x78
+
+/*
+ * Implementation of void perf_regs_load(u64 *regs);
+ *
+ * This function fills in the 'regs' buffer with the actual register values,
+ * in the way the perf built-in unwinding test expects them:
+ * - the PC at the time of the call to this function. Since this function
+ * is called using a bl instruction, the PC value is taken from LR.
+ * The built-in unwinding test then unwinds the call stack from the dwarf
+ * information in unwind__get_entries.
+ *
+ * Notes:
+ * - the 8-byte stride in the register offsets comes from the fact
+ *   that the registers are stored in a u64 array (u64 *regs),
+ * - the regs buffer needs to be zeroed before the call to this function,
+ * in this case using a calloc in dwarf-unwind.c.
+ */
+
+.text
+.type perf_regs_load,%function
+ENTRY(perf_regs_load)
+ str r0, [r0, #R0]
+ str r1, [r0, #R1]
+ str r2, [r0, #R2]
+ str r3, [r0, #R3]
+ str r4, [r0, #R4]
+ str r5, [r0, #R5]
+ str r6, [r0, #R6]
+ str r7, [r0, #R7]
+ str r8, [r0, #R8]
+ str r9, [r0, #R9]
+ str sl, [r0, #SL]
+ str fp, [r0, #FP]
+ str ip, [r0, #IP]
+ str sp, [r0, #SP]
+ str lr, [r0, #LR]
+ str lr, [r0, #PC] // store lr as the pc in order to skip the call
+ // to this function
+ mov pc, lr
+ENDPROC(perf_regs_load)
diff --git a/tools/perf/arch/arm/util/dwarf-regs.c b/tools/perf/arch/arm/util/dwarf-regs.c
new file mode 100644
index 00000000000..33ec5b339da
--- /dev/null
+++ b/tools/perf/arch/arm/util/dwarf-regs.c
@@ -0,0 +1,64 @@
+/*
+ * Mapping of DWARF debug register numbers into register names.
+ *
+ * Copyright (C) 2010 Will Deacon, ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <stddef.h>
+#include <dwarf-regs.h>
+
+struct pt_regs_dwarfnum {
+ const char *name;
+ unsigned int dwarfnum;
+};
+
+#define STR(s) #s
+#define REG_DWARFNUM_NAME(r, num) {.name = r, .dwarfnum = num}
+#define GPR_DWARFNUM_NAME(num) \
+ {.name = STR(%r##num), .dwarfnum = num}
+#define REG_DWARFNUM_END {.name = NULL, .dwarfnum = 0}
+
+/*
+ * Reference:
+ * http://infocenter.arm.com/help/topic/com.arm.doc.ihi0040a/IHI0040A_aadwarf.pdf
+ */
+static const struct pt_regs_dwarfnum regdwarfnum_table[] = {
+ GPR_DWARFNUM_NAME(0),
+ GPR_DWARFNUM_NAME(1),
+ GPR_DWARFNUM_NAME(2),
+ GPR_DWARFNUM_NAME(3),
+ GPR_DWARFNUM_NAME(4),
+ GPR_DWARFNUM_NAME(5),
+ GPR_DWARFNUM_NAME(6),
+ GPR_DWARFNUM_NAME(7),
+ GPR_DWARFNUM_NAME(8),
+ GPR_DWARFNUM_NAME(9),
+ GPR_DWARFNUM_NAME(10),
+ REG_DWARFNUM_NAME("%fp", 11),
+ REG_DWARFNUM_NAME("%ip", 12),
+ REG_DWARFNUM_NAME("%sp", 13),
+ REG_DWARFNUM_NAME("%lr", 14),
+ REG_DWARFNUM_NAME("%pc", 15),
+ REG_DWARFNUM_END,
+};
+
+/**
+ * get_arch_regstr() - look up register name from its DWARF register number
+ * @n: the DWARF register number
+ *
+ * get_arch_regstr() returns the name of the register in struct
+ * regdwarfnum_table from its DWARF register number. If the register is not
+ * found in the table, this returns NULL.
+ */
+const char *get_arch_regstr(unsigned int n)
+{
+ const struct pt_regs_dwarfnum *roff;
+ for (roff = regdwarfnum_table; roff->name != NULL; roff++)
+ if (roff->dwarfnum == n)
+ return roff->name;
+ return NULL;
+}
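
For illustration only (not part of the diff): a throwaway check of the mapping above. It assumes it is compiled inside tools/perf so that <dwarf-regs.h> resolves and that it is linked against the object built from this file.

#include <stdio.h>
#include <dwarf-regs.h>		/* declares get_arch_regstr() */

int main(void)
{
	printf("%s\n", get_arch_regstr(13));		/* "%sp", per the table above */
	printf("%s\n", get_arch_regstr(14));		/* "%lr" */
	printf("%d\n", get_arch_regstr(99) == NULL);	/* 1: not in the table */
	return 0;
}
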
diff --git a/tools/perf/arch/arm/util/unwind-libdw.c b/tools/perf/arch/arm/util/unwind-libdw.c
new file mode 100644
index 00000000000..b4176c60117
--- /dev/null
+++ b/tools/perf/arch/arm/util/unwind-libdw.c
@@ -0,0 +1,36 @@
+#include <elfutils/libdwfl.h>
+#include "../../util/unwind-libdw.h"
+#include "../../util/perf_regs.h"
+
+bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg)
+{
+ struct unwind_info *ui = arg;
+ struct regs_dump *user_regs = &ui->sample->user_regs;
+ Dwarf_Word dwarf_regs[PERF_REG_ARM_MAX];
+
+#define REG(r) ({ \
+ Dwarf_Word val = 0; \
+ perf_reg_value(&val, user_regs, PERF_REG_ARM_##r); \
+ val; \
+})
+
+ dwarf_regs[0] = REG(R0);
+ dwarf_regs[1] = REG(R1);
+ dwarf_regs[2] = REG(R2);
+ dwarf_regs[3] = REG(R3);
+ dwarf_regs[4] = REG(R4);
+ dwarf_regs[5] = REG(R5);
+ dwarf_regs[6] = REG(R6);
+ dwarf_regs[7] = REG(R7);
+ dwarf_regs[8] = REG(R8);
+ dwarf_regs[9] = REG(R9);
+ dwarf_regs[10] = REG(R10);
+ dwarf_regs[11] = REG(FP);
+ dwarf_regs[12] = REG(IP);
+ dwarf_regs[13] = REG(SP);
+ dwarf_regs[14] = REG(LR);
+ dwarf_regs[15] = REG(PC);
+
+ return dwfl_thread_state_registers(thread, 0, PERF_REG_ARM_MAX,
+ dwarf_regs);
+}
diff --git a/tools/perf/arch/arm/util/unwind-libunwind.c b/tools/perf/arch/arm/util/unwind-libunwind.c
new file mode 100644
index 00000000000..729ed69a666
--- /dev/null
+++ b/tools/perf/arch/arm/util/unwind-libunwind.c
@@ -0,0 +1,48 @@
+
+#include <errno.h>
+#include <libunwind.h>
+#include "perf_regs.h"
+#include "../../util/unwind.h"
+
+int libunwind__arch_reg_id(int regnum)
+{
+ switch (regnum) {
+ case UNW_ARM_R0:
+ return PERF_REG_ARM_R0;
+ case UNW_ARM_R1:
+ return PERF_REG_ARM_R1;
+ case UNW_ARM_R2:
+ return PERF_REG_ARM_R2;
+ case UNW_ARM_R3:
+ return PERF_REG_ARM_R3;
+ case UNW_ARM_R4:
+ return PERF_REG_ARM_R4;
+ case UNW_ARM_R5:
+ return PERF_REG_ARM_R5;
+ case UNW_ARM_R6:
+ return PERF_REG_ARM_R6;
+ case UNW_ARM_R7:
+ return PERF_REG_ARM_R7;
+ case UNW_ARM_R8:
+ return PERF_REG_ARM_R8;
+ case UNW_ARM_R9:
+ return PERF_REG_ARM_R9;
+ case UNW_ARM_R10:
+ return PERF_REG_ARM_R10;
+ case UNW_ARM_R11:
+ return PERF_REG_ARM_FP;
+ case UNW_ARM_R12:
+ return PERF_REG_ARM_IP;
+ case UNW_ARM_R13:
+ return PERF_REG_ARM_SP;
+ case UNW_ARM_R14:
+ return PERF_REG_ARM_LR;
+ case UNW_ARM_R15:
+ return PERF_REG_ARM_PC;
+ default:
+ pr_err("unwind: invalid reg id %d\n", regnum);
+ return -EINVAL;
+ }
+
+ return -EINVAL;
+}
diff --git a/tools/perf/arch/arm64/Makefile b/tools/perf/arch/arm64/Makefile
new file mode 100644
index 00000000000..67e9b3d38e8
--- /dev/null
+++ b/tools/perf/arch/arm64/Makefile
@@ -0,0 +1,7 @@
+ifndef NO_DWARF
+PERF_HAVE_DWARF_REGS := 1
+LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o
+endif
+ifndef NO_LIBUNWIND
+LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/unwind-libunwind.o
+endif
diff --git a/tools/perf/arch/arm64/include/perf_regs.h b/tools/perf/arch/arm64/include/perf_regs.h
new file mode 100644
index 00000000000..e9441b9e2a3
--- /dev/null
+++ b/tools/perf/arch/arm64/include/perf_regs.h
@@ -0,0 +1,88 @@
+#ifndef ARCH_PERF_REGS_H
+#define ARCH_PERF_REGS_H
+
+#include <stdlib.h>
+#include <linux/types.h>
+#include <asm/perf_regs.h>
+
+#define PERF_REGS_MASK ((1ULL << PERF_REG_ARM64_MAX) - 1)
+#define PERF_REG_IP PERF_REG_ARM64_PC
+#define PERF_REG_SP PERF_REG_ARM64_SP
+
+static inline const char *perf_reg_name(int id)
+{
+ switch (id) {
+ case PERF_REG_ARM64_X0:
+ return "x0";
+ case PERF_REG_ARM64_X1:
+ return "x1";
+ case PERF_REG_ARM64_X2:
+ return "x2";
+ case PERF_REG_ARM64_X3:
+ return "x3";
+ case PERF_REG_ARM64_X4:
+ return "x4";
+ case PERF_REG_ARM64_X5:
+ return "x5";
+ case PERF_REG_ARM64_X6:
+ return "x6";
+ case PERF_REG_ARM64_X7:
+ return "x7";
+ case PERF_REG_ARM64_X8:
+ return "x8";
+ case PERF_REG_ARM64_X9:
+ return "x9";
+ case PERF_REG_ARM64_X10:
+ return "x10";
+ case PERF_REG_ARM64_X11:
+ return "x11";
+ case PERF_REG_ARM64_X12:
+ return "x12";
+ case PERF_REG_ARM64_X13:
+ return "x13";
+ case PERF_REG_ARM64_X14:
+ return "x14";
+ case PERF_REG_ARM64_X15:
+ return "x15";
+ case PERF_REG_ARM64_X16:
+ return "x16";
+ case PERF_REG_ARM64_X17:
+ return "x17";
+ case PERF_REG_ARM64_X18:
+ return "x18";
+ case PERF_REG_ARM64_X19:
+ return "x19";
+ case PERF_REG_ARM64_X20:
+ return "x20";
+ case PERF_REG_ARM64_X21:
+ return "x21";
+ case PERF_REG_ARM64_X22:
+ return "x22";
+ case PERF_REG_ARM64_X23:
+ return "x23";
+ case PERF_REG_ARM64_X24:
+ return "x24";
+ case PERF_REG_ARM64_X25:
+ return "x25";
+ case PERF_REG_ARM64_X26:
+ return "x26";
+ case PERF_REG_ARM64_X27:
+ return "x27";
+ case PERF_REG_ARM64_X28:
+ return "x28";
+ case PERF_REG_ARM64_X29:
+ return "x29";
+ case PERF_REG_ARM64_SP:
+ return "sp";
+ case PERF_REG_ARM64_LR:
+ return "lr";
+ case PERF_REG_ARM64_PC:
+ return "pc";
+ default:
+ return NULL;
+ }
+
+ return NULL;
+}
+
+#endif /* ARCH_PERF_REGS_H */
diff --git a/tools/perf/arch/arm64/util/dwarf-regs.c b/tools/perf/arch/arm64/util/dwarf-regs.c
new file mode 100644
index 00000000000..d49efeb8172
--- /dev/null
+++ b/tools/perf/arch/arm64/util/dwarf-regs.c
@@ -0,0 +1,80 @@
+/*
+ * Mapping of DWARF debug register numbers into register names.
+ *
+ * Copyright (C) 2010 Will Deacon, ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <stddef.h>
+#include <dwarf-regs.h>
+
+struct pt_regs_dwarfnum {
+ const char *name;
+ unsigned int dwarfnum;
+};
+
+#define STR(s) #s
+#define REG_DWARFNUM_NAME(r, num) {.name = r, .dwarfnum = num}
+#define GPR_DWARFNUM_NAME(num) \
+ {.name = STR(%x##num), .dwarfnum = num}
+#define REG_DWARFNUM_END {.name = NULL, .dwarfnum = 0}
+
+/*
+ * Reference:
+ * http://infocenter.arm.com/help/topic/com.arm.doc.ihi0057b/IHI0057B_aadwarf64.pdf
+ */
+static const struct pt_regs_dwarfnum regdwarfnum_table[] = {
+ GPR_DWARFNUM_NAME(0),
+ GPR_DWARFNUM_NAME(1),
+ GPR_DWARFNUM_NAME(2),
+ GPR_DWARFNUM_NAME(3),
+ GPR_DWARFNUM_NAME(4),
+ GPR_DWARFNUM_NAME(5),
+ GPR_DWARFNUM_NAME(6),
+ GPR_DWARFNUM_NAME(7),
+ GPR_DWARFNUM_NAME(8),
+ GPR_DWARFNUM_NAME(9),
+ GPR_DWARFNUM_NAME(10),
+ GPR_DWARFNUM_NAME(11),
+ GPR_DWARFNUM_NAME(12),
+ GPR_DWARFNUM_NAME(13),
+ GPR_DWARFNUM_NAME(14),
+ GPR_DWARFNUM_NAME(15),
+ GPR_DWARFNUM_NAME(16),
+ GPR_DWARFNUM_NAME(17),
+ GPR_DWARFNUM_NAME(18),
+ GPR_DWARFNUM_NAME(19),
+ GPR_DWARFNUM_NAME(20),
+ GPR_DWARFNUM_NAME(21),
+ GPR_DWARFNUM_NAME(22),
+ GPR_DWARFNUM_NAME(23),
+ GPR_DWARFNUM_NAME(24),
+ GPR_DWARFNUM_NAME(25),
+ GPR_DWARFNUM_NAME(26),
+ GPR_DWARFNUM_NAME(27),
+ GPR_DWARFNUM_NAME(28),
+ GPR_DWARFNUM_NAME(29),
+ REG_DWARFNUM_NAME("%lr", 30),
+ REG_DWARFNUM_NAME("%sp", 31),
+ REG_DWARFNUM_END,
+};
+
+/**
+ * get_arch_regstr() - look up register name from its DWARF register number
+ * @n: the DWARF register number
+ *
+ * get_arch_regstr() returns the name of the register in struct
+ * regdwarfnum_table from its DWARF register number. If the register is not
+ * found in the table, this returns NULL.
+ */
+const char *get_arch_regstr(unsigned int n)
+{
+ const struct pt_regs_dwarfnum *roff;
+ for (roff = regdwarfnum_table; roff->name != NULL; roff++)
+ if (roff->dwarfnum == n)
+ return roff->name;
+ return NULL;
+}
diff --git a/tools/perf/arch/arm64/util/unwind-libunwind.c b/tools/perf/arch/arm64/util/unwind-libunwind.c
new file mode 100644
index 00000000000..436ee43859d
--- /dev/null
+++ b/tools/perf/arch/arm64/util/unwind-libunwind.c
@@ -0,0 +1,82 @@
+
+#include <errno.h>
+#include <libunwind.h>
+#include "perf_regs.h"
+#include "../../util/unwind.h"
+
+int libunwind__arch_reg_id(int regnum)
+{
+ switch (regnum) {
+ case UNW_AARCH64_X0:
+ return PERF_REG_ARM64_X0;
+ case UNW_AARCH64_X1:
+ return PERF_REG_ARM64_X1;
+ case UNW_AARCH64_X2:
+ return PERF_REG_ARM64_X2;
+ case UNW_AARCH64_X3:
+ return PERF_REG_ARM64_X3;
+ case UNW_AARCH64_X4:
+ return PERF_REG_ARM64_X4;
+ case UNW_AARCH64_X5:
+ return PERF_REG_ARM64_X5;
+ case UNW_AARCH64_X6:
+ return PERF_REG_ARM64_X6;
+ case UNW_AARCH64_X7:
+ return PERF_REG_ARM64_X7;
+ case UNW_AARCH64_X8:
+ return PERF_REG_ARM64_X8;
+ case UNW_AARCH64_X9:
+ return PERF_REG_ARM64_X9;
+ case UNW_AARCH64_X10:
+ return PERF_REG_ARM64_X10;
+ case UNW_AARCH64_X11:
+ return PERF_REG_ARM64_X11;
+ case UNW_AARCH64_X12:
+ return PERF_REG_ARM64_X12;
+ case UNW_AARCH64_X13:
+ return PERF_REG_ARM64_X13;
+ case UNW_AARCH64_X14:
+ return PERF_REG_ARM64_X14;
+ case UNW_AARCH64_X15:
+ return PERF_REG_ARM64_X15;
+ case UNW_AARCH64_X16:
+ return PERF_REG_ARM64_X16;
+ case UNW_AARCH64_X17:
+ return PERF_REG_ARM64_X17;
+ case UNW_AARCH64_X18:
+ return PERF_REG_ARM64_X18;
+ case UNW_AARCH64_X19:
+ return PERF_REG_ARM64_X19;
+ case UNW_AARCH64_X20:
+ return PERF_REG_ARM64_X20;
+ case UNW_AARCH64_X21:
+ return PERF_REG_ARM64_X21;
+ case UNW_AARCH64_X22:
+ return PERF_REG_ARM64_X22;
+ case UNW_AARCH64_X23:
+ return PERF_REG_ARM64_X23;
+ case UNW_AARCH64_X24:
+ return PERF_REG_ARM64_X24;
+ case UNW_AARCH64_X25:
+ return PERF_REG_ARM64_X25;
+ case UNW_AARCH64_X26:
+ return PERF_REG_ARM64_X26;
+ case UNW_AARCH64_X27:
+ return PERF_REG_ARM64_X27;
+ case UNW_AARCH64_X28:
+ return PERF_REG_ARM64_X28;
+ case UNW_AARCH64_X29:
+ return PERF_REG_ARM64_X29;
+ case UNW_AARCH64_X30:
+ return PERF_REG_ARM64_LR;
+ case UNW_AARCH64_SP:
+ return PERF_REG_ARM64_SP;
+ case UNW_AARCH64_PC:
+ return PERF_REG_ARM64_PC;
+ default:
+ pr_err("unwind: invalid reg id %d\n", regnum);
+ return -EINVAL;
+ }
+
+ return -EINVAL;
+}
diff --git a/tools/perf/arch/common.c b/tools/perf/arch/common.c
new file mode 100644
index 00000000000..42faf369211
--- /dev/null
+++ b/tools/perf/arch/common.c
@@ -0,0 +1,211 @@
+#include <stdio.h>
+#include <sys/utsname.h>
+#include "common.h"
+#include "../util/debug.h"
+
+const char *const arm_triplets[] = {
+ "arm-eabi-",
+ "arm-linux-androideabi-",
+ "arm-unknown-linux-",
+ "arm-unknown-linux-gnu-",
+ "arm-unknown-linux-gnueabi-",
+ NULL
+};
+
+const char *const powerpc_triplets[] = {
+ "powerpc-unknown-linux-gnu-",
+ "powerpc64-unknown-linux-gnu-",
+ NULL
+};
+
+const char *const s390_triplets[] = {
+ "s390-ibm-linux-",
+ NULL
+};
+
+const char *const sh_triplets[] = {
+ "sh-unknown-linux-gnu-",
+ "sh64-unknown-linux-gnu-",
+ NULL
+};
+
+const char *const sparc_triplets[] = {
+ "sparc-unknown-linux-gnu-",
+ "sparc64-unknown-linux-gnu-",
+ NULL
+};
+
+const char *const x86_triplets[] = {
+ "x86_64-pc-linux-gnu-",
+ "x86_64-unknown-linux-gnu-",
+ "i686-pc-linux-gnu-",
+ "i586-pc-linux-gnu-",
+ "i486-pc-linux-gnu-",
+ "i386-pc-linux-gnu-",
+ "i686-linux-android-",
+ "i686-android-linux-",
+ NULL
+};
+
+const char *const mips_triplets[] = {
+ "mips-unknown-linux-gnu-",
+ "mipsel-linux-android-",
+ NULL
+};
+
+static bool lookup_path(char *name)
+{
+ bool found = false;
+ char *path, *tmp;
+ char buf[PATH_MAX];
+ char *env = getenv("PATH");
+
+ if (!env)
+ return false;
+
+ env = strdup(env);
+ if (!env)
+ return false;
+
+ path = strtok_r(env, ":", &tmp);
+ while (path) {
+ scnprintf(buf, sizeof(buf), "%s/%s", path, name);
+ if (access(buf, F_OK) == 0) {
+ found = true;
+ break;
+ }
+ path = strtok_r(NULL, ":", &tmp);
+ }
+ free(env);
+ return found;
+}
+
+static int lookup_triplets(const char *const *triplets, const char *name)
+{
+ int i;
+ char buf[PATH_MAX];
+
+ for (i = 0; triplets[i] != NULL; i++) {
+ scnprintf(buf, sizeof(buf), "%s%s", triplets[i], name);
+ if (lookup_path(buf))
+ return i;
+ }
+ return -1;
+}
+
+/*
+ * Return architecture name in a normalized form.
+ * The conversion logic comes from the Makefile.
+ */
+static const char *normalize_arch(char *arch)
+{
+ if (!strcmp(arch, "x86_64"))
+ return "x86";
+ if (arch[0] == 'i' && arch[2] == '8' && arch[3] == '6')
+ return "x86";
+ if (!strcmp(arch, "sun4u") || !strncmp(arch, "sparc", 5))
+ return "sparc";
+ if (!strncmp(arch, "arm", 3) || !strcmp(arch, "sa110"))
+ return "arm";
+ if (!strncmp(arch, "s390", 4))
+ return "s390";
+ if (!strncmp(arch, "parisc", 6))
+ return "parisc";
+ if (!strncmp(arch, "powerpc", 7) || !strncmp(arch, "ppc", 3))
+ return "powerpc";
+ if (!strncmp(arch, "mips", 4))
+ return "mips";
+ if (!strncmp(arch, "sh", 2) && isdigit(arch[2]))
+ return "sh";
+
+ return arch;
+}
+
+static int perf_session_env__lookup_binutils_path(struct perf_session_env *env,
+ const char *name,
+ const char **path)
+{
+ int idx;
+ const char *arch, *cross_env;
+ struct utsname uts;
+ const char *const *path_list;
+ char *buf = NULL;
+
+ arch = normalize_arch(env->arch);
+
+ if (uname(&uts) < 0)
+ goto out;
+
+ /*
+ * We don't need to try to find the objdump path for the native system.
+ * Just use the default binutils name (e.g. "objdump").
+ */
+ if (!strcmp(normalize_arch(uts.machine), arch))
+ goto out;
+
+ cross_env = getenv("CROSS_COMPILE");
+ if (cross_env) {
+ if (asprintf(&buf, "%s%s", cross_env, name) < 0)
+ goto out_error;
+ if (buf[0] == '/') {
+ if (access(buf, F_OK) == 0)
+ goto out;
+ goto out_error;
+ }
+ if (lookup_path(buf))
+ goto out;
+ zfree(&buf);
+ }
+
+ if (!strcmp(arch, "arm"))
+ path_list = arm_triplets;
+ else if (!strcmp(arch, "powerpc"))
+ path_list = powerpc_triplets;
+ else if (!strcmp(arch, "sh"))
+ path_list = sh_triplets;
+ else if (!strcmp(arch, "s390"))
+ path_list = s390_triplets;
+ else if (!strcmp(arch, "sparc"))
+ path_list = sparc_triplets;
+ else if (!strcmp(arch, "x86"))
+ path_list = x86_triplets;
+ else if (!strcmp(arch, "mips"))
+ path_list = mips_triplets;
+ else {
+ ui__error("binutils for %s not supported.\n", arch);
+ goto out_error;
+ }
+
+ idx = lookup_triplets(path_list, name);
+ if (idx < 0) {
+ ui__error("Please install %s for %s.\n"
+ "You can add it to PATH, set CROSS_COMPILE or "
+ "override the default using --%s.\n",
+ name, arch, name);
+ goto out_error;
+ }
+
+ if (asprintf(&buf, "%s%s", path_list[idx], name) < 0)
+ goto out_error;
+
+out:
+ *path = buf;
+ return 0;
+out_error:
+ free(buf);
+ *path = NULL;
+ return -1;
+}
+
+int perf_session_env__lookup_objdump(struct perf_session_env *env)
+{
+ /*
+ * For live mode, env->arch will be NULL and we can use
+ * the native objdump tool.
+ */
+ if (env->arch == NULL)
+ return 0;
+
+ return perf_session_env__lookup_binutils_path(env, "objdump",
+ &objdump_path);
+}
diff --git a/tools/perf/arch/common.h b/tools/perf/arch/common.h
new file mode 100644
index 00000000000..ede246eda9b
--- /dev/null
+++ b/tools/perf/arch/common.h
@@ -0,0 +1,10 @@
+#ifndef ARCH_PERF_COMMON_H
+#define ARCH_PERF_COMMON_H
+
+#include "../util/session.h"
+
+extern const char *objdump_path;
+
+int perf_session_env__lookup_objdump(struct perf_session_env *env);
+
+#endif /* ARCH_PERF_COMMON_H */
diff --git a/tools/perf/arch/powerpc/Makefile b/tools/perf/arch/powerpc/Makefile
new file mode 100644
index 00000000000..744e629797b
--- /dev/null
+++ b/tools/perf/arch/powerpc/Makefile
@@ -0,0 +1,5 @@
+ifndef NO_DWARF
+PERF_HAVE_DWARF_REGS := 1
+LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o
+endif
+LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/header.o
diff --git a/tools/perf/arch/powerpc/util/dwarf-regs.c b/tools/perf/arch/powerpc/util/dwarf-regs.c
new file mode 100644
index 00000000000..733151cdf46
--- /dev/null
+++ b/tools/perf/arch/powerpc/util/dwarf-regs.c
@@ -0,0 +1,88 @@
+/*
+ * Mapping of DWARF debug register numbers into register names.
+ *
+ * Copyright (C) 2010 Ian Munsie, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <stddef.h>
+#include <dwarf-regs.h>
+
+
+struct pt_regs_dwarfnum {
+ const char *name;
+ unsigned int dwarfnum;
+};
+
+#define STR(s) #s
+#define REG_DWARFNUM_NAME(r, num) {.name = r, .dwarfnum = num}
+#define GPR_DWARFNUM_NAME(num) \
+ {.name = STR(%gpr##num), .dwarfnum = num}
+#define REG_DWARFNUM_END {.name = NULL, .dwarfnum = 0}
+
+/*
+ * Reference:
+ * http://refspecs.linuxfoundation.org/ELF/ppc64/PPC-elf64abi-1.9.html
+ */
+static const struct pt_regs_dwarfnum regdwarfnum_table[] = {
+ GPR_DWARFNUM_NAME(0),
+ GPR_DWARFNUM_NAME(1),
+ GPR_DWARFNUM_NAME(2),
+ GPR_DWARFNUM_NAME(3),
+ GPR_DWARFNUM_NAME(4),
+ GPR_DWARFNUM_NAME(5),
+ GPR_DWARFNUM_NAME(6),
+ GPR_DWARFNUM_NAME(7),
+ GPR_DWARFNUM_NAME(8),
+ GPR_DWARFNUM_NAME(9),
+ GPR_DWARFNUM_NAME(10),
+ GPR_DWARFNUM_NAME(11),
+ GPR_DWARFNUM_NAME(12),
+ GPR_DWARFNUM_NAME(13),
+ GPR_DWARFNUM_NAME(14),
+ GPR_DWARFNUM_NAME(15),
+ GPR_DWARFNUM_NAME(16),
+ GPR_DWARFNUM_NAME(17),
+ GPR_DWARFNUM_NAME(18),
+ GPR_DWARFNUM_NAME(19),
+ GPR_DWARFNUM_NAME(20),
+ GPR_DWARFNUM_NAME(21),
+ GPR_DWARFNUM_NAME(22),
+ GPR_DWARFNUM_NAME(23),
+ GPR_DWARFNUM_NAME(24),
+ GPR_DWARFNUM_NAME(25),
+ GPR_DWARFNUM_NAME(26),
+ GPR_DWARFNUM_NAME(27),
+ GPR_DWARFNUM_NAME(28),
+ GPR_DWARFNUM_NAME(29),
+ GPR_DWARFNUM_NAME(30),
+ GPR_DWARFNUM_NAME(31),
+ REG_DWARFNUM_NAME("%msr", 66),
+ REG_DWARFNUM_NAME("%ctr", 109),
+ REG_DWARFNUM_NAME("%link", 108),
+ REG_DWARFNUM_NAME("%xer", 101),
+ REG_DWARFNUM_NAME("%dar", 119),
+ REG_DWARFNUM_NAME("%dsisr", 118),
+ REG_DWARFNUM_END,
+};
+
+/**
+ * get_arch_regstr() - look up register name from its DWARF register number
+ * @n: the DWARF register number
+ *
+ * get_arch_regstr() returns the name of the register in struct
+ * regdwarfnum_table from its DWARF register number. If the register is not
+ * found in the table, this returns NULL.
+ */
+const char *get_arch_regstr(unsigned int n)
+{
+ const struct pt_regs_dwarfnum *roff;
+ for (roff = regdwarfnum_table; roff->name != NULL; roff++)
+ if (roff->dwarfnum == n)
+ return roff->name;
+ return NULL;
+}
diff --git a/tools/perf/arch/powerpc/util/header.c b/tools/perf/arch/powerpc/util/header.c
new file mode 100644
index 00000000000..2f7073d107f
--- /dev/null
+++ b/tools/perf/arch/powerpc/util/header.c
@@ -0,0 +1,36 @@
+#include <sys/types.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../../util/header.h"
+
+#define __stringify_1(x) #x
+#define __stringify(x) __stringify_1(x)
+
+#define mfspr(rn) ({unsigned long rval; \
+ asm volatile("mfspr %0," __stringify(rn) \
+ : "=r" (rval)); rval; })
+
+#define SPRN_PVR 0x11F /* Processor Version Register */
+#define PVR_VER(pvr) (((pvr) >> 16) & 0xFFFF) /* Version field */
+#define PVR_REV(pvr) (((pvr) >> 0) & 0xFFFF) /* Revision field */
+
+int
+get_cpuid(char *buffer, size_t sz)
+{
+ unsigned long pvr;
+ int nb;
+
+ pvr = mfspr(SPRN_PVR);
+
+ nb = scnprintf(buffer, sz, "%lu,%lu$", PVR_VER(pvr), PVR_REV(pvr));
+
+ /* look for the end marker to ensure the entire data fits */
+ if (strchr(buffer, '$')) {
+ buffer[nb-1] = '\0';
+ return 0;
+ }
+ return -1;
+}
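
A worked example of the PVR decoding above, with a made-up PVR value (the real code reads it from hardware with mfspr): for pvr = 0x003F0201, PVR_VER() yields 63 and PVR_REV() yields 513, and stripping the '$' end marker leaves "63,513" in the buffer. The standalone sketch below replays just that arithmetic.

#include <stdio.h>
#include <string.h>

#define PVR_VER(pvr)	(((pvr) >> 16) & 0xFFFF)	/* Version field */
#define PVR_REV(pvr)	(((pvr) >>  0) & 0xFFFF)	/* Revision field */

int main(void)
{
	unsigned long pvr = 0x003F0201;	/* hypothetical PVR, not read from hardware */
	char buf[64];
	int nb;

	nb = snprintf(buf, sizeof(buf), "%lu,%lu$", PVR_VER(pvr), PVR_REV(pvr));
	if (strchr(buf, '$'))	/* end marker present: everything fit */
		buf[nb - 1] = '\0';
	printf("%s\n", buf);	/* prints "63,513" */
	return 0;
}
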
diff --git a/tools/perf/arch/s390/Makefile b/tools/perf/arch/s390/Makefile
new file mode 100644
index 00000000000..15130b50dfe
--- /dev/null
+++ b/tools/perf/arch/s390/Makefile
@@ -0,0 +1,4 @@
+ifndef NO_DWARF
+PERF_HAVE_DWARF_REGS := 1
+LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o
+endif
diff --git a/tools/perf/arch/s390/util/dwarf-regs.c b/tools/perf/arch/s390/util/dwarf-regs.c
new file mode 100644
index 00000000000..0469df02ee6
--- /dev/null
+++ b/tools/perf/arch/s390/util/dwarf-regs.c
@@ -0,0 +1,22 @@
+/*
+ * Mapping of DWARF debug register numbers into register names.
+ *
+ * Copyright IBM Corp. 2010
+ * Author(s): Heiko Carstens <heiko.carstens@de.ibm.com>,
+ *
+ */
+
+#include <stddef.h>
+#include <dwarf-regs.h>
+
+#define NUM_GPRS 16
+
+static const char *gpr_names[NUM_GPRS] = {
+ "%r0", "%r1", "%r2", "%r3", "%r4", "%r5", "%r6", "%r7",
+ "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
+};
+
+const char *get_arch_regstr(unsigned int n)
+{
+ return (n >= NUM_GPRS) ? NULL : gpr_names[n];
+}
diff --git a/tools/perf/arch/sh/Makefile b/tools/perf/arch/sh/Makefile
new file mode 100644
index 00000000000..15130b50dfe
--- /dev/null
+++ b/tools/perf/arch/sh/Makefile
@@ -0,0 +1,4 @@
+ifndef NO_DWARF
+PERF_HAVE_DWARF_REGS := 1
+LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o
+endif
diff --git a/tools/perf/arch/sh/util/dwarf-regs.c b/tools/perf/arch/sh/util/dwarf-regs.c
new file mode 100644
index 00000000000..0d0897f57a1
--- /dev/null
+++ b/tools/perf/arch/sh/util/dwarf-regs.c
@@ -0,0 +1,55 @@
+/*
+ * Mapping of DWARF debug register numbers into register names.
+ *
+ * Copyright (C) 2010 Matt Fleming <matt@console-pimps.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+
+#include <stddef.h>
+#include <dwarf-regs.h>
+
+/*
+ * Generic dwarf analysis helpers
+ */
+
+#define SH_MAX_REGS 18
+const char *sh_regs_table[SH_MAX_REGS] = {
+ "r0",
+ "r1",
+ "r2",
+ "r3",
+ "r4",
+ "r5",
+ "r6",
+ "r7",
+ "r8",
+ "r9",
+ "r10",
+ "r11",
+ "r12",
+ "r13",
+ "r14",
+ "r15",
+ "pc",
+ "pr",
+};
+
+/* Return architecture dependent register string (for kprobe-tracer) */
+const char *get_arch_regstr(unsigned int n)
+{
+ return (n < SH_MAX_REGS) ? sh_regs_table[n] : NULL;
+}
diff --git a/tools/perf/arch/sparc/Makefile b/tools/perf/arch/sparc/Makefile
new file mode 100644
index 00000000000..15130b50dfe
--- /dev/null
+++ b/tools/perf/arch/sparc/Makefile
@@ -0,0 +1,4 @@
+ifndef NO_DWARF
+PERF_HAVE_DWARF_REGS := 1
+LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o
+endif
diff --git a/tools/perf/arch/sparc/util/dwarf-regs.c b/tools/perf/arch/sparc/util/dwarf-regs.c
new file mode 100644
index 00000000000..92eda412fed
--- /dev/null
+++ b/tools/perf/arch/sparc/util/dwarf-regs.c
@@ -0,0 +1,43 @@
+/*
+ * Mapping of DWARF debug register numbers into register names.
+ *
+ * Copyright (C) 2010 David S. Miller <davem@davemloft.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <stddef.h>
+#include <dwarf-regs.h>
+
+#define SPARC_MAX_REGS 96
+
+const char *sparc_regs_table[SPARC_MAX_REGS] = {
+ "%g0", "%g1", "%g2", "%g3", "%g4", "%g5", "%g6", "%g7",
+ "%o0", "%o1", "%o2", "%o3", "%o4", "%o5", "%sp", "%o7",
+ "%l0", "%l1", "%l2", "%l3", "%l4", "%l5", "%l6", "%l7",
+ "%i0", "%i1", "%i2", "%i3", "%i4", "%i5", "%fp", "%i7",
+ "%f0", "%f1", "%f2", "%f3", "%f4", "%f5", "%f6", "%f7",
+ "%f8", "%f9", "%f10", "%f11", "%f12", "%f13", "%f14", "%f15",
+ "%f16", "%f17", "%f18", "%f19", "%f20", "%f21", "%f22", "%f23",
+ "%f24", "%f25", "%f26", "%f27", "%f28", "%f29", "%f30", "%f31",
+ "%f32", "%f33", "%f34", "%f35", "%f36", "%f37", "%f38", "%f39",
+ "%f40", "%f41", "%f42", "%f43", "%f44", "%f45", "%f46", "%f47",
+ "%f48", "%f49", "%f50", "%f51", "%f52", "%f53", "%f54", "%f55",
+ "%f56", "%f57", "%f58", "%f59", "%f60", "%f61", "%f62", "%f63",
+};
+
+/**
+ * get_arch_regstr() - look up register name from its DWARF register number
+ * @n: the DWARF register number
+ *
+ * get_arch_regstr() returns the name of the register in
+ * sparc_regs_table from its DWARF register number. If the register is not
+ * found in the table, this returns NULL.
+ */
+const char *get_arch_regstr(unsigned int n)
+{
+ return (n < SPARC_MAX_REGS) ? sparc_regs_table[n] : NULL;
+}
diff --git a/tools/perf/arch/x86/Makefile b/tools/perf/arch/x86/Makefile
new file mode 100644
index 00000000000..1641542e363
--- /dev/null
+++ b/tools/perf/arch/x86/Makefile
@@ -0,0 +1,17 @@
+ifndef NO_DWARF
+PERF_HAVE_DWARF_REGS := 1
+LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o
+endif
+ifndef NO_LIBUNWIND
+LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/unwind-libunwind.o
+endif
+ifndef NO_LIBDW_DWARF_UNWIND
+LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/unwind-libdw.o
+endif
+ifndef NO_DWARF_UNWIND
+LIB_OBJS += $(OUTPUT)arch/$(ARCH)/tests/regs_load.o
+LIB_OBJS += $(OUTPUT)arch/$(ARCH)/tests/dwarf-unwind.o
+endif
+LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/header.o
+LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/tsc.o
+LIB_H += arch/$(ARCH)/util/tsc.h
diff --git a/tools/perf/arch/x86/include/perf_regs.h b/tools/perf/arch/x86/include/perf_regs.h
new file mode 100644
index 00000000000..7df517acfef
--- /dev/null
+++ b/tools/perf/arch/x86/include/perf_regs.h
@@ -0,0 +1,86 @@
+#ifndef ARCH_PERF_REGS_H
+#define ARCH_PERF_REGS_H
+
+#include <stdlib.h>
+#include <linux/types.h>
+#include <asm/perf_regs.h>
+
+void perf_regs_load(u64 *regs);
+
+#ifndef HAVE_ARCH_X86_64_SUPPORT
+#define PERF_REGS_MASK ((1ULL << PERF_REG_X86_32_MAX) - 1)
+#define PERF_REGS_MAX PERF_REG_X86_32_MAX
+#define PERF_SAMPLE_REGS_ABI PERF_SAMPLE_REGS_ABI_32
+#else
+#define REG_NOSUPPORT ((1ULL << PERF_REG_X86_DS) | \
+ (1ULL << PERF_REG_X86_ES) | \
+ (1ULL << PERF_REG_X86_FS) | \
+ (1ULL << PERF_REG_X86_GS))
+#define PERF_REGS_MASK (((1ULL << PERF_REG_X86_64_MAX) - 1) & ~REG_NOSUPPORT)
+#define PERF_REGS_MAX PERF_REG_X86_64_MAX
+#define PERF_SAMPLE_REGS_ABI PERF_SAMPLE_REGS_ABI_64
+#endif
+#define PERF_REG_IP PERF_REG_X86_IP
+#define PERF_REG_SP PERF_REG_X86_SP
+
+static inline const char *perf_reg_name(int id)
+{
+ switch (id) {
+ case PERF_REG_X86_AX:
+ return "AX";
+ case PERF_REG_X86_BX:
+ return "BX";
+ case PERF_REG_X86_CX:
+ return "CX";
+ case PERF_REG_X86_DX:
+ return "DX";
+ case PERF_REG_X86_SI:
+ return "SI";
+ case PERF_REG_X86_DI:
+ return "DI";
+ case PERF_REG_X86_BP:
+ return "BP";
+ case PERF_REG_X86_SP:
+ return "SP";
+ case PERF_REG_X86_IP:
+ return "IP";
+ case PERF_REG_X86_FLAGS:
+ return "FLAGS";
+ case PERF_REG_X86_CS:
+ return "CS";
+ case PERF_REG_X86_SS:
+ return "SS";
+ case PERF_REG_X86_DS:
+ return "DS";
+ case PERF_REG_X86_ES:
+ return "ES";
+ case PERF_REG_X86_FS:
+ return "FS";
+ case PERF_REG_X86_GS:
+ return "GS";
+#ifdef HAVE_ARCH_X86_64_SUPPORT
+ case PERF_REG_X86_R8:
+ return "R8";
+ case PERF_REG_X86_R9:
+ return "R9";
+ case PERF_REG_X86_R10:
+ return "R10";
+ case PERF_REG_X86_R11:
+ return "R11";
+ case PERF_REG_X86_R12:
+ return "R12";
+ case PERF_REG_X86_R13:
+ return "R13";
+ case PERF_REG_X86_R14:
+ return "R14";
+ case PERF_REG_X86_R15:
+ return "R15";
+#endif /* HAVE_ARCH_X86_64_SUPPORT */
+ default:
+ return NULL;
+ }
+
+ return NULL;
+}
+
+#endif /* ARCH_PERF_REGS_H */
diff --git a/tools/perf/arch/x86/tests/dwarf-unwind.c b/tools/perf/arch/x86/tests/dwarf-unwind.c
new file mode 100644
index 00000000000..9f89f899ccc
--- /dev/null
+++ b/tools/perf/arch/x86/tests/dwarf-unwind.c
@@ -0,0 +1,60 @@
+#include <string.h>
+#include "perf_regs.h"
+#include "thread.h"
+#include "map.h"
+#include "event.h"
+#include "tests/tests.h"
+
+#define STACK_SIZE 8192
+
+static int sample_ustack(struct perf_sample *sample,
+ struct thread *thread, u64 *regs)
+{
+ struct stack_dump *stack = &sample->user_stack;
+ struct map *map;
+ unsigned long sp;
+ u64 stack_size, *buf;
+
+ buf = malloc(STACK_SIZE);
+ if (!buf) {
+ pr_debug("failed to allocate sample uregs data\n");
+ return -1;
+ }
+
+ sp = (unsigned long) regs[PERF_REG_X86_SP];
+
+ map = map_groups__find(thread->mg, MAP__VARIABLE, (u64) sp);
+ if (!map) {
+ pr_debug("failed to get stack map\n");
+ free(buf);
+ return -1;
+ }
+
+ stack_size = map->end - sp;
+ stack_size = stack_size > STACK_SIZE ? STACK_SIZE : stack_size;
+
+ memcpy(buf, (void *) sp, stack_size);
+ stack->data = (char *) buf;
+ stack->size = stack_size;
+ return 0;
+}
+
+int test__arch_unwind_sample(struct perf_sample *sample,
+ struct thread *thread)
+{
+ struct regs_dump *regs = &sample->user_regs;
+ u64 *buf;
+
+ buf = malloc(sizeof(u64) * PERF_REGS_MAX);
+ if (!buf) {
+ pr_debug("failed to allocate sample uregs data\n");
+ return -1;
+ }
+
+ perf_regs_load(buf);
+ regs->abi = PERF_SAMPLE_REGS_ABI;
+ regs->regs = buf;
+ regs->mask = PERF_REGS_MASK;
+
+ return sample_ustack(sample, thread, buf);
+}
diff --git a/tools/perf/arch/x86/tests/regs_load.S b/tools/perf/arch/x86/tests/regs_load.S
new file mode 100644
index 00000000000..60875d5c556
--- /dev/null
+++ b/tools/perf/arch/x86/tests/regs_load.S
@@ -0,0 +1,98 @@
+#include <linux/linkage.h>
+
+#define AX 0
+#define BX 1 * 8
+#define CX 2 * 8
+#define DX 3 * 8
+#define SI 4 * 8
+#define DI 5 * 8
+#define BP 6 * 8
+#define SP 7 * 8
+#define IP 8 * 8
+#define FLAGS 9 * 8
+#define CS 10 * 8
+#define SS 11 * 8
+#define DS 12 * 8
+#define ES 13 * 8
+#define FS 14 * 8
+#define GS 15 * 8
+#define R8 16 * 8
+#define R9 17 * 8
+#define R10 18 * 8
+#define R11 19 * 8
+#define R12 20 * 8
+#define R13 21 * 8
+#define R14 22 * 8
+#define R15 23 * 8
+
+.text
+#ifdef HAVE_ARCH_X86_64_SUPPORT
+ENTRY(perf_regs_load)
+ movq %rax, AX(%rdi)
+ movq %rbx, BX(%rdi)
+ movq %rcx, CX(%rdi)
+ movq %rdx, DX(%rdi)
+ movq %rsi, SI(%rdi)
+ movq %rdi, DI(%rdi)
+ movq %rbp, BP(%rdi)
+
+ leaq 8(%rsp), %rax /* exclude this call. */
+ movq %rax, SP(%rdi)
+
+ movq 0(%rsp), %rax
+ movq %rax, IP(%rdi)
+
+ movq $0, FLAGS(%rdi)
+ movq $0, CS(%rdi)
+ movq $0, SS(%rdi)
+ movq $0, DS(%rdi)
+ movq $0, ES(%rdi)
+ movq $0, FS(%rdi)
+ movq $0, GS(%rdi)
+
+ movq %r8, R8(%rdi)
+ movq %r9, R9(%rdi)
+ movq %r10, R10(%rdi)
+ movq %r11, R11(%rdi)
+ movq %r12, R12(%rdi)
+ movq %r13, R13(%rdi)
+ movq %r14, R14(%rdi)
+ movq %r15, R15(%rdi)
+ ret
+ENDPROC(perf_regs_load)
+#else
+ENTRY(perf_regs_load)
+ push %edi
+ movl 8(%esp), %edi
+ movl %eax, AX(%edi)
+ movl %ebx, BX(%edi)
+ movl %ecx, CX(%edi)
+ movl %edx, DX(%edi)
+ movl %esi, SI(%edi)
+ pop %eax
+ movl %eax, DI(%edi)
+ movl %ebp, BP(%edi)
+
+ leal 4(%esp), %eax /* exclude this call. */
+ movl %eax, SP(%edi)
+
+ movl 0(%esp), %eax
+ movl %eax, IP(%edi)
+
+ movl $0, FLAGS(%edi)
+ movl $0, CS(%edi)
+ movl $0, SS(%edi)
+ movl $0, DS(%edi)
+ movl $0, ES(%edi)
+ movl $0, FS(%edi)
+ movl $0, GS(%edi)
+ ret
+ENDPROC(perf_regs_load)
+#endif
+
+/*
+ * We need to provide a .note.GNU-stack section, saying that we do not
+ * want an executable stack. Otherwise the final link will assume that
+ * the ELF stack should not be restricted at all and mark it RWX.
+ */
+.section .note.GNU-stack,"",@progbits
diff --git a/tools/perf/arch/x86/util/dwarf-regs.c b/tools/perf/arch/x86/util/dwarf-regs.c
new file mode 100644
index 00000000000..be22dd46323
--- /dev/null
+++ b/tools/perf/arch/x86/util/dwarf-regs.c
@@ -0,0 +1,75 @@
+/*
+ * dwarf-regs.c : Mapping of DWARF debug register numbers into register names.
+ * Extracted from probe-finder.c
+ *
+ * Written by Masami Hiramatsu <mhiramat@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+
+#include <stddef.h>
+#include <dwarf-regs.h>
+
+/*
+ * Generic dwarf analysis helpers
+ */
+
+#define X86_32_MAX_REGS 8
+const char *x86_32_regs_table[X86_32_MAX_REGS] = {
+ "%ax",
+ "%cx",
+ "%dx",
+ "%bx",
+ "$stack", /* Stack address instead of %sp */
+ "%bp",
+ "%si",
+ "%di",
+};
+
+#define X86_64_MAX_REGS 16
+const char *x86_64_regs_table[X86_64_MAX_REGS] = {
+ "%ax",
+ "%dx",
+ "%cx",
+ "%bx",
+ "%si",
+ "%di",
+ "%bp",
+ "%sp",
+ "%r8",
+ "%r9",
+ "%r10",
+ "%r11",
+ "%r12",
+ "%r13",
+ "%r14",
+ "%r15",
+};
+
+/* TODO: switching by dwarf address size */
+#ifdef __x86_64__
+#define ARCH_MAX_REGS X86_64_MAX_REGS
+#define arch_regs_table x86_64_regs_table
+#else
+#define ARCH_MAX_REGS X86_32_MAX_REGS
+#define arch_regs_table x86_32_regs_table
+#endif
+
+/* Return architecture dependent register string (for kprobe-tracer) */
+const char *get_arch_regstr(unsigned int n)
+{
+ return (n < ARCH_MAX_REGS) ? arch_regs_table[n] : NULL;
+}
diff --git a/tools/perf/arch/x86/util/header.c b/tools/perf/arch/x86/util/header.c
new file mode 100644
index 00000000000..146d12a1cec
--- /dev/null
+++ b/tools/perf/arch/x86/util/header.c
@@ -0,0 +1,59 @@
+#include <sys/types.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "../../util/header.h"
+
+static inline void
+cpuid(unsigned int op, unsigned int *a, unsigned int *b, unsigned int *c,
+ unsigned int *d)
+{
+ __asm__ __volatile__ (".byte 0x53\n\tcpuid\n\t"
+ "movl %%ebx, %%esi\n\t.byte 0x5b"
+ : "=a" (*a),
+ "=S" (*b),
+ "=c" (*c),
+ "=d" (*d)
+ : "a" (op));
+}
+
+int
+get_cpuid(char *buffer, size_t sz)
+{
+ unsigned int a, b, c, d, lvl;
+ int family = -1, model = -1, step = -1;
+ int nb;
+ char vendor[16];
+
+ cpuid(0, &lvl, &b, &c, &d);
+ strncpy(&vendor[0], (char *)(&b), 4);
+ strncpy(&vendor[4], (char *)(&d), 4);
+ strncpy(&vendor[8], (char *)(&c), 4);
+ vendor[12] = '\0';
+
+ if (lvl >= 1) {
+ cpuid(1, &a, &b, &c, &d);
+
+ family = (a >> 8) & 0xf; /* bits 11 - 8 */
+ model = (a >> 4) & 0xf; /* bits 7 - 4 */
+ step = a & 0xf;
+
+ /* extended family */
+ if (family == 0xf)
+ family += (a >> 20) & 0xff;
+
+ /* extended model */
+ if (family >= 0x6)
+ model += ((a >> 16) & 0xf) << 4;
+ }
+ nb = scnprintf(buffer, sz, "%s,%u,%u,%u$", vendor, family, model, step);
+
+ /* look for the end marker to ensure the entire data fits */
+ if (strchr(buffer, '$')) {
+ buffer[nb-1] = '\0';
+ return 0;
+ }
+ return -1;
+}
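
A worked example of the family/model/stepping arithmetic above, using 0x000306A9 as a sample CPUID leaf-1 EAX value: the base family is 6 and the base model is 0xA; because family >= 6 the extended model bits (0x3) are shifted in, giving model 0x3A (58) and stepping 9. The vendor string comes from EBX/EDX/ECX, so get_cpuid() would then produce something like "GenuineIntel,6,58,9". The standalone sketch below replays only the bit arithmetic.

#include <stdio.h>

/* Re-derive family/model/stepping from a sample CPUID.1 EAX value. */
int main(void)
{
	unsigned int a = 0x000306A9;	/* example EAX value from CPUID leaf 1 */
	int family, model, step;

	family = (a >> 8) & 0xf;	/* 6 */
	model  = (a >> 4) & 0xf;	/* 0xA */
	step   = a & 0xf;		/* 9 */

	if (family == 0xf)		/* extended family, not taken here */
		family += (a >> 20) & 0xff;
	if (family >= 0x6)		/* extended model: 0x3 << 4 is added */
		model += ((a >> 16) & 0xf) << 4;

	printf("family=%d model=%d stepping=%d\n", family, model, step);	/* 6 58 9 */
	return 0;
}
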
diff --git a/tools/perf/arch/x86/util/tsc.c b/tools/perf/arch/x86/util/tsc.c
new file mode 100644
index 00000000000..40021fa3129
--- /dev/null
+++ b/tools/perf/arch/x86/util/tsc.c
@@ -0,0 +1,59 @@
+#include <stdbool.h>
+#include <errno.h>
+
+#include <linux/perf_event.h>
+
+#include "../../perf.h"
+#include <linux/types.h>
+#include "../../util/debug.h"
+#include "tsc.h"
+
+u64 perf_time_to_tsc(u64 ns, struct perf_tsc_conversion *tc)
+{
+ u64 t, quot, rem;
+
+ t = ns - tc->time_zero;
+ quot = t / tc->time_mult;
+ rem = t % tc->time_mult;
+ return (quot << tc->time_shift) +
+ (rem << tc->time_shift) / tc->time_mult;
+}
+
+u64 tsc_to_perf_time(u64 cyc, struct perf_tsc_conversion *tc)
+{
+ u64 quot, rem;
+
+ quot = cyc >> tc->time_shift;
+ rem = cyc & ((1 << tc->time_shift) - 1);
+ return tc->time_zero + quot * tc->time_mult +
+ ((rem * tc->time_mult) >> tc->time_shift);
+}
+
+int perf_read_tsc_conversion(const struct perf_event_mmap_page *pc,
+ struct perf_tsc_conversion *tc)
+{
+ bool cap_user_time_zero;
+ u32 seq;
+ int i = 0;
+
+ while (1) {
+ seq = pc->lock;
+ rmb();
+ tc->time_mult = pc->time_mult;
+ tc->time_shift = pc->time_shift;
+ tc->time_zero = pc->time_zero;
+ cap_user_time_zero = pc->cap_user_time_zero;
+ rmb();
+ if (pc->lock == seq && !(seq & 1))
+ break;
+ if (++i > 10000) {
+ pr_debug("failed to get perf_event_mmap_page lock\n");
+ return -EINVAL;
+ }
+ }
+
+ if (!cap_user_time_zero)
+ return -EOPNOTSUPP;
+
+ return 0;
+}
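
To see what the two conversions above compute, here is a minimal standalone round-trip using made-up time_shift/time_mult/time_zero values purely for illustration (real values are read from the perf_event mmap page by perf_read_tsc_conversion()). The small truncation error visible in the output is inherent to the integer arithmetic.

#include <stdio.h>

typedef unsigned long long u64;

struct tsc_conv {		/* mirrors struct perf_tsc_conversion above */
	unsigned short time_shift;
	unsigned int   time_mult;
	u64            time_zero;
};

static u64 time_to_tsc(u64 ns, const struct tsc_conv *tc)
{
	u64 t = ns - tc->time_zero;
	u64 quot = t / tc->time_mult;
	u64 rem  = t % tc->time_mult;

	return (quot << tc->time_shift) + (rem << tc->time_shift) / tc->time_mult;
}

static u64 tsc_to_time(u64 cyc, const struct tsc_conv *tc)
{
	u64 quot = cyc >> tc->time_shift;
	u64 rem  = cyc & ((1ULL << tc->time_shift) - 1);

	return tc->time_zero + quot * tc->time_mult +
	       ((rem * tc->time_mult) >> tc->time_shift);
}

int main(void)
{
	/* Hypothetical conversion parameters, not taken from real hardware. */
	struct tsc_conv tc = { .time_shift = 10, .time_mult = 342, .time_zero = 1000000 };
	u64 ns = 123456789;
	u64 cyc = time_to_tsc(ns, &tc);

	printf("ns=%llu -> tsc=%llu -> ns=%llu\n", ns, cyc, tsc_to_time(cyc, &tc));
	return 0;
}
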
diff --git a/tools/perf/arch/x86/util/tsc.h b/tools/perf/arch/x86/util/tsc.h
new file mode 100644
index 00000000000..2affe0366b5
--- /dev/null
+++ b/tools/perf/arch/x86/util/tsc.h
@@ -0,0 +1,20 @@
+#ifndef TOOLS_PERF_ARCH_X86_UTIL_TSC_H__
+#define TOOLS_PERF_ARCH_X86_UTIL_TSC_H__
+
+#include <linux/types.h>
+
+struct perf_tsc_conversion {
+ u16 time_shift;
+ u32 time_mult;
+ u64 time_zero;
+};
+
+struct perf_event_mmap_page;
+
+int perf_read_tsc_conversion(const struct perf_event_mmap_page *pc,
+ struct perf_tsc_conversion *tc);
+
+u64 perf_time_to_tsc(u64 ns, struct perf_tsc_conversion *tc);
+u64 tsc_to_perf_time(u64 cyc, struct perf_tsc_conversion *tc);
+
+#endif /* TOOLS_PERF_ARCH_X86_UTIL_TSC_H__ */
diff --git a/tools/perf/arch/x86/util/unwind-libdw.c b/tools/perf/arch/x86/util/unwind-libdw.c
new file mode 100644
index 00000000000..c4b72176ca8
--- /dev/null
+++ b/tools/perf/arch/x86/util/unwind-libdw.c
@@ -0,0 +1,51 @@
+#include <elfutils/libdwfl.h>
+#include "../../util/unwind-libdw.h"
+#include "../../util/perf_regs.h"
+
+bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg)
+{
+ struct unwind_info *ui = arg;
+ struct regs_dump *user_regs = &ui->sample->user_regs;
+ Dwarf_Word dwarf_regs[17];
+ unsigned nregs;
+
+#define REG(r) ({ \
+ Dwarf_Word val = 0; \
+ perf_reg_value(&val, user_regs, PERF_REG_X86_##r); \
+ val; \
+})
+
+ if (user_regs->abi == PERF_SAMPLE_REGS_ABI_32) {
+ dwarf_regs[0] = REG(AX);
+ dwarf_regs[1] = REG(CX);
+ dwarf_regs[2] = REG(DX);
+ dwarf_regs[3] = REG(BX);
+ dwarf_regs[4] = REG(SP);
+ dwarf_regs[5] = REG(BP);
+ dwarf_regs[6] = REG(SI);
+ dwarf_regs[7] = REG(DI);
+ dwarf_regs[8] = REG(IP);
+ nregs = 9;
+ } else {
+ dwarf_regs[0] = REG(AX);
+ dwarf_regs[1] = REG(DX);
+ dwarf_regs[2] = REG(CX);
+ dwarf_regs[3] = REG(BX);
+ dwarf_regs[4] = REG(SI);
+ dwarf_regs[5] = REG(DI);
+ dwarf_regs[6] = REG(BP);
+ dwarf_regs[7] = REG(SP);
+ dwarf_regs[8] = REG(R8);
+ dwarf_regs[9] = REG(R9);
+ dwarf_regs[10] = REG(R10);
+ dwarf_regs[11] = REG(R11);
+ dwarf_regs[12] = REG(R12);
+ dwarf_regs[13] = REG(R13);
+ dwarf_regs[14] = REG(R14);
+ dwarf_regs[15] = REG(R15);
+ dwarf_regs[16] = REG(IP);
+ nregs = 17;
+ }
+
+ return dwfl_thread_state_registers(thread, 0, nregs, dwarf_regs);
+}
diff --git a/tools/perf/arch/x86/util/unwind-libunwind.c b/tools/perf/arch/x86/util/unwind-libunwind.c
new file mode 100644
index 00000000000..3261f68c6a7
--- /dev/null
+++ b/tools/perf/arch/x86/util/unwind-libunwind.c
@@ -0,0 +1,111 @@
+
+#include <errno.h>
+#include <libunwind.h>
+#include "perf_regs.h"
+#include "../../util/unwind.h"
+
+#ifdef HAVE_ARCH_X86_64_SUPPORT
+int libunwind__arch_reg_id(int regnum)
+{
+ int id;
+
+ switch (regnum) {
+ case UNW_X86_64_RAX:
+ id = PERF_REG_X86_AX;
+ break;
+ case UNW_X86_64_RDX:
+ id = PERF_REG_X86_DX;
+ break;
+ case UNW_X86_64_RCX:
+ id = PERF_REG_X86_CX;
+ break;
+ case UNW_X86_64_RBX:
+ id = PERF_REG_X86_BX;
+ break;
+ case UNW_X86_64_RSI:
+ id = PERF_REG_X86_SI;
+ break;
+ case UNW_X86_64_RDI:
+ id = PERF_REG_X86_DI;
+ break;
+ case UNW_X86_64_RBP:
+ id = PERF_REG_X86_BP;
+ break;
+ case UNW_X86_64_RSP:
+ id = PERF_REG_X86_SP;
+ break;
+ case UNW_X86_64_R8:
+ id = PERF_REG_X86_R8;
+ break;
+ case UNW_X86_64_R9:
+ id = PERF_REG_X86_R9;
+ break;
+ case UNW_X86_64_R10:
+ id = PERF_REG_X86_R10;
+ break;
+ case UNW_X86_64_R11:
+ id = PERF_REG_X86_R11;
+ break;
+ case UNW_X86_64_R12:
+ id = PERF_REG_X86_R12;
+ break;
+ case UNW_X86_64_R13:
+ id = PERF_REG_X86_R13;
+ break;
+ case UNW_X86_64_R14:
+ id = PERF_REG_X86_R14;
+ break;
+ case UNW_X86_64_R15:
+ id = PERF_REG_X86_R15;
+ break;
+ case UNW_X86_64_RIP:
+ id = PERF_REG_X86_IP;
+ break;
+ default:
+ pr_err("unwind: invalid reg id %d\n", regnum);
+ return -EINVAL;
+ }
+
+ return id;
+}
+#else
+int libunwind__arch_reg_id(int regnum)
+{
+ int id;
+
+ switch (regnum) {
+ case UNW_X86_EAX:
+ id = PERF_REG_X86_AX;
+ break;
+ case UNW_X86_EDX:
+ id = PERF_REG_X86_DX;
+ break;
+ case UNW_X86_ECX:
+ id = PERF_REG_X86_CX;
+ break;
+ case UNW_X86_EBX:
+ id = PERF_REG_X86_BX;
+ break;
+ case UNW_X86_ESI:
+ id = PERF_REG_X86_SI;
+ break;
+ case UNW_X86_EDI:
+ id = PERF_REG_X86_DI;
+ break;
+ case UNW_X86_EBP:
+ id = PERF_REG_X86_BP;
+ break;
+ case UNW_X86_ESP:
+ id = PERF_REG_X86_SP;
+ break;
+ case UNW_X86_EIP:
+ id = PERF_REG_X86_IP;
+ break;
+ default:
+ pr_err("unwind: invalid reg id %d\n", regnum);
+ return -EINVAL;
+ }
+
+ return id;
+}
+#endif /* HAVE_ARCH_X86_64_SUPPORT */
diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h
index f7781c6267c..eba46709b27 100644
--- a/tools/perf/bench/bench.h
+++ b/tools/perf/bench/bench.h
@@ -1,9 +1,39 @@
#ifndef BENCH_H
#define BENCH_H
+/*
+ * The madvise transparent hugepage constants were added in glibc
+ * 2.13. For compatibility with older versions of glibc, define these
+ * tokens if they are not already defined.
+ *
+ * PA-RISC uses different madvise values from other architectures and
+ * needs to be special-cased.
+ */
+#ifdef __hppa__
+# ifndef MADV_HUGEPAGE
+# define MADV_HUGEPAGE 67
+# endif
+# ifndef MADV_NOHUGEPAGE
+# define MADV_NOHUGEPAGE 68
+# endif
+#else
+# ifndef MADV_HUGEPAGE
+# define MADV_HUGEPAGE 14
+# endif
+# ifndef MADV_NOHUGEPAGE
+# define MADV_NOHUGEPAGE 15
+# endif
+#endif
+
+extern int bench_numa(int argc, const char **argv, const char *prefix);
extern int bench_sched_messaging(int argc, const char **argv, const char *prefix);
extern int bench_sched_pipe(int argc, const char **argv, const char *prefix);
-extern int bench_mem_memcpy(int argc, const char **argv, const char *prefix __used);
+extern int bench_mem_memcpy(int argc, const char **argv,
+ const char *prefix __maybe_unused);
+extern int bench_mem_memset(int argc, const char **argv, const char *prefix);
+extern int bench_futex_hash(int argc, const char **argv, const char *prefix);
+extern int bench_futex_wake(int argc, const char **argv, const char *prefix);
+extern int bench_futex_requeue(int argc, const char **argv, const char *prefix);
#define BENCH_FORMAT_DEFAULT_STR "default"
#define BENCH_FORMAT_DEFAULT 0
diff --git a/tools/perf/bench/futex-hash.c b/tools/perf/bench/futex-hash.c
new file mode 100644
index 00000000000..a84206e9c4a
--- /dev/null
+++ b/tools/perf/bench/futex-hash.c
@@ -0,0 +1,212 @@
+/*
+ * Copyright (C) 2013 Davidlohr Bueso <davidlohr@hp.com>
+ *
+ * futex-hash: Stress the hell out of the Linux kernel futex uaddr hashing.
+ *
+ * This program is particularly useful for measuring the kernel's futex hash
+ * table/function implementation. In order for it to make sense, use with as
+ * many threads and futexes as possible.
+ */
+
+#include "../perf.h"
+#include "../util/util.h"
+#include "../util/stat.h"
+#include "../util/parse-options.h"
+#include "../util/header.h"
+#include "bench.h"
+#include "futex.h"
+
+#include <err.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <pthread.h>
+
+static unsigned int nthreads = 0;
+static unsigned int nsecs = 10;
+/* amount of futexes per thread */
+static unsigned int nfutexes = 1024;
+static bool fshared = false, done = false, silent = false;
+
+struct timeval start, end, runtime;
+static pthread_mutex_t thread_lock;
+static unsigned int threads_starting;
+static struct stats throughput_stats;
+static pthread_cond_t thread_parent, thread_worker;
+
+struct worker {
+ int tid;
+ u_int32_t *futex;
+ pthread_t thread;
+ unsigned long ops;
+};
+
+static const struct option options[] = {
+ OPT_UINTEGER('t', "threads", &nthreads, "Specify amount of threads"),
+ OPT_UINTEGER('r', "runtime", &nsecs, "Specify runtime (in seconds)"),
+ OPT_UINTEGER('f', "futexes", &nfutexes, "Specify amount of futexes per threads"),
+ OPT_BOOLEAN( 's', "silent", &silent, "Silent mode: do not display data/details"),
+ OPT_BOOLEAN( 'S', "shared", &fshared, "Use shared futexes instead of private ones"),
+ OPT_END()
+};
+
+static const char * const bench_futex_hash_usage[] = {
+ "perf bench futex hash <options>",
+ NULL
+};
+
+static void *workerfn(void *arg)
+{
+ int ret;
+ unsigned int i;
+ struct worker *w = (struct worker *) arg;
+
+ pthread_mutex_lock(&thread_lock);
+ threads_starting--;
+ if (!threads_starting)
+ pthread_cond_signal(&thread_parent);
+ pthread_cond_wait(&thread_worker, &thread_lock);
+ pthread_mutex_unlock(&thread_lock);
+
+ do {
+ for (i = 0; i < nfutexes; i++, w->ops++) {
+ /*
+ * We want the futex calls to fail in order to stress
+ * the hashing of uaddr and not measure other steps,
+ * such as internal waitqueue handling, thus enlarging
+ * the critical region protected by hb->lock.
+ */
+ ret = futex_wait(&w->futex[i], 1234, NULL,
+ fshared ? 0 : FUTEX_PRIVATE_FLAG);
+ if (!silent &&
+ (!ret || (errno != EAGAIN && errno != EWOULDBLOCK)))
+ warn("Non-expected futex return call");
+ }
+ } while (!done);
+
+ return NULL;
+}
+
+static void toggle_done(int sig __maybe_unused,
+ siginfo_t *info __maybe_unused,
+ void *uc __maybe_unused)
+{
+ /* inform all threads that we're done for the day */
+ done = true;
+ gettimeofday(&end, NULL);
+ timersub(&end, &start, &runtime);
+}
+
+static void print_summary(void)
+{
+ unsigned long avg = avg_stats(&throughput_stats);
+ double stddev = stddev_stats(&throughput_stats);
+
+ printf("%sAveraged %ld operations/sec (+- %.2f%%), total secs = %d\n",
+ !silent ? "\n" : "", avg, rel_stddev_stats(stddev, avg),
+ (int) runtime.tv_sec);
+}
+
+int bench_futex_hash(int argc, const char **argv,
+ const char *prefix __maybe_unused)
+{
+ int ret = 0;
+ cpu_set_t cpu;
+ struct sigaction act;
+ unsigned int i, ncpus;
+ pthread_attr_t thread_attr;
+ struct worker *worker = NULL;
+
+ argc = parse_options(argc, argv, options, bench_futex_hash_usage, 0);
+ if (argc) {
+ usage_with_options(bench_futex_hash_usage, options);
+ exit(EXIT_FAILURE);
+ }
+
+ ncpus = sysconf(_SC_NPROCESSORS_ONLN);
+
+ sigfillset(&act.sa_mask);
+ act.sa_sigaction = toggle_done;
+ sigaction(SIGINT, &act, NULL);
+
+ if (!nthreads) /* default to the number of CPUs */
+ nthreads = ncpus;
+
+ worker = calloc(nthreads, sizeof(*worker));
+ if (!worker)
+ goto errmem;
+
+ printf("Run summary [PID %d]: %d threads, each operating on %d [%s] futexes for %d secs.\n\n",
+ getpid(), nthreads, nfutexes, fshared ? "shared":"private", nsecs);
+
+ init_stats(&throughput_stats);
+ pthread_mutex_init(&thread_lock, NULL);
+ pthread_cond_init(&thread_parent, NULL);
+ pthread_cond_init(&thread_worker, NULL);
+
+ threads_starting = nthreads;
+ pthread_attr_init(&thread_attr);
+ gettimeofday(&start, NULL);
+ for (i = 0; i < nthreads; i++) {
+ worker[i].tid = i;
+ worker[i].futex = calloc(nfutexes, sizeof(*worker[i].futex));
+ if (!worker[i].futex)
+ goto errmem;
+
+ CPU_ZERO(&cpu);
+ CPU_SET(i % ncpus, &cpu);
+
+ ret = pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpu);
+ if (ret)
+ err(EXIT_FAILURE, "pthread_attr_setaffinity_np");
+
+ ret = pthread_create(&worker[i].thread, &thread_attr, workerfn,
+ (void *)(struct worker *) &worker[i]);
+ if (ret)
+ err(EXIT_FAILURE, "pthread_create");
+
+ }
+ pthread_attr_destroy(&thread_attr);
+
+ pthread_mutex_lock(&thread_lock);
+ while (threads_starting)
+ pthread_cond_wait(&thread_parent, &thread_lock);
+ pthread_cond_broadcast(&thread_worker);
+ pthread_mutex_unlock(&thread_lock);
+
+ sleep(nsecs);
+ toggle_done(0, NULL, NULL);
+
+ for (i = 0; i < nthreads; i++) {
+ ret = pthread_join(worker[i].thread, NULL);
+ if (ret)
+ err(EXIT_FAILURE, "pthread_join");
+ }
+
+ /* cleanup & report results */
+ pthread_cond_destroy(&thread_parent);
+ pthread_cond_destroy(&thread_worker);
+ pthread_mutex_destroy(&thread_lock);
+
+ for (i = 0; i < nthreads; i++) {
+ unsigned long t = worker[i].ops/runtime.tv_sec;
+ update_stats(&throughput_stats, t);
+ if (!silent) {
+ if (nfutexes == 1)
+ printf("[thread %2d] futex: %p [ %ld ops/sec ]\n",
+ worker[i].tid, &worker[i].futex[0], t);
+ else
+ printf("[thread %2d] futexes: %p ... %p [ %ld ops/sec ]\n",
+ worker[i].tid, &worker[i].futex[0],
+ &worker[i].futex[nfutexes-1], t);
+ }
+
+ free(worker[i].futex);
+ }
+
+ print_summary();
+
+ free(worker);
+ return ret;
+errmem:
+ err(EXIT_FAILURE, "calloc");
+}
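
The futex_wait() helper used above comes from bench/futex.h, which is not part of this hunk. Presumably it is a thin wrapper around the raw futex syscall, roughly as sketched below (an assumption, not the actual header). Note that futex-hash passes val = 1234 while its futexes hold 0, so the call fails immediately with EAGAIN/EWOULDBLOCK, which is exactly the path the benchmark wants to stress.

#include <stdint.h>
#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/futex.h>

/*
 * Sketch of the assumed futex_wait() wrapper: atomically check that *uaddr
 * still equals val and, if so, sleep until woken; otherwise return -1 with
 * errno set to EAGAIN/EWOULDBLOCK.
 */
static inline int futex_wait(uint32_t *uaddr, uint32_t val,
			     struct timespec *timeout, int opflags)
{
	return syscall(SYS_futex, uaddr, FUTEX_WAIT | opflags, val,
		       timeout, NULL, 0);
}
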
diff --git a/tools/perf/bench/futex-requeue.c b/tools/perf/bench/futex-requeue.c
new file mode 100644
index 00000000000..a16255876f1
--- /dev/null
+++ b/tools/perf/bench/futex-requeue.c
@@ -0,0 +1,211 @@
+/*
+ * Copyright (C) 2013 Davidlohr Bueso <davidlohr@hp.com>
+ *
+ * futex-requeue: Block a bunch of threads on futex1 and requeue them
+ * on futex2, N at a time.
+ *
+ * This program is particularly useful to measure the latency of nthread
+ * requeues without waking up any tasks -- thus mimicking a regular futex_wait.
+ */
+
+#include "../perf.h"
+#include "../util/util.h"
+#include "../util/stat.h"
+#include "../util/parse-options.h"
+#include "../util/header.h"
+#include "bench.h"
+#include "futex.h"
+
+#include <err.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <pthread.h>
+
+static u_int32_t futex1 = 0, futex2 = 0;
+
+/*
+ * How many tasks to requeue at a time.
+ * Default to 1 in order to make the kernel work more.
+ */
+static unsigned int nrequeue = 1;
+
+/*
+ * There can be significant variance from run to run,
+ * the more repeats, the more exact the overall avg and
+ * the better idea of the futex latency.
+ */
+static unsigned int repeat = 10;
+
+static pthread_t *worker;
+static bool done = 0, silent = 0;
+static pthread_mutex_t thread_lock;
+static pthread_cond_t thread_parent, thread_worker;
+static struct stats requeuetime_stats, requeued_stats;
+static unsigned int ncpus, threads_starting, nthreads = 0;
+
+static const struct option options[] = {
+ OPT_UINTEGER('t', "threads", &nthreads, "Specify amount of threads"),
+ OPT_UINTEGER('q', "nrequeue", &nrequeue, "Specify amount of threads to requeue at once"),
+ OPT_UINTEGER('r', "repeat", &repeat, "Specify amount of times to repeat the run"),
+ OPT_BOOLEAN( 's', "silent", &silent, "Silent mode: do not display data/details"),
+ OPT_END()
+};
+
+static const char * const bench_futex_requeue_usage[] = {
+ "perf bench futex requeue <options>",
+ NULL
+};
+
+static void print_summary(void)
+{
+ double requeuetime_avg = avg_stats(&requeuetime_stats);
+ double requeuetime_stddev = stddev_stats(&requeuetime_stats);
+ unsigned int requeued_avg = avg_stats(&requeued_stats);
+
+ printf("Requeued %d of %d threads in %.4f ms (+-%.2f%%)\n",
+ requeued_avg,
+ nthreads,
+ requeuetime_avg/1e3,
+ rel_stddev_stats(requeuetime_stddev, requeuetime_avg));
+}
+
+static void *workerfn(void *arg __maybe_unused)
+{
+ pthread_mutex_lock(&thread_lock);
+ threads_starting--;
+ if (!threads_starting)
+ pthread_cond_signal(&thread_parent);
+ pthread_cond_wait(&thread_worker, &thread_lock);
+ pthread_mutex_unlock(&thread_lock);
+
+ futex_wait(&futex1, 0, NULL, FUTEX_PRIVATE_FLAG);
+ return NULL;
+}
+
+static void block_threads(pthread_t *w,
+ pthread_attr_t thread_attr)
+{
+ cpu_set_t cpu;
+ unsigned int i;
+
+ threads_starting = nthreads;
+
+ /* create and block all threads */
+ for (i = 0; i < nthreads; i++) {
+ CPU_ZERO(&cpu);
+ CPU_SET(i % ncpus, &cpu);
+
+ if (pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpu))
+ err(EXIT_FAILURE, "pthread_attr_setaffinity_np");
+
+ if (pthread_create(&w[i], &thread_attr, workerfn, NULL))
+ err(EXIT_FAILURE, "pthread_create");
+ }
+}
+
+static void toggle_done(int sig __maybe_unused,
+ siginfo_t *info __maybe_unused,
+ void *uc __maybe_unused)
+{
+ done = true;
+}
+
+int bench_futex_requeue(int argc, const char **argv,
+ const char *prefix __maybe_unused)
+{
+ int ret = 0;
+ unsigned int i, j;
+ struct sigaction act;
+ pthread_attr_t thread_attr;
+
+ argc = parse_options(argc, argv, options, bench_futex_requeue_usage, 0);
+ if (argc)
+ goto err;
+
+ ncpus = sysconf(_SC_NPROCESSORS_ONLN);
+
+ sigfillset(&act.sa_mask);
+ act.sa_sigaction = toggle_done;
+ sigaction(SIGINT, &act, NULL);
+
+ if (!nthreads)
+ nthreads = ncpus;
+
+ worker = calloc(nthreads, sizeof(*worker));
+ if (!worker)
+ err(EXIT_FAILURE, "calloc");
+
+ printf("Run summary [PID %d]: Requeuing %d threads (from %p to %p), "
+ "%d at a time.\n\n",
+ getpid(), nthreads, &futex1, &futex2, nrequeue);
+
+ init_stats(&requeued_stats);
+ init_stats(&requeuetime_stats);
+ pthread_attr_init(&thread_attr);
+ pthread_mutex_init(&thread_lock, NULL);
+ pthread_cond_init(&thread_parent, NULL);
+ pthread_cond_init(&thread_worker, NULL);
+
+ for (j = 0; j < repeat && !done; j++) {
+ unsigned int nrequeued = 0;
+ struct timeval start, end, runtime;
+
+ /* create, launch & block all threads */
+ block_threads(worker, thread_attr);
+
+ /* make sure all threads are already blocked */
+ pthread_mutex_lock(&thread_lock);
+ while (threads_starting)
+ pthread_cond_wait(&thread_parent, &thread_lock);
+ pthread_cond_broadcast(&thread_worker);
+ pthread_mutex_unlock(&thread_lock);
+
+ usleep(100000);
+
+ /* Ok, all threads are patiently blocked, start requeueing */
+ gettimeofday(&start, NULL);
+ for (nrequeued = 0; nrequeued < nthreads; nrequeued += nrequeue)
+ /*
+ * Do not wake up any tasks blocked on futex1, allowing
+ * us to really measure the requeue operation itself.
+ */
+ futex_cmp_requeue(&futex1, 0, &futex2, 0, nrequeue,
+ FUTEX_PRIVATE_FLAG);
+ gettimeofday(&end, NULL);
+ timersub(&end, &start, &runtime);
+
+ update_stats(&requeued_stats, nrequeued);
+ update_stats(&requeuetime_stats, runtime.tv_usec);
+
+ if (!silent) {
+ printf("[Run %d]: Requeued %d of %d threads in %.4f ms\n",
+ j + 1, nrequeued, nthreads, runtime.tv_usec/1e3);
+ }
+
+ /* everybody should be blocked on futex2, wake'em up */
+ nrequeued = futex_wake(&futex2, nthreads, FUTEX_PRIVATE_FLAG);
+ if (nthreads != nrequeued)
+ warnx("couldn't wakeup all tasks (%d/%d)", nrequeued, nthreads);
+
+ for (i = 0; i < nthreads; i++) {
+ ret = pthread_join(worker[i], NULL);
+ if (ret)
+ err(EXIT_FAILURE, "pthread_join");
+ }
+
+ }
+
+ /* cleanup & report results */
+ pthread_cond_destroy(&thread_parent);
+ pthread_cond_destroy(&thread_worker);
+ pthread_mutex_destroy(&thread_lock);
+ pthread_attr_destroy(&thread_attr);
+
+ print_summary();
+
+ free(worker);
+ return ret;
+err:
+ usage_with_options(bench_futex_requeue_usage, options);
+ exit(EXIT_FAILURE);
+}
diff --git a/tools/perf/bench/futex-wake.c b/tools/perf/bench/futex-wake.c
new file mode 100644
index 00000000000..d096169b161
--- /dev/null
+++ b/tools/perf/bench/futex-wake.c
@@ -0,0 +1,201 @@
+/*
+ * Copyright (C) 2013 Davidlohr Bueso <davidlohr@hp.com>
+ *
+ * futex-wake: Block a bunch of threads on a futex and wake'em up, N at a time.
+ *
+ * This program is particularly useful for measuring the latency of nthread
+ * wakeups in non-error situations: all waiters are queued and every wake call
+ * wakes up one or more tasks, so the waitqueue is never empty.
+ */
+
+#include "../perf.h"
+#include "../util/util.h"
+#include "../util/stat.h"
+#include "../util/parse-options.h"
+#include "../util/header.h"
+#include "bench.h"
+#include "futex.h"
+
+#include <err.h>
+#include <stdlib.h>
+#include <sys/time.h>
+#include <pthread.h>
+
+/* all threads will block on the same futex */
+static u_int32_t futex1 = 0;
+
+/*
+ * How many wakeups to do at a time.
+ * Default to 1 in order to make the kernel work more.
+ */
+static unsigned int nwakes = 1;
+
+/*
+ * There can be significant variance from run to run;
+ * the more repeats, the more exact the overall average
+ * and the better the picture of the futex latency.
+ */
+static unsigned int repeat = 10;
+
+pthread_t *worker;
+static bool done = 0, silent = 0;
+static pthread_mutex_t thread_lock;
+static pthread_cond_t thread_parent, thread_worker;
+static struct stats waketime_stats, wakeup_stats;
+static unsigned int ncpus, threads_starting, nthreads = 0;
+
+static const struct option options[] = {
+ OPT_UINTEGER('t', "threads", &nthreads, "Specify the number of threads"),
+ OPT_UINTEGER('w', "nwakes", &nwakes, "Specify the number of threads to wake at once"),
+ OPT_UINTEGER('r', "repeat", &repeat, "Specify the number of times to repeat the run"),
+ OPT_BOOLEAN( 's', "silent", &silent, "Silent mode: do not display data/details"),
+ OPT_END()
+};
+
+static const char * const bench_futex_wake_usage[] = {
+ "perf bench futex wake <options>",
+ NULL
+};
+
+static void *workerfn(void *arg __maybe_unused)
+{
+ pthread_mutex_lock(&thread_lock);
+ threads_starting--;
+ if (!threads_starting)
+ pthread_cond_signal(&thread_parent);
+ pthread_cond_wait(&thread_worker, &thread_lock);
+ pthread_mutex_unlock(&thread_lock);
+
+ futex_wait(&futex1, 0, NULL, FUTEX_PRIVATE_FLAG);
+ return NULL;
+}
+
+static void print_summary(void)
+{
+ double waketime_avg = avg_stats(&waketime_stats);
+ double waketime_stddev = stddev_stats(&waketime_stats);
+ unsigned int wakeup_avg = avg_stats(&wakeup_stats);
+
+ printf("Wokeup %d of %d threads in %.4f ms (+-%.2f%%)\n",
+ wakeup_avg,
+ nthreads,
+ waketime_avg/1e3,
+ rel_stddev_stats(waketime_stddev, waketime_avg));
+}
+
+static void block_threads(pthread_t *w,
+ pthread_attr_t thread_attr)
+{
+ cpu_set_t cpu;
+ unsigned int i;
+
+ threads_starting = nthreads;
+
+ /* create and block all threads */
+ for (i = 0; i < nthreads; i++) {
+ CPU_ZERO(&cpu);
+ CPU_SET(i % ncpus, &cpu);
+
+ if (pthread_attr_setaffinity_np(&thread_attr, sizeof(cpu_set_t), &cpu))
+ err(EXIT_FAILURE, "pthread_attr_setaffinity_np");
+
+ if (pthread_create(&w[i], &thread_attr, workerfn, NULL))
+ err(EXIT_FAILURE, "pthread_create");
+ }
+}
+
+static void toggle_done(int sig __maybe_unused,
+ siginfo_t *info __maybe_unused,
+ void *uc __maybe_unused)
+{
+ done = true;
+}
+
+int bench_futex_wake(int argc, const char **argv,
+ const char *prefix __maybe_unused)
+{
+ int ret = 0;
+ unsigned int i, j;
+ struct sigaction act;
+ pthread_attr_t thread_attr;
+
+ argc = parse_options(argc, argv, options, bench_futex_wake_usage, 0);
+ if (argc) {
+ usage_with_options(bench_futex_wake_usage, options);
+ exit(EXIT_FAILURE);
+ }
+
+ ncpus = sysconf(_SC_NPROCESSORS_ONLN);
+
+ sigfillset(&act.sa_mask);
+ act.sa_sigaction = toggle_done;
+ sigaction(SIGINT, &act, NULL);
+
+ if (!nthreads)
+ nthreads = ncpus;
+
+ worker = calloc(nthreads, sizeof(*worker));
+ if (!worker)
+ err(EXIT_FAILURE, "calloc");
+
+ printf("Run summary [PID %d]: blocking on %d threads (at futex %p), "
+ "waking up %d at a time.\n\n",
+ getpid(), nthreads, &futex1, nwakes);
+
+ init_stats(&wakeup_stats);
+ init_stats(&waketime_stats);
+ pthread_attr_init(&thread_attr);
+ pthread_mutex_init(&thread_lock, NULL);
+ pthread_cond_init(&thread_parent, NULL);
+ pthread_cond_init(&thread_worker, NULL);
+
+ for (j = 0; j < repeat && !done; j++) {
+ unsigned int nwoken = 0;
+ struct timeval start, end, runtime;
+
+ /* create, launch & block all threads */
+ block_threads(worker, thread_attr);
+
+ /* make sure all threads are already blocked */
+ pthread_mutex_lock(&thread_lock);
+ while (threads_starting)
+ pthread_cond_wait(&thread_parent, &thread_lock);
+ pthread_cond_broadcast(&thread_worker);
+ pthread_mutex_unlock(&thread_lock);
+
+ usleep(100000);
+
+ /* Ok, all threads are patiently blocked, start waking folks up */
+ gettimeofday(&start, NULL);
+ while (nwoken != nthreads)
+ nwoken += futex_wake(&futex1, nwakes, FUTEX_PRIVATE_FLAG);
+ gettimeofday(&end, NULL);
+ timersub(&end, &start, &runtime);
+
+ update_stats(&wakeup_stats, nwoken);
+ update_stats(&waketime_stats, runtime.tv_usec);
+
+ if (!silent) {
+ printf("[Run %d]: Wokeup %d of %d threads in %.4f ms\n",
+ j + 1, nwoken, nthreads, runtime.tv_usec/1e3);
+ }
+
+ for (i = 0; i < nthreads; i++) {
+ ret = pthread_join(worker[i], NULL);
+ if (ret)
+ err(EXIT_FAILURE, "pthread_join");
+ }
+
+ }
+
+ /* cleanup & report results */
+ pthread_cond_destroy(&thread_parent);
+ pthread_cond_destroy(&thread_worker);
+ pthread_mutex_destroy(&thread_lock);
+ pthread_attr_destroy(&thread_attr);
+
+ print_summary();
+
+ free(worker);
+ return ret;
+}
diff --git a/tools/perf/bench/futex.h b/tools/perf/bench/futex.h
new file mode 100644
index 00000000000..71f2844cf97
--- /dev/null
+++ b/tools/perf/bench/futex.h
@@ -0,0 +1,71 @@
+/*
+ * Glibc independent futex library for testing kernel functionality.
+ * Shamelessly stolen from Darren Hart <dvhltc@us.ibm.com>
+ * http://git.kernel.org/cgit/linux/kernel/git/dvhart/futextest.git/
+ */
+
+#ifndef _FUTEX_H
+#define _FUTEX_H
+
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <linux/futex.h>
+
+/**
+ * futex() - SYS_futex syscall wrapper
+ * @uaddr: address of first futex
+ * @op: futex op code
+ * @val: typically expected value of uaddr, but varies by op
+ * @timeout: typically an absolute struct timespec (except where noted
+ * otherwise). Overloaded by some ops
+ * @uaddr2: address of second futex for some ops
+ * @val3: varies by op
+ * @opflags: flags to be bitwise OR'd with op, such as FUTEX_PRIVATE_FLAG
+ *
+ * futex() is used by all the following futex op wrappers. It can also be
+ * used for misuse and abuse testing. Generally, the specific op wrappers
+ * should be used instead. It is a macro instead of a static inline function as
+ * some of the types are overloaded (timeout is used for nr_requeue, for
+ * example).
+ *
+ * These argument descriptions are the defaults for all
+ * like-named arguments in the following wrappers except where noted below.
+ */
+#define futex(uaddr, op, val, timeout, uaddr2, val3, opflags) \
+ syscall(SYS_futex, uaddr, op | opflags, val, timeout, uaddr2, val3)
+
+/**
+ * futex_wait() - block on uaddr with optional timeout
+ * @timeout: relative timeout
+ */
+static inline int
+futex_wait(u_int32_t *uaddr, u_int32_t val, struct timespec *timeout, int opflags)
+{
+ return futex(uaddr, FUTEX_WAIT, val, timeout, NULL, 0, opflags);
+}
+
+/**
+ * futex_wake() - wake one or more tasks blocked on uaddr
+ * @nr_wake: wake up to this many tasks
+ */
+static inline int
+futex_wake(u_int32_t *uaddr, int nr_wake, int opflags)
+{
+ return futex(uaddr, FUTEX_WAKE, nr_wake, NULL, NULL, 0, opflags);
+}
+
+/**
+ * futex_cmp_requeue() - requeue tasks from uaddr to uaddr2
+ * @nr_wake: wake up to this many tasks
+ * @nr_requeue: requeue up to this many tasks
+ */
+static inline int
+futex_cmp_requeue(u_int32_t *uaddr, u_int32_t val, u_int32_t *uaddr2, int nr_wake,
+ int nr_requeue, int opflags)
+{
+ return futex(uaddr, FUTEX_CMP_REQUEUE, nr_wake, nr_requeue, uaddr2,
+ val, opflags);
+}
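+
+/*
+ * Usage sketch (mirrors perf bench futex requeue; f1, f2 and n are
+ * placeholders): waiters block with futex_wait(&f1, 0, NULL,
+ * FUTEX_PRIVATE_FLAG); the controller then moves them with
+ * futex_cmp_requeue(&f1, 0, &f2, 0, n, FUTEX_PRIVATE_FLAG) without waking
+ * anyone, and finally releases them via futex_wake(&f2, n, FUTEX_PRIVATE_FLAG).
+ */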
+
+#endif /* _FUTEX_H */
diff --git a/tools/perf/bench/mem-memcpy-arch.h b/tools/perf/bench/mem-memcpy-arch.h
new file mode 100644
index 00000000000..57b4ed87145
--- /dev/null
+++ b/tools/perf/bench/mem-memcpy-arch.h
@@ -0,0 +1,12 @@
+
+#ifdef HAVE_ARCH_X86_64_SUPPORT
+
+#define MEMCPY_FN(fn, name, desc) \
+ extern void *fn(void *, const void *, size_t);
+
+#include "mem-memcpy-x86-64-asm-def.h"
+
+#undef MEMCPY_FN
+
+#endif
+
diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
new file mode 100644
index 00000000000..d66ab799b35
--- /dev/null
+++ b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
@@ -0,0 +1,12 @@
+
+MEMCPY_FN(__memcpy,
+ "x86-64-unrolled",
+ "unrolled memcpy() in arch/x86/lib/memcpy_64.S")
+
+MEMCPY_FN(memcpy_c,
+ "x86-64-movsq",
+ "movsq-based memcpy() in arch/x86/lib/memcpy_64.S")
+
+MEMCPY_FN(memcpy_c_e,
+ "x86-64-movsb",
+ "movsb-based memcpy() in arch/x86/lib/memcpy_64.S")
diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm.S b/tools/perf/bench/mem-memcpy-x86-64-asm.S
new file mode 100644
index 00000000000..fcd9cf00600
--- /dev/null
+++ b/tools/perf/bench/mem-memcpy-x86-64-asm.S
@@ -0,0 +1,12 @@
+#define memcpy MEMCPY /* don't hide glibc's memcpy() */
+#define altinstr_replacement text
+#define globl p2align 4; .globl
+#define Lmemcpy_c globl memcpy_c; memcpy_c
+#define Lmemcpy_c_e globl memcpy_c_e; memcpy_c_e
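+
+/*
+ * The defines above are presumably there so that including the kernel's
+ * memcpy_64.S exports its alternative implementations: memcpy is renamed
+ * so glibc's symbol is not shadowed, and the Lmemcpy_c/Lmemcpy_c_e local
+ * labels become the global memcpy_c/memcpy_c_e entry points declared via
+ * mem-memcpy-x86-64-asm-def.h.
+ */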
+#include "../../../arch/x86/lib/memcpy_64.S"
+/*
+ * We need to provide note.GNU-stack section, saying that we want
+ * NOT executable stack. Otherwise the final linking will assume that
+ * the ELF stack should not be restricted at all and set it RWX.
+ */
+.section .note.GNU-stack,"",@progbits
diff --git a/tools/perf/bench/mem-memcpy.c b/tools/perf/bench/mem-memcpy.c
index 89773178e89..5ce71d3b72c 100644
--- a/tools/perf/bench/mem-memcpy.c
+++ b/tools/perf/bench/mem-memcpy.c
@@ -5,14 +5,13 @@
*
* Written by Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
*/
-#include <ctype.h>
#include "../perf.h"
#include "../util/util.h"
#include "../util/parse-options.h"
-#include "../util/string.h"
#include "../util/header.h"
#include "bench.h"
+#include "mem-memcpy-arch.h"
#include <stdio.h>
#include <stdlib.h>
@@ -24,30 +23,49 @@
static const char *length_str = "1MB";
static const char *routine = "default";
-static int use_clock = 0;
-static int clock_fd;
+static int iterations = 1;
+static bool use_cycle;
+static int cycle_fd;
+static bool only_prefault;
+static bool no_prefault;
static const struct option options[] = {
OPT_STRING('l', "length", &length_str, "1MB",
"Specify length of memory to copy. "
- "available unit: B, MB, GB (upper and lower)"),
+ "Available units: B, KB, MB, GB and TB (upper and lower)"),
OPT_STRING('r', "routine", &routine, "default",
"Specify routine to copy"),
- OPT_BOOLEAN('c', "clock", &use_clock,
- "Use CPU clock for measuring"),
+ OPT_INTEGER('i', "iterations", &iterations,
+ "repeat memcpy() invocation this number of times"),
+ OPT_BOOLEAN('c', "cycle", &use_cycle,
+ "Use cycles event instead of gettimeofday() for measuring"),
+ OPT_BOOLEAN('o', "only-prefault", &only_prefault,
+ "Show only the result with page faults before memcpy()"),
+ OPT_BOOLEAN('n', "no-prefault", &no_prefault,
+ "Show only the result without page faults before memcpy()"),
OPT_END()
};
+typedef void *(*memcpy_t)(void *, const void *, size_t);
+
struct routine {
const char *name;
const char *desc;
- void * (*fn)(void *dst, const void *src, size_t len);
+ memcpy_t fn;
};
struct routine routines[] = {
{ "default",
"Default memcpy() provided by glibc",
memcpy },
+#ifdef HAVE_ARCH_X86_64_SUPPORT
+
+#define MEMCPY_FN(fn, name, desc) { name, desc, fn },
+#include "mem-memcpy-x86-64-asm-def.h"
+#undef MEMCPY_FN
+
+#endif
+
{ NULL,
NULL,
NULL }
@@ -58,27 +76,27 @@ static const char * const bench_mem_memcpy_usage[] = {
NULL
};
-static struct perf_event_attr clock_attr = {
+static struct perf_event_attr cycle_attr = {
.type = PERF_TYPE_HARDWARE,
.config = PERF_COUNT_HW_CPU_CYCLES
};
-static void init_clock(void)
+static void init_cycle(void)
{
- clock_fd = sys_perf_event_open(&clock_attr, getpid(), -1, -1, 0);
+ cycle_fd = sys_perf_event_open(&cycle_attr, getpid(), -1, -1, 0);
- if (clock_fd < 0 && errno == ENOSYS)
+ if (cycle_fd < 0 && errno == ENOSYS)
die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
else
- BUG_ON(clock_fd < 0);
+ BUG_ON(cycle_fd < 0);
}
-static u64 get_clock(void)
+static u64 get_cycle(void)
{
int ret;
u64 clk;
- ret = read(clock_fd, &clk, sizeof(u64));
+ ret = read(cycle_fd, &clk, sizeof(u64));
BUG_ON(ret != sizeof(u64));
return clk;
@@ -90,29 +108,104 @@ static double timeval2double(struct timeval *ts)
(double)ts->tv_usec / (double)1000000;
}
-int bench_mem_memcpy(int argc, const char **argv,
- const char *prefix __used)
+static void alloc_mem(void **dst, void **src, size_t length)
+{
+ *dst = zalloc(length);
+ if (!*dst)
+ die("memory allocation failed - maybe length is too large?\n");
+
+ *src = zalloc(length);
+ if (!*src)
+ die("memory allocation failed - maybe length is too large?\n");
+ /* Make sure to always replace the zero pages even if MMAP_THRESH is crossed */
+ memset(*src, 0, length);
+}
+
+static u64 do_memcpy_cycle(memcpy_t fn, size_t len, bool prefault)
{
+ u64 cycle_start = 0ULL, cycle_end = 0ULL;
+ void *src = NULL, *dst = NULL;
int i;
- void *dst, *src;
- size_t length;
- double bps = 0.0;
+
+ alloc_mem(&src, &dst, len);
+
+ if (prefault)
+ fn(dst, src, len);
+
+ cycle_start = get_cycle();
+ for (i = 0; i < iterations; ++i)
+ fn(dst, src, len);
+ cycle_end = get_cycle();
+
+ free(src);
+ free(dst);
+ return cycle_end - cycle_start;
+}
+
+static double do_memcpy_gettimeofday(memcpy_t fn, size_t len, bool prefault)
+{
struct timeval tv_start, tv_end, tv_diff;
- u64 clock_start, clock_end, clock_diff;
+ void *src = NULL, *dst = NULL;
+ int i;
+
+ alloc_mem(&src, &dst, len);
+
+ if (prefault)
+ fn(dst, src, len);
+
+ BUG_ON(gettimeofday(&tv_start, NULL));
+ for (i = 0; i < iterations; ++i)
+ fn(dst, src, len);
+ BUG_ON(gettimeofday(&tv_end, NULL));
+
+ timersub(&tv_end, &tv_start, &tv_diff);
+
+ free(src);
+ free(dst);
+ return (double)((double)len / timeval2double(&tv_diff));
+}
+
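+/*
+ * When only one prefault mode is requested, 'pf' selects the result slot
+ * (0 = no-prefault, 1 = prefault) that gets filled in and printed below.
+ */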
+#define pf (no_prefault ? 0 : 1)
+
+#define print_bps(x) do { \
+ if (x < K) \
+ printf(" %14lf B/Sec", x); \
+ else if (x < K * K) \
+ printf(" %14lfd KB/Sec", x / K); \
+ else if (x < K * K * K) \
+ printf(" %14lf MB/Sec", x / K / K); \
+ else \
+ printf(" %14lf GB/Sec", x / K / K / K); \
+ } while (0)
+
+int bench_mem_memcpy(int argc, const char **argv,
+ const char *prefix __maybe_unused)
+{
+ int i;
+ size_t len;
+ double result_bps[2];
+ u64 result_cycle[2];
- clock_start = clock_end = clock_diff = 0ULL;
argc = parse_options(argc, argv, options,
bench_mem_memcpy_usage, 0);
- tv_diff.tv_sec = 0;
- tv_diff.tv_usec = 0;
- length = (size_t)perf_atoll((char *)length_str);
+ if (use_cycle)
+ init_cycle();
- if ((s64)length <= 0) {
+ len = (size_t)perf_atoll((char *)length_str);
+
+ result_cycle[0] = result_cycle[1] = 0ULL;
+ result_bps[0] = result_bps[1] = 0.0;
+
+ if ((s64)len <= 0) {
fprintf(stderr, "Invalid length:%s\n", length_str);
return 1;
}
+ /* same as not specifying either prefault or no-prefault */
+ if (only_prefault && no_prefault)
+ only_prefault = no_prefault = false;
+
for (i = 0; routines[i].name; i++) {
if (!strcmp(routines[i].name, routine))
break;
@@ -127,61 +220,80 @@ int bench_mem_memcpy(int argc, const char **argv,
return 1;
}
- dst = zalloc(length);
- if (!dst)
- die("memory allocation failed - maybe length is too large?\n");
-
- src = zalloc(length);
- if (!src)
- die("memory allocation failed - maybe length is too large?\n");
-
- if (bench_format == BENCH_FORMAT_DEFAULT) {
- printf("# Copying %s Bytes from %p to %p ...\n\n",
- length_str, src, dst);
- }
-
- if (use_clock) {
- init_clock();
- clock_start = get_clock();
- } else {
- BUG_ON(gettimeofday(&tv_start, NULL));
- }
+ if (bench_format == BENCH_FORMAT_DEFAULT)
+ printf("# Copying %s Bytes ...\n\n", length_str);
- routines[i].fn(dst, src, length);
-
- if (use_clock) {
- clock_end = get_clock();
- clock_diff = clock_end - clock_start;
+ if (!only_prefault && !no_prefault) {
+ /* show both results */
+ if (use_cycle) {
+ result_cycle[0] =
+ do_memcpy_cycle(routines[i].fn, len, false);
+ result_cycle[1] =
+ do_memcpy_cycle(routines[i].fn, len, true);
+ } else {
+ result_bps[0] =
+ do_memcpy_gettimeofday(routines[i].fn,
+ len, false);
+ result_bps[1] =
+ do_memcpy_gettimeofday(routines[i].fn,
+ len, true);
+ }
} else {
- BUG_ON(gettimeofday(&tv_end, NULL));
- timersub(&tv_end, &tv_start, &tv_diff);
- bps = (double)((double)length / timeval2double(&tv_diff));
+ if (use_cycle) {
+ result_cycle[pf] =
+ do_memcpy_cycle(routines[i].fn,
+ len, only_prefault);
+ } else {
+ result_bps[pf] =
+ do_memcpy_gettimeofday(routines[i].fn,
+ len, only_prefault);
+ }
}
switch (bench_format) {
case BENCH_FORMAT_DEFAULT:
- if (use_clock) {
- printf(" %14lf Clock/Byte\n",
- (double)clock_diff / (double)length);
- } else {
- if (bps < K)
- printf(" %14lf B/Sec\n", bps);
- else if (bps < K * K)
- printf(" %14lfd KB/Sec\n", bps / 1024);
- else if (bps < K * K * K)
- printf(" %14lf MB/Sec\n", bps / 1024 / 1024);
- else {
- printf(" %14lf GB/Sec\n",
- bps / 1024 / 1024 / 1024);
+ if (!only_prefault && !no_prefault) {
+ if (use_cycle) {
+ printf(" %14lf Cycle/Byte\n",
+ (double)result_cycle[0]
+ / (double)len);
+ printf(" %14lf Cycle/Byte (with prefault)\n",
+ (double)result_cycle[1]
+ / (double)len);
+ } else {
+ print_bps(result_bps[0]);
+ printf("\n");
+ print_bps(result_bps[1]);
+ printf(" (with prefault)\n");
}
+ } else {
+ if (use_cycle) {
+ printf(" %14lf Cycle/Byte",
+ (double)result_cycle[pf]
+ / (double)len);
+ } else
+ print_bps(result_bps[pf]);
+
+ printf("%s\n", only_prefault ? " (with prefault)" : "");
}
break;
case BENCH_FORMAT_SIMPLE:
- if (use_clock) {
- printf("%14lf\n",
- (double)clock_diff / (double)length);
- } else
- printf("%lf\n", bps);
+ if (!only_prefault && !no_prefault) {
+ if (use_cycle) {
+ printf("%lf %lf\n",
+ (double)result_cycle[0] / (double)len,
+ (double)result_cycle[1] / (double)len);
+ } else {
+ printf("%lf %lf\n",
+ result_bps[0], result_bps[1]);
+ }
+ } else {
+ if (use_cycle) {
+ printf("%lf\n", (double)result_cycle[pf]
+ / (double)len);
+ } else
+ printf("%lf\n", result_bps[pf]);
+ }
break;
default:
/* reaching this means there's some disaster: */
diff --git a/tools/perf/bench/mem-memset-arch.h b/tools/perf/bench/mem-memset-arch.h
new file mode 100644
index 00000000000..633800cb0dc
--- /dev/null
+++ b/tools/perf/bench/mem-memset-arch.h
@@ -0,0 +1,12 @@
+
+#ifdef HAVE_ARCH_X86_64_SUPPORT
+
+#define MEMSET_FN(fn, name, desc) \
+ extern void *fn(void *, int, size_t);
+
+#include "mem-memset-x86-64-asm-def.h"
+
+#undef MEMSET_FN
+
+#endif
+
diff --git a/tools/perf/bench/mem-memset-x86-64-asm-def.h b/tools/perf/bench/mem-memset-x86-64-asm-def.h
new file mode 100644
index 00000000000..a71dff97c1f
--- /dev/null
+++ b/tools/perf/bench/mem-memset-x86-64-asm-def.h
@@ -0,0 +1,12 @@
+
+MEMSET_FN(__memset,
+ "x86-64-unrolled",
+ "unrolled memset() in arch/x86/lib/memset_64.S")
+
+MEMSET_FN(memset_c,
+ "x86-64-stosq",
+ "movsq-based memset() in arch/x86/lib/memset_64.S")
+
+MEMSET_FN(memset_c_e,
+ "x86-64-stosb",
+ "movsb-based memset() in arch/x86/lib/memset_64.S")
diff --git a/tools/perf/bench/mem-memset-x86-64-asm.S b/tools/perf/bench/mem-memset-x86-64-asm.S
new file mode 100644
index 00000000000..9e5af89ed13
--- /dev/null
+++ b/tools/perf/bench/mem-memset-x86-64-asm.S
@@ -0,0 +1,13 @@
+#define memset MEMSET /* don't hide glibc's memset() */
+#define altinstr_replacement text
+#define globl p2align 4; .globl
+#define Lmemset_c globl memset_c; memset_c
+#define Lmemset_c_e globl memset_c_e; memset_c_e
+#include "../../../arch/x86/lib/memset_64.S"
+
+/*
+ * We need to provide note.GNU-stack section, saying that we want
+ * NOT executable stack. Otherwise the final linking will assume that
+ * the ELF stack should not be restricted at all and set it RWX.
+ */
+.section .note.GNU-stack,"",@progbits
diff --git a/tools/perf/bench/mem-memset.c b/tools/perf/bench/mem-memset.c
new file mode 100644
index 00000000000..9af79d2b18e
--- /dev/null
+++ b/tools/perf/bench/mem-memset.c
@@ -0,0 +1,297 @@
+/*
+ * mem-memset.c
+ *
+ * memset: Simple memory set in various ways
+ *
+ * Trivial clone of mem-memcpy.c.
+ */
+
+#include "../perf.h"
+#include "../util/util.h"
+#include "../util/parse-options.h"
+#include "../util/header.h"
+#include "bench.h"
+#include "mem-memset-arch.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/time.h>
+#include <errno.h>
+
+#define K 1024
+
+static const char *length_str = "1MB";
+static const char *routine = "default";
+static int iterations = 1;
+static bool use_cycle;
+static int cycle_fd;
+static bool only_prefault;
+static bool no_prefault;
+
+static const struct option options[] = {
+ OPT_STRING('l', "length", &length_str, "1MB",
+ "Specify length of memory to set. "
+ "Available units: B, KB, MB, GB and TB (upper and lower)"),
+ OPT_STRING('r', "routine", &routine, "default",
+ "Specify routine to set"),
+ OPT_INTEGER('i', "iterations", &iterations,
+ "repeat memset() invocation this number of times"),
+ OPT_BOOLEAN('c', "cycle", &use_cycle,
+ "Use cycles event instead of gettimeofday() for measuring"),
+ OPT_BOOLEAN('o', "only-prefault", &only_prefault,
+ "Show only the result with page faults before memset()"),
+ OPT_BOOLEAN('n', "no-prefault", &no_prefault,
+ "Show only the result without page faults before memset()"),
+ OPT_END()
+};
+
+typedef void *(*memset_t)(void *, int, size_t);
+
+struct routine {
+ const char *name;
+ const char *desc;
+ memset_t fn;
+};
+
+static const struct routine routines[] = {
+ { "default",
+ "Default memset() provided by glibc",
+ memset },
+#ifdef HAVE_ARCH_X86_64_SUPPORT
+
+#define MEMSET_FN(fn, name, desc) { name, desc, fn },
+#include "mem-memset-x86-64-asm-def.h"
+#undef MEMSET_FN
+
+#endif
+
+ { NULL,
+ NULL,
+ NULL }
+};
+
+static const char * const bench_mem_memset_usage[] = {
+ "perf bench mem memset <options>",
+ NULL
+};
+
+static struct perf_event_attr cycle_attr = {
+ .type = PERF_TYPE_HARDWARE,
+ .config = PERF_COUNT_HW_CPU_CYCLES
+};
+
+static void init_cycle(void)
+{
+ cycle_fd = sys_perf_event_open(&cycle_attr, getpid(), -1, -1, 0);
+
+ if (cycle_fd < 0 && errno == ENOSYS)
+ die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
+ else
+ BUG_ON(cycle_fd < 0);
+}
+
+static u64 get_cycle(void)
+{
+ int ret;
+ u64 clk;
+
+ ret = read(cycle_fd, &clk, sizeof(u64));
+ BUG_ON(ret != sizeof(u64));
+
+ return clk;
+}
+
+static double timeval2double(struct timeval *ts)
+{
+ return (double)ts->tv_sec +
+ (double)ts->tv_usec / (double)1000000;
+}
+
+static void alloc_mem(void **dst, size_t length)
+{
+ *dst = zalloc(length);
+ if (!*dst)
+ die("memory allocation failed - maybe length is too large?\n");
+}
+
+static u64 do_memset_cycle(memset_t fn, size_t len, bool prefault)
+{
+ u64 cycle_start = 0ULL, cycle_end = 0ULL;
+ void *dst = NULL;
+ int i;
+
+ alloc_mem(&dst, len);
+
+ if (prefault)
+ fn(dst, -1, len);
+
+ cycle_start = get_cycle();
+ for (i = 0; i < iterations; ++i)
+ fn(dst, i, len);
+ cycle_end = get_cycle();
+
+ free(dst);
+ return cycle_end - cycle_start;
+}
+
+static double do_memset_gettimeofday(memset_t fn, size_t len, bool prefault)
+{
+ struct timeval tv_start, tv_end, tv_diff;
+ void *dst = NULL;
+ int i;
+
+ alloc_mem(&dst, len);
+
+ if (prefault)
+ fn(dst, -1, len);
+
+ BUG_ON(gettimeofday(&tv_start, NULL));
+ for (i = 0; i < iterations; ++i)
+ fn(dst, i, len);
+ BUG_ON(gettimeofday(&tv_end, NULL));
+
+ timersub(&tv_end, &tv_start, &tv_diff);
+
+ free(dst);
+ return (double)((double)len / timeval2double(&tv_diff));
+}
+
+#define pf (no_prefault ? 0 : 1)
+
+#define print_bps(x) do { \
+ if (x < K) \
+ printf(" %14lf B/Sec", x); \
+ else if (x < K * K) \
+ printf(" %14lfd KB/Sec", x / K); \
+ else if (x < K * K * K) \
+ printf(" %14lf MB/Sec", x / K / K); \
+ else \
+ printf(" %14lf GB/Sec", x / K / K / K); \
+ } while (0)
+
+int bench_mem_memset(int argc, const char **argv,
+ const char *prefix __maybe_unused)
+{
+ int i;
+ size_t len;
+ double result_bps[2];
+ u64 result_cycle[2];
+
+ argc = parse_options(argc, argv, options,
+ bench_mem_memset_usage, 0);
+
+ if (use_cycle)
+ init_cycle();
+
+ len = (size_t)perf_atoll((char *)length_str);
+
+ result_cycle[0] = result_cycle[1] = 0ULL;
+ result_bps[0] = result_bps[1] = 0.0;
+
+ if ((s64)len <= 0) {
+ fprintf(stderr, "Invalid length:%s\n", length_str);
+ return 1;
+ }
+
+ /* same as not specifying either prefault or no-prefault */
+ if (only_prefault && no_prefault)
+ only_prefault = no_prefault = false;
+
+ for (i = 0; routines[i].name; i++) {
+ if (!strcmp(routines[i].name, routine))
+ break;
+ }
+ if (!routines[i].name) {
+ printf("Unknown routine:%s\n", routine);
+ printf("Available routines...\n");
+ for (i = 0; routines[i].name; i++) {
+ printf("\t%s ... %s\n",
+ routines[i].name, routines[i].desc);
+ }
+ return 1;
+ }
+
+ if (bench_format == BENCH_FORMAT_DEFAULT)
+ printf("# Copying %s Bytes ...\n\n", length_str);
+
+ if (!only_prefault && !no_prefault) {
+ /* show both results */
+ if (use_cycle) {
+ result_cycle[0] =
+ do_memset_cycle(routines[i].fn, len, false);
+ result_cycle[1] =
+ do_memset_cycle(routines[i].fn, len, true);
+ } else {
+ result_bps[0] =
+ do_memset_gettimeofday(routines[i].fn,
+ len, false);
+ result_bps[1] =
+ do_memset_gettimeofday(routines[i].fn,
+ len, true);
+ }
+ } else {
+ if (use_cycle) {
+ result_cycle[pf] =
+ do_memset_cycle(routines[i].fn,
+ len, only_prefault);
+ } else {
+ result_bps[pf] =
+ do_memset_gettimeofday(routines[i].fn,
+ len, only_prefault);
+ }
+ }
+
+ switch (bench_format) {
+ case BENCH_FORMAT_DEFAULT:
+ if (!only_prefault && !no_prefault) {
+ if (use_cycle) {
+ printf(" %14lf Cycle/Byte\n",
+ (double)result_cycle[0]
+ / (double)len);
+ printf(" %14lf Cycle/Byte (with prefault)\n ",
+ (double)result_cycle[1]
+ / (double)len);
+ } else {
+ print_bps(result_bps[0]);
+ printf("\n");
+ print_bps(result_bps[1]);
+ printf(" (with prefault)\n");
+ }
+ } else {
+ if (use_cycle) {
+ printf(" %14lf Cycle/Byte",
+ (double)result_cycle[pf]
+ / (double)len);
+ } else
+ print_bps(result_bps[pf]);
+
+ printf("%s\n", only_prefault ? " (with prefault)" : "");
+ }
+ break;
+ case BENCH_FORMAT_SIMPLE:
+ if (!only_prefault && !no_prefault) {
+ if (use_cycle) {
+ printf("%lf %lf\n",
+ (double)result_cycle[0] / (double)len,
+ (double)result_cycle[1] / (double)len);
+ } else {
+ printf("%lf %lf\n",
+ result_bps[0], result_bps[1]);
+ }
+ } else {
+ if (use_cycle) {
+ printf("%lf\n", (double)result_cycle[pf]
+ / (double)len);
+ } else
+ printf("%lf\n", result_bps[pf]);
+ }
+ break;
+ default:
+ /* reaching this means there's some disaster: */
+ die("unknown format: %d\n", bench_format);
+ break;
+ }
+
+ return 0;
+}
diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c
new file mode 100644
index 00000000000..ebfa163b80b
--- /dev/null
+++ b/tools/perf/bench/numa.c
@@ -0,0 +1,1744 @@
+/*
+ * numa.c
+ *
+ * numa: Simulate NUMA-sensitive workloads and measure their NUMA performance
+ */
+
+#include "../perf.h"
+#include "../builtin.h"
+#include "../util/util.h"
+#include "../util/parse-options.h"
+
+#include "bench.h"
+
+#include <errno.h>
+#include <sched.h>
+#include <stdio.h>
+#include <assert.h>
+#include <malloc.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <sys/mman.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <sys/prctl.h>
+#include <sys/types.h>
+
+#include <numa.h>
+#include <numaif.h>
+
+/*
+ * Regular printout to the terminal, suppressed if -q is specified:
+ */
+#define tprintf(x...) do { if (g && g->p.show_details >= 0) printf(x); } while (0)
+
+/*
+ * Debug printf:
+ */
+#define dprintf(x...) do { if (g && g->p.show_details >= 1) printf(x); } while (0)
+
+struct thread_data {
+ int curr_cpu;
+ cpu_set_t bind_cpumask;
+ int bind_node;
+ u8 *process_data;
+ int process_nr;
+ int thread_nr;
+ int task_nr;
+ unsigned int loops_done;
+ u64 val;
+ u64 runtime_ns;
+ pthread_mutex_t *process_lock;
+};
+
+/* Parameters set by options: */
+
+struct params {
+ /* Startup synchronization: */
+ bool serialize_startup;
+
+ /* Task hierarchy: */
+ int nr_proc;
+ int nr_threads;
+
+ /* Working set sizes: */
+ const char *mb_global_str;
+ const char *mb_proc_str;
+ const char *mb_proc_locked_str;
+ const char *mb_thread_str;
+
+ double mb_global;
+ double mb_proc;
+ double mb_proc_locked;
+ double mb_thread;
+
+ /* Access patterns to the working set: */
+ bool data_reads;
+ bool data_writes;
+ bool data_backwards;
+ bool data_zero_memset;
+ bool data_rand_walk;
+ u32 nr_loops;
+ u32 nr_secs;
+ u32 sleep_usecs;
+
+ /* Working set initialization: */
+ bool init_zero;
+ bool init_random;
+ bool init_cpu0;
+
+ /* Misc options: */
+ int show_details;
+ int run_all;
+ int thp;
+
+ long bytes_global;
+ long bytes_process;
+ long bytes_process_locked;
+ long bytes_thread;
+
+ int nr_tasks;
+ bool show_quiet;
+
+ bool show_convergence;
+ bool measure_convergence;
+
+ int perturb_secs;
+ int nr_cpus;
+ int nr_nodes;
+
+ /* Affinity options -C and -M: */
+ char *cpu_list_str;
+ char *node_list_str;
+};
+
+
+/* Global, read-writable area, accessible to all processes and threads: */
+
+struct global_info {
+ u8 *data;
+
+ pthread_mutex_t startup_mutex;
+ int nr_tasks_started;
+
+ pthread_mutex_t startup_done_mutex;
+
+ pthread_mutex_t start_work_mutex;
+ int nr_tasks_working;
+
+ pthread_mutex_t stop_work_mutex;
+ u64 bytes_done;
+
+ struct thread_data *threads;
+
+ /* Convergence latency measurement: */
+ bool all_converged;
+ bool stop_work;
+
+ int print_once;
+
+ struct params p;
+};
+
+static struct global_info *g = NULL;
+
+static int parse_cpus_opt(const struct option *opt, const char *arg, int unset);
+static int parse_nodes_opt(const struct option *opt, const char *arg, int unset);
+
+struct params p0;
+
+static const struct option options[] = {
+ OPT_INTEGER('p', "nr_proc" , &p0.nr_proc, "number of processes"),
+ OPT_INTEGER('t', "nr_threads" , &p0.nr_threads, "number of threads per process"),
+
+ OPT_STRING('G', "mb_global" , &p0.mb_global_str, "MB", "global memory (MBs)"),
+ OPT_STRING('P', "mb_proc" , &p0.mb_proc_str, "MB", "process memory (MBs)"),
+ OPT_STRING('L', "mb_proc_locked", &p0.mb_proc_locked_str,"MB", "process serialized/locked memory access (MBs), <= process_memory"),
+ OPT_STRING('T', "mb_thread" , &p0.mb_thread_str, "MB", "thread memory (MBs)"),
+
+ OPT_UINTEGER('l', "nr_loops" , &p0.nr_loops, "max number of loops to run"),
+ OPT_UINTEGER('s', "nr_secs" , &p0.nr_secs, "max number of seconds to run"),
+ OPT_UINTEGER('u', "usleep" , &p0.sleep_usecs, "usecs to sleep per loop iteration"),
+
+ OPT_BOOLEAN('R', "data_reads" , &p0.data_reads, "access the data via reads (can be mixed with -W)"),
+ OPT_BOOLEAN('W', "data_writes" , &p0.data_writes, "access the data via writes (can be mixed with -R)"),
+ OPT_BOOLEAN('B', "data_backwards", &p0.data_backwards, "access the data backwards as well"),
+ OPT_BOOLEAN('Z', "data_zero_memset", &p0.data_zero_memset,"access the data via glibc bzero only"),
+ OPT_BOOLEAN('r', "data_rand_walk", &p0.data_rand_walk, "access the data with random (32bit LFSR) walk"),
+
+
+ OPT_BOOLEAN('z', "init_zero" , &p0.init_zero, "bzero the initial allocations"),
+ OPT_BOOLEAN('I', "init_random" , &p0.init_random, "randomize the contents of the initial allocations"),
+ OPT_BOOLEAN('0', "init_cpu0" , &p0.init_cpu0, "do the initial allocations on CPU#0"),
+ OPT_INTEGER('x', "perturb_secs", &p0.perturb_secs, "perturb thread 0/0 every X secs, to test convergence stability"),
+
+ OPT_INCR ('d', "show_details" , &p0.show_details, "Show details"),
+ OPT_INCR ('a', "all" , &p0.run_all, "Run all tests in the suite"),
+ OPT_INTEGER('H', "thp" , &p0.thp, "MADV_NOHUGEPAGE < 0 < MADV_HUGEPAGE"),
+ OPT_BOOLEAN('c', "show_convergence", &p0.show_convergence, "show convergence details"),
+ OPT_BOOLEAN('m', "measure_convergence", &p0.measure_convergence, "measure convergence latency"),
+ OPT_BOOLEAN('q', "quiet" , &p0.show_quiet, "quiet mode (suppress the regular printouts)"),
+ OPT_BOOLEAN('S', "serialize-startup", &p0.serialize_startup,"serialize thread startup"),
+
+ /* Special option string parsing callbacks: */
+ OPT_CALLBACK('C', "cpus", NULL, "cpu[,cpu2,...cpuN]",
+ "bind the first N tasks to these specific cpus (the rest is unbound)",
+ parse_cpus_opt),
+ OPT_CALLBACK('M', "memnodes", NULL, "node[,node2,...nodeN]",
+ "bind the first N tasks to these specific memory nodes (the rest is unbound)",
+ parse_nodes_opt),
+ OPT_END()
+};
+
+static const char * const bench_numa_usage[] = {
+ "perf bench numa <options>",
+ NULL
+};
+
+static const char * const numa_usage[] = {
+ "perf bench numa mem [<options>]",
+ NULL
+};
+
+static cpu_set_t bind_to_cpu(int target_cpu)
+{
+ cpu_set_t orig_mask, mask;
+ int ret;
+
+ ret = sched_getaffinity(0, sizeof(orig_mask), &orig_mask);
+ BUG_ON(ret);
+
+ CPU_ZERO(&mask);
+
+ if (target_cpu == -1) {
+ int cpu;
+
+ for (cpu = 0; cpu < g->p.nr_cpus; cpu++)
+ CPU_SET(cpu, &mask);
+ } else {
+ BUG_ON(target_cpu < 0 || target_cpu >= g->p.nr_cpus);
+ CPU_SET(target_cpu, &mask);
+ }
+
+ ret = sched_setaffinity(0, sizeof(mask), &mask);
+ BUG_ON(ret);
+
+ return orig_mask;
+}
+
+static cpu_set_t bind_to_node(int target_node)
+{
+ int cpus_per_node = g->p.nr_cpus/g->p.nr_nodes;
+ cpu_set_t orig_mask, mask;
+ int cpu;
+ int ret;
+
+ BUG_ON(cpus_per_node*g->p.nr_nodes != g->p.nr_cpus);
+ BUG_ON(!cpus_per_node);
+
+ ret = sched_getaffinity(0, sizeof(orig_mask), &orig_mask);
+ BUG_ON(ret);
+
+ CPU_ZERO(&mask);
+
+ if (target_node == -1) {
+ for (cpu = 0; cpu < g->p.nr_cpus; cpu++)
+ CPU_SET(cpu, &mask);
+ } else {
+ int cpu_start = (target_node + 0) * cpus_per_node;
+ int cpu_stop = (target_node + 1) * cpus_per_node;
+
+ BUG_ON(cpu_stop > g->p.nr_cpus);
+
+ for (cpu = cpu_start; cpu < cpu_stop; cpu++)
+ CPU_SET(cpu, &mask);
+ }
+
+ ret = sched_setaffinity(0, sizeof(mask), &mask);
+ BUG_ON(ret);
+
+ return orig_mask;
+}
+
+static void bind_to_cpumask(cpu_set_t mask)
+{
+ int ret;
+
+ ret = sched_setaffinity(0, sizeof(mask), &mask);
+ BUG_ON(ret);
+}
+
+static void mempol_restore(void)
+{
+ int ret;
+
+ ret = set_mempolicy(MPOL_DEFAULT, NULL, g->p.nr_nodes-1);
+
+ BUG_ON(ret);
+}
+
+static void bind_to_memnode(int node)
+{
+ unsigned long nodemask;
+ int ret;
+
+ if (node == -1)
+ return;
+
+ BUG_ON(g->p.nr_nodes > (int)sizeof(nodemask));
+ nodemask = 1L << node;
+
+ ret = set_mempolicy(MPOL_BIND, &nodemask, sizeof(nodemask)*8);
+ dprintf("binding to node %d, mask: %016lx => %d\n", node, nodemask, ret);
+
+ BUG_ON(ret);
+}
+
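+/* Huge page size (2MB), used below to pad and align the data buffers: */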
+#define HPSIZE (2*1024*1024)
+
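+/* Name the calling task via prctl(PR_SET_NAME) so it shows up in top/ps: */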
+#define set_taskname(fmt...) \
+do { \
+ char name[20]; \
+ \
+ snprintf(name, 20, fmt); \
+ prctl(PR_SET_NAME, name); \
+} while (0)
+
+static u8 *alloc_data(ssize_t bytes0, int map_flags,
+ int init_zero, int init_cpu0, int thp, int init_random)
+{
+ cpu_set_t orig_mask;
+ ssize_t bytes;
+ u8 *buf;
+ int ret;
+
+ if (!bytes0)
+ return NULL;
+
+ /* Allocate and initialize all memory on CPU#0: */
+ if (init_cpu0) {
+ orig_mask = bind_to_node(0);
+ bind_to_memnode(0);
+ }
+
+ bytes = bytes0 + HPSIZE;
+
+ buf = (void *)mmap(0, bytes, PROT_READ|PROT_WRITE, MAP_ANON|map_flags, -1, 0);
+ BUG_ON(buf == (void *)-1);
+
+ if (map_flags == MAP_PRIVATE) {
+ if (thp > 0) {
+ ret = madvise(buf, bytes, MADV_HUGEPAGE);
+ if (ret && !g->print_once) {
+ g->print_once = 1;
+ printf("WARNING: Could not enable THP - do: 'echo madvise > /sys/kernel/mm/transparent_hugepage/enabled'\n");
+ }
+ }
+ if (thp < 0) {
+ ret = madvise(buf, bytes, MADV_NOHUGEPAGE);
+ if (ret && !g->print_once) {
+ g->print_once = 1;
+ printf("WARNING: Could not disable THP: run a CONFIG_TRANSPARENT_HUGEPAGE kernel?\n");
+ }
+ }
+ }
+
+ if (init_zero) {
+ bzero(buf, bytes);
+ } else {
+ /* Initialize random contents, different in each word: */
+ if (init_random) {
+ u64 *wbuf = (void *)buf;
+ long off = rand();
+ long i;
+
+ for (i = 0; i < bytes/8; i++)
+ wbuf[i] = i + off;
+ }
+ }
+
+ /* Align to 2MB boundary: */
+ buf = (void *)(((unsigned long)buf + HPSIZE-1) & ~(HPSIZE-1));
+
+ /* Restore affinity: */
+ if (init_cpu0) {
+ bind_to_cpumask(orig_mask);
+ mempol_restore();
+ }
+
+ return buf;
+}
+
+static void free_data(void *data, ssize_t bytes)
+{
+ int ret;
+
+ if (!data)
+ return;
+
+ ret = munmap(data, bytes);
+ BUG_ON(ret);
+}
+
+/*
+ * Create a shared memory buffer that can be shared between processes, zeroed:
+ */
+static void * zalloc_shared_data(ssize_t bytes)
+{
+ return alloc_data(bytes, MAP_SHARED, 1, g->p.init_cpu0, g->p.thp, g->p.init_random);
+}
+
+/*
+ * Create a shared memory buffer that can be shared between processes:
+ */
+static void * setup_shared_data(ssize_t bytes)
+{
+ return alloc_data(bytes, MAP_SHARED, 0, g->p.init_cpu0, g->p.thp, g->p.init_random);
+}
+
+/*
+ * Allocate process-local memory - this will either be shared between
+ * threads of this process, or only be accessed by this thread:
+ */
+static void * setup_private_data(ssize_t bytes)
+{
+ return alloc_data(bytes, MAP_PRIVATE, 0, g->p.init_cpu0, g->p.thp, g->p.init_random);
+}
+
+/*
+ * Return a process-shared (global) mutex:
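+ * (PTHREAD_PROCESS_SHARED lets it live in the MAP_SHARED global area, so it
+ * can synchronize worker processes, not just threads.)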
+ */
+static void init_global_mutex(pthread_mutex_t *mutex)
+{
+ pthread_mutexattr_t attr;
+
+ pthread_mutexattr_init(&attr);
+ pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
+ pthread_mutex_init(mutex, &attr);
+}
+
+static int parse_cpu_list(const char *arg)
+{
+ p0.cpu_list_str = strdup(arg);
+
+ dprintf("got CPU list: {%s}\n", p0.cpu_list_str);
+
+ return 0;
+}
+
+static int parse_setup_cpu_list(void)
+{
+ struct thread_data *td;
+ char *str0, *str;
+ int t;
+
+ if (!g->p.cpu_list_str)
+ return 0;
+
+ dprintf("g->p.nr_tasks: %d\n", g->p.nr_tasks);
+
+ str0 = str = strdup(g->p.cpu_list_str);
+ t = 0;
+
+ BUG_ON(!str);
+
+ tprintf("# binding tasks to CPUs:\n");
+ tprintf("# ");
+
+ while (true) {
+ int bind_cpu, bind_cpu_0, bind_cpu_1;
+ char *tok, *tok_end, *tok_step, *tok_len, *tok_mul;
+ int bind_len;
+ int step;
+ int mul;
+
+ tok = strsep(&str, ",");
+ if (!tok)
+ break;
+
+ tok_end = strstr(tok, "-");
+
+ dprintf("\ntoken: {%s}, end: {%s}\n", tok, tok_end);
+ if (!tok_end) {
+ /* Single CPU specified: */
+ bind_cpu_0 = bind_cpu_1 = atol(tok);
+ } else {
+ /* CPU range specified (for example: "5-11"): */
+ bind_cpu_0 = atol(tok);
+ bind_cpu_1 = atol(tok_end + 1);
+ }
+
+ step = 1;
+ tok_step = strstr(tok, "#");
+ if (tok_step) {
+ step = atol(tok_step + 1);
+ BUG_ON(step <= 0 || step >= g->p.nr_cpus);
+ }
+
+ /*
+ * Mask length.
+ * Eg: "--cpus 8_4-16#4" means: '--cpus 8_4,12_4,16_4',
+ * where the _4 means the next 4 CPUs are allowed.
+ */
+ bind_len = 1;
+ tok_len = strstr(tok, "_");
+ if (tok_len) {
+ bind_len = atol(tok_len + 1);
+ BUG_ON(bind_len <= 0 || bind_len > g->p.nr_cpus);
+ }
+
+ /* Multiplier shortcut, "0x8" is a shortcut for: "0,0,0,0,0,0,0,0" */
+ mul = 1;
+ tok_mul = strstr(tok, "x");
+ if (tok_mul) {
+ mul = atol(tok_mul + 1);
+ BUG_ON(mul <= 0);
+ }
+
+ dprintf("CPUs: %d_%d-%d#%dx%d\n", bind_cpu_0, bind_len, bind_cpu_1, step, mul);
+
+ if (bind_cpu_0 >= g->p.nr_cpus || bind_cpu_1 >= g->p.nr_cpus) {
+ printf("\nTest not applicable, system has only %d CPUs.\n", g->p.nr_cpus);
+ return -1;
+ }
+
+ BUG_ON(bind_cpu_0 < 0 || bind_cpu_1 < 0);
+ BUG_ON(bind_cpu_0 > bind_cpu_1);
+
+ for (bind_cpu = bind_cpu_0; bind_cpu <= bind_cpu_1; bind_cpu += step) {
+ int i;
+
+ for (i = 0; i < mul; i++) {
+ int cpu;
+
+ if (t >= g->p.nr_tasks) {
+ printf("\n# NOTE: ignoring bind CPUs starting at CPU#%d\n #", bind_cpu);
+ goto out;
+ }
+ td = g->threads + t;
+
+ if (t)
+ tprintf(",");
+ if (bind_len > 1) {
+ tprintf("%2d/%d", bind_cpu, bind_len);
+ } else {
+ tprintf("%2d", bind_cpu);
+ }
+
+ CPU_ZERO(&td->bind_cpumask);
+ for (cpu = bind_cpu; cpu < bind_cpu+bind_len; cpu++) {
+ BUG_ON(cpu < 0 || cpu >= g->p.nr_cpus);
+ CPU_SET(cpu, &td->bind_cpumask);
+ }
+ t++;
+ }
+ }
+ }
+out:
+
+ tprintf("\n");
+
+ if (t < g->p.nr_tasks)
+ printf("# NOTE: %d tasks bound, %d tasks unbound\n", t, g->p.nr_tasks - t);
+
+ free(str0);
+ return 0;
+}
+
+static int parse_cpus_opt(const struct option *opt __maybe_unused,
+ const char *arg, int unset __maybe_unused)
+{
+ if (!arg)
+ return -1;
+
+ return parse_cpu_list(arg);
+}
+
+static int parse_node_list(const char *arg)
+{
+ p0.node_list_str = strdup(arg);
+
+ dprintf("got NODE list: {%s}\n", p0.node_list_str);
+
+ return 0;
+}
+
+static int parse_setup_node_list(void)
+{
+ struct thread_data *td;
+ char *str0, *str;
+ int t;
+
+ if (!g->p.node_list_str)
+ return 0;
+
+ dprintf("g->p.nr_tasks: %d\n", g->p.nr_tasks);
+
+ str0 = str = strdup(g->p.node_list_str);
+ t = 0;
+
+ BUG_ON(!str);
+
+ tprintf("# binding tasks to NODEs:\n");
+ tprintf("# ");
+
+ while (true) {
+ int bind_node, bind_node_0, bind_node_1;
+ char *tok, *tok_end, *tok_step, *tok_mul;
+ int step;
+ int mul;
+
+ tok = strsep(&str, ",");
+ if (!tok)
+ break;
+
+ tok_end = strstr(tok, "-");
+
+ dprintf("\ntoken: {%s}, end: {%s}\n", tok, tok_end);
+ if (!tok_end) {
+ /* Single NODE specified: */
+ bind_node_0 = bind_node_1 = atol(tok);
+ } else {
+ /* NODE range specified (for example: "5-11"): */
+ bind_node_0 = atol(tok);
+ bind_node_1 = atol(tok_end + 1);
+ }
+
+ step = 1;
+ tok_step = strstr(tok, "#");
+ if (tok_step) {
+ step = atol(tok_step + 1);
+ BUG_ON(step <= 0 || step >= g->p.nr_nodes);
+ }
+
+ /* Multiplier shortcut, "0x8" is a shortcut for: "0,0,0,0,0,0,0,0" */
+ mul = 1;
+ tok_mul = strstr(tok, "x");
+ if (tok_mul) {
+ mul = atol(tok_mul + 1);
+ BUG_ON(mul <= 0);
+ }
+
+ dprintf("NODEs: %d-%d #%d\n", bind_node_0, bind_node_1, step);
+
+ if (bind_node_0 >= g->p.nr_nodes || bind_node_1 >= g->p.nr_nodes) {
+ printf("\nTest not applicable, system has only %d nodes.\n", g->p.nr_nodes);
+ return -1;
+ }
+
+ BUG_ON(bind_node_0 < 0 || bind_node_1 < 0);
+ BUG_ON(bind_node_0 > bind_node_1);
+
+ for (bind_node = bind_node_0; bind_node <= bind_node_1; bind_node += step) {
+ int i;
+
+ for (i = 0; i < mul; i++) {
+ if (t >= g->p.nr_tasks) {
+ printf("\n# NOTE: ignoring bind NODEs starting at NODE#%d\n", bind_node);
+ goto out;
+ }
+ td = g->threads + t;
+
+ if (!t)
+ tprintf(" %2d", bind_node);
+ else
+ tprintf(",%2d", bind_node);
+
+ td->bind_node = bind_node;
+ t++;
+ }
+ }
+ }
+out:
+
+ tprintf("\n");
+
+ if (t < g->p.nr_tasks)
+ printf("# NOTE: %d tasks mem-bound, %d tasks unbound\n", t, g->p.nr_tasks - t);
+
+ free(str0);
+ return 0;
+}
+
+static int parse_nodes_opt(const struct option *opt __maybe_unused,
+ const char *arg, int unset __maybe_unused)
+{
+ if (!arg)
+ return -1;
+
+ return parse_node_list(arg);
+}
+
+#define BIT(x) (1ul << x)
+
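+/*
+ * One step of a (Galois-style) 32-bit LFSR; do_work() uses it to generate
+ * the pseudo-random walk over the working set for -r/--data_rand_walk:
+ */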
+static inline uint32_t lfsr_32(uint32_t lfsr)
+{
+ const uint32_t taps = BIT(1) | BIT(5) | BIT(6) | BIT(31);
+ return (lfsr>>1) ^ ((0x0u - (lfsr & 0x1u)) & taps);
+}
+
+/*
+ * Make sure there's real data dependency to RAM (when read
+ * accesses are enabled), so the compiler, the CPU and the
+ * kernel (KSM, zero page, etc.) cannot optimize away RAM
+ * accesses:
+ */
+static inline u64 access_data(u64 *data __attribute__((unused)), u64 val)
+{
+ if (g->p.data_reads)
+ val += *data;
+ if (g->p.data_writes)
+ *data = val + 1;
+ return val;
+}
+
+/*
+ * The worker process does two types of work, a forwards going
+ * loop and a backwards going loop.
+ *
+ * We do this so that on multiprocessor systems we do not create
+ * a 'train' of processing, with highly synchronized processes,
+ * skewing the whole benchmark.
+ */
+static u64 do_work(u8 *__data, long bytes, int nr, int nr_max, int loop, u64 val)
+{
+ long words = bytes/sizeof(u64);
+ u64 *data = (void *)__data;
+ long chunk_0, chunk_1;
+ u64 *d0, *d, *d1;
+ long off;
+ long i;
+
+ BUG_ON(!data && words);
+ BUG_ON(data && !words);
+
+ if (!data)
+ return val;
+
+ /* Very simple memset() work variant: */
+ if (g->p.data_zero_memset && !g->p.data_rand_walk) {
+ bzero(data, bytes);
+ return val;
+ }
+
+ /* Spread out by PID/TID nr and by loop nr: */
+ chunk_0 = words/nr_max;
+ chunk_1 = words/g->p.nr_loops;
+ off = nr*chunk_0 + loop*chunk_1;
+
+ while (off >= words)
+ off -= words;
+
+ if (g->p.data_rand_walk) {
+ u32 lfsr = nr + loop + val;
+ int j;
+
+ for (i = 0; i < words/1024; i++) {
+ long start, end;
+
+ lfsr = lfsr_32(lfsr);
+
+ start = lfsr % words;
+ end = min(start + 1024, words-1);
+
+ if (g->p.data_zero_memset) {
+ bzero(data + start, (end-start) * sizeof(u64));
+ } else {
+ for (j = start; j < end; j++)
+ val = access_data(data + j, val);
+ }
+ }
+ } else if (!g->p.data_backwards || (nr + loop) & 1) {
+
+ d0 = data + off;
+ d = data + off + 1;
+ d1 = data + words;
+
+ /* Process data forwards: */
+ for (;;) {
+ if (unlikely(d >= d1))
+ d = data;
+ if (unlikely(d == d0))
+ break;
+
+ val = access_data(d, val);
+
+ d++;
+ }
+ } else {
+ /* Process data backwards: */
+
+ d0 = data + off;
+ d = data + off - 1;
+ d1 = data + words;
+
+ /* Process data backwards: */
+ for (;;) {
+ if (unlikely(d < data))
+ d = data + words-1;
+ if (unlikely(d == d0))
+ break;
+
+ val = access_data(d, val);
+
+ d--;
+ }
+ }
+
+ return val;
+}
+
+static void update_curr_cpu(int task_nr, unsigned long bytes_worked)
+{
+ unsigned int cpu;
+
+ cpu = sched_getcpu();
+
+ g->threads[task_nr].curr_cpu = cpu;
+ prctl(0, bytes_worked);
+}
+
+#define MAX_NR_NODES 64
+
+/*
+ * Count the number of nodes a process's threads
+ * are spread out on.
+ *
+ * A count of 1 means that the process is compressed
+ * to a single node. A count of g->p.nr_nodes means it's
+ * spread out on the whole system.
+ */
+static int count_process_nodes(int process_nr)
+{
+ char node_present[MAX_NR_NODES] = { 0, };
+ int nodes;
+ int n, t;
+
+ for (t = 0; t < g->p.nr_threads; t++) {
+ struct thread_data *td;
+ int task_nr;
+ int node;
+
+ task_nr = process_nr*g->p.nr_threads + t;
+ td = g->threads + task_nr;
+
+ node = numa_node_of_cpu(td->curr_cpu);
+ node_present[node] = 1;
+ }
+
+ nodes = 0;
+
+ for (n = 0; n < MAX_NR_NODES; n++)
+ nodes += node_present[n];
+
+ return nodes;
+}
+
+/*
+ * Count the number of distinct process-threads a node contains.
+ *
+ * A count of 1 means that the node contains only a single
+ * process. If all nodes on the system contain at most one
+ * process then we are well-converged.
+ */
+static int count_node_processes(int node)
+{
+ int processes = 0;
+ int t, p;
+
+ for (p = 0; p < g->p.nr_proc; p++) {
+ for (t = 0; t < g->p.nr_threads; t++) {
+ struct thread_data *td;
+ int task_nr;
+ int n;
+
+ task_nr = p*g->p.nr_threads + t;
+ td = g->threads + task_nr;
+
+ n = numa_node_of_cpu(td->curr_cpu);
+ if (n == node) {
+ processes++;
+ break;
+ }
+ }
+ }
+
+ return processes;
+}
+
+static void calc_convergence_compression(int *strong)
+{
+ unsigned int nodes_min, nodes_max;
+ int p;
+
+ nodes_min = -1;
+ nodes_max = 0;
+
+ for (p = 0; p < g->p.nr_proc; p++) {
+ unsigned int nodes = count_process_nodes(p);
+
+ nodes_min = min(nodes, nodes_min);
+ nodes_max = max(nodes, nodes_max);
+ }
+
+ /* Strong convergence: all threads compress on a single node: */
+ if (nodes_min == 1 && nodes_max == 1) {
+ *strong = 1;
+ } else {
+ *strong = 0;
+ tprintf(" {%d-%d}", nodes_min, nodes_max);
+ }
+}
+
+static void calc_convergence(double runtime_ns_max, double *convergence)
+{
+ unsigned int loops_done_min, loops_done_max;
+ int process_groups;
+ int nodes[MAX_NR_NODES];
+ int distance;
+ int nr_min;
+ int nr_max;
+ int strong;
+ int sum;
+ int nr;
+ int node;
+ int cpu;
+ int t;
+
+ if (!g->p.show_convergence && !g->p.measure_convergence)
+ return;
+
+ for (node = 0; node < g->p.nr_nodes; node++)
+ nodes[node] = 0;
+
+ loops_done_min = -1;
+ loops_done_max = 0;
+
+ for (t = 0; t < g->p.nr_tasks; t++) {
+ struct thread_data *td = g->threads + t;
+ unsigned int loops_done;
+
+ cpu = td->curr_cpu;
+
+ /* Not all threads have written it yet: */
+ if (cpu < 0)
+ continue;
+
+ node = numa_node_of_cpu(cpu);
+
+ nodes[node]++;
+
+ loops_done = td->loops_done;
+ loops_done_min = min(loops_done, loops_done_min);
+ loops_done_max = max(loops_done, loops_done_max);
+ }
+
+ nr_max = 0;
+ nr_min = g->p.nr_tasks;
+ sum = 0;
+
+ for (node = 0; node < g->p.nr_nodes; node++) {
+ nr = nodes[node];
+ nr_min = min(nr, nr_min);
+ nr_max = max(nr, nr_max);
+ sum += nr;
+ }
+ BUG_ON(nr_min > nr_max);
+
+ BUG_ON(sum > g->p.nr_tasks);
+
+ if (0 && (sum < g->p.nr_tasks))
+ return;
+
+ /*
+ * Count the number of distinct process groups present
+ * on nodes - when we are converged this will decrease
+ * to g->p.nr_proc:
+ */
+ process_groups = 0;
+
+ for (node = 0; node < g->p.nr_nodes; node++) {
+ int processes = count_node_processes(node);
+
+ nr = nodes[node];
+ tprintf(" %2d/%-2d", nr, processes);
+
+ process_groups += processes;
+ }
+
+ distance = nr_max - nr_min;
+
+ tprintf(" [%2d/%-2d]", distance, process_groups);
+
+ tprintf(" l:%3d-%-3d (%3d)",
+ loops_done_min, loops_done_max, loops_done_max-loops_done_min);
+
+ if (loops_done_min && loops_done_max) {
+ double skew = 1.0 - (double)loops_done_min/loops_done_max;
+
+ tprintf(" [%4.1f%%]", skew * 100.0);
+ }
+
+ calc_convergence_compression(&strong);
+
+ if (strong && process_groups == g->p.nr_proc) {
+ if (!*convergence) {
+ *convergence = runtime_ns_max;
+ tprintf(" (%6.1fs converged)\n", *convergence/1e9);
+ if (g->p.measure_convergence) {
+ g->all_converged = true;
+ g->stop_work = true;
+ }
+ }
+ } else {
+ if (*convergence) {
+ tprintf(" (%6.1fs de-converged)", runtime_ns_max/1e9);
+ *convergence = 0;
+ }
+ tprintf("\n");
+ }
+}
+
+static void show_summary(double runtime_ns_max, int l, double *convergence)
+{
+ tprintf("\r # %5.1f%% [%.1f mins]",
+ (double)(l+1)/g->p.nr_loops*100.0, runtime_ns_max/1e9 / 60.0);
+
+ calc_convergence(runtime_ns_max, convergence);
+
+ if (g->p.show_details >= 0)
+ fflush(stdout);
+}
+
+static void *worker_thread(void *__tdata)
+{
+ struct thread_data *td = __tdata;
+ struct timeval start0, start, stop, diff;
+ int process_nr = td->process_nr;
+ int thread_nr = td->thread_nr;
+ unsigned long last_perturbance;
+ int task_nr = td->task_nr;
+ int details = g->p.show_details;
+ int first_task, last_task;
+ double convergence = 0;
+ u64 val = td->val;
+ double runtime_ns_max;
+ u8 *global_data;
+ u8 *process_data;
+ u8 *thread_data;
+ u64 bytes_done;
+ long work_done;
+ u32 l;
+
+ bind_to_cpumask(td->bind_cpumask);
+ bind_to_memnode(td->bind_node);
+
+ set_taskname("thread %d/%d", process_nr, thread_nr);
+
+ global_data = g->data;
+ process_data = td->process_data;
+ thread_data = setup_private_data(g->p.bytes_thread);
+
+ bytes_done = 0;
+
+ last_task = 0;
+ if (process_nr == g->p.nr_proc-1 && thread_nr == g->p.nr_threads-1)
+ last_task = 1;
+
+ first_task = 0;
+ if (process_nr == 0 && thread_nr == 0)
+ first_task = 1;
+
+ if (details >= 2) {
+ printf("# thread %2d / %2d global mem: %p, process mem: %p, thread mem: %p\n",
+ process_nr, thread_nr, global_data, process_data, thread_data);
+ }
+
+ if (g->p.serialize_startup) {
+ pthread_mutex_lock(&g->startup_mutex);
+ g->nr_tasks_started++;
+ pthread_mutex_unlock(&g->startup_mutex);
+
+ /* Here we will wait for the main process to start us all at once: */
+ pthread_mutex_lock(&g->start_work_mutex);
+ g->nr_tasks_working++;
+
+ /* Last one wakes the main process: */
+ if (g->nr_tasks_working == g->p.nr_tasks)
+ pthread_mutex_unlock(&g->startup_done_mutex);
+
+ pthread_mutex_unlock(&g->start_work_mutex);
+ }
+
+ gettimeofday(&start0, NULL);
+
+ start = stop = start0;
+ last_perturbance = start.tv_sec;
+
+ for (l = 0; l < g->p.nr_loops; l++) {
+ start = stop;
+
+ if (g->stop_work)
+ break;
+
+ val += do_work(global_data, g->p.bytes_global, process_nr, g->p.nr_proc, l, val);
+ val += do_work(process_data, g->p.bytes_process, thread_nr, g->p.nr_threads, l, val);
+ val += do_work(thread_data, g->p.bytes_thread, 0, 1, l, val);
+
+ if (g->p.sleep_usecs) {
+ pthread_mutex_lock(td->process_lock);
+ usleep(g->p.sleep_usecs);
+ pthread_mutex_unlock(td->process_lock);
+ }
+ /*
+ * Amount of work to be done under a process-global lock:
+ */
+ if (g->p.bytes_process_locked) {
+ pthread_mutex_lock(td->process_lock);
+ val += do_work(process_data, g->p.bytes_process_locked, thread_nr, g->p.nr_threads, l, val);
+ pthread_mutex_unlock(td->process_lock);
+ }
+
+ work_done = g->p.bytes_global + g->p.bytes_process +
+ g->p.bytes_process_locked + g->p.bytes_thread;
+
+ update_curr_cpu(task_nr, work_done);
+ bytes_done += work_done;
+
+ if (details < 0 && !g->p.perturb_secs && !g->p.measure_convergence && !g->p.nr_secs)
+ continue;
+
+ td->loops_done = l;
+
+ gettimeofday(&stop, NULL);
+
+ /* Check whether our max runtime timed out: */
+ if (g->p.nr_secs) {
+ timersub(&stop, &start0, &diff);
+ if ((u32)diff.tv_sec >= g->p.nr_secs) {
+ g->stop_work = true;
+ break;
+ }
+ }
+
+ /* Update the summary at most once per second: */
+ if (start.tv_sec == stop.tv_sec)
+ continue;
+
+ /*
+ * Perturb the first task's equilibrium every g->p.perturb_secs seconds,
+ * by migrating it to a CPU in the other half of the system:
+ */
+ if (first_task && g->p.perturb_secs && (int)(stop.tv_sec - last_perturbance) >= g->p.perturb_secs) {
+ cpu_set_t orig_mask;
+ int target_cpu;
+ int this_cpu;
+
+ last_perturbance = stop.tv_sec;
+
+ /*
+ * Depending on where we are running, move into
+ * the other half of the system, to create some
+ * real disturbance:
+ */
+ this_cpu = g->threads[task_nr].curr_cpu;
+ if (this_cpu < g->p.nr_cpus/2)
+ target_cpu = g->p.nr_cpus-1;
+ else
+ target_cpu = 0;
+
+ orig_mask = bind_to_cpu(target_cpu);
+
+ /* Here we are running on the target CPU already */
+ if (details >= 1)
+ printf(" (injecting perturbalance, moved to CPU#%d)\n", target_cpu);
+
+ bind_to_cpumask(orig_mask);
+ }
+
+ if (details >= 3) {
+ timersub(&stop, &start, &diff);
+ runtime_ns_max = diff.tv_sec * 1000000000;
+ runtime_ns_max += diff.tv_usec * 1000;
+
+ if (details >= 0) {
+ printf(" #%2d / %2d: %14.2lf nsecs/op [val: %016"PRIx64"]\n",
+ process_nr, thread_nr, runtime_ns_max / bytes_done, val);
+ }
+ fflush(stdout);
+ }
+ if (!last_task)
+ continue;
+
+ timersub(&stop, &start0, &diff);
+ runtime_ns_max = diff.tv_sec * 1000000000ULL;
+ runtime_ns_max += diff.tv_usec * 1000ULL;
+
+ show_summary(runtime_ns_max, l, &convergence);
+ }
+
+ gettimeofday(&stop, NULL);
+ timersub(&stop, &start0, &diff);
+ td->runtime_ns = diff.tv_sec * 1000000000ULL;
+ td->runtime_ns += diff.tv_usec * 1000ULL;
+
+ free_data(thread_data, g->p.bytes_thread);
+
+ pthread_mutex_lock(&g->stop_work_mutex);
+ g->bytes_done += bytes_done;
+ pthread_mutex_unlock(&g->stop_work_mutex);
+
+ return NULL;
+}
+
+/*
+ * A worker process starts a couple of threads:
+ */
+static void worker_process(int process_nr)
+{
+ pthread_mutex_t process_lock;
+ struct thread_data *td;
+ pthread_t *pthreads;
+ u8 *process_data;
+ int task_nr;
+ int ret;
+ int t;
+
+ pthread_mutex_init(&process_lock, NULL);
+ set_taskname("process %d", process_nr);
+
+ /*
+ * Pick up the memory policy and the CPU binding of our first thread,
+ * so that we initialize memory accordingly:
+ */
+ task_nr = process_nr*g->p.nr_threads;
+ td = g->threads + task_nr;
+
+ bind_to_memnode(td->bind_node);
+ bind_to_cpumask(td->bind_cpumask);
+
+ pthreads = zalloc(g->p.nr_threads * sizeof(pthread_t));
+ process_data = setup_private_data(g->p.bytes_process);
+
+ if (g->p.show_details >= 3) {
+ printf(" # process %2d global mem: %p, process mem: %p\n",
+ process_nr, g->data, process_data);
+ }
+
+ for (t = 0; t < g->p.nr_threads; t++) {
+ task_nr = process_nr*g->p.nr_threads + t;
+ td = g->threads + task_nr;
+
+ td->process_data = process_data;
+ td->process_nr = process_nr;
+ td->thread_nr = t;
+ td->task_nr = task_nr;
+ td->val = rand();
+ td->curr_cpu = -1;
+ td->process_lock = &process_lock;
+
+ ret = pthread_create(pthreads + t, NULL, worker_thread, td);
+ BUG_ON(ret);
+ }
+
+ for (t = 0; t < g->p.nr_threads; t++) {
+ ret = pthread_join(pthreads[t], NULL);
+ BUG_ON(ret);
+ }
+
+ free_data(process_data, g->p.bytes_process);
+ free(pthreads);
+}
+
+static void print_summary(void)
+{
+ if (g->p.show_details < 0)
+ return;
+
+ printf("\n ###\n");
+ printf(" # %d %s will execute (on %d nodes, %d CPUs):\n",
+ g->p.nr_tasks, g->p.nr_tasks == 1 ? "task" : "tasks", g->p.nr_nodes, g->p.nr_cpus);
+ printf(" # %5dx %5ldMB global shared mem operations\n",
+ g->p.nr_loops, g->p.bytes_global/1024/1024);
+ printf(" # %5dx %5ldMB process shared mem operations\n",
+ g->p.nr_loops, g->p.bytes_process/1024/1024);
+ printf(" # %5dx %5ldMB thread local mem operations\n",
+ g->p.nr_loops, g->p.bytes_thread/1024/1024);
+
+ printf(" ###\n");
+
+ printf("\n ###\n"); fflush(stdout);
+}
+
+static void init_thread_data(void)
+{
+ ssize_t size = sizeof(*g->threads)*g->p.nr_tasks;
+ int t;
+
+ g->threads = zalloc_shared_data(size);
+
+ for (t = 0; t < g->p.nr_tasks; t++) {
+ struct thread_data *td = g->threads + t;
+ int cpu;
+
+ /* Allow all nodes by default: */
+ td->bind_node = -1;
+
+ /* Allow all CPUs by default: */
+ CPU_ZERO(&td->bind_cpumask);
+ for (cpu = 0; cpu < g->p.nr_cpus; cpu++)
+ CPU_SET(cpu, &td->bind_cpumask);
+ }
+}
+
+static void deinit_thread_data(void)
+{
+ ssize_t size = sizeof(*g->threads)*g->p.nr_tasks;
+
+ free_data(g->threads, size);
+}
+
+static int init(void)
+{
+ g = (void *)alloc_data(sizeof(*g), MAP_SHARED, 1, 0, 0 /* THP */, 0);
+
+ /* Copy over options: */
+ g->p = p0;
+
+ g->p.nr_cpus = numa_num_configured_cpus();
+
+ g->p.nr_nodes = numa_max_node() + 1;
+
+ /* char array in count_process_nodes(): */
+ BUG_ON(g->p.nr_nodes > MAX_NR_NODES || g->p.nr_nodes < 0);
+
+ if (g->p.show_quiet && !g->p.show_details)
+ g->p.show_details = -1;
+
+ /* Some memory should be specified: */
+ if (!g->p.mb_global_str && !g->p.mb_proc_str && !g->p.mb_thread_str)
+ return -1;
+
+ if (g->p.mb_global_str) {
+ g->p.mb_global = atof(g->p.mb_global_str);
+ BUG_ON(g->p.mb_global < 0);
+ }
+
+ if (g->p.mb_proc_str) {
+ g->p.mb_proc = atof(g->p.mb_proc_str);
+ BUG_ON(g->p.mb_proc < 0);
+ }
+
+ if (g->p.mb_proc_locked_str) {
+ g->p.mb_proc_locked = atof(g->p.mb_proc_locked_str);
+ BUG_ON(g->p.mb_proc_locked < 0);
+ BUG_ON(g->p.mb_proc_locked > g->p.mb_proc);
+ }
+
+ if (g->p.mb_thread_str) {
+ g->p.mb_thread = atof(g->p.mb_thread_str);
+ BUG_ON(g->p.mb_thread < 0);
+ }
+
+ BUG_ON(g->p.nr_threads <= 0);
+ BUG_ON(g->p.nr_proc <= 0);
+
+ g->p.nr_tasks = g->p.nr_proc*g->p.nr_threads;
+
+ g->p.bytes_global = g->p.mb_global *1024L*1024L;
+ g->p.bytes_process = g->p.mb_proc *1024L*1024L;
+ g->p.bytes_process_locked = g->p.mb_proc_locked *1024L*1024L;
+ g->p.bytes_thread = g->p.mb_thread *1024L*1024L;
+
+ g->data = setup_shared_data(g->p.bytes_global);
+
+ /* Startup serialization: */
+ init_global_mutex(&g->start_work_mutex);
+ init_global_mutex(&g->startup_mutex);
+ init_global_mutex(&g->startup_done_mutex);
+ init_global_mutex(&g->stop_work_mutex);
+
+ init_thread_data();
+
+ tprintf("#\n");
+ if (parse_setup_cpu_list() || parse_setup_node_list())
+ return -1;
+ tprintf("#\n");
+
+ print_summary();
+
+ return 0;
+}
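
init() accepts the -G/-P/-T sizes as strings and converts them with atof(), so fractional
megabyte sizes are accepted. A small standalone sketch of that conversion (the option value
here is hypothetical):

	#include <stdlib.h>
	#include <stdio.h>

	int main(void)
	{
		const char *mb_thread_str = "0.5";	/* hypothetical "-T 0.5" */
		double mb = atof(mb_thread_str);
		long bytes = mb * 1024L * 1024L;	/* MB -> bytes, as init() does */

		printf("%s MB -> %ld bytes\n", mb_thread_str, bytes);
		return 0;
	}
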
+
+static void deinit(void)
+{
+ free_data(g->data, g->p.bytes_global);
+ g->data = NULL;
+
+ deinit_thread_data();
+
+ free_data(g, sizeof(*g));
+ g = NULL;
+}
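
The whole struct g, including g->threads[] and g->bytes_done, is assumed to live in a
MAP_SHARED mapping (alloc_data() is called with MAP_SHARED above), which is what lets the
fork()ed worker processes and the parent see each other's updates. A minimal sketch of that
pattern, not the benchmark's own alloc_data():

	#include <sys/mman.h>
	#include <sys/wait.h>
	#include <unistd.h>
	#include <stdio.h>
	#include <stdlib.h>

	/* Anonymous shared mapping: survives fork() as one shared copy. */
	static void *alloc_shared(size_t bytes)
	{
		void *buf = mmap(NULL, bytes, PROT_READ | PROT_WRITE,
				 MAP_SHARED | MAP_ANONYMOUS, -1, 0);

		if (buf == MAP_FAILED) {
			perror("mmap");
			exit(1);
		}
		return buf;
	}

	int main(void)
	{
		long *counter = alloc_shared(sizeof(*counter));

		if (fork() == 0) {		/* worker: update is visible to the parent */
			*counter = 42;
			_exit(0);
		}
		wait(NULL);
		printf("counter seen by parent: %ld\n", *counter);
		return 0;
	}
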
+
+/*
+ * Print a short or long result, depending on the verbosity setting:
+ */
+static void print_res(const char *name, double val,
+ const char *txt_unit, const char *txt_short, const char *txt_long)
+{
+ if (!name)
+ name = "main,";
+
+ if (g->p.show_quiet)
+ printf(" %-30s %15.3f, %-15s %s\n", name, val, txt_unit, txt_short);
+ else
+ printf(" %14.3f %s\n", val, txt_long);
+}
+
+static int __bench_numa(const char *name)
+{
+ struct timeval start, stop, diff;
+ u64 runtime_ns_min, runtime_ns_sum;
+ pid_t *pids, pid, wpid;
+ double delta_runtime;
+ double runtime_avg;
+ double runtime_sec_max;
+ double runtime_sec_min;
+ int wait_stat;
+ double bytes;
+ int i, t;
+
+ if (init())
+ return -1;
+
+ pids = zalloc(g->p.nr_proc * sizeof(*pids));
+ pid = -1;
+
+ /* All threads try to acquire it, this way we can wait for them to start up: */
+ pthread_mutex_lock(&g->start_work_mutex);
+
+ if (g->p.serialize_startup) {
+ tprintf(" #\n");
+ tprintf(" # Startup synchronization: ..."); fflush(stdout);
+ }
+
+ gettimeofday(&start, NULL);
+
+ for (i = 0; i < g->p.nr_proc; i++) {
+ pid = fork();
+ dprintf(" # process %2d: PID %d\n", i, pid);
+
+ BUG_ON(pid < 0);
+ if (!pid) {
+ /* Child process: */
+ worker_process(i);
+
+ exit(0);
+ }
+ pids[i] = pid;
+
+ }
+ /* Wait for all the threads to start up: */
+ while (g->nr_tasks_started != g->p.nr_tasks)
+ usleep(1000);
+
+ BUG_ON(g->nr_tasks_started != g->p.nr_tasks);
+
+ if (g->p.serialize_startup) {
+ double startup_sec;
+
+ pthread_mutex_lock(&g->startup_done_mutex);
+
+ /* This will start all threads: */
+ pthread_mutex_unlock(&g->start_work_mutex);
+
+ /* This mutex is locked - the last started thread will wake us: */
+ pthread_mutex_lock(&g->startup_done_mutex);
+
+ gettimeofday(&stop, NULL);
+
+ timersub(&stop, &start, &diff);
+
+ startup_sec = diff.tv_sec * 1000000000.0;
+ startup_sec += diff.tv_usec * 1000.0;
+ startup_sec /= 1e9;
+
+ tprintf(" threads initialized in %.6f seconds.\n", startup_sec);
+ tprintf(" #\n");
+
+ start = stop;
+ pthread_mutex_unlock(&g->startup_done_mutex);
+ } else {
+ gettimeofday(&start, NULL);
+ }
+
+ /* Parent process: */
+
+
+ for (i = 0; i < g->p.nr_proc; i++) {
+ wpid = waitpid(pids[i], &wait_stat, 0);
+ BUG_ON(wpid < 0);
+ BUG_ON(!WIFEXITED(wait_stat));
+
+ }
+
+ runtime_ns_sum = 0;
+ runtime_ns_min = -1LL;
+
+ for (t = 0; t < g->p.nr_tasks; t++) {
+ u64 thread_runtime_ns = g->threads[t].runtime_ns;
+
+ runtime_ns_sum += thread_runtime_ns;
+ runtime_ns_min = min(thread_runtime_ns, runtime_ns_min);
+ }
+
+ gettimeofday(&stop, NULL);
+ timersub(&stop, &start, &diff);
+
+ BUG_ON(bench_format != BENCH_FORMAT_DEFAULT);
+
+ tprintf("\n ###\n");
+ tprintf("\n");
+
+ runtime_sec_max = diff.tv_sec * 1000000000.0;
+ runtime_sec_max += diff.tv_usec * 1000.0;
+ runtime_sec_max /= 1e9;
+
+ runtime_sec_min = runtime_ns_min/1e9;
+
+ bytes = g->bytes_done;
+ runtime_avg = (double)runtime_ns_sum / g->p.nr_tasks / 1e9;
+
+ if (g->p.measure_convergence) {
+ print_res(name, runtime_sec_max,
+ "secs,", "NUMA-convergence-latency", "secs latency to NUMA-converge");
+ }
+
+ print_res(name, runtime_sec_max,
+ "secs,", "runtime-max/thread", "secs slowest (max) thread-runtime");
+
+ print_res(name, runtime_sec_min,
+ "secs,", "runtime-min/thread", "secs fastest (min) thread-runtime");
+
+ print_res(name, runtime_avg,
+ "secs,", "runtime-avg/thread", "secs average thread-runtime");
+
+ delta_runtime = (runtime_sec_max - runtime_sec_min)/2.0;
+ print_res(name, delta_runtime / runtime_sec_max * 100.0,
+		"%,", "spread-runtime/thread", "% difference between max/min runtime");
+
+ print_res(name, bytes / g->p.nr_tasks / 1e9,
+ "GB,", "data/thread", "GB data processed, per thread");
+
+ print_res(name, bytes / 1e9,
+ "GB,", "data-total", "GB data processed, total");
+
+ print_res(name, runtime_sec_max * 1e9 / (bytes / g->p.nr_tasks),
+ "nsecs,", "runtime/byte/thread","nsecs/byte/thread runtime");
+
+ print_res(name, bytes / g->p.nr_tasks / 1e9 / runtime_sec_max,
+ "GB/sec,", "thread-speed", "GB/sec/thread speed");
+
+ print_res(name, bytes / runtime_sec_max / 1e9,
+ "GB/sec,", "total-speed", "GB/sec total speed");
+
+ free(pids);
+
+ deinit();
+
+ return 0;
+}
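
__bench_numa() serializes startup by holding start_work_mutex while the workers are created
and unlocking it once everything is in place. For that to work across fork(), the mutex has
to be process-shared and live in shared memory, which is presumably what init_global_mutex()
arranges. A self-contained sketch of the same gate pattern (compile with -pthread):

	#include <pthread.h>
	#include <stdio.h>
	#include <sys/mman.h>
	#include <sys/wait.h>
	#include <unistd.h>

	int main(void)
	{
		/* The gate must live in shared memory so fork()ed workers see it: */
		pthread_mutex_t *gate = mmap(NULL, sizeof(*gate), PROT_READ | PROT_WRITE,
					     MAP_SHARED | MAP_ANONYMOUS, -1, 0);
		pthread_mutexattr_t attr;
		int i;

		if (gate == MAP_FAILED)
			return 1;

		pthread_mutexattr_init(&attr);
		pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
		pthread_mutex_init(gate, &attr);

		pthread_mutex_lock(gate);		/* hold the workers back */

		for (i = 0; i < 4; i++) {
			if (fork() == 0) {
				pthread_mutex_lock(gate);	/* blocks until released */
				pthread_mutex_unlock(gate);
				printf("worker %d starts real work\n", i);
				_exit(0);
			}
		}

		sleep(1);				/* "startup" phase */
		pthread_mutex_unlock(gate);		/* release all workers */

		while (wait(NULL) > 0)
			;
		return 0;
	}
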
+
+#define MAX_ARGS 50
+
+static int command_size(const char **argv)
+{
+ int size = 0;
+
+ while (*argv) {
+ size++;
+ argv++;
+ }
+
+ BUG_ON(size >= MAX_ARGS);
+
+ return size;
+}
+
+static void init_params(struct params *p, const char *name, int argc, const char **argv)
+{
+ int i;
+
+ printf("\n # Running %s \"perf bench numa", name);
+
+ for (i = 0; i < argc; i++)
+ printf(" %s", argv[i]);
+
+ printf("\"\n");
+
+ memset(p, 0, sizeof(*p));
+
+ /* Initialize nonzero defaults: */
+
+ p->serialize_startup = 1;
+ p->data_reads = true;
+ p->data_writes = true;
+ p->data_backwards = true;
+ p->data_rand_walk = true;
+ p->nr_loops = -1;
+ p->init_random = true;
+ p->mb_global_str = "1";
+ p->nr_proc = 1;
+ p->nr_threads = 1;
+ p->nr_secs = 5;
+ p->run_all = argc == 1;
+}
+
+static int run_bench_numa(const char *name, const char **argv)
+{
+ int argc = command_size(argv);
+
+ init_params(&p0, name, argc, argv);
+ argc = parse_options(argc, argv, options, bench_numa_usage, 0);
+ if (argc)
+ goto err;
+
+ if (__bench_numa(name))
+ goto err;
+
+ return 0;
+
+err:
+ return -1;
+}
+
+#define OPT_BW_RAM "-s", "20", "-zZq", "--thp", " 1", "--no-data_rand_walk"
+#define OPT_BW_RAM_NOTHP OPT_BW_RAM, "--thp", "-1"
+
+#define OPT_CONV "-s", "100", "-zZ0qcm", "--thp", " 1"
+#define OPT_CONV_NOTHP OPT_CONV, "--thp", "-1"
+
+#define OPT_BW "-s", "20", "-zZ0q", "--thp", " 1"
+#define OPT_BW_NOTHP OPT_BW, "--thp", "-1"
+
+/*
+ * The built-in test-suite executed by "perf bench numa -a".
+ *
+ * (A minimum of 4 nodes and 16 GB of RAM is recommended.)
+ */
+static const char *tests[][MAX_ARGS] = {
+ /* Basic single-stream NUMA bandwidth measurements: */
+ { "RAM-bw-local,", "mem", "-p", "1", "-t", "1", "-P", "1024",
+ "-C" , "0", "-M", "0", OPT_BW_RAM },
+ { "RAM-bw-local-NOTHP,",
+ "mem", "-p", "1", "-t", "1", "-P", "1024",
+ "-C" , "0", "-M", "0", OPT_BW_RAM_NOTHP },
+ { "RAM-bw-remote,", "mem", "-p", "1", "-t", "1", "-P", "1024",
+ "-C" , "0", "-M", "1", OPT_BW_RAM },
+
+ /* 2-stream NUMA bandwidth measurements: */
+ { "RAM-bw-local-2x,", "mem", "-p", "2", "-t", "1", "-P", "1024",
+ "-C", "0,2", "-M", "0x2", OPT_BW_RAM },
+ { "RAM-bw-remote-2x,", "mem", "-p", "2", "-t", "1", "-P", "1024",
+ "-C", "0,2", "-M", "1x2", OPT_BW_RAM },
+
+ /* Cross-stream NUMA bandwidth measurement: */
+ { "RAM-bw-cross,", "mem", "-p", "2", "-t", "1", "-P", "1024",
+ "-C", "0,8", "-M", "1,0", OPT_BW_RAM },
+
+ /* Convergence latency measurements: */
+ { " 1x3-convergence,", "mem", "-p", "1", "-t", "3", "-P", "512", OPT_CONV },
+ { " 1x4-convergence,", "mem", "-p", "1", "-t", "4", "-P", "512", OPT_CONV },
+ { " 1x6-convergence,", "mem", "-p", "1", "-t", "6", "-P", "1020", OPT_CONV },
+ { " 2x3-convergence,", "mem", "-p", "3", "-t", "3", "-P", "1020", OPT_CONV },
+ { " 3x3-convergence,", "mem", "-p", "3", "-t", "3", "-P", "1020", OPT_CONV },
+ { " 4x4-convergence,", "mem", "-p", "4", "-t", "4", "-P", "512", OPT_CONV },
+ { " 4x4-convergence-NOTHP,",
+ "mem", "-p", "4", "-t", "4", "-P", "512", OPT_CONV_NOTHP },
+ { " 4x6-convergence,", "mem", "-p", "4", "-t", "6", "-P", "1020", OPT_CONV },
+ { " 4x8-convergence,", "mem", "-p", "4", "-t", "8", "-P", "512", OPT_CONV },
+ { " 8x4-convergence,", "mem", "-p", "8", "-t", "4", "-P", "512", OPT_CONV },
+ { " 8x4-convergence-NOTHP,",
+ "mem", "-p", "8", "-t", "4", "-P", "512", OPT_CONV_NOTHP },
+ { " 3x1-convergence,", "mem", "-p", "3", "-t", "1", "-P", "512", OPT_CONV },
+ { " 4x1-convergence,", "mem", "-p", "4", "-t", "1", "-P", "512", OPT_CONV },
+ { " 8x1-convergence,", "mem", "-p", "8", "-t", "1", "-P", "512", OPT_CONV },
+ { "16x1-convergence,", "mem", "-p", "16", "-t", "1", "-P", "256", OPT_CONV },
+ { "32x1-convergence,", "mem", "-p", "32", "-t", "1", "-P", "128", OPT_CONV },
+
+ /* Various NUMA process/thread layout bandwidth measurements: */
+ { " 2x1-bw-process,", "mem", "-p", "2", "-t", "1", "-P", "1024", OPT_BW },
+ { " 3x1-bw-process,", "mem", "-p", "3", "-t", "1", "-P", "1024", OPT_BW },
+ { " 4x1-bw-process,", "mem", "-p", "4", "-t", "1", "-P", "1024", OPT_BW },
+ { " 8x1-bw-process,", "mem", "-p", "8", "-t", "1", "-P", " 512", OPT_BW },
+ { " 8x1-bw-process-NOTHP,",
+ "mem", "-p", "8", "-t", "1", "-P", " 512", OPT_BW_NOTHP },
+ { "16x1-bw-process,", "mem", "-p", "16", "-t", "1", "-P", "256", OPT_BW },
+
+ { " 4x1-bw-thread,", "mem", "-p", "1", "-t", "4", "-T", "256", OPT_BW },
+ { " 8x1-bw-thread,", "mem", "-p", "1", "-t", "8", "-T", "256", OPT_BW },
+ { "16x1-bw-thread,", "mem", "-p", "1", "-t", "16", "-T", "128", OPT_BW },
+ { "32x1-bw-thread,", "mem", "-p", "1", "-t", "32", "-T", "64", OPT_BW },
+
+ { " 2x3-bw-thread,", "mem", "-p", "2", "-t", "3", "-P", "512", OPT_BW },
+ { " 4x4-bw-thread,", "mem", "-p", "4", "-t", "4", "-P", "512", OPT_BW },
+ { " 4x6-bw-thread,", "mem", "-p", "4", "-t", "6", "-P", "512", OPT_BW },
+ { " 4x8-bw-thread,", "mem", "-p", "4", "-t", "8", "-P", "512", OPT_BW },
+ { " 4x8-bw-thread-NOTHP,",
+ "mem", "-p", "4", "-t", "8", "-P", "512", OPT_BW_NOTHP },
+ { " 3x3-bw-thread,", "mem", "-p", "3", "-t", "3", "-P", "512", OPT_BW },
+ { " 5x5-bw-thread,", "mem", "-p", "5", "-t", "5", "-P", "512", OPT_BW },
+
+ { "2x16-bw-thread,", "mem", "-p", "2", "-t", "16", "-P", "512", OPT_BW },
+ { "1x32-bw-thread,", "mem", "-p", "1", "-t", "32", "-P", "2048", OPT_BW },
+
+ { "numa02-bw,", "mem", "-p", "1", "-t", "32", "-T", "32", OPT_BW },
+ { "numa02-bw-NOTHP,", "mem", "-p", "1", "-t", "32", "-T", "32", OPT_BW_NOTHP },
+ { "numa01-bw-thread,", "mem", "-p", "2", "-t", "16", "-T", "192", OPT_BW },
+ { "numa01-bw-thread-NOTHP,",
+ "mem", "-p", "2", "-t", "16", "-T", "192", OPT_BW_NOTHP },
+};
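
The OPT_* macros above simply splice extra argv words into each table entry. For reference,
after preprocessing the first entry expands as follows:

	/*
	 *	{ "RAM-bw-local,", "mem", "-p", "1", "-t", "1", "-P", "1024",
	 *	  "-C", "0", "-M", "0", OPT_BW_RAM },
	 *
	 * becomes the argv words
	 *
	 *	"mem" "-p" "1" "-t" "1" "-P" "1024" "-C" "0" "-M" "0"
	 *	"-s" "20" "-zZq" "--thp" " 1" "--no-data_rand_walk"
	 *
	 * i.e. roughly "perf bench numa mem -p 1 -t 1 -P 1024 -C 0 -M 0 -s 20 -zZq
	 * --thp 1 --no-data_rand_walk"; run_bench_numa() passes tests[i] + 1 as argv
	 * and tests[i][0] as the test name.
	 */
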
+
+static int bench_all(void)
+{
+ int nr = ARRAY_SIZE(tests);
+ int ret;
+ int i;
+
+ ret = system("echo ' #'; echo ' # Running test on: '$(uname -a); echo ' #'");
+ BUG_ON(ret < 0);
+
+ for (i = 0; i < nr; i++) {
+ run_bench_numa(tests[i][0], tests[i] + 1);
+ }
+
+ printf("\n");
+
+ return 0;
+}
+
+int bench_numa(int argc, const char **argv, const char *prefix __maybe_unused)
+{
+ init_params(&p0, "main,", argc, argv);
+ argc = parse_options(argc, argv, options, bench_numa_usage, 0);
+ if (argc)
+ goto err;
+
+ if (p0.run_all)
+ return bench_all();
+
+ if (__bench_numa(NULL))
+ goto err;
+
+ return 0;
+
+err:
+ usage_with_options(numa_usage, options);
+ return -1;
+}
diff --git a/tools/perf/bench/sched-messaging.c b/tools/perf/bench/sched-messaging.c
index 81cee78181f..cc1190a0849 100644
--- a/tools/perf/bench/sched-messaging.c
+++ b/tools/perf/bench/sched-messaging.c
@@ -31,9 +31,9 @@
#define DATASIZE 100
-static int use_pipes = 0;
+static bool use_pipes = false;
static unsigned int loops = 100;
-static unsigned int thread_mode = 0;
+static bool thread_mode = false;
static unsigned int num_groups = 10;
struct sender_context {
@@ -256,10 +256,8 @@ static const struct option options[] = {
"Use pipe() instead of socketpair()"),
OPT_BOOLEAN('t', "thread", &thread_mode,
"Be multi thread instead of multi process"),
- OPT_INTEGER('g', "group", &num_groups,
- "Specify number of groups"),
- OPT_INTEGER('l', "loop", &loops,
- "Specify number of loops"),
+ OPT_UINTEGER('g', "group", &num_groups, "Specify number of groups"),
+ OPT_UINTEGER('l', "loop", &loops, "Specify number of loops"),
OPT_END()
};
@@ -269,7 +267,7 @@ static const char * const bench_sched_message_usage[] = {
};
int bench_sched_messaging(int argc, const char **argv,
- const char *prefix __used)
+ const char *prefix __maybe_unused)
{
unsigned int i, total_children;
struct timeval start, stop, diff;
diff --git a/tools/perf/bench/sched-pipe.c b/tools/perf/bench/sched-pipe.c
index 4f77c7c2764..07a8d7646a1 100644
--- a/tools/perf/bench/sched-pipe.c
+++ b/tools/perf/bench/sched-pipe.c
@@ -7,9 +7,7 @@
* Based on pipe-test-1m.c by Ingo Molnar <mingo@redhat.com>
* http://people.redhat.com/mingo/cfs-scheduler/tools/pipe-test-1m.c
* Ported to perf by Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
- *
*/
-
#include "../perf.h"
#include "../util/util.h"
#include "../util/parse-options.h"
@@ -28,12 +26,24 @@
#include <sys/time.h>
#include <sys/types.h>
+#include <pthread.h>
+
+struct thread_data {
+ int nr;
+ int pipe_read;
+ int pipe_write;
+ pthread_t pthread;
+};
+
#define LOOPS_DEFAULT 1000000
-static int loops = LOOPS_DEFAULT;
+static int loops = LOOPS_DEFAULT;
+
+/* Use processes by default: */
+static bool threaded;
static const struct option options[] = {
- OPT_INTEGER('l', "loop", &loops,
- "Specify number of loops"),
+ OPT_INTEGER('l', "loop", &loops, "Specify number of loops"),
+ OPT_BOOLEAN('T', "threaded", &threaded, "Specify threads/process based task setup"),
OPT_END()
};
@@ -42,59 +52,106 @@ static const char * const bench_sched_pipe_usage[] = {
NULL
};
-int bench_sched_pipe(int argc, const char **argv,
- const char *prefix __used)
+static void *worker_thread(void *__tdata)
{
- int pipe_1[2], pipe_2[2];
+ struct thread_data *td = __tdata;
int m = 0, i;
+ int ret;
+
+ for (i = 0; i < loops; i++) {
+ if (!td->nr) {
+ ret = read(td->pipe_read, &m, sizeof(int));
+ BUG_ON(ret != sizeof(int));
+ ret = write(td->pipe_write, &m, sizeof(int));
+ BUG_ON(ret != sizeof(int));
+ } else {
+ ret = write(td->pipe_write, &m, sizeof(int));
+ BUG_ON(ret != sizeof(int));
+ ret = read(td->pipe_read, &m, sizeof(int));
+ BUG_ON(ret != sizeof(int));
+ }
+ }
+
+ return NULL;
+}
+
+int bench_sched_pipe(int argc, const char **argv, const char *prefix __maybe_unused)
+{
+ struct thread_data threads[2], *td;
+ int pipe_1[2], pipe_2[2];
struct timeval start, stop, diff;
unsigned long long result_usec = 0;
+ int nr_threads = 2;
+ int t;
/*
* why does "ret" exist?
* discarding returned value of read(), write()
* causes error in building environment for perf
*/
- int ret, wait_stat;
- pid_t pid, retpid;
+ int __maybe_unused ret, wait_stat;
+ pid_t pid, retpid __maybe_unused;
- argc = parse_options(argc, argv, options,
- bench_sched_pipe_usage, 0);
+ argc = parse_options(argc, argv, options, bench_sched_pipe_usage, 0);
- assert(!pipe(pipe_1));
- assert(!pipe(pipe_2));
-
- pid = fork();
- assert(pid >= 0);
+ BUG_ON(pipe(pipe_1));
+ BUG_ON(pipe(pipe_2));
gettimeofday(&start, NULL);
- if (!pid) {
- for (i = 0; i < loops; i++) {
- ret = read(pipe_1[0], &m, sizeof(int));
- ret = write(pipe_2[1], &m, sizeof(int));
- }
- } else {
- for (i = 0; i < loops; i++) {
- ret = write(pipe_1[1], &m, sizeof(int));
- ret = read(pipe_2[0], &m, sizeof(int));
+ for (t = 0; t < nr_threads; t++) {
+ td = threads + t;
+
+ td->nr = t;
+
+ if (t == 0) {
+ td->pipe_read = pipe_1[0];
+ td->pipe_write = pipe_2[1];
+ } else {
+ td->pipe_write = pipe_1[1];
+ td->pipe_read = pipe_2[0];
}
}
- gettimeofday(&stop, NULL);
- timersub(&stop, &start, &diff);
- if (pid) {
+ if (threaded) {
+
+ for (t = 0; t < nr_threads; t++) {
+ td = threads + t;
+
+ ret = pthread_create(&td->pthread, NULL, worker_thread, td);
+ BUG_ON(ret);
+ }
+
+ for (t = 0; t < nr_threads; t++) {
+ td = threads + t;
+
+ ret = pthread_join(td->pthread, NULL);
+ BUG_ON(ret);
+ }
+
+ } else {
+ pid = fork();
+ assert(pid >= 0);
+
+ if (!pid) {
+ worker_thread(threads + 0);
+ exit(0);
+ } else {
+ worker_thread(threads + 1);
+ }
+
retpid = waitpid(pid, &wait_stat, 0);
assert((retpid == pid) && WIFEXITED(wait_stat));
- } else {
- exit(0);
}
+ gettimeofday(&stop, NULL);
+ timersub(&stop, &start, &diff);
+
switch (bench_format) {
case BENCH_FORMAT_DEFAULT:
- printf("# Extecuted %d pipe operations between two tasks\n\n",
- loops);
+ printf("# Executed %d pipe operations between two %s\n\n",
+ loops, threaded ? "threads" : "processes");
result_usec = diff.tv_sec * 1000000;
result_usec += diff.tv_usec;
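
The reworked sched-pipe benchmark times a strict ping-pong of one int over two pipes between
two tasks: task 0 reads then writes, task 1 writes then reads. A self-contained sketch of the
same wiring with a fork()ed pair (the loop count is arbitrary):

	#include <unistd.h>
	#include <assert.h>
	#include <sys/wait.h>

	int main(void)
	{
		int pipe_1[2], pipe_2[2], m = 0, i;

		assert(!pipe(pipe_1) && !pipe(pipe_2));

		if (fork() == 0) {			/* task 0: read, then echo back */
			for (i = 0; i < 1000; i++) {
				assert(read(pipe_1[0], &m, sizeof(m)) == sizeof(m));
				assert(write(pipe_2[1], &m, sizeof(m)) == sizeof(m));
			}
			_exit(0);
		}

		for (i = 0; i < 1000; i++) {		/* task 1: send, then wait for echo */
			assert(write(pipe_1[1], &m, sizeof(m)) == sizeof(m));
			assert(read(pipe_2[0], &m, sizeof(m)) == sizeof(m));
		}
		wait(NULL);
		return 0;
	}
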
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 6ad7148451c..1ec429fef2b 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -8,17 +8,18 @@
#include "builtin.h"
#include "util/util.h"
-
#include "util/color.h"
#include <linux/list.h>
#include "util/cache.h"
#include <linux/rbtree.h>
#include "util/symbol.h"
-#include "util/string.h"
#include "perf.h"
#include "util/debug.h"
+#include "util/evlist.h"
+#include "util/evsel.h"
+#include "util/annotate.h"
#include "util/event.h"
#include "util/parse-options.h"
#include "util/parse-events.h"
@@ -26,87 +27,35 @@
#include "util/sort.h"
#include "util/hist.h"
#include "util/session.h"
-
-static char const *input_name = "perf.data";
-
-static int force;
-
-static int full_paths;
-
-static int print_line;
-
-struct sym_hist {
- u64 sum;
- u64 ip[0];
+#include "util/tool.h"
+#include "util/data.h"
+#include "arch/common.h"
+
+#include <dlfcn.h>
+#include <linux/bitmap.h>
+
+struct perf_annotate {
+ struct perf_tool tool;
+ bool force, use_tui, use_stdio, use_gtk;
+ bool full_paths;
+ bool print_line;
+ bool skip_missing;
+ const char *sym_hist_filter;
+ const char *cpu_list;
+ DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
};
-struct sym_ext {
- struct rb_node node;
- double percent;
- char *path;
-};
-
-struct sym_priv {
- struct sym_hist *hist;
- struct sym_ext *ext;
-};
-
-static const char *sym_hist_filter;
-
-static int sym__alloc_hist(struct symbol *self)
-{
- struct sym_priv *priv = symbol__priv(self);
- const int size = (sizeof(*priv->hist) +
- (self->end - self->start) * sizeof(u64));
-
- priv->hist = zalloc(size);
- return priv->hist == NULL ? -1 : 0;
-}
-
-/*
- * collect histogram counts
- */
-static int annotate__hist_hit(struct hist_entry *he, u64 ip)
-{
- unsigned int sym_size, offset;
- struct symbol *sym = he->sym;
- struct sym_priv *priv;
- struct sym_hist *h;
-
- he->count++;
-
- if (!sym || !he->map)
- return 0;
-
- priv = symbol__priv(sym);
- if (priv->hist == NULL && sym__alloc_hist(sym) < 0)
- return -ENOMEM;
-
- sym_size = sym->end - sym->start;
- offset = ip - sym->start;
-
- pr_debug3("%s: ip=%#Lx\n", __func__, he->map->unmap_ip(he->map, ip));
-
- if (offset >= sym_size)
- return 0;
-
- h = priv->hist;
- h->sum++;
- h->ip[offset]++;
-
- pr_debug3("%#Lx %s: count++ [ip: %#Lx, %#Lx] => %Ld\n", he->sym->start,
- he->sym->name, ip, ip - he->sym->start, h->ip[offset]);
- return 0;
-}
-
-static int perf_session__add_hist_entry(struct perf_session *self,
- struct addr_location *al, u64 count)
+static int perf_evsel__add_sample(struct perf_evsel *evsel,
+ struct perf_sample *sample __maybe_unused,
+ struct addr_location *al,
+ struct perf_annotate *ann)
{
- bool hit;
struct hist_entry *he;
+ int ret;
- if (sym_hist_filter != NULL &&
- (al->sym == NULL || strcmp(sym_hist_filter, al->sym->name) != 0)) {
+ if (ann->sym_hist_filter != NULL &&
+ (al->sym == NULL ||
+ strcmp(ann->sym_hist_filter, al->sym->name) != 0)) {
/* We're only interested in a symbol named sym_hist_filter */
if (al->sym != NULL) {
rb_erase(&al->sym->rb_node,
@@ -116,27 +65,35 @@ static int perf_session__add_hist_entry(struct perf_session *self,
return 0;
}
- he = __perf_session__add_hist_entry(&self->hists, al, NULL, count, &hit);
+ he = __hists__add_entry(&evsel->hists, al, NULL, NULL, NULL, 1, 1, 0,
+ true);
if (he == NULL)
return -ENOMEM;
- return annotate__hist_hit(he, al->addr);
+ ret = hist_entry__inc_addr_samples(he, evsel->idx, al->addr);
+ hists__inc_nr_samples(&evsel->hists, true);
+ return ret;
}
-static int process_sample_event(event_t *event, struct perf_session *session)
+static int process_sample_event(struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_sample *sample,
+ struct perf_evsel *evsel,
+ struct machine *machine)
{
+ struct perf_annotate *ann = container_of(tool, struct perf_annotate, tool);
struct addr_location al;
- dump_printf("(IP, %d): %d: %#Lx\n", event->header.misc,
- event->ip.pid, event->ip.ip);
-
- if (event__preprocess_sample(event, session, &al, NULL) < 0) {
+ if (perf_event__preprocess_sample(event, machine, &al, sample) < 0) {
pr_warning("problem processing %d event, skipping it.\n",
event->header.type);
return -1;
}
- if (!al.filtered && perf_session__add_hist_entry(session, &al, 1)) {
+ if (ann->cpu_list && !test_bit(sample->cpu, ann->cpu_bitmap))
+ return 0;
+
+ if (!al.filtered && perf_evsel__add_sample(evsel, sample, &al, ann)) {
pr_warning("problem incrementing symbol count, "
"skipping event\n");
return -1;
@@ -145,489 +102,276 @@ static int process_sample_event(event_t *event, struct perf_session *session)
return 0;
}
-struct objdump_line {
- struct list_head node;
- s64 offset;
- char *line;
-};
-
-static struct objdump_line *objdump_line__new(s64 offset, char *line)
+static int hist_entry__tty_annotate(struct hist_entry *he,
+ struct perf_evsel *evsel,
+ struct perf_annotate *ann)
{
- struct objdump_line *self = malloc(sizeof(*self));
-
- if (self != NULL) {
- self->offset = offset;
- self->line = line;
- }
-
- return self;
+ return symbol__tty_annotate(he->ms.sym, he->ms.map, evsel,
+ ann->print_line, ann->full_paths, 0, 0);
}
-static void objdump_line__free(struct objdump_line *self)
+static void hists__find_annotations(struct hists *hists,
+ struct perf_evsel *evsel,
+ struct perf_annotate *ann)
{
- free(self->line);
- free(self);
-}
+ struct rb_node *nd = rb_first(&hists->entries), *next;
+ int key = K_RIGHT;
-static void objdump__add_line(struct list_head *head, struct objdump_line *line)
-{
- list_add_tail(&line->node, head);
-}
-
-static struct objdump_line *objdump__get_next_ip_line(struct list_head *head,
- struct objdump_line *pos)
-{
- list_for_each_entry_continue(pos, head, node)
- if (pos->offset >= 0)
- return pos;
-
- return NULL;
-}
-
-static int parse_line(FILE *file, struct hist_entry *he,
- struct list_head *head)
-{
- struct symbol *sym = he->sym;
- struct objdump_line *objdump_line;
- char *line = NULL, *tmp, *tmp2;
- size_t line_len;
- s64 line_ip, offset = -1;
- char *c;
-
- if (getline(&line, &line_len, file) < 0)
- return -1;
-
- if (!line)
- return -1;
-
- c = strchr(line, '\n');
- if (c)
- *c = 0;
-
- line_ip = -1;
-
- /*
- * Strip leading spaces:
- */
- tmp = line;
- while (*tmp) {
- if (*tmp != ' ')
- break;
- tmp++;
- }
-
- if (*tmp) {
- /*
- * Parse hexa addresses followed by ':'
- */
- line_ip = strtoull(tmp, &tmp2, 16);
- if (*tmp2 != ':')
- line_ip = -1;
- }
-
- if (line_ip != -1) {
- u64 start = map__rip_2objdump(he->map, sym->start);
- offset = line_ip - start;
- }
-
- objdump_line = objdump_line__new(offset, line);
- if (objdump_line == NULL) {
- free(line);
- return -1;
- }
- objdump__add_line(head, objdump_line);
-
- return 0;
-}
-
-static int objdump_line__print(struct objdump_line *self,
- struct list_head *head,
- struct hist_entry *he, u64 len)
-{
- struct symbol *sym = he->sym;
- static const char *prev_line;
- static const char *prev_color;
-
- if (self->offset != -1) {
- const char *path = NULL;
- unsigned int hits = 0;
- double percent = 0.0;
- const char *color;
- struct sym_priv *priv = symbol__priv(sym);
- struct sym_ext *sym_ext = priv->ext;
- struct sym_hist *h = priv->hist;
- s64 offset = self->offset;
- struct objdump_line *next = objdump__get_next_ip_line(head, self);
-
- while (offset < (s64)len &&
- (next == NULL || offset < next->offset)) {
- if (sym_ext) {
- if (path == NULL)
- path = sym_ext[offset].path;
- percent += sym_ext[offset].percent;
- } else
- hits += h->ip[offset];
-
- ++offset;
+ while (nd) {
+ struct hist_entry *he = rb_entry(nd, struct hist_entry, rb_node);
+ struct annotation *notes;
+
+ if (he->ms.sym == NULL || he->ms.map->dso->annotate_warned)
+ goto find_next;
+
+ notes = symbol__annotation(he->ms.sym);
+ if (notes->src == NULL) {
+find_next:
+ if (key == K_LEFT)
+ nd = rb_prev(nd);
+ else
+ nd = rb_next(nd);
+ continue;
}
- if (sym_ext == NULL && h->sum)
- percent = 100.0 * hits / h->sum;
-
- color = get_percent_color(percent);
-
- /*
- * Also color the filename and line if needed, with
- * the same color than the percentage. Don't print it
- * twice for close colored ip with the same filename:line
- */
- if (path) {
- if (!prev_line || strcmp(prev_line, path)
- || color != prev_color) {
- color_fprintf(stdout, color, " %s", path);
- prev_line = path;
- prev_color = color;
+ if (use_browser == 2) {
+ int ret;
+ int (*annotate)(struct hist_entry *he,
+ struct perf_evsel *evsel,
+ struct hist_browser_timer *hbt);
+
+ annotate = dlsym(perf_gtk_handle,
+ "hist_entry__gtk_annotate");
+ if (annotate == NULL) {
+ ui__error("GTK browser not found!\n");
+ return;
}
- }
-
- color_fprintf(stdout, color, " %7.2f", percent);
- printf(" : ");
- color_fprintf(stdout, PERF_COLOR_BLUE, "%s\n", self->line);
- } else {
- if (!*self->line)
- printf(" :\n");
- else
- printf(" : %s\n", self->line);
- }
-
- return 0;
-}
-
-static struct rb_root root_sym_ext;
-static void insert_source_line(struct sym_ext *sym_ext)
-{
- struct sym_ext *iter;
- struct rb_node **p = &root_sym_ext.rb_node;
- struct rb_node *parent = NULL;
-
- while (*p != NULL) {
- parent = *p;
- iter = rb_entry(parent, struct sym_ext, node);
-
- if (sym_ext->percent > iter->percent)
- p = &(*p)->rb_left;
- else
- p = &(*p)->rb_right;
- }
-
- rb_link_node(&sym_ext->node, parent, p);
- rb_insert_color(&sym_ext->node, &root_sym_ext);
-}
-
-static void free_source_line(struct hist_entry *he, int len)
-{
- struct sym_priv *priv = symbol__priv(he->sym);
- struct sym_ext *sym_ext = priv->ext;
- int i;
-
- if (!sym_ext)
- return;
-
- for (i = 0; i < len; i++)
- free(sym_ext[i].path);
- free(sym_ext);
-
- priv->ext = NULL;
- root_sym_ext = RB_ROOT;
-}
-
-/* Get the filename:line for the colored entries */
-static void
-get_source_line(struct hist_entry *he, int len, const char *filename)
-{
- struct symbol *sym = he->sym;
- u64 start;
- int i;
- char cmd[PATH_MAX * 2];
- struct sym_ext *sym_ext;
- struct sym_priv *priv = symbol__priv(sym);
- struct sym_hist *h = priv->hist;
-
- if (!h->sum)
- return;
-
- sym_ext = priv->ext = calloc(len, sizeof(struct sym_ext));
- if (!priv->ext)
- return;
-
- start = he->map->unmap_ip(he->map, sym->start);
-
- for (i = 0; i < len; i++) {
- char *path = NULL;
- size_t line_len;
- u64 offset;
- FILE *fp;
-
- sym_ext[i].percent = 100.0 * h->ip[i] / h->sum;
- if (sym_ext[i].percent <= 0.5)
- continue;
-
- offset = start + i;
- sprintf(cmd, "addr2line -e %s %016llx", filename, offset);
- fp = popen(cmd, "r");
- if (!fp)
- continue;
-
- if (getline(&path, &line_len, fp) < 0 || !line_len)
- goto next;
-
- sym_ext[i].path = malloc(sizeof(char) * line_len + 1);
- if (!sym_ext[i].path)
- goto next;
-
- strcpy(sym_ext[i].path, path);
- insert_source_line(&sym_ext[i]);
+ ret = annotate(he, evsel, NULL);
+ if (!ret || !ann->skip_missing)
+ return;
+
+ /* skip missing symbols */
+ nd = rb_next(nd);
+ } else if (use_browser == 1) {
+ key = hist_entry__tui_annotate(he, evsel, NULL);
+ switch (key) {
+ case -1:
+ if (!ann->skip_missing)
+ return;
+ /* fall through */
+ case K_RIGHT:
+ next = rb_next(nd);
+ break;
+ case K_LEFT:
+ next = rb_prev(nd);
+ break;
+ default:
+ return;
+ }
- next:
- pclose(fp);
+ if (next != NULL)
+ nd = next;
+ } else {
+ hist_entry__tty_annotate(he, evsel, ann);
+ nd = rb_next(nd);
+ /*
+ * Since we have a hist_entry per IP for the same
+ * symbol, free he->ms.sym->src to signal we already
+ * processed this symbol.
+ */
+ zfree(&notes->src);
+ }
}
}
-static void print_summary(const char *filename)
+static int __cmd_annotate(struct perf_annotate *ann)
{
- struct sym_ext *sym_ext;
- struct rb_node *node;
-
- printf("\nSorted summary for file %s\n", filename);
- printf("----------------------------------------------\n\n");
-
- if (RB_EMPTY_ROOT(&root_sym_ext)) {
- printf(" Nothing higher than %1.1f%%\n", MIN_GREEN);
- return;
- }
-
- node = rb_first(&root_sym_ext);
- while (node) {
- double percent;
- const char *color;
- char *path;
-
- sym_ext = rb_entry(node, struct sym_ext, node);
- percent = sym_ext->percent;
- color = get_percent_color(percent);
- path = sym_ext->path;
-
- color_fprintf(stdout, color, " %7.2f %s", percent, path);
- node = rb_next(node);
- }
-}
+ int ret;
+ struct perf_session *session;
+ struct perf_evsel *pos;
+ u64 total_nr_samples;
+ struct perf_data_file file = {
+ .path = input_name,
+ .mode = PERF_DATA_MODE_READ,
+ .force = ann->force,
+ };
+
+ session = perf_session__new(&file, false, &ann->tool);
+ if (session == NULL)
+ return -ENOMEM;
-static void hist_entry__print_hits(struct hist_entry *self)
-{
- struct symbol *sym = self->sym;
- struct sym_priv *priv = symbol__priv(sym);
- struct sym_hist *h = priv->hist;
- u64 len = sym->end - sym->start, offset;
-
- for (offset = 0; offset < len; ++offset)
- if (h->ip[offset] != 0)
- printf("%*Lx: %Lu\n", BITS_PER_LONG / 2,
- sym->start + offset, h->ip[offset]);
- printf("%*s: %Lu\n", BITS_PER_LONG / 2, "h->sum", h->sum);
-}
+ machines__set_symbol_filter(&session->machines, symbol__annotate_init);
-static void annotate_sym(struct hist_entry *he)
-{
- struct map *map = he->map;
- struct dso *dso = map->dso;
- struct symbol *sym = he->sym;
- const char *filename = dso->long_name, *d_filename;
- u64 len;
- char command[PATH_MAX*2];
- FILE *file;
- LIST_HEAD(head);
- struct objdump_line *pos, *n;
-
- if (!filename)
- return;
-
- pr_debug("%s: filename=%s, sym=%s, start=%#Lx, end=%#Lx\n", __func__,
- filename, sym->name, map->unmap_ip(map, sym->start),
- map->unmap_ip(map, sym->end));
-
- if (full_paths)
- d_filename = filename;
- else
- d_filename = basename(filename);
-
- len = sym->end - sym->start;
-
- if (print_line) {
- get_source_line(he, len, filename);
- print_summary(filename);
+ if (ann->cpu_list) {
+ ret = perf_session__cpu_bitmap(session, ann->cpu_list,
+ ann->cpu_bitmap);
+ if (ret)
+ goto out_delete;
}
- printf("\n\n------------------------------------------------\n");
- printf(" Percent | Source code & Disassembly of %s\n", d_filename);
- printf("------------------------------------------------\n");
-
- if (verbose >= 2)
- printf("annotating [%p] %30s : [%p] %30s\n",
- dso, dso->long_name, sym, sym->name);
-
- sprintf(command, "objdump --start-address=0x%016Lx --stop-address=0x%016Lx -dS %s|grep -v %s",
- map__rip_2objdump(map, sym->start),
- map__rip_2objdump(map, sym->end),
- filename, filename);
-
- if (verbose >= 3)
- printf("doing: %s\n", command);
-
- file = popen(command, "r");
- if (!file)
- return;
-
- while (!feof(file)) {
- if (parse_line(file, he, &head) < 0)
- break;
+ if (!objdump_path) {
+ ret = perf_session_env__lookup_objdump(&session->header.env);
+ if (ret)
+ goto out_delete;
}
- pclose(file);
-
- if (verbose)
- hist_entry__print_hits(he);
+ ret = perf_session__process_events(session, &ann->tool);
+ if (ret)
+ goto out_delete;
- list_for_each_entry_safe(pos, n, &head, node) {
- objdump_line__print(pos, &head, he, len);
- list_del(&pos->node);
- objdump_line__free(pos);
+ if (dump_trace) {
+ perf_session__fprintf_nr_events(session, stdout);
+ goto out_delete;
}
- if (print_line)
- free_source_line(he, len);
-}
+ if (verbose > 3)
+ perf_session__fprintf(session, stdout);
-static void perf_session__find_annotations(struct perf_session *self)
-{
- struct rb_node *nd;
+ if (verbose > 2)
+ perf_session__fprintf_dsos(session, stdout);
- for (nd = rb_first(&self->hists); nd; nd = rb_next(nd)) {
- struct hist_entry *he = rb_entry(nd, struct hist_entry, rb_node);
- struct sym_priv *priv;
+ total_nr_samples = 0;
+ evlist__for_each(session->evlist, pos) {
+ struct hists *hists = &pos->hists;
+ u32 nr_samples = hists->stats.nr_events[PERF_RECORD_SAMPLE];
- if (he->sym == NULL)
- continue;
+ if (nr_samples > 0) {
+ total_nr_samples += nr_samples;
+ hists__collapse_resort(hists, NULL);
+ hists__output_resort(hists);
- priv = symbol__priv(he->sym);
- if (priv->hist == NULL)
- continue;
+ if (symbol_conf.event_group &&
+ !perf_evsel__is_group_leader(pos))
+ continue;
- annotate_sym(he);
- /*
- * Since we have a hist_entry per IP for the same symbol, free
- * he->sym->hist to signal we already processed this symbol.
- */
- free(priv->hist);
- priv->hist = NULL;
+ hists__find_annotations(hists, pos, ann);
+ }
}
-}
-
-static struct perf_event_ops event_ops = {
- .sample = process_sample_event,
- .mmap = event__process_mmap,
- .comm = event__process_comm,
- .fork = event__process_task,
-};
-
-static int __cmd_annotate(void)
-{
- int ret;
- struct perf_session *session;
-
- session = perf_session__new(input_name, O_RDONLY, force);
- if (session == NULL)
- return -ENOMEM;
- ret = perf_session__process_events(session, &event_ops);
- if (ret)
- goto out_delete;
-
- if (dump_trace) {
- event__print_totals();
+ if (total_nr_samples == 0) {
+ ui__error("The %s file has no samples!\n", file.path);
goto out_delete;
}
- if (verbose > 3)
- perf_session__fprintf(session, stdout);
+ if (use_browser == 2) {
+ void (*show_annotations)(void);
- if (verbose > 2)
- dsos__fprintf(stdout);
+ show_annotations = dlsym(perf_gtk_handle,
+ "perf_gtk__show_annotations");
+ if (show_annotations == NULL) {
+ ui__error("GTK browser not found!\n");
+ goto out_delete;
+ }
+ show_annotations();
+ }
- perf_session__collapse_resort(&session->hists);
- perf_session__output_resort(&session->hists, session->event_total[0]);
- perf_session__find_annotations(session);
out_delete:
- perf_session__delete(session);
-
+ /*
+ * Speed up the exit process, for large files this can
+ * take quite a while.
+ *
+ * XXX Enable this when using valgrind or if we ever
+ * librarize this command.
+ *
+ * Also experiment with obstacks to see how much speed
+ * up we'll get here.
+ *
+ * perf_session__delete(session);
+ */
return ret;
}
static const char * const annotate_usage[] = {
- "perf annotate [<options>] <command>",
+ "perf annotate [<options>]",
NULL
};
-static const struct option options[] = {
+int cmd_annotate(int argc, const char **argv, const char *prefix __maybe_unused)
+{
+ struct perf_annotate annotate = {
+ .tool = {
+ .sample = process_sample_event,
+ .mmap = perf_event__process_mmap,
+ .mmap2 = perf_event__process_mmap2,
+ .comm = perf_event__process_comm,
+ .exit = perf_event__process_exit,
+ .fork = perf_event__process_fork,
+ .ordered_samples = true,
+ .ordering_requires_timestamps = true,
+ },
+ };
+ const struct option options[] = {
OPT_STRING('i', "input", &input_name, "file",
"input file name"),
- OPT_STRING('s', "symbol", &sym_hist_filter, "symbol",
+ OPT_STRING('d', "dsos", &symbol_conf.dso_list_str, "dso[,dso...]",
+ "only consider symbols in these dsos"),
+ OPT_STRING('s', "symbol", &annotate.sym_hist_filter, "symbol",
"symbol to annotate"),
- OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
- OPT_BOOLEAN('v', "verbose", &verbose,
+ OPT_BOOLEAN('f', "force", &annotate.force, "don't complain, do it"),
+ OPT_INCR('v', "verbose", &verbose,
"be more verbose (show symbol address, etc)"),
OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
"dump raw trace in ASCII"),
+ OPT_BOOLEAN(0, "gtk", &annotate.use_gtk, "Use the GTK interface"),
+ OPT_BOOLEAN(0, "tui", &annotate.use_tui, "Use the TUI interface"),
+ OPT_BOOLEAN(0, "stdio", &annotate.use_stdio, "Use the stdio interface"),
OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
"file", "vmlinux pathname"),
OPT_BOOLEAN('m', "modules", &symbol_conf.use_modules,
"load module symbols - WARNING: use only with -k and LIVE kernel"),
- OPT_BOOLEAN('l', "print-line", &print_line,
+ OPT_BOOLEAN('l', "print-line", &annotate.print_line,
"print matching source lines (may be slow)"),
- OPT_BOOLEAN('P', "full-paths", &full_paths,
+ OPT_BOOLEAN('P', "full-paths", &annotate.full_paths,
"Don't shorten the displayed pathnames"),
+ OPT_BOOLEAN(0, "skip-missing", &annotate.skip_missing,
+ "Skip symbols that cannot be annotated"),
+ OPT_STRING('C', "cpu", &annotate.cpu_list, "cpu", "list of cpus to profile"),
+ OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
+ "Look for files with symbols relative to this directory"),
+ OPT_BOOLEAN(0, "source", &symbol_conf.annotate_src,
+ "Interleave source code with assembly code (default)"),
+ OPT_BOOLEAN(0, "asm-raw", &symbol_conf.annotate_asm_raw,
+ "Display raw encoding of assembly instructions (default)"),
+ OPT_STRING('M', "disassembler-style", &disassembler_style, "disassembler style",
+ "Specify disassembler style (e.g. -M intel for intel syntax)"),
+ OPT_STRING(0, "objdump", &objdump_path, "path",
+ "objdump binary to use for disassembly and annotations"),
+ OPT_BOOLEAN(0, "group", &symbol_conf.event_group,
+ "Show event group information together"),
OPT_END()
-};
+ };
-int cmd_annotate(int argc, const char **argv, const char *prefix __used)
-{
argc = parse_options(argc, argv, options, annotate_usage, 0);
- symbol_conf.priv_size = sizeof(struct sym_priv);
+ if (annotate.use_stdio)
+ use_browser = 0;
+ else if (annotate.use_tui)
+ use_browser = 1;
+ else if (annotate.use_gtk)
+ use_browser = 2;
+
+ setup_browser(true);
+
+ symbol_conf.priv_size = sizeof(struct annotation);
symbol_conf.try_vmlinux_path = true;
if (symbol__init() < 0)
return -1;
- setup_sorting(annotate_usage, options);
+ if (setup_sorting() < 0)
+ usage_with_options(annotate_usage, options);
if (argc) {
/*
- * Special case: if there's an argument left then assume tha
+ * Special case: if there's an argument left then assume that
* it's a symbol filter:
*/
if (argc > 1)
usage_with_options(annotate_usage, options);
- sym_hist_filter = argv[0];
- }
-
- setup_pager();
-
- if (field_sep && *field_sep == '.') {
- pr_err("'.' is the only non valid --field-separator argument\n");
- return -1;
+ annotate.sym_hist_filter = argv[0];
}
- return __cmd_annotate();
+ return __cmd_annotate(&annotate);
}
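
The annotate rework resolves the GTK entry points at run time with dlsym() against
perf_gtk_handle (opened elsewhere in perf), so the GTK browser remains optional. A hedged
standalone sketch of that lookup pattern; the library name below is illustrative, not
necessarily perf's actual one (link with -ldl):

	#include <dlfcn.h>
	#include <stdio.h>

	int main(void)
	{
		void *handle = dlopen("libperf-gtk.so", RTLD_LAZY);
		void (*show)(void);

		if (!handle) {
			fprintf(stderr, "GTK backend not available\n");
			return 1;
		}

		/* Resolve the optional entry point; fall back gracefully if absent: */
		show = (void (*)(void))dlsym(handle, "perf_gtk__show_annotations");
		if (!show) {
			fprintf(stderr, "symbol not found: %s\n", dlerror());
			dlclose(handle);
			return 1;
		}

		show();
		dlclose(handle);
		return 0;
	}
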
diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c
index 46996774e55..1e6e7771054 100644
--- a/tools/perf/builtin-bench.c
+++ b/tools/perf/builtin-bench.c
@@ -1,21 +1,19 @@
/*
- *
* builtin-bench.c
*
- * General benchmarking subsystem provided by perf
+ * General benchmarking collections provided by perf
*
* Copyright (C) 2009, Hitoshi Mitake <mitake@dcl.info.waseda.ac.jp>
- *
*/
/*
+ * Available benchmark collection list:
*
- * Available subsystem list:
- * sched ... scheduler and IPC mechanism
+ * sched ... scheduler and IPC performance
* mem ... memory access performance
- *
+ * numa ... NUMA scheduling and MM performance
+ * futex ... Futex performance
*/
-
#include "perf.h"
#include "util/util.h"
#include "util/parse-options.h"
@@ -25,92 +23,101 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
+#include <sys/prctl.h>
+
+typedef int (*bench_fn_t)(int argc, const char **argv, const char *prefix);
+
+struct bench {
+ const char *name;
+ const char *summary;
+ bench_fn_t fn;
+};
+
+#ifdef HAVE_LIBNUMA_SUPPORT
+static struct bench numa_benchmarks[] = {
+ { "mem", "Benchmark for NUMA workloads", bench_numa },
+ { "all", "Test all NUMA benchmarks", NULL },
+ { NULL, NULL, NULL }
+};
+#endif
-struct bench_suite {
- const char *name;
- const char *summary;
- int (*fn)(int, const char **, const char *);
+static struct bench sched_benchmarks[] = {
+ { "messaging", "Benchmark for scheduling and IPC", bench_sched_messaging },
+ { "pipe", "Benchmark for pipe() between two processes", bench_sched_pipe },
+ { "all", "Test all scheduler benchmarks", NULL },
+ { NULL, NULL, NULL }
};
- \
-/* sentinel: easy for help */
-#define suite_all { "all", "test all suite (pseudo suite)", NULL }
-
-static struct bench_suite sched_suites[] = {
- { "messaging",
- "Benchmark for scheduler and IPC mechanisms",
- bench_sched_messaging },
- { "pipe",
- "Flood of communication over pipe() between two processes",
- bench_sched_pipe },
- suite_all,
- { NULL,
- NULL,
- NULL }
+
+static struct bench mem_benchmarks[] = {
+ { "memcpy", "Benchmark for memcpy()", bench_mem_memcpy },
+ { "memset", "Benchmark for memset() tests", bench_mem_memset },
+ { "all", "Test all memory benchmarks", NULL },
+ { NULL, NULL, NULL }
};
-static struct bench_suite mem_suites[] = {
- { "memcpy",
- "Simple memory copy in various ways",
- bench_mem_memcpy },
- suite_all,
- { NULL,
- NULL,
- NULL }
+static struct bench futex_benchmarks[] = {
+ { "hash", "Benchmark for futex hash table", bench_futex_hash },
+ { "wake", "Benchmark for futex wake calls", bench_futex_wake },
+ { "requeue", "Benchmark for futex requeue calls", bench_futex_requeue },
+ { "all", "Test all futex benchmarks", NULL },
+ { NULL, NULL, NULL }
};
-struct bench_subsys {
- const char *name;
- const char *summary;
- struct bench_suite *suites;
+struct collection {
+ const char *name;
+ const char *summary;
+ struct bench *benchmarks;
};
-static struct bench_subsys subsystems[] = {
- { "sched",
- "scheduler and IPC mechanism",
- sched_suites },
- { "mem",
- "memory access performance",
- mem_suites },
- { "all", /* sentinel: easy for help */
- "test all subsystem (pseudo subsystem)",
- NULL },
- { NULL,
- NULL,
- NULL }
+static struct collection collections[] = {
+ { "sched", "Scheduler and IPC benchmarks", sched_benchmarks },
+ { "mem", "Memory access benchmarks", mem_benchmarks },
+#ifdef HAVE_LIBNUMA_SUPPORT
+ { "numa", "NUMA scheduling and MM benchmarks", numa_benchmarks },
+#endif
+	{ "futex",	"Futex stressing benchmarks",		futex_benchmarks	},
+ { "all", "All benchmarks", NULL },
+ { NULL, NULL, NULL }
};
-static void dump_suites(int subsys_index)
+/* Iterate over all benchmark collections: */
+#define for_each_collection(coll) \
+ for (coll = collections; coll->name; coll++)
+
+/* Iterate over all benchmarks within a collection: */
+#define for_each_bench(coll, bench) \
+ for (bench = coll->benchmarks; bench && bench->name; bench++)
+
+static void dump_benchmarks(struct collection *coll)
{
- int i;
+ struct bench *bench;
- printf("# List of available suites for %s...\n\n",
- subsystems[subsys_index].name);
+ printf("\n # List of available benchmarks for collection '%s':\n\n", coll->name);
- for (i = 0; subsystems[subsys_index].suites[i].name; i++)
- printf("%14s: %s\n",
- subsystems[subsys_index].suites[i].name,
- subsystems[subsys_index].suites[i].summary);
+ for_each_bench(coll, bench)
+ printf("%14s: %s\n", bench->name, bench->summary);
printf("\n");
- return;
}
-static char *bench_format_str;
+static const char *bench_format_str;
+
+/* Output/formatting style, exported to benchmark modules: */
int bench_format = BENCH_FORMAT_DEFAULT;
static const struct option bench_options[] = {
- OPT_STRING('f', "format", &bench_format_str, "default",
- "Specify format style"),
+ OPT_STRING('f', "format", &bench_format_str, "default", "Specify format style"),
OPT_END()
};
static const char * const bench_usage[] = {
- "perf bench [<common options>] <subsystem> <suite> [<options>]",
+ "perf bench [<common options>] <collection> <benchmark> [<options>]",
NULL
};
static void print_usage(void)
{
+ struct collection *coll;
int i;
printf("Usage: \n");
@@ -118,15 +125,14 @@ static void print_usage(void)
printf("\t%s\n", bench_usage[i]);
printf("\n");
- printf("# List of available subsystems...\n\n");
+ printf(" # List of all available benchmark collections:\n\n");
- for (i = 0; subsystems[i].name; i++)
- printf("%14s: %s\n",
- subsystems[i].name, subsystems[i].summary);
+ for_each_collection(coll)
+ printf("%14s: %s\n", coll->name, coll->summary);
printf("\n");
}
-static int bench_str2int(char *str)
+static int bench_str2int(const char *str)
{
if (!str)
return BENCH_FORMAT_DEFAULT;
@@ -139,43 +145,74 @@ static int bench_str2int(char *str)
return BENCH_FORMAT_UNKNOWN;
}
-static void all_suite(struct bench_subsys *subsys) /* FROM HERE */
+/*
+ * Run a specific benchmark but first rename the running task's ->comm[]
+ * to something meaningful:
+ */
+static int run_bench(const char *coll_name, const char *bench_name, bench_fn_t fn,
+ int argc, const char **argv, const char *prefix)
{
- int i;
+ int size;
+ char *name;
+ int ret;
+
+ size = strlen(coll_name) + 1 + strlen(bench_name) + 1;
+
+ name = zalloc(size);
+ BUG_ON(!name);
+
+ scnprintf(name, size, "%s-%s", coll_name, bench_name);
+
+ prctl(PR_SET_NAME, name);
+ argv[0] = name;
+
+ ret = fn(argc, argv, prefix);
+
+ free(name);
+
+ return ret;
+}
+
+static void run_collection(struct collection *coll)
+{
+ struct bench *bench;
const char *argv[2];
- struct bench_suite *suites = subsys->suites;
argv[1] = NULL;
/*
* TODO:
- * preparing preset parameters for
+ *
+ * Preparing preset parameters for
* embedded, ordinary PC, HPC, etc...
- * will be helpful
+ * would be helpful.
*/
- for (i = 0; suites[i].fn; i++) {
- printf("# Running %s/%s benchmark...\n",
- subsys->name,
- suites[i].name);
-
- argv[1] = suites[i].name;
- suites[i].fn(1, argv, NULL);
+ for_each_bench(coll, bench) {
+ if (!bench->fn)
+ break;
+ printf("# Running %s/%s benchmark...\n", coll->name, bench->name);
+ fflush(stdout);
+
+ argv[1] = bench->name;
+ run_bench(coll->name, bench->name, bench->fn, 1, argv, NULL);
printf("\n");
}
}
-static void all_subsystem(void)
+static void run_all_collections(void)
{
- int i;
- for (i = 0; subsystems[i].suites; i++)
- all_suite(&subsystems[i]);
+ struct collection *coll;
+
+ for_each_collection(coll)
+ run_collection(coll);
}
-int cmd_bench(int argc, const char **argv, const char *prefix __used)
+int cmd_bench(int argc, const char **argv, const char *prefix __maybe_unused)
{
- int i, j, status = 0;
+ struct collection *coll;
+ int ret = 0;
if (argc < 2) {
- /* No subsystem specified. */
+ /* No collection specified. */
print_usage();
goto end;
}
@@ -185,7 +222,7 @@ int cmd_bench(int argc, const char **argv, const char *prefix __used)
bench_format = bench_str2int(bench_format_str);
if (bench_format == BENCH_FORMAT_UNKNOWN) {
- printf("Unknown format descriptor:%s\n", bench_format_str);
+ printf("Unknown format descriptor: '%s'\n", bench_format_str);
goto end;
}
@@ -195,51 +232,51 @@ int cmd_bench(int argc, const char **argv, const char *prefix __used)
}
if (!strcmp(argv[0], "all")) {
- all_subsystem();
+ run_all_collections();
goto end;
}
- for (i = 0; subsystems[i].name; i++) {
- if (strcmp(subsystems[i].name, argv[0]))
+ for_each_collection(coll) {
+ struct bench *bench;
+
+ if (strcmp(coll->name, argv[0]))
continue;
if (argc < 2) {
- /* No suite specified. */
- dump_suites(i);
+ /* No bench specified. */
+ dump_benchmarks(coll);
goto end;
}
if (!strcmp(argv[1], "all")) {
- all_suite(&subsystems[i]);
+ run_collection(coll);
goto end;
}
- for (j = 0; subsystems[i].suites[j].name; j++) {
- if (strcmp(subsystems[i].suites[j].name, argv[1]))
+ for_each_bench(coll, bench) {
+ if (strcmp(bench->name, argv[1]))
continue;
if (bench_format == BENCH_FORMAT_DEFAULT)
- printf("# Running %s/%s benchmark...\n",
- subsystems[i].name,
- subsystems[i].suites[j].name);
- status = subsystems[i].suites[j].fn(argc - 1,
- argv + 1, prefix);
+ printf("# Running '%s/%s' benchmark:\n", coll->name, bench->name);
+ fflush(stdout);
+ ret = run_bench(coll->name, bench->name, bench->fn, argc-1, argv+1, prefix);
goto end;
}
if (!strcmp(argv[1], "-h") || !strcmp(argv[1], "--help")) {
- dump_suites(i);
+ dump_benchmarks(coll);
goto end;
}
- printf("Unknown suite:%s for %s\n", argv[1], argv[0]);
- status = 1;
+ printf("Unknown benchmark: '%s' for collection '%s'\n", argv[1], argv[0]);
+ ret = 1;
goto end;
}
- printf("Unknown subsystem:%s\n", argv[0]);
- status = 1;
+ printf("Unknown collection: '%s'\n", argv[0]);
+ ret = 1;
end:
- return status;
+ return ret;
}
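
run_bench() above renames the running task with prctl(PR_SET_NAME, ...) so each benchmark
shows up with a meaningful comm[] in tools like top or perf itself. A minimal sketch of that
call (the kernel truncates the name to 15 characters plus a NUL):

	#include <sys/prctl.h>
	#include <stdio.h>

	int main(void)
	{
		char name[16];

		prctl(PR_SET_NAME, "sched-messaging");	/* set the task's comm[] */
		prctl(PR_GET_NAME, name);		/* read it back */
		printf("comm is now: %s\n", name);	/* also visible in /proc/self/comm */
		return 0;
	}
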
diff --git a/tools/perf/builtin-buildid-cache.c b/tools/perf/builtin-buildid-cache.c
index 30a05f552c9..b22dbb16f87 100644
--- a/tools/perf/builtin-buildid-cache.c
+++ b/tools/perf/builtin-buildid-cache.c
@@ -6,6 +6,11 @@
* Copyright (C) 2010, Red Hat Inc.
* Copyright (C) 2010, Arnaldo Carvalho de Melo <acme@redhat.com>
*/
+#include <sys/types.h>
+#include <sys/time.h>
+#include <time.h>
+#include <dirent.h>
+#include <unistd.h>
#include "builtin.h"
#include "perf.h"
#include "util/cache.h"
@@ -13,23 +18,168 @@
#include "util/header.h"
#include "util/parse-options.h"
#include "util/strlist.h"
+#include "util/build-id.h"
+#include "util/session.h"
#include "util/symbol.h"
-static char const *add_name_list_str, *remove_name_list_str;
+static int build_id_cache__kcore_buildid(const char *proc_dir, char *sbuildid)
+{
+ char root_dir[PATH_MAX];
+ char notes[PATH_MAX];
+ u8 build_id[BUILD_ID_SIZE];
+ char *p;
-static const char * const buildid_cache_usage[] = {
- "perf buildid-cache [<options>]",
- NULL
-};
+ strlcpy(root_dir, proc_dir, sizeof(root_dir));
-static const struct option buildid_cache_options[] = {
- OPT_STRING('a', "add", &add_name_list_str,
- "file list", "file(s) to add"),
- OPT_STRING('r', "remove", &remove_name_list_str, "file list",
- "file(s) to remove"),
- OPT_BOOLEAN('v', "verbose", &verbose, "be more verbose"),
- OPT_END()
-};
+ p = strrchr(root_dir, '/');
+ if (!p)
+ return -1;
+ *p = '\0';
+
+ scnprintf(notes, sizeof(notes), "%s/sys/kernel/notes", root_dir);
+
+ if (sysfs__read_build_id(notes, build_id, sizeof(build_id)))
+ return -1;
+
+ build_id__sprintf(build_id, sizeof(build_id), sbuildid);
+
+ return 0;
+}
+
+static int build_id_cache__kcore_dir(char *dir, size_t sz)
+{
+ struct timeval tv;
+ struct tm tm;
+ char dt[32];
+
+ if (gettimeofday(&tv, NULL) || !localtime_r(&tv.tv_sec, &tm))
+ return -1;
+
+ if (!strftime(dt, sizeof(dt), "%Y%m%d%H%M%S", &tm))
+ return -1;
+
+ scnprintf(dir, sz, "%s%02u", dt, (unsigned)tv.tv_usec / 10000);
+
+ return 0;
+}
+
+static bool same_kallsyms_reloc(const char *from_dir, char *to_dir)
+{
+ char from[PATH_MAX];
+ char to[PATH_MAX];
+ const char *name;
+ u64 addr1 = 0, addr2 = 0;
+ int i;
+
+ scnprintf(from, sizeof(from), "%s/kallsyms", from_dir);
+ scnprintf(to, sizeof(to), "%s/kallsyms", to_dir);
+
+ for (i = 0; (name = ref_reloc_sym_names[i]) != NULL; i++) {
+ addr1 = kallsyms__get_function_start(from, name);
+ if (addr1)
+ break;
+ }
+
+ if (name)
+ addr2 = kallsyms__get_function_start(to, name);
+
+ return addr1 == addr2;
+}
+
+static int build_id_cache__kcore_existing(const char *from_dir, char *to_dir,
+ size_t to_dir_sz)
+{
+ char from[PATH_MAX];
+ char to[PATH_MAX];
+ char to_subdir[PATH_MAX];
+ struct dirent *dent;
+ int ret = -1;
+ DIR *d;
+
+ d = opendir(to_dir);
+ if (!d)
+ return -1;
+
+ scnprintf(from, sizeof(from), "%s/modules", from_dir);
+
+ while (1) {
+ dent = readdir(d);
+ if (!dent)
+ break;
+ if (dent->d_type != DT_DIR)
+ continue;
+ scnprintf(to, sizeof(to), "%s/%s/modules", to_dir,
+ dent->d_name);
+ scnprintf(to_subdir, sizeof(to_subdir), "%s/%s",
+ to_dir, dent->d_name);
+ if (!compare_proc_modules(from, to) &&
+ same_kallsyms_reloc(from_dir, to_subdir)) {
+ strlcpy(to_dir, to_subdir, to_dir_sz);
+ ret = 0;
+ break;
+ }
+ }
+
+ closedir(d);
+
+ return ret;
+}
+
+static int build_id_cache__add_kcore(const char *filename, const char *debugdir)
+{
+ char dir[32], sbuildid[BUILD_ID_SIZE * 2 + 1];
+ char from_dir[PATH_MAX], to_dir[PATH_MAX];
+ char *p;
+
+ strlcpy(from_dir, filename, sizeof(from_dir));
+
+ p = strrchr(from_dir, '/');
+ if (!p || strcmp(p + 1, "kcore"))
+ return -1;
+ *p = '\0';
+
+ if (build_id_cache__kcore_buildid(from_dir, sbuildid))
+ return -1;
+
+ scnprintf(to_dir, sizeof(to_dir), "%s/[kernel.kcore]/%s",
+ debugdir, sbuildid);
+
+ if (!build_id_cache__kcore_existing(from_dir, to_dir, sizeof(to_dir))) {
+ pr_debug("same kcore found in %s\n", to_dir);
+ return 0;
+ }
+
+ if (build_id_cache__kcore_dir(dir, sizeof(dir)))
+ return -1;
+
+ scnprintf(to_dir, sizeof(to_dir), "%s/[kernel.kcore]/%s/%s",
+ debugdir, sbuildid, dir);
+
+ if (mkdir_p(to_dir, 0755))
+ return -1;
+
+ if (kcore_copy(from_dir, to_dir)) {
+ /* Remove YYYYmmddHHMMSShh directory */
+ if (!rmdir(to_dir)) {
+ p = strrchr(to_dir, '/');
+ if (p)
+ *p = '\0';
+ /* Try to remove buildid directory */
+ if (!rmdir(to_dir)) {
+ p = strrchr(to_dir, '/');
+ if (p)
+ *p = '\0';
+ /* Try to remove [kernel.kcore] directory */
+ rmdir(to_dir);
+ }
+ }
+ return -1;
+ }
+
+ pr_debug("kcore added to build-id cache directory %s\n", to_dir);
+
+ return 0;
+}
static int build_id_cache__add_file(const char *filename, const char *debugdir)
{
@@ -43,15 +193,16 @@ static int build_id_cache__add_file(const char *filename, const char *debugdir)
}
build_id__sprintf(build_id, sizeof(build_id), sbuild_id);
- err = build_id_cache__add_s(sbuild_id, debugdir, filename, false);
+ err = build_id_cache__add_s(sbuild_id, debugdir, filename,
+ false, false);
if (verbose)
pr_info("Adding %s %s: %s\n", sbuild_id, filename,
err ? "FAIL" : "Ok");
return err;
}
-static int build_id_cache__remove_file(const char *filename __used,
- const char *debugdir __used)
+static int build_id_cache__remove_file(const char *filename,
+ const char *debugdir)
{
u8 build_id[BUILD_ID_SIZE];
char sbuild_id[BUILD_ID_SIZE * 2 + 1];
@@ -72,14 +223,113 @@ static int build_id_cache__remove_file(const char *filename __used,
return err;
}
-static int __cmd_buildid_cache(void)
+static bool dso__missing_buildid_cache(struct dso *dso, int parm __maybe_unused)
+{
+ char filename[PATH_MAX];
+ u8 build_id[BUILD_ID_SIZE];
+
+ if (dso__build_id_filename(dso, filename, sizeof(filename)) &&
+ filename__read_build_id(filename, build_id,
+ sizeof(build_id)) != sizeof(build_id)) {
+ if (errno == ENOENT)
+ return false;
+
+ pr_warning("Problems with %s file, consider removing it from the cache\n",
+ filename);
+ } else if (memcmp(dso->build_id, build_id, sizeof(dso->build_id))) {
+ pr_warning("Problems with %s file, consider removing it from the cache\n",
+ filename);
+ }
+
+ return true;
+}
+
+static int build_id_cache__fprintf_missing(const char *filename, bool force, FILE *fp)
+{
+ struct perf_data_file file = {
+ .path = filename,
+ .mode = PERF_DATA_MODE_READ,
+ .force = force,
+ };
+ struct perf_session *session = perf_session__new(&file, false, NULL);
+ if (session == NULL)
+ return -1;
+
+ perf_session__fprintf_dsos_buildid(session, fp, dso__missing_buildid_cache, 0);
+ perf_session__delete(session);
+
+ return 0;
+}
+
+static int build_id_cache__update_file(const char *filename,
+ const char *debugdir)
+{
+ u8 build_id[BUILD_ID_SIZE];
+ char sbuild_id[BUILD_ID_SIZE * 2 + 1];
+
+ int err;
+
+ if (filename__read_build_id(filename, &build_id, sizeof(build_id)) < 0) {
+ pr_debug("Couldn't read a build-id in %s\n", filename);
+ return -1;
+ }
+
+ build_id__sprintf(build_id, sizeof(build_id), sbuild_id);
+ err = build_id_cache__remove_s(sbuild_id, debugdir);
+ if (!err) {
+ err = build_id_cache__add_s(sbuild_id, debugdir, filename,
+ false, false);
+ }
+ if (verbose)
+ pr_info("Updating %s %s: %s\n", sbuild_id, filename,
+ err ? "FAIL" : "Ok");
+
+ return err;
+}
+
+int cmd_buildid_cache(int argc, const char **argv,
+ const char *prefix __maybe_unused)
{
struct strlist *list;
struct str_node *pos;
+ int ret = 0;
+ bool force = false;
char debugdir[PATH_MAX];
+ char const *add_name_list_str = NULL,
+ *remove_name_list_str = NULL,
+ *missing_filename = NULL,
+ *update_name_list_str = NULL,
+ *kcore_filename;
+
+ const struct option buildid_cache_options[] = {
+ OPT_STRING('a', "add", &add_name_list_str,
+ "file list", "file(s) to add"),
+ OPT_STRING('k', "kcore", &kcore_filename,
+ "file", "kcore file to add"),
+ OPT_STRING('r', "remove", &remove_name_list_str, "file list",
+ "file(s) to remove"),
+ OPT_STRING('M', "missing", &missing_filename, "file",
+ "to find missing build ids in the cache"),
+ OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
+ OPT_STRING('u', "update", &update_name_list_str, "file list",
+ "file(s) to update"),
+ OPT_INCR('v', "verbose", &verbose, "be more verbose"),
+ OPT_END()
+ };
+ const char * const buildid_cache_usage[] = {
+ "perf buildid-cache [<options>]",
+ NULL
+ };
- snprintf(debugdir, sizeof(debugdir), "%s/%s", getenv("HOME"),
- DEBUG_CACHE_DIR);
+ argc = parse_options(argc, argv, buildid_cache_options,
+ buildid_cache_usage, 0);
+
+ if (symbol__init() < 0)
+ return -1;
+
+ setup_pager();
+
+ snprintf(debugdir, sizeof(debugdir), "%s", buildid_dir);
if (add_name_list_str) {
list = strlist__new(true, add_name_list_str);
@@ -117,17 +367,30 @@ static int __cmd_buildid_cache(void)
}
}
- return 0;
-}
+ if (missing_filename)
+ ret = build_id_cache__fprintf_missing(missing_filename, force, stdout);
-int cmd_buildid_cache(int argc, const char **argv, const char *prefix __used)
-{
- argc = parse_options(argc, argv, buildid_cache_options,
- buildid_cache_usage, 0);
+ if (update_name_list_str) {
+ list = strlist__new(true, update_name_list_str);
+ if (list) {
+ strlist__for_each(pos, list)
+ if (build_id_cache__update_file(pos->s, debugdir)) {
+ if (errno == ENOENT) {
+ pr_debug("%s wasn't in the cache\n",
+ pos->s);
+ continue;
+ }
+ pr_warning("Couldn't update %s: %s\n",
+ pos->s, strerror(errno));
+ }
- if (symbol__init() < 0)
- return -1;
+ strlist__delete(list);
+ }
+ }
- setup_pager();
- return __cmd_buildid_cache();
+ if (kcore_filename &&
+ build_id_cache__add_kcore(kcore_filename, debugdir))
+ pr_warning("Couldn't add %s\n", kcore_filename);
+
+ return ret;
}
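
As an aside, a minimal standalone sketch of the create-then-unwind pattern used by build_id_cache__add_kcore() above could look like the following; the helper names, the simplified mkdir_p() loop and the plain YYYYmmddHHMMSS timestamp are assumptions of the sketch, not code from this commit.

#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

/* Create every component of path, ignoring components that already exist. */
static int mkdir_p_sketch(char *path, mode_t mode)
{
	char *p;

	for (p = path + 1; *p; p++) {
		if (*p != '/')
			continue;
		*p = '\0';
		if (mkdir(path, mode) && errno != EEXIST) {
			*p = '/';
			return -1;
		}
		*p = '/';
	}
	return (mkdir(path, mode) && errno != EEXIST) ? -1 : 0;
}

/* Build "<debugdir>/[kernel.kcore]/<sbuildid>/<timestamp>" and create it. */
static int make_kcore_cache_dir(char *to_dir, size_t sz,
				const char *debugdir, const char *sbuildid)
{
	char stamp[32];
	time_t now = time(NULL);

	strftime(stamp, sizeof(stamp), "%Y%m%d%H%M%S", localtime(&now));
	snprintf(to_dir, sz, "%s/[kernel.kcore]/%s/%s",
		 debugdir, sbuildid, stamp);
	return mkdir_p_sketch(to_dir, 0755);
}

/* On failure, drop timestamp, build-id and [kernel.kcore] dirs, innermost first. */
static void unwind_kcore_cache_dir(char *to_dir)
{
	int level;

	for (level = 0; level < 3; level++) {
		char *p;

		if (rmdir(to_dir))	/* stop as soon as a level is not empty */
			break;
		p = strrchr(to_dir, '/');
		if (!p)
			break;
		*p = '\0';
	}
}

int main(void)
{
	char to_dir[4096];

	if (make_kcore_cache_dir(to_dir, sizeof(to_dir), "/tmp/.debug",
				 "0123456789abcdef0123456789abcdef01234567"))
		return 1;
	printf("created %s\n", to_dir);
	unwind_kcore_cache_dir(to_dir);	/* pretend kcore_copy() failed */
	return 0;
}
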
diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c
index d0675c02f81..ed3873b3e23 100644
--- a/tools/perf/builtin-buildid-list.c
+++ b/tools/perf/builtin-buildid-list.c
@@ -1,7 +1,8 @@
/*
* builtin-buildid-list.c
*
- * Builtin buildid-list command: list buildids in perf.data
+ * Builtin buildid-list command: list buildids in perf.data, in the running
+ * kernel and in ELF files.
*
* Copyright (C) 2009, Red Hat Inc.
* Copyright (C) 2009, Arnaldo Carvalho de Melo <acme@redhat.com>
@@ -14,47 +15,97 @@
#include "util/parse-options.h"
#include "util/session.h"
#include "util/symbol.h"
+#include "util/data.h"
-static char const *input_name = "perf.data";
-static int force;
-static bool with_hits;
+static int sysfs__fprintf_build_id(FILE *fp)
+{
+ u8 kallsyms_build_id[BUILD_ID_SIZE];
+ char sbuild_id[BUILD_ID_SIZE * 2 + 1];
-static const char * const buildid_list_usage[] = {
- "perf buildid-list [<options>]",
- NULL
-};
+ if (sysfs__read_build_id("/sys/kernel/notes", kallsyms_build_id,
+ sizeof(kallsyms_build_id)) != 0)
+ return -1;
-static const struct option options[] = {
- OPT_BOOLEAN('H', "with-hits", &with_hits, "Show only DSOs with hits"),
- OPT_STRING('i', "input", &input_name, "file",
- "input file name"),
- OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
- OPT_BOOLEAN('v', "verbose", &verbose,
- "be more verbose"),
- OPT_END()
-};
+ build_id__sprintf(kallsyms_build_id, sizeof(kallsyms_build_id),
+ sbuild_id);
+ fprintf(fp, "%s\n", sbuild_id);
+ return 0;
+}
-static int __cmd_buildid_list(void)
+static int filename__fprintf_build_id(const char *name, FILE *fp)
+{
+ u8 build_id[BUILD_ID_SIZE];
+ char sbuild_id[BUILD_ID_SIZE * 2 + 1];
+
+ if (filename__read_build_id(name, build_id,
+ sizeof(build_id)) != sizeof(build_id))
+ return 0;
+
+ build_id__sprintf(build_id, sizeof(build_id), sbuild_id);
+ return fprintf(fp, "%s\n", sbuild_id);
+}
+
+static bool dso__skip_buildid(struct dso *dso, int with_hits)
+{
+ return with_hits && !dso->hit;
+}
+
+static int perf_session__list_build_ids(bool force, bool with_hits)
{
- int err = -1;
struct perf_session *session;
+ struct perf_data_file file = {
+ .path = input_name,
+ .mode = PERF_DATA_MODE_READ,
+ .force = force,
+ };
+
+ symbol__elf_init();
+ /*
+ * See if this is an ELF file first:
+ */
+ if (filename__fprintf_build_id(input_name, stdout))
+ goto out;
- session = perf_session__new(input_name, O_RDONLY, force);
+ session = perf_session__new(&file, false, &build_id__mark_dso_hit_ops);
if (session == NULL)
return -1;
-
- if (with_hits)
+ /*
+ * in pipe-mode, the only way to get the buildids is to parse
+ * the record stream. Buildids are stored as RECORD_HEADER_BUILD_ID
+ */
+ if (with_hits || perf_data_file__is_pipe(&file))
perf_session__process_events(session, &build_id__mark_dso_hit_ops);
- dsos__fprintf_buildid(stdout, with_hits);
-
+ perf_session__fprintf_dsos_buildid(session, stdout, dso__skip_buildid, with_hits);
perf_session__delete(session);
- return err;
+out:
+ return 0;
}
-int cmd_buildid_list(int argc, const char **argv, const char *prefix __used)
+int cmd_buildid_list(int argc, const char **argv,
+ const char *prefix __maybe_unused)
{
+ bool show_kernel = false;
+ bool with_hits = false;
+ bool force = false;
+ const struct option options[] = {
+ OPT_BOOLEAN('H', "with-hits", &with_hits, "Show only DSOs with hits"),
+ OPT_STRING('i', "input", &input_name, "file", "input file name"),
+ OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
+ OPT_BOOLEAN('k', "kernel", &show_kernel, "Show current kernel build id"),
+ OPT_INCR('v', "verbose", &verbose, "be more verbose"),
+ OPT_END()
+ };
+ const char * const buildid_list_usage[] = {
+ "perf buildid-list [<options>]",
+ NULL
+ };
+
argc = parse_options(argc, argv, options, buildid_list_usage, 0);
setup_pager();
- return __cmd_buildid_list();
+
+ if (show_kernel)
+ return sysfs__fprintf_build_id(stdout);
+
+ return perf_session__list_build_ids(force, with_hits);
}
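
Both commands above funnel raw build-ids through build_id__sprintf() before printing. As a rough standalone sketch of what that formatting amounts to (the helper name and the local BUILD_ID_SIZE define are just for this example), each raw byte becomes two lowercase hex digits:

#include <stdio.h>

#define BUILD_ID_SIZE 20

/* Format len raw build-id bytes as a NUL-terminated hex string in bf. */
static int sbuild_id__snprintf(const unsigned char *build_id, size_t len,
			       char *bf, size_t size)
{
	size_t i, off = 0;

	for (i = 0; i < len && off + 2 < size; i++)
		off += snprintf(bf + off, size - off, "%02x", build_id[i]);
	return (int)off;
}

int main(void)
{
	unsigned char id[BUILD_ID_SIZE] = { 0xde, 0xad, 0xbe, 0xef };
	char sbuild_id[BUILD_ID_SIZE * 2 + 1];

	sbuild_id__snprintf(id, sizeof(id), sbuild_id, sizeof(sbuild_id));
	printf("%s\n", sbuild_id);	/* "deadbeef" followed by 32 zeros */
	return 0;
}
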
diff --git a/tools/perf/builtin-diff.c b/tools/perf/builtin-diff.c
index 1ea15d8aeed..9a5a035cb42 100644
--- a/tools/perf/builtin-diff.c
+++ b/tools/perf/builtin-diff.c
@@ -9,74 +9,543 @@
#include "util/debug.h"
#include "util/event.h"
#include "util/hist.h"
+#include "util/evsel.h"
+#include "util/evlist.h"
#include "util/session.h"
+#include "util/tool.h"
#include "util/sort.h"
#include "util/symbol.h"
#include "util/util.h"
+#include "util/data.h"
#include <stdlib.h>
+#include <math.h>
-static char const *input_old = "perf.data.old",
- *input_new = "perf.data";
-static char diff__default_sort_order[] = "dso,symbol";
-static int force;
-static bool show_displacement;
+/* Diff command specific HPP columns. */
+enum {
+ PERF_HPP_DIFF__BASELINE,
+ PERF_HPP_DIFF__PERIOD,
+ PERF_HPP_DIFF__PERIOD_BASELINE,
+ PERF_HPP_DIFF__DELTA,
+ PERF_HPP_DIFF__RATIO,
+ PERF_HPP_DIFF__WEIGHTED_DIFF,
+ PERF_HPP_DIFF__FORMULA,
-static int perf_session__add_hist_entry(struct perf_session *self,
- struct addr_location *al, u64 count)
+ PERF_HPP_DIFF__MAX_INDEX
+};
+
+struct diff_hpp_fmt {
+ struct perf_hpp_fmt fmt;
+ int idx;
+ char *header;
+ int header_width;
+};
+
+struct data__file {
+ struct perf_session *session;
+ struct perf_data_file file;
+ int idx;
+ struct hists *hists;
+ struct diff_hpp_fmt fmt[PERF_HPP_DIFF__MAX_INDEX];
+};
+
+static struct data__file *data__files;
+static int data__files_cnt;
+
+#define data__for_each_file_start(i, d, s) \
+ for (i = s, d = &data__files[s]; \
+ i < data__files_cnt; \
+ i++, d = &data__files[i])
+
+#define data__for_each_file(i, d) data__for_each_file_start(i, d, 0)
+#define data__for_each_file_new(i, d) data__for_each_file_start(i, d, 1)
+
+static bool force;
+static bool show_period;
+static bool show_formula;
+static bool show_baseline_only;
+static unsigned int sort_compute;
+
+static s64 compute_wdiff_w1;
+static s64 compute_wdiff_w2;
+
+enum {
+ COMPUTE_DELTA,
+ COMPUTE_RATIO,
+ COMPUTE_WEIGHTED_DIFF,
+ COMPUTE_MAX,
+};
+
+const char *compute_names[COMPUTE_MAX] = {
+ [COMPUTE_DELTA] = "delta",
+ [COMPUTE_RATIO] = "ratio",
+ [COMPUTE_WEIGHTED_DIFF] = "wdiff",
+};
+
+static int compute;
+
+static int compute_2_hpp[COMPUTE_MAX] = {
+ [COMPUTE_DELTA] = PERF_HPP_DIFF__DELTA,
+ [COMPUTE_RATIO] = PERF_HPP_DIFF__RATIO,
+ [COMPUTE_WEIGHTED_DIFF] = PERF_HPP_DIFF__WEIGHTED_DIFF,
+};
+
+#define MAX_COL_WIDTH 70
+
+static struct header_column {
+ const char *name;
+ int width;
+} columns[PERF_HPP_DIFF__MAX_INDEX] = {
+ [PERF_HPP_DIFF__BASELINE] = {
+ .name = "Baseline",
+ },
+ [PERF_HPP_DIFF__PERIOD] = {
+ .name = "Period",
+ .width = 14,
+ },
+ [PERF_HPP_DIFF__PERIOD_BASELINE] = {
+ .name = "Base period",
+ .width = 14,
+ },
+ [PERF_HPP_DIFF__DELTA] = {
+ .name = "Delta",
+ .width = 7,
+ },
+ [PERF_HPP_DIFF__RATIO] = {
+ .name = "Ratio",
+ .width = 14,
+ },
+ [PERF_HPP_DIFF__WEIGHTED_DIFF] = {
+ .name = "Weighted diff",
+ .width = 14,
+ },
+ [PERF_HPP_DIFF__FORMULA] = {
+ .name = "Formula",
+ .width = MAX_COL_WIDTH,
+ }
+};
+
+static int setup_compute_opt_wdiff(char *opt)
{
- bool hit;
- struct hist_entry *he = __perf_session__add_hist_entry(&self->hists,
- al, NULL,
- count, &hit);
- if (he == NULL)
- return -ENOMEM;
+ char *w1_str = opt;
+ char *w2_str;
+
+ int ret = -EINVAL;
+
+ if (!opt)
+ goto out;
+
+ w2_str = strchr(opt, ',');
+ if (!w2_str)
+ goto out;
+
+ *w2_str++ = 0x0;
+ if (!*w2_str)
+ goto out;
+
+ compute_wdiff_w1 = strtol(w1_str, NULL, 10);
+ compute_wdiff_w2 = strtol(w2_str, NULL, 10);
+
+ if (!compute_wdiff_w1 || !compute_wdiff_w2)
+ goto out;
+
+ pr_debug("compute wdiff w1(%" PRId64 ") w2(%" PRId64 ")\n",
+ compute_wdiff_w1, compute_wdiff_w2);
- if (hit)
- he->count += count;
+ ret = 0;
+
+ out:
+ if (ret)
+ pr_err("Failed: wrong weight data, use 'wdiff:w1,w2'\n");
+
+ return ret;
+}
+
+static int setup_compute_opt(char *opt)
+{
+ if (compute == COMPUTE_WEIGHTED_DIFF)
+ return setup_compute_opt_wdiff(opt);
+
+ if (opt) {
+ pr_err("Failed: extra option specified '%s'", opt);
+ return -EINVAL;
+ }
return 0;
}
-static int diff__process_sample_event(event_t *event, struct perf_session *session)
+static int setup_compute(const struct option *opt, const char *str,
+ int unset __maybe_unused)
{
- struct addr_location al;
- struct sample_data data = { .period = 1, };
+ int *cp = (int *) opt->value;
+ char *cstr = (char *) str;
+ char buf[50];
+ unsigned i;
+ char *option;
+
+ if (!str) {
+ *cp = COMPUTE_DELTA;
+ return 0;
+ }
- dump_printf("(IP, %d): %d: %#Lx\n", event->header.misc,
- event->ip.pid, event->ip.ip);
+ option = strchr(str, ':');
+ if (option) {
+ unsigned len = option++ - str;
- if (event__preprocess_sample(event, session, &al, NULL) < 0) {
- pr_warning("problem processing %d event, skipping it.\n",
- event->header.type);
- return -1;
+ /*
+ * The str data are not writeable, so we need
+ * to use another buffer.
+ */
+
+ /* No valid compute method name is this long. */
+ if (len >= sizeof(buf))
+ return -EINVAL;
+
+ strncpy(buf, str, len);
+ buf[len] = 0x0;
+ cstr = buf;
+ }
+
+ for (i = 0; i < COMPUTE_MAX; i++)
+ if (!strcmp(cstr, compute_names[i])) {
+ *cp = i;
+ return setup_compute_opt(option);
+ }
+
+ pr_err("Failed: '%s' is not computation method "
+ "(use 'delta','ratio' or 'wdiff')\n", str);
+ return -EINVAL;
+}
+
+static double period_percent(struct hist_entry *he, u64 period)
+{
+ u64 total = hists__total_period(he->hists);
+
+ return (period * 100.0) / total;
+}
+
+static double compute_delta(struct hist_entry *he, struct hist_entry *pair)
+{
+ double old_percent = period_percent(he, he->stat.period);
+ double new_percent = period_percent(pair, pair->stat.period);
+
+ pair->diff.period_ratio_delta = new_percent - old_percent;
+ pair->diff.computed = true;
+ return pair->diff.period_ratio_delta;
+}
+
+static double compute_ratio(struct hist_entry *he, struct hist_entry *pair)
+{
+ double old_period = he->stat.period ?: 1;
+ double new_period = pair->stat.period;
+
+ pair->diff.computed = true;
+ pair->diff.period_ratio = new_period / old_period;
+ return pair->diff.period_ratio;
+}
+
+static s64 compute_wdiff(struct hist_entry *he, struct hist_entry *pair)
+{
+ u64 old_period = he->stat.period;
+ u64 new_period = pair->stat.period;
+
+ pair->diff.computed = true;
+ pair->diff.wdiff = new_period * compute_wdiff_w2 -
+ old_period * compute_wdiff_w1;
+
+ return pair->diff.wdiff;
+}
+
+static int formula_delta(struct hist_entry *he, struct hist_entry *pair,
+ char *buf, size_t size)
+{
+ u64 he_total = he->hists->stats.total_period;
+ u64 pair_total = pair->hists->stats.total_period;
+
+ if (symbol_conf.filter_relative) {
+ he_total = he->hists->stats.total_non_filtered_period;
+ pair_total = pair->hists->stats.total_non_filtered_period;
+ }
+ return scnprintf(buf, size,
+ "(%" PRIu64 " * 100 / %" PRIu64 ") - "
+ "(%" PRIu64 " * 100 / %" PRIu64 ")",
+ pair->stat.period, pair_total,
+ he->stat.period, he_total);
+}
+
+static int formula_ratio(struct hist_entry *he, struct hist_entry *pair,
+ char *buf, size_t size)
+{
+ double old_period = he->stat.period;
+ double new_period = pair->stat.period;
+
+ return scnprintf(buf, size, "%.0F / %.0F", new_period, old_period);
+}
+
+static int formula_wdiff(struct hist_entry *he, struct hist_entry *pair,
+ char *buf, size_t size)
+{
+ u64 old_period = he->stat.period;
+ u64 new_period = pair->stat.period;
+
+ return scnprintf(buf, size,
+ "(%" PRIu64 " * " "%" PRId64 ") - (%" PRIu64 " * " "%" PRId64 ")",
+ new_period, compute_wdiff_w2, old_period, compute_wdiff_w1);
+}
+
+static int formula_fprintf(struct hist_entry *he, struct hist_entry *pair,
+ char *buf, size_t size)
+{
+ switch (compute) {
+ case COMPUTE_DELTA:
+ return formula_delta(he, pair, buf, size);
+ case COMPUTE_RATIO:
+ return formula_ratio(he, pair, buf, size);
+ case COMPUTE_WEIGHTED_DIFF:
+ return formula_wdiff(he, pair, buf, size);
+ default:
+ BUG_ON(1);
}
- if (al.filtered || al.sym == NULL)
+ return -1;
+}
+
+static int hists__add_entry(struct hists *hists,
+ struct addr_location *al, u64 period,
+ u64 weight, u64 transaction)
+{
+ if (__hists__add_entry(hists, al, NULL, NULL, NULL, period, weight,
+ transaction, true) != NULL)
return 0;
+ return -ENOMEM;
+}
+
+static int diff__process_sample_event(struct perf_tool *tool __maybe_unused,
+ union perf_event *event,
+ struct perf_sample *sample,
+ struct perf_evsel *evsel,
+ struct machine *machine)
+{
+ struct addr_location al;
- event__parse_sample(event, session->sample_type, &data);
+ if (perf_event__preprocess_sample(event, machine, &al, sample) < 0) {
+ pr_warning("problem processing %d event, skipping it.\n",
+ event->header.type);
+ return -1;
+ }
- if (perf_session__add_hist_entry(session, &al, data.period)) {
- pr_warning("problem incrementing symbol count, skipping event\n");
+ if (hists__add_entry(&evsel->hists, &al, sample->period,
+ sample->weight, sample->transaction)) {
+ pr_warning("problem incrementing symbol period, skipping event\n");
return -1;
}
- session->events_stats.total += data.period;
+ /*
+ * The total_period is updated here before going to the output
+ * tree since normally only the baseline hists will call
+ * hists__output_resort() and precompute needs the total
+ * period in order to sort entries by percentage delta.
+ */
+ evsel->hists.stats.total_period += sample->period;
+ if (!al.filtered)
+ evsel->hists.stats.total_non_filtered_period += sample->period;
+
return 0;
}
-static struct perf_event_ops event_ops = {
+static struct perf_tool tool = {
.sample = diff__process_sample_event,
- .mmap = event__process_mmap,
- .comm = event__process_comm,
- .exit = event__process_task,
- .fork = event__process_task,
- .lost = event__process_lost,
+ .mmap = perf_event__process_mmap,
+ .comm = perf_event__process_comm,
+ .exit = perf_event__process_exit,
+ .fork = perf_event__process_fork,
+ .lost = perf_event__process_lost,
+ .ordered_samples = true,
+ .ordering_requires_timestamps = true,
};
-static void perf_session__insert_hist_entry_by_name(struct rb_root *root,
- struct hist_entry *he)
+static struct perf_evsel *evsel_match(struct perf_evsel *evsel,
+ struct perf_evlist *evlist)
+{
+ struct perf_evsel *e;
+
+ evlist__for_each(evlist, e) {
+ if (perf_evsel__match2(evsel, e))
+ return e;
+ }
+
+ return NULL;
+}
+
+static void perf_evlist__collapse_resort(struct perf_evlist *evlist)
+{
+ struct perf_evsel *evsel;
+
+ evlist__for_each(evlist, evsel) {
+ struct hists *hists = &evsel->hists;
+
+ hists__collapse_resort(hists, NULL);
+ }
+}
+
+static struct hist_entry*
+get_pair_data(struct hist_entry *he, struct data__file *d)
+{
+ if (hist_entry__has_pairs(he)) {
+ struct hist_entry *pair;
+
+ list_for_each_entry(pair, &he->pairs.head, pairs.node)
+ if (pair->hists == d->hists)
+ return pair;
+ }
+
+ return NULL;
+}
+
+static struct hist_entry*
+get_pair_fmt(struct hist_entry *he, struct diff_hpp_fmt *dfmt)
+{
+ void *ptr = dfmt - dfmt->idx;
+ struct data__file *d = container_of(ptr, struct data__file, fmt);
+
+ return get_pair_data(he, d);
+}
+
+static void hists__baseline_only(struct hists *hists)
+{
+ struct rb_root *root;
+ struct rb_node *next;
+
+ if (sort__need_collapse)
+ root = &hists->entries_collapsed;
+ else
+ root = hists->entries_in;
+
+ next = rb_first(root);
+ while (next != NULL) {
+ struct hist_entry *he = rb_entry(next, struct hist_entry, rb_node_in);
+
+ next = rb_next(&he->rb_node_in);
+ if (!hist_entry__next_pair(he)) {
+ rb_erase(&he->rb_node_in, root);
+ hist_entry__free(he);
+ }
+ }
+}
+
+static void hists__precompute(struct hists *hists)
+{
+ struct rb_root *root;
+ struct rb_node *next;
+
+ if (sort__need_collapse)
+ root = &hists->entries_collapsed;
+ else
+ root = hists->entries_in;
+
+ next = rb_first(root);
+ while (next != NULL) {
+ struct hist_entry *he, *pair;
+
+ he = rb_entry(next, struct hist_entry, rb_node_in);
+ next = rb_next(&he->rb_node_in);
+
+ pair = get_pair_data(he, &data__files[sort_compute]);
+ if (!pair)
+ continue;
+
+ switch (compute) {
+ case COMPUTE_DELTA:
+ compute_delta(he, pair);
+ break;
+ case COMPUTE_RATIO:
+ compute_ratio(he, pair);
+ break;
+ case COMPUTE_WEIGHTED_DIFF:
+ compute_wdiff(he, pair);
+ break;
+ default:
+ BUG_ON(1);
+ }
+ }
+}
+
+static int64_t cmp_doubles(double l, double r)
+{
+ if (l > r)
+ return -1;
+ else if (l < r)
+ return 1;
+ else
+ return 0;
+}
+
+static int64_t
+__hist_entry__cmp_compute(struct hist_entry *left, struct hist_entry *right,
+ int c)
+{
+ switch (c) {
+ case COMPUTE_DELTA:
+ {
+ double l = left->diff.period_ratio_delta;
+ double r = right->diff.period_ratio_delta;
+
+ return cmp_doubles(l, r);
+ }
+ case COMPUTE_RATIO:
+ {
+ double l = left->diff.period_ratio;
+ double r = right->diff.period_ratio;
+
+ return cmp_doubles(l, r);
+ }
+ case COMPUTE_WEIGHTED_DIFF:
+ {
+ s64 l = left->diff.wdiff;
+ s64 r = right->diff.wdiff;
+
+ return r - l;
+ }
+ default:
+ BUG_ON(1);
+ }
+
+ return 0;
+}
+
+static int64_t
+hist_entry__cmp_compute(struct hist_entry *left, struct hist_entry *right,
+ int c)
+{
+ bool pairs_left = hist_entry__has_pairs(left);
+ bool pairs_right = hist_entry__has_pairs(right);
+ struct hist_entry *p_right, *p_left;
+
+ if (!pairs_left && !pairs_right)
+ return 0;
+
+ if (!pairs_left || !pairs_right)
+ return pairs_left ? -1 : 1;
+
+ p_left = get_pair_data(left, &data__files[sort_compute]);
+ p_right = get_pair_data(right, &data__files[sort_compute]);
+
+ if (!p_left && !p_right)
+ return 0;
+
+ if (!p_left || !p_right)
+ return p_left ? -1 : 1;
+
+ /*
+ * We have two entries of the same kind;
+ * compare their data.
+ */
+ return __hist_entry__cmp_compute(p_left, p_right, c);
+}
+
+static void insert_hist_entry_by_compute(struct rb_root *root,
+ struct hist_entry *he,
+ int c)
{
struct rb_node **p = &root->rb_node;
struct rb_node *parent = NULL;
@@ -85,7 +554,7 @@ static void perf_session__insert_hist_entry_by_name(struct rb_root *root,
while (*p != NULL) {
parent = *p;
iter = rb_entry(parent, struct hist_entry, rb_node);
- if (hist_entry__cmp(he, iter) < 0)
+ if (hist_entry__cmp_compute(he, iter, c) < 0)
p = &(*p)->rb_left;
else
p = &(*p)->rb_right;
@@ -95,90 +564,149 @@ static void perf_session__insert_hist_entry_by_name(struct rb_root *root,
rb_insert_color(&he->rb_node, root);
}
-static void perf_session__resort_hist_entries(struct perf_session *self)
+static void hists__compute_resort(struct hists *hists)
{
- unsigned long position = 1;
- struct rb_root tmp = RB_ROOT;
- struct rb_node *next = rb_first(&self->hists);
+ struct rb_root *root;
+ struct rb_node *next;
+
+ if (sort__need_collapse)
+ root = &hists->entries_collapsed;
+ else
+ root = hists->entries_in;
+
+ hists->entries = RB_ROOT;
+ next = rb_first(root);
+
+ hists__reset_stats(hists);
+ hists__reset_col_len(hists);
while (next != NULL) {
- struct hist_entry *n = rb_entry(next, struct hist_entry, rb_node);
+ struct hist_entry *he;
+
+ he = rb_entry(next, struct hist_entry, rb_node_in);
+ next = rb_next(&he->rb_node_in);
+
+ insert_hist_entry_by_compute(&hists->entries, he, compute);
+ hists__inc_stats(hists, he);
- next = rb_next(&n->rb_node);
- rb_erase(&n->rb_node, &self->hists);
- n->position = position++;
- perf_session__insert_hist_entry_by_name(&tmp, n);
+ if (!he->filtered)
+ hists__calc_col_len(hists, he);
+ }
+}
+
+static void hists__process(struct hists *hists)
+{
+ if (show_baseline_only)
+ hists__baseline_only(hists);
+
+ if (sort_compute) {
+ hists__precompute(hists);
+ hists__compute_resort(hists);
+ } else {
+ hists__output_resort(hists);
}
- self->hists = tmp;
+ hists__fprintf(hists, true, 0, 0, 0, stdout);
}
-static void perf_session__set_hist_entries_positions(struct perf_session *self)
+static void data__fprintf(void)
{
- perf_session__output_resort(&self->hists, self->events_stats.total);
- perf_session__resort_hist_entries(self);
+ struct data__file *d;
+ int i;
+
+ fprintf(stdout, "# Data files:\n");
+
+ data__for_each_file(i, d)
+ fprintf(stdout, "# [%d] %s %s\n",
+ d->idx, d->file.path,
+ !d->idx ? "(Baseline)" : "");
+
+ fprintf(stdout, "#\n");
}
-static struct hist_entry *
-perf_session__find_hist_entry(struct perf_session *self,
- struct hist_entry *he)
+static void data_process(void)
{
- struct rb_node *n = self->hists.rb_node;
+ struct perf_evlist *evlist_base = data__files[0].session->evlist;
+ struct perf_evsel *evsel_base;
+ bool first = true;
- while (n) {
- struct hist_entry *iter = rb_entry(n, struct hist_entry, rb_node);
- int64_t cmp = hist_entry__cmp(he, iter);
+ evlist__for_each(evlist_base, evsel_base) {
+ struct data__file *d;
+ int i;
- if (cmp < 0)
- n = n->rb_left;
- else if (cmp > 0)
- n = n->rb_right;
- else
- return iter;
- }
+ data__for_each_file_new(i, d) {
+ struct perf_evlist *evlist = d->session->evlist;
+ struct perf_evsel *evsel;
- return NULL;
+ evsel = evsel_match(evsel_base, evlist);
+ if (!evsel)
+ continue;
+
+ d->hists = &evsel->hists;
+
+ hists__match(&evsel_base->hists, &evsel->hists);
+
+ if (!show_baseline_only)
+ hists__link(&evsel_base->hists,
+ &evsel->hists);
+ }
+
+ fprintf(stdout, "%s# Event '%s'\n#\n", first ? "" : "\n",
+ perf_evsel__name(evsel_base));
+
+ first = false;
+
+ if (verbose || data__files_cnt > 2)
+ data__fprintf();
+
+ hists__process(&evsel_base->hists);
+ }
}
-static void perf_session__match_hists(struct perf_session *old_session,
- struct perf_session *new_session)
+static void data__free(struct data__file *d)
{
- struct rb_node *nd;
+ int col;
- for (nd = rb_first(&new_session->hists); nd; nd = rb_next(nd)) {
- struct hist_entry *pos = rb_entry(nd, struct hist_entry, rb_node);
- pos->pair = perf_session__find_hist_entry(old_session, pos);
+ for (col = 0; col < PERF_HPP_DIFF__MAX_INDEX; col++) {
+ struct diff_hpp_fmt *fmt = &d->fmt[col];
+
+ zfree(&fmt->header);
}
}
static int __cmd_diff(void)
{
- int ret, i;
- struct perf_session *session[2];
+ struct data__file *d;
+ int ret = -EINVAL, i;
- session[0] = perf_session__new(input_old, O_RDONLY, force);
- session[1] = perf_session__new(input_new, O_RDONLY, force);
- if (session[0] == NULL || session[1] == NULL)
- return -ENOMEM;
+ data__for_each_file(i, d) {
+ d->session = perf_session__new(&d->file, false, &tool);
+ if (!d->session) {
+ pr_err("Failed to open %s\n", d->file.path);
+ ret = -ENOMEM;
+ goto out_delete;
+ }
- for (i = 0; i < 2; ++i) {
- ret = perf_session__process_events(session[i], &event_ops);
- if (ret)
+ ret = perf_session__process_events(d->session, &tool);
+ if (ret) {
+ pr_err("Failed to process %s\n", d->file.path);
goto out_delete;
+ }
+
+ perf_evlist__collapse_resort(d->session->evlist);
}
- perf_session__output_resort(&session[1]->hists,
- session[1]->events_stats.total);
- if (show_displacement)
- perf_session__set_hist_entries_positions(session[0]);
+ data_process();
+
+ out_delete:
+ data__for_each_file(i, d) {
+ if (d->session)
+ perf_session__delete(d->session);
+
+ data__free(d);
+ }
- perf_session__match_hists(session[0], session[1]);
- perf_session__fprintf_hists(&session[1]->hists, session[0],
- show_displacement, stdout,
- session[1]->events_stats.total);
-out_delete:
- for (i = 0; i < 2; ++i)
- perf_session__delete(session[i]);
+ free(data__files);
return ret;
}
@@ -188,17 +716,23 @@ static const char * const diff_usage[] = {
};
static const struct option options[] = {
- OPT_BOOLEAN('v', "verbose", &verbose,
+ OPT_INCR('v', "verbose", &verbose,
"be more verbose (show symbol address, etc)"),
- OPT_BOOLEAN('m', "displacement", &show_displacement,
- "Show position displacement relative to baseline"),
+ OPT_BOOLEAN('b', "baseline-only", &show_baseline_only,
+ "Show only items with match in baseline"),
+ OPT_CALLBACK('c', "compute", &compute,
+ "delta,ratio,wdiff:w1,w2 (default delta)",
+ "Entries differential computation selection",
+ setup_compute),
+ OPT_BOOLEAN('p', "period", &show_period,
+ "Show period values."),
+ OPT_BOOLEAN('F', "formula", &show_formula,
+ "Show formula."),
OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
"dump raw trace in ASCII"),
OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
OPT_BOOLEAN('m', "modules", &symbol_conf.use_modules,
"load module symbols - WARNING: use only with -k and LIVE kernel"),
- OPT_BOOLEAN('P', "full-paths", &symbol_conf.full_paths,
- "Don't shorten the pathnames taking into account the cwd"),
OPT_STRING('d', "dsos", &symbol_conf.dso_list_str, "dso[,dso...]",
"only consider symbols in these dsos"),
OPT_STRING('C', "comms", &symbol_conf.comm_list_str, "comm[,comm...]",
@@ -206,37 +740,425 @@ static const struct option options[] = {
OPT_STRING('S', "symbols", &symbol_conf.sym_list_str, "symbol[,symbol...]",
"only consider these symbols"),
OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
- "sort by key(s): pid, comm, dso, symbol, parent"),
+ "sort by key(s): pid, comm, dso, symbol, parent, cpu, srcline, ..."
+ " Please refer the man page for the complete list."),
OPT_STRING('t', "field-separator", &symbol_conf.field_sep, "separator",
"separator for columns, no spaces will be added between "
"columns '.' is reserved."),
+ OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
+ "Look for files with symbols relative to this directory"),
+ OPT_UINTEGER('o', "order", &sort_compute, "Specify compute sorting."),
+ OPT_CALLBACK(0, "percentage", NULL, "relative|absolute",
+ "How to display percentage of filtered entries", parse_filter_percentage),
OPT_END()
};
-int cmd_diff(int argc, const char **argv, const char *prefix __used)
+static double baseline_percent(struct hist_entry *he)
{
- sort_order = diff__default_sort_order;
- argc = parse_options(argc, argv, options, diff_usage, 0);
+ u64 total = hists__total_period(he->hists);
+
+ return 100.0 * he->stat.period / total;
+}
+
+static int hpp__color_baseline(struct perf_hpp_fmt *fmt,
+ struct perf_hpp *hpp, struct hist_entry *he)
+{
+ struct diff_hpp_fmt *dfmt =
+ container_of(fmt, struct diff_hpp_fmt, fmt);
+ double percent = baseline_percent(he);
+ char pfmt[20] = " ";
+
+ if (!he->dummy) {
+ scnprintf(pfmt, 20, "%%%d.2f%%%%", dfmt->header_width - 1);
+ return percent_color_snprintf(hpp->buf, hpp->size,
+ pfmt, percent);
+ } else
+ return scnprintf(hpp->buf, hpp->size, "%*s",
+ dfmt->header_width, pfmt);
+}
+
+static int hpp__entry_baseline(struct hist_entry *he, char *buf, size_t size)
+{
+ double percent = baseline_percent(he);
+ const char *fmt = symbol_conf.field_sep ? "%.2f" : "%6.2f%%";
+ int ret = 0;
+
+ if (!he->dummy)
+ ret = scnprintf(buf, size, fmt, percent);
+
+ return ret;
+}
+
+static int __hpp__color_compare(struct perf_hpp_fmt *fmt,
+ struct perf_hpp *hpp, struct hist_entry *he,
+ int comparison_method)
+{
+ struct diff_hpp_fmt *dfmt =
+ container_of(fmt, struct diff_hpp_fmt, fmt);
+ struct hist_entry *pair = get_pair_fmt(he, dfmt);
+ double diff;
+ s64 wdiff;
+ char pfmt[20] = " ";
+
+ if (!pair)
+ goto dummy_print;
+
+ switch (comparison_method) {
+ case COMPUTE_DELTA:
+ if (pair->diff.computed)
+ diff = pair->diff.period_ratio_delta;
+ else
+ diff = compute_delta(he, pair);
+
+ if (fabs(diff) < 0.01)
+ goto dummy_print;
+ scnprintf(pfmt, 20, "%%%+d.2f%%%%", dfmt->header_width - 1);
+ return percent_color_snprintf(hpp->buf, hpp->size,
+ pfmt, diff);
+ case COMPUTE_RATIO:
+ if (he->dummy)
+ goto dummy_print;
+ if (pair->diff.computed)
+ diff = pair->diff.period_ratio;
+ else
+ diff = compute_ratio(he, pair);
+
+ scnprintf(pfmt, 20, "%%%d.6f", dfmt->header_width);
+ return value_color_snprintf(hpp->buf, hpp->size,
+ pfmt, diff);
+ case COMPUTE_WEIGHTED_DIFF:
+ if (he->dummy)
+ goto dummy_print;
+ if (pair->diff.computed)
+ wdiff = pair->diff.wdiff;
+ else
+ wdiff = compute_wdiff(he, pair);
+
+ scnprintf(pfmt, 20, "%%14ld", dfmt->header_width);
+ return color_snprintf(hpp->buf, hpp->size,
+ get_percent_color(wdiff),
+ pfmt, wdiff);
+ default:
+ BUG_ON(1);
+ }
+dummy_print:
+ return scnprintf(hpp->buf, hpp->size, "%*s",
+ dfmt->header_width, pfmt);
+}
+
+static int hpp__color_delta(struct perf_hpp_fmt *fmt,
+ struct perf_hpp *hpp, struct hist_entry *he)
+{
+ return __hpp__color_compare(fmt, hpp, he, COMPUTE_DELTA);
+}
+
+static int hpp__color_ratio(struct perf_hpp_fmt *fmt,
+ struct perf_hpp *hpp, struct hist_entry *he)
+{
+ return __hpp__color_compare(fmt, hpp, he, COMPUTE_RATIO);
+}
+
+static int hpp__color_wdiff(struct perf_hpp_fmt *fmt,
+ struct perf_hpp *hpp, struct hist_entry *he)
+{
+ return __hpp__color_compare(fmt, hpp, he, COMPUTE_WEIGHTED_DIFF);
+}
+
+static void
+hpp__entry_unpair(struct hist_entry *he, int idx, char *buf, size_t size)
+{
+ switch (idx) {
+ case PERF_HPP_DIFF__PERIOD_BASELINE:
+ scnprintf(buf, size, "%" PRIu64, he->stat.period);
+ break;
+
+ default:
+ break;
+ }
+}
+
+static void
+hpp__entry_pair(struct hist_entry *he, struct hist_entry *pair,
+ int idx, char *buf, size_t size)
+{
+ double diff;
+ double ratio;
+ s64 wdiff;
+
+ switch (idx) {
+ case PERF_HPP_DIFF__DELTA:
+ if (pair->diff.computed)
+ diff = pair->diff.period_ratio_delta;
+ else
+ diff = compute_delta(he, pair);
+
+ if (fabs(diff) >= 0.01)
+ scnprintf(buf, size, "%+4.2F%%", diff);
+ break;
+
+ case PERF_HPP_DIFF__RATIO:
+ /* No point in a ratio number for a dummy entry. */
+ if (he->dummy)
+ break;
+
+ if (pair->diff.computed)
+ ratio = pair->diff.period_ratio;
+ else
+ ratio = compute_ratio(he, pair);
+
+ if (ratio > 0.0)
+ scnprintf(buf, size, "%14.6F", ratio);
+ break;
+
+ case PERF_HPP_DIFF__WEIGHTED_DIFF:
+ /* No point in a wdiff number for a dummy entry. */
+ if (he->dummy)
+ break;
+
+ if (pair->diff.computed)
+ wdiff = pair->diff.wdiff;
+ else
+ wdiff = compute_wdiff(he, pair);
+
+ if (wdiff != 0)
+ scnprintf(buf, size, "%14ld", wdiff);
+ break;
+
+ case PERF_HPP_DIFF__FORMULA:
+ formula_fprintf(he, pair, buf, size);
+ break;
+
+ case PERF_HPP_DIFF__PERIOD:
+ scnprintf(buf, size, "%" PRIu64, pair->stat.period);
+ break;
+
+ default:
+ BUG_ON(1);
+ };
+}
+
+static void
+__hpp__entry_global(struct hist_entry *he, struct diff_hpp_fmt *dfmt,
+ char *buf, size_t size)
+{
+ struct hist_entry *pair = get_pair_fmt(he, dfmt);
+ int idx = dfmt->idx;
+
+ /* baseline is special */
+ if (idx == PERF_HPP_DIFF__BASELINE)
+ hpp__entry_baseline(he, buf, size);
+ else {
+ if (pair)
+ hpp__entry_pair(he, pair, idx, buf, size);
+ else
+ hpp__entry_unpair(he, idx, buf, size);
+ }
+}
+
+static int hpp__entry_global(struct perf_hpp_fmt *_fmt, struct perf_hpp *hpp,
+ struct hist_entry *he)
+{
+ struct diff_hpp_fmt *dfmt =
+ container_of(_fmt, struct diff_hpp_fmt, fmt);
+ char buf[MAX_COL_WIDTH] = " ";
+
+ __hpp__entry_global(he, dfmt, buf, MAX_COL_WIDTH);
+
+ if (symbol_conf.field_sep)
+ return scnprintf(hpp->buf, hpp->size, "%s", buf);
+ else
+ return scnprintf(hpp->buf, hpp->size, "%*s",
+ dfmt->header_width, buf);
+}
+
+static int hpp__header(struct perf_hpp_fmt *fmt, struct perf_hpp *hpp,
+ struct perf_evsel *evsel __maybe_unused)
+{
+ struct diff_hpp_fmt *dfmt =
+ container_of(fmt, struct diff_hpp_fmt, fmt);
+
+ BUG_ON(!dfmt->header);
+ return scnprintf(hpp->buf, hpp->size, dfmt->header);
+}
+
+static int hpp__width(struct perf_hpp_fmt *fmt,
+ struct perf_hpp *hpp __maybe_unused,
+ struct perf_evsel *evsel __maybe_unused)
+{
+ struct diff_hpp_fmt *dfmt =
+ container_of(fmt, struct diff_hpp_fmt, fmt);
+
+ BUG_ON(dfmt->header_width <= 0);
+ return dfmt->header_width;
+}
+
+static void init_header(struct data__file *d, struct diff_hpp_fmt *dfmt)
+{
+#define MAX_HEADER_NAME 100
+ char buf_indent[MAX_HEADER_NAME];
+ char buf[MAX_HEADER_NAME];
+ const char *header = NULL;
+ int width = 0;
+
+ BUG_ON(dfmt->idx >= PERF_HPP_DIFF__MAX_INDEX);
+ header = columns[dfmt->idx].name;
+ width = columns[dfmt->idx].width;
+
+ /* Only our defined HPP fmts should appear here. */
+ BUG_ON(!header);
+
+ if (data__files_cnt > 2)
+ scnprintf(buf, MAX_HEADER_NAME, "%s/%d", header, d->idx);
+
+#define NAME (data__files_cnt > 2 ? buf : header)
+ dfmt->header_width = width;
+ width = (int) strlen(NAME);
+ if (dfmt->header_width < width)
+ dfmt->header_width = width;
+
+ scnprintf(buf_indent, MAX_HEADER_NAME, "%*s",
+ dfmt->header_width, NAME);
+
+ dfmt->header = strdup(buf_indent);
+#undef MAX_HEADER_NAME
+#undef NAME
+}
+
+static void data__hpp_register(struct data__file *d, int idx)
+{
+ struct diff_hpp_fmt *dfmt = &d->fmt[idx];
+ struct perf_hpp_fmt *fmt = &dfmt->fmt;
+
+ dfmt->idx = idx;
+
+ fmt->header = hpp__header;
+ fmt->width = hpp__width;
+ fmt->entry = hpp__entry_global;
+
+ /* TODO more colors */
+ switch (idx) {
+ case PERF_HPP_DIFF__BASELINE:
+ fmt->color = hpp__color_baseline;
+ break;
+ case PERF_HPP_DIFF__DELTA:
+ fmt->color = hpp__color_delta;
+ break;
+ case PERF_HPP_DIFF__RATIO:
+ fmt->color = hpp__color_ratio;
+ break;
+ case PERF_HPP_DIFF__WEIGHTED_DIFF:
+ fmt->color = hpp__color_wdiff;
+ break;
+ default:
+ break;
+ }
+
+ init_header(d, dfmt);
+ perf_hpp__column_register(fmt);
+}
+
+static void ui_init(void)
+{
+ struct data__file *d;
+ int i;
+
+ data__for_each_file(i, d) {
+
+ /*
+ * Baseline or compute related columns:
+ *
+ * PERF_HPP_DIFF__BASELINE
+ * PERF_HPP_DIFF__DELTA
+ * PERF_HPP_DIFF__RATIO
+ * PERF_HPP_DIFF__WEIGHTED_DIFF
+ */
+ data__hpp_register(d, i ? compute_2_hpp[compute] :
+ PERF_HPP_DIFF__BASELINE);
+
+ /*
+ * And the rest:
+ *
+ * PERF_HPP_DIFF__FORMULA
+ * PERF_HPP_DIFF__PERIOD
+ * PERF_HPP_DIFF__PERIOD_BASELINE
+ */
+ if (show_formula && i)
+ data__hpp_register(d, PERF_HPP_DIFF__FORMULA);
+
+ if (show_period)
+ data__hpp_register(d, i ? PERF_HPP_DIFF__PERIOD :
+ PERF_HPP_DIFF__PERIOD_BASELINE);
+ }
+}
+
+static int data_init(int argc, const char **argv)
+{
+ struct data__file *d;
+ static const char *defaults[] = {
+ "perf.data.old",
+ "perf.data",
+ };
+ bool use_default = true;
+ int i;
+
+ data__files_cnt = 2;
+
if (argc) {
- if (argc > 2)
- usage_with_options(diff_usage, options);
- if (argc == 2) {
- input_old = argv[0];
- input_new = argv[1];
- } else
- input_new = argv[0];
+ if (argc == 1)
+ defaults[1] = argv[0];
+ else {
+ data__files_cnt = argc;
+ use_default = false;
+ }
+ } else if (perf_guest) {
+ defaults[0] = "perf.data.host";
+ defaults[1] = "perf.data.guest";
}
- symbol_conf.exclude_other = false;
+ if (sort_compute >= (unsigned int) data__files_cnt) {
+ pr_err("Order option out of limit.\n");
+ return -EINVAL;
+ }
+
+ data__files = zalloc(sizeof(*data__files) * data__files_cnt);
+ if (!data__files)
+ return -ENOMEM;
+
+ data__for_each_file(i, d) {
+ struct perf_data_file *file = &d->file;
+
+ file->path = use_default ? defaults[i] : argv[i];
+ file->mode = PERF_DATA_MODE_READ,
+ file->force = force,
+
+ d->idx = i;
+ }
+
+ return 0;
+}
+
+int cmd_diff(int argc, const char **argv, const char *prefix __maybe_unused)
+{
+ perf_config(perf_default_config, NULL);
+
+ argc = parse_options(argc, argv, options, diff_usage, 0);
+
if (symbol__init() < 0)
return -1;
- setup_sorting(diff_usage, options);
+ if (data_init(argc, argv) < 0)
+ return -1;
+
+ ui_init();
+
+ sort__mode = SORT_MODE__DIFF;
+
+ if (setup_sorting() < 0)
+ usage_with_options(diff_usage, options);
+
setup_pager();
- sort_entry__setup_elide(&sort_dso, symbol_conf.dso_list, "dso", NULL);
- sort_entry__setup_elide(&sort_comm, symbol_conf.comm_list, "comm", NULL);
- sort_entry__setup_elide(&sort_sym, symbol_conf.sym_list, "symbol", NULL);
+ sort__setup_elide(NULL);
return __cmd_diff();
}
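
For a concrete feel of the three -c computations introduced above, here is a tiny standalone example with made-up periods, following the arithmetic in compute_delta(), compute_ratio() and compute_wdiff(); the numbers and weights are purely illustrative.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* one symbol's period and the per-file totals: old vs. new data file */
	uint64_t old_period = 200, old_total = 1000;
	uint64_t new_period = 480, new_total = 1600;
	/* weights as given on the command line with -c wdiff:w1,w2 */
	int64_t w1 = 1, w2 = 2;

	/* delta: difference of the percentage shares */
	double delta = 100.0 * new_period / new_total -
		       100.0 * old_period / old_total;	/* 30.00 - 20.00 = +10.00 */

	/* ratio: new period over old period */
	double ratio = (double)new_period / old_period;	/* 480 / 200 = 2.4 */

	/* wdiff: weighted difference of the raw periods */
	int64_t wdiff = (int64_t)new_period * w2 -
			(int64_t)old_period * w1;	/* 960 - 200 = 760 */

	printf("delta %+.2f%%  ratio %.6f  wdiff %" PRId64 "\n",
	       delta, ratio, wdiff);
	return 0;
}
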
diff --git a/tools/perf/builtin-evlist.c b/tools/perf/builtin-evlist.c
new file mode 100644
index 00000000000..c99e0de7e54
--- /dev/null
+++ b/tools/perf/builtin-evlist.c
@@ -0,0 +1,66 @@
+/*
+ * Builtin evlist command: Show the list of event selectors present
+ * in a perf.data file.
+ */
+#include "builtin.h"
+
+#include "util/util.h"
+
+#include <linux/list.h>
+
+#include "perf.h"
+#include "util/evlist.h"
+#include "util/evsel.h"
+#include "util/parse-events.h"
+#include "util/parse-options.h"
+#include "util/session.h"
+#include "util/data.h"
+
+static int __cmd_evlist(const char *file_name, struct perf_attr_details *details)
+{
+ struct perf_session *session;
+ struct perf_evsel *pos;
+ struct perf_data_file file = {
+ .path = file_name,
+ .mode = PERF_DATA_MODE_READ,
+ };
+
+ session = perf_session__new(&file, 0, NULL);
+ if (session == NULL)
+ return -ENOMEM;
+
+ evlist__for_each(session->evlist, pos)
+ perf_evsel__fprintf(pos, details, stdout);
+
+ perf_session__delete(session);
+ return 0;
+}
+
+int cmd_evlist(int argc, const char **argv, const char *prefix __maybe_unused)
+{
+ struct perf_attr_details details = { .verbose = false, };
+ const struct option options[] = {
+ OPT_STRING('i', "input", &input_name, "file", "Input file name"),
+ OPT_BOOLEAN('F', "freq", &details.freq, "Show the sample frequency"),
+ OPT_BOOLEAN('v', "verbose", &details.verbose,
+ "Show all event attr details"),
+ OPT_BOOLEAN('g', "group", &details.event_group,
+ "Show event group information"),
+ OPT_END()
+ };
+ const char * const evlist_usage[] = {
+ "perf evlist [<options>]",
+ NULL
+ };
+
+ argc = parse_options(argc, argv, options, evlist_usage, 0);
+ if (argc)
+ usage_with_options(evlist_usage, options);
+
+ if (details.event_group && (details.verbose || details.freq)) {
+ pr_err("--group option is not compatible with other options\n");
+ usage_with_options(evlist_usage, options);
+ }
+
+ return __cmd_evlist(input_name, &details);
+}
diff --git a/tools/perf/builtin-help.c b/tools/perf/builtin-help.c
index 215b584007b..178b88ae3d2 100644
--- a/tools/perf/builtin-help.c
+++ b/tools/perf/builtin-help.c
@@ -24,28 +24,12 @@ static struct man_viewer_info_list {
} *man_viewer_info_list;
enum help_format {
+ HELP_FORMAT_NONE,
HELP_FORMAT_MAN,
HELP_FORMAT_INFO,
HELP_FORMAT_WEB,
};
-static int show_all = 0;
-static enum help_format help_format = HELP_FORMAT_MAN;
-static struct option builtin_help_options[] = {
- OPT_BOOLEAN('a', "all", &show_all, "print all available commands"),
- OPT_SET_INT('m', "man", &help_format, "show man page", HELP_FORMAT_MAN),
- OPT_SET_INT('w', "web", &help_format, "show manual in web browser",
- HELP_FORMAT_WEB),
- OPT_SET_INT('i', "info", &help_format, "show info page",
- HELP_FORMAT_INFO),
- OPT_END(),
-};
-
-static const char * const builtin_help_usage[] = {
- "perf help [--all] [--man|--web|--info] [command]",
- NULL
-};
-
static enum help_format parse_help_format(const char *format)
{
if (!strcmp(format, "man"))
@@ -54,7 +38,9 @@ static enum help_format parse_help_format(const char *format)
return HELP_FORMAT_INFO;
if (!strcmp(format, "web") || !strcmp(format, "html"))
return HELP_FORMAT_WEB;
- die("unrecognized help format '%s'", format);
+
+ pr_err("unrecognized help format '%s'", format);
+ return HELP_FORMAT_NONE;
}
static const char *get_man_viewer_info(const char *name)
@@ -255,10 +241,14 @@ static int add_man_viewer_info(const char *var, const char *value)
static int perf_help_config(const char *var, const char *value, void *cb)
{
+ enum help_format *help_formatp = cb;
+
if (!strcmp(var, "help.format")) {
if (!value)
return config_error_nonbool(var);
- help_format = parse_help_format(value);
+ *help_formatp = parse_help_format(value);
+ if (*help_formatp == HELP_FORMAT_NONE)
+ return -1;
return 0;
}
if (!strcmp(var, "man.viewer")) {
@@ -352,7 +342,7 @@ static void exec_viewer(const char *name, const char *page)
warning("'%s': unknown man viewer.", name);
}
-static void show_man_page(const char *perf_cmd)
+static int show_man_page(const char *perf_cmd)
{
struct man_viewer_list *viewer;
const char *page = cmd_to_page(perf_cmd);
@@ -365,28 +355,35 @@ static void show_man_page(const char *perf_cmd)
if (fallback)
exec_viewer(fallback, page);
exec_viewer("man", page);
- die("no man viewer handled the request");
+
+ pr_err("no man viewer handled the request");
+ return -1;
}
-static void show_info_page(const char *perf_cmd)
+static int show_info_page(const char *perf_cmd)
{
const char *page = cmd_to_page(perf_cmd);
setenv("INFOPATH", system_path(PERF_INFO_PATH), 1);
execlp("info", "info", "perfman", page, NULL);
+ return -1;
}
-static void get_html_page_path(struct strbuf *page_path, const char *page)
+static int get_html_page_path(struct strbuf *page_path, const char *page)
{
struct stat st;
const char *html_path = system_path(PERF_HTML_PATH);
/* Check that we have a perf documentation directory. */
if (stat(mkpath("%s/perf.html", html_path), &st)
- || !S_ISREG(st.st_mode))
- die("'%s': not a documentation directory.", html_path);
+ || !S_ISREG(st.st_mode)) {
+ pr_err("'%s': not a documentation directory.", html_path);
+ return -1;
+ }
strbuf_init(page_path, 0);
strbuf_addf(page_path, "%s/%s.html", html_path, page);
+
+ return 0;
}
/*
@@ -401,23 +398,42 @@ static void open_html(const char *path)
}
#endif
-static void show_html_page(const char *perf_cmd)
+static int show_html_page(const char *perf_cmd)
{
const char *page = cmd_to_page(perf_cmd);
struct strbuf page_path; /* it leaks but we exec below */
- get_html_page_path(&page_path, page);
+ if (get_html_page_path(&page_path, page) != 0)
+ return -1;
open_html(page_path.buf);
+
+ return 0;
}
-int cmd_help(int argc, const char **argv, const char *prefix __used)
+int cmd_help(int argc, const char **argv, const char *prefix __maybe_unused)
{
+ bool show_all = false;
+ enum help_format help_format = HELP_FORMAT_MAN;
+ struct option builtin_help_options[] = {
+ OPT_BOOLEAN('a', "all", &show_all, "print all available commands"),
+ OPT_SET_UINT('m', "man", &help_format, "show man page", HELP_FORMAT_MAN),
+ OPT_SET_UINT('w', "web", &help_format, "show manual in web browser",
+ HELP_FORMAT_WEB),
+ OPT_SET_UINT('i', "info", &help_format, "show info page",
+ HELP_FORMAT_INFO),
+ OPT_END(),
+ };
+ const char * const builtin_help_usage[] = {
+ "perf help [--all] [--man|--web|--info] [command]",
+ NULL
+ };
const char *alias;
+ int rc = 0;
load_command_list("perf-", &main_cmds, &other_cmds);
- perf_config(perf_help_config, NULL);
+ perf_config(perf_help_config, &help_format);
argc = parse_options(argc, argv, builtin_help_options,
builtin_help_usage, 0);
@@ -444,16 +460,20 @@ int cmd_help(int argc, const char **argv, const char *prefix __used)
switch (help_format) {
case HELP_FORMAT_MAN:
- show_man_page(argv[0]);
+ rc = show_man_page(argv[0]);
break;
case HELP_FORMAT_INFO:
- show_info_page(argv[0]);
+ rc = show_info_page(argv[0]);
break;
case HELP_FORMAT_WEB:
- show_html_page(argv[0]);
+ rc = show_html_page(argv[0]);
+ break;
+ case HELP_FORMAT_NONE:
+ /* fall-through */
default:
+ rc = -1;
break;
}
- return 0;
+ return rc;
}
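
One pattern worth noting in the help changes above: perf_help_config() no longer writes a file-scope global but stores its result through the void *cb pointer handed to perf_config(). A minimal sketch of that callback shape, with a stand-in config_walk() and hard-coded key/value pairs in place of a real config file, might look like this:

#include <stdio.h>
#include <string.h>

enum help_format { HELP_FORMAT_MAN = 1, HELP_FORMAT_INFO };

typedef int (*config_fn_t)(const char *var, const char *value, void *cb);

/* Stand-in for perf_config(): feed a couple of hard-coded pairs. */
static int config_walk(config_fn_t fn, void *cb)
{
	if (fn("help.format", "info", cb) < 0)
		return -1;
	return fn("man.viewer", "man", cb);
}

static int help_config(const char *var, const char *value, void *cb)
{
	enum help_format *fmt = cb;	/* caller state, no global needed */

	if (strcmp(var, "help.format"))
		return 0;		/* not our key, ignore it */
	if (!strcmp(value, "man"))
		*fmt = HELP_FORMAT_MAN;
	else if (!strcmp(value, "info"))
		*fmt = HELP_FORMAT_INFO;
	else
		return -1;		/* unknown format: fail the walk */
	return 0;
}

int main(void)
{
	enum help_format fmt = HELP_FORMAT_MAN;

	if (config_walk(help_config, &fmt) < 0)
		return 1;
	printf("help format: %d\n", fmt);	/* 2, i.e. HELP_FORMAT_INFO */
	return 0;
}
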
diff --git a/tools/perf/builtin-inject.c b/tools/perf/builtin-inject.c
new file mode 100644
index 00000000000..16c7c11ad06
--- /dev/null
+++ b/tools/perf/builtin-inject.c
@@ -0,0 +1,463 @@
+/*
+ * builtin-inject.c
+ *
+ * Builtin inject command: Examine the live mode (stdin) event stream
+ * and repipe it to stdout while optionally injecting additional
+ * events into it.
+ */
+#include "builtin.h"
+
+#include "perf.h"
+#include "util/color.h"
+#include "util/evlist.h"
+#include "util/evsel.h"
+#include "util/session.h"
+#include "util/tool.h"
+#include "util/debug.h"
+#include "util/build-id.h"
+#include "util/data.h"
+
+#include "util/parse-options.h"
+
+#include <linux/list.h>
+
+struct perf_inject {
+ struct perf_tool tool;
+ bool build_ids;
+ bool sched_stat;
+ const char *input_name;
+ struct perf_data_file output;
+ u64 bytes_written;
+ struct list_head samples;
+};
+
+struct event_entry {
+ struct list_head node;
+ u32 tid;
+ union perf_event event[0];
+};
+
+static int perf_event__repipe_synth(struct perf_tool *tool,
+ union perf_event *event)
+{
+ struct perf_inject *inject = container_of(tool, struct perf_inject, tool);
+ ssize_t size;
+
+ size = perf_data_file__write(&inject->output, event,
+ event->header.size);
+ if (size < 0)
+ return -errno;
+
+ inject->bytes_written += size;
+ return 0;
+}
+
+static int perf_event__repipe_op2_synth(struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_session *session
+ __maybe_unused)
+{
+ return perf_event__repipe_synth(tool, event);
+}
+
+static int perf_event__repipe_attr(struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_evlist **pevlist)
+{
+ struct perf_inject *inject = container_of(tool, struct perf_inject,
+ tool);
+ int ret;
+
+ ret = perf_event__process_attr(tool, event, pevlist);
+ if (ret)
+ return ret;
+
+ if (!inject->output.is_pipe)
+ return 0;
+
+ return perf_event__repipe_synth(tool, event);
+}
+
+static int perf_event__repipe(struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_sample *sample __maybe_unused,
+ struct machine *machine __maybe_unused)
+{
+ return perf_event__repipe_synth(tool, event);
+}
+
+typedef int (*inject_handler)(struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_sample *sample,
+ struct perf_evsel *evsel,
+ struct machine *machine);
+
+static int perf_event__repipe_sample(struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_sample *sample,
+ struct perf_evsel *evsel,
+ struct machine *machine)
+{
+ if (evsel->handler) {
+ inject_handler f = evsel->handler;
+ return f(tool, event, sample, evsel, machine);
+ }
+
+ build_id__mark_dso_hit(tool, event, sample, evsel, machine);
+
+ return perf_event__repipe_synth(tool, event);
+}
+
+static int perf_event__repipe_mmap(struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_sample *sample,
+ struct machine *machine)
+{
+ int err;
+
+ err = perf_event__process_mmap(tool, event, sample, machine);
+ perf_event__repipe(tool, event, sample, machine);
+
+ return err;
+}
+
+static int perf_event__repipe_mmap2(struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_sample *sample,
+ struct machine *machine)
+{
+ int err;
+
+ err = perf_event__process_mmap2(tool, event, sample, machine);
+ perf_event__repipe(tool, event, sample, machine);
+
+ return err;
+}
+
+static int perf_event__repipe_fork(struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_sample *sample,
+ struct machine *machine)
+{
+ int err;
+
+ err = perf_event__process_fork(tool, event, sample, machine);
+ perf_event__repipe(tool, event, sample, machine);
+
+ return err;
+}
+
+static int perf_event__repipe_tracing_data(struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_session *session)
+{
+ int err;
+
+ perf_event__repipe_synth(tool, event);
+ err = perf_event__process_tracing_data(tool, event, session);
+
+ return err;
+}
+
+static int dso__read_build_id(struct dso *dso)
+{
+ if (dso->has_build_id)
+ return 0;
+
+ if (filename__read_build_id(dso->long_name, dso->build_id,
+ sizeof(dso->build_id)) > 0) {
+ dso->has_build_id = true;
+ return 0;
+ }
+
+ return -1;
+}
+
+static int dso__inject_build_id(struct dso *dso, struct perf_tool *tool,
+ struct machine *machine)
+{
+ u16 misc = PERF_RECORD_MISC_USER;
+ int err;
+
+ if (dso__read_build_id(dso) < 0) {
+ pr_debug("no build_id found for %s\n", dso->long_name);
+ return -1;
+ }
+
+ if (dso->kernel)
+ misc = PERF_RECORD_MISC_KERNEL;
+
+ err = perf_event__synthesize_build_id(tool, dso, misc, perf_event__repipe,
+ machine);
+ if (err) {
+ pr_err("Can't synthesize build_id event for %s\n", dso->long_name);
+ return -1;
+ }
+
+ return 0;
+}
+
+static int perf_event__inject_buildid(struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_sample *sample,
+ struct perf_evsel *evsel __maybe_unused,
+ struct machine *machine)
+{
+ struct addr_location al;
+ struct thread *thread;
+ u8 cpumode;
+
+ cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
+
+ thread = machine__findnew_thread(machine, sample->pid, sample->tid);
+ if (thread == NULL) {
+ pr_err("problem processing %d event, skipping it.\n",
+ event->header.type);
+ goto repipe;
+ }
+
+ thread__find_addr_map(thread, machine, cpumode, MAP__FUNCTION,
+ sample->ip, &al);
+
+ if (al.map != NULL) {
+ if (!al.map->dso->hit) {
+ al.map->dso->hit = 1;
+ if (map__load(al.map, NULL) >= 0) {
+ dso__inject_build_id(al.map->dso, tool, machine);
+ /*
+ * If this fails, too bad, let the other side
+ * account this as unresolved.
+ */
+ } else {
+#ifdef HAVE_LIBELF_SUPPORT
+ pr_warning("no symbols found in %s, maybe "
+ "install a debug package?\n",
+ al.map->dso->long_name);
+#endif
+ }
+ }
+ }
+
+repipe:
+ perf_event__repipe(tool, event, sample, machine);
+ return 0;
+}
+
+static int perf_inject__sched_process_exit(struct perf_tool *tool,
+ union perf_event *event __maybe_unused,
+ struct perf_sample *sample,
+ struct perf_evsel *evsel __maybe_unused,
+ struct machine *machine __maybe_unused)
+{
+ struct perf_inject *inject = container_of(tool, struct perf_inject, tool);
+ struct event_entry *ent;
+
+ list_for_each_entry(ent, &inject->samples, node) {
+ if (sample->tid == ent->tid) {
+ list_del_init(&ent->node);
+ free(ent);
+ break;
+ }
+ }
+
+ return 0;
+}
+
+static int perf_inject__sched_switch(struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_sample *sample,
+ struct perf_evsel *evsel,
+ struct machine *machine)
+{
+ struct perf_inject *inject = container_of(tool, struct perf_inject, tool);
+ struct event_entry *ent;
+
+ perf_inject__sched_process_exit(tool, event, sample, evsel, machine);
+
+ ent = malloc(event->header.size + sizeof(struct event_entry));
+ if (ent == NULL) {
+ color_fprintf(stderr, PERF_COLOR_RED,
+ "Not enough memory to process sched switch event!");
+ return -1;
+ }
+
+ ent->tid = sample->tid;
+ memcpy(&ent->event, event, event->header.size);
+ list_add(&ent->node, &inject->samples);
+ return 0;
+}
+
+static int perf_inject__sched_stat(struct perf_tool *tool,
+ union perf_event *event __maybe_unused,
+ struct perf_sample *sample,
+ struct perf_evsel *evsel,
+ struct machine *machine)
+{
+ struct event_entry *ent;
+ union perf_event *event_sw;
+ struct perf_sample sample_sw;
+ struct perf_inject *inject = container_of(tool, struct perf_inject, tool);
+ u32 pid = perf_evsel__intval(evsel, sample, "pid");
+
+ list_for_each_entry(ent, &inject->samples, node) {
+ if (pid == ent->tid)
+ goto found;
+ }
+
+ return 0;
+found:
+ event_sw = &ent->event[0];
+ perf_evsel__parse_sample(evsel, event_sw, &sample_sw);
+
+ sample_sw.period = sample->period;
+ sample_sw.time = sample->time;
+ perf_event__synthesize_sample(event_sw, evsel->attr.sample_type,
+ evsel->attr.read_format, &sample_sw,
+ false);
+ build_id__mark_dso_hit(tool, event_sw, &sample_sw, evsel, machine);
+ return perf_event__repipe(tool, event_sw, &sample_sw, machine);
+}
+
+static void sig_handler(int sig __maybe_unused)
+{
+ session_done = 1;
+}
+
+static int perf_evsel__check_stype(struct perf_evsel *evsel,
+ u64 sample_type, const char *sample_msg)
+{
+ struct perf_event_attr *attr = &evsel->attr;
+ const char *name = perf_evsel__name(evsel);
+
+ if (!(attr->sample_type & sample_type)) {
+ pr_err("Samples for %s event do not have %s attribute set.",
+ name, sample_msg);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int __cmd_inject(struct perf_inject *inject)
+{
+ struct perf_session *session;
+ int ret = -EINVAL;
+ struct perf_data_file file = {
+ .path = inject->input_name,
+ .mode = PERF_DATA_MODE_READ,
+ };
+ struct perf_data_file *file_out = &inject->output;
+
+ signal(SIGINT, sig_handler);
+
+ if (inject->build_ids || inject->sched_stat) {
+ inject->tool.mmap = perf_event__repipe_mmap;
+ inject->tool.mmap2 = perf_event__repipe_mmap2;
+ inject->tool.fork = perf_event__repipe_fork;
+ inject->tool.tracing_data = perf_event__repipe_tracing_data;
+ }
+
+ session = perf_session__new(&file, true, &inject->tool);
+ if (session == NULL)
+ return -ENOMEM;
+
+ if (inject->build_ids) {
+ inject->tool.sample = perf_event__inject_buildid;
+ } else if (inject->sched_stat) {
+ struct perf_evsel *evsel;
+
+ inject->tool.ordered_samples = true;
+
+ evlist__for_each(session->evlist, evsel) {
+ const char *name = perf_evsel__name(evsel);
+
+ if (!strcmp(name, "sched:sched_switch")) {
+ if (perf_evsel__check_stype(evsel, PERF_SAMPLE_TID, "TID"))
+ return -EINVAL;
+
+ evsel->handler = perf_inject__sched_switch;
+ } else if (!strcmp(name, "sched:sched_process_exit"))
+ evsel->handler = perf_inject__sched_process_exit;
+ else if (!strncmp(name, "sched:sched_stat_", 17))
+ evsel->handler = perf_inject__sched_stat;
+ }
+ }
+
+ if (!file_out->is_pipe)
+ lseek(file_out->fd, session->header.data_offset, SEEK_SET);
+
+ ret = perf_session__process_events(session, &inject->tool);
+
+ if (!file_out->is_pipe) {
+ session->header.data_size = inject->bytes_written;
+ perf_session__write_header(session, session->evlist, file_out->fd, true);
+ }
+
+ perf_session__delete(session);
+
+ return ret;
+}
+
+int cmd_inject(int argc, const char **argv, const char *prefix __maybe_unused)
+{
+ struct perf_inject inject = {
+ .tool = {
+ .sample = perf_event__repipe_sample,
+ .mmap = perf_event__repipe,
+ .mmap2 = perf_event__repipe,
+ .comm = perf_event__repipe,
+ .fork = perf_event__repipe,
+ .exit = perf_event__repipe,
+ .lost = perf_event__repipe,
+ .read = perf_event__repipe_sample,
+ .throttle = perf_event__repipe,
+ .unthrottle = perf_event__repipe,
+ .attr = perf_event__repipe_attr,
+ .tracing_data = perf_event__repipe_op2_synth,
+ .finished_round = perf_event__repipe_op2_synth,
+ .build_id = perf_event__repipe_op2_synth,
+ },
+ .input_name = "-",
+ .samples = LIST_HEAD_INIT(inject.samples),
+ .output = {
+ .path = "-",
+ .mode = PERF_DATA_MODE_WRITE,
+ },
+ };
+ const struct option options[] = {
+ OPT_BOOLEAN('b', "build-ids", &inject.build_ids,
+ "Inject build-ids into the output stream"),
+ OPT_STRING('i', "input", &inject.input_name, "file",
+ "input file name"),
+ OPT_STRING('o', "output", &inject.output.path, "file",
+ "output file name"),
+ OPT_BOOLEAN('s', "sched-stat", &inject.sched_stat,
+ "Merge sched-stat and sched-switch for getting events "
+ "where and how long tasks slept"),
+ OPT_INCR('v', "verbose", &verbose,
+ "be more verbose (show build ids, etc)"),
+ OPT_END()
+ };
+ const char * const inject_usage[] = {
+ "perf inject [<options>]",
+ NULL
+ };
+
+ argc = parse_options(argc, argv, options, inject_usage, 0);
+
+ /*
+ * Any (unrecognized) arguments left?
+ */
+ if (argc)
+ usage_with_options(inject_usage, options);
+
+ if (perf_data_file__open(&inject.output)) {
+ perror("failed to create output file");
+ return -1;
+ }
+
+ if (symbol__init() < 0)
+ return -1;
+
+ return __cmd_inject(&inject);
+}
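
As an illustration of the pass-through-with-overrides pattern that cmd_inject() sets up above (a struct perf_tool whose callbacks default to perf_event__repipe() and are selectively replaced, e.g. the sample callback for -b or -s), here is a minimal standalone C sketch. It is not part of the patch; the names toy_event, toy_tool, repipe and rewrite_sample are hypothetical.

#include <stdio.h>
#include <string.h>

struct toy_event { int type; char payload[32]; };

typedef int (*toy_handler)(struct toy_event *ev);

struct toy_tool { toy_handler mmap, comm, sample; };

static int repipe(struct toy_event *ev)
{
	/* default: pass the event through to the output unchanged */
	printf("repipe  type=%d payload=%s\n", ev->type, ev->payload);
	return 0;
}

static int rewrite_sample(struct toy_event *ev)
{
	/* override: amend the event before writing it back out */
	strncat(ev->payload, "+buildid",
		sizeof(ev->payload) - strlen(ev->payload) - 1);
	return repipe(ev);
}

int main(void)
{
	struct toy_tool tool = { .mmap = repipe, .comm = repipe, .sample = repipe };
	struct toy_event mmap_ev = { 1, "mmap" }, sample_ev = { 2, "sample" };

	tool.sample = rewrite_sample;	/* analogous to overriding tool.sample in cmd_inject */
	tool.mmap(&mmap_ev);
	tool.sample(&sample_ev);
	return 0;
}
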
diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c
index 924a9518931..bef3376bfaf 100644
--- a/tools/perf/builtin-kmem.c
+++ b/tools/perf/builtin-kmem.c
@@ -1,25 +1,29 @@
#include "builtin.h"
#include "perf.h"
+#include "util/evlist.h"
+#include "util/evsel.h"
#include "util/util.h"
#include "util/cache.h"
#include "util/symbol.h"
#include "util/thread.h"
#include "util/header.h"
#include "util/session.h"
+#include "util/tool.h"
#include "util/parse-options.h"
#include "util/trace-event.h"
+#include "util/data.h"
+#include "util/cpumap.h"
#include "util/debug.h"
#include <linux/rbtree.h>
+#include <linux/string.h>
struct alloc_stat;
typedef int (*sort_fn_t)(struct alloc_stat *, struct alloc_stat *);
-static char const *input_name = "perf.data";
-
static int alloc_flag;
static int caller_flag;
@@ -28,11 +32,6 @@ static int caller_lines = -1;
static bool raw_ip;
-static char default_sort_order[] = "frag,hit,bytes";
-
-static int *cpunode_map;
-static int max_cpu_num;
-
struct alloc_stat {
u64 call_site;
u64 ptr;
@@ -54,64 +53,8 @@ static struct rb_root root_caller_sorted;
static unsigned long total_requested, total_allocated;
static unsigned long nr_allocs, nr_cross_allocs;
-#define PATH_SYS_NODE "/sys/devices/system/node"
-
-static void init_cpunode_map(void)
-{
- FILE *fp;
- int i;
-
- fp = fopen("/sys/devices/system/cpu/kernel_max", "r");
- if (!fp) {
- max_cpu_num = 4096;
- return;
- }
-
- if (fscanf(fp, "%d", &max_cpu_num) < 1)
- die("Failed to read 'kernel_max' from sysfs");
- max_cpu_num++;
-
- cpunode_map = calloc(max_cpu_num, sizeof(int));
- if (!cpunode_map)
- die("calloc");
- for (i = 0; i < max_cpu_num; i++)
- cpunode_map[i] = -1;
- fclose(fp);
-}
-
-static void setup_cpunode_map(void)
-{
- struct dirent *dent1, *dent2;
- DIR *dir1, *dir2;
- unsigned int cpu, mem;
- char buf[PATH_MAX];
-
- init_cpunode_map();
-
- dir1 = opendir(PATH_SYS_NODE);
- if (!dir1)
- return;
-
- while ((dent1 = readdir(dir1)) != NULL) {
- if (dent1->d_type != DT_DIR ||
- sscanf(dent1->d_name, "node%u", &mem) < 1)
- continue;
-
- snprintf(buf, PATH_MAX, "%s/%s", PATH_SYS_NODE, dent1->d_name);
- dir2 = opendir(buf);
- if (!dir2)
- continue;
- while ((dent2 = readdir(dir2)) != NULL) {
- if (dent2->d_type != DT_LNK ||
- sscanf(dent2->d_name, "cpu%u", &cpu) < 1)
- continue;
- cpunode_map[cpu] = mem;
- }
- }
-}
-
-static void insert_alloc_stat(unsigned long call_site, unsigned long ptr,
- int bytes_req, int bytes_alloc, int cpu)
+static int insert_alloc_stat(unsigned long call_site, unsigned long ptr,
+ int bytes_req, int bytes_alloc, int cpu)
{
struct rb_node **node = &root_alloc_stat.rb_node;
struct rb_node *parent = NULL;
@@ -135,8 +78,10 @@ static void insert_alloc_stat(unsigned long call_site, unsigned long ptr,
data->bytes_alloc += bytes_alloc;
} else {
data = malloc(sizeof(*data));
- if (!data)
- die("malloc");
+ if (!data) {
+ pr_err("%s: malloc failed\n", __func__);
+ return -1;
+ }
data->ptr = ptr;
data->pingpong = 0;
data->hit = 1;
@@ -148,9 +93,10 @@ static void insert_alloc_stat(unsigned long call_site, unsigned long ptr,
}
data->call_site = call_site;
data->alloc_cpu = cpu;
+ return 0;
}
-static void insert_caller_stat(unsigned long call_site,
+static int insert_caller_stat(unsigned long call_site,
int bytes_req, int bytes_alloc)
{
struct rb_node **node = &root_caller_stat.rb_node;
@@ -175,8 +121,10 @@ static void insert_caller_stat(unsigned long call_site,
data->bytes_alloc += bytes_alloc;
} else {
data = malloc(sizeof(*data));
- if (!data)
- die("malloc");
+ if (!data) {
+ pr_err("%s: malloc failed\n", __func__);
+ return -1;
+ }
data->call_site = call_site;
data->pingpong = 0;
data->hit = 1;
@@ -186,39 +134,43 @@ static void insert_caller_stat(unsigned long call_site,
rb_link_node(&data->node, parent, node);
rb_insert_color(&data->node, &root_caller_stat);
}
+
+ return 0;
}
-static void process_alloc_event(void *data,
- struct event *event,
- int cpu,
- u64 timestamp __used,
- struct thread *thread __used,
- int node)
+static int perf_evsel__process_alloc_event(struct perf_evsel *evsel,
+ struct perf_sample *sample)
{
- unsigned long call_site;
- unsigned long ptr;
- int bytes_req;
- int bytes_alloc;
- int node1, node2;
-
- ptr = raw_field_value(event, "ptr", data);
- call_site = raw_field_value(event, "call_site", data);
- bytes_req = raw_field_value(event, "bytes_req", data);
- bytes_alloc = raw_field_value(event, "bytes_alloc", data);
+ unsigned long ptr = perf_evsel__intval(evsel, sample, "ptr"),
+ call_site = perf_evsel__intval(evsel, sample, "call_site");
+ int bytes_req = perf_evsel__intval(evsel, sample, "bytes_req"),
+ bytes_alloc = perf_evsel__intval(evsel, sample, "bytes_alloc");
- insert_alloc_stat(call_site, ptr, bytes_req, bytes_alloc, cpu);
- insert_caller_stat(call_site, bytes_req, bytes_alloc);
+ if (insert_alloc_stat(call_site, ptr, bytes_req, bytes_alloc, sample->cpu) ||
+ insert_caller_stat(call_site, bytes_req, bytes_alloc))
+ return -1;
total_requested += bytes_req;
total_allocated += bytes_alloc;
- if (node) {
- node1 = cpunode_map[cpu];
- node2 = raw_field_value(event, "node", data);
+ nr_allocs++;
+ return 0;
+}
+
+static int perf_evsel__process_alloc_node_event(struct perf_evsel *evsel,
+ struct perf_sample *sample)
+{
+ int ret = perf_evsel__process_alloc_event(evsel, sample);
+
+ if (!ret) {
+ int node1 = cpu__get_node(sample->cpu),
+ node2 = perf_evsel__intval(evsel, sample, "node");
+
if (node1 != node2)
nr_cross_allocs++;
}
- nr_allocs++;
+
+ return ret;
}
static int ptr_cmp(struct alloc_stat *, struct alloc_stat *);
@@ -249,94 +201,62 @@ static struct alloc_stat *search_alloc_stat(unsigned long ptr,
return NULL;
}
-static void process_free_event(void *data,
- struct event *event,
- int cpu,
- u64 timestamp __used,
- struct thread *thread __used)
+static int perf_evsel__process_free_event(struct perf_evsel *evsel,
+ struct perf_sample *sample)
{
- unsigned long ptr;
+ unsigned long ptr = perf_evsel__intval(evsel, sample, "ptr");
struct alloc_stat *s_alloc, *s_caller;
- ptr = raw_field_value(event, "ptr", data);
-
s_alloc = search_alloc_stat(ptr, 0, &root_alloc_stat, ptr_cmp);
if (!s_alloc)
- return;
+ return 0;
- if (cpu != s_alloc->alloc_cpu) {
+ if ((short)sample->cpu != s_alloc->alloc_cpu) {
s_alloc->pingpong++;
s_caller = search_alloc_stat(0, s_alloc->call_site,
&root_caller_stat, callsite_cmp);
- assert(s_caller);
+ if (!s_caller)
+ return -1;
s_caller->pingpong++;
}
s_alloc->alloc_cpu = -1;
-}
-
-static void
-process_raw_event(event_t *raw_event __used, void *data,
- int cpu, u64 timestamp, struct thread *thread)
-{
- struct event *event;
- int type;
-
- type = trace_parse_common_type(data);
- event = trace_find_event(type);
-
- if (!strcmp(event->name, "kmalloc") ||
- !strcmp(event->name, "kmem_cache_alloc")) {
- process_alloc_event(data, event, cpu, timestamp, thread, 0);
- return;
- }
- if (!strcmp(event->name, "kmalloc_node") ||
- !strcmp(event->name, "kmem_cache_alloc_node")) {
- process_alloc_event(data, event, cpu, timestamp, thread, 1);
- return;
- }
-
- if (!strcmp(event->name, "kfree") ||
- !strcmp(event->name, "kmem_cache_free")) {
- process_free_event(data, event, cpu, timestamp, thread);
- return;
- }
+ return 0;
}
-static int process_sample_event(event_t *event, struct perf_session *session)
-{
- struct sample_data data;
- struct thread *thread;
-
- memset(&data, 0, sizeof(data));
- data.time = -1;
- data.cpu = -1;
- data.period = 1;
+typedef int (*tracepoint_handler)(struct perf_evsel *evsel,
+ struct perf_sample *sample);
- event__parse_sample(event, session->sample_type, &data);
-
- dump_printf("(IP, %d): %d/%d: %#Lx period: %Ld\n", event->header.misc,
- data.pid, data.tid, data.ip, data.period);
+static int process_sample_event(struct perf_tool *tool __maybe_unused,
+ union perf_event *event,
+ struct perf_sample *sample,
+ struct perf_evsel *evsel,
+ struct machine *machine)
+{
+ struct thread *thread = machine__findnew_thread(machine, sample->pid,
+ sample->tid);
- thread = perf_session__findnew(session, event->ip.pid);
if (thread == NULL) {
pr_debug("problem processing %d event, skipping it.\n",
event->header.type);
return -1;
}
- dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
+ dump_printf(" ... thread: %s:%d\n", thread__comm_str(thread), thread->tid);
- process_raw_event(event, data.raw_data, data.cpu,
- data.time, thread);
+ if (evsel->handler != NULL) {
+ tracepoint_handler f = evsel->handler;
+ return f(evsel, sample);
+ }
return 0;
}
-static struct perf_event_ops event_ops = {
- .sample = process_sample_event,
- .comm = event__process_comm,
+static struct perf_tool perf_kmem = {
+ .sample = process_sample_event,
+ .comm = perf_event__process_comm,
+ .ordered_samples = true,
};
static double fragmentation(unsigned long n_req, unsigned long n_alloc)
@@ -351,6 +271,7 @@ static void __print_result(struct rb_root *root, struct perf_session *session,
int n_lines, int is_caller)
{
struct rb_node *next;
+ struct machine *machine = &session->machines.host;
printf("%.102s\n", graph_dotted_line);
printf(" %-34s |", is_caller ? "Callsite": "Alloc Ptr");
@@ -363,21 +284,22 @@ static void __print_result(struct rb_root *root, struct perf_session *session,
struct alloc_stat *data = rb_entry(next, struct alloc_stat,
node);
struct symbol *sym = NULL;
+ struct map *map;
char buf[BUFSIZ];
u64 addr;
if (is_caller) {
addr = data->call_site;
if (!raw_ip)
- sym = map_groups__find_function(&session->kmaps, addr, NULL);
+ sym = machine__find_kernel_function(machine, addr, &map, NULL);
} else
addr = data->ptr;
if (sym != NULL)
- snprintf(buf, sizeof(buf), "%s+%Lx", sym->name,
- addr - sym->start);
+ snprintf(buf, sizeof(buf), "%s+%" PRIx64 "", sym->name,
+ addr - map->unmap_ip(map, sym->start));
else
- snprintf(buf, sizeof(buf), "%#Lx", addr);
+ snprintf(buf, sizeof(buf), "%#" PRIx64 "", addr);
printf(" %-34s |", buf);
printf(" %9llu/%-5lu | %9llu/%-5lu | %8lu | %8lu | %6.3f%%\n",
@@ -484,15 +406,37 @@ static void sort_result(void)
static int __cmd_kmem(void)
{
int err = -EINVAL;
- struct perf_session *session = perf_session__new(input_name, O_RDONLY, 0);
+ struct perf_session *session;
+ const struct perf_evsel_str_handler kmem_tracepoints[] = {
+ { "kmem:kmalloc", perf_evsel__process_alloc_event, },
+ { "kmem:kmem_cache_alloc", perf_evsel__process_alloc_event, },
+ { "kmem:kmalloc_node", perf_evsel__process_alloc_node_event, },
+ { "kmem:kmem_cache_alloc_node", perf_evsel__process_alloc_node_event, },
+ { "kmem:kfree", perf_evsel__process_free_event, },
+ { "kmem:kmem_cache_free", perf_evsel__process_free_event, },
+ };
+ struct perf_data_file file = {
+ .path = input_name,
+ .mode = PERF_DATA_MODE_READ,
+ };
+
+ session = perf_session__new(&file, false, &perf_kmem);
if (session == NULL)
return -ENOMEM;
+ if (perf_session__create_kernel_maps(session) < 0)
+ goto out_delete;
+
if (!perf_session__has_traces(session, "kmem record"))
goto out_delete;
+ if (perf_session__set_tracepoints_handlers(session, kmem_tracepoints)) {
+ pr_err("Initializing perf session tracepoint handlers failed\n");
+ return -1;
+ }
+
setup_pager();
- err = perf_session__process_events(session, &event_ops);
+ err = perf_session__process_events(session, &perf_kmem);
if (err != 0)
goto out_delete;
sort_result();
@@ -502,11 +446,6 @@ out_delete:
return err;
}
-static const char * const kmem_usage[] = {
- "perf kmem [<options>] {record|stat}",
- NULL
-};
-
static int ptr_cmp(struct alloc_stat *l, struct alloc_stat *r)
{
if (l->ptr < r->ptr)
@@ -605,8 +544,7 @@ static struct sort_dimension *avail_sorts[] = {
&pingpong_sort_dimension,
};
-#define NUM_AVAIL_SORTS \
- (int)(sizeof(avail_sorts) / sizeof(struct sort_dimension *))
+#define NUM_AVAIL_SORTS ((int)ARRAY_SIZE(avail_sorts))
static int sort_dimension__add(const char *tok, struct list_head *list)
{
@@ -615,10 +553,11 @@ static int sort_dimension__add(const char *tok, struct list_head *list)
for (i = 0; i < NUM_AVAIL_SORTS; i++) {
if (!strcmp(avail_sorts[i]->name, tok)) {
- sort = malloc(sizeof(*sort));
- if (!sort)
- die("malloc");
- memcpy(sort, avail_sorts[i], sizeof(*sort));
+ sort = memdup(avail_sorts[i], sizeof(*avail_sorts[i]));
+ if (!sort) {
+ pr_err("%s: memdup failed\n", __func__);
+ return -1;
+ }
list_add_tail(&sort->list, list);
return 0;
}
@@ -632,8 +571,10 @@ static int setup_sorting(struct list_head *sort_list, const char *arg)
char *tok;
char *str = strdup(arg);
- if (!str)
- die("strdup");
+ if (!str) {
+ pr_err("%s: strdup failed\n", __func__);
+ return -1;
+ }
while (true) {
tok = strsep(&str, ",");
@@ -641,6 +582,7 @@ static int setup_sorting(struct list_head *sort_list, const char *arg)
break;
if (sort_dimension__add(tok, sort_list) < 0) {
error("Unknown --sort key: '%s'", tok);
+ free(str);
return -1;
}
}
@@ -649,8 +591,8 @@ static int setup_sorting(struct list_head *sort_list, const char *arg)
return 0;
}
-static int parse_sort_opt(const struct option *opt __used,
- const char *arg, int unset __used)
+static int parse_sort_opt(const struct option *opt __maybe_unused,
+ const char *arg, int unset __maybe_unused)
{
if (!arg)
return -1;
@@ -663,22 +605,24 @@ static int parse_sort_opt(const struct option *opt __used,
return 0;
}
-static int parse_caller_opt(const struct option *opt __used,
- const char *arg __used, int unset __used)
+static int parse_caller_opt(const struct option *opt __maybe_unused,
+ const char *arg __maybe_unused,
+ int unset __maybe_unused)
{
caller_flag = (alloc_flag + 1);
return 0;
}
-static int parse_alloc_opt(const struct option *opt __used,
- const char *arg __used, int unset __used)
+static int parse_alloc_opt(const struct option *opt __maybe_unused,
+ const char *arg __maybe_unused,
+ int unset __maybe_unused)
{
alloc_flag = (caller_flag + 1);
return 0;
}
-static int parse_line_opt(const struct option *opt __used,
- const char *arg, int unset __used)
+static int parse_line_opt(const struct option *opt __maybe_unused,
+ const char *arg, int unset __maybe_unused)
{
int lines;
@@ -695,48 +639,26 @@ static int parse_line_opt(const struct option *opt __used,
return 0;
}
-static const struct option kmem_options[] = {
- OPT_STRING('i', "input", &input_name, "file",
- "input file name"),
- OPT_CALLBACK_NOOPT(0, "caller", NULL, NULL,
- "show per-callsite statistics",
- parse_caller_opt),
- OPT_CALLBACK_NOOPT(0, "alloc", NULL, NULL,
- "show per-allocation statistics",
- parse_alloc_opt),
- OPT_CALLBACK('s', "sort", NULL, "key[,key2...]",
- "sort by keys: ptr, call_site, bytes, hit, pingpong, frag",
- parse_sort_opt),
- OPT_CALLBACK('l', "line", NULL, "num",
- "show n lines",
- parse_line_opt),
- OPT_BOOLEAN(0, "raw-ip", &raw_ip, "show raw ip instead of symbol"),
- OPT_END()
-};
-
-static const char *record_args[] = {
- "record",
- "-a",
- "-R",
- "-M",
- "-f",
- "-c", "1",
+static int __cmd_record(int argc, const char **argv)
+{
+ const char * const record_args[] = {
+ "record", "-a", "-R", "-c", "1",
"-e", "kmem:kmalloc",
"-e", "kmem:kmalloc_node",
"-e", "kmem:kfree",
"-e", "kmem:kmem_cache_alloc",
"-e", "kmem:kmem_cache_alloc_node",
"-e", "kmem:kmem_cache_free",
-};
-
-static int __cmd_record(int argc, const char **argv)
-{
+ };
unsigned int rec_argc, i, j;
const char **rec_argv;
rec_argc = ARRAY_SIZE(record_args) + argc - 1;
rec_argv = calloc(rec_argc + 1, sizeof(char *));
+ if (rec_argv == NULL)
+ return -ENOMEM;
+
for (i = 0; i < ARRAY_SIZE(record_args); i++)
rec_argv[i] = strdup(record_args[i]);
@@ -746,9 +668,29 @@ static int __cmd_record(int argc, const char **argv)
return cmd_record(i, rec_argv, NULL);
}
-int cmd_kmem(int argc, const char **argv, const char *prefix __used)
+int cmd_kmem(int argc, const char **argv, const char *prefix __maybe_unused)
{
- argc = parse_options(argc, argv, kmem_options, kmem_usage, 0);
+ const char * const default_sort_order = "frag,hit,bytes";
+ const struct option kmem_options[] = {
+ OPT_STRING('i', "input", &input_name, "file", "input file name"),
+ OPT_CALLBACK_NOOPT(0, "caller", NULL, NULL,
+ "show per-callsite statistics", parse_caller_opt),
+ OPT_CALLBACK_NOOPT(0, "alloc", NULL, NULL,
+ "show per-allocation statistics", parse_alloc_opt),
+ OPT_CALLBACK('s', "sort", NULL, "key[,key2...]",
+ "sort by keys: ptr, call_site, bytes, hit, pingpong, frag",
+ parse_sort_opt),
+ OPT_CALLBACK('l', "line", NULL, "num", "show n lines", parse_line_opt),
+ OPT_BOOLEAN(0, "raw-ip", &raw_ip, "show raw ip instead of symbol"),
+ OPT_END()
+ };
+ const char *const kmem_subcommands[] = { "record", "stat", NULL };
+ const char *kmem_usage[] = {
+ NULL,
+ NULL
+ };
+ argc = parse_options_subcommand(argc, argv, kmem_options,
+ kmem_subcommands, kmem_usage, 0);
if (!argc)
usage_with_options(kmem_usage, kmem_options);
@@ -758,7 +700,8 @@ int cmd_kmem(int argc, const char **argv, const char *prefix __used)
if (!strncmp(argv[0], "rec", 3)) {
return __cmd_record(argc, argv);
} else if (!strcmp(argv[0], "stat")) {
- setup_cpunode_map();
+ if (cpu__setup_cpunode_map())
+ return -1;
if (list_empty(&caller_sort))
setup_sorting(&caller_sort, default_sort_order);
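
The builtin-kmem.c conversion above replaces the hand-rolled event-name switch with per-tracepoint handlers registered via perf_session__set_tracepoints_handlers() and dispatched from a single sample callback. The following standalone sketch shows that name-to-handler dispatch in isolation; it is not part of the patch, and the names toy_sample, handle_alloc and handle_free are hypothetical.

#include <stdio.h>
#include <string.h>

struct toy_sample { const char *event_name; long value; };

typedef int (*tp_handler)(struct toy_sample *s);

static long total_alloc, total_free;

static int handle_alloc(struct toy_sample *s) { total_alloc += s->value; return 0; }
static int handle_free(struct toy_sample *s)  { total_free  += s->value; return 0; }

/* table pairing tracepoint names with their handlers */
static const struct { const char *name; tp_handler handler; } handlers[] = {
	{ "kmem:kmalloc", handle_alloc },
	{ "kmem:kfree",   handle_free  },
};

/* single sample callback: look up the handler by tracepoint name and dispatch */
static int process_sample(struct toy_sample *s)
{
	size_t i;

	for (i = 0; i < sizeof(handlers) / sizeof(handlers[0]); i++)
		if (!strcmp(handlers[i].name, s->event_name))
			return handlers[i].handler(s);
	return 0;	/* unknown events are simply ignored */
}

int main(void)
{
	struct toy_sample samples[] = {
		{ "kmem:kmalloc", 128 }, { "kmem:kfree", 128 }, { "kmem:kmalloc", 64 },
	};
	size_t i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		process_sample(&samples[i]);
	printf("allocated=%ld freed=%ld\n", total_alloc, total_free);
	return 0;
}
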
diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c
new file mode 100644
index 00000000000..0f1e5a2f6ad
--- /dev/null
+++ b/tools/perf/builtin-kvm.c
@@ -0,0 +1,1737 @@
+#include "builtin.h"
+#include "perf.h"
+
+#include "util/evsel.h"
+#include "util/evlist.h"
+#include "util/util.h"
+#include "util/cache.h"
+#include "util/symbol.h"
+#include "util/thread.h"
+#include "util/header.h"
+#include "util/session.h"
+#include "util/intlist.h"
+#include "util/parse-options.h"
+#include "util/trace-event.h"
+#include "util/debug.h"
+#include <api/fs/debugfs.h>
+#include "util/tool.h"
+#include "util/stat.h"
+#include "util/top.h"
+#include "util/data.h"
+
+#include <sys/prctl.h>
+#ifdef HAVE_TIMERFD_SUPPORT
+#include <sys/timerfd.h>
+#endif
+
+#include <termios.h>
+#include <semaphore.h>
+#include <pthread.h>
+#include <math.h>
+
+#if defined(__i386__) || defined(__x86_64__)
+#include <asm/svm.h>
+#include <asm/vmx.h>
+#include <asm/kvm.h>
+
+struct event_key {
+ #define INVALID_KEY (~0ULL)
+ u64 key;
+ int info;
+};
+
+struct kvm_event_stats {
+ u64 time;
+ struct stats stats;
+};
+
+struct kvm_event {
+ struct list_head hash_entry;
+ struct rb_node rb;
+
+ struct event_key key;
+
+ struct kvm_event_stats total;
+
+ #define DEFAULT_VCPU_NUM 8
+ int max_vcpu;
+ struct kvm_event_stats *vcpu;
+};
+
+typedef int (*key_cmp_fun)(struct kvm_event*, struct kvm_event*, int);
+
+struct kvm_event_key {
+ const char *name;
+ key_cmp_fun key;
+};
+
+
+struct perf_kvm_stat;
+
+struct kvm_events_ops {
+ bool (*is_begin_event)(struct perf_evsel *evsel,
+ struct perf_sample *sample,
+ struct event_key *key);
+ bool (*is_end_event)(struct perf_evsel *evsel,
+ struct perf_sample *sample, struct event_key *key);
+ void (*decode_key)(struct perf_kvm_stat *kvm, struct event_key *key,
+ char decode[20]);
+ const char *name;
+};
+
+struct exit_reasons_table {
+ unsigned long exit_code;
+ const char *reason;
+};
+
+#define EVENTS_BITS 12
+#define EVENTS_CACHE_SIZE (1UL << EVENTS_BITS)
+
+struct perf_kvm_stat {
+ struct perf_tool tool;
+ struct record_opts opts;
+ struct perf_evlist *evlist;
+ struct perf_session *session;
+
+ const char *file_name;
+ const char *report_event;
+ const char *sort_key;
+ int trace_vcpu;
+
+ struct exit_reasons_table *exit_reasons;
+ int exit_reasons_size;
+ const char *exit_reasons_isa;
+
+ struct kvm_events_ops *events_ops;
+ key_cmp_fun compare;
+ struct list_head kvm_events_cache[EVENTS_CACHE_SIZE];
+
+ u64 total_time;
+ u64 total_count;
+ u64 lost_events;
+ u64 duration;
+
+ const char *pid_str;
+ struct intlist *pid_list;
+
+ struct rb_root result;
+
+ int timerfd;
+ unsigned int display_time;
+ bool live;
+};
+
+
+static void exit_event_get_key(struct perf_evsel *evsel,
+ struct perf_sample *sample,
+ struct event_key *key)
+{
+ key->info = 0;
+ key->key = perf_evsel__intval(evsel, sample, "exit_reason");
+}
+
+static bool kvm_exit_event(struct perf_evsel *evsel)
+{
+ return !strcmp(evsel->name, "kvm:kvm_exit");
+}
+
+static bool exit_event_begin(struct perf_evsel *evsel,
+ struct perf_sample *sample, struct event_key *key)
+{
+ if (kvm_exit_event(evsel)) {
+ exit_event_get_key(evsel, sample, key);
+ return true;
+ }
+
+ return false;
+}
+
+static bool kvm_entry_event(struct perf_evsel *evsel)
+{
+ return !strcmp(evsel->name, "kvm:kvm_entry");
+}
+
+static bool exit_event_end(struct perf_evsel *evsel,
+ struct perf_sample *sample __maybe_unused,
+ struct event_key *key __maybe_unused)
+{
+ return kvm_entry_event(evsel);
+}
+
+static struct exit_reasons_table vmx_exit_reasons[] = {
+ VMX_EXIT_REASONS
+};
+
+static struct exit_reasons_table svm_exit_reasons[] = {
+ SVM_EXIT_REASONS
+};
+
+static const char *get_exit_reason(struct perf_kvm_stat *kvm, u64 exit_code)
+{
+ int i = kvm->exit_reasons_size;
+ struct exit_reasons_table *tbl = kvm->exit_reasons;
+
+ while (i--) {
+ if (tbl->exit_code == exit_code)
+ return tbl->reason;
+ tbl++;
+ }
+
+ pr_err("unknown kvm exit code:%lld on %s\n",
+ (unsigned long long)exit_code, kvm->exit_reasons_isa);
+ return "UNKNOWN";
+}
+
+static void exit_event_decode_key(struct perf_kvm_stat *kvm,
+ struct event_key *key,
+ char decode[20])
+{
+ const char *exit_reason = get_exit_reason(kvm, key->key);
+
+ scnprintf(decode, 20, "%s", exit_reason);
+}
+
+static struct kvm_events_ops exit_events = {
+ .is_begin_event = exit_event_begin,
+ .is_end_event = exit_event_end,
+ .decode_key = exit_event_decode_key,
+ .name = "VM-EXIT"
+};
+
+/*
+ * For the mmio events, we treat:
+ * the time of MMIO write: kvm_mmio(KVM_TRACE_MMIO_WRITE...) -> kvm_entry
+ * the time of MMIO read: kvm_exit -> kvm_mmio(KVM_TRACE_MMIO_READ...).
+ */
+static void mmio_event_get_key(struct perf_evsel *evsel, struct perf_sample *sample,
+ struct event_key *key)
+{
+ key->key = perf_evsel__intval(evsel, sample, "gpa");
+ key->info = perf_evsel__intval(evsel, sample, "type");
+}
+
+#define KVM_TRACE_MMIO_READ_UNSATISFIED 0
+#define KVM_TRACE_MMIO_READ 1
+#define KVM_TRACE_MMIO_WRITE 2
+
+static bool mmio_event_begin(struct perf_evsel *evsel,
+ struct perf_sample *sample, struct event_key *key)
+{
+ /* MMIO read begin event in kernel. */
+ if (kvm_exit_event(evsel))
+ return true;
+
+ /* MMIO write begin event in kernel. */
+ if (!strcmp(evsel->name, "kvm:kvm_mmio") &&
+ perf_evsel__intval(evsel, sample, "type") == KVM_TRACE_MMIO_WRITE) {
+ mmio_event_get_key(evsel, sample, key);
+ return true;
+ }
+
+ return false;
+}
+
+static bool mmio_event_end(struct perf_evsel *evsel, struct perf_sample *sample,
+ struct event_key *key)
+{
+ /* MMIO write end event in kernel. */
+ if (kvm_entry_event(evsel))
+ return true;
+
+ /* MMIO read end event in kernel. */
+ if (!strcmp(evsel->name, "kvm:kvm_mmio") &&
+ perf_evsel__intval(evsel, sample, "type") == KVM_TRACE_MMIO_READ) {
+ mmio_event_get_key(evsel, sample, key);
+ return true;
+ }
+
+ return false;
+}
+
+static void mmio_event_decode_key(struct perf_kvm_stat *kvm __maybe_unused,
+ struct event_key *key,
+ char decode[20])
+{
+ scnprintf(decode, 20, "%#lx:%s", (unsigned long)key->key,
+ key->info == KVM_TRACE_MMIO_WRITE ? "W" : "R");
+}
+
+static struct kvm_events_ops mmio_events = {
+ .is_begin_event = mmio_event_begin,
+ .is_end_event = mmio_event_end,
+ .decode_key = mmio_event_decode_key,
+ .name = "MMIO Access"
+};
+
+ /* The time of emulation pio access is from kvm_pio to kvm_entry. */
+static void ioport_event_get_key(struct perf_evsel *evsel,
+ struct perf_sample *sample,
+ struct event_key *key)
+{
+ key->key = perf_evsel__intval(evsel, sample, "port");
+ key->info = perf_evsel__intval(evsel, sample, "rw");
+}
+
+static bool ioport_event_begin(struct perf_evsel *evsel,
+ struct perf_sample *sample,
+ struct event_key *key)
+{
+ if (!strcmp(evsel->name, "kvm:kvm_pio")) {
+ ioport_event_get_key(evsel, sample, key);
+ return true;
+ }
+
+ return false;
+}
+
+static bool ioport_event_end(struct perf_evsel *evsel,
+ struct perf_sample *sample __maybe_unused,
+ struct event_key *key __maybe_unused)
+{
+ return kvm_entry_event(evsel);
+}
+
+static void ioport_event_decode_key(struct perf_kvm_stat *kvm __maybe_unused,
+ struct event_key *key,
+ char decode[20])
+{
+ scnprintf(decode, 20, "%#llx:%s", (unsigned long long)key->key,
+ key->info ? "POUT" : "PIN");
+}
+
+static struct kvm_events_ops ioport_events = {
+ .is_begin_event = ioport_event_begin,
+ .is_end_event = ioport_event_end,
+ .decode_key = ioport_event_decode_key,
+ .name = "IO Port Access"
+};
+
+static bool register_kvm_events_ops(struct perf_kvm_stat *kvm)
+{
+ bool ret = true;
+
+ if (!strcmp(kvm->report_event, "vmexit"))
+ kvm->events_ops = &exit_events;
+ else if (!strcmp(kvm->report_event, "mmio"))
+ kvm->events_ops = &mmio_events;
+ else if (!strcmp(kvm->report_event, "ioport"))
+ kvm->events_ops = &ioport_events;
+ else {
+ pr_err("Unknown report event:%s\n", kvm->report_event);
+ ret = false;
+ }
+
+ return ret;
+}
+
+struct vcpu_event_record {
+ int vcpu_id;
+ u64 start_time;
+ struct kvm_event *last_event;
+};
+
+
+static void init_kvm_event_record(struct perf_kvm_stat *kvm)
+{
+ unsigned int i;
+
+ for (i = 0; i < EVENTS_CACHE_SIZE; i++)
+ INIT_LIST_HEAD(&kvm->kvm_events_cache[i]);
+}
+
+#ifdef HAVE_TIMERFD_SUPPORT
+static void clear_events_cache_stats(struct list_head *kvm_events_cache)
+{
+ struct list_head *head;
+ struct kvm_event *event;
+ unsigned int i;
+ int j;
+
+ for (i = 0; i < EVENTS_CACHE_SIZE; i++) {
+ head = &kvm_events_cache[i];
+ list_for_each_entry(event, head, hash_entry) {
+ /* reset stats for event */
+ event->total.time = 0;
+ init_stats(&event->total.stats);
+
+ for (j = 0; j < event->max_vcpu; ++j) {
+ event->vcpu[j].time = 0;
+ init_stats(&event->vcpu[j].stats);
+ }
+ }
+ }
+}
+#endif
+
+static int kvm_events_hash_fn(u64 key)
+{
+ return key & (EVENTS_CACHE_SIZE - 1);
+}
+
+static bool kvm_event_expand(struct kvm_event *event, int vcpu_id)
+{
+ int old_max_vcpu = event->max_vcpu;
+ void *prev;
+
+ if (vcpu_id < event->max_vcpu)
+ return true;
+
+ while (event->max_vcpu <= vcpu_id)
+ event->max_vcpu += DEFAULT_VCPU_NUM;
+
+ prev = event->vcpu;
+ event->vcpu = realloc(event->vcpu,
+ event->max_vcpu * sizeof(*event->vcpu));
+ if (!event->vcpu) {
+ free(prev);
+ pr_err("Not enough memory\n");
+ return false;
+ }
+
+ memset(event->vcpu + old_max_vcpu, 0,
+ (event->max_vcpu - old_max_vcpu) * sizeof(*event->vcpu));
+ return true;
+}
+
+static struct kvm_event *kvm_alloc_init_event(struct event_key *key)
+{
+ struct kvm_event *event;
+
+ event = zalloc(sizeof(*event));
+ if (!event) {
+ pr_err("Not enough memory\n");
+ return NULL;
+ }
+
+ event->key = *key;
+ init_stats(&event->total.stats);
+ return event;
+}
+
+static struct kvm_event *find_create_kvm_event(struct perf_kvm_stat *kvm,
+ struct event_key *key)
+{
+ struct kvm_event *event;
+ struct list_head *head;
+
+ BUG_ON(key->key == INVALID_KEY);
+
+ head = &kvm->kvm_events_cache[kvm_events_hash_fn(key->key)];
+ list_for_each_entry(event, head, hash_entry) {
+ if (event->key.key == key->key && event->key.info == key->info)
+ return event;
+ }
+
+ event = kvm_alloc_init_event(key);
+ if (!event)
+ return NULL;
+
+ list_add(&event->hash_entry, head);
+ return event;
+}
+
+static bool handle_begin_event(struct perf_kvm_stat *kvm,
+ struct vcpu_event_record *vcpu_record,
+ struct event_key *key, u64 timestamp)
+{
+ struct kvm_event *event = NULL;
+
+ if (key->key != INVALID_KEY)
+ event = find_create_kvm_event(kvm, key);
+
+ vcpu_record->last_event = event;
+ vcpu_record->start_time = timestamp;
+ return true;
+}
+
+static void
+kvm_update_event_stats(struct kvm_event_stats *kvm_stats, u64 time_diff)
+{
+ kvm_stats->time += time_diff;
+ update_stats(&kvm_stats->stats, time_diff);
+}
+
+static double kvm_event_rel_stddev(int vcpu_id, struct kvm_event *event)
+{
+ struct kvm_event_stats *kvm_stats = &event->total;
+
+ if (vcpu_id != -1)
+ kvm_stats = &event->vcpu[vcpu_id];
+
+ return rel_stddev_stats(stddev_stats(&kvm_stats->stats),
+ avg_stats(&kvm_stats->stats));
+}
+
+static bool update_kvm_event(struct kvm_event *event, int vcpu_id,
+ u64 time_diff)
+{
+ if (vcpu_id == -1) {
+ kvm_update_event_stats(&event->total, time_diff);
+ return true;
+ }
+
+ if (!kvm_event_expand(event, vcpu_id))
+ return false;
+
+ kvm_update_event_stats(&event->vcpu[vcpu_id], time_diff);
+ return true;
+}
+
+static bool handle_end_event(struct perf_kvm_stat *kvm,
+ struct vcpu_event_record *vcpu_record,
+ struct event_key *key,
+ struct perf_sample *sample)
+{
+ struct kvm_event *event;
+ u64 time_begin, time_diff;
+ int vcpu;
+
+ if (kvm->trace_vcpu == -1)
+ vcpu = -1;
+ else
+ vcpu = vcpu_record->vcpu_id;
+
+ event = vcpu_record->last_event;
+ time_begin = vcpu_record->start_time;
+
+ /* The begin event is not caught. */
+ if (!time_begin)
+ return true;
+
+ /*
+ * In some cases, the 'begin event' only records the start timestamp,
+ * the actual event is recognized in the 'end event' (e.g. mmio-event).
+ */
+
+ /* Both begin and end events did not get the key. */
+ if (!event && key->key == INVALID_KEY)
+ return true;
+
+ if (!event)
+ event = find_create_kvm_event(kvm, key);
+
+ if (!event)
+ return false;
+
+ vcpu_record->last_event = NULL;
+ vcpu_record->start_time = 0;
+
+ /* seems to happen once in a while during live mode */
+ if (sample->time < time_begin) {
+ pr_debug("End time before begin time; skipping event.\n");
+ return true;
+ }
+
+ time_diff = sample->time - time_begin;
+
+ if (kvm->duration && time_diff > kvm->duration) {
+ char decode[32];
+
+ kvm->events_ops->decode_key(kvm, &event->key, decode);
+ if (strcmp(decode, "HLT")) {
+ pr_info("%" PRIu64 " VM %d, vcpu %d: %s event took %" PRIu64 "usec\n",
+ sample->time, sample->pid, vcpu_record->vcpu_id,
+ decode, time_diff/1000);
+ }
+ }
+
+ return update_kvm_event(event, vcpu, time_diff);
+}
+
+static
+struct vcpu_event_record *per_vcpu_record(struct thread *thread,
+ struct perf_evsel *evsel,
+ struct perf_sample *sample)
+{
+ /* Only kvm_entry records vcpu id. */
+ if (!thread->priv && kvm_entry_event(evsel)) {
+ struct vcpu_event_record *vcpu_record;
+
+ vcpu_record = zalloc(sizeof(*vcpu_record));
+ if (!vcpu_record) {
+ pr_err("%s: Not enough memory\n", __func__);
+ return NULL;
+ }
+
+ vcpu_record->vcpu_id = perf_evsel__intval(evsel, sample, "vcpu_id");
+ thread->priv = vcpu_record;
+ }
+
+ return thread->priv;
+}
+
+static bool handle_kvm_event(struct perf_kvm_stat *kvm,
+ struct thread *thread,
+ struct perf_evsel *evsel,
+ struct perf_sample *sample)
+{
+ struct vcpu_event_record *vcpu_record;
+ struct event_key key = {.key = INVALID_KEY};
+
+ vcpu_record = per_vcpu_record(thread, evsel, sample);
+ if (!vcpu_record)
+ return true;
+
+ /* only process events for vcpus user cares about */
+ if ((kvm->trace_vcpu != -1) &&
+ (kvm->trace_vcpu != vcpu_record->vcpu_id))
+ return true;
+
+ if (kvm->events_ops->is_begin_event(evsel, sample, &key))
+ return handle_begin_event(kvm, vcpu_record, &key, sample->time);
+
+ if (kvm->events_ops->is_end_event(evsel, sample, &key))
+ return handle_end_event(kvm, vcpu_record, &key, sample);
+
+ return true;
+}
+
+#define GET_EVENT_KEY(func, field) \
+static u64 get_event_ ##func(struct kvm_event *event, int vcpu) \
+{ \
+ if (vcpu == -1) \
+ return event->total.field; \
+ \
+ if (vcpu >= event->max_vcpu) \
+ return 0; \
+ \
+ return event->vcpu[vcpu].field; \
+}
+
+#define COMPARE_EVENT_KEY(func, field) \
+GET_EVENT_KEY(func, field) \
+static int compare_kvm_event_ ## func(struct kvm_event *one, \
+ struct kvm_event *two, int vcpu)\
+{ \
+ return get_event_ ##func(one, vcpu) > \
+ get_event_ ##func(two, vcpu); \
+}
+
+GET_EVENT_KEY(time, time);
+COMPARE_EVENT_KEY(count, stats.n);
+COMPARE_EVENT_KEY(mean, stats.mean);
+GET_EVENT_KEY(max, stats.max);
+GET_EVENT_KEY(min, stats.min);
+
+#define DEF_SORT_NAME_KEY(name, compare_key) \
+ { #name, compare_kvm_event_ ## compare_key }
+
+static struct kvm_event_key keys[] = {
+ DEF_SORT_NAME_KEY(sample, count),
+ DEF_SORT_NAME_KEY(time, mean),
+ { NULL, NULL }
+};
+
+static bool select_key(struct perf_kvm_stat *kvm)
+{
+ int i;
+
+ for (i = 0; keys[i].name; i++) {
+ if (!strcmp(keys[i].name, kvm->sort_key)) {
+ kvm->compare = keys[i].key;
+ return true;
+ }
+ }
+
+ pr_err("Unknown compare key:%s\n", kvm->sort_key);
+ return false;
+}
+
+static void insert_to_result(struct rb_root *result, struct kvm_event *event,
+ key_cmp_fun bigger, int vcpu)
+{
+ struct rb_node **rb = &result->rb_node;
+ struct rb_node *parent = NULL;
+ struct kvm_event *p;
+
+ while (*rb) {
+ p = container_of(*rb, struct kvm_event, rb);
+ parent = *rb;
+
+ if (bigger(event, p, vcpu))
+ rb = &(*rb)->rb_left;
+ else
+ rb = &(*rb)->rb_right;
+ }
+
+ rb_link_node(&event->rb, parent, rb);
+ rb_insert_color(&event->rb, result);
+}
+
+static void
+update_total_count(struct perf_kvm_stat *kvm, struct kvm_event *event)
+{
+ int vcpu = kvm->trace_vcpu;
+
+ kvm->total_count += get_event_count(event, vcpu);
+ kvm->total_time += get_event_time(event, vcpu);
+}
+
+static bool event_is_valid(struct kvm_event *event, int vcpu)
+{
+ return !!get_event_count(event, vcpu);
+}
+
+static void sort_result(struct perf_kvm_stat *kvm)
+{
+ unsigned int i;
+ int vcpu = kvm->trace_vcpu;
+ struct kvm_event *event;
+
+ for (i = 0; i < EVENTS_CACHE_SIZE; i++) {
+ list_for_each_entry(event, &kvm->kvm_events_cache[i], hash_entry) {
+ if (event_is_valid(event, vcpu)) {
+ update_total_count(kvm, event);
+ insert_to_result(&kvm->result, event,
+ kvm->compare, vcpu);
+ }
+ }
+ }
+}
+
+/* returns the leftmost element of result, and erases it */
+static struct kvm_event *pop_from_result(struct rb_root *result)
+{
+ struct rb_node *node = rb_first(result);
+
+ if (!node)
+ return NULL;
+
+ rb_erase(node, result);
+ return container_of(node, struct kvm_event, rb);
+}
+
+static void print_vcpu_info(struct perf_kvm_stat *kvm)
+{
+ int vcpu = kvm->trace_vcpu;
+
+ pr_info("Analyze events for ");
+
+ if (kvm->live) {
+ if (kvm->opts.target.system_wide)
+ pr_info("all VMs, ");
+ else if (kvm->opts.target.pid)
+ pr_info("pid(s) %s, ", kvm->opts.target.pid);
+ else
+ pr_info("dazed and confused on what is monitored, ");
+ }
+
+ if (vcpu == -1)
+ pr_info("all VCPUs:\n\n");
+ else
+ pr_info("VCPU %d:\n\n", vcpu);
+}
+
+static void show_timeofday(void)
+{
+ char date[64];
+ struct timeval tv;
+ struct tm ltime;
+
+ gettimeofday(&tv, NULL);
+ if (localtime_r(&tv.tv_sec, &ltime)) {
+ strftime(date, sizeof(date), "%H:%M:%S", &ltime);
+ pr_info("%s.%06ld", date, tv.tv_usec);
+ } else
+ pr_info("00:00:00.000000");
+
+ return;
+}
+
+static void print_result(struct perf_kvm_stat *kvm)
+{
+ char decode[20];
+ struct kvm_event *event;
+ int vcpu = kvm->trace_vcpu;
+
+ if (kvm->live) {
+ puts(CONSOLE_CLEAR);
+ show_timeofday();
+ }
+
+ pr_info("\n\n");
+ print_vcpu_info(kvm);
+ pr_info("%20s ", kvm->events_ops->name);
+ pr_info("%10s ", "Samples");
+ pr_info("%9s ", "Samples%");
+
+ pr_info("%9s ", "Time%");
+ pr_info("%10s ", "Min Time");
+ pr_info("%10s ", "Max Time");
+ pr_info("%16s ", "Avg time");
+ pr_info("\n\n");
+
+ while ((event = pop_from_result(&kvm->result))) {
+ u64 ecount, etime, max, min;
+
+ ecount = get_event_count(event, vcpu);
+ etime = get_event_time(event, vcpu);
+ max = get_event_max(event, vcpu);
+ min = get_event_min(event, vcpu);
+
+ kvm->events_ops->decode_key(kvm, &event->key, decode);
+ pr_info("%20s ", decode);
+ pr_info("%10llu ", (unsigned long long)ecount);
+ pr_info("%8.2f%% ", (double)ecount / kvm->total_count * 100);
+ pr_info("%8.2f%% ", (double)etime / kvm->total_time * 100);
+ pr_info("%8" PRIu64 "us ", min / 1000);
+ pr_info("%8" PRIu64 "us ", max / 1000);
+ pr_info("%9.2fus ( +-%7.2f%% )", (double)etime / ecount/1e3,
+ kvm_event_rel_stddev(vcpu, event));
+ pr_info("\n");
+ }
+
+ pr_info("\nTotal Samples:%" PRIu64 ", Total events handled time:%.2fus.\n\n",
+ kvm->total_count, kvm->total_time / 1e3);
+
+ if (kvm->lost_events)
+ pr_info("\nLost events: %" PRIu64 "\n\n", kvm->lost_events);
+}
+
+#ifdef HAVE_TIMERFD_SUPPORT
+static int process_lost_event(struct perf_tool *tool,
+ union perf_event *event __maybe_unused,
+ struct perf_sample *sample __maybe_unused,
+ struct machine *machine __maybe_unused)
+{
+ struct perf_kvm_stat *kvm = container_of(tool, struct perf_kvm_stat, tool);
+
+ kvm->lost_events++;
+ return 0;
+}
+#endif
+
+static bool skip_sample(struct perf_kvm_stat *kvm,
+ struct perf_sample *sample)
+{
+ if (kvm->pid_list && intlist__find(kvm->pid_list, sample->pid) == NULL)
+ return true;
+
+ return false;
+}
+
+static int process_sample_event(struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_sample *sample,
+ struct perf_evsel *evsel,
+ struct machine *machine)
+{
+ struct thread *thread;
+ struct perf_kvm_stat *kvm = container_of(tool, struct perf_kvm_stat,
+ tool);
+
+ if (skip_sample(kvm, sample))
+ return 0;
+
+ thread = machine__findnew_thread(machine, sample->pid, sample->tid);
+ if (thread == NULL) {
+ pr_debug("problem processing %d event, skipping it.\n",
+ event->header.type);
+ return -1;
+ }
+
+ if (!handle_kvm_event(kvm, thread, evsel, sample))
+ return -1;
+
+ return 0;
+}
+
+static int cpu_isa_config(struct perf_kvm_stat *kvm)
+{
+ char buf[64], *cpuid;
+ int err, isa;
+
+ if (kvm->live) {
+ err = get_cpuid(buf, sizeof(buf));
+ if (err != 0) {
+ pr_err("Failed to look up CPU type (Intel or AMD)\n");
+ return err;
+ }
+ cpuid = buf;
+ } else
+ cpuid = kvm->session->header.env.cpuid;
+
+ if (strstr(cpuid, "Intel"))
+ isa = 1;
+ else if (strstr(cpuid, "AMD"))
+ isa = 0;
+ else {
+ pr_err("CPU %s is not supported.\n", cpuid);
+ return -ENOTSUP;
+ }
+
+ if (isa == 1) {
+ kvm->exit_reasons = vmx_exit_reasons;
+ kvm->exit_reasons_size = ARRAY_SIZE(vmx_exit_reasons);
+ kvm->exit_reasons_isa = "VMX";
+ }
+
+ return 0;
+}
+
+static bool verify_vcpu(int vcpu)
+{
+ if (vcpu != -1 && vcpu < 0) {
+ pr_err("Invalid vcpu:%d.\n", vcpu);
+ return false;
+ }
+
+ return true;
+}
+
+#ifdef HAVE_TIMERFD_SUPPORT
+/* keeping the max events to a modest level to keep
+ * the processing of samples per mmap smooth.
+ */
+#define PERF_KVM__MAX_EVENTS_PER_MMAP 25
+
+static s64 perf_kvm__mmap_read_idx(struct perf_kvm_stat *kvm, int idx,
+ u64 *mmap_time)
+{
+ union perf_event *event;
+ struct perf_sample sample;
+ s64 n = 0;
+ int err;
+
+ *mmap_time = ULLONG_MAX;
+ while ((event = perf_evlist__mmap_read(kvm->evlist, idx)) != NULL) {
+ err = perf_evlist__parse_sample(kvm->evlist, event, &sample);
+ if (err) {
+ perf_evlist__mmap_consume(kvm->evlist, idx);
+ pr_err("Failed to parse sample\n");
+ return -1;
+ }
+
+ err = perf_session_queue_event(kvm->session, event, &sample, 0);
+ /*
+ * FIXME: Here we can't consume the event, as perf_session_queue_event will
+ * point to it, and it'll get possibly overwritten by the kernel.
+ */
+ perf_evlist__mmap_consume(kvm->evlist, idx);
+
+ if (err) {
+ pr_err("Failed to enqueue sample: %d\n", err);
+ return -1;
+ }
+
+ /* save time stamp of our first sample for this mmap */
+ if (n == 0)
+ *mmap_time = sample.time;
+
+ /* limit events per mmap handled all at once */
+ n++;
+ if (n == PERF_KVM__MAX_EVENTS_PER_MMAP)
+ break;
+ }
+
+ return n;
+}
+
+static int perf_kvm__mmap_read(struct perf_kvm_stat *kvm)
+{
+ int i, err, throttled = 0;
+ s64 n, ntotal = 0;
+ u64 flush_time = ULLONG_MAX, mmap_time;
+
+ for (i = 0; i < kvm->evlist->nr_mmaps; i++) {
+ n = perf_kvm__mmap_read_idx(kvm, i, &mmap_time);
+ if (n < 0)
+ return -1;
+
+ /* flush time is going to be the minimum of all the individual
+ * mmap times. Essentially, we flush all the samples queued up
+ * from the last pass under our minimal start time -- that leaves
+ * a very small race for samples to come in with a lower timestamp.
+ * The ioctl to return the perf_clock timestamp should close the
+ * race entirely.
+ */
+ if (mmap_time < flush_time)
+ flush_time = mmap_time;
+
+ ntotal += n;
+ if (n == PERF_KVM__MAX_EVENTS_PER_MMAP)
+ throttled = 1;
+ }
+
+ /* flush queue after each round in which we processed events */
+ if (ntotal) {
+ kvm->session->ordered_samples.next_flush = flush_time;
+ err = kvm->tool.finished_round(&kvm->tool, NULL, kvm->session);
+ if (err) {
+ if (kvm->lost_events)
+ pr_info("\nLost events: %" PRIu64 "\n\n",
+ kvm->lost_events);
+ return err;
+ }
+ }
+
+ return throttled;
+}
+
+static volatile int done;
+
+static void sig_handler(int sig __maybe_unused)
+{
+ done = 1;
+}
+
+static int perf_kvm__timerfd_create(struct perf_kvm_stat *kvm)
+{
+ struct itimerspec new_value;
+ int rc = -1;
+
+ kvm->timerfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK);
+ if (kvm->timerfd < 0) {
+ pr_err("timerfd_create failed\n");
+ goto out;
+ }
+
+ new_value.it_value.tv_sec = kvm->display_time;
+ new_value.it_value.tv_nsec = 0;
+ new_value.it_interval.tv_sec = kvm->display_time;
+ new_value.it_interval.tv_nsec = 0;
+
+ if (timerfd_settime(kvm->timerfd, 0, &new_value, NULL) != 0) {
+ pr_err("timerfd_settime failed: %d\n", errno);
+ close(kvm->timerfd);
+ goto out;
+ }
+
+ rc = 0;
+out:
+ return rc;
+}
+
+static int perf_kvm__handle_timerfd(struct perf_kvm_stat *kvm)
+{
+ uint64_t c;
+ int rc;
+
+ rc = read(kvm->timerfd, &c, sizeof(uint64_t));
+ if (rc < 0) {
+ if (errno == EAGAIN)
+ return 0;
+
+ pr_err("Failed to read timer fd: %d\n", errno);
+ return -1;
+ }
+
+ if (rc != sizeof(uint64_t)) {
+ pr_err("Error reading timer fd - invalid size returned\n");
+ return -1;
+ }
+
+ if (c != 1)
+ pr_debug("Missed timer beats: %" PRIu64 "\n", c-1);
+
+ /* update display */
+ sort_result(kvm);
+ print_result(kvm);
+
+ /* reset counts */
+ clear_events_cache_stats(kvm->kvm_events_cache);
+ kvm->total_count = 0;
+ kvm->total_time = 0;
+ kvm->lost_events = 0;
+
+ return 0;
+}
+
+static int fd_set_nonblock(int fd)
+{
+ long arg = 0;
+
+ arg = fcntl(fd, F_GETFL);
+ if (arg < 0) {
+ pr_err("Failed to get current flags for fd %d\n", fd);
+ return -1;
+ }
+
+ if (fcntl(fd, F_SETFL, arg | O_NONBLOCK) < 0) {
+ pr_err("Failed to set non-block option on fd %d\n", fd);
+ return -1;
+ }
+
+ return 0;
+}
+
+static
+int perf_kvm__handle_stdin(struct termios *tc_now, struct termios *tc_save)
+{
+ int c;
+
+ tcsetattr(0, TCSANOW, tc_now);
+ c = getc(stdin);
+ tcsetattr(0, TCSAFLUSH, tc_save);
+
+ if (c == 'q')
+ return 1;
+
+ return 0;
+}
+
+static int kvm_events_live_report(struct perf_kvm_stat *kvm)
+{
+ struct pollfd *pollfds = NULL;
+ int nr_fds, nr_stdin, ret, err = -EINVAL;
+ struct termios tc, save;
+
+ /* live flag must be set first */
+ kvm->live = true;
+
+ ret = cpu_isa_config(kvm);
+ if (ret < 0)
+ return ret;
+
+ if (!verify_vcpu(kvm->trace_vcpu) ||
+ !select_key(kvm) ||
+ !register_kvm_events_ops(kvm)) {
+ goto out;
+ }
+
+ init_kvm_event_record(kvm);
+
+ tcgetattr(0, &save);
+ tc = save;
+ tc.c_lflag &= ~(ICANON | ECHO);
+ tc.c_cc[VMIN] = 0;
+ tc.c_cc[VTIME] = 0;
+
+ signal(SIGINT, sig_handler);
+ signal(SIGTERM, sig_handler);
+
+ /* copy pollfds -- need to add timerfd and stdin */
+ nr_fds = kvm->evlist->nr_fds;
+ pollfds = zalloc(sizeof(struct pollfd) * (nr_fds + 2));
+ if (!pollfds) {
+ err = -ENOMEM;
+ goto out;
+ }
+ memcpy(pollfds, kvm->evlist->pollfd,
+ sizeof(struct pollfd) * kvm->evlist->nr_fds);
+
+ /* add timer fd */
+ if (perf_kvm__timerfd_create(kvm) < 0) {
+ err = -1;
+ goto out;
+ }
+
+ pollfds[nr_fds].fd = kvm->timerfd;
+ pollfds[nr_fds].events = POLLIN;
+ nr_fds++;
+
+ pollfds[nr_fds].fd = fileno(stdin);
+ pollfds[nr_fds].events = POLLIN;
+ nr_stdin = nr_fds;
+ nr_fds++;
+ if (fd_set_nonblock(fileno(stdin)) != 0)
+ goto out;
+
+ /* everything is good - enable the events and process */
+ perf_evlist__enable(kvm->evlist);
+
+ while (!done) {
+ int rc;
+
+ rc = perf_kvm__mmap_read(kvm);
+ if (rc < 0)
+ break;
+
+ err = perf_kvm__handle_timerfd(kvm);
+ if (err)
+ goto out;
+
+ if (pollfds[nr_stdin].revents & POLLIN)
+ done = perf_kvm__handle_stdin(&tc, &save);
+
+ if (!rc && !done)
+ err = poll(pollfds, nr_fds, 100);
+ }
+
+ perf_evlist__disable(kvm->evlist);
+
+ if (err == 0) {
+ sort_result(kvm);
+ print_result(kvm);
+ }
+
+out:
+ if (kvm->timerfd >= 0)
+ close(kvm->timerfd);
+
+ free(pollfds);
+ return err;
+}
+
+static int kvm_live_open_events(struct perf_kvm_stat *kvm)
+{
+ int err, rc = -1;
+ struct perf_evsel *pos;
+ struct perf_evlist *evlist = kvm->evlist;
+
+ perf_evlist__config(evlist, &kvm->opts);
+
+ /*
+ * Note: exclude_{guest,host} do not apply here.
+ * This command processes KVM tracepoints from host only
+ */
+ evlist__for_each(evlist, pos) {
+ struct perf_event_attr *attr = &pos->attr;
+
+ /* make sure these *are* set */
+ perf_evsel__set_sample_bit(pos, TID);
+ perf_evsel__set_sample_bit(pos, TIME);
+ perf_evsel__set_sample_bit(pos, CPU);
+ perf_evsel__set_sample_bit(pos, RAW);
+ /* make sure these are *not*; want as small a sample as possible */
+ perf_evsel__reset_sample_bit(pos, PERIOD);
+ perf_evsel__reset_sample_bit(pos, IP);
+ perf_evsel__reset_sample_bit(pos, CALLCHAIN);
+ perf_evsel__reset_sample_bit(pos, ADDR);
+ perf_evsel__reset_sample_bit(pos, READ);
+ attr->mmap = 0;
+ attr->comm = 0;
+ attr->task = 0;
+
+ attr->sample_period = 1;
+
+ attr->watermark = 0;
+ attr->wakeup_events = 1000;
+
+ /* will enable all once we are ready */
+ attr->disabled = 1;
+ }
+
+ err = perf_evlist__open(evlist);
+ if (err < 0) {
+ printf("Couldn't create the events: %s\n", strerror(errno));
+ goto out;
+ }
+
+ if (perf_evlist__mmap(evlist, kvm->opts.mmap_pages, false) < 0) {
+ ui__error("Failed to mmap the events: %s\n", strerror(errno));
+ perf_evlist__close(evlist);
+ goto out;
+ }
+
+ rc = 0;
+
+out:
+ return rc;
+}
+#endif
+
+static int read_events(struct perf_kvm_stat *kvm)
+{
+ int ret;
+
+ struct perf_tool eops = {
+ .sample = process_sample_event,
+ .comm = perf_event__process_comm,
+ .ordered_samples = true,
+ };
+ struct perf_data_file file = {
+ .path = kvm->file_name,
+ .mode = PERF_DATA_MODE_READ,
+ };
+
+ kvm->tool = eops;
+ kvm->session = perf_session__new(&file, false, &kvm->tool);
+ if (!kvm->session) {
+ pr_err("Initializing perf session failed\n");
+ return -EINVAL;
+ }
+
+ if (!perf_session__has_traces(kvm->session, "kvm record"))
+ return -EINVAL;
+
+ /*
+ * Do not use 'isa' recorded in kvm_exit tracepoint since it is not
+ * traced in old kernels.
+ */
+ ret = cpu_isa_config(kvm);
+ if (ret < 0)
+ return ret;
+
+ return perf_session__process_events(kvm->session, &kvm->tool);
+}
+
+static int parse_target_str(struct perf_kvm_stat *kvm)
+{
+ if (kvm->pid_str) {
+ kvm->pid_list = intlist__new(kvm->pid_str);
+ if (kvm->pid_list == NULL) {
+ pr_err("Error parsing process id string\n");
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static int kvm_events_report_vcpu(struct perf_kvm_stat *kvm)
+{
+ int ret = -EINVAL;
+ int vcpu = kvm->trace_vcpu;
+
+ if (parse_target_str(kvm) != 0)
+ goto exit;
+
+ if (!verify_vcpu(vcpu))
+ goto exit;
+
+ if (!select_key(kvm))
+ goto exit;
+
+ if (!register_kvm_events_ops(kvm))
+ goto exit;
+
+ init_kvm_event_record(kvm);
+ setup_pager();
+
+ ret = read_events(kvm);
+ if (ret)
+ goto exit;
+
+ sort_result(kvm);
+ print_result(kvm);
+
+exit:
+ return ret;
+}
+
+static const char * const kvm_events_tp[] = {
+ "kvm:kvm_entry",
+ "kvm:kvm_exit",
+ "kvm:kvm_mmio",
+ "kvm:kvm_pio",
+};
+
+#define STRDUP_FAIL_EXIT(s) \
+ ({ char *_p; \
+ _p = strdup(s); \
+ if (!_p) \
+ return -ENOMEM; \
+ _p; \
+ })
+
+static int
+kvm_events_record(struct perf_kvm_stat *kvm, int argc, const char **argv)
+{
+ unsigned int rec_argc, i, j;
+ const char **rec_argv;
+ const char * const record_args[] = {
+ "record",
+ "-R",
+ "-m", "1024",
+ "-c", "1",
+ };
+
+ rec_argc = ARRAY_SIZE(record_args) + argc + 2 +
+ 2 * ARRAY_SIZE(kvm_events_tp);
+ rec_argv = calloc(rec_argc + 1, sizeof(char *));
+
+ if (rec_argv == NULL)
+ return -ENOMEM;
+
+ for (i = 0; i < ARRAY_SIZE(record_args); i++)
+ rec_argv[i] = STRDUP_FAIL_EXIT(record_args[i]);
+
+ for (j = 0; j < ARRAY_SIZE(kvm_events_tp); j++) {
+ rec_argv[i++] = "-e";
+ rec_argv[i++] = STRDUP_FAIL_EXIT(kvm_events_tp[j]);
+ }
+
+ rec_argv[i++] = STRDUP_FAIL_EXIT("-o");
+ rec_argv[i++] = STRDUP_FAIL_EXIT(kvm->file_name);
+
+ for (j = 1; j < (unsigned int)argc; j++, i++)
+ rec_argv[i] = argv[j];
+
+ return cmd_record(i, rec_argv, NULL);
+}
+
+static int
+kvm_events_report(struct perf_kvm_stat *kvm, int argc, const char **argv)
+{
+ const struct option kvm_events_report_options[] = {
+ OPT_STRING(0, "event", &kvm->report_event, "report event",
+ "event for reporting: vmexit, mmio, ioport"),
+ OPT_INTEGER(0, "vcpu", &kvm->trace_vcpu,
+ "vcpu id to report"),
+ OPT_STRING('k', "key", &kvm->sort_key, "sort-key",
+ "key for sorting: sample(sort by samples number)"
+ " time (sort by avg time)"),
+ OPT_STRING('p', "pid", &kvm->pid_str, "pid",
+ "analyze events only for given process id(s)"),
+ OPT_END()
+ };
+
+ const char * const kvm_events_report_usage[] = {
+ "perf kvm stat report [<options>]",
+ NULL
+ };
+
+ symbol__init();
+
+ if (argc) {
+ argc = parse_options(argc, argv,
+ kvm_events_report_options,
+ kvm_events_report_usage, 0);
+ if (argc)
+ usage_with_options(kvm_events_report_usage,
+ kvm_events_report_options);
+ }
+
+ return kvm_events_report_vcpu(kvm);
+}
+
+#ifdef HAVE_TIMERFD_SUPPORT
+static struct perf_evlist *kvm_live_event_list(void)
+{
+ struct perf_evlist *evlist;
+ char *tp, *name, *sys;
+ unsigned int j;
+ int err = -1;
+
+ evlist = perf_evlist__new();
+ if (evlist == NULL)
+ return NULL;
+
+ for (j = 0; j < ARRAY_SIZE(kvm_events_tp); j++) {
+
+ tp = strdup(kvm_events_tp[j]);
+ if (tp == NULL)
+ goto out;
+
+ /* split tracepoint into subsystem and name */
+ sys = tp;
+ name = strchr(tp, ':');
+ if (name == NULL) {
+ pr_err("Error parsing %s tracepoint: subsystem delimiter not found\n",
+ kvm_events_tp[j]);
+ free(tp);
+ goto out;
+ }
+ *name = '\0';
+ name++;
+
+ if (perf_evlist__add_newtp(evlist, sys, name, NULL)) {
+ pr_err("Failed to add %s tracepoint to the list\n", kvm_events_tp[j]);
+ free(tp);
+ goto out;
+ }
+
+ free(tp);
+ }
+
+ err = 0;
+
+out:
+ if (err) {
+ perf_evlist__delete(evlist);
+ evlist = NULL;
+ }
+
+ return evlist;
+}
+
+static int kvm_events_live(struct perf_kvm_stat *kvm,
+ int argc, const char **argv)
+{
+ char errbuf[BUFSIZ];
+ int err;
+
+ const struct option live_options[] = {
+ OPT_STRING('p', "pid", &kvm->opts.target.pid, "pid",
+ "record events on existing process id"),
+ OPT_CALLBACK('m', "mmap-pages", &kvm->opts.mmap_pages, "pages",
+ "number of mmap data pages",
+ perf_evlist__parse_mmap_pages),
+ OPT_INCR('v', "verbose", &verbose,
+ "be more verbose (show counter open errors, etc)"),
+ OPT_BOOLEAN('a', "all-cpus", &kvm->opts.target.system_wide,
+ "system-wide collection from all CPUs"),
+ OPT_UINTEGER('d', "display", &kvm->display_time,
+ "time in seconds between display updates"),
+ OPT_STRING(0, "event", &kvm->report_event, "report event",
+ "event for reporting: vmexit, mmio, ioport"),
+ OPT_INTEGER(0, "vcpu", &kvm->trace_vcpu,
+ "vcpu id to report"),
+ OPT_STRING('k', "key", &kvm->sort_key, "sort-key",
+ "key for sorting: sample(sort by samples number)"
+ " time (sort by avg time)"),
+ OPT_U64(0, "duration", &kvm->duration,
+ "show events other than HALT that take longer than duration usecs"),
+ OPT_END()
+ };
+ const char * const live_usage[] = {
+ "perf kvm stat live [<options>]",
+ NULL
+ };
+ struct perf_data_file file = {
+ .mode = PERF_DATA_MODE_WRITE,
+ };
+
+
+ /* event handling */
+ kvm->tool.sample = process_sample_event;
+ kvm->tool.comm = perf_event__process_comm;
+ kvm->tool.exit = perf_event__process_exit;
+ kvm->tool.fork = perf_event__process_fork;
+ kvm->tool.lost = process_lost_event;
+ kvm->tool.ordered_samples = true;
+ perf_tool__fill_defaults(&kvm->tool);
+
+ /* set defaults */
+ kvm->display_time = 1;
+ kvm->opts.user_interval = 1;
+ kvm->opts.mmap_pages = 512;
+ kvm->opts.target.uses_mmap = false;
+ kvm->opts.target.uid_str = NULL;
+ kvm->opts.target.uid = UINT_MAX;
+
+ symbol__init();
+ disable_buildid_cache();
+
+ use_browser = 0;
+ setup_browser(false);
+
+ if (argc) {
+ argc = parse_options(argc, argv, live_options,
+ live_usage, 0);
+ if (argc)
+ usage_with_options(live_usage, live_options);
+ }
+
+ kvm->duration *= NSEC_PER_USEC; /* convert usec to nsec */
+
+ /*
+ * target related setups
+ */
+ err = target__validate(&kvm->opts.target);
+ if (err) {
+ target__strerror(&kvm->opts.target, err, errbuf, BUFSIZ);
+ ui__warning("%s", errbuf);
+ }
+
+ if (target__none(&kvm->opts.target))
+ kvm->opts.target.system_wide = true;
+
+
+ /*
+ * generate the event list
+ */
+ kvm->evlist = kvm_live_event_list();
+ if (kvm->evlist == NULL) {
+ err = -1;
+ goto out;
+ }
+
+ symbol_conf.nr_events = kvm->evlist->nr_entries;
+
+ if (perf_evlist__create_maps(kvm->evlist, &kvm->opts.target) < 0)
+ usage_with_options(live_usage, live_options);
+
+ /*
+ * perf session
+ */
+ kvm->session = perf_session__new(&file, false, &kvm->tool);
+ if (kvm->session == NULL) {
+ err = -ENOMEM;
+ goto out;
+ }
+ kvm->session->evlist = kvm->evlist;
+ perf_session__set_id_hdr_size(kvm->session);
+ machine__synthesize_threads(&kvm->session->machines.host, &kvm->opts.target,
+ kvm->evlist->threads, false);
+ err = kvm_live_open_events(kvm);
+ if (err)
+ goto out;
+
+ err = kvm_events_live_report(kvm);
+
+out:
+ exit_browser(0);
+
+ if (kvm->session)
+ perf_session__delete(kvm->session);
+ kvm->session = NULL;
+ if (kvm->evlist)
+ perf_evlist__delete(kvm->evlist);
+
+ return err;
+}
+#endif
+
+static void print_kvm_stat_usage(void)
+{
+ printf("Usage: perf kvm stat <command>\n\n");
+
+ printf("# Available commands:\n");
+ printf("\trecord: record kvm events\n");
+ printf("\treport: report statistical data of kvm events\n");
+ printf("\tlive: live reporting of statistical data of kvm events\n");
+
+ printf("\nOtherwise, it is the alias of 'perf stat':\n");
+}
+
+static int kvm_cmd_stat(const char *file_name, int argc, const char **argv)
+{
+ struct perf_kvm_stat kvm = {
+ .file_name = file_name,
+
+ .trace_vcpu = -1,
+ .report_event = "vmexit",
+ .sort_key = "sample",
+
+ .exit_reasons = svm_exit_reasons,
+ .exit_reasons_size = ARRAY_SIZE(svm_exit_reasons),
+ .exit_reasons_isa = "SVM",
+ };
+
+ if (argc == 1) {
+ print_kvm_stat_usage();
+ goto perf_stat;
+ }
+
+ if (!strncmp(argv[1], "rec", 3))
+ return kvm_events_record(&kvm, argc - 1, argv + 1);
+
+ if (!strncmp(argv[1], "rep", 3))
+ return kvm_events_report(&kvm, argc - 1 , argv + 1);
+
+#ifdef HAVE_TIMERFD_SUPPORT
+ if (!strncmp(argv[1], "live", 4))
+ return kvm_events_live(&kvm, argc - 1 , argv + 1);
+#endif
+
+perf_stat:
+ return cmd_stat(argc, argv, NULL);
+}
+#endif
+
+static int __cmd_record(const char *file_name, int argc, const char **argv)
+{
+ int rec_argc, i = 0, j;
+ const char **rec_argv;
+
+ rec_argc = argc + 2;
+ rec_argv = calloc(rec_argc + 1, sizeof(char *));
+ rec_argv[i++] = strdup("record");
+ rec_argv[i++] = strdup("-o");
+ rec_argv[i++] = strdup(file_name);
+ for (j = 1; j < argc; j++, i++)
+ rec_argv[i] = argv[j];
+
+ BUG_ON(i != rec_argc);
+
+ return cmd_record(i, rec_argv, NULL);
+}
+
+static int __cmd_report(const char *file_name, int argc, const char **argv)
+{
+ int rec_argc, i = 0, j;
+ const char **rec_argv;
+
+ rec_argc = argc + 2;
+ rec_argv = calloc(rec_argc + 1, sizeof(char *));
+ rec_argv[i++] = strdup("report");
+ rec_argv[i++] = strdup("-i");
+ rec_argv[i++] = strdup(file_name);
+ for (j = 1; j < argc; j++, i++)
+ rec_argv[i] = argv[j];
+
+ BUG_ON(i != rec_argc);
+
+ return cmd_report(i, rec_argv, NULL);
+}
+
+static int
+__cmd_buildid_list(const char *file_name, int argc, const char **argv)
+{
+ int rec_argc, i = 0, j;
+ const char **rec_argv;
+
+ rec_argc = argc + 2;
+ rec_argv = calloc(rec_argc + 1, sizeof(char *));
+ rec_argv[i++] = strdup("buildid-list");
+ rec_argv[i++] = strdup("-i");
+ rec_argv[i++] = strdup(file_name);
+ for (j = 1; j < argc; j++, i++)
+ rec_argv[i] = argv[j];
+
+ BUG_ON(i != rec_argc);
+
+ return cmd_buildid_list(i, rec_argv, NULL);
+}
+
+int cmd_kvm(int argc, const char **argv, const char *prefix __maybe_unused)
+{
+ const char *file_name = NULL;
+ const struct option kvm_options[] = {
+ OPT_STRING('i', "input", &file_name, "file",
+ "Input file name"),
+ OPT_STRING('o', "output", &file_name, "file",
+ "Output file name"),
+ OPT_BOOLEAN(0, "guest", &perf_guest,
+ "Collect guest os data"),
+ OPT_BOOLEAN(0, "host", &perf_host,
+ "Collect host os data"),
+ OPT_STRING(0, "guestmount", &symbol_conf.guestmount, "directory",
+ "guest mount directory under which every guest os"
+ " instance has a subdir"),
+ OPT_STRING(0, "guestvmlinux", &symbol_conf.default_guest_vmlinux_name,
+ "file", "file saving guest os vmlinux"),
+ OPT_STRING(0, "guestkallsyms", &symbol_conf.default_guest_kallsyms,
+ "file", "file saving guest os /proc/kallsyms"),
+ OPT_STRING(0, "guestmodules", &symbol_conf.default_guest_modules,
+ "file", "file saving guest os /proc/modules"),
+ OPT_INCR('v', "verbose", &verbose,
+ "be more verbose (show counter open errors, etc)"),
+ OPT_END()
+ };
+
+ const char *const kvm_subcommands[] = { "top", "record", "report", "diff",
+ "buildid-list", "stat", NULL };
+ const char *kvm_usage[] = { NULL, NULL };
+
+ perf_host = 0;
+ perf_guest = 1;
+
+ argc = parse_options_subcommand(argc, argv, kvm_options, kvm_subcommands, kvm_usage,
+ PARSE_OPT_STOP_AT_NON_OPTION);
+ if (!argc)
+ usage_with_options(kvm_usage, kvm_options);
+
+ if (!perf_host)
+ perf_guest = 1;
+
+ if (!file_name) {
+ file_name = get_filename_for_perf_kvm();
+
+ if (!file_name) {
+ pr_err("Failed to allocate memory for filename\n");
+ return -ENOMEM;
+ }
+ }
+
+ if (!strncmp(argv[0], "rec", 3))
+ return __cmd_record(file_name, argc, argv);
+ else if (!strncmp(argv[0], "rep", 3))
+ return __cmd_report(file_name, argc, argv);
+ else if (!strncmp(argv[0], "diff", 4))
+ return cmd_diff(argc, argv, NULL);
+ else if (!strncmp(argv[0], "top", 3))
+ return cmd_top(argc, argv, NULL);
+ else if (!strncmp(argv[0], "buildid-list", 12))
+ return __cmd_buildid_list(file_name, argc, argv);
+#if defined(__i386__) || defined(__x86_64__)
+ else if (!strncmp(argv[0], "stat", 4))
+ return kvm_cmd_stat(file_name, argc, argv);
+#endif
+ else
+ usage_with_options(kvm_usage, kvm_options);
+
+ return 0;
+}
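
The three __cmd_* helpers above share one pattern: build a fresh argument vector with a fixed prefix ("record -o <file>", "report -i <file>", ...), copy the user's remaining arguments behind it, and hand the vector to the underlying builtin. A minimal standalone sketch of that pattern follows; run_builtin() is a hypothetical stand-in for cmd_record()/cmd_report() and the prefix strings are illustrative only.

#include <stdio.h>
#include <stdlib.h>

static int run_builtin(int argc, const char **argv)
{
	/* stand-in for cmd_record()/cmd_report(): just echo the vector */
	for (int i = 0; i < argc; i++)
		printf("argv[%d] = %s\n", i, argv[i]);
	return 0;
}

static int forward_cmd(const char *file_name, int argc, const char **argv)
{
	/* "record" + "-o" + file_name replace argv[0], hence argc + 2 */
	int rec_argc = argc + 2, i = 0, j, ret;
	const char **rec_argv = calloc(rec_argc + 1, sizeof(char *));

	if (!rec_argv)
		return -1;

	rec_argv[i++] = "record";
	rec_argv[i++] = "-o";
	rec_argv[i++] = file_name;
	for (j = 1; j < argc; j++, i++)		/* skip the original argv[0] */
		rec_argv[i] = argv[j];

	ret = run_builtin(i, rec_argv);
	free(rec_argv);
	return ret;
}

int main(void)
{
	const char *argv[] = { "kvm", "-a", "-g" };

	return forward_cmd("perf.data.guest", 3, argv);
}
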
diff --git a/tools/perf/builtin-list.c b/tools/perf/builtin-list.c
index d88c6961274..011195e38f2 100644
--- a/tools/perf/builtin-list.c
+++ b/tools/perf/builtin-list.c
@@ -5,6 +5,7 @@
*
* Copyright (C) 2009, Thomas Gleixner <tglx@linutronix.de>
* Copyright (C) 2008-2009, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2011, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
*/
#include "builtin.h"
@@ -12,10 +13,65 @@
#include "util/parse-events.h"
#include "util/cache.h"
+#include "util/pmu.h"
+#include "util/parse-options.h"
-int cmd_list(int argc __used, const char **argv __used, const char *prefix __used)
+int cmd_list(int argc, const char **argv, const char *prefix __maybe_unused)
{
+ int i;
+ const struct option list_options[] = {
+ OPT_END()
+ };
+ const char * const list_usage[] = {
+ "perf list [hw|sw|cache|tracepoint|pmu|event_glob]",
+ NULL
+ };
+
+ argc = parse_options(argc, argv, list_options, list_usage,
+ PARSE_OPT_STOP_AT_NON_OPTION);
+
setup_pager();
- print_events();
+
+ if (argc == 0) {
+ print_events(NULL, false);
+ return 0;
+ }
+
+ for (i = 0; i < argc; ++i) {
+ if (i)
+ putchar('\n');
+ if (strncmp(argv[i], "tracepoint", 10) == 0)
+ print_tracepoint_events(NULL, NULL, false);
+ else if (strcmp(argv[i], "hw") == 0 ||
+ strcmp(argv[i], "hardware") == 0)
+ print_events_type(PERF_TYPE_HARDWARE);
+ else if (strcmp(argv[i], "sw") == 0 ||
+ strcmp(argv[i], "software") == 0)
+ print_events_type(PERF_TYPE_SOFTWARE);
+ else if (strcmp(argv[i], "cache") == 0 ||
+ strcmp(argv[i], "hwcache") == 0)
+ print_hwcache_events(NULL, false);
+ else if (strcmp(argv[i], "pmu") == 0)
+ print_pmu_events(NULL, false);
+ else if (strcmp(argv[i], "--raw-dump") == 0)
+ print_events(NULL, true);
+ else {
+ char *sep = strchr(argv[i], ':'), *s;
+ int sep_idx;
+
+ if (sep == NULL) {
+ print_events(argv[i], false);
+ continue;
+ }
+ sep_idx = sep - argv[i];
+ s = strdup(argv[i]);
+ if (s == NULL)
+ return -1;
+
+ s[sep_idx] = '\0';
+ print_tracepoint_events(s, s + sep_idx + 1, false);
+ free(s);
+ }
+ }
return 0;
}
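
The glob branch of cmd_list() above splits a "subsystem:event" argument at the first ':' on a private copy of the string before handing both halves to print_tracepoint_events(). A small self-contained sketch of that split, with show_tracepoint() as a hypothetical stand-in for the real printer:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void show_tracepoint(const char *subsys, const char *event)
{
	printf("subsys=%s event=%s\n", subsys ? subsys : "*",
	       event ? event : "*");
}

static int list_one(const char *arg)
{
	char *sep = strchr(arg, ':'), *s;

	if (sep == NULL) {			/* plain glob, no subsystem */
		show_tracepoint(NULL, arg);
		return 0;
	}

	s = strdup(arg);			/* split on a private copy */
	if (s == NULL)
		return -1;

	s[sep - arg] = '\0';
	show_tracepoint(s, s + (sep - arg) + 1);
	free(s);
	return 0;
}

int main(void)
{
	return list_one("sched:sched_switch") || list_one("kmem*");
}
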
diff --git a/tools/perf/builtin-lock.c b/tools/perf/builtin-lock.c
index e12c844df1e..6148afc995c 100644
--- a/tools/perf/builtin-lock.c
+++ b/tools/perf/builtin-lock.c
@@ -1,6 +1,8 @@
#include "builtin.h"
#include "perf.h"
+#include "util/evlist.h"
+#include "util/evsel.h"
#include "util/util.h"
#include "util/cache.h"
#include "util/symbol.h"
@@ -12,6 +14,8 @@
#include "util/debug.h"
#include "util/session.h"
+#include "util/tool.h"
+#include "util/data.h"
#include <sys/types.h>
#include <sys/prctl.h>
@@ -23,6 +27,8 @@
#include <linux/list.h>
#include <linux/hash.h>
+static struct perf_session *session;
+
/* based on kernel/lockdep.c */
#define LOCKHASH_BITS 12
#define LOCKHASH_SIZE (1UL << LOCKHASH_BITS)
@@ -32,35 +38,169 @@ static struct list_head lockhash_table[LOCKHASH_SIZE];
#define __lockhashfn(key) hash_long((unsigned long)key, LOCKHASH_BITS)
#define lockhashentry(key) (lockhash_table + __lockhashfn((key)))
-#define LOCK_STATE_UNLOCKED 0 /* initial state */
-#define LOCK_STATE_LOCKED 1
-
struct lock_stat {
struct list_head hash_entry;
struct rb_node rb; /* used for sorting */
/*
- * FIXME: raw_field_value() returns unsigned long long,
+ * FIXME: perf_evsel__intval() returns u64,
 * so the address of lockdep_map should be dealt with as 64bit.
 * Is there a better solution?
*/
void *addr; /* address of lockdep_map, used as ID */
char *name; /* for strcpy(), we cannot use const */
- int state;
- u64 prev_event_time; /* timestamp of previous event */
-
- unsigned int nr_acquired;
unsigned int nr_acquire;
+ unsigned int nr_acquired;
unsigned int nr_contended;
unsigned int nr_release;
+ unsigned int nr_readlock;
+ unsigned int nr_trylock;
+
/* these times are in nano sec. */
+ u64 avg_wait_time;
u64 wait_time_total;
u64 wait_time_min;
u64 wait_time_max;
+
+ int discard; /* flag of blacklist */
};
+/*
+ * States of lock_seq_stat
+ *
+ * UNINITIALIZED is required for detecting the first acquire event.
+ * Given the nature of lock events, there is no guarantee
+ * that the first event seen for a lock is acquire;
+ * it can be acquired, contended or release.
+ */
+#define SEQ_STATE_UNINITIALIZED 0 /* initial state */
+#define SEQ_STATE_RELEASED 1
+#define SEQ_STATE_ACQUIRING 2
+#define SEQ_STATE_ACQUIRED 3
+#define SEQ_STATE_READ_ACQUIRED 4
+#define SEQ_STATE_CONTENDED 5
+
+/*
+ * MAX_LOCK_DEPTH
+ * Imported from include/linux/sched.h.
+ * Should this be synchronized?
+ */
+#define MAX_LOCK_DEPTH 48
+
+/*
+ * struct lock_seq_stat:
+ * Place to put on state of one lock sequence
+ * 1) acquire -> acquired -> release
+ * 2) acquire -> contended -> acquired -> release
+ * 3) acquire (with read or try) -> release
+ * 4) Are there other patterns?
+ */
+struct lock_seq_stat {
+ struct list_head list;
+ int state;
+ u64 prev_event_time;
+ void *addr;
+
+ int read_count;
+};
+
+struct thread_stat {
+ struct rb_node rb;
+
+ u32 tid;
+ struct list_head seq_list;
+};
+
+static struct rb_root thread_stats;
+
+static struct thread_stat *thread_stat_find(u32 tid)
+{
+ struct rb_node *node;
+ struct thread_stat *st;
+
+ node = thread_stats.rb_node;
+ while (node) {
+ st = container_of(node, struct thread_stat, rb);
+ if (st->tid == tid)
+ return st;
+ else if (tid < st->tid)
+ node = node->rb_left;
+ else
+ node = node->rb_right;
+ }
+
+ return NULL;
+}
+
+static void thread_stat_insert(struct thread_stat *new)
+{
+ struct rb_node **rb = &thread_stats.rb_node;
+ struct rb_node *parent = NULL;
+ struct thread_stat *p;
+
+ while (*rb) {
+ p = container_of(*rb, struct thread_stat, rb);
+ parent = *rb;
+
+ if (new->tid < p->tid)
+ rb = &(*rb)->rb_left;
+ else if (new->tid > p->tid)
+ rb = &(*rb)->rb_right;
+ else
+ BUG_ON("inserting invalid thread_stat\n");
+ }
+
+ rb_link_node(&new->rb, parent, rb);
+ rb_insert_color(&new->rb, &thread_stats);
+}
+
+static struct thread_stat *thread_stat_findnew_after_first(u32 tid)
+{
+ struct thread_stat *st;
+
+ st = thread_stat_find(tid);
+ if (st)
+ return st;
+
+ st = zalloc(sizeof(struct thread_stat));
+ if (!st) {
+ pr_err("memory allocation failed\n");
+ return NULL;
+ }
+
+ st->tid = tid;
+ INIT_LIST_HEAD(&st->seq_list);
+
+ thread_stat_insert(st);
+
+ return st;
+}
+
+static struct thread_stat *thread_stat_findnew_first(u32 tid);
+static struct thread_stat *(*thread_stat_findnew)(u32 tid) =
+ thread_stat_findnew_first;
+
+static struct thread_stat *thread_stat_findnew_first(u32 tid)
+{
+ struct thread_stat *st;
+
+ st = zalloc(sizeof(struct thread_stat));
+ if (!st) {
+ pr_err("memory allocation failed\n");
+ return NULL;
+ }
+ st->tid = tid;
+ INIT_LIST_HEAD(&st->seq_list);
+
+ rb_link_node(&st->rb, NULL, &thread_stats.rb_node);
+ rb_insert_color(&st->rb, &thread_stats);
+
+ thread_stat_findnew = thread_stat_findnew_after_first;
+ return st;
+}
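
thread_stat_findnew is a function pointer that starts out at thread_stat_findnew_first(): the first call inserts the root node directly and then redirects the pointer to the general rbtree lookup, so the empty-tree special case costs nothing on later calls. A reduced sketch of this one-shot swap, using a plain array instead of an rbtree to stay self-contained:

#include <stdio.h>

static int table[16], nr;

static int findnew_after_first(int tid)
{
	/* general path: look up, insert if missing */
	for (int i = 0; i < nr; i++)
		if (table[i] == tid)
			return i;
	table[nr] = tid;
	return nr++;
}

static int findnew_first(int tid);
static int (*findnew)(int tid) = findnew_first;

static int findnew_first(int tid)
{
	table[nr++] = tid;		/* table known to be empty */
	findnew = findnew_after_first;	/* all later calls take the general path */
	return 0;
}

int main(void)
{
	int a = findnew(10);
	int b = findnew(20);
	int c = findnew(10);

	printf("%d %d %d\n", a, b, c);	/* 0 1 0 */
	return 0;
}
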
+
/* build simple key function one is bigger than two */
#define SINGLE_KEY(member) \
static int lock_stat_key_ ## member(struct lock_stat *one, \
@@ -71,10 +211,22 @@ struct lock_stat {
SINGLE_KEY(nr_acquired)
SINGLE_KEY(nr_contended)
+SINGLE_KEY(avg_wait_time)
SINGLE_KEY(wait_time_total)
-SINGLE_KEY(wait_time_min)
SINGLE_KEY(wait_time_max)
+static int lock_stat_key_wait_time_min(struct lock_stat *one,
+ struct lock_stat *two)
+{
+ u64 s1 = one->wait_time_min;
+ u64 s2 = two->wait_time_min;
+ if (s1 == ULLONG_MAX)
+ s1 = 0;
+ if (s2 == ULLONG_MAX)
+ s2 = 0;
+ return s1 > s2;
+}
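
wait_time_min is initialised to ULLONG_MAX as a "never waited" sentinel, so its sort comparator above normalises that value to 0 before comparing; otherwise untouched locks would dominate the min-wait ordering. A compact sketch under the same assumption:

#include <limits.h>
#include <stdio.h>

static int key_wait_time_min(unsigned long long a, unsigned long long b)
{
	if (a == ULLONG_MAX)		/* lock was never contended */
		a = 0;
	if (b == ULLONG_MAX)
		b = 0;
	return a > b;
}

int main(void)
{
	printf("%d\n", key_wait_time_min(ULLONG_MAX, 5));	/* 0: sentinel loses */
	return 0;
}
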
+
struct lock_key {
/*
* name: the value for specify by user
@@ -96,6 +248,7 @@ static struct rb_root result; /* place to store sorted data */
struct lock_key keys[] = {
DEF_KEY_LOCK(acquired, nr_acquired),
DEF_KEY_LOCK(contended, nr_contended),
+ DEF_KEY_LOCK(avg_wait, avg_wait_time),
DEF_KEY_LOCK(wait_total, wait_time_total),
DEF_KEY_LOCK(wait_min, wait_time_min),
DEF_KEY_LOCK(wait_max, wait_time_max),
@@ -105,18 +258,20 @@ struct lock_key keys[] = {
{ NULL, NULL }
};
-static void select_key(void)
+static int select_key(void)
{
int i;
for (i = 0; keys[i].name; i++) {
if (!strcmp(keys[i].name, sort_key)) {
compare = keys[i].key;
- return;
+ return 0;
}
}
- die("Unknown compare key:%s\n", sort_key);
+ pr_err("Unknown compare key: %s\n", sort_key);
+
+ return -1;
}
static void insert_to_result(struct lock_stat *st,
@@ -171,190 +326,319 @@ static struct lock_stat *lock_stat_findnew(void *addr, const char *name)
new->addr = addr;
new->name = zalloc(sizeof(char) * strlen(name) + 1);
- if (!new->name)
+ if (!new->name) {
+ free(new);
goto alloc_failed;
- strcpy(new->name, name);
+ }
- /* LOCK_STATE_UNLOCKED == 0 isn't guaranteed forever */
- new->state = LOCK_STATE_UNLOCKED;
+ strcpy(new->name, name);
new->wait_time_min = ULLONG_MAX;
list_add(&new->hash_entry, entry);
return new;
alloc_failed:
- die("memory allocation failed\n");
+ pr_err("memory allocation failed\n");
+ return NULL;
}
-static char const *input_name = "perf.data";
+struct trace_lock_handler {
+ int (*acquire_event)(struct perf_evsel *evsel,
+ struct perf_sample *sample);
-static int profile_cpu = -1;
+ int (*acquired_event)(struct perf_evsel *evsel,
+ struct perf_sample *sample);
-struct raw_event_sample {
- u32 size;
- char data[0];
-};
+ int (*contended_event)(struct perf_evsel *evsel,
+ struct perf_sample *sample);
-struct trace_acquire_event {
- void *addr;
- const char *name;
+ int (*release_event)(struct perf_evsel *evsel,
+ struct perf_sample *sample);
};
-struct trace_acquired_event {
- void *addr;
- const char *name;
-};
+static struct lock_seq_stat *get_seq(struct thread_stat *ts, void *addr)
+{
+ struct lock_seq_stat *seq;
-struct trace_contended_event {
- void *addr;
- const char *name;
-};
+ list_for_each_entry(seq, &ts->seq_list, list) {
+ if (seq->addr == addr)
+ return seq;
+ }
-struct trace_release_event {
- void *addr;
- const char *name;
+ seq = zalloc(sizeof(struct lock_seq_stat));
+ if (!seq) {
+ pr_err("memory allocation failed\n");
+ return NULL;
+ }
+ seq->state = SEQ_STATE_UNINITIALIZED;
+ seq->addr = addr;
+
+ list_add(&seq->list, &ts->seq_list);
+ return seq;
+}
+
+enum broken_state {
+ BROKEN_ACQUIRE,
+ BROKEN_ACQUIRED,
+ BROKEN_CONTENDED,
+ BROKEN_RELEASE,
+ BROKEN_MAX,
};
-struct trace_lock_handler {
- void (*acquire_event)(struct trace_acquire_event *,
- struct event *,
- int cpu,
- u64 timestamp,
- struct thread *thread);
-
- void (*acquired_event)(struct trace_acquired_event *,
- struct event *,
- int cpu,
- u64 timestamp,
- struct thread *thread);
-
- void (*contended_event)(struct trace_contended_event *,
- struct event *,
- int cpu,
- u64 timestamp,
- struct thread *thread);
-
- void (*release_event)(struct trace_release_event *,
- struct event *,
- int cpu,
- u64 timestamp,
- struct thread *thread);
+static int bad_hist[BROKEN_MAX];
+
+enum acquire_flags {
+ TRY_LOCK = 1,
+ READ_LOCK = 2,
};
-static void
-report_lock_acquire_event(struct trace_acquire_event *acquire_event,
- struct event *__event __used,
- int cpu __used,
- u64 timestamp,
- struct thread *thread __used)
+static int report_lock_acquire_event(struct perf_evsel *evsel,
+ struct perf_sample *sample)
{
- struct lock_stat *st;
+ void *addr;
+ struct lock_stat *ls;
+ struct thread_stat *ts;
+ struct lock_seq_stat *seq;
+ const char *name = perf_evsel__strval(evsel, sample, "name");
+ u64 tmp = perf_evsel__intval(evsel, sample, "lockdep_addr");
+ int flag = perf_evsel__intval(evsel, sample, "flag");
+
+ memcpy(&addr, &tmp, sizeof(void *));
+
+ ls = lock_stat_findnew(addr, name);
+ if (!ls)
+ return -ENOMEM;
+ if (ls->discard)
+ return 0;
- st = lock_stat_findnew(acquire_event->addr, acquire_event->name);
+ ts = thread_stat_findnew(sample->tid);
+ if (!ts)
+ return -ENOMEM;
- switch (st->state) {
- case LOCK_STATE_UNLOCKED:
+ seq = get_seq(ts, addr);
+ if (!seq)
+ return -ENOMEM;
+
+ switch (seq->state) {
+ case SEQ_STATE_UNINITIALIZED:
+ case SEQ_STATE_RELEASED:
+ if (!flag) {
+ seq->state = SEQ_STATE_ACQUIRING;
+ } else {
+ if (flag & TRY_LOCK)
+ ls->nr_trylock++;
+ if (flag & READ_LOCK)
+ ls->nr_readlock++;
+ seq->state = SEQ_STATE_READ_ACQUIRED;
+ seq->read_count = 1;
+ ls->nr_acquired++;
+ }
break;
- case LOCK_STATE_LOCKED:
+ case SEQ_STATE_READ_ACQUIRED:
+ if (flag & READ_LOCK) {
+ seq->read_count++;
+ ls->nr_acquired++;
+ goto end;
+ } else {
+ goto broken;
+ }
break;
+ case SEQ_STATE_ACQUIRED:
+ case SEQ_STATE_ACQUIRING:
+ case SEQ_STATE_CONTENDED:
+broken:
+ /* broken lock sequence, discard it */
+ ls->discard = 1;
+ bad_hist[BROKEN_ACQUIRE]++;
+ list_del(&seq->list);
+ free(seq);
+ goto end;
default:
- BUG_ON(1);
+ BUG_ON("Unknown state of lock sequence found!\n");
break;
}
- st->prev_event_time = timestamp;
+ ls->nr_acquire++;
+ seq->prev_event_time = sample->time;
+end:
+ return 0;
}
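
report_lock_acquire_event() is one arm of the per-lock sequence state machine described in the comment block near the top of this file: acquire is only legal from UNINITIALIZED or RELEASED (or as a nested read), and any other transition marks the sequence as broken and discards it. A reduced sketch of just that transition, with the perf plumbing stripped out; the state and flag names mirror the SEQ_STATE_*/TRY_LOCK/READ_LOCK definitions above, everything else is illustrative:

#include <stdio.h>

enum seq_state { UNINITIALIZED, RELEASED, ACQUIRING, ACQUIRED,
		 READ_ACQUIRED, CONTENDED };
enum { TRY_LOCK = 1, READ_LOCK = 2 };

/* Returns 1 when the sequence is broken and must be discarded. */
static int on_acquire(enum seq_state *state, int flag, int *read_count)
{
	switch (*state) {
	case UNINITIALIZED:
	case RELEASED:
		if (!flag) {
			*state = ACQUIRING;	/* normal blocking acquire */
		} else {			/* trylock/readlock succeed at once */
			*state = READ_ACQUIRED;
			*read_count = 1;
		}
		return 0;
	case READ_ACQUIRED:
		if (flag & READ_LOCK) {		/* nested reader is fine */
			(*read_count)++;
			return 0;
		}
		return 1;			/* writer while read-held: broken */
	default:				/* ACQUIRING/ACQUIRED/CONTENDED */
		return 1;			/* double acquire: broken */
	}
}

int main(void)
{
	enum seq_state s = UNINITIALIZED;
	int readers = 0;
	int first, second;

	first = on_acquire(&s, READ_LOCK, &readers);	/* 0: read-acquired */
	second = on_acquire(&s, 0, &readers);		/* 1: broken sequence */
	printf("%d %d\n", first, second);
	return 0;
}
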
-static void
-report_lock_acquired_event(struct trace_acquired_event *acquired_event,
- struct event *__event __used,
- int cpu __used,
- u64 timestamp,
- struct thread *thread __used)
+static int report_lock_acquired_event(struct perf_evsel *evsel,
+ struct perf_sample *sample)
{
- struct lock_stat *st;
+ void *addr;
+ struct lock_stat *ls;
+ struct thread_stat *ts;
+ struct lock_seq_stat *seq;
+ u64 contended_term;
+ const char *name = perf_evsel__strval(evsel, sample, "name");
+ u64 tmp = perf_evsel__intval(evsel, sample, "lockdep_addr");
+
+ memcpy(&addr, &tmp, sizeof(void *));
+
+ ls = lock_stat_findnew(addr, name);
+ if (!ls)
+ return -ENOMEM;
+ if (ls->discard)
+ return 0;
- st = lock_stat_findnew(acquired_event->addr, acquired_event->name);
+ ts = thread_stat_findnew(sample->tid);
+ if (!ts)
+ return -ENOMEM;
- switch (st->state) {
- case LOCK_STATE_UNLOCKED:
- st->state = LOCK_STATE_LOCKED;
- st->nr_acquired++;
+ seq = get_seq(ts, addr);
+ if (!seq)
+ return -ENOMEM;
+
+ switch (seq->state) {
+ case SEQ_STATE_UNINITIALIZED:
+ /* orphan event, do nothing */
+ return 0;
+ case SEQ_STATE_ACQUIRING:
break;
- case LOCK_STATE_LOCKED:
+ case SEQ_STATE_CONTENDED:
+ contended_term = sample->time - seq->prev_event_time;
+ ls->wait_time_total += contended_term;
+ if (contended_term < ls->wait_time_min)
+ ls->wait_time_min = contended_term;
+ if (ls->wait_time_max < contended_term)
+ ls->wait_time_max = contended_term;
break;
+ case SEQ_STATE_RELEASED:
+ case SEQ_STATE_ACQUIRED:
+ case SEQ_STATE_READ_ACQUIRED:
+ /* broken lock sequence, discard it */
+ ls->discard = 1;
+ bad_hist[BROKEN_ACQUIRED]++;
+ list_del(&seq->list);
+ free(seq);
+ goto end;
default:
- BUG_ON(1);
+ BUG_ON("Unknown state of lock sequence found!\n");
break;
}
- st->prev_event_time = timestamp;
+ seq->state = SEQ_STATE_ACQUIRED;
+ ls->nr_acquired++;
+ ls->avg_wait_time = ls->nr_contended ? ls->wait_time_total/ls->nr_contended : 0;
+ seq->prev_event_time = sample->time;
+end:
+ return 0;
}
-static void
-report_lock_contended_event(struct trace_contended_event *contended_event,
- struct event *__event __used,
- int cpu __used,
- u64 timestamp,
- struct thread *thread __used)
+static int report_lock_contended_event(struct perf_evsel *evsel,
+ struct perf_sample *sample)
{
- struct lock_stat *st;
+ void *addr;
+ struct lock_stat *ls;
+ struct thread_stat *ts;
+ struct lock_seq_stat *seq;
+ const char *name = perf_evsel__strval(evsel, sample, "name");
+ u64 tmp = perf_evsel__intval(evsel, sample, "lockdep_addr");
+
+ memcpy(&addr, &tmp, sizeof(void *));
+
+ ls = lock_stat_findnew(addr, name);
+ if (!ls)
+ return -ENOMEM;
+ if (ls->discard)
+ return 0;
- st = lock_stat_findnew(contended_event->addr, contended_event->name);
+ ts = thread_stat_findnew(sample->tid);
+ if (!ts)
+ return -ENOMEM;
- switch (st->state) {
- case LOCK_STATE_UNLOCKED:
- break;
- case LOCK_STATE_LOCKED:
- st->nr_contended++;
+ seq = get_seq(ts, addr);
+ if (!seq)
+ return -ENOMEM;
+
+ switch (seq->state) {
+ case SEQ_STATE_UNINITIALIZED:
+ /* orphan event, do nothing */
+ return 0;
+ case SEQ_STATE_ACQUIRING:
break;
+ case SEQ_STATE_RELEASED:
+ case SEQ_STATE_ACQUIRED:
+ case SEQ_STATE_READ_ACQUIRED:
+ case SEQ_STATE_CONTENDED:
+ /* broken lock sequence, discard it */
+ ls->discard = 1;
+ bad_hist[BROKEN_CONTENDED]++;
+ list_del(&seq->list);
+ free(seq);
+ goto end;
default:
- BUG_ON(1);
+ BUG_ON("Unknown state of lock sequence found!\n");
break;
}
- st->prev_event_time = timestamp;
+ seq->state = SEQ_STATE_CONTENDED;
+ ls->nr_contended++;
+ ls->avg_wait_time = ls->wait_time_total/ls->nr_contended;
+ seq->prev_event_time = sample->time;
+end:
+ return 0;
}
-static void
-report_lock_release_event(struct trace_release_event *release_event,
- struct event *__event __used,
- int cpu __used,
- u64 timestamp,
- struct thread *thread __used)
+static int report_lock_release_event(struct perf_evsel *evsel,
+ struct perf_sample *sample)
{
- struct lock_stat *st;
- u64 hold_time;
+ void *addr;
+ struct lock_stat *ls;
+ struct thread_stat *ts;
+ struct lock_seq_stat *seq;
+ const char *name = perf_evsel__strval(evsel, sample, "name");
+ u64 tmp = perf_evsel__intval(evsel, sample, "lockdep_addr");
+
+ memcpy(&addr, &tmp, sizeof(void *));
+
+ ls = lock_stat_findnew(addr, name);
+ if (!ls)
+ return -ENOMEM;
+ if (ls->discard)
+ return 0;
- st = lock_stat_findnew(release_event->addr, release_event->name);
+ ts = thread_stat_findnew(sample->tid);
+ if (!ts)
+ return -ENOMEM;
- switch (st->state) {
- case LOCK_STATE_UNLOCKED:
- break;
- case LOCK_STATE_LOCKED:
- st->state = LOCK_STATE_UNLOCKED;
- hold_time = timestamp - st->prev_event_time;
+ seq = get_seq(ts, addr);
+ if (!seq)
+ return -ENOMEM;
- if (timestamp < st->prev_event_time) {
- /* terribly, this can happen... */
+ switch (seq->state) {
+ case SEQ_STATE_UNINITIALIZED:
+ goto end;
+ case SEQ_STATE_ACQUIRED:
+ break;
+ case SEQ_STATE_READ_ACQUIRED:
+ seq->read_count--;
+ BUG_ON(seq->read_count < 0);
+ if (!seq->read_count) {
+ ls->nr_release++;
goto end;
}
-
- if (st->wait_time_min > hold_time)
- st->wait_time_min = hold_time;
- if (st->wait_time_max < hold_time)
- st->wait_time_max = hold_time;
- st->wait_time_total += hold_time;
-
- st->nr_release++;
break;
+ case SEQ_STATE_ACQUIRING:
+ case SEQ_STATE_CONTENDED:
+ case SEQ_STATE_RELEASED:
+ /* broken lock sequence, discard it */
+ ls->discard = 1;
+ bad_hist[BROKEN_RELEASE]++;
+ goto free_seq;
default:
- BUG_ON(1);
+ BUG_ON("Unknown state of lock sequence found!\n");
break;
}
+ ls->nr_release++;
+free_seq:
+ list_del(&seq->list);
+ free(seq);
end:
- st->prev_event_time = timestamp;
+ return 0;
}
/* lock oriented handlers */
@@ -368,265 +652,51 @@ static struct trace_lock_handler report_lock_ops = {
static struct trace_lock_handler *trace_handler;
-static void
-process_lock_acquire_event(void *data,
- struct event *event __used,
- int cpu __used,
- u64 timestamp __used,
- struct thread *thread __used)
-{
- struct trace_acquire_event acquire_event;
- u64 tmp; /* this is required for casting... */
-
- tmp = raw_field_value(event, "lockdep_addr", data);
- memcpy(&acquire_event.addr, &tmp, sizeof(void *));
- acquire_event.name = (char *)raw_field_ptr(event, "name", data);
-
- if (trace_handler->acquire_event)
- trace_handler->acquire_event(&acquire_event, event, cpu, timestamp, thread);
-}
-
-static void
-process_lock_acquired_event(void *data,
- struct event *event __used,
- int cpu __used,
- u64 timestamp __used,
- struct thread *thread __used)
-{
- struct trace_acquired_event acquired_event;
- u64 tmp; /* this is required for casting... */
-
- tmp = raw_field_value(event, "lockdep_addr", data);
- memcpy(&acquired_event.addr, &tmp, sizeof(void *));
- acquired_event.name = (char *)raw_field_ptr(event, "name", data);
-
- if (trace_handler->acquire_event)
- trace_handler->acquired_event(&acquired_event, event, cpu, timestamp, thread);
-}
-
-static void
-process_lock_contended_event(void *data,
- struct event *event __used,
- int cpu __used,
- u64 timestamp __used,
- struct thread *thread __used)
+static int perf_evsel__process_lock_acquire(struct perf_evsel *evsel,
+ struct perf_sample *sample)
{
- struct trace_contended_event contended_event;
- u64 tmp; /* this is required for casting... */
-
- tmp = raw_field_value(event, "lockdep_addr", data);
- memcpy(&contended_event.addr, &tmp, sizeof(void *));
- contended_event.name = (char *)raw_field_ptr(event, "name", data);
-
- if (trace_handler->acquire_event)
- trace_handler->contended_event(&contended_event, event, cpu, timestamp, thread);
-}
-
-static void
-process_lock_release_event(void *data,
- struct event *event __used,
- int cpu __used,
- u64 timestamp __used,
- struct thread *thread __used)
-{
- struct trace_release_event release_event;
- u64 tmp; /* this is required for casting... */
-
- tmp = raw_field_value(event, "lockdep_addr", data);
- memcpy(&release_event.addr, &tmp, sizeof(void *));
- release_event.name = (char *)raw_field_ptr(event, "name", data);
-
if (trace_handler->acquire_event)
- trace_handler->release_event(&release_event, event, cpu, timestamp, thread);
-}
-
-static void
-process_raw_event(void *data, int cpu,
- u64 timestamp, struct thread *thread)
-{
- struct event *event;
- int type;
-
- type = trace_parse_common_type(data);
- event = trace_find_event(type);
-
- if (!strcmp(event->name, "lock_acquire"))
- process_lock_acquire_event(data, event, cpu, timestamp, thread);
- if (!strcmp(event->name, "lock_acquired"))
- process_lock_acquired_event(data, event, cpu, timestamp, thread);
- if (!strcmp(event->name, "lock_contended"))
- process_lock_contended_event(data, event, cpu, timestamp, thread);
- if (!strcmp(event->name, "lock_release"))
- process_lock_release_event(data, event, cpu, timestamp, thread);
-}
-
-struct raw_event_queue {
- u64 timestamp;
- int cpu;
- void *data;
- struct thread *thread;
- struct list_head list;
-};
-
-static LIST_HEAD(raw_event_head);
-
-#define FLUSH_PERIOD (5 * NSEC_PER_SEC)
-
-static u64 flush_limit = ULLONG_MAX;
-static u64 last_flush = 0;
-struct raw_event_queue *last_inserted;
-
-static void flush_raw_event_queue(u64 limit)
-{
- struct raw_event_queue *tmp, *iter;
-
- list_for_each_entry_safe(iter, tmp, &raw_event_head, list) {
- if (iter->timestamp > limit)
- return;
-
- if (iter == last_inserted)
- last_inserted = NULL;
-
- process_raw_event(iter->data, iter->cpu, iter->timestamp,
- iter->thread);
-
- last_flush = iter->timestamp;
- list_del(&iter->list);
- free(iter->data);
- free(iter);
- }
-}
-
-static void __queue_raw_event_end(struct raw_event_queue *new)
-{
- struct raw_event_queue *iter;
-
- list_for_each_entry_reverse(iter, &raw_event_head, list) {
- if (iter->timestamp < new->timestamp) {
- list_add(&new->list, &iter->list);
- return;
- }
- }
-
- list_add(&new->list, &raw_event_head);
-}
-
-static void __queue_raw_event_before(struct raw_event_queue *new,
- struct raw_event_queue *iter)
-{
- list_for_each_entry_continue_reverse(iter, &raw_event_head, list) {
- if (iter->timestamp < new->timestamp) {
- list_add(&new->list, &iter->list);
- return;
- }
- }
-
- list_add(&new->list, &raw_event_head);
+ return trace_handler->acquire_event(evsel, sample);
+ return 0;
}
-static void __queue_raw_event_after(struct raw_event_queue *new,
- struct raw_event_queue *iter)
+static int perf_evsel__process_lock_acquired(struct perf_evsel *evsel,
+ struct perf_sample *sample)
{
- list_for_each_entry_continue(iter, &raw_event_head, list) {
- if (iter->timestamp > new->timestamp) {
- list_add_tail(&new->list, &iter->list);
- return;
- }
- }
- list_add_tail(&new->list, &raw_event_head);
+ if (trace_handler->acquired_event)
+ return trace_handler->acquired_event(evsel, sample);
+ return 0;
}
-/* The queue is ordered by time */
-static void __queue_raw_event(struct raw_event_queue *new)
+static int perf_evsel__process_lock_contended(struct perf_evsel *evsel,
+ struct perf_sample *sample)
{
- if (!last_inserted) {
- __queue_raw_event_end(new);
- return;
- }
-
- /*
- * Most of the time the current event has a timestamp
- * very close to the last event inserted, unless we just switched
- * to another event buffer. Having a sorting based on a list and
- * on the last inserted event that is close to the current one is
- * probably more efficient than an rbtree based sorting.
- */
- if (last_inserted->timestamp >= new->timestamp)
- __queue_raw_event_before(new, last_inserted);
- else
- __queue_raw_event_after(new, last_inserted);
+ if (trace_handler->contended_event)
+ return trace_handler->contended_event(evsel, sample);
+ return 0;
}
-static void queue_raw_event(void *data, int raw_size, int cpu,
- u64 timestamp, struct thread *thread)
+static int perf_evsel__process_lock_release(struct perf_evsel *evsel,
+ struct perf_sample *sample)
{
- struct raw_event_queue *new;
-
- if (flush_limit == ULLONG_MAX)
- flush_limit = timestamp + FLUSH_PERIOD;
-
- if (timestamp < last_flush) {
- printf("Warning: Timestamp below last timeslice flush\n");
- return;
- }
-
- new = malloc(sizeof(*new));
- if (!new)
- die("Not enough memory\n");
-
- new->timestamp = timestamp;
- new->cpu = cpu;
- new->thread = thread;
-
- new->data = malloc(raw_size);
- if (!new->data)
- die("Not enough memory\n");
-
- memcpy(new->data, data, raw_size);
-
- __queue_raw_event(new);
- last_inserted = new;
-
- /*
- * We want to have a slice of events covering 2 * FLUSH_PERIOD
- * If FLUSH_PERIOD is big enough, it ensures every events that occured
- * in the first half of the timeslice have all been buffered and there
- * are none remaining (we need that because of the weakly ordered
- * event recording we have). Then once we reach the 2 * FLUSH_PERIOD
- * timeslice, we flush the first half to be gentle with the memory
- * (the second half can still get new events in the middle, so wait
- * another period to flush it)
- */
- if (new->timestamp > flush_limit &&
- new->timestamp - flush_limit > FLUSH_PERIOD) {
- flush_limit += FLUSH_PERIOD;
- flush_raw_event_queue(flush_limit);
- }
+ if (trace_handler->release_event)
+ return trace_handler->release_event(evsel, sample);
+ return 0;
}
-static int process_sample_event(event_t *event, struct perf_session *session)
+static void print_bad_events(int bad, int total)
{
- struct thread *thread;
- struct sample_data data;
-
- bzero(&data, sizeof(struct sample_data));
- event__parse_sample(event, session->sample_type, &data);
- thread = perf_session__findnew(session, data.pid);
-
- if (thread == NULL) {
- pr_debug("problem processing %d event, skipping it.\n",
- event->header.type);
- return -1;
- }
-
- dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
-
- if (profile_cpu != -1 && profile_cpu != (int) data.cpu)
- return 0;
-
- queue_raw_event(data.raw_data, data.raw_size, data.cpu, data.time, thread);
-
- return 0;
+ /* Output for debug, this has to be removed */
+ int i;
+ const char *name[4] =
+ { "acquire", "acquired", "contended", "release" };
+
+ pr_info("\n=== output for debug===\n\n");
+ pr_info("bad: %d, total: %d\n", bad, total);
+ pr_info("bad rate: %.2f %%\n", (double)bad / (double)total * 100);
+ pr_info("histogram of events caused bad sequence\n");
+ for (i = 0; i < BROKEN_MAX; i++)
+ pr_info(" %10s: %d\n", name[i], bad_hist[i]);
}
/* TODO: various way to print, coloring, nano or milli sec */
@@ -634,26 +704,31 @@ static void print_result(void)
{
struct lock_stat *st;
char cut_name[20];
+ int bad, total;
- printf("%18s ", "ID");
- printf("%20s ", "Name");
- printf("%10s ", "acquired");
- printf("%10s ", "contended");
+ pr_info("%20s ", "Name");
+ pr_info("%10s ", "acquired");
+ pr_info("%10s ", "contended");
- printf("%15s ", "total wait (ns)");
- printf("%15s ", "max wait (ns)");
- printf("%15s ", "min wait (ns)");
+ pr_info("%15s ", "avg wait (ns)");
+ pr_info("%15s ", "total wait (ns)");
+ pr_info("%15s ", "max wait (ns)");
+ pr_info("%15s ", "min wait (ns)");
- printf("\n\n");
+ pr_info("\n\n");
+ bad = total = 0;
while ((st = pop_from_result())) {
+ total++;
+ if (st->discard) {
+ bad++;
+ continue;
+ }
bzero(cut_name, 20);
- printf("%p ", st->addr);
-
if (strlen(st->name) < 16) {
/* output raw name */
- printf("%20s ", st->name);
+ pr_info("%20s ", st->name);
} else {
strncpy(cut_name, st->name, 16);
cut_name[16] = '.';
@@ -661,18 +736,40 @@ static void print_result(void)
cut_name[18] = '.';
cut_name[19] = '\0';
/* cut off name for saving output style */
- printf("%20s ", cut_name);
+ pr_info("%20s ", cut_name);
}
- printf("%10u ", st->nr_acquired);
- printf("%10u ", st->nr_contended);
+ pr_info("%10u ", st->nr_acquired);
+ pr_info("%10u ", st->nr_contended);
- printf("%15llu ", st->wait_time_total);
- printf("%15llu ", st->wait_time_max);
- printf("%15llu ", st->wait_time_min == ULLONG_MAX ?
+ pr_info("%15" PRIu64 " ", st->avg_wait_time);
+ pr_info("%15" PRIu64 " ", st->wait_time_total);
+ pr_info("%15" PRIu64 " ", st->wait_time_max);
+ pr_info("%15" PRIu64 " ", st->wait_time_min == ULLONG_MAX ?
0 : st->wait_time_min);
- printf("\n");
+ pr_info("\n");
}
+
+ print_bad_events(bad, total);
+}
+
+static bool info_threads, info_map;
+
+static void dump_threads(void)
+{
+ struct thread_stat *st;
+ struct rb_node *node;
+ struct thread *t;
+
+ pr_info("%10s: comm\n", "Thread ID");
+
+ node = rb_first(&thread_stats);
+ while (node) {
+ st = container_of(node, struct thread_stat, rb);
+ t = perf_session__findnew(session, st->tid);
+ pr_info("%10d: %s\n", st->tid, thread__comm_str(t));
+ node = rb_next(node);
+ };
}
static void dump_map(void)
@@ -680,27 +777,54 @@ static void dump_map(void)
unsigned int i;
struct lock_stat *st;
+ pr_info("Address of instance: name of class\n");
for (i = 0; i < LOCKHASH_SIZE; i++) {
list_for_each_entry(st, &lockhash_table[i], hash_entry) {
- printf("%p: %s\n", st->addr, st->name);
+ pr_info(" %p: %s\n", st->addr, st->name);
}
}
}
-static struct perf_event_ops eops = {
- .sample = process_sample_event,
- .comm = event__process_comm,
-};
+static int dump_info(void)
+{
+ int rc = 0;
-static struct perf_session *session;
+ if (info_threads)
+ dump_threads();
+ else if (info_map)
+ dump_map();
+ else {
+ rc = -1;
+ pr_err("Unknown type of information\n");
+ }
+
+ return rc;
+}
+
+typedef int (*tracepoint_handler)(struct perf_evsel *evsel,
+ struct perf_sample *sample);
-static int read_events(void)
+static int process_sample_event(struct perf_tool *tool __maybe_unused,
+ union perf_event *event,
+ struct perf_sample *sample,
+ struct perf_evsel *evsel,
+ struct machine *machine)
{
- session = perf_session__new(input_name, O_RDONLY, 0);
- if (!session)
- die("Initializing perf session failed\n");
+ struct thread *thread = machine__findnew_thread(machine, sample->pid,
+ sample->tid);
- return perf_session__process_events(session, &eops);
+ if (thread == NULL) {
+ pr_debug("problem processing %d event, skipping it.\n",
+ event->header.type);
+ return -1;
+ }
+
+ if (evsel->handler != NULL) {
+ tracepoint_handler f = evsel->handler;
+ return f(evsel, sample);
+ }
+
+ return 0;
}
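
process_sample_event() above does no event-specific work itself; each evsel carries a handler function pointer (installed via perf_session__set_tracepoints_handlers() further down) and the sample loop simply calls through it. A generic sketch of that dispatch pattern with simplified stand-in types, not the real perf structures:

#include <stdio.h>

struct sample { unsigned int tid; unsigned long long time; };
struct evsel {
	const char *name;
	int (*handler)(struct evsel *evsel, struct sample *sample);
};

static int handle_acquire(struct evsel *evsel, struct sample *sample)
{
	printf("%s: tid=%u t=%llu\n", evsel->name, sample->tid, sample->time);
	return 0;
}

static int deliver(struct evsel *evsel, struct sample *sample)
{
	if (evsel->handler != NULL)		/* per-event callback, if any */
		return evsel->handler(evsel, sample);
	return 0;				/* no handler: silently ignore */
}

int main(void)
{
	struct evsel ev = { .name = "lock:lock_acquire", .handler = handle_acquire };
	struct sample s = { .tid = 42, .time = 1000 };

	return deliver(&ev, &s);
}
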
static void sort_result(void)
@@ -715,82 +839,147 @@ static void sort_result(void)
}
}
-static void __cmd_report(void)
+static const struct perf_evsel_str_handler lock_tracepoints[] = {
+ { "lock:lock_acquire", perf_evsel__process_lock_acquire, }, /* CONFIG_LOCKDEP */
+ { "lock:lock_acquired", perf_evsel__process_lock_acquired, }, /* CONFIG_LOCKDEP, CONFIG_LOCK_STAT */
+ { "lock:lock_contended", perf_evsel__process_lock_contended, }, /* CONFIG_LOCKDEP, CONFIG_LOCK_STAT */
+ { "lock:lock_release", perf_evsel__process_lock_release, }, /* CONFIG_LOCKDEP */
+};
+
+static int __cmd_report(bool display_info)
{
- setup_pager();
- select_key();
- read_events();
- flush_raw_event_queue(ULLONG_MAX);
- sort_result();
- print_result();
-}
+ int err = -EINVAL;
+ struct perf_tool eops = {
+ .sample = process_sample_event,
+ .comm = perf_event__process_comm,
+ .ordered_samples = true,
+ };
+ struct perf_data_file file = {
+ .path = input_name,
+ .mode = PERF_DATA_MODE_READ,
+ };
+
+ session = perf_session__new(&file, false, &eops);
+ if (!session) {
+ pr_err("Initializing perf session failed\n");
+ return -ENOMEM;
+ }
-static const char * const report_usage[] = {
- "perf lock report [<options>]",
- NULL
-};
+ if (!perf_session__has_traces(session, "lock record"))
+ goto out_delete;
-static const struct option report_options[] = {
- OPT_STRING('k', "key", &sort_key, "acquired",
- "key for sorting"),
- /* TODO: type */
- OPT_END()
-};
+ if (perf_session__set_tracepoints_handlers(session, lock_tracepoints)) {
+ pr_err("Initializing perf session tracepoint handlers failed\n");
+ goto out_delete;
+ }
-static const char * const lock_usage[] = {
- "perf lock [<options>] {record|trace|report}",
- NULL
-};
+ if (select_key())
+ goto out_delete;
-static const struct option lock_options[] = {
- OPT_STRING('i', "input", &input_name, "file", "input file name"),
- OPT_BOOLEAN('v', "verbose", &verbose, "be more verbose (show symbol address, etc)"),
- OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, "dump raw trace in ASCII"),
- OPT_END()
-};
+ err = perf_session__process_events(session, &eops);
+ if (err)
+ goto out_delete;
-static const char *record_args[] = {
- "record",
- "-a",
- "-R",
- "-f",
- "-m", "1024",
- "-c", "1",
- "-e", "lock:lock_acquire:r",
- "-e", "lock:lock_acquired:r",
- "-e", "lock:lock_contended:r",
- "-e", "lock:lock_release:r",
-};
+ setup_pager();
+ if (display_info) /* used for info subcommand */
+ err = dump_info();
+ else {
+ sort_result();
+ print_result();
+ }
+
+out_delete:
+ perf_session__delete(session);
+ return err;
+}
static int __cmd_record(int argc, const char **argv)
{
- unsigned int rec_argc, i, j;
+ const char *record_args[] = {
+ "record", "-R", "-m", "1024", "-c", "1",
+ };
+ unsigned int rec_argc, i, j, ret;
const char **rec_argv;
+ for (i = 0; i < ARRAY_SIZE(lock_tracepoints); i++) {
+ if (!is_valid_tracepoint(lock_tracepoints[i].name)) {
+ pr_err("tracepoint %s is not enabled. "
+ "Are CONFIG_LOCKDEP and CONFIG_LOCK_STAT enabled?\n",
+ lock_tracepoints[i].name);
+ return 1;
+ }
+ }
+
rec_argc = ARRAY_SIZE(record_args) + argc - 1;
+ /* factor of 2 is for -e in front of each tracepoint */
+ rec_argc += 2 * ARRAY_SIZE(lock_tracepoints);
+
rec_argv = calloc(rec_argc + 1, sizeof(char *));
+ if (!rec_argv)
+ return -ENOMEM;
for (i = 0; i < ARRAY_SIZE(record_args); i++)
rec_argv[i] = strdup(record_args[i]);
+ for (j = 0; j < ARRAY_SIZE(lock_tracepoints); j++) {
+ rec_argv[i++] = "-e";
+ rec_argv[i++] = strdup(lock_tracepoints[j].name);
+ }
+
for (j = 1; j < (unsigned int)argc; j++, i++)
rec_argv[i] = argv[j];
BUG_ON(i != rec_argc);
- return cmd_record(i, rec_argv, NULL);
+ ret = cmd_record(i, rec_argv, NULL);
+ free(rec_argv);
+ return ret;
}
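
The rewritten __cmd_record() above sizes its argument vector as the fixed record arguments plus two slots per tracepoint, since every event contributes a "-e"/"<name>" pair. A standalone sketch of that sizing and fill, with an illustrative tracepoint list:

#include <stdio.h>
#include <stdlib.h>

#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))

static const char *record_args[] = { "record", "-R", "-m", "1024", "-c", "1" };
static const char *tracepoints[] = {
	"lock:lock_acquire", "lock:lock_acquired",
	"lock:lock_contended", "lock:lock_release",
};

int main(int argc, const char **argv)
{
	unsigned int rec_argc, i, j;
	const char **rec_argv;

	rec_argc = ARRAY_SIZE(record_args) + (argc - 1);
	rec_argc += 2 * ARRAY_SIZE(tracepoints);	/* "-e" + name per event */

	rec_argv = calloc(rec_argc + 1, sizeof(char *));
	if (!rec_argv)
		return 1;

	for (i = 0; i < ARRAY_SIZE(record_args); i++)
		rec_argv[i] = record_args[i];
	for (j = 0; j < ARRAY_SIZE(tracepoints); j++) {
		rec_argv[i++] = "-e";
		rec_argv[i++] = tracepoints[j];
	}
	for (j = 1; j < (unsigned int)argc; j++, i++)	/* user's extra options */
		rec_argv[i] = argv[j];

	for (j = 0; j < i; j++)
		printf("%s%c", rec_argv[j], j + 1 == i ? '\n' : ' ');
	free(rec_argv);
	return 0;
}
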
-int cmd_lock(int argc, const char **argv, const char *prefix __used)
+int cmd_lock(int argc, const char **argv, const char *prefix __maybe_unused)
{
+ const struct option info_options[] = {
+ OPT_BOOLEAN('t', "threads", &info_threads,
+ "dump thread list in perf.data"),
+ OPT_BOOLEAN('m', "map", &info_map,
+ "map of lock instances (address:name table)"),
+ OPT_END()
+ };
+ const struct option lock_options[] = {
+ OPT_STRING('i', "input", &input_name, "file", "input file name"),
+ OPT_INCR('v', "verbose", &verbose, "be more verbose (show symbol address, etc)"),
+ OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, "dump raw trace in ASCII"),
+ OPT_END()
+ };
+ const struct option report_options[] = {
+ OPT_STRING('k', "key", &sort_key, "acquired",
+ "key for sorting (acquired / contended / avg_wait / wait_total / wait_max / wait_min)"),
+ /* TODO: type */
+ OPT_END()
+ };
+ const char * const info_usage[] = {
+ "perf lock info [<options>]",
+ NULL
+ };
+ const char *const lock_subcommands[] = { "record", "report", "script",
+ "info", NULL };
+ const char *lock_usage[] = {
+ NULL,
+ NULL
+ };
+ const char * const report_usage[] = {
+ "perf lock report [<options>]",
+ NULL
+ };
unsigned int i;
+ int rc = 0;
symbol__init();
for (i = 0; i < LOCKHASH_SIZE; i++)
INIT_LIST_HEAD(lockhash_table + i);
- argc = parse_options(argc, argv, lock_options, lock_usage,
- PARSE_OPT_STOP_AT_NON_OPTION);
+ argc = parse_options_subcommand(argc, argv, lock_options, lock_subcommands,
+ lock_usage, PARSE_OPT_STOP_AT_NON_OPTION);
if (!argc)
usage_with_options(lock_usage, lock_options);
@@ -804,19 +993,23 @@ int cmd_lock(int argc, const char **argv, const char *prefix __used)
if (argc)
usage_with_options(report_usage, report_options);
}
- __cmd_report();
- } else if (!strcmp(argv[0], "trace")) {
- /* Aliased to 'perf trace' */
- return cmd_trace(argc, argv, prefix);
- } else if (!strcmp(argv[0], "map")) {
+ rc = __cmd_report(false);
+ } else if (!strcmp(argv[0], "script")) {
+ /* Aliased to 'perf script' */
+ return cmd_script(argc, argv, prefix);
+ } else if (!strcmp(argv[0], "info")) {
+ if (argc) {
+ argc = parse_options(argc, argv,
+ info_options, info_usage, 0);
+ if (argc)
+ usage_with_options(info_usage, info_options);
+ }
/* recycling report_lock_ops */
trace_handler = &report_lock_ops;
- setup_pager();
- read_events();
- dump_map();
+ rc = __cmd_report(true);
} else {
usage_with_options(lock_usage, lock_options);
}
- return 0;
+ return rc;
}
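
cmd_lock(), like cmd_kvm() and cmd_mem() elsewhere in this patch, dispatches on a short prefix so that "rec", "reco" and "record" all select the same subcommand. A minimal sketch of that strncmp() prefix dispatch with dummy handlers:

#include <stdio.h>
#include <string.h>

static int do_record(void) { puts("record"); return 0; }
static int do_report(void) { puts("report"); return 0; }

static int dispatch(const char *cmd)
{
	if (!strncmp(cmd, "rec", 3))
		return do_record();
	if (!strncmp(cmd, "rep", 3))
		return do_report();
	fprintf(stderr, "unknown subcommand: %s\n", cmd);
	return -1;
}

int main(void)
{
	return dispatch("reco");	/* matched by the "rec" prefix */
}
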
diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c
new file mode 100644
index 00000000000..4a1a6c94a5e
--- /dev/null
+++ b/tools/perf/builtin-mem.c
@@ -0,0 +1,246 @@
+#include "builtin.h"
+#include "perf.h"
+
+#include "util/parse-options.h"
+#include "util/trace-event.h"
+#include "util/tool.h"
+#include "util/session.h"
+#include "util/data.h"
+
+#define MEM_OPERATION_LOAD "load"
+#define MEM_OPERATION_STORE "store"
+
+static const char *mem_operation = MEM_OPERATION_LOAD;
+
+struct perf_mem {
+ struct perf_tool tool;
+ char const *input_name;
+ bool hide_unresolved;
+ bool dump_raw;
+ const char *cpu_list;
+ DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
+};
+
+static int __cmd_record(int argc, const char **argv)
+{
+ int rec_argc, i = 0, j;
+ const char **rec_argv;
+ char event[64];
+ int ret;
+
+ rec_argc = argc + 4;
+ rec_argv = calloc(rec_argc + 1, sizeof(char *));
+ if (!rec_argv)
+ return -1;
+
+ rec_argv[i++] = strdup("record");
+ if (!strcmp(mem_operation, MEM_OPERATION_LOAD))
+ rec_argv[i++] = strdup("-W");
+ rec_argv[i++] = strdup("-d");
+ rec_argv[i++] = strdup("-e");
+
+ if (strcmp(mem_operation, MEM_OPERATION_LOAD))
+ sprintf(event, "cpu/mem-stores/pp");
+ else
+ sprintf(event, "cpu/mem-loads/pp");
+
+ rec_argv[i++] = strdup(event);
+ for (j = 1; j < argc; j++, i++)
+ rec_argv[i] = argv[j];
+
+ ret = cmd_record(i, rec_argv, NULL);
+ free(rec_argv);
+ return ret;
+}
+
+static int
+dump_raw_samples(struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_sample *sample,
+ struct machine *machine)
+{
+ struct perf_mem *mem = container_of(tool, struct perf_mem, tool);
+ struct addr_location al;
+ const char *fmt;
+
+ if (perf_event__preprocess_sample(event, machine, &al, sample) < 0) {
+ fprintf(stderr, "problem processing %d event, skipping it.\n",
+ event->header.type);
+ return -1;
+ }
+
+ if (al.filtered || (mem->hide_unresolved && al.sym == NULL))
+ return 0;
+
+ if (al.map != NULL)
+ al.map->dso->hit = 1;
+
+ if (symbol_conf.field_sep) {
+ fmt = "%d%s%d%s0x%"PRIx64"%s0x%"PRIx64"%s%"PRIu64
+ "%s0x%"PRIx64"%s%s:%s\n";
+ } else {
+ fmt = "%5d%s%5d%s0x%016"PRIx64"%s0x016%"PRIx64
+ "%s%5"PRIu64"%s0x%06"PRIx64"%s%s:%s\n";
+ symbol_conf.field_sep = " ";
+ }
+
+ printf(fmt,
+ sample->pid,
+ symbol_conf.field_sep,
+ sample->tid,
+ symbol_conf.field_sep,
+ sample->ip,
+ symbol_conf.field_sep,
+ sample->addr,
+ symbol_conf.field_sep,
+ sample->weight,
+ symbol_conf.field_sep,
+ sample->data_src,
+ symbol_conf.field_sep,
+ al.map ? (al.map->dso ? al.map->dso->long_name : "???") : "???",
+ al.sym ? al.sym->name : "???");
+
+ return 0;
+}
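
dump_raw_samples() picks one of two printf formats: a compact, separator-delimited one when the user supplied -x for machine parsing, and fixed-width zero-padded columns otherwise (with the separator defaulting to spaces). A small sketch of the same switch, printing only a pid and an address:

#include <inttypes.h>
#include <stdio.h>

static void print_sample(const char *field_sep, int pid, uint64_t addr)
{
	const char *fmt, *sep = field_sep;

	if (sep) {
		fmt = "%d%s0x%" PRIx64 "\n";		/* compact, separator-delimited */
	} else {
		fmt = "%5d%s0x%016" PRIx64 "\n";	/* aligned, zero-padded columns */
		sep = " ";
	}
	printf(fmt, pid, sep, addr);
}

int main(void)
{
	print_sample(NULL, 42, 0xffffffff81000000ULL);
	print_sample(",", 42, 0xffffffff81000000ULL);
	return 0;
}
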
+
+static int process_sample_event(struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_sample *sample,
+ struct perf_evsel *evsel __maybe_unused,
+ struct machine *machine)
+{
+ return dump_raw_samples(tool, event, sample, machine);
+}
+
+static int report_raw_events(struct perf_mem *mem)
+{
+ struct perf_data_file file = {
+ .path = input_name,
+ .mode = PERF_DATA_MODE_READ,
+ };
+ int err = -EINVAL;
+ int ret;
+ struct perf_session *session = perf_session__new(&file, false,
+ &mem->tool);
+
+ if (session == NULL)
+ return -ENOMEM;
+
+ if (mem->cpu_list) {
+ ret = perf_session__cpu_bitmap(session, mem->cpu_list,
+ mem->cpu_bitmap);
+ if (ret)
+ goto out_delete;
+ }
+
+ if (symbol__init() < 0)
+ return -1;
+
+ printf("# PID, TID, IP, ADDR, LOCAL WEIGHT, DSRC, SYMBOL\n");
+
+ err = perf_session__process_events(session, &mem->tool);
+ if (err)
+ return err;
+
+ return 0;
+
+out_delete:
+ perf_session__delete(session);
+ return err;
+}
+
+static int report_events(int argc, const char **argv, struct perf_mem *mem)
+{
+ const char **rep_argv;
+ int ret, i = 0, j, rep_argc;
+
+ if (mem->dump_raw)
+ return report_raw_events(mem);
+
+ rep_argc = argc + 3;
+ rep_argv = calloc(rep_argc + 1, sizeof(char *));
+ if (!rep_argv)
+ return -1;
+
+ rep_argv[i++] = strdup("report");
+ rep_argv[i++] = strdup("--mem-mode");
+ rep_argv[i++] = strdup("-n"); /* display number of samples */
+
+ /*
+ * there is no weight (cost) associated with stores, so don't print
+ * the column
+ */
+ if (strcmp(mem_operation, MEM_OPERATION_LOAD))
+ rep_argv[i++] = strdup("--sort=mem,sym,dso,symbol_daddr,"
+ "dso_daddr,tlb,locked");
+
+ for (j = 1; j < argc; j++, i++)
+ rep_argv[i] = argv[j];
+
+ ret = cmd_report(i, rep_argv, NULL);
+ free(rep_argv);
+ return ret;
+}
+
+int cmd_mem(int argc, const char **argv, const char *prefix __maybe_unused)
+{
+ struct stat st;
+ struct perf_mem mem = {
+ .tool = {
+ .sample = process_sample_event,
+ .mmap = perf_event__process_mmap,
+ .mmap2 = perf_event__process_mmap2,
+ .comm = perf_event__process_comm,
+ .lost = perf_event__process_lost,
+ .fork = perf_event__process_fork,
+ .build_id = perf_event__process_build_id,
+ .ordered_samples = true,
+ },
+ .input_name = "perf.data",
+ };
+ const struct option mem_options[] = {
+ OPT_STRING('t', "type", &mem_operation,
+ "type", "memory operations(load/store)"),
+ OPT_BOOLEAN('D', "dump-raw-samples", &mem.dump_raw,
+ "dump raw samples in ASCII"),
+ OPT_BOOLEAN('U', "hide-unresolved", &mem.hide_unresolved,
+ "Only display entries resolved to a symbol"),
+ OPT_STRING('i', "input", &input_name, "file",
+ "input file name"),
+ OPT_STRING('C', "cpu", &mem.cpu_list, "cpu",
+ "list of cpus to profile"),
+ OPT_STRING('x', "field-separator", &symbol_conf.field_sep,
+ "separator",
+ "separator for columns, no spaces will be added"
+ " between columns '.' is reserved."),
+ OPT_END()
+ };
+ const char *const mem_subcommands[] = { "record", "report", NULL };
+ const char *mem_usage[] = {
+ NULL,
+ NULL
+ };
+
+
+ argc = parse_options_subcommand(argc, argv, mem_options, mem_subcommands,
+ mem_usage, PARSE_OPT_STOP_AT_NON_OPTION);
+
+ if (!argc || !(strncmp(argv[0], "rec", 3) || mem_operation))
+ usage_with_options(mem_usage, mem_options);
+
+ if (!mem.input_name || !strlen(mem.input_name)) {
+ if (!fstat(STDIN_FILENO, &st) && S_ISFIFO(st.st_mode))
+ mem.input_name = "-";
+ else
+ mem.input_name = "perf.data";
+ }
+
+ if (!strncmp(argv[0], "rec", 3))
+ return __cmd_record(argc, argv);
+ else if (!strncmp(argv[0], "rep", 3))
+ return report_events(argc, argv, &mem);
+ else
+ usage_with_options(mem_usage, mem_options);
+
+ return 0;
+}
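
__cmd_record() in perf mem maps the -t load/store choice onto the corresponding precise memory event before passing it to record via -e (loads additionally get -W to capture the weight). A trivial sketch of that mapping; the event strings are the ones used above:

#include <stdio.h>
#include <string.h>

static const char *mem_event(const char *mem_operation)
{
	/* loads also need -W for weight; only the event name is shown here */
	if (strcmp(mem_operation, "load") == 0)
		return "cpu/mem-loads/pp";
	return "cpu/mem-stores/pp";
}

int main(void)
{
	printf("-e %s\n", mem_event("load"));
	printf("-e %s\n", mem_event("store"));
	return 0;
}
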
diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c
index 152d6c9b1fa..c63fa292507 100644
--- a/tools/perf/builtin-probe.c
+++ b/tools/perf/builtin-probe.c
@@ -20,7 +20,6 @@
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
*/
-#define _GNU_SOURCE
#include <sys/utsname.h>
#include <sys/types.h>
#include <sys/stat.h>
@@ -31,153 +30,298 @@
#include <stdlib.h>
#include <string.h>
-#undef _GNU_SOURCE
#include "perf.h"
#include "builtin.h"
#include "util/util.h"
#include "util/strlist.h"
-#include "util/event.h"
-#include "util/debug.h"
-#include "util/debugfs.h"
+#include "util/strfilter.h"
#include "util/symbol.h"
-#include "util/thread.h"
+#include "util/debug.h"
+#include <api/fs/debugfs.h>
#include "util/parse-options.h"
-#include "util/parse-events.h" /* For debugfs_path */
#include "util/probe-finder.h"
#include "util/probe-event.h"
-#define MAX_PATH_LEN 256
+#define DEFAULT_VAR_FILTER "!__k???tab_* & !__crc_*"
+#define DEFAULT_FUNC_FILTER "!_*"
/* Session management structure */
static struct {
- bool need_dwarf;
bool list_events;
bool force_add;
bool show_lines;
- int nr_probe;
- struct probe_point probes[MAX_PROBES];
+ bool show_vars;
+ bool show_ext_vars;
+ bool show_funcs;
+ bool mod_events;
+ bool uprobes;
+ int nevents;
+ struct perf_probe_event events[MAX_PROBES];
struct strlist *dellist;
- struct map_groups kmap_groups;
- struct map *kmaps[MAP__NR_TYPES];
struct line_range line_range;
-} session;
-
+ char *target;
+ int max_probe_points;
+ struct strfilter *filter;
+} params;
/* Parse an event definition. Note that any error must die. */
-static void parse_probe_event(const char *str)
+static int parse_probe_event(const char *str)
{
- struct probe_point *pp = &session.probes[session.nr_probe];
+ struct perf_probe_event *pev = &params.events[params.nevents];
+ int ret;
+
+ pr_debug("probe-definition(%d): %s\n", params.nevents, str);
+ if (++params.nevents == MAX_PROBES) {
+ pr_err("Too many probes (> %d) were specified.", MAX_PROBES);
+ return -1;
+ }
- pr_debug("probe-definition(%d): %s\n", session.nr_probe, str);
- if (++session.nr_probe == MAX_PROBES)
- die("Too many probes (> %d) are specified.", MAX_PROBES);
+ pev->uprobes = params.uprobes;
- /* Parse perf-probe event into probe_point */
- parse_perf_probe_event(str, pp, &session.need_dwarf);
+ /* Parse a perf-probe command into event */
+ ret = parse_perf_probe_command(str, pev);
+ pr_debug("%d arguments\n", pev->nargs);
- pr_debug("%d arguments\n", pp->nr_args);
+ return ret;
}
-static void parse_probe_event_argv(int argc, const char **argv)
+static int set_target(const char *ptr)
{
- int i, len;
+ int found = 0;
+ const char *buf;
+
+ /*
+ * The first argument after options can be an absolute path
+ * to an executable / library or kernel module.
+ *
+ * TODO: Support relative path, and $PATH, $LD_LIBRARY_PATH,
+ * short module name.
+ */
+ if (!params.target && ptr && *ptr == '/') {
+ params.target = strdup(ptr);
+ if (!params.target)
+ return -ENOMEM;
+
+ found = 1;
+ buf = ptr + (strlen(ptr) - 3);
+
+ if (strcmp(buf, ".ko"))
+ params.uprobes = true;
+
+ }
+
+ return found;
+}
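
set_target() treats a leading '/' as an absolute target path and then looks at the last three characters: anything that does not end in ".ko" is assumed to be a user-space binary, so uprobes are selected. A sketch of that suffix test, with an added length guard that the original leaves implicit:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool wants_uprobes(const char *path)
{
	size_t len = strlen(path);

	if (path[0] != '/')		/* only absolute paths are targets */
		return false;
	if (len < 3)			/* too short to carry a ".ko" suffix */
		return true;
	return strcmp(path + len - 3, ".ko") != 0;
}

int main(void)
{
	printf("%d %d\n", wants_uprobes("/bin/ls"),
	       wants_uprobes("/lib/modules/ext4.ko"));	/* 1 0 */
	return 0;
}
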
+
+static int parse_probe_event_argv(int argc, const char **argv)
+{
+ int i, len, ret, found_target;
char *buf;
+ found_target = set_target(argv[0]);
+ if (found_target < 0)
+ return found_target;
+
+ if (found_target && argc == 1)
+ return 0;
+
/* Bind up rest arguments */
len = 0;
- for (i = 0; i < argc; i++)
+ for (i = 0; i < argc; i++) {
+ if (i == 0 && found_target)
+ continue;
+
len += strlen(argv[i]) + 1;
+ }
buf = zalloc(len + 1);
- if (!buf)
- die("Failed to allocate memory for binding arguments.");
+ if (buf == NULL)
+ return -ENOMEM;
len = 0;
- for (i = 0; i < argc; i++)
+ for (i = 0; i < argc; i++) {
+ if (i == 0 && found_target)
+ continue;
+
len += sprintf(&buf[len], "%s ", argv[i]);
- parse_probe_event(buf);
+ }
+ params.mod_events = true;
+ ret = parse_probe_event(buf);
free(buf);
+ return ret;
}
-static int opt_add_probe_event(const struct option *opt __used,
- const char *str, int unset __used)
+static int opt_add_probe_event(const struct option *opt __maybe_unused,
+ const char *str, int unset __maybe_unused)
{
- if (str)
- parse_probe_event(str);
- return 0;
+ if (str) {
+ params.mod_events = true;
+ return parse_probe_event(str);
+ } else
+ return 0;
}
-static int opt_del_probe_event(const struct option *opt __used,
- const char *str, int unset __used)
+static int opt_del_probe_event(const struct option *opt __maybe_unused,
+ const char *str, int unset __maybe_unused)
{
if (str) {
- if (!session.dellist)
- session.dellist = strlist__new(true, NULL);
- strlist__add(session.dellist, str);
+ params.mod_events = true;
+ if (!params.dellist)
+ params.dellist = strlist__new(true, NULL);
+ strlist__add(params.dellist, str);
}
return 0;
}
-/* Currently just checking function name from symbol map */
-static void evaluate_probe_point(struct probe_point *pp)
+static int opt_set_target(const struct option *opt, const char *str,
+ int unset __maybe_unused)
{
- struct symbol *sym;
- sym = map__find_symbol_by_name(session.kmaps[MAP__FUNCTION],
- pp->function, NULL);
- if (!sym)
- die("Kernel symbol \'%s\' not found - probe not added.",
- pp->function);
+ int ret = -ENOENT;
+ char *tmp;
+
+ if (str && !params.target) {
+ if (!strcmp(opt->long_name, "exec"))
+ params.uprobes = true;
+#ifdef HAVE_DWARF_SUPPORT
+ else if (!strcmp(opt->long_name, "module"))
+ params.uprobes = false;
+#endif
+ else
+ return ret;
+
+ /* Expand given path to absolute path, except for modulename */
+ if (params.uprobes || strchr(str, '/')) {
+ tmp = realpath(str, NULL);
+ if (!tmp) {
+ pr_warning("Failed to get the absolute path of %s: %m\n", str);
+ return ret;
+ }
+ } else {
+ tmp = strdup(str);
+ if (!tmp)
+ return -ENOMEM;
+ }
+ params.target = tmp;
+ ret = 0;
+ }
+
+ return ret;
}
-#ifndef NO_DWARF_SUPPORT
-static int open_vmlinux(void)
+#ifdef HAVE_DWARF_SUPPORT
+static int opt_show_lines(const struct option *opt __maybe_unused,
+ const char *str, int unset __maybe_unused)
{
- if (map__load(session.kmaps[MAP__FUNCTION], NULL) < 0) {
- pr_debug("Failed to load kernel map.\n");
+ int ret = 0;
+
+ if (!str)
+ return 0;
+
+ if (params.show_lines) {
+ pr_warning("Warning: more than one --line options are"
+ " detected. Only the first one is valid.\n");
+ return 0;
+ }
+
+ params.show_lines = true;
+ ret = parse_line_range_desc(str, &params.line_range);
+
+ return ret;
+}
+
+static int opt_show_vars(const struct option *opt __maybe_unused,
+ const char *str, int unset __maybe_unused)
+{
+ struct perf_probe_event *pev = &params.events[params.nevents];
+ int ret;
+
+ if (!str)
+ return 0;
+
+ ret = parse_probe_event(str);
+ if (!ret && pev->nargs != 0) {
+ pr_err(" Error: '--vars' doesn't accept arguments.\n");
return -EINVAL;
}
- pr_debug("Try to open %s\n",
- session.kmaps[MAP__FUNCTION]->dso->long_name);
- return open(session.kmaps[MAP__FUNCTION]->dso->long_name, O_RDONLY);
+ params.show_vars = true;
+
+ return ret;
}
+#endif
-static int opt_show_lines(const struct option *opt __used,
- const char *str, int unset __used)
+static int opt_set_filter(const struct option *opt __maybe_unused,
+ const char *str, int unset __maybe_unused)
{
- if (str)
- parse_line_range_desc(str, &session.line_range);
- INIT_LIST_HEAD(&session.line_range.line_list);
- session.show_lines = true;
+ const char *err;
+
+ if (str) {
+ pr_debug2("Set filter: %s\n", str);
+ if (params.filter)
+ strfilter__delete(params.filter);
+ params.filter = strfilter__new(str, &err);
+ if (!params.filter) {
+ pr_err("Filter parse error at %td.\n", err - str + 1);
+ pr_err("Source: \"%s\"\n", str);
+ pr_err(" %*c\n", (int)(err - str + 1), '^');
+ return -EINVAL;
+ }
+ }
+
return 0;
}
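
opt_set_filter() turns the error pointer returned by strfilter__new() into a 1-based column via (err - str + 1) and prints a caret underneath the offending character. A self-contained sketch of that diagnostic; parse_filter() is a trivial stand-in that fails on the first '?':

#include <stdio.h>
#include <string.h>

/* Fails on the first '?' and reports its position through *err. */
static int parse_filter(const char *str, const char **err)
{
	const char *bad = strchr(str, '?');

	*err = bad;
	return bad ? -1 : 0;
}

static void report_error(const char *str, const char *err)
{
	int col = (int)(err - str) + 1;		/* 1-based column of the bad char */

	fprintf(stderr, "Filter parse error at %d.\n", col);
	fprintf(stderr, "Source: \"%s\"\n", str);
	fprintf(stderr, "%*c\n", col + 9, '^');	/* +9 skips the 'Source: "' prefix */
}

int main(void)
{
	const char *str = "foo_* & bar?", *err;

	if (parse_filter(str, &err))
		report_error(str, err);
	return 0;
}
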
-#endif
-static const char * const probe_usage[] = {
- "perf probe [<options>] 'PROBEDEF' ['PROBEDEF' ...]",
- "perf probe [<options>] --add 'PROBEDEF' [--add 'PROBEDEF' ...]",
- "perf probe [<options>] --del '[GROUP:]EVENT' ...",
- "perf probe --list",
-#ifndef NO_DWARF_SUPPORT
- "perf probe --line 'LINEDESC'",
+static int init_params(void)
+{
+ return line_range__init(&params.line_range);
+}
+
+static void cleanup_params(void)
+{
+ int i;
+
+ for (i = 0; i < params.nevents; i++)
+ clear_perf_probe_event(params.events + i);
+ if (params.dellist)
+ strlist__delete(params.dellist);
+ line_range__clear(&params.line_range);
+ free(params.target);
+ if (params.filter)
+ strfilter__delete(params.filter);
+ memset(&params, 0, sizeof(params));
+}
+
+static void pr_err_with_code(const char *msg, int err)
+{
+ pr_err("%s", msg);
+ pr_debug(" Reason: %s (Code: %d)", strerror(-err), err);
+ pr_err("\n");
+}
+
+static int
+__cmd_probe(int argc, const char **argv, const char *prefix __maybe_unused)
+{
+ const char * const probe_usage[] = {
+ "perf probe [<options>] 'PROBEDEF' ['PROBEDEF' ...]",
+ "perf probe [<options>] --add 'PROBEDEF' [--add 'PROBEDEF' ...]",
+ "perf probe [<options>] --del '[GROUP:]EVENT' ...",
+ "perf probe --list",
+#ifdef HAVE_DWARF_SUPPORT
+ "perf probe [<options>] --line 'LINEDESC'",
+ "perf probe [<options>] --vars 'PROBEPOINT'",
#endif
- NULL
+ NULL
};
-
-static const struct option options[] = {
- OPT_BOOLEAN('v', "verbose", &verbose,
+ const struct option options[] = {
+ OPT_INCR('v', "verbose", &verbose,
"be more verbose (show parsed arguments, etc)"),
-#ifndef NO_DWARF_SUPPORT
- OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
- "file", "vmlinux pathname"),
-#endif
- OPT_BOOLEAN('l', "list", &session.list_events,
+ OPT_BOOLEAN('l', "list", &params.list_events,
"list up current probe events"),
OPT_CALLBACK('d', "del", NULL, "[GROUP:]EVENT", "delete a probe event.",
opt_del_probe_event),
OPT_CALLBACK('a', "add", NULL,
-#ifdef NO_DWARF_SUPPORT
- "[EVENT=]FUNC[+OFF|%return] [ARG ...]",
-#else
+#ifdef HAVE_DWARF_SUPPORT
"[EVENT=]FUNC[@SRC][+OFF|%return|:RL|;PT]|SRC:AL|SRC;PT"
- " [ARG ...]",
+ " [[NAME=]ARG ...]",
+#else
+ "[EVENT=]FUNC[+OFF|%return] [[NAME=]ARG ...]",
#endif
"probe point definition, where\n"
"\t\tGROUP:\tGroup name (optional)\n"
@@ -185,51 +329,53 @@ static const struct option options[] = {
"\t\tFUNC:\tFunction name\n"
"\t\tOFF:\tOffset from function entry (in byte)\n"
"\t\t%return:\tPut the probe at function return\n"
-#ifdef NO_DWARF_SUPPORT
- "\t\tARG:\tProbe argument (only \n"
-#else
+#ifdef HAVE_DWARF_SUPPORT
"\t\tSRC:\tSource code path\n"
"\t\tRL:\tRelative line number from function entry.\n"
"\t\tAL:\tAbsolute line number in file.\n"
"\t\tPT:\tLazy expression of line code.\n"
"\t\tARG:\tProbe argument (local variable name or\n"
-#endif
"\t\t\tkprobe-tracer argument format.)\n",
+#else
+ "\t\tARG:\tProbe argument (kprobe-tracer argument format.)\n",
+#endif
opt_add_probe_event),
- OPT_BOOLEAN('f', "force", &session.force_add, "forcibly add events"
+ OPT_BOOLEAN('f', "force", &params.force_add, "forcibly add events"
" with existing name"),
-#ifndef NO_DWARF_SUPPORT
+#ifdef HAVE_DWARF_SUPPORT
OPT_CALLBACK('L', "line", NULL,
- "FUNC[:RLN[+NUM|:RLN2]]|SRC:ALN[+NUM|:ALN2]",
+ "FUNC[:RLN[+NUM|-RLN2]]|SRC:ALN[+NUM|-ALN2]",
"Show source code lines.", opt_show_lines),
+ OPT_CALLBACK('V', "vars", NULL,
+ "FUNC[@SRC][+OFF|%return|:RL|;PT]|SRC:AL|SRC;PT",
+ "Show accessible variables on PROBEDEF", opt_show_vars),
+ OPT_BOOLEAN('\0', "externs", &params.show_ext_vars,
+ "Show external variables too (with --vars only)"),
+ OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
+ "file", "vmlinux pathname"),
+ OPT_STRING('s', "source", &symbol_conf.source_prefix,
+ "directory", "path to kernel source"),
+ OPT_CALLBACK('m', "module", NULL, "modname|path",
+ "target module name (for online) or path (for offline)",
+ opt_set_target),
#endif
+ OPT__DRY_RUN(&probe_event_dry_run),
+ OPT_INTEGER('\0', "max-probes", &params.max_probe_points,
+ "Set how many probe points can be found for a probe."),
+ OPT_BOOLEAN('F', "funcs", &params.show_funcs,
+ "Show potential probe-able functions."),
+ OPT_CALLBACK('\0', "filter", NULL,
+ "[!]FILTER", "Set a filter (with --vars/funcs only)\n"
+ "\t\t\t(default: \"" DEFAULT_VAR_FILTER "\" for --vars,\n"
+ "\t\t\t \"" DEFAULT_FUNC_FILTER "\" for --funcs)",
+ opt_set_filter),
+ OPT_CALLBACK('x', "exec", NULL, "executable|path",
+ "target executable name or path", opt_set_target),
+ OPT_BOOLEAN(0, "demangle", &symbol_conf.demangle,
+ "Disable symbol demangling"),
OPT_END()
-};
-
-/* Initialize symbol maps for vmlinux */
-static void init_vmlinux(void)
-{
- symbol_conf.sort_by_name = true;
- if (symbol_conf.vmlinux_name == NULL)
- symbol_conf.try_vmlinux_path = true;
- else
- pr_debug("Use vmlinux: %s\n", symbol_conf.vmlinux_name);
- if (symbol__init() < 0)
- die("Failed to init symbol map.");
-
- map_groups__init(&session.kmap_groups);
- if (map_groups__create_kernel_maps(&session.kmap_groups,
- session.kmaps) < 0)
- die("Failed to create kernel maps.");
-}
-
-int cmd_probe(int argc, const char **argv, const char *prefix __used)
-{
- int i, ret;
-#ifndef NO_DWARF_SUPPORT
- int fd;
-#endif
- struct probe_point *pp;
+ };
+ int ret;
argc = parse_options(argc, argv, options, probe_usage,
PARSE_OPT_STOP_AT_NON_OPTION);
@@ -238,123 +384,147 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used)
pr_warning(" Error: '-' is not supported.\n");
usage_with_options(probe_usage, options);
}
- parse_probe_event_argv(argc, argv);
+ ret = parse_probe_event_argv(argc, argv);
+ if (ret < 0) {
+ pr_err_with_code(" Error: Command Parse Error.", ret);
+ return ret;
+ }
}
- if ((!session.nr_probe && !session.dellist && !session.list_events &&
- !session.show_lines))
+ if (params.max_probe_points == 0)
+ params.max_probe_points = MAX_PROBES;
+
+ if ((!params.nevents && !params.dellist && !params.list_events &&
+ !params.show_lines && !params.show_funcs))
usage_with_options(probe_usage, options);
- if (debugfs_valid_mountpoint(debugfs_path) < 0)
- die("Failed to find debugfs path.");
+ /*
+ * Only consider the user's kernel image path if given.
+ */
+ symbol_conf.try_vmlinux_path = (symbol_conf.vmlinux_name == NULL);
- if (session.list_events) {
- if (session.nr_probe != 0 || session.dellist) {
- pr_warning(" Error: Don't use --list with"
- " --add/--del.\n");
+ if (params.list_events) {
+ if (params.mod_events) {
+ pr_err(" Error: Don't use --list with --add/--del.\n");
usage_with_options(probe_usage, options);
}
- if (session.show_lines) {
- pr_warning(" Error: Don't use --list with --line.\n");
+ if (params.show_lines) {
+ pr_err(" Error: Don't use --list with --line.\n");
usage_with_options(probe_usage, options);
}
- show_perf_probe_events();
- return 0;
- }
-
-#ifndef NO_DWARF_SUPPORT
- if (session.show_lines) {
- if (session.nr_probe != 0 || session.dellist) {
- pr_warning(" Error: Don't use --line with"
- " --add/--del.\n");
+ if (params.show_vars) {
+ pr_err(" Error: Don't use --list with --vars.\n");
usage_with_options(probe_usage, options);
}
- init_vmlinux();
- fd = open_vmlinux();
- if (fd < 0)
- die("Could not open debuginfo file.");
- ret = find_line_range(fd, &session.line_range);
- if (ret <= 0)
- die("Source line is not found.\n");
- close(fd);
- show_line_range(&session.line_range);
- return 0;
+ if (params.show_funcs) {
+ pr_err(" Error: Don't use --list with --funcs.\n");
+ usage_with_options(probe_usage, options);
+ }
+ if (params.uprobes) {
+ pr_warning(" Error: Don't use --list with --exec.\n");
+ usage_with_options(probe_usage, options);
+ }
+ ret = show_perf_probe_events();
+ if (ret < 0)
+ pr_err_with_code(" Error: Failed to show event list.", ret);
+ return ret;
}
-#endif
-
- if (session.dellist) {
- del_trace_kprobe_events(session.dellist);
- strlist__delete(session.dellist);
- if (session.nr_probe == 0)
- return 0;
+ if (params.show_funcs) {
+ if (params.nevents != 0 || params.dellist) {
+ pr_err(" Error: Don't use --funcs with"
+ " --add/--del.\n");
+ usage_with_options(probe_usage, options);
+ }
+ if (params.show_lines) {
+ pr_err(" Error: Don't use --funcs with --line.\n");
+ usage_with_options(probe_usage, options);
+ }
+ if (params.show_vars) {
+ pr_err(" Error: Don't use --funcs with --vars.\n");
+ usage_with_options(probe_usage, options);
+ }
+ if (!params.filter)
+ params.filter = strfilter__new(DEFAULT_FUNC_FILTER,
+ NULL);
+ ret = show_available_funcs(params.target, params.filter,
+ params.uprobes);
+ strfilter__delete(params.filter);
+ params.filter = NULL;
+ if (ret < 0)
+ pr_err_with_code(" Error: Failed to show functions.", ret);
+ return ret;
}
- /* Add probes */
- init_vmlinux();
-
- if (session.need_dwarf)
-#ifdef NO_DWARF_SUPPORT
- die("Debuginfo-analysis is not supported");
-#else /* !NO_DWARF_SUPPORT */
- pr_debug("Some probes require debuginfo.\n");
-
- fd = open_vmlinux();
- if (fd < 0) {
- if (session.need_dwarf)
- die("Could not open debuginfo file.");
+#ifdef HAVE_DWARF_SUPPORT
+ if (params.show_lines) {
+ if (params.mod_events) {
+ pr_err(" Error: Don't use --line with"
+ " --add/--del.\n");
+ usage_with_options(probe_usage, options);
+ }
+ if (params.show_vars) {
+ pr_err(" Error: Don't use --line with --vars.\n");
+ usage_with_options(probe_usage, options);
+ }
- pr_debug("Could not open vmlinux/module file."
- " Try to use symbols.\n");
- goto end_dwarf;
+ ret = show_line_range(&params.line_range, params.target);
+ if (ret < 0)
+ pr_err_with_code(" Error: Failed to show lines.", ret);
+ return ret;
}
-
- /* Searching probe points */
- for (i = 0; i < session.nr_probe; i++) {
- pp = &session.probes[i];
- if (pp->found)
- continue;
-
- lseek(fd, SEEK_SET, 0);
- ret = find_probe_point(fd, pp);
- if (ret > 0)
- continue;
- if (ret == 0) { /* No error but failed to find probe point. */
- synthesize_perf_probe_point(pp);
- die("Probe point '%s' not found. - probe not added.",
- pp->probes[0]);
+ if (params.show_vars) {
+ if (params.mod_events) {
+ pr_err(" Error: Don't use --vars with"
+ " --add/--del.\n");
+ usage_with_options(probe_usage, options);
}
- /* Error path */
- if (session.need_dwarf) {
- if (ret == -ENOENT)
- pr_warning("No dwarf info found in the vmlinux - please rebuild with CONFIG_DEBUG_INFO=y.\n");
- die("Could not analyze debuginfo.");
+ if (!params.filter)
+ params.filter = strfilter__new(DEFAULT_VAR_FILTER,
+ NULL);
+
+ ret = show_available_vars(params.events, params.nevents,
+ params.max_probe_points,
+ params.target,
+ params.filter,
+ params.show_ext_vars);
+ strfilter__delete(params.filter);
+ params.filter = NULL;
+ if (ret < 0)
+ pr_err_with_code(" Error: Failed to show vars.", ret);
+ return ret;
+ }
+#endif
+
+ if (params.dellist) {
+ ret = del_perf_probe_events(params.dellist);
+ if (ret < 0) {
+ pr_err_with_code(" Error: Failed to delete events.", ret);
+ return ret;
}
- pr_debug("An error occurred in debuginfo analysis."
- " Try to use symbols.\n");
- break;
}
- close(fd);
-end_dwarf:
-#endif /* !NO_DWARF_SUPPORT */
+ if (params.nevents) {
+ ret = add_perf_probe_events(params.events, params.nevents,
+ params.max_probe_points,
+ params.target,
+ params.force_add);
+ if (ret < 0) {
+ pr_err_with_code(" Error: Failed to add events.", ret);
+ return ret;
+ }
+ }
+ return 0;
+}
- /* Synthesize probes without dwarf */
- for (i = 0; i < session.nr_probe; i++) {
- pp = &session.probes[i];
- if (pp->found) /* This probe is already found. */
- continue;
+int cmd_probe(int argc, const char **argv, const char *prefix)
+{
+ int ret;
- evaluate_probe_point(pp);
- ret = synthesize_trace_kprobe_event(pp);
- if (ret == -E2BIG)
- die("probe point definition becomes too long.");
- else if (ret < 0)
- die("Failed to synthesize a probe point.");
+ ret = init_params();
+ if (!ret) {
+ ret = __cmd_probe(argc, argv, prefix);
+ cleanup_params();
}
- /* Settng up probe points */
- add_trace_kprobe_events(session.probes, session.nr_probe,
- session.force_add);
- return 0;
+ return ret;
}
-
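
The reworked cmd_probe() above reduces to an init/run/cleanup wrapper: all mutable command state lives in params, which is initialised before the real work and torn down afterwards whether or not __cmd_probe() succeeded. A minimal standalone sketch of that shape (structure and function names here are placeholders, not perf's):

    #include <stdlib.h>
    #include <string.h>

    struct cmd_params {
        char *filter;       /* owned string, released in cleanup */
        int   max_points;
    };

    static struct cmd_params params;

    static int init_params(void)
    {
        memset(&params, 0, sizeof(params));   /* start from a clean state */
        return 0;
    }

    static void cleanup_params(void)
    {
        free(params.filter);                  /* free(NULL) is a no-op */
        memset(&params, 0, sizeof(params));
    }

    static int run_command(int argc, const char **argv)
    {
        (void)argc; (void)argv;
        return 0;                             /* the real work would go here */
    }

    int cmd_wrapper(int argc, const char **argv)
    {
        int ret = init_params();

        if (!ret) {
            ret = run_command(argc, argv);
            cleanup_params();                 /* runs on success and on failure */
        }
        return ret;
    }
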
diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 3b8b6387c47..378b85b731a 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -5,8 +5,6 @@
* (or a CPU, or a PID) into the perf.data output file - for
* later analysis via perf report.
*/
-#define _FILE_OFFSET_BITS 64
-
#include "builtin.h"
#include "perf.h"
@@ -15,154 +13,71 @@
#include "util/util.h"
#include "util/parse-options.h"
#include "util/parse-events.h"
-#include "util/string.h"
#include "util/header.h"
#include "util/event.h"
+#include "util/evlist.h"
+#include "util/evsel.h"
#include "util/debug.h"
#include "util/session.h"
+#include "util/tool.h"
#include "util/symbol.h"
#include "util/cpumap.h"
+#include "util/thread_map.h"
+#include "util/data.h"
#include <unistd.h>
#include <sched.h>
-
-static int fd[MAX_NR_CPUS][MAX_COUNTERS];
-
-static long default_interval = 0;
-
-static int nr_cpus = 0;
-static unsigned int page_size;
-static unsigned int mmap_pages = 128;
-static int freq = 1000;
-static int output;
-static const char *output_name = "perf.data";
-static int group = 0;
-static unsigned int realtime_prio = 0;
-static int raw_samples = 0;
-static int system_wide = 0;
-static int profile_cpu = -1;
-static pid_t target_pid = -1;
-static pid_t child_pid = -1;
-static int inherit = 1;
-static int force = 0;
-static int append_file = 0;
-static int call_graph = 0;
-static int inherit_stat = 0;
-static int no_samples = 0;
-static int sample_address = 0;
-static int multiplex = 0;
-static int multiplex_fd = -1;
-
-static long samples = 0;
-static struct timeval last_read;
-static struct timeval this_read;
-
-static u64 bytes_written = 0;
-
-static struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
-
-static int nr_poll = 0;
-static int nr_cpu = 0;
-
-static int file_new = 1;
-static off_t post_processing_offset;
-
-static struct perf_session *session;
-
-struct mmap_data {
- int counter;
- void *base;
- unsigned int mask;
- unsigned int prev;
+#include <sys/mman.h>
+
+
+struct record {
+ struct perf_tool tool;
+ struct record_opts opts;
+ u64 bytes_written;
+ struct perf_data_file file;
+ struct perf_evlist *evlist;
+ struct perf_session *session;
+ const char *progname;
+ int realtime_prio;
+ bool no_buildid;
+ bool no_buildid_cache;
+ long samples;
};
-static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
-
-static unsigned long mmap_read_head(struct mmap_data *md)
-{
- struct perf_event_mmap_page *pc = md->base;
- long head;
-
- head = pc->data_head;
- rmb();
-
- return head;
-}
-
-static void mmap_write_tail(struct mmap_data *md, unsigned long tail)
+static int record__write(struct record *rec, void *bf, size_t size)
{
- struct perf_event_mmap_page *pc = md->base;
-
- /*
- * ensure all reads are done before we write the tail out.
- */
- /* mb(); */
- pc->data_tail = tail;
-}
-
-static void write_output(void *buf, size_t size)
-{
- while (size) {
- int ret = write(output, buf, size);
-
- if (ret < 0)
- die("failed to write");
-
- size -= ret;
- buf += ret;
-
- bytes_written += ret;
+ if (perf_data_file__write(rec->session->file, bf, size) < 0) {
+ pr_err("failed to write perf data, error: %m\n");
+ return -1;
}
+
+ rec->bytes_written += size;
+ return 0;
}
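
The old write_output() that this hunk removes looped until the whole buffer was written, because write(2) may return a short count. A standalone sketch of that write-all loop, with EINTR handling added for completeness (plain POSIX, not perf code):

    #include <errno.h>
    #include <unistd.h>

    /* Write the whole buffer, retrying on short writes and on EINTR. */
    static int write_all(int fd, const void *buf, size_t size)
    {
        const char *p = buf;

        while (size) {
            ssize_t ret = write(fd, p, size);

            if (ret < 0) {
                if (errno == EINTR)
                    continue;        /* interrupted before any data was written */
                return -errno;       /* genuine error, report as negative errno */
            }
            p    += ret;
            size -= ret;             /* account for a partial write */
        }
        return 0;
    }
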
-static int process_synthesized_event(event_t *event,
- struct perf_session *self __used)
+static int process_synthesized_event(struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_sample *sample __maybe_unused,
+ struct machine *machine __maybe_unused)
{
- write_output(event, event->header.size);
- return 0;
+ struct record *rec = container_of(tool, struct record, tool);
+ return record__write(rec, event, event->header.size);
}
-static void mmap_read(struct mmap_data *md)
+static int record__mmap_read(struct record *rec, struct perf_mmap *md)
{
- unsigned int head = mmap_read_head(md);
+ unsigned int head = perf_mmap__read_head(md);
unsigned int old = md->prev;
unsigned char *data = md->base + page_size;
unsigned long size;
void *buf;
- int diff;
-
- gettimeofday(&this_read, NULL);
-
- /*
- * If we're further behind than half the buffer, there's a chance
- * the writer will bite our tail and mess up the samples under us.
- *
- * If we somehow ended up ahead of the head, we got messed up.
- *
- * In either case, truncate and restart at head.
- */
- diff = head - old;
- if (diff < 0) {
- struct timeval iv;
- unsigned long msecs;
-
- timersub(&this_read, &last_read, &iv);
- msecs = iv.tv_sec*1000 + iv.tv_usec/1000;
-
- fprintf(stderr, "WARNING: failed to keep up with mmap data."
- " Last read %lu msecs ago.\n", msecs);
-
- /*
- * head points to a known good entry, start there.
- */
- old = head;
- }
+ int rc = 0;
- last_read = this_read;
+ if (old == head)
+ return 0;
- if (old != head)
- samples++;
+ rec->samples++;
size = head - old;
@@ -171,556 +86,875 @@ static void mmap_read(struct mmap_data *md)
size = md->mask + 1 - (old & md->mask);
old += size;
- write_output(buf, size);
+ if (record__write(rec, buf, size) < 0) {
+ rc = -1;
+ goto out;
+ }
}
buf = &data[old & md->mask];
size = head - old;
old += size;
- write_output(buf, size);
+ if (record__write(rec, buf, size) < 0) {
+ rc = -1;
+ goto out;
+ }
md->prev = old;
- mmap_write_tail(md, old);
+ perf_mmap__write_tail(md, old);
+
+out:
+ return rc;
}
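
record__mmap_read() above drains everything between the old tail and the new head; since the mmap'ed area is a power-of-two ring, a span that crosses the end of the buffer is copied in two pieces. A standalone sketch of the same wrap-around arithmetic, using memcpy in place of record__write (names are illustrative):

    #include <string.h>

    /* Drain the bytes in [old, head) from a power-of-two ring buffer into out. */
    static size_t ring_drain(const unsigned char *data, size_t mask,
                             size_t old, size_t head, unsigned char *out)
    {
        size_t total = head - old;

        if ((old & mask) + total > mask + 1) {
            /* The span wraps: copy from the tail end of the buffer first. */
            size_t first = mask + 1 - (old & mask);

            memcpy(out, &data[old & mask], first);
            out += first;
            old += first;
        }
        /* The remaining bytes are contiguous. */
        memcpy(out, &data[old & mask], head - old);
        return total;
    }
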
static volatile int done = 0;
static volatile int signr = -1;
+static volatile int child_finished = 0;
static void sig_handler(int sig)
{
+ if (sig == SIGCHLD)
+ child_finished = 1;
+ else
+ signr = sig;
+
done = 1;
- signr = sig;
}
-static void sig_atexit(void)
+static void record__sig_exit(void)
{
- if (child_pid != -1)
- kill(child_pid, SIGTERM);
-
if (signr == -1)
return;
signal(signr, SIG_DFL);
- kill(getpid(), signr);
+ raise(signr);
}
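
record__sig_exit() restores the default disposition and re-raises the remembered signal, so the process still terminates "killed by signal N" as far as its parent and shell are concerned, instead of exiting normally after cleanup. A standalone sketch of that pattern (plain POSIX):

    #include <signal.h>
    #include <stdlib.h>

    static volatile sig_atomic_t pending_sig = -1;

    static void handler(int sig)
    {
        pending_sig = sig;              /* remember it; finish cleanup first */
    }

    static void reraise_at_exit(void)
    {
        if (pending_sig == -1)
            return;
        signal(pending_sig, SIG_DFL);   /* back to the default action */
        raise(pending_sig);             /* die with the original signal */
    }

    int main(void)
    {
        atexit(reraise_at_exit);
        signal(SIGINT, handler);
        signal(SIGTERM, handler);
        /* ... the main loop would run here until pending_sig is set ... */
        return 0;
    }
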
-static int group_fd;
-
-static struct perf_header_attr *get_header_attr(struct perf_event_attr *a, int nr)
+static int record__open(struct record *rec)
{
- struct perf_header_attr *h_attr;
+ char msg[512];
+ struct perf_evsel *pos;
+ struct perf_evlist *evlist = rec->evlist;
+ struct perf_session *session = rec->session;
+ struct record_opts *opts = &rec->opts;
+ int rc = 0;
- if (nr < session->header.attrs) {
- h_attr = session->header.attr[nr];
- } else {
- h_attr = perf_header_attr__new(a);
- if (h_attr != NULL)
- if (perf_header__add_attr(&session->header, h_attr) < 0) {
- perf_header_attr__delete(h_attr);
- h_attr = NULL;
- }
- }
+ perf_evlist__config(evlist, opts);
- return h_attr;
-}
+ evlist__for_each(evlist, pos) {
+try_again:
+ if (perf_evsel__open(pos, evlist->cpus, evlist->threads) < 0) {
+ if (perf_evsel__fallback(pos, errno, msg, sizeof(msg))) {
+ if (verbose)
+ ui__warning("%s\n", msg);
+ goto try_again;
+ }
-static void create_counter(int counter, int cpu, pid_t pid)
-{
- char *filter = filters[counter];
- struct perf_event_attr *attr = attrs + counter;
- struct perf_header_attr *h_attr;
- int track = !counter; /* only the first counter needs these */
- int ret;
- struct {
- u64 count;
- u64 time_enabled;
- u64 time_running;
- u64 id;
- } read_data;
-
- attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
- PERF_FORMAT_TOTAL_TIME_RUNNING |
- PERF_FORMAT_ID;
-
- attr->sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
-
- if (nr_counters > 1)
- attr->sample_type |= PERF_SAMPLE_ID;
-
- if (freq) {
- attr->sample_type |= PERF_SAMPLE_PERIOD;
- attr->freq = 1;
- attr->sample_freq = freq;
+ rc = -errno;
+ perf_evsel__open_strerror(pos, &opts->target,
+ errno, msg, sizeof(msg));
+ ui__error("%s\n", msg);
+ goto out;
+ }
}
- if (no_samples)
- attr->sample_freq = 0;
-
- if (inherit_stat)
- attr->inherit_stat = 1;
-
- if (sample_address)
- attr->sample_type |= PERF_SAMPLE_ADDR;
-
- if (call_graph)
- attr->sample_type |= PERF_SAMPLE_CALLCHAIN;
-
- if (raw_samples) {
- attr->sample_type |= PERF_SAMPLE_TIME;
- attr->sample_type |= PERF_SAMPLE_RAW;
- attr->sample_type |= PERF_SAMPLE_CPU;
+ if (perf_evlist__apply_filters(evlist)) {
+ error("failed to set filter with %d (%s)\n", errno,
+ strerror(errno));
+ rc = -1;
+ goto out;
}
- attr->mmap = track;
- attr->comm = track;
- attr->inherit = inherit;
- attr->disabled = 1;
-
-try_again:
- fd[nr_cpu][counter] = sys_perf_event_open(attr, pid, cpu, group_fd, 0);
-
- if (fd[nr_cpu][counter] < 0) {
- int err = errno;
-
- if (err == EPERM || err == EACCES)
- die("Permission error - are you root?\n");
- else if (err == ENODEV && profile_cpu != -1)
- die("No such device - did you specify an out-of-range profile CPU?\n");
-
- /*
- * If it's cycles then fall back to hrtimer
- * based cpu-clock-tick sw counter, which
- * is always available even if no PMU support:
- */
- if (attr->type == PERF_TYPE_HARDWARE
- && attr->config == PERF_COUNT_HW_CPU_CYCLES) {
-
- if (verbose)
- warning(" ... trying to fall back to cpu-clock-ticks\n");
- attr->type = PERF_TYPE_SOFTWARE;
- attr->config = PERF_COUNT_SW_CPU_CLOCK;
- goto try_again;
+ if (perf_evlist__mmap(evlist, opts->mmap_pages, false) < 0) {
+ if (errno == EPERM) {
+ pr_err("Permission error mapping pages.\n"
+ "Consider increasing "
+ "/proc/sys/kernel/perf_event_mlock_kb,\n"
+ "or try again with a smaller value of -m/--mmap_pages.\n"
+ "(current value: %u)\n", opts->mmap_pages);
+ rc = -errno;
+ } else {
+ pr_err("failed to mmap with %d (%s)\n", errno, strerror(errno));
+ rc = -errno;
}
- printf("\n");
- error("perfcounter syscall returned with %d (%s)\n",
- fd[nr_cpu][counter], strerror(err));
-
-#if defined(__i386__) || defined(__x86_64__)
- if (attr->type == PERF_TYPE_HARDWARE && err == EOPNOTSUPP)
- die("No hardware sampling interrupt available. No APIC? If so then you can boot the kernel with the \"lapic\" boot parameter to force-enable it.\n");
-#endif
-
- die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
- exit(-1);
+ goto out;
}
- h_attr = get_header_attr(attr, counter);
- if (h_attr == NULL)
- die("nomem\n");
+ session->evlist = evlist;
+ perf_session__set_id_hdr_size(session);
+out:
+ return rc;
+}
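
record__open() retries an event after perf_evsel__fallback() downgrades it, driven by the try_again label. A standalone sketch of the same open-then-fall-back loop in terms of ordinary file descriptors (the two paths are purely illustrative):

    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>

    /*
     * Try the preferred resource first; if it is unavailable, downgrade once
     * to a fallback and retry, otherwise report why the open failed.
     */
    static int open_with_fallback(const char *preferred, const char *fallback)
    {
        const char *path = preferred;
        int fd;

    try_again:
        fd = open(path, O_RDONLY);
        if (fd < 0) {
            if (path == preferred && (errno == ENOENT || errno == EACCES)) {
                fprintf(stderr, "falling back to %s\n", fallback);
                path = fallback;
                goto try_again;
            }
            return -errno;          /* no further fallback available */
        }
        return fd;
    }
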
- if (!file_new) {
- if (memcmp(&h_attr->attr, attr, sizeof(*attr))) {
- fprintf(stderr, "incompatible append\n");
- exit(-1);
- }
- }
+static int process_buildids(struct record *rec)
+{
+ struct perf_data_file *file = &rec->file;
+ struct perf_session *session = rec->session;
+ u64 start = session->header.data_offset;
- if (read(fd[nr_cpu][counter], &read_data, sizeof(read_data)) == -1) {
- perror("Unable to read perf file descriptor\n");
- exit(-1);
- }
+ u64 size = lseek(file->fd, 0, SEEK_CUR);
+ if (size == 0)
+ return 0;
- if (perf_header_attr__add_id(h_attr, read_data.id) < 0) {
- pr_warning("Not enough memory to add id\n");
- exit(-1);
- }
+ return __perf_session__process_events(session, start,
+ size - start,
+ size, &build_id__mark_dso_hit_ops);
+}
- assert(fd[nr_cpu][counter] >= 0);
- fcntl(fd[nr_cpu][counter], F_SETFL, O_NONBLOCK);
+static void perf_event__synthesize_guest_os(struct machine *machine, void *data)
+{
+ int err;
+ struct perf_tool *tool = data;
+	/*
+	 * As for the guest kernel: when processing the record and report
+	 * subcommands we arrange the module mmaps prior to the guest kernel
+	 * mmap and trigger a dso preload, because by default guest module
+	 * symbols are loaded from guest kallsyms instead of
+	 * /lib/modules/XXX/XXX. This avoids missing symbols when the first
+	 * address is in a module rather than in the guest kernel.
+	 */
+ err = perf_event__synthesize_modules(tool, process_synthesized_event,
+ machine);
+ if (err < 0)
+ pr_err("Couldn't record guest kernel [%d]'s reference"
+ " relocation symbol.\n", machine->pid);
/*
- * First counter acts as the group leader:
+ * We use _stext for guest kernel because guest kernel's /proc/kallsyms
+ * have no _text sometimes.
*/
- if (group && group_fd == -1)
- group_fd = fd[nr_cpu][counter];
- if (multiplex && multiplex_fd == -1)
- multiplex_fd = fd[nr_cpu][counter];
+ err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
+ machine);
+ if (err < 0)
+ pr_err("Couldn't record guest kernel [%d]'s reference"
+ " relocation symbol.\n", machine->pid);
+}
- if (multiplex && fd[nr_cpu][counter] != multiplex_fd) {
+static struct perf_event_header finished_round_event = {
+ .size = sizeof(struct perf_event_header),
+ .type = PERF_RECORD_FINISHED_ROUND,
+};
- ret = ioctl(fd[nr_cpu][counter], PERF_EVENT_IOC_SET_OUTPUT, multiplex_fd);
- assert(ret != -1);
- } else {
- event_array[nr_poll].fd = fd[nr_cpu][counter];
- event_array[nr_poll].events = POLLIN;
- nr_poll++;
-
- mmap_array[nr_cpu][counter].counter = counter;
- mmap_array[nr_cpu][counter].prev = 0;
- mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1;
- mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
- PROT_READ|PROT_WRITE, MAP_SHARED, fd[nr_cpu][counter], 0);
- if (mmap_array[nr_cpu][counter].base == MAP_FAILED) {
- error("failed to mmap with %d (%s)\n", errno, strerror(errno));
- exit(-1);
+static int record__mmap_read_all(struct record *rec)
+{
+ int i;
+ int rc = 0;
+
+ for (i = 0; i < rec->evlist->nr_mmaps; i++) {
+ if (rec->evlist->mmap[i].base) {
+ if (record__mmap_read(rec, &rec->evlist->mmap[i]) != 0) {
+ rc = -1;
+ goto out;
+ }
}
}
- if (filter != NULL) {
- ret = ioctl(fd[nr_cpu][counter],
- PERF_EVENT_IOC_SET_FILTER, filter);
- if (ret) {
- error("failed to set filter with %d (%s)\n", errno,
- strerror(errno));
- exit(-1);
- }
- }
+ if (perf_header__has_feat(&rec->session->header, HEADER_TRACING_DATA))
+ rc = record__write(rec, &finished_round_event, sizeof(finished_round_event));
- ioctl(fd[nr_cpu][counter], PERF_EVENT_IOC_ENABLE);
+out:
+ return rc;
}
-static void open_counters(int cpu, pid_t pid)
+static void record__init_features(struct record *rec)
{
- int counter;
+ struct perf_session *session = rec->session;
+ int feat;
- group_fd = -1;
- for (counter = 0; counter < nr_counters; counter++)
- create_counter(counter, cpu, pid);
+ for (feat = HEADER_FIRST_FEATURE; feat < HEADER_LAST_FEATURE; feat++)
+ perf_header__set_feat(&session->header, feat);
- nr_cpu++;
-}
+ if (rec->no_buildid)
+ perf_header__clear_feat(&session->header, HEADER_BUILD_ID);
-static int process_buildids(void)
-{
- u64 size = lseek(output, 0, SEEK_CUR);
+ if (!have_tracepoints(&rec->evlist->entries))
+ perf_header__clear_feat(&session->header, HEADER_TRACING_DATA);
- if (size == 0)
- return 0;
-
- session->fd = output;
- return __perf_session__process_events(session, post_processing_offset,
- size - post_processing_offset,
- size, &build_id__mark_dso_hit_ops);
+ if (!rec->opts.branch_stack)
+ perf_header__clear_feat(&session->header, HEADER_BRANCH_STACK);
}
-static void atexit_header(void)
-{
- session->header.data_size += bytes_written;
+static volatile int workload_exec_errno;
- process_buildids();
- perf_header__write(&session->header, output, true);
+/*
+ * perf_evlist__prepare_workload will send a SIGUSR1
+ * if the fork fails, since we asked for it by setting
+ * its want_signal to true.
+ */
+static void workload_exec_failed_signal(int signo __maybe_unused,
+ siginfo_t *info,
+ void *ucontext __maybe_unused)
+{
+ workload_exec_errno = info->si_value.sival_int;
+ done = 1;
+ child_finished = 1;
}
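
workload_exec_failed_signal() pulls the child's errno out of the SIGUSR1 payload via info->si_value.sival_int, which only works for signals sent with sigqueue() and a handler installed with SA_SIGINFO. A standalone sketch of both ends of that handshake (plain POSIX; here the process signals itself):

    #include <errno.h>
    #include <signal.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    static volatile int exec_errno;

    static void exec_failed(int sig, siginfo_t *info, void *ctx)
    {
        (void)sig; (void)ctx;
        exec_errno = info->si_value.sival_int;   /* errno shipped in the payload */
    }

    int main(void)
    {
        struct sigaction act = { 0 };
        union sigval val = { .sival_int = ENOENT };

        act.sa_flags = SA_SIGINFO;
        act.sa_sigaction = exec_failed;
        sigaction(SIGUSR1, &act, NULL);

        sigqueue(getpid(), SIGUSR1, val);        /* the child side would do this */
        if (exec_errno)
            fprintf(stderr, "workload failed: %s\n", strerror(exec_errno));
        return 0;
    }
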
-static int __cmd_record(int argc, const char **argv)
+static int __cmd_record(struct record *rec, int argc, const char **argv)
{
- int i, counter;
- struct stat st;
- pid_t pid = 0;
- int flags;
int err;
+ int status = 0;
unsigned long waking = 0;
- int child_ready_pipe[2], go_pipe[2];
- const bool forks = target_pid == -1 && argc > 0;
- char buf;
+ const bool forks = argc > 0;
+ struct machine *machine;
+ struct perf_tool *tool = &rec->tool;
+ struct record_opts *opts = &rec->opts;
+ struct perf_data_file *file = &rec->file;
+ struct perf_session *session;
+ bool disabled = false;
- page_size = sysconf(_SC_PAGE_SIZE);
+ rec->progname = argv[0];
- atexit(sig_atexit);
+ atexit(record__sig_exit);
signal(SIGCHLD, sig_handler);
signal(SIGINT, sig_handler);
+ signal(SIGTERM, sig_handler);
- if (forks && (pipe(child_ready_pipe) < 0 || pipe(go_pipe) < 0)) {
- perror("failed to create pipes");
- exit(-1);
+ session = perf_session__new(file, false, NULL);
+ if (session == NULL) {
+ pr_err("Perf session creation failed.\n");
+ return -1;
}
- if (!stat(output_name, &st) && st.st_size) {
- if (!force) {
- if (!append_file) {
- pr_err("Error, output file %s exists, use -A "
- "to append or -f to overwrite.\n",
- output_name);
- exit(-1);
- }
- } else {
- char oldname[PATH_MAX];
- snprintf(oldname, sizeof(oldname), "%s.old",
- output_name);
- unlink(oldname);
- rename(output_name, oldname);
- }
- } else {
- append_file = 0;
- }
+ rec->session = session;
- flags = O_CREAT|O_RDWR;
- if (append_file)
- file_new = 0;
- else
- flags |= O_TRUNC;
+ record__init_features(rec);
- output = open(output_name, flags, S_IRUSR|S_IWUSR);
- if (output < 0) {
- perror("failed to create output file");
- exit(-1);
+ if (forks) {
+ err = perf_evlist__prepare_workload(rec->evlist, &opts->target,
+ argv, file->is_pipe,
+ workload_exec_failed_signal);
+ if (err < 0) {
+ pr_err("Couldn't run the workload!\n");
+ status = err;
+ goto out_delete_session;
+ }
}
- session = perf_session__new(output_name, O_WRONLY, force);
- if (session == NULL) {
- pr_err("Not enough memory for reading perf file header\n");
- return -1;
+ if (record__open(rec) != 0) {
+ err = -1;
+ goto out_child;
}
- if (!file_new) {
- err = perf_header__read(&session->header, output);
+ if (!rec->evlist->nr_groups)
+ perf_header__clear_feat(&session->header, HEADER_GROUP_DESC);
+
+ if (file->is_pipe) {
+ err = perf_header__write_pipe(file->fd);
if (err < 0)
- return err;
+ goto out_child;
+ } else {
+ err = perf_session__write_header(session, rec->evlist,
+ file->fd, false);
+ if (err < 0)
+ goto out_child;
}
- if (raw_samples) {
- perf_header__set_feat(&session->header, HEADER_TRACE_INFO);
- } else {
- for (i = 0; i < nr_counters; i++) {
- if (attrs[i].sample_type & PERF_SAMPLE_RAW) {
- perf_header__set_feat(&session->header, HEADER_TRACE_INFO);
- break;
- }
- }
+ if (!rec->no_buildid
+ && !perf_header__has_feat(&session->header, HEADER_BUILD_ID)) {
+ pr_err("Couldn't generate buildids. "
+ "Use --no-buildid to profile anyway.\n");
+ err = -1;
+ goto out_child;
}
- atexit(atexit_header);
+ machine = &session->machines.host;
- if (forks) {
- pid = fork();
- if (pid < 0) {
- perror("failed to fork");
- exit(-1);
+ if (file->is_pipe) {
+ err = perf_event__synthesize_attrs(tool, session,
+ process_synthesized_event);
+ if (err < 0) {
+ pr_err("Couldn't synthesize attrs.\n");
+ goto out_child;
}
- if (!pid) {
- close(child_ready_pipe[0]);
- close(go_pipe[1]);
- fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC);
-
+ if (have_tracepoints(&rec->evlist->entries)) {
/*
- * Do a dummy execvp to get the PLT entry resolved,
- * so we avoid the resolver overhead on the real
- * execvp call.
+			 * FIXME: err <= 0 here actually means that there
+			 * were no tracepoints, so it's not really an error,
+			 * just that we don't need to synthesize anything.
+			 * We really have to return this more properly and
+			 * also propagate the errors that currently call die().
*/
- execvp("", (char **)argv);
+ err = perf_event__synthesize_tracing_data(tool, file->fd, rec->evlist,
+ process_synthesized_event);
+ if (err <= 0) {
+ pr_err("Couldn't record tracing data.\n");
+ goto out_child;
+ }
+ rec->bytes_written += err;
+ }
+ }
- /*
- * Tell the parent we're ready to go
- */
- close(child_ready_pipe[1]);
+ err = perf_event__synthesize_kernel_mmap(tool, process_synthesized_event,
+ machine);
+ if (err < 0)
+ pr_err("Couldn't record kernel reference relocation symbol\n"
+ "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
+ "Check /proc/kallsyms permission or run as root.\n");
+
+ err = perf_event__synthesize_modules(tool, process_synthesized_event,
+ machine);
+ if (err < 0)
+ pr_err("Couldn't record kernel module information.\n"
+ "Symbol resolution may be skewed if relocation was used (e.g. kexec).\n"
+ "Check /proc/modules permission or run as root.\n");
+
+ if (perf_guest) {
+ machines__process_guests(&session->machines,
+ perf_event__synthesize_guest_os, tool);
+ }
- /*
- * Wait until the parent tells us to go.
- */
- if (read(go_pipe[0], &buf, 1) == -1)
- perror("unable to read pipe");
+ err = __machine__synthesize_threads(machine, tool, &opts->target, rec->evlist->threads,
+ process_synthesized_event, opts->sample_address);
+ if (err != 0)
+ goto out_child;
- execvp(argv[0], (char **)argv);
+ if (rec->realtime_prio) {
+ struct sched_param param;
- perror(argv[0]);
- exit(-1);
+ param.sched_priority = rec->realtime_prio;
+ if (sched_setscheduler(0, SCHED_FIFO, &param)) {
+ pr_err("Could not set realtime priority.\n");
+ err = -1;
+ goto out_child;
}
+ }
+
+ /*
+ * When perf is starting the traced process, all the events
+ * (apart from group members) have enable_on_exec=1 set,
+ * so don't spoil it by prematurely enabling them.
+ */
+ if (!target__none(&opts->target) && !opts->initial_delay)
+ perf_evlist__enable(rec->evlist);
+
+ /*
+ * Let the child rip
+ */
+ if (forks)
+ perf_evlist__start_workload(rec->evlist);
- child_pid = pid;
+ if (opts->initial_delay) {
+ usleep(opts->initial_delay * 1000);
+ perf_evlist__enable(rec->evlist);
+ }
+
+ for (;;) {
+ int hits = rec->samples;
- if (!system_wide)
- target_pid = pid;
+ if (record__mmap_read_all(rec) < 0) {
+ err = -1;
+ goto out_child;
+ }
+
+ if (hits == rec->samples) {
+ if (done)
+ break;
+ err = poll(rec->evlist->pollfd, rec->evlist->nr_fds, -1);
+ /*
+			 * Propagate the error only if there is one; ignore a
+			 * positive number of returned events and EINTR.
+ */
+ if (err > 0 || (err < 0 && errno == EINTR))
+ err = 0;
+ waking++;
+ }
- close(child_ready_pipe[1]);
- close(go_pipe[0]);
/*
- * wait for child to settle
+		 * When perf starts the traced process, at the end the events
+		 * die with the process and we wait for that, so there is no
+		 * need to disable the events in this case.
*/
- if (read(child_ready_pipe[0], &buf, 1) == -1) {
- perror("unable to read pipe");
- exit(-1);
+ if (done && !disabled && !target__none(&opts->target)) {
+ perf_evlist__disable(rec->evlist);
+ disabled = true;
}
- close(child_ready_pipe[0]);
}
+ if (forks && workload_exec_errno) {
+ char msg[512];
+ const char *emsg = strerror_r(workload_exec_errno, msg, sizeof(msg));
+ pr_err("Workload failed: %s\n", emsg);
+ err = -1;
+ goto out_child;
+ }
- if ((!system_wide && !inherit) || profile_cpu != -1) {
- open_counters(profile_cpu, target_pid);
- } else {
- nr_cpus = read_cpu_map();
- for (i = 0; i < nr_cpus; i++)
- open_counters(cpumap[i], target_pid);
+ if (!quiet) {
+ fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
+
+ /*
+ * Approximate RIP event size: 24 bytes.
+ */
+ fprintf(stderr,
+ "[ perf record: Captured and wrote %.3f MB %s (~%" PRIu64 " samples) ]\n",
+ (double)rec->bytes_written / 1024.0 / 1024.0,
+ file->path,
+ rec->bytes_written / 24);
}
- if (file_new) {
- err = perf_header__write(&session->header, output, false);
+out_child:
+ if (forks) {
+ int exit_status;
+
+ if (!child_finished)
+ kill(rec->evlist->workload.pid, SIGTERM);
+
+ wait(&exit_status);
+
if (err < 0)
- return err;
+ status = err;
+ else if (WIFEXITED(exit_status))
+ status = WEXITSTATUS(exit_status);
+ else if (WIFSIGNALED(exit_status))
+ signr = WTERMSIG(exit_status);
+ } else
+ status = err;
+
+ if (!err && !file->is_pipe) {
+ rec->session->header.data_size += rec->bytes_written;
+
+ if (!rec->no_buildid)
+ process_buildids(rec);
+ perf_session__write_header(rec->session, rec->evlist,
+ file->fd, true);
}
- post_processing_offset = lseek(output, 0, SEEK_CUR);
+out_delete_session:
+ perf_session__delete(session);
+ return status;
+}
- err = event__synthesize_kernel_mmap(process_synthesized_event,
- session, "_text");
- if (err < 0) {
- pr_err("Couldn't record kernel reference relocation symbol.\n");
- return err;
- }
+#define BRANCH_OPT(n, m) \
+ { .name = n, .mode = (m) }
- err = event__synthesize_modules(process_synthesized_event, session);
- if (err < 0) {
- pr_err("Couldn't record kernel reference relocation symbol.\n");
- return err;
- }
+#define BRANCH_END { .name = NULL }
- if (!system_wide && profile_cpu == -1)
- event__synthesize_thread(target_pid, process_synthesized_event,
- session);
- else
- event__synthesize_threads(process_synthesized_event, session);
+struct branch_mode {
+ const char *name;
+ int mode;
+};
- if (realtime_prio) {
- struct sched_param param;
+static const struct branch_mode branch_modes[] = {
+ BRANCH_OPT("u", PERF_SAMPLE_BRANCH_USER),
+ BRANCH_OPT("k", PERF_SAMPLE_BRANCH_KERNEL),
+ BRANCH_OPT("hv", PERF_SAMPLE_BRANCH_HV),
+ BRANCH_OPT("any", PERF_SAMPLE_BRANCH_ANY),
+ BRANCH_OPT("any_call", PERF_SAMPLE_BRANCH_ANY_CALL),
+ BRANCH_OPT("any_ret", PERF_SAMPLE_BRANCH_ANY_RETURN),
+ BRANCH_OPT("ind_call", PERF_SAMPLE_BRANCH_IND_CALL),
+ BRANCH_OPT("abort_tx", PERF_SAMPLE_BRANCH_ABORT_TX),
+ BRANCH_OPT("in_tx", PERF_SAMPLE_BRANCH_IN_TX),
+ BRANCH_OPT("no_tx", PERF_SAMPLE_BRANCH_NO_TX),
+ BRANCH_OPT("cond", PERF_SAMPLE_BRANCH_COND),
+ BRANCH_END
+};
- param.sched_priority = realtime_prio;
- if (sched_setscheduler(0, SCHED_FIFO, &param)) {
- pr_err("Could not set realtime priority.\n");
- exit(-1);
- }
- }
+static int
+parse_branch_stack(const struct option *opt, const char *str, int unset)
+{
+#define ONLY_PLM \
+ (PERF_SAMPLE_BRANCH_USER |\
+ PERF_SAMPLE_BRANCH_KERNEL |\
+ PERF_SAMPLE_BRANCH_HV)
+
+ uint64_t *mode = (uint64_t *)opt->value;
+ const struct branch_mode *br;
+ char *s, *os = NULL, *p;
+ int ret = -1;
+
+ if (unset)
+ return 0;
/*
- * Let the child rip
+ * cannot set it twice, -b + --branch-filter for instance
*/
- if (forks)
- close(go_pipe[1]);
-
- for (;;) {
- int hits = samples;
+ if (*mode)
+ return -1;
- for (i = 0; i < nr_cpu; i++) {
- for (counter = 0; counter < nr_counters; counter++) {
- if (mmap_array[i][counter].base)
- mmap_read(&mmap_array[i][counter]);
+ /* str may be NULL in case no arg is passed to -b */
+ if (str) {
+ /* because str is read-only */
+ s = os = strdup(str);
+ if (!s)
+ return -1;
+
+ for (;;) {
+ p = strchr(s, ',');
+ if (p)
+ *p = '\0';
+
+ for (br = branch_modes; br->name; br++) {
+ if (!strcasecmp(s, br->name))
+ break;
+ }
+ if (!br->name) {
+ ui__warning("unknown branch filter %s,"
+ " check man page\n", s);
+ goto error;
}
- }
- if (hits == samples) {
- if (done)
+ *mode |= br->mode;
+
+ if (!p)
break;
- err = poll(event_array, nr_poll, -1);
- waking++;
+
+ s = p + 1;
}
+ }
+ ret = 0;
+
+ /* default to any branch */
+ if ((*mode & ~ONLY_PLM) == 0) {
+ *mode = PERF_SAMPLE_BRANCH_ANY;
+ }
+error:
+ free(os);
+ return ret;
+}
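
parse_branch_stack() above splits the option string on commas and looks each token up in the sentinel-terminated branch_modes[] table, OR-ing the matching bits into a mask. A standalone sketch of that table-driven parse (the table entries below are made up for illustration):

    #include <stdlib.h>
    #include <string.h>
    #include <strings.h>

    struct mode { const char *name; unsigned int bit; };

    /* Sentinel-terminated lookup table, in the spirit of branch_modes[]. */
    static const struct mode modes[] = {
        { "u",   1u << 0 },
        { "k",   1u << 1 },
        { "any", 1u << 2 },
        { NULL,  0 },
    };

    /* Parse e.g. "u,k,any" into a bitmask; returns -1 on an unknown token. */
    static int parse_modes(const char *str, unsigned int *out)
    {
        char *copy = strdup(str);    /* the caller's string may be read-only */
        char *s = copy, *p;
        int ret = -1;

        if (!copy)
            return -1;
        for (;;) {
            const struct mode *m;

            p = strchr(s, ',');
            if (p)
                *p = '\0';
            for (m = modes; m->name; m++)
                if (!strcasecmp(s, m->name))
                    break;
            if (!m->name)
                goto out;            /* unknown token, reject the whole string */
            *out |= m->bit;
            if (!p)
                break;
            s = p + 1;
        }
        ret = 0;
    out:
        free(copy);
        return ret;
    }
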
- if (done) {
- for (i = 0; i < nr_cpu; i++) {
- for (counter = 0; counter < nr_counters; counter++)
- ioctl(fd[i][counter], PERF_EVENT_IOC_DISABLE);
+#ifdef HAVE_DWARF_UNWIND_SUPPORT
+static int get_stack_size(char *str, unsigned long *_size)
+{
+ char *endptr;
+ unsigned long size;
+ unsigned long max_size = round_down(USHRT_MAX, sizeof(u64));
+
+ size = strtoul(str, &endptr, 0);
+
+ do {
+ if (*endptr)
+ break;
+
+ size = round_up(size, sizeof(u64));
+ if (!size || size > max_size)
+ break;
+
+ *_size = size;
+ return 0;
+
+ } while (0);
+
+ pr_err("callchain: Incorrect stack dump size (max %ld): %s\n",
+ max_size, str);
+ return -1;
+}
+#endif /* HAVE_DWARF_UNWIND_SUPPORT */
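
get_stack_size() accepts a user-supplied dump size only if it parses cleanly, rounds up to an 8-byte multiple, and stays within what a 16-bit record length can hold. A standalone sketch of the same validation (macro names are illustrative):

    #include <limits.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define ROUND_UP(x, a)    ((((x) + (a) - 1) / (a)) * (a))
    #define ROUND_DOWN(x, a)  (((x) / (a)) * (a))

    /*
     * Accept a stack dump size only if it is numeric, non-zero once rounded
     * up to an 8-byte multiple, and fits a 16-bit record length.
     */
    static int parse_stack_size(const char *str, unsigned long *out)
    {
        unsigned long align = sizeof(unsigned long long);   /* u64-sized slots */
        unsigned long max = ROUND_DOWN((unsigned long)USHRT_MAX, align);
        char *end;
        unsigned long size = strtoul(str, &end, 0);

        if (*end)
            return -1;              /* trailing junk after the number */
        size = ROUND_UP(size, align);
        if (!size || size > max)
            return -1;              /* zero or larger than a record can hold */
        *out = size;
        return 0;
    }

    int main(int argc, char **argv)
    {
        unsigned long size;

        if (argc > 1 && !parse_stack_size(argv[1], &size))
            printf("stack dump size: %lu\n", size);
        return 0;
    }
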
+
+int record_parse_callchain(const char *arg, struct record_opts *opts)
+{
+ char *tok, *name, *saveptr = NULL;
+ char *buf;
+ int ret = -1;
+
+	/* We need a buffer that we know we can write to. */
+ buf = malloc(strlen(arg) + 1);
+ if (!buf)
+ return -ENOMEM;
+
+ strcpy(buf, arg);
+
+ tok = strtok_r((char *)buf, ",", &saveptr);
+ name = tok ? : (char *)buf;
+
+ do {
+ /* Framepointer style */
+ if (!strncmp(name, "fp", sizeof("fp"))) {
+ if (!strtok_r(NULL, ",", &saveptr)) {
+ opts->call_graph = CALLCHAIN_FP;
+ ret = 0;
+ } else
+ pr_err("callchain: No more arguments "
+ "needed for -g fp\n");
+ break;
+
+#ifdef HAVE_DWARF_UNWIND_SUPPORT
+ /* Dwarf style */
+ } else if (!strncmp(name, "dwarf", sizeof("dwarf"))) {
+ const unsigned long default_stack_dump_size = 8192;
+
+ ret = 0;
+ opts->call_graph = CALLCHAIN_DWARF;
+ opts->stack_dump_size = default_stack_dump_size;
+
+ tok = strtok_r(NULL, ",", &saveptr);
+ if (tok) {
+ unsigned long size = 0;
+
+ ret = get_stack_size(tok, &size);
+ opts->stack_dump_size = size;
}
+#endif /* HAVE_DWARF_UNWIND_SUPPORT */
+ } else {
+ pr_err("callchain: Unknown --call-graph option "
+ "value: %s\n", arg);
+ break;
}
+
+ } while (0);
+
+ free(buf);
+ return ret;
+}
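
record_parse_callchain() copies the const option string into writable storage before tokenising it with strtok_r(), and falls back to a default dump size when the second token is absent. A standalone sketch of that "mode[,size]" split (the 8192 default mirrors the value used above; everything else is illustrative):

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    /* Split "mode[,size]" (e.g. "dwarf,8192") coming from a read-only string. */
    static int parse_mode(const char *arg, char *mode, size_t mode_len,
                          unsigned long *size)
    {
        char *buf = strdup(arg);     /* strtok_r needs writable storage */
        char *saveptr = NULL, *tok;
        int ret = -1;

        if (!buf)
            return -1;
        tok = strtok_r(buf, ",", &saveptr);
        if (tok) {
            snprintf(mode, mode_len, "%s", tok);
            tok = strtok_r(NULL, ",", &saveptr);
            *size = tok ? strtoul(tok, NULL, 0) : 8192;   /* default dump size */
            ret = 0;
        }
        free(buf);
        return ret;
    }
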
+
+static void callchain_debug(struct record_opts *opts)
+{
+ static const char *str[CALLCHAIN_MAX] = { "NONE", "FP", "DWARF" };
+
+ pr_debug("callchain: type %s\n", str[opts->call_graph]);
+
+ if (opts->call_graph == CALLCHAIN_DWARF)
+ pr_debug("callchain: stack dump size %d\n",
+ opts->stack_dump_size);
+}
+
+int record_parse_callchain_opt(const struct option *opt,
+ const char *arg,
+ int unset)
+{
+ struct record_opts *opts = opt->value;
+ int ret;
+
+ opts->call_graph_enabled = !unset;
+
+ /* --no-call-graph */
+ if (unset) {
+ opts->call_graph = CALLCHAIN_NONE;
+ pr_debug("callchain: disabled\n");
+ return 0;
}
- fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
+ ret = record_parse_callchain(arg, opts);
+ if (!ret)
+ callchain_debug(opts);
- /*
- * Approximate RIP event size: 24 bytes.
- */
- fprintf(stderr,
- "[ perf record: Captured and wrote %.3f MB %s (~%lld samples) ]\n",
- (double)bytes_written / 1024.0 / 1024.0,
- output_name,
- bytes_written / 24);
+ return ret;
+}
+
+int record_callchain_opt(const struct option *opt,
+ const char *arg __maybe_unused,
+ int unset __maybe_unused)
+{
+ struct record_opts *opts = opt->value;
+
+ opts->call_graph_enabled = !unset;
+ if (opts->call_graph == CALLCHAIN_NONE)
+ opts->call_graph = CALLCHAIN_FP;
+
+ callchain_debug(opts);
return 0;
}
+static int perf_record_config(const char *var, const char *value, void *cb)
+{
+ struct record *rec = cb;
+
+ if (!strcmp(var, "record.call-graph"))
+ return record_parse_callchain(value, &rec->opts);
+
+ return perf_default_config(var, value, cb);
+}
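
perf_record_config() shows the usual shape of a config callback: claim the keys this command owns and hand everything else to the generic handler so global settings keep working. A standalone sketch of that dispatch (default_config() below stands in for perf_default_config(), which this sketch does not reimplement):

    #include <stdio.h>
    #include <string.h>

    struct opts { char call_graph[32]; };

    /* Stand-in for the generic handler that deals with every other key. */
    static int default_config(const char *var, const char *value, void *cb)
    {
        (void)var; (void)value; (void)cb;
        return 0;
    }

    /* Per-command callback: claim "record.call-graph", delegate the rest. */
    static int record_config(const char *var, const char *value, void *cb)
    {
        struct opts *opts = cb;

        if (!strcmp(var, "record.call-graph")) {
            snprintf(opts->call_graph, sizeof(opts->call_graph), "%s", value);
            return 0;
        }
        return default_config(var, value, cb);
    }
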
+
static const char * const record_usage[] = {
"perf record [<options>] [<command>]",
"perf record [<options>] -- <command> [<options>]",
NULL
};
-static const struct option options[] = {
- OPT_CALLBACK('e', "event", NULL, "event",
+/*
+ * XXX Ideally this would be local to cmd_record() and passed to a record__new
+ * because we need to have access to it in record__exit, which is called
+ * after cmd_record() exits, but since record_options needs to be accessible to
+ * builtin-script, leave it here.
+ *
+ * At least we don't touch it in all the other functions here directly.
+ *
+ * Just say no to tons of global variables, sigh.
+ */
+static struct record record = {
+ .opts = {
+ .mmap_pages = UINT_MAX,
+ .user_freq = UINT_MAX,
+ .user_interval = ULLONG_MAX,
+ .freq = 4000,
+ .target = {
+ .uses_mmap = true,
+ .default_per_cpu = true,
+ },
+ },
+};
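
The static record instance above relies on C99 designated initializers: only the non-zero defaults are spelled out, nested structures are initialised in place, and every unnamed field starts out zeroed. A standalone sketch of the same idiom with made-up types:

    #include <limits.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct target_opts { bool uses_mmap; bool default_per_cpu; };
    struct opts        { unsigned int mmap_pages, freq; struct target_opts target; };
    struct tool_state  { struct opts opts; long samples; };

    /* Name only the non-zero defaults; everything else is implicitly zero. */
    static struct tool_state state = {
        .opts = {
            .mmap_pages = UINT_MAX,     /* "not set by the user yet" marker */
            .freq       = 4000,
            .target     = {
                .uses_mmap       = true,
                .default_per_cpu = true,
            },
        },
    };

    int main(void)
    {
        printf("freq=%u per_cpu=%d samples=%ld\n", state.opts.freq,
               state.opts.target.default_per_cpu, state.samples);
        return 0;
    }
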
+
+#define CALLCHAIN_HELP "setup and enables call-graph (stack chain/backtrace) recording: "
+
+#ifdef HAVE_DWARF_UNWIND_SUPPORT
+const char record_callchain_help[] = CALLCHAIN_HELP "fp dwarf";
+#else
+const char record_callchain_help[] = CALLCHAIN_HELP "fp";
+#endif
+
+/*
+ * XXX This will stay a global variable until we fix builtin-script.c to stop
+ * messing with it and switch to using the library functions in perf_evlist
+ * that came from builtin-record.c, i.e. use record_opts,
+ * perf_evlist__prepare_workload, etc. instead of fork+exec'ing 'perf record'
+ * using pipes, etc.
+ */
+const struct option record_options[] = {
+ OPT_CALLBACK('e', "event", &record.evlist, "event",
"event selector. use 'perf list' to list available events",
- parse_events),
- OPT_CALLBACK(0, "filter", NULL, "filter",
+ parse_events_option),
+ OPT_CALLBACK(0, "filter", &record.evlist, "filter",
"event filter", parse_filter),
- OPT_INTEGER('p', "pid", &target_pid,
- "record events on existing pid"),
- OPT_INTEGER('r', "realtime", &realtime_prio,
+ OPT_STRING('p', "pid", &record.opts.target.pid, "pid",
+ "record events on existing process id"),
+ OPT_STRING('t', "tid", &record.opts.target.tid, "tid",
+ "record events on existing thread id"),
+ OPT_INTEGER('r', "realtime", &record.realtime_prio,
"collect data with this RT SCHED_FIFO priority"),
- OPT_BOOLEAN('R', "raw-samples", &raw_samples,
+ OPT_BOOLEAN(0, "no-buffering", &record.opts.no_buffering,
+ "collect data without buffering"),
+ OPT_BOOLEAN('R', "raw-samples", &record.opts.raw_samples,
"collect raw sample records from all opened counters"),
- OPT_BOOLEAN('a', "all-cpus", &system_wide,
+ OPT_BOOLEAN('a', "all-cpus", &record.opts.target.system_wide,
"system-wide collection from all CPUs"),
- OPT_BOOLEAN('A', "append", &append_file,
- "append to the output file to do incremental profiling"),
- OPT_INTEGER('C', "profile_cpu", &profile_cpu,
- "CPU to profile on"),
- OPT_BOOLEAN('f', "force", &force,
- "overwrite existing data file"),
- OPT_LONG('c', "count", &default_interval,
- "event period to sample"),
- OPT_STRING('o', "output", &output_name, "file",
+ OPT_STRING('C', "cpu", &record.opts.target.cpu_list, "cpu",
+ "list of cpus to monitor"),
+ OPT_U64('c', "count", &record.opts.user_interval, "event period to sample"),
+ OPT_STRING('o', "output", &record.file.path, "file",
"output file name"),
- OPT_BOOLEAN('i', "inherit", &inherit,
- "child tasks inherit counters"),
- OPT_INTEGER('F', "freq", &freq,
- "profile at this frequency"),
- OPT_INTEGER('m', "mmap-pages", &mmap_pages,
- "number of mmap data pages"),
- OPT_BOOLEAN('g', "call-graph", &call_graph,
- "do call-graph (stack chain/backtrace) recording"),
- OPT_BOOLEAN('v', "verbose", &verbose,
+ OPT_BOOLEAN_SET('i', "no-inherit", &record.opts.no_inherit,
+ &record.opts.no_inherit_set,
+ "child tasks do not inherit counters"),
+ OPT_UINTEGER('F', "freq", &record.opts.user_freq, "profile at this frequency"),
+ OPT_CALLBACK('m', "mmap-pages", &record.opts.mmap_pages, "pages",
+ "number of mmap data pages",
+ perf_evlist__parse_mmap_pages),
+ OPT_BOOLEAN(0, "group", &record.opts.group,
+ "put the counters into a counter group"),
+ OPT_CALLBACK_NOOPT('g', NULL, &record.opts,
+ NULL, "enables call-graph recording" ,
+ &record_callchain_opt),
+ OPT_CALLBACK(0, "call-graph", &record.opts,
+ "mode[,dump_size]", record_callchain_help,
+ &record_parse_callchain_opt),
+ OPT_INCR('v', "verbose", &verbose,
"be more verbose (show counter open errors, etc)"),
- OPT_BOOLEAN('s', "stat", &inherit_stat,
+ OPT_BOOLEAN('q', "quiet", &quiet, "don't print any message"),
+ OPT_BOOLEAN('s', "stat", &record.opts.inherit_stat,
"per thread counts"),
- OPT_BOOLEAN('d', "data", &sample_address,
+ OPT_BOOLEAN('d', "data", &record.opts.sample_address,
"Sample addresses"),
- OPT_BOOLEAN('n', "no-samples", &no_samples,
+ OPT_BOOLEAN('T', "timestamp", &record.opts.sample_time, "Sample timestamps"),
+ OPT_BOOLEAN('P', "period", &record.opts.period, "Sample period"),
+ OPT_BOOLEAN('n', "no-samples", &record.opts.no_samples,
"don't sample"),
- OPT_BOOLEAN('M', "multiplex", &multiplex,
- "multiplex counter output in a single channel"),
+ OPT_BOOLEAN('N', "no-buildid-cache", &record.no_buildid_cache,
+ "do not update the buildid cache"),
+ OPT_BOOLEAN('B', "no-buildid", &record.no_buildid,
+ "do not collect buildids in perf.data"),
+ OPT_CALLBACK('G', "cgroup", &record.evlist, "name",
+ "monitor event in cgroup name only",
+ parse_cgroups),
+ OPT_UINTEGER('D', "delay", &record.opts.initial_delay,
+ "ms to wait before starting measurement after program start"),
+ OPT_STRING('u', "uid", &record.opts.target.uid_str, "user",
+ "user to profile"),
+
+ OPT_CALLBACK_NOOPT('b', "branch-any", &record.opts.branch_stack,
+ "branch any", "sample any taken branches",
+ parse_branch_stack),
+
+ OPT_CALLBACK('j', "branch-filter", &record.opts.branch_stack,
+ "branch filter mask", "branch stack filter modes",
+ parse_branch_stack),
+ OPT_BOOLEAN('W', "weight", &record.opts.sample_weight,
+ "sample by weight (on special events only)"),
+ OPT_BOOLEAN(0, "transaction", &record.opts.sample_transaction,
+ "sample transaction flags (special events only)"),
+ OPT_BOOLEAN(0, "per-thread", &record.opts.target.per_thread,
+ "use per-thread mmaps"),
OPT_END()
};
-int cmd_record(int argc, const char **argv, const char *prefix __used)
+int cmd_record(int argc, const char **argv, const char *prefix __maybe_unused)
{
- int counter;
+ int err = -ENOMEM;
+ struct record *rec = &record;
+ char errbuf[BUFSIZ];
+
+ rec->evlist = perf_evlist__new();
+ if (rec->evlist == NULL)
+ return -ENOMEM;
- argc = parse_options(argc, argv, options, record_usage,
+ perf_config(perf_record_config, rec);
+
+ argc = parse_options(argc, argv, record_options, record_usage,
PARSE_OPT_STOP_AT_NON_OPTION);
- if (!argc && target_pid == -1 && !system_wide && profile_cpu == -1)
- usage_with_options(record_usage, options);
+ if (!argc && target__none(&rec->opts.target))
+ usage_with_options(record_usage, record_options);
+
+ if (nr_cgroups && !rec->opts.target.system_wide) {
+ ui__error("cgroup monitoring only available in"
+ " system-wide mode\n");
+ usage_with_options(record_usage, record_options);
+ }
symbol__init();
- if (!nr_counters) {
- nr_counters = 1;
- attrs[0].type = PERF_TYPE_HARDWARE;
- attrs[0].config = PERF_COUNT_HW_CPU_CYCLES;
+ if (symbol_conf.kptr_restrict)
+ pr_warning(
+"WARNING: Kernel address maps (/proc/{kallsyms,modules}) are restricted,\n"
+"check /proc/sys/kernel/kptr_restrict.\n\n"
+"Samples in kernel functions may not be resolved if a suitable vmlinux\n"
+"file is not found in the buildid cache or in the vmlinux path.\n\n"
+"Samples in kernel modules won't be resolved at all.\n\n"
+"If some relocation was applied (e.g. kexec) symbols may be misresolved\n"
+"even with a suitable vmlinux or kallsyms file.\n\n");
+
+ if (rec->no_buildid_cache || rec->no_buildid)
+ disable_buildid_cache();
+
+ if (rec->evlist->nr_entries == 0 &&
+ perf_evlist__add_default(rec->evlist) < 0) {
+ pr_err("Not enough memory for event selector list\n");
+ goto out_symbol_exit;
}
- /*
- * User specified count overrides default frequency.
- */
- if (default_interval)
- freq = 0;
- else if (freq) {
- default_interval = freq;
- } else {
- fprintf(stderr, "frequency and count are zero, aborting\n");
- exit(EXIT_FAILURE);
+ if (rec->opts.target.tid && !rec->opts.no_inherit_set)
+ rec->opts.no_inherit = true;
+
+ err = target__validate(&rec->opts.target);
+ if (err) {
+ target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
+ ui__warning("%s", errbuf);
+ }
+
+ err = target__parse_uid(&rec->opts.target);
+ if (err) {
+ int saved_errno = errno;
+
+ target__strerror(&rec->opts.target, err, errbuf, BUFSIZ);
+ ui__error("%s", errbuf);
+
+ err = -saved_errno;
+ goto out_symbol_exit;
}
- for (counter = 0; counter < nr_counters; counter++) {
- if (attrs[counter].sample_period)
- continue;
+ err = -ENOMEM;
+ if (perf_evlist__create_maps(rec->evlist, &rec->opts.target) < 0)
+ usage_with_options(record_usage, record_options);
- attrs[counter].sample_period = default_interval;
+ if (record_opts__config(&rec->opts)) {
+ err = -EINVAL;
+ goto out_symbol_exit;
}
- return __cmd_record(argc, argv);
+ err = __cmd_record(&record, argc, argv);
+out_symbol_exit:
+ perf_evlist__delete(rec->evlist);
+ symbol__exit();
+ return err;
}
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index f815de25d0f..21d830bafff 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -8,21 +8,24 @@
#include "builtin.h"
#include "util/util.h"
+#include "util/cache.h"
+#include "util/annotate.h"
#include "util/color.h"
#include <linux/list.h>
-#include "util/cache.h"
#include <linux/rbtree.h>
#include "util/symbol.h"
-#include "util/string.h"
#include "util/callchain.h"
#include "util/strlist.h"
#include "util/values.h"
#include "perf.h"
#include "util/debug.h"
+#include "util/evlist.h"
+#include "util/evsel.h"
#include "util/header.h"
#include "util/session.h"
+#include "util/tool.h"
#include "util/parse-options.h"
#include "util/parse-events.h"
@@ -30,445 +33,784 @@
#include "util/thread.h"
#include "util/sort.h"
#include "util/hist.h"
+#include "util/data.h"
+#include "arch/common.h"
+
+#include <dlfcn.h>
+#include <linux/bitmap.h>
+
+struct report {
+ struct perf_tool tool;
+ struct perf_session *session;
+ bool force, use_tui, use_gtk, use_stdio;
+ bool hide_unresolved;
+ bool dont_use_callchains;
+ bool show_full_info;
+ bool show_threads;
+ bool inverted_callchain;
+ bool mem_mode;
+ bool header;
+ bool header_only;
+ int max_stack;
+ struct perf_read_values show_threads_values;
+ const char *pretty_printing_style;
+ const char *cpu_list;
+ const char *symbol_filter_str;
+ float min_percent;
+ u64 nr_entries;
+ DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
+};
-static char const *input_name = "perf.data";
-
-static int force;
-static bool hide_unresolved;
-static bool dont_use_callchains;
-
-static int show_threads;
-static struct perf_read_values show_threads_values;
-
-static char default_pretty_printing_style[] = "normal";
-static char *pretty_printing_style = default_pretty_printing_style;
-
-static char callchain_default_opt[] = "fractal,0.5";
-
-static struct event_stat_id *get_stats(struct perf_session *self,
- u64 event_stream, u32 type, u64 config)
+static int report__config(const char *var, const char *value, void *cb)
{
- struct rb_node **p = &self->stats_by_id.rb_node;
- struct rb_node *parent = NULL;
- struct event_stat_id *iter, *new;
-
- while (*p != NULL) {
- parent = *p;
- iter = rb_entry(parent, struct event_stat_id, rb_node);
- if (iter->config == config)
- return iter;
-
-
- if (config > iter->config)
- p = &(*p)->rb_right;
- else
- p = &(*p)->rb_left;
+ if (!strcmp(var, "report.group")) {
+ symbol_conf.event_group = perf_config_bool(var, value);
+ return 0;
+ }
+ if (!strcmp(var, "report.percent-limit")) {
+ struct report *rep = cb;
+ rep->min_percent = strtof(value, NULL);
+ return 0;
+ }
+ if (!strcmp(var, "report.children")) {
+ symbol_conf.cumulate_callchain = perf_config_bool(var, value);
+ return 0;
}
- new = malloc(sizeof(struct event_stat_id));
- if (new == NULL)
- return NULL;
- memset(new, 0, sizeof(struct event_stat_id));
- new->event_stream = event_stream;
- new->config = config;
- new->type = type;
- rb_link_node(&new->rb_node, parent, p);
- rb_insert_color(&new->rb_node, &self->stats_by_id);
- return new;
+ return perf_default_config(var, value, cb);
}
-static int perf_session__add_hist_entry(struct perf_session *self,
- struct addr_location *al,
- struct sample_data *data)
+static void report__inc_stats(struct report *rep, struct hist_entry *he)
{
- struct symbol **syms = NULL, *parent = NULL;
- bool hit;
- struct hist_entry *he;
- struct event_stat_id *stats;
- struct perf_event_attr *attr;
-
- if ((sort__has_parent || symbol_conf.use_callchain) && data->callchain)
- syms = perf_session__resolve_callchain(self, al->thread,
- data->callchain, &parent);
-
- attr = perf_header__find_attr(data->id, &self->header);
- if (attr)
- stats = get_stats(self, data->id, attr->type, attr->config);
- else
- stats = get_stats(self, data->id, 0, 0);
- if (stats == NULL)
- return -ENOMEM;
- he = __perf_session__add_hist_entry(&stats->hists, al, parent,
- data->period, &hit);
- if (he == NULL)
- return -ENOMEM;
-
- if (hit)
- he->count += data->period;
-
- if (symbol_conf.use_callchain) {
- if (!hit)
- callchain_init(&he->callchain);
- append_chain(&he->callchain, data->callchain, syms);
- free(syms);
- }
-
- return 0;
+ /*
+	 * The @he is either a newly created entry or an existing one that is
+	 * merging the current sample. We only want to count new entries, so
+	 * check that ->nr_events is 1.
+ */
+ if (he->stat.nr_events == 1)
+ rep->nr_entries++;
}
-static int validate_chain(struct ip_callchain *chain, event_t *event)
+static int hist_iter__report_callback(struct hist_entry_iter *iter,
+ struct addr_location *al, bool single,
+ void *arg)
{
- unsigned int chain_size;
+ int err = 0;
+ struct report *rep = arg;
+ struct hist_entry *he = iter->he;
+ struct perf_evsel *evsel = iter->evsel;
+ struct mem_info *mi;
+ struct branch_info *bi;
- chain_size = event->header.size;
- chain_size -= (unsigned long)&event->ip.__more_data - (unsigned long)event;
+ report__inc_stats(rep, he);
- if (chain->nr*sizeof(u64) > chain_size)
- return -1;
+ if (!ui__has_annotation())
+ return 0;
- return 0;
-}
+ if (sort__mode == SORT_MODE__BRANCH) {
+ bi = he->branch_info;
+ err = addr_map_symbol__inc_samples(&bi->from, evsel->idx);
+ if (err)
+ goto out;
-static int add_event_total(struct perf_session *session,
- struct sample_data *data,
- struct perf_event_attr *attr)
-{
- struct event_stat_id *stats;
+ err = addr_map_symbol__inc_samples(&bi->to, evsel->idx);
- if (attr)
- stats = get_stats(session, data->id, attr->type, attr->config);
- else
- stats = get_stats(session, data->id, 0, 0);
+ } else if (rep->mem_mode) {
+ mi = he->mem_info;
+ err = addr_map_symbol__inc_samples(&mi->daddr, evsel->idx);
+ if (err)
+ goto out;
- if (!stats)
- return -ENOMEM;
+ err = hist_entry__inc_addr_samples(he, evsel->idx, al->addr);
- stats->stats.total += data->period;
- session->events_stats.total += data->period;
- return 0;
+ } else if (symbol_conf.cumulate_callchain) {
+ if (single)
+ err = hist_entry__inc_addr_samples(he, evsel->idx,
+ al->addr);
+ } else {
+ err = hist_entry__inc_addr_samples(he, evsel->idx, al->addr);
+ }
+
+out:
+ return err;
}
-static int process_sample_event(event_t *event, struct perf_session *session)
+static int process_sample_event(struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_sample *sample,
+ struct perf_evsel *evsel,
+ struct machine *machine)
{
- struct sample_data data = { .period = 1, };
+ struct report *rep = container_of(tool, struct report, tool);
struct addr_location al;
- struct perf_event_attr *attr;
-
- event__parse_sample(event, session->sample_type, &data);
-
- dump_printf("(IP, %d): %d/%d: %#Lx period: %Ld\n", event->header.misc,
- data.pid, data.tid, data.ip, data.period);
-
- if (session->sample_type & PERF_SAMPLE_CALLCHAIN) {
- unsigned int i;
-
- dump_printf("... chain: nr:%Lu\n", data.callchain->nr);
-
- if (validate_chain(data.callchain, event) < 0) {
- pr_debug("call-chain problem with event, "
- "skipping it.\n");
- return 0;
- }
-
- if (dump_trace) {
- for (i = 0; i < data.callchain->nr; i++)
- dump_printf("..... %2d: %016Lx\n",
- i, data.callchain->ips[i]);
- }
- }
-
- if (event__preprocess_sample(event, session, &al, NULL) < 0) {
- fprintf(stderr, "problem processing %d event, skipping it.\n",
- event->header.type);
+ struct hist_entry_iter iter = {
+ .hide_unresolved = rep->hide_unresolved,
+ .add_entry_cb = hist_iter__report_callback,
+ };
+ int ret;
+
+ if (perf_event__preprocess_sample(event, machine, &al, sample) < 0) {
+ pr_debug("problem processing %d event, skipping it.\n",
+ event->header.type);
return -1;
}
- if (al.filtered || (hide_unresolved && al.sym == NULL))
+ if (rep->hide_unresolved && al.sym == NULL)
return 0;
- if (perf_session__add_hist_entry(session, &al, &data)) {
- pr_debug("problem incrementing symbol count, skipping event\n");
- return -1;
- }
+ if (rep->cpu_list && !test_bit(sample->cpu, rep->cpu_bitmap))
+ return 0;
- attr = perf_header__find_attr(data.id, &session->header);
+ if (sort__mode == SORT_MODE__BRANCH)
+ iter.ops = &hist_iter_branch;
+ else if (rep->mem_mode)
+ iter.ops = &hist_iter_mem;
+ else if (symbol_conf.cumulate_callchain)
+ iter.ops = &hist_iter_cumulative;
+ else
+ iter.ops = &hist_iter_normal;
- if (add_event_total(session, &data, attr)) {
- pr_debug("problem adding event count\n");
- return -1;
- }
+ if (al.map != NULL)
+ al.map->dso->hit = 1;
- return 0;
+ ret = hist_entry_iter__add(&iter, &al, evsel, sample, rep->max_stack,
+ rep);
+ if (ret < 0)
+ pr_debug("problem adding hist entry, skipping event\n");
+
+ return ret;
}
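
process_sample_event() receives only the embedded perf_tool pointer and recovers the surrounding report with container_of(), the same trick process_synthesized_event() uses earlier in this diff. A standalone sketch of that pattern (container_of is written out here rather than taken from a header):

    #include <stddef.h>
    #include <stdio.h>

    /* Recover the enclosing structure from a pointer to one of its members. */
    #define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

    struct tool { int placeholder; };                /* generic callback handle */
    struct report { struct tool tool; int hits; };   /* command-specific state */

    /* Callbacks are handed only the embedded tool, never the report itself. */
    static void on_sample(struct tool *t)
    {
        struct report *rep = container_of(t, struct report, tool);

        rep->hits++;
    }

    int main(void)
    {
        struct report rep = { .hits = 0 };

        on_sample(&rep.tool);
        printf("hits=%d\n", rep.hits);
        return 0;
    }
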
-static int process_read_event(event_t *event, struct perf_session *session __used)
+static int process_read_event(struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_sample *sample __maybe_unused,
+ struct perf_evsel *evsel,
+ struct machine *machine __maybe_unused)
{
- struct perf_event_attr *attr;
-
- attr = perf_header__find_attr(event->read.id, &session->header);
+ struct report *rep = container_of(tool, struct report, tool);
- if (show_threads) {
- const char *name = attr ? __event_name(attr->type, attr->config)
- : "unknown";
- perf_read_values_add_value(&show_threads_values,
+ if (rep->show_threads) {
+ const char *name = evsel ? perf_evsel__name(evsel) : "unknown";
+ perf_read_values_add_value(&rep->show_threads_values,
event->read.pid, event->read.tid,
event->read.id,
name,
event->read.value);
}
- dump_printf(": %d %d %s %Lu\n", event->read.pid, event->read.tid,
- attr ? __event_name(attr->type, attr->config) : "FAIL",
+ dump_printf(": %d %d %s %" PRIu64 "\n", event->read.pid, event->read.tid,
+ evsel ? perf_evsel__name(evsel) : "FAIL",
event->read.value);
return 0;
}
-static int perf_session__setup_sample_type(struct perf_session *self)
+/* For pipe mode, sample_type is not currently set */
+static int report__setup_sample_type(struct report *rep)
{
- if (!(self->sample_type & PERF_SAMPLE_CALLCHAIN)) {
+ struct perf_session *session = rep->session;
+ u64 sample_type = perf_evlist__combined_sample_type(session->evlist);
+ bool is_pipe = perf_data_file__is_pipe(session->file);
+
+ if (!is_pipe && !(sample_type & PERF_SAMPLE_CALLCHAIN)) {
if (sort__has_parent) {
- fprintf(stderr, "selected --sort parent, but no"
- " callchain data. Did you call"
- " perf record without -g?\n");
+ ui__error("Selected --sort parent, but no "
+ "callchain data. Did you call "
+ "'perf record' without -g?\n");
return -EINVAL;
}
if (symbol_conf.use_callchain) {
- fprintf(stderr, "selected -g but no callchain data."
- " Did you call perf record without"
- " -g?\n");
+ ui__error("Selected -g but no callchain data. Did "
+ "you call 'perf record' without -g?\n");
return -1;
}
- } else if (!dont_use_callchains && callchain_param.mode != CHAIN_NONE &&
+ } else if (!rep->dont_use_callchains &&
+ callchain_param.mode != CHAIN_NONE &&
!symbol_conf.use_callchain) {
symbol_conf.use_callchain = true;
- if (register_callchain_param(&callchain_param) < 0) {
- fprintf(stderr, "Can't register callchain"
- " params\n");
+ if (callchain_register_param(&callchain_param) < 0) {
+ ui__error("Can't register callchain params.\n");
return -EINVAL;
}
}
+ if (symbol_conf.cumulate_callchain) {
+ /* Silently ignore if callchain is missing */
+ if (!(sample_type & PERF_SAMPLE_CALLCHAIN)) {
+ symbol_conf.cumulate_callchain = false;
+ perf_hpp__cancel_cumulate();
+ }
+ }
+
+ if (sort__mode == SORT_MODE__BRANCH) {
+ if (!is_pipe &&
+ !(sample_type & PERF_SAMPLE_BRANCH_STACK)) {
+ ui__error("Selected -b but no branch data. "
+				  "Did you call 'perf record' without -b?\n");
+ return -1;
+ }
+ }
+
return 0;
}
-static struct perf_event_ops event_ops = {
- .sample = process_sample_event,
- .mmap = event__process_mmap,
- .comm = event__process_comm,
- .exit = event__process_task,
- .fork = event__process_task,
- .lost = event__process_lost,
- .read = process_read_event,
-};
-
-static int __cmd_report(void)
+static void sig_handler(int sig __maybe_unused)
{
- int ret = -EINVAL;
- struct perf_session *session;
- struct rb_node *next;
-
- session = perf_session__new(input_name, O_RDONLY, force);
- if (session == NULL)
- return -ENOMEM;
+ session_done = 1;
+}
- if (show_threads)
- perf_read_values_init(&show_threads_values);
+static size_t hists__fprintf_nr_sample_events(struct hists *hists, struct report *rep,
+ const char *evname, FILE *fp)
+{
+ size_t ret;
+ char unit;
+ unsigned long nr_samples = hists->stats.nr_events[PERF_RECORD_SAMPLE];
+ u64 nr_events = hists->stats.total_period;
+ struct perf_evsel *evsel = hists_to_evsel(hists);
+ char buf[512];
+ size_t size = sizeof(buf);
+
+ if (symbol_conf.filter_relative) {
+ nr_samples = hists->stats.nr_non_filtered_samples;
+ nr_events = hists->stats.total_non_filtered_period;
+ }
- ret = perf_session__setup_sample_type(session);
- if (ret)
- goto out_delete;
+ if (perf_evsel__is_group_event(evsel)) {
+ struct perf_evsel *pos;
- ret = perf_session__process_events(session, &event_ops);
- if (ret)
- goto out_delete;
+ perf_evsel__group_desc(evsel, buf, size);
+ evname = buf;
- if (dump_trace) {
- event__print_totals();
- goto out_delete;
+ for_each_group_member(pos, evsel) {
+ if (symbol_conf.filter_relative) {
+ nr_samples += pos->hists.stats.nr_non_filtered_samples;
+ nr_events += pos->hists.stats.total_non_filtered_period;
+ } else {
+ nr_samples += pos->hists.stats.nr_events[PERF_RECORD_SAMPLE];
+ nr_events += pos->hists.stats.total_period;
+ }
+ }
}
- if (verbose > 3)
- perf_session__fprintf(session, stdout);
+ nr_samples = convert_unit(nr_samples, &unit);
+ ret = fprintf(fp, "# Samples: %lu%c", nr_samples, unit);
+ if (evname != NULL)
+ ret += fprintf(fp, " of event '%s'", evname);
- if (verbose > 2)
- dsos__fprintf(stdout);
+ if (rep->mem_mode) {
+ ret += fprintf(fp, "\n# Total weight : %" PRIu64, nr_events);
+ ret += fprintf(fp, "\n# Sort order : %s", sort_order);
+ } else
+ ret += fprintf(fp, "\n# Event count (approx.): %" PRIu64, nr_events);
+ return ret + fprintf(fp, "\n#\n");
+}
- next = rb_first(&session->stats_by_id);
- while (next) {
- struct event_stat_id *stats;
+static int perf_evlist__tty_browse_hists(struct perf_evlist *evlist,
+ struct report *rep,
+ const char *help)
+{
+ struct perf_evsel *pos;
- stats = rb_entry(next, struct event_stat_id, rb_node);
- perf_session__collapse_resort(&stats->hists);
- perf_session__output_resort(&stats->hists, stats->stats.total);
- if (rb_first(&session->stats_by_id) ==
- rb_last(&session->stats_by_id))
- fprintf(stdout, "# Samples: %Ld\n#\n",
- stats->stats.total);
- else
- fprintf(stdout, "# Samples: %Ld %s\n#\n",
- stats->stats.total,
- __event_name(stats->type, stats->config));
+ evlist__for_each(evlist, pos) {
+ struct hists *hists = &pos->hists;
+ const char *evname = perf_evsel__name(pos);
+
+ if (symbol_conf.event_group &&
+ !perf_evsel__is_group_leader(pos))
+ continue;
- perf_session__fprintf_hists(&stats->hists, NULL, false, stdout,
- stats->stats.total);
+ hists__fprintf_nr_sample_events(hists, rep, evname, stdout);
+ hists__fprintf(hists, true, 0, 0, rep->min_percent, stdout);
fprintf(stdout, "\n\n");
- next = rb_next(&stats->rb_node);
}
if (sort_order == default_sort_order &&
- parent_pattern == default_parent_pattern)
- fprintf(stdout, "#\n# (For a higher level overview, try: perf report --sort comm,dso)\n#\n");
-
- if (show_threads) {
- bool raw_printing_style = !strcmp(pretty_printing_style, "raw");
- perf_read_values_display(stdout, &show_threads_values,
- raw_printing_style);
- perf_read_values_destroy(&show_threads_values);
+ parent_pattern == default_parent_pattern) {
+ fprintf(stdout, "#\n# (%s)\n#\n", help);
+
+ if (rep->show_threads) {
+ bool style = !strcmp(rep->pretty_printing_style, "raw");
+ perf_read_values_display(stdout, &rep->show_threads_values,
+ style);
+ perf_read_values_destroy(&rep->show_threads_values);
+ }
}
-out_delete:
- perf_session__delete(session);
- return ret;
+
+ return 0;
}
-static int
-parse_callchain_opt(const struct option *opt __used, const char *arg,
- int unset)
+static void report__warn_kptr_restrict(const struct report *rep)
{
- char *tok;
- char *endptr;
+ struct map *kernel_map = rep->session->machines.host.vmlinux_maps[MAP__FUNCTION];
+ struct kmap *kernel_kmap = map__kmap(kernel_map);
+
+ if (kernel_map == NULL ||
+ (kernel_map->dso->hit &&
+ (kernel_kmap->ref_reloc_sym == NULL ||
+ kernel_kmap->ref_reloc_sym->addr == 0))) {
+ const char *desc =
+ "As no suitable kallsyms nor vmlinux was found, kernel samples\n"
+ "can't be resolved.";
+
+ if (kernel_map) {
+ const struct dso *kdso = kernel_map->dso;
+ if (!RB_EMPTY_ROOT(&kdso->symbols[MAP__FUNCTION])) {
+ desc = "If some relocation was applied (e.g. "
+ "kexec) symbols may be misresolved.";
+ }
+ }
- /*
- * --no-call-graph
- */
- if (unset) {
- dont_use_callchains = true;
- return 0;
+ ui__warning(
+"Kernel address maps (/proc/{kallsyms,modules}) were restricted.\n\n"
+"Check /proc/sys/kernel/kptr_restrict before running 'perf record'.\n\n%s\n\n"
+"Samples in kernel modules can't be resolved as well.\n\n",
+ desc);
}
+}
- symbol_conf.use_callchain = true;
+static int report__gtk_browse_hists(struct report *rep, const char *help)
+{
+ int (*hist_browser)(struct perf_evlist *evlist, const char *help,
+ struct hist_browser_timer *timer, float min_pcnt);
- if (!arg)
- return 0;
+ hist_browser = dlsym(perf_gtk_handle, "perf_evlist__gtk_browse_hists");
- tok = strtok((char *)arg, ",");
- if (!tok)
+ if (hist_browser == NULL) {
+ ui__error("GTK browser not found!\n");
return -1;
+ }
+
+ return hist_browser(rep->session->evlist, help, NULL, rep->min_percent);
+}
- /* get the output mode */
- if (!strncmp(tok, "graph", strlen(arg)))
- callchain_param.mode = CHAIN_GRAPH_ABS;
+static int report__browse_hists(struct report *rep)
+{
+ int ret;
+ struct perf_session *session = rep->session;
+ struct perf_evlist *evlist = session->evlist;
+ const char *help = "For a higher level overview, try: perf report --sort comm,dso";
+
+ switch (use_browser) {
+ case 1:
+ ret = perf_evlist__tui_browse_hists(evlist, help, NULL,
+ rep->min_percent,
+ &session->header.env);
+ /*
+ * Usually "ret" is the last pressed key, and we only
+ * care if the key notifies us to switch data file.
+ */
+ if (ret != K_SWITCH_INPUT_DATA)
+ ret = 0;
+ break;
+ case 2:
+ ret = report__gtk_browse_hists(rep, help);
+ break;
+ default:
+ ret = perf_evlist__tty_browse_hists(evlist, rep, help);
+ break;
+ }
+
+ return ret;
+}
+
+static void report__collapse_hists(struct report *rep)
+{
+ struct ui_progress prog;
+ struct perf_evsel *pos;
+
+ ui_progress__init(&prog, rep->nr_entries, "Merging related events...");
+
+ evlist__for_each(rep->session->evlist, pos) {
+ struct hists *hists = &pos->hists;
+
+ if (pos->idx == 0)
+ hists->symbol_filter_str = rep->symbol_filter_str;
+
+ hists__collapse_resort(hists, &prog);
+
+ /* Non-group events are considered as leader */
+ if (symbol_conf.event_group &&
+ !perf_evsel__is_group_leader(pos)) {
+ struct hists *leader_hists = &pos->leader->hists;
+
+ hists__match(leader_hists, hists);
+ hists__link(leader_hists, hists);
+ }
+ }
+
+ ui_progress__finish();
+}
+
+static int __cmd_report(struct report *rep)
+{
+ int ret;
+ struct perf_session *session = rep->session;
+ struct perf_evsel *pos;
+ struct perf_data_file *file = session->file;
+
+ signal(SIGINT, sig_handler);
+
+ if (rep->cpu_list) {
+ ret = perf_session__cpu_bitmap(session, rep->cpu_list,
+ rep->cpu_bitmap);
+ if (ret)
+ return ret;
+ }
+
+ if (rep->show_threads)
+ perf_read_values_init(&rep->show_threads_values);
+
+ ret = report__setup_sample_type(rep);
+ if (ret)
+ return ret;
+
+ ret = perf_session__process_events(session, &rep->tool);
+ if (ret)
+ return ret;
+
+ report__warn_kptr_restrict(rep);
- else if (!strncmp(tok, "flat", strlen(arg)))
- callchain_param.mode = CHAIN_FLAT;
+ if (use_browser == 0) {
+ if (verbose > 3)
+ perf_session__fprintf(session, stdout);
- else if (!strncmp(tok, "fractal", strlen(arg)))
- callchain_param.mode = CHAIN_GRAPH_REL;
+ if (verbose > 2)
+ perf_session__fprintf_dsos(session, stdout);
- else if (!strncmp(tok, "none", strlen(arg))) {
- callchain_param.mode = CHAIN_NONE;
- symbol_conf.use_callchain = false;
+ if (dump_trace) {
+ perf_session__fprintf_nr_events(session, stdout);
+ return 0;
+ }
+ }
+
+ report__collapse_hists(rep);
+
+ if (session_done())
+ return 0;
+ if (rep->nr_entries == 0) {
+ ui__error("The %s file has no samples!\n", file->path);
return 0;
}
- else
- return -1;
+ evlist__for_each(session->evlist, pos)
+ hists__output_resort(&pos->hists);
- /* get the min percentage */
- tok = strtok(NULL, ",");
- if (!tok)
- goto setup;
+ return report__browse_hists(rep);
+}
- callchain_param.min_percent = strtod(tok, &endptr);
- if (tok == endptr)
- return -1;
+static int
+report_parse_callchain_opt(const struct option *opt, const char *arg, int unset)
+{
+ struct report *rep = (struct report *)opt->value;
-setup:
- if (register_callchain_param(&callchain_param) < 0) {
- fprintf(stderr, "Can't register callchain params\n");
- return -1;
+ /*
+ * --no-call-graph
+ */
+ if (unset) {
+ rep->dont_use_callchains = true;
+ return 0;
+ }
+
+ return parse_callchain_report_opt(arg);
+}
+
+int
+report_parse_ignore_callees_opt(const struct option *opt __maybe_unused,
+ const char *arg, int unset __maybe_unused)
+{
+ if (arg) {
+ int err = regcomp(&ignore_callees_regex, arg, REG_EXTENDED);
+ if (err) {
+ char buf[BUFSIZ];
+ regerror(err, &ignore_callees_regex, buf, sizeof(buf));
+ pr_err("Invalid --ignore-callees regex: %s\n%s", arg, buf);
+ return -1;
+ }
+ have_ignore_callees = 1;
}
+
return 0;
}
-static const char * const report_usage[] = {
- "perf report [<options>] <command>",
- NULL
-};
+static int
+parse_branch_mode(const struct option *opt __maybe_unused,
+ const char *str __maybe_unused, int unset)
+{
+ int *branch_mode = opt->value;
+
+ *branch_mode = !unset;
+ return 0;
+}
+
+static int
+parse_percent_limit(const struct option *opt, const char *str,
+ int unset __maybe_unused)
+{
+ struct report *rep = opt->value;
+
+ rep->min_percent = strtof(str, NULL);
+ return 0;
+}
-static const struct option options[] = {
+int cmd_report(int argc, const char **argv, const char *prefix __maybe_unused)
+{
+ struct perf_session *session;
+ struct stat st;
+ bool has_br_stack = false;
+ int branch_mode = -1;
+ int ret = -1;
+ char callchain_default_opt[] = "fractal,0.5,callee";
+ const char * const report_usage[] = {
+ "perf report [<options>]",
+ NULL
+ };
+ struct report report = {
+ .tool = {
+ .sample = process_sample_event,
+ .mmap = perf_event__process_mmap,
+ .mmap2 = perf_event__process_mmap2,
+ .comm = perf_event__process_comm,
+ .exit = perf_event__process_exit,
+ .fork = perf_event__process_fork,
+ .lost = perf_event__process_lost,
+ .read = process_read_event,
+ .attr = perf_event__process_attr,
+ .tracing_data = perf_event__process_tracing_data,
+ .build_id = perf_event__process_build_id,
+ .ordered_samples = true,
+ .ordering_requires_timestamps = true,
+ },
+ .max_stack = PERF_MAX_STACK_DEPTH,
+ .pretty_printing_style = "normal",
+ };
+ const struct option options[] = {
OPT_STRING('i', "input", &input_name, "file",
"input file name"),
- OPT_BOOLEAN('v', "verbose", &verbose,
+ OPT_INCR('v', "verbose", &verbose,
"be more verbose (show symbol address, etc)"),
OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
"dump raw trace in ASCII"),
OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
"file", "vmlinux pathname"),
- OPT_BOOLEAN('f', "force", &force, "don't complain, do it"),
+ OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name,
+ "file", "kallsyms pathname"),
+ OPT_BOOLEAN('f', "force", &report.force, "don't complain, do it"),
OPT_BOOLEAN('m', "modules", &symbol_conf.use_modules,
"load module symbols - WARNING: use only with -k and LIVE kernel"),
OPT_BOOLEAN('n', "show-nr-samples", &symbol_conf.show_nr_samples,
"Show a column with the number of samples"),
- OPT_BOOLEAN('T', "threads", &show_threads,
+ OPT_BOOLEAN('T', "threads", &report.show_threads,
"Show per-thread event counters"),
- OPT_STRING(0, "pretty", &pretty_printing_style, "key",
+ OPT_STRING(0, "pretty", &report.pretty_printing_style, "key",
"pretty printing style key: normal raw"),
+ OPT_BOOLEAN(0, "tui", &report.use_tui, "Use the TUI interface"),
+ OPT_BOOLEAN(0, "gtk", &report.use_gtk, "Use the GTK2 interface"),
+ OPT_BOOLEAN(0, "stdio", &report.use_stdio,
+ "Use the stdio interface"),
+ OPT_BOOLEAN(0, "header", &report.header, "Show data header."),
+ OPT_BOOLEAN(0, "header-only", &report.header_only,
+ "Show only data header."),
OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
- "sort by key(s): pid, comm, dso, symbol, parent"),
- OPT_BOOLEAN('P', "full-paths", &symbol_conf.full_paths,
- "Don't shorten the pathnames taking into account the cwd"),
+ "sort by key(s): pid, comm, dso, symbol, parent, cpu, srcline, ..."
+		   " Please refer to the man page for the complete list."),
+ OPT_STRING('F', "fields", &field_order, "key[,keys...]",
+ "output field(s): overhead, period, sample plus all of sort keys"),
+ OPT_BOOLEAN(0, "showcpuutilization", &symbol_conf.show_cpu_utilization,
+ "Show sample percentage for different cpu modes"),
OPT_STRING('p', "parent", &parent_pattern, "regex",
"regex filter to identify parent, see: '--sort parent'"),
OPT_BOOLEAN('x', "exclude-other", &symbol_conf.exclude_other,
"Only display entries with parent-match"),
- OPT_CALLBACK_DEFAULT('g', "call-graph", NULL, "output_type,min_percent",
- "Display callchains using output_type and min percent threshold. "
- "Default: fractal,0.5", &parse_callchain_opt, callchain_default_opt),
+ OPT_CALLBACK_DEFAULT('g', "call-graph", &report, "output_type,min_percent[,print_limit],call_order",
+		     "Display callchains using output_type (graph, flat, fractal, or none), min percent threshold, optional print limit, callchain order, key (function or address). "
+ "Default: fractal,0.5,callee,function", &report_parse_callchain_opt, callchain_default_opt),
+ OPT_BOOLEAN(0, "children", &symbol_conf.cumulate_callchain,
+ "Accumulate callchains of children and show total overhead as well"),
+ OPT_INTEGER(0, "max-stack", &report.max_stack,
+ "Set the maximum stack depth when parsing the callchain, "
+ "anything beyond the specified depth will be ignored. "
+ "Default: " __stringify(PERF_MAX_STACK_DEPTH)),
+ OPT_BOOLEAN('G', "inverted", &report.inverted_callchain,
+ "alias for inverted call graph"),
+ OPT_CALLBACK(0, "ignore-callees", NULL, "regex",
+ "ignore callees of these functions in call graphs",
+ report_parse_ignore_callees_opt),
OPT_STRING('d', "dsos", &symbol_conf.dso_list_str, "dso[,dso...]",
"only consider symbols in these dsos"),
- OPT_STRING('C', "comms", &symbol_conf.comm_list_str, "comm[,comm...]",
+ OPT_STRING('c', "comms", &symbol_conf.comm_list_str, "comm[,comm...]",
"only consider symbols in these comms"),
OPT_STRING('S', "symbols", &symbol_conf.sym_list_str, "symbol[,symbol...]",
"only consider these symbols"),
+ OPT_STRING(0, "symbol-filter", &report.symbol_filter_str, "filter",
+ "only show symbols that (partially) match with this filter"),
OPT_STRING('w', "column-widths", &symbol_conf.col_width_list_str,
"width[,width...]",
"don't try to adjust column width, use these fixed values"),
OPT_STRING('t', "field-separator", &symbol_conf.field_sep, "separator",
"separator for columns, no spaces will be added between "
"columns '.' is reserved."),
- OPT_BOOLEAN('U', "hide-unresolved", &hide_unresolved,
+ OPT_BOOLEAN('U', "hide-unresolved", &report.hide_unresolved,
"Only display entries resolved to a symbol"),
+ OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
+ "Look for files with symbols relative to this directory"),
+ OPT_STRING('C', "cpu", &report.cpu_list, "cpu",
+ "list of cpus to profile"),
+ OPT_BOOLEAN('I', "show-info", &report.show_full_info,
+ "Display extended information about perf.data file"),
+ OPT_BOOLEAN(0, "source", &symbol_conf.annotate_src,
+ "Interleave source code with assembly code (default)"),
+ OPT_BOOLEAN(0, "asm-raw", &symbol_conf.annotate_asm_raw,
+ "Display raw encoding of assembly instructions (default)"),
+ OPT_STRING('M', "disassembler-style", &disassembler_style, "disassembler style",
+ "Specify disassembler style (e.g. -M intel for intel syntax)"),
+ OPT_BOOLEAN(0, "show-total-period", &symbol_conf.show_total_period,
+ "Show a column with the sum of periods"),
+ OPT_BOOLEAN(0, "group", &symbol_conf.event_group,
+ "Show event group information together"),
+ OPT_CALLBACK_NOOPT('b', "branch-stack", &branch_mode, "",
+ "use branch records for histogram filling", parse_branch_mode),
+ OPT_STRING(0, "objdump", &objdump_path, "path",
+ "objdump binary to use for disassembly and annotations"),
+ OPT_BOOLEAN(0, "demangle", &symbol_conf.demangle,
+ "Disable symbol demangling"),
+ OPT_BOOLEAN(0, "mem-mode", &report.mem_mode, "mem access profile"),
+ OPT_CALLBACK(0, "percent-limit", &report, "percent",
+ "Don't show entries under that percent", parse_percent_limit),
+ OPT_CALLBACK(0, "percentage", NULL, "relative|absolute",
+ "how to display percentage of filtered entries", parse_filter_percentage),
OPT_END()
-};
+ };
+ struct perf_data_file file = {
+ .mode = PERF_DATA_MODE_READ,
+ };
+
+ perf_config(report__config, &report);
-int cmd_report(int argc, const char **argv, const char *prefix __used)
-{
argc = parse_options(argc, argv, options, report_usage, 0);
- setup_pager();
+ if (report.use_stdio)
+ use_browser = 0;
+ else if (report.use_tui)
+ use_browser = 1;
+ else if (report.use_gtk)
+ use_browser = 2;
- if (symbol__init() < 0)
- return -1;
+ if (report.inverted_callchain)
+ callchain_param.order = ORDER_CALLER;
- setup_sorting(report_usage, options);
+ if (!input_name || !strlen(input_name)) {
+ if (!fstat(STDIN_FILENO, &st) && S_ISFIFO(st.st_mode))
+ input_name = "-";
+ else
+ input_name = "perf.data";
+ }
- if (parent_pattern != default_parent_pattern) {
- sort_dimension__add("parent");
- sort_parent.elide = 1;
- } else
- symbol_conf.exclude_other = false;
+ file.path = input_name;
+ file.force = report.force;
+
+repeat:
+ session = perf_session__new(&file, false, &report.tool);
+ if (session == NULL)
+ return -ENOMEM;
+
+ report.session = session;
+
+ has_br_stack = perf_header__has_feat(&session->header,
+ HEADER_BRANCH_STACK);
+
+ if (branch_mode == -1 && has_br_stack) {
+ sort__mode = SORT_MODE__BRANCH;
+ symbol_conf.cumulate_callchain = false;
+ }
+
+ if (report.mem_mode) {
+ if (sort__mode == SORT_MODE__BRANCH) {
+ pr_err("branch and mem mode incompatible\n");
+ goto error;
+ }
+ sort__mode = SORT_MODE__MEMORY;
+ symbol_conf.cumulate_callchain = false;
+ }
+
+ if (setup_sorting() < 0) {
+ if (sort_order)
+ parse_options_usage(report_usage, options, "s", 1);
+ if (field_order)
+ parse_options_usage(sort_order ? NULL : report_usage,
+ options, "F", 1);
+ goto error;
+ }
+
+ /* Force tty output for header output. */
+ if (report.header || report.header_only)
+ use_browser = 0;
+
+ if (strcmp(input_name, "-") != 0)
+ setup_browser(true);
+ else
+ use_browser = 0;
+
+ if (report.header || report.header_only) {
+ perf_session__fprintf_info(session, stdout,
+ report.show_full_info);
+ if (report.header_only)
+ return 0;
+ } else if (use_browser == 0) {
+ fputs("# To display the perf.data header info, please use --header/--header-only options.\n#\n",
+ stdout);
+ }
/*
- * Any (unrecognized) arguments left?
+ * Only in the TUI browser we are doing integrated annotation,
+ * so don't allocate extra space that won't be used in the stdio
+ * implementation.
*/
- if (argc)
- usage_with_options(report_usage, options);
+ if (ui__has_annotation()) {
+ symbol_conf.priv_size = sizeof(struct annotation);
+ machines__set_symbol_filter(&session->machines,
+ symbol__annotate_init);
+ /*
+		 * For searching by name on the "Browse map details",
+		 * provide it only in verbose mode so as not to bloat
+		 * struct symbol too much.
+ */
+ if (verbose) {
+ /*
+ * XXX: Need to provide a less kludgy way to ask for
+ * more space per symbol, the u32 is for the index on
+ * the ui browser.
+ * See symbol__browser_index.
+ */
+ symbol_conf.priv_size += sizeof(u32);
+ symbol_conf.sort_by_name = true;
+ }
+ }
- sort_entry__setup_elide(&sort_dso, symbol_conf.dso_list, "dso", stdout);
- sort_entry__setup_elide(&sort_comm, symbol_conf.comm_list, "comm", stdout);
- sort_entry__setup_elide(&sort_sym, symbol_conf.sym_list, "symbol", stdout);
+ if (symbol__init() < 0)
+ goto error;
+
+ if (argc) {
+ /*
+ * Special case: if there's an argument left then assume that
+ * it's a symbol filter:
+ */
+ if (argc > 1)
+ usage_with_options(report_usage, options);
- return __cmd_report();
+ report.symbol_filter_str = argv[0];
+ }
+
+ sort__setup_elide(stdout);
+
+ ret = __cmd_report(&report);
+ if (ret == K_SWITCH_INPUT_DATA) {
+ perf_session__delete(session);
+ goto repeat;
+ } else
+ ret = 0;
+
+error:
+ perf_session__delete(session);
+ return ret;
}
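
The reworked cmd_report() above threads all state through struct report, which embeds the struct perf_tool carrying the event callbacks; each callback (see process_sample_event() and process_read_event()) recovers the outer object with container_of(tool, struct report, tool). Below is a minimal standalone sketch of that embed-and-recover pattern; the my_tool/my_report/on_sample names are hypothetical stand-ins, not perf types:

#include <stddef.h>
#include <stdio.h>

/* offsetof-based container_of, as used throughout the perf/kernel sources. */
#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Hypothetical stand-in for struct perf_tool: just the callback table. */
struct my_tool {
	int (*sample)(struct my_tool *tool, int value);
};

/* Tool-specific state wraps the generic tool, as struct report wraps perf_tool. */
struct my_report {
	struct my_tool	tool;		/* embedded, not a pointer */
	long		nr_samples;
};

static int on_sample(struct my_tool *tool, int value)
{
	/* Recover the outer state from the embedded member. */
	struct my_report *rep = container_of(tool, struct my_report, tool);

	rep->nr_samples++;
	return value < 0 ? -1 : 0;
}

int main(void)
{
	struct my_report rep = { .tool = { .sample = on_sample }, };

	/* Generic code only ever sees the embedded my_tool. */
	rep.tool.sample(&rep.tool, 42);
	printf("samples: %ld\n", rep.nr_samples);
	return 0;
}
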
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 4f5a03e4344..c38d06c0477 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -2,11 +2,14 @@
#include "perf.h"
#include "util/util.h"
+#include "util/evlist.h"
#include "util/cache.h"
+#include "util/evsel.h"
#include "util/symbol.h"
#include "util/thread.h"
#include "util/header.h"
#include "util/session.h"
+#include "util/tool.h"
#include "util/parse-options.h"
#include "util/trace-event.h"
@@ -14,31 +17,18 @@
#include "util/debug.h"
#include <sys/prctl.h>
+#include <sys/resource.h>
#include <semaphore.h>
#include <pthread.h>
#include <math.h>
-static char const *input_name = "perf.data";
-
-static char default_sort_order[] = "avg, max, switch, runtime";
-static char *sort_order = default_sort_order;
-
-static int profile_cpu = -1;
-
#define PR_SET_NAME 15 /* Set process name */
#define MAX_CPUS 4096
-
-static u64 run_measurement_overhead;
-static u64 sleep_measurement_overhead;
-
#define COMM_LEN 20
#define SYM_LEN 129
-
#define MAX_PID 65536
-static unsigned long nr_tasks;
-
struct sched_atom;
struct task_desc {
@@ -68,53 +58,15 @@ enum sched_event_type {
struct sched_atom {
enum sched_event_type type;
+ int specific_wait;
u64 timestamp;
u64 duration;
unsigned long nr;
- int specific_wait;
sem_t *wait_sem;
struct task_desc *wakee;
};
-static struct task_desc *pid_to_task[MAX_PID];
-
-static struct task_desc **tasks;
-
-static pthread_mutex_t start_work_mutex = PTHREAD_MUTEX_INITIALIZER;
-static u64 start_time;
-
-static pthread_mutex_t work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER;
-
-static unsigned long nr_run_events;
-static unsigned long nr_sleep_events;
-static unsigned long nr_wakeup_events;
-
-static unsigned long nr_sleep_corrections;
-static unsigned long nr_run_events_optimized;
-
-static unsigned long targetless_wakeups;
-static unsigned long multitarget_wakeups;
-
-static u64 cpu_usage;
-static u64 runavg_cpu_usage;
-static u64 parent_cpu_usage;
-static u64 runavg_parent_cpu_usage;
-
-static unsigned long nr_runs;
-static u64 sum_runtime;
-static u64 sum_fluct;
-static u64 run_avg;
-
-static unsigned long replay_repeat = 10;
-static unsigned long nr_timestamps;
-static unsigned long nr_unordered_timestamps;
-static unsigned long nr_state_machine_bugs;
-static unsigned long nr_context_switch_bugs;
-static unsigned long nr_events;
-static unsigned long nr_lost_chunks;
-static unsigned long nr_lost_events;
-
-#define TASK_STATE_TO_CHAR_STR "RSDTtZX"
+#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"
enum thread_state {
THREAD_SLEEPING = 0,
@@ -145,11 +97,78 @@ struct work_atoms {
typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);
-static struct rb_root atom_root, sorted_atom_root;
+struct perf_sched;
+
+struct trace_sched_handler {
+ int (*switch_event)(struct perf_sched *sched, struct perf_evsel *evsel,
+ struct perf_sample *sample, struct machine *machine);
-static u64 all_runtime;
-static u64 all_count;
+ int (*runtime_event)(struct perf_sched *sched, struct perf_evsel *evsel,
+ struct perf_sample *sample, struct machine *machine);
+ int (*wakeup_event)(struct perf_sched *sched, struct perf_evsel *evsel,
+ struct perf_sample *sample, struct machine *machine);
+
+ /* PERF_RECORD_FORK event, not sched_process_fork tracepoint */
+ int (*fork_event)(struct perf_sched *sched, union perf_event *event,
+ struct machine *machine);
+
+ int (*migrate_task_event)(struct perf_sched *sched,
+ struct perf_evsel *evsel,
+ struct perf_sample *sample,
+ struct machine *machine);
+};
+
+struct perf_sched {
+ struct perf_tool tool;
+ const char *sort_order;
+ unsigned long nr_tasks;
+ struct task_desc *pid_to_task[MAX_PID];
+ struct task_desc **tasks;
+ const struct trace_sched_handler *tp_handler;
+ pthread_mutex_t start_work_mutex;
+ pthread_mutex_t work_done_wait_mutex;
+ int profile_cpu;
+/*
+ * Track the current task - that way we can know whether there are any
+ * weird events, such as a task being switched away that is not current.
+ */
+ int max_cpu;
+ u32 curr_pid[MAX_CPUS];
+ struct thread *curr_thread[MAX_CPUS];
+ char next_shortname1;
+ char next_shortname2;
+ unsigned int replay_repeat;
+ unsigned long nr_run_events;
+ unsigned long nr_sleep_events;
+ unsigned long nr_wakeup_events;
+ unsigned long nr_sleep_corrections;
+ unsigned long nr_run_events_optimized;
+ unsigned long targetless_wakeups;
+ unsigned long multitarget_wakeups;
+ unsigned long nr_runs;
+ unsigned long nr_timestamps;
+ unsigned long nr_unordered_timestamps;
+ unsigned long nr_context_switch_bugs;
+ unsigned long nr_events;
+ unsigned long nr_lost_chunks;
+ unsigned long nr_lost_events;
+ u64 run_measurement_overhead;
+ u64 sleep_measurement_overhead;
+ u64 start_time;
+ u64 cpu_usage;
+ u64 runavg_cpu_usage;
+ u64 parent_cpu_usage;
+ u64 runavg_parent_cpu_usage;
+ u64 sum_runtime;
+ u64 sum_fluct;
+ u64 run_avg;
+ u64 all_runtime;
+ u64 all_count;
+ u64 cpu_last_switched[MAX_CPUS];
+ struct rb_root atom_root, sorted_atom_root;
+ struct list_head sort_list, cmp_pid;
+};
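
Two structural moves come together here: the old file-scope globals of builtin-sched.c are gathered into struct perf_sched, and the per-tracepoint handlers are grouped behind the struct trace_sched_handler callback table so the replay and latency modes can share one dispatch path. A minimal sketch of that context-plus-callback-table layout, using hypothetical mini_* names rather than the perf types:

#include <stdio.h>

/* Hypothetical miniature of the struct perf_sched / trace_sched_handler split:
 * all former globals live in one context object passed explicitly, and each
 * analysis mode is a table of per-event callbacks. */
struct mini_sched;

struct mini_handler {
	int (*switch_event)(struct mini_sched *sched, int cpu, unsigned long ts);
	int (*wakeup_event)(struct mini_sched *sched, int pid, unsigned long ts);
};

struct mini_sched {
	const struct mini_handler *tp_handler;
	unsigned long nr_events;
};

static int replay_switch(struct mini_sched *sched, int cpu, unsigned long ts)
{
	sched->nr_events++;
	printf("switch on cpu %d at %lu\n", cpu, ts);
	return 0;
}

static int replay_wakeup(struct mini_sched *sched, int pid, unsigned long ts)
{
	sched->nr_events++;
	printf("wakeup of pid %d at %lu\n", pid, ts);
	return 0;
}

static const struct mini_handler replay_ops = {
	.switch_event = replay_switch,
	.wakeup_event = replay_wakeup,
};

int main(void)
{
	struct mini_sched sched = { .tp_handler = &replay_ops, };

	/* Dispatch as the tool does: pick the mode's handler for the
	 * tracepoint and hand it the shared context plus the sample data. */
	if (sched.tp_handler->wakeup_event)
		sched.tp_handler->wakeup_event(&sched, 1234, 1000);
	if (sched.tp_handler->switch_event)
		sched.tp_handler->switch_event(&sched, 0, 2000);

	printf("%lu events handled\n", sched.nr_events);
	return 0;
}
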
static u64 get_nsecs(void)
{
@@ -160,13 +179,13 @@ static u64 get_nsecs(void)
return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}
-static void burn_nsecs(u64 nsecs)
+static void burn_nsecs(struct perf_sched *sched, u64 nsecs)
{
u64 T0 = get_nsecs(), T1;
do {
T1 = get_nsecs();
- } while (T1 + run_measurement_overhead < T0 + nsecs);
+ } while (T1 + sched->run_measurement_overhead < T0 + nsecs);
}
static void sleep_nsecs(u64 nsecs)
@@ -179,24 +198,24 @@ static void sleep_nsecs(u64 nsecs)
nanosleep(&ts, NULL);
}
-static void calibrate_run_measurement_overhead(void)
+static void calibrate_run_measurement_overhead(struct perf_sched *sched)
{
u64 T0, T1, delta, min_delta = 1000000000ULL;
int i;
for (i = 0; i < 10; i++) {
T0 = get_nsecs();
- burn_nsecs(0);
+ burn_nsecs(sched, 0);
T1 = get_nsecs();
delta = T1-T0;
min_delta = min(min_delta, delta);
}
- run_measurement_overhead = min_delta;
+ sched->run_measurement_overhead = min_delta;
- printf("run measurement overhead: %Ld nsecs\n", min_delta);
+ printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta);
}
-static void calibrate_sleep_measurement_overhead(void)
+static void calibrate_sleep_measurement_overhead(struct perf_sched *sched)
{
u64 T0, T1, delta, min_delta = 1000000000ULL;
int i;
@@ -209,9 +228,9 @@ static void calibrate_sleep_measurement_overhead(void)
min_delta = min(min_delta, delta);
}
min_delta -= 10000;
- sleep_measurement_overhead = min_delta;
+ sched->sleep_measurement_overhead = min_delta;
- printf("sleep measurement overhead: %Ld nsecs\n", min_delta);
+ printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta);
}
static struct sched_atom *
@@ -242,8 +261,8 @@ static struct sched_atom *last_event(struct task_desc *task)
return task->atoms[task->nr_events - 1];
}
-static void
-add_sched_event_run(struct task_desc *task, u64 timestamp, u64 duration)
+static void add_sched_event_run(struct perf_sched *sched, struct task_desc *task,
+ u64 timestamp, u64 duration)
{
struct sched_atom *event, *curr_event = last_event(task);
@@ -252,7 +271,7 @@ add_sched_event_run(struct task_desc *task, u64 timestamp, u64 duration)
* to it:
*/
if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
- nr_run_events_optimized++;
+ sched->nr_run_events_optimized++;
curr_event->duration += duration;
return;
}
@@ -262,12 +281,11 @@ add_sched_event_run(struct task_desc *task, u64 timestamp, u64 duration)
event->type = SCHED_EVENT_RUN;
event->duration = duration;
- nr_run_events++;
+ sched->nr_run_events++;
}
-static void
-add_sched_event_wakeup(struct task_desc *task, u64 timestamp,
- struct task_desc *wakee)
+static void add_sched_event_wakeup(struct perf_sched *sched, struct task_desc *task,
+ u64 timestamp, struct task_desc *wakee)
{
struct sched_atom *event, *wakee_event;
@@ -277,11 +295,11 @@ add_sched_event_wakeup(struct task_desc *task, u64 timestamp,
wakee_event = last_event(wakee);
if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
- targetless_wakeups++;
+ sched->targetless_wakeups++;
return;
}
if (wakee_event->wait_sem) {
- multitarget_wakeups++;
+ sched->multitarget_wakeups++;
return;
}
@@ -290,94 +308,89 @@ add_sched_event_wakeup(struct task_desc *task, u64 timestamp,
wakee_event->specific_wait = 1;
event->wait_sem = wakee_event->wait_sem;
- nr_wakeup_events++;
+ sched->nr_wakeup_events++;
}
-static void
-add_sched_event_sleep(struct task_desc *task, u64 timestamp,
- u64 task_state __used)
+static void add_sched_event_sleep(struct perf_sched *sched, struct task_desc *task,
+ u64 timestamp, u64 task_state __maybe_unused)
{
struct sched_atom *event = get_new_event(task, timestamp);
event->type = SCHED_EVENT_SLEEP;
- nr_sleep_events++;
+ sched->nr_sleep_events++;
}
-static struct task_desc *register_pid(unsigned long pid, const char *comm)
+static struct task_desc *register_pid(struct perf_sched *sched,
+ unsigned long pid, const char *comm)
{
struct task_desc *task;
BUG_ON(pid >= MAX_PID);
- task = pid_to_task[pid];
+ task = sched->pid_to_task[pid];
if (task)
return task;
task = zalloc(sizeof(*task));
task->pid = pid;
- task->nr = nr_tasks;
+ task->nr = sched->nr_tasks;
strcpy(task->comm, comm);
/*
* every task starts in sleeping state - this gets ignored
* if there's no wakeup pointing to this sleep state:
*/
- add_sched_event_sleep(task, 0, 0);
+ add_sched_event_sleep(sched, task, 0, 0);
- pid_to_task[pid] = task;
- nr_tasks++;
- tasks = realloc(tasks, nr_tasks*sizeof(struct task_task *));
- BUG_ON(!tasks);
- tasks[task->nr] = task;
+ sched->pid_to_task[pid] = task;
+ sched->nr_tasks++;
+	sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_desc *));
+ BUG_ON(!sched->tasks);
+ sched->tasks[task->nr] = task;
if (verbose)
- printf("registered task #%ld, PID %ld (%s)\n", nr_tasks, pid, comm);
+ printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, pid, comm);
return task;
}
-static void print_task_traces(void)
+static void print_task_traces(struct perf_sched *sched)
{
struct task_desc *task;
unsigned long i;
- for (i = 0; i < nr_tasks; i++) {
- task = tasks[i];
+ for (i = 0; i < sched->nr_tasks; i++) {
+ task = sched->tasks[i];
printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
task->nr, task->comm, task->pid, task->nr_events);
}
}
-static void add_cross_task_wakeups(void)
+static void add_cross_task_wakeups(struct perf_sched *sched)
{
struct task_desc *task1, *task2;
unsigned long i, j;
- for (i = 0; i < nr_tasks; i++) {
- task1 = tasks[i];
+ for (i = 0; i < sched->nr_tasks; i++) {
+ task1 = sched->tasks[i];
j = i + 1;
- if (j == nr_tasks)
+ if (j == sched->nr_tasks)
j = 0;
- task2 = tasks[j];
- add_sched_event_wakeup(task1, 0, task2);
+ task2 = sched->tasks[j];
+ add_sched_event_wakeup(sched, task1, 0, task2);
}
}
-static void
-process_sched_event(struct task_desc *this_task __used, struct sched_atom *atom)
+static void perf_sched__process_event(struct perf_sched *sched,
+ struct sched_atom *atom)
{
int ret = 0;
- u64 now;
- long long delta;
-
- now = get_nsecs();
- delta = start_time + atom->timestamp - now;
switch (atom->type) {
case SCHED_EVENT_RUN:
- burn_nsecs(atom->duration);
+ burn_nsecs(sched, atom->duration);
break;
case SCHED_EVENT_SLEEP:
if (atom->wait_sem)
@@ -424,8 +437,8 @@ static int self_open_counters(void)
fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
if (fd < 0)
- die("Error: sys_perf_event_open() syscall returned"
- "with %d (%s)\n", fd, strerror(errno));
+ pr_err("Error: sys_perf_event_open() syscall returned "
+ "with %d (%s)\n", fd, strerror(errno));
return fd;
}
@@ -440,31 +453,41 @@ static u64 get_cpu_usage_nsec_self(int fd)
return runtime;
}
+struct sched_thread_parms {
+ struct task_desc *task;
+ struct perf_sched *sched;
+};
+
static void *thread_func(void *ctx)
{
- struct task_desc *this_task = ctx;
+ struct sched_thread_parms *parms = ctx;
+ struct task_desc *this_task = parms->task;
+ struct perf_sched *sched = parms->sched;
u64 cpu_usage_0, cpu_usage_1;
unsigned long i, ret;
char comm2[22];
int fd;
+ zfree(&parms);
+
sprintf(comm2, ":%s", this_task->comm);
prctl(PR_SET_NAME, comm2);
fd = self_open_counters();
-
+ if (fd < 0)
+ return NULL;
again:
ret = sem_post(&this_task->ready_for_work);
BUG_ON(ret);
- ret = pthread_mutex_lock(&start_work_mutex);
+ ret = pthread_mutex_lock(&sched->start_work_mutex);
BUG_ON(ret);
- ret = pthread_mutex_unlock(&start_work_mutex);
+ ret = pthread_mutex_unlock(&sched->start_work_mutex);
BUG_ON(ret);
cpu_usage_0 = get_cpu_usage_nsec_self(fd);
for (i = 0; i < this_task->nr_events; i++) {
this_task->curr_event = i;
- process_sched_event(this_task, this_task->atoms[i]);
+ perf_sched__process_event(sched, this_task->atoms[i]);
}
cpu_usage_1 = get_cpu_usage_nsec_self(fd);
@@ -472,15 +495,15 @@ again:
ret = sem_post(&this_task->work_done_sem);
BUG_ON(ret);
- ret = pthread_mutex_lock(&work_done_wait_mutex);
+ ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
BUG_ON(ret);
- ret = pthread_mutex_unlock(&work_done_wait_mutex);
+ ret = pthread_mutex_unlock(&sched->work_done_wait_mutex);
BUG_ON(ret);
goto again;
}
-static void create_tasks(void)
+static void create_tasks(struct perf_sched *sched)
{
struct task_desc *task;
pthread_attr_t attr;
@@ -489,372 +512,238 @@ static void create_tasks(void)
err = pthread_attr_init(&attr);
BUG_ON(err);
- err = pthread_attr_setstacksize(&attr, (size_t)(16*1024));
+ err = pthread_attr_setstacksize(&attr,
+ (size_t) max(16 * 1024, PTHREAD_STACK_MIN));
BUG_ON(err);
- err = pthread_mutex_lock(&start_work_mutex);
+ err = pthread_mutex_lock(&sched->start_work_mutex);
BUG_ON(err);
- err = pthread_mutex_lock(&work_done_wait_mutex);
+ err = pthread_mutex_lock(&sched->work_done_wait_mutex);
BUG_ON(err);
- for (i = 0; i < nr_tasks; i++) {
- task = tasks[i];
+ for (i = 0; i < sched->nr_tasks; i++) {
+ struct sched_thread_parms *parms = malloc(sizeof(*parms));
+ BUG_ON(parms == NULL);
+ parms->task = task = sched->tasks[i];
+ parms->sched = sched;
sem_init(&task->sleep_sem, 0, 0);
sem_init(&task->ready_for_work, 0, 0);
sem_init(&task->work_done_sem, 0, 0);
task->curr_event = 0;
- err = pthread_create(&task->thread, &attr, thread_func, task);
+ err = pthread_create(&task->thread, &attr, thread_func, parms);
BUG_ON(err);
}
}
-static void wait_for_tasks(void)
+static void wait_for_tasks(struct perf_sched *sched)
{
u64 cpu_usage_0, cpu_usage_1;
struct task_desc *task;
unsigned long i, ret;
- start_time = get_nsecs();
- cpu_usage = 0;
- pthread_mutex_unlock(&work_done_wait_mutex);
+ sched->start_time = get_nsecs();
+ sched->cpu_usage = 0;
+ pthread_mutex_unlock(&sched->work_done_wait_mutex);
- for (i = 0; i < nr_tasks; i++) {
- task = tasks[i];
+ for (i = 0; i < sched->nr_tasks; i++) {
+ task = sched->tasks[i];
ret = sem_wait(&task->ready_for_work);
BUG_ON(ret);
sem_init(&task->ready_for_work, 0, 0);
}
- ret = pthread_mutex_lock(&work_done_wait_mutex);
+ ret = pthread_mutex_lock(&sched->work_done_wait_mutex);
BUG_ON(ret);
cpu_usage_0 = get_cpu_usage_nsec_parent();
- pthread_mutex_unlock(&start_work_mutex);
+ pthread_mutex_unlock(&sched->start_work_mutex);
- for (i = 0; i < nr_tasks; i++) {
- task = tasks[i];
+ for (i = 0; i < sched->nr_tasks; i++) {
+ task = sched->tasks[i];
ret = sem_wait(&task->work_done_sem);
BUG_ON(ret);
sem_init(&task->work_done_sem, 0, 0);
- cpu_usage += task->cpu_usage;
+ sched->cpu_usage += task->cpu_usage;
task->cpu_usage = 0;
}
cpu_usage_1 = get_cpu_usage_nsec_parent();
- if (!runavg_cpu_usage)
- runavg_cpu_usage = cpu_usage;
- runavg_cpu_usage = (runavg_cpu_usage*9 + cpu_usage)/10;
+ if (!sched->runavg_cpu_usage)
+ sched->runavg_cpu_usage = sched->cpu_usage;
+ sched->runavg_cpu_usage = (sched->runavg_cpu_usage * 9 + sched->cpu_usage) / 10;
- parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
- if (!runavg_parent_cpu_usage)
- runavg_parent_cpu_usage = parent_cpu_usage;
- runavg_parent_cpu_usage = (runavg_parent_cpu_usage*9 +
- parent_cpu_usage)/10;
+ sched->parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
+ if (!sched->runavg_parent_cpu_usage)
+ sched->runavg_parent_cpu_usage = sched->parent_cpu_usage;
+ sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * 9 +
+ sched->parent_cpu_usage)/10;
- ret = pthread_mutex_lock(&start_work_mutex);
+ ret = pthread_mutex_lock(&sched->start_work_mutex);
BUG_ON(ret);
- for (i = 0; i < nr_tasks; i++) {
- task = tasks[i];
+ for (i = 0; i < sched->nr_tasks; i++) {
+ task = sched->tasks[i];
sem_init(&task->sleep_sem, 0, 0);
task->curr_event = 0;
}
}
-static void run_one_test(void)
+static void run_one_test(struct perf_sched *sched)
{
- u64 T0, T1, delta, avg_delta, fluct, std_dev;
+ u64 T0, T1, delta, avg_delta, fluct;
T0 = get_nsecs();
- wait_for_tasks();
+ wait_for_tasks(sched);
T1 = get_nsecs();
delta = T1 - T0;
- sum_runtime += delta;
- nr_runs++;
+ sched->sum_runtime += delta;
+ sched->nr_runs++;
- avg_delta = sum_runtime / nr_runs;
+ avg_delta = sched->sum_runtime / sched->nr_runs;
if (delta < avg_delta)
fluct = avg_delta - delta;
else
fluct = delta - avg_delta;
- sum_fluct += fluct;
- std_dev = sum_fluct / nr_runs / sqrt(nr_runs);
- if (!run_avg)
- run_avg = delta;
- run_avg = (run_avg*9 + delta)/10;
+ sched->sum_fluct += fluct;
+ if (!sched->run_avg)
+ sched->run_avg = delta;
+ sched->run_avg = (sched->run_avg * 9 + delta) / 10;
- printf("#%-3ld: %0.3f, ",
- nr_runs, (double)delta/1000000.0);
+ printf("#%-3ld: %0.3f, ", sched->nr_runs, (double)delta / 1000000.0);
- printf("ravg: %0.2f, ",
- (double)run_avg/1e6);
+ printf("ravg: %0.2f, ", (double)sched->run_avg / 1e6);
printf("cpu: %0.2f / %0.2f",
- (double)cpu_usage/1e6, (double)runavg_cpu_usage/1e6);
+ (double)sched->cpu_usage / 1e6, (double)sched->runavg_cpu_usage / 1e6);
#if 0
/*
* rusage statistics done by the parent, these are less
- * accurate than the sum_exec_runtime based statistics:
+ * accurate than the sched->sum_exec_runtime based statistics:
*/
printf(" [%0.2f / %0.2f]",
- (double)parent_cpu_usage/1e6,
- (double)runavg_parent_cpu_usage/1e6);
+ (double)sched->parent_cpu_usage/1e6,
+ (double)sched->runavg_parent_cpu_usage/1e6);
#endif
printf("\n");
- if (nr_sleep_corrections)
- printf(" (%ld sleep corrections)\n", nr_sleep_corrections);
- nr_sleep_corrections = 0;
+ if (sched->nr_sleep_corrections)
+ printf(" (%ld sleep corrections)\n", sched->nr_sleep_corrections);
+ sched->nr_sleep_corrections = 0;
}
-static void test_calibrations(void)
+static void test_calibrations(struct perf_sched *sched)
{
u64 T0, T1;
T0 = get_nsecs();
- burn_nsecs(1e6);
+ burn_nsecs(sched, 1e6);
T1 = get_nsecs();
- printf("the run test took %Ld nsecs\n", T1-T0);
+ printf("the run test took %" PRIu64 " nsecs\n", T1 - T0);
T0 = get_nsecs();
sleep_nsecs(1e6);
T1 = get_nsecs();
- printf("the sleep test took %Ld nsecs\n", T1-T0);
+ printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0);
}
-#define FILL_FIELD(ptr, field, event, data) \
- ptr.field = (typeof(ptr.field)) raw_field_value(event, #field, data)
-
-#define FILL_ARRAY(ptr, array, event, data) \
-do { \
- void *__array = raw_field_ptr(event, #array, data); \
- memcpy(ptr.array, __array, sizeof(ptr.array)); \
-} while(0)
-
-#define FILL_COMMON_FIELDS(ptr, event, data) \
-do { \
- FILL_FIELD(ptr, common_type, event, data); \
- FILL_FIELD(ptr, common_flags, event, data); \
- FILL_FIELD(ptr, common_preempt_count, event, data); \
- FILL_FIELD(ptr, common_pid, event, data); \
- FILL_FIELD(ptr, common_tgid, event, data); \
-} while (0)
-
-
-
-struct trace_switch_event {
- u32 size;
-
- u16 common_type;
- u8 common_flags;
- u8 common_preempt_count;
- u32 common_pid;
- u32 common_tgid;
-
- char prev_comm[16];
- u32 prev_pid;
- u32 prev_prio;
- u64 prev_state;
- char next_comm[16];
- u32 next_pid;
- u32 next_prio;
-};
-
-struct trace_runtime_event {
- u32 size;
-
- u16 common_type;
- u8 common_flags;
- u8 common_preempt_count;
- u32 common_pid;
- u32 common_tgid;
-
- char comm[16];
- u32 pid;
- u64 runtime;
- u64 vruntime;
-};
-
-struct trace_wakeup_event {
- u32 size;
-
- u16 common_type;
- u8 common_flags;
- u8 common_preempt_count;
- u32 common_pid;
- u32 common_tgid;
-
- char comm[16];
- u32 pid;
-
- u32 prio;
- u32 success;
- u32 cpu;
-};
-
-struct trace_fork_event {
- u32 size;
-
- u16 common_type;
- u8 common_flags;
- u8 common_preempt_count;
- u32 common_pid;
- u32 common_tgid;
-
- char parent_comm[16];
- u32 parent_pid;
- char child_comm[16];
- u32 child_pid;
-};
-
-struct trace_migrate_task_event {
- u32 size;
-
- u16 common_type;
- u8 common_flags;
- u8 common_preempt_count;
- u32 common_pid;
- u32 common_tgid;
-
- char comm[16];
- u32 pid;
-
- u32 prio;
- u32 cpu;
-};
-
-struct trace_sched_handler {
- void (*switch_event)(struct trace_switch_event *,
- struct perf_session *,
- struct event *,
- int cpu,
- u64 timestamp,
- struct thread *thread);
-
- void (*runtime_event)(struct trace_runtime_event *,
- struct perf_session *,
- struct event *,
- int cpu,
- u64 timestamp,
- struct thread *thread);
-
- void (*wakeup_event)(struct trace_wakeup_event *,
- struct perf_session *,
- struct event *,
- int cpu,
- u64 timestamp,
- struct thread *thread);
-
- void (*fork_event)(struct trace_fork_event *,
- struct event *,
- int cpu,
- u64 timestamp,
- struct thread *thread);
-
- void (*migrate_task_event)(struct trace_migrate_task_event *,
- struct perf_session *session,
- struct event *,
- int cpu,
- u64 timestamp,
- struct thread *thread);
-};
-
-
-static void
-replay_wakeup_event(struct trace_wakeup_event *wakeup_event,
- struct perf_session *session __used,
- struct event *event,
- int cpu __used,
- u64 timestamp __used,
- struct thread *thread __used)
+static int
+replay_wakeup_event(struct perf_sched *sched,
+ struct perf_evsel *evsel, struct perf_sample *sample,
+ struct machine *machine __maybe_unused)
{
+ const char *comm = perf_evsel__strval(evsel, sample, "comm");
+ const u32 pid = perf_evsel__intval(evsel, sample, "pid");
struct task_desc *waker, *wakee;
if (verbose) {
- printf("sched_wakeup event %p\n", event);
+ printf("sched_wakeup event %p\n", evsel);
- printf(" ... pid %d woke up %s/%d\n",
- wakeup_event->common_pid,
- wakeup_event->comm,
- wakeup_event->pid);
+ printf(" ... pid %d woke up %s/%d\n", sample->tid, comm, pid);
}
- waker = register_pid(wakeup_event->common_pid, "<unknown>");
- wakee = register_pid(wakeup_event->pid, wakeup_event->comm);
+ waker = register_pid(sched, sample->tid, "<unknown>");
+ wakee = register_pid(sched, pid, comm);
- add_sched_event_wakeup(waker, timestamp, wakee);
+ add_sched_event_wakeup(sched, waker, sample->time, wakee);
+ return 0;
}
-static u64 cpu_last_switched[MAX_CPUS];
-
-static void
-replay_switch_event(struct trace_switch_event *switch_event,
- struct perf_session *session __used,
- struct event *event,
- int cpu,
- u64 timestamp,
- struct thread *thread __used)
+static int replay_switch_event(struct perf_sched *sched,
+ struct perf_evsel *evsel,
+ struct perf_sample *sample,
+ struct machine *machine __maybe_unused)
{
- struct task_desc *prev, *next;
- u64 timestamp0;
+ const char *prev_comm = perf_evsel__strval(evsel, sample, "prev_comm"),
+ *next_comm = perf_evsel__strval(evsel, sample, "next_comm");
+ const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
+ next_pid = perf_evsel__intval(evsel, sample, "next_pid");
+ const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
+ struct task_desc *prev, __maybe_unused *next;
+ u64 timestamp0, timestamp = sample->time;
+ int cpu = sample->cpu;
s64 delta;
if (verbose)
- printf("sched_switch event %p\n", event);
+ printf("sched_switch event %p\n", evsel);
if (cpu >= MAX_CPUS || cpu < 0)
- return;
+ return 0;
- timestamp0 = cpu_last_switched[cpu];
+ timestamp0 = sched->cpu_last_switched[cpu];
if (timestamp0)
delta = timestamp - timestamp0;
else
delta = 0;
- if (delta < 0)
- die("hm, delta: %Ld < 0 ?\n", delta);
-
- if (verbose) {
- printf(" ... switch from %s/%d to %s/%d [ran %Ld nsecs]\n",
- switch_event->prev_comm, switch_event->prev_pid,
- switch_event->next_comm, switch_event->next_pid,
- delta);
+ if (delta < 0) {
+		pr_err("hm, delta: %" PRId64 " < 0 ?\n", delta);
+ return -1;
}
- prev = register_pid(switch_event->prev_pid, switch_event->prev_comm);
- next = register_pid(switch_event->next_pid, switch_event->next_comm);
+ pr_debug(" ... switch from %s/%d to %s/%d [ran %" PRIu64 " nsecs]\n",
+ prev_comm, prev_pid, next_comm, next_pid, delta);
- cpu_last_switched[cpu] = timestamp;
+ prev = register_pid(sched, prev_pid, prev_comm);
+ next = register_pid(sched, next_pid, next_comm);
- add_sched_event_run(prev, timestamp, delta);
- add_sched_event_sleep(prev, timestamp, switch_event->prev_state);
-}
+ sched->cpu_last_switched[cpu] = timestamp;
+ add_sched_event_run(sched, prev, timestamp, delta);
+ add_sched_event_sleep(sched, prev, timestamp, prev_state);
-static void
-replay_fork_event(struct trace_fork_event *fork_event,
- struct event *event,
- int cpu __used,
- u64 timestamp __used,
- struct thread *thread __used)
+ return 0;
+}
+
+static int replay_fork_event(struct perf_sched *sched,
+ union perf_event *event,
+ struct machine *machine)
{
+ struct thread *child, *parent;
+
+ child = machine__findnew_thread(machine, event->fork.pid,
+ event->fork.tid);
+ parent = machine__findnew_thread(machine, event->fork.ppid,
+ event->fork.ptid);
+
+ if (child == NULL || parent == NULL) {
+ pr_debug("thread does not exist on fork event: child %p, parent %p\n",
+ child, parent);
+ return 0;
+ }
+
if (verbose) {
- printf("sched_fork event %p\n", event);
- printf("... parent: %s/%d\n", fork_event->parent_comm, fork_event->parent_pid);
- printf("... child: %s/%d\n", fork_event->child_comm, fork_event->child_pid);
+ printf("fork event\n");
+ printf("... parent: %s/%d\n", thread__comm_str(parent), parent->tid);
+ printf("... child: %s/%d\n", thread__comm_str(child), child->tid);
}
- register_pid(fork_event->parent_pid, fork_event->parent_comm);
- register_pid(fork_event->child_pid, fork_event->child_comm);
-}
-static struct trace_sched_handler replay_ops = {
- .wakeup_event = replay_wakeup_event,
- .switch_event = replay_switch_event,
- .fork_event = replay_fork_event,
-};
+ register_pid(sched, parent->tid, thread__comm_str(parent));
+ register_pid(sched, child->tid, thread__comm_str(child));
+ return 0;
+}
struct sort_dimension {
const char *name;
@@ -862,8 +751,6 @@ struct sort_dimension {
struct list_head list;
};
-static LIST_HEAD(cmp_pid);
-
static int
thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r)
{
@@ -932,43 +819,37 @@ __thread_latency_insert(struct rb_root *root, struct work_atoms *data,
rb_insert_color(&data->node, root);
}
-static void thread_atoms_insert(struct thread *thread)
+static int thread_atoms_insert(struct perf_sched *sched, struct thread *thread)
{
struct work_atoms *atoms = zalloc(sizeof(*atoms));
- if (!atoms)
- die("No memory");
+ if (!atoms) {
+ pr_err("No memory at %s\n", __func__);
+ return -1;
+ }
atoms->thread = thread;
INIT_LIST_HEAD(&atoms->work_list);
- __thread_latency_insert(&atom_root, atoms, &cmp_pid);
-}
-
-static void
-latency_fork_event(struct trace_fork_event *fork_event __used,
- struct event *event __used,
- int cpu __used,
- u64 timestamp __used,
- struct thread *thread __used)
-{
- /* should insert the newcomer */
+ __thread_latency_insert(&sched->atom_root, atoms, &sched->cmp_pid);
+ return 0;
}
-__used
-static char sched_out_state(struct trace_switch_event *switch_event)
+static char sched_out_state(u64 prev_state)
{
const char *str = TASK_STATE_TO_CHAR_STR;
- return str[switch_event->prev_state];
+ return str[prev_state];
}
-static void
+static int
add_sched_out_event(struct work_atoms *atoms,
char run_state,
u64 timestamp)
{
struct work_atom *atom = zalloc(sizeof(*atom));
- if (!atom)
- die("Non memory");
+ if (!atom) {
+		pr_err("No memory at %s\n", __func__);
+ return -1;
+ }
atom->sched_out_time = timestamp;
@@ -978,10 +859,12 @@ add_sched_out_event(struct work_atoms *atoms,
}
list_add_tail(&atom->list, &atoms->work_list);
+ return 0;
}
static void
-add_runtime_event(struct work_atoms *atoms, u64 delta, u64 timestamp __used)
+add_runtime_event(struct work_atoms *atoms, u64 delta,
+ u64 timestamp __maybe_unused)
{
struct work_atom *atom;
@@ -1024,106 +907,123 @@ add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
atoms->nb_atoms++;
}
-static void
-latency_switch_event(struct trace_switch_event *switch_event,
- struct perf_session *session,
- struct event *event __used,
- int cpu,
- u64 timestamp,
- struct thread *thread __used)
+static int latency_switch_event(struct perf_sched *sched,
+ struct perf_evsel *evsel,
+ struct perf_sample *sample,
+ struct machine *machine)
{
+ const u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
+ next_pid = perf_evsel__intval(evsel, sample, "next_pid");
+ const u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
struct work_atoms *out_events, *in_events;
struct thread *sched_out, *sched_in;
- u64 timestamp0;
+ u64 timestamp0, timestamp = sample->time;
+ int cpu = sample->cpu;
s64 delta;
BUG_ON(cpu >= MAX_CPUS || cpu < 0);
- timestamp0 = cpu_last_switched[cpu];
- cpu_last_switched[cpu] = timestamp;
+ timestamp0 = sched->cpu_last_switched[cpu];
+ sched->cpu_last_switched[cpu] = timestamp;
if (timestamp0)
delta = timestamp - timestamp0;
else
delta = 0;
- if (delta < 0)
- die("hm, delta: %Ld < 0 ?\n", delta);
-
+ if (delta < 0) {
+		pr_err("hm, delta: %" PRId64 " < 0 ?\n", delta);
+ return -1;
+ }
- sched_out = perf_session__findnew(session, switch_event->prev_pid);
- sched_in = perf_session__findnew(session, switch_event->next_pid);
+ sched_out = machine__findnew_thread(machine, 0, prev_pid);
+ sched_in = machine__findnew_thread(machine, 0, next_pid);
- out_events = thread_atoms_search(&atom_root, sched_out, &cmp_pid);
+ out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
if (!out_events) {
- thread_atoms_insert(sched_out);
- out_events = thread_atoms_search(&atom_root, sched_out, &cmp_pid);
- if (!out_events)
- die("out-event: Internal tree error");
+ if (thread_atoms_insert(sched, sched_out))
+ return -1;
+ out_events = thread_atoms_search(&sched->atom_root, sched_out, &sched->cmp_pid);
+ if (!out_events) {
+ pr_err("out-event: Internal tree error");
+ return -1;
+ }
}
- add_sched_out_event(out_events, sched_out_state(switch_event), timestamp);
+ if (add_sched_out_event(out_events, sched_out_state(prev_state), timestamp))
+ return -1;
- in_events = thread_atoms_search(&atom_root, sched_in, &cmp_pid);
+ in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
if (!in_events) {
- thread_atoms_insert(sched_in);
- in_events = thread_atoms_search(&atom_root, sched_in, &cmp_pid);
- if (!in_events)
- die("in-event: Internal tree error");
+ if (thread_atoms_insert(sched, sched_in))
+ return -1;
+ in_events = thread_atoms_search(&sched->atom_root, sched_in, &sched->cmp_pid);
+ if (!in_events) {
+ pr_err("in-event: Internal tree error");
+ return -1;
+ }
/*
	 * Task came in that we have not heard about yet,
	 * add an initial atom in runnable state:
*/
- add_sched_out_event(in_events, 'R', timestamp);
+ if (add_sched_out_event(in_events, 'R', timestamp))
+ return -1;
}
add_sched_in_event(in_events, timestamp);
+
+ return 0;
}
-static void
-latency_runtime_event(struct trace_runtime_event *runtime_event,
- struct perf_session *session,
- struct event *event __used,
- int cpu,
- u64 timestamp,
- struct thread *this_thread __used)
+static int latency_runtime_event(struct perf_sched *sched,
+ struct perf_evsel *evsel,
+ struct perf_sample *sample,
+ struct machine *machine)
{
- struct thread *thread = perf_session__findnew(session, runtime_event->pid);
- struct work_atoms *atoms = thread_atoms_search(&atom_root, thread, &cmp_pid);
+ const u32 pid = perf_evsel__intval(evsel, sample, "pid");
+ const u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
+ struct thread *thread = machine__findnew_thread(machine, 0, pid);
+ struct work_atoms *atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
+ u64 timestamp = sample->time;
+ int cpu = sample->cpu;
BUG_ON(cpu >= MAX_CPUS || cpu < 0);
if (!atoms) {
- thread_atoms_insert(thread);
- atoms = thread_atoms_search(&atom_root, thread, &cmp_pid);
- if (!atoms)
- die("in-event: Internal tree error");
- add_sched_out_event(atoms, 'R', timestamp);
+ if (thread_atoms_insert(sched, thread))
+ return -1;
+ atoms = thread_atoms_search(&sched->atom_root, thread, &sched->cmp_pid);
+ if (!atoms) {
+ pr_err("in-event: Internal tree error");
+ return -1;
+ }
+ if (add_sched_out_event(atoms, 'R', timestamp))
+ return -1;
}
- add_runtime_event(atoms, runtime_event->runtime, timestamp);
+ add_runtime_event(atoms, runtime, timestamp);
+ return 0;
}
-static void
-latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
- struct perf_session *session,
- struct event *__event __used,
- int cpu __used,
- u64 timestamp,
- struct thread *thread __used)
+static int latency_wakeup_event(struct perf_sched *sched,
+ struct perf_evsel *evsel,
+ struct perf_sample *sample,
+ struct machine *machine)
{
+ const u32 pid = perf_evsel__intval(evsel, sample, "pid");
struct work_atoms *atoms;
struct work_atom *atom;
struct thread *wakee;
+ u64 timestamp = sample->time;
- /* Note for later, it may be interesting to observe the failing cases */
- if (!wakeup_event->success)
- return;
-
- wakee = perf_session__findnew(session, wakeup_event->pid);
- atoms = thread_atoms_search(&atom_root, wakee, &cmp_pid);
+ wakee = machine__findnew_thread(machine, 0, pid);
+ atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
if (!atoms) {
- thread_atoms_insert(wakee);
- atoms = thread_atoms_search(&atom_root, wakee, &cmp_pid);
- if (!atoms)
- die("wakeup-event: Internal tree error");
- add_sched_out_event(atoms, 'S', timestamp);
+ if (thread_atoms_insert(sched, wakee))
+ return -1;
+ atoms = thread_atoms_search(&sched->atom_root, wakee, &sched->cmp_pid);
+ if (!atoms) {
+ pr_err("wakeup-event: Internal tree error");
+ return -1;
+ }
+ if (add_sched_out_event(atoms, 'S', timestamp))
+ return -1;
}
BUG_ON(list_empty(&atoms->work_list));
@@ -1131,31 +1031,37 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
atom = list_entry(atoms->work_list.prev, struct work_atom, list);
/*
+ * A wakeup event is not guaranteed to arrive only while the
+ * task is off the run queue; it may also fire while the task
+ * is still on the run queue, merely switching ->state back to
+ * TASK_RUNNING. Do not set ->wake_up_time when waking a task
+ * that is already on the run queue.
+ *
* You WILL be missing events if you've recorded only
* one CPU, or are looking at only one, so don't
- * make useless noise.
+ * skip in this case.
*/
- if (profile_cpu == -1 && atom->state != THREAD_SLEEPING)
- nr_state_machine_bugs++;
+ if (sched->profile_cpu == -1 && atom->state != THREAD_SLEEPING)
+ return 0;
- nr_timestamps++;
+ sched->nr_timestamps++;
if (atom->sched_out_time > timestamp) {
- nr_unordered_timestamps++;
- return;
+ sched->nr_unordered_timestamps++;
+ return 0;
}
atom->state = THREAD_WAIT_CPU;
atom->wake_up_time = timestamp;
+ return 0;
}
-static void
-latency_migrate_task_event(struct trace_migrate_task_event *migrate_task_event,
- struct perf_session *session,
- struct event *__event __used,
- int cpu __used,
- u64 timestamp,
- struct thread *thread __used)
+static int latency_migrate_task_event(struct perf_sched *sched,
+ struct perf_evsel *evsel,
+ struct perf_sample *sample,
+ struct machine *machine)
{
+ const u32 pid = perf_evsel__intval(evsel, sample, "pid");
+ u64 timestamp = sample->time;
struct work_atoms *atoms;
struct work_atom *atom;
struct thread *migrant;
@@ -1163,18 +1069,22 @@ latency_migrate_task_event(struct trace_migrate_task_event *migrate_task_event,
/*
* Only need to worry about migration when profiling one CPU.
*/
- if (profile_cpu == -1)
- return;
+ if (sched->profile_cpu == -1)
+ return 0;
- migrant = perf_session__findnew(session, migrate_task_event->pid);
- atoms = thread_atoms_search(&atom_root, migrant, &cmp_pid);
+ migrant = machine__findnew_thread(machine, 0, pid);
+ atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
if (!atoms) {
- thread_atoms_insert(migrant);
- register_pid(migrant->pid, migrant->comm);
- atoms = thread_atoms_search(&atom_root, migrant, &cmp_pid);
- if (!atoms)
- die("migration-event: Internal tree error");
- add_sched_out_event(atoms, 'R', timestamp);
+ if (thread_atoms_insert(sched, migrant))
+ return -1;
+ register_pid(sched, migrant->tid, thread__comm_str(migrant));
+ atoms = thread_atoms_search(&sched->atom_root, migrant, &sched->cmp_pid);
+ if (!atoms) {
+ pr_err("migration-event: Internal tree error");
+ return -1;
+ }
+ if (add_sched_out_event(atoms, 'R', timestamp))
+ return -1;
}
BUG_ON(list_empty(&atoms->work_list));
@@ -1182,21 +1092,15 @@ latency_migrate_task_event(struct trace_migrate_task_event *migrate_task_event,
atom = list_entry(atoms->work_list.prev, struct work_atom, list);
atom->sched_in_time = atom->sched_out_time = atom->wake_up_time = timestamp;
- nr_timestamps++;
+ sched->nr_timestamps++;
if (atom->sched_out_time > timestamp)
- nr_unordered_timestamps++;
-}
+ sched->nr_unordered_timestamps++;
-static struct trace_sched_handler lat_ops = {
- .wakeup_event = latency_wakeup_event,
- .switch_event = latency_switch_event,
- .runtime_event = latency_runtime_event,
- .fork_event = latency_fork_event,
- .migrate_task_event = latency_migrate_task_event,
-};
+ return 0;
+}
-static void output_lat_thread(struct work_atoms *work_list)
+static void output_lat_thread(struct perf_sched *sched, struct work_atoms *work_list)
{
int i;
int ret;
@@ -1207,20 +1111,20 @@ static void output_lat_thread(struct work_atoms *work_list)
/*
* Ignore idle threads:
*/
- if (!strcmp(work_list->thread->comm, "swapper"))
+ if (!strcmp(thread__comm_str(work_list->thread), "swapper"))
return;
- all_runtime += work_list->total_runtime;
- all_count += work_list->nb_atoms;
+ sched->all_runtime += work_list->total_runtime;
+ sched->all_count += work_list->nb_atoms;
- ret = printf(" %s:%d ", work_list->thread->comm, work_list->thread->pid);
+ ret = printf(" %s:%d ", thread__comm_str(work_list->thread), work_list->thread->tid);
for (i = 0; i < 24 - ret; i++)
printf(" ");
avg = work_list->total_lat / work_list->nb_atoms;
- printf("|%11.3f ms |%9llu | avg:%9.3f ms | max:%9.3f ms | max at: %9.6f s\n",
+ printf("|%11.3f ms |%9" PRIu64 " | avg:%9.3f ms | max:%9.3f ms | max at: %13.6f s\n",
(double)work_list->total_runtime / 1e6,
work_list->nb_atoms, (double)avg / 1e6,
(double)work_list->max_lat / 1e6,
@@ -1229,19 +1133,14 @@ static void output_lat_thread(struct work_atoms *work_list)
static int pid_cmp(struct work_atoms *l, struct work_atoms *r)
{
- if (l->thread->pid < r->thread->pid)
+ if (l->thread->tid < r->thread->tid)
return -1;
- if (l->thread->pid > r->thread->pid)
+ if (l->thread->tid > r->thread->tid)
return 1;
return 0;
}
-static struct sort_dimension pid_sort_dimension = {
- .name = "pid",
- .cmp = pid_cmp,
-};
-
static int avg_cmp(struct work_atoms *l, struct work_atoms *r)
{
u64 avgl, avgr;
@@ -1263,11 +1162,6 @@ static int avg_cmp(struct work_atoms *l, struct work_atoms *r)
return 0;
}
-static struct sort_dimension avg_sort_dimension = {
- .name = "avg",
- .cmp = avg_cmp,
-};
-
static int max_cmp(struct work_atoms *l, struct work_atoms *r)
{
if (l->max_lat < r->max_lat)
@@ -1278,11 +1172,6 @@ static int max_cmp(struct work_atoms *l, struct work_atoms *r)
return 0;
}
-static struct sort_dimension max_sort_dimension = {
- .name = "max",
- .cmp = max_cmp,
-};
-
static int switch_cmp(struct work_atoms *l, struct work_atoms *r)
{
if (l->nb_atoms < r->nb_atoms)
@@ -1293,11 +1182,6 @@ static int switch_cmp(struct work_atoms *l, struct work_atoms *r)
return 0;
}
-static struct sort_dimension switch_sort_dimension = {
- .name = "switch",
- .cmp = switch_cmp,
-};
-
static int runtime_cmp(struct work_atoms *l, struct work_atoms *r)
{
if (l->total_runtime < r->total_runtime)
@@ -1308,28 +1192,38 @@ static int runtime_cmp(struct work_atoms *l, struct work_atoms *r)
return 0;
}
-static struct sort_dimension runtime_sort_dimension = {
- .name = "runtime",
- .cmp = runtime_cmp,
-};
-
-static struct sort_dimension *available_sorts[] = {
- &pid_sort_dimension,
- &avg_sort_dimension,
- &max_sort_dimension,
- &switch_sort_dimension,
- &runtime_sort_dimension,
-};
-
-#define NB_AVAILABLE_SORTS (int)(sizeof(available_sorts) / sizeof(struct sort_dimension *))
-
-static LIST_HEAD(sort_list);
-
static int sort_dimension__add(const char *tok, struct list_head *list)
{
- int i;
-
- for (i = 0; i < NB_AVAILABLE_SORTS; i++) {
+ size_t i;
+ static struct sort_dimension avg_sort_dimension = {
+ .name = "avg",
+ .cmp = avg_cmp,
+ };
+ static struct sort_dimension max_sort_dimension = {
+ .name = "max",
+ .cmp = max_cmp,
+ };
+ static struct sort_dimension pid_sort_dimension = {
+ .name = "pid",
+ .cmp = pid_cmp,
+ };
+ static struct sort_dimension runtime_sort_dimension = {
+ .name = "runtime",
+ .cmp = runtime_cmp,
+ };
+ static struct sort_dimension switch_sort_dimension = {
+ .name = "switch",
+ .cmp = switch_cmp,
+ };
+ struct sort_dimension *available_sorts[] = {
+ &pid_sort_dimension,
+ &avg_sort_dimension,
+ &max_sort_dimension,
+ &switch_sort_dimension,
+ &runtime_sort_dimension,
+ };
+
+ for (i = 0; i < ARRAY_SIZE(available_sorts); i++) {
if (!strcmp(available_sorts[i]->name, tok)) {
list_add_tail(&available_sorts[i]->list, list);
@@ -1340,539 +1234,412 @@ static int sort_dimension__add(const char *tok, struct list_head *list)
return -1;
}
-static void setup_sorting(void);
-
-static void sort_lat(void)
+static void perf_sched__sort_lat(struct perf_sched *sched)
{
struct rb_node *node;
for (;;) {
struct work_atoms *data;
- node = rb_first(&atom_root);
+ node = rb_first(&sched->atom_root);
if (!node)
break;
- rb_erase(node, &atom_root);
+ rb_erase(node, &sched->atom_root);
data = rb_entry(node, struct work_atoms, node);
- __thread_latency_insert(&sorted_atom_root, data, &sort_list);
+ __thread_latency_insert(&sched->sorted_atom_root, data, &sched->sort_list);
}
}
-static struct trace_sched_handler *trace_handler;
-
-static void
-process_sched_wakeup_event(void *data, struct perf_session *session,
- struct event *event,
- int cpu __used,
- u64 timestamp __used,
- struct thread *thread __used)
+static int process_sched_wakeup_event(struct perf_tool *tool,
+ struct perf_evsel *evsel,
+ struct perf_sample *sample,
+ struct machine *machine)
{
- struct trace_wakeup_event wakeup_event;
-
- FILL_COMMON_FIELDS(wakeup_event, event, data);
+ struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
- FILL_ARRAY(wakeup_event, comm, event, data);
- FILL_FIELD(wakeup_event, pid, event, data);
- FILL_FIELD(wakeup_event, prio, event, data);
- FILL_FIELD(wakeup_event, success, event, data);
- FILL_FIELD(wakeup_event, cpu, event, data);
+ if (sched->tp_handler->wakeup_event)
+ return sched->tp_handler->wakeup_event(sched, evsel, sample, machine);
- if (trace_handler->wakeup_event)
- trace_handler->wakeup_event(&wakeup_event, session, event,
- cpu, timestamp, thread);
+ return 0;
}
-/*
- * Track the current task - that way we can know whether there's any
- * weird events, such as a task being switched away that is not current.
- */
-static int max_cpu;
-
-static u32 curr_pid[MAX_CPUS] = { [0 ... MAX_CPUS-1] = -1 };
-
-static struct thread *curr_thread[MAX_CPUS];
-
-static char next_shortname1 = 'A';
-static char next_shortname2 = '0';
-
-static void
-map_switch_event(struct trace_switch_event *switch_event,
- struct perf_session *session,
- struct event *event __used,
- int this_cpu,
- u64 timestamp,
- struct thread *thread __used)
+static int map_switch_event(struct perf_sched *sched, struct perf_evsel *evsel,
+ struct perf_sample *sample, struct machine *machine)
{
- struct thread *sched_out, *sched_in;
+ const u32 next_pid = perf_evsel__intval(evsel, sample, "next_pid");
+ struct thread *sched_in;
int new_shortname;
- u64 timestamp0;
+ u64 timestamp0, timestamp = sample->time;
s64 delta;
- int cpu;
+ int cpu, this_cpu = sample->cpu;
BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0);
- if (this_cpu > max_cpu)
- max_cpu = this_cpu;
+ if (this_cpu > sched->max_cpu)
+ sched->max_cpu = this_cpu;
- timestamp0 = cpu_last_switched[this_cpu];
- cpu_last_switched[this_cpu] = timestamp;
+ timestamp0 = sched->cpu_last_switched[this_cpu];
+ sched->cpu_last_switched[this_cpu] = timestamp;
if (timestamp0)
delta = timestamp - timestamp0;
else
delta = 0;
- if (delta < 0)
- die("hm, delta: %Ld < 0 ?\n", delta);
-
+ if (delta < 0) {
+ pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
+ return -1;
+ }
- sched_out = perf_session__findnew(session, switch_event->prev_pid);
- sched_in = perf_session__findnew(session, switch_event->next_pid);
+ sched_in = machine__findnew_thread(machine, 0, next_pid);
- curr_thread[this_cpu] = sched_in;
+ sched->curr_thread[this_cpu] = sched_in;
printf(" ");
new_shortname = 0;
if (!sched_in->shortname[0]) {
- sched_in->shortname[0] = next_shortname1;
- sched_in->shortname[1] = next_shortname2;
-
- if (next_shortname1 < 'Z') {
- next_shortname1++;
+ if (!strcmp(thread__comm_str(sched_in), "swapper")) {
+ /*
+ * Don't allocate a letter-number for swapper:0
+ * as a shortname. Instead, we use '.' for it.
+ */
+ sched_in->shortname[0] = '.';
+ sched_in->shortname[1] = ' ';
} else {
- next_shortname1='A';
- if (next_shortname2 < '9') {
- next_shortname2++;
+ sched_in->shortname[0] = sched->next_shortname1;
+ sched_in->shortname[1] = sched->next_shortname2;
+
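+ /* cycle the two-char map shortnames A0..Z0, A1..Z1, ... Z9, then wrap back to A0 */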
+ if (sched->next_shortname1 < 'Z') {
+ sched->next_shortname1++;
} else {
- next_shortname2='0';
+ sched->next_shortname1 = 'A';
+ if (sched->next_shortname2 < '9')
+ sched->next_shortname2++;
+ else
+ sched->next_shortname2 = '0';
}
}
new_shortname = 1;
}
- for (cpu = 0; cpu <= max_cpu; cpu++) {
+ for (cpu = 0; cpu <= sched->max_cpu; cpu++) {
if (cpu != this_cpu)
printf(" ");
else
printf("*");
- if (curr_thread[cpu]) {
- if (curr_thread[cpu]->pid)
- printf("%2s ", curr_thread[cpu]->shortname);
- else
- printf(". ");
- } else
+ if (sched->curr_thread[cpu])
+ printf("%2s ", sched->curr_thread[cpu]->shortname);
+ else
printf(" ");
}
printf(" %12.6f secs ", (double)timestamp/1e9);
if (new_shortname) {
printf("%s => %s:%d\n",
- sched_in->shortname, sched_in->comm, sched_in->pid);
+ sched_in->shortname, thread__comm_str(sched_in), sched_in->tid);
} else {
printf("\n");
}
-}
+ return 0;
+}
-static void
-process_sched_switch_event(void *data, struct perf_session *session,
- struct event *event,
- int this_cpu,
- u64 timestamp __used,
- struct thread *thread __used)
+static int process_sched_switch_event(struct perf_tool *tool,
+ struct perf_evsel *evsel,
+ struct perf_sample *sample,
+ struct machine *machine)
{
- struct trace_switch_event switch_event;
-
- FILL_COMMON_FIELDS(switch_event, event, data);
-
- FILL_ARRAY(switch_event, prev_comm, event, data);
- FILL_FIELD(switch_event, prev_pid, event, data);
- FILL_FIELD(switch_event, prev_prio, event, data);
- FILL_FIELD(switch_event, prev_state, event, data);
- FILL_ARRAY(switch_event, next_comm, event, data);
- FILL_FIELD(switch_event, next_pid, event, data);
- FILL_FIELD(switch_event, next_prio, event, data);
+ struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
+ int this_cpu = sample->cpu, err = 0;
+ u32 prev_pid = perf_evsel__intval(evsel, sample, "prev_pid"),
+ next_pid = perf_evsel__intval(evsel, sample, "next_pid");
- if (curr_pid[this_cpu] != (u32)-1) {
+ if (sched->curr_pid[this_cpu] != (u32)-1) {
/*
* Are we trying to switch away a PID that is
* not current?
*/
- if (curr_pid[this_cpu] != switch_event.prev_pid)
- nr_context_switch_bugs++;
+ if (sched->curr_pid[this_cpu] != prev_pid)
+ sched->nr_context_switch_bugs++;
}
- if (trace_handler->switch_event)
- trace_handler->switch_event(&switch_event, session, event,
- this_cpu, timestamp, thread);
- curr_pid[this_cpu] = switch_event.next_pid;
+ if (sched->tp_handler->switch_event)
+ err = sched->tp_handler->switch_event(sched, evsel, sample, machine);
+
+ sched->curr_pid[this_cpu] = next_pid;
+ return err;
}
-static void
-process_sched_runtime_event(void *data, struct perf_session *session,
- struct event *event,
- int cpu __used,
- u64 timestamp __used,
- struct thread *thread __used)
+static int process_sched_runtime_event(struct perf_tool *tool,
+ struct perf_evsel *evsel,
+ struct perf_sample *sample,
+ struct machine *machine)
{
- struct trace_runtime_event runtime_event;
+ struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
- FILL_ARRAY(runtime_event, comm, event, data);
- FILL_FIELD(runtime_event, pid, event, data);
- FILL_FIELD(runtime_event, runtime, event, data);
- FILL_FIELD(runtime_event, vruntime, event, data);
+ if (sched->tp_handler->runtime_event)
+ return sched->tp_handler->runtime_event(sched, evsel, sample, machine);
- if (trace_handler->runtime_event)
- trace_handler->runtime_event(&runtime_event, session, event, cpu, timestamp, thread);
+ return 0;
}
-static void
-process_sched_fork_event(void *data,
- struct event *event,
- int cpu __used,
- u64 timestamp __used,
- struct thread *thread __used)
+static int perf_sched__process_fork_event(struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_sample *sample,
+ struct machine *machine)
{
- struct trace_fork_event fork_event;
+ struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
- FILL_COMMON_FIELDS(fork_event, event, data);
+ /* run the fork event through the perf machinery */
+ perf_event__process_fork(tool, event, sample, machine);
- FILL_ARRAY(fork_event, parent_comm, event, data);
- FILL_FIELD(fork_event, parent_pid, event, data);
- FILL_ARRAY(fork_event, child_comm, event, data);
- FILL_FIELD(fork_event, child_pid, event, data);
+ /* and then run additional processing needed for this command */
+ if (sched->tp_handler->fork_event)
+ return sched->tp_handler->fork_event(sched, event, machine);
- if (trace_handler->fork_event)
- trace_handler->fork_event(&fork_event, event,
- cpu, timestamp, thread);
-}
-
-static void
-process_sched_exit_event(struct event *event,
- int cpu __used,
- u64 timestamp __used,
- struct thread *thread __used)
-{
- if (verbose)
- printf("sched_exit event %p\n", event);
+ return 0;
}
-static void
-process_sched_migrate_task_event(void *data, struct perf_session *session,
- struct event *event,
- int cpu __used,
- u64 timestamp __used,
- struct thread *thread __used)
+static int process_sched_migrate_task_event(struct perf_tool *tool,
+ struct perf_evsel *evsel,
+ struct perf_sample *sample,
+ struct machine *machine)
{
- struct trace_migrate_task_event migrate_task_event;
+ struct perf_sched *sched = container_of(tool, struct perf_sched, tool);
- FILL_COMMON_FIELDS(migrate_task_event, event, data);
+ if (sched->tp_handler->migrate_task_event)
+ return sched->tp_handler->migrate_task_event(sched, evsel, sample, machine);
- FILL_ARRAY(migrate_task_event, comm, event, data);
- FILL_FIELD(migrate_task_event, pid, event, data);
- FILL_FIELD(migrate_task_event, prio, event, data);
- FILL_FIELD(migrate_task_event, cpu, event, data);
-
- if (trace_handler->migrate_task_event)
- trace_handler->migrate_task_event(&migrate_task_event, session,
- event, cpu, timestamp, thread);
+ return 0;
}
-static void
-process_raw_event(event_t *raw_event __used, struct perf_session *session,
- void *data, int cpu, u64 timestamp, struct thread *thread)
-{
- struct event *event;
- int type;
-
-
- type = trace_parse_common_type(data);
- event = trace_find_event(type);
-
- if (!strcmp(event->name, "sched_switch"))
- process_sched_switch_event(data, session, event, cpu, timestamp, thread);
- if (!strcmp(event->name, "sched_stat_runtime"))
- process_sched_runtime_event(data, session, event, cpu, timestamp, thread);
- if (!strcmp(event->name, "sched_wakeup"))
- process_sched_wakeup_event(data, session, event, cpu, timestamp, thread);
- if (!strcmp(event->name, "sched_wakeup_new"))
- process_sched_wakeup_event(data, session, event, cpu, timestamp, thread);
- if (!strcmp(event->name, "sched_process_fork"))
- process_sched_fork_event(data, event, cpu, timestamp, thread);
- if (!strcmp(event->name, "sched_process_exit"))
- process_sched_exit_event(event, cpu, timestamp, thread);
- if (!strcmp(event->name, "sched_migrate_task"))
- process_sched_migrate_task_event(data, session, event, cpu, timestamp, thread);
-}
+typedef int (*tracepoint_handler)(struct perf_tool *tool,
+ struct perf_evsel *evsel,
+ struct perf_sample *sample,
+ struct machine *machine);
-static int process_sample_event(event_t *event, struct perf_session *session)
+static int perf_sched__process_tracepoint_sample(struct perf_tool *tool __maybe_unused,
+ union perf_event *event __maybe_unused,
+ struct perf_sample *sample,
+ struct perf_evsel *evsel,
+ struct machine *machine)
{
- struct sample_data data;
- struct thread *thread;
+ int err = 0;
- if (!(session->sample_type & PERF_SAMPLE_RAW))
- return 0;
+ evsel->hists.stats.total_period += sample->period;
+ hists__inc_nr_samples(&evsel->hists, true);
- memset(&data, 0, sizeof(data));
- data.time = -1;
- data.cpu = -1;
- data.period = -1;
-
- event__parse_sample(event, session->sample_type, &data);
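+ /* ->handler was installed from the tracepoint table in perf_sched__read_events() */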
+ if (evsel->handler != NULL) {
+ tracepoint_handler f = evsel->handler;
+ err = f(tool, evsel, sample, machine);
+ }
- dump_printf("(IP, %d): %d/%d: %#Lx period: %Ld\n", event->header.misc,
- data.pid, data.tid, data.ip, data.period);
+ return err;
+}
- thread = perf_session__findnew(session, data.pid);
- if (thread == NULL) {
- pr_debug("problem processing %d event, skipping it.\n",
- event->header.type);
+static int perf_sched__read_events(struct perf_sched *sched,
+ struct perf_session **psession)
+{
+ const struct perf_evsel_str_handler handlers[] = {
+ { "sched:sched_switch", process_sched_switch_event, },
+ { "sched:sched_stat_runtime", process_sched_runtime_event, },
+ { "sched:sched_wakeup", process_sched_wakeup_event, },
+ { "sched:sched_wakeup_new", process_sched_wakeup_event, },
+ { "sched:sched_migrate_task", process_sched_migrate_task_event, },
+ };
+ struct perf_session *session;
+ struct perf_data_file file = {
+ .path = input_name,
+ .mode = PERF_DATA_MODE_READ,
+ };
+
+ session = perf_session__new(&file, false, &sched->tool);
+ if (session == NULL) {
+ pr_debug("No Memory for session\n");
return -1;
}
- dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
+ if (perf_session__set_tracepoints_handlers(session, handlers))
+ goto out_delete;
- if (profile_cpu != -1 && profile_cpu != (int)data.cpu)
- return 0;
-
- process_raw_event(event, session, data.raw_data, data.cpu, data.time, thread);
+ if (perf_session__has_traces(session, "record -R")) {
+ int err = perf_session__process_events(session, &sched->tool);
+ if (err) {
+ pr_err("Failed to process events, error %d", err);
+ goto out_delete;
+ }
- return 0;
-}
+ sched->nr_events = session->stats.nr_events[0];
+ sched->nr_lost_events = session->stats.total_lost;
+ sched->nr_lost_chunks = session->stats.nr_events[PERF_RECORD_LOST];
+ }
-static int process_lost_event(event_t *event __used,
- struct perf_session *session __used)
-{
- nr_lost_chunks++;
- nr_lost_events += event->lost.lost;
+ if (psession)
+ *psession = session;
+ else
+ perf_session__delete(session);
return 0;
-}
-
-static struct perf_event_ops event_ops = {
- .sample = process_sample_event,
- .comm = event__process_comm,
- .lost = process_lost_event,
-};
-
-static int read_events(void)
-{
- int err = -EINVAL;
- struct perf_session *session = perf_session__new(input_name, O_RDONLY, 0);
- if (session == NULL)
- return -ENOMEM;
-
- if (perf_session__has_traces(session, "record -R"))
- err = perf_session__process_events(session, &event_ops);
+out_delete:
perf_session__delete(session);
- return err;
+ return -1;
}
-static void print_bad_events(void)
+static void print_bad_events(struct perf_sched *sched)
{
- if (nr_unordered_timestamps && nr_timestamps) {
+ if (sched->nr_unordered_timestamps && sched->nr_timestamps) {
printf(" INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
- (double)nr_unordered_timestamps/(double)nr_timestamps*100.0,
- nr_unordered_timestamps, nr_timestamps);
+ (double)sched->nr_unordered_timestamps/(double)sched->nr_timestamps*100.0,
+ sched->nr_unordered_timestamps, sched->nr_timestamps);
}
- if (nr_lost_events && nr_events) {
+ if (sched->nr_lost_events && sched->nr_events) {
printf(" INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
- (double)nr_lost_events/(double)nr_events*100.0,
- nr_lost_events, nr_events, nr_lost_chunks);
+ (double)sched->nr_lost_events/(double)sched->nr_events * 100.0,
+ sched->nr_lost_events, sched->nr_events, sched->nr_lost_chunks);
}
- if (nr_state_machine_bugs && nr_timestamps) {
- printf(" INFO: %.3f%% state machine bugs (%ld out of %ld)",
- (double)nr_state_machine_bugs/(double)nr_timestamps*100.0,
- nr_state_machine_bugs, nr_timestamps);
- if (nr_lost_events)
- printf(" (due to lost events?)");
- printf("\n");
- }
- if (nr_context_switch_bugs && nr_timestamps) {
+ if (sched->nr_context_switch_bugs && sched->nr_timestamps) {
printf(" INFO: %.3f%% context switch bugs (%ld out of %ld)",
- (double)nr_context_switch_bugs/(double)nr_timestamps*100.0,
- nr_context_switch_bugs, nr_timestamps);
- if (nr_lost_events)
+ (double)sched->nr_context_switch_bugs/(double)sched->nr_timestamps*100.0,
+ sched->nr_context_switch_bugs, sched->nr_timestamps);
+ if (sched->nr_lost_events)
printf(" (due to lost events?)");
printf("\n");
}
}
-static void __cmd_lat(void)
+static int perf_sched__lat(struct perf_sched *sched)
{
struct rb_node *next;
+ struct perf_session *session;
setup_pager();
- read_events();
- sort_lat();
- printf("\n ---------------------------------------------------------------------------------------------------------------\n");
- printf(" Task | Runtime ms | Switches | Average delay ms | Maximum delay ms | Maximum delay at |\n");
- printf(" ---------------------------------------------------------------------------------------------------------------\n");
+ /* save session -- references to threads are held in work_list */
+ if (perf_sched__read_events(sched, &session))
+ return -1;
+
+ perf_sched__sort_lat(sched);
- next = rb_first(&sorted_atom_root);
+ printf("\n -----------------------------------------------------------------------------------------------------------------\n");
+ printf(" Task | Runtime ms | Switches | Average delay ms | Maximum delay ms | Maximum delay at |\n");
+ printf(" -----------------------------------------------------------------------------------------------------------------\n");
+
+ next = rb_first(&sched->sorted_atom_root);
while (next) {
struct work_atoms *work_list;
work_list = rb_entry(next, struct work_atoms, node);
- output_lat_thread(work_list);
+ output_lat_thread(sched, work_list);
next = rb_next(next);
}
- printf(" -----------------------------------------------------------------------------------------\n");
- printf(" TOTAL: |%11.3f ms |%9Ld |\n",
- (double)all_runtime/1e6, all_count);
+ printf(" -----------------------------------------------------------------------------------------------------------------\n");
+ printf(" TOTAL: |%11.3f ms |%9" PRIu64 " |\n",
+ (double)sched->all_runtime / 1e6, sched->all_count);
printf(" ---------------------------------------------------\n");
- print_bad_events();
+ print_bad_events(sched);
printf("\n");
+ perf_session__delete(session);
+ return 0;
}
-static struct trace_sched_handler map_ops = {
- .wakeup_event = NULL,
- .switch_event = map_switch_event,
- .runtime_event = NULL,
- .fork_event = NULL,
-};
-
-static void __cmd_map(void)
+static int perf_sched__map(struct perf_sched *sched)
{
- max_cpu = sysconf(_SC_NPROCESSORS_CONF);
+ sched->max_cpu = sysconf(_SC_NPROCESSORS_CONF);
setup_pager();
- read_events();
- print_bad_events();
+ if (perf_sched__read_events(sched, NULL))
+ return -1;
+ print_bad_events(sched);
+ return 0;
}
-static void __cmd_replay(void)
+static int perf_sched__replay(struct perf_sched *sched)
{
unsigned long i;
- calibrate_run_measurement_overhead();
- calibrate_sleep_measurement_overhead();
+ calibrate_run_measurement_overhead(sched);
+ calibrate_sleep_measurement_overhead(sched);
- test_calibrations();
+ test_calibrations(sched);
- read_events();
+ if (perf_sched__read_events(sched, NULL))
+ return -1;
- printf("nr_run_events: %ld\n", nr_run_events);
- printf("nr_sleep_events: %ld\n", nr_sleep_events);
- printf("nr_wakeup_events: %ld\n", nr_wakeup_events);
+ printf("nr_run_events: %ld\n", sched->nr_run_events);
+ printf("nr_sleep_events: %ld\n", sched->nr_sleep_events);
+ printf("nr_wakeup_events: %ld\n", sched->nr_wakeup_events);
- if (targetless_wakeups)
- printf("target-less wakeups: %ld\n", targetless_wakeups);
- if (multitarget_wakeups)
- printf("multi-target wakeups: %ld\n", multitarget_wakeups);
- if (nr_run_events_optimized)
+ if (sched->targetless_wakeups)
+ printf("target-less wakeups: %ld\n", sched->targetless_wakeups);
+ if (sched->multitarget_wakeups)
+ printf("multi-target wakeups: %ld\n", sched->multitarget_wakeups);
+ if (sched->nr_run_events_optimized)
printf("run atoms optimized: %ld\n",
- nr_run_events_optimized);
+ sched->nr_run_events_optimized);
- print_task_traces();
- add_cross_task_wakeups();
+ print_task_traces(sched);
+ add_cross_task_wakeups(sched);
- create_tasks();
+ create_tasks(sched);
printf("------------------------------------------------------------\n");
- for (i = 0; i < replay_repeat; i++)
- run_one_test();
-}
+ for (i = 0; i < sched->replay_repeat; i++)
+ run_one_test(sched);
+ return 0;
+}
-static const char * const sched_usage[] = {
- "perf sched [<options>] {record|latency|map|replay|trace}",
- NULL
-};
-
-static const struct option sched_options[] = {
- OPT_STRING('i', "input", &input_name, "file",
- "input file name"),
- OPT_BOOLEAN('v', "verbose", &verbose,
- "be more verbose (show symbol address, etc)"),
- OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
- "dump raw trace in ASCII"),
- OPT_END()
-};
-
-static const char * const latency_usage[] = {
- "perf sched latency [<options>]",
- NULL
-};
-
-static const struct option latency_options[] = {
- OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
- "sort by key(s): runtime, switch, avg, max"),
- OPT_BOOLEAN('v', "verbose", &verbose,
- "be more verbose (show symbol address, etc)"),
- OPT_INTEGER('C', "CPU", &profile_cpu,
- "CPU to profile on"),
- OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
- "dump raw trace in ASCII"),
- OPT_END()
-};
-
-static const char * const replay_usage[] = {
- "perf sched replay [<options>]",
- NULL
-};
-
-static const struct option replay_options[] = {
- OPT_INTEGER('r', "repeat", &replay_repeat,
- "repeat the workload replay N times (-1: infinite)"),
- OPT_BOOLEAN('v', "verbose", &verbose,
- "be more verbose (show symbol address, etc)"),
- OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
- "dump raw trace in ASCII"),
- OPT_END()
-};
-
-static void setup_sorting(void)
+static void setup_sorting(struct perf_sched *sched, const struct option *options,
+ const char * const usage_msg[])
{
- char *tmp, *tok, *str = strdup(sort_order);
+ char *tmp, *tok, *str = strdup(sched->sort_order);
for (tok = strtok_r(str, ", ", &tmp);
tok; tok = strtok_r(NULL, ", ", &tmp)) {
- if (sort_dimension__add(tok, &sort_list) < 0) {
+ if (sort_dimension__add(tok, &sched->sort_list) < 0) {
error("Unknown --sort key: `%s'", tok);
- usage_with_options(latency_usage, latency_options);
+ usage_with_options(usage_msg, options);
}
}
free(str);
- sort_dimension__add("pid", &cmp_pid);
+ sort_dimension__add("pid", &sched->cmp_pid);
}
-static const char *record_args[] = {
- "record",
- "-a",
- "-R",
- "-M",
- "-f",
- "-m", "1024",
- "-c", "1",
- "-e", "sched:sched_switch:r",
- "-e", "sched:sched_stat_wait:r",
- "-e", "sched:sched_stat_sleep:r",
- "-e", "sched:sched_stat_iowait:r",
- "-e", "sched:sched_stat_runtime:r",
- "-e", "sched:sched_process_exit:r",
- "-e", "sched:sched_process_fork:r",
- "-e", "sched:sched_wakeup:r",
- "-e", "sched:sched_migrate_task:r",
-};
-
static int __cmd_record(int argc, const char **argv)
{
unsigned int rec_argc, i, j;
const char **rec_argv;
+ const char * const record_args[] = {
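+ /* system-wide (-a), raw-trace (-R) record of the sched tracepoints the analysis sub-commands consume */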
+ "record",
+ "-a",
+ "-R",
+ "-m", "1024",
+ "-c", "1",
+ "-e", "sched:sched_switch",
+ "-e", "sched:sched_stat_wait",
+ "-e", "sched:sched_stat_sleep",
+ "-e", "sched:sched_stat_iowait",
+ "-e", "sched:sched_stat_runtime",
+ "-e", "sched:sched_process_fork",
+ "-e", "sched:sched_wakeup",
+ "-e", "sched:sched_wakeup_new",
+ "-e", "sched:sched_migrate_task",
+ };
rec_argc = ARRAY_SIZE(record_args) + argc - 1;
rec_argv = calloc(rec_argc + 1, sizeof(char *));
+ if (rec_argv == NULL)
+ return -ENOMEM;
+
for (i = 0; i < ARRAY_SIZE(record_args); i++)
rec_argv[i] = strdup(record_args[i]);
@@ -1884,43 +1651,124 @@ static int __cmd_record(int argc, const char **argv)
return cmd_record(i, rec_argv, NULL);
}
-int cmd_sched(int argc, const char **argv, const char *prefix __used)
-{
- argc = parse_options(argc, argv, sched_options, sched_usage,
- PARSE_OPT_STOP_AT_NON_OPTION);
+int cmd_sched(int argc, const char **argv, const char *prefix __maybe_unused)
+{
+ const char default_sort_order[] = "avg, max, switch, runtime";
+ struct perf_sched sched = {
+ .tool = {
+ .sample = perf_sched__process_tracepoint_sample,
+ .comm = perf_event__process_comm,
+ .lost = perf_event__process_lost,
+ .fork = perf_sched__process_fork_event,
+ .ordered_samples = true,
+ },
+ .cmp_pid = LIST_HEAD_INIT(sched.cmp_pid),
+ .sort_list = LIST_HEAD_INIT(sched.sort_list),
+ .start_work_mutex = PTHREAD_MUTEX_INITIALIZER,
+ .work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER,
+ .sort_order = default_sort_order,
+ .replay_repeat = 10,
+ .profile_cpu = -1,
+ .next_shortname1 = 'A',
+ .next_shortname2 = '0',
+ };
+ const struct option latency_options[] = {
+ OPT_STRING('s', "sort", &sched.sort_order, "key[,key2...]",
+ "sort by key(s): runtime, switch, avg, max"),
+ OPT_INCR('v', "verbose", &verbose,
+ "be more verbose (show symbol address, etc)"),
+ OPT_INTEGER('C', "CPU", &sched.profile_cpu,
+ "CPU to profile on"),
+ OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
+ "dump raw trace in ASCII"),
+ OPT_END()
+ };
+ const struct option replay_options[] = {
+ OPT_UINTEGER('r', "repeat", &sched.replay_repeat,
+ "repeat the workload replay N times (-1: infinite)"),
+ OPT_INCR('v', "verbose", &verbose,
+ "be more verbose (show symbol address, etc)"),
+ OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
+ "dump raw trace in ASCII"),
+ OPT_END()
+ };
+ const struct option sched_options[] = {
+ OPT_STRING('i', "input", &input_name, "file",
+ "input file name"),
+ OPT_INCR('v', "verbose", &verbose,
+ "be more verbose (show symbol address, etc)"),
+ OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
+ "dump raw trace in ASCII"),
+ OPT_END()
+ };
+ const char * const latency_usage[] = {
+ "perf sched latency [<options>]",
+ NULL
+ };
+ const char * const replay_usage[] = {
+ "perf sched replay [<options>]",
+ NULL
+ };
+ const char *const sched_subcommands[] = { "record", "latency", "map",
+ "replay", "script", NULL };
+ const char *sched_usage[] = {
+ NULL,
+ NULL
+ };
+ struct trace_sched_handler lat_ops = {
+ .wakeup_event = latency_wakeup_event,
+ .switch_event = latency_switch_event,
+ .runtime_event = latency_runtime_event,
+ .migrate_task_event = latency_migrate_task_event,
+ };
+ struct trace_sched_handler map_ops = {
+ .switch_event = map_switch_event,
+ };
+ struct trace_sched_handler replay_ops = {
+ .wakeup_event = replay_wakeup_event,
+ .switch_event = replay_switch_event,
+ .fork_event = replay_fork_event,
+ };
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(sched.curr_pid); i++)
+ sched.curr_pid[i] = -1;
+
+ argc = parse_options_subcommand(argc, argv, sched_options, sched_subcommands,
+ sched_usage, PARSE_OPT_STOP_AT_NON_OPTION);
if (!argc)
usage_with_options(sched_usage, sched_options);
/*
- * Aliased to 'perf trace' for now:
+ * Aliased to 'perf script' for now:
*/
- if (!strcmp(argv[0], "trace"))
- return cmd_trace(argc, argv, prefix);
+ if (!strcmp(argv[0], "script"))
+ return cmd_script(argc, argv, prefix);
symbol__init();
if (!strncmp(argv[0], "rec", 3)) {
return __cmd_record(argc, argv);
} else if (!strncmp(argv[0], "lat", 3)) {
- trace_handler = &lat_ops;
+ sched.tp_handler = &lat_ops;
if (argc > 1) {
argc = parse_options(argc, argv, latency_options, latency_usage, 0);
if (argc)
usage_with_options(latency_usage, latency_options);
}
- setup_sorting();
- __cmd_lat();
+ setup_sorting(&sched, latency_options, latency_usage);
+ return perf_sched__lat(&sched);
} else if (!strcmp(argv[0], "map")) {
- trace_handler = &map_ops;
- setup_sorting();
- __cmd_map();
+ sched.tp_handler = &map_ops;
+ setup_sorting(&sched, latency_options, latency_usage);
+ return perf_sched__map(&sched);
} else if (!strncmp(argv[0], "rep", 3)) {
- trace_handler = &replay_ops;
+ sched.tp_handler = &replay_ops;
if (argc) {
argc = parse_options(argc, argv, replay_options, replay_usage, 0);
if (argc)
usage_with_options(replay_usage, replay_options);
}
- __cmd_replay();
+ return perf_sched__replay(&sched);
} else {
usage_with_options(sched_usage, sched_options);
}
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
new file mode 100644
index 00000000000..9e9c91f5b7f
--- /dev/null
+++ b/tools/perf/builtin-script.c
@@ -0,0 +1,1826 @@
+#include "builtin.h"
+
+#include "perf.h"
+#include "util/cache.h"
+#include "util/debug.h"
+#include "util/exec_cmd.h"
+#include "util/header.h"
+#include "util/parse-options.h"
+#include "util/session.h"
+#include "util/tool.h"
+#include "util/symbol.h"
+#include "util/thread.h"
+#include "util/trace-event.h"
+#include "util/util.h"
+#include "util/evlist.h"
+#include "util/evsel.h"
+#include "util/sort.h"
+#include "util/data.h"
+#include <linux/bitmap.h>
+
+static char const *script_name;
+static char const *generate_script_lang;
+static bool debug_mode;
+static u64 last_timestamp;
+static u64 nr_unordered;
+extern const struct option record_options[];
+static bool no_callchain;
+static bool latency_format;
+static bool system_wide;
+static const char *cpu_list;
+static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
+
+enum perf_output_field {
+ PERF_OUTPUT_COMM = 1U << 0,
+ PERF_OUTPUT_TID = 1U << 1,
+ PERF_OUTPUT_PID = 1U << 2,
+ PERF_OUTPUT_TIME = 1U << 3,
+ PERF_OUTPUT_CPU = 1U << 4,
+ PERF_OUTPUT_EVNAME = 1U << 5,
+ PERF_OUTPUT_TRACE = 1U << 6,
+ PERF_OUTPUT_IP = 1U << 7,
+ PERF_OUTPUT_SYM = 1U << 8,
+ PERF_OUTPUT_DSO = 1U << 9,
+ PERF_OUTPUT_ADDR = 1U << 10,
+ PERF_OUTPUT_SYMOFFSET = 1U << 11,
+ PERF_OUTPUT_SRCLINE = 1U << 12,
+};
+
+struct output_option {
+ const char *str;
+ enum perf_output_field field;
+} all_output_options[] = {
+ {.str = "comm", .field = PERF_OUTPUT_COMM},
+ {.str = "tid", .field = PERF_OUTPUT_TID},
+ {.str = "pid", .field = PERF_OUTPUT_PID},
+ {.str = "time", .field = PERF_OUTPUT_TIME},
+ {.str = "cpu", .field = PERF_OUTPUT_CPU},
+ {.str = "event", .field = PERF_OUTPUT_EVNAME},
+ {.str = "trace", .field = PERF_OUTPUT_TRACE},
+ {.str = "ip", .field = PERF_OUTPUT_IP},
+ {.str = "sym", .field = PERF_OUTPUT_SYM},
+ {.str = "dso", .field = PERF_OUTPUT_DSO},
+ {.str = "addr", .field = PERF_OUTPUT_ADDR},
+ {.str = "symoff", .field = PERF_OUTPUT_SYMOFFSET},
+ {.str = "srcline", .field = PERF_OUTPUT_SRCLINE},
+};
+
+/* default set to maintain compatibility with current format */
+static struct {
+ bool user_set;
+ bool wildcard_set;
+ unsigned int print_ip_opts;
+ u64 fields;
+ u64 invalid_fields;
+} output[PERF_TYPE_MAX] = {
+
+ [PERF_TYPE_HARDWARE] = {
+ .user_set = false,
+
+ .fields = PERF_OUTPUT_COMM | PERF_OUTPUT_TID |
+ PERF_OUTPUT_CPU | PERF_OUTPUT_TIME |
+ PERF_OUTPUT_EVNAME | PERF_OUTPUT_IP |
+ PERF_OUTPUT_SYM | PERF_OUTPUT_DSO,
+
+ .invalid_fields = PERF_OUTPUT_TRACE,
+ },
+
+ [PERF_TYPE_SOFTWARE] = {
+ .user_set = false,
+
+ .fields = PERF_OUTPUT_COMM | PERF_OUTPUT_TID |
+ PERF_OUTPUT_CPU | PERF_OUTPUT_TIME |
+ PERF_OUTPUT_EVNAME | PERF_OUTPUT_IP |
+ PERF_OUTPUT_SYM | PERF_OUTPUT_DSO,
+
+ .invalid_fields = PERF_OUTPUT_TRACE,
+ },
+
+ [PERF_TYPE_TRACEPOINT] = {
+ .user_set = false,
+
+ .fields = PERF_OUTPUT_COMM | PERF_OUTPUT_TID |
+ PERF_OUTPUT_CPU | PERF_OUTPUT_TIME |
+ PERF_OUTPUT_EVNAME | PERF_OUTPUT_TRACE,
+ },
+
+ [PERF_TYPE_RAW] = {
+ .user_set = false,
+
+ .fields = PERF_OUTPUT_COMM | PERF_OUTPUT_TID |
+ PERF_OUTPUT_CPU | PERF_OUTPUT_TIME |
+ PERF_OUTPUT_EVNAME | PERF_OUTPUT_IP |
+ PERF_OUTPUT_SYM | PERF_OUTPUT_DSO,
+
+ .invalid_fields = PERF_OUTPUT_TRACE,
+ },
+};
+
+static bool output_set_by_user(void)
+{
+ int j;
+ for (j = 0; j < PERF_TYPE_MAX; ++j) {
+ if (output[j].user_set)
+ return true;
+ }
+ return false;
+}
+
+static const char *output_field2str(enum perf_output_field field)
+{
+ int i, imax = ARRAY_SIZE(all_output_options);
+ const char *str = "";
+
+ for (i = 0; i < imax; ++i) {
+ if (all_output_options[i].field == field) {
+ str = all_output_options[i].str;
+ break;
+ }
+ }
+ return str;
+}
+
+#define PRINT_FIELD(x) (output[attr->type].fields & PERF_OUTPUT_##x)
+
+static int perf_evsel__check_stype(struct perf_evsel *evsel,
+ u64 sample_type, const char *sample_msg,
+ enum perf_output_field field)
+{
+ struct perf_event_attr *attr = &evsel->attr;
+ int type = attr->type;
+ const char *evname;
+
+ if (attr->sample_type & sample_type)
+ return 0;
+
+ if (output[type].user_set) {
+ evname = perf_evsel__name(evsel);
+ pr_err("Samples for '%s' event do not have %s attribute set. "
+ "Cannot print '%s' field.\n",
+ evname, sample_msg, output_field2str(field));
+ return -1;
+ }
+
+ /* user did not ask for it explicitly so remove from the default list */
+ output[type].fields &= ~field;
+ evname = perf_evsel__name(evsel);
+ pr_debug("Samples for '%s' event do not have %s attribute set. "
+ "Skipping '%s' field.\n",
+ evname, sample_msg, output_field2str(field));
+
+ return 0;
+}
+
+static int perf_evsel__check_attr(struct perf_evsel *evsel,
+ struct perf_session *session)
+{
+ struct perf_event_attr *attr = &evsel->attr;
+
+ if (PRINT_FIELD(TRACE) &&
+ !perf_session__has_traces(session, "record -R"))
+ return -EINVAL;
+
+ if (PRINT_FIELD(IP)) {
+ if (perf_evsel__check_stype(evsel, PERF_SAMPLE_IP, "IP",
+ PERF_OUTPUT_IP))
+ return -EINVAL;
+
+ if (!no_callchain &&
+ !(attr->sample_type & PERF_SAMPLE_CALLCHAIN))
+ symbol_conf.use_callchain = false;
+ }
+
+ if (PRINT_FIELD(ADDR) &&
+ perf_evsel__check_stype(evsel, PERF_SAMPLE_ADDR, "ADDR",
+ PERF_OUTPUT_ADDR))
+ return -EINVAL;
+
+ if (PRINT_FIELD(SYM) && !PRINT_FIELD(IP) && !PRINT_FIELD(ADDR)) {
+ pr_err("Display of symbols requested but neither sample IP nor "
+ "sample address\nis selected. Hence, no addresses to convert "
+ "to symbols.\n");
+ return -EINVAL;
+ }
+ if (PRINT_FIELD(SYMOFFSET) && !PRINT_FIELD(SYM)) {
+ pr_err("Display of offsets requested but symbol is not"
+ "selected.\n");
+ return -EINVAL;
+ }
+ if (PRINT_FIELD(DSO) && !PRINT_FIELD(IP) && !PRINT_FIELD(ADDR)) {
+ pr_err("Display of DSO requested but neither sample IP nor "
+ "sample address\nis selected. Hence, no addresses to convert "
+ "to DSO.\n");
+ return -EINVAL;
+ }
+ if (PRINT_FIELD(SRCLINE) && !PRINT_FIELD(IP)) {
+ pr_err("Display of source line number requested but sample IP is not\n"
+ "selected. Hence, no address to lookup the source line number.\n");
+ return -EINVAL;
+ }
+
+ if ((PRINT_FIELD(PID) || PRINT_FIELD(TID)) &&
+ perf_evsel__check_stype(evsel, PERF_SAMPLE_TID, "TID",
+ PERF_OUTPUT_TID|PERF_OUTPUT_PID))
+ return -EINVAL;
+
+ if (PRINT_FIELD(TIME) &&
+ perf_evsel__check_stype(evsel, PERF_SAMPLE_TIME, "TIME",
+ PERF_OUTPUT_TIME))
+ return -EINVAL;
+
+ if (PRINT_FIELD(CPU) &&
+ perf_evsel__check_stype(evsel, PERF_SAMPLE_CPU, "CPU",
+ PERF_OUTPUT_CPU))
+ return -EINVAL;
+
+ return 0;
+}
+
+static void set_print_ip_opts(struct perf_event_attr *attr)
+{
+ unsigned int type = attr->type;
+
+ output[type].print_ip_opts = 0;
+ if (PRINT_FIELD(IP))
+ output[type].print_ip_opts |= PRINT_IP_OPT_IP;
+
+ if (PRINT_FIELD(SYM))
+ output[type].print_ip_opts |= PRINT_IP_OPT_SYM;
+
+ if (PRINT_FIELD(DSO))
+ output[type].print_ip_opts |= PRINT_IP_OPT_DSO;
+
+ if (PRINT_FIELD(SYMOFFSET))
+ output[type].print_ip_opts |= PRINT_IP_OPT_SYMOFFSET;
+
+ if (PRINT_FIELD(SRCLINE))
+ output[type].print_ip_opts |= PRINT_IP_OPT_SRCLINE;
+}
+
+/*
+ * verify all user requested events exist and the samples
+ * have the expected data
+ */
+static int perf_session__check_output_opt(struct perf_session *session)
+{
+ int j;
+ struct perf_evsel *evsel;
+
+ for (j = 0; j < PERF_TYPE_MAX; ++j) {
+ evsel = perf_session__find_first_evtype(session, j);
+
+ /*
+ * even if fields is set to 0 (ie., show nothing) event must
+ * exist if user explicitly includes it on the command line
+ */
+ if (!evsel && output[j].user_set && !output[j].wildcard_set) {
+ pr_err("%s events do not exist. "
+ "Remove corresponding -f option to proceed.\n",
+ event_type(j));
+ return -1;
+ }
+
+ if (evsel && output[j].fields &&
+ perf_evsel__check_attr(evsel, session))
+ return -1;
+
+ if (evsel == NULL)
+ continue;
+
+ set_print_ip_opts(&evsel->attr);
+ }
+
+ /*
+ * set default for tracepoints to print symbols only
+ * if callchains are present
+ */
+ if (symbol_conf.use_callchain &&
+ !output[PERF_TYPE_TRACEPOINT].user_set) {
+ struct perf_event_attr *attr;
+
+ j = PERF_TYPE_TRACEPOINT;
+ evsel = perf_session__find_first_evtype(session, j);
+ if (evsel == NULL)
+ goto out;
+
+ attr = &evsel->attr;
+
+ if (attr->sample_type & PERF_SAMPLE_CALLCHAIN) {
+ output[j].fields |= PERF_OUTPUT_IP;
+ output[j].fields |= PERF_OUTPUT_SYM;
+ output[j].fields |= PERF_OUTPUT_DSO;
+ set_print_ip_opts(attr);
+ }
+ }
+
+out:
+ return 0;
+}
+
+static void print_sample_start(struct perf_sample *sample,
+ struct thread *thread,
+ struct perf_evsel *evsel)
+{
+ struct perf_event_attr *attr = &evsel->attr;
+ unsigned long secs;
+ unsigned long usecs;
+ unsigned long long nsecs;
+
+ if (PRINT_FIELD(COMM)) {
+ if (latency_format)
+ printf("%8.8s ", thread__comm_str(thread));
+ else if (PRINT_FIELD(IP) && symbol_conf.use_callchain)
+ printf("%s ", thread__comm_str(thread));
+ else
+ printf("%16s ", thread__comm_str(thread));
+ }
+
+ if (PRINT_FIELD(PID) && PRINT_FIELD(TID))
+ printf("%5d/%-5d ", sample->pid, sample->tid);
+ else if (PRINT_FIELD(PID))
+ printf("%5d ", sample->pid);
+ else if (PRINT_FIELD(TID))
+ printf("%5d ", sample->tid);
+
+ if (PRINT_FIELD(CPU)) {
+ if (latency_format)
+ printf("%3d ", sample->cpu);
+ else
+ printf("[%03d] ", sample->cpu);
+ }
+
+ if (PRINT_FIELD(TIME)) {
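+ /* print the timestamp as seconds.microseconds */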
+ nsecs = sample->time;
+ secs = nsecs / NSECS_PER_SEC;
+ nsecs -= secs * NSECS_PER_SEC;
+ usecs = nsecs / NSECS_PER_USEC;
+ printf("%5lu.%06lu: ", secs, usecs);
+ }
+}
+
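+/* hardware branch-instruction events sampled with period 1 are treated as BTS-style branch records */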
+static bool is_bts_event(struct perf_event_attr *attr)
+{
+ return ((attr->type == PERF_TYPE_HARDWARE) &&
+ (attr->config & PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
+ (attr->sample_period == 1));
+}
+
+static bool sample_addr_correlates_sym(struct perf_event_attr *attr)
+{
+ if ((attr->type == PERF_TYPE_SOFTWARE) &&
+ ((attr->config == PERF_COUNT_SW_PAGE_FAULTS) ||
+ (attr->config == PERF_COUNT_SW_PAGE_FAULTS_MIN) ||
+ (attr->config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)))
+ return true;
+
+ if (is_bts_event(attr))
+ return true;
+
+ return false;
+}
+
+static void print_sample_addr(union perf_event *event,
+ struct perf_sample *sample,
+ struct machine *machine,
+ struct thread *thread,
+ struct perf_event_attr *attr)
+{
+ struct addr_location al;
+ u8 cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
+
+ printf("%16" PRIx64, sample->addr);
+
+ if (!sample_addr_correlates_sym(attr))
+ return;
+
+ thread__find_addr_map(thread, machine, cpumode, MAP__FUNCTION,
+ sample->addr, &al);
+ if (!al.map)
+ thread__find_addr_map(thread, machine, cpumode, MAP__VARIABLE,
+ sample->addr, &al);
+
+ al.cpu = sample->cpu;
+ al.sym = NULL;
+
+ if (al.map)
+ al.sym = map__find_symbol(al.map, al.addr, NULL);
+
+ if (PRINT_FIELD(SYM)) {
+ printf(" ");
+ if (PRINT_FIELD(SYMOFFSET))
+ symbol__fprintf_symname_offs(al.sym, &al, stdout);
+ else
+ symbol__fprintf_symname(al.sym, stdout);
+ }
+
+ if (PRINT_FIELD(DSO)) {
+ printf(" (");
+ map__fprintf_dsoname(al.map, stdout);
+ printf(")");
+ }
+}
+
+static void print_sample_bts(union perf_event *event,
+ struct perf_sample *sample,
+ struct perf_evsel *evsel,
+ struct thread *thread,
+ struct addr_location *al)
+{
+ struct perf_event_attr *attr = &evsel->attr;
+
+ /* print branch_from information */
+ if (PRINT_FIELD(IP)) {
+ if (!symbol_conf.use_callchain)
+ printf(" ");
+ else
+ printf("\n");
+ perf_evsel__print_ip(evsel, sample, al,
+ output[attr->type].print_ip_opts,
+ PERF_MAX_STACK_DEPTH);
+ }
+
+ printf(" => ");
+
+ /* print branch_to information */
+ if (PRINT_FIELD(ADDR) ||
+ ((evsel->attr.sample_type & PERF_SAMPLE_ADDR) &&
+ !output[attr->type].user_set))
+ print_sample_addr(event, sample, al->machine, thread, attr);
+
+ printf("\n");
+}
+
+static void process_event(union perf_event *event, struct perf_sample *sample,
+ struct perf_evsel *evsel, struct thread *thread,
+ struct addr_location *al)
+{
+ struct perf_event_attr *attr = &evsel->attr;
+
+ if (output[attr->type].fields == 0)
+ return;
+
+ print_sample_start(sample, thread, evsel);
+
+ if (PRINT_FIELD(EVNAME)) {
+ const char *evname = perf_evsel__name(evsel);
+ printf("%s: ", evname ? evname : "[unknown]");
+ }
+
+ if (is_bts_event(attr)) {
+ print_sample_bts(event, sample, evsel, thread, al);
+ return;
+ }
+
+ if (PRINT_FIELD(TRACE))
+ event_format__print(evsel->tp_format, sample->cpu,
+ sample->raw_data, sample->raw_size);
+ if (PRINT_FIELD(ADDR))
+ print_sample_addr(event, sample, al->machine, thread, attr);
+
+ if (PRINT_FIELD(IP)) {
+ if (!symbol_conf.use_callchain)
+ printf(" ");
+ else
+ printf("\n");
+
+ perf_evsel__print_ip(evsel, sample, al,
+ output[attr->type].print_ip_opts,
+ PERF_MAX_STACK_DEPTH);
+ }
+
+ printf("\n");
+}
+
+static int default_start_script(const char *script __maybe_unused,
+ int argc __maybe_unused,
+ const char **argv __maybe_unused)
+{
+ return 0;
+}
+
+static int default_stop_script(void)
+{
+ return 0;
+}
+
+static int default_generate_script(struct pevent *pevent __maybe_unused,
+ const char *outfile __maybe_unused)
+{
+ return 0;
+}
+
+static struct scripting_ops default_scripting_ops = {
+ .start_script = default_start_script,
+ .stop_script = default_stop_script,
+ .process_event = process_event,
+ .generate_script = default_generate_script,
+};
+
+static struct scripting_ops *scripting_ops;
+
+static void setup_scripting(void)
+{
+ setup_perl_scripting();
+ setup_python_scripting();
+
+ scripting_ops = &default_scripting_ops;
+}
+
+static int cleanup_scripting(void)
+{
+ pr_debug("\nperf script stopped\n");
+
+ return scripting_ops->stop_script();
+}
+
+static int process_sample_event(struct perf_tool *tool __maybe_unused,
+ union perf_event *event,
+ struct perf_sample *sample,
+ struct perf_evsel *evsel,
+ struct machine *machine)
+{
+ struct addr_location al;
+ struct thread *thread = machine__findnew_thread(machine, sample->pid,
+ sample->tid);
+
+ if (thread == NULL) {
+ pr_debug("problem processing %d event, skipping it.\n",
+ event->header.type);
+ return -1;
+ }
+
+ if (debug_mode) {
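+ /* in debug mode only check timestamp ordering; the sample itself is not printed */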
+ if (sample->time < last_timestamp) {
+ pr_err("Samples misordered, previous: %" PRIu64
+ " this: %" PRIu64 "\n", last_timestamp,
+ sample->time);
+ nr_unordered++;
+ }
+ last_timestamp = sample->time;
+ return 0;
+ }
+
+ if (perf_event__preprocess_sample(event, machine, &al, sample) < 0) {
+ pr_err("problem processing %d event, skipping it.\n",
+ event->header.type);
+ return -1;
+ }
+
+ if (al.filtered)
+ return 0;
+
+ if (cpu_list && !test_bit(sample->cpu, cpu_bitmap))
+ return 0;
+
+ scripting_ops->process_event(event, sample, evsel, thread, &al);
+
+ evsel->hists.stats.total_period += sample->period;
+ return 0;
+}
+
+struct perf_script {
+ struct perf_tool tool;
+ struct perf_session *session;
+ bool show_task_events;
+ bool show_mmap_events;
+};
+
+static int process_attr(struct perf_tool *tool, union perf_event *event,
+ struct perf_evlist **pevlist)
+{
+ struct perf_script *scr = container_of(tool, struct perf_script, tool);
+ struct perf_evlist *evlist;
+ struct perf_evsel *evsel, *pos;
+ int err;
+
+ err = perf_event__process_attr(tool, event, pevlist);
+ if (err)
+ return err;
+
+ evlist = *pevlist;
+ evsel = perf_evlist__last(*pevlist);
+
+ if (evsel->attr.type >= PERF_TYPE_MAX)
+ return 0;
+
+ evlist__for_each(evlist, pos) {
+ if (pos->attr.type == evsel->attr.type && pos != evsel)
+ return 0;
+ }
+
+ set_print_ip_opts(&evsel->attr);
+
+ return perf_evsel__check_attr(evsel, scr->session);
+}
+
+static int process_comm_event(struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_sample *sample,
+ struct machine *machine)
+{
+ struct thread *thread;
+ struct perf_script *script = container_of(tool, struct perf_script, tool);
+ struct perf_session *session = script->session;
+ struct perf_evsel *evsel = perf_evlist__first(session->evlist);
+ int ret = -1;
+
+ thread = machine__findnew_thread(machine, event->comm.pid, event->comm.tid);
+ if (thread == NULL) {
+ pr_debug("problem processing COMM event, skipping it.\n");
+ return -1;
+ }
+
+ if (perf_event__process_comm(tool, event, sample, machine) < 0)
+ goto out;
+
+ if (!evsel->attr.sample_id_all) {
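+ /* no sample_id_all info in the event stream, so fill in minimal sample fields from the COMM event */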
+ sample->cpu = 0;
+ sample->time = 0;
+ sample->tid = event->comm.tid;
+ sample->pid = event->comm.pid;
+ }
+ print_sample_start(sample, thread, evsel);
+ perf_event__fprintf(event, stdout);
+ ret = 0;
+
+out:
+ return ret;
+}
+
+static int process_fork_event(struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_sample *sample,
+ struct machine *machine)
+{
+ struct thread *thread;
+ struct perf_script *script = container_of(tool, struct perf_script, tool);
+ struct perf_session *session = script->session;
+ struct perf_evsel *evsel = perf_evlist__first(session->evlist);
+
+ if (perf_event__process_fork(tool, event, sample, machine) < 0)
+ return -1;
+
+ thread = machine__findnew_thread(machine, event->fork.pid, event->fork.tid);
+ if (thread == NULL) {
+ pr_debug("problem processing FORK event, skipping it.\n");
+ return -1;
+ }
+
+ if (!evsel->attr.sample_id_all) {
+ sample->cpu = 0;
+ sample->time = event->fork.time;
+ sample->tid = event->fork.tid;
+ sample->pid = event->fork.pid;
+ }
+ print_sample_start(sample, thread, evsel);
+ perf_event__fprintf(event, stdout);
+
+ return 0;
+}
+static int process_exit_event(struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_sample *sample,
+ struct machine *machine)
+{
+ struct thread *thread;
+ struct perf_script *script = container_of(tool, struct perf_script, tool);
+ struct perf_session *session = script->session;
+ struct perf_evsel *evsel = perf_evlist__first(session->evlist);
+
+ thread = machine__findnew_thread(machine, event->fork.pid, event->fork.tid);
+ if (thread == NULL) {
+ pr_debug("problem processing EXIT event, skipping it.\n");
+ return -1;
+ }
+
+ if (!evsel->attr.sample_id_all) {
+ sample->cpu = 0;
+ sample->time = 0;
+ sample->tid = event->comm.tid;
+ sample->pid = event->comm.pid;
+ }
+ print_sample_start(sample, thread, evsel);
+ perf_event__fprintf(event, stdout);
+
+ if (perf_event__process_exit(tool, event, sample, machine) < 0)
+ return -1;
+
+ return 0;
+}
+
+static int process_mmap_event(struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_sample *sample,
+ struct machine *machine)
+{
+ struct thread *thread;
+ struct perf_script *script = container_of(tool, struct perf_script, tool);
+ struct perf_session *session = script->session;
+ struct perf_evsel *evsel = perf_evlist__first(session->evlist);
+
+ if (perf_event__process_mmap(tool, event, sample, machine) < 0)
+ return -1;
+
+ thread = machine__findnew_thread(machine, event->mmap.pid, event->mmap.tid);
+ if (thread == NULL) {
+ pr_debug("problem processing MMAP event, skipping it.\n");
+ return -1;
+ }
+
+ if (!evsel->attr.sample_id_all) {
+ sample->cpu = 0;
+ sample->time = 0;
+ sample->tid = event->mmap.tid;
+ sample->pid = event->mmap.pid;
+ }
+ print_sample_start(sample, thread, evsel);
+ perf_event__fprintf(event, stdout);
+
+ return 0;
+}
+
+static int process_mmap2_event(struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_sample *sample,
+ struct machine *machine)
+{
+ struct thread *thread;
+ struct perf_script *script = container_of(tool, struct perf_script, tool);
+ struct perf_session *session = script->session;
+ struct perf_evsel *evsel = perf_evlist__first(session->evlist);
+
+ if (perf_event__process_mmap2(tool, event, sample, machine) < 0)
+ return -1;
+
+ thread = machine__findnew_thread(machine, event->mmap2.pid, event->mmap2.tid);
+ if (thread == NULL) {
+ pr_debug("problem processing MMAP2 event, skipping it.\n");
+ return -1;
+ }
+
+ if (!evsel->attr.sample_id_all) {
+ sample->cpu = 0;
+ sample->time = 0;
+ sample->tid = event->mmap2.tid;
+ sample->pid = event->mmap2.pid;
+ }
+ print_sample_start(sample, thread, evsel);
+ perf_event__fprintf(event, stdout);
+
+ return 0;
+}
+
+static void sig_handler(int sig __maybe_unused)
+{
+ session_done = 1;
+}
+
+static int __cmd_script(struct perf_script *script)
+{
+ int ret;
+
+ signal(SIGINT, sig_handler);
+
+ /* override event processing functions */
+ if (script->show_task_events) {
+ script->tool.comm = process_comm_event;
+ script->tool.fork = process_fork_event;
+ script->tool.exit = process_exit_event;
+ }
+ if (script->show_mmap_events) {
+ script->tool.mmap = process_mmap_event;
+ script->tool.mmap2 = process_mmap2_event;
+ }
+
+ ret = perf_session__process_events(script->session, &script->tool);
+
+ if (debug_mode)
+ pr_err("Misordered timestamps: %" PRIu64 "\n", nr_unordered);
+
+ return ret;
+}
+
+struct script_spec {
+ struct list_head node;
+ struct scripting_ops *ops;
+ char spec[0];
+};
+
+static LIST_HEAD(script_specs);
+
+static struct script_spec *script_spec__new(const char *spec,
+ struct scripting_ops *ops)
+{
+ struct script_spec *s = malloc(sizeof(*s) + strlen(spec) + 1);
+
+ if (s != NULL) {
+ strcpy(s->spec, spec);
+ s->ops = ops;
+ }
+
+ return s;
+}
+
+static void script_spec__add(struct script_spec *s)
+{
+ list_add_tail(&s->node, &script_specs);
+}
+
+static struct script_spec *script_spec__find(const char *spec)
+{
+ struct script_spec *s;
+
+ list_for_each_entry(s, &script_specs, node)
+ if (strcasecmp(s->spec, spec) == 0)
+ return s;
+ return NULL;
+}
+
+static struct script_spec *script_spec__findnew(const char *spec,
+ struct scripting_ops *ops)
+{
+ struct script_spec *s = script_spec__find(spec);
+
+ if (s)
+ return s;
+
+ s = script_spec__new(spec, ops);
+ if (!s)
+ return NULL;
+
+ script_spec__add(s);
+
+ return s;
+}
+
+int script_spec_register(const char *spec, struct scripting_ops *ops)
+{
+ struct script_spec *s;
+
+ s = script_spec__find(spec);
+ if (s)
+ return -1;
+
+ s = script_spec__findnew(spec, ops);
+ if (!s)
+ return -1;
+
+ return 0;
+}
+
+static struct scripting_ops *script_spec__lookup(const char *spec)
+{
+ struct script_spec *s = script_spec__find(spec);
+ if (!s)
+ return NULL;
+
+ return s->ops;
+}
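As a hedged sketch of how this registry is meant to be used (the concrete wiring lives in the scripting-engine code, not in this file), a language backend would register its specs roughly like this; my_lang_ops and setup_my_lang() are illustrative names only:

	static struct scripting_ops my_lang_ops = {
		.name = "MyLang",
		/* .start_script, .generate_script, ... */
	};

	static void setup_my_lang(void)
	{
		/* script_spec_register() returns -1 if the spec is already taken */
		if (script_spec_register("MyLang", &my_lang_ops) ||
		    script_spec_register("ml", &my_lang_ops))
			pr_err("MyLang script spec already registered\n");
	}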
+
+static void list_available_languages(void)
+{
+ struct script_spec *s;
+
+ fprintf(stderr, "\n");
+ fprintf(stderr, "Scripting language extensions (used in "
+ "perf script -s [spec:]script.[spec]):\n\n");
+
+ list_for_each_entry(s, &script_specs, node)
+ fprintf(stderr, " %-42s [%s]\n", s->spec, s->ops->name);
+
+ fprintf(stderr, "\n");
+}
+
+static int parse_scriptname(const struct option *opt __maybe_unused,
+ const char *str, int unset __maybe_unused)
+{
+ char spec[PATH_MAX];
+ const char *script, *ext;
+ int len;
+
+ if (strcmp(str, "lang") == 0) {
+ list_available_languages();
+ exit(0);
+ }
+
+ script = strchr(str, ':');
+ if (script) {
+ len = script - str;
+ if (len >= PATH_MAX) {
+ fprintf(stderr, "invalid language specifier");
+ return -1;
+ }
+ strncpy(spec, str, len);
+ spec[len] = '\0';
+ scripting_ops = script_spec__lookup(spec);
+ if (!scripting_ops) {
+ fprintf(stderr, "invalid language specifier");
+ return -1;
+ }
+ script++;
+ } else {
+ script = str;
+ ext = strrchr(script, '.');
+ if (!ext) {
+ fprintf(stderr, "invalid script extension");
+ return -1;
+ }
+ scripting_ops = script_spec__lookup(++ext);
+ if (!scripting_ops) {
+ fprintf(stderr, "invalid script extension");
+ return -1;
+ }
+ }
+
+ script_name = strdup(script);
+
+ return 0;
+}
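Put differently, both accepted forms funnel into script_spec__lookup(); an illustrative pair (script names are examples only):

	scripting_ops = script_spec__lookup("Python");	/* from "-s Python:syscall-counts"  */
	scripting_ops = script_spec__lookup("py");	/* from "-s syscall-counts.py"      */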
+
+static int parse_output_fields(const struct option *opt __maybe_unused,
+ const char *arg, int unset __maybe_unused)
+{
+ char *tok;
+ int i, imax = ARRAY_SIZE(all_output_options);
+ int j;
+ int rc = 0;
+ char *str = strdup(arg);
+ int type = -1;
+
+ if (!str)
+ return -ENOMEM;
+
+ /* The first word can state for which event type the user is specifying
+ * the fields. If no type is given, the specified fields apply to all
+ * event types found in the file, minus the fields invalid for each type.
+ */
+ tok = strchr(str, ':');
+ if (tok) {
+ *tok = '\0';
+ tok++;
+ if (!strcmp(str, "hw"))
+ type = PERF_TYPE_HARDWARE;
+ else if (!strcmp(str, "sw"))
+ type = PERF_TYPE_SOFTWARE;
+ else if (!strcmp(str, "trace"))
+ type = PERF_TYPE_TRACEPOINT;
+ else if (!strcmp(str, "raw"))
+ type = PERF_TYPE_RAW;
+ else {
+ fprintf(stderr, "Invalid event type in field string.\n");
+ rc = -EINVAL;
+ goto out;
+ }
+
+ if (output[type].user_set)
+ pr_warning("Overriding previous field request for %s events.\n",
+ event_type(type));
+
+ output[type].fields = 0;
+ output[type].user_set = true;
+ output[type].wildcard_set = false;
+
+ } else {
+ tok = str;
+ if (strlen(str) == 0) {
+ fprintf(stderr,
+ "Cannot set fields to 'none' for all event types.\n");
+ rc = -EINVAL;
+ goto out;
+ }
+
+ if (output_set_by_user())
+ pr_warning("Overriding previous field request for all events.\n");
+
+ for (j = 0; j < PERF_TYPE_MAX; ++j) {
+ output[j].fields = 0;
+ output[j].user_set = true;
+ output[j].wildcard_set = true;
+ }
+ }
+
+ tok = strtok(tok, ",");
+ while (tok) {
+ for (i = 0; i < imax; ++i) {
+ if (strcmp(tok, all_output_options[i].str) == 0)
+ break;
+ }
+ if (i == imax) {
+ fprintf(stderr, "Invalid field requested.\n");
+ rc = -EINVAL;
+ goto out;
+ }
+
+ if (type == -1) {
+ /* add the user option to all event types for
+ * which it is valid
+ */
+ for (j = 0; j < PERF_TYPE_MAX; ++j) {
+ if (output[j].invalid_fields & all_output_options[i].field) {
+ pr_warning("\'%s\' not valid for %s events. Ignoring.\n",
+ all_output_options[i].str, event_type(j));
+ } else
+ output[j].fields |= all_output_options[i].field;
+ }
+ } else {
+ if (output[type].invalid_fields & all_output_options[i].field) {
+ fprintf(stderr, "\'%s\' not valid for %s events.\n",
+ all_output_options[i].str, event_type(type));
+
+ rc = -EINVAL;
+ goto out;
+ }
+ output[type].fields |= all_output_options[i].field;
+ }
+
+ tok = strtok(NULL, ",");
+ }
+
+ if (type >= 0) {
+ if (output[type].fields == 0) {
+ pr_debug("No fields requested for %s type. "
+ "Events will not be displayed.\n", event_type(type));
+ }
+ }
+
+out:
+ free(str);
+ return rc;
+}
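For illustration, a sketch of the state this parser leaves behind for "-f sw:comm,cpu,time", assuming the PERF_OUTPUT_* field flags defined earlier in builtin-script.c:

	output[PERF_TYPE_SOFTWARE].user_set     = true;
	output[PERF_TYPE_SOFTWARE].wildcard_set = false;
	output[PERF_TYPE_SOFTWARE].fields       = PERF_OUTPUT_COMM |
						  PERF_OUTPUT_CPU  |
						  PERF_OUTPUT_TIME;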
+
+/* Helper function for filesystems that return a dent->d_type DT_UNKNOWN */
+static int is_directory(const char *base_path, const struct dirent *dent)
+{
+ char path[PATH_MAX];
+ struct stat st;
+
+ sprintf(path, "%s/%s", base_path, dent->d_name);
+ if (stat(path, &st))
+ return 0;
+
+ return S_ISDIR(st.st_mode);
+}
+
+#define for_each_lang(scripts_path, scripts_dir, lang_dirent, lang_next)\
+ while (!readdir_r(scripts_dir, &lang_dirent, &lang_next) && \
+ lang_next) \
+ if ((lang_dirent.d_type == DT_DIR || \
+ (lang_dirent.d_type == DT_UNKNOWN && \
+ is_directory(scripts_path, &lang_dirent))) && \
+ (strcmp(lang_dirent.d_name, ".")) && \
+ (strcmp(lang_dirent.d_name, "..")))
+
+#define for_each_script(lang_path, lang_dir, script_dirent, script_next)\
+ while (!readdir_r(lang_dir, &script_dirent, &script_next) && \
+ script_next) \
+ if (script_dirent.d_type != DT_DIR && \
+ (script_dirent.d_type != DT_UNKNOWN || \
+ !is_directory(lang_path, &script_dirent)))
+
+
+#define RECORD_SUFFIX "-record"
+#define REPORT_SUFFIX "-report"
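The iterators and suffixes above assume the installed scripts tree looks roughly like this (paths are illustrative, not taken from the patch):

	/*
	 * $(perf_exec_path())/scripts/
	 *	perl/bin/rwtop-record			rwtop-report
	 *	python/bin/syscall-counts-record	syscall-counts-report
	 */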
+
+struct script_desc {
+ struct list_head node;
+ char *name;
+ char *half_liner;
+ char *args;
+};
+
+static LIST_HEAD(script_descs);
+
+static struct script_desc *script_desc__new(const char *name)
+{
+ struct script_desc *s = zalloc(sizeof(*s));
+
+ if (s != NULL && name)
+ s->name = strdup(name);
+
+ return s;
+}
+
+static void script_desc__delete(struct script_desc *s)
+{
+ zfree(&s->name);
+ zfree(&s->half_liner);
+ zfree(&s->args);
+ free(s);
+}
+
+static void script_desc__add(struct script_desc *s)
+{
+ list_add_tail(&s->node, &script_descs);
+}
+
+static struct script_desc *script_desc__find(const char *name)
+{
+ struct script_desc *s;
+
+ list_for_each_entry(s, &script_descs, node)
+ if (strcasecmp(s->name, name) == 0)
+ return s;
+ return NULL;
+}
+
+static struct script_desc *script_desc__findnew(const char *name)
+{
+ struct script_desc *s = script_desc__find(name);
+
+ if (s)
+ return s;
+
+ s = script_desc__new(name);
+ if (!s)
+ goto out_delete_desc;
+
+ script_desc__add(s);
+
+ return s;
+
+out_delete_desc:
+ script_desc__delete(s);
+
+ return NULL;
+}
+
+static const char *ends_with(const char *str, const char *suffix)
+{
+ size_t suffix_len = strlen(suffix);
+ const char *p = str;
+
+ if (strlen(str) > suffix_len) {
+ p = str + strlen(str) - suffix_len;
+ if (!strncmp(p, suffix, suffix_len))
+ return p;
+ }
+
+ return NULL;
+}
+
+static int read_script_info(struct script_desc *desc, const char *filename)
+{
+ char line[BUFSIZ], *p;
+ FILE *fp;
+
+ fp = fopen(filename, "r");
+ if (!fp)
+ return -1;
+
+ while (fgets(line, sizeof(line), fp)) {
+ p = ltrim(line);
+ if (strlen(p) == 0)
+ continue;
+ if (*p != '#')
+ continue;
+ p++;
+ if (strlen(p) && *p == '!')
+ continue;
+
+ p = ltrim(p);
+ if (strlen(p) && p[strlen(p) - 1] == '\n')
+ p[strlen(p) - 1] = '\0';
+
+ if (!strncmp(p, "description:", strlen("description:"))) {
+ p += strlen("description:");
+ desc->half_liner = strdup(ltrim(p));
+ continue;
+ }
+
+ if (!strncmp(p, "args:", strlen("args:"))) {
+ p += strlen("args:");
+ desc->args = strdup(ltrim(p));
+ continue;
+ }
+ }
+
+ fclose(fp);
+
+ return 0;
+}
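The header format this parser expects is simply '#'-prefixed key lines near the top of the report script (the "#!" interpreter line is skipped); a hypothetical example:

	/*
	 * #!/usr/bin/env python
	 * # description: system-wide syscall counts
	 * # args: [comm]
	 */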
+
+static char *get_script_root(struct dirent *script_dirent, const char *suffix)
+{
+ char *script_root, *str;
+
+ script_root = strdup(script_dirent->d_name);
+ if (!script_root)
+ return NULL;
+
+ str = (char *)ends_with(script_root, suffix);
+ if (!str) {
+ free(script_root);
+ return NULL;
+ }
+
+ *str = '\0';
+ return script_root;
+}
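A worked example of the two helpers above:

	const char *p = ends_with("rwtop-report", REPORT_SUFFIX);	/* p points at "-report" */
	/* get_script_root() duplicates the d_name and truncates at p, yielding "rwtop" */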
+
+static int list_available_scripts(const struct option *opt __maybe_unused,
+ const char *s __maybe_unused,
+ int unset __maybe_unused)
+{
+ struct dirent *script_next, *lang_next, script_dirent, lang_dirent;
+ char scripts_path[MAXPATHLEN];
+ DIR *scripts_dir, *lang_dir;
+ char script_path[MAXPATHLEN];
+ char lang_path[MAXPATHLEN];
+ struct script_desc *desc;
+ char first_half[BUFSIZ];
+ char *script_root;
+
+ snprintf(scripts_path, MAXPATHLEN, "%s/scripts", perf_exec_path());
+
+ scripts_dir = opendir(scripts_path);
+ if (!scripts_dir)
+ return -1;
+
+ for_each_lang(scripts_path, scripts_dir, lang_dirent, lang_next) {
+ snprintf(lang_path, MAXPATHLEN, "%s/%s/bin", scripts_path,
+ lang_dirent.d_name);
+ lang_dir = opendir(lang_path);
+ if (!lang_dir)
+ continue;
+
+ for_each_script(lang_path, lang_dir, script_dirent, script_next) {
+ script_root = get_script_root(&script_dirent, REPORT_SUFFIX);
+ if (script_root) {
+ desc = script_desc__findnew(script_root);
+ snprintf(script_path, MAXPATHLEN, "%s/%s",
+ lang_path, script_dirent.d_name);
+ read_script_info(desc, script_path);
+ free(script_root);
+ }
+ }
+ }
+
+ fprintf(stdout, "List of available trace scripts:\n");
+ list_for_each_entry(desc, &script_descs, node) {
+ sprintf(first_half, "%s %s", desc->name,
+ desc->args ? desc->args : "");
+ fprintf(stdout, " %-36s %s\n", first_half,
+ desc->half_liner ? desc->half_liner : "");
+ }
+
+ exit(0);
+}
+
+/*
+ * Some scripts specify the required events in their "xxx-record" file,
+ * this function will check if the events in perf.data match those
+ * mentioned in the "xxx-record".
+ *
+ * Fixme: all existing "xxx-record" files use the simple "-e event" format,
+ * which is covered here. New parsing code will be needed for more complex
+ * formats such as event groups.
+ */
+static int check_ev_match(char *dir_name, char *scriptname,
+ struct perf_session *session)
+{
+ char filename[MAXPATHLEN], evname[128];
+ char line[BUFSIZ], *p;
+ struct perf_evsel *pos;
+ int match, len;
+ FILE *fp;
+
+ sprintf(filename, "%s/bin/%s-record", dir_name, scriptname);
+
+ fp = fopen(filename, "r");
+ if (!fp)
+ return -1;
+
+ while (fgets(line, sizeof(line), fp)) {
+ p = ltrim(line);
+ if (*p == '#')
+ continue;
+
+ while (strlen(p)) {
+ p = strstr(p, "-e");
+ if (!p)
+ break;
+
+ p += 2;
+ p = ltrim(p);
+ len = strcspn(p, " \t");
+ if (!len)
+ break;
+
+ snprintf(evname, len + 1, "%s", p);
+
+ match = 0;
+ evlist__for_each(session->evlist, pos) {
+ if (!strcmp(perf_evsel__name(pos), evname)) {
+ match = 1;
+ break;
+ }
+ }
+
+ if (!match) {
+ fclose(fp);
+ return -1;
+ }
+ }
+ }
+
+ fclose(fp);
+ return 0;
+}
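A hypothetical "xxx-record" file in the format this matcher handles:

	/*
	 * #!/bin/bash
	 * perf record -e raw_syscalls:sys_exit $@
	 *
	 * check_ev_match() extracts "raw_syscalls:sys_exit" and requires an
	 * evsel with that exact name in the session's evlist.
	 */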
+
+/*
+ * Return -1 if none are found, otherwise the number of scripts found.
+ *
+ * Currently the only user of this function is the script browser, which
+ * will list all statically runnable scripts, select one, execute it and
+ * show the output in a perf browser.
+ */
+int find_scripts(char **scripts_array, char **scripts_path_array)
+{
+ struct dirent *script_next, *lang_next, script_dirent, lang_dirent;
+ char scripts_path[MAXPATHLEN], lang_path[MAXPATHLEN];
+ DIR *scripts_dir, *lang_dir;
+ struct perf_session *session;
+ struct perf_data_file file = {
+ .path = input_name,
+ .mode = PERF_DATA_MODE_READ,
+ };
+ char *temp;
+ int i = 0;
+
+ session = perf_session__new(&file, false, NULL);
+ if (!session)
+ return -1;
+
+ snprintf(scripts_path, MAXPATHLEN, "%s/scripts", perf_exec_path());
+
+ scripts_dir = opendir(scripts_path);
+ if (!scripts_dir) {
+ perf_session__delete(session);
+ return -1;
+ }
+
+ for_each_lang(scripts_path, scripts_dir, lang_dirent, lang_next) {
+ snprintf(lang_path, MAXPATHLEN, "%s/%s", scripts_path,
+ lang_dirent.d_name);
+#ifdef NO_LIBPERL
+ if (strstr(lang_path, "perl"))
+ continue;
+#endif
+#ifdef NO_LIBPYTHON
+ if (strstr(lang_path, "python"))
+ continue;
+#endif
+
+ lang_dir = opendir(lang_path);
+ if (!lang_dir)
+ continue;
+
+ for_each_script(lang_path, lang_dir, script_dirent, script_next) {
+ /* Skip the real-time scripts: xxxtop.p[yl] */
+ if (strstr(script_dirent.d_name, "top."))
+ continue;
+ sprintf(scripts_path_array[i], "%s/%s", lang_path,
+ script_dirent.d_name);
+ temp = strchr(script_dirent.d_name, '.');
+ snprintf(scripts_array[i],
+ (temp - script_dirent.d_name) + 1,
+ "%s", script_dirent.d_name);
+
+ if (check_ev_match(lang_path,
+ scripts_array[i], session))
+ continue;
+
+ i++;
+ }
+ closedir(lang_dir);
+ }
+
+ closedir(scripts_dir);
+ perf_session__delete(session);
+ return i;
+}
+
+static char *get_script_path(const char *script_root, const char *suffix)
+{
+ struct dirent *script_next, *lang_next, script_dirent, lang_dirent;
+ char scripts_path[MAXPATHLEN];
+ char script_path[MAXPATHLEN];
+ DIR *scripts_dir, *lang_dir;
+ char lang_path[MAXPATHLEN];
+ char *__script_root;
+
+ snprintf(scripts_path, MAXPATHLEN, "%s/scripts", perf_exec_path());
+
+ scripts_dir = opendir(scripts_path);
+ if (!scripts_dir)
+ return NULL;
+
+ for_each_lang(scripts_path, scripts_dir, lang_dirent, lang_next) {
+ snprintf(lang_path, MAXPATHLEN, "%s/%s/bin", scripts_path,
+ lang_dirent.d_name);
+ lang_dir = opendir(lang_path);
+ if (!lang_dir)
+ continue;
+
+ for_each_script(lang_path, lang_dir, script_dirent, script_next) {
+ __script_root = get_script_root(&script_dirent, suffix);
+ if (__script_root && !strcmp(script_root, __script_root)) {
+ free(__script_root);
+ closedir(lang_dir);
+ closedir(scripts_dir);
+ snprintf(script_path, MAXPATHLEN, "%s/%s",
+ lang_path, script_dirent.d_name);
+ return strdup(script_path);
+ }
+ free(__script_root);
+ }
+ closedir(lang_dir);
+ }
+ closedir(scripts_dir);
+
+ return NULL;
+}
+
+static bool is_top_script(const char *script_path)
+{
+ return ends_with(script_path, "top") == NULL ? false : true;
+}
+
+static int has_required_arg(char *script_path)
+{
+ struct script_desc *desc;
+ int n_args = 0;
+ char *p;
+
+ desc = script_desc__new(NULL);
+
+ if (read_script_info(desc, script_path))
+ goto out;
+
+ if (!desc->args)
+ goto out;
+
+ for (p = desc->args; *p; p++)
+ if (*p == '<')
+ n_args++;
+out:
+ script_desc__delete(desc);
+
+ return n_args;
+}
+
+static int have_cmd(int argc, const char **argv)
+{
+ char **__argv = malloc(sizeof(const char *) * argc);
+
+ if (!__argv) {
+ pr_err("malloc failed\n");
+ return -1;
+ }
+
+ memcpy(__argv, argv, sizeof(const char *) * argc);
+ argc = parse_options(argc, (const char **)__argv, record_options,
+ NULL, PARSE_OPT_STOP_AT_NON_OPTION);
+ free(__argv);
+
+ system_wide = (argc == 0);
+
+ return 0;
+}
+
+int cmd_script(int argc, const char **argv, const char *prefix __maybe_unused)
+{
+ bool show_full_info = false;
+ bool header = false;
+ bool header_only = false;
+ char *rec_script_path = NULL;
+ char *rep_script_path = NULL;
+ struct perf_session *session;
+ char *script_path = NULL;
+ const char **__argv;
+ int i, j, err;
+ struct perf_script script = {
+ .tool = {
+ .sample = process_sample_event,
+ .mmap = perf_event__process_mmap,
+ .mmap2 = perf_event__process_mmap2,
+ .comm = perf_event__process_comm,
+ .exit = perf_event__process_exit,
+ .fork = perf_event__process_fork,
+ .attr = process_attr,
+ .tracing_data = perf_event__process_tracing_data,
+ .build_id = perf_event__process_build_id,
+ .ordered_samples = true,
+ .ordering_requires_timestamps = true,
+ },
+ };
+ const struct option options[] = {
+ OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
+ "dump raw trace in ASCII"),
+ OPT_INCR('v', "verbose", &verbose,
+ "be more verbose (show symbol address, etc)"),
+ OPT_BOOLEAN('L', "Latency", &latency_format,
+ "show latency attributes (irqs/preemption disabled, etc)"),
+ OPT_CALLBACK_NOOPT('l', "list", NULL, NULL, "list available scripts",
+ list_available_scripts),
+ OPT_CALLBACK('s', "script", NULL, "name",
+ "script file name (lang:script name, script name, or *)",
+ parse_scriptname),
+ OPT_STRING('g', "gen-script", &generate_script_lang, "lang",
+ "generate perf-script.xx script in specified language"),
+ OPT_STRING('i', "input", &input_name, "file", "input file name"),
+ OPT_BOOLEAN('d', "debug-mode", &debug_mode,
+ "do various checks like samples ordering and lost events"),
+ OPT_BOOLEAN(0, "header", &header, "Show data header."),
+ OPT_BOOLEAN(0, "header-only", &header_only, "Show only data header."),
+ OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
+ "file", "vmlinux pathname"),
+ OPT_STRING(0, "kallsyms", &symbol_conf.kallsyms_name,
+ "file", "kallsyms pathname"),
+ OPT_BOOLEAN('G', "hide-call-graph", &no_callchain,
+ "When printing symbols do not display call chain"),
+ OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
+ "Look for files with symbols relative to this directory"),
+ OPT_CALLBACK('f', "fields", NULL, "str",
+ "comma separated output fields prepend with 'type:'. "
+ "Valid types: hw,sw,trace,raw. "
+ "Fields: comm,tid,pid,time,cpu,event,trace,ip,sym,dso,"
+ "addr,symoff", parse_output_fields),
+ OPT_BOOLEAN('a', "all-cpus", &system_wide,
+ "system-wide collection from all CPUs"),
+ OPT_STRING('S', "symbols", &symbol_conf.sym_list_str, "symbol[,symbol...]",
+ "only consider these symbols"),
+ OPT_STRING('C', "cpu", &cpu_list, "cpu", "list of cpus to profile"),
+ OPT_STRING('c', "comms", &symbol_conf.comm_list_str, "comm[,comm...]",
+ "only display events for these comms"),
+ OPT_BOOLEAN('I', "show-info", &show_full_info,
+ "display extended information from perf.data file"),
+ OPT_BOOLEAN('\0', "show-kernel-path", &symbol_conf.show_kernel_path,
+ "Show the path of [kernel.kallsyms]"),
+ OPT_BOOLEAN('\0', "show-task-events", &script.show_task_events,
+ "Show the fork/comm/exit events"),
+ OPT_BOOLEAN('\0', "show-mmap-events", &script.show_mmap_events,
+ "Show the mmap events"),
+ OPT_END()
+ };
+ const char * const script_usage[] = {
+ "perf script [<options>]",
+ "perf script [<options>] record <script> [<record-options>] <command>",
+ "perf script [<options>] report <script> [script-args]",
+ "perf script [<options>] <script> [<record-options>] <command>",
+ "perf script [<options>] <top-script> [script-args]",
+ NULL
+ };
+ struct perf_data_file file = {
+ .mode = PERF_DATA_MODE_READ,
+ };
+
+ setup_scripting();
+
+ argc = parse_options(argc, argv, options, script_usage,
+ PARSE_OPT_STOP_AT_NON_OPTION);
+
+ file.path = input_name;
+
+ if (argc > 1 && !strncmp(argv[0], "rec", strlen("rec"))) {
+ rec_script_path = get_script_path(argv[1], RECORD_SUFFIX);
+ if (!rec_script_path)
+ return cmd_record(argc, argv, NULL);
+ }
+
+ if (argc > 1 && !strncmp(argv[0], "rep", strlen("rep"))) {
+ rep_script_path = get_script_path(argv[1], REPORT_SUFFIX);
+ if (!rep_script_path) {
+ fprintf(stderr,
+ "Please specify a valid report script"
+ "(see 'perf script -l' for listing)\n");
+ return -1;
+ }
+ }
+
+ /* make sure PERF_EXEC_PATH is set for scripts */
+ perf_set_argv_exec_path(perf_exec_path());
+
+ if (argc && !script_name && !rec_script_path && !rep_script_path) {
+ int live_pipe[2];
+ int rep_args;
+ pid_t pid;
+
+ rec_script_path = get_script_path(argv[0], RECORD_SUFFIX);
+ rep_script_path = get_script_path(argv[0], REPORT_SUFFIX);
+
+ if (!rec_script_path && !rep_script_path) {
+ fprintf(stderr, " Couldn't find script %s\n\n See perf"
+ " script -l for available scripts.\n", argv[0]);
+ usage_with_options(script_usage, options);
+ }
+
+ if (is_top_script(argv[0])) {
+ rep_args = argc - 1;
+ } else {
+ int rec_args;
+
+ rep_args = has_required_arg(rep_script_path);
+ rec_args = (argc - 1) - rep_args;
+ if (rec_args < 0) {
+ fprintf(stderr, " %s script requires options."
+ "\n\n See perf script -l for available "
+ "scripts and options.\n", argv[0]);
+ usage_with_options(script_usage, options);
+ }
+ }
+
+ if (pipe(live_pipe) < 0) {
+ perror("failed to create pipe");
+ return -1;
+ }
+
+ pid = fork();
+ if (pid < 0) {
+ perror("failed to fork");
+ return -1;
+ }
+
+ if (!pid) {
+ j = 0;
+
+ dup2(live_pipe[1], 1);
+ close(live_pipe[0]);
+
+ if (is_top_script(argv[0])) {
+ system_wide = true;
+ } else if (!system_wide) {
+ if (have_cmd(argc - rep_args, &argv[rep_args]) != 0) {
+ err = -1;
+ goto out;
+ }
+ }
+
+ __argv = malloc((argc + 6) * sizeof(const char *));
+ if (!__argv) {
+ pr_err("malloc failed\n");
+ err = -ENOMEM;
+ goto out;
+ }
+
+ __argv[j++] = "/bin/sh";
+ __argv[j++] = rec_script_path;
+ if (system_wide)
+ __argv[j++] = "-a";
+ __argv[j++] = "-q";
+ __argv[j++] = "-o";
+ __argv[j++] = "-";
+ for (i = rep_args + 1; i < argc; i++)
+ __argv[j++] = argv[i];
+ __argv[j++] = NULL;
+
+ execvp("/bin/sh", (char **)__argv);
+ free(__argv);
+ exit(-1);
+ }
+
+ dup2(live_pipe[0], 0);
+ close(live_pipe[1]);
+
+ __argv = malloc((argc + 4) * sizeof(const char *));
+ if (!__argv) {
+ pr_err("malloc failed\n");
+ err = -ENOMEM;
+ goto out;
+ }
+
+ j = 0;
+ __argv[j++] = "/bin/sh";
+ __argv[j++] = rep_script_path;
+ for (i = 1; i < rep_args + 1; i++)
+ __argv[j++] = argv[i];
+ __argv[j++] = "-i";
+ __argv[j++] = "-";
+ __argv[j++] = NULL;
+
+ execvp("/bin/sh", (char **)__argv);
+ free(__argv);
+ exit(-1);
+ }
+
+ if (rec_script_path)
+ script_path = rec_script_path;
+ if (rep_script_path)
+ script_path = rep_script_path;
+
+ if (script_path) {
+ j = 0;
+
+ if (!rec_script_path)
+ system_wide = false;
+ else if (!system_wide) {
+ if (have_cmd(argc - 1, &argv[1]) != 0) {
+ err = -1;
+ goto out;
+ }
+ }
+
+ __argv = malloc((argc + 2) * sizeof(const char *));
+ if (!__argv) {
+ pr_err("malloc failed\n");
+ err = -ENOMEM;
+ goto out;
+ }
+
+ __argv[j++] = "/bin/sh";
+ __argv[j++] = script_path;
+ if (system_wide)
+ __argv[j++] = "-a";
+ for (i = 2; i < argc; i++)
+ __argv[j++] = argv[i];
+ __argv[j++] = NULL;
+
+ execvp("/bin/sh", (char **)__argv);
+ free(__argv);
+ exit(-1);
+ }
+
+ if (symbol__init() < 0)
+ return -1;
+ if (!script_name)
+ setup_pager();
+
+ session = perf_session__new(&file, false, &script.tool);
+ if (session == NULL)
+ return -ENOMEM;
+
+ if (header || header_only) {
+ perf_session__fprintf_info(session, stdout, show_full_info);
+ if (header_only)
+ return 0;
+ }
+
+ script.session = session;
+
+ if (cpu_list) {
+ if (perf_session__cpu_bitmap(session, cpu_list, cpu_bitmap))
+ return -1;
+ }
+
+ if (!no_callchain)
+ symbol_conf.use_callchain = true;
+ else
+ symbol_conf.use_callchain = false;
+
+ if (generate_script_lang) {
+ struct stat perf_stat;
+ int input;
+
+ if (output_set_by_user()) {
+ fprintf(stderr,
+ "custom fields not supported for generated scripts");
+ return -1;
+ }
+
+ input = open(file.path, O_RDONLY); /* input_name */
+ if (input < 0) {
+ perror("failed to open file");
+ return -1;
+ }
+
+ err = fstat(input, &perf_stat);
+ if (err < 0) {
+ perror("failed to stat file");
+ return -1;
+ }
+
+ if (!perf_stat.st_size) {
+ fprintf(stderr, "zero-sized file, nothing to do!\n");
+ return 0;
+ }
+
+ scripting_ops = script_spec__lookup(generate_script_lang);
+ if (!scripting_ops) {
+ fprintf(stderr, "invalid language specifier");
+ return -1;
+ }
+
+ err = scripting_ops->generate_script(session->tevent.pevent,
+ "perf-script");
+ goto out;
+ }
+
+ if (script_name) {
+ err = scripting_ops->start_script(script_name, argc, argv);
+ if (err)
+ goto out;
+ pr_debug("perf script started with script %s\n\n", script_name);
+ }
+
+
+ err = perf_session__check_output_opt(session);
+ if (err < 0)
+ goto out;
+
+ err = __cmd_script(&script);
+
+ perf_session__delete(session);
+ cleanup_scripting();
+out:
+ return err;
+}
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 95db31cff6f..65a151e3606 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -6,24 +6,28 @@
*
* Sample output:
- $ perf stat ~/hackbench 10
- Time: 0.104
+ $ perf stat ./hackbench 10
- Performance counter stats for '/home/mingo/hackbench':
+ Time: 0.118
- 1255.538611 task clock ticks # 10.143 CPU utilization factor
- 54011 context switches # 0.043 M/sec
- 385 CPU migrations # 0.000 M/sec
- 17755 pagefaults # 0.014 M/sec
- 3808323185 CPU cycles # 3033.219 M/sec
- 1575111190 instructions # 1254.530 M/sec
- 17367895 cache references # 13.833 M/sec
- 7674421 cache misses # 6.112 M/sec
+ Performance counter stats for './hackbench 10':
- Wall-clock time elapsed: 123.786620 msecs
+ 1708.761321 task-clock # 11.037 CPUs utilized
+ 41,190 context-switches # 0.024 M/sec
+ 6,735 CPU-migrations # 0.004 M/sec
+ 17,318 page-faults # 0.010 M/sec
+ 5,205,202,243 cycles # 3.046 GHz
+ 3,856,436,920 stalled-cycles-frontend # 74.09% frontend cycles idle
+ 1,600,790,871 stalled-cycles-backend # 30.75% backend cycles idle
+ 2,603,501,247 instructions # 0.50 insns per cycle
+ # 1.48 stalled cycles per insn
+ 484,357,498 branches # 283.455 M/sec
+ 6,388,934 branch-misses # 1.32% of all branches
+
+ 0.154822978 seconds time elapsed
*
- * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
+ * Copyright (C) 2008-2011, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
*
* Improvements and fixes by:
*
@@ -42,434 +46,1356 @@
#include "util/util.h"
#include "util/parse-options.h"
#include "util/parse-events.h"
+#include "util/pmu.h"
#include "util/event.h"
+#include "util/evlist.h"
+#include "util/evsel.h"
#include "util/debug.h"
+#include "util/color.h"
+#include "util/stat.h"
#include "util/header.h"
#include "util/cpumap.h"
+#include "util/thread.h"
+#include "util/thread_map.h"
+#include <stdlib.h>
#include <sys/prctl.h>
-#include <math.h>
+#include <locale.h>
+
+#define DEFAULT_SEPARATOR " "
+#define CNTR_NOT_SUPPORTED "<not supported>"
+#define CNTR_NOT_COUNTED "<not counted>"
+
+static void print_stat(int argc, const char **argv);
+static void print_counter_aggr(struct perf_evsel *counter, char *prefix);
+static void print_counter(struct perf_evsel *counter, char *prefix);
+static void print_aggr(char *prefix);
+
+/* Default events used for perf stat -T */
+static const char * const transaction_attrs[] = {
+ "task-clock",
+ "{"
+ "instructions,"
+ "cycles,"
+ "cpu/cycles-t/,"
+ "cpu/tx-start/,"
+ "cpu/el-start/,"
+ "cpu/cycles-ct/"
+ "}"
+};
-static struct perf_event_attr default_attrs[] = {
+/* More limited version when the CPU does not have all events. */
+static const char * const transaction_limited_attrs[] = {
+ "task-clock",
+ "{"
+ "instructions,"
+ "cycles,"
+ "cpu/cycles-t/,"
+ "cpu/tx-start/"
+ "}"
+};
- { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK },
- { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES },
- { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS },
- { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS },
+/* must match transaction_attrs and the beginning of transaction_limited_attrs */
+enum {
+ T_TASK_CLOCK,
+ T_INSTRUCTIONS,
+ T_CYCLES,
+ T_CYCLES_IN_TX,
+ T_TRANSACTION_START,
+ T_ELISION_START,
+ T_CYCLES_IN_TX_CP,
+};
- { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES },
- { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS },
- { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
- { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES },
- { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_REFERENCES },
- { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_MISSES },
+static struct perf_evlist *evsel_list;
+static struct target target = {
+ .uid = UINT_MAX,
};
-static int system_wide = 0;
-static unsigned int nr_cpus = 0;
-static int run_idx = 0;
+enum aggr_mode {
+ AGGR_NONE,
+ AGGR_GLOBAL,
+ AGGR_SOCKET,
+ AGGR_CORE,
+};
static int run_count = 1;
-static int inherit = 1;
-static int scale = 1;
-static pid_t target_pid = -1;
-static pid_t child_pid = -1;
-static int null_run = 0;
+static bool no_inherit = false;
+static bool scale = true;
+static enum aggr_mode aggr_mode = AGGR_GLOBAL;
+static volatile pid_t child_pid = -1;
+static bool null_run = false;
+static int detailed_run = 0;
+static bool transaction_run;
+static bool big_num = true;
+static int big_num_opt = -1;
+static const char *csv_sep = NULL;
+static bool csv_output = false;
+static bool group = false;
+static FILE *output = NULL;
+static const char *pre_cmd = NULL;
+static const char *post_cmd = NULL;
+static bool sync_run = false;
+static unsigned int interval = 0;
+static unsigned int initial_delay = 0;
+static unsigned int unit_width = 4; /* strlen("unit") */
+static bool forever = false;
+static struct timespec ref_time;
+static struct cpu_map *aggr_map;
+static int (*aggr_get_id)(struct cpu_map *m, int cpu);
-static int fd[MAX_NR_CPUS][MAX_COUNTERS];
+static volatile int done = 0;
-static int event_scaled[MAX_COUNTERS];
+struct perf_stat {
+ struct stats res_stats[3];
+};
-static volatile int done = 0;
+static inline void diff_timespec(struct timespec *r, struct timespec *a,
+ struct timespec *b)
+{
+ r->tv_sec = a->tv_sec - b->tv_sec;
+ if (a->tv_nsec < b->tv_nsec) {
+ r->tv_nsec = a->tv_nsec + 1000000000L - b->tv_nsec;
+ r->tv_sec--;
+ } else {
+ r->tv_nsec = a->tv_nsec - b->tv_nsec ;
+ }
+}
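A quick worked example of the borrow handling in diff_timespec():

	/*
	 * a = { .tv_sec = 5, .tv_nsec = 100000000 }
	 * b = { .tv_sec = 3, .tv_nsec = 900000000 }
	 * -> r.tv_nsec = 100000000 + 1000000000 - 900000000 = 200000000
	 * -> r.tv_sec  = (5 - 3) - 1 = 1, i.e. 1.2 seconds
	 */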
-struct stats
+static inline struct cpu_map *perf_evsel__cpus(struct perf_evsel *evsel)
{
- double n, mean, M2;
-};
+ return (evsel->cpus && !target.cpu_list) ? evsel->cpus : evsel_list->cpus;
+}
-static void update_stats(struct stats *stats, u64 val)
+static inline int perf_evsel__nr_cpus(struct perf_evsel *evsel)
{
- double delta;
+ return perf_evsel__cpus(evsel)->nr;
+}
- stats->n++;
- delta = val - stats->mean;
- stats->mean += delta / stats->n;
- stats->M2 += delta*(val - stats->mean);
+static void perf_evsel__reset_stat_priv(struct perf_evsel *evsel)
+{
+ int i;
+ struct perf_stat *ps = evsel->priv;
+
+ for (i = 0; i < 3; i++)
+ init_stats(&ps->res_stats[i]);
}
-static double avg_stats(struct stats *stats)
+static int perf_evsel__alloc_stat_priv(struct perf_evsel *evsel)
{
- return stats->mean;
+ evsel->priv = zalloc(sizeof(struct perf_stat));
+ if (evsel->priv == NULL)
+ return -ENOMEM;
+ perf_evsel__reset_stat_priv(evsel);
+ return 0;
}
-/*
- * http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance
- *
- * (\Sum n_i^2) - ((\Sum n_i)^2)/n
- * s^2 = -------------------------------
- * n - 1
- *
- * http://en.wikipedia.org/wiki/Stddev
- *
- * The std dev of the mean is related to the std dev by:
- *
- * s
- * s_mean = -------
- * sqrt(n)
- *
- */
-static double stddev_stats(struct stats *stats)
+static void perf_evsel__free_stat_priv(struct perf_evsel *evsel)
+{
+ zfree(&evsel->priv);
+}
+
+static int perf_evsel__alloc_prev_raw_counts(struct perf_evsel *evsel)
{
- double variance = stats->M2 / (stats->n - 1);
- double variance_mean = variance / stats->n;
+ void *addr;
+ size_t sz;
+
+ sz = sizeof(*evsel->counts) +
+ (perf_evsel__nr_cpus(evsel) * sizeof(struct perf_counts_values));
+
+ addr = zalloc(sz);
+ if (!addr)
+ return -ENOMEM;
- return sqrt(variance_mean);
+ evsel->prev_raw_counts = addr;
+
+ return 0;
}
-struct stats event_res_stats[MAX_COUNTERS][3];
-struct stats runtime_nsecs_stats;
-struct stats walltime_nsecs_stats;
-struct stats runtime_cycles_stats;
-struct stats runtime_branches_stats;
+static void perf_evsel__free_prev_raw_counts(struct perf_evsel *evsel)
+{
+ zfree(&evsel->prev_raw_counts);
+}
-#define MATCH_EVENT(t, c, counter) \
- (attrs[counter].type == PERF_TYPE_##t && \
- attrs[counter].config == PERF_COUNT_##c)
+static void perf_evlist__free_stats(struct perf_evlist *evlist)
+{
+ struct perf_evsel *evsel;
-#define ERR_PERF_OPEN \
-"Error: counter %d, sys_perf_event_open() syscall returned with %d (%s)\n"
+ evlist__for_each(evlist, evsel) {
+ perf_evsel__free_stat_priv(evsel);
+ perf_evsel__free_counts(evsel);
+ perf_evsel__free_prev_raw_counts(evsel);
+ }
+}
-static void create_perf_stat_counter(int counter, int pid)
+static int perf_evlist__alloc_stats(struct perf_evlist *evlist, bool alloc_raw)
{
- struct perf_event_attr *attr = attrs + counter;
+ struct perf_evsel *evsel;
+
+ evlist__for_each(evlist, evsel) {
+ if (perf_evsel__alloc_stat_priv(evsel) < 0 ||
+ perf_evsel__alloc_counts(evsel, perf_evsel__nr_cpus(evsel)) < 0 ||
+ (alloc_raw && perf_evsel__alloc_prev_raw_counts(evsel) < 0))
+ goto out_free;
+ }
+
+ return 0;
+
+out_free:
+ perf_evlist__free_stats(evlist);
+ return -1;
+}
+
+static struct stats runtime_nsecs_stats[MAX_NR_CPUS];
+static struct stats runtime_cycles_stats[MAX_NR_CPUS];
+static struct stats runtime_stalled_cycles_front_stats[MAX_NR_CPUS];
+static struct stats runtime_stalled_cycles_back_stats[MAX_NR_CPUS];
+static struct stats runtime_branches_stats[MAX_NR_CPUS];
+static struct stats runtime_cacherefs_stats[MAX_NR_CPUS];
+static struct stats runtime_l1_dcache_stats[MAX_NR_CPUS];
+static struct stats runtime_l1_icache_stats[MAX_NR_CPUS];
+static struct stats runtime_ll_cache_stats[MAX_NR_CPUS];
+static struct stats runtime_itlb_cache_stats[MAX_NR_CPUS];
+static struct stats runtime_dtlb_cache_stats[MAX_NR_CPUS];
+static struct stats runtime_cycles_in_tx_stats[MAX_NR_CPUS];
+static struct stats walltime_nsecs_stats;
+static struct stats runtime_transaction_stats[MAX_NR_CPUS];
+static struct stats runtime_elision_stats[MAX_NR_CPUS];
+
+static void perf_stat__reset_stats(struct perf_evlist *evlist)
+{
+ struct perf_evsel *evsel;
+
+ evlist__for_each(evlist, evsel) {
+ perf_evsel__reset_stat_priv(evsel);
+ perf_evsel__reset_counts(evsel, perf_evsel__nr_cpus(evsel));
+ }
+
+ memset(runtime_nsecs_stats, 0, sizeof(runtime_nsecs_stats));
+ memset(runtime_cycles_stats, 0, sizeof(runtime_cycles_stats));
+ memset(runtime_stalled_cycles_front_stats, 0, sizeof(runtime_stalled_cycles_front_stats));
+ memset(runtime_stalled_cycles_back_stats, 0, sizeof(runtime_stalled_cycles_back_stats));
+ memset(runtime_branches_stats, 0, sizeof(runtime_branches_stats));
+ memset(runtime_cacherefs_stats, 0, sizeof(runtime_cacherefs_stats));
+ memset(runtime_l1_dcache_stats, 0, sizeof(runtime_l1_dcache_stats));
+ memset(runtime_l1_icache_stats, 0, sizeof(runtime_l1_icache_stats));
+ memset(runtime_ll_cache_stats, 0, sizeof(runtime_ll_cache_stats));
+ memset(runtime_itlb_cache_stats, 0, sizeof(runtime_itlb_cache_stats));
+ memset(runtime_dtlb_cache_stats, 0, sizeof(runtime_dtlb_cache_stats));
+ memset(runtime_cycles_in_tx_stats, 0,
+ sizeof(runtime_cycles_in_tx_stats));
+ memset(runtime_transaction_stats, 0,
+ sizeof(runtime_transaction_stats));
+ memset(runtime_elision_stats, 0, sizeof(runtime_elision_stats));
+ memset(&walltime_nsecs_stats, 0, sizeof(walltime_nsecs_stats));
+}
+
+static int create_perf_stat_counter(struct perf_evsel *evsel)
+{
+ struct perf_event_attr *attr = &evsel->attr;
if (scale)
attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
PERF_FORMAT_TOTAL_TIME_RUNNING;
- if (system_wide) {
- unsigned int cpu;
+ attr->inherit = !no_inherit;
- for (cpu = 0; cpu < nr_cpus; cpu++) {
- fd[cpu][counter] = sys_perf_event_open(attr, -1, cpumap[cpu], -1, 0);
- if (fd[cpu][counter] < 0 && verbose)
- fprintf(stderr, ERR_PERF_OPEN, counter,
- fd[cpu][counter], strerror(errno));
- }
- } else {
- attr->inherit = inherit;
- attr->disabled = 1;
- attr->enable_on_exec = 1;
+ if (target__has_cpu(&target))
+ return perf_evsel__open_per_cpu(evsel, perf_evsel__cpus(evsel));
- fd[0][counter] = sys_perf_event_open(attr, pid, -1, -1, 0);
- if (fd[0][counter] < 0 && verbose)
- fprintf(stderr, ERR_PERF_OPEN, counter,
- fd[0][counter], strerror(errno));
+ if (!target__has_task(&target) && perf_evsel__is_group_leader(evsel)) {
+ attr->disabled = 1;
+ if (!initial_delay)
+ attr->enable_on_exec = 1;
}
+
+ return perf_evsel__open_per_thread(evsel, evsel_list->threads);
}
/*
* Does the counter have nsecs as a unit?
*/
-static inline int nsec_counter(int counter)
+static inline int nsec_counter(struct perf_evsel *evsel)
{
- if (MATCH_EVENT(SOFTWARE, SW_CPU_CLOCK, counter) ||
- MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter))
+ if (perf_evsel__match(evsel, SOFTWARE, SW_CPU_CLOCK) ||
+ perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK))
return 1;
return 0;
}
+static struct perf_evsel *nth_evsel(int n)
+{
+ static struct perf_evsel **array;
+ static int array_len;
+ struct perf_evsel *ev;
+ int j;
+
+ /* Assumes this is only called once evsel_list no longer changes. */
+ if (!array) {
+ evlist__for_each(evsel_list, ev)
+ array_len++;
+ array = malloc(array_len * sizeof(void *));
+ if (!array)
+ exit(ENOMEM);
+ j = 0;
+ evlist__for_each(evsel_list, ev)
+ array[j++] = ev;
+ }
+ if (n < array_len)
+ return array[n];
+ return NULL;
+}
+
+/*
+ * Update various tracking values we maintain to print
+ * more semantic information such as miss/hit ratios,
+ * instruction rates, etc:
+ */
+static void update_shadow_stats(struct perf_evsel *counter, u64 *count)
+{
+ if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK))
+ update_stats(&runtime_nsecs_stats[0], count[0]);
+ else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES))
+ update_stats(&runtime_cycles_stats[0], count[0]);
+ else if (transaction_run &&
+ perf_evsel__cmp(counter, nth_evsel(T_CYCLES_IN_TX)))
+ update_stats(&runtime_cycles_in_tx_stats[0], count[0]);
+ else if (transaction_run &&
+ perf_evsel__cmp(counter, nth_evsel(T_TRANSACTION_START)))
+ update_stats(&runtime_transaction_stats[0], count[0]);
+ else if (transaction_run &&
+ perf_evsel__cmp(counter, nth_evsel(T_ELISION_START)))
+ update_stats(&runtime_elision_stats[0], count[0]);
+ else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND))
+ update_stats(&runtime_stalled_cycles_front_stats[0], count[0]);
+ else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND))
+ update_stats(&runtime_stalled_cycles_back_stats[0], count[0]);
+ else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS))
+ update_stats(&runtime_branches_stats[0], count[0]);
+ else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES))
+ update_stats(&runtime_cacherefs_stats[0], count[0]);
+ else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1D))
+ update_stats(&runtime_l1_dcache_stats[0], count[0]);
+ else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1I))
+ update_stats(&runtime_l1_icache_stats[0], count[0]);
+ else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_LL))
+ update_stats(&runtime_ll_cache_stats[0], count[0]);
+ else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_DTLB))
+ update_stats(&runtime_dtlb_cache_stats[0], count[0]);
+ else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_ITLB))
+ update_stats(&runtime_itlb_cache_stats[0], count[0]);
+}
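These shadow stats are what later turn raw counts into the derived metrics; a minimal sketch of the IPC case, where avg_insns is a hypothetical stand-in for the averaged instruction count computed in abs_printout() below:

	double cycles = avg_stats(&runtime_cycles_stats[0]);
	double ipc    = cycles ? avg_insns / cycles : 0.0;	/* printed as "insns per cycle" */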
+
/*
* Read out the results of a single counter:
+ * aggregate counts across CPUs in system-wide mode
*/
-static void read_counter(int counter)
+static int read_counter_aggr(struct perf_evsel *counter)
{
- u64 count[3], single_count[3];
- unsigned int cpu;
- size_t res, nv;
- int scaled;
+ struct perf_stat *ps = counter->priv;
+ u64 *count = counter->counts->aggr.values;
int i;
- count[0] = count[1] = count[2] = 0;
+ if (__perf_evsel__read(counter, perf_evsel__nr_cpus(counter),
+ thread_map__nr(evsel_list->threads), scale) < 0)
+ return -1;
- nv = scale ? 3 : 1;
- for (cpu = 0; cpu < nr_cpus; cpu++) {
- if (fd[cpu][counter] < 0)
- continue;
+ for (i = 0; i < 3; i++)
+ update_stats(&ps->res_stats[i], count[i]);
+
+ if (verbose) {
+ fprintf(output, "%s: %" PRIu64 " %" PRIu64 " %" PRIu64 "\n",
+ perf_evsel__name(counter), count[0], count[1], count[2]);
+ }
- res = read(fd[cpu][counter], single_count, nv * sizeof(u64));
- assert(res == nv * sizeof(u64));
+ /*
+ * Save the full runtime - to allow normalization during printout:
+ */
+ update_shadow_stats(counter, count);
- close(fd[cpu][counter]);
- fd[cpu][counter] = -1;
+ return 0;
+}
- count[0] += single_count[0];
- if (scale) {
- count[1] += single_count[1];
- count[2] += single_count[2];
- }
+/*
+ * Read out the results of a single counter:
+ * do not aggregate counts across CPUs in system-wide mode
+ */
+static int read_counter(struct perf_evsel *counter)
+{
+ u64 *count;
+ int cpu;
+
+ for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) {
+ if (__perf_evsel__read_on_cpu(counter, cpu, 0, scale) < 0)
+ return -1;
+
+ count = counter->counts->cpu[cpu].values;
+
+ update_shadow_stats(counter, count);
}
- scaled = 0;
- if (scale) {
- if (count[2] == 0) {
- event_scaled[counter] = -1;
- count[0] = 0;
- return;
+ return 0;
+}
+
+static void print_interval(void)
+{
+ static int num_print_interval;
+ struct perf_evsel *counter;
+ struct perf_stat *ps;
+ struct timespec ts, rs;
+ char prefix[64];
+
+ if (aggr_mode == AGGR_GLOBAL) {
+ evlist__for_each(evsel_list, counter) {
+ ps = counter->priv;
+ memset(ps->res_stats, 0, sizeof(ps->res_stats));
+ read_counter_aggr(counter);
+ }
+ } else {
+ evlist__for_each(evsel_list, counter) {
+ ps = counter->priv;
+ memset(ps->res_stats, 0, sizeof(ps->res_stats));
+ read_counter(counter);
}
+ }
- if (count[2] < count[1]) {
- event_scaled[counter] = 1;
- count[0] = (unsigned long long)
- ((double)count[0] * count[1] / count[2] + 0.5);
+ clock_gettime(CLOCK_MONOTONIC, &ts);
+ diff_timespec(&rs, &ts, &ref_time);
+ sprintf(prefix, "%6lu.%09lu%s", rs.tv_sec, rs.tv_nsec, csv_sep);
+
+ if (num_print_interval == 0 && !csv_output) {
+ switch (aggr_mode) {
+ case AGGR_SOCKET:
+ fprintf(output, "# time socket cpus counts %*s events\n", unit_width, "unit");
+ break;
+ case AGGR_CORE:
+ fprintf(output, "# time core cpus counts %*s events\n", unit_width, "unit");
+ break;
+ case AGGR_NONE:
+ fprintf(output, "# time CPU counts %*s events\n", unit_width, "unit");
+ break;
+ case AGGR_GLOBAL:
+ default:
+ fprintf(output, "# time counts %*s events\n", unit_width, "unit");
}
}
- for (i = 0; i < 3; i++)
- update_stats(&event_res_stats[counter][i], count[i]);
+ if (++num_print_interval == 25)
+ num_print_interval = 0;
+
+ switch (aggr_mode) {
+ case AGGR_CORE:
+ case AGGR_SOCKET:
+ print_aggr(prefix);
+ break;
+ case AGGR_NONE:
+ evlist__for_each(evsel_list, counter)
+ print_counter(counter, prefix);
+ break;
+ case AGGR_GLOBAL:
+ default:
+ evlist__for_each(evsel_list, counter)
+ print_counter_aggr(counter, prefix);
+ }
- if (verbose) {
- fprintf(stderr, "%s: %Ld %Ld %Ld\n", event_name(counter),
- count[0], count[1], count[2]);
+ fflush(output);
+}
+
+static void handle_initial_delay(void)
+{
+ struct perf_evsel *counter;
+
+ if (initial_delay) {
+ const int ncpus = cpu_map__nr(evsel_list->cpus),
+ nthreads = thread_map__nr(evsel_list->threads);
+
+ usleep(initial_delay * 1000);
+ evlist__for_each(evsel_list, counter)
+ perf_evsel__enable(counter, ncpus, nthreads);
}
+}
- /*
- * Save the full runtime - to allow normalization during printout:
- */
- if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter))
- update_stats(&runtime_nsecs_stats, count[0]);
- if (MATCH_EVENT(HARDWARE, HW_CPU_CYCLES, counter))
- update_stats(&runtime_cycles_stats, count[0]);
- if (MATCH_EVENT(HARDWARE, HW_BRANCH_INSTRUCTIONS, counter))
- update_stats(&runtime_branches_stats, count[0]);
+static volatile int workload_exec_errno;
+
+/*
+ * perf_evlist__prepare_workload will send a SIGUSR1
+ * if the fork fails, since we asked for it by setting its
+ * want_signal to true.
+ */
+static void workload_exec_failed_signal(int signo __maybe_unused, siginfo_t *info,
+ void *ucontext __maybe_unused)
+{
+ workload_exec_errno = info->si_value.sival_int;
}
-static int run_perf_stat(int argc __used, const char **argv)
+static int __run_perf_stat(int argc, const char **argv)
{
+ char msg[512];
unsigned long long t0, t1;
+ struct perf_evsel *counter;
+ struct timespec ts;
+ size_t l;
int status = 0;
- int counter;
- int pid = target_pid;
- int child_ready_pipe[2], go_pipe[2];
- const bool forks = (target_pid == -1 && argc > 0);
- char buf;
+ const bool forks = (argc > 0);
- if (!system_wide)
- nr_cpus = 1;
-
- if (forks && (pipe(child_ready_pipe) < 0 || pipe(go_pipe) < 0)) {
- perror("failed to create pipes");
- exit(1);
+ if (interval) {
+ ts.tv_sec = interval / 1000;
+ ts.tv_nsec = (interval % 1000) * 1000000;
+ } else {
+ ts.tv_sec = 1;
+ ts.tv_nsec = 0;
}
if (forks) {
- if ((pid = fork()) < 0)
- perror("failed to fork");
-
- if (!pid) {
- close(child_ready_pipe[0]);
- close(go_pipe[1]);
- fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC);
+ if (perf_evlist__prepare_workload(evsel_list, &target, argv, false,
+ workload_exec_failed_signal) < 0) {
+ perror("failed to prepare workload");
+ return -1;
+ }
+ child_pid = evsel_list->workload.pid;
+ }
- /*
- * Do a dummy execvp to get the PLT entry resolved,
- * so we avoid the resolver overhead on the real
- * execvp call.
- */
- execvp("", (char **)argv);
+ if (group)
+ perf_evlist__set_leader(evsel_list);
+ evlist__for_each(evsel_list, counter) {
+ if (create_perf_stat_counter(counter) < 0) {
/*
- * Tell the parent we're ready to go
+ * PPC returns ENXIO for HW counters until 2.6.37
+ * (behavior changed with commit b0a873e).
*/
- close(child_ready_pipe[1]);
-
- /*
- * Wait until the parent tells us to go.
- */
- if (read(go_pipe[0], &buf, 1) == -1)
- perror("unable to read pipe");
-
- execvp(argv[0], (char **)argv);
-
- perror(argv[0]);
- exit(-1);
+ if (errno == EINVAL || errno == ENOSYS ||
+ errno == ENOENT || errno == EOPNOTSUPP ||
+ errno == ENXIO) {
+ if (verbose)
+ ui__warning("%s event is not supported by the kernel.\n",
+ perf_evsel__name(counter));
+ counter->supported = false;
+ continue;
+ }
+
+ perf_evsel__open_strerror(counter, &target,
+ errno, msg, sizeof(msg));
+ ui__error("%s\n", msg);
+
+ if (child_pid != -1)
+ kill(child_pid, SIGTERM);
+
+ return -1;
}
+ counter->supported = true;
- child_pid = pid;
-
- /*
- * Wait for the child to be ready to exec.
- */
- close(child_ready_pipe[1]);
- close(go_pipe[0]);
- if (read(child_ready_pipe[0], &buf, 1) == -1)
- perror("unable to read pipe");
- close(child_ready_pipe[0]);
+ l = strlen(counter->unit);
+ if (l > unit_width)
+ unit_width = l;
}
- for (counter = 0; counter < nr_counters; counter++)
- create_perf_stat_counter(counter, pid);
+ if (perf_evlist__apply_filters(evsel_list)) {
+ error("failed to set filter with %d (%s)\n", errno,
+ strerror(errno));
+ return -1;
+ }
/*
* Enable counters and exec the command:
*/
t0 = rdclock();
+ clock_gettime(CLOCK_MONOTONIC, &ref_time);
if (forks) {
- close(go_pipe[1]);
+ perf_evlist__start_workload(evsel_list);
+ handle_initial_delay();
+
+ if (interval) {
+ while (!waitpid(child_pid, &status, WNOHANG)) {
+ nanosleep(&ts, NULL);
+ print_interval();
+ }
+ }
wait(&status);
+
+ if (workload_exec_errno) {
+ const char *emsg = strerror_r(workload_exec_errno, msg, sizeof(msg));
+ pr_err("Workload failed: %s\n", emsg);
+ return -1;
+ }
+
+ if (WIFSIGNALED(status))
+ psignal(WTERMSIG(status), argv[0]);
} else {
- while(!done);
+ handle_initial_delay();
+ while (!done) {
+ nanosleep(&ts, NULL);
+ if (interval)
+ print_interval();
+ }
}
t1 = rdclock();
update_stats(&walltime_nsecs_stats, t1 - t0);
- for (counter = 0; counter < nr_counters; counter++)
- read_counter(counter);
+ if (aggr_mode == AGGR_GLOBAL) {
+ evlist__for_each(evsel_list, counter) {
+ read_counter_aggr(counter);
+ perf_evsel__close_fd(counter, perf_evsel__nr_cpus(counter),
+ thread_map__nr(evsel_list->threads));
+ }
+ } else {
+ evlist__for_each(evsel_list, counter) {
+ read_counter(counter);
+ perf_evsel__close_fd(counter, perf_evsel__nr_cpus(counter), 1);
+ }
+ }
return WEXITSTATUS(status);
}
-static void print_noise(int counter, double avg)
+static int run_perf_stat(int argc, const char **argv)
+{
+ int ret;
+
+ if (pre_cmd) {
+ ret = system(pre_cmd);
+ if (ret)
+ return ret;
+ }
+
+ if (sync_run)
+ sync();
+
+ ret = __run_perf_stat(argc, argv);
+ if (ret)
+ return ret;
+
+ if (post_cmd) {
+ ret = system(post_cmd);
+ if (ret)
+ return ret;
+ }
+
+ return ret;
+}
+
+static void print_noise_pct(double total, double avg)
{
+ double pct = rel_stddev_stats(total, avg);
+
+ if (csv_output)
+ fprintf(output, "%s%.2f%%", csv_sep, pct);
+ else if (pct)
+ fprintf(output, " ( +-%6.2f%% )", pct);
+}
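A worked example of the noise annotation, assuming rel_stddev_stats() returns 100 * stddev / avg:

	/* stddev = 12.3, avg = 1000  ->  pct = 1.23  ->  " ( +-  1.23% )" */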
+
+static void print_noise(struct perf_evsel *evsel, double avg)
+{
+ struct perf_stat *ps;
+
if (run_count == 1)
return;
- fprintf(stderr, " ( +- %7.3f%% )",
- 100 * stddev_stats(&event_res_stats[counter][0]) / avg);
+ ps = evsel->priv;
+ print_noise_pct(stddev_stats(&ps->res_stats[0]), avg);
+}
+
+static void aggr_printout(struct perf_evsel *evsel, int id, int nr)
+{
+ switch (aggr_mode) {
+ case AGGR_CORE:
+ fprintf(output, "S%d-C%*d%s%*d%s",
+ cpu_map__id_to_socket(id),
+ csv_output ? 0 : -8,
+ cpu_map__id_to_cpu(id),
+ csv_sep,
+ csv_output ? 0 : 4,
+ nr,
+ csv_sep);
+ break;
+ case AGGR_SOCKET:
+ fprintf(output, "S%*d%s%*d%s",
+ csv_output ? 0 : -5,
+ id,
+ csv_sep,
+ csv_output ? 0 : 4,
+ nr,
+ csv_sep);
+ break;
+ case AGGR_NONE:
+ fprintf(output, "CPU%*d%s",
+ csv_output ? 0 : -4,
+ perf_evsel__cpus(evsel)->map[id], csv_sep);
+ break;
+ case AGGR_GLOBAL:
+ default:
+ break;
+ }
}
-static void nsec_printout(int counter, double avg)
+static void nsec_printout(int cpu, int nr, struct perf_evsel *evsel, double avg)
{
double msecs = avg / 1e6;
+ const char *fmt_v, *fmt_n;
+ char name[25];
- fprintf(stderr, " %14.6f %-24s", msecs, event_name(counter));
+ fmt_v = csv_output ? "%.6f%s" : "%18.6f%s";
+ fmt_n = csv_output ? "%s" : "%-25s";
- if (MATCH_EVENT(SOFTWARE, SW_TASK_CLOCK, counter)) {
- fprintf(stderr, " # %10.3f CPUs ",
- avg / avg_stats(&walltime_nsecs_stats));
- }
+ aggr_printout(evsel, cpu, nr);
+
+ scnprintf(name, sizeof(name), "%s%s",
+ perf_evsel__name(evsel), csv_output ? "" : " (msec)");
+
+ fprintf(output, fmt_v, msecs, csv_sep);
+
+ if (csv_output)
+ fprintf(output, "%s%s", evsel->unit, csv_sep);
+ else
+ fprintf(output, "%-*s%s", unit_width, evsel->unit, csv_sep);
+
+ fprintf(output, fmt_n, name);
+
+ if (evsel->cgrp)
+ fprintf(output, "%s%s", csv_sep, evsel->cgrp->name);
+
+ if (csv_output || interval)
+ return;
+
+ if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK))
+ fprintf(output, " # %8.3f CPUs utilized ",
+ avg / avg_stats(&walltime_nsecs_stats));
+ else
+ fprintf(output, " ");
+}
+
+/* used for get_ratio_color() */
+enum grc_type {
+ GRC_STALLED_CYCLES_FE,
+ GRC_STALLED_CYCLES_BE,
+ GRC_CACHE_MISSES,
+ GRC_MAX_NR
+};
+
+static const char *get_ratio_color(enum grc_type type, double ratio)
+{
+ static const double grc_table[GRC_MAX_NR][3] = {
+ [GRC_STALLED_CYCLES_FE] = { 50.0, 30.0, 10.0 },
+ [GRC_STALLED_CYCLES_BE] = { 75.0, 50.0, 20.0 },
+ [GRC_CACHE_MISSES] = { 20.0, 10.0, 5.0 },
+ };
+ const char *color = PERF_COLOR_NORMAL;
+
+ if (ratio > grc_table[type][0])
+ color = PERF_COLOR_RED;
+ else if (ratio > grc_table[type][1])
+ color = PERF_COLOR_MAGENTA;
+ else if (ratio > grc_table[type][2])
+ color = PERF_COLOR_YELLOW;
+
+ return color;
}
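A worked example of the thresholds for the cache-miss style ratios:

	/* GRC_CACHE_MISSES: 25% -> red, 12% -> magenta, 7% -> yellow, 3% -> normal */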
-static void abs_printout(int counter, double avg)
+static void print_stalled_cycles_frontend(int cpu,
+ struct perf_evsel *evsel
+ __maybe_unused, double avg)
{
double total, ratio = 0.0;
+ const char *color;
- fprintf(stderr, " %14.0f %-24s", avg, event_name(counter));
+ total = avg_stats(&runtime_cycles_stats[cpu]);
- if (MATCH_EVENT(HARDWARE, HW_INSTRUCTIONS, counter)) {
- total = avg_stats(&runtime_cycles_stats);
+ if (total)
+ ratio = avg / total * 100.0;
- if (total)
+ color = get_ratio_color(GRC_STALLED_CYCLES_FE, ratio);
+
+ fprintf(output, " # ");
+ color_fprintf(output, color, "%6.2f%%", ratio);
+ fprintf(output, " frontend cycles idle ");
+}
+
+static void print_stalled_cycles_backend(int cpu,
+ struct perf_evsel *evsel
+ __maybe_unused, double avg)
+{
+ double total, ratio = 0.0;
+ const char *color;
+
+ total = avg_stats(&runtime_cycles_stats[cpu]);
+
+ if (total)
+ ratio = avg / total * 100.0;
+
+ color = get_ratio_color(GRC_STALLED_CYCLES_BE, ratio);
+
+ fprintf(output, " # ");
+ color_fprintf(output, color, "%6.2f%%", ratio);
+ fprintf(output, " backend cycles idle ");
+}
+
+static void print_branch_misses(int cpu,
+ struct perf_evsel *evsel __maybe_unused,
+ double avg)
+{
+ double total, ratio = 0.0;
+ const char *color;
+
+ total = avg_stats(&runtime_branches_stats[cpu]);
+
+ if (total)
+ ratio = avg / total * 100.0;
+
+ color = get_ratio_color(GRC_CACHE_MISSES, ratio);
+
+ fprintf(output, " # ");
+ color_fprintf(output, color, "%6.2f%%", ratio);
+ fprintf(output, " of all branches ");
+}
+
+static void print_l1_dcache_misses(int cpu,
+ struct perf_evsel *evsel __maybe_unused,
+ double avg)
+{
+ double total, ratio = 0.0;
+ const char *color;
+
+ total = avg_stats(&runtime_l1_dcache_stats[cpu]);
+
+ if (total)
+ ratio = avg / total * 100.0;
+
+ color = get_ratio_color(GRC_CACHE_MISSES, ratio);
+
+ fprintf(output, " # ");
+ color_fprintf(output, color, "%6.2f%%", ratio);
+ fprintf(output, " of all L1-dcache hits ");
+}
+
+static void print_l1_icache_misses(int cpu,
+ struct perf_evsel *evsel __maybe_unused,
+ double avg)
+{
+ double total, ratio = 0.0;
+ const char *color;
+
+ total = avg_stats(&runtime_l1_icache_stats[cpu]);
+
+ if (total)
+ ratio = avg / total * 100.0;
+
+ color = get_ratio_color(GRC_CACHE_MISSES, ratio);
+
+ fprintf(output, " # ");
+ color_fprintf(output, color, "%6.2f%%", ratio);
+ fprintf(output, " of all L1-icache hits ");
+}
+
+static void print_dtlb_cache_misses(int cpu,
+ struct perf_evsel *evsel __maybe_unused,
+ double avg)
+{
+ double total, ratio = 0.0;
+ const char *color;
+
+ total = avg_stats(&runtime_dtlb_cache_stats[cpu]);
+
+ if (total)
+ ratio = avg / total * 100.0;
+
+ color = get_ratio_color(GRC_CACHE_MISSES, ratio);
+
+ fprintf(output, " # ");
+ color_fprintf(output, color, "%6.2f%%", ratio);
+ fprintf(output, " of all dTLB cache hits ");
+}
+
+static void print_itlb_cache_misses(int cpu,
+ struct perf_evsel *evsel __maybe_unused,
+ double avg)
+{
+ double total, ratio = 0.0;
+ const char *color;
+
+ total = avg_stats(&runtime_itlb_cache_stats[cpu]);
+
+ if (total)
+ ratio = avg / total * 100.0;
+
+ color = get_ratio_color(GRC_CACHE_MISSES, ratio);
+
+ fprintf(output, " # ");
+ color_fprintf(output, color, "%6.2f%%", ratio);
+ fprintf(output, " of all iTLB cache hits ");
+}
+
+static void print_ll_cache_misses(int cpu,
+ struct perf_evsel *evsel __maybe_unused,
+ double avg)
+{
+ double total, ratio = 0.0;
+ const char *color;
+
+ total = avg_stats(&runtime_ll_cache_stats[cpu]);
+
+ if (total)
+ ratio = avg / total * 100.0;
+
+ color = get_ratio_color(GRC_CACHE_MISSES, ratio);
+
+ fprintf(output, " # ");
+ color_fprintf(output, color, "%6.2f%%", ratio);
+ fprintf(output, " of all LL-cache hits ");
+}
+
+static void abs_printout(int cpu, int nr, struct perf_evsel *evsel, double avg)
+{
+ double total, ratio = 0.0, total2;
+ double sc = evsel->scale;
+ const char *fmt;
+
+ if (csv_output) {
+ fmt = sc != 1.0 ? "%.2f%s" : "%.0f%s";
+ } else {
+ if (big_num)
+ fmt = sc != 1.0 ? "%'18.2f%s" : "%'18.0f%s";
+ else
+ fmt = sc != 1.0 ? "%18.2f%s" : "%18.0f%s";
+ }
+
+ aggr_printout(evsel, cpu, nr);
+
+ if (aggr_mode == AGGR_GLOBAL)
+ cpu = 0;
+
+ fprintf(output, fmt, avg, csv_sep);
+
+ if (evsel->unit)
+ fprintf(output, "%-*s%s",
+ csv_output ? 0 : unit_width,
+ evsel->unit, csv_sep);
+
+ fprintf(output, "%-*s", csv_output ? 0 : 25, perf_evsel__name(evsel));
+
+ if (evsel->cgrp)
+ fprintf(output, "%s%s", csv_sep, evsel->cgrp->name);
+
+ if (csv_output || interval)
+ return;
+
+ if (perf_evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) {
+ total = avg_stats(&runtime_cycles_stats[cpu]);
+ if (total) {
ratio = avg / total;
+ fprintf(output, " # %5.2f insns per cycle ", ratio);
+ }
+ total = avg_stats(&runtime_stalled_cycles_front_stats[cpu]);
+ total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[cpu]));
+
+ if (total && avg) {
+ ratio = total / avg;
+ fprintf(output, "\n");
+ if (aggr_mode == AGGR_NONE)
+ fprintf(output, " ");
+ fprintf(output, " # %5.2f stalled cycles per insn", ratio);
+ }
- fprintf(stderr, " # %10.3f IPC ", ratio);
- } else if (MATCH_EVENT(HARDWARE, HW_BRANCH_MISSES, counter) &&
- runtime_branches_stats.n != 0) {
- total = avg_stats(&runtime_branches_stats);
+ } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) &&
+ runtime_branches_stats[cpu].n != 0) {
+ print_branch_misses(cpu, evsel, avg);
+ } else if (
+ evsel->attr.type == PERF_TYPE_HW_CACHE &&
+ evsel->attr.config == ( PERF_COUNT_HW_CACHE_L1D |
+ ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
+ ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
+ runtime_l1_dcache_stats[cpu].n != 0) {
+ print_l1_dcache_misses(cpu, evsel, avg);
+ } else if (
+ evsel->attr.type == PERF_TYPE_HW_CACHE &&
+ evsel->attr.config == ( PERF_COUNT_HW_CACHE_L1I |
+ ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
+ ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
+ runtime_l1_icache_stats[cpu].n != 0) {
+ print_l1_icache_misses(cpu, evsel, avg);
+ } else if (
+ evsel->attr.type == PERF_TYPE_HW_CACHE &&
+ evsel->attr.config == ( PERF_COUNT_HW_CACHE_DTLB |
+ ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
+ ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
+ runtime_dtlb_cache_stats[cpu].n != 0) {
+ print_dtlb_cache_misses(cpu, evsel, avg);
+ } else if (
+ evsel->attr.type == PERF_TYPE_HW_CACHE &&
+ evsel->attr.config == ( PERF_COUNT_HW_CACHE_ITLB |
+ ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
+ ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
+ runtime_itlb_cache_stats[cpu].n != 0) {
+ print_itlb_cache_misses(cpu, evsel, avg);
+ } else if (
+ evsel->attr.type == PERF_TYPE_HW_CACHE &&
+ evsel->attr.config == ( PERF_COUNT_HW_CACHE_LL |
+ ((PERF_COUNT_HW_CACHE_OP_READ) << 8) |
+ ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) &&
+ runtime_ll_cache_stats[cpu].n != 0) {
+ print_ll_cache_misses(cpu, evsel, avg);
+ } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) &&
+ runtime_cacherefs_stats[cpu].n != 0) {
+ total = avg_stats(&runtime_cacherefs_stats[cpu]);
if (total)
ratio = avg * 100 / total;
- fprintf(stderr, " # %10.3f %% ", ratio);
+ fprintf(output, " # %8.3f %% of all cache refs ", ratio);
+
+ } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) {
+ print_stalled_cycles_frontend(cpu, evsel, avg);
+ } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) {
+ print_stalled_cycles_backend(cpu, evsel, avg);
+ } else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) {
+ total = avg_stats(&runtime_nsecs_stats[cpu]);
+
+ if (total) {
+ ratio = avg / total;
+ fprintf(output, " # %8.3f GHz ", ratio);
+ }
+ } else if (transaction_run &&
+ perf_evsel__cmp(evsel, nth_evsel(T_CYCLES_IN_TX))) {
+ total = avg_stats(&runtime_cycles_stats[cpu]);
+ if (total)
+ fprintf(output,
+ " # %5.2f%% transactional cycles ",
+ 100.0 * (avg / total));
+ } else if (transaction_run &&
+ perf_evsel__cmp(evsel, nth_evsel(T_CYCLES_IN_TX_CP))) {
+ total = avg_stats(&runtime_cycles_stats[cpu]);
+ total2 = avg_stats(&runtime_cycles_in_tx_stats[cpu]);
+ if (total2 < avg)
+ total2 = avg;
+ if (total)
+ fprintf(output,
+ " # %5.2f%% aborted cycles ",
+ 100.0 * ((total2-avg) / total));
+ } else if (transaction_run &&
+ perf_evsel__cmp(evsel, nth_evsel(T_TRANSACTION_START)) &&
+ avg > 0 &&
+ runtime_cycles_in_tx_stats[cpu].n != 0) {
+ total = avg_stats(&runtime_cycles_in_tx_stats[cpu]);
+
+ if (total)
+ ratio = total / avg;
+
+ fprintf(output, " # %8.0f cycles / transaction ", ratio);
+ } else if (transaction_run &&
+ perf_evsel__cmp(evsel, nth_evsel(T_ELISION_START)) &&
+ avg > 0 &&
+ runtime_cycles_in_tx_stats[cpu].n != 0) {
+ total = avg_stats(&runtime_cycles_in_tx_stats[cpu]);
- } else if (runtime_nsecs_stats.n != 0) {
- total = avg_stats(&runtime_nsecs_stats);
+ if (total)
+ ratio = total / avg;
+
+ fprintf(output, " # %8.0f cycles / elision ", ratio);
+ } else if (runtime_nsecs_stats[cpu].n != 0) {
+ char unit = 'M';
+
+ total = avg_stats(&runtime_nsecs_stats[cpu]);
if (total)
ratio = 1000.0 * avg / total;
+ if (ratio < 0.001) {
+ ratio *= 1000;
+ unit = 'K';
+ }
+
+ fprintf(output, " # %8.3f %c/sec ", ratio, unit);
+ } else {
+ fprintf(output, " ");
+ }
+}
+
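abs_printout() derives its human-readable annotations from simple ratios of the accumulated statistics: instructions/cycles gives insns per cycle, cycles/nanoseconds gives GHz, and 1000 * count / nanoseconds gives millions of events per second. A standalone sketch of that arithmetic with invented numbers (only the formulas mirror the code above):

    #include <stdio.h>

    int main(void)
    {
            double cycles = 3.2e9, insns = 4.1e9, nsecs = 1.0e9, faults = 2.5e6;

            printf(" # %5.2f  insns per cycle\n", insns / cycles);
            printf(" # %8.3f GHz\n", cycles / nsecs);            /* cycles per nanosecond */
            printf(" # %8.3f M/sec\n", 1000.0 * faults / nsecs); /* millions of events per second */
            return 0;
    }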
+static void print_aggr(char *prefix)
+{
+ struct perf_evsel *counter;
+ int cpu, cpu2, s, s2, id, nr;
+ double uval;
+ u64 ena, run, val;
+
+ if (!(aggr_map || aggr_get_id))
+ return;
- fprintf(stderr, " # %10.3f M/sec", ratio);
+ for (s = 0; s < aggr_map->nr; s++) {
+ id = aggr_map->map[s];
+ evlist__for_each(evsel_list, counter) {
+ val = ena = run = 0;
+ nr = 0;
+ for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) {
+ cpu2 = perf_evsel__cpus(counter)->map[cpu];
+ s2 = aggr_get_id(evsel_list->cpus, cpu2);
+ if (s2 != id)
+ continue;
+ val += counter->counts->cpu[cpu].val;
+ ena += counter->counts->cpu[cpu].ena;
+ run += counter->counts->cpu[cpu].run;
+ nr++;
+ }
+ if (prefix)
+ fprintf(output, "%s", prefix);
+
+ if (run == 0 || ena == 0) {
+ aggr_printout(counter, id, nr);
+
+ fprintf(output, "%*s%s",
+ csv_output ? 0 : 18,
+ counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED,
+ csv_sep);
+
+ fprintf(output, "%-*s%s",
+ csv_output ? 0 : unit_width,
+ counter->unit, csv_sep);
+
+ fprintf(output, "%*s",
+ csv_output ? 0 : -25,
+ perf_evsel__name(counter));
+
+ if (counter->cgrp)
+ fprintf(output, "%s%s",
+ csv_sep, counter->cgrp->name);
+
+ fputc('\n', output);
+ continue;
+ }
+ uval = val * counter->scale;
+
+ if (nsec_counter(counter))
+ nsec_printout(id, nr, counter, uval);
+ else
+ abs_printout(id, nr, counter, uval);
+
+ if (!csv_output) {
+ print_noise(counter, 1.0);
+
+ if (run != ena)
+ fprintf(output, " (%.2f%%)",
+ 100.0 * run / ena);
+ }
+ fputc('\n', output);
+ }
}
}
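print_aggr() folds the per-CPU counts into one output line per aggregation unit by comparing each CPU's socket/core id (from aggr_get_id) against the id being printed. A toy standalone version of that fold, with a made-up two-CPUs-per-socket mapping standing in for the real cpu_map lookup:

    #include <stdio.h>

    #define NR_CPUS 4

    /* Hypothetical stand-in for aggr_get_id(): two CPUs per "socket". */
    static int toy_aggr_id(int cpu) { return cpu / 2; }

    int main(void)
    {
            unsigned long long val[NR_CPUS] = { 100, 120, 300, 280 };
            int s, cpu;

            for (s = 0; s < 2; s++) {       /* one line per aggregation id */
                    unsigned long long sum = 0;
                    int nr = 0;

                    for (cpu = 0; cpu < NR_CPUS; cpu++) {
                            if (toy_aggr_id(cpu) != s)
                                    continue;
                            sum += val[cpu];
                            nr++;
                    }
                    printf("S%d %d CPUs: %llu\n", s, nr, sum);
            }
            return 0;
    }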
/*
* Print out the results of a single counter:
+ * aggregated counts in system-wide mode
*/
-static void print_counter(int counter)
+static void print_counter_aggr(struct perf_evsel *counter, char *prefix)
{
- double avg = avg_stats(&event_res_stats[counter][0]);
- int scaled = event_scaled[counter];
+ struct perf_stat *ps = counter->priv;
+ double avg = avg_stats(&ps->res_stats[0]);
+ int scaled = counter->counts->scaled;
+ double uval;
+
+ if (prefix)
+ fprintf(output, "%s", prefix);
if (scaled == -1) {
- fprintf(stderr, " %14s %-24s\n",
- "<not counted>", event_name(counter));
+ fprintf(output, "%*s%s",
+ csv_output ? 0 : 18,
+ counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED,
+ csv_sep);
+ fprintf(output, "%-*s%s",
+ csv_output ? 0 : unit_width,
+ counter->unit, csv_sep);
+ fprintf(output, "%*s",
+ csv_output ? 0 : -25,
+ perf_evsel__name(counter));
+
+ if (counter->cgrp)
+ fprintf(output, "%s%s", csv_sep, counter->cgrp->name);
+
+ fputc('\n', output);
return;
}
+ uval = avg * counter->scale;
+
if (nsec_counter(counter))
- nsec_printout(counter, avg);
+ nsec_printout(-1, 0, counter, uval);
else
- abs_printout(counter, avg);
+ abs_printout(-1, 0, counter, uval);
print_noise(counter, avg);
+ if (csv_output) {
+ fputc('\n', output);
+ return;
+ }
+
if (scaled) {
double avg_enabled, avg_running;
- avg_enabled = avg_stats(&event_res_stats[counter][1]);
- avg_running = avg_stats(&event_res_stats[counter][2]);
+ avg_enabled = avg_stats(&ps->res_stats[1]);
+ avg_running = avg_stats(&ps->res_stats[2]);
- fprintf(stderr, " (scaled from %.2f%%)",
- 100 * avg_running / avg_enabled);
+ fprintf(output, " [%5.2f%%]", 100 * avg_running / avg_enabled);
}
+ fprintf(output, "\n");
+}
+
+/*
+ * Print out the results of a single counter:
+ *			does not use aggregated counts in system-wide mode
+ */
+static void print_counter(struct perf_evsel *counter, char *prefix)
+{
+ u64 ena, run, val;
+ double uval;
+ int cpu;
+
+ for (cpu = 0; cpu < perf_evsel__nr_cpus(counter); cpu++) {
+ val = counter->counts->cpu[cpu].val;
+ ena = counter->counts->cpu[cpu].ena;
+ run = counter->counts->cpu[cpu].run;
+
+ if (prefix)
+ fprintf(output, "%s", prefix);
+
+ if (run == 0 || ena == 0) {
+ fprintf(output, "CPU%*d%s%*s%s",
+ csv_output ? 0 : -4,
+ perf_evsel__cpus(counter)->map[cpu], csv_sep,
+ csv_output ? 0 : 18,
+ counter->supported ? CNTR_NOT_COUNTED : CNTR_NOT_SUPPORTED,
+ csv_sep);
+
+ fprintf(output, "%-*s%s",
+ csv_output ? 0 : unit_width,
+ counter->unit, csv_sep);
+
+ fprintf(output, "%*s",
+ csv_output ? 0 : -25,
+ perf_evsel__name(counter));
+
+ if (counter->cgrp)
+ fprintf(output, "%s%s",
+ csv_sep, counter->cgrp->name);
+
+ fputc('\n', output);
+ continue;
+ }
+
+ uval = val * counter->scale;
- fprintf(stderr, "\n");
+ if (nsec_counter(counter))
+ nsec_printout(cpu, 0, counter, uval);
+ else
+ abs_printout(cpu, 0, counter, uval);
+
+ if (!csv_output) {
+ print_noise(counter, 1.0);
+
+ if (run != ena)
+ fprintf(output, " (%.2f%%)",
+ 100.0 * run / ena);
+ }
+ fputc('\n', output);
+ }
}
static void print_stat(int argc, const char **argv)
{
- int i, counter;
+ struct perf_evsel *counter;
+ int i;
fflush(stdout);
- fprintf(stderr, "\n");
- fprintf(stderr, " Performance counter stats for ");
- if(target_pid == -1) {
- fprintf(stderr, "\'%s", argv[0]);
- for (i = 1; i < argc; i++)
- fprintf(stderr, " %s", argv[i]);
- }else
- fprintf(stderr, "task pid \'%d", target_pid);
-
- fprintf(stderr, "\'");
- if (run_count > 1)
- fprintf(stderr, " (%d runs)", run_count);
- fprintf(stderr, ":\n\n");
+ if (!csv_output) {
+ fprintf(output, "\n");
+ fprintf(output, " Performance counter stats for ");
+ if (target.system_wide)
+ fprintf(output, "\'system wide");
+ else if (target.cpu_list)
+ fprintf(output, "\'CPU(s) %s", target.cpu_list);
+ else if (!target__has_task(&target)) {
+ fprintf(output, "\'%s", argv[0]);
+ for (i = 1; i < argc; i++)
+ fprintf(output, " %s", argv[i]);
+ } else if (target.pid)
+ fprintf(output, "process id \'%s", target.pid);
+ else
+ fprintf(output, "thread id \'%s", target.tid);
+
+ fprintf(output, "\'");
+ if (run_count > 1)
+ fprintf(output, " (%d runs)", run_count);
+ fprintf(output, ":\n\n");
+ }
- for (counter = 0; counter < nr_counters; counter++)
- print_counter(counter);
+ switch (aggr_mode) {
+ case AGGR_CORE:
+ case AGGR_SOCKET:
+ print_aggr(NULL);
+ break;
+ case AGGR_GLOBAL:
+ evlist__for_each(evsel_list, counter)
+ print_counter_aggr(counter, NULL);
+ break;
+ case AGGR_NONE:
+ evlist__for_each(evsel_list, counter)
+ print_counter(counter, NULL);
+ break;
+ default:
+ break;
+ }
- fprintf(stderr, "\n");
- fprintf(stderr, " %14.9f seconds time elapsed",
- avg_stats(&walltime_nsecs_stats)/1e9);
- if (run_count > 1) {
- fprintf(stderr, " ( +- %7.3f%% )",
- 100*stddev_stats(&walltime_nsecs_stats) /
- avg_stats(&walltime_nsecs_stats));
+ if (!csv_output) {
+ if (!null_run)
+ fprintf(output, "\n");
+ fprintf(output, " %17.9f seconds time elapsed",
+ avg_stats(&walltime_nsecs_stats)/1e9);
+ if (run_count > 1) {
+ fprintf(output, " ");
+ print_noise_pct(stddev_stats(&walltime_nsecs_stats),
+ avg_stats(&walltime_nsecs_stats));
+ }
+ fprintf(output, "\n\n");
}
- fprintf(stderr, "\n\n");
}
static volatile int signr = -1;
static void skip_signal(int signo)
{
- if(target_pid != -1)
+ if ((child_pid == -1) || interval)
done = 1;
signr = signo;
+	/*
+	 * Render child_pid harmless so that, in case of a race with
+	 * fast PID recycling, we won't send SIGTERM to a random
+	 * process.
+	 */
+ child_pid = -1;
}
static void sig_atexit(void)
{
+ sigset_t set, oset;
+
+	/*
+	 * Avoid a race with the SIGCHLD handler in skip_signal(),
+	 * which modifies child_pid; the goal is to avoid sending
+	 * SIGTERM to a random process.
+	 */
+ sigemptyset(&set);
+ sigaddset(&set, SIGCHLD);
+ sigprocmask(SIG_BLOCK, &set, &oset);
+
if (child_pid != -1)
kill(child_pid, SIGTERM);
+ sigprocmask(SIG_SETMASK, &oset, NULL);
+
if (signr == -1)
return;
@@ -477,53 +1403,400 @@ static void sig_atexit(void)
kill(getpid(), signr);
}
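The new skip_signal()/sig_atexit() comments describe a classic SIGCHLD race: the child can exit and its PID be recycled between the child_pid check and the kill(), so SIGCHLD is blocked around the check-and-kill. A minimal sketch of that idiom outside perf, using only standard POSIX calls:

    #include <sys/types.h>
    #include <signal.h>

    static pid_t child_pid = -1;

    /* Block SIGCHLD so its handler cannot clear child_pid
     * between the check and the kill(). */
    void kill_child_safely(void)
    {
            sigset_t set, oset;

            sigemptyset(&set);
            sigaddset(&set, SIGCHLD);
            sigprocmask(SIG_BLOCK, &set, &oset);

            if (child_pid != -1)
                    kill(child_pid, SIGTERM);

            sigprocmask(SIG_SETMASK, &oset, NULL);
    }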
-static const char * const stat_usage[] = {
- "perf stat [<options>] [<command>]",
- NULL
+static int stat__set_big_num(const struct option *opt __maybe_unused,
+ const char *s __maybe_unused, int unset)
+{
+ big_num_opt = unset ? 0 : 1;
+ return 0;
+}
+
+static int perf_stat_init_aggr_mode(void)
+{
+ switch (aggr_mode) {
+ case AGGR_SOCKET:
+ if (cpu_map__build_socket_map(evsel_list->cpus, &aggr_map)) {
+ perror("cannot build socket map");
+ return -1;
+ }
+ aggr_get_id = cpu_map__get_socket;
+ break;
+ case AGGR_CORE:
+ if (cpu_map__build_core_map(evsel_list->cpus, &aggr_map)) {
+ perror("cannot build core map");
+ return -1;
+ }
+ aggr_get_id = cpu_map__get_core;
+ break;
+ case AGGR_NONE:
+ case AGGR_GLOBAL:
+ default:
+ break;
+ }
+ return 0;
+}
+
+static int setup_events(const char * const *attrs, unsigned len)
+{
+ unsigned i;
+
+ for (i = 0; i < len; i++) {
+ if (parse_events(evsel_list, attrs[i]))
+ return -1;
+ }
+ return 0;
+}
+
+/*
+ * Add default attributes, if there were no attributes specified or
+ * if -d/--detailed, -d -d or -d -d -d is used:
+ */
+static int add_default_attributes(void)
+{
+ struct perf_event_attr default_attrs[] = {
+
+ { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK },
+ { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES },
+ { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS },
+ { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS },
+
+ { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES },
+ { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND },
+ { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
+ { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS },
+ { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
+ { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES },
+
+};
+
+/*
+ * Detailed stats (-d), covering the L1 and last level data caches:
+ */
+ struct perf_event_attr detailed_attrs[] = {
+
+ { .type = PERF_TYPE_HW_CACHE,
+ .config =
+ PERF_COUNT_HW_CACHE_L1D << 0 |
+ (PERF_COUNT_HW_CACHE_OP_READ << 8) |
+ (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },
+
+ { .type = PERF_TYPE_HW_CACHE,
+ .config =
+ PERF_COUNT_HW_CACHE_L1D << 0 |
+ (PERF_COUNT_HW_CACHE_OP_READ << 8) |
+ (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },
+
+ { .type = PERF_TYPE_HW_CACHE,
+ .config =
+ PERF_COUNT_HW_CACHE_LL << 0 |
+ (PERF_COUNT_HW_CACHE_OP_READ << 8) |
+ (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },
+
+ { .type = PERF_TYPE_HW_CACHE,
+ .config =
+ PERF_COUNT_HW_CACHE_LL << 0 |
+ (PERF_COUNT_HW_CACHE_OP_READ << 8) |
+ (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },
+};
+
+/*
+ * Very detailed stats (-d -d), covering the instruction cache and the TLB caches:
+ */
+ struct perf_event_attr very_detailed_attrs[] = {
+
+ { .type = PERF_TYPE_HW_CACHE,
+ .config =
+ PERF_COUNT_HW_CACHE_L1I << 0 |
+ (PERF_COUNT_HW_CACHE_OP_READ << 8) |
+ (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },
+
+ { .type = PERF_TYPE_HW_CACHE,
+ .config =
+ PERF_COUNT_HW_CACHE_L1I << 0 |
+ (PERF_COUNT_HW_CACHE_OP_READ << 8) |
+ (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },
+
+ { .type = PERF_TYPE_HW_CACHE,
+ .config =
+ PERF_COUNT_HW_CACHE_DTLB << 0 |
+ (PERF_COUNT_HW_CACHE_OP_READ << 8) |
+ (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },
+
+ { .type = PERF_TYPE_HW_CACHE,
+ .config =
+ PERF_COUNT_HW_CACHE_DTLB << 0 |
+ (PERF_COUNT_HW_CACHE_OP_READ << 8) |
+ (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },
+
+ { .type = PERF_TYPE_HW_CACHE,
+ .config =
+ PERF_COUNT_HW_CACHE_ITLB << 0 |
+ (PERF_COUNT_HW_CACHE_OP_READ << 8) |
+ (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },
+
+ { .type = PERF_TYPE_HW_CACHE,
+ .config =
+ PERF_COUNT_HW_CACHE_ITLB << 0 |
+ (PERF_COUNT_HW_CACHE_OP_READ << 8) |
+ (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },
+
+};
+
+/*
+ * Very, very detailed stats (-d -d -d), adding prefetch events:
+ */
+ struct perf_event_attr very_very_detailed_attrs[] = {
+
+ { .type = PERF_TYPE_HW_CACHE,
+ .config =
+ PERF_COUNT_HW_CACHE_L1D << 0 |
+ (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) |
+ (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) },
+
+ { .type = PERF_TYPE_HW_CACHE,
+ .config =
+ PERF_COUNT_HW_CACHE_L1D << 0 |
+ (PERF_COUNT_HW_CACHE_OP_PREFETCH << 8) |
+ (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) },
};
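Every PERF_TYPE_HW_CACHE attribute in the tables above uses the same config encoding documented in perf_event_open(2): the cache id in the low byte, the operation shifted left by 8, and the result shifted left by 16. A small helper (a sketch, not part of the patch) makes that explicit:

    #include <linux/perf_event.h>

    /* config = id | (op << 8) | (result << 16), per perf_event_open(2). */
    static inline __u64 hw_cache_config(__u64 id, __u64 op, __u64 result)
    {
            return id | (op << 8) | (result << 16);
    }

    /* e.g. the L1-dcache read-miss entry above is equivalent to:
     * hw_cache_config(PERF_COUNT_HW_CACHE_L1D,
     *                 PERF_COUNT_HW_CACHE_OP_READ,
     *                 PERF_COUNT_HW_CACHE_RESULT_MISS)
     */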
-static const struct option options[] = {
- OPT_CALLBACK('e', "event", NULL, "event",
+ /* Set attrs if no event is selected and !null_run: */
+ if (null_run)
+ return 0;
+
+ if (transaction_run) {
+ int err;
+ if (pmu_have_event("cpu", "cycles-ct") &&
+ pmu_have_event("cpu", "el-start"))
+ err = setup_events(transaction_attrs,
+ ARRAY_SIZE(transaction_attrs));
+ else
+ err = setup_events(transaction_limited_attrs,
+ ARRAY_SIZE(transaction_limited_attrs));
+ if (err < 0) {
+ fprintf(stderr, "Cannot set up transaction events\n");
+ return -1;
+ }
+ return 0;
+ }
+
+ if (!evsel_list->nr_entries) {
+ if (perf_evlist__add_default_attrs(evsel_list, default_attrs) < 0)
+ return -1;
+ }
+
+ /* Detailed events get appended to the event list: */
+
+ if (detailed_run < 1)
+ return 0;
+
+ /* Append detailed run extra attributes: */
+ if (perf_evlist__add_default_attrs(evsel_list, detailed_attrs) < 0)
+ return -1;
+
+ if (detailed_run < 2)
+ return 0;
+
+ /* Append very detailed run extra attributes: */
+ if (perf_evlist__add_default_attrs(evsel_list, very_detailed_attrs) < 0)
+ return -1;
+
+ if (detailed_run < 3)
+ return 0;
+
+ /* Append very, very detailed run extra attributes: */
+ return perf_evlist__add_default_attrs(evsel_list, very_very_detailed_attrs);
+}
+
+int cmd_stat(int argc, const char **argv, const char *prefix __maybe_unused)
+{
+ bool append_file = false;
+ int output_fd = 0;
+ const char *output_name = NULL;
+ const struct option options[] = {
+ OPT_BOOLEAN('T', "transaction", &transaction_run,
+ "hardware transaction statistics"),
+ OPT_CALLBACK('e', "event", &evsel_list, "event",
"event selector. use 'perf list' to list available events",
- parse_events),
- OPT_BOOLEAN('i', "inherit", &inherit,
- "child tasks inherit counters"),
- OPT_INTEGER('p', "pid", &target_pid,
- "stat events on existing pid"),
- OPT_BOOLEAN('a', "all-cpus", &system_wide,
+ parse_events_option),
+ OPT_CALLBACK(0, "filter", &evsel_list, "filter",
+ "event filter", parse_filter),
+ OPT_BOOLEAN('i', "no-inherit", &no_inherit,
+ "child tasks do not inherit counters"),
+ OPT_STRING('p', "pid", &target.pid, "pid",
+ "stat events on existing process id"),
+ OPT_STRING('t', "tid", &target.tid, "tid",
+ "stat events on existing thread id"),
+ OPT_BOOLEAN('a', "all-cpus", &target.system_wide,
"system-wide collection from all CPUs"),
- OPT_BOOLEAN('c', "scale", &scale,
- "scale/normalize counters"),
- OPT_BOOLEAN('v', "verbose", &verbose,
+ OPT_BOOLEAN('g', "group", &group,
+ "put the counters into a counter group"),
+ OPT_BOOLEAN('c', "scale", &scale, "scale/normalize counters"),
+ OPT_INCR('v', "verbose", &verbose,
"be more verbose (show counter open errors, etc)"),
OPT_INTEGER('r', "repeat", &run_count,
- "repeat command and print average + stddev (max: 100)"),
+ "repeat command and print average + stddev (max: 100, forever: 0)"),
OPT_BOOLEAN('n', "null", &null_run,
"null run - dont start any counters"),
+ OPT_INCR('d', "detailed", &detailed_run,
+ "detailed run - start a lot of events"),
+ OPT_BOOLEAN('S', "sync", &sync_run,
+ "call sync() before starting a run"),
+ OPT_CALLBACK_NOOPT('B', "big-num", NULL, NULL,
+ "print large numbers with thousands\' separators",
+ stat__set_big_num),
+ OPT_STRING('C', "cpu", &target.cpu_list, "cpu",
+ "list of cpus to monitor in system-wide"),
+ OPT_SET_UINT('A', "no-aggr", &aggr_mode,
+ "disable CPU count aggregation", AGGR_NONE),
+ OPT_STRING('x', "field-separator", &csv_sep, "separator",
+ "print counts with custom separator"),
+ OPT_CALLBACK('G', "cgroup", &evsel_list, "name",
+ "monitor event in cgroup name only", parse_cgroups),
+ OPT_STRING('o', "output", &output_name, "file", "output file name"),
+ OPT_BOOLEAN(0, "append", &append_file, "append to the output file"),
+ OPT_INTEGER(0, "log-fd", &output_fd,
+ "log output to fd, instead of stderr"),
+ OPT_STRING(0, "pre", &pre_cmd, "command",
+ "command to run prior to the measured command"),
+ OPT_STRING(0, "post", &post_cmd, "command",
+			"command to run after the measured command"),
+ OPT_UINTEGER('I', "interval-print", &interval,
+ "print counts at regular interval in ms (>= 100)"),
+ OPT_SET_UINT(0, "per-socket", &aggr_mode,
+ "aggregate counts per processor socket", AGGR_SOCKET),
+ OPT_SET_UINT(0, "per-core", &aggr_mode,
+ "aggregate counts per physical processor core", AGGR_CORE),
+ OPT_UINTEGER('D', "delay", &initial_delay,
+ "ms to wait before starting measurement after program start"),
OPT_END()
-};
+ };
+ const char * const stat_usage[] = {
+ "perf stat [<options>] [<command>]",
+ NULL
+ };
+ int status = -EINVAL, run_idx;
+ const char *mode;
-int cmd_stat(int argc, const char **argv, const char *prefix __used)
-{
- int status;
+ setlocale(LC_ALL, "");
+
+ evsel_list = perf_evlist__new();
+ if (evsel_list == NULL)
+ return -ENOMEM;
argc = parse_options(argc, argv, options, stat_usage,
PARSE_OPT_STOP_AT_NON_OPTION);
- if (!argc && target_pid == -1)
- usage_with_options(stat_usage, options);
- if (run_count <= 0)
+
+ output = stderr;
+ if (output_name && strcmp(output_name, "-"))
+ output = NULL;
+
+ if (output_name && output_fd) {
+ fprintf(stderr, "cannot use both --output and --log-fd\n");
+ parse_options_usage(stat_usage, options, "o", 1);
+ parse_options_usage(NULL, options, "log-fd", 0);
+ goto out;
+ }
+
+ if (output_fd < 0) {
+		fprintf(stderr, "argument to --log-fd must be > 0\n");
+ parse_options_usage(stat_usage, options, "log-fd", 0);
+ goto out;
+ }
+
+ if (!output) {
+ struct timespec tm;
+ mode = append_file ? "a" : "w";
+
+ output = fopen(output_name, mode);
+ if (!output) {
+ perror("failed to create output file");
+ return -1;
+ }
+ clock_gettime(CLOCK_REALTIME, &tm);
+ fprintf(output, "# started on %s\n", ctime(&tm.tv_sec));
+ } else if (output_fd > 0) {
+ mode = append_file ? "a" : "w";
+ output = fdopen(output_fd, mode);
+ if (!output) {
+ perror("Failed opening logfd");
+ return -errno;
+ }
+ }
+
+ if (csv_sep) {
+ csv_output = true;
+ if (!strcmp(csv_sep, "\\t"))
+ csv_sep = "\t";
+ } else
+ csv_sep = DEFAULT_SEPARATOR;
+
+ /*
+ * let the spreadsheet do the pretty-printing
+ */
+ if (csv_output) {
+ /* User explicitly passed -B? */
+ if (big_num_opt == 1) {
+ fprintf(stderr, "-B option not supported with -x\n");
+ parse_options_usage(stat_usage, options, "B", 1);
+ parse_options_usage(NULL, options, "x", 1);
+ goto out;
+ } else /* Nope, so disable big number formatting */
+ big_num = false;
+ } else if (big_num_opt == 0) /* User passed --no-big-num */
+ big_num = false;
+
+ if (!argc && target__none(&target))
usage_with_options(stat_usage, options);
- /* Set attrs and nr_counters if no event is selected and !null_run */
- if (!null_run && !nr_counters) {
- memcpy(attrs, default_attrs, sizeof(default_attrs));
- nr_counters = ARRAY_SIZE(default_attrs);
+ if (run_count < 0) {
+ pr_err("Run count must be a positive number\n");
+ parse_options_usage(stat_usage, options, "r", 1);
+ goto out;
+ } else if (run_count == 0) {
+ forever = true;
+ run_count = 1;
}
- if (system_wide)
- nr_cpus = read_cpu_map();
- else
- nr_cpus = 1;
+ /* no_aggr, cgroup are for system-wide only */
+ if ((aggr_mode != AGGR_GLOBAL || nr_cgroups) &&
+ !target__has_cpu(&target)) {
+		fprintf(stderr, "cgroup and no-aggregation "
+			"modes are only available in system-wide mode\n");
+
+ parse_options_usage(stat_usage, options, "G", 1);
+ parse_options_usage(NULL, options, "A", 1);
+ parse_options_usage(NULL, options, "a", 1);
+ goto out;
+ }
+
+ if (add_default_attributes())
+ goto out;
+
+ target__validate(&target);
+
+ if (perf_evlist__create_maps(evsel_list, &target) < 0) {
+ if (target__has_task(&target)) {
+			pr_err("Problem finding threads to monitor\n");
+ parse_options_usage(stat_usage, options, "p", 1);
+ parse_options_usage(NULL, options, "t", 1);
+ } else if (target__has_cpu(&target)) {
+ perror("failed to parse CPUs map");
+ parse_options_usage(stat_usage, options, "C", 1);
+ parse_options_usage(NULL, options, "a", 1);
+ }
+ goto out;
+ }
+ if (interval && interval < 100) {
+ pr_err("print interval must be >= 100ms\n");
+ parse_options_usage(stat_usage, options, "I", 1);
+ goto out;
+ }
+
+ if (perf_evlist__alloc_stats(evsel_list, interval))
+ goto out;
+
+ if (perf_stat_init_aggr_mode())
+ goto out;
/*
* We dont want to block the signals - that would cause
@@ -532,18 +1805,30 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
* task, but being ignored by perf stat itself:
*/
atexit(sig_atexit);
- signal(SIGINT, skip_signal);
+ if (!forever)
+ signal(SIGINT, skip_signal);
+ signal(SIGCHLD, skip_signal);
signal(SIGALRM, skip_signal);
signal(SIGABRT, skip_signal);
status = 0;
- for (run_idx = 0; run_idx < run_count; run_idx++) {
+ for (run_idx = 0; forever || run_idx < run_count; run_idx++) {
if (run_count != 1 && verbose)
- fprintf(stderr, "[ perf stat: executing run #%d ... ]\n", run_idx + 1);
+ fprintf(output, "[ perf stat: executing run #%d ... ]\n",
+ run_idx + 1);
+
status = run_perf_stat(argc, argv);
+ if (forever && status != -1) {
+ print_stat(argc, argv);
+ perf_stat__reset_stats(evsel_list);
+ }
}
- print_stat(argc, argv);
+ if (!forever && status != -1 && !interval)
+ print_stat(argc, argv);
+ perf_evlist__free_stats(evsel_list);
+out:
+ perf_evlist__delete(evsel_list);
return status;
}
diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c
index 0d4d8ff7914..74db2568b86 100644
--- a/tools/perf/builtin-timechart.c
+++ b/tools/perf/builtin-timechart.c
@@ -12,6 +12,8 @@
* of the License.
*/
+#include <traceevent/event-parse.h>
+
#include "builtin.h"
#include "util/util.h"
@@ -19,9 +21,10 @@
#include "util/color.h"
#include <linux/list.h>
#include "util/cache.h"
+#include "util/evlist.h"
+#include "util/evsel.h"
#include <linux/rbtree.h>
#include "util/symbol.h"
-#include "util/string.h"
#include "util/callchain.h"
#include "util/strlist.h"
@@ -32,28 +35,35 @@
#include "util/event.h"
#include "util/session.h"
#include "util/svghelper.h"
+#include "util/tool.h"
+#include "util/data.h"
-static char const *input_name = "perf.data";
-static char const *output_name = "output.svg";
-
-static unsigned int numcpus;
-static u64 min_freq; /* Lowest CPU frequency seen */
-static u64 max_freq; /* Highest CPU frequency seen */
-static u64 turbo_frequency;
-
-static u64 first_time, last_time;
-
-static int power_only;
-
+#define SUPPORT_OLD_POWER_EVENTS 1
+#define PWR_EVENT_EXIT -1
struct per_pid;
-struct per_pidcomm;
-
-struct cpu_sample;
struct power_event;
struct wake_event;
-struct sample_wrapper;
+struct timechart {
+ struct perf_tool tool;
+ struct per_pid *all_data;
+ struct power_event *power_events;
+ struct wake_event *wake_events;
+ int proc_num;
+ unsigned int numcpus;
+ u64 min_freq, /* Lowest CPU frequency seen */
+ max_freq, /* Highest CPU frequency seen */
+ turbo_frequency,
+ first_time, last_time;
+ bool power_only,
+ tasks_only,
+ with_backtrace,
+ topology;
+};
+
+struct per_pidcomm;
+struct cpu_sample;
/*
* Datastructure layout:
@@ -78,8 +88,6 @@ struct per_pid {
struct per_pidcomm *all;
struct per_pidcomm *current;
-
- int painted;
};
@@ -120,10 +128,9 @@ struct cpu_sample {
u64 end_time;
int type;
int cpu;
+ const char *backtrace;
};
-static struct per_pid *all_data;
-
#define CSTATE 1
#define PSTATE 2
@@ -141,15 +148,9 @@ struct wake_event {
int waker;
int wakee;
u64 time;
+ const char *backtrace;
};
-static struct power_event *power_events;
-static struct wake_event *wake_events;
-
-struct sample_wrapper *all_samples;
-
-
-struct process_filter;
struct process_filter {
char *name;
int pid;
@@ -159,29 +160,28 @@ struct process_filter {
static struct process_filter *process_filter;
-static struct per_pid *find_create_pid(int pid)
+static struct per_pid *find_create_pid(struct timechart *tchart, int pid)
{
- struct per_pid *cursor = all_data;
+ struct per_pid *cursor = tchart->all_data;
while (cursor) {
if (cursor->pid == pid)
return cursor;
cursor = cursor->next;
}
- cursor = malloc(sizeof(struct per_pid));
+ cursor = zalloc(sizeof(*cursor));
assert(cursor != NULL);
- memset(cursor, 0, sizeof(struct per_pid));
cursor->pid = pid;
- cursor->next = all_data;
- all_data = cursor;
+ cursor->next = tchart->all_data;
+ tchart->all_data = cursor;
return cursor;
}
-static void pid_set_comm(int pid, char *comm)
+static void pid_set_comm(struct timechart *tchart, int pid, char *comm)
{
struct per_pid *p;
struct per_pidcomm *c;
- p = find_create_pid(pid);
+ p = find_create_pid(tchart, pid);
c = p->all;
while (c) {
if (c->comm && strcmp(c->comm, comm) == 0) {
@@ -195,23 +195,22 @@ static void pid_set_comm(int pid, char *comm)
}
c = c->next;
}
- c = malloc(sizeof(struct per_pidcomm));
+ c = zalloc(sizeof(*c));
assert(c != NULL);
- memset(c, 0, sizeof(struct per_pidcomm));
c->comm = strdup(comm);
p->current = c;
c->next = p->all;
p->all = c;
}
-static void pid_fork(int pid, int ppid, u64 timestamp)
+static void pid_fork(struct timechart *tchart, int pid, int ppid, u64 timestamp)
{
struct per_pid *p, *pp;
- p = find_create_pid(pid);
- pp = find_create_pid(ppid);
+ p = find_create_pid(tchart, pid);
+ pp = find_create_pid(tchart, ppid);
p->ppid = ppid;
if (pp->current && pp->current->comm && !p->current)
- pid_set_comm(pid, pp->current->comm);
+ pid_set_comm(tchart, pid, pp->current->comm);
p->start_time = timestamp;
if (p->current) {
@@ -220,41 +219,41 @@ static void pid_fork(int pid, int ppid, u64 timestamp)
}
}
-static void pid_exit(int pid, u64 timestamp)
+static void pid_exit(struct timechart *tchart, int pid, u64 timestamp)
{
struct per_pid *p;
- p = find_create_pid(pid);
+ p = find_create_pid(tchart, pid);
p->end_time = timestamp;
if (p->current)
p->current->end_time = timestamp;
}
-static void
-pid_put_sample(int pid, int type, unsigned int cpu, u64 start, u64 end)
+static void pid_put_sample(struct timechart *tchart, int pid, int type,
+ unsigned int cpu, u64 start, u64 end,
+ const char *backtrace)
{
struct per_pid *p;
struct per_pidcomm *c;
struct cpu_sample *sample;
- p = find_create_pid(pid);
+ p = find_create_pid(tchart, pid);
c = p->current;
if (!c) {
- c = malloc(sizeof(struct per_pidcomm));
+ c = zalloc(sizeof(*c));
assert(c != NULL);
- memset(c, 0, sizeof(struct per_pidcomm));
p->current = c;
c->next = p->all;
p->all = c;
}
- sample = malloc(sizeof(struct cpu_sample));
+ sample = zalloc(sizeof(*sample));
assert(sample != NULL);
- memset(sample, 0, sizeof(struct cpu_sample));
sample->start_time = start;
sample->end_time = end;
sample->type = type;
sample->next = c->samples;
sample->cpu = cpu;
+ sample->backtrace = backtrace;
c->samples = sample;
if (sample->type == TYPE_RUNNING && end > start && start > 0) {
@@ -266,9 +265,6 @@ pid_put_sample(int pid, int type, unsigned int cpu, u64 start, u64 end)
c->start_time = start;
if (p->start_time == 0 || p->start_time > start)
p->start_time = start;
-
- if (cpu > numcpus)
- numcpus = cpu;
}
#define MAX_CPUS 4096
@@ -278,76 +274,39 @@ static int cpus_cstate_state[MAX_CPUS];
static u64 cpus_pstate_start_times[MAX_CPUS];
static u64 cpus_pstate_state[MAX_CPUS];
-static int process_comm_event(event_t *event, struct perf_session *session __used)
+static int process_comm_event(struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_sample *sample __maybe_unused,
+ struct machine *machine __maybe_unused)
{
- pid_set_comm(event->comm.tid, event->comm.comm);
+ struct timechart *tchart = container_of(tool, struct timechart, tool);
+ pid_set_comm(tchart, event->comm.tid, event->comm.comm);
return 0;
}
-static int process_fork_event(event_t *event, struct perf_session *session __used)
+static int process_fork_event(struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_sample *sample __maybe_unused,
+ struct machine *machine __maybe_unused)
{
- pid_fork(event->fork.pid, event->fork.ppid, event->fork.time);
+ struct timechart *tchart = container_of(tool, struct timechart, tool);
+ pid_fork(tchart, event->fork.pid, event->fork.ppid, event->fork.time);
return 0;
}
-static int process_exit_event(event_t *event, struct perf_session *session __used)
+static int process_exit_event(struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_sample *sample __maybe_unused,
+ struct machine *machine __maybe_unused)
{
- pid_exit(event->fork.pid, event->fork.time);
+ struct timechart *tchart = container_of(tool, struct timechart, tool);
+ pid_exit(tchart, event->fork.pid, event->fork.time);
return 0;
}
-struct trace_entry {
- unsigned short type;
- unsigned char flags;
- unsigned char preempt_count;
- int pid;
- int lock_depth;
-};
-
-struct power_entry {
- struct trace_entry te;
- s64 type;
- s64 value;
-};
-
-#define TASK_COMM_LEN 16
-struct wakeup_entry {
- struct trace_entry te;
- char comm[TASK_COMM_LEN];
- int pid;
- int prio;
- int success;
-};
-
-/*
- * trace_flag_type is an enumeration that holds different
- * states when a trace occurs. These are:
- * IRQS_OFF - interrupts were disabled
- * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags
- * NEED_RESCED - reschedule is requested
- * HARDIRQ - inside an interrupt handler
- * SOFTIRQ - inside a softirq handler
- */
-enum trace_flag_type {
- TRACE_FLAG_IRQS_OFF = 0x01,
- TRACE_FLAG_IRQS_NOSUPPORT = 0x02,
- TRACE_FLAG_NEED_RESCHED = 0x04,
- TRACE_FLAG_HARDIRQ = 0x08,
- TRACE_FLAG_SOFTIRQ = 0x10,
-};
-
-
-
-struct sched_switch {
- struct trace_entry te;
- char prev_comm[TASK_COMM_LEN];
- int prev_pid;
- int prev_prio;
- long prev_state; /* Arjan weeps. */
- char next_comm[TASK_COMM_LEN];
- int next_pid;
- int next_prio;
-};
+#ifdef SUPPORT_OLD_POWER_EVENTS
+static int use_old_power_events;
+#endif
static void c_state_start(int cpu, u64 timestamp, int state)
{
@@ -355,315 +314,387 @@ static void c_state_start(int cpu, u64 timestamp, int state)
cpus_cstate_state[cpu] = state;
}
-static void c_state_end(int cpu, u64 timestamp)
+static void c_state_end(struct timechart *tchart, int cpu, u64 timestamp)
{
- struct power_event *pwr;
- pwr = malloc(sizeof(struct power_event));
+ struct power_event *pwr = zalloc(sizeof(*pwr));
+
if (!pwr)
return;
- memset(pwr, 0, sizeof(struct power_event));
pwr->state = cpus_cstate_state[cpu];
pwr->start_time = cpus_cstate_start_times[cpu];
pwr->end_time = timestamp;
pwr->cpu = cpu;
pwr->type = CSTATE;
- pwr->next = power_events;
+ pwr->next = tchart->power_events;
- power_events = pwr;
+ tchart->power_events = pwr;
}
-static void p_state_change(int cpu, u64 timestamp, u64 new_freq)
+static void p_state_change(struct timechart *tchart, int cpu, u64 timestamp, u64 new_freq)
{
struct power_event *pwr;
- pwr = malloc(sizeof(struct power_event));
if (new_freq > 8000000) /* detect invalid data */
return;
+ pwr = zalloc(sizeof(*pwr));
if (!pwr)
return;
- memset(pwr, 0, sizeof(struct power_event));
pwr->state = cpus_pstate_state[cpu];
pwr->start_time = cpus_pstate_start_times[cpu];
pwr->end_time = timestamp;
pwr->cpu = cpu;
pwr->type = PSTATE;
- pwr->next = power_events;
+ pwr->next = tchart->power_events;
if (!pwr->start_time)
- pwr->start_time = first_time;
+ pwr->start_time = tchart->first_time;
- power_events = pwr;
+ tchart->power_events = pwr;
cpus_pstate_state[cpu] = new_freq;
cpus_pstate_start_times[cpu] = timestamp;
- if ((u64)new_freq > max_freq)
- max_freq = new_freq;
+ if ((u64)new_freq > tchart->max_freq)
+ tchart->max_freq = new_freq;
- if (new_freq < min_freq || min_freq == 0)
- min_freq = new_freq;
+ if (new_freq < tchart->min_freq || tchart->min_freq == 0)
+ tchart->min_freq = new_freq;
- if (new_freq == max_freq - 1000)
- turbo_frequency = max_freq;
+ if (new_freq == tchart->max_freq - 1000)
+ tchart->turbo_frequency = tchart->max_freq;
}
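p_state_change() also maintains the frequency bookkeeping: it tracks the minimum and maximum frequencies seen, and promotes the maximum to a turbo pseudo-state when a frequency exactly 1000 kHz below it shows up — the cpufreq convention this code appears to rely on. A toy standalone illustration of that logic with invented kHz values:

    #include <stdio.h>

    int main(void)
    {
            unsigned long long freqs[] = { 2401000, 1600000, 2400000, 800000 };
            unsigned long long min = 0, max = 0, turbo = 0;
            unsigned int i;

            for (i = 0; i < sizeof(freqs) / sizeof(freqs[0]); i++) {
                    unsigned long long f = freqs[i];

                    if (f > max)
                            max = f;
                    if (f < min || min == 0)
                            min = f;
                    if (f == max - 1000)    /* 2400000 follows 2401000: max was turbo */
                            turbo = max;
            }
            printf("min %llu kHz, max %llu kHz, turbo %llu kHz\n", min, max, turbo);
            return 0;
    }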
-static void
-sched_wakeup(int cpu, u64 timestamp, int pid, struct trace_entry *te)
+static void sched_wakeup(struct timechart *tchart, int cpu, u64 timestamp,
+ int waker, int wakee, u8 flags, const char *backtrace)
{
- struct wake_event *we;
struct per_pid *p;
- struct wakeup_entry *wake = (void *)te;
+ struct wake_event *we = zalloc(sizeof(*we));
- we = malloc(sizeof(struct wake_event));
if (!we)
return;
- memset(we, 0, sizeof(struct wake_event));
we->time = timestamp;
- we->waker = pid;
+ we->waker = waker;
+ we->backtrace = backtrace;
- if ((te->flags & TRACE_FLAG_HARDIRQ) || (te->flags & TRACE_FLAG_SOFTIRQ))
+ if ((flags & TRACE_FLAG_HARDIRQ) || (flags & TRACE_FLAG_SOFTIRQ))
we->waker = -1;
- we->wakee = wake->pid;
- we->next = wake_events;
- wake_events = we;
- p = find_create_pid(we->wakee);
+ we->wakee = wakee;
+ we->next = tchart->wake_events;
+ tchart->wake_events = we;
+ p = find_create_pid(tchart, we->wakee);
if (p && p->current && p->current->state == TYPE_NONE) {
p->current->state_since = timestamp;
p->current->state = TYPE_WAITING;
}
if (p && p->current && p->current->state == TYPE_BLOCKED) {
- pid_put_sample(p->pid, p->current->state, cpu, p->current->state_since, timestamp);
+ pid_put_sample(tchart, p->pid, p->current->state, cpu,
+ p->current->state_since, timestamp, NULL);
p->current->state_since = timestamp;
p->current->state = TYPE_WAITING;
}
}
-static void sched_switch(int cpu, u64 timestamp, struct trace_entry *te)
+static void sched_switch(struct timechart *tchart, int cpu, u64 timestamp,
+ int prev_pid, int next_pid, u64 prev_state,
+ const char *backtrace)
{
struct per_pid *p = NULL, *prev_p;
- struct sched_switch *sw = (void *)te;
+ prev_p = find_create_pid(tchart, prev_pid);
- prev_p = find_create_pid(sw->prev_pid);
-
- p = find_create_pid(sw->next_pid);
+ p = find_create_pid(tchart, next_pid);
if (prev_p->current && prev_p->current->state != TYPE_NONE)
- pid_put_sample(sw->prev_pid, TYPE_RUNNING, cpu, prev_p->current->state_since, timestamp);
+ pid_put_sample(tchart, prev_pid, TYPE_RUNNING, cpu,
+ prev_p->current->state_since, timestamp,
+ backtrace);
if (p && p->current) {
if (p->current->state != TYPE_NONE)
- pid_put_sample(sw->next_pid, p->current->state, cpu, p->current->state_since, timestamp);
+ pid_put_sample(tchart, next_pid, p->current->state, cpu,
+ p->current->state_since, timestamp,
+ backtrace);
- p->current->state_since = timestamp;
- p->current->state = TYPE_RUNNING;
+ p->current->state_since = timestamp;
+ p->current->state = TYPE_RUNNING;
}
if (prev_p->current) {
prev_p->current->state = TYPE_NONE;
prev_p->current->state_since = timestamp;
- if (sw->prev_state & 2)
+ if (prev_state & 2)
prev_p->current->state = TYPE_BLOCKED;
- if (sw->prev_state == 0)
+ if (prev_state == 0)
prev_p->current->state = TYPE_WAITING;
}
}
-
-static int process_sample_event(event_t *event, struct perf_session *session)
+static const char *cat_backtrace(union perf_event *event,
+ struct perf_sample *sample,
+ struct machine *machine)
{
- struct sample_data data;
- struct trace_entry *te;
-
- memset(&data, 0, sizeof(data));
+ struct addr_location al;
+ unsigned int i;
+ char *p = NULL;
+ size_t p_len;
+ u8 cpumode = PERF_RECORD_MISC_USER;
+ struct addr_location tal;
+ struct ip_callchain *chain = sample->callchain;
+ FILE *f = open_memstream(&p, &p_len);
+
+ if (!f) {
+ perror("open_memstream error");
+ return NULL;
+ }
- event__parse_sample(event, session->sample_type, &data);
+ if (!chain)
+ goto exit;
- if (session->sample_type & PERF_SAMPLE_TIME) {
- if (!first_time || first_time > data.time)
- first_time = data.time;
- if (last_time < data.time)
- last_time = data.time;
+ if (perf_event__preprocess_sample(event, machine, &al, sample) < 0) {
+ fprintf(stderr, "problem processing %d event, skipping it.\n",
+ event->header.type);
+ goto exit;
}
- te = (void *)data.raw_data;
- if (session->sample_type & PERF_SAMPLE_RAW && data.raw_size > 0) {
- char *event_str;
- struct power_entry *pe;
+ for (i = 0; i < chain->nr; i++) {
+ u64 ip;
- pe = (void *)te;
+ if (callchain_param.order == ORDER_CALLEE)
+ ip = chain->ips[i];
+ else
+ ip = chain->ips[chain->nr - i - 1];
+
+ if (ip >= PERF_CONTEXT_MAX) {
+ switch (ip) {
+ case PERF_CONTEXT_HV:
+ cpumode = PERF_RECORD_MISC_HYPERVISOR;
+ break;
+ case PERF_CONTEXT_KERNEL:
+ cpumode = PERF_RECORD_MISC_KERNEL;
+ break;
+ case PERF_CONTEXT_USER:
+ cpumode = PERF_RECORD_MISC_USER;
+ break;
+ default:
+ pr_debug("invalid callchain context: "
+ "%"PRId64"\n", (s64) ip);
+
+ /*
+ * It seems the callchain is corrupted.
+ * Discard all.
+ */
+ zfree(&p);
+ goto exit;
+ }
+ continue;
+ }
- event_str = perf_header__find_event(te->type);
+ tal.filtered = 0;
+ thread__find_addr_location(al.thread, machine, cpumode,
+ MAP__FUNCTION, ip, &tal);
- if (!event_str)
- return 0;
+ if (tal.sym)
+ fprintf(f, "..... %016" PRIx64 " %s\n", ip,
+ tal.sym->name);
+ else
+ fprintf(f, "..... %016" PRIx64 "\n", ip);
+ }
- if (strcmp(event_str, "power:power_start") == 0)
- c_state_start(data.cpu, data.time, pe->value);
+exit:
+ fclose(f);
- if (strcmp(event_str, "power:power_end") == 0)
- c_state_end(data.cpu, data.time);
+ return p;
+}
+
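cat_backtrace() assembles the backtrace with open_memstream(): every fprintf() appends to a heap-allocated buffer, which becomes a single string once the stream is closed. A minimal POSIX.1-2008 sketch of the idiom, independent of perf:

    #define _POSIX_C_SOURCE 200809L
    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
            char *buf = NULL;
            size_t len = 0;
            FILE *f = open_memstream(&buf, &len);

            if (!f)
                    return 1;

            fprintf(f, "..... %016llx %s\n", 0xffffffff81000000ULL, "some_symbol");
            fprintf(f, "..... %016llx\n", 0x00007f0000001234ULL);
            fclose(f);              /* flush; buf/len are now valid */

            fputs(buf, stdout);
            free(buf);
            return 0;
    }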
+typedef int (*tracepoint_handler)(struct timechart *tchart,
+ struct perf_evsel *evsel,
+ struct perf_sample *sample,
+ const char *backtrace);
- if (strcmp(event_str, "power:power_frequency") == 0)
- p_state_change(data.cpu, data.time, pe->value);
+static int process_sample_event(struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_sample *sample,
+ struct perf_evsel *evsel,
+ struct machine *machine)
+{
+ struct timechart *tchart = container_of(tool, struct timechart, tool);
- if (strcmp(event_str, "sched:sched_wakeup") == 0)
- sched_wakeup(data.cpu, data.time, data.pid, te);
+ if (evsel->attr.sample_type & PERF_SAMPLE_TIME) {
+ if (!tchart->first_time || tchart->first_time > sample->time)
+ tchart->first_time = sample->time;
+ if (tchart->last_time < sample->time)
+ tchart->last_time = sample->time;
+ }
- if (strcmp(event_str, "sched:sched_switch") == 0)
- sched_switch(data.cpu, data.time, te);
+ if (evsel->handler != NULL) {
+ tracepoint_handler f = evsel->handler;
+ return f(tchart, evsel, sample,
+ cat_backtrace(event, sample, machine));
}
+
+ return 0;
+}
+
+static int
+process_sample_cpu_idle(struct timechart *tchart __maybe_unused,
+ struct perf_evsel *evsel,
+ struct perf_sample *sample,
+ const char *backtrace __maybe_unused)
+{
+ u32 state = perf_evsel__intval(evsel, sample, "state");
+ u32 cpu_id = perf_evsel__intval(evsel, sample, "cpu_id");
+
+ if (state == (u32)PWR_EVENT_EXIT)
+ c_state_end(tchart, cpu_id, sample->time);
+ else
+ c_state_start(cpu_id, sample->time, state);
+ return 0;
+}
+
+static int
+process_sample_cpu_frequency(struct timechart *tchart,
+ struct perf_evsel *evsel,
+ struct perf_sample *sample,
+ const char *backtrace __maybe_unused)
+{
+ u32 state = perf_evsel__intval(evsel, sample, "state");
+ u32 cpu_id = perf_evsel__intval(evsel, sample, "cpu_id");
+
+ p_state_change(tchart, cpu_id, sample->time, state);
+ return 0;
+}
+
+static int
+process_sample_sched_wakeup(struct timechart *tchart,
+ struct perf_evsel *evsel,
+ struct perf_sample *sample,
+ const char *backtrace)
+{
+ u8 flags = perf_evsel__intval(evsel, sample, "common_flags");
+ int waker = perf_evsel__intval(evsel, sample, "common_pid");
+ int wakee = perf_evsel__intval(evsel, sample, "pid");
+
+ sched_wakeup(tchart, sample->cpu, sample->time, waker, wakee, flags, backtrace);
+ return 0;
+}
+
+static int
+process_sample_sched_switch(struct timechart *tchart,
+ struct perf_evsel *evsel,
+ struct perf_sample *sample,
+ const char *backtrace)
+{
+ int prev_pid = perf_evsel__intval(evsel, sample, "prev_pid");
+ int next_pid = perf_evsel__intval(evsel, sample, "next_pid");
+ u64 prev_state = perf_evsel__intval(evsel, sample, "prev_state");
+
+ sched_switch(tchart, sample->cpu, sample->time, prev_pid, next_pid,
+ prev_state, backtrace);
+ return 0;
+}
+
+#ifdef SUPPORT_OLD_POWER_EVENTS
+static int
+process_sample_power_start(struct timechart *tchart __maybe_unused,
+ struct perf_evsel *evsel,
+ struct perf_sample *sample,
+ const char *backtrace __maybe_unused)
+{
+ u64 cpu_id = perf_evsel__intval(evsel, sample, "cpu_id");
+ u64 value = perf_evsel__intval(evsel, sample, "value");
+
+ c_state_start(cpu_id, sample->time, value);
+ return 0;
+}
+
+static int
+process_sample_power_end(struct timechart *tchart,
+ struct perf_evsel *evsel __maybe_unused,
+ struct perf_sample *sample,
+ const char *backtrace __maybe_unused)
+{
+ c_state_end(tchart, sample->cpu, sample->time);
+ return 0;
+}
+
+static int
+process_sample_power_frequency(struct timechart *tchart,
+ struct perf_evsel *evsel,
+ struct perf_sample *sample,
+ const char *backtrace __maybe_unused)
+{
+ u64 cpu_id = perf_evsel__intval(evsel, sample, "cpu_id");
+ u64 value = perf_evsel__intval(evsel, sample, "value");
+
+ p_state_change(tchart, cpu_id, sample->time, value);
return 0;
}
+#endif /* SUPPORT_OLD_POWER_EVENTS */
/*
* After the last sample we need to wrap up the current C/P state
* and close out each CPU for these.
*/
-static void end_sample_processing(void)
+static void end_sample_processing(struct timechart *tchart)
{
u64 cpu;
struct power_event *pwr;
- for (cpu = 0; cpu <= numcpus; cpu++) {
- pwr = malloc(sizeof(struct power_event));
+ for (cpu = 0; cpu <= tchart->numcpus; cpu++) {
+ /* C state */
+#if 0
+ pwr = zalloc(sizeof(*pwr));
if (!pwr)
return;
- memset(pwr, 0, sizeof(struct power_event));
- /* C state */
-#if 0
pwr->state = cpus_cstate_state[cpu];
pwr->start_time = cpus_cstate_start_times[cpu];
- pwr->end_time = last_time;
+ pwr->end_time = tchart->last_time;
pwr->cpu = cpu;
pwr->type = CSTATE;
- pwr->next = power_events;
+ pwr->next = tchart->power_events;
- power_events = pwr;
+ tchart->power_events = pwr;
#endif
/* P state */
- pwr = malloc(sizeof(struct power_event));
+ pwr = zalloc(sizeof(*pwr));
if (!pwr)
return;
- memset(pwr, 0, sizeof(struct power_event));
pwr->state = cpus_pstate_state[cpu];
pwr->start_time = cpus_pstate_start_times[cpu];
- pwr->end_time = last_time;
+ pwr->end_time = tchart->last_time;
pwr->cpu = cpu;
pwr->type = PSTATE;
- pwr->next = power_events;
+ pwr->next = tchart->power_events;
if (!pwr->start_time)
- pwr->start_time = first_time;
+ pwr->start_time = tchart->first_time;
if (!pwr->state)
- pwr->state = min_freq;
- power_events = pwr;
- }
-}
-
-static u64 sample_time(event_t *event, const struct perf_session *session)
-{
- int cursor;
-
- cursor = 0;
- if (session->sample_type & PERF_SAMPLE_IP)
- cursor++;
- if (session->sample_type & PERF_SAMPLE_TID)
- cursor++;
- if (session->sample_type & PERF_SAMPLE_TIME)
- return event->sample.array[cursor];
- return 0;
-}
-
-
-/*
- * We first queue all events, sorted backwards by insertion.
- * The order will get flipped later.
- */
-static int queue_sample_event(event_t *event, struct perf_session *session)
-{
- struct sample_wrapper *copy, *prev;
- int size;
-
- size = event->sample.header.size + sizeof(struct sample_wrapper) + 8;
-
- copy = malloc(size);
- if (!copy)
- return 1;
-
- memset(copy, 0, size);
-
- copy->next = NULL;
- copy->timestamp = sample_time(event, session);
-
- memcpy(&copy->data, event, event->sample.header.size);
-
- /* insert in the right place in the list */
-
- if (!all_samples) {
- /* first sample ever */
- all_samples = copy;
- return 0;
- }
-
- if (all_samples->timestamp < copy->timestamp) {
- /* insert at the head of the list */
- copy->next = all_samples;
- all_samples = copy;
- return 0;
- }
-
- prev = all_samples;
- while (prev->next) {
- if (prev->next->timestamp < copy->timestamp) {
- copy->next = prev->next;
- prev->next = copy;
- return 0;
- }
- prev = prev->next;
- }
- /* insert at the end of the list */
- prev->next = copy;
-
- return 0;
-}
-
-static void sort_queued_samples(void)
-{
- struct sample_wrapper *cursor, *next;
-
- cursor = all_samples;
- all_samples = NULL;
-
- while (cursor) {
- next = cursor->next;
- cursor->next = all_samples;
- all_samples = cursor;
- cursor = next;
+ pwr->state = tchart->min_freq;
+ tchart->power_events = pwr;
}
}
/*
* Sort the pid datastructure
*/
-static void sort_pids(void)
+static void sort_pids(struct timechart *tchart)
{
struct per_pid *new_list, *p, *cursor, *prev;
/* sort by ppid first, then by pid, lowest to highest */
new_list = NULL;
- while (all_data) {
- p = all_data;
- all_data = p->next;
+ while (tchart->all_data) {
+ p = tchart->all_data;
+ tchart->all_data = p->next;
p->next = NULL;
if (new_list == NULL) {
@@ -696,14 +727,14 @@ static void sort_pids(void)
prev->next = p;
}
}
- all_data = new_list;
+ tchart->all_data = new_list;
}
-static void draw_c_p_states(void)
+static void draw_c_p_states(struct timechart *tchart)
{
struct power_event *pwr;
- pwr = power_events;
+ pwr = tchart->power_events;
/*
* two pass drawing so that the P state bars are on top of the C state blocks
@@ -714,30 +745,30 @@ static void draw_c_p_states(void)
pwr = pwr->next;
}
- pwr = power_events;
+ pwr = tchart->power_events;
while (pwr) {
if (pwr->type == PSTATE) {
if (!pwr->state)
- pwr->state = min_freq;
+ pwr->state = tchart->min_freq;
svg_pstate(pwr->cpu, pwr->start_time, pwr->end_time, pwr->state);
}
pwr = pwr->next;
}
}
-static void draw_wakeups(void)
+static void draw_wakeups(struct timechart *tchart)
{
struct wake_event *we;
struct per_pid *p;
struct per_pidcomm *c;
- we = wake_events;
+ we = tchart->wake_events;
while (we) {
int from = 0, to = 0;
char *task_from = NULL, *task_to = NULL;
/* locate the column of the waker and wakee */
- p = all_data;
+ p = tchart->all_data;
while (p) {
if (p->pid == we->waker || p->pid == we->wakee) {
c = p->all;
@@ -780,11 +811,12 @@ static void draw_wakeups(void)
}
if (we->waker == -1)
- svg_interrupt(we->time, to);
+ svg_interrupt(we->time, to, we->backtrace);
else if (from && to && abs(from - to) == 1)
- svg_wakeline(we->time, from, to);
+ svg_wakeline(we->time, from, to, we->backtrace);
else
- svg_partial_wakeline(we->time, from, task_from, to, task_to);
+ svg_partial_wakeline(we->time, from, task_from, to,
+ task_to, we->backtrace);
we = we->next;
free(task_from);
@@ -792,19 +824,25 @@ static void draw_wakeups(void)
}
}
-static void draw_cpu_usage(void)
+static void draw_cpu_usage(struct timechart *tchart)
{
struct per_pid *p;
struct per_pidcomm *c;
struct cpu_sample *sample;
- p = all_data;
+ p = tchart->all_data;
while (p) {
c = p->all;
while (c) {
sample = c->samples;
while (sample) {
- if (sample->type == TYPE_RUNNING)
- svg_process(sample->cpu, sample->start_time, sample->end_time, "sample", c->comm);
+ if (sample->type == TYPE_RUNNING) {
+ svg_process(sample->cpu,
+ sample->start_time,
+ sample->end_time,
+ p->pid,
+ c->comm,
+ sample->backtrace);
+ }
sample = sample->next;
}
@@ -814,16 +852,16 @@ static void draw_cpu_usage(void)
}
}
-static void draw_process_bars(void)
+static void draw_process_bars(struct timechart *tchart)
{
struct per_pid *p;
struct per_pidcomm *c;
struct cpu_sample *sample;
int Y = 0;
- Y = 2 * numcpus + 2;
+ Y = 2 * tchart->numcpus + 2;
- p = all_data;
+ p = tchart->all_data;
while (p) {
c = p->all;
while (c) {
@@ -837,11 +875,20 @@ static void draw_process_bars(void)
sample = c->samples;
while (sample) {
if (sample->type == TYPE_RUNNING)
- svg_sample(Y, sample->cpu, sample->start_time, sample->end_time);
+ svg_running(Y, sample->cpu,
+ sample->start_time,
+ sample->end_time,
+ sample->backtrace);
if (sample->type == TYPE_BLOCKED)
- svg_box(Y, sample->start_time, sample->end_time, "blocked");
+ svg_blocked(Y, sample->cpu,
+ sample->start_time,
+ sample->end_time,
+ sample->backtrace);
if (sample->type == TYPE_WAITING)
- svg_waiting(Y, sample->start_time, sample->end_time);
+ svg_waiting(Y, sample->cpu,
+ sample->start_time,
+ sample->end_time,
+ sample->backtrace);
sample = sample->next;
}
@@ -864,11 +911,9 @@ static void draw_process_bars(void)
static void add_process_filter(const char *string)
{
- struct process_filter *filt;
- int pid;
+ int pid = strtoull(string, NULL, 10);
+ struct process_filter *filt = malloc(sizeof(*filt));
- pid = strtoull(string, NULL, 10);
- filt = malloc(sizeof(struct process_filter));
if (!filt)
return;
@@ -896,21 +941,21 @@ static int passes_filter(struct per_pid *p, struct per_pidcomm *c)
return 0;
}
-static int determine_display_tasks_filtered(void)
+static int determine_display_tasks_filtered(struct timechart *tchart)
{
struct per_pid *p;
struct per_pidcomm *c;
int count = 0;
- p = all_data;
+ p = tchart->all_data;
while (p) {
p->display = 0;
if (p->start_time == 1)
- p->start_time = first_time;
+ p->start_time = tchart->first_time;
/* no exit marker, task kept running to the end */
if (p->end_time == 0)
- p->end_time = last_time;
+ p->end_time = tchart->last_time;
c = p->all;
@@ -918,7 +963,7 @@ static int determine_display_tasks_filtered(void)
c->display = 0;
if (c->start_time == 1)
- c->start_time = first_time;
+ c->start_time = tchart->first_time;
if (passes_filter(p, c)) {
c->display = 1;
@@ -927,7 +972,7 @@ static int determine_display_tasks_filtered(void)
}
if (c->end_time == 0)
- c->end_time = last_time;
+ c->end_time = tchart->last_time;
c = c->next;
}
@@ -936,25 +981,25 @@ static int determine_display_tasks_filtered(void)
return count;
}
-static int determine_display_tasks(u64 threshold)
+static int determine_display_tasks(struct timechart *tchart, u64 threshold)
{
struct per_pid *p;
struct per_pidcomm *c;
int count = 0;
if (process_filter)
- return determine_display_tasks_filtered();
+ return determine_display_tasks_filtered(tchart);
- p = all_data;
+ p = tchart->all_data;
while (p) {
p->display = 0;
if (p->start_time == 1)
- p->start_time = first_time;
+ p->start_time = tchart->first_time;
/* no exit marker, task kept running to the end */
if (p->end_time == 0)
- p->end_time = last_time;
- if (p->total_time >= threshold && !power_only)
+ p->end_time = tchart->last_time;
+ if (p->total_time >= threshold)
p->display = 1;
c = p->all;
@@ -963,15 +1008,15 @@ static int determine_display_tasks(u64 threshold)
c->display = 0;
if (c->start_time == 1)
- c->start_time = first_time;
+ c->start_time = tchart->first_time;
- if (c->total_time >= threshold && !power_only) {
+ if (c->total_time >= threshold) {
c->display = 1;
count++;
}
if (c->end_time == 0)
- c->end_time = last_time;
+ c->end_time = tchart->last_time;
c = c->next;
}
@@ -984,161 +1029,322 @@ static int determine_display_tasks(u64 threshold)
#define TIME_THRESH 10000000
-static void write_svg_file(const char *filename)
+static void write_svg_file(struct timechart *tchart, const char *filename)
{
u64 i;
int count;
+ int thresh = TIME_THRESH;
- numcpus++;
-
+ if (tchart->power_only)
+ tchart->proc_num = 0;
- count = determine_display_tasks(TIME_THRESH);
+ /* We'd like to show at least proc_num tasks;
+ * be less picky if we have fewer */
+ do {
+ count = determine_display_tasks(tchart, thresh);
+ thresh /= 10;
+ } while (!process_filter && thresh && count < tchart->proc_num);
- /* We'd like to show at least 15 tasks; be less picky if we have fewer */
- if (count < 15)
- count = determine_display_tasks(TIME_THRESH / 10);
+ if (!tchart->proc_num)
+ count = 0;
- open_svg(filename, numcpus, count, first_time, last_time);
+ open_svg(filename, tchart->numcpus, count, tchart->first_time, tchart->last_time);
svg_time_grid();
svg_legenda();
- for (i = 0; i < numcpus; i++)
- svg_cpu_box(i, max_freq, turbo_frequency);
+ for (i = 0; i < tchart->numcpus; i++)
+ svg_cpu_box(i, tchart->max_freq, tchart->turbo_frequency);
- draw_cpu_usage();
- draw_process_bars();
- draw_c_p_states();
- draw_wakeups();
+ draw_cpu_usage(tchart);
+ if (tchart->proc_num)
+ draw_process_bars(tchart);
+ if (!tchart->tasks_only)
+ draw_c_p_states(tchart);
+ if (tchart->proc_num)
+ draw_wakeups(tchart);
svg_close();
}
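write_svg_file() keeps dividing the display threshold by ten until at least proc_num tasks qualify, unless an explicit process filter is set. A standalone toy of that adaptive loop with made-up per-task runtimes:

    #include <stdio.h>

    int main(void)
    {
            unsigned long long total_time[] = { 900000000ULL, 40000000ULL, 60000ULL, 900ULL };
            const int ntasks = 4, want = 3;
            unsigned long long thresh = 10000000ULL;        /* TIME_THRESH */
            int count, i;

            do {
                    count = 0;
                    for (i = 0; i < ntasks; i++)
                            if (total_time[i] >= thresh)
                                    count++;
                    printf("thresh %llu -> %d task(s)\n", thresh, count);
                    thresh /= 10;
            } while (thresh && count < want);

            return 0;
    }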
-static void process_samples(struct perf_session *session)
+static int process_header(struct perf_file_section *section __maybe_unused,
+ struct perf_header *ph,
+ int feat,
+ int fd __maybe_unused,
+ void *data)
{
- struct sample_wrapper *cursor;
- event_t *event;
-
- sort_queued_samples();
-
- cursor = all_samples;
- while (cursor) {
- event = (void *)&cursor->data;
- cursor = cursor->next;
- process_sample_event(event, session);
+ struct timechart *tchart = data;
+
+ switch (feat) {
+ case HEADER_NRCPUS:
+ tchart->numcpus = ph->env.nr_cpus_avail;
+ break;
+
+ case HEADER_CPU_TOPOLOGY:
+ if (!tchart->topology)
+ break;
+
+ if (svg_build_topology_map(ph->env.sibling_cores,
+ ph->env.nr_sibling_cores,
+ ph->env.sibling_threads,
+ ph->env.nr_sibling_threads))
+ fprintf(stderr, "problem building topology\n");
+ break;
+
+ default:
+ break;
}
-}
-static struct perf_event_ops event_ops = {
- .comm = process_comm_event,
- .fork = process_fork_event,
- .exit = process_exit_event,
- .sample = queue_sample_event,
-};
+ return 0;
+}
-static int __cmd_timechart(void)
+static int __cmd_timechart(struct timechart *tchart, const char *output_name)
{
- struct perf_session *session = perf_session__new(input_name, O_RDONLY, 0);
+ const struct perf_evsel_str_handler power_tracepoints[] = {
+ { "power:cpu_idle", process_sample_cpu_idle },
+ { "power:cpu_frequency", process_sample_cpu_frequency },
+ { "sched:sched_wakeup", process_sample_sched_wakeup },
+ { "sched:sched_switch", process_sample_sched_switch },
+#ifdef SUPPORT_OLD_POWER_EVENTS
+ { "power:power_start", process_sample_power_start },
+ { "power:power_end", process_sample_power_end },
+ { "power:power_frequency", process_sample_power_frequency },
+#endif
+ };
+ struct perf_data_file file = {
+ .path = input_name,
+ .mode = PERF_DATA_MODE_READ,
+ };
+
+ struct perf_session *session = perf_session__new(&file, false,
+ &tchart->tool);
int ret = -EINVAL;
if (session == NULL)
return -ENOMEM;
+ (void)perf_header__process_sections(&session->header,
+ perf_data_file__fd(session->file),
+ tchart,
+ process_header);
+
if (!perf_session__has_traces(session, "timechart record"))
goto out_delete;
- ret = perf_session__process_events(session, &event_ops);
- if (ret)
+ if (perf_session__set_tracepoints_handlers(session,
+ power_tracepoints)) {
+ pr_err("Initializing session tracepoint handlers failed\n");
goto out_delete;
+ }
- process_samples(session);
+ ret = perf_session__process_events(session, &tchart->tool);
+ if (ret)
+ goto out_delete;
- end_sample_processing();
+ end_sample_processing(tchart);
- sort_pids();
+ sort_pids(tchart);
- write_svg_file(output_name);
+ write_svg_file(tchart, output_name);
pr_info("Written %2.1f seconds of trace to %s.\n",
- (last_time - first_time) / 1000000000.0, output_name);
+ (tchart->last_time - tchart->first_time) / 1000000000.0, output_name);
out_delete:
perf_session__delete(session);
return ret;
}
-static const char * const timechart_usage[] = {
- "perf timechart [<options>] {record}",
- NULL
-};
-
-static const char *record_args[] = {
- "record",
- "-a",
- "-R",
- "-M",
- "-f",
- "-c", "1",
- "-e", "power:power_start",
- "-e", "power:power_end",
- "-e", "power:power_frequency",
- "-e", "sched:sched_wakeup",
- "-e", "sched:sched_switch",
-};
-
-static int __cmd_record(int argc, const char **argv)
+static int timechart__record(struct timechart *tchart, int argc, const char **argv)
{
unsigned int rec_argc, i, j;
const char **rec_argv;
+ const char **p;
+ unsigned int record_elems;
+
+ const char * const common_args[] = {
+ "record", "-a", "-R", "-c", "1",
+ };
+ unsigned int common_args_nr = ARRAY_SIZE(common_args);
+
+ const char * const backtrace_args[] = {
+ "-g",
+ };
+ unsigned int backtrace_args_no = ARRAY_SIZE(backtrace_args);
+
+ const char * const power_args[] = {
+ "-e", "power:cpu_frequency",
+ "-e", "power:cpu_idle",
+ };
+ unsigned int power_args_nr = ARRAY_SIZE(power_args);
+
+ const char * const old_power_args[] = {
+#ifdef SUPPORT_OLD_POWER_EVENTS
+ "-e", "power:power_start",
+ "-e", "power:power_end",
+ "-e", "power:power_frequency",
+#endif
+ };
+ unsigned int old_power_args_nr = ARRAY_SIZE(old_power_args);
+
+ const char * const tasks_args[] = {
+ "-e", "sched:sched_wakeup",
+ "-e", "sched:sched_switch",
+ };
+ unsigned int tasks_args_nr = ARRAY_SIZE(tasks_args);
+
+#ifdef SUPPORT_OLD_POWER_EVENTS
+ if (!is_valid_tracepoint("power:cpu_idle") &&
+ is_valid_tracepoint("power:power_start")) {
+ use_old_power_events = 1;
+ power_args_nr = 0;
+ } else {
+ old_power_args_nr = 0;
+ }
+#endif
+
+ if (tchart->power_only)
+ tasks_args_nr = 0;
- rec_argc = ARRAY_SIZE(record_args) + argc - 1;
+ if (tchart->tasks_only) {
+ power_args_nr = 0;
+ old_power_args_nr = 0;
+ }
+
+ if (!tchart->with_backtrace)
+ backtrace_args_no = 0;
+
+ record_elems = common_args_nr + tasks_args_nr +
+ power_args_nr + old_power_args_nr + backtrace_args_no;
+
+ rec_argc = record_elems + argc;
rec_argv = calloc(rec_argc + 1, sizeof(char *));
- for (i = 0; i < ARRAY_SIZE(record_args); i++)
- rec_argv[i] = strdup(record_args[i]);
+ if (rec_argv == NULL)
+ return -ENOMEM;
+
+ p = rec_argv;
+ for (i = 0; i < common_args_nr; i++)
+ *p++ = strdup(common_args[i]);
- for (j = 1; j < (unsigned int)argc; j++, i++)
- rec_argv[i] = argv[j];
+ for (i = 0; i < backtrace_args_no; i++)
+ *p++ = strdup(backtrace_args[i]);
- return cmd_record(i, rec_argv, NULL);
+ for (i = 0; i < tasks_args_nr; i++)
+ *p++ = strdup(tasks_args[i]);
+
+ for (i = 0; i < power_args_nr; i++)
+ *p++ = strdup(power_args[i]);
+
+ for (i = 0; i < old_power_args_nr; i++)
+ *p++ = strdup(old_power_args[i]);
+
+ for (j = 0; j < (unsigned int)argc; j++)
+ *p++ = argv[j];
+
+ return cmd_record(rec_argc, rec_argv, NULL);
}
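
timechart__record above builds the record command line by concatenating optional argument groups (common, backtrace, tasks, power, old-power), and a group is dropped simply by zeroing its element count before the copy loops run. A self-contained sketch of that argv-assembly pattern with made-up groups and no perf APIs; the extra calloc slot keeps the vector NULL-terminated:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	const char * const common[] = { "record", "-a", "-c", "1" };
	const char * const tasks[]  = { "-e", "sched:sched_switch" };
	size_t common_nr = sizeof(common) / sizeof(common[0]);
	size_t tasks_nr  = sizeof(tasks)  / sizeof(tasks[0]);
	int power_only = 0;	/* pretend this came from an option */
	const char **argv, **p;
	size_t i;

	if (power_only)
		tasks_nr = 0;	/* dropping a group is just zeroing its count */

	argv = calloc(common_nr + tasks_nr + 1, sizeof(*argv));
	if (argv == NULL)
		return 1;

	p = argv;
	for (i = 0; i < common_nr; i++)
		*p++ = strdup(common[i]);
	for (i = 0; i < tasks_nr; i++)
		*p++ = strdup(tasks[i]);

	for (p = argv; *p; p++)
		printf("%s\n", *p);

	for (p = argv; *p; p++)
		free((void *)*p);
	free(argv);
	return 0;
}
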
static int
-parse_process(const struct option *opt __used, const char *arg, int __used unset)
+parse_process(const struct option *opt __maybe_unused, const char *arg,
+ int __maybe_unused unset)
{
if (arg)
add_process_filter(arg);
return 0;
}
-static const struct option options[] = {
- OPT_STRING('i', "input", &input_name, "file",
- "input file name"),
- OPT_STRING('o', "output", &output_name, "file",
- "output file name"),
- OPT_INTEGER('w', "width", &svg_page_width,
- "page width"),
- OPT_BOOLEAN('P', "power-only", &power_only,
- "output power data only"),
+static int
+parse_highlight(const struct option *opt __maybe_unused, const char *arg,
+ int __maybe_unused unset)
+{
+ unsigned long duration = strtoul(arg, NULL, 0);
+
+ if (svg_highlight || svg_highlight_name)
+ return -1;
+
+ if (duration)
+ svg_highlight = duration;
+ else
+ svg_highlight_name = strdup(arg);
+
+ return 0;
+}
+
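
parse_highlight above accepts either a duration or a task name: a non-zero strtoul() result selects duration mode, anything else is kept as a name. A tiny standalone illustration of that "number, else name" parse (hypothetical output strings only):

#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
	const char *arg = argc > 1 ? argv[1] : "firefox";
	unsigned long duration = strtoul(arg, NULL, 0);

	if (duration)
		printf("highlight tasks running longer than %lu ns\n", duration);
	else
		printf("highlight tasks named \"%s\"\n", arg);
	return 0;
}
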
+int cmd_timechart(int argc, const char **argv,
+ const char *prefix __maybe_unused)
+{
+ struct timechart tchart = {
+ .tool = {
+ .comm = process_comm_event,
+ .fork = process_fork_event,
+ .exit = process_exit_event,
+ .sample = process_sample_event,
+ .ordered_samples = true,
+ },
+ .proc_num = 15,
+ };
+ const char *output_name = "output.svg";
+ const struct option timechart_options[] = {
+ OPT_STRING('i', "input", &input_name, "file", "input file name"),
+ OPT_STRING('o', "output", &output_name, "file", "output file name"),
+ OPT_INTEGER('w', "width", &svg_page_width, "page width"),
+ OPT_CALLBACK(0, "highlight", NULL, "duration or task name",
+ "highlight tasks. Pass duration in ns or process name.",
+ parse_highlight),
+ OPT_BOOLEAN('P', "power-only", &tchart.power_only, "output power data only"),
+ OPT_BOOLEAN('T', "tasks-only", &tchart.tasks_only,
+ "output processes data only"),
OPT_CALLBACK('p', "process", NULL, "process",
"process selector. Pass a pid or process name.",
parse_process),
+ OPT_STRING(0, "symfs", &symbol_conf.symfs, "directory",
+ "Look for files with symbols relative to this directory"),
+ OPT_INTEGER('n', "proc-num", &tchart.proc_num,
+ "min. number of tasks to print"),
+ OPT_BOOLEAN('t', "topology", &tchart.topology,
+ "sort CPUs according to topology"),
OPT_END()
-};
-
-
-int cmd_timechart(int argc, const char **argv, const char *prefix __used)
-{
- argc = parse_options(argc, argv, options, timechart_usage,
+ };
+ const char * const timechart_usage[] = {
+ "perf timechart [<options>] {record}",
+ NULL
+ };
+
+ const struct option record_options[] = {
+ OPT_BOOLEAN('P', "power-only", &tchart.power_only, "output power data only"),
+ OPT_BOOLEAN('T', "tasks-only", &tchart.tasks_only,
+ "output processes data only"),
+ OPT_BOOLEAN('g', "callchain", &tchart.with_backtrace, "record callchain"),
+ OPT_END()
+ };
+ const char * const record_usage[] = {
+ "perf timechart record [<options>]",
+ NULL
+ };
+ argc = parse_options(argc, argv, timechart_options, timechart_usage,
PARSE_OPT_STOP_AT_NON_OPTION);
+ if (tchart.power_only && tchart.tasks_only) {
+ pr_err("-P and -T options cannot be used at the same time.\n");
+ return -1;
+ }
+
symbol__init();
- if (argc && !strncmp(argv[0], "rec", 3))
- return __cmd_record(argc, argv);
- else if (argc)
- usage_with_options(timechart_usage, options);
+ if (argc && !strncmp(argv[0], "rec", 3)) {
+ argc = parse_options(argc, argv, record_options, record_usage,
+ PARSE_OPT_STOP_AT_NON_OPTION);
+
+ if (tchart.power_only && tchart.tasks_only) {
+ pr_err("-P and -T options cannot be used at the same time.\n");
+ return -1;
+ }
+
+ return timechart__record(&tchart, argc, argv);
+ } else if (argc)
+ usage_with_options(timechart_usage, timechart_options);
setup_pager();
- return __cmd_timechart();
+ return __cmd_timechart(&tchart, output_name);
}
diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 1f529321607..377971dc89a 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -5,6 +5,7 @@
* any workload, CPU or specific PID.
*
* Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
+ * 2011, Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
*
* Improvements and fixes by:
*
@@ -20,29 +21,41 @@
#include "perf.h"
+#include "util/annotate.h"
+#include "util/cache.h"
#include "util/color.h"
+#include "util/evlist.h"
+#include "util/evsel.h"
+#include "util/machine.h"
#include "util/session.h"
#include "util/symbol.h"
#include "util/thread.h"
+#include "util/thread_map.h"
+#include "util/top.h"
#include "util/util.h"
#include <linux/rbtree.h>
#include "util/parse-options.h"
#include "util/parse-events.h"
#include "util/cpumap.h"
+#include "util/xyarray.h"
+#include "util/sort.h"
+#include "util/intlist.h"
+#include "arch/common.h"
#include "util/debug.h"
#include <assert.h>
+#include <elf.h>
#include <fcntl.h>
#include <stdio.h>
#include <termios.h>
#include <unistd.h>
+#include <inttypes.h>
#include <errno.h>
#include <time.h>
#include <sched.h>
-#include <pthread.h>
#include <sys/syscall.h>
#include <sys/ioctl.h>
@@ -50,559 +63,230 @@
#include <sys/prctl.h>
#include <sys/wait.h>
#include <sys/uio.h>
+#include <sys/utsname.h>
#include <sys/mman.h>
#include <linux/unistd.h>
#include <linux/types.h>
-static int fd[MAX_NR_CPUS][MAX_COUNTERS];
+static volatile int done;
-static int system_wide = 0;
+#define HEADER_LINE_NR 5
-static int default_interval = 0;
-
-static int count_filter = 5;
-static int print_entries;
-
-static int target_pid = -1;
-static int inherit = 0;
-static int profile_cpu = -1;
-static int nr_cpus = 0;
-static unsigned int realtime_prio = 0;
-static int group = 0;
-static unsigned int page_size;
-static unsigned int mmap_pages = 16;
-static int freq = 1000; /* 1 KHz */
-
-static int delay_secs = 2;
-static int zero = 0;
-static int dump_symtab = 0;
-
-static bool hide_kernel_symbols = false;
-static bool hide_user_symbols = false;
-static struct winsize winsize;
-
-/*
- * Source
- */
-
-struct source_line {
- u64 eip;
- unsigned long count[MAX_COUNTERS];
- char *line;
- struct source_line *next;
-};
-
-static char *sym_filter = NULL;
-struct sym_entry *sym_filter_entry = NULL;
-struct sym_entry *sym_filter_entry_sched = NULL;
-static int sym_pcnt_filter = 5;
-static int sym_counter = 0;
-static int display_weighted = -1;
-
-/*
- * Symbols
- */
-
-struct sym_entry_source {
- struct source_line *source;
- struct source_line *lines;
- struct source_line **lines_tail;
- pthread_mutex_t lock;
-};
-
-struct sym_entry {
- struct rb_node rb_node;
- struct list_head node;
- unsigned long snap_count;
- double weight;
- int skip;
- u16 name_len;
- u8 origin;
- struct map *map;
- struct sym_entry_source *src;
- unsigned long count[0];
-};
-
-/*
- * Source functions
- */
-
-static inline struct symbol *sym_entry__symbol(struct sym_entry *self)
+static void perf_top__update_print_entries(struct perf_top *top)
{
- return ((void *)self) + symbol_conf.priv_size;
+ top->print_entries = top->winsize.ws_row - HEADER_LINE_NR;
}
-static void get_term_dimensions(struct winsize *ws)
+static void perf_top__sig_winch(int sig __maybe_unused,
+ siginfo_t *info __maybe_unused, void *arg)
{
- char *s = getenv("LINES");
-
- if (s != NULL) {
- ws->ws_row = atoi(s);
- s = getenv("COLUMNS");
- if (s != NULL) {
- ws->ws_col = atoi(s);
- if (ws->ws_row && ws->ws_col)
- return;
- }
- }
-#ifdef TIOCGWINSZ
- if (ioctl(1, TIOCGWINSZ, ws) == 0 &&
- ws->ws_row && ws->ws_col)
- return;
-#endif
- ws->ws_row = 25;
- ws->ws_col = 80;
-}
+ struct perf_top *top = arg;
-static void update_print_entries(struct winsize *ws)
-{
- print_entries = ws->ws_row;
-
- if (print_entries > 9)
- print_entries -= 9;
-}
-
-static void sig_winch_handler(int sig __used)
-{
- get_term_dimensions(&winsize);
- update_print_entries(&winsize);
+ get_term_dimensions(&top->winsize);
+ perf_top__update_print_entries(top);
}
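
perf_top__sig_winch and perf_top__update_print_entries above react to terminal resizes by re-reading the window size and recomputing how many rows remain once the header lines are subtracted. A self-contained sketch of the same idea with a plain SIGWINCH handler and hypothetical names; here the handler only sets a flag and the TIOCGWINSZ ioctl runs outside signal context:

#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>

#define HEADER_LINES 5

static volatile sig_atomic_t resized = 1;

static void sig_winch(int sig)
{
	(void)sig;
	resized = 1;	/* defer the ioctl to the main loop */
}

int main(void)
{
	struct sigaction act;
	struct winsize ws = { .ws_row = 24, .ws_col = 80 };

	memset(&act, 0, sizeof(act));
	act.sa_handler = sig_winch;
	sigaction(SIGWINCH, &act, NULL);

	if (resized && ioctl(STDOUT_FILENO, TIOCGWINSZ, &ws) == 0)
		resized = 0;

	printf("%d rows left for entries\n", ws.ws_row - HEADER_LINES);
	return 0;
}
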
-static void parse_source(struct sym_entry *syme)
+static int perf_top__parse_source(struct perf_top *top, struct hist_entry *he)
{
struct symbol *sym;
- struct sym_entry_source *source;
+ struct annotation *notes;
struct map *map;
- FILE *file;
- char command[PATH_MAX*2];
- const char *path;
- u64 len;
+ int err = -1;
- if (!syme)
- return;
+ if (!he || !he->ms.sym)
+ return -1;
- if (syme->src == NULL) {
- syme->src = zalloc(sizeof(*source));
- if (syme->src == NULL)
- return;
- pthread_mutex_init(&syme->src->lock, NULL);
- }
+ sym = he->ms.sym;
+ map = he->ms.map;
- source = syme->src;
+ /*
+ * We can't annotate with just /proc/kallsyms
+ */
+ if (map->dso->symtab_type == DSO_BINARY_TYPE__KALLSYMS &&
+ !dso__is_kcore(map->dso)) {
+ pr_err("Can't annotate %s: No vmlinux file was found in the "
+ "path\n", sym->name);
+ sleep(1);
+ return -1;
+ }
- if (source->lines) {
- pthread_mutex_lock(&source->lock);
+ notes = symbol__annotation(sym);
+ if (notes->src != NULL) {
+ pthread_mutex_lock(&notes->lock);
goto out_assign;
}
- sym = sym_entry__symbol(syme);
- map = syme->map;
- path = map->dso->long_name;
-
- len = sym->end - sym->start;
-
- sprintf(command,
- "objdump --start-address=%#0*Lx --stop-address=%#0*Lx -dS %s",
- BITS_PER_LONG / 4, map__rip_2objdump(map, sym->start),
- BITS_PER_LONG / 4, map__rip_2objdump(map, sym->end), path);
+ pthread_mutex_lock(&notes->lock);
- file = popen(command, "r");
- if (!file)
- return;
-
- pthread_mutex_lock(&source->lock);
- source->lines_tail = &source->lines;
- while (!feof(file)) {
- struct source_line *src;
- size_t dummy = 0;
- char *c, *sep;
-
- src = malloc(sizeof(struct source_line));
- assert(src != NULL);
- memset(src, 0, sizeof(struct source_line));
-
- if (getline(&src->line, &dummy, file) < 0)
- break;
- if (!src->line)
- break;
-
- c = strchr(src->line, '\n');
- if (c)
- *c = 0;
-
- src->next = NULL;
- *source->lines_tail = src;
- source->lines_tail = &src->next;
-
- src->eip = strtoull(src->line, &sep, 16);
- if (*sep == ':')
- src->eip = map__objdump_2ip(map, src->eip);
- else /* this line has no ip info (e.g. source line) */
- src->eip = 0;
+ if (symbol__alloc_hist(sym) < 0) {
+ pthread_mutex_unlock(&notes->lock);
+ pr_err("Not enough memory for annotating '%s' symbol!\n",
+ sym->name);
+ sleep(1);
+ return err;
}
- pclose(file);
-out_assign:
- sym_filter_entry = syme;
- pthread_mutex_unlock(&source->lock);
-}
-
-static void __zero_source_counters(struct sym_entry *syme)
-{
- int i;
- struct source_line *line;
- line = syme->src->lines;
- while (line) {
- for (i = 0; i < nr_counters; i++)
- line->count[i] = 0;
- line = line->next;
+ err = symbol__annotate(sym, map, 0);
+ if (err == 0) {
+out_assign:
+ top->sym_filter_entry = he;
}
-}
-
-static void record_precise_ip(struct sym_entry *syme, int counter, u64 ip)
-{
- struct source_line *line;
-
- if (syme != sym_filter_entry)
- return;
-
- if (pthread_mutex_trylock(&syme->src->lock))
- return;
-
- if (syme->src == NULL || syme->src->source == NULL)
- goto out_unlock;
- for (line = syme->src->lines; line; line = line->next) {
- /* skip lines without IP info */
- if (line->eip == 0)
- continue;
- if (line->eip == ip) {
- line->count[counter]++;
- break;
- }
- if (line->eip > ip)
- break;
- }
-out_unlock:
- pthread_mutex_unlock(&syme->src->lock);
+ pthread_mutex_unlock(&notes->lock);
+ return err;
}
-#define PATTERN_LEN (BITS_PER_LONG / 4 + 2)
-
-static void lookup_sym_source(struct sym_entry *syme)
+static void __zero_source_counters(struct hist_entry *he)
{
- struct symbol *symbol = sym_entry__symbol(syme);
- struct source_line *line;
- char pattern[PATTERN_LEN + 1];
-
- sprintf(pattern, "%0*Lx <", BITS_PER_LONG / 4,
- map__rip_2objdump(syme->map, symbol->start));
-
- pthread_mutex_lock(&syme->src->lock);
- for (line = syme->src->lines; line; line = line->next) {
- if (memcmp(line->line, pattern, PATTERN_LEN) == 0) {
- syme->src->source = line;
- break;
- }
- }
- pthread_mutex_unlock(&syme->src->lock);
+ struct symbol *sym = he->ms.sym;
+ symbol__annotate_zero_histograms(sym);
}
-static void show_lines(struct source_line *queue, int count, int total)
+static void ui__warn_map_erange(struct map *map, struct symbol *sym, u64 ip)
{
- int i;
- struct source_line *line;
-
- line = queue;
- for (i = 0; i < count; i++) {
- float pcnt = 100.0*(float)line->count[sym_counter]/(float)total;
-
- printf("%8li %4.1f%%\t%s\n", line->count[sym_counter], pcnt, line->line);
- line = line->next;
- }
+ struct utsname uts;
+ int err = uname(&uts);
+
+ ui__warning("Out of bounds address found:\n\n"
+ "Addr: %" PRIx64 "\n"
+ "DSO: %s %c\n"
+ "Map: %" PRIx64 "-%" PRIx64 "\n"
+ "Symbol: %" PRIx64 "-%" PRIx64 " %c %s\n"
+ "Arch: %s\n"
+ "Kernel: %s\n"
+ "Tools: %s\n\n"
+ "Not all samples will be on the annotation output.\n\n"
+ "Please report to linux-kernel@vger.kernel.org\n",
+ ip, map->dso->long_name, dso__symtab_origin(map->dso),
+ map->start, map->end, sym->start, sym->end,
+ sym->binding == STB_GLOBAL ? 'g' :
+ sym->binding == STB_LOCAL ? 'l' : 'w', sym->name,
+ err ? "[unknown]" : uts.machine,
+ err ? "[unknown]" : uts.release, perf_version_string);
+ if (use_browser <= 0)
+ sleep(5);
+
+ map->erange_warned = true;
}
-#define TRACE_COUNT 3
-
-static void show_details(struct sym_entry *syme)
+static void perf_top__record_precise_ip(struct perf_top *top,
+ struct hist_entry *he,
+ int counter, u64 ip)
{
- struct symbol *symbol;
- struct source_line *line;
- struct source_line *line_queue = NULL;
- int displayed = 0;
- int line_queue_count = 0, total = 0, more = 0;
+ struct annotation *notes;
+ struct symbol *sym;
+ int err = 0;
- if (!syme)
+ if (he == NULL || he->ms.sym == NULL ||
+ ((top->sym_filter_entry == NULL ||
+ top->sym_filter_entry->ms.sym != he->ms.sym) && use_browser != 1))
return;
- if (!syme->src->source)
- lookup_sym_source(syme);
+ sym = he->ms.sym;
+ notes = symbol__annotation(sym);
- if (!syme->src->source)
+ if (pthread_mutex_trylock(&notes->lock))
return;
- symbol = sym_entry__symbol(syme);
- printf("Showing %s for %s\n", event_name(sym_counter), symbol->name);
- printf(" Events Pcnt (>=%d%%)\n", sym_pcnt_filter);
+ ip = he->ms.map->map_ip(he->ms.map, ip);
- pthread_mutex_lock(&syme->src->lock);
- line = syme->src->source;
- while (line) {
- total += line->count[sym_counter];
- line = line->next;
- }
+ if (ui__has_annotation())
+ err = hist_entry__inc_addr_samples(he, counter, ip);
- line = syme->src->source;
- while (line) {
- float pcnt = 0.0;
-
- if (!line_queue_count)
- line_queue = line;
- line_queue_count++;
-
- if (line->count[sym_counter])
- pcnt = 100.0 * line->count[sym_counter] / (float)total;
- if (pcnt >= (float)sym_pcnt_filter) {
- if (displayed <= print_entries)
- show_lines(line_queue, line_queue_count, total);
- else more++;
- displayed += line_queue_count;
- line_queue_count = 0;
- line_queue = NULL;
- } else if (line_queue_count > TRACE_COUNT) {
- line_queue = line_queue->next;
- line_queue_count--;
- }
-
- line->count[sym_counter] = zero ? 0 : line->count[sym_counter] * 7 / 8;
- line = line->next;
- }
- pthread_mutex_unlock(&syme->src->lock);
- if (more)
- printf("%d lines not displayed, maybe increase display entries [e]\n", more);
-}
-
-/*
- * Symbols will be added here in event__process_sample and will get out
- * after decayed.
- */
-static LIST_HEAD(active_symbols);
-static pthread_mutex_t active_symbols_lock = PTHREAD_MUTEX_INITIALIZER;
-
-/*
- * Ordering weight: count-1 * count-2 * ... / count-n
- */
-static double sym_weight(const struct sym_entry *sym)
-{
- double weight = sym->snap_count;
- int counter;
-
- if (!display_weighted)
- return weight;
+ pthread_mutex_unlock(&notes->lock);
- for (counter = 1; counter < nr_counters-1; counter++)
- weight *= sym->count[counter];
+ /*
+ * This function is now called with he->hists->lock held.
+ * Release it before going to sleep.
+ */
+ pthread_mutex_unlock(&he->hists->lock);
- weight /= (sym->count[counter] + 1);
+ if (err == -ERANGE && !he->ms.map->erange_warned)
+ ui__warn_map_erange(he->ms.map, sym, ip);
+ else if (err == -ENOMEM) {
+ pr_err("Not enough memory for annotating '%s' symbol!\n",
+ sym->name);
+ sleep(1);
+ }
- return weight;
+ pthread_mutex_lock(&he->hists->lock);
}
-static long samples;
-static long userspace_samples;
-static const char CONSOLE_CLEAR[] = "";
-
-static void __list_insert_active_sym(struct sym_entry *syme)
+static void perf_top__show_details(struct perf_top *top)
{
- list_add(&syme->node, &active_symbols);
-}
+ struct hist_entry *he = top->sym_filter_entry;
+ struct annotation *notes;
+ struct symbol *symbol;
+ int more;
-static void list_remove_active_sym(struct sym_entry *syme)
-{
- pthread_mutex_lock(&active_symbols_lock);
- list_del_init(&syme->node);
- pthread_mutex_unlock(&active_symbols_lock);
-}
+ if (!he)
+ return;
-static void rb_insert_active_sym(struct rb_root *tree, struct sym_entry *se)
-{
- struct rb_node **p = &tree->rb_node;
- struct rb_node *parent = NULL;
- struct sym_entry *iter;
+ symbol = he->ms.sym;
+ notes = symbol__annotation(symbol);
- while (*p != NULL) {
- parent = *p;
- iter = rb_entry(parent, struct sym_entry, rb_node);
+ pthread_mutex_lock(&notes->lock);
- if (se->weight > iter->weight)
- p = &(*p)->rb_left;
- else
- p = &(*p)->rb_right;
- }
+ if (notes->src == NULL)
+ goto out_unlock;
+
+ printf("Showing %s for %s\n", perf_evsel__name(top->sym_evsel), symbol->name);
+ printf(" Events Pcnt (>=%d%%)\n", top->sym_pcnt_filter);
- rb_link_node(&se->rb_node, parent, p);
- rb_insert_color(&se->rb_node, tree);
+ more = symbol__annotate_printf(symbol, he->ms.map, top->sym_evsel,
+ 0, top->sym_pcnt_filter, top->print_entries, 4);
+ if (top->zero)
+ symbol__annotate_zero_histogram(symbol, top->sym_evsel->idx);
+ else
+ symbol__annotate_decay_histogram(symbol, top->sym_evsel->idx);
+ if (more != 0)
+ printf("%d lines not displayed, maybe increase display entries [e]\n", more);
+out_unlock:
+ pthread_mutex_unlock(&notes->lock);
}
-static void print_sym_table(void)
+static void perf_top__print_sym_table(struct perf_top *top)
{
- int printed = 0, j;
- int counter, snap = !display_weighted ? sym_counter : 0;
- float samples_per_sec = samples/delay_secs;
- float ksamples_per_sec = (samples-userspace_samples)/delay_secs;
- float sum_ksamples = 0.0;
- struct sym_entry *syme, *n;
- struct rb_root tmp = RB_ROOT;
- struct rb_node *nd;
- int sym_width = 0, dso_width = 0, dso_short_width = 0;
- const int win_width = winsize.ws_col - 1;
-
- samples = userspace_samples = 0;
-
- /* Sort the active symbols */
- pthread_mutex_lock(&active_symbols_lock);
- syme = list_entry(active_symbols.next, struct sym_entry, node);
- pthread_mutex_unlock(&active_symbols_lock);
-
- list_for_each_entry_safe_from(syme, n, &active_symbols, node) {
- syme->snap_count = syme->count[snap];
- if (syme->snap_count != 0) {
-
- if ((hide_user_symbols &&
- syme->origin == PERF_RECORD_MISC_USER) ||
- (hide_kernel_symbols &&
- syme->origin == PERF_RECORD_MISC_KERNEL)) {
- list_remove_active_sym(syme);
- continue;
- }
- syme->weight = sym_weight(syme);
- rb_insert_active_sym(&tmp, syme);
- sum_ksamples += syme->snap_count;
-
- for (j = 0; j < nr_counters; j++)
- syme->count[j] = zero ? 0 : syme->count[j] * 7 / 8;
- } else
- list_remove_active_sym(syme);
- }
+ char bf[160];
+ int printed = 0;
+ const int win_width = top->winsize.ws_col - 1;
puts(CONSOLE_CLEAR);
- printf("%-*.*s\n", win_width, win_width, graph_dotted_line);
- printf( " PerfTop:%8.0f irqs/sec kernel:%4.1f%% [",
- samples_per_sec,
- 100.0 - (100.0*((samples_per_sec-ksamples_per_sec)/samples_per_sec)));
-
- if (nr_counters == 1 || !display_weighted) {
- printf("%Ld", (u64)attrs[0].sample_period);
- if (freq)
- printf("Hz ");
- else
- printf(" ");
- }
-
- if (!display_weighted)
- printf("%s", event_name(sym_counter));
- else for (counter = 0; counter < nr_counters; counter++) {
- if (counter)
- printf("/");
+ perf_top__header_snprintf(top, bf, sizeof(bf));
+ printf("%s\n", bf);
- printf("%s", event_name(counter));
- }
-
- printf( "], ");
-
- if (target_pid != -1)
- printf(" (target_pid: %d", target_pid);
- else
- printf(" (all");
-
- if (profile_cpu != -1)
- printf(", cpu: %d)\n", profile_cpu);
- else {
- if (target_pid != -1)
- printf(")\n");
- else
- printf(", %d CPUs)\n", nr_cpus);
- }
+ perf_top__reset_sample_counters(top);
printf("%-*.*s\n", win_width, win_width, graph_dotted_line);
- if (sym_filter_entry) {
- show_details(sym_filter_entry);
- return;
+ if (top->sym_evsel->hists.stats.nr_lost_warned !=
+ top->sym_evsel->hists.stats.nr_events[PERF_RECORD_LOST]) {
+ top->sym_evsel->hists.stats.nr_lost_warned =
+ top->sym_evsel->hists.stats.nr_events[PERF_RECORD_LOST];
+ color_fprintf(stdout, PERF_COLOR_RED,
+ "WARNING: LOST %d chunks, Check IO/CPU overload",
+ top->sym_evsel->hists.stats.nr_lost_warned);
+ ++printed;
}
- /*
- * Find the longest symbol name that will be displayed
- */
- for (nd = rb_first(&tmp); nd; nd = rb_next(nd)) {
- syme = rb_entry(nd, struct sym_entry, rb_node);
- if (++printed > print_entries ||
- (int)syme->snap_count < count_filter)
- continue;
-
- if (syme->map->dso->long_name_len > dso_width)
- dso_width = syme->map->dso->long_name_len;
-
- if (syme->map->dso->short_name_len > dso_short_width)
- dso_short_width = syme->map->dso->short_name_len;
-
- if (syme->name_len > sym_width)
- sym_width = syme->name_len;
+ if (top->sym_filter_entry) {
+ perf_top__show_details(top);
+ return;
}
- printed = 0;
-
- if (sym_width + dso_width > winsize.ws_col - 29) {
- dso_width = dso_short_width;
- if (sym_width + dso_width > winsize.ws_col - 29)
- sym_width = winsize.ws_col - dso_width - 29;
- }
+ hists__collapse_resort(&top->sym_evsel->hists, NULL);
+ hists__output_resort(&top->sym_evsel->hists);
+ hists__decay_entries(&top->sym_evsel->hists,
+ top->hide_user_symbols,
+ top->hide_kernel_symbols);
+ hists__output_recalc_col_len(&top->sym_evsel->hists,
+ top->print_entries - printed);
putchar('\n');
- if (nr_counters == 1)
- printf(" samples pcnt");
- else
- printf(" weight samples pcnt");
-
- if (verbose)
- printf(" RIP ");
- printf(" %-*.*s DSO\n", sym_width, sym_width, "function");
- printf(" %s _______ _____",
- nr_counters == 1 ? " " : "______");
- if (verbose)
- printf(" ________________");
- printf(" %-*.*s", sym_width, sym_width, graph_line);
- printf(" %-*.*s", dso_width, dso_width, graph_line);
- puts("\n");
-
- for (nd = rb_first(&tmp); nd; nd = rb_next(nd)) {
- struct symbol *sym;
- double pcnt;
-
- syme = rb_entry(nd, struct sym_entry, rb_node);
- sym = sym_entry__symbol(syme);
-
- if (++printed > print_entries || (int)syme->snap_count < count_filter)
- continue;
-
- pcnt = 100.0 - (100.0 * ((sum_ksamples - syme->snap_count) /
- sum_ksamples));
-
- if (nr_counters == 1 || !display_weighted)
- printf("%20.2f ", syme->weight);
- else
- printf("%9.1f %10ld ", syme->weight, syme->snap_count);
-
- percent_color_fprintf(stdout, "%4.1f%%", pcnt);
- if (verbose)
- printf(" %016llx", sym->start);
- printf(" %-*.*s", sym_width, sym_width, sym->name);
- printf(" %-*.*s\n", dso_width, dso_width,
- dso_width >= syme->map->dso->long_name_len ?
- syme->map->dso->long_name :
- syme->map->dso->short_name);
- }
+ hists__fprintf(&top->sym_evsel->hists, false,
+ top->print_entries - printed, win_width,
+ top->min_percent, stdout);
}
static void prompt_integer(int *target, const char *msg)
@@ -640,18 +324,17 @@ static void prompt_percent(int *target, const char *msg)
*target = tmp;
}
-static void prompt_symbol(struct sym_entry **target, const char *msg)
+static void perf_top__prompt_symbol(struct perf_top *top, const char *msg)
{
char *buf = malloc(0), *p;
- struct sym_entry *syme = *target, *n, *found = NULL;
+ struct hist_entry *syme = top->sym_filter_entry, *n, *found = NULL;
+ struct rb_node *next;
size_t dummy = 0;
/* zero counters of active symbol */
if (syme) {
- pthread_mutex_lock(&syme->src->lock);
__zero_source_counters(syme);
- *target = NULL;
- pthread_mutex_unlock(&syme->src->lock);
+ top->sym_filter_entry = NULL;
}
fprintf(stdout, "\n%s: ", msg);
@@ -662,66 +345,59 @@ static void prompt_symbol(struct sym_entry **target, const char *msg)
if (p)
*p = 0;
- pthread_mutex_lock(&active_symbols_lock);
- syme = list_entry(active_symbols.next, struct sym_entry, node);
- pthread_mutex_unlock(&active_symbols_lock);
-
- list_for_each_entry_safe_from(syme, n, &active_symbols, node) {
- struct symbol *sym = sym_entry__symbol(syme);
-
- if (!strcmp(buf, sym->name)) {
- found = syme;
+ next = rb_first(&top->sym_evsel->hists.entries);
+ while (next) {
+ n = rb_entry(next, struct hist_entry, rb_node);
+ if (n->ms.sym && !strcmp(buf, n->ms.sym->name)) {
+ found = n;
break;
}
+ next = rb_next(&n->rb_node);
}
if (!found) {
fprintf(stderr, "Sorry, %s is not active.\n", buf);
sleep(1);
- return;
} else
- parse_source(found);
+ perf_top__parse_source(top, found);
out_free:
free(buf);
}
-static void print_mapped_keys(void)
+static void perf_top__print_mapped_keys(struct perf_top *top)
{
char *name = NULL;
- if (sym_filter_entry) {
- struct symbol *sym = sym_entry__symbol(sym_filter_entry);
+ if (top->sym_filter_entry) {
+ struct symbol *sym = top->sym_filter_entry->ms.sym;
name = sym->name;
}
fprintf(stdout, "\nMapped keys:\n");
- fprintf(stdout, "\t[d] display refresh delay. \t(%d)\n", delay_secs);
- fprintf(stdout, "\t[e] display entries (lines). \t(%d)\n", print_entries);
+ fprintf(stdout, "\t[d] display refresh delay. \t(%d)\n", top->delay_secs);
+ fprintf(stdout, "\t[e] display entries (lines). \t(%d)\n", top->print_entries);
- if (nr_counters > 1)
- fprintf(stdout, "\t[E] active event counter. \t(%s)\n", event_name(sym_counter));
+ if (top->evlist->nr_entries > 1)
+ fprintf(stdout, "\t[E] active event counter. \t(%s)\n", perf_evsel__name(top->sym_evsel));
- fprintf(stdout, "\t[f] profile display filter (count). \t(%d)\n", count_filter);
+ fprintf(stdout, "\t[f] profile display filter (count). \t(%d)\n", top->count_filter);
- fprintf(stdout, "\t[F] annotate display filter (percent). \t(%d%%)\n", sym_pcnt_filter);
+ fprintf(stdout, "\t[F] annotate display filter (percent). \t(%d%%)\n", top->sym_pcnt_filter);
fprintf(stdout, "\t[s] annotate symbol. \t(%s)\n", name?: "NULL");
fprintf(stdout, "\t[S] stop annotation.\n");
- if (nr_counters > 1)
- fprintf(stdout, "\t[w] toggle display weighted/count[E]r. \t(%d)\n", display_weighted ? 1 : 0);
-
fprintf(stdout,
"\t[K] hide kernel_symbols symbols. \t(%s)\n",
- hide_kernel_symbols ? "yes" : "no");
+ top->hide_kernel_symbols ? "yes" : "no");
fprintf(stdout,
"\t[U] hide user symbols. \t(%s)\n",
- hide_user_symbols ? "yes" : "no");
- fprintf(stdout, "\t[z] toggle sample zeroing. \t(%d)\n", zero ? 1 : 0);
+ top->hide_user_symbols ? "yes" : "no");
+ fprintf(stdout, "\t[z] toggle sample zeroing. \t(%d)\n", top->zero ? 1 : 0);
fprintf(stdout, "\t[qQ] quit.\n");
}
-static int key_mapped(int c)
+static int perf_top__key_mapped(struct perf_top *top, int c)
{
switch (c) {
case 'd':
@@ -737,8 +413,7 @@ static int key_mapped(int c)
case 'S':
return 1;
case 'E':
- case 'w':
- return nr_counters > 1 ? 1 : 0;
+ return top->evlist->nr_entries > 1 ? 1 : 0;
default:
break;
}
@@ -746,13 +421,15 @@ static int key_mapped(int c)
return 0;
}
-static void handle_keypress(int c)
+static bool perf_top__handle_keypress(struct perf_top *top, int c)
{
- if (!key_mapped(c)) {
+ bool ret = true;
+
+ if (!perf_top__key_mapped(top, c)) {
struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
struct termios tc, save;
- print_mapped_keys();
+ perf_top__print_mapped_keys(top);
fprintf(stdout, "\nEnter selection, or unmapped key to continue: ");
fflush(stdout);
@@ -767,89 +444,144 @@ static void handle_keypress(int c)
c = getc(stdin);
tcsetattr(0, TCSAFLUSH, &save);
- if (!key_mapped(c))
- return;
+ if (!perf_top__key_mapped(top, c))
+ return ret;
}
switch (c) {
case 'd':
- prompt_integer(&delay_secs, "Enter display delay");
- if (delay_secs < 1)
- delay_secs = 1;
+ prompt_integer(&top->delay_secs, "Enter display delay");
+ if (top->delay_secs < 1)
+ top->delay_secs = 1;
break;
case 'e':
- prompt_integer(&print_entries, "Enter display entries (lines)");
- if (print_entries == 0) {
- sig_winch_handler(SIGWINCH);
- signal(SIGWINCH, sig_winch_handler);
- } else
+ prompt_integer(&top->print_entries, "Enter display entries (lines)");
+ if (top->print_entries == 0) {
+ struct sigaction act = {
+ .sa_sigaction = perf_top__sig_winch,
+ .sa_flags = SA_SIGINFO,
+ };
+ perf_top__sig_winch(SIGWINCH, NULL, top);
+ sigaction(SIGWINCH, &act, NULL);
+ } else {
signal(SIGWINCH, SIG_DFL);
+ }
break;
case 'E':
- if (nr_counters > 1) {
- int i;
+ if (top->evlist->nr_entries > 1) {
+ /* Select 0 as the default event: */
+ int counter = 0;
fprintf(stderr, "\nAvailable events:");
- for (i = 0; i < nr_counters; i++)
- fprintf(stderr, "\n\t%d %s", i, event_name(i));
- prompt_integer(&sym_counter, "Enter details event counter");
+ evlist__for_each(top->evlist, top->sym_evsel)
+ fprintf(stderr, "\n\t%d %s", top->sym_evsel->idx, perf_evsel__name(top->sym_evsel));
- if (sym_counter >= nr_counters) {
- fprintf(stderr, "Sorry, no such event, using %s.\n", event_name(0));
- sym_counter = 0;
+ prompt_integer(&counter, "Enter details event counter");
+
+ if (counter >= top->evlist->nr_entries) {
+ top->sym_evsel = perf_evlist__first(top->evlist);
+ fprintf(stderr, "Sorry, no such event, using %s.\n", perf_evsel__name(top->sym_evsel));
sleep(1);
+ break;
}
- } else sym_counter = 0;
+ evlist__for_each(top->evlist, top->sym_evsel)
+ if (top->sym_evsel->idx == counter)
+ break;
+ } else
+ top->sym_evsel = perf_evlist__first(top->evlist);
break;
case 'f':
- prompt_integer(&count_filter, "Enter display event count filter");
+ prompt_integer(&top->count_filter, "Enter display event count filter");
break;
case 'F':
- prompt_percent(&sym_pcnt_filter, "Enter details display event filter (percent)");
+ prompt_percent(&top->sym_pcnt_filter,
+ "Enter details display event filter (percent)");
break;
case 'K':
- hide_kernel_symbols = !hide_kernel_symbols;
+ top->hide_kernel_symbols = !top->hide_kernel_symbols;
break;
case 'q':
case 'Q':
printf("exiting.\n");
- if (dump_symtab)
- dsos__fprintf(stderr);
- exit(0);
+ if (top->dump_symtab)
+ perf_session__fprintf_dsos(top->session, stderr);
+ ret = false;
+ break;
case 's':
- prompt_symbol(&sym_filter_entry, "Enter details symbol");
+ perf_top__prompt_symbol(top, "Enter details symbol");
break;
case 'S':
- if (!sym_filter_entry)
+ if (!top->sym_filter_entry)
break;
else {
- struct sym_entry *syme = sym_filter_entry;
+ struct hist_entry *syme = top->sym_filter_entry;
- pthread_mutex_lock(&syme->src->lock);
- sym_filter_entry = NULL;
+ top->sym_filter_entry = NULL;
__zero_source_counters(syme);
- pthread_mutex_unlock(&syme->src->lock);
}
break;
case 'U':
- hide_user_symbols = !hide_user_symbols;
- break;
- case 'w':
- display_weighted = ~display_weighted;
+ top->hide_user_symbols = !top->hide_user_symbols;
break;
case 'z':
- zero = ~zero;
+ top->zero = !top->zero;
break;
default:
break;
}
+
+ return ret;
+}
+
+static void perf_top__sort_new_samples(void *arg)
+{
+ struct perf_top *t = arg;
+ perf_top__reset_sample_counters(t);
+
+ if (t->evlist->selected != NULL)
+ t->sym_evsel = t->evlist->selected;
+
+ hists__collapse_resort(&t->sym_evsel->hists, NULL);
+ hists__output_resort(&t->sym_evsel->hists);
+ hists__decay_entries(&t->sym_evsel->hists,
+ t->hide_user_symbols,
+ t->hide_kernel_symbols);
}
-static void *display_thread(void *arg __used)
+static void *display_thread_tui(void *arg)
+{
+ struct perf_evsel *pos;
+ struct perf_top *top = arg;
+ const char *help = "For a higher level overview, try: perf top --sort comm,dso";
+ struct hist_browser_timer hbt = {
+ .timer = perf_top__sort_new_samples,
+ .arg = top,
+ .refresh = top->delay_secs,
+ };
+
+ perf_top__sort_new_samples(top);
+
+ /*
+	 * Initialize the uid_filter_str; in the future the TUI will allow
+	 * zooming in/out of UIDs. For now just use whatever the user passed
+	 * via --uid.
+ */
+ evlist__for_each(top->evlist, pos)
+ pos->hists.uid_filter_str = top->record_opts.target.uid_str;
+
+ perf_evlist__tui_browse_hists(top->evlist, help, &hbt, top->min_percent,
+ &top->session->header.env);
+
+ done = 1;
+ return NULL;
+}
+
+static void *display_thread(void *arg)
{
struct pollfd stdin_poll = { .fd = 0, .events = POLLIN };
struct termios tc, save;
+ struct perf_top *top = arg;
int delay_msecs, c;
tcgetattr(0, &save);
@@ -858,44 +590,42 @@ static void *display_thread(void *arg __used)
tc.c_cc[VMIN] = 0;
tc.c_cc[VTIME] = 0;
+ pthread__unblock_sigwinch();
repeat:
- delay_msecs = delay_secs * 1000;
+ delay_msecs = top->delay_secs * 1000;
tcsetattr(0, TCSANOW, &tc);
 	/* trash return */
getc(stdin);
- do {
- print_sym_table();
- } while (!poll(&stdin_poll, 1, delay_msecs) == 1);
-
- c = getc(stdin);
- tcsetattr(0, TCSAFLUSH, &save);
+ while (!done) {
+ perf_top__print_sym_table(top);
+ /*
+ * Either timeout expired or we got an EINTR due to SIGWINCH,
+ * refresh screen in both cases.
+ */
+ switch (poll(&stdin_poll, 1, delay_msecs)) {
+ case 0:
+ continue;
+ case -1:
+ if (errno == EINTR)
+ continue;
+			/* Fall thru */
+ default:
+ c = getc(stdin);
+ tcsetattr(0, TCSAFLUSH, &save);
- handle_keypress(c);
- goto repeat;
+ if (perf_top__handle_keypress(top, c))
+ goto repeat;
+ done = 1;
+ }
+ }
return NULL;
}
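
The loop in display_thread above redraws, then polls stdin with the refresh delay as timeout: a timeout or an EINTR (for example from SIGWINCH) just triggers another redraw, while readable stdin means a key is waiting. A minimal, deliberately bounded sketch of that shape with a hypothetical redraw():

#include <errno.h>
#include <poll.h>
#include <stdio.h>
#include <unistd.h>

static void redraw(int pass) { printf("refresh #%d\n", pass); }

int main(void)
{
	struct pollfd stdin_poll = { .fd = STDIN_FILENO, .events = POLLIN };
	int pass, c;

	for (pass = 0; pass < 3; pass++) {	/* bounded for the sketch */
		redraw(pass);
		switch (poll(&stdin_poll, 1, 1000 /* ms */)) {
		case 0:				/* timeout: refresh again */
			continue;
		case -1:
			if (errno == EINTR)	/* e.g. SIGWINCH: refresh */
				continue;
			return 1;
		default:
			c = getchar();		/* key pressed: handle it */
			printf("got key %d\n", c);
			return 0;
		}
	}
	return 0;
}
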
-/* Tag samples to be skipped. */
-static const char *skip_symbols[] = {
- "default_idle",
- "cpu_idle",
- "enter_idle",
- "exit_idle",
- "mwait_idle",
- "mwait_idle_with_hints",
- "poll_idle",
- "ppc64_runlatch_off",
- "pseries_dedicated_idle_sleep",
- NULL
-};
-
-static int symbol_filter(struct map *map, struct symbol *sym)
+static int symbol_filter(struct map *map __maybe_unused, struct symbol *sym)
{
- struct sym_entry *syme;
const char *name = sym->name;
- int i;
/*
* ppc64 uses function descriptors and appends a '.' to the
@@ -913,58 +643,84 @@ static int symbol_filter(struct map *map, struct symbol *sym)
strstr(name, "_text_end"))
return 1;
- syme = symbol__priv(sym);
- syme->map = map;
- syme->src = NULL;
+ if (symbol__is_idle(sym))
+ sym->ignore = true;
- if (!sym_filter_entry && sym_filter && !strcmp(name, sym_filter)) {
- /* schedule initial sym_filter_entry setup */
- sym_filter_entry_sched = syme;
- sym_filter = NULL;
- }
+ return 0;
+}
- for (i = 0; skip_symbols[i]; i++) {
- if (!strcmp(skip_symbols[i], name)) {
- syme->skip = 1;
- break;
- }
- }
+static int hist_iter__top_callback(struct hist_entry_iter *iter,
+ struct addr_location *al, bool single,
+ void *arg)
+{
+ struct perf_top *top = arg;
+ struct hist_entry *he = iter->he;
+ struct perf_evsel *evsel = iter->evsel;
+
+ if (sort__has_sym && single) {
+ u64 ip = al->addr;
- if (!syme->skip)
- syme->name_len = strlen(sym->name);
+ if (al->map)
+ ip = al->map->unmap_ip(al->map, ip);
+
+ perf_top__record_precise_ip(top, he, evsel->idx, ip);
+ }
return 0;
}
-static void event__process_sample(const event_t *self,
- struct perf_session *session, int counter)
+static void perf_event__process_sample(struct perf_tool *tool,
+ const union perf_event *event,
+ struct perf_evsel *evsel,
+ struct perf_sample *sample,
+ struct machine *machine)
{
- u64 ip = self->ip.ip;
- struct sym_entry *syme;
+ struct perf_top *top = container_of(tool, struct perf_top, tool);
struct addr_location al;
- u8 origin = self->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
-
- ++samples;
-
- switch (origin) {
- case PERF_RECORD_MISC_USER:
- ++userspace_samples;
- if (hide_user_symbols)
- return;
- break;
- case PERF_RECORD_MISC_KERNEL:
- if (hide_kernel_symbols)
- return;
- break;
- default:
+ int err;
+
+ if (!machine && perf_guest) {
+ static struct intlist *seen;
+
+ if (!seen)
+ seen = intlist__new(NULL);
+
+ if (!intlist__has_entry(seen, sample->pid)) {
+ pr_err("Can't find guest [%d]'s kernel information\n",
+ sample->pid);
+ intlist__add(seen, sample->pid);
+ }
+ return;
+ }
+
+ if (!machine) {
+ pr_err("%u unprocessable samples recorded.\r",
+ top->session->stats.nr_unprocessable_samples++);
return;
}
- if (event__preprocess_sample(self, session, &al, symbol_filter) < 0 ||
- al.filtered)
+ if (event->header.misc & PERF_RECORD_MISC_EXACT_IP)
+ top->exact_samples++;
+
+ if (perf_event__preprocess_sample(event, machine, &al, sample) < 0)
return;
+ if (!top->kptr_restrict_warned &&
+ symbol_conf.kptr_restrict &&
+ al.cpumode == PERF_RECORD_MISC_KERNEL) {
+ ui__warning(
+"Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
+"Check /proc/sys/kernel/kptr_restrict.\n\n"
+"Kernel%s samples will not be resolved.\n",
+ !RB_EMPTY_ROOT(&al.map->dso->symbols[MAP__FUNCTION]) ?
+ " modules" : "");
+ if (use_browser <= 0)
+ sleep(5);
+ top->kptr_restrict_warned = true;
+ }
+
if (al.sym == NULL) {
+ const char *msg = "Kernel samples will not be resolved.\n";
/*
* As we do lazy loading of symtabs we only will know if the
* specified vmlinux file is invalid when we actually have a
@@ -976,393 +732,510 @@ static void event__process_sample(const event_t *self,
* --hide-kernel-symbols, even if the user specifies an
* invalid --vmlinux ;-)
*/
- if (al.map == session->vmlinux_maps[MAP__FUNCTION] &&
+ if (!top->kptr_restrict_warned && !top->vmlinux_warned &&
+ al.map == machine->vmlinux_maps[MAP__FUNCTION] &&
RB_EMPTY_ROOT(&al.map->dso->symbols[MAP__FUNCTION])) {
- pr_err("The %s file can't be used\n",
- symbol_conf.vmlinux_name);
- exit(1);
- }
-
- return;
- }
-
- /* let's see, whether we need to install initial sym_filter_entry */
- if (sym_filter_entry_sched) {
- sym_filter_entry = sym_filter_entry_sched;
- sym_filter_entry_sched = NULL;
- parse_source(sym_filter_entry);
- }
+ if (symbol_conf.vmlinux_name) {
+ ui__warning("The %s file can't be used.\n%s",
+ symbol_conf.vmlinux_name, msg);
+ } else {
+ ui__warning("A vmlinux file was not found.\n%s",
+ msg);
+ }
- syme = symbol__priv(al.sym);
- if (!syme->skip) {
- syme->count[counter]++;
- syme->origin = origin;
- record_precise_ip(syme, counter, ip);
- pthread_mutex_lock(&active_symbols_lock);
- if (list_empty(&syme->node) || !syme->node.next)
- __list_insert_active_sym(syme);
- pthread_mutex_unlock(&active_symbols_lock);
+ if (use_browser <= 0)
+ sleep(5);
+ top->vmlinux_warned = true;
+ }
}
-}
-static int event__process(event_t *event, struct perf_session *session)
-{
- switch (event->header.type) {
- case PERF_RECORD_COMM:
- event__process_comm(event, session);
- break;
- case PERF_RECORD_MMAP:
- event__process_mmap(event, session);
- break;
- case PERF_RECORD_FORK:
- case PERF_RECORD_EXIT:
- event__process_task(event, session);
- break;
- default:
- break;
- }
+ if (al.sym == NULL || !al.sym->ignore) {
+ struct hist_entry_iter iter = {
+ .add_entry_cb = hist_iter__top_callback,
+ };
- return 0;
-}
+ if (symbol_conf.cumulate_callchain)
+ iter.ops = &hist_iter_cumulative;
+ else
+ iter.ops = &hist_iter_normal;
-struct mmap_data {
- int counter;
- void *base;
- int mask;
- unsigned int prev;
-};
+ pthread_mutex_lock(&evsel->hists.lock);
-static unsigned int mmap_read_head(struct mmap_data *md)
-{
- struct perf_event_mmap_page *pc = md->base;
- int head;
+ err = hist_entry_iter__add(&iter, &al, evsel, sample,
+ top->max_stack, top);
+ if (err < 0)
+ pr_err("Problem incrementing symbol period, skipping event\n");
- head = pc->data_head;
- rmb();
+ pthread_mutex_unlock(&evsel->hists.lock);
+ }
- return head;
+ return;
}
-static void perf_session__mmap_read_counter(struct perf_session *self,
- struct mmap_data *md)
+static void perf_top__mmap_read_idx(struct perf_top *top, int idx)
{
- unsigned int head = mmap_read_head(md);
- unsigned int old = md->prev;
- unsigned char *data = md->base + page_size;
- int diff;
-
- /*
- * If we're further behind than half the buffer, there's a chance
- * the writer will bite our tail and mess up the samples under us.
- *
- * If we somehow ended up ahead of the head, we got messed up.
- *
- * In either case, truncate and restart at head.
- */
- diff = head - old;
- if (diff > md->mask / 2 || diff < 0) {
- fprintf(stderr, "WARNING: failed to keep up with mmap data.\n");
-
- /*
- * head points to a known good entry, start there.
- */
- old = head;
- }
+ struct perf_sample sample;
+ struct perf_evsel *evsel;
+ struct perf_session *session = top->session;
+ union perf_event *event;
+ struct machine *machine;
+ u8 origin;
+ int ret;
- for (; old != head;) {
- event_t *event = (event_t *)&data[old & md->mask];
+ while ((event = perf_evlist__mmap_read(top->evlist, idx)) != NULL) {
+ ret = perf_evlist__parse_sample(top->evlist, event, &sample);
+ if (ret) {
+ pr_err("Can't parse sample, err = %d\n", ret);
+ goto next_event;
+ }
- event_t event_copy;
+ evsel = perf_evlist__id2evsel(session->evlist, sample.id);
+ assert(evsel != NULL);
- size_t size = event->header.size;
+ origin = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK;
- /*
- * Event straddles the mmap boundary -- header should always
- * be inside due to u64 alignment of output.
- */
- if ((old & md->mask) + size != ((old + size) & md->mask)) {
- unsigned int offset = old;
- unsigned int len = min(sizeof(*event), size), cpy;
- void *dst = &event_copy;
-
- do {
- cpy = min(md->mask + 1 - (offset & md->mask), len);
- memcpy(dst, &data[offset & md->mask], cpy);
- offset += cpy;
- dst += cpy;
- len -= cpy;
- } while (len);
-
- event = &event_copy;
+ if (event->header.type == PERF_RECORD_SAMPLE)
+ ++top->samples;
+
+ switch (origin) {
+ case PERF_RECORD_MISC_USER:
+ ++top->us_samples;
+ if (top->hide_user_symbols)
+ goto next_event;
+ machine = &session->machines.host;
+ break;
+ case PERF_RECORD_MISC_KERNEL:
+ ++top->kernel_samples;
+ if (top->hide_kernel_symbols)
+ goto next_event;
+ machine = &session->machines.host;
+ break;
+ case PERF_RECORD_MISC_GUEST_KERNEL:
+ ++top->guest_kernel_samples;
+ machine = perf_session__find_machine(session,
+ sample.pid);
+ break;
+ case PERF_RECORD_MISC_GUEST_USER:
+ ++top->guest_us_samples;
+ /*
+ * TODO: we don't process guest user from host side
+ * except simple counting.
+ */
+ /* Fall thru */
+ default:
+ goto next_event;
}
- if (event->header.type == PERF_RECORD_SAMPLE)
- event__process_sample(event, self, md->counter);
- else
- event__process(event, self);
- old += size;
- }
- md->prev = old;
+ if (event->header.type == PERF_RECORD_SAMPLE) {
+ perf_event__process_sample(&top->tool, event, evsel,
+ &sample, machine);
+ } else if (event->header.type < PERF_RECORD_MAX) {
+ hists__inc_nr_events(&evsel->hists, event->header.type);
+ machine__process_event(machine, event, &sample);
+ } else
+ ++session->stats.nr_unknown_events;
+next_event:
+ perf_evlist__mmap_consume(top->evlist, idx);
+ }
}
-static struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS];
-static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS];
-
-static void perf_session__mmap_read(struct perf_session *self)
+static void perf_top__mmap_read(struct perf_top *top)
{
- int i, counter;
+ int i;
- for (i = 0; i < nr_cpus; i++) {
- for (counter = 0; counter < nr_counters; counter++)
- perf_session__mmap_read_counter(self, &mmap_array[i][counter]);
- }
+ for (i = 0; i < top->evlist->nr_mmaps; i++)
+ perf_top__mmap_read_idx(top, i);
}
-int nr_poll;
-int group_fd;
-
-static void start_counter(int i, int counter)
+static int perf_top__start_counters(struct perf_top *top)
{
- struct perf_event_attr *attr;
- int cpu;
-
- cpu = profile_cpu;
- if (target_pid == -1 && profile_cpu == -1)
- cpu = cpumap[i];
+ char msg[512];
+ struct perf_evsel *counter;
+ struct perf_evlist *evlist = top->evlist;
+ struct record_opts *opts = &top->record_opts;
- attr = attrs + counter;
+ perf_evlist__config(evlist, opts);
- attr->sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_TID;
+ evlist__for_each(evlist, counter) {
+try_again:
+ if (perf_evsel__open(counter, top->evlist->cpus,
+ top->evlist->threads) < 0) {
+ if (perf_evsel__fallback(counter, errno, msg, sizeof(msg))) {
+ if (verbose)
+ ui__warning("%s\n", msg);
+ goto try_again;
+ }
- if (freq) {
- attr->sample_type |= PERF_SAMPLE_PERIOD;
- attr->freq = 1;
- attr->sample_freq = freq;
+ perf_evsel__open_strerror(counter, &opts->target,
+ errno, msg, sizeof(msg));
+ ui__error("%s\n", msg);
+ goto out_err;
+ }
}
- attr->inherit = (cpu < 0) && inherit;
- attr->mmap = 1;
-
-try_again:
- fd[i][counter] = sys_perf_event_open(attr, target_pid, cpu, group_fd, 0);
-
- if (fd[i][counter] < 0) {
- int err = errno;
+ if (perf_evlist__mmap(evlist, opts->mmap_pages, false) < 0) {
+ ui__error("Failed to mmap with %d (%s)\n",
+ errno, strerror(errno));
+ goto out_err;
+ }
- if (err == EPERM || err == EACCES)
- die("No permission - are you root?\n");
- /*
- * If it's cycles then fall back to hrtimer
- * based cpu-clock-tick sw counter, which
- * is always available even if no PMU support:
- */
- if (attr->type == PERF_TYPE_HARDWARE
- && attr->config == PERF_COUNT_HW_CPU_CYCLES) {
+ return 0;
- if (verbose)
- warning(" ... trying to fall back to cpu-clock-ticks\n");
+out_err:
+ return -1;
+}
- attr->type = PERF_TYPE_SOFTWARE;
- attr->config = PERF_COUNT_SW_CPU_CLOCK;
- goto try_again;
+static int perf_top__setup_sample_type(struct perf_top *top __maybe_unused)
+{
+ if (!sort__has_sym) {
+ if (symbol_conf.use_callchain) {
+ ui__error("Selected -g but \"sym\" not present in --sort/-s.");
+ return -EINVAL;
+ }
+ } else if (callchain_param.mode != CHAIN_NONE) {
+ if (callchain_register_param(&callchain_param) < 0) {
+ ui__error("Can't register callchain params.\n");
+ return -EINVAL;
}
- printf("\n");
- error("perfcounter syscall returned with %d (%s)\n",
- fd[i][counter], strerror(err));
- die("No CONFIG_PERF_EVENTS=y kernel support configured?\n");
- exit(-1);
}
- assert(fd[i][counter] >= 0);
- fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
- /*
- * First counter acts as the group leader:
- */
- if (group && group_fd == -1)
- group_fd = fd[i][counter];
-
- event_array[nr_poll].fd = fd[i][counter];
- event_array[nr_poll].events = POLLIN;
- nr_poll++;
-
- mmap_array[i][counter].counter = counter;
- mmap_array[i][counter].prev = 0;
- mmap_array[i][counter].mask = mmap_pages*page_size - 1;
- mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
- PROT_READ, MAP_SHARED, fd[i][counter], 0);
- if (mmap_array[i][counter].base == MAP_FAILED)
- die("failed to mmap with %d (%s)\n", errno, strerror(errno));
+ return 0;
}
-static int __cmd_top(void)
+static int __cmd_top(struct perf_top *top)
{
+ struct record_opts *opts = &top->record_opts;
pthread_t thread;
- int i, counter;
int ret;
- /*
- * FIXME: perf_session__new should allow passing a O_MMAP, so that all this
- * mmap reading, etc is encapsulated in it. Use O_WRONLY for now.
- */
- struct perf_session *session = perf_session__new(NULL, O_WRONLY, false);
- if (session == NULL)
+
+ top->session = perf_session__new(NULL, false, NULL);
+ if (top->session == NULL)
return -ENOMEM;
- if (target_pid != -1)
- event__synthesize_thread(target_pid, event__process, session);
- else
- event__synthesize_threads(event__process, session);
+ machines__set_symbol_filter(&top->session->machines, symbol_filter);
- for (i = 0; i < nr_cpus; i++) {
- group_fd = -1;
- for (counter = 0; counter < nr_counters; counter++)
- start_counter(i, counter);
+ if (!objdump_path) {
+ ret = perf_session_env__lookup_objdump(&top->session->header.env);
+ if (ret)
+ goto out_delete;
}
+ ret = perf_top__setup_sample_type(top);
+ if (ret)
+ goto out_delete;
+
+ machine__synthesize_threads(&top->session->machines.host, &opts->target,
+ top->evlist->threads, false);
+ ret = perf_top__start_counters(top);
+ if (ret)
+ goto out_delete;
+
+ top->session->evlist = top->evlist;
+ perf_session__set_id_hdr_size(top->session);
+
+ /*
+ * When perf is starting the traced process, all the events (apart from
+ * group members) have enable_on_exec=1 set, so don't spoil it by
+ * prematurely enabling them.
+ *
+ * XXX 'top' still doesn't start workloads like record, trace, but should,
+ * so leave the check here.
+ */
+ if (!target__none(&opts->target))
+ perf_evlist__enable(top->evlist);
+
/* Wait for a minimal set of events before starting the snapshot */
- poll(event_array, nr_poll, 100);
+ poll(top->evlist->pollfd, top->evlist->nr_fds, 100);
- perf_session__mmap_read(session);
+ perf_top__mmap_read(top);
- if (pthread_create(&thread, NULL, display_thread, NULL)) {
- printf("Could not create display thread.\n");
- exit(-1);
+ ret = -1;
+ if (pthread_create(&thread, NULL, (use_browser > 0 ? display_thread_tui :
+ display_thread), top)) {
+ ui__error("Could not create display thread.\n");
+ goto out_delete;
}
- if (realtime_prio) {
+ if (top->realtime_prio) {
struct sched_param param;
- param.sched_priority = realtime_prio;
+ param.sched_priority = top->realtime_prio;
if (sched_setscheduler(0, SCHED_FIFO, &param)) {
- printf("Could not set realtime priority.\n");
- exit(-1);
+ ui__error("Could not set realtime priority.\n");
+ goto out_delete;
}
}
- while (1) {
- int hits = samples;
+ while (!done) {
+ u64 hits = top->samples;
- perf_session__mmap_read(session);
+ perf_top__mmap_read(top);
- if (hits == samples)
- ret = poll(event_array, nr_poll, 100);
+ if (hits == top->samples)
+ ret = poll(top->evlist->pollfd, top->evlist->nr_fds, 100);
}
- return 0;
+ ret = 0;
+out_delete:
+ perf_session__delete(top->session);
+ top->session = NULL;
+
+ return ret;
+}
+
+static int
+callchain_opt(const struct option *opt, const char *arg, int unset)
+{
+ symbol_conf.use_callchain = true;
+ return record_callchain_opt(opt, arg, unset);
+}
+
+static int
+parse_callchain_opt(const struct option *opt, const char *arg, int unset)
+{
+ symbol_conf.use_callchain = true;
+ return record_parse_callchain_opt(opt, arg, unset);
+}
+
+static int perf_top_config(const char *var, const char *value, void *cb)
+{
+ struct perf_top *top = cb;
+
+ if (!strcmp(var, "top.call-graph"))
+ return record_parse_callchain(value, &top->record_opts);
+ if (!strcmp(var, "top.children")) {
+ symbol_conf.cumulate_callchain = perf_config_bool(var, value);
+ return 0;
+ }
+
+ return perf_default_config(var, value, cb);
}
-static const char * const top_usage[] = {
- "perf top [<options>]",
- NULL
-};
+static int
+parse_percent_limit(const struct option *opt, const char *arg,
+ int unset __maybe_unused)
+{
+ struct perf_top *top = opt->value;
+
+ top->min_percent = strtof(arg, NULL);
+ return 0;
+}
-static const struct option options[] = {
- OPT_CALLBACK('e', "event", NULL, "event",
+int cmd_top(int argc, const char **argv, const char *prefix __maybe_unused)
+{
+ int status = -1;
+ char errbuf[BUFSIZ];
+ struct perf_top top = {
+ .count_filter = 5,
+ .delay_secs = 2,
+ .record_opts = {
+ .mmap_pages = UINT_MAX,
+ .user_freq = UINT_MAX,
+ .user_interval = ULLONG_MAX,
+ .freq = 4000, /* 4 KHz */
+ .target = {
+ .uses_mmap = true,
+ },
+ },
+ .max_stack = PERF_MAX_STACK_DEPTH,
+ .sym_pcnt_filter = 5,
+ };
+ struct record_opts *opts = &top.record_opts;
+ struct target *target = &opts->target;
+ const struct option options[] = {
+ OPT_CALLBACK('e', "event", &top.evlist, "event",
"event selector. use 'perf list' to list available events",
- parse_events),
- OPT_INTEGER('c', "count", &default_interval,
- "event period to sample"),
- OPT_INTEGER('p', "pid", &target_pid,
- "profile events on existing pid"),
- OPT_BOOLEAN('a', "all-cpus", &system_wide,
+ parse_events_option),
+ OPT_U64('c', "count", &opts->user_interval, "event period to sample"),
+ OPT_STRING('p', "pid", &target->pid, "pid",
+ "profile events on existing process id"),
+ OPT_STRING('t', "tid", &target->tid, "tid",
+ "profile events on existing thread id"),
+ OPT_BOOLEAN('a', "all-cpus", &target->system_wide,
"system-wide collection from all CPUs"),
- OPT_INTEGER('C', "CPU", &profile_cpu,
- "CPU to profile on"),
+ OPT_STRING('C', "cpu", &target->cpu_list, "cpu",
+ "list of cpus to monitor"),
OPT_STRING('k', "vmlinux", &symbol_conf.vmlinux_name,
"file", "vmlinux pathname"),
- OPT_BOOLEAN('K', "hide_kernel_symbols", &hide_kernel_symbols,
+ OPT_BOOLEAN(0, "ignore-vmlinux", &symbol_conf.ignore_vmlinux,
+ "don't load vmlinux even if found"),
+ OPT_BOOLEAN('K', "hide_kernel_symbols", &top.hide_kernel_symbols,
"hide kernel symbols"),
- OPT_INTEGER('m', "mmap-pages", &mmap_pages,
- "number of mmap data pages"),
- OPT_INTEGER('r', "realtime", &realtime_prio,
+ OPT_CALLBACK('m', "mmap-pages", &opts->mmap_pages, "pages",
+ "number of mmap data pages",
+ perf_evlist__parse_mmap_pages),
+ OPT_INTEGER('r', "realtime", &top.realtime_prio,
"collect data with this RT SCHED_FIFO priority"),
- OPT_INTEGER('d', "delay", &delay_secs,
+ OPT_INTEGER('d', "delay", &top.delay_secs,
"number of seconds to delay between refreshes"),
- OPT_BOOLEAN('D', "dump-symtab", &dump_symtab,
+ OPT_BOOLEAN('D', "dump-symtab", &top.dump_symtab,
"dump the symbol table used for profiling"),
- OPT_INTEGER('f', "count-filter", &count_filter,
+ OPT_INTEGER('f', "count-filter", &top.count_filter,
"only display functions with more events than this"),
- OPT_BOOLEAN('g', "group", &group,
+ OPT_BOOLEAN(0, "group", &opts->group,
"put the counters into a counter group"),
- OPT_BOOLEAN('i', "inherit", &inherit,
- "child tasks inherit counters"),
- OPT_STRING('s', "sym-annotate", &sym_filter, "symbol name",
+ OPT_BOOLEAN('i', "no-inherit", &opts->no_inherit,
+ "child tasks do not inherit counters"),
+ OPT_STRING(0, "sym-annotate", &top.sym_filter, "symbol name",
"symbol to annotate"),
- OPT_BOOLEAN('z', "zero", &zero,
- "zero history across updates"),
- OPT_INTEGER('F', "freq", &freq,
- "profile at this frequency"),
- OPT_INTEGER('E', "entries", &print_entries,
+ OPT_BOOLEAN('z', "zero", &top.zero, "zero history across updates"),
+ OPT_UINTEGER('F', "freq", &opts->user_freq, "profile at this frequency"),
+ OPT_INTEGER('E', "entries", &top.print_entries,
"display this many functions"),
- OPT_BOOLEAN('U', "hide_user_symbols", &hide_user_symbols,
+ OPT_BOOLEAN('U', "hide_user_symbols", &top.hide_user_symbols,
"hide user symbols"),
- OPT_BOOLEAN('v', "verbose", &verbose,
+ OPT_BOOLEAN(0, "tui", &top.use_tui, "Use the TUI interface"),
+ OPT_BOOLEAN(0, "stdio", &top.use_stdio, "Use the stdio interface"),
+ OPT_INCR('v', "verbose", &verbose,
"be more verbose (show counter open errors, etc)"),
+ OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
+ "sort by key(s): pid, comm, dso, symbol, parent, cpu, srcline, ..."
+ " Please refer the man page for the complete list."),
+ OPT_STRING(0, "fields", &field_order, "key[,keys...]",
+ "output field(s): overhead, period, sample plus all of sort keys"),
+ OPT_BOOLEAN('n', "show-nr-samples", &symbol_conf.show_nr_samples,
+ "Show a column with the number of samples"),
+ OPT_CALLBACK_NOOPT('g', NULL, &top.record_opts,
+ NULL, "enables call-graph recording",
+ &callchain_opt),
+ OPT_CALLBACK(0, "call-graph", &top.record_opts,
+ "mode[,dump_size]", record_callchain_help,
+ &parse_callchain_opt),
+ OPT_BOOLEAN(0, "children", &symbol_conf.cumulate_callchain,
+ "Accumulate callchains of children and show total overhead as well"),
+ OPT_INTEGER(0, "max-stack", &top.max_stack,
+ "Set the maximum stack depth when parsing the callchain. "
+ "Default: " __stringify(PERF_MAX_STACK_DEPTH)),
+ OPT_CALLBACK(0, "ignore-callees", NULL, "regex",
+ "ignore callees of these functions in call graphs",
+ report_parse_ignore_callees_opt),
+ OPT_BOOLEAN(0, "show-total-period", &symbol_conf.show_total_period,
+ "Show a column with the sum of periods"),
+ OPT_STRING(0, "dsos", &symbol_conf.dso_list_str, "dso[,dso...]",
+ "only consider symbols in these dsos"),
+ OPT_STRING(0, "comms", &symbol_conf.comm_list_str, "comm[,comm...]",
+ "only consider symbols in these comms"),
+ OPT_STRING(0, "symbols", &symbol_conf.sym_list_str, "symbol[,symbol...]",
+ "only consider these symbols"),
+ OPT_BOOLEAN(0, "source", &symbol_conf.annotate_src,
+ "Interleave source code with assembly code (default)"),
+ OPT_BOOLEAN(0, "asm-raw", &symbol_conf.annotate_asm_raw,
+ "Display raw encoding of assembly instructions (default)"),
+ OPT_STRING(0, "objdump", &objdump_path, "path",
+ "objdump binary to use for disassembly and annotations"),
+ OPT_STRING('M', "disassembler-style", &disassembler_style, "disassembler style",
+ "Specify disassembler style (e.g. -M intel for intel syntax)"),
+ OPT_STRING('u', "uid", &target->uid_str, "user", "user to profile"),
+ OPT_CALLBACK(0, "percent-limit", &top, "percent",
+ "Don't show entries under that percent", parse_percent_limit),
+ OPT_CALLBACK(0, "percentage", NULL, "relative|absolute",
+ "How to display percentage of filtered entries", parse_filter_percentage),
OPT_END()
-};
-
-int cmd_top(int argc, const char **argv, const char *prefix __used)
-{
- int counter;
+ };
+ const char * const top_usage[] = {
+ "perf top [<options>]",
+ NULL
+ };
+
+ top.evlist = perf_evlist__new();
+ if (top.evlist == NULL)
+ return -ENOMEM;
- page_size = sysconf(_SC_PAGE_SIZE);
+ perf_config(perf_top_config, &top);
argc = parse_options(argc, argv, options, top_usage, 0);
if (argc)
usage_with_options(top_usage, options);
- /* CPU and PID are mutually exclusive */
- if (target_pid != -1 && profile_cpu != -1) {
- printf("WARNING: PID switch overriding CPU\n");
- sleep(1);
- profile_cpu = -1;
+ sort__mode = SORT_MODE__TOP;
+ /* display thread wants entries to be collapsed in a different tree */
+ sort__need_collapse = 1;
+
+ if (setup_sorting() < 0) {
+ if (sort_order)
+ parse_options_usage(top_usage, options, "s", 1);
+ if (field_order)
+ parse_options_usage(sort_order ? NULL : top_usage,
+ options, "fields", 0);
+ goto out_delete_evlist;
}
- if (!nr_counters)
- nr_counters = 1;
+ if (top.use_stdio)
+ use_browser = 0;
+ else if (top.use_tui)
+ use_browser = 1;
- symbol_conf.priv_size = (sizeof(struct sym_entry) +
- (nr_counters + 1) * sizeof(unsigned long));
+ setup_browser(false);
- symbol_conf.try_vmlinux_path = (symbol_conf.vmlinux_name == NULL);
- if (symbol__init() < 0)
- return -1;
+ status = target__validate(target);
+ if (status) {
+ target__strerror(target, status, errbuf, BUFSIZ);
+ ui__warning("%s\n", errbuf);
+ }
- if (delay_secs < 1)
- delay_secs = 1;
+ status = target__parse_uid(target);
+ if (status) {
+ int saved_errno = errno;
- /*
- * User specified count overrides default frequency.
- */
- if (default_interval)
- freq = 0;
- else if (freq) {
- default_interval = freq;
- } else {
- fprintf(stderr, "frequency and count are zero, aborting\n");
- exit(EXIT_FAILURE);
+ target__strerror(target, status, errbuf, BUFSIZ);
+ ui__error("%s\n", errbuf);
+
+ status = -saved_errno;
+ goto out_delete_evlist;
}
- /*
- * Fill in the ones not specifically initialized via -c:
- */
- for (counter = 0; counter < nr_counters; counter++) {
- if (attrs[counter].sample_period)
- continue;
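+	/* No pid, tid, uid or CPU list was given: default to system-wide profiling. */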
+ if (target__none(target))
+ target->system_wide = true;
+
+ if (perf_evlist__create_maps(top.evlist, target) < 0)
+ usage_with_options(top_usage, options);
- attrs[counter].sample_period = default_interval;
+ if (!top.evlist->nr_entries &&
+ perf_evlist__add_default(top.evlist) < 0) {
+ ui__error("Not enough memory for event selector list\n");
+ goto out_delete_evlist;
}
- if (target_pid != -1 || profile_cpu != -1)
- nr_cpus = 1;
- else
- nr_cpus = read_cpu_map();
+ symbol_conf.nr_events = top.evlist->nr_entries;
+
+ if (top.delay_secs < 1)
+ top.delay_secs = 1;
+
+ if (record_opts__config(opts)) {
+ status = -EINVAL;
+ goto out_delete_evlist;
+ }
+
+ top.sym_evsel = perf_evlist__first(top.evlist);
- get_term_dimensions(&winsize);
- if (print_entries == 0) {
- update_print_entries(&winsize);
- signal(SIGWINCH, sig_winch_handler);
+ if (!symbol_conf.use_callchain) {
+ symbol_conf.cumulate_callchain = false;
+ perf_hpp__cancel_cumulate();
}
- return __cmd_top();
+ symbol_conf.priv_size = sizeof(struct annotation);
+
+ symbol_conf.try_vmlinux_path = (symbol_conf.vmlinux_name == NULL);
+ if (symbol__init() < 0)
+ return -1;
+
+ sort__setup_elide(stdout);
+
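+	/*
+	 * If the user did not cap the display with -E, size it to the current
+	 * terminal and keep tracking window size changes via SIGWINCH.
+	 */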
+ get_term_dimensions(&top.winsize);
+ if (top.print_entries == 0) {
+ struct sigaction act = {
+ .sa_sigaction = perf_top__sig_winch,
+ .sa_flags = SA_SIGINFO,
+ };
+ perf_top__update_print_entries(&top);
+ sigaction(SIGWINCH, &act, NULL);
+ }
+
+ status = __cmd_top(&top);
+
+out_delete_evlist:
+ perf_evlist__delete(top.evlist);
+
+ return status;
}
diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c
index 407041d20de..f954c26de23 100644
--- a/tools/perf/builtin-trace.c
+++ b/tools/perf/builtin-trace.c
@@ -1,628 +1,2397 @@
+#include <traceevent/event-parse.h>
#include "builtin.h"
-
-#include "util/util.h"
-#include "util/cache.h"
-#include "util/symbol.h"
-#include "util/thread.h"
-#include "util/header.h"
-#include "util/exec_cmd.h"
-#include "util/trace-event.h"
+#include "util/color.h"
+#include "util/debug.h"
+#include "util/evlist.h"
+#include "util/machine.h"
#include "util/session.h"
+#include "util/thread.h"
+#include "util/parse-options.h"
+#include "util/strlist.h"
+#include "util/intlist.h"
+#include "util/thread_map.h"
+#include "util/stat.h"
+#include "trace-event.h"
+#include "util/parse-events.h"
+
+#include <libaudit.h>
+#include <stdlib.h>
+#include <sys/eventfd.h>
+#include <sys/mman.h>
+#include <linux/futex.h>
+
+/* For older distros: */
+#ifndef MAP_STACK
+# define MAP_STACK 0x20000
+#endif
+
+#ifndef MADV_HWPOISON
+# define MADV_HWPOISON 100
+#endif
+
+#ifndef MADV_MERGEABLE
+# define MADV_MERGEABLE 12
+#endif
+
+#ifndef MADV_UNMERGEABLE
+# define MADV_UNMERGEABLE 13
+#endif
+
+#ifndef EFD_SEMAPHORE
+# define EFD_SEMAPHORE 1
+#endif
+
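+/*
+ * A tp_field remembers where a tracepoint field lives inside the raw sample
+ * payload and how to read it back: either as an unsigned integer of the
+ * right width (optionally byte-swapped) or as a pointer into the raw data.
+ */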
+struct tp_field {
+ int offset;
+ union {
+ u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
+ void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
+ };
+};
+
+#define TP_UINT_FIELD(bits) \
+static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
+{ \
+ return *(u##bits *)(sample->raw_data + field->offset); \
+}
-static char const *script_name;
-static char const *generate_script_lang;
+TP_UINT_FIELD(8);
+TP_UINT_FIELD(16);
+TP_UINT_FIELD(32);
+TP_UINT_FIELD(64);
-static int default_start_script(const char *script __unused,
- int argc __unused,
- const char **argv __unused)
+#define TP_UINT_FIELD__SWAPPED(bits) \
+static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
+{ \
+ u##bits value = *(u##bits *)(sample->raw_data + field->offset); \
+ return bswap_##bits(value);\
+}
+
+TP_UINT_FIELD__SWAPPED(16);
+TP_UINT_FIELD__SWAPPED(32);
+TP_UINT_FIELD__SWAPPED(64);
+
+static int tp_field__init_uint(struct tp_field *field,
+ struct format_field *format_field,
+ bool needs_swap)
{
+ field->offset = format_field->offset;
+
+ switch (format_field->size) {
+ case 1:
+ field->integer = tp_field__u8;
+ break;
+ case 2:
+ field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
+ break;
+ case 4:
+ field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
+ break;
+ case 8:
+ field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
+ break;
+ default:
+ return -1;
+ }
+
return 0;
}
-static int default_stop_script(void)
+static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
- return 0;
+ return sample->raw_data + field->offset;
}
-static int default_generate_script(const char *outfile __unused)
+static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
{
+ field->offset = format_field->offset;
+ field->pointer = tp_field__ptr;
return 0;
}
-static struct scripting_ops default_scripting_ops = {
- .start_script = default_start_script,
- .stop_script = default_stop_script,
- .process_event = print_event,
- .generate_script = default_generate_script,
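+/*
+ * Cached tracepoint fields per evsel: "id" is common to both raw_syscalls
+ * tracepoints, while sys_enter carries "args" and sys_exit carries "ret",
+ * hence the union.
+ */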
+struct syscall_tp {
+ struct tp_field id;
+ union {
+ struct tp_field args, ret;
+ };
};
-static struct scripting_ops *scripting_ops;
+static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
+ struct tp_field *field,
+ const char *name)
+{
+ struct format_field *format_field = perf_evsel__field(evsel, name);
-static void setup_scripting(void)
+ if (format_field == NULL)
+ return -1;
+
+ return tp_field__init_uint(field, format_field, evsel->needs_swap);
+}
+
+#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
+ ({ struct syscall_tp *sc = evsel->priv;\
+ perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
+
+static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
+ struct tp_field *field,
+ const char *name)
{
- /* make sure PERF_EXEC_PATH is set for scripts */
- perf_set_argv_exec_path(perf_exec_path());
+ struct format_field *format_field = perf_evsel__field(evsel, name);
- setup_perl_scripting();
- setup_python_scripting();
+ if (format_field == NULL)
+ return -1;
- scripting_ops = &default_scripting_ops;
+ return tp_field__init_ptr(field, format_field);
}
-static int cleanup_scripting(void)
+#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
+ ({ struct syscall_tp *sc = evsel->priv;\
+ perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
+
+static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
- return scripting_ops->stop_script();
+ zfree(&evsel->priv);
+ perf_evsel__delete(evsel);
}
-#include "util/parse-options.h"
+static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
+{
+ evsel->priv = malloc(sizeof(struct syscall_tp));
+ if (evsel->priv != NULL) {
+ if (perf_evsel__init_sc_tp_uint_field(evsel, id))
+ goto out_delete;
-#include "perf.h"
-#include "util/debug.h"
+ evsel->handler = handler;
+ return 0;
+ }
+
+ return -ENOMEM;
+
+out_delete:
+ zfree(&evsel->priv);
+ return -ENOENT;
+}
+
+static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
+{
+ struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
+
+	/* older kernels (e.g., RHEL6) use syscalls:{enter,exit} */
+ if (evsel == NULL)
+ evsel = perf_evsel__newtp("syscalls", direction);
+
+ if (evsel) {
+ if (perf_evsel__init_syscall_tp(evsel, handler))
+ goto out_delete;
+ }
+
+ return evsel;
+
+out_delete:
+ perf_evsel__delete_priv(evsel);
+ return NULL;
+}
-#include "util/trace-event.h"
-#include "util/exec_cmd.h"
+#define perf_evsel__sc_tp_uint(evsel, name, sample) \
+ ({ struct syscall_tp *fields = evsel->priv; \
+ fields->name.integer(&fields->name, sample); })
-static char const *input_name = "perf.data";
+#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
+ ({ struct syscall_tp *fields = evsel->priv; \
+ fields->name.pointer(&fields->name, sample); })
-static int process_sample_event(event_t *event, struct perf_session *session)
+static int perf_evlist__add_syscall_newtp(struct perf_evlist *evlist,
+ void *sys_enter_handler,
+ void *sys_exit_handler)
{
- struct sample_data data;
+ int ret = -1;
+ struct perf_evsel *sys_enter, *sys_exit;
+
+ sys_enter = perf_evsel__syscall_newtp("sys_enter", sys_enter_handler);
+ if (sys_enter == NULL)
+ goto out;
+
+ if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
+ goto out_delete_sys_enter;
+
+ sys_exit = perf_evsel__syscall_newtp("sys_exit", sys_exit_handler);
+ if (sys_exit == NULL)
+ goto out_delete_sys_enter;
+
+ if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
+ goto out_delete_sys_exit;
+
+ perf_evlist__add(evlist, sys_enter);
+ perf_evlist__add(evlist, sys_exit);
+
+ ret = 0;
+out:
+ return ret;
+
+out_delete_sys_exit:
+ perf_evsel__delete_priv(sys_exit);
+out_delete_sys_enter:
+ perf_evsel__delete_priv(sys_enter);
+ goto out;
+}
+
+
+struct syscall_arg {
+ unsigned long val;
struct thread *thread;
+ struct trace *trace;
+ void *parm;
+ u8 idx;
+ u8 mask;
+};
- memset(&data, 0, sizeof(data));
- data.time = -1;
- data.cpu = -1;
- data.period = 1;
+struct strarray {
+ int offset;
+ int nr_entries;
+ const char **entries;
+};
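+/*
+ * A strarray maps small integer argument values (minus an optional offset)
+ * to human readable names; the DEFINE_STRARRAY* macros below build one from
+ * a plain array of strings.
+ */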
- event__parse_sample(event, session->sample_type, &data);
+#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
+ .nr_entries = ARRAY_SIZE(array), \
+ .entries = array, \
+}
- dump_printf("(IP, %d): %d/%d: %#Lx period: %Ld\n", event->header.misc,
- data.pid, data.tid, data.ip, data.period);
+#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
+ .offset = off, \
+ .nr_entries = ARRAY_SIZE(array), \
+ .entries = array, \
+}
- thread = perf_session__findnew(session, event->ip.pid);
- if (thread == NULL) {
- pr_debug("problem processing %d event, skipping it.\n",
- event->header.type);
- return -1;
+static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
+ const char *intfmt,
+ struct syscall_arg *arg)
+{
+ struct strarray *sa = arg->parm;
+ int idx = arg->val - sa->offset;
+
+ if (idx < 0 || idx >= sa->nr_entries)
+ return scnprintf(bf, size, intfmt, arg->val);
+
+ return scnprintf(bf, size, "%s", sa->entries[idx]);
+}
+
+static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
+ struct syscall_arg *arg)
+{
+ return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
+}
+
+#define SCA_STRARRAY syscall_arg__scnprintf_strarray
+
+#if defined(__i386__) || defined(__x86_64__)
+/*
+ * FIXME: Make this available to all arches as soon as the ioctl beautifier
+ * gets rewritten to support all arches.
+ */
+static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
+ struct syscall_arg *arg)
+{
+ return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
+}
+
+#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
+#endif /* defined(__i386__) || defined(__x86_64__) */
+
+static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
+ struct syscall_arg *arg);
+
+#define SCA_FD syscall_arg__scnprintf_fd
+
+static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
+ struct syscall_arg *arg)
+{
+ int fd = arg->val;
+
+ if (fd == AT_FDCWD)
+ return scnprintf(bf, size, "CWD");
+
+ return syscall_arg__scnprintf_fd(bf, size, arg);
+}
+
+#define SCA_FDAT syscall_arg__scnprintf_fd_at
+
+static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
+ struct syscall_arg *arg);
+
+#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
+
+static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
+ struct syscall_arg *arg)
+{
+ return scnprintf(bf, size, "%#lx", arg->val);
+}
+
+#define SCA_HEX syscall_arg__scnprintf_hex
+
+static size_t syscall_arg__scnprintf_mmap_prot(char *bf, size_t size,
+ struct syscall_arg *arg)
+{
+ int printed = 0, prot = arg->val;
+
+ if (prot == PROT_NONE)
+ return scnprintf(bf, size, "NONE");
+#define P_MMAP_PROT(n) \
+ if (prot & PROT_##n) { \
+ printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
+ prot &= ~PROT_##n; \
}
- if (session->sample_type & PERF_SAMPLE_RAW) {
- /*
- * FIXME: better resolve from pid from the struct trace_entry
- * field, although it should be the same than this perf
- * event pid
- */
- scripting_ops->process_event(data.cpu, data.raw_data,
- data.raw_size,
- data.time, thread->comm);
+ P_MMAP_PROT(EXEC);
+ P_MMAP_PROT(READ);
+ P_MMAP_PROT(WRITE);
+#ifdef PROT_SEM
+ P_MMAP_PROT(SEM);
+#endif
+ P_MMAP_PROT(GROWSDOWN);
+ P_MMAP_PROT(GROWSUP);
+#undef P_MMAP_PROT
+
+ if (prot)
+ printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", prot);
+
+ return printed;
+}
+
+#define SCA_MMAP_PROT syscall_arg__scnprintf_mmap_prot
+
+static size_t syscall_arg__scnprintf_mmap_flags(char *bf, size_t size,
+ struct syscall_arg *arg)
+{
+ int printed = 0, flags = arg->val;
+
+#define P_MMAP_FLAG(n) \
+ if (flags & MAP_##n) { \
+ printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
+ flags &= ~MAP_##n; \
}
- session->events_stats.total += data.period;
- return 0;
+ P_MMAP_FLAG(SHARED);
+ P_MMAP_FLAG(PRIVATE);
+#ifdef MAP_32BIT
+ P_MMAP_FLAG(32BIT);
+#endif
+ P_MMAP_FLAG(ANONYMOUS);
+ P_MMAP_FLAG(DENYWRITE);
+ P_MMAP_FLAG(EXECUTABLE);
+ P_MMAP_FLAG(FILE);
+ P_MMAP_FLAG(FIXED);
+ P_MMAP_FLAG(GROWSDOWN);
+#ifdef MAP_HUGETLB
+ P_MMAP_FLAG(HUGETLB);
+#endif
+ P_MMAP_FLAG(LOCKED);
+ P_MMAP_FLAG(NONBLOCK);
+ P_MMAP_FLAG(NORESERVE);
+ P_MMAP_FLAG(POPULATE);
+ P_MMAP_FLAG(STACK);
+#ifdef MAP_UNINITIALIZED
+ P_MMAP_FLAG(UNINITIALIZED);
+#endif
+#undef P_MMAP_FLAG
+
+ if (flags)
+ printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
+
+ return printed;
}
-static struct perf_event_ops event_ops = {
- .sample = process_sample_event,
- .comm = event__process_comm,
-};
+#define SCA_MMAP_FLAGS syscall_arg__scnprintf_mmap_flags
+
+static size_t syscall_arg__scnprintf_madvise_behavior(char *bf, size_t size,
+ struct syscall_arg *arg)
+{
+ int behavior = arg->val;
+
+ switch (behavior) {
+#define P_MADV_BHV(n) case MADV_##n: return scnprintf(bf, size, #n)
+ P_MADV_BHV(NORMAL);
+ P_MADV_BHV(RANDOM);
+ P_MADV_BHV(SEQUENTIAL);
+ P_MADV_BHV(WILLNEED);
+ P_MADV_BHV(DONTNEED);
+ P_MADV_BHV(REMOVE);
+ P_MADV_BHV(DONTFORK);
+ P_MADV_BHV(DOFORK);
+ P_MADV_BHV(HWPOISON);
+#ifdef MADV_SOFT_OFFLINE
+ P_MADV_BHV(SOFT_OFFLINE);
+#endif
+ P_MADV_BHV(MERGEABLE);
+ P_MADV_BHV(UNMERGEABLE);
+#ifdef MADV_HUGEPAGE
+ P_MADV_BHV(HUGEPAGE);
+#endif
+#ifdef MADV_NOHUGEPAGE
+ P_MADV_BHV(NOHUGEPAGE);
+#endif
+#ifdef MADV_DONTDUMP
+ P_MADV_BHV(DONTDUMP);
+#endif
+#ifdef MADV_DODUMP
+ P_MADV_BHV(DODUMP);
+#endif
+#undef P_MADV_BHV
+ default: break;
+ }
-static int __cmd_trace(struct perf_session *session)
+ return scnprintf(bf, size, "%#x", behavior);
+}
+
+#define SCA_MADV_BHV syscall_arg__scnprintf_madvise_behavior
+
+static size_t syscall_arg__scnprintf_flock(char *bf, size_t size,
+ struct syscall_arg *arg)
{
- return perf_session__process_events(session, &event_ops);
+ int printed = 0, op = arg->val;
+
+ if (op == 0)
+ return scnprintf(bf, size, "NONE");
+#define P_CMD(cmd) \
+ if ((op & LOCK_##cmd) == LOCK_##cmd) { \
+ printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #cmd); \
+ op &= ~LOCK_##cmd; \
+ }
+
+ P_CMD(SH);
+ P_CMD(EX);
+ P_CMD(NB);
+ P_CMD(UN);
+ P_CMD(MAND);
+ P_CMD(RW);
+ P_CMD(READ);
+ P_CMD(WRITE);
+#undef P_CMD
+
+ if (op)
+ printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", op);
+
+ return printed;
}
-struct script_spec {
- struct list_head node;
- struct scripting_ops *ops;
- char spec[0];
+#define SCA_FLOCK syscall_arg__scnprintf_flock
+
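+/*
+ * Beautify the futex op argument and, depending on the command, flag the
+ * timeout/uaddr2/val3 arguments that the command ignores via arg->mask so
+ * they are not printed.
+ */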
+static size_t syscall_arg__scnprintf_futex_op(char *bf, size_t size, struct syscall_arg *arg)
+{
+ enum syscall_futex_args {
+ SCF_UADDR = (1 << 0),
+ SCF_OP = (1 << 1),
+ SCF_VAL = (1 << 2),
+ SCF_TIMEOUT = (1 << 3),
+ SCF_UADDR2 = (1 << 4),
+ SCF_VAL3 = (1 << 5),
+ };
+ int op = arg->val;
+ int cmd = op & FUTEX_CMD_MASK;
+ size_t printed = 0;
+
+ switch (cmd) {
+#define P_FUTEX_OP(n) case FUTEX_##n: printed = scnprintf(bf, size, #n);
+ P_FUTEX_OP(WAIT); arg->mask |= SCF_VAL3|SCF_UADDR2; break;
+ P_FUTEX_OP(WAKE); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
+ P_FUTEX_OP(FD); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
+ P_FUTEX_OP(REQUEUE); arg->mask |= SCF_VAL3|SCF_TIMEOUT; break;
+ P_FUTEX_OP(CMP_REQUEUE); arg->mask |= SCF_TIMEOUT; break;
+ P_FUTEX_OP(CMP_REQUEUE_PI); arg->mask |= SCF_TIMEOUT; break;
+ P_FUTEX_OP(WAKE_OP); break;
+ P_FUTEX_OP(LOCK_PI); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
+ P_FUTEX_OP(UNLOCK_PI); arg->mask |= SCF_VAL3|SCF_UADDR2|SCF_TIMEOUT; break;
+ P_FUTEX_OP(TRYLOCK_PI); arg->mask |= SCF_VAL3|SCF_UADDR2; break;
+ P_FUTEX_OP(WAIT_BITSET); arg->mask |= SCF_UADDR2; break;
+ P_FUTEX_OP(WAKE_BITSET); arg->mask |= SCF_UADDR2; break;
+ P_FUTEX_OP(WAIT_REQUEUE_PI); break;
+ default: printed = scnprintf(bf, size, "%#x", cmd); break;
+ }
+
+ if (op & FUTEX_PRIVATE_FLAG)
+ printed += scnprintf(bf + printed, size - printed, "|PRIV");
+
+ if (op & FUTEX_CLOCK_REALTIME)
+ printed += scnprintf(bf + printed, size - printed, "|CLKRT");
+
+ return printed;
+}
+
+#define SCA_FUTEX_OP syscall_arg__scnprintf_futex_op
+
+static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
+static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);
+
+static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
+static DEFINE_STRARRAY(itimers);
+
+static const char *whences[] = { "SET", "CUR", "END",
+#ifdef SEEK_DATA
+"DATA",
+#endif
+#ifdef SEEK_HOLE
+"HOLE",
+#endif
};
+static DEFINE_STRARRAY(whences);
-LIST_HEAD(script_specs);
+static const char *fcntl_cmds[] = {
+ "DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
+ "SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
+ "F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
+ "F_GETOWNER_UIDS",
+};
+static DEFINE_STRARRAY(fcntl_cmds);
-static struct script_spec *script_spec__new(const char *spec,
- struct scripting_ops *ops)
+static const char *rlimit_resources[] = {
+ "CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
+ "MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
+ "RTTIME",
+};
+static DEFINE_STRARRAY(rlimit_resources);
+
+static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
+static DEFINE_STRARRAY(sighow);
+
+static const char *clockid[] = {
+ "REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
+ "MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE",
+};
+static DEFINE_STRARRAY(clockid);
+
+static const char *socket_families[] = {
+ "UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
+ "BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
+ "SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
+ "RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
+ "BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
+ "ALG", "NFC", "VSOCK",
+};
+static DEFINE_STRARRAY(socket_families);
+
+#ifndef SOCK_TYPE_MASK
+#define SOCK_TYPE_MASK 0xf
+#endif
+
+static size_t syscall_arg__scnprintf_socket_type(char *bf, size_t size,
+ struct syscall_arg *arg)
{
- struct script_spec *s = malloc(sizeof(*s) + strlen(spec) + 1);
+ size_t printed;
+ int type = arg->val,
+ flags = type & ~SOCK_TYPE_MASK;
+
+ type &= SOCK_TYPE_MASK;
+ /*
+ * Can't use a strarray, MIPS may override for ABI reasons.
+ */
+ switch (type) {
+#define P_SK_TYPE(n) case SOCK_##n: printed = scnprintf(bf, size, #n); break;
+ P_SK_TYPE(STREAM);
+ P_SK_TYPE(DGRAM);
+ P_SK_TYPE(RAW);
+ P_SK_TYPE(RDM);
+ P_SK_TYPE(SEQPACKET);
+ P_SK_TYPE(DCCP);
+ P_SK_TYPE(PACKET);
+#undef P_SK_TYPE
+ default:
+ printed = scnprintf(bf, size, "%#x", type);
+ }
- if (s != NULL) {
- strcpy(s->spec, spec);
- s->ops = ops;
+#define P_SK_FLAG(n) \
+ if (flags & SOCK_##n) { \
+ printed += scnprintf(bf + printed, size - printed, "|%s", #n); \
+ flags &= ~SOCK_##n; \
}
- return s;
+ P_SK_FLAG(CLOEXEC);
+ P_SK_FLAG(NONBLOCK);
+#undef P_SK_FLAG
+
+ if (flags)
+ printed += scnprintf(bf + printed, size - printed, "|%#x", flags);
+
+ return printed;
}
-static void script_spec__delete(struct script_spec *s)
+#define SCA_SK_TYPE syscall_arg__scnprintf_socket_type
+
+#ifndef MSG_PROBE
+#define MSG_PROBE 0x10
+#endif
+#ifndef MSG_WAITFORONE
+#define MSG_WAITFORONE 0x10000
+#endif
+#ifndef MSG_SENDPAGE_NOTLAST
+#define MSG_SENDPAGE_NOTLAST 0x20000
+#endif
+#ifndef MSG_FASTOPEN
+#define MSG_FASTOPEN 0x20000000
+#endif
+
+static size_t syscall_arg__scnprintf_msg_flags(char *bf, size_t size,
+ struct syscall_arg *arg)
{
- free(s->spec);
- free(s);
+ int printed = 0, flags = arg->val;
+
+ if (flags == 0)
+ return scnprintf(bf, size, "NONE");
+#define P_MSG_FLAG(n) \
+ if (flags & MSG_##n) { \
+ printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
+ flags &= ~MSG_##n; \
+ }
+
+ P_MSG_FLAG(OOB);
+ P_MSG_FLAG(PEEK);
+ P_MSG_FLAG(DONTROUTE);
+ P_MSG_FLAG(TRYHARD);
+ P_MSG_FLAG(CTRUNC);
+ P_MSG_FLAG(PROBE);
+ P_MSG_FLAG(TRUNC);
+ P_MSG_FLAG(DONTWAIT);
+ P_MSG_FLAG(EOR);
+ P_MSG_FLAG(WAITALL);
+ P_MSG_FLAG(FIN);
+ P_MSG_FLAG(SYN);
+ P_MSG_FLAG(CONFIRM);
+ P_MSG_FLAG(RST);
+ P_MSG_FLAG(ERRQUEUE);
+ P_MSG_FLAG(NOSIGNAL);
+ P_MSG_FLAG(MORE);
+ P_MSG_FLAG(WAITFORONE);
+ P_MSG_FLAG(SENDPAGE_NOTLAST);
+ P_MSG_FLAG(FASTOPEN);
+ P_MSG_FLAG(CMSG_CLOEXEC);
+#undef P_MSG_FLAG
+
+ if (flags)
+ printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
+
+ return printed;
}
-static void script_spec__add(struct script_spec *s)
+#define SCA_MSG_FLAGS syscall_arg__scnprintf_msg_flags
+
+static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
+ struct syscall_arg *arg)
{
- list_add_tail(&s->node, &script_specs);
+ size_t printed = 0;
+ int mode = arg->val;
+
+ if (mode == F_OK) /* 0 */
+ return scnprintf(bf, size, "F");
+#define P_MODE(n) \
+ if (mode & n##_OK) { \
+ printed += scnprintf(bf + printed, size - printed, "%s", #n); \
+ mode &= ~n##_OK; \
+ }
+
+ P_MODE(R);
+ P_MODE(W);
+ P_MODE(X);
+#undef P_MODE
+
+ if (mode)
+ printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
+
+ return printed;
}
-static struct script_spec *script_spec__find(const char *spec)
+#define SCA_ACCMODE syscall_arg__scnprintf_access_mode
+
+static size_t syscall_arg__scnprintf_open_flags(char *bf, size_t size,
+ struct syscall_arg *arg)
{
- struct script_spec *s;
+ int printed = 0, flags = arg->val;
- list_for_each_entry(s, &script_specs, node)
- if (strcasecmp(s->spec, spec) == 0)
- return s;
- return NULL;
+ if (!(flags & O_CREAT))
+ arg->mask |= 1 << (arg->idx + 1); /* Mask the mode parm */
+
+ if (flags == 0)
+ return scnprintf(bf, size, "RDONLY");
+#define P_FLAG(n) \
+ if (flags & O_##n) { \
+ printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
+ flags &= ~O_##n; \
+ }
+
+ P_FLAG(APPEND);
+ P_FLAG(ASYNC);
+ P_FLAG(CLOEXEC);
+ P_FLAG(CREAT);
+ P_FLAG(DIRECT);
+ P_FLAG(DIRECTORY);
+ P_FLAG(EXCL);
+ P_FLAG(LARGEFILE);
+ P_FLAG(NOATIME);
+ P_FLAG(NOCTTY);
+#ifdef O_NONBLOCK
+ P_FLAG(NONBLOCK);
+#elif O_NDELAY
+ P_FLAG(NDELAY);
+#endif
+#ifdef O_PATH
+ P_FLAG(PATH);
+#endif
+ P_FLAG(RDWR);
+#ifdef O_DSYNC
+ if ((flags & O_SYNC) == O_SYNC)
+ printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", "SYNC");
+ else {
+ P_FLAG(DSYNC);
+ }
+#else
+ P_FLAG(SYNC);
+#endif
+ P_FLAG(TRUNC);
+ P_FLAG(WRONLY);
+#undef P_FLAG
+
+ if (flags)
+ printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
+
+ return printed;
}
-static struct script_spec *script_spec__findnew(const char *spec,
- struct scripting_ops *ops)
+#define SCA_OPEN_FLAGS syscall_arg__scnprintf_open_flags
+
+static size_t syscall_arg__scnprintf_eventfd_flags(char *bf, size_t size,
+ struct syscall_arg *arg)
{
- struct script_spec *s = script_spec__find(spec);
+ int printed = 0, flags = arg->val;
+
+ if (flags == 0)
+ return scnprintf(bf, size, "NONE");
+#define P_FLAG(n) \
+ if (flags & EFD_##n) { \
+ printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
+ flags &= ~EFD_##n; \
+ }
+
+ P_FLAG(SEMAPHORE);
+ P_FLAG(CLOEXEC);
+ P_FLAG(NONBLOCK);
+#undef P_FLAG
- if (s)
- return s;
+ if (flags)
+ printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
- s = script_spec__new(spec, ops);
- if (!s)
- goto out_delete_spec;
+ return printed;
+}
- script_spec__add(s);
+#define SCA_EFD_FLAGS syscall_arg__scnprintf_eventfd_flags
- return s;
+static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
+ struct syscall_arg *arg)
+{
+ int printed = 0, flags = arg->val;
-out_delete_spec:
- script_spec__delete(s);
+#define P_FLAG(n) \
+ if (flags & O_##n) { \
+ printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
+ flags &= ~O_##n; \
+ }
- return NULL;
+ P_FLAG(CLOEXEC);
+ P_FLAG(NONBLOCK);
+#undef P_FLAG
+
+ if (flags)
+ printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
+
+ return printed;
}
-int script_spec_register(const char *spec, struct scripting_ops *ops)
+#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
+
+static size_t syscall_arg__scnprintf_signum(char *bf, size_t size, struct syscall_arg *arg)
{
- struct script_spec *s;
+ int sig = arg->val;
+
+ switch (sig) {
+#define P_SIGNUM(n) case SIG##n: return scnprintf(bf, size, #n)
+ P_SIGNUM(HUP);
+ P_SIGNUM(INT);
+ P_SIGNUM(QUIT);
+ P_SIGNUM(ILL);
+ P_SIGNUM(TRAP);
+ P_SIGNUM(ABRT);
+ P_SIGNUM(BUS);
+ P_SIGNUM(FPE);
+ P_SIGNUM(KILL);
+ P_SIGNUM(USR1);
+ P_SIGNUM(SEGV);
+ P_SIGNUM(USR2);
+ P_SIGNUM(PIPE);
+ P_SIGNUM(ALRM);
+ P_SIGNUM(TERM);
+ P_SIGNUM(CHLD);
+ P_SIGNUM(CONT);
+ P_SIGNUM(STOP);
+ P_SIGNUM(TSTP);
+ P_SIGNUM(TTIN);
+ P_SIGNUM(TTOU);
+ P_SIGNUM(URG);
+ P_SIGNUM(XCPU);
+ P_SIGNUM(XFSZ);
+ P_SIGNUM(VTALRM);
+ P_SIGNUM(PROF);
+ P_SIGNUM(WINCH);
+ P_SIGNUM(IO);
+ P_SIGNUM(PWR);
+ P_SIGNUM(SYS);
+#ifdef SIGEMT
+ P_SIGNUM(EMT);
+#endif
+#ifdef SIGSTKFLT
+ P_SIGNUM(STKFLT);
+#endif
+#ifdef SIGSWI
+ P_SIGNUM(SWI);
+#endif
+ default: break;
+ }
- s = script_spec__find(spec);
- if (s)
- return -1;
+ return scnprintf(bf, size, "%#x", sig);
+}
- s = script_spec__findnew(spec, ops);
- if (!s)
- return -1;
+#define SCA_SIGNUM syscall_arg__scnprintf_signum
+
+#if defined(__i386__) || defined(__x86_64__)
+/*
+ * FIXME: Make this available to all arches.
+ */
+#define TCGETS 0x5401
+
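+/*
+ * Terminal ioctl request names, indexed by request number minus TCGETS
+ * (0x5401); the designated initializers bridge the gaps in the numbering.
+ */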
+static const char *tioctls[] = {
+ "TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
+ "TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
+ "TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
+ "TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
+ "TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
+ "TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
+ "TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
+ "TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
+ "TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
+ "TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
+ "TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
+ [0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
+ "TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
+ "TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
+ "TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
+};
- return 0;
+static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
+#endif /* defined(__i386__) || defined(__x86_64__) */
+
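+/*
+ * Convenience initializer for the syscall_fmts table below: beautify
+ * argument "arg" with the generic strarray printer, using strarray__array
+ * as its lookup table (the "name" parameter is purely documentary).
+ */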
+#define STRARRAY(arg, name, array) \
+ .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
+ .arg_parm = { [arg] = &strarray__##array, }
+
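+/*
+ * Per-syscall formatting hints, sorted by name so that syscall_fmt__find()
+ * can use bsearch(): an optional alias for differently named tracepoints,
+ * per-argument pretty-printers plus their parameters, and flags describing
+ * how the return value should be reported.
+ */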
+static struct syscall_fmt {
+ const char *name;
+ const char *alias;
+ size_t (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
+ void *arg_parm[6];
+ bool errmsg;
+ bool timeout;
+ bool hexret;
+} syscall_fmts[] = {
+ { .name = "access", .errmsg = true,
+ .arg_scnprintf = { [1] = SCA_ACCMODE, /* mode */ }, },
+ { .name = "arch_prctl", .errmsg = true, .alias = "prctl", },
+ { .name = "brk", .hexret = true,
+ .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
+ { .name = "clock_gettime", .errmsg = true, STRARRAY(0, clk_id, clockid), },
+ { .name = "close", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
+ { .name = "connect", .errmsg = true, },
+ { .name = "dup", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+ { .name = "dup2", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+ { .name = "dup3", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+ { .name = "epoll_ctl", .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
+ { .name = "eventfd2", .errmsg = true,
+ .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
+ { .name = "faccessat", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
+ { .name = "fadvise64", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+ { .name = "fallocate", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+ { .name = "fchdir", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+ { .name = "fchmod", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+ { .name = "fchmodat", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
+ { .name = "fchown", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+ { .name = "fchownat", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
+ { .name = "fcntl", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FD, /* fd */
+ [1] = SCA_STRARRAY, /* cmd */ },
+ .arg_parm = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
+ { .name = "fdatasync", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+ { .name = "flock", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FD, /* fd */
+ [1] = SCA_FLOCK, /* cmd */ }, },
+ { .name = "fsetxattr", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+ { .name = "fstat", .errmsg = true, .alias = "newfstat",
+ .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+ { .name = "fstatat", .errmsg = true, .alias = "newfstatat",
+ .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
+ { .name = "fstatfs", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+ { .name = "fsync", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+ { .name = "ftruncate", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+ { .name = "futex", .errmsg = true,
+ .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
+ { .name = "futimesat", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
+ { .name = "getdents", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+ { .name = "getdents64", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+ { .name = "getitimer", .errmsg = true, STRARRAY(0, which, itimers), },
+ { .name = "getrlimit", .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
+ { .name = "ioctl", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FD, /* fd */
+#if defined(__i386__) || defined(__x86_64__)
+/*
+ * FIXME: Make this available to all arches.
+ */
+ [1] = SCA_STRHEXARRAY, /* cmd */
+ [2] = SCA_HEX, /* arg */ },
+ .arg_parm = { [1] = &strarray__tioctls, /* cmd */ }, },
+#else
+ [2] = SCA_HEX, /* arg */ }, },
+#endif
+ { .name = "kill", .errmsg = true,
+ .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
+ { .name = "linkat", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
+ { .name = "lseek", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FD, /* fd */
+ [2] = SCA_STRARRAY, /* whence */ },
+ .arg_parm = { [2] = &strarray__whences, /* whence */ }, },
+ { .name = "lstat", .errmsg = true, .alias = "newlstat", },
+ { .name = "madvise", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_HEX, /* start */
+ [2] = SCA_MADV_BHV, /* behavior */ }, },
+ { .name = "mkdirat", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
+ { .name = "mknodat", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
+ { .name = "mlock", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
+ { .name = "mlockall", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
+ { .name = "mmap", .hexret = true,
+ .arg_scnprintf = { [0] = SCA_HEX, /* addr */
+ [2] = SCA_MMAP_PROT, /* prot */
+ [3] = SCA_MMAP_FLAGS, /* flags */
+ [4] = SCA_FD, /* fd */ }, },
+ { .name = "mprotect", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_HEX, /* start */
+ [2] = SCA_MMAP_PROT, /* prot */ }, },
+ { .name = "mremap", .hexret = true,
+ .arg_scnprintf = { [0] = SCA_HEX, /* addr */
+ [4] = SCA_HEX, /* new_addr */ }, },
+ { .name = "munlock", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
+ { .name = "munmap", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
+ { .name = "name_to_handle_at", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
+ { .name = "newfstatat", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
+ { .name = "open", .errmsg = true,
+ .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
+ { .name = "open_by_handle_at", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
+ [2] = SCA_OPEN_FLAGS, /* flags */ }, },
+ { .name = "openat", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
+ [2] = SCA_OPEN_FLAGS, /* flags */ }, },
+ { .name = "pipe2", .errmsg = true,
+ .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
+ { .name = "poll", .errmsg = true, .timeout = true, },
+ { .name = "ppoll", .errmsg = true, .timeout = true, },
+ { .name = "pread", .errmsg = true, .alias = "pread64",
+ .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+ { .name = "preadv", .errmsg = true, .alias = "pread",
+ .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+ { .name = "prlimit64", .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
+ { .name = "pwrite", .errmsg = true, .alias = "pwrite64",
+ .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+ { .name = "pwritev", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+ { .name = "read", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+ { .name = "readlinkat", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
+ { .name = "readv", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+ { .name = "recvfrom", .errmsg = true,
+ .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
+ { .name = "recvmmsg", .errmsg = true,
+ .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
+ { .name = "recvmsg", .errmsg = true,
+ .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
+ { .name = "renameat", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
+ { .name = "rt_sigaction", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
+ { .name = "rt_sigprocmask", .errmsg = true, STRARRAY(0, how, sighow), },
+ { .name = "rt_sigqueueinfo", .errmsg = true,
+ .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
+ { .name = "rt_tgsigqueueinfo", .errmsg = true,
+ .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
+ { .name = "select", .errmsg = true, .timeout = true, },
+ { .name = "sendmmsg", .errmsg = true,
+ .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
+ { .name = "sendmsg", .errmsg = true,
+ .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
+ { .name = "sendto", .errmsg = true,
+ .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
+ { .name = "setitimer", .errmsg = true, STRARRAY(0, which, itimers), },
+ { .name = "setrlimit", .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
+ { .name = "shutdown", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+ { .name = "socket", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
+ [1] = SCA_SK_TYPE, /* type */ },
+ .arg_parm = { [0] = &strarray__socket_families, /* family */ }, },
+ { .name = "socketpair", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
+ [1] = SCA_SK_TYPE, /* type */ },
+ .arg_parm = { [0] = &strarray__socket_families, /* family */ }, },
+ { .name = "stat", .errmsg = true, .alias = "newstat", },
+ { .name = "symlinkat", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
+ { .name = "tgkill", .errmsg = true,
+ .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
+ { .name = "tkill", .errmsg = true,
+ .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
+ { .name = "uname", .errmsg = true, .alias = "newuname", },
+ { .name = "unlinkat", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
+ { .name = "utimensat", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
+ { .name = "write", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+ { .name = "writev", .errmsg = true,
+ .arg_scnprintf = { [0] = SCA_FD, /* fd */ }, },
+};
+
+static int syscall_fmt__cmp(const void *name, const void *fmtp)
+{
+ const struct syscall_fmt *fmt = fmtp;
+ return strcmp(name, fmt->name);
}
-static struct scripting_ops *script_spec__lookup(const char *spec)
+static struct syscall_fmt *syscall_fmt__find(const char *name)
{
- struct script_spec *s = script_spec__find(spec);
- if (!s)
- return NULL;
+ const int nmemb = ARRAY_SIZE(syscall_fmts);
+ return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
+}
+
+struct syscall {
+ struct event_format *tp_format;
+ const char *name;
+ bool filtered;
+ struct syscall_fmt *fmt;
+ size_t (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
+ void **arg_parm;
+};
- return s->ops;
+static size_t fprintf_duration(unsigned long t, FILE *fp)
+{
+ double duration = (double)t / NSEC_PER_MSEC;
+ size_t printed = fprintf(fp, "(");
+
+ if (duration >= 1.0)
+ printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
+ else if (duration >= 0.01)
+ printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
+ else
+ printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
+ return printed + fprintf(fp, "): ");
}
-static void list_available_languages(void)
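+/*
+ * Per-thread trace state: timestamps of the syscall currently in flight,
+ * the formatted enter string waiting for its matching exit, a cache of
+ * fd -> pathname mappings and per-syscall statistics.
+ */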
+struct thread_trace {
+ u64 entry_time;
+ u64 exit_time;
+ bool entry_pending;
+ unsigned long nr_events;
+ char *entry_str;
+ double runtime_ms;
+ struct {
+ int max;
+ char **table;
+ } paths;
+
+ struct intlist *syscall_stats;
+};
+
+static struct thread_trace *thread_trace__new(void)
{
- struct script_spec *s;
+ struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));
- fprintf(stderr, "\n");
- fprintf(stderr, "Scripting language extensions (used in "
- "perf trace -s [spec:]script.[spec]):\n\n");
+ if (ttrace)
+ ttrace->paths.max = -1;
- list_for_each_entry(s, &script_specs, node)
- fprintf(stderr, " %-42s [%s]\n", s->spec, s->ops->name);
+	if (ttrace)
+		ttrace->syscall_stats = intlist__new(NULL);
- fprintf(stderr, "\n");
+ return ttrace;
}
-static int parse_scriptname(const struct option *opt __used,
- const char *str, int unset __used)
+static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
{
- char spec[PATH_MAX];
- const char *script, *ext;
- int len;
+ struct thread_trace *ttrace;
- if (strcmp(str, "lang") == 0) {
- list_available_languages();
- exit(0);
- }
+ if (thread == NULL)
+ goto fail;
- script = strchr(str, ':');
- if (script) {
- len = script - str;
- if (len >= PATH_MAX) {
- fprintf(stderr, "invalid language specifier");
- return -1;
- }
- strncpy(spec, str, len);
- spec[len] = '\0';
- scripting_ops = script_spec__lookup(spec);
- if (!scripting_ops) {
- fprintf(stderr, "invalid language specifier");
- return -1;
- }
- script++;
- } else {
- script = str;
- ext = strchr(script, '.');
- if (!ext) {
- fprintf(stderr, "invalid script extension");
- return -1;
- }
- scripting_ops = script_spec__lookup(++ext);
- if (!scripting_ops) {
- fprintf(stderr, "invalid script extension");
+ if (thread->priv == NULL)
+ thread->priv = thread_trace__new();
+
+ if (thread->priv == NULL)
+ goto fail;
+
+ ttrace = thread->priv;
+ ++ttrace->nr_events;
+
+ return ttrace;
+fail:
+ color_fprintf(fp, PERF_COLOR_RED,
+ "WARNING: not enough memory, dropping samples!\n");
+ return NULL;
+}
+
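+/*
+ * Top level state for a trace run: the audit machine type used to resolve
+ * syscall ids to names, the lazily grown per-id syscall table, record
+ * options, the output stream and the various filtering/summary switches.
+ */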
+struct trace {
+ struct perf_tool tool;
+ struct {
+ int machine;
+ int open_id;
+ } audit;
+ struct {
+ int max;
+ struct syscall *table;
+ } syscalls;
+ struct record_opts opts;
+ struct machine *host;
+ u64 base_time;
+ FILE *output;
+ unsigned long nr_events;
+ struct strlist *ev_qualifier;
+ const char *last_vfs_getname;
+ struct intlist *tid_list;
+ struct intlist *pid_list;
+ double duration_filter;
+ double runtime_ms;
+ struct {
+ u64 vfs_getname,
+ proc_getname;
+ } stats;
+ bool not_ev_qualifier;
+ bool live;
+ bool full_time;
+ bool sched;
+ bool multiple_threads;
+ bool summary;
+ bool summary_only;
+ bool show_comm;
+ bool show_tool_stats;
+};
+
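+/*
+ * Remember the pathname behind an fd in the per-thread fd -> path table,
+ * growing and zero-filling the table on demand; syscall_arg__scnprintf_fd()
+ * uses it to append "<path>" when printing fd arguments.
+ */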
+static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
+{
+ struct thread_trace *ttrace = thread->priv;
+
+ if (fd > ttrace->paths.max) {
+ char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
+
+ if (npath == NULL)
return -1;
+
+ if (ttrace->paths.max != -1) {
+ memset(npath + ttrace->paths.max + 1, 0,
+ (fd - ttrace->paths.max) * sizeof(char *));
+ } else {
+ memset(npath, 0, (fd + 1) * sizeof(char *));
}
+
+ ttrace->paths.table = npath;
+ ttrace->paths.max = fd;
}
- script_name = strdup(script);
+ ttrace->paths.table[fd] = strdup(pathname);
- return 0;
+ return ttrace->paths.table[fd] != NULL ? 0 : -1;
}
-#define for_each_lang(scripts_dir, lang_dirent, lang_next) \
- while (!readdir_r(scripts_dir, &lang_dirent, &lang_next) && \
- lang_next) \
- if (lang_dirent.d_type == DT_DIR && \
- (strcmp(lang_dirent.d_name, ".")) && \
- (strcmp(lang_dirent.d_name, "..")))
+static int thread__read_fd_path(struct thread *thread, int fd)
+{
+ char linkname[PATH_MAX], pathname[PATH_MAX];
+ struct stat st;
+ int ret;
-#define for_each_script(lang_dir, script_dirent, script_next) \
- while (!readdir_r(lang_dir, &script_dirent, &script_next) && \
- script_next) \
- if (script_dirent.d_type != DT_DIR)
+ if (thread->pid_ == thread->tid) {
+ scnprintf(linkname, sizeof(linkname),
+ "/proc/%d/fd/%d", thread->pid_, fd);
+ } else {
+ scnprintf(linkname, sizeof(linkname),
+ "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
+ }
+ if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
+ return -1;
-#define RECORD_SUFFIX "-record"
-#define REPORT_SUFFIX "-report"
+ ret = readlink(linkname, pathname, sizeof(pathname));
-struct script_desc {
- struct list_head node;
- char *name;
- char *half_liner;
- char *args;
-};
+ if (ret < 0 || ret > st.st_size)
+ return -1;
-LIST_HEAD(script_descs);
+ pathname[ret] = '\0';
+ return trace__set_fd_pathname(thread, fd, pathname);
+}
-static struct script_desc *script_desc__new(const char *name)
+static const char *thread__fd_path(struct thread *thread, int fd,
+ struct trace *trace)
{
- struct script_desc *s = zalloc(sizeof(*s));
+ struct thread_trace *ttrace = thread->priv;
+
+ if (ttrace == NULL)
+ return NULL;
+
+ if (fd < 0)
+ return NULL;
- if (s != NULL)
- s->name = strdup(name);
+	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
+		if (!trace->live)
+			return NULL;
+		++trace->stats.proc_getname;
+		if (thread__read_fd_path(thread, fd)) {
+			return NULL;
+		}
+	}
- return s;
+ return ttrace->paths.table[fd];
}
-static void script_desc__delete(struct script_desc *s)
+static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
+ struct syscall_arg *arg)
{
- free(s->name);
- free(s);
+ int fd = arg->val;
+ size_t printed = scnprintf(bf, size, "%d", fd);
+ const char *path = thread__fd_path(arg->thread, fd, arg->trace);
+
+ if (path)
+ printed += scnprintf(bf + printed, size - printed, "<%s>", path);
+
+ return printed;
}
-static void script_desc__add(struct script_desc *s)
+static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
+ struct syscall_arg *arg)
{
- list_add_tail(&s->node, &script_descs);
+ int fd = arg->val;
+ size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
+ struct thread_trace *ttrace = arg->thread->priv;
+
+ if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
+ zfree(&ttrace->paths.table[fd]);
+
+ return printed;
}
-static struct script_desc *script_desc__find(const char *name)
+static bool trace__filter_duration(struct trace *trace, double t)
{
- struct script_desc *s;
+ return t < (trace->duration_filter * NSEC_PER_MSEC);
+}
- list_for_each_entry(s, &script_descs, node)
- if (strcasecmp(s->name, name) == 0)
- return s;
- return NULL;
+static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
+{
+ double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
+
+ return fprintf(fp, "%10.3f ", ts);
}
-static struct script_desc *script_desc__findnew(const char *name)
+static bool done = false;
+static bool interrupted = false;
+
+static void sig_handler(int sig)
{
- struct script_desc *s = script_desc__find(name);
+ done = true;
+ interrupted = sig == SIGINT;
+}
- if (s)
- return s;
+static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
+ u64 duration, u64 tstamp, FILE *fp)
+{
+ size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
+ printed += fprintf_duration(duration, fp);
- s = script_desc__new(name);
- if (!s)
- goto out_delete_desc;
+ if (trace->multiple_threads) {
+ if (trace->show_comm)
+ printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
+ printed += fprintf(fp, "%d ", thread->tid);
+ }
- script_desc__add(s);
+ return printed;
+}
- return s;
+static int trace__process_event(struct trace *trace, struct machine *machine,
+ union perf_event *event, struct perf_sample *sample)
+{
+ int ret = 0;
+
+ switch (event->header.type) {
+ case PERF_RECORD_LOST:
+ color_fprintf(trace->output, PERF_COLOR_RED,
+ "LOST %" PRIu64 " events!\n", event->lost.lost);
+		ret = machine__process_lost_event(machine, event, sample);
+		break;
+	default:
+ ret = machine__process_event(machine, event, sample);
+ break;
+ }
-out_delete_desc:
- script_desc__delete(s);
+ return ret;
+}
- return NULL;
+static int trace__tool_process(struct perf_tool *tool,
+ union perf_event *event,
+ struct perf_sample *sample,
+ struct machine *machine)
+{
+ struct trace *trace = container_of(tool, struct trace, tool);
+ return trace__process_event(trace, machine, event, sample);
}
-static char *ends_with(char *str, const char *suffix)
+static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
- size_t suffix_len = strlen(suffix);
- char *p = str;
+ int err = symbol__init();
- if (strlen(str) > suffix_len) {
- p = str + strlen(str) - suffix_len;
- if (!strncmp(p, suffix, suffix_len))
- return p;
- }
+ if (err)
+ return err;
- return NULL;
+ trace->host = machine__new_host();
+ if (trace->host == NULL)
+ return -ENOMEM;
+
+ err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
+ evlist->threads, trace__tool_process, false);
+ if (err)
+ symbol__exit();
+
+ return err;
}
-static char *ltrim(char *str)
+static int syscall__set_arg_fmts(struct syscall *sc)
{
- int len = strlen(str);
+ struct format_field *field;
+ int idx = 0;
- while (len && isspace(*str)) {
- len--;
- str++;
+ sc->arg_scnprintf = calloc(sc->tp_format->format.nr_fields - 1, sizeof(void *));
+ if (sc->arg_scnprintf == NULL)
+ return -1;
+
+ if (sc->fmt)
+ sc->arg_parm = sc->fmt->arg_parm;
+
+ for (field = sc->tp_format->format.fields->next; field; field = field->next) {
+ if (sc->fmt && sc->fmt->arg_scnprintf[idx])
+ sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
+ else if (field->flags & FIELD_IS_POINTER)
+ sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
+ ++idx;
}
- return str;
+ return 0;
}
-static int read_script_info(struct script_desc *desc, const char *filename)
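+/*
+ * Lazily fill the table entry for syscall "id": resolve its name via
+ * libaudit, grow the table if needed, honour the event qualifier list and
+ * look up the syscalls:sys_enter_<name> tracepoint format (falling back to
+ * the alias) that is used to decode its arguments.
+ */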
+static int trace__read_syscall_info(struct trace *trace, int id)
{
- char line[BUFSIZ], *p;
- FILE *fp;
+ char tp_name[128];
+ struct syscall *sc;
+ const char *name = audit_syscall_to_name(id, trace->audit.machine);
- fp = fopen(filename, "r");
- if (!fp)
+ if (name == NULL)
return -1;
- while (fgets(line, sizeof(line), fp)) {
- p = ltrim(line);
- if (strlen(p) == 0)
- continue;
- if (*p != '#')
- continue;
- p++;
- if (strlen(p) && *p == '!')
- continue;
-
- p = ltrim(p);
- if (strlen(p) && p[strlen(p) - 1] == '\n')
- p[strlen(p) - 1] = '\0';
-
- if (!strncmp(p, "description:", strlen("description:"))) {
- p += strlen("description:");
- desc->half_liner = strdup(ltrim(p));
- continue;
+ if (id > trace->syscalls.max) {
+ struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));
+
+ if (nsyscalls == NULL)
+ return -1;
+
+ if (trace->syscalls.max != -1) {
+ memset(nsyscalls + trace->syscalls.max + 1, 0,
+ (id - trace->syscalls.max) * sizeof(*sc));
+ } else {
+ memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
}
- if (!strncmp(p, "args:", strlen("args:"))) {
- p += strlen("args:");
- desc->args = strdup(ltrim(p));
- continue;
+ trace->syscalls.table = nsyscalls;
+ trace->syscalls.max = id;
+ }
+
+ sc = trace->syscalls.table + id;
+ sc->name = name;
+
+ if (trace->ev_qualifier) {
+ bool in = strlist__find(trace->ev_qualifier, name) != NULL;
+
+ if (!(in ^ trace->not_ev_qualifier)) {
+ sc->filtered = true;
+ /*
+			 * No need to read tracepoint information since this will be
+ * filtered out.
+ */
+ return 0;
}
}
- fclose(fp);
+ sc->fmt = syscall_fmt__find(sc->name);
- return 0;
+ snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
+ sc->tp_format = trace_event__tp_format("syscalls", tp_name);
+
+ if (sc->tp_format == NULL && sc->fmt && sc->fmt->alias) {
+ snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
+ sc->tp_format = trace_event__tp_format("syscalls", tp_name);
+ }
+
+ if (sc->tp_format == NULL)
+ return -1;
+
+ return syscall__set_arg_fmts(sc);
}
-static int list_available_scripts(const struct option *opt __used,
- const char *s __used, int unset __used)
+static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
+ unsigned long *args, struct trace *trace,
+ struct thread *thread)
{
- struct dirent *script_next, *lang_next, script_dirent, lang_dirent;
- char scripts_path[MAXPATHLEN];
- DIR *scripts_dir, *lang_dir;
- char script_path[MAXPATHLEN];
- char lang_path[MAXPATHLEN];
- struct script_desc *desc;
- char first_half[BUFSIZ];
- char *script_root;
- char *str;
+ size_t printed = 0;
+
+ if (sc->tp_format != NULL) {
+ struct format_field *field;
+ u8 bit = 1;
+ struct syscall_arg arg = {
+ .idx = 0,
+ .mask = 0,
+ .trace = trace,
+ .thread = thread,
+ };
+
+ for (field = sc->tp_format->format.fields->next; field;
+ field = field->next, ++arg.idx, bit <<= 1) {
+ if (arg.mask & bit)
+ continue;
+ /*
+			 * Suppress this argument if its value is zero and
+			 * we don't have a string associated in a strarray
+			 * for it.
+ */
+ if (args[arg.idx] == 0 &&
+ !(sc->arg_scnprintf &&
+ sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
+ sc->arg_parm[arg.idx]))
+ continue;
+
+ printed += scnprintf(bf + printed, size - printed,
+ "%s%s: ", printed ? ", " : "", field->name);
+ if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
+ arg.val = args[arg.idx];
+ if (sc->arg_parm)
+ arg.parm = sc->arg_parm[arg.idx];
+ printed += sc->arg_scnprintf[arg.idx](bf + printed,
+ size - printed, &arg);
+ } else {
+ printed += scnprintf(bf + printed, size - printed,
+ "%ld", args[arg.idx]);
+ }
+ }
+ } else {
+ int i = 0;
- snprintf(scripts_path, MAXPATHLEN, "%s/scripts", perf_exec_path());
+ while (i < 6) {
+ printed += scnprintf(bf + printed, size - printed,
+ "%sarg%d: %ld",
+ printed ? ", " : "", i, args[i]);
+ ++i;
+ }
+ }
- scripts_dir = opendir(scripts_path);
- if (!scripts_dir)
- return -1;
+ return printed;
+}
- for_each_lang(scripts_dir, lang_dirent, lang_next) {
- snprintf(lang_path, MAXPATHLEN, "%s/%s/bin", scripts_path,
- lang_dirent.d_name);
- lang_dir = opendir(lang_path);
- if (!lang_dir)
- continue;
-
- for_each_script(lang_dir, script_dirent, script_next) {
- script_root = strdup(script_dirent.d_name);
- str = ends_with(script_root, REPORT_SUFFIX);
- if (str) {
- *str = '\0';
- desc = script_desc__findnew(script_root);
- snprintf(script_path, MAXPATHLEN, "%s/%s",
- lang_path, script_dirent.d_name);
- read_script_info(desc, script_path);
- }
- free(script_root);
+typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
+ struct perf_sample *sample);
+
+static struct syscall *trace__syscall_info(struct trace *trace,
+ struct perf_evsel *evsel, int id)
+{
+
+ if (id < 0) {
+
+ /*
+ * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
+ * before that, leaving at a higher verbosity level till that is
+ * explained. Reproduced with plain ftrace with:
+ *
+ * echo 1 > /t/events/raw_syscalls/sys_exit/enable
+ * grep "NR -1 " /t/trace_pipe
+ *
+ * After generating some load on the machine.
+ */
+ if (verbose > 1) {
+ static u64 n;
+ fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
+ id, perf_evsel__name(evsel), ++n);
}
+ return NULL;
}
- fprintf(stdout, "List of available trace scripts:\n");
- list_for_each_entry(desc, &script_descs, node) {
- sprintf(first_half, "%s %s", desc->name,
- desc->args ? desc->args : "");
- fprintf(stdout, " %-36s %s\n", first_half,
- desc->half_liner ? desc->half_liner : "");
+ if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
+ trace__read_syscall_info(trace, id))
+ goto out_cant_read;
+
+ if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
+ goto out_cant_read;
+
+ return &trace->syscalls.table[id];
+
+out_cant_read:
+ if (verbose) {
+ fprintf(trace->output, "Problems reading syscall %d", id);
+ if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
+ fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
+ fputs(" information\n", trace->output);
}
+ return NULL;
+}
- exit(0);
+static void thread__update_stats(struct thread_trace *ttrace,
+ int id, struct perf_sample *sample)
+{
+ struct int_node *inode;
+ struct stats *stats;
+ u64 duration = 0;
+
+ inode = intlist__findnew(ttrace->syscall_stats, id);
+ if (inode == NULL)
+ return;
+
+ stats = inode->priv;
+ if (stats == NULL) {
+ stats = malloc(sizeof(struct stats));
+ if (stats == NULL)
+ return;
+ init_stats(stats);
+ inode->priv = stats;
+ }
+
+ if (ttrace->entry_time && sample->time > ttrace->entry_time)
+ duration = sample->time - ttrace->entry_time;
+
+ update_stats(stats, duration);
}
-static char *get_script_path(const char *script_root, const char *suffix)
+static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
+ struct perf_sample *sample)
{
- struct dirent *script_next, *lang_next, script_dirent, lang_dirent;
- char scripts_path[MAXPATHLEN];
- char script_path[MAXPATHLEN];
- DIR *scripts_dir, *lang_dir;
- char lang_path[MAXPATHLEN];
- char *str, *__script_root;
- char *path = NULL;
+ char *msg;
+ void *args;
+ size_t printed = 0;
+ struct thread *thread;
+ int id = perf_evsel__sc_tp_uint(evsel, id, sample);
+ struct syscall *sc = trace__syscall_info(trace, evsel, id);
+ struct thread_trace *ttrace;
+
+ if (sc == NULL)
+ return -1;
- snprintf(scripts_path, MAXPATHLEN, "%s/scripts", perf_exec_path());
+ if (sc->filtered)
+ return 0;
- scripts_dir = opendir(scripts_path);
- if (!scripts_dir)
- return NULL;
+ thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
+ ttrace = thread__trace(thread, trace->output);
+ if (ttrace == NULL)
+ return -1;
- for_each_lang(scripts_dir, lang_dirent, lang_next) {
- snprintf(lang_path, MAXPATHLEN, "%s/%s/bin", scripts_path,
- lang_dirent.d_name);
- lang_dir = opendir(lang_path);
- if (!lang_dir)
- continue;
-
- for_each_script(lang_dir, script_dirent, script_next) {
- __script_root = strdup(script_dirent.d_name);
- str = ends_with(__script_root, suffix);
- if (str) {
- *str = '\0';
- if (strcmp(__script_root, script_root))
- continue;
- snprintf(script_path, MAXPATHLEN, "%s/%s",
- lang_path, script_dirent.d_name);
- path = strdup(script_path);
- free(__script_root);
- break;
- }
- free(__script_root);
+ args = perf_evsel__sc_tp_ptr(evsel, args, sample);
+ ttrace = thread->priv;
+
+ if (ttrace->entry_str == NULL) {
+ ttrace->entry_str = malloc(1024);
+ if (!ttrace->entry_str)
+ return -1;
+ }
+
+ ttrace->entry_time = sample->time;
+ msg = ttrace->entry_str;
+ printed += scnprintf(msg + printed, 1024 - printed, "%s(", sc->name);
+
+ printed += syscall__scnprintf_args(sc, msg + printed, 1024 - printed,
+ args, trace, thread);
+
+ if (!strcmp(sc->name, "exit_group") || !strcmp(sc->name, "exit")) {
+ if (!trace->duration_filter && !trace->summary_only) {
+ trace__fprintf_entry_head(trace, thread, 1, sample->time, trace->output);
+ fprintf(trace->output, "%-70s\n", ttrace->entry_str);
}
+ } else
+ ttrace->entry_pending = true;
+
+ return 0;
+}
+
+static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
+ struct perf_sample *sample)
+{
+ int ret;
+ u64 duration = 0;
+ struct thread *thread;
+ int id = perf_evsel__sc_tp_uint(evsel, id, sample);
+ struct syscall *sc = trace__syscall_info(trace, evsel, id);
+ struct thread_trace *ttrace;
+
+ if (sc == NULL)
+ return -1;
+
+ if (sc->filtered)
+ return 0;
+
+ thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
+ ttrace = thread__trace(thread, trace->output);
+ if (ttrace == NULL)
+ return -1;
+
+ if (trace->summary)
+ thread__update_stats(ttrace, id, sample);
+
+ ret = perf_evsel__sc_tp_uint(evsel, ret, sample);
+
+ if (id == trace->audit.open_id && ret >= 0 && trace->last_vfs_getname) {
+ trace__set_fd_pathname(thread, ret, trace->last_vfs_getname);
+ trace->last_vfs_getname = NULL;
+ ++trace->stats.vfs_getname;
}
- return path;
+ ttrace = thread->priv;
+
+ ttrace->exit_time = sample->time;
+
+ if (ttrace->entry_time) {
+ duration = sample->time - ttrace->entry_time;
+ if (trace__filter_duration(trace, duration))
+ goto out;
+ } else if (trace->duration_filter)
+ goto out;
+
+ if (trace->summary_only)
+ goto out;
+
+ trace__fprintf_entry_head(trace, thread, duration, sample->time, trace->output);
+
+ if (ttrace->entry_pending) {
+ fprintf(trace->output, "%-70s", ttrace->entry_str);
+ } else {
+ fprintf(trace->output, " ... [");
+ color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
+ fprintf(trace->output, "]: %s()", sc->name);
+ }
+
+ if (sc->fmt == NULL) {
+signed_print:
+ fprintf(trace->output, ") = %d", ret);
+ } else if (ret < 0 && sc->fmt->errmsg) {
+ char bf[256];
+ const char *emsg = strerror_r(-ret, bf, sizeof(bf)),
+ *e = audit_errno_to_name(-ret);
+
+ fprintf(trace->output, ") = -1 %s %s", e, emsg);
+ } else if (ret == 0 && sc->fmt->timeout)
+ fprintf(trace->output, ") = 0 Timeout");
+ else if (sc->fmt->hexret)
+ fprintf(trace->output, ") = %#x", ret);
+ else
+ goto signed_print;
+
+ fputc('\n', trace->output);
+out:
+ ttrace->entry_pending = false;
+
+ return 0;
}
-static const char * const trace_usage[] = {
- "perf trace [<options>] <command>",
- NULL
-};
+static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
+ struct perf_sample *sample)
+{
+ trace->last_vfs_getname = perf_evsel__rawptr(evsel, sample, "pathname");
+ return 0;
+}
-static const struct option options[] = {
- OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
- "dump raw trace in ASCII"),
- OPT_BOOLEAN('v', "verbose", &verbose,
- "be more verbose (show symbol address, etc)"),
- OPT_BOOLEAN('L', "Latency", &latency_format,
- "show latency attributes (irqs/preemption disabled, etc)"),
- OPT_CALLBACK_NOOPT('l', "list", NULL, NULL, "list available scripts",
- list_available_scripts),
- OPT_CALLBACK('s', "script", NULL, "name",
- "script file name (lang:script name, script name, or *)",
- parse_scriptname),
- OPT_STRING('g', "gen-script", &generate_script_lang, "lang",
- "generate perf-trace.xx script in specified language"),
- OPT_STRING('i', "input", &input_name, "file",
- "input file name"),
+static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
+ struct perf_sample *sample)
+{
+ u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
+ double runtime_ms = (double)runtime / NSEC_PER_MSEC;
+ struct thread *thread = machine__findnew_thread(trace->host,
+ sample->pid,
+ sample->tid);
+ struct thread_trace *ttrace = thread__trace(thread, trace->output);
+
+ if (ttrace == NULL)
+ goto out_dump;
+
+ ttrace->runtime_ms += runtime_ms;
+ trace->runtime_ms += runtime_ms;
+ return 0;
- OPT_END()
-};
+out_dump:
+		fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 "\n",
+ evsel->name,
+ perf_evsel__strval(evsel, sample, "comm"),
+ (pid_t)perf_evsel__intval(evsel, sample, "pid"),
+ runtime,
+ perf_evsel__intval(evsel, sample, "vruntime"));
+ return 0;
+}
-int cmd_trace(int argc, const char **argv, const char *prefix __used)
+static bool skip_sample(struct trace *trace, struct perf_sample *sample)
{
- struct perf_session *session;
- const char *suffix = NULL;
- const char **__argv;
- char *script_path;
- int i, err;
-
- if (argc >= 2 && strncmp(argv[1], "rec", strlen("rec")) == 0) {
- if (argc < 3) {
- fprintf(stderr,
- "Please specify a record script\n");
- return -1;
+ if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
+ (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
+ return false;
+
+ if (trace->pid_list || trace->tid_list)
+ return true;
+
+ return false;
+}
+
+static int trace__process_sample(struct perf_tool *tool,
+ union perf_event *event __maybe_unused,
+ struct perf_sample *sample,
+ struct perf_evsel *evsel,
+ struct machine *machine __maybe_unused)
+{
+ struct trace *trace = container_of(tool, struct trace, tool);
+ int err = 0;
+
+ tracepoint_handler handler = evsel->handler;
+
+ if (skip_sample(trace, sample))
+ return 0;
+
+ if (!trace->full_time && trace->base_time == 0)
+ trace->base_time = sample->time;
+
+ if (handler) {
+ ++trace->nr_events;
+ handler(trace, evsel, sample);
+ }
+
+ return err;
+}
+
+static int parse_target_str(struct trace *trace)
+{
+ if (trace->opts.target.pid) {
+ trace->pid_list = intlist__new(trace->opts.target.pid);
+ if (trace->pid_list == NULL) {
+ pr_err("Error parsing process id string\n");
+ return -EINVAL;
}
- suffix = RECORD_SUFFIX;
}
- if (argc >= 2 && strncmp(argv[1], "rep", strlen("rep")) == 0) {
- if (argc < 3) {
- fprintf(stderr,
- "Please specify a report script\n");
- return -1;
+ if (trace->opts.target.tid) {
+ trace->tid_list = intlist__new(trace->opts.target.tid);
+ if (trace->tid_list == NULL) {
+ pr_err("Error parsing thread id string\n");
+ return -EINVAL;
}
- suffix = REPORT_SUFFIX;
}
- if (suffix) {
- script_path = get_script_path(argv[2], suffix);
- if (!script_path) {
- fprintf(stderr, "script not found\n");
- return -1;
+ return 0;
+}
+
+static int trace__record(int argc, const char **argv)
+{
+ unsigned int rec_argc, i, j;
+ const char **rec_argv;
+ const char * const record_args[] = {
+ "record",
+ "-R",
+ "-m", "1024",
+ "-c", "1",
+ "-e",
+ };
+
+ /* +1 is for the event string below */
+ rec_argc = ARRAY_SIZE(record_args) + 1 + argc;
+ rec_argv = calloc(rec_argc + 1, sizeof(char *));
+
+ if (rec_argv == NULL)
+ return -ENOMEM;
+
+ for (i = 0; i < ARRAY_SIZE(record_args); i++)
+ rec_argv[i] = record_args[i];
+
+ /* event string may be different for older kernels - e.g., RHEL6 */
+ if (is_valid_tracepoint("raw_syscalls:sys_enter"))
+ rec_argv[i] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
+ else if (is_valid_tracepoint("syscalls:sys_enter"))
+ rec_argv[i] = "syscalls:sys_enter,syscalls:sys_exit";
+ else {
+ pr_err("Neither raw_syscalls nor syscalls events exist.\n");
+ return -1;
+ }
+ i++;
+
+ for (j = 0; j < (unsigned int)argc; j++, i++)
+ rec_argv[i] = argv[j];
+
+ return cmd_record(i, rec_argv, NULL);
+}
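
In effect, 'perf trace record <workload>' re-executes the record tool with the
fixed arguments listed in record_args plus the syscall tracepoints, i.e.
roughly: perf record -R -m 1024 -c 1 -e raw_syscalls:sys_enter,raw_syscalls:sys_exit <workload>,
falling back to the syscalls:sys_enter/sys_exit pair on older kernels such as
RHEL6, as the comment notes.
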
+
+static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
+
+static void perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
+{
+ struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
+ if (evsel == NULL)
+ return;
+
+ if (perf_evsel__field(evsel, "pathname") == NULL) {
+ perf_evsel__delete(evsel);
+ return;
+ }
+
+ evsel->handler = trace__vfs_getname;
+ perf_evlist__add(evlist, evsel);
+}
+
+static int trace__run(struct trace *trace, int argc, const char **argv)
+{
+ struct perf_evlist *evlist = perf_evlist__new();
+ struct perf_evsel *evsel;
+ int err = -1, i;
+ unsigned long before;
+ const bool forks = argc > 0;
+
+ trace->live = true;
+
+ if (evlist == NULL) {
+ fprintf(trace->output, "Not enough memory to run!\n");
+ goto out;
+ }
+
+ if (perf_evlist__add_syscall_newtp(evlist, trace__sys_enter, trace__sys_exit))
+ goto out_error_tp;
+
+ perf_evlist__add_vfs_getname(evlist);
+
+ if (trace->sched &&
+ perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
+ trace__sched_stat_runtime))
+ goto out_error_tp;
+
+ err = perf_evlist__create_maps(evlist, &trace->opts.target);
+ if (err < 0) {
+ fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
+ goto out_delete_evlist;
+ }
+
+ err = trace__symbols_init(trace, evlist);
+ if (err < 0) {
+ fprintf(trace->output, "Problems initializing symbol libraries!\n");
+ goto out_delete_evlist;
+ }
+
+ perf_evlist__config(evlist, &trace->opts);
+
+ signal(SIGCHLD, sig_handler);
+ signal(SIGINT, sig_handler);
+
+ if (forks) {
+ err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
+ argv, false, NULL);
+ if (err < 0) {
+ fprintf(trace->output, "Couldn't run the workload!\n");
+ goto out_delete_evlist;
}
+ }
- __argv = malloc((argc + 1) * sizeof(const char *));
- __argv[0] = "/bin/sh";
- __argv[1] = script_path;
- for (i = 3; i < argc; i++)
- __argv[i - 1] = argv[i];
- __argv[argc - 1] = NULL;
+ err = perf_evlist__open(evlist);
+ if (err < 0)
+ goto out_error_open;
- execvp("/bin/sh", (char **)__argv);
- exit(-1);
+ err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
+ if (err < 0) {
+ fprintf(trace->output, "Couldn't mmap the events: %s\n", strerror(errno));
+ goto out_delete_evlist;
}
- setup_scripting();
+ perf_evlist__enable(evlist);
+
+ if (forks)
+ perf_evlist__start_workload(evlist);
+
+ trace->multiple_threads = evlist->threads->map[0] == -1 || evlist->threads->nr > 1;
+again:
+ before = trace->nr_events;
+
+ for (i = 0; i < evlist->nr_mmaps; i++) {
+ union perf_event *event;
+
+ while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
+ const u32 type = event->header.type;
+ tracepoint_handler handler;
+ struct perf_sample sample;
+
+ ++trace->nr_events;
+
+ err = perf_evlist__parse_sample(evlist, event, &sample);
+ if (err) {
+ fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
+ goto next_event;
+ }
+
+ if (!trace->full_time && trace->base_time == 0)
+ trace->base_time = sample.time;
+
+ if (type != PERF_RECORD_SAMPLE) {
+ trace__process_event(trace, trace->host, event, &sample);
+ continue;
+ }
+
+ evsel = perf_evlist__id2evsel(evlist, sample.id);
+ if (evsel == NULL) {
+ fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample.id);
+ goto next_event;
+ }
+
+ if (sample.raw_data == NULL) {
+ fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
+ perf_evsel__name(evsel), sample.tid,
+ sample.cpu, sample.raw_size);
+ goto next_event;
+ }
+
+ handler = evsel->handler;
+ handler(trace, evsel, &sample);
+next_event:
+ perf_evlist__mmap_consume(evlist, i);
+
+ if (interrupted)
+ goto out_disable;
+ }
+ }
+
+ if (trace->nr_events == before) {
+ int timeout = done ? 100 : -1;
+
+ if (poll(evlist->pollfd, evlist->nr_fds, timeout) > 0)
+ goto again;
+ } else {
+ goto again;
+ }
+
+out_disable:
+ perf_evlist__disable(evlist);
+
+ if (!err) {
+ if (trace->summary)
+ trace__fprintf_thread_summary(trace, trace->output);
+
+ if (trace->show_tool_stats) {
+ fprintf(trace->output, "Stats:\n "
+ " vfs_getname : %" PRIu64 "\n"
+ " proc_getname: %" PRIu64 "\n",
+ trace->stats.vfs_getname,
+ trace->stats.proc_getname);
+ }
+ }
+
+out_delete_evlist:
+ perf_evlist__delete(evlist);
+out:
+ trace->live = false;
+ return err;
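+/*
+ * This block is reachable only via the goto out_error_tp/out_error_open
+ * statements above; it exists so those error paths can share a local
+ * errbuf while keeping its scope narrow.
+ */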
+{
+ char errbuf[BUFSIZ];
+
+out_error_tp:
+ perf_evlist__strerror_tp(evlist, errno, errbuf, sizeof(errbuf));
+ goto out_error;
+
+out_error_open:
+ perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));
+
+out_error:
+ fprintf(trace->output, "%s\n", errbuf);
+ goto out_delete_evlist;
+}
+}
+
+static int trace__replay(struct trace *trace)
+{
+ const struct perf_evsel_str_handler handlers[] = {
+ { "probe:vfs_getname", trace__vfs_getname, },
+ };
+ struct perf_data_file file = {
+ .path = input_name,
+ .mode = PERF_DATA_MODE_READ,
+ };
+ struct perf_session *session;
+ struct perf_evsel *evsel;
+ int err = -1;
+
+ trace->tool.sample = trace__process_sample;
+ trace->tool.mmap = perf_event__process_mmap;
+ trace->tool.mmap2 = perf_event__process_mmap2;
+ trace->tool.comm = perf_event__process_comm;
+ trace->tool.exit = perf_event__process_exit;
+ trace->tool.fork = perf_event__process_fork;
+ trace->tool.attr = perf_event__process_attr;
+ trace->tool.tracing_data = perf_event__process_tracing_data;
+ trace->tool.build_id = perf_event__process_build_id;
- argc = parse_options(argc, argv, options, trace_usage,
- PARSE_OPT_STOP_AT_NON_OPTION);
+ trace->tool.ordered_samples = true;
+ trace->tool.ordering_requires_timestamps = true;
+
+ /* add tid to output */
+ trace->multiple_threads = true;
if (symbol__init() < 0)
return -1;
- if (!script_name)
- setup_pager();
- session = perf_session__new(input_name, O_RDONLY, 0);
+ session = perf_session__new(&file, false, &trace->tool);
if (session == NULL)
return -ENOMEM;
- if (!perf_session__has_traces(session, "record -R"))
- return -EINVAL;
+ trace->host = &session->machines.host;
- if (generate_script_lang) {
- struct stat perf_stat;
+ err = perf_session__set_tracepoints_handlers(session, handlers);
+ if (err)
+ goto out;
- int input = open(input_name, O_RDONLY);
- if (input < 0) {
- perror("failed to open file");
- exit(-1);
- }
+ evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
+ "raw_syscalls:sys_enter");
+ /* older kernels have syscalls tp versus raw_syscalls */
+ if (evsel == NULL)
+ evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
+ "syscalls:sys_enter");
+ if (evsel == NULL) {
+ pr_err("Data file does not have raw_syscalls:sys_enter event\n");
+ goto out;
+ }
- err = fstat(input, &perf_stat);
- if (err < 0) {
- perror("failed to stat file");
- exit(-1);
+ if (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
+ perf_evsel__init_sc_tp_ptr_field(evsel, args)) {
+		pr_err("Error initializing the raw_syscalls:sys_enter event\n");
+ goto out;
+ }
+
+ evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
+ "raw_syscalls:sys_exit");
+ if (evsel == NULL)
+ evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
+ "syscalls:sys_exit");
+ if (evsel == NULL) {
+ pr_err("Data file does not have raw_syscalls:sys_exit event\n");
+ goto out;
+ }
+
+ if (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
+ perf_evsel__init_sc_tp_uint_field(evsel, ret)) {
+		pr_err("Error initializing the raw_syscalls:sys_exit event\n");
+ goto out;
+ }
+
+ err = parse_target_str(trace);
+ if (err != 0)
+ goto out;
+
+ setup_pager();
+
+ err = perf_session__process_events(session, &trace->tool);
+ if (err)
+ pr_err("Failed to process events, error %d", err);
+
+ else if (trace->summary)
+ trace__fprintf_thread_summary(trace, trace->output);
+
+out:
+ perf_session__delete(session);
+
+ return err;
+}
+
+static size_t trace__fprintf_threads_header(FILE *fp)
+{
+ size_t printed;
+
+ printed = fprintf(fp, "\n Summary of events:\n\n");
+
+ return printed;
+}
+
+static size_t thread__dump_stats(struct thread_trace *ttrace,
+ struct trace *trace, FILE *fp)
+{
+ struct stats *stats;
+ size_t printed = 0;
+ struct syscall *sc;
+ struct int_node *inode = intlist__first(ttrace->syscall_stats);
+
+ if (inode == NULL)
+ return 0;
+
+ printed += fprintf(fp, "\n");
+
+ printed += fprintf(fp, " syscall calls min avg max stddev\n");
+ printed += fprintf(fp, " (msec) (msec) (msec) (%%)\n");
+ printed += fprintf(fp, " --------------- -------- --------- --------- --------- ------\n");
+
+ /* each int_node is a syscall */
+ while (inode) {
+ stats = inode->priv;
+ if (stats) {
+ double min = (double)(stats->min) / NSEC_PER_MSEC;
+ double max = (double)(stats->max) / NSEC_PER_MSEC;
+ double avg = avg_stats(stats);
+ double pct;
+ u64 n = (u64) stats->n;
+
+ pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
+ avg /= NSEC_PER_MSEC;
+
+ sc = &trace->syscalls.table[inode->i];
+ printed += fprintf(fp, " %-15s", sc->name);
+ printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f",
+ n, min, avg);
+ printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
}
- if (!perf_stat.st_size) {
- fprintf(stderr, "zero-sized file, nothing to do!\n");
- exit(0);
+ inode = intlist__next(inode);
+ }
+
+ printed += fprintf(fp, "\n\n");
+
+ return printed;
+}
+
+/* struct used to pass data to per-thread function */
+struct summary_data {
+ FILE *fp;
+ struct trace *trace;
+ size_t printed;
+};
+
+static int trace__fprintf_one_thread(struct thread *thread, void *priv)
+{
+ struct summary_data *data = priv;
+ FILE *fp = data->fp;
+ size_t printed = data->printed;
+ struct trace *trace = data->trace;
+ struct thread_trace *ttrace = thread->priv;
+ double ratio;
+
+ if (ttrace == NULL)
+ return 0;
+
+ ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
+
+ printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
+ printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
+ printed += fprintf(fp, "%.1f%%", ratio);
+ printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
+ printed += thread__dump_stats(ttrace, trace, fp);
+
+ data->printed += printed;
+
+ return 0;
+}
+
+static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
+{
+ struct summary_data data = {
+ .fp = fp,
+ .trace = trace
+ };
+ data.printed = trace__fprintf_threads_header(fp);
+
+ machine__for_each_thread(trace->host, trace__fprintf_one_thread, &data);
+
+ return data.printed;
+}
+
+static int trace__set_duration(const struct option *opt, const char *str,
+ int unset __maybe_unused)
+{
+ struct trace *trace = opt->value;
+
+ trace->duration_filter = atof(str);
+ return 0;
+}
+
+static int trace__open_output(struct trace *trace, const char *filename)
+{
+ struct stat st;
+
+ if (!stat(filename, &st) && st.st_size) {
+ char oldname[PATH_MAX];
+
+ scnprintf(oldname, sizeof(oldname), "%s.old", filename);
+ unlink(oldname);
+ rename(filename, oldname);
+ }
+
+ trace->output = fopen(filename, "w");
+
+ return trace->output == NULL ? -errno : 0;
+}
+
+int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
+{
+ const char * const trace_usage[] = {
+ "perf trace [<options>] [<command>]",
+ "perf trace [<options>] -- <command> [<options>]",
+ "perf trace record [<options>] [<command>]",
+ "perf trace record [<options>] -- <command> [<options>]",
+ NULL
+ };
+ struct trace trace = {
+ .audit = {
+ .machine = audit_detect_machine(),
+ .open_id = audit_name_to_syscall("open", trace.audit.machine),
+ },
+ .syscalls = {
+			.max = -1,
+ },
+ .opts = {
+ .target = {
+ .uid = UINT_MAX,
+ .uses_mmap = true,
+ },
+ .user_freq = UINT_MAX,
+ .user_interval = ULLONG_MAX,
+ .no_buffering = true,
+ .mmap_pages = 1024,
+ },
+ .output = stdout,
+ .show_comm = true,
+ };
+ const char *output_name = NULL;
+ const char *ev_qualifier_str = NULL;
+ const struct option trace_options[] = {
+ OPT_BOOLEAN(0, "comm", &trace.show_comm,
+ "show the thread COMM next to its id"),
+ OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
+ OPT_STRING('e', "expr", &ev_qualifier_str, "expr",
+ "list of events to trace"),
+ OPT_STRING('o', "output", &output_name, "file", "output file name"),
+ OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
+ OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
+ "trace events on existing process id"),
+ OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
+ "trace events on existing thread id"),
+ OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
+ "system-wide collection from all CPUs"),
+ OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
+ "list of cpus to monitor"),
+ OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
+ "child tasks do not inherit counters"),
+ OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
+ "number of mmap data pages",
+ perf_evlist__parse_mmap_pages),
+ OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
+ "user to profile"),
+ OPT_CALLBACK(0, "duration", &trace, "float",
+ "show only events with duration > N.M ms",
+ trace__set_duration),
+ OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
+ OPT_INCR('v', "verbose", &verbose, "be more verbose"),
+ OPT_BOOLEAN('T', "time", &trace.full_time,
+ "Show full timestamp, not time relative to first start"),
+ OPT_BOOLEAN('s', "summary", &trace.summary_only,
+ "Show only syscall summary with statistics"),
+ OPT_BOOLEAN('S', "with-summary", &trace.summary,
+ "Show all syscalls and summary with statistics"),
+ OPT_END()
+ };
+ int err;
+ char bf[BUFSIZ];
+
+ if ((argc > 1) && (strcmp(argv[1], "record") == 0))
+ return trace__record(argc-2, &argv[2]);
+
+ argc = parse_options(argc, argv, trace_options, trace_usage, 0);
+
+ /* summary_only implies summary option, but don't overwrite summary if set */
+ if (trace.summary_only)
+ trace.summary = trace.summary_only;
+
+ if (output_name != NULL) {
+ err = trace__open_output(&trace, output_name);
+ if (err < 0) {
+ perror("failed to create output file");
+ goto out;
}
+ }
- scripting_ops = script_spec__lookup(generate_script_lang);
- if (!scripting_ops) {
- fprintf(stderr, "invalid language specifier");
- return -1;
+ if (ev_qualifier_str != NULL) {
+ const char *s = ev_qualifier_str;
+
+ trace.not_ev_qualifier = *s == '!';
+ if (trace.not_ev_qualifier)
+ ++s;
+ trace.ev_qualifier = strlist__new(true, s);
+ if (trace.ev_qualifier == NULL) {
+			fputs("Not enough memory to parse event qualifier\n",
+ trace.output);
+ err = -ENOMEM;
+ goto out_close;
}
+ }
- err = scripting_ops->generate_script("perf-trace");
- goto out;
+ err = target__validate(&trace.opts.target);
+ if (err) {
+ target__strerror(&trace.opts.target, err, bf, sizeof(bf));
+ fprintf(trace.output, "%s", bf);
+ goto out_close;
}
- if (script_name) {
- err = scripting_ops->start_script(script_name, argc, argv);
- if (err)
- goto out;
+ err = target__parse_uid(&trace.opts.target);
+ if (err) {
+ target__strerror(&trace.opts.target, err, bf, sizeof(bf));
+ fprintf(trace.output, "%s", bf);
+ goto out_close;
}
- err = __cmd_trace(session);
+ if (!argc && target__none(&trace.opts.target))
+ trace.opts.target.system_wide = true;
- perf_session__delete(session);
- cleanup_scripting();
+ if (input_name)
+ err = trace__replay(&trace);
+ else
+ err = trace__run(&trace, argc, argv);
+
+out_close:
+ if (output_name != NULL)
+ fclose(trace.output);
out:
return err;
}
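
Tying the pieces together: without -i, cmd_trace goes through trace__run() and
traces live (system wide when no workload or target is given), while
'perf trace -i perf.data' takes the trace__replay() path over a recorded file;
'-s' prints only the per-thread syscall summary and '-S' prints it in addition
to the per-event lines, matching the summary_only/summary flags above.
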
diff --git a/tools/perf/builtin.h b/tools/perf/builtin.h
index 10fe49e7048..b210d62907e 100644
--- a/tools/perf/builtin.h
+++ b/tools/perf/builtin.h
@@ -4,7 +4,6 @@
#include "util/util.h"
#include "util/strbuf.h"
-extern const char perf_version_string[];
extern const char perf_usage_string[];
extern const char perf_more_info_string[];
@@ -19,6 +18,7 @@ extern int cmd_bench(int argc, const char **argv, const char *prefix);
extern int cmd_buildid_cache(int argc, const char **argv, const char *prefix);
extern int cmd_buildid_list(int argc, const char **argv, const char *prefix);
extern int cmd_diff(int argc, const char **argv, const char *prefix);
+extern int cmd_evlist(int argc, const char **argv, const char *prefix);
extern int cmd_help(int argc, const char **argv, const char *prefix);
extern int cmd_sched(int argc, const char **argv, const char *prefix);
extern int cmd_list(int argc, const char **argv, const char *prefix);
@@ -27,10 +27,16 @@ extern int cmd_report(int argc, const char **argv, const char *prefix);
extern int cmd_stat(int argc, const char **argv, const char *prefix);
extern int cmd_timechart(int argc, const char **argv, const char *prefix);
extern int cmd_top(int argc, const char **argv, const char *prefix);
-extern int cmd_trace(int argc, const char **argv, const char *prefix);
+extern int cmd_script(int argc, const char **argv, const char *prefix);
extern int cmd_version(int argc, const char **argv, const char *prefix);
extern int cmd_probe(int argc, const char **argv, const char *prefix);
extern int cmd_kmem(int argc, const char **argv, const char *prefix);
extern int cmd_lock(int argc, const char **argv, const char *prefix);
+extern int cmd_kvm(int argc, const char **argv, const char *prefix);
+extern int cmd_test(int argc, const char **argv, const char *prefix);
+extern int cmd_trace(int argc, const char **argv, const char *prefix);
+extern int cmd_inject(int argc, const char **argv, const char *prefix);
+extern int cmd_mem(int argc, const char **argv, const char *prefix);
+extern int find_scripts(char **scripts_array, char **scripts_path_array);
#endif
diff --git a/tools/perf/command-list.txt b/tools/perf/command-list.txt
index db6ee94d4a8..0906fc401c5 100644
--- a/tools/perf/command-list.txt
+++ b/tools/perf/command-list.txt
@@ -8,14 +8,20 @@ perf-bench mainporcelain common
perf-buildid-cache mainporcelain common
perf-buildid-list mainporcelain common
perf-diff mainporcelain common
+perf-evlist mainporcelain common
+perf-inject mainporcelain common
+perf-kmem mainporcelain common
+perf-kvm mainporcelain common
perf-list mainporcelain common
-perf-sched mainporcelain common
+perf-lock mainporcelain common
+perf-mem mainporcelain common
+perf-probe mainporcelain full
perf-record mainporcelain common
perf-report mainporcelain common
+perf-sched mainporcelain common
+perf-script mainporcelain common
perf-stat mainporcelain common
+perf-test mainporcelain common
perf-timechart mainporcelain common
perf-top mainporcelain common
perf-trace mainporcelain common
-perf-probe mainporcelain common
-perf-kmem mainporcelain common
-perf-lock mainporcelain common
diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile
new file mode 100644
index 00000000000..f30ac5e5d27
--- /dev/null
+++ b/tools/perf/config/Makefile
@@ -0,0 +1,738 @@
+
+ifeq ($(src-perf),)
+src-perf := $(srctree)/tools/perf
+endif
+
+ifeq ($(obj-perf),)
+obj-perf := $(OUTPUT)
+endif
+
+ifneq ($(obj-perf),)
+obj-perf := $(abspath $(obj-perf))/
+endif
+
+LIB_INCLUDE := $(srctree)/tools/lib/
+CFLAGS := $(EXTRA_CFLAGS) $(EXTRA_WARNINGS)
+
+include $(src-perf)/config/Makefile.arch
+
+NO_PERF_REGS := 1
+
+# Additional ARCH settings for x86
+ifeq ($(ARCH),x86)
+ ifeq (${IS_X86_64}, 1)
+ CFLAGS += -DHAVE_ARCH_X86_64_SUPPORT
+ ARCH_INCLUDE = ../../arch/x86/lib/memcpy_64.S ../../arch/x86/lib/memset_64.S
+ LIBUNWIND_LIBS = -lunwind -lunwind-x86_64
+ else
+ LIBUNWIND_LIBS = -lunwind -lunwind-x86
+ endif
+ NO_PERF_REGS := 0
+endif
+
+ifeq ($(ARCH),arm)
+ NO_PERF_REGS := 0
+ LIBUNWIND_LIBS = -lunwind -lunwind-arm
+endif
+
+ifeq ($(ARCH),arm64)
+ NO_PERF_REGS := 0
+ LIBUNWIND_LIBS = -lunwind -lunwind-aarch64
+endif
+
+# So far there's only x86 and arm libdw unwind support merged in perf.
+# Disable it on all other architectures in case libdw unwind
+# support is detected in system. Add supported architectures
+# to the check.
+ifneq ($(ARCH),$(filter $(ARCH),x86 arm))
+ NO_LIBDW_DWARF_UNWIND := 1
+endif
+
+ifeq ($(LIBUNWIND_LIBS),)
+ NO_LIBUNWIND := 1
+else
+ #
+ # For linking with debug library, run like:
+ #
+ # make DEBUG=1 LIBUNWIND_DIR=/opt/libunwind/
+ #
+ ifdef LIBUNWIND_DIR
+ LIBUNWIND_CFLAGS = -I$(LIBUNWIND_DIR)/include
+ LIBUNWIND_LDFLAGS = -L$(LIBUNWIND_DIR)/lib
+ endif
+ LIBUNWIND_LDFLAGS += $(LIBUNWIND_LIBS)
+
+ # Set per-feature check compilation flags
+ FEATURE_CHECK_CFLAGS-libunwind = $(LIBUNWIND_CFLAGS)
+ FEATURE_CHECK_LDFLAGS-libunwind = $(LIBUNWIND_LDFLAGS)
+ FEATURE_CHECK_CFLAGS-libunwind-debug-frame = $(LIBUNWIND_CFLAGS)
+ FEATURE_CHECK_LDFLAGS-libunwind-debug-frame = $(LIBUNWIND_LDFLAGS)
+endif
+
+ifeq ($(NO_PERF_REGS),0)
+ CFLAGS += -DHAVE_PERF_REGS_SUPPORT
+endif
+
+ifndef NO_LIBELF
+ # for linking with debug library, run like:
+ # make DEBUG=1 LIBDW_DIR=/opt/libdw/
+ ifdef LIBDW_DIR
+ LIBDW_CFLAGS := -I$(LIBDW_DIR)/include
+ LIBDW_LDFLAGS := -L$(LIBDW_DIR)/lib
+ endif
+ FEATURE_CHECK_CFLAGS-libdw-dwarf-unwind := $(LIBDW_CFLAGS)
+ FEATURE_CHECK_LDFLAGS-libdw-dwarf-unwind := $(LIBDW_LDFLAGS) -ldw
+endif
+
+# include ARCH specific config
+-include $(src-perf)/arch/$(ARCH)/Makefile
+
+include $(src-perf)/config/utilities.mak
+
+ifeq ($(call get-executable,$(FLEX)),)
+ dummy := $(error Error: $(FLEX) is missing on this system, please install it)
+endif
+
+ifeq ($(call get-executable,$(BISON)),)
+ dummy := $(error Error: $(BISON) is missing on this system, please install it)
+endif
+
+# Treat warnings as errors unless directed not to
+ifneq ($(WERROR),0)
+ CFLAGS += -Werror
+endif
+
+ifndef DEBUG
+ DEBUG := 0
+endif
+
+ifeq ($(DEBUG),0)
+ CFLAGS += -O6
+endif
+
+ifdef PARSER_DEBUG
+ PARSER_DEBUG_BISON := -t
+ PARSER_DEBUG_FLEX := -d
+ CFLAGS += -DPARSER_DEBUG
+endif
+
+CFLAGS += -fno-omit-frame-pointer
+CFLAGS += -ggdb3
+CFLAGS += -funwind-tables
+CFLAGS += -Wall
+CFLAGS += -Wextra
+CFLAGS += -std=gnu99
+
+# Enforce a non-executable stack, as we may regress (again) in the future by
+# adding assembler files missing the .GNU-stack linker note.
+LDFLAGS += -Wl,-z,noexecstack
+
+EXTLIBS = -lelf -lpthread -lrt -lm -ldl
+
+ifneq ($(OUTPUT),)
+ OUTPUT_FEATURES = $(OUTPUT)config/feature-checks/
+ $(shell mkdir -p $(OUTPUT_FEATURES))
+endif
+
+feature_check = $(eval $(feature_check_code))
+define feature_check_code
+ feature-$(1) := $(shell $(MAKE) OUTPUT=$(OUTPUT_FEATURES) CFLAGS="$(EXTRA_CFLAGS) $(FEATURE_CHECK_CFLAGS-$(1))" LDFLAGS="$(LDFLAGS) $(FEATURE_CHECK_LDFLAGS-$(1))" -C config/feature-checks test-$1.bin >/dev/null 2>/dev/null && echo 1 || echo 0)
+endef
+
+feature_set = $(eval $(feature_set_code))
+define feature_set_code
+ feature-$(1) := 1
+endef
+
+#
+# Build the feature check binaries in parallel, ignore errors, ignore return value and suppress output:
+#
+
+#
+# Note that this is not a complete list of all feature tests, just
+# those that are typically built on a fully configured system.
+#
+# [ Feature tests not mentioned here have to be built explicitly in
+# the rule that uses them - an example for that is the 'bionic'
+# feature check. ]
+#
+CORE_FEATURE_TESTS = \
+ backtrace \
+ dwarf \
+ fortify-source \
+ glibc \
+ gtk2 \
+ gtk2-infobar \
+ libaudit \
+ libbfd \
+ libelf \
+ libelf-getphdrnum \
+ libelf-mmap \
+ libnuma \
+ libperl \
+ libpython \
+ libpython-version \
+ libslang \
+ libunwind \
+ stackprotector-all \
+ timerfd \
+ libdw-dwarf-unwind
+
+LIB_FEATURE_TESTS = \
+ dwarf \
+ glibc \
+ gtk2 \
+ libaudit \
+ libbfd \
+ libelf \
+ libnuma \
+ libperl \
+ libpython \
+ libslang \
+ libunwind \
+ libdw-dwarf-unwind
+
+VF_FEATURE_TESTS = \
+ backtrace \
+ fortify-source \
+ gtk2-infobar \
+ libelf-getphdrnum \
+ libelf-mmap \
+ libpython-version \
+ stackprotector-all \
+ timerfd \
+ libunwind-debug-frame \
+ bionic \
+ liberty \
+ liberty-z \
+ cplus-demangle
+
+# Set FEATURE_CHECK_(C|LD)FLAGS-all for all CORE_FEATURE_TESTS features.
+# If in the future we need per-feature checks/flags for features not
+# mentioned in this list we need to refactor this ;-).
+set_test_all_flags = $(eval $(set_test_all_flags_code))
+define set_test_all_flags_code
+ FEATURE_CHECK_CFLAGS-all += $(FEATURE_CHECK_CFLAGS-$(1))
+ FEATURE_CHECK_LDFLAGS-all += $(FEATURE_CHECK_LDFLAGS-$(1))
+endef
+
+$(foreach feat,$(CORE_FEATURE_TESTS),$(call set_test_all_flags,$(feat)))
+
+#
+# Special fast-path for the 'all features are available' case:
+#
+$(call feature_check,all,$(MSG))
+
+#
+# Just in case the build freshly failed, make sure we print the
+# feature matrix:
+#
+ifeq ($(feature-all), 1)
+ #
+ # test-all.c passed - just set all the core feature flags to 1:
+ #
+ $(foreach feat,$(CORE_FEATURE_TESTS),$(call feature_set,$(feat)))
+else
+ $(shell $(MAKE) OUTPUT=$(OUTPUT_FEATURES) CFLAGS="$(EXTRA_CFLAGS)" LDFLAGS=$(LDFLAGS) -i -j -C config/feature-checks $(addsuffix .bin,$(CORE_FEATURE_TESTS)) >/dev/null 2>&1)
+ $(foreach feat,$(CORE_FEATURE_TESTS),$(call feature_check,$(feat)))
+endif
+
+ifeq ($(feature-stackprotector-all), 1)
+ CFLAGS += -fstack-protector-all
+endif
+
+ifeq ($(DEBUG),0)
+ ifeq ($(feature-fortify-source), 1)
+ CFLAGS += -D_FORTIFY_SOURCE=2
+ endif
+endif
+
+CFLAGS += -I$(src-perf)/util/include
+CFLAGS += -I$(src-perf)/arch/$(ARCH)/include
+CFLAGS += -I$(srctree)/tools/include/
+CFLAGS += -I$(srctree)/arch/$(ARCH)/include/uapi
+CFLAGS += -I$(srctree)/arch/$(ARCH)/include
+CFLAGS += -I$(srctree)/include/uapi
+CFLAGS += -I$(srctree)/include
+
+# $(obj-perf) for generated common-cmds.h
+# $(obj-perf)/util for generated bison/flex headers
+ifneq ($(OUTPUT),)
+CFLAGS += -I$(obj-perf)/util
+CFLAGS += -I$(obj-perf)
+endif
+
+CFLAGS += -I$(src-perf)/util
+CFLAGS += -I$(src-perf)
+CFLAGS += -I$(LIB_INCLUDE)
+
+CFLAGS += -D_LARGEFILE64_SOURCE -D_FILE_OFFSET_BITS=64 -D_GNU_SOURCE
+
+ifndef NO_BIONIC
+ $(call feature_check,bionic)
+ ifeq ($(feature-bionic), 1)
+ BIONIC := 1
+ EXTLIBS := $(filter-out -lrt,$(EXTLIBS))
+ EXTLIBS := $(filter-out -lpthread,$(EXTLIBS))
+ endif
+endif
+
+ifdef NO_LIBELF
+ NO_DWARF := 1
+ NO_DEMANGLE := 1
+ NO_LIBUNWIND := 1
+ NO_LIBDW_DWARF_UNWIND := 1
+else
+ ifeq ($(feature-libelf), 0)
+ ifeq ($(feature-glibc), 1)
+ LIBC_SUPPORT := 1
+ endif
+ ifeq ($(BIONIC),1)
+ LIBC_SUPPORT := 1
+ endif
+ ifeq ($(LIBC_SUPPORT),1)
+ msg := $(warning No libelf found, disables 'probe' tool, please install elfutils-libelf-devel/libelf-dev);
+
+ NO_LIBELF := 1
+ NO_DWARF := 1
+ NO_DEMANGLE := 1
+ NO_LIBUNWIND := 1
+ NO_LIBDW_DWARF_UNWIND := 1
+ else
+ ifneq ($(filter s% -static%,$(LDFLAGS),),)
+ msg := $(error No static glibc found, please install glibc-static);
+ else
+ msg := $(error No gnu/libc-version.h found, please install glibc-dev[el]);
+ endif
+ endif
+ else
+ ifndef NO_LIBDW_DWARF_UNWIND
+ ifneq ($(feature-libdw-dwarf-unwind),1)
+ NO_LIBDW_DWARF_UNWIND := 1
+ msg := $(warning No libdw DWARF unwind found, Please install elfutils-devel/libdw-dev >= 0.158 and/or set LIBDW_DIR);
+ endif
+ endif
+ ifneq ($(feature-dwarf), 1)
+ msg := $(warning No libdw.h found or old libdw.h found or elfutils is older than 0.138, disables dwarf support. Please install new elfutils-devel/libdw-dev);
+ NO_DWARF := 1
+ endif # Dwarf support
+ endif # libelf support
+endif # NO_LIBELF
+
+ifndef NO_LIBELF
+ CFLAGS += -DHAVE_LIBELF_SUPPORT
+
+ ifeq ($(feature-libelf-mmap), 1)
+ CFLAGS += -DHAVE_LIBELF_MMAP_SUPPORT
+ endif
+
+ ifeq ($(feature-libelf-getphdrnum), 1)
+ CFLAGS += -DHAVE_ELF_GETPHDRNUM_SUPPORT
+ endif
+
+ # include ARCH specific config
+ -include $(src-perf)/arch/$(ARCH)/Makefile
+
+ ifndef NO_DWARF
+ ifeq ($(origin PERF_HAVE_DWARF_REGS), undefined)
+ msg := $(warning DWARF register mappings have not been defined for architecture $(ARCH), DWARF support disabled);
+ NO_DWARF := 1
+ else
+ CFLAGS += -DHAVE_DWARF_SUPPORT $(LIBDW_CFLAGS)
+ LDFLAGS += $(LIBDW_LDFLAGS)
+ EXTLIBS += -lelf -ldw
+ endif # PERF_HAVE_DWARF_REGS
+ endif # NO_DWARF
+endif # NO_LIBELF
+
+ifndef NO_LIBUNWIND
+ ifneq ($(feature-libunwind), 1)
+ msg := $(warning No libunwind found. Please install libunwind-dev[el] >= 1.1 and/or set LIBUNWIND_DIR);
+ NO_LIBUNWIND := 1
+ endif
+endif
+
+dwarf-post-unwind := 1
+dwarf-post-unwind-text := BUG
+
+# setup DWARF post unwinder
+ifdef NO_LIBUNWIND
+ ifdef NO_LIBDW_DWARF_UNWIND
+ msg := $(warning Disabling post unwind, no support found.);
+ dwarf-post-unwind := 0
+ else
+ dwarf-post-unwind-text := libdw
+ endif
+else
+ dwarf-post-unwind-text := libunwind
+  # libunwind is the default post unwinder, so default the libdw DWARF unwinder to disabled.
+ ifndef NO_LIBDW_DWARF_UNWIND
+ NO_LIBDW_DWARF_UNWIND := 1
+ endif
+endif
+
+ifeq ($(dwarf-post-unwind),1)
+ CFLAGS += -DHAVE_DWARF_UNWIND_SUPPORT
+else
+ NO_DWARF_UNWIND := 1
+endif
+
+ifndef NO_LIBUNWIND
+ ifeq ($(ARCH),$(filter $(ARCH),arm arm64))
+ $(call feature_check,libunwind-debug-frame)
+ ifneq ($(feature-libunwind-debug-frame), 1)
+ msg := $(warning No debug_frame support found in libunwind);
+ CFLAGS += -DNO_LIBUNWIND_DEBUG_FRAME
+ endif
+ else
+ # non-ARM has no dwarf_find_debug_frame() function:
+ CFLAGS += -DNO_LIBUNWIND_DEBUG_FRAME
+ endif
+ CFLAGS += -DHAVE_LIBUNWIND_SUPPORT
+ EXTLIBS += $(LIBUNWIND_LIBS)
+ CFLAGS += $(LIBUNWIND_CFLAGS)
+ LDFLAGS += $(LIBUNWIND_LDFLAGS)
+endif
+
+ifndef NO_LIBAUDIT
+ ifneq ($(feature-libaudit), 1)
+ msg := $(warning No libaudit.h found, disables 'trace' tool, please install audit-libs-devel or libaudit-dev);
+ NO_LIBAUDIT := 1
+ else
+ CFLAGS += -DHAVE_LIBAUDIT_SUPPORT
+ EXTLIBS += -laudit
+ endif
+endif
+
+ifdef NO_NEWT
+ NO_SLANG=1
+endif
+
+ifndef NO_SLANG
+ ifneq ($(feature-libslang), 1)
+ msg := $(warning slang not found, disables TUI support. Please install slang-devel or libslang-dev);
+ NO_SLANG := 1
+ else
+ # Fedora has /usr/include/slang/slang.h, but ubuntu /usr/include/slang.h
+ CFLAGS += -I/usr/include/slang
+ CFLAGS += -DHAVE_SLANG_SUPPORT
+ EXTLIBS += -lslang
+ endif
+endif
+
+ifndef NO_GTK2
+ FLAGS_GTK2=$(CFLAGS) $(LDFLAGS) $(EXTLIBS) $(shell $(PKG_CONFIG) --libs --cflags gtk+-2.0 2>/dev/null)
+ ifneq ($(feature-gtk2), 1)
+ msg := $(warning GTK2 not found, disables GTK2 support. Please install gtk2-devel or libgtk2.0-dev);
+ NO_GTK2 := 1
+ else
+ ifeq ($(feature-gtk2-infobar), 1)
+ GTK_CFLAGS := -DHAVE_GTK_INFO_BAR_SUPPORT
+ endif
+ CFLAGS += -DHAVE_GTK2_SUPPORT
+ GTK_CFLAGS += $(shell $(PKG_CONFIG) --cflags gtk+-2.0 2>/dev/null)
+ GTK_LIBS := $(shell $(PKG_CONFIG) --libs gtk+-2.0 2>/dev/null)
+ EXTLIBS += -ldl
+ endif
+endif
+
+grep-libs = $(filter -l%,$(1))
+strip-libs = $(filter-out -l%,$(1))
+
+ifdef NO_LIBPERL
+ CFLAGS += -DNO_LIBPERL
+else
+ PERL_EMBED_LDOPTS = $(shell perl -MExtUtils::Embed -e ldopts 2>/dev/null)
+ PERL_EMBED_LDFLAGS = $(call strip-libs,$(PERL_EMBED_LDOPTS))
+ PERL_EMBED_LIBADD = $(call grep-libs,$(PERL_EMBED_LDOPTS))
+ PERL_EMBED_CCOPTS = `perl -MExtUtils::Embed -e ccopts 2>/dev/null`
+ FLAGS_PERL_EMBED=$(PERL_EMBED_CCOPTS) $(PERL_EMBED_LDOPTS)
+
+ ifneq ($(feature-libperl), 1)
+ CFLAGS += -DNO_LIBPERL
+ NO_LIBPERL := 1
+ msg := $(warning Missing perl devel files. Disabling perl scripting support, consider installing perl-ExtUtils-Embed);
+ else
+ LDFLAGS += $(PERL_EMBED_LDFLAGS)
+ EXTLIBS += $(PERL_EMBED_LIBADD)
+ endif
+endif
+
+ifeq ($(feature-timerfd), 1)
+ CFLAGS += -DHAVE_TIMERFD_SUPPORT
+else
+ msg := $(warning No timerfd support. Disables 'perf kvm stat live');
+endif
+
+disable-python = $(eval $(disable-python_code))
+define disable-python_code
+ CFLAGS += -DNO_LIBPYTHON
+ $(if $(1),$(warning No $(1) was found))
+ $(warning Python support will not be built)
+ NO_LIBPYTHON := 1
+endef
+
+override PYTHON := \
+ $(call get-executable-or-default,PYTHON,python)
+
+ifndef PYTHON
+ $(call disable-python,python interpreter)
+else
+
+ PYTHON_WORD := $(call shell-wordify,$(PYTHON))
+
+ ifdef NO_LIBPYTHON
+ $(call disable-python)
+ else
+
+ override PYTHON_CONFIG := \
+ $(call get-executable-or-default,PYTHON_CONFIG,$(PYTHON)-config)
+
+ ifndef PYTHON_CONFIG
+ $(call disable-python,python-config tool)
+ else
+
+ PYTHON_CONFIG_SQ := $(call shell-sq,$(PYTHON_CONFIG))
+
+ PYTHON_EMBED_LDOPTS := $(shell $(PYTHON_CONFIG_SQ) --ldflags 2>/dev/null)
+ PYTHON_EMBED_LDFLAGS := $(call strip-libs,$(PYTHON_EMBED_LDOPTS))
+ PYTHON_EMBED_LIBADD := $(call grep-libs,$(PYTHON_EMBED_LDOPTS))
+ PYTHON_EMBED_CCOPTS := $(shell $(PYTHON_CONFIG_SQ) --cflags 2>/dev/null)
+ FLAGS_PYTHON_EMBED := $(PYTHON_EMBED_CCOPTS) $(PYTHON_EMBED_LDOPTS)
+
+ ifneq ($(feature-libpython), 1)
+ $(call disable-python,Python.h (for Python 2.x))
+ else
+
+ ifneq ($(feature-libpython-version), 1)
+ $(warning Python 3 is not yet supported; please set)
+ $(warning PYTHON and/or PYTHON_CONFIG appropriately.)
+ $(warning If you also have Python 2 installed, then)
+ $(warning try something like:)
+ $(warning $(and ,))
+ $(warning $(and ,) make PYTHON=python2)
+ $(warning $(and ,))
+ $(warning Otherwise, disable Python support entirely:)
+ $(warning $(and ,))
+ $(warning $(and ,) make NO_LIBPYTHON=1)
+ $(warning $(and ,))
+ $(error $(and ,))
+ else
+ LDFLAGS += $(PYTHON_EMBED_LDFLAGS)
+ EXTLIBS += $(PYTHON_EMBED_LIBADD)
+ LANG_BINDINGS += $(obj-perf)python/perf.so
+ endif
+ endif
+ endif
+ endif
+endif
+
+ifeq ($(feature-libbfd), 1)
+ EXTLIBS += -lbfd
+
+ # call all detections now so we get correct
+ # status in VF output
+ $(call feature_check,liberty)
+ $(call feature_check,liberty-z)
+ $(call feature_check,cplus-demangle)
+
+ ifeq ($(feature-liberty), 1)
+ EXTLIBS += -liberty
+ else
+ ifeq ($(feature-liberty-z), 1)
+ EXTLIBS += -liberty -lz
+ endif
+ endif
+endif
+
+ifdef NO_DEMANGLE
+ CFLAGS += -DNO_DEMANGLE
+else
+ ifdef HAVE_CPLUS_DEMANGLE_SUPPORT
+ EXTLIBS += -liberty
+ CFLAGS += -DHAVE_CPLUS_DEMANGLE_SUPPORT
+ else
+ ifneq ($(feature-libbfd), 1)
+ ifneq ($(feature-liberty), 1)
+ ifneq ($(feature-liberty-z), 1)
+        # we have neither HAVE_CPLUS_DEMANGLE_SUPPORT
+        # nor any of the 'bfd iberty z' trinity
+ ifeq ($(feature-cplus-demangle), 1)
+ EXTLIBS += -liberty
+ CFLAGS += -DHAVE_CPLUS_DEMANGLE_SUPPORT
+ else
+ msg := $(warning No bfd.h/libbfd found, install binutils-dev[el]/zlib-static to gain symbol demangling)
+ CFLAGS += -DNO_DEMANGLE
+ endif
+ endif
+ endif
+ endif
+ endif
+endif
+
+ifneq ($(filter -lbfd,$(EXTLIBS)),)
+ CFLAGS += -DHAVE_LIBBFD_SUPPORT
+endif
+
+ifndef NO_BACKTRACE
+ ifeq ($(feature-backtrace), 1)
+ CFLAGS += -DHAVE_BACKTRACE_SUPPORT
+ endif
+endif
+
+ifndef NO_LIBNUMA
+ ifeq ($(feature-libnuma), 0)
+ msg := $(warning No numa.h found, disables 'perf bench numa mem' benchmark, please install numactl-devel/libnuma-devel/libnuma-dev);
+ NO_LIBNUMA := 1
+ else
+ CFLAGS += -DHAVE_LIBNUMA_SUPPORT
+ EXTLIBS += -lnuma
+ endif
+endif
+
+# Among the variables below, these:
+# perfexecdir
+# template_dir
+# mandir
+# infodir
+# htmldir
+# ETC_PERFCONFIG (but not sysconfdir)
+# can be specified as a relative path some/where/else;
+# this is interpreted as relative to $(prefix) and "perf" at
+# runtime figures out where they are based on the path to the executable.
+# This can help installing the suite in a relocatable way.
+
+# Make the path relative to DESTDIR, not to prefix
+ifndef DESTDIR
+prefix ?= $(HOME)
+endif
+bindir_relative = bin
+bindir = $(prefix)/$(bindir_relative)
+mandir = share/man
+infodir = share/info
+perfexecdir = libexec/perf-core
+sharedir = $(prefix)/share
+template_dir = share/perf-core/templates
+htmldir = share/doc/perf-doc
+ifeq ($(prefix),/usr)
+sysconfdir = /etc
+ETC_PERFCONFIG = $(sysconfdir)/perfconfig
+else
+sysconfdir = $(prefix)/etc
+ETC_PERFCONFIG = etc/perfconfig
+endif
+ifeq ($(IS_X86_64),1)
+lib = lib64
+else
+lib = lib
+endif
+libdir = $(prefix)/$(lib)
+
+# Shell quote (do not use $(call) to accommodate ancient setups);
+ETC_PERFCONFIG_SQ = $(subst ','\'',$(ETC_PERFCONFIG))
+DESTDIR_SQ = $(subst ','\'',$(DESTDIR))
+bindir_SQ = $(subst ','\'',$(bindir))
+mandir_SQ = $(subst ','\'',$(mandir))
+infodir_SQ = $(subst ','\'',$(infodir))
+perfexecdir_SQ = $(subst ','\'',$(perfexecdir))
+template_dir_SQ = $(subst ','\'',$(template_dir))
+htmldir_SQ = $(subst ','\'',$(htmldir))
+prefix_SQ = $(subst ','\'',$(prefix))
+sysconfdir_SQ = $(subst ','\'',$(sysconfdir))
+libdir_SQ = $(subst ','\'',$(libdir))
+
+ifneq ($(filter /%,$(firstword $(perfexecdir))),)
+perfexec_instdir = $(perfexecdir)
+else
+perfexec_instdir = $(prefix)/$(perfexecdir)
+endif
+perfexec_instdir_SQ = $(subst ','\'',$(perfexec_instdir))
+
+# If we install to $(HOME) we keep the traceevent default:
+# $(HOME)/.traceevent/plugins
+# Otherwise we install plugins into the global $(libdir).
+ifdef DESTDIR
+plugindir=$(libdir)/traceevent/plugins
+plugindir_SQ= $(subst ','\'',$(plugindir))
+endif
+
+#
+# Print the result of the feature test:
+#
+feature_print_status = $(eval $(feature_print_status_code)) $(info $(MSG))
+
+define feature_print_status_code
+ ifeq ($(feature-$(1)), 1)
+ MSG = $(shell printf '...%30s: [ \033[32mon\033[m ]' $(1))
+ else
+ MSG = $(shell printf '...%30s: [ \033[31mOFF\033[m ]' $(1))
+ endif
+endef
+
+feature_print_var = $(eval $(feature_print_var_code)) $(info $(MSG))
+define feature_print_var_code
+ MSG = $(shell printf '...%30s: %s' $(1) $($(1)))
+endef
+
+feature_print_text = $(eval $(feature_print_text_code)) $(info $(MSG))
+define feature_print_text_code
+ MSG = $(shell printf '...%30s: %s' $(1) $(2))
+endef
+
+PERF_FEATURES := $(foreach feat,$(LIB_FEATURE_TESTS),feature-$(feat)($(feature-$(feat))))
+PERF_FEATURES_FILE := $(shell touch $(OUTPUT)PERF-FEATURES; cat $(OUTPUT)PERF-FEATURES)
+
+ifeq ($(dwarf-post-unwind),1)
+ PERF_FEATURES += dwarf-post-unwind($(dwarf-post-unwind-text))
+endif
+
+# The $(display_lib) variable controls the default detection message
+# output. It's set if:
+# - the detected features differ from the features stored from the
+#   last build (in the PERF-FEATURES file)
+# - one of the $(LIB_FEATURE_TESTS) is not detected
+# - VF is enabled
+
+ifneq ("$(PERF_FEATURES)","$(PERF_FEATURES_FILE)")
+ $(shell echo "$(PERF_FEATURES)" > $(OUTPUT)PERF-FEATURES)
+ display_lib := 1
+endif
+
+feature_check = $(eval $(feature_check_code))
+define feature_check_code
+ ifneq ($(feature-$(1)), 1)
+ display_lib := 1
+ endif
+endef
+
+$(foreach feat,$(LIB_FEATURE_TESTS),$(call feature_check,$(feat)))
+
+ifeq ($(VF),1)
+ display_lib := 1
+ display_vf := 1
+endif
+
+ifeq ($(display_lib),1)
+ $(info )
+ $(info Auto-detecting system features:)
+ $(foreach feat,$(LIB_FEATURE_TESTS),$(call feature_print_status,$(feat),))
+
+ ifeq ($(dwarf-post-unwind),1)
+ $(call feature_print_text,"DWARF post unwind library", $(dwarf-post-unwind-text))
+ endif
+endif
+
+ifeq ($(display_vf),1)
+ $(foreach feat,$(VF_FEATURE_TESTS),$(call feature_print_status,$(feat),))
+ $(info )
+ $(call feature_print_var,prefix)
+ $(call feature_print_var,bindir)
+ $(call feature_print_var,libdir)
+ $(call feature_print_var,sysconfdir)
+ $(call feature_print_var,LIBUNWIND_DIR)
+ $(call feature_print_var,LIBDW_DIR)
+endif
+
+ifeq ($(display_lib),1)
+ $(info )
+endif
diff --git a/tools/perf/config/Makefile.arch b/tools/perf/config/Makefile.arch
new file mode 100644
index 00000000000..4b06719ee98
--- /dev/null
+++ b/tools/perf/config/Makefile.arch
@@ -0,0 +1,23 @@
+
+uname_M := $(shell uname -m 2>/dev/null || echo not)
+
+ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/ -e s/sun4u/sparc64/ \
+ -e s/arm.*/arm/ -e s/sa110/arm/ \
+ -e s/s390x/s390/ -e s/parisc64/parisc/ \
+ -e s/ppc.*/powerpc/ -e s/mips.*/mips/ \
+ -e s/sh[234].*/sh/ -e s/aarch64.*/arm64/ \
+ -e s/tile.*/tile/ )
+
+# Additional ARCH settings for x86
+ifeq ($(ARCH),i386)
+ override ARCH := x86
+endif
+
+ifeq ($(ARCH),x86_64)
+ override ARCH := x86
+ IS_X86_64 := 0
+ ifeq (, $(findstring m32,$(CFLAGS)))
+ IS_X86_64 := $(shell echo __x86_64__ | ${CC} -E -x c - | tail -n 1)
+ RAW_ARCH := x86_64
+ endif
+endif
diff --git a/tools/perf/config/feature-checks/.gitignore b/tools/perf/config/feature-checks/.gitignore
new file mode 100644
index 00000000000..80f3da0c351
--- /dev/null
+++ b/tools/perf/config/feature-checks/.gitignore
@@ -0,0 +1,2 @@
+*.d
+*.bin
diff --git a/tools/perf/config/feature-checks/Makefile b/tools/perf/config/feature-checks/Makefile
new file mode 100644
index 00000000000..64c84e5f051
--- /dev/null
+++ b/tools/perf/config/feature-checks/Makefile
@@ -0,0 +1,149 @@
+
+FILES= \
+ test-all.bin \
+ test-backtrace.bin \
+ test-bionic.bin \
+ test-dwarf.bin \
+ test-fortify-source.bin \
+ test-glibc.bin \
+ test-gtk2.bin \
+ test-gtk2-infobar.bin \
+ test-hello.bin \
+ test-libaudit.bin \
+ test-libbfd.bin \
+ test-liberty.bin \
+ test-liberty-z.bin \
+ test-cplus-demangle.bin \
+ test-libelf.bin \
+ test-libelf-getphdrnum.bin \
+ test-libelf-mmap.bin \
+ test-libnuma.bin \
+ test-libperl.bin \
+ test-libpython.bin \
+ test-libpython-version.bin \
+ test-libslang.bin \
+ test-libunwind.bin \
+ test-libunwind-debug-frame.bin \
+ test-stackprotector-all.bin \
+ test-timerfd.bin \
+ test-libdw-dwarf-unwind.bin
+
+CC := $(CROSS_COMPILE)gcc -MD
+PKG_CONFIG := $(CROSS_COMPILE)pkg-config
+
+all: $(FILES)
+
+BUILD = $(CC) $(CFLAGS) -o $(OUTPUT)$@ $(patsubst %.bin,%.c,$@) $(LDFLAGS)
+
+###############################
+
+test-all.bin:
+ $(BUILD) -Werror -fstack-protector-all -O2 -Werror -D_FORTIFY_SOURCE=2 -ldw -lelf -lnuma -lelf -laudit -I/usr/include/slang -lslang $(shell $(PKG_CONFIG) --libs --cflags gtk+-2.0 2>/dev/null) $(FLAGS_PERL_EMBED) $(FLAGS_PYTHON_EMBED) -DPACKAGE='"perf"' -lbfd -ldl
+
+test-hello.bin:
+ $(BUILD)
+
+test-stackprotector-all.bin:
+ $(BUILD) -Werror -fstack-protector-all
+
+test-fortify-source.bin:
+ $(BUILD) -O2 -Werror -D_FORTIFY_SOURCE=2
+
+test-bionic.bin:
+ $(BUILD)
+
+test-libelf.bin:
+ $(BUILD) -lelf
+
+test-glibc.bin:
+ $(BUILD)
+
+test-dwarf.bin:
+ $(BUILD) -ldw
+
+test-libelf-mmap.bin:
+ $(BUILD) -lelf
+
+test-libelf-getphdrnum.bin:
+ $(BUILD) -lelf
+
+test-libnuma.bin:
+ $(BUILD) -lnuma
+
+test-libunwind.bin:
+ $(BUILD) -lelf
+
+test-libunwind-debug-frame.bin:
+ $(BUILD) -lelf
+
+test-libaudit.bin:
+ $(BUILD) -laudit
+
+test-libslang.bin:
+ $(BUILD) -I/usr/include/slang -lslang
+
+test-gtk2.bin:
+ $(BUILD) $(shell $(PKG_CONFIG) --libs --cflags gtk+-2.0 2>/dev/null)
+
+test-gtk2-infobar.bin:
+ $(BUILD) $(shell $(PKG_CONFIG) --libs --cflags gtk+-2.0 2>/dev/null)
+
+grep-libs = $(filter -l%,$(1))
+strip-libs = $(filter-out -l%,$(1))
+
+PERL_EMBED_LDOPTS = $(shell perl -MExtUtils::Embed -e ldopts 2>/dev/null)
+PERL_EMBED_LDFLAGS = $(call strip-libs,$(PERL_EMBED_LDOPTS))
+PERL_EMBED_LIBADD = $(call grep-libs,$(PERL_EMBED_LDOPTS))
+PERL_EMBED_CCOPTS = `perl -MExtUtils::Embed -e ccopts 2>/dev/null`
+FLAGS_PERL_EMBED=$(PERL_EMBED_CCOPTS) $(PERL_EMBED_LDOPTS)
+
+test-libperl.bin:
+ $(BUILD) $(FLAGS_PERL_EMBED)
+
+override PYTHON := python
+override PYTHON_CONFIG := python-config
+
+escape-for-shell-sq = $(subst ','\'',$(1))
+shell-sq = '$(escape-for-shell-sq)'
+
+PYTHON_CONFIG_SQ = $(call shell-sq,$(PYTHON_CONFIG))
+
+PYTHON_EMBED_LDOPTS = $(shell $(PYTHON_CONFIG_SQ) --ldflags 2>/dev/null)
+PYTHON_EMBED_LDFLAGS = $(call strip-libs,$(PYTHON_EMBED_LDOPTS))
+PYTHON_EMBED_LIBADD = $(call grep-libs,$(PYTHON_EMBED_LDOPTS))
+PYTHON_EMBED_CCOPTS = $(shell $(PYTHON_CONFIG_SQ) --cflags 2>/dev/null)
+FLAGS_PYTHON_EMBED = $(PYTHON_EMBED_CCOPTS) $(PYTHON_EMBED_LDOPTS)
+
+test-libpython.bin:
+ $(BUILD) $(FLAGS_PYTHON_EMBED)
+
+test-libpython-version.bin:
+ $(BUILD) $(FLAGS_PYTHON_EMBED)
+
+test-libbfd.bin:
+ $(BUILD) -DPACKAGE='"perf"' -lbfd -lz -liberty -ldl
+
+test-liberty.bin:
+ $(CC) -o $(OUTPUT)$@ test-libbfd.c -DPACKAGE='"perf"' -lbfd -ldl -liberty
+
+test-liberty-z.bin:
+ $(CC) -o $(OUTPUT)$@ test-libbfd.c -DPACKAGE='"perf"' -lbfd -ldl -liberty -lz
+
+test-cplus-demangle.bin:
+ $(BUILD) -liberty
+
+test-backtrace.bin:
+ $(BUILD)
+
+test-timerfd.bin:
+ $(BUILD)
+
+test-libdw-dwarf-unwind.bin:
+ $(BUILD)
+
+-include *.d
+
+###############################
+
+clean:
+ rm -f $(FILES) *.d
diff --git a/tools/perf/config/feature-checks/test-all.c b/tools/perf/config/feature-checks/test-all.c
new file mode 100644
index 00000000000..fe5c1e5c952
--- /dev/null
+++ b/tools/perf/config/feature-checks/test-all.c
@@ -0,0 +1,116 @@
+/*
+ * test-all.c: Try to build all the main testcases at once.
+ *
+ * A well-configured system will have all the prereqs installed, so we can speed
+ * up auto-detection on such systems.
+ */
+
+/*
+ * Quirk: Python and Perl headers cannot be in arbitrary places, so keep
+ * these 3 testcases at the top:
+ */
+#define main main_test_libpython
+# include "test-libpython.c"
+#undef main
+
+#define main main_test_libpython_version
+# include "test-libpython-version.c"
+#undef main
+
+#define main main_test_libperl
+# include "test-libperl.c"
+#undef main
+
+#define main main_test_hello
+# include "test-hello.c"
+#undef main
+
+#define main main_test_libelf
+# include "test-libelf.c"
+#undef main
+
+#define main main_test_libelf_mmap
+# include "test-libelf-mmap.c"
+#undef main
+
+#define main main_test_glibc
+# include "test-glibc.c"
+#undef main
+
+#define main main_test_dwarf
+# include "test-dwarf.c"
+#undef main
+
+#define main main_test_libelf_getphdrnum
+# include "test-libelf-getphdrnum.c"
+#undef main
+
+#define main main_test_libunwind
+# include "test-libunwind.c"
+#undef main
+
+#define main main_test_libaudit
+# include "test-libaudit.c"
+#undef main
+
+#define main main_test_libslang
+# include "test-libslang.c"
+#undef main
+
+#define main main_test_gtk2
+# include "test-gtk2.c"
+#undef main
+
+#define main main_test_gtk2_infobar
+# include "test-gtk2-infobar.c"
+#undef main
+
+#define main main_test_libbfd
+# include "test-libbfd.c"
+#undef main
+
+#define main main_test_backtrace
+# include "test-backtrace.c"
+#undef main
+
+#define main main_test_libnuma
+# include "test-libnuma.c"
+#undef main
+
+#define main main_test_timerfd
+# include "test-timerfd.c"
+#undef main
+
+#define main main_test_stackprotector_all
+# include "test-stackprotector-all.c"
+#undef main
+
+#define main main_test_libdw_dwarf_unwind
+# include "test-libdw-dwarf-unwind.c"
+#undef main
+
+int main(int argc, char *argv[])
+{
+ main_test_libpython();
+ main_test_libpython_version();
+ main_test_libperl();
+ main_test_hello();
+ main_test_libelf();
+ main_test_libelf_mmap();
+ main_test_glibc();
+ main_test_dwarf();
+ main_test_libelf_getphdrnum();
+ main_test_libunwind();
+ main_test_libaudit();
+ main_test_libslang();
+ main_test_gtk2(argc, argv);
+ main_test_gtk2_infobar(argc, argv);
+ main_test_libbfd();
+ main_test_backtrace();
+ main_test_libnuma();
+ main_test_timerfd();
+ main_test_stackprotector_all();
+ main_test_libdw_dwarf_unwind();
+
+ return 0;
+}
diff --git a/tools/perf/config/feature-checks/test-backtrace.c b/tools/perf/config/feature-checks/test-backtrace.c
new file mode 100644
index 00000000000..7124aa1dc8f
--- /dev/null
+++ b/tools/perf/config/feature-checks/test-backtrace.c
@@ -0,0 +1,13 @@
+#include <execinfo.h>
+#include <stdio.h>
+
+int main(void)
+{
+ void *backtrace_fns[10];
+ size_t entries;
+
+ entries = backtrace(backtrace_fns, 10);
+ backtrace_symbols_fd(backtrace_fns, entries, 1);
+
+ return 0;
+}
diff --git a/tools/perf/config/feature-checks/test-bionic.c b/tools/perf/config/feature-checks/test-bionic.c
new file mode 100644
index 00000000000..eac24e9513e
--- /dev/null
+++ b/tools/perf/config/feature-checks/test-bionic.c
@@ -0,0 +1,6 @@
+#include <android/api-level.h>
+
+int main(void)
+{
+ return __ANDROID_API__;
+}
diff --git a/tools/perf/config/feature-checks/test-cplus-demangle.c b/tools/perf/config/feature-checks/test-cplus-demangle.c
new file mode 100644
index 00000000000..610c686e000
--- /dev/null
+++ b/tools/perf/config/feature-checks/test-cplus-demangle.c
@@ -0,0 +1,14 @@
+extern int printf(const char *format, ...);
+extern char *cplus_demangle(const char *, int);
+
+int main(void)
+{
+ char symbol[4096] = "FieldName__9ClassNameFd";
+ char *tmp;
+
+ tmp = cplus_demangle(symbol, 0);
+
+ printf("demangled symbol: {%s}\n", tmp);
+
+ return 0;
+}
diff --git a/tools/perf/config/feature-checks/test-dwarf.c b/tools/perf/config/feature-checks/test-dwarf.c
new file mode 100644
index 00000000000..3fc1801ce4a
--- /dev/null
+++ b/tools/perf/config/feature-checks/test-dwarf.c
@@ -0,0 +1,10 @@
+#include <dwarf.h>
+#include <elfutils/libdw.h>
+#include <elfutils/version.h>
+
+int main(void)
+{
+ Dwarf *dbg = dwarf_begin(0, DWARF_C_READ);
+
+ return (long)dbg;
+}
diff --git a/tools/perf/config/feature-checks/test-fortify-source.c b/tools/perf/config/feature-checks/test-fortify-source.c
new file mode 100644
index 00000000000..c9f398d8786
--- /dev/null
+++ b/tools/perf/config/feature-checks/test-fortify-source.c
@@ -0,0 +1,6 @@
+#include <stdio.h>
+
+int main(void)
+{
+ return puts("hi");
+}
diff --git a/tools/perf/config/feature-checks/test-glibc.c b/tools/perf/config/feature-checks/test-glibc.c
new file mode 100644
index 00000000000..b0820345cd9
--- /dev/null
+++ b/tools/perf/config/feature-checks/test-glibc.c
@@ -0,0 +1,8 @@
+#include <gnu/libc-version.h>
+
+int main(void)
+{
+ const char *version = gnu_get_libc_version();
+
+ return (long)version;
+}
diff --git a/tools/perf/config/feature-checks/test-gtk2-infobar.c b/tools/perf/config/feature-checks/test-gtk2-infobar.c
new file mode 100644
index 00000000000..397b4646d06
--- /dev/null
+++ b/tools/perf/config/feature-checks/test-gtk2-infobar.c
@@ -0,0 +1,11 @@
+#pragma GCC diagnostic ignored "-Wstrict-prototypes"
+#include <gtk/gtk.h>
+#pragma GCC diagnostic error "-Wstrict-prototypes"
+
+int main(int argc, char *argv[])
+{
+ gtk_init(&argc, &argv);
+ gtk_info_bar_new();
+
+ return 0;
+}
diff --git a/tools/perf/config/feature-checks/test-gtk2.c b/tools/perf/config/feature-checks/test-gtk2.c
new file mode 100644
index 00000000000..6bd80e50943
--- /dev/null
+++ b/tools/perf/config/feature-checks/test-gtk2.c
@@ -0,0 +1,10 @@
+#pragma GCC diagnostic ignored "-Wstrict-prototypes"
+#include <gtk/gtk.h>
+#pragma GCC diagnostic error "-Wstrict-prototypes"
+
+int main(int argc, char *argv[])
+{
+ gtk_init(&argc, &argv);
+
+ return 0;
+}
diff --git a/tools/perf/config/feature-checks/test-hello.c b/tools/perf/config/feature-checks/test-hello.c
new file mode 100644
index 00000000000..c9f398d8786
--- /dev/null
+++ b/tools/perf/config/feature-checks/test-hello.c
@@ -0,0 +1,6 @@
+#include <stdio.h>
+
+int main(void)
+{
+ return puts("hi");
+}
diff --git a/tools/perf/config/feature-checks/test-libaudit.c b/tools/perf/config/feature-checks/test-libaudit.c
new file mode 100644
index 00000000000..afc019f0864
--- /dev/null
+++ b/tools/perf/config/feature-checks/test-libaudit.c
@@ -0,0 +1,10 @@
+#include <libaudit.h>
+
+extern int printf(const char *format, ...);
+
+int main(void)
+{
+ printf("error message: %s\n", audit_errno_to_name(0));
+
+ return audit_open();
+}
diff --git a/tools/perf/config/feature-checks/test-libbfd.c b/tools/perf/config/feature-checks/test-libbfd.c
new file mode 100644
index 00000000000..24059907e99
--- /dev/null
+++ b/tools/perf/config/feature-checks/test-libbfd.c
@@ -0,0 +1,15 @@
+#include <bfd.h>
+
+extern int printf(const char *format, ...);
+
+int main(void)
+{
+ char symbol[4096] = "FieldName__9ClassNameFd";
+ char *tmp;
+
+ tmp = bfd_demangle(0, symbol, 0);
+
+ printf("demangled symbol: {%s}\n", tmp);
+
+ return 0;
+}
diff --git a/tools/perf/config/feature-checks/test-libdw-dwarf-unwind.c b/tools/perf/config/feature-checks/test-libdw-dwarf-unwind.c
new file mode 100644
index 00000000000..f676a3ff442
--- /dev/null
+++ b/tools/perf/config/feature-checks/test-libdw-dwarf-unwind.c
@@ -0,0 +1,13 @@
+
+#include <elfutils/libdwfl.h>
+
+int main(void)
+{
+ /*
+ * This function is guarded via: __nonnull_attribute__ (1, 2).
+ * Passing '1' as the argument values. This code is never executed,
+ * only compiled.
+ */
+ dwfl_thread_getframes((void *) 1, (void *) 1, NULL);
+ return 0;
+}
diff --git a/tools/perf/config/feature-checks/test-libelf-getphdrnum.c b/tools/perf/config/feature-checks/test-libelf-getphdrnum.c
new file mode 100644
index 00000000000..d710459306c
--- /dev/null
+++ b/tools/perf/config/feature-checks/test-libelf-getphdrnum.c
@@ -0,0 +1,8 @@
+#include <libelf.h>
+
+int main(void)
+{
+ size_t dst;
+
+ return elf_getphdrnum(0, &dst);
+}
diff --git a/tools/perf/config/feature-checks/test-libelf-mmap.c b/tools/perf/config/feature-checks/test-libelf-mmap.c
new file mode 100644
index 00000000000..564427d7ef1
--- /dev/null
+++ b/tools/perf/config/feature-checks/test-libelf-mmap.c
@@ -0,0 +1,8 @@
+#include <libelf.h>
+
+int main(void)
+{
+ Elf *elf = elf_begin(0, ELF_C_READ_MMAP, 0);
+
+ return (long)elf;
+}
diff --git a/tools/perf/config/feature-checks/test-libelf.c b/tools/perf/config/feature-checks/test-libelf.c
new file mode 100644
index 00000000000..08db322d895
--- /dev/null
+++ b/tools/perf/config/feature-checks/test-libelf.c
@@ -0,0 +1,8 @@
+#include <libelf.h>
+
+int main(void)
+{
+ Elf *elf = elf_begin(0, ELF_C_READ, 0);
+
+ return (long)elf;
+}
diff --git a/tools/perf/config/feature-checks/test-libnuma.c b/tools/perf/config/feature-checks/test-libnuma.c
new file mode 100644
index 00000000000..4763d9cd587
--- /dev/null
+++ b/tools/perf/config/feature-checks/test-libnuma.c
@@ -0,0 +1,9 @@
+#include <numa.h>
+#include <numaif.h>
+
+int main(void)
+{
+ numa_available();
+
+ return 0;
+}
diff --git a/tools/perf/config/feature-checks/test-libperl.c b/tools/perf/config/feature-checks/test-libperl.c
new file mode 100644
index 00000000000..8871f6a0fdb
--- /dev/null
+++ b/tools/perf/config/feature-checks/test-libperl.c
@@ -0,0 +1,9 @@
+#include <EXTERN.h>
+#include <perl.h>
+
+int main(void)
+{
+ perl_alloc();
+
+ return 0;
+}
diff --git a/tools/perf/config/feature-checks/test-libpython-version.c b/tools/perf/config/feature-checks/test-libpython-version.c
new file mode 100644
index 00000000000..facea122d81
--- /dev/null
+++ b/tools/perf/config/feature-checks/test-libpython-version.c
@@ -0,0 +1,10 @@
+#include <Python.h>
+
+#if PY_VERSION_HEX >= 0x03000000
+ #error
+#endif
+
+int main(void)
+{
+ return 0;
+}
diff --git a/tools/perf/config/feature-checks/test-libpython.c b/tools/perf/config/feature-checks/test-libpython.c
new file mode 100644
index 00000000000..b24b28ad632
--- /dev/null
+++ b/tools/perf/config/feature-checks/test-libpython.c
@@ -0,0 +1,8 @@
+#include <Python.h>
+
+int main(void)
+{
+ Py_Initialize();
+
+ return 0;
+}
diff --git a/tools/perf/config/feature-checks/test-libslang.c b/tools/perf/config/feature-checks/test-libslang.c
new file mode 100644
index 00000000000..22ff22ed94d
--- /dev/null
+++ b/tools/perf/config/feature-checks/test-libslang.c
@@ -0,0 +1,6 @@
+#include <slang.h>
+
+int main(void)
+{
+ return SLsmg_init_smg();
+}
diff --git a/tools/perf/config/feature-checks/test-libunwind-debug-frame.c b/tools/perf/config/feature-checks/test-libunwind-debug-frame.c
new file mode 100644
index 00000000000..0ef8087a104
--- /dev/null
+++ b/tools/perf/config/feature-checks/test-libunwind-debug-frame.c
@@ -0,0 +1,16 @@
+#include <libunwind.h>
+#include <stdlib.h>
+
+extern int
+UNW_OBJ(dwarf_find_debug_frame) (int found, unw_dyn_info_t *di_debug,
+ unw_word_t ip, unw_word_t segbase,
+ const char *obj_name, unw_word_t start,
+ unw_word_t end);
+
+#define dwarf_find_debug_frame UNW_OBJ(dwarf_find_debug_frame)
+
+int main(void)
+{
+ dwarf_find_debug_frame(0, NULL, 0, 0, NULL, 0, 0);
+ return 0;
+}
diff --git a/tools/perf/config/feature-checks/test-libunwind.c b/tools/perf/config/feature-checks/test-libunwind.c
new file mode 100644
index 00000000000..43b9369bcab
--- /dev/null
+++ b/tools/perf/config/feature-checks/test-libunwind.c
@@ -0,0 +1,27 @@
+#include <libunwind.h>
+#include <stdlib.h>
+
+extern int UNW_OBJ(dwarf_search_unwind_table) (unw_addr_space_t as,
+ unw_word_t ip,
+ unw_dyn_info_t *di,
+ unw_proc_info_t *pi,
+ int need_unwind_info, void *arg);
+
+
+#define dwarf_search_unwind_table UNW_OBJ(dwarf_search_unwind_table)
+
+static unw_accessors_t accessors;
+
+int main(void)
+{
+ unw_addr_space_t addr_space;
+
+ addr_space = unw_create_addr_space(&accessors, 0);
+ if (addr_space)
+ return 0;
+
+ unw_init_remote(NULL, addr_space, NULL);
+ dwarf_search_unwind_table(addr_space, 0, NULL, NULL, 0, NULL);
+
+ return 0;
+}
diff --git a/tools/perf/config/feature-checks/test-stackprotector-all.c b/tools/perf/config/feature-checks/test-stackprotector-all.c
new file mode 100644
index 00000000000..c9f398d8786
--- /dev/null
+++ b/tools/perf/config/feature-checks/test-stackprotector-all.c
@@ -0,0 +1,6 @@
+#include <stdio.h>
+
+int main(void)
+{
+ return puts("hi");
+}
diff --git a/tools/perf/config/feature-checks/test-timerfd.c b/tools/perf/config/feature-checks/test-timerfd.c
new file mode 100644
index 00000000000..8c5c083b4d3
--- /dev/null
+++ b/tools/perf/config/feature-checks/test-timerfd.c
@@ -0,0 +1,18 @@
+/*
+ * test for timerfd functions used by perf-kvm-stat-live
+ */
+#include <sys/timerfd.h>
+
+int main(void)
+{
+ struct itimerspec new_value;
+
+ int fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK);
+ if (fd < 0)
+ return 1;
+
+ if (timerfd_settime(fd, 0, &new_value, NULL) != 0)
+ return 1;
+
+ return 0;
+}
diff --git a/tools/perf/config/utilities.mak b/tools/perf/config/utilities.mak
new file mode 100644
index 00000000000..4d985e0f03f
--- /dev/null
+++ b/tools/perf/config/utilities.mak
@@ -0,0 +1,180 @@
+# This allows us to work with the newline character:
+define newline
+
+
+endef
+newline := $(newline)
+
+# nl-escape
+#
+# Usage: escape = $(call nl-escape[,escape])
+#
+# This is used as the common way to specify
+# what should replace a newline when escaping
+# newlines; the default is a bizarre string.
+#
+nl-escape = $(if $(1),$(1),m822df3020w6a44id34bt574ctac44eb9f4n)
+
+# escape-nl
+#
+# Usage: escaped-text = $(call escape-nl,text[,escape])
+#
+# GNU make's $(shell ...) function converts to a
+# single space each newline character in the output
+# produced during the expansion; this may not be
+# desirable.
+#
+# The only solution is to change each newline into
+# something that won't be converted, so that the
+# information can be recovered later with
+# $(call unescape-nl...)
+#
+escape-nl = $(subst $(newline),$(call nl-escape,$(2)),$(1))
+
+# unescape-nl
+#
+# Usage: text = $(call unescape-nl,escaped-text[,escape])
+#
+# See escape-nl.
+#
+unescape-nl = $(subst $(call nl-escape,$(2)),$(newline),$(1))
+
+# shell-escape-nl
+#
+# Usage: $(shell some-command | $(call shell-escape-nl[,escape]))
+#
+# Use this to escape newlines from within a shell call;
+# the default escape is a bizarre string.
+#
+# NOTE: The escape is used directly as a string constant
+# in an `awk' program that is delimited by shell
+# single-quotes, so be wary of the characters
+# that are chosen.
+#
+define shell-escape-nl
+awk 'NR==1 {t=$$0} NR>1 {t=t "$(nl-escape)" $$0} END {printf t}'
+endef
+
+# shell-unescape-nl
+#
+# Usage: $(shell some-command | $(call shell-unescape-nl[,escape]))
+#
+# Use this to unescape newlines from within a shell call;
+# the default escape is a bizarre string.
+#
+# NOTE: The escape is used directly as an extended regular
+# expression constant in an `awk' program that is
+# delimited by shell single-quotes, so be wary
+# of the characters that are chosen.
+#
+# (The bash shell has a bug where `{gsub(...),...}' is
+# misinterpreted as a brace expansion; this can be
+# overcome by putting a space between `{' and `gsub').
+#
+define shell-unescape-nl
+awk 'NR==1 {t=$$0} NR>1 {t=t "\n" $$0} END { gsub(/$(nl-escape)/,"\n",t); printf t }'
+endef
+
+# escape-for-shell-sq
+#
+# Usage: embeddable-text = $(call escape-for-shell-sq,text)
+#
+# This function produces text that is suitable for
+# embedding in a shell string that is delimited by
+# single-quotes.
+#
+escape-for-shell-sq = $(subst ','\'',$(1))
+
+# shell-sq
+#
+# Usage: single-quoted-and-escaped-text = $(call shell-sq,text)
+#
+shell-sq = '$(escape-for-shell-sq)'
+
+# shell-wordify
+#
+# Usage: wordified-text = $(call shell-wordify,text)
+#
+# For instance:
+#
+# |define text
+# |hello
+# |world
+# |endef
+# |
+# |target:
+# | echo $(call shell-wordify,$(text))
+#
+# At least GNU make gets confused by expanding a newline
+# within the context of a command line of a makefile rule
+# (this is in contrast to a `$(shell ...)' function call,
+# which can handle it just fine).
+#
+# This function avoids the problem by producing a string
+# that works as a shell word, regardless of whether or
+# not it contains a newline.
+#
+# If the text to be wordified contains a newline, then
+# an intricate shell command substitution is constructed
+# to render the text as a single line; when the shell
+# processes the resulting escaped text, it transforms
+# it into the original unescaped text.
+#
+# If the text does not contain a newline, then this function
+# produces the same results as the `$(shell-sq)' function.
+#
+shell-wordify = $(if $(findstring $(newline),$(1)),$(_sw-esc-nl),$(shell-sq))
+define _sw-esc-nl
+"$$(echo $(call escape-nl,$(shell-sq),$(2)) | $(call shell-unescape-nl,$(2)))"
+endef
+
+# is-absolute
+#
+# Usage: bool-value = $(call is-absolute,path)
+#
+is-absolute = $(shell echo $(shell-sq) | grep ^/ -q && echo y)
+
+# lookup
+#
+# Usage: absolute-executable-path-or-empty = $(call lookup,path)
+#
+# (It's necessary to use `sh -c' because GNU make messes up by
+# trying too hard and getting things wrong).
+#
+lookup = $(call unescape-nl,$(shell sh -c $(_l-sh)))
+_l-sh = $(call shell-sq,command -v $(shell-sq) | $(call shell-escape-nl,))
+
+# is-executable
+#
+# Usage: bool-value = $(call is-executable,path)
+#
+# (It's necessary to use `sh -c' because GNU make messes up by
+# trying too hard and getting things wrong).
+#
+is-executable = $(call _is-executable-helper,$(shell-sq))
+_is-executable-helper = $(shell sh -c $(_is-executable-sh))
+_is-executable-sh = $(call shell-sq,test -f $(1) -a -x $(1) && echo y)
+
+# get-executable
+#
+# Usage: absolute-executable-path-or-empty = $(call get-executable,path)
+#
+# The goal is to get an absolute path for an executable;
+# the `command -v' is defined by POSIX, but it's not
+# necessarily very portable, so it's only used if
+# relative path resolution is requested, as determined
+# by the absence of a leading `/'.
+#
+get-executable = $(if $(1),$(if $(is-absolute),$(_ge-abspath),$(lookup)))
+_ge-abspath = $(if $(is-executable),$(1))
+
+# get-executable-or-default
+#
+# Usage: absolute-executable-path-or-empty = $(call get-executable-or-default,variable,default)
+#
+define get-executable-or-default
+$(if $($(1)),$(call _ge_attempt,$($(1)),$(1)),$(call _ge_attempt,$(2)))
+endef
+_ge_attempt = $(if $(get-executable),$(get-executable),$(_gea_warn)$(call _gea_err,$(2)))
+_gea_warn = $(warning The path '$(1)' is not executable.)
+_gea_err = $(if $(1),$(error Please set '$(1)' appropriately))
diff --git a/tools/perf/design.txt b/tools/perf/design.txt
index bd0bb1b1279..a28dca2582a 100644
--- a/tools/perf/design.txt
+++ b/tools/perf/design.txt
@@ -18,7 +18,7 @@ underlying hardware counters.
Performance counters are accessed via special file descriptors.
There's one file descriptor per virtual counter used.
-The special file descriptor is opened via the perf_event_open()
+The special file descriptor is opened via the sys_perf_event_open()
system call:
int sys_perf_event_open(struct perf_event_attr *hw_event_uptr,
@@ -82,7 +82,7 @@ machine-specific.
If 'raw_type' is 0, then the 'type' field says what kind of counter
this is, with the following encoding:
-enum perf_event_types {
+enum perf_type_id {
PERF_TYPE_HARDWARE = 0,
PERF_TYPE_SOFTWARE = 1,
PERF_TYPE_TRACEPOINT = 2,
@@ -95,7 +95,7 @@ specified by 'event_id':
* Generalized performance counter event types, used by the hw_event.event_id
* parameter of the sys_perf_event_open() syscall:
*/
-enum hw_event_ids {
+enum perf_hw_id {
/*
* Common hardware events, generalized by the kernel:
*/
@@ -129,7 +129,7 @@ software events, selected by 'event_id':
* physical and sw events of the kernel (and allow the profiling of them as
* well):
*/
-enum sw_event_ids {
+enum perf_sw_ids {
PERF_COUNT_SW_CPU_CLOCK = 0,
PERF_COUNT_SW_TASK_CLOCK = 1,
PERF_COUNT_SW_PAGE_FAULTS = 2,
@@ -230,7 +230,7 @@ these events are recorded in the ring-buffer (see below).
The 'comm' bit allows tracking of process comm data on process creation.
This too is recorded in the ring-buffer (see below).
-The 'pid' parameter to the perf_event_open() system call allows the
+The 'pid' parameter to the sys_perf_event_open() system call allows the
counter to be specific to a task:
pid == 0: if the pid parameter is zero, the counter is attached to the
@@ -260,7 +260,7 @@ The 'flags' parameter is currently unused and must be zero.
The 'group_fd' parameter allows counter "groups" to be set up. A
counter group has one counter which is the group "leader". The leader
-is created first, with group_fd = -1 in the perf_event_open call
+is created first, with group_fd = -1 in the sys_perf_event_open call
that creates it. The rest of the group members are created
subsequently, with group_fd giving the fd of the group leader.
(A single counter on its own is created with group_fd = -1 and is
@@ -409,14 +409,15 @@ Counters can be enabled and disabled in two ways: via ioctl and via
prctl. When a counter is disabled, it doesn't count or generate
events but does continue to exist and maintain its count value.
-An individual counter or counter group can be enabled with
+An individual counter can be enabled with
- ioctl(fd, PERF_EVENT_IOC_ENABLE);
+ ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
or disabled with
- ioctl(fd, PERF_EVENT_IOC_DISABLE);
+ ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
+For a counter group, pass PERF_IOC_FLAG_GROUP as the third argument.
Enabling or disabling the leader of a group enables or disables the
whole group; that is, while the group leader is disabled, none of the
counters in the group will count. Enabling or disabling a member of a
@@ -453,7 +454,6 @@ So to start with, in order to add HAVE_PERF_EVENTS to your Kconfig, you
will need at least this:
- asm/perf_event.h - a basic stub will suffice at first
- support for atomic64 types (and associated helper functions)
- - set_perf_event_pending() implemented
If your architecture does have hardware capabilities, you can override the
weak stub hw_perf_event_init() to register hardware counters.
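
For illustration only (not part of this patch): a minimal user-space sketch of
the enable/disable ioctls described above, assuming a group leader fd already
obtained from sys_perf_event_open() and using only the PERF_EVENT_IOC_ENABLE,
PERF_EVENT_IOC_DISABLE and PERF_IOC_FLAG_GROUP definitions from
linux/perf_event.h:

	/* Sketch only: toggle a whole counter group via its leader fd. */
	#include <sys/ioctl.h>
	#include <linux/perf_event.h>

	int toggle_group(int group_leader_fd, int enable)
	{
		unsigned long request = enable ? PERF_EVENT_IOC_ENABLE :
						 PERF_EVENT_IOC_DISABLE;

		/* PERF_IOC_FLAG_GROUP applies the request to every member
		 * of the group, not just the leader. */
		return ioctl(group_leader_fd, request, PERF_IOC_FLAG_GROUP);
	}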
diff --git a/tools/perf/perf-archive.sh b/tools/perf/perf-archive.sh
index 910468e6e01..e9193062026 100644
--- a/tools/perf/perf-archive.sh
+++ b/tools/perf/perf-archive.sh
@@ -7,27 +7,41 @@ if [ $# -ne 0 ] ; then
PERF_DATA=$1
fi
-DEBUGDIR=~/.debug/
+#
+# The PERF_BUILDID_DIR environment variable is set by perf and holds the
+# path to the buildid directory; it defaults to $HOME/.debug
+#
+if [ -z $PERF_BUILDID_DIR ]; then
+ PERF_BUILDID_DIR=~/.debug/
+else
+ # append / to make substitutions work
+ PERF_BUILDID_DIR=$PERF_BUILDID_DIR/
+fi
+
BUILDIDS=$(mktemp /tmp/perf-archive-buildids.XXXXXX)
NOBUILDID=0000000000000000000000000000000000000000
perf buildid-list -i $PERF_DATA --with-hits | grep -v "^$NOBUILDID " > $BUILDIDS
if [ ! -s $BUILDIDS ] ; then
echo "perf archive: no build-ids found"
- rm -f $BUILDIDS
+ rm $BUILDIDS || true
exit 1
fi
MANIFEST=$(mktemp /tmp/perf-archive-manifest.XXXXXX)
+PERF_BUILDID_LINKDIR=$(readlink -f $PERF_BUILDID_DIR)/
cut -d ' ' -f 1 $BUILDIDS | \
while read build_id ; do
- linkname=$DEBUGDIR.build-id/${build_id:0:2}/${build_id:2}
+ linkname=$PERF_BUILDID_DIR.build-id/${build_id:0:2}/${build_id:2}
filename=$(readlink -f $linkname)
- echo ${linkname#$DEBUGDIR} >> $MANIFEST
- echo ${filename#$DEBUGDIR} >> $MANIFEST
+ echo ${linkname#$PERF_BUILDID_DIR} >> $MANIFEST
+ echo ${filename#$PERF_BUILDID_LINKDIR} >> $MANIFEST
done
-tar cfj $PERF_DATA.tar.bz2 -C $DEBUGDIR -T $MANIFEST
-rm -f $MANIFEST $BUILDIDS
+tar cjf $PERF_DATA.tar.bz2 -C $PERF_BUILDID_DIR -T $MANIFEST
+rm $MANIFEST $BUILDIDS || true
+echo -e "Now please run:\n"
+echo -e "$ tar xvf $PERF_DATA.tar.bz2 -C ~/.debug\n"
+echo "wherever you need to run 'perf report' on."
exit 0
diff --git a/tools/perf/perf-completion.sh b/tools/perf/perf-completion.sh
new file mode 100644
index 00000000000..33569847fdc
--- /dev/null
+++ b/tools/perf/perf-completion.sh
@@ -0,0 +1,206 @@
+# perf bash and zsh completion
+
+# Taken from git.git's completion script.
+__my_reassemble_comp_words_by_ref()
+{
+ local exclude i j first
+ # Which word separators to exclude?
+ exclude="${1//[^$COMP_WORDBREAKS]}"
+ cword_=$COMP_CWORD
+ if [ -z "$exclude" ]; then
+ words_=("${COMP_WORDS[@]}")
+ return
+ fi
+ # List of word completion separators has shrunk;
+ # re-assemble words to complete.
+ for ((i=0, j=0; i < ${#COMP_WORDS[@]}; i++, j++)); do
+ # Append each nonempty word consisting of just
+ # word separator characters to the current word.
+ first=t
+ while
+ [ $i -gt 0 ] &&
+ [ -n "${COMP_WORDS[$i]}" ] &&
+ # word consists of excluded word separators
+ [ "${COMP_WORDS[$i]//[^$exclude]}" = "${COMP_WORDS[$i]}" ]
+ do
+ # Attach to the previous token,
+ # unless the previous token is the command name.
+ if [ $j -ge 2 ] && [ -n "$first" ]; then
+ ((j--))
+ fi
+ first=
+ words_[$j]=${words_[j]}${COMP_WORDS[i]}
+ if [ $i = $COMP_CWORD ]; then
+ cword_=$j
+ fi
+ if (($i < ${#COMP_WORDS[@]} - 1)); then
+ ((i++))
+ else
+ # Done.
+ return
+ fi
+ done
+ words_[$j]=${words_[j]}${COMP_WORDS[i]}
+ if [ $i = $COMP_CWORD ]; then
+ cword_=$j
+ fi
+ done
+}
+
+type _get_comp_words_by_ref &>/dev/null ||
+_get_comp_words_by_ref()
+{
+ local exclude cur_ words_ cword_
+ if [ "$1" = "-n" ]; then
+ exclude=$2
+ shift 2
+ fi
+ __my_reassemble_comp_words_by_ref "$exclude"
+ cur_=${words_[cword_]}
+ while [ $# -gt 0 ]; do
+ case "$1" in
+ cur)
+ cur=$cur_
+ ;;
+ prev)
+ prev=${words_[$cword_-1]}
+ ;;
+ words)
+ words=("${words_[@]}")
+ ;;
+ cword)
+ cword=$cword_
+ ;;
+ esac
+ shift
+ done
+}
+
+type __ltrim_colon_completions &>/dev/null ||
+__ltrim_colon_completions()
+{
+ if [[ "$1" == *:* && "$COMP_WORDBREAKS" == *:* ]]; then
+ # Remove colon-word prefix from COMPREPLY items
+ local colon_word=${1%"${1##*:}"}
+ local i=${#COMPREPLY[*]}
+ while [[ $((--i)) -ge 0 ]]; do
+ COMPREPLY[$i]=${COMPREPLY[$i]#"$colon_word"}
+ done
+ fi
+}
+
+__perfcomp ()
+{
+ COMPREPLY=( $( compgen -W "$1" -- "$2" ) )
+}
+
+__perfcomp_colon ()
+{
+ __perfcomp "$1" "$2"
+ __ltrim_colon_completions $cur
+}
+
+__perf_main ()
+{
+ local cmd
+
+ cmd=${words[0]}
+ COMPREPLY=()
+
+ # List perf subcommands or long options
+ if [ $cword -eq 1 ]; then
+ if [[ $cur == --* ]]; then
+ __perfcomp '--help --version \
+ --exec-path --html-path --paginate --no-pager \
+ --perf-dir --work-tree --debugfs-dir' -- "$cur"
+ else
+ cmds=$($cmd --list-cmds)
+ __perfcomp "$cmds" "$cur"
+ fi
+ # List possible events for -e option
+ elif [[ $prev == "-e" && "${words[1]}" == @(record|stat|top) ]]; then
+ evts=$($cmd list --raw-dump)
+ __perfcomp_colon "$evts" "$cur"
+ # List subcommands for perf commands
+ elif [[ $prev == @(kvm|kmem|mem|lock|sched) ]]; then
+ subcmds=$($cmd $prev --list-cmds)
+ __perfcomp_colon "$subcmds" "$cur"
+ # List long option names
+ elif [[ $cur == --* ]]; then
+ subcmd=${words[1]}
+ opts=$($cmd $subcmd --list-opts)
+ __perfcomp "$opts" "$cur"
+ fi
+}
+
+if [[ -n ${ZSH_VERSION-} ]]; then
+ autoload -U +X compinit && compinit
+
+ __perfcomp ()
+ {
+ emulate -L zsh
+
+ local c IFS=$' \t\n'
+ local -a array
+
+ for c in ${=1}; do
+ case $c in
+ --*=*|*.) ;;
+ *) c="$c " ;;
+ esac
+ array[${#array[@]}+1]="$c"
+ done
+
+ compset -P '*[=:]'
+ compadd -Q -S '' -a -- array && _ret=0
+ }
+
+ __perfcomp_colon ()
+ {
+ emulate -L zsh
+
+ local cur_="${2-$cur}"
+ local c IFS=$' \t\n'
+ local -a array
+
+ if [[ "$cur_" == *:* ]]; then
+ local colon_word=${cur_%"${cur_##*:}"}
+ fi
+
+ for c in ${=1}; do
+ case $c in
+ --*=*|*.) ;;
+ *) c="$c " ;;
+ esac
+ array[$#array+1]=${c#"$colon_word"}
+ done
+
+ compset -P '*[=:]'
+ compadd -Q -S '' -a -- array && _ret=0
+ }
+
+ _perf ()
+ {
+ local _ret=1 cur cword prev
+ cur=${words[CURRENT]}
+ prev=${words[CURRENT-1]}
+ let cword=CURRENT-1
+ emulate ksh -c __perf_main
+ let _ret && _default && _ret=0
+ return _ret
+ }
+
+ compdef _perf perf
+ return
+fi
+
+type perf &>/dev/null &&
+_perf()
+{
+ local cur words cword prev
+ _get_comp_words_by_ref -n =: cur words cword prev
+ __perf_main
+} &&
+
+complete -o bashdefault -o default -o nospace -F _perf perf 2>/dev/null \
+ || complete -o default -o nospace -F _perf perf
diff --git a/tools/perf/perf-sys.h b/tools/perf/perf-sys.h
new file mode 100644
index 00000000000..5268a1481d2
--- /dev/null
+++ b/tools/perf/perf-sys.h
@@ -0,0 +1,190 @@
+#ifndef _PERF_SYS_H
+#define _PERF_SYS_H
+
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#include <linux/types.h>
+#include <linux/perf_event.h>
+#include <asm/unistd.h>
+
+#if defined(__i386__)
+#define mb() asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
+#define wmb() asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
+#define rmb() asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
+#define cpu_relax() asm volatile("rep; nop" ::: "memory");
+#define CPUINFO_PROC "model name"
+#ifndef __NR_perf_event_open
+# define __NR_perf_event_open 336
+#endif
+#ifndef __NR_futex
+# define __NR_futex 240
+#endif
+#ifndef __NR_gettid
+# define __NR_gettid 224
+#endif
+#endif
+
+#if defined(__x86_64__)
+#define mb() asm volatile("mfence" ::: "memory")
+#define wmb() asm volatile("sfence" ::: "memory")
+#define rmb() asm volatile("lfence" ::: "memory")
+#define cpu_relax() asm volatile("rep; nop" ::: "memory");
+#define CPUINFO_PROC "model name"
+#ifndef __NR_perf_event_open
+# define __NR_perf_event_open 298
+#endif
+#ifndef __NR_futex
+# define __NR_futex 202
+#endif
+#ifndef __NR_gettid
+# define __NR_gettid 186
+#endif
+#endif
+
+#ifdef __powerpc__
+#include "../../arch/powerpc/include/uapi/asm/unistd.h"
+#define mb() asm volatile ("sync" ::: "memory")
+#define wmb() asm volatile ("sync" ::: "memory")
+#define rmb() asm volatile ("sync" ::: "memory")
+#define CPUINFO_PROC "cpu"
+#endif
+
+#ifdef __s390__
+#define mb() asm volatile("bcr 15,0" ::: "memory")
+#define wmb() asm volatile("bcr 15,0" ::: "memory")
+#define rmb() asm volatile("bcr 15,0" ::: "memory")
+#endif
+
+#ifdef __sh__
+#if defined(__SH4A__) || defined(__SH5__)
+# define mb() asm volatile("synco" ::: "memory")
+# define wmb() asm volatile("synco" ::: "memory")
+# define rmb() asm volatile("synco" ::: "memory")
+#else
+# define mb() asm volatile("" ::: "memory")
+# define wmb() asm volatile("" ::: "memory")
+# define rmb() asm volatile("" ::: "memory")
+#endif
+#define CPUINFO_PROC "cpu type"
+#endif
+
+#ifdef __hppa__
+#define mb() asm volatile("" ::: "memory")
+#define wmb() asm volatile("" ::: "memory")
+#define rmb() asm volatile("" ::: "memory")
+#define CPUINFO_PROC "cpu"
+#endif
+
+#ifdef __sparc__
+#ifdef __LP64__
+#define mb() asm volatile("ba,pt %%xcc, 1f\n" \
+ "membar #StoreLoad\n" \
+ "1:\n":::"memory")
+#else
+#define mb() asm volatile("":::"memory")
+#endif
+#define wmb() asm volatile("":::"memory")
+#define rmb() asm volatile("":::"memory")
+#define CPUINFO_PROC "cpu"
+#endif
+
+#ifdef __alpha__
+#define mb() asm volatile("mb" ::: "memory")
+#define wmb() asm volatile("wmb" ::: "memory")
+#define rmb() asm volatile("mb" ::: "memory")
+#define CPUINFO_PROC "cpu model"
+#endif
+
+#ifdef __ia64__
+#define mb() asm volatile ("mf" ::: "memory")
+#define wmb() asm volatile ("mf" ::: "memory")
+#define rmb() asm volatile ("mf" ::: "memory")
+#define cpu_relax() asm volatile ("hint @pause" ::: "memory")
+#define CPUINFO_PROC "model name"
+#endif
+
+#ifdef __arm__
+/*
+ * Use the __kuser_memory_barrier helper in the CPU helper page. See
+ * arch/arm/kernel/entry-armv.S in the kernel source for details.
+ */
+#define mb() ((void(*)(void))0xffff0fa0)()
+#define wmb() ((void(*)(void))0xffff0fa0)()
+#define rmb() ((void(*)(void))0xffff0fa0)()
+#define CPUINFO_PROC "Processor"
+#endif
+
+#ifdef __aarch64__
+#define mb() asm volatile("dmb ish" ::: "memory")
+#define wmb() asm volatile("dmb ishst" ::: "memory")
+#define rmb() asm volatile("dmb ishld" ::: "memory")
+#define cpu_relax() asm volatile("yield" ::: "memory")
+#endif
+
+#ifdef __mips__
+#define mb() asm volatile( \
+ ".set mips2\n\t" \
+ "sync\n\t" \
+ ".set mips0" \
+ : /* no output */ \
+ : /* no input */ \
+ : "memory")
+#define wmb() mb()
+#define rmb() mb()
+#define CPUINFO_PROC "cpu model"
+#endif
+
+#ifdef __arc__
+#define mb() asm volatile("" ::: "memory")
+#define wmb() asm volatile("" ::: "memory")
+#define rmb() asm volatile("" ::: "memory")
+#define CPUINFO_PROC "Processor"
+#endif
+
+#ifdef __metag__
+#define mb() asm volatile("" ::: "memory")
+#define wmb() asm volatile("" ::: "memory")
+#define rmb() asm volatile("" ::: "memory")
+#define CPUINFO_PROC "CPU"
+#endif
+
+#ifdef __xtensa__
+#define mb() asm volatile("memw" ::: "memory")
+#define wmb() asm volatile("memw" ::: "memory")
+#define rmb() asm volatile("" ::: "memory")
+#define CPUINFO_PROC "core ID"
+#endif
+
+#ifdef __tile__
+#define mb() asm volatile ("mf" ::: "memory")
+#define wmb() asm volatile ("mf" ::: "memory")
+#define rmb() asm volatile ("mf" ::: "memory")
+#define cpu_relax() asm volatile ("mfspr zero, PASS" ::: "memory")
+#define CPUINFO_PROC "model name"
+#endif
+
+#define barrier() asm volatile ("" ::: "memory")
+
+#ifndef cpu_relax
+#define cpu_relax() barrier()
+#endif
+
+static inline int
+sys_perf_event_open(struct perf_event_attr *attr,
+ pid_t pid, int cpu, int group_fd,
+ unsigned long flags)
+{
+ int fd;
+
+ fd = syscall(__NR_perf_event_open, attr, pid, cpu,
+ group_fd, flags);
+
+#ifdef HAVE_ATTR_TEST
+ if (unlikely(test_attr__enabled))
+ test_attr__open(attr, pid, cpu, fd, group_fd, flags);
+#endif
+ return fd;
+}
+
+#endif /* _PERF_SYS_H */
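
As a usage sketch (not part of this patch; it assumes perf-sys.h is on the
include path and that the running kernel supports perf events), the wrapper
above can be used to open a hardware cycle counter for the calling thread and
read its value:

	/* Sketch only: count CPU cycles for the calling thread. */
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <linux/perf_event.h>
	#include "perf-sys.h"

	int main(void)
	{
		struct perf_event_attr attr;
		unsigned long long count;
		int fd;

		memset(&attr, 0, sizeof(attr));
		attr.size   = sizeof(attr);
		attr.type   = PERF_TYPE_HARDWARE;
		attr.config = PERF_COUNT_HW_CPU_CYCLES;

		/* pid = 0: this task, cpu = -1: any CPU, no group, no flags */
		fd = sys_perf_event_open(&attr, 0, -1, -1, 0);
		if (fd < 0)
			return 1;

		/* ... run the workload to be measured here ... */

		if (read(fd, &count, sizeof(count)) != sizeof(count))
			return 1;
		printf("cycles: %llu\n", count);
		close(fd);
		return 0;
	}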
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index cd32c200cdb..95c58fc1528 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -13,8 +13,8 @@
#include "util/quote.h"
#include "util/run-command.h"
#include "util/parse-events.h"
-#include "util/string.h"
-#include "util/debugfs.h"
+#include <api/fs/debugfs.h>
+#include <pthread.h>
const char perf_usage_string[] =
"perf [--version] [--help] COMMAND [ARGS]";
@@ -22,14 +22,52 @@ const char perf_usage_string[] =
const char perf_more_info_string[] =
"See 'perf help COMMAND' for more information on a specific command.";
+int use_browser = -1;
static int use_pager = -1;
+const char *input_name;
+
+struct cmd_struct {
+ const char *cmd;
+ int (*fn)(int, const char **, const char *);
+ int option;
+};
+
+static struct cmd_struct commands[] = {
+ { "buildid-cache", cmd_buildid_cache, 0 },
+ { "buildid-list", cmd_buildid_list, 0 },
+ { "diff", cmd_diff, 0 },
+ { "evlist", cmd_evlist, 0 },
+ { "help", cmd_help, 0 },
+ { "list", cmd_list, 0 },
+ { "record", cmd_record, 0 },
+ { "report", cmd_report, 0 },
+ { "bench", cmd_bench, 0 },
+ { "stat", cmd_stat, 0 },
+ { "timechart", cmd_timechart, 0 },
+ { "top", cmd_top, 0 },
+ { "annotate", cmd_annotate, 0 },
+ { "version", cmd_version, 0 },
+ { "script", cmd_script, 0 },
+ { "sched", cmd_sched, 0 },
+#ifdef HAVE_LIBELF_SUPPORT
+ { "probe", cmd_probe, 0 },
+#endif
+ { "kmem", cmd_kmem, 0 },
+ { "lock", cmd_lock, 0 },
+ { "kvm", cmd_kvm, 0 },
+ { "test", cmd_test, 0 },
+#ifdef HAVE_LIBAUDIT_SUPPORT
+ { "trace", cmd_trace, 0 },
+#endif
+ { "inject", cmd_inject, 0 },
+ { "mem", cmd_mem, 0 },
+};
+
struct pager_config {
const char *cmd;
int val;
};
-static char debugfs_mntpt[MAXPATHLEN];
-
static int pager_command_config(const char *var, const char *value, void *data)
{
struct pager_config *c = data;
@@ -48,6 +86,29 @@ int check_pager_config(const char *cmd)
return c.val;
}
+static int browser_command_config(const char *var, const char *value, void *data)
+{
+ struct pager_config *c = data;
+ if (!prefixcmp(var, "tui.") && !strcmp(var + 4, c->cmd))
+ c->val = perf_config_bool(var, value);
+ if (!prefixcmp(var, "gtk.") && !strcmp(var + 4, c->cmd))
+ c->val = perf_config_bool(var, value) ? 2 : 0;
+ return 0;
+}
+
+/*
+ * returns 0 for "no tui", 1 for "use tui", 2 for "use gtk",
+ * and -1 for "not specified"
+ */
+static int check_browser_config(const char *cmd)
+{
+ struct pager_config c;
+ c.cmd = cmd;
+ c.val = -1;
+ perf_config(browser_command_config, &c);
+ return c.val;
+}
+
static void commit_pager_choice(void)
{
switch (use_pager) {
@@ -62,15 +123,6 @@ static void commit_pager_choice(void)
}
}
-static void set_debugfs_path(void)
-{
- char *path;
-
- path = getenv(PERF_DEBUGFS_ENVIRONMENT);
- snprintf(debugfs_path, MAXPATHLEN, "%s/%s", path ?: debugfs_mntpt,
- "tracing/events");
-}
-
static int handle_options(const char ***argv, int *argc, int *envchanged)
{
int handled = 0;
@@ -142,17 +194,24 @@ static int handle_options(const char ***argv, int *argc, int *envchanged)
fprintf(stderr, "No directory given for --debugfs-dir.\n");
usage(perf_usage_string);
}
- strncpy(debugfs_mntpt, (*argv)[1], MAXPATHLEN);
- debugfs_mntpt[MAXPATHLEN - 1] = '\0';
+ perf_debugfs_set_path((*argv)[1]);
if (envchanged)
*envchanged = 1;
(*argv)++;
(*argc)--;
} else if (!prefixcmp(cmd, CMD_DEBUGFS_DIR)) {
- strncpy(debugfs_mntpt, cmd + strlen(CMD_DEBUGFS_DIR), MAXPATHLEN);
- debugfs_mntpt[MAXPATHLEN - 1] = '\0';
+ perf_debugfs_set_path(cmd + strlen(CMD_DEBUGFS_DIR));
+ fprintf(stderr, "dir: %s\n", debugfs_mountpoint);
if (envchanged)
*envchanged = 1;
+ } else if (!strcmp(cmd, "--list-cmds")) {
+ unsigned int i;
+
+ for (i = 0; i < ARRAY_SIZE(commands); i++) {
+ struct cmd_struct *p = commands+i;
+ printf("%s ", p->cmd);
+ }
+ exit(0);
} else {
fprintf(stderr, "Unknown option: %s\n", cmd);
usage(perf_usage_string);
@@ -238,12 +297,6 @@ const char perf_version_string[] = PERF_VERSION;
*/
#define NEED_WORK_TREE (1<<2)
-struct cmd_struct {
- const char *cmd;
- int (*fn)(int, const char **, const char *);
- int option;
-};
-
static int run_builtin(struct cmd_struct *p, int argc, const char **argv)
{
int status;
@@ -254,14 +307,18 @@ static int run_builtin(struct cmd_struct *p, int argc, const char **argv)
if (p->option & RUN_SETUP)
prefix = NULL; /* setup_perf_directory(); */
+ if (use_browser == -1)
+ use_browser = check_browser_config(p->cmd);
+
if (use_pager == -1 && p->option & RUN_SETUP)
use_pager = check_pager_config(p->cmd);
if (use_pager == -1 && p->option & USE_PAGER)
use_pager = 1;
commit_pager_choice();
- set_debugfs_path();
status = p->fn(argc, argv, prefix);
+ exit_browser(status);
+
if (status)
return status & 0xff;
@@ -272,39 +329,28 @@ static int run_builtin(struct cmd_struct *p, int argc, const char **argv)
if (S_ISFIFO(st.st_mode) || S_ISSOCK(st.st_mode))
return 0;
+ status = 1;
/* Check for ENOSPC and EIO errors.. */
- if (fflush(stdout))
- die("write failure on standard output: %s", strerror(errno));
- if (ferror(stdout))
- die("unknown write failure on standard output");
- if (fclose(stdout))
- die("close failed on standard output: %s", strerror(errno));
- return 0;
+ if (fflush(stdout)) {
+ fprintf(stderr, "write failure on standard output: %s", strerror(errno));
+ goto out;
+ }
+ if (ferror(stdout)) {
+ fprintf(stderr, "unknown write failure on standard output");
+ goto out;
+ }
+ if (fclose(stdout)) {
+ fprintf(stderr, "close failed on standard output: %s", strerror(errno));
+ goto out;
+ }
+ status = 0;
+out:
+ return status;
}
static void handle_internal_command(int argc, const char **argv)
{
const char *cmd = argv[0];
- static struct cmd_struct commands[] = {
- { "buildid-cache", cmd_buildid_cache, 0 },
- { "buildid-list", cmd_buildid_list, 0 },
- { "diff", cmd_diff, 0 },
- { "help", cmd_help, 0 },
- { "list", cmd_list, 0 },
- { "record", cmd_record, 0 },
- { "report", cmd_report, 0 },
- { "bench", cmd_bench, 0 },
- { "stat", cmd_stat, 0 },
- { "timechart", cmd_timechart, 0 },
- { "top", cmd_top, 0 },
- { "annotate", cmd_annotate, 0 },
- { "version", cmd_version, 0 },
- { "trace", cmd_trace, 0 },
- { "sched", cmd_sched, 0 },
- { "probe", cmd_probe, 0 },
- { "kmem", cmd_kmem, 0 },
- { "lock", cmd_lock, 0 },
- };
unsigned int i;
static const char ext[] = STRIP_EXTENSION;
@@ -388,26 +434,37 @@ static int run_argv(int *argcp, const char ***argv)
return done_alias;
}
-/* mini /proc/mounts parser: searching for "^blah /mount/point debugfs" */
-static void get_debugfs_mntpt(void)
+static void pthread__block_sigwinch(void)
{
- const char *path = debugfs_mount(NULL);
+ sigset_t set;
- if (path)
- strncpy(debugfs_mntpt, path, sizeof(debugfs_mntpt));
- else
- debugfs_mntpt[0] = '\0';
+ sigemptyset(&set);
+ sigaddset(&set, SIGWINCH);
+ pthread_sigmask(SIG_BLOCK, &set, NULL);
+}
+
+void pthread__unblock_sigwinch(void)
+{
+ sigset_t set;
+
+ sigemptyset(&set);
+ sigaddset(&set, SIGWINCH);
+ pthread_sigmask(SIG_UNBLOCK, &set, NULL);
}
int main(int argc, const char **argv)
{
const char *cmd;
+ /* The page_size is placed in util object. */
+ page_size = sysconf(_SC_PAGE_SIZE);
+ cacheline_size = sysconf(_SC_LEVEL1_DCACHE_LINESIZE);
+
cmd = perf_extract_argv0_path(argv[0]);
if (!cmd)
cmd = "perf-help";
/* get debugfs mount point from /proc/mounts */
- get_debugfs_mntpt();
+ perf_debugfs_mount(NULL);
/*
* "perf-xxxx" is the same as "perf xxxx", but we obviously:
*
@@ -422,15 +479,28 @@ int main(int argc, const char **argv)
cmd += 5;
argv[0] = cmd;
handle_internal_command(argc, argv);
- die("cannot handle %s internally", cmd);
+ fprintf(stderr, "cannot handle %s internally", cmd);
+ goto out;
+ }
+ if (!prefixcmp(cmd, "trace")) {
+#ifdef HAVE_LIBAUDIT_SUPPORT
+ set_buildid_dir();
+ setup_path();
+ argv[0] = "trace";
+ return cmd_trace(argc, argv, NULL);
+#else
+ fprintf(stderr,
+ "trace command not available: missing audit-libs devel package at build time.\n");
+ goto out;
+#endif
}
-
/* Look for flags.. */
argv++;
argc--;
handle_options(&argv, &argc, NULL);
commit_pager_choice();
- set_debugfs_path();
+ set_buildid_dir();
+
if (argc > 0) {
if (!prefixcmp(argv[0], "--"))
argv[0] += 2;
@@ -439,10 +509,12 @@ int main(int argc, const char **argv)
printf("\n usage: %s\n\n", perf_usage_string);
list_common_cmds_help();
printf("\n %s\n\n", perf_more_info_string);
- exit(1);
+ goto out;
}
cmd = argv[0];
+ test_attr__init();
+
/*
* We use PATH to find perf commands, but we prepend some higher
* precedence paths: the "--exec-path" option, the PERF_EXEC_PATH
@@ -450,12 +522,17 @@ int main(int argc, const char **argv)
* time.
*/
setup_path();
+ /*
+ * Block SIGWINCH notifications so that the thread that wants them can
+ * unblock and get syscalls like select interrupted instead of waiting
+ * forever while the signal goes to some other, uninterested thread.
+ */
+ pthread__block_sigwinch();
while (1) {
static int done_help;
- static int was_alias;
+ int was_alias = run_argv(&argc, &argv);
- was_alias = run_argv(&argc, &argv);
if (errno != ENOENT)
break;
@@ -463,7 +540,7 @@ int main(int argc, const char **argv)
fprintf(stderr, "Expansion of alias '%s' failed; "
"'%s' is not a perf-command\n",
cmd, argv[0]);
- exit(1);
+ goto out;
}
if (!done_help) {
cmd = argv[0] = help_unknown_cmd(cmd);
@@ -474,6 +551,6 @@ int main(int argc, const char **argv)
fprintf(stderr, "Failed to run command '%s': %s\n",
cmd, strerror(errno));
-
+out:
return 1;
}
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index 6fb379bc1d1..510c65f7285 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -1,92 +1,25 @@
#ifndef _PERF_PERF_H
#define _PERF_PERF_H
-#if defined(__i386__)
-#include "../../arch/x86/include/asm/unistd.h"
-#define rmb() asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
-#define cpu_relax() asm volatile("rep; nop" ::: "memory");
-#endif
-
-#if defined(__x86_64__)
-#include "../../arch/x86/include/asm/unistd.h"
-#define rmb() asm volatile("lfence" ::: "memory")
-#define cpu_relax() asm volatile("rep; nop" ::: "memory");
-#endif
-
-#ifdef __powerpc__
-#include "../../arch/powerpc/include/asm/unistd.h"
-#define rmb() asm volatile ("sync" ::: "memory")
-#define cpu_relax() asm volatile ("" ::: "memory");
-#endif
-
-#ifdef __s390__
-#include "../../arch/s390/include/asm/unistd.h"
-#define rmb() asm volatile("bcr 15,0" ::: "memory")
-#define cpu_relax() asm volatile("" ::: "memory");
-#endif
-
-#ifdef __sh__
-#include "../../arch/sh/include/asm/unistd.h"
-#if defined(__SH4A__) || defined(__SH5__)
-# define rmb() asm volatile("synco" ::: "memory")
-#else
-# define rmb() asm volatile("" ::: "memory")
-#endif
-#define cpu_relax() asm volatile("" ::: "memory")
-#endif
-
-#ifdef __hppa__
-#include "../../arch/parisc/include/asm/unistd.h"
-#define rmb() asm volatile("" ::: "memory")
-#define cpu_relax() asm volatile("" ::: "memory");
-#endif
-
-#ifdef __sparc__
-#include "../../arch/sparc/include/asm/unistd.h"
-#define rmb() asm volatile("":::"memory")
-#define cpu_relax() asm volatile("":::"memory")
-#endif
-
-#ifdef __alpha__
-#include "../../arch/alpha/include/asm/unistd.h"
-#define rmb() asm volatile("mb" ::: "memory")
-#define cpu_relax() asm volatile("" ::: "memory")
-#endif
-
-#ifdef __ia64__
-#include "../../arch/ia64/include/asm/unistd.h"
-#define rmb() asm volatile ("mf" ::: "memory")
-#define cpu_relax() asm volatile ("hint @pause" ::: "memory")
-#endif
-
-#ifdef __arm__
-#include "../../arch/arm/include/asm/unistd.h"
-/*
- * Use the __kuser_memory_barrier helper in the CPU helper page. See
- * arch/arm/kernel/entry-armv.S in the kernel source for details.
- */
-#define rmb() ((void(*)(void))0xffff0fa0)()
-#define cpu_relax() asm volatile("":::"memory")
-#endif
-
#include <time.h>
-#include <unistd.h>
-#include <sys/types.h>
-#include <sys/syscall.h>
+#include <stdbool.h>
+#include <linux/types.h>
+#include <linux/perf_event.h>
-#include "../../include/linux/perf_event.h"
-#include "util/types.h"
+extern bool test_attr__enabled;
+void test_attr__init(void);
+void test_attr__open(struct perf_event_attr *attr, pid_t pid, int cpu,
+ int fd, int group_fd, unsigned long flags);
-/*
- * prctl(PR_TASK_PERF_EVENTS_DISABLE) will (cheaply) disable all
- * counters in the current task.
- */
-#define PR_TASK_PERF_EVENTS_DISABLE 31
-#define PR_TASK_PERF_EVENTS_ENABLE 32
+#define HAVE_ATTR_TEST
+#include "perf-sys.h"
#ifndef NSEC_PER_SEC
# define NSEC_PER_SEC 1000000000ULL
#endif
+#ifndef NSEC_PER_USEC
+# define NSEC_PER_USEC 1000ULL
+#endif
static inline unsigned long long rdclock(void)
{
@@ -96,37 +29,40 @@ static inline unsigned long long rdclock(void)
return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}
-/*
- * Pick up some kernel type conventions:
- */
-#define __user
-#define asmlinkage
-
-#define __used __attribute__((__unused__))
-
-#define unlikely(x) __builtin_expect(!!(x), 0)
-#define min(x, y) ({ \
- typeof(x) _min1 = (x); \
- typeof(y) _min2 = (y); \
- (void) (&_min1 == &_min2); \
- _min1 < _min2 ? _min1 : _min2; })
-
-static inline int
-sys_perf_event_open(struct perf_event_attr *attr,
- pid_t pid, int cpu, int group_fd,
- unsigned long flags)
-{
- attr->size = sizeof(*attr);
- return syscall(__NR_perf_event_open, attr, pid, cpu,
- group_fd, flags);
-}
-
-#define MAX_COUNTERS 256
#define MAX_NR_CPUS 256
-struct ip_callchain {
- u64 nr;
- u64 ips[0];
+extern const char *input_name;
+extern bool perf_host, perf_guest;
+extern const char perf_version_string[];
+
+void pthread__unblock_sigwinch(void);
+
+#include "util/target.h"
+
+struct record_opts {
+ struct target target;
+ int call_graph;
+ bool call_graph_enabled;
+ bool group;
+ bool inherit_stat;
+ bool no_buffering;
+ bool no_inherit;
+ bool no_inherit_set;
+ bool no_samples;
+ bool raw_samples;
+ bool sample_address;
+ bool sample_weight;
+ bool sample_time;
+ bool period;
+ unsigned int freq;
+ unsigned int mmap_pages;
+ unsigned int user_freq;
+ u64 branch_stack;
+ u64 default_interval;
+ u64 user_interval;
+ u16 stack_dump_size;
+ bool sample_transaction;
+ unsigned initial_delay;
};
#endif
diff --git a/tools/perf/python/twatch.py b/tools/perf/python/twatch.py
new file mode 100755
index 00000000000..2225162ee1f
--- /dev/null
+++ b/tools/perf/python/twatch.py
@@ -0,0 +1,41 @@
+#! /usr/bin/python
+# -*- python -*-
+# -*- coding: utf-8 -*-
+# twatch - Experimental use of the perf python interface
+# Copyright (C) 2011 Arnaldo Carvalho de Melo <acme@redhat.com>
+#
+# This application is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; version 2.
+#
+# This application is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+
+import perf
+
+def main():
+ cpus = perf.cpu_map()
+ threads = perf.thread_map()
+ evsel = perf.evsel(task = 1, comm = 1, mmap = 0,
+ wakeup_events = 1, watermark = 1,
+ sample_id_all = 1,
+ sample_type = perf.SAMPLE_PERIOD | perf.SAMPLE_TID | perf.SAMPLE_CPU)
+ evsel.open(cpus = cpus, threads = threads);
+ evlist = perf.evlist(cpus, threads)
+ evlist.add(evsel)
+ evlist.mmap()
+ while True:
+ evlist.poll(timeout = -1)
+ for cpu in cpus:
+ event = evlist.read_on_cpu(cpu)
+ if not event:
+ continue
+ print "cpu: %2d, pid: %4d, tid: %4d" % (event.sample_cpu,
+ event.sample_pid,
+ event.sample_tid),
+ print event
+
+if __name__ == '__main__':
+ main()
diff --git a/tools/perf/scripts/perl/Perf-Trace-Util/Context.c b/tools/perf/scripts/perl/Perf-Trace-Util/Context.c
index 01a64ad693f..790ceba6ad3 100644
--- a/tools/perf/scripts/perl/Perf-Trace-Util/Context.c
+++ b/tools/perf/scripts/perl/Perf-Trace-Util/Context.c
@@ -8,7 +8,7 @@
#line 1 "Context.xs"
/*
- * Context.xs. XS interfaces for perf trace.
+ * Context.xs. XS interfaces for perf script.
*
* Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
*
diff --git a/tools/perf/scripts/perl/Perf-Trace-Util/Context.xs b/tools/perf/scripts/perl/Perf-Trace-Util/Context.xs
index 549cf0467d3..8c7ea42444d 100644
--- a/tools/perf/scripts/perl/Perf-Trace-Util/Context.xs
+++ b/tools/perf/scripts/perl/Perf-Trace-Util/Context.xs
@@ -1,5 +1,5 @@
/*
- * Context.xs. XS interfaces for perf trace.
+ * Context.xs. XS interfaces for perf script.
*
* Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
*
diff --git a/tools/perf/scripts/perl/Perf-Trace-Util/README b/tools/perf/scripts/perl/Perf-Trace-Util/README
index 9a970763079..2f0c7f3043e 100644
--- a/tools/perf/scripts/perl/Perf-Trace-Util/README
+++ b/tools/perf/scripts/perl/Perf-Trace-Util/README
@@ -1,7 +1,7 @@
Perf-Trace-Util version 0.01
============================
-This module contains utility functions for use with perf trace.
+This module contains utility functions for use with perf script.
Core.pm and Util.pm are pure Perl modules; Core.pm contains routines
that the core perf support for Perl calls on and should always be
@@ -33,7 +33,7 @@ After you do that:
INSTALLATION
-Building perf with perf trace Perl scripting should install this
+Building perf with perf script Perl scripting should install this
module in the right place.
You should make sure libperl and ExtUtils/Embed.pm are installed first
diff --git a/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Context.pm b/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Context.pm
index 6c7f3659cb1..4e2f6039ac9 100644
--- a/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Context.pm
+++ b/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Context.pm
@@ -34,7 +34,7 @@ Perf::Trace::Context - Perl extension for accessing functions in perf.
=head1 SEE ALSO
-Perf (trace) documentation
+Perf (script) documentation
=head1 AUTHOR
diff --git a/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Core.pm b/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Core.pm
index 9df376a9f62..9158458d3ee 100644
--- a/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Core.pm
+++ b/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Core.pm
@@ -163,7 +163,7 @@ sub dump_symbolic_fields
__END__
=head1 NAME
-Perf::Trace::Core - Perl extension for perf trace
+Perf::Trace::Core - Perl extension for perf script
=head1 SYNOPSIS
@@ -171,7 +171,7 @@ Perf::Trace::Core - Perl extension for perf trace
=head1 SEE ALSO
-Perf (trace) documentation
+Perf (script) documentation
=head1 AUTHOR
diff --git a/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Util.pm b/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Util.pm
index f869c48dc9b..05350011462 100644
--- a/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Util.pm
+++ b/tools/perf/scripts/perl/Perf-Trace-Util/lib/Perf/Trace/Util.pm
@@ -15,6 +15,7 @@ our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
our @EXPORT = qw(
avg nsecs nsecs_secs nsecs_nsecs nsecs_usecs print_nsecs
+clear_term
);
our $VERSION = '0.01';
@@ -55,11 +56,16 @@ sub nsecs_str {
return $str;
}
+sub clear_term
+{
+ print "\x1b[H\x1b[2J";
+}
+
1;
__END__
=head1 NAME
-Perf::Trace::Util - Perl extension for perf trace
+Perf::Trace::Util - Perl extension for perf script
=head1 SYNOPSIS
@@ -67,7 +73,7 @@ Perf::Trace::Util - Perl extension for perf trace
=head1 SEE ALSO
-Perf (trace) documentation
+Perf (script) documentation
=head1 AUTHOR
diff --git a/tools/perf/scripts/perl/bin/check-perf-trace-record b/tools/perf/scripts/perl/bin/check-perf-trace-record
index e6cb1474f8e..423ad6aed05 100644
--- a/tools/perf/scripts/perl/bin/check-perf-trace-record
+++ b/tools/perf/scripts/perl/bin/check-perf-trace-record
@@ -1,2 +1,2 @@
#!/bin/bash
-perf record -c 1 -f -a -M -R -e kmem:kmalloc -e irq:softirq_entry -e kmem:kfree
+perf record -a -e kmem:kmalloc -e irq:softirq_entry -e kmem:kfree
diff --git a/tools/perf/scripts/perl/bin/failed-syscalls-record b/tools/perf/scripts/perl/bin/failed-syscalls-record
index f8885d389e6..8104895a7b6 100644
--- a/tools/perf/scripts/perl/bin/failed-syscalls-record
+++ b/tools/perf/scripts/perl/bin/failed-syscalls-record
@@ -1,2 +1,2 @@
#!/bin/bash
-perf record -c 1 -f -a -M -R -e raw_syscalls:sys_exit
+perf record -e raw_syscalls:sys_exit $@
diff --git a/tools/perf/scripts/perl/bin/failed-syscalls-report b/tools/perf/scripts/perl/bin/failed-syscalls-report
index 8bfc660e505..9f83cc1ad8b 100644
--- a/tools/perf/scripts/perl/bin/failed-syscalls-report
+++ b/tools/perf/scripts/perl/bin/failed-syscalls-report
@@ -1,4 +1,10 @@
#!/bin/bash
# description: system-wide failed syscalls
# args: [comm]
-perf trace -s ~/libexec/perf-core/scripts/perl/failed-syscalls.pl $1
+if [ $# -gt 0 ] ; then
+ if ! expr match "$1" "-" > /dev/null ; then
+ comm=$1
+ shift
+ fi
+fi
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/perl/failed-syscalls.pl $comm
diff --git a/tools/perf/scripts/perl/bin/rw-by-file-record b/tools/perf/scripts/perl/bin/rw-by-file-record
index b25056ebf96..33efc8673aa 100644
--- a/tools/perf/scripts/perl/bin/rw-by-file-record
+++ b/tools/perf/scripts/perl/bin/rw-by-file-record
@@ -1,2 +1,3 @@
#!/bin/bash
-perf record -c 1 -f -a -M -R -e syscalls:sys_enter_read -e syscalls:sys_enter_write
+perf record -e syscalls:sys_enter_read -e syscalls:sys_enter_write $@
+
diff --git a/tools/perf/scripts/perl/bin/rw-by-file-report b/tools/perf/scripts/perl/bin/rw-by-file-report
index eddb9ccce6a..77200b3f310 100644
--- a/tools/perf/scripts/perl/bin/rw-by-file-report
+++ b/tools/perf/scripts/perl/bin/rw-by-file-report
@@ -1,7 +1,10 @@
#!/bin/bash
# description: r/w activity for a program, by file
# args: <comm>
-perf trace -s ~/libexec/perf-core/scripts/perl/rw-by-file.pl $1
-
-
-
+if [ $# -lt 1 ] ; then
+ echo "usage: rw-by-file <comm>"
+ exit
+fi
+comm=$1
+shift
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/perl/rw-by-file.pl $comm
diff --git a/tools/perf/scripts/perl/bin/rw-by-pid-record b/tools/perf/scripts/perl/bin/rw-by-pid-record
index 8903979c5b6..7cb9db23044 100644
--- a/tools/perf/scripts/perl/bin/rw-by-pid-record
+++ b/tools/perf/scripts/perl/bin/rw-by-pid-record
@@ -1,2 +1,2 @@
#!/bin/bash
-perf record -c 1 -f -a -M -R -e syscalls:sys_enter_read -e syscalls:sys_exit_read -e syscalls:sys_enter_write -e syscalls:sys_exit_write
+perf record -e syscalls:sys_enter_read -e syscalls:sys_exit_read -e syscalls:sys_enter_write -e syscalls:sys_exit_write $@
diff --git a/tools/perf/scripts/perl/bin/rw-by-pid-report b/tools/perf/scripts/perl/bin/rw-by-pid-report
index 7f44c25cc85..a27b9f311f9 100644
--- a/tools/perf/scripts/perl/bin/rw-by-pid-report
+++ b/tools/perf/scripts/perl/bin/rw-by-pid-report
@@ -1,6 +1,3 @@
#!/bin/bash
# description: system-wide r/w activity
-perf trace -s ~/libexec/perf-core/scripts/perl/rw-by-pid.pl
-
-
-
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/perl/rw-by-pid.pl
diff --git a/tools/perf/scripts/perl/bin/rwtop-record b/tools/perf/scripts/perl/bin/rwtop-record
new file mode 100644
index 00000000000..7cb9db23044
--- /dev/null
+++ b/tools/perf/scripts/perl/bin/rwtop-record
@@ -0,0 +1,2 @@
+#!/bin/bash
+perf record -e syscalls:sys_enter_read -e syscalls:sys_exit_read -e syscalls:sys_enter_write -e syscalls:sys_exit_write $@
diff --git a/tools/perf/scripts/perl/bin/rwtop-report b/tools/perf/scripts/perl/bin/rwtop-report
new file mode 100644
index 00000000000..83e11ec2e19
--- /dev/null
+++ b/tools/perf/scripts/perl/bin/rwtop-report
@@ -0,0 +1,20 @@
+#!/bin/bash
+# description: system-wide r/w top
+# args: [interval]
+n_args=0
+for i in "$@"
+do
+ if expr match "$i" "-" > /dev/null ; then
+ break
+ fi
+ n_args=$(( $n_args + 1 ))
+done
+if [ "$n_args" -gt 1 ] ; then
+ echo "usage: rwtop-report [interval]"
+ exit
+fi
+if [ "$n_args" -gt 0 ] ; then
+ interval=$1
+ shift
+fi
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/perl/rwtop.pl $interval
diff --git a/tools/perf/scripts/perl/bin/wakeup-latency-record b/tools/perf/scripts/perl/bin/wakeup-latency-record
index 6abedda911a..464251a1bd7 100644
--- a/tools/perf/scripts/perl/bin/wakeup-latency-record
+++ b/tools/perf/scripts/perl/bin/wakeup-latency-record
@@ -1,5 +1,5 @@
#!/bin/bash
-perf record -c 1 -f -a -M -R -e sched:sched_switch -e sched:sched_wakeup
+perf record -e sched:sched_switch -e sched:sched_wakeup $@
diff --git a/tools/perf/scripts/perl/bin/wakeup-latency-report b/tools/perf/scripts/perl/bin/wakeup-latency-report
index fce3adcb324..889e8130cca 100644
--- a/tools/perf/scripts/perl/bin/wakeup-latency-report
+++ b/tools/perf/scripts/perl/bin/wakeup-latency-report
@@ -1,6 +1,3 @@
#!/bin/bash
# description: system-wide min/max/avg wakeup latency
-perf trace -s ~/libexec/perf-core/scripts/perl/wakeup-latency.pl
-
-
-
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/perl/wakeup-latency.pl
diff --git a/tools/perf/scripts/perl/bin/workqueue-stats-record b/tools/perf/scripts/perl/bin/workqueue-stats-record
deleted file mode 100644
index fce6637b19b..00000000000
--- a/tools/perf/scripts/perl/bin/workqueue-stats-record
+++ /dev/null
@@ -1,2 +0,0 @@
-#!/bin/bash
-perf record -c 1 -f -a -M -R -e workqueue:workqueue_creation -e workqueue:workqueue_destruction -e workqueue:workqueue_execution -e workqueue:workqueue_insertion
diff --git a/tools/perf/scripts/perl/bin/workqueue-stats-report b/tools/perf/scripts/perl/bin/workqueue-stats-report
deleted file mode 100644
index 71cfbd182fb..00000000000
--- a/tools/perf/scripts/perl/bin/workqueue-stats-report
+++ /dev/null
@@ -1,7 +0,0 @@
-#!/bin/bash
-# description: workqueue stats (ins/exe/create/destroy)
-perf trace -s ~/libexec/perf-core/scripts/perl/workqueue-stats.pl
-
-
-
-
diff --git a/tools/perf/scripts/perl/check-perf-trace.pl b/tools/perf/scripts/perl/check-perf-trace.pl
index 4e7dc0a407a..4e7076c2061 100644
--- a/tools/perf/scripts/perl/check-perf-trace.pl
+++ b/tools/perf/scripts/perl/check-perf-trace.pl
@@ -1,4 +1,4 @@
-# perf trace event handlers, generated by perf trace -g perl
+# perf script event handlers, generated by perf script -g perl
# (c) 2009, Tom Zanussi <tzanussi@gmail.com>
# Licensed under the terms of the GNU GPL License version 2
diff --git a/tools/perf/scripts/perl/failed-syscalls.pl b/tools/perf/scripts/perl/failed-syscalls.pl
index c18e7e27a84..94bc25a347e 100644
--- a/tools/perf/scripts/perl/failed-syscalls.pl
+++ b/tools/perf/scripts/perl/failed-syscalls.pl
@@ -11,6 +11,8 @@ use Perf::Trace::Core;
use Perf::Trace::Context;
use Perf::Trace::Util;
+my $for_comm = shift;
+
my %failed_syscalls;
sub raw_syscalls::sys_exit
@@ -33,6 +35,8 @@ sub trace_end
foreach my $comm (sort {$failed_syscalls{$b} <=> $failed_syscalls{$a}}
keys %failed_syscalls) {
- printf("%-20s %10s\n", $comm, $failed_syscalls{$comm});
+ next if ($for_comm && $comm ne $for_comm);
+
+ printf("%-20s %10s\n", $comm, $failed_syscalls{$comm});
}
}
diff --git a/tools/perf/scripts/perl/rw-by-file.pl b/tools/perf/scripts/perl/rw-by-file.pl
index 2a39097687b..74844ee2be3 100644
--- a/tools/perf/scripts/perl/rw-by-file.pl
+++ b/tools/perf/scripts/perl/rw-by-file.pl
@@ -18,7 +18,7 @@ use lib "./Perf-Trace-Util/lib";
use Perf::Trace::Core;
use Perf::Trace::Util;
-my $usage = "perf trace -s rw-by-file.pl <comm>\n";
+my $usage = "perf script -s rw-by-file.pl <comm>\n";
my $for_comm = shift or die $usage;
diff --git a/tools/perf/scripts/perl/rw-by-pid.pl b/tools/perf/scripts/perl/rw-by-pid.pl
index da601fae1a0..9db23c9daf5 100644
--- a/tools/perf/scripts/perl/rw-by-pid.pl
+++ b/tools/perf/scripts/perl/rw-by-pid.pl
@@ -79,12 +79,12 @@ sub trace_end
printf("%6s %-20s %10s %10s %10s\n", "------", "--------------------",
"-----------", "----------", "----------");
- foreach my $pid (sort {$reads{$b}{bytes_read} <=>
- $reads{$a}{bytes_read}} keys %reads) {
- my $comm = $reads{$pid}{comm};
- my $total_reads = $reads{$pid}{total_reads};
- my $bytes_requested = $reads{$pid}{bytes_requested};
- my $bytes_read = $reads{$pid}{bytes_read};
+ foreach my $pid (sort { ($reads{$b}{bytes_read} || 0) <=>
+ ($reads{$a}{bytes_read} || 0) } keys %reads) {
+ my $comm = $reads{$pid}{comm} || "";
+ my $total_reads = $reads{$pid}{total_reads} || 0;
+ my $bytes_requested = $reads{$pid}{bytes_requested} || 0;
+ my $bytes_read = $reads{$pid}{bytes_read} || 0;
printf("%6s %-20s %10s %10s %10s\n", $pid, $comm,
$total_reads, $bytes_requested, $bytes_read);
@@ -96,16 +96,23 @@ sub trace_end
printf("%6s %20s %6s %10s\n", "------", "--------------------",
"------", "----------");
- foreach my $pid (keys %reads) {
- my $comm = $reads{$pid}{comm};
- foreach my $err (sort {$reads{$b}{comm} cmp $reads{$a}{comm}}
- keys %{$reads{$pid}{errors}}) {
- my $errors = $reads{$pid}{errors}{$err};
+ my @errcounts = ();
- printf("%6d %-20s %6d %10s\n", $pid, $comm, $err, $errors);
+ foreach my $pid (keys %reads) {
+ foreach my $error (keys %{$reads{$pid}{errors}}) {
+ my $comm = $reads{$pid}{comm} || "";
+ my $errcount = $reads{$pid}{errors}{$error} || 0;
+ push @errcounts, [$pid, $comm, $error, $errcount];
}
}
+ @errcounts = sort { $b->[3] <=> $a->[3] } @errcounts;
+
+ for my $i (0 .. $#errcounts) {
+ printf("%6d %-20s %6d %10s\n", $errcounts[$i][0],
+ $errcounts[$i][1], $errcounts[$i][2], $errcounts[$i][3]);
+ }
+
printf("\nwrite counts by pid:\n\n");
printf("%6s %20s %10s %10s\n", "pid", "comm",
@@ -113,11 +120,11 @@ sub trace_end
printf("%6s %-20s %10s %10s\n", "------", "--------------------",
"-----------", "----------");
- foreach my $pid (sort {$writes{$b}{bytes_written} <=>
- $writes{$a}{bytes_written}} keys %writes) {
- my $comm = $writes{$pid}{comm};
- my $total_writes = $writes{$pid}{total_writes};
- my $bytes_written = $writes{$pid}{bytes_written};
+ foreach my $pid (sort { ($writes{$b}{bytes_written} || 0) <=>
+ ($writes{$a}{bytes_written} || 0)} keys %writes) {
+ my $comm = $writes{$pid}{comm} || "";
+ my $total_writes = $writes{$pid}{total_writes} || 0;
+ my $bytes_written = $writes{$pid}{bytes_written} || 0;
printf("%6s %-20s %10s %10s\n", $pid, $comm,
$total_writes, $bytes_written);
@@ -129,16 +136,23 @@ sub trace_end
printf("%6s %20s %6s %10s\n", "------", "--------------------",
"------", "----------");
- foreach my $pid (keys %writes) {
- my $comm = $writes{$pid}{comm};
- foreach my $err (sort {$writes{$b}{comm} cmp $writes{$a}{comm}}
- keys %{$writes{$pid}{errors}}) {
- my $errors = $writes{$pid}{errors}{$err};
+ @errcounts = ();
- printf("%6d %-20s %6d %10s\n", $pid, $comm, $err, $errors);
+ foreach my $pid (keys %writes) {
+ foreach my $error (keys %{$writes{$pid}{errors}}) {
+ my $comm = $writes{$pid}{comm} || "";
+ my $errcount = $writes{$pid}{errors}{$error} || 0;
+ push @errcounts, [$pid, $comm, $error, $errcount];
}
}
+ @errcounts = sort { $b->[3] <=> $a->[3] } @errcounts;
+
+ for my $i (0 .. $#errcounts) {
+ printf("%6d %-20s %6d %10s\n", $errcounts[$i][0],
+ $errcounts[$i][1], $errcounts[$i][2], $errcounts[$i][3]);
+ }
+
print_unhandled();
}
diff --git a/tools/perf/scripts/perl/rwtop.pl b/tools/perf/scripts/perl/rwtop.pl
new file mode 100644
index 00000000000..8b20787021c
--- /dev/null
+++ b/tools/perf/scripts/perl/rwtop.pl
@@ -0,0 +1,203 @@
+#!/usr/bin/perl -w
+# (c) 2010, Tom Zanussi <tzanussi@gmail.com>
+# Licensed under the terms of the GNU GPL License version 2
+
+# read/write top
+#
+# Periodically displays system-wide r/w call activity, broken down by
+# pid. If an [interval] arg is specified, the display will be
+# refreshed every [interval] seconds. The default interval is 3
+# seconds.
+
+use 5.010000;
+use strict;
+use warnings;
+
+use lib "$ENV{'PERF_EXEC_PATH'}/scripts/perl/Perf-Trace-Util/lib";
+use lib "./Perf-Trace-Util/lib";
+use Perf::Trace::Core;
+use Perf::Trace::Util;
+use POSIX qw/SIGALRM SA_RESTART/;
+
+my $default_interval = 3;
+my $nlines = 20;
+my $print_thread;
+my $print_pending = 0;
+
+my %reads;
+my %writes;
+
+my $interval = shift;
+if (!$interval) {
+ $interval = $default_interval;
+}
+
+sub syscalls::sys_exit_read
+{
+ my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
+ $common_pid, $common_comm,
+ $nr, $ret) = @_;
+
+ print_check();
+
+ if ($ret > 0) {
+ $reads{$common_pid}{bytes_read} += $ret;
+ } else {
+ if (!defined ($reads{$common_pid}{bytes_read})) {
+ $reads{$common_pid}{bytes_read} = 0;
+ }
+ $reads{$common_pid}{errors}{$ret}++;
+ }
+}
+
+sub syscalls::sys_enter_read
+{
+ my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
+ $common_pid, $common_comm,
+ $nr, $fd, $buf, $count) = @_;
+
+ print_check();
+
+ $reads{$common_pid}{bytes_requested} += $count;
+ $reads{$common_pid}{total_reads}++;
+ $reads{$common_pid}{comm} = $common_comm;
+}
+
+sub syscalls::sys_exit_write
+{
+ my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
+ $common_pid, $common_comm,
+ $nr, $ret) = @_;
+
+ print_check();
+
+ if ($ret <= 0) {
+ $writes{$common_pid}{errors}{$ret}++;
+ }
+}
+
+sub syscalls::sys_enter_write
+{
+ my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
+ $common_pid, $common_comm,
+ $nr, $fd, $buf, $count) = @_;
+
+ print_check();
+
+ $writes{$common_pid}{bytes_written} += $count;
+ $writes{$common_pid}{total_writes}++;
+ $writes{$common_pid}{comm} = $common_comm;
+}
+
+sub trace_begin
+{
+ my $sa = POSIX::SigAction->new(\&set_print_pending);
+ $sa->flags(SA_RESTART);
+ $sa->safe(1);
+ POSIX::sigaction(SIGALRM, $sa) or die "Can't set SIGALRM handler: $!\n";
+ alarm 1;
+}
+
+sub trace_end
+{
+ print_unhandled();
+ print_totals();
+}
+
+sub print_check()
+{
+ if ($print_pending == 1) {
+ $print_pending = 0;
+ print_totals();
+ }
+}
+
+sub set_print_pending()
+{
+ $print_pending = 1;
+ alarm $interval;
+}
+
+sub print_totals
+{
+ my $count;
+
+ $count = 0;
+
+ clear_term();
+
+ printf("\nread counts by pid:\n\n");
+
+ printf("%6s %20s %10s %10s %10s\n", "pid", "comm",
+ "# reads", "bytes_req", "bytes_read");
+ printf("%6s %-20s %10s %10s %10s\n", "------", "--------------------",
+ "----------", "----------", "----------");
+
+ foreach my $pid (sort { ($reads{$b}{bytes_read} || 0) <=>
+ ($reads{$a}{bytes_read} || 0) } keys %reads) {
+ my $comm = $reads{$pid}{comm} || "";
+ my $total_reads = $reads{$pid}{total_reads} || 0;
+ my $bytes_requested = $reads{$pid}{bytes_requested} || 0;
+ my $bytes_read = $reads{$pid}{bytes_read} || 0;
+
+ printf("%6s %-20s %10s %10s %10s\n", $pid, $comm,
+ $total_reads, $bytes_requested, $bytes_read);
+
+ if (++$count == $nlines) {
+ last;
+ }
+ }
+
+ $count = 0;
+
+ printf("\nwrite counts by pid:\n\n");
+
+ printf("%6s %20s %10s %13s\n", "pid", "comm",
+ "# writes", "bytes_written");
+ printf("%6s %-20s %10s %13s\n", "------", "--------------------",
+ "----------", "-------------");
+
+ foreach my $pid (sort { ($writes{$b}{bytes_written} || 0) <=>
+ ($writes{$a}{bytes_written} || 0)} keys %writes) {
+ my $comm = $writes{$pid}{comm} || "";
+ my $total_writes = $writes{$pid}{total_writes} || 0;
+ my $bytes_written = $writes{$pid}{bytes_written} || 0;
+
+ printf("%6s %-20s %10s %13s\n", $pid, $comm,
+ $total_writes, $bytes_written);
+
+ if (++$count == $nlines) {
+ last;
+ }
+ }
+
+ %reads = ();
+ %writes = ();
+}
+
+my %unhandled;
+
+sub print_unhandled
+{
+ if ((scalar keys %unhandled) == 0) {
+ return;
+ }
+
+ print "\nunhandled events:\n\n";
+
+ printf("%-40s %10s\n", "event", "count");
+ printf("%-40s %10s\n", "----------------------------------------",
+ "-----------");
+
+ foreach my $event_name (keys %unhandled) {
+ printf("%-40s %10d\n", $event_name, $unhandled{$event_name});
+ }
+}
+
+sub trace_unhandled
+{
+ my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
+ $common_pid, $common_comm) = @_;
+
+ $unhandled{$event_name}++;
+}
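
For illustration only (not part of the patch): rwtop.pl drives its periodic refresh with SIGALRM rather than a timer thread: trace_begin() installs a SA_RESTART handler that merely sets $print_pending and re-arms the alarm, and every event handler calls print_check() to do the actual printing. A minimal sketch of the same pattern in Python (the interval and the loop are arbitrary stand-ins):

    import signal
    import time

    INTERVAL = 3          # rwtop.pl's default refresh interval
    print_pending = False

    def set_print_pending(signum, frame):
        # Signal handler: only set a flag and re-arm the alarm.
        global print_pending
        print_pending = True
        signal.alarm(INTERVAL)

    def print_check():
        # Called from the event path, like the handlers in rwtop.pl.
        global print_pending
        if print_pending:
            print_pending = False
            print("print and reset the per-pid read/write totals here")

    signal.signal(signal.SIGALRM, set_print_pending)
    signal.alarm(1)       # first refresh after one second, as in trace_begin()

    for _ in range(50):   # stands in for the stream of trace events
        print_check()
        time.sleep(0.1)
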
diff --git a/tools/perf/scripts/perl/wakeup-latency.pl b/tools/perf/scripts/perl/wakeup-latency.pl
index ed58ef284e2..d9143dcec6c 100644
--- a/tools/perf/scripts/perl/wakeup-latency.pl
+++ b/tools/perf/scripts/perl/wakeup-latency.pl
@@ -22,8 +22,8 @@ my %last_wakeup;
my $max_wakeup_latency;
my $min_wakeup_latency;
-my $total_wakeup_latency;
-my $total_wakeups;
+my $total_wakeup_latency = 0;
+my $total_wakeups = 0;
sub sched::sched_switch
{
@@ -67,8 +67,12 @@ sub trace_end
{
printf("wakeup_latency stats:\n\n");
print "total_wakeups: $total_wakeups\n";
- printf("avg_wakeup_latency (ns): %u\n",
- avg($total_wakeup_latency, $total_wakeups));
+ if ($total_wakeups) {
+ printf("avg_wakeup_latency (ns): %u\n",
+ avg($total_wakeup_latency, $total_wakeups));
+ } else {
+ printf("avg_wakeup_latency (ns): N/A\n");
+ }
printf("min_wakeup_latency (ns): %u\n", $min_wakeup_latency);
printf("max_wakeup_latency (ns): %u\n", $max_wakeup_latency);
diff --git a/tools/perf/scripts/perl/workqueue-stats.pl b/tools/perf/scripts/perl/workqueue-stats.pl
deleted file mode 100644
index 511302c8a49..00000000000
--- a/tools/perf/scripts/perl/workqueue-stats.pl
+++ /dev/null
@@ -1,129 +0,0 @@
-#!/usr/bin/perl -w
-# (c) 2009, Tom Zanussi <tzanussi@gmail.com>
-# Licensed under the terms of the GNU GPL License version 2
-
-# Displays workqueue stats
-#
-# Usage:
-#
-# perf record -c 1 -f -a -R -e workqueue:workqueue_creation -e
-# workqueue:workqueue_destruction -e workqueue:workqueue_execution
-# -e workqueue:workqueue_insertion
-#
-# perf trace -p -s tools/perf/scripts/perl/workqueue-stats.pl
-
-use 5.010000;
-use strict;
-use warnings;
-
-use lib "$ENV{'PERF_EXEC_PATH'}/scripts/perl/Perf-Trace-Util/lib";
-use lib "./Perf-Trace-Util/lib";
-use Perf::Trace::Core;
-use Perf::Trace::Util;
-
-my @cpus;
-
-sub workqueue::workqueue_destruction
-{
- my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
- $common_pid, $common_comm,
- $thread_comm, $thread_pid) = @_;
-
- $cpus[$common_cpu]{$thread_pid}{destroyed}++;
- $cpus[$common_cpu]{$thread_pid}{comm} = $thread_comm;
-}
-
-sub workqueue::workqueue_creation
-{
- my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
- $common_pid, $common_comm,
- $thread_comm, $thread_pid, $cpu) = @_;
-
- $cpus[$common_cpu]{$thread_pid}{created}++;
- $cpus[$common_cpu]{$thread_pid}{comm} = $thread_comm;
-}
-
-sub workqueue::workqueue_execution
-{
- my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
- $common_pid, $common_comm,
- $thread_comm, $thread_pid, $func) = @_;
-
- $cpus[$common_cpu]{$thread_pid}{executed}++;
- $cpus[$common_cpu]{$thread_pid}{comm} = $thread_comm;
-}
-
-sub workqueue::workqueue_insertion
-{
- my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
- $common_pid, $common_comm,
- $thread_comm, $thread_pid, $func) = @_;
-
- $cpus[$common_cpu]{$thread_pid}{inserted}++;
- $cpus[$common_cpu]{$thread_pid}{comm} = $thread_comm;
-}
-
-sub trace_end
-{
- print "workqueue work stats:\n\n";
- my $cpu = 0;
- printf("%3s %6s %6s\t%-20s\n", "cpu", "ins", "exec", "name");
- printf("%3s %6s %6s\t%-20s\n", "---", "---", "----", "----");
- foreach my $pidhash (@cpus) {
- while ((my $pid, my $wqhash) = each %$pidhash) {
- my $ins = $$wqhash{'inserted'};
- my $exe = $$wqhash{'executed'};
- my $comm = $$wqhash{'comm'};
- if ($ins || $exe) {
- printf("%3u %6u %6u\t%-20s\n", $cpu, $ins, $exe, $comm);
- }
- }
- $cpu++;
- }
-
- $cpu = 0;
- print "\nworkqueue lifecycle stats:\n\n";
- printf("%3s %6s %6s\t%-20s\n", "cpu", "created", "destroyed", "name");
- printf("%3s %6s %6s\t%-20s\n", "---", "-------", "---------", "----");
- foreach my $pidhash (@cpus) {
- while ((my $pid, my $wqhash) = each %$pidhash) {
- my $created = $$wqhash{'created'};
- my $destroyed = $$wqhash{'destroyed'};
- my $comm = $$wqhash{'comm'};
- if ($created || $destroyed) {
- printf("%3u %6u %6u\t%-20s\n", $cpu, $created, $destroyed,
- $comm);
- }
- }
- $cpu++;
- }
-
- print_unhandled();
-}
-
-my %unhandled;
-
-sub print_unhandled
-{
- if ((scalar keys %unhandled) == 0) {
- return;
- }
-
- print "\nunhandled events:\n\n";
-
- printf("%-40s %10s\n", "event", "count");
- printf("%-40s %10s\n", "----------------------------------------",
- "-----------");
-
- foreach my $event_name (keys %unhandled) {
- printf("%-40s %10d\n", $event_name, $unhandled{$event_name});
- }
-}
-
-sub trace_unhandled
-{
- my ($event_name, $context, $common_cpu, $common_secs, $common_nsecs,
- $common_pid, $common_comm) = @_;
-
- $unhandled{$event_name}++;
-}
diff --git a/tools/perf/scripts/python/Perf-Trace-Util/Context.c b/tools/perf/scripts/python/Perf-Trace-Util/Context.c
index 957085dd5d8..fcd1dd66790 100644
--- a/tools/perf/scripts/python/Perf-Trace-Util/Context.c
+++ b/tools/perf/scripts/python/Perf-Trace-Util/Context.c
@@ -1,5 +1,5 @@
/*
- * Context.c. Python interfaces for perf trace.
+ * Context.c. Python interfaces for perf script.
*
* Copyright (C) 2010 Tom Zanussi <tzanussi@gmail.com>
*
@@ -25,7 +25,7 @@
PyMODINIT_FUNC initperf_trace_context(void);
-static PyObject *perf_trace_context_common_pc(PyObject *self, PyObject *args)
+static PyObject *perf_trace_context_common_pc(PyObject *obj, PyObject *args)
{
static struct scripting_context *scripting_context;
PyObject *context;
@@ -40,7 +40,7 @@ static PyObject *perf_trace_context_common_pc(PyObject *self, PyObject *args)
return Py_BuildValue("i", retval);
}
-static PyObject *perf_trace_context_common_flags(PyObject *self,
+static PyObject *perf_trace_context_common_flags(PyObject *obj,
PyObject *args)
{
static struct scripting_context *scripting_context;
@@ -56,7 +56,7 @@ static PyObject *perf_trace_context_common_flags(PyObject *self,
return Py_BuildValue("i", retval);
}
-static PyObject *perf_trace_context_common_lock_depth(PyObject *self,
+static PyObject *perf_trace_context_common_lock_depth(PyObject *obj,
PyObject *args)
{
static struct scripting_context *scripting_context;
diff --git a/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Core.py b/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Core.py
index 1dc464ee2ca..de7211e4fa4 100644
--- a/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Core.py
+++ b/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Core.py
@@ -1,4 +1,4 @@
-# Core.py - Python extension for perf trace, core functions
+# Core.py - Python extension for perf script, core functions
#
# Copyright (C) 2010 by Tom Zanussi <tzanussi@gmail.com>
#
@@ -89,3 +89,33 @@ def trace_flag_str(value):
value &= ~idx
return string
+
+
+def taskState(state):
+ states = {
+ 0 : "R",
+ 1 : "S",
+ 2 : "D",
+ 64: "DEAD"
+ }
+
+ if state not in states:
+ return "Unknown"
+
+ return states[state]
+
+
+class EventHeaders:
+ def __init__(self, common_cpu, common_secs, common_nsecs,
+ common_pid, common_comm):
+ self.cpu = common_cpu
+ self.secs = common_secs
+ self.nsecs = common_nsecs
+ self.pid = common_pid
+ self.comm = common_comm
+
+ def ts(self):
+ return (self.secs * (10 ** 9)) + self.nsecs
+
+ def ts_format(self):
+ return "%d.%d" % (self.secs, int(self.nsecs / 1000))
diff --git a/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/EventClass.py b/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/EventClass.py
new file mode 100755
index 00000000000..9e0985794e2
--- /dev/null
+++ b/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/EventClass.py
@@ -0,0 +1,94 @@
+# EventClass.py
+#
+# This is a library defining some event type classes, which can
+# be used by other scripts to analyze perf samples.
+#
+# Currently just a few example classes are defined: PerfEvent is
+# the base class for all perf event samples, PebsEvent is a
+# hardware-based Intel x86 PEBS event, and users can add more SW/HW
+# event classes based on their requirements.
+
+import struct
+
+# Event types, user could add more here
+EVTYPE_GENERIC = 0
+EVTYPE_PEBS = 1 # Basic PEBS event
+EVTYPE_PEBS_LL = 2 # PEBS event with load latency info
+EVTYPE_IBS = 3
+
+#
+# Currently we don't have a good way to tell the event type other than
+# the size of the raw buffer: a raw PEBS event with load latency data
+# is 176 bytes, while a plain PEBS event is 144 bytes.
+#
+def create_event(name, comm, dso, symbol, raw_buf):
+ if (len(raw_buf) == 144):
+ event = PebsEvent(name, comm, dso, symbol, raw_buf)
+ elif (len(raw_buf) == 176):
+ event = PebsNHM(name, comm, dso, symbol, raw_buf)
+ else:
+ event = PerfEvent(name, comm, dso, symbol, raw_buf)
+
+ return event
+
+class PerfEvent(object):
+ event_num = 0
+ def __init__(self, name, comm, dso, symbol, raw_buf, ev_type=EVTYPE_GENERIC):
+ self.name = name
+ self.comm = comm
+ self.dso = dso
+ self.symbol = symbol
+ self.raw_buf = raw_buf
+ self.ev_type = ev_type
+ PerfEvent.event_num += 1
+
+ def show(self):
+ print "PMU event: name=%12s, symbol=%24s, comm=%8s, dso=%12s" % (self.name, self.symbol, self.comm, self.dso)
+
+#
+# Basic Intel PEBS (Precise Event-based Sampling) event, whose raw buffer
+# contains the context info when that event happened: the EFLAGS and
+# linear IP info, as well as all the registers.
+#
+class PebsEvent(PerfEvent):
+ pebs_num = 0
+ def __init__(self, name, comm, dso, symbol, raw_buf, ev_type=EVTYPE_PEBS):
+ tmp_buf=raw_buf[0:80]
+ flags, ip, ax, bx, cx, dx, si, di, bp, sp = struct.unpack('QQQQQQQQQQ', tmp_buf)
+ self.flags = flags
+ self.ip = ip
+ self.ax = ax
+ self.bx = bx
+ self.cx = cx
+ self.dx = dx
+ self.si = si
+ self.di = di
+ self.bp = bp
+ self.sp = sp
+
+ PerfEvent.__init__(self, name, comm, dso, symbol, raw_buf, ev_type)
+ PebsEvent.pebs_num += 1
+ del tmp_buf
+
+#
+# Intel Nehalem and Westmere support PEBS plus Load Latency info, which lies
+# in the four 64-bit words written after the PEBS data:
+# Status: records the IA32_PERF_GLOBAL_STATUS register value
+# DLA: Data Linear Address (EIP)
+# DSE: Data Source Encoding, where the latency happens, hit or miss
+# in L1/L2/L3 or IO operations
+# LAT: the actual latency in cycles
+#
+class PebsNHM(PebsEvent):
+ pebs_nhm_num = 0
+ def __init__(self, name, comm, dso, symbol, raw_buf, ev_type=EVTYPE_PEBS_LL):
+ tmp_buf=raw_buf[144:176]
+ status, dla, dse, lat = struct.unpack('QQQQ', tmp_buf)
+ self.status = status
+ self.dla = dla
+ self.dse = dse
+ self.lat = lat
+
+ PebsEvent.__init__(self, name, comm, dso, symbol, raw_buf, ev_type)
+ PebsNHM.pebs_nhm_num += 1
+ del tmp_buf
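
For illustration only (not part of the patch): create_event() above infers the event class purely from the size of the raw sample buffer, as the header comment explains. A minimal sketch of that dispatch; classify() is an invented stand-in for create_event() so the snippet runs standalone:

    import struct

    def classify(raw_buf):
        # Same size checks as create_event().
        if len(raw_buf) == 144:
            return "PebsEvent"      # plain PEBS record
        elif len(raw_buf) == 176:
            return "PebsNHM"        # PEBS plus four load-latency words
        return "PerfEvent"          # anything else is treated as generic

    print(classify(struct.pack("18Q", *range(18))))   # PebsEvent (144 bytes)
    print(classify(struct.pack("22Q", *range(22))))   # PebsNHM (176 bytes)
    print(classify(b""))                              # PerfEvent
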
diff --git a/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/SchedGui.py b/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/SchedGui.py
new file mode 100644
index 00000000000..fdd92f69905
--- /dev/null
+++ b/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/SchedGui.py
@@ -0,0 +1,184 @@
+# SchedGui.py - Python extension for perf script, basic GUI code for
+# drawing traces and an overview.
+#
+# Copyright (C) 2010 by Frederic Weisbecker <fweisbec@gmail.com>
+#
+# This software is distributed under the terms of the GNU General
+# Public License ("GPL") version 2 as published by the Free Software
+# Foundation.
+
+
+try:
+ import wx
+except ImportError:
+ raise ImportError, "You need to install the wxpython lib for this script"
+
+
+class RootFrame(wx.Frame):
+ Y_OFFSET = 100
+ RECT_HEIGHT = 100
+ RECT_SPACE = 50
+ EVENT_MARKING_WIDTH = 5
+
+ def __init__(self, sched_tracer, title, parent = None, id = -1):
+ wx.Frame.__init__(self, parent, id, title)
+
+ (self.screen_width, self.screen_height) = wx.GetDisplaySize()
+ self.screen_width -= 10
+ self.screen_height -= 10
+ self.zoom = 0.5
+ self.scroll_scale = 20
+ self.sched_tracer = sched_tracer
+ self.sched_tracer.set_root_win(self)
+ (self.ts_start, self.ts_end) = sched_tracer.interval()
+ self.update_width_virtual()
+ self.nr_rects = sched_tracer.nr_rectangles() + 1
+ self.height_virtual = RootFrame.Y_OFFSET + (self.nr_rects * (RootFrame.RECT_HEIGHT + RootFrame.RECT_SPACE))
+
+ # whole window panel
+ self.panel = wx.Panel(self, size=(self.screen_width, self.screen_height))
+
+ # scrollable container
+ self.scroll = wx.ScrolledWindow(self.panel)
+ self.scroll.SetScrollbars(self.scroll_scale, self.scroll_scale, self.width_virtual / self.scroll_scale, self.height_virtual / self.scroll_scale)
+ self.scroll.EnableScrolling(True, True)
+ self.scroll.SetFocus()
+
+ # scrollable drawing area
+ self.scroll_panel = wx.Panel(self.scroll, size=(self.screen_width - 15, self.screen_height / 2))
+ self.scroll_panel.Bind(wx.EVT_PAINT, self.on_paint)
+ self.scroll_panel.Bind(wx.EVT_KEY_DOWN, self.on_key_press)
+ self.scroll_panel.Bind(wx.EVT_LEFT_DOWN, self.on_mouse_down)
+ self.scroll.Bind(wx.EVT_PAINT, self.on_paint)
+ self.scroll.Bind(wx.EVT_KEY_DOWN, self.on_key_press)
+ self.scroll.Bind(wx.EVT_LEFT_DOWN, self.on_mouse_down)
+
+ self.scroll.Fit()
+ self.Fit()
+
+ self.scroll_panel.SetDimensions(-1, -1, self.width_virtual, self.height_virtual, wx.SIZE_USE_EXISTING)
+
+ self.txt = None
+
+ self.Show(True)
+
+ def us_to_px(self, val):
+ return val / (10 ** 3) * self.zoom
+
+ def px_to_us(self, val):
+ return (val / self.zoom) * (10 ** 3)
+
+ def scroll_start(self):
+ (x, y) = self.scroll.GetViewStart()
+ return (x * self.scroll_scale, y * self.scroll_scale)
+
+ def scroll_start_us(self):
+ (x, y) = self.scroll_start()
+ return self.px_to_us(x)
+
+ def paint_rectangle_zone(self, nr, color, top_color, start, end):
+ offset_px = self.us_to_px(start - self.ts_start)
+ width_px = self.us_to_px(end - self.ts_start)
+
+ offset_py = RootFrame.Y_OFFSET + (nr * (RootFrame.RECT_HEIGHT + RootFrame.RECT_SPACE))
+ width_py = RootFrame.RECT_HEIGHT
+
+ dc = self.dc
+
+ if top_color is not None:
+ (r, g, b) = top_color
+ top_color = wx.Colour(r, g, b)
+ brush = wx.Brush(top_color, wx.SOLID)
+ dc.SetBrush(brush)
+ dc.DrawRectangle(offset_px, offset_py, width_px, RootFrame.EVENT_MARKING_WIDTH)
+ width_py -= RootFrame.EVENT_MARKING_WIDTH
+ offset_py += RootFrame.EVENT_MARKING_WIDTH
+
+ (r ,g, b) = color
+ color = wx.Colour(r, g, b)
+ brush = wx.Brush(color, wx.SOLID)
+ dc.SetBrush(brush)
+ dc.DrawRectangle(offset_px, offset_py, width_px, width_py)
+
+ def update_rectangles(self, dc, start, end):
+ start += self.ts_start
+ end += self.ts_start
+ self.sched_tracer.fill_zone(start, end)
+
+ def on_paint(self, event):
+ dc = wx.PaintDC(self.scroll_panel)
+ self.dc = dc
+
+ width = min(self.width_virtual, self.screen_width)
+ (x, y) = self.scroll_start()
+ start = self.px_to_us(x)
+ end = self.px_to_us(x + width)
+ self.update_rectangles(dc, start, end)
+
+ def rect_from_ypixel(self, y):
+ y -= RootFrame.Y_OFFSET
+ rect = y / (RootFrame.RECT_HEIGHT + RootFrame.RECT_SPACE)
+ height = y % (RootFrame.RECT_HEIGHT + RootFrame.RECT_SPACE)
+
+ if rect < 0 or rect > self.nr_rects - 1 or height > RootFrame.RECT_HEIGHT:
+ return -1
+
+ return rect
+
+ def update_summary(self, txt):
+ if self.txt:
+ self.txt.Destroy()
+ self.txt = wx.StaticText(self.panel, -1, txt, (0, (self.screen_height / 2) + 50))
+
+
+ def on_mouse_down(self, event):
+ (x, y) = event.GetPositionTuple()
+ rect = self.rect_from_ypixel(y)
+ if rect == -1:
+ return
+
+ t = self.px_to_us(x) + self.ts_start
+
+ self.sched_tracer.mouse_down(rect, t)
+
+
+ def update_width_virtual(self):
+ self.width_virtual = self.us_to_px(self.ts_end - self.ts_start)
+
+ def __zoom(self, x):
+ self.update_width_virtual()
+ (xpos, ypos) = self.scroll.GetViewStart()
+ xpos = self.us_to_px(x) / self.scroll_scale
+ self.scroll.SetScrollbars(self.scroll_scale, self.scroll_scale, self.width_virtual / self.scroll_scale, self.height_virtual / self.scroll_scale, xpos, ypos)
+ self.Refresh()
+
+ def zoom_in(self):
+ x = self.scroll_start_us()
+ self.zoom *= 2
+ self.__zoom(x)
+
+ def zoom_out(self):
+ x = self.scroll_start_us()
+ self.zoom /= 2
+ self.__zoom(x)
+
+
+ def on_key_press(self, event):
+ key = event.GetRawKeyCode()
+ if key == ord("+"):
+ self.zoom_in()
+ return
+ if key == ord("-"):
+ self.zoom_out()
+ return
+
+ key = event.GetKeyCode()
+ (x, y) = self.scroll.GetViewStart()
+ if key == wx.WXK_RIGHT:
+ self.scroll.Scroll(x + 1, y)
+ elif key == wx.WXK_LEFT:
+ self.scroll.Scroll(x - 1, y)
+ elif key == wx.WXK_DOWN:
+ self.scroll.Scroll(x, y + 1)
+ elif key == wx.WXK_UP:
+ self.scroll.Scroll(x, y - 1)
diff --git a/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py b/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py
index 83e91435ed0..15c8400240f 100644
--- a/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py
+++ b/tools/perf/scripts/python/Perf-Trace-Util/lib/Perf/Trace/Util.py
@@ -1,4 +1,4 @@
-# Util.py - Python extension for perf trace, miscellaneous utility code
+# Util.py - Python extension for perf script, miscellaneous utility code
#
# Copyright (C) 2010 by Tom Zanussi <tzanussi@gmail.com>
#
@@ -6,6 +6,14 @@
# Public License ("GPL") version 2 as published by the Free Software
# Foundation.
+import errno, os
+
+FUTEX_WAIT = 0
+FUTEX_WAKE = 1
+FUTEX_PRIVATE_FLAG = 128
+FUTEX_CLOCK_REALTIME = 256
+FUTEX_CMD_MASK = ~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME)
+
NSECS_PER_SEC = 1000000000
def avg(total, n):
@@ -23,3 +31,56 @@ def nsecs_nsecs(nsecs):
def nsecs_str(nsecs):
str = "%5u.%09u" % (nsecs_secs(nsecs), nsecs_nsecs(nsecs)),
return str
+
+def add_stats(dict, key, value):
+ if not dict.has_key(key):
+ dict[key] = (value, value, value, 1)
+ else:
+ min, max, avg, count = dict[key]
+ if value < min:
+ min = value
+ if value > max:
+ max = value
+ avg = (avg + value) / 2
+ dict[key] = (min, max, avg, count + 1)
+
+def clear_term():
+ print("\x1b[H\x1b[2J")
+
+audit_package_warned = False
+
+try:
+ import audit
+ machine_to_id = {
+ 'x86_64': audit.MACH_86_64,
+ 'alpha' : audit.MACH_ALPHA,
+ 'ia64' : audit.MACH_IA64,
+ 'ppc' : audit.MACH_PPC,
+ 'ppc64' : audit.MACH_PPC64,
+ 's390' : audit.MACH_S390,
+ 's390x' : audit.MACH_S390X,
+ 'i386' : audit.MACH_X86,
+ 'i586' : audit.MACH_X86,
+ 'i686' : audit.MACH_X86,
+ }
+ try:
+ machine_to_id['armeb'] = audit.MACH_ARMEB
+ except:
+ pass
+ machine_id = machine_to_id[os.uname()[4]]
+except:
+ if not audit_package_warned:
+ audit_package_warned = True
+ print "Install the audit-libs-python package to get syscall names"
+
+def syscall_name(id):
+ try:
+ return audit.audit_syscall_to_name(id, machine_id)
+ except:
+ return str(id)
+
+def strerror(nr):
+ try:
+ return errno.errorcode[abs(nr)]
+ except:
+ return "Unknown %d errno" % nr
diff --git a/tools/perf/scripts/python/bin/event_analyzing_sample-record b/tools/perf/scripts/python/bin/event_analyzing_sample-record
new file mode 100644
index 00000000000..5ce652dabd0
--- /dev/null
+++ b/tools/perf/scripts/python/bin/event_analyzing_sample-record
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+#
+# event_analyzing_sample.py can cover all types of perf samples, including
+# tracepoints, so there are no special record requirements; just record
+# whatever you want to analyze.
+#
+perf record $@
diff --git a/tools/perf/scripts/python/bin/event_analyzing_sample-report b/tools/perf/scripts/python/bin/event_analyzing_sample-report
new file mode 100644
index 00000000000..0941fc94e15
--- /dev/null
+++ b/tools/perf/scripts/python/bin/event_analyzing_sample-report
@@ -0,0 +1,3 @@
+#!/bin/bash
+# description: analyze all perf samples
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/python/event_analyzing_sample.py
diff --git a/tools/perf/scripts/python/bin/failed-syscalls-by-pid-record b/tools/perf/scripts/python/bin/failed-syscalls-by-pid-record
index f8885d389e6..8104895a7b6 100644
--- a/tools/perf/scripts/python/bin/failed-syscalls-by-pid-record
+++ b/tools/perf/scripts/python/bin/failed-syscalls-by-pid-record
@@ -1,2 +1,2 @@
#!/bin/bash
-perf record -c 1 -f -a -M -R -e raw_syscalls:sys_exit
+perf record -e raw_syscalls:sys_exit $@
diff --git a/tools/perf/scripts/python/bin/failed-syscalls-by-pid-report b/tools/perf/scripts/python/bin/failed-syscalls-by-pid-report
index 1e0c0a860c8..fda5096d0cb 100644
--- a/tools/perf/scripts/python/bin/failed-syscalls-by-pid-report
+++ b/tools/perf/scripts/python/bin/failed-syscalls-by-pid-report
@@ -1,4 +1,10 @@
#!/bin/bash
# description: system-wide failed syscalls, by pid
# args: [comm]
-perf trace -s ~/libexec/perf-core/scripts/python/failed-syscalls-by-pid.py $1
+if [ $# -gt 0 ] ; then
+ if ! expr match "$1" "-" > /dev/null ; then
+ comm=$1
+ shift
+ fi
+fi
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/python/failed-syscalls-by-pid.py $comm
diff --git a/tools/perf/scripts/python/bin/futex-contention-record b/tools/perf/scripts/python/bin/futex-contention-record
new file mode 100644
index 00000000000..b1495c9a9b2
--- /dev/null
+++ b/tools/perf/scripts/python/bin/futex-contention-record
@@ -0,0 +1,2 @@
+#!/bin/bash
+perf record -e syscalls:sys_enter_futex -e syscalls:sys_exit_futex $@
diff --git a/tools/perf/scripts/python/bin/futex-contention-report b/tools/perf/scripts/python/bin/futex-contention-report
new file mode 100644
index 00000000000..6c44271091a
--- /dev/null
+++ b/tools/perf/scripts/python/bin/futex-contention-report
@@ -0,0 +1,4 @@
+#!/bin/bash
+# description: futex contention measurement
+
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/python/futex-contention.py
diff --git a/tools/perf/scripts/python/bin/net_dropmonitor-record b/tools/perf/scripts/python/bin/net_dropmonitor-record
new file mode 100755
index 00000000000..423fb81dada
--- /dev/null
+++ b/tools/perf/scripts/python/bin/net_dropmonitor-record
@@ -0,0 +1,2 @@
+#!/bin/bash
+perf record -e skb:kfree_skb $@
diff --git a/tools/perf/scripts/python/bin/net_dropmonitor-report b/tools/perf/scripts/python/bin/net_dropmonitor-report
new file mode 100755
index 00000000000..8d698f5a06a
--- /dev/null
+++ b/tools/perf/scripts/python/bin/net_dropmonitor-report
@@ -0,0 +1,4 @@
+#!/bin/bash
+# description: display a table of dropped frames
+
+perf script -s "$PERF_EXEC_PATH"/scripts/python/net_dropmonitor.py $@
diff --git a/tools/perf/scripts/python/bin/netdev-times-record b/tools/perf/scripts/python/bin/netdev-times-record
new file mode 100644
index 00000000000..558754b840a
--- /dev/null
+++ b/tools/perf/scripts/python/bin/netdev-times-record
@@ -0,0 +1,8 @@
+#!/bin/bash
+perf record -e net:net_dev_xmit -e net:net_dev_queue \
+ -e net:netif_receive_skb -e net:netif_rx \
+ -e skb:consume_skb -e skb:kfree_skb \
+ -e skb:skb_copy_datagram_iovec -e napi:napi_poll \
+ -e irq:irq_handler_entry -e irq:irq_handler_exit \
+ -e irq:softirq_entry -e irq:softirq_exit \
+ -e irq:softirq_raise $@
diff --git a/tools/perf/scripts/python/bin/netdev-times-report b/tools/perf/scripts/python/bin/netdev-times-report
new file mode 100644
index 00000000000..8f759291da8
--- /dev/null
+++ b/tools/perf/scripts/python/bin/netdev-times-report
@@ -0,0 +1,5 @@
+#!/bin/bash
+# description: display packet processing stages and the time spent in each
+# args: [tx] [rx] [dev=] [debug]
+
+perf script -s "$PERF_EXEC_PATH"/scripts/python/netdev-times.py $@
diff --git a/tools/perf/scripts/python/bin/sched-migration-record b/tools/perf/scripts/python/bin/sched-migration-record
new file mode 100644
index 00000000000..7493fddbe99
--- /dev/null
+++ b/tools/perf/scripts/python/bin/sched-migration-record
@@ -0,0 +1,2 @@
+#!/bin/bash
+perf record -m 16384 -e sched:sched_wakeup -e sched:sched_wakeup_new -e sched:sched_switch -e sched:sched_migrate_task $@
diff --git a/tools/perf/scripts/python/bin/sched-migration-report b/tools/perf/scripts/python/bin/sched-migration-report
new file mode 100644
index 00000000000..68b037a1849
--- /dev/null
+++ b/tools/perf/scripts/python/bin/sched-migration-report
@@ -0,0 +1,3 @@
+#!/bin/bash
+# description: sched migration overview
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/python/sched-migration.py
diff --git a/tools/perf/scripts/python/bin/sctop-record b/tools/perf/scripts/python/bin/sctop-record
new file mode 100644
index 00000000000..4efbfaa7f6a
--- /dev/null
+++ b/tools/perf/scripts/python/bin/sctop-record
@@ -0,0 +1,2 @@
+#!/bin/bash
+perf record -e raw_syscalls:sys_enter $@
diff --git a/tools/perf/scripts/python/bin/sctop-report b/tools/perf/scripts/python/bin/sctop-report
new file mode 100644
index 00000000000..c32db294124
--- /dev/null
+++ b/tools/perf/scripts/python/bin/sctop-report
@@ -0,0 +1,24 @@
+#!/bin/bash
+# description: syscall top
+# args: [comm] [interval]
+n_args=0
+for i in "$@"
+do
+ if expr match "$i" "-" > /dev/null ; then
+ break
+ fi
+ n_args=$(( $n_args + 1 ))
+done
+if [ "$n_args" -gt 2 ] ; then
+ echo "usage: sctop-report [comm] [interval]"
+ exit
+fi
+if [ "$n_args" -gt 1 ] ; then
+ comm=$1
+ interval=$2
+ shift 2
+elif [ "$n_args" -gt 0 ] ; then
+ interval=$1
+ shift
+fi
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/python/sctop.py $comm $interval
diff --git a/tools/perf/scripts/python/bin/syscall-counts-by-pid-record b/tools/perf/scripts/python/bin/syscall-counts-by-pid-record
index 45a8c50359d..4efbfaa7f6a 100644
--- a/tools/perf/scripts/python/bin/syscall-counts-by-pid-record
+++ b/tools/perf/scripts/python/bin/syscall-counts-by-pid-record
@@ -1,2 +1,2 @@
#!/bin/bash
-perf record -c 1 -f -a -M -R -e raw_syscalls:sys_enter
+perf record -e raw_syscalls:sys_enter $@
diff --git a/tools/perf/scripts/python/bin/syscall-counts-by-pid-report b/tools/perf/scripts/python/bin/syscall-counts-by-pid-report
index f8044d19227..16eb8d65c54 100644
--- a/tools/perf/scripts/python/bin/syscall-counts-by-pid-report
+++ b/tools/perf/scripts/python/bin/syscall-counts-by-pid-report
@@ -1,4 +1,10 @@
#!/bin/bash
# description: system-wide syscall counts, by pid
# args: [comm]
-perf trace -s ~/libexec/perf-core/scripts/python/syscall-counts-by-pid.py $1
+if [ $# -gt 0 ] ; then
+ if ! expr match "$1" "-" > /dev/null ; then
+ comm=$1
+ shift
+ fi
+fi
+perf script $@ -s "$PERF_EXEC_PATH"/scripts/python/syscall-counts-by-pid.py $comm
diff --git a/tools/perf/scripts/python/bin/syscall-counts-record b/tools/perf/scripts/python/bin/syscall-counts-record
index 45a8c50359d..4efbfaa7f6a 100644
--- a/tools/perf/scripts/python/bin/syscall-counts-record
+++ b/tools/perf/scripts/python/bin/syscall-counts-record
@@ -1,2 +1,2 @@
#!/bin/bash
-perf record -c 1 -f -a -M -R -e raw_syscalls:sys_enter
+perf record -e raw_syscalls:sys_enter $@
diff --git a/tools/perf/scripts/python/bin/syscall-counts-report b/tools/perf/scripts/python/bin/syscall-counts-report