19 files changed, 3463 insertions, 923 deletions
diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt
index 37732a220d3..c94435df203 100644
--- a/Documentation/trace/events.txt
+++ b/Documentation/trace/events.txt
@@ -287,3 +287,210 @@ their old filters):
 prev_pid == 0
 # cat sched_wakeup/filter
 common_pid == 0
+
+6. Event triggers
+=================
+
+Trace events can be made to conditionally invoke trigger 'commands'
+which can take various forms and are described in detail below;
+examples would be enabling or disabling other trace events or invoking
+a stack trace whenever the trace event is hit.  Whenever a trace event
+with attached triggers is invoked, the set of trigger commands
+associated with that event is invoked.  Any given trigger can
+additionally have an event filter of the same form as described in
+section 5 (Event filtering) associated with it - the command will only
+be invoked if the event being invoked passes the associated filter.
+If no filter is associated with the trigger, it always passes.
+
+Triggers are added to and removed from a particular event by writing
+trigger expressions to the 'trigger' file for the given event.
+
+A given event can have any number of triggers associated with it,
+subject to any restrictions that individual commands may have in that
+regard.
+
+Event triggers are implemented on top of "soft" mode, which means that
+whenever a trace event has one or more triggers associated with it,
+the event is activated even if it isn't actually enabled, but is
+disabled in a "soft" mode.  That is, the tracepoint will be called,
+but just will not be traced, unless of course it's actually enabled.
+This scheme allows triggers to be invoked even for events that aren't
+enabled, and also allows the current event filter implementation to be
+used for conditionally invoking triggers.
+
+The syntax for event triggers is roughly based on the syntax for
+set_ftrace_filter 'ftrace filter commands' (see the 'Filter commands'
+section of Documentation/trace/ftrace.txt), but there are major
+differences and the implementation isn't currently tied to it in any
+way, so beware about making generalizations between the two.
+
+6.1 Expression syntax
+---------------------
+
+Triggers are added by echoing the command to the 'trigger' file:
+
+  # echo 'command[:count] [if filter]' > trigger
+
+Triggers are removed by echoing the same command but starting with '!'
+to the 'trigger' file:
+
+  # echo '!command[:count] [if filter]' > trigger
+
+The [if filter] part isn't used in matching commands when removing, so
+leaving that off in a '!' command will accomplish the same thing as
+having it in.
+
+The filter syntax is the same as that described in the 'Event
+filtering' section above.
+
+For ease of use, writing to the trigger file using '>' currently just
+adds or removes a single trigger and there's no explicit '>>' support
+('>' actually behaves like '>>') or truncation support to remove all
+triggers (you have to use '!' for each one added.)
+
+6.2 Supported trigger commands
+------------------------------
+
+The following commands are supported:
+
+- enable_event/disable_event
+
+  These commands can enable or disable another trace event whenever
+  the triggering event is hit.  When these commands are registered,
+  the other trace event is activated, but disabled in a "soft" mode.
+  That is, the tracepoint will be called, but just will not be traced.
+  The event tracepoint stays in this mode as long as there's a trigger
+  in effect that can trigger it.
+
+  For example, the following trigger causes kmalloc events to be
+  traced when a read system call is entered, and the :1 at the end
+  specifies that this enablement happens only once:
+
+  # echo 'enable_event:kmem:kmalloc:1' > \
+      /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/trigger
+
+  The following trigger causes kmalloc events to stop being traced
+  when a read system call exits.  This disablement happens on every
+  read system call exit:
+
+  # echo 'disable_event:kmem:kmalloc' > \
+      /sys/kernel/debug/tracing/events/syscalls/sys_exit_read/trigger
+
+  The format is:
+
+      enable_event:<system>:<event>[:count]
+      disable_event:<system>:<event>[:count]
+
+  To remove the above commands:
+
+  # echo '!enable_event:kmem:kmalloc:1' > \
+      /sys/kernel/debug/tracing/events/syscalls/sys_enter_read/trigger
+
+  # echo '!disable_event:kmem:kmalloc' > \
+      /sys/kernel/debug/tracing/events/syscalls/sys_exit_read/trigger
+
+  Note that there can be any number of enable/disable_event triggers
+  per triggering event, but there can only be one trigger per
+  triggered event. e.g. sys_enter_read can have triggers enabling both
+  kmem:kmalloc and sched:sched_switch, but can't have two kmem:kmalloc
+  versions such as kmem:kmalloc and kmem:kmalloc:1 or 'kmem:kmalloc if
+  bytes_req == 256' and 'kmem:kmalloc if bytes_alloc == 256' (they
+  could be combined into a single filter on kmem:kmalloc though).
+
+- stacktrace
+
+  This command dumps a stacktrace in the trace buffer whenever the
+  triggering event occurs.
+
+  For example, the following trigger dumps a stacktrace every time the
+  kmalloc tracepoint is hit:
+
+  # echo 'stacktrace' > \
+        /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
+
+  The following trigger dumps a stacktrace the first 5 times a kmalloc
+  request happens with a size >= 64K
+
+  # echo 'stacktrace:5 if bytes_req >= 65536' > \
+        /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
+
+  The format is:
+
+      stacktrace[:count]
+
+  To remove the above commands:
+
+  # echo '!stacktrace' > \
+        /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
+
+  # echo '!stacktrace:5 if bytes_req >= 65536' > \
+        /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
+
+  The latter can also be removed more simply by the following (without
+  the filter):
+
+  # echo '!stacktrace:5' > \
+        /sys/kernel/debug/tracing/events/kmem/kmalloc/trigger
+
+  Note that there can be only one stacktrace trigger per triggering
+  event.
+
+- snapshot
+
+  This command causes a snapshot to be triggered whenever the
+  triggering event occurs.
+
+  The following command creates a snapshot every time a block request
+  queue is unplugged with a depth > 1.  If you were tracing a set of
+  events or functions at the time, the snapshot trace buffer would
+  capture those events when the trigger event occured:
+
+  # echo 'snapshot if nr_rq > 1' > \
+        /sys/kernel/debug/tracing/events/block/block_unplug/trigger
+
+  To only snapshot once:
+
+  # echo 'snapshot:1 if nr_rq > 1' > \
+        /sys/kernel/debug/tracing/events/block/block_unplug/trigger
+
+  To remove the above commands:
+
+  # echo '!snapshot if nr_rq > 1' > \
+        /sys/kernel/debug/tracing/events/block/block_unplug/trigger
+
+  # echo '!snapshot:1 if nr_rq > 1' > \
+        /sys/kernel/debug/tracing/events/block/block_unplug/trigger
+
+  Note that there can be only one snapshot trigger per triggering
+  event.
+
+- traceon/traceoff
+
+  These commands turn tracing on and off when the specified events are
+  hit. The parameter determines how many times the tracing system is
+  turned on and off. If unspecified, there is no limit.
+
+  The following command turns tracing off the first time a block
+  request queue is unplugged with a depth > 1.  If you were tracing a
+  set of events or functions at the time, you could then examine the
+  trace buffer to see the sequence of events that led up to the
+  trigger event:
+
+  # echo 'traceoff:1 if nr_rq > 1' > \
+        /sys/kernel/debug/tracing/events/block/block_unplug/trigger
+
+  To always disable tracing when nr_rq  > 1 :
+
+  # echo 'traceoff if nr_rq > 1' > \
+        /sys/kernel/debug/tracing/events/block/block_unplug/trigger
+
+  To remove the above commands:
+
+  # echo '!traceoff:1 if nr_rq > 1' > \
+        /sys/kernel/debug/tracing/events/block/block_unplug/trigger
+
+  # echo '!traceoff if nr_rq > 1' > \
+        /sys/kernel/debug/tracing/events/block/block_unplug/trigger
+
+  Note that there can be only one traceon or traceoff trigger per
+  triggering event.
diff --git a/Documentation/trace/uprobetracer.txt b/Documentation/trace/uprobetracer.txt
index d9c3e682312..f1cf9a34ad9 100644
--- a/Documentation/trace/uprobetracer.txt
+++ b/Documentation/trace/uprobetracer.txt
@@ -19,18 +19,44 @@ user to calculate the offset of the probepoint in the object.
 
 Synopsis of uprobe_tracer
 -------------------------
-  p[:[GRP/]EVENT] PATH:SYMBOL[+offs] [FETCHARGS] : Set a uprobe
-  r[:[GRP/]EVENT] PATH:SYMBOL[+offs] [FETCHARGS] : Set a return uprobe (uretprobe)
-  -:[GRP/]EVENT                                  : Clear uprobe or uretprobe event
+  p[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] : Set a uprobe
+  r[:[GRP/]EVENT] PATH:OFFSET [FETCHARGS] : Set a return uprobe (uretprobe)
+  -:[GRP/]EVENT                           : Clear uprobe or uretprobe event
 
   GRP           : Group name. If omitted, "uprobes" is the default value.
   EVENT         : Event name. If omitted, the event name is generated based
-                  on SYMBOL+offs.
+                  on PATH+OFFSET.
   PATH          : Path to an executable or a library.
-  SYMBOL[+offs] : Symbol+offset where the probe is inserted.
+  OFFSET        : Offset where the probe is inserted.
 
   FETCHARGS     : Arguments. Each probe can have up to 128 args.
    %REG         : Fetch register REG
+   @ADDR	: Fetch memory at ADDR (ADDR should be in userspace)
+   @+OFFSET	: Fetch memory at OFFSET (OFFSET from same file as PATH)
+   $stackN	: Fetch Nth entry of stack (N >= 0)
+   $stack	: Fetch stack address.
+   $retval	: Fetch return value.(*)
+   +|-offs(FETCHARG) : Fetch memory at FETCHARG +|- offs address.(**)
+   NAME=FETCHARG     : Set NAME as the argument name of FETCHARG.
+   FETCHARG:TYPE     : Set TYPE as the type of FETCHARG. Currently, basic types
+		       (u8/u16/u32/u64/s8/s16/s32/s64), "string" and bitfield
+		       are supported.
+
+  (*) only for return probe.
+  (**) this is useful for fetching a field of data structures.
+
+Types
+-----
+Several types are supported for fetch-args. Uprobe tracer will access memory
+by given type. Prefix 's' and 'u' means those types are signed and unsigned
+respectively. Traced arguments are shown in decimal (signed) or hex (unsigned).
+String type is a special type, which fetches a "null-terminated" string from
+user space.
+Bitfield is another special type, which takes 3 parameters, bit-width, bit-
+offset, and container-size (usually 32). The syntax is;
+
+ b<bit-width>@<bit-offset>/<container-size>
+
 
 Event Profiling
 ---------------
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 31ea4b42836..f4233b195da 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -570,8 +570,6 @@ static inline int
 ftrace_regex_release(struct inode *inode, struct file *file) { return -ENODEV; }
 #endif /* CONFIG_DYNAMIC_FTRACE */
 
-loff_t ftrace_filter_lseek(struct file *file, loff_t offset, int whence);
-
 /* totally disable ftrace - can not re-enable after this */
 void ftrace_kill(void);
 
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
index 8c9b7a1c413..4e4cc28623a 100644
--- a/include/linux/ftrace_event.h
+++ b/include/linux/ftrace_event.h
@@ -1,3 +1,4 @@
+
 #ifndef _LINUX_FTRACE_EVENT_H
 #define _LINUX_FTRACE_EVENT_H
 
@@ -264,6 +265,8 @@ enum {
 	FTRACE_EVENT_FL_NO_SET_FILTER_BIT,
 	FTRACE_EVENT_FL_SOFT_MODE_BIT,
 	FTRACE_EVENT_FL_SOFT_DISABLED_BIT,
+	FTRACE_EVENT_FL_TRIGGER_MODE_BIT,
+	FTRACE_EVENT_FL_TRIGGER_COND_BIT,
 };
 
 /*
@@ -275,6 +278,8 @@ enum {
  *  SOFT_MODE     - The event is enabled/disabled by SOFT_DISABLED
  *  SOFT_DISABLED - When set, do not trace the event (even though its
  *                   tracepoint may be enabled)
+ *  TRIGGER_MODE  - When set, invoke the triggers associated with the event
+ *  TRIGGER_COND  - When set, one or more triggers has an associated filter
  */
 enum {
 	FTRACE_EVENT_FL_ENABLED		= (1 << FTRACE_EVENT_FL_ENABLED_BIT),
@@ -283,6 +288,8 @@ enum {
 	FTRACE_EVENT_FL_NO_SET_FILTER	= (1 << FTRACE_EVENT_FL_NO_SET_FILTER_BIT),
 	FTRACE_EVENT_FL_SOFT_MODE	= (1 << FTRACE_EVENT_FL_SOFT_MODE_BIT),
 	FTRACE_EVENT_FL_SOFT_DISABLED	= (1 << FTRACE_EVENT_FL_SOFT_DISABLED_BIT),
+	FTRACE_EVENT_FL_TRIGGER_MODE	= (1 << FTRACE_EVENT_FL_TRIGGER_MODE_BIT),
+	FTRACE_EVENT_FL_TRIGGER_COND	= (1 << FTRACE_EVENT_FL_TRIGGER_COND_BIT),
 };
 
 struct ftrace_event_file {
@@ -292,6 +299,7 @@ struct ftrace_event_file {
 	struct dentry			*dir;
 	struct trace_array		*tr;
 	struct ftrace_subsystem_dir	*system;
+	struct list_head		triggers;
 
 	/*
 	 * 32 bit flags:
@@ -299,6 +307,7 @@ struct ftrace_event_file {
 	 *   bit 1:		enabled cmd record
 	 *   bit 2:		enable/disable with the soft disable bit
 	 *   bit 3:		soft disabled
+	 *   bit 4:		trigger enabled
 	 *
 	 * Note: The bits must be set atomically to prevent races
 	 * from other writers. Reads of flags do not need to be in
@@ -310,6 +319,7 @@ struct ftrace_event_file {
 	 */
 	unsigned long		flags;
 	atomic_t		sm_ref;	/* soft-mode reference counter */
+	atomic_t		tm_ref;	/* trigger-mode reference counter */
 };
 
 #define __TRACE_EVENT_FLAGS(name, value)				\
@@ -337,6 +347,14 @@ struct ftrace_event_file {
 
 #define MAX_FILTER_STR_VAL	256	/* Should handle KSYM_SYMBOL_LEN */
 
+enum event_trigger_type {
+	ETT_NONE		= (0),
+	ETT_TRACE_ONOFF		= (1 << 0),
+	ETT_SNAPSHOT		= (1 << 1),
+	ETT_STACKTRACE		= (1 << 2),
+	ETT_EVENT_ENABLE	= (1 << 3),
+};
+
 extern void destroy_preds(struct ftrace_event_file *file);
 extern void destroy_call_preds(struct ftrace_event_call *call);
 extern int filter_match_preds(struct event_filter *filter, void *rec);
@@ -347,6 +365,127 @@ extern int filter_check_discard(struct ftrace_event_file *file, void *rec,
 extern int call_filter_check_discard(struct ftrace_event_call *call, void *rec,
 				     struct ring_buffer *buffer,
 				     struct ring_buffer_event *event);
+extern enum event_trigger_type event_triggers_call(struct ftrace_event_file *file,
+						   void *rec);
+extern void event_triggers_post_call(struct ftrace_event_file *file,
+				     enum event_trigger_type tt);
+
+/**
+ * ftrace_trigger_soft_disabled - do triggers and test if soft disabled
+ * @file: The file pointer of the event to test
+ *
+ * If any triggers without filters are attached to this event, they
+ * will be called here. If the event is soft disabled and has no
+ * triggers that require testing the fields, it will return true,
+ * otherwise false.
+ */
+static inline bool
+ftrace_trigger_soft_disabled(struct ftrace_event_file *file)
+{
+	unsigned long eflags = file->flags;
+
+	if (!(eflags & FTRACE_EVENT_FL_TRIGGER_COND)) {
+		if (eflags & FTRACE_EVENT_FL_TRIGGER_MODE)
+			event_triggers_call(file, NULL);
+		if (eflags & FTRACE_EVENT_FL_SOFT_DISABLED)
+			return true;
+	}
+	return false;
+}
+
+/*
+ * Helper function for event_trigger_unlock_commit{_regs}().
+ * If there are event triggers attached to this event that requires
+ * filtering against its fields, then they wil be called as the
+ * entry already holds the field information of the current event.
+ *
+ * It also checks if the event should be discarded or not.
+ * It is to be discarded if the event is soft disabled and the
+ * event was only recorded to process triggers, or if the event
+ * filter is active and this event did not match the filters.
+ *
+ * Returns true if the event is discarded, false otherwise.
+ */
+static inline bool
+__event_trigger_test_discard(struct ftrace_event_file *file,
+			     struct ring_buffer *buffer,
+			     struct ring_buffer_event *event,
+			     void *entry,
+			     enum event_trigger_type *tt)
+{
+	unsigned long eflags = file->flags;
+
+	if (eflags & FTRACE_EVENT_FL_TRIGGER_COND)
+		*tt = event_triggers_call(file, entry);
+
+	if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags))
+		ring_buffer_discard_commit(buffer, event);
+	else if (!filter_check_discard(file, entry, buffer, event))
+		return false;
+
+	return true;
+}
+
+/**
+ * event_trigger_unlock_commit - handle triggers and finish event commit
+ * @file: The file pointer assoctiated to the event
+ * @buffer: The ring buffer that the event is being written to
+ * @event: The event meta data in the ring buffer
+ * @entry: The event itself
+ * @irq_flags: The state of the interrupts at the start of the event
+ * @pc: The state of the preempt count at the start of the event.
+ *
+ * This is a helper function to handle triggers that require data
+ * from the event itself. It also tests the event against filters and
+ * if the event is soft disabled and should be discarded.
+ */
+static inline void
+event_trigger_unlock_commit(struct ftrace_event_file *file,
+			    struct ring_buffer *buffer,
+			    struct ring_buffer_event *event,
+			    void *entry, unsigned long irq_flags, int pc)
+{
+	enum event_trigger_type tt = ETT_NONE;
+
+	if (!__event_trigger_test_discard(file, buffer, event, entry, &tt))
+		trace_buffer_unlock_commit(buffer, event, irq_flags, pc);
+
+	if (tt)
+		event_triggers_post_call(file, tt);
+}
+
+/**
+ * event_trigger_unlock_commit_regs - handle triggers and finish event commit
+ * @file: The file pointer assoctiated to the event
+ * @buffer: The ring buffer that the event is being written to
+ * @event: The event meta data in the ring buffer
+ * @entry: The event itself
+ * @irq_flags: The state of the interrupts at the start of the event
+ * @pc: The state of the preempt count at the start of the event.
+ *
+ * This is a helper function to handle triggers that require data
+ * from the event itself. It also tests the event against filters and
+ * if the event is soft disabled and should be discarded.
+ *
+ * Same as event_trigger_unlock_commit() but calls
+ * trace_buffer_unlock_commit_regs() instead of trace_buffer_unlock_commit().
+ */
+static inline void
+event_trigger_unlock_commit_regs(struct ftrace_event_file *file,
+				 struct ring_buffer *buffer,
+				 struct ring_buffer_event *event,
+				 void *entry, unsigned long irq_flags, int pc,
+				 struct pt_regs *regs)
+{
+	enum event_trigger_type tt = ETT_NONE;
+
+	if (!__event_trigger_test_discard(file, buffer, event, entry, &tt))
+		trace_buffer_unlock_commit_regs(buffer, event,
+						irq_flags, pc, regs);
+
+	if (tt)
+		event_triggers_post_call(file, tt);
+}
 
 enum {
 	FILTER_OTHER = 0,
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
index 5c38606613d..1a8b28db377 100644
--- a/include/trace/ftrace.h
+++ b/include/trace/ftrace.h
@@ -418,6 +418,8 @@ static inline notrace int ftrace_get_offsets_##call(			\
  *	struct ftrace_event_file *ftrace_file = __data;
  *	struct ftrace_event_call *event_call = ftrace_file->event_call;
  *	struct ftrace_data_offsets_<call> __maybe_unused __data_offsets;
+ *	unsigned long eflags = ftrace_file->flags;
+ *	enum event_trigger_type __tt = ETT_NONE;
  *	struct ring_buffer_event *event;
  *	struct ftrace_raw_<call> *entry; <-- defined in stage 1
  *	struct ring_buffer *buffer;
@@ -425,9 +427,12 @@ static inline notrace int ftrace_get_offsets_##call(			\
  *	int __data_size;
  *	int pc;
  *
- *	if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT,
- *		     &ftrace_file->flags))
- *		return;
+ *	if (!(eflags & FTRACE_EVENT_FL_TRIGGER_COND)) {
+ *		if (eflags & FTRACE_EVENT_FL_TRIGGER_MODE)
+ *			event_triggers_call(ftrace_file, NULL);
+ *		if (eflags & FTRACE_EVENT_FL_SOFT_DISABLED)
+ *			return;
+ *	}
  *
  *	local_save_flags(irq_flags);
  *	pc = preempt_count();
@@ -445,8 +450,17 @@ static inline notrace int ftrace_get_offsets_##call(			\
  *	{ <assign>; }  <-- Here we assign the entries by the __field and
  *			   __array macros.
  *
- *	if (!filter_check_discard(ftrace_file, entry, buffer, event))
+ *	if (eflags & FTRACE_EVENT_FL_TRIGGER_COND)
+ *		__tt = event_triggers_call(ftrace_file, entry);
+ *
+ *	if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT,
+ *		     &ftrace_file->flags))
+ *		ring_buffer_discard_commit(buffer, event);
+ *	else if (!filter_check_discard(ftrace_file, entry, buffer, event))
  *		trace_buffer_unlock_commit(buffer, event, irq_flags, pc);
+ *
+ *	if (__tt)
+ *		event_triggers_post_call(ftrace_file, __tt);
  * }
  *
  * static struct trace_event ftrace_event_type_<call> = {
@@ -539,8 +553,7 @@ ftrace_raw_event_##call(void *__data, proto)				\
 	int __data_size;						\
 	int pc;								\
 									\
-	if (test_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT,			\
-		     &ftrace_file->flags))				\
+	if (ftrace_trigger_soft_disabled(ftrace_file))			\
 		return;							\
 									\
 	local_save_flags(irq_flags);					\
@@ -560,8 +573,8 @@ ftrace_raw_event_##call(void *__data, proto)				\
 									\
 	{ assign; }							\
 									\
-	if (!filter_check_discard(ftrace_file, entry, buffer, event))	\
-		trace_buffer_unlock_commit(buffer, event, irq_flags, pc); \
+	event_trigger_unlock_commit(ftrace_file, buffer, event, entry, \
+				    irq_flags, pc);		       \
 }
 /*
  * The ftrace_test_probe is compiled out, it is only here as a build time check
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index b886a5e7d4f..307d87c0991 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1854,6 +1854,10 @@ static void handle_swbp(struct pt_regs *regs)
 	if (unlikely(!test_bit(UPROBE_COPY_INSN, &uprobe->flags)))
 		goto out;
 
+	/* Tracing handlers use ->utask to communicate with fetch methods */
+	if (!get_utask())
+		goto out;
+
 	handler_chain(uprobe, regs);
 	if (can_skip_sstep(uprobe, regs))
 		goto out;
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index d7e2068e4b7..1378e84fbe3 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -50,6 +50,7 @@ ifeq ($(CONFIG_PERF_EVENTS),y)
 obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o
 endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
+obj-$(CONFIG_EVENT_TRACING) += trace_events_trigger.o
 obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o
 obj-$(CONFIG_TRACEPOINTS) += power-traces.o
 ifeq ($(CONFIG_PM_RUNTIME),y)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 72a0f81dc5a..cd7f76d1eb8 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -85,6 +85,8 @@ int function_trace_stop __read_mostly;
 
 /* Current function tracing op */
 struct ftrace_ops *function_trace_op __read_mostly = &ftrace_list_end;
+/* What to set function_trace_op to */
+static struct ftrace_ops *set_function_trace_op;
 
 /* List for set_ftrace_pid's pids. */
 LIST_HEAD(ftrace_pids);
@@ -278,6 +280,29 @@ static void update_global_ops(void)
 	global_ops.func = func;
 }
 
+static void ftrace_sync(struct work_struct *work)
+{
+	/*
+	 * This function is just a stub to implement a hard force
+	 * of synchronize_sched(). This requires synchronizing
+	 * tasks even in userspace and idle.
+	 *
+	 * Yes, function tracing is rude.
+	 */
+}
+
+static void ftrace_sync_ipi(void *data)
+{
+	/* Probably not needed, but do it anyway */
+	smp_rmb();
+}
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+static void update_function_graph_func(void);
+#else
+static inline void update_function_graph_func(void) { }
+#endif
+
 static void update_ftrace_function(void)
 {
 	ftrace_func_t func;
@@ -296,16 +321,61 @@ static void update_ftrace_function(void)
 	     !FTRACE_FORCE_LIST_FUNC)) {
 		/* Set the ftrace_ops that the arch callback uses */
 		if (ftrace_ops_list == &global_ops)
-			function_trace_op = ftrace_global_list;
+			set_function_trace_op = ftrace_global_list;
 		else
-			function_trace_op = ftrace_ops_list;
+			set_function_trace_op = ftrace_ops_list;
 		func = ftrace_ops_list->func;
 	} else {
 		/* Just use the default ftrace_ops */
-		function_trace_op = &ftrace_list_end;
+		set_function_trace_op = &ftrace_list_end;
 		func = ftrace_ops_list_func;
 	}
 
+	/* If there's no change, then do nothing more here */
+	if (ftrace_trace_function == func)
+		return;
+
+	update_function_graph_func();
+
+	/*
+	 * If we are using the list function, it doesn't care
+	 * about the function_trace_ops.
+	 */
+	if (func == ftrace_ops_list_func) {
+		ftrace_trace_function = func;
+		/*
+		 * Don't even bother setting function_trace_ops,
+		 * it would be racy to do so anyway.
+		 */
+		return;
+	}
+
+#ifndef CONFIG_DYNAMIC_FTRACE
+	/*
+	 * For static tracing, we need to be a bit more careful.
+	 * The function change takes affect immediately. Thus,
+	 * we need to coorditate the setting of the function_trace_ops
+	 * with the setting of the ftrace_trace_function.
+	 *
+	 * Set the function to the list ops, which will call the
+	 * function we want, albeit indirectly, but it handles the
+	 * ftrace_ops and doesn't depend on function_trace_op.
+	 */
+	ftrace_trace_function = ftrace_ops_list_func;
+	/*
+	 * Make sure all CPUs see this. Yes this is slow, but static
+	 * tracing is slow and nasty to have enabled.
+	 */
+	schedule_on_each_cpu(ftrace_sync);
+	/* Now all cpus are using the list ops. */
+	function_trace_op = set_function_trace_op;
+	/* Make sure the function_trace_op is visible on all CPUs */
+	smp_wmb();
+	/* Nasty way to force a rmb on all cpus */
+	smp_call_function(ftrace_sync_ipi, NULL, 1);
+	/* OK, we are all set to update the ftrace_trace_function now! */
+#endif /* !CONFIG_DYNAMIC_FTRACE */
+
 	ftrace_trace_function = func;
 }
 
@@ -410,17 +480,6 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
 	return 0;
 }
 
-static void ftrace_sync(struct work_struct *work)
-{
-	/*
-	 * This function is just a stub to implement a hard force
-	 * of synchronize_sched(). This requires synchronizing
-	 * tasks even in userspace and idle.
-	 *
-	 * Yes, function tracing is rude.
-	 */
-}
-
 static int __unregister_ftrace_function(struct ftrace_ops *ops)
 {
 	int ret;
@@ -439,20 +498,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
 	} else if (ops->flags & FTRACE_OPS_FL_CONTROL) {
 		ret = remove_ftrace_list_ops(&ftrace_control_list,
 					     &control_ops, ops);
-		if (!ret) {
-			/*
-			 * The ftrace_ops is now removed from the list,
-			 * so there'll be no new users. We must ensure
-			 * all current users are done before we free
-			 * the control data.
-			 * Note synchronize_sched() is not enough, as we
-			 * use preempt_disable() to do RCU, but the function
-			 * tracer can be called where RCU is not active
-			 * (before user_exit()).
-			 */
-			schedule_on_each_cpu(ftrace_sync);
-			control_ops_free(ops);
-		}
 	} else
 		ret = remove_ftrace_ops(&ftrace_ops_list, ops);
 
@@ -462,17 +507,6 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
 	if (ftrace_enabled)
 		update_ftrace_function();
 
-	/*
-	 * Dynamic ops may be freed, we must make sure that all
-	 * callers are done before leaving this function.
-	 *
-	 * Again, normal synchronize_sched() is not good enough.
-	 * We need to do a hard force of sched synchronization.
-	 */
-	if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
-		schedule_on_each_cpu(ftrace_sync);
-
-
 	return 0;
 }
 
@@ -1082,19 +1116,6 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
 
 static struct pid * const ftrace_swapper_pid = &init_struct_pid;
 
-loff_t
-ftrace_filter_lseek(struct file *file, loff_t offset, int whence)
-{
-	loff_t ret;
-
-	if (file->f_mode & FMODE_READ)
-		ret = seq_lseek(file, offset, whence);
-	else
-		file->f_pos = ret = 1;
-
-	return ret;
-}
-
 #ifdef CONFIG_DYNAMIC_FTRACE
 
 #ifndef CONFIG_FTRACE_MCOUNT_RECORD
@@ -1992,8 +2013,14 @@ void ftrace_modify_all_code(int command)
 	else if (command & FTRACE_DISABLE_CALLS)
 		ftrace_replace_code(0);
 
-	if (update && ftrace_trace_function != ftrace_ops_list_func)
+	if (update && ftrace_trace_function != ftrace_ops_list_func) {
+		function_trace_op = set_function_trace_op;
+		smp_wmb();
+		/* If irqs are disabled, we are in stop machine */
+		if (!irqs_disabled())
+			smp_call_function(ftrace_sync_ipi, NULL, 1);
 		ftrace_update_ftrace_func(ftrace_trace_function);
+	}
 
 	if (command & FTRACE_START_FUNC_RET)
 		ftrace_enable_ftrace_graph_caller();
@@ -2156,10 +2183,41 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
 		command |= FTRACE_UPDATE_TRACE_FUNC;
 	}
 
-	if (!command || !ftrace_enabled)
+	if (!command || !ftrace_enabled) {
+		/*
+		 * If these are control ops, they still need their
+		 * per_cpu field freed. Since, function tracing is
+		 * not currently active, we can just free them
+		 * without synchronizing all CPUs.
+		 */
+		if (ops->flags & FTRACE_OPS_FL_CONTROL)
+			control_ops_free(ops);
 		return 0;
+	}
 
 	ftrace_run_update_code(command);
+
+	/*
+	 * Dynamic ops may be freed, we must make sure that all
+	 * callers are done before leaving this function.
+	 * The same goes for freeing the per_cpu data of the control
+	 * ops.
+	 *
+	 * Again, normal synchronize_sched() is not good enough.
+	 * We need to do a hard force of sched synchronization.
+	 * This is because we use preempt_disable() to do RCU, but
+	 * the function tracers can be called where RCU is not watching
+	 * (like before user_exit()). We can not rely on the RCU
+	 * infrastructure to do the synchronization, thus we must do it
+	 * ourselves.
+	 */
+	if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) {
+		schedule_on_each_cpu(ftrace_sync);
+
+		if (ops->flags & FTRACE_OPS_FL_CONTROL)
+			control_ops_free(ops);
+	}
+
 	return 0;
 }
 
@@ -2739,7 +2797,7 @@ static void ftrace_filter_reset(struct ftrace_hash *hash)
  * routine, you can use ftrace_filter_write() for the write
  * routine if @flag has FTRACE_ITER_FILTER set, or
  * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set.
- * ftrace_filter_lseek() should be used as the lseek routine, and
+ * tracing_lseek() should be used as the lseek routine, and
  * release must call ftrace_regex_release().
  */
 int
@@ -3767,7 +3825,7 @@ static const struct file_operations ftrace_filter_fops = {
 	.open = ftrace_filter_open,
 	.read = seq_read,
 	.write = ftrace_filter_write,
-	.llseek = ftrace_filter_lseek,
+	.llseek = tracing_lseek,
 	.release = ftrace_regex_release,
 };
 
@@ -3775,7 +3833,7 @@ static const struct file_operations ftrace_notrace_fops = {
 	.open = ftrace_notrace_open,
 	.read = seq_read,
 	.write = ftrace_notrace_write,
-	.llseek = ftrace_filter_lseek,
+	.llseek = tracing_lseek,
 	.release = ftrace_regex_release,
 };
 
@@ -4038,7 +4096,7 @@ static const struct file_operations ftrace_graph_fops = {
 	.open		= ftrace_graph_open,
 	.read		= seq_read,
 	.write		= ftrace_graph_write,
-	.llseek		= ftrace_filter_lseek,
+	.llseek		= tracing_lseek,
 	.release	= ftrace_graph_release,
 };
 
@@ -4046,7 +4104,7 @@ static const struct file_operations ftrace_graph_notrace_fops = {
 	.open		= ftrace_graph_notrace_open,
 	.read		= seq_read,
 	.write		= ftrace_graph_write,
-	.llseek		= ftrace_filter_lseek,
+	.llseek		= tracing_lseek,
 	.release	= ftrace_graph_release,
 };
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
@@ -4719,7 +4777,7 @@ static const struct file_operations ftrace_pid_fops = {
 	.open		= ftrace_pid_open,
 	.write		= ftrace_pid_write,
 	.read		= seq_read,
-	.llseek		= ftrace_filter_lseek,
+	.llseek		= tracing_lseek,
 	.release	= ftrace_pid_release,
 };
 
@@ -4862,6 +4920,7 @@ int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
 trace_func_graph_ret_t ftrace_graph_return =
 			(trace_func_graph_ret_t)ftrace_stub;
 trace_func_graph_ent_t ftrace_graph_entry = ftrace_graph_entry_stub;
+static trace_func_graph_ent_t __ftrace_graph_entry = ftrace_graph_entry_stub;
 
 /* Try to assign a return stack array on FTRACE_RETSTACK_ALLOC_SIZE tasks. */
 static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
@@ -5003,6 +5062,30 @@ static struct ftrace_ops fgraph_ops __read_mostly = {
 				FTRACE_OPS_FL_RECURSION_SAFE,
 };
 
+static int