aboutsummaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.locks7
-rw-r--r--kernel/audit.c64
-rw-r--r--kernel/context_tracking.c3
-rw-r--r--kernel/cpu.c5
-rw-r--r--kernel/events/core.c80
-rw-r--r--kernel/events/uprobes.c58
-rw-r--r--kernel/fork.c2
-rw-r--r--kernel/irq/irqdesc.c4
-rw-r--r--kernel/kexec.c1
-rw-r--r--kernel/kprobes.c392
-rw-r--r--kernel/locking/Makefile1
-rw-r--r--kernel/locking/qrwlock.c133
-rw-r--r--kernel/locking/rtmutex-debug.h5
-rw-r--r--kernel/locking/rtmutex.c243
-rw-r--r--kernel/locking/rtmutex.h5
-rw-r--r--kernel/locking/rwsem-xadd.c225
-rw-r--r--kernel/locking/rwsem.c31
-rw-r--r--kernel/notifier.c22
-rw-r--r--kernel/power/hibernate.c40
-rw-r--r--kernel/power/main.c6
-rw-r--r--kernel/power/process.c3
-rw-r--r--kernel/power/suspend.c14
-rw-r--r--kernel/power/user.c3
-rw-r--r--kernel/printk/printk.c44
-rw-r--r--kernel/sched/core.c189
-rw-r--r--kernel/sched/deadline.c2
-rw-r--r--kernel/sched/fair.c390
-rw-r--r--kernel/sched/features.h8
-rw-r--r--kernel/sched/idle.c30
-rw-r--r--kernel/sched/rt.c3
-rw-r--r--kernel/sched/sched.h24
-rw-r--r--kernel/seccomp.c110
-rw-r--r--kernel/smp.c57
-rw-r--r--kernel/sysctl.c22
-rw-r--r--kernel/trace/ring_buffer.c5
-rw-r--r--kernel/trace/trace.c51
-rw-r--r--kernel/trace/trace.h2
-rw-r--r--kernel/trace/trace_event_perf.c5
-rw-r--r--kernel/trace/trace_kprobe.c71
-rw-r--r--kernel/trace/trace_probe.c65
-rw-r--r--kernel/trace/trace_probe.h15
-rw-r--r--kernel/trace/trace_uprobe.c112
-rw-r--r--kernel/tracepoint.c26
-rw-r--r--kernel/watchdog.c41
44 files changed, 1787 insertions, 832 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index d2b32ac27a3..35536d9c096 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -223,3 +223,10 @@ endif
config MUTEX_SPIN_ON_OWNER
def_bool y
depends on SMP && !DEBUG_MUTEXES
+
+config ARCH_USE_QUEUE_RWLOCK
+ bool
+
+config QUEUE_RWLOCK
+ def_bool y if ARCH_USE_QUEUE_RWLOCK
+ depends on SMP
diff --git a/kernel/audit.c b/kernel/audit.c
index f30106459a3..3ef2e0e797e 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -424,6 +424,38 @@ static void kauditd_send_skb(struct sk_buff *skb)
}
/*
+ * kauditd_send_multicast_skb - send the skb to multicast userspace listeners
+ *
+ * This function doesn't consume an skb as might be expected since it has to
+ * copy it anyways.
+ */
+static void kauditd_send_multicast_skb(struct sk_buff *skb)
+{
+ struct sk_buff *copy;
+ struct audit_net *aunet = net_generic(&init_net, audit_net_id);
+ struct sock *sock = aunet->nlsk;
+
+ if (!netlink_has_listeners(sock, AUDIT_NLGRP_READLOG))
+ return;
+
+ /*
+ * The seemingly wasteful skb_copy() rather than bumping the refcount
+ * using skb_get() is necessary because non-standard mods are made to
+ * the skb by the original kaudit unicast socket send routine. The
+ * existing auditd daemon assumes this breakage. Fixing this would
+ * require co-ordinating a change in the established protocol between
+ * the kaudit kernel subsystem and the auditd userspace code. There is
+ * no reason for new multicast clients to continue with this
+ * non-compliance.
+ */
+ copy = skb_copy(skb, GFP_KERNEL);
+ if (!copy)
+ return;
+
+ nlmsg_multicast(sock, copy, 0, AUDIT_NLGRP_READLOG, GFP_KERNEL);
+}
+
+/*
* flush_hold_queue - empty the hold queue if auditd appears
*
* If auditd just started, drain the queue of messages already
@@ -1076,10 +1108,22 @@ static void audit_receive(struct sk_buff *skb)
mutex_unlock(&audit_cmd_mutex);
}
+/* Run custom bind function on netlink socket group connect or bind requests. */
+static int audit_bind(int group)
+{
+ if (!capable(CAP_AUDIT_READ))
+ return -EPERM;
+
+ return 0;
+}
+
static int __net_init audit_net_init(struct net *net)
{
struct netlink_kernel_cfg cfg = {
.input = audit_receive,
+ .bind = audit_bind,
+ .flags = NL_CFG_F_NONROOT_RECV,
+ .groups = AUDIT_NLGRP_MAX,
};
struct audit_net *aunet = net_generic(net, audit_net_id);
@@ -1901,10 +1945,10 @@ out:
* audit_log_end - end one audit record
* @ab: the audit_buffer
*
- * The netlink_* functions cannot be called inside an irq context, so
- * the audit buffer is placed on a queue and a tasklet is scheduled to
- * remove them from the queue outside the irq context. May be called in
- * any context.
+ * netlink_unicast() cannot be called inside an irq context because it blocks
+ * (last arg, flags, is not set to MSG_DONTWAIT), so the audit buffer is placed
+ * on a queue and a tasklet is scheduled to remove them from the queue outside
+ * the irq context. May be called in any context.
*/
void audit_log_end(struct audit_buffer *ab)
{
@@ -1914,6 +1958,18 @@ void audit_log_end(struct audit_buffer *ab)
audit_log_lost("rate limit exceeded");
} else {
struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
+
+ kauditd_send_multicast_skb(ab->skb);
+
+ /*
+ * The original kaudit unicast socket sends up messages with
+ * nlmsg_len set to the payload length rather than the entire
+ * message length. This breaks the standard set by netlink.
+ * The existing auditd daemon assumes this breakage. Fixing
+ * this would require co-ordinating a change in the established
+ * protocol between the kaudit kernel subsystem and the auditd
+ * userspace code.
+ */
nlh->nlmsg_len = ab->skb->len - NLMSG_HDRLEN;
if (audit_pid) {
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
index 019d4500844..5664985c46a 100644
--- a/kernel/context_tracking.c
+++ b/kernel/context_tracking.c
@@ -19,6 +19,7 @@
#include <linux/sched.h>
#include <linux/hardirq.h>
#include <linux/export.h>
+#include <linux/kprobes.h>
#define CREATE_TRACE_POINTS
#include <trace/events/context_tracking.h>
@@ -104,6 +105,7 @@ void context_tracking_user_enter(void)
}
local_irq_restore(flags);
}
+NOKPROBE_SYMBOL(context_tracking_user_enter);
#ifdef CONFIG_PREEMPT
/**
@@ -181,6 +183,7 @@ void context_tracking_user_exit(void)
}
local_irq_restore(flags);
}
+NOKPROBE_SYMBOL(context_tracking_user_exit);
/**
* __context_tracking_task_switch - context switch the syscall callbacks
diff --git a/kernel/cpu.c b/kernel/cpu.c
index acf791c55b7..a343bde710b 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -20,6 +20,7 @@
#include <linux/gfp.h>
#include <linux/suspend.h>
#include <linux/lockdep.h>
+#include <trace/events/power.h>
#include "smpboot.h"
@@ -520,7 +521,9 @@ int disable_nonboot_cpus(void)
for_each_online_cpu(cpu) {
if (cpu == first_cpu)
continue;
+ trace_suspend_resume(TPS("CPU_OFF"), cpu, true);
error = _cpu_down(cpu, 1);
+ trace_suspend_resume(TPS("CPU_OFF"), cpu, false);
if (!error)
cpumask_set_cpu(cpu, frozen_cpus);
else {
@@ -563,7 +566,9 @@ void __ref enable_nonboot_cpus(void)
arch_enable_nonboot_cpus_begin();
for_each_cpu(cpu, frozen_cpus) {
+ trace_suspend_resume(TPS("CPU_ON"), cpu, true);
error = _cpu_up(cpu, 1);
+ trace_suspend_resume(TPS("CPU_ON"), cpu, false);
if (!error) {
pr_info("CPU%d is up\n", cpu);
continue;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 24d35cc38e4..a33d9a2bcbd 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -40,6 +40,7 @@
#include <linux/mm_types.h>
#include <linux/cgroup.h>
#include <linux/module.h>
+#include <linux/mman.h>
#include "internal.h"
@@ -2974,6 +2975,22 @@ out:
local_irq_restore(flags);
}
+void perf_event_exec(void)
+{
+ struct perf_event_context *ctx;
+ int ctxn;
+
+ rcu_read_lock();
+ for_each_task_context_nr(ctxn) {
+ ctx = current->perf_event_ctxp[ctxn];
+ if (!ctx)
+ continue;
+
+ perf_event_enable_on_exec(ctx);
+ }
+ rcu_read_unlock();
+}
+
/*
* Cross CPU call to read the hardware event
*/
@@ -5075,21 +5092,9 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event)
NULL);
}
-void perf_event_comm(struct task_struct *task)
+void perf_event_comm(struct task_struct *task, bool exec)
{
struct perf_comm_event comm_event;
- struct perf_event_context *ctx;
- int ctxn;
-
- rcu_read_lock();
- for_each_task_context_nr(ctxn) {
- ctx = task->perf_event_ctxp[ctxn];
- if (!ctx)
- continue;
-
- perf_event_enable_on_exec(ctx);
- }
- rcu_read_unlock();
if (!atomic_read(&nr_comm_events))
return;
@@ -5101,7 +5106,7 @@ void perf_event_comm(struct task_struct *task)
.event_id = {
.header = {
.type = PERF_RECORD_COMM,
- .misc = 0,
+ .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
/* .size */
},
/* .pid */
@@ -5124,6 +5129,7 @@ struct perf_mmap_event {
int maj, min;
u64 ino;
u64 ino_generation;
+ u32 prot, flags;
struct {
struct perf_event_header header;
@@ -5165,6 +5171,8 @@ static void perf_event_mmap_output(struct perf_event *event,
mmap_event->event_id.header.size += sizeof(mmap_event->min);
mmap_event->event_id.header.size += sizeof(mmap_event->ino);
mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
+ mmap_event->event_id.header.size += sizeof(mmap_event->prot);
+ mmap_event->event_id.header.size += sizeof(mmap_event->flags);
}
perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
@@ -5183,6 +5191,8 @@ static void perf_event_mmap_output(struct perf_event *event,
perf_output_put(&handle, mmap_event->min);
perf_output_put(&handle, mmap_event->ino);
perf_output_put(&handle, mmap_event->ino_generation);
+ perf_output_put(&handle, mmap_event->prot);
+ perf_output_put(&handle, mmap_event->flags);
}
__output_copy(&handle, mmap_event->file_name,
@@ -5201,6 +5211,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
struct file *file = vma->vm_file;
int maj = 0, min = 0;
u64 ino = 0, gen = 0;
+ u32 prot = 0, flags = 0;
unsigned int size;
char tmp[16];
char *buf = NULL;
@@ -5231,6 +5242,28 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
gen = inode->i_generation;
maj = MAJOR(dev);
min = MINOR(dev);
+
+ if (vma->vm_flags & VM_READ)
+ prot |= PROT_READ;
+ if (vma->vm_flags & VM_WRITE)
+ prot |= PROT_WRITE;
+ if (vma->vm_flags & VM_EXEC)
+ prot |= PROT_EXEC;
+
+ if (vma->vm_flags & VM_MAYSHARE)
+ flags = MAP_SHARED;
+ else
+ flags = MAP_PRIVATE;
+
+ if (vma->vm_flags & VM_DENYWRITE)
+ flags |= MAP_DENYWRITE;
+ if (vma->vm_flags & VM_MAYEXEC)
+ flags |= MAP_EXECUTABLE;
+ if (vma->vm_flags & VM_LOCKED)
+ flags |= MAP_LOCKED;
+ if (vma->vm_flags & VM_HUGETLB)
+ flags |= MAP_HUGETLB;
+
goto got_name;
} else {
name = (char *)arch_vma_name(vma);
@@ -5271,6 +5304,8 @@ got_name:
mmap_event->min = min;
mmap_event->ino = ino;
mmap_event->ino_generation = gen;
+ mmap_event->prot = prot;
+ mmap_event->flags = flags;
if (!(vma->vm_flags & VM_EXEC))
mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
@@ -5311,6 +5346,8 @@ void perf_event_mmap(struct vm_area_struct *vma)
/* .min (attr_mmap2 only) */
/* .ino (attr_mmap2 only) */
/* .ino_generation (attr_mmap2 only) */
+ /* .prot (attr_mmap2 only) */
+ /* .flags (attr_mmap2 only) */
};
perf_event_mmap_event(&mmap_event);
@@ -6893,10 +6930,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
if (ret)
return -EFAULT;
- /* disabled for now */
- if (attr->mmap2)
- return -EINVAL;
-
if (attr->__reserved_1)
return -EINVAL;
@@ -7122,6 +7155,13 @@ SYSCALL_DEFINE5(perf_event_open,
}
}
+ if (is_sampling_event(event)) {
+ if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
+ err = -ENOTSUPP;
+ goto err_alloc;
+ }
+ }
+
account_event(event);
/*
@@ -7433,7 +7473,7 @@ __perf_event_exit_task(struct perf_event *child_event,
static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
{
- struct perf_event *child_event;
+ struct perf_event *child_event, *next;
struct perf_event_context *child_ctx;
unsigned long flags;
@@ -7487,7 +7527,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
*/
mutex_lock(&child_ctx->mutex);
- list_for_each_entry_rcu(child_event, &child_ctx->event_list, event_entry)
+ list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
__perf_event_exit_task(child_event, child_ctx, child);
mutex_unlock(&child_ctx->mutex);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index adcd76a9683..6f3254e8c13 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -36,6 +36,7 @@
#include "../../mm/internal.h" /* munlock_vma_page */
#include <linux/percpu-rwsem.h>
#include <linux/task_work.h>
+#include <linux/shmem_fs.h>
#include <linux/uprobes.h>
@@ -127,7 +128,7 @@ struct xol_area {
*/
static bool valid_vma(struct vm_area_struct *vma, bool is_register)
{
- vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_SHARED;
+ vm_flags_t flags = VM_HUGETLB | VM_MAYEXEC | VM_MAYSHARE;
if (is_register)
flags |= VM_WRITE;
@@ -279,18 +280,13 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
* supported by that architecture then we need to modify is_trap_at_addr and
* uprobe_write_opcode accordingly. This would never be a problem for archs
* that have fixed length instructions.
- */
-
-/*
+ *
* uprobe_write_opcode - write the opcode at a given virtual address.
* @mm: the probed process address space.
* @vaddr: the virtual address to store the opcode.
* @opcode: opcode to be written at @vaddr.
*
- * Called with mm->mmap_sem held (for read and with a reference to
- * mm).
- *
- * For mm @mm, write the opcode at @vaddr.
+ * Called with mm->mmap_sem held for write.
* Return 0 (success) or a negative errno.
*/
int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr,
@@ -310,21 +306,25 @@ retry:
if (ret <= 0)
goto put_old;
+ ret = anon_vma_prepare(vma);
+ if (ret)
+ goto put_old;
+
ret = -ENOMEM;
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vaddr);
if (!new_page)
goto put_old;
- __SetPageUptodate(new_page);
+ if (mem_cgroup_charge_anon(new_page, mm, GFP_KERNEL))
+ goto put_new;
+ __SetPageUptodate(new_page);
copy_highpage(new_page, old_page);
copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
- ret = anon_vma_prepare(vma);
- if (ret)
- goto put_new;
-
ret = __replace_page(vma, vaddr, old_page, new_page);
+ if (ret)
+ mem_cgroup_uncharge_page(new_page);
put_new:
page_cache_release(new_page);
@@ -537,14 +537,15 @@ static int __copy_insn(struct address_space *mapping, struct file *filp,
void *insn, int nbytes, loff_t offset)
{
struct page *page;
-
- if (!mapping->a_ops->readpage)
- return -EIO;
/*
- * Ensure that the page that has the original instruction is
- * populated and in page-cache.
+ * Ensure that the page that has the original instruction is populated
+ * and in page-cache. If ->readpage == NULL it must be shmem_mapping(),
+ * see uprobe_register().
*/
- page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp);
+ if (mapping->a_ops->readpage)
+ page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp);
+ else
+ page = shmem_read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT);
if (IS_ERR(page))
return PTR_ERR(page);
@@ -845,7 +846,7 @@ static void __uprobe_unregister(struct uprobe *uprobe, struct uprobe_consumer *u
{
int err;
- if (!consumer_del(uprobe, uc)) /* WARN? */
+ if (WARN_ON(!consumer_del(uprobe, uc)))
return;
err = register_for_each_vma(uprobe, NULL);
@@ -880,6 +881,9 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
if (!uc->handler && !uc->ret_handler)
return -EINVAL;
+ /* copy_insn() uses read_mapping_page() or shmem_read_mapping_page() */
+ if (!inode->i_mapping->a_ops->readpage && !shmem_mapping(inode->i_mapping))
+ return -EIO;
/* Racy, just to catch the obvious mistakes */
if (offset > i_size_read(inode))
return -EINVAL;
@@ -923,7 +927,7 @@ int uprobe_apply(struct inode *inode, loff_t offset,
int ret = -ENOENT;
uprobe = find_uprobe(inode, offset);
- if (!uprobe)
+ if (WARN_ON(!uprobe))
return ret;
down_write(&uprobe->register_rwsem);
@@ -948,7 +952,7 @@ void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consume
struct uprobe *uprobe;
uprobe = find_uprobe(inode, offset);
- if (!uprobe)
+ if (WARN_ON(!uprobe))
return;
down_write(&uprobe->register_rwsem);
@@ -1361,6 +1365,16 @@ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
return instruction_pointer(regs) - UPROBE_SWBP_INSN_SIZE;
}
+unsigned long uprobe_get_trap_addr(struct pt_regs *regs)
+{
+ struct uprobe_task *utask = current->utask;
+
+ if (unlikely(utask && utask->active_uprobe))
+ return utask->vaddr;
+
+ return instruction_pointer(regs);
+}
+
/*
* Called with no locks held.
* Called in context of a exiting or a exec-ing thread.
diff --git a/kernel/fork.c b/kernel/fork.c
index d2799d1fc95..6a13c46cd87 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1487,7 +1487,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
total_forks++;
spin_unlock(&current->sighand->siglock);
+ syscall_tracepoint_update(p);
write_unlock_irq(&tasklist_lock);
+
proc_fork_connector(p);
cgroup_post_fork(p);
if (clone_flags & CLONE_THREAD)
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 7339e42a85a..1487a123db5 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -455,9 +455,9 @@ EXPORT_SYMBOL_GPL(irq_alloc_hwirqs);
*/
void irq_free_hwirqs(unsigned int from, int cnt)
{
- int i;
+ int i, j;
- for (i = from; cnt > 0; i++, cnt--) {
+ for (i = from, j = cnt; j > 0; i++, j--) {
irq_set_status_flags(i, _IRQ_NOREQUEST | _IRQ_NOPROBE);
arch_teardown_hwirq(i);
}
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 6748688813d..369f41a9412 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1617,6 +1617,7 @@ static int __init crash_save_vmcoreinfo_init(void)
#ifdef CONFIG_MEMORY_FAILURE
VMCOREINFO_NUMBER(PG_hwpoison);
#endif
+ VMCOREINFO_NUMBER(PG_head_mask);
VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE);
arch_crash_save_vmcoreinfo();
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ceeadfcabb7..3214289df5a 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -86,21 +86,8 @@ static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
return &(kretprobe_table_locks[hash].lock);
}
-/*
- * Normally, functions that we'd want to prohibit kprobes in, are marked
- * __kprobes. But, there are cases where such functions already belong to
- * a different section (__sched for preempt_schedule)
- *
- * For such cases, we now have a blacklist
- */
-static struct kprobe_blackpoint kprobe_blacklist[] = {
- {"preempt_schedule",},
- {"native_get_debugreg",},
- {"irq_entries_start",},
- {"common_interrupt",},
- {"mcount",}, /* mcount can be called from everywhere */
- {NULL} /* Terminator */
-};
+/* Blacklist -- list of struct kprobe_blacklist_entry */
+static LIST_HEAD(kprobe_blacklist);
#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
/*
@@ -151,13 +138,13 @@ struct kprobe_insn_cache kprobe_insn_slots = {
.insn_size = MAX_INSN_SIZE,
.nr_garbage = 0,
};
-static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c);
+static int collect_garbage_slots(struct kprobe_insn_cache *c);
/**
* __get_insn_slot() - Find a slot on an executable page for an instruction.
* We allocate an executable page if there's no room on existing ones.
*/
-kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c)
+kprobe_opcode_t *__get_insn_slot(struct kprobe_insn_cache *c)
{
struct kprobe_insn_page *kip;
kprobe_opcode_t *slot = NULL;
@@ -214,7 +201,7 @@ out:
}
/* Return 1 if all garbages are collected, otherwise 0. */
-static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
+static int collect_one_slot(struct kprobe_insn_page *kip, int idx)
{
kip->slot_used[idx] = SLOT_CLEAN;
kip->nused--;
@@ -235,7 +222,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx)
return 0;
}
-static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c)
+static int collect_garbage_slots(struct kprobe_insn_cache *c)
{
struct kprobe_insn_page *kip, *next;
@@ -257,8 +244,8 @@ static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c)
return 0;
}
-void __kprobes __free_insn_slot(struct kprobe_insn_cache *c,
- kprobe_opcode_t *slot, int dirty)
+void __free_insn_slot(struct kprobe_insn_cache *c,
+ kprobe_opcode_t *slot, int dirty)
{
struct kprobe_insn_page *kip;
@@ -314,7 +301,7 @@ static inline void reset_kprobe_instance(void)
* OR
* - with preemption disabled - from arch/xxx/kernel/kprobes.c
*/
-struct kprobe __kprobes *get_kprobe(void *addr)
+struct kprobe *get_kprobe(void *addr)
{
struct hlist_head *head;
struct kprobe *p;
@@ -327,8 +314,9 @@ struct kprobe __kprobes *get_kprobe(void *addr)
return NULL;
}
+NOKPROBE_SYMBOL(get_kprobe);
-static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);
+static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs);
/* Return true if the kprobe is an aggregator */
static inline int kprobe_aggrprobe(struct kprobe *p)
@@ -360,7 +348,7 @@ static bool kprobes_allow_optimization;
* Call all pre_handler on the list, but ignores its return value.
* This must be called from arch-dep optimized caller.
*/
-void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
+void opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
struct kprobe *kp;
@@ -372,9 +360,10 @@ void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs)
reset_kprobe_instance();
}
}
+NOKPROBE_SYMBOL(opt_pre_handler);
/* Free optimized instructions and optimized_kprobe */
-static __kprobes void free_aggr_kprobe(struct kprobe *p)
+static void free_aggr_kprobe(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -412,7 +401,7 @@ static inline int kprobe_disarmed(struct kprobe *p)
}
/* Return true(!0) if the probe is queued on (un)optimizing lists */
-static int __kprobes kprobe_queued(struct kprobe *p)
+static int kprobe_queued(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -428,7 +417,7 @@ static int __kprobes kprobe_queued(struct kprobe *p)
* Return an optimized kprobe whose optimizing code replaces
* instructions including addr (exclude breakpoint).
*/
-static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr)
+static struct kprobe *get_optimized_kprobe(unsigned long addr)
{
int i;
struct kprobe *p = NULL;
@@ -460,7 +449,7 @@ static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer);
* Optimize (replace a breakpoint with a jump) kprobes listed on
* optimizing_list.
*/
-static __kprobes void do_optimize_kprobes(void)
+static void do_optimize_kprobes(void)
{
/* Optimization never be done when disarmed */
if (kprobes_all_disarmed || !kprobes_allow_optimization ||
@@ -488,7 +477,7 @@ static __kprobes void do_optimize_kprobes(void)
* Unoptimize (replace a jump with a breakpoint and remove the breakpoint
* if need) kprobes listed on unoptimizing_list.
*/
-static __kprobes void do_unoptimize_kprobes(void)
+static void do_unoptimize_kprobes(void)
{
struct optimized_kprobe *op, *tmp;
@@ -520,7 +509,7 @@ static __kprobes void do_unoptimize_kprobes(void)
}
/* Reclaim all kprobes on the free_list */
-static __kprobes void do_free_cleaned_kprobes(void)
+static void do_free_cleaned_kprobes(void)
{
struct optimized_kprobe *op, *tmp;
@@ -532,13 +521,13 @@ static __kprobes void do_free_cleaned_kprobes(void)
}
/* Start optimizer after OPTIMIZE_DELAY passed */
-static __kprobes void kick_kprobe_optimizer(void)
+static void kick_kprobe_optimizer(void)
{
schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY);
}
/* Kprobe jump optimizer */
-static __kprobes void kprobe_optimizer(struct work_struct *work)
+static void kprobe_optimizer(struct work_struct *work)
{
mutex_lock(&kprobe_mutex);
/* Lock modules while optimizing kprobes */
@@ -574,7 +563,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work)
}
/* Wait for completing optimization and unoptimization */
-static __kprobes void wait_for_kprobe_optimizer(void)
+static void wait_for_kprobe_optimizer(void)
{
mutex_lock(&kprobe_mutex);
@@ -593,7 +582,7 @@ static __kprobes void wait_for_kprobe_optimizer(void)
}
/* Optimize kprobe if p is ready to be optimized */
-static __kprobes void optimize_kprobe(struct kprobe *p)
+static void optimize_kprobe(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -627,7 +616,7 @@ static __kprobes void optimize_kprobe(struct kprobe *p)
}
/* Short cut to direct unoptimizing */
-static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op)
+static void force_unoptimize_kprobe(struct optimized_kprobe *op)
{
get_online_cpus();
arch_unoptimize_kprobe(op);
@@ -637,7 +626,7 @@ static __kprobes void force_unoptimize_kprobe(struct optimized_kprobe *op)
}
/* Unoptimize a kprobe if p is optimized */
-static __kprobes void unoptimize_kprobe(struct kprobe *p, bool force)
+static void unoptimize_kprobe(struct kprobe *p, bool force)
{
struct optimized_kprobe *op;
@@ -697,7 +686,7 @@ static void reuse_unused_kprobe(struct kprobe *ap)
}
/* Remove optimized instructions */
-static void __kprobes kill_optimized_kprobe(struct kprobe *p)
+static void kill_optimized_kprobe(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -723,7 +712,7 @@ static void __kprobes kill_optimized_kprobe(struct kprobe *p)
}
/* Try to prepare optimized instructions */
-static __kprobes void prepare_optimized_kprobe(struct kprobe *p)
+static void prepare_optimized_kprobe(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -732,7 +721,7 @@ static __kprobes void prepare_optimized_kprobe(struct kprobe *p)
}
/* Allocate new optimized_kprobe and try to prepare optimized instructions */
-static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
+static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
{
struct optimized_kprobe *op;
@@ -747,13 +736,13 @@ static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
return &op->kp;
}
-static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p);
+static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p);
/*
* Prepare an optimized_kprobe and optimize it
* NOTE: p must be a normal registered kprobe
*/
-static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
+static void try_to_optimize_kprobe(struct kprobe *p)
{
struct kprobe *ap;
struct optimized_kprobe *op;
@@ -787,7 +776,7 @@ out:
}
#ifdef CONFIG_SYSCTL
-static void __kprobes optimize_all_kprobes(void)
+static void optimize_all_kprobes(void)
{
struct hlist_head *head;
struct kprobe *p;
@@ -810,7 +799,7 @@ out:
mutex_unlock(&kprobe_mutex);
}
-static void __kprobes unoptimize_all_kprobes(void)
+static void unoptimize_all_kprobes(void)
{
struct hlist_head *head;
struct kprobe *p;
@@ -861,7 +850,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
#endif /* CONFIG_SYSCTL */
/* Put a breakpoint for a probe. Must be called with text_mutex locked */
-static void __kprobes __arm_kprobe(struct kprobe *p)
+static void __arm_kprobe(struct kprobe *p)
{
struct kprobe *_p;
@@ -876,7 +865,7 @@ static void __kprobes __arm_kprobe(struct kprobe *p)
}
/* Remove the breakpoint of a probe. Must be called with text_mutex locked */
-static void __kprobes __disarm_kprobe(struct kprobe *p, bool reopt)
+static void __disarm_kprobe(struct kprobe *p, bool reopt)
{
struct kprobe *_p;
@@ -911,13 +900,13 @@ static void reuse_unused_kprobe(struct kprobe *ap)
BUG_ON(kprobe_unused(ap));
}
-static __kprobes void free_aggr_kprobe(struct kprobe *p)
+static void free_aggr_kprobe(struct kprobe *p)
{
arch_remove_kprobe(p);
kfree(p);
}
-static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
+static struct kprobe *alloc_aggr_kprobe(struct kprobe *p)
{
return kzalloc(sizeof(struct kprobe), GFP_KERNEL);
}
@@ -931,7 +920,7 @@ static struct ftrace_ops kprobe_ftrace_ops __read_mostly = {
static int kprobe_ftrace_enabled;
/* Must ensure p->addr is really on ftrace */
-static int __kprobes prepare_kprobe(struct kprobe *p)
+static int prepare_kprobe(struct kprobe *p)
{
if (!kprobe_ftrace(p))
return arch_prepare_kprobe(p);
@@ -940,7 +929,7 @@ static int __kprobes prepare_kprobe(struct kprobe *p)
}
/* Caller must lock kprobe_mutex */
-static void __kprobes arm_kprobe_ftrace(struct kprobe *p)
+static void arm_kprobe_ftrace(struct kprobe *p)
{
int ret;
@@ -955,7 +944,7 @@ static void __kprobes arm_kprobe_ftrace(struct kprobe *p)
}
/* Caller must lock kprobe_mutex */
-static void __kprobes disarm_kprobe_ftrace(struct kprobe *p)
+static void disarm_kprobe_ftrace(struct kprobe *p)
{
int ret;
@@ -975,7 +964,7 @@ static void __kprobes disarm_kprobe_ftrace(struct kprobe *p)
#endif
/* Arm a kprobe with text_mutex */
-static void __kprobes arm_kprobe(struct kprobe *kp)
+static void arm_kprobe(struct kprobe *kp)
{
if (unlikely(kprobe_ftrace(kp))) {
arm_kprobe_ftrace(kp);
@@ -992,7 +981,7 @@ static void __kprobes arm_kprobe(struct kprobe *kp)
}
/* Disarm a kprobe with text_mutex */
-static void __kprobes disarm_kprobe(struct kprobe *kp, bool reopt)
+static void disarm_kprobe(struct kprobe *kp, bool reopt)
{
if (unlikely(kprobe_ftrace(kp))) {
disarm_kprobe_ftrace(kp);
@@ -1008,7 +997,7 @@ static void __kprobes disarm_kprobe(struct kprobe *kp, bool reopt)
* Aggregate handlers for multiple kprobes support - these handlers
* take care of invoking the individual kprobe handlers on p->list
*/
-static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
+static int aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
{
struct kprobe *kp;
@@ -1022,9 +1011,10 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
}
return 0;
}
+NOKPROBE_SYMBOL(aggr_pre_handler);
-static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
- unsigned long flags)
+static void aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
+ unsigned long flags)
{
struct kprobe *kp;
@@ -1036,9 +1026,10 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
}
}
}
+NOKPROBE_SYMBOL(aggr_post_handler);
-static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
- int trapnr)
+static int aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
+ int trapnr)
{
struct kprobe *cur = __this_cpu_read(kprobe_instance);
@@ -1052,8 +1043,9 @@ static int __kprobes aggr_fault_handler(struct kprobe *p, struct pt_regs *regs,
}
return 0;
}
+NOKPROBE_SYMBOL(aggr_fault_handler);
-static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
+static int aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
{
struct kprobe *cur = __this_cpu_read(kprobe_instance);
int ret = 0;
@@ -1065,9 +1057,10 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs)
reset_kprobe_instance();
return ret;
}
+NOKPROBE_SYMBOL(aggr_break_handler);
/* Walks the list and increments nmissed count for multiprobe case */
-void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
+void kprobes_inc_nmissed_count(struct kprobe *p)
{
struct kprobe *kp;
if (!kprobe_aggrprobe(p)) {
@@ -1078,9 +1071,10 @@ void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
}
return;
}
+NOKPROBE_SYMBOL(kprobes_inc_nmissed_count);
-void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
- struct hlist_head *head)
+void recycle_rp_inst(struct kretprobe_instance *ri,
+ struct hlist_head *head)
{
struct kretprobe *rp = ri->rp;
@@ -1095,8 +1089,9 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
/* Unregistering */
hlist_add_head(&ri->hlist, head);
}
+NOKPROBE_SYMBOL(recycle_rp_inst);
-void __kprobes kretprobe_hash_lock(struct task_struct *tsk,
+void kretprobe_hash_lock(struct task_struct *tsk,
struct hlist_head **head, unsigned long *flags)
__acquires(hlist_lock)
{
@@ -1107,17 +1102,19 @@ __acquires(hlist_lock)
hlist_lock = kretprobe_table_lock_ptr(hash);
raw_spin_lock_irqsave(hlist_lock, *flags);
}
+NOKPROBE_SYMBOL(kretprobe_hash_lock);
-static void __kprobes kretprobe_table_lock(unsigned long hash,
- unsigned long *flags)
+static void kretprobe_table_lock(unsigned long hash,
+ unsigned long *flags)
__acquires(hlist_lock)
{
raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
raw_spin_lock_irqsave(hlist_lock, *flags);
}
+NOKPROBE_SYMBOL(kretprobe_table_lock);
-void __kprobes kretprobe_hash_unlock(struct task_struct *tsk,
- unsigned long *flags)
+void kretprobe_hash_unlock(struct task_struct *tsk,
+ unsigned long *flags)
__releases(hlist_lock)
{
unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
@@ -1126,14 +1123,16 @@ __releases(hlist_lock)
hlist_lock = kretprobe_table_lock_ptr(hash);
raw_spin_unlock_irqrestore(hlist_lock, *flags);
}
+NOKPROBE_SYMBOL(kretprobe_hash_unlock);
-static void __kprobes kretprobe_table_unlock(unsigned long hash,
- unsigned long *flags)
+static void kretprobe_table_unlock(unsigned long hash,
+ unsigned long *flags)
__releases(hlist_lock)
{
raw_spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
raw_spin_unlock_irqrestore(hlist_lock, *flags);
}
+NOKPROBE_SYMBOL(kretprobe_table_unlock);
/*
* This function is called from finish_task_switch when task tk becomes dead,
@@ -1141,7 +1140,7 @@ __releases(hlist_lock)
* with this task. These left over instances represent probed functions
* that have been called but will never return.
*/
-void __kprobes kprobe_flush_task(struct task_struct *tk)
+void kprobe_flush_task(struct task_struct *tk)
{
struct kretprobe_instance *ri;
struct hlist_head *head, empty_rp;
@@ -1166,6 +1165,7 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
kfree(ri);
}
}
+NOKPROBE_SYMBOL(kprobe_flush_task);
static inline void free_rp_inst(struct kretprobe *rp)
{
@@ -1178,7 +1178,7 @@ static inline void free_rp_inst(struct kretprobe *rp)
}
}
-static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
+static void cleanup_rp_inst(struct kretprobe *rp)
{
unsigned long flags, hash;
struct kretprobe_instance *ri;
@@ -1197,12 +1197,13 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
}
free_rp_inst(rp);
}
+NOKPROBE_SYMBOL(cleanup_rp_inst);
/*
* Add the new probe to ap->list. Fail if this is the
* second jprobe at the address - two jprobes can't coexist
*/
-static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
+static int add_new_kprobe(struct kprobe *ap, struct kprobe *p)
{
BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
@@ -1226,7 +1227,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
* Fill in the required fields of the "manager kprobe". Replace the
* earlier kprobe in the hlist with the manager kprobe
*/
-static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
+static void init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
{
/* Copy p's insn slot to ap */
copy_kprobe(p, ap);
@@ -1252,8 +1253,7 @@ static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
* This is the second or subsequent kprobe at the address - handle
* the intricacies
*/
-static int __kprobes register_aggr_kprobe(struct kprobe *orig_p,
- struct kprobe *p)
+static int register_aggr_kprobe(struct kprobe *orig_p, struct kprobe *p)
{
int ret = 0;
struct kprobe *ap = orig_p;
@@ -1324,25 +1324,29 @@ out:
return ret;
}
-static int __kprobes in_kprobes_functions(unsigned long addr)
+bool __weak arch_within_kprobe_blacklist(unsigned long addr)
{
- struct kprobe_blackpoint *kb;
+ /* The __kprobes marked functions and entry code must not be probed */
+ return addr >= (unsigned long)__kprobes_text_start &&
+ addr < (unsigned long)__kprobes_text_end;
+}
- if (addr >= (unsigned long)__kprobes_text_start &&
- addr < (unsigned long)__kprobes_text_end)
- return -EINVAL;
+static bool within_kprobe_blacklist(unsigned long addr)
+{
+ struct kprobe_blacklist_entry *ent;
+
+ if (arch_within_kprobe_blacklist(addr))
+ return true;
/*
* If there exists a kprobe_blacklist, verify and
* fail any probe registration in the prohibited area
*/
- for (kb = kprobe_blacklist; kb->name != NULL; kb++) {
- if (kb->start_addr) {
- if (addr >= kb->start_addr &&
- addr < (kb->start_addr + kb->range))
- return -EINVAL;
- }
+ list_for_each_entry(ent, &kprobe_blacklist, list) {
+ if (addr >= ent->start_addr && addr < ent->end_addr)
+ return true;
}
- return 0;
+
+ return false;
}
/*
@@ -1351,7 +1355,7 @@ static int __kprobes in_kprobes_functions(unsigned long addr)
* This returns encoded errors if it fails to look up symbol or invalid
* combination of parameters.
*/
-static kprobe_opcode_t __kprobes *kprobe_addr(struct kprobe *p)
+static kprobe_opcode_t *kprobe_addr(struct kprobe *p)
{
kprobe_opcode_t *addr = p->addr;
@@ -1374,7 +1378,7 @@ invalid:
}
/* Check passed kprobe is valid and return kprobe in kprobe_table. */
-static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
+static struct kprobe *__get_valid_kprobe(struct kprobe *p)
{
struct kprobe *ap, *list_p;
@@ -1406,8 +1410,8 @@ static inline int check_kprobe_rereg(struct kprobe *p)
return ret;
}
-static __kprobes int check_kprobe_address_safe(struct kprobe *p,
- struct module **probed_mod)
+static int check_kprobe_address_safe(struct kprobe *p,
+ struct module **probed_mod)
{
int ret = 0;
unsigned long ftrace_addr;
@@ -1433,7 +1437,7 @@ static __kprobes int check_kprobe_address_safe(struct kprobe *p,
/* Ensure it is not in reserved area nor out of text */
if (!kernel_text_address((unsigned long) p->addr) ||
- in_kprobes_functions((unsigned long) p->addr) ||
+ within_kprobe_blacklist((unsigned long) p->addr) ||
jump_label_text_reserved(p->addr, p->addr)) {
ret = -EINVAL;
goto out;
@@ -1469,7 +1473,7 @@ out:
return ret;
}
-int __kprobes register_kprobe(struct kprobe *p)
+int register_kprobe(struct kprobe *p)
{
int ret;
struct kprobe *old_p;
@@ -1531,7 +1535,7 @@ out:
EXPORT_SYMBOL_GPL(register_kprobe);
/* Check if all probes on the aggrprobe are disabled */
-static int __kprobes aggr_kprobe_disabled(struct kprobe *ap)
+static int aggr_kprobe_disabled(struct kprobe *ap)
{
struct kprobe *kp;
@@ -1547,7 +1551,7 @@ static int __kprobes aggr_kprobe_disabled(struct kprobe *ap)
}
/* Disable one kprobe: Make sure called under kprobe_mutex is locked */
-static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p)
+static struct kprobe *__disable_kprobe(struct kprobe *p)
{
struct kprobe *orig_p;
@@ -1574,7 +1578,7 @@ static struct kprobe *__kprobes __disable_kprobe(struct kprobe *p)
/*
* Unregister a kprobe without a scheduler synchronization.
*/
-static int __kprobes __unregister_kprobe_top(struct kprobe *p)
+static int __unregister_kprobe_top(struct kprobe *p)
{
struct kprobe *ap, *list_p;
@@ -1631,7 +1635,7 @@ disarmed:
return 0;
}
-static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
+static void __unregister_kprobe_bottom(struct kprobe *p)
{
struct kprobe *ap;
@@ -1647,7 +1651,7 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p)
/* Otherwise, do nothing. */
}
-int __kprobes register_kprobes(struct kprobe **kps, int num)
+int register_kprobes(struct kprobe **kps, int num)
{
int i, ret = 0;
@@ -1665,13 +1669,13 @@ int __kprobes register_kprobes(struct kprobe **kps, int num)
}
EXPORT_SYMBOL_GPL(register_kprobes);
-void __kprobes unregister_kprobe(struct kprobe *p)
+void unregister_kprobe(struct kprobe *p)
{
unregister_kprobes(&p, 1);
}
EXPORT_SYMBOL_GPL(unregister_kprobe);
-void __kprobes unregister_kprobes(struct kprobe **kps, int num)
+void unregister_kprobes(struct kprobe **kps, int num)
{
int i;
@@ -1700,7 +1704,7 @@ unsigned long __weak arch_deref_entry_point(void *entry)
return (unsigned long)entry;
}
-int __kprobes register_jprobes(struct jprobe **jps, int num)
+int register_jprobes(struct jprobe **jps, int num)
{
struct jprobe *jp;
int ret = 0, i;
@@ -1731,19 +1735,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num)
}
EXPORT_SYMBOL_GPL(register_jprobes);
-int __kprobes register_jprobe(struct jprobe *jp)
+int register_jprobe(struct jprobe *jp)
{
return register_jprobes(&jp, 1);
}
EXPORT_SYMBOL_GPL(register_jprobe);
-void __kprobes unregister_jprobe(struct jprobe *jp)
+void unregister_jprobe(struct jprobe *jp)
{
unregister_jprobes(&jp, 1);
}
EXPORT_SYMBOL_GPL(unregister_jprobe);
-void __kprobes unregister_jprobes(struct jprobe **jps, int num)
+void unregister_jprobes(struct jprobe **jps, int num)
{
int i;
@@ -1768,8 +1772,7 @@ EXPORT_SYMBOL_GPL(unregister_jprobes);
* This kprobe pre_handler is registered with every kretprobe. When probe
* hits it will set up the return probe.
*/
-static int __kprobes pre_handler_kretprobe(struct kprobe *p,
- struct pt_regs *regs)
+static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
{
struct kretprobe *rp = container_of(p, struct kretprobe, kp);
unsigned long hash, flags = 0;
@@ -1807,8 +1810,9 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
}
return 0;
}
+NOKPROBE_SYMBOL(pre_handler_kretprobe);
-int __kprobes register_kretprobe(struct kretprobe *rp)
+int register_kretprobe(struct kretprobe *rp)
{
int ret = 0;
struct kretprobe_instance *inst;
@@ -1861,7 +1865,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
}
EXPORT_SYMBOL_GPL(register_kretprobe);
-int __kprobes register_kretprobes(struct kretprobe **rps, int num)
+int register_kretprobes(struct kretprobe **rps, int num)
{
int ret = 0, i;
@@ -1879,13 +1883,13 @@ int __kprobes register_kretprobes(struct kretprobe **rps, int num)
}
EXPORT_SYMBOL_GPL(register_kretprobes);
-void __kprobes unregister_kretprobe(struct kretprobe *rp)
+void unregister_kretprobe(struct kretprobe *rp)
{
unregister_kretprobes(&rp, 1);
}
EXPORT_SYMBOL_GPL(unregister_kretprobe);
-void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
+void unregister_kretprobes(struct kretprobe **rps, int num)
{
int i;
@@ -1908,38 +1912,38 @@ void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
EXPORT_SYMBOL_GPL(unregister_kretprobes);
#else /* CONFIG_KRETPROBES */
-int __kprobes register_kretprobe(struct kretprobe *rp)
+int register_kretprobe(struct kretprobe *rp)
{
return -ENOSYS;
}
EXPORT_SYMBOL_GPL(register_kretprobe);
-int __kprobes register_kretprobes(struct kretprobe **rps, int num)
+int register_kretprobes(struct kretprobe **rps, int num)
{
return -ENOSYS;
}
EXPORT_SYMBOL_GPL(register_kretprobes);
-void __kprobes unregister_kretprobe(struct kretprobe *rp)
+void unregister_kretprobe(struct kretprobe *rp)
{
}
EXPORT_SYMBOL_GPL(unregister_kretprobe);
-void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
+void unregister_kretprobes(struct kretprobe **rps, int num)
{
}
EXPORT_SYMBOL_GPL(unregister_kretprobes);
-static int __kprobes pre_handler_kretprobe(struct kprobe *p,
- struct pt_regs *regs)
+static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs)
{
return 0;
}
+NOKPROBE_SYMBOL(pre_handler_kretprobe);
#endif /* CONFIG_KRETPROBES */
/* Set the kprobe gone and remove its instruction buffer. */
-static void __kprobes kill_kprobe(struct kprobe *p)
+static void kill_kprobe(struct kprobe *p)
{
struct kprobe *kp;
@@ -1963,7 +1967,7 @@ static void __kprobes kill_kprobe(struct kprobe *p)
}
/* Disable one kprobe */
-int __kprobes disable_kprobe(struct kprobe *kp)
+int disable_kprobe(struct kprobe *kp)
{
int ret = 0;
@@ -1979,7 +1983,7 @@ int __kprobes disable_kprobe(struct kprobe *kp)
EXPORT_SYMBOL_GPL(disable_kprobe);
/* Enable one kprobe */
-int __kprobes enable_kprobe(struct kprobe *kp)
+int enable_kprobe(struct kprobe *kp)
{
int ret = 0;
struct kprobe *p;
@@ -2012,16 +2016,49 @@ out:
}
EXPORT_SYMBOL_GPL(enable_kprobe);
-void __kprobes dump_kprobe(struct kprobe *kp)
+void dump_kprobe(struct kprobe *kp)
{
printk(KERN_WARNING "Dumping kprobe:\n");
printk(KERN_WARNING "Name: %s\nAddress: %p\nOffset: %x\n",
kp->symbol_name, kp->addr, kp->offset);
}
+NOKPROBE_SYMBOL(dump_kprobe);
+
+/*
+ * Lookup and populate the kprobe_blacklist.
+ *
+ * Unlike the kretprobe blacklist, we'll need to determine
+ * the range of addresses that belong to the said functions,
+ * since a kprobe need not necessarily be at the beginning
+ * of a function.
+ */
+static int __init populate_kprobe_blacklist(unsigned long *start,
+ unsigned long *end)
+{
+ unsigned long *iter;
+ struct kprobe_blacklist_entry *ent;
+ unsigned long offset = 0, size = 0;
+
+ for (iter = start; iter < end; iter++) {
+ if (!kallsyms_lookup_size_offset(*iter, &size, &offset)) {
+ pr_err("Failed to find blacklist %p\n", (void *)*iter);
+ continue;
+ }
+
+ ent = kmalloc(sizeof(*ent), GFP_KERNEL);
+ if (!ent)
+ return -ENOMEM;
+ ent->start_addr = *iter;
+ ent->end_addr = *iter + size;
+ INIT_LIST_HEAD(&ent->list);
+ list_add_tail(&ent->list, &kprobe_blacklist);
+ }
+ return 0;
+}
/* Module notifier call back, checking kprobes on the module */
-static int __kprobes kprobes_module_callback(struct notifier_block *nb,
- unsigned long val, void *data)
+static int kprobes_module_callback(struct notifier_block *nb,
+ unsigned long val, void *data)
{
struct module *mod = data;
struct hlist_head *head;
@@ -2062,14 +2099,13 @@ static struct notifier_block kprobe_module_nb = {
.priority = 0
};
+/* Markers of _kprobe_blacklist section */
+extern unsigned long __start_kprobe_blacklist[];
+extern unsigned long __stop_kprobe_blacklist[];
+
static int __init init_kprobes(void)
{
int i, err = 0;
- unsigned long offset = 0, size = 0;
- char *modname, namebuf[KSYM_NAME_LEN];
- const char *symbol_name;
- void *addr;
- struct kprobe_blackpoint *kb;
/* FIXME allocate the probe table, currently defined statically */
/* initialize all list heads */
@@ -2079,26 +2115,11 @@ static int __init init_kprobes(void)
raw_spin_lock_init(&(kretprobe_table_locks[i].lock));
}
- /*
- * Lookup and populate the kprobe_blacklist.
- *
- * Unlike the kretprobe blacklist, we'll need to determine
- * the range of addresses that belong to the said functions,
- * since a kprobe need not necessarily be at the beginning
- * of a function.
- */
- for (kb = kprobe_blacklist; kb->name != NULL; kb++) {
- kprobe_lookup_name(kb->name, addr);
- if (!addr)
- continue;
-
- kb->start_addr = (unsigned long)addr;
- symbol_name = kallsyms_lookup(kb->start_addr,
- &size, &offset, &modname, namebuf);
- if (!symbol_name)
- kb->range = 0;
- else
- kb->range = size;
+ err = populate_kprobe_blacklist(__start_kprobe_blacklist,
+ __stop_kprobe_blacklist);
+ if (err) {
+ pr_err("kprobes: failed to populate blacklist: %d\n", err);
+ pr_err("Please take care of using kprobes.\n");
}
if (kretprobe_blacklist_size) {
@@ -2138,7 +2159,7 @@ static int __init init_kprobes(void)
}
#ifdef CONFIG_DEBUG_FS
-static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
+static void report_probe(struct seq_file *pi, struct kprobe *p,
const char *sym, int offset, char *modname, struct kprobe *pp)
{
char *kprobe_type;
@@ -2167,12 +2188,12 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
(kprobe_ftrace(pp) ? "[FTRACE]" : ""));
}
-static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
+static void *kprobe_seq_start(struct seq_file *f, loff_t *pos)
{
return (*pos < KPROBE_TABLE_SIZE) ? pos : NULL;
}
-static void __kprobes *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos)
+static void *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos)
{
(*pos)++;
if (*pos >= KPROBE_TABLE_SIZE)
@@ -2180,12 +2201,12 @@ static void __kprobes *kprobe_seq_next(struct seq_file *f, void *v, loff_t *pos)
return pos;
}
-static void __kprobes kprobe_seq_stop(struct seq_file *f, void *v)
+static void kprobe_seq_stop(struct seq_file *f, void *v)
{
/* Nothing to do */
}
-static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v)
+static int show_kprobe_addr(struct seq_file *pi, void *v)
{
struct hlist_head *head;
struct kprobe *p, *kp;
@@ -2216,7 +2237,7 @@ static const struct seq_operations kprobes_seq_ops = {
.show = show_kprobe_addr
};
-static int __kprobes kprobes_open(struct inode *inode, struct file *filp)
+static int kprobes_open(struct inode *inode, struct file *filp)
{
return seq_open(filp, &kprobes_seq_ops);
}
@@ -2228,7 +2249,47 @@ static const struct file_operations debugfs_kprobes_operations = {
.release = seq_release,
};
-static void __kprobes arm_all_kprobes(void)
+/* kprobes/blacklist -- shows which functions can not be probed */
+static void *kprobe_blacklist_seq_start(struct seq_file *m, loff_t *pos)
+{
+ return seq_list_start(&kprobe_blacklist, *pos);
+}
+
+static void *kprobe_blacklist_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+ return seq_list_next(v, &kprobe_blacklist, pos);
+}
+
+static int kprobe_blacklist_seq_show(struct seq_file *m, void *v)
+{
+ struct kprobe_blacklist_entry *ent =
+ list_entry(v, struct kprobe_blacklist_entry, list);
+
+ seq_printf(m, "0x%p-0x%p\t%ps\n", (void *)ent->start_addr,
+ (void *)ent->end_addr, (void *)ent->start_addr);
+ return 0;
+}
+
+static const struct seq_operations kprobe_blacklist_seq_ops = {
+ .start = kprobe_blacklist_seq_start,
+ .next = kprobe_blacklist_seq_next,
+ .stop = kprobe_seq_stop, /* Reuse void function */
+ .show = kprobe_blacklist_seq_show,
+};
+
+static int kprobe_blacklist_open(struct inode *inode, struct file *filp)
+{
+ return seq_open(filp, &kprobe_blacklist_seq_ops);
+}
+
+static const struct file_operations debugfs_kprobe_blacklist_ops = {
+ .open = kprobe_blacklist_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
+static void arm_all_kprobes(void)
{
struct hlist_head *head;
struct kprobe *p;
@@ -2256,7 +2317,7 @@ already_enabled:
return;
}
-static void __kprobes disarm_all_kprobes(void)
+static void disarm_all_kprobes(void)
{
struct hlist_head *head;
struct kprobe *p;
@@ -2340,7 +2401,7 @@ static const struct file_operations fops_kp = {
.llseek = default_llseek,
};
-static int __kprobes debugfs_kprobe_init(void)
+static int __init debugfs_kprobe_init(void)
{
struct dentry *dir, *file;
unsigned int value = 1;
@@ -2351,19 +2412,24 @@ static int __kprobes debugfs_kprobe_init(void)
file = debugfs_create_file("list", 0444, dir, NULL,
&debugfs_kprobes_operations);
- if (!file) {
- debugfs_remove(dir);
- return -ENOMEM;
- }
+ if (!file)
+ goto error;
file = debugfs_create_file("enabled", 0600, dir,
&value, &fops_kp);
- if (!file) {
- debugfs_remove(dir);
- return -ENOMEM;
- }
+ if (!file)
+ goto error;
+
+ file = debugfs_create_file("blacklist", 0444, dir, NULL,
+ &debugfs_kprobe_blacklist_ops);
+ if (!file)
+ goto error;
return 0;
+
+error:
+ debugfs_remove(dir);
+ return -ENOMEM;
}
late_initcall(debugfs_kprobe_init);
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index b8bdcd4785b..8541bfdfd23 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -24,4 +24,5 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o
+obj-$(CONFIG_QUEUE_RWLOCK) += qrwlock.o
obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
new file mode 100644
index 00000000000..fb5b8ac411a
--- /dev/null
+++ b/kernel/locking/qrwlock.c
@@ -0,0 +1,133 @@
+/*
+ * Queue read/write lock
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * (C) Copyright 2013-2014 Hewlett-Packard Development Company, L.P.
+ *
+ * Authors: Waiman Long <waiman.long@hp.com>
+ */
+#include <linux/smp.h>
+#include <linux/bug.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+#include <linux/hardirq.h>
+#include <linux/mutex.h>
+#include <asm/qrwlock.h>
+
+/**
+ * rspin_until_writer_unlock - inc reader count & spin until writer is gone
+ * @lock : Pointer to queue rwlock structure
+ * @writer: Current queue rwlock writer status byte
+ *
+ * In interrupt context or at the head of the queue, the reader will just
+ * increment the reader count & wait until the writer releases the lock.
+ */
+static __always_inline void
+rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
+{
+ while ((cnts & _QW_WMASK) == _QW_LOCKED) {
+ arch_mutex_cpu_relax();
+ cnts = smp_load_acquire((u32 *)&lock->cnts);
+ }
+}
+
+/**
+ * queue_read_lock_slowpath - acquire read lock of a queue rwlock
+ * @lock: Pointer to queue rwlock structure
+ */
+void queue_read_lock_slowpath(struct qrwlock *lock)
+{
+ u32 cnts;
+
+ /*
+ * Readers come here when they cannot get the lock without waiting
+ */
+ if (unlikely(in_interrupt())) {
+ /*
+ * Readers in interrupt context will spin until the lock is
+ * available without waiting in the queue.
+ */
+ cnts = smp_load_acquire((u32 *)&lock->cnts);
+ rspin_until_writer_unlock(lock, cnts);
+ return;
+ }
+ atomic_sub(_QR_BIAS, &lock->cnts);
+
+ /*
+ * Put the reader into the wait queue
+ */
+ arch_spin_lock(&lock->lock);
+
+ /*
+ * At the head of the wait queue now, wait until the writer state
+ * goes to 0 and then try to increment the reader count and get
+ * the lock. It is possible that an incoming writer may steal the
+ * lock in the interim, so it is necessary to check the writer byte
+ * to make sure that the write lock isn't taken.
+ */
+ while (atomic_read(&lock->cnts) & _QW_WMASK)
+ arch_mutex_cpu_relax();
+
+ cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS;
+ rspin_until_writer_unlock(lock, cnts);
+
+ /*
+ * Signal the next one in queue to become queue head
+ */
+ arch_spin_unlock(&lock->lock);
+}
+EXPORT_SYMBOL(queue_read_lock_slowpath);
+
+/**
+ * queue_write_lock_slowpath - acquire write lock of a queue rwlock
+ * @lock : Pointer to queue rwlock structure
+ */
+void queue_write_lock_slowpath(struct qrwlock *lock)
+{
+ u32 cnts;
+
+ /* Put the writer into the wait queue */
+ arch_spin_lock(&lock->lock);
+
+ /* Try to acquire the lock directly if no reader is present */
+ if (!atomic_read(&lock->cnts) &&
+ (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0))
+ goto unlock;
+
+ /*
+ * Set the waiting flag to notify readers that a writer is pending,
+ * or wait for a previous writer to go away.
+ */
+ for (;;) {
+ cnts = atomic_read(&lock->cnts);
+ if (!(cnts & _QW_WMASK) &&
+ (atomic_cmpxchg(&lock->cnts, cnts,
+ cnts | _QW_WAITING) == cnts))
+ break;
+
+ arch_mutex_cpu_relax();
+ }
+
+ /* When no more readers, set the locked flag */
+ for (;;) {
+ cnts = atomic_read(&lock->cnts);
+ if ((cnts == _QW_WAITING) &&
+ (atomic_cmpxchg(&lock->cnts, _QW_WAITING,
+ _QW_LOCKED) == _QW_WAITING))
+ break;
+
+ arch_mutex_cpu_relax();
+ }
+unlock:
+ arch_spin_unlock(&lock->lock);
+}
+EXPORT_SYMBOL(queue_write_lock_slowpath);
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
index 14193d596d7..ab29b6a2266 100644
--- a/kernel/locking/rtmutex-debug.h
+++ b/kernel/locking/rtmutex-debug.h
@@ -31,3 +31,8 @@ static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
{
return (waiter != NULL);
}
+
+static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
+{
+ debug_rt_mutex_print_deadlock(w);
+}
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index a620d4d08ca..fc605941b9b 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -83,6 +83,47 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
owner = *p;
} while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
}
+
+/*
+ * Safe fastpath aware unlock:
+ * 1) Clear the waiters bit
+ * 2) Drop lock->wait_lock
+ * 3) Try to unlock the lock with cmpxchg
+ */
+static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
+ __releases(lock->wait_lock)
+{
+ struct task_struct *owner = rt_mutex_owner(lock);
+
+ clear_rt_mutex_waiters(lock);
+ raw_spin_unlock(&lock->wait_lock);
+ /*
+ * If a new waiter comes in between the unlock and the cmpxchg
+ * we have two situations:
+ *
+ * unlock(wait_lock);
+ * lock(wait_lock);
+ * cmpxchg(p, owner, 0) == owner
+ * mark_rt_mutex_waiters(lock);
+ * acquire(lock);
+ * or:
+ *
+ * unlock(wait_lock);
+ * lock(wait_lock);
+ * mark_rt_mutex_waiters(lock);
+ *
+ * cmpxchg(p, owner, 0) != owner
+ * enqueue_waiter();
+ * unlock(wait_lock);
+ * lock(wait_lock);
+ * wake waiter();
+ * unlock(wait_lock);
+ * lock(wait_lock);
+ * acquire(lock);
+ */
+ return rt_mutex_cmpxchg(lock, owner, NULL);
+}
+
#else
# define rt_mutex_cmpxchg(l,c,n) (0)
static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
@@ -90,6 +131,17 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
lock->owner = (struct task_struct *)
((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
}
+
+/*
+ * Simple slow path only version: lock->owner is protected by lock->wait_lock.
+ */
+static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
+ __releases(lock->wait_lock)
+{
+ lock->owner = NULL;
+ raw_spin_unlock(&lock->wait_lock);
+ return true;
+}
#endif
static inline int
@@ -260,27 +312,36 @@ static void rt_mutex_adjust_prio(struct task_struct *task)
*/
int max_lock_depth = 1024;
+static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p)
+{
+ return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL;
+}
+
/*
* Adjust the priority chain. Also used for deadlock detection.
* Decreases task's usage by one - may thus free the task.
*
- * @task: the task owning the mutex (owner) for which a chain walk is probably
- * needed
+ * @task: the task owning the mutex (owner) for which a chain walk is
+ * probably needed
* @deadlock_detect: do we have to carry out deadlock detection?
- * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck
- * things for a task that has just got its priority adjusted, and
- * is waiting on a mutex)
+ * @orig_lock: the mutex (can be NULL if we are walking the chain to recheck
+ * things for a task that has just got its priority adjusted, and
+ * is waiting on a mutex)
+ * @next_lock: the mutex on which the owner of @orig_lock was blocked before
+ * we dropped its pi_lock. Is never dereferenced, only used for
+ * comparison to detect lock chain changes.
* @orig_waiter: rt_mutex_waiter struct for the task that has just donated
- * its priority to the mutex owner (can be NULL in the case
- * depicted above or if the top waiter is gone away and we are
- * actually deboosting the owner)
- * @top_task: the current top waiter
+ * its priority to the mutex owner (can be NULL in the case
+ * depicted above or if the top waiter is gone away and we are
+ * actually deboosting the owner)
+ * @top_task: the current top waiter
*
* Returns 0 or -EDEADLK.
*/
static int rt_mutex_adjust_prio_chain(struct task_struct *task,
int deadlock_detect,
struct rt_mutex *orig_lock,
+ struct rt_mutex *next_lock,
struct rt_mutex_waiter *orig_waiter,
struct task_struct *top_task)
{
@@ -314,7 +375,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
}
put_task_struct(task);
- return deadlock_detect ? -EDEADLK : 0;
+ return -EDEADLK;
}
retry:
/*
@@ -339,6 +400,18 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
goto out_unlock_pi;
/*
+ * We dropped all locks after taking a refcount on @task, so
+ * the task might have moved on in the lock chain or even left
+ * the chain completely and blocks now on an unrelated lock or
+ * on @orig_lock.
+ *
+ * We stored the lock on which @task was blocked in @next_lock,
+ * so we can detect the chain change.
+ */
+ if (next_lock != waiter->lock)
+ goto out_unlock_pi;
+
+ /*
* Drop out, when the task has no waiters. Note,
* top_waiter can be NULL, when we are in the deboosting
* mode!
@@ -377,7 +450,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
raw_spin_unlock(&lock->wait_lock);
- ret = deadlock_detect ? -EDEADLK : 0;
+ ret = -EDEADLK;
goto out_unlock_pi;
}
@@ -422,11 +495,26 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
__rt_mutex_adjust_prio(task);
}
+ /*
+ * Check whether the task which owns the current lock is pi
+ * blocked itself. If yes we store a pointer to the lock for
+ * the lock chain change detection above. After we dropped
+ * task->pi_lock next_lock cannot be dereferenced anymore.
+ */
+ next_lock = task_blocked_on_lock(task);
+
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
top_waiter = rt_mutex_top_waiter(lock);
raw_spin_unlock(&lock->wait_lock);
+ /*
+ * We reached the end of the lock chain. Stop right here. No
+ * point to go back just to figure that out.
+ */
+ if (!next_lock)
+ goto out_put_task;
+
if (!detect_deadlock && waiter != top_waiter)
goto out_put_task;
@@ -536,8 +624,9 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
{
struct task_struct *owner = rt_mutex_owner(lock);
struct rt_mutex_waiter *top_waiter = waiter;
- unsigned long flags;
+ struct rt_mutex *next_lock;
int chain_walk = 0, res;
+ unsigned long flags;
/*
* Early deadlock detection. We really don't want the task to
@@ -548,7 +637,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
* which is wrong, as the other waiter is not in a deadlock
* situation.
*/
- if (detect_deadlock && owner == task)
+ if (owner == task)
return -EDEADLK;
raw_spin_lock_irqsave(&task->pi_lock, flags);
@@ -569,20 +658,28 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
if (!owner)
return 0;
+ raw_spin_lock_irqsave(&owner->pi_lock, flags);
if (waiter == rt_mutex_top_waiter(lock)) {
- raw_spin_lock_irqsave(&owner->pi_lock, flags);
rt_mutex_dequeue_pi(owner, top_waiter);
rt_mutex_enqueue_pi(owner, waiter);
__rt_mutex_adjust_prio(owner);
if (owner->pi_blocked_on)
chain_walk = 1;
- raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
- }
- else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock))
+ } else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) {
chain_walk = 1;
+ }
- if (!chain_walk)
+ /* Store the lock on which owner is blocked or NULL */
+ next_lock = task_blocked_on_lock(owner);
+
+ raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
+ /*
+ * Even if full deadlock detection is on, if the owner is not
+ * blocked itself, we can avoid finding this out in the chain
+ * walk.
+ */
+ if (!chain_walk || !next_lock)
return 0;
/*
@@ -594,8 +691,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
raw_spin_unlock(&lock->wait_lock);
- res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
- task);
+ res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock,
+ next_lock, waiter, task);
raw_spin_lock(&lock->wait_lock);
@@ -605,7 +702,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
/*
* Wake up the next waiter on the lock.
*
- * Remove the top waiter from the current tasks waiter list and wake it up.
+ * Remove the top waiter from the current tasks pi waiter list and
+ * wake it up.
*
* Called with lock->wait_lock held.
*/
@@ -626,10 +724,23 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
*/
rt_mutex_dequeue_pi(current, waiter);
- rt_mutex_set_owner(lock, NULL);
+ /*
+ * As we are waking up the top waiter, and the waiter stays
+ * queued on the lock until it gets the lock, this lock
+ * obviously has waiters. Just set the bit here and this has
+ * the added benefit of forcing all new tasks into the
+ * slow path making sure no task of lower priority than
+ * the top waiter can steal this lock.
+ */
+ lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
raw_spin_unlock_irqrestore(&current->pi_lock, flags);
+ /*
+ * It's safe to dereference waiter as it cannot go away as
+ * long as we hold lock->wait_lock. The waiter task needs to
+ * acquire it in order to dequeue the waiter.
+ */
wake_up_process(waiter->task);
}
@@ -644,8 +755,8 @@ static void remove_waiter(struct rt_mutex *lock,
{
int first = (waiter == rt_mutex_top_waiter(lock));
struct task_struct *owner = rt_mutex_owner(lock);
+ struct rt_mutex *next_lock = NULL;
unsigned long flags;
- int chain_walk = 0;
raw_spin_lock_irqsave(&current->pi_lock, flags);
rt_mutex_dequeue(lock, waiter);
@@ -669,13 +780,13 @@ static void remove_waiter(struct rt_mutex *lock,
}
__rt_mutex_adjust_prio(owner);
- if (owner->pi_blocked_on)
- chain_walk = 1;
+ /* Store the lock on which owner is blocked or NULL */
+ next_lock = task_blocked_on_lock(owner);
raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
}
- if (!chain_walk)
+ if (!next_lock)
return;
/* gets dropped in rt_mutex_adjust_prio_chain()! */
@@ -683,7 +794,7 @@ static void remove_waiter(struct rt_mutex *lock,
raw_spin_unlock(&lock->wait_lock);
- rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current);
+ rt_mutex_adjust_prio_chain(owner, 0, lock, next_lock, NULL, current);
raw_spin_lock(&lock->wait_lock);
}
@@ -696,6 +807,7 @@ static void remove_waiter(struct rt_mutex *lock,
void rt_mutex_adjust_pi(struct task_struct *task)
{
struct rt_mutex_waiter *waiter;
+ struct rt_mutex *next_lock;
unsigned long flags;
raw_spin_lock_irqsave(&task->pi_lock, flags);
@@ -706,12 +818,13 @@ void rt_mutex_adjust_pi(struct task_struct *task)
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
return;
}
-
+ next_lock = waiter->lock;
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
/* gets dropped in rt_mutex_adjust_prio_chain()! */
get_task_struct(task);
- rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task);
+
+ rt_mutex_adjust_prio_chain(task, 0, NULL, next_lock, NULL, task);
}
/**
@@ -763,6 +876,26 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
return ret;
}
+static void rt_mutex_handle_deadlock(int res, int detect_deadlock,
+ struct rt_mutex_waiter *w)
+{
+ /*
+ * If the result is not -EDEADLOCK or the caller requested
+ * deadlock detection, nothing to do here.
+ */
+ if (res != -EDEADLOCK || detect_deadlock)
+ return;
+
+ /*
+ * Yell lowdly and stop the task right here.
+ */
+ rt_mutex_print_deadlock(w);
+ while (1) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+ }
+}
+
/*
* Slow path lock function:
*/
@@ -802,8 +935,10 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
set_current_state(TASK_RUNNING);
- if (unlikely(ret))
+ if (unlikely(ret)) {
remove_waiter(lock, &waiter);
+ rt_mutex_handle_deadlock(ret, detect_deadlock, &waiter);
+ }
/*
* try_to_take_rt_mutex() sets the waiter bit
@@ -859,12 +994,49 @@ rt_mutex_slowunlock(struct rt_mutex *lock)
rt_mutex_deadlock_account_unlock(current);
- if (!rt_mutex_has_waiters(lock)) {
- lock->owner = NULL;
- raw_spin_unlock(&lock->wait_lock);
- return;
+ /*
+ * We must be careful here if the fast path is enabled. If we
+ * have no waiters queued we cannot set owner to NULL here
+ * because of:
+ *
+ * foo->lock->owner = NULL;
+ * rtmutex_lock(foo->lock); <- fast path
+ * free = atomic_dec_and_test(foo->refcnt);
+ * rtmutex_unlock(foo->lock); <- fast path
+ * if (free)
+ * kfree(foo);
+ * raw_spin_unlock(foo->lock->wait_lock);
+ *
+ * So for the fastpath enabled kernel:
+ *
+ * Nothing can set the waiters bit as long as we hold
+ * lock->wait_lock. So we do the following sequence:
+ *
+ * owner = rt_mutex_owner(lock);
+ * clear_rt_mutex_waiters(lock);
+ * raw_spin_unlock(&lock->wait_lock);
+ * if (cmpxchg(&lock->owner, owner, 0) == owner)
+ * return;
+ * goto retry;
+ *
+ * The fastpath disabled variant is simple as all access to
+ * lock->owner is serialized by lock->wait_lock:
+ *
+ * lock->owner = NULL;
+ * raw_spin_unlock(&lock->wait_lock);
+ */
+ while (!rt_mutex_has_waiters(lock)) {
+ /* Drops lock->wait_lock ! */
+ if (unlock_rt_mutex_safe(lock) == true)
+ return;
+ /* Relock the rtmutex and try again */
+ raw_spin_lock(&lock->wait_lock);
}
+ /*
+ * The wakeup next waiter path does not suffer from the above
+ * race. See the comments there.
+ */
wakeup_next_waiter(lock);
raw_spin_unlock(&lock->wait_lock);
@@ -1112,7 +1284,8 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
return 1;
}
- ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
+ /* We enforce deadlock detection for futexes */
+ ret = task_blocks_on_rt_mutex(lock, waiter, task, 1);
if (ret && !rt_mutex_owner(lock)) {
/*
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
index a1a1dd06421..f6a1f3c133b 100644
--- a/kernel/locking/rtmutex.h
+++ b/kernel/locking/rtmutex.h
@@ -24,3 +24,8 @@
#define debug_rt_mutex_print_deadlock(w) do { } while (0)
#define debug_rt_mutex_detect_deadlock(w,d) (d)
#define debug_rt_mutex_reset_waiter(w) do { } while (0)
+
+static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w)
+{
+ WARN(1, "rtmutex deadlock detected\n");
+}
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index b4219ff87b8..dacc32142fc 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -5,11 +5,17 @@
*
* Writer lock-stealing by Alex Shi <alex.shi@intel.com>
* and Michel Lespinasse <walken@google.com>
+ *
+ * Optimistic spinning by Tim Chen <tim.c.chen@intel.com>
+ * and Davidlohr Bueso <davidlohr@hp.com>. Based on mutexes.
*/
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/export.h>
+#include <linux/sched/rt.h>
+
+#include "mcs_spinlock.h"
/*
* Guide to the rw_semaphore's count field for common values.
@@ -76,6 +82,10 @@ void __init_rwsem(struct rw_semaphore *sem, const char *name,
sem->count = RWSEM_UNLOCKED_VALUE;
raw_spin_lock_init(&sem->wait_lock);
INIT_LIST_HEAD(&sem->wait_list);
+#ifdef CONFIG_SMP
+ sem->owner = NULL;
+ sem->osq = NULL;
+#endif
}
EXPORT_SYMBOL(__init_rwsem);
@@ -190,7 +200,7 @@ __rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
}
/*
- * wait for the read lock to be granted
+ * Wait for the read lock to be granted
*/
__visible
struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
@@ -237,64 +247,221 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem)
return sem;
}
+static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
+{
+ if (!(count & RWSEM_ACTIVE_MASK)) {
+ /* try acquiring the write lock */
+ if (sem->count == RWSEM_WAITING_BIAS &&
+ cmpxchg(&sem->count, RWSEM_WAITING_BIAS,
+ RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
+ if (!list_is_singular(&sem->wait_list))
+ rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
+ return true;
+ }
+ }
+ return false;
+}
+
+#ifdef CONFIG_SMP
/*
- * wait until we successfully acquire the write lock
+ * Try to acquire write lock before the writer has been put on wait queue.
+ */
+static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
+{
+ long old, count = ACCESS_ONCE(sem->count);
+
+ while (true) {
+ if (!(count == 0 || count == RWSEM_WAITING_BIAS))
+ return false;
+
+ old = cmpxchg(&sem->count, count, count + RWSEM_ACTIVE_WRITE_BIAS);
+ if (old == count)
+ return true;
+
+ count = old;
+ }
+}
+
+static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem)
+{
+ struct task_struct *owner;
+ bool on_cpu = true;
+
+ if (need_resched())
+ return 0;
+
+ rcu_read_lock();
+ owner = ACCESS_ONCE(sem->owner);
+ if (owner)
+ on_cpu = owner->on_cpu;
+ rcu_read_unlock();
+
+ /*
+ * If sem->owner is not set, the rwsem owner may have
+ * just acquired it and not set the owner yet or the rwsem
+ * has been released.
+ */
+ return on_cpu;
+}
+
+static inline bool owner_running(struct rw_semaphore *sem,
+ struct task_struct *owner)
+{
+ if (sem->owner != owner)
+ return false;
+
+ /*
+ * Ensure we emit the owner->on_cpu, dereference _after_ checking
+ * sem->owner still matches owner, if that fails, owner might
+ * point to free()d memory, if it still matches, the rcu_read_lock()
+ * ensures the memory stays valid.
+ */
+ barrier();
+
+ return owner->on_cpu;
+}
+
+static noinline
+bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
+{
+ rcu_read_lock();
+ while (owner_running(sem, owner)) {
+ if (need_resched())
+ break;
+
+ arch_mutex_cpu_relax();
+ }
+ rcu_read_unlock();
+
+ /*
+ * We break out the loop above on need_resched() or when the
+ * owner changed, which is a sign for heavy contention. Return
+ * success only when sem->owner is NULL.
+ */
+ return sem->owner == NULL;
+}
+
+static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
+{
+ struct task_struct *owner;
+ bool taken = false;
+
+ preempt_disable();
+
+ /* sem->wait_lock should not be held when doing optimistic spinning */
+ if (!rwsem_can_spin_on_owner(sem))
+ goto done;
+
+ if (!osq_lock(&sem->osq))
+ goto done;
+
+ while (true) {
+ owner = ACCESS_ONCE(sem->owner);
+ if (owner && !rwsem_spin_on_owner(sem, owner))
+ break;
+
+ /* wait_lock will be acquired if write_lock is obtained */
+ if (rwsem_try_write_lock_unqueued(sem)) {
+ taken = true;
+ break;
+ }
+
+ /*
+ * When there's no owner, we might have preempted between the
+ * owner acquiring the lock and setting the owner field. If
+ * we're an RT task that will live-lock because we won't let
+ * the owner complete.
+ */
+ if (!owner && (need_resched() || rt_task(current)))
+ break;
+
+ /*
+ * The cpu_relax() call is a compiler barrier which forces
+ * everything in this loop to be re-loaded. We don't need
+ * memory barriers as we'll eventually observe the right
+ * values at the cost of a few extra spins.
+ */
+ arch_mutex_cpu_relax();
+ }
+ osq_unlock(&sem->osq);
+done:
+ preempt_enable();
+ return taken;
+}
+
+#else
+static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
+{
+ return false;
+}
+#endif
+
+/*
+ * Wait until we successfully acquire the write lock
*/
__visible
struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
{
- long count, adjustment = -RWSEM_ACTIVE_WRITE_BIAS;
+ long count;
+ bool waiting = true; /* any queued threads before us */
struct rwsem_waiter waiter;
- struct task_struct *tsk = current;
- /* set up my own style of waitqueue */
- waiter.task = tsk;
+ /* undo write bias from down_write operation, stop active locking */
+ count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem);
+
+ /* do optimistic spinning and steal lock if possible */
+ if (rwsem_optimistic_spin(sem))
+ return sem;
+
+ /*
+ * Optimistic spinning failed, proceed to the slowpath
+ * and block until we can acquire the sem.
+ */
+ waiter.task = current;
waiter.type = RWSEM_WAITING_FOR_WRITE;
raw_spin_lock_irq(&sem->wait_lock);
+
+ /* account for this before adding a new element to the list */
if (list_empty(&sem->wait_list))
- adjustment += RWSEM_WAITING_BIAS;
+ waiting = false;
+
list_add_tail(&waiter.list, &sem->wait_list);
/* we're now waiting on the lock, but no longer actively locking */
- count = rwsem_atomic_update(adjustment, sem);
+ if (waiting) {
+ count = ACCESS_ONCE(sem->count);
- /* If there were already threads queued before us and there are no
- * active writers, the lock must be read owned; so we try to wake
- * any read locks that were queued ahead of us. */
- if (count > RWSEM_WAITING_BIAS &&
- adjustment == -RWSEM_ACTIVE_WRITE_BIAS)
- sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
+ /*
+ * If there were already threads queued before us and there are
+ * no active writers, the lock must be read owned; so we try to
+ * wake any read locks that were queued ahead of us.
+ */
+ if (count > RWSEM_WAITING_BIAS)
+ sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
+
+ } else
+ count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
/* wait until we successfully acquire the lock */
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+ set_current_state(TASK_UNINTERRUPTIBLE);
while (true) {
- if (!(count & RWSEM_ACTIVE_MASK)) {
- /* Try acquiring the write lock. */
- count = RWSEM_ACTIVE_WRITE_BIAS;
- if (!list_is_singular(&sem->wait_list))
- count += RWSEM_WAITING_BIAS;
-
- if (sem->count == RWSEM_WAITING_BIAS &&
- cmpxchg(&sem->count, RWSEM_WAITING_BIAS, count) ==
- RWSEM_WAITING_BIAS)
- break;
- }
-
+ if (rwsem_try_write_lock(count, sem))
+ break;
raw_spin_unlock_irq(&sem->wait_lock);
/* Block until there are no active lockers. */
do {
schedule();
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+ set_current_state(TASK_UNINTERRUPTIBLE);
} while ((count = sem->count) & RWSEM_ACTIVE_MASK);
raw_spin_lock_irq(&sem->wait_lock);
}
+ __set_current_state(TASK_RUNNING);
list_del(&waiter.list);
raw_spin_unlock_irq(&sem->wait_lock);
- tsk->state = TASK_RUNNING;
return sem;
}
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
index cfff1435bdf..42f806de49d 100644
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -12,6 +12,27 @@
#include <linux/atomic.h>
+#if defined(CONFIG_SMP) && defined(CONFIG_RWSEM_XCHGADD_ALGORITHM)
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+ sem->owner = current;
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+ sem->owner = NULL;
+}
+
+#else
+static inline void rwsem_set_owner(struct rw_semaphore *sem)
+{
+}
+
+static inline void rwsem_clear_owner(struct rw_semaphore *sem)
+{
+}
+#endif
+
/*
* lock for reading
*/
@@ -48,6 +69,7 @@ void __sched down_write(struct rw_semaphore *sem)
rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+ rwsem_set_owner(sem);
}
EXPORT_SYMBOL(down_write);
@@ -59,8 +81,11 @@ int down_write_trylock(struct rw_semaphore *sem)
{
int ret = __down_write_trylock(sem);
- if (ret == 1)
+ if (ret == 1) {
rwsem_acquire(&sem->dep_map, 0, 1, _RET_IP_);
+ rwsem_set_owner(sem);
+ }
+
return ret;
}
@@ -85,6 +110,7 @@ void up_write(struct rw_semaphore *sem)
{
rwsem_release(&sem->dep_map, 1, _RET_IP_);
+ rwsem_clear_owner(sem);
__up_write(sem);
}
@@ -99,6 +125,7 @@ void downgrade_write(struct rw_semaphore *sem)
* lockdep: a downgraded write will live on as a write
* dependency.
*/
+ rwsem_clear_owner(sem);
__downgrade_write(sem);
}
@@ -122,6 +149,7 @@ void _down_write_nest_lock(struct rw_semaphore *sem, struct lockdep_map *nest)
rwsem_acquire_nest(&sem->dep_map, 0, 0, nest, _RET_IP_);
LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+ rwsem_set_owner(sem);
}
EXPORT_SYMBOL(_down_write_nest_lock);
@@ -141,6 +169,7 @@ void down_write_nested(struct rw_semaphore *sem, int subclass)
rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
LOCK_CONTENDED(sem, __down_write_trylock, __down_write);
+ rwsem_set_owner(sem);
}
EXPORT_SYMBOL(down_write_nested);
diff --git a/kernel/notifier.c b/kernel/notifier.c
index db4c8b08a50..4803da6eab6 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -71,9 +71,9 @@ static int notifier_chain_unregister(struct notifier_block **nl,
* @returns: notifier_call_chain returns the value returned by the
* last notifier function called.
*/
-static int __kprobes notifier_call_chain(struct notifier_block **nl,
- unsigned long val, void *v,
- int nr_to_call, int *nr_calls)
+static int notifier_call_chain(struct notifier_block **nl,
+ unsigned long val, void *v,
+ int nr_to_call, int *nr_calls)
{
int ret = NOTIFY_DONE;
struct notifier_block *nb, *next_nb;
@@ -102,6 +102,7 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
}
return ret;
}
+NOKPROBE_SYMBOL(notifier_call_chain);
/*
* Atomic notifier chain routines. Registration and unregistration
@@ -172,9 +173,9 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister);
* Otherwise the return value is the return value
* of the last notifier function called.
*/
-int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
- unsigned long val, void *v,
- int nr_to_call, int *nr_calls)
+int __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
+ unsigned long val, void *v,
+ int nr_to_call, int *nr_calls)
{
int ret;
@@ -184,13 +185,15 @@ int __kprobes __atomic_notifier_call_chain(struct atomic_notifier_head *nh,
return ret;
}
EXPORT_SYMBOL_GPL(__atomic_notifier_call_chain);
+NOKPROBE_SYMBOL(__atomic_notifier_call_chain);
-int __kprobes atomic_notifier_call_chain(struct atomic_notifier_head *nh,
- unsigned long val, void *v)
+int atomic_notifier_call_chain(struct atomic_notifier_head *nh,
+ unsigned long val, void *v)
{
return __atomic_notifier_call_chain(nh, val, v, -1, NULL);
}
EXPORT_SYMBOL_GPL(atomic_notifier_call_chain);
+NOKPROBE_SYMBOL(atomic_notifier_call_chain);
/*
* Blocking notifier chain routines. All access to the chain is
@@ -527,7 +530,7 @@ EXPORT_SYMBOL_GPL(srcu_init_notifier_head);
static ATOMIC_NOTIFIER_HEAD(die_chain);
-int notrace __kprobes notify_die(enum die_val val, const char *str,
+int notrace notify_die(enum die_val val, const char *str,
struct pt_regs *regs, long err, int trap, int sig)
{
struct die_args args = {
@@ -540,6 +543,7 @@ int notrace __kprobes notify_die(enum die_val val, const char *str,
};
return atomic_notifier_call_chain(&die_chain, val, &args);
}
+NOKPROBE_SYMBOL(notify_die);
int register_die_notifier(struct notifier_block *nb)
{
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index df88d55dc43..fcc2611d3f1 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -28,12 +28,14 @@
#include <linux/syscore_ops.h>
#include <linux/ctype.h>
#include <linux/genhd.h>
+#include <trace/events/power.h>
#include "power.h"
static int nocompress;
static int noresume;
+static int nohibernate;
static int resume_wait;
static unsigned int resume_delay;
static char resume_file[256] = CONFIG_PM_STD_PARTITION;
@@ -61,6 +63,11 @@ bool freezer_test_done;
static const struct platform_hibernation_ops *hibernation_ops;
+bool hibernation_available(void)
+{
+ return (nohibernate == 0);
+}
+
/**
* hibernation_set_ops - Set the global hibernate operations.
* @ops: Hibernation operations to use in subsequent hibernation transitions.
@@ -292,7 +299,9 @@ static int create_image(int platform_mode)
in_suspend = 1;
save_processor_state();
+ trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, true);
error = swsusp_arch_suspend();
+ trace_suspend_resume(TPS("machine_suspend"), PM_EVENT_HIBERNATE, false);
if (error)
printk(KERN_ERR "PM: Error %d creating hibernation image\n",
error);
@@ -639,6 +648,11 @@ int hibernate(void)
{
int error;
+ if (!hibernation_available()) {
+ pr_debug("PM: Hibernation not available.\n");
+ return -EPERM;
+ }
+
lock_system_sleep();
/* The snapshot device should not be opened while we're running */
if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
@@ -731,7 +745,7 @@ static int software_resume(void)
/*
* If the user said "noresume".. bail out early.
*/
- if (noresume)
+ if (noresume || !hibernation_available())
return 0;
/*
@@ -897,6 +911,9 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
int i;
char *start = buf;
+ if (!hibernation_available())
+ return sprintf(buf, "[disabled]\n");
+
for (i = HIBERNATION_FIRST; i <= HIBERNATION_MAX; i++) {
if (!hibernation_modes[i])
continue;
@@ -931,6 +948,9 @@ static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
char *p;
int mode = HIBERNATION_INVALID;
+ if (!hibernation_available())
+ return -EPERM;
+
p = memchr(buf, '\n', n);
len = p ? p - buf : n;
@@ -1098,6 +1118,10 @@ static int __init hibernate_setup(char *str)
noresume = 1;
else if (!strncmp(str, "nocompress", 10))
nocompress = 1;
+ else if (!strncmp(str, "no", 2)) {
+ noresume = 1;
+ nohibernate = 1;
+ }
return 1;
}
@@ -1122,9 +1146,23 @@ static int __init resumedelay_setup(char *str)
return 1;
}
+static int __init nohibernate_setup(char *str)
+{
+ noresume = 1;
+ nohibernate = 1;
+ return 1;
+}
+
+static int __init kaslr_nohibernate_setup(char *str)
+{
+ return nohibernate_setup(str);
+}
+
__setup("noresume", noresume_setup);
__setup("resume_offset=", resume_offset_setup);
__setup("resume=", resume_setup);
__setup("hibernate=", hibernate_setup);
__setup("resumewait", resumewait_setup);
__setup("resumedelay=", resumedelay_setup);
+__setup("nohibernate", nohibernate_setup);
+__setup("kaslr", kaslr_nohibernate_setup);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 573410d6647..8e90f330f13 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -300,13 +300,11 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
s += sprintf(s,"%s ", pm_states[i].label);
#endif
-#ifdef CONFIG_HIBERNATION
- s += sprintf(s, "%s\n", "disk");
-#else
+ if (hibernation_available())
+ s += sprintf(s, "disk ");
if (s != buf)
/* convert the last space to a newline */
*(s-1) = '\n';
-#endif
return (s - buf);
}
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 06ec8869dbf..0ca8d83e236 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -17,6 +17,7 @@
#include <linux/delay.h>
#include <linux/workqueue.h>
#include <linux/kmod.h>
+#include <trace/events/power.h>
/*
* Timeout for stopping processes
@@ -175,6 +176,7 @@ void thaw_processes(void)
struct task_struct *g, *p;
struct task_struct *curr = current;
+ trace_suspend_resume(TPS("thaw_processes"), 0, true);
if (pm_freezing)
atomic_dec(&system_freezing_cnt);
pm_freezing = false;
@@ -201,6 +203,7 @@ void thaw_processes(void)
schedule();
printk("done.\n");
+ trace_suspend_resume(TPS("thaw_processes"), 0, false);
}
void thaw_kernel_threads(void)
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 963e6d0f050..4dd8822f732 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -177,7 +177,9 @@ static int suspend_prepare(suspend_state_t state)
if (error)
goto Finish;
+ trace_suspend_resume(TPS("freeze_processes"), 0, true);
error = suspend_freeze_processes();
+ trace_suspend_resume(TPS("freeze_processes"), 0, false);
if (!error)
return 0;
@@ -240,7 +242,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
* all the devices are suspended.
*/
if (state == PM_SUSPEND_FREEZE) {
+ trace_suspend_resume(TPS("machine_suspend"), state, true);
freeze_enter();
+ trace_suspend_resume(TPS("machine_suspend"), state, false);
goto Platform_wake;
}
@@ -256,7 +260,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
if (!error) {
*wakeup = pm_wakeup_pending();
if (!(suspend_test(TEST_CORE) || *wakeup)) {
+ trace_suspend_resume(TPS("machine_suspend"),
+ state, true);
error = suspend_ops->enter(state);
+ trace_suspend_resume(TPS("machine_suspend"),
+ state, false);
events_check_enabled = false;
}
syscore_resume();
@@ -294,7 +302,6 @@ int suspend_devices_and_enter(suspend_state_t state)
if (need_suspend_ops(state) && !suspend_ops)
return -ENOSYS;
- trace_machine_suspend(state);
if (need_suspend_ops(state) && suspend_ops->begin) {
error = suspend_ops->begin(state);
if (error)
@@ -331,7 +338,6 @@ int suspend_devices_and_enter(suspend_state_t state)
else if (state == PM_SUSPEND_FREEZE && freeze_ops->end)
freeze_ops->end();
- trace_machine_suspend(PWR_EVENT_EXIT);
return error;
Recover_platform:
@@ -365,6 +371,7 @@ static int enter_state(suspend_state_t state)
{
int error;
+ trace_suspend_resume(TPS("suspend_enter"), state, true);
if (state == PM_SUSPEND_FREEZE) {
#ifdef CONFIG_PM_DEBUG
if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) {
@@ -382,9 +389,11 @@ static int enter_state(suspend_state_t state)
if (state == PM_SUSPEND_FREEZE)
freeze_begin();
+ trace_suspend_resume(TPS("sync_filesystems"), 0, true);
printk(KERN_INFO "PM: Syncing filesystems ... ");
sys_sync();
printk("done.\n");
+ trace_suspend_resume(TPS("sync_filesystems"), 0, false);
pr_debug("PM: Preparing system for %s sleep\n", pm_states[state].label);
error = suspend_prepare(state);
@@ -394,6 +403,7 @@ static int enter_state(suspend_state_t state)
if (suspend_test(TEST_FREEZER))
goto Finish;
+ trace_suspend_resume(TPS("suspend_enter"), state, false);
pr_debug("PM: Entering %s sleep\n", pm_states[state].label);
pm_restrict_gfp_mask();
error = suspend_devices_and_enter(state);
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 98d357584cd..526e8911460 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -49,6 +49,9 @@ static int snapshot_open(struct inode *inode, struct file *filp)
struct snapshot_data *data;
int error;
+ if (!hibernation_available())
+ return -EPERM;
+
lock_system_sleep();
if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index ea2d5f6962e..13e839dbca0 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1416,9 +1416,10 @@ static int have_callable_console(void)
/*
* Can we actually use the console at this time on this cpu?
*
- * Console drivers may assume that per-cpu resources have been allocated. So
- * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't
- * call them until this CPU is officially up.
+ * Console drivers may assume that per-cpu resources have
+ * been allocated. So unless they're explicitly marked as
+ * being able to cope (CON_ANYTIME) don't call them until
+ * this CPU is officially up.
*/
static inline int can_use_console(unsigned int cpu)
{
@@ -1431,10 +1432,8 @@ static inline int can_use_console(unsigned int cpu)
* console_lock held, and 'console_locked' set) if it
* is successful, false otherwise.
*/
-static int console_trylock_for_printk(void)
+static int console_trylock_for_printk(unsigned int cpu)
{
- unsigned int cpu = smp_processor_id();
-
if (!console_trylock())
return 0;
/*
@@ -1609,8 +1608,7 @@ asmlinkage int vprintk_emit(int facility, int level,
*/
if (!oops_in_progress && !lockdep_recursing(current)) {
recursion_bug = 1;
- local_irq_restore(flags);
- return 0;
+ goto out_restore_irqs;
}
zap_locks();
}
@@ -1718,27 +1716,21 @@ asmlinkage int vprintk_emit(int facility, int level,
logbuf_cpu = UINT_MAX;
raw_spin_unlock(&logbuf_lock);
- lockdep_on();
- local_irq_restore(flags);
/* If called from the scheduler, we can not call up(). */
- if (in_sched)
- return printed_len;
-
- /*
- * Disable preemption to avoid being preempted while holding
- * console_sem which would prevent anyone from printing to console
- */
- preempt_disable();
- /*
- * Try to acquire and then immediately release the console semaphore.
- * The release will print out buffers and wake up /dev/kmsg and syslog()
- * users.
- */
- if (console_trylock_for_printk())
- console_unlock();
- preempt_enable();
+ if (!in_sched) {
+ /*
+ * Try to acquire and then immediately release the console
+ * semaphore. The release will print out buffers and wake up
+ * /dev/kmsg and syslog() users.
+ */
+ if (console_trylock_for_printk(this_cpu))
+ console_unlock();
+ }
+ lockdep_on();
+out_restore_irqs:
+ local_irq_restore(flags);
return printed_len;
}
EXPORT_SYMBOL(vprintk_emit);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c6b98793d64..3bdf01b494f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -535,7 +535,7 @@ static inline void init_hrtick(void)
__old; \
})
-#ifdef TIF_POLLING_NRFLAG
+#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
/*
* Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
* this avoids any races wrt polling state changes and thereby avoids
@@ -546,12 +546,44 @@ static bool set_nr_and_not_polling(struct task_struct *p)
struct thread_info *ti = task_thread_info(p);
return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
}
+
+/*
+ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
+ *
+ * If this returns true, then the idle task promises to call
+ * sched_ttwu_pending() and reschedule soon.
+ */
+static bool set_nr_if_polling(struct task_struct *p)
+{
+ struct thread_info *ti = task_thread_info(p);
+ typeof(ti->flags) old, val = ACCESS_ONCE(ti->flags);
+
+ for (;;) {
+ if (!(val & _TIF_POLLING_NRFLAG))
+ return false;
+ if (val & _TIF_NEED_RESCHED)
+ return true;
+ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
+ if (old == val)
+ break;
+ val = old;
+ }
+ return true;
+}
+
#else
static bool set_nr_and_not_polling(struct task_struct *p)
{
set_tsk_need_resched(p);
return true;
}
+
+#ifdef CONFIG_SMP
+static bool set_nr_if_polling(struct task_struct *p)
+{
+ return false;
+}
+#endif
#endif
/*
@@ -580,6 +612,8 @@ void resched_task(struct task_struct *p)
if (set_nr_and_not_polling(p))
smp_send_reschedule(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
}
void resched_cpu(int cpu)
@@ -642,27 +676,10 @@ static void wake_up_idle_cpu(int cpu)
if (cpu == smp_processor_id())
return;
- /*
- * This is safe, as this function is called with the timer
- * wheel base lock of (cpu) held. When the CPU is on the way
- * to idle and has not yet set rq->curr to idle then it will
- * be serialized on the timer wheel base lock and take the new
- * timer into account automatically.
- */
- if (rq->curr != rq->idle)
- return;
-
- /*
- * We can set TIF_RESCHED on the idle task of the other CPU
- * lockless. The worst case is that the other CPU runs the
- * idle task through an additional NOOP schedule()
- */
- set_tsk_need_resched(rq->idle);
-
- /* NEED_RESCHED must be visible before we test polling */
- smp_mb();
- if (!tsk_is_polling(rq->idle))
+ if (set_nr_and_not_polling(rq->idle))
smp_send_reschedule(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
}
static bool wake_up_full_nohz_cpu(int cpu)
@@ -888,7 +905,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
rq->clock_task += delta;
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
- if ((irq_delta + steal) && sched_feat(NONTASK_POWER))
+ if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
sched_rt_avg_update(rq, irq_delta + steal);
#endif
}
@@ -1521,13 +1538,17 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
}
#ifdef CONFIG_SMP
-static void sched_ttwu_pending(void)
+void sched_ttwu_pending(void)
{
struct rq *rq = this_rq();
struct llist_node *llist = llist_del_all(&rq->wake_list);
struct task_struct *p;
+ unsigned long flags;
- raw_spin_lock(&rq->lock);
+ if (!llist)
+ return;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
while (llist) {
p = llist_entry(llist, struct task_struct, wake_entry);
@@ -1535,7 +1556,7 @@ static void sched_ttwu_pending(void)
ttwu_do_activate(rq, p, 0);
}
- raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
}
void scheduler_ipi(void)
@@ -1581,8 +1602,14 @@ void scheduler_ipi(void)
static void ttwu_queue_remote(struct task_struct *p, int cpu)
{
- if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list))
- smp_send_reschedule(cpu);
+ struct rq *rq = cpu_rq(cpu);
+
+ if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
+ if (!set_nr_if_polling(rq->idle))
+ smp_send_reschedule(cpu);
+ else
+ trace_sched_wake_idle_without_ipi(cpu);
+ }
}
bool cpus_share_cache(int this_cpu, int that_cpu)
@@ -2527,7 +2554,7 @@ notrace unsigned long get_parent_ip(unsigned long addr)
#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
defined(CONFIG_PREEMPT_TRACER))
-void __kprobes preempt_count_add(int val)
+void preempt_count_add(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
/*
@@ -2553,8 +2580,9 @@ void __kprobes preempt_count_add(int val)
}
}
EXPORT_SYMBOL(preempt_count_add);
+NOKPROBE_SYMBOL(preempt_count_add);
-void __kprobes preempt_count_sub(int val)
+void preempt_count_sub(int val)
{
#ifdef CONFIG_DEBUG_PREEMPT
/*
@@ -2575,6 +2603,7 @@ void __kprobes preempt_count_sub(int val)
__preempt_count_sub(val);
}
EXPORT_SYMBOL(preempt_count_sub);
+NOKPROBE_SYMBOL(preempt_count_sub);
#endif
@@ -2857,6 +2886,7 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
barrier();
} while (need_resched());
}
+NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);
#endif /* CONFIG_PREEMPT */
@@ -4216,7 +4246,7 @@ EXPORT_SYMBOL(yield);
* false (0) if we failed to boost the target.
* -ESRCH if there's no task to yield to.
*/
-bool __sched yield_to(struct task_struct *p, bool preempt)
+int __sched yield_to(struct task_struct *p, bool preempt)
{
struct task_struct *curr = current;
struct rq *rq, *p_rq;
@@ -5242,14 +5272,13 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
}
/*
- * Even though we initialize ->power to something semi-sane,
- * we leave power_orig unset. This allows us to detect if
+ * Even though we initialize ->capacity to something semi-sane,
+ * we leave capacity_orig unset. This allows us to detect if
* domain iteration is still funny without causing /0 traps.
*/
- if (!group->sgp->power_orig) {
+ if (!group->sgc->capacity_orig) {
printk(KERN_CONT "\n");
- printk(KERN_ERR "ERROR: domain->cpu_power not "
- "set\n");
+ printk(KERN_ERR "ERROR: domain->cpu_capacity not set\n");
break;
}
@@ -5271,9 +5300,9 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
printk(KERN_CONT " %s", str);
- if (group->sgp->power != SCHED_POWER_SCALE) {
- printk(KERN_CONT " (cpu_power = %d)",
- group->sgp->power);
+ if (group->sgc->capacity != SCHED_CAPACITY_SCALE) {
+ printk(KERN_CONT " (cpu_capacity = %d)",
+ group->sgc->capacity);
}
group = group->next;
@@ -5331,7 +5360,7 @@ static int sd_degenerate(struct sched_domain *sd)
SD_BALANCE_NEWIDLE |
SD_BALANCE_FORK |
SD_BALANCE_EXEC |
- SD_SHARE_CPUPOWER |
+ SD_SHARE_CPUCAPACITY |
SD_SHARE_PKG_RESOURCES |
SD_SHARE_POWERDOMAIN)) {
if (sd->groups != sd->groups->next)
@@ -5362,7 +5391,7 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
SD_BALANCE_NEWIDLE |
SD_BALANCE_FORK |
SD_BALANCE_EXEC |
- SD_SHARE_CPUPOWER |
+ SD_SHARE_CPUCAPACITY |
SD_SHARE_PKG_RESOURCES |
SD_PREFER_SIBLING |
SD_SHARE_POWERDOMAIN);
@@ -5487,7 +5516,7 @@ static struct root_domain *alloc_rootdomain(void)
return rd;
}
-static void free_sched_groups(struct sched_group *sg, int free_sgp)
+static void free_sched_groups(struct sched_group *sg, int free_sgc)
{
struct sched_group *tmp, *first;
@@ -5498,8 +5527,8 @@ static void free_sched_groups(struct sched_group *sg, int free_sgp)
do {
tmp = sg->next;
- if (free_sgp && atomic_dec_and_test(&sg->sgp->ref))
- kfree(sg->sgp);
+ if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
+ kfree(sg->sgc);
kfree(sg);
sg = tmp;
@@ -5517,7 +5546,7 @@ static void free_sched_domain(struct rcu_head *rcu)
if (sd->flags & SD_OVERLAP) {
free_sched_groups(sd->groups, 1);
} else if (atomic_dec_and_test(&sd->groups->ref)) {
- kfree(sd->groups->sgp);
+ kfree(sd->groups->sgc);
kfree(sd->groups);
}
kfree(sd);
@@ -5728,17 +5757,17 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
cpumask_or(covered, covered, sg_span);
- sg->sgp = *per_cpu_ptr(sdd->sgp, i);
- if (atomic_inc_return(&sg->sgp->ref) == 1)
+ sg->sgc = *per_cpu_ptr(sdd->sgc, i);
+ if (atomic_inc_return(&sg->sgc->ref) == 1)
build_group_mask(sd, sg);
/*
- * Initialize sgp->power such that even if we mess up the
+ * Initialize sgc->capacity such that even if we mess up the
* domains and no possible iteration will get us here, we won't
* die on a /0 trap.
*/
- sg->sgp->power = SCHED_POWER_SCALE * cpumask_weight(sg_span);
- sg->sgp->power_orig = sg->sgp->power;
+ sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
+ sg->sgc->capacity_orig = sg->sgc->capacity;
/*
* Make sure the first group of this domain contains the
@@ -5776,8 +5805,8 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
if (sg) {
*sg = *per_cpu_ptr(sdd->sg, cpu);
- (*sg)->sgp = *per_cpu_ptr(sdd->sgp, cpu);
- atomic_set(&(*sg)->sgp->ref, 1); /* for claim_allocations */
+ (*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu);
+ atomic_set(&(*sg)->sgc->ref, 1); /* for claim_allocations */
}
return cpu;
@@ -5786,7 +5815,7 @@ static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
/*
* build_sched_groups will build a circular linked list of the groups
* covered by the given span, and will set each group's ->cpumask correctly,
- * and ->cpu_power to 0.
+ * and ->cpu_capacity to 0.
*
* Assumes the sched_domain tree is fully constructed
*/
@@ -5840,16 +5869,16 @@ build_sched_groups(struct sched_domain *sd, int cpu)
}
/*
- * Initialize sched groups cpu_power.
+ * Initialize sched groups cpu_capacity.
*
- * cpu_power indicates the capacity of sched group, which is used while
+ * cpu_capacity indicates the capacity of sched group, which is used while
* distributing the load between different sched groups in a sched domain.
- * Typically cpu_power for all the groups in a sched domain will be same unless
- * there are asymmetries in the topology. If there are asymmetries, group
- * having more cpu_power will pickup more load compared to the group having
- * less cpu_power.
+ * Typically cpu_capacity for all the groups in a sched domain will be same
+ * unless there are asymmetries in the topology. If there are asymmetries,
+ * group having more cpu_capacity will pickup more load compared to the
+ * group having less cpu_capacity.
*/
-static void init_sched_groups_power(int cpu, struct sched_domain *sd)
+static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
{
struct sched_group *sg = sd->groups;
@@ -5863,8 +5892,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
if (cpu != group_balance_cpu(sg))
return;
- update_group_power(sd, cpu);
- atomic_set(&sg->sgp->nr_busy_cpus, sg->group_weight);
+ update_group_capacity(sd, cpu);
+ atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
}
/*
@@ -5955,8 +5984,8 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
*per_cpu_ptr(sdd->sg, cpu) = NULL;
- if (atomic_read(&(*per_cpu_ptr(sdd->sgp, cpu))->ref))
- *per_cpu_ptr(sdd->sgp, cpu) = NULL;
+ if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
+ *per_cpu_ptr(sdd->sgc, cpu) = NULL;
}
#ifdef CONFIG_NUMA
@@ -5969,7 +5998,7 @@ static int sched_domains_curr_level;
/*
* SD_flags allowed in topology descriptions.
*
- * SD_SHARE_CPUPOWER - describes SMT topologies
+ * SD_SHARE_CPUCAPACITY - describes SMT topologies
* SD_SHARE_PKG_RESOURCES - describes shared caches
* SD_NUMA - describes NUMA topologies
* SD_SHARE_POWERDOMAIN - describes shared power domain
@@ -5978,7 +6007,7 @@ static int sched_domains_curr_level;
* SD_ASYM_PACKING - describes SMT quirks
*/
#define TOPOLOGY_SD_FLAGS \
- (SD_SHARE_CPUPOWER | \
+ (SD_SHARE_CPUCAPACITY | \
SD_SHARE_PKG_RESOURCES | \
SD_NUMA | \
SD_ASYM_PACKING | \
@@ -6024,7 +6053,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
| 1*SD_BALANCE_FORK
| 0*SD_BALANCE_WAKE
| 1*SD_WAKE_AFFINE
- | 0*SD_SHARE_CPUPOWER
+ | 0*SD_SHARE_CPUCAPACITY
| 0*SD_SHARE_PKG_RESOURCES
| 0*SD_SERIALIZE
| 0*SD_PREFER_SIBLING
@@ -6046,7 +6075,7 @@ sd_init(struct sched_domain_topology_level *tl, int cpu)
* Convert topological properties into behaviour.
*/
- if (sd->flags & SD_SHARE_CPUPOWER) {
+ if (sd->flags & SD_SHARE_CPUCAPACITY) {
sd->imbalance_pct = 110;
sd->smt_gain = 1178; /* ~15% */
@@ -6358,14 +6387,14 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
if (!sdd->sg)
return -ENOMEM;
- sdd->sgp = alloc_percpu(struct sched_group_power *);
- if (!sdd->sgp)
+ sdd->sgc = alloc_percpu(struct sched_group_capacity *);
+ if (!sdd->sgc)
return -ENOMEM;
for_each_cpu(j, cpu_map) {
struct sched_domain *sd;
struct sched_group *sg;
- struct sched_group_power *sgp;
+ struct sched_group_capacity *sgc;
sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
@@ -6383,12 +6412,12 @@ static int __sdt_alloc(const struct cpumask *cpu_map)
*per_cpu_ptr(sdd->sg, j) = sg;
- sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(),
+ sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
GFP_KERNEL, cpu_to_node(j));
- if (!sgp)
+ if (!sgc)
return -ENOMEM;
- *per_cpu_ptr(sdd->sgp, j) = sgp;
+ *per_cpu_ptr(sdd->sgc, j) = sgc;
}
}
@@ -6415,15 +6444,15 @@ static void __sdt_free(const struct cpumask *cpu_map)
if (sdd->sg)
kfree(*per_cpu_ptr(sdd->sg, j));
- if (sdd->sgp)
- kfree(*per_cpu_ptr(sdd->sgp, j));
+ if (sdd->sgc)
+ kfree(*per_cpu_ptr(sdd->sgc, j));
}
free_percpu(sdd->sd);
sdd->sd = NULL;
free_percpu(sdd->sg);
sdd->sg = NULL;
- free_percpu(sdd->sgp);
- sdd->sgp = NULL;
+ free_percpu(sdd->sgc);
+ sdd->sgc = NULL;
}
}
@@ -6493,14 +6522,14 @@ static int build_sched_domains(const struct cpumask *cpu_map,
}
}
- /* Calculate CPU power for physical packages and nodes */
+ /* Calculate CPU capacity for physical packages and nodes */
for (i = nr_cpumask_bits-1; i >= 0; i--) {
if (!cpumask_test_cpu(i, cpu_map))
continue;
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
claim_allocations(i, sd);
- init_sched_groups_power(i, sd);
+ init_sched_groups_capacity(i, sd);
}
}
@@ -6943,7 +6972,7 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
- rq->cpu_power = SCHED_POWER_SCALE;
+ rq->cpu_capacity = SCHED_CAPACITY_SCALE;
rq->post_schedule = 0;
rq->active_balance = 0;
rq->next_balance = jiffies;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 2b8cbf09d1a..fc4f98b1258 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -57,8 +57,6 @@ void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime)
dl_b->dl_runtime = runtime;
}
-extern unsigned long to_ratio(u64 period, u64 runtime);
-
void init_dl_bw(struct dl_bw *dl_b)
{
raw_spin_lock_init(&dl_b->lock);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9855e87d671..fea7d3335e1 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1017,7 +1017,7 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
static unsigned long weighted_cpuload(const int cpu);
static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
-static unsigned long power_of(int cpu);
+static unsigned long capacity_of(int cpu);
static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
/* Cached statistics for all CPUs within a node */
@@ -1026,11 +1026,11 @@ struct numa_stats {
unsigned long load;
/* Total compute capacity of CPUs on a node */
- unsigned long power;
+ unsigned long compute_capacity;
/* Approximate capacity in terms of runnable tasks on a node */
- unsigned long capacity;
- int has_capacity;
+ unsigned long task_capacity;
+ int has_free_capacity;
};
/*
@@ -1046,7 +1046,7 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
ns->nr_running += rq->nr_running;
ns->load += weighted_cpuload(cpu);
- ns->power += power_of(cpu);
+ ns->compute_capacity += capacity_of(cpu);
cpus++;
}
@@ -1056,15 +1056,16 @@ static void update_numa_stats(struct numa_stats *ns, int nid)
* the @ns structure is NULL'ed and task_numa_compare() will
* not find this node attractive.
*
- * We'll either bail at !has_capacity, or we'll detect a huge imbalance
- * and bail there.
+ * We'll either bail at !has_free_capacity, or we'll detect a huge
+ * imbalance and bail there.
*/
if (!cpus)
return;
- ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
- ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
- ns->has_capacity = (ns->nr_running < ns->capacity);
+ ns->load = (ns->load * SCHED_CAPACITY_SCALE) / ns->compute_capacity;
+ ns->task_capacity =
+ DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE);
+ ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
}
struct task_numa_env {
@@ -1195,8 +1196,8 @@ static void task_numa_compare(struct task_numa_env *env,
if (!cur) {
/* Is there capacity at our destination? */
- if (env->src_stats.has_capacity &&
- !env->dst_stats.has_capacity)
+ if (env->src_stats.has_free_capacity &&
+ !env->dst_stats.has_free_capacity)
goto unlock;
goto balance;
@@ -1213,7 +1214,7 @@ balance:
orig_dst_load = env->dst_stats.load;
orig_src_load = env->src_stats.load;
- /* XXX missing power terms */
+ /* XXX missing capacity terms */
load = task_h_load(env->p);
dst_load = orig_dst_load + load;
src_load = orig_src_load - load;
@@ -1301,8 +1302,8 @@ static int task_numa_migrate(struct task_struct *p)
groupimp = group_weight(p, env.dst_nid) - groupweight;
update_numa_stats(&env.dst_stats, env.dst_nid);
- /* If the preferred nid has capacity, try to use it. */
- if (env.dst_stats.has_capacity)
+ /* If the preferred nid has free capacity, try to use it. */
+ if (env.dst_stats.has_free_capacity)
task_numa_find_cpu(&env, taskimp, groupimp);
/* No space available on the preferred nid. Look elsewhere. */
@@ -3225,10 +3226,12 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
* has not truly expired.
*
* Fortunately we can check determine whether this the case by checking
- * whether the global deadline has advanced.
+ * whether the global deadline has advanced. It is valid to compare
+ * cfs_b->runtime_expires without any locks since we only care about
+ * exact equality, so a partial write will still work.
*/
- if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
+ if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
/* extend local deadline, drift is bounded above by 2 ticks */
cfs_rq->runtime_expires += TICK_NSEC;
} else {
@@ -3457,21 +3460,21 @@ next:
static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
{
u64 runtime, runtime_expires;
- int idle = 1, throttled;
+ int throttled;
- raw_spin_lock(&cfs_b->lock);
/* no need to continue the timer with no bandwidth constraint */
if (cfs_b->quota == RUNTIME_INF)
- goto out_unlock;
+ goto out_deactivate;
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
- /* idle depends on !throttled (for the case of a large deficit) */
- idle = cfs_b->idle && !throttled;
cfs_b->nr_periods += overrun;
- /* if we're going inactive then everything else can be deferred */
- if (idle)
- goto out_unlock;
+ /*
+ * idle depends on !throttled (for the case of a large deficit), and if
+ * we're going inactive then everything else can be deferred
+ */
+ if (cfs_b->idle && !throttled)
+ goto out_deactivate;
/*
* if we have relooped after returning idle once, we need to update our
@@ -3485,7 +3488,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
if (!throttled) {
/* mark as potentially idle for the upcoming period */
cfs_b->idle = 1;
- goto out_unlock;
+ return 0;
}
/* account preceding periods in which throttling occurred */
@@ -3525,12 +3528,12 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
* timer to remain active while there are any throttled entities.)
*/
cfs_b->idle = 0;
-out_unlock:
- if (idle)
- cfs_b->timer_active = 0;
- raw_spin_unlock(&cfs_b->lock);
- return idle;
+ return 0;
+
+out_deactivate:
+ cfs_b->timer_active = 0;
+ return 1;
}
/* a cfs_rq won't donate quota below this amount */
@@ -3707,6 +3710,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
int overrun;
int idle = 0;
+ raw_spin_lock(&cfs_b->lock);
for (;;) {
now = hrtimer_cb_get_time(timer);
overrun = hrtimer_forward(timer, now, cfs_b->period);
@@ -3716,6 +3720,7 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
idle = do_sched_cfs_period_timer(cfs_b, overrun);
}
+ raw_spin_unlock(&cfs_b->lock);
return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}
@@ -3775,8 +3780,6 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
struct cfs_rq *cfs_rq;
for_each_leaf_cfs_rq(rq, cfs_rq) {
- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
-
if (!cfs_rq->runtime_enabled)
continue;
@@ -3784,7 +3787,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
* clock_task is not advancing so we just need to make sure
* there's some valid quota amount
*/
- cfs_rq->runtime_remaining = cfs_b->quota;
+ cfs_rq->runtime_remaining = 1;
if (cfs_rq_throttled(cfs_rq))
unthrottle_cfs_rq(cfs_rq);
}
@@ -4041,9 +4044,9 @@ static unsigned long target_load(int cpu, int type)
return max(rq->cpu_load[type-1], total);
}
-static unsigned long power_of(int cpu)
+static unsigned long capacity_of(int cpu)
{
- return cpu_rq(cpu)->cpu_power;
+ return cpu_rq(cpu)->cpu_capacity;
}
static unsigned long cpu_avg_load_per_task(int cpu)
@@ -4065,7 +4068,7 @@ static void record_wakee(struct task_struct *p)
* about the boundary, really active task won't care
* about the loss.
*/
- if (jiffies > current->wakee_flip_decay_ts + HZ) {
+ if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
current->wakee_flips >>= 1;
current->wakee_flip_decay_ts = jiffies;
}
@@ -4286,12 +4289,12 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
s64 this_eff_load, prev_eff_load;
this_eff_load = 100;
- this_eff_load *= power_of(prev_cpu);
+ this_eff_load *= capacity_of(prev_cpu);
this_eff_load *= this_load +
effective_load(tg, this_cpu, weight, weight);
prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
- prev_eff_load *= power_of(this_cpu);
+ prev_eff_load *= capacity_of(this_cpu);
prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
balanced = this_eff_load <= prev_eff_load;
@@ -4367,8 +4370,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
avg_load += load;
}
- /* Adjust by relative CPU power of the group */
- avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
+ /* Adjust by relative CPU capacity of the group */
+ avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
if (local_group) {
this_load = avg_load;
@@ -4948,14 +4951,14 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
*
* W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
*
- * P_i is the cpu power (or compute capacity) of cpu i, typically it is the
+ * C_i is the compute capacity of cpu i, typically it is the
* fraction of 'recent' time available for SCHED_OTHER task execution. But it
* can also include other factors [XXX].
*
* To achieve this balance we define a measure of imbalance which follows
* directly from (1):
*
- * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4)
+ * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)
*
* We them move tasks around to minimize the imbalance. In the continuous
* function space it is obvious this converges, in the discrete case we get
@@ -5530,13 +5533,13 @@ struct sg_lb_stats {
unsigned long group_load; /* Total load over the CPUs of the group */
unsigned long sum_weighted_load; /* Weighted load of group's tasks */
unsigned long load_per_task;
- unsigned long group_power;
+ unsigned long group_capacity;
unsigned int sum_nr_running; /* Nr tasks running in the group */
- unsigned int group_capacity;
+ unsigned int group_capacity_factor;
unsigned int idle_cpus;
unsigned int group_weight;
int group_imb; /* Is there an imbalance in the group ? */
- int group_has_capacity; /* Is there extra capacity in the group? */
+ int group_has_free_capacity;
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
@@ -5551,7 +5554,7 @@ struct sd_lb_stats {
struct sched_group *busiest; /* Busiest group in this sd */
struct sched_group *local; /* Local group in this sd */
unsigned long total_load; /* Total load of all groups in sd */
- unsigned long total_pwr; /* Total power of all groups in sd */
+ unsigned long total_capacity; /* Total capacity of all groups in sd */
unsigned long avg_load; /* Average load across all groups in sd */
struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
@@ -5570,7 +5573,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
.busiest = NULL,
.local = NULL,
.total_load = 0UL,
- .total_pwr = 0UL,
+ .total_capacity = 0UL,
.busiest_stat = {
.avg_load = 0UL,
},
@@ -5605,17 +5608,17 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
return load_idx;
}
-static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
+static unsigned long default_scale_capacity(struct sched_domain *sd, int cpu)
{
- return SCHED_POWER_SCALE;
+ return SCHED_CAPACITY_SCALE;
}
-unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
+unsigned long __weak arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
{
- return default_scale_freq_power(sd, cpu);
+ return default_scale_capacity(sd, cpu);
}
-static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
+static unsigned long default_scale_smt_capacity(struct sched_domain *sd, int cpu)
{
unsigned long weight = sd->span_weight;
unsigned long smt_gain = sd->smt_gain;
@@ -5625,12 +5628,12 @@ static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
return smt_gain;
}
-unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
+unsigned long __weak arch_scale_smt_capacity(struct sched_domain *sd, int cpu)
{
- return default_scale_smt_power(sd, cpu);
+ return default_scale_smt_capacity(sd, cpu);
}
-static unsigned long scale_rt_power(int cpu)
+static unsigned long scale_rt_capacity(int cpu)
{
struct rq *rq = cpu_rq(cpu);
u64 total, available, age_stamp, avg;
@@ -5650,71 +5653,71 @@ static unsigned long scale_rt_power(int cpu)
total = sched_avg_period() + delta;
if (unlikely(total < avg)) {
- /* Ensures that power won't end up being negative */
+ /* Ensures that capacity won't end up being negative */
available = 0;
} else {
available = total - avg;
}
- if (unlikely((s64)total < SCHED_POWER_SCALE))
- total = SCHED_POWER_SCALE;
+ if (unlikely((s64)total < SCHED_CAPACITY_SCALE))
+ total = SCHED_CAPACITY_SCALE;
- total >>= SCHED_POWER_SHIFT;
+ total >>= SCHED_CAPACITY_SHIFT;
return div_u64(available, total);
}
-static void update_cpu_power(struct sched_domain *sd, int cpu)
+static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
unsigned long weight = sd->span_weight;
- unsigned long power = SCHED_POWER_SCALE;
+ unsigned long capacity = SCHED_CAPACITY_SCALE;
struct sched_group *sdg = sd->groups;
- if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
- if (sched_feat(ARCH_POWER))
- power *= arch_scale_smt_power(sd, cpu);
+ if ((sd->flags & SD_SHARE_CPUCAPACITY) && weight > 1) {
+ if (sched_feat(ARCH_CAPACITY))
+ capacity *= arch_scale_smt_capacity(sd, cpu);
else
- power *= default_scale_smt_power(sd, cpu);
+ capacity *= default_scale_smt_capacity(sd, cpu);
- power >>= SCHED_POWER_SHIFT;
+ capacity >>= SCHED_CAPACITY_SHIFT;
}
- sdg->sgp->power_orig = power;
+ sdg->sgc->capacity_orig = capacity;
- if (sched_feat(ARCH_POWER))
- power *= arch_scale_freq_power(sd, cpu);
+ if (sched_feat(ARCH_CAPACITY))
+ capacity *= arch_scale_freq_capacity(sd, cpu);
else
- power *= default_scale_freq_power(sd, cpu);
+ capacity *= default_scale_capacity(sd, cpu);
- power >>= SCHED_POWER_SHIFT;
+ capacity >>= SCHED_CAPACITY_SHIFT;
- power *= scale_rt_power(cpu);
- power >>= SCHED_POWER_SHIFT;
+ capacity *= scale_rt_capacity(cpu);
+ capacity >>= SCHED_CAPACITY_SHIFT;
- if (!power)
- power = 1;
+ if (!capacity)
+ capacity = 1;
- cpu_rq(cpu)->cpu_power = power;
- sdg->sgp->power = power;
+ cpu_rq(cpu)->cpu_capacity = capacity;
+ sdg->sgc->capacity = capacity;
}
-void update_group_power(struct sched_domain *sd, int cpu)
+void update_group_capacity(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
- unsigned long power, power_orig;
+ unsigned long capacity, capacity_orig;
unsigned long interval;
interval = msecs_to_jiffies(sd->balance_interval);
interval = clamp(interval, 1UL, max_load_balance_interval);
- sdg->sgp->next_update = jiffies + interval;
+ sdg->sgc->next_update = jiffies + interval;
if (!child) {
- update_cpu_power(sd, cpu);
+ update_cpu_capacity(sd, cpu);
return;
}
- power_orig = power = 0;
+ capacity_orig = capacity = 0;
if (child->flags & SD_OVERLAP) {
/*
@@ -5723,31 +5726,31 @@ void update_group_power(struct sched_domain *sd, int cpu)
*/
for_each_cpu(cpu, sched_group_cpus(sdg)) {
- struct sched_group_power *sgp;
+ struct sched_group_capacity *sgc;
struct rq *rq = cpu_rq(cpu);
/*
- * build_sched_domains() -> init_sched_groups_power()
+ * build_sched_domains() -> init_sched_groups_capacity()
* gets here before we've attached the domains to the
* runqueues.
*
- * Use power_of(), which is set irrespective of domains
- * in update_cpu_power().
+ * Use capacity_of(), which is set irrespective of domains
+ * in update_cpu_capacity().
*
- * This avoids power/power_orig from being 0 and
+ * This avoids capacity/capacity_orig from being 0 and
* causing divide-by-zero issues on boot.
*
- * Runtime updates will correct power_orig.
+ * Runtime updates will correct capacity_orig.
*/
if (unlikely(!rq->sd)) {
- power_orig += power_of(cpu);
- power += power_of(cpu);
+ capacity_orig += capacity_of(cpu);
+ capacity += capacity_of(cpu);
continue;
}
- sgp = rq->sd->groups->sgp;
- power_orig += sgp->power_orig;
- power += sgp->power;
+ sgc = rq->sd->groups->sgc;
+ capacity_orig += sgc->capacity_orig;
+ capacity += sgc->capacity;
}
} else {
/*
@@ -5757,14 +5760,14 @@ void update_group_power(struct sched_domain *sd, int cpu)
group = child->groups;
do {
- power_orig += group->sgp->power_orig;
- power += group->sgp->power;
+ capacity_orig += group->sgc->capacity_orig;
+ capacity += group->sgc->capacity;
group = group->next;
} while (group != child->groups);
}
- sdg->sgp->power_orig = power_orig;
- sdg->sgp->power = power;
+ sdg->sgc->capacity_orig = capacity_orig;
+ sdg->sgc->capacity = capacity;
}
/*
@@ -5778,15 +5781,15 @@ static inline int
fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
{
/*
- * Only siblings can have significantly less than SCHED_POWER_SCALE
+ * Only siblings can have significantly less than SCHED_CAPACITY_SCALE
*/
- if (!(sd->flags & SD_SHARE_CPUPOWER))
+ if (!(sd->flags & SD_SHARE_CPUCAPACITY))
return 0;
/*
- * If ~90% of the cpu_power is still there, we're good.
+ * If ~90% of the cpu_capacity is still there, we're good.
*/
- if (group->sgp->power * 32 > group->sgp->power_orig * 29)
+ if (group->sgc->capacity * 32 > group->sgc->capacity_orig * 29)
return 1;
return 0;
@@ -5823,34 +5826,35 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
static inline int sg_imbalanced(struct sched_group *group)
{
- return group->sgp->imbalance;
+ return group->sgc->imbalance;
}
/*
- * Compute the group capacity.
+ * Compute the group capacity factor.
*
- * Avoid the issue where N*frac(smt_power) >= 1 creates 'phantom' cores by
+ * Avoid the issue where N*frac(smt_capacity) >= 1 creates 'phantom' cores by
* first dividing out the smt factor and computing the actual number of cores
- * and limit power unit capacity with that.
+ * and limit unit capacity with that.
*/
-static inline int sg_capacity(struct lb_env *env, struct sched_group *group)
+static inline int sg_capacity_factor(struct lb_env *env, struct sched_group *group)
{
- unsigned int capacity, smt, cpus;
- unsigned int power, power_orig;
+ unsigned int capacity_factor, smt, cpus;
+ unsigned int capacity, capacity_orig;
- power = group->sgp->power;
- power_orig = group->sgp->power_orig;
+ capacity = group->sgc->capacity;
+ capacity_orig = group->sgc->capacity_orig;
cpus = group->group_weight;
- /* smt := ceil(cpus / power), assumes: 1 < smt_power < 2 */
- smt = DIV_ROUND_UP(SCHED_POWER_SCALE * cpus, power_orig);
- capacity = cpus / smt; /* cores */
+ /* smt := ceil(cpus / capacity), assumes: 1 < smt_capacity < 2 */
+ smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, capacity_orig);
+ capacity_factor = cpus / smt; /* cores */
- capacity = min_t(unsigned, capacity, DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE));
- if (!capacity)
- capacity = fix_small_capacity(env->sd, group);
+ capacity_factor = min_t(unsigned,
+ capacity_factor, DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE));
+ if (!capacity_factor)
+ capacity_factor = fix_small_capacity(env->sd, group);
- return capacity;
+ return capacity_factor;
}
/**
@@ -5890,9 +5894,9 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->idle_cpus++;
}
- /* Adjust by relative CPU power of the group */
- sgs->group_power = group->sgp->power;
- sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / sgs->group_power;
+ /* Adjust by relative CPU capacity of the group */
+ sgs->group_capacity = group->sgc->capacity;
+ sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
if (sgs->sum_nr_running)
sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
@@ -5900,10 +5904,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
sgs->group_weight = group->group_weight;
sgs->group_imb = sg_imbalanced(group);
- sgs->group_capacity = sg_capacity(env, group);
+ sgs->group_capacity_factor = sg_capacity_factor(env, group);
- if (sgs->group_capacity > sgs->sum_nr_running)
- sgs->group_has_capacity = 1;
+ if (sgs->group_capacity_factor > sgs->sum_nr_running)
+ sgs->group_has_free_capacity = 1;
}
/**
@@ -5927,7 +5931,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
if (sgs->avg_load <= sds->busiest_stat.avg_load)
return false;
- if (sgs->sum_nr_running > sgs->group_capacity)
+ if (sgs->sum_nr_running > sgs->group_capacity_factor)
return true;
if (sgs->group_imb)
@@ -6007,8 +6011,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
sgs = &sds->local_stat;
if (env->idle != CPU_NEWLY_IDLE ||
- time_after_eq(jiffies, sg->sgp->next_update))
- update_group_power(env->sd, env->dst_cpu);
+ time_after_eq(jiffies, sg->sgc->next_update))
+ update_group_capacity(env->sd, env->dst_cpu);
}
update_sg_lb_stats(env, sg, load_idx, local_group, sgs);
@@ -6018,17 +6022,17 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
/*
* In case the child domain prefers tasks go to siblings
- * first, lower the sg capacity to one so that we'll try
+ * first, lower the sg capacity factor to one so that we'll try
* and move all the excess tasks away. We lower the capacity
* of a group only if the local group has the capacity to fit
- * these excess tasks, i.e. nr_running < group_capacity. The
+ * these excess tasks, i.e. nr_running < group_capacity_factor. The
* extra check prevents the case where you always pull from the
* heaviest group when it is already under-utilized (possible
* with a large weight task outweighs the tasks on the system).
*/
if (prefer_sibling && sds->local &&
- sds->local_stat.group_has_capacity)
- sgs->group_capacity = min(sgs->group_capacity, 1U);
+ sds->local_stat.group_has_free_capacity)
+ sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U);
if (update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
@@ -6038,7 +6042,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
next_group:
/* Now, start updating sd_lb_stats */
sds->total_load += sgs->group_load;
- sds->total_pwr += sgs->group_power;
+ sds->total_capacity += sgs->group_capacity;
sg = sg->next;
} while (sg != env->sd->groups);
@@ -6085,8 +6089,8 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
return 0;
env->imbalance = DIV_ROUND_CLOSEST(
- sds->busiest_stat.avg_load * sds->busiest_stat.group_power,
- SCHED_POWER_SCALE);
+ sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
+ SCHED_CAPACITY_SCALE);
return 1;
}
@@ -6101,7 +6105,7 @@ static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
static inline
void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
- unsigned long tmp, pwr_now = 0, pwr_move = 0;
+ unsigned long tmp, capa_now = 0, capa_move = 0;
unsigned int imbn = 2;
unsigned long scaled_busy_load_per_task;
struct sg_lb_stats *local, *busiest;
@@ -6115,8 +6119,8 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
imbn = 1;
scaled_busy_load_per_task =
- (busiest->load_per_task * SCHED_POWER_SCALE) /
- busiest->group_power;
+ (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
+ busiest->group_capacity;
if (busiest->avg_load + scaled_busy_load_per_task >=
local->avg_load + (scaled_busy_load_per_task * imbn)) {
@@ -6126,38 +6130,38 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
/*
* OK, we don't have enough imbalance to justify moving tasks,
- * however we may be able to increase total CPU power used by
+ * however we may be able to increase total CPU capacity used by
* moving them.
*/
- pwr_now += busiest->group_power *
+ capa_now += busiest->group_capacity *
min(busiest->load_per_task, busiest->avg_load);
- pwr_now += local->group_power *
+ capa_now += local->group_capacity *
min(local->load_per_task, local->avg_load);
- pwr_now /= SCHED_POWER_SCALE;
+ capa_now /= SCHED_CAPACITY_SCALE;
/* Amount of load we'd subtract */
if (busiest->avg_load > scaled_busy_load_per_task) {
- pwr_move += busiest->group_power *
+ capa_move += busiest->group_capacity *
min(busiest->load_per_task,
busiest->avg_load - scaled_busy_load_per_task);
}
/* Amount of load we'd add */
- if (busiest->avg_load * busiest->group_power <
- busiest->load_per_task * SCHED_POWER_SCALE) {
- tmp = (busiest->avg_load * busiest->group_power) /
- local->group_power;
+ if (busiest->avg_load * busiest->group_capacity <
+ busiest->load_per_task * SCHED_CAPACITY_SCALE) {
+ tmp = (busiest->avg_load * busiest->group_capacity) /
+ local->group_capacity;
} else {
- tmp = (busiest->load_per_task * SCHED_POWER_SCALE) /
- local->group_power;
+ tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
+ local->group_capacity;
}
- pwr_move += local->group_power *
+ capa_move += local->group_capacity *
min(local->load_per_task, local->avg_load + tmp);
- pwr_move /= SCHED_POWER_SCALE;
+ capa_move /= SCHED_CAPACITY_SCALE;
/* Move if we gain throughput */
- if (pwr_move > pwr_now)
+ if (capa_move > capa_now)
env->imbalance = busiest->load_per_task;
}
@@ -6187,7 +6191,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
/*
* In the presence of smp nice balancing, certain scenarios can have
* max load less than avg load(as we skip the groups at or below
- * its cpu_power, while calculating max_load..)
+ * its cpu_capacity, while calculating max_load..)
*/
if (busiest->avg_load <= sds->avg_load ||
local->avg_load >= sds->avg_load) {
@@ -6202,10 +6206,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* have to drop below capacity to reach cpu-load equilibrium.
*/
load_above_capacity =
- (busiest->sum_nr_running - busiest->group_capacity);
+ (busiest->sum_nr_running - busiest->group_capacity_factor);
- load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
- load_above_capacity /= busiest->group_power;
+ load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_CAPACITY_SCALE);
+ load_above_capacity /= busiest->group_capacity;
}
/*
@@ -6220,9 +6224,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
/* How much load to actually move to equalise the imbalance */
env->imbalance = min(
- max_pull * busiest->group_power,
- (sds->avg_load - local->avg_load) * local->group_power
- ) / SCHED_POWER_SCALE;
+ max_pull * busiest->group_capacity,
+ (sds->avg_load - local->avg_load) * local->group_capacity
+ ) / SCHED_CAPACITY_SCALE;
/*
* if *imbalance is less than the average load per runnable task
@@ -6276,7 +6280,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
if (!sds.busiest || busiest->sum_nr_running == 0)
goto out_balanced;
- sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
+ sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
+ / sds.total_capacity;
/*
* If the busiest group is imbalanced the below checks don't
@@ -6287,8 +6292,8 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
goto force_balance;
/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
- if (env->idle == CPU_NEWLY_IDLE && local->group_has_capacity &&
- !busiest->group_has_capacity)
+ if (env->idle == CPU_NEWLY_IDLE && local->group_has_free_capacity &&
+ !busiest->group_has_free_capacity)
goto force_balance;
/*
@@ -6342,11 +6347,11 @@ static struct rq *find_busiest_queue(struct lb_env *env,
struct sched_group *group)
{
struct rq *busiest = NULL, *rq;
- unsigned long busiest_load = 0, busiest_power = 1;
+ unsigned long busiest_load = 0, busiest_capacity = 1;
int i;
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
- unsigned long power, capacity, wl;
+ unsigned long capacity, capacity_factor, wl;
enum fbq_type rt;
rq = cpu_rq(i);
@@ -6374,34 +6379,34 @@ static struct rq *find_busiest_queue(struct lb_env *env,
if (rt > env->fbq_type)
continue;
- power = power_of(i);
- capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
- if (!capacity)
- capacity = fix_small_capacity(env->sd, group);
+ capacity = capacity_of(i);
+ capacity_factor = DIV_ROUND_CLOSEST(capacity, SCHED_CAPACITY_SCALE);
+ if (!capacity_factor)
+ capacity_factor = fix_small_capacity(env->sd, group);
wl = weighted_cpuload(i);
/*
* When comparing with imbalance, use weighted_cpuload()
- * which is not scaled with the cpu power.
+ * which is not scaled with the cpu capacity.
*/
- if (capacity && rq->nr_running == 1 && wl > env->imbalance)
+ if (capacity_factor && rq->nr_running == 1 && wl > env->imbalance)
continue;
/*
* For the load comparisons with the other cpu's, consider
- * the weighted_cpuload() scaled with the cpu power, so that
- * the load can be moved away from the cpu that is potentially
- * running at a lower capacity.
+ * the weighted_cpuload() scaled with the cpu capacity, so
+ * that the load can be moved away from the cpu that is
+ * potentially running at a lower capacity.
*
- * Thus we're looking for max(wl_i / power_i), crosswise
+ * Thus we're looking for max(wl_i / capacity_i), crosswise
* multiplication to rid ourselves of the division works out
- * to: wl_i * power_j > wl_j * power_i; where j is our
- * previous maximum.
+ * to: wl_i * capacity_j > wl_j * capacity_i; where j is
+ * our previous maximum.
*/
- if (wl * busiest_power > busiest_load * power) {
+ if (wl * busiest_capacity > busiest_load * capacity) {
busiest_load = wl;
- busiest_power = power;
+ busiest_capacity = capacity;
busiest = rq;
}
}
@@ -6609,7 +6614,7 @@ more_balance:
* We failed to reach balance because of affinity.
*/
if (sd_parent) {
- int *group_imbalance = &sd_parent->groups->sgp->imbalance;
+ int *group_imbalance = &sd_parent->groups->sgc->imbalance;
if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
*group_imbalance = 1;
@@ -6996,7 +7001,7 @@ static inline void set_cpu_sd_state_busy(void)
goto unlock;
sd->nohz_idle = 0;
- atomic_inc(&sd->groups->sgp->nr_busy_cpus);
+ atomic_inc(&sd->groups->sgc->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
@@ -7013,7 +7018,7 @@ void set_cpu_sd_state_idle(void)
goto unlock;
sd->nohz_idle = 1;
- atomic_dec(&sd->groups->sgp->nr_busy_cpus);
+ atomic_dec(&sd->groups->sgc->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
@@ -7192,12 +7197,17 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
rq = cpu_rq(balance_cpu);
- raw_spin_lock_irq(&rq->lock);
- update_rq_clock(rq);
- update_idle_cpu_load(rq);
- raw_spin_unlock_irq(&rq->lock);
-
- rebalance_domains(rq, CPU_IDLE);
+ /*
+ * If time for next balance is due,
+ * do the balance.
+ */
+ if (time_after_eq(jiffies, rq->next_balance)) {
+ raw_spin_lock_irq(&rq->lock);
+ update_rq_clock(rq);
+ update_idle_cpu_load(rq);
+ raw_spin_unlock_irq(&rq->lock);
+ rebalance_domains(rq, CPU_IDLE);
+ }
if (time_after(this_rq->next_balance, rq->next_balance))
this_rq->next_balance = rq->next_balance;
@@ -7212,7 +7222,7 @@ end:
* of an idle cpu is the system.
* - This rq has more than one task.
* - At any scheduler domain level, this cpu's scheduler group has multiple
- * busy cpu's exceeding the group's power.
+ * busy cpu's exceeding the group's capacity.
* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
* domain span are idle.
*/
@@ -7220,7 +7230,7 @@ static inline int nohz_kick_needed(struct rq *rq)
{
unsigned long now = jiffies;
struct sched_domain *sd;
- struct sched_group_power *sgp;
+ struct sched_group_capacity *sgc;
int nr_busy, cpu = rq->cpu;
if (unlikely(rq->idle_balance))
@@ -7250,8 +7260,8 @@ static inline int nohz_kick_needed(struct rq *rq)
sd = rcu_dereference(per_cpu(sd_busy, cpu));
if (sd) {
- sgp = sd->groups->sgp;
- nr_busy = atomic_read(&sgp->nr_busy_cpus);
+ sgc = sd->groups->sgc;
+ nr_busy = atomic_read(&sgc->nr_busy_cpus);
if (nr_busy > 1)
goto need_kick_unlock;
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 5716929a2e3..90284d117fe 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -37,18 +37,18 @@ SCHED_FEAT(CACHE_HOT_BUDDY, true)
SCHED_FEAT(WAKEUP_PREEMPTION, true)
/*
- * Use arch dependent cpu power functions
+ * Use arch dependent cpu capacity functions
*/
-SCHED_FEAT(ARCH_POWER, true)
+SCHED_FEAT(ARCH_CAPACITY, true)
SCHED_FEAT(HRTICK, false)
SCHED_FEAT(DOUBLE_TICK, false)
SCHED_FEAT(LB_BIAS, true)
/*
- * Decrement CPU power based on time not spent running tasks
+ * Decrement CPU capacity based on time not spent running tasks
*/
-SCHED_FEAT(NONTASK_POWER, true)
+SCHED_FEAT(NONTASK_CAPACITY, true)
/*
* Queue remote wakeups on the target CPU and process them
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 25b9423abce..cf009fb0bc2 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -12,6 +12,8 @@
#include <trace/events/power.h>
+#include "sched.h"
+
static int __read_mostly cpu_idle_force_poll;
void cpu_idle_poll_ctrl(bool enable)
@@ -67,6 +69,10 @@ void __weak arch_cpu_idle(void)
* cpuidle_idle_call - the main idle function
*
* NOTE: no locks or semaphores should be used here
+ *
+ * On archs that support TIF_POLLING_NRFLAG, is called with polling
+ * set, and it returns with polling set. If it ever stops polling, it
+ * must clear the polling bit.
*/
static void cpuidle_idle_call(void)
{
@@ -175,10 +181,22 @@ exit_idle:
/*
* Generic idle loop implementation
+ *
+ * Called with polling cleared.
*/
static void cpu_idle_loop(void)
{
while (1) {
+ /*
+ * If the arch has a polling bit, we maintain an invariant:
+ *
+ * Our polling bit is clear if we're not scheduled (i.e. if
+ * rq->curr != rq->idle). This means that, if rq->idle has
+ * the polling bit set, then setting need_resched is
+ * guaranteed to cause the cpu to reschedule.
+ */
+
+ __current_set_polling();
tick_nohz_idle_enter();
while (!need_resched()) {
@@ -218,6 +236,17 @@ static void cpu_idle_loop(void)
*/
preempt_set_need_resched();
tick_nohz_idle_exit();
+ __current_clr_polling();
+
+ /*
+ * We promise to call sched_ttwu_pending and reschedule
+ * if need_resched is set while polling is set. That
+ * means that clearing polling needs to be visible
+ * before doing these things.
+ */
+ smp_mb__after_atomic();
+
+ sched_ttwu_pending();
schedule_preempt_disabled();
}
}
@@ -239,7 +268,6 @@ void cpu_startup_entry(enum cpuhp_state state)
*/
boot_init_stack_canary();
#endif
- __current_set_polling();
arch_cpu_idle_prepare();
cpu_idle_loop();
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index b3512f1afce..a49083192c6 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -918,7 +918,6 @@ static void update_curr_rt(struct rq *rq)
{
struct task_struct *curr = rq->curr;
struct sched_rt_entity *rt_se = &curr->rt;
- struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
u64 delta_exec;
if (curr->sched_class != &rt_sched_class)
@@ -943,7 +942,7 @@ static void update_curr_rt(struct rq *rq)
return;
for_each_sched_rt_entity(rt_se) {
- rt_rq = rt_rq_of_se(rt_se);
+ struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
raw_spin_lock(&rt_rq->rt_runtime_lock);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index e47679b04d1..31cc02ebc54 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -567,7 +567,7 @@ struct rq {
struct root_domain *rd;
struct sched_domain *sd;
- unsigned long cpu_power;
+ unsigned long cpu_capacity;
unsigned char idle_balance;
/* For active balancing */
@@ -670,6 +670,8 @@ extern int migrate_swap(struct task_struct *, struct task_struct *);
#ifdef CONFIG_SMP
+extern void sched_ttwu_pending(void);
+
#define rcu_dereference_check_sched_domain(p) \
rcu_dereference_check((p), \
lockdep_is_held(&sched_domains_mutex))
@@ -728,15 +730,15 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa);
DECLARE_PER_CPU(struct sched_domain *, sd_busy);
DECLARE_PER_CPU(struct sched_domain *, sd_asym);
-struct sched_group_power {
+struct sched_group_capacity {
atomic_t ref;
/*
- * CPU power of this group, SCHED_LOAD_SCALE being max power for a
- * single CPU.
+ * CPU capacity of this group, SCHED_LOAD_SCALE being max capacity
+ * for a single CPU.
*/
- unsigned int power, power_orig;
+ unsigned int capacity, capacity_orig;
unsigned long next_update;
- int imbalance; /* XXX unrelated to power but shared group state */
+ int imbalance; /* XXX unrelated to capacity but shared group state */
/*
* Number of busy cpus in this group.
*/
@@ -750,7 +752,7 @@ struct sched_group {
atomic_t ref;
unsigned int group_weight;
- struct sched_group_power *sgp;
+ struct sched_group_capacity *sgc;
/*
* The CPUs this group covers.
@@ -773,7 +775,7 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
*/
static inline struct cpumask *sched_group_mask(struct sched_group *sg)
{
- return to_cpumask(sg->sgp->cpumask);
+ return to_cpumask(sg->sgc->cpumask);
}
/**
@@ -787,6 +789,10 @@ static inline unsigned int group_first_cpu(struct sched_group *group)
extern int group_balance_cpu(struct sched_group *sg);
+#else
+
+static inline void sched_ttwu_pending(void) { }
+
#endif /* CONFIG_SMP */
#include "stats.h"
@@ -1167,7 +1173,7 @@ extern const struct sched_class idle_sched_class;
#ifdef CONFIG_SMP
-extern void update_group_power(struct sched_domain *sd, int cpu);
+extern void update_group_capacity(struct sched_domain *sd, int cpu);
extern void trigger_load_balance(struct rq *rq);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index f6d76bebe69..301bbc24739 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -54,8 +54,7 @@
struct seccomp_filter {
atomic_t usage;
struct seccomp_filter *prev;
- unsigned short len; /* Instruction count */
- struct sock_filter_int insnsi[];
+ struct sk_filter *prog;
};
/* Limit any path through the tree to 256KB worth of instructions. */
@@ -104,60 +103,59 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
u32 k = ftest->k;
switch (code) {
- case BPF_S_LD_W_ABS:
+ case BPF_LD | BPF_W | BPF_ABS:
ftest->code = BPF_LDX | BPF_W | BPF_ABS;
/* 32-bit aligned and not out of bounds. */
if (k >= sizeof(struct seccomp_data) || k & 3)
return -EINVAL;
continue;
- case BPF_S_LD_W_LEN:
+ case BPF_LD | BPF_W | BPF_LEN:
ftest->code = BPF_LD | BPF_IMM;
ftest->k = sizeof(struct seccomp_data);
continue;
- case BPF_S_LDX_W_LEN:
+ case BPF_LDX | BPF_W | BPF_LEN:
ftest->code = BPF_LDX | BPF_IMM;
ftest->k = sizeof(struct seccomp_data);
continue;
/* Explicitly include allowed calls. */
- case BPF_S_RET_K:
- case BPF_S_RET_A:
- case BPF_S_ALU_ADD_K:
- case BPF_S_ALU_ADD_X:
- case BPF_S_ALU_SUB_K:
- case BPF_S_ALU_SUB_X:
- case BPF_S_ALU_MUL_K:
- case BPF_S_ALU_MUL_X:
- case BPF_S_ALU_DIV_X:
- case BPF_S_ALU_AND_K:
- case BPF_S_ALU_AND_X:
- case BPF_S_ALU_OR_K:
- case BPF_S_ALU_OR_X:
- case BPF_S_ALU_XOR_K:
- case BPF_S_ALU_XOR_X:
- case BPF_S_ALU_LSH_K:
- case BPF_S_ALU_LSH_X:
- case BPF_S_ALU_RSH_K:
- case BPF_S_ALU_RSH_X:
- case BPF_S_ALU_NEG:
- case BPF_S_LD_IMM:
- case BPF_S_LDX_IMM:
- case BPF_S_MISC_TAX:
- case BPF_S_MISC_TXA:
- case BPF_S_ALU_DIV_K:
- case BPF_S_LD_MEM:
- case BPF_S_LDX_MEM:
- case BPF_S_ST:
- case BPF_S_STX:
- case BPF_S_JMP_JA:
- case BPF_S_JMP_JEQ_K:
- case BPF_S_JMP_JEQ_X:
- case BPF_S_JMP_JGE_K:
- case BPF_S_JMP_JGE_X:
- case BPF_S_JMP_JGT_K:
- case BPF_S_JMP_JGT_X:
- case BPF_S_JMP_JSET_K:
- case BPF_S_JMP_JSET_X:
- sk_decode_filter(ftest, ftest);
+ case BPF_RET | BPF_K:
+ case BPF_RET | BPF_A:
+ case BPF_ALU | BPF_ADD | BPF_K:
+ case BPF_ALU | BPF_ADD | BPF_X:
+ case BPF_ALU | BPF_SUB | BPF_K:
+ case BPF_ALU | BPF_SUB | BPF_X:
+ case BPF_ALU | BPF_MUL | BPF_K:
+ case BPF_ALU | BPF_MUL | BPF_X:
+ case BPF_ALU | BPF_DIV | BPF_K:
+ case BPF_ALU | BPF_DIV | BPF_X:
+ case BPF_ALU | BPF_AND | BPF_K:
+ case BPF_ALU | BPF_AND | BPF_X:
+ case BPF_ALU | BPF_OR | BPF_K:
+ case BPF_ALU | BPF_OR | BPF_X:
+ case BPF_ALU | BPF_XOR | BPF_K:
+ case BPF_ALU | BPF_XOR | BPF_X:
+ case BPF_ALU | BPF_LSH | BPF_K:
+ case BPF_ALU | BPF_LSH | BPF_X:
+ case BPF_ALU | BPF_RSH | BPF_K:
+ case BPF_ALU | BPF_RSH | BPF_X:
+ case BPF_ALU | BPF_NEG:
+ case BPF_LD | BPF_IMM:
+ case BPF_LDX | BPF_IMM:
+ case BPF_MISC | BPF_TAX:
+ case BPF_MISC | BPF_TXA:
+ case BPF_LD | BPF_MEM:
+ case BPF_LDX | BPF_MEM:
+ case BPF_ST:
+ case BPF_STX:
+ case BPF_JMP | BPF_JA:
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JEQ | BPF_X:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_X:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_X:
+ case BPF_JMP | BPF_JSET | BPF_K:
+ case BPF_JMP | BPF_JSET | BPF_X:
continue;
default:
return -EINVAL;
@@ -189,7 +187,8 @@ static u32 seccomp_run_filters(int syscall)
* value always takes priority (ignoring the DATA).
*/
for (f = current->seccomp.filter; f; f = f->prev) {
- u32 cur_ret = sk_run_filter_int_seccomp(&sd, f->insnsi);
+ u32 cur_ret = SK_RUN_FILTER(f->prog, (void *)&sd);
+
if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION))
ret = cur_ret;
}
@@ -215,7 +214,7 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
return -EINVAL;
for (filter = current->seccomp.filter; filter; filter = filter->prev)
- total_insns += filter->len + 4; /* include a 4 instr penalty */
+ total_insns += filter->prog->len + 4; /* include a 4 instr penalty */
if (total_insns > MAX_INSNS_PER_PATH)
return -ENOMEM;
@@ -256,19 +255,25 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
/* Allocate a new seccomp_filter */
ret = -ENOMEM;
- filter = kzalloc(sizeof(struct seccomp_filter) +
- sizeof(struct sock_filter_int) * new_len,
+ filter = kzalloc(sizeof(struct seccomp_filter),
GFP_KERNEL|__GFP_NOWARN);
if (!filter)
goto free_prog;
- ret = sk_convert_filter(fp, fprog->len, filter->insnsi, &new_len);
- if (ret)
+ filter->prog = kzalloc(sk_filter_size(new_len),
+ GFP_KERNEL|__GFP_NOWARN);
+ if (!filter->prog)
goto free_filter;
+
+ ret = sk_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len);
+ if (ret)
+ goto free_filter_prog;
kfree(fp);
atomic_set(&filter->usage, 1);
- filter->len = new_len;
+ filter->prog->len = new_len;
+
+ sk_filter_select_runtime(filter->prog);
/*
* If there is an existing filter, make it the prev and don't drop its
@@ -278,6 +283,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
current->seccomp.filter = filter;
return 0;
+free_filter_prog:
+ kfree(filter->prog);
free_filter:
kfree(filter);
free_prog:
@@ -330,6 +337,7 @@ void put_seccomp_filter(struct task_struct *tsk)
while (orig && atomic_dec_and_test(&orig->usage)) {
struct seccomp_filter *freeme = orig;
orig = orig->prev;
+ sk_filter_free(freeme->prog);
kfree(freeme);
}
}
diff --git a/kernel/smp.c b/kernel/smp.c
index 306f8180b0d..80c33f8de14 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -29,6 +29,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data);
static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
+static void flush_smp_call_function_queue(bool warn_cpu_offline);
+
static int
hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
@@ -51,12 +53,27 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
#ifdef CONFIG_HOTPLUG_CPU
case CPU_UP_CANCELED:
case CPU_UP_CANCELED_FROZEN:
+ /* Fall-through to the CPU_DEAD[_FROZEN] case. */
case CPU_DEAD:
case CPU_DEAD_FROZEN:
free_cpumask_var(cfd->cpumask);
free_percpu(cfd->csd);
break;
+
+ case CPU_DYING:
+ case CPU_DYING_FROZEN:
+ /*
+ * The IPIs for the smp-call-function callbacks queued by other
+ * CPUs might arrive late, either due to hardware latencies or
+ * because this CPU disabled interrupts (inside stop-machine)
+ * before the IPIs were sent. So flush out any pending callbacks
+ * explicitly (without waiting for the IPIs to arrive), to
+ * ensure that the outgoing CPU doesn't go offline with work
+ * still pending.
+ */
+ flush_smp_call_function_queue(false);
+ break;
#endif
};
@@ -177,23 +194,47 @@ static int generic_exec_single(int cpu, struct call_single_data *csd,
return 0;
}
-/*
- * Invoked by arch to handle an IPI for call function single. Must be
- * called from the arch with interrupts disabled.
+/**
+ * generic_smp_call_function_single_interrupt - Execute SMP IPI callbacks
+ *
+ * Invoked by arch to handle an IPI for call function single.
+ * Must be called with interrupts disabled.
*/
void generic_smp_call_function_single_interrupt(void)
{
+ flush_smp_call_function_queue(true);
+}
+
+/**
+ * flush_smp_call_function_queue - Flush pending smp-call-function callbacks
+ *
+ * @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an
+ * offline CPU. Skip this check if set to 'false'.
+ *
+ * Flush any pending smp-call-function callbacks queued on this CPU. This is
+ * invoked by the generic IPI handler, as well as by a CPU about to go offline,
+ * to ensure that all pending IPI callbacks are run before it goes completely
+ * offline.
+ *
+ * Loop through the call_single_queue and run all the queued callbacks.
+ * Must be called with interrupts disabled.
+ */
+static void flush_smp_call_function_queue(bool warn_cpu_offline)
+{
+ struct llist_head *head;
struct llist_node *entry;
struct call_single_data *csd, *csd_next;
static bool warned;
- entry = llist_del_all(&__get_cpu_var(call_single_queue));
+ WARN_ON(!irqs_disabled());
+
+ head = &__get_cpu_var(call_single_queue);
+ entry = llist_del_all(head);
entry = llist_reverse_order(entry);
- /*
- * Shouldn't receive this interrupt on a cpu that is not yet online.
- */
- if (unlikely(!cpu_online(smp_processor_id()) && !warned)) {
+ /* There shouldn't be any pending callbacks on an offline CPU. */
+ if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) &&
+ !warned && !llist_empty(head))) {
warned = true;
WARN(1, "IPI on offline CPU %d\n", smp_processor_id());
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index db19e3e2aa4..75b22e22a72 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -136,7 +136,6 @@ static unsigned long dirty_bytes_min = 2 * PAGE_SIZE;
/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
static int maxolduid = 65535;
static int minolduid;
-static int min_percpu_pagelist_fract = 8;
static int ngroups_max = NGROUPS_MAX;
static const int cap_last_cap = CAP_LAST_CAP;
@@ -152,10 +151,6 @@ static unsigned long hung_task_timeout_max = (LONG_MAX/HZ);
#ifdef CONFIG_SPARC
#endif
-#ifdef CONFIG_SPARC64
-extern int sysctl_tsb_ratio;
-#endif
-
#ifdef __hppa__
extern int pwrsw_enabled;
#endif
@@ -865,6 +860,17 @@ static struct ctl_table kern_table[] = {
.extra1 = &zero,
.extra2 = &one,
},
+#ifdef CONFIG_SMP
+ {
+ .procname = "softlockup_all_cpu_backtrace",
+ .data = &sysctl_softlockup_all_cpu_backtrace,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif /* CONFIG_SMP */
{
.procname = "nmi_watchdog",
.data = &watchdog_user_enabled,
@@ -1321,7 +1327,7 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(percpu_pagelist_fraction),
.mode = 0644,
.proc_handler = percpu_pagelist_fraction_sysctl_handler,
- .extra1 = &min_percpu_pagelist_fract,
+ .extra1 = &zero,
},
#ifdef CONFIG_MMU
{
@@ -2568,11 +2574,11 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
bool first = 1;
size_t left = *lenp;
unsigned long bitmap_len = table->maxlen;
- unsigned long *bitmap = (unsigned long *) table->data;
+ unsigned long *bitmap = *(unsigned long **) table->data;
unsigned long *tmp_bitmap = NULL;
char tr_a[] = { '-', ',', '\n' }, tr_b[] = { ',', '\n', 0 }, c;
- if (!bitmap_len || !left || (*ppos && !write)) {
+ if (!bitmap || !bitmap_len || !left || (*ppos && !write)) {
*lenp = 0;
return 0;
}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index c634868c292..7c56c3d0694 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -543,7 +543,7 @@ static void rb_wake_up_waiters(struct irq_work *work)
* as data is added to any of the @buffer's cpu buffers. Otherwise
* it will wait for data to be added to a specific cpu buffer.
*/
-void ring_buffer_wait(struct ring_buffer *buffer, int cpu)
+int ring_buffer_wait(struct ring_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
DEFINE_WAIT(wait);
@@ -557,6 +557,8 @@ void ring_buffer_wait(struct ring_buffer *buffer, int cpu)
if (cpu == RING_BUFFER_ALL_CPUS)
work = &buffer->irq_work;
else {
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ return -ENODEV;
cpu_buffer = buffer->buffers[cpu];
work = &cpu_buffer->irq_work;
}
@@ -591,6 +593,7 @@ void ring_buffer_wait(struct ring_buffer *buffer, int cpu)
schedule();
finish_wait(&work->waiters, &wait);
+ return 0;
}
/**
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 16f7038d1f4..f243444a377 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1085,13 +1085,13 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
}
#endif /* CONFIG_TRACER_MAX_TRACE */
-static void wait_on_pipe(struct trace_iterator *iter)
+static int wait_on_pipe(struct trace_iterator *iter)
{
/* Iterators are static, they should be filled or empty */
if (trace_buffer_iter(iter, iter->cpu_file))
- return;
+ return 0;
- ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file);
+ return ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file);
}
#ifdef CONFIG_FTRACE_STARTUP_TEST
@@ -1338,7 +1338,7 @@ static int trace_create_savedcmd(void)
{
int ret;
- savedcmd = kmalloc(sizeof(struct saved_cmdlines_buffer), GFP_KERNEL);
+ savedcmd = kmalloc(sizeof(*savedcmd), GFP_KERNEL);
if (!savedcmd)
return -ENOMEM;
@@ -1396,7 +1396,6 @@ void tracing_start(void)
arch_spin_unlock(&global_trace.max_lock);
- ftrace_start();
out:
raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
}
@@ -1443,7 +1442,6 @@ void tracing_stop(void)
struct ring_buffer *buffer;
unsigned long flags;
- ftrace_stop();
raw_spin_lock_irqsave(&global_trace.start_lock, flags);
if (global_trace.stop_count++)
goto out;
@@ -3840,7 +3838,7 @@ tracing_saved_cmdlines_size_read(struct file *filp, char __user *ubuf,
int r;
arch_spin_lock(&trace_cmdline_lock);
- r = sprintf(buf, "%u\n", savedcmd->cmdline_num);
+ r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num);
arch_spin_unlock(&trace_cmdline_lock);
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
@@ -3857,7 +3855,7 @@ static int tracing_resize_saved_cmdlines(unsigned int val)
{
struct saved_cmdlines_buffer *s, *savedcmd_temp;
- s = kmalloc(sizeof(struct saved_cmdlines_buffer), GFP_KERNEL);
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
if (!s)
return -ENOMEM;
@@ -4378,6 +4376,7 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table)
static int tracing_wait_pipe(struct file *filp)
{
struct trace_iterator *iter = filp->private_data;
+ int ret;
while (trace_empty(iter)) {
@@ -4399,10 +4398,13 @@ static int tracing_wait_pipe(struct file *filp)
mutex_unlock(&iter->mutex);
- wait_on_pipe(iter);
+ ret = wait_on_pipe(iter);
mutex_lock(&iter->mutex);
+ if (ret)
+ return ret;
+
if (signal_pending(current))
return -EINTR;
}
@@ -5327,8 +5329,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
goto out_unlock;
}
mutex_unlock(&trace_types_lock);
- wait_on_pipe(iter);
+ ret = wait_on_pipe(iter);
mutex_lock(&trace_types_lock);
+ if (ret) {
+ size = ret;
+ goto out_unlock;
+ }
if (signal_pending(current)) {
size = -EINTR;
goto out_unlock;
@@ -5538,8 +5544,10 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
goto out;
}
mutex_unlock(&trace_types_lock);
- wait_on_pipe(iter);
+ ret = wait_on_pipe(iter);
mutex_lock(&trace_types_lock);
+ if (ret)
+ goto out;
if (signal_pending(current)) {
ret = -EINTR;
goto out;
@@ -6232,22 +6240,25 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
return 0;
}
+static void free_trace_buffer(struct trace_buffer *buf)
+{
+ if (buf->buffer) {
+ ring_buffer_free(buf->buffer);
+ buf->buffer = NULL;
+ free_percpu(buf->data);
+ buf->data = NULL;
+ }
+}
+
static void free_trace_buffers(struct trace_array *tr)
{
if (!tr)
return;
- if (tr->trace_buffer.buffer) {
- ring_buffer_free(tr->trace_buffer.buffer);
- tr->trace_buffer.buffer = NULL;
- free_percpu(tr->trace_buffer.data);
- }
+ free_trace_buffer(&tr->trace_buffer);
#ifdef CONFIG_TRACER_MAX_TRACE
- if (tr->max_buffer.buffer) {
- ring_buffer_free(tr->max_buffer.buffer);
- tr->max_buffer.buffer = NULL;
- }
+ free_trace_buffer(&tr->max_buffer);
#endif
}
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 9e82551dd56..9258f5a815d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -252,7 +252,7 @@ static inline struct trace_array *top_trace_array(void)
{
struct trace_array *tr;
- if (list_empty(ftrace_trace_arrays.prev))
+ if (list_empty(&ftrace_trace_arrays))
return NULL;
tr = list_entry(ftrace_trace_arrays.prev,
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index c894614de14..5d12bb407b4 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -248,8 +248,8 @@ void perf_trace_del(struct perf_event *p_event, int flags)
tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event);
}
-__kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
- struct pt_regs *regs, int *rctxp)
+void *perf_trace_buf_prepare(int size, unsigned short type,
+ struct pt_regs *regs, int *rctxp)
{
struct trace_entry *entry;
unsigned long flags;
@@ -281,6 +281,7 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type,
return raw_data;
}
EXPORT_SYMBOL_GPL(perf_trace_buf_prepare);
+NOKPROBE_SYMBOL(perf_trace_buf_prepare);
#ifdef CONFIG_FUNCTION_TRACER
static void
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index ef2fba1f46b..282f6e4e553 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -40,27 +40,27 @@ struct trace_kprobe {
(sizeof(struct probe_arg) * (n)))
-static __kprobes bool trace_kprobe_is_return(struct trace_kprobe *tk)
+static nokprobe_inline bool trace_kprobe_is_return(struct trace_kprobe *tk)
{
return tk->rp.handler != NULL;
}
-static __kprobes const char *trace_kprobe_symbol(struct trace_kprobe *tk)
+static nokprobe_inline const char *trace_kprobe_symbol(struct trace_kprobe *tk)
{
return tk->symbol ? tk->symbol : "unknown";
}
-static __kprobes unsigned long trace_kprobe_offset(struct trace_kprobe *tk)
+static nokprobe_inline unsigned long trace_kprobe_offset(struct trace_kprobe *tk)
{
return tk->rp.kp.offset;
}
-static __kprobes bool trace_kprobe_has_gone(struct trace_kprobe *tk)
+static nokprobe_inline bool trace_kprobe_has_gone(struct trace_kprobe *tk)
{
return !!(kprobe_gone(&tk->rp.kp));
}
-static __kprobes bool trace_kprobe_within_module(struct trace_kprobe *tk,
+static nokprobe_inline bool trace_kprobe_within_module(struct trace_kprobe *tk,
struct module *mod)
{
int len = strlen(mod->name);
@@ -68,7 +68,7 @@ static __kprobes bool trace_kprobe_within_module(struct trace_kprobe *tk,
return strncmp(mod->name, name, len) == 0 && name[len] == ':';
}
-static __kprobes bool trace_kprobe_is_on_module(struct trace_kprobe *tk)
+static nokprobe_inline bool trace_kprobe_is_on_module(struct trace_kprobe *tk)
{
return !!strchr(trace_kprobe_symbol(tk), ':');
}
@@ -132,19 +132,21 @@ struct symbol_cache *alloc_symbol_cache(const char *sym, long offset)
* Kprobes-specific fetch functions
*/
#define DEFINE_FETCH_stack(type) \
-static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
+static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \
void *offset, void *dest) \
{ \
*(type *)dest = (type)regs_get_kernel_stack_nth(regs, \
(unsigned int)((unsigned long)offset)); \
-}
+} \
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(stack, type));
+
DEFINE_BASIC_FETCH_FUNCS(stack)
/* No string on the stack entry */
#define fetch_stack_string NULL
#define fetch_stack_string_size NULL
#define DEFINE_FETCH_memory(type) \
-static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
+static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \
void *addr, void *dest) \
{ \
type retval; \
@@ -152,14 +154,16 @@ static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
*(type *)dest = 0; \
else \
*(type *)dest = retval; \
-}
+} \
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, type));
+
DEFINE_BASIC_FETCH_FUNCS(memory)
/*
* Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
* length and relative data location.
*/
-static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
- void *addr, void *dest)
+static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
+ void *addr, void *dest)
{
long ret;
int maxlen = get_rloc_len(*(u32 *)dest);
@@ -193,10 +197,11 @@ static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
get_rloc_offs(*(u32 *)dest));
}
}
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string));
/* Return the length of string -- including null terminal byte */
-static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
- void *addr, void *dest)
+static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
+ void *addr, void *dest)
{
mm_segment_t old_fs;
int ret, len = 0;
@@ -219,17 +224,19 @@ static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
else
*(u32 *)dest = len;
}
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(memory, string_size));
#define DEFINE_FETCH_symbol(type) \
-__kprobes void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs, \
- void *data, void *dest) \
+void FETCH_FUNC_NAME(symbol, type)(struct pt_regs *regs, void *data, void *dest)\
{ \
struct symbol_cache *sc = data; \
if (sc->addr) \
fetch_memory_##type(regs, (void *)sc->addr, dest); \
else \
*(type *)dest = 0; \
-}
+} \
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(symbol, type));
+
DEFINE_BASIC_FETCH_FUNCS(symbol)
DEFINE_FETCH_symbol(string)
DEFINE_FETCH_symbol(string_size)
@@ -907,7 +914,7 @@ static const struct file_operations kprobe_profile_ops = {
};
/* Kprobe handler */
-static __kprobes void
+static nokprobe_inline void
__kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
struct ftrace_event_file *ftrace_file)
{
@@ -943,7 +950,7 @@ __kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs,
entry, irq_flags, pc, regs);
}
-static __kprobes void
+static void
kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs)
{
struct event_file_link *link;
@@ -951,9 +958,10 @@ kprobe_trace_func(struct trace_kprobe *tk, struct pt_regs *regs)
list_for_each_entry_rcu(link, &tk->tp.files, list)
__kprobe_trace_func(tk, regs, link->file);
}
+NOKPROBE_SYMBOL(kprobe_trace_func);
/* Kretprobe handler */
-static __kprobes void
+static nokprobe_inline void
__kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
struct pt_regs *regs,
struct ftrace_event_file *ftrace_file)
@@ -991,7 +999,7 @@ __kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
entry, irq_flags, pc, regs);
}
-static __kprobes void
+static void
kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
struct pt_regs *regs)
{
@@ -1000,6 +1008,7 @@ kretprobe_trace_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
list_for_each_entry_rcu(link, &tk->tp.files, list)
__kretprobe_trace_func(tk, ri, regs, link->file);
}
+NOKPROBE_SYMBOL(kretprobe_trace_func);
/* Event entry printers */
static enum print_line_t
@@ -1131,7 +1140,7 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call)
#ifdef CONFIG_PERF_EVENTS
/* Kprobe profile handler */
-static __kprobes void
+static void
kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
{
struct ftrace_event_call *call = &tk->tp.call;
@@ -1158,9 +1167,10 @@ kprobe_perf_func(struct trace_kprobe *tk, struct pt_regs *regs)
store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
}
+NOKPROBE_SYMBOL(kprobe_perf_func);
/* Kretprobe profile handler */
-static __kprobes void
+static void
kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
struct pt_regs *regs)
{
@@ -1188,6 +1198,7 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
store_trace_args(sizeof(*entry), &tk->tp, regs, (u8 *)&entry[1], dsize);
perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
}
+NOKPROBE_SYMBOL(kretprobe_perf_func);
#endif /* CONFIG_PERF_EVENTS */
/*
@@ -1196,9 +1207,8 @@ kretprobe_perf_func(struct trace_kprobe *tk, struct kretprobe_instance *ri,
* kprobe_trace_self_tests_init() does enable_trace_probe/disable_trace_probe
* lockless, but we can't race with this __init function.
*/
-static __kprobes
-int kprobe_register(struct ftrace_event_call *event,
- enum trace_reg type, void *data)
+static int kprobe_register(struct ftrace_event_call *event,
+ enum trace_reg type, void *data)
{
struct trace_kprobe *tk = (struct trace_kprobe *)event->data;
struct ftrace_event_file *file = data;
@@ -1224,8 +1234,7 @@ int kprobe_register(struct ftrace_event_call *event,
return 0;
}
-static __kprobes
-int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
+static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
{
struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp);
@@ -1239,9 +1248,10 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs)
#endif
return 0; /* We don't tweek kernel, so just return 0 */
}
+NOKPROBE_SYMBOL(kprobe_dispatcher);
-static __kprobes
-int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
+static int
+kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
{
struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp);
@@ -1255,6 +1265,7 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs)
#endif
return 0; /* We don't tweek kernel, so just return 0 */
}
+NOKPROBE_SYMBOL(kretprobe_dispatcher);
static struct trace_event_functions kretprobe_funcs = {
.trace = print_kretprobe_event
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
index 8364a421b4d..d4b9fc22cd2 100644
--- a/kernel/trace/trace_probe.c
+++ b/kernel/trace/trace_probe.c
@@ -37,13 +37,13 @@ const char *reserved_field_names[] = {
/* Printing in basic type function template */
#define DEFINE_BASIC_PRINT_TYPE_FUNC(type, fmt) \
-__kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
- const char *name, \
- void *data, void *ent) \
+int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \
+ void *data, void *ent) \
{ \
return trace_seq_printf(s, " %s=" fmt, name, *(type *)data); \
} \
-const char PRINT_TYPE_FMT_NAME(type)[] = fmt;
+const char PRINT_TYPE_FMT_NAME(type)[] = fmt; \
+NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(type));
DEFINE_BASIC_PRINT_TYPE_FUNC(u8 , "0x%x")
DEFINE_BASIC_PRINT_TYPE_FUNC(u16, "0x%x")
@@ -55,9 +55,8 @@ DEFINE_BASIC_PRINT_TYPE_FUNC(s32, "%d")
DEFINE_BASIC_PRINT_TYPE_FUNC(s64, "%Ld")
/* Print type function for string type */
-__kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
- const char *name,
- void *data, void *ent)
+int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, const char *name,
+ void *data, void *ent)
{
int len = *(u32 *)data >> 16;
@@ -67,6 +66,7 @@ __kprobes int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s,
return trace_seq_printf(s, " %s=\"%s\"", name,
(const char *)get_loc_data(data, ent));
}
+NOKPROBE_SYMBOL(PRINT_TYPE_FUNC_NAME(string));
const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
@@ -81,23 +81,24 @@ const char PRINT_TYPE_FMT_NAME(string)[] = "\\\"%s\\\"";
/* Data fetch function templates */
#define DEFINE_FETCH_reg(type) \
-__kprobes void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, \
- void *offset, void *dest) \
+void FETCH_FUNC_NAME(reg, type)(struct pt_regs *regs, void *offset, void *dest) \
{ \
*(type *)dest = (type)regs_get_register(regs, \
(unsigned int)((unsigned long)offset)); \
-}
+} \
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(reg, type));
DEFINE_BASIC_FETCH_FUNCS(reg)
/* No string on the register */
#define fetch_reg_string NULL
#define fetch_reg_string_size NULL
#define DEFINE_FETCH_retval(type) \
-__kprobes void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs, \
- void *dummy, void *dest) \
+void FETCH_FUNC_NAME(retval, type)(struct pt_regs *regs, \
+ void *dummy, void *dest) \
{ \
*(type *)dest = (type)regs_return_value(regs); \
-}
+} \
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(retval, type));
DEFINE_BASIC_FETCH_FUNCS(retval)
/* No string on the retval */
#define fetch_retval_string NULL
@@ -112,8 +113,8 @@ struct deref_fetch_param {
};
#define DEFINE_FETCH_deref(type) \
-__kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \
- void *data, void *dest) \
+void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \
+ void *data, void *dest) \
{ \
struct deref_fetch_param *dprm = data; \
unsigned long addr; \
@@ -123,12 +124,13 @@ __kprobes void FETCH_FUNC_NAME(deref, type)(struct pt_regs *regs, \
dprm->fetch(regs, (void *)addr, dest); \
} else \
*(type *)dest = 0; \
-}
+} \
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, type));
DEFINE_BASIC_FETCH_FUNCS(deref)
DEFINE_FETCH_deref(string)
-__kprobes void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs,
- void *data, void *dest)
+void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs,
+ void *data, void *dest)
{
struct deref_fetch_param *dprm = data;
unsigned long addr;
@@ -140,16 +142,18 @@ __kprobes void FETCH_FUNC_NAME(deref, string_size)(struct pt_regs *regs,
} else
*(string_size *)dest = 0;
}
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(deref, string_size));
-static __kprobes void update_deref_fetch_param(struct deref_fetch_param *data)
+static void update_deref_fetch_param(struct deref_fetch_param *data)
{
if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
update_deref_fetch_param(data->orig.data);
else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
update_symbol_cache(data->orig.data);
}
+NOKPROBE_SYMBOL(update_deref_fetch_param);
-static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
+static void free_deref_fetch_param(struct deref_fetch_param *data)
{
if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
free_deref_fetch_param(data->orig.data);
@@ -157,6 +161,7 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
free_symbol_cache(data->orig.data);
kfree(data);
}
+NOKPROBE_SYMBOL(free_deref_fetch_param);
/* Bitfield fetch function */
struct bitfield_fetch_param {
@@ -166,8 +171,8 @@ struct bitfield_fetch_param {
};
#define DEFINE_FETCH_bitfield(type) \
-__kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \
- void *data, void *dest) \
+void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \
+ void *data, void *dest) \
{ \
struct bitfield_fetch_param *bprm = data; \
type buf = 0; \
@@ -177,13 +182,13 @@ __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs, \
buf >>= bprm->low_shift; \
} \
*(type *)dest = buf; \
-}
-
+} \
+NOKPROBE_SYMBOL(FETCH_FUNC_NAME(bitfield, type));
DEFINE_BASIC_FETCH_FUNCS(bitfield)
#define fetch_bitfield_string NULL
#define fetch_bitfield_string_size NULL
-static __kprobes void
+static void
update_bitfield_fetch_param(struct bitfield_fetch_param *data)
{
/*
@@ -196,7 +201,7 @@ update_bitfield_fetch_param(struct bitfield_fetch_param *data)
update_symbol_cache(data->orig.data);
}
-static __kprobes void
+static void
free_bitfield_fetch_param(struct bitfield_fetch_param *data)
{
/*
@@ -255,17 +260,17 @@ fail:
}
/* Special function : only accept unsigned long */
-static __kprobes void fetch_kernel_stack_address(struct pt_regs *regs,
- void *dummy, void *dest)
+static void fetch_kernel_stack_address(struct pt_regs *regs, void *dummy, void *dest)
{
*(unsigned long *)dest = kernel_stack_pointer(regs);
}
+NOKPROBE_SYMBOL(fetch_kernel_stack_address);
-static __kprobes void fetch_user_stack_address(struct pt_regs *regs,
- void *dummy, void *dest)
+static void fetch_user_stack_address(struct pt_regs *regs, void *dummy, void *dest)
{
*(unsigned long *)dest = user_stack_pointer(regs);
}
+NOKPROBE_SYMBOL(fetch_user_stack_address);
static fetch_func_t get_fetch_size_function(const struct fetch_type *type,
fetch_func_t orig_fn,
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
index fb1ab5dfbd4..4f815fbce16 100644
--- a/kernel/trace/trace_probe.h
+++ b/kernel/trace/trace_probe.h
@@ -81,13 +81,13 @@
*/
#define convert_rloc_to_loc(dl, offs) ((u32)(dl) + (offs))
-static inline void *get_rloc_data(u32 *dl)
+static nokprobe_inline void *get_rloc_data(u32 *dl)
{
return (u8 *)dl + get_rloc_offs(*dl);
}
/* For data_loc conversion */
-static inline void *get_loc_data(u32 *dl, void *ent)
+static nokprobe_inline void *get_loc_data(u32 *dl, void *ent)
{
return (u8 *)ent + get_rloc_offs(*dl);
}
@@ -136,9 +136,8 @@ typedef u32 string_size;
/* Printing in basic type function template */
#define DECLARE_BASIC_PRINT_TYPE_FUNC(type) \
-__kprobes int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, \
- const char *name, \
- void *data, void *ent); \
+int PRINT_TYPE_FUNC_NAME(type)(struct trace_seq *s, const char *name, \
+ void *data, void *ent); \
extern const char PRINT_TYPE_FMT_NAME(type)[]
DECLARE_BASIC_PRINT_TYPE_FUNC(u8);
@@ -303,7 +302,7 @@ static inline bool trace_probe_is_registered(struct trace_probe *tp)
return !!(tp->flags & TP_FLAG_REGISTERED);
}
-static inline __kprobes void call_fetch(struct fetch_param *fprm,
+static nokprobe_inline void call_fetch(struct fetch_param *fprm,
struct pt_regs *regs, void *dest)
{
return fprm->fn(regs, fprm->data, dest);
@@ -351,7 +350,7 @@ extern ssize_t traceprobe_probes_write(struct file *file,
extern int traceprobe_command(const char *buf, int (*createfn)(int, char**));
/* Sum up total data length for dynamic arraies (strings) */
-static inline __kprobes int
+static nokprobe_inline int
__get_data_size(struct trace_probe *tp, struct pt_regs *regs)
{
int i, ret = 0;
@@ -367,7 +366,7 @@ __get_data_size(struct trace_probe *tp, struct pt_regs *regs)
}
/* Store the value of each argument */
-static inline __kprobes void
+static nokprobe_inline void
store_trace_args(int ent_size, struct trace_probe *tp, struct pt_regs *regs,
u8 *data, int maxlen)
{
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index c082a744134..3c9b97e6b1f 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -108,8 +108,8 @@ static unsigned long get_user_stack_nth(struct pt_regs *regs, unsigned int n)
* Uprobes-specific fetch functions
*/
#define DEFINE_FETCH_stack(type) \
-static __kprobes void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs,\
- void *offset, void *dest) \
+static void FETCH_FUNC_NAME(stack, type)(struct pt_regs *regs, \
+ void *offset, void *dest) \
{ \
*(type *)dest = (type)get_user_stack_nth(regs, \
((unsigned long)offset)); \
@@ -120,8 +120,8 @@ DEFINE_BASIC_FETCH_FUNCS(stack)
#define fetch_stack_string_size NULL
#define DEFINE_FETCH_memory(type) \
-static __kprobes void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs,\
- void *addr, void *dest) \
+static void FETCH_FUNC_NAME(memory, type)(struct pt_regs *regs, \
+ void *addr, void *dest) \
{ \
type retval; \
void __user *vaddr = (void __force __user *) addr; \
@@ -136,8 +136,8 @@ DEFINE_BASIC_FETCH_FUNCS(memory)
* Fetch a null-terminated string. Caller MUST set *(u32 *)dest with max
* length and relative data location.
*/
-static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
- void *addr, void *dest)
+static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
+ void *addr, void *dest)
{
long ret;
u32 rloc = *(u32 *)dest;
@@ -158,8 +158,8 @@ static __kprobes void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
}
}
-static __kprobes void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
- void *addr, void *dest)
+static void FETCH_FUNC_NAME(memory, string_size)(struct pt_regs *regs,
+ void *addr, void *dest)
{
int len;
void __user *vaddr = (void __force __user *) addr;
@@ -184,8 +184,8 @@ static unsigned long translate_user_vaddr(void *file_offset)
}
#define DEFINE_FETCH_file_offset(type) \
-static __kprobes void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs,\
- void *offset, void *dest) \
+static void FETCH_FUNC_NAME(file_offset, type)(struct pt_regs *regs, \
+ void *offset, void *dest)\
{ \
void *vaddr = (void *)translate_user_vaddr(offset); \
\
@@ -893,6 +893,9 @@ probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file,
int ret;
if (file) {
+ if (tu->tp.flags & TP_FLAG_PROFILE)
+ return -EINTR;
+
link = kmalloc(sizeof(*link), GFP_KERNEL);
if (!link)
return -ENOMEM;
@@ -901,29 +904,40 @@ probe_event_enable(struct trace_uprobe *tu, struct ftrace_event_file *file,
list_add_tail_rcu(&link->list, &tu->tp.files);
tu->tp.flags |= TP_FLAG_TRACE;
- } else
- tu->tp.flags |= TP_FLAG_PROFILE;
+ } else {
+ if (tu->tp.flags & TP_FLAG_TRACE)
+ return -EINTR;
- ret = uprobe_buffer_enable();
- if (ret < 0)
- return ret;
+ tu->tp.flags |= TP_FLAG_PROFILE;
+ }
WARN_ON(!uprobe_filter_is_empty(&tu->filter));
if (enabled)
return 0;
+ ret = uprobe_buffer_enable();
+ if (ret)
+ goto err_flags;
+
tu->consumer.filter = filter;
ret = uprobe_register(tu->inode, tu->offset, &tu->consumer);
- if (ret) {
- if (file) {
- list_del(&link->list);
- kfree(link);
- tu->tp.flags &= ~TP_FLAG_TRACE;
- } else
- tu->tp.flags &= ~TP_FLAG_PROFILE;
- }
+ if (ret)
+ goto err_buffer;
+
+ return 0;
+ err_buffer:
+ uprobe_buffer_disable();
+
+ err_flags:
+ if (file) {
+ list_del(&link->list);
+ kfree(link);
+ tu->tp.flags &= ~TP_FLAG_TRACE;
+ } else {
+ tu->tp.flags &= ~TP_FLAG_PROFILE;
+ }
return ret;
}
@@ -1009,56 +1023,60 @@ uprobe_filter_event(struct trace_uprobe *tu, struct perf_event *event)
return __uprobe_perf_filter(&tu->filter, event->hw.tp_target->mm);
}
-static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
+static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
{
bool done;
write_lock(&tu->filter.rwlock);
if (event->hw.tp_target) {
- /*
- * event->parent != NULL means copy_process(), we can avoid
- * uprobe_apply(). current->mm must be probed and we can rely
- * on dup_mmap() which preserves the already installed bp's.
- *
- * attr.enable_on_exec means that exec/mmap will install the
- * breakpoints we need.
- */
+ list_del(&event->hw.tp_list);
done = tu->filter.nr_systemwide ||
- event->parent || event->attr.enable_on_exec ||
+ (event->hw.tp_target->flags & PF_EXITING) ||
uprobe_filter_event(tu, event);
- list_add(&event->hw.tp_list, &tu->filter.perf_events);
} else {
+ tu->filter.nr_systemwide--;
done = tu->filter.nr_systemwide;
- tu->filter.nr_systemwide++;
}
write_unlock(&tu->filter.rwlock);
if (!done)
- uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
+ return uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
return 0;
}
-static int uprobe_perf_close(struct trace_uprobe *tu, struct perf_event *event)
+static int uprobe_perf_open(struct trace_uprobe *tu, struct perf_event *event)
{
bool done;
+ int err;
write_lock(&tu->filter.rwlock);
if (event->hw.tp_target) {
- list_del(&event->hw.tp_list);
+ /*
+ * event->parent != NULL means copy_process(), we can avoid
+ * uprobe_apply(). current->mm must be probed and we can rely
+ * on dup_mmap() which preserves the already installed bp's.
+ *
+ * attr.enable_on_exec means that exec/mmap will install the
+ * breakpoints we need.
+ */
done = tu->filter.nr_systemwide ||
- (event->hw.tp_target->flags & PF_EXITING) ||
+ event->parent || event->attr.enable_on_exec ||
uprobe_filter_event(tu, event);
+ list_add(&event->hw.tp_list, &tu->filter.perf_events);
} else {
- tu->filter.nr_systemwide--;
done = tu->filter.nr_systemwide;
+ tu->filter.nr_systemwide++;
}
write_unlock(&tu->filter.rwlock);
- if (!done)
- uprobe_apply(tu->inode, tu->offset, &tu->consumer, false);
-
- return 0;
+ err = 0;
+ if (!done) {
+ err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
+ if (err)
+ uprobe_perf_close(tu, event);
+ }
+ return err;
}
static bool uprobe_perf_filter(struct uprobe_consumer *uc,
@@ -1197,12 +1215,6 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
current->utask->vaddr = (unsigned long) &udd;
-#ifdef CONFIG_PERF_EVENTS
- if ((tu->tp.flags & TP_FLAG_TRACE) == 0 &&
- !uprobe_perf_filter(&tu->consumer, 0, current->mm))
- return UPROBE_HANDLER_REMOVE;
-#endif
-
if (WARN_ON_ONCE(!uprobe_cpu_buffer))
return 0;
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 33cbd8c203f..3490407dc7b 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -492,33 +492,29 @@ static int sys_tracepoint_refcount;
void syscall_regfunc(void)
{
- unsigned long flags;
- struct task_struct *g, *t;
+ struct task_struct *p, *t;
if (!sys_tracepoint_refcount) {
- read_lock_irqsave(&tasklist_lock, flags);
- do_each_thread(g, t) {
- /* Skip kernel threads. */
- if (t->mm)
- set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
- } while_each_thread(g, t);
- read_unlock_irqrestore(&tasklist_lock, flags);
+ read_lock(&tasklist_lock);
+ for_each_process_thread(p, t) {
+ set_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
+ }
+ read_unlock(&tasklist_lock);
}
sys_tracepoint_refcount++;
}
void syscall_unregfunc(void)
{
- unsigned long flags;
- struct task_struct *g, *t;
+ struct task_struct *p, *t;
sys_tracepoint_refcount--;
if (!sys_tracepoint_refcount) {
- read_lock_irqsave(&tasklist_lock, flags);
- do_each_thread(g, t) {
+ read_lock(&tasklist_lock);
+ for_each_process_thread(p, t) {
clear_tsk_thread_flag(t, TIF_SYSCALL_TRACEPOINT);
- } while_each_thread(g, t);
- read_unlock_irqrestore(&tasklist_lock, flags);
+ }
+ read_unlock(&tasklist_lock);
}
}
#endif
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 516203e665f..c3319bd1b04 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -31,6 +31,12 @@
int watchdog_user_enabled = 1;
int __read_mostly watchdog_thresh = 10;
+#ifdef CONFIG_SMP
+int __read_mostly sysctl_softlockup_all_cpu_backtrace;
+#else
+#define sysctl_softlockup_all_cpu_backtrace 0
+#endif
+
static int __read_mostly watchdog_running;
static u64 __read_mostly sample_period;
@@ -47,6 +53,7 @@ static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
#endif
+static unsigned long soft_lockup_nmi_warn;
/* boot commands */
/*
@@ -95,6 +102,15 @@ static int __init nosoftlockup_setup(char *str)
}
__setup("nosoftlockup", nosoftlockup_setup);
/* */
+#ifdef CONFIG_SMP
+static int __init softlockup_all_cpu_backtrace_setup(char *str)
+{
+ sysctl_softlockup_all_cpu_backtrace =
+ !!simple_strtol(str, NULL, 0);
+ return 1;
+}
+__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup);
+#endif
/*
* Hard-lockup warnings should be triggered after just a few seconds. Soft-
@@ -271,6 +287,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
unsigned long touch_ts = __this_cpu_read(watchdog_touch_ts);
struct pt_regs *regs = get_irq_regs();
int duration;
+ int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
/* kick the hardlockup detector */
watchdog_interrupt_count();
@@ -317,6 +334,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
if (__this_cpu_read(soft_watchdog_warn) == true)
return HRTIMER_RESTART;
+ if (softlockup_all_cpu_backtrace) {
+ /* Prevent multiple soft-lockup reports if one cpu is already
+ * engaged in dumping cpu back traces
+ */
+ if (test_and_set_bit(0, &soft_lockup_nmi_warn)) {
+ /* Someone else will report us. Let's give up */
+ __this_cpu_write(soft_watchdog_warn, true);
+ return HRTIMER_RESTART;
+ }
+ }
+
printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
smp_processor_id(), duration,
current->comm, task_pid_nr(current));
@@ -327,6 +355,17 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
else
dump_stack();
+ if (softlockup_all_cpu_backtrace) {
+ /* Avoid generating two back traces for current
+ * given that one is already made above
+ */
+ trigger_allbutself_cpu_backtrace();
+
+ clear_bit(0, &soft_lockup_nmi_warn);
+ /* Barrier to sync with other cpus */
+ smp_mb__after_atomic();
+ }
+
if (softlockup_panic)
panic("softlockup: hung tasks");
__this_cpu_write(soft_watchdog_warn, true);
@@ -527,10 +566,8 @@ static void update_timers_all_cpus(void)
int cpu;
get_online_cpus();
- preempt_disable();
for_each_online_cpu(cpu)
update_timers(cpu);
- preempt_enable();
put_online_cpus();
}